From bb6135880e5e453d7701764b9f2e4ad3356a68d7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 3 Jul 2019 22:34:46 -0400
Subject: [PATCH] STYLE: Apply black formatting

---
 asv_bench/benchmarks/algorithms.py | 123 +-
 asv_bench/benchmarks/attrs_caching.py | 6 +-
 asv_bench/benchmarks/binary_ops.py | 51 +-
 asv_bench/benchmarks/categoricals.py | 114 +-
 asv_bench/benchmarks/ctors.py | 52 +-
 asv_bench/benchmarks/dtypes.py | 32 +-
 asv_bench/benchmarks/eval.py | 31 +-
 asv_bench/benchmarks/frame_ctor.py | 22 +-
 asv_bench/benchmarks/frame_methods.py | 223 +-
 asv_bench/benchmarks/gil.py | 136 +-
 asv_bench/benchmarks/groupby.py | 515 +-
 asv_bench/benchmarks/index_object.py | 75 +-
 asv_bench/benchmarks/indexing.py | 135 +-
 asv_bench/benchmarks/indexing_engines.py | 59 +-
 asv_bench/benchmarks/inference.py | 75 +-
 asv_bench/benchmarks/io/csv.py | 309 +-
 asv_bench/benchmarks/io/excel.py | 18 +-
 asv_bench/benchmarks/io/hdf.py | 106 +-
 asv_bench/benchmarks/io/json.py | 134 +-
 asv_bench/benchmarks/io/msgpack.py | 13 +-
 asv_bench/benchmarks/io/parsers.py | 18 +-
 asv_bench/benchmarks/io/pickle.py | 13 +-
 asv_bench/benchmarks/io/sas.py | 20 +-
 asv_bench/benchmarks/io/sql.py | 161 +-
 asv_bench/benchmarks/io/stata.py | 38 +-
 asv_bench/benchmarks/join_merge.py | 274 +-
 asv_bench/benchmarks/multiindex_object.py | 84 +-
 asv_bench/benchmarks/offset.py | 73 +-
 asv_bench/benchmarks/pandas_vb_common.py | 39 +-
 asv_bench/benchmarks/period.py | 92 +-
 asv_bench/benchmarks/plotting.py | 47 +-
 asv_bench/benchmarks/reindex.py | 70 +-
 asv_bench/benchmarks/replace.py | 29 +-
 asv_bench/benchmarks/reshape.py | 144 +-
 asv_bench/benchmarks/rolling.py | 80 +-
 asv_bench/benchmarks/series_methods.py | 141 +-
 asv_bench/benchmarks/sparse.py | 49 +-
 asv_bench/benchmarks/stat_ops.py | 47 +-
 asv_bench/benchmarks/strings.py | 84 +-
 asv_bench/benchmarks/timedelta.py | 59 +-
 asv_bench/benchmarks/timeseries.py | 206 +-
 asv_bench/benchmarks/timestamp.py | 35 +-
 ci/print_skipped.py | 33 +-
 doc/logo/pandas_logo.py | 16 +-
 doc/make.py | 266 +-
 doc/source/conf.py | 347 +-
 doc/sphinxext/announce.py | 48 +-
 doc/sphinxext/contributors.py | 24 +-
 pandas/__init__.py | 189 +-
 pandas/_config/__init__.py | 21 +-
 pandas/_config/config.py | 126 +-
 pandas/_config/dates.py | 12 +-
 pandas/_config/display.py | 11 +-
 pandas/_config/localization.py | 22 +-
 pandas/_libs/__init__.py | 9 +-
 pandas/_typing.py | 28 +-
 pandas/_version.py | 123 +-
 pandas/api/extensions/__init__.py | 14 +-
 pandas/api/types/__init__.py | 6 +-
 pandas/arrays/__init__.py | 27 +-
 pandas/compat/__init__.py | 14 +-
 pandas/compat/_optional.py | 17 +-
 pandas/compat/chainmap.py | 1 -
 pandas/compat/numpy/__init__.py | 47 +-
 pandas/compat/numpy/function.py | 296 +-
 pandas/compat/pickle_compat.py | 136 +-
 pandas/conftest.py | 274 +-
 pandas/core/accessor.py | 67 +-
 pandas/core/algorithms.py | 605 ++-
 pandas/core/api.py | 19 +-
 pandas/core/apply.py | 162 +-
 pandas/core/arrays/__init__.py | 5 +-
 pandas/core/arrays/_ranges.py | 78 +-
 pandas/core/arrays/array_.py | 31 +-
 pandas/core/arrays/base.py | 97 +-
 pandas/core/arrays/categorical.py | 636 +--
 pandas/core/arrays/datetimelike.py | 354 +-
 pandas/core/arrays/datetimes.py | 647 ++-
 pandas/core/arrays/integer.py | 274 +-
 pandas/core/arrays/interval.py | 348 +-
 pandas/core/arrays/numpy_.py | 175 +-
 pandas/core/arrays/period.py | 267 +-
 pandas/core/arrays/sparse.py | 462 +-
 pandas/core/arrays/timedeltas.py | 249 +-
 pandas/core/base.py | 368 +-
 pandas/core/common.py | 50 +-
 pandas/core/computation/align.py | 36 +-
pandas/core/computation/check.py | 5 +- pandas/core/computation/common.py | 2 +- pandas/core/computation/engines.py | 23 +- pandas/core/computation/eval.py | 95 +- pandas/core/computation/expr.py | 347 +- pandas/core/computation/expressions.py | 88 +- pandas/core/computation/ops.py | 194 +- pandas/core/computation/pytables.py | 205 +- pandas/core/computation/scope.py | 66 +- pandas/core/config_init.py | 351 +- pandas/core/dtypes/api.py | 55 +- pandas/core/dtypes/base.py | 16 +- pandas/core/dtypes/cast.py | 342 +- pandas/core/dtypes/common.py | 228 +- pandas/core/dtypes/concat.py | 167 +- pandas/core/dtypes/dtypes.py | 243 +- pandas/core/dtypes/generic.py | 125 +- pandas/core/dtypes/inference.py | 47 +- pandas/core/dtypes/missing.py | 109 +- pandas/core/frame.py | 1980 +++++--- pandas/core/generic.py | 2806 +++++++---- pandas/core/groupby/__init__.py | 5 +- pandas/core/groupby/base.py | 67 +- pandas/core/groupby/categorical.py | 15 +- pandas/core/groupby/generic.py | 467 +- pandas/core/groupby/groupby.py | 600 ++- pandas/core/groupby/grouper.py | 230 +- pandas/core/groupby/ops.py | 351 +- pandas/core/index.py | 28 +- pandas/core/indexes/accessors.py | 112 +- pandas/core/indexes/api.py | 83 +- pandas/core/indexes/base.py | 1376 +++--- pandas/core/indexes/category.py | 253 +- pandas/core/indexes/datetimelike.py | 182 +- pandas/core/indexes/datetimes.py | 582 ++- pandas/core/indexes/frozen.py | 27 +- pandas/core/indexes/interval.py | 524 +- pandas/core/indexes/multi.py | 944 ++-- pandas/core/indexes/numeric.py | 217 +- pandas/core/indexes/period.py | 406 +- pandas/core/indexes/range.py | 206 +- pandas/core/indexes/timedeltas.py | 262 +- pandas/core/indexing.py | 531 +- pandas/core/internals/__init__.py | 26 +- pandas/core/internals/blocks.py | 1119 +++-- pandas/core/internals/concat.py | 149 +- pandas/core/internals/construction.py | 236 +- pandas/core/internals/managers.py | 641 ++- pandas/core/missing.py | 292 +- pandas/core/nanops.py | 297 +- pandas/core/ops.py | 828 ++-- pandas/core/resample.py | 510 +- pandas/core/reshape/concat.py | 212 +- pandas/core/reshape/melt.py | 94 +- pandas/core/reshape/merge.py | 920 ++-- pandas/core/reshape/pivot.py | 267 +- pandas/core/reshape/reshape.py | 324 +- pandas/core/reshape/tile.py | 182 +- pandas/core/reshape/util.py | 9 +- pandas/core/series.py | 1041 ++-- pandas/core/sorting.py | 106 +- pandas/core/sparse/frame.py | 396 +- pandas/core/sparse/scipy_sparse.py | 49 +- pandas/core/sparse/series.py | 277 +- pandas/core/strings.py | 798 +-- pandas/core/tools/datetimes.py | 391 +- pandas/core/tools/numeric.py | 48 +- pandas/core/tools/timedeltas.py | 57 +- pandas/core/util/hashing.py | 132 +- pandas/core/window.py | 964 ++-- pandas/errors/__init__.py | 11 +- pandas/io/clipboard/__init__.py | 41 +- pandas/io/clipboard/clipboards.py | 64 +- pandas/io/clipboard/exceptions.py | 1 - pandas/io/clipboard/windows.py | 49 +- pandas/io/clipboards.py | 55 +- pandas/io/common.py | 149 +- pandas/io/date_converters.py | 15 +- pandas/io/excel/_base.py | 365 +- pandas/io/excel/_odfreader.py | 44 +- pandas/io/excel/_openpyxl.py | 109 +- pandas/io/excel/_util.py | 43 +- pandas/io/excel/_xlrd.py | 38 +- pandas/io/excel/_xlsxwriter.py | 240 +- pandas/io/excel/_xlwt.py | 59 +- pandas/io/feather_format.py | 36 +- pandas/io/formats/console.py | 17 +- pandas/io/formats/css.py | 156 +- pandas/io/formats/csvs.py | 188 +- pandas/io/formats/excel.py | 456 +- pandas/io/formats/format.py | 699 +-- pandas/io/formats/html.py | 285 +- pandas/io/formats/latex.py | 151 +- 
pandas/io/formats/printing.py | 179 +- pandas/io/formats/style.py | 445 +- pandas/io/gbq.py | 81 +- pandas/io/gcs.py | 13 +- pandas/io/html.py | 254 +- pandas/io/json/json.py | 581 ++- pandas/io/json/normalize.py | 74 +- pandas/io/json/table_schema.py | 152 +- pandas/io/msgpack/__init__.py | 4 +- pandas/io/msgpack/exceptions.py | 1 - pandas/io/packers.py | 732 +-- pandas/io/parquet.py | 136 +- pandas/io/parsers.py | 1605 +++--- pandas/io/pickle.py | 14 +- pandas/io/pytables.py | 2055 ++++---- pandas/io/s3.py | 13 +- pandas/io/sas/sas7bdat.py | 326 +- pandas/io/sas/sas_constants.py | 140 +- pandas/io/sas/sas_xport.py | 190 +- pandas/io/sas/sasreader.py | 38 +- pandas/io/spss.py | 13 +- pandas/io/sql.py | 672 ++- pandas/io/stata.py | 1243 +++-- pandas/plotting/__init__.py | 50 +- pandas/plotting/_core.py | 380 +- pandas/plotting/_matplotlib/__init__.py | 81 +- pandas/plotting/_matplotlib/boxplot.py | 254 +- pandas/plotting/_matplotlib/compat.py | 12 +- pandas/plotting/_matplotlib/converter.py | 433 +- pandas/plotting/_matplotlib/core.py | 613 ++- pandas/plotting/_matplotlib/hist.py | 316 +- pandas/plotting/_matplotlib/misc.py | 171 +- pandas/plotting/_matplotlib/style.py | 24 +- pandas/plotting/_matplotlib/timeseries.py | 143 +- pandas/plotting/_matplotlib/tools.py | 103 +- pandas/plotting/_misc.py | 138 +- pandas/testing.py | 5 +- pandas/tests/api/test_api.py | 215 +- pandas/tests/api/test_types.py | 72 +- pandas/tests/arithmetic/conftest.py | 150 +- pandas/tests/arithmetic/test_datetime64.py | 1484 +++--- pandas/tests/arithmetic/test_numeric.py | 544 ++- pandas/tests/arithmetic/test_object.py | 215 +- pandas/tests/arithmetic/test_period.py | 641 +-- pandas/tests/arithmetic/test_timedelta64.py | 1071 ++-- pandas/tests/arrays/categorical/common.py | 6 +- pandas/tests/arrays/categorical/test_algos.py | 78 +- .../arrays/categorical/test_analytics.py | 144 +- pandas/tests/arrays/categorical/test_api.py | 261 +- .../arrays/categorical/test_constructors.py | 293 +- .../tests/arrays/categorical/test_dtypes.py | 136 +- .../tests/arrays/categorical/test_indexing.py | 173 +- .../tests/arrays/categorical/test_missing.py | 36 +- .../arrays/categorical/test_operators.py | 228 +- pandas/tests/arrays/categorical/test_repr.py | 116 +- .../tests/arrays/categorical/test_sorting.py | 38 +- .../tests/arrays/categorical/test_subclass.py | 13 +- .../tests/arrays/categorical/test_warnings.py | 12 +- pandas/tests/arrays/interval/test_interval.py | 75 +- pandas/tests/arrays/interval/test_ops.py | 56 +- pandas/tests/arrays/sparse/test_accessor.py | 94 +- .../tests/arrays/sparse/test_arithmetics.py | 144 +- pandas/tests/arrays/sparse/test_array.py | 446 +- pandas/tests/arrays/sparse/test_dtype.py | 182 +- pandas/tests/arrays/sparse/test_libsparse.py | 266 +- pandas/tests/arrays/test_array.py | 381 +- pandas/tests/arrays/test_datetimelike.py | 193 +- pandas/tests/arrays/test_datetimes.py | 193 +- pandas/tests/arrays/test_integer.py | 316 +- pandas/tests/arrays/test_numpy.py | 93 +- pandas/tests/arrays/test_period.py | 194 +- pandas/tests/arrays/test_timedeltas.py | 77 +- pandas/tests/computation/test_compat.py | 15 +- pandas/tests/computation/test_eval.py | 1173 ++--- pandas/tests/config/test_config.py | 371 +- pandas/tests/config/test_localization.py | 8 +- .../dtypes/cast/test_construct_from_scalar.py | 6 +- .../dtypes/cast/test_construct_ndarray.py | 17 +- .../dtypes/cast/test_construct_object_arr.py | 6 +- pandas/tests/dtypes/cast/test_downcast.py | 30 +- .../dtypes/cast/test_find_common_type.py | 154 +- 
.../dtypes/cast/test_infer_datetimelike.py | 13 +- pandas/tests/dtypes/cast/test_infer_dtype.py | 93 +- pandas/tests/dtypes/cast/test_promote.py | 575 ++- pandas/tests/dtypes/cast/test_upcast.py | 90 +- pandas/tests/dtypes/test_common.py | 480 +- pandas/tests/dtypes/test_concat.py | 100 +- pandas/tests/dtypes/test_dtypes.py | 681 +-- pandas/tests/dtypes/test_generic.py | 38 +- pandas/tests/dtypes/test_inference.py | 978 ++-- pandas/tests/dtypes/test_missing.py | 344 +- pandas/tests/extension/arrow/bool.py | 32 +- pandas/tests/extension/arrow/test_bool.py | 4 +- pandas/tests/extension/base/__init__.py | 8 +- pandas/tests/extension/base/base.py | 4 +- pandas/tests/extension/base/constructors.py | 5 +- pandas/tests/extension/base/dtype.py | 27 +- pandas/tests/extension/base/getitem.py | 48 +- pandas/tests/extension/base/groupby.py | 59 +- pandas/tests/extension/base/interface.py | 9 +- pandas/tests/extension/base/io.py | 13 +- pandas/tests/extension/base/methods.py | 136 +- pandas/tests/extension/base/missing.py | 55 +- pandas/tests/extension/base/ops.py | 19 +- pandas/tests/extension/base/printing.py | 6 +- pandas/tests/extension/base/reduce.py | 13 +- pandas/tests/extension/base/reshaping.py | 199 +- pandas/tests/extension/base/setitem.py | 46 +- pandas/tests/extension/conftest.py | 25 +- pandas/tests/extension/decimal/__init__.py | 2 +- pandas/tests/extension/decimal/array.py | 35 +- .../tests/extension/decimal/test_decimal.py | 147 +- pandas/tests/extension/json/__init__.py | 2 +- pandas/tests/extension/json/array.py | 44 +- pandas/tests/extension/json/test_json.py | 82 +- pandas/tests/extension/test_categorical.py | 27 +- pandas/tests/extension/test_common.py | 31 +- pandas/tests/extension/test_datetime.py | 94 +- pandas/tests/extension/test_external_block.py | 21 +- pandas/tests/extension/test_integer.py | 59 +- pandas/tests/extension/test_interval.py | 7 +- pandas/tests/extension/test_numpy.py | 62 +- pandas/tests/extension/test_period.py | 28 +- pandas/tests/extension/test_sparse.py | 90 +- pandas/tests/frame/common.py | 114 +- pandas/tests/frame/conftest.py | 72 +- pandas/tests/frame/test_alter_axes.py | 1289 ++--- pandas/tests/frame/test_analytics.py | 2001 ++++---- pandas/tests/frame/test_api.py | 264 +- pandas/tests/frame/test_apply.py | 1013 ++-- pandas/tests/frame/test_arithmetic.py | 303 +- pandas/tests/frame/test_asof.py | 79 +- .../tests/frame/test_axis_select_reindex.py | 735 +-- pandas/tests/frame/test_block_internals.py | 461 +- pandas/tests/frame/test_combine_concat.py | 800 +-- pandas/tests/frame/test_constructors.py | 1827 +++---- pandas/tests/frame/test_convert_to.py | 634 +-- pandas/tests/frame/test_dtypes.py | 1182 +++-- pandas/tests/frame/test_duplicates.py | 300 +- pandas/tests/frame/test_indexing.py | 1975 ++++---- pandas/tests/frame/test_join.py | 154 +- pandas/tests/frame/test_missing.py | 721 +-- pandas/tests/frame/test_mutate_columns.py | 209 +- pandas/tests/frame/test_nonunique_indexes.py | 495 +- pandas/tests/frame/test_operators.py | 514 +- pandas/tests/frame/test_period.py | 101 +- pandas/tests/frame/test_quantile.py | 410 +- pandas/tests/frame/test_query_eval.py | 709 +-- pandas/tests/frame/test_rank.py | 176 +- pandas/tests/frame/test_replace.py | 1143 +++-- pandas/tests/frame/test_repr_info.py | 317 +- pandas/tests/frame/test_reshape.py | 1023 ++-- .../frame/test_sort_values_level_as_str.py | 71 +- pandas/tests/frame/test_sorting.py | 616 +-- pandas/tests/frame/test_subclass.py | 586 +-- pandas/tests/frame/test_timeseries.py | 590 +-- 
pandas/tests/frame/test_timezones.py | 179 +- pandas/tests/frame/test_to_csv.py | 889 ++-- pandas/tests/frame/test_validate.py | 19 +- pandas/tests/generic/test_frame.py | 220 +- pandas/tests/generic/test_generic.py | 332 +- .../generic/test_label_or_level_utils.py | 99 +- pandas/tests/generic/test_series.py | 151 +- .../tests/groupby/aggregate/test_aggregate.py | 407 +- pandas/tests/groupby/aggregate/test_cython.py | 216 +- pandas/tests/groupby/aggregate/test_other.py | 595 ++- pandas/tests/groupby/conftest.py | 98 +- pandas/tests/groupby/test_apply.py | 444 +- pandas/tests/groupby/test_bin_groupby.py | 59 +- pandas/tests/groupby/test_categorical.py | 1089 +++-- pandas/tests/groupby/test_counting.py | 125 +- pandas/tests/groupby/test_filters.py | 338 +- pandas/tests/groupby/test_function.py | 1259 ++--- pandas/tests/groupby/test_groupby.py | 1187 ++--- pandas/tests/groupby/test_grouping.py | 666 +-- pandas/tests/groupby/test_index_as_string.py | 70 +- pandas/tests/groupby/test_nth.py | 516 +- pandas/tests/groupby/test_rank.py | 566 ++- pandas/tests/groupby/test_timegrouper.py | 845 ++-- pandas/tests/groupby/test_transform.py | 750 +-- pandas/tests/groupby/test_value_counts.py | 48 +- pandas/tests/groupby/test_whitelist.py | 318 +- pandas/tests/indexes/common.py | 206 +- pandas/tests/indexes/conftest.py | 44 +- pandas/tests/indexes/datetimelike.py | 13 +- .../indexes/datetimes/test_arithmetic.py | 93 +- pandas/tests/indexes/datetimes/test_astype.py | 283 +- .../indexes/datetimes/test_construction.py | 832 ++-- .../indexes/datetimes/test_date_range.py | 699 +-- .../tests/indexes/datetimes/test_datetime.py | 221 +- .../indexes/datetimes/test_datetimelike.py | 9 +- .../tests/indexes/datetimes/test_formats.py | 262 +- .../tests/indexes/datetimes/test_indexing.py | 593 ++- pandas/tests/indexes/datetimes/test_misc.py | 289 +- .../tests/indexes/datetimes/test_missing.py | 82 +- pandas/tests/indexes/datetimes/test_ops.py | 312 +- .../indexes/datetimes/test_partial_slicing.py | 381 +- .../indexes/datetimes/test_scalar_compat.py | 272 +- pandas/tests/indexes/datetimes/test_setops.py | 263 +- .../tests/indexes/datetimes/test_timezones.py | 956 ++-- pandas/tests/indexes/datetimes/test_tools.py | 2062 ++++---- pandas/tests/indexes/interval/test_astype.py | 139 +- .../indexes/interval/test_construction.py | 240 +- .../tests/indexes/interval/test_interval.py | 581 ++- .../indexes/interval/test_interval_new.py | 213 +- .../indexes/interval/test_interval_range.py | 229 +- .../indexes/interval/test_interval_tree.py | 105 +- pandas/tests/indexes/interval/test_setops.py | 61 +- pandas/tests/indexes/multi/conftest.py | 47 +- pandas/tests/indexes/multi/test_analytics.py | 203 +- pandas/tests/indexes/multi/test_astype.py | 8 +- pandas/tests/indexes/multi/test_compat.py | 10 +- .../tests/indexes/multi/test_constructor.py | 481 +- pandas/tests/indexes/multi/test_contains.py | 66 +- pandas/tests/indexes/multi/test_conversion.py | 164 +- pandas/tests/indexes/multi/test_copy.py | 35 +- pandas/tests/indexes/multi/test_drop.py | 86 +- pandas/tests/indexes/multi/test_duplicates.py | 156 +- .../tests/indexes/multi/test_equivalence.py | 36 +- pandas/tests/indexes/multi/test_format.py | 36 +- pandas/tests/indexes/multi/test_get_set.py | 154 +- pandas/tests/indexes/multi/test_indexing.py | 237 +- pandas/tests/indexes/multi/test_integrity.py | 125 +- pandas/tests/indexes/multi/test_join.py | 50 +- pandas/tests/indexes/multi/test_missing.py | 59 +- pandas/tests/indexes/multi/test_monotonic.py | 131 +- 
pandas/tests/indexes/multi/test_names.py | 57 +- .../indexes/multi/test_partial_indexing.py | 42 +- pandas/tests/indexes/multi/test_reindex.py | 43 +- pandas/tests/indexes/multi/test_reshape.py | 100 +- pandas/tests/indexes/multi/test_set_ops.py | 77 +- pandas/tests/indexes/multi/test_sorting.py | 138 +- .../tests/indexes/period/test_arithmetic.py | 87 +- pandas/tests/indexes/period/test_asfreq.py | 203 +- pandas/tests/indexes/period/test_astype.py | 78 +- .../tests/indexes/period/test_construction.py | 398 +- pandas/tests/indexes/period/test_formats.py | 185 +- pandas/tests/indexes/period/test_indexing.py | 494 +- pandas/tests/indexes/period/test_ops.py | 220 +- .../indexes/period/test_partial_slicing.py | 121 +- pandas/tests/indexes/period/test_period.py | 344 +- .../tests/indexes/period/test_period_range.py | 70 +- .../indexes/period/test_scalar_compat.py | 10 +- pandas/tests/indexes/period/test_setops.py | 381 +- pandas/tests/indexes/period/test_tools.py | 322 +- pandas/tests/indexes/test_base.py | 1791 ++++--- pandas/tests/indexes/test_category.py | 679 +-- pandas/tests/indexes/test_common.py | 75 +- pandas/tests/indexes/test_frozen.py | 4 +- pandas/tests/indexes/test_numeric.py | 527 +- pandas/tests/indexes/test_numpy_compat.py | 58 +- pandas/tests/indexes/test_range.py | 426 +- pandas/tests/indexes/test_setops.py | 74 +- .../indexes/timedeltas/test_arithmetic.py | 187 +- .../tests/indexes/timedeltas/test_astype.py | 79 +- .../indexes/timedeltas/test_construction.py | 145 +- .../tests/indexes/timedeltas/test_formats.py | 108 +- .../tests/indexes/timedeltas/test_indexing.py | 265 +- pandas/tests/indexes/timedeltas/test_ops.py | 152 +- .../timedeltas/test_partial_slicing.py | 59 +- .../indexes/timedeltas/test_scalar_compat.py | 50 +- .../tests/indexes/timedeltas/test_setops.py | 98 +- .../indexes/timedeltas/test_timedelta.py | 189 +- .../timedeltas/test_timedelta_range.py | 51 +- pandas/tests/indexes/timedeltas/test_tools.py | 156 +- pandas/tests/indexing/common.py | 156 +- pandas/tests/indexing/conftest.py | 27 +- .../tests/indexing/interval/test_interval.py | 18 +- .../indexing/interval/test_interval_new.py | 35 +- pandas/tests/indexing/multiindex/conftest.py | 23 +- .../multiindex/test_chaining_and_caching.py | 29 +- .../indexing/multiindex/test_datetime.py | 8 +- .../tests/indexing/multiindex/test_getitem.py | 210 +- pandas/tests/indexing/multiindex/test_iloc.py | 82 +- .../indexing/multiindex/test_indexing_slow.py | 53 +- pandas/tests/indexing/multiindex/test_ix.py | 41 +- pandas/tests/indexing/multiindex/test_loc.py | 247 +- .../indexing/multiindex/test_multiindex.py | 82 +- .../tests/indexing/multiindex/test_partial.py | 127 +- .../tests/indexing/multiindex/test_set_ops.py | 25 +- .../tests/indexing/multiindex/test_setitem.py | 400 +- .../tests/indexing/multiindex/test_slice.py | 554 ++- .../tests/indexing/multiindex/test_sorted.py | 55 +- pandas/tests/indexing/multiindex/test_xs.py | 190 +- pandas/tests/indexing/test_callable.py | 167 +- pandas/tests/indexing/test_categorical.py | 515 +- .../indexing/test_chaining_and_caching.py | 268 +- pandas/tests/indexing/test_coercion.py | 891 ++-- pandas/tests/indexing/test_datetime.py | 238 +- pandas/tests/indexing/test_floats.py | 786 +-- pandas/tests/indexing/test_iloc.py | 435 +- pandas/tests/indexing/test_indexing.py | 929 ++-- .../tests/indexing/test_indexing_engines.py | 33 +- pandas/tests/indexing/test_indexing_slow.py | 5 +- pandas/tests/indexing/test_ix.py | 291 +- pandas/tests/indexing/test_loc.py | 882 ++-- 
pandas/tests/indexing/test_partial.py | 317 +- pandas/tests/indexing/test_scalar.py | 111 +- pandas/tests/indexing/test_timedelta.py | 101 +- pandas/tests/internals/test_internals.py | 934 ++-- pandas/tests/io/conftest.py | 34 +- pandas/tests/io/excel/conftest.py | 5 +- pandas/tests/io/excel/test_odf.py | 17 +- pandas/tests/io/excel/test_openpyxl.py | 98 +- pandas/tests/io/excel/test_readers.py | 767 +-- pandas/tests/io/excel/test_style.py | 143 +- pandas/tests/io/excel/test_writers.py | 745 +-- pandas/tests/io/excel/test_xlrd.py | 7 +- pandas/tests/io/excel/test_xlsxwriter.py | 21 +- pandas/tests/io/excel/test_xlwt.py | 33 +- pandas/tests/io/formats/test_console.py | 50 +- pandas/tests/io/formats/test_css.py | 297 +- .../tests/io/formats/test_eng_formatting.py | 165 +- pandas/tests/io/formats/test_format.py | 2682 +++++----- pandas/tests/io/formats/test_printing.py | 111 +- pandas/tests/io/formats/test_style.py | 1794 ++++--- pandas/tests/io/formats/test_to_csv.py | 477 +- pandas/tests/io/formats/test_to_excel.py | 471 +- pandas/tests/io/formats/test_to_html.py | 633 +-- pandas/tests/io/formats/test_to_latex.py | 212 +- .../tests/io/generate_legacy_storage_files.py | 404 +- pandas/tests/io/json/test_compression.py | 43 +- .../tests/io/json/test_json_table_schema.py | 852 ++-- pandas/tests/io/json/test_normalize.py | 774 +-- pandas/tests/io/json/test_pandas.py | 1314 +++-- pandas/tests/io/json/test_readlines.py | 81 +- pandas/tests/io/json/test_ujson.py | 512 +- pandas/tests/io/msgpack/test_buffer.py | 11 +- pandas/tests/io/msgpack/test_case.py | 96 +- pandas/tests/io/msgpack/test_except.py | 11 +- pandas/tests/io/msgpack/test_extension.py | 50 +- pandas/tests/io/msgpack/test_format.py | 101 +- pandas/tests/io/msgpack/test_limits.py | 15 +- pandas/tests/io/msgpack/test_newspec.py | 58 +- pandas/tests/io/msgpack/test_obj.py | 33 +- pandas/tests/io/msgpack/test_pack.py | 86 +- pandas/tests/io/msgpack/test_read_size.py | 42 +- pandas/tests/io/msgpack/test_seq.py | 2 +- pandas/tests/io/msgpack/test_sequnpack.py | 77 +- pandas/tests/io/msgpack/test_subtype.py | 2 +- pandas/tests/io/msgpack/test_unpack.py | 20 +- pandas/tests/io/msgpack/test_unpack_raw.py | 10 +- pandas/tests/io/parser/conftest.py | 12 +- pandas/tests/io/parser/test_c_parser_only.py | 289 +- pandas/tests/io/parser/test_comment.py | 41 +- pandas/tests/io/parser/test_common.py | 1056 ++-- pandas/tests/io/parser/test_compression.py | 22 +- pandas/tests/io/parser/test_converters.py | 42 +- pandas/tests/io/parser/test_dialect.py | 65 +- pandas/tests/io/parser/test_dtypes.py | 316 +- pandas/tests/io/parser/test_header.py | 375 +- pandas/tests/io/parser/test_index_col.py | 112 +- pandas/tests/io/parser/test_mangle_dupes.py | 89 +- pandas/tests/io/parser/test_multi_thread.py | 37 +- pandas/tests/io/parser/test_na_values.py | 384 +- pandas/tests/io/parser/test_network.py | 126 +- pandas/tests/io/parser/test_parse_dates.py | 1357 ++++-- .../io/parser/test_python_parser_only.py | 96 +- pandas/tests/io/parser/test_quoting.py | 88 +- pandas/tests/io/parser/test_read_fwf.py | 235 +- pandas/tests/io/parser/test_skiprows.py | 182 +- pandas/tests/io/parser/test_textreader.py | 259 +- pandas/tests/io/parser/test_unsupported.py | 47 +- pandas/tests/io/parser/test_usecols.py | 377 +- pandas/tests/io/pytables/test_compat.py | 24 +- pandas/tests/io/pytables/test_pytables.py | 4292 +++++++++-------- pandas/tests/io/sas/test_sas.py | 11 +- pandas/tests/io/sas/test_sas7bdat.py | 88 +- pandas/tests/io/sas/test_xport.py | 17 +- 
pandas/tests/io/test_clipboard.py | 191 +- pandas/tests/io/test_common.py | 267 +- pandas/tests/io/test_compression.py | 92 +- pandas/tests/io/test_date_converters.py | 13 +- pandas/tests/io/test_feather.py | 106 +- pandas/tests/io/test_gbq.py | 72 +- pandas/tests/io/test_gcs.py | 65 +- pandas/tests/io/test_html.py | 652 +-- pandas/tests/io/test_packers.py | 503 +- pandas/tests/io/test_parquet.py | 386 +- pandas/tests/io/test_pickle.py | 76 +- pandas/tests/io/test_s3.py | 8 +- pandas/tests/io/test_spss.py | 7 +- pandas/tests/io/test_sql.py | 1734 ++++--- pandas/tests/io/test_stata.py | 1482 +++--- pandas/tests/plotting/common.py | 133 +- pandas/tests/plotting/test_backend.py | 27 +- pandas/tests/plotting/test_boxplot_method.py | 290 +- pandas/tests/plotting/test_converter.py | 154 +- pandas/tests/plotting/test_datetimelike.py | 607 +-- pandas/tests/plotting/test_frame.py | 1817 +++---- pandas/tests/plotting/test_groupby.py | 38 +- pandas/tests/plotting/test_hist_method.py | 155 +- pandas/tests/plotting/test_misc.py | 354 +- pandas/tests/plotting/test_series.py | 329 +- pandas/tests/reductions/test_reductions.py | 526 +- .../tests/reductions/test_stat_reductions.py | 102 +- pandas/tests/resample/conftest.py | 36 +- pandas/tests/resample/test_base.py | 95 +- pandas/tests/resample/test_datetime_index.py | 1277 ++--- pandas/tests/resample/test_period_index.py | 815 ++-- pandas/tests/resample/test_resample_api.py | 505 +- .../tests/resample/test_resampler_grouper.py | 238 +- pandas/tests/resample/test_time_grouper.py | 241 +- pandas/tests/resample/test_timedelta.py | 110 +- pandas/tests/reshape/merge/test_join.py | 755 +-- pandas/tests/reshape/merge/test_merge.py | 2257 +++++---- pandas/tests/reshape/merge/test_merge_asof.py | 1499 +++--- .../merge/test_merge_index_as_string.py | 99 +- .../tests/reshape/merge/test_merge_ordered.py | 100 +- pandas/tests/reshape/merge/test_multi.py | 914 ++-- pandas/tests/reshape/test_concat.py | 1973 ++++---- pandas/tests/reshape/test_cut.py | 366 +- pandas/tests/reshape/test_melt.py | 1271 +++-- pandas/tests/reshape/test_pivot.py | 2829 ++++++----- pandas/tests/reshape/test_qcut.py | 131 +- pandas/tests/reshape/test_reshape.py | 568 +-- .../tests/reshape/test_union_categoricals.py | 228 +- pandas/tests/reshape/test_util.py | 16 +- pandas/tests/scalar/interval/test_interval.py | 117 +- pandas/tests/scalar/interval/test_ops.py | 26 +- pandas/tests/scalar/period/test_asfreq.py | 1140 ++--- pandas/tests/scalar/period/test_period.py | 1170 ++--- pandas/tests/scalar/test_nat.py | 263 +- .../tests/scalar/timedelta/test_arithmetic.py | 243 +- .../scalar/timedelta/test_construction.py | 286 +- pandas/tests/scalar/timedelta/test_formats.py | 49 +- .../tests/scalar/timedelta/test_timedelta.py | 649 +-- .../tests/scalar/timestamp/test_arithmetic.py | 56 +- .../scalar/timestamp/test_comparisons.py | 41 +- .../tests/scalar/timestamp/test_rendering.py | 38 +- .../tests/scalar/timestamp/test_timestamp.py | 588 ++- .../tests/scalar/timestamp/test_timezones.py | 330 +- .../tests/scalar/timestamp/test_unary_ops.py | 297 +- pandas/tests/series/common.py | 7 +- pandas/tests/series/conftest.py | 6 +- pandas/tests/series/indexing/conftest.py | 2 +- .../tests/series/indexing/test_alter_index.py | 248 +- pandas/tests/series/indexing/test_boolean.py | 185 +- pandas/tests/series/indexing/test_callable.py | 16 +- pandas/tests/series/indexing/test_datetime.py | 309 +- pandas/tests/series/indexing/test_indexing.py | 323 +- pandas/tests/series/indexing/test_loc.py | 32 +- 
pandas/tests/series/indexing/test_numeric.py | 155 +- pandas/tests/series/test_alter_axes.py | 210 +- pandas/tests/series/test_analytics.py | 938 ++-- pandas/tests/series/test_api.py | 300 +- pandas/tests/series/test_apply.py | 562 ++- pandas/tests/series/test_arithmetic.py | 65 +- pandas/tests/series/test_asof.py | 58 +- pandas/tests/series/test_block_internals.py | 12 +- pandas/tests/series/test_combine_concat.py | 284 +- pandas/tests/series/test_constructors.py | 774 +-- pandas/tests/series/test_datetime_values.py | 539 ++- pandas/tests/series/test_dtypes.py | 313 +- pandas/tests/series/test_duplicates.py | 82 +- pandas/tests/series/test_internals.py | 112 +- pandas/tests/series/test_io.py | 118 +- pandas/tests/series/test_missing.py | 1194 +++-- pandas/tests/series/test_operators.py | 327 +- pandas/tests/series/test_period.py | 126 +- pandas/tests/series/test_quantile.py | 122 +- pandas/tests/series/test_rank.py | 463 +- pandas/tests/series/test_replace.py | 140 +- pandas/tests/series/test_repr.py | 150 +- pandas/tests/series/test_sorting.py | 100 +- pandas/tests/series/test_subclass.py | 59 +- pandas/tests/series/test_timeseries.py | 574 ++- pandas/tests/series/test_timezones.py | 235 +- pandas/tests/series/test_ufunc.py | 121 +- pandas/tests/series/test_validate.py | 9 +- pandas/tests/sparse/frame/conftest.py | 33 +- pandas/tests/sparse/frame/test_analytics.py | 4 +- pandas/tests/sparse/frame/test_apply.py | 39 +- pandas/tests/sparse/frame/test_frame.py | 1043 ++-- pandas/tests/sparse/frame/test_indexing.py | 74 +- pandas/tests/sparse/frame/test_to_csv.py | 9 +- .../tests/sparse/frame/test_to_from_scipy.py | 60 +- pandas/tests/sparse/series/test_indexing.py | 82 +- pandas/tests/sparse/series/test_series.py | 783 +-- pandas/tests/sparse/test_combine_concat.py | 230 +- pandas/tests/sparse/test_format.py | 125 +- pandas/tests/sparse/test_groupby.py | 48 +- pandas/tests/sparse/test_indexing.py | 751 ++- pandas/tests/sparse/test_pivot.py | 68 +- pandas/tests/sparse/test_reshape.py | 4 +- pandas/tests/test_algos.py | 1361 ++++-- pandas/tests/test_base.py | 778 +-- pandas/tests/test_common.py | 57 +- pandas/tests/test_downstream.py | 36 +- pandas/tests/test_errors.py | 24 +- pandas/tests/test_expressions.py | 263 +- pandas/tests/test_join.py | 196 +- pandas/tests/test_lib.py | 37 +- pandas/tests/test_multilevel.py | 1638 ++++--- pandas/tests/test_nanops.py | 815 ++-- pandas/tests/test_optional_dependency.py | 10 +- pandas/tests/test_register_accessor.py | 46 +- pandas/tests/test_sorting.py | 265 +- pandas/tests/test_strings.py | 2511 +++++----- pandas/tests/test_take.py | 203 +- pandas/tests/test_window.py | 3378 +++++++------ pandas/tests/tools/test_numeric.py | 371 +- .../tseries/frequencies/test_freq_code.py | 184 +- .../tseries/frequencies/test_inference.py | 324 +- .../tseries/frequencies/test_to_offset.py | 184 +- pandas/tests/tseries/holiday/test_calendar.py | 38 +- pandas/tests/tseries/holiday/test_federal.py | 34 +- pandas/tests/tseries/holiday/test_holiday.py | 289 +- .../tests/tseries/holiday/test_observance.py | 70 +- pandas/tests/tseries/offsets/common.py | 14 +- pandas/tests/tseries/offsets/conftest.py | 10 +- pandas/tests/tseries/offsets/test_fiscal.py | 613 +-- pandas/tests/tseries/offsets/test_offsets.py | 4208 +++++++++------- .../offsets/test_offsets_properties.py | 68 +- pandas/tests/tseries/offsets/test_ticks.py | 174 +- .../tests/tseries/offsets/test_yqm_offsets.py | 1624 ++++--- pandas/tests/tslibs/test_api.py | 66 +- pandas/tests/tslibs/test_array_to_datetime.py | 
101 +- pandas/tests/tslibs/test_ccalendar.py | 15 +- pandas/tests/tslibs/test_conversion.py | 41 +- pandas/tests/tslibs/test_libfrequencies.py | 146 +- pandas/tests/tslibs/test_liboffsets.py | 162 +- pandas/tests/tslibs/test_normalize_date.py | 25 +- pandas/tests/tslibs/test_parse_iso8601.py | 76 +- pandas/tests/tslibs/test_parsing.py | 194 +- pandas/tests/tslibs/test_period_asfreq.py | 125 +- pandas/tests/tslibs/test_timedeltas.py | 21 +- pandas/tests/tslibs/test_timezones.py | 30 +- pandas/tests/util/test_assert_almost_equal.py | 176 +- .../util/test_assert_categorical_equal.py | 12 +- .../util/test_assert_extension_array_equal.py | 25 +- pandas/tests/util/test_assert_frame_equal.py | 124 +- pandas/tests/util/test_assert_index_equal.py | 35 +- .../util/test_assert_interval_array_equal.py | 13 +- .../util/test_assert_numpy_array_equal.py | 46 +- .../util/test_assert_produces_warning.py | 9 +- pandas/tests/util/test_assert_series_equal.py | 75 +- pandas/tests/util/test_deprecate.py | 28 +- pandas/tests/util/test_deprecate_kwarg.py | 6 +- pandas/tests/util/test_hashing.py | 176 +- pandas/tests/util/test_move.py | 1 + pandas/tests/util/test_safe_import.py | 13 +- pandas/tests/util/test_util.py | 5 +- pandas/tests/util/test_validate_args.py | 27 +- .../util/test_validate_args_and_kwargs.py | 56 +- pandas/tests/util/test_validate_kwargs.py | 18 +- pandas/tseries/converter.py | 24 +- pandas/tseries/frequencies.py | 155 +- pandas/tseries/holiday.py | 131 +- pandas/tseries/offsets.py | 936 ++-- pandas/util/__init__.py | 3 +- pandas/util/_decorators.py | 106 +- pandas/util/_depr_module.py | 32 +- pandas/util/_doctools.py | 72 +- pandas/util/_print_versions.py | 85 +- pandas/util/_test_decorators.py | 89 +- pandas/util/_tester.py | 6 +- pandas/util/_validators.py | 96 +- pandas/util/testing.py | 1176 +++-- scripts/download_wheels.py | 20 +- scripts/find_commits_touching_func.py | 131 +- scripts/generate_pip_deps_from_conda.py | 61 +- scripts/merge-pr.py | 146 +- scripts/tests/conftest.py | 7 +- scripts/tests/test_validate_docstrings.py | 687 ++- scripts/validate_docstrings.py | 702 +-- setup.py | 821 ++-- versioneer.py | 213 +- 748 files changed, 126206 insertions(+), 97282 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index b69efb4689486..436093ef195ef 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.util import testing as tm -for imp in ['pandas.util', 'pandas.tools.hashing']: +for imp in ["pandas.util", "pandas.tools.hashing"]: try: hashing = import_module(imp) break @@ -15,15 +15,17 @@ class Factorize: - params = [[True, False], ['int', 'uint', 'float', 'string']] - param_names = ['sort', 'dtype'] + params = [[True, False], ["int", "uint", "float", "string"]] + param_names = ["sort", "dtype"] def setup(self, sort, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N).repeat(5)), - 'uint': pd.UInt64Index(np.arange(N).repeat(5)), - 'float': pd.Float64Index(np.random.randn(N).repeat(5)), - 'string': tm.makeStringIndex(N).repeat(5)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N).repeat(5)), + "uint": pd.UInt64Index(np.arange(N).repeat(5)), + "float": pd.Float64Index(np.random.randn(N).repeat(5)), + "string": tm.makeStringIndex(N).repeat(5), + } self.idx = data[dtype] def time_factorize(self, sort, dtype): @@ -32,15 +34,17 @@ def time_factorize(self, sort, dtype): class FactorizeUnique: - params = [[True, False], ['int', 'uint', 'float', 
'string']] - param_names = ['sort', 'dtype'] + params = [[True, False], ["int", "uint", "float", "string"]] + param_names = ["sort", "dtype"] def setup(self, sort, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N)), - 'uint': pd.UInt64Index(np.arange(N)), - 'float': pd.Float64Index(np.arange(N)), - 'string': tm.makeStringIndex(N)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N)), + "uint": pd.UInt64Index(np.arange(N)), + "float": pd.Float64Index(np.arange(N)), + "string": tm.makeStringIndex(N), + } self.idx = data[dtype] assert self.idx.is_unique @@ -50,15 +54,17 @@ def time_factorize(self, sort, dtype): class Duplicated: - params = [['first', 'last', False], ['int', 'uint', 'float', 'string']] - param_names = ['keep', 'dtype'] + params = [["first", "last", False], ["int", "uint", "float", "string"]] + param_names = ["keep", "dtype"] def setup(self, keep, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N).repeat(5)), - 'uint': pd.UInt64Index(np.arange(N).repeat(5)), - 'float': pd.Float64Index(np.random.randn(N).repeat(5)), - 'string': tm.makeStringIndex(N).repeat(5)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N).repeat(5)), + "uint": pd.UInt64Index(np.arange(N).repeat(5)), + "float": pd.Float64Index(np.random.randn(N).repeat(5)), + "string": tm.makeStringIndex(N).repeat(5), + } self.idx = data[dtype] # cache is_unique self.idx.is_unique @@ -69,15 +75,17 @@ def time_duplicated(self, keep, dtype): class DuplicatedUniqueIndex: - params = ['int', 'uint', 'float', 'string'] - param_names = ['dtype'] + params = ["int", "uint", "float", "string"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**5 - data = {'int': pd.Int64Index(np.arange(N)), - 'uint': pd.UInt64Index(np.arange(N)), - 'float': pd.Float64Index(np.random.randn(N)), - 'string': tm.makeStringIndex(N)} + N = 10 ** 5 + data = { + "int": pd.Int64Index(np.arange(N)), + "uint": pd.UInt64Index(np.arange(N)), + "float": pd.Float64Index(np.random.randn(N)), + "string": tm.makeStringIndex(N), + } self.idx = data[dtype] # cache is_unique self.idx.is_unique @@ -87,18 +95,21 @@ def time_duplicated_unique(self, dtype): class Hashing: - def setup_cache(self): - N = 10**5 + N = 10 ** 5 df = pd.DataFrame( - {'strings': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'floats': np.random.randn(N), - 'ints': np.arange(N), - 'dates': pd.date_range('20110101', freq='s', periods=N), - 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) - df['categories'] = df['strings'].astype('category') + { + "strings": pd.Series( + tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N)) + ), + "floats": np.random.randn(N), + "ints": np.arange(N), + "dates": pd.date_range("20110101", freq="s", periods=N), + "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N), + } + ) + df["categories"] = df["strings"].astype("category") df.iloc[10:20] = np.nan return df @@ -106,35 +117,39 @@ def time_frame(self, df): hashing.hash_pandas_object(df) def time_series_int(self, df): - hashing.hash_pandas_object(df['ints']) + hashing.hash_pandas_object(df["ints"]) def time_series_string(self, df): - hashing.hash_pandas_object(df['strings']) + hashing.hash_pandas_object(df["strings"]) def time_series_float(self, df): - hashing.hash_pandas_object(df['floats']) + hashing.hash_pandas_object(df["floats"]) def time_series_categorical(self, df): - hashing.hash_pandas_object(df['categories']) + hashing.hash_pandas_object(df["categories"]) def 
time_series_timedeltas(self, df): - hashing.hash_pandas_object(df['timedeltas']) + hashing.hash_pandas_object(df["timedeltas"]) def time_series_dates(self, df): - hashing.hash_pandas_object(df['dates']) + hashing.hash_pandas_object(df["dates"]) class Quantile: - params = [[0, 0.5, 1], - ['linear', 'nearest', 'lower', 'higher', 'midpoint'], - ['float', 'int', 'uint']] - param_names = ['quantile', 'interpolation', 'dtype'] + params = [ + [0, 0.5, 1], + ["linear", "nearest", "lower", "higher", "midpoint"], + ["float", "int", "uint"], + ] + param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): - N = 10**5 - data = {'int': np.arange(N), - 'uint': np.arange(N).astype(np.uint64), - 'float': np.random.randn(N)} + N = 10 ** 5 + data = { + "int": np.arange(N), + "uint": np.arange(N).astype(np.uint64), + "float": np.random.randn(N), + } self.idx = pd.Series(data[dtype].repeat(5)) def time_quantile(self, quantile, interpolation, dtype): @@ -142,12 +157,12 @@ def time_quantile(self, quantile, interpolation, dtype): class SortIntegerArray: - params = [10**3, 10**5] + params = [10 ** 3, 10 ** 5] def setup(self, N): data = np.arange(N, dtype=float) data[40] = np.nan - self.array = pd.array(data, dtype='Int64') + self.array = pd.array(data, dtype="Int64") def time_argsort(self, N): self.array.argsort() diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index dd316a2bc88d0..c43e5dfd729aa 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,5 +1,6 @@ import numpy as np from pandas import DataFrame + try: from pandas.util import cache_readonly except ImportError: @@ -7,7 +8,6 @@ class DataFrameAttributes: - def setup(self): self.df = DataFrame(np.random.randn(10, 6)) self.cur_index = self.df.index @@ -20,14 +20,12 @@ def time_set_index(self): class CacheReadonly: - def setup(self): - class Foo: - @cache_readonly def prop(self): return 5 + self.obj = Foo() def time_cache_readonly(self): diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 26cd66284c41e..fd3324b78f1c3 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,6 +1,7 @@ import numpy as np from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr + try: import pandas.core.computation.expressions as expr except ImportError: @@ -9,14 +10,14 @@ class Ops: - params = [[True, False], ['default', 1]] - param_names = ['use_numexpr', 'threads'] + params = [[True, False], ["default", 1]] + param_names = ["use_numexpr", "threads"] def setup(self, use_numexpr, threads): self.df = DataFrame(np.random.randn(20000, 100)) self.df2 = DataFrame(np.random.randn(20000, 100)) - if threads != 'default': + if threads != "default": expr.set_numexpr_threads(threads) if not use_numexpr: expr.set_use_numexpr(False) @@ -39,18 +40,21 @@ def teardown(self, use_numexpr, threads): class Ops2: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N, N)) self.df2 = DataFrame(np.random.randn(N, N)) - self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(N, N))) - self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(N, N))) + self.df_int = DataFrame( + np.random.randint( + np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N) + ) + ) + self.df2_int = DataFrame( + np.random.randint( + 
np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N) + ) + ) self.s = Series(np.random.randn(N)) @@ -90,16 +94,16 @@ def time_frame_series_dot(self): class Timeseries: - params = [None, 'US/Eastern'] - param_names = ['tz'] + params = [None, "US/Eastern"] + param_names = ["tz"] def setup(self, tz): - N = 10**6 + N = 10 ** 6 halfway = (N // 2) - 1 - self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) self.ts = self.s[halfway] - self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) + self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) def time_series_timestamp_compare(self, tz): self.s <= self.ts @@ -117,10 +121,10 @@ def time_timestamp_ops_diff_with_shift(self, tz): class AddOverflowScalar: params = [1, -1, 0] - param_names = ['scalar'] + param_names = ["scalar"] def setup(self, scalar): - N = 10**6 + N = 10 ** 6 self.arr = np.arange(N) def time_add_overflow_scalar(self, scalar): @@ -128,9 +132,8 @@ def time_add_overflow_scalar(self, scalar): class AddOverflowArray: - def setup(self): - N = 10**6 + N = 10 ** 6 self.arr = np.arange(N) self.arr_rev = np.arange(-N, 0) self.arr_mixed = np.array([1, -1]).repeat(N / 2) @@ -144,12 +147,12 @@ def time_add_overflow_arr_mask_nan(self): checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, - b_mask=self.arr_nan_1) + checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) def time_add_overflow_both_arg_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, - b_mask=self.arr_nan_2) + checked_add_with_arr( + self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 + ) from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index f1afca5941fe5..933946b1ca1ac 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pandas.util.testing as tm + try: from pandas.api.types import union_categoricals except ImportError: @@ -11,13 +12,12 @@ class Concat: - def setup(self): - N = 10**5 - self.s = pd.Series(list('aabbcd') * N).astype('category') + N = 10 ** 5 + self.s = pd.Series(list("aabbcd") * N).astype("category") - self.a = pd.Categorical(list('aabbcd') * N) - self.b = pd.Categorical(list('bbcdjk') * N) + self.a = pd.Categorical(list("aabbcd") * N) + self.b = pd.Categorical(list("bbcdjk") * N) def time_concat(self): pd.concat([self.s, self.s]) @@ -27,23 +27,22 @@ def time_union(self): class Constructor: - def setup(self): - N = 10**5 - self.categories = list('abcde') + N = 10 ** 5 + self.categories = list("abcde") self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) self.codes = np.tile(range(len(self.categories)), N) - self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', - periods=N / 10, - freq='s')) + self.datetimes = pd.Series( + pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s") + ) self.datetimes_with_nat = self.datetimes.copy() self.datetimes_with_nat.iloc[-1] = pd.NaT self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) - self.values_all_int8 = np.ones(N, 'int8') + self.values_all_int8 = np.ones(N, "int8") self.categorical = pd.Categorical(self.values, self.categories) 
self.series = pd.Series(self.categorical) @@ -78,62 +77,55 @@ def time_existing_series(self): class ValueCounts: params = [True, False] - param_names = ['dropna'] + param_names = ["dropna"] def setup(self, dropna): - n = 5 * 10**5 - arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_value_counts(self, dropna): self.ts.value_counts(dropna=dropna) class Repr: - def setup(self): - self.sel = pd.Series(['s1234']).astype('category') + self.sel = pd.Series(["s1234"]).astype("category") def time_rendering(self): str(self.sel) class SetCategories: - def setup(self): - n = 5 * 10**5 - arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) class RemoveCategories: - def setup(self): - n = 5 * 10**5 - arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10, - size=n)] - self.ts = pd.Series(arr).astype('category') + n = 5 * 10 ** 5 + arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype("category") def time_remove_categories(self): self.ts.cat.remove_categories(self.ts.cat.categories[::2]) class Rank: - def setup(self): - N = 10**5 + N = 10 ** 5 ncats = 100 self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) - self.s_str_cat = self.s_str.astype('category') + self.s_str_cat = self.s_str.astype("category") self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered() self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) - self.s_int_cat = self.s_int.astype('category') + self.s_int_cat = self.s_int.astype("category") self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered() def time_rank_string(self): @@ -157,28 +149,27 @@ def time_rank_int_cat_ordered(self): class Isin: - params = ['object', 'int64'] - param_names = ['dtype'] + params = ["object", "int64"] + param_names = ["dtype"] def setup(self, dtype): np.random.seed(1234) - n = 5 * 10**5 + n = 5 * 10 ** 5 sample_size = 100 arr = [i for i in np.random.randint(0, n // 10, size=n)] - if dtype == 'object': - arr = ['s{:04d}'.format(i) for i in arr] + if dtype == "object": + arr = ["s{:04d}".format(i) for i in arr] self.sample = np.random.choice(arr, sample_size) - self.series = pd.Series(arr).astype('category') + self.series = pd.Series(arr).astype("category") def time_isin_categorical(self, dtype): self.series.isin(self.sample) class IsMonotonic: - def setup(self): N = 1000 - self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N)) + self.c = pd.CategoricalIndex(list("a" * N + "b" * N + "c" * N)) self.s = pd.Series(self.c) def time_categorical_index_is_monotonic_increasing(self): @@ -195,9 +186,8 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains: - def setup(self): - N = 10**5 + N = 10 ** 5 self.ci = tm.makeCategoricalIndex(N) self.c = self.ci.values self.key = self.ci.categories[0] @@ -211,34 +201,33 @@ def time_categorical_contains(self): class CategoricalSlicing: - params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] - param_names = ['index'] + params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] + 
param_names = ["index"] def setup(self, index): - N = 10**6 - categories = ['a', 'b', 'c'] + N = 10 ** 6 + categories = ["a", "b", "c"] values = [0] * N + [1] * N + [2] * N - if index == 'monotonic_incr': - self.data = pd.Categorical.from_codes(values, - categories=categories) - elif index == 'monotonic_decr': - self.data = pd.Categorical.from_codes(list(reversed(values)), - categories=categories) - elif index == 'non_monotonic': - self.data = pd.Categorical.from_codes([0, 1, 2] * N, - categories=categories) + if index == "monotonic_incr": + self.data = pd.Categorical.from_codes(values, categories=categories) + elif index == "monotonic_decr": + self.data = pd.Categorical.from_codes( + list(reversed(values)), categories=categories + ) + elif index == "non_monotonic": + self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) else: - raise ValueError('Invalid index param: {}'.format(index)) + raise ValueError("Invalid index param: {}".format(index)) self.scalar = 10000 self.list = list(range(10000)) - self.cat_scalar = 'b' + self.cat_scalar = "b" def time_getitem_scalar(self, index): self.data[self.scalar] def time_getitem_slice(self, index): - self.data[:self.scalar] + self.data[: self.scalar] def time_getitem_list_like(self, index): self.data[[self.scalar]] @@ -251,9 +240,8 @@ def time_getitem_bool_array(self, index): class Indexing: - def setup(self): - N = 10**5 + N = 10 ** 5 self.index = pd.CategoricalIndex(range(N), range(N)) self.series = pd.Series(range(N), index=self.index).sort_index() self.category = self.index[500] @@ -268,7 +256,7 @@ def time_shallow_copy(self): self.index._shallow_copy() def time_align(self): - pd.DataFrame({'a': self.series, 'b': self.series[:500]}) + pd.DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index[:750].intersection(self.index[250:]) @@ -280,7 +268,7 @@ def time_reindex(self): self.index.reindex(self.index[:500]) def time_reindex_missing(self): - self.index.reindex(['a', 'b', 'c', 'd']) + self.index.reindex(["a", "b", "c", "d"]) def time_sort_values(self): self.index.sort_values(ascending=False) diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 42adede631a01..654075292cdf6 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -42,18 +42,22 @@ def list_of_lists_with_none(arr): class SeriesConstructors: param_names = ["data_fmt", "with_index", "dtype"] - params = [[no_change, - list, - list_of_str, - gen_of_str, - arr_dict, - list_of_tuples, - gen_of_tuples, - list_of_lists, - list_of_tuples_with_none, - list_of_lists_with_none], - [False, True], - ['float', 'int']] + params = [ + [ + no_change, + list, + list_of_str, + gen_of_str, + arr_dict, + list_of_tuples, + gen_of_tuples, + list_of_lists, + list_of_tuples_with_none, + list_of_lists_with_none, + ], + [False, True], + ["float", "int"], + ] # Generators get exhausted on use, so run setup before every call number = 1 @@ -61,10 +65,11 @@ class SeriesConstructors: def setup(self, data_fmt, with_index, dtype): if data_fmt in (gen_of_str, gen_of_tuples) and with_index: - raise NotImplementedError('Series constructors do not support ' - 'using generators with indexes') - N = 10**4 - if dtype == 'float': + raise NotImplementedError( + "Series constructors do not support " "using generators with indexes" + ) + N = 10 ** 4 + if dtype == "float": arr = np.random.randn(N) else: arr = np.arange(N) @@ -76,13 +81,15 @@ def time_series_constructor(self, data_fmt, with_index, dtype): class 
SeriesDtypesConstructors: - def setup(self): - N = 10**4 + N = 10 ** 4 self.arr = np.random.randn(N) - self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) - self.s = Series([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')] * N * 10) + self.arr_str = np.array(["foo", "bar", "baz"], dtype=object) + self.s = Series( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + * N + * 10 + ) def time_index_from_array_string(self): Index(self.arr_str) @@ -98,9 +105,8 @@ def time_dtindex_from_index_with_series(self): class MultiIndexConstructor: - def setup(self): - N = 10**4 + N = 10 ** 4 self.iterables = [tm.makeStringIndex(N), range(20)] def time_multiindex_from_iterables(self): diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 9bfaaa8696009..60800b1f9cae7 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -2,32 +2,36 @@ import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, string_dtypes, extension_dtypes) + numeric_dtypes, + datetime_dtypes, + string_dtypes, + extension_dtypes, +) -_numpy_dtypes = [np.dtype(dtype) - for dtype in (numeric_dtypes + - datetime_dtypes + - string_dtypes)] +_numpy_dtypes = [ + np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) +] _dtypes = _numpy_dtypes + extension_dtypes class Dtypes: - params = (_dtypes + - list(map(lambda dt: dt.name, _dtypes))) - param_names = ['dtype'] + params = _dtypes + list(map(lambda dt: dt.name, _dtypes)) + param_names = ["dtype"] def time_pandas_dtype(self, dtype): pandas_dtype(dtype) class DtypesInvalid: - param_names = ['dtype'] - params = ['scalar-string', 'scalar-int', 'list-string', 'array-string'] - data_dict = {'scalar-string': 'foo', - 'scalar-int': 1, - 'list-string': ['foo'] * 1000, - 'array-string': np.array(['foo'] * 1000)} + param_names = ["dtype"] + params = ["scalar-string", "scalar-int", "list-string", "array-string"] + data_dict = { + "scalar-string": "foo", + "scalar-int": 1, + "list-string": ["foo"] * 1000, + "array-string": np.array(["foo"] * 1000), + } def time_pandas_dtype_invalid(self, dtype): try: diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index be47d35f2cad1..84e94315cc28b 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd + try: import pandas.core.computation.expressions as expr except ImportError: @@ -8,8 +9,8 @@ class Eval: - params = [['numexpr', 'python'], [1, 'all']] - param_names = ['engine', 'threads'] + params = [["numexpr", "python"], [1, "all"]] + param_names = ["engine", "threads"] def setup(self, engine, threads): self.df = pd.DataFrame(np.random.randn(20000, 100)) @@ -21,44 +22,44 @@ def setup(self, engine, threads): expr.set_numexpr_threads(1) def time_add(self, engine, threads): - pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) + pd.eval("self.df + self.df2 + self.df3 + self.df4", engine=engine) def time_and(self, engine, threads): - pd.eval('(self.df > 0) & (self.df2 > 0) & ' - '(self.df3 > 0) & (self.df4 > 0)', engine=engine) + pd.eval( + "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)", + engine=engine, + ) def time_chained_cmp(self, engine, threads): - pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) + pd.eval("self.df < self.df2 < self.df3 < self.df4", engine=engine) def time_mult(self, engine, threads): - pd.eval('self.df * self.df2 * self.df3 * self.df4', 
engine=engine) + pd.eval("self.df * self.df2 * self.df3 * self.df4", engine=engine) def teardown(self, engine, threads): expr.set_numexpr_threads() class Query: - def setup(self): - N = 10**6 + N = 10 ** 6 halfway = (N // 2) - 1 - index = pd.date_range('20010101', periods=N, freq='T') + index = pd.date_range("20010101", periods=N, freq="T") s = pd.Series(index) self.ts = s.iloc[halfway] - self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': index}, - index=index) + self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index) data = np.random.randn(N) self.min_val = data.min() self.max_val = data.max() def time_query_datetime_index(self): - self.df.query('index < @self.ts') + self.df.query("index < @self.ts") def time_query_datetime_column(self): - self.df.query('dates < @self.ts') + self.df.query("dates < @self.ts") def time_query_with_boolean_selection(self): - self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') + self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 9533938b30fac..acfb26bcf5d7c 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,25 +1,23 @@ import numpy as np import pandas.util.testing as tm from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range + try: from pandas.tseries.offsets import Nano, Hour except ImportError: # For compatibility with older versions - from pandas.core.datetools import * # noqa + from pandas.core.datetools import * # noqa class FromDicts: - def setup(self): N, K = 5000, 50 self.index = tm.makeStringIndex(N) self.columns = tm.makeStringIndex(K) - frame = DataFrame(np.random.randn(N, K), index=self.index, - columns=self.columns) + frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) self.data = frame.to_dict() - self.dict_list = frame.to_dict(orient='records') - self.data2 = {i: {j: float(j) for j in range(100)} - for i in range(2000)} + self.dict_list = frame.to_dict(orient="records") + self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} def time_list_of_dict(self): DataFrame(self.dict_list) @@ -42,7 +40,6 @@ def time_nested_dict_int64(self): class FromSeries: - def setup(self): mi = MultiIndex.from_product([range(100), range(100)]) self.s = Series(np.random.randn(10000), index=mi) @@ -54,12 +51,12 @@ def time_mi_series(self): class FromDictwithTimestamp: params = [Nano(1), Hour(1)] - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): - N = 10**3 + N = 10 ** 3 np.random.seed(1234) - idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N) + idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N) df = DataFrame(np.random.randn(N, 10), index=idx) self.d = df.to_dict() @@ -70,7 +67,7 @@ def time_dict_with_timestamp_offsets(self, offset): class FromRecords: params = [None, 1000] - param_names = ['nrows'] + param_names = ["nrows"] # Generators get exhausted on use, so run setup before every call number = 1 @@ -86,7 +83,6 @@ def time_frame_from_records_generator(self, nrows): class FromNDArray: - def setup(self): N = 100000 self.data = np.random.randn(N) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5b76eeba115a4..af4741f94d294 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -2,17 +2,15 @@ import numpy as np -from pandas import ( - 
DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range) +from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range import pandas.util.testing as tm class GetNumericData: - def setup(self): self.df = DataFrame(np.random.randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' + self.df["foo"] = "bar" + self.df["bar"] = "baz" self.df = self.df._consolidate() def time_frame_get_numeric_data(self): @@ -20,17 +18,17 @@ def time_frame_get_numeric_data(self): class Lookup: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), - columns=list('abcdefgh')) - self.df['foo'] = 'bar' + self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh")) + self.df["foo"] = "bar" self.row_labels = list(self.df.index[::10])[:900] self.col_labels = list(self.df.columns) * 100 self.row_labels_all = np.array( - list(self.df.index) * len(self.df.columns), dtype='object') + list(self.df.index) * len(self.df.columns), dtype="object" + ) self.col_labels_all = np.array( - list(self.df.columns) * len(self.df.index), dtype='object') + list(self.df.columns) * len(self.df.index), dtype="object" + ) def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) @@ -40,17 +38,21 @@ def time_frame_fancy_lookup_all(self): class Reindex: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - {c: {0: np.random.randint(0, 2, N).astype(np.bool_), - 1: np.random.randint(0, N, N).astype(np.int16), - 2: np.random.randint(0, N, N).astype(np.int32), - 3: np.random.randint(0, N, N).astype(np.int64)} - [np.random.randint(0, 4)] for c in range(N)}) + { + c: { + 0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64), + }[np.random.randint(0, 4)] + for c in range(N) + } + ) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -66,18 +68,22 @@ def time_reindex_upcast(self): class Rename: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df = DataFrame(np.random.randn(N * 10, N)) self.idx = np.arange(4 * N, 7 * N) self.dict_idx = {k: k for k in self.idx} self.df2 = DataFrame( - {c: {0: np.random.randint(0, 2, N).astype(np.bool_), - 1: np.random.randint(0, N, N).astype(np.int16), - 2: np.random.randint(0, N, N).astype(np.int32), - 3: np.random.randint(0, N, N).astype(np.int64)} - [np.random.randint(0, 4)] for c in range(N)}) + { + c: { + 0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64), + }[np.random.randint(0, 4)] + for c in range(N) + } + ) def time_rename_single(self): self.df.rename({0: 0}) @@ -103,13 +109,14 @@ def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) - self.df3 = DataFrame(np.random.randn(N, 5 * N), - columns=['C' + str(c) for c in range(N * 5)]) + self.df3 = DataFrame( + np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)] + ) self.df4 = DataFrame(np.random.randn(N * 1000, 10)) def time_iteritems(self): # (monitor no-copying behaviour) - if hasattr(self.df, '_item_cache'): + if hasattr(self.df, "_item_cache"): self.df._item_cache.clear() for name, col in self.df.iteritems(): pass @@ -192,7 +199,6 @@ def time_iterrows(self): class ToString: - def 
setup(self): self.df = DataFrame(np.random.randn(100, 10)) @@ -201,11 +207,10 @@ def time_to_string_floats(self): class ToHTML: - def setup(self): nrows = 500 self.df2 = DataFrame(np.random.randn(nrows, 10)) - self.df2[0] = period_range('2000', periods=nrows) + self.df2[0] = period_range("2000", periods=nrows) self.df2[1] = range(nrows) def time_to_html_mixed(self): @@ -213,7 +218,6 @@ def time_to_html_mixed(self): class Repr: - def setup(self): nrows = 10000 data = np.random.randn(nrows, 10) @@ -238,7 +242,6 @@ def time_frame_repr_wide(self): class MaskBool: - def setup(self): data = np.random.randn(1000, 500) df = DataFrame(data) @@ -254,9 +257,8 @@ def time_frame_mask_floats(self): class Isnull: - def setup(self): - N = 10**3 + N = 10 ** 3 self.df_no_null = DataFrame(np.random.randn(N, N)) sample = np.array([np.nan, 1.0]) @@ -267,8 +269,20 @@ def setup(self): data = np.random.choice(sample, (N, N)) self.df_strings = DataFrame(data) - sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + sample = np.array( + [ + NaT, + np.nan, + None, + np.datetime64("NaT"), + np.timedelta64("NaT"), + 0, + 1, + 2.0, + "", + "abcd", + ] + ) data = np.random.choice(sample, (N, N)) self.df_obj = DataFrame(data) @@ -287,8 +301,8 @@ def time_isnull_obj(self): class Fillna: - params = ([True, False], ['pad', 'bfill']) - param_names = ['inplace', 'method'] + params = ([True, False], ["pad", "bfill"]) + param_names = ["inplace", "method"] def setup(self, inplace, method): values = np.random.randn(10000, 100) @@ -301,8 +315,8 @@ def time_frame_fillna(self, inplace, method): class Dropna: - params = (['all', 'any'], [0, 1]) - param_names = ['how', 'axis'] + params = (["all", "any"], [0, 1]) + param_names = ["how", "axis"] def setup(self, how, axis): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -310,7 +324,7 @@ def setup(self, how, axis): self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() - self.df_mixed['foo'] = 'bar' + self.df_mixed["foo"] = "bar" def time_dropna(self, how, axis): self.df.dropna(how=how, axis=axis) @@ -322,7 +336,7 @@ def time_dropna_axis_mixed_dtypes(self, how, axis): class Count: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -330,15 +344,16 @@ def setup(self, axis): self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() - self.df_mixed['foo'] = 'bar' + self.df_mixed["foo"] = "bar" self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) - self.df.columns = MultiIndex.from_arrays([self.df.columns, - self.df.columns]) - self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, - self.df_mixed.index]) - self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, - self.df_mixed.columns]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays( + [self.df_mixed.index, self.df_mixed.index] + ) + self.df_mixed.columns = MultiIndex.from_arrays( + [self.df_mixed.columns, self.df_mixed.columns] + ) def time_count_level_multi(self, axis): self.df.count(axis=axis, level=1) @@ -348,13 +363,12 @@ def time_count_level_mixed_dtypes_multi(self, axis): class Apply: - def setup(self): self.df = DataFrame(np.random.randn(1000, 100)) self.s = Series(np.arange(1028.0)) self.df2 = DataFrame({i: self.s for i in range(1028)}) - self.df3 = DataFrame(np.random.randn(1000, 3), 
columns=list('ABC')) + self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_apply_user_func(self): self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)]) @@ -372,11 +386,10 @@ def time_apply_pass_thru(self): self.df.apply(lambda x: x) def time_apply_ref_by_name(self): - self.df3.apply(lambda x: x['A'] + x['B'], axis=1) + self.df3.apply(lambda x: x["A"] + x["B"], axis=1) class Dtypes: - def setup(self): self.df = DataFrame(np.random.randn(1000, 1000)) @@ -385,19 +398,18 @@ def time_frame_dtypes(self): class Equals: - def setup(self): - N = 10**3 + N = 10 ** 3 self.float_df = DataFrame(np.random.randn(N, N)) self.float_df_nan = self.float_df.copy() self.float_df_nan.iloc[-1, -1] = np.nan - self.object_df = DataFrame('foo', index=range(N), columns=range(N)) + self.object_df = DataFrame("foo", index=range(N), columns=range(N)) self.object_df_nan = self.object_df.copy() self.object_df_nan.iloc[-1, -1] = np.nan self.nonunique_cols = self.object_df.copy() - self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns) + self.nonunique_cols.columns = ["A"] * len(self.nonunique_cols.columns) self.nonunique_cols_nan = self.nonunique_cols.copy() self.nonunique_cols_nan.iloc[-1, -1] = np.nan @@ -422,8 +434,8 @@ def time_frame_object_unequal(self): class Interpolate: - params = [None, 'infer'] - param_names = ['downcast'] + params = [None, "infer"] + param_names = ["downcast"] def setup(self, downcast): N = 10000 @@ -431,12 +443,16 @@ def setup(self, downcast): self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame({'A': np.arange(0, N), - 'B': np.random.randint(0, 100, N), - 'C': np.random.randn(N), - 'D': np.random.randn(N)}) - self.df2.loc[1::5, 'A'] = np.nan - self.df2.loc[1::5, 'C'] = np.nan + self.df2 = DataFrame( + { + "A": np.arange(0, N), + "B": np.random.randint(0, 100, N), + "C": np.random.randn(N), + "D": np.random.randn(N), + } + ) + self.df2.loc[1::5, "A"] = np.nan + self.df2.loc[1::5, "C"] = np.nan def time_interpolate(self, downcast): self.df.interpolate(downcast=downcast) @@ -448,7 +464,7 @@ def time_interpolate_some_good(self, downcast): class Shift: # frame shift speedup issue-5609 params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): self.df = DataFrame(np.random.rand(10000, 500)) @@ -458,7 +474,6 @@ def time_shift(self, axis): class Nunique: - def setup(self): self.df = DataFrame(np.random.randn(10000, 1000)) @@ -467,14 +482,17 @@ def time_frame_nunique(self): class Duplicated: - def setup(self): - n = (1 << 20) - t = date_range('2015-01-01', freq='S', periods=(n // 64)) + n = 1 << 20 + t = date_range("2015-01-01", freq="S", periods=(n // 64)) xs = np.random.randn(n // 64).round(2) - self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), - 'b': np.random.choice(t, n), - 'c': np.random.choice(xs, n)}) + self.df = DataFrame( + { + "a": np.random.randint(-1 << 8, 1 << 8, n), + "b": np.random.choice(t, n), + "c": np.random.choice(xs, n), + } + ) self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): @@ -487,10 +505,10 @@ def time_frame_duplicated_wide(self): class XS: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): - self.N = 10**4 + self.N = 10 ** 4 self.df = DataFrame(np.random.randn(self.N, self.N)) def time_frame_xs(self, axis): @@ -500,35 +518,38 @@ def time_frame_xs(self, axis): class SortValues: params = [True, False] - param_names = ['ascending'] + param_names = ["ascending"] 
def setup(self, ascending): - self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) + self.df = DataFrame(np.random.randn(1000000, 2), columns=list("AB")) def time_frame_sort_values(self, ascending): - self.df.sort_values(by='A', ascending=ascending) + self.df.sort_values(by="A", ascending=ascending) class SortIndexByColumns: - def setup(self): N = 10000 K = 10 - self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), - 'key2': tm.makeStringIndex(N).values.repeat(K), - 'value': np.random.randn(N * K)}) + self.df = DataFrame( + { + "key1": tm.makeStringIndex(N).values.repeat(K), + "key2": tm.makeStringIndex(N).values.repeat(K), + "value": np.random.randn(N * K), + } + ) def time_frame_sort_values_by_columns(self): - self.df.sort_values(by=['key1', 'key2']) + self.df.sort_values(by=["key1", "key2"]) class Quantile: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + self.df = DataFrame(np.random.randn(1000, 3), columns=list("ABC")) def time_frame_quantile(self, axis): self.df.quantile([0.1, 0.5], axis=axis) @@ -548,37 +569,37 @@ def time_info(self): class NSort: - params = ['first', 'last', 'all'] - param_names = ['keep'] + params = ["first", "last", "all"] + param_names = ["keep"] def setup(self, keep): - self.df = DataFrame(np.random.randn(100000, 3), - columns=list('ABC')) + self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC")) def time_nlargest_one_column(self, keep): - self.df.nlargest(100, 'A', keep=keep) + self.df.nlargest(100, "A", keep=keep) def time_nlargest_two_columns(self, keep): - self.df.nlargest(100, ['A', 'B'], keep=keep) + self.df.nlargest(100, ["A", "B"], keep=keep) def time_nsmallest_one_column(self, keep): - self.df.nsmallest(100, 'A', keep=keep) + self.df.nsmallest(100, "A", keep=keep) def time_nsmallest_two_columns(self, keep): - self.df.nsmallest(100, ['A', 'B'], keep=keep) + self.df.nsmallest(100, ["A", "B"], keep=keep) class Describe: - def setup(self): - self.df = DataFrame({ - 'a': np.random.randint(0, 100, int(1e6)), - 'b': np.random.randint(0, 100, int(1e6)), - 'c': np.random.randint(0, 100, int(1e6)) - }) + self.df = DataFrame( + { + "a": np.random.randint(0, 100, int(1e6)), + "b": np.random.randint(0, 100, int(1e6)), + "c": np.random.randint(0, 100, int(1e6)), + } + ) def time_series_describe(self): - self.df['a'].describe() + self.df["a"].describe() def time_dataframe_describe(self): self.df.describe() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 65a03bfda48c5..0d0b75561d057 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -2,9 +2,19 @@ import pandas.util.testing as tm from pandas import DataFrame, Series, read_csv, factorize, date_range from pandas.core.algorithms import take_1d + try: - from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, - rolling_var, rolling_skew, rolling_kurt, rolling_std) + from pandas import ( + rolling_median, + rolling_mean, + rolling_min, + rolling_max, + rolling_var, + rolling_skew, + rolling_kurt, + rolling_std, + ) + have_rolling_methods = True except ImportError: have_rolling_methods = False @@ -14,6 +24,7 @@ from pandas import algos try: from pandas.util.testing import test_parallel + have_real_test_parallel = True except ImportError: have_real_test_parallel = False @@ -21,32 +32,36 @@ def test_parallel(num_threads=1): def wrapper(fname): return fname + return wrapper + from .pandas_vb_common 
import BaseIO class ParallelGroupbyMethods: - params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod', - 'sum', 'var']) - param_names = ['threads', 'method'] + params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"]) + param_names = ["threads", "method"] def setup(self, threads, method): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - ngroups = 10**3 - df = DataFrame({'key': np.random.randint(0, ngroups, size=N), - 'data': np.random.randn(N)}) + N = 10 ** 6 + ngroups = 10 ** 3 + df = DataFrame( + {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} + ) @test_parallel(num_threads=threads) def parallel(): - getattr(df.groupby('key')['data'], method)() + getattr(df.groupby("key")["data"], method)() + self.parallel = parallel def loop(): - getattr(df.groupby('key')['data'], method)() + getattr(df.groupby("key")["data"], method)() + self.loop = loop def time_parallel(self, threads, method): @@ -60,18 +75,19 @@ def time_loop(self, threads, method): class ParallelGroups: params = [2, 4, 8] - param_names = ['threads'] + param_names = ["threads"] def setup(self, threads): if not have_real_test_parallel: raise NotImplementedError - size = 2**22 - ngroups = 10**3 + size = 2 ** 22 + ngroups = 10 ** 3 data = Series(np.random.randint(0, ngroups, size=size)) @test_parallel(num_threads=threads) def get_groups(): data.groupby(data).groups + self.get_groups = get_groups def time_get_groups(self, threads): @@ -80,19 +96,20 @@ def time_get_groups(self, threads): class ParallelTake1D: - params = ['int64', 'float64'] - param_names = ['dtype'] + params = ["int64", "float64"] + param_names = ["dtype"] def setup(self, dtype): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - df = DataFrame({'col': np.arange(N, dtype=dtype)}) + N = 10 ** 6 + df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @test_parallel(num_threads=2) def parallel_take1d(): - take_1d(df['col'].values, indexer) + take_1d(df["col"].values, indexer) + self.parallel_take1d = parallel_take1d def time_take1d(self, dtype): @@ -107,14 +124,14 @@ class ParallelKth: def setup(self): if not have_real_test_parallel: raise NotImplementedError - N = 10**7 - k = 5 * 10**5 - kwargs_list = [{'arr': np.random.randn(N)}, - {'arr': np.random.randn(N)}] + N = 10 ** 7 + k = 5 * 10 ** 5 + kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @test_parallel(num_threads=2, kwargs_list=kwargs_list) def parallel_kth_smallest(arr): algos.kth_smallest(arr, k) + self.parallel_kth_smallest = parallel_kth_smallest def time_kth_smallest(self): @@ -122,81 +139,90 @@ def time_kth_smallest(self): class ParallelDatetimeFields: - def setup(self): if not have_real_test_parallel: raise NotImplementedError - N = 10**6 - self.dti = date_range('1900-01-01', periods=N, freq='T') - self.period = self.dti.to_period('D') + N = 10 ** 6 + self.dti = date_range("1900-01-01", periods=N, freq="T") + self.period = self.dti.to_period("D") def time_datetime_field_year(self): @test_parallel(num_threads=2) def run(dti): dti.year + run(self.dti) def time_datetime_field_day(self): @test_parallel(num_threads=2) def run(dti): dti.day + run(self.dti) def time_datetime_field_daysinmonth(self): @test_parallel(num_threads=2) def run(dti): dti.days_in_month + run(self.dti) def time_datetime_field_normalize(self): @test_parallel(num_threads=2) def run(dti): dti.normalize() + run(self.dti) def time_datetime_to_period(self): 
@test_parallel(num_threads=2) def run(dti): - dti.to_period('S') + dti.to_period("S") + run(self.dti) def time_period_to_datetime(self): @test_parallel(num_threads=2) def run(period): period.to_timestamp() + run(self.period) class ParallelRolling: - params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std'] - param_names = ['method'] + params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"] + param_names = ["method"] def setup(self, method): if not have_real_test_parallel: raise NotImplementedError win = 100 arr = np.random.rand(100000) - if hasattr(DataFrame, 'rolling'): + if hasattr(DataFrame, "rolling"): df = DataFrame(arr).rolling(win) @test_parallel(num_threads=2) def parallel_rolling(): getattr(df, method)() + self.parallel_rolling = parallel_rolling elif have_rolling_methods: - rolling = {'median': rolling_median, - 'mean': rolling_mean, - 'min': rolling_min, - 'max': rolling_max, - 'var': rolling_var, - 'skew': rolling_skew, - 'kurt': rolling_kurt, - 'std': rolling_std} + rolling = { + "median": rolling_median, + "mean": rolling_mean, + "min": rolling_min, + "max": rolling_max, + "var": rolling_var, + "skew": rolling_skew, + "kurt": rolling_kurt, + "std": rolling_std, + } @test_parallel(num_threads=2) def parallel_rolling(): rolling[method](arr, win) + self.parallel_rolling = parallel_rolling else: raise NotImplementedError @@ -209,30 +235,34 @@ class ParallelReadCSV(BaseIO): number = 1 repeat = 5 - params = ['float', 'object', 'datetime'] - param_names = ['dtype'] + params = ["float", "object", "datetime"] + param_names = ["dtype"] def setup(self, dtype): if not have_real_test_parallel: raise NotImplementedError rows = 10000 cols = 50 - data = {'float': DataFrame(np.random.randn(rows, cols)), - 'datetime': DataFrame(np.random.randn(rows, cols), - index=date_range('1/1/2000', - periods=rows)), - 'object': DataFrame('foo', - index=range(rows), - columns=['object%03d'.format(i) - for i in range(5)])} - - self.fname = '__test_{}__.csv'.format(dtype) + data = { + "float": DataFrame(np.random.randn(rows, cols)), + "datetime": DataFrame( + np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) + ), + "object": DataFrame( + "foo", + index=range(rows), + columns=["object%03d".format(i) for i in range(5)], + ), + } + + self.fname = "__test_{}__.csv".format(dtype) df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) def parallel_read_csv(): read_csv(self.fname) + self.parallel_read_csv = parallel_read_csv def time_read_csv(self, dtype): @@ -244,7 +274,7 @@ class ParallelFactorize: number = 1 repeat = 5 params = [2, 4, 8] - param_names = ['threads'] + param_names = ["threads"] def setup(self, threads): if not have_real_test_parallel: @@ -255,10 +285,12 @@ def setup(self, threads): @test_parallel(num_threads=threads) def parallel(): factorize(strings) + self.parallel = parallel def loop(): factorize(strings) + self.loop = loop def time_parallel(self, threads): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 3097ada6d2022..39b07d4734399 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,18 +5,55 @@ import numpy as np from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, - date_range, period_range) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + period_range, +) import pandas.util.testing as tm method_blacklist = { - 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', - 'max', 'skew', 
'cumprod', 'cummax', 'rank', 'pct_change', 'min', - 'var', 'mad', 'describe', 'std', 'quantile'}, - 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', - 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', - 'std'} + "object": { + "median", + "prod", + "sem", + "cumsum", + "sum", + "cummin", + "mean", + "max", + "skew", + "cumprod", + "cummax", + "rank", + "pct_change", + "min", + "var", + "mad", + "describe", + "std", + "quantile", + }, + "datetime": { + "median", + "prod", + "sem", + "cumsum", + "sum", + "mean", + "skew", + "cumprod", + "cummax", + "pct_change", + "var", + "mad", + "describe", + "std", + }, } @@ -26,28 +63,31 @@ def setup(self): self.data = Series(np.random.randn(len(self.labels))) def time_groupby_apply_dict_return(self): - self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0], - 'last': x.values[-1]}) + self.data.groupby(self.labels).apply( + lambda x: {"first": x.values[0], "last": x.values[-1]} + ) class Apply: - def setup_cache(self): - N = 10**4 + N = 10 ** 4 labels = np.random.randint(0, 2000, size=N) labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4) - }) + df = DataFrame( + { + "key": labels, + "key2": labels2, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) return df def time_scalar_function_multi_col(self, df): - df.groupby(['key', 'key2']).apply(lambda x: 1) + df.groupby(["key", "key2"]).apply(lambda x: 1) def time_scalar_function_single_col(self, df): - df.groupby('key').apply(lambda x: 1) + df.groupby("key").apply(lambda x: 1) @staticmethod def df_copy_function(g): @@ -56,27 +96,29 @@ def df_copy_function(g): return g.copy() def time_copy_function_multi_col(self, df): - df.groupby(['key', 'key2']).apply(self.df_copy_function) + df.groupby(["key", "key2"]).apply(self.df_copy_function) def time_copy_overhead_single_col(self, df): - df.groupby('key').apply(self.df_copy_function) + df.groupby("key").apply(self.df_copy_function) class Groups: - param_names = ['key'] - params = ['int64_small', 'int64_large', 'object_small', 'object_large'] + param_names = ["key"] + params = ["int64_small", "int64_large", "object_small", "object_large"] def setup_cache(self): - size = 10**6 - data = {'int64_small': Series(np.random.randint(0, 100, size=size)), - 'int64_large': Series(np.random.randint(0, 10000, size=size)), - 'object_small': Series( - tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=size))), - 'object_large': Series( - tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=size)))} + size = 10 ** 6 + data = { + "int64_small": Series(np.random.randint(0, 100, size=size)), + "int64_large": Series(np.random.randint(0, 10000, size=size)), + "object_small": Series( + tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)) + ), + "object_large": Series( + tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)) + ), + } return data def setup(self, data, key): @@ -89,7 +131,7 @@ def time_series_groups(self, data, key): class GroupManyLabels: params = [1, 1000] - param_names = ['ncols'] + param_names = ["ncols"] def setup(self, ncols): N = 1000 @@ -103,46 +145,45 @@ def time_sum(self, ncols): class Nth: - param_names = ['dtype'] - params = ['float32', 'float64', 'datetime', 'object'] + param_names = ["dtype"] + params = ["float32", "float64", "datetime", "object"] def setup(self, dtype): - N = 10**5 + N = 10 ** 
5 # with datetimes (GH7555) - if dtype == 'datetime': - values = date_range('1/1/2011', periods=N, freq='s') - elif dtype == 'object': - values = ['foo'] * N + if dtype == "datetime": + values = date_range("1/1/2011", periods=N, freq="s") + elif dtype == "object": + values = ["foo"] * N else: values = np.arange(N).astype(dtype) key = np.arange(N) - self.df = DataFrame({'key': key, 'values': values}) + self.df = DataFrame({"key": key, "values": values}) self.df.iloc[1, 1] = np.nan # insert missing data def time_frame_nth_any(self, dtype): - self.df.groupby('key').nth(0, dropna='any') + self.df.groupby("key").nth(0, dropna="any") def time_groupby_nth_all(self, dtype): - self.df.groupby('key').nth(0, dropna='all') + self.df.groupby("key").nth(0, dropna="all") def time_frame_nth(self, dtype): - self.df.groupby('key').nth(0) + self.df.groupby("key").nth(0) def time_series_nth_any(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0, dropna='any') + self.df["values"].groupby(self.df["key"]).nth(0, dropna="any") def time_series_nth_all(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0, dropna='all') + self.df["values"].groupby(self.df["key"]).nth(0, dropna="all") def time_series_nth(self, dtype): - self.df['values'].groupby(self.df['key']).nth(0) + self.df["values"].groupby(self.df["key"]).nth(0) class DateAttributes: - def setup(self): - rng = date_range('1/1/2000', '12/31/2005', freq='H') + rng = date_range("1/1/2000", "12/31/2005", freq="H") self.year, self.month, self.day = rng.year, rng.month, rng.day self.ts = Series(np.random.randn(len(rng)), index=rng) @@ -151,154 +192,167 @@ def time_len_groupby_object(self): class Int64: - def setup(self): arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) i = np.random.choice(len(arr), len(arr) * 5) arr = np.vstack((arr, arr[i])) i = np.random.permutation(len(arr)) arr = arr[i] - self.cols = list('abcde') + self.cols = list("abcde") self.df = DataFrame(arr, columns=self.cols) - self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 + self.df["jim"], self.df["joe"] = np.random.randn(2, len(self.df)) * 10 def time_overflow(self): self.df.groupby(self.cols).max() class CountMultiDtype: - def setup_cache(self): n = 10000 - offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') - dates = np.datetime64('now') + offsets - dates[np.random.rand(n) > 0.5] = np.datetime64('nat') - offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") + dates = np.datetime64("now") + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64("nat") + offsets[np.random.rand(n) > 0.5] = np.timedelta64("nat") value2 = np.random.randn(n) value2[np.random.rand(n) > 0.5] = np.nan - obj = np.random.choice(list('ab'), size=n).astype(object) + obj = np.random.choice(list("ab"), size=n).astype(object) obj[np.random.randn(n) > 0.5] = np.nan - df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'dates': dates, - 'value2': value2, - 'value3': np.random.randn(n), - 'ints': np.random.randint(0, 1000, size=n), - 'obj': obj, - 'offsets': offsets}) + df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "dates": dates, + "value2": value2, + "value3": np.random.randn(n), + "ints": np.random.randint(0, 1000, size=n), + "obj": obj, + "offsets": offsets, + } + ) return df def time_multi_count(self, df): - df.groupby(['key1', 'key2']).count() + df.groupby(["key1", 
"key2"]).count() class CountMultiInt: - def setup_cache(self): n = 10000 - df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'ints': np.random.randint(0, 1000, size=n), - 'ints2': np.random.randint(0, 1000, size=n)}) + df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "ints": np.random.randint(0, 1000, size=n), + "ints2": np.random.randint(0, 1000, size=n), + } + ) return df def time_multi_int_count(self, df): - df.groupby(['key1', 'key2']).count() + df.groupby(["key1", "key2"]).count() def time_multi_int_nunique(self, df): - df.groupby(['key1', 'key2']).nunique() + df.groupby(["key1", "key2"]).nunique() class AggFunctions: - def setup_cache(self): - N = 10**5 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') - df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)), - 'key2': fac2.take(np.random.randint(0, 2, size=N)), - 'value1': np.random.randn(N), - 'value2': np.random.randn(N), - 'value3': np.random.randn(N)}) + N = 10 ** 5 + fac1 = np.array(["A", "B", "C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") + df = DataFrame( + { + "key1": fac1.take(np.random.randint(0, 3, size=N)), + "key2": fac2.take(np.random.randint(0, 2, size=N)), + "value1": np.random.randn(N), + "value2": np.random.randn(N), + "value3": np.random.randn(N), + } + ) return df def time_different_str_functions(self, df): - df.groupby(['key1', 'key2']).agg({'value1': 'mean', - 'value2': 'var', - 'value3': 'sum'}) + df.groupby(["key1", "key2"]).agg( + {"value1": "mean", "value2": "var", "value3": "sum"} + ) def time_different_numpy_functions(self, df): - df.groupby(['key1', 'key2']).agg({'value1': np.mean, - 'value2': np.var, - 'value3': np.sum}) + df.groupby(["key1", "key2"]).agg( + {"value1": np.mean, "value2": np.var, "value3": np.sum} + ) def time_different_python_functions_multicol(self, df): - df.groupby(['key1', 'key2']).agg([sum, min, max]) + df.groupby(["key1", "key2"]).agg([sum, min, max]) def time_different_python_functions_singlecol(self, df): - df.groupby('key1').agg([sum, min, max]) + df.groupby("key1").agg([sum, min, max]) class GroupStrings: - def setup(self): - n = 2 * 10**5 - alpha = list(map(''.join, product(ascii_letters, repeat=4))) + n = 2 * 10 ** 5 + alpha = list(map("".join, product(ascii_letters, repeat=4))) data = np.random.choice(alpha, (n // 5, 4), replace=False) data = np.repeat(data, 5, axis=0) - self.df = DataFrame(data, columns=list('abcd')) - self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) + self.df = DataFrame(data, columns=list("abcd")) + self.df["joe"] = (np.random.randn(len(self.df)) * 10).round(3) self.df = self.df.sample(frac=1).reset_index(drop=True) def time_multi_columns(self): - self.df.groupby(list('abcd')).max() + self.df.groupby(list("abcd")).max() class MultiColumn: - def setup_cache(self): - N = 10**5 + N = 10 ** 5 key1 = np.tile(np.arange(100, dtype=object), 1000) key2 = key1.copy() np.random.shuffle(key1) np.random.shuffle(key2) - df = DataFrame({'key1': key1, - 'key2': key2, - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + df = DataFrame( + { + "key1": key1, + "key2": key2, + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) return df def time_lambda_sum(self, df): - df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) + df.groupby(["key1", "key2"]).agg(lambda x: x.values.sum()) def time_cython_sum(self, df): - df.groupby(['key1', 
'key2']).sum() + df.groupby(["key1", "key2"]).sum() def time_col_select_lambda_sum(self, df): - df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum()) + df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum()) def time_col_select_numpy_sum(self, df): - df.groupby(['key1', 'key2'])['data1'].agg(np.sum) + df.groupby(["key1", "key2"])["data1"].agg(np.sum) class Size: - def setup(self): - n = 10**5 - offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') - dates = np.datetime64('now') + offsets - self.df = DataFrame({'key1': np.random.randint(0, 500, size=n), - 'key2': np.random.randint(0, 100, size=n), - 'value1': np.random.randn(n), - 'value2': np.random.randn(n), - 'value3': np.random.randn(n), - 'dates': dates}) + n = 10 ** 5 + offsets = np.random.randint(n, size=n).astype("timedelta64[ns]") + dates = np.datetime64("now") + offsets + self.df = DataFrame( + { + "key1": np.random.randint(0, 500, size=n), + "key2": np.random.randint(0, 100, size=n), + "value1": np.random.randn(n), + "value2": np.random.randn(n), + "value3": np.random.randn(n), + "dates": dates, + } + ) self.draws = Series(np.random.randn(n)) - labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4)) - self.cats = labels.astype('category') + labels = Series(["foo", "bar", "baz", "qux"] * (n // 4)) + self.cats = labels.astype("category") def time_multi_size(self): - self.df.groupby(['key1', 'key2']).size() + self.df.groupby(["key1", "key2"]).size() def time_category_size(self): self.draws.groupby(self.cats).size() @@ -306,15 +360,47 @@ def time_category_size(self): class GroupByMethods: - param_names = ['dtype', 'method', 'application'] - params = [['int', 'float', 'object', 'datetime'], - ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', - 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', - 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', - 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift', - 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts', - 'var'], - ['direct', 'transformation']] + param_names = ["dtype", "method", "application"] + params = [ + ["int", "float", "object", "datetime"], + [ + "all", + "any", + "bfill", + "count", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "describe", + "ffill", + "first", + "head", + "last", + "mad", + "max", + "min", + "median", + "mean", + "nunique", + "pct_change", + "prod", + "quantile", + "rank", + "sem", + "shift", + "size", + "skew", + "std", + "sum", + "tail", + "unique", + "value_counts", + "var", + ], + ["direct", "transformation"], + ] def setup(self, dtype, method, application): if method in method_blacklist.get(dtype, {}): @@ -323,29 +409,28 @@ def setup(self, dtype, method, application): size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) - if dtype == 'int': + if dtype == "int": key = np.random.randint(0, size, size=size) - elif dtype == 'float': - key = np.concatenate([np.random.random(ngroups) * 0.1, - np.random.random(ngroups) * 10.0]) - elif dtype == 'object': - key = ['foo'] * size - elif dtype == 'datetime': - key = date_range('1/1/2011', periods=size, freq='s') - - df = DataFrame({'values': values, 'key': key}) - - if application == 'transform': - if method == 'describe': + elif dtype == "float": + key = np.concatenate( + [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0] + ) + elif dtype == "object": + key = ["foo"] * size + elif dtype == "datetime": + key = date_range("1/1/2011", periods=size, 
freq="s") + + df = DataFrame({"values": values, "key": key}) + + if application == "transform": + if method == "describe": raise NotImplementedError - self.as_group_method = lambda: df.groupby( - 'key')['values'].transform(method) - self.as_field_method = lambda: df.groupby( - 'values')['key'].transform(method) + self.as_group_method = lambda: df.groupby("key")["values"].transform(method) + self.as_field_method = lambda: df.groupby("values")["key"].transform(method) else: - self.as_group_method = getattr(df.groupby('key')['values'], method) - self.as_field_method = getattr(df.groupby('values')['key'], method) + self.as_group_method = getattr(df.groupby("key")["values"], method) + self.as_field_method = getattr(df.groupby("values")["key"], method) def time_dtype_as_group(self, dtype, method, application): self.as_group_method() @@ -356,20 +441,22 @@ def time_dtype_as_field(self, dtype, method, application): class RankWithTies: # GH 21237 - param_names = ['dtype', 'tie_method'] - params = [['float64', 'float32', 'int64', 'datetime64'], - ['first', 'average', 'dense', 'min', 'max']] + param_names = ["dtype", "tie_method"] + params = [ + ["float64", "float32", "int64", "datetime64"], + ["first", "average", "dense", "min", "max"], + ] def setup(self, dtype, tie_method): - N = 10**4 - if dtype == 'datetime64': + N = 10 ** 4 + if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: data = np.array([1] * N, dtype=dtype) - self.df = DataFrame({'values': data, 'key': ['foo'] * N}) + self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): - self.df.groupby('key').rank(method=tie_method) + self.df.groupby("key").rank(method=tie_method) class Float32: @@ -382,57 +469,61 @@ def setup(self): self.df = DataFrame(dict(a=arr, b=arr)) def time_sum(self): - self.df.groupby(['a'])['b'].sum() + self.df.groupby(["a"])["b"].sum() class Categories: - def setup(self): - N = 10**5 + N = 10 ** 5 arr = np.random.random(N) - data = {'a': Categorical(np.random.randint(10000, size=N)), - 'b': arr} + data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr} self.df = DataFrame(data) - data = {'a': Categorical(np.random.randint(10000, size=N), - ordered=True), - 'b': arr} + data = { + "a": Categorical(np.random.randint(10000, size=N), ordered=True), + "b": arr, + } self.df_ordered = DataFrame(data) - data = {'a': Categorical(np.random.randint(100, size=N), - categories=np.arange(10000)), - 'b': arr} + data = { + "a": Categorical( + np.random.randint(100, size=N), categories=np.arange(10000) + ), + "b": arr, + } self.df_extra_cat = DataFrame(data) def time_groupby_sort(self): - self.df.groupby('a')['b'].count() + self.df.groupby("a")["b"].count() def time_groupby_nosort(self): - self.df.groupby('a', sort=False)['b'].count() + self.df.groupby("a", sort=False)["b"].count() def time_groupby_ordered_sort(self): - self.df_ordered.groupby('a')['b'].count() + self.df_ordered.groupby("a")["b"].count() def time_groupby_ordered_nosort(self): - self.df_ordered.groupby('a', sort=False)['b'].count() + self.df_ordered.groupby("a", sort=False)["b"].count() def time_groupby_extra_cat_sort(self): - self.df_extra_cat.groupby('a')['b'].count() + self.df_extra_cat.groupby("a")["b"].count() def time_groupby_extra_cat_nosort(self): - self.df_extra_cat.groupby('a', sort=False)['b'].count() + self.df_extra_cat.groupby("a", sort=False)["b"].count() class Datelike: # GH 14338 - params = ['period_range', 'date_range', 'date_range_tz'] - param_names = 
['grouper'] + params = ["period_range", "date_range", "date_range_tz"] + param_names = ["grouper"] def setup(self, grouper): - N = 10**4 - rng_map = {'period_range': period_range, - 'date_range': date_range, - 'date_range_tz': partial(date_range, tz='US/Central')} - self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N) - self.df = DataFrame(np.random.randn(10**4, 2)) + N = 10 ** 4 + rng_map = { + "period_range": period_range, + "date_range": date_range, + "date_range_tz": partial(date_range, tz="US/Central"), + } + self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N) + self.df = DataFrame(np.random.randn(10 ** 4, 2)) def time_sum(self, grouper): self.df.groupby(self.grouper).sum() @@ -442,11 +533,10 @@ class SumBools: # GH 2692 def setup(self): N = 500 - self.df = DataFrame({'ii': range(N), - 'bb': [True] * N}) + self.df = DataFrame({"ii": range(N), "bb": [True] * N}) def time_groupby_sum_booleans(self): - self.df.groupby('ii').sum() + self.df.groupby("ii").sum() class SumMultiLevel: @@ -455,84 +545,85 @@ class SumMultiLevel: def setup(self): N = 50 - self.df = DataFrame({'A': list(range(N)) * 2, - 'B': range(N * 2), - 'C': 1}).set_index(['A', 'B']) + self.df = DataFrame( + {"A": list(range(N)) * 2, "B": range(N * 2), "C": 1} + ).set_index(["A", "B"]) def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() class Transform: - def setup(self): n1 = 400 n2 = 250 - index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], - codes=[np.repeat(range(n1), n2).tolist(), - list(range(n2)) * n1], - names=['lev1', 'lev2']) + index = MultiIndex( + levels=[np.arange(n1), tm.makeStringIndex(n2)], + codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1], + names=["lev1", "lev2"], + ) arr = np.random.randn(n1 * n2, 3) arr[::10000, 0] = np.nan arr[1::10000, 1] = np.nan arr[2::10000, 2] = np.nan - data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3']) + data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"]) self.df = data n = 20000 - self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), - columns=['jim', 'joe', 'jolie']) + self.df1 = DataFrame( + np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"] + ) self.df2 = self.df1.copy() - self.df2['jim'] = self.df2['joe'] + self.df2["jim"] = self.df2["joe"] - self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)), - columns=['jim', 'joe', 'jolie']) + self.df3 = DataFrame( + np.random.randint(1, (n / 10), (n, 3)), columns=["jim", "joe", "jolie"] + ) self.df4 = self.df3.copy() - self.df4['jim'] = self.df4['joe'] + self.df4["jim"] = self.df4["joe"] def time_transform_lambda_max(self): - self.df.groupby(level='lev1').transform(lambda x: max(x)) + self.df.groupby(level="lev1").transform(lambda x: max(x)) def time_transform_ufunc_max(self): - self.df.groupby(level='lev1').transform(np.max) + self.df.groupby(level="lev1").transform(np.max) def time_transform_multi_key1(self): - self.df1.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df1.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key2(self): - self.df2.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df2.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key3(self): - self.df3.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df3.groupby(["jim", "joe"])["jolie"].transform("max") def time_transform_multi_key4(self): - self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') + self.df4.groupby(["jim", "joe"])["jolie"].transform("max") 
class TransformBools: - def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) transitions = np.zeros(N, dtype=np.bool) transitions[transition_points] = True self.g = transitions.cumsum() - self.df = DataFrame({'signal': np.random.rand(N)}) + self.df = DataFrame({"signal": np.random.rand(N)}) def time_transform_mean(self): - self.df['signal'].groupby(self.g).transform(np.mean) + self.df["signal"].groupby(self.g).transform(np.mean) class TransformNaN: # GH 12737 def setup(self): - self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df_nans.loc[4::10, 'B':'C'] = 5 + self.df_nans = DataFrame( + {"key": np.repeat(np.arange(1000), 10), "B": np.nan, "C": np.nan} + ) + self.df_nans.loc[4::10, "B":"C"] = 5 def time_first(self): - self.df_nans.groupby('key').transform('first') + self.df_nans.groupby("key").transform("first") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 1eedc1a2b3021..6541ddcb0397d 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,38 +1,47 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, - Float64Index, IntervalIndex) +from pandas import ( + Series, + date_range, + DatetimeIndex, + Index, + RangeIndex, + Float64Index, + IntervalIndex, +) class SetOperations: - params = (['datetime', 'date_string', 'int', 'strings'], - ['intersection', 'union', 'symmetric_difference']) - param_names = ['dtype', 'method'] + params = ( + ["datetime", "date_string", "int", "strings"], + ["intersection", "union", "symmetric_difference"], + ) + param_names = ["dtype", "method"] def setup(self, dtype, method): - N = 10**5 - dates_left = date_range('1/1/2000', periods=N, freq='T') - fmt = '%Y-%m-%d %H:%M:%S' + N = 10 ** 5 + dates_left = date_range("1/1/2000", periods=N, freq="T") + fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) str_left = tm.makeStringIndex(N) - data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]}, - 'date_string': {'left': date_str_left, - 'right': date_str_left[:-1]}, - 'int': {'left': int_left, 'right': int_left[:-1]}, - 'strings': {'left': str_left, 'right': str_left[:-1]}} - self.left = data[dtype]['left'] - self.right = data[dtype]['right'] + data = { + "datetime": {"left": dates_left, "right": dates_left[:-1]}, + "date_string": {"left": date_str_left, "right": date_str_left[:-1]}, + "int": {"left": int_left, "right": int_left[:-1]}, + "strings": {"left": str_left, "right": str_left[:-1]}, + } + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] def time_operation(self, dtype, method): getattr(self.left, method)(self.right) class SetDisjoint: - def setup(self): - N = 10**5 + N = 10 ** 5 B = N + 20000 self.datetime_left = DatetimeIndex(range(N)) self.datetime_right = DatetimeIndex(range(N, B)) @@ -42,9 +51,8 @@ def time_datetime_difference_disjoint(self): class Datetime: - def setup(self): - self.dr = date_range('20000101', freq='D', periods=10000) + self.dr = date_range("20000101", freq="D", periods=10000) def time_is_dates_only(self): self.dr._is_dates_only @@ -52,12 +60,12 @@ def time_is_dates_only(self): class Ops: - params = ['float', 'int'] - param_names = ['dtype'] + params = ["float", "int"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - indexes = {'int': 
'makeIntIndex', 'float': 'makeFloatIndex'} + N = 10 ** 6 + indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} self.index = getattr(tm, indexes[dtype])(N) def time_add(self, dtype): @@ -77,10 +85,9 @@ def time_modulo(self, dtype): class Range: - def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) - self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) + self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -102,7 +109,6 @@ def time_get_loc_dec(self): class IndexAppend: - def setup(self): N = 10000 @@ -132,19 +138,20 @@ def time_append_obj_list(self): class Indexing: - params = ['String', 'Float', 'Int'] - param_names = ['dtype'] + params = ["String", "Float", "Int"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) + N = 10 ** 6 + self.idx = getattr(tm, "make{}Index".format(dtype))(N) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half]) - .sort_values()) + self.non_unique_sorted = ( + self.sorted[:half].append(self.sorted[:half]).sort_values() + ) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): @@ -188,7 +195,7 @@ def time_get_loc(self): class IntervalIndexMethod: # GH 24813 - params = [10**3, 10**5] + params = [10 ** 3, 10 ** 5] def setup(self, N): left = np.append(np.arange(N), np.array(0)) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 4e82fa5592529..489e5c4cd63ea 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -2,26 +2,37 @@ import numpy as np import pandas.util.testing as tm -from pandas import (Series, DataFrame, MultiIndex, - Int64Index, UInt64Index, Float64Index, - IntervalIndex, CategoricalIndex, - IndexSlice, concat, date_range, option_context) +from pandas import ( + Series, + DataFrame, + MultiIndex, + Int64Index, + UInt64Index, + Float64Index, + IntervalIndex, + CategoricalIndex, + IndexSlice, + concat, + date_range, + option_context, +) class NumericSeriesIndexing: params = [ (Int64Index, UInt64Index, Float64Index), - ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ['index_dtype', 'index_structure'] + param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10**6 + N = 10 ** 6 indices = { - 'unique_monotonic_inc': index(range(N)), - 'nonunique_monotonic_inc': index( - list(range(55)) + [54] + list(range(55, N - 1))), + "unique_monotonic_inc": index(range(N)), + "nonunique_monotonic_inc": index( + list(range(55)) + [54] + list(range(55, N - 1)) + ), } self.data = Series(np.random.rand(N), index=indices[index_structure]) self.array = np.arange(10000) @@ -82,23 +93,25 @@ def time_loc_slice(self, index, index_structure): class NonNumericSeriesIndexing: params = [ - ('string', 'datetime'), - ('unique_monotonic_inc', 'nonunique_monotonic_inc'), + ("string", "datetime"), + ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ['index_dtype', 'index_structure'] + param_names = ["index_dtype", "index_structure"] def setup(self, index, index_structure): - N = 10**6 - indexes = {'string': tm.makeStringIndex(N), - 'datetime': 
date_range('1900', periods=N, freq='s')} + N = 10 ** 6 + indexes = { + "string": tm.makeStringIndex(N), + "datetime": date_range("1900", periods=N, freq="s"), + } index = indexes[index] - if index_structure == 'nonunique_monotonic_inc': + if index_structure == "nonunique_monotonic_inc": index = index.insert(item=index[2], loc=2)[:-1] self.s = Series(np.random.rand(N), index=index) self.lbl = index[80000] def time_getitem_label_slice(self, index, index_structure): - self.s[:self.lbl] + self.s[: self.lbl] def time_getitem_pos_slice(self, index, index_structure): self.s[:80000] @@ -115,12 +128,10 @@ def time_getitem_list_like(self, index, index_structure): class DataFrameStringIndexing: - def setup(self): index = tm.makeStringIndex(1000) columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=index, - columns=columns) + self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns) self.idx_scalar = index[100] self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 @@ -147,7 +158,6 @@ def time_boolean_rows_object(self): class DataFrameNumericIndexing: - def setup(self): self.idx_dupe = np.array(range(30)) * 99 self.df = DataFrame(np.random.randn(10000, 5)) @@ -172,13 +182,15 @@ def time_bool_indexer(self): class Take: - params = ['int', 'datetime'] - param_names = ['index'] + params = ["int", "datetime"] + param_names = ["index"] def setup(self, index): N = 100000 - indexes = {'int': Int64Index(np.arange(N)), - 'datetime': date_range('2011-01-01', freq='S', periods=N)} + indexes = { + "int": Int64Index(np.arange(N)), + "datetime": date_range("2011-01-01", freq="S", periods=N), + } index = indexes[index] self.s = Series(np.random.rand(N), index=index) self.indexer = [True, False, True, True, False] * 20000 @@ -188,22 +200,24 @@ def time_take(self, index): class MultiIndexing: - def setup(self): mi = MultiIndex.from_product([range(1000), range(1000)]) self.s = Series(np.random.randn(1000000), index=mi) self.df = DataFrame(self.s) n = 100000 - self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), - n), - 'B': np.random.choice(range(10, 400), n), - 'C': np.random.choice(range(1, 150), n), - 'D': np.random.choice(range(10000, 45000), n), - 'x': np.random.choice(range(400), n), - 'y': np.random.choice(range(25), n)}) + self.mdt = DataFrame( + { + "A": np.random.choice(range(10000, 45000, 1000), n), + "B": np.random.choice(range(10, 400), n), + "C": np.random.choice(range(1, 150), n), + "D": np.random.choice(range(10000, 45000), n), + "x": np.random.choice(range(400), n), + "y": np.random.choice(range(25), n), + } + ) self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] - self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index() + self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index() def time_series_ix(self): self.s.ix[999] @@ -216,7 +230,6 @@ def time_index_slice(self): class IntervalIndexing: - def setup_cache(self): idx = IntervalIndex.from_breaks(np.arange(1000001)) monotonic = Series(np.arange(1000000), index=idx) @@ -237,29 +250,30 @@ def time_loc_list(self, monotonic): class CategoricalIndexIndexing: - params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] - param_names = ['index'] + params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] + param_names = ["index"] def setup(self, index): - N = 10**5 - values = list('a' * N + 'b' * N + 'c' * N) + N = 10 ** 5 + values = list("a" * N + "b" * N + "c" * N) indices = { - 'monotonic_incr': CategoricalIndex(values), - 
'monotonic_decr': CategoricalIndex(reversed(values)), - 'non_monotonic': CategoricalIndex(list('abc' * N))} + "monotonic_incr": CategoricalIndex(values), + "monotonic_decr": CategoricalIndex(reversed(values)), + "non_monotonic": CategoricalIndex(list("abc" * N)), + } self.data = indices[index] self.int_scalar = 10000 self.int_list = list(range(10000)) - self.cat_scalar = 'b' - self.cat_list = ['a', 'c'] + self.cat_scalar = "b" + self.cat_list = ["a", "c"] def time_getitem_scalar(self, index): self.data[self.int_scalar] def time_getitem_slice(self, index): - self.data[:self.int_scalar] + self.data[: self.int_scalar] def time_getitem_list_like(self, index): self.data[[self.int_scalar]] @@ -278,7 +292,6 @@ def time_get_indexer_list(self, index): class MethodLookup: - def setup_cache(self): s = Series() return s @@ -294,40 +307,36 @@ def time_lookup_loc(self, s): class GetItemSingleColumn: - def setup(self): - self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=["A"]) self.df_int_col = DataFrame(np.random.randn(3000, 1)) def time_frame_getitem_single_column_label(self): - self.df_string_col['A'] + self.df_string_col["A"] def time_frame_getitem_single_column_int(self): self.df_int_col[0] class AssignTimeseriesIndex: - def setup(self): N = 100000 - idx = date_range('1/1/2000', periods=N, freq='H') - self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) + idx = date_range("1/1/2000", periods=N, freq="H") + self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx) def time_frame_assign_timeseries_index(self): - self.df['date'] = self.df.index + self.df["date"] = self.df.index class InsertColumns: - def setup(self): - self.N = 10**3 + self.N = 10 ** 3 self.df = DataFrame(index=range(self.N)) def time_insert(self): np.random.seed(1234) for i in range(100): - self.df.insert(0, i, np.random.randn(self.N), - allow_duplicates=True) + self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True) def time_assign_with_setitem(self): np.random.seed(1234) @@ -337,18 +346,18 @@ def time_assign_with_setitem(self): class ChainIndexing: - params = [None, 'warn'] - param_names = ['mode'] + params = [None, "warn"] + param_names = ["mode"] def setup(self, mode): self.N = 1000000 def time_chained_indexing(self, mode): with warnings.catch_warnings(record=True): - with option_context('mode.chained_assignment', mode): - df = DataFrame({'A': np.arange(self.N), 'B': 'foo'}) + with option_context("mode.chained_assignment", mode): + df = DataFrame({"A": np.arange(self.N), "B": "foo"}) df2 = df[df.A > self.N // 2] - df2['C'] = 1.0 + df2["C"] = 1.0 from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 5655701781846..44a22dfa77791 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -5,33 +5,40 @@ def _get_numeric_engines(): engine_names = [ - ('Int64Engine', np.int64), ('Int32Engine', np.int32), - ('Int16Engine', np.int16), ('Int8Engine', np.int8), - ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32), - ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8), - ('Float64Engine', np.float64), ('Float32Engine', np.float32), + ("Int64Engine", np.int64), + ("Int32Engine", np.int32), + ("Int16Engine", np.int16), + ("Int8Engine", np.int8), + ("UInt64Engine", np.uint64), + ("UInt32Engine", np.uint32), + ("UInt16engine", np.uint16), + ("UInt8Engine", np.uint8), + 
("Float64Engine", np.float64), + ("Float32Engine", np.float32), + ] + return [ + (getattr(libindex, engine_name), dtype) + for engine_name, dtype in engine_names + if hasattr(libindex, engine_name) ] - return [(getattr(libindex, engine_name), dtype) - for engine_name, dtype in engine_names - if hasattr(libindex, engine_name)] class NumericEngineIndexing: - params = [_get_numeric_engines(), - ['monotonic_incr', 'monotonic_decr', 'non_monotonic'], - ] - param_names = ['engine_and_dtype', 'index_type'] + params = [ + _get_numeric_engines(), + ["monotonic_incr", "monotonic_decr", "non_monotonic"], + ] + param_names = ["engine_and_dtype", "index_type"] def setup(self, engine_and_dtype, index_type): engine, dtype = engine_and_dtype - N = 10**5 + N = 10 ** 5 values = list([1] * N + [2] * N + [3] * N) arr = { - 'monotonic_incr': np.array(values, dtype=dtype), - 'monotonic_decr': np.array(list(reversed(values)), - dtype=dtype), - 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype), + "monotonic_incr": np.array(values, dtype=dtype), + "monotonic_decr": np.array(list(reversed(values)), dtype=dtype), + "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype), }[index_type] self.data = engine(lambda: arr, len(arr)) @@ -44,21 +51,21 @@ def time_get_loc(self, engine_and_dtype, index_type): class ObjectEngineIndexing: - params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')] - param_names = ['index_type'] + params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] + param_names = ["index_type"] def setup(self, index_type): - N = 10**5 - values = list('a' * N + 'b' * N + 'c' * N) + N = 10 ** 5 + values = list("a" * N + "b" * N + "c" * N) arr = { - 'monotonic_incr': np.array(values, dtype=object), - 'monotonic_decr': np.array(list(reversed(values)), dtype=object), - 'non_monotonic': np.array(list('abc') * N, dtype=object), + "monotonic_incr": np.array(values, dtype=object), + "monotonic_decr": np.array(list(reversed(values)), dtype=object), + "non_monotonic": np.array(list("abc") * N, dtype=object), }[index_type] self.data = libindex.ObjectEngine(lambda: arr, len(arr)) # code belows avoids populating the mapping etc. while timing. 
- self.data.get_loc('b') + self.data.get_loc("b") def time_get_loc(self, index_type): - self.data.get_loc('b') + self.data.get_loc("b") diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 065c82207d251..66ef4f2aec380 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -8,56 +8,57 @@ class NumericInferOps: # from GH 7332 params = numeric_dtypes - param_names = ['dtype'] + param_names = ["dtype"] def setup(self, dtype): - N = 5 * 10**5 - self.df = DataFrame({'A': np.arange(N).astype(dtype), - 'B': np.arange(N).astype(dtype)}) + N = 5 * 10 ** 5 + self.df = DataFrame( + {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)} + ) def time_add(self, dtype): - self.df['A'] + self.df['B'] + self.df["A"] + self.df["B"] def time_subtract(self, dtype): - self.df['A'] - self.df['B'] + self.df["A"] - self.df["B"] def time_multiply(self, dtype): - self.df['A'] * self.df['B'] + self.df["A"] * self.df["B"] def time_divide(self, dtype): - self.df['A'] / self.df['B'] + self.df["A"] / self.df["B"] def time_modulo(self, dtype): - self.df['A'] % self.df['B'] + self.df["A"] % self.df["B"] class DateInferOps: # from GH 7332 def setup_cache(self): - N = 5 * 10**5 - df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) - df['timedelta'] = df['datetime64'] - df['datetime64'] + N = 5 * 10 ** 5 + df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")}) + df["timedelta"] = df["datetime64"] - df["datetime64"] return df def time_subtract_datetimes(self, df): - df['datetime64'] - df['datetime64'] + df["datetime64"] - df["datetime64"] def time_timedelta_plus_datetime(self, df): - df['timedelta'] + df['datetime64'] + df["timedelta"] + df["datetime64"] def time_add_timedeltas(self, df): - df['timedelta'] + df['timedelta'] + df["timedelta"] + df["timedelta"] class ToNumeric: - params = ['ignore', 'coerce'] - param_names = ['errors'] + params = ["ignore", "coerce"] + param_names = ["errors"] def setup(self, errors): N = 10000 self.float = Series(np.random.randn(N)) - self.numstr = self.float.astype('str') + self.numstr = self.float.astype("str") self.str = Series(tm.makeStringIndex(N)) def time_from_float(self, errors): @@ -72,21 +73,32 @@ def time_from_str(self, errors): class ToNumericDowncast: - param_names = ['dtype', 'downcast'] - params = [['string-float', 'string-int', 'string-nint', 'datetime64', - 'int-list', 'int32'], - [None, 'integer', 'signed', 'unsigned', 'float']] + param_names = ["dtype", "downcast"] + params = [ + [ + "string-float", + "string-int", + "string-nint", + "datetime64", + "int-list", + "int32", + ], + [None, "integer", "signed", "unsigned", "float"], + ] N = 500000 N2 = int(N / 2) - data_dict = {'string-int': ['1'] * N2 + [2] * N2, - 'string-nint': ['-1'] * N2 + [2] * N2, - 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - 'string-float': ['1.1'] * N2 + [2] * N2, - 'int-list': [1] * N2 + [2] * N2, - 'int32': np.repeat(np.int32(1), N)} + data_dict = { + "string-int": ["1"] * N2 + [2] * N2, + "string-nint": ["-1"] * N2 + [2] * N2, + "datetime64": np.repeat( + np.array(["1970-01-01", "1970-01-02"], dtype="datetime64[D]"), N + ), + "string-float": ["1.1"] * N2 + [2] * N2, + "int-list": [1] * N2 + [2] * N2, + "int32": np.repeat(np.int32(1), N), + } def setup(self, dtype, downcast): self.data = self.data_dict[dtype] @@ -96,10 +108,9 @@ def time_downcast(self, dtype, downcast): class MaybeConvertNumeric: - def setup_cache(self): - N = 10**6 - arr 
= np.repeat([2**63], N) + np.arange(N).astype('uint64') + N = 10 ** 6 + arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64") data = arr.astype(object) data[1::2] = arr[1::2].astype(str) data[-1] = -1 diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index fbb96380a5813..4525e504fc4dd 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -11,27 +11,31 @@ class ToCSV(BaseIO): - fname = '__test__.csv' - params = ['wide', 'long', 'mixed'] - param_names = ['kind'] + fname = "__test__.csv" + params = ["wide", "long", "mixed"] + param_names = ["kind"] def setup(self, kind): wide_frame = DataFrame(np.random.randn(3000, 30)) - long_frame = DataFrame({'A': np.arange(50000), - 'B': np.arange(50000) + 1., - 'C': np.arange(50000) + 2., - 'D': np.arange(50000) + 3.}) - mixed_frame = DataFrame({'float': np.random.randn(5000), - 'int': np.random.randn(5000).astype(int), - 'bool': (np.arange(5000) % 2) == 0, - 'datetime': date_range('2001', - freq='s', - periods=5000), - 'object': ['foo'] * 5000}) - mixed_frame.loc[30:500, 'float'] = np.nan - data = {'wide': wide_frame, - 'long': long_frame, - 'mixed': mixed_frame} + long_frame = DataFrame( + { + "A": np.arange(50000), + "B": np.arange(50000) + 1.0, + "C": np.arange(50000) + 2.0, + "D": np.arange(50000) + 3.0, + } + ) + mixed_frame = DataFrame( + { + "float": np.random.randn(5000), + "int": np.random.randn(5000).astype(int), + "bool": (np.arange(5000) % 2) == 0, + "datetime": date_range("2001", freq="s", periods=5000), + "object": ["foo"] * 5000, + } + ) + mixed_frame.loc[30:500, "float"] = np.nan + data = {"wide": wide_frame, "long": long_frame, "mixed": mixed_frame} self.df = data[kind] def time_frame(self, kind): @@ -40,36 +44,39 @@ def time_frame(self, kind): class ToCSVDatetime(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" def setup(self): - rng = date_range('1/1/2000', periods=1000) + rng = date_range("1/1/2000", periods=1000) self.data = DataFrame(rng, index=rng) def time_frame_date_formatting(self): - self.data.to_csv(self.fname, date_format='%Y%m%d') + self.data.to_csv(self.fname, date_format="%Y%m%d") class ToCSVDatetimeBig(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] - param_names = ['obs'] + param_names = ["obs"] def setup(self, obs): - d = '2018-11-29' - dt = '2018-11-26 11:18:27.0' - self.data = DataFrame({'dt': [np.datetime64(dt)] * obs, - 'd': [np.datetime64(d)] * obs, - 'r': [np.random.uniform()] * obs}) + d = "2018-11-29" + dt = "2018-11-26 11:18:27.0" + self.data = DataFrame( + { + "dt": [np.datetime64(dt)] * obs, + "d": [np.datetime64(d)] * obs, + "r": [np.random.uniform()] * obs, + } + ) def time_frame(self, obs): self.data.to_csv(self.fname) class StringIORewind: - def data(self, stringio_object): stringio_object.seek(0) return stringio_object @@ -77,68 +84,84 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): - params = ([True, False], ['custom', 'iso8601', 'ymd']) - param_names = ['infer_datetime_format', 'format'] + params = ([True, False], ["custom", "iso8601", "ymd"]) + param_names = ["infer_datetime_format", "format"] def setup(self, infer_datetime_format, format): - rng = date_range('1/1/2000', periods=1000) - formats = {'custom': '%m/%d/%Y %H:%M:%S.%f', - 'iso8601': '%Y-%m-%d %H:%M:%S', - 'ymd': '%Y%m%d'} + rng = date_range("1/1/2000", periods=1000) + formats = { + "custom": "%m/%d/%Y %H:%M:%S.%f", + "iso8601": "%Y-%m-%d %H:%M:%S", + "ymd": "%Y%m%d", + } 
dt_format = formats[format] - self.StringIO_input = StringIO('\n'.join( - rng.strftime(dt_format).tolist())) + self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist())) def time_read_csv(self, infer_datetime_format, format): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo'], parse_dates=['foo'], - infer_datetime_format=infer_datetime_format) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo"], + parse_dates=["foo"], + infer_datetime_format=infer_datetime_format, + ) class ReadCSVConcatDatetime(StringIORewind): - iso8601 = '%Y-%m-%d %H:%M:%S' + iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): - rng = date_range('1/1/2000', periods=50000, freq='S') - self.StringIO_input = StringIO('\n'.join( - rng.strftime(self.iso8601).tolist())) + rng = date_range("1/1/2000", periods=50000, freq="S") + self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist())) def time_read_csv(self): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo'], parse_dates=['foo'], - infer_datetime_format=False) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo"], + parse_dates=["foo"], + infer_datetime_format=False, + ) class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (['nan', '0', ''],) - param_names = ['bad_date_value'] + params = (["nan", "0", ""],) + param_names = ["bad_date_value"] def setup(self, bad_date_value): - self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000) + self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000) def time_read_csv(self, bad_date_value): - read_csv(self.data(self.StringIO_input), - header=None, names=['foo', 'bar'], parse_dates=['foo'], - infer_datetime_format=False) + read_csv( + self.data(self.StringIO_input), + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + infer_datetime_format=False, + ) class ReadCSVSkipRows(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" params = [None, 10000] - param_names = ['skiprows'] + param_names = ["skiprows"] def setup(self, skiprows): N = 20000 index = tm.makeStringIndex(N) - df = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N), - 'string1': ['foo'] * N, - 'bool1': [True] * N, - 'int1': np.random.randint(0, N, size=N)}, - index=index) + df = DataFrame( + { + "float1": np.random.randn(N), + "float2": np.random.randn(N), + "string1": ["foo"] * N, + "bool1": [True] * N, + "int1": np.random.randint(0, N, size=N), + }, + index=index, + ) df.to_csv(self.fname) def time_skipprows(self, skiprows): @@ -146,31 +169,31 @@ def time_skipprows(self, skiprows): class ReadUint64Integers(StringIORewind): - def setup(self): - self.na_values = [2**63 + 500] - arr = np.arange(10000).astype('uint64') + 2**63 - self.data1 = StringIO('\n'.join(arr.astype(str).tolist())) + self.na_values = [2 ** 63 + 500] + arr = np.arange(10000).astype("uint64") + 2 ** 63 + self.data1 = StringIO("\n".join(arr.astype(str).tolist())) arr = arr.astype(object) arr[500] = -1 - self.data2 = StringIO('\n'.join(arr.astype(str).tolist())) + self.data2 = StringIO("\n".join(arr.astype(str).tolist())) def time_read_uint64(self): - read_csv(self.data(self.data1), header=None, names=['foo']) + read_csv(self.data(self.data1), header=None, names=["foo"]) def time_read_uint64_neg_values(self): - read_csv(self.data(self.data2), header=None, names=['foo']) + read_csv(self.data(self.data2), header=None, names=["foo"]) def time_read_uint64_na_values(self): - read_csv(self.data(self.data1), header=None, 
names=['foo'], - na_values=self.na_values) + read_csv( + self.data(self.data1), header=None, names=["foo"], na_values=self.na_values + ) class ReadCSVThousands(BaseIO): - fname = '__test__.csv' - params = ([',', '|'], [None, ',']) - param_names = ['sep', 'thousands'] + fname = "__test__.csv" + params = ([",", "|"], [None, ","]) + param_names = ["sep", "thousands"] def setup(self, sep, thousands): N = 10000 @@ -178,8 +201,8 @@ def setup(self, sep, thousands): data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: - fmt = ':{}'.format(thousands) - fmt = '{' + fmt + '}' + fmt = ":{}".format(thousands) + fmt = "{" + fmt + "}" df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) @@ -188,57 +211,68 @@ def time_thousands(self, sep, thousands): class ReadCSVComment(StringIORewind): - def setup(self): - data = ['A,B,C'] + (['1,2,3 # comment'] * 100000) - self.StringIO_input = StringIO('\n'.join(data)) + data = ["A,B,C"] + (["1,2,3 # comment"] * 100000) + self.StringIO_input = StringIO("\n".join(data)) def time_comment(self): - read_csv(self.data(self.StringIO_input), comment='#', - header=None, names=list('abc')) + read_csv( + self.data(self.StringIO_input), comment="#", header=None, names=list("abc") + ) class ReadCSVFloatPrecision(StringIORewind): - params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip']) - param_names = ['sep', 'decimal', 'float_precision'] + params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) + param_names = ["sep", "decimal", "float_precision"] def setup(self, sep, decimal, float_precision): - floats = [''.join(random.choice(string.digits) for _ in range(28)) - for _ in range(15)] - rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n' + floats = [ + "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15) + ] + rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) def time_read_csv(self, sep, decimal, float_precision): - read_csv(self.data(self.StringIO_input), sep=sep, header=None, - names=list('abc'), float_precision=float_precision) + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + names=list("abc"), + float_precision=float_precision, + ) def time_read_csv_python_engine(self, sep, decimal, float_precision): - read_csv(self.data(self.StringIO_input), sep=sep, header=None, - engine='python', float_precision=None, names=list('abc')) + read_csv( + self.data(self.StringIO_input), + sep=sep, + header=None, + engine="python", + float_precision=None, + names=list("abc"), + ) class ReadCSVCategorical(BaseIO): - fname = '__test__.csv' + fname = "__test__.csv" def setup(self): N = 100000 - group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] - df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc')) + group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"] + df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc")) df.to_csv(self.fname, index=False) def time_convert_post(self): read_csv(self.fname).apply(Categorical) def time_convert_direct(self): - read_csv(self.fname, dtype='category') + read_csv(self.fname, dtype="category") class ReadCSVParseDates(StringIORewind): - def setup(self): data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n @@ -246,34 +280,47 @@ def setup(self): 
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n {},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n """ - two_cols = ['KORD,19990127'] * 5 + two_cols = ["KORD,19990127"] * 5 data = data.format(*two_cols) self.StringIO_input = StringIO(data) def time_multiple_date(self): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]]) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=list(string.digits[:9]), + parse_dates=[[1, 2], [1, 3]], + ) def time_baseline(self): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - parse_dates=[1], - names=list(string.digits[:9])) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + parse_dates=[1], + names=list(string.digits[:9]), + ) class ReadCSVCachedParseDates(StringIORewind): params = ([True, False],) - param_names = ['do_cache'] + param_names = ["do_cache"] def setup(self, do_cache): - data = ('\n'.join('10/{}'.format(year) - for year in range(2000, 2100)) + '\n') * 10 + data = ( + "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n" + ) * 10 self.StringIO_input = StringIO(data) def time_read_csv_cached(self, do_cache): try: - read_csv(self.data(self.StringIO_input), header=None, - parse_dates=[0], cache_dates=do_cache) + read_csv( + self.data(self.StringIO_input), + header=None, + parse_dates=[0], + cache_dates=do_cache, + ) except TypeError: # cache_dates is a new keyword in 0.25 pass @@ -299,12 +346,12 @@ def mem_parser_chunks(self): class ReadCSVParseSpecialDate(StringIORewind): - params = (['mY', 'mdY', 'hm'],) - param_names = ['value'] + params = (["mY", "mdY", "hm"],) + param_names = ["value"] objects = { - 'mY': '01-2019\n10-2019\n02/2000\n', - 'mdY': '12/02/2010\n', - 'hm': '21:34\n' + "mY": "01-2019\n10-2019\n02/2000\n", + "mdY": "12/02/2010\n", + "hm": "21:34\n", } def setup(self, value): @@ -313,38 +360,50 @@ def setup(self, value): self.StringIO_input = StringIO(data) def time_read_special_date(self, value): - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date']) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=["Date"], + parse_dates=["Date"], + ) class ParseDateComparison(StringIORewind): params = ([False, True],) - param_names = ['cache_dates'] + param_names = ["cache_dates"] def setup(self, cache_dates): count_elem = 10000 - data = '12-02-2010\n' * count_elem + data = "12-02-2010\n" * count_elem self.StringIO_input = StringIO(data) def time_read_csv_dayfirst(self, cache_dates): try: - read_csv(self.data(self.StringIO_input), sep=',', header=None, - names=['Date'], parse_dates=['Date'], - cache_dates=cache_dates, - dayfirst=True) + read_csv( + self.data(self.StringIO_input), + sep=",", + header=None, + names=["Date"], + parse_dates=["Date"], + cache_dates=cache_dates, + dayfirst=True, + ) except TypeError: # cache_dates is a new keyword in 0.25 pass def time_to_datetime_dayfirst(self, cache_dates): - df = read_csv(self.data(self.StringIO_input), - dtype={'date': str}, names=['date']) - to_datetime(df['date'], cache=cache_dates, dayfirst=True) + df = read_csv( + self.data(self.StringIO_input), dtype={"date": str}, names=["date"] + ) + to_datetime(df["date"], cache=cache_dates, dayfirst=True) def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): - df = read_csv(self.data(self.StringIO_input), - dtype={'date': str}, names=['date']) - to_datetime(df['date'], cache=cache_dates, 
format='%d-%m-%Y') + df = read_csv( + self.data(self.StringIO_input), dtype={"date": str}, names=["date"] + ) + to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 1decb83f2f723..12e70f84e5203 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -6,19 +6,21 @@ class Excel: - params = ['openpyxl', 'xlsxwriter', 'xlwt'] - param_names = ['engine'] + params = ["openpyxl", "xlsxwriter", "xlwt"] + param_names = ["engine"] def setup(self, engine): N = 2000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.bio_read = BytesIO() self.writer_read = ExcelWriter(self.bio_read, engine=engine) - self.df.to_excel(self.writer_read, sheet_name='Sheet1') + self.df.to_excel(self.writer_read, sheet_name="Sheet1") self.writer_read.save() self.bio_read.seek(0) @@ -29,7 +31,7 @@ def time_write_excel(self, engine): bio_write = BytesIO() bio_write.seek(0) writer_write = ExcelWriter(bio_write, engine=engine) - self.df.to_excel(writer_write, sheet_name='Sheet1') + self.df.to_excel(writer_write, sheet_name="Sheet1") writer_write.save() diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index a5dc28eb9508c..2874a7889156b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -6,86 +6,92 @@ class HDFStoreDataFrame(BaseIO): - def setup(self): N = 25000 index = tm.makeStringIndex(N) - self.df = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=index) - self.df_mixed = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N), - 'string1': ['foo'] * N, - 'bool1': [True] * N, - 'int1': np.random.randint(0, N, size=N)}, - index=index) + self.df = DataFrame( + {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index + ) + self.df_mixed = DataFrame( + { + "float1": np.random.randn(N), + "float2": np.random.randn(N), + "string1": ["foo"] * N, + "bool1": [True] * N, + "int1": np.random.randint(0, N, size=N), + }, + index=index, + ) self.df_wide = DataFrame(np.random.randn(N, 100)) self.start_wide = self.df_wide.index[10000] self.stop_wide = self.df_wide.index[15000] - self.df2 = DataFrame({'float1': np.random.randn(N), - 'float2': np.random.randn(N)}, - index=date_range('1/1/2000', periods=N)) + self.df2 = DataFrame( + {"float1": np.random.randn(N), "float2": np.random.randn(N)}, + index=date_range("1/1/2000", periods=N), + ) self.start = self.df2.index[10000] self.stop = self.df2.index[15000] - self.df_wide2 = DataFrame(np.random.randn(N, 100), - index=date_range('1/1/2000', periods=N)) - self.df_dc = DataFrame(np.random.randn(N, 10), - columns=['C%03d' % i for i in range(10)]) + self.df_wide2 = DataFrame( + np.random.randn(N, 100), index=date_range("1/1/2000", periods=N) + ) + self.df_dc = DataFrame( + np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)] + ) - self.fname = '__test__.h5' + self.fname = "__test__.h5" self.store = HDFStore(self.fname) - self.store.put('fixed', self.df) - self.store.put('fixed_mixed', self.df_mixed) - self.store.append('table', self.df2) - 
self.store.append('table_mixed', self.df_mixed) - self.store.append('table_wide', self.df_wide) - self.store.append('table_wide2', self.df_wide2) + self.store.put("fixed", self.df) + self.store.put("fixed_mixed", self.df_mixed) + self.store.append("table", self.df2) + self.store.append("table_mixed", self.df_mixed) + self.store.append("table_wide", self.df_wide) + self.store.append("table_wide2", self.df_wide2) def teardown(self): self.store.close() self.remove(self.fname) def time_read_store(self): - self.store.get('fixed') + self.store.get("fixed") def time_read_store_mixed(self): - self.store.get('fixed_mixed') + self.store.get("fixed_mixed") def time_write_store(self): - self.store.put('fixed_write', self.df) + self.store.put("fixed_write", self.df) def time_write_store_mixed(self): - self.store.put('fixed_mixed_write', self.df_mixed) + self.store.put("fixed_mixed_write", self.df_mixed) def time_read_store_table_mixed(self): - self.store.select('table_mixed') + self.store.select("table_mixed") def time_write_store_table_mixed(self): - self.store.append('table_mixed_write', self.df_mixed) + self.store.append("table_mixed_write", self.df_mixed) def time_read_store_table(self): - self.store.select('table') + self.store.select("table") def time_write_store_table(self): - self.store.append('table_write', self.df) + self.store.append("table_write", self.df) def time_read_store_table_wide(self): - self.store.select('table_wide') + self.store.select("table_wide") def time_write_store_table_wide(self): - self.store.append('table_wide_write', self.df_wide) + self.store.append("table_wide_write", self.df_wide) def time_write_store_table_dc(self): - self.store.append('table_dc_write', self.df_dc, data_columns=True) + self.store.append("table_dc_write", self.df_dc, data_columns=True) def time_query_store_table_wide(self): - self.store.select('table_wide', where="index > self.start_wide and " - "index < self.stop_wide") + self.store.select( + "table_wide", where="index > self.start_wide and " "index < self.stop_wide" + ) def time_query_store_table(self): - self.store.select('table', where="index > self.start and " - "index < self.stop") + self.store.select("table", where="index > self.start and " "index < self.stop") def time_store_repr(self): repr(self.store) @@ -99,24 +105,26 @@ def time_store_info(self): class HDF(BaseIO): - params = ['table', 'fixed'] - param_names = ['format'] + params = ["table", "fixed"] + param_names = ["format"] def setup(self, format): - self.fname = '__test__.h5' + self.fname = "__test__.h5" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) - self.df.to_hdf(self.fname, 'df', format=format) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) + self.df.to_hdf(self.fname, "df", format=format) def time_read_hdf(self, format): - read_hdf(self.fname, 'df') + read_hdf(self.fname, "df") def time_write_hdf(self, format): - self.df.to_hdf(self.fname, 'df', format=format) + self.df.to_hdf(self.fname, "df", format=format) from ..pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 19d11e6610198..0ce42856fb14a 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -8,16 +8,20 @@ 
class ReadJSON(BaseIO): fname = "__test__.json" - params = (['split', 'index', 'records'], ['int', 'datetime']) - param_names = ['orient', 'index'] + params = (["split", "index", "records"], ["int", "datetime"]) + param_names = ["orient", "index"] def setup(self, orient, index): N = 100000 - indexes = {'int': np.arange(N), - 'datetime': date_range('20000101', periods=N, freq='H')} - df = DataFrame(np.random.randn(N, 5), - columns=['float_{}'.format(i) for i in range(5)], - index=indexes[index]) + indexes = { + "int": np.arange(N), + "datetime": date_range("20000101", periods=N, freq="H"), + } + df = DataFrame( + np.random.randn(N, 5), + columns=["float_{}".format(i) for i in range(5)], + index=indexes[index], + ) df.to_json(self.fname, orient=orient) def time_read_json(self, orient, index): @@ -27,71 +31,85 @@ def time_read_json(self, orient, index): class ReadJSONLines(BaseIO): fname = "__test_lines__.json" - params = ['int', 'datetime'] - param_names = ['index'] + params = ["int", "datetime"] + param_names = ["index"] def setup(self, index): N = 100000 - indexes = {'int': np.arange(N), - 'datetime': date_range('20000101', periods=N, freq='H')} - df = DataFrame(np.random.randn(N, 5), - columns=['float_{}'.format(i) for i in range(5)], - index=indexes[index]) - df.to_json(self.fname, orient='records', lines=True) + indexes = { + "int": np.arange(N), + "datetime": date_range("20000101", periods=N, freq="H"), + } + df = DataFrame( + np.random.randn(N, 5), + columns=["float_{}".format(i) for i in range(5)], + index=indexes[index], + ) + df.to_json(self.fname, orient="records", lines=True) def time_read_json_lines(self, index): - read_json(self.fname, orient='records', lines=True) + read_json(self.fname, orient="records", lines=True) def time_read_json_lines_concat(self, index): - concat(read_json(self.fname, orient='records', lines=True, - chunksize=25000)) + concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) def peakmem_read_json_lines(self, index): - read_json(self.fname, orient='records', lines=True) + read_json(self.fname, orient="records", lines=True) def peakmem_read_json_lines_concat(self, index): - concat(read_json(self.fname, orient='records', lines=True, - chunksize=25000)) + concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) class ToJSON(BaseIO): fname = "__test__.json" - params = ['split', 'columns', 'index'] - param_names = ['orient'] + params = ["split", "columns", "index"] + param_names = ["orient"] def setup(self, lines_orient): - N = 10**5 + N = 10 ** 5 ncols = 5 - index = date_range('20000101', periods=N, freq='H') - timedeltas = timedelta_range(start=1, periods=N, freq='s') - datetimes = date_range(start=1, periods=N, freq='s') + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) - self.df_td_int_ts = DataFrame({'td_1': timedeltas, - 'td_2': timedeltas, - 'int_1': ints, - 'int_2': ints, - 'ts_1': datetimes, - 'ts_2': datetimes}, - index=index) - self.df_int_floats = DataFrame({'int_1': ints, - 'int_2': ints, - 'int_3': ints, - 'float_1': floats, - 'float_2': floats, - 'float_3': floats}, - index=index) - self.df_int_float_str = DataFrame({'int_1': ints, - 'int_2': ints, - 
'float_1': floats, - 'float_2': floats, - 'str_1': strings, - 'str_2': strings}, - index=index) + self.df_td_int_ts = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "int_1": ints, + "int_2": ints, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + self.df_int_floats = DataFrame( + { + "int_1": ints, + "int_2": ints, + "int_3": ints, + "float_1": floats, + "float_2": floats, + "float_3": floats, + }, + index=index, + ) + self.df_int_float_str = DataFrame( + { + "int_1": ints, + "int_2": ints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) def time_floats_with_int_index(self, orient): self.df.to_json(self.fname, orient=orient) @@ -109,39 +127,35 @@ def time_float_int_str(self, orient): self.df_int_float_str.to_json(self.fname, orient=orient) def time_floats_with_int_idex_lines(self, orient): - self.df.to_json(self.fname, orient='records', lines=True) + self.df.to_json(self.fname, orient="records", lines=True) def time_floats_with_dt_index_lines(self, orient): - self.df_date_idx.to_json(self.fname, orient='records', lines=True) + self.df_date_idx.to_json(self.fname, orient="records", lines=True) def time_delta_int_tstamp_lines(self, orient): - self.df_td_int_ts.to_json(self.fname, orient='records', lines=True) + self.df_td_int_ts.to_json(self.fname, orient="records", lines=True) def time_float_int_lines(self, orient): - self.df_int_floats.to_json(self.fname, orient='records', lines=True) + self.df_int_floats.to_json(self.fname, orient="records", lines=True) def time_float_int_str_lines(self, orient): - self.df_int_float_str.to_json(self.fname, orient='records', lines=True) + self.df_int_float_str.to_json(self.fname, orient="records", lines=True) class ToJSONMem: - def setup_cache(self): df = DataFrame([[1]]) - frames = { - 'int': df, - 'float': df.astype(float), - } + frames = {"int": df, "float": df.astype(float)} return frames def peakmem_int(self, frames): - df = frames['int'] + df = frames["int"] for _ in range(100_000): df.to_json() def peakmem_float(self, frames): - df = frames['float'] + df = frames["float"] for _ in range(100_000): df.to_json() diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index dc2642d920fd0..c43df7c2e91ed 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -6,15 +6,16 @@ class MSGPack(BaseIO): - def setup(self): - self.fname = '__test__.msg' + self.fname = "__test__.msg" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.df.to_msgpack(self.fname) def time_read_msgpack(self): diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index edba0358c821a..40256e043a008 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,7 +2,9 @@ try: from pandas._libs.tslibs.parsing import ( - _concat_date_cols, _does_string_look_like_datetime) + _concat_date_cols, + _does_string_look_like_datetime, + ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass @@ -10,8 +12,8 @@ class DoesStringLookLikeDatetime(object): - params = (['2Q2005', '0.0', '10000'],) 
- param_names = ['value'] + params = (["2Q2005", "0.0", "10000"],) + param_names = ["value"] def setup(self, value): self.objects = [value] * 1000000 @@ -23,16 +25,18 @@ def time_check_datetimes(self, value): class ConcatDateCols(object): - params = ([1234567890, 'AAAA'], [1, 2]) - param_names = ['value', 'dim'] + params = ([1234567890, "AAAA"], [1, 2]) + param_names = ["value", "dim"] def setup(self, value, dim): count_elem = 10000 if dim == 1: self.object = (np.array([value] * count_elem),) if dim == 2: - self.object = (np.array([value] * count_elem), - np.array([value] * count_elem)) + self.object = ( + np.array([value] * count_elem), + np.array([value] * count_elem), + ) def time_check_concat(self, value, dim): _concat_date_cols(self.object) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 74a58bbb946aa..286ac767c02e7 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -6,15 +6,16 @@ class Pickle(BaseIO): - def setup(self): - self.fname = '__test__.pkl' + self.fname = "__test__.pkl" N = 100000 C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(N) + self.df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(N) self.df.to_pickle(self.fname) def time_read_pickle(self): diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 8181f1d41ac70..7ce8ef8c12639 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -5,15 +5,25 @@ class SAS: - params = ['sas7bdat', 'xport'] - param_names = ['format'] + params = ["sas7bdat", "xport"] + param_names = ["format"] def setup(self, format): # Read files that are located in 'pandas/io/tests/sas/data' - files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'} + files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"} file = files[format] - paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas', - 'tests', 'io', 'sas', 'data', file] + paths = [ + os.path.dirname(__file__), + "..", + "..", + "..", + "pandas", + "tests", + "io", + "sas", + "data", + file, + ] self.f = os.path.join(*paths) def time_read_msgpack(self, format): diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index ee48f3bd0a3ab..b80872b17a9e4 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -8,31 +8,35 @@ class SQL: - params = ['sqlalchemy', 'sqlite'] - param_names = ['connection'] + params = ["sqlalchemy", "sqlite"] + param_names = ["connection"] def setup(self, connection): N = 10000 - con = {'sqlalchemy': create_engine('sqlite:///:memory:'), - 'sqlite': sqlite3.connect(':memory:')} - self.table_name = 'test_type' - self.query_all = 'SELECT * FROM {}'.format(self.table_name) + con = { + "sqlalchemy": create_engine("sqlite:///:memory:"), + "sqlite": sqlite3.connect(":memory:"), + } + self.table_name = "test_type" + self.query_all = "SELECT * FROM {}".format(self.table_name) self.con = con[connection] - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - 
self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_to_sql_dataframe(self, connection): - self.df.to_sql('test1', self.con, if_exists='replace') + self.df.to_sql("test1", self.con, if_exists="replace") def time_read_sql_query(self, connection): read_sql_query(self.query_all, self.con) @@ -40,85 +44,98 @@ def time_read_sql_query(self, connection): class WriteSQLDtypes: - params = (['sqlalchemy', 'sqlite'], - ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']) - param_names = ['connection', 'dtype'] + params = ( + ["sqlalchemy", "sqlite"], + ["float", "float_with_nan", "string", "bool", "int", "datetime"], + ) + param_names = ["connection", "dtype"] def setup(self, connection, dtype): N = 10000 - con = {'sqlalchemy': create_engine('sqlite:///:memory:'), - 'sqlite': sqlite3.connect(':memory:')} - self.table_name = 'test_type' - self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name) + con = { + "sqlalchemy": create_engine("sqlite:///:memory:"), + "sqlite": sqlite3.connect(":memory:"), + } + self.table_name = "test_type" + self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name) self.con = con[connection] - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_to_sql_dataframe_column(self, connection, dtype): - self.df[[dtype]].to_sql('test1', self.con, if_exists='replace') + self.df[[dtype]].to_sql("test1", self.con, if_exists="replace") def time_read_sql_query_select_column(self, connection, dtype): read_sql_query(self.query_col, self.con) class ReadSQLTable: - def setup(self): N = 10000 - self.table_name = 'test' - self.con = create_engine('sqlite:///:memory:') - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, 
if_exists='replace') + self.table_name = "test" + self.con = create_engine("sqlite:///:memory:") + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_read_sql_table_all(self): read_sql_table(self.table_name, self.con) def time_read_sql_table_parse_dates(self): - read_sql_table(self.table_name, self.con, columns=['datetime_string'], - parse_dates=['datetime_string']) + read_sql_table( + self.table_name, + self.con, + columns=["datetime_string"], + parse_dates=["datetime_string"], + ) class ReadSQLTableDtypes: - params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] - param_names = ['dtype'] + params = ["float", "float_with_nan", "string", "bool", "int", "datetime"] + param_names = ["dtype"] def setup(self, dtype): N = 10000 - self.table_name = 'test' - self.con = create_engine('sqlite:///:memory:') - self.df = DataFrame({'float': np.random.randn(N), - 'float_with_nan': np.random.randn(N), - 'string': ['foo'] * N, - 'bool': [True] * N, - 'int': np.random.randint(0, N, size=N), - 'datetime': date_range('2000-01-01', - periods=N, - freq='s')}, - index=tm.makeStringIndex(N)) - self.df.loc[1000:3000, 'float_with_nan'] = np.nan - self.df['datetime_string'] = self.df['datetime'].astype(str) - self.df.to_sql(self.table_name, self.con, if_exists='replace') + self.table_name = "test" + self.con = create_engine("sqlite:///:memory:") + self.df = DataFrame( + { + "float": np.random.randn(N), + "float_with_nan": np.random.randn(N), + "string": ["foo"] * N, + "bool": [True] * N, + "int": np.random.randint(0, N, size=N), + "datetime": date_range("2000-01-01", periods=N, freq="s"), + }, + index=tm.makeStringIndex(N), + ) + self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df["datetime_string"] = self.df["datetime"].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists="replace") def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index fff10cf10a4d3..b3ed71af47dc8 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -7,26 +7,30 @@ class Stata(BaseIO): - params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] - param_names = ['convert_dates'] + params = ["tc", "td", "tm", "tw", "th", "tq", "ty"] + param_names = ["convert_dates"] def setup(self, convert_dates): - self.fname = '__test__.dta' + self.fname = "__test__.dta" N = self.N = 100000 C = self.C = 5 - self.df = DataFrame(np.random.randn(N, C), - columns=['float{}'.format(i) for i in range(C)], - index=date_range('20000101', periods=N, freq='H')) - self.df['object'] = tm.makeStringIndex(self.N) - self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min, - np.iinfo(np.int8).max - 27, N) - self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min, - np.iinfo(np.int16).max - 27, N) - self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min, - np.iinfo(np.int32).max - 27, N) - self.df['float32_'] = np.array(np.random.randn(N), - dtype=np.float32) - self.convert_dates = {'index': convert_dates} + self.df = DataFrame( + np.random.randn(N, 
C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + self.df["object"] = tm.makeStringIndex(self.N) + self.df["int8_"] = np.random.randint( + np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N + ) + self.df["int16_"] = np.random.randint( + np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27, N + ) + self.df["int32_"] = np.random.randint( + np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27, N + ) + self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32) + self.convert_dates = {"index": convert_dates} self.df.to_stata(self.fname, self.convert_dates) def time_read_stata(self, convert_dates): @@ -42,7 +46,7 @@ def setup(self, convert_dates): for i in range(10): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan - self.df['missing_{0}'.format(i)] = missing_data + self.df["missing_{0}".format(i)] = missing_data self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index bbaba9909966e..7c899e3dc6ac8 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -2,8 +2,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, - date_range, concat, merge, merge_asof) +from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -12,16 +11,14 @@ class Append: - def setup(self): - self.df1 = DataFrame(np.random.randn(10000, 4), - columns=['A', 'B', 'C', 'D']) + self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"]) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() - self.mdf1['obj1'] = 'bar' - self.mdf1['obj2'] = 'bar' - self.mdf1['int1'] = 5 + self.mdf1["obj1"] = "bar" + self.mdf1["obj2"] = "bar" + self.mdf1["int1"] = 5 self.mdf1 = self.mdf1._consolidate() self.mdf2 = self.mdf1.copy() self.mdf2.index = self.df2.index @@ -36,15 +33,16 @@ def time_append_mixed(self): class Concat: params = [0, 1] - param_names = ['axis'] + param_names = ["axis"] def setup(self, axis): N = 1000 s = Series(N, index=tm.makeStringIndex(N)) - self.series = [s[i:- i] for i in range(1, 10)] * 50 + self.series = [s[i:-i] for i in range(1, 10)] * 50 self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 - df = DataFrame({'A': range(N)}, - index=date_range('20130101', periods=N, freq='s')) + df = DataFrame( + {"A": range(N)}, index=date_range("20130101", periods=N, freq="s") + ) self.empty_left = [DataFrame(), df] self.empty_right = [df, DataFrame()] self.mixed_ndims = [df, df.head(N // 2)] @@ -68,14 +66,12 @@ def time_concat_mixed_ndims(self, axis): class ConcatDataFrames: params = ([0, 1], [True, False]) - param_names = ['axis', 'ignore_index'] + param_names = ["axis", "ignore_index"] def setup(self, axis, ignore_index): - frame_c = DataFrame(np.zeros((10000, 200), - dtype=np.float32, order='C')) + frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C")) self.frame_c = [frame_c] * 20 - frame_f = DataFrame(np.zeros((10000, 200), - dtype=np.float32, order='F')) + frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F")) self.frame_f = [frame_f] * 20 def time_c_ordered(self, axis, ignore_index): @@ -88,74 +84,78 @@ def time_f_ordered(self, axis, ignore_index): class Join: params = [True, False] - param_names = ['sort'] + param_names = ["sort"] def setup(self, sort): level1 = 
tm.makeStringIndex(10).values level2 = tm.makeStringIndex(1000).values codes1 = np.arange(10).repeat(1000) codes2 = np.tile(np.arange(1000), 10) - index2 = MultiIndex(levels=[level1, level2], - codes=[codes1, codes2]) - self.df_multi = DataFrame(np.random.randn(len(index2), 4), - index=index2, - columns=['A', 'B', 'C', 'D']) + index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) + self.df_multi = DataFrame( + np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"] + ) self.key1 = np.tile(level1.take(codes1), 10) self.key2 = np.tile(level2.take(codes2), 10) - self.df = DataFrame({'data1': np.random.randn(100000), - 'data2': np.random.randn(100000), - 'key1': self.key1, - 'key2': self.key2}) - - self.df_key1 = DataFrame(np.random.randn(len(level1), 4), - index=level1, - columns=['A', 'B', 'C', 'D']) - self.df_key2 = DataFrame(np.random.randn(len(level2), 4), - index=level2, - columns=['A', 'B', 'C', 'D']) + self.df = DataFrame( + { + "data1": np.random.randn(100000), + "data2": np.random.randn(100000), + "key1": self.key1, + "key2": self.key2, + } + ) + + self.df_key1 = DataFrame( + np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"] + ) + self.df_key2 = DataFrame( + np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"] + ) shuf = np.arange(100000) np.random.shuffle(shuf) self.df_shuf = self.df.reindex(self.df.index[shuf]) def time_join_dataframe_index_multi(self, sort): - self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort) + self.df.join(self.df_multi, on=["key1", "key2"], sort=sort) def time_join_dataframe_index_single_key_bigger(self, sort): - self.df.join(self.df_key2, on='key2', sort=sort) + self.df.join(self.df_key2, on="key2", sort=sort) def time_join_dataframe_index_single_key_small(self, sort): - self.df.join(self.df_key1, on='key1', sort=sort) + self.df.join(self.df_key1, on="key1", sort=sort) def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): - self.df_shuf.join(self.df_key2, on='key2', sort=sort) + self.df_shuf.join(self.df_key2, on="key2", sort=sort) class JoinIndex: - def setup(self): N = 50000 - self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)), - columns=['jim', 'joe']) - self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)), - columns=['jolie', 'jolia']).set_index('jolie') + self.left = DataFrame( + np.random.randint(1, N / 500, (N, 2)), columns=["jim", "joe"] + ) + self.right = DataFrame( + np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"] + ).set_index("jolie") def time_left_outer_join_index(self): - self.left.join(self.right, on='jim') + self.left.join(self.right, on="jim") class JoinNonUnique: # outer join of non-unique # GH 6329 def setup(self): - date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') - daily_dates = date_index.to_period('D').to_timestamp('S', 'S') + date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T") + daily_dates = date_index.to_period("D").to_timestamp("S", "S") self.fracofday = date_index.values - daily_dates.values - self.fracofday = self.fracofday.astype('timedelta64[ns]') + self.fracofday = self.fracofday.astype("timedelta64[ns]") self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0 self.fracofday = Series(self.fracofday, daily_dates) - index = date_range(date_index.min(), date_index.max(), freq='D') + index = date_range(date_index.min(), date_index.max(), freq="D") self.temp = Series(1.0, index)[self.fracofday.index] def time_join_non_unique_equal(self): @@ -165,7 +165,7 
@@ def time_join_non_unique_equal(self): class Merge: params = [True, False] - param_names = ['sort'] + param_names = ["sort"] def setup(self, sort): N = 10000 @@ -173,17 +173,25 @@ def setup(self, sort): indices2 = tm.makeStringIndex(N).values key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) - self.left = DataFrame({'key': key, 'key2': key2, - 'value': np.random.randn(80000)}) - self.right = DataFrame({'key': indices[2000:], - 'key2': indices2[2000:], - 'value2': np.random.randn(8000)}) - - self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), - 'key2': np.tile(np.arange(250).repeat(10), 4), - 'value': np.random.randn(10000)}) - self.df2 = DataFrame({'key1': np.arange(500), - 'value2': np.random.randn(500)}) + self.left = DataFrame( + {"key": key, "key2": key2, "value": np.random.randn(80000)} + ) + self.right = DataFrame( + { + "key": indices[2000:], + "key2": indices2[2000:], + "value2": np.random.randn(8000), + } + ) + + self.df = DataFrame( + { + "key1": np.tile(np.arange(500).repeat(10), 2), + "key2": np.tile(np.arange(250).repeat(10), 4), + "value": np.random.randn(10000), + } + ) + self.df2 = DataFrame({"key1": np.arange(500), "value2": np.random.randn(500)}) self.df3 = self.df[:5000] def time_merge_2intkey(self, sort): @@ -193,125 +201,141 @@ def time_merge_dataframe_integer_2key(self, sort): merge(self.df, self.df3, sort=sort) def time_merge_dataframe_integer_key(self, sort): - merge(self.df, self.df2, on='key1', sort=sort) + merge(self.df, self.df2, on="key1", sort=sort) class I8Merge: - params = ['inner', 'outer', 'left', 'right'] - param_names = ['how'] + params = ["inner", "outer", "left", "right"] + param_names = ["how"] def setup(self, how): - low, high, n = -1000, 1000, 10**6 - self.left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - self.left['left'] = self.left.sum(axis=1) - self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1) + low, high, n = -1000, 1000, 10 ** 6 + self.left = DataFrame( + np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG") + ) + self.left["left"] = self.left.sum(axis=1) + self.right = self.left.sample(frac=1).rename({"left": "right"}, axis=1) self.right = self.right.reset_index(drop=True) - self.right['right'] *= -1 + self.right["right"] *= -1 def time_i8merge(self, how): merge(self.left, self.right, how=how) class MergeCategoricals: - def setup(self): self.left_object = DataFrame( - {'X': np.random.choice(range(0, 10), size=(10000,)), - 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))}) + { + "X": np.random.choice(range(0, 10), size=(10000,)), + "Y": np.random.choice(["one", "two", "three"], size=(10000,)), + } + ) self.right_object = DataFrame( - {'X': np.random.choice(range(0, 10), size=(10000,)), - 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))}) + { + "X": np.random.choice(range(0, 10), size=(10000,)), + "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), + } + ) self.left_cat = self.left_object.assign( - Y=self.left_object['Y'].astype('category')) + Y=self.left_object["Y"].astype("category") + ) self.right_cat = self.right_object.assign( - Z=self.right_object['Z'].astype('category')) + Z=self.right_object["Z"].astype("category") + ) def time_merge_object(self): - merge(self.left_object, self.right_object, on='X') + merge(self.left_object, self.right_object, on="X") def time_merge_cat(self): - merge(self.left_cat, self.right_cat, on='X') + merge(self.left_cat, self.right_cat, on="X") class MergeOrdered: - def 
setup(self): groups = tm.makeStringIndex(10).values - self.left = DataFrame({'group': groups.repeat(5000), - 'key': np.tile(np.arange(0, 10000, 2), 10), - 'lvalue': np.random.randn(50000)}) - self.right = DataFrame({'key': np.arange(10000), - 'rvalue': np.random.randn(10000)}) + self.left = DataFrame( + { + "group": groups.repeat(5000), + "key": np.tile(np.arange(0, 10000, 2), 10), + "lvalue": np.random.randn(50000), + } + ) + self.right = DataFrame( + {"key": np.arange(10000), "rvalue": np.random.randn(10000)} + ) def time_merge_ordered(self): - merge_ordered(self.left, self.right, on='key', left_by='group') + merge_ordered(self.left, self.right, on="key", left_by="group") class MergeAsof: - params = [['backward', 'forward', 'nearest']] - param_names = ['direction'] + params = [["backward", "forward", "nearest"]] + param_names = ["direction"] def setup(self, direction): one_count = 200000 two_count = 1000000 df1 = DataFrame( - {'time': np.random.randint(0, one_count / 20, one_count), - 'key': np.random.choice(list(string.ascii_uppercase), one_count), - 'key2': np.random.randint(0, 25, one_count), - 'value1': np.random.randn(one_count)}) + { + "time": np.random.randint(0, one_count / 20, one_count), + "key": np.random.choice(list(string.ascii_uppercase), one_count), + "key2": np.random.randint(0, 25, one_count), + "value1": np.random.randn(one_count), + } + ) df2 = DataFrame( - {'time': np.random.randint(0, two_count / 20, two_count), - 'key': np.random.choice(list(string.ascii_uppercase), two_count), - 'key2': np.random.randint(0, 25, two_count), - 'value2': np.random.randn(two_count)}) - - df1 = df1.sort_values('time') - df2 = df2.sort_values('time') - - df1['time32'] = np.int32(df1.time) - df2['time32'] = np.int32(df2.time) - - self.df1a = df1[['time', 'value1']] - self.df2a = df2[['time', 'value2']] - self.df1b = df1[['time', 'key', 'value1']] - self.df2b = df2[['time', 'key', 'value2']] - self.df1c = df1[['time', 'key2', 'value1']] - self.df2c = df2[['time', 'key2', 'value2']] - self.df1d = df1[['time32', 'value1']] - self.df2d = df2[['time32', 'value2']] - self.df1e = df1[['time', 'key', 'key2', 'value1']] - self.df2e = df2[['time', 'key', 'key2', 'value2']] + { + "time": np.random.randint(0, two_count / 20, two_count), + "key": np.random.choice(list(string.ascii_uppercase), two_count), + "key2": np.random.randint(0, 25, two_count), + "value2": np.random.randn(two_count), + } + ) + + df1 = df1.sort_values("time") + df2 = df2.sort_values("time") + + df1["time32"] = np.int32(df1.time) + df2["time32"] = np.int32(df2.time) + + self.df1a = df1[["time", "value1"]] + self.df2a = df2[["time", "value2"]] + self.df1b = df1[["time", "key", "value1"]] + self.df2b = df2[["time", "key", "value2"]] + self.df1c = df1[["time", "key2", "value1"]] + self.df2c = df2[["time", "key2", "value2"]] + self.df1d = df1[["time32", "value1"]] + self.df2d = df2[["time32", "value2"]] + self.df1e = df1[["time", "key", "key2", "value1"]] + self.df2e = df2[["time", "key", "key2", "value2"]] def time_on_int(self, direction): - merge_asof(self.df1a, self.df2a, on='time', direction=direction) + merge_asof(self.df1a, self.df2a, on="time", direction=direction) def time_on_int32(self, direction): - merge_asof(self.df1d, self.df2d, on='time32', direction=direction) + merge_asof(self.df1d, self.df2d, on="time32", direction=direction) def time_by_object(self, direction): - merge_asof(self.df1b, self.df2b, on='time', by='key', - direction=direction) + merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction) 
def time_by_int(self, direction): - merge_asof(self.df1c, self.df2c, on='time', by='key2', - direction=direction) + merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction) def time_multiby(self, direction): - merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'], - direction=direction) + merge_asof( + self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction + ) class Align: - def setup(self): - size = 5 * 10**5 - rng = np.arange(0, 10**13, 10**7) - stamps = np.datetime64('now').view('i8') + rng + size = 5 * 10 ** 5 + rng = np.arange(0, 10 ** 13, 10 ** 7) + stamps = np.datetime64("now").view("i8") + rng idx1 = np.sort(np.random.choice(stamps, size, replace=False)) idx2 = np.sort(np.random.choice(stamps, size, replace=False)) self.ts1 = Series(np.random.randn(size), idx1) @@ -321,7 +345,7 @@ def time_series_align_int64_index(self): self.ts1 + self.ts2 def time_series_align_left_monotonic(self): - self.ts1.align(self.ts2, join='left') + self.ts1.align(self.ts2, join="left") from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index c979ba6d53a08..eda059a68e8a5 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -6,46 +6,44 @@ class GetLoc: - def setup(self): self.mi_large = MultiIndex.from_product( [np.arange(1000), np.arange(20), list(string.ascii_letters)], - names=['one', 'two', 'three']) + names=["one", "two", "three"], + ) self.mi_med = MultiIndex.from_product( - [np.arange(1000), np.arange(10), list('A')], - names=['one', 'two', 'three']) + [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"] + ) self.mi_small = MultiIndex.from_product( - [np.arange(100), list('A'), list('A')], - names=['one', 'two', 'three']) + [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] + ) def time_large_get_loc(self): - self.mi_large.get_loc((999, 19, 'Z')) + self.mi_large.get_loc((999, 19, "Z")) def time_large_get_loc_warm(self): for _ in range(1000): - self.mi_large.get_loc((999, 19, 'Z')) + self.mi_large.get_loc((999, 19, "Z")) def time_med_get_loc(self): - self.mi_med.get_loc((999, 9, 'A')) + self.mi_med.get_loc((999, 9, "A")) def time_med_get_loc_warm(self): for _ in range(1000): - self.mi_med.get_loc((999, 9, 'A')) + self.mi_med.get_loc((999, 9, "A")) def time_string_get_loc(self): - self.mi_small.get_loc((99, 'A', 'A')) + self.mi_small.get_loc((99, "A", "A")) def time_small_get_loc_warm(self): for _ in range(1000): - self.mi_small.get_loc((99, 'A', 'A')) + self.mi_small.get_loc((99, "A", "A")) class Duplicates: - def setup(self): size = 65536 - arrays = [np.random.randint(0, 8192, size), - np.random.randint(0, 1024, size)] + arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)] mask = np.random.rand(size) < 0.1 self.mi_unused_levels = MultiIndex.from_arrays(arrays) self.mi_unused_levels = self.mi_unused_levels[mask] @@ -55,15 +53,25 @@ def time_remove_unused_levels(self): class Integer: - def setup(self): - self.mi_int = MultiIndex.from_product([np.arange(1000), - np.arange(1000)], - names=['one', 'two']) - self.obj_index = np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object) + self.mi_int = MultiIndex.from_product( + [np.arange(1000), np.arange(1000)], names=["one", "two"] + ) + self.obj_index = np.array( + [ + (0, 10), + (0, 11), + (0, 12), + (0, 13), + (0, 14), + (0, 15), + (0, 
16), + (0, 17), + (0, 18), + (0, 19), + ], + dtype=object, + ) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) @@ -73,12 +81,9 @@ def time_is_monotonic(self): class Duplicated: - def setup(self): n, k = 200, 5000 - levels = [np.arange(n), - tm.makeStringIndex(n).values, - 1000 + np.arange(n)] + levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] codes = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, codes=codes) @@ -87,12 +92,13 @@ def time_duplicated(self): class Sortlevel: - def setup(self): n = 1182720 low, high = -4096, 4096 - arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) - for k in [11, 7, 5, 3, 1]] + arrs = [ + np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 3, 1] + ] self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)] a = np.repeat(np.arange(100), 1000) @@ -111,11 +117,10 @@ def time_sortlevel_one(self): class Values: - def setup_cache(self): level1 = range(1000) - level2 = date_range(start='1/1/2012', periods=100) + level2 = date_range(start="1/1/2012", periods=100) mi = MultiIndex.from_product([level1, level2]) return mi @@ -127,17 +132,18 @@ def time_datetime_level_values_sliced(self, mi): class CategoricalLevel: - def setup(self): - self.df = DataFrame({ - 'a': np.arange(1_000_000, dtype=np.int32), - 'b': np.arange(1_000_000, dtype=np.int64), - 'c': np.arange(1_000_000, dtype=float), - }).astype({'a': 'category', 'b': 'category'}) + self.df = DataFrame( + { + "a": np.arange(1_000_000, dtype=np.int32), + "b": np.arange(1_000_000, dtype=np.int64), + "c": np.arange(1_000_000, dtype=float), + } + ).astype({"a": "category", "b": "category"}) def time_categorical_level(self): - self.df.set_index(['a', 'b']) + self.df.set_index(["a", "b"]) from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 9b738e699a5b3..31c3b6fb6cb60 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd + try: import pandas.tseries.holiday # noqa except ImportError: @@ -10,35 +11,43 @@ hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplimentedError with .apply_index() -non_apply = [pd.offsets.Day(), - pd.offsets.BYearEnd(), - pd.offsets.BYearBegin(), - pd.offsets.BQuarterEnd(), - pd.offsets.BQuarterBegin(), - pd.offsets.BMonthEnd(), - pd.offsets.BMonthBegin(), - pd.offsets.CustomBusinessDay(), - pd.offsets.CustomBusinessDay(calendar=hcal), - pd.offsets.CustomBusinessMonthBegin(calendar=hcal), - pd.offsets.CustomBusinessMonthEnd(calendar=hcal), - pd.offsets.CustomBusinessMonthEnd(calendar=hcal)] -other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(), - pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(), - pd.offsets.MonthEnd(), pd.offsets.MonthBegin(), - pd.offsets.DateOffset(months=2, days=2), - pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(), - pd.offsets.SemiMonthBegin()] +non_apply = [ + pd.offsets.Day(), + pd.offsets.BYearEnd(), + pd.offsets.BYearBegin(), + pd.offsets.BQuarterEnd(), + pd.offsets.BQuarterBegin(), + pd.offsets.BMonthEnd(), + pd.offsets.BMonthBegin(), + pd.offsets.CustomBusinessDay(), + pd.offsets.CustomBusinessDay(calendar=hcal), + pd.offsets.CustomBusinessMonthBegin(calendar=hcal), + pd.offsets.CustomBusinessMonthEnd(calendar=hcal), + pd.offsets.CustomBusinessMonthEnd(calendar=hcal), +] +other_offsets = [ + pd.offsets.YearEnd(), + 
pd.offsets.YearBegin(), + pd.offsets.QuarterEnd(), + pd.offsets.QuarterBegin(), + pd.offsets.MonthEnd(), + pd.offsets.MonthBegin(), + pd.offsets.DateOffset(months=2, days=2), + pd.offsets.BusinessDay(), + pd.offsets.SemiMonthEnd(), + pd.offsets.SemiMonthBegin(), +] offsets = non_apply + other_offsets class ApplyIndex: params = other_offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 10000 - self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + self.rng = pd.date_range(start="1/1/2000", periods=N, freq="T") def time_apply_index(self, offset): offset.apply_index(self.rng) @@ -47,13 +56,15 @@ def time_apply_index(self, offset): class OnOffset: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): - self.dates = [datetime(2016, m, d) - for m in [10, 11, 12] - for d in [1, 2, 3, 28, 29, 30, 31] - if not (m == 11 and d == 31)] + self.dates = [ + datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31) + ] def time_on_offset(self, offset): for date in self.dates: @@ -63,11 +74,11 @@ def time_on_offset(self, offset): class OffsetSeriesArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 1000 - rng = pd.date_range(start='1/1/2000', periods=N, freq='T') + rng = pd.date_range(start="1/1/2000", periods=N, freq="T") self.data = pd.Series(rng) def time_add_offset(self, offset): @@ -78,11 +89,11 @@ def time_add_offset(self, offset): class OffsetDatetimeIndexArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): N = 1000 - self.data = pd.date_range(start='1/1/2000', periods=N, freq='T') + self.data = pd.date_range(start="1/1/2000", periods=N, freq="T") def time_add_offset(self, offset): with warnings.catch_warnings(record=True): @@ -92,11 +103,11 @@ def time_add_offset(self, offset): class OffestDatetimeArithmetic: params = offsets - param_names = ['offset'] + param_names = ["offset"] def setup(self, offset): self.date = datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') + self.dt64 = np.datetime64("2011-01-01 09:00Z") def time_apply(self, offset): offset.apply(self.date) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 59b1638920666..fdc8207021c0f 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -5,26 +5,42 @@ import pandas as pd # Compatibility import for lib -for imp in ['pandas._libs.lib', 'pandas.lib']: +for imp in ["pandas._libs.lib", "pandas.lib"]: try: lib = import_module(imp) break except (ImportError, TypeError, ValueError): pass -numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, - np.float64, np.int16, np.int8, np.uint16, np.uint8] +numeric_dtypes = [ + np.int64, + np.int32, + np.uint32, + np.uint64, + np.float32, + np.float64, + np.int16, + np.int8, + np.uint16, + np.uint8, +] datetime_dtypes = [np.datetime64, np.timedelta64] string_dtypes = [np.object] try: - extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, - pd.Int32Dtype, pd.Int64Dtype, - pd.UInt8Dtype, pd.UInt16Dtype, - pd.UInt32Dtype, pd.UInt64Dtype, - pd.CategoricalDtype, - pd.IntervalDtype, - pd.DatetimeTZDtype('ns', 'UTC'), - pd.PeriodDtype('D')] + extension_dtypes = [ + pd.Int8Dtype, + pd.Int16Dtype, + pd.Int32Dtype, + pd.Int64Dtype, + pd.UInt8Dtype, + pd.UInt16Dtype, + pd.UInt32Dtype, + pd.UInt64Dtype, + pd.CategoricalDtype, + 
pd.IntervalDtype, + pd.DatetimeTZDtype("ns", "UTC"), + pd.PeriodDtype("D"), + ] except AttributeError: extension_dtypes = [] @@ -40,6 +56,7 @@ class BaseIO: """ Base class for IO benchmarks """ + fname = None def remove(self, f): diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index c8ba6c382cb64..2f8ae0650ab75 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,18 +1,33 @@ -from pandas import ( - DataFrame, Period, PeriodIndex, Series, date_range, period_range) +from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range from pandas.tseries.frequencies import to_offset class PeriodProperties: - params = (['M', 'min'], - ['year', 'month', 'day', 'hour', 'minute', 'second', - 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth', - 'dayofweek', 'dayofyear', 'start_time', 'end_time']) - param_names = ['freq', 'attr'] + params = ( + ["M", "min"], + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "is_leap_year", + "quarter", + "qyear", + "week", + "daysinmonth", + "dayofweek", + "dayofyear", + "start_time", + "end_time", + ], + ) + param_names = ["freq", "attr"] def setup(self, freq, attr): - self.per = Period('2012-06-01', freq=freq) + self.per = Period("2012-06-01", freq=freq) def time_property(self, freq, attr): getattr(self.per, attr) @@ -20,11 +35,11 @@ def time_property(self, freq, attr): class PeriodUnaryMethods: - params = ['M', 'min'] - param_names = ['freq'] + params = ["M", "min"] + param_names = ["freq"] def setup(self, freq): - self.per = Period('2012-06-01', freq=freq) + self.per = Period("2012-06-01", freq=freq) def time_to_timestamp(self, freq): self.per.to_timestamp() @@ -33,12 +48,12 @@ def time_now(self, freq): self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq('A') + self.per.asfreq("A") class PeriodConstructor: - params = [['D'], [True, False]] - param_names = ['freq', 'is_offset'] + params = [["D"], [True, False]] + param_names = ["freq", "is_offset"] def setup(self, freq, is_offset): if is_offset: @@ -47,20 +62,21 @@ def setup(self, freq, is_offset): self.freq = freq def time_period_constructor(self, freq, is_offset): - Period('2012-06-01', freq=freq) + Period("2012-06-01", freq=freq) class PeriodIndexConstructor: - params = [['D'], [True, False]] - param_names = ['freq', 'is_offset'] + params = [["D"], [True, False]] + param_names = ["freq", "is_offset"] def setup(self, freq, is_offset): - self.rng = date_range('1985', periods=1000) - self.rng2 = date_range('1985', periods=1000).to_pydatetime() + self.rng = date_range("1985", periods=1000) + self.rng2 = date_range("1985", periods=1000).to_pydatetime() self.ints = list(range(2000, 3000)) - self.daily_ints = date_range('1/1/2000', periods=1000, - freq=freq).strftime('%Y%m%d').map(int) + self.daily_ints = ( + date_range("1/1/2000", periods=1000, freq=freq).strftime("%Y%m%d").map(int) + ) if is_offset: self.freq = to_offset(freq) else: @@ -80,32 +96,35 @@ def time_from_ints_daily(self, freq, is_offset): class DataFramePeriodColumn: - def setup(self): - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.rng = period_range(start="1/1/1990", freq="S", periods=20000) self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): - self.df['col'] = self.rng + self.df["col"] = self.rng def time_set_index(self): # GH#21582 limited by comparisons of Period objects - self.df['col2'] = self.rng - self.df.set_index('col2', append=True) + self.df["col2"] = self.rng 
+ self.df.set_index("col2", append=True) class Algorithms: - params = ['index', 'series'] - param_names = ['typ'] + params = ["index", "series"] + param_names = ["typ"] def setup(self, typ): - data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), - Period('2011-03', freq='M'), Period('2011-04', freq='M')] - - if typ == 'index': - self.vector = PeriodIndex(data * 1000, freq='M') - elif typ == 'series': + data = [ + Period("2011-01", freq="M"), + Period("2011-02", freq="M"), + Period("2011-03", freq="M"), + Period("2011-04", freq="M"), + ] + + if typ == "index": + self.vector = PeriodIndex(data * 1000, freq="M") + elif typ == "series": self.vector = Series(data * 1000) def time_drop_duplicates(self, typ): @@ -116,9 +135,8 @@ def time_value_counts(self, typ): class Indexing: - def setup(self): - self.index = period_range(start='1985', periods=1000, freq='D') + self.index = period_range(start="1985", periods=1000, freq="D") self.series = Series(range(1000), index=self.index) self.period = self.index[500] @@ -135,7 +153,7 @@ def time_series_loc(self): self.series.loc[self.period] def time_align(self): - DataFrame({'a': self.series, 'b': self.series[:500]}) + DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index[:750].intersection(self.index[250:]) diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 9e3bc87c32987..4fb0876f05a0a 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,27 +1,29 @@ import numpy as np from pandas import DataFrame, Series, DatetimeIndex, date_range + try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") class SeriesPlotting: - params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']] - param_names = ['kind'] + params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]] + param_names = ["kind"] def setup(self, kind): - if kind in ['bar', 'barh', 'pie']: + if kind in ["bar", "barh", "pie"]: n = 100 - elif kind in ['kde']: + elif kind in ["kde"]: n = 10000 else: n = 1000000 self.s = Series(np.random.randn(n)) - if kind in ['area', 'pie']: + if kind in ["area", "pie"]: self.s = self.s.abs() def time_series_plot(self, kind): @@ -29,41 +31,43 @@ def time_series_plot(self, kind): class FramePlotting: - params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter', - 'hexbin']] - param_names = ['kind'] + params = [ + ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"] + ] + param_names = ["kind"] def setup(self, kind): - if kind in ['bar', 'barh', 'pie']: + if kind in ["bar", "barh", "pie"]: n = 100 - elif kind in ['kde', 'scatter', 'hexbin']: + elif kind in ["kde", "scatter", "hexbin"]: n = 10000 else: n = 1000000 self.x = Series(np.random.randn(n)) self.y = Series(np.random.randn(n)) - if kind in ['area', 'pie']: + if kind in ["area", "pie"]: self.x = self.x.abs() self.y = self.y.abs() - self.df = DataFrame({'x': self.x, 'y': self.y}) + self.df = DataFrame({"x": self.x, "y": self.y}) def time_frame_plot(self, kind): - self.df.plot(x='x', y='y', kind=kind) + self.df.plot(x="x", y="y", kind=kind) class TimeseriesPlotting: - def setup(self): N = 2000 M = 5 - idx = date_range('1/1/1975', periods=N) + idx = date_range("1/1/1975", periods=N) self.df = DataFrame(np.random.randn(N, M), index=idx) - idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10], - idx.values[12:]))) - 
self.df2 = DataFrame(np.random.randn(len(idx_irregular), M), - index=idx_irregular) + idx_irregular = DatetimeIndex( + np.concatenate((idx.values[0:10], idx.values[12:])) + ) + self.df2 = DataFrame( + np.random.randn(len(idx_irregular), M), index=idx_irregular + ) def time_plot_regular(self): self.df.plot() @@ -79,12 +83,11 @@ def time_plot_table(self): class Misc: - def setup(self): N = 500 M = 10 self.df = DataFrame(np.random.randn(N, M)) - self.df['Name'] = ["A"] * N + self.df["Name"] = ["A"] * N def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index a6ceb0e93a089..8d4c9ebaf3e89 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,20 +1,18 @@ import numpy as np import pandas.util.testing as tm -from pandas import (DataFrame, Series, MultiIndex, Index, date_range, - period_range) +from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range from .pandas_vb_common import lib class Reindex: - def setup(self): - rng = date_range(start='1/1/1970', periods=10000, freq='1min') - self.df = DataFrame(np.random.rand(10000, 10), index=rng, - columns=range(10)) - self.df['foo'] = 'bar' + rng = date_range(start="1/1/1970", periods=10000, freq="1min") + self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10)) + self.df["foo"] = "bar" self.rng_subset = Index(rng[::2]) - self.df2 = DataFrame(index=range(10000), - data=np.random.rand(10000, 30), columns=range(30)) + self.df2 = DataFrame( + index=range(10000), data=np.random.rand(10000, 30), columns=range(30) + ) N = 5000 K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) @@ -35,12 +33,12 @@ def time_reindex_multiindex(self): class ReindexMethod: - params = [['pad', 'backfill'], [date_range, period_range]] - param_names = ['method', 'constructor'] + params = [["pad", "backfill"], [date_range, period_range]] + param_names = ["method", "constructor"] def setup(self, method, constructor): N = 100000 - self.idx = constructor('1/1/2000', periods=N, freq='1min') + self.idx = constructor("1/1/2000", periods=N, freq="1min") self.ts = Series(np.random.randn(N), index=self.idx)[::2] def time_reindex_method(self, method, constructor): @@ -49,15 +47,15 @@ def time_reindex_method(self, method, constructor): class Fillna: - params = ['pad', 'backfill'] - param_names = ['method'] + params = ["pad", "backfill"] + param_names = ["method"] def setup(self, method): N = 100000 - self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.idx = date_range("1/1/2000", periods=N, freq="1min") ts = Series(np.random.randn(N), index=self.idx)[::2] self.ts_reindexed = ts.reindex(self.idx) - self.ts_float32 = self.ts_reindexed.astype('float32') + self.ts_float32 = self.ts_reindexed.astype("float32") def time_reindexed(self, method): self.ts_reindexed.fillna(method=method) @@ -67,17 +65,17 @@ def time_float_32(self, method): class LevelAlign: - def setup(self): self.index = MultiIndex( levels=[np.arange(10), np.arange(100), np.arange(100)], - codes=[np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)]) - self.df = DataFrame(np.random.randn(len(self.index), 4), - index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), - index=self.index.levels[1]) + codes=[ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ], + ) + self.df = 
DataFrame(np.random.randn(len(self.index), 4), index=self.index) + self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) def time_align_level(self): self.df.align(self.df_level, level=1, copy=False) @@ -89,15 +87,16 @@ def time_reindex_level(self): class DropDuplicates: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): N = 10000 K = 10 key1 = tm.makeStringIndex(N).values.repeat(K) key2 = tm.makeStringIndex(N).values.repeat(K) - self.df = DataFrame({'key1': key1, 'key2': key2, - 'value': np.random.randn(N * K)}) + self.df = DataFrame( + {"key1": key1, "key2": key2, "value": np.random.randn(N * K)} + ) self.df_nan = self.df.copy() self.df_nan.iloc[:10000, :] = np.nan @@ -107,15 +106,14 @@ def setup(self, inplace): N = 1000000 K = 10000 key1 = np.random.randint(0, K, size=N) - self.df_int = DataFrame({'key1': key1}) - self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), - dtype=bool)) + self.df_int = DataFrame({"key1": key1}) + self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool)) def time_frame_drop_dups(self, inplace): - self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) + self.df.drop_duplicates(["key1", "key2"], inplace=inplace) def time_frame_drop_dups_na(self, inplace): - self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) + self.df_nan.drop_duplicates(["key1", "key2"], inplace=inplace) def time_series_drop_dups_int(self, inplace): self.s.drop_duplicates(inplace=inplace) @@ -137,16 +135,16 @@ def setup(self): indices = tm.makeStringIndex(n) subsample_size = 40000 self.x = Series(np.random.randn(n), indices) - self.y = Series(np.random.randn(subsample_size), - index=np.random.choice(indices, subsample_size, - replace=False)) + self.y = Series( + np.random.randn(subsample_size), + index=np.random.choice(indices, subsample_size, replace=False), + ) def time_align_series_irregular_string(self): self.x + self.y class LibFastZip: - def setup(self): N = 10000 K = 10 diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 9dff1778f8e56..6137e944e6b9e 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -5,11 +5,11 @@ class FillNa: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): - N = 10**6 - rng = pd.date_range('1/1/2000', periods=N, freq='min') + N = 10 ** 6 + rng = pd.date_range("1/1/2000", periods=N, freq="min") data = np.random.randn(N) data[::2] = np.nan self.ts = pd.Series(data, index=rng) @@ -24,13 +24,13 @@ def time_replace(self, inplace): class ReplaceDict: params = [True, False] - param_names = ['inplace'] + param_names = ["inplace"] def setup(self, inplace): - N = 10**5 - start_value = 10**5 + N = 10 ** 5 + start_value = 10 ** 5 self.to_rep = dict(enumerate(np.arange(N) + start_value)) - self.s = pd.Series(np.random.randint(N, size=10**3)) + self.s = pd.Series(np.random.randint(N, size=10 ** 3)) def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) @@ -38,14 +38,17 @@ def time_replace_series(self, inplace): class Convert: - params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) - param_names = ['constructor', 'replace_data'] + params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) + param_names = ["constructor", "replace_data"] def setup(self, constructor, replace_data): - N = 10**3 - data = {'Series': pd.Series(np.random.randint(N, size=N)), - 'DataFrame': pd.DataFrame({'A': 
np.random.randint(N, size=N), - 'B': np.random.randint(N, size=N)})} + N = 10 ** 3 + data = { + "Series": pd.Series(np.random.randint(N, size=N)), + "DataFrame": pd.DataFrame( + {"A": np.random.randint(N, size=N), "B": np.random.randint(N, size=N)} + ), + } self.to_replace = {i: getattr(pd, replace_data) for i in range(N)} self.data = data[constructor] diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 678403d837805..f41e13163b3f5 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -7,35 +7,33 @@ class Melt: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) - self.df['id1'] = np.random.randint(0, 10, 10000) - self.df['id2'] = np.random.randint(100, 1000, 10000) + self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"]) + self.df["id1"] = np.random.randint(0, 10, 10000) + self.df["id2"] = np.random.randint(100, 1000, 10000) def time_melt_dataframe(self): - melt(self.df, id_vars=['id1', 'id2']) + melt(self.df, id_vars=["id1", "id2"]) class Pivot: - def setup(self): N = 10000 - index = date_range('1/1/2000', periods=N, freq='h') - data = {'value': np.random.randn(N * 50), - 'variable': np.arange(50).repeat(N), - 'date': np.tile(index.values, 50)} + index = date_range("1/1/2000", periods=N, freq="h") + data = { + "value": np.random.randn(N * 50), + "variable": np.arange(50).repeat(N), + "date": np.tile(index.values, 50), + } self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.df.pivot('date', 'variable', 'value') + self.df.pivot("date", "variable", "value") class SimpleReshape: - def setup(self): - arrays = [np.arange(100).repeat(100), - np.roll(np.tile(np.arange(100), 100), 25)] + arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)] index = MultiIndex.from_arrays(arrays) self.df = DataFrame(np.random.randn(10000, 4), index=index) self.udf = self.df.unstack(1) @@ -49,7 +47,7 @@ def time_unstack(self): class Unstack: - params = ['int', 'category'] + params = ["int", "category"] def setup(self, dtype): m = 100 @@ -58,7 +56,7 @@ def setup(self, dtype): levels = np.arange(m) index = MultiIndex.from_product([levels] * 2) columns = np.arange(n) - if dtype == 'int': + if dtype == "int": values = np.arange(m * m * n).reshape(m * m, n) else: # the category branch is ~20x slower than int. 
So we @@ -80,84 +78,94 @@ def time_without_last_row(self, dtype): class SparseIndex: - def setup(self): NUM_ROWS = 1000 - self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS), - 'B': np.random.randint(50, size=NUM_ROWS), - 'C': np.random.randint(-10, 10, size=NUM_ROWS), - 'D': np.random.randint(-10, 10, size=NUM_ROWS), - 'E': np.random.randint(10, size=NUM_ROWS), - 'F': np.random.randn(NUM_ROWS)}) - self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E']) + self.df = DataFrame( + { + "A": np.random.randint(50, size=NUM_ROWS), + "B": np.random.randint(50, size=NUM_ROWS), + "C": np.random.randint(-10, 10, size=NUM_ROWS), + "D": np.random.randint(-10, 10, size=NUM_ROWS), + "E": np.random.randint(10, size=NUM_ROWS), + "F": np.random.randn(NUM_ROWS), + } + ) + self.df = self.df.set_index(["A", "B", "C", "D", "E"]) def time_unstack(self): self.df.unstack() class WideToLong: - def setup(self): nyrs = 20 nidvars = 20 N = 5000 - self.letters = list('ABCD') - yrvars = [l + str(num) - for l, num in product(self.letters, range(1, nyrs + 1))] + self.letters = list("ABCD") + yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))] columns = [str(i) for i in range(nidvars)] + yrvars - self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), - columns=columns) - self.df['id'] = self.df.index + self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns) + self.df["id"] = self.df.index def time_wide_to_long_big(self): - wide_to_long(self.df, self.letters, i='id', j='year') + wide_to_long(self.df, self.letters, i="id", j="year") class PivotTable: - def setup(self): N = 100000 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') + fac1 = np.array(["A", "B", "C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") ind1 = np.random.randint(0, 3, size=N) ind2 = np.random.randint(0, 2, size=N) - self.df = DataFrame({'key1': fac1.take(ind1), - 'key2': fac2.take(ind2), - 'key3': fac2.take(ind2), - 'value1': np.random.randn(N), - 'value2': np.random.randn(N), - 'value3': np.random.randn(N)}) - self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'), - 'col3': [1, 2, 3, 4, 5]}) - self.df2.col1 = self.df2.col1.astype('category') - self.df2.col2 = self.df2.col2.astype('category') + self.df = DataFrame( + { + "key1": fac1.take(ind1), + "key2": fac2.take(ind2), + "key3": fac2.take(ind2), + "value1": np.random.randn(N), + "value2": np.random.randn(N), + "value3": np.random.randn(N), + } + ) + self.df2 = DataFrame( + {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} + ) + self.df2.col1 = self.df2.col1.astype("category") + self.df2.col2 = self.df2.col2.astype("category") def time_pivot_table(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3']) + self.df.pivot_table(index="key1", columns=["key2", "key3"]) def time_pivot_table_agg(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3'], - aggfunc=['sum', 'mean']) + self.df.pivot_table( + index="key1", columns=["key2", "key3"], aggfunc=["sum", "mean"] + ) def time_pivot_table_margins(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3'], - margins=True) + self.df.pivot_table(index="key1", columns=["key2", "key3"], margins=True) def time_pivot_table_categorical(self): - self.df2.pivot_table(index='col1', values='col3', columns='col2', - aggfunc=np.sum, fill_value=0) + self.df2.pivot_table( + index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + ) def 
time_pivot_table_categorical_observed(self): - self.df2.pivot_table(index='col1', values='col3', columns='col2', - aggfunc=np.sum, fill_value=0, observed=True) + self.df2.pivot_table( + index="col1", + values="col3", + columns="col2", + aggfunc=np.sum, + fill_value=0, + observed=True, + ) class Crosstab: - def setup(self): N = 100000 - fac1 = np.array(['A', 'B', 'C'], dtype='O') - fac2 = np.array(['one', 'two'], dtype='O') + fac1 = np.array(["A", "B", "C"], dtype="O") + fac2 = np.array(["one", "two"], dtype="O") self.ind1 = np.random.randint(0, 3, size=N) self.ind2 = np.random.randint(0, 2, size=N) self.vec1 = fac1.take(self.ind1) @@ -167,7 +175,7 @@ def time_crosstab(self): pd.crosstab(self.vec1, self.vec2) def time_crosstab_values(self): - pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum') + pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc="sum") def time_crosstab_normalize(self): pd.crosstab(self.vec1, self.vec2, normalize=True) @@ -179,8 +187,10 @@ def time_crosstab_normalize_margins(self): class GetDummies: def setup(self): categories = list(string.ascii_letters[:12]) - s = pd.Series(np.random.choice(categories, size=1000000), - dtype=pd.api.types.CategoricalDtype(categories)) + s = pd.Series( + np.random.choice(categories, size=1000000), + dtype=pd.api.types.CategoricalDtype(categories), + ) self.s = s def time_get_dummies_1d(self): @@ -192,16 +202,18 @@ def time_get_dummies_1d_sparse(self): class Cut: params = [[4, 10, 1000]] - param_names = ['bins'] + param_names = ["bins"] def setup(self, bins): - N = 10**5 + N = 10 ** 5 self.int_series = pd.Series(np.arange(N).repeat(5)) self.float_series = pd.Series(np.random.randn(N).repeat(5)) - self.timedelta_series = pd.Series(np.random.randint(N, size=N), - dtype='timedelta64[ns]') - self.datetime_series = pd.Series(np.random.randint(N, size=N), - dtype='datetime64[ns]') + self.timedelta_series = pd.Series( + np.random.randint(N, size=N), dtype="timedelta64[ns]" + ) + self.datetime_series = pd.Series( + np.random.randint(N, size=N), dtype="datetime64[ns]" + ) def time_cut_int(self, bins): pd.cut(self.int_series, bins) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 033b466c8b9be..a70977fcf539f 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -4,15 +4,16 @@ class Methods: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) @@ -22,14 +23,15 @@ def time_rolling(self, constructor, window, dtype, method): class ExpandingMethods: - params = (['DataFrame', 'Series'], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * 
np.random.random(N)).astype(dtype) self.expanding = getattr(pd, constructor)(arr).expanding() @@ -39,14 +41,11 @@ def time_expanding(self, constructor, dtype, method): class EWMMethods: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - ['mean', 'std']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"]) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) @@ -55,29 +54,28 @@ def time_ewm(self, constructor, window, dtype, method): class VariableWindowMethods(Methods): - params = (['DataFrame', 'Series'], - ['50s', '1h', '1d'], - ['int', 'float'], - ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum']) - param_names = ['contructor', 'window', 'dtype', 'method'] + params = ( + ["DataFrame", "Series"], + ["50s", "1h", "1d"], + ["int", "float"], + ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"], + ) + param_names = ["contructor", "window", "dtype", "method"] def setup(self, constructor, window, dtype, method): - N = 10**5 + N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) - index = pd.date_range('2017-01-01', periods=N, freq='5s') + index = pd.date_range("2017-01-01", periods=N, freq="5s") self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) class Pairwise: - params = ([10, 1000, None], - ['corr', 'cov'], - [True, False]) - param_names = ['window', 'method', 'pairwise'] + params = ([10, 1000, None], ["corr", "cov"], [True, False]) + param_names = ["window", "method", "pairwise"] def setup(self, window, method, pairwise): - N = 10**4 + N = 10 ** 4 arr = np.random.random(N) self.df = pd.DataFrame(arr) @@ -90,25 +88,25 @@ def time_pairwise(self, window, method, pairwise): class Quantile: - params = (['DataFrame', 'Series'], - [10, 1000], - ['int', 'float'], - [0, 0.5, 1], - ['linear', 'nearest', 'lower', 'higher', 'midpoint']) - param_names = ['constructor', 'window', 'dtype', 'percentile'] + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + [0, 0.5, 1], + ["linear", "nearest", "lower", "higher", "midpoint"], + ) + param_names = ["constructor", "window", "dtype", "percentile"] def setup(self, constructor, window, dtype, percentile, interpolation): N = 10 ** 5 arr = np.random.random(N).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_quantile(self, constructor, window, dtype, percentile, - interpolation): + def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) class PeakMemFixed: - def setup(self): N = 10 arr = 100 * np.random.random(N) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 4b1af2dc8c932..e2835c5156f55 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -7,13 +7,13 @@ class SeriesConstructor: - params = [None, 'dict'] - param_names = ['data'] + params = [None, "dict"] + param_names = ["data"] def setup(self, data): - self.idx = date_range(start=datetime(2015, 10, 26), - end=datetime(2016, 1, 1), - freq='50s') + self.idx = date_range( + start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s" + ) dict_data = dict(zip(self.idx, range(len(self.idx)))) self.data = 
None if data is None else dict_data @@ -23,8 +23,8 @@ def time_constructor(self, data): class IsIn: - params = ['int64', 'uint64', 'object'] - param_names = ['dtype'] + params = ["int64", "uint64", "object"] + param_names = ["dtype"] def setup(self, dtype): self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) @@ -35,12 +35,11 @@ def time_isin(self, dtypes): class IsInFloat64: - def setup(self): self.small = Series([1, 2], dtype=np.float64) - self.many_different_values = np.arange(10**6, dtype=np.float64) - self.few_different_values = np.zeros(10**7, dtype=np.float64) - self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64) + self.many_different_values = np.arange(10 ** 6, dtype=np.float64) + self.few_different_values = np.zeros(10 ** 7, dtype=np.float64) + self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64) def time_isin_many_different(self): # runtime is dominated by creation of the lookup-table @@ -56,19 +55,18 @@ def time_isin_nan_values(self): class IsInForObjects: - def setup(self): - self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object) - self.vals_nans = np.full(10**4, np.nan).astype(np.object) + self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object) + self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object) self.s_short = Series(np.arange(2)).astype(np.object) - self.s_long = Series(np.arange(10**5)).astype(np.object) + self.s_long = Series(np.arange(10 ** 5)).astype(np.object) self.vals_short = np.arange(2).astype(np.object) - self.vals_long = np.arange(10**5).astype(np.object) + self.vals_long = np.arange(10 ** 5).astype(np.object) # because of nans floats are special: - self.s_long_floats = Series(np.arange(10**5, - dtype=np.float)).astype(np.object) - self.vals_long_floats = np.arange(10**5, - dtype=np.float).astype(np.object) + self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype( + np.object + ) + self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object) def time_isin_nans(self): # if nan-objects are different objects, @@ -94,8 +92,8 @@ def time_isin_long_series_long_values_floats(self): class NSort: - params = ['first', 'last', 'all'] - param_names = ['keep'] + params = ["first", "last", "all"] + param_names = ["keep"] def setup(self, keep): self.s = Series(np.random.randint(1, 10, 100000)) @@ -109,15 +107,17 @@ def time_nsmallest(self, keep): class Dropna: - params = ['int', 'datetime'] - param_names = ['dtype'] + params = ["int", "datetime"] + param_names = ["dtype"] def setup(self, dtype): - N = 10**6 - data = {'int': np.random.randint(1, 10, N), - 'datetime': date_range('2000-01-01', freq='S', periods=N)} + N = 10 ** 6 + data = { + "int": np.random.randint(1, 10, N), + "datetime": date_range("2000-01-01", freq="S", periods=N), + } self.s = Series(data[dtype]) - if dtype == 'datetime': + if dtype == "datetime": self.s[np.random.randint(1, N, 100)] = NaT def time_dropna(self, dtype): @@ -127,37 +127,47 @@ def time_dropna(self, dtype): class SearchSorted: goal_time = 0.2 - params = ['int8', 'int16', 'int32', 'int64', - 'uint8', 'uint16', 'uint32', 'uint64', - 'float16', 'float32', 'float64', - 'str'] - param_names = ['dtype'] + params = [ + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "str", + ] + param_names = ["dtype"] def setup(self, dtype): - N = 10**5 + N = 10 ** 5 data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) self.s = Series(data) def time_searchsorted(self, dtype): - key = '2' 
if dtype == 'str' else 2 + key = "2" if dtype == "str" else 2 self.s.searchsorted(key) class Map: - params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int']) - param_names = 'mapper' + params = (["dict", "Series", "lambda"], ["object", "category", "int"]) + param_names = "mapper" def setup(self, mapper, dtype): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) # construct mapper - if mapper == 'Series': + if mapper == "Series": self.map_data = map_data - elif mapper == 'dict': + elif mapper == "dict": self.map_data = map_data.to_dict() - elif mapper == 'lambda': + elif mapper == "lambda": map_dict = map_data.to_dict() self.map_data = lambda x: map_dict[x] else: @@ -170,8 +180,8 @@ def time_map(self, mapper, *args, **kwargs): class Clip: - params = [50, 1000, 10**5] - param_names = ['n'] + params = [50, 1000, 10 ** 5] + param_names = ["n"] def setup(self, n): self.s = Series(np.random.randn(n)) @@ -182,8 +192,8 @@ def time_clip(self, n): class ValueCounts: - params = ['int', 'uint', 'float', 'object'] - param_names = ['dtype'] + params = ["int", "uint", "float", "object"] + param_names = ["dtype"] def setup(self, dtype): self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) @@ -193,7 +203,6 @@ def time_value_counts(self, dtype): class Dir: - def setup(self): self.s = Series(index=tm.makeStringIndex(10000)) @@ -204,21 +213,19 @@ def time_dir_strings(self): class SeriesGetattr: # https://github.com/pandas-dev/pandas/issues/19764 def setup(self): - self.s = Series(1, - index=date_range("2012-01-01", freq='s', - periods=int(1e6))) + self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6))) def time_series_datetimeindex_repr(self): - getattr(self.s, 'a', None) + getattr(self.s, "a", None) class All(object): - params = [[10**3, 10**6], ['fast', 'slow']] - param_names = ['N', 'case'] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] + param_names = ["N", "case"] def setup(self, N, case): - val = case != 'fast' + val = case != "fast" self.s = Series([val] * N) def time_all(self, N, case): @@ -227,11 +234,11 @@ def time_all(self, N, case): class Any(object): - params = [[10**3, 10**6], ['fast', 'slow']] - param_names = ['N', 'case'] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] + param_names = ["N", "case"] def setup(self, N, case): - val = case == 'fast' + val = case == "fast" self.s = Series([val] * N) def time_any(self, N, case): @@ -240,11 +247,25 @@ def time_any(self, N, case): class NanOps(object): - params = [['var', 'mean', 'median', 'max', 'min', 'sum', 'std', 'sem', - 'argmax', 'skew', 'kurt', 'prod'], - [10**3, 10**6], - ['int8', 'int32', 'int64', 'float64']] - param_names = ['func', 'N', 'dtype'] + params = [ + [ + "var", + "mean", + "median", + "max", + "min", + "sum", + "std", + "sem", + "argmax", + "skew", + "kurt", + "prod", + ], + [10 ** 3, 10 ** 6], + ["int8", "int32", "int64", "float64"], + ] + param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): self.s = Series([1] * N, dtype=dtype) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 281e81f21ba9c..19d08c086a508 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -14,11 +14,10 @@ def make_array(size, dense_proportion, fill_value, dtype): class SparseSeriesToFrame: - def setup(self): K = 50 N = 50001 - rng = date_range('1/1/2000', periods=N, freq='T') + rng = date_range("1/1/2000", periods=N, freq="T") self.series = {} for i in range(1, K): data = 
np.random.randn(N)[:-i] @@ -32,12 +31,11 @@ def time_series_to_frame(self): class SparseArrayConstructor: - params = ([0.1, 0.01], [0, np.nan], - [np.int64, np.float64, np.object]) - param_names = ['dense_proportion', 'fill_value', 'dtype'] + params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object]) + param_names = ["dense_proportion", "fill_value", "dtype"] def setup(self, dense_proportion, fill_value, dtype): - N = 10**6 + N = 10 ** 6 self.array = make_array(N, dense_proportion, fill_value, dtype) def time_sparse_array(self, dense_proportion, fill_value, dtype): @@ -45,7 +43,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype): class SparseDataFrameConstructor: - def setup(self): N = 1000 self.arr = np.arange(N) @@ -56,18 +53,16 @@ def time_from_scipy(self): class FromCoo: - def setup(self): - self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], - ([1, 0, 0], [0, 2, 3])), - shape=(100, 100)) + self.matrix = scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100) + ) def time_sparse_series_from_coo(self): pd.Series.sparse.from_coo(self.matrix) class ToCoo: - def setup(self): s = Series([np.nan] * 10000) s[0] = 3.0 @@ -77,18 +72,16 @@ def setup(self): self.ss = s.astype("Sparse") def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], - column_levels=[2, 3], - sort_labels=True) + self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) class Arithmetic: params = ([0.1, 0.01], [0, np.nan]) - param_names = ['dense_proportion', 'fill_value'] + param_names = ["dense_proportion", "fill_value"] def setup(self, dense_proportion, fill_value): - N = 10**6 + N = 10 ** 6 arr1 = make_array(N, dense_proportion, fill_value, np.int64) self.array1 = SparseArray(arr1, fill_value=fill_value) arr2 = make_array(N, dense_proportion, fill_value, np.int64) @@ -110,22 +103,24 @@ def time_divide(self, dense_proportion, fill_value): class ArithmeticBlock: params = [np.nan, 0] - param_names = ['fill_value'] + param_names = ["fill_value"] def setup(self, fill_value): - N = 10**6 - self.arr1 = self.make_block_array(length=N, num_blocks=1000, - block_size=10, fill_value=fill_value) - self.arr2 = self.make_block_array(length=N, num_blocks=1000, - block_size=10, fill_value=fill_value) + N = 10 ** 6 + self.arr1 = self.make_block_array( + length=N, num_blocks=1000, block_size=10, fill_value=fill_value + ) + self.arr2 = self.make_block_array( + length=N, num_blocks=1000, block_size=10, fill_value=fill_value + ) def make_block_array(self, length, num_blocks, block_size, fill_value): arr = np.full(length, fill_value) - indicies = np.random.choice(np.arange(0, length, block_size), - num_blocks, - replace=False) + indicies = np.random.choice( + np.arange(0, length, block_size), num_blocks, replace=False + ) for ind in indicies: - arr[ind:ind + block_size] = np.random.randint(0, 100, block_size) + arr[ind : ind + block_size] = np.random.randint(0, 100, block_size) return SparseArray(arr, fill_value=fill_value) def time_make_union(self, fill_value): diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 3514335f92e77..620a6de0f5f34 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -2,14 +2,13 @@ import pandas as pd -ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', - 'var'] +ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] class FrameOps: - params = [ops, ['float', 'int'], [0, 1], [True, 
False]] - param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] + params = [ops, ["float", "int"], [0, 1], [True, False]] + param_names = ["op", "dtype", "axis", "use_bottleneck"] def setup(self, op, dtype, axis, use_bottleneck): df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) @@ -17,6 +16,7 @@ def setup(self, op, dtype, axis, use_bottleneck): pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df_func = getattr(df, op) @@ -27,13 +27,15 @@ def time_op(self, op, dtype, axis, use_bottleneck): class FrameMultiIndexOps: params = ([0, 1, [0, 1]], ops) - param_names = ['level', 'op'] + param_names = ["level", "op"] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - codes = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] + codes = [ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ] index = pd.MultiIndex(levels=levels, codes=codes) df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) @@ -44,8 +46,8 @@ def time_op(self, level, op): class SeriesOps: - params = [ops, ['float', 'int'], [True, False]] - param_names = ['op', 'dtype', 'use_bottleneck'] + params = [ops, ["float", "int"], [True, False]] + param_names = ["op", "dtype", "use_bottleneck"] def setup(self, op, dtype, use_bottleneck): s = pd.Series(np.random.randn(100000)).astype(dtype) @@ -53,6 +55,7 @@ def setup(self, op, dtype, use_bottleneck): pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.s_func = getattr(s, op) @@ -63,13 +66,15 @@ def time_op(self, op, dtype, use_bottleneck): class SeriesMultiIndexOps: params = ([0, 1, [0, 1]], ops) - param_names = ['level', 'op'] + param_names = ["level", "op"] def setup(self, level, op): levels = [np.arange(10), np.arange(100), np.arange(100)] - codes = [np.arange(10).repeat(10000), - np.tile(np.arange(100).repeat(100), 10), - np.tile(np.tile(np.arange(100), 100), 10)] + codes = [ + np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10), + ] index = pd.MultiIndex(levels=levels, codes=codes) s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) @@ -80,11 +85,11 @@ def time_op(self, level, op): class Rank: - params = [['DataFrame', 'Series'], [True, False]] - param_names = ['constructor', 'pct'] + params = [["DataFrame", "Series"], [True, False]] + param_names = ["constructor", "pct"] def setup(self, constructor, pct): - values = np.random.randn(10**5) + values = np.random.randn(10 ** 5) self.data = getattr(pd, constructor)(values) def time_rank(self, constructor, pct): @@ -96,14 +101,15 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [['spearman', 'kendall', 'pearson'], [True, False]] - param_names = ['method', 'use_bottleneck'] + params = [["spearman", "kendall", "pearson"], [True, False]] + param_names = ["method", "use_bottleneck"] def setup(self, method, use_bottleneck): try: pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) @@ -126,13 +132,14 @@ def 
time_corrwith_rows(self, method, use_bottleneck): class Covariance: params = [[True, False]] - param_names = ['use_bottleneck'] + param_names = ["use_bottleneck"] def setup(self, use_bottleneck): try: pd.options.compute.use_bottleneck = use_bottleneck except TypeError: from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.s = pd.Series(np.random.randn(100000)) self.s2 = pd.Series(np.random.randn(100000)) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 5dbcc71b7455e..6be2fa92d9eac 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -6,31 +6,30 @@ class Methods: - def setup(self): - self.s = Series(tm.makeStringIndex(10**5)) + self.s = Series(tm.makeStringIndex(10 ** 5)) def time_center(self): self.s.str.center(100) def time_count(self): - self.s.str.count('A') + self.s.str.count("A") def time_endswith(self): - self.s.str.endswith('A') + self.s.str.endswith("A") def time_extract(self): with warnings.catch_warnings(record=True): - self.s.str.extract('(\\w*)A(\\w*)') + self.s.str.extract("(\\w*)A(\\w*)") def time_findall(self): - self.s.str.findall('[A-Z]+') + self.s.str.findall("[A-Z]+") def time_find(self): - self.s.str.find('[A-Z]+') + self.s.str.find("[A-Z]+") def time_rfind(self): - self.s.str.rfind('[A-Z]+') + self.s.str.rfind("[A-Z]+") def time_get(self): self.s.str.get(0) @@ -39,43 +38,43 @@ def time_len(self): self.s.str.len() def time_join(self): - self.s.str.join(' ') + self.s.str.join(" ") def time_match(self): - self.s.str.match('A') + self.s.str.match("A") def time_normalize(self): - self.s.str.normalize('NFC') + self.s.str.normalize("NFC") def time_pad(self): - self.s.str.pad(100, side='both') + self.s.str.pad(100, side="both") def time_partition(self): - self.s.str.partition('A') + self.s.str.partition("A") def time_rpartition(self): - self.s.str.rpartition('A') + self.s.str.rpartition("A") def time_replace(self): - self.s.str.replace('A', '\x01\x01') + self.s.str.replace("A", "\x01\x01") def time_translate(self): - self.s.str.translate({'A': '\x01\x01'}) + self.s.str.translate({"A": "\x01\x01"}) def time_slice(self): self.s.str.slice(5, 15, 2) def time_startswith(self): - self.s.str.startswith('A') + self.s.str.startswith("A") def time_strip(self): - self.s.str.strip('A') + self.s.str.strip("A") def time_rstrip(self): - self.s.str.rstrip('A') + self.s.str.rstrip("A") def time_lstrip(self): - self.s.str.lstrip('A') + self.s.str.lstrip("A") def time_title(self): self.s.str.title() @@ -95,13 +94,13 @@ def time_zfill(self): class Repeat: - params = ['int', 'array'] - param_names = ['repeats'] + params = ["int", "array"] + param_names = ["repeats"] def setup(self, repeats): - N = 10**5 + N = 10 ** 5 self.s = Series(tm.makeStringIndex(N)) - repeat = {'int': 1, 'array': np.random.randint(1, 3, N)} + repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] def time_repeat(self, repeats): @@ -110,20 +109,20 @@ def time_repeat(self, repeats): class Cat: - params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15]) - param_names = ['other_cols', 'sep', 'na_rep', 'na_frac'] + params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) + param_names = ["other_cols", "sep", "na_rep", "na_frac"] def setup(self, other_cols, sep, na_rep, na_frac): N = 10 ** 5 - mask_gen = lambda: np.random.choice([True, False], N, - p=[1 - na_frac, na_frac]) + mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) self.s = 
Series(tm.makeStringIndex(N)).where(mask_gen()) if other_cols == 0: # str.cat self-concatenates only for others=None self.others = None else: - self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) - for i in range(other_cols)}) + self.others = DataFrame( + {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)} + ) def time_cat(self, other_cols, sep, na_rep, na_frac): # before the concatenation (one caller + other_cols columns), the total @@ -136,52 +135,49 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains: params = [True, False] - param_names = ['regex'] + param_names = ["regex"] def setup(self, regex): - self.s = Series(tm.makeStringIndex(10**5)) + self.s = Series(tm.makeStringIndex(10 ** 5)) def time_contains(self, regex): - self.s.str.contains('A', regex=regex) + self.s.str.contains("A", regex=regex) class Split: params = [True, False] - param_names = ['expand'] + param_names = ["expand"] def setup(self, expand): - self.s = Series(tm.makeStringIndex(10**5)).str.join('--') + self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--") def time_split(self, expand): - self.s.str.split('--', expand=expand) + self.s.str.split("--", expand=expand) def time_rsplit(self, expand): - self.s.str.rsplit('--', expand=expand) + self.s.str.rsplit("--", expand=expand) class Dummies: - def setup(self): - self.s = Series(tm.makeStringIndex(10**5)).str.join('|') + self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|") def time_get_dummies(self): - self.s.str.get_dummies('|') + self.s.str.get_dummies("|") class Encode: - def setup(self): self.ser = Series(tm.makeUnicodeIndex()) def time_encode_decode(self): - self.ser.str.encode('utf-8').str.decode('utf-8') + self.ser.str.encode("utf-8").str.decode("utf-8") class Slice: - def setup(self): - self.s = Series(['abcdefg', np.nan] * 500000) + self.s = Series(["abcdefg", np.nan] * 500000) def time_vector_slice(self): # GH 2602 diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index c4fe462944a2a..36a9db529f98f 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -3,49 +3,60 @@ import numpy as np from pandas import ( - DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta) + DataFrame, + Series, + Timedelta, + Timestamp, + timedelta_range, + to_timedelta, +) class TimedeltaConstructor: - def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit='d') + Timedelta(1, unit="d") def time_from_components(self): - Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5, - microseconds=6, nanoseconds=7) + Timedelta( + days=1, + hours=2, + minutes=3, + seconds=4, + milliseconds=5, + microseconds=6, + nanoseconds=7, + ) def time_from_datetime_timedelta(self): Timedelta(datetime.timedelta(days=1, seconds=1)) def time_from_np_timedelta(self): - Timedelta(np.timedelta64(1, 'ms')) + Timedelta(np.timedelta64(1, "ms")) def time_from_string(self): - Timedelta('1 days') + Timedelta("1 days") def time_from_iso_format(self): - Timedelta('P4DT12H30M5S') + Timedelta("P4DT12H30M5S") def time_from_missing(self): - Timedelta('nat') + Timedelta("nat") class ToTimedelta: - def setup(self): self.ints = np.random.randint(0, 60, size=10000) self.str_days = [] self.str_seconds = [] for i in self.ints: - self.str_days.append('{0} days'.format(i)) - self.str_seconds.append('00:00:{0:02d}'.format(i)) + self.str_days.append("{0} days".format(i)) + self.str_seconds.append("00:00:{0:02d}".format(i)) def 
time_convert_int(self): - to_timedelta(self.ints, unit='s') + to_timedelta(self.ints, unit="s") def time_convert_string_days(self): to_timedelta(self.str_days) @@ -56,30 +67,28 @@ def time_convert_string_seconds(self): class ToTimedeltaErrors: - params = ['coerce', 'ignore'] - param_names = ['errors'] + params = ["coerce", "ignore"] + param_names = ["errors"] def setup(self, errors): ints = np.random.randint(0, 60, size=10000) - self.arr = ['{0} days'.format(i) for i in ints] - self.arr[-1] = 'apple' + self.arr = ["{0} days".format(i) for i in ints] + self.arr[-1] = "apple" def time_convert(self, errors): to_timedelta(self.arr, errors=errors) class TimedeltaOps: - def setup(self): self.td = to_timedelta(np.arange(1000000)) - self.ts = Timestamp('2000') + self.ts = Timestamp("2000") def time_add_td_ts(self): self.td + self.ts class TimedeltaProperties: - def setup_cache(self): td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) return td @@ -98,10 +107,9 @@ def time_timedelta_nanoseconds(self, td): class DatetimeAccessor: - def setup_cache(self): N = 100000 - series = Series(timedelta_range('1 days', periods=N, freq='h')) + series = Series(timedelta_range("1 days", periods=N, freq="h")) return series def time_dt_accessor(self, series): @@ -121,10 +129,9 @@ def time_timedelta_nanoseconds(self, series): class TimedeltaIndexing: - def setup(self): - self.index = timedelta_range(start='1985', periods=1000, freq='D') - self.index2 = timedelta_range(start='1986', periods=1000, freq='D') + self.index = timedelta_range(start="1985", periods=1000, freq="D") + self.index2 = timedelta_range(start="1986", periods=1000, freq="D") self.series = Series(range(1000), index=self.index) self.timedelta = self.index[500] @@ -141,7 +148,7 @@ def time_series_loc(self): self.series.loc[self.timedelta] def time_align(self): - DataFrame({'a': self.series, 'b': self.series[:500]}) + DataFrame({"a": self.series, "b": self.series[:500]}) def time_intersection(self): self.index.intersection(self.index2) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 14ee8747cf81d..a74527df25f9b 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -4,6 +4,7 @@ import numpy as np from pandas import to_datetime, date_range, Series, DataFrame, period_range from pandas.tseries.frequencies import infer_freq + try: from pandas.plotting._matplotlib.converter import DatetimeConverter except ImportError: @@ -12,27 +13,22 @@ class DatetimeIndex: - params = ['dst', 'repeated', 'tz_aware', 'tz_local', 'tz_naive'] - param_names = ['index_type'] + params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"] + param_names = ["index_type"] def setup(self, index_type): N = 100000 - dtidxes = {'dst': date_range(start='10/29/2000 1:00:00', - end='10/29/2000 1:59:59', freq='S'), - 'repeated': date_range(start='2000', - periods=N / 10, - freq='s').repeat(10), - 'tz_aware': date_range(start='2000', - periods=N, - freq='s', - tz='US/Eastern'), - 'tz_local': date_range(start='2000', - periods=N, - freq='s', - tz=dateutil.tz.tzlocal()), - 'tz_naive': date_range(start='2000', - periods=N, - freq='s')} + dtidxes = { + "dst": date_range( + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + ), + "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), + "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), + "tz_local": date_range( + start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal() + ), + "tz_naive": 
date_range(start="2000", periods=N, freq="s"), + } self.index = dtidxes[index_type] def time_add_timedelta(self, index_type): @@ -62,31 +58,31 @@ def time_to_pydatetime(self, index_type): class TzLocalize: - params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] - param_names = 'tz' + params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] + param_names = "tz" def setup(self, tz): - dst_rng = date_range(start='10/29/2000 1:00:00', - end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', - end='10/29/2000 00:59:59', freq='S') + dst_rng = date_range( + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + ) + self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S") self.index = self.index.append(dst_rng) self.index = self.index.append(dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', - end='10/29/2000 3:00:00', - freq='S')) + self.index = self.index.append( + date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S") + ) def time_infer_dst(self, tz): - self.index.tz_localize(tz, ambiguous='infer') + self.index.tz_localize(tz, ambiguous="infer") class ResetIndex: - params = [None, 'US/Eastern'] - param_names = 'tz' + params = [None, "US/Eastern"] + param_names = "tz" def setup(self, tz): - idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz) + idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) def time_reest_datetimeindex(self, tz): @@ -95,12 +91,12 @@ def time_reest_datetimeindex(self, tz): class Factorize: - params = [None, 'Asia/Tokyo'] - param_names = 'tz' + params = [None, "Asia/Tokyo"] + param_names = "tz" def setup(self, tz): N = 100000 - self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz) + self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz) self.dti = self.dti.repeat(5) def time_factorize(self, tz): @@ -109,25 +105,24 @@ def time_factorize(self, tz): class InferFreq: - params = [None, 'D', 'B'] - param_names = ['freq'] + params = [None, "D", "B"] + param_names = ["freq"] def setup(self, freq): if freq is None: - self.idx = date_range(start='1/1/1700', freq='D', periods=10000) + self.idx = date_range(start="1/1/1700", freq="D", periods=10000) self.idx.freq = None else: - self.idx = date_range(start='1/1/1700', freq=freq, periods=10000) + self.idx = date_range(start="1/1/1700", freq=freq, periods=10000) def time_infer_freq(self, freq): infer_freq(self.idx) class TimeDatetimeConverter: - def setup(self): N = 100000 - self.rng = date_range(start='1/1/2000', periods=N, freq='T') + self.rng = date_range(start="1/1/2000", periods=N, freq="T") def time_convert(self): DatetimeConverter.convert(self.rng, None, None) @@ -136,11 +131,11 @@ def time_convert(self): class Iteration: params = [date_range, period_range] - param_names = ['time_index'] + param_names = ["time_index"] def setup(self, time_index): - N = 10**6 - self.idx = time_index(start='20140101', freq='T', periods=N) + N = 10 ** 6 + self.idx = time_index(start="20140101", freq="T", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -155,13 +150,13 @@ def time_iter_preexit(self, time_index): class ResampleDataFrame: - params = ['max', 'mean', 'min'] - param_names = ['method'] + params = ["max", "mean", "min"] + param_names = ["method"] def setup(self, method): - rng = date_range(start='20130101', periods=100000, freq='50L') + rng = date_range(start="20130101", periods=100000, freq="50L") df = 
DataFrame(np.random.randn(100000, 2), index=rng) - self.resample = getattr(df.resample('1s'), method) + self.resample = getattr(df.resample("1s"), method) def time_method(self, method): self.resample() @@ -169,16 +164,14 @@ def time_method(self, method): class ResampleSeries: - params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc']) - param_names = ['index', 'freq', 'method'] + params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"]) + param_names = ["index", "freq", "method"] def setup(self, index, freq, method): - indexes = {'period': period_range(start='1/1/2000', - end='1/1/2001', - freq='T'), - 'datetime': date_range(start='1/1/2000', - end='1/1/2001', - freq='T')} + indexes = { + "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"), + "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"), + } idx = indexes[index] ts = Series(np.random.randn(len(idx)), index=idx) self.resample = getattr(ts.resample(freq), method) @@ -190,32 +183,35 @@ def time_resample(self, index, freq, method): class ResampleDatetetime64: # GH 7754 def setup(self): - rng3 = date_range(start='2000-01-01 00:00:00', - end='2000-01-01 10:00:00', freq='555000U') - self.dt_ts = Series(5, rng3, dtype='datetime64[ns]') + rng3 = date_range( + start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U" + ) + self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") def time_resample(self): - self.dt_ts.resample('1S').last() + self.dt_ts.resample("1S").last() class AsOf: - params = ['DataFrame', 'Series'] - param_names = ['constructor'] + params = ["DataFrame", "Series"] + param_names = ["constructor"] def setup(self, constructor): N = 10000 M = 10 - rng = date_range(start='1/1/1990', periods=N, freq='53s') - data = {'DataFrame': DataFrame(np.random.randn(N, M)), - 'Series': Series(np.random.randn(N))} + rng = date_range(start="1/1/1990", periods=N, freq="53s") + data = { + "DataFrame": DataFrame(np.random.randn(N, M)), + "Series": Series(np.random.randn(N)), + } self.ts = data[constructor] self.ts.index = rng self.ts2 = self.ts.copy() self.ts2.iloc[250:5000] = np.nan self.ts3 = self.ts.copy() self.ts3.iloc[-5000:] = np.nan - self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') + self.dates = date_range(start="1/1/1990", periods=N * 10, freq="5s") self.date = self.dates[0] self.date_last = self.dates[-1] self.date_early = self.date - timedelta(10) @@ -248,11 +244,11 @@ def time_asof_nan_single(self, constructor): class SortIndex: params = [True, False] - param_names = ['monotonic'] + param_names = ["monotonic"] def setup(self, monotonic): - N = 10**5 - idx = date_range(start='1/1/2000', periods=N, freq='s') + N = 10 ** 5 + idx = date_range(start="1/1/2000", periods=N, freq="s") self.s = Series(np.random.randn(N), index=idx) if not monotonic: self.s = self.s.sample(frac=1) @@ -265,10 +261,9 @@ def time_get_slice(self, monotonic): class IrregularOps: - def setup(self): - N = 10**5 - idx = date_range(start='1/1/2000', periods=N, freq='s') + N = 10 ** 5 + idx = date_range(start="1/1/2000", periods=N, freq="s") s = Series(np.random.randn(N), index=idx) self.left = s.sample(frac=1) self.right = s.sample(frac=1) @@ -278,10 +273,9 @@ def time_add(self): class Lookup: - def setup(self): N = 1500000 - rng = date_range(start='1/1/2000', periods=N, freq='S') + rng = date_range(start="1/1/2000", periods=N, freq="S") self.ts = Series(1, index=rng) self.lookup_val = rng[N // 2] @@ -291,36 +285,35 @@ def time_lookup_and_cleanup(self): class ToDatetimeYYYYMMDD: - def 
setup(self): - rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.stringsD = Series(rng.strftime('%Y%m%d')) + rng = date_range(start="1/1/2000", periods=10000, freq="D") + self.stringsD = Series(rng.strftime("%Y%m%d")) def time_format_YYYYMMDD(self): - to_datetime(self.stringsD, format='%Y%m%d') + to_datetime(self.stringsD, format="%Y%m%d") class ToDatetimeCacheSmallCount(object): params = ([True, False], [50, 500, 5000, 100000]) - param_names = ['cache', 'count'] + param_names = ["cache", "count"] def setup(self, cache, count): - rng = date_range(start='1/1/1971', periods=count) - self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist() + rng = date_range(start="1/1/1971", periods=count) + self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist() def time_unique_date_strings(self, cache, count): to_datetime(self.unique_date_strings, cache=cache) class ToDatetimeISO8601: - def setup(self): - rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist() - self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist() - self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' - for x in rng] + rng = date_range(start="1/1/2000", periods=20000, freq="H") + self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() + self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() + self.strings_tz_space = [ + x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng + ] def time_iso8601(self): to_datetime(self.strings) @@ -329,22 +322,21 @@ def time_iso8601_nosep(self): to_datetime(self.strings_nosep) def time_iso8601_format(self): - to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S") def time_iso8601_format_no_sep(self): - to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') + to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S") def time_iso8601_tz_spaceformat(self): to_datetime(self.strings_tz_space) class ToDatetimeNONISO8601: - def setup(self): N = 10000 half = int(N / 2) - ts_string_1 = 'March 1, 2018 12:00:00+0400' - ts_string_2 = 'March 1, 2018 12:00:00+0500' + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" self.same_offset = [ts_string_1] * N self.diff_offset = [ts_string_1] * half + [ts_string_2] * half @@ -356,50 +348,48 @@ def time_different_offset(self): class ToDatetimeFormatQuarters: - def setup(self): - self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000) + self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000) def time_infer_quarter(self): to_datetime(self.s) class ToDatetimeFormat: - def setup(self): - self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) - self.s2 = self.s.str.replace(':\\S+$', '') + self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000) + self.s2 = self.s.str.replace(":\\S+$", "") def time_exact(self): - to_datetime(self.s2, format='%d%b%y') + to_datetime(self.s2, format="%d%b%y") def time_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) + to_datetime(self.s, format="%d%b%y", exact=False) class ToDatetimeCache: params = [True, False] - param_names = ['cache'] + param_names = ["cache"] def setup(self, cache): N = 10000 self.unique_numeric_seconds = list(range(N)) self.dup_numeric_seconds = [1000] * N - self.dup_string_dates = ['2000-02-11'] * N - self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N + self.dup_string_dates = ["2000-02-11"] * N + self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N def 
time_unique_seconds_and_unit(self, cache): - to_datetime(self.unique_numeric_seconds, unit='s', cache=cache) + to_datetime(self.unique_numeric_seconds, unit="s", cache=cache) def time_dup_seconds_and_unit(self, cache): - to_datetime(self.dup_numeric_seconds, unit='s', cache=cache) + to_datetime(self.dup_numeric_seconds, unit="s", cache=cache) def time_dup_string_dates(self, cache): to_datetime(self.dup_string_dates, cache=cache) def time_dup_string_dates_and_format(self, cache): - to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache) + to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache) def time_dup_string_tzoffset_dates(self, cache): to_datetime(self.dup_string_with_tz, cache=cache) @@ -407,14 +397,12 @@ def time_dup_string_tzoffset_dates(self, cache): class DatetimeAccessor: - params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()] - param_names = 'tz' + params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] + param_names = "tz" def setup(self, tz): N = 100000 - self.series = Series( - date_range(start='1/1/2000', periods=N, freq='T', tz=tz) - ) + self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz)) def time_dt_accessor(self, tz): self.series.dt diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index c6e56804c7b21..8ebb2d8d2f35d 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -7,21 +7,20 @@ class TimestampConstruction: - def time_parse_iso8601_no_tz(self): - Timestamp('2017-08-25 08:16:14') + Timestamp("2017-08-25 08:16:14") def time_parse_iso8601_tz(self): - Timestamp('2017-08-25 08:16:14-0500') + Timestamp("2017-08-25 08:16:14-0500") def time_parse_dateutil(self): - Timestamp('2017/08/25 08:16:14 AM') + Timestamp("2017/08/25 08:16:14 AM") def time_parse_today(self): - Timestamp('today') + Timestamp("today") def time_parse_now(self): - Timestamp('now') + Timestamp("now") def time_fromordinal(self): Timestamp.fromordinal(730120) @@ -31,14 +30,13 @@ def time_fromtimestamp(self): class TimestampProperties: - _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC, - dateutil.tz.tzutc()] - _freqs = [None, 'B'] + _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()] + _freqs = [None, "B"] params = [_tzs, _freqs] - param_names = ['tz', 'freq'] + param_names = ["tz", "freq"] def setup(self, tz, freq): - self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) + self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq) def time_tz(self, tz, freq): self.ts.tz @@ -93,15 +91,14 @@ def time_month_name(self, tz, freq): class TimestampOps: - params = [None, 'US/Eastern', pytz.UTC, - dateutil.tz.tzutc()] - param_names = ['tz'] + params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()] + param_names = ["tz"] def setup(self, tz): - self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) + self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) + self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -124,16 +121,16 @@ def time_to_julian_date(self, tz): self.ts.to_julian_date() def time_floor(self, tz): - self.ts.floor('5T') + self.ts.floor("5T") def time_ceil(self, tz): - self.ts.ceil('5T') + self.ts.ceil("5T") class TimestampAcrossDst: def setup(self): dt = datetime.datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + self.tzinfo = 
pytz.timezone("CET").localize(dt, is_dst=False).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 859481c5d188d..a44281044e11d 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -11,45 +11,42 @@ def parse_results(filename): root = tree.getroot() skipped = [] - current_class = '' + current_class = "" i = 1 assert i - 1 == len(skipped) - for el in root.findall('testcase'): - cn = el.attrib['classname'] - for sk in el.findall('skipped'): + for el in root.findall("testcase"): + cn = el.attrib["classname"] + for sk in el.findall("skipped"): old_class = current_class current_class = cn - name = '{classname}.{name}'.format(classname=current_class, - name=el.attrib['name']) - msg = sk.attrib['message'] - out = '' + name = "{classname}.{name}".format( + classname=current_class, name=el.attrib["name"] + ) + msg = sk.attrib["message"] + out = "" if old_class != current_class: ndigits = int(math.log(i, 10) + 1) # 4 for : + space + # + space - out += ('-' * (len(name + msg) + 4 + ndigits) + '\n') - out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg) + out += "-" * (len(name + msg) + 4 + ndigits) + "\n" + out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg) skipped.append(out) i += 1 assert i - 1 == len(skipped) assert i - 1 == len(skipped) # assert len(skipped) == int(root.attrib['skip']) - return '\n'.join(skipped) + return "\n".join(skipped) def main(): - test_files = [ - 'test-data-single.xml', - 'test-data-multiple.xml', - 'test-data.xml', - ] + test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"] - print('SKIPPED TESTS:') + print("SKIPPED TESTS:") for fn in test_files: if os.path.isfile(fn): print(parse_results(fn)) return 0 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index c3647f0c7d2a8..5a07b094e6ad3 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -4,7 +4,7 @@ from matplotlib import rcParams import numpy as np -rcParams['mathtext.fontset'] = 'cm' +rcParams["mathtext.fontset"] = "cm" def fnx(): @@ -37,8 +37,12 @@ def fnx(): plt.figtext(0.05, 0.5, "pandas", size=40) plt.figtext( - 0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$", - size=16, color="#5a89a4") - -fig.savefig('pandas_logo.svg') -fig.savefig('pandas_logo.png') + 0.05, + 0.2, + r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$", + size=16, + color="#5a89a4", +) + +fig.savefig("pandas_logo.svg") +fig.savefig("pandas_logo.png") diff --git a/doc/make.py b/doc/make.py index 496b3cfd4ee45..48febef20fbe6 100755 --- a/doc/make.py +++ b/doc/make.py @@ -24,9 +24,9 @@ DOC_PATH = os.path.dirname(os.path.abspath(__file__)) -SOURCE_PATH = os.path.join(DOC_PATH, 'source') -BUILD_PATH = os.path.join(DOC_PATH, 'build') -REDIRECTS_FILE = os.path.join(DOC_PATH, 'redirects.csv') +SOURCE_PATH = os.path.join(DOC_PATH, "source") +BUILD_PATH = os.path.join(DOC_PATH, "build") +REDIRECTS_FILE = os.path.join(DOC_PATH, "redirects.csv") class DocBuilder: @@ -36,8 +36,15 @@ class DocBuilder: All public methods of this class can be called as parameters of the script. 
""" - def __init__(self, num_jobs=0, include_api=True, single_doc=None, - verbosity=0, warnings_are_errors=False): + + def __init__( + self, + num_jobs=0, + include_api=True, + single_doc=None, + verbosity=0, + warnings_are_errors=False, + ): self.num_jobs = num_jobs self.verbosity = verbosity self.warnings_are_errors = warnings_are_errors @@ -45,16 +52,15 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None, if single_doc: single_doc = self._process_single_doc(single_doc) include_api = False - os.environ['SPHINX_PATTERN'] = single_doc + os.environ["SPHINX_PATTERN"] = single_doc elif not include_api: - os.environ['SPHINX_PATTERN'] = '-api' + os.environ["SPHINX_PATTERN"] = "-api" self.single_doc_html = None - if single_doc and single_doc.endswith('.rst'): - self.single_doc_html = os.path.splitext(single_doc)[0] + '.html' + if single_doc and single_doc.endswith(".rst"): + self.single_doc_html = os.path.splitext(single_doc)[0] + ".html" elif single_doc: - self.single_doc_html = 'reference/api/pandas.{}.html'.format( - single_doc) + self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc) def _process_single_doc(self, single_doc): """ @@ -66,26 +72,30 @@ def _process_single_doc(self, single_doc): (e.g. reference/api/pandas.DataFrame.head.rst). """ base_name, extension = os.path.splitext(single_doc) - if extension in ('.rst', '.ipynb'): + if extension in (".rst", ".ipynb"): if os.path.exists(os.path.join(SOURCE_PATH, single_doc)): return single_doc else: - raise FileNotFoundError('File {} not found'.format(single_doc)) + raise FileNotFoundError("File {} not found".format(single_doc)) - elif single_doc.startswith('pandas.'): + elif single_doc.startswith("pandas."): try: obj = pandas # noqa: F821 - for name in single_doc.split('.'): + for name in single_doc.split("."): obj = getattr(obj, name) except AttributeError: - raise ImportError('Could not import {}'.format(single_doc)) + raise ImportError("Could not import {}".format(single_doc)) else: - return single_doc[len('pandas.'):] + return single_doc[len("pandas.") :] else: - raise ValueError(('--single={} not understood. Value should be a ' - 'valid path to a .rst or .ipynb file, or a ' - 'valid pandas object (e.g. categorical.rst or ' - 'pandas.DataFrame.head)').format(single_doc)) + raise ValueError( + ( + "--single={} not understood. Value should be a " + "valid path to a .rst or .ipynb file, or a " + "valid pandas object (e.g. 
categorical.rst or " + "pandas.DataFrame.head)" + ).format(single_doc) + ) @staticmethod def _run_os(*args): @@ -117,52 +127,55 @@ def _sphinx_build(self, kind): -------- >>> DocBuilder(num_jobs=4)._sphinx_build('html') """ - if kind not in ('html', 'latex'): - raise ValueError('kind must be html or latex, ' - 'not {}'.format(kind)) + if kind not in ("html", "latex"): + raise ValueError("kind must be html or latex, " "not {}".format(kind)) - cmd = ['sphinx-build', '-b', kind] + cmd = ["sphinx-build", "-b", kind] if self.num_jobs: - cmd += ['-j', str(self.num_jobs)] + cmd += ["-j", str(self.num_jobs)] if self.warnings_are_errors: - cmd += ['-W', '--keep-going'] + cmd += ["-W", "--keep-going"] if self.verbosity: - cmd.append('-{}'.format('v' * self.verbosity)) - cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'), - SOURCE_PATH, os.path.join(BUILD_PATH, kind)] + cmd.append("-{}".format("v" * self.verbosity)) + cmd += [ + "-d", + os.path.join(BUILD_PATH, "doctrees"), + SOURCE_PATH, + os.path.join(BUILD_PATH, kind), + ] return subprocess.call(cmd) def _open_browser(self, single_doc_html): """ Open a browser tab showing single """ - url = os.path.join('file://', DOC_PATH, 'build', 'html', - single_doc_html) + url = os.path.join("file://", DOC_PATH, "build", "html", single_doc_html) webbrowser.open(url, new=2) def _get_page_title(self, page): """ Open the rst file `page` and extract its title. """ - fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page)) + fname = os.path.join(SOURCE_PATH, "{}.rst".format(page)) option_parser = docutils.frontend.OptionParser( - components=(docutils.parsers.rst.Parser,)) - doc = docutils.utils.new_document( - '', - option_parser.get_default_values()) + components=(docutils.parsers.rst.Parser,) + ) + doc = docutils.utils.new_document("", option_parser.get_default_values()) with open(fname) as f: data = f.read() parser = docutils.parsers.rst.Parser() # do not generate any warning when parsing the rst - with open(os.devnull, 'a') as f: + with open(os.devnull, "a") as f: doc.reporter.stream = f parser.parse(data, doc) - section = next(node for node in doc.children - if isinstance(node, docutils.nodes.section)) - title = next(node for node in section.children - if isinstance(node, docutils.nodes.title)) + section = next( + node for node in doc.children if isinstance(node, docutils.nodes.section) + ) + title = next( + node for node in section.children if isinstance(node, docutils.nodes.title) + ) return title.astext() @@ -171,7 +184,7 @@ def _add_redirects(self): Create in the build directory an html file with a redirect, for every row in REDIRECTS_FILE. """ - html = ''' + html = """ @@ -182,16 +195,14 @@ def _add_redirects(self):

- ''' + """ with open(REDIRECTS_FILE) as mapping_fd: reader = csv.reader(mapping_fd) for row in reader: - if not row or row[0].strip().startswith('#'): + if not row or row[0].strip().startswith("#"): continue - path = os.path.join(BUILD_PATH, - 'html', - *row[0].split('/')) + '.html' + path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html" try: title = self._get_page_title(row[1]) @@ -199,24 +210,26 @@ def _add_redirects(self): # the file can be an ipynb and not an rst, or docutils # may not be able to read the rst because it has some # sphinx specific stuff - title = 'this page' + title = "this page" if os.path.exists(path): - raise RuntimeError(( - 'Redirection would overwrite an existing file: ' - '{}').format(path)) + raise RuntimeError( + ("Redirection would overwrite an existing file: " "{}").format( + path + ) + ) - with open(path, 'w') as moved_page_fd: + with open(path, "w") as moved_page_fd: moved_page_fd.write( - html.format(url='{}.html'.format(row[1]), - title=title)) + html.format(url="{}.html".format(row[1]), title=title) + ) def html(self): """ Build HTML documentation. """ - ret_code = self._sphinx_build('html') - zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + ret_code = self._sphinx_build("html") + zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): os.remove(zip_fname) @@ -231,20 +244,20 @@ def latex(self, force=False): """ Build PDF documentation. """ - if sys.platform == 'win32': - sys.stderr.write('latex build has not been tested on windows\n') + if sys.platform == "win32": + sys.stderr.write("latex build has not been tested on windows\n") else: - ret_code = self._sphinx_build('latex') - os.chdir(os.path.join(BUILD_PATH, 'latex')) + ret_code = self._sphinx_build("latex") + os.chdir(os.path.join(BUILD_PATH, "latex")) if force: for i in range(3): - self._run_os('pdflatex', - '-interaction=nonstopmode', - 'pandas.tex') - raise SystemExit('You should check the file ' - '"build/latex/pandas.pdf" for problems.') + self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex") + raise SystemExit( + "You should check the file " + '"build/latex/pandas.pdf" for problems.' + ) else: - self._run_os('make') + self._run_os("make") return ret_code def latex_forced(self): @@ -259,84 +272,101 @@ def clean(): Clean documentation generated files. """ shutil.rmtree(BUILD_PATH, ignore_errors=True) - shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'), - ignore_errors=True) + shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True) def zip_html(self): """ Compress HTML documentation into a zip file. 
""" - zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip') + zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip") if os.path.exists(zip_fname): os.remove(zip_fname) - dirname = os.path.join(BUILD_PATH, 'html') + dirname = os.path.join(BUILD_PATH, "html") fnames = os.listdir(dirname) os.chdir(dirname) - self._run_os('zip', - zip_fname, - '-r', - '-q', - *fnames) + self._run_os("zip", zip_fname, "-r", "-q", *fnames) def main(): - cmds = [method for method in dir(DocBuilder) if not method.startswith('_')] + cmds = [method for method in dir(DocBuilder) if not method.startswith("_")] argparser = argparse.ArgumentParser( - description='pandas documentation builder', - epilog='Commands: {}'.format(','.join(cmds))) - argparser.add_argument('command', - nargs='?', - default='html', - help='command to run: {}'.format(', '.join(cmds))) - argparser.add_argument('--num-jobs', - type=int, - default=0, - help='number of jobs used by sphinx-build') - argparser.add_argument('--no-api', - default=False, - help='omit api and autosummary', - action='store_true') - argparser.add_argument('--single', - metavar='FILENAME', - type=str, - default=None, - help=('filename (relative to the "source" folder)' - ' of section or method name to compile, e.g. ' - '"development/contributing.rst",' - ' "ecosystem.rst", "pandas.DataFrame.join"')) - argparser.add_argument('--python-path', - type=str, - default=os.path.dirname(DOC_PATH), - help='path') - argparser.add_argument('-v', action='count', dest='verbosity', default=0, - help=('increase verbosity (can be repeated), ' - 'passed to the sphinx build command')) - argparser.add_argument('--warnings-are-errors', '-W', - action='store_true', - help='fail if warnings are raised') + description="pandas documentation builder", + epilog="Commands: {}".format(",".join(cmds)), + ) + argparser.add_argument( + "command", + nargs="?", + default="html", + help="command to run: {}".format(", ".join(cmds)), + ) + argparser.add_argument( + "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" + ) + argparser.add_argument( + "--no-api", default=False, help="omit api and autosummary", action="store_true" + ) + argparser.add_argument( + "--single", + metavar="FILENAME", + type=str, + default=None, + help=( + 'filename (relative to the "source" folder)' + " of section or method name to compile, e.g. " + '"development/contributing.rst",' + ' "ecosystem.rst", "pandas.DataFrame.join"' + ), + ) + argparser.add_argument( + "--python-path", type=str, default=os.path.dirname(DOC_PATH), help="path" + ) + argparser.add_argument( + "-v", + action="count", + dest="verbosity", + default=0, + help=( + "increase verbosity (can be repeated), " + "passed to the sphinx build command" + ), + ) + argparser.add_argument( + "--warnings-are-errors", + "-W", + action="store_true", + help="fail if warnings are raised", + ) args = argparser.parse_args() if args.command not in cmds: - raise ValueError('Unknown command {}. Available options: {}'.format( - args.command, ', '.join(cmds))) + raise ValueError( + "Unknown command {}. Available options: {}".format( + args.command, ", ".join(cmds) + ) + ) # Below we update both os.environ and sys.path. The former is used by # external libraries (namely Sphinx) to compile this module and resolve # the import of `python_path` correctly. 
The latter is used to resolve # the import within the module, injecting it into the global namespace - os.environ['PYTHONPATH'] = args.python_path + os.environ["PYTHONPATH"] = args.python_path sys.path.insert(0, args.python_path) - globals()['pandas'] = importlib.import_module('pandas') + globals()["pandas"] = importlib.import_module("pandas") # Set the matplotlib backend to the non-interactive Agg backend for all # child processes. - os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg' - - builder = DocBuilder(args.num_jobs, not args.no_api, args.single, - args.verbosity, args.warnings_are_errors) + os.environ["MPLBACKEND"] = "module://matplotlib.backends.backend_agg" + + builder = DocBuilder( + args.num_jobs, + not args.no_api, + args.single, + args.verbosity, + args.warnings_are_errors, + ) return getattr(builder, args.command)() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/doc/source/conf.py b/doc/source/conf.py index 2484a9d592e09..3ebc5d8b6333b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -34,15 +34,13 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # sys.path.append(os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../sphinxext')) -sys.path.extend([ - - # numpy standard doc extensions - os.path.join(os.path.dirname(__file__), - '..', '../..', - 'sphinxext') - -]) +sys.path.insert(0, os.path.abspath("../sphinxext")) +sys.path.extend( + [ + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext") + ] +) # -- General configuration ----------------------------------------------- @@ -50,65 +48,66 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. # sphinxext. -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.extlinks', - 'sphinx.ext.todo', - 'numpydoc', # handle NumPy documentation formatted docstrings - 'IPython.sphinxext.ipython_directive', - 'IPython.sphinxext.ipython_console_highlighting', - 'matplotlib.sphinxext.plot_directive', - 'sphinx.ext.intersphinx', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.linkcode', - 'nbsphinx', - 'contributors', # custom pandas extension - ] - -exclude_patterns = ['**.ipynb_checkpoints'] +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.extlinks", + "sphinx.ext.todo", + "numpydoc", # handle NumPy documentation formatted docstrings + "IPython.sphinxext.ipython_directive", + "IPython.sphinxext.ipython_console_highlighting", + "matplotlib.sphinxext.plot_directive", + "sphinx.ext.intersphinx", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.linkcode", + "nbsphinx", + "contributors", # custom pandas extension +] + +exclude_patterns = ["**.ipynb_checkpoints"] try: import nbconvert except ImportError: - logger.warn('nbconvert not installed. Skipping notebooks.') - exclude_patterns.append('**/*.ipynb') + logger.warn("nbconvert not installed. Skipping notebooks.") + exclude_patterns.append("**/*.ipynb") else: try: nbconvert.utils.pandoc.get_pandoc_version() except nbconvert.utils.pandoc.PandocMissing: - logger.warn('Pandoc not installed. Skipping notebooks.') - exclude_patterns.append('**/*.ipynb') + logger.warn("Pandoc not installed. 
Skipping notebooks.") + exclude_patterns.append("**/*.ipynb") # sphinx_pattern can be '-api' to exclude the API pages, # the path to a file, or a Python object # (e.g. '10min.rst' or 'pandas.DataFrame.head') source_path = os.path.dirname(os.path.abspath(__file__)) -pattern = os.environ.get('SPHINX_PATTERN') +pattern = os.environ.get("SPHINX_PATTERN") if pattern: for dirname, dirs, fnames in os.walk(source_path): for fname in fnames: - if os.path.splitext(fname)[-1] in ('.rst', '.ipynb'): - fname = os.path.relpath(os.path.join(dirname, fname), - source_path) + if os.path.splitext(fname)[-1] in (".rst", ".ipynb"): + fname = os.path.relpath(os.path.join(dirname, fname), source_path) - if (fname == 'index.rst' - and os.path.abspath(dirname) == source_path): + if fname == "index.rst" and os.path.abspath(dirname) == source_path: continue - elif pattern == '-api' and dirname == 'reference': + elif pattern == "-api" and dirname == "reference": exclude_patterns.append(fname) - elif pattern != '-api' and fname != pattern: + elif pattern != "-api" and fname != pattern: exclude_patterns.append(fname) -with open(os.path.join(source_path, 'index.rst.template')) as f: +with open(os.path.join(source_path, "index.rst.template")) as f: t = jinja2.Template(f.read()) -with open(os.path.join(source_path, 'index.rst'), 'w') as f: - f.write(t.render(include_api=pattern is None, - single_doc=(pattern - if pattern is not None and pattern != '-api' - else None))) -autosummary_generate = True if pattern is None else ['index'] +with open(os.path.join(source_path, "index.rst"), "w") as f: + f.write( + t.render( + include_api=pattern is None, + single_doc=(pattern if pattern is not None and pattern != "-api" else None), + ) + ) +autosummary_generate = True if pattern is None else ["index"] # numpydoc numpydoc_attributes_as_param_list = False @@ -122,22 +121,20 @@ import pandas as pd""" # Add any paths that contain templates here, relative to this directory. -templates_path = ['../_templates'] +templates_path = ["../_templates"] # The suffix of source filenames. -source_suffix = [ - '.rst', -] +source_suffix = [".rst"] # The encoding of source files. -source_encoding = 'utf-8' +source_encoding = "utf-8" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'pandas' -copyright = '2008-2014, the pandas development team' +project = "pandas" +copyright = "2008-2014, the pandas development team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -184,7 +181,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -194,7 +191,7 @@ # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'nature_with_gtoc' +html_theme = "nature_with_gtoc" # The style sheet to use for HTML and HTML Help pages. A file of that name # must exist either in Sphinx' static/ path, or in one of the custom paths @@ -207,7 +204,7 @@ # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ['themes'] +html_theme_path = ["themes"] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". 
@@ -223,12 +220,12 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = os.path.join(html_static_path[0], 'favicon.ico') +html_favicon = os.path.join(html_static_path[0], "favicon.ico") # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. @@ -250,60 +247,62 @@ # https://github.com/pandas-dev/pandas/issues/16186 moved_api_pages = [ - ('pandas.core.common.isnull', 'pandas.isna'), - ('pandas.core.common.notnull', 'pandas.notna'), - ('pandas.core.reshape.get_dummies', 'pandas.get_dummies'), - ('pandas.tools.merge.concat', 'pandas.concat'), - ('pandas.tools.merge.merge', 'pandas.merge'), - ('pandas.tools.pivot.pivot_table', 'pandas.pivot_table'), - ('pandas.tseries.tools.to_datetime', 'pandas.to_datetime'), - ('pandas.io.clipboard.read_clipboard', 'pandas.read_clipboard'), - ('pandas.io.excel.ExcelFile.parse', 'pandas.ExcelFile.parse'), - ('pandas.io.excel.read_excel', 'pandas.read_excel'), - ('pandas.io.gbq.read_gbq', 'pandas.read_gbq'), - ('pandas.io.html.read_html', 'pandas.read_html'), - ('pandas.io.json.read_json', 'pandas.read_json'), - ('pandas.io.parsers.read_csv', 'pandas.read_csv'), - ('pandas.io.parsers.read_fwf', 'pandas.read_fwf'), - ('pandas.io.parsers.read_table', 'pandas.read_table'), - ('pandas.io.pickle.read_pickle', 'pandas.read_pickle'), - ('pandas.io.pytables.HDFStore.append', 'pandas.HDFStore.append'), - ('pandas.io.pytables.HDFStore.get', 'pandas.HDFStore.get'), - ('pandas.io.pytables.HDFStore.put', 'pandas.HDFStore.put'), - ('pandas.io.pytables.HDFStore.select', 'pandas.HDFStore.select'), - ('pandas.io.pytables.read_hdf', 'pandas.read_hdf'), - ('pandas.io.sql.read_sql', 'pandas.read_sql'), - ('pandas.io.sql.read_frame', 'pandas.read_frame'), - ('pandas.io.sql.write_frame', 'pandas.write_frame'), - ('pandas.io.stata.read_stata', 'pandas.read_stata'), + ("pandas.core.common.isnull", "pandas.isna"), + ("pandas.core.common.notnull", "pandas.notna"), + ("pandas.core.reshape.get_dummies", "pandas.get_dummies"), + ("pandas.tools.merge.concat", "pandas.concat"), + ("pandas.tools.merge.merge", "pandas.merge"), + ("pandas.tools.pivot.pivot_table", "pandas.pivot_table"), + ("pandas.tseries.tools.to_datetime", "pandas.to_datetime"), + ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"), + ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"), + ("pandas.io.excel.read_excel", "pandas.read_excel"), + ("pandas.io.gbq.read_gbq", "pandas.read_gbq"), + ("pandas.io.html.read_html", "pandas.read_html"), + ("pandas.io.json.read_json", "pandas.read_json"), + ("pandas.io.parsers.read_csv", "pandas.read_csv"), + ("pandas.io.parsers.read_fwf", "pandas.read_fwf"), + ("pandas.io.parsers.read_table", "pandas.read_table"), + ("pandas.io.pickle.read_pickle", "pandas.read_pickle"), + ("pandas.io.pytables.HDFStore.append", "pandas.HDFStore.append"), + ("pandas.io.pytables.HDFStore.get", "pandas.HDFStore.get"), + ("pandas.io.pytables.HDFStore.put", "pandas.HDFStore.put"), + ("pandas.io.pytables.HDFStore.select", "pandas.HDFStore.select"), + ("pandas.io.pytables.read_hdf", "pandas.read_hdf"), + 
("pandas.io.sql.read_sql", "pandas.read_sql"), + ("pandas.io.sql.read_frame", "pandas.read_frame"), + ("pandas.io.sql.write_frame", "pandas.write_frame"), + ("pandas.io.stata.read_stata", "pandas.read_stata"), ] # Again, tuples of (from_old, to_new) moved_classes = [ - ('pandas.tseries.resample.Resampler', 'pandas.core.resample.Resampler'), - ('pandas.formats.style.Styler', 'pandas.io.formats.style.Styler'), + ("pandas.tseries.resample.Resampler", "pandas.core.resample.Resampler"), + ("pandas.formats.style.Styler", "pandas.io.formats.style.Styler"), ] for old, new in moved_classes: # the class itself... moved_api_pages.append((old, new)) - mod, classname = new.rsplit('.', 1) + mod, classname = new.rsplit(".", 1) klass = getattr(importlib.import_module(mod), classname) - methods = [x for x in dir(klass) - if not x.startswith('_') or x in ('__iter__', '__array__')] + methods = [ + x for x in dir(klass) if not x.startswith("_") or x in ("__iter__", "__array__") + ] for method in methods: # ... and each of its public methods moved_api_pages.append( - ("{old}.{method}".format(old=old, method=method), - "{new}.{method}".format(new=new, method=method)) + ( + "{old}.{method}".format(old=old, method=method), + "{new}.{method}".format(new=new, method=method), + ) ) if pattern is None: html_additional_pages = { - 'generated/' + page[0]: 'api_redirect.html' - for page in moved_api_pages + "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages } @@ -323,12 +322,14 @@ import os os.chdir(r'{}') -""".format(os.path.dirname(os.path.dirname(__file__))) +""".format( + os.path.dirname(os.path.dirname(__file__)) +) html_context = { - 'redirects': {old: new for old, new in moved_api_pages}, - 'header': header + "redirects": {old: new for old, new in moved_api_pages}, + "header": header, } # If false, no module index is generated. @@ -352,7 +353,7 @@ # html_file_suffix = '' # Output file base name for HTML help builder. -htmlhelp_basename = 'pandas' +htmlhelp_basename = "pandas" # -- Options for nbsphinx ------------------------------------------------ @@ -371,9 +372,13 @@ # Grouping the document tree into LaTeX files. List of tuples (source start # file, target name, title, author, documentclass [howto/manual]). 
latex_documents = [ - ('index', 'pandas.tex', - 'pandas: powerful Python data analysis toolkit', - r'Wes McKinney\n\& PyData Development Team', 'manual'), + ( + "index", + "pandas.tex", + "pandas: powerful Python data analysis toolkit", + r"Wes McKinney\n\& PyData Development Team", + "manual", + ) ] # The name of an image file (relative to this directory) to place at the top of @@ -396,32 +401,32 @@ if pattern is None: intersphinx_mapping = { - 'dateutil': ("https://dateutil.readthedocs.io/en/latest/", None), - 'matplotlib': ('https://matplotlib.org/', None), - 'numpy': ('https://docs.scipy.org/doc/numpy/', None), - 'pandas-gbq': ('https://pandas-gbq.readthedocs.io/en/latest/', None), - 'py': ('https://pylib.readthedocs.io/en/latest/', None), - 'python': ('https://docs.python.org/3/', None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), - 'statsmodels': ('http://www.statsmodels.org/devel/', None), + "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None), + "matplotlib": ("https://matplotlib.org/", None), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None), + "py": ("https://pylib.readthedocs.io/en/latest/", None), + "python": ("https://docs.python.org/3/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "statsmodels": ("http://www.statsmodels.org/devel/", None), } # extlinks alias -extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s', - 'GH'), - 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s', - 'wiki ')} +extlinks = { + "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"), + "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "), +} ipython_warning_is_error = False ipython_exec_lines = [ - 'import numpy as np', - 'import pandas as pd', + "import numpy as np", + "import pandas as pd", # This ensures correct rendering on system with console encoding != utf8 # (windows). It forces pandas to encode its output reprs using utf8 # wherever the docs are built. The docs' target is the browser, not # the console, so this is fine. - 'pd.options.display.encoding="utf8"' + 'pd.options.display.encoding="utf8"', ] @@ -430,8 +435,7 @@ import sphinx from sphinx.util import rpartition -from sphinx.ext.autodoc import ( - Documenter, MethodDocumenter, AttributeDocumenter) +from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter from sphinx.ext.autosummary import Autosummary @@ -439,8 +443,9 @@ class AccessorDocumenter(MethodDocumenter): """ Specialized Documenter subclass for accessors. """ - objtype = 'accessor' - directivetype = 'method' + + objtype = "accessor" + directivetype = "method" # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 @@ -448,7 +453,7 @@ class AccessorDocumenter(MethodDocumenter): def format_signature(self): # this method gives an error/warning for the accessors, therefore # overriding it (accessor has no arguments) - return '' + return "" class AccessorLevelDocumenter(Documenter): @@ -456,6 +461,7 @@ class AccessorLevelDocumenter(Documenter): Specialized Documenter subclass for objects on accessor level (methods, attributes). 
""" + # This is the simple straightforward version # modname is None, base the last elements (eg 'hour') # and path the part before (eg 'Series.dt') @@ -468,41 +474,40 @@ class AccessorLevelDocumenter(Documenter): def resolve_name(self, modname, parents, path, base): if modname is None: if path: - mod_cls = path.rstrip('.') + mod_cls = path.rstrip(".") else: mod_cls = None # if documenting a class-level object without path, # there must be a current class, either from a parent # auto directive ... - mod_cls = self.env.temp_data.get('autodoc:class') + mod_cls = self.env.temp_data.get("autodoc:class") # ... or from a class directive if mod_cls is None: - mod_cls = self.env.temp_data.get('py:class') + mod_cls = self.env.temp_data.get("py:class") # ... if still None, there's no way to know if mod_cls is None: return None, [] # HACK: this is added in comparison to ClassLevelDocumenter # mod_cls still exists of class.accessor, so an extra # rpartition is needed - modname, accessor = rpartition(mod_cls, '.') - modname, cls = rpartition(modname, '.') + modname, accessor = rpartition(mod_cls, ".") + modname, cls = rpartition(modname, ".") parents = [cls, accessor] # if the module name is still missing, get it like above if not modname: - modname = self.env.temp_data.get('autodoc:module') + modname = self.env.temp_data.get("autodoc:module") if not modname: - if sphinx.__version__ > '1.3': - modname = self.env.ref_context.get('py:module') + if sphinx.__version__ > "1.3": + modname = self.env.ref_context.get("py:module") else: - modname = self.env.temp_data.get('py:module') + modname = self.env.temp_data.get("py:module") # ... else, it stays None, which means invalid return modname, parents + [base] -class AccessorAttributeDocumenter(AccessorLevelDocumenter, - AttributeDocumenter): - objtype = 'accessorattribute' - directivetype = 'attribute' +class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter): + objtype = "accessorattribute" + directivetype = "attribute" # lower than AttributeDocumenter so this is not chosen for normal # attributes @@ -510,8 +515,8 @@ class AccessorAttributeDocumenter(AccessorLevelDocumenter, class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter): - objtype = 'accessormethod' - directivetype = 'method' + objtype = "accessormethod" + directivetype = "method" # lower than MethodDocumenter so this is not chosen for normal methods priority = 0.6 @@ -522,14 +527,15 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter): This documenter lets us removes .__call__ from the method signature for callable accessors like Series.plot """ - objtype = 'accessorcallable' - directivetype = 'method' + + objtype = "accessorcallable" + directivetype = "method" # lower than MethodDocumenter; otherwise the doc build prints warnings priority = 0.5 def format_name(self): - return MethodDocumenter.format_name(self).rstrip('.__call__') + return MethodDocumenter.format_name(self).rstrip(".__call__") class PandasAutosummary(Autosummary): @@ -537,15 +543,16 @@ class PandasAutosummary(Autosummary): This alternative autosummary class lets us override the table summary for Series.plot and DataFrame.plot in the API docs. 
""" + def _replace_pandas_items(self, display_name, sig, summary, real_name): # this a hack: ideally we should extract the signature from the # .__call__ method instead of hard coding this - if display_name == 'DataFrame.plot': - sig = '([x, y, kind, ax, ....])' - summary = 'DataFrame plotting accessor and method' - elif display_name == 'Series.plot': - sig = '([kind, ax, figsize, ....])' - summary = 'Series plotting accessor and method' + if display_name == "DataFrame.plot": + sig = "([x, y, kind, ax, ....])" + summary = "DataFrame plotting accessor and method" + elif display_name == "Series.plot": + sig = "([kind, ax, figsize, ....])" + summary = "Series plotting accessor and method" return (display_name, sig, summary, real_name) @staticmethod @@ -554,15 +561,15 @@ def _is_deprecated(real_name): obj, parent, modname = _import_by_name(real_name) except ImportError: return False - doc = NumpyDocString(obj.__doc__ or '') - summary = ''.join(doc['Summary'] + doc['Extended Summary']) - return '.. deprecated::' in summary + doc = NumpyDocString(obj.__doc__ or "") + summary = "".join(doc["Summary"] + doc["Extended Summary"]) + return ".. deprecated::" in summary def _add_deprecation_prefixes(self, items): for item in items: display_name, sig, summary, real_name = item if self._is_deprecated(real_name): - summary = '(DEPRECATED) %s' % summary + summary = "(DEPRECATED) %s" % summary yield display_name, sig, summary, real_name def get_items(self, names): @@ -577,18 +584,18 @@ def linkcode_resolve(domain, info): """ Determine the URL corresponding to Python object """ - if domain != 'py': + if domain != "py": return None - modname = info['module'] - fullname = info['fullname'] + modname = info["module"] + fullname = info["fullname"] submod = sys.modules.get(modname) if submod is None: return None obj = submod - for part in fullname.split('.'): + for part in fullname.split("."): try: obj = getattr(obj, part) except AttributeError: @@ -617,12 +624,14 @@ def linkcode_resolve(domain, info): fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__)) - if '+' in pandas.__version__: - return ("http://github.com/pandas-dev/pandas/blob/master/pandas/" - "{}{}".format(fn, linespec)) + if "+" in pandas.__version__: + return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format( + fn, linespec + ) else: - return ("http://github.com/pandas-dev/pandas/blob/" - "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec)) + return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format( + pandas.__version__, fn, linespec + ) # remove the docstring of the flags attribute (inherited from numpy ndarray) @@ -646,7 +655,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): """ if what == "class": - joined = '\n'.join(lines) + joined = "\n".join(lines) templates = [ """.. rubric:: Attributes @@ -662,25 +671,25 @@ def process_class_docstrings(app, what, name, obj, options, lines): :toctree: None -""" +""", ] for template in templates: if template in joined: - joined = joined.replace(template, '') - lines[:] = joined.split('\n') + joined = joined.replace(template, "") + lines[:] = joined.split("\n") suppress_warnings = [ # We "overwrite" autosummary with our PandasAutosummary, but # still want the regular autosummary setup to run. So we just # suppress this warning. 
- 'app.add_directive' + "app.add_directive" ] if pattern: # When building a single document we don't want to warn because references # to other documents are unknown, as it's expected - suppress_warnings.append('ref.ref') + suppress_warnings.append("ref.ref") def rstjinja(app, docname, source): @@ -689,12 +698,10 @@ def rstjinja(app, docname, source): """ # http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/ # Make sure we're outputting HTML - if app.builder.format != 'html': + if app.builder.format != "html": return src = source[0] - rendered = app.builder.templates.render_string( - src, app.config.html_context - ) + rendered = app.builder.templates.render_string(src, app.config.html_context) source[0] = rendered @@ -706,4 +713,4 @@ def setup(app): app.add_autodocumenter(AccessorAttributeDocumenter) app.add_autodocumenter(AccessorMethodDocumenter) app.add_autodocumenter(AccessorCallableDocumenter) - app.add_directive('autosummary', PandasAutosummary) + app.add_directive("autosummary", PandasAutosummary) diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 950e3592abf6e..1a5ab99b5a94f 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -40,7 +40,7 @@ from git import Repo -UTF8Writer = codecs.getwriter('utf8') +UTF8Writer = codecs.getwriter("utf8") this_repo = Repo(os.path.join(os.path.dirname(__file__), "..", "..")) author_msg = """\ @@ -54,21 +54,19 @@ def get_authors(revision_range): - pat = '^.*\\t(.*)$' - lst_release, cur_release = [r.strip() for r in revision_range.split('..')] + pat = "^.*\\t(.*)$" + lst_release, cur_release = [r.strip() for r in revision_range.split("..")] # authors, in current release and previous to current release. - cur = set(re.findall(pat, this_repo.git.shortlog('-s', revision_range), - re.M)) - pre = set(re.findall(pat, this_repo.git.shortlog('-s', lst_release), - re.M)) + cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M)) + pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M)) # Homu is the author of auto merges, clean him out. - cur.discard('Homu') - pre.discard('Homu') + cur.discard("Homu") + pre.discard("Homu") # Append '+' to new authors. - authors = [s + ' +' for s in cur - pre] + [s for s in cur & pre] + authors = [s + " +" for s in cur - pre] + [s for s in cur & pre] authors.sort() return authors @@ -77,19 +75,19 @@ def get_pull_requests(repo, revision_range): prnums = [] # From regular merges - merges = this_repo.git.log( - '--oneline', '--merges', revision_range) + merges = this_repo.git.log("--oneline", "--merges", revision_range) issues = re.findall("Merge pull request \\#(\\d*)", merges) prnums.extend(int(s) for s in issues) # From Homu merges (Auto merges) - issues = re. 
findall("Auto merge of \\#(\\d*)", merges) + issues = re.findall("Auto merge of \\#(\\d*)", merges) prnums.extend(int(s) for s in issues) # From fast forward squash-merges commits = this_repo.git.log( - '--oneline', '--no-merges', '--first-parent', revision_range) - issues = re.findall('^.*\\(\\#(\\d+)\\)$', commits, re.M) + "--oneline", "--no-merges", "--first-parent", revision_range + ) + issues = re.findall("^.*\\(\\#(\\d+)\\)$", commits, re.M) prnums.extend(int(s) for s in issues) # get PR data from github repo @@ -99,27 +97,29 @@ def get_pull_requests(repo, revision_range): def build_components(revision_range, heading="Contributors"): - lst_release, cur_release = [r.strip() for r in revision_range.split('..')] + lst_release, cur_release = [r.strip() for r in revision_range.split("..")] authors = get_authors(revision_range) return { - 'heading': heading, - 'author_message': author_msg % len(authors), - 'authors': authors, + "heading": heading, + "author_message": author_msg % len(authors), + "authors": authors, } def build_string(revision_range, heading="Contributors"): components = build_components(revision_range, heading=heading) - components['uline'] = '=' * len(components['heading']) - components['authors'] = "* " + "\n* ".join(components['authors']) + components["uline"] = "=" * len(components["heading"]) + components["authors"] = "* " + "\n* ".join(components["authors"]) - tpl = textwrap.dedent("""\ + tpl = textwrap.dedent( + """\ {heading} {uline} {author_message} - {authors}""").format(**components) + {authors}""" + ).format(**components) return tpl @@ -133,6 +133,6 @@ def main(revision_range): from argparse import ArgumentParser parser = ArgumentParser(description="Generate author lists for release") - parser.add_argument('revision_range', help='..') + parser.add_argument("revision_range", help="..") args = parser.parse_args() main(args.revision_range) diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 7794a24dad89b..4256e4659715d 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -17,40 +17,36 @@ class ContributorsDirective(Directive): required_arguments = 1 - name = 'contributors' + name = "contributors" def run(self): range_ = self.arguments[0] - if range_.endswith('x..HEAD'): + if range_.endswith("x..HEAD"): return [nodes.paragraph(), nodes.bullet_list()] try: components = build_components(range_) except git.GitCommandError as exc: return [ self.state.document.reporter.warning( - "Cannot find contributors for range '{}': {}".format( - range_, exc), - line=self.lineno) + "Cannot find contributors for range '{}': {}".format(range_, exc), + line=self.lineno, + ) ] else: message = nodes.paragraph() - message += nodes.Text(components['author_message']) + message += nodes.Text(components["author_message"]) listnode = nodes.bullet_list() - for author in components['authors']: + for author in components["authors"]: para = nodes.paragraph() para += nodes.Text(author) - listnode += nodes.list_item('', para) + listnode += nodes.list_item("", para) return [message, listnode] def setup(app): - app.add_directive('contributors', ContributorsDirective) + app.add_directive("contributors", ContributorsDirective) - return { - 'version': '0.1', - 'parallel_read_safe': True, - 'parallel_write_safe': True, - } + return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True} diff --git a/pandas/__init__.py b/pandas/__init__.py index 5b39d954c2bc3..6351b508fb0e5 100644 --- a/pandas/__init__.py +++ 
b/pandas/__init__.py @@ -1,6 +1,6 @@ # flake8: noqa -__docformat__ = 'restructuredtext' +__docformat__ = "restructuredtext" # Let users know if they're missing any of our hard dependencies hard_dependencies = ("numpy", "pytz", "dateutil") @@ -13,62 +13,113 @@ missing_dependencies.append("{0}: {1}".format(dependency, str(e))) if missing_dependencies: - raise ImportError("Unable to import required dependencies:\n" + "\n".join(missing_dependencies)) + raise ImportError( + "Unable to import required dependencies:\n" + "\n".join(missing_dependencies) + ) del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import ( - _np_version_under1p14, _np_version_under1p15, _np_version_under1p16, - _np_version_under1p17) + _np_version_under1p14, + _np_version_under1p15, + _np_version_under1p16, + _np_version_under1p17, +) try: - from pandas._libs import (hashtable as _hashtable, - lib as _lib, - tslib as _tslib) + from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib except ImportError as e: # pragma: no cover # hack but overkill to use re - module = str(e).replace('cannot import name ', '') - raise ImportError("C extension: {0} not built. If you want to import " - "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build " - "the C extensions first.".format(module)) + module = str(e).replace("cannot import name ", "") + raise ImportError( + "C extension: {0} not built. If you want to import " + "pandas from the source directory, you may need to run " + "'python setup.py build_ext --inplace --force' to build " + "the C extensions first.".format(module) + ) from datetime import datetime -from pandas._config import (get_option, set_option, reset_option, - describe_option, option_context, options) +from pandas._config import ( + get_option, + set_option, + reset_option, + describe_option, + option_context, + options, +) # let init-time option registration happen import pandas.core.config_init from pandas.core.api import ( # dtype - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, - UInt16Dtype, UInt32Dtype, UInt64Dtype, CategoricalDtype, - PeriodDtype, IntervalDtype, DatetimeTZDtype, - + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + CategoricalDtype, + PeriodDtype, + IntervalDtype, + DatetimeTZDtype, # missing - isna, isnull, notna, notnull, - + isna, + isnull, + notna, + notnull, # indexes - Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, - Float64Index, MultiIndex, IntervalIndex, TimedeltaIndex, - DatetimeIndex, PeriodIndex, IndexSlice, - + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + IndexSlice, # tseries - NaT, Period, period_range, Timedelta, timedelta_range, - Timestamp, date_range, bdate_range, Interval, interval_range, + NaT, + Period, + period_range, + Timedelta, + timedelta_range, + Timestamp, + date_range, + bdate_range, + Interval, + interval_range, DateOffset, - # conversion - to_numeric, to_datetime, to_timedelta, - + to_numeric, + to_datetime, + to_timedelta, # misc - np, Grouper, factorize, unique, value_counts, NamedAgg, - array, Categorical, set_eng_float_format, Series, DataFrame) + np, + Grouper, + factorize, + unique, + value_counts, + NamedAgg, + array, + Categorical, + set_eng_float_format, + Series, + DataFrame, +) from pandas.core.sparse.api import ( 
- SparseArray, SparseDataFrame, SparseSeries, SparseDtype) + SparseArray, + SparseDataFrame, + SparseSeries, + SparseDtype, +) from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -76,35 +127,56 @@ from pandas.core.computation.api import eval from pandas.core.reshape.api import ( - concat, lreshape, melt, wide_to_long, merge, merge_asof, - merge_ordered, crosstab, pivot, pivot_table, get_dummies, - cut, qcut) + concat, + lreshape, + melt, + wide_to_long, + merge, + merge_asof, + merge_ordered, + crosstab, + pivot, + pivot_table, + get_dummies, + cut, + qcut, +) from pandas.util._print_versions import show_versions from pandas.io.api import ( # excel - ExcelFile, ExcelWriter, read_excel, - + ExcelFile, + ExcelWriter, + read_excel, # packers - read_msgpack, to_msgpack, - + read_msgpack, + to_msgpack, # parsers - read_csv, read_fwf, read_table, - + read_csv, + read_fwf, + read_table, # pickle - read_pickle, to_pickle, - + read_pickle, + to_pickle, # pytables - HDFStore, read_hdf, - + HDFStore, + read_hdf, # sql - read_sql, read_sql_query, + read_sql, + read_sql_query, read_sql_table, - # misc - read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas, read_spss) + read_clipboard, + read_parquet, + read_feather, + read_gbq, + read_html, + read_json, + read_stata, + read_sas, + read_spss, +) from pandas.util._tester import test import pandas.testing @@ -112,31 +184,38 @@ # use the closest tagged version if possible from ._version import get_versions + v = get_versions() -__version__ = v.get('closest-tag', v['version']) -__git_version__ = v.get('full-revisionid') +__version__ = v.get("closest-tag", v["version"]) +__git_version__ = v.get("full-revisionid") del get_versions, v # GH 27101 # TODO: remove Panel compat in 1.0 if pandas.compat.PY37: + def __getattr__(name): - if name == 'Panel': + if name == "Panel": import warnings + warnings.warn( "The Panel class is removed from pandas. Accessing it " "from the top-level namespace will also be removed in " "the next version", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) class Panel: pass return Panel - raise AttributeError( - "module 'pandas' has no attribute '{}'".format(name)) + raise AttributeError("module 'pandas' has no attribute '{}'".format(name)) + + else: + class Panel: pass diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index bf221ea444288..65936a9fcdbf3 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -5,11 +5,24 @@ importing `dates` and `display` ensures that keys needed by _libs are initialized. 
""" -__all__ = ["config", "detect_console_encoding", "get_option", "set_option", - "reset_option", "describe_option", "option_context", "options"] +__all__ = [ + "config", + "detect_console_encoding", + "get_option", + "set_option", + "reset_option", + "describe_option", + "option_context", + "options", +] from pandas._config import config from pandas._config import dates # noqa:F401 from pandas._config.config import ( - describe_option, get_option, option_context, options, reset_option, - set_option) + describe_option, + get_option, + option_context, + options, + reset_option, + set_option, +) from pandas._config.display import detect_console_encoding diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 6b685a0ce962a..61e926035c3f2 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -54,9 +54,8 @@ from typing import Dict, List import warnings -DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver') -RegisteredOption = namedtuple('RegisteredOption', - 'key defval doc validator cb') +DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") +RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb") # holds deprecated option metdata _deprecated_options = {} # type: Dict[str, DeprecatedOption] @@ -68,7 +67,7 @@ _global_config = {} # type: Dict[str, str] # keys which have a special meaning -_reserved_keys = ['all'] # type: List[str] +_reserved_keys = ["all"] # type: List[str] class OptionError(AttributeError, KeyError): @@ -76,6 +75,7 @@ class OptionError(AttributeError, KeyError): checks """ + # # User API @@ -85,9 +85,9 @@ def _get_single_key(pat, silent): if len(keys) == 0: if not silent: _warn_if_deprecated(pat) - raise OptionError('No such keys(s): {pat!r}'.format(pat=pat)) + raise OptionError("No such keys(s): {pat!r}".format(pat=pat)) if len(keys) > 1: - raise OptionError('Pattern matched multiple keys') + raise OptionError("Pattern matched multiple keys") key = keys[0] if not silent: @@ -110,11 +110,10 @@ def _set_option(*args, **kwargs): # must at least 1 arg deal with constraints later nargs = len(args) if not nargs or nargs % 2 != 0: - raise ValueError("Must provide an even number of non-keyword " - "arguments") + raise ValueError("Must provide an even number of non-keyword " "arguments") # default to false - silent = kwargs.pop('silent', False) + silent = kwargs.pop("silent", False) if kwargs: msg = '_set_option() got an unexpected keyword argument "{kwarg}"' @@ -139,13 +138,13 @@ def _set_option(*args, **kwargs): o.cb(key) -def _describe_option(pat='', _print_desc=True): +def _describe_option(pat="", _print_desc=True): keys = _select_options(pat) if len(keys) == 0: - raise OptionError('No such keys(s)') + raise OptionError("No such keys(s)") - s = '' + s = "" for k in keys: # filter by pat s += _build_option_description(k) @@ -160,13 +159,15 @@ def _reset_option(pat, silent=False): keys = _select_options(pat) if len(keys) == 0: - raise OptionError('No such keys(s)') + raise OptionError("No such keys(s)") - if len(keys) > 1 and len(pat) < 4 and pat != 'all': - raise ValueError('You must specify at least 4 characters when ' - 'resetting multiple keys, use the special keyword ' - '"all" to reset all the options to their default ' - 'value') + if len(keys) > 1 and len(pat) < 4 and pat != "all": + raise ValueError( + "You must specify at least 4 characters when " + "resetting multiple keys, use the special keyword " + '"all" to reset all the options to their default ' + "value" + ) 
for k in keys: _set_option(k, _registered_options[k].defval, silent=silent) @@ -213,6 +214,7 @@ def __getattr__(self, key): def __dir__(self): return list(self.d.keys()) + # For user convenience, we'd like to have the available options described # in the docstring. For dev convenience we'd like to generate the docstrings # dynamically instead of maintaining them by hand. To this, we use the @@ -223,7 +225,6 @@ def __dir__(self): class CallableDynamicDoc: - def __init__(self, func, doc_tmpl): self.__doc_tmpl__ = doc_tmpl self.__func__ = func @@ -233,10 +234,9 @@ def __call__(self, *args, **kwds): @property def __doc__(self): - opts_desc = _describe_option('all', _print_desc=False) + opts_desc = _describe_option("all", _print_desc=False) opts_list = pp_options_list(list(_registered_options.keys())) - return self.__doc_tmpl__.format(opts_desc=opts_desc, - opts_list=opts_list) + return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list) _get_option_tmpl = """ @@ -394,14 +394,14 @@ class option_context: def __init__(self, *args): if not (len(args) % 2 == 0 and len(args) >= 2): - raise ValueError('Need to invoke as' - ' option_context(pat, val, [(pat, val), ...]).') + raise ValueError( + "Need to invoke as" " option_context(pat, val, [(pat, val), ...])." + ) self.ops = list(zip(args[::2], args[1::2])) def __enter__(self): - self.undo = [(pat, _get_option(pat, silent=True)) - for pat, val in self.ops] + self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops] for pat, val in self.ops: _set_option(pat, val, silent=True) @@ -412,7 +412,7 @@ def __exit__(self, *args): _set_option(pat, val, silent=True) -def register_option(key, defval, doc='', validator=None, cb=None): +def register_option(key, defval, doc="", validator=None, cb=None): """Register an option in the package-wide pandas config object Parameters @@ -437,6 +437,7 @@ def register_option(key, defval, doc='', validator=None, cb=None): """ import tokenize import keyword + key = key.lower() if key in _registered_options: @@ -451,10 +452,10 @@ def register_option(key, defval, doc='', validator=None, cb=None): validator(defval) # walk the nested dict, creating dicts as needed along the path - path = key.split('.') + path = key.split(".") for k in path: - if not bool(re.match('^' + tokenize.Name + '$', k)): + if not bool(re.match("^" + tokenize.Name + "$", k)): raise ValueError("{k} is not a valid identifier".format(k=k)) if keyword.iskeyword(k): raise ValueError("{k} is a python keyword".format(k=k)) @@ -463,20 +464,20 @@ def register_option(key, defval, doc='', validator=None, cb=None): msg = "Path prefix to option '{option}' is already an option" for i, p in enumerate(path[:-1]): if not isinstance(cursor, dict): - raise OptionError(msg.format(option='.'.join(path[:i]))) + raise OptionError(msg.format(option=".".join(path[:i]))) if p not in cursor: cursor[p] = {} cursor = cursor[p] if not isinstance(cursor, dict): - raise OptionError(msg.format(option='.'.join(path[:-1]))) + raise OptionError(msg.format(option=".".join(path[:-1]))) cursor[path[-1]] = defval # initialize # save the option metadata - _registered_options[key] = RegisteredOption(key=key, defval=defval, - doc=doc, validator=validator, - cb=cb) + _registered_options[key] = RegisteredOption( + key=key, defval=defval, doc=doc, validator=validator, cb=cb + ) def deprecate_option(key, msg=None, rkey=None, removal_ver=None): @@ -526,6 +527,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): _deprecated_options[key] = 
DeprecatedOption(key, msg, rkey, removal_ver) + # # functions internal to the module @@ -542,14 +544,14 @@ def _select_options(pat): # else look through all of them keys = sorted(_registered_options.keys()) - if pat == 'all': # reserved key + if pat == "all": # reserved key return keys return [k for k in keys if re.search(pat, k, re.I)] def _get_root(key): - path = key.split('.') + path = key.split(".") cursor = _global_config for p in path[:-1]: cursor = cursor[p] @@ -621,12 +623,11 @@ def _warn_if_deprecated(key): else: msg = "'{key}' is deprecated".format(key=key) if d.removal_ver: - msg += (' and will be removed in {version}' - .format(version=d.removal_ver)) + msg += " and will be removed in {version}".format(version=d.removal_ver) if d.rkey: msg += ", please use '{rkey}' instead.".format(rkey=d.rkey) else: - msg += ', please refrain from using it.' + msg += ", please refrain from using it." warnings.warn(msg, FutureWarning) return True @@ -639,22 +640,22 @@ def _build_option_description(k): o = _get_registered_option(k) d = _get_deprecated_option(k) - s = '{k} '.format(k=k) + s = "{k} ".format(k=k) if o.doc: - s += '\n'.join(o.doc.strip().split('\n')) + s += "\n".join(o.doc.strip().split("\n")) else: - s += 'No description available.' + s += "No description available." if o: - s += ('\n [default: {default}] [currently: {current}]' - .format(default=o.defval, current=_get_option(k, True))) + s += "\n [default: {default}] [currently: {current}]".format( + default=o.defval, current=_get_option(k, True) + ) if d: - s += '\n (Deprecated' - s += (', use `{rkey}` instead.' - .format(rkey=d.rkey if d.rkey else '')) - s += ')' + s += "\n (Deprecated" + s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "") + s += ")" return s @@ -666,28 +667,34 @@ def pp_options_list(keys, width=80, _print=False): from itertools import groupby def pp(name, ks): - pfx = ('- ' + name + '.[' if name else '') - ls = wrap(', '.join(ks), width, initial_indent=pfx, - subsequent_indent=' ', break_long_words=False) + pfx = "- " + name + ".[" if name else "" + ls = wrap( + ", ".join(ks), + width, + initial_indent=pfx, + subsequent_indent=" ", + break_long_words=False, + ) if ls and ls[-1] and name: - ls[-1] = ls[-1] + ']' + ls[-1] = ls[-1] + "]" return ls ls = [] - singles = [x for x in sorted(keys) if x.find('.') < 0] + singles = [x for x in sorted(keys) if x.find(".") < 0] if singles: - ls += pp('', singles) - keys = [x for x in keys if x.find('.') >= 0] + ls += pp("", singles) + keys = [x for x in keys if x.find(".") >= 0] - for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]): - ks = [x[len(k) + 1:] for x in list(g)] + for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]): + ks = [x[len(k) + 1 :] for x in list(g)] ls += pp(k, ks) - s = '\n'.join(ls) + s = "\n".join(ls) if _print: print(s) else: return s + # # helpers @@ -724,7 +731,7 @@ def config_prefix(prefix): def wrap(func): def inner(key, *args, **kwds): - pkey = '{prefix}.{key}'.format(prefix=prefix, key=key) + pkey = "{prefix}.{key}".format(prefix=prefix, key=key) return func(pkey, *args, **kwds) return inner @@ -740,6 +747,7 @@ def inner(key, *args, **kwds): get_option = _get_option register_option = _register_option + # These factories and methods are handy for use as the validator # arg in register_option diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py index 85300a308de62..5bf2b49ce5904 100644 --- a/pandas/_config/dates.py +++ b/pandas/_config/dates.py @@ -13,9 +13,11 @@ When True, prints and parses dates 
with the year first, eg 2005/01/20 """ -with cf.config_prefix('display'): +with cf.config_prefix("display"): # Needed upstream of `_libs` because these are used in tslibs.parsing - cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc, - validator=cf.is_bool) - cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc, - validator=cf.is_bool) + cf.register_option( + "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool + ) + cf.register_option( + "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool + ) diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 7997d12e06aa9..6e5fabe2706e5 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -25,14 +25,14 @@ def detect_console_encoding(): pass # try again for something better - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): try: encoding = locale.getpreferredencoding() except Exception: pass # when all else fails. this will usually be "ascii" - if not encoding or 'ascii' in encoding.lower(): + if not encoding or "ascii" in encoding.lower(): encoding = sys.getdefaultencoding() # GH#3360, save the reported defencoding at import time @@ -50,6 +50,7 @@ def detect_console_encoding(): these are generally strings meant to be displayed on the console. """ -with cf.config_prefix('display'): - cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc, - validator=cf.is_text) +with cf.config_prefix("display"): + cf.register_option( + "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text + ) diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 1ca6d073f18c4..46802c6460959 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -37,7 +37,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): locale.setlocale(lc_var, new_locale) normalized_locale = locale.getlocale() if all(x is not None for x in normalized_locale): - yield '.'.join(normalized_locale) + yield ".".join(normalized_locale) else: yield new_locale finally: @@ -99,15 +99,16 @@ def _valid_locales(locales, normalize): def _default_locale_getter(): try: - raw_locales = subprocess.check_output(['locale -a'], shell=True) + raw_locales = subprocess.check_output(["locale -a"], shell=True) except subprocess.CalledProcessError as e: - raise type(e)("{exception}, the 'locale -a' command cannot be found " - "on your system".format(exception=e)) + raise type(e)( + "{exception}, the 'locale -a' command cannot be found " + "on your system".format(exception=e) + ) return raw_locales -def get_locales(prefix=None, normalize=True, - locale_getter=_default_locale_getter): +def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): """ Get all the locales that are available on the system. @@ -145,11 +146,10 @@ def get_locales(prefix=None, normalize=True, # raw_locales is "\n" separated list of locales # it may contain non-decodable parts, so split # extract what we can and then rejoin. 
- raw_locales = raw_locales.split(b'\n') + raw_locales = raw_locales.split(b"\n") out_locales = [] for x in raw_locales: - out_locales.append(str( - x, encoding=options.display.encoding)) + out_locales.append(str(x, encoding=options.display.encoding)) except TypeError: pass @@ -157,6 +157,6 @@ def get_locales(prefix=None, normalize=True, if prefix is None: return _valid_locales(out_locales, normalize) - pattern = re.compile('{prefix}.*'.format(prefix=prefix)) - found = pattern.findall('\n'.join(out_locales)) + pattern = re.compile("{prefix}.*".format(prefix=prefix)) + found = pattern.findall("\n".join(out_locales)) return _valid_locales(found, normalize) diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py index fcf5ffbfcad92..af67cb3be7102 100644 --- a/pandas/_libs/__init__.py +++ b/pandas/_libs/__init__.py @@ -1,4 +1,11 @@ # flake8: noqa from .tslibs import ( - NaT, NaTType, OutOfBoundsDatetime, Period, Timedelta, Timestamp, iNaT) + NaT, + NaTType, + OutOfBoundsDatetime, + Period, + Timedelta, + Timestamp, + iNaT, +) diff --git a/pandas/_typing.py b/pandas/_typing.py index 8947e98bf52ce..46b1b4685ec9f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -9,19 +9,25 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries) + ABCDataFrame, + ABCExtensionArray, + ABCIndexClass, + ABCSeries, + ABCSparseSeries, +) -AnyArrayLike = TypeVar('AnyArrayLike', - ABCExtensionArray, - ABCIndexClass, - ABCSeries, - ABCSparseSeries, - np.ndarray) -ArrayLike = TypeVar('ArrayLike', ABCExtensionArray, np.ndarray) -DatetimeLikeScalar = TypeVar('DatetimeLikeScalar', Period, Timestamp, - Timedelta) +AnyArrayLike = TypeVar( + "AnyArrayLike", + ABCExtensionArray, + ABCIndexClass, + ABCSeries, + ABCSparseSeries, + np.ndarray, +) +ArrayLike = TypeVar("ArrayLike", ABCExtensionArray, np.ndarray) +DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta) Dtype = Union[str, np.dtype, ExtensionDtype] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame) +FrameOrSeries = TypeVar("FrameOrSeries", ABCSeries, ABCDataFrame) Scalar = Union[str, int, float] diff --git a/pandas/_version.py b/pandas/_version.py index 5031f411270a1..4f5bdf59a99d5 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -56,6 +56,7 @@ def decorate(f: Callable) -> Callable: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -66,9 +67,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -96,14 +100,19 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: - print("guessing rootdir is '{root}', but '{dirname}' " - "doesn't start with prefix '{parentdir_prefix}'".format( - root=root, dirname=dirname, - parentdir_prefix=parentdir_prefix)) + print( + "guessing rootdir is '{root}', but '{dirname}' " + "doesn't start with prefix '{parentdir_prefix}'".format( + root=root, dirname=dirname, 
parentdir_prefix=parentdir_prefix + ) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + } @register_vcs_handler("git", "get_keywords") @@ -143,7 +152,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -152,7 +161,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '{}', no digits".format(",".join(refs - tags))) if verbose: @@ -160,19 +169,24 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking {r}".format(r=r)) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + } @register_vcs_handler("git", "pieces_from_vcs") @@ -192,9 +206,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): GITS = ["git.cmd", "git.exe"] # if there is a tag, this yields TAG-NUM-gHEX[-dirty] # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) + describe_out = run_command( + GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -217,32 +231,32 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: " - "'{describe_out}'".format( - describe_out=describe_out)) + pieces["error"] = ( + "unable to parse git-describe output: " + "'{describe_out}'".format(describe_out=describe_out) + ) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - fmt = ("tag '{full_tag}' doesn't start with prefix " - "'{tag_prefix}'") + fmt = "tag '{full_tag}' doesn't start with prefix " "'{tag_prefix}'" msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: print(msg) pieces["error"] = msg return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -253,8 +267,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces @@ -283,8 +296,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], - pieces["short"]) + rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -391,10 +403,12 @@ def render_git_describe_long(pieces): def render(pieces, style): if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + } if not style or style == "default": style = "pep440" # the default @@ -414,8 +428,12 @@ def render(pieces, style): else: raise ValueError("unknown style '{style}'".format(style=style)) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + } def get_versions(): @@ -428,8 +446,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -438,12 +455,15 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -457,6 +477,9 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + } diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 0bd2733cb494c..431dd2b1968ae 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,12 +1,14 @@ """Public API for extending pandas objects.""" from pandas.core.dtypes.dtypes import ( # noqa: F401 - ExtensionDtype, register_extension_dtype) + ExtensionDtype, + register_extension_dtype, +) from pandas.core.accessor import ( # noqa: F401 - register_index_accessor, register_series_accessor) + register_index_accessor, + register_series_accessor, +) from pandas.core.algorithms import take # noqa: F401 -from pandas.core.arrays import ( # noqa: F401 - ExtensionArray, ExtensionScalarOpsMixin) +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 -from pandas.core.accessor import ( # noqa: F401; noqa: F401 - register_dataframe_accessor) +from pandas.core.accessor import register_dataframe_accessor # noqa: F401; noqa: F401 diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index 668f79921d8e6..f32e1abe28cc1 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -5,4 +5,8 @@ from pandas.core.dtypes.api import * # noqa: F403, F401 from pandas.core.dtypes.concat import union_categoricals # noqa: F401 from pandas.core.dtypes.dtypes import ( # noqa: F401 - CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype) + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index ab014d49236b3..db01f2a0c674f 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -4,16 +4,23 @@ See :ref:`extending.extension-types` for more. 
""" from pandas.core.arrays import ( - Categorical, DatetimeArray, IntegerArray, IntervalArray, PandasArray, - PeriodArray, SparseArray, TimedeltaArray) + Categorical, + DatetimeArray, + IntegerArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + TimedeltaArray, +) __all__ = [ - 'Categorical', - 'DatetimeArray', - 'IntegerArray', - 'IntervalArray', - 'PandasArray', - 'PeriodArray', - 'SparseArray', - 'TimedeltaArray', + "Categorical", + "DatetimeArray", + "IntegerArray", + "IntervalArray", + "PandasArray", + "PeriodArray", + "SparseArray", + "TimedeltaArray", ] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4459e66540dac..c9597505fa596 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -13,7 +13,7 @@ PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) -PYPY = platform.python_implementation() == 'PyPy' +PYPY = platform.python_implementation() == "PyPy" # ---------------------------------------------------------------------------- @@ -29,9 +29,7 @@ def set_function_name(f, name, cls): Bind the name/qualname attributes of the function """ f.__name__ = name - f.__qualname__ = '{klass}.{name}'.format( - klass=cls.__name__, - name=name) + f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name) f.__module__ = cls.__module__ return f @@ -49,19 +47,19 @@ def raise_with_traceback(exc, traceback=Ellipsis): # https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian(): """ am I little endian """ - return sys.byteorder == 'little' + return sys.byteorder == "little" def is_platform_windows(): - return sys.platform == 'win32' or sys.platform == 'cygwin' + return sys.platform == "win32" or sys.platform == "cygwin" def is_platform_linux(): - return sys.platform == 'linux2' + return sys.platform == "linux2" def is_platform_mac(): - return sys.platform == 'darwin' + return sys.platform == "darwin" def is_platform_32bit(): diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 620884d66821c..cd4e1b7e8aa4d 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -39,23 +39,18 @@ def _get_version(module: types.ModuleType) -> str: - version = getattr(module, '__version__', None) + version = getattr(module, "__version__", None) if version is None: # xlrd uses a capitalized attribute name - version = getattr(module, '__VERSION__', None) + version = getattr(module, "__VERSION__", None) if version is None: - raise ImportError( - "Can't determine version for {}".format(module.__name__) - ) + raise ImportError("Can't determine version for {}".format(module.__name__)) return version def import_optional_dependency( - name: str, - extra: str = "", - raise_on_missing: bool = True, - on_version: str = "raise", + name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise" ): """ Import an optional dependency. 
@@ -105,9 +100,7 @@ def import_optional_dependency( if distutils.version.LooseVersion(version) < minimum_version: assert on_version in {"warn", "raise", "ignore"} msg = version_message.format( - minimum_version=minimum_version, - name=name, - actual_version=version, + minimum_version=minimum_version, name=name, actual_version=version ) if on_version == "warn": warnings.warn(msg, UserWarning) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index e57a2ba3af0ac..83f1da597d6a6 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -2,7 +2,6 @@ class DeepChainMap(ChainMap): - def __setitem__(self, key, value): for mapping in self.maps: if key in mapping: diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 22bfab8b7c6d6..ce56c08d3ec14 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,27 +8,29 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p14 = _nlv < LooseVersion('1.14') -_np_version_under1p15 = _nlv < LooseVersion('1.15') -_np_version_under1p16 = _nlv < LooseVersion('1.16') -_np_version_under1p17 = _nlv < LooseVersion('1.17') -_is_numpy_dev = '.dev' in str(_nlv) +_np_version_under1p14 = _nlv < LooseVersion("1.14") +_np_version_under1p15 = _nlv < LooseVersion("1.15") +_np_version_under1p16 = _nlv < LooseVersion("1.16") +_np_version_under1p17 = _nlv < LooseVersion("1.17") +_is_numpy_dev = ".dev" in str(_nlv) -if _nlv < '1.13.3': - raise ImportError('this version of pandas is incompatible with ' - 'numpy < 1.13.3\n' - 'your numpy version is {0}.\n' - 'Please upgrade numpy to >= 1.13.3 to use ' - 'this pandas version'.format(_np_version)) +if _nlv < "1.13.3": + raise ImportError( + "this version of pandas is incompatible with " + "numpy < 1.13.3\n" + "your numpy version is {0}.\n" + "Please upgrade numpy to >= 1.13.3 to use " + "this pandas version".format(_np_version) + ) -_tz_regex = re.compile('[+-]0000$') +_tz_regex = re.compile("[+-]0000$") def tz_replacer(s): if isinstance(s, str): - if s.endswith('Z'): + if s.endswith("Z"): s = s[:-1] elif _tz_regex.search(s): s = s[:-5] @@ -53,7 +55,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): warning, when need to pass '2015-01-01 09:00:00' """ # is_list_like - if (hasattr(arr, '__iter__') and not isinstance(arr, (str, bytes))): + if hasattr(arr, "__iter__") and not isinstance(arr, (str, bytes)): arr = [tz_replacer(s) for s in arr] else: arr = tz_replacer(arr) @@ -61,11 +63,12 @@ def np_array_datetime64_compat(arr, *args, **kwargs): return np.array(arr, *args, **kwargs) -__all__ = ['np', - '_np_version', - '_np_version_under1p14', - '_np_version_under1p15', - '_np_version_under1p16', - '_np_version_under1p17', - '_is_numpy_dev' - ] +__all__ = [ + "np", + "_np_version", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_is_numpy_dev", +] diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 572dd7272986b..840dec2489a52 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -26,45 +26,50 @@ from pandas._libs.lib import is_bool, is_integer from pandas.errors import UnsupportedFunctionCall from pandas.util._validators import ( - validate_args, validate_args_and_kwargs, validate_kwargs) + validate_args, + validate_args_and_kwargs, + validate_kwargs, +) class CompatValidator: - - def __init__(self, defaults, fname=None, method=None, - max_fname_arg_count=None): + 
def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): self.fname = fname self.method = method self.defaults = defaults self.max_fname_arg_count = max_fname_arg_count - def __call__(self, args, kwargs, fname=None, - max_fname_arg_count=None, method=None): + def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): if args or kwargs: fname = self.fname if fname is None else fname - max_fname_arg_count = (self.max_fname_arg_count if - max_fname_arg_count is None - else max_fname_arg_count) + max_fname_arg_count = ( + self.max_fname_arg_count + if max_fname_arg_count is None + else max_fname_arg_count + ) method = self.method if method is None else method - if method == 'args': + if method == "args": validate_args(fname, args, max_fname_arg_count, self.defaults) - elif method == 'kwargs': + elif method == "kwargs": validate_kwargs(fname, kwargs, self.defaults) - elif method == 'both': - validate_args_and_kwargs(fname, args, kwargs, - max_fname_arg_count, - self.defaults) + elif method == "both": + validate_args_and_kwargs( + fname, args, kwargs, max_fname_arg_count, self.defaults + ) else: - raise ValueError("invalid validation method " - "'{method}'".format(method=method)) + raise ValueError( + "invalid validation method " "'{method}'".format(method=method) + ) ARGMINMAX_DEFAULTS = dict(out=None) -validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin', - method='both', max_fname_arg_count=1) -validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax', - method='both', max_fname_arg_count=1) +validate_argmin = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 +) +validate_argmax = CompatValidator( + ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1 +) def process_skipna(skipna, args): @@ -103,28 +108,30 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[Union[int, str]]] -ARGSORT_DEFAULTS['axis'] = -1 -ARGSORT_DEFAULTS['kind'] = 'quicksort' -ARGSORT_DEFAULTS['order'] = None +ARGSORT_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[Union[int, str]]] +ARGSORT_DEFAULTS["axis"] = -1 +ARGSORT_DEFAULTS["kind"] = "quicksort" +ARGSORT_DEFAULTS["order"] = None if LooseVersion(_np_version) >= LooseVersion("1.17.0"): # GH-26361. NumPy added radix sort and changed default to None. 
- ARGSORT_DEFAULTS['kind'] = None + ARGSORT_DEFAULTS["kind"] = None -validate_argsort = CompatValidator(ARGSORT_DEFAULTS, fname='argsort', - max_fname_arg_count=0, method='both') +validate_argsort = CompatValidator( + ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" +) # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND = OrderedDict() \ - # type: OrderedDict[str, Optional[int]] -ARGSORT_DEFAULTS_KIND['axis'] = -1 -ARGSORT_DEFAULTS_KIND['order'] = None -validate_argsort_kind = CompatValidator(ARGSORT_DEFAULTS_KIND, fname='argsort', - max_fname_arg_count=0, method='both') +ARGSORT_DEFAULTS_KIND = OrderedDict() +# type: OrderedDict[str, Optional[int]] +ARGSORT_DEFAULTS_KIND["axis"] = -1 +ARGSORT_DEFAULTS_KIND["order"] = None +validate_argsort_kind = CompatValidator( + ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" +) def validate_argsort_with_ascending(ascending, args, kwargs): @@ -145,8 +152,9 @@ def validate_argsort_with_ascending(ascending, args, kwargs): CLIP_DEFAULTS = dict(out=None) # type Dict[str, Any] -validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip', - method='both', max_fname_arg_count=3) +validate_clip = CompatValidator( + CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 +) def validate_clip_with_axis(axis, args, kwargs): @@ -166,18 +174,21 @@ def validate_clip_with_axis(axis, args, kwargs): COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] -COMPRESS_DEFAULTS['axis'] = None -COMPRESS_DEFAULTS['out'] = None -validate_compress = CompatValidator(COMPRESS_DEFAULTS, fname='compress', - method='both', max_fname_arg_count=1) +COMPRESS_DEFAULTS["axis"] = None +COMPRESS_DEFAULTS["out"] = None +validate_compress = CompatValidator( + COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 +) CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any] -CUM_FUNC_DEFAULTS['dtype'] = None -CUM_FUNC_DEFAULTS['out'] = None -validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both', - max_fname_arg_count=1) -validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum', - method='both', max_fname_arg_count=1) +CUM_FUNC_DEFAULTS["dtype"] = None +CUM_FUNC_DEFAULTS["out"] = None +validate_cum_func = CompatValidator( + CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1 +) +validate_cumsum = CompatValidator( + CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1 +) def validate_cum_func_with_skipna(skipna, args, kwargs, name): @@ -196,81 +207,88 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]] -ALLANY_DEFAULTS['dtype'] = None -ALLANY_DEFAULTS['out'] = None -ALLANY_DEFAULTS['keepdims'] = False -validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all', - method='both', max_fname_arg_count=1) -validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any', - method='both', max_fname_arg_count=1) +ALLANY_DEFAULTS["dtype"] = None +ALLANY_DEFAULTS["out"] = None +ALLANY_DEFAULTS["keepdims"] = False +validate_all = CompatValidator( + ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1 +) +validate_any = CompatValidator( + ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 +) LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) -validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') +validate_logical_func = 
CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") MINMAX_DEFAULTS = dict(out=None, keepdims=False) -validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min', - method='both', max_fname_arg_count=1) -validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max', - method='both', max_fname_arg_count=1) - -RESHAPE_DEFAULTS = dict(order='C') # type: Dict[str, str] -validate_reshape = CompatValidator(RESHAPE_DEFAULTS, fname='reshape', - method='both', max_fname_arg_count=1) +validate_min = CompatValidator( + MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 +) +validate_max = CompatValidator( + MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 +) + +RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str] +validate_reshape = CompatValidator( + RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 +) REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any] -validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat', - method='both', max_fname_arg_count=1) +validate_repeat = CompatValidator( + REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 +) ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any] -validate_round = CompatValidator(ROUND_DEFAULTS, fname='round', - method='both', max_fname_arg_count=1) +validate_round = CompatValidator( + ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 +) -SORT_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[Union[int, str]]] -SORT_DEFAULTS['axis'] = -1 -SORT_DEFAULTS['kind'] = 'quicksort' -SORT_DEFAULTS['order'] = None -validate_sort = CompatValidator(SORT_DEFAULTS, fname='sort', - method='kwargs') +SORT_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[Union[int, str]]] +SORT_DEFAULTS["axis"] = -1 +SORT_DEFAULTS["kind"] = "quicksort" +SORT_DEFAULTS["order"] = None +validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]] -STAT_FUNC_DEFAULTS['dtype'] = None -STAT_FUNC_DEFAULTS['out'] = None +STAT_FUNC_DEFAULTS["dtype"] = None +STAT_FUNC_DEFAULTS["out"] = None PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy() -SUM_DEFAULTS['keepdims'] = False -SUM_DEFAULTS['initial'] = None +SUM_DEFAULTS["keepdims"] = False +SUM_DEFAULTS["initial"] = None MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy() -MEDIAN_DEFAULTS['overwrite_input'] = False -MEDIAN_DEFAULTS['keepdims'] = False - -STAT_FUNC_DEFAULTS['keepdims'] = False - -validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, - method='kwargs') -validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum', - method='both', max_fname_arg_count=1) -validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod", - method="both", max_fname_arg_count=1) -validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean', - method='both', max_fname_arg_count=1) -validate_median = CompatValidator(MEDIAN_DEFAULTS, fname='median', - method='both', max_fname_arg_count=1) - -STAT_DDOF_FUNC_DEFAULTS = OrderedDict() \ - # type: OrderedDict[str, Optional[bool]] -STAT_DDOF_FUNC_DEFAULTS['dtype'] = None -STAT_DDOF_FUNC_DEFAULTS['out'] = None -STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False -validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, - method='kwargs') +MEDIAN_DEFAULTS["overwrite_input"] = False +MEDIAN_DEFAULTS["keepdims"] = False + +STAT_FUNC_DEFAULTS["keepdims"] = False + +validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs") +validate_sum = CompatValidator( + SUM_DEFAULTS, 
fname="sum", method="both", max_fname_arg_count=1 +) +validate_prod = CompatValidator( + PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1 +) +validate_mean = CompatValidator( + STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1 +) +validate_median = CompatValidator( + MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 +) + +STAT_DDOF_FUNC_DEFAULTS = OrderedDict() +# type: OrderedDict[str, Optional[bool]] +STAT_DDOF_FUNC_DEFAULTS["dtype"] = None +STAT_DDOF_FUNC_DEFAULTS["out"] = None +STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False +validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]] -TAKE_DEFAULTS['out'] = None -TAKE_DEFAULTS['mode'] = 'raise' -validate_take = CompatValidator(TAKE_DEFAULTS, fname='take', - method='kwargs') +TAKE_DEFAULTS["out"] = None +TAKE_DEFAULTS["mode"] = "raise" +validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") def validate_take_with_convert(convert, args, kwargs): @@ -285,20 +303,23 @@ def validate_take_with_convert(convert, args, kwargs): args = (convert,) + args convert = True - validate_take(args, kwargs, max_fname_arg_count=3, method='both') + validate_take(args, kwargs, max_fname_arg_count=3, method="both") return convert TRANSPOSE_DEFAULTS = dict(axes=None) -validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose', - method='both', max_fname_arg_count=0) +validate_transpose = CompatValidator( + TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 +) def validate_window_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .{func}() directly instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .{func}() directly instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -309,10 +330,12 @@ def validate_window_func(name, args, kwargs): def validate_rolling_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .rolling(...).{func}() instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .rolling(...).{func}() instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -323,10 +346,12 @@ def validate_rolling_func(name, args, kwargs): def validate_expanding_func(name, args, kwargs): - numpy_args = ('axis', 'dtype', 'out') - msg = ("numpy operations are not " - "valid with window objects. " - "Use .expanding(...).{func}() instead ".format(func=name)) + numpy_args = ("axis", "dtype", "out") + msg = ( + "numpy operations are not " + "valid with window objects. " + "Use .expanding(...).{func}() instead ".format(func=name) + ) if len(args) > 0: raise UnsupportedFunctionCall(msg) @@ -349,14 +374,16 @@ def validate_groupby_func(name, args, kwargs, allowed=None): kwargs = set(kwargs) - set(allowed) if len(args) + len(kwargs) > 0: - raise UnsupportedFunctionCall(( - "numpy operations are not valid " - "with groupby. Use .groupby(...)." - "{func}() instead".format(func=name))) + raise UnsupportedFunctionCall( + ( + "numpy operations are not valid " + "with groupby. Use .groupby(...)." 
+ "{func}() instead".format(func=name) + ) + ) -RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', - 'mean', 'std', 'var') +RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") def validate_resampler_func(method, args, kwargs): @@ -367,10 +394,13 @@ def validate_resampler_func(method, args, kwargs): """ if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: - raise UnsupportedFunctionCall(( - "numpy operations are not valid " - "with resample. Use .resample(...)." - "{func}() instead".format(func=method))) + raise UnsupportedFunctionCall( + ( + "numpy operations are not valid " + "with resample. Use .resample(...)." + "{func}() instead".format(func=method) + ) + ) else: raise TypeError("too many arguments passed in") @@ -392,5 +422,7 @@ def validate_minmax_axis(axis): if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): - raise ValueError("`axis` must be fewer than the number of " - "dimensions ({ndim})".format(ndim=ndim)) + raise ValueError( + "`axis` must be fewer than the number of " + "dimensions ({ndim})".format(ndim=ndim) + ) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 3b63cbf1cfabb..0934d8529fdf7 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -26,7 +26,7 @@ def load_reduce(self): # If we have a deprecated function, # try to replace and try again. - msg = '_reconstruct: First argument must be a sub-type of ndarray' + msg = "_reconstruct: First argument must be a sub-type of ndarray" if msg in str(e): try: @@ -37,10 +37,11 @@ def load_reduce(self): pass # try to re-encode the arguments - if getattr(self, 'encoding', None) is not None: - args = tuple(arg.encode(self.encoding) - if isinstance(arg, str) - else arg for arg in args) + if getattr(self, "encoding", None) is not None: + args = tuple( + arg.encode(self.encoding) if isinstance(arg, str) else arg + for arg in args + ) try: stack[-1] = func(*args) return @@ -48,7 +49,7 @@ def load_reduce(self): pass # unknown exception, re-raise - if getattr(self, 'is_verbose', None): + if getattr(self, "is_verbose", None): print(sys.exc_info()) print(func, args) raise @@ -56,9 +57,7 @@ def load_reduce(self): # If classes are moved, provide compat here. _class_locations_map = { - ('pandas.core.sparse.array', 'SparseArray'): - ('pandas.core.arrays', 'SparseArray'), - + ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"), # 15477 # # TODO: When FrozenNDArray is removed, add @@ -71,75 +70,84 @@ def load_reduce(self): # # Afterwards, remove the current entry # for `pandas.core.base.FrozenNDArray`. 
- ('pandas.core.base', 'FrozenNDArray'): - ('pandas.core.indexes.frozen', 'FrozenNDArray'), - ('pandas.core.base', 'FrozenList'): - ('pandas.core.indexes.frozen', 'FrozenList'), - + ("pandas.core.base", "FrozenNDArray"): ( + "pandas.core.indexes.frozen", + "FrozenNDArray", + ), + ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"), # 10890 - ('pandas.core.series', 'TimeSeries'): - ('pandas.core.series', 'Series'), - ('pandas.sparse.series', 'SparseTimeSeries'): - ('pandas.core.sparse.series', 'SparseSeries'), - + ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"), + ("pandas.sparse.series", "SparseTimeSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), # 12588, extensions moving - ('pandas._sparse', 'BlockIndex'): - ('pandas._libs.sparse', 'BlockIndex'), - ('pandas.tslib', 'Timestamp'): - ('pandas._libs.tslib', 'Timestamp'), - + ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"), + ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"), # 18543 moving period - ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'), - ('pandas._libs.period', 'Period'): - ('pandas._libs.tslibs.period', 'Period'), - + ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"), + ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"), # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype - ('pandas.tslib', '__nat_unpickle'): - ('pandas._libs.tslibs.nattype', '__nat_unpickle'), - ('pandas._libs.tslib', '__nat_unpickle'): - ('pandas._libs.tslibs.nattype', '__nat_unpickle'), - + ("pandas.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), + ("pandas._libs.tslib", "__nat_unpickle"): ( + "pandas._libs.tslibs.nattype", + "__nat_unpickle", + ), # 15998 top-level dirs moving - ('pandas.sparse.array', 'SparseArray'): - ('pandas.core.arrays.sparse', 'SparseArray'), - ('pandas.sparse.series', 'SparseSeries'): - ('pandas.core.sparse.series', 'SparseSeries'), - ('pandas.sparse.frame', 'SparseDataFrame'): - ('pandas.core.sparse.frame', 'SparseDataFrame'), - ('pandas.indexes.base', '_new_Index'): - ('pandas.core.indexes.base', '_new_Index'), - ('pandas.indexes.base', 'Index'): - ('pandas.core.indexes.base', 'Index'), - ('pandas.indexes.numeric', 'Int64Index'): - ('pandas.core.indexes.numeric', 'Int64Index'), - ('pandas.indexes.range', 'RangeIndex'): - ('pandas.core.indexes.range', 'RangeIndex'), - ('pandas.indexes.multi', 'MultiIndex'): - ('pandas.core.indexes.multi', 'MultiIndex'), - ('pandas.tseries.index', '_new_DatetimeIndex'): - ('pandas.core.indexes.datetimes', '_new_DatetimeIndex'), - ('pandas.tseries.index', 'DatetimeIndex'): - ('pandas.core.indexes.datetimes', 'DatetimeIndex'), - ('pandas.tseries.period', 'PeriodIndex'): - ('pandas.core.indexes.period', 'PeriodIndex'), - + ("pandas.sparse.array", "SparseArray"): ( + "pandas.core.arrays.sparse", + "SparseArray", + ), + ("pandas.sparse.series", "SparseSeries"): ( + "pandas.core.sparse.series", + "SparseSeries", + ), + ("pandas.sparse.frame", "SparseDataFrame"): ( + "pandas.core.sparse.frame", + "SparseDataFrame", + ), + ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"), + ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"), + ("pandas.indexes.numeric", "Int64Index"): ( + "pandas.core.indexes.numeric", + "Int64Index", + ), + ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"), + 
("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"), + ("pandas.tseries.index", "_new_DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "_new_DatetimeIndex", + ), + ("pandas.tseries.index", "DatetimeIndex"): ( + "pandas.core.indexes.datetimes", + "DatetimeIndex", + ), + ("pandas.tseries.period", "PeriodIndex"): ( + "pandas.core.indexes.period", + "PeriodIndex", + ), # 19269, arrays moving - ('pandas.core.categorical', 'Categorical'): - ('pandas.core.arrays', 'Categorical'), - + ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"), # 19939, add timedeltaindex, float64index compat from 15998 move - ('pandas.tseries.tdi', 'TimedeltaIndex'): - ('pandas.core.indexes.timedeltas', 'TimedeltaIndex'), - ('pandas.indexes.numeric', 'Float64Index'): - ('pandas.core.indexes.numeric', 'Float64Index'), + ("pandas.tseries.tdi", "TimedeltaIndex"): ( + "pandas.core.indexes.timedeltas", + "TimedeltaIndex", + ), + ("pandas.indexes.numeric", "Float64Index"): ( + "pandas.core.indexes.numeric", + "Float64Index", + ), } # our Unpickler sub-class to override methods and some dispatcher # functions for compat -class Unpickler(pkl._Unpickler): # type: ignore +class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass key = (module, name) diff --git a/pandas/conftest.py b/pandas/conftest.py index 058361af343b6..29833ab2fc0fa 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -24,68 +24,70 @@ # or `deadline=None` to entirely disable timeouts for that test. deadline=500, timeout=hypothesis.unlimited, - suppress_health_check=(hypothesis.HealthCheck.too_slow,) + suppress_health_check=(hypothesis.HealthCheck.too_slow,), ) hypothesis.settings.load_profile("ci") def pytest_addoption(parser): - parser.addoption("--skip-slow", action="store_true", - help="skip slow tests") - parser.addoption("--skip-network", action="store_true", - help="skip network tests") - parser.addoption("--skip-db", action="store_true", - help="skip db tests") - parser.addoption("--run-high-memory", action="store_true", - help="run high memory tests") - parser.addoption("--only-slow", action="store_true", - help="run only slow tests") - parser.addoption("--strict-data-files", action="store_true", - help="Fail if a test is skipped for missing data file.") + parser.addoption("--skip-slow", action="store_true", help="skip slow tests") + parser.addoption("--skip-network", action="store_true", help="skip network tests") + parser.addoption("--skip-db", action="store_true", help="skip db tests") + parser.addoption( + "--run-high-memory", action="store_true", help="run high memory tests" + ) + parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption( + "--strict-data-files", + action="store_true", + help="Fail if a test is skipped for missing data file.", + ) def pytest_runtest_setup(item): - if 'slow' in item.keywords and item.config.getoption("--skip-slow"): + if "slow" in item.keywords and item.config.getoption("--skip-slow"): pytest.skip("skipping due to --skip-slow") - if 'slow' not in item.keywords and item.config.getoption("--only-slow"): + if "slow" not in item.keywords and item.config.getoption("--only-slow"): pytest.skip("skipping due to --only-slow") - if 'network' in item.keywords and item.config.getoption("--skip-network"): + if "network" in item.keywords and item.config.getoption("--skip-network"): pytest.skip("skipping due to --skip-network") - if 'db' in item.keywords and 
item.config.getoption("--skip-db"): + if "db" in item.keywords and item.config.getoption("--skip-db"): pytest.skip("skipping due to --skip-db") - if 'high_memory' in item.keywords and not item.config.getoption( - "--run-high-memory"): - pytest.skip( - "skipping high memory test since --run-high-memory was not set") + if "high_memory" in item.keywords and not item.config.getoption( + "--run-high-memory" + ): + pytest.skip("skipping high memory test since --run-high-memory was not set") # Configurations for all tests and all test modules + @pytest.fixture(autouse=True) def configure_tests(): - pd.set_option('chained_assignment', 'raise') + pd.set_option("chained_assignment", "raise") # For running doctests: make np and pd names available + @pytest.fixture(autouse=True) def add_imports(doctest_namespace): - doctest_namespace['np'] = np - doctest_namespace['pd'] = pd + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd -@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil']) +@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def spmatrix(request): from scipy import sparse - return getattr(sparse, request.param + '_matrix') + + return getattr(sparse, request.param + "_matrix") -@pytest.fixture(params=[0, 1, 'index', 'columns'], - ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: "axis {!r}".format(x)) def axis(request): """ Fixture for returning the axis numbers of a DataFrame. @@ -96,7 +98,7 @@ def axis(request): axis_frame = axis -@pytest.fixture(params=[0, 'index'], ids=lambda x: "axis {!r}".format(x)) +@pytest.fixture(params=[0, "index"], ids=lambda x: "axis {!r}".format(x)) def axis_series(request): """ Fixture for returning the axis numbers of a Series. @@ -112,8 +114,9 @@ def ip(): Will raise a skip if IPython is not installed. 
""" - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell + return InteractiveShell() @@ -134,13 +137,22 @@ def ordered_fixture(request): return request.param -_all_arithmetic_operators = ['__add__', '__radd__', - '__sub__', '__rsub__', - '__mul__', '__rmul__', - '__floordiv__', '__rfloordiv__', - '__truediv__', '__rtruediv__', - '__pow__', '__rpow__', - '__mod__', '__rmod__'] +_all_arithmetic_operators = [ + "__add__", + "__radd__", + "__sub__", + "__rsub__", + "__mul__", + "__rmul__", + "__floordiv__", + "__rfloordiv__", + "__truediv__", + "__rtruediv__", + "__pow__", + "__rpow__", + "__mod__", + "__rmod__", +] @pytest.fixture(params=_all_arithmetic_operators) @@ -151,9 +163,18 @@ def all_arithmetic_operators(request): return request.param -_all_numeric_reductions = ['sum', 'max', 'min', - 'mean', 'prod', 'std', 'var', 'median', - 'kurt', 'skew'] +_all_numeric_reductions = [ + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", +] @pytest.fixture(params=_all_numeric_reductions) @@ -164,7 +185,7 @@ def all_numeric_reductions(request): return request.param -_all_boolean_reductions = ['all', 'any'] +_all_boolean_reductions = ["all", "any"] @pytest.fixture(params=_all_boolean_reductions) @@ -202,13 +223,15 @@ def _get_cython_table_params(ndframe, func_names_and_expected): results = [] for func_name, expected in func_names_and_expected: results.append((ndframe, func_name, expected)) - results += [(ndframe, func, expected) for func, name in _cython_table - if name == func_name] + results += [ + (ndframe, func, expected) + for func, name in _cython_table + if name == func_name + ] return results -@pytest.fixture(params=['__eq__', '__ne__', '__le__', - '__lt__', '__ge__', '__gt__']) +@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]) def all_compare_operators(request): """ Fixture for dunder names for common compare operations @@ -223,7 +246,7 @@ def all_compare_operators(request): return request.param -@pytest.fixture(params=['__le__', '__lt__', '__ge__', '__gt__']) +@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"]) def compare_operators_no_eq_ne(request): """ Fixture for dunder names for compare operations except == and != @@ -236,7 +259,7 @@ def compare_operators_no_eq_ne(request): return request.param -@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', 'xz']) +@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"]) def compression(request): """ Fixture for trying common compression types in compression tests @@ -244,7 +267,7 @@ def compression(request): return request.param -@pytest.fixture(params=['gzip', 'bz2', 'zip', 'xz']) +@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) def compression_only(request): """ Fixture for trying common compression types in compression tests excluding @@ -261,12 +284,12 @@ def writable(request): return request.param -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def datetime_tz_utc(): return timezone.utc -@pytest.fixture(params=['utc', 'dateutil/UTC', utc, tzutc(), timezone.utc]) +@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc]) def utc_fixture(request): """ Fixture to provide variants of UTC timezone strings and tzinfo objects @@ -274,7 +297,7 @@ def utc_fixture(request): return request.param -@pytest.fixture(params=['inner', 'outer', 'left', 'right']) +@pytest.fixture(params=["inner", "outer", "left", 
"right"]) def join_type(request): """ Fixture for trying all types of join operations @@ -305,7 +328,7 @@ def datapath(strict_data_files): ValueError If the path doesn't exist and the --strict-data-files option is set. """ - BASE_PATH = os.path.join(os.path.dirname(__file__), 'tests') + BASE_PATH = os.path.join(os.path.dirname(__file__), "tests") def deco(*args): path = os.path.join(BASE_PATH, *args) @@ -317,16 +340,17 @@ def deco(*args): msg = "Could not find {}." pytest.skip(msg.format(path)) return path + return deco @pytest.fixture def iris(datapath): """The iris dataset as a DataFrame.""" - return pd.read_csv(datapath('data', 'iris.csv')) + return pd.read_csv(datapath("data", "iris.csv")) -@pytest.fixture(params=['nlargest', 'nsmallest']) +@pytest.fixture(params=["nlargest", "nsmallest"]) def nselect_method(request): """ Fixture for trying all nselect methods @@ -334,7 +358,7 @@ def nselect_method(request): return request.param -@pytest.fixture(params=['left', 'right', 'both', 'neither']) +@pytest.fixture(params=["left", "right", "both", "neither"]) def closed(request): """ Fixture for trying all interval closed parameters @@ -342,7 +366,7 @@ def closed(request): return request.param -@pytest.fixture(params=['left', 'right', 'both', 'neither']) +@pytest.fixture(params=["left", "right", "both", "neither"]) def other_closed(request): """ Secondary closed fixture to allow parametrizing over all pairs of closed @@ -350,7 +374,7 @@ def other_closed(request): return request.param -@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")]) def nulls_fixture(request): """ Fixture for each null type in pandas @@ -373,11 +397,22 @@ def unique_nulls_fixture(request): unique_nulls_fixture2 = unique_nulls_fixture -TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific', - 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300), - FixedOffset(0), FixedOffset(-300), timezone.utc, - timezone(timedelta(hours=1)), - timezone(timedelta(hours=-1), name='foo')] +TIMEZONES = [ + None, + "UTC", + "US/Eastern", + "Asia/Tokyo", + "dateutil/US/Pacific", + "dateutil/Asia/Singapore", + tzutc(), + tzlocal(), + FixedOffset(300), + FixedOffset(0), + FixedOffset(-300), + timezone.utc, + timezone(timedelta(hours=1)), + timezone(timedelta(hours=-1), name="foo"), +] TIMEZONE_IDS = [repr(i) for i in TIMEZONES] @@ -416,19 +451,26 @@ def tz_aware_fixture(request): FLOAT_DTYPES = [float, "float32", "float64"] COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, 'str', 'U'] +STRING_DTYPES = [str, "str", "U"] -DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]'] -TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]'] +DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"] +TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"] -BOOL_DTYPES = [bool, 'bool'] -BYTES_DTYPES = [bytes, 'bytes'] -OBJECT_DTYPES = [object, 'object'] +BOOL_DTYPES = [bool, "bool"] +BYTES_DTYPES = [bytes, "bytes"] +OBJECT_DTYPES = [object, "object"] ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES + - DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES + - OBJECT_DTYPES + BYTES_DTYPES) +ALL_NUMPY_DTYPES = ( + ALL_REAL_DTYPES + + COMPLEX_DTYPES + + STRING_DTYPES + + DATETIME64_DTYPES + + TIMEDELTA64_DTYPES + + BOOL_DTYPES + + OBJECT_DTYPES + + BYTES_DTYPES +) @pytest.fixture(params=STRING_DTYPES) @@ -618,29 +660,29 @@ def any_numpy_dtype(request): # 
categoricals are handled separately _any_skipna_inferred_dtype = [ - ('string', ['a', np.nan, 'c']), - ('bytes', [b'a', np.nan, b'c']), - ('empty', [np.nan, np.nan, np.nan]), - ('empty', []), - ('mixed-integer', ['a', np.nan, 2]), - ('mixed', ['a', np.nan, 2.0]), - ('floating', [1.0, np.nan, 2.0]), - ('integer', [1, np.nan, 2]), - ('mixed-integer-float', [1, np.nan, 2.0]), - ('decimal', [Decimal(1), np.nan, Decimal(2)]), - ('boolean', [True, np.nan, False]), - ('datetime64', [np.datetime64('2013-01-01'), np.nan, - np.datetime64('2018-01-01')]), - ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), - ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), + ("mixed", ["a", np.nan, 2.0]), + ("floating", [1.0, np.nan, 2.0]), + ("integer", [1, np.nan, 2]), + ("mixed-integer-float", [1, np.nan, 2.0]), + ("decimal", [Decimal(1), np.nan, Decimal(2)]), + ("boolean", [True, np.nan, False]), + ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), + ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), + ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), # The following two dtypes are commented out due to GH 23554 # ('complex', [1 + 1j, np.nan, 2 + 2j]), # ('timedelta64', [np.timedelta64(1, 'D'), # np.nan, np.timedelta64(2, 'D')]), - ('timedelta', [timedelta(1), np.nan, timedelta(2)]), - ('time', [time(1), np.nan, time(2)]), - ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), - ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] + ("timedelta", [timedelta(1), np.nan, timedelta(2)]), + ("time", [time(1), np.nan, time(2)]), + ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), +] ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id @@ -692,45 +734,55 @@ def any_skipna_inferred_dtype(request): return inferred_dtype, values -@pytest.fixture(params=[getattr(pd.offsets, o) for o in pd.offsets.__all__ if - issubclass(getattr(pd.offsets, o), pd.offsets.Tick)]) +@pytest.fixture( + params=[ + getattr(pd.offsets, o) + for o in pd.offsets.__all__ + if issubclass(getattr(pd.offsets, o), pd.offsets.Tick) + ] +) def tick_classes(request): """ Fixture for Tick based datetime offsets available for a time series. 
""" return request.param + # ---------------------------------------------------------------- # Global setup for tests using Hypothesis # Registering these strategies makes them globally available via st.from_type, # which is use for offsets in tests/tseries/offsets/test_offsets_properties.py -for name in 'MonthBegin MonthEnd BMonthBegin BMonthEnd'.split(): +for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( - cls, - n=st.integers(-99, 99), - normalize=st.booleans(), - )) + st.register_type_strategy( + cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans()) + ) -for name in 'YearBegin YearEnd BYearBegin BYearEnd'.split(): +for name in "YearBegin YearEnd BYearBegin BYearEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( + st.register_type_strategy( cls, - n=st.integers(-5, 5), - normalize=st.booleans(), - month=st.integers(min_value=1, max_value=12), - )) - -for name in 'QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd'.split(): + st.builds( + cls, + n=st.integers(-5, 5), + normalize=st.booleans(), + month=st.integers(min_value=1, max_value=12), + ), + ) + +for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split(): cls = getattr(pd.tseries.offsets, name) - st.register_type_strategy(cls, st.builds( + st.register_type_strategy( cls, - n=st.integers(-24, 24), - normalize=st.booleans(), - startingMonth=st.integers(min_value=1, max_value=12) - )) + st.builds( + cls, + n=st.integers(-24, 24), + normalize=st.booleans(), + startingMonth=st.integers(min_value=1, max_value=12), + ), + ) @pytest.fixture diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index b092541da93e6..f84033e9c3c90 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -13,7 +13,8 @@ class DirNamesMixin: _accessors = set() # type: Set[str] _deprecations = frozenset( - ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) + ["asobject", "base", "data", "flags", "itemsize", "strides"] + ) def _dir_deletions(self): """ @@ -50,8 +51,7 @@ class PandasDelegate: """ def _delegate_property_get(self, name, *args, **kwargs): - raise TypeError("You cannot access the " - "property {name}".format(name=name)) + raise TypeError("You cannot access the " "property {name}".format(name=name)) def _delegate_property_set(self, name, value, *args, **kwargs): raise TypeError("The property {name} cannot be set".format(name=name)) @@ -60,8 +60,7 @@ def _delegate_method(self, name, *args, **kwargs): raise TypeError("You cannot call method {name}".format(name=name)) @classmethod - def _add_delegate_accessors(cls, delegate, accessors, typ, - overwrite=False): + def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False): """ Add accessors to cls from the delegate class. 
@@ -76,7 +75,6 @@ def _add_delegate_accessors(cls, delegate, accessors, typ, """ def _create_delegator_property(name): - def _getter(self): return self._delegate_property_get(name) @@ -86,11 +84,11 @@ def _setter(self, new_values): _getter.__name__ = name _setter.__name__ = name - return property(fget=_getter, fset=_setter, - doc=getattr(delegate, name).__doc__) + return property( + fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__ + ) def _create_delegator_method(name): - def f(self, *args, **kwargs): return self._delegate_method(name, *args, **kwargs) @@ -101,7 +99,7 @@ def f(self, *args, **kwargs): for name in accessors: - if typ == 'property': + if typ == "property": f = _create_delegator_property(name) else: f = _create_delegator_method(name) @@ -138,9 +136,9 @@ def delegate_names(delegate, accessors, typ, overwrite=False): class CategoricalAccessor(PandasDelegate): [...] """ + def add_delegate_accessors(cls): - cls._add_delegate_accessors(delegate, accessors, typ, - overwrite=overwrite) + cls._add_delegate_accessors(delegate, accessors, typ, overwrite=overwrite) return cls return add_delegate_accessors @@ -151,6 +149,7 @@ def add_delegate_accessors(cls): # 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors # 2. We use a UserWarning instead of a custom Warning + class CachedAccessor: """ Custom property-like object (descriptor) for caching accessors. @@ -164,6 +163,7 @@ class CachedAccessor: should expect one of a ``Series``, ``DataFrame`` or ``Index`` as the single argument ``data`` """ + def __init__(self, name, accessor): self._name = name self._accessor = accessor @@ -185,14 +185,16 @@ def _register_accessor(name, cls): def decorator(accessor): if hasattr(cls, name): warnings.warn( - 'registration of accessor {!r} under name {!r} for type ' - '{!r} is overriding a preexisting attribute with the same ' - 'name.'.format(accessor, name, cls), + "registration of accessor {!r} under name {!r} for type " + "{!r} is overriding a preexisting attribute with the same " + "name.".format(accessor, name, cls), UserWarning, - stacklevel=2) + stacklevel=2, + ) setattr(cls, name, CachedAccessor(name, accessor)) cls._accessors.add(name) return accessor + return decorator @@ -266,25 +268,40 @@ def plot(self): """ -@Appender(_doc % dict(klass="DataFrame", - others=("register_series_accessor, " - "register_index_accessor"))) +@Appender( + _doc + % dict( + klass="DataFrame", + others=("register_series_accessor, " "register_index_accessor"), + ) +) def register_dataframe_accessor(name): from pandas import DataFrame + return _register_accessor(name, DataFrame) -@Appender(_doc % dict(klass="Series", - others=("register_dataframe_accessor, " - "register_index_accessor"))) +@Appender( + _doc + % dict( + klass="Series", + others=("register_dataframe_accessor, " "register_index_accessor"), + ) +) def register_series_accessor(name): from pandas import Series + return _register_accessor(name, Series) -@Appender(_doc % dict(klass="Index", - others=("register_dataframe_accessor, " - "register_series_accessor"))) +@Appender( + _doc + % dict( + klass="Index", + others=("register_dataframe_accessor, " "register_series_accessor"), + ) +) def register_index_accessor(name): from pandas import Index + return _register_accessor(name, Index) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4e84d7b26b707..79f205de11878 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -13,16 +13,39 @@ from pandas.util._decorators import Appender, 
Substitution, deprecate_kwarg from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, maybe_promote) + construct_1d_object_array_from_listlike, + maybe_promote, +) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_object, ensure_platform_int, - ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, - is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, - is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, - is_unsigned_integer_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_object, + ensure_platform_int, + ensure_uint64, + is_array_like, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_extension_array_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -62,20 +85,19 @@ def _ensure_data(values, dtype=None): # we check some simple dtypes first try: if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) - return np.asarray(values).astype('uint64'), 'bool', 'uint64' + return np.asarray(values).astype("uint64"), "bool", "uint64" elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype): - return ensure_int64(values), 'int64', 'int64' - elif (is_unsigned_integer_dtype(values) or - is_unsigned_integer_dtype(dtype)): - return ensure_uint64(values), 'uint64', 'uint64' + return ensure_int64(values), "int64", "int64" + elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype): + return ensure_uint64(values), "uint64", "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): - return ensure_float64(values), 'float64', 'float64' + return ensure_float64(values), "float64", "float64" elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), 'object', 'object' + return ensure_object(np.asarray(values)), "object", "object" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -83,24 +105,28 @@ def _ensure_data(values, dtype=None): with catch_warnings(): simplefilter("ignore", np.ComplexWarning) values = ensure_float64(values) - return values, 'float64', 'float64' + return values, "float64", "float64" except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" # datetimelike - if (needs_i8_conversion(values) or - is_period_dtype(dtype) or - is_datetime64_any_dtype(dtype) or - is_timedelta64_dtype(dtype)): + if ( + needs_i8_conversion(values) + or is_period_dtype(dtype) + or 
is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): if is_period_dtype(values) or is_period_dtype(dtype): from pandas import PeriodIndex + values = PeriodIndex(values) dtype = values.dtype elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex + values = TimedeltaIndex(values) dtype = values.dtype else: @@ -108,31 +134,33 @@ def _ensure_data(values, dtype=None): if values.ndim > 1 and is_datetime64_ns_dtype(values): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 - asi8 = values.view('i8') + asi8 = values.view("i8") dtype = values.dtype - return asi8, dtype, 'int64' + return asi8, dtype, "int64" from pandas import DatetimeIndex + values = DatetimeIndex(values) dtype = values.dtype - return values.asi8, dtype, 'int64' + return values.asi8, dtype, "int64" - elif (is_categorical_dtype(values) and - (is_categorical_dtype(dtype) or dtype is None)): - values = getattr(values, 'values', values) + elif is_categorical_dtype(values) and ( + is_categorical_dtype(dtype) or dtype is None + ): + values = getattr(values, "values", values) values = values.codes - dtype = 'category' + dtype = "category" # we are actually coercing to int64 # until our algos support int* directly (not all do) values = ensure_int64(values) - return values, dtype, 'int64' + return values, dtype, "int64" # we have failed, return object values = np.asarray(values, dtype=np.object) - return ensure_object(values), 'object', 'object' + return ensure_object(values), "object", "object" def _reconstruct_data(values, dtype, original): @@ -150,6 +178,7 @@ def _reconstruct_data(values, dtype, original): Index for extension types, otherwise ndarray casted to dtype """ from pandas import Index + if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): @@ -172,7 +201,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ['mixed', 'string', 'unicode']: + if inferred in ["mixed", "string", "unicode"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -182,11 +211,11 @@ def _ensure_arraylike(values): _hashtables = { - 'float64': (htable.Float64HashTable, htable.Float64Vector), - 'uint64': (htable.UInt64HashTable, htable.UInt64Vector), - 'int64': (htable.Int64HashTable, htable.Int64Vector), - 'string': (htable.StringHashTable, htable.ObjectVector), - 'object': (htable.PyObjectHashTable, htable.ObjectVector) + "float64": (htable.Float64HashTable, htable.Float64Vector), + "uint64": (htable.UInt64HashTable, htable.UInt64Vector), + "int64": (htable.Int64HashTable, htable.Int64Vector), + "string": (htable.StringHashTable, htable.ObjectVector), + "object": (htable.PyObjectHashTable, htable.ObjectVector), } @@ -206,15 +235,15 @@ def _get_hashtable_algo(values): """ values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" else: - ndtype = 'object' + ndtype = "object" htable, table = _hashtables[ndtype] return (htable, table, 
values, dtype, ndtype) @@ -226,15 +255,15 @@ def _get_data_algo(values, func_map): values = values._values_for_rank() values, dtype, ndtype = _ensure_data(values) - if ndtype == 'object': + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer # including nulls because that is the only difference between # StringHashTable and ObjectHashtable - if lib.infer_dtype(values, skipna=False) in ['string']: - ndtype = 'string' + if lib.infer_dtype(values, skipna=False) in ["string"]: + ndtype = "string" - f = func_map.get(ndtype, func_map['object']) + f = func_map.get(ndtype, func_map["object"]) return f, values @@ -243,6 +272,7 @@ def _get_data_algo(values, func_map): # top-level algos # # --------------- # + def match(to_match, values, na_sentinel=-1): """ Compute locations of to_match into values @@ -275,6 +305,7 @@ def match(to_match, values, na_sentinel=-1): # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas import Series + result = Series(result.ravel()).replace(-1, na_sentinel) result = result.values.reshape(result.shape) @@ -393,13 +424,19 @@ def isin(comps, values): """ if not is_list_like(comps): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{comps_type}]" - .format(comps_type=type(comps).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{comps_type}]".format( + comps_type=type(comps).__name__ + ) + ) if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format( + values_type=type(values).__name__ + ) + ) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) @@ -423,8 +460,8 @@ def isin(comps, values): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: - values = values.astype('int64', copy=False) - comps = comps.astype('int64', copy=False) + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) f = lambda x, y: htable.ismember_int64(x, y) except (TypeError, ValueError, OverflowError): values = values.astype(object) @@ -432,8 +469,8 @@ def isin(comps, values): elif is_float_dtype(comps): try: - values = values.astype('float64', copy=False) - comps = comps.astype('float64', copy=False) + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) f = lambda x, y: htable.ismember_float64(x, y) except (TypeError, ValueError): values = values.astype(object) @@ -442,8 +479,7 @@ def isin(comps, values): return f(comps, values) -def _factorize_array(values, na_sentinel=-1, size_hint=None, - na_value=None): +def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): """Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. 
@@ -467,14 +503,17 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - uniques, labels = table.factorize(values, na_sentinel=na_sentinel, - na_value=na_value) + uniques, labels = table.factorize( + values, na_sentinel=na_sentinel, na_value=na_value + ) labels = ensure_platform_int(labels) return labels, uniques -_shared_docs['factorize'] = """ +_shared_docs[ + "factorize" +] = """ Encode the object as an enumerated type or categorical variable. This method is useful for obtaining a numeric representation of an @@ -568,29 +607,37 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, @Substitution( - values=dedent("""\ + values=dedent( + """\ values : sequence A 1-D sequence. Sequences that aren't pandas objects are coerced to ndarrays before factorization. - """), - order=dedent("""\ + """ + ), + order=dedent( + """\ order : None .. deprecated:: 0.23.0 This parameter has no effect and is deprecated. - """), - sort=dedent("""\ + """ + ), + sort=dedent( + """\ sort : bool, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """), - size_hint=dedent("""\ + """ + ), + size_hint=dedent( + """\ size_hint : int, optional Hint to the hashtable sizer. - """), + """ + ), ) -@Appender(_shared_docs['factorize']) -@deprecate_kwarg(old_arg_name='order', new_arg_name=None) +@Appender(_shared_docs["factorize"]) +@deprecate_kwarg(old_arg_name="order", new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -605,28 +652,31 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): original = values if is_extension_array_dtype(values): - values = getattr(values, '_values', values) + values = getattr(values, "_values", values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - if (is_datetime64_any_dtype(original) or - is_timedelta64_dtype(original) or - is_period_dtype(original)): + if ( + is_datetime64_any_dtype(original) + or is_timedelta64_dtype(original) + or is_period_dtype(original) + ): na_value = na_value_for_dtype(original.dtype) else: na_value = None - labels, uniques = _factorize_array(values, - na_sentinel=na_sentinel, - size_hint=size_hint, - na_value=na_value) + labels, uniques = _factorize_array( + values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value + ) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, - assume_unique=True, verify=False) + + uniques, labels = safe_sort( + uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False + ) uniques = _reconstruct_data(uniques, dtype, original) @@ -635,13 +685,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index + uniques = Index(uniques) return labels, uniques -def value_counts(values, sort=True, ascending=False, normalize=False, - bins=None, dropna=True): +def value_counts( + values, sort=True, ascending=False, normalize=False, bins=None, dropna=True +): """ Compute a histogram of the counts of non-null values. 
@@ -666,11 +718,13 @@ def value_counts(values, sort=True, ascending=False, normalize=False, """ from pandas.core.series import Series, Index - name = getattr(values, 'name', None) + + name = getattr(values, "name", None) if bins is not None: try: from pandas.core.reshape.tile import cut + values = Series(values) ii = cut(values, bins, include_lowest=True) except TypeError: @@ -679,7 +733,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # count, remove nulls (from the index), and but the bins result = ii.value_counts(dropna=dropna) result = result[result.index.notna()] - result.index = result.index.astype('interval') + result.index = result.index.astype("interval") result = result.sort_index() # if we are dropna and we have NO values @@ -757,7 +811,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep="first"): """ Return boolean ndarray denoting duplicate values. @@ -829,8 +883,7 @@ def mode(values, dropna=True): return Series(result) -def rank(values, axis=0, method='average', na_option='keep', - ascending=True, pct=False): +def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False): """ Rank the values along a given axis. @@ -856,12 +909,23 @@ def rank(values, axis=0, method='average', na_option='keep', """ if values.ndim == 1: f, values = _get_data_algo(values, _rank1d_functions) - ranks = f(values, ties_method=method, ascending=ascending, - na_option=na_option, pct=pct) + ranks = f( + values, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) elif values.ndim == 2: f, values = _get_data_algo(values, _rank2d_functions) - ranks = f(values, axis=axis, ties_method=method, - ascending=ascending, na_option=na_option, pct=pct) + ranks = f( + values, + axis=axis, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) else: raise TypeError("Array with ndim > 2 are not supported.") @@ -932,10 +996,12 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): elif not mask2.any(): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: - to_raise = (((np.iinfo(np.int64).max - - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or - ((np.iinfo(np.int64).min - - b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + to_raise = ( + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) if to_raise: raise OverflowError("Overflow in int64 addition") @@ -943,21 +1009,21 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): _rank1d_functions = { - 'float64': algos.rank_1d_float64, - 'int64': algos.rank_1d_int64, - 'uint64': algos.rank_1d_uint64, - 'object': algos.rank_1d_object + "float64": algos.rank_1d_float64, + "int64": algos.rank_1d_int64, + "uint64": algos.rank_1d_uint64, + "object": algos.rank_1d_object, } _rank2d_functions = { - 'float64': algos.rank_2d_float64, - 'int64': algos.rank_2d_int64, - 'uint64': algos.rank_2d_uint64, - 'object': algos.rank_2d_object + "float64": algos.rank_2d_float64, + "int64": algos.rank_2d_int64, + "uint64": algos.rank_2d_uint64, + "object": algos.rank_2d_object, } -def quantile(x, q, interpolation_method='fraction'): +def quantile(x, q, interpolation_method="fraction"): """ Compute sample quantile or quantiles of the input array. For example, q=0.5 computes the median. 
@@ -1017,16 +1083,17 @@ def _get_score(at): if idx % 1 == 0: score = values[int(idx)] else: - if interpolation_method == 'fraction': - score = _interpolate(values[int(idx)], values[int(idx) + 1], - idx % 1) - elif interpolation_method == 'lower': + if interpolation_method == "fraction": + score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1) + elif interpolation_method == "lower": score = values[np.floor(idx)] - elif interpolation_method == 'higher': + elif interpolation_method == "higher": score = values[np.ceil(idx)] else: - raise ValueError("interpolation_method can only be 'fraction' " - ", 'lower' or 'higher'") + raise ValueError( + "interpolation_method can only be 'fraction' " + ", 'lower' or 'higher'" + ) return score @@ -1041,21 +1108,21 @@ def _get_score(at): # select n # # --------------- # -class SelectN: +class SelectN: def __init__(self, obj, n, keep): self.obj = obj self.n = n self.keep = keep - if self.keep not in ('first', 'last', 'all'): + if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') def nlargest(self): - return self.compute('nlargest') + return self.compute("nlargest") def nsmallest(self): - return self.compute('nsmallest') + return self.compute("nsmallest") @staticmethod def is_valid_dtype_n_method(dtype): @@ -1063,8 +1130,9 @@ def is_valid_dtype_n_method(dtype): Helper function to determine if dtype is valid for nsmallest/nlargest methods """ - return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or - needs_i8_conversion(dtype)) + return ( + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) class SelectNSeries(SelectN): @@ -1087,9 +1155,10 @@ def compute(self, method): n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError("Cannot use method '{method}' with " - "dtype {dtype}".format(method=method, - dtype=dtype)) + raise TypeError( + "Cannot use method '{method}' with " + "dtype {dtype}".format(method=method, dtype=dtype) + ) if n <= 0: return self.obj[[]] @@ -1099,14 +1168,14 @@ def compute(self, method): # slow method if n >= len(self.obj): - reverse_it = (self.keep == 'last' or method == 'nlargest') - ascending = method == 'nsmallest' + reverse_it = self.keep == "last" or method == "nlargest" + ascending = method == "nsmallest" slc = np.s_[::-1] if reverse_it else np.s_[:] return dropped[slc].sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype, _ = _ensure_data(dropped.values) - if method == 'nlargest': + if method == "nlargest": arr = -arr if is_integer_dtype(pandas_dtype): # GH 21426: ensure reverse ordering at boundaries @@ -1116,7 +1185,7 @@ def compute(self, method): # GH 26154: ensure False is smaller than True arr = 1 - (-arr) - if self.keep == 'last': + if self.keep == "last": arr = arr[::-1] narr = len(arr) @@ -1124,12 +1193,12 @@ def compute(self, method): kth_val = algos.kth_smallest(arr.copy(), n - 1) ns, = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind='mergesort')] + inds = ns[arr[ns].argsort(kind="mergesort")] - if self.keep != 'all': + if self.keep != "all": inds = inds[:n] - if self.keep == 'last': + if self.keep == "last": # reverse indices inds = narr - 1 - inds @@ -1162,6 +1231,7 @@ def __init__(self, obj, n, keep, columns): def compute(self, method): from pandas import Int64Index + n = self.n frame = self.obj columns = self.columns @@ -1169,16 +1239,18 @@ def compute(self, method): for column in columns: dtype = frame[column].dtype if not 
self.is_valid_dtype_n_method(dtype): - raise TypeError(( - "Column {column!r} has dtype {dtype}, cannot use method " - "{method!r} with this dtype" - ).format(column=column, dtype=dtype, method=method)) + raise TypeError( + ( + "Column {column!r} has dtype {dtype}, cannot use method " + "{method!r} with this dtype" + ).format(column=column, dtype=dtype, method=method) + ) def get_indexer(current_indexer, other_indexer): """Helper function to concat `current_indexer` and `other_indexer` depending on `method` """ - if method == 'nsmallest': + if method == "nsmallest": return current_indexer.append(other_indexer) else: return other_indexer.append(current_indexer) @@ -1200,8 +1272,8 @@ def get_indexer(current_indexer, other_indexer): series = cur_frame[column] is_last_column = len(columns) - 1 == i values = getattr(series, method)( - cur_n, - keep=self.keep if is_last_column else 'all') + cur_n, keep=self.keep if is_last_column else "all" + ) if is_last_column or len(values) <= cur_n: indexer = get_indexer(indexer, values.index) @@ -1234,12 +1306,9 @@ def get_indexer(current_indexer, other_indexer): if len(columns) == 1: return frame - ascending = method == 'nsmallest' + ascending = method == "nsmallest" - return frame.sort_values( - columns, - ascending=ascending, - kind='mergesort') + return frame.sort_values(columns, ascending=ascending, kind="mergesort") # ------- ## ---- # @@ -1308,110 +1377,103 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info): _take_1d_dict = { - ('int8', 'int8'): algos.take_1d_int8_int8, - ('int8', 'int32'): algos.take_1d_int8_int32, - ('int8', 'int64'): algos.take_1d_int8_int64, - ('int8', 'float64'): algos.take_1d_int8_float64, - ('int16', 'int16'): algos.take_1d_int16_int16, - ('int16', 'int32'): algos.take_1d_int16_int32, - ('int16', 'int64'): algos.take_1d_int16_int64, - ('int16', 'float64'): algos.take_1d_int16_float64, - ('int32', 'int32'): algos.take_1d_int32_int32, - ('int32', 'int64'): algos.take_1d_int32_int64, - ('int32', 'float64'): algos.take_1d_int32_float64, - ('int64', 'int64'): algos.take_1d_int64_int64, - ('int64', 'float64'): algos.take_1d_int64_float64, - ('float32', 'float32'): algos.take_1d_float32_float32, - ('float32', 'float64'): algos.take_1d_float32_float64, - ('float64', 'float64'): algos.take_1d_float64_float64, - ('object', 'object'): algos.take_1d_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8, - None), - ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper( - algos.take_1d_int64_int64, np.int64, np.int64, np.int64) + ("int8", "int8"): algos.take_1d_int8_int8, + ("int8", "int32"): algos.take_1d_int8_int32, + ("int8", "int64"): algos.take_1d_int8_int64, + ("int8", "float64"): algos.take_1d_int8_float64, + ("int16", "int16"): algos.take_1d_int16_int16, + ("int16", "int32"): algos.take_1d_int16_int32, + ("int16", "int64"): algos.take_1d_int16_int64, + ("int16", "float64"): algos.take_1d_int16_float64, + ("int32", "int32"): algos.take_1d_int32_int32, + ("int32", "int64"): algos.take_1d_int32_int64, + ("int32", "float64"): algos.take_1d_int32_float64, + ("int64", "int64"): algos.take_1d_int64_int64, + ("int64", "float64"): algos.take_1d_int64_float64, + ("float32", "float32"): algos.take_1d_float32_float32, + ("float32", "float64"): algos.take_1d_float32_float64, + ("float64", "float64"): algos.take_1d_float64_float64, + ("object", "object"): algos.take_1d_object_object, + ("bool", "bool"): 
_view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), } _take_2d_axis0_dict = { - ('int8', 'int8'): algos.take_2d_axis0_int8_int8, - ('int8', 'int32'): algos.take_2d_axis0_int8_int32, - ('int8', 'int64'): algos.take_2d_axis0_int8_int64, - ('int8', 'float64'): algos.take_2d_axis0_int8_float64, - ('int16', 'int16'): algos.take_2d_axis0_int16_int16, - ('int16', 'int32'): algos.take_2d_axis0_int16_int32, - ('int16', 'int64'): algos.take_2d_axis0_int16_int64, - ('int16', 'float64'): algos.take_2d_axis0_int16_float64, - ('int32', 'int32'): algos.take_2d_axis0_int32_int32, - ('int32', 'int64'): algos.take_2d_axis0_int32_int64, - ('int32', 'float64'): algos.take_2d_axis0_int32_float64, - ('int64', 'int64'): algos.take_2d_axis0_int64_int64, - ('int64', 'float64'): algos.take_2d_axis0_int64_float64, - ('float32', 'float32'): algos.take_2d_axis0_float32_float32, - ('float32', 'float64'): algos.take_2d_axis0_float32_float64, - ('float64', 'float64'): algos.take_2d_axis0_float64_float64, - ('object', 'object'): algos.take_2d_axis0_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis0_int8_int8, + ("int8", "int32"): algos.take_2d_axis0_int8_int32, + ("int8", "int64"): algos.take_2d_axis0_int8_int64, + ("int8", "float64"): algos.take_2d_axis0_int8_float64, + ("int16", "int16"): algos.take_2d_axis0_int16_int16, + ("int16", "int32"): algos.take_2d_axis0_int16_int32, + ("int16", "int64"): algos.take_2d_axis0_int16_int64, + ("int16", "float64"): algos.take_2d_axis0_int16_float64, + ("int32", "int32"): algos.take_2d_axis0_int32_int32, + ("int32", "int64"): algos.take_2d_axis0_int32_int64, + ("int32", "float64"): algos.take_2d_axis0_int32_float64, + ("int64", "int64"): algos.take_2d_axis0_int64_int64, + ("int64", "float64"): algos.take_2d_axis0_int64_float64, + ("float32", "float32"): algos.take_2d_axis0_float32_float32, + ("float32", "float64"): algos.take_2d_axis0_float32_float64, + ("float64", "float64"): algos.take_2d_axis0_float64_float64, + ("object", "object"): algos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_axis1_dict = { - ('int8', 'int8'): algos.take_2d_axis1_int8_int8, - ('int8', 'int32'): algos.take_2d_axis1_int8_int32, - ('int8', 'int64'): algos.take_2d_axis1_int8_int64, - ('int8', 'float64'): algos.take_2d_axis1_int8_float64, - ('int16', 'int16'): algos.take_2d_axis1_int16_int16, - ('int16', 'int32'): algos.take_2d_axis1_int16_int32, - ('int16', 'int64'): algos.take_2d_axis1_int16_int64, - ('int16', 'float64'): algos.take_2d_axis1_int16_float64, - ('int32', 'int32'): algos.take_2d_axis1_int32_int32, - ('int32', 'int64'): algos.take_2d_axis1_int32_int64, - ('int32', 'float64'): algos.take_2d_axis1_int32_float64, - ('int64', 'int64'): algos.take_2d_axis1_int64_int64, - ('int64', 'float64'): 
algos.take_2d_axis1_int64_float64, - ('float32', 'float32'): algos.take_2d_axis1_float32_float32, - ('float32', 'float64'): algos.take_2d_axis1_float32_float64, - ('float64', 'float64'): algos.take_2d_axis1_float64_float64, - ('object', 'object'): algos.take_2d_axis1_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_axis1_int8_int8, + ("int8", "int32"): algos.take_2d_axis1_int8_int32, + ("int8", "int64"): algos.take_2d_axis1_int8_int64, + ("int8", "float64"): algos.take_2d_axis1_int8_float64, + ("int16", "int16"): algos.take_2d_axis1_int16_int16, + ("int16", "int32"): algos.take_2d_axis1_int16_int32, + ("int16", "int64"): algos.take_2d_axis1_int16_int64, + ("int16", "float64"): algos.take_2d_axis1_int16_float64, + ("int32", "int32"): algos.take_2d_axis1_int32_int32, + ("int32", "int64"): algos.take_2d_axis1_int32_int64, + ("int32", "float64"): algos.take_2d_axis1_int32_float64, + ("int64", "int64"): algos.take_2d_axis1_int64_int64, + ("int64", "float64"): algos.take_2d_axis1_int64_float64, + ("float32", "float32"): algos.take_2d_axis1_float32_float32, + ("float32", "float64"): algos.take_2d_axis1_float32_float64, + ("float64", "float64"): algos.take_2d_axis1_float64_float64, + ("object", "object"): algos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } _take_2d_multi_dict = { - ('int8', 'int8'): algos.take_2d_multi_int8_int8, - ('int8', 'int32'): algos.take_2d_multi_int8_int32, - ('int8', 'int64'): algos.take_2d_multi_int8_int64, - ('int8', 'float64'): algos.take_2d_multi_int8_float64, - ('int16', 'int16'): algos.take_2d_multi_int16_int16, - ('int16', 'int32'): algos.take_2d_multi_int16_int32, - ('int16', 'int64'): algos.take_2d_multi_int16_int64, - ('int16', 'float64'): algos.take_2d_multi_int16_float64, - ('int32', 'int32'): algos.take_2d_multi_int32_int32, - ('int32', 'int64'): algos.take_2d_multi_int32_int64, - ('int32', 'float64'): algos.take_2d_multi_int32_float64, - ('int64', 'int64'): algos.take_2d_multi_int64_int64, - ('int64', 'float64'): algos.take_2d_multi_int64_float64, - ('float32', 'float32'): algos.take_2d_multi_float32_float32, - ('float32', 'float64'): algos.take_2d_multi_float32_float64, - ('float64', 'float64'): algos.take_2d_multi_float64_float64, - ('object', 'object'): algos.take_2d_multi_object_object, - ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, - np.uint8), - ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object, - np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, - fill_wrap=np.int64) + ("int8", "int8"): algos.take_2d_multi_int8_int8, + ("int8", "int32"): algos.take_2d_multi_int8_int32, + ("int8", "int64"): algos.take_2d_multi_int8_int64, + ("int8", "float64"): algos.take_2d_multi_int8_float64, + ("int16", "int16"): algos.take_2d_multi_int16_int16, + ("int16", "int32"): algos.take_2d_multi_int16_int32, + ("int16", "int64"): algos.take_2d_multi_int16_int64, + 
("int16", "float64"): algos.take_2d_multi_int16_float64, + ("int32", "int32"): algos.take_2d_multi_int32_int32, + ("int32", "int64"): algos.take_2d_multi_int32_int64, + ("int32", "float64"): algos.take_2d_multi_int32_float64, + ("int64", "int64"): algos.take_2d_multi_int64_int64, + ("int64", "float64"): algos.take_2d_multi_int64_float64, + ("float32", "float32"): algos.take_2d_multi_float32_float32, + ("float32", "float64"): algos.take_2d_multi_float32_float64, + ("float64", "float64"): algos.take_2d_multi_float64_float64, + ("object", "object"): algos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), } @@ -1442,8 +1504,9 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = ensure_int64(indexer) - _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value, - mask_info=mask_info) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) return func @@ -1534,16 +1597,18 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): if allow_fill: # Pandas style, -1 means NA validate_indices(indices, arr.shape[axis]) - result = take_1d(arr, indices, axis=axis, allow_fill=True, - fill_value=fill_value) + result = take_1d( + arr, indices, axis=axis, allow_fill=True, fill_value=fill_value + ) else: # NumPy style result = arr.take(indices, axis=axis) return result -def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_nd( + arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass @@ -1618,7 +1683,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1647,12 +1712,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # for dataframes initialized directly from 2-d ndarrays # (s.t. 
df.values is c-contiguous and df._data.blocks[0] is its # f-contiguous transpose) - out = np.empty(out_shape, dtype=dtype, order='F') + out = np.empty(out_shape, dtype=dtype, order="F") else: out = np.empty(out_shape, dtype=dtype) - func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, - mask_info=mask_info) + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) func(arr, indexer, out, fill_value) if flip_order: @@ -1663,8 +1729,9 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, take_1d = take_nd -def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, - allow_fill=True): +def take_2d_multi( + arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True +): """ Specialized Cython take which sets NaN values in one pass """ @@ -1703,7 +1770,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, mask_info = (row_mask, col_mask), (row_needs, col_needs) if row_needs or col_needs: if out is not None and out.dtype != dtype: - raise TypeError('Incompatible type for fill_value') + raise TypeError("Incompatible type for fill_value") else: # if not, then depromote, set fill_value to dummy # (it won't be used but we don't want the cython code @@ -1724,8 +1791,9 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if func is None: def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_object(arr, indexer, out, fill_value=fill_value, - mask_info=mask_info) + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) func(arr, indexer, out=out, fill_value=fill_value) return out @@ -1735,6 +1803,7 @@ def func(arr, indexer, out, fill_value=np.nan): # searchsorted # # ------------ # + def searchsorted(arr, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -1782,9 +1851,13 @@ def searchsorted(arr, value, side="left", sorter=None): if sorter is not None: sorter = ensure_platform_int(sorter) - if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( - is_integer(value) or is_integer_dtype(value)): + if ( + isinstance(arr, np.ndarray) + and is_integer_dtype(arr) + and (is_integer(value) or is_integer_dtype(value)) + ): from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be # recast by numpy, causing a slow search. # Before searching below, we therefore try to give `value` the @@ -1802,9 +1875,11 @@ def searchsorted(arr, value, side="left", sorter=None): value = dtype.type(value) else: value = array(value, dtype=dtype) - elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or - is_categorical_dtype(arr)): + elif not ( + is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) + ): from pandas.core.series import Series + # E.g. 
if `arr` is an array with dtype='datetime64[ns]' # and `value` is a pd.Timestamp, we may need to convert value value_ser = Series(value)._values @@ -1819,12 +1894,12 @@ def searchsorted(arr, value, side="left", sorter=None): # ---- # _diff_special = { - 'float64': algos.diff_2d_float64, - 'float32': algos.diff_2d_float32, - 'int64': algos.diff_2d_int64, - 'int32': algos.diff_2d_int32, - 'int16': algos.diff_2d_int16, - 'int8': algos.diff_2d_int8, + "float64": algos.diff_2d_float64, + "float32": algos.diff_2d_float32, + "int64": algos.diff_2d_int64, + "int32": algos.diff_2d_int32, + "int16": algos.diff_2d_int16, + "int8": algos.diff_2d_int8, } @@ -1854,7 +1929,7 @@ def diff(arr, n, axis=0): is_timedelta = False if needs_i8_conversion(arr): dtype = np.float64 - arr = arr.view('i8') + arr = arr.view("i8") na = iNaT is_timedelta = True @@ -1904,7 +1979,11 @@ def diff(arr, n, axis=0): if is_timedelta: from pandas import TimedeltaIndex - out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape( - out_arr.shape).astype('timedelta64[ns]') + + out_arr = ( + TimedeltaIndex(out_arr.ravel().astype("int64")) + .asi8.reshape(out_arr.shape) + .astype("timedelta64[ns]") + ) return out_arr diff --git a/pandas/core/api.py b/pandas/core/api.py index e8d21080775da..f3ea0976a2869 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -23,11 +23,20 @@ from pandas.core.arrays import Categorical, array from pandas.core.groupby import Grouper, NamedAgg from pandas.io.formats.format import set_eng_float_format -from pandas.core.index import (Index, CategoricalIndex, Int64Index, - UInt64Index, RangeIndex, Float64Index, - MultiIndex, IntervalIndex, - TimedeltaIndex, DatetimeIndex, - PeriodIndex, NaT) +from pandas.core.index import ( + Index, + CategoricalIndex, + Int64Index, + UInt64Index, + RangeIndex, + Float64Index, + MultiIndex, + IntervalIndex, + TimedeltaIndex, + DatetimeIndex, + PeriodIndex, + NaT, +) from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7dc054c824fec..2246bbfde636d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -7,16 +7,28 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_dict_like, is_extension_type, is_list_like, is_sequence) + is_dict_like, + is_extension_type, + is_list_like, + is_sequence, +) from pandas.core.dtypes.generic import ABCSeries from pandas.io.formats.printing import pprint_thing -def frame_apply(obj, func, axis=0, broadcast=None, - raw=False, reduce=None, result_type=None, - ignore_failures=False, - args=None, kwds=None): +def frame_apply( + obj, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + ignore_failures=False, + args=None, + kwds=None, +): """ construct and return a row or column based frame apply object """ axis = obj._get_axis_number(axis) @@ -25,48 +37,71 @@ def frame_apply(obj, func, axis=0, broadcast=None, elif axis == 1: klass = FrameColumnApply - return klass(obj, func, broadcast=broadcast, - raw=raw, reduce=reduce, result_type=result_type, - ignore_failures=ignore_failures, - args=args, kwds=kwds) + return klass( + obj, + func, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + ignore_failures=ignore_failures, + args=args, + kwds=kwds, + ) class FrameApply: - - def __init__(self, obj, func, broadcast, 
raw, reduce, result_type, - ignore_failures, args, kwds): + def __init__( + self, + obj, + func, + broadcast, + raw, + reduce, + result_type, + ignore_failures, + args, + kwds, + ): self.obj = obj self.raw = raw self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} - if result_type not in [None, 'reduce', 'broadcast', 'expand']: - raise ValueError("invalid value for result_type, must be one " - "of {None, 'reduce', 'broadcast', 'expand'}") + if result_type not in [None, "reduce", "broadcast", "expand"]: + raise ValueError( + "invalid value for result_type, must be one " + "of {None, 'reduce', 'broadcast', 'expand'}" + ) if broadcast is not None: - warnings.warn("The broadcast argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='broadcast' to broadcast the result " - "to the original dimensions", - FutureWarning, stacklevel=4) + warnings.warn( + "The broadcast argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='broadcast' to broadcast the result " + "to the original dimensions", + FutureWarning, + stacklevel=4, + ) if broadcast: - result_type = 'broadcast' + result_type = "broadcast" if reduce is not None: - warnings.warn("The reduce argument is deprecated and will " - "be removed in a future version. You can specify " - "result_type='reduce' to try to reduce the result " - "to the original dimensions", - FutureWarning, stacklevel=4) + warnings.warn( + "The reduce argument is deprecated and will " + "be removed in a future version. You can specify " + "result_type='reduce' to try to reduce the result " + "to the original dimensions", + FutureWarning, + stacklevel=4, + ) if reduce: if result_type is not None: - raise ValueError( - "cannot pass both reduce=True and result_type") + raise ValueError("cannot pass both reduce=True and result_type") - result_type = 'reduce' + result_type = "reduce" self.result_type = result_type @@ -75,6 +110,7 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type, def f(x): return func(x, *args, **kwds) + else: f = func @@ -110,8 +146,7 @@ def get_result(self): # dispatch to agg if is_list_like(self.f) or is_dict_like(self.f): - return self.obj.aggregate(self.f, axis=self.axis, - *self.args, **self.kwds) + return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds) # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -124,19 +159,20 @@ def get_result(self): # don't, so inspect and insert if necessary. 
func = getattr(self.obj, self.f) sig = inspect.getfullargspec(func) - if 'axis' in sig.args: - self.kwds['axis'] = self.axis + if "axis" in sig.args: + self.kwds["axis"] = self.axis return func(*self.args, **self.kwds) # ufunc elif isinstance(self.f, np.ufunc): - with np.errstate(all='ignore'): - results = self.obj._data.apply('apply', func=self.f) - return self.obj._constructor(data=results, index=self.index, - columns=self.columns, copy=False) + with np.errstate(all="ignore"): + results = self.obj._data.apply("apply", func=self.f) + return self.obj._constructor( + data=results, index=self.index, columns=self.columns, copy=False + ) # broadcasting - if self.result_type == 'broadcast': + if self.result_type == "broadcast": return self.apply_broadcast() # one axis empty @@ -159,13 +195,14 @@ def apply_empty_result(self): # we are not asked to reduce or infer reduction # so just return a copy of the existing object - if self.result_type not in ['reduce', None]: + if self.result_type not in ["reduce", None]: return self.obj.copy() # we may need to infer - reduce = self.result_type == 'reduce' + reduce = self.result_type == "reduce" from pandas import Series + if not reduce: EMPTY_SERIES = Series([]) @@ -190,12 +227,9 @@ def apply_raw(self): # TODO: mixed type case if result.ndim == 2: - return self.obj._constructor(result, - index=self.index, - columns=self.columns) + return self.obj._constructor(result, index=self.index, columns=self.columns) else: - return self.obj._constructor_sliced(result, - index=self.agg_axis) + return self.obj._constructor_sliced(result, index=self.agg_axis) def apply_broadcast(self, target): result_values = np.empty_like(target.values) @@ -219,9 +253,9 @@ def apply_broadcast(self, target): result_values[:, i] = res # we *always* preserve the original index / columns - result = self.obj._constructor(result_values, - index=target.index, - columns=target.columns) + result = self.obj._constructor( + result_values, index=target.index, columns=target.columns + ) return result def apply_standard(self): @@ -232,11 +266,14 @@ def apply_standard(self): # we cannot reduce using non-numpy dtypes, # as demonstrated in gh-12244 - if (self.result_type in ['reduce', None] and - not self.dtypes.apply(is_extension_type).any()): + if ( + self.result_type in ["reduce", None] + and not self.dtypes.apply(is_extension_type).any() + ): # Create a dummy Series from an empty array from pandas import Series + values = self.values index = self.obj._get_axis(self.axis) labels = self.agg_axis @@ -244,10 +281,9 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.reduce(values, self.f, - axis=self.axis, - dummy=dummy, - labels=labels) + result = reduction.reduce( + values, self.f, axis=self.axis, dummy=dummy, labels=labels + ) return self.obj._constructor_sliced(result, index=labels) except Exception: pass @@ -285,13 +321,12 @@ def apply_series_generator(self): results[i] = self.f(v) keys.append(v.name) except Exception as e: - if hasattr(e, 'args'): + if hasattr(e, "args"): # make sure i is defined if i is not None: k = res_index[i] - e.args = e.args + ('occurred at index %s' % - pprint_thing(k), ) + e.args = e.args + ("occurred at index %s" % pprint_thing(k),) raise self.results = results @@ -321,8 +356,7 @@ def apply_broadcast(self): @property def series_generator(self): - return (self.obj._ixs(i, axis=1) - for i in range(len(self.columns))) + return (self.obj._ixs(i, axis=1) for i in range(len(self.columns))) @property def 
result_index(self): @@ -362,9 +396,10 @@ def apply_broadcast(self): @property def series_generator(self): constructor = self.obj._constructor_sliced - return (constructor(arr, index=self.columns, name=name) - for i, (arr, name) in enumerate(zip(self.values, - self.index))) + return ( + constructor(arr, index=self.columns, name=name) + for i, (arr, name) in enumerate(zip(self.values, self.index)) + ) @property def result_index(self): @@ -379,12 +414,13 @@ def wrap_results_for_axis(self): results = self.results # we have requested to expand - if self.result_type == 'expand': + if self.result_type == "expand": result = self.infer_to_same_shape() # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): from pandas import Series + result = Series(results) result.index = self.res_index diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 2d09a9eac6eab..dab29e9ce71d3 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,6 +1,9 @@ from .array_ import array # noqa: F401 from .base import ( # noqa: F401 - ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin) + ExtensionArray, + ExtensionOpsMixin, + ExtensionScalarOpsMixin, +) from .categorical import Categorical # noqa: F401 from .datetimes import DatetimeArray # noqa: F401 from .integer import IntegerArray, integer_array # noqa: F401 diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 7a83b7960a6e7..15ff1432f16e2 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -12,10 +12,9 @@ from pandas.tseries.offsets import DateOffset, Tick, generate_range -def generate_regular_range(start: Timestamp, - end: Timestamp, - periods: int, - freq: DateOffset) -> Tuple[np.ndarray, str]: +def generate_regular_range( + start: Timestamp, end: Timestamp, periods: int, freq: DateOffset +) -> Tuple[np.ndarray, str]: """ Generate a range of dates with the spans between dates described by the given `freq` DateOffset. @@ -41,21 +40,22 @@ def generate_regular_range(start: Timestamp, b = Timestamp(start).value # cannot just use e = Timestamp(end) + 1 because arange breaks when # stride is too large, see GH10887 - e = (b + (Timestamp(end).value - b) // stride * stride + - stride // 2 + 1) + e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1 # end.tz == start.tz by this point due to _generate implementation tz = start.tz elif start is not None: b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side='start') + e = _generate_range_overflow_safe(b, periods, stride, side="start") tz = start.tz elif end is not None: e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side='end') + b = _generate_range_overflow_safe(e, periods, stride, side="end") tz = end.tz else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") + raise ValueError( + "at least 'start' or 'end' should be specified " + "if a 'period' is given." 
+ ) with np.errstate(over="raise"): # If the range is sufficiently large, np.arange may overflow @@ -76,18 +76,16 @@ def generate_regular_range(start: Timestamp, elif end is not None: tz = end.tz - xdr = generate_range(start=start, end=end, - periods=periods, offset=freq) + xdr = generate_range(start=start, end=end, periods=periods, offset=freq) values = np.array([x.value for x in xdr], dtype=np.int64) return values, tz -def _generate_range_overflow_safe(endpoint: int, - periods: int, - stride: int, - side: str = 'start') -> int: +def _generate_range_overflow_safe( + endpoint: int, periods: int, stride: int, side: str = "start" +) -> int: """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -113,12 +111,13 @@ def _generate_range_overflow_safe(endpoint: int, OutOfBoundsDatetime """ # GH#14187 raise instead of incorrectly wrapping around - assert side in ['start', 'end'] + assert side in ["start", "end"] i64max = np.uint64(np.iinfo(np.int64).max) - msg = ('Cannot generate range with {side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, periods=periods)) + msg = ( + "Cannot generate range with {side}={endpoint} and " + "periods={periods}".format(side=side, endpoint=endpoint, periods=periods) + ) with np.errstate(over="raise"): # if periods * strides cannot be multiplied within the *uint64* bounds, @@ -130,40 +129,39 @@ def _generate_range_overflow_safe(endpoint: int, if np.abs(addend) <= i64max: # relatively easy case without casting concerns - return _generate_range_overflow_safe_signed( - endpoint, periods, stride, side) + return _generate_range_overflow_safe_signed(endpoint, periods, stride, side) - elif ((endpoint > 0 and side == 'start' and stride > 0) or - (endpoint < 0 and side == 'end' and stride > 0)): + elif (endpoint > 0 and side == "start" and stride > 0) or ( + endpoint < 0 and side == "end" and stride > 0 + ): # no chance of not-overflowing raise OutOfBoundsDatetime(msg) - elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + elif side == "end" and endpoint > i64max and endpoint - stride <= i64max: # in _generate_regular_range we added `stride` thereby overflowing # the bounds. Adjust to fix this. - return _generate_range_overflow_safe(endpoint - stride, - periods - 1, stride, side) + return _generate_range_overflow_safe( + endpoint - stride, periods - 1, stride, side + ) # split into smaller pieces mid_periods = periods // 2 remaining = periods - mid_periods assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - midpoint = _generate_range_overflow_safe(endpoint, mid_periods, - stride, side) + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side) return _generate_range_overflow_safe(midpoint, remaining, stride, side) -def _generate_range_overflow_safe_signed(endpoint: int, - periods: int, - stride: int, - side: str) -> int: +def _generate_range_overflow_safe_signed( + endpoint: int, periods: int, stride: int, side: str +) -> int: """ A special case for _generate_range_overflow_safe where `periods * stride` can be calculated without overflowing int64 bounds. 
""" - assert side in ['start', 'end'] - if side == 'end': + assert side in ["start", "end"] + if side == "end": stride *= -1 with np.errstate(over="raise"): @@ -191,8 +189,8 @@ def _generate_range_overflow_safe_signed(endpoint: int, if result <= i64max + np.uint64(stride): return result - raise OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) + raise OutOfBoundsDatetime( + "Cannot generate range with " + "{side}={endpoint} and " + "periods={periods}".format(side=side, endpoint=endpoint, periods=periods) + ) diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 1b002ad12d526..93ee570c1f971 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -5,15 +5,19 @@ from pandas._libs import lib, tslibs from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, is_extension_array_dtype, is_timedelta64_ns_dtype) + is_datetime64_ns_dtype, + is_extension_array_dtype, + is_timedelta64_ns_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ABCExtensionArray -def array(data: Sequence[object], - dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, - copy: bool = True, - ) -> ABCExtensionArray: +def array( + data: Sequence[object], + dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, + copy: bool = True, +) -> ABCExtensionArray: """ Create an array. @@ -207,16 +211,17 @@ def array(data: Sequence[object], ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, ExtensionArray, IntervalArray, PandasArray, + period_array, + ExtensionArray, + IntervalArray, + PandasArray, DatetimeArray, TimedeltaArray, ) from pandas.core.internals.arrays import extract_array if lib.is_scalar(data): - msg = ( - "Cannot pass scalar '{}' to 'pandas.array'." - ) + msg = "Cannot pass scalar '{}' to 'pandas.array'." raise ValueError(msg.format(data)) data = extract_array(data, extract_numpy=True) @@ -234,14 +239,14 @@ def array(data: Sequence[object], if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=False) - if inferred_dtype == 'period': + if inferred_dtype == "period": try: return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: # We may have a mixture of frequencies. # We choose to return an ndarray, rather than raising. pass - elif inferred_dtype == 'interval': + elif inferred_dtype == "interval": try: return IntervalArray(data, copy=copy) except ValueError: @@ -249,7 +254,7 @@ def array(data: Sequence[object], # We choose to return an ndarray, rather than raising. 
pass - elif inferred_dtype.startswith('datetime'): + elif inferred_dtype.startswith("datetime"): # datetime, datetime64 try: return DatetimeArray._from_sequence(data, copy=copy) @@ -257,7 +262,7 @@ def array(data: Sequence[object], # Mixture of timezones, fall back to PandasArray pass - elif inferred_dtype.startswith('timedelta'): + elif inferred_dtype.startswith("timedelta"): # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 21f0f3c08e93b..2a5556ff6d357 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -17,8 +17,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ( - ABCExtensionArray, ABCIndexClass, ABCSeries) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas._typing import ArrayLike @@ -120,9 +119,10 @@ class ExtensionArray: See :ref:`extending.extension.ufunc` for more. """ + # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. # Don't override this. - _typ = 'extension' + _typ = "extension" # ------------------------------------------------------------------------ # Constructors @@ -272,8 +272,8 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: # __init__ method coerces that value, then so should __setitem__ # Note, also, that Series/DataFrame.where internally use __setitem__ # on a copy of the data. - raise NotImplementedError(_not_implemented_message.format( - type(self), '__setitem__') + raise NotImplementedError( + _not_implemented_message.format(type(self), "__setitem__") ) def __len__(self) -> int: @@ -393,7 +393,7 @@ def _values_for_argsort(self) -> np.ndarray: # Note: this is used in `ExtensionArray.argsort`. return np.array(self) - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): """ Return the indices that would sort this array. @@ -423,8 +423,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = nargsort(self, kind=kind, ascending=ascending, - na_position='last') + result = nargsort(self, kind=kind, ascending=ascending, na_position="last") return result def fillna(self, value=None, method=None, limit=None): @@ -463,15 +462,16 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. 
Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - func = pad_1d if method == 'pad' else backfill_1d - new_values = func(self.astype(object), limit=limit, - mask=mask) + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value @@ -491,10 +491,7 @@ def dropna(self): """ return self[~self.isna()] - def shift( - self, - periods: int = 1, - fill_value: object = None) -> ABCExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray: """ Shift values by desired number. @@ -537,14 +534,13 @@ def shift( fill_value = self.dtype.na_value empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), - dtype=self.dtype + [fill_value] * min(abs(periods), len(self)), dtype=self.dtype ) if periods > 0: a = empty b = self[:-periods] else: - a = self[abs(periods):] + a = self[abs(periods) :] b = empty return self._concat_same_type([a, b]) @@ -633,10 +629,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize( - self, - na_sentinel: int = -1, - ) -> Tuple[np.ndarray, ABCExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]: """ Encode the extension array as an enumerated type. @@ -679,13 +672,16 @@ def factorize( arr, na_value = self._values_for_factorize() - labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel, - na_value=na_value) + labels, uniques = _factorize_array( + arr, na_sentinel=na_sentinel, na_value=na_value + ) uniques = self._from_factorized(uniques, self) return labels, uniques - _extension_array_shared_docs['repeat'] = """ + _extension_array_shared_docs[ + "repeat" + ] = """ Repeat elements of a %(klass)s. Returns a new %(klass)s where each element of the current %(klass)s @@ -727,8 +723,8 @@ def factorize( Categories (3, object): [a, b, c] """ - @Substitution(klass='ExtensionArray') - @Appender(_extension_array_shared_docs['repeat']) + @Substitution(klass="ExtensionArray") + @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) ind = np.arange(len(self)).repeat(repeats) @@ -739,10 +735,7 @@ def repeat(self, repeats, axis=None): # ------------------------------------------------------------------------ def take( - self, - indices: Sequence[int], - allow_fill: bool = False, - fill_value: Any = None + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None ) -> ABCExtensionArray: """ Take elements from an array. @@ -849,25 +842,19 @@ def copy(self) -> ABCExtensionArray: def __repr__(self): from pandas.io.formats.printing import format_object_summary - template = ( - '{class_name}' - '{data}\n' - 'Length: {length}, dtype: {dtype}' - ) + template = "{class_name}" "{data}\n" "Length: {length}, dtype: {dtype}" # the short repr has no trailing newline, while the truncated # repr does. 
So we include a newline in our template, and strip # any trailing newlines from format_object_summary - data = format_object_summary(self, self._formatter(), - indent_for_name=False).rstrip(', \n') - class_name = '<{}>\n'.format(self.__class__.__name__) - return template.format(class_name=class_name, data=data, - length=len(self), - dtype=self.dtype) - - def _formatter( - self, - boxed: bool = False, - ) -> Callable[[Any], Optional[str]]: + data = format_object_summary( + self, self._formatter(), indent_for_name=False + ).rstrip(", \n") + class_name = "<{}>\n".format(self.__class__.__name__) + return template.format( + class_name=class_name, data=data, length=len(self), dtype=self.dtype + ) + + def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: """Formatting function for scalar values. This is used in the default '__repr__'. The returned formatting @@ -926,8 +913,7 @@ def ravel(self, order="C") -> ABCExtensionArray: @classmethod def _concat_same_type( - cls, - to_concat: Sequence[ABCExtensionArray] + cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ Concatenate multiple array @@ -985,8 +971,11 @@ def _reduce(self, name, skipna=True, **kwargs): ------ TypeError : subclass does not define reductions """ - raise TypeError("cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype)) + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) class ExtensionOpsMixin: @@ -1127,7 +1116,7 @@ def _maybe_convert(arr): res = np.asarray(arr) return res - if op.__name__ in {'divmod', 'rdivmod'}: + if op.__name__ in {"divmod", "rdivmod"}: a, b = zip(*res) res = _maybe_convert(a), _maybe_convert(b) else: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5ae71ffb165e9..c4f7d6dbe32fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,20 +9,41 @@ from pandas._libs import algos as libalgos, lib from pandas.compat.numpy import function as nv from pandas.util._decorators import ( - Appender, Substitution, cache_readonly, deprecate_kwarg) + Appender, + Substitution, + cache_readonly, + deprecate_kwarg, +) from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.cast import ( - coerce_indexer_dtype, maybe_infer_to_datetimelike) +from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_platform_int, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype, - is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence, - is_timedelta64_dtype) + ensure_int64, + ensure_object, + ensure_platform_int, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetimelike, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + is_sequence, + is_timedelta64_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( - ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries) + ABCCategoricalIndex, + ABCDataFrame, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -39,7 +60,8 @@ from .base import 
ExtensionArray, _extension_array_shared_docs -_take_msg = textwrap.dedent("""\ +_take_msg = textwrap.dedent( + """\ Interpreting negative values in 'indexer' as missing values. In the future, this will change to meaning positional indices from the right. @@ -47,7 +69,8 @@ Use 'allow_fill=True' to retain the previous behavior and silence this warning. - Use 'allow_fill=False' to accept the new behavior.""") + Use 'allow_fill=False' to accept the new behavior.""" +) def _cat_compare_op(op): @@ -63,28 +86,27 @@ def f(self, other): other = lib.item_from_zerodim(other) if not self.ordered: - if op in ['__lt__', '__gt__', '__le__', '__ge__']: - raise TypeError("Unordered Categoricals can only compare " - "equality or not") + if op in ["__lt__", "__gt__", "__le__", "__ge__"]: + raise TypeError( + "Unordered Categoricals can only compare " "equality or not" + ) if isinstance(other, Categorical): # Two Categoricals can only be be compared if the categories are # the same (maybe up to ordering, depending on ordered) - msg = ("Categoricals can only be compared if " - "'categories' are the same.") + msg = "Categoricals can only be compared if " "'categories' are the same." if len(self.categories) != len(other.categories): raise TypeError(msg + " Categories are different lengths") - elif (self.ordered and not (self.categories == - other.categories).all()): + elif self.ordered and not (self.categories == other.categories).all(): raise TypeError(msg) elif not set(self.categories) == set(other.categories): raise TypeError(msg) if not (self.ordered == other.ordered): - raise TypeError("Categoricals can only be compared if " - "'ordered' is the same") - if not self.ordered and not self.categories.equals( - other.categories): + raise TypeError( + "Categoricals can only be compared if " "'ordered' is the same" + ) + if not self.ordered and not self.categories.equals(other.categories): # both unordered and different order other_codes = _get_codes_for_values(other, self.categories) else: @@ -104,28 +126,32 @@ def f(self, other): ret = getattr(self._codes, op)(i) # check for NaN in self - mask = (self._codes == -1) + mask = self._codes == -1 ret[mask] = False return ret else: - if op == '__eq__': + if op == "__eq__": return np.repeat(False, len(self)) - elif op == '__ne__': + elif op == "__ne__": return np.repeat(True, len(self)) else: - msg = ("Cannot compare a Categorical for op {op} with a " - "scalar, which is not a category.") + msg = ( + "Cannot compare a Categorical for op {op} with a " + "scalar, which is not a category." + ) raise TypeError(msg.format(op=op)) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons - if op in ['__eq__', '__ne__']: + if op in ["__eq__", "__ne__"]: return getattr(np.array(self), op)(np.array(other)) - msg = ("Cannot compare a Categorical for op {op} with type {typ}." - "\nIf you want to compare values, use 'np.asarray(cat) " - " other'.") + msg = ( + "Cannot compare a Categorical for op {op} with type {typ}." + "\nIf you want to compare values, use 'np.asarray(cat) " + " other'." 
+ ) raise TypeError(msg.format(op=op, typ=type(other))) f.__name__ = op @@ -308,14 +334,16 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = frozenset(['labels', 'tolist']) - _typ = 'categorical' + _deprecations = frozenset(["labels", "tolist"]) + _typ = "categorical" - def __init__(self, values, categories=None, ordered=None, dtype=None, - fastpath=False): + def __init__( + self, values, categories=None, ordered=None, dtype=None, fastpath=False + ): - dtype = CategoricalDtype._from_values_or_dtype(values, categories, - ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype( + values, categories, ordered, dtype + ) # At this point, dtype is always a CategoricalDtype, but # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below @@ -340,9 +368,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) from pandas.core.internals.construction import sanitize_array + # By convention, empty lists result in object dtype: if len(values) == 0: - sanitize_dtype = 'object' + sanitize_dtype = "object" else: sanitize_dtype = None null_mask = isna(values) @@ -358,30 +387,35 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, if dtype._ordered: # raise, as we don't have a sortable data structure and so # the user should give us one by specifying categories - raise TypeError("'values' is not ordered, please " - "explicitly specify the categories order " - "by passing in a categories argument.") + raise TypeError( + "'values' is not ordered, please " + "explicitly specify the categories order " + "by passing in a categories argument." + ) except ValueError: # FIXME - raise NotImplementedError("> 1 ndim Categorical are not " - "supported at this time") + raise NotImplementedError( + "> 1 ndim Categorical are not " "supported at this time" + ) # we're inferring from values dtype = CategoricalDtype(categories, dtype._ordered) elif is_categorical_dtype(values): - old_codes = (values._values.codes if isinstance(values, ABCSeries) - else values.codes) - codes = _recode_for_categories(old_codes, values.dtype.categories, - dtype.categories) + old_codes = ( + values._values.codes if isinstance(values, ABCSeries) else values.codes + ) + codes = _recode_for_categories( + old_codes, values.dtype.categories, dtype.categories + ) else: codes = _get_codes_for_values(values, dtype.categories) if null_mask.any(): # Reinsert -1 placeholders for previously removed missing values - full_codes = - np.ones(null_mask.shape, dtype=codes.dtype) + full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) full_codes[~null_mask] = codes codes = full_codes @@ -422,10 +456,13 @@ def categories(self): @categories.setter def categories(self, categories): new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (self.dtype.categories is not None and - len(self.dtype.categories) != len(new_dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items as the old categories!") + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" 
+ ) self._dtype = new_dtype @property @@ -462,9 +499,9 @@ def copy(self): """ Copy constructor. """ - return self._constructor(values=self._codes.copy(), - dtype=self.dtype, - fastpath=True) + return self._constructor( + values=self._codes.copy(), dtype=self.dtype, fastpath=True + ) def astype(self, dtype, copy=True): """ @@ -531,8 +568,9 @@ def base(self): return None @classmethod - def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype, true_values=None): + def _from_inferred_categories( + cls, inferred_categories, inferred_codes, dtype, true_values=None + ): """ Construct a Categorical from inferred values. @@ -556,8 +594,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - known_categories = (isinstance(dtype, CategoricalDtype) and - dtype.categories is not None) + known_categories = ( + isinstance(dtype, CategoricalDtype) and dtype.categories is not None + ) if known_categories: # Convert to a specialized type with `dtype` if specified. @@ -582,8 +621,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, unsorted = cats.copy() categories = cats.sort_values() - codes = _recode_for_categories(inferred_codes, unsorted, - categories) + codes = _recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) else: dtype = CategoricalDtype(cats, ordered=False) @@ -636,31 +674,37 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): [a, b, a, b] Categories (2, object): [a < b] """ - dtype = CategoricalDtype._from_values_or_dtype(categories=categories, - ordered=ordered, - dtype=dtype) + dtype = CategoricalDtype._from_values_or_dtype( + categories=categories, ordered=ordered, dtype=dtype + ) if dtype.categories is None: - msg = ("The categories must be provided in 'categories' or " - "'dtype'. Both were None.") + msg = ( + "The categories must be provided in 'categories' or " + "'dtype'. Both were None." 
+ ) raise ValueError(msg) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): msg = "codes need to be array-like integers" if is_float_dtype(codes): - icodes = codes.astype('i8') + icodes = codes.astype("i8") if (icodes == codes).all(): msg = None codes = icodes - warn(("float codes will be disallowed in the future and " - "raise a ValueError"), FutureWarning, stacklevel=2) + warn( + ( + "float codes will be disallowed in the future and " + "raise a ValueError" + ), + FutureWarning, + stacklevel=2, + ) if msg: raise ValueError(msg) - if len(codes) and ( - codes.max() >= len(dtype.categories) or codes.min() < -1): - raise ValueError("codes need to be between -1 and " - "len(categories)-1") + if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): + raise ValueError("codes need to be between -1 and " "len(categories)-1") return cls(codes, dtype=dtype, fastpath=True) @@ -710,14 +754,18 @@ def _set_categories(self, categories, fastpath=False): """ if fastpath: - new_dtype = CategoricalDtype._from_fastpath(categories, - self.ordered) + new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) else: new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if (not fastpath and self.dtype.categories is not None and - len(new_dtype.categories) != len(self.dtype.categories)): - raise ValueError("new categories need to have the same number of " - "items than the old categories!") + if ( + not fastpath + and self.dtype.categories is not None + and len(new_dtype.categories) != len(self.dtype.categories) + ): + raise ValueError( + "new categories need to have the same number of " + "items than the old categories!" + ) self._dtype = new_dtype @@ -734,8 +782,7 @@ def _set_dtype(self, dtype): We don't do any validation here. It's assumed that the dtype is a (valid) instance of `CategoricalDtype`. """ - codes = _recode_for_categories(self.codes, self.categories, - dtype.categories) + codes = _recode_for_categories(self.codes, self.categories, dtype.categories) return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): @@ -750,7 +797,7 @@ def set_ordered(self, value, inplace=False): Whether or not to set the ordered attribute in-place or return a copy of this categorical with ordered set to the value. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() cat._dtype = new_dtype @@ -772,7 +819,7 @@ def as_ordered(self, inplace=False): Categorical Ordered Categorical. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) def as_unordered(self, inplace=False): @@ -790,11 +837,10 @@ def as_unordered(self, inplace=False): Categorical Unordered Categorical. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) - def set_categories(self, new_categories, ordered=None, rename=False, - inplace=False): + def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Set the categories to the specified new_categories. 
@@ -845,20 +891,22 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_categories remove_unused_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if ordered is None: ordered = self.dtype._ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) cat = self if inplace else self.copy() if rename: - if (cat.dtype.categories is not None and - len(new_dtype.categories) < len(cat.dtype.categories)): + if cat.dtype.categories is not None and len(new_dtype.categories) < len( + cat.dtype.categories + ): # remove all _codes which are larger and set to -1/NaN cat._codes[cat._codes >= len(new_dtype.categories)] = -1 else: - codes = _recode_for_categories(cat.codes, cat.categories, - new_dtype.categories) + codes = _recode_for_categories( + cat.codes, cat.categories, new_dtype.categories + ) cat._codes = codes cat._dtype = new_dtype @@ -932,12 +980,11 @@ def rename_categories(self, new_categories, inplace=False): [A, A, B] Categories (2, object): [A, B] """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() if is_dict_like(new_categories): - cat.categories = [new_categories.get(item, item) - for item in cat.categories] + cat.categories = [new_categories.get(item, item) for item in cat.categories] elif callable(new_categories): cat.categories = [new_categories(item) for item in cat.categories] else: @@ -981,12 +1028,12 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if set(self.dtype.categories) != set(new_categories): - raise ValueError("items in new_categories are not the same as in " - "old categories") - return self.set_categories(new_categories, ordered=ordered, - inplace=inplace) + raise ValueError( + "items in new_categories are not the same as in " "old categories" + ) + return self.set_categories(new_categories, ordered=ordered, inplace=inplace) def add_categories(self, new_categories, inplace=False): """ @@ -1021,13 +1068,15 @@ def add_categories(self, new_categories, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(new_categories): new_categories = [new_categories] already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: - msg = ("new categories must not include old categories: " - "{already_included!s}") + msg = ( + "new categories must not include old categories: " + "{already_included!s}" + ) raise ValueError(msg.format(already_included=already_included)) new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) @@ -1070,14 +1119,13 @@ def remove_categories(self, removals, inplace=False): remove_unused_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not is_list_like(removals): removals = [removals] removal_set = set(list(removals)) not_included = removal_set - set(self.dtype.categories) - new_categories = [c for c in self.dtype.categories - if c not in removal_set] + new_categories = [c for c in self.dtype.categories if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -1088,8 
+1136,9 @@ def remove_categories(self, removals, inplace=False): msg = "removals must all be in old categories: {not_included!s}" raise ValueError(msg.format(not_included=not_included)) - return self.set_categories(new_categories, ordered=self.ordered, - rename=False, inplace=inplace) + return self.set_categories( + new_categories, ordered=self.ordered, rename=False, inplace=inplace + ) def remove_unused_categories(self, inplace=False): """ @@ -1113,7 +1162,7 @@ def remove_unused_categories(self, inplace=False): remove_categories set_categories """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1121,8 +1170,9 @@ def remove_unused_categories(self, inplace=False): idx, inv = idx[1:], inv - 1 new_categories = cat.dtype.categories.take(idx) - new_dtype = CategoricalDtype._from_fastpath(new_categories, - ordered=self.ordered) + new_dtype = CategoricalDtype._from_fastpath( + new_categories, ordered=self.ordered + ) cat._dtype = new_dtype cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) @@ -1200,23 +1250,22 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + return self.from_codes( + self._codes.copy(), categories=new_categories, ordered=self.ordered + ) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories if np.any(self._codes == -1): - new_categories = new_categories.insert(len(new_categories), - np.nan) + new_categories = new_categories.insert(len(new_categories), np.nan) return np.take(new_categories, self._codes) - __eq__ = _cat_compare_op('__eq__') - __ne__ = _cat_compare_op('__ne__') - __lt__ = _cat_compare_op('__lt__') - __gt__ = _cat_compare_op('__gt__') - __le__ = _cat_compare_op('__le__') - __ge__ = _cat_compare_op('__ge__') + __eq__ = _cat_compare_op("__eq__") + __ne__ = _cat_compare_op("__ne__") + __lt__ = _cat_compare_op("__lt__") + __gt__ = _cat_compare_op("__gt__") + __le__ = _cat_compare_op("__le__") + __ge__ = _cat_compare_op("__ge__") # for Series/ndarray like compat @property @@ -1262,9 +1311,11 @@ def shift(self, periods, fill_value=None): elif fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - raise ValueError("'fill_value={}' is not present " - "in this Categorical's " - "categories".format(fill_value)) + raise ValueError( + "'fill_value={}' is not present " + "in this Categorical's " + "categories".format(fill_value) + ) if periods > 0: codes[:periods] = fill_value else: @@ -1296,43 +1347,43 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result # for all other cases, raise for now (similarly as what happens in # Series.__array_prepare__) - raise TypeError("Object with dtype {dtype} cannot perform " - "the numpy op {op}".format( - dtype=self.dtype, - op=ufunc.__name__)) + raise TypeError( + "Object with dtype {dtype} cannot perform " + "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__) + ) def __setstate__(self, state): """Necessary for making this object picklable""" if not 
isinstance(state, dict): - raise Exception('invalid pickle state') + raise Exception("invalid pickle state") # Provide compatibility with pre-0.15.0 Categoricals. - if '_categories' not in state and '_levels' in state: - state['_categories'] = self.dtype.validate_categories(state.pop( - '_levels')) - if '_codes' not in state and 'labels' in state: - state['_codes'] = coerce_indexer_dtype( - state.pop('labels'), state['_categories']) + if "_categories" not in state and "_levels" in state: + state["_categories"] = self.dtype.validate_categories(state.pop("_levels")) + if "_codes" not in state and "labels" in state: + state["_codes"] = coerce_indexer_dtype( + state.pop("labels"), state["_categories"] + ) # 0.16.0 ordered change - if '_ordered' not in state: + if "_ordered" not in state: # >=15.0 < 0.16.0 - if 'ordered' in state: - state['_ordered'] = state.pop('ordered') + if "ordered" in state: + state["_ordered"] = state.pop("ordered") else: - state['_ordered'] = False + state["_ordered"] = False # 0.21.0 CategoricalDtype change - if '_dtype' not in state: - state['_dtype'] = CategoricalDtype(state['_categories'], - state['_ordered']) + if "_dtype" not in state: + state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) for k, v in state.items(): setattr(self, k, v) @@ -1371,18 +1422,20 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self.dtype.categories.memory_usage( - deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @Substitution(klass='Categorical') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="Categorical") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if not self.ordered: - raise ValueError("Categorical not ordered\nyou can use " - ".as_ordered() to change the Categorical to an " - "ordered one") + raise ValueError( + "Categorical not ordered\nyou can use " + ".as_ordered() to change the Categorical to an " + "ordered one" + ) from pandas.core.series import Series + codes = _get_codes_for_values(Series(value).values, self.categories) if -1 in codes: raise KeyError("Value(s) to be inserted must be in categories.") @@ -1411,6 +1464,7 @@ def isna(self): ret = self._codes == -1 return ret + isnull = isna def notna(self): @@ -1432,14 +1486,14 @@ def notna(self): """ return ~self.isna() + notnull = notna def put(self, *args, **kwargs): """ Replace specific elements in the Categorical with given values. """ - raise NotImplementedError(("'put' is not yet implemented " - "for Categorical")) + raise NotImplementedError(("'put' is not yet implemented " "for Categorical")) def dropna(self): """ @@ -1489,10 +1543,9 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, - fastpath=True) + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) - return Series(count, index=CategoricalIndex(ix), dtype='int64') + return Series(count, index=CategoricalIndex(ix), dtype="int64") def get_values(self): """ @@ -1508,8 +1561,12 @@ def get_values(self): A numpy array of the same dtype as categorical.categories.dtype or Index if datetime / periods. 
""" - warn("The 'get_values' method is deprecated and will be removed in a " - "future version", FutureWarning, stacklevel=2) + warn( + "The 'get_values' method is deprecated and will be removed in a " + "future version", + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -1517,21 +1574,22 @@ def _internal_get_values(self): if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) elif is_integer_dtype(self.categories) and -1 in self._codes: - return self.categories.astype("object").take(self._codes, - fill_value=np.nan) + return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) def check_for_ordered(self, op): """ assert that we are ordered """ if not self.ordered: - raise TypeError("Categorical is not ordered for operation {op}\n" - "you can use .as_ordered() to change the " - "Categorical to an ordered one\n".format(op=op)) + raise TypeError( + "Categorical is not ordered for operation {op}\n" + "you can use .as_ordered() to change the " + "Categorical to an ordered one\n".format(op=op) + ) def _values_for_argsort(self): return self._codes.copy() - def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): + def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): """ Return the indices that would sort the Categorical. @@ -1584,7 +1642,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) - def sort_values(self, inplace=False, ascending=True, na_position='last'): + def sort_values(self, inplace=False, ascending=True, na_position="last"): """ Sort the Categorical by category value returning a new Categorical by default. @@ -1658,21 +1716,19 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): [NaN, NaN, 5.0, 2.0, 2.0] Categories (2, int64): [2, 5] """ - inplace = validate_bool_kwarg(inplace, 'inplace') - if na_position not in ['last', 'first']: - msg = 'invalid na_position: {na_position!r}' + inplace = validate_bool_kwarg(inplace, "inplace") + if na_position not in ["last", "first"]: + msg = "invalid na_position: {na_position!r}" raise ValueError(msg.format(na_position=na_position)) - sorted_idx = nargsort(self, - ascending=ascending, - na_position=na_position) + sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: self._codes = self._codes[sorted_idx] else: - return self._constructor(values=self._codes[sorted_idx], - dtype=self.dtype, - fastpath=True) + return self._constructor( + values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True + ) def _values_for_rank(self): """ @@ -1687,11 +1743,12 @@ def _values_for_rank(self): """ from pandas import Series + if self.ordered: values = self.codes mask = values == -1 if mask.any(): - values = values.astype('float64') + values = values.astype("float64") values[mask] = np.nan elif self.categories.is_numeric(): values = np.array(self) @@ -1703,7 +1760,7 @@ def _values_for_rank(self): ) return values - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return a flattened (numpy) array. 
@@ -1713,9 +1770,12 @@ def ravel(self, order='C'): ------- numpy.array """ - warn("Categorical.ravel will return a Categorical object instead " - "of an ndarray in a future version.", - FutureWarning, stacklevel=2) + warn( + "Categorical.ravel will return a Categorical object instead " + "of an ndarray in a future version.", + FutureWarning, + stacklevel=2, + ) return np.array(self) def view(self): @@ -1743,7 +1803,7 @@ def to_dense(self): """ return np.asarray(self) - @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value') + @deprecate_kwarg(old_arg_name="fill_value", new_arg_name="value") def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -1780,8 +1840,9 @@ def fillna(self, value=None, method=None, limit=None): if value is None: value = np.nan if limit is not None: - raise NotImplementedError("specifying a limit for fillna has not " - "been implemented yet") + raise NotImplementedError( + "specifying a limit for fillna has not " "been implemented yet" + ) codes = self._codes @@ -1789,8 +1850,9 @@ def fillna(self, value=None, method=None, limit=None): if method is not None: values = self.to_dense().reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None, - value).astype(self.categories.dtype)[0] + values = interpolate_2d(values, method, 0, None, value).astype( + self.categories.dtype + )[0] codes = _get_codes_for_values(values, self.categories) else: @@ -1819,9 +1881,11 @@ def fillna(self, value=None, method=None, limit=None): codes[mask] = self.categories.get_loc(value) else: - raise TypeError('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a ' - '"{0}"'.format(type(value).__name__)) + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + '"{0}"'.format(type(value).__name__) + ) return self._constructor(codes, dtype=self.dtype, fastpath=True) @@ -1913,14 +1977,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): if fill_value in self.categories: fill_value = self.categories.get_loc(fill_value) else: - msg = ( - "'fill_value' ('{}') is not in this Categorical's " - "categories." - ) + msg = "'fill_value' ('{}') is not in this Categorical's " "categories." raise TypeError(msg.format(fill_value)) - codes = take(self._codes, indexer, allow_fill=allow_fill, - fill_value=fill_value) + codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) result = type(self).from_codes(codes, dtype=dtype) return result @@ -1937,8 +1997,7 @@ def _slice(self, slicer): # in a 2-d case be passd (slice(None),....) 
if isinstance(slicer, tuple) and len(slicer) == 2: if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") slicer = slicer[1] codes = self._codes[slicer] @@ -1972,12 +2031,13 @@ def _tidy_repr(self, max_vals=10, footer=True): """ num = max_vals // 2 head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num):]._get_repr(length=False, footer=False) + tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) - result = '{head}, ..., {tail}'.format(head=head[:-1], tail=tail[1:]) + result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:]) if footer: - result = '{result}\n{footer}'.format( - result=result, footer=self._repr_footer()) + result = "{result}\n{footer}".format( + result=result, footer=self._repr_footer() + ) return str(result) @@ -1985,9 +2045,13 @@ def _repr_categories(self): """ return the base repr for the categories """ - max_categories = (10 if get_option("display.max_categories") == 0 else - get_option("display.max_categories")) + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) from pandas.io.formats import format as fmt + if len(self.categories) > max_categories: num = max_categories // 2 head = fmt.format_array(self.categories[:num], None) @@ -2008,7 +2072,8 @@ def _repr_categories_info(self): category_strs = self._repr_categories() dtype = str(self.categories.dtype) levheader = "Categories ({length}, {dtype}): ".format( - length=len(self.categories), dtype=dtype) + length=len(self.categories), dtype=dtype + ) width, height = get_terminal_size() max_width = get_option("display.width") or width if console.in_ipython_frontend(): @@ -2033,13 +2098,16 @@ def _repr_categories_info(self): def _repr_footer(self): - return 'Length: {length}\n{info}'.format( - length=len(self), info=self._repr_categories_info()) + return "Length: {length}\n{info}".format( + length=len(self), info=self._repr_categories_info() + ) - def _get_repr(self, length=True, na_rep='NaN', footer=True): + def _get_repr(self, length=True, na_rep="NaN", footer=True): from pandas.io.formats import format as fmt - formatter = fmt.CategoricalFormatter(self, length=length, - na_rep=na_rep, footer=footer) + + formatter = fmt.CategoricalFormatter( + self, length=length, na_rep=na_rep, footer=footer + ) result = formatter.to_string() return str(result) @@ -2054,7 +2122,7 @@ def __repr__(self): result = self._get_repr(length=len(self) > _maxlen) else: msg = self._get_repr(length=False, footer=True).replace("\n", ", ") - result = ('[], {repr_msg}'.format(repr_msg=msg)) + result = "[], {repr_msg}".format(repr_msg=msg) return result @@ -2062,7 +2130,7 @@ def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype """ - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i': + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": indexer = indexer.astype(self._codes.dtype) return indexer @@ -2077,8 +2145,9 @@ def __getitem__(self, key): else: return self.categories[i] else: - return self._constructor(values=self._codes[key], - dtype=self.dtype, fastpath=True) + return self._constructor( + values=self._codes[key], dtype=self.dtype, fastpath=True + ) def __setitem__(self, key, value): """ @@ -2098,8 +2167,10 @@ def __setitem__(self, key, value): # require identical categories set if isinstance(value, Categorical): if not 
is_dtype_equal(self, value): - raise ValueError("Cannot set a Categorical with another, " - "without identical categories") + raise ValueError( + "Cannot set a Categorical with another, " + "without identical categories" + ) if not self.categories.equals(value.categories): new_codes = _recode_for_categories( value.codes, value.categories, self.categories @@ -2109,13 +2180,16 @@ def __setitem__(self, key, value): rvalue = value if is_list_like(value) else [value] from pandas import Index + to_add = Index(rvalue).difference(self.categories) # no assignments of values not in categories, but it's always ok to set # something to np.nan if len(to_add) and not isna(to_add).all(): - raise ValueError("Cannot setitem on a Categorical with a new " - "category, set the categories first") + raise ValueError( + "Cannot setitem on a Categorical with a new " + "category, set the categories first" + ) # set by position if isinstance(key, (int, np.integer)): @@ -2127,14 +2201,12 @@ def __setitem__(self, key, value): # in a 2-d case be passd (slice(None),....) if len(key) == 2: if not com.is_null_slice(key[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") key = key[1] elif len(key) == 1: key = key[0] else: - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") # slicing in Series or Categorical elif isinstance(key, slice): @@ -2172,8 +2244,9 @@ def _reverse_indexer(self): """ categories = self.categories - r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'), - categories.size) + r, counts = libalgos.groupsort_indexer( + self.codes.astype("int64"), categories.size + ) counts = counts.cumsum() result = (r[start:end] for start, end in zip(counts, counts[1:])) result = dict(zip(categories, result)) @@ -2183,7 +2256,7 @@ def _reverse_indexer(self): def _reduce(self, name, axis=0, **kwargs): func = getattr(self, name, None) if func is None: - msg = 'Categorical cannot perform the operation {op}' + msg = "Categorical cannot perform the operation {op}" raise TypeError(msg.format(op=name)) return func(**kwargs) @@ -2202,7 +2275,7 @@ def min(self, numeric_only=None, **kwargs): ------- min : the minimum of this `Categorical` """ - self.check_for_ordered('min') + self.check_for_ordered("min") if numeric_only: good = self._codes != -1 pointer = self._codes[good].min(**kwargs) @@ -2228,7 +2301,7 @@ def max(self, numeric_only=None, **kwargs): ------- max : the maximum of this `Categorical` """ - self.check_for_ordered('max') + self.check_for_ordered("max") if numeric_only: good = self._codes != -1 pointer = self._codes[good].max(**kwargs) @@ -2258,6 +2331,7 @@ def mode(self, dropna=True): """ import pandas._libs.hashtable as htable + codes = self._codes if dropna: good = self._codes != -1 @@ -2322,13 +2396,14 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - codes = self.codes.astype('int64') + codes = self.codes.astype("int64") return codes, -1 @classmethod def _from_factorized(cls, uniques, original): - return original._constructor(original.categories.take(uniques), - dtype=original.dtype) + return original._constructor( + original.categories.take(uniques), dtype=original.dtype + ) def equals(self, other): """ @@ -2347,9 +2422,9 @@ def equals(self, other): # fastpath to avoid re-coding other_codes = other._codes else: - other_codes = 
_recode_for_categories(other.codes, - other.categories, - self.categories) + other_codes = _recode_for_categories( + other.codes, other.categories, self.categories + ) return np.array_equal(self._codes, other_codes) return False @@ -2385,14 +2460,15 @@ def describe(self): freqs = counts / float(counts.sum()) from pandas.core.reshape.concat import concat + result = concat([counts, freqs], axis=1) - result.columns = ['counts', 'freqs'] - result.index.name = 'categories' + result.columns = ["counts", "freqs"] + result.index.name = "categories" return result - @Substitution(klass='Categorical') - @Appender(_extension_array_shared_docs['repeat']) + @Substitution(klass="Categorical") + @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) codes = self._codes.repeat(repeats) @@ -2452,10 +2528,14 @@ def isin(self, values): array([ True, False, True, False, True, False]) """ from pandas.core.internals.construction import sanitize_array + if not is_list_like(values): - raise TypeError("only list-like objects are allowed to be passed" - " to isin(), you passed a [{values_type}]" - .format(values_type=type(values).__name__)) + raise TypeError( + "only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]".format( + values_type=type(values).__name__ + ) + ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) @@ -2466,15 +2546,23 @@ def isin(self, values): # The Series.cat accessor -@delegate_names(delegate=Categorical, - accessors=["categories", "ordered"], - typ="property") -@delegate_names(delegate=Categorical, - accessors=["rename_categories", "reorder_categories", - "add_categories", "remove_categories", - "remove_unused_categories", "set_categories", - "as_ordered", "as_unordered"], - typ="method") +@delegate_names( + delegate=Categorical, accessors=["categories", "ordered"], typ="property" +) +@delegate_names( + delegate=Categorical, + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + ], + typ="method", +) class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): """ Accessor object for categorical properties of the Series values. @@ -2511,8 +2599,9 @@ def __init__(self, data): @staticmethod def _validate(data): if not is_categorical_dtype(data.dtype): - raise AttributeError("Can only use .cat accessor with a " - "'category' dtype") + raise AttributeError( + "Can only use .cat accessor with a " "'category' dtype" + ) def _delegate_property_get(self, name): return getattr(self._parent, name) @@ -2526,10 +2615,12 @@ def codes(self): Return Series of codes as well as the index. """ from pandas import Series + return Series(self._parent.codes, index=self._index) def _delegate_method(self, name, *args, **kwargs): from pandas import Series + method = getattr(self._parent, name) res = method(*args, **kwargs) if res is not None: @@ -2540,10 +2631,12 @@ def categorical(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `categorical` will need to be removed from # `ok_for_cat`. - warn("`Series.cat.categorical` has been deprecated. Use the " - "attributes on 'Series.cat' directly instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.categorical` has been deprecated. 
Use the " + "attributes on 'Series.cat' directly instead.", + FutureWarning, + stacklevel=2, + ) return self._parent @property @@ -2551,10 +2644,11 @@ def name(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `name` will need to be removed from # `ok_for_cat`. - warn("`Series.cat.name` has been deprecated. Use `Series.name` " - "instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.name` has been deprecated. Use `Series.name` " "instead.", + FutureWarning, + stacklevel=2, + ) return self._name @property @@ -2562,12 +2656,14 @@ def index(self): # Note: Upon deprecation, `test_tab_completion_with_categorical` will # need to be updated. `index` will need to be removed from # ok_for_cat`. - warn("`Series.cat.index` has been deprecated. Use `Series.index` " - "instead.", - FutureWarning, - stacklevel=2) + warn( + "`Series.cat.index` has been deprecated. Use `Series.index` " "instead.", + FutureWarning, + stacklevel=2, + ) return self._index + # utility routines @@ -2576,22 +2672,20 @@ def _get_codes_for_values(values, categories): utility routine to turn values into codes given the specified categories """ from pandas.core.algorithms import _get_data_algo, _hashtables + dtype_equal = is_dtype_equal(values.dtype, categories.dtype) if dtype_equal: # To prevent erroneous dtype coercion in _get_data_algo, retrieve # the underlying numpy array. gh-22702 - values = getattr(values, '_ndarray_values', values) - categories = getattr(categories, '_ndarray_values', categories) - elif (is_extension_array_dtype(categories.dtype) and - is_object_dtype(values)): + values = getattr(values, "_ndarray_values", values) + categories = getattr(categories, "_ndarray_values", categories) + elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values): # Support inferring the correct extension dtype from an array of # scalar objects. e.g. 
# Categorical(array[Period, Period], categories=PeriodIndex(...)) try: - values = ( - categories.dtype.construct_array_type()._from_sequence(values) - ) + values = categories.dtype.construct_array_type()._from_sequence(values) except Exception: # but that may fail for any reason, so fall back to object values = ensure_object(values) @@ -2636,8 +2730,9 @@ def _recode_for_categories(codes, old_categories, new_categories): elif new_categories.equals(old_categories): # Same categories, so no need to actually recode return codes.copy() - indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), - new_categories) + indexer = coerce_indexer_dtype( + new_categories.get_indexer(old_categories), new_categories + ) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) return new_codes @@ -2647,8 +2742,7 @@ def _convert_to_list_like(list_like): return list_like if isinstance(list_like, list): return list_like - if (is_sequence(list_like) or isinstance(list_like, tuple) or - is_iterator(list_like)): + if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like): return list(list_like) elif is_scalar(list_like): return [list_like] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 93166759d8dbd..540442b7eaed4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -7,30 +7,40 @@ from pandas._libs import NaT, NaTType, Timestamp, algos, iNaT, lib from pandas._libs.tslibs.c_timestamp import maybe_integer_op_deprecated -from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period) +from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 from pandas.compat.numpy import function as nv -from pandas.errors import ( - AbstractMethodError, NullFrequencyError, PerformanceWarning) +from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, - is_object_dtype, is_offsetlike, is_period_dtype, is_string_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_offsetlike, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas._typing import DatetimeLikeScalar from pandas.core import missing, nanops -from pandas.core.algorithms import ( - checked_add_with_arr, take, unique1d, value_counts) +from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com from pandas.tseries import frequencies @@ -68,8 +78,7 @@ def _scalar_type(self) -> 
Type[DatetimeLikeScalar]: raise AbstractMethodError(self) def _scalar_from_string( - self, - value: str, + self, value: str ) -> Union[Period, Timestamp, Timedelta, NaTType]: """ Construct a scalar type from a string. @@ -90,10 +99,7 @@ def _scalar_from_string( """ raise AbstractMethodError(self) - def _unbox_scalar( - self, - value: Union[Period, Timestamp, Timedelta, NaTType], - ) -> int: + def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> int: """ Unbox the integer value of a scalar `value`. @@ -113,8 +119,7 @@ def _unbox_scalar( raise AbstractMethodError(self) def _check_compatible_with( - self, - other: Union[Period, Timestamp, Timedelta, NaTType], + self, other: Union[Period, Timestamp, Timedelta, NaTType] ) -> None: """ Verify that `self` and `other` are compatible. @@ -141,8 +146,10 @@ class DatelikeOps: Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. """ - @Substitution(URL="https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) def strftime(self, date_format): """ Convert to Index using specified date_format. @@ -179,6 +186,7 @@ def strftime(self, date_format): dtype='object') """ from pandas import Index + return Index(self._format_native_types(date_format=date_format)) @@ -187,8 +195,7 @@ class TimelikeOps: Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. """ - _round_doc = ( - """ + _round_doc = """ Perform {op} operation on the data to the specified `freq`. Parameters @@ -247,10 +254,9 @@ class TimelikeOps: DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', '2018-01-01 12:01:00'], dtype='datetime64[ns]', freq='T') - """) + """ - _round_example = ( - """>>> rng.round('H') + _round_example = """>>> rng.round('H') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) @@ -262,10 +268,9 @@ class TimelikeOps: 1 2018-01-01 12:00:00 2 2018-01-01 12:00:00 dtype: datetime64[ns] - """) + """ - _floor_example = ( - """>>> rng.floor('H') + _floor_example = """>>> rng.floor('H') DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', '2018-01-01 12:00:00'], dtype='datetime64[ns]', freq=None) @@ -278,10 +283,8 @@ class TimelikeOps: 2 2018-01-01 12:00:00 dtype: datetime64[ns] """ - ) - _ceil_example = ( - """>>> rng.ceil('H') + _ceil_example = """>>> rng.ceil('H') DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', '2018-01-01 13:00:00'], dtype='datetime64[ns]', freq=None) @@ -294,7 +297,6 @@ class TimelikeOps: 2 2018-01-01 13:00:00 dtype: datetime64[ns] """ - ) def _round(self, freq, mode, ambiguous, nonexistent): # round the local times @@ -310,23 +312,19 @@ def _round(self, freq, mode, ambiguous, nonexistent): ) @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round( - freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent - ) + def round(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous='raise', nonexistent='raise'): + def floor(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, 
ambiguous='raise', nonexistent='raise'): + def ceil(self, freq, ambiguous="raise", nonexistent="raise"): return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) -class DatetimeLikeArrayMixin(ExtensionOpsMixin, - AttributesMixin, - ExtensionArray): +class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -365,7 +363,7 @@ def asi8(self) -> np.ndarray: An ndarray with int64 dtype. """ # do not cache or you'll create a memory leak - return self._data.view('i8') + return self._data.view("i8") @property def _ndarray_values(self): @@ -374,7 +372,7 @@ def _ndarray_values(self): # ---------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None): """ Helper method for astype when converting to strings. @@ -417,9 +415,11 @@ def __getitem__(self, key): is_int = lib.is_integer(key) if lib.is_scalar(key) and not is_int: - raise IndexError("only integers, slices (`:`), ellipsis (`...`), " - "numpy.newaxis (`None`) and integer or boolean " - "arrays are valid indices") + raise IndexError( + "only integers, slices (`:`), ellipsis (`...`), " + "numpy.newaxis (`None`) and integer or boolean " + "arrays are valid indices" + ) getitem = self._data.__getitem__ if is_int: @@ -459,9 +459,9 @@ def __getitem__(self, key): return self._simple_new(result, dtype=self.dtype, freq=freq) def __setitem__( - self, - key: Union[int, Sequence[int], Sequence[bool], slice], - value: Union[NaTType, Any, Sequence[Any]] + self, + key: Union[int, Sequence[int], Sequence[bool], slice], + value: Union[NaTType, Any, Sequence[Any]], ) -> None: # I'm fudging the types a bit here. "Any" above really depends # on type(self). For PeriodArray, it's Period (or stuff coercible @@ -477,12 +477,12 @@ def __setitem__( if not is_slice: key = cast(Sequence, key) - if (len(key) != len(value) - and not com.is_bool_indexer(key)): - msg = ("shape mismatch: value array of length '{}' does " - "not match indexing result of length '{}'.") - raise ValueError(msg.format( - len(key), len(value))) + if len(key) != len(value) and not com.is_bool_indexer(key): + msg = ( + "shape mismatch: value array of length '{}' does " + "not match indexing result of length '{}'." + ) + raise ValueError(msg.format(len(key), len(value))) elif not len(key): return @@ -499,8 +499,9 @@ def __setitem__( "'value' should be a '{scalar}', 'NaT', or array of those. " "Got '{typ}' instead." ) - raise TypeError(msg.format(scalar=self._scalar_type.__name__, - typ=type(value).__name__)) + raise TypeError( + msg.format(scalar=self._scalar_type.__name__, typ=type(value).__name__) + ) self._data[key] = value self._maybe_clear_freq() @@ -515,6 +516,7 @@ def astype(self, dtype, copy=True): # 2. DatetimeArray.astype handles conversion between tz. # 3. 
DatetimeArray.astype handles datetime -> period from pandas import Categorical + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): @@ -533,11 +535,13 @@ def astype(self, dtype, copy=True): if copy: values = values.copy() return values - elif (is_datetime_or_timedelta_dtype(dtype) and - not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): + elif ( + is_datetime_or_timedelta_dtype(dtype) + and not is_dtype_equal(self.dtype, dtype) + ) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) elif is_categorical_dtype(dtype): return Categorical(self, dtype=dtype) @@ -589,10 +593,9 @@ def take(self, indices, allow_fill=False, fill_value=None): if allow_fill: fill_value = self._validate_fill_value(fill_value) - new_values = take(self.asi8, - indices, - allow_fill=allow_fill, - fill_value=fill_value) + new_values = take( + self.asi8, indices, allow_fill=allow_fill, fill_value=fill_value + ) return type(self)(new_values, dtype=self.dtype) @@ -624,7 +627,7 @@ def _values_for_argsort(self): # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, value, side='left', sorter=None): + def searchsorted(self, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -652,10 +655,10 @@ def searchsorted(self, value, side='left', sorter=None): if isinstance(value, str): value = self._scalar_from_string(value) - if not (isinstance(value, (self._scalar_type, type(self))) - or isna(value)): - raise ValueError("Unexpected type for 'value': {valtype}" - .format(valtype=type(value))) + if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): + raise ValueError( + "Unexpected type for 'value': {valtype}".format(valtype=type(value)) + ) self._check_compatible_with(value) if isinstance(value, type(self)): @@ -675,7 +678,7 @@ def repeat(self, repeats, *args, **kwargs): """ nv.validate_repeat(args, kwargs) values = self._data.repeat(repeats) - return type(self)(values.view('i8'), dtype=self.dtype) + return type(self)(values.view("i8"), dtype=self.dtype) def value_counts(self, dropna=False): """ @@ -700,8 +703,9 @@ def value_counts(self, dropna=False): cls = type(self) result = value_counts(values, sort=False, dropna=dropna) - index = Index(cls(result.index.view('i8'), dtype=self.dtype), - name=result.index.name) + index = Index( + cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name + ) return Series(result.values, index=index, name=result.name) def map(self, mapper): @@ -725,7 +729,7 @@ def _isnan(self): """ return if each value is nan """ - return (self.asi8 == iNaT) + return self.asi8 == iNaT @property # NB: override with cache_readonly in immutable subclasses def _hasnans(self): @@ -773,13 +777,15 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. 
Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - if method == 'pad': + if method == "pad": func = missing.pad_1d else: func = missing.backfill_1d @@ -791,8 +797,7 @@ def fillna(self, value=None, method=None, limit=None): # to avoid modifying `self` in-place. values = values.copy() - new_values = func(values, limit=limit, - mask=mask) + new_values = func(values, limit=limit, mask=mask) if is_datetime64tz_dtype(self): # we need to pass int64 values to the constructor to avoid # re-localizing incorrectly @@ -878,9 +883,9 @@ def _validate_frequency(cls, index, freq, **kwargs): return None try: - on_freq = cls._generate_range(start=index[0], end=None, - periods=len(index), freq=freq, - **kwargs) + on_freq = cls._generate_range( + start=index[0], end=None, periods=len(index), freq=freq, **kwargs + ) if not np.array_equal(index.asi8, on_freq.asi8): raise ValueError except ValueError as e: @@ -893,9 +898,12 @@ def _validate_frequency(cls, index, freq, **kwargs): # is `NaT`, in which case the call to `cls._generate_range` will # raise a ValueError, which we re-raise with a more targeted # message. - raise ValueError('Inferred frequency {infer} from passed values ' - 'does not conform to passed frequency {passed}' - .format(infer=inferred, passed=freq.freqstr)) + raise ValueError( + "Inferred frequency {infer} from passed values " + "does not conform to passed frequency {passed}".format( + infer=inferred, passed=freq.freqstr + ) + ) # monotonicity/uniqueness properties are called via frequencies.infer_freq, # see GH#23789 @@ -917,24 +925,28 @@ def _is_unique(self): def _add_datetimelike_scalar(self, other): # Overriden by TimedeltaArray - raise TypeError("cannot add {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot add {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) _add_datetime_arraylike = _add_datetimelike_scalar def _sub_datetimelike_scalar(self, other): # Overridden by DatetimeArray assert other is not NaT - raise TypeError("cannot subtract a datelike from a {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "cannot subtract a datelike from a {cls}".format(cls=type(self).__name__) + ) _sub_datetime_arraylike = _sub_datetimelike_scalar def _sub_period(self, other): # Overriden by PeriodArray - raise TypeError("cannot subtract Period from a {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "cannot subtract Period from a {cls}".format(cls=type(self).__name__) + ) def _add_offset(self, offset): raise AbstractMethodError(self) @@ -973,15 +985,16 @@ def _add_timedeltalike_scalar(self, other): """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds - new_values = np.empty(len(self), dtype='i8') + new_values = np.empty(len(self), dtype="i8") new_values[:] = iNaT return new_values inc = delta_to_nanoseconds(other) - new_values = checked_add_with_arr(self.asi8, inc, - arr_mask=self._isnan).view('i8') + new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view( + "i8" + ) new_values = self._maybe_mask_results(new_values) - return new_values.view('i8') + return new_values.view("i8") def _add_delta_tdi(self, other): """ @@ -994,26 +1007,29 @@ def _add_delta_tdi(self, other): if isinstance(other, np.ndarray): # ndarray[timedelta64]; wrap in TimedeltaIndex for op from pandas import TimedeltaIndex + other = TimedeltaIndex(other) self_i8 = self.asi8 other_i8 = 
other.asi8 - new_values = checked_add_with_arr(self_i8, other_i8, - arr_mask=self._isnan, - b_mask=other._isnan) + new_values = checked_add_with_arr( + self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan + ) if self._hasnans or other._hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = iNaT - return new_values.view('i8') + return new_values.view("i8") def _add_nat(self): """ Add pd.NaT to self """ if is_period_dtype(self): - raise TypeError('Cannot add {cls} and {typ}' - .format(cls=type(self).__name__, - typ=type(NaT).__name__)) + raise TypeError( + "Cannot add {cls} and {typ}".format( + cls=type(self).__name__, typ=type(NaT).__name__ + ) + ) # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes @@ -1033,7 +1049,7 @@ def _sub_nat(self): # For period dtype, timedelta64 is a close-enough return dtype. result = np.zeros(len(self), dtype=np.int64) result.fill(iNaT) - return result.view('timedelta64[ns]') + return result.view("timedelta64[ns]") def _sub_period_array(self, other): """ @@ -1051,22 +1067,23 @@ def _sub_period_array(self, other): Array of DateOffset objects; nulls represented by NaT. """ if not is_period_dtype(self): - raise TypeError("cannot subtract {dtype}-dtype from {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot subtract {dtype}-dtype from {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) if len(self) != len(other): - raise ValueError("cannot subtract arrays/indices of " - "unequal length") + raise ValueError("cannot subtract arrays/indices of " "unequal length") if self.freq != other.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) - new_values = checked_add_with_arr(self.asi8, -other.asi8, - arr_mask=self._isnan, - b_mask=other._isnan) + new_values = checked_add_with_arr( + self.asi8, -other.asi8, arr_mask=self._isnan, b_mask=other._isnan + ) new_values = np.array([self.freq.base * x for x in new_values]) if self._hasnans or other._hasnans: @@ -1125,17 +1142,19 @@ def _addsub_offset_array(self, other, op): if len(other) == 1: return op(self, other[0]) - warnings.warn("Adding/subtracting array of DateOffsets to " - "{cls} not vectorized" - .format(cls=type(self).__name__), PerformanceWarning) + warnings.warn( + "Adding/subtracting array of DateOffsets to " + "{cls} not vectorized".format(cls=type(self).__name__), + PerformanceWarning, + ) # For EA self.astype('O') returns a numpy array, not an Index - left = lib.values_from_object(self.astype('O')) + left = lib.values_from_object(self.astype("O")) res_values = op(left, np.array(other)) kwargs = {} if not is_period_dtype(self): - kwargs['freq'] = 'infer' + kwargs["freq"] = "infer" return self._from_sequence(res_values, **kwargs) def _time_shift(self, periods, freq=None): @@ -1173,8 +1192,7 @@ def _time_shift(self, periods, freq=None): # Note: in the DatetimeTZ case, _generate_range will infer the # appropriate timezone from `start` and `end`, so tz does not need # to be passed explicitly. 
- return self._generate_range(start=start, end=end, periods=None, - freq=self.freq) + return self._generate_range(start=start, end=end, periods=None, freq=self.freq) def __add__(self, other): other = lib.item_from_zerodim(other) @@ -1214,9 +1232,11 @@ def __add__(self, other): result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes - raise TypeError("cannot add {dtype}-dtype to {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot add {dtype}-dtype to {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) elif is_period_dtype(other): # if self is a TimedeltaArray and other is a PeriodArray with # a timedelta-like (i.e. Tick) freq, this operation is valid. @@ -1231,6 +1251,7 @@ def __add__(self, other): if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray + # TODO: infer freq? return TimedeltaArray(result) return result @@ -1282,14 +1303,18 @@ def __sub__(self, other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): - raise TypeError("cannot subtract {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot subtract {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) elif is_float_dtype(other): # Explicitly catch invalid dtypes - raise TypeError("cannot subtract {dtype}-dtype from {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "cannot subtract {dtype}-dtype from {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented @@ -1298,6 +1323,7 @@ def __sub__(self, other): if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray + # TODO: infer freq? return TimedeltaArray(result) return result @@ -1309,20 +1335,28 @@ def __rsub__(self, other): if not isinstance(other, DatetimeLikeArrayMixin): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray + other = DatetimeArray(other) return other - self - elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and - not is_datetime64_any_dtype(other)): + elif ( + is_datetime64_any_dtype(self) + and hasattr(other, "dtype") + and not is_datetime64_any_dtype(other) + ): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. - raise TypeError("cannot subtract {cls} from {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "cannot subtract {cls} from {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) elif is_period_dtype(self) and is_timedelta64_dtype(other): # TODO: Can we simplify/generalize these cases at all? 
- raise TypeError("cannot subtract {cls} from {dtype}" - .format(cls=type(self).__name__, - dtype=other.dtype)) + raise TypeError( + "cannot subtract {cls} from {dtype}".format( + cls=type(self).__name__, dtype=other.dtype + ) + ) return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 @@ -1337,8 +1371,9 @@ def __isub__(self, other): # -------------------------------------------------------------- # Comparison Methods - def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', - from_utc=False): + def _ensure_localized( + self, arg, ambiguous="raise", nonexistent="raise", from_utc=False + ): """ Ensure that we are re-localized. @@ -1360,12 +1395,12 @@ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', """ # reconvert to local tz - tz = getattr(self, 'tz', None) + tz = getattr(self, "tz", None) if tz is not None: if not isinstance(arg, type(self)): arg = self._simple_new(arg) if from_utc: - arg = arg.tz_localize('UTC').tz_convert(self.tz) + arg = arg.tz_localize("UTC").tz_convert(self.tz) else: arg = arg.tz_localize( self.tz, ambiguous=ambiguous, nonexistent=nonexistent @@ -1463,8 +1498,8 @@ def mean(self, skipna=True): raise TypeError( "mean is not implemented for {cls} since the meaning is " "ambiguous. An alternative is " - "obj.to_timestamp(how='start').mean()" - .format(cls=type(self).__name__)) + "obj.to_timestamp(how='start').mean()".format(cls=type(self).__name__) + ) mask = self.isna() if skipna: @@ -1478,7 +1513,7 @@ def mean(self, skipna=True): # short-circut for empty max / min return NaT - result = nanops.nanmean(values.view('i8'), skipna=skipna) + result = nanops.nanmean(values.view("i8"), skipna=skipna) # Don't have to worry about NA `result`, since no NA went in. 
return self._box_func(result) @@ -1486,6 +1521,7 @@ def mean(self, skipna=True): # ------------------------------------------------------------------- # Shared Constructor Helpers + def validate_periods(periods): """ If a `periods` argument is passed to the Datetime/Timedelta Array/Index @@ -1508,8 +1544,9 @@ def validate_periods(periods): if lib.is_float(periods): periods = int(periods) elif not lib.is_integer(periods): - raise TypeError('periods must be a number, got {periods}' - .format(periods=periods)) + raise TypeError( + "periods must be a number, got {periods}".format(periods=periods) + ) return periods @@ -1569,11 +1606,11 @@ def validate_inferred_freq(freq, inferred_freq, freq_infer): """ if inferred_freq is not None: if freq is not None and freq != inferred_freq: - raise ValueError('Inferred frequency {inferred} from passed ' - 'values does not conform to passed frequency ' - '{passed}' - .format(inferred=inferred_freq, - passed=freq.freqstr)) + raise ValueError( + "Inferred frequency {inferred} from passed " + "values does not conform to passed frequency " + "{passed}".format(inferred=inferred_freq, passed=freq.freqstr) + ) elif freq is None: freq = inferred_freq freq_infer = False @@ -1600,7 +1637,7 @@ def maybe_infer_freq(freq): freq_infer = False if not isinstance(freq, DateOffset): # if a passed freq is None, don't infer automatically - if freq != 'infer': + if freq != "infer": freq = frequencies.to_offset(freq) else: freq_infer = True @@ -1628,17 +1665,16 @@ def _ensure_datetimelike_to_i8(other, to_utc=False): if lib.is_scalar(other) and isna(other): return iNaT - elif isinstance(other, (PeriodArray, ABCIndexClass, - DatetimeLikeArrayMixin)): + elif isinstance(other, (PeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)): # convert tz if needed - if getattr(other, 'tz', None) is not None: + if getattr(other, "tz", None) is not None: if to_utc: - other = other.tz_convert('UTC') + other = other.tz_convert("UTC") else: other = other.tz_localize(None) else: try: - return np.array(other, copy=False).view('i8') + return np.array(other, copy=False).view("i8") except TypeError: # period array cannot be coerced to int other = Index(other) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b554ddf25c96..5b540dcce53c8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -8,20 +8,44 @@ from pandas._libs import lib, tslib from pandas._libs.tslibs import ( - NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date, - resolution as libresolution, timezones, tzconversion) + NaT, + Timestamp, + ccalendar, + conversion, + fields, + iNaT, + normalize_date, + resolution as libresolution, + timezones, + tzconversion, +) import pandas.compat as compat from pandas.errors import PerformanceWarning from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype, - is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_type, is_float_dtype, is_object_dtype, is_period_dtype, - is_string_dtype, is_timedelta64_dtype, pandas_dtype) + _INT64_DTYPE, + _NS_DTYPE, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_type, + is_float_dtype, + is_object_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( - 
ABCDataFrame, ABCIndexClass, ABCPandasArray, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCPandasArray, + ABCSeries, +) from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -92,15 +116,16 @@ def f(self): values = self._local_timestamps() if field in self._bool_ops: - if field.endswith(('start', 'end')): + if field.endswith(("start", "end")): freq = self.freq month_kw = 12 if freq: kwds = freq.kwds - month_kw = kwds.get('startingMonth', kwds.get('month', 12)) + month_kw = kwds.get("startingMonth", kwds.get("month", 12)) - result = fields.get_start_end_field(values, field, - self.freqstr, month_kw) + result = fields.get_start_end_field( + values, field, self.freqstr, month_kw + ) else: result = fields.get_date_field(values, field) @@ -113,8 +138,9 @@ def f(self): else: result = fields.get_date_field(values, field) - result = self._maybe_mask_results(result, fill_value=None, - convert='float64') + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return result @@ -127,8 +153,8 @@ def _dt_array_cmp(cls, op): """ Wrap comparison operations to convert datetime-like to datetime64 """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): @@ -147,7 +173,7 @@ def wrapper(self, other): # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) - result = op(self.asi8, other.view('i8')) + result = op(self.asi8, other.view("i8")) if isna(other): result.fill(nat_result) elif lib.is_scalar(other) or np.ndim(other) == 0: @@ -160,8 +186,9 @@ def wrapper(self, other): other = type(self)._from_sequence(other) except ValueError: other = np.array(other, dtype=np.object_) - elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, - DatetimeArray)): + elif not isinstance( + other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray) + ): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) @@ -170,13 +197,12 @@ def wrapper(self, other): # We have to use _comp_method_OBJECT_ARRAY instead of numpy # comparison otherwise it would fail to raise when # comparing tz-aware and tz-naive - with np.errstate(all='ignore'): - result = ops._comp_method_OBJECT_ARRAY(op, - self.astype(object), - other) + with np.errstate(all="ignore"): + result = ops._comp_method_OBJECT_ARRAY( + op, self.astype(object), other + ) o_mask = isna(other) - elif not (is_datetime64_dtype(other) or - is_datetime64tz_dtype(other)): + elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: @@ -184,14 +210,16 @@ def wrapper(self, other): if isinstance(other, (ABCIndexClass, ABCSeries)): other = other.array - if (is_datetime64_dtype(other) and - not is_datetime64_ns_dtype(other) or - not hasattr(other, 'asi8')): + if ( + is_datetime64_dtype(other) + and not is_datetime64_ns_dtype(other) + or not hasattr(other, "asi8") + ): # e.g. 
other.dtype == 'datetime64[s]' # or an object-dtype ndarray other = type(self)._from_sequence(other) - result = op(self.view('i8'), other.view('i8')) + result = op(self.view("i8"), other.view("i8")) o_mask = other._isnan result = com.values_from_object(result) @@ -207,9 +235,7 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class DatetimeArray(dtl.DatetimeLikeArrayMixin, - dtl.TimelikeOps, - dtl.DatelikeOps): +class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. @@ -245,25 +271,53 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, ------- None """ + _typ = "datetimearray" _scalar_type = Timestamp # define my properties & methods for delegation - _bool_ops = ['is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'is_leap_year'] - _object_ops = ['weekday_name', 'freq', 'tz'] - _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'week', 'weekday', 'dayofweek', - 'dayofyear', 'quarter', 'days_in_month', - 'daysinmonth', 'microsecond', - 'nanosecond'] - _other_ops = ['date', 'time', 'timetz'] + _bool_ops = [ + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "is_leap_year", + ] + _object_ops = ["weekday_name", "freq", "tz"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "weekday", + "dayofweek", + "dayofyear", + "quarter", + "days_in_month", + "daysinmonth", + "microsecond", + "nanosecond", + ] + _other_ops = ["date", "time", "timetz"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops - _datetimelike_methods = ['to_period', 'tz_localize', - 'tz_convert', - 'normalize', 'strftime', 'round', 'floor', - 'ceil', 'month_name', 'day_name'] + _datetimelike_methods = [ + "to_period", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "month_name", + "day_name", + ] # ndim is inherited from ExtensionArray, must exist to ensure # Timestamp.__richcmp__(DateTimeArray) operates pointwise @@ -286,7 +340,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if isinstance(values, type(self)): # validation - dtz = getattr(dtype, 'tz', None) + dtz = getattr(dtype, "tz", None) if dtz and values.tz is None: dtype = DatetimeTZDtype(tz=dtype.tz) elif dtz and values.tz: @@ -312,7 +366,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): if values.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - if values.dtype == 'i8': + if values.dtype == "i8": # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps @@ -338,7 +392,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): values = values.copy() if freq: freq = to_offset(freq) - if getattr(dtype, 'tz', None): + if getattr(dtype, "tz", None): # https://github.com/pandas-dev/pandas/issues/18595 # Ensure that we have a standard timezone for pytz objects. 
# Without this, things like adding an array of timedeltas and @@ -356,7 +410,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): @classmethod def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): assert isinstance(values, np.ndarray) - if values.dtype == 'i8': + if values.dtype == "i8": values = values.view(_NS_DTYPE) result = object.__new__(cls) @@ -366,20 +420,33 @@ def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE): return result @classmethod - def _from_sequence(cls, data, dtype=None, copy=False, - tz=None, freq=None, - dayfirst=False, yearfirst=False, ambiguous='raise', - int_as_wall_time=False): + def _from_sequence( + cls, + data, + dtype=None, + copy=False, + tz=None, + freq=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", + int_as_wall_time=False, + ): freq, freq_infer = dtl.maybe_infer_freq(freq) subarr, tz, inferred_freq = sequence_to_dt64ns( - data, dtype=dtype, copy=copy, tz=tz, - dayfirst=dayfirst, yearfirst=yearfirst, - ambiguous=ambiguous, int_as_wall_time=int_as_wall_time) + data, + dtype=dtype, + copy=copy, + tz=tz, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + int_as_wall_time=int_as_wall_time, + ) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, - freq_infer) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) dtype = tz_to_dtype(tz) result = cls._simple_new(subarr, freq=freq, dtype=dtype) @@ -396,18 +463,28 @@ def _from_sequence(cls, data, dtype=None, copy=False, return result @classmethod - def _generate_range(cls, start, end, periods, freq, tz=None, - normalize=False, ambiguous='raise', - nonexistent='raise', closed=None): + def _generate_range( + cls, + start, + end, + periods, + freq, + tz=None, + normalize=False, + ambiguous="raise", + nonexistent="raise", + closed=None, + ): periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') + raise ValueError("Must provide freq argument if no data is " "supplied") if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, ' - 'and freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) freq = to_offset(freq) if start is not None: @@ -418,27 +495,31 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if start is None and end is None: if closed is not None: - raise ValueError("Closed has to be None if not both of start" - "and end are defined") + raise ValueError( + "Closed has to be None if not both of start" "and end are defined" + ) if start is NaT or end is NaT: raise ValueError("Neither `start` nor `end` can be NaT") left_closed, right_closed = dtl.validate_endpoints(closed) - start, end, _normalized = _maybe_normalize_endpoints(start, end, - normalize) + start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize) tz = _infer_tz_from_endpoints(start, end, tz) if tz is not None: # Localize the start and end arguments start = _maybe_localize_point( - start, getattr(start, 'tz', None), start, freq, tz, - ambiguous, nonexistent + start, + getattr(start, "tz", None), + start, + freq, + tz, + ambiguous, + nonexistent, ) end = _maybe_localize_point( - end, getattr(end, 'tz', None), end, freq, tz, - ambiguous, nonexistent + end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent ) if 
freq is not None: # We break Day arithmetic (fixed 24 hour) here and opt for @@ -455,8 +536,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( - index.asi8, - tz, ambiguous=ambiguous, nonexistent=nonexistent) + index.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent + ) index = cls(arr) @@ -471,12 +552,13 @@ def _generate_range(cls, start, end, periods, freq, tz=None, # Nanosecond-granularity timestamps aren't always correctly # representable with doubles, so we limit the range that we # pass to np.linspace as much as possible - arr = np.linspace( - 0, end.value - start.value, - periods, dtype='int64') + start.value + arr = ( + np.linspace(0, end.value - start.value, periods, dtype="int64") + + start.value + ) dtype = tz_to_dtype(tz) index = cls._simple_new( - arr.astype('M8[ns]', copy=False), freq=None, dtype=dtype + arr.astype("M8[ns]", copy=False), freq=None, dtype=dtype ) if not left_closed and len(index) and index[0] == start: @@ -504,8 +586,11 @@ def _check_compatible_with(self, other): if other is NaT: return if not timezones.tz_compare(self.tz, other.tz): - raise ValueError("Timezones don't match. '{own} != {other}'" - .format(own=self.tz, other=other.tz)) + raise ValueError( + "Timezones don't match. '{own} != {other}'".format( + own=self.tz, other=other.tz + ) + ) def _maybe_clear_freq(self): self._freq = None @@ -555,8 +640,10 @@ def tz(self): @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. Use tz_localize() " + "or tz_convert() as appropriate" + ) @property def tzinfo(self): @@ -610,9 +697,9 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, length) - converted = tslib.ints_to_pydatetime(data[start_i:end_i], - tz=self.tz, freq=self.freq, - box="timestamp") + converted = tslib.ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) for v in converted: yield v @@ -623,11 +710,10 @@ def astype(self, dtype, copy=True): # DatetimeLikeArrayMixin Super handles the rest. dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): + if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype): # GH#18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: + new_tz = getattr(dtype, "tz", None) + if getattr(self.dtype, "tz", None) is None: return self.tz_localize(new_tz) result = self.tz_convert(new_tz) if new_tz is None: @@ -636,8 +722,7 @@ def astype(self, dtype, copy=True): # ndarray, but we could maybe work around it there. result = result._data return result - elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, - dtype): + elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, dtype): if copy: return self.copy() return self @@ -656,21 +741,23 @@ def _validate_fill_value(self, fill_value): self._assert_tzawareness_compat(fill_value) fill_value = Timestamp(fill_value).value else: - raise ValueError("'fill_value' should be a Timestamp. " - "Got '{got}'.".format(got=fill_value)) + raise ValueError( + "'fill_value' should be a Timestamp. 
" + "Got '{got}'.".format(got=fill_value) + ) return fill_value # ----------------------------------------------------------------- # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(self, date_format) - return tslib.format_array_from_datetime(self.asi8, - tz=self.tz, - format=fmt, - na_rep=na_rep) + return tslib.format_array_from_datetime( + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + ) # ----------------------------------------------------------------- # Comparison Methods @@ -684,12 +771,12 @@ def _has_same_tz(self, other): if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) - vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__')) + vzone = timezones.get_timezone(getattr(other, "tzinfo", "__no_tz__")) return zzone == vzone def _assert_tzawareness_compat(self, other): # adapted from _Timestamp._assert_tzawareness_compat - other_tz = getattr(other, 'tzinfo', None) + other_tz = getattr(other, "tzinfo", None) if is_datetime64tz_dtype(other): # Get tzinfo from Series dtype other_tz = other.dtype.tz @@ -698,11 +785,13 @@ def _assert_tzawareness_compat(self, other): pass elif self.tz is None: if other_tz is not None: - raise TypeError('Cannot compare tz-naive and tz-aware ' - 'datetime-like objects.') + raise TypeError( + "Cannot compare tz-naive and tz-aware " "datetime-like objects." + ) elif other_tz is None: - raise TypeError('Cannot compare tz-naive and tz-aware ' - 'datetime-like objects') + raise TypeError( + "Cannot compare tz-naive and tz-aware " "datetime-like objects" + ) # ----------------------------------------------------------------- # Arithmetic Methods @@ -718,18 +807,18 @@ def _sub_datetime_arraylike(self, other): if not self._has_same_tz(other): # require tz compat - raise TypeError("{cls} subtraction must have the same " - "timezones or no timezones" - .format(cls=type(self).__name__)) + raise TypeError( + "{cls} subtraction must have the same " + "timezones or no timezones".format(cls=type(self).__name__) + ) self_i8 = self.asi8 other_i8 = other.asi8 arr_mask = self._isnan | other._isnan - new_values = checked_add_with_arr(self_i8, -other_i8, - arr_mask=arr_mask) + new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) if self._hasnans or other._hasnans: new_values[arr_mask] = iNaT - return new_values.view('timedelta64[ns]') + return new_values.view("timedelta64[ns]") def _add_offset(self, offset): assert not isinstance(offset, Tick) @@ -743,11 +832,13 @@ def _add_offset(self, offset): result = result.tz_localize(self.tz) except NotImplementedError: - warnings.warn("Non-vectorized DateOffset being applied to Series " - "or DatetimeIndex", PerformanceWarning) - result = self.astype('O') + offset + warnings.warn( + "Non-vectorized DateOffset being applied to Series " "or DatetimeIndex", + PerformanceWarning, + ) + result = self.astype("O") + offset - return type(self)._from_sequence(result, freq='infer') + return type(self)._from_sequence(result, freq="infer") def _sub_datetimelike_scalar(self, other): # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]] @@ -759,14 +850,14 @@ def _sub_datetimelike_scalar(self, other): if not self._has_same_tz(other): # require tz compat - raise TypeError("Timestamp subtraction must 
have the same " - "timezones or no timezones") + raise TypeError( + "Timestamp subtraction must have the same " "timezones or no timezones" + ) i8 = self.asi8 - result = checked_add_with_arr(i8, -other.value, - arr_mask=self._isnan) + result = checked_add_with_arr(i8, -other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) - return result.view('timedelta64[ns]') + return result.view("timedelta64[ns]") def _add_delta(self, delta): """ @@ -783,7 +874,7 @@ def _add_delta(self, delta): result : DatetimeArray """ new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, tz=self.tz, freq='infer') + return type(self)._from_sequence(new_values, tz=self.tz, freq="infer") # ----------------------------------------------------------------- # Timezone Conversion and Localization Methods @@ -865,15 +956,15 @@ def tz_convert(self, tz): if self.tz is None: # tz naive, use tz_localize - raise TypeError('Cannot convert tz-naive timestamps, use ' - 'tz_localize to localize') + raise TypeError( + "Cannot convert tz-naive timestamps, use " "tz_localize to localize" + ) # No conversion since timestamps are all UTC to begin with dtype = tz_to_dtype(tz) return self._simple_new(self.asi8, dtype=dtype, freq=self.freq) - def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', - errors=None): + def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): """ Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index. @@ -1021,30 +1112,35 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', dtype: datetime64[ns, 'Europe/Warsaw'] """ if errors is not None: - warnings.warn("The errors argument is deprecated and will be " - "removed in a future release. Use " - "nonexistent='NaT' or nonexistent='raise' " - "instead.", FutureWarning) - if errors == 'coerce': - nonexistent = 'NaT' - elif errors == 'raise': - nonexistent = 'raise' + warnings.warn( + "The errors argument is deprecated and will be " + "removed in a future release. Use " + "nonexistent='NaT' or nonexistent='raise' " + "instead.", + FutureWarning, + ) + if errors == "coerce": + nonexistent = "NaT" + elif errors == "raise": + nonexistent = "raise" else: - raise ValueError("The errors argument must be either 'coerce' " - "or 'raise'.") + raise ValueError( + "The errors argument must be either 'coerce' " "or 'raise'." 
+ ) - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise'," + " 'NaT', 'shift_forward', 'shift_backward' or" + " a timedelta object" + ) if self.tz is not None: if tz is None: - new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, - self.tz) + new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: @@ -1052,7 +1148,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise', # Convert to UTC new_dates = conversion.tz_localize_to_utc( - self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent, + self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent ) new_dates = new_dates.view(_NS_DTYPE) dtype = tz_to_dtype(tz) @@ -1114,12 +1210,11 @@ def normalize(self): not_null = ~self.isna() DAY_NS = ccalendar.DAY_SECONDS * 1000000000 new_values = self.asi8.copy() - adjustment = (new_values[not_null] % DAY_NS) + adjustment = new_values[not_null] % DAY_NS new_values[not_null] = new_values[not_null] - adjustment else: new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) - return type(self)._from_sequence(new_values, - freq='infer').tz_localize(self.tz) + return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz) def to_period(self, freq=None): """ @@ -1168,15 +1263,19 @@ def to_period(self, freq=None): from pandas.core.arrays import PeriodArray if self.tz is not None: - warnings.warn("Converting to PeriodArray/Index representation " - "will drop timezone information.", UserWarning) + warnings.warn( + "Converting to PeriodArray/Index representation " + "will drop timezone information.", + UserWarning, + ) if freq is None: freq = self.freqstr or self.inferred_freq if freq is None: - raise ValueError("You must pass a freq argument as " - "current index has none.") + raise ValueError( + "You must pass a freq argument as " "current index has none." 
+ ) freq = get_period_alias(freq) @@ -1198,8 +1297,9 @@ def to_perioddelta(self, freq): """ # TODO: consider privatizing (discussion in GH#23113) from pandas.core.arrays.timedeltas import TimedeltaArray + i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 - m8delta = i8delta.view('m8[ns]') + m8delta = i8delta.view("m8[ns]") return TimedeltaArray(m8delta) # ----------------------------------------------------------------- @@ -1236,8 +1336,7 @@ def month_name(self, locale=None): else: values = self.asi8 - result = fields.get_date_name_field(values, 'month_name', - locale=locale) + result = fields.get_date_name_field(values, "month_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) return result @@ -1272,8 +1371,7 @@ def day_name(self, locale=None): else: values = self.asi8 - result = fields.get_date_name_field(values, 'day_name', - locale=locale) + result = fields.get_date_name_field(values, "day_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) return result @@ -1316,19 +1414,17 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "The year of the datetime.") - month = _field_accessor('month', 'M', - "The month as January=1, December=12. ") - day = _field_accessor('day', 'D', "The days of the datetime.") - hour = _field_accessor('hour', 'h', "The hours of the datetime.") - minute = _field_accessor('minute', 'm', "The minutes of the datetime.") - second = _field_accessor('second', 's', "The seconds of the datetime.") - microsecond = _field_accessor('microsecond', 'us', - "The microseconds of the datetime.") - nanosecond = _field_accessor('nanosecond', 'ns', - "The nanoseconds of the datetime.") - weekofyear = _field_accessor('weekofyear', 'woy', - "The week ordinal of the year.") + year = _field_accessor("year", "Y", "The year of the datetime.") + month = _field_accessor("month", "M", "The month as January=1, December=12. ") + day = _field_accessor("day", "D", "The days of the datetime.") + hour = _field_accessor("hour", "h", "The hours of the datetime.") + minute = _field_accessor("minute", "m", "The minutes of the datetime.") + second = _field_accessor("second", "s", "The seconds of the datetime.") + microsecond = _field_accessor( + "microsecond", "us", "The microseconds of the datetime." + ) + nanosecond = _field_accessor("nanosecond", "ns", "The nanoseconds of the datetime.") + weekofyear = _field_accessor("weekofyear", "woy", "The week ordinal of the year.") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1364,21 +1460,20 @@ def date(self): 2017-01-08 6 Freq: D, dtype: int64 """ - dayofweek = _field_accessor('dayofweek', 'dow', _dayofweek_doc) + dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) weekday = dayofweek weekday_name = _field_accessor( - 'weekday_name', - 'weekday_name', - "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") + "weekday_name", + "weekday_name", + "The name of day in a week (ex: Friday)\n\n.. 
deprecated:: 0.23.0", + ) - dayofyear = _field_accessor('dayofyear', 'doy', - "The ordinal day of the year.") - quarter = _field_accessor('quarter', 'q', "The quarter of the date.") + dayofyear = _field_accessor("dayofyear", "doy", "The ordinal day of the year.") + quarter = _field_accessor("quarter", "q", "The quarter of the date.") days_in_month = _field_accessor( - 'days_in_month', - 'dim', - "The number of days in the month.") + "days_in_month", "dim", "The number of days in the month." + ) daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. @@ -1425,18 +1520,16 @@ def date(self): array([False, True, False]) """ is_month_start = _field_accessor( - 'is_month_start', - 'is_month_start', - _is_month_doc.format(first_or_last='first')) + "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first") + ) is_month_end = _field_accessor( - 'is_month_end', - 'is_month_end', - _is_month_doc.format(first_or_last='last')) + "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last") + ) is_quarter_start = _field_accessor( - 'is_quarter_start', - 'is_quarter_start', + "is_quarter_start", + "is_quarter_start", """ Indicator for whether the date is the first day of a quarter. @@ -1474,10 +1567,11 @@ def date(self): >>> idx.is_quarter_start array([False, False, True, False]) - """) + """, + ) is_quarter_end = _field_accessor( - 'is_quarter_end', - 'is_quarter_end', + "is_quarter_end", + "is_quarter_end", """ Indicator for whether the date is the last day of a quarter. @@ -1515,10 +1609,11 @@ def date(self): >>> idx.is_quarter_end array([False, True, False, False]) - """) + """, + ) is_year_start = _field_accessor( - 'is_year_start', - 'is_year_start', + "is_year_start", + "is_year_start", """ Indicate whether the date is the first day of a year. @@ -1558,10 +1653,11 @@ def date(self): >>> idx.is_year_start array([False, False, True]) - """) + """, + ) is_year_end = _field_accessor( - 'is_year_end', - 'is_year_end', + "is_year_end", + "is_year_end", """ Indicate whether the date is the last day of the year. @@ -1601,10 +1697,11 @@ def date(self): >>> idx.is_year_end array([False, True, False]) - """) + """, + ) is_leap_year = _field_accessor( - 'is_leap_year', - 'is_leap_year', + "is_leap_year", + "is_leap_year", """ Boolean indicator if the date belongs to a leap year. 
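As a quick illustration of what the field accessors defined above expose, here is a minimal usage sketch; the dates are illustrative and not part of the patch (assumes a recent pandas with these DatetimeIndex accessors):

import pandas as pd

# Small daily DatetimeIndex exercising a few of the generated field accessors.
idx = pd.date_range("2017-12-30", periods=4, freq="D")

print(idx.dayofweek)       # Monday=0 ... Sunday=6, so [5, 6, 0, 1]
print(idx.is_month_start)  # [False, False, True, False] -- 2018-01-01 starts a month
print(idx.is_year_start)   # [False, False, True, False]
print(idx.is_leap_year)    # [False, False, False, False] -- neither 2017 nor 2018 is a leap year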
@@ -1641,7 +1738,8 @@ def date(self): 1 False 2 False dtype: bool - """) + """, + ) def to_julian_date(self): """ @@ -1657,19 +1755,23 @@ def to_julian_date(self): testarr = month < 3 year[testarr] -= 1 month[testarr] += 12 - return (day + - np.fix((153 * month - 457) / 5) + - 365 * year + - np.floor(year / 4) - - np.floor(year / 100) + - np.floor(year / 400) + - 1721118.5 + - (self.hour + - self.minute / 60.0 + - self.second / 3600.0 + - self.microsecond / 3600.0 / 1e+6 + - self.nanosecond / 3600.0 / 1e+9 - ) / 24.0) + return ( + day + + np.fix((153 * month - 457) / 5) + + 365 * year + + np.floor(year / 4) + - np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + ( + self.hour + + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e6 + + self.nanosecond / 3600.0 / 1e9 + ) + / 24.0 + ) DatetimeArray._add_comparison_ops() @@ -1678,10 +1780,17 @@ def to_julian_date(self): # ------------------------------------------------------------------- # Constructor Helpers -def sequence_to_dt64ns(data, dtype=None, copy=False, - tz=None, - dayfirst=False, yearfirst=False, ambiguous='raise', - int_as_wall_time=False): + +def sequence_to_dt64ns( + data, + dtype=None, + copy=False, + tz=None, + dayfirst=False, + yearfirst=False, + ambiguous="raise", + int_as_wall_time=False, +): """ Parameters ---------- @@ -1748,13 +1857,14 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False - if lib.infer_dtype(data, skipna=False) == 'integer': + if lib.infer_dtype(data, skipna=False) == "integer": data = data.astype(np.int64) else: # data comes back here as either i8 to denote UTC timestamps # or M8[ns] to denote wall times data, inferred_tz = objects_to_datetime64ns( - data, dayfirst=dayfirst, yearfirst=yearfirst) + data, dayfirst=dayfirst, yearfirst=yearfirst + ) tz = maybe_infer_tz(tz, inferred_tz) # When a sequence of timestamp objects is passed, we always # want to treat the (now i8-valued) data as UTC timestamps, @@ -1777,8 +1887,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, if tz is not None: # Convert tz-naive to UTC tz = timezones.maybe_get_tz(tz) - data = conversion.tz_localize_to_utc(data.view('i8'), tz, - ambiguous=ambiguous) + data = conversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous + ) data = data.view(_NS_DTYPE) assert data.dtype == _NS_DTYPE, data.dtype @@ -1794,8 +1905,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, data = data.astype(np.int64, copy=False) if int_as_wall_time and tz is not None and not timezones.is_utc(tz): warnings.warn(_i8_message, FutureWarning, stacklevel=4) - data = conversion.tz_localize_to_utc(data.view('i8'), tz, - ambiguous=ambiguous) + data = conversion.tz_localize_to_utc( + data.view("i8"), tz, ambiguous=ambiguous + ) data = data.view(_NS_DTYPE) result = data.view(_NS_DTYPE) @@ -1804,7 +1916,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, result = result.copy() assert isinstance(result, np.ndarray), type(result) - assert result.dtype == 'M8[ns]', result.dtype + assert result.dtype == "M8[ns]", result.dtype # We have to call this again after possibly inferring a tz above validate_tz_from_dtype(dtype, tz) @@ -1812,9 +1924,15 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, return result, tz, inferred_freq -def objects_to_datetime64ns(data, dayfirst, yearfirst, - utc=False, errors="raise", - require_iso8601=False, allow_object=False): +def 
objects_to_datetime64ns( + data, + dayfirst, + yearfirst, + utc=False, + errors="raise", + require_iso8601=False, + allow_object=False, +): """ Convert data to array of timestamps. @@ -1854,14 +1972,14 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - require_iso8601=require_iso8601 + require_iso8601=require_iso8601, ) except ValueError as e: try: values, tz_parsed = conversion.datetime_to_datetime64(data) # If tzaware, these values represent unix timestamps, so we # return them as i8 to distinguish from wall times - return values.view('i8'), tz_parsed + return values.view("i8"), tz_parsed except (ValueError, TypeError): raise e @@ -1869,7 +1987,7 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst, # We can take a shortcut since the datetime64 numpy array # is in UTC # Return i8 values to denote unix timestamps - return result.view('i8'), tz_parsed + return result.view("i8"), tz_parsed elif is_datetime64_dtype(result): # returning M8[ns] denotes wall-times; since tz is None # the distinction is a thin one @@ -1917,16 +2035,20 @@ def maybe_convert_dtype(data, copy): # with integer dtypes. See discussion in GH#23675 elif is_timedelta64_dtype(data): - warnings.warn("Passing timedelta64-dtype data is deprecated, will " - "raise a TypeError in a future version", - FutureWarning, stacklevel=5) + warnings.warn( + "Passing timedelta64-dtype data is deprecated, will " + "raise a TypeError in a future version", + FutureWarning, + stacklevel=5, + ) data = data.view(_NS_DTYPE) elif is_period_dtype(data): # Note: without explicitly raising here, PeriodIndex # test_setops.test_join_does_not_recur fails - raise TypeError("Passing PeriodDtype data is invalid. " - "Use `data.to_timestamp()` instead") + raise TypeError( + "Passing PeriodDtype data is invalid. " "Use `data.to_timestamp()` instead" + ) elif is_categorical_dtype(data): # GH#18664 preserve tz in going DTI->Categorical->DTI @@ -1947,6 +2069,7 @@ def maybe_convert_dtype(data, copy): # ------------------------------------------------------------------- # Validation and Inference + def maybe_infer_tz(tz, inferred_tz): """ If a timezone is inferred from data, check that it is compatible with @@ -1970,9 +2093,10 @@ def maybe_infer_tz(tz, inferred_tz): elif inferred_tz is None: pass elif not timezones.tz_compare(tz, inferred_tz): - raise TypeError('data is already tz-aware {inferred_tz}, unable to ' - 'set specified tz: {tz}' - .format(inferred_tz=inferred_tz, tz=tz)) + raise TypeError( + "data is already tz-aware {inferred_tz}, unable to " + "set specified tz: {tz}".format(inferred_tz=inferred_tz, tz=tz) + ) return tz @@ -2003,17 +2127,21 @@ def _validate_dt64_dtype(dtype): if is_dtype_equal(dtype, np.dtype("M8")): # no precision, warn dtype = _NS_DTYPE - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ Passing in 'datetime64' dtype with no precision is deprecated and will raise in a future version. Please pass in - 'datetime64[ns]' instead.""") + 'datetime64[ns]' instead.""" + ) warnings.warn(msg, FutureWarning, stacklevel=5) - if ((isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) - or not isinstance(dtype, (np.dtype, DatetimeTZDtype))): - raise ValueError("Unexpected value for 'dtype': '{dtype}'. " - "Must be 'datetime64[ns]' or DatetimeTZDtype'." - .format(dtype=dtype)) + if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance( + dtype, (np.dtype, DatetimeTZDtype) + ): + raise ValueError( + "Unexpected value for 'dtype': '{dtype}'. 
" + "Must be 'datetime64[ns]' or DatetimeTZDtype'.".format(dtype=dtype) + ) return dtype @@ -2046,19 +2174,20 @@ def validate_tz_from_dtype(dtype, tz): # but not by us. We *do* allow non-existent tz errors to # go through pass - dtz = getattr(dtype, 'tz', None) + dtz = getattr(dtype, "tz", None) if dtz is not None: if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a dtype" - " with a tz") + raise ValueError("cannot supply both a tz and a dtype" " with a tz") tz = dtz if tz is not None and is_datetime64_dtype(dtype): # We also need to check for the case where the user passed a # tz-naive dtype (i.e. datetime64[ns]) if tz is not None and not timezones.tz_compare(tz, dtz): - raise ValueError("cannot supply both a tz and a " - "timezone-naive dtype (i.e. datetime64[ns])") + raise ValueError( + "cannot supply both a tz and a " + "timezone-naive dtype (i.e. datetime64[ns])" + ) return tz @@ -2086,16 +2215,16 @@ def _infer_tz_from_endpoints(start, end, tz): try: inferred_tz = timezones.infer_tzinfo(start, end) except Exception: - raise TypeError('Start and end cannot both be tz-aware with ' - 'different timezones') + raise TypeError( + "Start and end cannot both be tz-aware with " "different timezones" + ) inferred_tz = timezones.maybe_get_tz(inferred_tz) tz = timezones.maybe_get_tz(tz) if tz is not None and inferred_tz is not None: if not timezones.tz_compare(inferred_tz, tz): - raise AssertionError("Inferred time zone not equal to passed " - "time zone") + raise AssertionError("Inferred time zone not equal to passed " "time zone") elif inferred_tz is not None: tz = inferred_tz @@ -2123,8 +2252,7 @@ def _maybe_normalize_endpoints(start, end, normalize): return start, end, _normalized -def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, - nonexistent): +def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent): """ Localize a start or end Timestamp to the timezone of the corresponding start or end Timestamp @@ -2149,10 +2277,9 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, if is_none is None and is_not_none is not None: # Note: We can't ambiguous='infer' a singular ambiguous time; however, # we have historically defaulted ambiguous=False - ambiguous = ambiguous if ambiguous != 'infer' else False - localize_args = {'ambiguous': ambiguous, 'nonexistent': nonexistent, - 'tz': None} + ambiguous = ambiguous if ambiguous != "infer" else False + localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None} if isinstance(freq, Tick) or freq is None: - localize_args['tz'] = tz + localize_args["tz"] = tz ts = ts.tz_localize(**localize_args) return ts diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 644c2f634240f..c999c4db232e6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -12,8 +12,15 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar) + is_bool_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna @@ -32,23 +39,23 @@ class 
_IntegerDtype(ExtensionDtype): The attributes name & type are set when these subclasses are created. """ + name = None # type: str base = None type = None # type: Type na_value = np.nan def __repr__(self): - sign = 'U' if self.is_unsigned_integer else '' - return "{sign}Int{size}Dtype()".format(sign=sign, - size=8 * self.itemsize) + sign = "U" if self.is_unsigned_integer else "" + return "{sign}Int{size}Dtype()".format(sign=sign, size=8 * self.itemsize) @cache_readonly def is_signed_integer(self): - return self.kind == 'i' + return self.kind == "i" @cache_readonly def is_unsigned_integer(self): - return self.kind == 'u' + return self.kind == "u" @property def _is_numeric(self): @@ -111,15 +118,18 @@ def safe_cast(values, dtype, copy): """ try: - return values.astype(dtype, casting='safe', copy=copy) + return values.astype(dtype, casting="safe", copy=copy) except TypeError: casted = values.astype(dtype, copy=copy) if (casted == values).all(): return casted - raise TypeError("cannot safely cast non-equivalent {} to {}".format( - values.dtype, np.dtype(dtype))) + raise TypeError( + "cannot safely cast non-equivalent {} to {}".format( + values.dtype, np.dtype(dtype) + ) + ) def coerce_to_array(values, dtype, mask=None, copy=False): @@ -139,13 +149,14 @@ def coerce_to_array(values, dtype, mask=None, copy=False): tuple of (values, mask) """ # if values is integer numpy array, preserve it's dtype - if dtype is None and hasattr(values, 'dtype'): + if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype if dtype is not None: - if (isinstance(dtype, str) and - (dtype.startswith("Int") or dtype.startswith("UInt"))): + if isinstance(dtype, str) and ( + dtype.startswith("Int") or dtype.startswith("UInt") + ): # Avoid DeprecationWarning from NumPy about np.dtype("Int64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() @@ -169,20 +180,26 @@ def coerce_to_array(values, dtype, mask=None, copy=False): values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) - if inferred_type == 'empty': + if inferred_type == "empty": values = np.empty(len(values)) values.fill(np.nan) - elif inferred_type not in ['floating', 'integer', - 'mixed-integer', 'mixed-integer-float']: - raise TypeError("{} cannot be converted to an IntegerDtype".format( - values.dtype)) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "mixed-integer-float", + ]: + raise TypeError( + "{} cannot be converted to an IntegerDtype".format(values.dtype) + ) elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): - raise TypeError("{} cannot be converted to an IntegerDtype".format( - values.dtype)) + raise TypeError( + "{} cannot be converted to an IntegerDtype".format(values.dtype) + ) if mask is None: mask = isna(values) @@ -196,7 +213,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): # infer dtype if needed if dtype is None: - dtype = np.dtype('int64') + dtype = np.dtype("int64") else: dtype = dtype.type @@ -284,13 +301,16 @@ def dtype(self): return _dtypes[str(self._data.dtype)] def __init__(self, values, mask, copy=False): - if not (isinstance(values, np.ndarray) - and is_integer_dtype(values.dtype)): - raise TypeError("values should be integer numpy array. 
Use " - "the 'integer_array' function instead") + if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)): + raise TypeError( + "values should be integer numpy array. Use " + "the 'integer_array' function instead" + ) if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)): - raise TypeError("mask should be boolean numpy array. Use " - "the 'integer_array' function instead") + raise TypeError( + "mask should be boolean numpy array. Use " + "the 'integer_array' function instead" + ) if copy: values = values.copy() @@ -315,8 +335,9 @@ def _from_factorized(cls, values, original): def _formatter(self, boxed=False): def fmt(x): if isna(x): - return 'NaN' + return "NaN" return str(x) + return fmt def __getitem__(self, item): @@ -350,10 +371,10 @@ def __array__(self, dtype=None): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # For IntegerArray inputs, we apply the ufunc to ._data # and mask the result. - if method == 'reduce': + if method == "reduce": # Not clear how to handle missing values in reductions. Raise. raise NotImplementedError("The 'reduce' method is not supported.") - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)): @@ -361,7 +382,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -404,11 +426,11 @@ def take(self, indexer, allow_fill=False, fill_value=None): # we always fill with 1 internally # to avoid upcasting data_fill_value = 1 if isna(fill_value) else fill_value - result = take(self._data, indexer, fill_value=data_fill_value, - allow_fill=allow_fill) + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) - mask = take(self._mask, indexer, fill_value=True, - allow_fill=allow_fill) + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) # if we are filling # we only fill where the indexer is null @@ -545,9 +567,10 @@ def value_counts(self, dropna=True): # appending to an Index *always* infers # w/o passing the dtype array = np.append(array, [self._mask.sum()]) - index = Index(np.concatenate( - [index.values, - np.array([np.nan], dtype=object)]), dtype=object) + index = Index( + np.concatenate([index.values, np.array([np.nan], dtype=object)]), + dtype=object, + ) return Series(array, index=index) @@ -585,7 +608,7 @@ def cmp_method(self, other): elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") other = lib.item_from_zerodim(other) @@ -593,7 +616,7 @@ def cmp_method(self, other): # comparisons, this will raise in the future with warnings.catch_warnings(): warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self._data, other) # nans propagate @@ -602,10 +625,10 @@ def cmp_method(self, other): else: mask = self._mask | mask - result[mask] = op_name == 'ne' + result[mask] = op_name == "ne" return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): @@ -614,19 +637,19 
@@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): - data = self._data.astype('float64') + data = self._data.astype("float64") data[mask] = self._na_value - op = getattr(nanops, 'nan' + name) + op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask) # if we have a boolean op, don't coerce - if name in ['any', 'all']: + if name in ["any", "all"]: pass # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ['sum', 'min', 'max', 'prod'] and notna(result): + elif name in ["sum", "min", "max", "prod"] and notna(result): int_result = int(result) if int_result == result: result = int_result @@ -651,8 +674,9 @@ def _maybe_mask_result(self, result, mask, other, op_name): # if we have a float operand we are by-definition # a float result # or our op is a divide - if ((is_float_dtype(other) or is_float(other)) or - (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])): + if (is_float_dtype(other) or is_float(other)) or ( + op_name in ["rtruediv", "truediv", "rdiv", "div"] + ): result[mask] = np.nan return result @@ -669,14 +693,13 @@ def integer_arithmetic_method(self, other): # Rely on pandas to unbox and dispatch to us. return NotImplemented - if getattr(other, 'ndim', 0) > 1: - raise NotImplementedError( - "can only perform ops with 1-d structures") + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") if isinstance(other, IntegerArray): other, mask = other._data, other._mask - elif getattr(other, 'ndim', None) == 0: + elif getattr(other, "ndim", None) == 0: other = other.item() elif is_list_like(other): @@ -685,8 +708,7 @@ def integer_arithmetic_method(self, other): other = other.item() elif other.ndim == 1: if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError( - "can only perform ops with numeric values") + raise TypeError("can only perform ops with numeric values") else: if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with numeric values") @@ -698,24 +720,26 @@ def integer_arithmetic_method(self, other): mask = self._mask | mask # 1 ** np.nan is 1. So we have to unmask those. 
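To make the masked-arithmetic and reduction behaviour in this hunk concrete, a brief sketch using the public nullable Int64 dtype (the values are illustrative, not taken from the patch):

import pandas as pd

# Arithmetic propagates the missing-value mask; reductions skip missing
# values by default (skipna=True), mirroring the _reduce logic above.
s = pd.Series([1, 2, None], dtype="Int64")

print(s + 10)    # third entry stays missing
print(s.sum())   # 3 -- the missing value is skipped and the result stays an integer
print(s.max())   # 2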
- if op_name == 'pow': + if op_name == "pow": mask = np.where(self == 1, False, mask) - elif op_name == 'rpow': + elif op_name == "rpow": mask = np.where(other == 1, False, mask) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self._data, other) # divmod returns a tuple - if op_name == 'divmod': + if op_name == "divmod": div, mod = result - return (self._maybe_mask_result(div, mask, other, 'floordiv'), - self._maybe_mask_result(mod, mask, other, 'mod')) + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) return self._maybe_mask_result(result, mask, other, op_name) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return set_function_name(integer_arithmetic_method, name, cls) @@ -739,76 +763,108 @@ def integer_arithmetic_method(self, other): # create the Dtype Int8Dtype = register_extension_dtype( - type('Int8Dtype', (_IntegerDtype, ), { - 'type': np.int8, - 'name': 'Int8', - '__doc__': _dtype_docstring.format(dtype='int8') - }) + type( + "Int8Dtype", + (_IntegerDtype,), + { + "type": np.int8, + "name": "Int8", + "__doc__": _dtype_docstring.format(dtype="int8"), + }, + ) ) Int16Dtype = register_extension_dtype( - type('Int16Dtype', (_IntegerDtype, ), { - 'type': np.int16, - 'name': 'Int16', - '__doc__': _dtype_docstring.format(dtype='int16') - }) + type( + "Int16Dtype", + (_IntegerDtype,), + { + "type": np.int16, + "name": "Int16", + "__doc__": _dtype_docstring.format(dtype="int16"), + }, + ) ) Int32Dtype = register_extension_dtype( - type('Int32Dtype', (_IntegerDtype, ), { - 'type': np.int32, - 'name': 'Int32', - '__doc__': _dtype_docstring.format(dtype='int32') - }) + type( + "Int32Dtype", + (_IntegerDtype,), + { + "type": np.int32, + "name": "Int32", + "__doc__": _dtype_docstring.format(dtype="int32"), + }, + ) ) Int64Dtype = register_extension_dtype( - type('Int64Dtype', (_IntegerDtype, ), { - 'type': np.int64, - 'name': 'Int64', - '__doc__': _dtype_docstring.format(dtype='int64') - }) + type( + "Int64Dtype", + (_IntegerDtype,), + { + "type": np.int64, + "name": "Int64", + "__doc__": _dtype_docstring.format(dtype="int64"), + }, + ) ) UInt8Dtype = register_extension_dtype( - type('UInt8Dtype', (_IntegerDtype, ), { - 'type': np.uint8, - 'name': 'UInt8', - '__doc__': _dtype_docstring.format(dtype='uint8') - }) + type( + "UInt8Dtype", + (_IntegerDtype,), + { + "type": np.uint8, + "name": "UInt8", + "__doc__": _dtype_docstring.format(dtype="uint8"), + }, + ) ) UInt16Dtype = register_extension_dtype( - type('UInt16Dtype', (_IntegerDtype, ), { - 'type': np.uint16, - 'name': 'UInt16', - '__doc__': _dtype_docstring.format(dtype='uint16') - }) + type( + "UInt16Dtype", + (_IntegerDtype,), + { + "type": np.uint16, + "name": "UInt16", + "__doc__": _dtype_docstring.format(dtype="uint16"), + }, + ) ) UInt32Dtype = register_extension_dtype( - type('UInt32Dtype', (_IntegerDtype, ), { - 'type': np.uint32, - 'name': 'UInt32', - '__doc__': _dtype_docstring.format(dtype='uint32') - }) + type( + "UInt32Dtype", + (_IntegerDtype,), + { + "type": np.uint32, + "name": "UInt32", + "__doc__": _dtype_docstring.format(dtype="uint32"), + }, + ) ) UInt64Dtype = register_extension_dtype( - type('UInt64Dtype', (_IntegerDtype, ), { - 'type': np.uint64, - 'name': 'UInt64', - '__doc__': _dtype_docstring.format(dtype='uint64') - }) + type( + "UInt64Dtype", + (_IntegerDtype,), + { + "type": np.uint64, + "name": "UInt64", + "__doc__": _dtype_docstring.format(dtype="uint64"), + }, 
+ ) ) _dtypes = { - 'int8': Int8Dtype(), - 'int16': Int16Dtype(), - 'int32': Int32Dtype(), - 'int64': Int64Dtype(), - 'uint8': UInt8Dtype(), - 'uint16': UInt16Dtype(), - 'uint32': UInt32Dtype(), - 'uint64': UInt64Dtype(), + "int8": Int8Dtype(), + "int16": Int16Dtype(), + "int32": Int32Dtype(), + "int64": Int64Dtype(), + "uint8": UInt8Dtype(), + "uint16": UInt16Dtype(), + "uint32": UInt32Dtype(), + "uint64": UInt64Dtype(), } diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cf8ca25857f4e..f9fbd7ada376e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -5,38 +5,49 @@ from pandas._config import get_option -from pandas._libs.interval import ( - Interval, IntervalMixin, intervals_to_interval_bounds) +from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype, - is_integer_dtype, is_interval, is_interval_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64_any_dtype, + is_float_dtype, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries) + ABCDatetimeIndex, + ABCInterval, + ABCIntervalIndex, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna -from pandas.core.arrays.base import ( - ExtensionArray, _extension_array_shared_docs) +from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com from pandas.core.indexes.base import Index, ensure_index -_VALID_CLOSED = {'left', 'right', 'both', 'neither'} +_VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} _shared_docs_kwargs = dict( - klass='IntervalArray', - qualname='arrays.IntervalArray', - name='' + klass="IntervalArray", qualname="arrays.IntervalArray", name="" ) -_interval_shared_docs['class'] = """ +_interval_shared_docs[ + "class" +] = """ %(summary)s .. versionadded:: %(versionadded)s @@ -99,14 +110,17 @@ """ -@Appender(_interval_shared_docs['class'] % dict( - klass="IntervalArray", - summary="Pandas array for interval data that are closed on the same side.", - versionadded="0.24.0", - name='', - extra_attributes='', - extra_methods='', - examples=textwrap.dedent("""\ +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side.", + versionadded="0.24.0", + name="", + extra_attributes="", + extra_methods="", + examples=textwrap.dedent( + """\ Examples -------- A new ``IntervalArray`` can be constructed directly from an array-like of @@ -120,16 +134,17 @@ It may also be constructed using one of the constructor methods: :meth:`IntervalArray.from_arrays`, :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. 
- """), -)) + """ + ), + ) +) class IntervalArray(IntervalMixin, ExtensionArray): dtype = IntervalDtype() ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan - def __new__(cls, data, closed=None, dtype=None, copy=False, - verify_integrity=True): + def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True): if isinstance(data, ABCSeries) and is_interval_dtype(data): data = data.values @@ -142,25 +157,35 @@ def __new__(cls, data, closed=None, dtype=None, copy=False, # don't allow scalars if is_scalar(data): - msg = ("{}(...) must be called with a collection of some kind," - " {} was passed") + msg = ( + "{}(...) must be called with a collection of some kind," + " {} was passed" + ) raise TypeError(msg.format(cls.__name__, data)) # might need to convert empty or purely na data data = maybe_convert_platform_interval(data) left, right, infer_closed = intervals_to_interval_bounds( - data, validate_closed=closed is None) + data, validate_closed=closed is None + ) closed = closed or infer_closed - return cls._simple_new(left, right, closed, copy=copy, dtype=dtype, - verify_integrity=verify_integrity) + return cls._simple_new( + left, + right, + closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) @classmethod - def _simple_new(cls, left, right, closed=None, - copy=False, dtype=None, verify_integrity=True): + def _simple_new( + cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True + ): result = IntervalMixin.__new__(cls) - closed = closed or 'right' + closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) @@ -168,7 +193,7 @@ def _simple_new(cls, left, right, closed=None, # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): - msg = 'dtype must be an IntervalDtype, got {dtype}' + msg = "dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg.format(dtype=dtype)) elif dtype.subtype is not None: left = left.astype(dtype.subtype) @@ -181,22 +206,25 @@ def _simple_new(cls, left, right, closed=None, left = left.astype(right.dtype) if type(left) != type(right): - msg = ('must not have differing left [{ltype}] and right ' - '[{rtype}] types') - raise ValueError(msg.format(ltype=type(left).__name__, - rtype=type(right).__name__)) + msg = "must not have differing left [{ltype}] and right " "[{rtype}] types" + raise ValueError( + msg.format(ltype=type(left).__name__, rtype=type(right).__name__) + ) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalArray') + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalArray" + ) raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): - msg = 'Period dtypes are not supported, use a PeriodIndex instead' + msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) - elif (isinstance(left, ABCDatetimeIndex) and - str(left.tz) != str(right.tz)): - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") + elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): + msg = ( + "left and right must have the same time zone, got " + "'{left_tz}' and '{right_tz}'" + ) raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) result._left = left @@ -219,7 +247,9 @@ def _from_factorized(cls, values, original): values = 
values.astype(original.dtype.subtype) return cls(values, closed=original.closed) - _interval_shared_docs['from_breaks'] = """ + _interval_shared_docs[ + "from_breaks" + ] = """ Construct an %(klass)s from an array of splits. Parameters @@ -255,14 +285,15 @@ def _from_factorized(cls, values, original): """ @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs) - def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs) + def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) - return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, - dtype=dtype) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) - _interval_shared_docs['from_arrays'] = """ + _interval_shared_docs[ + "from_arrays" + ] = """ Construct from two arrays defining the left and right bounds. Parameters @@ -317,15 +348,18 @@ def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): """ @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs) - def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs) + def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) - return cls._simple_new(left, right, closed, copy=copy, - dtype=dtype, verify_integrity=True) + return cls._simple_new( + left, right, closed, copy=copy, dtype=dtype, verify_integrity=True + ) - _interval_shared_docs['from_intervals'] = """ + _interval_shared_docs[ + "from_intervals" + ] = """ Construct an %(klass)s from a 1d array of Interval objects .. deprecated:: 0.23.0 @@ -367,7 +401,9 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): closed='right', dtype='interval[int64]') """ - _interval_shared_docs['from_tuples'] = """ + _interval_shared_docs[ + "from_tuples" + ] = """ Construct an %(klass)s from an array-like of tuples Parameters @@ -404,8 +440,8 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): """ @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs) - def from_tuples(cls, data, closed='right', copy=False, dtype=None): + @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs) + def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): left, right = [], [] else: @@ -421,18 +457,19 @@ def from_tuples(cls, data, closed='right', copy=False, dtype=None): # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] lhs, rhs = d except ValueError: - msg = ('{name}.from_tuples requires tuples of ' - 'length 2, got {tpl}').format(name=name, tpl=d) + msg = ( + "{name}.from_tuples requires tuples of " "length 2, got {tpl}" + ).format(name=name, tpl=d) raise ValueError(msg) except TypeError: - msg = ('{name}.from_tuples received an invalid ' - 'item, {tpl}').format(name=name, tpl=d) + msg = ( + "{name}.from_tuples received an invalid " "item, {tpl}" + ).format(name=name, tpl=d) raise TypeError(msg) left.append(lhs) right.append(rhs) - return cls.from_arrays(left, right, closed, copy=False, - dtype=dtype) + return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) def _validate(self): """Verify that the IntervalArray is valid. 
@@ -445,17 +482,20 @@ def _validate(self): * left is always below right """ if self.closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': {closed}" - .format(closed=self.closed)) + raise ValueError( + "invalid option for 'closed': {closed}".format(closed=self.closed) + ) if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') + raise ValueError("left and right must have the same length") left_mask = notna(self.left) right_mask = notna(self.right) if not (left_mask == right_mask).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') + raise ValueError( + "missing values must be missing in the same " + "location both left and right sides" + ) if not (self.left[left_mask] <= self.right[left_mask]).all(): - raise ValueError('left side of interval must be <= right side') + raise ValueError("left side of interval must be <= right side") # --------- # Interface @@ -487,10 +527,10 @@ def __setitem__(self, key, value): needs_float_conversion = True elif is_datetime64_any_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array - value = np.datetime64('NaT') + value = np.datetime64("NaT") elif is_timedelta64_dtype(self.dtype.subtype): # need proper NaT to set directly on the numpy array - value = np.timedelta64('NaT') + value = np.timedelta64("NaT") value_left, value_right = value, value # scalar interval @@ -512,13 +552,13 @@ def __setitem__(self, key, value): # forced to copy, update the copy, and swap in the new values. left = self.left.copy(deep=True) if needs_float_conversion: - left = left.astype('float') + left = left.astype("float") left.values[key] = value_left self._left = left right = self.right.copy(deep=True) if needs_float_conversion: - right = right.astype('float') + right = right.astype("float") right.values[key] = value_right self._right = right @@ -550,18 +590,20 @@ def fillna(self, value=None, method=None, limit=None): filled : IntervalArray with NA/NaN filled """ if method is not None: - raise TypeError('Filling by method is not supported for ' - 'IntervalArray.') + raise TypeError("Filling by method is not supported for " "IntervalArray.") if limit is not None: - raise TypeError('limit is not supported for IntervalArray.') + raise TypeError("limit is not supported for IntervalArray.") if not isinstance(value, ABCInterval): - msg = ("'IntervalArray.fillna' only supports filling with a " - "scalar 'pandas.Interval'. Got a '{}' instead." - .format(type(value).__name__)) + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "scalar 'pandas.Interval'. 
Got a '{}' instead.".format( + type(value).__name__ + ) + ) raise TypeError(msg) - value = getattr(value, '_values', value) + value = getattr(value, "_values", value) self._check_closed_matches(value, name="value") left = self.left.fillna(value=value.left) @@ -601,8 +643,10 @@ def astype(self, dtype, copy=True): new_left = self.left.astype(dtype.subtype) new_right = self.right.astype(dtype.subtype) except TypeError: - msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' - 'incompatible') + msg = ( + "Cannot convert {dtype} to {new_dtype}; subtypes are " + "incompatible" + ) raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): @@ -611,7 +655,7 @@ def astype(self, dtype, copy=True): try: return np.asarray(self).astype(dtype, copy=copy) except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) @classmethod @@ -674,8 +718,7 @@ def _shallow_copy(self, left=None, right=None, closed=None): pass closed = closed or self.closed - return self._simple_new( - left, right, closed=closed, verify_integrity=False) + return self._simple_new(left, right, closed=closed, verify_integrity=False) def copy(self): """ @@ -707,8 +750,7 @@ def size(self): def shape(self): return self.left.shape - def take(self, indices, allow_fill=False, fill_value=None, axis=None, - **kwargs): + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. @@ -763,18 +805,23 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, if fill_value is None: fill_left = fill_right = self.left._na_value elif is_interval(fill_value): - self._check_closed_matches(fill_value, name='fill_value') + self._check_closed_matches(fill_value, name="fill_value") fill_left, fill_right = fill_value.left, fill_value.right elif not is_scalar(fill_value) and notna(fill_value): - msg = ("'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. Got a '{}' instead." - .format(type(fill_value).__name__)) + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. Got a '{}' instead.".format( + type(fill_value).__name__ + ) + ) raise ValueError(msg) - left_take = take(self.left, indices, - allow_fill=allow_fill, fill_value=fill_left) - right_take = take(self.right, indices, - allow_fill=allow_fill, fill_value=fill_right) + left_take = take( + self.left, indices, allow_fill=allow_fill, fill_value=fill_left + ) + right_take = take( + self.right, indices, allow_fill=allow_fill, fill_value=fill_right + ) return self._shallow_copy(left_take, right_take) @@ -797,6 +844,7 @@ def value_counts(self, dropna=True): """ # TODO: implement this is a non-naive way! 
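For context on the fillna restriction enforced above (only a scalar pandas.Interval is accepted), a small illustrative example; assumes pandas >= 0.24, where pd.arrays.IntervalArray is public:

import pandas as pd

# fillna only accepts a scalar Interval with a matching `closed` side.
arr = pd.arrays.IntervalArray.from_tuples([(0, 1), None, (2, 3)])
print(arr.fillna(pd.Interval(1, 2)))   # missing slot becomes (1, 2]

# Filling with anything else (e.g. a list or a number) raises TypeError.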
from pandas.core.algorithms import value_counts + return value_counts(np.asarray(self), dropna=dropna) # Formatting @@ -806,46 +854,51 @@ def _format_data(self): # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical n = len(self) - max_seq_items = min((get_option( - 'display.max_seq_items') or n) // 10, 10) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) formatter = str if n == 0: - summary = '[]' + summary = "[]" elif n == 1: first = formatter(self[0]) - summary = '[{first}]'.format(first=first) + summary = "[{first}]".format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{first}, {last}]'.format(first=first, last=last) + summary = "[{first}, {last}]".format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{head} ... {tail}]'.format( - head=', '.join(head), tail=', '.join(tail)) + summary = "[{head} ... {tail}]".format( + head=", ".join(head), tail=", ".join(tail) + ) else: tail = [formatter(x) for x in self] - summary = '[{tail}]'.format(tail=', '.join(tail)) + summary = "[{tail}]".format(tail=", ".join(tail)) return summary def __repr__(self): - tpl = textwrap.dedent("""\ + tpl = textwrap.dedent( + """\ {cls}({data}, {lead}closed='{closed}', - {lead}dtype='{dtype}')""") - return tpl.format(cls=self.__class__.__name__, - data=self._format_data(), - lead=' ' * len(self.__class__.__name__) + ' ', - closed=self.closed, dtype=self.dtype) + {lead}dtype='{dtype}')""" + ) + return tpl.format( + cls=self.__class__.__name__, + data=self._format_data(), + lead=" " * len(self.__class__.__name__) + " ", + closed=self.closed, + dtype=self.dtype, + ) def _format_space(self): - space = ' ' * (len(self.__class__.__name__) + 1) + space = " " * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) @property @@ -872,7 +925,9 @@ def closed(self): """ return self._closed - _interval_shared_docs['set_closed'] = """ + _interval_shared_docs[ + "set_closed" + ] = """ Return an %(klass)s identical to the current one, but closed on the specified side @@ -901,7 +956,7 @@ def closed(self): dtype='interval[int64]') """ - @Appender(_interval_shared_docs['set_closed'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -919,8 +974,10 @@ def length(self): return self.right - self.left except TypeError: # length not defined for some types, e.g. string - msg = ('IntervalArray contains Intervals without defined length, ' - 'e.g. Intervals with string endpoints') + msg = ( + "IntervalArray contains Intervals without defined length, " + "e.g. 
Intervals with string endpoints" + ) raise TypeError(msg) @property @@ -934,7 +991,9 @@ def mid(self): # datetime safe version return self.left + 0.5 * self.length - _interval_shared_docs['is_non_overlapping_monotonic'] = """ + _interval_shared_docs[ + "is_non_overlapping_monotonic" + ] = """ Return True if the %(klass)s is non-overlapping (no Intervals share points) and is either monotonic increasing or monotonic decreasing, else False @@ -942,8 +1001,9 @@ def mid(self): # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties @property # type: ignore - @Appender(_interval_shared_docs['is_non_overlapping_monotonic'] - % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs + ) def is_non_overlapping_monotonic(self): # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) @@ -951,14 +1011,18 @@ def is_non_overlapping_monotonic(self): # strict inequality for closed == 'both'; equality implies overlapping # at a point when both sides of intervals are included - if self.closed == 'both': - return bool((self.right[:-1] < self.left[1:]).all() or - (self.left[:-1] > self.right[1:]).all()) + if self.closed == "both": + return bool( + (self.right[:-1] < self.left[1:]).all() + or (self.left[:-1] > self.right[1:]).all() + ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping - return bool((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) + return bool( + (self.right[:-1] <= self.left[1:]).all() + or (self.left[:-1] >= self.right[1:]).all() + ) # Conversion def __array__(self, dtype=None): @@ -979,7 +1043,9 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result - _interval_shared_docs['to_tuples'] = """ + _interval_shared_docs[ + "to_tuples" + ] = """ Return an %(return_type)s of tuples of the form (left, right) Parameters @@ -996,10 +1062,9 @@ def __array__(self, dtype=None): %(examples)s\ """ - @Appender(_interval_shared_docs['to_tuples'] % dict( - return_type='ndarray', - examples='', - )) + @Appender( + _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") + ) def to_tuples(self, na_tuple=True): tuples = com.asarray_tuplesafe(zip(self.left, self.right)) if not na_tuple: @@ -1007,14 +1072,16 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples - @Appender(_extension_array_shared_docs['repeat'] % _shared_docs_kwargs) + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) left_repeat = self.left.repeat(repeats) right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) - _interval_shared_docs['contains'] = """ + _interval_shared_docs[ + "contains" + ] = """ Check elementwise if the Intervals contain the value. 
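A short usage sketch for the elementwise `contains` check documented here, together with the overlap property from the same hunk (assumes pandas >= 0.25, where IntervalArray.contains is public):

import pandas as pd

intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])

# Elementwise membership of a scalar; intervals are closed on the right by default.
print(intervals.contains(0.5))                 # [ True, False, False]

# (1, 3] and (2, 4] share points, so the array is not non-overlapping monotonic.
print(intervals.is_non_overlapping_monotonic)  # False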
Return a boolean mask whether the value is contained in the Intervals @@ -1048,19 +1115,18 @@ def repeat(self, repeats, axis=None): array([ True, False, False]) """ - @Appender(_interval_shared_docs['contains'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs) def contains(self, other): if isinstance(other, Interval): - raise NotImplementedError( - 'contains not implemented for two intervals' - ) + raise NotImplementedError("contains not implemented for two intervals") - return ( - (self.left < other if self.open_left else self.left <= other) & - (other < self.right if self.open_right else other <= self.right) + return (self.left < other if self.open_left else self.left <= other) & ( + other < self.right if self.open_right else other <= self.right ) - _interval_shared_docs['overlaps'] = """ + _interval_shared_docs[ + "overlaps" + ] = """ Check elementwise if an Interval overlaps the values in the %(klass)s. Two intervals overlap if they share a common point, including closed @@ -1104,12 +1170,12 @@ def contains(self, other): array([False, True, False]) """ - @Appender(_interval_shared_docs['overlaps'] % _shared_docs_kwargs) + @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError elif not isinstance(other, Interval): - msg = '`other` must be Interval-like, got {other}' + msg = "`other` must be Interval-like, got {other}" raise TypeError(msg.format(other=type(other).__name__)) # equality is okay if both endpoints are closed (overlap at a point) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 1c5dc7666c3a1..9f428a4ac10b2 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -33,7 +33,8 @@ class PandasDtype(ExtensionDtype): ---------- dtype : numpy.dtype """ - _metadata = ('_dtype',) + + _metadata = ("_dtype",) def __init__(self, dtype): dtype = np.dtype(dtype) @@ -60,11 +61,11 @@ def type(self): @property def _is_numeric(self): # exclude object, str, unicode, void. - return self.kind in set('biufc') + return self.kind in set("biufc") @property def _is_boolean(self): - return self.kind == 'b' + return self.kind == "b" @classmethod def construct_from_string(cls, string): @@ -107,6 +108,7 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin): ------- None """ + # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. We check for # that _typ to ensure that that users don't unnecessarily use EAs inside @@ -171,7 +173,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # numpy.lib.mixins.NDArrayOperatorsMixin.html # The primary modification is not boxing scalar return values # in PandasArray, since pandas' ExtensionArrays are 1-d. - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: # Only support operations with instances of _HANDLED_TYPES. # Use PandasArray instead of type(self) for isinstance to @@ -181,12 +183,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented # Defer to the implementation of the ufunc on unwrapped values. 
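The __array_ufunc__ handling being reformatted here re-boxes array results but passes scalar reductions through unchanged; a minimal sketch with illustrative values (assumes pandas >= 0.24):

import numpy as np
import pandas as pd

arr = pd.arrays.PandasArray(np.array([1.0, 2.0, 3.0]))

# Elementwise ufuncs are applied to the unwrapped ndarray and re-wrapped.
print(np.exp(arr))          # PandasArray of the elementwise results

# Scalar reductions are returned as plain scalars, not re-boxed.
print(np.add.reduce(arr))   # 6.0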
- inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x - for x in inputs) + inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs) if out: - kwargs['out'] = tuple( - x._ndarray if isinstance(x, PandasArray) else x - for x in out) + kwargs["out"] = tuple( + x._ndarray if isinstance(x, PandasArray) else x for x in out + ) result = getattr(ufunc, method)(*inputs, **kwargs) if type(result) is tuple and len(result): @@ -197,7 +198,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: # but not scalar reductions return result - elif method == 'at': + elif method == "at": # no return value return None else: @@ -233,7 +234,7 @@ def __setitem__(self, key, value): values = self._ndarray t = np.result_type(value, values) if t != self._ndarray.dtype: - values = values.astype(t, casting='safe') + values = values.astype(t, casting="safe") values[key] = value self._dtype = PandasDtype(t) self._ndarray = values @@ -260,15 +261,16 @@ def fillna(self, value=None, method=None, limit=None): if is_array_like(value): if len(value) != len(self): - raise ValueError("Length of 'value' does not match. Got ({}) " - " expected {}".format(len(value), len(self))) + raise ValueError( + "Length of 'value' does not match. Got ({}) " + " expected {}".format(len(value), len(self)) + ) value = value[mask] if mask.any(): if method is not None: - func = pad_1d if method == 'pad' else backfill_1d - new_values = func(self._ndarray, limit=limit, - mask=mask) + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self._ndarray, limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value @@ -281,8 +283,9 @@ def fillna(self, value=None, method=None, limit=None): def take(self, indices, allow_fill=False, fill_value=None): from pandas.core.algorithms import take - result = take(self._ndarray, indices, allow_fill=allow_fill, - fill_value=fill_value) + result = take( + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value + ) return type(self)(result) def copy(self): @@ -307,9 +310,7 @@ def _reduce(self, name, skipna=True, **kwargs): if meth: return meth(skipna=skipna, **kwargs) else: - msg = ( - "'{}' does not implement reduction '{}'" - ) + msg = "'{}' does not implement reduction '{}'" raise TypeError(msg.format(type(self).__name__, name)) def any(self, axis=None, out=None, keepdims=False, skipna=True): @@ -328,67 +329,80 @@ def max(self, axis=None, out=None, keepdims=False, skipna=True): nv.validate_max((), dict(out=out, keepdims=keepdims)) return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) - def sum(self, axis=None, dtype=None, out=None, keepdims=False, - initial=None, skipna=True, min_count=0): - nv.validate_sum((), dict(dtype=dtype, out=out, keepdims=keepdims, - initial=initial)) - return nanops.nansum(self._ndarray, axis=axis, skipna=skipna, - min_count=min_count) - - def prod(self, axis=None, dtype=None, out=None, keepdims=False, - initial=None, skipna=True, min_count=0): - nv.validate_prod((), dict(dtype=dtype, out=out, keepdims=keepdims, - initial=initial)) - return nanops.nanprod(self._ndarray, axis=axis, skipna=skipna, - min_count=min_count) - - def mean(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): + def sum( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_sum( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nansum( + self._ndarray, 
axis=axis, skipna=skipna, min_count=min_count + ) + + def prod( + self, + axis=None, + dtype=None, + out=None, + keepdims=False, + initial=None, + skipna=True, + min_count=0, + ): + nv.validate_prod( + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + ) + return nanops.nanprod( + self._ndarray, axis=axis, skipna=skipna, min_count=min_count + ) + + def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) - def median(self, axis=None, out=None, overwrite_input=False, - keepdims=False, skipna=True): - nv.validate_median((), dict(out=out, overwrite_input=overwrite_input, - keepdims=keepdims)) + def median( + self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True + ): + nv.validate_median( + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + ) return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='std') - return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='var') - return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='sem') - return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, - ddof=ddof) - - def kurt(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='kurt') + def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + ) + return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" + ) + return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" + ) + return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + + def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" + ) return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) - def skew(self, axis=None, dtype=None, out=None, keepdims=False, - skipna=True): - nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out, - keepdims=keepdims), - fname='skew') + def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_stat_ddof_func( + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" + ) return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) # ------------------------------------------------------------------------ @@ -417,9 +431,8 @@ 
def to_numpy(self, dtype=None, copy=False): return result @Appender(ExtensionArray.searchsorted.__doc__) - def searchsorted(self, value, side='left', sorter=None): - return searchsorted(self.to_numpy(), value, - side=side, sorter=sorter) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) # ------------------------------------------------------------------------ # Ops @@ -445,9 +458,9 @@ def arithmetic_method(self, other): return cls(result) - return compat.set_function_name(arithmetic_method, - "__{}__".format(op.__name__), - cls) + return compat.set_function_name( + arithmetic_method, "__{}__".format(op.__name__), cls + ) _create_comparison_method = _create_arithmetic_method diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index bb144764a26fc..8291cb70affcd 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -6,21 +6,41 @@ from pandas._libs import lib from pandas._libs.tslibs import ( - NaT, NaTType, frequencies as libfrequencies, iNaT, period as libperiod) + NaT, + NaTType, + frequencies as libfrequencies, + iNaT, + period as libperiod, +) from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period, get_period_field_arr, - period_asfreq_arr) + DIFFERENT_FREQ, + IncompatibleFrequency, + Period, + get_period_field_arr, + period_asfreq_arr, +) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds import pandas.compat as compat from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, - is_list_like, is_period_dtype, pandas_dtype) + _TD_DTYPE, + ensure_object, + is_datetime64_dtype, + is_float_dtype, + is_list_like, + is_period_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos @@ -46,8 +66,8 @@ def _period_array_cmp(cls, op): """ Wrap comparison operations to convert Period-like to PeriodDtype """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): op = getattr(self.asi8, opname) @@ -138,6 +158,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): The `freq` indicates the span covered by each element of the array. All elements in the PeriodArray have the same `freq`. 
""" + # array priority higher than numpy scalars __array_priority__ = 1000 _attributes = ["freq"] @@ -146,14 +167,27 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): # Names others delegate to us _other_ops = [] # type: List[str] - _bool_ops = ['is_leap_year'] - _object_ops = ['start_time', 'end_time', 'freq'] - _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'weekday', 'week', 'dayofweek', - 'dayofyear', 'quarter', 'qyear', - 'days_in_month', 'daysinmonth'] + _bool_ops = ["is_leap_year"] + _object_ops = ["start_time", "end_time", "freq"] + _field_ops = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "weekday", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + "daysinmonth", + ] _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq'] + _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"] # -------------------------------------------------------------------- # Constructors @@ -174,16 +208,18 @@ def __init__(self, values, freq=None, dtype=None, copy=False): if isinstance(values, type(self)): if freq is not None and freq != values.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=values.freq.freqstr, - other_freq=freq.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=values.freq.freqstr, + other_freq=freq.freqstr, + ) raise IncompatibleFrequency(msg) values, freq = values._data, values.freq - values = np.array(values, dtype='int64', copy=copy) + values = np.array(values, dtype="int64", copy=copy) self._data = values if freq is None: - raise ValueError('freq is not specified and cannot be inferred') + raise ValueError("freq is not specified and cannot be inferred") self._dtype = PeriodDtype(freq) @classmethod @@ -193,10 +229,10 @@ def _simple_new(cls, values, freq=None, **kwargs): @classmethod def _from_sequence( - cls, - scalars: Sequence[Optional[Period]], - dtype: Optional[PeriodDtype] = None, - copy: bool = False, + cls, + scalars: Sequence[Optional[Period]], + dtype: Optional[PeriodDtype] = None, + copy: bool = False, ) -> ABCPeriodArray: if dtype: freq = dtype.freq @@ -245,14 +281,14 @@ def _generate_range(cls, start, end, periods, freq, fields): field_count = len(fields) if start is not None or end is not None: if field_count > 0: - raise ValueError('Can either instantiate from fields ' - 'or endpoints, but not both') + raise ValueError( + "Can either instantiate from fields " "or endpoints, but not both" + ) subarr, freq = _get_ordinal_range(start, end, periods, freq) elif field_count > 0: subarr, freq = _range_from_fields(freq=freq, **fields) else: - raise ValueError('Not enough parameters to construct ' - 'Period range') + raise ValueError("Not enough parameters to construct " "Period range") return subarr, freq @@ -267,8 +303,9 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: self._check_compatible_with(value) return value.ordinal else: - raise ValueError("'value' should be a Period. Got '{val}' instead." - .format(val=value)) + raise ValueError( + "'value' should be a Period. 
Got '{val}' instead.".format(val=value) + ) def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) @@ -301,23 +338,26 @@ def __array__(self, dtype=None): # -------------------------------------------------------------------- # Vectorized analogues of Period properties - year = _field_accessor('year', 0, "The year of the period") - month = _field_accessor('month', 3, "The month as January=1, December=12") - day = _field_accessor('day', 4, "The days of the period") - hour = _field_accessor('hour', 5, "The hour of the period") - minute = _field_accessor('minute', 6, "The minute of the period") - second = _field_accessor('second', 7, "The second of the period") - weekofyear = _field_accessor('week', 8, "The week ordinal of the year") + year = _field_accessor("year", 0, "The year of the period") + month = _field_accessor("month", 3, "The month as January=1, December=12") + day = _field_accessor("day", 4, "The days of the period") + hour = _field_accessor("hour", 5, "The hour of the period") + minute = _field_accessor("minute", 6, "The minute of the period") + second = _field_accessor("second", 7, "The second of the period") + weekofyear = _field_accessor("week", 8, "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor('dayofweek', 10, - "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor( + "dayofweek", 10, "The day of the week with Monday=0, Sunday=6" + ) weekday = dayofweek - dayofyear = day_of_year = _field_accessor('dayofyear', 9, - "The ordinal day of the year") - quarter = _field_accessor('quarter', 2, "The quarter of the date") - qyear = _field_accessor('qyear', 1) - days_in_month = _field_accessor('days_in_month', 11, - "The number of days in the month") + dayofyear = day_of_year = _field_accessor( + "dayofyear", 9, "The ordinal day of the year" + ) + quarter = _field_accessor("quarter", 2, "The quarter of the date") + qyear = _field_accessor("qyear", 1) + days_in_month = _field_accessor( + "days_in_month", 11, "The number of days in the month" + ) daysinmonth = days_in_month @property @@ -329,13 +369,13 @@ def is_leap_year(self): @property def start_time(self): - return self.to_timestamp(how='start') + return self.to_timestamp(how="start") @property def end_time(self): - return self.to_timestamp(how='end') + return self.to_timestamp(how="end") - def to_timestamp(self, freq=None, how='start'): + def to_timestamp(self, freq=None, how="start"): """ Cast to DatetimeArray/Index. 
@@ -354,15 +394,15 @@ def to_timestamp(self, freq=None, how='start'):

         how = libperiod._validate_end_alias(how)

-        end = how == 'E'
+        end = how == "E"
         if end:
-            if freq == 'B':
+            if freq == "B":
                 # roll forward to ensure we land on B date
-                adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')
-                return self.to_timestamp(how='start') + adjust
+                adjust = Timedelta(1, "D") - Timedelta(1, "ns")
+                return self.to_timestamp(how="start") + adjust
             else:
-                adjust = Timedelta(1, 'ns')
-                return (self + self.freq).to_timestamp(how='start') - adjust
+                adjust = Timedelta(1, "ns")
+                return (self + self.freq).to_timestamp(how="start") - adjust

         if freq is None:
             base, mult = libfrequencies.get_freq_code(self.freq)
@@ -374,7 +414,7 @@ def to_timestamp(self, freq=None, how='start'):

         new_data = self.asfreq(freq, how=how)
         new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)
-        return DatetimeArray._from_sequence(new_data, freq='infer')
+        return DatetimeArray._from_sequence(new_data, freq="infer")

     # --------------------------------------------------------------------
     # Array-like / EA-Interface Methods
@@ -392,8 +432,10 @@ def _validate_fill_value(self, fill_value):
             self._check_compatible_with(fill_value)
             fill_value = fill_value.ordinal
         else:
-            raise ValueError("'fill_value' should be a Period. "
-                             "Got '{got}'.".format(got=fill_value))
+            raise ValueError(
+                "'fill_value' should be a Period. "
+                "Got '{got}'.".format(got=fill_value)
+            )
         return fill_value

     # --------------------------------------------------------------------
@@ -414,9 +456,10 @@ def _time_shift(self, periods, freq=None):
             Frequency increment to shift by.
         """
         if freq is not None:
-            raise TypeError("`freq` argument is not supported for "
-                            "{cls}._time_shift"
-                            .format(cls=type(self).__name__))
+            raise TypeError(
+                "`freq` argument is not supported for "
+                "{cls}._time_shift".format(cls=type(self).__name__)
+            )
         values = self.asi8 + periods * self.freq.n
         if self._hasnans:
             values[self._isnan] = iNaT
@@ -426,7 +469,7 @@ def _time_shift(self, periods, freq=None):
     def _box_func(self):
         return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)

-    def asfreq(self, freq=None, how='E'):
+    def asfreq(self, freq=None, how="E"):
         """
         Convert the Period Array/Index to the specified frequency `freq`.
@@ -469,7 +512,7 @@ def asfreq(self, freq=None, how='E'): asi8 = self.asi8 # mult1 can't be negative or 0 - end = how == 'E' + end = how == "E" if end: ordinal = asi8 + mult1 - 1 else: @@ -485,7 +528,7 @@ def asfreq(self, freq=None, how='E'): # ------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types """ @@ -494,14 +537,13 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): if date_format: formatter = lambda dt: dt.strftime(date_format) else: - formatter = lambda dt: '%s' % dt + formatter = lambda dt: "%s" % dt if self._hasnans: mask = self._isnan values[mask] = na_rep imask = ~mask - values[imask] = np.array([formatter(dt) for dt - in values[imask]]) + values[imask] = np.array([formatter(dt) for dt in values[imask]]) else: values = np.array([formatter(dt) for dt in values]) return values @@ -548,17 +590,15 @@ def _sub_period(self, other): @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, - ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any] + self, + other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], + op: Callable[[Any], Any], ) -> ABCPeriodArray: assert op in [operator.add, operator.sub] if op is operator.sub: other = -other - res_values = algos.checked_add_with_arr(self.asi8, other, - arr_mask=self._isnan) - res_values = res_values.view('i8') + res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) + res_values = res_values.view("i8") res_values[self._isnan] = iNaT return type(self)(res_values, freq=self.freq) @@ -663,12 +703,12 @@ def _check_timedeltalike_freq_compat(self, other): elif isinstance(other, np.ndarray): # numpy timedelta64 array; all entries must be compatible - assert other.dtype.kind == 'm' + assert other.dtype.kind == "m" if other.dtype != _TD_DTYPE: # i.e. non-nano unit # TODO: disallow unit-less timedelta64 other = other.astype(_TD_DTYPE) - nanos = other.view('i8') + nanos = other.view("i8") else: # TimedeltaArray/Index nanos = other.asi8 @@ -712,19 +752,18 @@ def _raise_on_incompatible(left, right): else: other_freq = _delta_to_tick(Timedelta(right)).freqstr - msg = DIFFERENT_FREQ.format(cls=type(left).__name__, - own_freq=left.freqstr, - other_freq=other_freq) + msg = DIFFERENT_FREQ.format( + cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq + ) raise IncompatibleFrequency(msg) # ------------------------------------------------------------------- # Constructor Helpers + def period_array( - data: Sequence[Optional[Period]], - freq: Optional[Tick] = None, - copy: bool = False, + data: Sequence[Optional[Period]], freq: Optional[Tick] = None, copy: bool = False ) -> PeriodArray: """ Construct a new PeriodArray from a sequence of Period scalars. 
@@ -796,8 +835,7 @@ def period_array( dtype = None if is_float_dtype(data) and len(data) > 0: - raise TypeError("PeriodIndex does not allow " - "floating point in construction") + raise TypeError("PeriodIndex does not allow " "floating point in construction") data = ensure_object(data) @@ -829,12 +867,11 @@ def validate_dtype_freq(dtype, freq): if dtype is not None: dtype = pandas_dtype(dtype) if not is_period_dtype(dtype): - raise ValueError('dtype must be PeriodDtype') + raise ValueError("dtype must be PeriodDtype") if freq is None: freq = dtype.freq elif freq != dtype.freq: - raise IncompatibleFrequency('specified freq and dtype ' - 'are different') + raise IncompatibleFrequency("specified freq and dtype " "are different") return freq @@ -858,8 +895,8 @@ def dt64arr_to_periodarr(data, freq, tz=None): used. """ - if data.dtype != np.dtype('M8[ns]'): - raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype)) + if data.dtype != np.dtype("M8[ns]"): + raise ValueError("Wrong dtype: {dtype}".format(dtype=data.dtype)) if freq is None: if isinstance(data, ABCIndexClass): @@ -873,13 +910,15 @@ def dt64arr_to_periodarr(data, freq, tz=None): data = data._values base, mult = libfrequencies.get_freq_code(freq) - return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq + return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq def _get_ordinal_range(start, end, periods, freq, mult=1): if com.count_not_none(start, end, periods) != 2: - raise ValueError('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) if freq is not None: _, mult = libfrequencies.get_freq_code(freq) @@ -893,9 +932,9 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): is_end_per = isinstance(end, Period) if is_start_per and is_end_per and start.freq != end.freq: - raise ValueError('start and end must have same freq') - if (start is NaT or end is NaT): - raise ValueError('start and end must not be NaT') + raise ValueError("start and end must have same freq") + if start is NaT or end is NaT: + raise ValueError("start and end must not be NaT") if freq is None: if is_start_per: @@ -903,25 +942,34 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): elif is_end_per: freq = end.freq else: # pragma: no cover - raise ValueError('Could not infer freq from start/end') + raise ValueError("Could not infer freq from start/end") if periods is not None: periods = periods * mult if start is None: - data = np.arange(end.ordinal - periods + mult, - end.ordinal + 1, mult, - dtype=np.int64) + data = np.arange( + end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64 + ) else: - data = np.arange(start.ordinal, start.ordinal + periods, mult, - dtype=np.int64) + data = np.arange( + start.ordinal, start.ordinal + periods, mult, dtype=np.int64 + ) else: data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64) return data, freq -def _range_from_fields(year=None, month=None, quarter=None, day=None, - hour=None, minute=None, second=None, freq=None): +def _range_from_fields( + year=None, + month=None, + quarter=None, + day=None, + hour=None, + minute=None, + second=None, + freq=None, +): if hour is None: hour = 0 if minute is None: @@ -935,7 +983,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None, if quarter is not None: if freq is None: - freq = 'Q' + freq = "Q" base = libfrequencies.FreqGroup.FR_QTR else: 
base, mult = libfrequencies.get_freq_code(freq) @@ -951,8 +999,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None, base, mult = libfrequencies.get_freq_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, d, h, mn, s in zip(*arrays): - ordinals.append(libperiod.period_ordinal( - y, mth, d, h, mn, s, 0, 0, base)) + ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) return np.array(ordinals, dtype=np.int64), freq @@ -962,11 +1009,15 @@ def _make_field_arrays(*fields): for x in fields: if isinstance(x, (list, np.ndarray, ABCSeries)): if length is not None and len(x) != length: - raise ValueError('Mismatched Period array lengths') + raise ValueError("Mismatched Period array lengths") elif length is None: length = len(x) - arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) - else np.repeat(x, length) for x in fields] + arrays = [ + np.asarray(x) + if isinstance(x, (np.ndarray, list, ABCSeries)) + else np.repeat(x, length) + for x in fields + ] return arrays diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 29cc899fa6a9b..2332da46574c5 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -21,14 +21,29 @@ from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( - astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type, - infer_dtype_from_scalar) + astype_nansafe, + construct_1d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, - is_integer, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) + is_array_like, + is_bool_dtype, + is_datetime64_any_dtype, + is_dtype_equal, + is_integer, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) + ABCIndexClass, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas._typing import Dtype @@ -82,21 +97,16 @@ class SparseDtype(ExtensionDtype): ------- None """ + # We include `_is_na_fill_value` in the metadata to avoid hash collisions # between SparseDtype(float, 0.0) and SparseDtype(float, nan). # Without is_na_fill_value in the comparison, those would be equal since # hash(nan) is (sometimes?) 0. - _metadata = ('_dtype', '_fill_value', '_is_na_fill_value') + _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__( - self, - dtype: Dtype = np.float64, - fill_value: Any = None - ) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: from pandas.core.dtypes.missing import na_value_for_dtype - from pandas.core.dtypes.common import ( - pandas_dtype, is_string_dtype, is_scalar - ) + from pandas.core.dtypes.common import pandas_dtype, is_string_dtype, is_scalar if isinstance(dtype, type(self)): if fill_value is None: @@ -105,14 +115,15 @@ def __init__( dtype = pandas_dtype(dtype) if is_string_dtype(dtype): - dtype = np.dtype('object') + dtype = np.dtype("object") if fill_value is None: fill_value = na_value_for_dtype(dtype) if not is_scalar(fill_value): - raise ValueError("fill_value must be a scalar. Got {} " - "instead".format(fill_value)) + raise ValueError( + "fill_value must be a scalar. 
Got {} " "instead".format(fill_value) + ) self._dtype = dtype self._fill_value = fill_value @@ -139,9 +150,9 @@ def __eq__(self, other): # i.e. we want to treat any floating-point NaN as equal, but # not a floating-point NaN and a datetime NaT. fill_value = ( - other._is_na_fill_value and - isinstance(self.fill_value, type(other.fill_value)) or - isinstance(other.fill_value, type(self.fill_value)) + other._is_na_fill_value + and isinstance(self.fill_value, type(other.fill_value)) + or isinstance(other.fill_value, type(self.fill_value)) ) else: fill_value = self.fill_value == other.fill_value @@ -168,16 +179,19 @@ def fill_value(self): @property def _is_na_fill_value(self): from pandas.core.dtypes.missing import isna + return isna(self.fill_value) @property def _is_numeric(self): from pandas.core.dtypes.common import is_object_dtype + return not is_object_dtype(self.subtype) @property def _is_boolean(self): from pandas.core.dtypes.common import is_bool_dtype + return is_bool_dtype(self.subtype) @property @@ -197,7 +211,7 @@ def subtype(self): @property def name(self): - return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value) + return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value) def __repr__(self): return self.name @@ -241,11 +255,13 @@ def construct_from_string(cls, string): except Exception: raise TypeError(msg) else: - msg = ("Could not construct SparseDtype from '{}'.\n\nIt " - "looks like the fill_value in the string is not " - "the default for the dtype. Non-default fill_values " - "are not supported. Use the 'SparseDtype()' " - "constructor instead.") + msg = ( + "Could not construct SparseDtype from '{}'.\n\nIt " + "looks like the fill_value in the string is not " + "the default for the dtype. Non-default fill_values " + "are not supported. Use the 'SparseDtype()' " + "constructor instead." + ) if has_fill_value and str(result) != string: raise TypeError(msg.format(string)) return result @@ -274,30 +290,27 @@ def _parse_subtype(dtype): ValueError When the subtype cannot be extracted. 
""" - xpr = re.compile( - r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$" - ) + xpr = re.compile(r"Sparse\[(?P[^,]*)(, )?(?P.*?)?\]$") m = xpr.match(dtype) has_fill_value = False if m: - subtype = m.groupdict()['subtype'] - has_fill_value = m.groupdict()['fill_value'] or has_fill_value + subtype = m.groupdict()["subtype"] + has_fill_value = m.groupdict()["fill_value"] or has_fill_value elif dtype == "Sparse": - subtype = 'float64' + subtype = "float64" else: raise ValueError("Cannot parse {}".format(dtype)) return subtype, has_fill_value @classmethod def is_dtype(cls, dtype): - dtype = getattr(dtype, 'dtype', dtype) - if (isinstance(dtype, str) and - dtype.startswith("Sparse")): + dtype = getattr(dtype, "dtype", dtype) + if isinstance(dtype, str) and dtype.startswith("Sparse"): sub_type, _ = cls._parse_subtype(dtype) dtype = np.dtype(sub_type) elif isinstance(dtype, cls): return True - return isinstance(dtype, np.dtype) or dtype == 'Sparse' + return isinstance(dtype, np.dtype) or dtype == "Sparse" def update_dtype(self, dtype): """ @@ -341,8 +354,7 @@ def update_dtype(self, dtype): dtype = pandas_dtype(dtype) if not isinstance(dtype, cls): - fill_value = astype_nansafe(np.array(self.fill_value), - dtype).item() + fill_value = astype_nansafe(np.array(self.fill_value), dtype).item() dtype = cls(dtype, fill_value=fill_value) return dtype @@ -381,7 +393,7 @@ def _subtype_with_str(self): # Array -_sparray_doc_kwargs = dict(klass='SparseArray') +_sparray_doc_kwargs = dict(klass="SparseArray") def _get_fill(arr: ABCSparseArray) -> np.ndarray: @@ -409,10 +421,7 @@ def _get_fill(arr: ABCSparseArray) -> np.ndarray: def _sparse_array_op( - left: ABCSparseArray, - right: ABCSparseArray, - op: Callable, - name: str + left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str ) -> Any: """ Perform a binary operation between two arrays. 
@@ -430,7 +439,7 @@ def _sparse_array_op( ------- SparseArray """ - if name.startswith('__'): + if name.startswith("__"): # For lookups in _libs.sparse we need non-dunder op name name = name[2:-2] @@ -454,7 +463,7 @@ def _sparse_array_op( result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(left.to_dense(), right.to_dense()) fill = op(_get_fill(left), _get_fill(right)) @@ -463,32 +472,37 @@ def _sparse_array_op( else: index = right.sp_index elif left.sp_index.equals(right.sp_index): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(left.sp_values, right.sp_values) fill = op(_get_fill(left), _get_fill(right)) index = left.sp_index else: - if name[0] == 'r': + if name[0] == "r": left, right = right, left name = name[1:] - if name in ('and', 'or') and dtype == 'bool': - opname = 'sparse_{name}_uint8'.format(name=name) + if name in ("and", "or") and dtype == "bool": + opname = "sparse_{name}_uint8".format(name=name) # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) result_dtype = np.bool else: - opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + opname = "sparse_{name}_{dtype}".format(name=name, dtype=dtype) left_sp_values = left.sp_values right_sp_values = right.sp_values sparse_op = getattr(splib, opname) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result, index, fill = sparse_op( - left_sp_values, left.sp_index, left.fill_value, - right_sp_values, right.sp_index, right.fill_value) + left_sp_values, + left.sp_index, + left.fill_value, + right_sp_values, + right.sp_index, + right.fill_value, + ) if result_dtype is None: result_dtype = result.dtype @@ -500,11 +514,11 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ - if name.startswith('__'): + if name.startswith("__"): # e.g. 
__eq__ --> eq name = name[2:-2] - if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + if name in ("eq", "ne", "lt", "gt", "le", "ge"): dtype = np.bool fill_value = lib.item_from_zerodim(fill_value) @@ -512,10 +526,9 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): if is_bool_dtype(dtype): # fill_value may be np.bool_ fill_value = bool(fill_value) - return SparseArray(data, - sparse_index=sparse_index, - fill_value=fill_value, - dtype=dtype) + return SparseArray( + data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype + ) class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): @@ -583,11 +596,19 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): None """ - _pandas_ftype = 'sparse' - _subtyp = 'sparse_array' # register ABCSparseArray + _pandas_ftype = "sparse" + _subtyp = "sparse_array" # register ABCSparseArray - def __init__(self, data, sparse_index=None, index=None, fill_value=None, - kind='integer', dtype=None, copy=False): + def __init__( + self, + data, + sparse_index=None, + index=None, + fill_value=None, + kind="integer", + dtype=None, + copy=False, + ): from pandas.core.internals import SingleBlockManager if isinstance(data, SingleBlockManager): @@ -637,9 +658,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, npoints = sparse_index.length dtype = infer_dtype_from_scalar(data)[0] - data = construct_1d_arraylike_from_scalar( - data, npoints, dtype - ) + data = construct_1d_arraylike_from_scalar(data, npoints, dtype) if dtype is not None: dtype = pandas_dtype(dtype) @@ -654,6 +673,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, try: # probably shared code in sanitize_series from pandas.core.internals.construction import sanitize_array + data = sanitize_array(data, index=None) except ValueError: # NumPy may raise a ValueError on data like [1, []] @@ -685,19 +705,17 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None, else: sparse_values = np.asarray(data, dtype=dtype) if len(sparse_values) != sparse_index.npoints: - raise AssertionError("Non array-like type {type} must " - "have the same length as the index" - .format(type=type(sparse_values))) + raise AssertionError( + "Non array-like type {type} must " + "have the same length as the index".format(type=type(sparse_values)) + ) self._sparse_index = sparse_index self._sparse_values = sparse_values self._dtype = SparseDtype(sparse_values.dtype, fill_value) @classmethod def _simple_new( - cls, - sparse_array: np.ndarray, - sparse_index: SparseIndex, - dtype: SparseDtype + cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype ) -> ABCSparseArray: new = cls([]) new._sparse_index = sparse_index @@ -736,9 +754,7 @@ def from_spmatrix(cls, data): length, ncol = data.shape if ncol != 1: - raise ValueError( - "'data' must have a single column, not '{}'".format(ncol) - ) + raise ValueError("'data' must have a single column, not '{}'".format(ncol)) # our sparse index classes require that the positions be strictly # increasing. So we need to sort loc, and arr accordingly. @@ -769,7 +785,7 @@ def __array__(self, dtype=None, copy=True): # a datetime64 with pandas NaT. if fill_value is NaT: # Can't put pd.NaT in a datetime64[ns] - fill_value = np.datetime64('NaT') + fill_value = np.datetime64("NaT") try: dtype = np.result_type(self.sp_values.dtype, type(fill_value)) except TypeError: @@ -840,9 +856,9 @@ def kind(self): The kind of sparse index for this array. 
One of {'integer', 'block'}. """ if isinstance(self.sp_index, IntIndex): - return 'integer' + return "integer" else: - return 'block' + return "block" @property def _valid_sp_values(self): @@ -906,17 +922,18 @@ def values(self): msg = ( "The SparseArray.values attribute is deprecated and will be " "removed in a future version. You can use `np.asarray(...)` or " - "the `.to_dense()` method instead.") + "the `.to_dense()` method instead." + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self.to_dense() def isna(self): from pandas import isna + # If null fill value, we want SparseDtype[bool, true] # to preserve the same memory usage. dtype = SparseDtype(bool, self._null_fill_value) - return type(self)._simple_new(isna(self.sp_values), - self.sp_index, dtype) + return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype) def fillna(self, value=None, method=None, limit=None): """ @@ -951,15 +968,15 @@ def fillna(self, value=None, method=None, limit=None): When ``self.fill_value`` is not NA, the result dtype will be ``self.dtype``. Again, this preserves the amount of memory used. """ - if ((method is None and value is None) or - (method is not None and value is not None)): + if (method is None and value is None) or ( + method is not None and value is not None + ): raise ValueError("Must specify one of 'method' or 'value'.") elif method is not None: msg = "fillna with 'method' requires high memory usage." warnings.warn(msg, PerformanceWarning) - filled = interpolate_2d(np.asarray(self), method=method, - limit=limit) + filled = interpolate_2d(np.asarray(self), method=method, limit=limit) return type(self)(filled, fill_value=self.fill_value) else: @@ -990,15 +1007,14 @@ def shift(self, periods=1, fill_value=None): arr = self empty = self._from_sequence( - [fill_value] * min(abs(periods), len(self)), - dtype=arr.dtype + [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype ) if periods > 0: a = empty b = arr[:-periods] else: - a = arr[abs(periods):] + a = arr[abs(periods) :] b = empty return arr._concat_same_type([a, b]) @@ -1037,8 +1053,7 @@ def factorize(self, na_sentinel=-1): # ExtensionArray.factorize -> Tuple[EA, EA] # Given that we have to return a dense array of labels, why bother # implementing an efficient factorize? 
- labels, uniques = algos.factorize(np.asarray(self), - na_sentinel=na_sentinel) + labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) uniques = SparseArray(uniques, dtype=self.dtype) return labels, uniques @@ -1057,8 +1072,7 @@ def value_counts(self, dropna=True): """ from pandas import Index, Series - keys, counts = algos._value_counts_arraylike(self.sp_values, - dropna=dropna) + keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: @@ -1115,7 +1129,7 @@ def __getitem__(self, key): if com.is_bool_indexer(key) and len(self) == len(key): return self.take(np.arange(len(key), dtype=np.int32)[key]) - elif hasattr(key, '__len__'): + elif hasattr(key, "__len__"): return self.take(key) else: raise ValueError("Cannot slice with '{}'".format(key)) @@ -1128,7 +1142,7 @@ def _get_val_at(self, loc): loc += n if loc >= n or loc < 0: - raise IndexError('Out of bounds access') + raise IndexError("Out of bounds access") sp_loc = self.sp_index.lookup(loc) if sp_loc == -1: @@ -1138,30 +1152,32 @@ def _get_val_at(self, loc): def take(self, indices, allow_fill=False, fill_value=None): if is_scalar(indices): - raise ValueError("'indices' must be an array, not a " - "scalar '{}'.".format(indices)) + raise ValueError( + "'indices' must be an array, not a " "scalar '{}'.".format(indices) + ) indices = np.asarray(indices, dtype=np.int32) if indices.size == 0: result = [] - kwargs = {'dtype': self.dtype} + kwargs = {"dtype": self.dtype} elif allow_fill: result = self._take_with_fill(indices, fill_value=fill_value) kwargs = {} else: result = self._take_without_fill(indices) - kwargs = {'dtype': self.dtype} + kwargs = {"dtype": self.dtype} - return type(self)(result, fill_value=self.fill_value, kind=self.kind, - **kwargs) + return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs) def _take_with_fill(self, indices, fill_value=None): if fill_value is None: fill_value = self.dtype.na_value if indices.min() < -1: - raise ValueError("Invalid value in 'indices'. Must be between -1 " - "and the length of the array.") + raise ValueError( + "Invalid value in 'indices'. Must be between -1 " + "and the length of the array." 
+ ) if indices.max() >= len(self): raise IndexError("out of bounds value in 'indices'.") @@ -1174,15 +1190,17 @@ def _take_with_fill(self, indices, fill_value=None): taken.fill(fill_value) return taken else: - raise IndexError('cannot do a non-empty take from an empty ' - 'axes.') + raise IndexError("cannot do a non-empty take from an empty " "axes.") sp_indexer = self.sp_index.lookup_array(indices) if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values - taken = np.full(sp_indexer.shape, fill_value=fill_value, - dtype=np.result_type(type(fill_value))) + taken = np.full( + sp_indexer.shape, + fill_value=fill_value, + dtype=np.result_type(type(fill_value)), + ) else: taken = self.sp_values.take(sp_indexer) @@ -1203,8 +1221,7 @@ def _take_with_fill(self, indices, fill_value=None): result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, - type(self.fill_value)) + result_type = np.result_type(result_type, type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value @@ -1223,8 +1240,7 @@ def _take_without_fill(self, indices): if (indices.max() >= n) or (indices.min() < -n): if n == 0: - raise IndexError("cannot do a non-empty take from an " - "empty axes.") + raise IndexError("cannot do a non-empty take from an " "empty axes.") else: raise IndexError("out of bounds value in 'indices'.") @@ -1234,16 +1250,17 @@ def _take_without_fill(self, indices): if self.sp_index.npoints == 0: # edge case in take... # I think just return - out = np.full(indices.shape, self.fill_value, - dtype=np.result_type(type(self.fill_value))) - arr, sp_index, fill_value = make_sparse(out, - fill_value=self.fill_value) - return type(self)(arr, sparse_index=sp_index, - fill_value=fill_value) + out = np.full( + indices.shape, + self.fill_value, + dtype=np.result_type(type(self.fill_value)), + ) + arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) + return type(self)(arr, sparse_index=sp_index, fill_value=fill_value) sp_indexer = self.sp_index.lookup_array(indices) taken = self.sp_values.take(sp_indexer) - fillable = (sp_indexer < 0) + fillable = sp_indexer < 0 if fillable.any(): # TODO: may need to coerce array to fill value @@ -1259,9 +1276,7 @@ def searchsorted(self, v, side="left", sorter=None): if not is_scalar(v): v = np.asarray(v) v = np.asarray(v) - return np.asarray(self, dtype=self.dtype.subtype).searchsorted( - v, side, sorter - ) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) def copy(self): values = self.sp_values.copy() @@ -1276,11 +1291,13 @@ def _concat_same_type(cls, to_concat): # np.nan isn't a singleton, so we may end up with multiple # NaNs here, so we ignore tha all NA case too. if not (len(set(fill_values)) == 1 or isna(fill_values).all()): - warnings.warn("Concatenating sparse arrays with multiple fill " - "values: '{}'. Picking the first and " - "converting the rest.".format(fill_values), - PerformanceWarning, - stacklevel=6) + warnings.warn( + "Concatenating sparse arrays with multiple fill " + "values: '{}'. 
Picking the first and " + "converting the rest.".format(fill_values), + PerformanceWarning, + stacklevel=6, + ) keep = to_concat[0] to_concat2 = [keep] @@ -1295,9 +1312,9 @@ def _concat_same_type(cls, to_concat): if to_concat: sp_kind = to_concat[0].kind else: - sp_kind = 'integer' + sp_kind = "integer" - if sp_kind == 'integer': + if sp_kind == "integer": indices = [] for arr in to_concat: @@ -1396,15 +1413,11 @@ def astype(self, dtype=None, copy=True): """ dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str - sp_values = astype_nansafe(self.sp_values, - subtype, - copy=copy) + sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() - return self._simple_new(sp_values, - self.sp_index, - dtype) + return self._simple_new(sp_values, self.sp_index, dtype) def map(self, mapper): """ @@ -1456,8 +1469,7 @@ def map(self, mapper): fill_value = mapper(self.fill_value) sp_values = [mapper(x) for x in self.sp_values] - return type(self)(sp_values, sparse_index=self.sp_index, - fill_value=fill_value) + return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value) def to_dense(self): """ @@ -1480,7 +1492,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use the 'to_dense' method instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() _internal_get_values = to_dense @@ -1504,9 +1518,9 @@ def __setstate__(self, state): def nonzero(self): if self.fill_value == 0: - return self.sp_index.to_int_index().indices, + return (self.sp_index.to_int_index().indices,) else: - return self.sp_index.to_int_index().indices[self.sp_values != 0], + return (self.sp_index.to_int_index().indices[self.sp_values != 0],) # ------------------------------------------------------------------------ # Reductions @@ -1516,8 +1530,11 @@ def _reduce(self, name, skipna=True, **kwargs): method = getattr(self, name, None) if method is None: - raise TypeError("cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype)) + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) if skipna: arr = self @@ -1528,9 +1545,9 @@ def _reduce(self, name, skipna=True, **kwargs): # They should only be present when called via pandas, so do it here. 
# instead of in `any` / `all` (which will raise if they're present, # thanks to nv.validate - kwargs.pop('filter_type', None) - kwargs.pop('numeric_only', None) - kwargs.pop('op', None) + kwargs.pop("filter_type", None) + kwargs.pop("numeric_only", None) + kwargs.pop("op", None) return getattr(arr, name)(**kwargs) def all(self, axis=None, *args, **kwargs): @@ -1618,8 +1635,11 @@ def cumsum(self, axis=0, *args, **kwargs): if not self._null_fill_value: return SparseArray(self.to_dense()).cumsum() - return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, - fill_value=self.fill_value) + return SparseArray( + self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value, + ) def mean(self, axis=0, *args, **kwargs): """ @@ -1660,7 +1680,7 @@ def T(self): _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - out = kwargs.get('out', ()) + out = kwargs.get("out", ()) for x in inputs + out: if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)): @@ -1668,7 +1688,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -1680,19 +1701,18 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if isinstance(sp_values, tuple): # multiple outputs. e.g. modf arrays = tuple( - self._simple_new(sp_value, - self.sp_index, - SparseDtype(sp_value.dtype, fv)) + self._simple_new( + sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv) + ) for sp_value, fv in zip(sp_values, fill_value) ) return arrays - return self._simple_new(sp_values, - self.sp_index, - SparseDtype(sp_values.dtype, fill_value)) + return self._simple_new( + sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) + ) - result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], - **kwargs) + result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs) if out: if len(out) == 1: out = out[0] @@ -1700,7 +1720,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if type(result) is tuple: return tuple(type(self)(x) for x in result) - elif method == 'at': + elif method == "at": # no return value return None else: @@ -1721,7 +1741,7 @@ def sparse_unary_method(self): dtype = SparseDtype(values.dtype, fill_value) return cls._simple_new(values, self.sp_index, dtype) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(sparse_unary_method, name, cls) @classmethod @@ -1737,34 +1757,41 @@ def sparse_arithmetic_method(self, other): return _sparse_array_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill = op(_get_fill(self), np.asarray(other)) result = op(self.sp_values, other) - if op_name == 'divmod': + if op_name == "divmod": left, right = result lfill, rfill = fill - return (_wrap_result(op_name, left, self.sp_index, lfill), - _wrap_result(op_name, right, self.sp_index, rfill)) + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) return _wrap_result(op_name, result, self.sp_index, fill) else: other = np.asarray(other) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # TODO: delete sparse stuff in core/ops.py # TODO: look into _wrap_result 
if len(self) != len(other): raise AssertionError( - ("length mismatch: {self} vs. {other}".format( - self=len(self), other=len(other)))) + ( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) + ) if not isinstance(other, SparseArray): - dtype = getattr(other, 'dtype', None) - other = SparseArray(other, fill_value=self.fill_value, - dtype=dtype) + dtype = getattr(other, "dtype", None) + other = SparseArray( + other, fill_value=self.fill_value, dtype=dtype + ) return _sparse_array_op(self, other, op, op_name) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(sparse_arithmetic_method, name, cls) @classmethod @@ -1772,7 +1799,7 @@ def _create_comparison_method(cls, op): def cmp_method(self, other): op_name = op.__name__ - if op_name in {'and_', 'or_'}: + if op_name in {"and_", "or_"}: op_name = op_name[:-1] if isinstance(other, (ABCSeries, ABCIndexClass)): @@ -1786,24 +1813,28 @@ def cmp_method(self, other): if isinstance(other, np.ndarray): # TODO: make this more flexible than just ndarray... if len(self) != len(other): - raise AssertionError("length mismatch: {self} vs. {other}" - .format(self=len(self), - other=len(other))) + raise AssertionError( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) other = SparseArray(other, fill_value=self.fill_value) if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) else: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill_value = op(self.fill_value, other) result = op(self.sp_values, other) - return type(self)(result, - sparse_index=self.sp_index, - fill_value=fill_value, - dtype=np.bool_) + return type(self)( + result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_, + ) - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(cmp_method, name, cls) @classmethod @@ -1822,10 +1853,11 @@ def _add_comparison_ops(cls): # Formatting # ----------- def __repr__(self): - return '{self}\nFill: {fill}\n{index}'.format( + return "{self}\nFill: {fill}\n{index}".format( self=printing.pprint_thing(self), fill=printing.pprint_thing(self.fill_value), - index=printing.pprint_thing(self.sp_index)) + index=printing.pprint_thing(self.sp_index), + ) def _formatter(self, boxed=False): # Defer to the formatter from the GenericArrayFormatter calling us. 
@@ -1842,12 +1874,12 @@ def _maybe_to_dense(obj):
     """
     try to convert to dense
     """
-    if hasattr(obj, 'to_dense'):
+    if hasattr(obj, "to_dense"):
         return obj.to_dense()
     return obj


-def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
+def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False):
     """
     Convert ndarray to sparse format

@@ -1904,13 +1936,13 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):


 def _make_index(length, indices, kind):
-    if kind == 'block' or isinstance(kind, BlockIndex):
+    if kind == "block" or isinstance(kind, BlockIndex):
         locs, lens = splib.get_blocks(indices)
         index = BlockIndex(length, locs, lens)
-    elif kind == 'integer' or isinstance(kind, IntIndex):
+    elif kind == "integer" or isinstance(kind, IntIndex):
         index = IntIndex(length, indices)
     else:  # pragma: no cover
-        raise ValueError('must be block or integer type')
+        raise ValueError("must be block or integer type")
     return index


@@ -1929,9 +1961,9 @@ def _validate(self, data):
         raise NotImplementedError


-@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
-                              'sp_values'],
-                typ='property')
+@delegate_names(
+    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
+)
 class SparseAccessor(BaseAccessor, PandasDelegate):
     """
     Accessor for SparseSparse from other sparse matrix data types.
@@ -1945,9 +1977,9 @@ def _delegate_property_get(self, name, *args, **kwargs):
         return getattr(self._parent.array, name)

     def _delegate_method(self, name, *args, **kwargs):
-        if name == 'from_coo':
+        if name == "from_coo":
             return self.from_coo(*args, **kwargs)
-        elif name == 'to_coo':
+        elif name == "to_coo":
             return self.to_coo(*args, **kwargs)
         else:
             raise ValueError
@@ -1995,13 +2027,12 @@ def from_coo(cls, A, dense_index=False):
         from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
         from pandas import Series

-        result = _coo_to_sparse_series(A, dense_index=dense_index,
-                                       sparse_series=False)
+        result = _coo_to_sparse_series(A, dense_index=dense_index, sparse_series=False)
         result = Series(result.array, index=result.index, copy=False)
         return result

-    def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
         """
         Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
@@ -2051,10 +2082,9 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): """ from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo - A, rows, columns = _sparse_series_to_coo(self._parent, - row_levels, - column_levels, - sort_labels=sort_labels) + A, rows, columns = _sparse_series_to_coo( + self._parent, row_levels, column_levels, sort_labels=sort_labels + ) return A, rows, columns def to_dense(self): @@ -2084,9 +2114,12 @@ def to_dense(self): dtype: int64 """ from pandas import Series - return Series(self._parent.array.to_dense(), - index=self._parent.index, - name=self._parent.name) + + return Series( + self._parent.array.to_dense(), + index=self._parent.index, + name=self._parent.name, + ) class SparseFrameAccessor(BaseAccessor, PandasDelegate): @@ -2136,10 +2169,7 @@ def from_spmatrix(cls, data, index=None, columns=None): data = data.tocsc() index, columns = cls._prep_index(data, index, columns) - sparrays = [ - SparseArray.from_spmatrix(data[:, i]) - for i in range(data.shape[1]) - ] + sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])] data = dict(enumerate(sparrays)) result = DataFrame(data, index=index) result.columns = columns @@ -2167,11 +2197,8 @@ def to_dense(self): """ from pandas import DataFrame - data = {k: v.array.to_dense() - for k, v in self._parent.items()} - return DataFrame(data, - index=self._parent.index, - columns=self._parent.columns) + data = {k: v.array.to_dense() for k, v in self._parent.items()} + return DataFrame(data, index=self._parent.index, columns=self._parent.columns) def to_coo(self): """ @@ -2221,8 +2248,7 @@ def density(self) -> float: Ratio of non-sparse points to total (dense) data points represented in the DataFrame. """ - return np.mean([column.array.density - for _, column in self._parent.items()]) + return np.mean([column.array.density for _, column in self._parent.items()]) @staticmethod def _prep_index(data, index, columns): @@ -2235,9 +2261,13 @@ def _prep_index(data, index, columns): columns = ibase.default_index(K) if len(columns) != K: - raise ValueError('Column length mismatch: {columns} vs. {K}' - .format(columns=len(columns), K=K)) + raise ValueError( + "Column length mismatch: {columns} vs. {K}".format( + columns=len(columns), K=K + ) + ) if len(index) != N: - raise ValueError('Index length mismatch: {index} vs. {N}' - .format(index=len(index), N=N)) + raise ValueError( + "Index length mismatch: {index} vs. 
{N}".format(index=len(index), N=N) + ) return index, columns diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 50bc8d6d3ae6b..9d622d92e0979 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -9,18 +9,36 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( - array_to_timedelta64, parse_timedelta_unit, precision_from_unit) + array_to_timedelta64, + parse_timedelta_unit, + precision_from_unit, +) import pandas.compat as compat from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_dtype_equal, - is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, - pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_int64, + is_datetime64_dtype, + is_dtype_equal, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex) + ABCDataFrame, + ABCIndexClass, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -44,8 +62,9 @@ def f(self): values = self.asi8 result = get_timedelta_field(values, alias) if self._hasnans: - result = self._maybe_mask_results(result, fill_value=None, - convert='float64') + result = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return result @@ -58,8 +77,8 @@ def _td_array_cmp(cls, op): """ Wrap comparison operations to convert timedelta-like to timedelta64 """ - opname = '__{name}__'.format(name=op.__name__) - nat_result = opname == '__ne__' + opname = "__{name}__".format(name=op.__name__) + nat_result = opname == "__ne__" def wrapper(self, other): other = lib.item_from_zerodim(other) @@ -73,7 +92,7 @@ def wrapper(self, other): # failed to parse as timedelta return ops.invalid_comparison(self, other, op) - result = op(self.view('i8'), other.value) + result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) @@ -89,7 +108,7 @@ def wrapper(self, other): except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) - result = op(self.view('i8'), other.view('i8')) + result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) o_mask = np.array(isna(other)) @@ -136,24 +155,30 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): ------- None """ + _typ = "timedeltaarray" _scalar_type = Timedelta __array_priority__ = 1000 # define my properties & methods for delegation _other_ops = [] # type: List[str] _bool_ops = [] # type: List[str] - _object_ops = ['freq'] - _field_ops = ['days', 'seconds', 'microseconds', 'nanoseconds'] + _object_ops = ["freq"] + _field_ops = ["days", "seconds", "microseconds", "nanoseconds"] _datetimelike_ops = _field_ops + _object_ops + _bool_ops - _datetimelike_methods = ["to_pytimedelta", "total_seconds", - "round", "floor", "ceil"] + _datetimelike_methods = [ + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] # Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise ndim = 1 @property def _box_func(self): - return lambda x: Timedelta(x, 
unit='ns') + return lambda x: Timedelta(x, unit="ns") @property def dtype(self): @@ -199,7 +224,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if values.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - if values.dtype == 'i8': + if values.dtype == "i8": # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps @@ -239,15 +264,13 @@ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): return result @classmethod - def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, - freq=None, unit=None): + def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, freq=None, unit=None): if dtype: _validate_td64_dtype(dtype) freq, freq_infer = dtl.maybe_infer_freq(freq) data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) - freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, - freq_infer) + freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer) result = cls._simple_new(data, freq=freq) @@ -267,12 +290,13 @@ def _generate_range(cls, start, end, periods, freq, closed=None): periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): - raise ValueError('Must provide freq argument if no data is ' - 'supplied') + raise ValueError("Must provide freq argument if no data is " "supplied") if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, ' - 'and freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) if start is not None: start = Timedelta(start) @@ -282,15 +306,16 @@ def _generate_range(cls, start, end, periods, freq, closed=None): if start is None and end is None: if closed is not None: - raise ValueError("Closed has to be None if not both of start" - "and end are defined") + raise ValueError( + "Closed has to be None if not both of start" "and end are defined" + ) left_closed, right_closed = dtl.validate_endpoints(closed) if freq is not None: index = _generate_regular_range(start, end, periods, freq) else: - index = np.linspace(start.value, end.value, periods).astype('i8') + index = np.linspace(start.value, end.value, periods).astype("i8") if not left_closed: index = index[1:] @@ -328,8 +353,10 @@ def _validate_fill_value(self, fill_value): elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): fill_value = Timedelta(fill_value).value else: - raise ValueError("'fill_value' should be a Timedelta. " - "Got '{got}'.".format(got=fill_value)) + raise ValueError( + "'fill_value' should be a Timedelta. 
" + "Got '{got}'.".format(got=fill_value) + ) return fill_value def astype(self, dtype, copy=True): @@ -346,12 +373,12 @@ def astype(self, dtype, copy=True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results(result, - fill_value=None, - convert='float64') + values = self._maybe_mask_results( + result, fill_value=None, convert="float64" + ) return values result = self._data.astype(dtype, copy=copy) - return result.astype('i8') + return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): if copy: return self.copy() @@ -363,9 +390,10 @@ def astype(self, dtype, copy=True): def _formatter(self, boxed=False): from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep='NaT', date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) @@ -378,9 +406,11 @@ def _format_native_types(self, na_rep='NaT', date_format=None): def _add_offset(self, other): assert not isinstance(other, Tick) - raise TypeError("cannot add the type {typ} to a {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "cannot add the type {typ} to a {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) def _add_delta(self, delta): """ @@ -397,7 +427,7 @@ def _add_delta(self, delta): result : TimedeltaArray """ new_values = super()._add_delta(delta) - return type(self)._from_sequence(new_values, freq='infer') + return type(self)._from_sequence(new_values, freq="infer") def _add_datetime_arraylike(self, other): """ @@ -406,6 +436,7 @@ def _add_datetime_arraylike(self, other): if isinstance(other, np.ndarray): # At this point we have already checked that dtype is datetime64 from pandas.core.arrays import DatetimeArray + other = DatetimeArray(other) # defer to implementation in DatetimeArray @@ -420,12 +451,11 @@ def _add_datetimelike_scalar(self, other): if other is NaT: # In this case we specifically interpret NaT as a datetime, not # the timedelta interpretation we would get by returning self + NaT - result = self.asi8.view('m8[ms]') + NaT.to_datetime64() + result = self.asi8.view("m8[ms]") + NaT.to_datetime64() return DatetimeArray(result) i8 = self.asi8 - result = checked_add_with_arr(i8, other.value, - arr_mask=self._isnan) + result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) @@ -438,8 +468,11 @@ def _addsub_offset_array(self, other, op): # which we re-raise as TypeError return super()._addsub_offset_array(other, op) except AttributeError: - raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}" - .format(cls=type(self).__name__)) + raise TypeError( + "Cannot add/subtract non-tick DateOffset to {cls}".format( + cls=type(self).__name__ + ) + ) def __mul__(self, other): other = lib.item_from_zerodim(other) @@ -546,9 +579,11 @@ def __rtruediv__(self, other): return other / self._data elif lib.is_scalar(other): - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) if not hasattr(other, "dtype"): # e.g. 
list, tuple @@ -569,9 +604,11 @@ def __rtruediv__(self, other): return np.array(result) else: - raise TypeError("Cannot divide {dtype} data by {cls}" - .format(dtype=other.dtype, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {dtype} data by {cls}".format( + dtype=other.dtype, cls=type(self).__name__ + ) + ) def __floordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): @@ -599,7 +636,7 @@ def __floordiv__(self, other): if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other - return type(self)(result.view('m8[ns]'), freq=freq) + return type(self)(result.view("m8[ns]"), freq=freq) if not hasattr(other, "dtype"): # list, tuple @@ -622,7 +659,7 @@ def __floordiv__(self, other): elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) - if lib.infer_dtype(result, skipna=False) == 'timedelta': + if lib.infer_dtype(result, skipna=False) == "timedelta": result, _ = sequence_to_td64ns(result) return type(self)(result) return result @@ -633,8 +670,11 @@ def __floordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=dtype, cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=dtype, cls=type(self).__name__ + ) + ) def __rfloordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): @@ -654,9 +694,11 @@ def __rfloordiv__(self, other): result = other.__floordiv__(self._data) return result - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=type(other).__name__, - cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=type(other).__name__, cls=type(self).__name__ + ) + ) if not hasattr(other, "dtype"): # list, tuple @@ -683,8 +725,11 @@ def __rfloordiv__(self, other): else: dtype = getattr(other, "dtype", type(other).__name__) - raise TypeError("Cannot divide {typ} by {cls}" - .format(typ=dtype, cls=type(self).__name__)) + raise TypeError( + "Cannot divide {typ} by {cls}".format( + typ=dtype, cls=type(self).__name__ + ) + ) def __mod__(self, other): # Note: This is a naive implementation, can likely be optimized @@ -813,17 +858,22 @@ def to_pytimedelta(self): """ return tslibs.ints_to_pytimedelta(self.asi8) - days = _field_accessor("days", "days", - "Number of days for each element.") - seconds = _field_accessor("seconds", "seconds", - "Number of seconds (>= 0 and less than 1 day) " - "for each element.") - microseconds = _field_accessor("microseconds", "microseconds", - "Number of microseconds (>= 0 and less " - "than 1 second) for each element.") - nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "Number of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.") + days = _field_accessor("days", "days", "Number of days for each element.") + seconds = _field_accessor( + "seconds", + "seconds", + "Number of seconds (>= 0 and less than 1 day) " "for each element.", + ) + microseconds = _field_accessor( + "microseconds", + "microseconds", + "Number of microseconds (>= 0 and less " "than 1 second) for each element.", + ) + nanoseconds = _field_accessor( + "nanoseconds", + "nanoseconds", + "Number of nanoseconds (>= 0 and less " "than 1 microsecond) for each element.", + ) @property def components(self): @@ -837,21 +887,31 @@ def components(self): """ from pandas import DataFrame - columns = ['days', 'hours', 'minutes', 'seconds', - 
'milliseconds', 'microseconds', 'nanoseconds'] + columns = [ + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + ] hasnans = self._hasnans if hasnans: + def f(x): if isna(x): return [np.nan] * len(columns) return x.components + else: + def f(x): return x.components result = DataFrame([f(x) for x in self], columns=columns) if not hasnans: - result = result.astype('int64') + result = result.astype("int64") return result @@ -861,6 +921,7 @@ def f(x): # --------------------------------------------------------------------- # Constructor Helpers + def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters @@ -894,7 +955,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray - if not hasattr(data, 'dtype'): + if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator @@ -926,7 +987,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): frac = data - base if p: frac = np.round(frac, p) - data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') + data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False @@ -939,21 +1000,27 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): elif is_datetime64_dtype(data): # GH#23539 - warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " - "deprecated, will raise a TypeError in a future " - "version", - FutureWarning, stacklevel=4) + warnings.warn( + "Passing datetime64-dtype data to TimedeltaIndex is " + "deprecated, will raise a TypeError in a future " + "version", + FutureWarning, + stacklevel=4, + ) data = ensure_int64(data).view(_TD_DTYPE) else: - raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" - .format(dtype=data.dtype)) + raise TypeError( + "dtype {dtype} cannot be converted to timedelta64[ns]".format( + dtype=data.dtype + ) + ) data = np.array(data, copy=copy) if data.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") - assert data.dtype == 'm8[ns]', data + assert data.dtype == "m8[ns]", data return data, inferred_freq @@ -1028,19 +1095,20 @@ def objects_to_td64ns(data, unit="ns", errors="raise"): # coerce Index to np.ndarray, converting string-dtype if necessary values = np.array(data, dtype=np.object_, copy=False) - result = array_to_timedelta64(values, - unit=unit, errors=errors) - return result.view('timedelta64[ns]') + result = array_to_timedelta64(values, unit=unit, errors=errors) + return result.view("timedelta64[ns]") def _validate_td64_dtype(dtype): dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, np.dtype("timedelta64")): dtype = _TD_DTYPE - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ Passing in 'timedelta' dtype with no precision is deprecated and will raise in a future version. Please pass in - 'timedelta64[ns]' instead.""") + 'timedelta64[ns]' instead.""" + ) warnings.warn(msg, FutureWarning, stacklevel=4) if not is_dtype_equal(dtype, _TD_DTYPE): @@ -1062,8 +1130,9 @@ def _generate_regular_range(start, end, periods, offset): e = Timedelta(end).value + stride b = e - periods * stride else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") + raise ValueError( + "at least 'start' or 'end' should be specified " "if a 'period' is given." 
+ ) data = np.arange(b, e, stride, dtype=np.int64) return data diff --git a/pandas/core/base.py b/pandas/core/base.py index 93db65deff820..15baf1bed0ecd 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -16,9 +16,17 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like, - is_object_dtype, is_scalar, is_timedelta64_ns_dtype) + is_categorical_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_extension_array_dtype, + is_extension_type, + is_list_like, + is_object_dtype, + is_scalar, + is_timedelta64_ns_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -28,14 +36,19 @@ import pandas.core.nanops as nanops _shared_docs = dict() -_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', - unique='IndexOpsMixin', duplicated='IndexOpsMixin') +_indexops_doc_kwargs = dict( + klass="IndexOpsMixin", + inplace="", + unique="IndexOpsMixin", + duplicated="IndexOpsMixin", +) class StringMixin: """ Implements string methods so long as object defines a `__str__` method. """ + # side note - this could be made into a metaclass if more than one # object needs @@ -75,7 +88,7 @@ def _reset_cache(self, key=None): """ Reset cached properties. If ``key`` is passed, only clears that key. """ - if getattr(self, '_cache', None) is None: + if getattr(self, "_cache", None) is None: return if key is None: self._cache.clear() @@ -87,7 +100,7 @@ def __sizeof__(self): Generates the total memory usage for an object that returns either a value or Series of values """ - if hasattr(self, 'memory_usage'): + if hasattr(self, "memory_usage"): mem = self.memory_usage(deep=True) if not is_scalar(mem): mem = mem.sum() @@ -120,12 +133,14 @@ def __setattr__(self, key, value): # because # 1.) getattr is false for attributes that raise errors # 2.) cls.__dict__ doesn't traverse into base classes - if (getattr(self, "__frozen", False) and not - (key == "_cache" or - key in type(self).__dict__ or - getattr(self, key, None) is not None)): - raise AttributeError("You cannot add any new attribute '{key}'". 
- format(key=key)) + if getattr(self, "__frozen", False) and not ( + key == "_cache" + or key in type(self).__dict__ + or getattr(self, key, None) is not None + ): + raise AttributeError( + "You cannot add any new attribute '{key}'".format(key=key) + ) object.__setattr__(self, key, value) @@ -146,43 +161,44 @@ class SelectionMixin: mixin implementing the selection & aggregation interface on a group-like object sub-classes need to define: obj, exclusions """ + _selection = None - _internal_names = ['_cache', '__setstate__'] + _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) - _builtin_table = OrderedDict(( - (builtins.sum, np.sum), - (builtins.max, np.max), - (builtins.min, np.min), - )) - - _cython_table = OrderedDict(( - (builtins.sum, 'sum'), - (builtins.max, 'max'), - (builtins.min, 'min'), - (np.all, 'all'), - (np.any, 'any'), - (np.sum, 'sum'), - (np.nansum, 'sum'), - (np.mean, 'mean'), - (np.nanmean, 'mean'), - (np.prod, 'prod'), - (np.nanprod, 'prod'), - (np.std, 'std'), - (np.nanstd, 'std'), - (np.var, 'var'), - (np.nanvar, 'var'), - (np.median, 'median'), - (np.nanmedian, 'median'), - (np.max, 'max'), - (np.nanmax, 'max'), - (np.min, 'min'), - (np.nanmin, 'min'), - (np.cumprod, 'cumprod'), - (np.nancumprod, 'cumprod'), - (np.cumsum, 'cumsum'), - (np.nancumsum, 'cumsum'), - )) + _builtin_table = OrderedDict( + ((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min)) + ) + + _cython_table = OrderedDict( + ( + (builtins.sum, "sum"), + (builtins.max, "max"), + (builtins.min, "min"), + (np.all, "all"), + (np.any, "any"), + (np.sum, "sum"), + (np.nansum, "sum"), + (np.mean, "mean"), + (np.nanmean, "mean"), + (np.prod, "prod"), + (np.nanprod, "prod"), + (np.std, "std"), + (np.nanstd, "std"), + (np.var, "var"), + (np.nanvar, "var"), + (np.median, "median"), + (np.nanmedian, "median"), + (np.max, "max"), + (np.nanmax, "max"), + (np.min, "min"), + (np.nanmin, "min"), + (np.cumprod, "cumprod"), + (np.nancumprod, "cumprod"), + (np.cumsum, "cumsum"), + (np.nancumsum, "cumsum"), + ) + ) @property def _selection_name(self): @@ -198,8 +214,9 @@ def _selection_name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, ABCSeries, - ABCIndexClass, np.ndarray)): + if not isinstance( + self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray) + ): return [self._selection] return self._selection @@ -217,8 +234,7 @@ def ndim(self): @cache_readonly def _obj_with_exclusions(self): - if self._selection is not None and isinstance(self.obj, - ABCDataFrame): + if self._selection is not None and isinstance(self.obj, ABCDataFrame): return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: @@ -228,18 +244,21 @@ def _obj_with_exclusions(self): def __getitem__(self, key): if self._selection is not None: - raise IndexError('Column(s) {selection} already selected' - .format(selection=self._selection)) + raise IndexError( + "Column(s) {selection} already selected".format( + selection=self._selection + ) + ) - if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, - np.ndarray)): + if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) - raise KeyError("Columns not found: {missing}" - .format(missing=str(bad_keys)[1:-1])) + raise KeyError( + "Columns not found: {missing}".format(missing=str(bad_keys)[1:-1]) + ) return self._gotitem(list(key), ndim=2) - elif 
not getattr(self, 'as_index', False): + elif not getattr(self, "as_index", False): if key not in self.obj.columns: raise KeyError("Column not found: {key}".format(key=key)) return self._gotitem(key, ndim=2) @@ -288,8 +307,9 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): # people may try to aggregate on a non-callable attribute # but don't let them think they can pass args to it assert len(args) == 0 - assert len([kwarg for kwarg in kwargs - if kwarg not in ['axis', '_level']]) == 0 + assert ( + len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 + ) return f f = getattr(np, arg, None) @@ -320,34 +340,35 @@ def _aggregate(self, arg, *args, **kwargs): is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False - _axis = kwargs.pop('_axis', None) + _axis = kwargs.pop("_axis", None) if _axis is None: - _axis = getattr(self, 'axis', 0) - _level = kwargs.pop('_level', None) + _axis = getattr(self, "axis", 0) + _level = kwargs.pop("_level", None) if isinstance(arg, str): - return self._try_aggregate_string_function(arg, *args, - **kwargs), None + return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover - raise ValueError('Can only pass dict with axis=0') + raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 - msg = textwrap.dedent("""\ + msg = textwrap.dedent( + """\ using a dict with renaming is deprecated and will be removed in a future version. For column-specific groupby renaming, use named aggregation >>> df.groupby(...).agg(name=('column', aggfunc)) - """) + """ + ) warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars @@ -375,17 +396,17 @@ def nested_renaming_depr(level=4): is_nested_renamer = True if k not in obj.columns: - msg = ('cannot perform renaming for {key} with a ' - 'nested dictionary').format(key=k) + msg = ( + "cannot perform renaming for {key} with a " + "nested dictionary" + ).format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() - elif (isinstance(obj, ABCDataFrame) and - k not in obj.columns): - raise KeyError( - "Column '{col}' does not exist!".format(col=k)) + elif isinstance(obj, ABCDataFrame) and k not in obj.columns: + raise KeyError("Column '{col}' does not exist!".format(col=k)) arg = new_arg @@ -393,8 +414,9 @@ def nested_renaming_depr(level=4): # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) - if (isinstance(obj, ABCDataFrame) and - len(obj.columns.intersection(keys)) != len(keys)): + if isinstance(obj, ABCDataFrame) and len( + obj.columns.intersection(keys) + ) != len(keys): nested_renaming_depr() from pandas.core.reshape.concat import concat @@ -405,16 +427,16 @@ def _agg_1dim(name, how, subset=None): """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: - raise SpecificationError("nested dictionary is ambiguous " - "in aggregation") + raise SpecificationError( + "nested dictionary is ambiguous " "in aggregation" + ) return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ - colg = self._gotitem(self._selection, ndim=2, - subset=obj) + colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): @@ -456,8 +478,9 @@ 
def _agg(arg, func): # but may have multiple aggregations if len(sl) == 1: - result = _agg(arg, lambda fname, - agg_how: _agg_1dim(self._selection, agg_how)) + result = _agg( + arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) + ) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): @@ -488,8 +511,7 @@ def is_any_series(): def is_any_frame(): # return a boolean if we have *any* nested series - return any(isinstance(r, ABCDataFrame) - for r in result.values()) + return any(isinstance(r, ABCDataFrame) for r in result.values()) if isinstance(result, list): return concat(result, keys=keys, axis=1, sort=True), True @@ -498,8 +520,7 @@ def is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame - return concat([result[k] for k in keys], - keys=keys, axis=1), True + return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): @@ -512,28 +533,28 @@ def is_any_frame(): # we have non-same sized objects, so # we don't automatically broadcast - raise ValueError("cannot perform both aggregation " - "and transformation operations " - "simultaneously") + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) return result, True # fall thru from pandas import DataFrame, Series + try: result = DataFrame(result) except ValueError: # we have a dict of scalars - result = Series(result, - name=getattr(self, 'name', None)) + result = Series(result, name=getattr(self, "name", None)) return result, True elif is_list_like(arg): # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, - _level=_level, - _axis=_axis), None + return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None @@ -577,8 +598,7 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): else: for index, col in enumerate(obj): try: - colg = self._gotitem(col, ndim=1, - subset=obj.iloc[:, index]) + colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) results.append(colg.aggregate(arg)) keys.append(col) except (TypeError, DataError): @@ -602,10 +622,12 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): from pandas.core.dtypes.cast import is_nested_object from pandas import Series + result = Series(results, index=keys, name=self.name) if is_nested_object(result): - raise ValueError("cannot combine transform and " - "aggregation operations") + raise ValueError( + "cannot combine transform and " "aggregation operations" + ) return result def _shallow_copy(self, obj=None, obj_type=None, **kwargs): @@ -656,8 +678,11 @@ def transpose(self, *args, **kwargs): nv.validate_transpose(args, kwargs) return self - T = property(transpose, doc="""\nReturn the transpose, which is by - definition self.\n""") + T = property( + transpose, + doc="""\nReturn the transpose, which is by + definition self.\n""", + ) @property def _is_homogeneous_type(self): @@ -700,8 +725,11 @@ def item(self): scalar The first element of %(klass)s. """ - warnings.warn('`item` has been deprecated and will be removed in a ' - 'future version', FutureWarning, stacklevel=2) + warnings.warn( + "`item` has been deprecated and will be removed in a " "future version", + FutureWarning, + stacklevel=2, + ) return self.values.item() @property @@ -711,9 +739,12 @@ def data(self): .. 
deprecated:: 0.23.0 """ - warnings.warn("{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.data is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.data @property @@ -723,9 +754,12 @@ def itemsize(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.itemsize is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.itemsize is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.itemsize @property @@ -742,9 +776,12 @@ def strides(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.strides is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.strides is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.strides @property @@ -761,9 +798,12 @@ def flags(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.flags is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.flags @property @@ -773,9 +813,12 @@ def base(self): .. deprecated:: 0.23.0 """ - warnings.warn("{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.base is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self.values.base @property @@ -849,13 +892,16 @@ def array(self) -> ExtensionArray: if is_datetime64_ns_dtype(result.dtype): from pandas.arrays import DatetimeArray + result = DatetimeArray(result) elif is_timedelta64_ns_dtype(result.dtype): from pandas.arrays import TimedeltaArray + result = TimedeltaArray(result) elif not is_extension_array_dtype(result.dtype): from pandas.core.arrays.numpy_ import PandasArray + result = PandasArray(result) return result @@ -1156,13 +1202,17 @@ def hasnans(self): """ return bool(isna(self).any()) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ perform the reduction type operation if we can """ func = getattr(self, name, None) if func is None: - raise TypeError("{klass} cannot perform the operation {op}".format( - klass=self.__class__.__name__, op=name)) + raise TypeError( + "{klass} cannot perform the operation {op}".format( + klass=self.__class__.__name__, op=name + ) + ) return func(skipna=skipna, **kwds) def _map_values(self, mapper, na_action=None): @@ -1191,7 +1241,7 @@ def _map_values(self, mapper, na_action=None): # as we know that we are not going to have to yield # python types if isinstance(mapper, dict): - if hasattr(mapper, '__missing__'): + if hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). 
dict_with_default = mapper @@ -1202,6 +1252,7 @@ def _map_values(self, mapper, na_action=None): # we specify the keys here to handle the # possibility that they are tuples from pandas import Series + mapper = Series(mapper) if isinstance(mapper, ABCSeries): @@ -1229,11 +1280,12 @@ def _map_values(self, mapper, na_action=None): map_f = lambda values, f: values.map(f) else: values = self.astype(object) - values = getattr(values, 'values', values) - if na_action == 'ignore': + values = getattr(values, "values", values) + if na_action == "ignore": + def map_f(values, f): - return lib.map_infer_mask(values, f, - isna(values).view(np.uint8)) + return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) + else: map_f = lib.map_infer @@ -1242,8 +1294,9 @@ def map_f(values, f): return new_values - def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None, dropna=True): + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): """ Return a Series containing counts of unique values. @@ -1322,18 +1375,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False, dtype: int64 """ from pandas.core.algorithms import value_counts - result = value_counts(self, sort=sort, ascending=ascending, - normalize=normalize, bins=bins, dropna=dropna) + + result = value_counts( + self, + sort=sort, + ascending=ascending, + normalize=normalize, + bins=bins, + dropna=dropna, + ) return result def unique(self): values = self._values - if hasattr(values, 'unique'): + if hasattr(values, "unique"): result = values.unique() else: from pandas.core.algorithms import unique1d + result = unique1d(values) return result @@ -1402,6 +1463,7 @@ def is_monotonic(self): bool """ from pandas import Index + return Index(self).is_monotonic is_monotonic_increasing = is_monotonic @@ -1419,6 +1481,7 @@ def is_monotonic_decreasing(self): bool """ from pandas import Index + return Index(self).is_monotonic_decreasing def memory_usage(self, deep=False): @@ -1444,7 +1507,7 @@ def memory_usage(self, deep=False): Memory usage does not include memory consumed by elements that are not components of the array if deep=False or if used on PyPy """ - if hasattr(self.array, 'memory_usage'): + if hasattr(self.array, "memory_usage"): return self.array.memory_usage(deep=deep) v = self.array.nbytes @@ -1453,18 +1516,24 @@ def memory_usage(self, deep=False): return v @Substitution( - values='', order='', size_hint='', - sort=textwrap.dedent("""\ + values="", + order="", + size_hint="", + sort=textwrap.dedent( + """\ sort : boolean, default False Sort `uniques` and shuffle `labels` to maintain the relationship. - """)) - @Appender(algorithms._shared_docs['factorize']) + """ + ), + ) + @Appender(algorithms._shared_docs["factorize"]) def factorize(self, sort=False, na_sentinel=-1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) - _shared_docs['searchsorted'] = ( - """ + _shared_docs[ + "searchsorted" + ] = """ Find indices where elements should be inserted to maintain order. 
Find the indices into a sorted %(klass)s `self` such that, if the @@ -1534,16 +1603,15 @@ def factorize(self, sort=False, na_sentinel=-1): >>> x.searchsorted(['bread'], side='right') array([3]) - """) + """ - @Substitution(klass='Index') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): - return algorithms.searchsorted(self._values, value, - side=side, sorter=sorter) + @Substitution(klass="Index") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) - def drop_duplicates(self, keep='first', inplace=False): - inplace = validate_bool_kwarg(inplace, 'inplace') + def drop_duplicates(self, keep="first", inplace=False): + inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(self, ABCIndexClass): if self.is_unique: return self._shallow_copy() @@ -1555,15 +1623,17 @@ def drop_duplicates(self, keep='first', inplace=False): else: return result - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas.core.algorithms import duplicated + if isinstance(self, ABCIndexClass): if self.is_unique: return np.zeros(len(self), dtype=np.bool) return duplicated(self, keep=keep) else: - return self._constructor(duplicated(self, keep=keep), - index=self.index).__finalize__(self) + return self._constructor( + duplicated(self, keep=keep), index=self.index + ).__finalize__(self) # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/common.py b/pandas/core/common.py index 771ded04f461d..d2dd0d03d9425 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,7 +18,11 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( - is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) + is_array_like, + is_bool_dtype, + is_extension_array_dtype, + is_integer, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -114,9 +118,10 @@ def is_bool_indexer(key: Any) -> bool: When the array is an object-dtype ndarray or ExtensionArray and contains missing values. """ - na_msg = 'cannot index with vector containing NA / NaN values' - if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or - (is_array_like(key) and is_extension_array_dtype(key.dtype))): + na_msg = "cannot index with vector containing NA / NaN values" + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + is_array_like(key) and is_extension_array_dtype(key.dtype) + ): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) @@ -234,7 +239,7 @@ def dict_keys_to_ordered_list(mapping): def asarray_tuplesafe(values, dtype=None): - if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')): + if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")): values = list(values) elif isinstance(values, ABCIndexClass): return values.values @@ -302,8 +307,12 @@ def is_null_slice(obj): """ We have a null slice. 
""" - return (isinstance(obj, slice) and obj.start is None and - obj.stop is None and obj.step is None) + return ( + isinstance(obj, slice) + and obj.start is None + and obj.stop is None + and obj.step is None + ) def is_true_slices(l): @@ -318,19 +327,20 @@ def is_full_slice(obj, l): """ We have a full length slice. """ - return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and - obj.step is None) + return ( + isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None + ) def get_callable_name(obj): # typical case has name - if hasattr(obj, '__name__'): - return getattr(obj, '__name__') + if hasattr(obj, "__name__"): + return getattr(obj, "__name__") # some objects don't; could recurse if isinstance(obj, partial): return get_callable_name(obj.func) # fall back to class name - if hasattr(obj, '__call__'): + if hasattr(obj, "__call__"): return obj.__class__.__name__ # everything failed (probably because the argument # wasn't actually callable); we return None @@ -399,14 +409,12 @@ def standardize_mapping(into): """ if not inspect.isclass(into): if isinstance(into, collections.defaultdict): - return partial( - collections.defaultdict, into.default_factory) + return partial(collections.defaultdict, into.default_factory) into = type(into) if not issubclass(into, abc.Mapping): - raise TypeError('unsupported type: {into}'.format(into=into)) + raise TypeError("unsupported type: {into}".format(into=into)) elif into == collections.defaultdict: - raise TypeError( - 'to_dict() only accepts initialized defaultdicts') + raise TypeError("to_dict() only accepts initialized defaultdicts") return into @@ -435,8 +443,9 @@ def random_state(state=None): elif state is None: return np.random else: - raise ValueError("random_state must be an integer, a numpy " - "RandomState, or None") + raise ValueError( + "random_state must be an integer, a numpy " "RandomState, or None" + ) def _pipe(obj, func, *args, **kwargs): @@ -466,7 +475,7 @@ def _pipe(obj, func, *args, **kwargs): if isinstance(func, tuple): func, target = func if target in kwargs: - msg = '%s is both the pipe target and a keyword argument' % target + msg = "%s is both the pipe target and a keyword argument" % target raise ValueError(msg) kwargs[target] = obj return func(*args, **kwargs) @@ -486,6 +495,7 @@ def f(x): return mapper[x] else: return x + else: f = mapper diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index a7524161dd80e..1046401850963 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -18,25 +18,23 @@ def _align_core_single_unary_op(term): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = typ, + ret = (typ,) - if not hasattr(term.value, 'axes'): - ret += None, + if not hasattr(term.value, "axes"): + ret += (None,) else: - ret += _zip_axes_from_type(typ, term.value.axes), + ret += (_zip_axes_from_type(typ, term.value.axes),) return ret def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] - for ax_ind, ax_name in typ._AXIS_NAMES.items()} + axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} return axes def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(isinstance(term.value, pd.core.generic.PandasObject) - for term in terms) + return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms) def _filter_special_cases(f): @@ -53,13 +51,13 @@ def wrapper(terms): return 
_result_type_many(*term_values), None return f(terms) + return wrapper @_filter_special_cases def _align_core(terms): - term_index = [i for i, term in enumerate(terms) - if hasattr(term.value, 'axes')] + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")] term_dims = [terms[i].value.ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) @@ -81,13 +79,13 @@ def _align_core(terms): ax, itm = axis, items if not axes[ax].is_(itm): - axes[ax] = axes[ax].join(itm, how='outer') + axes[ax] = axes[ax].join(itm, how="outer") for i, ndim in ndims.items(): for axis, items in zip(range(ndim), axes): ti = terms[i].value - if hasattr(ti, 'reindex'): + if hasattr(ti, "reindex"): transpose = isinstance(ti, pd.Series) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items @@ -96,10 +94,11 @@ def _align_core(terms): ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: - w = ('Alignment difference on axis {axis} is larger ' - 'than an order of magnitude on term {term!r}, by ' - 'more than {ordm:.4g}; performance may suffer' - ).format(axis=axis, term=terms[i].name, ordm=ordm) + w = ( + "Alignment difference on axis {axis} is larger " + "than an order of magnitude on term {term!r}, by " + "more than {ordm:.4g}; performance may suffer" + ).format(axis=axis, term=terms[i].name, ordm=ordm) warnings.warn(w, category=PerformanceWarning, stacklevel=6) f = partial(ti.reindex, reindexer, axis=axis, copy=False) @@ -158,12 +157,11 @@ def _reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) - if (not isinstance(typ, partial) and - issubclass(typ, pd.core.generic.PandasObject)): + if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject): return typ(obj, dtype=res_t, **axes) # special case for pathological things like ~True/~False - if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_: + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: ret_value = typ(obj).astype(res_t) diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index fc6b9a2522824..4d205909b9e2e 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,11 +1,10 @@ from pandas.compat._optional import import_optional_dependency -ne = import_optional_dependency("numexpr", raise_on_missing=False, - on_version="warn") +ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") _NUMEXPR_INSTALLED = ne is not None if _NUMEXPR_INSTALLED: _NUMEXPR_VERSION = ne.__version__ else: _NUMEXPR_VERSION = None -__all__ = ['_NUMEXPR_INSTALLED', '_NUMEXPR_VERSION'] +__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 6a0e7981ad82b..ddb1023479cba 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -11,7 +11,7 @@ def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): - s = s.decode(pd.get_option('display.encoding')) + s = s.decode(pd.get_option("display.encoding")) return s diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index c75552d15441d..2c94b142a45b3 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -5,8 +5,7 @@ import abc from pandas.core.computation.align import _align, _reconstruct_object -from 
pandas.core.computation.ops import ( - UndefinedVariableError, _mathops, _reductions) +from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions import pandas.io.formats.printing as printing @@ -29,10 +28,11 @@ def _check_ne_builtin_clash(expr): overlap = names & _ne_builtins if overlap: - s = ', '.join(map(repr, overlap)) - raise NumExprClobberingError('Variables in expression "{expr}" ' - 'overlap with builtins: ({s})' - .format(expr=expr, s=s)) + s = ", ".join(map(repr, overlap)) + raise NumExprClobberingError( + 'Variables in expression "{expr}" ' + "overlap with builtins: ({s})".format(expr=expr, s=s) + ) class AbstractEngine(metaclass=abc.ABCMeta): @@ -68,8 +68,9 @@ def evaluate(self): # make sure no names in resolvers and locals/globals clash res = self._evaluate() - return _reconstruct_object(self.result_type, res, self.aligned_axes, - self.expr.terms.return_type) + return _reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) @property def _is_aligned(self): @@ -95,6 +96,7 @@ def _evaluate(self): class NumExprEngine(AbstractEngine): """NumExpr engine class""" + has_neg_frac = True def __init__(self, expr): @@ -112,7 +114,7 @@ def _evaluate(self): try: env = self.expr.env scope = env.full_scope - truediv = scope['truediv'] + truediv = scope["truediv"] _check_ne_builtin_clash(self.expr) return ne.evaluate(s, local_dict=scope, truediv=truediv) except KeyError as e: @@ -130,6 +132,7 @@ class PythonEngine(AbstractEngine): Mostly for testing purposes. """ + has_neg_frac = False def __init__(self, expr): @@ -142,4 +145,4 @@ def _evaluate(self): pass -_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} +_engines = {"numexpr": NumExprEngine, "python": PythonEngine} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index ef4639a3afe4c..456ecf4b2594f 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -38,24 +38,28 @@ def _check_engine(engine): if engine is None: if _NUMEXPR_INSTALLED: - engine = 'numexpr' + engine = "numexpr" else: - engine = 'python' + engine = "python" if engine not in _engines: valid = list(_engines.keys()) - raise KeyError('Invalid engine {engine!r} passed, valid engines are' - ' {valid}'.format(engine=engine, valid=valid)) + raise KeyError( + "Invalid engine {engine!r} passed, valid engines are" + " {valid}".format(engine=engine, valid=valid) + ) # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation - if engine == 'numexpr': + if engine == "numexpr": if not _NUMEXPR_INSTALLED: - raise ImportError("'numexpr' is not installed or an " - "unsupported version. Cannot use " - "engine='numexpr' for query/eval " - "if 'numexpr' is not installed") + raise ImportError( + "'numexpr' is not installed or an " + "unsupported version. 
Cannot use " + "engine='numexpr' for query/eval " + "if 'numexpr' is not installed" + ) return engine @@ -76,17 +80,21 @@ def _check_parser(parser): from pandas.core.computation.expr import _parsers if parser not in _parsers: - raise KeyError('Invalid parser {parser!r} passed, valid parsers are' - ' {valid}'.format(parser=parser, valid=_parsers.keys())) + raise KeyError( + "Invalid parser {parser!r} passed, valid parsers are" + " {valid}".format(parser=parser, valid=_parsers.keys()) + ) def _check_resolvers(resolvers): if resolvers is not None: for resolver in resolvers: - if not hasattr(resolver, '__getitem__'): + if not hasattr(resolver, "__getitem__"): name = type(resolver).__name__ - raise TypeError('Resolver of type {name!r} does not implement ' - 'the __getitem__ method'.format(name=name)) + raise TypeError( + "Resolver of type {name!r} does not implement " + "the __getitem__ method".format(name=name) + ) def _check_expression(expr): @@ -140,25 +148,36 @@ def _check_for_locals(expr, stack_level, parser): from pandas.core.computation.expr import tokenize_string at_top_of_stack = stack_level == 0 - not_pandas_parser = parser != 'pandas' + not_pandas_parser = parser != "pandas" if not_pandas_parser: msg = "The '@' prefix is only supported by the pandas parser" elif at_top_of_stack: - msg = ("The '@' prefix is not allowed in " - "top-level eval calls, \nplease refer to " - "your variables by name without the '@' " - "prefix") + msg = ( + "The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix" + ) if at_top_of_stack or not_pandas_parser: for toknum, tokval in tokenize_string(expr): - if toknum == tokenize.OP and tokval == '@': + if toknum == tokenize.OP and tokval == "@": raise SyntaxError(msg) -def eval(expr, parser='pandas', engine=None, truediv=True, - local_dict=None, global_dict=None, resolvers=(), level=0, - target=None, inplace=False): +def eval( + expr, + parser="pandas", + engine=None, + truediv=True, + local_dict=None, + global_dict=None, + resolvers=(), + level=0, + target=None, + inplace=False, +): """ Evaluate a Python expression as a string using various backends. 
@@ -269,14 +288,16 @@ def eval(expr, parser='pandas', engine=None, truediv=True, if isinstance(expr, str): _check_expression(expr) - exprs = [e.strip() for e in expr.splitlines() if e.strip() != ''] + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] else: exprs = [expr] multi_line = len(exprs) > 1 if multi_line and target is None: - raise ValueError("multi-line expressions are only valid in the " - "context of data, use DataFrame.eval") + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) ret = None first_expr = True @@ -290,12 +311,15 @@ def eval(expr, parser='pandas', engine=None, truediv=True, _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope(level + 1, global_dict=global_dict, - local_dict=local_dict, resolvers=resolvers, - target=target) + env = _ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, - truediv=truediv) + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) # construct the engine and evaluate the parsed expression eng = _engines[engine] @@ -304,11 +328,12 @@ def eval(expr, parser='pandas', engine=None, truediv=True, if parsed_expr.assigner is None: if multi_line: - raise ValueError("Multi-line expressions are only valid" - " if all expressions contain an assignment") + raise ValueError( + "Multi-line expressions are only valid" + " if all expressions contain an assignment" + ) elif inplace: - raise ValueError("Cannot operate inplace " - "if there is no assignment") + raise ValueError("Cannot operate inplace " "if there is no assignment") # assign if needed assigner = parsed_expr.assigner diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 32bd34c4db7d7..772fb547567e3 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -15,11 +15,27 @@ from pandas.core import common as com from pandas.core.base import StringMixin from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, _remove_spaces_column_name) + _BACKTICK_QUOTED_STRING, + _remove_spaces_column_name, +) from pandas.core.computation.ops import ( - _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, - UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, - _mathops, _reductions, _unary_ops_syms, is_term) + _LOCAL_TAG, + BinOp, + Constant, + Div, + FuncNode, + Op, + Term, + UnaryOp, + UndefinedVariableError, + _arith_ops_syms, + _bool_ops_syms, + _cmp_ops_syms, + _mathops, + _reductions, + _unary_ops_syms, + is_term, +) from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing @@ -40,10 +56,13 @@ def tokenize_string(source): # Then, take all tokens till the next backtick to form a backtick quoted # string. 
for toknum, tokval, _, _, _ in token_generator: - if tokval == '`': - tokval = " ".join(it.takewhile( - lambda tokval: tokval != '`', - map(operator.itemgetter(1), token_generator))) + if tokval == "`": + tokval = " ".join( + it.takewhile( + lambda tokval: tokval != "`", + map(operator.itemgetter(1), token_generator), + ) + ) toknum = _BACKTICK_QUOTED_STRING yield toknum, tokval @@ -63,7 +82,7 @@ def _rewrite_assign(tok): Either the input or token or the replacement values """ toknum, tokval = tok - return toknum, '==' if tokval == '=' else tokval + return toknum, "==" if tokval == "=" else tokval def _replace_booleans(tok): @@ -82,10 +101,10 @@ def _replace_booleans(tok): """ toknum, tokval = tok if toknum == tokenize.OP: - if tokval == '&': - return tokenize.NAME, 'and' - elif tokval == '|': - return tokenize.NAME, 'or' + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" return toknum, tokval return toknum, tokval @@ -110,7 +129,7 @@ def _replace_locals(tok): is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. """ toknum, tokval = tok - if toknum == tokenize.OP and tokval == '@': + if toknum == tokenize.OP and tokval == "@": return tokenize.OP, _LOCAL_TAG return toknum, tokval @@ -147,13 +166,19 @@ def _compose2(f, g): def _compose(*funcs): """Compose 2 or more callables""" - assert len(funcs) > 1, 'At least 2 callables must be passed to compose' + assert len(funcs) > 1, "At least 2 callables must be passed to compose" return reduce(_compose2, funcs) -def _preparse(source, f=_compose(_replace_locals, _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names)): +def _preparse( + source, + f=_compose( + _replace_locals, + _replace_booleans, + _rewrite_assign, + _clean_spaces_backtick_quoted_names, + ), +): """Compose a collection of tokenization functions Parameters @@ -177,7 +202,7 @@ def _preparse(source, f=_compose(_replace_locals, _replace_booleans, form ``(toknum, tokval)``, where ``toknum`` is one of the constants from the ``tokenize`` module and ``tokval`` is a string. 
""" - assert callable(f), 'f must be callable' + assert callable(f), "f must be callable" return tokenize.untokenize((f(x) for x in tokenize_string(source))) @@ -191,15 +216,17 @@ def _is_type(t): # partition all AST nodes -_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and - issubclass(x, ast.AST), - (getattr(ast, node) for node in dir(ast)))) +_all_nodes = frozenset( + filter( + lambda x: isinstance(x, type) and issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)), + ) +) def _filter_nodes(superclass, all_nodes=_all_nodes): """Filter out AST nodes that are subclasses of ``superclass``.""" - node_names = (node.__name__ for node in all_nodes - if issubclass(node, superclass)) + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) return frozenset(node_names) @@ -221,25 +248,44 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): # nodes that we don't support directly but are needed for parsing -_hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) - - -_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', - 'DictComp', 'SetComp', 'Repr', 'Lambda', - 'Set', 'AST', 'Is', 'IsNot']) +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) # these nodes are low priority or won't ever be supported (e.g., AST) -_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | - _arguments_nodes | _keyword_nodes | _alias_nodes | - _expr_context_nodes | _unsupported_expr_nodes) - - _hacked_nodes) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes # we're adding a different assignment in some cases to be equality comparison # and we don't want `stmt` and friends in their so get only the class whose # names are capitalized _base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes -_msg = 'cannot both support and not support {intersection}'.format( - intersection=_unsupported_nodes & _base_supported_nodes) +_msg = "cannot both support and not support {intersection}".format( + intersection=_unsupported_nodes & _base_supported_nodes +) assert not _unsupported_nodes & _base_supported_nodes, _msg @@ -249,8 +295,10 @@ def _node_not_implemented(node_name, cls): """ def f(self, *args, **kwargs): - raise NotImplementedError("{name!r} nodes are not " - "implemented".format(name=node_name)) + raise NotImplementedError( + "{name!r} nodes are not " "implemented".format(name=node_name) + ) + return f @@ -262,14 +310,16 @@ def disallow(nodes): ------- disallowed : callable """ + def disallowed(cls): cls.unsupported_nodes = () for node in nodes: new_method = _node_not_implemented(node, cls) - name = 'visit_{node}'.format(node=node) + name = "visit_{node}".format(node=node) cls.unsupported_nodes += (name,) setattr(cls, name, new_method) return cls + return disallowed @@ -290,25 +340,27 @@ def f(self, node, *args, **kwargs): f : callable """ return partial(op_class, op_symbol, *args, **kwargs) + return f -_op_classes = {'binary': BinOp, 'unary': UnaryOp} +_op_classes = {"binary": BinOp, "unary": UnaryOp} def add_ops(op_classes): """Decorator to add default implementation of ops.""" + def f(cls): for op_attr_name, op_class in op_classes.items(): - ops = getattr(cls, 
'{name}_ops'.format(name=op_attr_name)) - ops_map = getattr(cls, '{name}_op_nodes_map'.format( - name=op_attr_name)) + ops = getattr(cls, "{name}_ops".format(name=op_attr_name)) + ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name)) for op in ops: op_node = ops_map[op] if op_node is not None: made_op = _op_maker(op_class, op) - setattr(cls, 'visit_{node}'.format(node=op_node), made_op) + setattr(cls, "visit_{node}".format(node=op_node), made_op) return cls + return f @@ -326,24 +378,43 @@ class BaseExprVisitor(ast.NodeVisitor): parser : str preparser : callable """ + const_type = Constant # type: Type[Term] term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', - 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', - None, 'Pow', 'FloorDiv', 'Mod') + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms - unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' + unary_op_nodes = "UAdd", "USub", "Invert", "Not" unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) rewrite_map = { ast.Eq: ast.In, ast.NotEq: ast.NotIn, ast.In: ast.In, - ast.NotIn: ast.NotIn + ast.NotIn: ast.NotIn, } def __init__(self, env, engine, parser, preparser=_preparse): @@ -360,18 +431,18 @@ def visit(self, node, **kwargs): node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: from keyword import iskeyword + if any(iskeyword(x) for x in clean.split()): - e.msg = ("Python keyword not valid identifier" - " in numexpr query") + e.msg = "Python keyword not valid identifier" " in numexpr query" raise e - method = 'visit_' + node.__class__.__name__ + method = "visit_" + node.__class__.__name__ visitor = getattr(self, method) return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): if len(node.body) != 1: - raise SyntaxError('only a single expression is allowed') + raise SyntaxError("only a single expression is allowed") expr = node.body[0] return self.visit(expr, **kwargs) @@ -408,22 +479,29 @@ def _rewrite_membership_op(self, node, left, right): def _maybe_transform_eq_ne(self, node, left=None, right=None): if left is None: - left = self.visit(node.left, side='left') + left = self.visit(node.left, side="left") if right is None: - right = self.visit(node.right, side='right') - op, op_class, left, right = self._rewrite_membership_op(node, left, - right) + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) return op, op_class, left, right def _maybe_downcast_constants(self, left, right): f32 = np.dtype(np.float32) - if (left.is_scalar and hasattr(left, 'value') and - not right.is_scalar and right.return_type == f32): + if ( + left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): # right is a float32 array, left is a scalar name = self.env.add_tmp(np.float32(left.value)) left = self.term_type(name, self.env) - if (right.is_scalar and hasattr(right, 'value') and - not left.is_scalar and left.return_type == f32): + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): # left is a float32 array, right is a scalar name = self.env.add_tmp(np.float32(right.value)) right 
= self.term_type(name, self.env) @@ -437,25 +515,33 @@ def _maybe_eval(self, binop, eval_in_python): # [1,2] in a + 2 * b # in that case a + 2 * b will be evaluated using numexpr, and the "in" # call will be evaluated using isin (in python space) - return binop.evaluate(self.env, self.engine, self.parser, - self.term_type, eval_in_python) - - def _maybe_evaluate_binop(self, op, op_class, lhs, rhs, - eval_in_python=('in', 'not in'), - maybe_eval_in_python=('==', '!=', '<', '>', - '<=', '>=')): + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): res = op(lhs, rhs) if res.has_invalid_return_type: - raise TypeError("unsupported operand type(s) for {op}:" - " '{lhs}' and '{rhs}'".format(op=res.op, - lhs=lhs.type, - rhs=rhs.type)) - - if self.engine != 'pytables': - if (res.op in _cmp_ops_syms and - getattr(lhs, 'is_datetime', False) or - getattr(rhs, 'is_datetime', False)): + raise TypeError( + "unsupported operand type(s) for {op}:" + " '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type) + ) + + if self.engine != "pytables": + if ( + res.op in _cmp_ops_syms + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): # all date ops must be done in python bc numexpr doesn't work # well with NaT return self._maybe_eval(res, self.binary_ops) @@ -463,13 +549,14 @@ def _maybe_evaluate_binop(self, op, op_class, lhs, rhs, if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python return self._maybe_eval(res, eval_in_python) - elif self.engine != 'pytables': - if (getattr(lhs, 'return_type', None) == object or - getattr(rhs, 'return_type', None) == object): + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): # evaluate "==" and "!=" in python if either of our operands # has an object return type - return self._maybe_eval(res, eval_in_python + - maybe_eval_in_python) + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res def visit_BinOp(self, node, **kwargs): @@ -478,7 +565,7 @@ def visit_BinOp(self, node, **kwargs): return self._maybe_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - truediv = self.env.scope['truediv'] + truediv = self.env.scope["truediv"] return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): @@ -512,15 +599,17 @@ def visit_Index(self, node, **kwargs): def visit_Subscript(self, node, **kwargs): value = self.visit(node.value) slobj = self.visit(node.slice) - result = pd.eval(slobj, local_dict=self.env, engine=self.engine, - parser=self.parser) + result = pd.eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) try: # a Term instance v = value.value[result] except AttributeError: # an Op instance - lhs = pd.eval(value, local_dict=self.env, engine=self.engine, - parser=self.parser) + lhs = pd.eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) v = lhs[result] name = self.env.add_tmp(v) return self.term_type(name, env=self.env) @@ -551,22 +640,24 @@ def visit_Assign(self, node, **kwargs): """ if len(node.targets) != 1: - raise SyntaxError('can only assign a single expression') + raise SyntaxError("can only assign a single expression") if not isinstance(node.targets[0], ast.Name): - raise 
SyntaxError('left hand side of an assignment must be a ' - 'single name') + raise SyntaxError( + "left hand side of an assignment must be a " "single name" + ) if self.env.target is None: - raise ValueError('cannot assign without a target object') + raise ValueError("cannot assign without a target object") try: assigner = self.visit(node.targets[0], **kwargs) except UndefinedVariableError: assigner = node.targets[0].id - self.assigner = getattr(assigner, 'name', assigner) + self.assigner = getattr(assigner, "name", assigner) if self.assigner is None: - raise SyntaxError('left hand side of an assignment must be a ' - 'single resolvable name') + raise SyntaxError( + "left hand side of an assignment must be a " "single resolvable name" + ) return self.visit(node.value, **kwargs) @@ -587,8 +678,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}" - .format(name=ctx.__name__)) + raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) def visit_Call(self, node, side=None, **kwargs): @@ -608,9 +698,8 @@ def visit_Call(self, node, side=None, **kwargs): raise if res is None: - raise ValueError("Invalid function call {func}" - .format(func=node.func.id)) - if hasattr(res, 'value'): + raise ValueError("Invalid function call {func}".format(func=node.func.id)) + if hasattr(res, "value"): res = res.value if isinstance(res, FuncNode): @@ -618,8 +707,10 @@ def visit_Call(self, node, side=None, **kwargs): new_args = [self.visit(arg) for arg in node.args] if node.keywords: - raise TypeError("Function \"{name}\" does not support keyword " - "arguments".format(name=res.name)) + raise TypeError( + 'Function "{name}" does not support keyword ' + "arguments".format(name=res.name) + ) return res(*new_args, **kwargs) @@ -629,8 +720,10 @@ def visit_Call(self, node, side=None, **kwargs): for key in node.keywords: if not isinstance(key, ast.keyword): - raise ValueError("keyword error in function call " - "'{func}'".format(func=node.func.id)) + raise ValueError( + "keyword error in function call " + "'{func}'".format(func=node.func.id) + ) if key.arg: kwargs[key.arg] = self.visit(key.value).value @@ -654,8 +747,9 @@ def visit_Compare(self, node, **kwargs): left = node.left values = [] for op, comp in zip(ops, comps): - new_node = self.visit(ast.Compare(comparators=[comp], left=left, - ops=[self.translate_In(op)])) + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) left = comp values.append(new_node) return self.visit(ast.BoolOp(op=ast.And(), values=values)) @@ -670,33 +764,39 @@ def visitor(x, y): lhs = self._try_visit_binop(x) rhs = self._try_visit_binop(y) - op, op_class, lhs, rhs = self._maybe_transform_eq_ne( - node, lhs, rhs) + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) return self._maybe_evaluate_binop(op, node.op, lhs, rhs) operands = node.values return reduce(visitor, operands) -_python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn']) +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) _numexpr_supported_calls = frozenset(_reductions + _mathops) -@disallow((_unsupported_nodes | _python_not_supported) - - (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', - 'Tuple']))) +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) class PandasExprVisitor(BaseExprVisitor): - 
- def __init__(self, env, engine, parser, - preparser=partial(_preparse, f=_compose( - _replace_locals, _replace_booleans, - _clean_spaces_backtick_quoted_names))): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose( + _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names + ), + ), + ): super().__init__(env, engine, parser, preparser) -@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) class PythonExprVisitor(BaseExprVisitor): - def __init__(self, env, engine, parser, preparser=lambda x: x): super().__init__(env, engine, parser, preparser=preparser) @@ -715,19 +815,20 @@ class Expr(StringMixin): level : int, optional, default 2 """ - def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True, level=0): + def __init__( + self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0 + ): self.expr = expr self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self.env.scope['truediv'] = truediv + self.env.scope["truediv"] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property def assigner(self): - return getattr(self._visitor, 'assigner', None) + return getattr(self._visitor, "assigner", None) def __call__(self): return self.terms(self.env) @@ -750,4 +851,4 @@ def names(self): return frozenset(term.name for term in com.flatten(self.terms)) -_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} +_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index b01000a7aee5b..dc4e6e85f6e7d 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -28,8 +28,8 @@ # the set of dtypes that we will allow pass to numexpr _ALLOWED_DTYPES = { - 'evaluate': {'int64', 'int32', 'float64', 'float32', 'bool'}, - 'where': {'int64', 'float64', 'bool'} + "evaluate": {"int64", "int32", "float64", "float32", "bool"}, + "where": {"int64", "float64", "bool"}, } # the minimum prod shape that we will use numexpr @@ -65,7 +65,7 @@ def _evaluate_standard(op, op_str, a, b, **eval_kwargs): """ standard evaluation """ if _TEST_MODE: _store_test_result(False) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return op(a, b) @@ -79,7 +79,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # check for dtype compatibility dtypes = set() for o in [a, b]: - if hasattr(o, 'dtypes'): + if hasattr(o, "dtypes"): s = o.dtypes.value_counts() if len(s) > 1: return False @@ -94,11 +94,10 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, truediv=True, - reversed=False, **eval_kwargs): +def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs): result = None - if _can_use_numexpr(op, op_str, a, b, 'evaluate'): + if _can_use_numexpr(op, op_str, a, b, "evaluate"): try: # we were originally called by a reversed op @@ -108,13 +107,15 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, a_value = getattr(a, "values", a) b_value = getattr(b, "values", b) - result = ne.evaluate('a_value {op} b_value'.format(op=op_str), - local_dict={'a_value': a_value, - 'b_value': b_value}, - casting='safe', truediv=truediv, - **eval_kwargs) + result = ne.evaluate( + "a_value {op} 
b_value".format(op=op_str), + local_dict={"a_value": a_value, "b_value": b_value}, + casting="safe", + truediv=truediv, + **eval_kwargs + ) except ValueError as detail: - if 'unknown type object' in str(detail): + if "unknown type object" in str(detail): pass if _TEST_MODE: @@ -127,26 +128,33 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, def _where_standard(cond, a, b): - return np.where(com.values_from_object(cond), com.values_from_object(a), - com.values_from_object(b)) + return np.where( + com.values_from_object(cond), + com.values_from_object(a), + com.values_from_object(b), + ) def _where_numexpr(cond, a, b): result = None - if _can_use_numexpr(None, 'where', a, b, 'where'): + if _can_use_numexpr(None, "where", a, b, "where"): try: - cond_value = getattr(cond, 'values', cond) - a_value = getattr(a, 'values', a) - b_value = getattr(b, 'values', b) - result = ne.evaluate('where(cond_value, a_value, b_value)', - local_dict={'cond_value': cond_value, - 'a_value': a_value, - 'b_value': b_value}, - casting='safe') + cond_value = getattr(cond, "values", cond) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + result = ne.evaluate( + "where(cond_value, a_value, b_value)", + local_dict={ + "cond_value": cond_value, + "a_value": a_value, + "b_value": b_value, + }, + casting="safe", + ) except ValueError as detail: - if 'unknown type object' in str(detail): + if "unknown type object" in str(detail): pass except Exception as detail: raise TypeError(str(detail)) @@ -158,40 +166,44 @@ def _where_numexpr(cond, a, b): # turn myself on -set_use_numexpr(get_option('compute.use_numexpr')) +set_use_numexpr(get_option("compute.use_numexpr")) def _has_bool_dtype(x): try: if isinstance(x, ABCDataFrame): - return 'bool' in x.dtypes + return "bool" in x.dtypes else: return x.dtype == bool except AttributeError: return isinstance(x, (bool, np.bool_)) -def _bool_arith_check(op_str, a, b, not_allowed=frozenset(('/', '//', '**')), - unsupported=None): +def _bool_arith_check( + op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None +): if unsupported is None: - unsupported = {'+': '|', '*': '&', '-': '^'} + unsupported = {"+": "|", "*": "&", "-": "^"} if _has_bool_dtype(a) and _has_bool_dtype(b): if op_str in unsupported: - warnings.warn("evaluating in Python space because the {op!r} " - "operator is not supported by numexpr for " - "the bool dtype, use {alt_op!r} instead" - .format(op=op_str, alt_op=unsupported[op_str])) + warnings.warn( + "evaluating in Python space because the {op!r} " + "operator is not supported by numexpr for " + "the bool dtype, use {alt_op!r} instead".format( + op=op_str, alt_op=unsupported[op_str] + ) + ) return False if op_str in not_allowed: - raise NotImplementedError("operator {op!r} not implemented for " - "bool dtypes".format(op=op_str)) + raise NotImplementedError( + "operator {op!r} not implemented for " "bool dtypes".format(op=op_str) + ) return True -def evaluate(op, op_str, a, b, use_numexpr=True, - **eval_kwargs): +def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index fd96739f4da76..9e6928372808e 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -19,19 +19,36 @@ from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -_reductions = 'sum', 'prod' - -_unary_math_ops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 
- 'sqrt', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', - 'arctan', 'arccosh', 'arcsinh', 'arctanh', 'abs', 'log10', - 'floor', 'ceil' - ) -_binary_math_ops = ('arctan2',) +_reductions = "sum", "prod" + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) _mathops = _unary_math_ops + _binary_math_ops -_LOCAL_TAG = '__pd_eval_local_' +_LOCAL_TAG = "__pd_eval_local_" class UndefinedVariableError(NameError): @@ -40,14 +57,13 @@ class UndefinedVariableError(NameError): def __init__(self, name, is_local): if is_local: - msg = 'local variable {0!r} is not defined' + msg = "local variable {0!r} is not defined" else: - msg = 'name {0!r} is not defined' + msg = "name {0!r} is not defined" super().__init__(msg.format(name)) class Term(StringMixin): - def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls supr_new = super(Term, klass).__new__ @@ -58,14 +74,13 @@ def __init__(self, name, env, side=None, encoding=None): self.env = env self.side = side tname = str(name) - self.is_local = (tname.startswith(_LOCAL_TAG) or - tname in _DEFAULT_GLOBALS) + self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self): - return self.name.replace(_LOCAL_TAG, '') + return self.name.replace(_LOCAL_TAG, "") def __str__(self): return pprint_thing(self.name) @@ -80,9 +95,10 @@ def _resolve_name(self): res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) - if hasattr(res, 'ndim') and res.ndim > 2: - raise NotImplementedError("N-dimensional objects, where N > 2," - " are not supported with eval") + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2," " are not supported with eval" + ) return res def update(self, value): @@ -124,9 +140,10 @@ def type(self): @property def raw(self): - return pprint_thing('{0}(name={1!r}, type={2})' - ''.format(self.__class__.__name__, self.name, - self.type)) + return pprint_thing( + "{0}(name={1!r}, type={2})" + "".format(self.__class__.__name__, self.name, self.type) + ) @property def is_datetime(self): @@ -155,7 +172,6 @@ def ndim(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): super().__init__(value, env, side=side, encoding=encoding) @@ -172,7 +188,7 @@ def __str__(self): return repr(self.name) -_bool_op_map = {'not': '~', 'and': '&', 'or': '|'} +_bool_op_map = {"not": "~", "and": "&", "or": "|"} class Op(StringMixin): @@ -183,7 +199,7 @@ class Op(StringMixin): def __init__(self, op, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands - self.encoding = kwargs.get('encoding', None) + self.encoding = kwargs.get("encoding", None) def __iter__(self): return iter(self.operands) @@ -192,9 +208,8 @@ def __str__(self): """Print a generic n-ary operator and its operands using infix notation""" # recurse over the operands - parened = ('({0})'.format(pprint_thing(opr)) - for opr in self.operands) - return pprint_thing(' {0} '.format(self.op).join(parened)) + parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands) + return pprint_thing(" {0} ".format(self.op).join(parened)) @property def return_type(self): @@ -206,7 +221,7 @@ def return_type(self): 
@property def has_invalid_return_type(self): types = self.operand_types - obj_dtype_set = frozenset([np.dtype('object')]) + obj_dtype_set = frozenset([np.dtype("object")]) return self.return_type == object and types - obj_dtype_set @property @@ -257,23 +272,23 @@ def _not_in(x, y): return x not in y -_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' +_cmp_ops_syms = ">", "<", ">=", "<=", "==", "!=", "in", "not in" _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) -_bool_ops_syms = '&', '|', 'and', 'or' +_bool_ops_syms = "&", "|", "and", "or" _bool_ops_funcs = op.and_, op.or_, op.and_, op.or_ _bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) -_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' -_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, - op.mod) +_arith_ops_syms = "+", "-", "*", "/", "**", "//", "%" +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, op.mod) _arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) -_special_case_arith_ops_syms = '**', '//', '%' +_special_case_arith_ops_syms = "**", "//", "%" _special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod -_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms, - _special_case_arith_ops_funcs)) +_special_case_arith_ops_dict = dict( + zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) +) _binary_ops_dict = {} @@ -337,8 +352,10 @@ def __init__(self, op, lhs, rhs, **kwargs): except KeyError: # has to be made a list for python3 keys = list(_binary_ops_dict.keys()) - raise ValueError('Invalid binary operator {0!r}, valid' - ' operators are {1}'.format(op, keys)) + raise ValueError( + "Invalid binary operator {0!r}, valid" + " operators are {1}".format(op, keys) + ) def __call__(self, env): """Recursively evaluate an expression in Python space. @@ -353,7 +370,7 @@ def __call__(self, env): The result of an evaluated expression. """ # handle truediv - if self.op == '/' and env.scope['truediv']: + if self.op == "/" and env.scope["truediv"]: self.func = op.truediv # recurse over the left/right nodes @@ -378,24 +395,32 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python): term_type The "pre-evaluated" expression as an instance of ``term_type`` """ - if engine == 'python': + if engine == "python": res = self(env) else: # recurse over the left/right nodes - left = self.lhs.evaluate(env, engine=engine, parser=parser, - term_type=term_type, - eval_in_python=eval_in_python) - right = self.rhs.evaluate(env, engine=engine, parser=parser, - term_type=term_type, - eval_in_python=eval_in_python) + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) # base cases if self.op in eval_in_python: res = self.func(left.value, right.value) else: from pandas.core.computation.eval import eval - res = eval(self, local_dict=env, engine=engine, - parser=parser) + + res = eval(self, local_dict=env, engine=engine, parser=parser) name = env.add_tmp(res) return term_type(name, env=env) @@ -403,10 +428,10 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python): def convert_values(self): """Convert datetimes to a comparable value in an expression. 
""" + def stringify(value): if self.encoding is not None: - encoder = partial(pprint_thing_encoded, - encoding=self.encoding) + encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) @@ -419,7 +444,7 @@ def stringify(value): v = stringify(v) v = Timestamp(_ensure_decoded(v)) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") self.rhs.update(v) if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: @@ -428,14 +453,20 @@ def stringify(value): v = stringify(v) v = Timestamp(_ensure_decoded(v)) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): - if ((self.lhs.is_scalar or self.rhs.is_scalar) and - self.op in _bool_ops_dict and - (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and - issubclass(self.lhs.return_type, (bool, np.bool_))))): + if ( + (self.lhs.is_scalar or self.rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(self.rhs.return_type, (bool, np.bool_)) + and issubclass(self.lhs.return_type, (bool, np.bool_)) + ) + ) + ): raise NotImplementedError("cannot evaluate scalar only bool ops") @@ -457,20 +488,20 @@ class Div(BinOp): """ def __init__(self, lhs, rhs, truediv, *args, **kwargs): - super().__init__('/', lhs, rhs, *args, **kwargs) + super().__init__("/", lhs, rhs, *args, **kwargs) if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): - raise TypeError("unsupported operand type(s) for {0}:" - " '{1}' and '{2}'".format(self.op, - lhs.return_type, - rhs.return_type)) + raise TypeError( + "unsupported operand type(s) for {0}:" + " '{1}' and '{2}'".format(self.op, lhs.return_type, rhs.return_type) + ) # do not upcast float32s to float64 un-necessarily acceptable_dtypes = [np.float32, np.float_] _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) -_unary_ops_syms = '+', '-', '~', 'not' +_unary_ops_syms = "+", "-", "~", "not" _unary_ops_funcs = op.pos, op.neg, op.invert, op.invert _unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) @@ -499,54 +530,55 @@ def __init__(self, op, operand): try: self.func = _unary_ops_dict[op] except KeyError: - raise ValueError('Invalid unary operator {0!r}, valid operators ' - 'are {1}'.format(op, _unary_ops_syms)) + raise ValueError( + "Invalid unary operator {0!r}, valid operators " + "are {1}".format(op, _unary_ops_syms) + ) def __call__(self, env): operand = self.operand(env) return self.func(operand) def __str__(self): - return pprint_thing('{0}({1})'.format(self.op, self.operand)) + return pprint_thing("{0}({1})".format(self.op, self.operand)) @property def return_type(self): operand = self.operand - if operand.return_type == np.dtype('bool'): - return np.dtype('bool') - if (isinstance(operand, Op) and - (operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict)): - return np.dtype('bool') - return np.dtype('int') + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") class MathCall(Op): - def __init__(self, func, args): super().__init__(func.name, args) self.func = func def __call__(self, env): operands = [op(env) for op in self.operands] - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return self.func.func(*operands) def __str__(self): operands = map(str, self.operands) - return 
pprint_thing('{0}({1})'.format(self.op, ','.join(operands))) + return pprint_thing("{0}({1})".format(self.op, ",".join(operands))) class FuncNode: def __init__(self, name): - from pandas.core.computation.check import (_NUMEXPR_INSTALLED, - _NUMEXPR_VERSION) + from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + if name not in _mathops or ( - _NUMEXPR_INSTALLED and - _NUMEXPR_VERSION < LooseVersion('2.6.9') and - name in ('floor', 'ceil') + _NUMEXPR_INSTALLED + and _NUMEXPR_VERSION < LooseVersion("2.6.9") + and name in ("floor", "ceil") ): - raise ValueError( - "\"{0}\" is not a supported function".format(name)) + raise ValueError('"{0}" is not a supported function'.format(name)) self.name = name self.func = getattr(np, name) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 25cfa8fe17697..e4e005c024345 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -22,18 +22,14 @@ class Scope(expr.Scope): - __slots__ = 'queryables', + __slots__ = ("queryables",) - def __init__(self, level, global_dict=None, local_dict=None, - queryables=None): - super().__init__(level + 1, - global_dict=global_dict, - local_dict=local_dict) + def __init__(self, level, global_dict=None, local_dict=None, queryables=None): + super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) self.queryables = queryables or dict() class Term(ops.Term): - def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls supr_new = StringMixin.__new__ @@ -44,10 +40,9 @@ def __init__(self, name, env, side=None, encoding=None): def _resolve_name(self): # must be a queryables - if self.side == 'left': + if self.side == "left": if self.name not in self.env.queryables: - raise NameError('name {name!r} is not defined' - .format(name=self.name)) + raise NameError("name {name!r} is not defined".format(name=self.name)) return self.name # resolve the rhs (and allow it to be None) @@ -63,7 +58,6 @@ def value(self): class Constant(Term): - def __init__(self, value, env, side=None, encoding=None): super().__init__(value, env, side=side, encoding=encoding) @@ -86,7 +80,6 @@ def _disallow_scalar_only_bool_ops(self): pass def prune(self, klass): - def pr(left, right): """ create and return a new specialized BinOp from myself """ @@ -97,8 +90,9 @@ def pr(left, right): k = klass if isinstance(left, ConditionBinOp): - if (isinstance(left, ConditionBinOp) and - isinstance(right, ConditionBinOp)): + if isinstance(left, ConditionBinOp) and isinstance( + right, ConditionBinOp + ): k = JointConditionBinOp elif isinstance(left, k): return left @@ -106,16 +100,16 @@ def pr(left, right): return right elif isinstance(left, FilterBinOp): - if (isinstance(left, FilterBinOp) and - isinstance(right, FilterBinOp)): + if isinstance(left, FilterBinOp) and isinstance(right, FilterBinOp): k = JointFilterBinOp elif isinstance(left, k): return left elif isinstance(right, k): return right - return k(self.op, left, right, queryables=self.queryables, - encoding=self.encoding).evaluate() + return k( + self.op, left, right, queryables=self.queryables, encoding=self.encoding + ).evaluate() left, right = self.lhs, self.rhs @@ -152,17 +146,17 @@ def is_in_table(self): @property def kind(self): """ the kind of my field """ - return getattr(self.queryables.get(self.lhs), 'kind', None) + return getattr(self.queryables.get(self.lhs), "kind", None) @property def meta(self): """ the meta of my field """ - return 
getattr(self.queryables.get(self.lhs), 'meta', None) + return getattr(self.queryables.get(self.lhs), "meta", None) @property def metadata(self): """ the metadata of my field """ - return getattr(self.queryables.get(self.lhs), 'metadata', None) + return getattr(self.queryables.get(self.lhs), "metadata", None) def generate(self, v): """ create and return the op string for this TermValue """ @@ -175,64 +169,74 @@ def convert_value(self, v): def stringify(value): if self.encoding is not None: - encoder = partial(pprint_thing_encoded, - encoding=self.encoding) + encoder = partial(pprint_thing_encoded, encoding=self.encoding) else: encoder = pprint_thing return encoder(value) kind = _ensure_decoded(self.kind) meta = _ensure_decoded(self.meta) - if kind == 'datetime64' or kind == 'datetime': + if kind == "datetime64" or kind == "datetime": if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) v = Timestamp(v) if v.tz is not None: - v = v.tz_convert('UTC') + v = v.tz_convert("UTC") return TermValue(v, v.value, kind) - elif kind == 'timedelta64' or kind == 'timedelta': - v = Timedelta(v, unit='s').value + elif kind == "timedelta64" or kind == "timedelta": + v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) - elif meta == 'category': + elif meta == "category": metadata = com.values_from_object(self.metadata) - result = metadata.searchsorted(v, side='left') + result = metadata.searchsorted(v, side="left") # result returns 0 if v is first element or if v is not in metadata # check that metadata contains v if not result and v not in metadata: result = -1 - return TermValue(result, result, 'integer') - elif kind == 'integer': + return TermValue(result, result, "integer") + elif kind == "integer": v = int(float(v)) return TermValue(v, v, kind) - elif kind == 'float': + elif kind == "float": v = float(v) return TermValue(v, v, kind) - elif kind == 'bool': + elif kind == "bool": if isinstance(v, str): - v = not v.strip().lower() in ['false', 'f', 'no', - 'n', 'none', '0', - '[]', '{}', ''] + v = not v.strip().lower() in [ + "false", + "f", + "no", + "n", + "none", + "0", + "[]", + "{}", + "", + ] else: v = bool(v) return TermValue(v, v, kind) elif isinstance(v, str): # string quoting - return TermValue(v, stringify(v), 'string') + return TermValue(v, stringify(v), "string") else: - raise TypeError("Cannot compare {v} of type {typ} to {kind} column" - .format(v=v, typ=type(v), kind=kind)) + raise TypeError( + "Cannot compare {v} of type {typ} to {kind} column".format( + v=v, typ=type(v), kind=kind + ) + ) def convert_values(self): pass class FilterBinOp(BinOp): - def __str__(self): - return pprint_thing("[Filter : [{lhs}] -> [{op}]" - .format(lhs=self.filter[0], op=self.filter[1])) + return pprint_thing( + "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1]) + ) def invert(self): """ invert the filter """ @@ -249,8 +253,7 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]" - .format(slf=self)) + raise ValueError("query term is not valid [{slf}]".format(slf=self)) rhs = self.conform(self.rhs) values = [TermValue(v, v, self.kind).value for v in rhs] @@ -258,41 +261,36 @@ def evaluate(self): if self.is_in_table: # if too many values to create the expression, use a filter instead - if self.op in ['==', '!='] and len(values) > self._max_selectors: + if self.op in ["==", "!="] and len(values) > self._max_selectors: filter_op = self.generate_filter_op() - self.filter = ( - self.lhs, - 
filter_op, - pd.Index(values)) + self.filter = (self.lhs, filter_op, pd.Index(values)) return self return None # equality conditions - if self.op in ['==', '!=']: + if self.op in ["==", "!="]: filter_op = self.generate_filter_op() - self.filter = ( - self.lhs, - filter_op, - pd.Index(values)) + self.filter = (self.lhs, filter_op, pd.Index(values)) else: - raise TypeError("passing a filterable condition to a non-table " - "indexer [{slf}]".format(slf=self)) + raise TypeError( + "passing a filterable condition to a non-table " + "indexer [{slf}]".format(slf=self) + ) return self def generate_filter_op(self, invert=False): - if (self.op == '!=' and not invert) or (self.op == '==' and invert): + if (self.op == "!=" and not invert) or (self.op == "==" and invert): return lambda axis, vals: ~axis.isin(vals) else: return lambda axis, vals: axis.isin(vals) class JointFilterBinOp(FilterBinOp): - def format(self): raise NotImplementedError("unable to collapse Joint Filters") @@ -301,18 +299,17 @@ def evaluate(self): class ConditionBinOp(BinOp): - def __str__(self): - return pprint_thing("[Condition : [{cond}]]" - .format(cond=self.condition)) + return pprint_thing("[Condition : [{cond}]]".format(cond=self.condition)) def invert(self): """ invert the condition """ # if self.condition is not None: # self.condition = "~(%s)" % self.condition # return self - raise NotImplementedError("cannot use an invert condition when " - "passing to numexpr") + raise NotImplementedError( + "cannot use an invert condition when " "passing to numexpr" + ) def format(self): """ return the actual ne format """ @@ -321,8 +318,7 @@ def format(self): def evaluate(self): if not self.is_valid: - raise ValueError("query term is not valid [{slf}]" - .format(slf=self)) + raise ValueError("query term is not valid [{slf}]".format(slf=self)) # convert values if we are in the table if not self.is_in_table: @@ -332,12 +328,12 @@ def evaluate(self): values = [self.convert_value(v) for v in rhs] # equality conditions - if self.op in ['==', '!=']: + if self.op in ["==", "!="]: # too many values to create the expression? 
if len(values) <= self._max_selectors: vs = [self.generate(v) for v in values] - self.condition = "({cond})".format(cond=' | '.join(vs)) + self.condition = "({cond})".format(cond=" | ".join(vs)) # use a filter after reading else: @@ -349,19 +345,17 @@ def evaluate(self): class JointConditionBinOp(ConditionBinOp): - def evaluate(self): - self.condition = "({lhs} {op} {rhs})".format(lhs=self.lhs.condition, - op=self.op, - rhs=self.rhs.condition) + self.condition = "({lhs} {op} {rhs})".format( + lhs=self.lhs.condition, op=self.op, rhs=self.rhs.condition + ) return self class UnaryOp(ops.UnaryOp): - def prune(self, klass): - if self.op != '~': + if self.op != "~": raise NotImplementedError("UnaryOp only support invert type ops") operand = self.operand @@ -378,7 +372,7 @@ def prune(self, klass): return None -_op_classes = {'unary': UnaryOp} +_op_classes = {"unary": UnaryOp} class ExprVisitor(BaseExprVisitor): @@ -389,24 +383,27 @@ def __init__(self, env, engine, parser, **kwargs): super().__init__(env, engine, parser) for bin_op in self.binary_ops: bin_node = self.binary_op_nodes_map[bin_op] - setattr(self, 'visit_{node}'.format(node=bin_node), - lambda node, bin_op=bin_op: partial(BinOp, bin_op, - **kwargs)) + setattr( + self, + "visit_{node}".format(node=bin_node), + lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs), + ) def visit_UnaryOp(self, node, **kwargs): if isinstance(node.op, (ast.Not, ast.Invert)): - return UnaryOp('~', self.visit(node.operand)) + return UnaryOp("~", self.visit(node.operand)) elif isinstance(node.op, ast.USub): return self.const_type(-self.visit(node.operand).value, self.env) elif isinstance(node.op, ast.UAdd): - raise NotImplementedError('Unary addition not supported') + raise NotImplementedError("Unary addition not supported") def visit_Index(self, node, **kwargs): return self.visit(node.value).value def visit_Assign(self, node, **kwargs): - cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], - comparators=[node.value]) + cmpr = ast.Compare( + ops=[ast.Eq()], left=node.targets[0], comparators=[node.value] + ) return self.visit(cmpr) def visit_Subscript(self, node, **kwargs): @@ -422,8 +419,10 @@ def visit_Subscript(self, node, **kwargs): try: return self.const_type(value[slobj], self.env) except TypeError: - raise ValueError("cannot subscript {value!r} with " - "{slobj!r}".format(value=value, slobj=slobj)) + raise ValueError( + "cannot subscript {value!r} with " + "{slobj!r}".format(value=value, slobj=slobj) + ) def visit_Attribute(self, node, **kwargs): attr = node.attr @@ -448,8 +447,7 @@ def visit_Attribute(self, node, **kwargs): if isinstance(value, ast.Name) and value.id == attr: return resolved - raise ValueError("Invalid Attribute context {name}" - .format(name=ctx.__name__)) + raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__)) def translate_In(self, op): return ast.Eq() if isinstance(op, ast.In) else op @@ -478,8 +476,9 @@ def _validate_where(w): """ if not (isinstance(w, (Expr, str)) or is_list_like(w)): - raise TypeError("where must be passed as a string, Expr, " - "or list-like of Exprs") + raise TypeError( + "where must be passed as a string, Expr, " "or list-like of Exprs" + ) return w @@ -537,16 +536,20 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0): else: w = _validate_where(w) where[idx] = w - where = ' & '.join(map('({})'.format, com.flatten(where))) # noqa + where = " & ".join(map("({})".format, com.flatten(where))) # noqa self.expr = where self.env = Scope(scope_level + 1, 
local_dict=local_dict) if queryables is not None and isinstance(self.expr, str): self.env.queryables.update(queryables) - self._visitor = ExprVisitor(self.env, queryables=queryables, - parser='pytables', engine='pytables', - encoding=encoding) + self._visitor = ExprVisitor( + self.env, + queryables=queryables, + parser="pytables", + engine="pytables", + encoding=encoding, + ) self.terms = self.parse() def __str__(self): @@ -560,15 +563,17 @@ def evaluate(self): try: self.condition = self.terms.prune(ConditionBinOp) except AttributeError: - raise ValueError("cannot process expression [{expr}], [{slf}] " - "is not a valid condition".format(expr=self.expr, - slf=self)) + raise ValueError( + "cannot process expression [{expr}], [{slf}] " + "is not a valid condition".format(expr=self.expr, slf=self) + ) try: self.filter = self.terms.prune(FilterBinOp) except AttributeError: - raise ValueError("cannot process expression [{expr}], [{slf}] " - "is not a valid filter".format(expr=self.expr, - slf=self)) + raise ValueError( + "cannot process expression [{expr}], [{slf}] " + "is not a valid filter".format(expr=self.expr, slf=self) + ) return self.condition, self.filter @@ -585,11 +590,11 @@ def __init__(self, value, converted, kind): def tostring(self, encoding): """ quote the string if not encoded else encode and return """ - if self.kind == 'string': + if self.kind == "string": if encoding is not None: return self.converted return '"{converted}"'.format(converted=self.converted) - elif self.kind == 'float': + elif self.kind == "float": # python 2 str(float) is not always # round-trippable so use repr() return repr(self.converted) @@ -600,7 +605,7 @@ def maybe_expression(s): """ loose checking if s is a pytables-acceptable expression """ if not isinstance(s, str): return False - ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',) + ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",) # make sure we have an op at least return any(op in s for op in ops) diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 729acdc52e24a..4d5a523337f66 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -19,11 +19,17 @@ import pandas.core.computation as compu -def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(), - target=None, **kwargs): +def _ensure_scope( + level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs +): """Ensure that we are grabbing the correct scope.""" - return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, target=target) + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) def _replacer(x): @@ -44,19 +50,19 @@ def _replacer(x): def _raw_hex_id(obj): """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns - packed = struct.pack('@P', id(obj)) - return ''.join(map(_replacer, packed)) + packed = struct.pack("@P", id(obj)) + return "".join(map(_replacer, packed)) _DEFAULT_GLOBALS = { - 'Timestamp': Timestamp, - 'datetime': datetime.datetime, - 'True': True, - 'False': False, - 'list': list, - 'tuple': tuple, - 'inf': np.inf, - 'Inf': np.inf, + "Timestamp": Timestamp, + "datetime": datetime.datetime, + "True": True, + "False": False, + "list": list, + "tuple": tuple, + "inf": np.inf, + "Inf": np.inf, } @@ -98,10 +104,12 @@ class Scope(StringMixin): target : object temps : dict """ - __slots__ = 'level', 
'scope', 'target', 'temps' - def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), - target=None): + __slots__ = "level", "scope", "target", "temps" + + def __init__( + self, level, global_dict=None, local_dict=None, resolvers=(), target=None + ): self.level = level + 1 # shallow copy because we don't want to keep filling this up with what @@ -121,11 +129,9 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - self.scope = self.scope.new_child((global_dict or - frame.f_globals).copy()) + self.scope = self.scope.new_child((global_dict or frame.f_globals).copy()) if not isinstance(local_dict, Scope): - self.scope = self.scope.new_child((local_dict or - frame.f_locals).copy()) + self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) finally: del frame @@ -138,10 +144,10 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), def __str__(self): scope_keys = _get_pretty_string(list(self.scope.keys())) res_keys = _get_pretty_string(list(self.resolvers.keys())) - unicode_str = '{name}(scope={scope_keys}, resolvers={res_keys})' - return unicode_str.format(name=type(self).__name__, - scope_keys=scope_keys, - res_keys=res_keys) + unicode_str = "{name}(scope={scope_keys}, resolvers={res_keys})" + return unicode_str.format( + name=type(self).__name__, scope_keys=scope_keys, res_keys=res_keys + ) @property def has_resolvers(self): @@ -232,7 +238,7 @@ def _get_vars(self, stack, scopes): variables = itertools.product(scopes, stack) for scope, (frame, _, _, _, _, _) in variables: try: - d = getattr(frame, 'f_' + scope) + d = getattr(frame, "f_" + scope) self.scope = self.scope.new_child(d) finally: # won't remove it, but DECREF it @@ -255,7 +261,7 @@ def update(self, level): stack = inspect.stack() try: - self._get_vars(stack[:sl], scopes=['locals']) + self._get_vars(stack[:sl], scopes=["locals"]) finally: del stack[:], stack @@ -272,9 +278,9 @@ def add_tmp(self, value): name : basestring The name of the temporary variable created. 
""" - name = '{name}_{num}_{hex_id}'.format(name=type(value).__name__, - num=self.ntemps, - hex_id=_raw_hex_id(self)) + name = "{name}_{num}_{hex_id}".format( + name=type(value).__name__, num=self.ntemps, hex_id=_raw_hex_id(self) + ) # add to inner most scope assert name not in self.temps diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 856d5076f3755..be6086dd360f2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -13,8 +13,13 @@ import pandas._config.config as cf from pandas._config.config import ( - is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory, - is_text) + is_bool, + is_callable, + is_instance_factory, + is_int, + is_one_of_factory, + is_text, +) # compute @@ -28,6 +33,7 @@ def use_bottleneck_cb(key): from pandas.core import nanops + nanops.set_use_bottleneck(cf.get_option(key)) @@ -41,14 +47,21 @@ def use_bottleneck_cb(key): def use_numexpr_cb(key): from pandas.core.computation import expressions + expressions.set_use_numexpr(cf.get_option(key)) -with cf.config_prefix('compute'): - cf.register_option('use_bottleneck', True, use_bottleneck_doc, - validator=is_bool, cb=use_bottleneck_cb) - cf.register_option('use_numexpr', True, use_numexpr_doc, - validator=is_bool, cb=use_numexpr_cb) +with cf.config_prefix("compute"): + cf.register_option( + "use_bottleneck", + True, + use_bottleneck_doc, + validator=is_bool, + cb=use_bottleneck_cb, + ) + cf.register_option( + "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb + ) # # options from the "display" namespace @@ -284,6 +297,7 @@ def use_numexpr_cb(key): def table_schema_cb(key): from pandas.io.formats.printing import _enable_data_resource_formatter + _enable_data_resource_formatter(cf.get_option(key)) @@ -298,84 +312,117 @@ def is_terminal(): except NameError: # assume standard Python interpreter in a terminal return True else: - if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel + if hasattr(ip, "kernel"): # IPython as a Jupyter kernel return False else: # IPython in a terminal return True -with cf.config_prefix('display'): - cf.register_option('precision', 6, pc_precision_doc, validator=is_int) - cf.register_option('float_format', None, float_format_doc, - validator=is_one_of_factory([None, is_callable])) - cf.register_option('column_space', 12, validator=is_int) - cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc, - validator=is_instance_factory((int, type(None)))) - cf.register_option('max_rows', 60, pc_max_rows_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('min_rows', 10, pc_min_rows_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('max_categories', 8, pc_max_categories_doc, - validator=is_int) - cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int) +with cf.config_prefix("display"): + cf.register_option("precision", 6, pc_precision_doc, validator=is_int) + cf.register_option( + "float_format", + None, + float_format_doc, + validator=is_one_of_factory([None, is_callable]), + ) + cf.register_option("column_space", 12, validator=is_int) + cf.register_option( + "max_info_rows", + 1690785, + pc_max_info_rows_doc, + validator=is_instance_factory((int, type(None))), + ) + cf.register_option( + "max_rows", + 60, + pc_max_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option( + "min_rows", + 10, + pc_min_rows_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option("max_categories", 8, 
pc_max_categories_doc, validator=is_int) + cf.register_option("max_colwidth", 50, max_colwidth_doc, validator=is_int) if is_terminal(): max_cols = 0 # automatically determine optimal number of columns else: max_cols = 20 # cannot determine optimal number of columns - cf.register_option('max_columns', max_cols, pc_max_cols_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('large_repr', 'truncate', pc_large_repr_doc, - validator=is_one_of_factory(['truncate', 'info'])) - cf.register_option('max_info_columns', 100, pc_max_info_cols_doc, - validator=is_int) - cf.register_option('colheader_justify', 'right', colheader_justify_doc, - validator=is_text) - cf.register_option('notebook_repr_html', True, pc_nb_repr_h_doc, - validator=is_bool) - cf.register_option('pprint_nest_depth', 3, pc_pprint_nest_depth, - validator=is_int) - cf.register_option('multi_sparse', True, pc_multi_sparse_doc, - validator=is_bool) - cf.register_option('expand_frame_repr', True, pc_expand_repr_doc) - cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc, - validator=is_one_of_factory([True, False, 'truncate'])) - cf.register_option('chop_threshold', None, pc_chop_threshold_doc) - cf.register_option('max_seq_items', 100, pc_max_seq_items) - cf.register_option('width', 80, pc_width_doc, - validator=is_instance_factory([type(None), int])) - cf.register_option('memory_usage', True, pc_memory_usage_doc, - validator=is_one_of_factory([None, True, - False, 'deep'])) - cf.register_option('unicode.east_asian_width', False, - pc_east_asian_width_doc, validator=is_bool) - cf.register_option('unicode.ambiguous_as_wide', False, - pc_east_asian_width_doc, validator=is_bool) - cf.register_option('latex.repr', False, - pc_latex_repr_doc, validator=is_bool) - cf.register_option('latex.escape', True, pc_latex_escape, - validator=is_bool) - cf.register_option('latex.longtable', False, pc_latex_longtable, - validator=is_bool) - cf.register_option('latex.multicolumn', True, pc_latex_multicolumn, - validator=is_bool) - cf.register_option('latex.multicolumn_format', 'l', pc_latex_multicolumn, - validator=is_text) - cf.register_option('latex.multirow', False, pc_latex_multirow, - validator=is_bool) - cf.register_option('html.table_schema', False, pc_table_schema_doc, - validator=is_bool, cb=table_schema_cb) - cf.register_option('html.border', 1, pc_html_border_doc, - validator=is_int) - cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc, - validator=is_bool) + cf.register_option( + "max_columns", + max_cols, + pc_max_cols_doc, + validator=is_instance_factory([type(None), int]), + ) + cf.register_option( + "large_repr", + "truncate", + pc_large_repr_doc, + validator=is_one_of_factory(["truncate", "info"]), + ) + cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int) + cf.register_option( + "colheader_justify", "right", colheader_justify_doc, validator=is_text + ) + cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool) + cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int) + cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool) + cf.register_option("expand_frame_repr", True, pc_expand_repr_doc) + cf.register_option( + "show_dimensions", + "truncate", + pc_show_dimensions_doc, + validator=is_one_of_factory([True, False, "truncate"]), + ) + cf.register_option("chop_threshold", None, pc_chop_threshold_doc) + cf.register_option("max_seq_items", 100, pc_max_seq_items) + 
cf.register_option( + "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int]) + ) + cf.register_option( + "memory_usage", + True, + pc_memory_usage_doc, + validator=is_one_of_factory([None, True, False, "deep"]), + ) + cf.register_option( + "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option( + "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool + ) + cf.register_option("latex.repr", False, pc_latex_repr_doc, validator=is_bool) + cf.register_option("latex.escape", True, pc_latex_escape, validator=is_bool) + cf.register_option("latex.longtable", False, pc_latex_longtable, validator=is_bool) + cf.register_option( + "latex.multicolumn", True, pc_latex_multicolumn, validator=is_bool + ) + cf.register_option( + "latex.multicolumn_format", "l", pc_latex_multicolumn, validator=is_text + ) + cf.register_option("latex.multirow", False, pc_latex_multirow, validator=is_bool) + cf.register_option( + "html.table_schema", + False, + pc_table_schema_doc, + validator=is_bool, + cb=table_schema_cb, + ) + cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int) + cf.register_option( + "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool + ) tc_sim_interactive_doc = """ : boolean Whether to simulate interactive mode for purposes of testing """ -with cf.config_prefix('mode'): - cf.register_option('sim_interactive', False, tc_sim_interactive_doc) +with cf.config_prefix("mode"): + cf.register_option("sim_interactive", False, tc_sim_interactive_doc) use_inf_as_null_doc = """ : boolean @@ -396,17 +443,19 @@ def is_terminal(): def use_inf_as_na_cb(key): from pandas.core.dtypes.missing import _use_inf_as_na + _use_inf_as_na(key) -with cf.config_prefix('mode'): - cf.register_option('use_inf_as_na', False, use_inf_as_na_doc, - cb=use_inf_as_na_cb) - cf.register_option('use_inf_as_null', False, use_inf_as_null_doc, - cb=use_inf_as_na_cb) +with cf.config_prefix("mode"): + cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb) + cf.register_option( + "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb + ) -cf.deprecate_option('mode.use_inf_as_null', msg=use_inf_as_null_doc, - rkey='mode.use_inf_as_na') +cf.deprecate_option( + "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na" +) # user warnings @@ -416,9 +465,13 @@ def use_inf_as_na_cb(key): The default is warn """ -with cf.config_prefix('mode'): - cf.register_option('chained_assignment', 'warn', chained_assignment, - validator=is_one_of_factory([None, 'warn', 'raise'])) +with cf.config_prefix("mode"): + cf.register_option( + "chained_assignment", + "warn", + chained_assignment, + validator=is_one_of_factory([None, "warn", "raise"]), + ) # Set up the io.excel specific reader configuration. @@ -428,41 +481,45 @@ def use_inf_as_na_cb(key): auto, {others}. 
""" -_xls_options = ['xlrd'] -_xlsm_options = ['xlrd', 'openpyxl'] -_xlsx_options = ['xlrd', 'openpyxl'] -_ods_options = ['odf'] +_xls_options = ["xlrd"] +_xlsm_options = ["xlrd", "openpyxl"] +_xlsx_options = ["xlrd", "openpyxl"] +_ods_options = ["odf"] with cf.config_prefix("io.excel.xls"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xls', - others=', '.join(_xls_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsm"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xlsm', - others=', '.join(_xlsm_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsx"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='xlsx', - others=', '.join(_xlsx_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) with cf.config_prefix("io.excel.ods"): - cf.register_option("reader", "auto", - reader_engine_doc.format( - ext='ods', - others=', '.join(_ods_options)), - validator=str) + cf.register_option( + "reader", + "auto", + reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) # Set up the io.excel specific writer configuration. @@ -472,32 +529,35 @@ def use_inf_as_na_cb(key): auto, {others}. """ -_xls_options = ['xlwt'] -_xlsm_options = ['openpyxl'] -_xlsx_options = ['openpyxl', 'xlsxwriter'] +_xls_options = ["xlwt"] +_xlsm_options = ["openpyxl"] +_xlsx_options = ["openpyxl", "xlsxwriter"] with cf.config_prefix("io.excel.xls"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xls', - others=', '.join(_xls_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsm"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xlsm', - others=', '.join(_xlsm_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)), + validator=str, + ) with cf.config_prefix("io.excel.xlsx"): - cf.register_option("writer", "auto", - writer_engine_doc.format( - ext='xlsx', - others=', '.join(_xlsx_options)), - validator=str) + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)), + validator=str, + ) # Set up the io.parquet specific configuration. 
@@ -507,10 +567,13 @@ def use_inf_as_na_cb(key): 'auto', 'pyarrow', 'fastparquet', the default is 'auto' """ -with cf.config_prefix('io.parquet'): +with cf.config_prefix("io.parquet"): cf.register_option( - 'engine', 'auto', parquet_engine_doc, - validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet'])) + "engine", + "auto", + parquet_engine_doc, + validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]), + ) # -------- # Plotting @@ -526,28 +589,35 @@ def use_inf_as_na_cb(key): def register_plotting_backend_cb(key): backend_str = cf.get_option(key) - if backend_str == 'matplotlib': + if backend_str == "matplotlib": try: import pandas.plotting._matplotlib # noqa except ImportError: - raise ImportError('matplotlib is required for plotting when the ' - 'default backend "matplotlib" is selected.') + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' + ) else: return try: importlib.import_module(backend_str) except ImportError: - raise ValueError('"{}" does not seem to be an installed module. ' - 'A pandas plotting backend must be a module that ' - 'can be imported'.format(backend_str)) + raise ValueError( + '"{}" does not seem to be an installed module. ' + "A pandas plotting backend must be a module that " + "can be imported".format(backend_str) + ) -with cf.config_prefix('plotting'): - cf.register_option('backend', defval='matplotlib', - doc=plotting_backend_doc, - validator=str, - cb=register_plotting_backend_cb) +with cf.config_prefix("plotting"): + cf.register_option( + "backend", + defval="matplotlib", + doc=plotting_backend_doc, + validator=str, + cb=register_plotting_backend_cb, + ) register_converter_doc = """ @@ -569,5 +639,10 @@ def register_converter_cb(key): with cf.config_prefix("plotting.matplotlib"): - cf.register_option("register_converters", True, register_converter_doc, - validator=bool, cb=register_converter_cb) + cf.register_option( + "register_converters", + True, + register_converter_doc, + validator=bool, + cb=register_converter_cb, + ) diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index e9d7b9c4281bd..2b527e1fb5890 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,14 +1,47 @@ # flake8: noqa from .common import ( - is_array_like, is_bool, is_bool_dtype, is_categorical, - is_categorical_dtype, is_complex, is_complex_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimetz, is_dict_like, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_file_like, is_float, - is_float_dtype, is_hashable, is_int64_dtype, is_integer, is_integer_dtype, - is_interval, is_interval_dtype, is_iterator, is_list_like, is_named_tuple, - is_number, is_numeric_dtype, is_object_dtype, is_period, is_period_dtype, - is_re, is_re_compilable, is_scalar, is_signed_integer_dtype, is_sparse, - is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, - is_unsigned_integer_dtype, pandas_dtype) + is_array_like, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetimetz, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_file_like, + is_float, + is_float_dtype, + is_hashable, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_interval, + is_interval_dtype, + is_iterator, + is_list_like, + is_named_tuple, 
+ is_number, + is_numeric_dtype, + is_object_dtype, + is_period, + is_period_dtype, + is_re, + is_re_compilable, + is_scalar, + is_signed_integer_dtype, + is_sparse, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e7191136a7d53..59ef17e3d121f 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -68,6 +68,7 @@ class property**. ``pandas.errors.AbstractMethodError`` and no ``register`` method is provided for registering virtual subclasses. """ + _metadata = () # type: Tuple[str, ...] def __str__(self): @@ -98,8 +99,7 @@ def __eq__(self, other): return False if isinstance(other, type(self)): return all( - getattr(self, attr) == getattr(other, attr) - for attr in self._metadata + getattr(self, attr) == getattr(other, attr) for attr in self._metadata ) return False @@ -146,7 +146,7 @@ def kind(self) -> str: -------- numpy.dtype.kind """ - return 'O' + return "O" @property def name(self) -> str: @@ -223,8 +223,9 @@ def construct_from_string(cls, string: str): if not isinstance(string, str): raise TypeError("Expects a string, got {}".format(type(string))) if string != cls.name: - raise TypeError("Cannot construct a '{}' from '{}'".format( - cls.__name__, string)) + raise TypeError( + "Cannot construct a '{}' from '{}'".format(cls.__name__, string) + ) return cls() @classmethod @@ -250,10 +251,9 @@ def is_dtype(cls, dtype) -> bool: 3. ``dtype`` has a ``dtype`` attribute, and any of the above conditions is true for ``dtype.dtype``. """ - dtype = getattr(dtype, 'dtype', dtype) + dtype = getattr(dtype, "dtype", dtype) - if isinstance(dtype, (ABCSeries, ABCIndexClass, - ABCDataFrame, np.dtype)): + if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)): # https://github.com/pandas-dev/pandas/issues/22960 # avoid passing data to `construct_from_string`. 
This could # cause a FutureWarning from numpy about failing elementwise diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c68d469d291e7..f483cf520754b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -8,19 +8,49 @@ from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT from .common import ( - _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, ensure_int8, - ensure_int16, ensure_int32, ensure_int64, ensure_object, ensure_str, - is_bool, is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype, - is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, - is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype, - pandas_dtype) + _INT64_DTYPE, + _NS_DTYPE, + _POSSIBLY_CAST_DTYPES, + _TD_DTYPE, + ensure_int8, + ensure_int16, + ensure_int32, + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_complex, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_datetimelike, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) from .dtypes import DatetimeTZDtype, ExtensionDtype, PeriodDtype from .generic import ( - ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex, - ABCSeries) + ABCDatetimeArray, + ABCDatetimeIndex, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from .inference import is_list_like from .missing import isna, notna @@ -35,8 +65,8 @@ def maybe_convert_platform(values): if isinstance(values, (list, tuple, range)): values = construct_1d_object_array_from_listlike(values) - if getattr(values, 'dtype', None) == np.object_: - if hasattr(values, '_values'): + if getattr(values, "dtype", None) == np.object_: + if hasattr(values, "_values"): values = values._values values = lib.maybe_convert_objects(values) @@ -72,27 +102,27 @@ def trans(x): return x if isinstance(dtype, str): - if dtype == 'infer': - inferred_type = lib.infer_dtype(ensure_object(result.ravel()), - skipna=False) - if inferred_type == 'boolean': - dtype = 'bool' - elif inferred_type == 'integer': - dtype = 'int64' - elif inferred_type == 'datetime64': - dtype = 'datetime64[ns]' - elif inferred_type == 'timedelta64': - dtype = 'timedelta64[ns]' + if dtype == "infer": + inferred_type = lib.infer_dtype(ensure_object(result.ravel()), skipna=False) + if inferred_type == "boolean": + dtype = "bool" + elif inferred_type == "integer": + dtype = "int64" + elif inferred_type == "datetime64": + dtype = "datetime64[ns]" + elif inferred_type == "timedelta64": + dtype = "timedelta64[ns]" # try to upcast here - elif inferred_type == 'floating': - dtype = 'int64' + elif inferred_type == "floating": + dtype = "int64" if issubclass(result.dtype.type, np.number): def trans(x): # noqa return x.round() + else: - dtype = 'object' + dtype = "object" if isinstance(dtype, str): dtype = np.dtype(dtype) @@ -101,8 +131,7 @@ def trans(x): # noqa # don't allow upcasts here (except if empty) if dtype.kind == result.dtype.kind: - if (result.dtype.itemsize <= 
dtype.itemsize and - np.prod(result.shape)): + if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): return result if is_bool_dtype(dtype) or is_integer_dtype(dtype): @@ -116,17 +145,21 @@ def trans(x): # noqa arr = np.array([r[0]]) # if we have any nulls, then we are done - if (isna(arr).any() or - not np.allclose(arr, trans(arr).astype(dtype), rtol=0)): + if isna(arr).any() or not np.allclose( + arr, trans(arr).astype(dtype), rtol=0 + ): return result # a comparable, e.g. a Decimal may slip in here - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, - float, bool)): + elif not isinstance( + r[0], (np.integer, np.floating, np.bool, int, float, bool) + ): return result - if (issubclass(result.dtype.type, (np.object_, np.number)) and - notna(result).all()): + if ( + issubclass(result.dtype.type, (np.object_, np.number)) + and notna(result).all() + ): new_result = trans(result).astype(dtype) try: if np.allclose(new_result, result, rtol=0): @@ -137,20 +170,20 @@ def trans(x): # noqa # hit here if (new_result == result).all(): return new_result - elif (issubclass(dtype.type, np.floating) and - not is_bool_dtype(result.dtype)): + elif issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype): return result.astype(dtype) # a datetimelike # GH12821, iNaT is casted to float - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: + elif dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: try: result = result.astype(dtype) except Exception: if dtype.tz: # convert to datetime and change timezone from pandas import to_datetime - result = to_datetime(result).tz_localize('utc') + + result = to_datetime(result).tz_localize("utc") result = result.tz_convert(dtype.tz) elif dtype.type == Period: @@ -206,7 +239,7 @@ def maybe_upcast_putmask(result, mask, other): if is_datetimelike(result.dtype): if is_scalar(other): if isna(other): - other = result.dtype.type('nat') + other = result.dtype.type("nat") elif is_integer(other): other = np.array(other, dtype=result.dtype) elif is_integer_dtype(other): @@ -244,8 +277,7 @@ def changeit(): # we have a scalar or len 0 ndarray # and its nan and we are changing some values - if (is_scalar(other) or - (isinstance(other, np.ndarray) and other.ndim < 1)): + if is_scalar(other) or (isinstance(other, np.ndarray) and other.ndim < 1): if isna(other): return changeit() @@ -385,10 +417,10 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): elif isinstance(val, (np.datetime64, datetime)): val = tslibs.Timestamp(val) if val is tslibs.NaT or val.tz is None: - dtype = np.dtype('M8[ns]') + dtype = np.dtype("M8[ns]") else: if pandas_dtype: - dtype = DatetimeTZDtype(unit='ns', tz=val.tz) + dtype = DatetimeTZDtype(unit="ns", tz=val.tz) else: # return datetimetz as object return np.object_, val @@ -396,7 +428,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False): elif isinstance(val, (np.timedelta64, timedelta)): val = tslibs.Timedelta(val).value - dtype = np.dtype('m8[ns]') + dtype = np.dtype("m8[ns]") elif is_bool(val): dtype = np.bool_ @@ -473,8 +505,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False): # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr, skipna=False) - if inferred in ['string', 'bytes', 'unicode', - 'mixed', 'mixed-integer']: + if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]: return (np.object_, arr) arr = np.asarray(arr) @@ -506,7 +537,7 @@ def maybe_infer_dtype_type(element): numpy.int64 """ tipo = None - if hasattr(element, 'dtype'): + if 
hasattr(element, "dtype"): tipo = element.dtype elif is_list_like(element): element = np.asarray(element) @@ -547,15 +578,16 @@ def maybe_cast_item(obj, item, dtype): if dtype in (np.object_, np.bool_): obj[item] = chunk.astype(np.object_) elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: {dtype}" - .format(dtype=dtype)) + raise ValueError( + "Unexpected dtype encountered: {dtype}".format(dtype=dtype) + ) def invalidate_string_dtypes(dtype_set): """Change string like dtypes to object for ``DataFrame.select_dtypes()``. """ - non_string_dtypes = dtype_set - {np.dtype('S').type, np.dtype(' 1 and coerce: - raise ValueError("Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True.") + raise ValueError( + "Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True." + ) if isinstance(values, (list, tuple)): # List or scalar values = np.array(values, dtype=np.object_) - elif not hasattr(values, 'dtype'): + elif not hasattr(values, "dtype"): values = np.array([values], dtype=np.object_) elif not is_object_dtype(values.dtype): # If not object, do not attempt conversion @@ -798,21 +839,23 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, # Immediate return if coerce if datetime: from pandas import to_datetime - return to_datetime(values, errors='coerce').to_numpy() + + return to_datetime(values, errors="coerce").to_numpy() elif timedelta: from pandas import to_timedelta - return to_timedelta(values, errors='coerce').to_numpy() + + return to_timedelta(values, errors="coerce").to_numpy() elif numeric: from pandas import to_numeric - return to_numeric(values, errors='coerce') + + return to_numeric(values, errors="coerce") # Soft conversions if datetime: # GH 20380, when datetime is beyond year 2262, hence outside # bound of nanosecond-resolution 64-bit integers. try: - values = lib.maybe_convert_objects(values, - convert_datetime=datetime) + values = lib.maybe_convert_objects(values, convert_datetime=datetime) except OutOfBoundsDatetime: pass @@ -822,8 +865,7 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, if numeric and is_object_dtype(values.dtype): try: - converted = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) + converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True) # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values values = values.copy() if copy else values @@ -839,9 +881,9 @@ def maybe_castable(arr): # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind - if kind == 'M': + if kind == "M": return is_datetime64_ns_dtype(arr.dtype) - elif kind == 'm': + elif kind == "m": return is_timedelta64_ns_dtype(arr.dtype) return arr.dtype.name not in _POSSIBLY_CAST_DTYPES @@ -866,8 +908,9 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): """ # TODO: why not timedelta? 
- if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex, - ABCDatetimeArray, ABCPeriodArray)): + if isinstance( + value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) + ): return value elif isinstance(value, ABCSeries): if isinstance(value._values, ABCDatetimeIndex): @@ -894,9 +937,7 @@ def try_datetime(v): # safe coerce to datetime64 try: # GH19671 - v = tslib.array_to_datetime(v, - require_iso8601=True, - errors='raise')[0] + v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0] except ValueError: # we might have a sequence of the same-datetimes with tz's @@ -907,8 +948,7 @@ def try_datetime(v): from pandas import DatetimeIndex values, tz = conversion.datetime_to_datetime64(v) - return DatetimeIndex(values).tz_localize( - 'UTC').tz_convert(tz=tz) + return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz) except (ValueError, TypeError): pass @@ -922,6 +962,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta + try: return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: @@ -929,13 +970,13 @@ def try_timedelta(v): inferred_type = lib.infer_datetimelike_array(ensure_object(v)) - if inferred_type == 'date' and convert_dates: + if inferred_type == "date" and convert_dates: value = try_datetime(v) - elif inferred_type == 'datetime': + elif inferred_type == "datetime": value = try_datetime(v) - elif inferred_type == 'timedelta': + elif inferred_type == "timedelta": value = try_timedelta(v) - elif inferred_type == 'nat': + elif inferred_type == "nat": # if all NaT, return as datetime if isna(v).all(): @@ -946,7 +987,7 @@ def try_timedelta(v): # try timedelta first to avoid spurious datetime conversions # e.g. '00:00:01' is a timedelta but technically is also a datetime value = try_timedelta(v) - if lib.infer_dtype(value, skipna=False) in ['mixed']: + if lib.infer_dtype(value, skipna=False) in ["mixed"]: # cannot skip missing values, as NaT implies that the string # is actually a datetime value = try_datetime(v) @@ -954,7 +995,7 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors='raise'): +def maybe_cast_to_datetime(value, dtype, errors="raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -972,17 +1013,21 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): if is_datetime64 or is_datetime64tz or is_timedelta64: # Force the dtype if needed. - msg = ("The '{dtype}' dtype has no unit. " - "Please pass in '{dtype}[ns]' instead.") + msg = ( + "The '{dtype}' dtype has no unit. " + "Please pass in '{dtype}[ns]' instead." 
+ ) if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): - if dtype.name in ('datetime64', 'datetime64[ns]'): - if dtype.name == 'datetime64': + if dtype.name in ("datetime64", "datetime64[ns]"): + if dtype.name == "datetime64": raise ValueError(msg.format(dtype=dtype.name)) dtype = _NS_DTYPE else: - raise TypeError("cannot convert datetimelike to " - "dtype [{dtype}]".format(dtype=dtype)) + raise TypeError( + "cannot convert datetimelike to " + "dtype [{dtype}]".format(dtype=dtype) + ) elif is_datetime64tz: # our NaT doesn't support tz's @@ -992,13 +1037,15 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): value = [value] elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): - if dtype.name in ('timedelta64', 'timedelta64[ns]'): - if dtype.name == 'timedelta64': + if dtype.name in ("timedelta64", "timedelta64[ns]"): + if dtype.name == "timedelta64": raise ValueError(msg.format(dtype=dtype.name)) dtype = _TD_DTYPE else: - raise TypeError("cannot convert timedeltalike to " - "dtype [{dtype}]".format(dtype=dtype)) + raise TypeError( + "cannot convert timedeltalike to " + "dtype [{dtype}]".format(dtype=dtype) + ) if is_scalar(value): if value == iNaT or isna(value): @@ -1011,8 +1058,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): value = iNaT # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, - dtype): + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: value = to_datetime(value, errors=errors) @@ -1034,8 +1080,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): else: # Numeric values are UTC at this point, # so localize and convert - value = (value.tz_localize('UTC') - .tz_convert(dtype.tz)) + value = value.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values except OutOfBoundsDatetime: @@ -1048,12 +1093,11 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) - ints = np.asarray(value).view('i8') + ints = np.asarray(value).view("i8") return tslib.ints_to_pydatetime(ints) # we have a non-castable dtype that was passed - raise TypeError('Cannot cast datetime64 to {dtype}' - .format(dtype=dtype)) + raise TypeError("Cannot cast datetime64 to {dtype}".format(dtype=dtype)) else: @@ -1061,20 +1105,24 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'): # catch a datetime/timedelta that is not of ns variety # and no coercion specified - if is_array and value.dtype.kind in ['M', 'm']: + if is_array and value.dtype.kind in ["M", "m"]: dtype = value.dtype - if dtype.kind == 'M' and dtype != _NS_DTYPE: + if dtype.kind == "M" and dtype != _NS_DTYPE: value = tslibs.conversion.ensure_datetime64ns(value) - elif dtype.kind == 'm' and dtype != _TD_DTYPE: + elif dtype.kind == "m" and dtype != _TD_DTYPE: value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion - elif not (is_array and not (issubclass(value.dtype.type, np.integer) or - value.dtype == np.object_)): + elif not ( + is_array + and not ( + issubclass(value.dtype.type, np.integer) or value.dtype == np.object_ + ) + ): value = maybe_infer_to_datetimelike(value) return value @@ -1099,7 +1147,7 @@ def find_common_type(types): """ if len(types) == 0: - raise ValueError('no types given') + raise ValueError("no types 
given") first = types[0] @@ -1113,9 +1161,9 @@ def find_common_type(types): # take lowest unit if all(is_datetime64_dtype(t) for t in types): - return np.dtype('datetime64[ns]') + return np.dtype("datetime64[ns]") if all(is_timedelta64_dtype(t) for t in types): - return np.dtype('timedelta64[ns]') + return np.dtype("timedelta64[ns]") # don't mix bool / int or float or complex # this is different from numpy, which casts bool with float/int as int @@ -1174,9 +1222,11 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): """ if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex + subarr = DatetimeIndex([value] * length, dtype=dtype) elif is_categorical_dtype(dtype): from pandas import Categorical + subarr = Categorical([value] * length, dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): @@ -1184,7 +1234,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if length and is_integer_dtype(dtype) and isna(value): # coerce if we have nan for an integer dtype - dtype = np.dtype('float64') + dtype = np.dtype("float64") elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to take our string as a scalar value @@ -1218,7 +1268,7 @@ def construct_1d_object_array_from_listlike(values): """ # numpy will try to interpret nested lists as further dimensions, hence # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype='object') + result = np.empty(len(values), dtype="object") result[:] = values return result @@ -1314,8 +1364,10 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): else: casted = arr.astype(dtype, copy=copy) except OverflowError: - raise OverflowError("The elements provided in the data cannot all be " - "casted to the dtype {dtype}".format(dtype=dtype)) + raise OverflowError( + "The elements provided in the data cannot all be " + "casted to the dtype {dtype}".format(dtype=dtype) + ) if np.array_equal(arr, casted): return casted @@ -1328,9 +1380,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False): arr = np.asarray(arr) if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): - raise OverflowError("Trying to coerce negative values " - "to unsigned integers") + raise OverflowError("Trying to coerce negative values " "to unsigned integers") - if is_integer_dtype(dtype) and (is_float_dtype(arr) or - is_object_dtype(arr)): + if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b2b74e2a70ca9..d0e4bd9b4482a 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,22 +9,61 @@ from pandas.compat import PY36 from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, - PeriodDtype, registry) + CategoricalDtype, + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, + registry, +) from pandas.core.dtypes.generic import ( - ABCCategorical, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, - ABCPeriodArray, ABCPeriodIndex, ABCSeries) + ABCCategorical, + ABCDateOffset, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodArray, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.inference import ( # noqa:F401 - is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like, - is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like, - 
is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable, - is_scalar, is_sequence, is_string_like) + is_array_like, + is_bool, + is_complex, + is_decimal, + is_dict_like, + is_file_like, + is_float, + is_hashable, + is_integer, + is_interval, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_number, + is_re, + is_re_compilable, + is_scalar, + is_sequence, + is_string_like, +) from pandas._typing import ArrayLike -_POSSIBLY_CAST_DTYPES = {np.dtype(t).name - for t in ['O', 'int8', 'uint8', 'int16', 'uint16', - 'int32', 'uint32', 'int64', 'uint64']} +_POSSIBLY_CAST_DTYPES = { + np.dtype(t).name + for t in [ + "O", + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + "uint64", + ] +} _NS_DTYPE = conversion.NS_DTYPE _TD_DTYPE = conversion.TD_DTYPE @@ -74,7 +113,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: Ensure that bytes and non-strings get converted into ``str`` objects. """ if isinstance(value, bytes): - value = value.decode('utf-8') + value = value.decode("utf-8") elif not isinstance(value, str): value = str(value) return value @@ -97,6 +136,7 @@ def ensure_categorical(arr): if not is_categorical(arr): from pandas import Categorical + arr = Categorical(arr) return arr @@ -128,13 +168,13 @@ def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array: will remain unchanged. """ try: - return arr.astype('int64', copy=copy, casting='safe') + return arr.astype("int64", copy=copy, casting="safe") except TypeError: pass try: - return arr.astype('uint64', copy=copy, casting='safe') + return arr.astype("uint64", copy=copy, casting="safe") except TypeError: - return arr.astype('float64', copy=copy) + return arr.astype("float64", copy=copy) def ensure_python_int(value: Union[int, np.integer]) -> int: @@ -154,12 +194,13 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: TypeError: if the value isn't an int or can't be converted to one. """ if not is_scalar(value): - raise TypeError("Value needs to be a scalar value, was type {}" - .format(type(value))) + raise TypeError( + "Value needs to be a scalar value, was type {}".format(type(value)) + ) msg = "Wrong type {} for value {}" try: new_value = int(value) - assert (new_value == value) + assert new_value == value except (TypeError, ValueError, AssertionError): raise TypeError(msg.format(type(value), value)) return new_value @@ -175,8 +216,10 @@ def classes_and_not_datetimelike(*klasses): evaluate if the tipo is a subclass of the klasses and not a datetimelike """ - return lambda tipo: (issubclass(tipo, klasses) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) + return lambda tipo: ( + issubclass(tipo, klasses) + and not issubclass(tipo, (np.datetime64, np.timedelta64)) + ) def is_object_dtype(arr_or_dtype): @@ -267,7 +310,7 @@ def is_sparse(arr): """ from pandas.core.arrays.sparse import SparseDtype - dtype = getattr(arr, 'dtype', arr) + dtype = getattr(arr, "dtype", arr) return isinstance(dtype, SparseDtype) @@ -385,9 +428,12 @@ def is_datetimetz(arr): True """ - warnings.warn("'is_datetimetz' is deprecated and will be removed in a " - "future version. Use 'is_datetime64tz_dtype' instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "'is_datetimetz' is deprecated and will be removed in a " + "future version. 
Use 'is_datetime64tz_dtype' instead.", + FutureWarning, + stacklevel=2, + ) return is_datetime64tz_dtype(arr) @@ -417,8 +463,7 @@ def is_offsetlike(arr_or_obj): """ if isinstance(arr_or_obj, ABCDateOffset): return True - elif (is_list_like(arr_or_obj) and len(arr_or_obj) and - is_object_dtype(arr_or_obj)): + elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) return False @@ -449,9 +494,13 @@ def is_period(arr): True """ - warnings.warn("'is_period' is deprecated and will be removed in a future " - "version. Use 'is_period_dtype' or is_period_arraylike' " - "instead.", FutureWarning, stacklevel=2) + warnings.warn( + "'is_period' is deprecated and will be removed in a future " + "version. Use 'is_period_dtype' or is_period_arraylike' " + "instead.", + FutureWarning, + stacklevel=2, + ) return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) @@ -690,7 +739,8 @@ def is_string_dtype(arr_or_dtype): # TODO: gh-15585: consider making the checks stricter. def condition(dtype): - return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) + return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + return _is_dtype(arr_or_dtype, condition) @@ -723,7 +773,7 @@ def is_period_arraylike(arr): return True elif isinstance(arr, (np.ndarray, ABCSeries)): return is_period_dtype(arr.dtype) - return getattr(arr, 'inferred_type', None) == 'period' + return getattr(arr, "inferred_type", None) == "period" def is_datetime_arraylike(arr): @@ -754,9 +804,11 @@ def is_datetime_arraylike(arr): if isinstance(arr, ABCDatetimeIndex): return True elif isinstance(arr, (np.ndarray, ABCSeries)): - return (is_object_dtype(arr.dtype) - and lib.infer_dtype(arr, skipna=False) == 'datetime') - return getattr(arr, 'inferred_type', None) == 'datetime' + return ( + is_object_dtype(arr.dtype) + and lib.infer_dtype(arr, skipna=False) == "datetime" + ) + return getattr(arr, "inferred_type", None) == "datetime" def is_datetimelike(arr): @@ -799,9 +851,12 @@ def is_datetimelike(arr): True """ - return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or - is_timedelta64_dtype(arr) or - isinstance(arr, ABCPeriodIndex)) + return ( + is_datetime64_dtype(arr) + or is_datetime64tz_dtype(arr) + or is_timedelta64_dtype(arr) + or isinstance(arr, ABCPeriodIndex) + ) def is_dtype_equal(source, target): @@ -925,8 +980,7 @@ def is_any_int_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes(np.integer, np.timedelta64)) + return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64)) def is_integer_dtype(arr_or_dtype): @@ -981,8 +1035,7 @@ def is_integer_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.integer)) + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer)) def is_signed_integer_dtype(arr_or_dtype): @@ -1039,8 +1092,7 @@ def is_signed_integer_dtype(arr_or_dtype): False """ - return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) + return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)) def is_unsigned_integer_dtype(arr_or_dtype): @@ -1088,7 +1140,8 @@ def is_unsigned_integer_dtype(arr_or_dtype): True """ return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)) + arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger) + ) def is_int64_dtype(arr_or_dtype): @@ -1179,8 +1232,7 @@ def 
is_datetime64_any_dtype(arr_or_dtype): if arr_or_dtype is None: return False - return (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) + return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) def is_datetime64_ns_dtype(arr_or_dtype): @@ -1230,7 +1282,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): tipo = _get_dtype(arr_or_dtype.dtype) else: return False - return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE + return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE def is_timedelta64_ns_dtype(arr_or_dtype): @@ -1300,8 +1352,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype): True """ - return _is_dtype_type( - arr_or_dtype, classes(np.datetime64, np.timedelta64)) + return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64)) def _is_unorderable_exception(e): @@ -1325,7 +1376,7 @@ def _is_unorderable_exception(e): if PY36: return "'>' not supported between instances of" in str(e) - return 'unorderable' in str(e) + return "unorderable" in str(e) def is_numeric_v_string_like(a, b): @@ -1380,10 +1431,12 @@ def is_numeric_v_string_like(a, b): is_a_scalar_string_like = not is_a_array and is_string_like(a) is_b_scalar_string_like = not is_b_array and is_string_like(b) - return ((is_a_numeric_array and is_b_scalar_string_like) or - (is_b_numeric_array and is_a_scalar_string_like) or - (is_a_numeric_array and is_b_string_array) or - (is_b_numeric_array and is_a_string_array)) + return ( + (is_a_numeric_array and is_b_scalar_string_like) + or (is_b_numeric_array and is_a_scalar_string_like) + or (is_a_numeric_array and is_b_string_array) + or (is_b_numeric_array and is_a_string_array) + ) def is_datetimelike_v_numeric(a, b): @@ -1428,9 +1481,9 @@ def is_datetimelike_v_numeric(a, b): False """ - if not hasattr(a, 'dtype'): + if not hasattr(a, "dtype"): a = np.asarray(a) - if not hasattr(b, 'dtype'): + if not hasattr(b, "dtype"): b = np.asarray(b) def is_numeric(x): @@ -1440,8 +1493,9 @@ def is_numeric(x): return is_integer_dtype(x) or is_float_dtype(x) is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_numeric(b)) or - (is_datetimelike(b) and is_numeric(a))) + return (is_datetimelike(a) and is_numeric(b)) or ( + is_datetimelike(b) and is_numeric(a) + ) def is_datetimelike_v_object(a, b): @@ -1487,14 +1541,15 @@ def is_datetimelike_v_object(a, b): False """ - if not hasattr(a, 'dtype'): + if not hasattr(a, "dtype"): a = np.asarray(a) - if not hasattr(b, 'dtype'): + if not hasattr(b, "dtype"): b = np.asarray(b) is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_object_dtype(b)) or - (is_datetimelike(b) and is_object_dtype(a))) + return (is_datetimelike(a) and is_object_dtype(b)) or ( + is_datetimelike(b) and is_object_dtype(a) + ) def needs_i8_conversion(arr_or_dtype): @@ -1534,9 +1589,11 @@ def needs_i8_conversion(arr_or_dtype): if arr_or_dtype is None: return False - return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype) or - is_period_dtype(arr_or_dtype)) + return ( + is_datetime_or_timedelta_dtype(arr_or_dtype) + or is_datetime64tz_dtype(arr_or_dtype) + or is_period_dtype(arr_or_dtype) + ) def is_numeric_dtype(arr_or_dtype): @@ -1578,7 +1635,8 @@ def is_numeric_dtype(arr_or_dtype): """ return _is_dtype_type( - arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)) + arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_) + ) def is_string_like_dtype(arr_or_dtype): @@ -1610,8 +1668,7 @@ def 
is_string_like_dtype(arr_or_dtype): False """ - return _is_dtype( - arr_or_dtype, lambda dtype: dtype.kind in ('S', 'U')) + return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U")) def is_float_dtype(arr_or_dtype): @@ -1705,10 +1762,9 @@ def is_bool_dtype(arr_or_dtype): # we don't have a boolean Index class # so its object, we need to infer to # guess this - return (arr_or_dtype.is_object and - arr_or_dtype.inferred_type == 'boolean') + return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) return dtype._is_boolean return issubclass(dtype.type, np.bool_) @@ -1818,9 +1874,8 @@ def is_extension_array_dtype(arr_or_dtype): >>> is_extension_array_dtype(arr.dtype) False """ - dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) - return (isinstance(dtype, ExtensionDtype) or - registry.find(dtype) is not None) + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None def is_complex_dtype(arr_or_dtype): @@ -1911,7 +1966,7 @@ def _get_dtype(arr_or_dtype): return np.dtype(arr_or_dtype) # if we have an array-like - elif hasattr(arr_or_dtype, 'dtype'): + elif hasattr(arr_or_dtype, "dtype"): arr_or_dtype = arr_or_dtype.dtype return pandas_dtype(arr_or_dtype) @@ -1944,7 +1999,7 @@ def _is_dtype_type(arr_or_dtype, condition): return condition(np.dtype(arr_or_dtype).type) # if we have an array-like - if hasattr(arr_or_dtype, 'dtype'): + if hasattr(arr_or_dtype, "dtype"): arr_or_dtype = arr_or_dtype.dtype # we are not possibly a dtype @@ -2005,13 +2060,13 @@ def infer_dtype_from_object(dtype): # TODO(jreback) # should deprecate these - if dtype in ['datetimetz', 'datetime64tz']: + if dtype in ["datetimetz", "datetime64tz"]: return DatetimeTZDtype.type - elif dtype in ['period']: + elif dtype in ["period"]: raise NotImplementedError - if dtype == 'datetime' or dtype == 'timedelta': - dtype += '64' + if dtype == "datetime" or dtype == "timedelta": + dtype += "64" try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): @@ -2045,9 +2100,9 @@ def _validate_date_like_dtype(dtype): try: typ = np.datetime_data(dtype)[0] except ValueError as e: - raise TypeError('{error}'.format(error=e)) - if typ != 'generic' and typ != 'ns': - msg = '{name!r} is too specific of a frequency, try passing {type!r}' + raise TypeError("{error}".format(error=e)) + if typ != "generic" and typ != "ns": + msg = "{name!r} is too specific of a frequency, try passing {type!r}" raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__)) @@ -2086,19 +2141,18 @@ def pandas_dtype(dtype): # we don't want to force a repr of the non-string if not isinstance(dtype, str): raise TypeError("data type not understood") - raise TypeError("data type '{}' not understood".format( - dtype)) + raise TypeError("data type '{}' not understood".format(dtype)) # Any invalid dtype (such as pd.Timestamp) should raise an error. # np.dtype(invalid_type).kind = 0 for such objects. However, this will # also catch some valid dtypes such as object, np.object_ and 'object' # which we safeguard against by catching them earlier and returning # np.dtype(valid_dtype) before this condition is evaluated. 
- if is_hashable(dtype) and dtype in [object, np.object_, 'object', 'O']: + if is_hashable(dtype) and dtype in [object, np.object_, "object", "O"]: # check hashability to avoid errors/DeprecationWarning when we get # here and `dtype` is an array return npdtype - elif npdtype.kind == 'O': + elif npdtype.kind == "O": raise TypeError("dtype '{}' not understood".format(dtype)) return npdtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 66f7a6365fe41..ac74ad5726a99 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -7,12 +7,27 @@ from pandas._libs import tslib, tslibs from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_object_dtype, is_sparse, is_timedelta64_dtype) + _NS_DTYPE, + _TD_DTYPE, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_object_dtype, + is_sparse, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, - ABCRangeIndex, ABCSparseDataFrame, ABCTimedeltaIndex) + ABCDatetimeArray, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodIndex, + ABCRangeIndex, + ABCSparseDataFrame, + ABCTimedeltaIndex, +) def get_dtype_kinds(l): @@ -31,23 +46,23 @@ def get_dtype_kinds(l): dtype = arr.dtype if is_categorical_dtype(dtype): - typ = 'category' + typ = "category" elif is_sparse(arr): - typ = 'sparse' + typ = "sparse" elif isinstance(arr, ABCRangeIndex): - typ = 'range' + typ = "range" elif is_datetime64tz_dtype(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) elif is_datetime64_dtype(dtype): - typ = 'datetime' + typ = "datetime" elif is_timedelta64_dtype(dtype): - typ = 'timedelta' + typ = "timedelta" elif is_object_dtype(dtype): - typ = 'object' + typ = "object" elif is_bool_dtype(dtype): - typ = 'bool' + typ = "bool" elif is_extension_array_dtype(dtype): typ = str(arr.dtype) else: @@ -66,8 +81,7 @@ def _get_series_result_type(result, objs=None): # concat Series with axis 1 if isinstance(result, dict): # concat Series with axis 1 - if all(isinstance(c, (SparseSeries, SparseDataFrame)) - for c in result.values()): + if all(isinstance(c, (SparseSeries, SparseDataFrame)) for c in result.values()): return SparseDataFrame else: return DataFrame @@ -83,13 +97,12 @@ def _get_frame_result_type(result, objs): otherwise, return 1st obj """ - if (result.blocks and ( - any(isinstance(obj, ABCSparseDataFrame) for obj in objs))): + if result.blocks and (any(isinstance(obj, ABCSparseDataFrame) for obj in objs)): from pandas.core.sparse.api import SparseDataFrame + return SparseDataFrame else: - return next(obj for obj in objs if not isinstance(obj, - ABCSparseDataFrame)) + return next(obj for obj in objs if not isinstance(obj, ABCSparseDataFrame)) def _concat_compat(to_concat, axis=0): @@ -125,24 +138,24 @@ def is_nonempty(x): # np.concatenate which has them both implemented is compiled. 
typs = get_dtype_kinds(to_concat) - _contains_datetime = any(typ.startswith('datetime') for typ in typs) - _contains_period = any(typ.startswith('period') for typ in typs) + _contains_datetime = any(typ.startswith("datetime") for typ in typs) + _contains_period = any(typ.startswith("period") for typ in typs) - if 'category' in typs: + if "category" in typs: # this must be prior to _concat_datetime, # to support Categorical + datetime-like return _concat_categorical(to_concat, axis=axis) - elif _contains_datetime or 'timedelta' in typs or _contains_period: + elif _contains_datetime or "timedelta" in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) # these are mandated to handle empties as well - elif 'sparse' in typs: + elif "sparse" in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) all_empty = all(not is_nonempty(x) for x in to_concat) if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1: - to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat] + to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] if all_empty: # we have all empties, but may need to coerce the result dtype to @@ -151,13 +164,12 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) if len(typs) != 1: - if (not len(typs - {'i', 'u', 'f'}) or - not len(typs - {'bool', 'i', 'u'})): + if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): # let numpy coerce pass else: # coerce to object - to_concat = [x.astype('object') for x in to_concat] + to_concat = [x.astype("object") for x in to_concat] return np.concatenate(to_concat, axis=axis) @@ -194,9 +206,14 @@ def _concat_categorical(to_concat, axis=0): return union_categoricals(categoricals) # extract the categoricals & coerce to object if needed - to_concat = [x._internal_get_values() if is_categorical_dtype(x.dtype) - else np.asarray(x).ravel() if not is_datetime64tz_dtype(x) - else np.asarray(x.astype(object)) for x in to_concat] + to_concat = [ + x._internal_get_values() + if is_categorical_dtype(x.dtype) + else np.asarray(x).ravel() + if not is_datetime64tz_dtype(x) + else np.asarray(x.astype(object)) + for x in to_concat + ] result = _concat_compat(to_concat) if axis == 1: result = result.reshape(1, len(result)) @@ -309,7 +326,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): from pandas.core.arrays.categorical import _recode_for_categories if len(to_union) == 0: - raise ValueError('No Categoricals to union') + raise ValueError("No Categoricals to union") def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): @@ -322,8 +339,10 @@ def _maybe_unwrap(x): to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] - if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) - for other in to_union[1:]): + if not all( + is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:] + ): raise TypeError("dtype of categories must be the same") ordered = False @@ -332,25 +351,26 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) - for other in to_union[1:]): + if all(first.categories.equals(other.categories) for other in to_union[1:]): new_codes = np.concatenate([c.codes for c in to_union]) else: - codes = [first.codes] + [_recode_for_categories(other.codes, - other.categories, - first.categories) - for other in to_union[1:]] + codes = [first.codes] + [ + _recode_for_categories(other.codes, 
other.categories, first.categories) + for other in to_union[1:] + ] new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: - raise TypeError("Cannot use sort_categories=True with " - "ordered Categoricals") + raise TypeError( + "Cannot use sort_categories=True with " "ordered Categoricals" + ) if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d + new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode @@ -359,23 +379,22 @@ def _maybe_unwrap(x): if sort_categories: categories = categories.sort_values() - new_codes = [_recode_for_categories(c.codes, c.categories, categories) - for c in to_union] + new_codes = [ + _recode_for_categories(c.codes, c.categories, categories) for c in to_union + ] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): - msg = ("to union ordered Categoricals, " - "all categories must be the same") + msg = "to union ordered Categoricals, " "all categories must be the same" raise TypeError(msg) else: - raise TypeError('Categorical.ordered must be the same') + raise TypeError("Categorical.ordered must be the same") if ignore_order: ordered = False - return Categorical(new_codes, categories=categories, ordered=ordered, - fastpath=True) + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) def _concatenate_2d(to_concat, axis): @@ -406,14 +425,14 @@ def _concat_datetime(to_concat, axis=0, typs=None): # multiple types, need to coerce to object if len(typs) != 1: - return _concatenate_2d([_convert_datetimelike_to_object(x) - for x in to_concat], - axis=axis) + return _concatenate_2d( + [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis + ) # must be single dtype - if any(typ.startswith('datetime') for typ in typs): + if any(typ.startswith("datetime") for typ in typs): - if 'datetime' in typs: + if "datetime" in typs: to_concat = [x.astype(np.int64, copy=False) for x in to_concat] return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE) else: @@ -421,11 +440,12 @@ def _concat_datetime(to_concat, axis=0, typs=None): # thus no need to care return _concat_datetimetz(to_concat) - elif 'timedelta' in typs: - return _concatenate_2d([x.view(np.int64) for x in to_concat], - axis=axis).view(_TD_DTYPE) + elif "timedelta" in typs: + return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view( + _TD_DTYPE + ) - elif any(typ.startswith('period') for typ in typs): + elif any(typ.startswith("period") for typ in typs): assert len(typs) == 1 cls = to_concat[0] new_values = cls._concat_same_type(to_concat) @@ -437,12 +457,11 @@ def _convert_datetimelike_to_object(x): # if dtype is of datetimetz or timezone if x.dtype.kind == _NS_DTYPE.kind: - if getattr(x, 'tz', None) is not None: + if getattr(x, "tz", None) is not None: x = np.asarray(x.astype(object)) else: shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), - box="timestamp") + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp") x = x.reshape(shape) elif x.dtype == _TD_DTYPE: @@ -483,17 +502,14 @@ def _concat_index_asobject(to_concat, name=None): from pandas import Index from pandas.core.arrays import ExtensionArray - klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, - 
ExtensionArray) - to_concat = [x.astype(object) if isinstance(x, klasses) else x - for x in to_concat] + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray) + to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] self = to_concat[0] attribs = self._get_attributes_dict() - attribs['name'] = name + attribs["name"] = name - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] + to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) @@ -516,14 +532,16 @@ def _concat_sparse(to_concat, axis=0, typs=None): from pandas.core.arrays import SparseArray - fill_values = [x.fill_value for x in to_concat - if isinstance(x, SparseArray)] + fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] fill_value = fill_values[0] # TODO: Fix join unit generation so we aren't passed this. - to_concat = [x if isinstance(x, SparseArray) - else SparseArray(x.squeeze(), fill_value=fill_value) - for x in to_concat] + to_concat = [ + x + if isinstance(x, SparseArray) + else SparseArray(x.squeeze(), fill_value=fill_value) + for x in to_concat + ] return SparseArray._concat_same_type(to_concat) @@ -557,8 +575,9 @@ def _concat_rangeindex_same_dtype(indexes): return _concat_index_same_dtype(indexes, klass=Int64Index) step = rng.start - start - non_consecutive = ((step != rng.step and len(rng) > 1) or - (next_ is not None and rng.start != next_)) + non_consecutive = (step != rng.step and len(rng) > 1) or ( + next_ is not None and rng.start != next_ + ) if non_consecutive: return _concat_index_same_dtype(indexes, klass=Int64Index) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d8d910a16e32a..1cf452b4a6c2c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,8 +9,7 @@ from pandas._libs.interval import Interval from pandas._libs.tslibs import NaT, Period, Timestamp, timezones -from pandas.core.dtypes.generic import ( - ABCCategoricalIndex, ABCDateOffset, ABCIndexClass) +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass from .base import ExtensionDtype from .inference import is_list_like @@ -25,8 +24,7 @@ OrderedType = Union[None, bool, object] -def register_extension_dtype(cls: Type[ExtensionDtype], - ) -> Type[ExtensionDtype]: +def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]: """ Register an ExtensionType with pandas as class decorator. @@ -67,6 +65,7 @@ class Registry: Multiple extension types can be registered. These are tried in order. 
""" + def __init__(self): self.dtypes = [] # type: List[Type[ExtensionDtype]] @@ -81,9 +80,9 @@ def register(self, dtype: Type[ExtensionDtype]) -> None: self.dtypes.append(dtype) - def find(self, - dtype: Union[Type[ExtensionDtype], str], - ) -> Optional[Type[ExtensionDtype]]: + def find( + self, dtype: Union[Type[ExtensionDtype], str] + ) -> Optional[Type[ExtensionDtype]]: """ Parameters ---------- @@ -120,6 +119,7 @@ class PandasExtensionDtype(ExtensionDtype): THIS IS NOT A REAL NUMPY DTYPE """ + type = None # type: Any kind = None # type: Any # The Any type annotations above are here only because mypy seems to have a @@ -149,8 +149,7 @@ def __repr__(self) -> str_type: return str(self) def __hash__(self) -> int: - raise NotImplementedError("sub-classes should implement an __hash__ " - "method") + raise NotImplementedError("sub-classes should implement an __hash__ " "method") def __getstate__(self) -> Dict[str_type, Any]: # pickle support; we don't want to pickle the cache @@ -166,6 +165,7 @@ class CategoricalDtypeType(type): """ the type of CategoricalDtype, this metaclass determines subclass ability """ + pass @@ -212,35 +212,31 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): dtype: category Categories (2, object): [b < a] """ + # TODO: Document public vs. private API - name = 'category' + name = "category" type = CategoricalDtypeType # type: Type[CategoricalDtypeType] - kind = 'O' # type: str_type - str = '|O08' - base = np.dtype('O') - _metadata = ('categories', 'ordered') + kind = "O" # type: str_type + str = "|O08" + base = np.dtype("O") + _metadata = ("categories", "ordered") _cache = {} # type: Dict[str_type, PandasExtensionDtype] - def __init__(self, - categories=None, - ordered: OrderedType = ordered_sentinel): + def __init__(self, categories=None, ordered: OrderedType = ordered_sentinel): self._finalize(categories, ordered, fastpath=False) @classmethod - def _from_fastpath(cls, - categories=None, - ordered: Optional[bool] = None - ) -> 'CategoricalDtype': + def _from_fastpath( + cls, categories=None, ordered: Optional[bool] = None + ) -> "CategoricalDtype": self = cls.__new__(cls) self._finalize(categories, ordered, fastpath=True) return self @classmethod - def _from_categorical_dtype(cls, - dtype: 'CategoricalDtype', - categories=None, - ordered: OrderedType = None, - ) -> 'CategoricalDtype': + def _from_categorical_dtype( + cls, dtype: "CategoricalDtype", categories=None, ordered: OrderedType = None + ) -> "CategoricalDtype": if categories is ordered is None: return dtype if categories is None: @@ -250,12 +246,13 @@ def _from_categorical_dtype(cls, return cls(categories, ordered) @classmethod - def _from_values_or_dtype(cls, - values=None, - categories=None, - ordered: Optional[bool] = None, - dtype: Optional['CategoricalDtype'] = None, - ) -> 'CategoricalDtype': + def _from_values_or_dtype( + cls, + values=None, + categories=None, + ordered: Optional[bool] = None, + dtype: Optional["CategoricalDtype"] = None, + ) -> "CategoricalDtype": """ Construct dtype from the input parameters used in :class:`Categorical`. 
@@ -316,19 +313,21 @@ def _from_values_or_dtype(cls, if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): - if dtype == 'category': + if dtype == "category": dtype = CategoricalDtype(categories, ordered) else: msg = "Unknown dtype {dtype!r}" raise ValueError(msg.format(dtype=dtype)) elif categories is not None or ordered is not None: - raise ValueError("Cannot specify `categories` or `ordered` " - "together with `dtype`.") + raise ValueError( + "Cannot specify `categories` or `ordered` " "together with `dtype`." + ) elif is_categorical(values): # If no "dtype" was passed, use the one from "values", but honor # the "ordered" and "categories" arguments - dtype = values.dtype._from_categorical_dtype(values.dtype, - categories, ordered) + dtype = values.dtype._from_categorical_dtype( + values.dtype, categories, ordered + ) else: # If dtype=None and values is not categorical, create a new dtype. # Note: This could potentially have categories=None and @@ -337,18 +336,15 @@ def _from_values_or_dtype(cls, return dtype - def _finalize(self, - categories, - ordered: OrderedType, - fastpath: bool = False, - ) -> None: + def _finalize( + self, categories, ordered: OrderedType, fastpath: bool = False + ) -> None: if ordered is not None and ordered is not ordered_sentinel: self.validate_ordered(ordered) if categories is not None: - categories = self.validate_categories(categories, - fastpath=fastpath) + categories = self.validate_categories(categories, fastpath=fastpath) self._categories = categories self._ordered = ordered if ordered is not ordered_sentinel else None @@ -358,8 +354,8 @@ def __setstate__(self, state: Dict[str_type, Any]) -> None: # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._categories = state.pop('categories', None) - self._ordered = state.pop('ordered', False) + self._categories = state.pop("categories", None) + self._ordered = state.pop("ordered", False) def __hash__(self) -> int: # _hash_categories returns a uint64, so use the negative @@ -389,7 +385,7 @@ def __eq__(self, other: Any) -> bool: return other == self.name elif other is self: return True - elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')): + elif not (hasattr(other, "_ordered") and hasattr(other, "categories")): return False elif self.categories is None or other.categories is None: # We're forced into a suboptimal corner thanks to math and @@ -401,8 +397,9 @@ def __eq__(self, other: Any) -> bool: elif self._ordered or other._ordered: # At least one has ordered=True; equal if both have ordered=True # and the same values for categories in the same order. - return ((self._ordered == other._ordered) and - self.categories.equals(other.categories)) + return (self._ordered == other._ordered) and self.categories.equals( + other.categories + ) else: # Neither has ordered=True; equal if both have the same categories, # but same order is not necessary. 
There is no distinction between @@ -411,7 +408,7 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self): - tpl = 'CategoricalDtype(categories={}ordered={})' + tpl = "CategoricalDtype(categories={}ordered={})" if self.categories is None: data = "None, " else: @@ -421,7 +418,9 @@ def __repr__(self): @staticmethod def _hash_categories(categories, ordered: OrderedType = True) -> int: from pandas.core.util.hashing import ( - hash_array, _combine_hash_arrays, hash_tuples + hash_array, + _combine_hash_arrays, + hash_tuples, ) from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE @@ -432,7 +431,7 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int: categories = list(categories) # breaks if a np.array of categories cat_array = hash_tuples(categories) else: - if categories.dtype == 'O': + if categories.dtype == "O": if len({type(x) for x in categories}) != 1: # TODO: hash_array doesn't handle mixed types. It casts # everything to a str first, which means we treat @@ -447,13 +446,12 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int: cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: - cat_array = np.vstack([ - cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) - ]) + cat_array = np.vstack( + [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)] + ) else: cat_array = [cat_array] - hashed = _combine_hash_arrays(iter(cat_array), - num_items=len(cat_array)) + hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod @@ -466,6 +464,7 @@ def construct_array_type(cls): type """ from pandas import Categorical + return Categorical @staticmethod @@ -485,6 +484,7 @@ def validate_ordered(ordered: OrderedType) -> None: If 'ordered' is not a boolean. 
""" from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") @@ -514,17 +514,17 @@ def validate_categories(categories, fastpath: bool = False): if not fastpath: if categories.hasnans: - raise ValueError('Categorial categories cannot be null') + raise ValueError("Categorial categories cannot be null") if not categories.is_unique: - raise ValueError('Categorical categories must be unique') + raise ValueError("Categorical categories must be unique") if isinstance(categories, ABCCategoricalIndex): categories = categories.categories return categories - def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': + def update_dtype(self, dtype: "CategoricalDtype") -> "CategoricalDtype": """ Returns a CategoricalDtype with categories and ordered taken from dtype if specified, otherwise falling back to self if unspecified @@ -537,12 +537,14 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': ------- new_dtype : CategoricalDtype """ - if isinstance(dtype, str) and dtype == 'category': + if isinstance(dtype, str) and dtype == "category": # dtype='category' should not change anything return self elif not self.is_dtype(dtype): - msg = ('a CategoricalDtype must be passed to perform an update, ' - 'got {dtype!r}').format(dtype=dtype) + msg = ( + "a CategoricalDtype must be passed to perform an update, " + "got {dtype!r}" + ).format(dtype=dtype) raise ValueError(msg) # dtype is CDT: keep current categories/ordered if None @@ -557,11 +559,13 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype': new_ordered = self._ordered if self._ordered and new_ordered_from_sentinel: # only warn if we'd actually change the existing behavior - msg = ("Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version, which will cause the resulting categorical's " - "`ordered` attribute to change to False; `ordered=True`" - " must be explicitly passed in order to be retained") + msg = ( + "Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version, which will cause the resulting categorical's " + "`ordered` attribute to change to False; `ordered=True`" + " must be explicitly passed in order to be retained" + ) warnings.warn(msg, FutureWarning, stacklevel=3) return CategoricalDtype(new_categories, new_ordered) @@ -582,9 +586,11 @@ def ordered(self) -> OrderedType: if self._ordered_from_sentinel and self._ordered is None: # warn when accessing ordered if ordered=None and None was not # explicitly passed to the constructor - msg = ("Constructing a CategoricalDtype without specifying " - "`ordered` will default to `ordered=False` in a future " - "version; `ordered=None` must be explicitly passed.") + msg = ( + "Constructing a CategoricalDtype without specifying " + "`ordered` will default to `ordered=False` in a future " + "version; `ordered=None` must be explicitly passed." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) return self._ordered @@ -632,13 +638,14 @@ class DatetimeTZDtype(PandasExtensionDtype): >>> pd.DatetimeTZDtype(tz='dateutil/US/Central') datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')] """ + type = Timestamp # type: Type[Timestamp] - kind = 'M' # type: str_type - str = '|M8[ns]' + kind = "M" # type: str_type + str = "|M8[ns]" num = 101 - base = np.dtype('M8[ns]') + base = np.dtype("M8[ns]") na_value = NaT - _metadata = ('unit', 'tz') + _metadata = ("unit", "tz") _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] @@ -646,7 +653,7 @@ def __init__(self, unit="ns", tz=None): if isinstance(unit, DatetimeTZDtype): unit, tz = unit.unit, unit.tz - if unit != 'ns': + if unit != "ns": if isinstance(unit, str) and tz is None: # maybe a string like datetime64[ns, tz], which we support for # now. @@ -697,6 +704,7 @@ def construct_array_type(cls): type """ from pandas.core.arrays import DatetimeArray + return DatetimeArray @classmethod @@ -722,7 +730,7 @@ def construct_from_string(cls, string): match = cls._match.match(string) if match: d = match.groupdict() - return cls(unit=d['unit'], tz=d['tz']) + return cls(unit=d["unit"], tz=d["tz"]) except Exception: # TODO(py3): Change this pass to `raise TypeError(msg) from e` pass @@ -747,16 +755,18 @@ def __eq__(self, other): if isinstance(other, str): return other == self.name - return (isinstance(other, DatetimeTZDtype) and - self.unit == other.unit and - str(self.tz) == str(other.tz)) + return ( + isinstance(other, DatetimeTZDtype) + and self.unit == other.unit + and str(self.tz) == str(other.tz) + ) def __setstate__(self, state): # for pickle compat. __get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._tz = state['tz'] - self._unit = state['unit'] + self._tz = state["tz"] + self._unit = state["unit"] @register_extension_dtype @@ -787,12 +797,13 @@ class PeriodDtype(PandasExtensionDtype): >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd()) period[M] """ + type = Period # type: Type[Period] - kind = 'O' # type: str_type - str = '|O08' - base = np.dtype('O') + kind = "O" # type: str_type + str = "|O08" + base = np.dtype("O") num = 102 - _metadata = ('freq',) + _metadata = ("freq",) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] @@ -833,11 +844,12 @@ def freq(self): @classmethod def _parse_dtype_strict(cls, freq): if isinstance(freq, str): - if freq.startswith('period[') or freq.startswith('Period['): + if freq.startswith("period[") or freq.startswith("Period["): m = cls._match.search(freq) if m is not None: - freq = m.group('freq') + freq = m.group("freq") from pandas.tseries.frequencies import to_offset + freq = to_offset(freq) if freq is not None: return freq @@ -850,10 +862,11 @@ def construct_from_string(cls, string): Strict construction from a string, raise a TypeError if not possible """ - if (isinstance(string, str) and - (string.startswith('period[') or - string.startswith('Period[')) or - isinstance(string, ABCDateOffset)): + if ( + isinstance(string, str) + and (string.startswith("period[") or string.startswith("Period[")) + or isinstance(string, ABCDateOffset) + ): # do not parse string like U as period[U] # avoid tuple to be regarded as freq try: @@ -887,7 +900,7 @@ def __setstate__(self, state): # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._freq = state['freq'] + self._freq = state["freq"] @classmethod def is_dtype(cls, dtype): @@ -899,7 +912,7 @@ def is_dtype(cls, dtype): if isinstance(dtype, str): # PeriodDtype can be instantiated from freq string like "U", # but doesn't regard freq str like "U" as dtype. - if dtype.startswith('period[') or dtype.startswith('Period['): + if dtype.startswith("period[") or dtype.startswith("Period["): try: if cls._parse_dtype_strict(dtype) is not None: return True @@ -943,18 +956,22 @@ class IntervalDtype(PandasExtensionDtype): >>> pd.IntervalDtype(subtype='int64') interval[int64] """ - name = 'interval' + + name = "interval" kind = None # type: Optional[str_type] - str = '|O08' - base = np.dtype('O') + str = "|O08" + base = np.dtype("O") num = 103 - _metadata = ('subtype',) + _metadata = ("subtype",) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") _cache = {} # type: Dict[str_type, PandasExtensionDtype] def __new__(cls, subtype=None): from pandas.core.dtypes.common import ( - is_categorical_dtype, is_string_dtype, pandas_dtype) + is_categorical_dtype, + is_string_dtype, + pandas_dtype, + ) if isinstance(subtype, IntervalDtype): return subtype @@ -964,14 +981,13 @@ def __new__(cls, subtype=None): u = object.__new__(cls) u._subtype = None return u - elif (isinstance(subtype, str) and - subtype.lower() == 'interval'): + elif isinstance(subtype, str) and subtype.lower() == "interval": subtype = None else: if isinstance(subtype, str): m = cls._match.search(subtype) if m is not None: - subtype = m.group('subtype') + subtype = m.group("subtype") try: subtype = pandas_dtype(subtype) @@ -980,8 +996,10 @@ def __new__(cls, subtype=None): if is_categorical_dtype(subtype) or is_string_dtype(subtype): # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalDtype') + msg = ( + "category, object, and string subtypes are not supported " + "for IntervalDtype" + ) raise TypeError(msg) try: @@ -1009,6 +1027,7 @@ def construct_array_type(cls): type """ from pandas.core.arrays import IntervalArray + return IntervalArray @classmethod @@ -1021,13 +1040,14 @@ def construct_from_string(cls, string): msg = "a string needs to be passed, got type {typ}" raise TypeError(msg.format(typ=type(string))) - if (string.lower() == 'interval' or - cls._match.search(string) is not None): + if string.lower() == "interval" or cls._match.search(string) is not None: return cls(string) - msg = ('Incorrectly formatted string passed to constructor. ' - 'Valid formats include Interval or Interval[dtype] ' - 'where dtype is numeric, datetime, or timedelta') + msg = ( + "Incorrectly formatted string passed to constructor. " + "Valid formats include Interval or Interval[dtype] " + "where dtype is numeric, datetime, or timedelta" + ) raise TypeError(msg) @property @@ -1053,13 +1073,14 @@ def __eq__(self, other): return True else: from pandas.core.dtypes.common import is_dtype_equal + return is_dtype_equal(self.subtype, other.subtype) def __setstate__(self, state): # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) - self._subtype = state['subtype'] + self._subtype = state["subtype"] @classmethod def is_dtype(cls, dtype): @@ -1069,7 +1090,7 @@ def is_dtype(cls, dtype): """ if isinstance(dtype, str): - if dtype.lower().startswith('interval'): + if dtype.lower().startswith("interval"): try: if cls.construct_from_string(dtype) is not None: return True diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 86aff93dfde14..de41644f09b66 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -6,76 +6,79 @@ def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): - return getattr(inst, attr, '_typ') in comp + return getattr(inst, attr, "_typ") in comp dct = dict(__instancecheck__=_check, __subclasscheck__=_check) - meta = type("ABCBase", (type, ), dct) + meta = type("ABCBase", (type,), dct) return meta(name, tuple(), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) -ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", - ("int64index", )) -ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", - ("uint64index", )) -ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", - ("rangeindex", )) -ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", - ("float64index", )) -ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", - ("multiindex", )) -ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", - ("datetimeindex", )) -ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", - ("timedeltaindex", )) -ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", - ("periodindex", )) -ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", - ("categoricalindex", )) -ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ", - ("intervalindex", )) -ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", - ("index", "int64index", "rangeindex", - "float64index", "uint64index", - "multiindex", "datetimeindex", - "timedeltaindex", "periodindex", - "categoricalindex", "intervalindex")) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type( + "ABCDatetimeIndex", "_typ", ("datetimeindex",) +) +ABCTimedeltaIndex = create_pandas_abc_type( + "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) +) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type( + "ABCCategoricalIndex", "_typ", ("categoricalindex",) +) +ABCIntervalIndex = create_pandas_abc_type( + "ABCIntervalIndex", "_typ", ("intervalindex",) +) +ABCIndexClass = create_pandas_abc_type( + "ABCIndexClass", + "_typ", + ( + "index", + "int64index", + "rangeindex", + "float64index", + "uint64index", + "multiindex", + "datetimeindex", + "timedeltaindex", + "periodindex", + "categoricalindex", + "intervalindex", + ), +) -ABCSeries = 
create_pandas_abc_type("ABCSeries", "_typ", ("series", )) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) -ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp", - ("sparse_frame", )) -ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", - ('sparse_series', - 'sparse_time_series')) -ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", - ('sparse_array', 'sparse_series')) -ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", - ("categorical")) -ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", - ("datetimearray")) -ABCTimedeltaArray = create_pandas_abc_type("ABCTimedeltaArray", "_typ", - ("timedeltaarray")) -ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", - ("periodarray", )) -ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", )) -ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", - ("dateoffset",)) -ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", )) -ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ", - ("extension", - "categorical", - "periodarray", - "datetimearray", - "timedeltaarray", - )) -ABCPandasArray = create_pandas_abc_type("ABCPandasArray", - "_typ", - ("npy_extension",)) +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +ABCSparseDataFrame = create_pandas_abc_type( + "ABCSparseDataFrame", "_subtyp", ("sparse_frame",) +) +ABCSparseSeries = create_pandas_abc_type( + "ABCSparseSeries", "_subtyp", ("sparse_series", "sparse_time_series") +) +ABCSparseArray = create_pandas_abc_type( + "ABCSparseArray", "_subtyp", ("sparse_array", "sparse_series") +) +ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) +ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) +ABCTimedeltaArray = create_pandas_abc_type( + "ABCTimedeltaArray", "_typ", ("timedeltaarray") +) +ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)) +ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) +ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",)) +ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",)) +ABCExtensionArray = create_pandas_abc_type( + "ABCExtensionArray", + "_typ", + ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), +) +ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) class _ABCGeneric(type): - def __instancecheck__(cls, inst): return hasattr(inst, "_data") diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 02ee777bbe7f3..9373ea18e8a24 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -143,10 +143,10 @@ def is_iterator(obj): False """ - if not hasattr(obj, '__iter__'): + if not hasattr(obj, "__iter__"): return False - return hasattr(obj, '__next__') + return hasattr(obj, "__next__") def is_file_like(obj): @@ -180,7 +180,7 @@ def is_file_like(obj): False """ - if not (hasattr(obj, 'read') or hasattr(obj, 'write')): + if not (hasattr(obj, "read") or hasattr(obj, "write")): return False if not hasattr(obj, "__iter__"): @@ -281,15 +281,18 @@ def is_list_like(obj, allow_sets=True): False """ - return (isinstance(obj, abc.Iterable) and - # we do not count strings/unicode/bytes as list-like - not isinstance(obj, 
(str, bytes)) and - - # exclude zero-dimensional numpy arrays, effectively scalars - not (isinstance(obj, np.ndarray) and obj.ndim == 0) and - - # exclude sets if allow_sets is False - not (allow_sets is False and isinstance(obj, abc.Set))) + return ( + isinstance(obj, abc.Iterable) + and + # we do not count strings/unicode/bytes as list-like + not isinstance(obj, (str, bytes)) + and + # exclude zero-dimensional numpy arrays, effectively scalars + not (isinstance(obj, np.ndarray) and obj.ndim == 0) + and + # exclude sets if allow_sets is False + not (allow_sets is False and isinstance(obj, abc.Set)) + ) def is_array_like(obj): @@ -365,8 +368,12 @@ def is_nested_list_like(obj): -------- is_list_like """ - return (is_list_like(obj) and hasattr(obj, '__len__') and - len(obj) > 0 and all(is_list_like(item) for item in obj)) + return ( + is_list_like(obj) + and hasattr(obj, "__len__") + and len(obj) > 0 + and all(is_list_like(item) for item in obj) + ) def is_dict_like(obj): @@ -394,9 +401,11 @@ def is_dict_like(obj): True """ dict_like_attrs = ("__getitem__", "keys", "__contains__") - return (all(hasattr(obj, attr) for attr in dict_like_attrs) - # [GH 25196] exclude classes - and not isinstance(obj, type)) + return ( + all(hasattr(obj, attr) for attr in dict_like_attrs) + # [GH 25196] exclude classes + and not isinstance(obj, type) + ) def is_named_tuple(obj): @@ -423,7 +432,7 @@ def is_named_tuple(obj): False """ - return isinstance(obj, tuple) and hasattr(obj, '_fields') + return isinstance(obj, tuple) and hasattr(obj, "_fields") def is_hashable(obj): @@ -489,7 +498,7 @@ def is_sequence(obj): try: iter(obj) # Can iterate over it. - len(obj) # Has a length associated with it. + len(obj) # Has a length associated with it. return not isinstance(obj, (str, bytes)) except (TypeError, AttributeError): return False diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 914a292d3db97..f540e9297738a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -8,15 +8,37 @@ from pandas._libs.tslibs import NaT, iNaT from .common import ( - _NS_DTYPE, _TD_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_object_dtype, is_period_dtype, - is_scalar, is_string_dtype, is_string_like_dtype, is_timedelta64_dtype, - needs_i8_conversion, pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_object, + is_bool_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_datetimelike_v_numeric, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_period_dtype, + is_scalar, + is_string_dtype, + is_string_like_dtype, + is_timedelta64_dtype, + needs_i8_conversion, + pandas_dtype, +) from .generic import ( - ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass, - ABCMultiIndex, ABCSeries, ABCTimedeltaArray) + ABCDatetimeArray, + ABCExtensionArray, + ABCGeneric, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCTimedeltaArray, +) from .inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar @@ -109,15 +131,23 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") - elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, - ABCExtensionArray, - ABCDatetimeArray, 
ABCTimedeltaArray)): + elif isinstance( + obj, + ( + ABCSeries, + np.ndarray, + ABCIndexClass, + ABCExtensionArray, + ABCDatetimeArray, + ABCTimedeltaArray, + ), + ): return _isna_ndarraylike(obj) elif isinstance(obj, ABCGeneric): return obj._constructor(obj._data.isna(func=isna)) elif isinstance(obj, list): return _isna_ndarraylike(np.asarray(obj, dtype=object)) - elif hasattr(obj, '__array__'): + elif hasattr(obj, "__array__"): return _isna_ndarraylike(np.asarray(obj)) else: return obj is None @@ -145,7 +175,7 @@ def _isna_old(obj): return obj._constructor(obj._data.isna(func=_isna_old)) elif isinstance(obj, list): return _isna_ndarraylike_old(np.asarray(obj, dtype=object)) - elif hasattr(obj, '__array__'): + elif hasattr(obj, "__array__"): return _isna_ndarraylike_old(np.asarray(obj)) else: return obj is None @@ -174,11 +204,12 @@ def _use_inf_as_na(key): programmatically-creating-variables-in-python/4859312#4859312 """ from pandas._config import get_option + flag = get_option(key) if flag: - globals()['_isna'] = _isna_old + globals()["_isna"] = _isna_old else: - globals()['_isna'] = _isna_new + globals()["_isna"] = _isna_new def _isna_ndarraylike(obj): @@ -187,7 +218,7 @@ def _isna_ndarraylike(obj): if not is_extension: # Avoid accessing `.values` on things like # PeriodIndex, which may be expensive. - values = getattr(obj, 'values', obj) + values = getattr(obj, "values", obj) else: values = obj @@ -216,20 +247,19 @@ def _isna_ndarraylike(obj): elif needs_i8_conversion(dtype): # this is the NaT pattern - result = values.view('i8') == iNaT + result = values.view("i8") == iNaT else: result = np.isnan(values) # box if isinstance(obj, ABCSeries): - result = obj._constructor( - result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) return result def _isna_ndarraylike_old(obj): - values = getattr(obj, 'values', obj) + values = getattr(obj, "values", obj) dtype = values.dtype if is_string_dtype(dtype): @@ -245,14 +275,13 @@ def _isna_ndarraylike_old(obj): elif is_datetime64_dtype(dtype): # this is the NaT pattern - result = values.view('i8') == iNaT + result = values.view("i8") == iNaT else: result = ~np.isfinite(values) # box if isinstance(obj, ABCSeries): - result = obj._constructor( - result, index=obj.index, name=obj.name, copy=False) + result = obj._constructor(result, index=obj.index, name=obj.name, copy=False) return result @@ -353,8 +382,7 @@ def _isna_compat(arr, fill_value=np.nan): """ dtype = arr.dtype if isna(fill_value): - return not (is_bool_dtype(dtype) or - is_integer_dtype(dtype)) + return not (is_bool_dtype(dtype) or is_integer_dtype(dtype)) return True @@ -402,15 +430,15 @@ def array_equivalent(left, right, strict_nan=False): if not strict_nan: # isna considers NaN and None to be equivalent. 
return lib.array_equivalent_object( - ensure_object(left.ravel()), ensure_object(right.ravel())) + ensure_object(left.ravel()), ensure_object(right.ravel()) + ) for left_value, right_value in zip(left, right): if left_value is NaT and right_value is not NaT: return False elif isinstance(left_value, float) and np.isnan(left_value): - if (not isinstance(right_value, float) or - not np.isnan(right_value)): + if not isinstance(right_value, float) or not np.isnan(right_value): return False else: if left_value != right_value: @@ -434,12 +462,11 @@ def array_equivalent(left, right, strict_nan=False): if not is_dtype_equal(left.dtype, right.dtype): return False - left = left.view('i8') - right = right.view('i8') + left = left.view("i8") + right = right.view("i8") # if we have structured dtypes, compare first - if (left.dtype.type is np.void or - right.dtype.type is np.void): + if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False @@ -457,13 +484,13 @@ def _infer_fill_value(val): val = [val] val = np.array(val, copy=False) if is_datetimelike(val): - return np.array('NaT', dtype=val.dtype) + return np.array("NaT", dtype=val.dtype) elif is_object_dtype(val.dtype): dtype = lib.infer_dtype(ensure_object(val), skipna=False) - if dtype in ['datetime', 'datetime64']: - return np.array('NaT', dtype=_NS_DTYPE) - elif dtype in ['timedelta', 'timedelta64']: - return np.array('NaT', dtype=_TD_DTYPE) + if dtype in ["datetime", "datetime64"]: + return np.array("NaT", dtype=_NS_DTYPE) + elif dtype in ["timedelta", "timedelta64"]: + return np.array("NaT", dtype=_TD_DTYPE) return np.nan @@ -506,8 +533,12 @@ def na_value_for_dtype(dtype, compat=True): if is_extension_array_dtype(dtype): return dtype.na_value - if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_period_dtype(dtype) + ): return NaT elif is_float_dtype(dtype): return np.nan diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0dba7c7b5d288..a1989fd62b6ee 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -27,44 +27,89 @@ from pandas.compat import PY36, raise_with_traceback from pandas.compat.numpy import function as nv from pandas.util._decorators import ( - Appender, Substitution, deprecate_kwarg, rewrite_axis_style_signature) -from pandas.util._validators import ( - validate_axis_style_args, validate_bool_kwarg) + Appender, + Substitution, + deprecate_kwarg, + rewrite_axis_style_signature, +) +from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg from pandas.core.dtypes.cast import ( - cast_scalar_to_array, coerce_to_dtypes, find_common_type, - infer_dtype_from_scalar, invalidate_string_dtypes, maybe_cast_to_datetime, - maybe_convert_platform, maybe_downcast_to_dtype, - maybe_infer_to_datetimelike, maybe_upcast, maybe_upcast_putmask) + cast_scalar_to_array, + coerce_to_dtypes, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_cast_to_datetime, + maybe_convert_platform, + maybe_downcast_to_dtype, + maybe_infer_to_datetimelike, + maybe_upcast, + maybe_upcast_putmask, +) from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_platform_int, infer_dtype_from_object, - is_bool_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_dict_like, is_dtype_equal, is_extension_array_dtype, is_extension_type, - 
is_float_dtype, is_integer, is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar, - is_sequence, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_platform_int, + infer_dtype_from_object, + is_bool_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_named_tuple, + is_nested_list_like, + is_object_dtype, + is_scalar, + is_sequence, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray -from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin as DatetimeLikeArray) +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import ( - Index, MultiIndex, ensure_index, ensure_index_from_sequences) + Index, + MultiIndex, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import ( - check_bool_indexer, convert_to_index_sliceable, maybe_droplevels) + check_bool_indexer, + convert_to_index_sliceable, + maybe_droplevels, +) from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( - arrays_to_mgr, get_names_from_index, init_dict, init_ndarray, - masked_rec_array_to_mgr, reorder_arrays, sanitize_index, to_arrays) + arrays_to_mgr, + get_names_from_index, + init_dict, + init_ndarray, + masked_rec_array_to_mgr, + reorder_arrays, + sanitize_index, + to_arrays, +) from pandas.core.series import Series from pandas.io.formats import console, format as fmt @@ -75,7 +120,8 @@ # Docstring templates _shared_doc_kwargs = dict( - axes='index, columns', klass='DataFrame', + axes="index, columns", + klass="DataFrame", axes_single_arg="{0 or 'index', 1 or 'columns'}", axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. @@ -91,7 +137,7 @@ .. 
versionchanged:: 0.23.0 Allow specifying index or column level names.""", - versionadded_to_excel='', + versionadded_to_excel="", optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", optional_axis="""axis : int or str, optional @@ -330,9 +376,9 @@ def _constructor(self): return DataFrame _constructor_sliced = Series # type: Type[Series] - _deprecations = NDFrame._deprecations | frozenset([ - 'get_value', 'set_value', 'from_items' - ]) # type: FrozenSet[str] + _deprecations = NDFrame._deprecations | frozenset( + ["get_value", "set_value", "from_items"] + ) # type: FrozenSet[str] _accessors = set() # type: Set[str] @property @@ -342,8 +388,7 @@ def _constructor_expanddim(self): # ---------------------------------------------------------------------- # Constructors - def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False): + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: data = {} if dtype is not None: @@ -353,16 +398,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = data._data if isinstance(data, BlockManager): - mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif isinstance(data, dict): mgr = init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords + # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = masked_rec_array_to_mgr(data, index, columns, dtype, - copy) + mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) # a masked array else: @@ -373,8 +419,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data[mask] = fill_value else: data = data.copy() - mgr = init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -383,20 +428,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if columns is None: columns = data_columns mgr = init_dict(data, index, columns, dtype=dtype) - elif getattr(data, 'name', None) is not None: - mgr = init_dict({data.name: data}, index, columns, - dtype=dtype) + elif getattr(data, "name", None) is not None: + mgr = init_dict({data.name: data}, index, columns, dtype=dtype) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) # For data is list-like, or Iterable (will consume into list) - elif (isinstance(data, abc.Iterable) and - not isinstance(data, (str, bytes))): + elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): if not isinstance(data, abc.Sequence): data = list(data) if len(data) > 0: - if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: + if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = to_arrays(data, columns, dtype=dtype) @@ -411,28 +453,30 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, else: index = ibase.default_index(len(data)) - mgr = arrays_to_mgr(arrays, columns, index, columns, - dtype=dtype) + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: - mgr = init_ndarray(data, index, columns, 
dtype=dtype, - copy=copy) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: mgr = init_dict({}, index, columns, dtype=dtype) else: try: arr = np.array(data, dtype=dtype, copy=copy) except (ValueError, TypeError) as e: - exc = TypeError('DataFrame constructor called with ' - 'incompatible data and dtype: {e}'.format(e=e)) + exc = TypeError( + "DataFrame constructor called with " + "incompatible data and dtype: {e}".format(e=e) + ) raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: - values = cast_scalar_to_array((len(index), len(columns)), - data, dtype=dtype) - mgr = init_ndarray(values, index, columns, - dtype=values.dtype, copy=False) + values = cast_scalar_to_array( + (len(index), len(columns)), data, dtype=dtype + ) + mgr = init_ndarray( + values, index, columns, dtype=values.dtype, copy=False + ) else: - raise ValueError('DataFrame constructor not properly called!') + raise ValueError("DataFrame constructor not properly called!") NDFrame.__init__(self, mgr, fastpath=True) @@ -533,8 +577,9 @@ def _repr_fits_horizontal_(self, ignore_width=False): nb_columns = len(self.columns) # exceed max columns - if ((max_columns and nb_columns > max_columns) or - ((not ignore_width) and width and nb_columns > (width // 2))): + if (max_columns and nb_columns > max_columns) or ( + (not ignore_width) and width and nb_columns > (width // 2) + ): return False # used by repr_html under IPython notebook or scripts ignore terminal @@ -542,8 +587,7 @@ def _repr_fits_horizontal_(self, ignore_width=False): if ignore_width or not console.in_interactive_session(): return True - if (get_option('display.width') is not None or - console.in_ipython_frontend()): + if get_option("display.width") is not None or console.in_ipython_frontend(): # check at least the column row for excessive width max_rows = 1 else: @@ -560,13 +604,13 @@ def _repr_fits_horizontal_(self, ignore_width=False): if not (max_rows is None): # unlimited rows # min of two, where one may be None - d = d.iloc[:min(max_rows, len(d))] + d = d.iloc[: min(max_rows, len(d))] else: return True d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(l) for l in value.split('\n')) + repr_width = max(len(l) for l in value.split("\n")) return repr_width < width @@ -574,9 +618,10 @@ def _info_repr(self): """ True if the repr should show the info view. """ - info_repr_option = (get_option("display.large_repr") == "info") - return info_repr_option and not (self._repr_fits_horizontal_() and - self._repr_fits_vertical_()) + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) def __repr__(self): """ @@ -595,9 +640,14 @@ def __repr__(self): width, _ = console.get_console_size() else: width = None - self.to_string(buf=buf, max_rows=max_rows, min_rows=min_rows, - max_cols=max_cols, line_width=width, - show_dimensions=show_dimensions) + self.to_string( + buf=buf, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + line_width=width, + show_dimensions=show_dimensions, + ) return buf.getvalue() @@ -611,32 +661,52 @@ def _repr_html_(self): buf = StringIO("") self.info(buf=buf) # need to escape the , should be the first line. - val = buf.getvalue().replace('<', r'<', 1) - val = val.replace('>', r'>', 1) - return '
<pre>' + val + '</pre>
' + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return "
" + val + "
" if get_option("display.notebook_repr_html"): max_rows = get_option("display.max_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") - return self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions, notebook=True) + return self.to_html( + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + notebook=True, + ) else: return None - @Substitution(header='Write out the column names. If a list of strings ' - 'is given, it is assumed to be aliases for the ' - 'column names', - col_space_type='int', - col_space='The minimum width of each column') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, - max_rows=None, min_rows=None, max_cols=None, - show_dimensions=False, decimal='.', line_width=None): + @Substitution( + header="Write out the column names. If a list of strings " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int", + col_space="The minimum width of each column", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + min_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + line_width=None, + ): """ Render a DataFrame to a console-friendly tabular output. %(shared_params)s @@ -658,19 +728,26 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, 2 3 6 """ - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - min_rows=min_rows, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, - line_width=line_width) + formatter = fmt.DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + ) formatter.to_string() if buf is None: @@ -690,6 +767,7 @@ def style(self): io.formats.style.Styler """ from pandas.io.formats.style import Styler + return Styler(self) def iteritems(self): @@ -740,7 +818,7 @@ def iteritems(self): koala 80000 Name: population, dtype: int64 """ - if self.columns.is_unique and hasattr(self, '_item_cache'): + if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) else: @@ -966,9 +1044,8 @@ def dot(self, other): """ if isinstance(other, (Series, DataFrame)): common = self.columns.union(other.index) - if (len(common) > len(self.columns) or - len(common) > len(other.index)): - raise ValueError('matrices are not aligned') + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") left = self.reindex(columns=common, copy=False) right = 
other.reindex(index=common, copy=False) @@ -979,13 +1056,15 @@ def dot(self, other): lvals = self.values rvals = np.asarray(other) if lvals.shape[1] != rvals.shape[0]: - raise ValueError('Dot product shape mismatch, ' - '{s} vs {r}'.format(s=lvals.shape, - r=rvals.shape)) + raise ValueError( + "Dot product shape mismatch, " + "{s} vs {r}".format(s=lvals.shape, r=rvals.shape) + ) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), index=left.index, - columns=other.columns) + return self._constructor( + np.dot(lvals, rvals), index=left.index, columns=other.columns + ) elif isinstance(other, Series): return Series(np.dot(lvals, rvals), index=left.index) elif isinstance(rvals, (np.ndarray, Index)): @@ -995,7 +1074,7 @@ def dot(self, other): else: return Series(result, index=left.index) else: # pragma: no cover - raise TypeError('unsupported type: {oth}'.format(oth=type(other))) + raise TypeError("unsupported type: {oth}".format(oth=type(other))) def __matmul__(self, other): """ @@ -1013,7 +1092,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient='columns', dtype=None, columns=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None): """ Construct DataFrame from dict of array-like or dicts. @@ -1078,19 +1157,20 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): """ index = None orient = orient.lower() - if orient == 'index': + if orient == "index": if len(data) > 0: # TODO speed up Series case if isinstance(list(data.values())[0], (Series, dict)): data = _from_nested_dict(data) else: data, index = list(data.values()), list(data.keys()) - elif orient == 'columns': + elif orient == "columns": if columns is not None: - raise ValueError("cannot use columns parameter with " - "orient='columns'") + raise ValueError( + "cannot use columns parameter with " "orient='columns'" + ) else: # pragma: no cover - raise ValueError('only recognize index or columns for orient') + raise ValueError("only recognize index or columns for orient") return cls(data, index=index, columns=columns, dtype=dtype) @@ -1149,7 +1229,7 @@ def to_numpy(self, dtype=None, copy=False): result = np.array(self.values, dtype=dtype, copy=copy) return result - def to_dict(self, orient='dict', into=dict): + def to_dict(self, orient="dict", into=dict): """ Convert the DataFrame to a dictionary. 
@@ -1239,48 +1319,68 @@ def to_dict(self, orient='dict', into=dict): defaultdict(, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: - warnings.warn("DataFrame columns are not unique, some " - "columns will be omitted.", UserWarning, - stacklevel=2) + warnings.warn( + "DataFrame columns are not unique, some " "columns will be omitted.", + UserWarning, + stacklevel=2, + ) # GH16122 into_c = com.standardize_mapping(into) - if orient.lower().startswith('d'): - return into_c( - (k, v.to_dict(into)) for k, v in self.items()) - elif orient.lower().startswith('l'): + if orient.lower().startswith("d"): + return into_c((k, v.to_dict(into)) for k, v in self.items()) + elif orient.lower().startswith("l"): return into_c((k, v.tolist()) for k, v in self.items()) - elif orient.lower().startswith('sp'): - return into_c((('index', self.index.tolist()), - ('columns', self.columns.tolist()), - ('data', [ - list(map(com.maybe_box_datetimelike, t)) - for t in self.itertuples(index=False, name=None) - ]))) - elif orient.lower().startswith('s'): - return into_c((k, com.maybe_box_datetimelike(v)) - for k, v in self.items()) - elif orient.lower().startswith('r'): + elif orient.lower().startswith("sp"): + return into_c( + ( + ("index", self.index.tolist()), + ("columns", self.columns.tolist()), + ( + "data", + [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False, name=None) + ], + ), + ) + ) + elif orient.lower().startswith("s"): + return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) + elif orient.lower().startswith("r"): columns = self.columns.tolist() - rows = (dict(zip(columns, row)) - for row in self.itertuples(index=False, name=None)) + rows = ( + dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None) + ) return [ - into_c((k, com.maybe_box_datetimelike(v)) - for k, v in row.items()) - for row in rows] - elif orient.lower().startswith('i'): + into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) + for row in rows + ] + elif orient.lower().startswith("i"): if not self.index.is_unique: - raise ValueError( - "DataFrame index must be unique for orient='index'." - ) - return into_c((t[0], dict(zip(self.columns, t[1:]))) - for t in self.itertuples(name=None)) + raise ValueError("DataFrame index must be unique for orient='index'.") + return into_c( + (t[0], dict(zip(self.columns, t[1:]))) + for t in self.itertuples(name=None) + ) else: raise ValueError("orient '{o}' not understood".format(o=orient)) - def to_gbq(self, destination_table, project_id=None, chunksize=None, - reauth=False, if_exists='fail', auth_local_webserver=False, - table_schema=None, location=None, progress_bar=True, - credentials=None, verbose=None, private_key=None): + def to_gbq( + self, + destination_table, + project_id=None, + chunksize=None, + reauth=False, + if_exists="fail", + auth_local_webserver=False, + table_schema=None, + location=None, + progress_bar=True, + credentials=None, + verbose=None, + private_key=None, + ): """ Write a DataFrame to a Google BigQuery table. @@ -1376,16 +1476,33 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, read_gbq : Read a DataFrame from Google BigQuery. 
""" from pandas.io import gbq - gbq.to_gbq(self, destination_table, project_id=project_id, - chunksize=chunksize, reauth=reauth, if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, location=location, - progress_bar=progress_bar, credentials=credentials, - verbose=verbose, private_key=private_key) + + gbq.to_gbq( + self, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, + private_key=private_key, + ) @classmethod - def from_records(cls, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + ): """ Convert structured or record ndarray to DataFrame. @@ -1428,7 +1545,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(index=index, columns=columns) dtype = None - if hasattr(first_row, 'dtype') and first_row.dtype.names: + if hasattr(first_row, "dtype") and first_row.dtype.names: dtype = first_row.dtype values = [first_row] @@ -1455,8 +1572,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns, - columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) @@ -1464,8 +1580,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, columns = ensure_index(columns) arr_columns = columns else: - arrays, arr_columns = to_arrays(data, columns, - coerce_float=coerce_float) + arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float) arr_columns = ensure_index(arr_columns) if columns is not None: @@ -1480,8 +1595,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, result_index = None if index is not None: - if (isinstance(index, str) or - not hasattr(index, "__iter__")): + if isinstance(index, str) or not hasattr(index, "__iter__"): i = columns.get_loc(index) exclude.add(index) if len(arrays) > 0: @@ -1490,10 +1604,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, result_index = Index([], name=index) else: try: - index_data = [arrays[arr_columns.get_loc(field)] - for field in index] - result_index = ensure_index_from_sequences(index_data, - names=index) + index_data = [arrays[arr_columns.get_loc(field)] for field in index] + result_index = ensure_index_from_sequences(index_data, names=index) exclude.update(index) except Exception: @@ -1511,8 +1623,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None, return cls(mgr) - def to_records(self, index=True, convert_datetime64=None, - column_dtypes=None, index_dtypes=None): + def to_records( + self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None + ): """ Convert DataFrame to a NumPy record array. 
@@ -1604,10 +1717,13 @@ def to_records(self, index=True, convert_datetime64=None, """ if convert_datetime64 is not None: - warnings.warn("The 'convert_datetime64' parameter is " - "deprecated and will be removed in a future " - "version", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'convert_datetime64' parameter is " + "deprecated and will be removed in a future " + "version", + FutureWarning, + stacklevel=2, + ) if index: if is_datetime64_any_dtype(self.index) and convert_datetime64: @@ -1619,8 +1735,7 @@ def to_records(self, index=True, convert_datetime64=None, else: ix_vals = [self.index.values] - arrays = ix_vals + [self[c]._internal_get_values() - for c in self.columns] + arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns] count = 0 index_names = list(self.index.names) @@ -1628,13 +1743,12 @@ def to_records(self, index=True, convert_datetime64=None, if isinstance(self.index, MultiIndex): for i, n in enumerate(index_names): if n is None: - index_names[i] = 'level_%d' % count + index_names[i] = "level_%d" % count count += 1 elif index_names[0] is None: - index_names = ['index'] + index_names = ["index"] - names = [str(name) for name in itertools.chain(index_names, - self.columns)] + names = [str(name) for name in itertools.chain(index_names, self.columns)] else: arrays = [self[c]._internal_get_values() for c in self.columns] names = [str(c) for c in self.columns] @@ -1687,18 +1801,15 @@ def to_records(self, index=True, convert_datetime64=None, formats.append(dtype_mapping) else: element = "row" if i < index_len else "column" - msg = ("Invalid dtype {dtype} specified for " - "{element} {name}").format(dtype=dtype_mapping, - element=element, name=name) + msg = ( + "Invalid dtype {dtype} specified for " "{element} {name}" + ).format(dtype=dtype_mapping, element=element, name=name) raise ValueError(msg) - return np.rec.fromarrays( - arrays, - dtype={'names': names, 'formats': formats} - ) + return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def from_items(cls, items, columns=None, orient='columns'): + def from_items(cls, items, columns=None, orient="columns"): """ Construct a DataFrame from a list of tuples. @@ -1730,23 +1841,28 @@ def from_items(cls, items, columns=None, orient='columns'): DataFrame """ - warnings.warn("from_items is deprecated. Please use " - "DataFrame.from_dict(dict(items), ...) instead. " - "DataFrame.from_dict(OrderedDict(items)) may be used to " - "preserve the key order.", - FutureWarning, stacklevel=2) + warnings.warn( + "from_items is deprecated. Please use " + "DataFrame.from_dict(dict(items), ...) instead. 
" + "DataFrame.from_dict(OrderedDict(items)) may be used to " + "preserve the key order.", + FutureWarning, + stacklevel=2, + ) keys, values = zip(*items) - if orient == 'columns': + if orient == "columns": if columns is not None: columns = ensure_index(columns) idict = dict(items) if len(idict) < len(items): if not columns.equals(ensure_index(keys)): - raise ValueError('With non-unique item names, passed ' - 'columns must be identical') + raise ValueError( + "With non-unique item names, passed " + "columns must be identical" + ) arrays = values else: arrays = [idict[k] for k in columns if k in idict] @@ -1761,10 +1877,12 @@ def from_items(cls, items, columns=None, orient='columns'): except ValueError: if not is_nested_list_like(values): - raise ValueError('The value in each (key, value) pair ' - 'must be an array, Series, or dict') + raise ValueError( + "The value in each (key, value) pair " + "must be an array, Series, or dict" + ) - elif orient == 'index': + elif orient == "index": if columns is None: raise TypeError("Must pass columns with orient='index'") @@ -1779,8 +1897,10 @@ def from_items(cls, items, columns=None, orient='columns'): except TypeError: if not is_nested_list_like(values): - raise ValueError('The value in each (key, value) pair ' - 'must be an array, Series, or dict') + raise ValueError( + "The value in each (key, value) pair " + "must be an array, Series, or dict" + ) else: # pragma: no cover raise ValueError("'orient' must be either 'columns' or 'index'") @@ -1790,7 +1910,7 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) - def to_sparse(self, fill_value=None, kind='block'): + def to_sparse(self, fill_value=None, kind="block"): """ Convert to SparseDataFrame. @@ -1846,21 +1966,39 @@ def to_sparse(self, fill_value=None, kind='block'): >>> type(sdf) # doctest: +SKIP """ - warnings.warn("DataFrame.to_sparse is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame.to_sparse is deprecated and will be removed " + "in a future version", + FutureWarning, + stacklevel=2, + ) from pandas.core.sparse.api import SparseDataFrame + with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="SparseDataFrame") - return SparseDataFrame(self._series, index=self.index, - columns=self.columns, default_kind=kind, - default_fill_value=fill_value) + return SparseDataFrame( + self._series, + index=self.index, + columns=self.columns, + default_kind=kind, + default_fill_value=fill_value, + ) - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def to_stata(self, fname, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None, version=114, - convert_strl=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def to_stata( + self, + fname, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + version=114, + convert_strl=None, + ): """ Export DataFrame object to Stata dta format. 
@@ -1943,20 +2081,29 @@ def to_stata(self, fname, convert_dates=None, write_index=True, """ kwargs = {} if version not in (114, 117): - raise ValueError('Only formats 114 and 117 supported.') + raise ValueError("Only formats 114 and 117 supported.") if version == 114: if convert_strl is not None: - raise ValueError('strl support is only available when using ' - 'format 117') + raise ValueError( + "strl support is only available when using " "format 117" + ) from pandas.io.stata import StataWriter as statawriter else: from pandas.io.stata import StataWriter117 as statawriter - kwargs['convert_strl'] = convert_strl - writer = statawriter(fname, self, convert_dates=convert_dates, - byteorder=byteorder, time_stamp=time_stamp, - data_label=data_label, write_index=write_index, - variable_labels=variable_labels, **kwargs) + kwargs["convert_strl"] = convert_strl + + writer = statawriter( + fname, + self, + convert_dates=convert_dates, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + write_index=write_index, + variable_labels=variable_labels, + **kwargs + ) writer.write_file() def to_feather(self, fname): @@ -1971,10 +2118,18 @@ def to_feather(self, fname): string file path """ from pandas.io.feather_format import to_feather + to_feather(self, fname) - def to_parquet(self, fname, engine='auto', compression='snappy', - index=None, partition_cols=None, **kwargs): + def to_parquet( + self, + fname, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs + ): """ Write a DataFrame to the binary parquet format. @@ -2041,24 +2196,51 @@ def to_parquet(self, fname, engine='auto', compression='snappy', 1 2 4 """ from pandas.io.parquet import to_parquet - to_parquet(self, fname, engine, - compression=compression, index=index, - partition_cols=partition_cols, **kwargs) - - @Substitution(header='Whether to print column labels, default True', - col_space_type='str or int', - col_space='The minimum width of each column in CSS length ' - 'units. An int is assumed to be px units.\n\n' - ' .. versionadded:: 0.25.0\n' - ' Ability to use str') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_html(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, max_rows=None, - max_cols=None, show_dimensions=False, decimal='.', - bold_rows=True, classes=None, escape=True, notebook=False, - border=None, table_id=None, render_links=False): + + to_parquet( + self, + fname, + engine, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs + ) + + @Substitution( + header="Whether to print column labels, default True", + col_space_type="str or int", + col_space="The minimum width of each column in CSS length " + "units. An int is assumed to be px units.\n\n" + " .. versionadded:: 0.25.0\n" + " Ability to use str", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_html( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + bold_rows=True, + classes=None, + escape=True, + notebook=False, + border=None, + table_id=None, + render_links=False, + ): """ Render a DataFrame as an HTML table. 
%(shared_params)s @@ -2091,23 +2273,31 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, to_string : Convert DataFrame to a string. """ - if (justify is not None and - justify not in fmt._VALID_JUSTIFY_PARAMETERS): + if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: raise ValueError("Invalid value for justify parameter") - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - bold_rows=bold_rows, escape=escape, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, table_id=table_id, - render_links=render_links) + formatter = fmt.DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + bold_rows=bold_rows, + escape=escape, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + table_id=table_id, + render_links=render_links, + ) # TODO: a generic formatter wld b in DataFrameFormatter formatter.to_html(classes=classes, notebook=notebook, border=border) @@ -2116,8 +2306,9 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, # ---------------------------------------------------------------------- - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, - null_counts=None): + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ): """ Print a concise summary of a DataFrame. 
@@ -2257,7 +2448,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, lines.append(self.index._summary()) if len(self.columns) == 0: - lines.append('Empty {name}'.format(name=type(self).__name__)) + lines.append("Empty {name}".format(name=type(self).__name__)) fmt.buffer_put_lines(buf, lines) return @@ -2265,21 +2456,18 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, # hack if max_cols is None: - max_cols = get_option('display.max_info_columns', - len(self.columns) + 1) + max_cols = get_option("display.max_info_columns", len(self.columns) + 1) - max_rows = get_option('display.max_info_rows', len(self) + 1) + max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = ((len(self.columns) <= max_cols) and - (len(self) < max_rows)) + show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts exceeds_info_cols = len(self.columns) > max_cols def _verbose_repr(): - lines.append('Data columns (total %d columns):' % - len(self.columns)) + lines.append("Data columns (total %d columns):" % len(self.columns)) space = max(len(pprint_thing(k)) for k in self.columns) + 4 counts = None @@ -2288,9 +2476,11 @@ def _verbose_repr(): counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( - 'Columns must equal counts ' - '({cols:d} != {counts:d})'.format( - cols=len(cols), counts=len(counts))) + "Columns must equal counts " + "({cols:d} != {counts:d})".format( + cols=len(cols), counts=len(counts) + ) + ) tmpl = "{count} non-null {dtype}" dtypes = self.dtypes @@ -2302,22 +2492,24 @@ def _verbose_repr(): if show_counts: count = counts.iloc[i] - lines.append(_put_str(col, space) + tmpl.format(count=count, - dtype=dtype)) + lines.append( + _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + ) def _non_verbose_repr(): - lines.append(self.columns._summary(name='Columns')) + lines.append(self.columns._summary(name="Columns")) def _sizeof_fmt(num, size_qualifier): # returns size in human readable format - for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: + for x in ["bytes", "KB", "MB", "GB", "TB"]: if num < 1024.0: - return ("{num:3.1f}{size_q} " - "{x}".format(num=num, size_q=size_qualifier, x=x)) + return "{num:3.1f}{size_q} " "{x}".format( + num=num, size_q=size_qualifier, x=x + ) num /= 1024.0 - return "{num:3.1f}{size_q} {pb}".format(num=num, - size_q=size_qualifier, - pb='PB') + return "{num:3.1f}{size_q} {pb}".format( + num=num, size_q=size_qualifier, pb="PB" + ) if verbose: _verbose_repr() @@ -2330,28 +2522,29 @@ def _sizeof_fmt(num, size_qualifier): _verbose_repr() counts = self._data.get_dtype_counts() - dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k - in sorted(counts.items())] - lines.append('dtypes: {types}'.format(types=', '.join(dtypes))) + dtypes = ["{k}({kk:d})".format(k=k[0], kk=k[1]) for k in sorted(counts.items())] + lines.append("dtypes: {types}".format(types=", ".join(dtypes))) if memory_usage is None: - memory_usage = get_option('display.memory_usage') + memory_usage = get_option("display.memory_usage") if memory_usage: # append memory usage of df to display - size_qualifier = '' - if memory_usage == 'deep': + size_qualifier = "" + if memory_usage == "deep": deep = True else: # size_qualifier is just a best effort; not guaranteed to catch # all cases (e.g., it misses categorical data even with object # categories) deep = False - if ('object' in counts or - self.index._is_memory_usage_qualified()): - 
size_qualifier = '+' + if "object" in counts or self.index._is_memory_usage_qualified(): + size_qualifier = "+" mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append("memory usage: {mem}\n".format( - mem=_sizeof_fmt(mem_usage, size_qualifier))) + lines.append( + "memory usage: {mem}\n".format( + mem=_sizeof_fmt(mem_usage, size_qualifier) + ) + ) fmt.buffer_put_lines(buf, lines) @@ -2439,11 +2632,14 @@ def memory_usage(self, index=True, deep=False): >>> df['object'].astype('category').memory_usage(deep=True) 5216 """ - result = Series([c.memory_usage(index=False, deep=deep) - for col, c in self.iteritems()], index=self.columns) + result = Series( + [c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], + index=self.columns, + ) if index: - result = Series(self.index.memory_usage(deep=deep), - index=['Index']).append(result) + result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append( + result + ) return result def transpose(self, *args, **kwargs): @@ -2566,13 +2762,13 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover (vals, idx, cols), object_state = state index = com._unpickle_array(idx) - dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), - copy=False) + dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), copy=False) if object_state is not None: ovals, _, ocols = object_state - objects = DataFrame(ovals, index=index, - columns=com._unpickle_array(ocols), copy=False) + objects = DataFrame( + ovals, index=index, columns=com._unpickle_array(ocols), copy=False + ) dm = dm.join(objects) @@ -2599,10 +2795,13 @@ def get_value(self, index, col, takeable=False): scalar """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(index, col, takeable=takeable) def _get_value(self, index, col, takeable=False): @@ -2629,6 +2828,7 @@ def _get_value(self, index, col, takeable=False): col = self.columns.get_loc(col) index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) + _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -2651,10 +2851,13 @@ def set_value(self, index, col, value, takeable=False): If label pair is contained, will be reference to calling DataFrame, otherwise a new object. """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. 
Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(index, col, value, takeable=takeable) def _set_value(self, index, col, value, takeable=False): @@ -2677,6 +2880,7 @@ def _set_value(self, index, col, value, takeable=False): self._item_cache.pop(col, None) return self + _set_value.__doc__ = set_value.__doc__ def _ixs(self, i, axis=0): @@ -2706,12 +2910,15 @@ def _ixs(self, i, axis=0): return new_values # if we are a copy, mark as such - copy = (isinstance(new_values, np.ndarray) and - new_values.base is None) - result = self._constructor_sliced(new_values, - index=self.columns, - name=self.index[i], - dtype=new_values.dtype) + copy = ( + isinstance(new_values, np.ndarray) and new_values.base is None + ) + result = self._constructor_sliced( + new_values, + index=self.columns, + name=self.index[i], + dtype=new_values.dtype, + ) result._set_is_copy(self, copy=copy) return result @@ -2784,8 +2991,7 @@ def __getitem__(self, key): else: if is_iterator(key): key = list(key) - indexer = self.loc._convert_to_indexer(key, axis=1, - raise_missing=True) + indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True) # take() does not accept boolean indexers if getattr(indexer, "dtype", None) == bool: @@ -2810,11 +3016,15 @@ def _getitem_bool_array(self, key): # go with the __setitem__ behavior since that is more consistent # with all other indexing behavior if isinstance(key, Series) and not key.index.equals(self.index): - warnings.warn("Boolean Series key will be reindexed to match " - "DataFrame index.", UserWarning, stacklevel=3) + warnings.warn( + "Boolean Series key will be reindexed to match " "DataFrame index.", + UserWarning, + stacklevel=3, + ) elif len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d.' % - (len(key), len(self.index))) + raise ValueError( + "Item wrong length %d instead of %d." 
% (len(key), len(self.index)) + ) # check_bool_indexer will throw exception if Series key cannot # be reindexed to match DataFrame rows @@ -2832,8 +3042,9 @@ def _getitem_multilevel(self, key): result.columns = result_columns else: new_values = self.values[:, loc] - result = self._constructor(new_values, index=self.index, - columns=result_columns) + result = self._constructor( + new_values, index=self.index, columns=result_columns + ) result = result.__finalize__(self) # If there is only one column being returned, and its name is @@ -2846,12 +3057,12 @@ def _getitem_multilevel(self, key): top = result.columns[0] if isinstance(top, tuple): top = top[0] - if top == '': - result = result[''] + if top == "": + result = result[""] if isinstance(result, Series): - result = self._constructor_sliced(result, - index=self.index, - name=key) + result = self._constructor_sliced( + result, index=self.index, name=key + ) result._set_is_copy(self) return result @@ -2860,7 +3071,7 @@ def _getitem_multilevel(self, key): def _getitem_frame(self, key): if key.values.size and not is_bool_dtype(key.values): - raise ValueError('Must pass DataFrame with boolean values only') + raise ValueError("Must pass DataFrame with boolean values only") return self.where(key) def query(self, expr, inplace=False, **kwargs): @@ -2972,12 +3183,12 @@ def query(self, expr, inplace=False, **kwargs): A B C C 0 1 10 10 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(expr, str): msg = "expr must be a string to be evaluated, {0} given" raise ValueError(msg.format(type(expr))) - kwargs['level'] = kwargs.pop('level', 0) + 1 - kwargs['target'] = None + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None res = self.eval(expr, **kwargs) try: @@ -3084,17 +3295,16 @@ def eval(self, expr, inplace=False, **kwargs): """ from pandas.core.computation.eval import eval as _eval - inplace = validate_bool_kwarg(inplace, 'inplace') - resolvers = kwargs.pop('resolvers', None) - kwargs['level'] = kwargs.pop('level', 0) + 1 + inplace = validate_bool_kwarg(inplace, "inplace") + resolvers = kwargs.pop("resolvers", None) + kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = \ - self._get_space_character_free_column_resolvers() + column_resolvers = self._get_space_character_free_column_resolvers() resolvers = column_resolvers, index_resolvers - if 'target' not in kwargs: - kwargs['target'] = self - kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) + if "target" not in kwargs: + kwargs["target"] = self + kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): @@ -3176,10 +3386,11 @@ def select_dtypes(self, include=None, exclude=None): 4 True 1.0 5 False 2.0 """ + def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" - if not hasattr(obj, '_info_axis_number'): - msg = 'object of type {typ!r} has no info axis' + if not hasattr(obj, "_info_axis_number"): + msg = "object of type {typ!r} has no info axis" raise TypeError(msg.format(typ=type(obj).__name__)) slices = [slice(None)] * obj.ndim slices[obj._info_axis_number] = indexer @@ -3193,19 +3404,22 @@ def _get_info_slice(obj, indexer): selection = tuple(map(frozenset, (include, exclude))) if not any(selection): - raise ValueError('at least one of include or exclude must be ' - 
'nonempty') + raise ValueError("at least one of include or exclude must be " "nonempty") # convert the myriad valid dtypes object to a single representation include, exclude = map( - lambda x: frozenset(map(infer_dtype_from_object, x)), selection) + lambda x: frozenset(map(infer_dtype_from_object, x)), selection + ) for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): - raise ValueError('include and exclude overlap on {inc_ex}'.format( - inc_ex=(include & exclude))) + raise ValueError( + "include and exclude overlap on {inc_ex}".format( + inc_ex=(include & exclude) + ) + ) # empty include/exclude -> defaults to True # three cases (we've already raised if both are empty) @@ -3224,8 +3438,9 @@ def _get_info_slice(obj, indexer): def is_dtype_instance_mapper(idx, dtype): return idx, functools.partial(issubclass, dtype.type) - for idx, f in itertools.starmap(is_dtype_instance_mapper, - enumerate(self.dtypes)): + for idx, f in itertools.starmap( + is_dtype_instance_mapper, enumerate(self.dtypes) + ): if include: # checks for the case of empty include or exclude include_these.iloc[idx] = any(map(f, include)) if exclude: @@ -3256,7 +3471,7 @@ def __setitem__(self, key, value): if indexer is not None: return self._setitem_slice(indexer, value) - if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2: + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: self._setitem_frame(key, value) elif isinstance(key, (Series, np.ndarray, list, Index)): self._setitem_array(key, value) @@ -3272,8 +3487,9 @@ def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): if len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d!' % - (len(key), len(self.index))) + raise ValueError( + "Item wrong length %d instead of %d!" 
% (len(key), len(self.index)) + ) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() @@ -3281,7 +3497,7 @@ def _setitem_array(self, key, value): else: if isinstance(value, DataFrame): if len(value.columns) != len(key): - raise ValueError('Columns must be same length as key') + raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: @@ -3294,14 +3510,12 @@ def _setitem_frame(self, key, value): # df[df > df2] = 0 if isinstance(key, np.ndarray): if key.shape != self.shape: - raise ValueError( - 'Array conditional must be same shape as self' - ) + raise ValueError("Array conditional must be same shape as self") key = self._constructor(key, **self._construct_axes_dict()) if key.values.size and not is_bool_dtype(key.values): raise TypeError( - 'Must pass DataFrame or 2-d ndarray with boolean values only' + "Must pass DataFrame or 2-d ndarray with boolean values only" ) self._check_inplace_setting(value) @@ -3318,12 +3532,15 @@ def _ensure_valid_index(self, value): try: value = Series(value) except (ValueError, NotImplementedError, TypeError): - raise ValueError('Cannot set a frame with no defined index ' - 'and a value that cannot be converted to a ' - 'Series') + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a " + "Series" + ) - self._data = self._data.reindex_axis(value.index.copy(), axis=1, - fill_value=np.nan) + self._data = self._data.reindex_axis( + value.index.copy(), axis=1, fill_value=np.nan + ) def _set_item(self, key, value): """ @@ -3364,8 +3581,7 @@ def insert(self, loc, column, value, allow_duplicates=False): """ self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) - self._data.insert(loc, column, value, - allow_duplicates=allow_duplicates) + self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) def assign(self, **kwargs): r""" @@ -3494,8 +3710,9 @@ def reindexer(value): raise e # other - raise TypeError('incompatible index of inserted column ' - 'with frame index') + raise TypeError( + "incompatible index of inserted column " "with frame index" + ) return value if isinstance(value, Series): @@ -3541,8 +3758,7 @@ def reindexer(value): else: # cast ignores pandas dtypes. 
so save the dtype first - infer_dtype, _ = infer_dtype_from_scalar( - value, pandas_dtype=True) + infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) # upcast value = cast_scalar_to_array(len(self.index), value) @@ -3554,8 +3770,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if (not self.columns.is_unique or - isinstance(self.columns, MultiIndex)): + if not self.columns.is_unique or isinstance(self.columns, MultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -3564,8 +3779,10 @@ def reindexer(value): @property def _series(self): - return {item: Series(self._data.iget(idx), index=self.index, name=item) - for idx, item in enumerate(self.columns)} + return { + item: Series(self._data.iget(idx), index=self.index, name=item) + for idx, item in enumerate(self.columns) + } def lookup(self, row_labels, col_labels): """ @@ -3599,7 +3816,7 @@ def lookup(self, row_labels, col_labels): """ n = len(row_labels) if n != len(col_labels): - raise ValueError('Row labels must have same size as column labels') + raise ValueError("Row labels must have same size as column labels") thresh = 1000 if not self._is_mixed_type or n > thresh: @@ -3607,13 +3824,13 @@ def lookup(self, row_labels, col_labels): ridx = self.index.get_indexer(row_labels) cidx = self.columns.get_indexer(col_labels) if (ridx == -1).any(): - raise KeyError('One or more row labels was not found') + raise KeyError("One or more row labels was not found") if (cidx == -1).any(): - raise KeyError('One or more column labels was not found') + raise KeyError("One or more column labels was not found") flat_index = ridx * len(self.columns) + cidx result = values.flat[flat_index] else: - result = np.empty(n, dtype='O') + result = np.empty(n, dtype="O") for i, (r, c) in enumerate(zip(row_labels, col_labels)): result[i] = self._get_value(r, c) @@ -3625,88 +3842,142 @@ def lookup(self, row_labels, col_labels): # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, - copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): frame = self - columns = axes['columns'] + columns = axes["columns"] if columns is not None: - frame = frame._reindex_columns(columns, method, copy, level, - fill_value, limit, tolerance) + frame = frame._reindex_columns( + columns, method, copy, level, fill_value, limit, tolerance + ) - index = axes['index'] + index = axes["index"] if index is not None: - frame = frame._reindex_index(index, method, copy, level, - fill_value, limit, tolerance) + frame = frame._reindex_index( + index, method, copy, level, fill_value, limit, tolerance + ) return frame - def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, - limit=None, tolerance=None): - new_index, indexer = self.index.reindex(new_index, method=method, - level=level, limit=limit, - tolerance=tolerance) - return self._reindex_with_indexers({0: [new_index, indexer]}, - copy=copy, fill_value=fill_value, - allow_dups=False) - - def _reindex_columns(self, new_columns, method, copy, level, - fill_value=None, limit=None, tolerance=None): - new_columns, indexer = self.columns.reindex(new_columns, method=method, - level=level, limit=limit, - tolerance=tolerance) - return self._reindex_with_indexers({1: [new_columns, indexer]}, - 
copy=copy, fill_value=fill_value, - allow_dups=False) + def _reindex_index( + self, + new_index, + method, + copy, + level, + fill_value=np.nan, + limit=None, + tolerance=None, + ): + new_index, indexer = self.index.reindex( + new_index, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {0: [new_index, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) + + def _reindex_columns( + self, + new_columns, + method, + copy, + level, + fill_value=None, + limit=None, + tolerance=None, + ): + new_columns, indexer = self.columns.reindex( + new_columns, method=method, level=level, limit=limit, tolerance=tolerance + ) + return self._reindex_with_indexers( + {1: [new_columns, indexer]}, + copy=copy, + fill_value=fill_value, + allow_dups=False, + ) def _reindex_multi(self, axes, copy, fill_value): """ We are guaranteed non-Nones in the axes. """ - new_index, row_indexer = self.index.reindex(axes['index']) - new_columns, col_indexer = self.columns.reindex(axes['columns']) + new_index, row_indexer = self.index.reindex(axes["index"]) + new_columns, col_indexer = self.columns.reindex(axes["columns"]) if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = algorithms.take_2d_multi(self.values, indexer, - fill_value=fill_value) - return self._constructor(new_values, index=new_index, - columns=new_columns) + new_values = algorithms.take_2d_multi( + self.values, indexer, fill_value=fill_value + ) + return self._constructor(new_values, index=new_index, columns=new_columns) else: - return self._reindex_with_indexers({0: [new_index, row_indexer], - 1: [new_columns, col_indexer]}, - copy=copy, - fill_value=fill_value) - - @Appender(_shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - return super().align(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis, - broadcast_axis=broadcast_axis) + return self._reindex_with_indexers( + {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, + copy=copy, + fill_value=fill_value, + ) + + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.reindex.__doc__) - @rewrite_axis_style_signature('labels', [('method', None), - ('copy', True), - ('level', None), - ('fill_value', np.nan), - ('limit', None), - ('tolerance', None)]) + @rewrite_axis_style_signature( + "labels", + [ + ("method", None), + ("copy", True), + ("level", None), + ("fill_value", np.nan), + ("limit", None), + ("tolerance", None), + ], + ) def reindex(self, *args, **kwargs): - axes = validate_axis_style_args(self, args, kwargs, 'labels', - 'reindex') + axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names - kwargs.pop('axis', None) - kwargs.pop('labels', None) + kwargs.pop("axis", None) + kwargs.pop("labels", None) return 
super().reindex(**kwargs) - def drop(self, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): """ Drop specified labels from rows or columns. @@ -3829,14 +4100,20 @@ def drop(self, labels=None, axis=0, index=None, columns=None, falcon speed 320.0 250.0 weight 1.0 0.8 """ - return super().drop(labels=labels, axis=axis, index=index, - columns=columns, level=level, inplace=inplace, - errors=errors) + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) - @rewrite_axis_style_signature('mapper', [('copy', True), - ('inplace', False), - ('level', None), - ('errors', 'ignore')]) + @rewrite_axis_style_signature( + "mapper", + [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], + ) def rename(self, *args, **kwargs): """ Alter axes labels. @@ -3946,35 +4223,63 @@ def rename(self, *args, **kwargs): 2 2 5 4 3 6 """ - axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename') + axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names - kwargs.pop('axis', None) - kwargs.pop('mapper', None) + kwargs.pop("axis", None) + kwargs.pop("mapper", None) return super().rename(**kwargs) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.fillna.__doc__) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, downcast=downcast, - **kwargs) - - @Appender(_shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - return super().replace(to_replace=to_replace, value=value, - inplace=inplace, limit=limit, regex=regex, - method=method) - - @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + **kwargs + ) + + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift(periods=periods, freq=freq, axis=axis, - fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) - def set_index(self, keys, drop=True, append=False, inplace=False, - verify_integrity=False): + def set_index( + self, keys, drop=True, append=False, inplace=False, verify_integrity=False + ): """ Set the DataFrame index using existing columns. 
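A brief doctest-style sketch of the set_index behaviour whose signature is reflowed above (toy data, illustrative only; the formatting-only patch does not change semantics):

>>> import pandas as pd
>>> df = pd.DataFrame({"k": ["a", "b"], "v": [1, 2]})
>>> df.set_index("k").index.tolist()
['a', 'b']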
@@ -4064,35 +4369,39 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 3 9 7 2013 84 4 16 10 2014 31 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(keys, list): keys = [keys] - err_msg = ('The parameter "keys" may be a column key, one-dimensional ' - 'array, or a list containing only valid column keys and ' - 'one-dimensional arrays.') + err_msg = ( + 'The parameter "keys" may be a column key, one-dimensional ' + "array, or a list containing only valid column keys and " + "one-dimensional arrays." + ) missing = [] for col in keys: - if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, - list, abc.Iterator)): + if isinstance( + col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) + ): # arrays are fine as long as they are one-dimensional # iterators get converted to list below - if getattr(col, 'ndim', 1) != 1: + if getattr(col, "ndim", 1) != 1: raise ValueError(err_msg) else: # everything else gets tried as a key; see GH 24969 try: found = col in self.columns except TypeError: - raise TypeError(err_msg + ' Received column of ' - 'type {}'.format(type(col))) + raise TypeError( + err_msg + " Received column of " "type {}".format(type(col)) + ) else: if not found: missing.append(col) if missing: - raise KeyError('None of {} are in the columns'.format(missing)) + raise KeyError("None of {} are in the columns".format(missing)) if inplace: frame = self @@ -4135,18 +4444,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if len(arrays[-1]) != len(self): # check newest element against length of calling frame, since # ensure_index_from_sequences would not raise for append=False. - raise ValueError('Length mismatch: Expected {len_self} rows, ' - 'received array of length {len_col}'.format( - len_self=len(self), - len_col=len(arrays[-1]) - )) + raise ValueError( + "Length mismatch: Expected {len_self} rows, " + "received array of length {len_col}".format( + len_self=len(self), len_col=len(arrays[-1]) + ) + ) index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index[index.duplicated()].unique() - raise ValueError('Index has duplicate keys: {dup}'.format( - dup=duplicates)) + raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates)) # use set to handle duplicate column names gracefully in case of drop for c in set(to_remove): @@ -4160,8 +4469,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not inplace: return frame - def reset_index(self, level=None, drop=False, inplace=False, col_level=0, - col_fill=''): + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): """ Reset the index, or a level of it. 
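For the reset_index signature reflowed above, a minimal doctest-style sketch (illustrative toy frame, not taken from the patch); the named index is moved back into the leading column:

>>> import pandas as pd
>>> df = pd.DataFrame({"v": [1, 2]}, index=pd.Index(["a", "b"], name="k"))
>>> df.reset_index().columns.tolist()
['k', 'v']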
@@ -4303,7 +4613,7 @@ class max type lion mammal 80.5 run monkey mammal NaN jump """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: new_obj = self else: @@ -4339,8 +4649,7 @@ def _maybe_casted_values(index, labels=None): values = values._data if mask.any(): - values, changed = maybe_upcast_putmask( - values, mask, np.nan) + values, changed = maybe_upcast_putmask(values, mask, np.nan) if issubclass(values_type, DatetimeLikeArray): values = values_type(values, dtype=values_dtype) @@ -4357,13 +4666,14 @@ def _maybe_casted_values(index, labels=None): if not drop: if isinstance(self.index, MultiIndex): - names = [n if n is not None else ('level_%d' % i) - for (i, n) in enumerate(self.index.names)] + names = [ + n if n is not None else ("level_%d" % i) + for (i, n) in enumerate(self.index.names) + ] to_insert = zip(self.index.levels, self.index.codes) else: - default = 'index' if 'index' not in self else 'level_0' - names = ([default] if self.index.name is None - else [self.index.name]) + default = "index" if "index" not in self else "level_0" + names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) multi_col = isinstance(self.columns, MultiIndex) @@ -4372,13 +4682,14 @@ def _maybe_casted_values(index, labels=None): continue name = names[i] if multi_col: - col_name = (list(name) if isinstance(name, tuple) - else [name]) + col_name = list(name) if isinstance(name, tuple) else [name] if col_fill is None: if len(col_name) not in (1, self.columns.nlevels): - raise ValueError("col_fill=None is incompatible " - "with incomplete column name " - "{}".format(name)) + raise ValueError( + "col_fill=None is incompatible " + "with incomplete column name " + "{}".format(name) + ) col_fill = col_name[0] lev_num = self.columns._get_level_number(col_level) @@ -4397,24 +4708,23 @@ def _maybe_casted_values(index, labels=None): # ---------------------------------------------------------------------- # Reindex-based selection methods - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return super().isnull() - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return super().notna() - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return super().notnull() - def dropna(self, axis=0, how='any', thresh=None, subset=None, - inplace=False): + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """ Remove missing values. @@ -4517,17 +4827,18 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, name toy born 1 Batman Batmobile 1940-04-25 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(axis, (tuple, list)): # GH20987 - msg = ("supplying multiple axes to axis is deprecated and " - "will be removed in a future version.") + msg = ( + "supplying multiple axes to axis is deprecated and " + "will be removed in a future version." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) result = self for ax in axis: - result = result.dropna(how=how, thresh=thresh, subset=subset, - axis=ax) + result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax) else: axis = self._get_axis_number(axis) agg_axis = 1 - axis @@ -4545,15 +4856,15 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, if thresh is not None: mask = count >= thresh - elif how == 'any': + elif how == "any": mask = count == len(agg_obj._get_axis(agg_axis)) - elif how == 'all': + elif how == "all": mask = count > 0 else: if how is not None: - raise ValueError('invalid how option: {h}'.format(h=how)) + raise ValueError("invalid how option: {h}".format(h=how)) else: - raise TypeError('must specify how or thresh') + raise TypeError("must specify how or thresh") result = self.loc(axis=axis)[mask] @@ -4562,7 +4873,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result - def drop_duplicates(self, subset=None, keep='first', inplace=False): + def drop_duplicates(self, subset=None, keep="first", inplace=False): """ Return DataFrame with duplicate rows removed, optionally only considering certain columns. Indexes, including time indexes @@ -4587,7 +4898,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): if self.empty: return self.copy() - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") duplicated = self.duplicated(subset, keep=keep) if inplace: @@ -4597,7 +4908,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep="first"): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns. 
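A short doctest-style sketch of duplicated/drop_duplicates as reformatted above (illustrative data only); the second row repeats the first, so it is flagged and dropped:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})
>>> df.duplicated().tolist()
[False, True, False]
>>> len(df.drop_duplicates())
2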
@@ -4626,15 +4937,19 @@ def duplicated(self, subset=None, keep='first'): def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) - return labels.astype('i8', copy=False), len(shape) + vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + ) + return labels.astype("i8", copy=False), len(shape) if subset is None: subset = self.columns - elif (not np.iterable(subset) or - isinstance(subset, str) or - isinstance(subset, tuple) and subset in self.columns): - subset = subset, + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) # Verify all columns in subset exist in the queried dataframe # Otherwise, raise a KeyError, same as if you try to __getitem__ with a @@ -4643,8 +4958,7 @@ def f(vals): if not diff.empty: raise KeyError(diff) - vals = (col.values for name, col in self.iteritems() - if name in subset) + vals = (col.values for name, col in self.iteritems() if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) @@ -4655,23 +4969,30 @@ def f(vals): @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values(self, by, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): - inplace = validate_bool_kwarg(inplace, 'inplace') + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): + inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) if not isinstance(by, list): by = [by] if is_sequence(ascending) and len(by) != len(ascending): - raise ValueError('Length of ascending (%d) != length of by (%d)' % - (len(ascending), len(by))) + raise ValueError( + "Length of ascending (%d) != length of by (%d)" + % (len(ascending), len(by)) + ) if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [self._get_label_or_level_values(x, axis=axis) - for x in by] - indexer = lexsort_indexer(keys, orders=ascending, - na_position=na_position) + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = ensure_platform_int(indexer) else: from pandas.core.sorting import nargsort @@ -4682,12 +5003,13 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if isinstance(ascending, (tuple, list)): ascending = ascending[0] - indexer = nargsort(k, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + k, kind=kind, ascending=ascending, na_position=na_position + ) - new_data = self._data.take(indexer, - axis=self._get_block_manager_axis(axis), - verify=False) + new_data = self._data.take( + indexer, axis=self._get_block_manager_axis(axis), verify=False + ) if inplace: return self._update_inplace(new_data) @@ -4696,23 +5018,33 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_index.__doc__) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, - by=None): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + by=None, + ): # TODO: this can be combined with Series.sort_index impl as # almost identical - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = 
validate_bool_kwarg(inplace, "inplace") # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, stacklevel=2) + warnings.warn( + "by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, + stacklevel=2, + ) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, - inplace=inplace) + return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -4722,34 +5054,37 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, labels = labels._sort_levels_monotonic() if level is not None: - new_axis, indexer = labels.sortlevel(level, ascending=ascending, - sort_remaining=sort_remaining) + new_axis, indexer = labels.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer(labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position) + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) else: from pandas.core.sorting import nargsort # Check monotonic-ness before sort an index # GH11080 - if ((ascending and labels.is_monotonic_increasing) or - (not ascending and labels.is_monotonic_decreasing)): + if (ascending and labels.is_monotonic_increasing) or ( + not ascending and labels.is_monotonic_decreasing + ): if inplace: return else: return self.copy() - indexer = nargsort(labels, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + labels, kind=kind, ascending=ascending, na_position=na_position + ) baxis = self._get_block_manager_axis(axis) - new_data = self._data.take(indexer, - axis=baxis, - verify=False) + new_data = self._data.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() @@ -4759,7 +5094,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self._constructor(new_data).__finalize__(self) - def nlargest(self, n, columns, keep='first'): + def nlargest(self, n, columns, keep="first"): """ Return the first `n` rows ordered by `columns` in descending order. @@ -4866,12 +5201,9 @@ def nlargest(self, n, columns, keep='first'): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return algorithms.SelectNFrame(self, - n=n, - keep=keep, - columns=columns).nlargest() + return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep='first'): + def nsmallest(self, n, columns, keep="first"): """ Return the first `n` rows ordered by `columns` in ascending order. 
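For the nlargest/nsmallest calls reflowed above, a minimal doctest-style sketch (toy frame chosen for illustration; not part of the diff):

>>> import pandas as pd
>>> df = pd.DataFrame({"pop": [30, 10, 20]}, index=["x", "y", "z"])
>>> df.nlargest(2, "pop").index.tolist()
['x', 'z']
>>> df.nsmallest(1, "pop").index.tolist()
['y']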
@@ -4968,10 +5300,9 @@ def nsmallest(self, n, columns, keep='first'): Nauru 11300 182 NR Anguilla 11300 311 AI """ - return algorithms.SelectNFrame(self, - n=n, - keep=keep, - columns=columns).nsmallest() + return algorithms.SelectNFrame( + self, n=n, keep=keep, columns=columns + ).nsmallest() def swaplevel(self, i=-2, j=-1, axis=0): """ @@ -5018,9 +5349,8 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), - MultiIndex): # pragma: no cover - raise TypeError('Can only reorder levels on a hierarchical axis.') + if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -5034,7 +5364,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join='outer', level=level, copy=False) + this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns def _arith_op(left, right): @@ -5049,13 +5379,12 @@ def _arith_op(left, right): return ops.dispatch_to_series(this, other, _arith_op) else: result = _arith_op(this.values, other.values) - return self._constructor(result, - index=new_index, columns=new_columns, - copy=False) + return self._constructor( + result, index=new_index, columns=new_columns, copy=False + ) def _combine_match_index(self, other, func, level=None): - left, right = self.align(other, join='outer', axis=0, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=0, level=level, copy=False) assert left.index.equals(right.index) if left._is_mixed_type or right._is_mixed_type: @@ -5065,14 +5394,13 @@ def _combine_match_index(self, other, func, level=None): # fastpath --> operate directly on values with np.errstate(all="ignore"): new_data = func(left.values.T, right.values).T - return self._constructor(new_data, - index=left.index, columns=self.columns, - copy=False) + return self._constructor( + new_data, index=left.index, columns=self.columns, copy=False + ) def _combine_match_columns(self, other, func, level=None): assert isinstance(other, Series) - left, right = self.align(other, join='outer', axis=1, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=1, level=level, copy=False) assert left.columns.equals(right.index) return ops.dispatch_to_series(left, right, func, axis="columns") @@ -5245,8 +5573,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): result[col] = arr # convert_objects just in case - return self._constructor(result, index=new_index, - columns=new_columns) + return self._constructor(result, index=new_index, columns=new_columns) def combine_first(self, other): """ @@ -5304,7 +5631,7 @@ def extract_values(arr): if is_extension_array_dtype(arr.dtype): arr = arr.asi8 else: - arr = arr.view('i8') + arr = arr.view("i8") return arr def combiner(x, y): @@ -5324,10 +5651,14 @@ def combiner(x, y): return self.combine(other, combiner, overwrite=False) - @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors', - mapping={False: 'ignore', True: 'raise'}) - def update(self, other, join='left', overwrite=True, filter_func=None, - errors='ignore'): + @deprecate_kwarg( + old_arg_name="raise_conflict", + new_arg_name="errors", + mapping={False: "ignore", True: "raise"}, + ) + def update( + self, other, join="left", 
overwrite=True, filter_func=None, errors="ignore" + ): """ Modify in place using non-NA values from another DataFrame. @@ -5440,12 +5771,14 @@ def update(self, other, join='left', overwrite=True, filter_func=None, 2 3 6.0 """ import pandas.core.computation.expressions as expressions + # TODO: Support other joins - if join != 'left': # pragma: no cover + if join != "left": # pragma: no cover raise NotImplementedError("Only left join is supported") - if errors not in ['ignore', 'raise']: - raise ValueError("The parameter errors must be either " - "'ignore' or 'raise'") + if errors not in ["ignore", "raise"]: + raise ValueError( + "The parameter errors must be either " "'ignore' or 'raise'" + ) if not isinstance(other, DataFrame): other = DataFrame(other) @@ -5456,10 +5789,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self[col]._values that = other[col]._values if filter_func is not None: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): mask = ~filter_func(this) | isna(that) else: - if errors == 'raise': + if errors == "raise": mask_this = notna(that) mask_that = notna(this) if any(mask_this & mask_that): @@ -5479,7 +5812,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # ---------------------------------------------------------------------- # Data reshaping - _shared_docs['pivot'] = """ + _shared_docs[ + "pivot" + ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -5582,13 +5917,16 @@ def update(self, other, join='left', overwrite=True, filter_func=None, ValueError: Index contains duplicate entries, cannot reshape """ - @Substitution('') - @Appender(_shared_docs['pivot']) + @Substitution("") + @Appender(_shared_docs["pivot"]) def pivot(self, index=None, columns=None, values=None): from pandas.core.reshape.pivot import pivot + return pivot(self, index=index, columns=columns, values=values) - _shared_docs['pivot_table'] = """ + _shared_docs[ + "pivot_table" + ] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame. 
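An illustrative doctest-style sketch of the pivot_table call whose wrapper is reflowed below (assumed toy data, not from the patch); values are averaged per key with the default mean aggregation:

>>> import pandas as pd
>>> df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1, 3, 5]})
>>> df.pivot_table(values="v", index="k", aggfunc="mean")["v"].tolist()
[2.0, 5.0]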
@@ -5713,16 +6051,34 @@ def pivot(self, index=None, columns=None, values=None): small 2.333333 6.0 4.333333 2.0 """ - @Substitution('') - @Appender(_shared_docs['pivot_table']) - def pivot_table(self, values=None, index=None, columns=None, - aggfunc='mean', fill_value=None, margins=False, - dropna=True, margins_name='All', observed=False): + @Substitution("") + @Appender(_shared_docs["pivot_table"]) + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, + ): from pandas.core.reshape.pivot import pivot_table - return pivot_table(self, values=values, index=index, columns=columns, - aggfunc=aggfunc, fill_value=fill_value, - margins=margins, dropna=dropna, - margins_name=margins_name, observed=observed) + + return pivot_table( + self, + values=values, + index=index, + columns=columns, + aggfunc=aggfunc, + fill_value=fill_value, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) def stack(self, level=-1, dropna=True): """ @@ -5955,9 +6311,12 @@ def unstack(self, level=-1, fill_value=None): dtype: float64 """ from pandas.core.reshape.reshape import unstack + return unstack(self, level, fill_value) - _shared_docs['melt'] = (""" + _shared_docs[ + "melt" + ] = """ Unpivot a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -6050,18 +6409,32 @@ def unstack(self, level=-1, fill_value=None): 0 a B E 1 1 b B E 3 2 c B E 5 - """) - - @Appender(_shared_docs['melt'] % - dict(caller='df.melt(', - versionadded='.. versionadded:: 0.20.0\n', - other='melt')) - def melt(self, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): + """ + + @Appender( + _shared_docs["melt"] + % dict( + caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt" + ) + ) + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ): from pandas.core.reshape.melt import melt - return melt(self, id_vars=id_vars, value_vars=value_vars, - var_name=var_name, value_name=value_name, - col_level=col_level) + + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ) # ---------------------------------------------------------------------- # Time series-related @@ -6160,11 +6533,12 @@ def diff(self, periods=1, axis=0): # ---------------------------------------------------------------------- # Function application - def _gotitem(self, - key: Union[str, List[str]], - ndim: int, - subset: Optional[Union[Series, ABCDataFrame]] = None, - ) -> Union[Series, ABCDataFrame]: + def _gotitem( + self, + key: Union[str, List[str]], + ndim: int, + subset: Optional[Union[Series, ABCDataFrame]] = None, + ) -> Union[Series, ABCDataFrame]: """ Sub-classes to define. Return a sliced object. @@ -6184,7 +6558,8 @@ def _gotitem(self, # TODO: _shallow_copy(subset)? return subset[key] - _agg_summary_and_see_also_doc = dedent(""" + _agg_summary_and_see_also_doc = dedent( + """ The aggregation operations are always performed over an axis, either the index (default) or the column axis. This behavior is different from `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, @@ -6204,9 +6579,11 @@ def _gotitem(self, core.window.Expanding : Perform operations over expanding window. core.window.EWM : Perform operation over exponential weighted window. 
- """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> df = pd.DataFrame([[1, 2, 3], @@ -6238,13 +6615,16 @@ def _gotitem(self, 2 8.0 3 NaN dtype: float64 - """) - - @Substitution(see_also=_agg_summary_and_see_also_doc, - examples=_agg_examples_doc, - versionadded='\n.. versionadded:: 0.20.0\n', - **_shared_doc_kwargs) - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_summary_and_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. versionadded:: 0.20.0\n", + **_shared_doc_kwargs + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -6268,15 +6648,24 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate - @Appender(_shared_docs['transform'] % _shared_doc_kwargs) + @Appender(_shared_docs["transform"] % _shared_doc_kwargs) def transform(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T return super().transform(func, *args, **kwargs) - def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, - result_type=None, args=(), **kwds): + def apply( + self, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + args=(), + **kwds + ): """ Apply a function along an axis of the DataFrame. @@ -6445,15 +6834,18 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, 2 1 2 """ from pandas.core.apply import frame_apply - op = frame_apply(self, - func=func, - axis=axis, - broadcast=broadcast, - raw=raw, - reduce=reduce, - result_type=result_type, - args=args, - kwds=kwds) + + op = frame_apply( + self, + func=func, + axis=axis, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + args=args, + kwds=kwds, + ) return op.get_result() def applymap(self, func): @@ -6525,8 +6917,7 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, - verify_integrity=False, sort=None): + def append(self, other, ignore_index=False, verify_integrity=False, sort=None): """ Append rows of `other` to the end of caller, returning a new object. 
@@ -6624,8 +7015,10 @@ def append(self, other, ignore_index=False, if isinstance(other, dict): other = Series(other) if other.name is None and not ignore_index: - raise TypeError('Can only append a Series if ignore_index=True' - ' or if the Series has a name') + raise TypeError( + "Can only append a Series if ignore_index=True" + " or if the Series has a name" + ) if other.name is None: index = None @@ -6640,9 +7033,11 @@ def append(self, other, ignore_index=False, except TypeError: combined_columns = self.columns.astype(object).append(idx_diff) other = other.reindex(combined_columns, copy=False) - other = DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) + other = DataFrame( + other.values.reshape((1, len(other))), + index=index, + columns=combined_columns, + ) other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) @@ -6652,16 +7047,19 @@ def append(self, other, ignore_index=False, other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat + if isinstance(other, (list, tuple)): to_concat = [self] + other else: to_concat = [self, other] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort) + return concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) - def join(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): """ Join columns of another DataFrame. @@ -6780,27 +7178,37 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', 5 K5 A5 NaN """ # For SparseDataFrame's benefit - return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, - rsuffix=rsuffix, sort=sort) + return self._join_compat( + other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort + ) - def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def _join_compat( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ): from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat if isinstance(other, Series): if other.name is None: - raise ValueError('Other Series must have a name') + raise ValueError("Other Series must have a name") other = DataFrame({other.name: other}) if isinstance(other, DataFrame): - return merge(self, other, left_on=on, how=how, - left_index=on is None, right_index=True, - suffixes=(lsuffix, rsuffix), sort=sort) + return merge( + self, + other, + left_on=on, + how=how, + left_index=on is None, + right_index=True, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) else: if on is not None: - raise ValueError('Joining multiple DataFrames only supported' - ' for joining on index') + raise ValueError( + "Joining multiple DataFrames only supported" " for joining on index" + ) frames = [self] + list(other) @@ -6808,33 +7216,55 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', # join indexes only using concat if can_concat: - if how == 'left': - res = concat(frames, axis=1, join='outer', - verify_integrity=True) + if how == "left": + res = concat(frames, axis=1, join="outer", verify_integrity=True) return res.reindex(self.index, copy=False) else: - return concat(frames, axis=1, join=how, - verify_integrity=True) + return concat(frames, axis=1, join=how, verify_integrity=True) joined = frames[0] for frame in 
frames[1:]: - joined = merge(joined, frame, how=how, left_index=True, - right_index=True) + joined = merge( + joined, frame, how=how, left_index=True, right_index=True + ) return joined - @Substitution('') + @Substitution("") @Appender(_merge_doc, indents=2) - def merge(self, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): + def merge( + self, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ): from pandas.core.reshape.merge import merge - return merge(self, right, how=how, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator, validate=validate) + + return merge( + self, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) def round(self, decimals=0, *args, **kwargs): """ @@ -6936,23 +7366,21 @@ def _series_round(s, decimals): new_cols = [col for col in _dict_round(self, decimals)] elif is_integer(decimals): # Dispatch to Series.round - new_cols = [_series_round(v, decimals) - for _, v in self.iteritems()] + new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] else: - raise TypeError("decimals must be an integer, a dict-like or a " - "Series") + raise TypeError("decimals must be an integer, a dict-like or a " "Series") if len(new_cols) > 0: - return self._constructor(concat(new_cols, axis=1), - index=self.index, - columns=self.columns) + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + ) else: return self # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=1): + def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. @@ -7000,12 +7428,11 @@ def corr(self, method='pearson', min_periods=1): idx = cols.copy() mat = numeric_df.values - if method == 'pearson': + if method == "pearson": correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods) - elif method == 'spearman': - correl = libalgos.nancorr_spearman(ensure_float64(mat), - minp=min_periods) - elif method == 'kendall' or callable(method): + elif method == "spearman": + correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) + elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = ensure_float64(mat).T @@ -7022,7 +7449,7 @@ def corr(self, method='pearson', min_periods=1): if valid.sum() < min_periods: c = np.nan elif i == j: - c = 1. 
+ c = 1.0 elif not valid.all(): c = corrf(ac[valid], bc[valid]) else: @@ -7030,9 +7457,11 @@ def corr(self, method='pearson', min_periods=1): correl[i, j] = c correl[j, i] = c else: - raise ValueError("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method)) + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method) + ) return self._constructor(correl, index=idx, columns=cols) @@ -7142,12 +7571,11 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, - minp=min_periods) + baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods) return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False, method='pearson'): + def corrwith(self, other, axis=0, drop=False, method="pearson"): """ Compute pairwise correlation between rows or columns of DataFrame with rows or columns of Series or DataFrame. DataFrames are first @@ -7183,17 +7611,16 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): this = self._get_numeric_data() if isinstance(other, Series): - return this.apply(lambda x: other.corr(x, method=method), - axis=axis) + return this.apply(lambda x: other.corr(x, method=method), axis=axis) other = other._get_numeric_data() - left, right = this.align(other, join='inner', copy=False) + left, right = this.align(other, join="inner", copy=False) if axis == 1: left = left.T right = right.T - if method == 'pearson': + if method == "pearson": # mask missing values left = left + right * 0 right = right + left * 0 @@ -7207,31 +7634,31 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): correl = num / dom - elif method in ['kendall', 'spearman'] or callable(method): + elif method in ["kendall", "spearman"] or callable(method): + def c(x): return nanops.nancorr(x[0], x[1], method=method) - correl = Series(map(c, - zip(left.values.T, right.values.T)), - index=left.columns) + correl = Series( + map(c, zip(left.values.T, right.values.T)), index=left.columns + ) else: - raise ValueError("Invalid method {method} was passed, " - "valid methods are: 'pearson', 'kendall', " - "'spearman', or callable". - format(method=method)) + raise ValueError( + "Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable".format(method=method) + ) if not drop: # Find non-matching labels along the given axis # and append missing correlations (GH 22375) raxis = 1 if axis == 0 else 0 - result_index = (this._get_axis(raxis). 
- union(other._get_axis(raxis))) + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) idx_diff = result_index.difference(correl.index) if len(idx_diff) > 0: - correl = correl.append(Series([np.nan] * len(idx_diff), - index=idx_diff)) + correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff)) return correl @@ -7316,8 +7743,7 @@ def count(self, axis=0, level=None, numeric_only=False): """ axis = self._get_axis_number(axis) if level is not None: - return self._count_level(level, axis=axis, - numeric_only=numeric_only) + return self._count_level(level, axis=axis, numeric_only=numeric_only) if numeric_only: frame = self._get_numeric_data() @@ -7338,7 +7764,7 @@ def count(self, axis=0, level=None, numeric_only=False): counts = series_counts.values result = Series(counts, index=frame._get_agg_axis(axis)) - return result.astype('int64') + return result.astype("int64") def _count_level(self, level, axis=0, numeric_only=False): if numeric_only: @@ -7350,8 +7776,10 @@ def _count_level(self, level, axis=0, numeric_only=False): agg_axis = frame._get_agg_axis(axis) if not isinstance(count_axis, MultiIndex): - raise TypeError("Can only count levels on hierarchical " - "{ax}.".format(ax=self._get_axis_name(axis))) + raise TypeError( + "Can only count levels on hierarchical " + "{ax}.".format(ax=self._get_axis_name(axis)) + ) if frame._is_mixed_type: # Since we have mixed types, calling notna(frame.values) might @@ -7371,8 +7799,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level_index = count_axis.levels[level] level_codes = ensure_int64(count_axis.codes[level]) - counts = lib.count_level_2d(mask, level_codes, len(level_index), - axis=0) + counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -7382,9 +7809,10 @@ def _count_level(self, level, axis=0, numeric_only=False): else: return result - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - if axis is None and filter_type == 'bool': + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): + if axis is None and filter_type == "bool": labels = None constructor = None else: @@ -7397,9 +7825,14 @@ def f(x): return op(x, axis=axis, skipna=skipna, **kwds) # exclude timedelta/datetime unless we are uniform types - if (axis == 1 and self._is_datelike_mixed_type - and (not self._is_homogeneous_type - and not is_datetime64tz_dtype(self.dtypes[0]))): + if ( + axis == 1 + and self._is_datelike_mixed_type + and ( + not self._is_homogeneous_type + and not is_datetime64tz_dtype(self.dtypes[0]) + ) + ): numeric_only = True if numeric_only is None: @@ -7407,8 +7840,7 @@ def f(x): values = self.values result = f(values) - if (filter_type == 'bool' and is_object_dtype(values) and - axis is None): + if filter_type == "bool" and is_object_dtype(values) and axis is None: # work around https://github.com/numpy/numpy/issues/10489 # TODO: combine with hasattr(result, 'dtype') further down # hard since we don't have `values` down there. @@ -7428,10 +7860,10 @@ def f(x): # column-by-column reduction, where we have mixed type. 
# So let's just do what we can from pandas.core.apply import frame_apply - opa = frame_apply(self, - func=f, - result_type='expand', - ignore_failures=True) + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) result = opa.get_result() if result.ndim == self.ndim: result = result.iloc[0] @@ -7439,28 +7871,31 @@ def f(x): except Exception: pass - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() - elif filter_type == 'bool': + elif filter_type == "bool": data = self._get_bool_data() else: # pragma: no cover e = NotImplementedError( "Handling exception with filter_type {f} not" - "implemented.".format(f=filter_type)) + "implemented.".format(f=filter_type) + ) raise_with_traceback(e) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() - elif filter_type == 'bool': + elif filter_type == "bool": # GH 25101, # GH 24434 data = self._get_bool_data() if axis == 0 else self else: # pragma: no cover - msg = ("Generating numeric_only data with filter_type {f}" - "not supported.".format(f=filter_type)) + msg = ( + "Generating numeric_only data with filter_type {f}" + "not supported.".format(f=filter_type) + ) raise NotImplementedError(msg) values = data.values labels = data._get_agg_axis(axis) @@ -7468,11 +7903,11 @@ def f(x): values = self.values result = f(values) - if hasattr(result, 'dtype') and is_object_dtype(result.dtype): + if hasattr(result, "dtype") and is_object_dtype(result.dtype): try: - if filter_type is None or filter_type == 'numeric': + if filter_type is None or filter_type == "numeric": result = result.astype(np.float64) - elif filter_type == 'bool' and notna(result).all(): + elif filter_type == "bool" and notna(result).all(): result = result.astype(np.bool_) except (ValueError, TypeError): @@ -7609,7 +8044,7 @@ def _get_agg_axis(self, axis_num): elif axis_num == 1: return self.index else: - raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num) + raise ValueError("Axis must be 0 or 1 (got %r)" % axis_num) def mode(self, axis=0, numeric_only=False, dropna=True): """ @@ -7697,8 +8132,7 @@ def f(s): return data.apply(f, axis=axis) - def quantile(self, q=0.5, axis=0, numeric_only=True, - interpolation='linear'): + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): """ Return values at the given quantile over requested axis. @@ -7775,10 +8209,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, if is_transposed: data = data.T - result = data._data.quantile(qs=q, - axis=1, - interpolation=interpolation, - transposed=is_transposed) + result = data._data.quantile( + qs=q, axis=1, interpolation=interpolation, transposed=is_transposed + ) if result.ndim == 2: result = self._constructor(result) @@ -7790,7 +8223,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, return result - def to_timestamp(self, freq=None, how='start', axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True): """ Cast to DatetimeIndex of timestamps, at *beginning* of period. 
@@ -7820,8 +8253,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( - ax=axis)) + raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) return self._constructor(new_data) @@ -7853,8 +8285,7 @@ def to_period(self, freq=None, axis=0, copy=True): elif axis == 1: new_data.set_axis(0, self.columns.to_period(freq=freq)) else: # pragma: no cover - raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format( - ax=axis)) + raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis)) return self._constructor(new_data) @@ -7923,29 +8354,36 @@ def isin(self, values): """ if isinstance(values, dict): from pandas.core.reshape.concat import concat + values = collections.defaultdict(list, values) - return concat((self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns)), axis=1) + return concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) elif isinstance(values, Series): if not values.index.is_unique: - raise ValueError("cannot compute isin with " - "a duplicate axis.") - return self.eq(values.reindex_like(self), axis='index') + raise ValueError("cannot compute isin with " "a duplicate axis.") + return self.eq(values.reindex_like(self), axis="index") elif isinstance(values, DataFrame): if not (values.columns.is_unique and values.index.is_unique): - raise ValueError("cannot compute isin with " - "a duplicate axis.") + raise ValueError("cannot compute isin with " "a duplicate axis.") return self.eq(values.reindex_like(self)) else: if not is_list_like(values): - raise TypeError("only list-like or dict-like objects are " - "allowed to be passed to DataFrame.isin(), " - "you passed a " - "{0!r}".format(type(values).__name__)) + raise TypeError( + "only list-like or dict-like objects are " + "allowed to be passed to DataFrame.isin(), " + "you passed a " + "{0!r}".format(type(values).__name__) + ) return DataFrame( - algorithms.isin(self.values.ravel(), - values).reshape(self.shape), self.index, - self.columns) + algorithms.isin(self.values.ravel(), values).reshape(self.shape), + self.index, + self.columns, + ) # ---------------------------------------------------------------------- # Add plotting methods to DataFrame @@ -7955,11 +8393,17 @@ def isin(self, values): sparse = CachedAccessor("sparse", SparseFrameAccessor) -DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0, - axes_are_reversed=True, aliases={'rows': 0}, - docs={ - 'index': 'The index (row labels) of the DataFrame.', - 'columns': 'The column labels of the DataFrame.'}) +DataFrame._setup_axes( + ["index", "columns"], + info_axis=1, + stat_axis=0, + axes_are_reversed=True, + aliases={"rows": 0}, + docs={ + "index": "The index (row labels) of the DataFrame.", + "columns": "The column labels of the DataFrame.", + }, +) DataFrame._add_numeric_operations() DataFrame._add_series_or_dataframe_operations() @@ -7978,4 +8422,4 @@ def _from_nested_dict(data): def _put_str(s, space): - return '{s}'.format(s=s)[:space].ljust(space) + return "{s}".format(s=s)[:space].ljust(space) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 106af6e565f8a..4e9f74162ae78 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -19,17 +19,32 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import 
function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - Appender, Substitution, rewrite_axis_style_signature) +from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.common import ( - ensure_int64, ensure_object, ensure_str, is_bool, is_bool_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_dict_like, is_extension_array_dtype, is_integer, is_list_like, - is_number, is_numeric_dtype, is_object_dtype, is_period_arraylike, - is_re_compilable, is_scalar, is_timedelta64_dtype, pandas_dtype) + ensure_int64, + ensure_object, + ensure_str, + is_bool, + is_bool_dtype, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dict_like, + is_extension_array_dtype, + is_integer, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + is_period_arraylike, + is_re_compilable, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -41,7 +56,12 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.index import ( - Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) + Index, + InvalidIndexError, + MultiIndex, + RangeIndex, + ensure_index, +) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing @@ -56,12 +76,14 @@ # able to share _shared_docs = dict() _shared_doc_kwargs = dict( - axes='keywords for axes', klass='Series/DataFrame', - axes_single_arg='int or labels for object', - args_transpose='axes to permute (int or label for object)', + axes="keywords for axes", + klass="Series/DataFrame", + axes_single_arg="int or labels for object", + args_transpose="axes to permute (int or label for object)", optional_by=""" by : str or list of str - Name or list of names to sort by""") + Name or list of names to sort by""", +) # sentinel value to use as kwarg in place of None when None has special meaning # and needs to be distinguished from a user explicitly passing None. 
@@ -74,8 +96,11 @@ def _single_replace(self, to_replace, method, inplace, limit): replacement value is given in the replace method """ if self.ndim != 1: - raise TypeError('cannot replace {0} with method {1} on a {2}' - .format(to_replace, method, type(self).__name__)) + raise TypeError( + "cannot replace {0} with method {1} on a {2}".format( + to_replace, method, type(self).__name__ + ) + ) orig_dtype = self.dtype result = self if inplace else self.copy() @@ -87,8 +112,7 @@ def _single_replace(self, to_replace, method, inplace, limit): if values.dtype == orig_dtype and inplace: return - result = pd.Series(values, index=self.index, - dtype=self.dtype).__finalize__(self) + result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) if inplace: self._update_inplace(result._data) @@ -108,15 +132,27 @@ class NDFrame(PandasObject, SelectionMixin): axes : list copy : boolean, default False """ - _internal_names = ['_data', '_cacher', '_item_cache', '_cache', '_is_copy', - '_subtyp', '_name', '_index', '_default_kind', - '_default_fill_value', '_metadata', '__array_struct__', - '__array_interface__'] # type: List[str] + + _internal_names = [ + "_data", + "_cacher", + "_item_cache", + "_cache", + "_is_copy", + "_subtyp", + "_name", + "_index", + "_default_kind", + "_default_fill_value", + "_metadata", + "__array_struct__", + "__array_interface__", + ] # type: List[str] _internal_names_set = set(_internal_names) # type: Set[str] _accessors = set() # type: Set[str] - _deprecations = frozenset([ - 'as_blocks', 'blocks', 'is_copy' - ]) # type: FrozenSet[str] + _deprecations = frozenset( + ["as_blocks", "blocks", "is_copy"] + ) # type: FrozenSet[str] _metadata = [] # type: List[str] _is_copy = None _data = None # type: BlockManager @@ -124,12 +160,14 @@ class NDFrame(PandasObject, SelectionMixin): # ---------------------------------------------------------------------- # Constructors - def __init__(self, - data: BlockManager, - axes: Optional[List[Index]] = None, - copy: bool = False, - dtype: Optional[Dtype] = None, - fastpath: bool = False): + def __init__( + self, + data: BlockManager, + axes: Optional[List[Index]] = None, + copy: bool = False, + dtype: Optional[Dtype] = None, + fastpath: bool = False, + ): if not fastpath: if dtype is not None: @@ -141,17 +179,17 @@ def __init__(self, for i, ax in enumerate(axes): data = data.reindex_axis(ax, axis=i) - object.__setattr__(self, '_is_copy', None) - object.__setattr__(self, '_data', data) - object.__setattr__(self, '_item_cache', {}) + object.__setattr__(self, "_is_copy", None) + object.__setattr__(self, "_data", data) + object.__setattr__(self, "_item_cache", {}) def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: - mgr = mgr.reindex_axis(axe, - axis=self._get_block_manager_axis(a), - copy=False) + mgr = mgr.reindex_axis( + axe, axis=self._get_block_manager_axis(a), copy=False + ) # make a copy if explicitly requested if copy: @@ -169,14 +207,22 @@ def is_copy(self): """ Return the copy. 
""" - warnings.warn("Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", FutureWarning, stacklevel=2) + warnings.warn( + "Attribute 'is_copy' is deprecated and will be removed " + "in a future version.", + FutureWarning, + stacklevel=2, + ) return self._is_copy @is_copy.setter def is_copy(self, msg): - warnings.warn("Attribute 'is_copy' is deprecated and will be removed " - "in a future version.", FutureWarning, stacklevel=2) + warnings.warn( + "Attribute 'is_copy' is deprecated and will be removed " + "in a future version.", + FutureWarning, + stacklevel=2, + ) self._is_copy = msg def _validate_dtype(self, dtype): @@ -186,10 +232,11 @@ def _validate_dtype(self, dtype): dtype = pandas_dtype(dtype) # a compound dtype - if dtype.kind == 'V': - raise NotImplementedError("compound dtypes are not implemented" - " in the {0} constructor" - .format(self.__class__.__name__)) + if dtype.kind == "V": + raise NotImplementedError( + "compound dtypes are not implemented" + " in the {0} constructor".format(self.__class__.__name__) + ) return dtype @@ -221,9 +268,18 @@ def _constructor_expanddim(self): # Axis @classmethod - def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, - slicers=None, axes_are_reversed=False, build_axes=True, - ns=None, docs=None): + def _setup_axes( + cls, + axes, + info_axis=None, + stat_axis=None, + aliases=None, + slicers=None, + axes_are_reversed=False, + build_axes=True, + ns=None, + docs=None, + ): """Provide axes setup for the major PandasObjects. Parameters @@ -248,7 +304,7 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None, cls._AXIS_REVERSED = axes_are_reversed # typ - setattr(cls, '_typ', cls.__name__.lower()) + setattr(cls, "_typ", cls.__name__.lower()) # indexing support cls._ix = None @@ -293,13 +349,16 @@ def _construct_axes_dict_from(self, axes, **kwargs): def _construct_axes_dict_for_slice(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" - d = {self._AXIS_SLICEMAP[a]: self._get_axis(a) - for a in (axes or self._AXIS_ORDERS)} + d = { + self._AXIS_SLICEMAP[a]: self._get_axis(a) + for a in (axes or self._AXIS_ORDERS) + } d.update(kwargs) return d def _construct_axes_from_arguments( - self, args, kwargs, require_all=False, sentinel=None): + self, args, kwargs, require_all=False, sentinel=None + ): """Construct and returns axes if supplied in args/kwargs. 
If require_all, raise if all axis arguments are not supplied @@ -319,8 +378,10 @@ def _construct_axes_from_arguments( if alias is not None: if a in kwargs: if alias in kwargs: - raise TypeError("arguments are mutually exclusive " - "for [%s,%s]" % (a, alias)) + raise TypeError( + "arguments are mutually exclusive " + "for [%s,%s]" % (a, alias) + ) continue if alias in kwargs: kwargs[a] = kwargs.pop(alias) @@ -332,8 +393,7 @@ def _construct_axes_from_arguments( kwargs[a] = args.pop(0) except IndexError: if require_all: - raise TypeError("not enough/duplicate arguments " - "specified!") + raise TypeError("not enough/duplicate arguments " "specified!") axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} return axes, kwargs @@ -361,8 +421,7 @@ def _get_axis_number(cls, axis): return cls._AXIS_NUMBERS[axis] except KeyError: pass - raise ValueError('No axis named {0} for object type {1}' - .format(axis, cls)) + raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) @classmethod def _get_axis_name(cls, axis): @@ -375,8 +434,7 @@ def _get_axis_name(cls, axis): return cls._AXIS_NAMES[axis] except KeyError: pass - raise ValueError('No axis named {0} for object type {1}' - .format(axis, cls)) + raise ValueError("No axis named {0} for object type {1}".format(axis, cls)) def _get_axis(self, axis): name = self._get_axis_name(axis) @@ -404,7 +462,7 @@ def _get_axis_resolvers(self, axis): # prefix with 'i' or 'c' depending on the input axis # e.g., you must do ilevel_0 for the 0th level of an unnamed # multiiindex - key = '{prefix}level_{i}'.format(prefix=prefix, i=i) + key = "{prefix}level_{i}".format(prefix=prefix, i=i) level = i level_values = axis_index.get_level_values(level) @@ -436,8 +494,7 @@ def _get_space_character_free_column_resolvers(self): """ from pandas.core.computation.common import _remove_spaces_column_name - return {_remove_spaces_column_name(k): v for k, v - in self.iteritems()} + return {_remove_spaces_column_name(k): v for k, v in self.iteritems()} @property def _info_axis(self): @@ -525,7 +582,7 @@ def _expand_axes(self, key): for k, ax in zip(key, self.axes): if k not in ax: if type(k) != ax.dtype.type: - ax = ax.astype('O') + ax = ax.astype("O") new_axes.append(ax.insert(len(ax), k)) else: new_axes.append(ax) @@ -631,17 +688,21 @@ def set_axis(self, labels, axis=0, inplace=None): warnings.warn( 'set_axis now takes "labels" as first argument, and ' '"axis" as named parameter. 
The old form, with "axis" as ' - 'first parameter and \"labels\" as second, is still supported ' - 'but will be deprecated in a future version of pandas.', - FutureWarning, stacklevel=2) + 'first parameter and "labels" as second, is still supported ' + "but will be deprecated in a future version of pandas.", + FutureWarning, + stacklevel=2, + ) labels, axis = axis, labels if inplace is None: warnings.warn( - 'set_axis currently defaults to operating inplace.\nThis ' - 'will change in a future version of pandas, use ' - 'inplace=True to avoid this warning.', - FutureWarning, stacklevel=2) + "set_axis currently defaults to operating inplace.\nThis " + "will change in a future version of pandas, use " + "inplace=True to avoid this warning.", + FutureWarning, + stacklevel=2, + ) inplace = True if inplace: setattr(self, self._get_axis_name(axis), labels) @@ -678,21 +739,21 @@ def transpose(self, *args, **kwargs): """ # construct the args - axes, kwargs = self._construct_axes_from_arguments(args, kwargs, - require_all=True) - axes_names = tuple(self._get_axis_name(axes[a]) - for a in self._AXIS_ORDERS) - axes_numbers = tuple(self._get_axis_number(axes[a]) - for a in self._AXIS_ORDERS) + axes, kwargs = self._construct_axes_from_arguments( + args, kwargs, require_all=True + ) + axes_names = tuple(self._get_axis_name(axes[a]) for a in self._AXIS_ORDERS) + axes_numbers = tuple(self._get_axis_number(axes[a]) for a in self._AXIS_ORDERS) # we must have unique axes if len(axes) != len(set(axes)): - raise ValueError('Must specify %s unique axes' % self._AXIS_LEN) + raise ValueError("Must specify %s unique axes" % self._AXIS_LEN) - new_axes = self._construct_axes_dict_from(self, [self._get_axis(x) - for x in axes_names]) + new_axes = self._construct_axes_dict_from( + self, [self._get_axis(x) for x in axes_names] + ) new_values = self.values.transpose(axes_numbers) - if kwargs.pop('copy', None) or (len(args) and args[-1]): + if kwargs.pop("copy", None) or (len(args) and args[-1]): new_values = new_values.copy() nv.validate_transpose(tuple(), kwargs) @@ -716,8 +777,7 @@ def swapaxes(self, axis1, axis2, copy=True): mapping = {i: j, j: i} - new_axes = (self._get_axis(mapping.get(k, k)) - for k in range(self._AXIS_LEN)) + new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)) new_values = self.values.swapaxes(i, j) if copy: new_values = new_values.copy() @@ -938,12 +998,14 @@ def squeeze(self, axis=None): >>> df_0a.squeeze() 1 """ - axis = (self._AXIS_NAMES if axis is None else - (self._get_axis_number(axis),)) + axis = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),) try: return self.iloc[ - tuple(0 if i in axis and len(a) == 1 else slice(None) - for i, a in enumerate(self.axes))] + tuple( + 0 if i in axis and len(a) == 1 else slice(None) + for i, a in enumerate(self.axes) + ) + ] except Exception: return self @@ -1088,21 +1150,23 @@ def rename(self, *args, **kwargs): See the :ref:`user guide ` for more. 
""" axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - copy = kwargs.pop('copy', True) - inplace = kwargs.pop('inplace', False) - level = kwargs.pop('level', None) - axis = kwargs.pop('axis', None) - errors = kwargs.pop('errors', 'ignore') + copy = kwargs.pop("copy", True) + inplace = kwargs.pop("inplace", False) + level = kwargs.pop("level", None) + axis = kwargs.pop("axis", None) + errors = kwargs.pop("errors", "ignore") if axis is not None: # Validate the axis self._get_axis_number(axis) if kwargs: - raise TypeError('rename() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "rename() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) if com.count_not_none(*axes.values()) == 0: - raise TypeError('must pass an index to rename') + raise TypeError("must pass an index to rename") self._consolidate_inplace() result = self if inplace else self.copy(deep=copy) @@ -1120,14 +1184,15 @@ def rename(self, *args, **kwargs): # GH 13473 if not callable(v): indexer = self.axes[axis].get_indexer_for(v) - if errors == 'raise' and len(indexer[indexer == -1]): - missing_labels = [label for index, label in enumerate(v) - if indexer[index] == -1] - raise KeyError('{} not found in axis' - .format(missing_labels)) - - result._data = result._data.rename_axis(f, axis=baxis, copy=copy, - level=level) + if errors == "raise" and len(indexer[indexer == -1]): + missing_labels = [ + label for index, label in enumerate(v) if indexer[index] == -1 + ] + raise KeyError("{} not found in axis".format(missing_labels)) + + result._data = result._data.rename_axis( + f, axis=baxis, copy=copy, level=level + ) result._clear_item_cache() if inplace: @@ -1135,8 +1200,7 @@ def rename(self, *args, **kwargs): else: return result.__finalize__(self) - @rewrite_axis_style_signature('mapper', [('copy', True), - ('inplace', False)]) + @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) def rename_axis(self, mapper=sentinel, **kwargs): """ Set the name of the axis for the index or columns. @@ -1262,28 +1326,31 @@ class name monkey 2 2 """ axes, kwargs = self._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel) - copy = kwargs.pop('copy', True) - inplace = kwargs.pop('inplace', False) - axis = kwargs.pop('axis', 0) + (), kwargs, sentinel=sentinel + ) + copy = kwargs.pop("copy", True) + inplace = kwargs.pop("inplace", False) + axis = kwargs.pop("axis", 0) if axis is not None: axis = self._get_axis_number(axis) if kwargs: - raise TypeError('rename_axis() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "rename_axis() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if mapper is not sentinel: # Use v0.23 behavior if a scalar or list - non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not - is_dict_like(mapper)) + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) if non_mapper: return self._set_axis_name(mapper, axis=axis, inplace=inplace) else: - raise ValueError("Use `.rename` to alter labels " - "with a mapper.") + raise ValueError("Use `.rename` to alter labels " "with a mapper.") else: # Use new behavior. 
Means that index and/or columns # is specified @@ -1293,16 +1360,14 @@ class name v = axes.get(self._AXIS_NAMES[axis]) if v is sentinel: continue - non_mapper = is_scalar(v) or (is_list_like(v) and not - is_dict_like(v)) + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) if non_mapper: newnames = v else: f = com._get_rename_function(v) curnames = self._get_axis(axis).names newnames = [f(name) for name in curnames] - result._set_axis_name(newnames, axis=axis, - inplace=True) + result._set_axis_name(newnames, axis=axis, inplace=True) if not inplace: return result @@ -1361,7 +1426,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): axis = self._get_axis_number(axis) idx = self._get_axis(axis).set_names(name) - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") renamed = self if inplace else self.copy() renamed.set_axis(idx, axis=axis, inplace=True) if not inplace: @@ -1371,8 +1436,9 @@ def _set_axis_name(self, name, axis=0, inplace=False): # Comparison Methods def _indexed_same(self, other): - return all(self._get_axis(a).equals(other._get_axis(a)) - for a in self._AXIS_ORDERS) + return all( + self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS + ) def equals(self, other): """ @@ -1467,24 +1533,32 @@ def __neg__(self): values = com.values_from_object(self) if is_bool_dtype(values): arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): arr = operator.neg(values) else: - raise TypeError("Unary negative expects numeric dtype, not {}" - .format(values.dtype)) + raise TypeError( + "Unary negative expects numeric dtype, not {}".format(values.dtype) + ) return self.__array_wrap__(arr) def __pos__(self): values = com.values_from_object(self) - if (is_bool_dtype(values) or is_period_arraylike(values)): + if is_bool_dtype(values) or is_period_arraylike(values): arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): + elif ( + is_numeric_dtype(values) + or is_timedelta64_dtype(values) + or is_object_dtype(values) + ): arr = operator.pos(values) else: - raise TypeError("Unary plus expects numeric dtype, not {}" - .format(values.dtype)) + raise TypeError( + "Unary plus expects numeric dtype, not {}".format(values.dtype) + ) return self.__array_wrap__(arr) def __invert__(self): @@ -1500,9 +1574,12 @@ def __invert__(self): raise def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) + raise ValueError( + "The truth value of a {0} is ambiguous. 
" + "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( + self.__class__.__name__ + ) + ) __bool__ = __nonzero__ @@ -1523,8 +1600,10 @@ def bool(self): if isinstance(v, (bool, np.bool_)): return bool(v) elif is_scalar(v): - raise ValueError("bool cannot act on a non-boolean single element " - "{0}".format(self.__class__.__name__)) + raise ValueError( + "bool cannot act on a non-boolean single element " + "{0}".format(self.__class__.__name__) + ) self.__nonzero__() @@ -1565,10 +1644,12 @@ def _is_level_reference(self, key, axis=0): """ axis = self._get_axis_number(axis) - return (key is not None and - is_hashable(key) and - key in self.axes[axis].names and - not self._is_label_reference(key, axis=axis)) + return ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and not self._is_label_reference(key, axis=axis) + ) def _is_label_reference(self, key, axis=0): """ @@ -1593,9 +1674,11 @@ def _is_label_reference(self, key, axis=0): axis = self._get_axis_number(axis) other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) - return (key is not None and - is_hashable(key) and - any(key in self.axes[ax] for ax in other_axes)) + return ( + key is not None + and is_hashable(key) + and any(key in self.axes[ax] for ax in other_axes) + ) def _is_label_or_level_reference(self, key, axis=0): """ @@ -1617,8 +1700,9 @@ def _is_label_or_level_reference(self, key, axis=0): ------- is_label_or_level: bool """ - return (self._is_level_reference(key, axis=axis) or - self._is_label_reference(key, axis=axis)) + return self._is_level_reference(key, axis=axis) or self._is_label_reference( + key, axis=axis + ) def _check_label_or_level_ambiguity(self, key, axis=0): """ @@ -1641,27 +1725,32 @@ def _check_label_or_level_ambiguity(self, key, axis=0): axis = self._get_axis_number(axis) other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis) - if (key is not None and - is_hashable(key) and - key in self.axes[axis].names and - any(key in self.axes[ax] for ax in other_axes)): + if ( + key is not None + and is_hashable(key) + and key in self.axes[axis].names + and any(key in self.axes[ax] for ax in other_axes) + ): # Build an informative and grammatical warning - level_article, level_type = (('an', 'index') - if axis == 0 else - ('a', 'column')) - - label_article, label_type = (('a', 'column') - if axis == 0 else - ('an', 'index')) - - msg = ("'{key}' is both {level_article} {level_type} level and " - "{label_article} {label_type} label, which is ambiguous." - ).format(key=key, - level_article=level_article, - level_type=level_type, - label_article=label_article, - label_type=label_type) + level_article, level_type = ( + ("an", "index") if axis == 0 else ("a", "column") + ) + + label_article, label_type = ( + ("a", "column") if axis == 0 else ("an", "index") + ) + + msg = ( + "'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label, which is ambiguous." 
+ ).format( + key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type, + ) raise ValueError(msg) def _get_label_or_level_values(self, key, axis=0): @@ -1712,21 +1801,27 @@ def _get_label_or_level_values(self, key, axis=0): # Check for duplicates if values.ndim > 1: - if other_axes and isinstance( - self._get_axis(other_axes[0]), MultiIndex): - multi_message = ('\n' - 'For a multi-index, the label must be a ' - 'tuple with elements corresponding to ' - 'each level.') + if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex): + multi_message = ( + "\n" + "For a multi-index, the label must be a " + "tuple with elements corresponding to " + "each level." + ) else: - multi_message = '' - - label_axis_name = 'column' if axis == 0 else 'index' - raise ValueError(("The {label_axis_name} label '{key}' " - "is not unique.{multi_message}") - .format(key=key, - label_axis_name=label_axis_name, - multi_message=multi_message)) + multi_message = "" + + label_axis_name = "column" if axis == 0 else "index" + raise ValueError( + ( + "The {label_axis_name} label '{key}' " + "is not unique.{multi_message}" + ).format( + key=key, + label_axis_name=label_axis_name, + multi_message=multi_message, + ) + ) return values @@ -1760,21 +1855,22 @@ def _drop_labels_or_levels(self, keys, axis=0): # Validate keys keys = com.maybe_make_list(keys) - invalid_keys = [k for k in keys if not - self._is_label_or_level_reference(k, axis=axis)] + invalid_keys = [ + k for k in keys if not self._is_label_or_level_reference(k, axis=axis) + ] if invalid_keys: - raise ValueError(("The following keys are not valid labels or " - "levels for axis {axis}: {invalid_keys}") - .format(axis=axis, - invalid_keys=invalid_keys)) + raise ValueError( + ( + "The following keys are not valid labels or " + "levels for axis {axis}: {invalid_keys}" + ).format(axis=axis, invalid_keys=invalid_keys) + ) # Compute levels and labels to drop - levels_to_drop = [k for k in keys - if self._is_level_reference(k, axis=axis)] + levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)] - labels_to_drop = [k for k in keys - if not self._is_level_reference(k, axis=axis)] + labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)] # Perform copy upfront and then use inplace operations below. # This ensures that we always perform exactly one copy. @@ -1810,8 +1906,10 @@ def _drop_labels_or_levels(self, keys, axis=0): # Iteration def __hash__(self): - raise TypeError('{0!r} objects are mutable, thus they cannot be' - ' hashed'.format(self.__class__.__name__)) + raise TypeError( + "{0!r} objects are mutable, thus they cannot be" + " hashed".format(self.__class__.__name__) + ) def __iter__(self): """ @@ -1937,9 +2035,12 @@ def to_dense(self): %(klass)s Dense %(klass)s. 
""" - warnings.warn("DataFrame/Series.to_dense is deprecated " - "and will be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame/Series.to_dense is deprecated " + "and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) # compat return self @@ -1948,15 +2049,14 @@ def to_dense(self): def __getstate__(self): meta = {k: getattr(self, k, None) for k in self._metadata} - return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, - **meta) + return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta) def __setstate__(self, state): if isinstance(state, BlockManager): self._data = state elif isinstance(state, dict): - typ = state.get('_typ') + typ = state.get("_typ") if typ is not None: # set in the order of internal names @@ -1996,15 +2096,15 @@ def __setstate__(self, state): def __repr__(self): # string representation based upon iterating over self # (since, by definition, `PandasContainers` are iterable) - prepr = '[%s]' % ','.join(map(pprint_thing, self)) - return '%s(%s)' % (self.__class__.__name__, prepr) + prepr = "[%s]" % ",".join(map(pprint_thing, self)) + return "%s(%s)" % (self.__class__.__name__, prepr) def _repr_latex_(self): """ Returns a LaTeX representation for a particular object. Mainly for use with nbconvert (jupyter notebook conversion to pdf). """ - if config.get_option('display.latex.repr'): + if config.get_option("display.latex.repr"): return self.to_latex() else: return None @@ -2015,15 +2115,18 @@ def _repr_data_resource_(self): naming convention. """ if config.get_option("display.html.table_schema"): - data = self.head(config.get_option('display.max_rows')) - payload = json.loads(data.to_json(orient='table'), - object_pairs_hook=collections.OrderedDict) + data = self.head(config.get_option("display.max_rows")) + payload = json.loads( + data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict + ) return payload # ---------------------------------------------------------------------- # I/O Methods - _shared_docs['to_excel'] = """ + _shared_docs[ + "to_excel" + ] = """ Write %(klass)s to an Excel sheet. 
To write a single %(klass)s to an Excel .xlsx file it is only necessary to @@ -2128,28 +2231,62 @@ def _repr_data_resource_(self): """ @Appender(_shared_docs["to_excel"] % dict(klass="object")) - def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep="inf", verbose=True, - freeze_panes=None): + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) - - def to_json(self, path_or_buf=None, orient=None, date_format=None, - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression='infer', - index=True): + + formatter = ExcelFormatter( + df, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) + + def to_json( + self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression="infer", + index=True, + ): """ Convert the object to a JSON string. @@ -2286,17 +2423,24 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None, """ from pandas.io import json - if date_format is None and orient == 'table': - date_format = 'iso' + + if date_format is None and orient == "table": + date_format = "iso" elif date_format is None: - date_format = 'epoch' - return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, - date_format=date_format, - double_precision=double_precision, - force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler, - lines=lines, compression=compression, - index=index) + date_format = "epoch" + return json.to_json( + path_or_buf=path_or_buf, + obj=self, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, + ) def to_hdf(self, path_or_buf, key, **kwargs): """ @@ -2400,9 +2544,10 @@ def to_hdf(self, path_or_buf, key, **kwargs): >>> os.remove('data.h5') """ from pandas.io import pytables + pytables.to_hdf(path_or_buf, key, self, **kwargs) - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): + def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): """ Serialize object to input file path using msgpack format. 
@@ -2429,11 +2574,21 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): """ from pandas.io import packers - return packers.to_msgpack(path_or_buf, self, encoding=encoding, - **kwargs) - def to_sql(self, name, con, schema=None, if_exists='fail', index=True, - index_label=None, chunksize=None, dtype=None, method=None): + return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) + + def to_sql( + self, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. @@ -2561,12 +2716,21 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True, [(1,), (None,), (2,)] """ from pandas.io import sql - sql.to_sql(self, name, con, schema=schema, if_exists=if_exists, - index=index, index_label=index_label, chunksize=chunksize, - dtype=dtype, method=method) - def to_pickle(self, path, compression='infer', - protocol=pickle.HIGHEST_PROTOCOL): + sql.to_sql( + self, + name, + con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): """ Pickle (serialize) object to file. @@ -2621,6 +2785,7 @@ def to_pickle(self, path, compression='infer', >>> os.remove("./dummy.pkl") """ from pandas.io.pickle import to_pickle + to_pickle(self, path, compression=compression, protocol=protocol) def to_clipboard(self, excel=True, sep=None, **kwargs): @@ -2678,6 +2843,7 @@ def to_clipboard(self, excel=True, sep=None, **kwargs): ... # 4,5,6 """ from pandas.io import clipboards + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) def to_xarray(self): @@ -2762,12 +2928,28 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' else: return xarray.Dataset.from_dataframe(self) - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, bold_rows=False, - column_format=None, longtable=None, escape=None, - encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): r""" Render an object to a LaTeX tabular environment table. 
@@ -2879,34 +3061,60 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, if multicolumn is None: multicolumn = config.get_option("display.latex.multicolumn") if multicolumn_format is None: - multicolumn_format = config.get_option( - "display.latex.multicolumn_format") + multicolumn_format = config.get_option("display.latex.multicolumn_format") if multirow is None: multirow = config.get_option("display.latex.multirow") - formatter = DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - header=header, index=index, - formatters=formatters, - float_format=float_format, - bold_rows=bold_rows, - sparsify=sparsify, - index_names=index_names, - escape=escape, decimal=decimal) - formatter.to_latex(column_format=column_format, longtable=longtable, - encoding=encoding, multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) + formatter = DataFrameFormatter( + self, + buf=buf, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape, + decimal=decimal, + ) + formatter.to_latex( + column_format=column_format, + longtable=longtable, + encoding=encoding, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + ) if buf is None: return formatter.buf.getvalue() - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, - columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression='infer', quoting=None, - quotechar='"', line_terminator=None, chunksize=None, - date_format=None, doublequote=True, - escapechar=None, decimal='.'): + def to_csv( + self, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + quotechar='"', + line_terminator=None, + chunksize=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + ): r""" Write object to a comma-separated values (csv) file. 
@@ -3012,17 +3220,29 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter(df, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) + + formatter = CSVFormatter( + df, + path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + compression=compression, + quoting=quoting, + na_rep=na_rep, + float_format=float_format, + cols=columns, + header=header, + index=index, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + decimal=decimal, + ) formatter.save() if path_or_buf is None: @@ -3082,7 +3302,7 @@ def _set_as_cached(self, item, cacher): def _reset_cacher(self): """Reset the cacher.""" - if hasattr(self, '_cacher'): + if hasattr(self, "_cacher"): del self._cacher def _iget_item_cache(self, item): @@ -3105,11 +3325,11 @@ def _maybe_cache_changed(self, item, value): @property def _is_cached(self): """Return boolean indicating if self is cached or not.""" - return getattr(self, '_cacher', None) is not None + return getattr(self, "_cacher", None) is not None def _get_cacher(self): """return my cacher or None""" - cacher = getattr(self, '_cacher', None) + cacher = getattr(self, "_cacher", None) if cacher is not None: cacher = cacher[1]() return cacher @@ -3133,7 +3353,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): """ - cacher = getattr(self, '_cacher', None) + cacher = getattr(self, "_cacher", None) if cacher is not None: ref = cacher[1]() @@ -3148,7 +3368,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): pass if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t='referant') + self._check_setitem_copy(stacklevel=5, t="referant") if clear: self._clear_item_cache() @@ -3202,14 +3422,13 @@ def _check_is_chained_assignment_possible(self): if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t='referant', - force=True) + self._check_setitem_copy(stacklevel=4, t="referant", force=True) return True elif self._is_copy: - self._check_setitem_copy(stacklevel=4, t='referant') + self._check_setitem_copy(stacklevel=4, t="referant") return False - def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): + def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): """ Parameters @@ -3244,7 +3463,7 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): if not (force or self._is_copy): return - value = config.get_option('mode.chained_assignment') + value = config.get_option("mode.chained_assignment") if value is None: return @@ -3260,30 +3479,31 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): if isinstance(self._is_copy, str): t = self._is_copy - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - 
"indexing.html#returning-a-view-versus-a-copy" - ) + elif t == "referant": + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame\n\n" + "See the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value " - "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" - "indexing.html#returning-a-view-versus-a-copy" - ) - - if value == 'raise': + t = ( + "\n" + "A value is trying to be set on a copy of a slice from a " + "DataFrame.\n" + "Try using .loc[row_indexer,col_indexer] = value " + "instead\n\nSee the caveats in the documentation: " + "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "indexing.html#returning-a-view-versus-a-copy" + ) + + if value == "raise": raise com.SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, com.SettingWithCopyWarning, - stacklevel=stacklevel) + elif value == "warn": + warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) def __delitem__(self, key): """ @@ -3292,7 +3512,7 @@ def __delitem__(self, key): deleted = False maybe_shortcut = False - if hasattr(self, 'columns') and isinstance(self.columns, MultiIndex): + if hasattr(self, "columns") and isinstance(self.columns, MultiIndex): try: maybe_shortcut = key not in self.columns._engine except TypeError: @@ -3302,9 +3522,9 @@ def __delitem__(self, key): # Allow shorthand to delete all columns whose first len(key) # elements match key: if not isinstance(key, tuple): - key = (key, ) + key = (key,) for col in self.columns: - if isinstance(col, tuple) and col[:len(key)] == key: + if isinstance(col, tuple) and col[: len(key)] == key: del self[col] deleted = True if not deleted: @@ -3353,9 +3573,9 @@ def _take(self, indices, axis=0, is_copy=True): """ self._consolidate_inplace() - new_data = self._data.take(indices, - axis=self._get_block_manager_axis(axis), - verify=True) + new_data = self._data.take( + indices, axis=self._get_block_manager_axis(axis), verify=True + ) result = self._constructor(new_data).__finalize__(self) # Maybe set copy if we didn't actually change the index. 
@@ -3545,8 +3765,7 @@ class animal locomotion axis = self._get_axis_number(axis) labels = self._get_axis(axis) if level is not None: - loc, new_ax = labels.get_loc_level(key, level=level, - drop_level=drop_level) + loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level) # create the tuple of the indexer indexer = [slice(None)] * self.ndim @@ -3564,8 +3783,7 @@ class animal locomotion index = self.index if isinstance(index, MultiIndex): - loc, new_index = self.index.get_loc_level(key, - drop_level=drop_level) + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) else: loc = self.index.get_loc(key) @@ -3591,8 +3809,11 @@ class animal locomotion return com.maybe_box_datetimelike(new_values) result = self._constructor_sliced( - new_values, index=self.columns, - name=self.index[loc], dtype=new_values.dtype) + new_values, + index=self.columns, + name=self.index[loc], + dtype=new_values.dtype, + ) else: result = self.iloc[loc] @@ -3605,8 +3826,7 @@ class animal locomotion _xs = xs # type: Callable - def reindex_like(self, other, method=None, copy=True, limit=None, - tolerance=None): + def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): """ Return an object with matching indices as other object. @@ -3701,28 +3921,42 @@ def reindex_like(self, other, method=None, copy=True, limit=None, 2014-02-14 NaN NaN NaN 2014-02-15 35.1 NaN medium """ - d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method, - copy=copy, limit=limit, - tolerance=tolerance) + d = other._construct_axes_dict( + axes=self._AXIS_ORDERS, + method=method, + copy=copy, + limit=limit, + tolerance=tolerance, + ) return self.reindex(**d) - def drop(self, labels=None, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if labels is not None: if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and " - "'index'/'columns'") + raise ValueError( + "Cannot specify both 'labels' and " "'index'/'columns'" + ) axis_name = self._get_axis_name(axis) axes = {axis_name: labels} elif index is not None or columns is not None: axes, _ = self._construct_axes_from_arguments((index, columns), {}) else: - raise ValueError("Need to specify at least one of 'labels', " - "'index' or 'columns'") + raise ValueError( + "Need to specify at least one of 'labels', " "'index' or 'columns'" + ) obj = self @@ -3735,7 +3969,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, level=None, else: return obj - def _drop_axis(self, labels, axis, level=None, errors='raise'): + def _drop_axis(self, labels, axis, level=None, errors="raise"): """ Drop labels from specified axis. Used in the ``drop`` method internally. 
@@ -3757,7 +3991,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): if axis.is_unique: if level is not None: if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') + raise AssertionError("axis must be a MultiIndex") new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) @@ -3768,18 +4002,18 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): labels = ensure_object(com.index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): - raise AssertionError('axis must be a MultiIndex') + raise AssertionError("axis must be a MultiIndex") indexer = ~axis.get_level_values(level).isin(labels) # GH 18561 MultiIndex.drop should raise if label is absent - if errors == 'raise' and indexer.all(): - raise KeyError('{} not found in axis'.format(labels)) + if errors == "raise" and indexer.all(): + raise KeyError("{} not found in axis".format(labels)) else: indexer = ~axis.isin(labels) # Check if label doesn't exist along axis labels_missing = (axis.get_indexer_for(labels) == -1).any() - if errors == 'raise' and labels_missing: - raise KeyError('{} not found in axis'.format(labels)) + if errors == "raise" and labels_missing: + raise KeyError("{} not found in axis".format(labels)) slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer @@ -3803,7 +4037,7 @@ def _update_inplace(self, result, verify_is_copy=True): self._reset_cache() self._clear_item_cache() - self._data = getattr(result, '_data', result) + self._data = getattr(result, "_data", result) self._maybe_update_cacher(verify_is_copy=verify_is_copy) def add_prefix(self, prefix): @@ -3860,7 +4094,7 @@ def add_prefix(self, prefix): 2 3 5 3 4 6 """ - f = functools.partial('{prefix}{}'.format, prefix=prefix) + f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} return self.rename(**mapper) @@ -3919,13 +4153,20 @@ def add_suffix(self, suffix): 2 3 5 3 4 6 """ - f = functools.partial('{}{suffix}'.format, suffix=suffix) + f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} return self.rename(**mapper) - def sort_values(self, by=None, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + def sort_values( + self, + by=None, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): """ Sort by the values along either axis. @@ -4015,8 +4256,16 @@ def sort_values(self, by=None, axis=0, ascending=True, inplace=False, """ raise AbstractMethodError(self) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ): """ Sort object by labels (along an axis). @@ -4048,7 +4297,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, sorted_obj : DataFrame or None DataFrame with sorted index if inplace=False, None otherwise. 
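[Editor's sketch, not part of the patch: `add_prefix`/`add_suffix` above build their renaming mappers from `functools.partial` over `str.format`. The observable effect, using a throwaway Series:]

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.add_prefix("item_")   # index labels become item_0, item_1, item_2
    s.add_suffix("_count")  # index labels become 0_count, 1_count, 2_count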
""" - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) labels = self._get_axis(axis) @@ -4273,12 +4522,12 @@ def reindex(self, *args, **kwargs): # construct the args axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - method = missing.clean_reindex_fill_method(kwargs.pop('method', None)) - level = kwargs.pop('level', None) - copy = kwargs.pop('copy', True) - limit = kwargs.pop('limit', None) - tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', None) + method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -4286,15 +4535,20 @@ def reindex(self, *args, **kwargs): kwargs.pop("axis", None) if kwargs: - raise TypeError('reindex() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "reindex() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) self._consolidate_inplace() # if all axes that are requested to reindex are equal, then only copy # if indicated must have index names equal here as well as values - if all(self._get_axis(axis).identical(ax) - for axis, ax in axes.items() if ax is not None): + if all( + self._get_axis(axis).identical(ax) + for axis, ax in axes.items() + if ax is not None + ): if copy: return self.copy() return self @@ -4307,11 +4561,11 @@ def reindex(self, *args, **kwargs): pass # perform the reindex on the axes - return self._reindex_axes(axes, level, limit, tolerance, method, - fill_value, copy).__finalize__(self) + return self._reindex_axes( + axes, level, limit, tolerance, method, fill_value, copy + ).__finalize__(self) - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, - copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4320,26 +4574,35 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, continue ax = self._get_axis(a) - new_index, indexer = ax.reindex(labels, level=level, limit=limit, - tolerance=tolerance, method=method) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, tolerance=tolerance, method=method + ) axis = self._get_axis_number(a) - obj = obj._reindex_with_indexers({axis: [new_index, indexer]}, - fill_value=fill_value, - copy=copy, allow_dups=False) + obj = obj._reindex_with_indexers( + {axis: [new_index, indexer]}, + fill_value=fill_value, + copy=copy, + allow_dups=False, + ) return obj def _needs_reindex_multi(self, axes, method, level): """Check if we do need a multi reindex.""" - return ((com.count_not_none(*axes.values()) == self._AXIS_LEN) and - method is None and level is None and not self._is_mixed_type) + return ( + (com.count_not_none(*axes.values()) == self._AXIS_LEN) + and method is None + and level is None + and not self._is_mixed_type + ) def _reindex_multi(self, axes, copy, fill_value): return NotImplemented - def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, - allow_dups=False): + def _reindex_with_indexers( + self, reindexers, fill_value=None, 
copy=False, allow_dups=False + ): """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated @@ -4356,10 +4619,14 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, indexer = ensure_int64(indexer) # TODO: speed up on homogeneous DataFrame objects - new_data = new_data.reindex_indexer(index, indexer, axis=baxis, - fill_value=fill_value, - allow_dups=allow_dups, - copy=copy) + new_data = new_data.reindex_indexer( + index, + indexer, + axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy, + ) if copy and new_data is self._data: new_data = new_data.copy() @@ -4429,8 +4696,10 @@ def filter(self, items=None, like=None, regex=None, axis=None): nkw = com.count_not_none(items, like, regex) if nkw > 1: - raise TypeError('Keyword arguments `items`, `like`, or `regex` ' - 'are mutually exclusive') + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` " + "are mutually exclusive" + ) if axis is None: axis = self._info_axis_name @@ -4438,21 +4707,24 @@ def filter(self, items=None, like=None, regex=None, axis=None): if items is not None: name = self._get_axis_name(axis) - return self.reindex( - **{name: [r for r in items if r in labels]}) + return self.reindex(**{name: [r for r in items if r in labels]}) elif like: + def f(x): return like in ensure_str(x) + values = labels.map(f) return self.loc(axis=axis)[values] elif regex: + def f(x): return matcher.search(ensure_str(x)) is not None + matcher = re.compile(regex) values = labels.map(f) return self.loc(axis=axis)[values] else: - raise TypeError('Must pass either `items`, `like`, or `regex`') + raise TypeError("Must pass either `items`, `like`, or `regex`") def head(self, n=5): """ @@ -4574,8 +4846,15 @@ def tail(self, n=5): return self.iloc[0:0] return self.iloc[-n:] - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ): """ Return a random sample of items from an axis of object. @@ -4683,28 +4962,33 @@ def sample(self, n=None, frac=None, replace=False, weights=None, try: weights = self[weights] except KeyError: - raise KeyError("String passed to weights not a " - "valid column") + raise KeyError( + "String passed to weights not a " "valid column" + ) else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) else: - raise ValueError("Strings cannot be passed as weights " - "when sampling from a Series.") + raise ValueError( + "Strings cannot be passed as weights " + "when sampling from a Series." + ) - weights = pd.Series(weights, dtype='float64') + weights = pd.Series(weights, dtype="float64") if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") + raise ValueError( + "Weights and axis to be sampled must be of " "same length" + ) if (weights == np.inf).any() or (weights == -np.inf).any(): raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") + raise ValueError("weight vector many not include negative " "values") # If has nan, set to zero. 
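[Editor's sketch, not part of the patch: `filter` above accepts exactly one of `items`, `like`, or `regex`; passing more than one raises the TypeError shown. A short illustration with a hypothetical frame:]

    import pandas as pd

    df = pd.DataFrame({"one": [1], "two": [2], "three": [3]})

    df.filter(items=["one", "three"])  # keep only the listed columns
    df.filter(like="t")                # columns whose name contains "t"
    df.filter(regex="^t")              # columns whose name matches the regex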
weights = weights.fillna(0) @@ -4726,18 +5010,20 @@ def sample(self, n=None, frac=None, replace=False, weights=None, elif n is None and frac is not None: n = int(round(frac * axis_length)) elif n is not None and frac is not None: - raise ValueError('Please enter a value for `frac` OR `n`, not ' - 'both') + raise ValueError("Please enter a value for `frac` OR `n`, not " "both") # Check for negative sizes if n < 0: - raise ValueError("A negative number of rows requested. Please " - "provide positive value.") + raise ValueError( + "A negative number of rows requested. Please " "provide positive value." + ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis, is_copy=False) - _shared_docs['pipe'] = (r""" + _shared_docs[ + "pipe" + ] = r""" Apply func(self, \*args, \*\*kwargs). Parameters @@ -4786,13 +5072,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None, ... .pipe(g, arg1=a) ... .pipe((f, 'arg2'), arg1=a, arg3=c) ... ) - """) + """ - @Appender(_shared_docs['pipe'] % _shared_doc_kwargs) + @Appender(_shared_docs["pipe"] % _shared_doc_kwargs) def pipe(self, func, *args, **kwargs): return com._pipe(self, func, *args, **kwargs) - _shared_docs['aggregate'] = dedent(""" + _shared_docs["aggregate"] = dedent( + """ Aggregate using one or more operations over the specified axis. %(versionadded)s Parameters @@ -4830,9 +5117,12 @@ def pipe(self, func, *args, **kwargs): `agg` is an alias for `aggregate`. Use the alias. A passed user-defined-function will be passed a Series for evaluation. - %(examples)s""") + %(examples)s""" + ) - _shared_docs['transform'] = (""" + _shared_docs[ + "transform" + ] = """ Call ``func`` on self producing a %(klass)s with transformed values and that has the same axis length as self. @@ -4898,7 +5188,7 @@ def pipe(self, func, *args, **kwargs): 0 0.000000 1.000000 1 1.000000 2.718282 2 1.414214 7.389056 - """) + """ # ---------------------------------------------------------------------- # Attribute access @@ -4928,8 +5218,11 @@ def __getattr__(self, name): # Note: obj.x will always call obj.__getattribute__('x') prior to # calling obj.__getattr__('x'). - if (name in self._internal_names_set or name in self._metadata or - name in self._accessors): + if ( + name in self._internal_names_set + or name in self._metadata + or name in self._accessors + ): return object.__getattribute__(self, name) else: if self._info_axis._can_hold_identifiers_and_holds_name(name): @@ -4968,19 +5261,24 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) except (AttributeError, TypeError): if isinstance(self, ABCDataFrame) and (is_list_like(value)): - warnings.warn("Pandas doesn't allow columns to be " - "created via a new attribute name - see " - "https://pandas.pydata.org/pandas-docs/" - "stable/indexing.html#attribute-access", - stacklevel=2) + warnings.warn( + "Pandas doesn't allow columns to be " + "created via a new attribute name - see " + "https://pandas.pydata.org/pandas-docs/" + "stable/indexing.html#attribute-access", + stacklevel=2, + ) object.__setattr__(self, name, value) def _dir_additions(self): """ add the string-like attributes from the info_axis. If info_axis is a MultiIndex, it's first level values are used. 
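[Editor's sketch, not part of the patch: the `sample` weight handling above accepts a column name only when sampling DataFrame rows, rejects `inf` and negative weights, fills NaN weights with zero, and requires the weight vector to match the sampled axis length. A small illustration on made-up data:]

    import pandas as pd

    df = pd.DataFrame({"value": [10, 20, 30], "w": [0.1, 0.0, 0.9]})

    # weights given as a column name (row sampling on a DataFrame only)
    df.sample(n=2, weights="w", random_state=0)

    # or as an explicit sequence matching the length of the sampled axis
    df.sample(frac=1.0, replace=True, weights=[0.2, 0.3, 0.5], random_state=0)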
""" - additions = {c for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, str) and c.isidentifier()} + additions = { + c + for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } return super()._dir_additions().union(additions) # ---------------------------------------------------------------------- @@ -5021,7 +5319,7 @@ def _consolidate(self, inplace=False): ------- consolidated : same type as caller """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if inplace: self._consolidate_inplace() else: @@ -5057,14 +5355,15 @@ def _check_inplace_setting(self, value): except Exception: pass - raise TypeError('Cannot do inplace boolean setting on ' - 'mixed-types with a non np.nan value') + raise TypeError( + "Cannot do inplace boolean setting on " + "mixed-types with a non np.nan value" + ) return True def _get_numeric_data(self): - return self._constructor( - self._data.get_numeric_data()).__finalize__(self) + return self._constructor(self._data.get_numeric_data()).__finalize__(self) def _get_bool_data(self): return self._constructor(self._data.get_bool_data()).__finalize__(self) @@ -5111,11 +5410,14 @@ def as_matrix(self, columns=None): This method is provided for backwards compatibility. Generally, it is recommended to use '.values'. """ - warnings.warn("Method .as_matrix will be removed in a future version. " - "Use .values instead.", FutureWarning, stacklevel=2) + warnings.warn( + "Method .as_matrix will be removed in a future version. " + "Use .values instead.", + FutureWarning, + stacklevel=2, + ) self._consolidate_inplace() - return self._data.as_array(transpose=self._AXIS_REVERSED, - items=columns) + return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns) @property def values(self): @@ -5253,7 +5555,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.values' or 'np.asarray(..)' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -5292,11 +5596,15 @@ def get_dtype_counts(self): object 1 dtype: int64 """ - warnings.warn("`get_dtype_counts` has been deprecated and will be " - "removed in a future version. For DataFrames use " - "`.dtypes.value_counts()", FutureWarning, - stacklevel=2) + warnings.warn( + "`get_dtype_counts` has been deprecated and will be " + "removed in a future version. 
For DataFrames use " + "`.dtypes.value_counts()", + FutureWarning, + stacklevel=2, + ) from pandas import Series + return Series(self._data.get_dtype_counts()) def get_ftype_counts(self): @@ -5335,11 +5643,14 @@ def get_ftype_counts(self): object:dense 1 dtype: int64 """ - warnings.warn("get_ftype_counts is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "get_ftype_counts is deprecated and will " "be removed in a future version", + FutureWarning, + stacklevel=2, + ) from pandas import Series + return Series(self._data.get_ftype_counts()) @property @@ -5375,8 +5686,8 @@ def dtypes(self): dtype: object """ from pandas import Series - return Series(self._data.get_dtypes(), index=self._info_axis, - dtype=np.object_) + + return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_) @property def ftypes(self): @@ -5423,14 +5734,17 @@ def ftypes(self): 3 float64:sparse dtype: object """ - warnings.warn("DataFrame.ftypes is deprecated and will " - "be removed in a future version. " - "Use DataFrame.dtypes instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "DataFrame.ftypes is deprecated and will " + "be removed in a future version. " + "Use DataFrame.dtypes instead.", + FutureWarning, + stacklevel=2, + ) from pandas import Series - return Series(self._data.get_ftypes(), index=self._info_axis, - dtype=np.object_) + + return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_) def as_blocks(self, copy=True): """ @@ -5450,9 +5764,11 @@ def as_blocks(self, copy=True): ------- values : a dict of dtype -> Constructor Types """ - warnings.warn("as_blocks is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "as_blocks is deprecated and will " "be removed in a future version", + FutureWarning, + stacklevel=2, + ) return self._to_dict_of_blocks(copy=copy) @property @@ -5471,10 +5787,12 @@ def _to_dict_of_blocks(self, copy=True): Internal ONLY """ - return {k: self._constructor(v).__finalize__(self) - for k, v, in self._data.to_dict(copy=copy).items()} + return { + k: self._constructor(v).__finalize__(self) + for k, v, in self._data.to_dict(copy=copy).items() + } - def astype(self, dtype, copy=True, errors='raise', **kwargs): + def astype(self, dtype, copy=True, errors="raise", **kwargs): """ Cast a pandas object to a specified dtype ``dtype``. @@ -5579,33 +5897,43 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): if is_dict_like(dtype): if self.ndim == 1: # i.e. Series if len(dtype) > 1 or self.name not in dtype: - raise KeyError('Only the Series name can be used for ' - 'the key in Series dtype mappings.') + raise KeyError( + "Only the Series name can be used for " + "the key in Series dtype mappings." + ) new_type = dtype[self.name] return self.astype(new_type, copy, errors, **kwargs) for col_name in dtype.keys(): if col_name not in self: - raise KeyError('Only a column name can be used for the ' - 'key in a dtype mappings argument.') + raise KeyError( + "Only a column name can be used for the " + "key in a dtype mappings argument." 
+ ) results = [] for col_name, col in self.iteritems(): if col_name in dtype: - results.append(col.astype(dtype=dtype[col_name], copy=copy, - errors=errors, **kwargs)) + results.append( + col.astype( + dtype=dtype[col_name], copy=copy, errors=errors, **kwargs + ) + ) else: results.append(results.append(col.copy() if copy else col)) elif is_extension_array_dtype(dtype) and self.ndim > 1: # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: use iloc to handle duplicate column names - results = (self.iloc[:, i].astype(dtype, copy=copy) - for i in range(len(self.columns))) + results = ( + self.iloc[:, i].astype(dtype, copy=copy) + for i in range(len(self.columns)) + ) else: # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, - **kwargs) + new_data = self._data.astype( + dtype=dtype, copy=copy, errors=errors, **kwargs + ) return self._constructor(new_data).__finalize__(self) # GH 19920: retain column metadata after concat @@ -5735,8 +6063,9 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _convert(self, datetime=False, numeric=False, timedelta=False, - coerce=False, copy=True): + def _convert( + self, datetime=False, numeric=False, timedelta=False, coerce=False, copy=True + ): """ Attempt to infer better dtype for object columns @@ -5762,9 +6091,14 @@ def _convert(self, datetime=False, numeric=False, timedelta=False, converted : same as input object """ return self._constructor( - self._data.convert(datetime=datetime, numeric=numeric, - timedelta=timedelta, coerce=coerce, - copy=copy)).__finalize__(self) + self._data.convert( + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, + copy=copy, + ) + ).__finalize__(self) def infer_objects(self): """ @@ -5809,15 +6143,23 @@ def infer_objects(self): # python objects will still be converted to # native numpy numeric types return self._constructor( - self._data.convert(datetime=True, numeric=False, - timedelta=True, coerce=False, - copy=True)).__finalize__(self) + self._data.convert( + datetime=True, numeric=False, timedelta=True, coerce=False, copy=True + ) + ).__finalize__(self) # ---------------------------------------------------------------------- # Filling NA's - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None): + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ): """ Fill NA/NaN values using the specified method. 
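[Editor's sketch, not part of the patch: `astype` above special-cases dict arguments: for a Series the only valid key is the Series name, for a DataFrame every key must be an existing column, and each listed column is cast on its own. A minimal illustration, assuming the columns exist:]

    import pandas as pd

    df = pd.DataFrame({"a": ["1", "2"], "b": [1.5, 2.5]})

    # per-column casts via a mapping; columns not listed keep their dtype
    converted = df.astype({"a": "int64", "b": "float32"})
    print(converted.dtypes)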
@@ -5914,7 +6256,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, 2 NaN 1.0 NaN 5 3 NaN 3.0 NaN 4 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) self._consolidate_inplace() @@ -5926,6 +6268,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, axis = self._get_axis_number(axis) from pandas import DataFrame + if value is None: if self._is_mixed_type and axis == 1: @@ -5938,10 +6281,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return result - new_data = self._data.interpolate(method=method, axis=axis, - limit=limit, inplace=inplace, - coerce=True, - downcast=downcast) + new_data = self._data.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + coerce=True, + downcast=downcast, + ) else: if len(self._get_axis(axis)) == 0: return self @@ -5949,23 +6296,28 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, if self.ndim == 1: if isinstance(value, (dict, ABCSeries)): from pandas import Series + value = Series(value) elif not is_list_like(value): pass else: - raise TypeError('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a ' - '"{0}"'.format(type(value).__name__)) - - new_data = self._data.fillna(value=value, limit=limit, - inplace=inplace, - downcast=downcast) + raise TypeError( + '"value" parameter must be a scalar, dict ' + "or Series, but you passed a " + '"{0}"'.format(type(value).__name__) + ) + + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) elif isinstance(value, (dict, ABCSeries)): if axis == 1: - raise NotImplementedError('Currently only can fill ' - 'with dict/Series column ' - 'by column') + raise NotImplementedError( + "Currently only can fill " + "with dict/Series column " + "by column" + ) result = self if inplace else self.copy() for k, v in value.items(): @@ -5976,9 +6328,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return result if not inplace else None elif not is_list_like(value): - new_data = self._data.fillna(value=value, limit=limit, - inplace=inplace, - downcast=downcast) + new_data = self._data.fillna( + value=value, limit=limit, inplace=inplace, downcast=downcast + ) elif isinstance(value, DataFrame) and self.ndim == 2: new_data = self.where(self.notna(), value) else: @@ -5998,8 +6350,9 @@ def ffill(self, axis=None, inplace=False, limit=None, downcast=None): %(klass)s Object with missing values filled. """ - return self.fillna(method='ffill', axis=axis, inplace=inplace, - limit=limit, downcast=downcast) + return self.fillna( + method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) def bfill(self, axis=None, inplace=False, limit=None, downcast=None): """ @@ -6010,10 +6363,13 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): %(klass)s Object with missing values filled. """ - return self.fillna(method='bfill', axis=axis, inplace=inplace, - limit=limit, downcast=downcast) + return self.fillna( + method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast + ) - _shared_docs['replace'] = (""" + _shared_docs[ + "replace" + ] = """ Replace values given in `to_replace` with `value`. Values of the %(klass)s are replaced with other values dynamically. 
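[Editor's sketch, not part of the patch: the `fillna` paths above branch on whether a `value` or a `method` was supplied (never both), and `ffill`/`bfill` are thin wrappers passing `method=`. A short illustration on made-up data:]

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 3.0])
    s.fillna(0)               # fill with a scalar value
    s.fillna(method="ffill")  # propagate the last valid value; same as s.ffill()

    df = pd.DataFrame({"a": [np.nan, 2.0], "b": [np.nan, np.nan]})
    df.fillna({"a": 0.0, "b": 9.9})  # dict: a separate fill value per column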
@@ -6302,15 +6658,23 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): 3 b 4 b dtype: object - """) + """ - @Appender(_shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - inplace = validate_bool_kwarg(inplace, 'inplace') + @Appender(_shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: - raise AssertionError("'to_replace' must be 'None' if 'regex' is " - "not a bool") + raise AssertionError( + "'to_replace' must be 'None' if 'regex' is " "not a bool" + ) self._consolidate_inplace() @@ -6322,17 +6686,18 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if isinstance(to_replace, (tuple, list)): if isinstance(self, pd.DataFrame): - return self.apply(_single_replace, - args=(to_replace, method, inplace, - limit)) - return _single_replace(self, to_replace, method, inplace, - limit) + return self.apply( + _single_replace, args=(to_replace, method, inplace, limit) + ) + return _single_replace(self, to_replace, method, inplace, limit) if not is_dict_like(to_replace): if not is_dict_like(regex): - raise TypeError('If "to_replace" and "value" are both None' - ' and "to_replace" is not a list, then ' - 'regex must be a mapping') + raise TypeError( + 'If "to_replace" and "value" are both None' + ' and "to_replace" is not a list, then ' + "regex must be a mapping" + ) to_replace = regex regex = True @@ -6343,9 +6708,11 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if any(are_mappings): if not all(are_mappings): - raise TypeError("If a nested mapping is passed, all values" - " of the top level mapping must be " - "mappings") + raise TypeError( + "If a nested mapping is passed, all values" + " of the top level mapping must be " + "mappings" + ) # passed a nested dict/Series to_rep_dict = {} value_dict = {} @@ -6353,8 +6720,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for k, v in items: keys, values = list(zip(*v.items())) or ([], []) if set(keys) & set(values): - raise ValueError("Replacement not allowed with " - "overlapping keys and values") + raise ValueError( + "Replacement not allowed with " + "overlapping keys and values" + ) to_rep_dict[k] = list(keys) value_dict[k] = list(values) @@ -6362,8 +6731,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: to_replace, value = keys, values - return self.replace(to_replace, value, inplace=inplace, - limit=limit, regex=regex) + return self.replace( + to_replace, value, inplace=inplace, limit=limit, regex=regex + ) else: # need a non-zero len on all axes @@ -6379,55 +6749,67 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if c in value and c in self: # object conversion is handled in # series.replace which is called recursively - res[c] = res[c].replace(to_replace=src, - value=value[c], - inplace=False, - regex=regex) + res[c] = res[c].replace( + to_replace=src, + value=value[c], + inplace=False, + regex=regex, + ) return None if inplace else res # {'A': NA} -> 0 elif not is_list_like(value): - keys = [(k, src) for k, src in to_replace.items() - if k in self] + keys = [(k, src) for k, src in to_replace.items() if k in self] keys_len = len(keys) - 1 for i, (k, src) in 
enumerate(keys): convert = i == keys_len - new_data = new_data.replace(to_replace=src, - value=value, - filter=[k], - inplace=inplace, - regex=regex, - convert=convert) + new_data = new_data.replace( + to_replace=src, + value=value, + filter=[k], + inplace=inplace, + regex=regex, + convert=convert, + ) else: - raise TypeError('value argument must be scalar, dict, or ' - 'Series') + raise TypeError("value argument must be scalar, dict, or " "Series") elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] if is_list_like(value): if len(to_replace) != len(value): - raise ValueError('Replacement lists must match ' - 'in length. Expecting %d got %d ' % - (len(to_replace), len(value))) - - new_data = self._data.replace_list(src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex) + raise ValueError( + "Replacement lists must match " + "in length. Expecting %d got %d " + % (len(to_replace), len(value)) + ) + + new_data = self._data.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) else: # [NA, ''] -> 0 - new_data = self._data.replace(to_replace=to_replace, - value=value, inplace=inplace, - regex=regex) + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) elif to_replace is None: - if not (is_re_compilable(regex) or - is_list_like(regex) or is_dict_like(regex)): - raise TypeError("'regex' must be a string or a compiled " - "regular expression or a list or dict of " - "strings or regular expressions, you " - "passed a" - " {0!r}".format(type(regex).__name__)) - return self.replace(regex, value, inplace=inplace, limit=limit, - regex=True) + if not ( + is_re_compilable(regex) + or is_list_like(regex) + or is_dict_like(regex) + ): + raise TypeError( + "'regex' must be a string or a compiled " + "regular expression or a list or dict of " + "strings or regular expressions, you " + "passed a" + " {0!r}".format(type(regex).__name__) + ) + return self.replace( + regex, value, inplace=inplace, limit=limit, regex=True + ) else: # dest iterable dict-like @@ -6436,18 +6818,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, for k, v in value.items(): if k in self: - new_data = new_data.replace(to_replace=to_replace, - value=v, filter=[k], - inplace=inplace, - regex=regex) + new_data = new_data.replace( + to_replace=to_replace, + value=v, + filter=[k], + inplace=inplace, + regex=regex, + ) elif not is_list_like(value): # NA -> 0 - new_data = self._data.replace(to_replace=to_replace, - value=value, inplace=inplace, - regex=regex) + new_data = self._data.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) else: - msg = ('Invalid "to_replace" type: ' - '{0!r}').format(type(to_replace).__name__) + msg = ('Invalid "to_replace" type: ' "{0!r}").format( + type(to_replace).__name__ + ) raise TypeError(msg) # pragma: no cover if inplace: @@ -6455,7 +6841,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: return self._constructor(new_data).__finalize__(self) - _shared_docs['interpolate'] = """ + _shared_docs[ + "interpolate" + ] = """ Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. 
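[Editor's sketch, not part of the patch: the `replace` branches above distinguish flat mappings, nested mappings (split into per-column `to_replace`/`value` lists), equal-length list pairs, and regex mappings. The main spellings, on a throwaway frame:]

    import pandas as pd

    df = pd.DataFrame({"A": [0, 1, 2], "B": ["x", "y", "z"]})

    df.replace(0, 100)                   # scalar -> scalar
    df.replace([0, 1], [100, 101])       # lists must be the same length
    df.replace({"A": 0}, 100)            # {column: value to find} -> scalar
    df.replace({"A": {0: 100, 1: 101}})  # nested mapping, handled column by column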
@@ -6644,14 +7032,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, Name: d, dtype: float64 """ - @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, - downcast=None, **kwargs): + @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs + ): """ Interpolate values according to different methods. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if axis == 0: ax = self._info_axis_name @@ -6668,47 +7064,59 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, else: alt_ax = ax - if (isinstance(_maybe_transposed_self.index, MultiIndex) and - method != 'linear'): - raise ValueError("Only `method=linear` interpolation is supported " - "on MultiIndexes.") + if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear": + raise ValueError( + "Only `method=linear` interpolation is supported " "on MultiIndexes." + ) - if _maybe_transposed_self._data.get_dtype_counts().get( - 'object') == len(_maybe_transposed_self.T): - raise TypeError("Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype.") + if _maybe_transposed_self._data.get_dtype_counts().get("object") == len( + _maybe_transposed_self.T + ): + raise TypeError( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." + ) # create/use the index - if method == 'linear': + if method == "linear": # prior default index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax))) else: index = _maybe_transposed_self._get_axis(alt_ax) methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( - is_numeric_dtype(index) or - is_datetime64_dtype(index) or - is_timedelta64_dtype(index) + is_numeric_dtype(index) + or is_datetime64_dtype(index) + or is_timedelta64_dtype(index) ) if method not in methods and not is_numeric_or_datetime: raise ValueError( "Index column must be numeric or datetime type when " "using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method)) + "interpolating.".format(method=method) + ) if isna(index).any(): - raise NotImplementedError("Interpolation with NaNs in the index " - "has not been implemented. Try filling " - "those NaNs before interpolating.") + raise NotImplementedError( + "Interpolation with NaNs in the index " + "has not been implemented. Try filling " + "those NaNs before interpolating." 
+ ) data = _maybe_transposed_self._data - new_data = data.interpolate(method=method, axis=ax, index=index, - values=_maybe_transposed_self, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - inplace=inplace, downcast=downcast, - **kwargs) + new_data = data.interpolate( + method=method, + axis=ax, + index=index, + values=_maybe_transposed_self, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + inplace=inplace, + downcast=downcast, + **kwargs + ) if inplace: if axis == 1: @@ -6823,6 +7231,7 @@ def asof(self, where, subset=None): """ if isinstance(where, str): from pandas import to_datetime + where = to_datetime(where) if not self.index.is_monotonic: @@ -6848,6 +7257,7 @@ def asof(self, where, subset=None): if where < start: if not is_series: from pandas import Series + return Series(index=self.columns, name=where) return np.nan @@ -6858,7 +7268,7 @@ def asof(self, where, subset=None): # code path whether *where* is a scalar or list. # See PR: https://github.com/pandas-dev/pandas/pull/14476 if is_series: - loc = self.index.searchsorted(where, side='right') + loc = self.index.searchsorted(where, side="right") if loc > 0: loc -= 1 @@ -6876,9 +7286,11 @@ def asof(self, where, subset=None): return self._constructor(np.nan, index=where, name=self.name) elif is_list: from pandas import DataFrame + return DataFrame(np.nan, index=where, columns=self.columns) else: from pandas import Series + return Series(np.nan, index=self.columns, name=where[0]) locs = self.index.asof_locs(where, ~(nulls.values)) @@ -6893,7 +7305,9 @@ def asof(self, where, subset=None): # ---------------------------------------------------------------------- # Action Methods - _shared_docs['isna'] = """ + _shared_docs[ + "isna" + ] = """ Detect missing values. Return a boolean same-sized object indicating if the values are NA. @@ -6953,15 +7367,17 @@ def asof(self, where, subset=None): dtype: bool """ - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return isna(self).__finalize__(self) - @Appender(_shared_docs['isna'] % _shared_doc_kwargs) + @Appender(_shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return isna(self).__finalize__(self) - _shared_docs['notna'] = """ + _shared_docs[ + "notna" + ] = """ Detect existing (non-missing) values. Return a boolean same-sized object indicating if the values are not NA. 
@@ -7021,23 +7437,24 @@ def isnull(self): dtype: bool """ - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return notna(self).__finalize__(self) - @Appender(_shared_docs['notna'] % _shared_doc_kwargs) + @Appender(_shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return notna(self).__finalize__(self) def _clip_with_scalar(self, lower, upper, inplace=False): - if ((lower is not None and np.any(isna(lower))) or - (upper is not None and np.any(isna(upper)))): + if (lower is not None and np.any(isna(lower))) or ( + upper is not None and np.any(isna(upper)) + ): raise ValueError("Cannot use an NA value as a clip threshold") result = self mask = isna(self.values) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if upper is not None: subset = self.to_numpy() <= upper result = result.where(subset, upper, axis=None, inplace=False) @@ -7060,7 +7477,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # method is self.le for upper bound and self.ge for lower bound if is_scalar(threshold) and is_number(threshold): - if method.__name__ == 'le': + if method.__name__ == "le": return self._clip_with_scalar(None, threshold, inplace=inplace) return self._clip_with_scalar(threshold, None, inplace=inplace) @@ -7073,12 +7490,10 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): if isinstance(self, ABCSeries): threshold = pd.Series(threshold, index=self.index) else: - threshold = _align_method_FRAME(self, threshold, - axis) + threshold = _align_method_FRAME(self, threshold, axis) return self.where(subset, threshold, axis=axis, inplace=inplace) - def clip(self, lower=None, upper=None, axis=None, inplace=False, - *args, **kwargs): + def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): """ Trim values at input threshold(s). 
@@ -7151,7 +7566,7 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, 3 6 8 4 5 3 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") axis = nv.validate_clip_with_axis(axis, args, kwargs) if axis is not None: @@ -7173,19 +7588,22 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, lower, upper = min(lower, upper), max(lower, upper) # fast-path for scalars - if ((lower is None or (is_scalar(lower) and is_number(lower))) and - (upper is None or (is_scalar(upper) and is_number(upper)))): + if (lower is None or (is_scalar(lower) and is_number(lower))) and ( + upper is None or (is_scalar(upper) and is_number(upper)) + ): return self._clip_with_scalar(lower, upper, inplace=inplace) result = self if lower is not None: - result = result._clip_with_one_bound(lower, method=self.ge, - axis=axis, inplace=inplace) + result = result._clip_with_one_bound( + lower, method=self.ge, axis=axis, inplace=inplace + ) if upper is not None: if inplace: result = self - result = result._clip_with_one_bound(upper, method=self.le, - axis=axis, inplace=inplace) + result = result._clip_with_one_bound( + upper, method=self.le, axis=axis, inplace=inplace + ) return result @@ -7263,11 +7681,14 @@ def clip_upper(self, threshold, axis=None, inplace=False): 4 1 dtype: int64 """ - warnings.warn('clip_upper(threshold) is deprecated, ' - 'use clip(upper=threshold) instead', - FutureWarning, stacklevel=2) - return self._clip_with_one_bound(threshold, method=self.le, - axis=axis, inplace=inplace) + warnings.warn( + "clip_upper(threshold) is deprecated, " "use clip(upper=threshold) instead", + FutureWarning, + stacklevel=2, + ) + return self._clip_with_one_bound( + threshold, method=self.le, axis=axis, inplace=inplace + ) def clip_lower(self, threshold, axis=None, inplace=False): """ @@ -7379,14 +7800,27 @@ def clip_lower(self, threshold, axis=None, inplace=False): 1 4 5 2 5 6 """ - warnings.warn('clip_lower(threshold) is deprecated, ' - 'use clip(lower=threshold) instead', - FutureWarning, stacklevel=2) - return self._clip_with_one_bound(threshold, method=self.ge, - axis=axis, inplace=inplace) - - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, observed=False, **kwargs): + warnings.warn( + "clip_lower(threshold) is deprecated, " "use clip(lower=threshold) instead", + FutureWarning, + stacklevel=2, + ) + return self._clip_with_one_bound( + threshold, method=self.ge, axis=axis, inplace=inplace + ) + + def groupby( + self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + observed=False, + **kwargs + ): """ Group DataFrame or Series using a mapper or by a Series of columns. 
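[Editor's sketch, not part of the patch: the `clip_upper`/`clip_lower` hunks a few lines above now emit FutureWarnings pointing at `clip`. The recommended replacement, on made-up data:]

    import pandas as pd

    s = pd.Series([-3, 0, 7, 12])

    s.clip(lower=0)            # replaces the deprecated s.clip_lower(0)
    s.clip(upper=10)           # replaces the deprecated s.clip_upper(10)
    s.clip(lower=0, upper=10)  # both bounds in one call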
@@ -7501,12 +7935,20 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, if level is None and by is None: raise TypeError("You have to supply one of 'by' and 'level'") axis = self._get_axis_number(axis) - return groupby(self, by=by, axis=axis, level=level, as_index=as_index, - sort=sort, group_keys=group_keys, squeeze=squeeze, - observed=observed, **kwargs) - - def asfreq(self, freq, method=None, how=None, normalize=False, - fill_value=None): + return groupby( + self, + by=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + **kwargs + ) + + def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): """ Convert TimeSeries to specified frequency. @@ -7601,8 +8043,15 @@ def asfreq(self, freq, method=None, how=None, normalize=False, 2000-01-01 00:03:00 3.0 """ from pandas.core.resample import asfreq - return asfreq(self, freq, method=method, how=how, normalize=normalize, - fill_value=fill_value) + + return asfreq( + self, + freq, + method=method, + how=how, + normalize=normalize, + fill_value=fill_value, + ) def at_time(self, time, asof=False, axis=None): """ @@ -7656,12 +8105,13 @@ def at_time(self, time, asof=False, axis=None): try: indexer = index.indexer_at_time(time, asof=asof) except AttributeError: - raise TypeError('Index must be DatetimeIndex') + raise TypeError("Index must be DatetimeIndex") return self._take(indexer, axis=axis) - def between_time(self, start_time, end_time, include_start=True, - include_end=True, axis=None): + def between_time( + self, start_time, end_time, include_start=True, include_end=True, axis=None + ): """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -7726,16 +8176,32 @@ def between_time(self, start_time, end_time, include_start=True, index = self._get_axis(axis) try: indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, - include_end=include_end) + start_time, + end_time, + include_start=include_start, + include_end=include_end, + ) except AttributeError: - raise TypeError('Index must be DatetimeIndex') + raise TypeError("Index must be DatetimeIndex") return self._take(indexer, axis=axis) - def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, - label=None, convention='start', kind=None, loffset=None, - limit=None, base=0, on=None, level=None): + def resample( + self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention="start", + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None, + ): """ Resample time-series data. 
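[Editor's sketch, not part of the patch: `resample` above forwards its keywords to `pandas.core.resample.resample` and then routes `how`/`fill_method`/`limit` through the deprecation shim. Basic downsampling, assuming a DatetimeIndex:]

    import pandas as pd

    idx = pd.date_range("2000-01-01", periods=9, freq="T")
    s = pd.Series(range(9), index=idx)

    # group the data into 3-minute bins and sum each bin
    s.resample("3T").sum()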
@@ -8020,17 +8486,25 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, 2000-01-04 36 90 """ - from pandas.core.resample import (resample, - _maybe_process_deprecations) + from pandas.core.resample import resample, _maybe_process_deprecations + axis = self._get_axis_number(axis) - r = resample(self, freq=rule, label=label, closed=closed, - axis=axis, kind=kind, loffset=loffset, - convention=convention, - base=base, key=on, level=level) - return _maybe_process_deprecations(r, - how=how, - fill_method=fill_method, - limit=limit) + r = resample( + self, + freq=rule, + label=label, + closed=closed, + axis=axis, + kind=kind, + loffset=loffset, + convention=convention, + base=base, + key=on, + level=level, + ) + return _maybe_process_deprecations( + r, how=how, fill_method=fill_method, limit=limit + ) def first(self, offset): """ @@ -8088,9 +8562,9 @@ def first(self, offset): end_date = end = self.index[0] + offset # Tick-like, e.g. 3 weeks - if not offset.isAnchored() and hasattr(offset, '_inc'): + if not offset.isAnchored() and hasattr(offset, "_inc"): if end_date in self.index: - end = self.index.searchsorted(end_date, side='left') + end = self.index.searchsorted(end_date, side="left") return self.iloc[:end] return self.loc[:end] @@ -8150,11 +8624,18 @@ def last(self, offset): offset = to_offset(offset) start_date = self.index[-1] - offset - start = self.index.searchsorted(start_date, side='right') + start = self.index.searchsorted(start_date, side="right") return self.iloc[start:] - def rank(self, axis=0, method='average', numeric_only=None, - na_option='keep', ascending=True, pct=False): + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): """ Compute numerical data ranks (1 through n) along axis. @@ -8238,14 +8719,19 @@ def rank(self, axis=0, method='average', numeric_only=None, """ axis = self._get_axis_number(axis) - if na_option not in {'keep', 'top', 'bottom'}: + if na_option not in {"keep", "top", "bottom"}: msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) def ranker(data): - ranks = algos.rank(data.values, axis=axis, method=method, - ascending=ascending, na_option=na_option, - pct=pct) + ranks = algos.rank( + data.values, + axis=axis, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) ranks = self._constructor(ranks, **data._construct_axes_dict()) return ranks.__finalize__(self) @@ -8264,7 +8750,9 @@ def ranker(data): return ranker(data) - _shared_docs['align'] = (""" + _shared_docs[ + "align" + ] = """ Align two objects on their axes with the specified join method for each axis Index. @@ -8304,13 +8792,24 @@ def ranker(data): ------- (left, right) : (%(klass)s, type of other) Aligned objects. 
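[Editor's sketch, not part of the patch: `rank` above validates `na_option` against {'keep', 'top', 'bottom'} before delegating to `algos.rank`. The effect, with a hypothetical Series containing a NaN:]

    import numpy as np
    import pandas as pd

    s = pd.Series([3, np.nan, 1, 3])

    s.rank()                    # average ranks for ties; NaN stays NaN ("keep")
    s.rank(method="min")        # tied values share the lowest rank
    s.rank(na_option="bottom")  # NaN is ranked last instead of being left as NaN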
- """) + """ - @Appender(_shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): + @Appender(_shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): from pandas import DataFrame, Series + method = missing.clean_fill_method(method) if broadcast_axis == 1 and self.ndim != other.ndim: @@ -8318,41 +8817,80 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, # this means other is a DataFrame, and we need to broadcast # self cons = self._constructor_expanddim - df = cons({c: self for c in other.columns}, - **other._construct_axes_dict()) - return df._align_frame(other, join=join, axis=axis, - level=level, copy=copy, - fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis) + df = cons( + {c: self for c in other.columns}, **other._construct_axes_dict() + ) + return df._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) elif isinstance(other, Series): # this means self is a DataFrame, and we need to broadcast # other cons = other._constructor_expanddim - df = cons({c: other for c in self.columns}, - **self._construct_axes_dict()) - return self._align_frame(df, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + df = cons( + {c: other for c in self.columns}, **self._construct_axes_dict() + ) + return self._align_frame( + df, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) if axis is not None: axis = self._get_axis_number(axis) if isinstance(other, DataFrame): - return self._align_frame(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + return self._align_frame( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) elif isinstance(other, Series): - return self._align_series(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, - method=method, limit=limit, - fill_axis=fill_axis) + return self._align_series( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + ) else: # pragma: no cover - raise TypeError('unsupported type: %s' % type(other)) - - def _align_frame(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=None, method=None, limit=None, - fill_axis=0): + raise TypeError("unsupported type: %s" % type(other)) + + def _align_frame( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): # defaults join_index, join_columns = None, None ilidx, iridx = None, None @@ -8363,26 +8901,30 @@ def _align_frame(self, other, join='outer', axis=None, level=None, if axis is None or axis == 0: if not self.index.equals(other.index): join_index, ilidx, iridx = self.index.join( - other.index, how=join, level=level, return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if axis is None 
or axis == 1: if not is_series and not self.columns.equals(other.columns): join_columns, clidx, cridx = self.columns.join( - other.columns, how=join, level=level, return_indexers=True) + other.columns, how=join, level=level, return_indexers=True + ) if is_series: reindexers = {0: [join_index, ilidx]} else: reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]} - left = self._reindex_with_indexers(reindexers, copy=copy, - fill_value=fill_value, - allow_dups=True) + left = self._reindex_with_indexers( + reindexers, copy=copy, fill_value=fill_value, allow_dups=True + ) # other must be always DataFrame - right = other._reindex_with_indexers({0: [join_index, iridx], - 1: [join_columns, cridx]}, - copy=copy, fill_value=fill_value, - allow_dups=True) + right = other._reindex_with_indexers( + {0: [join_index, iridx], 1: [join_columns, cridx]}, + copy=copy, + fill_value=fill_value, + allow_dups=True, + ) if method is not None: left = left.fillna(axis=fill_axis, method=method, limit=limit) @@ -8397,25 +8939,33 @@ def _align_frame(self, other, join='outer', axis=None, level=None, return left.__finalize__(self), right.__finalize__(other) - def _align_series(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=None, method=None, limit=None, - fill_axis=0): + def _align_series( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + ): is_series = isinstance(self, ABCSeries) # series/series compat, other must always be a Series if is_series: if axis: - raise ValueError('cannot align series to a series other than ' - 'axis 0') + raise ValueError("cannot align series to a series other than " "axis 0") # equal if self.index.equals(other.index): join_index, lidx, ridx = None, None, None else: - join_index, lidx, ridx = self.index.join(other.index, how=join, - level=level, - return_indexers=True) + join_index, lidx, ridx = self.index.join( + other.index, how=join, level=level, return_indexers=True + ) left = self._reindex_indexer(join_index, lidx, copy) right = other._reindex_indexer(join_index, ridx, copy) @@ -8428,8 +8978,8 @@ def _align_series(self, other, join='outer', axis=None, level=None, lidx, ridx = None, None if not self.index.equals(other.index): join_index, lidx, ridx = self.index.join( - other.index, how=join, level=level, - return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if lidx is not None: fdata = fdata.reindex_indexer(join_index, lidx, axis=1) @@ -8439,13 +8989,13 @@ def _align_series(self, other, join='outer', axis=None, level=None, lidx, ridx = None, None if not self.columns.equals(other.index): join_index, lidx, ridx = self.columns.join( - other.index, how=join, level=level, - return_indexers=True) + other.index, how=join, level=level, return_indexers=True + ) if lidx is not None: fdata = fdata.reindex_indexer(join_index, lidx, axis=0) else: - raise ValueError('Must specify axis=0 or 1') + raise ValueError("Must specify axis=0 or 1") if copy and fdata is self._data: fdata = fdata.copy() @@ -8460,8 +9010,7 @@ def _align_series(self, other, join='outer', axis=None, level=None, # fill fill_na = notna(fill_value) or (method is not None) if fill_na: - left = left.fillna(fill_value, method=method, limit=limit, - axis=fill_axis) + left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis) right = right.fillna(fill_value, method=method, limit=limit) # if DatetimeIndex have different tz, convert to UTC @@ -8474,24 +9023,31 @@ def 
_align_series(self, other, join='outer', axis=None, level=None, return left.__finalize__(self), right.__finalize__(other) - def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): + def _where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): """ Equivalent to public method `where`, except that `other` is not applied as a function even if callable. Used in __setitem__. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # align the cond to same shape as myself cond = com.apply_if_callable(cond, self) if isinstance(cond, NDFrame): - cond, _ = cond.align(self, join='right', broadcast_axis=1) + cond, _ = cond.align(self, join="right", broadcast_axis=1) else: - if not hasattr(cond, 'shape'): + if not hasattr(cond, "shape"): cond = np.asanyarray(cond) if cond.shape != self.shape: - raise ValueError('Array conditional must be same shape as ' - 'self') + raise ValueError("Array conditional must be same shape as " "self") cond = self._constructor(cond, **self._construct_axes_dict()) # make sure we are boolean @@ -8513,24 +9069,26 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # try to align with other try_quick = True - if hasattr(other, 'align'): + if hasattr(other, "align"): # align with me if other.ndim <= self.ndim: - _, other = self.align(other, join='left', axis=axis, - level=level, fill_value=np.nan) + _, other = self.align( + other, join="left", axis=axis, level=level, fill_value=np.nan + ) # if we are NOT aligned, raise as we cannot where index - if (axis is None and - not all(other._get_axis(i).equals(ax) - for i, ax in enumerate(self.axes))): + if axis is None and not all( + other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes) + ): raise InvalidIndexError # slice me out of the other else: - raise NotImplementedError("cannot align with a higher " - "dimensional NDFrame") + raise NotImplementedError( + "cannot align with a higher " "dimensional NDFrame" + ) if isinstance(other, np.ndarray): @@ -8571,12 +9129,14 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, other = new_other else: - raise ValueError('Length of replacements must equal ' - 'series length') + raise ValueError( + "Length of replacements must equal " "series length" + ) else: - raise ValueError('other must be the same shape as self ' - 'when an ndarray') + raise ValueError( + "other must be the same shape as self " "when an ndarray" + ) # we are the same shape, so create an actual object for alignment else: @@ -8585,10 +9145,10 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if axis is None: axis = 0 - if self.ndim == getattr(other, 'ndim', 0): + if self.ndim == getattr(other, "ndim", 0): align = True else: - align = (self._get_axis_number(axis) == 1) + align = self._get_axis_number(axis) == 1 block_axis = self._get_block_manager_axis(axis) @@ -8597,20 +9157,32 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # reconstruct the block manager self._check_inplace_setting(other) - new_data = self._data.putmask(mask=cond, new=other, align=align, - inplace=True, axis=block_axis, - transpose=self._AXIS_REVERSED) + new_data = self._data.putmask( + mask=cond, + new=other, + align=align, + inplace=True, + axis=block_axis, + transpose=self._AXIS_REVERSED, + ) self._update_inplace(new_data) else: - new_data = 
self._data.where(other=other, cond=cond, align=align, - errors=errors, - try_cast=try_cast, axis=block_axis, - transpose=self._AXIS_REVERSED) + new_data = self._data.where( + other=other, + cond=cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=block_axis, + transpose=self._AXIS_REVERSED, + ) return self._constructor(new_data).__finalize__(self) - _shared_docs['where'] = (""" + _shared_docs[ + "where" + ] = """ Replace values where the condition is %(cond_rev)s. Parameters @@ -8731,36 +9303,75 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, 2 True True 3 True True 4 True True - """) + """ - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True", - cond_rev="False", name='where', - name_other='mask')) - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="True", + cond_rev="False", + name="where", + name_other="mask", + ) + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): other = com.apply_if_callable(other, self) - return self._where(cond, other, inplace, axis, level, - errors=errors, try_cast=try_cast) - - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False", - cond_rev="True", name='mask', - name_other='where')) - def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, - errors='raise', try_cast=False): - - inplace = validate_bool_kwarg(inplace, 'inplace') + return self._where( + cond, other, inplace, axis, level, errors=errors, try_cast=try_cast + ) + + @Appender( + _shared_docs["where"] + % dict( + _shared_doc_kwargs, + cond="False", + cond_rev="True", + name="mask", + name_other="where", + ) + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") cond = com.apply_if_callable(cond, self) # see gh-21891 if not hasattr(cond, "__invert__"): cond = np.array(cond) - return self.where(~cond, other=other, inplace=inplace, axis=axis, - level=level, try_cast=try_cast, - errors=errors) - - _shared_docs['shift'] = (""" + return self.where( + ~cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + try_cast=try_cast, + errors=errors, + ) + + _shared_docs[ + "shift" + ] = """ Shift index by desired number of periods with an optional time `freq`. When `freq` is not passed, shift the index without realigning the data. 
@@ -8830,17 +9441,18 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, 2 0 0 0 3 10 13 17 4 20 23 27 - """) + """ - @Appender(_shared_docs['shift'] % _shared_doc_kwargs) + @Appender(_shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): if periods == 0: return self.copy() block_axis = self._get_block_manager_axis(axis) if freq is None: - new_data = self._data.shift(periods=periods, axis=block_axis, - fill_value=fill_value) + new_data = self._data.shift( + periods=periods, axis=block_axis, fill_value=fill_value + ) else: return self.tshift(periods, freq) @@ -8908,13 +9520,13 @@ def tshift(self, periods=1, freq=None, axis=0): index = self._get_axis(axis) if freq is None: - freq = getattr(index, 'freq', None) + freq = getattr(index, "freq", None) if freq is None: - freq = getattr(index, 'inferred_freq', None) + freq = getattr(index, "inferred_freq", None) if freq is None: - msg = 'Freq was not given and was not set in the index' + msg = "Freq was not given and was not set in the index" raise ValueError(msg) if periods == 0: @@ -8930,8 +9542,10 @@ def tshift(self, periods=1, freq=None, axis=0): new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) else: - msg = ('Given freq %s does not match PeriodIndex freq %s' % - (freq.rule_code, orig_freq.rule_code)) + msg = "Given freq %s does not match PeriodIndex freq %s" % ( + freq.rule_code, + orig_freq.rule_code, + ) raise ValueError(msg) else: new_data = self._data.copy() @@ -9072,21 +9686,20 @@ def truncate(self, before=None, after=None, axis=None, copy=True): # treat like a slice if ax.is_all_dates: from pandas.core.tools.datetimes import to_datetime + before = to_datetime(before) after = to_datetime(after) if before is not None and after is not None: if before > after: - raise ValueError('Truncate: %s must be after %s' % - (after, before)) + raise ValueError("Truncate: %s must be after %s" % (after, before)) slicer = [slice(None, None)] * self._AXIS_LEN slicer[axis] = slice(before, after) result = self.loc[tuple(slicer)] if isinstance(ax, MultiIndex): - setattr(result, self._get_axis_name(axis), - ax.truncate(before, after)) + setattr(result, self._get_axis_name(axis), ax.truncate(before, after)) if copy: result = result.copy() @@ -9121,11 +9734,12 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): ax = self._get_axis(axis) def _tz_convert(ax, tz): - if not hasattr(ax, 'tz_convert'): + if not hasattr(ax, "tz_convert"): if len(ax) > 0: ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or ' - 'PeriodIndex' % ax_name) + raise TypeError( + "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name + ) else: ax = DatetimeIndex([], tz=tz) else: @@ -9147,8 +9761,9 @@ def _tz_convert(ax, tz): result = result.set_axis(ax, axis=axis, inplace=False) return result.__finalize__(self) - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise', nonexistent='raise'): + def tz_localize( + self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" + ): """ Localize tz-naive index of a Series or DataFrame to target time zone. 
@@ -9269,38 +9884,37 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, 2015-03-29 03:30:00+02:00 1 dtype: int64 """ - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward") if nonexistent not in nonexistent_options and not isinstance( - nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + nonexistent, timedelta + ): + raise ValueError( + "The nonexistent argument must be one of 'raise'," + " 'NaT', 'shift_forward', 'shift_backward' or" + " a timedelta object" + ) axis = self._get_axis_number(axis) ax = self._get_axis(axis) def _tz_localize(ax, tz, ambiguous, nonexistent): - if not hasattr(ax, 'tz_localize'): + if not hasattr(ax, "tz_localize"): if len(ax) > 0: ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or ' - 'PeriodIndex' % ax_name) + raise TypeError( + "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name + ) else: ax = DatetimeIndex([], tz=tz) else: - ax = ax.tz_localize( - tz, ambiguous=ambiguous, nonexistent=nonexistent - ) + ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent) return ax # if a level is given it must be a MultiIndex level or # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - new_level = _tz_localize( - ax.levels[level], tz, ambiguous, nonexistent - ) + new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent) ax = ax.set_levels(new_level, level=level) else: if level not in (None, 0, ax.name): @@ -9641,14 +10255,18 @@ def describe(self, percentiles=None, include=None, exclude=None): formatted_percentiles = format_percentiles(percentiles) def describe_numeric_1d(series): - stat_index = (['count', 'mean', 'std', 'min'] + - formatted_percentiles + ['max']) - d = ([series.count(), series.mean(), series.std(), series.min()] + - series.quantile(percentiles).tolist() + [series.max()]) + stat_index = ( + ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] + ) + d = ( + [series.count(), series.mean(), series.std(), series.min()] + + series.quantile(percentiles).tolist() + + [series.max()] + ) return pd.Series(d, index=stat_index, name=series.name) def describe_categorical_1d(data): - names = ['count', 'unique'] + names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) result = [data.count(), count_unique] @@ -9658,27 +10276,30 @@ def describe_categorical_1d(data): if is_datetime64_any_dtype(data): tz = data.dt.tz - asint = data.dropna().values.view('i8') + asint = data.dropna().values.view("i8") top = Timestamp(top) if top.tzinfo is not None and tz is not None: # Don't tz_localize(None) if key is already tz-aware top = top.tz_convert(tz) else: top = top.tz_localize(tz) - names += ['top', 'freq', 'first', 'last'] - result += [top, freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz)] + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] else: - names += ['top', 'freq'] + names += ["top", "freq"] result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency else: - names += ['top', 'freq'] + names += ["top", "freq"] result += [np.nan, np.nan] - dtype = 'object' + dtype = "object" return pd.Series(result, 
index=names, name=data.name, dtype=dtype) @@ -9699,7 +10320,7 @@ def describe_1d(data): data = self.select_dtypes(include=[np.number]) if len(data.columns) == 0: data = self - elif include == 'all': + elif include == "all": if exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) @@ -9716,8 +10337,7 @@ def describe_1d(data): if name not in names: names.append(name) - d = pd.concat([x.reindex(names, copy=False) for x in ldesc], - axis=1, sort=False) + d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) d.columns = data.columns.copy() return d @@ -9726,8 +10346,7 @@ def _check_percentile(self, q): Validate percentiles (used by describe and quantile). """ - msg = ("percentiles should all be in the interval [0, 1]. " - "Try {0} instead.") + msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." q = np.asarray(q) if q.ndim == 0: if not 0 <= q <= 1: @@ -9737,7 +10356,9 @@ def _check_percentile(self, q): raise ValueError(msg.format(q / 100.0)) return q - _shared_docs['pct_change'] = """ + _shared_docs[ + "pct_change" + ] = """ Percentage change between the current and a prior element. Computes the percentage change from the immediately previous row by @@ -9852,18 +10473,16 @@ def _check_percentile(self, q): APPL NaN 0.337604 0.012002 """ - @Appender(_shared_docs['pct_change'] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - **kwargs): + @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): # TODO: Not sure if above is correct - need someone to confirm. - axis = self._get_axis_number(kwargs.pop('axis', self._stat_axis_name)) + axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self else: data = self.fillna(method=fill_method, limit=limit, axis=axis) - rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, - **kwargs)) - 1) + rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 rs = rs.reindex_like(data) if freq is None: mask = isna(com.values_from_object(data)) @@ -9890,16 +10509,40 @@ def _add_numeric_operations(cls): axis_descr, name, name2 = _doc_parms(cls) cls.any = _make_logical_function( - cls, 'any', name, name2, axis_descr, _any_desc, nanops.nanany, - _any_see_also, _any_examples, empty_value=False) + cls, + "any", + name, + name2, + axis_descr, + _any_desc, + nanops.nanany, + _any_see_also, + _any_examples, + empty_value=False, + ) cls.all = _make_logical_function( - cls, 'all', name, name2, axis_descr, _all_desc, nanops.nanall, - _all_see_also, _all_examples, empty_value=True) - - @Substitution(desc="Return the mean absolute deviation of the values " - "for the requested axis.", - name1=name, name2=name2, axis_descr=axis_descr, - min_count='', see_also='', examples='') + cls, + "all", + name, + name2, + axis_descr, + _all_desc, + nanops.nanall, + _all_see_also, + _all_examples, + empty_value=True, + ) + + @Substitution( + desc="Return the mean absolute deviation of the values " + "for the requested axis.", + name1=name, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) @Appender(_num_doc) def mad(self, axis=None, skipna=None, level=None): if skipna is None: @@ -9907,8 +10550,7 @@ def mad(self, axis=None, skipna=None, level=None): if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level('mad', 
axis=axis, level=level, - skipna=skipna) + return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) data = self._get_numeric_data() if axis == 0: @@ -9920,31 +10562,54 @@ def mad(self, axis=None, skipna=None, level=None): cls.mad = mad cls.sem = _make_stat_function_ddof( - cls, 'sem', name, name2, axis_descr, + cls, + "sem", + name, + name2, + axis_descr, "Return unbiased standard error of the mean over requested " "axis.\n\nNormalized by N-1 by default. This can be changed " "using the ddof argument", - nanops.nansem) + nanops.nansem, + ) cls.var = _make_stat_function_ddof( - cls, 'var', name, name2, axis_descr, + cls, + "var", + name, + name2, + axis_descr, "Return unbiased variance over requested axis.\n\nNormalized by " "N-1 by default. This can be changed using the ddof argument", - nanops.nanvar) + nanops.nanvar, + ) cls.std = _make_stat_function_ddof( - cls, 'std', name, name2, axis_descr, + cls, + "std", + name, + name2, + axis_descr, "Return sample standard deviation over requested axis." "\n\nNormalized by N-1 by default. This can be changed using the " "ddof argument", - nanops.nanstd) - - @Substitution(desc="Return the compound percentage of the values for " - "the requested axis.\n\n.. deprecated:: 0.25.0", - name1=name, name2=name2, axis_descr=axis_descr, - min_count='', see_also='', examples='') + nanops.nanstd, + ) + + @Substitution( + desc="Return the compound percentage of the values for " + "the requested axis.\n\n.. deprecated:: 0.25.0", + name1=name, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) @Appender(_num_doc) def compound(self, axis=None, skipna=None, level=None): - msg = ("The 'compound' method is deprecated and will be" - "removed in a future version.") + msg = ( + "The 'compound' method is deprecated and will be" + "removed in a future version." 
+ ) warnings.warn(msg, FutureWarning, stacklevel=2) if skipna is None: skipna = True @@ -9953,63 +10618,146 @@ def compound(self, axis=None, skipna=None, level=None): cls.compound = compound cls.cummin = _make_cum_function( - cls, 'cummin', name, name2, axis_descr, "minimum", - lambda y, axis: np.minimum.accumulate(y, axis), "min", - np.inf, np.nan, _cummin_examples) + cls, + "cummin", + name, + name2, + axis_descr, + "minimum", + lambda y, axis: np.minimum.accumulate(y, axis), + "min", + np.inf, + np.nan, + _cummin_examples, + ) cls.cumsum = _make_cum_function( - cls, 'cumsum', name, name2, axis_descr, "sum", - lambda y, axis: y.cumsum(axis), "sum", 0., - np.nan, _cumsum_examples) + cls, + "cumsum", + name, + name2, + axis_descr, + "sum", + lambda y, axis: y.cumsum(axis), + "sum", + 0.0, + np.nan, + _cumsum_examples, + ) cls.cumprod = _make_cum_function( - cls, 'cumprod', name, name2, axis_descr, "product", - lambda y, axis: y.cumprod(axis), "prod", 1., - np.nan, _cumprod_examples) + cls, + "cumprod", + name, + name2, + axis_descr, + "product", + lambda y, axis: y.cumprod(axis), + "prod", + 1.0, + np.nan, + _cumprod_examples, + ) cls.cummax = _make_cum_function( - cls, 'cummax', name, name2, axis_descr, "maximum", - lambda y, axis: np.maximum.accumulate(y, axis), "max", - -np.inf, np.nan, _cummax_examples) + cls, + "cummax", + name, + name2, + axis_descr, + "maximum", + lambda y, axis: np.maximum.accumulate(y, axis), + "max", + -np.inf, + np.nan, + _cummax_examples, + ) cls.sum = _make_min_count_stat_function( - cls, 'sum', name, name2, axis_descr, + cls, + "sum", + name, + name2, + axis_descr, """Return the sum of the values for the requested axis.\n This is equivalent to the method ``numpy.sum``.""", - nanops.nansum, _stat_func_see_also, _sum_examples) + nanops.nansum, + _stat_func_see_also, + _sum_examples, + ) cls.mean = _make_stat_function( - cls, 'mean', name, name2, axis_descr, - 'Return the mean of the values for the requested axis.', - nanops.nanmean) + cls, + "mean", + name, + name2, + axis_descr, + "Return the mean of the values for the requested axis.", + nanops.nanmean, + ) cls.skew = _make_stat_function( - cls, 'skew', name, name2, axis_descr, - 'Return unbiased skew over requested axis\nNormalized by N-1.', - nanops.nanskew) + cls, + "skew", + name, + name2, + axis_descr, + "Return unbiased skew over requested axis\nNormalized by N-1.", + nanops.nanskew, + ) cls.kurt = _make_stat_function( - cls, 'kurt', name, name2, axis_descr, + cls, + "kurt", + name, + name2, + axis_descr, "Return unbiased kurtosis over requested axis using Fisher's " "definition of\nkurtosis (kurtosis of normal == 0.0). 
Normalized " "by N-1.", - nanops.nankurt) + nanops.nankurt, + ) cls.kurtosis = cls.kurt cls.prod = _make_min_count_stat_function( - cls, 'prod', name, name2, axis_descr, - 'Return the product of the values for the requested axis.', - nanops.nanprod, examples=_prod_examples) + cls, + "prod", + name, + name2, + axis_descr, + "Return the product of the values for the requested axis.", + nanops.nanprod, + examples=_prod_examples, + ) cls.product = cls.prod cls.median = _make_stat_function( - cls, 'median', name, name2, axis_descr, - 'Return the median of the values for the requested axis.', - nanops.nanmedian) + cls, + "median", + name, + name2, + axis_descr, + "Return the median of the values for the requested axis.", + nanops.nanmedian, + ) cls.max = _make_stat_function( - cls, 'max', name, name2, axis_descr, + cls, + "max", + name, + name2, + axis_descr, """Return the maximum of the values for the requested axis.\n If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``.""", - nanops.nanmax, _stat_func_see_also, _max_examples) + nanops.nanmax, + _stat_func_see_also, + _max_examples, + ) cls.min = _make_stat_function( - cls, 'min', name, name2, axis_descr, + cls, + "min", + name, + name2, + axis_descr, """Return the minimum of the values for the requested axis.\n If you want the *index* of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``.""", - nanops.nanmin, _stat_func_see_also, _min_examples) + nanops.nanmin, + _stat_func_see_also, + _min_examples, + ) @classmethod def _add_series_only_operations(cls): @@ -10023,18 +10771,26 @@ def _add_series_only_operations(cls): def nanptp(values, axis=0, skipna=True): nmax = nanops.nanmax(values, axis, skipna) nmin = nanops.nanmin(values, axis, skipna) - warnings.warn("Method .ptp is deprecated and will be removed " - "in a future version. Use numpy.ptp instead.", - FutureWarning, stacklevel=4) + warnings.warn( + "Method .ptp is deprecated and will be removed " + "in a future version. Use numpy.ptp instead.", + FutureWarning, + stacklevel=4, + ) return nmax - nmin cls.ptp = _make_stat_function( - cls, 'ptp', name, name2, axis_descr, + cls, + "ptp", + name, + name2, + axis_descr, """Return the difference between the maximum value and the minimum value in the object. This is the equivalent of the ``numpy.ndarray`` method ``ptp``.\n\n.. 
deprecated:: 0.24.0 Use numpy.ptp instead""", - nanptp) + nanptp, + ) @classmethod def _add_series_or_dataframe_operations(cls): @@ -10046,48 +10802,80 @@ def _add_series_or_dataframe_operations(cls): from pandas.core import window as rwindow @Appender(rwindow.rolling.__doc__) - def rolling(self, window, min_periods=None, center=False, - win_type=None, on=None, axis=0, closed=None): + def rolling( + self, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): axis = self._get_axis_number(axis) - return rwindow.rolling(self, window=window, - min_periods=min_periods, - center=center, win_type=win_type, - on=on, axis=axis, closed=closed) + return rwindow.rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) cls.rolling = rolling @Appender(rwindow.expanding.__doc__) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) - return rwindow.expanding(self, min_periods=min_periods, - center=center, axis=axis) + return rwindow.expanding( + self, min_periods=min_periods, center=center, axis=axis + ) cls.expanding = expanding @Appender(rwindow.ewm.__doc__) - def ewm(self, com=None, span=None, halflife=None, alpha=None, - min_periods=0, adjust=True, ignore_na=False, - axis=0): + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): axis = self._get_axis_number(axis) - return rwindow.ewm(self, com=com, span=span, halflife=halflife, - alpha=alpha, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na, axis=axis) + return rwindow.ewm( + self, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) cls.ewm = ewm - @Appender(_shared_docs['transform'] % dict(axis="", **_shared_doc_kwargs)) + @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs)) def transform(self, func, *args, **kwargs): result = self.agg(func, *args, **kwargs) if is_scalar(result) or len(result) != len(self): - raise ValueError("transforms cannot produce " - "aggregated results") + raise ValueError("transforms cannot produce " "aggregated results") return result # ---------------------------------------------------------------------- # Misc methods - _shared_docs['valid_index'] = """ + _shared_docs[ + "valid_index" + ] = """ Return index for %(position)s non-NA/null value. 
Returns @@ -10113,7 +10901,7 @@ def _find_valid_index(self, how): ------- idx_first_valid : type of index """ - assert how in ['first', 'last'] + assert how in ["first", "last"] if len(self) == 0: # early stop return None @@ -10122,10 +10910,10 @@ def _find_valid_index(self, how): if self.ndim == 2: is_valid = is_valid.any(1) # reduce axis 1 - if how == 'first': + if how == "first": idxpos = is_valid.values[::].argmax() - if how == 'last': + if how == "last": idxpos = len(self) - 1 - is_valid.values[::-1].argmax() chk_notna = is_valid.iat[idxpos] @@ -10135,23 +10923,25 @@ def _find_valid_index(self, how): return None return idx - @Appender(_shared_docs['valid_index'] % {'position': 'first', - 'klass': 'Series/DataFrame'}) + @Appender( + _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"} + ) def first_valid_index(self): - return self._find_valid_index('first') + return self._find_valid_index("first") - @Appender(_shared_docs['valid_index'] % {'position': 'last', - 'klass': 'Series/DataFrame'}) + @Appender( + _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"} + ) def last_valid_index(self): - return self._find_valid_index('last') + return self._find_valid_index("last") def _doc_parms(cls): """Return a tuple of the doc parms.""" - axis_descr = "{%s}" % ', '.join("{0} ({1})".format(a, i) - for i, a in enumerate(cls._AXIS_ORDERS)) - name = (cls._constructor_sliced.__name__ - if cls._AXIS_LEN > 1 else 'scalar') + axis_descr = "{%s}" % ", ".join( + "{0} ({1})".format(a, i) for i, a in enumerate(cls._AXIS_ORDERS) + ) + name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar" name2 = cls.__name__ return axis_descr, name, name2 @@ -10684,7 +11474,9 @@ def _doc_parms(cls): Series([], dtype: bool) """ -_shared_docs['stat_func_example'] = """ +_shared_docs[ + "stat_func_example" +] = """ Examples -------- @@ -10718,12 +11510,9 @@ def _doc_parms(cls): cold {level_output_1} Name: legs, dtype: int64""" -_sum_examples = _shared_docs['stat_func_example'].format( - stat_func='sum', - verb='Sum', - default_output=14, - level_output_0=6, - level_output_1=8) +_sum_examples = _shared_docs["stat_func_example"].format( + stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8 +) _sum_examples += """ @@ -10747,19 +11536,13 @@ def _doc_parms(cls): >>> pd.Series([np.nan]).sum(min_count=1) nan""" -_max_examples = _shared_docs['stat_func_example'].format( - stat_func='max', - verb='Max', - default_output=8, - level_output_0=4, - level_output_1=8) +_max_examples = _shared_docs["stat_func_example"].format( + stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8 +) -_min_examples = _shared_docs['stat_func_example'].format( - stat_func='min', - verb='Min', - default_output=0, - level_output_0=2, - level_output_1=0) +_min_examples = _shared_docs["stat_func_example"].format( + stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0 +) _stat_func_see_also = """ @@ -10812,18 +11595,31 @@ def _doc_parms(cls): """ -def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, - f, see_also='', examples=''): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, min_count=_min_count_stub, - see_also=see_also, examples=examples) +def _make_min_count_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + 
min_count=_min_count_stub, + see_also=see_also, + examples=examples, + ) @Appender(_num_doc) - def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=0, - **kwargs): - if name == 'sum': + def stat_func( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + if name == "sum": nv.validate_sum(tuple(), kwargs) - elif name == 'prod': + elif name == "prod": nv.validate_prod(tuple(), kwargs) else: nv.validate_stat_func(tuple(), kwargs, fname=name) @@ -10832,23 +11628,38 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, min_count=min_count) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=numeric_only, min_count=min_count) + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, min_count=min_count + ) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) return set_function_name(stat_func, name, cls) -def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f, - see_also='', examples=''): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, min_count='', see_also=see_also, - examples=examples) +def _make_stat_function( + cls, name, name1, name2, axis_descr, desc, f, see_also="", examples="" +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also=see_also, + examples=examples, + ) @Appender(_num_doc) - def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, - **kwargs): - if name == 'median': + def stat_func( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + if name == "median": nv.validate_median(tuple(), kwargs) else: nv.validate_stat_func(tuple(), kwargs, fname=name) @@ -10857,39 +11668,57 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=numeric_only) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, name, axis=axis, skipna=skipna, numeric_only=numeric_only + ) return set_function_name(stat_func, name, cls) def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) @Appender(_num_ddof_doc) - def stat_func(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + def stat_func( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: axis = self._stat_axis_number if level is not None: - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, ddof=ddof) - return self._reduce(f, name, axis=axis, numeric_only=numeric_only, - skipna=skipna, ddof=ddof) + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, ddof=ddof + ) + return self._reduce( + f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + ) return 
set_function_name(stat_func, name, cls) -def _make_cum_function(cls, name, name1, name2, axis_descr, desc, - accum_func, accum_func_name, mask_a, mask_b, examples): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, accum_func_name=accum_func_name, - examples=examples) +def _make_cum_function( + cls, + name, + name1, + name2, + axis_descr, + desc, + accum_func, + accum_func_name, + mask_a, + mask_b, + examples, +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name=accum_func_name, + examples=examples, + ) @Appender(_cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) @@ -10900,8 +11729,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): y = com.values_from_object(self).copy() - if (skipna and - issubclass(y.dtype.type, (np.datetime64, np.timedelta64))): + if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): result = accum_func(y, axis) mask = isna(self) np.putmask(result, mask, iNaT) @@ -10914,29 +11742,41 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): result = accum_func(y, axis) d = self._construct_axes_dict() - d['copy'] = False + d["copy"] = False return self._constructor(result, **d).__finalize__(self) return set_function_name(cum_func, name, cls) -def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, - see_also, examples, empty_value): - @Substitution(desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr, see_also=see_also, examples=examples, - empty_value=empty_value) +def _make_logical_function( + cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value +): + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + see_also=see_also, + examples=examples, + empty_value=empty_value, + ) @Appender(_bool_doc) - def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, - **kwargs): + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) if level is not None: if bool_only is not None: - raise NotImplementedError("Option bool_only is not " - "implemented with option level.") - return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna) - return self._reduce(f, name, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool') + raise NotImplementedError( + "Option bool_only is not " "implemented with option level." 
+ ) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + f, + name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", + ) return set_function_name(logical_func, name, cls) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index fe50bd91a4f56..252f20ed40068 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,4 +1,7 @@ from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, NamedAgg, SeriesGroupBy) + DataFrameGroupBy, + NamedAgg, + SeriesGroupBy, +) from pandas.core.groupby.groupby import GroupBy # noqa: F401 from pandas.core.groupby.grouper import Grouper # noqa: F401 diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index cffe0e589c6bc..5c4f1fa3fbddf 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -21,7 +21,9 @@ def outer(self, *args, **kwargs): def f(x): x = self._shallow_copy(x, groupby=self._groupby) return getattr(x, name)(*args, **kwargs) + return self._groupby.apply(f) + outer.__name__ = name return outer @@ -51,10 +53,7 @@ def _gotitem(self, key, ndim, subset=None): except IndexError: groupby = self._groupby - self = self.__class__(subset, - groupby=groupby, - parent=self, - **kwargs) + self = self.__class__(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): @@ -64,25 +63,41 @@ def _gotitem(self, key, ndim, subset=None): # special case to prevent duplicate plots when catching exceptions when # forwarding methods from NDFrames -plotting_methods = frozenset(['plot', 'hist']) - -common_apply_whitelist = frozenset([ - 'quantile', 'fillna', 'mad', 'take', - 'idxmax', 'idxmin', 'tshift', - 'skew', 'corr', 'cov', 'diff' -]) | plotting_methods - -series_apply_whitelist = ((common_apply_whitelist | - {'nlargest', 'nsmallest', - 'is_monotonic_increasing', - 'is_monotonic_decreasing'}) - ) | frozenset(['dtype', 'unique']) - -dataframe_apply_whitelist = ((common_apply_whitelist | - frozenset(['dtypes', 'corrwith']))) - -cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', - 'cummin', 'cummax']) - -cython_cast_blacklist = frozenset(['rank', 'count', 'size', 'idxmin', - 'idxmax']) +plotting_methods = frozenset(["plot", "hist"]) + +common_apply_whitelist = ( + frozenset( + [ + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "corr", + "cov", + "diff", + ] + ) + | plotting_methods +) + +series_apply_whitelist = ( + ( + common_apply_whitelist + | { + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", + } + ) +) | frozenset(["dtype", "unique"]) + +dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"]) + +cython_transforms = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) + +cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 85f51323a97b5..fcf52ecfcbbcd 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -2,7 +2,10 @@ from pandas.core.algorithms import unique1d from pandas.core.arrays.categorical import ( - Categorical, CategoricalDtype, _recode_for_categories) + Categorical, + CategoricalDtype, + _recode_for_categories, +) def recode_for_groupby(c, sort, observed): @@ -49,9 +52,7 @@ def recode_for_groupby(c, sort, 
observed): # we recode according to the uniques categories = c.categories.take(take_codes) - codes = _recode_for_categories(c.codes, - c.categories, - categories) + codes = _recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories @@ -68,8 +69,7 @@ def recode_for_groupby(c, sort, observed): # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() # above dropped - cat = cat.add_categories( - c.categories[~c.categories.isin(cat.categories)]) + cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)]) return c.reorder_categories(cat.categories), None @@ -96,5 +96,4 @@ def recode_from_groupby(c, sort, ci): return ci.set_categories(c.categories) # we are not sorting, so add unobserved to the end - return ci.add_categories( - c.categories[~c.categories.isin(ci.categories)]) + return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9e7dcafc0b1a4..7fd0ca94e7997 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,12 +21,20 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import ( - maybe_convert_objects, maybe_downcast_to_dtype) +from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like, - is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, - is_object_dtype, is_scalar) + ensure_int64, + ensure_platform_int, + is_bool, + is_datetimelike, + is_dict_like, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_numeric_dtype, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna, notna from pandas._typing import FrameOrSeries @@ -36,8 +44,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.groupby import base -from pandas.core.groupby.groupby import ( - GroupBy, _apply_docs, _transform_template) +from pandas.core.groupby.groupby import GroupBy, _apply_docs, _transform_template from pandas.core.index import Index, MultiIndex import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block @@ -55,10 +62,9 @@ ScalarResult = typing.TypeVar("ScalarResult") -def whitelist_method_generator(base_class: Type[GroupBy], - klass: Type[FrameOrSeries], - whitelist: FrozenSet[str], - ) -> Iterator[str]: +def whitelist_method_generator( + base_class: Type[GroupBy], klass: Type[FrameOrSeries], whitelist: FrozenSet[str] +) -> Iterator[str]: """ Yields all GroupBy member defs for DataFrame/Series names in whitelist. @@ -80,8 +86,7 @@ class where members are defined. Since we don't want to override methods explicitly defined in the base class, any such name is skipped. """ - property_wrapper_template = \ - """@property + property_wrapper_template = """@property def %(name)s(self) : \"""%(doc)s\""" return self.__getattr__('%(name)s')""" @@ -94,14 +99,13 @@ def %(name)s(self) : # ugly, but we need the name string itself in the method. 
f = getattr(klass, name) doc = f.__doc__ - doc = doc if type(doc) == str else '' + doc = doc if type(doc) == str else "" wrapper_template = property_wrapper_template - params = {'name': name, 'doc': doc} + params = {"name": name, "doc": doc} yield wrapper_template % params class NDFrameGroupBy(GroupBy): - def _iterate_slices(self): if self.axis == 0: # kludge @@ -119,16 +123,15 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): new_items, new_blocks = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only, min_count=min_count) + how, alt=alt, numeric_only=numeric_only, min_count=min_count + ) return self._wrap_agged_blocks(new_items, new_blocks) _block_agg_axis = 0 - def _cython_agg_blocks(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine @@ -145,7 +148,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis, min_count=min_count) + block.values, how, axis=agg_axis, min_count=min_count + ) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions @@ -181,7 +185,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, new_blocks.append(newb) if len(new_blocks) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering @@ -203,13 +207,13 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, offset = 0 for b in new_blocks: loc = len(b.mgr_locs) - b.mgr_locs = indexer[offset:(offset + loc)] + b.mgr_locs = indexer[offset : (offset + loc)] offset += loc return new_items, new_blocks def aggregate(self, func, *args, **kwargs): - _level = kwargs.pop('_level', None) + _level = kwargs.pop("_level", None) relabeling = func is None and _is_multi_agg_with_relabel(**kwargs) if relabeling: @@ -218,8 +222,7 @@ def aggregate(self, func, *args, **kwargs): kwargs = {} elif func is None: # nicer error message - raise TypeError("Must provide 'func' or tuples of " - "'(column, aggfunc).") + raise TypeError("Must provide 'func' or tuples of " "'(column, aggfunc).") func = _maybe_mangle_lambdas(func) @@ -238,11 +241,12 @@ def aggregate(self, func, *args, **kwargs): try: assert not args and not kwargs result = self._aggregate_multiple_funcs( - [func], _level=_level, _axis=self.axis) + [func], _level=_level, _axis=self.axis + ) result.columns = Index( - result.columns.levels[0], - name=self._selected_obj.columns.name) + result.columns.levels[0], name=self._selected_obj.columns.name + ) if isinstance(self.obj, SparseDataFrame): # Backwards compat for groupby.agg() with sparse @@ -266,7 +270,7 @@ def aggregate(self, func, *args, **kwargs): def _aggregate_generic(self, func, *args, **kwargs): if self.grouper.nkeys != 1: - raise AssertionError('Number of keys must be 1') + raise AssertionError("Number of keys must be 1") axis = self.axis obj = self._obj_with_exclusions @@ -275,16 +279,14 @@ def _aggregate_generic(self, func, *args, **kwargs): if axis != obj._info_axis_number: try: for name, data in self: - result[name] = self._try_cast(func(data, *args, 
**kwargs), - data) + result[name] = self._try_cast(func(data, *args, **kwargs), data) except Exception: return self._aggregate_item_by_item(func, *args, **kwargs) else: for name in self.indices: try: data = self.get_group(name, obj=obj) - result[name] = self._try_cast(func(data, *args, **kwargs), - data) + result[name] = self._try_cast(func(data, *args, **kwargs), data) except Exception: wrapper = lambda x: func(x, *args, **kwargs) result[name] = data.apply(wrapper, axis=axis) @@ -304,8 +306,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): for item in obj: try: data = obj[item] - colg = SeriesGroupBy(data, selection=item, - grouper=self.grouper) + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) cast = self._transform_should_cast(func) @@ -342,8 +343,7 @@ def _decide_output_index(self, output, labels): pass if isinstance(labels, MultiIndex): - output_keys = MultiIndex.from_tuples(output_keys, - names=labels.names) + output_keys = MultiIndex.from_tuples(output_keys, names=labels.names) return output_keys @@ -369,8 +369,7 @@ def first_not_none(values): # We'd prefer it return an empty dataframe. return DataFrame() elif isinstance(v, DataFrame): - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: key_index = self.grouper.result_index @@ -400,8 +399,7 @@ def first_not_none(values): return DataFrame() elif isinstance(v, NDFrame): values = [ - x if x is not None else - v._constructor(**v._construct_axes_dict()) + x if x is not None else v._constructor(**v._construct_axes_dict()) for x in values ] @@ -410,11 +408,8 @@ def first_not_none(values): if isinstance(v, (np.ndarray, Index, Series)): if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = _all_indexes_same([ - x.index for x in values - ]) - singular_series = (len(values) == 1 and - applied_index.nlevels == 1) + all_indexed_same = _all_indexes_same([x.index for x in values]) + singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 # provide a reduction (Frame -> Series) if groups are @@ -438,13 +433,12 @@ def first_not_none(values): # path added as of GH 5545 elif all_indexed_same: from pandas.core.reshape.concat import concat + return concat(values) if not all_indexed_same: # GH 8467 - return self._concat_objects( - keys, values, not_indexed_same=True, - ) + return self._concat_objects(keys, values, not_indexed_same=True) try: if self.axis == 0: @@ -462,33 +456,37 @@ def first_not_none(values): # normally use vstack as its faster than concat # and if we have mi-columns - if (isinstance(v.index, MultiIndex) or - key_index is None or - isinstance(key_index, MultiIndex)): - stacked_values = np.vstack([ - np.asarray(v) for v in values - ]) - result = DataFrame(stacked_values, index=key_index, - columns=index) + if ( + isinstance(v.index, MultiIndex) + or key_index is None + or isinstance(key_index, MultiIndex) + ): + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values, index=key_index, columns=index + ) else: # GH5788 instead of stacking; concat gets the # dtypes correct from pandas.core.reshape.concat import concat - result = concat(values, keys=key_index, - names=key_index.names, - axis=self.axis).unstack() + + result = concat( + values, + keys=key_index, + names=key_index.names, + axis=self.axis, + ).unstack() 
result.columns = index else: - stacked_values = np.vstack([np.asarray(v) - for v in values]) - result = DataFrame(stacked_values.T, index=v.index, - columns=key_index) + stacked_values = np.vstack([np.asarray(v) for v in values]) + result = DataFrame( + stacked_values.T, index=v.index, columns=key_index + ) except (ValueError, AttributeError): # GH1738: values is list of arrays of unequal lengths fall # through to the outer else caluse - return Series(values, index=key_index, - name=self._selection_name) + return Series(values, index=key_index, name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -507,14 +505,13 @@ def first_not_none(values): # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return (Series(values, index=key_index) - ._convert(datetime=True, - coerce=coerce)) + return Series(values, index=key_index)._convert( + datetime=True, coerce=coerce + ) else: # Handle cases like BinGrouper - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) def _transform_general(self, func, *args, **kwargs): from pandas.core.reshape.concat import concat @@ -526,7 +523,7 @@ def _transform_general(self, func, *args, **kwargs): path = None for name, group in gen: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) if path is None: # Try slow path and fast path. @@ -535,7 +532,7 @@ def _transform_general(self, func, *args, **kwargs): except TypeError: return self._transform_item_by_item(obj, fast_path) except ValueError: - msg = 'transform must return a scalar value for each group' + msg = "transform must return a scalar value for each group" raise ValueError(msg) else: res = path(group) @@ -553,9 +550,12 @@ def _transform_general(self, func, *args, **kwargs): r.index = group.index else: r = DataFrame( - np.concatenate([res.values] * len(group.index) - ).reshape(group.shape), - columns=group.columns, index=group.index) + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) applied.append(r) else: @@ -564,11 +564,10 @@ def _transform_general(self, func, *args, **kwargs): concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) - concatenated = concatenated.reindex(concat_index, axis=other_axis, - copy=False) + concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) return self._set_result_index_ordered(concatenated) - @Substitution(klass='DataFrame', selected='') + @Substitution(klass="DataFrame", selected="") @Appender(_transform_template) def transform(self, func, *args, **kwargs): @@ -614,18 +613,19 @@ def _transform_fast(self, result, obj, func_nm): res = self._try_cast(res, obj.iloc[:, i]) output.append(res) - return DataFrame._from_arrays(output, columns=result.columns, - index=obj.index) + return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): fast_path = lambda group: getattr(group, func)(*args, **kwargs) slow_path = lambda group: group.apply( - lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis) + lambda x: getattr(x, func)(*args, **kwargs), 
axis=self.axis + ) else: fast_path = lambda group: func(group, *args, **kwargs) slow_path = lambda group: group.apply( - lambda x: func(x, *args, **kwargs), axis=self.axis) + lambda x: func(x, *args, **kwargs), axis=self.axis + ) return fast_path, slow_path def _choose_path(self, fast_path, slow_path, group): @@ -663,7 +663,7 @@ def _transform_item_by_item(self, obj, wrapper): pass if len(output) == 0: # pragma: no cover - raise TypeError('Transform function invalid for data types') + raise TypeError("Transform function invalid for data types") columns = obj.columns if len(output) < len(obj.columns): @@ -712,7 +712,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa gen = self.grouper.get_iterator(obj, axis=self.axis) for name, group in gen: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) res = func(group, *args, **kwargs) @@ -727,9 +727,10 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa indices.append(self._get_index(name)) else: # non scalars aren't allowed - raise TypeError("filter function returned a %s, " - "but expected a scalar bool" % - type(res).__name__) + raise TypeError( + "filter function returned a %s, " + "but expected a scalar bool" % type(res).__name__ + ) return self._apply_filter(indices, dropna) @@ -739,8 +740,7 @@ class SeriesGroupBy(GroupBy): # Make class defs of attributes on SeriesGroupBy whitelist _apply_whitelist = base.series_apply_whitelist - for _def_str in whitelist_method_generator( - GroupBy, Series, _apply_whitelist): + for _def_str in whitelist_method_generator(GroupBy, Series, _apply_whitelist): exec(_def_str) @property @@ -755,15 +755,18 @@ def _selection_name(self): else: return self._selection - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.Series.groupby.apply pandas.Series.groupby.transform pandas.Series.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -800,27 +803,33 @@ def _selection_name(self): minimum maximum 1 1 2 2 3 4 - """) + """ + ) - @Appender(_apply_docs['template'] - .format(input='series', - examples=_apply_docs['series_examples'])) + @Appender( + _apply_docs["template"].format( + input="series", examples=_apply_docs["series_examples"] + ) + ) def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series', - axis='') - @Appender(_shared_docs['aggregate']) + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func_or_funcs=None, *args, **kwargs): - _level = kwargs.pop('_level', None) + _level = kwargs.pop("_level", None) relabeling = func_or_funcs is None columns = None - no_arg_message = ("Must provide 'func_or_funcs' or named " - "aggregation **kwargs.") + no_arg_message = ( + "Must provide 'func_or_funcs' or named " "aggregation **kwargs." + ) if relabeling: columns = list(kwargs) if not PY36: @@ -839,8 +848,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. 
func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, - (_level or 0) + 1) + ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) if relabeling: ret.columns = columns else: @@ -860,11 +868,12 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs): ret = Series(result, index=index) if not self.as_index: # pragma: no cover - print('Warning, ignoring as_index=True') + print("Warning, ignoring as_index=True") # _level handled at higher if not _level and isinstance(ret, dict): from pandas import concat + ret = concat(ret, axis=1) return ret @@ -877,20 +886,21 @@ def _aggregate_multiple_funcs(self, arg, _level): # have not shown a higher level one # GH 15931 if isinstance(self._selected_obj, Series) and _level <= 1: - msg = dedent("""\ + msg = dedent( + """\ using a dict on a Series for aggregation is deprecated and will be removed in a future version. Use \ named aggregation instead. >>> grouper.agg(name_1=func_1, name_2=func_2) - """) + """ + ) warnings.warn(msg, FutureWarning, stacklevel=3) columns = list(arg.keys()) arg = arg.items() elif any(isinstance(x, (tuple, list)) for x in arg): - arg = [(x, x) if not isinstance(x, (tuple, list)) else x - for x in arg] + arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg] # indicated column order columns = next(zip(*arg)) @@ -907,8 +917,9 @@ def _aggregate_multiple_funcs(self, arg, _level): obj = self if name in results: raise SpecificationError( - 'Function names must be unique, found multiple named ' - '{}'.format(name)) + "Function names must be unique, found multiple named " + "{}".format(name) + ) # reset the cache so that we # only include the named selection @@ -938,15 +949,13 @@ def _wrap_output(self, output, index, names=None): return Series(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): - result = self._wrap_output(output=output, - index=self.grouper.result_index, - names=names) + result = self._wrap_output( + output=output, index=self.grouper.result_index, names=names + ) return self._reindex_output(result)._convert(datetime=True) def _wrap_transformed_output(self, output, names=None): - return self._wrap_output(output=output, - index=self.obj.index, - names=names) + return self._wrap_output(output=output, index=self.obj.index, names=names) def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: @@ -971,17 +980,13 @@ def _get_index(): return result if isinstance(values[0], Series): - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif isinstance(values[0], DataFrame): # possible that Series -> DataFrame by applied function - return self._concat_objects(keys, values, - not_indexed_same=not_indexed_same) + return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(data=values, - index=_get_index(), - name=self._selection_name) + result = Series(data=values, index=_get_index(), name=self._selection_name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -991,12 +996,12 @@ def _aggregate_named(self, func, *args, **kwargs): group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise Exception('Must produce aggregated value') + raise Exception("Must produce aggregated value") result[name] = self._try_cast(output, group) 
return result - @Substitution(klass='Series', selected='A.') + @Substitution(klass="Series", selected="A.") @Appender(_transform_template) def transform(self, func, *args, **kwargs): func = self._is_cython_func(func) or func @@ -1009,17 +1014,18 @@ def transform(self, func, *args, **kwargs): else: # cythonized aggregation and merge return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func) + lambda: getattr(self, func)(*args, **kwargs), func + ) # reg transform klass = self._selected_obj.__class__ results = [] wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: - object.__setattr__(group, 'name', name) + object.__setattr__(group, "name", name) res = wrapper(group) - if hasattr(res, 'values'): + if hasattr(res, "values"): res = res.values indexer = self._get_index(name) @@ -1029,6 +1035,7 @@ def transform(self, func, *args, **kwargs): # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat + result = concat(results).sort_index() else: result = Series() @@ -1099,8 +1106,9 @@ def true_and_notna(x, *args, **kwargs): return b and notna(b) try: - indices = [self._get_index(name) for name, group in self - if true_and_notna(group)] + indices = [ + self._get_index(name) for name, group in self if true_and_notna(group) + ] except ValueError: raise TypeError("the filter must return a boolean result") except TypeError: @@ -1125,7 +1133,7 @@ def nunique(self, dropna=True): try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = 'val.dtype must be object, got {}'.format(val.dtype) + msg = "val.dtype must be object, got {}".format(val.dtype) assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) @@ -1149,7 +1157,7 @@ def nunique(self, dropna=True): inc[mask & np.r_[False, mask[:-1]]] = 0 inc[idx] = 1 - out = np.add.reduceat(inc, idx).astype('int64', copy=False) + out = np.add.reduceat(inc, idx).astype("int64", copy=False) if len(ids): # NaN/NaT group exists if the head of ids is -1, # so remove it from res and exclude its index from idx @@ -1167,9 +1175,7 @@ def nunique(self, dropna=True): res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, - index=ri, - name=self._selection_name) + return Series(res, index=ri, name=self._selection_name) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -1178,8 +1184,9 @@ def describe(self, **kwargs): return result.T return result.unstack() - def value_counts(self, normalize=False, sort=True, ascending=False, - bins=None, dropna=True): + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers @@ -1187,11 +1194,13 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level # in a backward compatible way - return self.apply(Series.value_counts, - normalize=normalize, - sort=sort, - ascending=ascending, - bins=bins) + return self.apply( + Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + ) ids, _, _ = self.grouper.group_info val = self.obj._internal_get_values() @@ -1244,7 +1253,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels = out[mask], [label[mask] for label in labels] if normalize: - out = out.astype('float') + 
out = out.astype("float") d = np.diff(np.r_[idx, len(ids)]) if dropna: m = ids[lab == -1] @@ -1260,8 +1269,9 @@ def value_counts(self, normalize=False, sort=True, ascending=False, out, labels[-1] = out[sorter], labels[-1][sorter] if bins is None: - mi = MultiIndex(levels=levels, codes=labels, names=names, - verify_integrity=False) + mi = MultiIndex( + levels=levels, codes=labels, names=names, verify_integrity=False + ) if is_integer_dtype(out): out = ensure_int64(out) @@ -1269,18 +1279,17 @@ def value_counts(self, normalize=False, sort=True, ascending=False, # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros - diff = np.zeros(len(out), dtype='bool') + diff = np.zeros(len(out), dtype="bool") for lab in labels[:-1]: diff |= np.r_[True, lab[1:] != lab[:-1]] ncat, nbin = diff.sum(), len(levels[-1]) - left = [np.repeat(np.arange(ncat), nbin), - np.tile(np.arange(nbin), ncat)] + left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)] right = [diff.cumsum() - 1, labels[-1]] - _, idx = _get_join_indexers(left, right, sort=False, how='left') + _, idx = _get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) if sort: @@ -1291,8 +1300,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1])) codes.append(left[-1]) - mi = MultiIndex(levels=levels, codes=codes, names=names, - verify_integrity=False) + mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False) if is_integer_dtype(out): out = ensure_int64(out) @@ -1315,22 +1323,26 @@ def count(self): minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series(out, - index=self.grouper.result_index, - name=self._selection_name, - dtype='int64') + return Series( + out, + index=self.grouper.result_index, + name=self._selection_name, + dtype="int64", + ) def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): """Calculate pct_change of each value to previous entry in group""" # TODO: Remove this conditional when #23918 is fixed if freq: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq)) + return self.apply( + lambda x: x.pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + ) filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.labels) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -1344,21 +1356,23 @@ class DataFrameGroupBy(NDFrameGroupBy): # # Make class defs of attributes on DataFrameGroupBy whitelist. - for _def_str in whitelist_method_generator( - GroupBy, DataFrame, _apply_whitelist): + for _def_str in whitelist_method_generator(GroupBy, DataFrame, _apply_whitelist): exec(_def_str) _block_agg_axis = 1 - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.groupby.apply pandas.DataFrame.groupby.transform pandas.DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1426,14 +1440,17 @@ class DataFrameGroupBy(NDFrameGroupBy): As usual, the aggregation can be a callable or a string alias. See :ref:`groupby.aggregate.named` for more. 
- """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg=None, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) @@ -1456,17 +1473,21 @@ def _gotitem(self, key, ndim, subset=None): if ndim == 2: if subset is None: subset = self.obj - return DataFrameGroupBy(subset, self.grouper, selection=key, - grouper=self.grouper, - exclusions=self.exclusions, - as_index=self.as_index, - observed=self.observed) + return DataFrameGroupBy( + subset, + self.grouper, + selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index, + observed=self.observed, + ) elif ndim == 1: if subset is None: subset = self.obj[key] - return SeriesGroupBy(subset, selection=key, - grouper=self.grouper, - observed=self.observed) + return SeriesGroupBy( + subset, selection=key, grouper=self.grouper, observed=self.observed + ) raise AssertionError("invalid ndim for _gotitem") @@ -1474,11 +1495,9 @@ def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] if self.axis == 0: - return DataFrame(result, index=obj.columns, - columns=result_index).T + return DataFrame(result, index=obj.columns, columns=result_index).T else: - return DataFrame(result, index=obj.index, - columns=result_index) + return DataFrame(result, index=obj.index, columns=result_index) def _get_data_to_aggregate(self): obj = self._obj_with_exclusions @@ -1489,10 +1508,16 @@ def _get_data_to_aggregate(self): def _insert_inaxis_grouper_inplace(self, result): # zip in reverse so we can always insert at loc 0 - izip = zip(* map(reversed, ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings]))) + izip = zip( + *map( + reversed, + ( + self.grouper.names, + self.grouper.get_group_levels(), + [grp.in_axis for grp in self.grouper.groupings], + ), + ) + ) for name, lev, in_axis in izip: if in_axis: @@ -1540,17 +1565,21 @@ def _wrap_agged_blocks(self, items, blocks): def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): - yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], - selection=colname, - grouper=self.grouper, - exclusions=self.exclusions) + yield colname, SeriesGroupBy( + self._selected_obj.iloc[:, i], + selection=colname, + grouper=self.grouper, + exclusions=self.exclusions, + ) def _apply_to_column_groupbys(self, func): from pandas.core.reshape.concat import concat + return concat( - (func(col_groupby) for _, col_groupby - in self._iterate_column_groupbys()), - keys=self._selected_obj.columns, axis=1) + (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), + keys=self._selected_obj.columns, + axis=1, + ) def count(self): """ @@ -1567,12 +1596,10 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - val = ((mask & ~_isna(np.atleast_2d(blk.get_values()))) - for blk in data.blocks) + val = ((mask & ~_isna(np.atleast_2d(blk.get_values()))) for blk in data.blocks) loc = (blk.mgr_locs for blk in data.blocks) - counter = partial( - lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1) + counter = partial(lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1) blk = map(make_block, map(counter, val), loc) return 
self._wrap_agged_blocks(data.items, list(blk)) @@ -1628,14 +1655,15 @@ def nunique(self, dropna=True): obj = self._selected_obj def groupby_series(obj, col=None): - return SeriesGroupBy(obj, - selection=col, - grouper=self.grouper).nunique(dropna=dropna) + return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique( + dropna=dropna + ) if isinstance(obj, Series): results = groupby_series(obj) else: from pandas.core.reshape.concat import concat + results = [groupby_series(obj[col], col) for col in obj.columns] results = concat(results, axis=1) results.columns.names = obj.columns.names @@ -1669,10 +1697,7 @@ def _is_multi_agg_with_relabel(**kwargs): >>> _is_multi_agg_with_relabel() False """ - return all( - isinstance(v, tuple) and len(v) == 2 - for v in kwargs.values() - ) and kwargs + return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs def _normalize_keyword_aggregation(kwargs): @@ -1719,8 +1744,7 @@ def _normalize_keyword_aggregation(kwargs): aggspec[column].append(aggfunc) else: aggspec[column] = [aggfunc] - order.append((column, - com.get_callable_name(aggfunc) or aggfunc)) + order.append((column, com.get_callable_name(aggfunc) or aggfunc)) return aggspec, columns, order @@ -1730,6 +1754,7 @@ def _normalize_keyword_aggregation(kwargs): # typing.Sequence[Callable[..., ScalarResult]] # -> typing.Sequence[Callable[..., ScalarResult]]: + def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: """ Possibly mangle a list of aggfuncs. @@ -1756,7 +1781,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]: for aggfunc in aggfuncs: if com.get_callable_name(aggfunc) == "<lambda>": aggfunc = functools.partial(aggfunc) - aggfunc.__name__ = '<lambda_{}>'.format(i) + aggfunc.__name__ = "<lambda_{}>".format(i) i += 1 mangled_aggfuncs.append(aggfunc) @@ -1828,13 +1853,15 @@ def _recast_datetimelike_result(result: DataFrame) -> DataFrame: """ result = result.copy() - obj_cols = [idx for idx in range(len(result.columns)) - if is_object_dtype(result.dtypes[idx])] + obj_cols = [ + idx for idx in range(len(result.columns)) if is_object_dtype(result.dtypes[idx]) + ] # See GH#26285 for n in obj_cols: - converted = maybe_convert_objects(result.iloc[:, n].values, - convert_numeric=False) + converted = maybe_convert_objects( + result.iloc[:, n].values, convert_numeric=False + ) result.iloc[:, n] = converted return result diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 925f006de92b6..aa71fd68086fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -28,16 +28,24 @@ class providing the base-class of operations.
from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, - is_numeric_dtype, is_scalar) + ensure_float, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.missing import isna, notna -from pandas.api.types import ( - is_datetime64_dtype, is_integer_dtype, is_object_dtype) +from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_object_dtype import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical from pandas.core.base import ( - DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError) + DataError, + GroupByError, + PandasObject, + SelectionMixin, + SpecificationError, +) import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -169,7 +177,8 @@ class providing the base-class of operations. Examples -------- {examples} - """) + """, +) _pipe_template = """ Apply a function `func` with arguments to this %(klass)s object and return @@ -303,14 +312,17 @@ def __init__(self, groupby): def __call__(self, *args, **kwargs): def f(self): return self.plot(*args, **kwargs) - f.__name__ = 'plot' + + f.__name__ = "plot" return self._groupby.apply(f) def __getattr__(self, name): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) + return self._groupby.apply(f) + return attr @@ -328,10 +340,22 @@ class _GroupBy(PandasObject, SelectionMixin): _group_selection = None _apply_whitelist = frozenset() # type: FrozenSet[str] - def __init__(self, obj, keys=None, axis=0, level=None, - grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True, squeeze=False, - observed=False, **kwargs): + def __init__( + self, + obj, + keys=None, + axis=0, + level=None, + grouper=None, + exclusions=None, + selection=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + observed=False, + **kwargs + ): self._selection = selection @@ -342,9 +366,9 @@ def __init__(self, obj, keys=None, axis=0, level=None, if not as_index: if not isinstance(obj, DataFrame): - raise TypeError('as_index=False only valid with DataFrame') + raise TypeError("as_index=False only valid with DataFrame") if axis != 0: - raise ValueError('as_index=False only valid for axis=0') + raise ValueError("as_index=False only valid for axis=0") self.as_index = as_index self.keys = keys @@ -352,16 +376,20 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.group_keys = group_keys self.squeeze = squeeze self.observed = observed - self.mutated = kwargs.pop('mutated', False) + self.mutated = kwargs.pop("mutated", False) if grouper is None: from pandas.core.groupby.grouper import _get_grouper - grouper, exclusions, obj = _get_grouper(obj, keys, - axis=axis, - level=level, - sort=sort, - observed=observed, - mutated=self.mutated) + + grouper, exclusions, obj = _get_grouper( + obj, + keys, + axis=axis, + level=level, + sort=sort, + observed=observed, + mutated=self.mutated, + ) self.obj = obj self.axis = obj._get_axis_number(axis) @@ -369,7 +397,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.exclusions = set(exclusions) if exclusions else set() # we accept no other args - validate_kwargs('group', kwargs, {}) + validate_kwargs("group", kwargs, {}) def __len__(self): return len(self.groups) @@ -428,13 +456,12 @@ def get_converter(s): if len(self.indices) > 0: index_sample = 
next(iter(self.indices)) else: - index_sample = None # Dummy sample + index_sample = None # Dummy sample name_sample = names[0] if isinstance(index_sample, tuple): if not isinstance(name_sample, tuple): - msg = ("must supply a tuple to get_group with multiple" - " grouping keys") + msg = "must supply a tuple to get_group with multiple" " grouping keys" raise ValueError(msg) if not len(name_sample) == len(index_sample): try: @@ -442,13 +469,14 @@ def get_converter(s): return [self.indices[name] for name in names] except KeyError: # turns out it wasn't a tuple - msg = ("must supply a same-length tuple to get_group" - " with multiple grouping keys") + msg = ( + "must supply a same-length tuple to get_group" + " with multiple grouping keys" + ) raise ValueError(msg) converters = [get_converter(s) for s in index_sample] - names = (tuple(f(n) for f, n in zip(converters, name)) - for name in names) + names = (tuple(f(n) for f, n in zip(converters, name)) for name in names) else: converter = get_converter(index_sample) @@ -482,7 +510,7 @@ def _reset_group_selection(self): if self._group_selection is not None: # GH12839 clear cached selection too when changing group selection self._group_selection = None - self._reset_cache('_selected_obj') + self._reset_cache("_selected_obj") def _set_group_selection(self): """ @@ -493,21 +521,21 @@ def _set_group_selection(self): NOTE: this should be paired with a call to _reset_group_selection """ grp = self.grouper - if not (self.as_index and - getattr(grp, 'groupings', None) is not None and - self.obj.ndim > 1 and - self._group_selection is None): + if not ( + self.as_index + and getattr(grp, "groupings", None) is not None + and self.obj.ndim > 1 + and self._group_selection is None + ): return ax = self.obj._info_axis - groupers = [g.name for g in grp.groupings - if g.level is None and g.in_axis] + groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis] if len(groupers): # GH12839 clear selected obj cache when group selection changes - self._group_selection = ax.difference(Index(groupers), - sort=False).tolist() - self._reset_cache('_selected_obj') + self._group_selection = ax.difference(Index(groupers), sort=False).tolist() + self._reset_cache("_selected_obj") def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -516,13 +544,11 @@ def _set_result_index_ordered(self, result): # the values/counts are repeated according to the group index # shortcut if we have an already ordered grouper if not self.grouper.is_monotonic: - index = Index(np.concatenate( - self._get_indices(self.grouper.result_index))) + index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, - inplace=True) + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self): @@ -536,12 +562,14 @@ def __getattr__(self, attr): if hasattr(self.obj, attr): return self._make_wrapper(attr) - raise AttributeError("%r object has no attribute %r" % - (type(self).__name__, attr)) + raise AttributeError( + "%r object has no attribute %r" % (type(self).__name__, attr) + ) - @Substitution(klass='GroupBy', - versionadded='.. versionadded:: 0.21.0', - examples="""\ + @Substitution( + klass="GroupBy", + versionadded=".. 
versionadded:: 0.21.0", + examples="""\ >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) >>> df A B @@ -557,7 +585,8 @@ def __getattr__(self, attr): B A a 2 -b 2""") +b 2""", + ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return com._pipe(self, func, *args, **kwargs) @@ -567,10 +596,11 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): if name not in self._apply_whitelist: is_callable = callable(getattr(self._selected_obj, name, None)) - kind = ' callable ' if is_callable else ' ' - msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try " - "using the 'apply' method".format(kind, name, - type(self).__name__)) + kind = " callable " if is_callable else " " + msg = ( + "Cannot access{0}attribute {1!r} of {2!r} objects, try " + "using the 'apply' method".format(kind, name, type(self).__name__) + ) raise AttributeError(msg) self._set_group_selection() @@ -587,9 +617,8 @@ def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument kwargs_with_axis = kwargs.copy() - if ('axis' not in kwargs_with_axis or - kwargs_with_axis['axis'] is None): - kwargs_with_axis['axis'] = self.axis + if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None: + kwargs_with_axis["axis"] = self.axis def curried_with_axis(x): return f(x, *args, **kwargs_with_axis) @@ -620,8 +649,7 @@ def curried(x): # if we don't have this method to indicated to aggregate to # mark this column as an error try: - return self._aggregate_item_by_item(name, - *args, **kwargs) + return self._aggregate_item_by_item(name, *args, **kwargs) except (AttributeError): raise ValueError @@ -664,9 +692,11 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) - @Appender(_apply_docs['template'] - .format(input="dataframe", - examples=_apply_docs['dataframe_examples'])) + @Appender( + _apply_docs["template"].format( + input="dataframe", examples=_apply_docs["dataframe_examples"] + ) + ) def apply(self, func, *args, **kwargs): func = self._is_builtin_func(func) @@ -679,16 +709,18 @@ def apply(self, func, *args, **kwargs): @wraps(func) def f(g): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): return func(g, *args, **kwargs) + else: - raise ValueError('func must be a callable if args or ' - 'kwargs are supplied') + raise ValueError( + "func must be a callable if args or " "kwargs are supplied" + ) else: f = func # ignore SettingWithCopy here in case the user mutates - with option_context('mode.chained_assignment', None): + with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f) except Exception: @@ -707,13 +739,11 @@ def f(g): return result def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, - self.axis) + keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) return self._wrap_applied_output( - keys, - values, - not_indexed_same=mutated or self.mutated) + keys, values, not_indexed_same=mutated or self.mutated + ) def _iterate_slices(self): yield self._selection_name, self._selected_obj @@ -775,7 +805,7 @@ def _try_cast(self, result, obj, numeric_only=False): # to the target timezone try: result = obj._values._from_sequence( - result, dtype='datetime64[ns, UTC]' + result, dtype="datetime64[ns, UTC]" ) result = result.astype(dtype) except TypeError: @@ -813,7 +843,8 @@ def _transform_should_cast(self, func_nm): Whether transform should attempt to cast the result of 
aggregation """ return (self.size().fillna(0) > 0).any() and ( - func_nm not in base.cython_cast_blacklist) + func_nm not in base.cython_cast_blacklist + ) def _cython_transform(self, how, numeric_only=True, **kwargs): output = collections.OrderedDict() @@ -823,8 +854,7 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): continue try: - result, names = self.grouper.transform(obj.values, how, - **kwargs) + result, names = self.grouper.transform(obj.values, how, **kwargs) except NotImplementedError: continue except AssertionError as e: @@ -835,12 +865,11 @@ def _cython_transform(self, how, numeric_only=True, **kwargs): output[name] = result if len(output) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, alt=None, numeric_only=True, - min_count=-1): + def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -848,14 +877,15 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True, continue try: - result, names = self.grouper.aggregate(obj.values, how, - min_count=min_count) + result, names = self.grouper.aggregate( + obj.values, how, min_count=min_count + ) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) if len(output) == 0: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") return self._wrap_aggregated_output(output, names) @@ -918,7 +948,8 @@ def reset_identity(values): # GH 14776 if isinstance(ax, MultiIndex) and not ax.is_unique: indexer = algorithms.unique1d( - result.index.get_indexer_for(ax.values)) + result.index.get_indexer_for(ax.values) + ) result = result.take(indexer, axis=self.axis) else: result = result.reindex(ax, axis=self.axis) @@ -933,9 +964,14 @@ def reset_identity(values): group_levels = self.grouper.levels group_names = self.grouper.names - result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names, - sort=False) + result = concat( + values, + axis=self.axis, + keys=group_keys, + levels=group_levels, + names=group_names, + sort=False, + ) else: # GH5610, returns a MI, with the first level being a @@ -946,8 +982,10 @@ def reset_identity(values): values = reset_identity(values) result = concat(values, axis=self.axis) - if (isinstance(result, Series) and - getattr(self, '_selection_name', None) is not None): + if ( + isinstance(result, Series) + and getattr(self, "_selection_name", None) is not None + ): result.name = self._selection_name @@ -955,7 +993,7 @@ def reset_identity(values): def _apply_filter(self, indices, dropna): if len(indices) == 0: - indices = np.array([], dtype='int64') + indices = np.array([], dtype="int64") else: indices = np.sort(np.concatenate(indices)) if dropna: @@ -1038,6 +1076,7 @@ class GroupBy(_GroupBy): See the online documentation for full exposition on these topics and much more """ + def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. 
@@ -1054,16 +1093,20 @@ def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return result.astype(inference, copy=False) - return self._get_cythonized_result('group_any_all', self.grouper, - aggregate=True, - cython_dtype=np.uint8, - needs_values=True, - needs_mask=True, - pre_processing=objs_to_bool, - post_processing=result_to_bool, - val_test=val_test, skipna=skipna) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_any_all", + self.grouper, + aggregate=True, + cython_dtype=np.uint8, + needs_values=True, + needs_mask=True, + pre_processing=objs_to_bool, + post_processing=result_to_bool, + val_test=val_test, + skipna=skipna, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) def any(self, skipna=True): """ @@ -1078,9 +1121,9 @@ def any(self, skipna=True): ------- bool """ - return self._bool_agg('any', skipna) + return self._bool_agg("any", skipna) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def all(self, skipna=True): """ @@ -1095,9 +1138,9 @@ def all(self, skipna=True): ------- bool """ - return self._bool_agg('all', skipna) + return self._bool_agg("all", skipna) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def count(self): """ @@ -1112,7 +1155,7 @@ def count(self): # defined here for API doc raise NotImplementedError - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def mean(self, *args, **kwargs): """ @@ -1156,10 +1199,11 @@ def mean(self, *args, **kwargs): 2 4.0 Name: B, dtype: float64 """ - nv.validate_groupby_func('mean', args, kwargs, ['numeric_only']) + nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"]) try: return self._cython_agg_general( - 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs) + "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs + ) except GroupByError: raise except Exception: # pragma: no cover @@ -1167,7 +1211,7 @@ def mean(self, *args, **kwargs): f = lambda x: x.mean(axis=self.axis, **kwargs) return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def median(self, **kwargs): """ @@ -1182,10 +1226,10 @@ def median(self, **kwargs): """ try: return self._cython_agg_general( - 'median', - alt=lambda x, - axis: Series(x).median(axis=axis, **kwargs), - **kwargs) + "median", + alt=lambda x, axis: Series(x).median(axis=axis, **kwargs), + **kwargs + ) except GroupByError: raise except Exception: # pragma: no cover @@ -1194,10 +1238,11 @@ def f(x): if isinstance(x, np.ndarray): x = Series(x) return x.median(axis=self.axis, **kwargs) + with _group_selection_context(self): return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def std(self, ddof=1, *args, **kwargs): """ @@ -1217,10 +1262,10 @@ def std(self, ddof=1, *args, **kwargs): """ # TODO: implement at Cython level? - nv.validate_groupby_func('std', args, kwargs) + nv.validate_groupby_func("std", args, kwargs) return np.sqrt(self.var(ddof=ddof, **kwargs)) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def var(self, ddof=1, *args, **kwargs): """ @@ -1238,13 +1283,14 @@ def var(self, ddof=1, *args, **kwargs): Series or DataFrame Variance of values within each group. 
""" - nv.validate_groupby_func('var', args, kwargs) + nv.validate_groupby_func("var", args, kwargs) if ddof == 1: try: return self._cython_agg_general( - 'var', + "var", alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs), - **kwargs) + **kwargs + ) except Exception: f = lambda x: x.var(ddof=ddof, **kwargs) with _group_selection_context(self): @@ -1254,7 +1300,7 @@ def var(self, ddof=1, *args, **kwargs): with _group_selection_context(self): return self._python_agg_general(f) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def sem(self, ddof=1): """ @@ -1274,7 +1320,7 @@ def sem(self, ddof=1): """ return self.std(ddof=ddof) / np.sqrt(self.count()) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def size(self): """ @@ -1288,7 +1334,7 @@ def size(self): result = self.grouper.size() if isinstance(self.obj, Series): - result.name = getattr(self.obj, 'name', None) + result.name = getattr(self.obj, "name", None) return result @classmethod @@ -1297,9 +1343,7 @@ def _add_numeric_operations(cls): Add numeric operations to the GroupBy generically. """ - def groupby_function(name, alias, npfunc, - numeric_only=True, - min_count=-1): + def groupby_function(name, alias, npfunc, numeric_only=True, min_count=-1): _local_template = """ Compute %(f)s of group values. @@ -1310,38 +1354,34 @@ def groupby_function(name, alias, npfunc, Computed %(f)s of values within each group. """ - @Substitution(name='groupby', f=name) + @Substitution(name="groupby", f=name) @Appender(_common_see_also) @Appender(_local_template) def f(self, **kwargs): - if 'numeric_only' not in kwargs: - kwargs['numeric_only'] = numeric_only - if 'min_count' not in kwargs: - kwargs['min_count'] = min_count + if "numeric_only" not in kwargs: + kwargs["numeric_only"] = numeric_only + if "min_count" not in kwargs: + kwargs["min_count"] = min_count self._set_group_selection() # try a cython aggregation if we can try: - return self._cython_agg_general( - alias, alt=npfunc, **kwargs) + return self._cython_agg_general(alias, alt=npfunc, **kwargs) except AssertionError as e: raise SpecificationError(str(e)) except Exception: pass # apply a non-cython aggregation - result = self.aggregate( - lambda x: npfunc(x, axis=self.axis)) + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) # coerce the resulting columns if we can if isinstance(result, DataFrame): for col in result.columns: - result[col] = self._try_cast( - result[col], self.obj[col]) + result[col] = self._try_cast(result[col], self.obj[col]) else: - result = self._try_cast( - result, self.obj) + result = self._try_cast(result, self.obj) return result @@ -1350,7 +1390,6 @@ def f(self, **kwargs): return f def first_compat(x, axis=0): - def first(x): x = x.to_numpy() @@ -1365,7 +1404,6 @@ def first(x): return first(x) def last_compat(x, axis=0): - def last(x): x = x.to_numpy() x = x[notna(x)] @@ -1378,16 +1416,14 @@ def last(x): else: return last(x) - cls.sum = groupby_function('sum', 'add', np.sum, min_count=0) - cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0) - cls.min = groupby_function('min', 'min', np.min, numeric_only=False) - cls.max = groupby_function('max', 'max', np.max, numeric_only=False) - cls.first = groupby_function('first', 'first', first_compat, - numeric_only=False) - cls.last = groupby_function('last', 'last', last_compat, - numeric_only=False) + cls.sum = groupby_function("sum", "add", np.sum, min_count=0) + cls.prod = groupby_function("prod", "prod", np.prod, 
min_count=0) + cls.min = groupby_function("min", "min", np.min, numeric_only=False) + cls.max = groupby_function("max", "max", np.max, numeric_only=False) + cls.first = groupby_function("first", "first", first_compat, numeric_only=False) + cls.last = groupby_function("last", "last", last_compat, numeric_only=False) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def ohlc(self): """ @@ -1401,8 +1437,7 @@ def ohlc(self): Open, high, low and close values within each group. """ - return self._apply_to_column_groupbys( - lambda x: x._cython_agg_general('ohlc')) + return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) @Appender(DataFrame.describe.__doc__) def describe(self, **kwargs): @@ -1519,18 +1554,20 @@ def resample(self, rule, *args, **kwargs): 5 2000-01-01 00:00:20 5 1 """ from pandas.core.resample import get_resampler_for_grouping + return get_resampler_for_grouping(self, rule, *args, **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def rolling(self, *args, **kwargs): """ Return a rolling grouper, providing rolling functionality per group. """ from pandas.core.window import RollingGroupby + return RollingGroupby(self, *args, **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def expanding(self, *args, **kwargs): """ @@ -1538,6 +1575,7 @@ def expanding(self, *args, **kwargs): functionality per group. """ from pandas.core.window import ExpandingGroupby + return ExpandingGroupby(self, *args, **kwargs) def _fill(self, direction, limit=None): @@ -1567,13 +1605,17 @@ def _fill(self, direction, limit=None): if limit is None: limit = -1 - return self._get_cythonized_result('group_fillna_indexer', - self.grouper, needs_mask=True, - cython_dtype=np.int64, - result_is_index=True, - direction=direction, limit=limit) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_fillna_indexer", + self.grouper, + needs_mask=True, + cython_dtype=np.int64, + result_is_index=True, + direction=direction, + limit=limit, + ) + + @Substitution(name="groupby") def pad(self, limit=None): """ Forward fill the values. @@ -1595,10 +1637,11 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self._fill('ffill', limit=limit) + return self._fill("ffill", limit=limit) + ffill = pad - @Substitution(name='groupby') + @Substitution(name="groupby") def backfill(self, limit=None): """ Backward fill the values. @@ -1620,14 +1663,13 @@ def backfill(self, limit=None): Series.fillna DataFrame.fillna """ - return self._fill('bfill', limit=limit) + return self._fill("bfill", limit=limit) + bfill = backfill - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def nth(self, - n: Union[int, List[int]], - dropna: Optional[str] = None) -> DataFrame: + def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: """ Take the nth row from each group if n is an int, or a subset of rows if n is a list of ints. 
@@ -1717,8 +1759,7 @@ def nth(self, self._set_group_selection() mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, - -nth_array) + mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) mask = mask_left | mask_right ids, _, _ = self.grouper.group_info @@ -1736,19 +1777,19 @@ def nth(self, # dropna is truthy if isinstance(n, valid_containers): - raise ValueError( - "dropna option with a list of nth values is not supported") + raise ValueError("dropna option with a list of nth values is not supported") - if dropna not in ['any', 'all']: + if dropna not in ["any", "all"]: # Note: when agg-ing picker doesn't raise this, just returns NaN - raise ValueError("For a DataFrame groupby, dropna must be " - "either None, 'any' or 'all', " - "(was passed {dropna}).".format( - dropna=dropna)) + raise ValueError( + "For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed {dropna}).".format(dropna=dropna) + ) # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf - max_len = n if n >= 0 else - 1 - n + max_len = n if n >= 0 else -1 - n dropped = self.obj.dropna(how=dropna, axis=self.axis) # get a new grouper for our dropped obj @@ -1765,13 +1806,17 @@ def nth(self, # create a grouper with the original parameters, but on dropped # object from pandas.core.groupby.grouper import _get_grouper - grouper, _, _ = _get_grouper(dropped, key=self.keys, - axis=self.axis, level=self.level, - sort=self.sort, - mutated=self.mutated) - grb = dropped.groupby( - grouper, as_index=self.as_index, sort=self.sort) + grouper, _, _ = _get_grouper( + dropped, + key=self.keys, + axis=self.axis, + level=self.level, + sort=self.sort, + mutated=self.mutated, + ) + + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) sizes, result = grb.size(), grb.nth(n) mask = (sizes < max_len).values @@ -1780,15 +1825,16 @@ def nth(self, result.loc[mask] = np.nan # reset/reindex to the original groups - if (len(self.obj) == len(dropped) or - len(result) == len(self.grouper.result_index)): + if len(self.obj) == len(dropped) or len(result) == len( + self.grouper.result_index + ): result.index = self.grouper.result_index else: result = result.reindex(self.grouper.result_index) return result - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -1823,44 +1869,46 @@ def quantile(self, q=0.5, interpolation='linear'): b 3.0 """ - def pre_processor( - vals: np.ndarray - ) -> Tuple[np.ndarray, Optional[Type]]: + def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: if is_object_dtype(vals): - raise TypeError("'quantile' cannot be performed against " - "'object' dtypes!") + raise TypeError( + "'quantile' cannot be performed against " "'object' dtypes!" 
+ ) inference = None if is_integer_dtype(vals): inference = np.int64 elif is_datetime64_dtype(vals): - inference = 'datetime64[ns]' + inference = "datetime64[ns]" vals = vals.astype(np.float) return vals, inference - def post_processor( - vals: np.ndarray, - inference: Optional[Type] - ) -> np.ndarray: + def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: if inference: # Check for edge case - if not (is_integer_dtype(inference) and - interpolation in {'linear', 'midpoint'}): + if not ( + is_integer_dtype(inference) + and interpolation in {"linear", "midpoint"} + ): vals = vals.astype(inference) return vals - return self._get_cythonized_result('group_quantile', self.grouper, - aggregate=True, - needs_values=True, - needs_mask=True, - cython_dtype=np.float64, - pre_processing=pre_processor, - post_processing=post_processor, - q=q, interpolation=interpolation) - - @Substitution(name='groupby') + return self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + + @Substitution(name="groupby") def ngroup(self, ascending=True): """ Number each group from 0 to the number of groups - 1. @@ -1931,7 +1979,7 @@ def ngroup(self, ascending=True): result = self.ngroups - 1 - result return result - @Substitution(name='groupby') + @Substitution(name="groupby") def cumcount(self, ascending=True): """ Number each item in each group from 0 to the length of that group - 1. @@ -1990,10 +2038,11 @@ def cumcount(self, ascending=True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) - def rank(self, method='average', ascending=True, na_option='keep', - pct=False, axis=0): + def rank( + self, method="average", ascending=True, na_option="keep", pct=False, axis=0 + ): """ Provide the rank of values within each group. 
@@ -2020,14 +2069,20 @@ def rank(self, method='average', ascending=True, na_option='keep', ------- DataFrame with ranking of values within each group """ - if na_option not in {'keep', 'top', 'bottom'}: + if na_option not in {"keep", "top", "bottom"}: msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) - return self._cython_transform('rank', numeric_only=False, - ties_method=method, ascending=ascending, - na_option=na_option, pct=pct, axis=axis) - - @Substitution(name='groupby') + return self._cython_transform( + "rank", + numeric_only=False, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + axis=axis, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) def cumprod(self, axis=0, *args, **kwargs): """ @@ -2037,14 +2092,13 @@ def cumprod(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - nv.validate_groupby_func('cumprod', args, kwargs, - ['numeric_only', 'skipna']) + nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis, **kwargs)) - return self._cython_transform('cumprod', **kwargs) + return self._cython_transform("cumprod", **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cumsum(self, axis=0, *args, **kwargs): """ @@ -2054,14 +2108,13 @@ def cumsum(self, axis=0, *args, **kwargs): ------- Series or DataFrame """ - nv.validate_groupby_func('cumsum', args, kwargs, - ['numeric_only', 'skipna']) + nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"]) if axis != 0: return self.apply(lambda x: x.cumsum(axis=axis, **kwargs)) - return self._cython_transform('cumsum', **kwargs) + return self._cython_transform("cumsum", **kwargs) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cummin(self, axis=0, **kwargs): """ @@ -2074,9 +2127,9 @@ def cummin(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) - return self._cython_transform('cummin', numeric_only=False) + return self._cython_transform("cummin", numeric_only=False) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def cummax(self, axis=0, **kwargs): """ @@ -2089,14 +2142,22 @@ def cummax(self, axis=0, **kwargs): if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) - return self._cython_transform('cummax', numeric_only=False) - - def _get_cythonized_result(self, how, grouper, aggregate=False, - cython_dtype=None, needs_values=False, - needs_mask=False, needs_ngroups=False, - result_is_index=False, - pre_processing=None, post_processing=None, - **kwargs): + return self._cython_transform("cummax", numeric_only=False) + + def _get_cythonized_result( + self, + how, + grouper, + aggregate=False, + cython_dtype=None, + needs_values=False, + needs_mask=False, + needs_ngroups=False, + result_is_index=False, + pre_processing=None, + post_processing=None, + **kwargs + ): """ Get result for Cythonized functions. @@ -2140,8 +2201,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, `Series` or `DataFrame` with filled values """ if result_is_index and aggregate: - raise ValueError("'result_is_index' and 'aggregate' cannot both " - "be True!") + raise ValueError( + "'result_is_index' and 'aggregate' cannot both " "be True!" 
+ ) if post_processing: if not callable(pre_processing): raise ValueError("'post_processing' must be a callable!") @@ -2149,8 +2211,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, if not callable(pre_processing): raise ValueError("'pre_processing' must be a callable!") if not needs_values: - raise ValueError("Cannot use 'pre_processing' without " - "specifying 'needs_values'!") + raise ValueError( + "Cannot use 'pre_processing' without " "specifying 'needs_values'!" + ) labels, _, ngroups = grouper.group_info output = collections.OrderedDict() @@ -2197,7 +2260,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, else: return self._wrap_transformed_output(output) - @Substitution(name='groupby') + @Substitution(name="groupby") @Appender(_common_see_also) def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ @@ -2220,19 +2283,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ if freq is not None or axis != 0 or not isna(fill_value): - return self.apply(lambda x: x.shift(periods, freq, - axis, fill_value)) - - return self._get_cythonized_result('group_shift_indexer', - self.grouper, cython_dtype=np.int64, - needs_ngroups=True, - result_is_index=True, - periods=periods) - - @Substitution(name='groupby') + return self.apply(lambda x: x.shift(periods, freq, axis, fill_value)) + + return self._get_cythonized_result( + "group_shift_indexer", + self.grouper, + cython_dtype=np.int64, + needs_ngroups=True, + result_is_index=True, + periods=periods, + ) + + @Substitution(name="groupby") @Appender(_common_see_also) - def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, - axis=0): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0): """ Calculate pct_change of each value to previous entry in group. @@ -2242,16 +2306,21 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, Percentage changes within each group. 
""" if freq is not None or axis != 0: - return self.apply(lambda x: x.pct_change(periods=periods, - fill_method=fill_method, - limit=limit, freq=freq, - axis=axis)) + return self.apply( + lambda x: x.pct_change( + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + axis=axis, + ) + ) filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.labels) shifted = fill_grp.shift(periods=periods, freq=freq) return (filled / shifted) - 1 - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def head(self, n=5): """ @@ -2282,7 +2351,7 @@ def head(self, n=5): mask = self._cumcount_array() < n return self._selected_obj[mask] - @Substitution(name='groupby') + @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def tail(self, n=5): """ @@ -2347,16 +2416,19 @@ def _reindex_output(self, output): return output # reindexing only applies to a Categorical grouper - elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex)) - for ping in groupings): + elif not any( + isinstance(ping.grouper, (Categorical, CategoricalIndex)) + for ping in groupings + ): return output levels_list = [ping.group_index for ping in groupings] index, _ = MultiIndex.from_product( - levels_list, names=self.grouper.names).sortlevel() + levels_list, names=self.grouper.names + ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + d = {self.obj._get_axis_name(self.axis): index, "copy": False} return output.reindex(**d) # GH 13204 @@ -2370,15 +2442,15 @@ def _reindex_output(self, output): # reindex `output`, and then reset the in-axis grouper columns. # Select in-axis groupers - in_axis_grps = ((i, ping.name) for (i, ping) - in enumerate(groupings) if ping.in_axis) + in_axis_grps = ( + (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis + ) g_nums, g_names = zip(*in_axis_grps) output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index - ).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex(index, copy=False) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) @@ -2394,11 +2466,13 @@ def _reindex_output(self, output): def groupby(obj, by, **kwds): if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy + klass = SeriesGroupBy elif isinstance(obj, DataFrame): from pandas.core.groupby.generic import DataFrameGroupBy + klass = DataFrameGroupBy else: # pragma: no cover - raise TypeError('invalid type: {}'.format(obj)) + raise TypeError("invalid type: {}".format(obj)) return klass(obj, by, **kwds) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9e1033be26df2..818d844ca7994 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -11,8 +11,14 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, - is_list_like, is_scalar, is_timedelta64_dtype) + ensure_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_hashable, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms @@ -85,12 +91,13 @@ class Grouper: >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ - 
_attributes = ('key', 'level', 'freq', 'axis', - 'sort') # type: Tuple[str, ...] + + _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...] def __new__(cls, *args, **kwargs): - if kwargs.get('freq') is not None: + if kwargs.get("freq") is not None: from pandas.core.resample import TimeGrouper + cls = TimeGrouper return super().__new__(cls) @@ -125,11 +132,14 @@ def _get_grouper(self, obj, validate=True): """ self._set_grouper(obj) - self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key], - axis=self.axis, - level=self.level, - sort=self.sort, - validate=validate) + self.grouper, exclusions, self.obj = _get_grouper( + self.obj, + [self.key], + axis=self.axis, + level=self.level, + sort=self.sort, + validate=validate, + ) return self.binner, self.grouper, self.obj def _set_grouper(self, obj, sort=False): @@ -145,8 +155,7 @@ def _set_grouper(self, obj, sort=False): """ if self.key is not None and self.level is not None: - raise ValueError( - "The Grouper cannot specify both a key and a level!") + raise ValueError("The Grouper cannot specify both a key and a level!") # Keep self.grouper value before overriding if self._grouper is None: @@ -156,13 +165,13 @@ def _set_grouper(self, obj, sort=False): if self.key is not None: key = self.key # The 'on' is already defined - if (getattr(self.grouper, 'name', None) == key and - isinstance(obj, ABCSeries)): + if getattr(self.grouper, "name", None) == key and isinstance( + obj, ABCSeries + ): ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: - raise KeyError( - "The grouper name {0} is not found".format(key)) + raise KeyError("The grouper name {0} is not found".format(key)) ax = Index(obj[key], name=key) else: @@ -174,18 +183,16 @@ def _set_grouper(self, obj, sort=False): # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - ax = Index(ax._get_level_values(level), - name=ax.names[level]) + ax = Index(ax._get_level_values(level), name=ax.names[level]) else: if level not in (0, ax.name): - raise ValueError( - "The level {0} is not valid".format(level)) + raise ValueError("The level {0} is not valid".format(level)) # possibly sort if (self.sort or sort) and not ax.is_monotonic: # use stable sort to support first, last, nth - indexer = self.indexer = ax.argsort(kind='mergesort') + indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) obj = obj._take(indexer, axis=self.axis, is_copy=False) @@ -198,9 +205,11 @@ def groups(self): return self.grouper.groups def __repr__(self): - attrs_list = ("{}={!r}".format(attr_name, getattr(self, attr_name)) - for attr_name in self._attributes - if getattr(self, attr_name) is not None) + attrs_list = ( + "{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None + ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ return "{}({})".format(cls_name, attrs) @@ -234,8 +243,17 @@ class Grouping: * groups : dict of {group -> label_list} """ - def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=False, in_axis=False): + def __init__( + self, + index, + grouper=None, + obj=None, + name=None, + level=None, + sort=True, + observed=False, + in_axis=False, + ): self.name = name self.level = level @@ -260,14 +278,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if level is not None: if not isinstance(level, int): if level not in index.names: - raise 
AssertionError('Level {} not in index'.format(level)) + raise AssertionError("Level {} not in index".format(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] - self.grouper, self._labels, self._group_index = \ - index._get_grouper_for_level(self.grouper, level) + self.grouper, self._labels, self._group_index = index._get_grouper_for_level( + self.grouper, level + ) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels @@ -293,8 +312,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, elif is_categorical_dtype(self.grouper): from pandas.core.groupby.categorical import recode_for_groupby + self.grouper, self.all_grouper = recode_for_groupby( - self.grouper, self.sort, observed) + self.grouper, self.sort, observed + ) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper @@ -310,42 +331,47 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self._group_index = CategoricalIndex( Categorical.from_codes( - codes=codes, - categories=categories, - ordered=self.grouper.ordered)) + codes=codes, categories=categories, ordered=self.grouper.ordered + ) + ) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed - elif not isinstance(self.grouper, - (Series, Index, ExtensionArray, np.ndarray)): - if getattr(self.grouper, 'ndim', 1) != 1: + elif not isinstance( + self.grouper, (Series, Index, ExtensionArray, np.ndarray) + ): + if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError( - "Grouper for '{}' not 1-dimensional".format(t)) + raise ValueError("Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) - if not (hasattr(self.grouper, "__len__") and - len(self.grouper) == len(self.index)): - errmsg = ('Grouper result violates len(labels) == ' - 'len(data)\nresult: %s' % - pprint_thing(self.grouper)) + if not ( + hasattr(self.grouper, "__len__") + and len(self.grouper) == len(self.index) + ): + errmsg = ( + "Grouper result violates len(labels) == " + "len(data)\nresult: %s" % pprint_thing(self.grouper) + ) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like - if getattr(self.grouper, 'dtype', None) is not None: + if getattr(self.grouper, "dtype", None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime + self.grouper = to_datetime(self.grouper) elif is_timedelta64_dtype(self.grouper): from pandas import to_timedelta + self.grouper = to_timedelta(self.grouper) def __repr__(self): - return 'Grouping({0})'.format(self.name) + return "Grouping({0})".format(self.name) def __iter__(self): return iter(self.indices) @@ -376,8 +402,8 @@ def labels(self): def result_index(self): if self.all_grouper is not None: from pandas.core.groupby.categorical import recode_from_groupby - return recode_from_groupby(self.all_grouper, - self.sort, self.group_index) + + return recode_from_groupby(self.all_grouper, self.sort, self.group_index) return self.group_index @property @@ -393,20 +419,26 @@ def _make_labels(self): labels = self.grouper.label_info uniques = self.grouper.result_index else: - labels, uniques = algorithms.factorize( - self.grouper, sort=self.sort) + labels, uniques = algorithms.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, 
name=self.name) self._labels = labels self._group_index = uniques @cache_readonly def groups(self): - return self.index.groupby(Categorical.from_codes(self.labels, - self.group_index)) - - -def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=False, mutated=False, validate=True): + return self.index.groupby(Categorical.from_codes(self.labels, self.group_index)) + + +def _get_grouper( + obj, + key=None, + axis=0, + level=None, + sort=True, + observed=False, + mutated=False, + validate=True, +): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -460,18 +492,17 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, if nlevels == 1: level = level[0] elif nlevels == 0: - raise ValueError('No group keys passed!') + raise ValueError("No group keys passed!") else: - raise ValueError('multiple levels only valid with ' - 'MultiIndex') + raise ValueError("multiple levels only valid with " "MultiIndex") if isinstance(level, str): if obj.index.name != level: - raise ValueError('level name {} is not the name of the ' - 'index'.format(level)) + raise ValueError( + "level name {} is not the name of the " "index".format(level) + ) elif level > 0 or level < -1: - raise ValueError( - 'level > 0 or level < -1 only valid with MultiIndex') + raise ValueError("level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. @@ -501,13 +532,16 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, all_hashable = is_tuple and is_hashable(key) if is_tuple: - if ((all_hashable and key not in obj and set(key).issubset(obj)) - or not all_hashable): + if ( + all_hashable and key not in obj and set(key).issubset(obj) + ) or not all_hashable: # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] - msg = ("Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by=[...]' instead of 'by=(...)'. In " - "the future, a tuple will always mean a single key.") + msg = ( + "Interpreting tuple 'by' as a list of keys, rather than " + "a single key. Use 'by=[...]' instead of 'by=(...)'. In " + "the future, a tuple will always mean a single key." + ) warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) @@ -521,15 +555,22 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) - any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray)) - for g in keys) + any_arraylike = any( + isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys + ) # is this an index replacement? 
- if (not any_callable and not any_arraylike and not any_groupers and - match_axis_length and level is None): + if ( + not any_callable + and not any_arraylike + and not any_groupers + and match_axis_length + and level is None + ): if isinstance(obj, DataFrame): - all_in_columns_index = all(g in obj.columns or g in - obj.index.names for g in keys) + all_in_columns_index = all( + g in obj.columns or g in obj.index.names for g in keys + ) elif isinstance(obj, Series): all_in_columns_index = all(g in obj.index.names for g in keys) @@ -588,29 +629,37 @@ def is_in_obj(gpr): if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - ("Length of grouper ({len_gpr}) and axis ({len_axis})" - " must be same length" - .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) + ( + "Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length".format( + len_gpr=len(gpr), len_axis=obj.shape[axis] + ) + ) + ) # create the Grouping # allow us to passing the actual Grouping as the gpr - ping = (Grouping(group_axis, - gpr, - obj=obj, - name=name, - level=level, - sort=sort, - observed=observed, - in_axis=in_axis) - if not isinstance(gpr, Grouping) else gpr) + ping = ( + Grouping( + group_axis, + gpr, + obj=obj, + name=name, + level=level, + sort=sort, + observed=observed, + in_axis=in_axis, + ) + if not isinstance(gpr, Grouping) + else gpr + ) groupings.append(ping) if len(groupings) == 0 and len(obj): - raise ValueError('No group keys passed!') + raise ValueError("No group keys passed!") elif len(groupings) == 0: - groupings.append(Grouping(Index([], dtype='int'), - np.array([], dtype=np.intp))) + groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) @@ -618,8 +667,7 @@ def is_in_obj(gpr): def _is_label_like(val): - return (isinstance(val, (str, tuple)) or - (val is not None and is_scalar(val))) + return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) def _convert_grouper(axis, grouper): @@ -632,7 +680,7 @@ def _convert_grouper(axis, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): - raise ValueError('Grouper and axis must be same length') + raise ValueError("Grouper and axis must be same length") return grouper else: return grouper diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index dd44bc6990d59..33341a489866b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -17,10 +17,21 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, - ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse, - is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_int_or_float, + ensure_object, + ensure_platform_int, + is_bool_dtype, + is_categorical_dtype, + is_complex_dtype, + is_datetime64_any_dtype, + is_integer_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms @@ -32,8 +43,13 @@ from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( - compress_group_index, 
decons_obs_group_ids, get_flattened_iterator, - get_group_index, get_group_index_sorter, get_indexer_dict) + compress_group_index, + decons_obs_group_ids, + get_flattened_iterator, + get_group_index, + get_group_index_sorter, + get_indexer_dict, +) def generate_bins_generic(values, binner, closed): @@ -78,8 +94,9 @@ def generate_bins_generic(values, binner, closed): r_bin = binner[i + 1] # count values in current bin, advance to next bin - while j < lenidx and (values[j] < r_bin or - (closed == 'right' and values[j] == r_bin)): + while j < lenidx and ( + values[j] < r_bin or (closed == "right" and values[j] == r_bin) + ): j += 1 bins[bc] = j @@ -111,8 +128,9 @@ class BaseGrouper: """ - def __init__(self, axis, groupings, sort=True, group_keys=True, - mutated=False, indexer=None): + def __init__( + self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None + ): self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis self.groupings = groupings @@ -166,10 +184,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, - ngroups, - self.levels, - self.labels) + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels) def apply(self, f, data, axis=0): mutated = self.mutated @@ -179,8 +194,11 @@ def apply(self, f, data, axis=0): # oh boy f_name = com.get_callable_name(f) - if (f_name not in base.plotting_methods and - hasattr(splitter, 'fast_apply') and axis == 0): + if ( + f_name not in base.plotting_methods + and hasattr(splitter, "fast_apply") + and axis == 0 + ): try: result_values, mutated = splitter.fast_apply(f, group_keys) @@ -199,7 +217,7 @@ def apply(self, f, data, axis=0): pass for key, (i, group) in zip(group_keys, splitter): - object.__setattr__(group, 'name', key) + object.__setattr__(group, "name", key) # result_values is None if fast apply path wasn't taken # or fast apply aborted with an unexpected exception. 
@@ -230,8 +248,7 @@ def indices(self): return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] - keys = [com.values_from_object(ping.group_index) - for ping in self.groupings] + keys = [com.values_from_object(ping.group_index) for ping in self.groupings] return get_indexer_dict(label_list, keys) @property @@ -257,9 +274,7 @@ def size(self): out = np.bincount(ids[ids != -1], minlength=ngroup) else: out = [] - return Series(out, - index=self.result_index, - dtype='int64') + return Series(out, index=self.result_index, dtype="int64") @cache_readonly def groups(self): @@ -296,8 +311,7 @@ def label_info(self): def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: - group_index = get_group_index(all_labels, self.shape, - sort=True, xnull=True) + group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True) return compress_group_index(group_index, sort=self.sort) ping = self.groupings[0] @@ -311,8 +325,7 @@ def ngroups(self): def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids( - comp_ids, obs_ids, self.shape, labels, xnull=True) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): @@ -321,10 +334,9 @@ def result_index(self): codes = self.recons_labels levels = [ping.result_index for ping in self.groupings] - result = MultiIndex(levels=levels, - codes=codes, - verify_integrity=False, - names=self.names) + result = MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + ) return result def get_group_levels(self): @@ -344,49 +356,45 @@ def get_group_levels(self): # Aggregation functions _cython_functions = { - 'aggregate': { - 'add': 'group_add', - 'prod': 'group_prod', - 'min': 'group_min', - 'max': 'group_max', - 'mean': 'group_mean', - 'median': { - 'name': 'group_median' + "aggregate": { + "add": "group_add", + "prod": "group_prod", + "min": "group_min", + "max": "group_max", + "mean": "group_mean", + "median": {"name": "group_median"}, + "var": "group_var", + "first": { + "name": "group_nth", + "f": lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1), }, - 'var': 'group_var', - 'first': { - 'name': 'group_nth', - 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1) + "last": "group_last", + "ohlc": "group_ohlc", + }, + "transform": { + "cumprod": "group_cumprod", + "cumsum": "group_cumsum", + "cummin": "group_cummin", + "cummax": "group_cummax", + "rank": { + "name": "group_rank", + "f": lambda func, a, b, c, d, e, **kwargs: func( + a, + b, + c, + e, + kwargs.get("ties_method", "average"), + kwargs.get("ascending", True), + kwargs.get("pct", False), + kwargs.get("na_option", "keep"), + ), }, - 'last': 'group_last', - 'ohlc': 'group_ohlc', }, - - 'transform': { - 'cumprod': 'group_cumprod', - 'cumsum': 'group_cumsum', - 'cummin': 'group_cummin', - 'cummax': 'group_cummax', - 'rank': { - 'name': 'group_rank', - 'f': lambda func, a, b, c, d, e, **kwargs: func( - a, b, c, e, - kwargs.get('ties_method', 'average'), - kwargs.get('ascending', True), - kwargs.get('pct', False), - kwargs.get('na_option', 'keep') - ) - } - } } - _cython_arity = { - 'ohlc': 4, # OHLC - } + _cython_arity = {"ohlc": 4} # OHLC - _name_functions = { - 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] - } + _name_functions = {"ohlc": lambda *args: ["open", "high", "low", "close"]} def _is_builtin_func(self, arg): """ @@ 
-407,19 +415,22 @@ def get_func(fname): return f # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "{fname}_{dtype_str}".format( - fname=fname, dtype_str=dt), None) + for dt in [dtype_str, "object"]: + f = getattr( + libgroupby, + "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), + None, + ) if f is not None: return f ftype = self._cython_functions[kind][how] if isinstance(ftype, dict): - func = afunc = get_func(ftype['name']) + func = afunc = get_func(ftype["name"]) # a sub-function - f = ftype.get('f') + f = ftype.get("f") if f is not None: def wrapper(*args, **kwargs): @@ -434,14 +445,13 @@ def wrapper(*args, **kwargs): if func is None: raise NotImplementedError( "function is not implemented for this dtype: " - "[how->{how},dtype->{dtype_str}]".format(how=how, - dtype_str=dtype_str)) + "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str) + ) return func - def _cython_operation(self, kind, values, how, axis, min_count=-1, - **kwargs): - assert kind in ['transform', 'aggregate'] + def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): + assert kind in ["transform", "aggregate"] # can we do this operation with our cython functions # if not raise NotImplementedError @@ -453,17 +463,18 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( - "{} are not support in cython ops".format(values.dtype)) + "{} are not support in cython ops".format(values.dtype) + ) elif is_datetime64_any_dtype(values): - if how in ['add', 'prod', 'cumsum', 'cumprod']: + if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( - "datetime64 type does not support {} " - "operations".format(how)) + "datetime64 type does not support {} " "operations".format(how) + ) elif is_timedelta64_dtype(values): - if how in ['prod', 'cumprod']: + if how in ["prod", "cumprod"]: raise NotImplementedError( - "timedelta64 type does not support {} " - "operations".format(how)) + "timedelta64 type does not support {} " "operations".format(how) + ) arity = self._cython_arity.get(how, 1) @@ -478,15 +489,16 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, assert axis == 1, axis values = values.T if arity > 1: - raise NotImplementedError("arity of more than 1 is not " - "supported for the 'how' argument") + raise NotImplementedError( + "arity of more than 1 is not " "supported for the 'how' argument" + ) out_shape = (self.ngroups,) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: - values = values.view('int64') + values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) @@ -503,59 +515,65 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, values = values.astype(object) try: - func = self._get_cython_function( - kind, how, values, is_numeric) + func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) - func = self._get_cython_function( - kind, how, values, is_numeric) + func = self._get_cython_function(kind, how, values, is_numeric) else: raise - if how == 'rank': - out_dtype = 'float' + if how == "rank": + out_dtype = "float" else: if is_numeric: - out_dtype = '{kind}{itemsize}'.format( - kind=values.dtype.kind, 
itemsize=values.dtype.itemsize) + out_dtype = "{kind}{itemsize}".format( + kind=values.dtype.kind, itemsize=values.dtype.itemsize + ) else: - out_dtype = 'object' + out_dtype = "object" labels, _, _ = self.group_info - if kind == 'aggregate': - result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), - fill_value=np.nan) + if kind == "aggregate": + result = _maybe_fill( + np.empty(out_shape, dtype=out_dtype), fill_value=np.nan + ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, counts, values, labels, func, is_numeric, - is_datetimelike, min_count) - elif kind == 'transform': - result = _maybe_fill(np.empty_like(values, dtype=out_dtype), - fill_value=np.nan) + result, + counts, + values, + labels, + func, + is_numeric, + is_datetimelike, + min_count, + ) + elif kind == "transform": + result = _maybe_fill( + np.empty_like(values, dtype=out_dtype), fill_value=np.nan + ) # TODO: min_count result = self._transform( - result, values, labels, func, is_numeric, is_datetimelike, - **kwargs) + result, values, labels, func, is_numeric, is_datetimelike, **kwargs + ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): - result = result.astype('float64') + result = result.astype("float64") result[mask] = np.nan - if (kind == 'aggregate' and - self._filter_empty_groups and not counts.all()): + if kind == "aggregate" and self._filter_empty_groups and not counts.all(): if result.ndim == 2: try: - result = lib.row_bool_subset( - result, (counts > 0).view(np.uint8)) + result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - ensure_object(result), - (counts > 0).view(np.uint8)) + ensure_object(result), (counts > 0).view(np.uint8) + ) else: result = result[counts > 0] @@ -574,45 +592,69 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, return result, names def aggregate(self, values, how, axis=0, min_count=-1): - return self._cython_operation('aggregate', values, how, axis, - min_count=min_count) + return self._cython_operation( + "aggregate", values, how, axis, min_count=min_count + ) def transform(self, values, how, axis=0, **kwargs): - return self._cython_operation('transform', values, how, axis, **kwargs) - - def _aggregate(self, result, counts, values, comp_ids, agg_func, - is_numeric, is_datetimelike, min_count=-1): + return self._cython_operation("transform", values, how, axis, **kwargs) + + def _aggregate( + self, + result, + counts, + values, + comp_ids, + agg_func, + is_numeric, + is_datetimelike, + min_count=-1, + ): if values.ndim > 3: # punting for now - raise NotImplementedError("number of dimensions is currently " - "limited to 3") + raise NotImplementedError( + "number of dimensions is currently " "limited to 3" + ) elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids, - min_count) + agg_func(result[:, :, i], counts, chunk, comp_ids, min_count) else: agg_func(result, counts, values, comp_ids, min_count) return result - def _transform(self, result, values, comp_ids, transform_func, - is_numeric, is_datetimelike, **kwargs): + def _transform( + self, + result, + values, + comp_ids, + transform_func, + is_numeric, + is_datetimelike, + **kwargs + ): comp_ids, _, ngroups = self.group_info if values.ndim > 3: # punting for now - raise NotImplementedError("number of dimensions is currently " - "limited to 3") + raise NotImplementedError( + 
"number of dimensions is currently " "limited to 3" + ) elif values.ndim > 2: for i, chunk in enumerate(values.transpose(2, 0, 1)): - transform_func(result[:, :, i], values, - comp_ids, ngroups, is_datetimelike, **kwargs) + transform_func( + result[:, :, i], + values, + comp_ids, + ngroups, + is_datetimelike, + **kwargs + ) else: - transform_func(result, values, comp_ids, ngroups, is_datetimelike, - **kwargs) + transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) return result @@ -626,7 +668,7 @@ def _aggregate_series_fast(self, obj, func): func = self._is_builtin_func(func) if obj.index._has_complex_internals: - raise TypeError('Incompatible index for Cython grouper') + raise TypeError("Incompatible index for Cython grouper") group_index, _, ngroups = self.group_info @@ -634,10 +676,8 @@ def _aggregate_series_fast(self, obj, func): dummy = obj._get_values(slice(None, 0)) indexer = get_group_index_sorter(group_index, ngroups) obj = obj._take(indexer) - group_index = algorithms.take_nd( - group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, - dummy) + group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) + grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -653,9 +693,9 @@ def _aggregate_series_pure_python(self, obj, func): for label, group in splitter: res = func(group) if result is None: - if (isinstance(res, (Series, Index, np.ndarray))): - raise ValueError('Function does not reduce') - result = np.empty(ngroups, dtype='O') + if isinstance(res, (Series, Index, np.ndarray)): + raise ValueError("Function does not reduce") + result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] result[label] = res @@ -695,8 +735,9 @@ class BinGrouper(BaseGrouper): """ - def __init__(self, bins, binlabels, filter_empty=False, mutated=False, - indexer=None): + def __init__( + self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) self._filter_empty_groups = filter_empty @@ -709,8 +750,11 @@ def groups(self): # this is mainly for compat # GH 3881 - result = {key: value for key, value in zip(self.binlabels, self.bins) - if key is not NaT} + result = { + key: value + for key, value in zip(self.binlabels, self.bins) + if key is not NaT + } return result @property @@ -736,8 +780,7 @@ def get_iterator(self, data, axis=0): for each group """ if isinstance(data, NDFrame): - slicer = lambda start, edge: data._slice( - slice(start, edge), axis=axis) + slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis) length = len(data.axes[axis]) else: slicer = lambda start, edge: data[slice(start, edge)] @@ -776,9 +819,11 @@ def group_info(self): else: comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep) - return (comp_ids.astype('int64', copy=False), - obs_group_ids.astype('int64', copy=False), - ngroups) + return ( + comp_ids.astype("int64", copy=False), + obs_group_ids.astype("int64", copy=False), + ngroups, + ) @cache_readonly def result_index(self): @@ -798,8 +843,11 @@ def names(self): @property def groupings(self): from pandas.core.groupby.grouper import Grouping - return [Grouping(lvl, lvl, in_axis=False, level=None, name=name) - for lvl, name in zip(self.levels, self.names)] + + return [ + Grouping(lvl, lvl, in_axis=False, level=None, name=name) + for lvl, name in zip(self.levels, self.names) + ] def 
agg_series(self, obj, func): dummy = obj[:0] @@ -830,7 +878,6 @@ def _is_indexed_like(obj, axes): class DataSplitter: - def __init__(self, data, labels, ngroups, axis=0): self.data = data self.labels = ensure_int64(labels) @@ -878,13 +925,11 @@ def apply(self, f): class SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj): return sdata._get_values(slice_obj) class FrameSplitter(DataSplitter): - def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool try: diff --git a/pandas/core/index.py b/pandas/core/index.py index f14f32c67d4e1..d308ac1a9b1c7 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1,7 +1,25 @@ from pandas.core.indexes.api import ( # noqa:F401 - CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index, - IntervalIndex, InvalidIndexError, MultiIndex, NaT, NumericIndex, - PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, _all_indexes_same, - _get_combined_index, _get_consensus_names, _get_objs_combined_axis, - _new_Index, _union_indexes, ensure_index, ensure_index_from_sequences) + CategoricalIndex, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + IntervalIndex, + InvalidIndexError, + MultiIndex, + NaT, + NumericIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, + _all_indexes_same, + _get_combined_index, + _get_consensus_names, + _get_objs_combined_axis, + _new_Index, + _union_indexes, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.multi import _sparsify # noqa:F401 diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 602e11a08b4ed..5ba23990cbd51 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -4,9 +4,15 @@ import numpy as np from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike, - is_timedelta64_dtype) + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_arraylike, + is_integer_dtype, + is_list_like, + is_period_arraylike, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.accessor import PandasDelegate, delegate_names @@ -18,15 +24,16 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): - def __init__(self, data, orig): if not isinstance(data, ABCSeries): - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) self._parent = data self.orig = orig - self.name = getattr(data, 'name', None) + self.name = getattr(data, "name", None) self._freeze() def _get_values(self): @@ -47,11 +54,14 @@ def _get_values(self): if is_datetime_arraylike(data): return DatetimeIndex(data, copy=False, name=self.name) - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) def _delegate_property_get(self, name): from pandas import Series + values = self._get_values() result = getattr(values, name) @@ -59,7 +69,7 @@ def _delegate_property_get(self, name): # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): - result = result.astype('int64') + result = result.astype("int64") elif not is_list_like(result): return result @@ -75,19 
+85,24 @@ def _delegate_property_get(self, name): result = Series(result, index=index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ("modifications to a property of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original.") + result._is_copy = ( + "modifications to a property of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) return result def _delegate_property_set(self, name, value, *args, **kwargs): - raise ValueError("modifications to a property of a datetimelike " - "object are not supported. Change values on the " - "original.") + raise ValueError( + "modifications to a property of a datetimelike " + "object are not supported. Change values on the " + "original." + ) def _delegate_method(self, name, *args, **kwargs): from pandas import Series + values = self._get_values() method = getattr(values, name) @@ -99,19 +114,21 @@ def _delegate_method(self, name, *args, **kwargs): result = Series(result, index=self._parent.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error - result._is_copy = ("modifications to a method of a datetimelike " - "object are not supported and are discarded. " - "Change values on the original.") + result._is_copy = ( + "modifications to a method of a datetimelike " + "object are not supported and are discarded. " + "Change values on the original." + ) return result -@delegate_names(delegate=DatetimeArray, - accessors=DatetimeArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=DatetimeArray, - accessors=DatetimeArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_methods, typ="method" +) class DatetimeProperties(Properties): """ Accessor object for datetimelike properties of the Series values. @@ -177,12 +194,14 @@ def freq(self): return self._get_values().inferred_freq -@delegate_names(delegate=TimedeltaArray, - accessors=TimedeltaArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=TimedeltaArray, - accessors=TimedeltaArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=TimedeltaArray, + accessors=TimedeltaArray._datetimelike_methods, + typ="method", +) class TimedeltaProperties(Properties): """ Accessor object for datetimelike properties of the Series values. @@ -266,12 +285,12 @@ def freq(self): return self._get_values().inferred_freq -@delegate_names(delegate=PeriodArray, - accessors=PeriodArray._datetimelike_ops, - typ="property") -@delegate_names(delegate=PeriodArray, - accessors=PeriodArray._datetimelike_methods, - typ="method") +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property" +) +@delegate_names( + delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method" +) class PeriodProperties(Properties): """ Accessor object for datetimelike properties of the Series values. 
@@ -287,9 +306,9 @@ class PeriodProperties(Properties): """ -class CombinedDatetimelikeProperties(DatetimeProperties, - TimedeltaProperties, PeriodProperties): - +class CombinedDatetimelikeProperties( + DatetimeProperties, TimedeltaProperties, PeriodProperties +): def __new__(cls, data): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is @@ -298,14 +317,14 @@ def __new__(cls, data): from pandas import Series if not isinstance(data, Series): - raise TypeError("cannot convert an object of type {0} to a " - "datetimelike index".format(type(data))) + raise TypeError( + "cannot convert an object of type {0} to a " + "datetimelike index".format(type(data)) + ) orig = data if is_categorical_dtype(data) else None if orig is not None: - data = Series(orig.values.categories, - name=orig.name, - copy=False) + data = Series(orig.values.categories, name=orig.name, copy=False) try: if is_datetime64_dtype(data.dtype): @@ -321,5 +340,4 @@ def __new__(cls, data): except Exception: pass # we raise an attribute error anyway - raise AttributeError("Can only use .dt accessor with datetimelike " - "values") + raise AttributeError("Can only use .dt accessor with datetimelike " "values") diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 6299fc482d0df..a17f74286d59f 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -5,40 +5,64 @@ import pandas.core.common as com from pandas.core.indexes.base import ( - Index, _new_Index, ensure_index, ensure_index_from_sequences) + Index, + _new_Index, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.base import InvalidIndexError # noqa:F401 from pandas.core.indexes.category import CategoricalIndex # noqa:F401 from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.interval import IntervalIndex # noqa:F401 from pandas.core.indexes.multi import MultiIndex # noqa:F401 from pandas.core.indexes.numeric import ( # noqa:F401 - Float64Index, Int64Index, NumericIndex, UInt64Index) + Float64Index, + Int64Index, + NumericIndex, + UInt64Index, +) from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.range import RangeIndex # noqa:F401 from pandas.core.indexes.timedeltas import TimedeltaIndex -_sort_msg = textwrap.dedent("""\ +_sort_msg = textwrap.dedent( + """\ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. 
-""") +""" +) # TODO: there are many places that rely on these private methods existing in # pandas.core.index -__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index', - 'InvalidIndexError', 'TimedeltaIndex', - 'PeriodIndex', 'DatetimeIndex', - '_new_Index', 'NaT', - 'ensure_index', 'ensure_index_from_sequences', - '_get_combined_index', - '_get_objs_combined_axis', '_union_indexes', - '_get_consensus_names', - '_all_indexes_same'] +__all__ = [ + "Index", + "MultiIndex", + "NumericIndex", + "Float64Index", + "Int64Index", + "CategoricalIndex", + "IntervalIndex", + "RangeIndex", + "UInt64Index", + "InvalidIndexError", + "TimedeltaIndex", + "PeriodIndex", + "DatetimeIndex", + "_new_Index", + "NaT", + "ensure_index", + "ensure_index_from_sequences", + "_get_combined_index", + "_get_objs_combined_axis", + "_union_indexes", + "_get_consensus_names", + "_all_indexes_same", +] def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): @@ -64,8 +88,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True): ------- Index """ - obs_idxes = [obj._get_axis(axis) for obj in objs - if hasattr(obj, '_get_axis')] + obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")] if obs_idxes: return _get_combined_index(obs_idxes, intersect=intersect, sort=sort) @@ -142,7 +165,7 @@ def _union_indexes(indexes, sort=True): Index """ if len(indexes) == 0: - raise AssertionError('Must have at least 1 Index to union') + raise AssertionError("Must have at least 1 Index to union") if len(indexes) == 1: result = indexes[0] if isinstance(result, list): @@ -165,24 +188,24 @@ def _unique_indices(inds): ------- Index """ + def conv(i): if isinstance(i, Index): i = i.tolist() return i - return Index( - lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) + return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort)) - if kind == 'special': + if kind == "special": result = indexes[0] - if hasattr(result, 'union_many'): + if hasattr(result, "union_many"): return result.union_many(indexes[1:]) else: for other in indexes[1:]: result = result.union(other) return result - elif kind == 'array': + elif kind == "array": index = indexes[0] for other in indexes[1:]: if not index.equals(other): @@ -227,17 +250,18 @@ def _sanitize_and_check(indexes): if list in kinds: if len(kinds) > 1: - indexes = [Index(com.try_sort(x)) - if not isinstance(x, Index) else - x for x in indexes] + indexes = [ + Index(com.try_sort(x)) if not isinstance(x, Index) else x + for x in indexes + ] kinds.remove(list) else: - return indexes, 'list' + return indexes, "list" if len(kinds) > 1 or Index not in kinds: - return indexes, 'special' + return indexes, "special" else: - return indexes, 'array' + return indexes, "array" def _get_consensus_names(indexes): @@ -259,8 +283,7 @@ def _get_consensus_names(indexes): # find the non-none names, need to tupleify to make # the set hashable, then reverse on return - consensus_names = {tuple(i.names) for i in indexes - if com._any_not_none(*i.names)} + consensus_names = {tuple(i.names) for i in indexes if com._any_not_none(*i.names)} if len(consensus_names) == 1: return list(list(consensus_names)[0]) return [None] * indexes[0].nlevels diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e0d26750df00..973a022cfc3f1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -17,18 +17,47 @@ from 
pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( - ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, - is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, - is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_float, is_float_dtype, is_hashable, - is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, - is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, - is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype) + ensure_categorical, + ensure_int64, + ensure_object, + ensure_platform_int, + is_bool, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_any_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_iterator, + is_list_like, + is_object_dtype, + is_period_dtype, + is_scalar, + is_signed_integer_dtype, + is_timedelta64_dtype, + is_unsigned_integer_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, - ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries, - ABCTimedeltaArray, ABCTimedeltaIndex) + ABCDataFrame, + ABCDateOffset, + ABCDatetimeArray, + ABCIndexClass, + ABCMultiIndex, + ABCPandasArray, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaArray, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import array_equivalent, isna from pandas.core import ops @@ -44,16 +73,24 @@ from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( - default_pprint, format_object_attrs, format_object_summary, pprint_thing) - -__all__ = ['Index'] - -_unsortable_types = frozenset(('mixed', 'mixed-integer')) - -_index_doc_kwargs = dict(klass='Index', inplace='', - target_klass='Index', - raises_section='', - unique='Index', duplicated='np.ndarray') + default_pprint, + format_object_attrs, + format_object_summary, + pprint_thing, +) + +__all__ = ["Index"] + +_unsortable_types = frozenset(("mixed", "mixed-integer")) + +_index_doc_kwargs = dict( + klass="Index", + inplace="", + target_klass="Index", + raises_section="", + unique="Index", + duplicated="np.ndarray", +) _index_shared_docs = dict() @@ -61,15 +98,15 @@ def _make_comparison_op(op, cls): def cmp_method(self, other): if isinstance(other, (np.ndarray, Index, ABCSeries)): if other.ndim > 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") if is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) else: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index @@ -81,7 +118,7 @@ def cmp_method(self, other): except TypeError: return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) # TODO: docstring? 
return set_function_name(cmp_method, name, cls) @@ -93,12 +130,14 @@ def index_arithmetic_method(self, other): elif isinstance(other, ABCTimedeltaIndex): # Defer to subclass implementation return NotImplemented - elif (isinstance(other, (np.ndarray, ABCTimedeltaArray)) and - is_timedelta64_dtype(other)): + elif isinstance( + other, (np.ndarray, ABCTimedeltaArray) + ) and is_timedelta64_dtype(other): # GH#22390; wrap in Series for op, this will in turn wrap in # TimedeltaIndex, but will correctly raise TypeError instead of # NullFrequencyError for add/sub ops from pandas import Series + other = Series(other) out = op(self, other) return Index(out, name=self.name) @@ -112,7 +151,7 @@ def index_arithmetic_method(self, other): return self._evaluate_with_datetime_like(other, op) values = self.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(values, other) result = missing.dispatch_missing(op, values, other, result) @@ -125,7 +164,7 @@ def index_arithmetic_method(self, other): result = Index(result, **attrs) return result - name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) # TODO: docstring? return set_function_name(index_arithmetic_method, name, cls) @@ -147,6 +186,7 @@ def _new_Index(cls, d): # ordinals through __new__ GH #13277 if issubclass(cls, ABCPeriodIndex): from pandas.core.indexes.period import _new_PeriodIndex + return _new_PeriodIndex(cls, **d) return cls.__new__(cls, **d) @@ -191,8 +231,9 @@ class Index(IndexOpsMixin, PandasObject): >>> pd.Index(list('abc')) Index(['a', 'b', 'c'], dtype='object') """ + # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = DirNamesMixin._deprecations | frozenset(['tolist']) + _deprecations = DirNamesMixin._deprecations | frozenset(["tolist"]) # To hand over control to subclasses _join_precedence = 1 @@ -213,12 +254,12 @@ def _inner_indexer(self, left, right): def _outer_indexer(self, left, right): return libjoin.outer_join_indexer(left, right) - _typ = 'index' + _typ = "index" _data = None _id = None name = None - _comparables = ['name'] - _attributes = ['name'] + _comparables = ["name"] + _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True @@ -231,27 +272,39 @@ def _outer_indexer(self, left, right): _engine_type = libindex.ObjectEngine - _accessors = {'str'} + _accessors = {"str"} str = CachedAccessor("str", StringMethods) # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=None, tupleize_cols=True, **kwargs): + def __new__( + cls, + data=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + tupleize_cols=True, + **kwargs + ): - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name) from .range import RangeIndex + if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
data = data.to_numpy() @@ -265,20 +318,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, - **kwargs) + + return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - elif ((is_interval_dtype(data) or is_interval_dtype(dtype)) and - not is_object_dtype(dtype)): + elif ( + is_interval_dtype(data) or is_interval_dtype(dtype) + ) and not is_object_dtype(dtype): from .interval import IntervalIndex - closed = kwargs.get('closed', None) - return IntervalIndex(data, dtype=dtype, name=name, copy=copy, - closed=closed) - elif (is_datetime64_any_dtype(data) or - (dtype is not None and is_datetime64_any_dtype(dtype)) or - 'tz' in kwargs): + closed = kwargs.get("closed", None) + return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + + elif ( + is_datetime64_any_dtype(data) + or (dtype is not None and is_datetime64_any_dtype(dtype)) + or "tz" in kwargs + ): from pandas import DatetimeIndex if dtype is not None and is_dtype_equal(_o_dtype, dtype): @@ -291,25 +347,30 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, result = DatetimeIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = DatetimeIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + result = DatetimeIndex( + data, copy=copy, name=name, dtype=dtype, **kwargs + ) return result - elif (is_timedelta64_dtype(data) or - (dtype is not None and is_timedelta64_dtype(dtype))): + elif is_timedelta64_dtype(data) or ( + dtype is not None and is_timedelta64_dtype(dtype) + ): from pandas import TimedeltaIndex + if dtype is not None and is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex(data, copy=copy, name=name, - dtype=dtype, **kwargs) + result = TimedeltaIndex( + data, copy=copy, name=name, dtype=dtype, **kwargs + ) return result elif is_period_dtype(data) and not is_object_dtype(dtype): from pandas import PeriodIndex + result = PeriodIndex(data, copy=copy, name=name, **kwargs) return result @@ -320,12 +381,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # coerce to the provided dtype data = dtype.construct_array_type()._from_sequence( - data, dtype=dtype, copy=False) + data, dtype=dtype, copy=False + ) # coerce to the object dtype data = data.astype(object) - return Index(data, dtype=object, copy=copy, name=name, - **kwargs) + return Index(data, dtype=object, copy=copy, name=name, **kwargs) # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -339,13 +400,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # GH 11836 if is_integer_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) - if inferred == 'integer': - data = maybe_cast_to_integer_array(data, dtype, - copy=copy) - elif inferred in ['floating', 'mixed-integer-float']: + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: if isna(data).any(): - raise ValueError('cannot convert float ' - 'NaN to integer') + raise ValueError( + "cannot convert float " "NaN to integer" + ) if inferred == "mixed-integer-float": data = maybe_cast_to_integer_array(data, 
dtype) @@ -354,22 +415,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. try: return cls._try_convert_to_int_index( - data, copy, name, dtype) + data, copy, name, dtype + ) except ValueError: pass # Return an actual float index. from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, - name=name) - elif inferred == 'string': + return Float64Index(data, copy=copy, dtype=dtype, name=name) + + elif inferred == "string": pass else: data = data.astype(dtype) elif is_float_dtype(dtype): inferred = lib.infer_dtype(data, skipna=False) - if inferred == 'string': + if inferred == "string": pass else: data = data.astype(dtype) @@ -378,25 +440,29 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, except (TypeError, ValueError) as e: msg = str(e) - if ("cannot convert float" in msg or - "Trying to coerce float values to integer" in msg): + if ( + "cannot convert float" in msg + or "Trying to coerce float values to integer" in msg + ): raise # maybe coerce to a sub-class - from pandas.core.indexes.period import ( - PeriodIndex, IncompatibleFrequency) + from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency if is_signed_integer_dtype(data.dtype): from .numeric import Int64Index + return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): from .numeric import UInt64Index + return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): from .numeric import Float64Index + return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): - subarr = data.astype('object') + subarr = data.astype("object") else: subarr = com.asarray_tuplesafe(data, dtype=object) @@ -407,54 +473,57 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if dtype is None: inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == 'integer': + if inferred == "integer": try: - return cls._try_convert_to_int_index( - subarr, copy, name, dtype) + return cls._try_convert_to_int_index(subarr, copy, name, dtype) except ValueError: pass - return Index(subarr, copy=copy, - dtype=object, name=name) - elif inferred in ['floating', 'mixed-integer-float']: + return Index(subarr, copy=copy, dtype=object, name=name) + elif inferred in ["floating", "mixed-integer-float"]: from .numeric import Float64Index + return Float64Index(subarr, copy=copy, name=name) - elif inferred == 'interval': + elif inferred == "interval": from .interval import IntervalIndex + try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: # GH27172: mixed closed Intervals --> object dtype pass - elif inferred == 'boolean': + elif inferred == "boolean": # don't support boolean explicitly ATM pass - elif inferred != 'string': - if inferred.startswith('datetime'): - if (lib.is_datetime_with_singletz_array(subarr) or - 'tz' in kwargs): + elif inferred != "string": + if inferred.startswith("datetime"): + if ( + lib.is_datetime_with_singletz_array(subarr) + or "tz" in kwargs + ): # only when subarr has the same tz from pandas import DatetimeIndex + try: - return DatetimeIndex(subarr, copy=copy, - name=name, **kwargs) + return DatetimeIndex( + subarr, copy=copy, name=name, **kwargs + ) except OutOfBoundsDatetime: pass - elif inferred.startswith('timedelta'): + elif inferred.startswith("timedelta"): from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, - **kwargs) - elif 
inferred == 'period': + + return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) + elif inferred == "period": try: return PeriodIndex(subarr, name=name, **kwargs) except IncompatibleFrequency: pass return cls._simple_new(subarr, name) - elif hasattr(data, '__array__'): - return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, - **kwargs) + elif hasattr(data, "__array__"): + return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) elif data is None or is_scalar(data): cls._scalar_data_error(data) else: @@ -467,8 +536,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # we must be all tuples, otherwise don't construct # 10697 from .multi import MultiIndex + return MultiIndex.from_tuples( - data, names=name or kwargs.get('names')) + data, names=name or kwargs.get("names") + ) # other iterable of some kind subarr = com.asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) @@ -512,14 +583,15 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): Must be careful not to recurse. """ - if not hasattr(values, 'dtype'): + if not hasattr(values, "dtype"): if (values is None or not len(values)) and dtype is not None: values = np.empty(0, dtype=dtype) else: values = np.array(values, copy=False) if is_object_dtype(values): - values = cls(values, name=name, dtype=dtype, - **kwargs)._ndarray_values + values = cls( + values, name=name, dtype=dtype, **kwargs + )._ndarray_values if isinstance(values, (ABCSeries, ABCIndexClass)): # Index._data must always be an ndarray. @@ -553,7 +625,9 @@ def _get_attributes_dict(self): """ return {k: getattr(self, k, None) for k in self._attributes} - _index_shared_docs['_shallow_copy'] = """ + _index_shared_docs[ + "_shallow_copy" + ] = """ Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking precedence. 
@@ -566,17 +640,17 @@ def _get_attributes_dict(self): kwargs : updates the default attributes for this Index """ - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values attributes = self._get_attributes_dict() attributes.update(kwargs) - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype # _simple_new expects an the type of self._data - values = getattr(values, '_values', values) + values = getattr(values, "_values", values) if isinstance(values, ABCDatetimeArray): # `self.values` returns `self` for tz-aware, so we need to unwrap # more specifically @@ -599,9 +673,9 @@ def _shallow_copy_with_infer(self, values, **kwargs): """ attributes = self._get_attributes_dict() attributes.update(kwargs) - attributes['copy'] = False - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + attributes["copy"] = False + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype if self._infer_as_myclass: try: return self._constructor(values, **attributes) @@ -630,8 +704,7 @@ def is_(self, other): True if both have same underlying data, False otherwise : bool """ # use something other than None to be clearer - return self._id is getattr( - other, '_id', Ellipsis) and self._id is not None + return self._id is getattr(other, "_id", Ellipsis) and self._id is not None def _reset_identity(self): """ @@ -690,12 +763,15 @@ def dtype_str(self): .. deprecated:: 0.25.0 """ - warnings.warn('`dtype_str` has been deprecated. Call `str` on the ' - 'dtype attribute instead.', FutureWarning, - stacklevel=2) + warnings.warn( + "`dtype_str` has been deprecated. Call `str` on the " + "dtype attribute instead.", + FutureWarning, + stacklevel=2, + ) return str(self.dtype) - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. @@ -714,7 +790,7 @@ def view(self, cls=None): # we need to see if we are subclassing an # index type here - if cls is not None and not hasattr(cls, '_typ'): + if cls is not None and not hasattr(cls, "_typ"): result = self._data.view(cls) else: result = self._shallow_copy() @@ -722,7 +798,9 @@ def view(self, cls=None): result._id = self._id return result - _index_shared_docs['astype'] = """ + _index_shared_docs[ + "astype" + ] = """ Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is impossible, a ValueError exception is raised. @@ -747,22 +825,22 @@ def view(self, cls=None): Index with values cast to specified dtype. """ - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self elif is_categorical_dtype(dtype): from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) + + return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) elif is_datetime64tz_dtype(dtype): # TODO(GH-24559): Remove this block, use the following elif. # avoid FutureWarning from DatetimeIndex constructor. 
from pandas import DatetimeIndex + tz = pandas_dtype(dtype).tz - return (DatetimeIndex(np.asarray(self)) - .tz_localize("UTC").tz_convert(tz)) + return DatetimeIndex(np.asarray(self)).tz_localize("UTC").tz_convert(tz) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), dtype=dtype, copy=copy) @@ -770,15 +848,20 @@ def astype(self, dtype, copy=True): try: if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) + + return DatetimeIndex( + self.values, name=self.name, dtype=dtype, copy=copy + ) + return Index( + self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype + ) except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' + msg = "Cannot cast {name} to dtype {dtype}" raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - _index_shared_docs['take'] = """ + _index_shared_docs[ + "take" + ] = """ Return a new %(klass)s of the values selected by the indices. For internal compatibility with numpy arrays. @@ -804,26 +887,29 @@ def astype(self, dtype, copy=True): numpy.ndarray.take """ - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if kwargs: nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) + taken = self._assert_take_fillable( + self.values, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) else: if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' + msg = "Unable to fill values because {0} cannot contain NA" raise ValueError(msg.format(self.__class__.__name__)) taken = self.values.take(indices) return self._shallow_copy(taken) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan + ): """ Internal method to handle NA filling of take. """ @@ -832,18 +918,21 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=na_value + ) else: taken = values.take(indices) return taken - _index_shared_docs['repeat'] = """ + _index_shared_docs[ + "repeat" + ] = """ Repeat elements of a %(klass)s. 
Returns a new %(klass)s where each element of the current %(klass)s @@ -880,7 +969,7 @@ def _assert_take_fillable(self, values, indices, allow_fill=True, Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object') """ - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) @@ -888,7 +977,9 @@ def repeat(self, repeats, axis=None): # -------------------------------------------------------------------- # Copying Methods - _index_shared_docs['copy'] = """ + _index_shared_docs[ + "copy" + ] = """ Make a copy of this object. Name and dtype sets those attributes on the new object. @@ -908,14 +999,14 @@ def repeat(self, repeats, axis=None): ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ - @Appender(_index_shared_docs['copy']) + @Appender(_index_shared_docs["copy"]) def copy(self, name=None, deep=False, dtype=None, **kwargs): if deep: new_index = self._shallow_copy(self._data.copy()) else: new_index = self._shallow_copy() - names = kwargs.get('names') + names = kwargs.get("names") names = self._validate_names(name=name, names=names, deep=deep) new_index = new_index.set_names(names) @@ -949,12 +1040,11 @@ def __repr__(self): attrs = self._format_attrs() space = self._format_space() - prepr = (",%s" % - space).join("%s=%s" % (k, v) for k, v in attrs) + prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs) # no data provided, just attributes if data is None: - data = '' + data = "" res = "%s(%s%s)" % (klass, data, prepr) @@ -983,12 +1073,16 @@ def _format_data(self, name=None): """ # do we want to justify (only do so for non-objects) - is_justify = not (self.inferred_type in ('string', 'unicode') or - (self.inferred_type == 'categorical' and - is_object_dtype(self.categories))) + is_justify = not ( + self.inferred_type in ("string", "unicode") + or ( + self.inferred_type == "categorical" and is_object_dtype(self.categories) + ) + ) - return format_object_summary(self, self._formatter_func, - is_justify=is_justify, name=name) + return format_object_summary( + self, self._formatter_func, is_justify=is_justify, name=name + ) def _format_attrs(self): """ @@ -1006,16 +1100,18 @@ def format(self, name=False, formatter=None, **kwargs): """ header = [] if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') + header.append( + pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) if formatter is not None: return header + list(self.map(formatter)) return self._format_with_header(header, **kwargs) - def _format_with_header(self, header, na_rep='NaN', **kwargs): + def _format_with_header(self, header, na_rep="NaN", **kwargs): values = self.values from pandas.io.formats.format import format_array @@ -1027,8 +1123,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): values = lib.maybe_convert_objects(values, safe=1) if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] + result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values] # could have nans mask = isna(values) @@ -1038,7 +1133,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): result = result.tolist() else: - result = _trim_front(format_array(values, None, justify='left')) + result = _trim_front(format_array(values, 
None, justify="left")) return header + result def to_native_types(self, slicer=None, **kwargs): @@ -1072,7 +1167,7 @@ def to_native_types(self, slicer=None, **kwargs): values = values[slicer] return values._format_native_types(**kwargs) - def _format_native_types(self, na_rep='', quoting=None, **kwargs): + def _format_native_types(self, na_rep="", quoting=None, **kwargs): """ Actually format specific types of the index. """ @@ -1100,19 +1195,18 @@ def _summary(self, name=None): """ if len(self) > 0: head = self[0] - if hasattr(head, 'format') and not isinstance(head, str): + if hasattr(head, "format") and not isinstance(head, str): head = head.format() tail = self[-1] - if hasattr(tail, 'format') and not isinstance(tail, str): + if hasattr(tail, "format") and not isinstance(tail, str): tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) + index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) else: - index_summary = '' + index_summary = "" if name is None: name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + return "%s: %s entries%s" % (name, len(self), index_summary) def summary(self, name=None): """ @@ -1120,8 +1214,11 @@ def summary(self, name=None): .. deprecated:: 0.23.0 """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) + warnings.warn( + "'summary' is deprecated and will be removed in a " "future version.", + FutureWarning, + stacklevel=2, + ) return self._summary(name) # -------------------------------------------------------------------- @@ -1227,6 +1324,7 @@ def to_frame(self, index=True, name=None): """ from pandas import DataFrame + if name is None: name = self.name or 0 result = DataFrame({name: self._values.copy()}) @@ -1244,6 +1342,7 @@ def _validate_names(self, name=None, names=None, deep=False): Index and plural 'names' parameter for MultiIndex. """ from copy import deepcopy + if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: @@ -1258,7 +1357,7 @@ def _validate_names(self, name=None, names=None, deep=False): return name def _get_names(self): - return FrozenList((self.name, )) + return FrozenList((self.name,)) def _set_names(self, values, level=None): """ @@ -1277,17 +1376,17 @@ def _set_names(self, values, level=None): TypeError if each name is not hashable. 
""" if not is_list_like(values): - raise ValueError('Names must be a list-like') + raise ValueError("Names must be a list-like") if len(values) != 1: - raise ValueError('Length of new names must be 1, got %d' % - len(values)) + raise ValueError("Length of new names must be 1, got %d" % len(values)) # GH 20527 # All items in 'name' need to be hashable: for name in values: if not is_hashable(name): - raise TypeError('{}.name must be a hashable type' - .format(self.__class__.__name__)) + raise TypeError( + "{}.name must be a hashable type".format(self.__class__.__name__) + ) self.name = values[0] names = property(fset=_set_names, fget=_get_names) @@ -1350,10 +1449,9 @@ def set_names(self, names, level=None, inplace=False): """ if level is not None and not isinstance(self, ABCMultiIndex): - raise ValueError('Level must be None for non-MultiIndex') + raise ValueError("Level must be None for non-MultiIndex") - if level is not None and not is_list_like(level) and is_list_like( - names): + if level is not None and not is_list_like(level) and is_list_like(names): msg = "Names must be a string when a single level is provided." raise TypeError(msg) @@ -1450,15 +1548,16 @@ def _validate_index_level(self, level): """ if isinstance(level, int): if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) + raise IndexError( + "Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level,) + ) elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) + raise IndexError( + "Too many levels:" " Index has only 1 level, not %d" % (level + 1) + ) elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) + raise KeyError("Level %s must be same as name (%s)" % (level, self.name)) def _get_level_number(self, level): self._validate_index_level(level) @@ -1552,9 +1651,11 @@ def droplevel(self, level=0): if len(level) == 0: return self if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) + raise ValueError( + "Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels) + ) # The two checks above guarantee that here self is a MultiIndex new_levels = list(self.levels) @@ -1578,10 +1679,17 @@ def droplevel(self, level=0): return result else: from .multi import MultiIndex - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) - _index_shared_docs['_get_grouper_for_level'] = """ + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=new_names, + verify_integrity=False, + ) + + _index_shared_docs[ + "_get_grouper_for_level" + ] = """ Get index grouper corresponding to an index level Parameters @@ -1601,7 +1709,7 @@ def droplevel(self, level=0): Index of unique values for level. 
""" - @Appender(_index_shared_docs['_get_grouper_for_level']) + @Appender(_index_shared_docs["_get_grouper_for_level"]) def _get_grouper_for_level(self, mapper, level=None): assert level is None or level == 0 if mapper is None: @@ -1704,16 +1812,16 @@ def has_duplicates(self): return not self.is_unique def is_boolean(self): - return self.inferred_type in ['boolean'] + return self.inferred_type in ["boolean"] def is_integer(self): - return self.inferred_type in ['integer'] + return self.inferred_type in ["integer"] def is_floating(self): - return self.inferred_type in ['floating', 'mixed-integer-float'] + return self.inferred_type in ["floating", "mixed-integer-float"] def is_numeric(self): - return self.inferred_type in ['integer', 'floating'] + return self.inferred_type in ["integer", "floating"] def is_object(self): return is_object_dtype(self.dtype) @@ -1752,19 +1860,19 @@ def is_categorical(self): >>> s.index.is_categorical() False """ - return self.inferred_type in ['categorical'] + return self.inferred_type in ["categorical"] def is_interval(self): - return self.inferred_type in ['interval'] + return self.inferred_type in ["interval"] def is_mixed(self): - return self.inferred_type in ['mixed'] + return self.inferred_type in ["mixed"] def holds_integer(self): """ Whether the type is an integer type. """ - return self.inferred_type in ['integer', 'mixed-integer'] + return self.inferred_type in ["integer", "mixed-integer"] @cache_readonly def inferred_type(self): @@ -1793,7 +1901,7 @@ def __setstate__(self, state): """ if isinstance(state, dict): - self._data = state.pop('data') + self._data = state.pop("data") for k, v in state.items(): setattr(self, k, v) @@ -1909,6 +2017,7 @@ def isna(self): array([False, True, True, True], dtype=bool) """ return self._isnan + isnull = isna def notna(self): @@ -1956,9 +2065,12 @@ def notna(self): array([ True, True, True, False]) """ return ~self.isna() + notnull = notna - _index_shared_docs['fillna'] = """ + _index_shared_docs[ + "fillna" + ] = """ Fill NA/NaN values with the specified value Parameters @@ -1976,7 +2088,7 @@ def notna(self): filled : Index """ - @Appender(_index_shared_docs['fillna']) + @Appender(_index_shared_docs["fillna"]) def fillna(self, value=None, downcast=None): self._assert_can_do_op(value) if self.hasnans: @@ -1987,7 +2099,9 @@ def fillna(self, value=None, downcast=None): return Index(result, name=self.name) return self._shallow_copy() - _index_shared_docs['dropna'] = """ + _index_shared_docs[ + "dropna" + ] = """ Return Index without NA/NaN values Parameters @@ -2001,9 +2115,9 @@ def fillna(self, value=None, downcast=None): valid : Index """ - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): + if how not in ("any", "all"): raise ValueError("invalid how option: {0}".format(how)) if self.hasnans: @@ -2013,8 +2127,9 @@ def dropna(self, how='any'): # -------------------------------------------------------------------- # Uniqueness Methods - _index_shared_docs['index_unique'] = ( - """ + _index_shared_docs[ + "index_unique" + ] = """ Return unique values in the index. Uniques are returned in order of appearance, this does NOT sort. 
@@ -2033,16 +2148,16 @@ def dropna(self, how='any'): -------- unique Series.unique - """) + """ - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: self._validate_index_level(level) result = super().unique() return self._shallow_copy(result) - def drop_duplicates(self, keep='first'): + def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed. @@ -2090,7 +2205,7 @@ def drop_duplicates(self, keep='first'): """ return super().drop_duplicates(keep=keep) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): """ Indicate duplicate index values. @@ -2198,10 +2313,13 @@ def get_duplicates(self): >>> pd.Index(dates).get_duplicates() # doctest: +SKIP DatetimeIndex([], dtype='datetime64[ns]', freq=None) """ - warnings.warn("'get_duplicates' is deprecated and will be removed in " - "a future release. You can use " - "idx[idx.duplicated()].unique() instead", - FutureWarning, stacklevel=2) + warnings.warn( + "'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, + stacklevel=2, + ) return self[self.duplicated()].unique() @@ -2266,9 +2384,12 @@ def __xor__(self, other): return self.symmetric_difference(other) def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) + raise ValueError( + "The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( + self.__class__.__name__ + ) + ) __bool__ = __nonzero__ @@ -2324,13 +2445,14 @@ def _is_compatible_with_other(self, other): ------- bool """ - return (type(self) is type(other) - and is_dtype_equal(self.dtype, other.dtype)) + return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) def _validate_sort_keyword(self, sort): if sort not in [None, False]: - raise ValueError("The 'sort' keyword only takes the values of " - "None or False; {0} was passed.".format(sort)) + raise ValueError( + "The 'sort' keyword only takes the values of " + "None or False; {0} was passed.".format(sort) + ) def union(self, other, sort=None): """ @@ -2443,8 +2565,7 @@ def _union(self, other, sort): indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) result = _concat._concat_compat((lvals, other_diff)) else: @@ -2454,9 +2575,12 @@ def _union(self, other, sort): try: result = sorting.safe_sort(result) except TypeError as e: - warnings.warn("{}, sort order is undefined for " - "incomparable objects".format(e), - RuntimeWarning, stacklevel=3) + warnings.warn( + "{}, sort order is undefined for " + "incomparable objects".format(e), + RuntimeWarning, + stacklevel=3, + ) # for subclasses return self._wrap_setop_result(other, result) @@ -2464,7 +2588,9 @@ def _union(self, other, sort): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) - _index_shared_docs['intersection'] = """ + _index_shared_docs[ + "intersection" + ] = """ Form the intersection of two Index objects. This returns a new Index with elements common to the index and `other`. 
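The other recurring rewrite in this file is the `_index_shared_docs` assignments: the key is moved onto its own line between the brackets, giving the three-line subscript shape seen here for `copy`, `fillna`, `dropna`, `index_unique`, `intersection` and the rest. A before/after sketch of that shape with a stand-in dictionary (the name `_shared_docs` and the key are illustrative, not part of the patch):

    _shared_docs = {}

    # Before: key and multi-line string assigned on one line.
    _shared_docs['template'] = """
        Shared docstring body, filled in per subclass.
        """

    # After: the shape black produced in this patch, with the key
    # double-quoted and the subscript broken across lines.
    _shared_docs[
        "template"
    ] = """
        Shared docstring body, filled in per subclass.
        """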
@@ -2500,7 +2626,7 @@ def _wrap_setop_result(self, other, result): """ # TODO: standardize return type of non-union setops type(self vs other) - @Appender(_index_shared_docs['intersection']) + @Appender(_index_shared_docs["intersection"]) def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) @@ -2510,8 +2636,8 @@ def intersection(self, other, sort=False): return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') + this = self.astype("O") + other = other.astype("O") return this.intersection(other, sort=sort) # TODO(EA): setops-refactor, clean all this up @@ -2536,8 +2662,7 @@ def intersection(self, other, sort=False): indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates - indexer = algos.unique1d( - Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2609,8 +2734,7 @@ def difference(self, other, sort=None): indexer = this.get_indexer(other) indexer = indexer.take((indexer != -1).nonzero()[0]) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) if sort is None: try: @@ -2679,8 +2803,9 @@ def symmetric_difference(self, other, result_name=None, sort=None): # {this} minus {other} common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) + left_indexer = np.setdiff1d( + np.arange(this.size), common_indexer, assume_unique=True + ) left_diff = this.values.take(left_indexer) # {other} minus {this} @@ -2695,14 +2820,14 @@ def symmetric_difference(self, other, result_name=None, sort=None): pass attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None + attribs["name"] = result_name + if "freq" in attribs: + attribs["freq"] = None return self._shallow_copy_with_infer(the_diff, **attribs) def _assert_can_do_setop(self, other): if not is_list_like(other): - raise TypeError('Input must be Index or array-like') + raise TypeError("Input must be Index or array-like") return True def _convert_can_do_setop(self, other): @@ -2716,7 +2841,9 @@ def _convert_can_do_setop(self, other): # -------------------------------------------------------------------- # Indexing Methods - _index_shared_docs['get_loc'] = """ + _index_shared_docs[ + "get_loc" + ] = """ Get integer location, slice or boolean mask for requested label. 
Parameters @@ -2754,25 +2881,29 @@ def _convert_can_do_setop(self, other): array([False, True, False, True], dtype=bool) """ - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): if method is None: if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') + raise ValueError( + "tolerance argument only valid if using pad, " + "backfill or nearest lookups" + ) try: return self._engine.get_loc(key) except KeyError: return self._engine.get_loc(self._maybe_cast_indexer(key)) indexer = self.get_indexer([key], method=method, tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') + raise TypeError("get_loc requires scalar valued input") loc = indexer.item() if loc == -1: raise KeyError(key) return loc - _index_shared_docs['get_indexer'] = """ + _index_shared_docs[ + "get_indexer" + ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. @@ -2819,7 +2950,7 @@ def get_loc(self, key, method=None, tolerance=None): and ``x`` is marked by -1, as it is not in ``index``. """ - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ensure_index(target) @@ -2834,30 +2965,37 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + return pself.get_indexer( + ptarget, method=method, limit=limit, tolerance=tolerance + ) if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + return this.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + raise InvalidIndexError( + "Reindexing only valid with uniquely" " valued Index objects" + ) - if method == 'pad' or method == 'backfill': + if method == "pad" or method == "backfill": indexer = self._get_fill_indexer(target, method, limit, tolerance) - elif method == 'nearest': + elif method == "nearest": indexer = self._get_nearest_indexer(target, limit, tolerance) else: if tolerance is not None: - raise ValueError('tolerance argument only valid if doing pad, ' - 'backfill or nearest reindexing') + raise ValueError( + "tolerance argument only valid if doing pad, " + "backfill or nearest reindexing" + ) if limit is not None: - raise ValueError('limit argument only valid if doing pad, ' - 'backfill or nearest reindexing') + raise ValueError( + "limit argument only valid if doing pad, " + "backfill or nearest reindexing" + ) indexer = self._engine.get_indexer(target._ndarray_values) @@ -2867,22 +3005,23 @@ def _convert_tolerance(self, tolerance, target): # override this method on subclasses tolerance = np.asarray(tolerance) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise 
ValueError("list-like tolerance size must match " "target index size") return tolerance def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = (self._engine.get_pad_indexer if method == 'pad' else - self._engine.get_backfill_indexer) + method = ( + self._engine.get_pad_indexer + if method == "pad" + else self._engine.get_backfill_indexer + ) indexer = method(target._ndarray_values, limit) else: - indexer = self._get_fill_indexer_searchsorted(target, method, - limit) + indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._ndarray_values, - indexer, - tolerance) + indexer = self._filter_indexer_tolerance( + target._ndarray_values, indexer, tolerance + ) return indexer def _get_fill_indexer_searchsorted(self, target, method, limit=None): @@ -2891,17 +3030,18 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): indexes and non-monotonic targets. """ if limit is not None: - raise ValueError('limit argument for %r method only well-defined ' - 'if index and target are monotonic' % method) + raise ValueError( + "limit argument for %r method only well-defined " + "if index and target are monotonic" % method + ) - side = 'left' if method == 'pad' else 'right' + side = "left" if method == "pad" else "right" # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) - nonexact = (indexer == -1) - indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], - side) - if side == 'left': + nonexact = indexer == -1 + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) + if side == "left": # searchsorted returns "indices into a sorted array such that, # if the corresponding elements in v were inserted before the # indices, the order of a would be preserved". @@ -2921,19 +3061,21 @@ def _get_nearest_indexer(self, target, limit, tolerance): values that can be subtracted from each other (e.g., not strings or tuples). """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) + left_indexer = self.get_indexer(target, "pad", limit=limit) + right_indexer = self.get_indexer(target, "backfill", limit=limit) target = np.asarray(target) left_distances = abs(self.values[left_indexer] - target) right_distances = abs(self.values[right_indexer] - target) op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) + indexer = np.where( + op(left_distances, right_distances) | (right_indexer == -1), + left_indexer, + right_indexer, + ) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) + indexer = self._filter_indexer_tolerance(target, indexer, tolerance) return indexer def _filter_indexer_tolerance(self, target, indexer, tolerance): @@ -2944,7 +3086,9 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): # -------------------------------------------------------------------- # Indexer Conversion Methods - _index_shared_docs['_convert_scalar_indexer'] = """ + _index_shared_docs[ + "_convert_scalar_indexer" + ] = """ Convert a scalar indexer. 
Parameters @@ -2953,43 +3097,47 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + if kind == "iloc": + return self._validate_indexer("positional", key, kind) - if len(self) and not isinstance(self, ABCMultiIndex,): + if len(self) and not isinstance(self, ABCMultiIndex): # we can raise here if we are definitive that this # is positional indexing (eg. .ix on with a float) # or label indexing if we are using a type able # to be represented in the index - if kind in ['getitem', 'ix'] and is_float(key): + if kind in ["getitem", "ix"] and is_float(key): if not self.is_floating(): - return self._invalid_indexer('label', key) + return self._invalid_indexer("label", key) - elif kind in ['loc'] and is_float(key): + elif kind in ["loc"] and is_float(key): # we want to raise KeyError on string/mixed here # technically we *could* raise a TypeError # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) - - elif kind in ['loc'] and is_integer(key): + if self.inferred_type not in [ + "floating", + "mixed-integer-float", + "string", + "unicode", + "mixed", + ]: + return self._invalid_indexer("label", key) + + elif kind in ["loc"] and is_integer(key): if not self.holds_integer(): - return self._invalid_indexer('label', key) + return self._invalid_indexer("label", key) return key - _index_shared_docs['_convert_slice_indexer'] = """ + _index_shared_docs[ + "_convert_slice_indexer" + ] = """ Convert a slice indexer. By definition, these are labels unless 'iloc' is passed in. 
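Note that black re-wraps long `raise` and `warnings.warn` calls but never joins adjacent string literals, which is why messages such as "Reindexing only valid with uniquely" " valued Index objects" in the `get_indexer` hunk keep their original, slightly awkward split inside the reformatted call. A small sketch of that behaviour (the helpers below are illustrative and use a plain ValueError rather than pandas' InvalidIndexError):

    # Before: a long message built by implicit string concatenation so the
    # source lines stay short.
    def require_unique_before(is_unique):
        if not is_unique:
            raise ValueError('Reindexing only valid with uniquely'
                             ' valued Index objects')

    # After black: the call is wrapped, but the two string pieces stay as
    # an implicit concatenation instead of being merged into one literal.
    def require_unique_after(is_unique):
        if not is_unique:
            raise ValueError(
                "Reindexing only valid with uniquely" " valued Index objects"
            )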
@@ -3001,19 +3149,21 @@ def _convert_scalar_indexer(self, key, kind=None): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - @Appender(_index_shared_docs['_convert_slice_indexer']) + @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # if we are not a slice, then we are done if not isinstance(key, slice): return key # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + if kind == "iloc": + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) # potentially cast the bounds to integers start, stop, step = key.start, key.stop, key.step @@ -3026,15 +3176,17 @@ def is_int(v): is_index_slice = is_int(start) and is_int(stop) is_positional = is_index_slice and not self.is_integer() - if kind == 'getitem': + if kind == "getitem": """ called from the getitem slicers, validate that we are in fact integers """ if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + return slice( + self._validate_indexer("slice", key.start, kind), + self._validate_indexer("slice", key.stop, kind), + self._validate_indexer("slice", key.step, kind), + ) # convert the slice to an indexer here @@ -3048,7 +3200,7 @@ def is_int(v): self.get_loc(stop) is_positional = False except KeyError: - if self.inferred_type == 'mixed-integer-float': + if self.inferred_type == "mixed-integer-float": raise if is_null_slicer: @@ -3091,7 +3243,9 @@ def _convert_listlike_indexer(self, keyarr, kind=None): indexer = self._convert_list_indexer(keyarr, kind=kind) return indexer, keyarr - _index_shared_docs['_convert_arr_indexer'] = """ + _index_shared_docs[ + "_convert_arr_indexer" + ] = """ Convert an array-like indexer to the appropriate dtype. Parameters @@ -3104,12 +3258,14 @@ def _convert_listlike_indexer(self, keyarr, kind=None): converted_keyarr : array-like """ - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): keyarr = com.asarray_tuplesafe(keyarr) return keyarr - _index_shared_docs['_convert_index_indexer'] = """ + _index_shared_docs[ + "_convert_index_indexer" + ] = """ Convert an Index indexer to the appropriate dtype. Parameters @@ -3122,11 +3278,13 @@ def _convert_arr_indexer(self, keyarr): converted_keyarr : Index (or sub-class) """ - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): return keyarr - _index_shared_docs['_convert_list_indexer'] = """ + _index_shared_docs[ + "_convert_list_indexer" + ] = """ Convert a list-like indexer to the appropriate dtype. 
Parameters @@ -3140,13 +3298,16 @@ def _convert_index_indexer(self, keyarr): positional indexer or None """ - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): - - if self.inferred_type == 'mixed-integer': + if ( + kind in [None, "iloc", "ix"] + and is_integer_dtype(keyarr) + and not self.is_floating() + and not isinstance(keyarr, ABCPeriodIndex) + ): + + if self.inferred_type == "mixed-integer": indexer = self.get_indexer(keyarr) if (indexer >= 0).all(): return indexer @@ -3157,9 +3318,10 @@ def _convert_list_indexer(self, keyarr, kind=None): # IndexError in maybe_convert_indices indexer[indexer < 0] = len(self) from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - elif not self.inferred_type == 'integer': + elif not self.inferred_type == "integer": keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) return keyarr @@ -3169,10 +3331,12 @@ def _invalid_indexer(self, form, key): """ Consistent invalid indexer message. """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + raise TypeError( + "cannot do {form} indexing on {klass} with these " + "indexers [{key}] of {kind}".format( + form=form, klass=type(self), key=key, kind=type(key) + ) + ) # -------------------------------------------------------------------- # Reindex Methods @@ -3194,8 +3358,7 @@ def _can_reindex(self, indexer): if not self.is_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary). @@ -3213,14 +3376,14 @@ def reindex(self, target, method=None, level=None, limit=None, """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') + preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. 
target = _ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq + attrs.pop("freq", None) # don't preserve freq values = self._data[:0] # appropriately-dtyped empty array target = self._simple_new(values, dtype=self.dtype, **attrs) else: @@ -3228,23 +3391,25 @@ def reindex(self, target, method=None, level=None, limit=None, if level is not None: if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) + raise TypeError("Fill method not supported if level passed") + _, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True + ) else: if self.equals(target): indexer = None else: # check is_overlapping for IntervalIndex compat - if (self.is_unique and - not getattr(self, 'is_overlapping', False)): - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) + if self.is_unique and not getattr(self, "is_overlapping", False): + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) else: if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") + raise ValueError( + "cannot reindex a non-unique index " + "with a method or limit" + ) indexer, missing = self.get_indexer_non_unique(target) if preserve_names and target.nlevels == 1 and target.name != self.name: @@ -3315,7 +3480,9 @@ def _reindex_non_unique(self, target): # -------------------------------------------------------------------- # Join Methods - _index_shared_docs['join'] = """ + _index_shared_docs[ + "join" + ] = """ Compute join_index and indexers to conform data structures to the new index. 
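The call rewrites above follow black's two-step wrapping: a call that does not fit on one line first has all of its arguments moved onto a single indented continuation line (as with `self.get_indexer(target, method=method, limit=limit, tolerance=tolerance)`); only if that is still too long does each argument get its own line plus a trailing comma (as with the exploded `MultiIndex(...)` constructions in this file). A sketch with a plain `dict` call standing in for those constructors:

    # Fits once the arguments sit on one indented continuation line.
    opts = dict(
        name="idx", dtype="int64", copy=False, fastpath=True, verify=True
    )

    # Still too long for a single continuation line, so each argument gets
    # its own line and a trailing comma is added.
    table = dict(
        levels=["first level values", "second level values"],
        codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
        names=["outer level name", "inner level name"],
        verify_integrity=False,
    )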
@@ -3336,9 +3503,8 @@ def _reindex_non_unique(self, target): join_index, (left_indexer, right_indexer) """ - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): self_is_mi = isinstance(self, ABCMultiIndex) other_is_mi = isinstance(other, ABCMultiIndex) @@ -3350,17 +3516,17 @@ def join(self, other, how='left', level=None, return_indexers=False, if self.names == other.names: pass else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + return self._join_multi(other, how=how, return_indexers=return_indexers) # join on the level if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + return self._join_level( + other, level, how=how, return_indexers=return_indexers + ) other = ensure_index(other) - if len(other) == 0 and how in ('left', 'outer'): + if len(other) == 0 and how in ("left", "outer"): join_index = self._shallow_copy() if return_indexers: rindexer = np.repeat(-1, len(join_index)) @@ -3368,7 +3534,7 @@ def join(self, other, how='left', level=None, return_indexers=False, else: return join_index - if len(self) == 0 and how in ('right', 'outer'): + if len(self) == 0 and how in ("right", "outer"): join_index = other._shallow_copy() if return_indexers: lindexer = np.repeat(-1, len(join_index)) @@ -3377,47 +3543,52 @@ def join(self, other, how='left', level=None, return_indexers=False, return join_index if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) + how = {"right": "left", "left": "right"}.get(how, how) + result = other.join( + self, how=how, level=level, return_indexers=return_indexers + ) if return_indexers: x, y, z = result result = x, z, y return result if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') + this = self.astype("O") + other = other.astype("O") return this.join(other, how=how, return_indexers=return_indexers) _validate_join_method(how) if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) elif not self.is_unique or not other.is_unique: if self.is_monotonic and other.is_monotonic: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) else: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) + return self._join_non_unique( + other, how=how, return_indexers=return_indexers + ) elif self.is_monotonic and other.is_monotonic: try: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) + return self._join_monotonic( + other, how=how, return_indexers=return_indexers + ) except TypeError: pass - if how == 'left': + if how == "left": join_index = self - elif how == 'right': + elif how == "right": join_index = other - elif how == 'inner': + elif how == "inner": # TODO: sort=False here for backwards compat. 
It may # be better to use the sort parameter passed into join join_index = self.intersection(other, sort=False) - elif how == 'outer': + elif how == "outer": # TODO: sort=True here for backwards compat. It may # be better to use the sort parameter passed into join join_index = self.union(other) @@ -3465,23 +3636,23 @@ def _join_multi(self, other, how, return_indexers=True): # Join left and right # Join on same leveled multi-index frames is supported - join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, - return_indexers=True) + join_idx, lidx, ridx = self_jnlevels.join( + other_jnlevels, how, return_indexers=True + ) # Restore the dropped levels # Returned index level order is # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, codes, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) + levels, codes, names = _restore_dropped_levels_multijoin( + self, other, dropped_names, join_idx, lidx, ridx + ) # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, codes=codes, - names=names, verify_integrity=False) + multi_join_idx = MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=False + ) multi_join_idx = multi_join_idx.remove_unused_levels() @@ -3496,24 +3667,24 @@ def _join_multi(self, other, how, return_indexers=True): self, other = other, self flip_order = True # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) + how = {"right": "left", "left": "right"}.get(how, how) level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) + result = self._join_level( + other, level, how=how, return_indexers=return_indexers + ) if flip_order: if isinstance(result, tuple): return result[0], result[2], result[1] return result - def _join_non_unique(self, other, how='left', return_indexers=False): + def _join_non_unique(self, other, how="left", return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._ndarray_values], - [other._ndarray_values], - how=how, - sort=True) + left_idx, right_idx = _get_join_indexers( + [self._ndarray_values], [other._ndarray_values], how=how, sort=True + ) left_idx = ensure_platform_int(left_idx) right_idx = ensure_platform_int(right_idx) @@ -3529,8 +3700,9 @@ def _join_non_unique(self, other, how='left', return_indexers=False): else: return join_index - def _join_level(self, other, level, how='left', return_indexers=False, - keep_order=True): + def _join_level( + self, other, level, how="left", return_indexers=False, keep_order=True + ): """ The join method *only* affects the level of the resulting MultiIndex. Otherwise it just exactly aligns the Index data to the @@ -3548,7 +3720,7 @@ def _get_leaf_sorter(labels): order of higher levels. 
""" if labels[0].size == 0: - return np.empty(0, dtype='int64') + return np.empty(0, dtype="int64") if len(labels) == 1: lab = ensure_int64(labels[0]) @@ -3566,41 +3738,44 @@ def _get_leaf_sorter(labels): return lib.get_level_sorter(lab, ensure_int64(starts)) if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): - raise TypeError('Join on level between two MultiIndex objects ' - 'is ambiguous') + raise TypeError( + "Join on level between two MultiIndex objects " "is ambiguous" + ) left, right = self, other flip_order = not isinstance(self, MultiIndex) if flip_order: left, right = right, left - how = {'right': 'left', 'left': 'right'}.get(how, how) + how = {"right": "left", "left": "right"}.get(how, how) level = left._get_level_number(level) old_level = left.levels[level] if not right.is_unique: - raise NotImplementedError('Index._join_level on non-unique index ' - 'is not implemented') + raise NotImplementedError( + "Index._join_level on non-unique index " "is not implemented" + ) - new_level, left_lev_indexer, right_lev_indexer = \ - old_level.join(right, how=how, return_indexers=True) + new_level, left_lev_indexer, right_lev_indexer = old_level.join( + right, how=how, return_indexers=True + ) if left_lev_indexer is None: if keep_order or len(left) == 0: left_indexer = None join_index = left else: # sort the leaves - left_indexer = _get_leaf_sorter(left.codes[:level + 1]) + left_indexer = _get_leaf_sorter(left.codes[: level + 1]) join_index = left[left_indexer] else: left_lev_indexer = ensure_int64(left_lev_indexer) - rev_indexer = lib.get_reverse_indexer(left_lev_indexer, - len(old_level)) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) - new_lev_codes = algos.take_nd(rev_indexer, left.codes[level], - allow_fill=False) + new_lev_codes = algos.take_nd( + rev_indexer, left.codes[level], allow_fill=False + ) new_codes = list(left.codes) new_codes[level] = new_lev_codes @@ -3619,10 +3794,11 @@ def _get_leaf_sorter(labels): if level == 0: # outer most level, take the fast route ngroups = 1 + new_lev_codes.max() left_indexer, counts = libalgos.groupsort_indexer( - new_lev_codes, ngroups) + new_lev_codes, ngroups + ) # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0]:] + left_indexer = left_indexer[counts[0] :] new_codes = [lab[left_indexer] for lab in new_codes] else: # sort the leaves @@ -3631,7 +3807,7 @@ def _get_leaf_sorter(labels): if not mask_all: new_codes = [lab[mask] for lab in new_codes] - left_indexer = _get_leaf_sorter(new_codes[:level + 1]) + left_indexer = _get_leaf_sorter(new_codes[: level + 1]) new_codes = [lab[left_indexer] for lab in new_codes] # left_indexers are w.r.t masked frame. 
@@ -3639,13 +3815,17 @@ def _get_leaf_sorter(labels): if not mask_all: left_indexer = mask.nonzero()[0][left_indexer] - join_index = MultiIndex(levels=new_levels, codes=new_codes, - names=left.names, verify_integrity=False) + join_index = MultiIndex( + levels=new_levels, + codes=new_codes, + names=left.names, + verify_integrity=False, + ) if right_lev_indexer is not None: - right_indexer = algos.take_nd(right_lev_indexer, - join_index.codes[level], - allow_fill=False) + right_indexer = algos.take_nd( + right_lev_indexer, join_index.codes[level], allow_fill=False + ) else: right_indexer = join_index.codes[level] @@ -3653,17 +3833,19 @@ def _get_leaf_sorter(labels): left_indexer, right_indexer = right_indexer, left_indexer if return_indexers: - left_indexer = (None if left_indexer is None - else ensure_platform_int(left_indexer)) - right_indexer = (None if right_indexer is None - else ensure_platform_int(right_indexer)) + left_indexer = ( + None if left_indexer is None else ensure_platform_int(left_indexer) + ) + right_indexer = ( + None if right_indexer is None else ensure_platform_int(right_indexer) + ) return join_index, left_indexer, right_indexer else: return join_index - def _join_monotonic(self, other, how='left', return_indexers=False): + def _join_monotonic(self, other, how="left", return_indexers=False): if self.equals(other): - ret_index = other if how == 'right' else self + ret_index = other if how == "right" else self if return_indexers: return ret_index, None, None else: @@ -3674,28 +3856,28 @@ def _join_monotonic(self, other, how='left', return_indexers=False): if self.is_unique and other.is_unique: # We can perform much better than the general case - if how == 'left': + if how == "left": join_index = self lidx = None ridx = self._left_indexer_unique(sv, ov) - elif how == 'right': + elif how == "right": join_index = other lidx = self._left_indexer_unique(ov, sv) ridx = None - elif how == 'inner': + elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) - elif how == 'outer': + elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) else: - if how == 'left': + if how == "left": join_index, lidx, ridx = self._left_indexer(sv, ov) - elif how == 'right': + elif how == "right": join_index, ridx, lidx = self._left_indexer(ov, sv) - elif how == 'inner': + elif how == "inner": join_index, lidx, ridx = self._inner_indexer(sv, ov) - elif how == 'outer': + elif how == "outer": join_index, lidx, ridx = self._outer_indexer(sv, ov) join_index = self._wrap_joined_index(join_index, other) @@ -3813,7 +3995,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -3827,7 +4011,9 @@ def memory_usage(self, deep=False): result += self._engine.sizeof(deep=deep) return result - _index_shared_docs['where'] = """ + _index_shared_docs[ + "where" + ] = """ Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other. 
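Long conditional expressions get a similar treatment: black first parenthesises the whole expression on its own indented line and splits before the `if` and `else` keywords when it still does not fit, which is where the `left_indexer`/`right_indexer` and `get_pad_indexer`/`get_backfill_indexer` rewrites above come from. A sketch with placeholder names (shown fully split purely to illustrate the shape, regardless of actual line length):

    method = "pad"
    get_pad_lookup = len          # stand-ins, not pandas engine methods
    get_backfill_lookup = sorted

    # Parenthesised, then broken before `if`/`else` once a single indented
    # line would still exceed the line-length limit.
    fill_lookup = (
        get_pad_lookup
        if method == "pad"
        else get_backfill_lookup
    )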
@@ -3844,7 +4030,7 @@ def memory_usage(self, deep=False): Index """ - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): if other is None: other = self._na_value @@ -3890,11 +4076,12 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): """ from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): # skip int64 conversion attempt if uint-like dtype is passed, as # this could return Int64Index when UInt64Index is what's desired try: - res = data.astype('i8', copy=False) + res = data.astype("i8", copy=False) if (res == data).all(): return Int64Index(res, copy=copy, name=name) except (OverflowError, TypeError, ValueError): @@ -3903,7 +4090,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): # Conversion to int64 failed (possibly due to overflow) or was skipped, # so let's try now with uint64. try: - res = data.astype('u8', copy=False) + res = data.astype("u8", copy=False) if (res == data).all(): return UInt64Index(res, copy=copy, name=name) except (OverflowError, TypeError, ValueError): @@ -3913,14 +4100,17 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): @classmethod def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) + raise TypeError( + "{0}(...) must be called with a collection of some " + "kind, {1} was passed".format(cls.__name__, repr(data)) + ) @classmethod def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') + raise TypeError( + "String dtype not supported, you may need " + "to explicitly cast to a numeric type" + ) @classmethod def _coerce_to_ndarray(cls, data): @@ -4000,7 +4190,9 @@ def is_type_compatible(self, kind): """ return kind == self.inferred_type - _index_shared_docs['contains'] = """ + _index_shared_docs[ + "contains" + ] = """ Return a boolean indicating whether the provided key is in the index. Parameters @@ -4030,7 +4222,7 @@ def is_type_compatible(self, kind): False """ - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): hash(key) try: @@ -4052,7 +4244,10 @@ def contains(self, key): warnings.warn( "The 'contains' method is deprecated and will be removed in a " "future version. Use 'key in index' instead of " - "'index.contains(key)'", FutureWarning, stacklevel=2) + "'index.contains(key)'", + FutureWarning, + stacklevel=2, + ) return key in self def __hash__(self): @@ -4131,7 +4326,7 @@ def append(self, other): for obj in to_concat: if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + raise TypeError("all inputs must be Index") names = {obj.name for obj in to_concat} name = None if len(names) > 1 else self.name @@ -4197,8 +4392,9 @@ def equals(self, other): return other.equals(self) try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) + return array_equivalent( + com.values_from_object(self), com.values_from_object(other) + ) except Exception: return False @@ -4213,10 +4409,16 @@ def identical(self, other): If two Index objects have equal elements and same type True, otherwise False. 
""" - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) + return ( + self.equals(other) + and all( + ( + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables + ) + ) + and type(self) == type(other) + ) def asof(self, label): """ @@ -4275,7 +4477,7 @@ def asof(self, label): ValueError: index must be monotonic increasing or decreasing """ try: - loc = self.get_loc(label, method='pad') + loc = self.get_loc(label, method="pad") except KeyError: return self._na_value else: @@ -4312,7 +4514,7 @@ def asof_locs(self, where, mask): which correspond to the return values of the `asof` function for every element in `where`. """ - locs = self.values[mask].searchsorted(where.values, side='right') + locs = self.values[mask].searchsorted(where.values, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) @@ -4380,8 +4582,9 @@ def sort(self, *args, **kwargs): """ Use sort_values instead. """ - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + raise TypeError( + "cannot sort an Index object in-place, use " "sort_values instead" + ) def shift(self, periods=1, freq=None): """ @@ -4439,8 +4642,7 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError("Not supported for type %s" % - type(self).__name__) + raise NotImplementedError("Not supported for type %s" % type(self).__name__) def argsort(self, *args, **kwargs): """ @@ -4482,7 +4684,9 @@ def argsort(self, *args, **kwargs): result = np.array(self) return result.argsort(*args, **kwargs) - _index_shared_docs['get_value'] = """ + _index_shared_docs[ + "get_value" + ] = """ Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing. @@ -4492,13 +4696,13 @@ def argsort(self, *args, **kwargs): A value in the Series with the index of the key value in self. """ - @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex # Things like `Series._get_value` (via .at) pass the EA directly here. - s = getattr(series, '_values', series) + s = getattr(series, "_values", series) if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): # GH 20882, 21257 # Unify Index and ExtensionArray treatment @@ -4510,8 +4714,7 @@ def get_value(self, series, key): iloc = self.get_loc(key) return s[iloc] except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise elif is_integer(key): return s[key] @@ -4519,10 +4722,9 @@ def get_value(self, series, key): s = com.values_from_object(series) k = com.values_from_object(key) - k = self._convert_scalar_indexer(k, kind='getitem') + k = self._convert_scalar_indexer(k, kind="getitem") try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) + return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) except KeyError as e1: if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise @@ -4553,10 +4755,13 @@ def set_value(self, arr, key, value): ----- Only use this if you know what you're doing. 
""" - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + self._engine.set_value( + com.values_from_object(arr), com.values_from_object(key), value + ) - _index_shared_docs['get_indexer_non_unique'] = """ + _index_shared_docs[ + "get_indexer_non_unique" + ] = """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the current data to the new index. @@ -4576,7 +4781,7 @@ def set_value(self, arr, key, value): These correspond to the -1 in the indexer array. """ - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) if is_categorical(target): @@ -4614,11 +4819,12 @@ def get_indexer_for(self, target, **kwargs): def _maybe_promote(self, other): # A hack, but it works from pandas import DatetimeIndex - if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + + if self.inferred_type == "date" and isinstance(other, DatetimeIndex): return DatetimeIndex(self), other - elif self.inferred_type == 'boolean': + elif self.inferred_type == "boolean": if not is_object_dtype(self.dtype): - return self.astype('object'), other.astype('object') + return self.astype("object"), other.astype("object") return self, other def groupby(self, values): @@ -4669,6 +4875,7 @@ def map(self, mapper, na_action=None): """ from .multi import MultiIndex + new_values = super()._map_values(mapper, na_action=na_action) attributes = self._get_attributes_dict() @@ -4677,17 +4884,16 @@ def map(self, mapper, na_action=None): if new_values.size and isinstance(new_values[0], tuple): if isinstance(self, MultiIndex): names = self.names - elif attributes.get('name'): - names = [attributes.get('name')] * len(new_values[0]) + elif attributes.get("name"): + names = [attributes.get("name")] * len(new_values[0]) else: names = None - return MultiIndex.from_tuples(new_values, - names=names) + return MultiIndex.from_tuples(new_values, names=names) - attributes['copy'] = False + attributes["copy"] = False if not new_values.size: # empty - attributes['dtype'] = self.dtype + attributes["dtype"] = self.dtype return Index(new_values, **attributes) @@ -4823,8 +5029,7 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): >>> idx.slice_indexer(start='b', end=('c', 'g')) slice(1, 3) """ - start_slice, end_slice = self.slice_locs(start, end, step=step, - kind=kind) + start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind) # return a slice if not is_scalar(start_slice): @@ -4854,17 +5059,19 @@ def _validate_indexer(self, form, key, kind): If we are positional indexer, validate that we have appropriate typed bounds must be an integer. """ - assert kind in ['ix', 'loc', 'getitem', 'iloc'] + assert kind in ["ix", "loc", "getitem", "iloc"] if key is None: pass elif is_integer(key): pass - elif kind in ['iloc', 'getitem']: + elif kind in ["iloc", "getitem"]: self._invalid_indexer(form, key) return key - _index_shared_docs['_maybe_cast_slice_bound'] = """ + _index_shared_docs[ + "_maybe_cast_slice_bound" + ] = """ This function should be overloaded in subclasses that allow non-trivial casting on label-slice bounds, e.g. datetime-like indices allowing strings containing formatted datetimes. 
@@ -4885,38 +5092,38 @@ def _validate_indexer(self, form, key, kind): """ - @Appender(_index_shared_docs['_maybe_cast_slice_bound']) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them if is_float(label): - if not (kind in ['ix'] and (self.holds_integer() or - self.is_floating())): - self._invalid_indexer('slice', label) + if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): + self._invalid_indexer("slice", label) # we are trying to find integer bounds on a non-integer based index # this is rejected (generally .loc gets you here) elif is_integer(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) return label - def _searchsorted_monotonic(self, label, side='left'): + def _searchsorted_monotonic(self, label, side="left"): if self.is_monotonic_increasing: return self.searchsorted(label, side=side) elif self.is_monotonic_decreasing: # np.searchsorted expects ascending sort order, have to reverse # everything for it to work (element ordering, search side and # resulting value). - pos = self[::-1].searchsorted(label, side='right' if side == 'left' - else 'left') + pos = self[::-1].searchsorted( + label, side="right" if side == "left" else "left" + ) return len(self) - pos - raise ValueError('index must be monotonic increasing or decreasing') + raise ValueError("index must be monotonic increasing or decreasing") def get_slice_bound(self, label, side, kind): """ @@ -4936,12 +5143,13 @@ def get_slice_bound(self, label, side, kind): int Index of label. """ - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] - if side not in ('left', 'right'): - raise ValueError("Invalid value for side kwarg," - " must be either 'left' or 'right': %s" % - (side, )) + if side not in ("left", "right"): + raise ValueError( + "Invalid value for side kwarg," + " must be either 'left' or 'right': %s" % (side,) + ) original_label = label @@ -4963,20 +5171,22 @@ def get_slice_bound(self, label, side, kind): # get_loc may return a boolean array or an array of indices, which # is OK as long as they are representable by a slice. if is_bool_dtype(slc): - slc = lib.maybe_booleans_to_slice(slc.view('u1')) + slc = lib.maybe_booleans_to_slice(slc.view("u1")) else: - slc = lib.maybe_indices_to_slice(slc.astype('i8'), len(self)) + slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) if isinstance(slc, np.ndarray): - raise KeyError("Cannot get %s slice bound for non-unique " - "label: %r" % (side, original_label)) + raise KeyError( + "Cannot get %s slice bound for non-unique " + "label: %r" % (side, original_label) + ) if isinstance(slc, slice): - if side == 'left': + if side == "left": return slc.start else: return slc.stop else: - if side == 'right': + if side == "right": return slc + 1 else: return slc @@ -5013,7 +5223,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): >>> idx.slice_locs(start='b', end='c') (1, 3) """ - inc = (step is None or step >= 0) + inc = step is None or step >= 0 if not inc: # If it's a reverse slice, temporarily swap bounds. 
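black also strips parentheses it considers redundant around the right-hand side of simple assignments, which is where `nonexact = indexer == -1` (in the `_get_fill_indexer_searchsorted` hunk) and `inc = step is None or step >= 0` (just below) come from. For example:

    step = None
    indexer = -1

    # Before: defensive parentheses around boolean expressions.
    inc = (step is None or step >= 0)
    nonexact = (indexer == -1)

    # After black: the parentheses are dropped because each expression fits
    # on a single line.
    inc = step is None or step >= 0
    nonexact = indexer == -1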
@@ -5021,8 +5231,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # GH 16785: If start and end happen to be date strings with UTC offsets # attempt to parse and check that the offsets are the same - if (isinstance(start, (str, datetime)) - and isinstance(end, (str, datetime))): + if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)): try: ts_start = Timestamp(start) ts_end = Timestamp(end) @@ -5030,18 +5239,17 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): pass else: if not tz_compare(ts_start.tzinfo, ts_end.tzinfo): - raise ValueError("Both dates must have the " - "same UTC offset") + raise ValueError("Both dates must have the " "same UTC offset") start_slice = None if start is not None: - start_slice = self.get_slice_bound(start, 'left', kind) + start_slice = self.get_slice_bound(start, "left", kind) if start_slice is None: start_slice = 0 end_slice = None if end is not None: - end_slice = self.get_slice_bound(end, 'right', kind) + end_slice = self.get_slice_bound(end, "right", kind) if end_slice is None: end_slice = len(self) @@ -5102,7 +5310,7 @@ def insert(self, loc, item): idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) - def drop(self, labels, errors='raise'): + def drop(self, labels, errors="raise"): """ Make new Index with passed list of labels deleted. @@ -5121,14 +5329,13 @@ def drop(self, labels, errors='raise'): KeyError If not all of the labels are found in the selected axis """ - arr_dtype = 'object' if self.dtype == 'object' else None + arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - if errors != 'ignore': - raise KeyError( - '{} not found in axis'.format(labels[mask])) + if errors != "ignore": + raise KeyError("{} not found in axis".format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) @@ -5138,17 +5345,18 @@ def drop(self, labels, errors='raise'): def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that # operation and then wrap the results - if self._is_numeric_dtype and op.__name__ in ['add', 'sub', - 'radd', 'rsub']: - raise TypeError("Operation {opname} between {cls} and {other} " - "is invalid".format(opname=op.__name__, - cls=self.dtype, - other=type(other).__name__)) + if self._is_numeric_dtype and op.__name__ in ["add", "sub", "radd", "rsub"]: + raise TypeError( + "Operation {opname} between {cls} and {other} " + "is invalid".format( + opname=op.__name__, cls=self.dtype, other=type(other).__name__ + ) + ) other = Timedelta(other) values = self.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(values, other) attrs = self._get_attributes_dict() @@ -5177,32 +5385,32 @@ def _add_numeric_methods_add_sub_disabled(cls): """ Add in the numeric add/sub methods to disable. 
""" - cls.__add__ = make_invalid_op('__add__') - cls.__radd__ = make_invalid_op('__radd__') - cls.__iadd__ = make_invalid_op('__iadd__') - cls.__sub__ = make_invalid_op('__sub__') - cls.__rsub__ = make_invalid_op('__rsub__') - cls.__isub__ = make_invalid_op('__isub__') + cls.__add__ = make_invalid_op("__add__") + cls.__radd__ = make_invalid_op("__radd__") + cls.__iadd__ = make_invalid_op("__iadd__") + cls.__sub__ = make_invalid_op("__sub__") + cls.__rsub__ = make_invalid_op("__rsub__") + cls.__isub__ = make_invalid_op("__isub__") @classmethod def _add_numeric_methods_disabled(cls): """ Add in numeric methods to disable other than add/sub. """ - cls.__pow__ = make_invalid_op('__pow__') - cls.__rpow__ = make_invalid_op('__rpow__') - cls.__mul__ = make_invalid_op('__mul__') - cls.__rmul__ = make_invalid_op('__rmul__') - cls.__floordiv__ = make_invalid_op('__floordiv__') - cls.__rfloordiv__ = make_invalid_op('__rfloordiv__') - cls.__truediv__ = make_invalid_op('__truediv__') - cls.__rtruediv__ = make_invalid_op('__rtruediv__') - cls.__mod__ = make_invalid_op('__mod__') - cls.__divmod__ = make_invalid_op('__divmod__') - cls.__neg__ = make_invalid_op('__neg__') - cls.__pos__ = make_invalid_op('__pos__') - cls.__abs__ = make_invalid_op('__abs__') - cls.__inv__ = make_invalid_op('__inv__') + cls.__pow__ = make_invalid_op("__pow__") + cls.__rpow__ = make_invalid_op("__rpow__") + cls.__mul__ = make_invalid_op("__mul__") + cls.__rmul__ = make_invalid_op("__rmul__") + cls.__floordiv__ = make_invalid_op("__floordiv__") + cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") + cls.__truediv__ = make_invalid_op("__truediv__") + cls.__rtruediv__ = make_invalid_op("__rtruediv__") + cls.__mod__ = make_invalid_op("__mod__") + cls.__divmod__ = make_invalid_op("__divmod__") + cls.__neg__ = make_invalid_op("__neg__") + cls.__pos__ = make_invalid_op("__pos__") + cls.__abs__ = make_invalid_op("__abs__") + cls.__inv__ = make_invalid_op("__inv__") def _maybe_update_attributes(self, attrs): """ @@ -5215,9 +5423,10 @@ def _validate_for_numeric_unaryop(self, op, opstr): Validate if we can perform a numeric unary operation. """ if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op " - "{opstr} for type: {typ}" - .format(opstr=opstr, typ=type(self).__name__)) + raise TypeError( + "cannot evaluate a numeric op " + "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__) + ) def _validate_for_numeric_binop(self, other, op): """ @@ -5228,30 +5437,32 @@ def _validate_for_numeric_binop(self, other, op): ----- This is an internal method called by ops. """ - opstr = '__{opname}__'.format(opname=op.__name__) + opstr = "__{opname}__".format(opname=op.__name__) # if we are an inheritor of numeric, # but not actually numeric (e.g. 
DatetimeIndex/PeriodIndex) if not self._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op {opstr} " - "for type: {typ}" - .format(opstr=opstr, typ=type(self).__name__)) + raise TypeError( + "cannot evaluate a numeric op {opstr} " + "for type: {typ}".format(opstr=opstr, typ=type(self).__name__) + ) if isinstance(other, Index): if not other._is_numeric_dtype: - raise TypeError("cannot evaluate a numeric op " - "{opstr} with type: {typ}" - .format(opstr=opstr, typ=type(other))) + raise TypeError( + "cannot evaluate a numeric op " + "{opstr} with type: {typ}".format(opstr=opstr, typ=type(other)) + ) elif isinstance(other, np.ndarray) and not other.ndim: other = other.item() if isinstance(other, (Index, ABCSeries, np.ndarray)): if len(self) != len(other): - raise ValueError("cannot evaluate a numeric op with " - "unequal lengths") + raise ValueError("cannot evaluate a numeric op with " "unequal lengths") other = com.values_from_object(other) - if other.dtype.kind not in ['f', 'i', 'u']: - raise TypeError("cannot evaluate a numeric op " - "with a non-numeric dtype") + if other.dtype.kind not in ["f", "i", "u"]: + raise TypeError( + "cannot evaluate a numeric op " "with a non-numeric dtype" + ) elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): # higher up to handle pass @@ -5292,8 +5503,8 @@ def _add_numeric_methods_unary(cls): """ Add in numeric unary methods. """ - def _make_evaluate_unary(op, opstr): + def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): self._validate_for_numeric_unaryop(op, opstr) @@ -5304,10 +5515,10 @@ def _evaluate_numeric_unary(self): _evaluate_numeric_unary.__name__ = opstr return _evaluate_numeric_unary - cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__') - cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__') - cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__') - cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__') + cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__") + cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__") + cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__") + cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__") @classmethod def _add_numeric_methods(cls): @@ -5334,7 +5545,8 @@ def _add_logical_methods(cls): %(outname)s : bool or array_like (if axis is specified) A single element array_like may be converted to bool.""" - _index_shared_docs['index_all'] = dedent(""" + _index_shared_docs["index_all"] = dedent( + """ See Also -------- @@ -5372,9 +5584,11 @@ def _add_logical_methods(cls): >>> pd.Index([0, 0, 0]).any() False - """) + """ + ) - _index_shared_docs['index_any'] = dedent(""" + _index_shared_docs["index_any"] = dedent( + """ See Also -------- @@ -5395,16 +5609,19 @@ def _add_logical_methods(cls): >>> index = pd.Index([0, 0, 0]) >>> index.any() False - """) + """ + ) def _make_logical_function(name, desc, f): @Substitution(outname=name, desc=desc) - @Appender(_index_shared_docs['index_' + name]) + @Appender(_index_shared_docs["index_" + name]) @Appender(_doc) def logical_func(self, *args, **kwargs): result = f(self.values) - if (isinstance(result, (np.ndarray, ABCSeries, Index)) and - result.ndim == 0): + if ( + isinstance(result, (np.ndarray, ABCSeries, Index)) + and result.ndim == 0 + ): # return NumPy type return result.dtype.type(result.item()) else: # pragma: no cover @@ -5413,20 +5630,20 @@ def logical_func(self, *args, **kwargs): logical_func.__name__ = name return logical_func - cls.all = _make_logical_function('all', 'Return whether all 
elements ' - 'are True.', - np.all) - cls.any = _make_logical_function('any', - 'Return whether any element is True.', - np.any) + cls.all = _make_logical_function( + "all", "Return whether all elements " "are True.", np.all + ) + cls.any = _make_logical_function( + "any", "Return whether any element is True.", np.any + ) @classmethod def _add_logical_methods_disabled(cls): """ Add in logical methods to disable. """ - cls.all = make_invalid_op('all') - cls.any = make_invalid_op('any') + cls.all = make_invalid_op("all") + cls.any = make_invalid_op("any") Index._add_numeric_methods_disabled() @@ -5511,7 +5728,7 @@ def ensure_index(index_like, copy=False): if copy: index_like = index_like.copy() return index_like - if hasattr(index_like, 'name'): + if hasattr(index_like, "name"): return Index(index_like, name=index_like.name, copy=copy) if is_iterator(index_like): @@ -5527,6 +5744,7 @@ def ensure_index(index_like, copy=False): if len(converted) > 0 and all_arrays: from .multi import MultiIndex + return MultiIndex.from_arrays(converted) else: index_like = converted @@ -5535,6 +5753,7 @@ def ensure_index(index_like, copy=False): # so only need to do this if not list instance if copy: from copy import copy + index_like = copy(index_like) return Index(index_like) @@ -5557,16 +5776,17 @@ def _trim_front(strings): Trims zeros and decimal points. """ trimmed = strings - while len(strings) > 0 and all(x[0] == ' ' for x in trimmed): + while len(strings) > 0 and all(x[0] == " " for x in trimmed): trimmed = [x[1:] for x in trimmed] return trimmed def _validate_join_method(method): - if method not in ['left', 'right', 'inner', 'outer']: - raise ValueError('do not recognize join method %s' % method) + if method not in ["left", "right", "inner", "outer"]: + raise ValueError("do not recognize join method %s" % method) def default_index(n): from pandas.core.index import RangeIndex + return RangeIndex(0, n, name=None) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 321297335cf23..9550d68f1d32b 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -12,8 +12,12 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like, - is_scalar) + ensure_platform_int, + is_categorical_dtype, + is_interval_dtype, + is_list_like, + is_scalar, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCCategorical, ABCSeries from pandas.core.dtypes.missing import isna @@ -29,20 +33,26 @@ from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) +_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @accessor.delegate_names( delegate=Categorical, - accessors=["rename_categories", - "reorder_categories", - "add_categories", - "remove_categories", - "remove_unused_categories", - "set_categories", - "as_ordered", "as_unordered", - "min", "max"], - typ='method', overwrite=True) + accessors=[ + "rename_categories", + "reorder_categories", + "add_categories", + "remove_categories", + "remove_unused_categories", + "set_categories", + "as_ordered", + "as_unordered", + "min", + "max", + ], + typ="method", + overwrite=True, +) class CategoricalIndex(Index, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. 
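Not part of the patch: a toy usage of a few of the Categorical methods that the delegate_names decorator above exposes on CategoricalIndex. The data is made up; the point is only that the delegated methods return new indexes rather than mutating in place.

    import pandas as pd

    ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b", "c"])

    # Delegated methods come from the underlying Categorical and return new objects.
    wider = ci.add_categories(["d"])         # categories: a, b, c, d
    trimmed = ci.remove_unused_categories()  # categories: a, b
    print(wider.categories)
    print(trimmed.categories)

    # min/max are also delegated; they require an ordered categorical.
    print(ci.as_ordered().min())  # 'a'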
@@ -134,37 +144,48 @@ class CategoricalIndex(Index, accessor.PandasDelegate): 'c' """ - _typ = 'categoricalindex' + _typ = "categoricalindex" @property def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need # to return the corresponding engine type (libindex.Int8Engine, etc.). - return {np.int8: libindex.Int8Engine, - np.int16: libindex.Int16Engine, - np.int32: libindex.Int32Engine, - np.int64: libindex.Int64Engine, - }[self.codes.dtype.type] + return { + np.int8: libindex.Int8Engine, + np.int16: libindex.Int16Engine, + np.int32: libindex.Int32Engine, + np.int64: libindex.Int64Engine, + }[self.codes.dtype.type] - _attributes = ['name'] + _attributes = ["name"] # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, categories=None, ordered=None, dtype=None, - copy=False, name=None, fastpath=None): + def __new__( + cls, + data=None, + categories=None, + ordered=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + ): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name=name, dtype=dtype) - dtype = CategoricalDtype._from_values_or_dtype(data, categories, - ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if not is_categorical_dtype(data): @@ -221,8 +242,7 @@ def _create_categorical(cls, data, dtype=None): ------- Categorical """ - if (isinstance(data, (cls, ABCSeries)) and - is_categorical_dtype(data)): + if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data): data = data.values if not isinstance(data, ABCCategorical): @@ -248,7 +268,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): # -------------------------------------------------------------------- - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, dtype=None, **kwargs): if dtype is None: dtype = self.dtype @@ -269,17 +289,18 @@ def _is_dtype_compat(self, other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): - raise TypeError("categories must match existing categories " - "when appending") + raise TypeError( + "categories must match existing categories " "when appending" + ) else: values = other if not is_list_like(values): values = [values] - other = CategoricalIndex(self._create_categorical( - other, dtype=self.dtype)) + other = CategoricalIndex(self._create_categorical(other, dtype=self.dtype)) if not other.isin(values).all(): - raise TypeError("cannot append a non-category item to a " - "CategoricalIndex") + raise TypeError( + "cannot append a non-category item to a " "CategoricalIndex" + ) return other @@ -320,26 +341,31 @@ def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) """ - max_categories = (10 if get_option("display.max_categories") == 0 else - get_option("display.max_categories")) + max_categories = ( + 10 + if get_option("display.max_categories") == 0 + else get_option("display.max_categories") + ) attrs = [ - ('categories', - ibase.default_pprint(self.categories, - 
max_seq_items=max_categories)), - ('ordered', self.ordered)] + ( + "categories", + ibase.default_pprint(self.categories, max_seq_items=max_categories), + ), + ("ordered", self.ordered), + ] if self.name is not None: - attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype.name)) - max_seq_items = get_option('display.max_seq_items') or len(self) + attrs.append(("name", ibase.default_pprint(self.name))) + attrs.append(("dtype", "'%s'" % self.dtype.name)) + max_seq_items = get_option("display.max_seq_items") or len(self) if len(self) > max_seq_items: - attrs.append(('length', len(self))) + attrs.append(("length", len(self))) return attrs # -------------------------------------------------------------------- @property def inferred_type(self): - return 'categorical' + return "categorical" @property def values(self): @@ -378,7 +404,7 @@ def ordered(self): def _reverse_indexer(self): return self._data._reverse_indexer() - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): # if key is a NaN, check if any NaN is in self. if isna(key): @@ -390,10 +416,11 @@ def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex + return IntervalIndex(np.array(self)) elif is_categorical_dtype(dtype): # GH 18630 @@ -408,7 +435,7 @@ def _isnan(self): """ return if each value is nan""" return self._data.codes == -1 - @Appender(ibase._index_shared_docs['fillna']) + @Appender(ibase._index_shared_docs["fillna"]) def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) @@ -435,7 +462,7 @@ def is_monotonic_increasing(self): def is_monotonic_decreasing(self): return self._engine.is_monotonic_decreasing - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: self._validate_index_level(level) @@ -445,14 +472,15 @@ def unique(self, level=None): return self._shallow_copy(result, dtype=result.dtype) @Appender(Index.duplicated.__doc__) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas._libs.hashtable import duplicated_int64 - codes = self.codes.astype('i8') + + codes = self.codes.astype("i8") return duplicated_int64(codes, keep) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ - return self.astype('object') + return self.astype("object") def get_loc(self, key, method=None): """ @@ -493,9 +521,7 @@ def get_loc(self, key, method=None): except KeyError: raise KeyError(key) - def get_value(self, - series: AnyArrayLike, - key: Any): + def get_value(self, series: AnyArrayLike, key: Any): """ Fast lookup of value from 1-dimensional ndarray. 
Only use this if you know what you're doing @@ -515,7 +541,7 @@ def get_value(self, """ try: k = com.values_from_object(key) - k = self._convert_scalar_indexer(k, kind='getitem') + k = self._convert_scalar_indexer(k, kind="getitem") indexer = self.get_loc(k) return series.take([indexer])[0] except (KeyError, TypeError): @@ -528,7 +554,7 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with # 1. copy the underlying Categorical @@ -540,8 +566,7 @@ def where(self, cond, other=None): cat = Categorical(values, dtype=self.dtype) return self._shallow_copy(cat, **self._get_attributes_dict()) - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -555,14 +580,17 @@ def reindex(self, target, method=None, level=None, limit=None, """ if method is not None: - raise NotImplementedError("argument method is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument method is not implemented for " "CategoricalIndex.reindex" + ) if level is not None: - raise NotImplementedError("argument level is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument level is not implemented for " "CategoricalIndex.reindex" + ) if limit is not None: - raise NotImplementedError("argument limit is not implemented for " - "CategoricalIndex.reindex") + raise NotImplementedError( + "argument limit is not implemented for " "CategoricalIndex.reindex" + ) target = ibase.ensure_index(target) @@ -587,8 +615,7 @@ def reindex(self, target, method=None, level=None, limit=None, if (cats == -1).any(): # coerce to a regular index here! 
result = Index(np.array(self), name=self.name) - new_target, indexer, _ = result._reindex_non_unique( - np.array(target)) + new_target, indexer, _ = result._reindex_non_unique(np.array(target)) else: codes = new_target.codes.copy() @@ -628,7 +655,7 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): from pandas.core.arrays.categorical import _recode_for_categories @@ -636,24 +663,26 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ibase.ensure_index(target) if self.is_unique and self.equals(target): - return np.arange(len(self), dtype='intp') - - if method == 'pad' or method == 'backfill': - raise NotImplementedError("method='pad' and method='backfill' not " - "implemented yet for CategoricalIndex") - elif method == 'nearest': - raise NotImplementedError("method='nearest' not implemented yet " - 'for CategoricalIndex') - - if (isinstance(target, CategoricalIndex) and - self.values.is_dtype_equal(target)): + return np.arange(len(self), dtype="intp") + + if method == "pad" or method == "backfill": + raise NotImplementedError( + "method='pad' and method='backfill' not " + "implemented yet for CategoricalIndex" + ) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " "for CategoricalIndex" + ) + + if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target): if self.values.equals(target.values): # we have the same codes codes = target.codes else: - codes = _recode_for_categories(target.codes, - target.categories, - self.values.categories) + codes = _recode_for_categories( + target.codes, target.categories, self.values.categories + ) else: if isinstance(target, CategoricalIndex): code_indexer = self.categories.get_indexer(target.categories) @@ -664,7 +693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) @@ -680,14 +709,14 @@ def get_indexer_non_unique(self, target): indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): if self.categories._defer_to_indexing: return self.categories._convert_scalar_indexer(key, kind=kind) return super()._convert_scalar_indexer(key, kind=kind) - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): # Return our indexer or raise if all of the values are not included in # the categories @@ -701,11 +730,12 @@ def _convert_list_indexer(self, keyarr, kind=None): raise KeyError( "a list-indexer must only " "include values that are " - "in the categories") + "in the categories" + ) return self.get_indexer(keyarr) - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): keyarr = 
com.asarray_tuplesafe(keyarr) @@ -714,19 +744,21 @@ def _convert_arr_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.codes, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1) + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) return self._create_from_codes(taken) def is_dtype_equal(self, other): @@ -834,8 +866,10 @@ def insert(self, loc, item): """ code = self.categories.get_indexer([item]) if (code == -1) and not (is_scalar(item) and isna(item)): - raise TypeError("cannot insert an item into a CategoricalIndex " - "that is not already an existing category") + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) codes = self.codes codes = np.concatenate((codes[:loc], code, codes[loc:])) @@ -850,8 +884,7 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class ValueError if other is not in the categories """ - codes = np.concatenate([self._is_dtype_compat(c).codes - for c in to_concat]) + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) result = self._create_from_codes(codes, name=name) # if name is None, _create_from_codes sets self.name result.name = name @@ -866,7 +899,7 @@ def _add_comparison_methods(cls): """ add in comparison methods """ def _make_compare(op): - opname = '__{op}__'.format(op=op.__name__) + opname = "__{op}__".format(op=op.__name__) def _evaluate_compare(self, other): @@ -875,19 +908,19 @@ def _evaluate_compare(self, other): if isinstance(other, CategoricalIndex): other = other._values elif isinstance(other, Index): - other = self._create_categorical( - other._values, dtype=self.dtype) + other = self._create_categorical(other._values, dtype=self.dtype) - if isinstance(other, (ABCCategorical, np.ndarray, - ABCSeries)): + if isinstance(other, (ABCCategorical, np.ndarray, ABCSeries)): if len(self.values) != len(other): raise ValueError("Lengths must match to compare") if isinstance(other, ABCCategorical): if not self.values.is_dtype_equal(other): - raise TypeError("categorical index comparisons must " - "have the same categories and ordered " - "attributes") + raise TypeError( + "categorical index comparisons must " + "have the same categories and ordered " + "attributes" + ) result = op(self.values, other) if isinstance(result, ABCSeries): @@ -908,7 +941,7 @@ def _evaluate_compare(self, other): def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ method = getattr(self._values, name) - if 'inplace' in kwargs: + if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) if is_scalar(res): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e141f7b5c5b23..f2e6f631ae9ee 100644 --- a/pandas/core/indexes/datetimelike.py +++ 
b/pandas/core/indexes/datetimelike.py @@ -14,15 +14,23 @@ from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_int64, is_dtype_equal, is_float, is_integer, is_list_like, - is_period_dtype, is_scalar) + ensure_int64, + is_dtype_equal, + is_float, + is_integer, + is_list_like, + is_period_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core import algorithms, ops from pandas.core.accessor import PandasDelegate from pandas.core.arrays import ExtensionOpsMixin from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8) + DatetimeLikeArrayMixin, + _ensure_datetimelike_to_i8, +) import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta @@ -58,24 +66,24 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): """ common ops mixin to support a unified interface datetimelike Index """ + _data = None # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are # properties there. They can be made into cache_readonly for Index # subclasses bc they are immutable inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget) # type: ignore + DatetimeLikeArrayMixin.inferred_freq.fget + ) # type: ignore _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore - hasnans = cache_readonly( - DatetimeLikeArrayMixin._hasnans.fget) # type: ignore + hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget) # type: ignore - resolution = cache_readonly( - DatetimeLikeArrayMixin.resolution.fget) # type: ignore + DatetimeLikeArrayMixin._resolution.fget + ) # type: ignore + resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - _maybe_mask_results = ea_passthrough( - DatetimeLikeArrayMixin._maybe_mask_results) + _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) mean = ea_passthrough(DatetimeLikeArrayMixin.mean) @@ -114,6 +122,7 @@ def _create_comparison_method(cls, op): """ Create a comparison method that dispatches to ``cls.values``. 
""" + def wrapper(self, other): if isinstance(other, ABCSeries): # the arrays defer to Series for comparison ops but the indexes @@ -124,7 +133,7 @@ def wrapper(self, other): return result wrapper.__doc__ = op.__doc__ - wrapper.__name__ = '__{}__'.format(op.__name__) + wrapper.__name__ = "__{}__".format(op.__name__) return wrapper @property @@ -182,12 +191,14 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, - DatetimeLikeArrayMixin)): - left = left.view('i8') - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, - DatetimeLikeArrayMixin)): - right = right.view('i8') + if isinstance( + left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) + ): + left = left.view("i8") + if isinstance( + right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) + ): + right = right.view("i8") results = joinf(left, right) if with_indexers: join_index, left_indexer, right_indexer = results @@ -197,27 +208,30 @@ def wrapper(left, right): return wrapper - def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', - from_utc=False): + def _ensure_localized( + self, arg, ambiguous="raise", nonexistent="raise", from_utc=False + ): # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, 'tz', None): + if getattr(self, "tz", None): # ensure_localized is only relevant for tz-aware DTI - result = self._data._ensure_localized(arg, - ambiguous=ambiguous, - nonexistent=nonexistent, - from_utc=from_utc) + result = self._data._ensure_localized( + arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc + ) return type(self)._simple_new(result, name=self.name) return arg def _box_values(self, values): return self._data._box_values(values) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): try: res = self.get_loc(key) - return (is_scalar(res) or isinstance(res, slice) or - (is_list_like(res) and len(res))) + return ( + is_scalar(res) + or isinstance(res, slice) + or (is_list_like(res) and len(res)) + ) except (KeyError, TypeError, ValueError): return False @@ -232,7 +246,7 @@ def map(self, mapper, na_action=None): result = Index(result) if not isinstance(result, Index): - raise TypeError('The map function must return an Index object') + raise TypeError("The map function must return an Index object") return result except Exception: return self.astype(object).map(mapper) @@ -250,23 +264,22 @@ def sort_values(self, return_indexer=False, ascending=True): else: sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() - freq = attribs['freq'] + freq = attribs["freq"] if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: freq = freq * -1 - attribs['freq'] = freq + attribs["freq"] = freq if not ascending: sorted_values = sorted_values[::-1] return self._simple_new(sorted_values, **attribs) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) indices = ensure_int64(indices) @@ -274,10 +287,13 @@ def take(self, indices, axis=0, allow_fill=True, if isinstance(maybe_slice, slice): return self[maybe_slice] - 
taken = self._assert_take_fillable(self.asi8, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=iNaT) + taken = self._assert_take_fillable( + self.asi8, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=iNaT, + ) # keep freq in PeriodArray/Index, reset otherwise freq = self.freq if is_period_dtype(self) else None @@ -298,16 +314,18 @@ def asobject(self): *this is an internal non-public method* """ - warnings.warn("'asobject' is deprecated. Use 'astype(object)'" - " instead", FutureWarning, stacklevel=2) + warnings.warn( + "'asobject' is deprecated. Use 'astype(object)'" " instead", + FutureWarning, + stacklevel=2, + ) return self.astype(object) def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(to_timedelta(tolerance).to_numpy()) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") return tolerance def tolist(self): @@ -370,7 +388,7 @@ def argmin(self, axis=None, skipna=True, *args, **kwargs): if mask.all() or not skipna: return -1 i8 = i8.copy() - i8[mask] = np.iinfo('int64').max + i8[mask] = np.iinfo("int64").max return i8.argmin() def max(self, axis=None, skipna=True, *args, **kwargs): @@ -433,7 +451,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, na_rep='NaT', **kwargs): + def _format_with_header(self, header, na_rep="NaT", **kwargs): return header + list(self._format_native_types(na_rep, **kwargs)) @property @@ -446,11 +464,11 @@ def _format_attrs(self): """ attrs = super()._format_attrs() for attrib in self._attributes: - if attrib == 'freq': + if attrib == "freq": freq = self.freqstr if freq is not None: freq = "'%s'" % freq - attrs.append(('freq', freq)) + attrs.append(("freq", freq)) return attrs # -------------------------------------------------------------------- @@ -466,17 +484,17 @@ def _convert_scalar_indexer(self, key, kind=None): kind : {'ix', 'loc', 'getitem', 'iloc'} or None """ - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem if is_scalar(key): is_int = is_integer(key) is_flt = is_float(key) - if kind in ['loc'] and (is_int or is_flt): - self._invalid_indexer('index', key) - elif kind in ['ix', 'getitem'] and is_flt: - self._invalid_indexer('index', key) + if kind in ["loc"] and (is_int or is_flt): + self._invalid_indexer("index", key) + elif kind in ["ix", "getitem"] and is_flt: + self._invalid_indexer("index", key) return super()._convert_scalar_indexer(key, kind=kind) @@ -497,6 +515,7 @@ def __add__(self, other): def __radd__(self, other): # alias for __add__ return self.__add__(other) + cls.__radd__ = __radd__ def __sub__(self, other): @@ -555,21 +574,24 @@ def intersection(self, other, sort=False): result.freq = to_offset(result.inferred_freq) return result - elif (other.freq is None or self.freq is None or - other.freq != self.freq or - not other.freq.isAnchored() or - (not self.is_monotonic or not other.is_monotonic)): + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.isAnchored() + or (not self.is_monotonic or not other.is_monotonic) + ): result = Index.intersection(self, other, sort=sort) 
# Invalidate the freq of `result`, which may not be correct at # this point, depending on the values. result.freq = None - if hasattr(self, 'tz'): - result = self._shallow_copy(result._values, name=result.name, - tz=result.tz, freq=None) + if hasattr(self, "tz"): + result = self._shallow_copy( + result._values, name=result.name, tz=result.tz, freq=None + ) else: - result = self._shallow_copy(result._values, name=result.name, - freq=None) + result = self._shallow_copy(result._values, name=result.name, freq=None) if result.freq is None: result.freq = to_offset(result.inferred_freq) return result @@ -592,17 +614,17 @@ def intersection(self, other, sort=False): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) freq = self.freq if is_period_dtype(self) else None return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) - @Appender(_index_shared_docs['where'] % _index_doc_kwargs) + @Appender(_index_shared_docs["where"] % _index_doc_kwargs) def where(self, cond, other=None): other = _ensure_datetimelike_to_i8(other, to_utc=True) values = _ensure_datetimelike_to_i8(self, to_utc=True) - result = np.where(cond, values, other).astype('i8') + result = np.where(cond, values, other).astype("i8") result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result) @@ -622,17 +644,19 @@ def _summary(self, name=None): """ formatter = self._formatter_func if len(self) > 0: - index_summary = ', %s to %s' % (formatter(self[0]), - formatter(self[-1])) + index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1])) else: - index_summary = '' + index_summary = "" if name is None: name = type(self).__name__ - result = '%s: %s entries%s' % (printing.pprint_thing(name), - len(self), index_summary) + result = "%s: %s entries%s" % ( + printing.pprint_thing(name), + len(self), + index_summary, + ) if self.freq: - result += '\nFreq: %s' % self.freqstr + result += "\nFreq: %s" % self.freqstr # display as values, not quoted result = result.replace("'", "") @@ -643,10 +667,10 @@ def _concat_same_dtype(self, to_concat, name): Concatenate to_concat which has the same class. 
""" attribs = self._get_attributes_dict() - attribs['name'] = name + attribs["name"] = name # do not pass tz to set because tzlocal cannot be hashed if len({str(x.dtype) for x in to_concat}) != 1: - raise ValueError('to_concat must have the same tz') + raise ValueError("to_concat must have the same tz") new_data = type(self._values)._concat_same_type(to_concat).asi8 @@ -655,11 +679,11 @@ def _concat_same_dtype(self, to_concat, name): is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 if not is_period_dtype(self) and not is_diff_evenly_spaced: # reset freq - attribs['freq'] = None + attribs["freq"] = None return self._simple_new(new_data, **attribs) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype) and copy is False: # Ensure that self.astype(self.dtype) is self @@ -669,10 +693,9 @@ def astype(self, dtype, copy=True): # pass copy=False because any copying will be done in the # _data.astype call above - return Index(new_values, - dtype=new_values.dtype, name=self.name, copy=False) + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - @deprecate_kwarg(old_arg_name='n', new_arg_name='periods') + @deprecate_kwarg(old_arg_name="n", new_arg_name="periods") def shift(self, periods, freq=None): """ Shift index by desired number of time frequency increments. @@ -714,8 +737,10 @@ def wrap_arithmetic_op(self, other, result): if isinstance(result, tuple): # divmod, rdivmod assert len(result) == 2 - return (wrap_arithmetic_op(self, other, result[0]), - wrap_arithmetic_op(self, other, result[1])) + return ( + wrap_arithmetic_op(self, other, result[0]), + wrap_arithmetic_op(self, other, result[1]), + ) if not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype @@ -763,6 +788,7 @@ class DatetimelikeDelegateMixin(PandasDelegate): The set of properties whose results should should *not* be boxed in an index, after being returned from the array """ + # raw_methods : dispatch methods that shouldn't be boxed in an Index _raw_methods = set() # type: Set[str] # raw_properties : dispatch properties that shouldn't be boxed in an Index diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e2658b66f83ba..5024eebe03bb4 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -10,20 +10,33 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - _NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, - is_string_like) + _NS_DTYPE, + ensure_int64, + is_float, + is_integer, + is_list_like, + is_scalar, + is_string_like, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( - DatetimeArray, _to_M8, tz_to_dtype, validate_tz_from_dtype) + DatetimeArray, + _to_M8, + tz_to_dtype, + validate_tz_from_dtype, +) from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ea_passthrough) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, + ea_passthrough, +) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name import 
pandas.core.tools.datetimes as tools @@ -56,42 +69,31 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. - _extra_methods = [ - 'to_period', - 'to_perioddelta', - 'to_julian_date', - ] - _extra_raw_methods = [ - 'to_pydatetime', - '_local_timestamps', - '_has_same_tz', - ] - _extra_raw_properties = [ - '_box_func', - 'tz', 'tzinfo', - ] - _delegated_properties = ( - DatetimeArray._datetimelike_ops + _extra_raw_properties - ) + _extra_methods = ["to_period", "to_perioddelta", "to_julian_date"] + _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] + _extra_raw_properties = ["_box_func", "tz", "tzinfo"] + _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties _delegated_methods = ( - DatetimeArray._datetimelike_methods + _extra_methods + - _extra_raw_methods + DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods + ) + _raw_properties = ( + {"date", "time", "timetz"} + | set(DatetimeArray._bool_ops) + | set(_extra_raw_properties) ) - _raw_properties = { - 'date', - 'time', - 'timetz', - } | set(DatetimeArray._bool_ops) | set(_extra_raw_properties) _raw_methods = set(_extra_raw_methods) _delegate_class = DatetimeArray -@delegate_names(DatetimeArray, - DatetimeDelegateMixin._delegated_properties, - typ="property") -@delegate_names(DatetimeArray, - DatetimeDelegateMixin._delegated_methods, - typ="method", overwrite=False) +@delegate_names( + DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + DatetimeArray, + DatetimeDelegateMixin._delegated_methods, + typ="method", + overwrite=False, +) class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and @@ -221,25 +223,26 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Creating a DatetimeIndex based on `start`, `periods`, and `end` has been deprecated in favor of :func:`date_range`. 
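A short sketch, not part of the diff, of the replacement that the deprecation note above points to: constructing the index from range endpoints through the DatetimeIndex constructor emits a FutureWarning in this era of pandas, and pandas.date_range is the supported spelling.

    import pandas as pd

    # Deprecated constructor form (handled by the __new__ path shown below):
    #   pd.DatetimeIndex(start="2019-01-01", periods=3, freq="D")

    # Preferred equivalent:
    idx = pd.date_range(start="2019-01-01", periods=3, freq="D")
    print(idx)  # DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03'], ...)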
""" - _typ = 'datetimeindex' + + _typ = "datetimeindex" _join_precedence = 10 def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', - **kwargs) + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False) + libjoin.left_join_indexer_unique_int64, with_indexers=False + ) _engine_type = libindex.DatetimeEngine _tz = None _freq = None - _comparables = ['name', 'freqstr', 'tz'] - _attributes = ['name', 'tz', 'freq'] + _comparables = ["name", "freqstr", "tz"] + _attributes = ["name", "tz", "freq"] _is_numeric_dtype = False _infer_as_myclass = True @@ -256,48 +259,81 @@ def _join_i8_wrapper(joinf, **kwargs): # -------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, - freq=None, start=None, end=None, periods=None, tz=None, - normalize=False, closed=None, ambiguous='raise', - dayfirst=False, yearfirst=False, dtype=None, - copy=False, name=None, verify_integrity=None): + def __new__( + cls, + data=None, + freq=None, + start=None, + end=None, + periods=None, + tz=None, + normalize=False, + closed=None, + ambiguous="raise", + dayfirst=False, + yearfirst=False, + dtype=None, + copy=False, + name=None, + verify_integrity=None, + ): if verify_integrity is not None: - warnings.warn("The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) else: verify_integrity = True if data is None: dtarr = DatetimeArray._generate_range( - start, end, periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, ambiguous=ambiguous) - warnings.warn("Creating a DatetimeIndex by passing range " - "endpoints is deprecated. Use " - "`pandas.date_range` instead.", - FutureWarning, stacklevel=2) - return cls._simple_new( - dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name) + start, + end, + periods, + freq=freq, + tz=tz, + normalize=normalize, + closed=closed, + ambiguous=ambiguous, + ) + warnings.warn( + "Creating a DatetimeIndex by passing range " + "endpoints is deprecated. 
Use " + "`pandas.date_range` instead.", + FutureWarning, + stacklevel=2, + ) + return cls._simple_new(dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name) if is_scalar(data): - raise TypeError("{cls}() must be called with a " - "collection of some kind, {data} was passed" - .format(cls=cls.__name__, data=repr(data))) + raise TypeError( + "{cls}() must be called with a " + "collection of some kind, {data} was passed".format( + cls=cls.__name__, data=repr(data) + ) + ) # - Cases checked above all return/raise before reaching here - # - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name dtarr = DatetimeArray._from_sequence( - data, dtype=dtype, copy=copy, tz=tz, freq=freq, - dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous, - int_as_wall_time=True) - - subarr = cls._simple_new(dtarr, name=name, - freq=dtarr.freq, tz=dtarr.tz) + data, + dtype=dtype, + copy=copy, + tz=tz, + freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst, + ambiguous=ambiguous, + int_as_wall_time=True, + ) + + subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) return subarr @classmethod @@ -337,8 +373,11 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): # -------------------------------------------------------------------- def __array__(self, dtype=None): - if (dtype is None and isinstance(self._data, DatetimeArray) - and getattr(self.dtype, 'tz', None)): + if ( + dtype is None + and isinstance(self._data, DatetimeArray) + and getattr(self.dtype, "tz", None) + ): msg = ( "Converting timezone-aware DatetimeArray to timezone-naive " "ndarray with 'datetime64[ns]' dtype. In the future, this " @@ -348,7 +387,7 @@ def __array__(self, dtype=None): "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." ) warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = 'M8[ns]' + dtype = "M8[ns]" return np.asarray(self._data, dtype=dtype) @property @@ -363,8 +402,10 @@ def tz(self): @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. Use tz_localize() " + "or tz_convert() as appropriate" + ) tzinfo = tz @@ -372,6 +413,7 @@ def tz(self, value): def _is_dates_only(self): """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only + return _is_dates_only(self.values) and self.tz is None def __reduce__(self): @@ -413,20 +455,21 @@ def __setstate__(self, state): else: raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ if self._has_same_tz(value): return _to_M8(value) - raise ValueError('Passed item and index have different timezone') + raise ValueError("Passed item and index have different timezone") def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get('freq', None) + freq = attrs.get("freq", None) if freq is not None: # no need to infer if freq is None - attrs['freq'] = 'infer' + attrs["freq"] = "infer" return attrs # -------------------------------------------------------------------- @@ -436,18 +479,19 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return libts.ints_to_pydatetime(self.asi8, self.tz) - def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(self, date_format) - return libts.format_array_from_datetime(self.asi8, - tz=self.tz, - format=fmt, - na_rep=na_rep) + return libts.format_array_from_datetime( + self.asi8, tz=self.tz, format=fmt, na_rep=na_rep + ) @property def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: "'%s'" % formatter(x, tz=self.tz) @@ -477,8 +521,9 @@ def _union(self, other, sort): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds result._data._dtype = this.dtype - if (result.freq is None and - (this.freq is not None or other.freq is not None)): + if result.freq is None and ( + this.freq is not None or other.freq is not None + ): result.freq = to_offset(result.inferred_freq) return result @@ -561,7 +606,7 @@ def _fast_union(self, other, sort=None): elif sort is False: left, right = self, other left_start = left[0] - loc = right.searchsorted(left_start, side='left') + loc = right.searchsorted(left_start, side="left") right_chunk = right.values[:loc] dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) @@ -577,7 +622,7 @@ def _fast_union(self, other, sort=None): # concatenate dates if left_end < right_end: - loc = right.searchsorted(left_end, side='right') + loc = right.searchsorted(left_end, side="right") right_chunk = right.values[loc:] dates = _concat._concat_compat((left.values, right_chunk)) return self._shallow_copy(dates) @@ -666,18 +711,26 @@ def to_series(self, keep_tz=None, index=None, name=None): name = self.name if keep_tz is None and self.tz is not None: - warnings.warn("The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", FutureWarning, stacklevel=2) + warnings.warn( + "The default of the 'keep_tz' keyword in " + "DatetimeIndex.to_series will change " + "to True in a future release. You can set " + "'keep_tz=True' to obtain the future behaviour and " + "silence this warning.", + FutureWarning, + stacklevel=2, + ) keep_tz = False elif keep_tz is False: - warnings.warn("Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", FutureWarning, stacklevel=2) + warnings.warn( + "Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. 
If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", + FutureWarning, + stacklevel=2, + ) if keep_tz and self.tz is not None: # preserve the tz & copy @@ -687,7 +740,7 @@ def to_series(self, keep_tz=None, index=None, name=None): return Series(values, index=index, name=name) - def snap(self, freq='S'): + def snap(self, freq="S"): """ Snap time stamps to nearest occurring frequency @@ -712,52 +765,67 @@ def snap(self, freq='S'): snapped[i] = s # we know it conforms; skip check - return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, - freq=freq) + return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ See Index.join """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): + if ( + not isinstance(other, DatetimeIndex) + and len(other) > 0 + and other.inferred_type + not in ( + "floating", + "integer", + "mixed-integer", + "mixed-integer-float", + "mixed", + ) + ): try: other = DatetimeIndex(other) except (TypeError, ValueError): pass this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) def _maybe_utc_convert(self, other): this = self if isinstance(other, DatetimeIndex): if self.tz is not None: if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') + raise TypeError( + "Cannot join tz-naive with tz-aware " "DatetimeIndex" + ) elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') + raise TypeError("Cannot join tz-naive with tz-aware " "DatetimeIndex") if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") return this, other def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): + if ( + isinstance(other, DatetimeIndex) + and self.freq == other.freq + and self._can_fast_union(other) + ): joined = self._shallow_copy(joined) joined.name = name return joined else: - tz = getattr(other, 'tz', None) + tz = getattr(other, "tz", None) return self._simple_new(joined, name, tz=tz) def _parsed_string_to_bounds(self, reso, parsed): @@ -776,41 +844,63 @@ def _parsed_string_to_bounds(self, reso, parsed): lower, upper: pd.Timestamp """ - valid_resos = {'year', 'month', 'quarter', 'day', 'hour', 'minute', - 'second', 'minute', 'second', 'microsecond'} + valid_resos = { + "year", + "month", + "quarter", + "day", + "hour", + "minute", + "second", + "minute", + "second", + "microsecond", + } if reso not in valid_resos: raise KeyError - if reso == 'year': + if reso == "year": start = Timestamp(parsed.year, 1, 1) end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) - elif reso == 'month': + elif reso == "month": d = ccalendar.get_days_in_month(parsed.year, parsed.month) start = Timestamp(parsed.year, parsed.month, 1) end = Timestamp(parsed.year, parsed.month, d, 
23, 59, 59, 999999) - elif reso == 'quarter': + elif reso == "quarter": qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month start = Timestamp(parsed.year, parsed.month, 1) end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) - elif reso == 'day': + elif reso == "day": start = Timestamp(parsed.year, parsed.month, parsed.day) end = start + timedelta(days=1) - Nano(1) - elif reso == 'hour': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour) + elif reso == "hour": + start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) end = start + timedelta(hours=1) - Nano(1) - elif reso == 'minute': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute) + elif reso == "minute": + start = Timestamp( + parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute + ) end = start + timedelta(minutes=1) - Nano(1) - elif reso == 'second': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second) + elif reso == "second": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + ) end = start + timedelta(seconds=1) - Nano(1) - elif reso == 'microsecond': - start = Timestamp(parsed.year, parsed.month, parsed.day, - parsed.hour, parsed.minute, parsed.second, - parsed.microsecond) + elif reso == "microsecond": + start = Timestamp( + parsed.year, + parsed.month, + parsed.day, + parsed.hour, + parsed.minute, + parsed.second, + parsed.microsecond, + ) end = start + timedelta(microseconds=1) - Nano(1) # GH 24076 # If an incoming date string contained a UTC offset, need to localize @@ -818,9 +908,11 @@ def _parsed_string_to_bounds(self, reso, parsed): # timezone if parsed.tzinfo is not None: if self.tz is None: - raise ValueError("The index must be timezone aware " - "when indexing with a date string with a " - "UTC offset") + raise ValueError( + "The index must be timezone aware " + "when indexing with a date string with a " + "UTC offset" + ) start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz) end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz) elif self.tz is not None: @@ -830,15 +922,18 @@ def _parsed_string_to_bounds(self, reso, parsed): def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): is_monotonic = self.is_monotonic - if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and - self._resolution >= Resolution.get_reso(reso)): + if ( + is_monotonic + and reso in ["day", "hour", "minute", "second"] + and self._resolution >= Resolution.get_reso(reso) + ): # These resolution/monotonicity validations came from GH3931, # GH3452 and GH2369. # See also GH14826 raise KeyError - if reso == 'microsecond': + if reso == "microsecond": # _partial_date_slice doesn't allow microsecond resolution, but # _parsed_string_to_bounds allows it. 
raise KeyError @@ -849,17 +944,15 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): if is_monotonic: # we are out of range - if (len(stamps) and ((use_lhs and t1.value < stamps[0] and - t2.value < stamps[0]) or - ((use_rhs and t1.value > stamps[-1] and - t2.value > stamps[-1])))): + if len(stamps) and ( + (use_lhs and t1.value < stamps[0] and t2.value < stamps[0]) + or ((use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1])) + ): raise KeyError # a monotonic (sorted) series can be sliced - left = stamps.searchsorted( - t1.value, side='left') if use_lhs else None - right = stamps.searchsorted( - t2.value, side='right') if use_rhs else None + left = stamps.searchsorted(t1.value, side="left") if use_lhs else None + right = stamps.searchsorted(t2.value, side="right") if use_rhs else None return slice(left, right) @@ -870,7 +963,7 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True): return (lhs_mask & rhs_mask).nonzero()[0] def _maybe_promote(self, other): - if other.inferred_type == 'date': + if other.inferred_type == "date": other = DatetimeIndex(other) return self, other @@ -896,8 +989,7 @@ def get_value(self, series, key): return series.take(locs) try: - return com.maybe_box(self, Index.get_value(self, series, key), - series, key) + return com.maybe_box(self, Index.get_value(self, series, key), series, key) except KeyError: try: loc = self._get_string_slice(key) @@ -920,8 +1012,7 @@ def get_value_maybe_box(self, series, key): key = key.tz_localize(self.tz) elif not isinstance(key, Timestamp): key = Timestamp(key) - values = self._engine.get_value(com.values_from_object(series), - key, tz=self.tz) + values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) return com.maybe_box(self, values, series, key) def get_loc(self, key, method=None, tolerance=None): @@ -948,14 +1039,17 @@ def get_loc(self, key, method=None, tolerance=None): elif isinstance(key, timedelta): # GH#20464 - raise TypeError("Cannot index {cls} with {other}" - .format(cls=type(self).__name__, - other=type(key).__name__)) + raise TypeError( + "Cannot index {cls} with {other}".format( + cls=type(self).__name__, other=type(key).__name__ + ) + ) if isinstance(key, time): if method is not None: - raise NotImplementedError('cannot yet lookup inexact labels ' - 'when key is a time object') + raise NotImplementedError( + "cannot yet lookup inexact labels " "when key is a time object" + ) return self.indexer_at_time(key) try: @@ -977,7 +1071,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) except ValueError as e: # list-like tolerance size must match target index size - if 'list-like' in str(e): + if "list-like" in str(e): raise e raise KeyError(key) @@ -1000,14 +1094,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. 
""" - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] if is_float(label) or isinstance(label, time) or is_integer(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) if isinstance(label, str): - freq = getattr(self, 'freqstr', - getattr(self, 'inferred_freq', None)) + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(label, freq) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -1017,17 +1110,15 @@ def _maybe_cast_slice_bound(self, label, side, kind): # length > 1 (is_monotonic_decreasing gives True for empty # and length 1 index) if self._is_strictly_monotonic_decreasing and len(self) > 1: - return upper if side == 'left' else lower - return lower if side == 'left' else upper + return upper if side == "left" else lower + return lower if side == "left" else upper else: return label def _get_string_slice(self, key, use_lhs=True, use_rhs=True): - freq = getattr(self, 'freqstr', - getattr(self, 'inferred_freq', None)) + freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(key, freq) - loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, - use_rhs=use_rhs) + loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc def slice_indexer(self, start=None, end=None, step=None, kind=None): @@ -1049,11 +1140,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # an array of (self.hour, self.minute, self.seconds, self.microsecond). if isinstance(start, time) and isinstance(end, time): if step is not None and step != 1: - raise ValueError('Must have step size of 1 with time slices') + raise ValueError("Must have step size of 1 with time slices") return self.indexer_between_time(start, end) if isinstance(start, time) or isinstance(end, time): - raise KeyError('Cannot mix time and non-time slice keys') + raise KeyError("Cannot mix time and non-time slice keys") try: return Index.slice_indexer(self, start, end, step, kind=kind) @@ -1061,17 +1152,16 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # For historical reasons DatetimeIndex by default supports # value-based partial (aka string) slices on non-monotonic arrays, # let's try that. 
- if ((start is None or isinstance(start, str)) and - (end is None or isinstance(end, str))): + if (start is None or isinstance(start, str)) and ( + end is None or isinstance(end, str) + ): mask = True if start is not None: - start_casted = self._maybe_cast_slice_bound( - start, 'left', kind) + start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self if end is not None: - end_casted = self._maybe_cast_slice_bound( - end, 'right', kind) + end_casted = self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= end_casted) & mask indexer = mask.nonzero()[0][::step] @@ -1091,10 +1181,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): _is_unique = Index.is_unique _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore - is_normalized = cache_readonly( - DatetimeArray.is_normalized.fget) # type: ignore - _resolution = cache_readonly( - DatetimeArray._resolution.fget) # type: ignore + is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore + _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) @@ -1104,9 +1192,12 @@ def offset(self): """ get/set the frequency of the instance """ - msg = ('{cls}.offset has been deprecated and will be removed ' - 'in a future version; use {cls}.freq instead.' - .format(cls=type(self).__name__)) + msg = ( + "{cls}.offset has been deprecated and will be removed " + "in a future version; use {cls}.freq instead.".format( + cls=type(self).__name__ + ) + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self.freq @@ -1115,9 +1206,12 @@ def offset(self, value): """ get/set the frequency of the instance """ - msg = ('{cls}.offset has been deprecated and will be removed ' - 'in a future version; use {cls}.freq instead.' 
- .format(cls=type(self).__name__)) + msg = ( + "{cls}.offset has been deprecated and will be removed " + "in a future version; use {cls}.freq instead.".format( + cls=type(self).__name__ + ) + ) warnings.warn(msg, FutureWarning, stacklevel=2) self.freq = value @@ -1138,9 +1232,9 @@ def _box_func(self): # -------------------------------------------------------------------- - @Substitution(klass='DatetimeIndex') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="DatetimeIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if isinstance(value, (np.ndarray, Index)): value = np.array(value, dtype=_NS_DTYPE, copy=False) else: @@ -1149,13 +1243,13 @@ def searchsorted(self, value, side='left', sorter=None): return self.values.searchsorted(value, side=side) def is_type_compatible(self, typ): - return typ == self.inferred_type or typ == 'datetime' + return typ == self.inferred_type or typ == "datetime" @property def inferred_type(self): # b/c datetime is represented as microseconds since the epoch, make # sure we can't have ambiguous indexing - return 'datetime64' + return "datetime64" @property def is_all_dates(self): @@ -1185,28 +1279,26 @@ def insert(self, loc, item): if isinstance(item, (datetime, np.datetime64)): self._assert_can_do_op(item) if not self._has_same_tz(item) and not isna(item): - raise ValueError( - 'Passed item and index have different timezone') + raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: - if ((loc == 0 or loc == -len(self)) and - item + self.freq == self[0]): + if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq item = _to_M8(item, tz=self.tz) try: - new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)], - self[loc:].asi8)) + new_dates = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) return self._shallow_copy(new_dates, freq=freq) except (AttributeError, TypeError): # fall back to object index if isinstance(item, str): return self.astype(object).insert(loc, item) - raise TypeError( - "cannot insert DatetimeIndex with incompatible label") + raise TypeError("cannot insert DatetimeIndex with incompatible label") def delete(self, loc): """ @@ -1229,10 +1321,9 @@ def delete(self, loc): freq = self.freq else: if is_list_like(loc): - loc = lib.maybe_indices_to_slice( - ensure_int64(np.array(loc)), len(self)) + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): - if (loc.start in (0, None) or loc.stop in (len(self), None)): + if loc.start in (0, None) or loc.stop in (len(self), None): freq = self.freq return self._shallow_copy(new_dates, freq=freq) @@ -1262,6 +1353,7 @@ def indexer_at_time(self, time, asof=False): if isinstance(time, str): from dateutil.parser import parse + time = parse(time).time() if time.tzinfo: @@ -1273,8 +1365,9 @@ def indexer_at_time(self, time, asof=False): micros = _time_to_micros(time) return (micros == time_micros).nonzero()[0] - def indexer_between_time(self, start_time, end_time, include_start=True, - include_end=True): + def indexer_between_time( + self, start_time, end_time, include_start=True, include_end=True + ): """ Return index locations of values between particular times of day (e.g., 
9:00-9:30AM). @@ -1318,8 +1411,7 @@ def indexer_between_time(self, start_time, end_time, include_start=True, else: join_op = operator.or_ - mask = join_op(lop(start_micros, time_micros), - rop(time_micros, end_micros)) + mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros)) return mask.nonzero()[0] @@ -1330,8 +1422,17 @@ def indexer_between_time(self, start_time, end_time, include_start=True, DatetimeIndex._add_datetimelike_methods() -def date_range(start=None, end=None, periods=None, freq=None, tz=None, - normalize=False, name=None, closed=None, **kwargs): +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=None, + **kwargs +): """ Return a fixed frequency DatetimeIndex. @@ -1470,19 +1571,34 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, """ if freq is None and com._any_none(periods, start, end): - freq = 'D' + freq = "D" dtarr = DatetimeArray._generate_range( - start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, - closed=closed, **kwargs) - return DatetimeIndex._simple_new( - dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) - - -def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, - normalize=True, name=None, weekmask=None, holidays=None, - closed=None, **kwargs): + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + closed=closed, + **kwargs + ) + return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) + + +def bdate_range( + start=None, + end=None, + periods=None, + freq="B", + tz=None, + normalize=True, + name=None, + weekmask=None, + holidays=None, + closed=None, + **kwargs +): """ Return a fixed frequency DatetimeIndex, with business day as the default frequency @@ -1548,24 +1664,34 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, dtype='datetime64[ns]', freq='B') """ if freq is None: - msg = 'freq must be specified for bdate_range; use date_range instead' + msg = "freq must be specified for bdate_range; use date_range instead" raise TypeError(msg) - if is_string_like(freq) and freq.startswith('C'): + if is_string_like(freq) and freq.startswith("C"): try: - weekmask = weekmask or 'Mon Tue Wed Thu Fri' + weekmask = weekmask or "Mon Tue Wed Thu Fri" freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask) except (KeyError, TypeError): - msg = 'invalid custom frequency string: {freq}'.format(freq=freq) + msg = "invalid custom frequency string: {freq}".format(freq=freq) raise ValueError(msg) elif holidays or weekmask: - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency {freq}').format(freq=freq) + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency {freq}" + ).format(freq=freq) raise ValueError(msg) - return date_range(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + return date_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + closed=closed, + **kwargs + ) def _time_to_micros(time): diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index aeb0fa119ab33..2e5b3ff8ef502 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -28,6 +28,7 @@ class FrozenList(PandasObject, list): because it's technically non-hashable, will be used for lookups, 
appropriately, etc. """ + # Side note: This has to be of type list. Otherwise, # it messes up PyTables type checks. @@ -105,16 +106,15 @@ def __hash__(self): def _disabled(self, *args, **kwargs): """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__.__name__) + raise TypeError( + "'%s' does not support mutable operations." % self.__class__.__name__ + ) def __str__(self): - return pprint_thing(self, quote_strings=True, - escape_chars=('\t', '\r', '\n')) + return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n")) def __repr__(self): - return "%s(%s)" % (self.__class__.__name__, - str(self)) + return "%s(%s)" % (self.__class__.__name__, str(self)) __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled pop = append = extend = remove = sort = insert = _disabled @@ -124,9 +124,12 @@ class FrozenNDArray(PandasObject, np.ndarray): # no __array_finalize__ for now because no metadata def __new__(cls, data, dtype=None, copy=False): - warnings.warn("\nFrozenNDArray is deprecated and will be removed in a " - "future version.\nPlease use `numpy.ndarray` instead.\n", - FutureWarning, stacklevel=2) + warnings.warn( + "\nFrozenNDArray is deprecated and will be removed in a " + "future version.\nPlease use `numpy.ndarray` instead.\n", + FutureWarning, + stacklevel=2, + ) if copy is None: copy = not isinstance(data, FrozenNDArray) @@ -135,8 +138,7 @@ def __new__(cls, data, dtype=None, copy=False): def _disabled(self, *args, **kwargs): """This method will not function because object is immutable.""" - raise TypeError("'%s' does not support mutable operations." % - self.__class__) + raise TypeError("'%s' does not support mutable operations." % self.__class__) __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled put = itemset = fill = _disabled @@ -153,8 +155,7 @@ def __repr__(self): """ Return a string representation for this object. 
""" - prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'), - quote_strings=True) + prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) @deprecate_kwarg(old_arg_name="v", new_arg_name="value") diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 83bc5963f4f9e..b14cff8cc6ade 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -14,11 +14,25 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( - find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype) + find_common_type, + infer_dtype_from_scalar, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( - ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) + ensure_platform_int, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_object_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna @@ -27,7 +41,12 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index) + Index, + InvalidIndexError, + _index_shared_docs, + default_pprint, + ensure_index, +) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range @@ -36,48 +55,54 @@ from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset -_VALID_CLOSED = {'left', 'right', 'both', 'neither'} +_VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict(klass='IntervalIndex', - qualname="IntervalIndex", - target_klass='IntervalIndex or list of Intervals', - name=textwrap.dedent("""\ + dict( + klass="IntervalIndex", + qualname="IntervalIndex", + target_klass="IntervalIndex or list of Intervals", + name=textwrap.dedent( + """\ name : object, optional Name to be stored in the index. 
- """), - )) + """ + ), + ) +) def _get_next_label(label): - dtype = getattr(label, 'dtype', type(label)) + dtype = getattr(label, "dtype", type(label)) if isinstance(label, (Timestamp, Timedelta)): - dtype = 'datetime64' + dtype = "datetime64" if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): - return label + np.timedelta64(1, 'ns') + return label + np.timedelta64(1, "ns") elif is_integer_dtype(dtype): return label + 1 elif is_float_dtype(dtype): return np.nextafter(label, np.infty) else: - raise TypeError('cannot determine next label for type {typ!r}' - .format(typ=type(label))) + raise TypeError( + "cannot determine next label for type {typ!r}".format(typ=type(label)) + ) def _get_prev_label(label): - dtype = getattr(label, 'dtype', type(label)) + dtype = getattr(label, "dtype", type(label)) if isinstance(label, (Timestamp, Timedelta)): - dtype = 'datetime64' + dtype = "datetime64" if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): - return label - np.timedelta64(1, 'ns') + return label - np.timedelta64(1, "ns") elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: - raise TypeError('cannot determine next label for type {typ!r}' - .format(typ=type(label))) + raise TypeError( + "cannot determine next label for type {typ!r}".format(typ=type(label)) + ) def _get_interval_closed_bounds(interval): @@ -106,6 +131,7 @@ class SetopCheck: This is called to decorate the set operations of IntervalIndex to perform the type check in advance. """ + def __init__(self, op_name): self.op_name = op_name @@ -115,36 +141,43 @@ def func(intvidx_self, other, sort=False): other = ensure_index(other) if not isinstance(other, IntervalIndex): - result = getattr(intvidx_self.astype(object), - self.op_name)(other) - if self.op_name in ('difference',): + result = getattr(intvidx_self.astype(object), self.op_name)(other) + if self.op_name in ("difference",): result = result.astype(intvidx_self.dtype) return result elif intvidx_self.closed != other.closed: - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') + msg = ( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side" + ) raise ValueError(msg) # GH 19016: ensure set op will not return a prohibited dtype subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] common_subtype = find_common_type(subtypes) if is_object_dtype(common_subtype): - msg = ('can only do {op} between two IntervalIndex ' - 'objects that have compatible dtypes') + msg = ( + "can only do {op} between two IntervalIndex " + "objects that have compatible dtypes" + ) raise TypeError(msg.format(op=self.op_name)) return setop(intvidx_self, other, sort) + return func -@Appender(_interval_shared_docs['class'] % dict( - klass="IntervalIndex", - summary="Immutable index of intervals that are closed on the same side.", - name=_index_doc_kwargs['name'], - versionadded="0.20.0", - extra_attributes="is_overlapping\nvalues\n", - extra_methods="", - examples=textwrap.dedent("""\ +@Appender( + _interval_shared_docs["class"] + % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs["name"], + versionadded="0.20.0", + extra_attributes="is_overlapping\nvalues\n", + extra_methods="", + examples=textwrap.dedent( + """\ Examples -------- A new ``IntervalIndex`` is typically constructed using @@ -161,13 +194,14 @@ def 
func(intvidx_self, other, sort=False): See further examples in the doc strings of ``interval_range`` and the mentioned constructor methods. - """), - -)) + """ + ), + ) +) class IntervalIndex(IntervalMixin, Index): - _typ = 'intervalindex' - _comparables = ['name'] - _attributes = ['name', 'closed'] + _typ = "intervalindex" + _comparables = ["name"] + _attributes = ["name", "closed"] # we would like our indexing holder to defer to us _defer_to_indexing = True @@ -178,15 +212,21 @@ class IntervalIndex(IntervalMixin, Index): # -------------------------------------------------------------------- # Constructors - def __new__(cls, data, closed=None, dtype=None, copy=False, - name=None, verify_integrity=True): + def __new__( + cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True + ): - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, - verify_integrity=verify_integrity) + array = IntervalArray( + data, + closed=closed, + copy=copy, + dtype=dtype, + verify_integrity=verify_integrity, + ) return cls._simple_new(array, name) @@ -210,29 +250,32 @@ def _simple_new(cls, array, name, closed=None): return result @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_breaks"] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) + array = IntervalArray.from_breaks( + breaks, closed=closed, copy=copy, dtype=dtype + ) return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_arrays"] % _index_doc_kwargs) + def from_arrays( + cls, left, right, closed="right", name=None, copy=False, dtype=None + ): with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) + array = IntervalArray.from_arrays( + left, right, closed, copy=copy, dtype=dtype + ) return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) instead') + @Appender(_interval_shared_docs["from_intervals"] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): + msg = ( + "IntervalIndex.from_intervals is deprecated and will be " + "removed in a future version; Use IntervalIndex(...) 
instead" + ) warnings.warn(msg, FutureWarning, stacklevel=2) with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) @@ -243,17 +286,15 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): + @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs) + def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) # -------------------------------------------------------------------- - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) attributes = self._get_attributes_dict() @@ -295,9 +336,11 @@ def __contains__(self, key): except KeyError: return False - @Appender(_interval_shared_docs['to_tuples'] % dict( - return_type="Index", - examples=""" + @Appender( + _interval_shared_docs["to_tuples"] + % dict( + return_type="Index", + examples=""" Examples -------- >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) @@ -305,15 +348,15 @@ def __contains__(self, key): Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", - )) + ) + ) def to_tuples(self, na_tuple=True): tuples = self._data.to_tuples(na_tuple=na_tuple) return Index(tuples) @cache_readonly def _multiindex(self): - return MultiIndex.from_arrays([self.left, self.right], - names=['left', 'right']) + return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) @property def left(self): @@ -339,7 +382,7 @@ def closed(self): """ return self._data._closed - @Appender(_interval_shared_docs['set_closed'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["set_closed"] % _index_doc_kwargs) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -369,13 +412,15 @@ def shape(self): @property def itemsize(self): - msg = ('IntervalIndex.itemsize is deprecated and will be removed in ' - 'a future version') + msg = ( + "IntervalIndex.itemsize is deprecated and will be removed in " + "a future version" + ) warnings.warn(msg, FutureWarning, stacklevel=2) # suppress the warning from the underlying left/right itemsize with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") return self.left.itemsize + self.right.itemsize def __len__(self): @@ -405,12 +450,11 @@ def __array_wrap__(self, result, context=None): return result def __reduce__(self): - d = dict(left=self.left, - right=self.right) + d = dict(left=self.left, right=self.right) d.update(self._get_attributes_dict()) return _new_IntervalIndex, (self.__class__, d), None - @Appender(_index_shared_docs['copy']) + @Appender(_index_shared_docs["copy"]) def copy(self, deep=False, name=None): array = self._data if deep: @@ -421,9 +465,9 @@ def copy(self, deep=False, name=None): return self._simple_new(array, **attributes) - 
@Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): - with rewrite_exception('IntervalArray', self.__class__.__name__): + with rewrite_exception("IntervalArray", self.__class__.__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) @@ -437,14 +481,13 @@ def dtype(self): @property def inferred_type(self): """Return a string of the type inferred from the values""" - return 'interval' + return "interval" @Appender(Index.memory_usage.__doc__) def memory_usage(self, deep=False): # we don't use an explicit engine # so return the bytes here - return (self.left.memory_usage(deep=deep) + - self.right.memory_usage(deep=deep)) + return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) @cache_readonly def mid(self): @@ -502,8 +545,7 @@ def is_unique(self): return True @cache_readonly - @Appender(_interval_shared_docs['is_non_overlapping_monotonic'] - % _index_doc_kwargs) + @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs) def is_non_overlapping_monotonic(self): return self._data.is_non_overlapping_monotonic @@ -562,16 +604,16 @@ def is_overlapping(self): # GH 23309 return self._engine.is_overlapping - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - if kind == 'iloc': + if kind == "iloc": return super()._convert_scalar_indexer(key, kind=kind) return key def _maybe_cast_slice_bound(self, label, side, kind): return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - @Appender(_index_shared_docs['_convert_list_indexer']) + @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): """ we are passed a list-like indexer. 
Return the @@ -598,7 +640,7 @@ def _maybe_cast_indexed(self, key): if is_integer(key): key = float(key) elif isinstance(key, (np.ndarray, Index)): - key = key.astype('float64') + key = key.astype("float64") elif is_integer_dtype(subtype): if is_integer(key): key = int(key) @@ -691,8 +733,10 @@ def _maybe_convert_i8(self, key): # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype - msg = ('Cannot index an IntervalIndex of subtype {subtype} with ' - 'values of dtype {other}') + msg = ( + "Cannot index an IntervalIndex of subtype {subtype} with " + "values of dtype {other}" + ) if not is_dtype_equal(subtype, key_dtype): raise ValueError(msg.format(subtype=subtype, other=key_dtype)) @@ -702,27 +746,30 @@ def _check_method(self, method): if method is None: return - if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: - msg = 'method {method} not yet implemented for IntervalIndex' + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + msg = "method {method} not yet implemented for IntervalIndex" raise NotImplementedError(msg.format(method=method)) raise ValueError("Invalid fill method") def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: - raise KeyError('can only get slices from an IntervalIndex if ' - 'bounds are non-overlapping and all monotonic ' - 'increasing or decreasing') + raise KeyError( + "can only get slices from an IntervalIndex if " + "bounds are non-overlapping and all monotonic " + "increasing or decreasing" + ) if isinstance(label, IntervalMixin): - msg = 'Interval objects are not currently supported' + msg = "Interval objects are not currently supported" raise NotImplementedError(msg) # GH 20921: "not is_monotonic_increasing" for the second condition # instead of "is_monotonic_decreasing" to account for single element # indexes being both increasing and decreasing - if ((side == 'left' and self.left.is_monotonic_increasing) or - (side == 'right' and not self.left.is_monotonic_increasing)): + if (side == "left" and self.left.is_monotonic_increasing) or ( + side == "right" and not self.left.is_monotonic_increasing + ): sub_idx = self.right if self.open_right or exclude_label: label = _get_next_label(label) @@ -736,9 +783,11 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): def _find_non_overlapping_monotonic_bounds(self, key): if isinstance(key, IntervalMixin): start = self._searchsorted_monotonic( - key.left, 'left', exclude_label=key.open_left) + key.left, "left", exclude_label=key.open_left + ) stop = self._searchsorted_monotonic( - key.right, 'right', exclude_label=key.open_right) + key.right, "right", exclude_label=key.open_right + ) elif isinstance(key, slice): # slice start, stop = key.start, key.stop @@ -747,22 +796,21 @@ def _find_non_overlapping_monotonic_bounds(self, key): if start is None: start = 0 else: - start = self._searchsorted_monotonic(start, 'left') + start = self._searchsorted_monotonic(start, "left") if stop is None: stop = len(self) else: - stop = self._searchsorted_monotonic(stop, 'right') + stop = self._searchsorted_monotonic(stop, "right") else: # scalar or index-like - start = self._searchsorted_monotonic(key, 'left') - stop = self._searchsorted_monotonic(key, 'right') + start = self._searchsorted_monotonic(key, "left") + stop = self._searchsorted_monotonic(key, "right") return start, stop - def get_loc(self, - key: Any, - method: Optional[str] = None - ) -> Union[int, slice, np.ndarray]: + def get_loc( + self, key: Any, 
method: Optional[str] = None + ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -827,29 +875,40 @@ def get_loc(self, raise KeyError(key) elif matches == 1: return mask.argmax() - return lib.maybe_booleans_to_slice(mask.view('u1')) - - @Substitution(**dict(_index_doc_kwargs, - **{'raises_section': textwrap.dedent(""" + return lib.maybe_booleans_to_slice(mask.view("u1")) + + @Substitution( + **dict( + _index_doc_kwargs, + **{ + "raises_section": textwrap.dedent( + """ Raises ------ NotImplementedError If any method argument other than the default of None is specified as these are not yet implemented. - """)})) - @Appender(_index_shared_docs['get_indexer']) - def get_indexer(self, - target: AnyArrayLike, - method: Optional[str] = None, - limit: Optional[int] = None, - tolerance: Optional[Any] = None - ) -> np.ndarray: + """ + ) + } + ) + ) + @Appender(_index_shared_docs["get_indexer"]) + def get_indexer( + self, + target: AnyArrayLike, + method: Optional[str] = None, + limit: Optional[int] = None, + tolerance: Optional[Any] = None, + ) -> np.ndarray: self._check_method(method) if self.is_overlapping: - msg = ('cannot handle overlapping indices; use ' - 'IntervalIndex.get_indexer_non_unique') + msg = ( + "cannot handle overlapping indices; use " + "IntervalIndex.get_indexer_non_unique" + ) raise InvalidIndexError(msg) target = ensure_index(target) @@ -857,11 +916,12 @@ def get_indexer(self, if isinstance(target, IntervalIndex): # equal indexes -> 1:1 positional match if self.equals(target): - return np.arange(len(self), dtype='intp') + return np.arange(len(self), dtype="intp") # different closed or incompatible subtype -> no matches - common_subtype = find_common_type([ - self.dtype.subtype, target.dtype.subtype]) + common_subtype = find_common_type( + [self.dtype.subtype, target.dtype.subtype] + ) if self.closed != target.closed or is_object_dtype(common_subtype): return np.repeat(np.intp(-1), len(target)) @@ -888,16 +948,17 @@ def get_indexer(self, return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, - target: AnyArrayLike - ) -> Tuple[np.ndarray, np.ndarray]: + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique( + self, target: AnyArrayLike + ) -> Tuple[np.ndarray, np.ndarray]: target = ensure_index(target) # check that target IntervalIndex is compatible if isinstance(target, IntervalIndex): - common_subtype = find_common_type([ - self.dtype.subtype, target.dtype.subtype]) + common_subtype = find_common_type( + [self.dtype.subtype, target.dtype.subtype] + ) if self.closed != target.closed or is_object_dtype(common_subtype): # different closed or incompatible subtype -> no matches return np.repeat(-1, len(target)), np.arange(len(target)) @@ -909,8 +970,7 @@ def get_indexer_non_unique(self, try: locs = self.get_loc(key) if isinstance(locs, slice): - locs = np.arange( - locs.start, locs.stop, locs.step, dtype='intp') + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") locs = np.array(locs, ndmin=1) except KeyError: missing.append(i) @@ -919,15 +979,11 @@ def get_indexer_non_unique(self, indexer = np.concatenate(indexer) else: target = self._maybe_convert_i8(target) - indexer, missing = self._engine.get_indexer_non_unique( - target.values) + indexer, missing = self._engine.get_indexer_non_unique(target.values) return ensure_platform_int(indexer), ensure_platform_int(missing) 
- def get_indexer_for(self, - target: AnyArrayLike, - **kwargs - ) -> np.ndarray: + def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: """ Guaranteed return of an indexer even when overlapping. @@ -943,11 +999,8 @@ def get_indexer_for(self, return self.get_indexer_non_unique(target, **kwargs)[0] return self.get_indexer(target, **kwargs) - @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs) - def get_value(self, - series: ABCSeries, - key: Any - ) -> Any: + @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) + def get_value(self, series: ABCSeries, key: Any) -> Any: if com.is_bool_indexer(key): loc = key @@ -961,12 +1014,12 @@ def get_value(self, elif isinstance(key, slice): if not (key.step is None or key.step == 1): raise ValueError("cannot support not-default step in a slice") - loc = self._convert_slice_indexer(key, kind='getitem') + loc = self._convert_slice_indexer(key, kind="getitem") else: loc = self.get_loc(key) return series.iloc[loc] - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): if other is None: other = self._na_value @@ -1002,16 +1055,18 @@ def insert(self, loc, item): """ if isinstance(item, Interval): if item.closed != self.closed: - raise ValueError('inserted item must be closed on the same ' - 'side as the index') + raise ValueError( + "inserted item must be closed on the same " "side as the index" + ) left_insert = item.left right_insert = item.right elif is_scalar(item) and isna(item): # GH 18295 left_insert = right_insert = item else: - raise ValueError('can only insert Interval objects and NA into ' - 'an IntervalIndex') + raise ValueError( + "can only insert Interval objects and NA into " "an IntervalIndex" + ) new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) @@ -1023,16 +1078,18 @@ def _concat_same_dtype(self, to_concat, name): we allow a 0-len index here as well """ if not len({i.closed for i in to_concat if len(i)}) == 1: - msg = ('can only append two IntervalIndex objects ' - 'that are closed on the same side') + msg = ( + "can only append two IntervalIndex objects " + "that are closed on the same side" + ) raise ValueError(msg) return super()._concat_same_dtype(to_concat, name) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - result = self._data.take(indices, axis=axis, allow_fill=allow_fill, - fill_value=fill_value, **kwargs) + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + result = self._data.take( + indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs + ) attributes = self._get_attributes_dict() return self._simple_new(result, **attributes) @@ -1051,56 +1108,56 @@ def __getitem__(self, value): def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) - def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs): + def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): """ actually format my specific types """ from pandas.io.formats.format import ExtensionArrayFormatter - return ExtensionArrayFormatter(values=self, - na_rep=na_rep, - justify='all', - leading_space=False).get_result() + + return ExtensionArrayFormatter( + values=self, na_rep=na_rep, justify="all", leading_space=False + ).get_result() def _format_data(self, 
name=None): # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical n = len(self) - max_seq_items = min((get_option( - 'display.max_seq_items') or n) // 10, 10) + max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) formatter = str if n == 0: - summary = '[]' + summary = "[]" elif n == 1: first = formatter(self[0]) - summary = '[{first}]'.format(first=first) + summary = "[{first}]".format(first=first) elif n == 2: first = formatter(self[0]) last = formatter(self[-1]) - summary = '[{first}, {last}]'.format(first=first, last=last) + summary = "[{first}, {last}]".format(first=first, last=last) else: if n > max_seq_items: n = min(max_seq_items // 2, 10) head = [formatter(x) for x in self[:n]] tail = [formatter(x) for x in self[-n:]] - summary = '[{head} ... {tail}]'.format( - head=', '.join(head), tail=', '.join(tail)) + summary = "[{head} ... {tail}]".format( + head=", ".join(head), tail=", ".join(tail) + ) else: tail = [formatter(x) for x in self] - summary = '[{tail}]'.format(tail=', '.join(tail)) + summary = "[{tail}]".format(tail=", ".join(tail)) - return summary + ',' + self._format_space() + return summary + "," + self._format_space() def _format_attrs(self): - attrs = [('closed', repr(self.closed))] + attrs = [("closed", repr(self.closed))] if self.name is not None: - attrs.append(('name', default_pprint(self.name))) - attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype))) + attrs.append(("name", default_pprint(self.name))) + attrs.append(("dtype", "'{dtype}'".format(dtype=self.dtype))) return attrs def _format_space(self): - space = ' ' * (len(self.__class__.__name__) + 1) + space = " " * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) # -------------------------------------------------------------------- @@ -1120,30 +1177,30 @@ def equals(self, other): if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False - other = Index(getattr(other, '.values', other)) + other = Index(getattr(other, ".values", other)) - return (self.left.equals(other.left) and - self.right.equals(other.right) and - self.closed == other.closed) + return ( + self.left.equals(other.left) + and self.right.equals(other.right) + and self.closed == other.closed + ) - @Appender(_interval_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["contains"] % _index_doc_kwargs) def contains(self, other): return self._data.contains(other) - @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs) + @Appender(_interval_shared_docs["overlaps"] % _index_doc_kwargs) def overlaps(self, other): return self._data.overlaps(other) - @Appender(_index_shared_docs['intersection']) - @SetopCheck(op_name='intersection') - def intersection(self, - other: 'IntervalIndex', - sort: bool = False - ) -> 'IntervalIndex': + @Appender(_index_shared_docs["intersection"]) + @SetopCheck(op_name="intersection") + def intersection( + self, other: "IntervalIndex", sort: bool = False + ) -> "IntervalIndex": if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) - elif (other.left.is_unique and other.right.is_unique and - self.isna().sum() <= 1): + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: # Swap other/self if other is unique and self does not have # multiple NaNs taken = other._intersection_unique(self) @@ -1156,9 +1213,7 @@ def intersection(self, return taken - def _intersection_unique(self, - other: 
'IntervalIndex' - ) -> 'IntervalIndex': + def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does not have any common endpoint, no mater left or right. @@ -1180,9 +1235,7 @@ def _intersection_unique(self, return self.take(indexer) - def _intersection_non_unique(self, - other: 'IntervalIndex' - ) -> 'IntervalIndex': + def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does have some common endpoints, on either sides. @@ -1218,8 +1271,7 @@ def _intersection_non_unique(self, def _setop(op_name, sort=None): @SetopCheck(op_name=op_name) def func(self, other, sort=sort): - result = getattr(self._multiindex, op_name)(other._multiindex, - sort=sort) + result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) # GH 19101: ensure empty results have correct dtype @@ -1228,8 +1280,7 @@ def func(self, other, sort=sort): else: result = result.values - return type(self).from_tuples(result, closed=self.closed, - name=result_name) + return type(self).from_tuples(result, closed=self.closed, name=result_name) return func @@ -1241,9 +1292,9 @@ def is_all_dates(self): """ return False - union = _setop('union') - difference = _setop('difference') - symmetric_difference = _setop('symmetric_difference') + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") # TODO: arithmetic operations @@ -1253,24 +1304,31 @@ def is_all_dates(self): def _is_valid_endpoint(endpoint): """helper for interval_range to check if start/end are valid types""" - return any([is_number(endpoint), - isinstance(endpoint, Timestamp), - isinstance(endpoint, Timedelta), - endpoint is None]) + return any( + [ + is_number(endpoint), + isinstance(endpoint, Timestamp), + isinstance(endpoint, Timedelta), + endpoint is None, + ] + ) def _is_type_compatible(a, b): """helper for interval_range to check type compat of start/end/freq""" is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) - return ((is_number(a) and is_number(b)) or - (is_ts_compat(a) and is_ts_compat(b)) or - (is_td_compat(a) and is_td_compat(b)) or - com._any_none(a, b)) + return ( + (is_number(a) and is_number(b)) + or (is_ts_compat(a) and is_ts_compat(b)) + or (is_td_compat(a) and is_td_compat(b)) + or com._any_none(a, b) + ) -def interval_range(start=None, end=None, periods=None, freq=None, - name=None, closed='right'): +def interval_range( + start=None, end=None, periods=None, freq=None, name=None, closed="right" +): """ Return a fixed frequency IntervalIndex @@ -1363,36 +1421,44 @@ def interval_range(start=None, end=None, periods=None, freq=None, endpoint = start if start is not None else end if freq is None and com._any_none(periods, start, end): - freq = 1 if is_number(endpoint) else 'D' + freq = 1 if is_number(endpoint) else "D" if com.count_not_none(start, end, periods, freq) != 3: - raise ValueError('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + raise ValueError( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) if not _is_valid_endpoint(start): - msg = 'start must be numeric or datetime-like, got {start}' + msg = "start must be numeric or datetime-like, got {start}" raise ValueError(msg.format(start=start)) elif not _is_valid_endpoint(end): - msg = 'end must be 
numeric or datetime-like, got {end}' + msg = "end must be numeric or datetime-like, got {end}" raise ValueError(msg.format(end=end)) if is_float(periods): periods = int(periods) elif not is_integer(periods) and periods is not None: - msg = 'periods must be a number, got {periods}' + msg = "periods must be a number, got {periods}" raise TypeError(msg.format(periods=periods)) if freq is not None and not is_number(freq): try: freq = to_offset(freq) except ValueError: - raise ValueError('freq must be numeric or convertible to ' - 'DateOffset, got {freq}'.format(freq=freq)) + raise ValueError( + "freq must be numeric or convertible to " + "DateOffset, got {freq}".format(freq=freq) + ) # verify type compatibility - if not all([_is_type_compatible(start, end), - _is_type_compatible(start, freq), - _is_type_compatible(end, freq)]): + if not all( + [ + _is_type_compatible(start, end), + _is_type_compatible(start, freq), + _is_type_compatible(end, freq), + ] + ): raise TypeError("start, end, freq need to be type compatible") # +1 to convert interval count to breaks count (n breaks = n-1 intervals) @@ -1415,7 +1481,7 @@ def interval_range(start=None, end=None, periods=None, freq=None, breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com._not_none(start, end, freq)): # np.linspace always produces float output - breaks = maybe_downcast_to_dtype(breaks, 'int64') + breaks = maybe_downcast_to_dtype(breaks, "int64") else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0823a3ed9ad59..71b551adaf3ef 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -7,16 +7,23 @@ from pandas._config import get_option -from pandas._libs import ( - Timestamp, algos as libalgos, index as libindex, lib, tslibs) +from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, - is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, - pandas_dtype) + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_object_dtype, + is_scalar, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna @@ -25,25 +32,32 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs, ensure_index) + Index, + InvalidIndexError, + _index_shared_docs, + ensure_index, +) from pandas.core.indexes.frozen import FrozenList, _ensure_frozen import pandas.core.missing as missing from pandas.io.formats.printing import ( - format_object_attrs, format_object_summary, pprint_thing) + format_object_attrs, + format_object_summary, + pprint_thing, +) _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict(klass='MultiIndex', - target_klass='MultiIndex or list of tuples')) + dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") +) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, - libindex.UInt64Engine): +class 
MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): """ This class manages a MultiIndex by mapping label combinations to positive integers. """ + _base = libindex.UInt64Engine def _codes_to_ints(self, codes): @@ -77,13 +91,13 @@ def _codes_to_ints(self, codes): return np.bitwise_or.reduce(codes, axis=1) -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, - libindex.ObjectEngine): +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): """ This class manages those (extreme) cases in which the number of possible label combinations overflows the 64 bits integers, and uses an ObjectEngine containing Python integers. """ + _base = libindex.ObjectEngine def _codes_to_ints(self, codes): @@ -106,7 +120,7 @@ def _codes_to_ints(self, codes): # Shift the representation of each level by the pre-calculated number # of bits. Since this can overflow uint64, first make sure we are # working with Python integers: - codes = codes.astype('object') << self.offsets + codes = codes.astype("object") << self.offsets # Now sum and OR are in fact interchangeable. This is a simple # composition of the (disjunct) significant bits of each level (i.e. @@ -205,20 +219,29 @@ class MultiIndex(Index): """ # initialize to zero-length tuples to make everything work - _typ = 'multiindex' + _typ = "multiindex" _names = FrozenList() _levels = FrozenList() _codes = FrozenList() - _comparables = ['names'] + _comparables = ["names"] rename = Index.set_names # -------------------------------------------------------------------- # Constructors - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def __new__(cls, levels=None, codes=None, sortorder=None, names=None, - dtype=None, copy=False, name=None, - verify_integrity=True, _set_identity=True): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def __new__( + cls, + levels=None, + codes=None, + sortorder=None, + names=None, + dtype=None, + copy=False, + name=None, + verify_integrity=True, + _set_identity=True, + ): # compat with Index if name is not None: @@ -226,9 +249,9 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None, if levels is None or codes is None: raise TypeError("Must pass both levels and codes") if len(levels) != len(codes): - raise ValueError('Length of levels and codes must be the same.') + raise ValueError("Length of levels and codes must be the same.") if len(levels) == 0: - raise ValueError('Must pass non-zero number of levels/codes') + raise ValueError("Must pass non-zero number of levels/codes") result = object.__new__(MultiIndex) @@ -302,32 +325,39 @@ def _verify_integrity(self, codes=None, levels=None): levels = levels or self.levels if len(levels) != len(codes): - raise ValueError("Length of levels and codes must match. NOTE:" - " this index is in an inconsistent state.") + raise ValueError( + "Length of levels and codes must match. NOTE:" + " this index is in an inconsistent state." + ) codes_length = len(codes[0]) for i, (level, level_codes) in enumerate(zip(levels, codes)): if len(level_codes) != codes_length: - raise ValueError("Unequal code lengths: %s" % - ([len(code_) for code_ in codes])) + raise ValueError( + "Unequal code lengths: %s" % ([len(code_) for code_ in codes]) + ) if len(level_codes) and level_codes.max() >= len(level): - msg = ("On level {level}, code max ({max_code}) >= length of " - "level ({level_len}). 
NOTE: this index is in an " - "inconsistent state".format( - level=i, max_code=level_codes.max(), - level_len=len(level))) + msg = ( + "On level {level}, code max ({max_code}) >= length of " + "level ({level_len}). NOTE: this index is in an " + "inconsistent state".format( + level=i, max_code=level_codes.max(), level_len=len(level) + ) + ) raise ValueError(msg) if len(level_codes) and level_codes.min() < -1: - raise ValueError("On level {level}, code value ({code})" - " < -1".format( - level=i, code=level_codes.min())) + raise ValueError( + "On level {level}, code value ({code})" + " < -1".format(level=i, code=level_codes.min()) + ) if not level.is_unique: - raise ValueError("Level values must be unique: {values} on " - "level {level}".format( - values=[value for value in level], - level=i)) - - codes = [self._validate_codes(level, code) - for level, code in zip(levels, codes)] + raise ValueError( + "Level values must be unique: {values} on " + "level {level}".format(values=[value for value in level], level=i) + ) + + codes = [ + self._validate_codes(level, code) for level, code in zip(levels, codes) + ] new_codes = FrozenList(codes) return new_codes @@ -383,7 +413,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): # raise ValueError, if not for i in range(1, len(arrays)): if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') + raise ValueError("all arrays must be same length") from pandas.core.arrays.categorical import _factorize_from_iterables @@ -391,8 +421,13 @@ def from_arrays(cls, arrays, sortorder=None, names=None): if names is None: names = [getattr(arr, "name", None) for arr in arrays] - return MultiIndex(levels=levels, codes=codes, sortorder=sortorder, - names=names, verify_integrity=False) + return MultiIndex( + levels=levels, + codes=codes, + sortorder=sortorder, + names=names, + verify_integrity=False, + ) @classmethod def from_tuples(cls, tuples, sortorder=None, names=None): @@ -432,13 +467,13 @@ def from_tuples(cls, tuples, sortorder=None, names=None): names=['number', 'color']) """ if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') + raise TypeError("Input must be a list / sequence of tuple-likes.") elif is_iterator(tuples): tuples = list(tuples) if len(tuples) == 0: if names is None: - msg = 'Cannot infer number of levels from empty list' + msg = "Cannot infer number of levels from empty list" raise TypeError(msg) arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): @@ -591,8 +626,10 @@ def array(self): ------ ValueError """ - msg = ("MultiIndex has no single backing array. Use " - "'MultiIndex.to_numpy()' to get a NumPy array of tuples.") + msg = ( + "MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples." + ) raise ValueError(msg) @property @@ -617,22 +654,23 @@ def _is_homogeneous_type(self): """ return len({x.dtype for x in self.levels}) <= 1 - def _set_levels(self, levels, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_levels( + self, levels, level=None, copy=False, validate=True, verify_integrity=False + ): # This is NOT part of the levels property because it should be # externally not allowed to set levels. 
User beware if you change # _levels directly if validate and len(levels) == 0: - raise ValueError('Must set non-zero number of levels.') + raise ValueError("Must set non-zero number of levels.") if validate and level is None and len(levels) != self.nlevels: - raise ValueError('Length of levels must match number of levels.') + raise ValueError("Length of levels must match number of levels.") if validate and level is not None and len(levels) != len(level): - raise ValueError('Length of levels must match length of level.') + raise ValueError("Length of levels must match length of level.") if level is None: new_levels = FrozenList( - ensure_index(lev, copy=copy)._shallow_copy() - for lev in levels) + ensure_index(lev, copy=copy)._shallow_copy() for lev in levels + ) else: level = [self._get_level_number(l) for l in level] new_levels = list(self._levels) @@ -652,8 +690,7 @@ def _set_levels(self, levels, level=None, copy=False, validate=True, self._tuples = None self._reset_cache() - def set_levels(self, levels, level=None, inplace=False, - verify_integrity=True): + def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. @@ -722,8 +759,9 @@ def set_levels(self, levels, level=None, inplace=False, else: idx = self._shallow_copy() idx._reset_identity() - idx._set_levels(levels, level=level, validate=True, - verify_integrity=verify_integrity) + idx._set_levels( + levels, level=level, validate=True, verify_integrity=verify_integrity + ) if not inplace: return idx @@ -733,29 +771,34 @@ def codes(self): @property def labels(self): - warnings.warn((".labels was deprecated in version 0.24.0. " - "Use .codes instead."), - FutureWarning, stacklevel=2) + warnings.warn( + (".labels was deprecated in version 0.24.0. " "Use .codes instead."), + FutureWarning, + stacklevel=2, + ) return self.codes - def _set_codes(self, codes, level=None, copy=False, validate=True, - verify_integrity=False): + def _set_codes( + self, codes, level=None, copy=False, validate=True, verify_integrity=False + ): if validate and level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") if validate and level is not None and len(codes) != len(level): - raise ValueError('Length of codes must match length of levels.') + raise ValueError("Length of codes must match length of levels.") if level is None: new_codes = FrozenList( _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() - for lev, level_codes in zip(self.levels, codes)) + for lev, level_codes in zip(self.levels, codes) + ) else: level = [self._get_level_number(l) for l in level] new_codes = list(self._codes) for lev_idx, level_codes in zip(level, codes): lev = self.levels[lev_idx] new_codes[lev_idx] = _ensure_frozen( - level_codes, lev, copy=copy)._shallow_copy() + level_codes, lev, copy=copy + )._shallow_copy() new_codes = FrozenList(new_codes) if verify_integrity: @@ -766,17 +809,24 @@ def _set_codes(self, codes, level=None, copy=False, validate=True, self._tuples = None self._reset_cache() - def set_labels(self, labels, level=None, inplace=False, - verify_integrity=True): - warnings.warn((".set_labels was deprecated in version 0.24.0. 
" - "Use .set_codes instead."), - FutureWarning, stacklevel=2) - return self.set_codes(codes=labels, level=level, inplace=inplace, - verify_integrity=verify_integrity) + def set_labels(self, labels, level=None, inplace=False, verify_integrity=True): + warnings.warn( + ( + ".set_labels was deprecated in version 0.24.0. " + "Use .set_codes instead." + ), + FutureWarning, + stacklevel=2, + ) + return self.set_codes( + codes=labels, + level=level, + inplace=inplace, + verify_integrity=verify_integrity, + ) - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def set_codes(self, codes, level=None, inplace=False, - verify_integrity=True): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -852,9 +902,17 @@ def set_codes(self, codes, level=None, inplace=False, if not inplace: return idx - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def copy(self, names=None, dtype=None, levels=None, codes=None, - deep=False, _set_identity=False, **kwargs): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def copy( + self, + names=None, + dtype=None, + levels=None, + codes=None, + deep=False, + _set_identity=False, + **kwargs + ): """ Make a copy of this object. Names, dtype, levels and codes can be passed and will be set on new copy. @@ -876,11 +934,12 @@ def copy(self, names=None, dtype=None, levels=None, codes=None, ``deep``, but if ``deep`` is passed it will attempt to deepcopy. This could be potentially expensive on large MultiIndex objects. """ - name = kwargs.get('name') + name = kwargs.get("name") names = self._validate_names(name=name, names=names, deep=deep) if deep: from copy import deepcopy + if levels is None: levels = deepcopy(self.levels) if codes is None: @@ -890,9 +949,14 @@ def copy(self, names=None, dtype=None, levels=None, codes=None, levels = self.levels if codes is None: codes = self.codes - return MultiIndex(levels=levels, codes=codes, names=names, - sortorder=self.sortorder, verify_integrity=False, - _set_identity=_set_identity) + return MultiIndex( + levels=levels, + codes=codes, + names=names, + sortorder=self.sortorder, + verify_integrity=False, + _set_identity=_set_identity, + ) def __array__(self, dtype=None): """ the array interface, return my values """ @@ -908,12 +972,14 @@ def _shallow_copy_with_infer(self, values, **kwargs): # On equal MultiIndexes the difference is empty. 
# Therefore, an empty MultiIndex is returned GH13490 if len(values) == 0: - return MultiIndex(levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - **kwargs) + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + **kwargs + ) return self._shallow_copy(values, **kwargs) - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) def __contains__(self, key): hash(key) try: @@ -922,23 +988,25 @@ def __contains__(self, key): except (LookupError, TypeError, ValueError): return False - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is not None: - names = kwargs.pop('names', kwargs.pop('name', self.names)) + names = kwargs.pop("names", kwargs.pop("name", self.names)) # discards freq - kwargs.pop('freq', None) + kwargs.pop("freq", None) return MultiIndex.from_tuples(values, names=names, **kwargs) return self.copy(**kwargs) @cache_readonly def dtype(self): - return np.dtype('O') + return np.dtype("O") def _is_memory_usage_qualified(self): """ return a boolean if we need a qualified .info display """ + def f(l): - return 'mixed' in l or 'string' in l or 'unicode' in l + return "mixed" in l or "string" in l or "unicode" in l + return any(f(l) for l in self._inferred_type_levels) @Appender(Index.memory_usage.__doc__) @@ -989,8 +1057,9 @@ def _format_data(self, name=None): """ Return the formatted data as a unicode string """ - return format_object_summary(self, self._formatter_func, - name=name, line_break_each_value=True) + return format_object_summary( + self, self._formatter_func, name=name, line_break_each_value=True + ) def _format_attrs(self): """ @@ -998,7 +1067,7 @@ def _format_attrs(self): """ return format_object_attrs(self, include_dtype=False) - def _format_native_types(self, na_rep='nan', **kwargs): + def _format_native_types(self, na_rep="nan", **kwargs): new_levels = [] new_codes = [] @@ -1006,7 +1075,7 @@ def _format_native_types(self, na_rep='nan', **kwargs): for level, level_codes in zip(self.levels, self.codes): level = level._format_native_types(na_rep=na_rep, **kwargs) # add nan values, if there are any - mask = (level_codes == -1) + mask = level_codes == -1 if mask.any(): nan_index = len(level) level = np.append(level, na_rep) @@ -1017,17 +1086,27 @@ def _format_native_types(self, na_rep='nan', **kwargs): if len(new_levels) == 1: # a single-level multi-index - return Index(new_levels[0].take( - new_codes[0]))._format_native_types() + return Index(new_levels[0].take(new_codes[0]))._format_native_types() else: # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, codes=new_codes, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + mi = MultiIndex( + levels=new_levels, + codes=new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) return mi.values - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): + def format( + self, + space=2, + sparsify=None, + adjoin=True, + names=False, + na_rep=None, + formatter=None, + ): if len(self) == 0: return [] @@ -1048,9 +1127,10 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, else: # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, level_codes)] + formatted = [ + 
pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n")) + for x in algos.take_1d(lev._values, level_codes) + ] stringified_levels.append(formatted) result_levels = [] @@ -1058,9 +1138,11 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, level = [] if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') + level.append( + pprint_thing(name, escape_chars=("\t", "\r", "\n")) + if name is not None + else "" + ) level.extend(np.array(lev, dtype=object)) result_levels.append(level) @@ -1069,20 +1151,22 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False, sparsify = get_option("display.multi_sparse") if sparsify: - sentinel = '' + sentinel = "" # GH3547 # use value of sparsify as sentinel, unless it's an obvious # "Truthy" value if sparsify not in [True, 1]: sentinel = sparsify # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) + result_levels = _sparsify( + result_levels, start=int(names), sentinel=sentinel + ) if adjoin: from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') + return adj.adjoin(space, *result_levels).split("\n") else: return result_levels @@ -1122,14 +1206,15 @@ def _set_names(self, names, level=None, validate=True): # GH 15110 # Don't allow a single string for names in a MultiIndex if names is not None and not is_list_like(names): - raise ValueError('Names should be list-like for a MultiIndex') + raise ValueError("Names should be list-like for a MultiIndex") names = list(names) if validate and level is not None and len(names) != len(level): - raise ValueError('Length of names must match length of level.') + raise ValueError("Length of names must match length of level.") if validate and level is None and len(names) != self.nlevels: - raise ValueError('Length of names must match number of levels in ' - 'MultiIndex.') + raise ValueError( + "Length of names must match number of levels in " "MultiIndex." 
+ ) if level is None: level = range(self.nlevels) @@ -1142,14 +1227,18 @@ def _set_names(self, names, level=None, validate=True): # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): - raise TypeError('{}.name must be a hashable type' - .format(self.__class__.__name__)) + raise TypeError( + "{}.name must be a hashable type".format( + self.__class__.__name__ + ) + ) self.levels[l].rename(name, inplace=True) - names = property(fset=_set_names, fget=_get_names, - doc="""\nNames of levels in MultiIndex\n""") + names = property( + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n""" + ) - @Appender(_index_shared_docs['_get_grouper_for_level']) + @Appender(_index_shared_docs["_get_grouper_for_level"]) def _get_grouper_for_level(self, mapper, level): indexer = self.codes[level] level_index = self.levels[level] @@ -1185,29 +1274,34 @@ def _constructor(self): @cache_readonly def inferred_type(self): - return 'mixed' + return "mixed" def _get_level_number(self, level): count = self.names.count(level) if (count > 1) and not is_integer(level): - raise ValueError('The name %s occurs multiple times, use a ' - 'level number' % level) + raise ValueError( + "The name %s occurs multiple times, use a " "level number" % level + ) try: level = self.names.index(level) except ValueError: if not is_integer(level): - raise KeyError('Level %s not found' % str(level)) + raise KeyError("Level %s not found" % str(level)) elif level < 0: level += self.nlevels if level < 0: orig_level = level - self.nlevels - raise IndexError('Too many levels: Index has only %d ' - 'levels, %d is not a valid level number' % - (self.nlevels, orig_level)) + raise IndexError( + "Too many levels: Index has only %d " + "levels, %d is not a valid level number" + % (self.nlevels, orig_level) + ) # Note: levels are zero-based elif level >= self.nlevels: - raise IndexError('Too many levels: Index has only %d levels, ' - 'not %d' % (self.nlevels, level + 1)) + raise IndexError( + "Too many levels: Index has only %d levels, " + "not %d" % (self.nlevels, level + 1) + ) return level _tuples = None @@ -1226,7 +1320,7 @@ def _engine(self): # equivalent to sorting lexicographically the codes themselves. Notice # that each level needs to be shifted by the number of bits needed to # represent the _previous_ ones: - offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64') + offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64") # Check the total number of bits needed for our representation: if lev_bits[0] > 64: @@ -1245,8 +1339,7 @@ def values(self): vals = self._get_level_values(i) if is_categorical_dtype(vals): vals = vals._internal_get_values() - if (isinstance(vals.dtype, ExtensionDtype) - or hasattr(vals, '_box_values')): + if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"): vals = vals.astype(object) vals = np.array(vals, copy=False) values.append(vals) @@ -1267,8 +1360,9 @@ def is_monotonic_increasing(self): """ # reversed() because lexsort() wants the most significant key last. 
- values = [self._get_level_values(i).values - for i in reversed(range(len(self.levels)))] + values = [ + self._get_level_values(i).values for i in reversed(range(len(self.levels))) + ] try: sort_order = np.lexsort(values) return Index(sort_order).is_monotonic @@ -1289,7 +1383,7 @@ def is_monotonic_decreasing(self): @cache_readonly def _have_mixed_levels(self): """ return a boolean list indicated if we have mixed levels """ - return ['mixed' in l for l in self._inferred_type_levels] + return ["mixed" in l for l in self._inferred_type_levels] @cache_readonly def _inferred_type_levels(self): @@ -1300,6 +1394,7 @@ def _inferred_type_levels(self): def _hashed_values(self): """ return a uint64 ndarray of my hashed values """ from pandas.core.util.hashing import hash_tuples + return hash_tuples(self) def _hashed_indexing_key(self, key): @@ -1333,12 +1428,14 @@ def f(k, stringify): if stringify and not isinstance(k, str): k = str(k) return k - key = tuple(f(k, stringify) - for k, stringify in zip(key, self._have_mixed_levels)) + + key = tuple( + f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels) + ) return hash_tuple(key) @Appender(Index.duplicated.__doc__) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): from pandas.core.sorting import get_group_index from pandas._libs.hashtable import duplicated_int64 @@ -1351,14 +1448,14 @@ def fillna(self, value=None, downcast=None): """ fillna is not implemented for MultiIndex """ - raise NotImplementedError('isna is not defined for MultiIndex') + raise NotImplementedError("isna is not defined for MultiIndex") - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): + @Appender(_index_shared_docs["dropna"]) + def dropna(self, how="any"): nans = [level_codes == -1 for level_codes in self.codes] - if how == 'any': + if how == "any": indexer = np.any(nans, axis=0) - elif how == 'all': + elif how == "all": indexer = np.all(nans, axis=0) else: raise ValueError("invalid how option: {0}".format(how)) @@ -1380,8 +1477,9 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return series._constructor(new_values, index=new_index, - name=series.name).__finalize__(self) + return series._constructor( + new_values, index=new_index, name=series.name + ).__finalize__(self) try: return self._engine.get_value(s, k) @@ -1419,8 +1517,13 @@ def _try_mi(k): try: return _try_mi(Timestamp(key)) - except (KeyError, TypeError, - IndexError, ValueError, tslibs.OutOfBoundsDatetime): + except ( + KeyError, + TypeError, + IndexError, + ValueError, + tslibs.OutOfBoundsDatetime, + ): pass raise InvalidIndexError(key) @@ -1447,8 +1550,7 @@ def _get_level_values(self, level, unique=False): level_codes = self.codes[level] if unique: level_codes = algos.unique(level_codes) - filled = algos.take_1d(values._values, level_codes, - fill_value=values._na_value) + filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value) values = values._shallow_copy(filled) return values @@ -1488,7 +1590,7 @@ def get_level_values(self, level): values = self._get_level_values(level) return values - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is None: @@ -1528,26 +1630,31 @@ def to_frame(self, index=True, name=None): """ from pandas import DataFrame + if name is not None: if not is_list_like(name): - raise TypeError("'name' must be a list / 
sequence " - "of column names.") + raise TypeError("'name' must be a list / sequence " "of column names.") if len(name) != len(self.levels): - raise ValueError("'name' should have same length as " - "number of levels on index.") + raise ValueError( + "'name' should have same length as " "number of levels on index." + ) idx_names = name else: idx_names = self.names # Guarantee resulting column order result = DataFrame( - OrderedDict([ - ((level if lvlname is None else lvlname), - self._get_level_values(level)) - for lvlname, level in zip(idx_names, range(len(self.levels))) - ]), - copy=False + OrderedDict( + [ + ( + (level if lvlname is None else lvlname), + self._get_level_values(level), + ) + for lvlname, level in zip(idx_names, range(len(self.levels))) + ] + ), + copy=False, ) if index: @@ -1598,14 +1705,16 @@ def to_hierarchical(self, n_repeat, n_shuffle=1): ) """ levels = self.levels - codes = [np.repeat(level_codes, n_repeat) for - level_codes in self.codes] + codes = [np.repeat(level_codes, n_repeat) for level_codes in self.codes] # Assumes that each level_codes is divisible by n_shuffle - codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes] + codes = [x.reshape(n_shuffle, -1).ravel(order="F") for x in codes] names = self.names - warnings.warn("Method .to_hierarchical is deprecated and will " - "be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn( + "Method .to_hierarchical is deprecated and will " + "be removed in a future version", + FutureWarning, + stacklevel=2, + ) return MultiIndex(levels=levels, codes=codes, names=names) def to_flat_index(self): @@ -1728,9 +1837,13 @@ def _sort_levels_monotonic(self): new_levels.append(lev) new_codes.append(level_codes) - return MultiIndex(new_levels, new_codes, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + return MultiIndex( + new_levels, + new_codes, + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) def remove_unused_levels(self): """ @@ -1835,19 +1948,22 @@ def levshape(self): def __reduce__(self): """Necessary for making this object picklable""" - d = dict(levels=[lev for lev in self.levels], - codes=[level_codes for level_codes in self.codes], - sortorder=self.sortorder, names=list(self.names)) + d = dict( + levels=[lev for lev in self.levels], + codes=[level_codes for level_codes in self.codes], + sortorder=self.sortorder, + names=list(self.names), + ) return ibase._new_Index, (self.__class__, d), None def __setstate__(self, state): """Necessary for making this object picklable""" if isinstance(state, dict): - levels = state.get('levels') - codes = state.get('codes') - sortorder = state.get('sortorder') - names = state.get('names') + levels = state.get("levels") + codes = state.get("codes") + sortorder = state.get("sortorder") + names = state.get("names") elif isinstance(state, tuple): @@ -1887,30 +2003,40 @@ def __getitem__(self, key): new_codes = [level_codes[key] for level_codes in self.codes] - return MultiIndex(levels=self.levels, codes=new_codes, - names=self.names, sortorder=sortorder, - verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): 
nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable(self.codes, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1) - return MultiIndex(levels=self.levels, codes=taken, - names=self.names, verify_integrity=False) - - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=None): + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=None + ): """ Internal method to handle NA filling of take """ # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) raise ValueError(msg) taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 @@ -1940,8 +2066,9 @@ def append(self, other): if not isinstance(other, (list, tuple)): other = [other] - if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) - for o in other): + if all( + (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other + ): arrays = [] for i in range(self.nlevels): label = self._get_level_values(i) @@ -1949,7 +2076,7 @@ def append(self, other): arrays.append(label.append(appended)) return MultiIndex.from_arrays(arrays, names=self.names) - to_concat = (self.values, ) + tuple(k._values for k in other) + to_concat = (self.values,) + tuple(k._values for k in other) new_tuples = np.concatenate(to_concat) # if all(isinstance(x, MultiIndex) for x in other): @@ -1961,21 +2088,27 @@ def append(self, other): def argsort(self, *args, **kwargs): return self.values.argsort(*args, **kwargs) - @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs) + @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) - return MultiIndex(levels=self.levels, - codes=[level_codes.view(np.ndarray).repeat(repeats) - for level_codes in self.codes], - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[ + level_codes.view(np.ndarray).repeat(repeats) + for level_codes in self.codes + ], + names=self.names, + sortorder=self.sortorder, + verify_integrity=False, + ) def where(self, cond, other=None): - raise NotImplementedError(".where is not supported for " - "MultiIndex operations") + raise NotImplementedError( + ".where is not supported for " "MultiIndex operations" + ) - @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes') - def drop(self, codes, level=None, errors='raise'): + @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes") + def drop(self, codes, level=None, errors="raise"): """ Make new MultiIndex with passed list of codes deleted @@ -1998,9 +2131,8 @@ def drop(self, codes, level=None, errors='raise'): indexer = self.get_indexer(codes) mask = indexer == -1 if mask.any(): - if errors != 'ignore': - raise ValueError('codes %s not contained in axis' % - codes[mask]) + if errors != "ignore": + raise ValueError("codes %s not contained in axis" % codes[mask]) except Exception: pass @@ -2016,18 +2148,20 @@ def drop(self, 
codes, level=None, errors='raise'): inds.extend(range(loc.start, loc.stop)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: - warnings.warn('dropping on a non-lexsorted multi-index' - ' without a level parameter may impact ' - 'performance.', - PerformanceWarning, - stacklevel=3) + warnings.warn( + "dropping on a non-lexsorted multi-index" + " without a level parameter may impact " + "performance.", + PerformanceWarning, + stacklevel=3, + ) loc = loc.nonzero()[0] inds.extend(loc) else: - msg = 'unsupported indexer of type {}'.format(type(loc)) + msg = "unsupported indexer of type {}".format(type(loc)) raise AssertionError(msg) except KeyError: - if errors != 'ignore': + if errors != "ignore": raise return self.delete(inds) @@ -2101,8 +2235,9 @@ def swaplevel(self, i=-2, j=-1): new_codes[i], new_codes[j] = new_codes[j], new_codes[i] new_names[i], new_names[j] = new_names[j], new_names[i] - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def reorder_levels(self, order): """ @@ -2117,15 +2252,17 @@ def reorder_levels(self, order): """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: - raise AssertionError('Length of order must be same as ' - 'number of levels (%d), got %d' % - (self.nlevels, len(order))) + raise AssertionError( + "Length of order must be same as " + "number of levels (%d), got %d" % (self.nlevels, len(order)) + ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] new_names = [self.names[i] for i in order] - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def __getslice__(self, i, j): return self.__getitem__(slice(i, j)) @@ -2141,13 +2278,15 @@ def _get_codes_for_sorting(self): from pandas.core.arrays import Categorical def cats(level_codes): - return np.arange(np.array(level_codes).max() + 1 if - len(level_codes) else 0, - dtype=level_codes.dtype) + return np.arange( + np.array(level_codes).max() + 1 if len(level_codes) else 0, + dtype=level_codes.dtype, + ) - return [Categorical.from_codes(level_codes, cats(level_codes), - ordered=True) - for level_codes in self.codes] + return [ + Categorical.from_codes(level_codes, cats(level_codes), ordered=True) + for level_codes in self.codes + ] def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ @@ -2184,8 +2323,10 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): raise ValueError("level must have same length as ascending") from pandas.core.sorting import lexsort_indexer - indexer = lexsort_indexer([self.codes[lev] for lev in level], - orders=ascending) + + indexer = lexsort_indexer( + [self.codes[lev] for lev in level], orders=ascending + ) # level ordering else: @@ -2209,8 +2350,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): else: sortorder = level[0] - indexer = indexer_from_factorized(primary, primshp, - compress=False) + indexer = indexer_from_factorized(primary, primshp, compress=False) if not ascending: indexer = indexer[::-1] @@ -2218,9 +2358,13 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): indexer = ensure_platform_int(indexer) new_codes = [level_codes.take(indexer) for level_codes in self.codes] - new_index = MultiIndex(codes=new_codes, levels=self.levels, - 
names=self.names, sortorder=sortorder, - verify_integrity=False) + new_index = MultiIndex( + codes=new_codes, + levels=self.levels, + names=self.names, + sortorder=sortorder, + verify_integrity=False, + ) return new_index, indexer @@ -2240,8 +2384,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None): indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind) # are we indexing a specific level - if indexer is None and len(keyarr) and not isinstance(keyarr[0], - tuple): + if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple): level = 0 _, indexer = self.reindex(keyarr, level=level) @@ -2252,11 +2395,11 @@ def _convert_listlike_indexer(self, keyarr, kind=None): check = self.levels[0].get_indexer(keyarr) mask = check == -1 if mask.any(): - raise KeyError('%s not in index' % keyarr[mask]) + raise KeyError("%s not in index" % keyarr[mask]) return indexer, keyarr - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ensure_index(target) @@ -2272,34 +2415,36 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): # let's instead try with a straight Index if method is None: - return Index(self.values).get_indexer(target, - method=method, - limit=limit, - tolerance=tolerance) + return Index(self.values).get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) if not self.is_unique: - raise ValueError('Reindexing only valid with uniquely valued ' - 'Index objects') + raise ValueError( + "Reindexing only valid with uniquely valued " "Index objects" + ) - if method == 'pad' or method == 'backfill': + if method == "pad" or method == "backfill": if tolerance is not None: - raise NotImplementedError("tolerance not implemented yet " - 'for MultiIndex') + raise NotImplementedError( + "tolerance not implemented yet " "for MultiIndex" + ) indexer = self._engine.get_indexer(target, method, limit) - elif method == 'nearest': - raise NotImplementedError("method='nearest' not implemented yet " - 'for MultiIndex; see GitHub issue 9365') + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet " + "for MultiIndex; see GitHub issue 9365" + ) else: indexer = self._engine.get_indexer(target) return ensure_platform_int(indexer) - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): return super().get_indexer_non_unique(target) - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -2313,11 +2458,11 @@ def reindex(self, target, method=None, level=None, limit=None, """ # GH6552: preserve names when reindexing to non-named target # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'names') + preserve_names = not hasattr(target, "names") if level is not None: if method is not None: - raise TypeError('Fill method not supported if level passed') + raise TypeError("Fill method not supported if level passed") # GH7774: preserve dtype/tz if target is empty and not an Index. 
# target may be an iterator @@ -2325,23 +2470,22 @@ def reindex(self, target, method=None, level=None, limit=None, if len(target) == 0 and not isinstance(target, Index): idx = self.levels[level] attrs = idx._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), - **attrs) + attrs.pop("freq", None) # don't preserve freq + target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs) else: target = ensure_index(target) - target, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True, - keep_order=False) + target, indexer, _ = self._join_level( + target, level, how="right", return_indexers=True, keep_order=False + ) else: target = ensure_index(target) if self.equals(target): indexer = None else: if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) + indexer = self.get_indexer( + target, method=method, limit=limit, tolerance=tolerance + ) else: raise ValueError("cannot handle a non-unique multi-index!") @@ -2354,8 +2498,11 @@ def reindex(self, target, method=None, level=None, limit=None, # hopefully? target = MultiIndex.from_tuples(target) - if (preserve_names and target.nlevels == self.nlevels and - target.names != self.names): + if ( + preserve_names + and target.nlevels == self.nlevels + and target.names != self.names + ): target = target.copy(deep=False) target.names = self.names @@ -2364,7 +2511,7 @@ def reindex(self, target, method=None, level=None, limit=None, def get_slice_bound(self, label, side, kind): if not isinstance(label, tuple): - label = label, + label = (label,) return self._partial_tup_index(label, side=side) def slice_locs(self, start=None, end=None, step=None, kind=None): @@ -2423,12 +2570,12 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): # happens in get_slice_bound method), but it adds meaningful doc. 
return super().slice_locs(start, end, step, kind=kind) - def _partial_tup_index(self, tup, side='left'): + def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - 'Key length (%d) was greater than MultiIndex' - ' lexsort depth (%d)' % - (len(tup), self.lexsort_depth)) + "Key length (%d) was greater than MultiIndex" + " lexsort depth (%d)" % (len(tup), self.lexsort_depth) + ) n = len(tup) start, end = 0, len(self) @@ -2437,20 +2584,19 @@ def _partial_tup_index(self, tup, side='left'): section = labs[start:end] if lab not in lev: - if not lev.is_type_compatible(lib.infer_dtype([lab], - skipna=False)): - raise TypeError('Level type mismatch: %s' % lab) + if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): + raise TypeError("Level type mismatch: %s" % lab) # short circuit loc = lev.searchsorted(lab, side=side) - if side == 'right' and loc >= 0: + if side == "right" and loc >= 0: loc -= 1 return start + section.searchsorted(loc, side=side) idx = lev.get_loc(lab) if k < n - 1: - end = start + section.searchsorted(idx, side='right') - start = start + section.searchsorted(idx, side='left') + end = start + section.searchsorted(idx, side="right") + start = start + section.searchsorted(idx, side="left") else: return start + section.searchsorted(idx, side=side) @@ -2495,19 +2641,21 @@ def get_loc(self, key, method=None): 1 """ if method is not None: - raise NotImplementedError('only the default get_loc method is ' - 'currently supported for MultiIndex') + raise NotImplementedError( + "only the default get_loc method is " + "currently supported for MultiIndex" + ) def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" - if not isinstance(loc, np.ndarray) or loc.dtype != 'int64': + if not isinstance(loc, np.ndarray) or loc.dtype != "int64": return loc loc = lib.maybe_indices_to_slice(loc, len(self)) if isinstance(loc, slice): return loc - mask = np.empty(len(self), dtype='bool') + mask = np.empty(len(self), dtype="bool") mask.fill(False) mask[loc] = True return mask @@ -2518,8 +2666,10 @@ def _maybe_to_slice(loc): keylen = len(key) if self.nlevels < keylen: - raise KeyError('Key length ({0}) exceeds index depth ({1})' - ''.format(keylen, self.nlevels)) + raise KeyError( + "Key length ({0}) exceeds index depth ({1})" + "".format(keylen, self.nlevels) + ) if keylen == self.nlevels and self.is_unique: return self._engine.get_loc(key) @@ -2530,8 +2680,9 @@ def _maybe_to_slice(loc): # needs linear search within the slice i = self.lexsort_depth lead_key, follow_key = key[:i], key[i:] - start, stop = (self.slice_locs(lead_key, lead_key) - if lead_key else (0, len(self))) + start, stop = ( + self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self)) + ) if start == stop: raise KeyError(key) @@ -2539,10 +2690,13 @@ def _maybe_to_slice(loc): if not follow_key: return slice(start, stop) - warnings.warn('indexing past lexsort depth may impact performance.', - PerformanceWarning, stacklevel=10) + warnings.warn( + "indexing past lexsort depth may impact performance.", + PerformanceWarning, + stacklevel=10, + ) - loc = np.arange(start, stop, dtype='int64') + loc = np.arange(start, stop, dtype="int64") for i, k in enumerate(follow_key, len(lead_key)): mask = self.codes[i][loc] == self.levels[i].get_loc(k) @@ -2551,8 +2705,7 @@ def _maybe_to_slice(loc): if not len(loc): raise KeyError(key) - return (_maybe_to_slice(loc) if len(loc) != stop - start else - slice(start, stop)) + return _maybe_to_slice(loc) 
if len(loc) != stop - start else slice(start, stop) def get_loc_level(self, key, level=0, drop_level=True): """ @@ -2612,8 +2765,9 @@ def maybe_droplevels(indexer, levels, drop_level): if isinstance(level, (tuple, list)): if len(key) != len(level): - raise AssertionError('Key for location must have same ' - 'length as number of levels') + raise AssertionError( + "Key for location must have same " "length as number of levels" + ) result = None for lev, k in zip(level, key): loc, new_index = self.get_loc_level(k, level=lev) @@ -2649,10 +2803,10 @@ def maybe_droplevels(indexer, levels, drop_level): def partial_selection(key, indexer=None): if indexer is None: indexer = self.get_loc(key) - ilevels = [i for i in range(len(key)) - if key[i] != slice(None, None)] - return indexer, maybe_droplevels(indexer, ilevels, - drop_level) + ilevels = [ + i for i in range(len(key)) if key[i] != slice(None, None) + ] + return indexer, maybe_droplevels(indexer, ilevels, drop_level) if len(key) == self.nlevels and self.is_unique: # Complete key in unique index -> standard get_loc @@ -2683,8 +2837,7 @@ def partial_selection(key, indexer=None): indexer &= k_index if indexer is None: indexer = slice(None, None) - ilevels = [i for i in range(len(key)) - if key[i] != slice(None, None)] + ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)] return indexer, maybe_droplevels(indexer, ilevels, drop_level) else: indexer = self._get_level_indexer(key, level=level) @@ -2698,8 +2851,7 @@ def _get_level_indexer(self, key, level=0, indexer=None): level_index = self.levels[level] level_codes = self.codes[level] - def convert_indexer(start, stop, step, indexer=indexer, - codes=level_codes): + def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set @@ -2714,6 +2866,7 @@ def convert_indexer(start, stop, step, indexer=indexer, # that the result are the mappings to the set that we have # selected from pandas import Series + mapper = Series(indexer) indexer = codes.take(ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) @@ -2721,8 +2874,7 @@ def convert_indexer(start, stop, step, indexer=indexer, else: m = np.zeros(len(codes), dtype=bool) - m[np.in1d(codes, r, - assume_unique=Index(codes).is_unique)] = True + m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True return m @@ -2744,8 +2896,9 @@ def convert_indexer(start, stop, step, indexer=indexer, # we have a partial slice (like looking up a partial date # string) - start = stop = level_index.slice_indexer(key.start, key.stop, - key.step, kind='loc') + start = stop = level_index.slice_indexer( + key.start, key.stop, key.step, kind="loc" + ) step = start.step if isinstance(start, slice) or isinstance(stop, slice): @@ -2753,8 +2906,8 @@ def convert_indexer(start, stop, step, indexer=indexer, # a partial date slicer on a DatetimeIndex generates a slice # note that the stop ALREADY includes the stopped point (if # it was a string sliced) - start = getattr(start, 'start', start) - stop = getattr(stop, 'stop', stop) + start = getattr(start, "start", start) + stop = getattr(stop, "stop", stop) return convert_indexer(start, stop, step) elif level > 0 or self.lexsort_depth == 0 or step is not None: @@ -2764,8 +2917,8 @@ def convert_indexer(start, stop, step, indexer=indexer, return convert_indexer(start, stop + 1, step) else: # sorted, so can return slice object 
-> view - i = level_codes.searchsorted(start, side='left') - j = level_codes.searchsorted(stop, side='right') + i = level_codes.searchsorted(start, side="left") + j = level_codes.searchsorted(stop, side="right") return slice(i, j, step) else: @@ -2780,8 +2933,8 @@ def convert_indexer(start, stop, step, indexer=indexer, raise KeyError(key) return locs - i = level_codes.searchsorted(code, side='left') - j = level_codes.searchsorted(code, side='right') + i = level_codes.searchsorted(code, side="left") + j = level_codes.searchsorted(code, side="right") if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) @@ -2826,10 +2979,11 @@ def get_locs(self, seq): # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] if true_slices and true_slices[-1] >= self.lexsort_depth: - raise UnsortedIndexError('MultiIndex slicing requires the index ' - 'to be lexsorted: slicing on levels {0}, ' - 'lexsort depth {1}' - .format(true_slices, self.lexsort_depth)) + raise UnsortedIndexError( + "MultiIndex slicing requires the index " + "to be lexsorted: slicing on levels {0}, " + "lexsort depth {1}".format(true_slices, self.lexsort_depth) + ) # indexer # this is the list of all values that we want to select n = len(self) @@ -2843,9 +2997,11 @@ def _convert_to_indexer(r): r = m.nonzero()[0] elif com.is_bool_indexer(r): if len(r) != n: - raise ValueError("cannot index with a boolean indexer " - "that is not the same length as the " - "index") + raise ValueError( + "cannot index with a boolean indexer " + "that is not the same length as the " + "index" + ) r = r.nonzero()[0] return Int64Index(r) @@ -2861,8 +3017,7 @@ def _update_indexer(idxr, indexer=indexer): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! 
k = np.asarray(k) - indexer = _update_indexer(_convert_to_indexer(k), - indexer=indexer) + indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) elif is_list_like(k): # a collection of labels to include from this level (these @@ -2871,10 +3026,9 @@ def _update_indexer(idxr, indexer=indexer): for x in k: try: idxrs = _convert_to_indexer( - self._get_level_indexer(x, level=i, - indexer=indexer)) - indexers = (idxrs if indexers is None - else indexers | idxrs) + self._get_level_indexer(x, level=i, indexer=indexer) + ) + indexers = idxrs if indexers is None else indexers | idxrs except KeyError: # ignore not founds @@ -2893,14 +3047,20 @@ def _update_indexer(idxr, indexer=indexer): elif isinstance(k, slice): # a slice, include BOTH of the labels - indexer = _update_indexer(_convert_to_indexer( - self._get_level_indexer(k, level=i, indexer=indexer)), - indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer( + self._get_level_indexer(k, level=i, indexer=indexer) + ), + indexer=indexer, + ) else: # a single label - indexer = _update_indexer(_convert_to_indexer( - self.get_loc_level(k, level=i, drop_level=False)[0]), - indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer( + self.get_loc_level(k, level=i, drop_level=False)[0] + ), + indexer=indexer, + ) # empty indexer if indexer is None: @@ -2923,7 +3083,7 @@ def truncate(self, before=None, after=None): truncated : MultiIndex """ if after and before and after < before: - raise ValueError('after < before') + raise ValueError("after < before") i, j = self.levels[0].slice_locs(before, after) left, right = self.slice_locs(before, after) @@ -2934,8 +3094,7 @@ def truncate(self, before=None, after=None): new_codes = [level_codes[left:right] for level_codes in self.codes] new_codes[0] = new_codes[0] - i - return MultiIndex(levels=new_levels, codes=new_codes, - verify_integrity=False) + return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) def equals(self, other): """ @@ -2965,14 +3124,15 @@ def equals(self, other): for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] - self_values = algos.take_nd(np.asarray(self.levels[i]._values), - self_codes, allow_fill=False) + self_values = algos.take_nd( + np.asarray(self.levels[i]._values), self_codes, allow_fill=False + ) other_codes = other.codes[i] other_codes = other_codes[other_codes != -1] other_values = algos.take_nd( - np.asarray(other.levels[i]._values), - other_codes, allow_fill=False) + np.asarray(other.levels[i]._values), other_codes, allow_fill=False + ) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -3041,12 +3201,13 @@ def union(self, other, sort=None): # TODO: Index.union returns other when `len(self)` is 0. 
- uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, - other._ndarray_values], - sort=sort) + uniq_tuples = lib.fast_unique_multiple( + [self._ndarray_values, other._ndarray_values], sort=sort + ) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, - names=result_names) + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) def intersection(self, other, sort=False): """ @@ -3084,12 +3245,16 @@ def intersection(self, other, sort=False): uniq_tuples = sorted(uniq_tuples) if len(uniq_tuples) == 0: - return MultiIndex(levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) else: - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, - names=result_names) + return MultiIndex.from_arrays( + zip(*uniq_tuples), sortorder=0, names=result_names + ) def difference(self, other, sort=None): """ @@ -3120,38 +3285,43 @@ def difference(self, other, sort=None): return self if self.equals(other): - return MultiIndex(levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) this = self._get_unique_index() indexer = this.get_indexer(other) indexer = indexer.take((indexer != -1).nonzero()[0]) - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) + label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) difference = this.values.take(label_diff) if sort is None: difference = sorted(difference) if len(difference) == 0: - return MultiIndex(levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - names=result_names, verify_integrity=False) + return MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) else: - return MultiIndex.from_tuples(difference, sortorder=0, - names=result_names) + return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_categorical_dtype(dtype): - msg = '> 1 ndim Categorical are not supported at this time' + msg = "> 1 ndim Categorical are not supported at this time" raise NotImplementedError(msg) elif not is_object_dtype(dtype): - msg = ('Setting {cls} dtype to anything other than object ' - 'is not supported').format(cls=self.__class__) + msg = ( + "Setting {cls} dtype to anything other than object " "is not supported" + ).format(cls=self.__class__) raise TypeError(msg) elif copy is True: return self._shallow_copy() @@ -3160,13 +3330,15 @@ def astype(self, dtype, copy=True): def _convert_can_do_setop(self, other): result_names = self.names - if not hasattr(other, 'names'): + if not hasattr(other, "names"): if len(other) == 0: - other = MultiIndex(levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - verify_integrity=False) + other = MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + verify_integrity=False, + ) else: - msg = 'other must be a MultiIndex or a list of tuples' + msg = "other must be a MultiIndex or a list of tuples" try: other = MultiIndex.from_tuples(other) except TypeError: @@ -3192,10 +3364,9 @@ def insert(self, loc, item): # Pad the key with empty strings if lower levels of the 
key # aren't specified: if not isinstance(item, tuple): - item = (item, ) + ('', ) * (self.nlevels - 1) + item = (item,) + ("",) * (self.nlevels - 1) elif len(item) != self.nlevels: - raise ValueError('Item must have length equal to number of ' - 'levels.') + raise ValueError("Item must have length equal to number of " "levels.") new_levels = [] new_codes = [] @@ -3210,11 +3381,11 @@ def insert(self, loc, item): lev_loc = level.get_loc(k) new_levels.append(level) - new_codes.append(np.insert( - ensure_int64(level_codes), loc, lev_loc)) + new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc)) - return MultiIndex(levels=new_levels, codes=new_codes, - names=self.names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False + ) def delete(self, loc): """ @@ -3225,8 +3396,12 @@ def delete(self, loc): new_index : MultiIndex """ new_codes = [np.delete(level_codes, loc) for level_codes in self.codes] - return MultiIndex(levels=self.levels, codes=new_codes, - names=self.names, verify_integrity=False) + return MultiIndex( + levels=self.levels, + codes=new_codes, + names=self.names, + verify_integrity=False, + ) def _wrap_joined_index(self, joined, other): names = self.names if self.names == other.names else None @@ -3235,8 +3410,7 @@ def _wrap_joined_index(self, joined, other): @Appender(Index.isin.__doc__) def isin(self, values, level=None): if level is None: - values = MultiIndex.from_tuples(values, - names=self.names).values + values = MultiIndex.from_tuples(values, names=self.names).values return algos.isin(self.values, values) else: num = self._get_level_number(level) @@ -3255,14 +3429,14 @@ def isin(self, values, level=None): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start=0, sentinel=''): +def _sparsify(label_list, start=0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) - result = pivoted[:start + 1] + result = pivoted[: start + 1] prev = pivoted[start] - for cur in pivoted[start + 1:]: + for cur in pivoted[start + 1 :]: sparse_cur = [] for i, (p, t) in enumerate(zip(prev, cur)): @@ -3284,4 +3458,4 @@ def _sparsify(label_list, start=0, sentinel=''): def _get_na_rep(dtype): - return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN') + return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN") diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5f9c1f22887cc..daf26d53aa6e2 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -6,18 +6,29 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, - is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion, - pandas_dtype) + is_bool, + is_bool_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer_dtype, + is_scalar, + needs_i8_conversion, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( - ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index) + ABCFloat64Index, + ABCInt64Index, + ABCRangeIndex, + ABCUInt64Index, +) from pandas.core.dtypes.missing import isna from pandas.core import algorithms import pandas.core.common as com -from pandas.core.indexes.base import ( - Index, InvalidIndexError, _index_shared_docs) +from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs from 
pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -30,15 +41,18 @@ class NumericIndex(Index): This is an abstract class """ + _is_numeric_dtype = True - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=None): + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(data, name=name) @@ -54,18 +68,18 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, else: subarr = data - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name return cls._simple_new(subarr, name=name) - @Appender(_index_shared_docs['_maybe_cast_slice_bound']) + @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ['ix', 'loc', 'getitem', None] + assert kind in ["ix", "loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is not None and not self._can_hold_na: # Ensure we are not returning an Int64Index with float data: @@ -85,17 +99,24 @@ def _convert_for_op(self, value): def _convert_tolerance(self, tolerance, target): tolerance = np.asarray(tolerance) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") if not np.issubdtype(tolerance.dtype, np.number): if tolerance.ndim > 0: - raise ValueError(('tolerance argument for %s must contain ' - 'numeric elements if it is list type') % - (type(self).__name__,)) + raise ValueError( + ( + "tolerance argument for %s must contain " + "numeric elements if it is list type" + ) + % (type(self).__name__,) + ) else: - raise ValueError(('tolerance argument for %s must be numeric ' - 'if it is a scalar: %r') % - (type(self).__name__, tolerance)) + raise ValueError( + ( + "tolerance argument for %s must be numeric " + "if it is a scalar: %r" + ) + % (type(self).__name__, tolerance) + ) return tolerance @classmethod @@ -131,9 +152,8 @@ def _union(self, other, sort): # float | [u]int -> float (the special case) # | -> T # | -> object - needs_cast = ( - (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or - (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype)) + needs_cast = (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or ( + is_integer_dtype(other.dtype) and is_float_dtype(self.dtype) ) if needs_cast: first = self.astype("float") @@ -143,7 +163,9 @@ def _union(self, other, sort): return super()._union(other, sort) -_num_index_shared_docs['class_descr'] = """ +_num_index_shared_docs[ + "class_descr" +] = """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects. %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s @@ -174,12 +196,7 @@ def _union(self, other, sort): An Index instance can **only** contain hashable objects. 
""" -_int64_descr_args = dict( - klass='Int64Index', - ltype='integer', - dtype='int64', - extra='' -) +_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") class IntegerIndex(NumericIndex): @@ -201,9 +218,9 @@ def __contains__(self, key): class Int64Index(IntegerIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args - _typ = 'int64index' + _typ = "int64index" _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.int64 @@ -211,19 +228,19 @@ class Int64Index(IntegerIndex): @property def inferred_type(self): """Always 'integer' for ``Int64Index``""" - return 'integer' + return "integer" @property def asi8(self): # do not cache or you'll create a memory leak - return self.values.view('i8') + return self.values.view("i8") - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # don't coerce ilocs to integers - if kind != 'iloc': + if kind != "iloc": key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) @@ -238,16 +255,12 @@ def _assert_safe_casting(cls, data, subarr): """ if not issubclass(data.dtype.type, np.signedinteger): if not np.array_equal(data, subarr): - raise TypeError('Unsafe NumPy casting, you must ' - 'explicitly cast') + raise TypeError("Unsafe NumPy casting, you must " "explicitly cast") def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, - ABCFloat64Index, - ABCRangeIndex)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) + for obj in [self, other] ) @@ -255,17 +268,14 @@ def _is_compatible_with_other(self, other): Int64Index._add_logical_methods() _uint64_descr_args = dict( - klass='UInt64Index', - ltype='unsigned integer', - dtype='uint64', - extra='' + klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" ) class UInt64Index(IntegerIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args - _typ = 'uint64index' + _typ = "uint64index" _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 @@ -273,23 +283,23 @@ class UInt64Index(IntegerIndex): @property def inferred_type(self): """Always 'integer' for ``UInt64Index``""" - return 'integer' + return "integer" @property def asi8(self): # do not cache or you'll create a memory leak - return self.values.view('u8') + return self.values.view("u8") - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] # don't coerce ilocs to integers - if kind != 'iloc': + if kind != "iloc": key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) - @Appender(_index_shared_docs['_convert_arr_indexer']) + @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the 
values returned from indexing are @@ -299,7 +309,7 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr - @Appender(_index_shared_docs['_convert_index_indexer']) + @Appender(_index_shared_docs["_convert_index_indexer"]) def _convert_index_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are @@ -319,15 +329,12 @@ def _assert_safe_casting(cls, data, subarr): """ if not issubclass(data.dtype.type, np.unsignedinteger): if not np.array_equal(data, subarr): - raise TypeError('Unsafe NumPy casting, you must ' - 'explicitly cast') + raise TypeError("Unsafe NumPy casting, you must " "explicitly cast") def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCUInt64Index, - ABCFloat64Index)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance(type(obj), (ABCUInt64Index, ABCFloat64Index)) + for obj in [self, other] ) @@ -335,67 +342,73 @@ def _is_compatible_with_other(self, other): UInt64Index._add_logical_methods() _float64_descr_args = dict( - klass='Float64Index', - dtype='float64', - ltype='float', - extra='' + klass="Float64Index", dtype="float64", ltype="float", extra="" ) class Float64Index(NumericIndex): - __doc__ = _num_index_shared_docs['class_descr'] % _float64_descr_args + __doc__ = _num_index_shared_docs["class_descr"] % _float64_descr_args - _typ = 'float64index' + _typ = "float64index" _engine_type = libindex.Float64Engine _default_dtype = np.float64 @property def inferred_type(self): """Always 'floating' for ``Float64Index``""" - return 'floating' + return "floating" - @Appender(_index_shared_docs['astype']) + @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): - msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' - 'values are required for conversion').format(dtype=dtype) + msg = ( + "Cannot convert Float64Index to dtype {dtype}; integer " + "values are required for conversion" + ).format(dtype=dtype) raise TypeError(msg) - elif (is_integer_dtype(dtype) and - not is_extension_array_dtype(dtype)) and self.hasnans: + elif ( + is_integer_dtype(dtype) and not is_extension_array_dtype(dtype) + ) and self.hasnans: # TODO(jreback); this can change once we have an EA Index type # GH 13149 - raise ValueError('Cannot convert NA to integer') + raise ValueError("Cannot convert NA to integer") return super().astype(dtype, copy=copy) - @Appender(_index_shared_docs['_convert_scalar_indexer']) + @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + assert kind in ["ix", "loc", "getitem", "iloc", None] - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + if kind == "iloc": + return self._validate_indexer("positional", key, kind) return key - @Appender(_index_shared_docs['_convert_slice_indexer']) + @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key, kind=None): # if we are not a slice, then we are done if not isinstance(key, slice): return key - if kind == 'iloc': + if kind == "iloc": return super()._convert_slice_indexer(key, kind=kind) # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) - def _format_native_types(self, na_rep='', float_format=None, 
decimal='.', - quoting=None, **kwargs): + def _format_native_types( + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs + ): from pandas.io.formats.format import FloatArrayFormatter - formatter = FloatArrayFormatter(self.values, na_rep=na_rep, - float_format=float_format, - decimal=decimal, quoting=quoting, - fixed_width=False) + + formatter = FloatArrayFormatter( + self.values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) return formatter.get_result_as_array() def get_value(self, series, key): @@ -424,8 +437,7 @@ def equals(self, other): try: if not isinstance(other, Float64Index): other = self._constructor(other) - if (not is_dtype_equal(self.dtype, other.dtype) or - self.shape != other.shape): + if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: return False left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() @@ -451,7 +463,7 @@ def __contains__(self, other): return False - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): try: if np.all(np.isnan(key)) or is_bool(key): @@ -477,13 +489,12 @@ def isin(self, values, level=None): return algorithms.isin(np.array(self), values) def _is_compatible_with_other(self, other): - return ( - super()._is_compatible_with_other(other) - or all(isinstance(type(obj), (ABCInt64Index, - ABCFloat64Index, - ABCUInt64Index, - ABCRangeIndex)) - for obj in [self, other]) + return super()._is_compatible_with_other(other) or all( + isinstance( + type(obj), + (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + ) + for obj in [self, other] ) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f61b2e679f0c8..0013df44614e8 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -4,26 +4,31 @@ import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs import ( - NaT, frequencies as libfrequencies, iNaT, resolution) -from pandas._libs.tslibs.period import ( - DIFFERENT_FREQ, IncompatibleFrequency, Period) +from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution +from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, pandas_dtype) + is_bool_dtype, + is_datetime64_any_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + pandas_dtype, +) from pandas.core import common as com from pandas.core.accessor import delegate_names from pandas.core.algorithms import unique1d -from pandas.core.arrays.period import ( - PeriodArray, period_array, validate_dtype_freq) +from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, +) from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name @@ -33,8 +38,7 @@ from 
pandas.tseries.offsets import DateOffset, Tick _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update( - dict(target_klass='PeriodIndex or list of Periods')) +_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) # --- Period index sketch @@ -42,9 +46,9 @@ def _new_PeriodIndex(cls, **d): # GH13277 for unpickling - values = d.pop('data') - if values.dtype == 'int64': - freq = d.pop('freq', None) + values = d.pop("data") + if values.dtype == "int64": + freq = d.pop("freq", None) values = PeriodArray(values, freq=freq) return cls._simple_new(values, **d) else: @@ -55,21 +59,17 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): """ Delegate from PeriodIndex to PeriodArray. """ + _delegate_class = PeriodArray _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = ( - set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'} - ) - _raw_properties = {'is_leap_year'} - - -@delegate_names(PeriodArray, - PeriodDelegateMixin._delegated_properties, - typ='property') -@delegate_names(PeriodArray, - PeriodDelegateMixin._delegated_methods, - typ="method", - overwrite=True) + _delegated_methods = set(PeriodArray._datetimelike_methods) | {"_addsub_int_array"} + _raw_properties = {"is_leap_year"} + + +@delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") +@delegate_names( + PeriodArray, PeriodDelegateMixin._delegated_methods, typ="method", overwrite=True +) class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ Immutable ndarray holding ordinal values indicating regular periods in @@ -161,8 +161,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): -------- >>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr) """ - _typ = 'periodindex' - _attributes = ['name', 'freq'] + + _typ = "periodindex" + _attributes = ["name", "freq"] # define my properties & methods for delegation _is_numeric_dtype = False @@ -175,39 +176,59 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): # ------------------------------------------------------------------------ # Index Constructors - def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, - periods=None, tz=None, dtype=None, copy=False, name=None, - **fields): - - valid_field_set = {'year', 'month', 'day', 'quarter', - 'hour', 'minute', 'second'} + def __new__( + cls, + data=None, + ordinal=None, + freq=None, + start=None, + end=None, + periods=None, + tz=None, + dtype=None, + copy=False, + name=None, + **fields + ): + + valid_field_set = { + "year", + "month", + "day", + "quarter", + "hour", + "minute", + "second", + } if not set(fields).issubset(valid_field_set): - raise TypeError('__new__() got an unexpected keyword argument {}'. - format(list(set(fields) - valid_field_set)[0])) + raise TypeError( + "__new__() got an unexpected keyword argument {}".format( + list(set(fields) - valid_field_set)[0] + ) + ) - if name is None and hasattr(data, 'name'): + if name is None and hasattr(data, "name"): name = data.name if data is None and ordinal is None: # range-based. - data, freq2 = PeriodArray._generate_range(start, end, periods, - freq, fields) + data, freq2 = PeriodArray._generate_range(start, end, periods, freq, fields) # PeriodArray._generate range does validate that fields is # empty when really using the range-based constructor. if not fields: - msg = ("Creating a PeriodIndex by passing range " - "endpoints is deprecated. 
Use " - "`pandas.period_range` instead.") + msg = ( + "Creating a PeriodIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.period_range` instead." + ) # period_range differs from PeriodIndex for cases like # start="2000", periods=4 # PeriodIndex interprets that as A-DEC freq. # period_range interprets it as 'D' freq. - cond = ( - freq is None and ( - (start and not isinstance(start, Period)) or - (end and not isinstance(end, Period)) - ) + cond = freq is None and ( + (start and not isinstance(start, Period)) + or (end and not isinstance(end, Period)) ) if cond: msg += ( @@ -291,11 +312,12 @@ def freq(self, value): value = Period._maybe_convert_freq(value) # TODO: When this deprecation is enforced, PeriodIndex.freq can # be removed entirely, and we'll just inherit. - msg = ('Setting {cls}.freq has been deprecated and will be ' - 'removed in a future version; use {cls}.asfreq instead. ' - 'The {cls}.freq setter is not guaranteed to work.') - warnings.warn(msg.format(cls=type(self).__name__), - FutureWarning, stacklevel=2) + msg = ( + "Setting {cls}.freq has been deprecated and will be " + "removed in a future version; use {cls}.asfreq instead. " + "The {cls}.freq setter is not guaranteed to work." + ) + warnings.warn(msg.format(cls=type(self).__name__), FutureWarning, stacklevel=2) # PeriodArray._freq isn't actually mutable. We set the private _freq # here, but people shouldn't be doing this anyway. self._data._freq = value @@ -309,8 +331,7 @@ def _shallow_copy(self, values=None, **kwargs): values = values._values if not isinstance(values, PeriodArray): - if (isinstance(values, np.ndarray) and - is_integer_dtype(values.dtype)): + if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype): values = PeriodArray(values, freq=self.freq) else: # in particular, I would like to avoid period_array here. @@ -322,12 +343,12 @@ def _shallow_copy(self, values=None, **kwargs): values = period_array(values, freq=self.freq) # We don't allow changing `freq` in _shallow_copy. - validate_dtype_freq(self.dtype, kwargs.get('freq')) + validate_dtype_freq(self.dtype, kwargs.get("freq")) attributes = self._get_attributes_dict() attributes.update(kwargs) - if not len(values) and 'dtype' not in kwargs: - attributes['dtype'] = self.dtype + if not len(values) and "dtype" not in kwargs: + attributes["dtype"] = self.dtype return self._simple_new(values, **attributes) def _shallow_copy_with_infer(self, values=None, **kwargs): @@ -347,6 +368,7 @@ def func(x): return x else: return Period._from_ordinal(ordinal=x, freq=self.freq) + return func def _maybe_convert_timedelta(self, other): @@ -366,8 +388,7 @@ def _maybe_convert_timedelta(self, other): IncompatibleFrequency : if the input cannot be written as a multiple of self.freq. Note IncompatibleFrequency subclasses ValueError. 
""" - if isinstance( - other, (timedelta, np.timedelta64, Tick, np.ndarray)): + if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, Tick): # _check_timedeltalike_freq_compat will raise if incompatible @@ -379,9 +400,9 @@ def _maybe_convert_timedelta(self, other): if base == self.freq.rule_code: return other.n - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) elif is_integer(other): # integer is passed to .shift via @@ -390,19 +411,17 @@ def _maybe_convert_timedelta(self, other): return other # raise when input doesn't have freq - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=None) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=None + ) raise IncompatibleFrequency(msg) # ------------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep='NaT', quoting=None, **kwargs): + def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs): # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, - quoting=quoting, - **kwargs) + return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _mpl_repr(self): # how to represent ourselves to matplotlib @@ -419,7 +438,7 @@ def _formatter_func(self): def _engine(self): return self._engine_type(lambda: self, len(self)) - @Appender(_index_shared_docs['contains']) + @Appender(_index_shared_docs["contains"]) def __contains__(self, key): if isinstance(key, Period): if key.freq != self.freq: @@ -471,14 +490,13 @@ def __array_wrap__(self, result, context=None): name = self.name left = context[1][0] right = context[1][1] - if (isinstance(left, PeriodIndex) and - isinstance(right, PeriodIndex)): + if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex): name = left.name if left.name == right.name else None return Index(result, name=name) elif isinstance(left, Period) or isinstance(right, Period): return Index(result, name=name) elif isinstance(func, np.ufunc): - if 'M->M' not in func.types: + if "M->M" not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" # This should be TypeError, but TypeError cannot be raised # from here because numpy catches. @@ -501,37 +519,41 @@ def asof_locs(self, where, mask): where_idx = PeriodIndex(where_idx.values, freq=self.freq) locs = self._ndarray_values[mask].searchsorted( - where_idx._ndarray_values, side='right') + where_idx._ndarray_values, side="right" + ) locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._ndarray_values < - self._ndarray_values[first])] = -1 + result[ + (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first]) + ] = -1 return result - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True, how='start'): + @Appender(_index_shared_docs["astype"]) + def astype(self, dtype, copy=True, how="start"): dtype = pandas_dtype(dtype) if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. 
- tz = getattr(dtype, 'tz', None) + tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) # TODO: should probably raise on `how` here, so we don't ignore it. return super().astype(dtype, copy=copy) - @Substitution(klass='PeriodIndex') - @Appender(_shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): + @Substitution(klass="PeriodIndex") + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): if isinstance(value, Period): if value.freq != self.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=value.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=value.freqstr, + ) raise IncompatibleFrequency(msg) value = value.ordinal elif isinstance(value, str): @@ -540,8 +562,7 @@ def searchsorted(self, value, side='left', sorter=None): except DateParseError: raise KeyError("Cannot interpret '{}' as period".format(value)) - return self._ndarray_values.searchsorted(value, side=side, - sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) @property def is_all_dates(self): @@ -556,7 +577,7 @@ def is_full(self): if len(self) == 0: return True if not self.is_monotonic: - raise ValueError('Index is not monotonic') + raise ValueError("Index is not monotonic") values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() @@ -564,7 +585,7 @@ def is_full(self): def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous # indexing - return 'period' + return "period" def get_value(self, series, key): """ @@ -573,9 +594,7 @@ def get_value(self, series, key): """ s = com.values_from_object(series) try: - return com.maybe_box(self, - super().get_value(s, key), - series, key) + return com.maybe_box(self, super().get_value(s, key), series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) @@ -587,8 +606,8 @@ def get_value(self, series, key): # if our data is higher resolution than requested key, slice if grp < freqn: iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how='S').ordinal - ord2 = iv.asfreq(self.freq, how='E').ordinal + ord1 = iv.asfreq(self.freq, how="S").ordinal + ord2 = iv.asfreq(self.freq, how="E").ordinal if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) @@ -598,8 +617,9 @@ def get_value(self, series, key): return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal - return com.maybe_box(self, self._engine.get_value(s, key), - series, key) + return com.maybe_box( + self, self._engine.get_value(s, key), series, key + ) else: raise KeyError(key) except TypeError: @@ -607,17 +627,18 @@ def get_value(self, series, key): period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._engine.get_value(s, key), - series, key) + return com.maybe_box(self, self._engine.get_value(s, key), series, key) - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if hasattr(target, 'freq') and target.freq != self.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr) + if hasattr(target, "freq") and target.freq != self.freq: + msg = DIFFERENT_FREQ.format( + 
cls=type(self).__name__, + own_freq=self.freqstr, + other_freq=target.freqstr, + ) raise IncompatibleFrequency(msg) if isinstance(target, PeriodIndex): @@ -625,8 +646,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: tolerance = self._convert_tolerance(tolerance, target) - return Index.get_indexer(self._int64index, target, method, - limit, tolerance) + return Index.get_indexer(self._int64index, target, method, limit, tolerance) def _get_unique_index(self, dropna=False): """ @@ -682,8 +702,7 @@ def get_loc(self, key, method=None, tolerance=None): try: ordinal = iNaT if key is NaT else key.ordinal if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, - np.asarray(key)) + tolerance = self._convert_tolerance(tolerance, np.asarray(key)) return self._int64index.get_loc(ordinal, method, tolerance) except KeyError: @@ -709,7 +728,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. """ - assert kind in ['ix', 'loc', 'getitem'] + assert kind in ["ix", "loc", "getitem"] if isinstance(label, datetime): return Period(label, freq=self.freq) @@ -717,86 +736,105 @@ def _maybe_cast_slice_bound(self, label, side, kind): try: _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) - return bounds[0 if side == 'left' else 1] + return bounds[0 if side == "left" else 1] except Exception: raise KeyError(label) elif is_integer(label) or is_float(label): - self._invalid_indexer('slice', label) + self._invalid_indexer("slice", label) return label def _parsed_string_to_bounds(self, reso, parsed): - if reso == 'year': - t1 = Period(year=parsed.year, freq='A') - elif reso == 'month': - t1 = Period(year=parsed.year, month=parsed.month, freq='M') - elif reso == 'quarter': + if reso == "year": + t1 = Period(year=parsed.year, freq="A") + elif reso == "month": + t1 = Period(year=parsed.year, month=parsed.month, freq="M") + elif reso == "quarter": q = (parsed.month - 1) // 3 + 1 - t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC') - elif reso == 'day': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - freq='D') - elif reso == 'hour': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, freq='H') - elif reso == 'minute': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, minute=parsed.minute, freq='T') - elif reso == 'second': - t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, - hour=parsed.hour, minute=parsed.minute, - second=parsed.second, freq='S') + t1 = Period(year=parsed.year, quarter=q, freq="Q-DEC") + elif reso == "day": + t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq="D") + elif reso == "hour": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + freq="H", + ) + elif reso == "minute": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + freq="T", + ) + elif reso == "second": + t1 = Period( + year=parsed.year, + month=parsed.month, + day=parsed.day, + hour=parsed.hour, + minute=parsed.minute, + second=parsed.second, + freq="S", + ) else: raise KeyError(reso) - return (t1.asfreq(self.freq, how='start'), - t1.asfreq(self.freq, how='end')) + return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) def _get_string_slice(self, key): if not self.is_monotonic: - raise 
ValueError('Partial indexing only valid for ' - 'ordered time series') + raise ValueError("Partial indexing only valid for " "ordered time series") key, parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn: + if reso in ["day", "hour", "minute", "second"] and not grp < freqn: raise KeyError(key) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - return slice(self.searchsorted(t1.ordinal, side='left'), - self.searchsorted(t2.ordinal, side='right')) + return slice( + self.searchsorted(t1.ordinal, side="left"), + self.searchsorted(t2.ordinal, side="right"), + ) def _convert_tolerance(self, tolerance, target): - tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, - target) + tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target) if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') + raise ValueError("list-like tolerance size must match " "target index size") return self._maybe_convert_timedelta(tolerance) def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: return self.astype(object).insert(loc, item) - idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), - self[loc:].asi8)) + idx = np.concatenate( + (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) + ) return self._shallow_copy(idx) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): + def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ See Index.join """ self._assert_can_do_setop(other) if not isinstance(other, PeriodIndex): - return self.astype(object).join(other, how=how, level=level, - return_indexers=return_indexers, - sort=sort) - - result = Int64Index.join(self, other, how=how, level=level, - return_indexers=return_indexers, - sort=sort) + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) if return_indexers: result, lidx, ridx = result @@ -813,9 +851,9 @@ def _assert_can_do_setop(self, other): # *Can't* use PeriodIndexes of different freqs # *Can* use PeriodIndex/DatetimeIndex if isinstance(other, PeriodIndex) and self.freq != other.freq: - msg = DIFFERENT_FREQ.format(cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=other.freqstr) + msg = DIFFERENT_FREQ.format( + cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr + ) raise IncompatibleFrequency(msg) def _wrap_setop_result(self, other, result): @@ -826,8 +864,7 @@ def _wrap_setop_result(self, other, result): def _apply_meta(self, rawarr): if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, - name=self.name) + rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) return rawarr def __setstate__(self, state): @@ -863,9 +900,12 @@ def __setstate__(self, state): @property def flags(self): """ return the ndarray.flags for the underlying data """ - warnings.warn("{obj}.flags is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.flags is deprecated and will be removed " + "in a future 
version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return self._ndarray_values.flags def item(self): @@ -876,22 +916,29 @@ def item(self): .. deprecated 0.25.0 """ - warnings.warn('`item` has been deprecated and will be removed in a ' - 'future version', FutureWarning, stacklevel=2) + warnings.warn( + "`item` has been deprecated and will be removed in a " "future version", + FutureWarning, + stacklevel=2, + ) # TODO(DatetimeArray): remove if len(self) == 1: return self[0] else: # copy numpy's message here because Py26 raises an IndexError - raise ValueError('can only convert an array of size 1 to a ' - 'Python scalar') + raise ValueError( + "can only convert an array of size 1 to a " "Python scalar" + ) @property def data(self): """ return the data pointer of the underlying data """ - warnings.warn("{obj}.data is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.data is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return np.asarray(self._data).data @property @@ -899,9 +946,12 @@ def base(self): """ return the base object if the memory of the underlying data is shared """ - warnings.warn("{obj}.base is deprecated and will be removed " - "in a future version".format(obj=type(self).__name__), - FutureWarning, stacklevel=2) + warnings.warn( + "{obj}.base is deprecated and will be removed " + "in a future version".format(obj=type(self).__name__), + FutureWarning, + stacklevel=2, + ) return np.asarray(self._data) @@ -963,13 +1013,13 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None): dtype='period[M]', freq='M') """ if com.count_not_none(start, end, periods) != 2: - raise ValueError('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') - if freq is None and (not isinstance(start, Period) - and not isinstance(end, Period)): - freq = 'D' - - data, freq = PeriodArray._generate_range(start, end, periods, freq, - fields={}) + raise ValueError( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) + if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)): + freq = "D" + + data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={}) data = PeriodArray(data, freq=freq) return PeriodIndex(data, name=name) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 70ca0b349e7ed..16098c474a473 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -13,10 +13,16 @@ from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.common import ( - ensure_platform_int, ensure_python_int, is_int64_dtype, is_integer, - is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype) -from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCSeries, ABCTimedeltaIndex) + ensure_platform_int, + ensure_python_int, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, +) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex from pandas.core import ops import pandas.core.common as com @@ -65,7 +71,7 @@ class RangeIndex(Int64Index): Int64Index : Index of int64 data. 
""" - _typ = 'rangeindex' + _typ = "rangeindex" _engine_type = libindex.Int64Engine _range = None # type: range @@ -74,13 +80,24 @@ class RangeIndex(Int64Index): # -------------------------------------------------------------------- # Constructors - def __new__(cls, start=None, stop=None, step=None, - dtype=None, copy=False, name=None, fastpath=None): + def __new__( + cls, + start=None, + stop=None, + step=None, + dtype=None, + copy=False, + name=None, + fastpath=None, + ): if fastpath is not None: - warnings.warn("The 'fastpath' keyword is deprecated, and will be " - "removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'fastpath' keyword is deprecated, and will be " + "removed in a future version.", + FutureWarning, + stacklevel=2, + ) if fastpath: return cls._simple_new(range(start, stop, step), name=name) @@ -121,8 +138,9 @@ def from_range(cls, data, name=None, dtype=None): """ if not isinstance(data, range): raise TypeError( - '{0}(...) must be called with object coercible to a ' - 'range, {1} was passed'.format(cls.__name__, repr(data))) + "{0}(...) must be called with object coercible to a " + "range, {1} was passed".format(cls.__name__, repr(data)) + ) cls._validate_dtype(dtype) return cls._simple_new(data, dtype=dtype, name=name) @@ -153,7 +171,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): def _validate_dtype(dtype): """ require dtype to be None or int64 """ if not (dtype is None or is_int64_dtype(dtype)): - raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') + raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex") @cache_readonly def _constructor(self): @@ -170,8 +188,9 @@ def _data(self): triggering the construction. """ if self._cached_data is None: - self._cached_data = np.arange(self.start, self.stop, self.step, - dtype=np.int64) + self._cached_data = np.arange( + self.start, self.stop, self.step, dtype=np.int64 + ) return self._cached_data @cache_readonly @@ -181,9 +200,7 @@ def _int64index(self): def _get_data_as_items(self): """ return a list of tuples of start, stop, step """ rng = self._range - return [('start', rng.start), - ('stop', rng.stop), - ('step', rng.step)] + return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)] def __reduce__(self): d = self._get_attributes_dict() @@ -199,20 +216,22 @@ def _format_attrs(self): """ attrs = self._get_data_as_items() if self.name is not None: - attrs.append(('name', ibase.default_pprint(self.name))) + attrs.append(("name", ibase.default_pprint(self.name))) return attrs def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep='NaN', **kwargs): + def _format_with_header(self, header, na_rep="NaN", **kwargs): return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- - _deprecation_message = ("RangeIndex.{} is deprecated and will be " - "removed in a future version. Use RangeIndex.{} " - "instead") + _deprecation_message = ( + "RangeIndex.{} is deprecated and will be " + "removed in a future version. Use RangeIndex.{} " + "instead" + ) @cache_readonly def start(self): @@ -230,8 +249,11 @@ def _start(self): .. deprecated:: 0.25.0 Use ``start`` instead. 
""" - warnings.warn(self._deprecation_message.format("_start", "start"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_start", "start"), + DeprecationWarning, + stacklevel=2, + ) return self.start @cache_readonly @@ -250,8 +272,11 @@ def _stop(self): Use ``stop`` instead. """ # GH 25710 - warnings.warn(self._deprecation_message.format("_stop", "stop"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_stop", "stop"), + DeprecationWarning, + stacklevel=2, + ) return self.stop @cache_readonly @@ -271,8 +296,11 @@ def _step(self): Use ``step`` instead. """ # GH 25710 - warnings.warn(self._deprecation_message.format("_step", "step"), - DeprecationWarning, stacklevel=2) + warnings.warn( + self._deprecation_message.format("_step", "step"), + DeprecationWarning, + stacklevel=2, + ) return self.step @cache_readonly @@ -281,8 +309,10 @@ def nbytes(self): Return the number of bytes in the underlying data. """ rng = self._range - return getsizeof(rng) + sum(getsizeof(getattr(rng, attr_name)) - for attr_name in ['start', 'stop', 'step']) + return getsizeof(rng) + sum( + getsizeof(getattr(rng, attr_name)) + for attr_name in ["start", "stop", "step"] + ) def memory_usage(self, deep=False): """ @@ -338,7 +368,7 @@ def __contains__(self, key: Union[int, np.integer]) -> bool: return False return key in self._range - @Appender(_index_shared_docs['get_loc']) + @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): if is_integer(key) and method is None and tolerance is None: new_key = int(key) @@ -348,25 +378,21 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) - @Appender(_index_shared_docs['get_indexer']) + @Appender(_index_shared_docs["get_indexer"]) def get_indexer(self, target, method=None, limit=None, tolerance=None): if not (method is None and tolerance is None and is_list_like(target)): - return super().get_indexer(target, method=method, - tolerance=tolerance) + return super().get_indexer(target, method=method, tolerance=tolerance) if self.step > 0: start, stop, step = self.start, self.stop, self.step else: # Work on reversed range for simplicity: - start, stop, step = (self.stop - self.step, - self.start + 1, - - self.step) + start, stop, step = (self.stop - self.step, self.start + 1, -self.step) target_array = np.asarray(target) if not (is_integer_dtype(target_array) and target_array.ndim == 1): # checks/conversions/roundings are delegated to general method - return super().get_indexer(target, method=method, - tolerance=tolerance) + return super().get_indexer(target, method=method, tolerance=tolerance) locs = target_array - start valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) @@ -381,16 +407,16 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) - @Appender(_index_shared_docs['_shallow_copy']) + @Appender(_index_shared_docs["_shallow_copy"]) def _shallow_copy(self, values=None, **kwargs): if values is None: name = kwargs.get("name", self.name) return self._simple_new(self._range, name=name) else: - kwargs.setdefault('name', self.name) + kwargs.setdefault("name", self.name) return self._int64index._shallow_copy(values, **kwargs) - @Appender(ibase._index_shared_docs['copy']) + @Appender(ibase._index_shared_docs["copy"]) def copy(self, name=None, deep=False, dtype=None, **kwargs): self._validate_dtype(dtype) if 
name is None: @@ -401,8 +427,7 @@ def _minmax(self, meth): no_steps = len(self) - 1 if no_steps == -1: return np.nan - elif ((meth == 'min' and self.step > 0) or - (meth == 'max' and self.step < 0)): + elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0): return self.start return self.start + self.step * no_steps @@ -411,13 +436,13 @@ def min(self, axis=None, skipna=True, *args, **kwargs): """The minimum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_min(args, kwargs) - return self._minmax('min') + return self._minmax("min") def max(self, axis=None, skipna=True, *args, **kwargs): """The maximum value of the RangeIndex""" nv.validate_minmax_axis(axis) nv.validate_max(args, kwargs) - return self._minmax('max') + return self._minmax("max") def argsort(self, *args, **kwargs): """ @@ -501,8 +526,7 @@ def intersection(self, other, sort=False): # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds - tmp_start = first.start + (second.start - first.start) * \ - first.step // gcd * s + tmp_start = first.start + (second.start - first.start) * first.step // gcd * s new_step = first.step * second.step // gcd new_range = range(tmp_start, int_high, new_step) new_index = self._simple_new(new_range) @@ -586,35 +610,39 @@ def _union(self, other, sort): start_r = min(start_s, start_o) end_r = max(end_s, end_o) if step_o == step_s: - if ((start_s - start_o) % step_s == 0 and - (start_s - end_o) <= step_s and - (start_o - end_s) <= step_s): + if ( + (start_s - start_o) % step_s == 0 + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): return self.__class__(start_r, end_r + step_s, step_s) - if ((step_s % 2 == 0) and - (abs(start_s - start_o) <= step_s / 2) and - (abs(end_s - end_o) <= step_s / 2)): - return self.__class__(start_r, - end_r + step_s / 2, - step_s / 2) + if ( + (step_s % 2 == 0) + and (abs(start_s - start_o) <= step_s / 2) + and (abs(end_s - end_o) <= step_s / 2) + ): + return self.__class__(start_r, end_r + step_s / 2, step_s / 2) elif step_o % step_s == 0: - if ((start_o - start_s) % step_s == 0 and - (start_o + step_s >= start_s) and - (end_o - step_s <= end_s)): + if ( + (start_o - start_s) % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): return self.__class__(start_r, end_r + step_s, step_s) elif step_s % step_o == 0: - if ((start_s - start_o) % step_o == 0 and - (start_s + step_o >= start_o) and - (end_s - step_o <= end_o)): + if ( + (start_s - start_o) % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): return self.__class__(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - if how == 'outer' and self is not other: + @Appender(_index_shared_docs["join"]) + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + if how == "outer" and self is not other: # note: could return RangeIndex in more circumstances - return self._int64index.join(other, how, level, return_indexers, - sort) + return self._int64index.join(other, how, level, return_indexers, sort) return super().join(other, how, level, return_indexers, sort) @@ -643,14 +671,17 @@ def __getitem__(self, key): try: return self._range[new_key] except IndexError: - raise IndexError("index {key} is out of bounds for axis 0 " - "with size {size}".format(key=key, - 
size=len(self))) + raise IndexError( + "index {key} is out of bounds for axis 0 " + "with size {size}".format(key=key, size=len(self)) + ) elif is_scalar(key): - raise IndexError("only integers, slices (`:`), " - "ellipsis (`...`), numpy.newaxis (`None`) " - "and integer or boolean " - "arrays are valid indices") + raise IndexError( + "only integers, slices (`:`), " + "ellipsis (`...`), numpy.newaxis (`None`) " + "and integer or boolean " + "arrays are valid indices" + ) # fall back to Int64Index return super().__getitem__(key) @@ -659,9 +690,7 @@ def __floordiv__(self, other): return NotImplemented if is_integer(other) and other != 0: - if (len(self) == 0 or - self.start % other == 0 and - self.step % other == 0): + if len(self) == 0 or self.start % other == 0 and self.step % other == 0: start = self.start // other step = self.step // other stop = start + len(self) * step @@ -717,7 +746,7 @@ def _evaluate_numeric_binop(self, other): try: # apply if we have an override if step: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): rstep = step(left.step, right) # we don't have a representable op @@ -728,7 +757,7 @@ def _evaluate_numeric_binop(self, other): else: rstep = left.step - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): rstart = op(left.start, right) rstop = op(left.stop, right) @@ -737,9 +766,8 @@ def _evaluate_numeric_binop(self, other): # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors - if not all(is_integer(x) for x in - [rstart, rstop, rstep]): - result = result.astype('float64') + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") return result @@ -748,7 +776,7 @@ def _evaluate_numeric_binop(self, other): return op(self._int64index, other) # TODO: Do attrs get handled reliably? 
- name = '__{name}__'.format(name=op.__name__) + name = "__{name}__".format(name=op.__name__) return compat.set_function_name(_evaluate_numeric_binop, name, cls) cls.__add__ = _make_evaluate_binop(operator.add) @@ -757,10 +785,8 @@ def _evaluate_numeric_binop(self, other): cls.__rsub__ = _make_evaluate_binop(ops.rsub) cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) - cls.__truediv__ = _make_evaluate_binop(operator.truediv, - step=operator.truediv) - cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, - step=ops.rtruediv) + cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv) RangeIndex._add_numeric_methods() diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ba5507fa71e8c..29ed3c6b97318 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -4,13 +4,20 @@ import numpy as np -from pandas._libs import ( - NaT, Timedelta, index as libindex, join as libjoin, lib) +from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( - _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, - is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype) + _TD_DTYPE, + ensure_int64, + is_float, + is_integer, + is_list_like, + is_scalar, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -21,8 +28,11 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, maybe_unwrap_index, - wrap_arithmetic_op) + DatetimeIndexOpsMixin, + DatetimelikeDelegateMixin, + maybe_unwrap_index, + wrap_arithmetic_op, +) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name @@ -47,28 +57,24 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. 
_delegate_class = TimedeltaArray - _delegated_properties = (TimedeltaArray._datetimelike_ops + [ - 'components', - ]) - _delegated_methods = TimedeltaArray._datetimelike_methods + [ - '_box_values', - ] - _raw_properties = { - 'components', - } - _raw_methods = { - 'to_pytimedelta', - } - - -@delegate_names(TimedeltaArray, - TimedeltaDelegateMixin._delegated_properties, - typ="property") -@delegate_names(TimedeltaArray, - TimedeltaDelegateMixin._delegated_methods, - typ="method", overwrite=False) -class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, - TimedeltaDelegateMixin): + _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] + _delegated_methods = TimedeltaArray._datetimelike_methods + ["_box_values"] + _raw_properties = {"components"} + _raw_methods = {"to_pytimedelta"} + + +@delegate_names( + TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" +) +@delegate_names( + TimedeltaArray, + TimedeltaDelegateMixin._delegated_methods, + typ="method", + overwrite=False, +) +class TimedeltaIndex( + DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin +): """ Immutable ndarray of timedelta64 data, represented internally as int64, and which can be boxed to timedelta objects @@ -148,23 +154,23 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, been deprecated in favor of :func:`timedelta_range`. """ - _typ = 'timedeltaindex' + _typ = "timedeltaindex" _join_precedence = 10 def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper( - joinf, dtype='m8[ns]', **kwargs) + return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False) + libjoin.left_join_indexer_unique_int64, with_indexers=False + ) _engine_type = libindex.TimedeltaEngine - _comparables = ['name', 'freq'] - _attributes = ['name', 'freq'] + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] _is_numeric_dtype = True _infer_as_myclass = True @@ -181,44 +187,67 @@ def _join_i8_wrapper(joinf, **kwargs): # ------------------------------------------------------------------- # Constructors - def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, - periods=None, closed=None, dtype=_TD_DTYPE, copy=False, - name=None, verify_integrity=None): + def __new__( + cls, + data=None, + unit=None, + freq=None, + start=None, + end=None, + periods=None, + closed=None, + dtype=_TD_DTYPE, + copy=False, + name=None, + verify_integrity=None, + ): if verify_integrity is not None: - warnings.warn("The 'verify_integrity' argument is deprecated, " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + warnings.warn( + "The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) else: verify_integrity = True if data is None: freq, freq_infer = dtl.maybe_infer_freq(freq) - warnings.warn("Creating a TimedeltaIndex by passing range " - "endpoints is deprecated. 
Use " - "`pandas.timedelta_range` instead.", - FutureWarning, stacklevel=2) - result = TimedeltaArray._generate_range(start, end, periods, freq, - closed=closed) + warnings.warn( + "Creating a TimedeltaIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.timedelta_range` instead.", + FutureWarning, + stacklevel=2, + ) + result = TimedeltaArray._generate_range( + start, end, periods, freq, closed=closed + ) return cls._simple_new(result._data, freq=freq, name=name) if is_scalar(data): - raise TypeError('{cls}() must be called with a ' - 'collection of some kind, {data} was passed' - .format(cls=cls.__name__, data=repr(data))) - - if unit in {'Y', 'y', 'M'}: - warnings.warn("M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + raise TypeError( + "{cls}() must be called with a " + "collection of some kind, {data} was passed".format( + cls=cls.__name__, data=repr(data) + ) + ) + + if unit in {"Y", "y", "M"}: + warnings.warn( + "M and Y units are deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) if isinstance(data, TimedeltaArray): if copy: data = data.copy() return cls._simple_new(data, name=name, freq=freq) - if (isinstance(data, TimedeltaIndex) and - freq is None and name is None): + if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: return data.copy() else: @@ -226,8 +255,9 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, # - Cases checked above all return/raise before reaching here - # - tdarr = TimedeltaArray._from_sequence(data, freq=freq, unit=unit, - dtype=dtype, copy=copy) + tdarr = TimedeltaArray._from_sequence( + data, freq=freq, unit=unit, dtype=dtype, copy=copy + ) return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name) @classmethod @@ -235,14 +265,13 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if present if not isinstance(values, TimedeltaArray): - values = TimedeltaArray._simple_new(values, dtype=dtype, - freq=freq) + values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) else: if freq is None: freq = values.freq assert isinstance(values, TimedeltaArray), type(values) assert dtype == _TD_DTYPE, dtype - assert values.dtype == 'm8[ns]', values.dtype + assert values.dtype == "m8[ns]", values.dtype tdarr = TimedeltaArray._simple_new(values._data, freq=freq) result = object.__new__(cls) @@ -262,14 +291,15 @@ def __setstate__(self, state): super().__setstate__(state) else: raise Exception("invalid pickle state") + _unpickle_compat = __setstate__ def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
freq) depending on op """
-        freq = attrs.get('freq', None)
+        freq = attrs.get("freq", None)
         if freq is not None:
             # no need to infer if freq is None
-            attrs['freq'] = 'infer'
+            attrs["freq"] = "infer"
         return attrs

     # -------------------------------------------------------------------
@@ -278,13 +308,15 @@ def _maybe_update_attributes(self, attrs):
     @property
     def _formatter_func(self):
         from pandas.io.formats.format import _get_format_timedelta64
+
         return _get_format_timedelta64(self, box=True)

-    def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+    def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
         from pandas.io.formats.format import Timedelta64Formatter
-        return Timedelta64Formatter(values=self,
-                                    nat_rep=na_rep,
-                                    justify='all').get_result()
+
+        return Timedelta64Formatter(
+            values=self, nat_rep=na_rep, justify="all"
+        ).get_result()

     # -------------------------------------------------------------------
     # Wrapping TimedeltaArray
@@ -307,7 +339,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):

     @property
     def _box_func(self):
-        return lambda x: Timedelta(x, unit='ns')
+        return lambda x: Timedelta(x, unit="ns")

     def __getitem__(self, key):
         result = self._data.__getitem__(key)
@@ -317,7 +349,7 @@ def __getitem__(self, key):

     # -------------------------------------------------------------------

-    @Appender(_index_shared_docs['astype'])
+    @Appender(_index_shared_docs["astype"])
     def astype(self, dtype, copy=True):
         dtype = pandas_dtype(dtype)
         if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
@@ -327,7 +359,7 @@ def astype(self, dtype, copy=True):
             result = self._data.astype(dtype, copy=copy)
             if self.hasnans:
                 return Index(result, name=self.name)
-            return Index(result.astype('i8'), name=self.name)
+            return Index(result.astype("i8"), name=self.name)
         return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy)

     def _union(self, other, sort):
@@ -350,8 +382,7 @@ def _union(self, other, sort):
             result.freq = to_offset(result.inferred_freq)
         return result

-    def join(self, other, how='left', level=None, return_indexers=False,
-             sort=False):
+    def join(self, other, how="left", level=None, return_indexers=False, sort=False):
         """
         See Index.join
         """
@@ -361,9 +392,14 @@ def join(self, other, how='left', level=None, return_indexers=False,
         except (TypeError, ValueError):
             pass

-        return Index.join(self, other, how=how, level=level,
-                          return_indexers=return_indexers,
-                          sort=sort)
+        return Index.join(
+            self,
+            other,
+            how=how,
+            level=level,
+            return_indexers=return_indexers,
+            sort=sort,
+        )

     def intersection(self, other, sort=False):
         """
@@ -395,8 +431,11 @@ def intersection(self, other, sort=False):

     def _wrap_joined_index(self, joined, other):
         name = get_op_result_name(self, other)
-        if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and
-                self._can_fast_union(other)):
+        if (
+            isinstance(other, TimedeltaIndex)
+            and self.freq == other.freq
+            and self._can_fast_union(other)
+        ):
             joined = self._shallow_copy(joined, name=name)
             return joined
         else:
@@ -447,7 +486,7 @@ def _fast_union(self, other):

         # concatenate
         if left_end < right_end:
-            loc = right.searchsorted(left_end, side='right')
+            loc = right.searchsorted(left_end, side="right")
             right_chunk = right.values[loc:]
             dates = _concat._concat_compat((left.values, right_chunk))
             return self._shallow_copy(dates)
@@ -455,7 +494,7 @@ def _fast_union(self, other):
             return left

     def _maybe_promote(self, other):
-        if other.inferred_type == 'timedelta':
+        if other.inferred_type == "timedelta":
             other = TimedeltaIndex(other)
         return self, other

@@ -470,8 +509,7 @@ def get_value(self, series, key):
             return self.get_value_maybe_box(series, key)

         try:
-            return com.maybe_box(self, Index.get_value(self, series, key),
-                                 series, key)
+            return com.maybe_box(self, Index.get_value(self, series, key), series, key)
         except KeyError:
             try:
                 loc = self._get_string_slice(key)
@@ -547,25 +585,23 @@ def _maybe_cast_slice_bound(self, label, side, kind):
         label : object

         """
-        assert kind in ['ix', 'loc', 'getitem', None]
+        assert kind in ["ix", "loc", "getitem", None]

         if isinstance(label, str):
             parsed = Timedelta(label)
             lbound = parsed.round(parsed.resolution_string)
-            if side == 'left':
+            if side == "left":
                 return lbound
             else:
-                return (lbound + to_offset(parsed.resolution_string) -
-                        Timedelta(1, 'ns'))
-        elif ((is_integer(label) or is_float(label)) and
-                not is_timedelta64_dtype(label)):
-            self._invalid_indexer('slice', label)
+                return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
+        elif (is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label):
+            self._invalid_indexer("slice", label)

         return label

     def _get_string_slice(self, key):
         if is_integer(key) or is_float(key) or key is NaT:
-            self._invalid_indexer('slice', key)
+            self._invalid_indexer("slice", key)

         loc = self._partial_td_slice(key)
         return loc
@@ -577,9 +613,9 @@ def _partial_td_slice(self, key):

         raise NotImplementedError

-    @Substitution(klass='TimedeltaIndex')
-    @Appender(_shared_docs['searchsorted'])
-    def searchsorted(self, value, side='left', sorter=None):
+    @Substitution(klass="TimedeltaIndex")
+    @Appender(_shared_docs["searchsorted"])
+    def searchsorted(self, value, side="left", sorter=None):
         if isinstance(value, (np.ndarray, Index)):
             value = np.array(value, dtype=_TD_DTYPE, copy=False)
         else:
@@ -588,11 +624,11 @@ def searchsorted(self, value, side='left', sorter=None):
         return self.values.searchsorted(value, side=side, sorter=sorter)

     def is_type_compatible(self, typ):
-        return typ == self.inferred_type or typ == 'timedelta'
+        return typ == self.inferred_type or typ == "timedelta"

     @property
     def inferred_type(self):
-        return 'timedelta64'
+        return "timedelta64"

     @property
     def is_all_dates(self):
@@ -628,16 +664,16 @@ def insert(self, loc, item):

         # check freq can be preserved on edge cases
         if self.freq is not None:
-            if ((loc == 0 or loc == -len(self)) and
-                    item + self.freq == self[0]):
+            if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]:
                 freq = self.freq
             elif (loc == len(self)) and item - self.freq == self[-1]:
                 freq = self.freq
         item = Timedelta(item).asm8.view(_TD_DTYPE)

         try:
-            new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
-                                      self[loc:].asi8))
+            new_tds = np.concatenate(
+                (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)
+            )
             return self._shallow_copy(new_tds, freq=freq)

         except (AttributeError, TypeError):
@@ -645,8 +681,7 @@ def insert(self, loc, item):
             # fall back to object index
             if isinstance(item, str):
                 return self.astype(object).insert(loc, item)
-            raise TypeError(
-                "cannot insert TimedeltaIndex with incompatible label")
+            raise TypeError("cannot insert TimedeltaIndex with incompatible label")

     def delete(self, loc):
         """
@@ -663,16 +698,15 @@ def delete(self, loc):
         """
         new_tds = np.delete(self.asi8, loc)

-        freq = 'infer'
+        freq = "infer"
         if is_integer(loc):
             if loc in (0, -len(self), -1, len(self) - 1):
                 freq = self.freq
         else:
             if is_list_like(loc):
-                loc = lib.maybe_indices_to_slice(
-                    ensure_int64(np.array(loc)), len(self))
+                loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self))
             if isinstance(loc, slice) and loc.step in (1, None):
-                if (loc.start in (0, None) or loc.stop in (len(self), None)):
+                if loc.start in (0, None) or loc.stop in (len(self), None):
                     freq = self.freq

         return TimedeltaIndex(new_tds, name=self.name, freq=freq)
@@ -690,15 +724,20 @@ def _is_convertible_to_index(other):
     """
     if isinstance(other, TimedeltaIndex):
         return True
-    elif (len(other) > 0 and
-          other.inferred_type not in ('floating', 'mixed-integer', 'integer',
-                                      'mixed-integer-float', 'mixed')):
+    elif len(other) > 0 and other.inferred_type not in (
+        "floating",
+        "mixed-integer",
+        "integer",
+        "mixed-integer-float",
+        "mixed",
+    ):
         return True
     return False


-def timedelta_range(start=None, end=None, periods=None, freq=None,
-                    name=None, closed=None):
+def timedelta_range(
+    start=None, end=None, periods=None, freq=None, name=None, closed=None
+):
     """
     Return a fixed frequency TimedeltaIndex, with day as the default
     frequency
@@ -765,9 +804,8 @@ def timedelta_range(start=None, end=None, periods=None, freq=None,
                    dtype='timedelta64[ns]', freq=None)
     """
     if freq is None and com._any_none(periods, start, end):
-        freq = 'D'
+        freq = "D"

     freq, freq_infer = dtl.maybe_infer_freq(freq)
-    tdarr = TimedeltaArray._generate_range(start, end, periods, freq,
-                                           closed=closed)
+    tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed)
     return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 677aefa15d200..ccc3a027af70d 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -9,8 +9,17 @@
 from pandas.util._decorators import Appender

 from pandas.core.dtypes.common import (
-    ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator,
-    is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse)
+    ensure_platform_int,
+    is_float,
+    is_integer,
+    is_integer_dtype,
+    is_iterator,
+    is_list_like,
+    is_numeric_dtype,
+    is_scalar,
+    is_sequence,
+    is_sparse,
+)
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
 from pandas.core.dtypes.missing import _infer_fill_value, isna

@@ -22,11 +31,11 @@

 def get_indexers_list():
     return [
-        ('ix', _IXIndexer),
-        ('iloc', _iLocIndexer),
-        ('loc', _LocIndexer),
-        ('at', _AtIndexer),
-        ('iat', _iAtIndexer),
+        ("ix", _IXIndexer),
+        ("iloc", _iLocIndexer),
+        ("loc", _LocIndexer),
+        ("at", _AtIndexer),
+        ("iat", _iAtIndexer),
     ]

@@ -102,12 +111,11 @@ def __call__(self, axis=None):
         return new_self

     def __iter__(self):
-        raise NotImplementedError('ix is not iterable')
+        raise NotImplementedError("ix is not iterable")

     def __getitem__(self, key):
         if type(key) is tuple:
-            key = tuple(com.apply_if_callable(x, self.obj)
-                        for x in key)
+            key = tuple(com.apply_if_callable(x, self.obj) for x in key)
             try:
                 values = self.obj._get_value(*key)
                 if is_scalar(values):
@@ -134,7 +142,7 @@ def _get_label(self, label, axis=None):
             # see GH5667
             return self.obj._xs(label, axis=axis)
         elif isinstance(label, tuple) and isinstance(label[axis], slice):
-            raise IndexingError('no slices here, handle elsewhere')
+            raise IndexingError("no slices here, handle elsewhere")

         return self.obj._xs(label, axis=axis)

@@ -154,7 +162,7 @@ def _get_setitem_indexer(self, key):

         axis = self.obj._get_axis(0)

-        if isinstance(axis, MultiIndex) and self.name != 'iloc':
+        if isinstance(axis, MultiIndex) and self.name != "iloc":
             try:
                 return axis.get_loc(key)
             except Exception:
@@ -174,14 +182,13 @@ def
_get_setitem_indexer(self, key): except TypeError as e: # invalid indexer type vs 'other' indexing errors - if 'cannot do' in str(e): + if "cannot do" in str(e): raise raise IndexingError(key) def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: key = com.apply_if_callable(key, self.obj) indexer = self._get_setitem_indexer(key) @@ -216,13 +223,14 @@ def _has_valid_tuple(self, key): """ check the key for valid keys across my indexer """ for i, k in enumerate(key): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") try: self._validate_key(k, i) except ValueError: - raise ValueError("Location based indexing can only have " - "[{types}] types" - .format(types=self._valid_types)) + raise ValueError( + "Location based indexing can only have " + "[{types}] types".format(types=self._valid_types) + ) def _is_nested_tuple_indexer(self, tup): if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): @@ -235,14 +243,15 @@ def _convert_tuple(self, key, is_setter=False): axis = self.obj._get_axis_number(self.axis) for i in range(self.ndim): if i == axis: - keyidx.append(self._convert_to_indexer( - key, axis=axis, is_setter=is_setter)) + keyidx.append( + self._convert_to_indexer(key, axis=axis, is_setter=is_setter) + ) else: keyidx.append(slice(None)) else: for i, k in enumerate(key): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) return tuple(keyidx) @@ -272,8 +281,7 @@ def _has_valid_positional_setitem_indexer(self, indexer): will raise if needed, does not modify the indexer externally """ if isinstance(indexer, dict): - raise IndexError("{0} cannot enlarge its target object" - .format(self.name)) + raise IndexError("{0} cannot enlarge its target object".format(self.name)) else: if not isinstance(indexer, tuple): indexer = self._tuplify(indexer) @@ -286,11 +294,14 @@ def _has_valid_positional_setitem_indexer(self, indexer): pass elif is_integer(i): if i >= len(ax): - raise IndexError("{name} cannot enlarge its target " - "object".format(name=self.name)) + raise IndexError( + "{name} cannot enlarge its target " + "object".format(name=self.name) + ) elif isinstance(i, dict): - raise IndexError("{name} cannot enlarge its target object" - .format(name=self.name)) + raise IndexError( + "{name} cannot enlarge its target object".format(name=self.name) + ) return True @@ -299,6 +310,7 @@ def _setitem_with_indexer(self, indexer, value): # also has the side effect of consolidating in-place from pandas import Series + info_axis = self.obj._info_axis_number # maybe partial set @@ -309,8 +321,7 @@ def _setitem_with_indexer(self, indexer, value): if not take_split_path and self.obj._data.blocks: blk, = self.obj._data.blocks if 1 < blk.ndim: # in case of dict, keys are indices - val = list(value.values()) if isinstance(value, - dict) else value + val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): @@ -320,8 +331,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have any multi-indexes that have non-trivial slices # (not null slices) then we must take the split path, xref # GH 10360 - if (isinstance(ax, MultiIndex) and - not 
(is_integer(i) or com.is_null_slice(i))): + if isinstance(ax, MultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): take_split_path = True break @@ -346,13 +358,14 @@ def _setitem_with_indexer(self, indexer, value): # or a list-like on the non-info axes if we have a # list-like len_non_info_axes = ( - len(_ax) for _i, _ax in enumerate(self.obj.axes) - if _i != i + len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i ) if any(not l for l in len_non_info_axes): if not is_list_like_indexer(value): - raise ValueError("cannot set a frame with no " - "defined index and a scalar") + raise ValueError( + "cannot set a frame with no " + "defined index and a scalar" + ) self.obj[key] = value return self.obj @@ -360,7 +373,8 @@ def _setitem_with_indexer(self, indexer, value): self.obj[key] = _infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( - indexer, self.obj.axes) + indexer, self.obj.axes + ) self._setitem_with_indexer(new_indexer, value) return self.obj @@ -402,21 +416,19 @@ def _setitem_with_indexer(self, indexer, value): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - return self._setitem_with_indexer(new_indexer, - value) + return self._setitem_with_indexer(new_indexer, value) # this preserves dtype of the value new_values = Series([value])._values if len(self.obj._values): try: - new_values = np.concatenate([self.obj._values, - new_values]) + new_values = np.concatenate([self.obj._values, new_values]) except TypeError: as_obj = self.obj.astype(object) - new_values = np.concatenate([as_obj, - new_values]) + new_values = np.concatenate([as_obj, new_values]) self.obj._data = self.obj._constructor( - new_values, index=new_index, name=self.obj.name)._data + new_values, index=new_index, name=self.obj.name + )._data self.obj._maybe_update_cacher(clear=True) return self.obj @@ -424,14 +436,14 @@ def _setitem_with_indexer(self, indexer, value): # no columns and scalar if not len(self.obj.columns): - raise ValueError("cannot set a frame with no defined " - "columns") + raise ValueError( + "cannot set a frame with no defined " "columns" + ) # append a Series if isinstance(value, Series): - value = value.reindex(index=self.obj.columns, - copy=True) + value = value.reindex(index=self.obj.columns, copy=True) value.name = indexer # a list-list @@ -440,11 +452,11 @@ def _setitem_with_indexer(self, indexer, value): # must have conforming columns if is_list_like_indexer(value): if len(value) != len(self.obj.columns): - raise ValueError("cannot set a row with " - "mismatched columns") + raise ValueError( + "cannot set a row with " "mismatched columns" + ) - value = Series(value, index=self.obj.columns, - name=indexer) + value = Series(value, index=self.obj.columns, name=indexer) self.obj._data = self.obj.append(value)._data self.obj._maybe_update_cacher(clear=True) @@ -469,46 +481,48 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if (len(labels) == 1 and - isinstance(self.obj[labels[0]].axes[0], MultiIndex)): + if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): item = labels[0] obj = self.obj[item] index = obj.index idx = indexer[:info_axis][0] - plane_indexer = tuple([idx]) + indexer[info_axis + 1:] + plane_indexer = tuple([idx]) + indexer[info_axis + 1 :] lplane_indexer = length_of_indexer(plane_indexer[0], index) # require that we are setting the right number of values that # we are indexing - if 
is_list_like_indexer(value) and np.iterable( - value) and lplane_indexer != len(value): + if ( + is_list_like_indexer(value) + and np.iterable(value) + and lplane_indexer != len(value) + ): if len(obj[idx]) != len(value): - raise ValueError("cannot set using a multi-index " - "selection indexer with a different " - "length than the value") + raise ValueError( + "cannot set using a multi-index " + "selection indexer with a different " + "length than the value" + ) # make sure we have an ndarray - value = getattr(value, 'values', value).ravel() + value = getattr(value, "values", value).ravel() # we can directly set the series here # as we select a slice indexer on the mi idx = index._convert_slice_indexer(idx) obj._consolidate_inplace() obj = obj.copy() - obj._data = obj._data.setitem(indexer=tuple([idx]), - value=value) + obj._data = obj._data.setitem(indexer=tuple([idx]), value=value) self.obj[item] = obj return # non-mi else: - plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:] + plane_indexer = indexer[:info_axis] + indexer[info_axis + 1 :] if info_axis > 0: plane_axis = self.obj.axes[:info_axis][0] - lplane_indexer = length_of_indexer(plane_indexer[0], - plane_axis) + lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis) else: lplane_indexer = 0 @@ -521,10 +535,10 @@ def setter(item, v): # which means essentially reassign to the columns of a # multi-dim object # GH6149 (null slice), GH10408 (full bounds) - if (isinstance(pi, tuple) and - all(com.is_null_slice(idx) or - com.is_full_slice(idx, len(self.obj)) - for idx in pi)): + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): s = v else: # set the item, possibly having a dtype change @@ -538,8 +552,11 @@ def setter(item, v): def can_do_equal_len(): """ return True if we have an equal len settable """ - if (not len(labels) == 1 or not np.iterable(value) or - is_scalar(plane_indexer[0])): + if ( + not len(labels) == 1 + or not np.iterable(value) + or is_scalar(plane_indexer[0]) + ): return False item = labels[0] @@ -556,7 +573,7 @@ def can_do_equal_len(): # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) - if is_list_like_indexer(value) and getattr(value, 'ndim', 1) > 0: + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: # we have an equal len Frame if isinstance(value, ABCDataFrame) and value.ndim > 1: @@ -567,8 +584,8 @@ def can_do_equal_len(): if item in value: sub_indexer[info_axis] = item v = self._align_series( - tuple(sub_indexer), value[item], - multiindex_indexer) + tuple(sub_indexer), value[item], multiindex_indexer + ) else: v = np.nan @@ -578,16 +595,18 @@ def can_do_equal_len(): # hasattr first, to avoid coercing to ndarray without reason. # But we may be relying on the ndarray coercion to check ndim. # Why not just convert to an ndarray earlier on if needed? 
- elif ((hasattr(value, 'ndim') and value.ndim == 2) - or (not hasattr(value, 'ndim') and - np.array(value).ndim) == 2): + elif (hasattr(value, "ndim") and value.ndim == 2) or ( + not hasattr(value, "ndim") and np.array(value).ndim + ) == 2: # note that this coerces the dtype if we are mixed # GH 7551 value = np.array(value, dtype=object) if len(labels) != value.shape[1]: - raise ValueError('Must have equal len keys and value ' - 'when setting with an ndarray') + raise ValueError( + "Must have equal len keys and value " + "when setting with an ndarray" + ) for i, item in enumerate(labels): @@ -602,8 +621,10 @@ def can_do_equal_len(): else: if len(labels) != len(value): - raise ValueError('Must have equal len keys and value ' - 'when setting with an iterable') + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) for item, v in zip(labels, value): setter(item, v) @@ -620,12 +641,16 @@ def can_do_equal_len(): # if we are setting on the info axis ONLY # set using those methods to avoid block-splitting # logic here - if (len(indexer) > info_axis and - is_integer(indexer[info_axis]) and - all(com.is_null_slice(idx) - for i, idx in enumerate(indexer) - if i != info_axis) and - item_labels.is_unique): + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): self.obj[item_labels[indexer[info_axis]]] = value return @@ -643,8 +668,7 @@ def can_do_equal_len(): # actually do the set self.obj._consolidate_inplace() - self.obj._data = self.obj._data.setitem(indexer=indexer, - value=value) + self.obj._data = self.obj._data.setitem(indexer=indexer, value=value) self.obj._maybe_update_cacher(clear=True) def _align_series(self, indexer, ser, multiindex_indexer=False): @@ -677,6 +701,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): # flatten np.ndarray indexers def ravel(i): return i.ravel() if isinstance(i, np.ndarray) else i + indexer = tuple(map(ravel, indexer)) aligners = [not com.is_null_slice(idx) for idx in indexer] @@ -696,8 +721,7 @@ def ravel(i): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) - if (sum_aligners == self.ndim and - all(is_sequence(_) for _ in indexer)): + if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer): ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer @@ -741,7 +765,7 @@ def ravel(i): return ser.reindex(ax)._values - raise ValueError('Incompatible indexer with Series') + raise ValueError("Incompatible indexer with Series") def _align_frame(self, indexer, df): is_frame = self.obj.ndim == 2 @@ -772,8 +796,7 @@ def _align_frame(self, indexer, df): val = df.reindex(idx, columns=cols)._values return val - elif ((isinstance(indexer, slice) or is_list_like_indexer(indexer)) and - is_frame): + elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame: ax = self.obj.index[indexer] if df.index.equals(ax): val = df.copy()._values @@ -781,16 +804,20 @@ def _align_frame(self, indexer, df): # we have a multi-index and are trying to align # with a particular, level GH3738 - if (isinstance(ax, MultiIndex) and - isinstance(df.index, MultiIndex) and - ax.nlevels != df.index.nlevels): - raise TypeError("cannot align on a multi-index with out " - "specifying the join levels") + if ( + isinstance(ax, MultiIndex) + and isinstance(df.index, MultiIndex) + and ax.nlevels != 
df.index.nlevels + ): + raise TypeError( + "cannot align on a multi-index with out " + "specifying the join levels" + ) val = df.reindex(index=ax)._values return val - raise ValueError('Incompatible indexer with DataFrame') + raise ValueError("Incompatible indexer with DataFrame") def _getitem_tuple(self, tup): try: @@ -809,7 +836,7 @@ def _getitem_tuple(self, tup): retval = self.obj for i, key in enumerate(tup): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") if com.is_null_slice(key): continue @@ -859,8 +886,10 @@ def _multi_take(self, tup): """ # GH 836 o = self.obj - d = {axis: self._get_listlike_indexer(key, axis) - for (key, axis) in zip(tup, o._AXIS_ORDERS)} + d = { + axis: self._get_listlike_indexer(key, axis) + for (key, axis) in zip(tup, o._AXIS_ORDERS) + } return o._reindex_with_indexers(d, copy=True, allow_dups=True) def _convert_for_reindex(self, key, axis=None): @@ -878,8 +907,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup): except KeyError as ek: # raise KeyError if number of indexers match # else IndexingError will be raised - if (len(tup) <= self.obj.index.nlevels - and len(tup) > self.obj.ndim): + if len(tup) <= self.obj.index.nlevels and len(tup) > self.obj.ndim: raise ek except Exception as e1: if isinstance(tup[0], (slice, Index)): @@ -907,7 +935,7 @@ def _getitem_lowerdim(self, tup): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != 'iloc': + if isinstance(ax0, MultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -929,15 +957,18 @@ def _getitem_lowerdim(self, tup): elif section.ndim == self.ndim: # we're in the middle of slicing through a MultiIndex # revise the key wrt to `section` by inserting an _NS - new_key = tup[:i] + (_NS,) + tup[i + 1:] + new_key = tup[:i] + (_NS,) + tup[i + 1 :] else: - new_key = tup[:i] + tup[i + 1:] + new_key = tup[:i] + tup[i + 1 :] # unfortunately need an odious kludge here because of # DataFrame transposing convention - if (isinstance(section, ABCDataFrame) and i > 0 and - len(new_key) == 2): + if ( + isinstance(section, ABCDataFrame) + and i > 0 + and len(new_key) == 2 + ): a, b = new_key new_key = b, a @@ -951,7 +982,7 @@ def _getitem_lowerdim(self, tup): # This is an elided recursive call to iloc/loc/etc' return getattr(section, self.name)[new_key] - raise IndexingError('not applicable') + raise IndexingError("not applicable") def _getitem_nested_tuple(self, tup): # we have a nested tuple so have at least 1 multi-index level @@ -984,7 +1015,7 @@ def _getitem_nested_tuple(self, tup): axis += 1 # if we have a scalar, we are done - if is_scalar(obj) or not hasattr(obj, 'ndim'): + if is_scalar(obj) or not hasattr(obj, "ndim"): break # has the dim of the obj changed? 
@@ -1006,12 +1037,12 @@ def _getitem_axis(self, key, axis=None): labels = self.obj._get_axis(axis) if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) - elif (is_list_like_indexer(key) and - not (isinstance(key, tuple) and - isinstance(labels, MultiIndex))): + elif is_list_like_indexer(key) and not ( + isinstance(key, tuple) and isinstance(labels, MultiIndex) + ): - if hasattr(key, 'ndim') and key.ndim > 1: - raise ValueError('Cannot index with multidimensional key') + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") return self._getitem_iterable(key, axis=axis) else: @@ -1066,15 +1097,13 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): # Have the index compute an indexer or return None # if it cannot handle: - indexer, keyarr = ax._convert_listlike_indexer(key, - kind=self.name) + indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer(key, indexer, axis, - raise_missing=raise_missing) + self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing) return ax[indexer], indexer - if ax.is_unique and not getattr(ax, 'is_overlapping', False): + if ax.is_unique and not getattr(ax, "is_overlapping", False): # If we are trying to get actual keys from empty Series, we # patiently wait for a KeyError later on - otherwise, convert if len(ax) or not len(key): @@ -1084,9 +1113,9 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False): else: keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - self._validate_read_indexer(keyarr, indexer, - o._get_axis_number(axis), - raise_missing=raise_missing) + self._validate_read_indexer( + keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing + ) return keyarr, indexer def _getitem_iterable(self, key, axis=None): @@ -1129,10 +1158,10 @@ def _getitem_iterable(self, key, axis=None): return self.obj._take(inds, axis=axis) else: # A collection of keys - keyarr, indexer = self._get_listlike_indexer(key, axis, - raise_missing=False) - return self.obj._reindex_with_indexers({axis: [keyarr, indexer]}, - copy=True, allow_dups=True) + keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) + return self.obj._reindex_with_indexers( + {axis: [keyarr, indexer]}, copy=True, allow_dups=True + ) def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): """ @@ -1171,11 +1200,13 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): if missing == len(indexer): raise KeyError( "None of [{key}] are in the [{axis}]".format( - key=key, axis=self.obj._get_axis_name(axis))) + key=key, axis=self.obj._get_axis_name(axis) + ) + ) # We (temporarily) allow for some missing keys with .loc, except in # some cases (e.g. setting) in which "raise_missing" will be False - if not(self.name == 'loc' and not raise_missing): + if not (self.name == "loc" and not raise_missing): not_found = list(set(key) - set(ax)) raise KeyError("{} not in index".format(not_found)) @@ -1185,19 +1216,19 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False): # code, so we want to avoid warning & then # just raising - _missing_key_warning = textwrap.dedent(""" + _missing_key_warning = textwrap.dedent( + """ Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. 
See the documentation here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""") # noqa + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""" + ) # noqa if not (ax.is_categorical() or ax.is_interval()): - warnings.warn(_missing_key_warning, - FutureWarning, stacklevel=6) + warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6) - def _convert_to_indexer(self, obj, axis=None, is_setter=False, - raise_missing=False): + def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=False): """ Convert indexing key into something we can use to do actual fancy indexing on an ndarray @@ -1239,7 +1270,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, except LookupError: if isinstance(obj, tuple) and isinstance(labels, MultiIndex): if is_setter and len(obj) == labels.nlevels: - return {'key': obj} + return {"key": obj} raise except TypeError: pass @@ -1255,14 +1286,14 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, if is_setter: # always valid - if self.name == 'loc': - return {'key': obj} + if self.name == "loc": + return {"key": obj} # a positional - if (obj >= self.obj.shape[axis] and - not isinstance(labels, MultiIndex)): - raise ValueError("cannot set by positional indexing with " - "enlargement") + if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): + raise ValueError( + "cannot set by positional indexing with " "enlargement" + ) return obj @@ -1277,8 +1308,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, return inds else: # When setting, missing keys are not allowed, even with .loc: - kwargs = {'raise_missing': True if is_setter else - raise_missing} + kwargs = {"raise_missing": True if is_setter else raise_missing} return self._get_listlike_indexer(obj, axis, **kwargs)[1] else: try: @@ -1286,7 +1316,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False, except LookupError: # allow a not found key only if we are a setter if not is_list_like_indexer(obj) and is_setter: - return {'key': obj} + return {"key": obj} raise def _tuplify(self, loc): @@ -1305,7 +1335,7 @@ def _get_slice_axis(self, slice_obj, axis=None): indexer = self._convert_slice_indexer(slice_obj, axis) if isinstance(indexer, slice): - return self._slice(indexer, axis=axis, kind='iloc') + return self._slice(indexer, axis=axis, kind="iloc") else: return self.obj._take(indexer, axis=axis) @@ -1334,17 +1364,18 @@ class _IXIndexer(_NDFrameIndexer): See more at :ref:`Advanced Indexing `. """ - _ix_deprecation_warning = textwrap.dedent(""" + _ix_deprecation_warning = textwrap.dedent( + """ .ix is deprecated. 
Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""") # noqa + http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""" + ) # noqa def __init__(self, name, obj): - warnings.warn(self._ix_deprecation_warning, - FutureWarning, stacklevel=2) + warnings.warn(self._ix_deprecation_warning, FutureWarning, stacklevel=2) super().__init__(name, obj) @Appender(_NDFrameIndexer._validate_key.__doc__) @@ -1413,8 +1444,7 @@ class _LocationIndexer(_NDFrameIndexer): def __getitem__(self, key): if type(key) is tuple: - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: if self._is_scalar_access(key): return self._getitem_scalar(key) @@ -1458,11 +1488,12 @@ def _get_slice_axis(self, slice_obj, axis=None): return obj.copy(deep=False) labels = obj._get_axis(axis) - indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, - slice_obj.step, kind=self.name) + indexer = labels.slice_indexer( + slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name + ) if isinstance(indexer, slice): - return self._slice(indexer, axis=axis, kind='iloc') + return self._slice(indexer, axis=axis, kind="iloc") else: return self.obj._take(indexer, axis=axis) @@ -1705,9 +1736,11 @@ class _LocIndexer(_LocationIndexer): viper mark ii 7 1 """ - _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH " - "endpoints included! Can be slices of integers if the " - "index is integers), listlike of labels, boolean") + _valid_types = ( + "labels (MUST BE IN THE INDEX), slices of labels (BOTH " + "endpoints included! Can be slices of integers if the " + "index is integers), listlike of labels, boolean" + ) _exception = KeyError @Appender(_NDFrameIndexer._validate_key.__doc__) @@ -1732,7 +1765,7 @@ def _is_scalar_access(self, key): # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, '__len__'): + if not hasattr(key, "__len__"): return False if len(key) != self.ndim: @@ -1761,7 +1794,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" if isinstance(labels, MultiIndex): - if (isinstance(key, str) and labels.levels[0].is_all_dates): + if isinstance(key, str) and labels.levels[0].is_all_dates: # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) key = tuple([key] + [slice(None)] * (len(labels.levels) - 1)) @@ -1771,8 +1804,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels): # (..., slice('2016-01-01', '2016-01-01', None), ...) 
new_key = [] for i, component in enumerate(key): - if (isinstance(component, str) and - labels.levels[i].is_all_dates): + if isinstance(component, str) and labels.levels[i].is_all_dates: new_key.append(slice(component, component, None)) else: new_key.append(component) @@ -1810,23 +1842,30 @@ def _getitem_axis(self, key, axis=None): key = list(key) elif isinstance(key, ABCDataFrame): # GH 15438 - raise NotImplementedError("Indexing a MultiIndex with a " - "DataFrame key is not " - "implemented") - elif hasattr(key, 'ndim') and key.ndim > 1: - raise NotImplementedError("Indexing a MultiIndex with a " - "multidimensional key is not " - "implemented") - - if (not isinstance(key, tuple) and len(key) > 1 and - not isinstance(key[0], tuple)): + raise NotImplementedError( + "Indexing a MultiIndex with a " + "DataFrame key is not " + "implemented" + ) + elif hasattr(key, "ndim") and key.ndim > 1: + raise NotImplementedError( + "Indexing a MultiIndex with a " + "multidimensional key is not " + "implemented" + ) + + if ( + not isinstance(key, tuple) + and len(key) > 1 + and not isinstance(key[0], tuple) + ): key = tuple([key]) # an iterable multi-selection if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): - if hasattr(key, 'ndim') and key.ndim > 1: - raise ValueError('Cannot index with multidimensional key') + if hasattr(key, "ndim") and key.ndim > 1: + raise ValueError("Cannot index with multidimensional key") return self._getitem_iterable(key, axis=axis) @@ -1978,19 +2017,25 @@ class _iLocIndexer(_LocationIndexer): 2 1000 3000 """ - _valid_types = ("integer, integer slice (START point is INCLUDED, END " - "point is EXCLUDED), listlike of integers, boolean array") + _valid_types = ( + "integer, integer slice (START point is INCLUDED, END " + "point is EXCLUDED), listlike of integers, boolean array" + ) _exception = IndexError def _validate_key(self, key, axis): if com.is_bool_indexer(key): - if hasattr(key, 'index') and isinstance(key.index, Index): - if key.index.inferred_type == 'integer': - raise NotImplementedError("iLocation based boolean " - "indexing on an integer type " - "is not available") - raise ValueError("iLocation based boolean indexing cannot use " - "an indexable as a mask") + if hasattr(key, "index") and isinstance(key.index, Index): + if key.index.inferred_type == "integer": + raise NotImplementedError( + "iLocation based boolean " + "indexing on an integer type " + "is not available" + ) + raise ValueError( + "iLocation based boolean indexing cannot use " + "an indexable as a mask" + ) return if isinstance(key, slice): @@ -2000,22 +2045,25 @@ def _validate_key(self, key, axis): elif isinstance(key, tuple): # a tuple should already have been caught by this point # so don't treat a tuple as a valid indexer - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") elif is_list_like_indexer(key): arr = np.array(key) len_axis = len(self.obj._get_axis(axis)) # check that the key has a numeric dtype if not is_numeric_dtype(arr.dtype): - raise IndexError(".iloc requires numeric indexers, got " - "{arr}".format(arr=arr)) + raise IndexError( + ".iloc requires numeric indexers, got " "{arr}".format(arr=arr) + ) # check that the key does not exceed the maximum size of the index if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis): raise IndexError("positional indexers are out-of-bounds") else: - raise ValueError("Can only index by location with " - "a [{types}]".format(types=self._valid_types)) + raise ValueError( + "Can only index by 
location with " + "a [{types}]".format(types=self._valid_types) + ) def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) @@ -2025,7 +2073,7 @@ def _is_scalar_access(self, key): # that provide the equivalent access of .at and .iat # a) avoid getting things via sections and (to minimize dtype changes) # b) provide a performant path - if not hasattr(key, '__len__'): + if not hasattr(key, "__len__"): return False if len(key) != self.ndim: @@ -2084,7 +2132,7 @@ def _getitem_tuple(self, tup): axis = 0 for i, key in enumerate(tup): if i >= self.obj.ndim: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") if com.is_null_slice(key): axis += 1 @@ -2111,7 +2159,7 @@ def _get_slice_axis(self, slice_obj, axis=None): slice_obj = self._convert_slice_indexer(slice_obj, axis) if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, kind='iloc') + return self._slice(slice_obj, axis=axis, kind="iloc") else: return self.obj._take(slice_obj, axis=axis) @@ -2158,8 +2206,9 @@ def _getitem_axis(self, key, axis=None): else: key = item_from_zerodim(key) if not is_integer(key): - raise TypeError("Cannot index by location index with a " - "non-integer key") + raise TypeError( + "Cannot index by location index with a " "non-integer key" + ) # validate the location self._validate_integer(key, axis) @@ -2182,8 +2231,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False): self._validate_key(obj, axis) return obj except ValueError: - raise ValueError("Can only index by location with " - "a [{types}]".format(types=self._valid_types)) + raise ValueError( + "Can only index by location with " + "a [{types}]".format(types=self._valid_types) + ) class _ScalarAccessIndexer(_NDFrameIndexer): @@ -2199,15 +2250,14 @@ def __getitem__(self, key): if not is_list_like_indexer(key): key = tuple([key]) else: - raise ValueError('Invalid call for scalar access (getting)!') + raise ValueError("Invalid call for scalar access (getting)!") key = self._convert_key(key) return self.obj._get_value(*key, takeable=self._takeable) def __setitem__(self, key, value): if isinstance(key, tuple): - key = tuple(com.apply_if_callable(x, self.obj) - for x in key) + key = tuple(com.apply_if_callable(x, self.obj) for x in key) else: # scalar callable may return tuple key = com.apply_if_callable(key, self.obj) @@ -2215,8 +2265,7 @@ def __setitem__(self, key, value): if not isinstance(key, tuple): key = self._tuplify(key) if len(key) != self.obj.ndim: - raise ValueError('Not enough indexers for scalar access ' - '(setting)!') + raise ValueError("Not enough indexers for scalar access " "(setting)!") key = list(self._convert_key(key, is_setter=True)) key.append(value) self.obj._set_value(*key, takeable=self._takeable) @@ -2283,13 +2332,17 @@ def _convert_key(self, key, is_setter=False): for ax, i in zip(self.obj.axes, key): if ax.is_integer(): if not is_integer(i): - raise ValueError("At based indexing on an integer index " - "can only have integer indexers") + raise ValueError( + "At based indexing on an integer index " + "can only have integer indexers" + ) else: if is_integer(i) and not ax.holds_integer(): - raise ValueError("At based indexing on an non-integer " - "index can only have non-integer " - "indexers") + raise ValueError( + "At based indexing on an non-integer " + "index can only have non-integer " + "indexers" + ) return key @@ -2348,8 +2401,7 @@ def _convert_key(self, key, is_setter=False): """ require integer args (and convert to 
label arguments) """ for a, i in zip(self.obj.axes, key): if not is_integer(i): - raise ValueError("iAt based indexing can only have integer " - "indexers") + raise ValueError("iAt based indexing can only have integer " "indexers") return key @@ -2388,7 +2440,7 @@ def convert_to_index_sliceable(obj, key): """ idx = obj.index if isinstance(key, slice): - return idx._convert_slice_indexer(key, kind='getitem') + return idx._convert_slice_indexer(key, kind="getitem") elif isinstance(key, str): @@ -2440,9 +2492,11 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: result = result.reindex(index) mask = isna(result._values) if mask.any(): - raise IndexingError('Unalignable boolean Series provided as ' - 'indexer (index of the boolean Series and of ' - 'the indexed object do not match).') + raise IndexingError( + "Unalignable boolean Series provided as " + "indexer (index of the boolean Series and of " + "the indexed object do not match)." + ) result = result.astype(bool)._values else: if is_sparse(result): @@ -2452,8 +2506,8 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: # GH26658 if len(result) != len(index): raise IndexError( - 'Item wrong length {} instead of {}.'.format(len(result), - len(index))) + "Item wrong length {} instead of {}.".format(len(result), len(index)) + ) return result @@ -2488,18 +2542,24 @@ def check_setitem_lengths(indexer, value, values): # boolean with truth values == len of the value is ok too if isinstance(indexer, (np.ndarray, list)): if is_list_like(value) and len(indexer) != len(value): - if not (isinstance(indexer, np.ndarray) and - indexer.dtype == np.bool_ and - len(indexer[indexer]) == len(value)): - raise ValueError("cannot set using a list-like indexer " - "with a different length than the value") + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) # slice elif isinstance(indexer, slice): if is_list_like(value) and len(values): if len(value) != length_of_indexer(indexer, values): - raise ValueError("cannot set using a slice indexer with a " - "different length than the value") + raise ValueError( + "cannot set using a slice indexer with a " + "different length than the value" + ) def convert_missing_indexer(indexer): @@ -2511,7 +2571,7 @@ def convert_missing_indexer(indexer): if isinstance(indexer, dict): # a missing key (but not a tuple indexer) - indexer = indexer['key'] + indexer = indexer["key"] if isinstance(indexer, bool): raise KeyError("cannot use a single bool to index into setitem") @@ -2526,8 +2586,7 @@ def convert_from_missing_indexer_tuple(indexer, axes): """ def get_indexer(_i, _idx): - return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else - _idx) + return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer)) @@ -2608,8 +2667,9 @@ def validate_indices(indices, n): if len(indices): min_idx = indices.min() if min_idx < -1: - msg = ("'indices' contains values less than allowed ({} < {})" - .format(min_idx, -1)) + msg = "'indices' contains values less than allowed ({} < {})".format( + min_idx, -1 + ) raise ValueError(msg) max_idx = indices.max() @@ -2648,8 +2708,7 @@ def is_nested_tuple(tup, labels): def is_list_like_indexer(key): # allow a list_like, but exclude NamedTuples which can be indexers - return is_list_like(key) and not (isinstance(key, tuple) 
and - type(key) is not tuple) + return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple) def is_label_like(key): @@ -2658,8 +2717,11 @@ def is_label_like(key): def need_slice(obj): - return (obj.start is not None or obj.stop is not None or - (obj.step is not None and obj.step != 1)) + return ( + obj.start is not None + or obj.stop is not None + or (obj.step is not None and obj.step != 1) + ) def maybe_droplevels(index, key): @@ -2697,8 +2759,9 @@ def _non_reducing_slice(slice_): def pred(part): # true when slice does *not* reduce, False when part is a tuple, # i.e. MultiIndex slice - return ((isinstance(part, slice) or is_list_like(part)) - and not isinstance(part, tuple)) + return (isinstance(part, slice) or is_list_like(part)) and not isinstance( + part, tuple + ) if not is_list_like(slice_): if not isinstance(slice_, slice): diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index bf46e5d1a74e4..8ac0df2fa4e0a 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,16 +1,28 @@ - from .blocks import ( # noqa: F401 - Block, BoolBlock, CategoricalBlock, ComplexBlock, DatetimeBlock, - DatetimeTZBlock, ExtensionBlock, FloatBlock, IntBlock, ObjectBlock, - TimeDeltaBlock) + Block, + BoolBlock, + CategoricalBlock, + ComplexBlock, + DatetimeBlock, + DatetimeTZBlock, + ExtensionBlock, + FloatBlock, + IntBlock, + ObjectBlock, + TimeDeltaBlock, +) from .managers import ( # noqa: F401 - BlockManager, SingleBlockManager, create_block_manager_from_arrays, - create_block_manager_from_blocks) + BlockManager, + SingleBlockManager, + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) from .blocks import _safe_reshape # noqa: F401; io.packers from .blocks import make_block # noqa: F401; io.pytables, io.packers from .managers import ( # noqa: F401; reshape.concat, reshape.merge _transform_index, - concatenate_block_managers) + concatenate_block_managers, +) from .blocks import _block_shape # noqa:F401; io.pytables diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a9b2c0491458c..34186b60de27c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -13,27 +13,63 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( - astype_nansafe, find_common_type, infer_dtype_from, - infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype, - maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects) + astype_nansafe, + find_common_type, + infer_dtype_from, + infer_dtype_from_scalar, + maybe_convert_objects, + maybe_downcast_to_dtype, + maybe_infer_dtype_type, + maybe_promote, + maybe_upcast, + soft_convert_objects, +) from pandas.core.dtypes.common import ( - _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_dtype_equal, is_extension_array_dtype, is_extension_type, - is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, - is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype, - is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype) + _NS_DTYPE, + _TD_DTYPE, + ensure_platform_int, + is_bool_dtype, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer, + is_integer_dtype, + is_interval_dtype, + 
is_list_like, + is_numeric_v_string_like, + is_object_dtype, + is_period_dtype, + is_re, + is_re_compilable, + is_sparse, + is_timedelta64_dtype, + pandas_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass, - ABCPandasArray, ABCSeries) -from pandas.core.dtypes.missing import ( - _isna_compat, array_equivalent, isna, notna) + ABCDataFrame, + ABCDatetimeIndex, + ABCExtensionArray, + ABCIndexClass, + ABCPandasArray, + ABCSeries, +) +from pandas.core.dtypes.missing import _isna_compat, array_equivalent, isna, notna import pandas.core.algorithms as algos from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, PandasDtype, TimedeltaArray) + Categorical, + DatetimeArray, + ExtensionArray, + PandasDtype, + TimedeltaArray, +) from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexing import check_setitem_lengths @@ -51,7 +87,8 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ - __slots__ = ['_mgr_locs', 'values', 'ndim'] + + __slots__ = ["_mgr_locs", "values", "ndim"] is_numeric = False is_float = False is_integer = False @@ -67,7 +104,7 @@ class Block(PandasObject): _can_consolidate = True _verify_integrity = True _validate_ndim = True - _ftype = 'dense' + _ftype = "dense" _concatenator = staticmethod(np.concatenate) def __init__(self, values, placement, ndim=None): @@ -75,11 +112,11 @@ def __init__(self, values, placement, ndim=None): self.mgr_locs = placement self.values = values - if (self._validate_ndim and self.ndim and - len(self.mgr_locs) != len(self.values)): + if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( - 'Wrong number of items passed {val}, placement implies ' - '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs))) + "Wrong number of items passed {val}, placement implies " + "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs)) + ) def _check_ndim(self, values, ndim): """ @@ -106,8 +143,7 @@ def _check_ndim(self, values, ndim): ndim = values.ndim if self._validate_ndim and values.ndim != ndim: - msg = ("Wrong number of dimensions. values.ndim != ndim " - "[{} != {}]") + msg = "Wrong number of dimensions. values.ndim != ndim " "[{} != {}]" raise ValueError(msg.format(values.ndim, ndim)) return ndim @@ -218,32 +254,38 @@ def make_block(self, values, placement=None): return make_block(values, placement=placement, ndim=self.ndim) - def make_block_same_class(self, values, placement=None, ndim=None, - dtype=None): + def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): """ Wrap given values in a block of same type as self. 
""" if dtype is not None: # issue 19431 fastparquet is passing this - warnings.warn("dtype argument is deprecated, will be removed " - "in a future release.", FutureWarning) + warnings.warn( + "dtype argument is deprecated, will be removed " "in a future release.", + FutureWarning, + ) if placement is None: placement = self.mgr_locs - return make_block(values, placement=placement, ndim=ndim, - klass=self.__class__, dtype=dtype) + return make_block( + values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype + ) def __repr__(self): # don't want to print out all of the items here name = pprint_thing(self.__class__.__name__) if self._is_single_block: - result = '{name}: {len} dtype: {dtype}'.format( - name=name, len=len(self), dtype=self.dtype) + result = "{name}: {len} dtype: {dtype}".format( + name=name, len=len(self), dtype=self.dtype + ) else: - shape = ' x '.join(pprint_thing(s) for s in self.shape) - result = '{name}: {index}, {shape}, dtype: {dtype}'.format( - name=name, index=pprint_thing(self.mgr_locs.indexer), - shape=shape, dtype=self.dtype) + shape = " x ".join(pprint_thing(s) for s in self.shape) + result = "{name}: {index}, {shape}, dtype: {dtype}".format( + name=name, + index=pprint_thing(self.mgr_locs.indexer), + shape=shape, + dtype=self.dtype, + ) return result @@ -292,7 +334,7 @@ def dtype(self): @property def ftype(self): - if getattr(self.values, '_pandas_ftype', False): + if getattr(self.values, "_pandas_ftype", False): dtype = self.dtype.subtype else: dtype = self.dtype @@ -305,10 +347,12 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1) + ) def iget(self, i): return self.values[i] @@ -334,11 +378,10 @@ def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(self.values, **kwargs) if not isinstance(result, Block): - result = self.make_block(values=_block_shape(result, - ndim=self.ndim)) + result = self.make_block(values=_block_shape(result, ndim=self.ndim)) return result @@ -346,17 +389,18 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): """ fillna on the block with the value. 
If we fail, then convert to ObjectBlock and try again """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) if limit is not None: if not is_integer(limit): - raise ValueError('Limit must be an integer') + raise ValueError("Limit must be an integer") if limit < 1: - raise ValueError('Limit must be greater than 0') + raise ValueError("Limit must be greater than 0") if self.ndim > 2: - raise NotImplementedError("number of dimensions for 'fillna' " - "is currently limited to 2") + raise NotImplementedError( + "number of dimensions for 'fillna' " "is currently limited to 2" + ) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -371,8 +415,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): self._try_coerce_args(value) blocks = self.putmask(mask, value, inplace=inplace) - blocks = [b.make_block(values=self._try_coerce_result(b.values)) - for b in blocks] + blocks = [ + b.make_block(values=self._try_coerce_result(b.values)) for b in blocks + ] return self._maybe_downcast(blocks, downcast) except (TypeError, ValueError): @@ -387,10 +432,7 @@ def f(m, v, i): # slice out our block if i is not None: block = block.getitem_block(slice(i, i + 1)) - return block.fillna(value, - limit=limit, - inplace=inplace, - downcast=None) + return block.fillna(value, limit=limit, inplace=inplace, downcast=None) return self.split_and_operate(mask, f, inplace) @@ -424,8 +466,7 @@ def make_a_block(nv, ref_loc): # Put back the dimension that was taken from it and make # a block out of the result. nv = _block_shape(nv, ndim=self.ndim) - block = self.make_block(values=nv, - placement=ref_loc) + block = self.make_block(values=nv, placement=ref_loc) return block # ndim == 1 @@ -481,7 +522,7 @@ def downcast(self, dtypes=None): # try to cast all non-floats here if dtypes is None: - dtypes = 'infer' + dtypes = "infer" nv = maybe_downcast_to_dtype(values, dtypes) return self.make_block(nv) @@ -490,16 +531,17 @@ def downcast(self, dtypes=None): if dtypes is None: return self - if not (dtypes == 'infer' or isinstance(dtypes, dict)): - raise ValueError("downcast must have a dictionary or 'infer' as " - "its argument") + if not (dtypes == "infer" or isinstance(dtypes, dict)): + raise ValueError( + "downcast must have a dictionary or 'infer' as " "its argument" + ) # operate column-by-column # this is expensive as it splits the blocks items-by-item def f(m, v, i): - if dtypes == 'infer': - dtype = 'infer' + if dtypes == "infer": + dtype = "infer" else: raise AssertionError("dtypes as dict is not supported yet") @@ -509,12 +551,10 @@ def f(m, v, i): return self.split_and_operate(None, f, False) - def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): - return self._astype(dtype, copy=copy, errors=errors, values=values, - **kwargs) + def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): + return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs) - def _astype(self, dtype, copy=False, errors='raise', values=None, - **kwargs): + def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs): """Coerce to the new type Parameters @@ -530,31 +570,34 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, ------- Block """ - errors_legal_values = ('raise', 'ignore') + errors_legal_values = ("raise", "ignore") if errors not in errors_legal_values: - invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. 
" - "Supplied value is '{}'".format( - list(errors_legal_values), errors)) + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of {}. " + "Supplied value is '{}'".format(list(errors_legal_values), errors) + ) raise ValueError(invalid_arg) - if (inspect.isclass(dtype) and - issubclass(dtype, ExtensionDtype)): - msg = ("Expected an instance of {}, but got the class instead. " - "Try instantiating 'dtype'.".format(dtype.__name__)) + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + "Expected an instance of {}, but got the class instead. " + "Try instantiating 'dtype'.".format(dtype.__name__) + ) raise TypeError(msg) # may need to convert to categorical if self.is_categorical_astype(dtype): # deprecated 17636 - for deprecated_arg in ('categories', 'ordered'): + for deprecated_arg in ("categories", "ordered"): if deprecated_arg in kwargs: - raise ValueError('Got an unexpected argument: {}'.format( - deprecated_arg)) + raise ValueError( + "Got an unexpected argument: {}".format(deprecated_arg) + ) - categories = kwargs.get('categories', None) - ordered = kwargs.get('ordered', None) + categories = kwargs.get("categories", None) + ordered = kwargs.get("ordered", None) if com._any_not_none(categories, ordered): dtype = CategoricalDtype(categories, ordered) @@ -602,12 +645,11 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, values = values.reshape(self.shape) except Exception: # noqa: E722 - if errors == 'raise': + if errors == "raise": raise newb = self.copy() if copy else self else: - newb = make_block(values, placement=self.mgr_locs, - ndim=self.ndim) + newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: @@ -615,9 +657,13 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, "cannot set astype for copy = [{copy}] for dtype " "({dtype} [{shape}]) to different shape " "({newb_dtype} [{newb_shape}])".format( - copy=copy, dtype=self.dtype.name, - shape=self.shape, newb_dtype=newb.dtype.name, - newb_shape=newb.shape)) + copy=copy, + dtype=self.dtype.name, + shape=self.shape, + newb_dtype=newb.dtype.name, + newb_shape=newb.shape, + ) + ) return newb def convert(self, copy=True, **kwargs): @@ -647,7 +693,7 @@ def _try_cast_result(self, result, dtype=None): pass elif self.is_float and result.dtype == self.dtype: # protect against a bool/object showing up here - if isinstance(dtype, str) and dtype == 'infer': + if isinstance(dtype, str) and dtype == "infer": return result # This is only reached via Block.setitem, where dtype is always @@ -678,9 +724,12 @@ def _try_coerce_args(self, other): if np.any(notna(other)) and not self._can_hold_element(other): # coercion issues # let higher levels handle - raise TypeError("cannot convert {} to an {}".format( - type(other).__name__, - type(self).__name__.lower().replace('Block', ''))) + raise TypeError( + "cannot convert {} to an {}".format( + type(other).__name__, + type(self).__name__.lower().replace("Block", ""), + ) + ) return other @@ -693,8 +742,7 @@ def _try_coerce_and_cast_result(self, result, dtype=None): result = self._try_cast_result(result, dtype=dtype) return result - def to_native_types(self, slicer=None, na_rep='nan', quoting=None, - **kwargs): + def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.get_values() @@ -706,7 +754,7 @@ def to_native_types(self, slicer=None, na_rep='nan', 
quoting=None, if not self.is_object and not quoting: values = values.astype(str) else: - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep return values @@ -719,14 +767,15 @@ def copy(self, deep=True): values = values.copy() return self.make_block_same_class(values, ndim=self.ndim) - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): """replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. It is used in ObjectBlocks. It is here for API compatibility. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace # try to replace, if we raise an error, convert to ObjectBlock and @@ -742,12 +791,14 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try again with a compatible block block = self.astype(object) - return block.replace(to_replace=original_to_replace, - value=value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert) + return block.replace( + to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) mask = missing.mask_missing(values, to_replace) if filter is not None: @@ -764,20 +815,23 @@ def replace(self, to_replace, value, inplace=False, filter=None, # try again with a compatible block block = self.astype(object) - return block.replace(to_replace=original_to_replace, - value=value, - inplace=inplace, - filter=filter, - regex=regex, - convert=convert) + return block.replace( + to_replace=original_to_replace, + value=value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) if convert: - blocks = [b.convert(by_item=True, numeric=False, - copy=not inplace) for b in blocks] + blocks = [ + b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks + ] return blocks def _replace_single(self, *args, **kwargs): """ no-op on a non-ObjectBlock """ - return self if kwargs['inplace'] else self.copy() + return self if kwargs["inplace"] else self.copy() def setitem(self, indexer, value): """Set the value inplace, returning a a maybe different typed block. 
@@ -809,17 +863,16 @@ def setitem(self, indexer, value): value = self._try_coerce_args(value) values = self._coerce_values(values) # can keep its own dtype - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): + if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): dtype = self.dtype else: - dtype = 'infer' + dtype = "infer" except (TypeError, ValueError): # current dtype cannot store value, coerce to common dtype find_dtype = False - if hasattr(value, 'dtype'): + if hasattr(value, "dtype"): dtype = value.dtype find_dtype = True @@ -828,11 +881,10 @@ def setitem(self, indexer, value): # NaN promotion is handled in latter path dtype = False else: - dtype, _ = infer_dtype_from_scalar(value, - pandas_dtype=True) + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) find_dtype = True else: - dtype = 'infer' + dtype = "infer" if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -860,8 +912,9 @@ def _is_scalar_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) + return any( + isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer + ) return False def _is_empty_indexer(indexer): @@ -872,8 +925,9 @@ def _is_empty_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return any(isinstance(idx, np.ndarray) and len(idx) == 0 - for idx in indexer) + return any( + isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer + ) return False # empty indexers @@ -889,9 +943,11 @@ def _is_empty_indexer(indexer): # if we are an exact match (ex-broadcasting), # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): + elif ( + len(arr_value.shape) + and arr_value.shape[0] == values.shape[0] + and np.prod(arr_value.shape) == np.prod(values.shape) + ): values[indexer] = value try: values = values.astype(arr_value.dtype) @@ -907,8 +963,7 @@ def _is_empty_indexer(indexer): block = self.make_block(transf(values)) return block - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False): + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -931,8 +986,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_values = self.values if inplace else self.values.copy() - new = getattr(new, 'values', new) - mask = getattr(mask, 'values', mask) + new = getattr(new, "values", new) + mask = getattr(mask, "values", mask) # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: @@ -946,10 +1001,9 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # If the default repeat behavior in np.putmask would go in the # wrong direction, then explicitly repeat and reshape new instead - if getattr(new, 'ndim', 0) >= 1: + if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim and axis == 1: - new = np.repeat( - new, new_values.shape[-1]).reshape(self.shape) + new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) # we require exact matches between the len of the @@ -959,15 +1013,18 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, # # TODO: this prob needs some better checking # 
for 2D cases - if ((is_list_like(new) and - np.any(mask[mask]) and - getattr(new, 'ndim', 1) == 1)): - - if not (mask.shape[-1] == len(new) or - mask[mask].shape[-1] == len(new) or - len(new) == 1): - raise ValueError("cannot assign mismatch " - "length to masked array") + if ( + is_list_like(new) + and np.any(mask[mask]) + and getattr(new, "ndim", 1) == 1 + ): + + if not ( + mask.shape[-1] == len(new) + or mask[mask].shape[-1] == len(new) + or len(new) == 1 + ): + raise ValueError("cannot assign mismatch " "length to masked array") np.putmask(new_values, mask, new) @@ -980,7 +1037,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, axis = new_values.ndim - axis - 1 # Pseudo-broadcast - if getattr(new, 'ndim', 0) >= 1: + if getattr(new, "ndim", 0) >= 1: if self.ndim - 1 == new.ndim: new_shape = list(new.shape) new_shape.insert(axis, 1) @@ -1038,40 +1095,47 @@ def coerce_to_target_dtype(self, other): # we don't upcast to bool return self.astype(object) - elif ((self.is_float or self.is_complex) and - (is_integer_dtype(dtype) or is_float_dtype(dtype))): + elif (self.is_float or self.is_complex) and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) + ): # don't coerce float/complex to int return self - elif (self.is_datetime or - is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)): + elif ( + self.is_datetime + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): # not a datetime - if not ((is_datetime64_dtype(dtype) or - is_datetime64tz_dtype(dtype)) and self.is_datetime): + if not ( + (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) + and self.is_datetime + ): return self.astype(object) # don't upcast timezone with different timezone or no timezone - mytz = getattr(self.dtype, 'tz', None) - othertz = getattr(dtype, 'tz', None) + mytz = getattr(self.dtype, "tz", None) + othertz = getattr(dtype, "tz", None) if str(mytz) != str(othertz): return self.astype(object) - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) + raise AssertionError( + "possible recursion in " + "coerce_to_target_dtype: {} {}".format(self, other) + ) - elif (self.is_timedelta or is_timedelta64_dtype(dtype)): + elif self.is_timedelta or is_timedelta64_dtype(dtype): # not a timedelta if not (is_timedelta64_dtype(dtype) and self.is_timedelta): return self.astype(object) - raise AssertionError("possible recursion in " - "coerce_to_target_dtype: {} {}".format( - self, other)) + raise AssertionError( + "possible recursion in " + "coerce_to_target_dtype: {} {}".format(self, other) + ) try: return self.astype(dtype) @@ -1080,12 +1144,23 @@ def coerce_to_target_dtype(self, other): return self.astype(object) - def interpolate(self, method='pad', axis=0, index=None, values=None, - inplace=False, limit=None, limit_direction='forward', - limit_area=None, fill_value=None, coerce=False, - downcast=None, **kwargs): - - inplace = validate_bool_kwarg(inplace, 'inplace') + def interpolate( + self, + method="pad", + axis=0, + index=None, + values=None, + inplace=False, + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + coerce=False, + downcast=None, + **kwargs + ): + + inplace = validate_bool_kwarg(inplace, "inplace") def check_int_bool(self, inplace): # Only FloatBlocks will contain NaNs. 
@@ -1106,30 +1181,48 @@ def check_int_bool(self, inplace): r = check_int_bool(self, inplace) if r is not None: return r - return self._interpolate_with_fill(method=m, axis=axis, - inplace=inplace, limit=limit, - fill_value=fill_value, - coerce=coerce, - downcast=downcast) + return self._interpolate_with_fill( + method=m, + axis=axis, + inplace=inplace, + limit=limit, + fill_value=fill_value, + coerce=coerce, + downcast=downcast, + ) # validate the interp method m = missing.clean_interp_method(method, **kwargs) r = check_int_bool(self, inplace) if r is not None: return r - return self._interpolate(method=m, index=index, values=values, - axis=axis, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, inplace=inplace, - downcast=downcast, **kwargs) - - def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, - limit=None, fill_value=None, coerce=False, - downcast=None): + return self._interpolate( + method=m, + index=index, + values=values, + axis=axis, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + inplace=inplace, + downcast=downcast, + **kwargs + ) + + def _interpolate_with_fill( + self, + method="pad", + axis=0, + inplace=False, + limit=None, + fill_value=None, + coerce=False, + downcast=None, + ): """ fillna but using the interpolate machinery """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # if we are coercing, then don't force the conversion # if the block can't hold the type @@ -1143,21 +1236,36 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, values = self.values if inplace else self.values.copy() values = self._coerce_values(values) fill_value = self._try_coerce_args(fill_value) - values = missing.interpolate_2d(values, method=method, axis=axis, - limit=limit, fill_value=fill_value, - dtype=self.dtype) + values = missing.interpolate_2d( + values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype, + ) values = self._try_coerce_result(values) blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) - def _interpolate(self, method=None, index=None, values=None, - fill_value=None, axis=0, limit=None, - limit_direction='forward', limit_area=None, - inplace=False, downcast=None, **kwargs): + def _interpolate( + self, + method=None, + index=None, + values=None, + fill_value=None, + axis=0, + limit=None, + limit_direction="forward", + limit_area=None, + inplace=False, + downcast=None, + **kwargs + ): """ interpolate using scipy wrappers """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") data = self.values if inplace else self.values.copy() # only deal with floats @@ -1169,10 +1277,12 @@ def _interpolate(self, method=None, index=None, values=None, if fill_value is None: fill_value = self.fill_value - if method in ('krogh', 'piecewise_polynomial', 'pchip'): + if method in ("krogh", "piecewise_polynomial", "pchip"): if not index.is_monotonic: - raise ValueError("{0} interpolation requires that the " - "index be monotonic.".format(method)) + raise ValueError( + "{0} interpolation requires that the " + "index be monotonic.".format(method) + ) # process 1-d slices in the axis direction def func(x): @@ -1180,11 +1290,17 @@ def func(x): # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. 
not an arg to missing.interpolate_1d - return missing.interpolate_1d(index, x, method=method, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, - bounds_error=False, **kwargs) + return missing.interpolate_1d( + index, + x, + method=method, + limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, + bounds_error=False, + **kwargs + ) # interp each column independently interp_values = np.apply_along_axis(func, axis, data) @@ -1206,12 +1322,14 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): if fill_tuple is None: fill_value = self.fill_value - new_values = algos.take_nd(values, indexer, axis=axis, - allow_fill=False, fill_value=fill_value) + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=False, fill_value=fill_value + ) else: fill_value = fill_tuple[0] - new_values = algos.take_nd(values, indexer, axis=axis, - allow_fill=True, fill_value=fill_value) + new_values = algos.take_nd( + values, indexer, axis=axis, allow_fill=True, fill_value=fill_value + ) if new_mgr_locs is None: if axis == 0: @@ -1247,8 +1365,7 @@ def shift(self, periods, axis=0, fill_value=None): axis = new_values.ndim - axis - 1 if np.prod(new_values.shape): - new_values = np.roll(new_values, ensure_platform_int(periods), - axis=axis) + new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis) axis_indexer = [slice(None)] * self.ndim if periods > 0: @@ -1263,8 +1380,16 @@ def shift(self, periods, axis=0, fill_value=None): return [self.make_block(new_values)] - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): """ evaluate the block; return result block(s) from the result @@ -1286,27 +1411,27 @@ def where(self, other, cond, align=True, errors='raise', a new block(s), the result of the func """ import pandas.core.computation.expressions as expressions - assert errors in ['raise', 'ignore'] + + assert errors in ["raise", "ignore"] values = self.values orig_other = other if transpose: values = values.T - other = getattr(other, '_values', getattr(other, 'values', other)) - cond = getattr(cond, 'values', cond) + other = getattr(other, "_values", getattr(other, "values", other)) + cond = getattr(cond, "values", cond) # If the default broadcasting would go in the wrong direction, then # explicitly reshape other instead - if getattr(other, 'ndim', 0) >= 1: + if getattr(other, "ndim", 0) >= 1: if values.ndim - 1 == other.ndim and axis == 1: - other = other.reshape(tuple(other.shape + (1, ))) + other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: cond = cond.T - if not hasattr(cond, 'shape'): - raise ValueError("where must have a condition that is ndarray " - "like") + if not hasattr(cond, "shape"): + raise ValueError("where must have a condition that is ndarray " "like") # our where function def func(cond, values, other): @@ -1316,13 +1441,14 @@ def func(cond, values, other): fastres = expressions.where(cond, values, other) return self._try_coerce_result(fastres) except Exception as detail: - if errors == 'raise': + if errors == "raise": raise TypeError( - 'Could not operate [{other!r}] with block values ' - '[{detail!s}]'.format(other=other, detail=detail)) + "Could not operate [{other!r}] with block values " + "[{detail!s}]".format(other=other, detail=detail) + ) else: # 
return the values - result = np.empty(values.shape, dtype='float64') + result = np.empty(values.shape, dtype="float64") result.fill(np.nan) return result @@ -1339,11 +1465,16 @@ def func(cond, values, other): # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) - blocks = block.where(orig_other, cond, align=align, - errors=errors, - try_cast=try_cast, axis=axis, - transpose=transpose) - return self._maybe_downcast(blocks, 'infer') + blocks = block.where( + orig_other, + cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=axis, + transpose=transpose, + ) + return self._maybe_downcast(blocks, "infer") if self._can_hold_na or self.ndim == 1: @@ -1359,8 +1490,7 @@ def func(cond, values, other): # might need to separate out blocks axis = cond.ndim - 1 cond = cond.swapaxes(axis, 0) - mask = np.array([cond[i].all() for i in range(cond.shape[0])], - dtype=bool) + mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) result_blocks = [] for m in [mask, ~mask]: @@ -1410,7 +1540,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): blocks = [make_block(new_values, placement=new_placement)] return blocks, mask - def quantile(self, qs, interpolation='linear', axis=0): + def quantile(self, qs, interpolation="linear", axis=0): """ compute the quantiles of the @@ -1450,18 +1580,23 @@ def quantile(self, qs, interpolation='linear', axis=0): if is_empty: # create the array of na_values # 2d len(values) * len(qs) - result = np.repeat(np.array([self.fill_value] * len(qs)), - len(values)).reshape(len(values), - len(qs)) + result = np.repeat( + np.array([self.fill_value] * len(qs)), len(values) + ).reshape(len(values), len(qs)) else: # asarray needed for Sparse, see GH#24600 # Note: we use self.values below instead of values because the # `asi8` conversion above will behave differently under `isna` mask = np.asarray(isna(self.values)) - result = nanpercentile(values, np.array(qs) * 100, - axis=axis, na_value=self.fill_value, - mask=mask, ndim=self.ndim, - interpolation=interpolation) + result = nanpercentile( + values, + np.array(qs) * 100, + axis=axis, + na_value=self.fill_value, + mask=mask, + ndim=self.ndim, + interpolation=interpolation, + ) result = np.array(result, copy=False) result = result.T @@ -1472,14 +1607,13 @@ def quantile(self, qs, interpolation='linear', axis=0): result = result[..., 0] result = lib.item_from_zerodim(result) - ndim = getattr(result, 'ndim', None) or 0 + ndim = getattr(result, "ndim", None) or 0 result = self._try_coerce_result(result) - return make_block(result, - placement=np.arange(len(result)), - ndim=ndim) + return make_block(result, placement=np.arange(len(result)), ndim=ndim) - def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mask=None): + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): """ Replace value corresponding to the given boolean array with another value. 
@@ -1509,15 +1643,20 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, self = self.coerce_to_target_dtype(value) return self.putmask(mask, value, inplace=inplace) else: - return self._replace_single(to_replace, value, inplace=inplace, - regex=regex, - convert=convert, - mask=mask) + return self._replace_single( + to_replace, + value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) return self class NonConsolidatableMixIn: """ hold methods for the nonconsolidatable blocks """ + _can_consolidate = False _verify_integrity = False _validate_ndim = False @@ -1546,7 +1685,7 @@ def __init__(self, values, placement, ndim=None): @property def shape(self): if self.ndim == 1: - return (len(self.values)), + return ((len(self.values)),) return (len(self.mgr_locs), len(self.values)) def iget(self, col): @@ -1572,8 +1711,7 @@ def set(self, locs, values, check=False): assert locs.tolist() == [0] self.values = values - def putmask(self, mask, new, align=True, inplace=False, axis=0, - transpose=False): + def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False): """ putmask the data to the block; we must be a single block and not generate other blocks @@ -1591,7 +1729,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, ------- a new block, the result of the putmask """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # use block's copy logic. # .values may be an Index which does shallow copy by default @@ -1654,6 +1792,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block): ExtensionArrays are limited to 1-D. """ + is_extension = True def __init__(self, values, placement, ndim=None): @@ -1754,8 +1893,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value, - allow_fill=True) + new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True) if self.ndim == 1 and new_mgr_locs is None: new_mgr_locs = [0] @@ -1778,8 +1916,7 @@ def _slice(self, slicer): if isinstance(slicer, tuple) and len(slicer) == 2: if not com.is_null_slice(slicer[0]): - raise AssertionError("invalid slicing for a 1-ndim " - "categorical") + raise AssertionError("invalid slicing for a 1-ndim " "categorical") slicer = slicer[1] return self.values[slicer] @@ -1798,8 +1935,7 @@ def _try_cast_result(self, result, dtype=None): """ try: - result = self._holder._from_sequence( - result.ravel(), dtype=dtype) + result = self._holder._from_sequence(result.ravel(), dtype=dtype) except Exception: pass @@ -1809,7 +1945,7 @@ def formatting_values(self): # Deprecating the ability to override _formatting_values. # Do the warning here, it's only user in pandas, since we # have to check if the subclass overrode it. - fv = getattr(type(self.values), '_formatting_values', None) + fv = getattr(type(self.values), "_formatting_values", None) if fv and fv != ExtensionArray._formatting_values: msg = ( "'ExtensionArray._formatting_values' is deprecated. " @@ -1824,32 +1960,35 @@ def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. 
""" - values = self._holder._concat_same_type( - [blk.values for blk in to_concat]) + values = self._holder._concat_same_type([blk.values for blk in to_concat]) placement = placement or slice(0, len(values), 1) - return self.make_block_same_class(values, ndim=self.ndim, - placement=placement) + return self.make_block_same_class(values, ndim=self.ndim, placement=placement) def fillna(self, value, limit=None, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() values = values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=values, - placement=self.mgr_locs, - ndim=self.ndim)] + return [ + self.make_block_same_class( + values=values, placement=self.mgr_locs, ndim=self.ndim + ) + ] - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): + def interpolate( + self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs + ): values = self.values if inplace else self.values.copy() return self.make_block_same_class( - values=values.fillna(value=fill_value, method=method, - limit=limit), - placement=self.mgr_locs) + values=values.fillna(value=fill_value, method=method, limit=limit), + placement=self.mgr_locs, + ) - def shift(self, - periods: int, - axis: libinternals.BlockPlacement = 0, - fill_value: Any = None) -> List['ExtensionBlock']: + def shift( + self, + periods: int, + axis: libinternals.BlockPlacement = 0, + fill_value: Any = None, + ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1859,11 +1998,21 @@ def shift(self, return [ self.make_block_same_class( self.values.shift(periods=periods, fill_value=fill_value), - placement=self.mgr_locs, ndim=self.ndim) + placement=self.mgr_locs, + ndim=self.ndim, + ) ] - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. @@ -1904,15 +2053,14 @@ def where(self, other, cond, align=True, errors='raise', # TypeError for SparseArray, which implements just to raise # a TypeError result = self._holder._from_sequence( - np.where(cond, self.values, other), - dtype=dtype, + np.where(cond, self.values, other), dtype=dtype ) return self.make_block_same_class(result, placement=self.mgr_locs) @property def _ftype(self): - return getattr(self.values, '_pandas_ftype', Block._ftype) + return getattr(self.values, "_pandas_ftype", Block._ftype) def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): # ExtensionArray-safe unstack. 
@@ -1931,9 +2079,9 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): blocks = [ self.make_block_same_class( - self.values.take(indices, allow_fill=True, - fill_value=fill_value), - [place]) + self.values.take(indices, allow_fill=True, fill_value=fill_value), + [place], + ) for indices, place in zip(new_values.T, new_placement) ] return blocks, mask @@ -1974,16 +2122,25 @@ class FloatBlock(FloatOrComplexBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return (issubclass(tipo.type, (np.floating, np.integer)) and - not issubclass(tipo.type, (np.datetime64, np.timedelta64))) - return ( - isinstance( - element, (float, int, np.floating, np.int_)) and - not isinstance(element, (bool, np.bool_, datetime, timedelta, - np.datetime64, np.timedelta64))) + return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( + tipo.type, (np.datetime64, np.timedelta64) + ) + return isinstance( + element, (float, int, np.floating, np.int_) + ) and not isinstance( + element, + (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64), + ) - def to_native_types(self, slicer=None, na_rep='', float_format=None, - decimal='.', quoting=None, **kwargs): + def to_native_types( + self, + slicer=None, + na_rep="", + float_format=None, + decimal=".", + quoting=None, + **kwargs + ): """ convert to our native types format, slicing if desired """ values = self.values @@ -1993,29 +2150,33 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, # see gh-13418: no special formatting is desired at the # output (important for appropriate 'quoting' behaviour), # so do not pass it through the FloatArrayFormatter - if float_format is None and decimal == '.': + if float_format is None and decimal == ".": mask = isna(values) if not quoting: values = values.astype(str) else: - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep return values from pandas.io.formats.format import FloatArrayFormatter - formatter = FloatArrayFormatter(values, na_rep=na_rep, - float_format=float_format, - decimal=decimal, quoting=quoting, - fixed_width=False) + + formatter = FloatArrayFormatter( + values, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + quoting=quoting, + fixed_width=False, + ) return formatter.get_result_as_array() def should_store(self, value): # when inserting a column should not coerce integers to floats # unnecessarily - return (issubclass(value.dtype.type, np.floating) and - value.dtype == self.dtype) + return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype class ComplexBlock(FloatOrComplexBlock): @@ -2025,13 +2186,10 @@ class ComplexBlock(FloatOrComplexBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return issubclass(tipo.type, - (np.floating, np.integer, np.complexfloating)) - return ( - isinstance( - element, - (float, int, complex, np.float_, np.int_)) and - not isinstance(element, (bool, np.bool_))) + return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) + return isinstance( + element, (float, int, complex, np.float_, np.int_) + ) and not isinstance(element, (bool, np.bool_)) def should_store(self, value): return issubclass(value.dtype.type, np.complexfloating) @@ -2045,10 +2203,11 @@ class IntBlock(NumericBlock): def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: - return 
(issubclass(tipo.type, np.integer) and - not issubclass(tipo.type, (np.datetime64, - np.timedelta64)) and - self.dtype.itemsize >= tipo.itemsize) + return ( + issubclass(tipo.type, np.integer) + and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) + and self.dtype.itemsize >= tipo.itemsize + ) return is_integer(element) def should_store(self, value): @@ -2123,8 +2282,8 @@ def _astype(self, dtype, **kwargs): # if we are passed a datetime64[ns, tz] if is_datetime64tz_dtype(dtype): values = self.values - if getattr(values, 'tz', None) is None: - values = DatetimeArray(values).tz_localize('UTC') + if getattr(values, "tz", None) is None: + values = DatetimeArray(values).tz_localize("UTC") values = values.tz_convert(dtype.tz) return self.make_block(values) @@ -2135,11 +2294,10 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return tipo == _NS_DTYPE or tipo == np.int64 - return (is_integer(element) or isinstance(element, datetime) or - isna(element)) + return is_integer(element) or isinstance(element, datetime) or isna(element) def _coerce_values(self, values): - return values.view('i8') + return values.view("i8") def _try_coerce_args(self, other): """ @@ -2163,12 +2321,13 @@ def _try_coerce_args(self, other): other = tslibs.iNaT elif isinstance(other, (datetime, np.datetime64, date)): other = self._box_func(other) - if getattr(other, 'tz') is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a " - "naive Block") - other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_datetime64_dtype(other): - other = other.astype('i8', copy=False).view('i8') + if getattr(other, "tz") is not None: + raise TypeError( + "cannot coerce a Timestamp with a tz on a " "naive Block" + ) + other = other.asm8.view("i8") + elif hasattr(other, "dtype") and is_datetime64_dtype(other): + other = other.astype("i8", copy=False).view("i8") else: # coercion issues # let higher levels handle @@ -2179,8 +2338,8 @@ def _try_coerce_args(self, other): def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f']: - result = result.astype('M8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("M8[ns]") elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) @@ -2190,29 +2349,36 @@ def _try_coerce_result(self, result): def _box_func(self): return tslibs.Timestamp - def to_native_types(self, slicer=None, na_rep=None, date_format=None, - quoting=None, **kwargs): + def to_native_types( + self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs + ): """ convert to our native types format, slicing if desired """ values = self.values - i8values = self.values.view('i8') + i8values = self.values.view("i8") if slicer is not None: values = values[..., slicer] i8values = i8values[..., slicer] from pandas.io.formats.format import _get_format_datetime64_from_values + fmt = _get_format_datetime64_from_values(values, date_format) result = tslib.format_array_from_datetime( - i8values.ravel(), tz=getattr(self.values, 'tz', None), - format=fmt, na_rep=na_rep).reshape(i8values.shape) + i8values.ravel(), + tz=getattr(self.values, "tz", None), + format=fmt, + na_rep=na_rep, + ).reshape(i8values.shape) return np.atleast_2d(result) def should_store(self, value): - return (issubclass(value.dtype.type, np.datetime64) and - not is_datetime64tz_dtype(value) and - not is_extension_array_dtype(value)) + return ( + 
issubclass(value.dtype.type, np.datetime64) + and not is_datetime64tz_dtype(value) + and not is_extension_array_dtype(value) + ) def set(self, locs, values): """ @@ -2227,11 +2393,12 @@ def set(self, locs, values): self.values[locs] = values def external_values(self): - return np.asarray(self.values.astype('datetime64[ns]', copy=False)) + return np.asarray(self.values.astype("datetime64[ns]", copy=False)) class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ + __slots__ = () is_datetimetz = True is_extension = True @@ -2350,7 +2517,7 @@ def _try_coerce_args(self, other): other = _block_shape(other.asi8, ndim=self.ndim) elif isinstance(other, (np.datetime64, datetime, date)): other = tslibs.Timestamp(other) - tz = getattr(other, 'tz', None) + tz = getattr(other, "tz", None) # test we can have an equal time zone if tz is None or str(tz) != str(self.values.tz): @@ -2364,8 +2531,8 @@ def _try_coerce_args(self, other): def _try_coerce_result(self, result): """ reverse of try_coerce_args """ if isinstance(result, np.ndarray): - if result.dtype.kind in ['i', 'f']: - result = result.astype('M8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("M8[ns]") elif isinstance(result, (np.integer, np.float, np.datetime64)): result = self._box_func(result) @@ -2376,8 +2543,9 @@ def _try_coerce_result(self, result): if result.ndim > 1: result = result.reshape(np.prod(result.shape)) # GH#24096 new values invalidates a frequency - result = self._holder._simple_new(result, freq=None, - dtype=self.values.dtype) + result = self._holder._simple_new( + result, freq=None, dtype=self.values.dtype + ) return result @@ -2410,7 +2578,7 @@ def diff(self, n, axis=0): # Reshape the new_values like how algos.diff does for timedelta data new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype('timedelta64[ns]') + new_values = new_values.astype("timedelta64[ns]") return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] def concat_same_type(self, to_concat, placement=None): @@ -2445,16 +2613,16 @@ def setitem(self, indexer, value): try: return super().setitem(indexer, value) except (ValueError, TypeError): - newb = make_block(self.values.astype(object), - placement=self.mgr_locs, - klass=ObjectBlock) + newb = make_block( + self.values.astype(object), placement=self.mgr_locs, klass=ObjectBlock + ) return newb.setitem(indexer, value) def equals(self, other): # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False - return (self.values.view('i8') == other.values.view('i8')).all() + return (self.values.view("i8") == other.values.view("i8")).all() class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): @@ -2477,14 +2645,15 @@ def _holder(self): @property def _box_func(self): - return lambda x: Timedelta(x, unit='ns') + return lambda x: Timedelta(x, unit="ns") def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.timedelta64, np.int64)) return is_integer(element) or isinstance( - element, (timedelta, np.timedelta64, np.int64)) + element, (timedelta, np.timedelta64, np.int64) + ) def fillna(self, value, **kwargs): @@ -2492,16 +2661,19 @@ def fillna(self, value, **kwargs): # interpreted as nanoseconds if is_integer(value) and not isinstance(value, np.timedelta64): # Deprecation GH#24694, GH#19233 - warnings.warn("Passing integers to fillna is deprecated, will " - "raise a 
TypeError in a future version. To retain " - "the old behavior, pass pd.Timedelta(seconds=n) " - "instead.", - FutureWarning, stacklevel=6) - value = Timedelta(value, unit='s') + warnings.warn( + "Passing integers to fillna is deprecated, will " + "raise a TypeError in a future version. To retain " + "the old behavior, pass pd.Timedelta(seconds=n) " + "instead.", + FutureWarning, + stacklevel=6, + ) + value = Timedelta(value, unit="s") return super().fillna(value, **kwargs) def _coerce_values(self, values): - return values.view('i8') + return values.view("i8") def _try_coerce_args(self, other): """ @@ -2523,8 +2695,8 @@ def _try_coerce_args(self, other): other = tslibs.iNaT elif isinstance(other, (timedelta, np.timedelta64)): other = Timedelta(other).value - elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): - other = other.astype('i8', copy=False).view('i8') + elif hasattr(other, "dtype") and is_timedelta64_dtype(other): + other = other.astype("i8", copy=False).view("i8") else: # coercion issues # let higher levels handle @@ -2536,8 +2708,8 @@ def _try_coerce_result(self, result): """ reverse of try_coerce_args / try_operate """ if isinstance(result, np.ndarray): mask = isna(result) - if result.dtype.kind in ['i', 'f']: - result = result.astype('m8[ns]') + if result.dtype.kind in ["i", "f"]: + result = result.astype("m8[ns]") result[mask] = tslibs.iNaT elif isinstance(result, (np.integer, np.float)): @@ -2546,11 +2718,11 @@ def _try_coerce_result(self, result): return result def should_store(self, value): - return (issubclass(value.dtype.type, np.timedelta64) and - not is_extension_array_dtype(value)) + return issubclass( + value.dtype.type, np.timedelta64 + ) and not is_extension_array_dtype(value) - def to_native_types(self, slicer=None, na_rep=None, quoting=None, - **kwargs): + def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -2560,7 +2732,7 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, rvalues = np.empty(values.shape, dtype=object) if na_rep is None: - na_rep = 'NaT' + na_rep = "NaT" rvalues[mask] = na_rep imask = (~mask).ravel() @@ -2568,9 +2740,10 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, # should use the formats.format.Timedelta64Formatter here # to figure what format to pass to the Timedelta # e.g. 
to not show the decimals say - rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all') - for val in values.ravel()[imask]], - dtype=object) + rvalues.flat[imask] = np.array( + [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]], + dtype=object, + ) return rvalues def external_values(self, dtype=None): @@ -2589,17 +2762,25 @@ def _can_hold_element(self, element): return isinstance(element, (bool, np.bool_)) def should_store(self, value): - return (issubclass(value.dtype.type, np.bool_) and not - is_extension_array_dtype(value)) + return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( + value + ) - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): - inplace = validate_bool_kwarg(inplace, 'inplace') + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): + inplace = validate_bool_kwarg(inplace, "inplace") to_replace_values = np.atleast_1d(to_replace) if not np.can_cast(to_replace_values, bool): return self - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex, convert=convert) + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) class ObjectBlock(Block): @@ -2630,9 +2811,9 @@ def convert(self, *args, **kwargs): if args: raise NotImplementedError - by_item = kwargs.get('by_item', True) + by_item = kwargs.get("by_item", True) - new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta'] + new_inputs = ["coerce", "datetime", "numeric", "timedelta"] new_style = False for kw in new_inputs: new_style |= kw in kwargs @@ -2642,9 +2823,8 @@ def convert(self, *args, **kwargs): fn_inputs = new_inputs else: fn = maybe_convert_objects - fn_inputs = ['convert_dates', 'convert_numeric', - 'convert_timedeltas'] - fn_inputs += ['copy'] + fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"] + fn_inputs += ["copy"] fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} @@ -2663,8 +2843,7 @@ def f(m, v, i): blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) - blocks = [make_block(values, ndim=self.ndim, - placement=self.mgr_locs)] + blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)] return blocks @@ -2674,8 +2853,7 @@ def _maybe_downcast(self, blocks, downcast=None): return blocks # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) - for b in blocks]) + return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element): return True @@ -2696,16 +2874,21 @@ def _try_coerce_args(self, other): return other def should_store(self, value): - return not (issubclass(value.dtype.type, - (np.integer, np.floating, np.complexfloating, - np.datetime64, np.bool_)) or - # TODO(ExtensionArray): remove is_extension_type - # when all extension arrays have been ported. - is_extension_type(value) or - is_extension_array_dtype(value)) - - def replace(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True): + return not ( + issubclass( + value.dtype.type, + (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_), + ) + or + # TODO(ExtensionArray): remove is_extension_type + # when all extension arrays have been ported. 
+ is_extension_type(value) + or is_extension_array_dtype(value) + ) + + def replace( + self, to_replace, value, inplace=False, filter=None, regex=False, convert=True + ): to_rep_is_list = is_list_like(to_replace) value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list @@ -2715,19 +2898,35 @@ def replace(self, to_replace, value, inplace=False, filter=None, blocks = [self] if not either_list and is_re(to_replace): - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, regex=True, - convert=convert) + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=True, + convert=convert, + ) elif not (either_list or regex): - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex, convert=convert) + return super().replace( + to_replace, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: - result = b._replace_single(to_rep, v, inplace=inplace, - filter=filter, regex=regex, - convert=convert) + result = b._replace_single( + to_rep, + v, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks @@ -2736,19 +2935,37 @@ def replace(self, to_replace, value, inplace=False, filter=None, for to_rep in to_replace: result_blocks = [] for b in blocks: - result = b._replace_single(to_rep, value, inplace=inplace, - filter=filter, regex=regex, - convert=convert) + result = b._replace_single( + to_rep, + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks - return self._replace_single(to_replace, value, inplace=inplace, - filter=filter, convert=convert, - regex=regex) + return self._replace_single( + to_replace, + value, + inplace=inplace, + filter=filter, + convert=convert, + regex=regex, + ) - def _replace_single(self, to_replace, value, inplace=False, filter=None, - regex=False, convert=True, mask=None): + def _replace_single( + self, + to_replace, + value, + inplace=False, + filter=None, + regex=False, + convert=True, + mask=None, + ): """ Replace elements by the given value. 
@@ -2772,7 +2989,7 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, ------- a new block, the result after replacing """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # to_replace is regex compilable to_rep_re = regex and is_re_compilable(to_replace) @@ -2782,8 +2999,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, # only one will survive if to_rep_re and regex_re: - raise AssertionError('only one of to_replace and regex can be ' - 'regex compilable') + raise AssertionError( + "only one of to_replace and regex can be " "regex compilable" + ) # if regex was passed as something that can be a regex (rather than a # boolean) @@ -2805,8 +3023,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None, else: # if the thing to replace is not a string or compiled regex call # the superclass method -> to_replace is some kind of object - return super().replace(to_replace, value, inplace=inplace, - filter=filter, regex=regex) + return super().replace( + to_replace, value, inplace=inplace, filter=filter, regex=regex + ) new_values = self.values if inplace else self.values.copy() @@ -2819,6 +3038,7 @@ def re_replacer(s): return value if rx.search(s) is not None else s except TypeError: return s + else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned @@ -2846,8 +3066,9 @@ def re_replacer(s): block = block.convert(by_item=True, numeric=False) return block - def _replace_coerce(self, to_replace, value, inplace=True, regex=False, - convert=False, mask=None): + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): """ Replace value corresponding to the given boolean array with another value. @@ -2873,11 +3094,17 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False, """ if mask.any(): block = super()._replace_coerce( - to_replace=to_replace, value=value, inplace=inplace, - regex=regex, convert=convert, mask=mask) + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) if convert: - block = [b.convert(by_item=True, numeric=False, copy=True) - for b in block] + block = [ + b.convert(by_item=True, numeric=False, copy=True) for b in block + ] return block return self @@ -2893,9 +3120,7 @@ def __init__(self, values, placement, ndim=None): from pandas.core.arrays.categorical import _maybe_to_categorical # coerce to categorical if we can - super().__init__(_maybe_to_categorical(values), - placement=placement, - ndim=ndim) + super().__init__(_maybe_to_categorical(values), placement=placement, ndim=ndim) @property def _holder(self): @@ -2913,8 +3138,7 @@ def _try_coerce_result(self, result): # GH12564: CategoricalBlock is 1-dim only # while returned results could be any dim - if ((not is_categorical_dtype(result)) and - isinstance(result, np.ndarray)): + if (not is_categorical_dtype(result)) and isinstance(result, np.ndarray): result = _block_shape(result, ndim=self.ndim) return result @@ -2925,7 +3149,7 @@ def to_dense(self): # other types. 
return self.values._internal_get_values() - def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): + def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -2933,7 +3157,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): # Categorical is always one dimension values = values[slicer] mask = isna(values) - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep # we are expected to return a 2-d ndarray @@ -2952,15 +3176,24 @@ def concat_same_type(self, to_concat, placement=None): 1. Change Categorical._concat_same_type to use union_categoricals 2. Delete this method. """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) + values = self._concatenator( + [blk.values for blk in to_concat], axis=self.ndim - 1 + ) # not using self.make_block_same_class as values can be object dtype return make_block( - values, placement=placement or slice(0, len(values), 1), - ndim=self.ndim) + values, placement=placement or slice(0, len(values), 1), ndim=self.ndim + ) - def where(self, other, cond, align=True, errors='raise', - try_cast=False, axis=0, transpose=False): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast=False, + axis=0, + transpose=False, + ): # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. @@ -2980,16 +3213,22 @@ def where(self, other, cond, align=True, errors='raise', ) except (TypeError, ValueError): warnings.warn(object_msg, FutureWarning, stacklevel=6) - result = self.astype(object).where(other, cond, align=align, - errors=errors, - try_cast=try_cast, - axis=axis, transpose=transpose) + result = self.astype(object).where( + other, + cond, + align=align, + errors=errors, + try_cast=try_cast, + axis=axis, + transpose=transpose, + ) return result # ----------------------------------------------------------------- # Constructor Helpers + def get_block_type(values, dtype=None): """ Find the appropriate Block subclass to use for the given values and dtype. @@ -3036,8 +3275,7 @@ def get_block_type(values, dtype=None): return cls -def make_block(values, placement, klass=None, ndim=None, dtype=None, - fastpath=None): +def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None): # Ensure that we don't allow PandasArray / PandasDtype in internals. # For now, blocks should be backed by ndarrays when possible. 
if isinstance(values, ABCPandasArray): @@ -3050,8 +3288,10 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, if fastpath is not None: # GH#19265 pyarrow is passing this - warnings.warn("fastpath argument is deprecated, will be removed " - "in a future release.", FutureWarning) + warnings.warn( + "fastpath argument is deprecated, will be removed " "in a future release.", + FutureWarning, + ) if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) @@ -3066,9 +3306,11 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, # ----------------------------------------------------------------- + def _extend_blocks(result, blocks=None): """ return a new extended blocks, givin the result """ from pandas.core.internals import BlockManager + if blocks is None: blocks = [] if isinstance(result, list): @@ -3093,7 +3335,7 @@ def _block_shape(values, ndim=1, shape=None): # TODO: https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. - values = values.reshape(tuple((1, ) + shape)) + values = values.reshape(tuple((1,) + shape)) return values @@ -3193,13 +3435,14 @@ def _putmask_smart(v, m, n): # only compare integers/floats # don't compare integers to datetimelikes - if (not is_numeric_v_string_like(nn, nn_at) and - (is_float_dtype(nn.dtype) or - is_integer_dtype(nn.dtype) and - is_float_dtype(nn_at.dtype) or - is_integer_dtype(nn_at.dtype))): - - comp = (nn == nn_at) + if not is_numeric_v_string_like(nn, nn_at) and ( + is_float_dtype(nn.dtype) + or is_integer_dtype(nn.dtype) + and is_float_dtype(nn_at.dtype) + or is_integer_dtype(nn_at.dtype) + ): + + comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() nv[m] = nn_at diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6900dfc3c76d8..9ccd4b80869a0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -9,9 +9,16 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - _get_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype, - is_numeric_dtype, is_sparse, is_timedelta64_dtype) + _get_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float_dtype, + is_numeric_dtype, + is_sparse, + is_timedelta64_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -54,9 +61,9 @@ def get_mgr_concatenation_plan(mgr, indexers): blklocs = mgr._blklocs plan = [] - for blkno, placements in libinternals.get_blkno_placements(blknos, - mgr.nblocks, - group=False): + for blkno, placements in libinternals.get_blkno_placements( + blknos, mgr.nblocks, group=False + ): assert placements.is_slice_like @@ -72,18 +79,26 @@ def get_mgr_concatenation_plan(mgr, indexers): blk = mgr.blocks[blkno] ax0_blk_indexer = blklocs[placements.indexer] - unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and - # Fastpath detection of join unit not - # needing to reindex its block: no ax0 - # reindexing took place and block - # placement was sequential before. - ((ax0_indexer is None and - blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice.step == 1) or - # Slow-ish detection: all indexer locs - # are sequential (and length match is - # checked above). 
- (np.diff(ax0_blk_indexer) == 1).all())) + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) + and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ( + ( + ax0_indexer is None + and blk.mgr_locs.is_slice_like + and blk.mgr_locs.as_slice.step == 1 + ) + or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all() + ) + ) # Omit indexer if no item reindexing is required. if unit_no_ax0_reindexing: @@ -99,7 +114,6 @@ def get_mgr_concatenation_plan(mgr, indexers): class JoinUnit: - def __init__(self, block, shape, indexers=None): # Passing shape explicitly is required for cases when block is None. if indexers is None: @@ -109,9 +123,9 @@ def __init__(self, block, shape, indexers=None): self.shape = shape def __repr__(self): - return '{name}({block!r}, {indexers})'.format( - name=self.__class__.__name__, block=self.block, - indexers=self.indexers) + return "{name}({block!r}, {indexers})".format( + name=self.__class__.__name__, block=self.block, indexers=self.indexers + ) @cache_readonly def needs_filling(self): @@ -130,8 +144,7 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return _get_dtype(maybe_promote(self.block.dtype, - self.block.fill_value)[0]) + return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) @cache_readonly def is_na(self): @@ -152,11 +165,11 @@ def is_na(self): elif self.block.is_extension: values_flat = values else: - values_flat = values.ravel(order='K') + values_flat = values.ravel(order="K") total_len = values_flat.shape[0] chunk_len = max(total_len // 40, 1000) for i in range(0, total_len, chunk_len): - if not isna(values_flat[i:i + chunk_len]).all(): + if not isna(values_flat[i : i + chunk_len]).all(): return False return True @@ -170,24 +183,26 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): fill_value = upcasted_na if self.is_na: - if getattr(self.block, 'is_object', False): + if getattr(self.block, "is_object", False): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls - values = self.block.values.ravel(order='K') + values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None - if (getattr(self.block, 'is_datetimetz', False) or - is_datetime64tz_dtype(empty_dtype)): + if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype( + empty_dtype + ): if self.block is None: array = empty_dtype.construct_array_type() - return array(np.full(self.shape[1], fill_value.value), - dtype=empty_dtype) + return array( + np.full(self.shape[1], fill_value.value), dtype=empty_dtype + ) pass - elif getattr(self.block, 'is_categorical', False): + elif getattr(self.block, "is_categorical", False): pass - elif getattr(self.block, 'is_extension', False): + elif getattr(self.block, "is_extension", False): pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) @@ -218,8 +233,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): else: for ax, indexer in self.indexers.items(): - values = algos.take_nd(values, indexer, axis=ax, - fill_value=fill_value) + values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value) return values @@ -234,9 +248,10 @@ def concatenate_join_units(join_units, concat_axis, copy): empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units) - to_concat = 
[ju.get_reindexed_values(empty_dtype=empty_dtype, - upcasted_na=upcasted_na) - for ju in join_units] + to_concat = [ + ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) + for ju in join_units + ] if len(to_concat) == 1: # Only one block, nothing to concatenate. @@ -292,25 +307,25 @@ def get_empty_dtype_and_na(join_units): continue if is_categorical_dtype(dtype): - upcast_cls = 'category' + upcast_cls = "category" elif is_datetime64tz_dtype(dtype): - upcast_cls = 'datetimetz' + upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): - upcast_cls = 'bool' + upcast_cls = "bool" elif issubclass(dtype.type, np.object_): - upcast_cls = 'object' + upcast_cls = "object" elif is_datetime64_dtype(dtype): - upcast_cls = 'datetime' + upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): - upcast_cls = 'timedelta' + upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): - upcast_cls = 'object' + upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: - upcast_cls = 'float' + upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to @@ -324,24 +339,24 @@ def get_empty_dtype_and_na(join_units): upcast_classes = null_upcast_classes # create the result - if 'object' in upcast_classes: + if "object" in upcast_classes: return np.dtype(np.object_), np.nan - elif 'bool' in upcast_classes: + elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None - elif 'category' in upcast_classes: + elif "category" in upcast_classes: return np.dtype(np.object_), np.nan - elif 'datetimetz' in upcast_classes: + elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. - dtype = upcast_classes['datetimetz'] + dtype = upcast_classes["datetimetz"] return dtype[0], tslibs.NaT - elif 'datetime' in upcast_classes: - return np.dtype('M8[ns]'), tslibs.iNaT - elif 'timedelta' in upcast_classes: - return np.dtype('m8[ns]'), tslibs.iNaT + elif "datetime" in upcast_classes: + return np.dtype("M8[ns]"), tslibs.iNaT + elif "timedelta" in upcast_classes: + return np.dtype("m8[ns]"), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) @@ -370,21 +385,25 @@ def is_uniform_join_units(join_units): """ return ( # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa + all(type(ju.block) is type(join_units[0].block) for ju in join_units) + and # noqa # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. - all(not ju.is_na or ju.block.is_extension for ju in join_units) and + all(not ju.is_na or ju.block.is_extension for ju in join_units) + and # no blocks with indexers (as then the dimensions do not fit) - all(not ju.indexers for ju in join_units) and + all(not ju.indexers for ju in join_units) + and # only use this path when there is something to concatenate - len(join_units) > 1) + len(join_units) > 1 + ) def is_uniform_reindex(join_units): return ( # TODO: should this be ju.block._can_hold_na? 
- all(ju.block and ju.block.is_extension for ju in join_units) and - len({ju.block.dtype.name for ju in join_units}) == 1 + all(ju.block and ju.block.is_extension for ju in join_units) + and len({ju.block.dtype.name for ju in join_units}) == 1 ) @@ -413,8 +432,7 @@ def trim_join_unit(join_unit, length): extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:] join_unit.shape = (length,) + join_unit.shape[1:] - return JoinUnit(block=extra_block, indexers=extra_indexers, - shape=extra_shape) + return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape) def combine_concat_plans(plans, concat_axis): @@ -471,8 +489,7 @@ def _next_or_none(seq): if len(plc) > min_len: # trim_join_unit updates unit in place, so only # placement needs to be sliced to skip min_len. - next_items[i] = (plc[min_len:], - trim_join_unit(unit, min_len)) + next_items[i] = (plc[min_len:], trim_join_unit(unit, min_len)) else: yielded_placement = plc next_items[i] = _next_or_none(plans[i]) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1044f25a6bbcd..4d64be34e624f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -12,26 +12,53 @@ from pandas.compat import raise_with_traceback from pandas.core.dtypes.cast import ( - construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - construct_1d_object_array_from_listlike, infer_dtype_from_scalar, - maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable, - maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast) + construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, + construct_1d_object_array_from_listlike, + infer_dtype_from_scalar, + maybe_cast_to_datetime, + maybe_cast_to_integer_array, + maybe_castable, + maybe_convert_platform, + maybe_infer_to_datetimelike, + maybe_upcast, +) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_extension_array_dtype, is_extension_type, is_float_dtype, - is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype) + is_categorical_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_extension_type, + is_float_dtype, + is_integer_dtype, + is_iterator, + is_list_like, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex) + ABCDataFrame, + ABCDatetimeIndex, + ABCIndexClass, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.dtypes.missing import isna from pandas.core import algorithms, common as com from pandas.core.arrays import Categorical, ExtensionArray, period_array from pandas.core.index import ( - Index, _get_objs_combined_axis, _union_indexes, ensure_index) + Index, + _get_objs_combined_axis, + _union_indexes, + ensure_index, +) from pandas.core.indexes import base as ibase from pandas.core.internals import ( - create_block_manager_from_arrays, create_block_manager_from_blocks) + create_block_manager_from_arrays, + create_block_manager_from_blocks, +) from pandas.core.internals.arrays import extract_array # --------------------------------------------------------------------- @@ -101,6 +128,7 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy): # --------------------------------------------------------------------- # DataFrame Constructor Interface + def init_ndarray(values, index, columns, dtype=None, 
copy=False): # input must be a ndarray, list, Series, index @@ -119,24 +147,23 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): # we could have a categorical type passed or coerced to 'category' # recast this to an arrays_to_mgr - if (is_categorical_dtype(getattr(values, 'dtype', None)) or - is_categorical_dtype(dtype)): + if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype( + dtype + ): - if not hasattr(values, 'dtype'): + if not hasattr(values, "dtype"): values = prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) - return arrays_to_mgr([values], columns, index, columns, - dtype=dtype) + return arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif is_extension_array_dtype(values): # GH#19157 if columns is None: columns = [0] - return arrays_to_mgr([values], columns, index, columns, - dtype=dtype) + return arrays_to_mgr([values], columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -147,9 +174,10 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): try: values = values.astype(dtype) except Exception as orig: - e = ValueError("failed to cast to '{dtype}' (Exception " - "was: {orig})".format(dtype=dtype, - orig=orig)) + e = ValueError( + "failed to cast to '{dtype}' (Exception " + "was: {orig})".format(dtype=dtype, orig=orig) + ) raise_with_traceback(e) index, columns = _get_axes(*values.shape, index=index, columns=columns) @@ -171,8 +199,9 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): from pandas.core.internals.blocks import make_block # TODO: What about re-joining object columns? - block_values = [make_block(dvals_list[n], placement=[n]) - for n in range(len(dvals_list))] + block_values = [ + make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) + ] else: datelike_vals = maybe_infer_to_datetimelike(values) @@ -190,6 +219,7 @@ def init_dict(data, index, columns, dtype=None): """ if columns is not None: from pandas.core.series import Series + arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -208,8 +238,7 @@ def init_dict(data, index, columns, dtype=None): nan_dtype = object else: nan_dtype = dtype - val = construct_1d_arraylike_from_scalar(np.nan, len(index), - nan_dtype) + val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() else: @@ -218,15 +247,18 @@ def init_dict(data, index, columns, dtype=None): arrays = (com.maybe_iterable_to_list(data[k]) for k in keys) # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies - arrays = [arr if not isinstance(arr, ABCIndexClass) else arr._data - for arr in arrays] - arrays = [arr if not is_datetime64tz_dtype(arr) else - arr.copy() for arr in arrays] + arrays = [ + arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays + ] + arrays = [ + arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays + ] return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) # --------------------------------------------------------------------- + def prep_ndarray(values, copy=True): if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: @@ -239,7 +271,7 @@ def convert(v): # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: - if 
is_list_like(values[0]) or hasattr(values[0], 'len'): + if is_list_like(values[0]) or hasattr(values[0], "len"): values = np.array([convert(v) for v in values]) elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: # GH#21861 @@ -259,7 +291,7 @@ def convert(v): if values.ndim == 1: values = values.reshape((values.shape[0], 1)) elif values.ndim != 2: - raise ValueError('Must pass 2-d input') + raise ValueError("Must pass 2-d input") return values @@ -279,15 +311,16 @@ def _homogenize(data, index, dtype=None): else: if isinstance(val, dict): if oindex is None: - oindex = index.astype('O') + oindex = index.astype("O") if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): val = com.dict_compat(val) else: val = dict(val) val = lib.fast_multiget(val, oindex.values, default=np.nan) - val = sanitize_array(val, index, dtype=dtype, copy=False, - raise_cast_failure=False) + val = sanitize_array( + val, index, dtype=dtype, copy=False, raise_cast_failure=False + ) homogenized.append(val) @@ -313,13 +346,12 @@ def extract_index(data): elif isinstance(val, dict): have_dicts = True indexes.append(list(val.keys())) - elif is_list_like(val) and getattr(val, 'ndim', 1) == 1: + elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True raw_lengths.append(len(val)) if not indexes and not raw_lengths: - raise ValueError('If using all scalar values, you must pass' - ' an index') + raise ValueError("If using all scalar values, you must pass" " an index") if have_series or have_dicts: index = _union_indexes(indexes) @@ -327,17 +359,19 @@ def extract_index(data): if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: - raise ValueError('arrays must all be same length') + raise ValueError("arrays must all be same length") if have_dicts: - raise ValueError('Mixing dicts with non-Series may lead to ' - 'ambiguous ordering.') + raise ValueError( + "Mixing dicts with non-Series may lead to " "ambiguous ordering." 
+ ) if have_series: if lengths[0] != len(index): - msg = ('array length {length} does not match index ' - 'length {idx_len}' - .format(length=lengths[0], idx_len=len(index))) + msg = ( + "array length {length} does not match index " + "length {idx_len}".format(length=lengths[0], idx_len=len(index)) + ) raise ValueError(msg) else: index = ibase.default_index(lengths[0]) @@ -347,8 +381,12 @@ def extract_index(data): def reorder_arrays(arrays, arr_columns, columns): # reorder according to the columns - if (columns is not None and len(columns) and arr_columns is not None and - len(arr_columns)): + if ( + columns is not None + and len(columns) + and arr_columns is not None + and len(arr_columns) + ): indexer = ensure_index(arr_columns).get_indexer(columns) arr_columns = ensure_index([arr_columns[i] for i in indexer]) arrays = [arrays[i] for i in indexer] @@ -356,18 +394,18 @@ def reorder_arrays(arrays, arr_columns, columns): def get_names_from_index(data): - has_some_name = any(getattr(s, 'name', None) is not None for s in data) + has_some_name = any(getattr(s, "name", None) is not None for s in data) if not has_some_name: return ibase.default_index(len(data)) index = list(range(len(data))) count = 0 for i, s in enumerate(data): - n = getattr(s, 'name', None) + n = getattr(s, "name", None) if n is not None: index[i] = n else: - index[i] = 'Unnamed {count}'.format(count=count) + index[i] = "Unnamed {count}".format(count=count) count += 1 return index @@ -392,14 +430,18 @@ def _get_axes(N, K, index, columns): # --------------------------------------------------------------------- # Conversion of Inputs to Arrays + def to_arrays(data, columns, coerce_float=False, dtype=None): """ Return list of arrays, columns. """ if isinstance(data, ABCDataFrame): if columns is not None: - arrays = [data._ixs(i, axis=1).values - for i, col in enumerate(data.columns) if col in columns] + arrays = [ + data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) + if col in columns + ] else: columns = data.columns arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] @@ -413,21 +455,23 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, dtype=dtype) + return _list_of_dict_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) elif isinstance(data[0], ABCSeries): - return _list_of_series_to_arrays(data, columns, - coerce_float=coerce_float, - dtype=dtype) + return _list_of_series_to_arrays( + data, columns, coerce_float=coerce_float, dtype=dtype + ) elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns - elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and - data.dtype.names is not None): + elif ( + isinstance(data, (np.ndarray, ABCSeries, Index)) + and data.dtype.names is not None + ): columns = list(data.dtype.names) arrays = [data[k] for k in columns] @@ -435,8 +479,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, - dtype=dtype) + return 
_list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) def _list_to_arrays(data, columns, coerce_float=False, dtype=None): @@ -447,8 +490,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None): content = list(lib.to_object_array(data).T) # gh-26429 do not raise user-facing AssertionError try: - result = _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + result = _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) except AssertionError as e: raise ValueError(e) from e return result @@ -462,7 +506,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): aligned_values = [] for s in data: - index = getattr(s, 'index', None) + index = getattr(s, "index", None) if index is None: index = ibase.default_index(len(s)) @@ -478,8 +522,9 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): if values.dtype == np.object_: content = list(values.T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) else: return values.T, columns @@ -495,8 +540,9 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): data = [(type(d) is dict) and d or dict(d) for d in data] content = list(lib.dicts_to_array(data, list(columns)).T) - return _convert_object_array(content, columns, dtype=dtype, - coerce_float=coerce_float) + return _convert_object_array( + content, columns, dtype=dtype, coerce_float=coerce_float + ) def _convert_object_array(content, columns, coerce_float=False, dtype=None): @@ -505,9 +551,10 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): else: if len(columns) != len(content): # pragma: no cover # caller's responsibility to check for this... - raise AssertionError('{col:d} columns passed, passed data had ' - '{con} columns'.format(col=len(columns), - con=len(content))) + raise AssertionError( + "{col:d} columns passed, passed data had " + "{con} columns".format(col=len(columns), con=len(content)) + ) # provide soft conversion of object dtypes def convert(arr): @@ -524,6 +571,7 @@ def convert(arr): # --------------------------------------------------------------------- # Series-Based + def sanitize_index(data, index, copy=False): """ Sanitize an index type to return an ndarray of the underlying, pass @@ -534,7 +582,7 @@ def sanitize_index(data, index, copy=False): return data if len(data) != len(index): - raise ValueError('Length of values does not match length of index') + raise ValueError("Length of values does not match length of index") if isinstance(data, ABCIndexClass) and not copy: pass @@ -546,14 +594,13 @@ def sanitize_index(data, index, copy=False): elif isinstance(data, np.ndarray): # coerce datetimelike types - if data.dtype.kind in ['M', 'm']: + if data.dtype.kind in ["M", "m"]: data = sanitize_array(data, index, copy=copy) return data -def sanitize_array(data, index, dtype=None, copy=False, - raise_cast_failure=False): +def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): """ Sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified. 
@@ -576,8 +623,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # GH#846 if isinstance(data, np.ndarray): - if (dtype is not None - and is_float_dtype(data.dtype) and is_integer_dtype(dtype)): + if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) @@ -617,13 +663,13 @@ def sanitize_array(data, index, dtype=None, copy=False, elif isinstance(data, range): # GH#16804 - arr = np.arange(data.start, data.stop, data.step, dtype='int64') + arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH - if getattr(subarr, 'ndim', 0) == 0: + if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: @@ -636,8 +682,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) - subarr = construct_1d_arraylike_from_scalar( - value, len(index), dtype) + subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) else: return subarr.item() @@ -649,11 +694,12 @@ def sanitize_array(data, index, dtype=None, copy=False, # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype) + subarr[0], len(index), subarr.dtype + ) elif subarr.ndim > 1: if isinstance(data, np.ndarray): - raise Exception('Data must be 1-dimensional') + raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) @@ -668,12 +714,13 @@ def sanitize_array(data, index, dtype=None, copy=False, data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if (not (is_extension_array_dtype(subarr.dtype) or - is_extension_array_dtype(dtype)) and - is_object_dtype(subarr.dtype) and - not is_object_dtype(dtype)): + if ( + not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)) + and is_object_dtype(subarr.dtype) + and not is_object_dtype(dtype) + ): inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == 'period': + if inferred == "period": try: subarr = period_array(subarr) except IncompatibleFrequency: @@ -710,13 +757,13 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): - if is_object_dtype(dtype) and (is_list_like(subarr) and - not (is_iterator(subarr) or - isinstance(subarr, np.ndarray))): + if is_object_dtype(dtype) and ( + is_list_like(subarr) + and not (is_iterator(subarr) or isinstance(subarr, np.ndarray)) + ): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = construct_1d_ndarray_preserving_na(subarr, dtype, - copy=copy) + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise raise @@ -724,8 +771,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. 
- subarr = Categorical(arr, dtype.categories, - ordered=dtype._ordered) + subarr = Categorical(arr, dtype.categories, ordered=dtype._ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cdf0826bbe21e..c5254aaa4af5f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -11,12 +11,21 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( - find_common_type, infer_dtype_from_scalar, maybe_convert_objects, - maybe_promote) + find_common_type, + infer_dtype_from_scalar, + maybe_convert_objects, + maybe_promote, +) from pandas.core.dtypes.common import ( - _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype, - is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar, - is_sparse) + _NS_DTYPE, + is_datetimelike_v_numeric, + is_extension_array_dtype, + is_extension_type, + is_list_like, + is_numeric_v_string_like, + is_scalar, + is_sparse, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries @@ -30,12 +39,23 @@ from pandas.io.formats.printing import pprint_thing from .blocks import ( - Block, CategoricalBlock, DatetimeTZBlock, ExtensionBlock, - ObjectValuesExtensionBlock, _extend_blocks, _merge_blocks, _safe_reshape, - get_block_type, make_block) + Block, + CategoricalBlock, + DatetimeTZBlock, + ExtensionBlock, + ObjectValuesExtensionBlock, + _extend_blocks, + _merge_blocks, + _safe_reshape, + get_block_type, + make_block, +) from .concat import ( # all for concatenate_block_managers - combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan, - is_uniform_join_units) + combine_concat_plans, + concatenate_join_units, + get_mgr_concatenation_plan, + is_uniform_join_units, +) # TODO: flexible with index=None and/or items=None @@ -91,22 +111,33 @@ class BlockManager(PandasObject): ----- This is *not* a public API class """ - __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_blknos', '_blklocs'] - def __init__(self, - blocks: Sequence[Block], - axes: Sequence[Index], - do_integrity_check: bool = True): + __slots__ = [ + "axes", + "blocks", + "_ndim", + "_shape", + "_known_consolidated", + "_is_consolidated", + "_blknos", + "_blklocs", + ] + + def __init__( + self, + blocks: Sequence[Block], + axes: Sequence[Index], + do_integrity_check: bool = True, + ): self.axes = [ensure_index(ax) for ax in axes] self.blocks = tuple(blocks) # type: Tuple[Block, ...] 
for block in blocks: if self.ndim != block.ndim: raise AssertionError( - 'Number of Block dimensions ({block}) must equal ' - 'number of axes ({self})'.format(block=block.ndim, - self=self.ndim)) + "Number of Block dimensions ({block}) must equal " + "number of axes ({self})".format(block=block.ndim, self=self.ndim) + ) if do_integrity_check: self._verify_integrity() @@ -118,8 +149,7 @@ def __init__(self, def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ if axes is None: - axes = [ensure_index([])] + [ensure_index(a) - for a in self.axes[1:]] + axes = [ensure_index([])] + [ensure_index(a) for a in self.axes[1:]] # preserve dtype if possible if self.ndim == 1: @@ -149,8 +179,9 @@ def set_axis(self, axis, new_labels): if new_len != old_len: raise ValueError( - 'Length mismatch: Expected axis has {old} elements, new ' - 'values have {new} elements'.format(old=old_len, new=new_len)) + "Length mismatch: Expected axis has {old} elements, new " + "values have {new} elements".format(old=old_len, new=new_len) + ) self.axes[axis] = new_labels @@ -178,8 +209,9 @@ def _is_single_block(self): return False blk = self.blocks[0] - return (blk.mgr_locs.is_slice_like and - blk.mgr_locs.as_slice == slice(0, len(self), 1)) + return blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice == slice( + 0, len(self), 1 + ) def _rebuild_blknos_and_blklocs(self): """ @@ -234,10 +266,12 @@ def __getstate__(self): axes_array = [ax for ax in self.axes] extra_state = { - '0.14.1': { - 'axes': axes_array, - 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer) - for b in self.blocks] + "0.14.1": { + "axes": axes_array, + "blocks": [ + dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + for b in self.blocks + ], } } @@ -249,12 +283,12 @@ def __setstate__(self, state): def unpickle_block(values, mgr_locs): return make_block(values, placement=mgr_locs) - if (isinstance(state, tuple) and len(state) >= 4 and - '0.14.1' in state[3]): - state = state[3]['0.14.1'] - self.axes = [ensure_index(ax) for ax in state['axes']] - self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs']) - for b in state['blocks']) + if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: + state = state[3]["0.14.1"] + self.axes = [ensure_index(ax) for ax in state["axes"]] + self.blocks = tuple( + unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] + ) else: # discard anything after 3rd, support beta pickling format for a # little while longer @@ -272,12 +306,14 @@ def unpickle_block(values, mgr_locs): # block items corresponded to manager items 1-to-1. 
all_mgr_locs = [slice(0, len(bitems[0]))] else: - all_mgr_locs = [self.axes[0].get_indexer(blk_items) - for blk_items in bitems] + all_mgr_locs = [ + self.axes[0].get_indexer(blk_items) for blk_items in bitems + ] self.blocks = tuple( unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs)) + for values, mgr_locs in zip(bvalues, all_mgr_locs) + ) self._post_setstate() @@ -293,12 +329,12 @@ def __repr__(self): output = pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: - output += '\nItems: {ax}'.format(ax=ax) + output += "\nItems: {ax}".format(ax=ax) else: - output += '\nAxis {i}: {ax}'.format(i=i, ax=ax) + output += "\nAxis {i}: {ax}".format(i=i, ax=ax) for block in self.blocks: - output += '\n{block}'.format(block=pprint_thing(block)) + output += "\n{block}".format(block=pprint_thing(block)) return output def _verify_integrity(self): @@ -308,13 +344,21 @@ def _verify_integrity(self): if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: - raise AssertionError('Number of manager items must equal union of ' - 'block items\n# manager items: {0}, # ' - 'tot_items: {1}'.format( - len(self.items), tot_items)) + raise AssertionError( + "Number of manager items must equal union of " + "block items\n# manager items: {0}, # " + "tot_items: {1}".format(len(self.items), tot_items) + ) - def apply(self, f, axes=None, filter=None, do_integrity_check=False, - consolidate=True, **kwargs): + def apply( + self, + f, + axes=None, + filter=None, + do_integrity_check=False, + consolidate=True, + **kwargs + ): """ iterate over the blocks, collect and create a new block manager @@ -344,37 +388,39 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, # All items are included, as if there were no filtering filter = None else: - kwargs['filter'] = filter_locs + kwargs["filter"] = filter_locs if consolidate: self._consolidate_inplace() - if f == 'where': + if f == "where": align_copy = True - if kwargs.get('align', True): - align_keys = ['other', 'cond'] + if kwargs.get("align", True): + align_keys = ["other", "cond"] else: - align_keys = ['cond'] - elif f == 'putmask': + align_keys = ["cond"] + elif f == "putmask": align_copy = False - if kwargs.get('align', True): - align_keys = ['new', 'mask'] + if kwargs.get("align", True): + align_keys = ["new", "mask"] else: - align_keys = ['mask'] - elif f == 'fillna': + align_keys = ["mask"] + elif f == "fillna": # fillna internally does putmask, maybe it's better to do this # at mgr, not block level? align_copy = False - align_keys = ['value'] + align_keys = ["value"] else: align_keys = [] # TODO(EA): may interfere with ExtensionBlock.setitem for blocks # with a .values attribute. 
- aligned_args = {k: kwargs[k] - for k in align_keys - if not isinstance(kwargs[k], ABCExtensionArray) and - hasattr(kwargs[k], 'values')} + aligned_args = { + k: kwargs[k] + for k in align_keys + if not isinstance(kwargs[k], ABCExtensionArray) + and hasattr(kwargs[k], "values") + } for b in self.blocks: if filter is not None: @@ -386,22 +432,29 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, b_items = self.items[b.mgr_locs.indexer] for k, obj in aligned_args.items(): - axis = getattr(obj, '_info_axis_number', 0) - kwargs[k] = obj.reindex(b_items, axis=axis, - copy=align_copy) + axis = getattr(obj, "_info_axis_number", 0) + kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: return self.make_empty(axes or self.axes) - bm = self.__class__(result_blocks, axes or self.axes, - do_integrity_check=do_integrity_check) + bm = self.__class__( + result_blocks, axes or self.axes, do_integrity_check=do_integrity_check + ) bm._consolidate_inplace() return bm - def quantile(self, axis=0, consolidate=True, transposed=False, - interpolation='linear', qs=None, numeric_only=None): + def quantile( + self, + axis=0, + consolidate=True, + transposed=False, + interpolation="linear", + qs=None, + numeric_only=None, + ): """ Iterate over blocks applying quantile reduction. This routine is intended for reduction type operations and @@ -434,6 +487,7 @@ def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 from pandas import Float64Index + if is_list_like(qs): ax = Float64Index(qs) else: @@ -466,14 +520,14 @@ def get_axe(block, qs, axes): b.mgr_locs = sb.mgr_locs else: - new_axes[axis] = Index(np.concatenate( - [ax.values for ax in axes])) + new_axes[axis] = Index(np.concatenate([ax.values for ax in axes])) if transposed: new_axes = new_axes[::-1] - blocks = [b.make_block(b.values.T, - placement=np.arange(b.shape[1]) - ) for b in blocks] + blocks = [ + b.make_block(b.values.T, placement=np.arange(b.shape[1])) + for b in blocks + ] return self.__class__(blocks, new_axes) @@ -493,51 +547,49 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - [make_block(values, - ndim=1, - placement=np.arange(len(values)))], - axes[0]) + [make_block(values, ndim=1, placement=np.arange(len(values)))], axes[0] + ) def isna(self, func, **kwargs): - return self.apply('apply', func=func, **kwargs) + return self.apply("apply", func=func, **kwargs) def where(self, **kwargs): - return self.apply('where', **kwargs) + return self.apply("where", **kwargs) def setitem(self, **kwargs): - return self.apply('setitem', **kwargs) + return self.apply("setitem", **kwargs) def putmask(self, **kwargs): - return self.apply('putmask', **kwargs) + return self.apply("putmask", **kwargs) def diff(self, **kwargs): - return self.apply('diff', **kwargs) + return self.apply("diff", **kwargs) def interpolate(self, **kwargs): - return self.apply('interpolate', **kwargs) + return self.apply("interpolate", **kwargs) def shift(self, **kwargs): - return self.apply('shift', **kwargs) + return self.apply("shift", **kwargs) def fillna(self, **kwargs): - return self.apply('fillna', **kwargs) + return self.apply("fillna", **kwargs) def downcast(self, **kwargs): - return self.apply('downcast', **kwargs) + return self.apply("downcast", **kwargs) def astype(self, dtype, **kwargs): - return self.apply('astype', dtype=dtype, **kwargs) + 
return self.apply("astype", dtype=dtype, **kwargs) def convert(self, **kwargs): - return self.apply('convert', **kwargs) + return self.apply("convert", **kwargs) def replace(self, **kwargs): - return self.apply('replace', **kwargs) + return self.apply("replace", **kwargs) def replace_list(self, src_list, dest_list, inplace=False, regex=False): """ do a list replace """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # figure out our mask a-priori to avoid repeated replacements values = self.as_array() @@ -549,9 +601,10 @@ def comp(s, regex=False): """ if isna(s): return isna(values) - if hasattr(s, 'asm8'): - return _compare_or_regex_search(maybe_convert_objects(values), - getattr(s, 'asm8'), regex) + if hasattr(s, "asm8"): + return _compare_or_regex_search( + maybe_convert_objects(values), getattr(s, "asm8"), regex + ) return _compare_or_regex_search(values, s, regex) masks = [comp(s, regex) for i, s in enumerate(src_list)] @@ -568,9 +621,14 @@ def comp(s, regex=False): for b in rb: m = masks[i][b.mgr_locs.indexer] convert = i == src_len - result = b._replace_coerce(mask=m, to_replace=s, value=d, - inplace=inplace, - convert=convert, regex=regex) + result = b._replace_coerce( + mask=m, + to_replace=s, + value=d, + inplace=inplace, + convert=convert, + regex=regex, + ) if m.any(): new_rb = _extend_blocks(result, new_rb) else: @@ -659,15 +717,15 @@ def combine(self, blocks, copy=True): return self.make_empty() # FIXME: optimization potential - indexer = np.sort(np.concatenate([b.mgr_locs.as_array - for b in blocks])) + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks = [] for b in blocks: b = b.copy(deep=copy) - b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array, - axis=0, allow_fill=False) + b.mgr_locs = algos.take_1d( + inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False + ) new_blocks.append(b) axes = list(self.axes) @@ -717,15 +775,14 @@ def copy(self, deep=True): """ # this preserves the notion of view copying of axes if deep: - if deep == 'all': + if deep == "all": copy = lambda ax: ax.copy(deep=True) else: copy = lambda ax: ax.view() new_axes = [copy(ax) for ax in self.axes] else: new_axes = list(self.axes) - return self.apply('copy', axes=new_axes, deep=deep, - do_integrity_check=False) + return self.apply("copy", axes=new_axes, deep=deep, do_integrity_check=False) def as_array(self, transpose=False, items=None): """Convert the blockmanager data into an numpy array. 
@@ -775,7 +832,7 @@ def _interleave(self): if is_sparse(dtype): dtype = dtype.subtype elif is_extension_array_dtype(dtype): - dtype = 'object' + dtype = "object" result = np.empty(self.shape, dtype=dtype) @@ -787,7 +844,7 @@ def _interleave(self): itemmask[rl.indexer] = 1 if not itemmask.all(): - raise AssertionError('Some items were not contained in blocks') + raise AssertionError("Some items were not contained in blocks") return result @@ -813,8 +870,7 @@ def to_dict(self, copy=True): for b in self.blocks: bd.setdefault(str(b.dtype), []).append(b) - return {dtype: self.combine(blocks, copy=copy) - for dtype, blocks in bd.items()} + return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()} def fast_xs(self, loc): """ @@ -853,9 +909,7 @@ def fast_xs(self, loc): result[rl] = blk._try_coerce_result(blk.iget((i, loc))) if is_extension_array_dtype(dtype): - result = dtype.construct_array_type()._from_sequence( - result, dtype=dtype - ) + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) return result @@ -907,9 +961,9 @@ def get(self, item, fastpath=True): raise TypeError("cannot label index with a null key") indexer = self.items.get_indexer_for([item]) - return self.reindex_indexer(new_axis=self.items[indexer], - indexer=indexer, axis=0, - allow_dups=True) + return self.reindex_indexer( + new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True + ) def iget(self, i, fastpath=True): """ @@ -924,10 +978,13 @@ def iget(self, i, fastpath=True): # fastpath shortcut for select a single-dim from a 2-dim BM return SingleBlockManager( - [block.make_block_same_class(values, - placement=slice(0, len(values)), - ndim=1)], - self.axes[1]) + [ + block.make_block_same_class( + values, placement=slice(0, len(values)), ndim=1 + ) + ], + self.axes[1], + ) def delete(self, item): """ @@ -962,8 +1019,9 @@ def delete(self, item): # FIXME: use Index.delete as soon as it uses fastpath=True self.axes[0] = self.items[~is_deleted] - self.blocks = tuple(b for blkno, b in enumerate(self.blocks) - if not is_blk_deleted[blkno]) + self.blocks = tuple( + b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno] + ) self._shape = None self._rebuild_blknos_and_blklocs() @@ -977,28 +1035,32 @@ def set(self, item, value): # TODO(EA): Remove an is_extension_ when all extension types satisfy # the interface - value_is_extension_type = (is_extension_type(value) or - is_extension_array_dtype(value)) + value_is_extension_type = is_extension_type(value) or is_extension_array_dtype( + value + ) # categorical/sparse/datetimetz if value_is_extension_type: def value_getitem(placement): return value + else: if value.ndim == self.ndim - 1: value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value + else: def value_getitem(placement): return value[placement.indexer] if value.shape[1:] != self.shape[1:]: - raise AssertionError('Shape of new values must be compatible ' - 'with manager shape') + raise AssertionError( + "Shape of new values must be compatible " "with manager shape" + ) try: loc = self.items.get_loc(item) @@ -1016,9 +1078,9 @@ def value_getitem(placement): unfit_mgr_locs = [] unfit_val_locs = [] removed_blknos = [] - for blkno, val_locs in libinternals.get_blkno_placements(blknos, - self.nblocks, - group=True): + for blkno, val_locs in libinternals.get_blkno_placements( + blknos, self.nblocks, group=True + ): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): @@ -1042,12 +1104,13 @@ def 
value_getitem(placement): new_blknos = np.empty(self.nblocks, dtype=np.int64) new_blknos.fill(-1) - new_blknos[~is_deleted] = np.arange(self.nblocks - - len(removed_blknos)) - self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0, - allow_fill=False) - self.blocks = tuple(blk for i, blk in enumerate(self.blocks) - if i not in set(removed_blknos)) + new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos)) + self._blknos = algos.take_1d( + new_blknos, self._blknos, axis=0, allow_fill=False + ) + self.blocks = tuple( + blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) + ) if unfit_val_locs: unfit_mgr_locs = np.concatenate(unfit_mgr_locs) @@ -1058,12 +1121,15 @@ def value_getitem(placement): # This code (ab-)uses the fact that sparse blocks contain only # one item. new_blocks.extend( - make_block(values=value.copy(), ndim=self.ndim, - placement=slice(mgr_loc, mgr_loc + 1)) - for mgr_loc in unfit_mgr_locs) - - self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + - len(self.blocks)) + make_block( + values=value.copy(), + ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1), + ) + for mgr_loc in unfit_mgr_locs + ) + + self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks) self._blklocs[unfit_mgr_locs] = 0 else: @@ -1071,8 +1137,12 @@ def value_getitem(placement): unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) new_blocks.append( - make_block(values=value_getitem(unfit_val_items), - ndim=self.ndim, placement=unfit_mgr_locs)) + make_block( + values=value_getitem(unfit_val_items), + ndim=self.ndim, + placement=unfit_mgr_locs, + ) + ) self._blknos[unfit_mgr_locs] = len(self.blocks) self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) @@ -1097,7 +1167,7 @@ def insert(self, loc, item, value, allow_duplicates=False): """ if not allow_duplicates and item in self.items: # Should this be a different kind of error?? - raise ValueError('cannot insert {}, already exists'.format(item)) + raise ValueError("cannot insert {}, already exists".format(item)) if not isinstance(loc, int): raise TypeError("loc must be int") @@ -1105,8 +1175,7 @@ def insert(self, loc, item, value, allow_duplicates=False): # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) - block = make_block(values=value, ndim=self.ndim, - placement=slice(loc, loc + 1)) + block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) for blkno, count in _fast_count_smallints(self._blknos[loc:]): blk = self.blocks[blkno] @@ -1134,20 +1203,24 @@ def insert(self, loc, item, value, allow_duplicates=False): if len(self.blocks) > 100: self._consolidate_inplace() - def reindex_axis(self, new_index, axis, method=None, limit=None, - fill_value=None, copy=True): + def reindex_axis( + self, new_index, axis, method=None, limit=None, fill_value=None, copy=True + ): """ Conform block manager to new index. 
""" new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex(new_index, method=method, - limit=limit) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit + ) - return self.reindex_indexer(new_index, indexer, axis=axis, - fill_value=fill_value, copy=copy) + return self.reindex_indexer( + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + ) - def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, - allow_dups=False, copy=True): + def reindex_indexer( + self, new_axis, indexer, axis, fill_value=None, allow_dups=False, copy=True + ): """ Parameters ---------- @@ -1178,12 +1251,18 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, - fill_tuple=(fill_value,)) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) else: - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not None else blk.fill_value,)) - for blk in self.blocks] + new_blocks = [ + blk.take_nd( + indexer, + axis=axis, + fill_tuple=( + fill_value if fill_value is not None else blk.fill_value, + ), + ) + for blk in self.blocks + ] new_axes = list(self.axes) new_axes[axis] = new_axis @@ -1204,30 +1283,38 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): allow_fill = fill_tuple is not None sl_type, slobj, sllen = _preprocess_slice_or_indexer( - slice_or_indexer, self.shape[0], allow_fill=allow_fill) + slice_or_indexer, self.shape[0], allow_fill=allow_fill + ) if self._is_single_block: blk = self.blocks[0] - if sl_type in ('slice', 'mask'): + if sl_type in ("slice", "mask"): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: _, fill_value = maybe_promote(blk.dtype) fill_tuple = (fill_value,) - return [blk.take_nd(slobj, axis=0, - new_mgr_locs=slice(0, sllen), - fill_tuple=fill_tuple)] + return [ + blk.take_nd( + slobj, + axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple, + ) + ] - if sl_type in ('slice', 'mask'): + if sl_type in ("slice", "mask"): blknos = self._blknos[slobj] blklocs = self._blklocs[slobj] else: - blknos = algos.take_1d(self._blknos, slobj, fill_value=-1, - allow_fill=allow_fill) - blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1, - allow_fill=allow_fill) + blknos = algos.take_1d( + self._blknos, slobj, fill_value=-1, allow_fill=allow_fill + ) + blklocs = algos.take_1d( + self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill + ) # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). @@ -1235,15 +1322,16 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, # pytables serialization will break otherwise. blocks = [] - for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, - self.nblocks, - group=True): + for blkno, mgr_locs in libinternals.get_blkno_placements( + blknos, self.nblocks, group=True + ): if blkno == -1: # If we've got here, fill_tuple was not None. 
fill_value = fill_tuple[0] - blocks.append(self._make_na_block(placement=mgr_locs, - fill_value=fill_value)) + blocks.append( + self._make_na_block(placement=mgr_locs, fill_value=fill_value) + ) else: blk = self.blocks[blkno] @@ -1258,9 +1346,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blocks.append(newblk) else: - blocks.append(blk.take_nd(blklocs[mgr_locs.indexer], - axis=0, new_mgr_locs=mgr_locs, - fill_tuple=None)) + blocks.append( + blk.take_nd( + blklocs[mgr_locs.indexer], + axis=0, + new_mgr_locs=mgr_locs, + fill_tuple=None, + ) + ) return blocks @@ -1282,10 +1375,11 @@ def take(self, indexer, axis=1, verify=True, convert=True): Take items along any axis. """ self._consolidate_inplace() - indexer = (np.arange(indexer.start, indexer.stop, indexer.step, - dtype='int64') - if isinstance(indexer, slice) - else np.asanyarray(indexer, dtype='int64')) + indexer = ( + np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") + if isinstance(indexer, slice) + else np.asanyarray(indexer, dtype="int64") + ) n = self.shape[axis] if convert: @@ -1293,12 +1387,14 @@ def take(self, indexer, axis=1, verify=True, convert=True): if verify: if ((indexer == -1) | (indexer >= n)).any(): - raise Exception('Indices must be nonzero and less than ' - 'the axis length') + raise Exception( + "Indices must be nonzero and less than " "the axis length" + ) new_labels = self.axes[axis].take(indexer) - return self.reindex_indexer(new_axis=new_labels, indexer=indexer, - axis=axis, allow_dups=True) + return self.reindex_indexer( + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + ) def equals(self, other): self_axes, other_axes = self.axes, other.axes @@ -1320,8 +1416,9 @@ def canonicalize(block): self_blocks = sorted(self.blocks, key=canonicalize) other_blocks = sorted(other.blocks, key=canonicalize) - return all(block.equals(oblock) - for block, oblock in zip(self_blocks, other_blocks)) + return all( + block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks) + ) def unstack(self, unstacker_func, fill_value): """Return a blockmanager with all blocks unstacked. 
@@ -1346,11 +1443,10 @@ def unstack(self, unstacker_func, fill_value): for blk in self.blocks: blocks, mask = blk._unstack( - partial(unstacker_func, - value_columns=self.items[blk.mgr_locs.indexer]), + partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]), new_columns, n_rows, - fill_value + fill_value, ) new_blocks.extend(blocks) @@ -1370,15 +1466,18 @@ class SingleBlockManager(BlockManager): _known_consolidated = True __slots__ = () - def __init__(self, - block: Block, - axis: Union[Index, List[Index]], - do_integrity_check: bool = False, - fastpath: bool = False): + def __init__( + self, + block: Block, + axis: Union[Index, List[Index]], + do_integrity_check: bool = False, + fastpath: bool = False, + ): if isinstance(axis, list): if len(axis) != 1: - raise ValueError("cannot create SingleBlockManager with more " - "than 1 axis") + raise ValueError( + "cannot create SingleBlockManager with more " "than 1 axis" + ) axis = axis[0] # passed from constructor, single block, single axis @@ -1390,8 +1489,9 @@ def __init__(self, if len(block) == 0: block = [np.array([])] elif len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') + raise ValueError( + "Cannot create SingleBlockManager with " "more than 1 block" + ) block = block[0] else: self.axes = [ensure_index(axis)] @@ -1406,8 +1506,9 @@ def __init__(self, block = _consolidate(block) if len(block) != 1: - raise ValueError('Cannot create SingleBlockManager with ' - 'more than 1 block') + raise ValueError( + "Cannot create SingleBlockManager with " "more than 1 block" + ) block = block[0] if not isinstance(block, Block): @@ -1440,8 +1541,9 @@ def get_slice(self, slobj, axis=0): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") - return self.__class__(self._block._slice(slobj), - self.index[slobj], fastpath=True) + return self.__class__( + self._block._slice(slobj), self.index[slobj], fastpath=True + ) @property def index(self): @@ -1449,8 +1551,8 @@ def index(self): def convert(self, **kwargs): """ convert the whole block as one """ - kwargs['by_item'] = False - return self.apply('convert', **kwargs) + kwargs["by_item"] = False + return self.apply("convert", **kwargs) @property def dtype(self): @@ -1547,13 +1649,11 @@ def concat(self, to_concat, new_axis): else: values = [x.values for x in blocks] values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) + new_block = make_block(values, placement=slice(0, len(values), 1)) else: values = [x._block.values for x in to_concat] values = _concat._concat_compat(values) - new_block = make_block( - values, placement=slice(0, len(values), 1)) + new_block = make_block(values, placement=slice(0, len(values), 1)) mgr = SingleBlockManager(new_block, new_axis) return mgr @@ -1562,6 +1662,7 @@ def concat(self, to_concat, new_axis): # -------------------------------------------------------------------- # Constructor Helpers + def create_block_manager_from_blocks(blocks, axes): try: if len(blocks) == 1 and not isinstance(blocks[0], Block): @@ -1572,15 +1673,16 @@ def create_block_manager_from_blocks(blocks, axes): # It's OK if a single block is passed as values, its placement # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. 
- blocks = [make_block(values=blocks[0], - placement=slice(0, len(axes[0])))] + blocks = [ + make_block(values=blocks[0], placement=slice(0, len(axes[0]))) + ] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr except ValueError as e: - blocks = [getattr(b, 'values', b) for b in blocks] + blocks = [getattr(b, "values", b) for b in blocks] tot_items = sum(b.shape[0] for b in blocks) construction_error(tot_items, blocks[0].shape[1:], axes, e) @@ -1612,12 +1714,14 @@ def construction_error(tot_items, block_shape, axes, e=None): raise e if block_shape[0] == 0: raise ValueError("Empty data passed with indices specified.") - raise ValueError("Shape of passed values is {0}, indices imply {1}".format( - passed, implied)) + raise ValueError( + "Shape of passed values is {0}, indices imply {1}".format(passed, implied) + ) # ----------------------------------------------------------------------- + def form_blocks(arrays, names, axes): # put "leftover" items in float bucket, where else? # generalize? @@ -1643,60 +1747,61 @@ def form_blocks(arrays, names, axes): items_dict[block_type.__name__].append((i, k, v)) blocks = [] - if len(items_dict['FloatBlock']): - float_blocks = _multi_blockify(items_dict['FloatBlock']) + if len(items_dict["FloatBlock"]): + float_blocks = _multi_blockify(items_dict["FloatBlock"]) blocks.extend(float_blocks) - if len(items_dict['ComplexBlock']): - complex_blocks = _multi_blockify(items_dict['ComplexBlock']) + if len(items_dict["ComplexBlock"]): + complex_blocks = _multi_blockify(items_dict["ComplexBlock"]) blocks.extend(complex_blocks) - if len(items_dict['TimeDeltaBlock']): - timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock']) + if len(items_dict["TimeDeltaBlock"]): + timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) blocks.extend(timedelta_blocks) - if len(items_dict['IntBlock']): - int_blocks = _multi_blockify(items_dict['IntBlock']) + if len(items_dict["IntBlock"]): + int_blocks = _multi_blockify(items_dict["IntBlock"]) blocks.extend(int_blocks) - if len(items_dict['DatetimeBlock']): - datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'], - _NS_DTYPE) + if len(items_dict["DatetimeBlock"]): + datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], _NS_DTYPE) blocks.extend(datetime_blocks) - if len(items_dict['DatetimeTZBlock']): - dttz_blocks = [make_block(array, - klass=DatetimeTZBlock, - placement=[i]) - for i, _, array in items_dict['DatetimeTZBlock']] + if len(items_dict["DatetimeTZBlock"]): + dttz_blocks = [ + make_block(array, klass=DatetimeTZBlock, placement=[i]) + for i, _, array in items_dict["DatetimeTZBlock"] + ] blocks.extend(dttz_blocks) - if len(items_dict['BoolBlock']): - bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_) + if len(items_dict["BoolBlock"]): + bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_) blocks.extend(bool_blocks) - if len(items_dict['ObjectBlock']) > 0: - object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_) + if len(items_dict["ObjectBlock"]) > 0: + object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_) blocks.extend(object_blocks) - if len(items_dict['CategoricalBlock']) > 0: - cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i]) - for i, _, array in items_dict['CategoricalBlock']] + if len(items_dict["CategoricalBlock"]) > 0: + cat_blocks = [ + make_block(array, klass=CategoricalBlock, placement=[i]) + for i, _, array in items_dict["CategoricalBlock"] + ] blocks.extend(cat_blocks) - if 
len(items_dict['ExtensionBlock']): + if len(items_dict["ExtensionBlock"]): external_blocks = [ make_block(array, klass=ExtensionBlock, placement=[i]) - for i, _, array in items_dict['ExtensionBlock'] + for i, _, array in items_dict["ExtensionBlock"] ] blocks.extend(external_blocks) - if len(items_dict['ObjectValuesExtensionBlock']): + if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ make_block(array, klass=ObjectValuesExtensionBlock, placement=[i]) - for i, _, array in items_dict['ObjectValuesExtensionBlock'] + for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] blocks.extend(external_blocks) @@ -1756,7 +1861,7 @@ def _asarray_compat(x): def _shape_compat(x): if isinstance(x, ABCSeries): - return len(x), + return (len(x),) else: return x.shape @@ -1773,7 +1878,8 @@ def _shape_compat(x): def _interleaved_dtype( - blocks: List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]: + blocks: List[Block] +) -> Optional[Union[np.dtype, ExtensionDtype]]: """Find the common dtype for `blocks`. Parameters @@ -1802,8 +1908,9 @@ def _consolidate(blocks): new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, - _can_consolidate=_can_consolidate) + merged_blocks = _merge_blocks( + list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate + ) new_blocks = _extend_blocks(merged_blocks, new_blocks) return new_blocks @@ -1828,8 +1935,9 @@ def _compare_or_regex_search(a, b, regex=False): if not regex: op = lambda x: operator.eq(x, b) else: - op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str) - else False) + op = np.vectorize( + lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + ) is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) @@ -1848,14 +1956,16 @@ def _compare_or_regex_search(a, b, regex=False): type_names = [type(a).__name__, type(b).__name__] if is_a_array: - type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype) + type_names[0] = "ndarray(dtype={dtype})".format(dtype=a.dtype) if is_b_array: - type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype) + type_names[1] = "ndarray(dtype={dtype})".format(dtype=b.dtype) raise TypeError( - "Cannot compare types {a!r} and {b!r}".format(a=type_names[0], - b=type_names[1])) + "Cannot compare types {a!r} and {b!r}".format( + a=type_names[0], b=type_names[1] + ) + ) return result @@ -1869,8 +1979,10 @@ def _transform_index(index, func, level=None): """ if isinstance(index, MultiIndex): if level is not None: - items = [tuple(func(y) if i == level else y - for i, y in enumerate(x)) for x in index] + items = [ + tuple(func(y) if i == level else y for i, y in enumerate(x)) + for x in index + ] else: items = [tuple(func(y) for y in x) for x in index] return MultiIndex.from_tuples(items, names=index.names) @@ -1888,16 +2000,20 @@ def _fast_count_smallints(arr): def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): if isinstance(slice_or_indexer, slice): - return ('slice', slice_or_indexer, - libinternals.slice_len(slice_or_indexer, length)) - elif (isinstance(slice_or_indexer, np.ndarray) and - slice_or_indexer.dtype == np.bool_): - return 'mask', slice_or_indexer, slice_or_indexer.sum() + return ( + "slice", + slice_or_indexer, + libinternals.slice_len(slice_or_indexer, length), + ) + elif ( + isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_ + ): + return "mask", slice_or_indexer, slice_or_indexer.sum() else: indexer = 
np.asanyarray(slice_or_indexer, dtype=np.int64) if not allow_fill: indexer = maybe_convert_indices(indexer, length) - return 'fancy', indexer, len(indexer) + return "fancy", indexer, len(indexer) def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): @@ -1912,8 +2028,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): copy : bool """ - concat_plans = [get_mgr_concatenation_plan(mgr, indexers) - for mgr, indexers in mgrs_indexers] + concat_plans = [ + get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers + ] concat_plan = combine_concat_plans(concat_plans, concat_axis) blocks = [] @@ -1929,11 +2046,13 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): b = b.make_block_same_class(values, placement=placement) elif is_uniform_join_units(join_units): b = join_units[0].block.concat_same_type( - [ju.block for ju in join_units], placement=placement) + [ju.block for ju in join_units], placement=placement + ) else: b = make_block( concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement) + placement=placement, + ) blocks.append(b) return BlockManager(blocks, axes) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 4230b212f567a..ad4b5e4523806 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -10,9 +10,17 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( - ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype, - is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_v_string_like, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import isna @@ -69,47 +77,73 @@ def mask_missing(arr, values_to_mask): def clean_fill_method(method, allow_nearest=False): # asfreq is compat for resampling - if method in [None, 'asfreq']: + if method in [None, "asfreq"]: return None if isinstance(method, str): method = method.lower() - if method == 'ffill': - method = 'pad' - elif method == 'bfill': - method = 'backfill' + if method == "ffill": + method = "pad" + elif method == "bfill": + method = "backfill" - valid_methods = ['pad', 'backfill'] - expecting = 'pad (ffill) or backfill (bfill)' + valid_methods = ["pad", "backfill"] + expecting = "pad (ffill) or backfill (bfill)" if allow_nearest: - valid_methods.append('nearest') - expecting = 'pad (ffill), backfill (bfill) or nearest' + valid_methods.append("nearest") + expecting = "pad (ffill), backfill (bfill) or nearest" if method not in valid_methods: - msg = ('Invalid fill method. Expecting {expecting}. Got {method}' - .format(expecting=expecting, method=method)) + msg = "Invalid fill method. Expecting {expecting}. 
Got {method}".format( + expecting=expecting, method=method + ) raise ValueError(msg) return method def clean_interp_method(method, **kwargs): - order = kwargs.get('order') - valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', - 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh', - 'piecewise_polynomial', 'pchip', 'akima', 'spline', - 'from_derivatives'] - if method in ('spline', 'polynomial') and order is None: - raise ValueError("You must specify the order of the spline or " - "polynomial.") + order = kwargs.get("order") + valid = [ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "polynomial", + "krogh", + "piecewise_polynomial", + "pchip", + "akima", + "spline", + "from_derivatives", + ] + if method in ("spline", "polynomial") and order is None: + raise ValueError("You must specify the order of the spline or " "polynomial.") if method not in valid: - raise ValueError("method must be one of {valid}. Got '{method}' " - "instead.".format(valid=valid, method=method)) + raise ValueError( + "method must be one of {valid}. Got '{method}' " + "instead.".format(valid=valid, method=method) + ) return method -def interpolate_1d(xvalues, yvalues, method='linear', limit=None, - limit_direction='forward', limit_area=None, fill_value=None, - bounds_error=False, order=None, **kwargs): +def interpolate_1d( + xvalues, + yvalues, + method="linear", + limit=None, + limit_direction="forward", + limit_area=None, + fill_value=None, + bounds_error=False, + order=None, + **kwargs +): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. @@ -132,39 +166,44 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, if valid.all(): return yvalues - if method == 'time': - if not getattr(xvalues, 'is_all_dates', None): + if method == "time": + if not getattr(xvalues, "is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): - raise ValueError('time-weighted interpolation only works ' - 'on Series or DataFrames with a ' - 'DatetimeIndex') - method = 'values' - - valid_limit_directions = ['forward', 'backward', 'both'] + raise ValueError( + "time-weighted interpolation only works " + "on Series or DataFrames with a " + "DatetimeIndex" + ) + method = "values" + + valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: - msg = ('Invalid limit_direction: expecting one of {valid!r}, ' - 'got {invalid!r}.') - raise ValueError(msg.format(valid=valid_limit_directions, - invalid=limit_direction)) + msg = "Invalid limit_direction: expecting one of {valid!r}, " "got {invalid!r}." 
+ raise ValueError( + msg.format(valid=valid_limit_directions, invalid=limit_direction) + ) if limit_area is not None: - valid_limit_areas = ['inside', 'outside'] + valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: - raise ValueError('Invalid limit_area: expecting one of {}, got ' - '{}.'.format(valid_limit_areas, limit_area)) + raise ValueError( + "Invalid limit_area: expecting one of {}, got " + "{}.".format(valid_limit_areas, limit_area) + ) # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): - raise ValueError('Limit must be an integer') + raise ValueError("Limit must be an integer") elif limit < 1: - raise ValueError('Limit must be greater than 0') + raise ValueError("Limit must be greater than 0") from pandas import Series + ys = Series(yvalues) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... @@ -182,9 +221,9 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit - if limit_direction == 'forward': + if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == 'backward': + elif limit_direction == "backward": preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit @@ -192,22 +231,22 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 - if limit_area == 'inside': + if limit_area == "inside": # preserve NaNs on the outside preserve_nans |= start_nans | end_nans - elif limit_area == 'outside': + elif limit_area == "outside": # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) - xvalues = getattr(xvalues, 'values', xvalues) - yvalues = getattr(yvalues, 'values', yvalues) + xvalues = getattr(xvalues, "values", xvalues) + yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() - if method in ['linear', 'time', 'index', 'values']: - if method in ('values', 'index'): + if method in ["linear", "time", "index", "values"]: + if method in ("values", "index"): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): @@ -220,73 +259,99 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, result[preserve_nans] = np.nan return result - sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'barycentric', 'krogh', 'spline', 'polynomial', - 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] + sp_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) - result[invalid] = _interpolate_scipy_wrapper(inds[valid], - yvalues[valid], - inds[invalid], - method=method, - fill_value=fill_value, - bounds_error=bounds_error, - order=order, **kwargs) + result[invalid] = _interpolate_scipy_wrapper( + inds[valid], + yvalues[valid], + inds[invalid], + method=method, + fill_value=fill_value, + bounds_error=bounds_error, + order=order, + **kwargs + 
) result[preserve_nans] = np.nan return result -def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, - bounds_error=False, order=None, **kwargs): +def _interpolate_scipy_wrapper( + x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs +): """ Passed off to scipy.interpolate.interp1d. method is scipy's kind. Returns an array interpolated at new_x. Add any new methods to the list in _clean_interp_method. """ - extra = '{method} interpolation requires SciPy.'.format(method=method) - import_optional_dependency('scipy', extra=extra) + extra = "{method} interpolation requires SciPy.".format(method=method) + import_optional_dependency("scipy", extra=extra) from scipy import interpolate new_x = np.asarray(new_x) # ignores some kwargs that could be passed along. alt_methods = { - 'barycentric': interpolate.barycentric_interpolate, - 'krogh': interpolate.krogh_interpolate, - 'from_derivatives': _from_derivatives, - 'piecewise_polynomial': _from_derivatives, + "barycentric": interpolate.barycentric_interpolate, + "krogh": interpolate.krogh_interpolate, + "from_derivatives": _from_derivatives, + "piecewise_polynomial": _from_derivatives, } - if getattr(x, 'is_all_dates', False): + if getattr(x, "is_all_dates", False): # GH 5975, scipy.interp1d can't hande datetime64s - x, new_x = x._values.astype('i8'), new_x.astype('i8') + x, new_x = x._values.astype("i8"), new_x.astype("i8") - if method == 'pchip': + if method == "pchip": try: - alt_methods['pchip'] = interpolate.pchip_interpolate + alt_methods["pchip"] = interpolate.pchip_interpolate except AttributeError: - raise ImportError("Your version of Scipy does not support " - "PCHIP interpolation.") - elif method == 'akima': - alt_methods['akima'] = _akima_interpolate - - interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial'] + raise ImportError( + "Your version of Scipy does not support " "PCHIP interpolation." 
+ ) + elif method == "akima": + alt_methods["akima"] = _akima_interpolate + + interp1d_methods = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "polynomial", + ] if method in interp1d_methods: - if method == 'polynomial': + if method == "polynomial": method = order - terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value, - bounds_error=bounds_error) + terp = interpolate.interp1d( + x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error + ) new_y = terp(new_x) - elif method == 'spline': + elif method == "spline": # GH #10633, #24014 if isna(order) or (order <= 0): - raise ValueError("order needs to be specified and greater than 0; " - "got order: {}".format(order)) + raise ValueError( + "order needs to be specified and greater than 0; " + "got order: {}".format(order) + ) terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs) new_y = terp(new_x) else: @@ -341,8 +406,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False): # return the method for compat with scipy version & backwards compat method = interpolate.BPoly.from_derivatives - m = method(xi, yi.reshape(-1, 1), - orders=order, extrapolate=extrapolate) + m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate) return m(x) @@ -384,6 +448,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): """ from scipy import interpolate + P = interpolate.Akima1DInterpolator(xi, yi, axis=axis) if der == 0: @@ -394,8 +459,9 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): return [P(x, nu) for nu in der] -def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, - dtype=None): +def interpolate_2d( + values, method="pad", axis=0, limit=None, fill_value=None, dtype=None +): """ Perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result. @@ -407,8 +473,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, ndim = values.ndim if values.ndim == 1: if axis != 0: # pragma: no cover - raise AssertionError("cannot interpolate on a ndim == 1 with " - "axis != 0") + raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0") values = values.reshape(tuple((1,) + values.shape)) if fill_value is None: @@ -417,12 +482,12 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, mask = mask_missing(transf(values), fill_value) method = clean_fill_method(method) - if method == 'pad': - values = transf(pad_2d( - transf(values), limit=limit, mask=mask, dtype=dtype)) + if method == "pad": + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) else: - values = transf(backfill_2d( - transf(values), limit=limit, mask=mask, dtype=dtype)) + values = transf( + backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) + ) # reshape back if ndim == 1: @@ -438,8 +503,11 @@ def _cast_values_for_fillna(values, dtype): # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? 
- if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or - is_timedelta64_dtype(dtype)): + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): values = values.view(np.int64) elif is_integer_dtype(values): @@ -498,7 +566,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): return values -_fill_methods = {'pad': pad_1d, 'backfill': backfill_1d} +_fill_methods = {"pad": pad_1d, "backfill": backfill_1d} def get_fill_func(method): @@ -523,10 +591,10 @@ def fill_zeros(result, x, y, name, fill): if fill is None or is_float_dtype(result): return result - if name.startswith(('r', '__r')): + if name.startswith(("r", "__r")): x, y = y, x - is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) + is_variable_type = hasattr(y, "dtype") or hasattr(y, "type") is_scalar_type = is_scalar(y) if not is_variable_type and not is_scalar_type: @@ -544,15 +612,15 @@ def fill_zeros(result, x, y, name, fill): mask = ((y == 0) & ~np.isnan(result)).ravel() shape = result.shape - result = result.astype('float64', copy=False).ravel() + result = result.astype("float64", copy=False).ravel() np.putmask(result, mask, fill) # if we have a fill of inf, then sign it correctly # (GH 6178 and PR 9308) if np.isinf(fill): - signs = y if name.startswith(('r', '__r')) else x - signs = np.sign(signs.astype('float', copy=False)) + signs = y if name.startswith(("r", "__r")) else x + signs = np.sign(signs.astype("float", copy=False)) negative_inf_mask = (signs.ravel() < 0) & mask np.putmask(result, negative_inf_mask, -fill) @@ -606,7 +674,7 @@ def mask_zero_div_zero(x, y, result, copy=False): if nan_mask.any() or neginf_mask.any() or posinf_mask.any(): # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN - result = result.astype('float64', copy=copy).ravel() + result = result.astype("float64", copy=copy).ravel() np.putmask(result, nan_mask, np.nan) np.putmask(result, posinf_mask, np.inf) @@ -633,9 +701,8 @@ def dispatch_missing(op, left, right, result): ------- result : ndarray """ - opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__') - if op in [operator.truediv, operator.floordiv, - getattr(operator, 'div', None)]: + opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__") + if op in [operator.truediv, operator.floordiv, getattr(operator, "div", None)]: result = mask_zero_div_zero(left, right, result) elif op is operator.mod: result = fill_zeros(result, left, right, opstr, np.nan) @@ -684,8 +751,9 @@ def _interp_limit(invalid, fw_limit, bw_limit): def inner(invalid, limit): limit = min(limit, N) windowed = _rolling_window(invalid, limit + 1).all(1) - idx = (set(np.where(windowed)[0] + limit) | - set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0])) + idx = set(np.where(windowed)[0] + limit) | set( + np.where((~invalid[: limit + 1]).cumsum() == 0)[0] + ) return idx if fw_limit is not None: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index cc8b241bedba1..ce14cb22a88ce 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -12,18 +12,30 @@ from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, - is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, - is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype) + _get_dtype, + is_any_int_dtype, + 
is_bool_dtype, + is_complex, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_numeric_dtype, + is_object_dtype, + is_scalar, + is_timedelta64_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.common as com -bn = import_optional_dependency("bottleneck", - raise_on_missing=False, - on_version="warn") +bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn") _BOTTLENECK_INSTALLED = bn is not None _USE_BOTTLENECK = False @@ -35,28 +47,26 @@ def set_use_bottleneck(v=True): _USE_BOTTLENECK = v -set_use_bottleneck(get_option('compute.use_bottleneck')) +set_use_bottleneck(get_option("compute.use_bottleneck")) class disallow: - def __init__(self, *dtypes): super().__init__() self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes) def check(self, obj): - return hasattr(obj, 'dtype') and issubclass(obj.dtype.type, - self.dtypes) + return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes) def __call__(self, f): @functools.wraps(f) def _f(*args, **kwargs): obj_iter = itertools.chain(args, kwargs.values()) if any(self.check(obj) for obj in obj_iter): - msg = 'reduction operation {name!r} not allowed for this dtype' - raise TypeError(msg.format(name=f.__name__.replace('nan', ''))) + msg = "reduction operation {name!r} not allowed for this dtype" + raise TypeError(msg.format(name=f.__name__.replace("nan", ""))) try: - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): return f(*args, **kwargs) except ValueError as e: # we want to transform an object array @@ -71,7 +81,6 @@ def _f(*args, **kwargs): class bottleneck_switch: - def __init__(self, name=None, **kwargs): self.name = name self.kwargs = kwargs @@ -91,7 +100,7 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if values.size == 0 and kwds.get('min_count') is None: + if values.size == 0 and kwds.get("min_count") is None: # We are empty, returning NA for our type # Only applies for the default `min_count` of None # since that affects how empty arrays are handled. 
@@ -100,8 +109,7 @@ def f(values, axis=None, skipna=True, **kwds): # It *may* just be `var` return _na_for_min_count(values, axis) - if (_USE_BOTTLENECK and skipna and - _bn_ok_dtype(values.dtype, bn_name)): + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name): result = bn_func(values, axis=axis, **kwds) # prefer to treat inf/-inf as NA, but must compute the func @@ -130,9 +138,9 @@ def f(values, axis=None, skipna=True, **kwds): def _bn_ok_dtype(dt, name): # Bottleneck chokes on datetime64 - if (not is_object_dtype(dt) and - not (is_datetime_or_timedelta_dtype(dt) or - is_datetime64tz_dtype(dt))): + if not is_object_dtype(dt) and not ( + is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) + ): # GH 15507 # bottleneck does not properly upcast during the sum @@ -142,7 +150,7 @@ def _bn_ok_dtype(dt, name): # further we also want to preserve NaN when all elements # are NaN, unlinke bottleneck/numpy which consider this # to be 0 - if name in ['nansum', 'nanprod']: + if name in ["nansum", "nanprod"]: return False return True @@ -151,9 +159,9 @@ def _bn_ok_dtype(dt, name): def _has_infs(result): if isinstance(result, np.ndarray): - if result.dtype == 'f8': + if result.dtype == "f8": return lib.has_infs_f8(result.ravel()) - elif result.dtype == 'f4': + elif result.dtype == "f4": return lib.has_infs_f4(result.ravel()) try: return np.isinf(result).any() @@ -170,7 +178,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): if fill_value_typ is None: return np.nan else: - if fill_value_typ == '+inf': + if fill_value_typ == "+inf": return np.inf else: return -np.inf @@ -178,15 +186,16 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): if fill_value_typ is None: return tslibs.iNaT else: - if fill_value_typ == '+inf': + if fill_value_typ == "+inf": # need the max int here return _int64_max else: return tslibs.iNaT -def _maybe_get_mask(values: np.ndarray, skipna: bool, - mask: Optional[np.ndarray]) -> Optional[np.ndarray]: +def _maybe_get_mask( + values: np.ndarray, skipna: bool, mask: Optional[np.ndarray] +) -> Optional[np.ndarray]: """ This function will compute a mask iff it is necessary. Otherwise, return the provided mask (potentially None) when a mask does not need to be computed. @@ -227,11 +236,13 @@ def _maybe_get_mask(values: np.ndarray, skipna: bool, return mask -def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, - fill_value_typ: Optional[str] = None, - mask: Optional[np.ndarray] = None - ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, - np.dtype, Any]: +def _get_values( + values: np.ndarray, + skipna: bool, + fill_value: Any = None, + fill_value_typ: Optional[str] = None, + mask: Optional[np.ndarray] = None, +) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. 
If both mask and fill_value/fill_value_typ are not None and skipna is True, @@ -288,8 +299,9 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, # get our fill value (in case we need to provide an alternative # dtype for it) - fill_value = _get_fill_value(dtype, fill_value=fill_value, - fill_value_typ=fill_value_typ) + fill_value = _get_fill_value( + dtype, fill_value=fill_value, fill_value_typ=fill_value_typ + ) copy = (mask is not None) and (fill_value is not None) @@ -315,16 +327,19 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isna(values) - if (is_complex_dtype(values) or is_float_dtype(values) or - is_integer_dtype(values) or is_bool_dtype(values)): + if ( + is_complex_dtype(values) + or is_float_dtype(values) + or is_integer_dtype(values) + or is_bool_dtype(values) + ): return ~np.isfinite(values) - return ~np.isfinite(values.astype('float64')) + return ~np.isfinite(values.astype("float64")) def _na_ok_dtype(dtype): # TODO: what about datetime64tz? PeriodDtype? - return not issubclass(dtype.type, - (np.integer, np.timedelta64, np.datetime64)) + return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) def _wrap_results(result, dtype, fill_value=None): @@ -335,7 +350,7 @@ def _wrap_results(result, dtype, fill_value=None): # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): - tz = getattr(dtype, 'tz', None) + tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan @@ -351,9 +366,9 @@ def _wrap_results(result, dtype, fill_value=None): if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") - result = tslibs.Timedelta(result, unit='ns') + result = tslibs.Timedelta(result, unit="ns") else: - result = result.astype('i8').view(dtype) + result = result.astype("i8").view(dtype) return result @@ -375,14 +390,13 @@ def _na_for_min_count(values, axis): """ # we either return np.nan or pd.NaT if is_numeric_dtype(values): - values = values.astype('float64') + values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) if values.ndim == 1: return fill_value else: - result_shape = (values.shape[:axis] + - values.shape[axis + 1:]) + result_shape = values.shape[:axis] + values.shape[axis + 1 :] result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result @@ -416,8 +430,7 @@ def nanany(values, axis=None, skipna=True, mask=None): >>> nanops.nanany(s) False """ - values, _, _, _, _ = _get_values(values, skipna, fill_value=False, - mask=mask) + values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask) return values.any(axis) @@ -449,12 +462,11 @@ def nanall(values, axis=None, skipna=True, mask=None): >>> nanops.nanall(s) False """ - values, _, _, _, _ = _get_values(values, skipna, fill_value=True, - mask=mask) + values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask) return values.all(axis) -@disallow('M8') +@disallow("M8") def nansum(values, axis=None, skipna=True, min_count=0, mask=None): """ Sum the elements along an axis ignoring NaNs @@ -479,21 +491,21 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): >>> nanops.nansum(s) 3.0 """ - values, mask, dtype, dtype_max, _ = _get_values(values, skipna, - fill_value=0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) 
dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, - min_count=min_count) + the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) return _wrap_results(the_sum, dtype) -@disallow('M8', DatetimeTZDtype) +@disallow("M8", DatetimeTZDtype) @bottleneck_switch() def nanmean(values, axis=None, skipna=True, mask=None): """ @@ -520,12 +532,17 @@ def nanmean(values, axis=None, skipna=True, mask=None): >>> nanops.nanmean(s) 1.5 """ - values, mask, dtype, dtype_max, _ = _get_values(values, skipna, - fill_value=0, mask=mask) + values, mask, dtype, dtype_max, _ = _get_values( + values, skipna, fill_value=0, mask=mask + ) dtype_sum = dtype_max dtype_count = np.float64 - if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or - is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)): + if ( + is_integer_dtype(dtype) + or is_timedelta64_dtype(dtype) + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype @@ -533,7 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) - if axis is not None and getattr(the_sum, 'ndim', False): + if axis is not None and getattr(the_sum, "ndim", False): with np.errstate(all="ignore"): # suppress division by zero warnings the_mean = the_sum / count @@ -546,7 +563,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): return _wrap_results(the_mean, dtype) -@disallow('M8') +@disallow("M8") @bottleneck_switch() def nanmedian(values, axis=None, skipna=True, mask=None): """ @@ -571,6 +588,7 @@ def nanmedian(values, axis=None, skipna=True, mask=None): >>> nanops.nanmedian(s) 2.0 """ + def get_median(x): mask = notna(x) if not skipna and not mask.all(): @@ -579,7 +597,7 @@ def get_median(x): values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values): - values = values.astype('f8') + values = values.astype("f8") if mask is not None: values[mask] = np.nan @@ -595,7 +613,8 @@ def get_median(x): if notempty: if not skipna: return _wrap_results( - np.apply_along_axis(get_median, axis, values), dtype) + np.apply_along_axis(get_median, axis, values), dtype + ) # fastpath for the skipna case return _wrap_results(np.nanmedian(values, axis), dtype) @@ -614,10 +633,13 @@ def get_median(x): return _wrap_results(get_median(values) if notempty else np.nan, dtype) -def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray], - axis: Optional[int], ddof: int, - dtype=float) -> Tuple[Union[int, np.ndarray], - Union[int, np.ndarray]]: +def _get_counts_nanvar( + value_counts: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + ddof: int, + dtype=float, +) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]: """ Get the count of non-null values along an axis, accounting for degrees of freedom. 
@@ -656,7 +678,7 @@ def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray], return count, d -@disallow('M8') +@disallow("M8") @bottleneck_switch(ddof=1) def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -686,12 +708,11 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ - result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, - mask=mask)) + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) return _wrap_results(result, values.dtype) -@disallow('M8') +@disallow("M8") @bottleneck_switch(ddof=1) def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -725,13 +746,12 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): dtype = values.dtype mask = _maybe_get_mask(values, skipna, mask) if is_any_int_dtype(values): - values = values.astype('f8') + values = values.astype("f8") if mask is not None: values[mask] = np.nan if is_float_dtype(values): - count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, - values.dtype) + count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(values.shape, mask, axis, ddof) @@ -761,7 +781,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): return _wrap_results(result, values.dtype) -@disallow('M8', 'm8') +@disallow("M8", "m8") def nansem(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the standard error in the mean along given axis while ignoring NaNs @@ -797,7 +817,7 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) var = nanvar(values, axis, skipna, ddof=ddof) @@ -806,20 +826,18 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): - - @bottleneck_switch(name='nan' + meth) + @bottleneck_switch(name="nan" + meth) def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max, fill_value = _get_values( - values, skipna, fill_value_typ=fill_value_typ, mask=mask) + values, skipna, fill_value_typ=fill_value_typ, mask=mask + ) - if ((axis is not None and values.shape[axis] == 0) or - values.size == 0): + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, - ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): result = np.nan else: result = getattr(values, meth)(axis) @@ -830,11 +848,11 @@ def reduction(values, axis=None, skipna=True, mask=None): return reduction -nanmin = _nanminmax('min', fill_value_typ='+inf') -nanmax = _nanminmax('max', fill_value_typ='-inf') +nanmin = _nanminmax("min", fill_value_typ="+inf") +nanmax = _nanminmax("max", fill_value_typ="-inf") -@disallow('O') +@disallow("O") def nanargmax(values, axis=None, skipna=True, mask=None): """ Parameters @@ -858,13 +876,14 @@ def nanargmax(values, axis=None, skipna=True, mask=None): 4 """ values, mask, dtype, _, _ = _get_values( - values, True, fill_value_typ='-inf', mask=mask) + values, True, fill_value_typ="-inf", mask=mask + ) result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result -@disallow('O') 
+@disallow("O") def nanargmin(values, axis=None, skipna=True, mask=None): """ Parameters @@ -888,13 +907,14 @@ def nanargmin(values, axis=None, skipna=True, mask=None): 0 """ values, mask, dtype, _, _ = _get_values( - values, True, fill_value_typ='+inf', mask=mask) + values, True, fill_value_typ="+inf", mask=mask + ) result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nanskew(values, axis=None, skipna=True, mask=None): """ Compute the sample skewness. @@ -926,7 +946,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): values = com.values_from_object(values) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count = _get_counts(values.shape, mask, axis) else: count = _get_counts(values.shape, mask, axis, dtype=values.dtype) @@ -954,7 +974,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype @@ -972,7 +992,7 @@ def nanskew(values, axis=None, skipna=True, mask=None): return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nankurt(values, axis=None, skipna=True, mask=None): """ Compute the sample excess kurtosis @@ -1005,7 +1025,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): values = com.values_from_object(values) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): - values = values.astype('f8') + values = values.astype("f8") count = _get_counts(values.shape, mask, axis) else: count = _get_counts(values.shape, mask, axis, dtype=values.dtype) @@ -1026,7 +1046,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2 ** 2 @@ -1046,7 +1066,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): if denom == 0: return 0 - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): result = numer / denom - adj dtype = values.dtype @@ -1060,7 +1080,7 @@ def nankurt(values, axis=None, skipna=True, mask=None): return result -@disallow('M8', 'm8') +@disallow("M8", "m8") def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): """ Parameters @@ -1093,18 +1113,17 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask, values.shape, - min_count=min_count) + return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count) -def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int], - mask: Optional[np.ndarray], - skipna: bool) -> Union[np.ndarray, int]: +def _maybe_arg_null_out( + result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool +) -> Union[np.ndarray, int]: # helper function for nanargmin/nanargmax if mask is None: return result - if axis is None or not getattr(result, 'ndim', False): + if axis is None or not getattr(result, 
"ndim", False): if skipna: if mask.all(): result = -1 @@ -1121,8 +1140,12 @@ def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int], return result -def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray], - axis: Optional[int], dtype=float) -> Union[int, np.ndarray]: +def _get_counts( + values_shape: Tuple[int], + mask: Optional[np.ndarray], + axis: Optional[int], + dtype=float, +) -> Union[int, np.ndarray]: """ Get the count of non-null values along an axis Parameters @@ -1161,18 +1184,21 @@ def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray], return np.array(count, dtype=dtype) -def _maybe_null_out(result: np.ndarray, axis: Optional[int], - mask: Optional[np.ndarray], shape: Tuple, - min_count: int = 1) -> np.ndarray: - if (mask is not None and axis is not None and - getattr(result, 'ndim', False)): +def _maybe_null_out( + result: np.ndarray, + axis: Optional[int], + mask: Optional[np.ndarray], + shape: Tuple, + min_count: int = 1, +) -> np.ndarray: + if mask is not None and axis is not None and getattr(result, "ndim", False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): - result = result.astype('c16') + result = result.astype("c16") else: - result = result.astype('f8') + result = result.astype("f8") result[null_mask] = np.nan else: # GH12941, use None to auto cast null @@ -1191,19 +1217,19 @@ def _maybe_null_out(result: np.ndarray, axis: Optional[int], def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): return np.where(np.abs(arg) < 1e-14, 0, arg) else: return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg -@disallow('M8', 'm8') -def nancorr(a, b, method='pearson', min_periods=None): +@disallow("M8", "m8") +def nancorr(a, b, method="pearson", min_periods=None): """ a, b: ndarrays """ if len(a) != len(b): - raise AssertionError('Operands to nancorr must have same size') + raise AssertionError("Operands to nancorr must have same size") if min_periods is None: min_periods = 1 @@ -1221,7 +1247,7 @@ def nancorr(a, b, method='pearson', min_periods=None): def get_corr_func(method): - if method in ['kendall', 'spearman']: + if method in ["kendall", "spearman"]: from scipy.stats import kendalltau, spearmanr elif callable(method): return method @@ -1238,18 +1264,14 @@ def _kendall(a, b): def _spearman(a, b): return spearmanr(a, b)[0] - _cor_methods = { - 'pearson': _pearson, - 'kendall': _kendall, - 'spearman': _spearman - } + _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman} return _cor_methods[method] -@disallow('M8', 'm8') +@disallow("M8", "m8") def nancov(a, b, min_periods=None): if len(a) != len(b): - raise AssertionError('Operands to nancov must have same size') + raise AssertionError("Operands to nancov must have same size") if min_periods is None: min_periods = 1 @@ -1284,10 +1306,12 @@ def _ensure_numeric(x): try: x = complex(x) except Exception: - raise TypeError('Could not convert {value!s} to numeric' - .format(value=x)) + raise TypeError( + "Could not convert {value!s} to numeric".format(value=x) + ) return x + # NA-friendly array comparisons @@ -1297,12 +1321,12 @@ def f(x, y): ymask = isna(y) mask = xmask | ymask - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = op(x, y) if mask.any(): if is_bool_dtype(result): - result = result.astype('O') + 
result = result.astype("O") np.putmask(result, mask, np.nan) return result @@ -1344,8 +1368,7 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation): if lib.is_scalar(q): return na_value else: - return np.array([na_value] * len(q), - dtype=values.dtype) + return np.array([na_value] * len(q), dtype=values.dtype) return np.percentile(values, q, interpolation=interpolation) @@ -1372,8 +1395,9 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): """ if not lib.is_scalar(mask) and mask.any(): if ndim == 1: - return _nanpercentile_1d(values, mask, q, na_value, - interpolation=interpolation) + return _nanpercentile_1d( + values, mask, q, na_value, interpolation=interpolation + ) else: # for nonconsolidatable blocks mask is 1D, but values 2D if mask.ndim < values.ndim: @@ -1381,9 +1405,10 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): if axis == 0: values = values.T mask = mask.T - result = [_nanpercentile_1d(val, m, q, na_value, - interpolation=interpolation) - for (val, m) in zip(list(values), list(mask))] + result = [ + _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation) + for (val, m) in zip(list(values), list(mask)) + ] result = np.array(result, dtype=values.dtype, copy=False).T return result else: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 5dd8455073212..5c58a1433ba3c 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -16,16 +16,34 @@ from pandas.util._decorators import Appender from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, find_common_type, - maybe_upcast_putmask) + construct_1d_object_array_from_listlike, + find_common_type, + maybe_upcast_putmask, +) from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, - is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, - is_scalar, is_timedelta64_dtype, needs_i8_conversion) + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike_v_numeric, + is_extension_array_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_period_dtype, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndex, ABCIndexClass, ABCSeries, ABCSparseArray, - ABCSparseSeries) + ABCDataFrame, + ABCIndex, + ABCIndexClass, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas as pd @@ -80,8 +98,8 @@ def _maybe_match_name(a, b): -------- pandas.core.common.consensus_name_attr """ - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') + a_has = hasattr(a, "name") + b_has = hasattr(b, "name") if a_has and b_has: if a.name == b.name: return a.name @@ -137,6 +155,7 @@ def maybe_upcast_for_op(obj): # Reversed Operations not available in the stdlib operator module. # Defining these instead of using lambdas allows us to reference them by name. 
+ def radd(left, right): return right + left @@ -166,8 +185,9 @@ def rmod(left, right): # formatting operation; this is a TypeError # otherwise perform the op if isinstance(right, str): - raise TypeError("{typ} cannot perform the operation mod".format( - typ=type(left).__name__)) + raise TypeError( + "{typ} cannot perform the operation mod".format(typ=type(left).__name__) + ) return right % left @@ -194,6 +214,7 @@ def rxor(left, right): # ----------------------------------------------------------------------------- + def make_invalid_op(name): """ Return a binary method that always raises a TypeError. @@ -206,9 +227,12 @@ def make_invalid_op(name): ------- invalid_op : function """ + def invalid_op(self, other=None): - raise TypeError("cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__)) + raise TypeError( + "cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self).__name__) + ) invalid_op.__name__ = name return invalid_op @@ -239,18 +263,18 @@ def _gen_eval_kwargs(name): # Series appear to only pass __add__, __radd__, ... # but DataFrame gets both these dunder names _and_ non-dunder names # add, radd, ... - name = name.replace('__', '') + name = name.replace("__", "") - if name.startswith('r'): - if name not in ['radd', 'rand', 'ror', 'rxor']: + if name.startswith("r"): + if name not in ["radd", "rand", "ror", "rxor"]: # Exclude commutative operations - kwargs['reversed'] = True + kwargs["reversed"] = True - if name in ['truediv', 'rtruediv']: - kwargs['truediv'] = True + if name in ["truediv", "rtruediv"]: + kwargs["truediv"] = True - if name in ['ne']: - kwargs['masker'] = True + if name in ["ne"]: + kwargs["masker"] = True return kwargs @@ -269,11 +293,11 @@ def _gen_fill_zeros(name): ------- fill_value : {None, np.nan, np.inf} """ - name = name.strip('__') - if 'div' in name: + name = name.strip("__") + if "div" in name: # truediv, floordiv, div, and reversed variants fill_value = np.inf - elif 'mod' in name: + elif "mod" in name: # mod, rmod fill_value = np.nan else: @@ -295,15 +319,15 @@ def _get_frame_op_default_axis(name): ------- default_axis: str or None """ - if name.replace('__r', '__') in ['__and__', '__or__', '__xor__']: + if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: # bool methods - return 'columns' - elif name.startswith('__'): + return "columns" + elif name.startswith("__"): # __add__, __mul__, ... return None else: # add, mul, ... - return 'columns' + return "columns" def _get_opstr(op, cls): @@ -321,41 +345,43 @@ def _get_opstr(op, cls): op_str : string or None """ # numexpr is available for non-sparse classes - subtyp = getattr(cls, '_subtyp', '') - use_numexpr = 'sparse' not in subtyp + subtyp = getattr(cls, "_subtyp", "") + use_numexpr = "sparse" not in subtyp if not use_numexpr: # if we're not using numexpr, then don't pass a str_rep return None - return {operator.add: '+', - radd: '+', - operator.mul: '*', - rmul: '*', - operator.sub: '-', - rsub: '-', - operator.truediv: '/', - rtruediv: '/', - operator.floordiv: '//', - rfloordiv: '//', - operator.mod: None, # TODO: Why None for mod but '%' for rmod? 
- rmod: '%', - operator.pow: '**', - rpow: '**', - operator.eq: '==', - operator.ne: '!=', - operator.le: '<=', - operator.lt: '<', - operator.ge: '>=', - operator.gt: '>', - operator.and_: '&', - rand_: '&', - operator.or_: '|', - ror_: '|', - operator.xor: '^', - rxor: '^', - divmod: None, - rdivmod: None}[op] + return { + operator.add: "+", + radd: "+", + operator.mul: "*", + rmul: "*", + operator.sub: "-", + rsub: "-", + operator.truediv: "/", + rtruediv: "/", + operator.floordiv: "//", + rfloordiv: "//", + operator.mod: None, # TODO: Why None for mod but '%' for rmod? + rmod: "%", + operator.pow: "**", + rpow: "**", + operator.eq: "==", + operator.ne: "!=", + operator.le: "<=", + operator.lt: "<", + operator.ge: ">=", + operator.gt: ">", + operator.and_: "&", + rand_: "&", + operator.or_: "|", + ror_: "|", + operator.xor: "^", + rxor: "^", + divmod: None, + rdivmod: None, + }[op] def _get_op_name(op, special): @@ -372,9 +398,9 @@ def _get_op_name(op, special): ------- op_name : str """ - opname = op.__name__.strip('_') + opname = op.__name__.strip("_") if special: - opname = '__{opname}__'.format(opname=opname) + opname = "__{opname}__".format(opname=opname) return opname @@ -564,77 +590,89 @@ def _get_op_name(op, special): _op_descriptions = { # Arithmetic Operators - 'add': {'op': '+', - 'desc': 'Addition', - 'reverse': 'radd', - 'series_examples': _add_example_SERIES}, - 'sub': {'op': '-', - 'desc': 'Subtraction', - 'reverse': 'rsub', - 'series_examples': _sub_example_SERIES}, - 'mul': {'op': '*', - 'desc': 'Multiplication', - 'reverse': 'rmul', - 'series_examples': _mul_example_SERIES, - 'df_examples': None}, - 'mod': {'op': '%', - 'desc': 'Modulo', - 'reverse': 'rmod', - 'series_examples': _mod_example_SERIES}, - 'pow': {'op': '**', - 'desc': 'Exponential power', - 'reverse': 'rpow', - 'series_examples': _pow_example_SERIES, - 'df_examples': None}, - 'truediv': {'op': '/', - 'desc': 'Floating division', - 'reverse': 'rtruediv', - 'series_examples': _div_example_SERIES, - 'df_examples': None}, - 'floordiv': {'op': '//', - 'desc': 'Integer division', - 'reverse': 'rfloordiv', - 'series_examples': _floordiv_example_SERIES, - 'df_examples': None}, - 'divmod': {'op': 'divmod', - 'desc': 'Integer division and modulo', - 'reverse': 'rdivmod', - 'series_examples': None, - 'df_examples': None}, - + "add": { + "op": "+", + "desc": "Addition", + "reverse": "radd", + "series_examples": _add_example_SERIES, + }, + "sub": { + "op": "-", + "desc": "Subtraction", + "reverse": "rsub", + "series_examples": _sub_example_SERIES, + }, + "mul": { + "op": "*", + "desc": "Multiplication", + "reverse": "rmul", + "series_examples": _mul_example_SERIES, + "df_examples": None, + }, + "mod": { + "op": "%", + "desc": "Modulo", + "reverse": "rmod", + "series_examples": _mod_example_SERIES, + }, + "pow": { + "op": "**", + "desc": "Exponential power", + "reverse": "rpow", + "series_examples": _pow_example_SERIES, + "df_examples": None, + }, + "truediv": { + "op": "/", + "desc": "Floating division", + "reverse": "rtruediv", + "series_examples": _div_example_SERIES, + "df_examples": None, + }, + "floordiv": { + "op": "//", + "desc": "Integer division", + "reverse": "rfloordiv", + "series_examples": _floordiv_example_SERIES, + "df_examples": None, + }, + "divmod": { + "op": "divmod", + "desc": "Integer division and modulo", + "reverse": "rdivmod", + "series_examples": None, + "df_examples": None, + }, # Comparison Operators - 'eq': {'op': '==', - 'desc': 'Equal to', - 'reverse': None, - 'series_examples': None}, - 
'ne': {'op': '!=', - 'desc': 'Not equal to', - 'reverse': None, - 'series_examples': None}, - 'lt': {'op': '<', - 'desc': 'Less than', - 'reverse': None, - 'series_examples': None}, - 'le': {'op': '<=', - 'desc': 'Less than or equal to', - 'reverse': None, - 'series_examples': None}, - 'gt': {'op': '>', - 'desc': 'Greater than', - 'reverse': None, - 'series_examples': None}, - 'ge': {'op': '>=', - 'desc': 'Greater than or equal to', - 'reverse': None, - 'series_examples': None} + "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None}, + "ne": { + "op": "!=", + "desc": "Not equal to", + "reverse": None, + "series_examples": None, + }, + "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None}, + "le": { + "op": "<=", + "desc": "Less than or equal to", + "reverse": None, + "series_examples": None, + }, + "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None}, + "ge": { + "op": ">=", + "desc": "Greater than or equal to", + "reverse": None, + "series_examples": None, + }, } # type: Dict[str, Dict[str, Optional[str]]] _op_names = list(_op_descriptions.keys()) for key in _op_names: - reverse_op = _op_descriptions[key]['reverse'] + reverse_op = _op_descriptions[key]["reverse"] if reverse_op is not None: _op_descriptions[reverse_op] = _op_descriptions[key].copy() - _op_descriptions[reverse_op]['reverse'] = key + _op_descriptions[reverse_op]["reverse"] = key _flex_doc_SERIES = """ Return {desc} of series and other, element-wise (binary operator `{op_name}`). @@ -1007,42 +1045,43 @@ def _make_flex_doc(op_name, typ): ------- doc : str """ - op_name = op_name.replace('__', '') + op_name = op_name.replace("__", "") op_desc = _op_descriptions[op_name] - if op_name.startswith('r'): - equiv = 'other ' + op_desc['op'] + ' ' + typ + if op_name.startswith("r"): + equiv = "other " + op_desc["op"] + " " + typ else: - equiv = typ + ' ' + op_desc['op'] + ' other' + equiv = typ + " " + op_desc["op"] + " other" - if typ == 'series': + if typ == "series": base_doc = _flex_doc_SERIES doc_no_examples = base_doc.format( - desc=op_desc['desc'], + desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc['reverse'] + reverse=op_desc["reverse"], ) - if op_desc['series_examples']: - doc = doc_no_examples + op_desc['series_examples'] + if op_desc["series_examples"]: + doc = doc_no_examples + op_desc["series_examples"] else: doc = doc_no_examples - elif typ == 'dataframe': + elif typ == "dataframe": base_doc = _flex_doc_FRAME doc = base_doc.format( - desc=op_desc['desc'], + desc=op_desc["desc"], op_name=op_name, equiv=equiv, - reverse=op_desc['reverse'] + reverse=op_desc["reverse"], ) else: - raise AssertionError('Invalid typ argument.') + raise AssertionError("Invalid typ argument.") return doc # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support + def fill_binop(left, right, fill_value): """ If a non-None fill_value is given, replace null entries in left and right @@ -1097,8 +1136,7 @@ def mask_cmp_op(x, y, op): if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notna(xrav) & notna(yrav) - result[mask] = op(np.array(list(xrav[mask])), - np.array(list(yrav[mask]))) + result[mask] = op(np.array(list(xrav[mask])), np.array(list(yrav[mask]))) else: mask = notna(xrav) result[mask] = op(np.array(list(xrav[mask])), y) @@ -1140,12 +1178,11 @@ def masked_arith_op(x, y, op): # Without specifically raising here we get mismatched # 
errors in Py3 (TypeError) vs Py2 (ValueError) # Note: Only = an issue in DataFrame case - raise ValueError('Cannot broadcast operands together.') + raise ValueError("Cannot broadcast operands together.") if mask.any(): - with np.errstate(all='ignore'): - result[mask] = op(xrav[mask], - com.values_from_object(yrav[mask])) + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], com.values_from_object(yrav[mask])) else: assert is_scalar(y), type(y) @@ -1161,7 +1198,7 @@ def masked_arith_op(x, y, op): mask = np.where(y == 1, False, mask) if mask.any(): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result[mask] = op(xrav[mask], y) result, changed = maybe_upcast_putmask(result, ~mask, np.nan) @@ -1193,14 +1230,18 @@ def invalid_comparison(left, right, op): elif op is operator.ne: res_values = np.ones(left.shape, dtype=bool) else: - raise TypeError("Invalid comparison between dtype={dtype} and {typ}" - .format(dtype=left.dtype, typ=type(right).__name__)) + raise TypeError( + "Invalid comparison between dtype={dtype} and {typ}".format( + dtype=left.dtype, typ=type(right).__name__ + ) + ) return res_values # ----------------------------------------------------------------------------- # Dispatch logic + def should_series_dispatch(left, right, op): """ Identify cases where a DataFrame operation should dispatch to its @@ -1226,8 +1267,9 @@ def should_series_dispatch(left, right, op): ldtype = left.dtypes.iloc[0] rdtype = right.dtypes.iloc[0] - if ((is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or - (is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype))): + if (is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or ( + is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype) + ): # numpy integer dtypes as timedelta64 dtypes in this scenario return True @@ -1263,15 +1305,13 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): if lib.is_scalar(right) or np.ndim(right) == 0: def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[:, i]) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": # We only get here if called via left._combine_match_columns, @@ -1279,15 +1319,13 @@ def column_op(a, b): assert right.index.equals(left.columns) def column_op(a, b): - return {i: func(a.iloc[:, i], b.iloc[i]) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries): assert right.index.equals(left.index) # Handle other cases later def column_op(a, b): - return {i: func(a.iloc[:, i], b) - for i in range(len(a.columns))} + return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} else: # Remaining cases have less-obvious dispatch rules @@ -1324,15 +1362,17 @@ def dispatch_to_index_op(op, left, right, index_class): # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes, # left_idx may inherit a freq from a cached DatetimeIndex. # See discussion in GH#19147. 
- if getattr(left_idx, 'freq', None) is not None: + if getattr(left_idx, "freq", None) is not None: left_idx = left_idx._shallow_copy(freq=None) try: result = op(left_idx, right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. - raise TypeError('incompatible type for a datetime/timedelta ' - 'operation [{name}]'.format(name=op.__name__)) + raise TypeError( + "incompatible type for a datetime/timedelta " + "operation [{name}]".format(name=op.__name__) + ) return result @@ -1359,9 +1399,8 @@ def dispatch_to_extension_op(op, left, right): res_values = op(new_left, new_right) res_name = get_op_result_name(left, right) - if op.__name__ in ['divmod', 'rdivmod']: - return _construct_divmod_result( - left, res_values, left.index, res_name) + if op.__name__ in ["divmod", "rdivmod"]: + return _construct_divmod_result(left, res_values, left.index, res_name) return _construct_result(left, res_values, left.index, res_name) @@ -1370,6 +1409,7 @@ def dispatch_to_extension_op(op, left, right): # Functions that add arithmetic methods to objects, given arithmetic factory # methods + def _get_method_wrappers(cls): """ Find the appropriate operation-wrappers to use when defining flex/special @@ -1451,33 +1491,39 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): rpow=arith_method(cls, rpow, special), rmod=arith_method(cls, rmod, special)) # yapf: enable - new_methods['div'] = new_methods['truediv'] - new_methods['rdiv'] = new_methods['rtruediv'] + new_methods["div"] = new_methods["truediv"] + new_methods["rdiv"] = new_methods["rtruediv"] if have_divmod: # divmod doesn't have an op that is supported by numexpr - new_methods['divmod'] = arith_method(cls, divmod, special) - new_methods['rdivmod'] = arith_method(cls, rdivmod, special) + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) - new_methods.update(dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special))) + new_methods.update( + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) + ) if bool_method: new_methods.update( - dict(and_=bool_method(cls, operator.and_, special), - or_=bool_method(cls, operator.or_, special), - # For some reason ``^`` wasn't used in original. - xor=bool_method(cls, operator.xor, special), - rand_=bool_method(cls, rand_, special), - ror_=bool_method(cls, ror_, special), - rxor=bool_method(cls, rxor, special))) + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + # For some reason ``^`` wasn't used in original. 
+ xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) if special: - dunderize = lambda x: '__{name}__'.format(name=x.strip('_')) + dunderize = lambda x: "__{name}__".format(name=x.strip("_")) else: dunderize = lambda x: x new_methods = {dunderize(k): v for k, v in new_methods.items()} @@ -1490,8 +1536,7 @@ def add_methods(cls, new_methods): # of the same name, it is OK to over-write it. The exception is # inplace methods (__iadd__, __isub__, ...) for SparseArray, which # retain the np.ndarray versions. - force = not (issubclass(cls, ABCSparseArray) and - name.startswith('__i')) + force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) if force or name not in cls.__dict__: setattr(cls, name, method) @@ -1509,8 +1554,9 @@ def add_special_arithmetic_methods(cls): special methods will be defined and pinned to this class """ _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) - new_methods = _create_methods(cls, arith_method, comp_method, bool_method, - special=True) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) # inplace operators (I feel like these should get passed an `inplace=True` # or just be removed @@ -1524,8 +1570,9 @@ def f(self, other): # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy - self._update_inplace(result.reindex_like(self, copy=False)._data, - verify_is_copy=False) + self._update_inplace( + result.reindex_like(self, copy=False)._data, verify_is_copy=False + ) return self @@ -1533,18 +1580,24 @@ def f(self, other): return f new_methods.update( - dict(__iadd__=_wrap_inplace_method(new_methods["__add__"]), - __isub__=_wrap_inplace_method(new_methods["__sub__"]), - __imul__=_wrap_inplace_method(new_methods["__mul__"]), - __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), - __imod__=_wrap_inplace_method(new_methods["__mod__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]))) + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) new_methods.update( - dict(__iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]))) + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) add_methods(cls, new_methods=new_methods) @@ -1560,14 +1613,18 @@ def add_flex_arithmetic_methods(cls): flex methods will be defined and pinned to this class """ flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) - new_methods = _create_methods(cls, flex_arith_method, - flex_comp_method, bool_method=None, - special=False) - new_methods.update(dict(multiply=new_methods['mul'], - subtract=new_methods['sub'], - divide=new_methods['div'])) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, 
special=False + ) + new_methods.update( + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) + ) # opt out of bool flex methods for now - assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_')) + assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) add_methods(cls, new_methods=new_methods) @@ -1575,6 +1632,7 @@ def add_flex_arithmetic_methods(cls): # ----------------------------------------------------------------------------- # Series + def _align_method_SERIES(left, right, align_asobject=False): """ align lhs and rhs Series """ @@ -1612,10 +1670,8 @@ def _construct_divmod_result(left, result, index, name, dtype=None): """divmod returns a tuple of like indexed series instead of a single series. """ return ( - _construct_result(left, result[0], index=index, name=name, - dtype=dtype), - _construct_result(left, result[1], index=index, name=name, - dtype=dtype), + _construct_result(left, result[0], index=index, name=name, dtype=dtype), + _construct_result(left, result[1], index=index, name=name, dtype=dtype), ) @@ -1628,8 +1684,9 @@ def _arith_method_SERIES(cls, op, special): op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) fill_zeros = _gen_fill_zeros(op_name) - construct_result = (_construct_divmod_result - if op in [divmod, rdivmod] else _construct_result) + construct_result = ( + _construct_divmod_result if op in [divmod, rdivmod] else _construct_result + ) def na_op(x, y): """ @@ -1651,21 +1708,20 @@ def na_op(x, y): TypeError : invalid operation """ import pandas.core.computation.expressions as expressions + try: result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) except TypeError: result = masked_arith_op(x, y, op) except Exception: # TODO: more specific? if is_object_dtype(x): - return libalgos.arrmap_object(x, - lambda val: op(val, y)) + return libalgos.arrmap_object(x, lambda val: op(val, y)) raise if isinstance(result, tuple): # e.g. divmod result = tuple( - missing.fill_zeros(r, x, y, op_name, fill_zeros) - for r in result + missing.fill_zeros(r, x, y, op_name, fill_zeros) for r in result ) else: result = missing.fill_zeros(result, x, y, op_name, fill_zeros) @@ -1680,27 +1736,29 @@ def wrapper(left, right): right = maybe_upcast_for_op(right) if is_categorical_dtype(left): - raise TypeError("{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep)) + raise TypeError( + "{typ} cannot perform the operation " + "{op}".format(typ=type(left).__name__, op=str_rep) + ) elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): # Give dispatch_to_index_op a chance for tests like # test_dt64_series_add_intlike, which the index dispatching handles # specifically. result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) - return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + return construct_result( + left, result, index=left.index, name=res_name, dtype=result.dtype + ) - elif (is_extension_array_dtype(left) or - (is_extension_array_dtype(right) and not is_scalar(right))): + elif is_extension_array_dtype(left) or ( + is_extension_array_dtype(right) and not is_scalar(right) + ): # GH#22378 disallow scalar to exclude e.g. 
"category", "Int64" return dispatch_to_extension_op(op, left, right) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) - return construct_result(left, result, - index=left.index, name=res_name) + return construct_result(left, result, index=left.index, name=res_name) elif is_timedelta64_dtype(right): # We should only get here with non-scalar or timedelta64('NaT') @@ -1709,19 +1767,20 @@ def wrapper(left, right): # that may incorrectly raise TypeError when we # should get NullFrequencyError result = op(pd.Index(left), right) - return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + return construct_result( + left, result, index=left.index, name=res_name, dtype=result.dtype + ) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = rvalues.values - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = na_op(lvalues, rvalues) - return construct_result(left, result, - index=left.index, name=res_name, dtype=None) + return construct_result( + left, result, index=left.index, name=res_name, dtype=None + ) wrapper.__name__ = op_name return wrapper @@ -1749,7 +1808,7 @@ def _comp_method_SERIES(cls, op, special): code duplication. """ op_name = _get_op_name(op, special) - masker = _gen_eval_kwargs(op_name).get('masker', False) + masker = _gen_eval_kwargs(op_name).get("masker", False) def na_op(x, y): # TODO: @@ -1779,12 +1838,12 @@ def na_op(x, y): mask = None if not is_scalar(y) and needs_i8_conversion(y): mask = isna(x) | isna(y) - y = y.view('i8') - x = x.view('i8') + y = y.view("i8") + x = x.view("i8") method = getattr(x, op_name, None) if method is not None: - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = method(y) if result is NotImplemented: return invalid_comparison(x, y, op) @@ -1812,21 +1871,20 @@ def wrapper(self, other, axis=None): return NotImplemented elif isinstance(other, ABCSeries) and not self._indexed_same(other): - raise ValueError("Can only compare identically-labeled " - "Series objects") + raise ValueError("Can only compare identically-labeled " "Series objects") elif is_categorical_dtype(self): # Dispatch to Categorical implementation; pd.CategoricalIndex # behavior is non-canonical GH#19513 res_values = dispatch_to_index_op(op, self, other, pd.Categorical) - return self._constructor(res_values, index=self.index, - name=res_name) + return self._constructor(res_values, index=self.index, name=res_name) elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior - if (isinstance(other, datetime.date) and - not isinstance(other, datetime.datetime)): + if isinstance(other, datetime.date) and not isinstance( + other, datetime.datetime + ): # https://github.com/pandas-dev/pandas/issues/21152 # Compatibility for difference between Series comparison w/ # datetime and date @@ -1844,27 +1902,23 @@ def wrapper(self, other, axis=None): future = "a TypeError will be raised" else: future = ( - "'the values will not compare equal to the " - "'datetime.date'" + "'the values will not compare equal to the " "'datetime.date'" ) - msg = '\n'.join(textwrap.wrap(msg.format(future=future))) + msg = "\n".join(textwrap.wrap(msg.format(future=future))) warnings.warn(msg, FutureWarning, stacklevel=2) other = pd.Timestamp(other) - res_values = dispatch_to_index_op(op, self, other, - pd.DatetimeIndex) + res_values = dispatch_to_index_op(op, self, other, 
pd.DatetimeIndex) - return self._constructor(res_values, index=self.index, - name=res_name) + return self._constructor(res_values, index=self.index, name=res_name) elif is_timedelta64_dtype(self): - res_values = dispatch_to_index_op(op, self, other, - pd.TimedeltaIndex) - return self._constructor(res_values, index=self.index, - name=res_name) + res_values = dispatch_to_index_op(op, self, other, pd.TimedeltaIndex) + return self._constructor(res_values, index=self.index, name=res_name) - elif (is_extension_array_dtype(self) or - (is_extension_array_dtype(other) and not is_scalar(other))): + elif is_extension_array_dtype(self) or ( + is_extension_array_dtype(other) and not is_scalar(other) + ): # Note: the `not is_scalar(other)` condition rules out # e.g. other == "category" return dispatch_to_extension_op(op, self, other) @@ -1874,14 +1928,15 @@ def wrapper(self, other, axis=None): res_values = na_op(self.values, other.values) # rename is needed in case res_name is None and res_values.name # is not. - return self._constructor(res_values, index=self.index, - name=res_name).rename(res_name) + return self._constructor( + res_values, index=self.index, name=res_name + ).rename(res_name) elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast if other.ndim != 0 and len(self) != len(other): - raise ValueError('Lengths must match to compare') + raise ValueError("Lengths must match to compare") res_values = na_op(self.values, np.asarray(other)) result = self._constructor(res_values, index=self.index) @@ -1895,22 +1950,25 @@ def wrapper(self, other, axis=None): res_values = np.ones(len(self), dtype=bool) else: res_values = np.zeros(len(self), dtype=bool) - return self._constructor(res_values, index=self.index, - name=res_name, dtype='bool') + return self._constructor( + res_values, index=self.index, name=res_name, dtype="bool" + ) else: values = self.to_numpy() - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): res = na_op(values, other) if is_scalar(res): - raise TypeError('Could not compare {typ} type with Series' - .format(typ=type(other))) + raise TypeError( + "Could not compare {typ} type with Series".format(typ=type(other)) + ) # always return a full value series here res_values = com.values_from_object(res) - return self._constructor(res_values, index=self.index, - name=res_name, dtype='bool') + return self._constructor( + res_values, index=self.index, name=res_name, dtype="bool" + ) wrapper.__name__ = op_name return wrapper @@ -1941,12 +1999,19 @@ def na_op(x, y): y = bool(y) try: result = libops.scalar_binop(x, y, op) - except (TypeError, ValueError, AttributeError, - OverflowError, NotImplementedError): - raise TypeError("cannot compare a dtyped [{dtype}] array " - "with a scalar of type [{typ}]" - .format(dtype=x.dtype, - typ=type(y).__name__)) + except ( + TypeError, + ValueError, + AttributeError, + OverflowError, + NotImplementedError, + ): + raise TypeError( + "cannot compare a dtyped [{dtype}] array " + "with a scalar of type [{typ}]".format( + dtype=x.dtype, typ=type(y).__name__ + ) + ) return result @@ -1984,11 +2049,9 @@ def wrapper(self, other): # For int vs int `^`, `|`, `&` are bitwise operators and return # integer dtypes. 
Otherwise these are boolean ops - filler = (fill_int if is_self_int_dtype and is_other_int_dtype - else fill_bool) + filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_op(self.values, ovalues) - unfilled = self._constructor(res_values, - index=self.index, name=res_name) + unfilled = self._constructor(res_values, index=self.index, name=res_name) filled = filler(unfilled) return finalizer(filled) @@ -1998,7 +2061,7 @@ def wrapper(self, other): def _flex_method_SERIES(cls, op, special): name = _get_op_name(op, special) - doc = _make_flex_doc(name, 'series') + doc = _make_flex_doc(name, "series") @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -2009,15 +2072,14 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): return self._binop(other, op, level=level, fill_value=fill_value) elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): - raise ValueError('Lengths must be equal') + raise ValueError("Lengths must be equal") other = self._constructor(other, self.index) return self._binop(other, op, level=level, fill_value=fill_value) else: if fill_value is not None: self = self.fillna(fill_value) - return self._constructor(op(self, other), - self.index).__finalize__(self) + return self._constructor(op(self, other), self.index).__finalize__(self) flex_wrapper.__name__ = name return flex_wrapper @@ -2027,8 +2089,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _combine_series_frame(self, other, func, fill_value=None, axis=None, - level=None): +def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=None): """ Apply binary operator `func` to self, other using alignment and fill conventions determined by the fill_value, axis, and level kwargs. @@ -2047,8 +2108,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, result : DataFrame """ if fill_value is not None: - raise NotImplementedError("fill_value {fill} not supported." 
- .format(fill=fill_value)) + raise NotImplementedError( + "fill_value {fill} not supported.".format(fill=fill_value) + ) if axis is not None: axis = self._get_axis_number(axis) @@ -2062,8 +2124,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, if not len(self): # Ambiguous case, use _series so works with DataFrame - return self._constructor(data=self._series, index=self.index, - columns=self.columns) + return self._constructor( + data=self._series, index=self.index, columns=self.columns + ) # default axis is columns return self._combine_match_columns(other, func, level=level) @@ -2073,17 +2136,20 @@ def _align_method_FRAME(left, right, axis): """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ def to_series(right): - msg = ('Unable to coerce to Series, length must be {req_len}: ' - 'given {given_len}') - if axis is not None and left._get_axis_name(axis) == 'index': + msg = ( + "Unable to coerce to Series, length must be {req_len}: " "given {given_len}" + ) + if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): - raise ValueError(msg.format(req_len=len(left.index), - given_len=len(right))) + raise ValueError( + msg.format(req_len=len(left.index), given_len=len(right)) + ) right = left._constructor_sliced(right, index=left.index) else: if len(left.columns) != len(right): - raise ValueError(msg.format(req_len=len(left.columns), - given_len=len(right))) + raise ValueError( + msg.format(req_len=len(left.columns), given_len=len(right)) + ) right = left._constructor_sliced(right, index=left.columns) return right @@ -2094,32 +2160,32 @@ def to_series(right): elif right.ndim == 2: if right.shape == left.shape: - right = left._constructor(right, index=left.index, - columns=left.columns) + right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) - right = left._constructor(right, - index=left.index, - columns=left.columns) + right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows right = to_series(right[0, :]) else: - raise ValueError("Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}" - .format(req_shape=left.shape, - given_shape=right.shape)) + raise ValueError( + "Unable to coerce to DataFrame, shape " + "must be {req_shape}: given {given_shape}".format( + req_shape=left.shape, given_shape=right.shape + ) + ) elif right.ndim > 2: - raise ValueError('Unable to coerce to Series/DataFrame, dim ' - 'must be <= 2: {dim}'.format(dim=right.shape)) + raise ValueError( + "Unable to coerce to Series/DataFrame, dim " + "must be <= 2: {dim}".format(dim=right.shape) + ) - elif (is_list_like(right) and - not isinstance(right, (ABCSeries, ABCDataFrame))): + elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): # GH17901 right = to_series(right) @@ -2146,7 +2212,7 @@ def na_op(x, y): if op_name in _op_descriptions: # i.e. include "add" but not "__add__" - doc = _make_flex_doc(op_name, 'dataframe') + doc = _make_flex_doc(op_name, "dataframe") else: doc = _arith_doc_FRAME % op_name @@ -2163,9 +2229,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. 
pass_op = op if axis in [0, "columns", None] else na_op - return _combine_series_frame(self, other, pass_op, - fill_value=fill_value, axis=axis, - level=level) + return _combine_series_frame( + self, other, pass_op, fill_value=fill_value, axis=axis, level=level + ) else: if fill_value is not None: self = self.fillna(fill_value) @@ -2185,14 +2251,15 @@ def _flex_comp_method_FRAME(cls, op, special): def na_op(x, y): try: - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = op(x, y) except TypeError: result = mask_cmp_op(x, y, op) return result - doc = _flex_comp_doc_FRAME.format(op_name=op_name, - desc=_op_descriptions[op_name]['desc']) + doc = _flex_comp_doc_FRAME.format( + op_name=op_name, desc=_op_descriptions[op_name]["desc"] + ) @Appender(doc) def f(self, other, axis=default_axis, level=None): @@ -2202,14 +2269,13 @@ def f(self, other, axis=default_axis, level=None): if isinstance(other, ABCDataFrame): # Another DataFrame if not self._indexed_same(other): - self, other = self.align(other, 'outer', - level=level, copy=False) + self, other = self.align(other, "outer", level=level, copy=False) return dispatch_to_series(self, other, na_op, str_rep) elif isinstance(other, ABCSeries): - return _combine_series_frame(self, other, na_op, - fill_value=None, axis=axis, - level=level) + return _combine_series_frame( + self, other, na_op, fill_value=None, axis=axis, level=level + ) else: assert np.ndim(other) == 0, other return self._combine_const(other, na_op) @@ -2223,7 +2289,7 @@ def _comp_method_FRAME(cls, func, special): str_rep = _get_opstr(func, cls) op_name = _get_op_name(func, special) - @Appender('Wrapper for comparison method {name}'.format(name=op_name)) + @Appender("Wrapper for comparison method {name}".format(name=op_name)) def f(self, other): other = _align_method_FRAME(self, other, axis=None) @@ -2231,14 +2297,15 @@ def f(self, other): if isinstance(other, ABCDataFrame): # Another DataFrame if not self._indexed_same(other): - raise ValueError('Can only compare identically-labeled ' - 'DataFrame objects') + raise ValueError( + "Can only compare identically-labeled " "DataFrame objects" + ) return dispatch_to_series(self, other, func, str_rep) elif isinstance(other, ABCSeries): - return _combine_series_frame(self, other, func, - fill_value=None, axis=None, - level=None) + return _combine_series_frame( + self, other, func, fill_value=None, axis=None, level=None + ) else: # straight boolean comparisons we want to allow all columns @@ -2254,6 +2321,7 @@ def f(self, other): # ----------------------------------------------------------------------------- # Sparse + def _cast_sparse_series_op(left, right, opname): """ For SparseSeries operation, coerce to float64 if the result is expected @@ -2272,15 +2340,15 @@ def _cast_sparse_series_op(left, right, opname): """ from pandas.core.sparse.api import SparseDtype - opname = opname.strip('_') + opname = opname.strip("_") # TODO: This should be moved to the array? 
if is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf - if opname in ('floordiv', 'mod') and (right.to_dense() == 0).any(): + if opname in ("floordiv", "mod") and (right.to_dense() == 0).any(): left = left.astype(SparseDtype(np.float64, left.fill_value)) right = right.astype(SparseDtype(np.float64, right.fill_value)) - elif opname in ('rfloordiv', 'rmod') and (left.to_dense() == 0).any(): + elif opname in ("rfloordiv", "rmod") and (left.to_dense() == 0).any(): left = left.astype(SparseDtype(np.float64, left.fill_value)) right = right.astype(SparseDtype(np.float64, right.fill_value)) @@ -2302,25 +2370,25 @@ def wrapper(self, other): other = other.to_sparse(fill_value=self.fill_value) return _sparse_series_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values = op(self.values, other) - return self._constructor(new_values, - index=self.index, - name=self.name) + return self._constructor(new_values, index=self.index, name=self.name) else: # pragma: no cover - raise TypeError('operation with {other} not supported' - .format(other=type(other))) + raise TypeError( + "operation with {other} not supported".format(other=type(other)) + ) wrapper.__name__ = op_name return wrapper def _sparse_series_op(left, right, op, name): - left, right = left.align(right, join='outer', copy=False) + left, right = left.align(right, join="outer", copy=False) new_index = left.index new_name = get_op_result_name(left, right) from pandas.core.arrays.sparse import _sparse_array_op + lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name) result = _sparse_array_op(lvalues, rvalues, op, name) return left._constructor(result, index=new_index, name=new_name) @@ -2335,36 +2403,40 @@ def _arith_method_SPARSE_ARRAY(cls, op, special): def wrapper(self, other): from pandas.core.arrays.sparse.array import ( - SparseArray, _sparse_array_op, _wrap_result, _get_fill) + SparseArray, + _sparse_array_op, + _wrap_result, + _get_fill, + ) + if isinstance(other, np.ndarray): if len(self) != len(other): - raise AssertionError("length mismatch: {self} vs. {other}" - .format(self=len(self), other=len(other))) + raise AssertionError( + "length mismatch: {self} vs. {other}".format( + self=len(self), other=len(other) + ) + ) if not isinstance(other, SparseArray): - dtype = getattr(other, 'dtype', None) - other = SparseArray(other, fill_value=self.fill_value, - dtype=dtype) + dtype = getattr(other, "dtype", None) + other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) return _sparse_array_op(self, other, op, op_name) elif is_scalar(other): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): fill = op(_get_fill(self), np.asarray(other)) result = op(self.sp_values, other) return _wrap_result(op_name, result, self.sp_index, fill) else: # pragma: no cover - raise TypeError('operation with {other} not supported' - .format(other=type(other))) + raise TypeError( + "operation with {other} not supported".format(other=type(other)) + ) wrapper.__name__ = op_name return wrapper def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, - ufunc: Callable, - method: str, - *inputs: ArrayLike, - **kwargs: Any + self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any ): """ Dispatch a ufunc to the equivalent dunder method. 
@@ -2387,33 +2459,48 @@ def maybe_dispatch_ufunc_to_dunder_op( The result of applying the ufunc """ # special has the ufuncs we dispatch to the dunder op on - special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv', - 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder', - 'matmul'} + special = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + } aliases = { - 'subtract': 'sub', - 'multiply': 'mul', - 'floor_divide': 'floordiv', - 'true_divide': 'truediv', - 'power': 'pow', - 'remainder': 'mod', - 'divide': 'div', - 'equal': 'eq', - 'not_equal': 'ne', - 'less': 'lt', - 'less_equal': 'le', - 'greater': 'gt', - 'greater_equal': 'ge', + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", } # For op(., Array) -> Array.__r{op}__ flipped = { - 'lt': '__gt__', - 'le': '__ge__', - 'gt': '__lt__', - 'ge': '__le__', - 'eq': '__eq__', - 'ne': '__ne__', + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": "__eq__", + "ne": "__ne__", } op_name = ufunc.__name__ @@ -2422,13 +2509,12 @@ def maybe_dispatch_ufunc_to_dunder_op( def not_implemented(*args, **kwargs): return NotImplemented - if (method == '__call__' and op_name in special - and kwargs.get('out') is None): + if method == "__call__" and op_name in special and kwargs.get("out") is None: if isinstance(inputs[0], type(self)): - name = '__{}__'.format(op_name) + name = "__{}__".format(op_name) return getattr(self, name, not_implemented)(inputs[1]) else: - name = flipped.get(op_name, '__r{}__'.format(op_name)) + name = flipped.get(op_name, "__r{}__".format(op_name)) return getattr(self, name, not_implemented)(inputs[0]) else: return NotImplemented diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 632b5a9c5e002..b4a3e6ed71bf4 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -21,8 +21,7 @@ from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import ( - GroupBy, _GroupBy, _pipe_template, groupby) +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range @@ -60,8 +59,16 @@ class Resampler(_GroupBy): """ # to the groupby descriptor - _attributes = ['freq', 'axis', 'closed', 'label', 'convention', - 'loffset', 'base', 'kind'] + _attributes = [ + "freq", + "axis", + "closed", + "label", + "convention", + "loffset", + "base", + "kind", + ] def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby = groupby @@ -83,11 +90,14 @@ def __str__(self): """ Provide a nice str repr of our rolling object. 
""" - attrs = ("{k}={v}".format(k=k, v=getattr(self.groupby, k)) - for k in self._attributes if - getattr(self.groupby, k, None) is not None) - return "{klass} [{attrs}]".format(klass=self.__class__.__name__, - attrs=', '.join(attrs)) + attrs = ( + "{k}={v}".format(k=k, v=getattr(self.groupby, k)) + for k in self._attributes + if getattr(self.groupby, k, None) is not None + ) + return "{klass} [{attrs}]".format( + klass=self.__class__.__name__, attrs=", ".join(attrs) + ) def __getattr__(self, attr): if attr in self._internal_names_set: @@ -129,8 +139,8 @@ def _typ(self): Masquerade for compat as a Series or a DataFrame. """ if isinstance(self._selected_obj, pd.Series): - return 'series' - return 'dataframe' + return "series" + return "dataframe" @property def _from_selection(self): @@ -139,9 +149,9 @@ def _from_selection(self): """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error - return (self.groupby is not None and - (self.groupby.key is not None or - self.groupby.level is not None)) + return self.groupby is not None and ( + self.groupby.key is not None or self.groupby.level is not None + ) def _convert_obj(self, obj): """ @@ -186,9 +196,10 @@ def _assure_grouper(self): """ self._set_binner() - @Substitution(klass='Resampler', - versionadded='.. versionadded:: 0.23.0', - examples=""" + @Substitution( + klass="Resampler", + versionadded=".. versionadded:: 0.23.0", + examples=""" >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, ... index=pd.date_range('2012-08-02', periods=4)) >>> df @@ -204,20 +215,24 @@ def _assure_grouper(self): >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) A 2012-08-02 1 - 2012-08-04 1""") + 2012-08-04 1""", + ) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super().pipe(func, *args, **kwargs) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- DataFrame.groupby.aggregate DataFrame.resample.transform DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1,2,3,4,5], @@ -251,14 +266,17 @@ def pipe(self, func, *args, **kwargs): 2013-01-01 00:00:00 3 2.121320 2013-01-01 00:00:02 7 4.949747 2013-01-01 00:00:04 5 NaN - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): self._set_binner() @@ -266,10 +284,7 @@ def aggregate(self, func, *args, **kwargs): if result is None: how = func grouper = None - result = self._groupby_and_aggregate(how, - grouper, - *args, - **kwargs) + result = self._groupby_and_aggregate(how, grouper, *args, **kwargs) result = self._apply_loffset(result) return result @@ -295,8 +310,7 @@ def transform(self, arg, *args, **kwargs): -------- >>> resampled.transform(lambda x: (x - x.mean()) / x.std()) """ - return self._selected_obj.groupby(self.groupby).transform( - arg, *args, **kwargs) + return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs) def _downsample(self, f): raise AbstractMethodError(self) @@ -370,10 +384,9 @@ def _apply_loffset(self, result): """ needs_offset = ( - isinstance(self.loffset, (DateOffset, timedelta, - np.timedelta64)) and - isinstance(result.index, 
DatetimeIndex) and - len(result.index) > 0 + isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64)) + and isinstance(result.index, DatetimeIndex) + and len(result.index) > 0 ) if needs_offset: @@ -401,7 +414,7 @@ def _wrap_result(self, result): result.index = obj.index.asfreq(self.freq) else: result.index = obj.index._shallow_copy(freq=self.freq) - result.name = getattr(obj, 'name', None) + result.name = getattr(obj, "name", None) return result @@ -423,7 +436,8 @@ def pad(self, limit=None): Series.fillna DataFrame.fillna """ - return self._upsample('pad', limit=limit) + return self._upsample("pad", limit=limit) + ffill = pad def nearest(self, limit=None): @@ -486,7 +500,7 @@ def nearest(self, limit=None): 2018-01-01 01:00:00 2.0 Freq: 15T, dtype: float64 """ - return self._upsample('nearest', limit=limit) + return self._upsample("nearest", limit=limit) def backfill(self, limit=None): """ @@ -589,7 +603,8 @@ def backfill(self, limit=None): 2018-01-01 01:45:00 6.0 5.0 2018-01-01 02:00:00 6.0 5.0 """ - return self._upsample('backfill', limit=limit) + return self._upsample("backfill", limit=limit) + bfill = backfill def fillna(self, method, limit=None): @@ -752,21 +767,34 @@ def fillna(self, method, limit=None): """ return self._upsample(method, limit=limit) - @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs) - def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', limit_area=None, - downcast=None, **kwargs): + @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + limit_area=None, + downcast=None, + **kwargs + ): """ Interpolate values according to different methods. .. versionadded:: 0.18.1 """ result = self._upsample(None) - return result.interpolate(method=method, axis=axis, limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, **kwargs) + return result.interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs + ) def asfreq(self, fill_value=None): """ @@ -790,7 +818,7 @@ def asfreq(self, fill_value=None): Series.asfreq DataFrame.asfreq """ - return self._upsample('asfreq', fill_value=fill_value) + return self._upsample("asfreq", fill_value=fill_value) def std(self, ddof=1, *args, **kwargs): """ @@ -806,8 +834,8 @@ def std(self, ddof=1, *args, **kwargs): DataFrame or Series Standard deviation of values within each group. """ - nv.validate_resampler_func('std', args, kwargs) - return self._downsample('std', ddof=ddof) + nv.validate_resampler_func("std", args, kwargs) + return self._downsample("std", ddof=ddof) def var(self, ddof=1, *args, **kwargs): """ @@ -823,16 +851,16 @@ def var(self, ddof=1, *args, **kwargs): DataFrame or Series Variance of values within each group. """ - nv.validate_resampler_func('var', args, kwargs) - return self._downsample('var', ddof=ddof) + nv.validate_resampler_func("var", args, kwargs) + return self._downsample("var", ddof=ddof) @Appender(GroupBy.size.__doc__) def size(self): # It's a special case as higher level does return # a copy of 0-len objects. 
GH14962 - result = self._downsample('size') + result = self._downsample("size") if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame): - result = pd.Series([], index=result.index, dtype='int64') + result = pd.Series([], index=result.index, dtype="int64") return result def quantile(self, q=0.5, **kwargs): @@ -856,40 +884,45 @@ def quantile(self, q=0.5, **kwargs): DataFrame.quantile DataFrameGroupBy.quantile """ - return self._downsample('quantile', q=q, **kwargs) + return self._downsample("quantile", q=q, **kwargs) # downsample methods -for method in ['sum', 'prod']: +for method in ["sum", "prod"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method, min_count=min_count) + f.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, f) # downsample methods -for method in ['min', 'max', 'first', 'last', 'mean', 'sem', - 'median', 'ohlc']: +for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method) + g.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, g) # groupby & aggregate methods -for method in ['count']: +for method in ["count"]: + def h(self, _method=method): return self._downsample(_method) + h.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, h) # series only methods -for method in ['nunique']: +for method in ["nunique"]: + def h(self, _method=method): return self._downsample(_method) + h.__doc__ = getattr(SeriesGroupBy, method).__doc__ setattr(Resampler, method, h) @@ -913,26 +946,30 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): # if we have both a how and fill_method, then show # the following warning if fill_method is None: - warnings.warn("how in .resample() is deprecated\n" - "the new syntax is " - ".resample(...).{method}".format( - method=method), - FutureWarning, stacklevel=3) + warnings.warn( + "how in .resample() is deprecated\n" + "the new syntax is " + ".resample(...).{method}".format(method=method), + FutureWarning, + stacklevel=3, + ) r = r.aggregate(how) if fill_method is not None: # show the prior function call - method = '.' + method if how is not None else '' + method = "." + method if how is not None else "" args = "limit={0}".format(limit) if limit is not None else "" - warnings.warn("fill_method is deprecated to .resample()\n" - "the new syntax is .resample(...){method}" - ".{fill_method}({args})".format( - method=method, - fill_method=fill_method, - args=args), - FutureWarning, stacklevel=3) + warnings.warn( + "fill_method is deprecated to .resample()\n" + "the new syntax is .resample(...){method}" + ".{fill_method}({args})".format( + method=method, fill_method=fill_method, args=args + ), + FutureWarning, + stacklevel=3, + ) if how is not None: r = getattr(r, fill_method)(limit=limit) @@ -946,10 +983,11 @@ class _GroupByMixin(GroupByMixin): """ Provide the groupby facilities. 
""" + def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop('parent', None) - groupby = kwargs.pop('groupby', None) + parent = kwargs.pop("parent", None) + groupby = kwargs.pop("groupby", None) if parent is None: parent = obj @@ -988,7 +1026,6 @@ def func(x): class DatetimeIndexResampler(Resampler): - @property def _resampler_for_grouping(self): return DatetimeIndexResamplerGroupby @@ -996,7 +1033,7 @@ def _resampler_for_grouping(self): def _get_binner_for_time(self): # this is how we are actually creating the bins - if self.kind == 'period': + if self.kind == "period": return self.groupby._get_time_period_bins(self.ax) return self.groupby._get_time_bins(self.ax) @@ -1030,8 +1067,7 @@ def _downsample(self, how, **kwargs): # we are downsampling # we want to call the actual grouper method here - result = obj.groupby( - self.grouper, axis=self.axis).aggregate(how, **kwargs) + result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs) result = self._apply_loffset(result) return self._wrap_result(result) @@ -1042,7 +1078,7 @@ def _adjust_binner_for_upsample(self, binner): The range of a new index should not be outside specified range """ - if self.closed == 'right': + if self.closed == "right": binner = binner[1:] else: binner = binner[:-1] @@ -1066,12 +1102,14 @@ def _upsample(self, method, limit=None, fill_value=None): """ self._set_binner() if self.axis: - raise AssertionError('axis must be 0') + raise AssertionError("axis must be 0") if self._from_selection: - raise ValueError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + raise ValueError( + "Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like" + ) ax = self.ax obj = self._selected_obj @@ -1083,8 +1121,9 @@ def _upsample(self, method, limit=None, fill_value=None): result = obj.copy() result.index = res_index else: - result = obj.reindex(res_index, method=method, - limit=limit, fill_value=fill_value) + result = obj.reindex( + res_index, method=method, limit=limit, fill_value=fill_value + ) result = self._apply_loffset(result) return self._wrap_result(result) @@ -1094,7 +1133,7 @@ def _wrap_result(self, result): # we may have a different kind that we were asked originally # convert if needed - if self.kind == 'period' and not isinstance(result.index, PeriodIndex): + if self.kind == "period" and not isinstance(result.index, PeriodIndex): result.index = result.index.to_period(self.freq) return result @@ -1105,19 +1144,19 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return DatetimeIndexResampler class PeriodIndexResampler(DatetimeIndexResampler): - @property def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby def _get_binner_for_time(self): - if self.kind == 'timestamp': + if self.kind == "timestamp": return super()._get_binner_for_time() return self.groupby._get_period_bins(self.ax) @@ -1126,18 +1165,20 @@ def _convert_obj(self, obj): if self._from_selection: # see GH 14008, GH 12871 - msg = ("Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index") + msg = ( + "Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index" + ) raise NotImplementedError(msg) if self.loffset is not None: # Cannot apply loffset/timedelta to PeriodIndex -> convert to # timestamps - self.kind = 'timestamp' + self.kind = "timestamp" # convert to timestamp - if self.kind == 'timestamp': + if self.kind == "timestamp": obj = obj.to_timestamp(how=self.convention) return obj @@ -1153,7 +1194,7 @@ def _downsample(self, how, **kwargs): """ # we may need to actually resample as if we are timestamps - if self.kind == 'timestamp': + if self.kind == "timestamp": return super()._downsample(how, **kwargs) how = self._is_cython_func(how) or how @@ -1161,10 +1202,9 @@ def _downsample(self, how, **kwargs): if is_subperiod(ax.freq, self.freq): # Downsampling - return self._groupby_and_aggregate(how, grouper=self.grouper, - **kwargs) + return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs) elif is_superperiod(ax.freq, self.freq): - if how == 'ohlc': + if how == "ohlc": # GH #13083 # upsampling to subperiods is handled as an asfreq, which works # for pure aggregating/reducing methods @@ -1176,8 +1216,9 @@ def _downsample(self, how, **kwargs): return self.asfreq() raise IncompatibleFrequency( - 'Frequency {} cannot be resampled to {}, as they are not ' - 'sub or super periods'.format(ax.freq, self.freq)) + "Frequency {} cannot be resampled to {}, as they are not " + "sub or super periods".format(ax.freq, self.freq) + ) def _upsample(self, method, limit=None, fill_value=None): """ @@ -1197,9 +1238,8 @@ def _upsample(self, method, limit=None, fill_value=None): """ # we may need to actually resample as if we are timestamps - if self.kind == 'timestamp': - return super()._upsample(method, limit=limit, - fill_value=fill_value) + if self.kind == "timestamp": + return super()._upsample(method, limit=limit, fill_value=fill_value) self._set_binner() ax = self.ax @@ -1211,8 +1251,9 @@ def _upsample(self, method, limit=None, fill_value=None): # Get the fill indexer indexer = memb.get_indexer(new_index, method=method, limit=limit) - return self._wrap_result(_take_new_index( - obj, indexer, new_index, axis=self.axis)) + return self._wrap_result( + _take_new_index(obj, indexer, new_index, axis=self.axis) + ) class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): @@ -1221,13 +1262,13 @@ class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return PeriodIndexResampler class TimedeltaIndexResampler(DatetimeIndexResampler): - @property def _resampler_for_grouping(self): return TimedeltaIndexResamplerGroupby @@ -1251,6 +1292,7 @@ class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): .. versionadded:: 0.18.1 """ + @property def _constructor(self): return TimedeltaIndexResampler @@ -1267,22 +1309,20 @@ def resample(obj, kind=None, **kwds): resample.__doc__ = Resampler.__doc__ -def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, - limit=None, kind=None, **kwargs): +def get_resampler_for_grouping( + groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs +): """ Return our appropriate resampler when grouping as well. 
""" # .resample uses 'on' similar to how .groupby uses 'key' - kwargs['key'] = kwargs.pop('on', None) + kwargs["key"] = kwargs.pop("on", None) tg = TimeGrouper(freq=rule, **kwargs) resampler = tg._get_resampler(groupby.obj, kind=kind) r = resampler._get_resampler_for_grouping(groupby=groupby) - return _maybe_process_deprecations(r, - how=how, - fill_method=fill_method, - limit=limit) + return _maybe_process_deprecations(r, how=how, fill_method=fill_method, limit=limit) class TimeGrouper(Grouper): @@ -1297,45 +1337,61 @@ class TimeGrouper(Grouper): convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex """ - _attributes = Grouper._attributes + ('closed', 'label', 'how', - 'loffset', 'kind', 'convention', - 'base') - def __init__(self, freq='Min', closed=None, label=None, how='mean', - axis=0, fill_method=None, limit=None, loffset=None, - kind=None, convention=None, base=0, **kwargs): + _attributes = Grouper._attributes + ( + "closed", + "label", + "how", + "loffset", + "kind", + "convention", + "base", + ) + + def __init__( + self, + freq="Min", + closed=None, + label=None, + how="mean", + axis=0, + fill_method=None, + limit=None, + loffset=None, + kind=None, + convention=None, + base=0, + **kwargs + ): # Check for correctness of the keyword arguments which would # otherwise silently use the default if misspelled - if label not in {None, 'left', 'right'}: - raise ValueError('Unsupported value {} for `label`'.format(label)) - if closed not in {None, 'left', 'right'}: - raise ValueError('Unsupported value {} for `closed`'.format( - closed)) - if convention not in {None, 'start', 'end', 'e', 's'}: - raise ValueError('Unsupported value {} for `convention`' - .format(convention)) + if label not in {None, "left", "right"}: + raise ValueError("Unsupported value {} for `label`".format(label)) + if closed not in {None, "left", "right"}: + raise ValueError("Unsupported value {} for `closed`".format(closed)) + if convention not in {None, "start", "end", "e", "s"}: + raise ValueError("Unsupported value {} for `convention`".format(convention)) freq = to_offset(freq) - end_types = {'M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'} + end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"} rule = freq.rule_code - if (rule in end_types or - ('-' in rule and rule[:rule.find('-')] in end_types)): + if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types): if closed is None: - closed = 'right' + closed = "right" if label is None: - label = 'right' + label = "right" else: if closed is None: - closed = 'left' + closed = "left" if label is None: - label = 'left' + label = "left" self.closed = closed self.label = label self.kind = kind - self.convention = convention or 'E' + self.convention = convention or "E" self.convention = self.convention.lower() if isinstance(loffset, str): @@ -1348,7 +1404,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.base = base # always sort time groupers - kwargs['sort'] = True + kwargs["sort"] = True super().__init__(freq=freq, axis=axis, **kwargs) @@ -1375,23 +1431,17 @@ def _get_resampler(self, obj, kind=None): ax = self.ax if isinstance(ax, DatetimeIndex): - return DatetimeIndexResampler(obj, - groupby=self, - kind=kind, - axis=self.axis) - elif isinstance(ax, PeriodIndex) or kind == 'period': - return PeriodIndexResampler(obj, - groupby=self, - kind=kind, - axis=self.axis) + return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis) + elif isinstance(ax, PeriodIndex) or kind == "period": + return PeriodIndexResampler(obj, 
groupby=self, kind=kind, axis=self.axis) elif isinstance(ax, TimedeltaIndex): - return TimedeltaIndexResampler(obj, - groupby=self, - axis=self.axis) + return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis) - raise TypeError("Only valid with DatetimeIndex, " - "TimedeltaIndex or PeriodIndex, " - "but got an instance of %r" % type(ax).__name__) + raise TypeError( + "Only valid with DatetimeIndex, " + "TimedeltaIndex or PeriodIndex, " + "but got an instance of %r" % type(ax).__name__ + ) def _get_grouper(self, obj, validate=True): # create the resampler and return our binner @@ -1401,43 +1451,46 @@ def _get_grouper(self, obj, validate=True): def _get_time_bins(self, ax): if not isinstance(ax, DatetimeIndex): - raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a DatetimeIndex, but got " + "an instance of %r" % type(ax).__name__ + ) if len(ax) == 0: - binner = labels = DatetimeIndex( - data=[], freq=self.freq, name=ax.name) + binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels - first, last = _get_timestamp_range_edges(ax.min(), ax.max(), - self.freq, - closed=self.closed, - base=self.base) + first, last = _get_timestamp_range_edges( + ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base + ) # GH #12037 # use first/last directly instead of call replace() on them # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = date_range(freq=self.freq, - start=first, - end=last, - tz=ax.tz, - name=ax.name, - ambiguous='infer', - nonexistent='shift_forward') + binner = labels = date_range( + freq=self.freq, + start=first, + end=last, + tz=ax.tz, + name=ax.name, + ambiguous="infer", + nonexistent="shift_forward", + ) ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) # general version, knowing nothing about relative frequencies bins = lib.generate_bins_dt64( - ax_values, bin_edges, self.closed, hasnans=ax.hasnans) + ax_values, bin_edges, self.closed, hasnans=ax.hasnans + ) - if self.closed == 'right': + if self.closed == "right": labels = binner - if self.label == 'right': + if self.label == "right": labels = labels[1:] - elif self.label == 'right': + elif self.label == "right": labels = labels[1:] if ax.hasnans: @@ -1448,15 +1501,15 @@ def _get_time_bins(self, ax): # adjust the labels # GH4076 if len(bins) < len(labels): - labels = labels[:len(bins)] + labels = labels[: len(bins)] return binner, bins, labels def _adjust_bin_edges(self, binner, ax_values): # Some hacks for > daily data, see #1471, #1458, #1483 - if self.freq != 'D' and is_superperiod(self.freq, 'D'): - if self.closed == 'right': + if self.freq != "D" and is_superperiod(self.freq, "D"): + if self.closed == "right": # GH 21459, GH 9119: Adjust the bins relative to the wall time bin_edges = binner.tz_localize(None) bin_edges = bin_edges + timedelta(1) - Nano(1) @@ -1474,22 +1527,22 @@ def _adjust_bin_edges(self, binner, ax_values): def _get_time_delta_bins(self, ax): if not isinstance(ax, TimedeltaIndex): - raise TypeError('axis must be a TimedeltaIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a TimedeltaIndex, but got " + "an instance of %r" % type(ax).__name__ + ) if not len(ax): - binner = labels = TimedeltaIndex( - data=[], freq=self.freq, name=ax.name) + binner 
= labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = timedelta_range(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = timedelta_range( + start=start, end=end, freq=self.freq, name=ax.name + ) end_stamps = labels + self.freq - bins = ax.searchsorted(end_stamps, side='left') + bins = ax.searchsorted(end_stamps, side="left") # Addresses GH #10530 if self.base > 0: @@ -1499,8 +1552,10 @@ def _get_time_delta_bins(self, ax): def _get_time_period_bins(self, ax): if not isinstance(ax, DatetimeIndex): - raise TypeError('axis must be a DatetimeIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a DatetimeIndex, but got " + "an instance of %r" % type(ax).__name__ + ) freq = self.freq @@ -1508,22 +1563,23 @@ def _get_time_period_bins(self, ax): binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name) return binner, [], labels - labels = binner = pd.period_range(start=ax[0], - end=ax[-1], - freq=freq, - name=ax.name) + labels = binner = pd.period_range( + start=ax[0], end=ax[-1], freq=freq, name=ax.name + ) - end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp() + end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() if ax.tzinfo: end_stamps = end_stamps.tz_localize(ax.tzinfo) - bins = ax.searchsorted(end_stamps, side='left') + bins = ax.searchsorted(end_stamps, side="left") return binner, bins, labels def _get_period_bins(self, ax): if not isinstance(ax, PeriodIndex): - raise TypeError('axis must be a PeriodIndex, but got ' - 'an instance of %r' % type(ax).__name__) + raise TypeError( + "axis must be a PeriodIndex, but got " + "an instance of %r" % type(ax).__name__ + ) memb = ax.asfreq(self.freq, how=self.convention) @@ -1535,33 +1591,30 @@ def _get_period_bins(self, ax): # if index contains no valid (non-NaT) values, return empty index if not len(memb): - binner = labels = PeriodIndex( - data=[], freq=self.freq, name=ax.name) + binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) return binner, [], labels freq_mult = self.freq.n start = ax.min().asfreq(self.freq, how=self.convention) - end = ax.max().asfreq(self.freq, how='end') + end = ax.max().asfreq(self.freq, how="end") bin_shift = 0 # GH 23882 if self.base: # get base adjusted bin edge labels - p_start, end = _get_period_range_edges(start, - end, - self.freq, - closed=self.closed, - base=self.base) + p_start, end = _get_period_range_edges( + start, end, self.freq, closed=self.closed, base=self.base + ) # Get offset for bin edge (not label edge) adjustment - start_offset = (pd.Period(start, self.freq) - - pd.Period(p_start, self.freq)) + start_offset = pd.Period(start, self.freq) - pd.Period(p_start, self.freq) bin_shift = start_offset.n % freq_mult start = p_start - labels = binner = pd.period_range(start=start, end=end, - freq=self.freq, name=ax.name) + labels = binner = pd.period_range( + start=start, end=end, freq=self.freq, name=ax.name + ) i8 = memb.asi8 @@ -1572,7 +1625,7 @@ def _get_period_bins(self, ax): rng += freq_mult # adjust bin edge indexes to account for base rng -= bin_shift - bins = memb.searchsorted(rng, side='left') + bins = memb.searchsorted(rng, side="left") if nat_count > 0: # NaT handling as in pandas._lib.lib.generate_bins_dt64() @@ -1594,13 +1647,14 @@ def _take_new_index(obj, indexer, new_index, axis=0): elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") - return 
DataFrame(obj._data.reindex_indexer( - new_axis=new_index, indexer=indexer, axis=1)) + return DataFrame( + obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) + ) else: raise ValueError("'obj' should be either a Series or a DataFrame") -def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): +def _get_timestamp_range_edges(first, last, offset, closed="left", base=0): """ Adjust the `first` Timestamp to the preceding Timestamp that resides on the provided offset. Adjust the `last` Timestamp to the following @@ -1634,8 +1688,9 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): first = first.tz_localize(None) last = last.tz_localize(None) - first, last = _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) + first, last = _adjust_dates_anchored( + first, last, offset, closed=closed, base=base + ) if isinstance(offset, Day): first = first.tz_localize(tz) last = last.tz_localize(tz) @@ -1645,7 +1700,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): first = first.normalize() last = last.normalize() - if closed == 'left': + if closed == "left": first = Timestamp(offset.rollback(first)) else: first = Timestamp(first - offset) @@ -1655,7 +1710,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): return first, last -def _get_period_range_edges(first, last, offset, closed='left', base=0): +def _get_period_range_edges(first, last, offset, closed="left", base=0): """ Adjust the provided `first` and `last` Periods to the respective Period of the given offset that encompasses them. @@ -1686,15 +1741,16 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0): adjust_first = not offset.onOffset(first) adjust_last = offset.onOffset(last) - first, last = _get_timestamp_range_edges(first, last, offset, - closed=closed, base=base) + first, last = _get_timestamp_range_edges( + first, last, offset, closed=closed, base=base + ) first = (first + adjust_first * offset).to_period(offset) last = (last - adjust_last * offset).to_period(offset) return first, last -def _adjust_dates_anchored(first, last, offset, closed='right', base=0): +def _adjust_dates_anchored(first, last, offset, closed="right", base=0): # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. 
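Illustrative only, not part of this diff: the bin-edge helpers re-wrapped above (_get_time_bins, _get_timestamp_range_edges, _adjust_dates_anchored) sit behind the public resample API. A minimal usage sketch, assuming a pandas version contemporary with this patch:

    import pandas as pd

    idx = pd.date_range("2019-01-01", periods=9, freq="T")
    s = pd.Series(range(9), index=idx)
    # closed/label pick which edge each 3-minute bin owns -- the same choice
    # handled by the closed == "right" branches in _get_time_bins above.
    s.resample("3T", closed="right", label="right").sum()
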
@@ -1708,9 +1764,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): last_tzinfo = last.tzinfo start_day_nanos = first.normalize().value if first_tzinfo is not None: - first = first.tz_convert('UTC') + first = first.tz_convert("UTC") if last_tzinfo is not None: - last = last.tz_convert('UTC') + last = last.tz_convert("UTC") base_nanos = (base % offset.n) * offset.nanos // offset.n start_day_nanos += base_nanos @@ -1718,7 +1774,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): foffset = (first.value - start_day_nanos) % offset.nanos loffset = (last.value - start_day_nanos) % offset.nanos - if closed == 'right': + if closed == "right": if foffset > 0: # roll back fresult = first.value - foffset @@ -1746,9 +1802,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): fresult = Timestamp(fresult) lresult = Timestamp(lresult) if first_tzinfo is not None: - fresult = fresult.tz_localize('UTC').tz_convert(first_tzinfo) + fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) if last_tzinfo is not None: - lresult = lresult.tz_localize('UTC').tz_convert(last_tzinfo) + lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) return fresult, lresult @@ -1761,7 +1817,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): raise NotImplementedError("'method' argument is not supported") if how is None: - how = 'E' + how = "E" new_obj = obj.copy() new_obj.index = obj.index.asfreq(freq, how=how) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index d4272cf6e406d..5a476dceca1f3 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -11,11 +11,16 @@ from pandas import DataFrame, Index, MultiIndex, Series from pandas.core import common as com from pandas.core.arrays.categorical import ( - _factorize_from_iterable, _factorize_from_iterables) + _factorize_from_iterable, + _factorize_from_iterables, +) from pandas.core.generic import NDFrame from pandas.core.index import ( - _all_indexes_same, _get_consensus_names, _get_objs_combined_axis, - ensure_index) + _all_indexes_same, + _get_consensus_names, + _get_objs_combined_axis, + ensure_index, +) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers @@ -23,9 +28,19 @@ # Concatenate DataFrame objects -def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, - sort=None, copy=True): +def concat( + objs, + axis=0, + join="outer", + join_axes=None, + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + sort=None, + copy=True, +): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -226,10 +241,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, ... 
ValueError: Indexes have overlapping values: ['a'] """ - op = _Concatenator(objs, axis=axis, ignore_index=ignore_index, join=join, - join_axes=join_axes, keys=keys, levels=levels, - names=names, verify_integrity=verify_integrity, - copy=copy, sort=sort) + op = _Concatenator( + objs, + axis=axis, + ignore_index=ignore_index, + join=join, + join_axes=join_axes, + keys=keys, + levels=levels, + names=names, + verify_integrity=verify_integrity, + copy=copy, + sort=sort, + ) return op.get_result() @@ -239,21 +263,35 @@ class _Concatenator: Orchestrates a concatenation operation for BlockManagers """ - def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, - levels=None, names=None, ignore_index=False, - verify_integrity=False, copy=True, sort=False): + def __init__( + self, + objs, + axis=0, + join="outer", + join_axes=None, + keys=None, + levels=None, + names=None, + ignore_index=False, + verify_integrity=False, + copy=True, + sort=False, + ): if isinstance(objs, (NDFrame, str)): - raise TypeError('first argument must be an iterable of pandas ' - 'objects, you passed an object of type ' - '"{name}"'.format(name=type(objs).__name__)) + raise TypeError( + "first argument must be an iterable of pandas " + "objects, you passed an object of type " + '"{name}"'.format(name=type(objs).__name__) + ) - if join == 'outer': + if join == "outer": self.intersect = False - elif join == 'inner': + elif join == "inner": self.intersect = True else: # pragma: no cover - raise ValueError('Only can inner (intersect) or outer (union) ' - 'join the other axis') + raise ValueError( + "Only can inner (intersect) or outer (union) " "join the other axis" + ) if isinstance(objs, dict): if keys is None: @@ -263,7 +301,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, objs = list(objs) if len(objs) == 0: - raise ValueError('No objects to concatenate') + raise ValueError("No objects to concatenate") if keys is None: objs = list(com._not_none(*objs)) @@ -277,19 +315,20 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, clean_keys.append(k) clean_objs.append(v) objs = clean_objs - name = getattr(keys, 'name', None) + name = getattr(keys, "name", None) keys = Index(clean_keys, name=name) if len(objs) == 0: - raise ValueError('All objects passed were None') + raise ValueError("All objects passed were None") # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: if not isinstance(obj, (Series, DataFrame)): - msg = ("cannot concatenate object of type '{}';" - ' only Series and DataFrame objs are valid' - .format(type(obj))) + msg = ( + "cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid".format(type(obj)) + ) raise TypeError(msg) # consolidate @@ -310,11 +349,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, else: # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name - non_empties = [obj for obj in objs - if sum(obj.shape) > 0 or isinstance(obj, Series)] + non_empties = [ + obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series) + ] - if (len(non_empties) and (keys is None and names is None and - levels is None and not self.intersect)): + if len(non_empties) and ( + keys is None and names is None and levels is None and not self.intersect + ): objs = non_empties sample = objs[0] @@ -335,8 +376,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, 
keys=None, self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: - raise AssertionError("axis must be between 0 and {ndim}, input was" - " {axis}".format(ndim=sample.ndim, axis=axis)) + raise AssertionError( + "axis must be between 0 and {ndim}, input was" + " {axis}".format(ndim=sample.ndim, axis=axis) + ) # if we have mixed ndims, then convert to highest ndim # creating column numbers as needed @@ -351,11 +394,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, pass elif ndim != max_ndim - 1: - raise ValueError("cannot concatenate unaligned mixed " - "dimensional NDFrame objects") + raise ValueError( + "cannot concatenate unaligned mixed " + "dimensional NDFrame objects" + ) else: - name = getattr(obj, 'name', None) + name = getattr(obj, "name", None) if ignore_index or name is None: name = current_column current_column += 1 @@ -372,7 +417,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, self.axis = axis self.join_axes = join_axes self.keys = keys - self.names = names or getattr(keys, 'names', None) + self.names = names or getattr(keys, "names", None) self.levels = levels self.sort = sort @@ -391,10 +436,11 @@ def get_result(self): if self.axis == 0: name = com.consensus_name_attr(self.objs) - mgr = self.objs[0]._data.concat([x._data for x in self.objs], - self.new_axes) + mgr = self.objs[0]._data.concat( + [x._data for x in self.objs], self.new_axes + ) cons = _concat._get_series_result_type(mgr, self.objs) - return cons(mgr, name=name).__finalize__(self, method='concat') + return cons(mgr, name=name).__finalize__(self, method="concat") # combine as columns in a frame else: @@ -404,7 +450,7 @@ def get_result(self): index, columns = self.new_axes df = cons(data, index=index) df.columns = columns - return df.__finalize__(self, method='concat') + return df.__finalize__(self, method="concat") # combine block managers else: @@ -424,14 +470,15 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, - copy=self.copy) + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy + ) if not self.copy: new_data._consolidate_inplace() cons = _concat._get_frame_result_type(new_data, self.objs) - return (cons._from_axes(new_data, self.new_axes) - .__finalize__(self, method='concat')) + return cons._from_axes(new_data, self.new_axes).__finalize__( + self, method="concat" + ) def _get_result_dim(self): if self._is_series and self.axis == 1: @@ -452,13 +499,18 @@ def _get_new_axes(self): else: # GH 21951 warnings.warn( - 'The join_axes-keyword is deprecated. Use .reindex or ' - '.reindex_like on the result to achieve the same ' - 'functionality.', FutureWarning, stacklevel=4) + "The join_axes-keyword is deprecated. Use .reindex or " + ".reindex_like on the result to achieve the same " + "functionality.", + FutureWarning, + stacklevel=4, + ) if len(self.join_axes) != ndim - 1: - raise AssertionError("length of join_axes must be equal " - "to {length}".format(length=ndim - 1)) + raise AssertionError( + "length of join_axes must be equal " + "to {length}".format(length=ndim - 1) + ) # ufff... 
indices = list(range(ndim)) @@ -473,13 +525,12 @@ def _get_new_axes(self): def _get_comb_axis(self, i): data_axis = self.objs[0]._get_block_manager_axis(i) try: - return _get_objs_combined_axis(self.objs, axis=data_axis, - intersect=self.intersect, - sort=self.sort) + return _get_objs_combined_axis( + self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort + ) except IndexError: types = [type(x).__name__ for x in self.objs] - raise TypeError("Cannot concatenate list of {types}" - .format(types=types)) + raise TypeError("Cannot concatenate list of {types}".format(types=types)) def _get_concat_axis(self): """ @@ -497,9 +548,10 @@ def _get_concat_axis(self): has_names = False for i, x in enumerate(self.objs): if not isinstance(x, Series): - raise TypeError("Cannot concatenate type 'Series' " - "with object of type {type!r}" - .format(type=type(x).__name__)) + raise TypeError( + "Cannot concatenate type 'Series' " + "with object of type {type!r}".format(type=type(x).__name__) + ) if x.name is not None: names[i] = x.name has_names = True @@ -522,8 +574,9 @@ def _get_concat_axis(self): if self.keys is None: concat_axis = _concat_indexes(indexes) else: - concat_axis = _make_concat_multiindex(indexes, self.keys, - self.levels, self.names) + concat_axis = _make_concat_multiindex( + indexes, self.keys, self.levels, self.names + ) self._maybe_check_integrity(concat_axis) @@ -533,8 +586,10 @@ def _maybe_check_integrity(self, concat_index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() - raise ValueError('Indexes have overlapping values: ' - '{overlap!s}'.format(overlap=overlap)) + raise ValueError( + "Indexes have overlapping values: " + "{overlap!s}".format(overlap=overlap) + ) def _concat_indexes(indexes): @@ -543,8 +598,9 @@ def _concat_indexes(indexes): def _make_concat_multiindex(indexes, keys, levels=None, names=None): - if ((levels is None and isinstance(keys[0], tuple)) or - (levels is not None and len(levels) > 1)): + if (levels is None and isinstance(keys[0], tuple)) or ( + levels is not None and len(levels) > 1 + ): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) @@ -575,8 +631,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): try: i = level.get_loc(key) except KeyError: - raise ValueError('Key {key!s} not in level {level!s}' - .format(key=key, level=level)) + raise ValueError( + "Key {key!s} not in level {level!s}".format( + key=key, level=level + ) + ) to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) @@ -597,14 +656,17 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: - raise AssertionError("Cannot concat indices that do" - " not have the same number of levels") + raise AssertionError( + "Cannot concat indices that do" + " not have the same number of levels" + ) # also copies names = names + _get_consensus_names(indexes) - return MultiIndex(levels=levels, codes=codes_list, names=names, - verify_integrity=False) + return MultiIndex( + levels=levels, codes=codes_list, names=names, verify_integrity=False + ) new_index = indexes[0] n = len(new_index) @@ -625,8 +687,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): mask = mapped == -1 if mask.any(): - raise ValueError('Values not found in passed level: {hlevel!s}' - .format(hlevel=hlevel[mask])) + raise 
ValueError( + "Values not found in passed level: {hlevel!s}".format( + hlevel=hlevel[mask] + ) + ) new_codes.append(np.repeat(mapped, n)) @@ -640,5 +705,6 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): if len(new_names) < len(new_levels): new_names.extend(new_index.names) - return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, - verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index d655a8be13de7..9a69942a70e01 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -15,12 +15,18 @@ from pandas.core.tools.numeric import to_numeric -@Appender(_shared_docs['melt'] % - dict(caller='pd.melt(df, ', - versionadded="", - other='DataFrame.melt')) -def melt(frame, id_vars=None, value_vars=None, var_name=None, - value_name='value', col_level=None): +@Appender( + _shared_docs["melt"] + % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") +) +def melt( + frame, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, +): # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -31,36 +37,42 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] - elif (isinstance(frame.columns, ABCMultiIndex) and - not isinstance(id_vars, list)): - raise ValueError('id_vars must be a list of tuples when columns' - ' are a MultiIndex') + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list): + raise ValueError( + "id_vars must be a list of tuples when columns" " are a MultiIndex" + ) else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(np.ravel(id_vars)).difference(cols) if not missing.empty: - raise KeyError("The following 'id_vars' are not present" - " in the DataFrame: {missing}" - "".format(missing=list(missing))) + raise KeyError( + "The following 'id_vars' are not present" + " in the DataFrame: {missing}" + "".format(missing=list(missing)) + ) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] - elif (isinstance(frame.columns, ABCMultiIndex) and - not isinstance(value_vars, list)): - raise ValueError('value_vars must be a list of tuples when' - ' columns are a MultiIndex') + elif isinstance(frame.columns, ABCMultiIndex) and not isinstance( + value_vars, list + ): + raise ValueError( + "value_vars must be a list of tuples when" " columns are a MultiIndex" + ) else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(np.ravel(value_vars)).difference(cols) if not missing.empty: - raise KeyError("The following 'value_vars' are not present in" - " the DataFrame: {missing}" - "".format(missing=list(missing))) + raise KeyError( + "The following 'value_vars' are not present in" + " the DataFrame: {missing}" + "".format(missing=list(missing)) + ) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() @@ -74,11 +86,13 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: - var_name = ['variable_{i}'.format(i=i) - for i in range(len(frame.columns.names))] + var_name = [ + "variable_{i}".format(i=i) for i in 
range(len(frame.columns.names)) + ] else: - var_name = [frame.columns.name if frame.columns.name is not None - else 'variable'] + var_name = [ + frame.columns.name if frame.columns.name is not None else "variable" + ] if isinstance(var_name, str): var_name = [var_name] @@ -96,11 +110,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None, mcolumns = id_vars + var_name + [value_name] - mdata[value_name] = frame.values.ravel('F') + mdata[value_name] = frame.values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index - mdata[col] = np.asanyarray(frame.columns - ._get_level_values(i)).repeat(N) + mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) return frame._constructor(mdata, columns=mcolumns) @@ -150,7 +163,7 @@ def lreshape(data, groups, dropna=True, label=None): for seq in values: if len(seq) != K: - raise ValueError('All column lists must be same length') + raise ValueError("All column lists must be same length") mdata = {} pivot_cols = [] @@ -159,6 +172,7 @@ def lreshape(data, groups, dropna=True, label=None): to_concat = [data[col].values for col in names] import pandas.core.dtypes.concat as _concat + mdata[target] = _concat._concat_compat(to_concat) pivot_cols.append(target) @@ -175,7 +189,7 @@ def lreshape(data, groups, dropna=True, label=None): return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): +def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): r""" Wide panel to long format. Less flexible but more user-friendly than melt. @@ -403,20 +417,27 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): 3 one 2.1 two 2.9 """ + def get_var_names(df, stub, sep, suffix): - regex = r'^{stub}{sep}{suffix}$'.format( - stub=re.escape(stub), sep=re.escape(sep), suffix=suffix) + regex = r"^{stub}{sep}{suffix}$".format( + stub=re.escape(stub), sep=re.escape(sep), suffix=suffix + ) pattern = re.compile(regex) return [col for col in df.columns if pattern.match(col)] def melt_stub(df, stub, i, j, value_vars, sep): - newdf = melt(df, id_vars=i, value_vars=value_vars, - value_name=stub.rstrip(sep), var_name=j) + newdf = melt( + df, + id_vars=i, + value_vars=value_vars, + value_name=stub.rstrip(sep), + var_name=j, + ) newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") # GH17627 Cast numerics suffixes to int/float - newdf[j] = to_numeric(newdf[j], errors='ignore') + newdf[j] = to_numeric(newdf[j], errors="ignore") return newdf.set_index(i + [j]) @@ -441,9 +462,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [melt_stub(df, s, i, j, v, sep) - for s, v in zip(stubnames, value_vars)] - melted = melted[0].join(melted[1:], how='outer') + melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)] + melted = melted[0].join(melted[1:], how="outer") if len(i) == 1: new = df[id_vars].set_index(i).join(melted) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 549c69486ebfa..4f910f6a278ad 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -15,12 +15,28 @@ from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, - is_bool_dtype, is_categorical_dtype, 
is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer, - is_integer_dtype, is_list_like, is_number, is_numeric_dtype, - is_object_dtype, needs_i8_conversion) + ensure_float64, + ensure_int64, + ensure_object, + is_array_like, + is_bool, + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike, + is_dtype_equal, + is_extension_array_dtype, + is_float_dtype, + is_int64_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_number, + is_numeric_dtype, + is_object_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -33,26 +49,46 @@ from pandas.core.sorting import is_int64_overflow_possible -@Substitution('\nleft : DataFrame') +@Substitution("\nleft : DataFrame") @Appender(_merge_doc, indents=0) -def merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): - op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator, - validate=validate) +def merge( + left, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, +): + op = _MergeOperation( + left, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + sort=sort, + suffixes=suffixes, + copy=copy, + indicator=indicator, + validate=validate, + ) return op.get_result() if __debug__: - merge.__doc__ = _merge_doc % '\nleft : DataFrame' + merge.__doc__ = _merge_doc % "\nleft : DataFrame" -def _groupby_and_merge(by, on, left, right, _merge_pieces, - check_duplicates=True): +def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True): """ groupby & merge; we are always performing a left-by type operation @@ -85,7 +121,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, on = [on] if right.duplicated(by + on).any(): - right = right.drop_duplicates(by + on, keep='last') + right = right.drop_duplicates(by + on, keep="last") rby = right.groupby(by, sort=False) except KeyError: rby = None @@ -100,8 +136,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, except KeyError: # key doesn't exist in left lcols = lhs.columns.tolist() - cols = lcols + [r for r in right.columns - if r not in set(lcols)] + cols = lcols + [r for r in right.columns if r not in set(lcols)] merged = lhs.reindex(columns=cols) merged.index = range(len(merged)) pieces.append(merged) @@ -123,16 +158,24 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, # preserve the original order # if we have a missing piece this can be reset from pandas.core.reshape.concat import concat + result = concat(pieces, ignore_index=True) result = result.reindex(columns=pieces[0].columns, copy=False) return result, lby -def merge_ordered(left, right, on=None, - left_on=None, right_on=None, - left_by=None, right_by=None, - fill_method=None, suffixes=('_x', '_y'), - how='outer'): +def merge_ordered( + left, + right, + on=None, + left_on=None, + right_on=None, + left_by=None, + right_by=None, + 
fill_method=None, + suffixes=("_x", "_y"), + how="outer", +): """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see @@ -211,36 +254,57 @@ def merge_ordered(left, right, on=None, 8 b d 2 3.0 9 b e 3 3.0 """ + def _merger(x, y): # perform the ordered merge operation - op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, - suffixes=suffixes, fill_method=fill_method, - how=how) + op = _OrderedMerge( + x, + y, + on=on, + left_on=left_on, + right_on=right_on, + suffixes=suffixes, + fill_method=fill_method, + how=how, + ) return op.get_result() if left_by is not None and right_by is not None: - raise ValueError('Can only group either left or right frames') + raise ValueError("Can only group either left or right frames") elif left_by is not None: - result, _ = _groupby_and_merge(left_by, on, left, right, - lambda x, y: _merger(x, y), - check_duplicates=False) + result, _ = _groupby_and_merge( + left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False + ) elif right_by is not None: - result, _ = _groupby_and_merge(right_by, on, right, left, - lambda x, y: _merger(y, x), - check_duplicates=False) + result, _ = _groupby_and_merge( + right_by, + on, + right, + left, + lambda x, y: _merger(y, x), + check_duplicates=False, + ) else: result = _merger(left, right) return result -def merge_asof(left, right, on=None, - left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - suffixes=('_x', '_y'), - tolerance=None, - allow_exact_matches=True, - direction='backward'): +def merge_asof( + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + by=None, + left_by=None, + right_by=None, + suffixes=("_x", "_y"), + tolerance=None, + allow_exact_matches=True, + direction="backward", +): """ Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. 
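Illustrative only, not part of this diff: merge_ordered and merge_asof, whose signatures are exploded onto one argument per line above, are public entry points. A minimal merge_asof sketch, matching each left row to the nearest preceding right key rather than an exact key:

    import pandas as pd

    left = pd.DataFrame(
        {"time": pd.to_datetime(["2019-07-01", "2019-07-03"]), "x": [1, 2]}
    )
    right = pd.DataFrame(
        {"time": pd.to_datetime(["2019-07-02", "2019-07-04"]), "y": [3, 4]}
    )
    # direction="backward": each left row takes the most recent earlier right key
    pd.merge_asof(left, right, on="time", direction="backward")
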
@@ -458,14 +522,23 @@ def merge_asof(left, right, on=None, 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN """ - op = _AsOfMerge(left, right, - on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, - by=by, left_by=left_by, right_by=right_by, - suffixes=suffixes, - how='asof', tolerance=tolerance, - allow_exact_matches=allow_exact_matches, - direction=direction) + op = _AsOfMerge( + left, + right, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + by=by, + left_by=left_by, + right_by=right_by, + suffixes=suffixes, + how="asof", + tolerance=tolerance, + allow_exact_matches=allow_exact_matches, + direction=direction, + ) return op.get_result() @@ -476,13 +549,26 @@ class _MergeOperation: Perform a database (SQL) merge operation between two DataFrame objects using either columns as keys or their row indexes """ - _merge_type = 'merge' - def __init__(self, left, right, how='inner', on=None, - left_on=None, right_on=None, axis=1, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None): + _merge_type = "merge" + + def __init__( + self, + left, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + axis=1, + left_index=False, + right_index=False, + sort=True, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ): left = validate_operand(left) right = validate_operand(right) self.left = self.orig_left = left @@ -506,34 +592,39 @@ def __init__(self, left, right, how='inner', on=None, if isinstance(self.indicator, str): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): - self.indicator_name = '_merge' if self.indicator else None + self.indicator_name = "_merge" if self.indicator else None else: raise ValueError( - 'indicator option can only accept boolean or string arguments') + "indicator option can only accept boolean or string arguments" + ) if not is_bool(left_index): raise ValueError( - 'left_index parameter must be of type bool, not ' - '{left_index}'.format(left_index=type(left_index))) + "left_index parameter must be of type bool, not " + "{left_index}".format(left_index=type(left_index)) + ) if not is_bool(right_index): raise ValueError( - 'right_index parameter must be of type bool, not ' - '{right_index}'.format(right_index=type(right_index))) + "right_index parameter must be of type bool, not " + "{right_index}".format(right_index=type(right_index)) + ) # warn user when merging between different levels if left.columns.nlevels != right.columns.nlevels: - msg = ('merging between different levels can give an unintended ' - 'result ({left} levels on the left, {right} on the right)' - ).format(left=left.columns.nlevels, - right=right.columns.nlevels) + msg = ( + "merging between different levels can give an unintended " + "result ({left} levels on the left, {right} on the right)" + ).format(left=left.columns.nlevels, right=right.columns.nlevels) warnings.warn(msg, UserWarning) self._validate_specification() # note this function has side effects - (self.left_join_keys, - self.right_join_keys, - self.join_names) = self._get_merge_keys() + ( + self.left_join_keys, + self.right_join_keys, + self.join_names, + ) = self._get_merge_keys() # validate the merge keys dtypes. 
We may need to coerce # to avoid incompat dtypes @@ -547,16 +638,16 @@ def __init__(self, left, right, how='inner', on=None, def get_result(self): if self.indicator: - self.left, self.right = self._indicator_pre_merge( - self.left, self.right) + self.left, self.right = self._indicator_pre_merge(self.left, self.right) join_index, left_indexer, right_indexer = self._get_join_info() ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) lindexers = {1: left_indexer} if left_indexer is not None else {} rindexers = {1: right_indexer} if right_indexer is not None else {} @@ -564,7 +655,9 @@ def get_result(self): result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, + copy=self.copy, + ) typ = self.left._constructor result = typ(result_data).__finalize__(self, method=self._merge_type) @@ -582,40 +675,42 @@ def _indicator_pre_merge(self, left, right): columns = left.columns.union(right.columns) - for i in ['_left_indicator', '_right_indicator']: + for i in ["_left_indicator", "_right_indicator"]: if i in columns: - raise ValueError("Cannot use `indicator=True` option when " - "data contains a column named {name}" - .format(name=i)) + raise ValueError( + "Cannot use `indicator=True` option when " + "data contains a column named {name}".format(name=i) + ) if self.indicator_name in columns: raise ValueError( - "Cannot use name of an existing column for indicator column") + "Cannot use name of an existing column for indicator column" + ) left = left.copy() right = right.copy() - left['_left_indicator'] = 1 - left['_left_indicator'] = left['_left_indicator'].astype('int8') + left["_left_indicator"] = 1 + left["_left_indicator"] = left["_left_indicator"].astype("int8") - right['_right_indicator'] = 2 - right['_right_indicator'] = right['_right_indicator'].astype('int8') + right["_right_indicator"] = 2 + right["_right_indicator"] = right["_right_indicator"].astype("int8") return left, right def _indicator_post_merge(self, result): - result['_left_indicator'] = result['_left_indicator'].fillna(0) - result['_right_indicator'] = result['_right_indicator'].fillna(0) + result["_left_indicator"] = result["_left_indicator"].fillna(0) + result["_right_indicator"] = result["_right_indicator"].fillna(0) - result[self.indicator_name] = Categorical((result['_left_indicator'] + - result['_right_indicator']), - categories=[1, 2, 3]) - result[self.indicator_name] = ( - result[self.indicator_name] - .cat.rename_categories(['left_only', 'right_only', 'both'])) + result[self.indicator_name] = Categorical( + (result["_left_indicator"] + result["_right_indicator"]), + categories=[1, 2, 3], + ) + result[self.indicator_name] = result[self.indicator_name].cat.rename_categories( + ["left_only", "right_only", "both"] + ) - result = result.drop(labels=['_left_indicator', '_right_indicator'], - axis=1) + result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) return result def _maybe_restore_index_levels(self, result): @@ -639,12 +734,14 @@ def _maybe_restore_index_levels(self, result): None """ names_to_restore = [] - for name, left_key, right_key in zip(self.join_names, - self.left_on, - self.right_on): - if (self.orig_left._is_level_reference(left_key) and - 
self.orig_right._is_level_reference(right_key) and - name not in result.index.names): + for name, left_key, right_key in zip( + self.join_names, self.left_on, self.right_on + ): + if ( + self.orig_left._is_level_reference(left_key) + and self.orig_right._is_level_reference(right_key) + and name not in result.index.names + ): names_to_restore.append(name) @@ -674,8 +771,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_has_missing: take_right = self.right_join_keys[i] - if not is_dtype_equal(result[name].dtype, - self.left[name].dtype): + if not is_dtype_equal( + result[name].dtype, self.left[name].dtype + ): take_left = self.left[name]._values elif name in self.right: @@ -686,12 +784,12 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if right_has_missing: take_left = self.left_join_keys[i] - if not is_dtype_equal(result[name].dtype, - self.right[name].dtype): + if not is_dtype_equal( + result[name].dtype, self.right[name].dtype + ): take_right = self.right[name]._values - elif left_indexer is not None \ - and is_array_like(self.left_join_keys[i]): + elif left_indexer is not None and is_array_like(self.left_join_keys[i]): take_left = self.left_join_keys[i] take_right = self.right_join_keys[i] @@ -701,15 +799,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): lvals = result[name]._values else: lfill = na_value_for_dtype(take_left.dtype) - lvals = algos.take_1d(take_left, left_indexer, - fill_value=lfill) + lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) if take_right is None: rvals = result[name]._values else: rfill = na_value_for_dtype(take_right.dtype) - rvals = algos.take_1d(take_right, right_indexer, - fill_value=rfill) + rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer # make sure to just use the right values @@ -724,61 +820,66 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): elif result._is_level_reference(name): if isinstance(result.index, MultiIndex): key_col.name = name - idx_list = [result.index.get_level_values(level_name) - if level_name != name else key_col - for level_name in result.index.names] + idx_list = [ + result.index.get_level_values(level_name) + if level_name != name + else key_col + for level_name in result.index.names + ] result.set_index(idx_list, inplace=True) else: result.index = Index(key_col, name=name) else: - result.insert(i, name or 'key_{i}'.format(i=i), key_col) + result.insert(i, name or "key_{i}".format(i=i), key_col) def _get_join_indexers(self): """ return the join indexers """ - return _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, - how=self.how) + return _get_join_indexers( + self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how + ) def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] - if self.left_index and self.right_index and self.how != 'asof': - join_index, left_indexer, right_indexer = \ - left_ax.join(right_ax, how=self.how, return_indexers=True, - sort=self.sort) - elif self.right_index and self.how == 'left': - join_index, left_indexer, right_indexer = \ - _left_join_on_index(left_ax, right_ax, self.left_join_keys, - sort=self.sort) - - elif self.left_index and self.how == 'right': - join_index, right_indexer, left_indexer = \ - _left_join_on_index(right_ax, left_ax, self.right_join_keys, - sort=self.sort) + if self.left_index and self.right_index and 
self.how != "asof": + join_index, left_indexer, right_indexer = left_ax.join( + right_ax, how=self.how, return_indexers=True, sort=self.sort + ) + elif self.right_index and self.how == "left": + join_index, left_indexer, right_indexer = _left_join_on_index( + left_ax, right_ax, self.left_join_keys, sort=self.sort + ) + + elif self.left_index and self.how == "right": + join_index, right_indexer, left_indexer = _left_join_on_index( + right_ax, left_ax, self.right_join_keys, sort=self.sort + ) else: - (left_indexer, - right_indexer) = self._get_join_indexers() + (left_indexer, right_indexer) = self._get_join_indexers() if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(self.left.index, - self.right.index, - left_indexer, - right_indexer, - how='right') + join_index = self._create_join_index( + self.left.index, + self.right.index, + left_indexer, + right_indexer, + how="right", + ) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(self.right.index, - self.left.index, - right_indexer, - left_indexer, - how='left') + join_index = self._create_join_index( + self.right.index, + self.left.index, + right_indexer, + left_indexer, + how="left", + ) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -789,8 +890,9 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, index, other_index, indexer, - other_indexer, how='left'): + def _create_join_index( + self, index, other_index, indexer, other_indexer, how="left" + ): """ Create a join index by rearranging one index to match another @@ -805,8 +907,7 @@ def _create_join_index(self, index, other_index, indexer, ------- join_index """ - if (self.how in (how, 'outer') and - not isinstance(other_index, MultiIndex)): + if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): # if final index requires values in other_index but not target # index, indexer may hold missing (-1) values, causing Index.take # to take the final value in target index. So, we set the last @@ -863,8 +964,7 @@ def _get_merge_keys(self): join_names.append(None) # what to do? 
else: if rk is not None: - right_keys.append( - right._get_label_or_level_values(rk)) + right_keys.append(right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -873,8 +973,7 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append( - right._get_label_or_level_values(rk)) + right_keys.append(right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -902,9 +1001,12 @@ def _get_merge_keys(self): left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): - right_keys = [lev._values.take(lev_codes) for lev, lev_codes - in zip(self.right.index.levels, - self.right.index.codes)] + right_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.right.index.levels, self.right.index.codes + ) + ] else: right_keys = [self.right.index._values] elif _any(self.right_on): @@ -916,9 +1018,12 @@ def _get_merge_keys(self): right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): - left_keys = [lev._values.take(lev_codes) for lev, lev_codes - in zip(self.left.index.levels, - self.left.index.codes)] + left_keys = [ + lev._values.take(lev_codes) + for lev, lev_codes in zip( + self.left.index.levels, self.left.index.codes + ) + ] else: left_keys = [self.left.index.values] @@ -937,9 +1042,9 @@ def _maybe_coerce_merge_keys(self): # for example if these are categorical, but are not dtype_equal # or if we have object and integer dtypes - for lk, rk, name in zip(self.left_join_keys, - self.right_join_keys, - self.join_names): + for lk, rk, name in zip( + self.left_join_keys, self.right_join_keys, self.join_names + ): if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): continue @@ -960,10 +1065,11 @@ def _maybe_coerce_merge_keys(self): elif is_dtype_equal(lk.dtype, rk.dtype): continue - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, - rk_dtype=rk.dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype) + ) # if we are numeric, then allow differing # kinds to proceed, eg. 
int64 and int8, int and float @@ -976,51 +1082,60 @@ def _maybe_coerce_merge_keys(self): # check whether ints and floats elif is_integer_dtype(rk) and is_float_dtype(lk): if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): - warnings.warn('You are merging on int and float ' - 'columns where the float values ' - 'are not equal to their int ' - 'representation', UserWarning) + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) continue elif is_float_dtype(rk) and is_integer_dtype(lk): if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): - warnings.warn('You are merging on int and float ' - 'columns where the float values ' - 'are not equal to their int ' - 'representation', UserWarning) + warnings.warn( + "You are merging on int and float " + "columns where the float values " + "are not equal to their int " + "representation", + UserWarning, + ) continue # let's infer and see if we are ok - elif (lib.infer_dtype(lk, skipna=False) - == lib.infer_dtype(rk, skipna=False)): + elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype( + rk, skipna=False + ): continue # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif ((lk_is_object and is_bool_dtype(rk)) or - (is_bool_dtype(lk) and rk_is_object)): + elif (lk_is_object and is_bool_dtype(rk)) or ( + is_bool_dtype(lk) and rk_is_object + ): pass # object values are allowed to be merged - elif ((lk_is_object and is_numeric_dtype(rk)) or - (is_numeric_dtype(lk) and rk_is_object)): + elif (lk_is_object and is_numeric_dtype(rk)) or ( + is_numeric_dtype(lk) and rk_is_object + ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) - bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] - string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] + bool_types = ["integer", "mixed-integer", "boolean", "empty"] + string_types = ["string", "unicode", "mixed", "bytes", "empty"] # inferred bool - if (inferred_left in bool_types and - inferred_right in bool_types): + if inferred_left in bool_types and inferred_right in bool_types: pass # unless we are merging non-string-like with string-like - elif ((inferred_left in string_types and - inferred_right not in string_types) or - (inferred_right in string_types and - inferred_left not in string_types)): + elif ( + inferred_left in string_types and inferred_right not in string_types + ) or ( + inferred_right in string_types and inferred_left not in string_types + ): raise ValueError(msg) # datetimelikes must match exactly @@ -1045,12 +1160,10 @@ def _maybe_coerce_merge_keys(self): # incompatible dtypes. See GH 16900. if name in self.left.columns: typ = lk.categories.dtype if lk_is_cat else object - self.left = self.left.assign( - **{name: self.left[name].astype(typ)}) + self.left = self.left.assign(**{name: self.left[name].astype(typ)}) if name in self.right.columns: typ = rk.categories.dtype if rk_is_cat else object - self.right = self.right.assign( - **{name: self.right[name].astype(typ)}) + self.right = self.right.assign(**{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
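Illustrative only, not part of this diff: the key-coercion checks re-wrapped in _maybe_coerce_merge_keys above are what produce the int/float warning quoted in the hunk. A minimal sketch of the case it guards against:

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 3], "a": list("xyz")})
    right = pd.DataFrame({"key": [1.0, 2.5, 3.0], "b": list("pqr")})
    # 2.5 has no exact int representation, so this merge should emit the
    # "merging on int and float columns" UserWarning and still return a result.
    pd.merge(left, right, on="key")
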
@@ -1060,43 +1173,53 @@ def _validate_specification(self): self.left_on, self.right_on = (), () elif self.left_index: if self.right_on is None: - raise MergeError('Must pass right_on or right_index=True') + raise MergeError("Must pass right_on or right_index=True") elif self.right_index: if self.left_on is None: - raise MergeError('Must pass left_on or left_index=True') + raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - common_cols = self.left.columns.intersection( - self.right.columns) + common_cols = self.left.columns.intersection(self.right.columns) if len(common_cols) == 0: raise MergeError( - 'No common columns to perform merge on. ' - 'Merge options: left_on={lon}, right_on={ron}, ' - 'left_index={lidx}, right_index={ridx}' - .format(lon=self.left_on, ron=self.right_on, - lidx=self.left_index, ridx=self.right_index)) + "No common columns to perform merge on. " + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=self.left_on, + ron=self.right_on, + lidx=self.left_index, + ridx=self.right_index, + ) + ) if not common_cols.is_unique: - raise MergeError("Data columns not unique: {common!r}" - .format(common=common_cols)) + raise MergeError( + "Data columns not unique: {common!r}".format(common=common_cols) + ) self.left_on = self.right_on = common_cols elif self.on is not None: if self.left_on is not None or self.right_on is not None: - raise MergeError('Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.') + raise MergeError( + 'Can only pass argument "on" OR "left_on" ' + 'and "right_on", not a combination of both.' + ) self.left_on = self.right_on = self.on elif self.left_on is not None: n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: - raise ValueError('len(left_on) must equal the number ' - 'of levels in the index of "right"') + raise ValueError( + "len(left_on) must equal the number " + 'of levels in the index of "right"' + ) self.right_on = [None] * n elif self.right_on is not None: n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: - raise ValueError('len(right_on) must equal the number ' - 'of levels in the index of "left"') + raise ValueError( + "len(right_on) must equal the number " + 'of levels in the index of "left"' + ) self.left_on = [None] * n if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") @@ -1107,46 +1230,53 @@ def _validate(self, validate): if self.left_index: left_unique = self.orig_left.index.is_unique else: - left_unique = MultiIndex.from_arrays(self.left_join_keys - ).is_unique + left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique if self.right_index: right_unique = self.orig_right.index.is_unique else: - right_unique = MultiIndex.from_arrays(self.right_join_keys - ).is_unique + right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique # Check data integrity if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: - raise MergeError("Merge keys are not unique in either left" - " or right dataset; not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in either left" + " or right dataset; not a one-to-one merge" + ) elif not left_unique: - raise MergeError("Merge keys are not unique in left dataset;" - " not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in left dataset;" + " not a one-to-one merge" + ) 
elif not right_unique: - raise MergeError("Merge keys are not unique in right dataset;" - " not a one-to-one merge") + raise MergeError( + "Merge keys are not unique in right dataset;" + " not a one-to-one merge" + ) elif validate in ["one_to_many", "1:m"]: if not left_unique: - raise MergeError("Merge keys are not unique in left dataset;" - " not a one-to-many merge") + raise MergeError( + "Merge keys are not unique in left dataset;" + " not a one-to-many merge" + ) elif validate in ["many_to_one", "m:1"]: if not right_unique: - raise MergeError("Merge keys are not unique in right dataset;" - " not a many-to-one merge") + raise MergeError( + "Merge keys are not unique in right dataset;" + " not a many-to-one merge" + ) - elif validate in ['many_to_many', 'm:m']: + elif validate in ["many_to_many", "m:m"]: pass else: raise ValueError("Not a valid argument for validate") -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', - **kwargs): +def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs): """ Parameters @@ -1164,14 +1294,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', """ from functools import partial - assert len(left_keys) == len(right_keys), \ - 'left_key and right_keys must be the same length' + assert len(left_keys) == len( + right_keys + ), "left_key and right_keys must be the same length" # bind `sort` arg. of _factorize_keys fkeys = partial(_factorize_keys, sort=sort) # get left & right join labels and num. of levels at each location - llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys))) + llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys))) # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1183,15 +1314,16 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how == 'left': - kwargs['sort'] = sort + if how == "left": + kwargs["sort"] = sort join_func = _join_functions[how] return join_func(lkey, rkey, count, **kwargs) -def _restore_dropped_levels_multijoin(left, right, dropped_level_names, - join_index, lindexer, rindexer): +def _restore_dropped_levels_multijoin( + left, right, dropped_level_names, join_index, lindexer, rindexer +): """ *this is an internal non-public method* @@ -1232,8 +1364,7 @@ def _convert_to_mulitindex(index): if isinstance(index, MultiIndex): return index else: - return MultiIndex.from_arrays([index.values], - names=[index.name]) + return MultiIndex.from_arrays([index.values], names=[index.name]) # For multi-multi joins with one overlapping level, # the returned index if of type Index @@ -1280,21 +1411,39 @@ def _convert_to_mulitindex(index): class _OrderedMerge(_MergeOperation): - _merge_type = 'ordered_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, axis=1, - suffixes=('_x', '_y'), copy=True, - fill_method=None, how='outer'): + _merge_type = "ordered_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + axis=1, + suffixes=("_x", "_y"), + copy=True, + fill_method=None, + how="outer", + ): self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, - left_index=left_index, - right_index=right_index, - right_on=right_on, axis=axis, - how=how, suffixes=suffixes, - sort=True # factorize sorts - ) + 
_MergeOperation.__init__( + self, + left, + right, + on=on, + left_on=left_on, + left_index=left_index, + right_index=right_index, + right_on=right_on, + axis=axis, + how=how, + suffixes=suffixes, + sort=True, # factorize sorts + ) def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() @@ -1303,25 +1452,26 @@ def get_result(self): ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf, - rdata.items, rsuf) + llabels, rlabels = _items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf + ) - if self.fill_method == 'ffill': + if self.fill_method == "ffill": left_join_indexer = libjoin.ffill_indexer(left_indexer) right_join_indexer = libjoin.ffill_indexer(right_indexer) else: left_join_indexer = left_indexer right_join_indexer = right_indexer - lindexers = { - 1: left_join_indexer} if left_join_indexer is not None else {} - rindexers = { - 1: right_join_indexer} if right_join_indexer is not None else {} + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, + copy=self.copy, + ) typ = self.left._constructor result = typ(result_data).__finalize__(self, method=self._merge_type) @@ -1332,43 +1482,56 @@ def get_result(self): def _asof_function(direction): - name = 'asof_join_{dir}'.format(dir=direction) + name = "asof_join_{dir}".format(dir=direction) return getattr(libjoin, name, None) def _asof_by_function(direction): - name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction) + name = "asof_join_{dir}_on_X_by_Y".format(dir=direction) return getattr(libjoin, name, None) _type_casters = { - 'int64_t': ensure_int64, - 'double': ensure_float64, - 'object': ensure_object, + "int64_t": ensure_int64, + "double": ensure_float64, + "object": ensure_object, } def _get_cython_type_upcast(dtype): """ Upcast a dtype to 'int64_t', 'double', or 'object' """ if is_integer_dtype(dtype): - return 'int64_t' + return "int64_t" elif is_float_dtype(dtype): - return 'double' + return "double" else: - return 'object' + return "object" class _AsOfMerge(_OrderedMerge): - _merge_type = 'asof_merge' - - def __init__(self, left, right, on=None, left_on=None, right_on=None, - left_index=False, right_index=False, - by=None, left_by=None, right_by=None, - axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None, - how='asof', tolerance=None, - allow_exact_matches=True, - direction='backward'): + _merge_type = "asof_merge" + + def __init__( + self, + left, + right, + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + by=None, + left_by=None, + right_by=None, + axis=1, + suffixes=("_x", "_y"), + copy=True, + fill_method=None, + how="asof", + tolerance=None, + allow_exact_matches=True, + direction="backward", + ): self.by = by self.left_by = left_by @@ -1377,11 +1540,20 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None, self.allow_exact_matches = allow_exact_matches self.direction = direction - _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, - right_on=right_on, left_index=left_index, - right_index=right_index, axis=axis, - how=how, suffixes=suffixes, - fill_method=fill_method) + _OrderedMerge.__init__( + self, + left, + right, + on=on, + 
left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + axis=axis, + how=how, + suffixes=suffixes, + fill_method=fill_method, + ) def _validate_specification(self): super()._validate_specification() @@ -1402,13 +1574,12 @@ def _validate_specification(self): # set 'by' columns if self.by is not None: if self.left_by is not None or self.right_by is not None: - raise MergeError('Can only pass by OR left_by ' - 'and right_by') + raise MergeError("Can only pass by OR left_by " "and right_by") self.left_by = self.right_by = self.by if self.left_by is None and self.right_by is not None: - raise MergeError('missing left_by') + raise MergeError("missing left_by") if self.left_by is not None and self.right_by is None: - raise MergeError('missing right_by') + raise MergeError("missing right_by") # add 'by' to our key-list so we can have it in the # output as a key @@ -1419,15 +1590,16 @@ def _validate_specification(self): self.right_by = [self.right_by] if len(self.left_by) != len(self.right_by): - raise MergeError('left_by and right_by must be same length') + raise MergeError("left_by and right_by must be same length") self.left_on = self.left_by + list(self.left_on) self.right_on = self.right_by + list(self.right_on) # check 'direction' is valid - if self.direction not in ['backward', 'forward', 'nearest']: - raise MergeError('direction invalid: {direction}' - .format(direction=self.direction)) + if self.direction not in ["backward", "forward", "nearest"]: + raise MergeError( + "direction invalid: {direction}".format(direction=self.direction) + ) @property def _asof_key(self): @@ -1437,15 +1609,12 @@ def _asof_key(self): def _get_merge_keys(self): # note this function has side effects - (left_join_keys, - right_join_keys, - join_names) = super()._get_merge_keys() + (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() # validate index types are the same for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - if (is_categorical_dtype(lk.dtype) and - is_categorical_dtype(rk.dtype)): + if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): # The generic error message is confusing for categoricals. # # In this function, the join keys include both the original @@ -1454,15 +1623,19 @@ def _get_merge_keys(self): # are not supported for the former, but will fail # later with a ValueError, so we don't *need* to check # for them here. 
- msg = ("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, both sides category, but not equal ones" - .format(i=i, lkdtype=repr(lk.dtype), - rkdtype=repr(rk.dtype))) + msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, both sides category, but not equal ones".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) else: - msg = ("incompatible merge keys [{i}] {lkdtype} and " - "{rkdtype}, must be the same type" - .format(i=i, lkdtype=repr(lk.dtype), - rkdtype=repr(rk.dtype))) + msg = ( + "incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type".format( + i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype) + ) + ) raise MergeError(msg) # validate tolerance; must be a Timedelta if we have a DTI @@ -1473,10 +1646,12 @@ def _get_merge_keys(self): else: lt = left_join_keys[-1] - msg = ("incompatible tolerance {tolerance}, must be compat " - "with type {lkdtype}".format( - tolerance=type(self.tolerance), - lkdtype=repr(lt.dtype))) + msg = ( + "incompatible tolerance {tolerance}, must be compat " + "with type {lkdtype}".format( + tolerance=type(self.tolerance), lkdtype=repr(lt.dtype) + ) + ) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): @@ -1511,16 +1686,18 @@ def _get_join_indexers(self): def flip(xs): """ unlike np.transpose, this returns an array of tuples """ - labels = list(string.ascii_lowercase[:len(xs)]) + labels = list(string.ascii_lowercase[: len(xs)]) dtypes = [x.dtype for x in xs] labeled_dtypes = list(zip(labels, dtypes)) return np.array(list(zip(*xs)), labeled_dtypes) # values to compare - left_values = (self.left.index.values if self.left_index else - self.left_join_keys[-1]) - right_values = (self.right.index.values if self.right_index else - self.right_join_keys[-1]) + left_values = ( + self.left.index.values if self.left_index else self.left_join_keys[-1] + ) + right_values = ( + self.right.index.values if self.right_index else self.right_join_keys[-1] + ) tolerance = self.tolerance # we require sortedness and non-null values in the join keys @@ -1529,20 +1706,20 @@ def flip(xs): if not Index(left_values).is_monotonic: if isnull(left_values).any(): - raise ValueError(msg_missings.format(side='left')) + raise ValueError(msg_missings.format(side="left")) else: - raise ValueError(msg_sorted.format(side='left')) + raise ValueError(msg_sorted.format(side="left")) if not Index(right_values).is_monotonic: if isnull(right_values).any(): - raise ValueError(msg_missings.format(side='right')) + raise ValueError(msg_missings.format(side="right")) else: - raise ValueError(msg_sorted.format(side='right')) + raise ValueError(msg_sorted.format(side="right")) # initial type conversion as needed if needs_i8_conversion(left_values): - left_values = left_values.view('i8') - right_values = right_values.view('i8') + left_values = left_values.view("i8") + right_values = right_values.view("i8") if tolerance is not None: tolerance = tolerance.value @@ -1572,19 +1749,18 @@ def flip(xs): # choose appropriate function by type func = _asof_by_function(self.direction) - return func(left_values, - right_values, - left_by_values, - right_by_values, - self.allow_exact_matches, - tolerance) + return func( + left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance, + ) else: # choose appropriate function by type func = _asof_function(self.direction) - return func(left_values, - right_values, - self.allow_exact_matches, - tolerance) + 
return func(left_values, right_values, self.allow_exact_matches, tolerance) def _get_multiindex_indexer(join_keys, index, sort): @@ -1594,13 +1770,11 @@ def _get_multiindex_indexer(join_keys, index, sort): fkeys = partial(_factorize_keys, sort=sort) # left & right join labels and num. of levels at each location - rcodes, lcodes, shape = map(list, zip(* map(fkeys, - index.levels, - join_keys))) + rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys))) if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: - i8copy = lambda a: a.astype('i8', subok=False, copy=True) + i8copy = lambda a: a.astype("i8", subok=False, copy=True) rcodes = list(map(i8copy, index.codes)) # fix right labels if there were any nulls @@ -1628,29 +1802,31 @@ def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) left_indexer, right_indexer = libjoin.left_outer_join( - ensure_int64(left_key), - ensure_int64(right_key), - count, sort=sort) + ensure_int64(left_key), ensure_int64(right_key), count, sort=sort + ) return left_indexer, right_indexer def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): if len(join_keys) > 1: - if not ((isinstance(right_ax, MultiIndex) and - len(join_keys) == right_ax.nlevels)): - raise AssertionError("If more than one join key is given then " - "'right_ax' must be a MultiIndex and the " - "number of join keys must be the number of " - "levels in right_ax") - - left_indexer, right_indexer = \ - _get_multiindex_indexer(join_keys, right_ax, sort=sort) + if not ( + (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) + ): + raise AssertionError( + "If more than one join key is given then " + "'right_ax' must be a MultiIndex and the " + "number of join keys must be the number of " + "levels in right_ax" + ) + + left_indexer, right_indexer = _get_multiindex_indexer( + join_keys, right_ax, sort=sort + ) else: jkey = join_keys[0] - left_indexer, right_indexer = \ - _get_single_indexer(jkey, right_ax, sort=sort) + left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) if sort or len(left_ax) != len(left_indexer): # if asked to sort or there are 1-to-many matches @@ -1667,22 +1843,22 @@ def _right_outer_join(x, y, max_groups): _join_functions = { - 'inner': libjoin.inner_join, - 'left': libjoin.left_outer_join, - 'right': _right_outer_join, - 'outer': libjoin.full_outer_join, + "inner": libjoin.inner_join, + "left": libjoin.left_outer_join, + "right": _right_outer_join, + "outer": libjoin.full_outer_join, } def _factorize_keys(lk, rk, sort=True): # Some pre-processing for non-ndarray lk / rk if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): - lk = getattr(lk, '_values', lk)._data - rk = getattr(rk, '_values', rk)._data + lk = getattr(lk, "_values", lk)._data + rk = getattr(rk, "_values", rk)._data - elif (is_categorical_dtype(lk) and - is_categorical_dtype(rk) and - lk.is_dtype_equal(rk)): + elif ( + is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk) + ): if lk.categories.equals(rk.categories): # if we exactly match in categories, allow us to factorize on codes rk = rk.codes @@ -1693,9 +1869,11 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif (is_extension_array_dtype(lk.dtype) and - is_extension_array_dtype(rk.dtype) and - lk.dtype == rk.dtype): + elif ( + is_extension_array_dtype(lk.dtype) + and is_extension_array_dtype(rk.dtype) + and lk.dtype == rk.dtype + 
): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() @@ -1705,8 +1883,9 @@ def _factorize_keys(lk, rk, sort=True): klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) rk = ensure_int64(com.values_from_object(rk)) - elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and - issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): + elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass( + rk.dtype.type, (np.timedelta64, np.datetime64) + ): # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) @@ -1765,12 +1944,12 @@ def _get_join_keys(llab, rlab, shape, sort): nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - lkey = stride * llab[0].astype('i8', subok=False, copy=False) - rkey = stride * rlab[0].astype('i8', subok=False, copy=False) + stride = np.prod(shape[1:nlev], dtype="i8") + lkey = stride * llab[0].astype("i8", subok=False, copy=False) + rkey = stride * rlab[0].astype("i8", subok=False, copy=False) for i in range(1, nlev): - with np.errstate(divide='ignore'): + with np.errstate(divide="ignore"): stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride @@ -1803,12 +1982,14 @@ def validate_operand(obj): return obj elif isinstance(obj, Series): if obj.name is None: - raise ValueError('Cannot merge a Series without a name') + raise ValueError("Cannot merge a Series without a name") else: return obj.to_frame() else: - raise TypeError('Can only merge Series or DataFrame objects, ' - 'a {obj} was passed'.format(obj=type(obj))) + raise TypeError( + "Can only merge Series or DataFrame objects, " + "a {obj} was passed".format(obj=type(obj)) + ) def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): @@ -1823,8 +2004,10 @@ def _items_overlap_with_suffix(left, lsuffix, right, rsuffix): return left, right if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{rename}'.format(rename=to_rename)) + raise ValueError( + "columns overlap but no suffix specified: " + "{rename}".format(rename=to_rename) + ) def renamer(x, suffix): """ @@ -1843,11 +2026,10 @@ def renamer(x, suffix): x : renamed column name """ if x in to_rename and suffix is not None: - return '{x}{suffix}'.format(x=x, suffix=suffix) + return "{x}{suffix}".format(x=x, suffix=suffix) return x lrenamer = partial(renamer, suffix=lsuffix) rrenamer = partial(renamer, suffix=rsuffix) - return (_transform_index(left, lrenamer), - _transform_index(right, rrenamer)) + return (_transform_index(left, lrenamer), _transform_index(right, rrenamer)) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 6374dd1b463f3..188f2edd96590 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -17,11 +17,20 @@ # Note: We need to make sure `frame` is imported before `pivot`, otherwise # _shared_docs['pivot_table'] will not yet exist. 
TODO: Fix this dependency -@Substitution('\ndata : DataFrame') -@Appender(_shared_docs['pivot_table'], indents=1) -def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', - fill_value=None, margins=False, dropna=True, - margins_name='All', observed=False): +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot_table"], indents=1) +def pivot_table( + data, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + observed=False, +): index = _convert_by(index) columns = _convert_by(columns) @@ -29,14 +38,20 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pieces = [] keys = [] for func in aggfunc: - table = pivot_table(data, values=values, index=index, - columns=columns, - fill_value=fill_value, aggfunc=func, - margins=margins, dropna=dropna, - margins_name=margins_name, - observed=observed) + table = pivot_table( + data, + values=values, + index=index, + columns=columns, + fill_value=fill_value, + aggfunc=func, + margins=margins, + dropna=dropna, + margins_name=margins_name, + observed=observed, + ) pieces.append(table) - keys.append(getattr(func, '__name__', func)) + keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) @@ -80,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): - agged = agged.dropna(how='all') + agged = agged.dropna(how="all") # gh-21133 # we want to down cast if @@ -88,8 +103,12 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: - if (v in data and is_integer_dtype(data[v]) and - v in agged and not is_integer_dtype(agged[v])): + if ( + v in data + and is_integer_dtype(data[v]) + and v in agged + and not is_integer_dtype(agged[v]) + ): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged @@ -97,7 +116,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. 
- index_names = agged.index.names[:len(index)] + index_names = agged.index.names[: len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] @@ -109,33 +128,47 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if not dropna: from pandas import MultiIndex + if table.index.nlevels > 1: - m = MultiIndex.from_arrays(cartesian_product(table.index.levels), - names=table.index.names) + m = MultiIndex.from_arrays( + cartesian_product(table.index.levels), names=table.index.names + ) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: - m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), - names=table.columns.names) + m = MultiIndex.from_arrays( + cartesian_product(table.columns.levels), names=table.columns.names + ) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast='infer') + table = table.fillna(value=fill_value, downcast="infer") if margins: if dropna: data = data[data.notna().all(axis=1)] - table = _add_margins(table, data, values, rows=index, - cols=columns, aggfunc=aggfunc, - observed=dropna, - margins_name=margins_name, fill_value=fill_value) + table = _add_margins( + table, + data, + values, + rows=index, + cols=columns, + aggfunc=aggfunc, + observed=dropna, + margins_name=margins_name, + fill_value=fill_value, + ) # discard the top level - if (values_passed and not values_multi and not table.empty and - (table.columns.nlevels > 1)): + if ( + values_passed + and not values_multi + and not table.empty + and (table.columns.nlevels > 1) + ): table = table[values[0]] if len(index) == 0 and len(columns) > 0: @@ -143,15 +176,24 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: - table = table.dropna(how='all', axis=1) + table = table.dropna(how="all", axis=1) return table -def _add_margins(table, data, values, rows, cols, aggfunc, - observed=None, margins_name='All', fill_value=None): +def _add_margins( + table, + data, + values, + rows, + cols, + aggfunc, + observed=None, + margins_name="All", + fill_value=None, +): if not isinstance(margins_name, str): - raise ValueError('margins_name argument must be a string') + raise ValueError("margins_name argument must be a string") msg = 'Conflicting name "{name}" in margins'.format(name=margins_name) for level in table.index.names: @@ -161,13 +203,13 @@ def _add_margins(table, data, values, rows, cols, aggfunc, grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) # could be passed a Series object with no 'columns' - if hasattr(table, 'columns'): + if hasattr(table, "columns"): for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) if len(rows) > 1: - key = (margins_name,) + ('',) * (len(rows) - 1) + key = (margins_name,) + ("",) * (len(rows) - 1) else: key = margins_name @@ -177,17 +219,24 @@ def _add_margins(table, data, values, rows, cols, aggfunc, return table.append(Series({key: grand_margin[margins_name]})) if values: - marginal_result_set = _generate_marginal_results(table, data, values, - rows, cols, aggfunc, - observed, - grand_margin, - margins_name) + marginal_result_set = _generate_marginal_results( + table, + data, + values, + rows, + cols, + aggfunc, + observed, + grand_margin, + margins_name, + ) if not 
isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, observed, margins_name) + table, data, rows, cols, aggfunc, observed, margins_name + ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set @@ -200,6 +249,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, row_margin[k] = grand_margin[k[0]] from pandas import DataFrame + margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names @@ -218,8 +268,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, return result -def _compute_grand_margin(data, values, aggfunc, - margins_name='All'): +def _compute_grand_margin(data, values, aggfunc, margins_name="All"): if values: grand_margin = {} @@ -241,26 +290,22 @@ def _compute_grand_margin(data, values, aggfunc, return {margins_name: aggfunc(data.index)} -def _generate_marginal_results(table, data, values, rows, cols, aggfunc, - observed, - grand_margin, - margins_name='All'): +def _generate_marginal_results( + table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All" +): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): - return (key, margins_name) + ('',) * (len(cols) - 1) + return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby( - rows, observed=observed).agg(aggfunc) + margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) cat_axis = 1 - for key, piece in table.groupby(level=0, - axis=cat_axis, - observed=observed): + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! 
@@ -270,9 +315,11 @@ def _all_key(key): except TypeError: # we cannot reshape, so coerce the axis - piece.set_axis(piece._get_axis( - cat_axis)._to_safe_for_reshape(), - axis=cat_axis, inplace=True) + piece.set_axis( + piece._get_axis(cat_axis)._to_safe_for_reshape(), + axis=cat_axis, + inplace=True, + ) piece[all_key] = margin[key] table_pieces.append(piece) @@ -280,9 +327,7 @@ def _all_key(key): else: margin = grand_margin cat_axis = 0 - for key, piece in table.groupby(level=0, - axis=cat_axis, - observed=observed): + for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) @@ -297,8 +342,7 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby( - cols, observed=observed).agg(aggfunc) + row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -311,8 +355,8 @@ def _all_key(key): def _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, - observed, margins_name='All'): + table, data, rows, cols, aggfunc, observed, margins_name="All" +): if len(cols) > 0: # need to "interleave" the margins margin_keys = [] @@ -320,20 +364,17 @@ def _generate_marginal_results_without_values( def _all_key(): if len(cols) == 1: return margins_name - return (margins_name, ) + ('', ) * (len(cols) - 1) + return (margins_name,) + ("",) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows, - observed=observed).apply(aggfunc) + margin = data[rows].groupby(rows, observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, - axis=0, - observed=observed).apply(aggfunc) + margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table @@ -354,17 +395,19 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (is_scalar(by) or - isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or - hasattr(by, '__call__')): + elif ( + is_scalar(by) + or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) + or hasattr(by, "__call__") + ): by = [by] else: by = list(by) return by -@Substitution('\ndata : DataFrame') -@Appender(_shared_docs['pivot'], indents=1) +@Substitution("\ndata : DataFrame") +@Appender(_shared_docs["pivot"], indents=1) def pivot(data, index=None, columns=None, values=None): if values is None: cols = [columns] if index is None else [index, columns] @@ -379,17 +422,26 @@ def pivot(data, index=None, columns=None, values=None): if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name - indexed = data._constructor(data[values].values, index=index, - columns=values) + indexed = data._constructor( + data[values].values, index=index, columns=values + ) else: - indexed = data._constructor_sliced(data[values].values, - index=index) + indexed = data._constructor_sliced(data[values].values, index=index) return indexed.unstack(columns) -def crosstab(index, columns, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False, margins_name='All', dropna=True, - normalize=False): +def crosstab( + index, + columns, + values=None, + rownames=None, + colnames=None, + aggfunc=None, + margins=False, + margins_name="All", + dropna=True, + normalize=False, +): """ Compute a simple cross 
tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an @@ -490,11 +542,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) - rownames = _get_names(index, rownames, prefix='row') - colnames = _get_names(columns, colnames, prefix='col') + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, - sort=False) + common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) data = {} data.update(zip(rownames, index)) @@ -507,30 +558,38 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, raise ValueError("values cannot be used without an aggfunc.") from pandas import DataFrame + df = DataFrame(data, index=common_idx) if values is None: - df['__dummy__'] = 0 - kwargs = {'aggfunc': len, 'fill_value': 0} + df["__dummy__"] = 0 + kwargs = {"aggfunc": len, "fill_value": 0} else: - df['__dummy__'] = values - kwargs = {'aggfunc': aggfunc} - - table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - margins=margins, margins_name=margins_name, - dropna=dropna, **kwargs) + df["__dummy__"] = values + kwargs = {"aggfunc": aggfunc} + + table = df.pivot_table( + "__dummy__", + index=rownames, + columns=colnames, + margins=margins, + margins_name=margins_name, + dropna=dropna, + **kwargs + ) # Post-process if normalize is not False: - table = _normalize(table, normalize=normalize, margins=margins, - margins_name=margins_name) + table = _normalize( + table, normalize=normalize, margins=margins, margins_name=margins_name + ) return table -def _normalize(table, normalize, margins, margins_name='All'): +def _normalize(table, normalize, margins, margins_name="All"): if not isinstance(normalize, (bool, str)): - axis_subs = {0: 'index', 1: 'columns'} + axis_subs = {0: "index", 1: "columns"} try: normalize = axis_subs[normalize] except KeyError: @@ -540,12 +599,12 @@ def _normalize(table, normalize, margins, margins_name='All'): # Actual Normalizations normalizers = { - 'all': lambda x: x / x.sum(axis=1).sum(axis=0), - 'columns': lambda x: x / x.sum(), - 'index': lambda x: x.div(x.sum(axis=1), axis=0) + "all": lambda x: x / x.sum(axis=1).sum(axis=0), + "columns": lambda x: x / x.sum(), + "index": lambda x: x.div(x.sum(axis=1), axis=0), } - normalizers[True] = normalizers['all'] + normalizers[True] = normalizers["all"] try: f = normalizers[normalize] @@ -568,12 +627,12 @@ def _normalize(table, normalize, margins, margins_name='All'): table = _normalize(table, normalize=normalize, margins=False) # Fix Margins - if normalize == 'columns': + if normalize == "columns": column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) - elif normalize == 'index': + elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) @@ -599,17 +658,17 @@ def _normalize(table, normalize, margins, margins_name='All'): return table -def _get_names(arrs, names, prefix='row'): +def _get_names(arrs, names, prefix="row"): if names is None: names = [] for i, arr in enumerate(arrs): if isinstance(arr, ABCSeries) and arr.name is not None: names.append(arr.name) else: - names.append('{prefix}_{i}'.format(prefix=prefix, i=i)) + 
names.append("{prefix}_{i}".format(prefix=prefix, i=i)) else: if len(names) != len(arrs): - raise AssertionError('arrays and names must have the same length') + raise AssertionError("arrays and names must have the same length") if not isinstance(names, list): names = list(names) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c59f9ffc48055..5d932d7ded9b8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -9,8 +9,14 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_extension_array_dtype, - is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion) + ensure_platform_int, + is_bool_dtype, + is_extension_array_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos @@ -21,8 +27,11 @@ from pandas.core.internals.arrays import extract_array from pandas.core.series import Series from pandas.core.sorting import ( - compress_group_index, decons_obs_group_ids, get_compressed_ids, - get_group_index) + compress_group_index, + decons_obs_group_ids, + get_compressed_ids, + get_group_index, +) class _Unstacker: @@ -76,8 +85,15 @@ class _Unstacker: unstacked : DataFrame """ - def __init__(self, values, index, level=-1, value_columns=None, - fill_value=None, constructor=None): + def __init__( + self, + values, + index, + level=-1, + value_columns=None, + fill_value=None, + constructor=None, + ): if values.ndim == 1: values = values[:, np.newaxis] @@ -90,7 +106,7 @@ def __init__(self, values, index, level=-1, value_columns=None, self.constructor = constructor if value_columns is None and values.shape[1] != 1: # pragma: no cover - raise ValueError('must pass column labels for multi-column data') + raise ValueError("must pass column labels for multi-column data") self.index = index.remove_unused_levels() @@ -110,16 +126,16 @@ def __init__(self, values, index, level=-1, value_columns=None, # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. # We want to check and raise an error before this happens - num_rows = np.max([index_level.size for index_level - in self.new_index_levels]) + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. 
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) if num_rows > 0 and num_columns > 0 and num_cells <= 0: - raise ValueError('Unstacked DataFrame is too big, ' - 'causing int32 overflow') + raise ValueError( + "Unstacked DataFrame is too big, " "causing int32 overflow" + ) self._make_sorted_values_labels() self._make_selectors() @@ -129,8 +145,8 @@ def _make_sorted_values_labels(self): codes = list(self.index.codes) levs = list(self.index.levels) - to_sort = codes[:v] + codes[v + 1:] + [codes[v]] - sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]] + to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] + sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]] comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) @@ -160,8 +176,7 @@ def _make_selectors(self): mask.put(selector, True) if mask.sum() < len(self.index): - raise ValueError('Index contains duplicate entries, ' - 'cannot reshape') + raise ValueError("Index contains duplicate entries, " "cannot reshape") self.group_index = comp_index self.mask = mask @@ -188,11 +203,11 @@ def get_new_values(self): # we can simply reshape if we don't have a mask if mask_all and len(values): - new_values = (self.sorted_values - .reshape(length, width, stride) - .swapaxes(1, 2) - .reshape(result_shape) - ) + new_values = ( + self.sorted_values.reshape(length, width, stride) + .swapaxes(1, 2) + .reshape(result_shape) + ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask @@ -214,25 +229,27 @@ def get_new_values(self): # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): - sorted_values = sorted_values.view('i8') - new_values = new_values.view('i8') - name = 'int64' + sorted_values = sorted_values.view("i8") + new_values = new_values.view("i8") + name = "int64" elif is_bool_dtype(values): - sorted_values = sorted_values.astype('object') - new_values = new_values.astype('object') - name = 'object' + sorted_values = sorted_values.astype("object") + new_values = new_values.astype("object") + name = "object" else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{name}".format(name=name)) - f(sorted_values, - mask.view('u1'), - stride, - length, - width, - new_values, - new_mask.view('u1')) + f( + sorted_values, + mask.view("u1"), + stride, + length, + width, + new_values, + new_mask.view("u1"), + ) # reconstruct dtype if needed if needs_i8_conversion(values): @@ -255,8 +272,7 @@ def get_new_columns(self): new_levels = self.value_columns.levels + (self.removed_level_full,) new_names = self.value_columns.names + (self.removed_name,) - new_codes = [lab.take(propagator) - for lab in self.value_columns.codes] + new_codes = [lab.take(propagator) for lab in self.value_columns.codes] else: new_levels = [self.value_columns, self.removed_level_full] new_names = [self.value_columns.name, self.removed_name] @@ -274,12 +290,12 @@ def get_new_columns(self): # The entire level is then just a repetition of the single chunk: new_codes.append(np.tile(repeater, width)) - return MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + return MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) def get_new_index(self): - result_codes = [lab.take(self.compressor) - for lab in self.sorted_labels[:-1]] + result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] # construct the new index if 
len(self.new_index_levels) == 1: @@ -288,8 +304,12 @@ def get_new_index(self): lev = lev.insert(len(lev), lev._na_value) return lev.take(lab) - return MultiIndex(levels=self.new_index_levels, codes=result_codes, - names=self.new_index_names, verify_integrity=False) + return MultiIndex( + levels=self.new_index_levels, + codes=result_codes, + names=self.new_index_names, + verify_integrity=False, + ) def _unstack_multiple(data, clocs, fill_value=None): @@ -315,23 +335,24 @@ def _unstack_multiple(data, clocs, fill_value=None): group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) - recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, - xnull=False) + recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if rlocs == []: # Everything is in clocs, so the dummy df has a regular index - dummy_index = Index(obs_ids, name='__placeholder__') + dummy_index = Index(obs_ids, name="__placeholder__") else: - dummy_index = MultiIndex(levels=rlevels + [obs_ids], - codes=rcodes + [comp_ids], - names=rnames + ['__placeholder__'], - verify_integrity=False) + dummy_index = MultiIndex( + levels=rlevels + [obs_ids], + codes=rcodes + [comp_ids], + names=rnames + ["__placeholder__"], + verify_integrity=False, + ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index - unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes @@ -348,7 +369,7 @@ def _unstack_multiple(data, clocs, fill_value=None): dummy = data.copy() dummy.index = dummy_index - unstacked = dummy.unstack('__placeholder__', fill_value=fill_value) + unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: @@ -360,8 +381,9 @@ def _unstack_multiple(data, clocs, fill_value=None): for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) - new_columns = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_columns = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) if isinstance(unstacked, Series): unstacked.index = new_columns @@ -388,24 +410,32 @@ def unstack(obj, level, fill_value=None): else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) - unstacker = _Unstacker(obj.values, obj.index, level=level, - fill_value=fill_value, - constructor=obj._constructor_expanddim) + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + fill_value=fill_value, + constructor=obj._constructor_expanddim, + ) return unstacker.get_result() def _unstack_frame(obj, level, fill_value=None): if obj._is_mixed_type: - unstacker = partial(_Unstacker, index=obj.index, - level=level, fill_value=fill_value) - blocks = obj._data.unstack(unstacker, - fill_value=fill_value) + unstacker = partial( + _Unstacker, index=obj.index, level=level, fill_value=fill_value + ) + blocks = obj._data.unstack(unstacker, fill_value=fill_value) return obj._constructor(blocks) else: - unstacker = _Unstacker(obj.values, obj.index, level=level, - value_columns=obj.columns, - fill_value=fill_value, - constructor=obj._constructor) + unstacker = _Unstacker( + obj.values, + obj.index, + level=level, + value_columns=obj.columns, + fill_value=fill_value, + constructor=obj._constructor, + ) 
return unstacker.get_result() @@ -441,18 +471,22 @@ def _unstack_extension_series(series, level, fill_value): dummy_arr = np.arange(len(series)) # fill_value=-1, since we will do a series.values.take later - result = _Unstacker(dummy_arr, series.index, - level=level, fill_value=-1).get_result() + result = _Unstacker( + dummy_arr, series.index, level=level, fill_value=-1 + ).get_result() out = [] values = extract_array(series, extract_numpy=False) for col, indices in result.iteritems(): - out.append(Series(values.take(indices.values, - allow_fill=True, - fill_value=fill_value), - name=col, index=result.index)) - return concat(out, axis='columns', copy=False, keys=result.columns) + out.append( + Series( + values.take(indices.values, allow_fill=True, fill_value=fill_value), + name=col, + index=result.index, + ) + ) + return concat(out, axis="columns", copy=False, keys=result.columns) def stack(frame, level=-1, dropna=True): @@ -464,6 +498,7 @@ def stack(frame, level=-1, dropna=True): ------- stacked : Series """ + def factorize(index): if index.is_unique: return index, np.arange(len(index)) @@ -487,15 +522,18 @@ def factorize(index): new_names = list(frame.index.names) new_names.append(frame.columns.name) - new_index = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) else: - levels, (ilab, clab) = zip(*map(factorize, (frame.index, - frame.columns))) + levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() - new_index = MultiIndex(levels=levels, codes=codes, - names=[frame.index.name, frame.columns.name], - verify_integrity=False) + new_index = MultiIndex( + levels=levels, + codes=codes, + names=[frame.index.name, frame.columns.name], + verify_integrity=False, + ) if frame._is_homogeneous_type: # For homogeneous EAs, frame.values will coerce to object. So @@ -505,9 +543,9 @@ def factorize(index): if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() - new_values = arr._concat_same_type([ - col._values for _, col in frame.iteritems() - ]) + new_values = arr._concat_same_type( + [col._values for _, col in frame.iteritems()] + ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA @@ -558,8 +596,10 @@ def stack_multiple(frame, level, dropna=True): level = updated_level else: - raise ValueError("level should contain all level names or all level " - "numbers, not a mixture of the two.") + raise ValueError( + "level should contain all level names or all level " + "numbers, not a mixture of the two." 
+ ) return result @@ -604,9 +644,16 @@ def _convert_level_number(level_num, columns): # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: - tuples = list(zip(*[lev.take(level_codes) for lev, level_codes - in zip(this.columns.levels[:-1], - this.columns.codes[:-1])])) + tuples = list( + zip( + *[ + lev.take(level_codes) + for lev, level_codes in zip( + this.columns.levels[:-1], this.columns.codes[:-1] + ) + ] + ) + ) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) @@ -641,8 +688,9 @@ def _convert_level_number(level_num, columns): chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: - if (frame._is_homogeneous_type and - is_extension_array_dtype(frame.dtypes.iloc[0])): + if frame._is_homogeneous_type and is_extension_array_dtype( + frame.dtypes.iloc[0] + ): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] @@ -682,21 +730,30 @@ def _convert_level_number(level_num, columns): new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) - new_index = MultiIndex(levels=new_levels, codes=new_codes, - names=new_names, verify_integrity=False) + new_index = MultiIndex( + levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False + ) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: - result = result.dropna(axis=0, how='all') + result = result.dropna(axis=0, how="all") return result -def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False, drop_first=False, dtype=None): +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, + dtype=None, +): """ Convert categorical variable into dummy/indicator variables. @@ -800,29 +857,31 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, from pandas.core.reshape.concat import concat from itertools import cycle - dtypes_to_encode = ['object', 'category'] + dtypes_to_encode = ["object", "category"] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: - data_to_encode = data.select_dtypes( - include=dtypes_to_encode) + data_to_encode = data.select_dtypes(include=dtypes_to_encode) else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): - len_msg = ("Length of '{name}' ({len_item}) did not match the " - "length of the columns being encoded ({len_enc}).") + len_msg = ( + "Length of '{name}' ({len_item}) did not match the " + "length of the columns being encoded ({len_enc})." + ) if is_list_like(item): if not len(item) == data_to_encode.shape[1]: - len_msg = len_msg.format(name=name, len_item=len(item), - len_enc=data_to_encode.shape[1]) + len_msg = len_msg.format( + name=name, len_item=len(item), len_enc=data_to_encode.shape[1] + ) raise ValueError(len_msg) - check_len(prefix, 'prefix') - check_len(prefix_sep, 'prefix_sep') + check_len(prefix, "prefix") + check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): prefix = cycle([prefix]) @@ -850,25 +909,43 @@ def check_len(item, name): # columns to prepend to result. 
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] - for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, - prefix_sep): + for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep): # col is (column_name, column), use just column data here - dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep, - dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first, dtype=dtype) + dummy = _get_dummies_1d( + col[1], + prefix=pre, + prefix_sep=sep, + dummy_na=dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: - result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, - drop_first=drop_first, - dtype=dtype) + result = _get_dummies_1d( + data, + prefix, + prefix_sep, + dummy_na, + sparse=sparse, + drop_first=drop_first, + dtype=dtype, + ) return result -def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False, dtype=None): +def _get_dummies_1d( + data, + prefix, + prefix_sep="_", + dummy_na=False, + sparse=False, + drop_first=False, + dtype=None, +): from pandas.core.reshape.concat import concat + # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) @@ -907,13 +984,10 @@ def get_empty_frame(data): # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): - fstr = '{prefix}{prefix_sep}{level}' - return fstr.format(prefix=prefix, - prefix_sep=prefix_sep, - level=level) + fstr = "{prefix}{prefix_sep}{level}" + return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) - dummy_cols = [_make_col_name(prefix, prefix_sep, level) - for level in levels] + dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] if isinstance(data, Series): index = data.index @@ -945,10 +1019,12 @@ def _make_col_name(prefix, prefix_sep, level): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=dtype), - sparse_index=IntIndex(N, ixs), - fill_value=fill_value, - dtype=dtype) + sarr = SparseArray( + np.ones(len(ixs), dtype=dtype), + sparse_index=IntIndex(N, ixs), + fill_value=fill_value, + dtype=dtype, + ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) @@ -968,7 +1044,7 @@ def _make_col_name(prefix, prefix_sep, level): return DataFrame(dummy_mat, index=index, columns=dummy_cols) -def make_axis_dummies(frame, axis='minor', transform=None): +def make_axis_dummies(frame, axis="minor", transform=None): """ Construct 1-0 dummy variables corresponding to designated axis labels @@ -989,7 +1065,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): dummies : DataFrame Column names taken from chosen axis """ - numbers = {'major': 0, 'minor': 1} + numbers = {"major": 0, "minor": 1} num = numbers.get(axis, axis) items = frame.index.levels[num] diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 96124331e43ef..0446f53345671 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -8,20 +8,43 @@ from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( - _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, - is_scalar, is_timedelta64_dtype) + _NS_DTYPE, + ensure_int64, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + 
is_datetime_or_timedelta_dtype, + is_integer, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.missing import isna from pandas import ( - Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp, - to_datetime, to_timedelta) + Categorical, + Index, + Interval, + IntervalIndex, + Series, + Timedelta, + Timestamp, + to_datetime, + to_timedelta, +) import pandas.core.algorithms as algos import pandas.core.nanops as nanops -def cut(x, bins, right=True, labels=None, retbins=False, precision=3, - include_lowest=False, duplicates='raise'): +def cut( + x, + bins, + right=True, + labels=None, + retbins=False, + precision=3, + include_lowest=False, + duplicates="raise", +): """ Bin values into discrete intervals. @@ -199,18 +222,19 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, sz = x.size if sz == 0: - raise ValueError('Cannot cut empty array') + raise ValueError("Cannot cut empty array") rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if np.isinf(mn) or np.isinf(mx): # GH 24314 - raise ValueError('cannot specify integer `bins` when input data ' - 'contains infinity') + raise ValueError( + "cannot specify integer `bins` when input data " "contains infinity" + ) elif mn == mx: # adjust end points before binning - mn -= .001 * abs(mn) if mn != 0 else .001 - mx += .001 * abs(mx) if mx != 0 else .001 + mn -= 0.001 * abs(mn) if mn != 0 else 0.001 + mx += 0.001 * abs(mx) if mx != 0 else 0.001 bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) @@ -222,7 +246,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, elif isinstance(bins, IntervalIndex): if bins.is_overlapping: - raise ValueError('Overlapping IntervalIndex is not accepted.') + raise ValueError("Overlapping IntervalIndex is not accepted.") else: if is_datetime64tz_dtype(bins): @@ -232,20 +256,26 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, bins = _convert_bin_to_numeric_type(bins, dtype) # GH 26045: cast to float64 to avoid an overflow - if (np.diff(bins.astype('float64')) < 0).any(): - raise ValueError('bins must increase monotonically.') - - fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, - precision=precision, - include_lowest=include_lowest, - dtype=dtype, - duplicates=duplicates) - - return _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype) - - -def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): + if (np.diff(bins.astype("float64")) < 0).any(): + raise ValueError("bins must increase monotonically.") + + fac, bins = _bins_to_cuts( + x, + bins, + right=right, + labels=labels, + precision=precision, + include_lowest=include_lowest, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut( + fac, bins, retbins, x_is_series, series_index, name, dtype + ) + + +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. 
For example @@ -309,21 +339,37 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): else: quantiles = q bins = algos.quantile(x, quantiles) - fac, bins = _bins_to_cuts(x, bins, labels=labels, - precision=precision, include_lowest=True, - dtype=dtype, duplicates=duplicates) - - return _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype) - - -def _bins_to_cuts(x, bins, right=True, labels=None, - precision=3, include_lowest=False, - dtype=None, duplicates='raise'): - - if duplicates not in ['raise', 'drop']: - raise ValueError("invalid value for 'duplicates' parameter, " - "valid options are: raise, drop") + fac, bins = _bins_to_cuts( + x, + bins, + labels=labels, + precision=precision, + include_lowest=True, + dtype=dtype, + duplicates=duplicates, + ) + + return _postprocess_for_cut( + fac, bins, retbins, x_is_series, series_index, name, dtype + ) + + +def _bins_to_cuts( + x, + bins, + right=True, + labels=None, + precision=3, + include_lowest=False, + dtype=None, + duplicates="raise", +): + + if duplicates not in ["raise", "drop"]: + raise ValueError( + "invalid value for 'duplicates' parameter, " + "valid options are: raise, drop" + ) if isinstance(bins, IntervalIndex): # we have a fast-path here @@ -334,14 +380,16 @@ def _bins_to_cuts(x, bins, right=True, labels=None, unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: - if duplicates == 'raise': - raise ValueError("Bin edges must be unique: {bins!r}.\nYou " - "can drop duplicate edges by setting " - "the 'duplicates' kwarg".format(bins=bins)) + if duplicates == "raise": + raise ValueError( + "Bin edges must be unique: {bins!r}.\nYou " + "can drop duplicate edges by setting " + "the 'duplicates' kwarg".format(bins=bins) + ) else: bins = unique_bins - side = 'left' if right else 'right' + side = "left" if right else "right" ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: @@ -352,13 +400,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: - labels = _format_labels(bins, precision, right=right, - include_lowest=include_lowest, - dtype=dtype) + labels = _format_labels( + bins, precision, right=right, include_lowest=include_lowest, dtype=dtype + ) else: if len(labels) != len(bins) - 1: - raise ValueError('Bin labels must be one fewer than ' - 'the number of bin edges') + raise ValueError( + "Bin labels must be one fewer than " "the number of bin edges" + ) if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) @@ -386,10 +435,10 @@ def _coerce_to_type(x): dtype = x.dtype elif is_datetime64_dtype(x): x = to_datetime(x) - dtype = np.dtype('datetime64[ns]') + dtype = np.dtype("datetime64[ns]") elif is_timedelta64_dtype(x): x = to_timedelta(x) - dtype = np.dtype('timedelta64[ns]') + dtype = np.dtype("timedelta64[ns]") if dtype is not None: # GH 19768: force NaT to NaN during integer conversion @@ -414,12 +463,12 @@ def _convert_bin_to_numeric_type(bins, dtype): """ bins_dtype = infer_dtype(bins, skipna=False) if is_timedelta64_dtype(dtype): - if bins_dtype in ['timedelta', 'timedelta64']: + if bins_dtype in ["timedelta", "timedelta64"]: bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): - if bins_dtype in ['datetime', 'datetime64']: + if bins_dtype in ["datetime", "datetime64"]: bins = to_datetime(bins).view(np.int64) else: raise 
ValueError("bins must be of datetime64 dtype") @@ -443,28 +492,26 @@ def _convert_bin_to_datelike_type(bins, dtype): datelike """ if is_datetime64tz_dtype(dtype): - bins = to_datetime(bins.astype(np.int64), - utc=True).tz_convert(dtype.tz) + bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz) elif is_datetime_or_timedelta_dtype(dtype): bins = Index(bins.astype(np.int64), dtype=dtype) return bins -def _format_labels(bins, precision, right=True, - include_lowest=False, dtype=None): +def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ - closed = 'right' if right else 'left' + closed = "right" if right else "left" if is_datetime64tz_dtype(dtype): formatter = partial(Timestamp, tz=dtype.tz) - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") elif is_datetime64_dtype(dtype): formatter = Timestamp - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") elif is_timedelta64_dtype(dtype): formatter = Timedelta - adjust = lambda x: x - Timedelta('1ns') + adjust = lambda x: x - Timedelta("1ns") else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) @@ -478,7 +525,7 @@ def _format_labels(bins, precision, right=True, # account that we are all right closed v = adjust(labels[0].left) - i = IntervalIndex([Interval(v, labels[0].right, closed='right')]) + i = IntervalIndex([Interval(v, labels[0].right, closed="right")]) labels = i.append(labels[1:]) return labels @@ -500,7 +547,7 @@ def _preprocess_for_cut(x): # Check that the passed array is a Pandas or Numpy object # We don't want to strip away a Pandas data-type here (e.g. datetimetz) - ndim = getattr(x, 'ndim', None) + ndim = getattr(x, "ndim", None) if ndim is None: x = np.asarray(x) if x.ndim != 1: @@ -509,8 +556,7 @@ def _preprocess_for_cut(x): return x_is_series, series_index, name, x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, - series_index, name, dtype): +def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dtype): """ handles post processing for the cut method where we combine the index information if the originally passed diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 9975fe65ac0fe..044e058904dce 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -51,6 +51,9 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]), - np.product(a[i])) - for i, x in enumerate(X)] + return [ + np.tile( + np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i]) + ) + for i, x in enumerate(X) + ] diff --git a/pandas/core/series.py b/pandas/core/series.py index 13966d4551b54..b3a7f38aef8ef 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,15 +19,38 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical, - is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like, - is_extension_array_dtype, is_extension_type, is_hashable, is_integer, - is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype) + _is_unorderable_exception, + ensure_platform_int, + is_bool, + is_categorical, + is_categorical_dtype, + is_datetime64_dtype, + is_datetimelike, + is_dict_like, + 
is_extension_array_dtype, + is_extension_type, + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_scalar, + is_string_like, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries) + ABCDataFrame, + ABCDatetimeArray, + ABCDatetimeIndex, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) from pandas.core.dtypes.missing import ( - isna, na_value_for_dtype, notna, remove_na_arraylike) + isna, + na_value_for_dtype, + notna, + remove_na_arraylike, +) import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops @@ -37,7 +60,12 @@ from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.index import ( - Float64Index, Index, InvalidIndexError, MultiIndex, ensure_index) + Float64Index, + Index, + InvalidIndexError, + MultiIndex, + ensure_index, +) from pandas.core.indexes.accessors import CombinedDatetimelikeProperties import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex @@ -52,17 +80,24 @@ import pandas.io.formats.format as fmt import pandas.plotting -__all__ = ['Series'] +__all__ = ["Series"] _shared_doc_kwargs = dict( - axes='index', klass='Series', axes_single_arg="{0 or 'index'}", + axes="index", + klass="Series", + axes_single_arg="{0 or 'index'}", axis="""axis : {0 or 'index'} Parameter needed for compatibility with DataFrame.""", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - unique='np.ndarray', duplicated='Series', - optional_by='', optional_mapper='', optional_labels='', optional_axis='', - versionadded_to_excel='\n .. versionadded:: 0.20.0\n') + unique="np.ndarray", + duplicated="Series", + optional_by="", + optional_mapper="", + optional_labels="", + optional_axis="", + versionadded_to_excel="\n .. versionadded:: 0.20.0\n", +) # see gh-16971 @@ -74,8 +109,11 @@ def remove_na(arr): Use s[s.notnull()] instead. """ - warnings.warn("remove_na is deprecated and is a private " - "function. Do not use.", FutureWarning, stacklevel=2) + warnings.warn( + "remove_na is deprecated and is a private " "function. Do not use.", + FutureWarning, + stacklevel=2, + ) return remove_na_arraylike(arr) @@ -87,12 +125,12 @@ def _coerce_method(converter): def wrapper(self): if len(self) == 1: return converter(self.iloc[0]) - raise TypeError("cannot convert the series to " - "{0}".format(str(converter))) + raise TypeError("cannot convert the series to " "{0}".format(str(converter))) wrapper.__name__ = "__{name}__".format(name=converter.__name__) return wrapper + # ---------------------------------------------------------------------- # Series class @@ -133,23 +171,26 @@ class Series(base.IndexOpsMixin, generic.NDFrame): copy : bool, default False Copy input data. 
""" - _metadata = ['name'] - _accessors = {'dt', 'cat', 'str', 'sparse'} + + _metadata = ["name"] + _accessors = {"dt", "cat", "str", "sparse"} # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = generic.NDFrame._deprecations | frozenset( - ['asobject', 'reshape', 'get_value', 'set_value', - 'valid', 'tolist']) + ["asobject", "reshape", "get_value", "set_value", "valid", "tolist"] + ) # Override cache_readonly bc Series is mutable - hasnans = property(base.IndexOpsMixin.hasnans.func, - doc=base.IndexOpsMixin.hasnans.__doc__) + hasnans = property( + base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ + ) _data = None # type: SingleBlockManager # ---------------------------------------------------------------------- # Constructors - def __init__(self, data=None, index=None, dtype=None, name=None, - copy=False, fastpath=False): + def __init__( + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + ): # we are called internally, so short-circuit if fastpath: @@ -172,15 +213,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if dtype is not None: # GH 26336: explicitly handle 'category' to avoid warning # TODO: Remove after CategoricalDtype defaults to ordered=False - if (isinstance(dtype, str) and dtype == 'category' and - is_categorical(data)): + if ( + isinstance(dtype, str) + and dtype == "category" + and is_categorical(data) + ): dtype = data.dtype dtype = self._validate_dtype(dtype) if isinstance(data, MultiIndex): - raise NotImplementedError("initializing a Series from a " - "MultiIndex is not supported") + raise NotImplementedError( + "initializing a Series from a " "MultiIndex is not supported" + ) elif isinstance(data, Index): if name is None: name = data.name @@ -191,8 +236,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: # need to copy to avoid aliasing issues data = data._values.copy() - if (isinstance(data, ABCDatetimeIndex) and - data.tz is not None): + if isinstance(data, ABCDatetimeIndex) and data.tz is not None: # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies data = data._values.copy(deep=True) @@ -218,16 +262,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None, elif not data.index.equals(index) or copy: # GH#19275 SingleBlockManager input should only be called # internally - raise AssertionError('Cannot pass both SingleBlockManager ' - '`data` argument and a different ' - '`index` argument. `copy` must ' - 'be False.') + raise AssertionError( + "Cannot pass both SingleBlockManager " + "`data` argument and a different " + "`index` argument. `copy` must " + "be False." 
+ ) elif is_extension_array_dtype(data): pass elif isinstance(data, (set, frozenset)): - raise TypeError("{0!r} type is unordered" - "".format(data.__class__.__name__)) + raise TypeError( + "{0!r} type is unordered" "".format(data.__class__.__name__) + ) elif isinstance(data, ABCSparseArray): # handle sparse passed here (and force conversion) data = data.to_dense() @@ -245,22 +292,20 @@ def __init__(self, data=None, index=None, dtype=None, name=None, try: if len(index) != len(data): raise ValueError( - 'Length of passed values is {val}, ' - 'index implies {ind}' - .format(val=len(data), ind=len(index))) + "Length of passed values is {val}, " + "index implies {ind}".format(val=len(data), ind=len(index)) + ) except TypeError: pass # create/copy the manager if isinstance(data, SingleBlockManager): if dtype is not None: - data = data.astype(dtype=dtype, errors='ignore', - copy=copy) + data = data.astype(dtype=dtype, errors="ignore", copy=copy) elif copy: data = data.copy() else: - data = sanitize_array(data, index, dtype, copy, - raise_cast_failure=True) + data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True) data = SingleBlockManager(data, index, fastpath=True) @@ -317,8 +362,9 @@ def _init_dict(self, data, index=None, dtype=None): return s._data, s.index @classmethod - def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, - fastpath=False): + def from_array( + cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False + ): """ Construct Series from array. @@ -330,14 +376,20 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, Series Constructed Series. """ - warnings.warn("'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.Series(..) " - "constructor instead.", FutureWarning, stacklevel=2) + warnings.warn( + "'from_array' is deprecated and will be removed in a " + "future version. Please use the pd.Series(..) 
" + "constructor instead.", + FutureWarning, + stacklevel=2, + ) if isinstance(arr, ABCSparseArray): from pandas.core.sparse.series import SparseSeries + cls = SparseSeries - return cls(arr, index=index, name=name, dtype=dtype, - copy=copy, fastpath=fastpath) + return cls( + arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath + ) # ---------------------------------------------------------------------- @@ -348,6 +400,7 @@ def _constructor(self): @property def _constructor_expanddim(self): from pandas.core.frame import DataFrame + return DataFrame # types @@ -367,8 +420,7 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - if not isinstance(labels, - (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: labels = DatetimeIndex(labels) # need to set here because we changed the index @@ -381,15 +433,15 @@ def _set_axis(self, axis, labels, fastpath=False): self._set_subtyp(is_all_dates) - object.__setattr__(self, '_index', labels) + object.__setattr__(self, "_index", labels) if not fastpath: self._data.set_axis(axis, labels) def _set_subtyp(self, is_all_dates): if is_all_dates: - object.__setattr__(self, '_subtyp', 'time_series') + object.__setattr__(self, "_subtyp", "time_series") else: - object.__setattr__(self, '_subtyp', 'series') + object.__setattr__(self, "_subtyp", "series") def _update_inplace(self, result, **kwargs): # we want to call the generic version and not the IndexOpsMixin @@ -405,8 +457,8 @@ def name(self): @name.setter def name(self, value): if value is not None and not is_hashable(value): - raise TypeError('Series.name must be a hashable type') - object.__setattr__(self, '_name', value) + raise TypeError("Series.name must be a hashable type") + object.__setattr__(self, "_name", value) # ndarray compatibility @property @@ -431,10 +483,13 @@ def ftype(self): .. deprecated:: 0.25.0 Use :func:`dtype` instead. """ - warnings.warn("Series.ftype is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "Series.ftype is deprecated and will " + "be removed in a future version. " + "Use Series.dtype instead.", + FutureWarning, + stacklevel=2, + ) return self._data.ftype @@ -446,10 +501,13 @@ def ftypes(self): .. deprecated:: 0.25.0 Use :func:`dtypes` instead. """ - warnings.warn("Series.ftypes is deprecated and will " - "be removed in a future version. " - "Use Series.dtype instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "Series.ftypes is deprecated and will " + "be removed in a future version. " + "Use Series.dtype instead.", + FutureWarning, + stacklevel=2, + ) return self._data.ftype @@ -524,7 +582,9 @@ def get_values(self): warnings.warn( "The 'get_values' method is deprecated and will be removed in a " "future version. Use '.to_numpy()' or '.array' instead.", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) return self._internal_get_values() def _internal_get_values(self): @@ -541,12 +601,15 @@ def asobject(self): *this is an internal non-public method* """ - warnings.warn("'asobject' is deprecated. Use 'astype(object)'" - " instead", FutureWarning, stacklevel=2) + warnings.warn( + "'asobject' is deprecated. 
Use 'astype(object)'" " instead", + FutureWarning, + stacklevel=2, + ) return self.astype(object).values # ops - def ravel(self, order='C'): + def ravel(self, order="C"): """ Return the flattened underlying data as an ndarray. @@ -576,9 +639,11 @@ def compress(self, condition, *args, **kwargs): -------- numpy.ndarray.compress """ - msg = ("Series.compress(condition) is deprecated. " - "Use 'Series[condition]' or " - "'np.asarray(series).compress(condition)' instead.") + msg = ( + "Series.compress(condition) is deprecated. " + "Use 'Series[condition]' or " + "'np.asarray(series).compress(condition)' instead." + ) warnings.warn(msg, FutureWarning, stacklevel=2) nv.validate_compress(args, kwargs) return self[condition] @@ -624,9 +689,11 @@ def nonzero(self): d 4 dtype: int64 """ - msg = ("Series.nonzero() is deprecated " - "and will be removed in a future version." - "Use Series.to_numpy().nonzero() instead") + msg = ( + "Series.nonzero() is deprecated " + "and will be removed in a future version." + "Use Series.to_numpy().nonzero() instead" + ) warnings.warn(msg, FutureWarning, stacklevel=2) return self._values.nonzero() @@ -640,8 +707,11 @@ def put(self, *args, **kwargs): -------- numpy.ndarray.put """ - warnings.warn('`put` has been deprecated and will be removed in a' - 'future version.', FutureWarning, stacklevel=2) + warnings.warn( + "`put` has been deprecated and will be removed in a" "future version.", + FutureWarning, + stacklevel=2, + ) self._values.put(*args, **kwargs) def __len__(self): @@ -716,27 +786,26 @@ def view(self, dtype=None): 4 2 dtype: int8 """ - return self._constructor(self._values.view(dtype), - index=self.index).__finalize__(self) + return self._constructor( + self._values.view(dtype), index=self.index + ).__finalize__(self) # ---------------------------------------------------------------------- # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) def __array_ufunc__( - self, - ufunc: Callable, - method: str, - *inputs: Any, - **kwargs: Any + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any ): # TODO: handle DataFrame from pandas.core.internals.construction import extract_array + cls = type(self) # for binary ops, use our custom dunder methods result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs) + self, ufunc, method, *inputs, **kwargs + ) if result is not NotImplemented: return result @@ -745,19 +814,19 @@ def __array_ufunc__( for item in inputs: higher_priority = ( - hasattr(item, '__array_priority__') and - item.__array_priority__ > self.__array_priority__ + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ ) has_array_ufunc = ( - hasattr(item, '__array_ufunc__') and - type(item).__array_ufunc__ not in no_defer and - not isinstance(item, self._HANDLED_TYPES) + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) ) if higher_priority or has_array_ufunc: return NotImplemented # align all the inputs. 
- names = [getattr(x, 'name') for x in inputs if hasattr(x, 'name')] + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] types = tuple(type(x) for x in inputs) # TODO: dataframe alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] @@ -770,8 +839,10 @@ def __array_ufunc__( index = alignable[0].index for s in alignable[1:]: index |= s.index - inputs = tuple(x.reindex(index) if issubclass(t, Series) else x - for x, t in zip(inputs, types)) + inputs = tuple( + x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types) + ) else: index = self.index @@ -788,7 +859,7 @@ def construct_return(result): return result elif result.ndim > 1: # e.g. np.subtract.outer - if method == 'outer': + if method == "outer": msg = ( "outer method for ufunc {} is not implemented on " "pandas objects. Returning an ndarray, but in the " @@ -796,18 +867,14 @@ def construct_return(result): "Consider explicitly converting the Series " "to an array with '.array' first." ) - warnings.warn(msg.format(ufunc), FutureWarning, - stacklevel=3) + warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3) return result - return self._constructor(result, - index=index, - name=name, - copy=False) + return self._constructor(result, index=index, name=name, copy=False) if type(result) is tuple: # multiple return values return tuple(construct_return(x) for x in result) - elif method == 'at': + elif method == "at": # no return value return None else: @@ -860,8 +927,11 @@ def __array__(self, dtype=None): array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - if (dtype is None and isinstance(self.array, ABCDatetimeArray) - and getattr(self.dtype, 'tz', None)): + if ( + dtype is None + and isinstance(self.array, ABCDatetimeArray) + and getattr(self.dtype, "tz", None) + ): msg = ( "Converting timezone-aware DatetimeArray to timezone-naive " "ndarray with 'datetime64[ns]' dtype. In the future, this " @@ -871,7 +941,7 @@ def __array__(self, dtype=None): "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." ) warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = 'M8[ns]' + dtype = "M8[ns]" return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- @@ -884,8 +954,11 @@ def real(self): .. deprecated 0.25.0 """ - warnings.warn("`real` has be deprecated and will be removed in a " - "future verison", FutureWarning, stacklevel=2) + warnings.warn( + "`real` has be deprecated and will be removed in a " "future verison", + FutureWarning, + stacklevel=2, + ) return self.values.real @real.setter @@ -899,8 +972,11 @@ def imag(self): .. 
deprecated 0.25.0 """ - warnings.warn("`imag` has be deprecated and will be removed in a " - "future verison", FutureWarning, stacklevel=2) + warnings.warn( + "`imag` has be deprecated and will be removed in a " "future verison", + FutureWarning, + stacklevel=2, + ) return self.values.imag @imag.setter @@ -916,8 +992,8 @@ def imag(self, v): def _unpickle_series_compat(self, state): if isinstance(state, dict): - self._data = state['_data'] - self.name = state['name'] + self._data = state["_data"] + self.name = state["name"] self.index = self._data.index elif isinstance(state, tuple): @@ -975,7 +1051,7 @@ def _ixs(self, i, axis=0): raise except Exception: if isinstance(i, slice): - indexer = self.index._convert_slice_indexer(i, kind='iloc') + indexer = self.index._convert_slice_indexer(i, kind="iloc") return self._get_values(indexer) else: label = self.index[i] @@ -989,8 +1065,7 @@ def _is_mixed_type(self): return False def _slice(self, slobj, axis=0, kind=None): - slobj = self.index._convert_slice_indexer(slobj, - kind=kind or 'getitem') + slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") return self._get_values(slobj) def __getitem__(self, key): @@ -1006,8 +1081,8 @@ def __getitem__(self, key): try: if not is_scalar(self.index.get_loc(key)): result = self._constructor( - result, index=[key] * len(result), - dtype=self.dtype).__finalize__(self) + result, index=[key] * len(result), dtype=self.dtype + ).__finalize__(self) except KeyError: pass return result @@ -1024,8 +1099,7 @@ def __getitem__(self, key): else: # we can try to coerce the indexer (or this will raise) - new_key = self.index._convert_scalar_indexer(key, - kind='getitem') + new_key = self.index._convert_scalar_indexer(key, kind="getitem") if type(new_key) != type(key): return self.__getitem__(new_key) raise @@ -1044,11 +1118,13 @@ def __getitem__(self, key): def _get_with(self, key): # other: fancy integer or otherwise if isinstance(key, slice): - indexer = self.index._convert_slice_indexer(key, kind='getitem') + indexer = self.index._convert_slice_indexer(key, kind="getitem") return self._get_values(indexer) elif isinstance(key, ABCDataFrame): - raise TypeError('Indexing a Series with DataFrame is not ' - 'supported, use the appropriate DataFrame column') + raise TypeError( + "Indexing a Series with DataFrame is not " + "supported, use the appropriate DataFrame column" + ) elif isinstance(key, tuple): try: return self._get_values_tuple(key) @@ -1068,12 +1144,12 @@ def _get_with(self, key): else: key_type = lib.infer_dtype(key, skipna=False) - if key_type == 'integer': + if key_type == "integer": if self.index.is_integer() or self.index.is_floating(): return self.loc[key] else: return self._get_values(key) - elif key_type == 'boolean': + elif key_type == "boolean": return self._get_values(key) try: @@ -1096,17 +1172,19 @@ def _get_values_tuple(self, key): return self._get_values(key) if not isinstance(self.index, MultiIndex): - raise ValueError('Can only tuple-index with a MultiIndex') + raise ValueError("Can only tuple-index with a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) - return self._constructor(self._values[indexer], - index=new_index).__finalize__(self) + return self._constructor(self._values[indexer], index=new_index).__finalize__( + self + ) def _get_values(self, indexer): try: - return self._constructor(self._data.get_slice(indexer), - fastpath=True).__finalize__(self) + return self._constructor( + 
self._data.get_slice(indexer), fastpath=True + ).__finalize__(self) except Exception: return self._values[indexer] @@ -1121,8 +1199,7 @@ def setitem(key, value): raise except (KeyError, ValueError): values = self._values - if (is_integer(key) and - not self.index.inferred_type == 'integer'): + if is_integer(key) and not self.index.inferred_type == "integer": values[key] = value return @@ -1137,8 +1214,7 @@ def setitem(key, value): value = iNaT try: - self.index._engine.set_value(self._values, key, - value) + self.index._engine.set_value(self._values, key, value) return except TypeError: pass @@ -1147,8 +1223,7 @@ def setitem(key, value): return except TypeError as e: - if (isinstance(key, tuple) and - not isinstance(self.index, MultiIndex)): + if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): raise ValueError("Can only tuple-index with a MultiIndex") # python 3 type errors should be raised @@ -1183,7 +1258,7 @@ def _set_with_engine(self, key, value): def _set_with(self, key, value): # other: fancy integer or otherwise if isinstance(key, slice): - indexer = self.index._convert_slice_indexer(key, kind='getitem') + indexer = self.index._convert_slice_indexer(key, kind="getitem") return self._set_values(indexer, value) else: if isinstance(key, tuple): @@ -1205,12 +1280,12 @@ def _set_with(self, key, value): else: key_type = lib.infer_dtype(key, skipna=False) - if key_type == 'integer': - if self.index.inferred_type == 'integer': + if key_type == "integer": + if self.index.inferred_type == "integer": self._set_labels(key, value) else: return self._set_values(key, value) - elif key_type == 'boolean': + elif key_type == "boolean": self._set_values(key.astype(np.bool_), value) else: self._set_labels(key, value) @@ -1223,7 +1298,7 @@ def _set_labels(self, key, value): indexer = self.index.get_indexer(key) mask = indexer == -1 if mask.any(): - raise ValueError('%s not contained in the index' % str(key[mask])) + raise ValueError("%s not contained in the index" % str(key[mask])) self._set_values(indexer, value) def _set_values(self, key, value): @@ -1287,8 +1362,7 @@ def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) def get_value(self, label, takeable=False): """ @@ -1306,16 +1380,20 @@ def get_value(self, label, takeable=False): ------- scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(label, takeable=takeable) def _get_value(self, label, takeable=False): if takeable is True: return com.maybe_box_datetimelike(self._values[label]) return self.index.get_value(self._values, label) + _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): @@ -1342,10 +1420,13 @@ def set_value(self, label, value, takeable=False): If label is contained, will be reference to calling Series, otherwise a new object. """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): @@ -1360,6 +1441,7 @@ def _set_value(self, label, value, takeable=False): self.loc[label] = value return self + _set_value.__doc__ = set_value.__doc__ def reset_index(self, level=None, drop=False, name=None, inplace=False): @@ -1470,7 +1552,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): 2 baz one 2 3 baz two 3 """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if drop: new_index = ibase.default_index(len(self)) if level is not None: @@ -1485,11 +1567,13 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): # set name if it was passed, otherwise, keep the previous name self.name = name or self.name else: - return self._constructor(self._values.copy(), - index=new_index).__finalize__(self) + return self._constructor( + self._values.copy(), index=new_index + ).__finalize__(self) elif inplace: - raise TypeError('Cannot reset_index inplace on a Series ' - 'to create a DataFrame') + raise TypeError( + "Cannot reset_index inplace on a Series " "to create a DataFrame" + ) else: df = self.to_frame(name) return df.reset_index(level=level, drop=drop) @@ -1503,22 +1587,43 @@ def __repr__(self): """ buf = StringIO("") width, height = get_terminal_size() - max_rows = (height if get_option("display.max_rows") == 0 else - get_option("display.max_rows")) - min_rows = (height if get_option("display.max_rows") == 0 else - get_option("display.min_rows")) + max_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.max_rows") + ) + min_rows = ( + height + if get_option("display.max_rows") == 0 + else get_option("display.min_rows") + ) show_dimensions = get_option("display.show_dimensions") - self.to_string(buf=buf, name=self.name, dtype=self.dtype, - min_rows=min_rows, max_rows=max_rows, - length=show_dimensions) + self.to_string( + buf=buf, + name=self.name, + dtype=self.dtype, + min_rows=min_rows, + max_rows=max_rows, + length=show_dimensions, + ) result = buf.getvalue() return result - def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, - index=True, length=False, dtype=False, name=False, - max_rows=None, min_rows=None): + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None, + min_rows=None, + ): """ Render a string representation of the Series. @@ -1554,19 +1659,27 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, String representation of Series if ``buf=None``, otherwise None. 
""" - formatter = fmt.SeriesFormatter(self, name=name, length=length, - header=header, index=index, - dtype=dtype, na_rep=na_rep, - float_format=float_format, - min_rows=min_rows, - max_rows=max_rows) + formatter = fmt.SeriesFormatter( + self, + name=name, + length=length, + header=header, + index=index, + dtype=dtype, + na_rep=na_rep, + float_format=float_format, + min_rows=min_rows, + max_rows=max_rows, + ) result = formatter.to_string() # catch contract violations if not isinstance(result, str): - raise AssertionError("result must be of type unicode, type" - " of result is {0!r}" - "".format(result.__class__.__name__)) + raise AssertionError( + "result must be of type unicode, type" + " of result is {0!r}" + "".format(result.__class__.__name__) + ) if buf is None: return result @@ -1574,7 +1687,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, try: buf.write(result) except AttributeError: - with open(buf, 'w') as f: + with open(buf, "w") as f: f.write(result) # ---------------------------------------------------------------------- @@ -1691,7 +1804,7 @@ def to_frame(self, name=None): return df - def to_sparse(self, kind='block', fill_value=None): + def to_sparse(self, kind="block", fill_value=None): """ Convert Series to SparseSeries. @@ -1709,16 +1822,19 @@ def to_sparse(self, kind='block', fill_value=None): Sparse representation of the Series. """ - warnings.warn("Series.to_sparse is deprecated and will be removed " - "in a future version", FutureWarning, stacklevel=2) + warnings.warn( + "Series.to_sparse is deprecated and will be removed " "in a future version", + FutureWarning, + stacklevel=2, + ) from pandas.core.sparse.series import SparseSeries values = SparseArray(self, kind=kind, fill_value=fill_value) with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="SparseSeries") - return SparseSeries( - values, index=self.index, name=self.name - ).__finalize__(self) + return SparseSeries(values, index=self.index, name=self.name).__finalize__( + self + ) def _set_name(self, name, inplace=False): """ @@ -1730,7 +1846,7 @@ def _set_name(self, name, inplace=False): inplace : bool whether to modify `self` directly or return a copy """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") ser = self if inplace else self.copy() ser.name = name return ser @@ -1777,8 +1893,7 @@ def count(self, level=None): obs = level_codes[notna(self.values)] out = np.bincount(obs, minlength=len(lev) or None) - return self._constructor(out, index=lev, - dtype='int64').__finalize__(self) + return self._constructor(out, index=lev, dtype="int64").__finalize__(self) def mode(self, dropna=True): """ @@ -1865,7 +1980,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep='first', inplace=False): + def drop_duplicates(self, keep="first", inplace=False): """ Return Series with duplicate values removed. @@ -1939,7 +2054,7 @@ def drop_duplicates(self, keep='first', inplace=False): """ return super().drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): """ Indicate duplicate Series values. @@ -2158,24 +2273,32 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): # ndarray compat argmin = deprecate( - 'argmin', idxmin, '0.21.0', - msg=dedent(""" + "argmin", + idxmin, + "0.21.0", + msg=dedent( + """ The current behaviour of 'Series.argmin' is deprecated, use 'idxmin' instead. 
The behavior of 'argmin' will be corrected to return the positional minimum in the future. For now, use 'series.values.argmin' or 'np.argmin(np.array(values))' to get the position of the minimum - row.""") + row.""" + ), ) argmax = deprecate( - 'argmax', idxmax, '0.21.0', - msg=dedent(""" + "argmax", + idxmax, + "0.21.0", + msg=dedent( + """ The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' instead. The behavior of 'argmax' will be corrected to return the positional maximum in the future. For now, use 'series.values.argmax' or 'np.argmax(np.array(values))' to get the position of the maximum - row.""") + row.""" + ), ) def round(self, decimals=0, *args, **kwargs): @@ -2214,7 +2337,7 @@ def round(self, decimals=0, *args, **kwargs): return result - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): """ Return value at the given quantile. @@ -2265,21 +2388,18 @@ def quantile(self, q=0.5, interpolation='linear'): # about 2D cases. df = self.to_frame() - result = df.quantile(q=q, interpolation=interpolation, - numeric_only=False) + result = df.quantile(q=q, interpolation=interpolation, numeric_only=False) if result.ndim == 2: result = result.iloc[:, 0] if is_list_like(q): result.name = self.name - return self._constructor(result, - index=Float64Index(q), - name=self.name) + return self._constructor(result, index=Float64Index(q), name=self.name) else: # scalar return result.iloc[0] - def corr(self, other, method='pearson', min_periods=None): + def corr(self, other, method="pearson", min_periods=None): """ Compute correlation with `other` Series, excluding missing values. @@ -2315,17 +2435,20 @@ def corr(self, other, method='pearson', min_periods=None): >>> s1.corr(s2, method=histogram_intersection) 0.3 """ - this, other = self.align(other, join='inner', copy=False) + this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan - if method in ['pearson', 'spearman', 'kendall'] or callable(method): - return nanops.nancorr(this.values, other.values, method=method, - min_periods=min_periods) + if method in ["pearson", "spearman", "kendall"] or callable(method): + return nanops.nancorr( + this.values, other.values, method=method, min_periods=min_periods + ) - raise ValueError("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, " - "'{method}' was supplied".format(method=method)) + raise ValueError( + "method must be either 'pearson', " + "'spearman', 'kendall', or a callable, " + "'{method}' was supplied".format(method=method) + ) def cov(self, other, min_periods=None): """ @@ -2351,11 +2474,10 @@ def cov(self, other, min_periods=None): >>> s1.cov(s2) -0.01685762652715874 """ - this, other = self.align(other, join='inner', copy=False) + this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan - return nanops.nancov(this.values, other.values, - min_periods=min_periods) + return nanops.nancov(this.values, other.values, min_periods=min_periods) def diff(self, periods=1): """ @@ -2517,11 +2639,11 @@ def dot(self, other): array([24, 14]) """ from pandas.core.frame import DataFrame + if isinstance(other, (Series, DataFrame)): common = self.index.union(other.index) - if (len(common) > len(self.index) or - len(common) > len(other.index)): - raise ValueError('matrices are not aligned') + if len(common) > len(self.index) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") left = self.reindex(index=common, copy=False) right 
= other.reindex(index=common, copy=False) @@ -2531,18 +2653,20 @@ def dot(self, other): lvals = self.values rvals = np.asarray(other) if lvals.shape[0] != rvals.shape[0]: - raise Exception('Dot product shape mismatch, %s vs %s' % - (lvals.shape, rvals.shape)) + raise Exception( + "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape) + ) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), - index=other.columns).__finalize__(self) + return self._constructor( + np.dot(lvals, rvals), index=other.columns + ).__finalize__(self) elif isinstance(other, Series): return np.dot(lvals, rvals) elif isinstance(rvals, np.ndarray): return np.dot(lvals, rvals) else: # pragma: no cover - raise TypeError('unsupported type: %s' % type(other)) + raise TypeError("unsupported type: %s" % type(other)) def __matmul__(self, other): """ @@ -2556,11 +2680,10 @@ def __rmatmul__(self, other): """ return self.dot(np.transpose(other)) - @Substitution(klass='Series') - @Appender(base._shared_docs['searchsorted']) - def searchsorted(self, value, side='left', sorter=None): - return algorithms.searchsorted(self._values, value, - side=side, sorter=sorter) + @Substitution(klass="Series") + @Appender(base._shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination @@ -2644,8 +2767,9 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat = [self] + to_append else: to_concat = [self, to_append] - return concat(to_concat, ignore_index=ignore_index, - verify_integrity=verify_integrity) + return concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) def _binop(self, other, func, level=None, fill_value=None): """ @@ -2668,24 +2792,22 @@ def _binop(self, other, func, level=None, fill_value=None): """ if not isinstance(other, Series): - raise AssertionError('Other operand must be Series') + raise AssertionError("Other operand must be Series") new_index = self.index this = self if not self.index.equals(other.index): - this, other = self.align(other, level=level, join='outer', - copy=False) + this, other = self.align(other, level=level, join="outer", copy=False) new_index = this.index - this_vals, other_vals = ops.fill_binop(this.values, other.values, - fill_value) + this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) - if func.__name__ in ['divmod', 'rdivmod']: + if func.__name__ in ["divmod", "rdivmod"]: ret = ops._construct_divmod_result(self, result, new_index, name) else: ret = ops._construct_result(self, result, new_index, name) @@ -2770,13 +2892,13 @@ def combine(self, other, func, fill_value=None): for idx in new_index: lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values.append(func(lv, rv)) else: # Assume that other is a scalar, so apply the function for # each element in the Series new_index = self.index - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): new_values = [func(lv, other) for lv in self._values] new_name = self.name @@ -2890,8 +3012,14 @@ def update(self, other): # ---------------------------------------------------------------------- # 
Reindexing, sorting - def sort_values(self, axis=0, ascending=True, inplace=False, - kind='quicksort', na_position='last'): + def sort_values( + self, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): """ Sort by the values. @@ -2996,14 +3124,16 @@ def sort_values(self, axis=0, ascending=True, inplace=False, 0 z dtype: object """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) # GH 5856/5853 if inplace and self._is_cached: - raise ValueError("This Series is a view of some other array, to " - "sort in-place you must create a copy") + raise ValueError( + "This Series is a view of some other array, to " + "sort in-place you must create a copy" + ) def _try_kind_sort(arr): # easier to ask forgiveness than permission @@ -3013,7 +3143,7 @@ def _try_kind_sort(arr): except TypeError: # stable sort not available for object dtype # uses the argsort default quicksort - return arr.argsort(kind='quicksort') + return arr.argsort(kind="quicksort") arr = self._values sortedIdx = np.empty(len(self), dtype=np.int32) @@ -3027,26 +3157,28 @@ def _try_kind_sort(arr): if is_list_like(ascending): if len(ascending) != 1: - raise ValueError('Length of ascending (%d) must be 1 ' - 'for Series' % (len(ascending))) + raise ValueError( + "Length of ascending (%d) must be 1 " + "for Series" % (len(ascending)) + ) ascending = ascending[0] if not is_bool(ascending): - raise ValueError('ascending must be boolean') + raise ValueError("ascending must be boolean") if not ascending: argsorted = argsorted[::-1] - if na_position == 'last': + if na_position == "last": n = good.sum() sortedIdx[:n] = idx[good][argsorted] sortedIdx[n:] = idx[bad] - elif na_position == 'first': + elif na_position == "first": n = bad.sum() sortedIdx[n:] = idx[good][argsorted] sortedIdx[:n] = idx[bad] else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + raise ValueError("invalid na_position: {!r}".format(na_position)) result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx]) @@ -3055,8 +3187,16 @@ def _try_kind_sort(arr): else: return result.__finalize__(self) - def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ): """ Sort Series by index labels. 
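The sort_values signature reformatted in the hunk above is unchanged in behaviour; a minimal usage sketch, with illustrative data, of the parameters it exposes:

import numpy as np
import pandas as pd

s = pd.Series([3.0, np.nan, 1.0, 2.0])
# stable descending sort with missing values placed first
s.sort_values(ascending=False, kind="mergesort", na_position="first")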
@@ -3169,34 +3309,40 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, """ # TODO: this can be combined with DataFrame.sort_index impl as # almost identical - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) index = self.index if level is not None: - new_index, indexer = index.sortlevel(level, ascending=ascending, - sort_remaining=sort_remaining) + new_index, indexer = index.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer + labels = index._sort_levels_monotonic() - indexer = lexsort_indexer(labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position) + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) else: from pandas.core.sorting import nargsort # Check monotonic-ness before sort an index # GH11080 - if ((ascending and index.is_monotonic_increasing) or - (not ascending and index.is_monotonic_decreasing)): + if (ascending and index.is_monotonic_increasing) or ( + not ascending and index.is_monotonic_decreasing + ): if inplace: return else: return self.copy() - indexer = nargsort(index, kind=kind, ascending=ascending, - na_position=na_position) + indexer = nargsort( + index, kind=kind, ascending=ascending, na_position=na_position + ) indexer = ensure_platform_int(indexer) new_index = index.take(indexer) @@ -3210,7 +3356,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return result.__finalize__(self) - def argsort(self, axis=0, kind='quicksort', order=None): + def argsort(self, axis=0, kind="quicksort", order=None): """ Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. @@ -3239,18 +3385,16 @@ def argsort(self, axis=0, kind='quicksort', order=None): mask = isna(values) if mask.any(): - result = Series(-1, index=self.index, name=self.name, - dtype='int64') + result = Series(-1, index=self.index, name=self.name, dtype="int64") notmask = ~mask result[notmask] = np.argsort(values[notmask], kind=kind) - return self._constructor(result, - index=self.index).__finalize__(self) + return self._constructor(result, index=self.index).__finalize__(self) else: return self._constructor( - np.argsort(values, kind=kind), index=self.index, - dtype='int64').__finalize__(self) + np.argsort(values, kind=kind), index=self.index, dtype="int64" + ).__finalize__(self) - def nlargest(self, n=5, keep='first'): + def nlargest(self, n=5, keep="first"): """ Return the largest `n` elements. @@ -3348,7 +3492,7 @@ def nlargest(self, n=5, keep='first'): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest(self, n=5, keep='first'): + def nsmallest(self, n=5, keep="first"): """ Return the smallest `n` elements. @@ -3465,8 +3609,9 @@ def swaplevel(self, i=-2, j=-1, copy=True): the two innermost levels of the index. 
""" new_index = self.index.swaplevel(i, j) - return self._constructor(self._values, index=new_index, - copy=copy).__finalize__(self) + return self._constructor(self._values, index=new_index, copy=copy).__finalize__( + self + ) def reorder_levels(self, order): """ @@ -3484,7 +3629,7 @@ def reorder_levels(self, order): type of caller (new object) """ if not isinstance(self.index, MultiIndex): # pragma: no cover - raise Exception('Can only reorder levels on a hierarchical axis.') + raise Exception("Can only reorder levels on a hierarchical axis.") result = self.copy() result.index = result.index.reorder_levels(order) @@ -3532,6 +3677,7 @@ def unstack(self, level=-1, fill_value=None): b 2 4 """ from pandas.core.reshape.reshape import unstack + return unstack(self, level, fill_value) # ---------------------------------------------------------------------- @@ -3612,10 +3758,8 @@ def map(self, arg, na_action=None): 3 I am a rabbit dtype: object """ - new_values = super()._map_values( - arg, na_action=na_action) - return self._constructor(new_values, - index=self.index).__finalize__(self) + new_values = super()._map_values(arg, na_action=na_action) + return self._constructor(new_values, index=self.index).__finalize__(self) def _gotitem(self, key, ndim, subset=None): """ @@ -3631,14 +3775,17 @@ def _gotitem(self, key, ndim, subset=None): """ return self - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- Series.apply : Invoke function on a Series. Series.transform : Transform function producing a Series with like indexes. - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- >>> s = pd.Series([1, 2, 3, 4]) @@ -3656,13 +3803,16 @@ def _gotitem(self, key, ndim, subset=None): min 1 max 4 dtype: int64 - """) + """ + ) - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='\n.. versionadded:: 0.20.0\n', - **_shared_doc_kwargs) - @Appender(generic._shared_docs['aggregate']) + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="\n.. 
versionadded:: 0.20.0\n", + **_shared_doc_kwargs + ) + @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -3671,8 +3821,8 @@ def aggregate(self, func, axis=0, *args, **kwargs): # we can be called from an inner function which # passes this meta-data - kwargs.pop('_axis', None) - kwargs.pop('_level', None) + kwargs.pop("_axis", None) + kwargs.pop("_level", None) # try a regular apply, this evaluates lambdas # row-by-row; however if the lambda is expected a Series @@ -3691,7 +3841,7 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - @Appender(generic._shared_docs['transform'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["transform"] % _shared_doc_kwargs) def transform(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -3795,8 +3945,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): dtype: float64 """ if len(self) == 0: - return self._constructor(dtype=self.dtype, - index=self.index).__finalize__(self) + return self._constructor(dtype=self.dtype, index=self.index).__finalize__( + self + ) # dispatch to agg if isinstance(func, (list, dict)): @@ -3808,12 +3959,14 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): # handle ufuncs and lambdas if kwds or args and not isinstance(func, np.ufunc): + def f(x): return func(x, *args, **kwds) + else: f = func - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if isinstance(f, np.ufunc): return f(self) @@ -3827,14 +3980,13 @@ def f(x): if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used - return self._constructor_expanddim(pd.array(mapped), - index=self.index) + return self._constructor_expanddim(pd.array(mapped), index=self.index) else: - return self._constructor(mapped, - index=self.index).__finalize__(self) + return self._constructor(mapped, index=self.index).__finalize__(self) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ Perform a reduction operation. 
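As context for the aggregate and apply hunks above, a minimal sketch, with illustrative data, of the calls whose docstrings are re-wrapped here:

import pandas as pd

s = pd.Series([1, 2, 3, 4])
s.agg("min")               # single reducer -> scalar 1
s.agg(["min", "max"])      # list of reducers -> Series indexed by function name
s.apply(lambda x: x ** 2)  # element-wise callable -> Series of squares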
@@ -3864,17 +4016,24 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # dispatch to numpy arrays elif isinstance(delegate, np.ndarray): if numeric_only: - raise NotImplementedError('Series.{0} does not implement ' - 'numeric_only.'.format(name)) - with np.errstate(all='ignore'): + raise NotImplementedError( + "Series.{0} does not implement " "numeric_only.".format(name) + ) + with np.errstate(all="ignore"): return op(delegate, skipna=skipna, **kwds) # TODO(EA) dispatch to Index # remove once all internals extension types are # moved to ExtensionArrays - return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, - numeric_only=numeric_only, - filter_type=filter_type, **kwds) + return delegate._reduce( + op=op, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + filter_type=filter_type, + **kwds + ) def _reindex_indexer(self, new_index, indexer, copy): if indexer is None: @@ -3882,8 +4041,9 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer, - allow_fill=True, fill_value=None) + new_values = algorithms.take_1d( + self._values, indexer, allow_fill=True, fill_value=None + ) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3893,14 +4053,32 @@ def _needs_reindex_multi(self, axes, method, level): """ return False - @Appender(generic._shared_docs['align'] % _shared_doc_kwargs) - def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, - broadcast_axis=None): - return super().align(other, join=join, axis=axis, level=level, - copy=copy, fill_value=fill_value, method=method, - limit=limit, fill_axis=fill_axis, - broadcast_axis=broadcast_axis) + @Appender(generic._shared_docs["align"] % _shared_doc_kwargs) + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + return super().align( + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) def rename(self, index=None, **kwargs): """ @@ -3963,13 +4141,13 @@ def rename(self, index=None, **kwargs): 5 3 dtype: int64 """ - kwargs['inplace'] = validate_bool_kwarg(kwargs.get('inplace', False), - 'inplace') + kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - non_mapping = is_scalar(index) or (is_list_like(index) and - not is_dict_like(index)) + non_mapping = is_scalar(index) or ( + is_list_like(index) and not is_dict_like(index) + ) if non_mapping: - return self._set_name(index, inplace=kwargs.get('inplace')) + return self._set_name(index, inplace=kwargs.get("inplace")) return super().rename(index=index, **kwargs) @Substitution(**_shared_doc_kwargs) @@ -3977,8 +4155,16 @@ def rename(self, index=None, **kwargs): def reindex(self, index=None, **kwargs): return super().reindex(index=index, **kwargs) - def drop(self, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors='raise'): + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): """ Return Series with specified index labels removed. 
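The drop and rename signatures above are only re-wrapped by this patch; a minimal usage sketch with illustrative labels:

import pandas as pd

s = pd.Series([1, 2, 3], index=["a", "b", "c"])
s.drop(labels=["b"])   # remove the entry labelled 'b'
s.rename("total")      # scalar argument sets the Series name
s.rename({"a": "x"})   # mapping relabels the index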
@@ -4065,29 +4251,62 @@ def drop(self, labels=None, axis=0, index=None, columns=None, length 0.3 dtype: float64 """ - return super().drop(labels=labels, axis=axis, index=index, - columns=columns, level=level, inplace=inplace, - errors=errors) + return super().drop( + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.fillna.__doc__) - def fillna(self, value=None, method=None, axis=None, inplace=False, - limit=None, downcast=None, **kwargs): - return super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, downcast=downcast, - **kwargs) - - @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs) - def replace(self, to_replace=None, value=None, inplace=False, limit=None, - regex=False, method='pad'): - return super().replace(to_replace=to_replace, value=value, - inplace=inplace, limit=limit, regex=regex, - method=method) - - @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs) + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): + return super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + **kwargs + ) + + @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return super().replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift(periods=periods, freq=freq, axis=axis, - fill_value=fill_value) + return super().shift( + periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) def memory_usage(self, index=True, deep=False): """ @@ -4153,13 +4372,14 @@ def _take(self, indices, axis=0, is_copy=False): # https://github.com/pandas-dev/pandas/issues/20664 # TODO: remove when the default Categorical.take behavior changes indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - kwargs = {'allow_fill': False} + kwargs = {"allow_fill": False} else: kwargs = {} new_values = self._values.take(indices, **kwargs) - result = (self._constructor(new_values, index=new_index, - fastpath=True).__finalize__(self)) + result = self._constructor( + new_values, index=new_index, fastpath=True + ).__finalize__(self) # Maybe set copy if we didn't actually change the index. 
if is_copy: @@ -4306,21 +4526,51 @@ def between(self, left, right, inclusive=True): @Appender(generic.NDFrame.to_csv.__doc__) def to_csv(self, *args, **kwargs): - names = ["path_or_buf", "sep", "na_rep", "float_format", "columns", - "header", "index", "index_label", "mode", "encoding", - "compression", "quoting", "quotechar", "line_terminator", - "chunksize", "date_format", "doublequote", - "escapechar", "decimal"] - - old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format", - "header", "index_label", "mode", "encoding", - "compression", "date_format", "decimal"] + names = [ + "path_or_buf", + "sep", + "na_rep", + "float_format", + "columns", + "header", + "index", + "index_label", + "mode", + "encoding", + "compression", + "quoting", + "quotechar", + "line_terminator", + "chunksize", + "date_format", + "doublequote", + "escapechar", + "decimal", + ] + + old_names = [ + "path_or_buf", + "index", + "sep", + "na_rep", + "float_format", + "header", + "index_label", + "mode", + "encoding", + "compression", + "date_format", + "decimal", + ] if "path" in kwargs: - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'path' will be renamed to 'path_or_buf'.", - FutureWarning, stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'path' will be renamed to 'path_or_buf'.", + FutureWarning, + stacklevel=2, + ) kwargs["path_or_buf"] = kwargs.pop("path") if len(args) > 1: @@ -4330,49 +4580,57 @@ def to_csv(self, *args, **kwargs): if not (is_string_like(maybe_sep) and len(maybe_sep) == 1): # old signature - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`. Note that the " - "order of arguments changed, and the new one " - "has 'sep' in first place, for which \"{}\" is " - "not a valid value. The old order will cease to " - "be supported in a future version. Please refer " - "to the documentation for `DataFrame.to_csv` " - "when updating your function " - "calls.".format(maybe_sep), - FutureWarning, stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`. Note that the " + "order of arguments changed, and the new one " + "has 'sep' in first place, for which \"{}\" is " + "not a valid value. The old order will cease to " + "be supported in a future version. 
Please refer " + "to the documentation for `DataFrame.to_csv` " + "when updating your function " + "calls.".format(maybe_sep), + FutureWarning, + stacklevel=2, + ) names = old_names - pos_args = dict(zip(names[:len(args)], args)) + pos_args = dict(zip(names[: len(args)], args)) for key in pos_args: if key in kwargs: - raise ValueError("Argument given by name ('{}') and position " - "({})".format(key, names.index(key))) + raise ValueError( + "Argument given by name ('{}') and position " + "({})".format(key, names.index(key)) + ) kwargs[key] = pos_args[key] if kwargs.get("header", None) is None: - warnings.warn("The signature of `Series.to_csv` was aligned " - "to that of `DataFrame.to_csv`, and argument " - "'header' will change its default value from False " - "to True: please pass an explicit value to suppress " - "this warning.", FutureWarning, - stacklevel=2) + warnings.warn( + "The signature of `Series.to_csv` was aligned " + "to that of `DataFrame.to_csv`, and argument " + "'header' will change its default value from False " + "to True: please pass an explicit value to suppress " + "this warning.", + FutureWarning, + stacklevel=2, + ) kwargs["header"] = False # Backwards compatibility. return self.to_frame().to_csv(**kwargs) - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return super().isna() - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isnull(self): return super().isnull() - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return super().notna() - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notnull(self): return super().notnull() @@ -4447,11 +4705,13 @@ def dropna(self, axis=0, inplace=False, **kwargs): 5 I stay dtype: object """ - inplace = validate_bool_kwarg(inplace, 'inplace') - kwargs.pop('how', None) + inplace = validate_bool_kwarg(inplace, "inplace") + kwargs.pop("how", None) if kwargs: - raise TypeError('dropna() got an unexpected keyword ' - 'argument "{0}"'.format(list(kwargs.keys())[0])) + raise TypeError( + "dropna() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) # Validate the axis parameter self._get_axis_number(axis or 0) @@ -4480,14 +4740,18 @@ def valid(self, inplace=False, **kwargs): Series Series without null values. """ - warnings.warn("Method .valid will be removed in a future version. " - "Use .dropna instead.", FutureWarning, stacklevel=2) + warnings.warn( + "Method .valid will be removed in a future version. " + "Use .dropna instead.", + FutureWarning, + stacklevel=2, + ) return self.dropna(inplace=inplace, **kwargs) # ---------------------------------------------------------------------- # Time series-oriented methods - def to_timestamp(self, freq=None, how='start', copy=True): + def to_timestamp(self, freq=None, how="start", copy=True): """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. 
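The warning messages above keep black's habit of leaving adjacent string literals in place; Python concatenates them at compile time, so the runtime text is identical. A minimal sketch using a shortened form of one of the messages from this hunk:

msg = (
    "The signature of `Series.to_csv` was aligned "
    "to that of `DataFrame.to_csv`."
)
# msg == "The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`."
assert "aligned to" in msg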
@@ -4510,8 +4774,7 @@ def to_timestamp(self, freq=None, how='start', copy=True): new_values = new_values.copy() new_index = self.index.to_timestamp(freq=freq, how=how) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) def to_period(self, freq=None, copy=True): """ @@ -4535,8 +4798,7 @@ def to_period(self, freq=None, copy=True): new_values = new_values.copy() new_index = self.index.to_period(freq=freq) - return self._constructor(new_values, - index=new_index).__finalize__(self) + return self._constructor(new_values, index=new_index).__finalize__(self) # ---------------------------------------------------------------------- # Accessor Methods @@ -4552,8 +4814,13 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}, - docs={'index': 'The index (axis labels) of the Series.'}) +Series._setup_axes( + ["index"], + info_axis=0, + stat_axis=0, + aliases={"rows": 0}, + docs={"index": "The index (axis labels) of the Series."}, +) Series._add_numeric_operations() Series._add_series_only_operations() Series._add_series_or_dataframe_operations() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b79390581612b..523c4dc5e867b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -6,8 +6,12 @@ from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( - ensure_int64, ensure_platform_int, is_categorical_dtype, - is_extension_array_dtype, is_list_like) + ensure_int64, + ensure_platform_int, + is_categorical_dtype, + is_extension_array_dtype, + is_list_like, +) from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms @@ -42,6 +46,7 @@ def get_group_index(labels, shape, sort, xnull): An array of type int64 where two elements are equal if their corresponding labels are equal at all location. """ + def _int64_cut_off(shape): acc = 1 for i, mul in enumerate(shape): @@ -69,8 +74,8 @@ def maybe_lift(lab, size): nlev = _int64_cut_off(shape) # compute flat ids for the first `nlev` levels - stride = np.prod(shape[1:nlev], dtype='i8') - out = stride * labels[0].astype('i8', subok=False, copy=False) + stride = np.prod(shape[1:nlev], dtype="i8") + out = stride * labels[0].astype("i8", subok=False, copy=False) for i in range(1, nlev): if shape[i] == 0: @@ -132,7 +137,7 @@ def decons_group_index(comp_labels, shape): if is_int64_overflow_possible(shape): # at some point group indices are factorized, # and may not be deconstructed here! wrong path! - raise ValueError('cannot deconstruct factorized group indices!') + raise ValueError("cannot deconstruct factorized group indices!") label_list = [] factor = 1 @@ -158,17 +163,16 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): """ if not xnull: - lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8') - shape = np.asarray(shape, dtype='i8') + lift + lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8") + shape = np.asarray(shape, dtype="i8") + lift if not is_int64_overflow_possible(shape): # obs ids are deconstructable! take the fast route! 
out = decons_group_index(obs_ids, shape) - return out if xnull or not lift.any() \ - else [x - y for x, y in zip(out, lift)] + return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)] i = unique_label_indices(comp_ids) - i8copy = lambda a: a.astype('i8', subok=False, copy=True) + i8copy = lambda a: a.astype("i8", subok=False, copy=True) return [i8copy(lab[i]) for lab in labels] @@ -184,7 +188,7 @@ def indexer_from_factorized(labels, shape, compress=True): return get_group_index_sorter(ids, ngroups) -def lexsort_indexer(keys, orders=None, na_position='last'): +def lexsort_indexer(keys, orders=None, na_position="last"): from pandas.core.arrays import Categorical labels = [] @@ -204,22 +208,22 @@ def lexsort_indexer(keys, orders=None, na_position='last'): else: c = Categorical(key, ordered=True) - if na_position not in ['last', 'first']: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + if na_position not in ["last", "first"]: + raise ValueError("invalid na_position: {!r}".format(na_position)) n = len(c.categories) codes = c.codes.copy() - mask = (c.codes == -1) + mask = c.codes == -1 if order: # ascending - if na_position == 'last': + if na_position == "last": codes = np.where(mask, n, codes) - elif na_position == 'first': + elif na_position == "first": codes += 1 else: # not order means descending - if na_position == 'last': + if na_position == "last": codes = np.where(mask, n, n - codes - 1) - elif na_position == 'first': + elif na_position == "first": codes = np.where(mask, 0, n - codes) if mask.any(): n += 1 @@ -230,7 +234,7 @@ def lexsort_indexer(keys, orders=None, na_position='last'): return indexer_from_factorized(labels, shape) -def nargsort(items, kind='quicksort', ascending=True, na_position='last'): +def nargsort(items, kind="quicksort", ascending=True, na_position="last"): """ This is intended to be a drop-in replacement for np.argsort which handles NaNs. It adds ascending and na_position parameters. 
@@ -258,12 +262,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): indexer = indexer[::-1] # Finally, place the NaNs at the end or the beginning according to # na_position - if na_position == 'last': + if na_position == "last": indexer = np.concatenate([indexer, nan_idx]) - elif na_position == 'first': + elif na_position == "first": indexer = np.concatenate([nan_idx, indexer]) else: - raise ValueError('invalid na_position: {!r}'.format(na_position)) + raise ValueError("invalid na_position: {!r}".format(na_position)) return indexer @@ -279,8 +283,7 @@ def __init__(self, comp_ids, ngroups, levels, labels): self.comp_ids = comp_ids.astype(np.int64) self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) - for _ in range(self.k)] + self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] self._populate_tables() @@ -289,8 +292,10 @@ def _populate_tables(self): table.map(self.comp_ids, labs.astype(np.int64)) def get_key(self, comp_id): - return tuple(level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels)) + return tuple( + level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels) + ) def get_flattened_iterator(comp_ids, ngroups, levels, labels): @@ -304,9 +309,11 @@ def get_indexer_dict(label_list, keys): shape = list(map(len, keys)) group_index = get_group_index(label_list, shape, sort=True, xnull=True) - ngroups = ((group_index.size and group_index.max()) + 1) \ - if is_int64_overflow_possible(shape) \ - else np.prod(shape, dtype='i8') + ngroups = ( + ((group_index.size and group_index.max()) + 1) + if is_int64_overflow_possible(shape) + else np.prod(shape, dtype="i8") + ) sorter = get_group_index_sorter(group_index, ngroups) @@ -319,6 +326,7 @@ def get_indexer_dict(label_list, keys): # ---------------------------------------------------------------------- # sorting levels...cleverly? + def get_group_index_sorter(group_index, ngroups): """ algos.groupsort_indexer implements `counting sort` and it is at least @@ -336,14 +344,12 @@ def get_group_index_sorter(group_index, ngroups): count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters - do_groupsort = (count > 0 and ((alpha + beta * ngroups) < - (count * np.log(count)))) + do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count))) if do_groupsort: - sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), - ngroups) + sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups) return ensure_platform_int(sorter) else: - return group_index.argsort(kind='mergesort') + return group_index.argsort(kind="mergesort") def compress_group_index(group_index, sort=True): @@ -387,8 +393,7 @@ def _reorder_by_uniques(uniques, labels): return uniques, labels -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, - verify=True): +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. @@ -433,26 +438,27 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, * If ``labels`` is not None and ``values`` contain duplicates. 
""" if not is_list_like(values): - raise TypeError("Only list-like objects are allowed to be passed to" - "safe_sort as values") + raise TypeError( + "Only list-like objects are allowed to be passed to" "safe_sort as values" + ) - if (not isinstance(values, np.ndarray) - and not is_extension_array_dtype(values)): + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], - dtype=bool) + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None - if (not is_extension_array_dtype(values) - and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: @@ -469,18 +475,22 @@ def sort_mixed(values): return ordered if not is_list_like(labels): - raise TypeError("Only list-like objects or None are allowed to be" - "passed to safe_sort as labels") + raise TypeError( + "Only list-like objects or None are allowed to be" + "passed to safe_sort as labels" + ) labels = ensure_platform_int(np.asarray(labels)) from pandas import Index + if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo( - values, algorithms._hashtables) + values, algorithms._hashtables + ) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) @@ -498,7 +508,7 @@ def sort_mixed(values): reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode='wrap') + new_labels = reverse_indexer.take(labels, mode="wrap") mask = labels == na_sentinel if verify: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 6a0ba5f93c509..f195e4b5f4e37 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -20,14 +20,13 @@ from pandas.core.frame import DataFrame import pandas.core.generic as generic from pandas.core.index import Index, MultiIndex, ensure_index -from pandas.core.internals import ( - BlockManager, create_block_manager_from_arrays) +from pandas.core.internals import BlockManager, create_block_manager_from_arrays from pandas.core.internals.construction import extract_index, prep_ndarray import pandas.core.ops as ops from pandas.core.series import Series from pandas.core.sparse.series import SparseSeries -_shared_doc_kwargs = dict(klass='SparseDataFrame') +_shared_doc_kwargs = dict(klass="SparseDataFrame") depr_msg = """\ SparseDataFrame is deprecated and will be removed in a future version. Use a regular DataFrame whose columns are SparseArrays instead. @@ -62,10 +61,19 @@ class SparseDataFrame(DataFrame): Default fill_value for converting Series to SparseSeries (default: nan). Will not override SparseSeries passed in. 
""" - _subtyp = 'sparse_frame' - def __init__(self, data=None, index=None, columns=None, default_kind=None, - default_fill_value=None, dtype=None, copy=False): + _subtyp = "sparse_frame" + + def __init__( + self, + data=None, + index=None, + columns=None, + default_kind=None, + default_fill_value=None, + dtype=None, + copy=False, + ): warnings.warn(depr_msg, FutureWarning, stacklevel=2) # pick up the defaults from the Sparse structures @@ -83,7 +91,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, index = data.index if default_fill_value is None: default_fill_value = data.fill_value - if columns is None and hasattr(data, 'name'): + if columns is None and hasattr(data, "name"): columns = [data.name] if columns is None: raise Exception("cannot pass a series w/o a name or columns") @@ -92,30 +100,33 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if default_fill_value is None: default_fill_value = np.nan if default_kind is None: - default_kind = 'block' + default_kind = "block" self._default_kind = default_kind self._default_fill_value = default_fill_value if is_scipy_sparse(data): - mgr = self._init_spmatrix(data, index, columns, dtype=dtype, - fill_value=default_fill_value) + mgr = self._init_spmatrix( + data, index, columns, dtype=dtype, fill_value=default_fill_value + ) elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, (np.ndarray, list)): mgr = self._init_matrix(data, index, columns, dtype=dtype) elif isinstance(data, SparseDataFrame): - mgr = self._init_mgr(data._data, - dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif isinstance(data, DataFrame): mgr = self._init_dict(data, data.index, data.columns, dtype=dtype) elif isinstance(data, Series): - mgr = self._init_dict(data.to_frame(), data.index, - columns=None, dtype=dtype) + mgr = self._init_dict( + data.to_frame(), data.index, columns=None, dtype=dtype + ) elif isinstance(data, BlockManager): - mgr = self._init_mgr(data, axes=dict(index=index, columns=columns), - dtype=dtype, copy=copy) + mgr = self._init_mgr( + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + ) elif data is None: data = DataFrame() @@ -128,15 +139,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, columns = Index([]) else: for c in columns: - data[c] = SparseArray(self._default_fill_value, - index=index, kind=self._default_kind, - fill_value=self._default_fill_value) + data[c] = SparseArray( + self._default_fill_value, + index=index, + kind=self._default_kind, + fill_value=self._default_fill_value, + ) mgr = to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) else: - msg = ('SparseDataFrame called with unknown type "{data_type}" ' - 'for data argument') + msg = ( + 'SparseDataFrame called with unknown type "{data_type}" ' + "for data argument" + ) raise TypeError(msg.format(data_type=type(data).__name__)) generic.NDFrame.__init__(self, mgr) @@ -160,9 +176,14 @@ def _init_dict(self, data, index, columns, dtype=None): index = extract_index(list(data.values())) def sp_maker(x): - return SparseArray(x, kind=self._default_kind, - fill_value=self._default_fill_value, - copy=True, dtype=dtype) + return SparseArray( + x, + kind=self._default_kind, + fill_value=self._default_fill_value, + copy=True, + dtype=dtype, + ) + sdict = {} for k, v in data.items(): if 
isinstance(v, Series): @@ -188,11 +209,14 @@ def sp_maker(x): if len(columns.difference(sdict)): # TODO: figure out how to handle this case, all nan's? # add in any other columns we want to have (completeness) - nan_arr = np.empty(len(index), dtype='float64') + nan_arr = np.empty(len(index), dtype="float64") nan_arr.fill(np.nan) - nan_arr = SparseArray(nan_arr, kind=self._default_kind, - fill_value=self._default_fill_value, - copy=False) + nan_arr = SparseArray( + nan_arr, + kind=self._default_kind, + fill_value=self._default_fill_value, + copy=False, + ) sdict.update((c, nan_arr) for c in columns if c not in sdict) return to_manager(sdict, columns, index) @@ -206,8 +230,7 @@ def _init_matrix(self, data, index, columns, dtype=None): data = {idx: data[:, i] for i, idx in enumerate(columns)} return self._init_dict(data, index, columns, dtype) - def _init_spmatrix(self, data, index, columns, dtype=None, - fill_value=None): + def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): """ Init self from scipy.sparse matrix. """ @@ -225,16 +248,24 @@ def _init_spmatrix(self, data, index, columns, dtype=None, blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( - rowvals.values, index=index, + rowvals.values, + index=index, fill_value=fill_value, - sparse_index=BlockIndex(N, blocs, blens)) + sparse_index=BlockIndex(N, blocs, blens), + ) # Add any columns that were empty and thus not grouped on above - sdict.update({column: SparseSeries(index=index, - fill_value=fill_value, - sparse_index=BlockIndex(N, [], [])) - for column in columns - if column not in sdict}) + sdict.update( + { + column: SparseSeries( + index=index, + fill_value=fill_value, + sparse_index=BlockIndex(N, [], []), + ) + for column in columns + if column not in sdict + } + ) return self._init_dict(sdict, index, columns, dtype) @@ -249,9 +280,13 @@ def __repr__(self): def __getstate__(self): # pickling - return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, - _default_fill_value=self._default_fill_value, - _default_kind=self._default_kind) + return dict( + _typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + _default_fill_value=self._default_fill_value, + _default_kind=self._default_kind, + ) def _unpickle_sparse_frame_compat(self, state): """ @@ -261,20 +296,23 @@ def _unpickle_sparse_frame_compat(self, state): if not isinstance(cols, Index): # pragma: no cover from pandas.io.pickle import _unpickle_array + columns = _unpickle_array(cols) else: columns = cols if not isinstance(idx, Index): # pragma: no cover from pandas.io.pickle import _unpickle_array + index = _unpickle_array(idx) else: index = idx series_dict = DataFrame() for col, (sp_index, sp_values) in series.items(): - series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, - fill_value=fv) + series_dict[col] = SparseSeries( + sp_values, sparse_index=sp_index, fill_value=fv + ) self._data = to_manager(series_dict, columns, index) self._default_fill_value = fv @@ -289,12 +327,14 @@ def _apply_columns(self, func): Get new SparseDataFrame applying func to each columns """ - new_data = {col: func(series) - for col, series in self.items()} + new_data = {col: func(series) for col, series in self.items()} return self._constructor( - data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + data=new_data, + index=self.index, + columns=self.columns, + default_fill_value=self.default_fill_value, + ).__finalize__(self) def astype(self, dtype): return 
self._apply_columns(lambda x: x.astype(dtype)) @@ -322,23 +362,27 @@ def density(self): Ratio of non-sparse points to total (dense) data points represented in the frame """ - tot_nonsparse = sum(ser.sp_index.npoints - for _, ser in self.items()) + tot_nonsparse = sum(ser.sp_index.npoints for _, ser in self.items()) tot = len(self.index) * len(self.columns) return tot_nonsparse / float(tot) - def fillna(self, value=None, method=None, axis=0, inplace=False, - limit=None, downcast=None): - new_self = super().fillna(value=value, method=method, axis=axis, - inplace=inplace, limit=limit, - downcast=downcast) + def fillna( + self, value=None, method=None, axis=0, inplace=False, limit=None, downcast=None + ): + new_self = super().fillna( + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) if not inplace: self = new_self # set the fill value if we are filling as a scalar with nothing special # going on - if (value is not None and value == value and method is None and - limit is None): + if value is not None and value == value and method is None and limit is None: self._default_fill_value = value if not inplace: @@ -362,29 +406,35 @@ def _sanitize_column(self, key, value, **kwargs): sanitized_column : SparseArray """ + def sp_maker(x, index=None): - return SparseArray(x, index=index, - fill_value=self._default_fill_value, - kind=self._default_kind) + return SparseArray( + x, + index=index, + fill_value=self._default_fill_value, + kind=self._default_kind, + ) + if isinstance(value, SparseSeries): clean = value.reindex(self.index).as_sparse_array( - fill_value=self._default_fill_value, kind=self._default_kind) + fill_value=self._default_fill_value, kind=self._default_kind + ) elif isinstance(value, SparseArray): if len(value) != len(self.index): - raise ValueError('Length of values does not match ' - 'length of index') + raise ValueError("Length of values does not match " "length of index") clean = value - elif hasattr(value, '__iter__'): + elif hasattr(value, "__iter__"): if isinstance(value, Series): clean = value.reindex(self.index) if not isinstance(value, SparseSeries): clean = sp_maker(clean) else: if len(value) != len(self.index): - raise ValueError('Length of values does not match ' - 'length of index') + raise ValueError( + "Length of values does not match " "length of index" + ) clean = sp_maker(value) # Scalar @@ -412,10 +462,13 @@ def get_value(self, index, col, takeable=False): ------- value : scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(index, col, takeable=takeable) def _get_value(self, index, col, takeable=False): @@ -425,6 +478,7 @@ def _get_value(self, index, col, takeable=False): series = self._get_item_cache(col) return series._get_value(index, takeable=takeable) + _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -452,17 +506,21 @@ def set_value(self, index, col, value, takeable=False): ------- frame : DataFrame """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. 
Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(index, col, value, takeable=takeable) def _set_value(self, index, col, value, takeable=False): - dense = self.to_dense()._set_value( - index, col, value, takeable=takeable) - return dense.to_sparse(kind=self._default_kind, - fill_value=self._default_fill_value) + dense = self.to_dense()._set_value(index, col, value, takeable=takeable) + return dense.to_sparse( + kind=self._default_kind, fill_value=self._default_fill_value + ) + _set_value.__doc__ = set_value.__doc__ def _slice(self, slobj, axis=0, kind=None): @@ -503,7 +561,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - this, other = self.align(other, join='outer', level=level, copy=False) + this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns if self.empty and other.empty: @@ -527,10 +585,12 @@ def _combine_frame(self, other, func, fill_value=None, level=None): new_fill_value = self._get_op_result_fill_value(other, func) - return self._constructor(data=new_data, index=new_index, - columns=new_columns, - default_fill_value=new_fill_value - ).__finalize__(self) + return self._constructor( + data=new_data, + index=new_index, + columns=new_columns, + default_fill_value=new_fill_value, + ).__finalize__(self) def _combine_match_index(self, other, func, level=None): new_data = {} @@ -538,8 +598,7 @@ def _combine_match_index(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - this, other = self.align(other, join='outer', axis=0, level=level, - copy=False) + this, other = self.align(other, join="outer", axis=0, level=level, copy=False) for col, series in this.items(): new_data[col] = func(series.values, other.values) @@ -547,8 +606,11 @@ def _combine_match_index(self, other, func, level=None): fill_value = self._get_op_result_fill_value(other, func) return self._constructor( - new_data, index=this.index, columns=self.columns, - default_fill_value=fill_value).__finalize__(self) + new_data, + index=this.index, + columns=self.columns, + default_fill_value=fill_value, + ).__finalize__(self) def _combine_match_columns(self, other, func, level=None): # patched version of DataFrame._combine_match_columns to account for @@ -559,8 +621,7 @@ def _combine_match_columns(self, other, func, level=None): if level is not None: raise NotImplementedError("'level' argument is not supported") - left, right = self.align(other, join='outer', axis=1, level=level, - copy=False) + left, right = self.align(other, join="outer", axis=1, level=level, copy=False) assert left.columns.equals(right.index) new_data = {} @@ -569,8 +630,11 @@ def _combine_match_columns(self, other, func, level=None): new_data[col] = func(left[col], float(right[col])) return self._constructor( - new_data, index=left.index, columns=left.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + new_data, + index=left.index, + columns=left.columns, + default_fill_value=self.default_fill_value, + ).__finalize__(self) def _combine_const(self, other, func): return self._apply_columns(lambda x: func(x, other)) @@ -581,7 +645,7 @@ def _get_op_result_fill_value(self, 
other, func): if isinstance(other, DataFrame): # i.e. called from _combine_frame - other_default = getattr(other, 'default_fill_value', np.nan) + other_default = getattr(other, "default_fill_value", np.nan) # if the fill values are the same use them? or use a valid one if own_default == other_default: @@ -601,18 +665,18 @@ def _get_op_result_fill_value(self, other, func): if isna(other.fill_value) or isna(own_default): fill_value = np.nan else: - fill_value = func(np.float64(own_default), - np.float64(other.fill_value)) + fill_value = func(np.float64(own_default), np.float64(other.fill_value)) else: raise NotImplementedError(type(other)) return fill_value - def _reindex_index(self, index, method, copy, level, fill_value=np.nan, - limit=None, takeable=False): + def _reindex_index( + self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False + ): if level is not None: - raise TypeError('Reindex by level not supported for sparse') + raise TypeError("Reindex by level not supported for sparse") if self.index.equals(index): if copy: @@ -621,8 +685,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return self if len(self.index) == 0: - return self._constructor( - index=index, columns=self.columns).__finalize__(self) + return self._constructor(index=index, columns=self.columns).__finalize__( + self + ) indexer = self.index.get_indexer(index, method, limit=limit) indexer = ensure_platform_int(indexer) @@ -647,13 +712,17 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new_series[col] = new return self._constructor( - new_series, index=index, columns=self.columns, - default_fill_value=self._default_fill_value).__finalize__(self) + new_series, + index=index, + columns=self.columns, + default_fill_value=self._default_fill_value, + ).__finalize__(self) - def _reindex_columns(self, columns, method, copy, level, fill_value=None, - limit=None, takeable=False): + def _reindex_columns( + self, columns, method, copy, level, fill_value=None, limit=None, takeable=False + ): if level is not None: - raise TypeError('Reindex by level not supported for sparse') + raise TypeError("Reindex by level not supported for sparse") if notna(fill_value): raise NotImplementedError("'fill_value' argument is not supported") @@ -667,21 +736,31 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, # TODO: fill value handling sdict = {k: v for k, v in self.items() if k in columns} return self._constructor( - sdict, index=self.index, columns=columns, - default_fill_value=self._default_fill_value).__finalize__(self) - - def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, - limit=None, copy=False, allow_dups=False): + sdict, + index=self.index, + columns=columns, + default_fill_value=self._default_fill_value, + ).__finalize__(self) + + def _reindex_with_indexers( + self, + reindexers, + method=None, + fill_value=None, + limit=None, + copy=False, + allow_dups=False, + ): if method is not None or limit is not None: - raise NotImplementedError("cannot reindex with a method or limit " - "with sparse") + raise NotImplementedError( + "cannot reindex with a method or limit " "with sparse" + ) if fill_value is None: fill_value = np.nan - reindexers = {self._get_axis_number(a): val - for (a, val) in reindexers.items()} + reindexers = {self._get_axis_number(a): val for (a, val) in reindexers.items()} index, row_indexer = reindexers.get(0, (None, None)) columns, col_indexer = reindexers.get(1, (None, None)) @@ -695,30 +774,32 
@@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, continue if row_indexer is not None: new_arrays[col] = algos.take_1d( - self[col]._internal_get_values(), - row_indexer, - fill_value=fill_value) + self[col]._internal_get_values(), row_indexer, fill_value=fill_value + ) else: new_arrays[col] = self[col] - return self._constructor(new_arrays, index=index, - columns=columns).__finalize__(self) + return self._constructor(new_arrays, index=index, columns=columns).__finalize__( + self + ) - def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', - sort=False): + def _join_compat( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ): if on is not None: - raise NotImplementedError("'on' keyword parameter is not yet " - "implemented") + raise NotImplementedError( + "'on' keyword parameter is not yet " "implemented" + ) return self._join_index(other, how, lsuffix, rsuffix) def _join_index(self, other, how, lsuffix, rsuffix): if isinstance(other, Series): if other.name is None: - raise ValueError('Other Series must have a name') + raise ValueError("Other Series must have a name") other = SparseDataFrame( - {other.name: other}, - default_fill_value=self._default_fill_value) + {other.name: other}, default_fill_value=self._default_fill_value + ) join_index = self.index.join(other.index, how=how) @@ -728,23 +809,26 @@ def _join_index(self, other, how, lsuffix, rsuffix): this, other = this._maybe_rename_join(other, lsuffix, rsuffix) from pandas import concat + return concat([this, other], axis=1, verify_integrity=True) def _maybe_rename_join(self, other, lsuffix, rsuffix): to_rename = self.columns.intersection(other.columns) if len(to_rename) > 0: if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: ' - '{to_rename}'.format(to_rename=to_rename)) + raise ValueError( + "columns overlap but no suffix specified: " + "{to_rename}".format(to_rename=to_rename) + ) def lrenamer(x): if x in to_rename: - return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix) + return "{x}{lsuffix}".format(x=x, lsuffix=lsuffix) return x def rrenamer(x): if x in to_rename: - return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix) + return "{x}{rsuffix}".format(x=x, rsuffix=rsuffix) return x this = self.rename(columns=lrenamer) @@ -760,9 +844,12 @@ def transpose(self, *args, **kwargs): """ nv.validate_transpose(args, kwargs) return self._constructor( - self.values.T, index=self.columns, columns=self.index, + self.values.T, + index=self.columns, + columns=self.index, default_fill_value=self._default_fill_value, - default_kind=self._default_kind).__finalize__(self) + default_kind=self._default_kind, + ).__finalize__(self) T = property(transpose) @@ -793,18 +880,19 @@ def cumsum(self, axis=0, *args, **kwargs): return self.apply(lambda x: x.cumsum(), axis=axis) - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): return self._apply_columns(lambda x: x.isna()) + isnull = isna - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): return self._apply_columns(lambda x: x.notna()) + notnull = notna - def apply(self, func, axis=0, broadcast=None, reduce=None, - result_type=None): + def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -865,17 +953,23 @@ def apply(self, func, axis=0, 
broadcast=None, reduce=None, applied.fill_value = func(v.fill_value) new_series[k] = applied return self._constructor( - new_series, index=self.index, columns=self.columns, + new_series, + index=self.index, + columns=self.columns, default_fill_value=self._default_fill_value, - default_kind=self._default_kind).__finalize__(self) + default_kind=self._default_kind, + ).__finalize__(self) from pandas.core.apply import frame_apply - op = frame_apply(self, - func=func, - axis=axis, - reduce=reduce, - broadcast=broadcast, - result_type=result_type) + + op = frame_apply( + self, + func=func, + axis=axis, + reduce=reduce, + broadcast=broadcast, + result_type=result_type, + ) return op.get_result() def applymap(self, func): @@ -904,8 +998,7 @@ def to_manager(sdf, columns, index): # from BlockManager perspective axes = [ensure_index(columns), ensure_index(index)] - return create_block_manager_from_arrays( - [sdf[c] for c in columns], columns, axes) + return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes) def stack_sparse_frame(frame): @@ -925,7 +1018,7 @@ def stack_sparse_frame(frame): # SparseDataFrame with a non-np.NaN fill value (fails earlier). for _, series in frame.items(): if not np.isnan(series.fill_value): - raise TypeError('This routine assumes NaN fill value') + raise TypeError("This routine assumes NaN fill value") int_index = series.sp_index.to_int_index() inds_to_concat.append(int_index.indices) @@ -933,12 +1026,13 @@ def stack_sparse_frame(frame): major_codes = np.concatenate(inds_to_concat) stacked_values = np.concatenate(vals_to_concat) - index = MultiIndex(levels=[frame.index, frame.columns], - codes=[major_codes, minor_codes], - verify_integrity=False) + index = MultiIndex( + levels=[frame.index, frame.columns], + codes=[major_codes, minor_codes], + verify_integrity=False, + ) - lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, - columns=['foo']) + lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=["foo"]) return lp.sort_index(level=0) @@ -966,7 +1060,7 @@ def homogenize(series_dict): for _, series in series_dict.items(): if not np.isnan(series.fill_value): - raise TypeError('this method is only valid with NaN fill values') + raise TypeError("this method is only valid with NaN fill values") if index is None: index = series.sp_index diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py index 7ff0f46575661..73638f5965119 100644 --- a/pandas/core/sparse/scipy_sparse.py +++ b/pandas/core/sparse/scipy_sparse.py @@ -13,13 +13,12 @@ def _check_is_partition(parts, whole): whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): - raise ValueError( - 'Is not a partition because intersection is not null.') + raise ValueError("Is not a partition because intersection is not null.") if set.union(*parts) != whole: - raise ValueError('Is not a partition because union is not the whole.') + raise ValueError("Is not a partition because union is not the whole.") -def _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False): +def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ For arbitrary (MultiIndexed) SparseSeries return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor. """ @@ -36,8 +35,7 @@ def get_indexers(levels): # TODO: how to do this better? 
cleanly slice nonnull_labels given the # coord - values_ilabels = [tuple(x[i] for i in levels) - for x in nonnull_labels.index] + values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] if len(levels) == 1: values_ilabels = [x[0] for x in values_ilabels] @@ -55,12 +53,11 @@ def _get_label_to_i_dict(labels, sort_labels=False): if sort_labels: labels = sorted(list(labels)) d = OrderedDict((k, i) for i, k in enumerate(labels)) - return (d) + return d def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) - labels_to_i = _get_label_to_i_dict(ilabels, - sort_labels=sort_labels) + labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) if len(subset) > 1: labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) @@ -69,11 +66,12 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): labels_to_i.index = Index(x[0] for x in labels_to_i.index) labels_to_i.index.name = index.names[subset[0]] - labels_to_i.name = 'value' - return (labels_to_i) + labels_to_i.name = "value" + return labels_to_i - labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels, - sort_labels=sort_labels) + labels_to_i = _get_index_subset_to_coord_dict( + ss.index, levels, sort_labels=sort_labels + ) # ##################################################################### # ##################################################################### @@ -88,8 +86,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): return values, i_coord, j_coord, i_labels, j_labels -def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), - sort_labels=False): +def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ Convert a SparseSeries to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -99,25 +96,26 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ), import scipy.sparse if ss.index.nlevels < 2: - raise ValueError('to_coo requires MultiIndex with nlevels > 2') + raise ValueError("to_coo requires MultiIndex with nlevels > 2") if not ss.index.is_unique: - raise ValueError('Duplicate index entries are not allowed in to_coo ' - 'transformation.') + raise ValueError( + "Duplicate index entries are not allowed in to_coo " "transformation." + ) # to keep things simple, only rely on integer indexing (not labels) row_levels = [ss.index._get_level_number(x) for x in row_levels] column_levels = [ss.index._get_level_number(x) for x in column_levels] - v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels, - column_levels=column_levels, - sort_labels=sort_labels) + v, i, j, rows, columns = _to_ijv( + ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels + ) sparse_matrix = scipy.sparse.coo_matrix( - (v, (i, j)), shape=(len(rows), len(columns))) + (v, (i, j)), shape=(len(rows), len(columns)) + ) return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index: bool = False, - sparse_series: bool = True): +def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = True): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. @@ -141,8 +139,7 @@ def _coo_to_sparse_series(A, dense_index: bool = False, try: s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) except AttributeError: - raise TypeError('Expected coo_matrix. Got {} instead.' 
- .format(type(A).__name__)) + raise TypeError("Expected coo_matrix. Got {} instead.".format(type(A).__name__)) s = s.sort_index() if sparse_series: # TODO(SparseSeries): remove this and the sparse_series keyword. diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 88b6634db92b6..43f2609f46bd6 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -24,12 +24,15 @@ from pandas.core.internals import SingleBlockManager import pandas.core.ops as ops from pandas.core.series import Series -from pandas.core.sparse.scipy_sparse import ( - _coo_to_sparse_series, _sparse_series_to_coo) +from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series, _sparse_series_to_coo -_shared_doc_kwargs = dict(axes='index', klass='SparseSeries', - axes_single_arg="{0, 'index'}", - optional_labels='', optional_axis='') +_shared_doc_kwargs = dict( + axes="index", + klass="SparseSeries", + axes_single_arg="{0, 'index'}", + optional_labels="", + optional_axis="", +) depr_msg = """\ @@ -70,11 +73,21 @@ class SparseSeries(Series): must change values, convert to dense, make your changes, then convert back to sparse """ - _subtyp = 'sparse_series' - def __init__(self, data=None, index=None, sparse_index=None, kind='block', - fill_value=None, name=None, dtype=None, copy=False, - fastpath=False): + _subtyp = "sparse_series" + + def __init__( + self, + data=None, + index=None, + sparse_index=None, + kind="block", + fill_value=None, + name=None, + dtype=None, + copy=False, + fastpath=False, + ): warnings.warn(depr_msg, FutureWarning, stacklevel=2) # TODO: Most of this should be refactored and shared with Series # 1. BlockManager -> array @@ -102,55 +115,67 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', data = np.full(len(index), fill_value=data) super().__init__( - SparseArray(data, - sparse_index=sparse_index, - kind=kind, - dtype=dtype, - fill_value=fill_value, - copy=copy), - index=index, name=name, - copy=False, fastpath=fastpath + SparseArray( + data, + sparse_index=sparse_index, + kind=kind, + dtype=dtype, + fill_value=fill_value, + copy=copy, + ), + index=index, + name=name, + copy=False, + fastpath=fastpath, ) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # avoid infinite recursion for other SparseSeries inputs - inputs = tuple( - x.values if isinstance(x, type(self)) else x - for x in inputs - ) + inputs = tuple(x.values if isinstance(x, type(self)) else x for x in inputs) result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs) - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) # unary ops # TODO: See if this can be shared def __pos__(self): result = self.values.__pos__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) def __neg__(self): result = self.values.__neg__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, 
+ fill_value=result.fill_value, + copy=False, + ).__finalize__(self) def __invert__(self): result = self.values.__invert__() - return self._constructor(result, index=self.index, - sparse_index=self.sp_index, - fill_value=result.fill_value, - copy=False).__finalize__(self) + return self._constructor( + result, + index=self.index, + sparse_index=self.sp_index, + fill_value=result.fill_value, + copy=False, + ).__finalize__(self) @property def block(self): - warnings.warn("SparseSeries.block is deprecated.", FutureWarning, - stacklevel=2) + warnings.warn("SparseSeries.block is deprecated.", FutureWarning, stacklevel=2) return self._data._block @property @@ -174,18 +199,29 @@ def npoints(self): return self.values.npoints @classmethod - def from_array(cls, arr, index=None, name=None, copy=False, - fill_value=None, fastpath=False): + def from_array( + cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False + ): """Construct SparseSeries from array. .. deprecated:: 0.23.0 Use the pd.SparseSeries(..) constructor instead. """ - warnings.warn("'from_array' is deprecated and will be removed in a " - "future version. Please use the pd.SparseSeries(..) " - "constructor instead.", FutureWarning, stacklevel=2) - return cls(arr, index=index, name=name, copy=copy, - fill_value=fill_value, fastpath=fastpath) + warnings.warn( + "'from_array' is deprecated and will be removed in a " + "future version. Please use the pd.SparseSeries(..) " + "constructor instead.", + FutureWarning, + stacklevel=2, + ) + return cls( + arr, + index=index, + name=name, + copy=copy, + fill_value=fill_value, + fastpath=fastpath, + ) @property def _constructor(self): @@ -194,14 +230,15 @@ def _constructor(self): @property def _constructor_expanddim(self): from pandas.core.sparse.api import SparseDataFrame + return SparseDataFrame @property def kind(self): if isinstance(self.sp_index, BlockIndex): - return 'block' + return "block" elif isinstance(self.sp_index, IntIndex): - return 'integer' + return "integer" def as_sparse_array(self, kind=None, fill_value=None, copy=False): """ return my self as a sparse array, do not copy by default """ @@ -210,26 +247,36 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False): fill_value = self.fill_value if kind is None: kind = self.kind - return SparseArray(self.values, sparse_index=self.sp_index, - fill_value=fill_value, kind=kind, copy=copy) + return SparseArray( + self.values, + sparse_index=self.sp_index, + fill_value=fill_value, + kind=kind, + copy=copy, + ) def __repr__(self): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Sparse") series_rep = Series.__repr__(self) - rep = '{series}\n{index!r}'.format(series=series_rep, - index=self.sp_index) + rep = "{series}\n{index!r}".format(series=series_rep, index=self.sp_index) return rep - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce( + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + ): """ perform a reduction operation """ return op(self.array.to_dense(), skipna=skipna, **kwds) def __getstate__(self): # pickling - return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data, - fill_value=self.fill_value, name=self.name) + return dict( + _typ=self._typ, + _subtyp=self._subtyp, + _data=self._data, + fill_value=self.fill_value, + name=self.name, + ) def _unpickle_series_compat(self, state): @@ -246,8 +293,9 @@ def _unpickle_series_compat(self, state): # create a sparse array if not 
isinstance(data, SparseArray): - data = SparseArray(data, sparse_index=sp_index, - fill_value=fill_value, copy=False) + data = SparseArray( + data, sparse_index=sp_index, fill_value=fill_value, copy=False + ) # recreate data = SingleBlockManager(data, index, fastpath=True) @@ -258,9 +306,9 @@ def _unpickle_series_compat(self, state): def _set_subtyp(self, is_all_dates): if is_all_dates: - object.__setattr__(self, '_subtyp', 'sparse_time_series') + object.__setattr__(self, "_subtyp", "sparse_time_series") else: - object.__setattr__(self, '_subtyp', 'sparse_series') + object.__setattr__(self, "_subtyp", "sparse_series") def _ixs(self, i, axis=0): """ @@ -294,8 +342,9 @@ def __getitem__(self, key): def _get_values(self, indexer): try: - return self._constructor(self._data.get_slice(indexer), - fastpath=True).__finalize__(self) + return self._constructor( + self._data.get_slice(indexer), fastpath=True + ).__finalize__(self) except Exception: return self[indexer] @@ -311,8 +360,9 @@ def abs(self): ------- abs: same type as caller """ - return self._constructor(np.abs(self.values), - index=self.index).__finalize__(self) + return self._constructor(np.abs(self.values), index=self.index).__finalize__( + self + ) def get(self, label, default=None): """ @@ -353,16 +403,20 @@ def get_value(self, label, takeable=False): ------- value : scalar value """ - warnings.warn("get_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "get_value is deprecated and will be removed " + "in a future release. Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._get_value(label, takeable=takeable) def _get_value(self, label, takeable=False): loc = label if takeable is True else self.index.get_loc(label) return self._get_val_at(loc) + _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): @@ -392,10 +446,13 @@ def set_value(self, label, value, takeable=False): ------- series : SparseSeries """ - warnings.warn("set_value is deprecated and will be removed " - "in a future release. Please use " - ".at[] or .iat[] accessors instead", FutureWarning, - stacklevel=2) + warnings.warn( + "set_value is deprecated and will be removed " + "in a future release. 
Please use " + ".at[] or .iat[] accessors instead", + FutureWarning, + stacklevel=2, + ) return self._set_value(label, value, takeable=takeable) def _set_value(self, label, value, takeable=False): @@ -407,10 +464,10 @@ def _set_value(self, label, value, takeable=False): if new_values is not None: values = new_values new_index = values.index - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) + values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) self._data = SingleBlockManager(values, new_index) self._index = new_index + _set_value.__doc__ = set_value.__doc__ def _set_values(self, key, value): @@ -424,8 +481,7 @@ def _set_values(self, key, value): values = self.values.to_dense() values[key] = libindex.convert_scalar(values, value) - values = SparseArray(values, fill_value=self.fill_value, - kind=self.kind) + values = SparseArray(values, fill_value=self.fill_value, kind=self.kind) self._data = SingleBlockManager(values, self.index) def to_dense(self): @@ -436,8 +492,7 @@ def to_dense(self): ------- s : Series """ - return Series(self.values.to_dense(), index=self.index, - name=self.name) + return Series(self.values.to_dense(), index=self.index, name=self.name) @property def density(self): @@ -453,18 +508,21 @@ def copy(self, deep=True): new_data = self.values if deep: new_data = new_data.copy() - return self._constructor(new_data, sparse_index=self.sp_index, - fill_value=self.fill_value, - index=self.index.copy(), - name=self.name).__finalize__(self) + return self._constructor( + new_data, + sparse_index=self.sp_index, + fill_value=self.fill_value, + index=self.index.copy(), + name=self.name, + ).__finalize__(self) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) - def reindex(self, index=None, method=None, copy=True, limit=None, - **kwargs): + def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs): # TODO: remove? 
- return super().reindex(index=index, method=method, copy=copy, - limit=limit, **kwargs) + return super().reindex( + index=index, method=method, copy=copy, limit=limit, **kwargs + ) def sparse_reindex(self, new_index): """ @@ -482,10 +540,11 @@ def sparse_reindex(self, new_index): raise TypeError("new index must be a SparseIndex") values = self.values values = values.sp_index.to_int_index().reindex( - values.sp_values.astype('float64'), values.fill_value, new_index) - values = SparseArray(values, - sparse_index=new_index, - fill_value=self.values.fill_value) + values.sp_values.astype("float64"), values.fill_value, new_index + ) + values = SparseArray( + values, sparse_index=new_index, fill_value=self.values.fill_value + ) return self._constructor(values, index=self.index).__finalize__(self) def cumsum(self, axis=0, *args, **kwargs): @@ -512,25 +571,30 @@ def cumsum(self, axis=0, *args, **kwargs): new_array = self.values.cumsum() return self._constructor( - new_array, index=self.index, - sparse_index=new_array.sp_index).__finalize__(self) + new_array, index=self.index, sparse_index=new_array.sp_index + ).__finalize__(self) # TODO: SparseSeries.isna is Sparse, while Series.isna is dense - @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) def isna(self): - arr = SparseArray(isna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=isna(self.fill_value)) + arr = SparseArray( + isna(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=isna(self.fill_value), + ) return self._constructor(arr, index=self.index).__finalize__(self) isnull = isna - @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs) + @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) def notna(self): - arr = SparseArray(notna(self.values.sp_values), - sparse_index=self.values.sp_index, - fill_value=notna(self.fill_value)) + arr = SparseArray( + notna(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=notna(self.fill_value), + ) return self._constructor(arr, index=self.index).__finalize__(self) + notnull = notna def dropna(self, axis=0, inplace=False, **kwargs): @@ -542,8 +606,9 @@ def dropna(self, axis=0, inplace=False, **kwargs): self._get_axis_number(axis or 0) dense_valid = self.to_dense().dropna() if inplace: - raise NotImplementedError("Cannot perform inplace dropna" - " operations on a SparseSeries") + raise NotImplementedError( + "Cannot perform inplace dropna" " operations on a SparseSeries" + ) if isna(self.fill_value): return dense_valid else: @@ -570,10 +635,10 @@ def combine_first(self, other): return dense_combined.to_sparse(fill_value=self.fill_value) @Appender(SparseAccessor.to_coo.__doc__) - def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): - A, rows, columns = _sparse_series_to_coo(self, row_levels, - column_levels, - sort_labels=sort_labels) + def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): + A, rows, columns = _sparse_series_to_coo( + self, row_levels, column_levels, sort_labels=sort_labels + ) return A, rows, columns @classmethod diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 710b29c6a6536..70700653c4795 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -12,8 +12,15 @@ from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, - is_list_like, is_re, is_scalar, 
is_string_like) + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_integer, + is_list_like, + is_re, + is_scalar, + is_string_like, +) from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import isna @@ -22,11 +29,15 @@ import pandas.core.common as com _cpython_optimized_encoders = ( - "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ( - "utf-16", "utf-32" + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", ) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") _shared_docs = dict() # type: Dict[str, str] @@ -80,11 +91,12 @@ def cat_safe(list_of_columns: List, sep: str): # object dtype), np.sum will fail; catch and return with better message for column in list_of_columns: dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ['string', 'empty']: + if dtype not in ["string", "empty"]: raise TypeError( - 'Concatenation requires list-likes containing only ' - 'strings (or missing values). Offending values found in ' - 'column {}'.format(dtype)) from None + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + "column {}".format(dtype) + ) from None return result @@ -109,8 +121,10 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. # The user may want to be warned by this, instead of getting NaN - p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' - r'(?(3)required )positional arguments?') + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) if len(e.args) >= 1 and re.search(p_err, e.args[0]): raise e @@ -330,9 +344,12 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): regex = re.compile(pat, flags=flags) if regex.groups > 0: - warnings.warn("This pattern has match groups. To actually get the" - " groups, use str.extract.", UserWarning, - stacklevel=3) + warnings.warn( + "This pattern has match groups. 
To actually get the" + " groups, use str.extract.", + UserWarning, + stacklevel=3, + ) f = lambda x: bool(regex.search(x)) else: @@ -585,8 +602,9 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): if regex: if is_compiled_re: if (case is not None) or (flags != 0): - raise ValueError("case and flags cannot be set" - " when pat is a compiled regex") + raise ValueError( + "case and flags cannot be set" " when pat is a compiled regex" + ) else: # not a compiled regex # set default case @@ -604,11 +622,11 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): f = lambda x: x.replace(pat, repl, n) else: if is_compiled_re: - raise ValueError("Cannot use a compiled regex as replacement " - "pattern with regex=False") + raise ValueError( + "Cannot use a compiled regex as replacement " "pattern with regex=False" + ) if callable(repl): - raise ValueError("Cannot use a callable replacement when " - "regex=False") + raise ValueError("Cannot use a callable replacement when " "regex=False") f = lambda x: x.replace(pat, repl, n) return _na_map(f, arr) @@ -655,6 +673,7 @@ def str_repeat(arr, repeats): dtype: object """ if is_scalar(repeats): + def scalar_rep(x): try: return bytes.__mul__(x, repeats) @@ -732,6 +751,7 @@ def f(x): return [np.nan if item is None else item for item in m.groups()] else: return empty_row + return f @@ -764,7 +784,8 @@ def _str_extract_noexpand(arr, pat, flags=0): [groups_or_na(val) for val in arr], columns=columns, index=arr.index, - dtype=object) + dtype=object, + ) return result, name @@ -792,7 +813,8 @@ def _str_extract_frame(arr, pat, flags=0): [groups_or_na(val) for val in arr], columns=columns, index=result_index, - dtype=object) + dtype=object, + ) def str_extract(arr, pat, flags=0, expand=True): @@ -980,27 +1002,25 @@ def str_extractall(arr, pat, flags=0): if isinstance(subject, str): if not is_mi: - subject_key = (subject_key, ) + subject_key = (subject_key,) for match_i, match_tuple in enumerate(regex.findall(subject)): if isinstance(match_tuple, str): match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group - for group in match_tuple] + na_tuple = [np.NaN if group == "" else group for group in match_tuple] match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i, )) + result_key = tuple(subject_key + (match_i,)) index_list.append(result_key) from pandas import MultiIndex - index = MultiIndex.from_tuples( - index_list, names=arr.index.names + ["match"]) - result = arr._constructor_expanddim(match_list, index=index, - columns=columns) + index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) + + result = arr._constructor_expanddim(match_list, index=index, columns=columns) return result -def str_get_dummies(arr, sep='|'): +def str_get_dummies(arr, sep="|"): """ Split each string in the Series by sep and return a DataFrame of dummy/indicator variables. @@ -1034,7 +1054,7 @@ def str_get_dummies(arr, sep='|'): 1 0 0 0 2 1 0 1 """ - arr = arr.fillna('') + arr = arr.fillna("") try: arr = sep + arr + sep except TypeError: @@ -1212,7 +1232,7 @@ def str_findall(arr, pat, flags=0): return _na_map(regex.findall, arr) -def str_find(arr, sub, start=0, end=None, side='left'): +def str_find(arr, sub, start=0, end=None, side="left"): """ Return indexes in each strings in the Series/Index where the substring is fully contained between [start:end]. Return -1 on failure. 
@@ -1235,15 +1255,15 @@ def str_find(arr, sub, start=0, end=None, side='left'): """ if not isinstance(sub, str): - msg = 'expected a string object, not {0}' + msg = "expected a string object, not {0}" raise TypeError(msg.format(type(sub).__name__)) - if side == 'left': - method = 'find' - elif side == 'right': - method = 'rfind' + if side == "left": + method = "find" + elif side == "right": + method = "rfind" else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") if end is None: f = lambda x: getattr(x, method)(sub, start) @@ -1253,17 +1273,17 @@ def str_find(arr, sub, start=0, end=None, side='left'): return _na_map(f, arr, dtype=int) -def str_index(arr, sub, start=0, end=None, side='left'): +def str_index(arr, sub, start=0, end=None, side="left"): if not isinstance(sub, str): - msg = 'expected a string object, not {0}' + msg = "expected a string object, not {0}" raise TypeError(msg.format(type(sub).__name__)) - if side == 'left': - method = 'index' - elif side == 'right': - method = 'rindex' + if side == "left": + method = "index" + elif side == "right": + method = "rindex" else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") if end is None: f = lambda x: getattr(x, method)(sub, start) @@ -1273,7 +1293,7 @@ def str_index(arr, sub, start=0, end=None, side='left'): return _na_map(f, arr, dtype=int) -def str_pad(arr, width, side='left', fillchar=' '): +def str_pad(arr, width, side="left", fillchar=" "): """ Pad strings in the Series/Index up to width. @@ -1327,24 +1347,24 @@ def str_pad(arr, width, side='left', fillchar=' '): dtype: object """ if not isinstance(fillchar, str): - msg = 'fillchar must be a character, not {0}' + msg = "fillchar must be a character, not {0}" raise TypeError(msg.format(type(fillchar).__name__)) if len(fillchar) != 1: - raise TypeError('fillchar must be a character, not str') + raise TypeError("fillchar must be a character, not str") if not is_integer(width): - msg = 'width must be of integer type, not {0}' + msg = "width must be of integer type, not {0}" raise TypeError(msg.format(type(width).__name__)) - if side == 'left': + if side == "left": f = lambda x: x.rjust(width, fillchar) - elif side == 'right': + elif side == "right": f = lambda x: x.ljust(width, fillchar) - elif side == 'both': + elif side == "both": f = lambda x: x.center(width, fillchar) else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") return _na_map(f, arr) @@ -1522,14 +1542,14 @@ def str_slice_replace(arr, start=None, stop=None, repl=None): dtype: object """ if repl is None: - repl = '' + repl = "" def f(x): - if x[start:stop] == '': + if x[start:stop] == "": local_stop = start else: local_stop = stop - y = '' + y = "" if start is not None: y += x[:start] y += repl @@ -1540,7 +1560,7 @@ def f(x): return _na_map(f, arr) -def str_strip(arr, to_strip=None, side='both'): +def str_strip(arr, to_strip=None, side="both"): """ Strip whitespace (including newlines) from each string in the Series/Index. 
@@ -1554,14 +1574,14 @@ def str_strip(arr, to_strip=None, side='both'): ------- Series or Index """ - if side == 'both': + if side == "both": f = lambda x: x.strip(to_strip) - elif side == 'left': + elif side == "left": f = lambda x: x.lstrip(to_strip) - elif side == 'right': + elif side == "right": f = lambda x: x.rstrip(to_strip) else: # pragma: no cover - raise ValueError('Invalid side') + raise ValueError("Invalid side") return _na_map(f, arr) @@ -1622,11 +1642,11 @@ def str_wrap(arr, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - kwargs['width'] = width + kwargs["width"] = width tw = textwrap.TextWrapper(**kwargs) - return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr) + return _na_map(lambda s: "\n".join(tw.wrap(s)), arr) def str_translate(arr, table): @@ -1700,12 +1720,14 @@ def str_get(arr, i): 5 None dtype: object """ + def f(x): if isinstance(x, dict): return x.get(i) elif len(x) > i >= -len(x): return x[i] return np.nan + return _na_map(f, arr) @@ -1801,8 +1823,9 @@ def forbid_nonstring_types(forbidden, name=None): # deal with None forbidden = [] if forbidden is None else forbidden - allowed_types = {'string', 'empty', 'bytes', - 'mixed', 'mixed-integer'} - set(forbidden) + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) def _forbid_nonstring_types(func): func_name = func.__name__ if name is None else name @@ -1810,18 +1833,22 @@ def _forbid_nonstring_types(func): @wraps(func) def wrapper(self, *args, **kwargs): if self._inferred_dtype not in allowed_types: - msg = ('Cannot use .str.{name} with values of inferred dtype ' - '{inf_type!r}.'.format(name=func_name, - inf_type=self._inferred_dtype)) + msg = ( + "Cannot use .str.{name} with values of inferred dtype " + "{inf_type!r}.".format( + name=func_name, inf_type=self._inferred_dtype + ) + ) raise TypeError(msg) return func(self, *args, **kwargs) + wrapper.__name__ = func_name return wrapper + return _forbid_nonstring_types -def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'], - **kargs): +def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=["bytes"], **kargs): @forbid_nonstring_types(forbidden_types, name=name) def wrapper(self): result = _na_map(f, self._parent, **kargs) @@ -1831,13 +1858,14 @@ def wrapper(self): if docstring is not None: wrapper.__doc__ = docstring else: - raise ValueError('Provide docstring') + raise ValueError("Provide docstring") return wrapper -def _pat_wrapper(f, flags=False, na=False, name=None, - forbidden_types=['bytes'], **kwargs): +def _pat_wrapper( + f, flags=False, na=False, name=None, forbidden_types=["bytes"], **kwargs +): @forbid_nonstring_types(forbidden_types, name=name) def wrapper1(self, pat): result = f(self._parent, pat) @@ -1919,21 +1947,21 @@ def _validate(data): dtype : inferred dtype of data """ if isinstance(data, ABCMultiIndex): - raise AttributeError('Can only use .str accessor with Index, ' - 'not MultiIndex') + raise AttributeError( + "Can only use .str accessor with Index, " "not MultiIndex" + ) # see _libs/lib.pyx for list of inferred types - allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'] + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - values = getattr(data, 'values', data) # Series / Index - values = getattr(values, 'categories', values) # categorical / normal + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal # missing values obfuscate type 
inference -> skip inferred_dtype = lib.infer_dtype(values, skipna=True) if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string " - "values!") + raise AttributeError("Can only use .str accessor with string " "values!") return inferred_dtype def __getitem__(self, key): @@ -1950,8 +1978,9 @@ def __iter__(self): i += 1 g = self.get(i) - def _wrap_result(self, result, use_codes=True, - name=None, expand=None, fill_value=np.nan): + def _wrap_result( + self, result, use_codes=True, name=None, expand=None, fill_value=np.nan + ): from pandas import Index, Series, MultiIndex @@ -1962,10 +1991,11 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d(result, Series(self._orig, copy=False).cat.codes, - fill_value=fill_value) + result = take_1d( + result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value + ) - if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): return result assert result.ndim < 3 @@ -1987,8 +2017,9 @@ def cons_row(x): if result: # propagate nan values to match longest sequence (GH 18450) max_len = max(len(x) for x in result) - result = [x * max_len if len(x) == 0 or x[0] is np.nan - else x for x in result] + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -1997,7 +2028,7 @@ def cons_row(x): # if expand is False, result should have the same name # as the original otherwise specified if name is None: - name = getattr(result, 'name', None) + name = getattr(result, "name", None) if name is None: # do not use logical or, _orig may be a DataFrame # which has "name" column @@ -2058,9 +2089,11 @@ def _get_series_list(self, others, ignore_index=False): # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, Index) else self._orig.index - err_msg = ('others must be Series, Index, DataFrame, np.ndarray or ' - 'list-like (either containing only strings or containing ' - 'only objects of type Series/Index/list-like/np.ndarray)') + err_msg = ( + "others must be Series, Index, DataFrame, np.ndarray or " + "list-like (either containing only strings or containing " + "only objects of type Series/Index/list-like/np.ndarray)" + ) # Generally speaking, all objects without an index inherit the index # `idx` of the calling Series/Index - i.e. must have matching length. @@ -2069,13 +2102,13 @@ def _get_series_list(self, others, ignore_index=False): if isinstance(others, Series): warn = not others.index.equals(idx) # only reconstruct Series when absolutely necessary - los = [Series(others.values, index=idx) - if ignore_index and warn else others] + los = [ + Series(others.values, index=idx) if ignore_index and warn else others + ] return (los, warn) elif isinstance(others, Index): warn = not others.equals(idx) - los = [Series(others.values, - index=(idx if ignore_index else others))] + los = [Series(others.values, index=(idx if ignore_index else others))] return (los, warn) elif isinstance(others, DataFrame): warn = not others.index.equals(idx) @@ -2105,45 +2138,51 @@ def _get_series_list(self, others, ignore_index=False): # GH 21950 - DeprecationWarning # only allowing Series/Index/np.ndarray[1-dim] will greatly # simply this function post-deprecation. 
- if not (isinstance(nxt, (Series, Index)) or - (isinstance(nxt, np.ndarray) and nxt.ndim == 1)): + if not ( + isinstance(nxt, (Series, Index)) + or (isinstance(nxt, np.ndarray) and nxt.ndim == 1) + ): depr_warn = True - if not isinstance(nxt, (DataFrame, Series, - Index, np.ndarray)): + if not isinstance(nxt, (DataFrame, Series, Index, np.ndarray)): # safety for non-persistent list-likes (e.g. iterators) # do not map indexed/typed objects; info needed below nxt = list(nxt) # known types for which we can avoid deep inspection - no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1) - or isinstance(nxt, (Series, Index))) + no_deep = ( + isinstance(nxt, np.ndarray) and nxt.ndim == 1 + ) or isinstance(nxt, (Series, Index)) # nested list-likes are forbidden: # -> elements of nxt must not be list-like - is_legal = ((no_deep and nxt.dtype == object) - or all(not is_list_like(x) for x in nxt)) + is_legal = (no_deep and nxt.dtype == object) or all( + not is_list_like(x) for x in nxt + ) # DataFrame is false positive of is_legal # because "x in df" returns column names if not is_legal or isinstance(nxt, DataFrame): raise TypeError(err_msg) - nxt, wnx = self._get_series_list(nxt, - ignore_index=ignore_index) + nxt, wnx = self._get_series_list(nxt, ignore_index=ignore_index) los = los + nxt join_warn = join_warn or wnx if depr_warn: - warnings.warn('list-likes other than Series, Index, or ' - 'np.ndarray WITHIN another list-like are ' - 'deprecated and will be removed in a future ' - 'version.', FutureWarning, stacklevel=4) + warnings.warn( + "list-likes other than Series, Index, or " + "np.ndarray WITHIN another list-like are " + "deprecated and will be removed in a future " + "version.", + FutureWarning, + stacklevel=4, + ) return (los, join_warn) elif all(not is_list_like(x) for x in others): return ([Series(others, index=idx)], False) raise TypeError(err_msg) - @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer']) + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) def cat(self, others=None, sep=None, na_rep=None, join=None): """ Concatenate strings in the Series/Index with given separator. @@ -2284,7 +2323,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): if isinstance(others, str): raise ValueError("Did you mean to supply a `sep` keyword?") if sep is None: - sep = '' + sep = "" if isinstance(self._orig, Index): data = Series(self._orig, index=self._orig) @@ -2303,38 +2342,50 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): try: # turn anything in "others" into lists of Series - others, warn = self._get_series_list(others, - ignore_index=(join is None)) + others, warn = self._get_series_list(others, ignore_index=(join is None)) except ValueError: # do not catch TypeError raised by _get_series_list if join is None: - raise ValueError('All arrays must be same length, except ' - 'those having an index if `join` is not None') + raise ValueError( + "All arrays must be same length, except " + "those having an index if `join` is not None" + ) else: - raise ValueError('If `others` contains arrays or lists (or ' - 'other list-likes without an index), these ' - 'must all be of the same length as the ' - 'calling Series/Index.') + raise ValueError( + "If `others` contains arrays or lists (or " + "other list-likes without an index), these " + "must all be of the same length as the " + "calling Series/Index." 
+ ) if join is None and warn: - warnings.warn("A future version of pandas will perform index " - "alignment when `others` is a Series/Index/" - "DataFrame (or a list-like containing one). To " - "disable alignment (the behavior before v.0.23) and " - "silence this warning, use `.values` on any Series/" - "Index/DataFrame in `others`. To enable alignment " - "and silence this warning, pass `join='left'|" - "'outer'|'inner'|'right'`. The future default will " - "be `join='left'`.", FutureWarning, stacklevel=3) + warnings.warn( + "A future version of pandas will perform index " + "alignment when `others` is a Series/Index/" + "DataFrame (or a list-like containing one). To " + "disable alignment (the behavior before v.0.23) and " + "silence this warning, use `.values` on any Series/" + "Index/DataFrame in `others`. To enable alignment " + "and silence this warning, pass `join='left'|" + "'outer'|'inner'|'right'`. The future default will " + "be `join='left'`.", + FutureWarning, + stacklevel=3, + ) # if join is None, _get_series_list already force-aligned indexes - join = 'left' if join is None else join + join = "left" if join is None else join # align if required if any(not data.index.equals(x.index) for x in others): # Need to add keys for uniqueness in case of duplicate columns - others = concat(others, axis=1, - join=(join if join == 'inner' else 'outer'), - keys=range(len(others)), sort=False, copy=False) + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) data, others = data.align(others, join=join) others = [others[x] for x in others] # again list of Series @@ -2349,12 +2400,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): np.putmask(result, union_mask, np.nan) not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], - sep) + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) elif na_rep is not None and union_mask.any(): # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [np.where(nm, na_rep, col) - for nm, col in zip(na_masks, all_cols)] + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] result = cat_safe(all_cols, sep) else: # no NaNs - can just concatenate @@ -2364,11 +2415,14 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): # add dtype for case that result is all-NA result = Index(result, dtype=object, name=self._orig.name) else: # Series - result = Series(result, dtype=object, index=data.index, - name=self._orig.name) + result = Series( + result, dtype=object, index=data.index, name=self._orig.name + ) return result - _shared_docs['str_split'] = (r""" + _shared_docs[ + "str_split" + ] = r""" Split strings around given separator/delimiter. 
Splits the string in the Series/Index from the %(side)s, @@ -2496,25 +2550,23 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): >>> s.str.split(r"\+|=", expand=True) 0 1 2 0 1 1 2 - """) + """ - @Appender(_shared_docs['str_split'] % { - 'side': 'beginning', - 'method': 'split'}) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) + @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): result = str_split(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) - @Appender(_shared_docs['str_split'] % { - 'side': 'end', - 'method': 'rsplit'}) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) + @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): result = str_rsplit(self._parent, pat, n=n) return self._wrap_result(result, expand=expand) - _shared_docs['str_partition'] = (""" + _shared_docs[ + "str_partition" + ] = """ Split the string at the %(side)s occurrence of `sep`. This method splits the string at the %(side)s occurrence of `sep`, @@ -2595,32 +2647,36 @@ def rsplit(self, pat=None, n=-1, expand=False): >>> idx.str.partition(expand=False) Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """) - - @Appender(_shared_docs['str_partition'] % { - 'side': 'first', - 'return': '3 elements containing the string itself, followed by two ' - 'empty strings', - 'also': 'rpartition : Split the string at the last occurrence of ' - '`sep`.' - }) - @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') - @forbid_nonstring_types(['bytes']) - def partition(self, sep=' ', expand=True): + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of " "`sep`.", + } + ) + @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") + @forbid_nonstring_types(["bytes"]) + def partition(self, sep=" ", expand=True): f = lambda x: x.partition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) - @Appender(_shared_docs['str_partition'] % { - 'side': 'last', - 'return': '3 elements containing two empty strings, followed by the ' - 'string itself', - 'also': 'partition : Split the string at the first occurrence of ' - '`sep`.' 
- }) - @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep') - @forbid_nonstring_types(['bytes']) - def rpartition(self, sep=' ', expand=True): + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of " "`sep`.", + } + ) + @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep") + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep=" ", expand=True): f = lambda x: x.rpartition(sep) result = _na_map(f, self._parent) return self._wrap_result(result, expand=expand) @@ -2631,44 +2687,48 @@ def get(self, i): return self._wrap_result(result) @copy(str_join) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def join(self, sep): result = str_join(self._parent, sep) return self._wrap_result(result) @copy(str_contains) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains(self._parent, pat, case=case, flags=flags, na=na, - regex=regex) + result = str_contains( + self._parent, pat, case=case, flags=flags, na=na, regex=regex + ) return self._wrap_result(result, fill_value=na) @copy(str_match) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na) @copy(str_replace) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace(self._parent, pat, repl, n=n, case=case, - flags=flags, regex=regex) + result = str_replace( + self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex + ) return self._wrap_result(result) @copy(str_repeat) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def repeat(self, repeats): result = str_repeat(self._parent, repeats) return self._wrap_result(result) @copy(str_pad) - @forbid_nonstring_types(['bytes']) - def pad(self, width, side='left', fillchar=' '): + @forbid_nonstring_types(["bytes"]) + def pad(self, width, side="left", fillchar=" "): result = str_pad(self._parent, width, side=side, fillchar=fillchar) return self._wrap_result(result) - _shared_docs['str_pad'] = (""" + _shared_docs[ + "str_pad" + ] = """ Filling %(side)s side of strings in the Series/Index with an additional character. Equivalent to :meth:`str.%(method)s`. 
@@ -2683,25 +2743,24 @@ def pad(self, width, side='left', fillchar=' '): Returns ------- filled : Series/Index of objects - """) + """ - @Appender(_shared_docs['str_pad'] % dict(side='left and right', - method='center')) - @forbid_nonstring_types(['bytes']) - def center(self, width, fillchar=' '): - return self.pad(width, side='both', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) + @forbid_nonstring_types(["bytes"]) + def center(self, width, fillchar=" "): + return self.pad(width, side="both", fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust')) - @forbid_nonstring_types(['bytes']) - def ljust(self, width, fillchar=' '): - return self.pad(width, side='right', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width, fillchar=" "): + return self.pad(width, side="right", fillchar=fillchar) - @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust')) - @forbid_nonstring_types(['bytes']) - def rjust(self, width, fillchar=' '): - return self.pad(width, side='left', fillchar=fillchar) + @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width, fillchar=" "): + return self.pad(width, side="left", fillchar=fillchar) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def zfill(self, width): """ Pad strings in the Series/Index by prepending '0' characters. @@ -2762,7 +2821,7 @@ def zfill(self, width): 4 NaN dtype: object """ - result = str_pad(self._parent, width, side='left', fillchar='0') + result = str_pad(self._parent, width, side="left", fillchar="0") return self._wrap_result(result) @copy(str_slice) @@ -2771,7 +2830,7 @@ def slice(self, start=None, stop=None, step=None): return self._wrap_result(result) @copy(str_slice_replace) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def slice_replace(self, start=None, stop=None, repl=None): result = str_slice_replace(self._parent, start, stop, repl) return self._wrap_result(result) @@ -2783,12 +2842,14 @@ def decode(self, encoding, errors="strict"): return self._wrap_result(result) @copy(str_encode) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def encode(self, encoding, errors="strict"): result = str_encode(self._parent, encoding, errors) return self._wrap_result(result) - _shared_docs['str_strip'] = (r""" + _shared_docs[ + "str_strip" + ] = r""" Remove leading and trailing characters. 
Strip whitespaces (including newlines) or a set of specified characters @@ -2849,67 +2910,69 @@ def encode(self, encoding, errors="strict"): 2 Cat 3 NaN dtype: object - """) + """ - @Appender(_shared_docs['str_strip'] % dict(side='left and right sides', - method='strip')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["str_strip"] % dict(side="left and right sides", method="strip") + ) + @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='both') + result = str_strip(self._parent, to_strip, side="both") return self._wrap_result(result) - @Appender(_shared_docs['str_strip'] % dict(side='left side', - method='lstrip')) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip")) + @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='left') + result = str_strip(self._parent, to_strip, side="left") return self._wrap_result(result) - @Appender(_shared_docs['str_strip'] % dict(side='right side', - method='rstrip')) - @forbid_nonstring_types(['bytes']) + @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip")) + @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side='right') + result = str_strip(self._parent, to_strip, side="right") return self._wrap_result(result) @copy(str_wrap) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def wrap(self, width, **kwargs): result = str_wrap(self._parent, width, **kwargs) return self._wrap_result(result) @copy(str_get_dummies) - @forbid_nonstring_types(['bytes']) - def get_dummies(self, sep='|'): + @forbid_nonstring_types(["bytes"]) + def get_dummies(self, sep="|"): # we need to cast to Series of strings as only that has all # methods available for making the dummies... data = self._orig.astype(str) if self._is_categorical else self._parent result, name = str_get_dummies(data, sep) - return self._wrap_result(result, use_codes=(not self._is_categorical), - name=name, expand=True) + return self._wrap_result( + result, use_codes=(not self._is_categorical), name=name, expand=True + ) @copy(str_translate) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def translate(self, table): result = str_translate(self._parent, table) return self._wrap_result(result) - count = _pat_wrapper(str_count, flags=True, name='count') - startswith = _pat_wrapper(str_startswith, na=True, name='startswith') - endswith = _pat_wrapper(str_endswith, na=True, name='endswith') - findall = _pat_wrapper(str_findall, flags=True, name='findall') + count = _pat_wrapper(str_count, flags=True, name="count") + startswith = _pat_wrapper(str_startswith, na=True, name="startswith") + endswith = _pat_wrapper(str_endswith, na=True, name="endswith") + findall = _pat_wrapper(str_findall, flags=True, name="findall") @copy(str_extract) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def extract(self, pat, flags=0, expand=True): return str_extract(self, pat, flags=flags, expand=expand) @copy(str_extractall) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): return str_extractall(self._orig, pat, flags=flags) - _shared_docs['find'] = (""" + _shared_docs[ + "find" + ] = """ Return %(side)s indexes in each strings in the Series/Index where the substring is fully contained between [start:end]. 
Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`. @@ -2930,26 +2993,35 @@ def extractall(self, pat, flags=0): See Also -------- %(also)s - """) + """ - @Appender(_shared_docs['find'] % - dict(side='lowest', method='find', - also='rfind : Return highest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["find"] + % dict( + side="lowest", + method="find", + also="rfind : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side='left') + result = str_find(self._parent, sub, start=start, end=end, side="left") return self._wrap_result(result) - @Appender(_shared_docs['find'] % - dict(side='highest', method='rfind', - also='find : Return lowest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["find"] + % dict( + side="highest", + method="rfind", + also="find : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, - start=start, end=end, side='right') + result = str_find(self._parent, sub, start=start, end=end, side="right") return self._wrap_result(result) - @forbid_nonstring_types(['bytes']) + @forbid_nonstring_types(["bytes"]) def normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. @@ -2966,11 +3038,14 @@ def normalize(self, form): normalized : Series/Index of objects """ import unicodedata + f = lambda x: unicodedata.normalize(form, x) result = _na_map(f, self._parent) return self._wrap_result(result) - _shared_docs['index'] = (""" + _shared_docs[ + "index" + ] = """ Return %(side)s indexes in each strings where the substring is fully contained between [start:end]. This is the same as ``str.%(similar)s`` except instead of returning -1, it raises a ValueError @@ -2992,27 +3067,39 @@ def normalize(self, form): See Also -------- %(also)s - """) + """ - @Appender(_shared_docs['index'] % - dict(side='lowest', similar='find', method='index', - also='rindex : Return highest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["index"] + % dict( + side="lowest", + similar="find", + method="index", + also="rindex : Return highest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, - start=start, end=end, side='left') + result = str_index(self._parent, sub, start=start, end=end, side="left") return self._wrap_result(result) - @Appender(_shared_docs['index'] % - dict(side='highest', similar='rfind', method='rindex', - also='index : Return lowest indexes in each strings.')) - @forbid_nonstring_types(['bytes']) + @Appender( + _shared_docs["index"] + % dict( + side="highest", + similar="rfind", + method="rindex", + also="index : Return lowest indexes in each strings.", + ) + ) + @forbid_nonstring_types(["bytes"]) def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, - start=start, end=end, side='right') + result = str_index(self._parent, sub, start=start, end=end, side="right") return self._wrap_result(result) - _shared_docs['len'] = (""" + _shared_docs[ + "len" + ] = """ Compute the length of each element in the Series/Index. The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary). 
@@ -3055,11 +3142,14 @@ def rindex(self, sub, start=0, end=None): 4 4.0 5 3.0 dtype: float64 - """) - len = _noarg_wrapper(len, docstring=_shared_docs['len'], - forbidden_types=None, dtype=int) + """ + len = _noarg_wrapper( + len, docstring=_shared_docs["len"], forbidden_types=None, dtype=int + ) - _shared_docs['casemethods'] = (""" + _shared_docs[ + "casemethods" + ] = """ Convert strings in the Series/Index to %(type)s. %(version)s Equivalent to :meth:`str.%(method)s`. @@ -3124,45 +3214,56 @@ def rindex(self, sub, start=0, end=None): 2 THIS IS A SENTENCE 3 sWaPcAsE dtype: object - """) + """ # _doc_args holds dict of strings to use in substituting casemethod docs _doc_args = {} # type: Dict[str, Dict[str, str]] - _doc_args['lower'] = dict(type='lowercase', method='lower', version='') - _doc_args['upper'] = dict(type='uppercase', method='upper', version='') - _doc_args['title'] = dict(type='titlecase', method='title', version='') - _doc_args['capitalize'] = dict(type='be capitalized', method='capitalize', - version='') - _doc_args['swapcase'] = dict(type='be swapcased', method='swapcase', - version='') - _doc_args['casefold'] = dict(type='be casefolded', method='casefold', - version='\n .. versionadded:: 0.25.0\n') - lower = _noarg_wrapper(lambda x: x.lower(), - name='lower', - docstring=_shared_docs['casemethods'] % - _doc_args['lower']) - upper = _noarg_wrapper(lambda x: x.upper(), - name='upper', - docstring=_shared_docs['casemethods'] % - _doc_args['upper']) - title = _noarg_wrapper(lambda x: x.title(), - name='title', - docstring=_shared_docs['casemethods'] % - _doc_args['title']) - capitalize = _noarg_wrapper(lambda x: x.capitalize(), - name='capitalize', - docstring=_shared_docs['casemethods'] % - _doc_args['capitalize']) - swapcase = _noarg_wrapper(lambda x: x.swapcase(), - name='swapcase', - docstring=_shared_docs['casemethods'] % - _doc_args['swapcase']) - casefold = _noarg_wrapper(lambda x: x.casefold(), - name='casefold', - docstring=_shared_docs['casemethods'] % - _doc_args['casefold']) - - _shared_docs['ismethods'] = (""" + _doc_args["lower"] = dict(type="lowercase", method="lower", version="") + _doc_args["upper"] = dict(type="uppercase", method="upper", version="") + _doc_args["title"] = dict(type="titlecase", method="title", version="") + _doc_args["capitalize"] = dict( + type="be capitalized", method="capitalize", version="" + ) + _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") + _doc_args["casefold"] = dict( + type="be casefolded", + method="casefold", + version="\n .. versionadded:: 0.25.0\n", + ) + lower = _noarg_wrapper( + lambda x: x.lower(), + name="lower", + docstring=_shared_docs["casemethods"] % _doc_args["lower"], + ) + upper = _noarg_wrapper( + lambda x: x.upper(), + name="upper", + docstring=_shared_docs["casemethods"] % _doc_args["upper"], + ) + title = _noarg_wrapper( + lambda x: x.title(), + name="title", + docstring=_shared_docs["casemethods"] % _doc_args["title"], + ) + capitalize = _noarg_wrapper( + lambda x: x.capitalize(), + name="capitalize", + docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], + ) + swapcase = _noarg_wrapper( + lambda x: x.swapcase(), + name="swapcase", + docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], + ) + casefold = _noarg_wrapper( + lambda x: x.casefold(), + name="casefold", + docstring=_shared_docs["casemethods"] % _doc_args["casefold"], + ) + + _shared_docs[ + "ismethods" + ] = """ Check whether all characters in each string are %(type)s. 
This is equivalent to running the Python string method @@ -3301,52 +3402,61 @@ def rindex(self, sub, start=0, end=None): 2 False 3 False dtype: bool - """) - _doc_args['isalnum'] = dict(type='alphanumeric', method='isalnum') - _doc_args['isalpha'] = dict(type='alphabetic', method='isalpha') - _doc_args['isdigit'] = dict(type='digits', method='isdigit') - _doc_args['isspace'] = dict(type='whitespace', method='isspace') - _doc_args['islower'] = dict(type='lowercase', method='islower') - _doc_args['isupper'] = dict(type='uppercase', method='isupper') - _doc_args['istitle'] = dict(type='titlecase', method='istitle') - _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric') - _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal') - isalnum = _noarg_wrapper(lambda x: x.isalnum(), - name='isalnum', - docstring=_shared_docs['ismethods'] % - _doc_args['isalnum']) - isalpha = _noarg_wrapper(lambda x: x.isalpha(), - name='isalpha', - docstring=_shared_docs['ismethods'] % - _doc_args['isalpha']) - isdigit = _noarg_wrapper(lambda x: x.isdigit(), - name='isdigit', - docstring=_shared_docs['ismethods'] % - _doc_args['isdigit']) - isspace = _noarg_wrapper(lambda x: x.isspace(), - name='isspace', - docstring=_shared_docs['ismethods'] % - _doc_args['isspace']) - islower = _noarg_wrapper(lambda x: x.islower(), - name='islower', - docstring=_shared_docs['ismethods'] % - _doc_args['islower']) - isupper = _noarg_wrapper(lambda x: x.isupper(), - name='isupper', - docstring=_shared_docs['ismethods'] % - _doc_args['isupper']) - istitle = _noarg_wrapper(lambda x: x.istitle(), - name='istitle', - docstring=_shared_docs['ismethods'] % - _doc_args['istitle']) - isnumeric = _noarg_wrapper(lambda x: x.isnumeric(), - name='isnumeric', - docstring=_shared_docs['ismethods'] % - _doc_args['isnumeric']) - isdecimal = _noarg_wrapper(lambda x: x.isdecimal(), - name='isdecimal', - docstring=_shared_docs['ismethods'] % - _doc_args['isdecimal']) + """ + _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") + _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") + _doc_args["isdigit"] = dict(type="digits", method="isdigit") + _doc_args["isspace"] = dict(type="whitespace", method="isspace") + _doc_args["islower"] = dict(type="lowercase", method="islower") + _doc_args["isupper"] = dict(type="uppercase", method="isupper") + _doc_args["istitle"] = dict(type="titlecase", method="istitle") + _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") + _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") + isalnum = _noarg_wrapper( + lambda x: x.isalnum(), + name="isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], + ) + isalpha = _noarg_wrapper( + lambda x: x.isalpha(), + name="isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], + ) + isdigit = _noarg_wrapper( + lambda x: x.isdigit(), + name="isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], + ) + isspace = _noarg_wrapper( + lambda x: x.isspace(), + name="isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"], + ) + islower = _noarg_wrapper( + lambda x: x.islower(), + name="islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"], + ) + isupper = _noarg_wrapper( + lambda x: x.isupper(), + name="isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"], + ) + istitle = _noarg_wrapper( + lambda x: x.istitle(), + name="istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"], + ) + isnumeric = _noarg_wrapper( 
+ lambda x: x.isnumeric(), + name="isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], + ) + isdecimal = _noarg_wrapper( + lambda x: x.isdecimal(), + name="isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], + ) @classmethod def _make_accessor(cls, data): diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3e3318ed4c4b6..e9d2c3f07bfae 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -8,16 +8,33 @@ from pandas._libs import tslib, tslibs from pandas._libs.tslibs import Timestamp, conversion, parsing from pandas._libs.tslibs.parsing import ( # noqa - DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string) + DateParseError, + _format_is_iso, + _guess_datetime_format, + parse_time_string, +) from pandas._libs.tslibs.strptime import array_strptime from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype, - is_list_like, is_numeric_dtype, is_scalar) + ensure_object, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries) + ABCDataFrame, + ABCDatetimeIndex, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import notna from pandas._typing import ArrayLike @@ -35,9 +52,10 @@ # types used in annotations Scalar = Union[int, float, str] -DatetimeScalar = TypeVar('DatetimeScalar', Scalar, datetime) -DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, list, tuple, - ArrayLike, ABCSeries] +DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime) +DatetimeScalarOrArrayConvertible = Union[ + DatetimeScalar, list, tuple, ArrayLike, ABCSeries +] # --------------------------------------------------------------------- @@ -50,8 +68,9 @@ def _guess_datetime_format_for_array(arr, **kwargs): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) -def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, - check_count: Optional[int] = None) -> bool: +def should_cache( + arg: ArrayConvertible, unique_share: float = 0.7, check_count: Optional[int] = None +) -> bool: """ Decides whether to do caching. @@ -91,12 +110,13 @@ def should_cache(arg: ArrayConvertible, unique_share: float = 0.7, else: check_count = 500 else: - assert 0 <= check_count <= len(arg), \ - 'check_count must be in next bounds: [0; len(arg)]' + assert ( + 0 <= check_count <= len(arg) + ), "check_count must be in next bounds: [0; len(arg)]" if check_count == 0: return False - assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)' + assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)" unique_elements = unique(arg[:check_count]) if len(unique_elements) > check_count * unique_share: @@ -124,6 +144,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): Cache of converted, unique dates. 
Can be empty """ from pandas import Series + cache_array = Series() if cache: # Perform a quicker unique check @@ -138,9 +159,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): def _box_as_indexlike( - dt_array: ArrayLike, - utc: Optional[bool] = None, - name: Optional[str] = None + dt_array: ArrayLike, utc: Optional[bool] = None, name: Optional[str] = None ) -> Union[ABCIndex, ABCDatetimeIndex]: """ Properly boxes the ndarray of datetimes to DatetimeIndex @@ -162,8 +181,9 @@ def _box_as_indexlike( - general Index otherwise """ from pandas import DatetimeIndex, Index + if is_datetime64_dtype(dt_array): - tz = 'utc' if utc else None + tz = "utc" if utc else None return DatetimeIndex(dt_array, tz=tz, name=name) return Index(dt_array, name=name) @@ -172,7 +192,7 @@ def _convert_and_box_cache( arg: DatetimeScalarOrArrayConvertible, cache_array: ABCSeries, box: bool, - name: Optional[str] = None + name: Optional[str] = None, ) -> Union[ABCIndex, np.ndarray]: """ Convert array of dates with a cache and box the result @@ -194,6 +214,7 @@ def _convert_and_box_cache( - ndarray if box=False """ from pandas import Series + result = Series(arg).map(cache_array) if box: return _box_as_indexlike(result, utc=None, name=name) @@ -226,21 +247,34 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name): - ndarray of Timestamps if box=False """ if tz is not None: - raise ValueError("Cannot pass a tz argument when " - "parsing strings with timezone " - "information.") - tz_results = np.array([Timestamp(res).tz_localize(zone) for res, zone - in zip(result, timezones)]) + raise ValueError( + "Cannot pass a tz argument when " + "parsing strings with timezone " + "information." + ) + tz_results = np.array( + [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] + ) if box: from pandas import Index + return Index(tz_results, name=name) return tz_results -def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - unit=None, errors=None, - infer_datetime_format=None, dayfirst=None, - yearfirst=None, exact=None): +def _convert_listlike_datetimes( + arg, + box, + format, + name=None, + tz=None, + unit=None, + errors=None, + infer_datetime_format=None, + dayfirst=None, + yearfirst=None, + exact=None, +): """ Helper function for to_datetime. 
Performs the conversions of 1D listlike of dates @@ -279,16 +313,18 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, from pandas import DatetimeIndex from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import ( - maybe_convert_dtype, objects_to_datetime64ns) + maybe_convert_dtype, + objects_to_datetime64ns, + ) if isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') + arg = np.array(arg, dtype="O") # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) - if tz == 'utc': + if tz == "utc": arg = arg.tz_convert(None).tz_localize(tz) return arg @@ -304,12 +340,12 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, 'values', arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, - errors=errors) + arg = getattr(arg, "values", arg) + result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: - if errors == 'ignore': + if errors == "ignore": from pandas import Index + result = Index(result, name=name) else: result = DatetimeIndex(result, name=name) @@ -317,7 +353,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) # result will be naive but in UTC try: - result = result.tz_localize('UTC').tz_convert(tz_parsed) + result = result.tz_localize("UTC").tz_convert(tz_parsed) except AttributeError: # Regular Index from 'ignore' path return result @@ -327,9 +363,10 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, else: result = result.tz_convert(tz) return result - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, datetime, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, " "1-d array, or Series" + ) # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation @@ -358,30 +395,33 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if format is not None: try: # shortcut formatting here - if format == '%Y%m%d': + if format == "%Y%m%d": try: # pass orig_arg as float-dtype may have been converted to # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): - raise ValueError("cannot convert the input to " - "'%Y%m%d' date format") + raise ValueError( + "cannot convert the input to " "'%Y%m%d' date format" + ) # fallback if result is None: try: result, timezones = array_strptime( - arg, format, exact=exact, errors=errors) - if '%Z' in format or '%z' in format: + arg, format, exact=exact, errors=errors + ) + if "%Z" in format or "%z" in format: return _return_parsed_timezone_results( - result, timezones, box, tz, name) + result, timezones, box, tz, name + ) except tslibs.OutOfBoundsDatetime: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'coerce': - result = np.empty(arg.shape, dtype='M8[ns]') - iresult = result.view('i8') + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult.fill(tslibs.iNaT) else: result = arg @@ -390,11 +430,11 @@ def _convert_listlike_datetimes(arg, box, format, 
name=None, tz=None, # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'coerce': - result = np.empty(arg.shape, dtype='M8[ns]') - iresult = result.view('i8') + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult.fill(tslibs.iNaT) else: result = arg @@ -409,27 +449,30 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if result is None: assert format is None or infer_datetime_format - utc = tz == 'utc' + utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( - arg, dayfirst=dayfirst, yearfirst=yearfirst, - utc=utc, errors=errors, require_iso8601=require_iso8601, - allow_object=True) + arg, + dayfirst=dayfirst, + yearfirst=yearfirst, + utc=utc, + errors=errors, + require_iso8601=require_iso8601, + allow_object=True, + ) if tz_parsed is not None: if box: # We can take a shortcut since the datetime64 numpy array # is in UTC - return DatetimeIndex._simple_new(result, name=name, - tz=tz_parsed) + return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) else: # Convert the datetime64 numpy array to an numpy array # of datetime objects - result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() - for ts in result] + result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() for ts in result] return np.array(result, dtype=object) if box: - utc = tz == 'utc' + utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name) return result @@ -452,16 +495,15 @@ def _adjust_to_origin(arg, origin, unit): ------- ndarray or scalar of adjusted date(s) """ - if origin == 'julian': + if origin == "julian": original = arg j0 = Timestamp(0).to_julian_date() - if unit != 'D': + if unit != "D": raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except TypeError: - raise ValueError("incompatible 'arg' type for given " - "'origin'='julian'") + raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # preemptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 @@ -469,30 +511,36 @@ def _adjust_to_origin(arg, origin, unit): if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( "{original} is Out of Bounds for " - "origin='julian'".format(original=original)) + "origin='julian'".format(original=original) + ) else: # arg must be numeric - if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or - is_numeric_dtype(np.asarray(arg))): + if not ( + (is_scalar(arg) and (is_integer(arg) or is_float(arg))) + or is_numeric_dtype(np.asarray(arg)) + ): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( - arg=arg, - origin=origin)) + arg=arg, origin=origin + ) + ) # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: raise tslibs.OutOfBoundsDatetime( - "origin {origin} is Out of Bounds".format(origin=origin)) + "origin {origin} is Out of Bounds".format(origin=origin) + ) except ValueError: - raise ValueError("origin {origin} cannot be converted " - "to a Timestamp".format(origin=origin)) + raise ValueError( + "origin {origin} cannot be converted " + "to a Timestamp".format(origin=origin) + ) if offset.tz is not None: - raise ValueError( - "origin offset {} must be tz-naive".format(offset)) + raise ValueError("origin offset {} must be tz-naive".format(offset)) offset -= Timestamp(0) # 
convert the offset to the unit of the arg @@ -501,17 +549,28 @@ def _adjust_to_origin(arg, origin, unit): # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( - arg, (ABCSeries, ABCIndexClass, np.ndarray)): + arg, (ABCSeries, ABCIndexClass, np.ndarray) + ): arg = np.asarray(arg) arg = arg + offset return arg -@deprecate_kwarg(old_arg_name='box', new_arg_name=None) -def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, infer_datetime_format=False, origin='unix', - cache=True): +@deprecate_kwarg(old_arg_name="box", new_arg_name=None) +def to_datetime( + arg, + errors="raise", + dayfirst=False, + yearfirst=False, + utc=None, + box=True, + format=None, + exact=True, + unit=None, + infer_datetime_format=False, + origin="unix", + cache=True, +): """ Convert argument to datetime. @@ -686,14 +745,20 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, if arg is None: return None - if origin != 'unix': + if origin != "unix": arg = _adjust_to_origin(arg, origin, unit) - tz = 'utc' if utc else None - convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit, - dayfirst=dayfirst, yearfirst=yearfirst, - errors=errors, exact=exact, - infer_datetime_format=infer_datetime_format) + tz = "utc" if utc else None + convert_listlike = partial( + _convert_listlike_datetimes, + tz=tz, + unit=unit, + dayfirst=dayfirst, + yearfirst=yearfirst, + errors=errors, + exact=exact, + infer_datetime_format=infer_datetime_format, + ) if isinstance(arg, Timestamp): result = arg @@ -714,8 +779,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: - result = _convert_and_box_cache(arg, cache_array, box, - name=arg.name) + result = _convert_and_box_cache(arg, cache_array, box, name=arg.name) else: convert_listlike = partial(convert_listlike, name=arg.name) result = convert_listlike(arg, box, format) @@ -732,28 +796,29 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, # mappings for assembling units -_unit_map = {'year': 'year', - 'years': 'year', - 'month': 'month', - 'months': 'month', - 'day': 'day', - 'days': 'day', - 'hour': 'h', - 'hours': 'h', - 'minute': 'm', - 'minutes': 'm', - 'second': 's', - 'seconds': 's', - 'ms': 'ms', - 'millisecond': 'ms', - 'milliseconds': 'ms', - 'us': 'us', - 'microsecond': 'us', - 'microseconds': 'us', - 'ns': 'ns', - 'nanosecond': 'ns', - 'nanoseconds': 'ns' - } +_unit_map = { + "year": "year", + "years": "year", + "month": "month", + "months": "month", + "day": "day", + "days": "day", + "hour": "h", + "hours": "h", + "minute": "m", + "minutes": "m", + "second": "s", + "seconds": "s", + "ms": "ms", + "millisecond": "ms", + "milliseconds": "ms", + "us": "us", + "microsecond": "us", + "microseconds": "us", + "ns": "ns", + "nanosecond": "ns", + "nanoseconds": "ns", +} def _assemble_from_unit_mappings(arg, errors, box, tz): @@ -780,6 +845,7 @@ def _assemble_from_unit_mappings(arg, errors, box, tz): Series """ from pandas import to_timedelta, to_numeric, DataFrame + arg = DataFrame(arg) if not arg.columns.is_unique: raise ValueError("cannot assemble with duplicate keys") @@ -799,19 +865,23 @@ def f(value): unit_rev = {v: k for k, v in unit.items()} # we require at least Ymd - required = ['year', 'month', 'day'] + required = ["year", "month", "day"] req = sorted(list(set(required) - 
set(unit_rev.keys()))) if len(req): - raise ValueError("to assemble mappings requires at least that " - "[year, month, day] be specified: [{required}] " - "is missing".format(required=','.join(req))) + raise ValueError( + "to assemble mappings requires at least that " + "[year, month, day] be specified: [{required}] " + "is missing".format(required=",".join(req)) + ) # keys we don't recognize excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values()))) if len(excess): - raise ValueError("extra keys have been passed " - "to the datetime assemblage: " - "[{excess}]".format(excess=','.join(excess))) + raise ValueError( + "extra keys have been passed " + "to the datetime assemblage: " + "[{excess}]".format(excess=",".join(excess)) + ) def coerce(values): # we allow coercion to if errors allows @@ -819,28 +889,29 @@ def coerce(values): # prevent overflow in case of int8 or int16 if is_integer_dtype(values): - values = values.astype('int64', copy=False) + values = values.astype("int64", copy=False) return values - values = (coerce(arg[unit_rev['year']]) * 10000 + - coerce(arg[unit_rev['month']]) * 100 + - coerce(arg[unit_rev['day']])) + values = ( + coerce(arg[unit_rev["year"]]) * 10000 + + coerce(arg[unit_rev["month"]]) * 100 + + coerce(arg[unit_rev["day"]]) + ) try: - values = to_datetime(values, format='%Y%m%d', errors=errors, utc=tz) + values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz) except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the " - "datetimes: {error}".format(error=e)) + raise ValueError("cannot assemble the " "datetimes: {error}".format(error=e)) - for u in ['h', 'm', 's', 'ms', 'us', 'ns']: + for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: try: - values += to_timedelta(coerce(arg[value]), - unit=u, - errors=errors) + values += to_timedelta(coerce(arg[value]), unit=u, errors=errors) except (TypeError, ValueError) as e: - raise ValueError("cannot assemble the datetimes [{value}]: " - "{error}".format(value=value, error=e)) + raise ValueError( + "cannot assemble the datetimes [{value}]: " + "{error}".format(value=value, error=e) + ) if not box: return values.values return values @@ -861,18 +932,18 @@ def _attempt_YYYYMMDD(arg, errors): def calc(carg): # calculate the actual result carg = carg.astype(object) - parsed = parsing.try_parse_year_month_day(carg / 10000, - carg / 100 % 100, - carg % 100) + parsed = parsing.try_parse_year_month_day( + carg / 10000, carg / 100 % 100, carg % 100 + ) return tslib.array_to_datetime(parsed, errors=errors)[0] def calc_with_mask(carg, mask): - result = np.empty(carg.shape, dtype='M8[ns]') - iresult = result.view('i8') + result = np.empty(carg.shape, dtype="M8[ns]") + iresult = result.view("i8") iresult[~mask] = tslibs.iNaT masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) - result[mask] = masked_result.astype('M8[ns]') + result[mask] = masked_result.astype("M8[ns]") return result # try intlike / strings that are ints @@ -899,8 +970,16 @@ def calc_with_mask(carg, mask): # Fixed time formats for time parsing -_time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p", - "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"] +_time_formats = [ + "%H:%M", + "%H%M", + "%I:%M%p", + "%I%M%p", + "%H:%M:%S", + "%H%M%S", + "%I:%M:%S%p", + "%I%M%S%p", +] def _guess_time_format_for_array(arr): @@ -918,7 +997,7 @@ def _guess_time_format_for_array(arr): return None -def to_time(arg, format=None, infer_time_format=False, errors='raise'): +def 
to_time(arg, format=None, infer_time_format=False, errors="raise"): """ Parse time strings to time objects using fixed strptime formats ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", @@ -949,11 +1028,12 @@ def to_time(arg, format=None, infer_time_format=False, errors='raise'): def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): - arg = np.array(arg, dtype='O') + arg = np.array(arg, dtype="O") - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, datetime, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, datetime, list, tuple, " "1-d array, or Series" + ) arg = ensure_object(arg) @@ -966,12 +1046,13 @@ def _convert_listlike(arg, format): try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): - if errors == 'raise': - msg = ("Cannot convert {element} to a time with given " - "format {format}").format(element=element, - format=format) + if errors == "raise": + msg = ( + "Cannot convert {element} to a time with given " + "format {format}" + ).format(element=element, format=format) raise ValueError(msg) - elif errors == 'ignore': + elif errors == "ignore": return arg else: times.append(None) @@ -982,8 +1063,7 @@ def _convert_listlike(arg, format): time_object = None for time_format in formats: try: - time_object = datetime.strptime(element, - time_format).time() + time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) @@ -995,10 +1075,11 @@ def _convert_listlike(arg, format): if time_object is not None: times.append(time_object) - elif errors == 'raise': - raise ValueError("Cannot convert arg {arg} to " - "a time".format(arg=arg)) - elif errors == 'ignore': + elif errors == "raise": + raise ValueError( + "Cannot convert arg {arg} to " "a time".format(arg=arg) + ) + elif errors == "ignore": return arg else: times.append(None) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index d7a1b1119ce4b..e1a976b874c25 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -4,14 +4,19 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_object, is_datetime_or_timedelta_dtype, is_decimal, is_number, - is_numeric_dtype, is_scalar) + ensure_object, + is_datetime_or_timedelta_dtype, + is_decimal, + is_number, + is_numeric_dtype, + is_scalar, +) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries import pandas as pd -def to_numeric(arg, errors='raise', downcast=None): +def to_numeric(arg, errors="raise", downcast=None): """ Convert argument to a numeric type. 
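The to_numeric hunks above and below only re-wrap quoting and argument layout; behaviour is unchanged. For orientation, a minimal usage sketch of the API being touched (illustrative only, not part of the patch, assuming a pandas build of this vintage):

    import pandas as pd

    # errors="coerce" replaces unparseable entries with NaN instead of raising
    pd.to_numeric(pd.Series(["1", "2", "apple"]), errors="coerce")

    # downcast="integer" picks the smallest integer dtype that can hold the values
    pd.to_numeric(pd.Series([1.0, 2.0, 3.0]), downcast="integer")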
@@ -102,11 +107,11 @@ def to_numeric(arg, errors='raise', downcast=None): 3 -3.0 dtype: float64 """ - if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): - raise ValueError('invalid downcasting method provided') + if downcast not in (None, "integer", "signed", "unsigned", "float"): + raise ValueError("invalid downcasting method provided") - if errors not in ('ignore', 'raise', 'coerce'): - raise ValueError('invalid error value specified') + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("invalid error value specified") is_series = False is_index = False @@ -121,16 +126,16 @@ def to_numeric(arg, errors='raise', downcast=None): if values is None: values = arg.values elif isinstance(arg, (list, tuple)): - values = np.array(arg, dtype='O') + values = np.array(arg, dtype="O") elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True - values = np.array([arg], dtype='O') - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a list, tuple, 1-d array, or Series') + values = np.array([arg], dtype="O") + elif getattr(arg, "ndim", 1) > 1: + raise TypeError("arg must be a list, tuple, 1-d array, or Series") else: values = arg @@ -141,12 +146,13 @@ def to_numeric(arg, errors='raise', downcast=None): values = values.astype(np.int64) else: values = ensure_object(values) - coerce_numeric = errors not in ('ignore', 'raise') - values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=coerce_numeric) + coerce_numeric = errors not in ("ignore", "raise") + values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=coerce_numeric + ) except Exception: - if errors == 'raise': + if errors == "raise": raise # attempt downcast only if the data has been successfully converted @@ -154,12 +160,12 @@ def to_numeric(arg, errors='raise', downcast=None): if downcast is not None and is_numeric_dtype(values): typecodes = None - if downcast in ('integer', 'signed'): - typecodes = np.typecodes['Integer'] - elif downcast == 'unsigned' and np.min(values) >= 0: - typecodes = np.typecodes['UnsignedInteger'] - elif downcast == 'float': - typecodes = np.typecodes['Float'] + if downcast in ("integer", "signed"): + typecodes = np.typecodes["Integer"] + elif downcast == "unsigned" and np.min(values) >= 0: + typecodes = np.typecodes["UnsignedInteger"] + elif downcast == "float": + typecodes = np.typecodes["Float"] # pandas support goes only to np.float32, # as float dtypes smaller than that are diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 5e89b73c8754e..2c594a3df27ea 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -16,8 +16,8 @@ from pandas.core.arrays.timedeltas import sequence_to_td64ns -@deprecate_kwarg(old_arg_name='box', new_arg_name=None) -def to_timedelta(arg, unit='ns', box=True, errors='raise'): +@deprecate_kwarg(old_arg_name="box", new_arg_name=None) +def to_timedelta(arg, unit="ns", box=True, errors="raise"): """ Convert argument to timedelta. 
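The to_timedelta hunks that follow are again purely cosmetic (quote style and call re-wrapping). A short usage sketch for orientation (illustrative only, not part of the patch; note that the box keyword visible in the signature is deprecated):

    import pandas as pd

    # strings and numbers with a unit are both accepted
    pd.to_timedelta("1 days 06:05:01.00003")
    pd.to_timedelta(5, unit="s")

    # errors="coerce" turns unparseable entries into NaT instead of raising
    pd.to_timedelta(["1 days", "bad input"], errors="coerce")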
@@ -96,50 +96,49 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): """ unit = parse_timedelta_unit(unit) - if errors not in ('ignore', 'raise', 'coerce'): - raise ValueError("errors must be one of 'ignore', " - "'raise', or 'coerce'}") + if errors not in ("ignore", "raise", "coerce"): + raise ValueError("errors must be one of 'ignore', " "'raise', or 'coerce'}") - if unit in {'Y', 'y', 'M'}: - warnings.warn("M and Y units are deprecated and " - "will be removed in a future version.", - FutureWarning, stacklevel=2) + if unit in {"Y", "y", "M"}: + warnings.warn( + "M and Y units are deprecated and " "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) if arg is None: return arg elif isinstance(arg, ABCSeries): - values = _convert_listlike(arg._values, unit=unit, - box=False, errors=errors) + values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors) return arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, unit=unit, box=box, - errors=errors, name=arg.name) + return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name) elif isinstance(arg, np.ndarray) and arg.ndim == 0: # extract array scalar and process below arg = arg.item() - elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: + elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1: return _convert_listlike(arg, unit=unit, box=box, errors=errors) - elif getattr(arg, 'ndim', 1) > 1: - raise TypeError('arg must be a string, timedelta, list, tuple, ' - '1-d array, or Series') + elif getattr(arg, "ndim", 1) > 1: + raise TypeError( + "arg must be a string, timedelta, list, tuple, " "1-d array, or Series" + ) # ...so it must be a scalar value. Return scalar. 
- return _coerce_scalar_to_timedelta_type(arg, unit=unit, - box=box, errors=errors) + return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors) -def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): +def _coerce_scalar_to_timedelta_type(r, unit="ns", box=True, errors="raise"): """Convert string 'r' to a timedelta object.""" try: result = Timedelta(r, unit) if not box: # explicitly view as timedelta64 for case when result is pd.NaT - result = result.asm8.view('timedelta64[ns]') + result = result.asm8.view("timedelta64[ns]") except ValueError: - if errors == 'raise': + if errors == "raise": raise - elif errors == 'ignore': + elif errors == "ignore": return r # coerce @@ -148,10 +147,10 @@ def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): return result -def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): +def _convert_listlike(arg, unit="ns", box=True, errors="raise", name=None): """Convert a list of objects to a timedelta index object.""" - if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): + if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"): # This is needed only to ensure that in the case where we end up # returning arg (errors == "ignore"), and where the input is a # generator, we return a useful list-like instead of a @@ -159,10 +158,9 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): arg = np.array(list(arg), dtype=object) try: - value = sequence_to_td64ns(arg, unit=unit, - errors=errors, copy=False)[0] + value = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0] except ValueError: - if errors == 'ignore': + if errors == "ignore": return arg else: # This else-block accounts for the cases when errors='raise' @@ -176,5 +174,6 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): if box: from pandas import TimedeltaIndex - value = TimedeltaIndex(value, unit='ns', name=name) + + value = TimedeltaIndex(value, unit="ns", name=name) return value diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index a916f2f06df21..f07133baed435 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -10,13 +10,20 @@ from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( - is_categorical_dtype, is_extension_array_dtype, is_list_like) + is_categorical_dtype, + is_extension_array_dtype, + is_list_like, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna # 16 byte long hashing key -_default_hash_key = '0123456789123456' +_default_hash_key = "0123456789123456" def _combine_hash_arrays(arrays, num_items): @@ -42,13 +49,14 @@ def _combine_hash_arrays(arrays, num_items): out ^= a out *= mult mult += np.uint64(82520 + inverse_i + inverse_i) - assert i + 1 == num_items, 'Fed in wrong num_items' + assert i + 1 == num_items, "Fed in wrong num_items" out += np.uint64(97531) return out -def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, - categorize=True): +def hash_pandas_object( + obj, index=True, encoding="utf8", hash_key=None, categorize=True +): """ Return a data hash of the Index/Series/DataFrame @@ -72,53 +80,63 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, Series of uint64, same length as the object """ from pandas import Series + if 
hash_key is None: hash_key = _default_hash_key if isinstance(obj, ABCMultiIndex): - return Series(hash_tuples(obj, encoding, hash_key), - dtype='uint64', copy=False) + return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) - h = Series(h, index=obj, dtype='uint64', copy=False) + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) + h = Series(h, index=obj, dtype="uint64", copy=False) elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key, - categorize).astype('uint64', copy=False) + h = hash_array(obj.values, encoding, hash_key, categorize).astype( + "uint64", copy=False + ) if index: - index_iter = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values - for _ in [None]) + index_iter = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values + for _ in [None] + ) arrays = itertools.chain([h], index_iter) h = _combine_hash_arrays(arrays, 2) - h = Series(h, index=obj.index, dtype='uint64', copy=False) + h = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): hashes = (hash_array(series.values) for _, series in obj.iteritems()) num_items = len(obj.columns) if index: - index_hash_generator = (hash_pandas_object(obj.index, - index=False, - encoding=encoding, - hash_key=hash_key, - categorize=categorize).values # noqa - for _ in [None]) + index_hash_generator = ( + hash_pandas_object( + obj.index, + index=False, + encoding=encoding, + hash_key=hash_key, + categorize=categorize, + ).values # noqa + for _ in [None] + ) num_items += 1 hashes = itertools.chain(hashes, index_hash_generator) h = _combine_hash_arrays(hashes, num_items) - h = Series(h, index=obj.index, dtype='uint64', copy=False) + h = Series(h, index=obj.index, dtype="uint64", copy=False) else: raise TypeError("Unexpected type for hashing %s" % type(obj)) return h -def hash_tuples(vals, encoding='utf8', hash_key=None): +def hash_tuples(vals, encoding="utf8", hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently @@ -147,17 +165,15 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): vals = MultiIndex.from_tuples(vals) # create a list-of-Categoricals - vals = [Categorical(vals.codes[level], - vals.levels[level], - ordered=False, - fastpath=True) - for level in range(vals.nlevels)] + vals = [ + Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) + for level in range(vals.nlevels) + ] # hash the list-of-ndarrays - hashes = (_hash_categorical(cat, - encoding=encoding, - hash_key=hash_key) - for cat in vals) + hashes = ( + _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals + ) h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -165,7 +181,7 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): return h -def hash_tuple(val, encoding='utf8', hash_key=None): +def hash_tuple(val, encoding="utf8", hash_key=None): """ Hash a single tuple efficiently @@ -180,8 +196,7 @@ def hash_tuple(val, encoding='utf8', hash_key=None): hash """ - hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) - for v in val) + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val) h = _combine_hash_arrays(hashes, len(val))[0] @@ -205,8 +220,7 @@ def 
_hash_categorical(c, encoding, hash_key): """ # Convert ExtensionArrays to ndarrays values = np.asarray(c.categories.values) - hashed = hash_array(values, encoding, hash_key, - categorize=False) + hashed = hash_array(values, encoding, hash_key, categorize=False) # we have uint64, as we don't directly support missing values # we don't want to use take_nd which will coerce to float @@ -219,7 +233,7 @@ def _hash_categorical(c, encoding, hash_key): if len(hashed): result = hashed.take(c.codes) else: - result = np.zeros(len(mask), dtype='uint64') + result = np.zeros(len(mask), dtype="uint64") if mask.any(): result[mask] = np.iinfo(np.uint64).max @@ -227,7 +241,7 @@ def _hash_categorical(c, encoding, hash_key): return result -def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): +def hash_array(vals, encoding="utf8", hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. @@ -250,7 +264,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): 1d uint64 numpy array of hash values, same length as the vals """ - if not hasattr(vals, 'dtype'): + if not hasattr(vals, "dtype"): raise TypeError("must pass a ndarray-like") dtype = vals.dtype @@ -274,39 +288,40 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif isinstance(dtype, np.bool): - vals = vals.astype('u8') + vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): - vals = vals.view('i8').astype('u8', copy=False) + vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: - vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') + vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8") else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. 
if categorize: from pandas import factorize, Categorical, Index + codes, categories = factorize(vals, sort=False) - cat = Categorical(codes, Index(categories), - ordered=False, fastpath=True) + cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types - vals = hashing.hash_object_array(vals.astype(str).astype(object), - hash_key, encoding) + vals = hashing.hash_object_array( + vals.astype(str).astype(object), hash_key, encoding + ) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 - vals *= np.uint64(0xbf58476d1ce4e5b9) + vals *= np.uint64(0xBF58476D1CE4E5B9) vals ^= vals >> 27 - vals *= np.uint64(0x94d049bb133111eb) + vals *= np.uint64(0x94D049BB133111EB) vals ^= vals >> 31 return vals -def _hash_scalar(val, encoding='utf8', hash_key=None): +def _hash_scalar(val, encoding="utf8", hash_key=None): """ Hash scalar value @@ -317,9 +332,9 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): if isna(val): # this is to be consistent with the _hash_categorical implementation - return np.array([np.iinfo(np.uint64).max], dtype='u8') + return np.array([np.iinfo(np.uint64).max], dtype="u8") - if getattr(val, 'tzinfo', None) is not None: + if getattr(val, "tzinfo", None) is not None: # for tz-aware datetimes, we need the underlying naive UTC value and # not the tz aware object or pd extension type (as # infer_dtype_from_scalar would do) @@ -330,5 +345,4 @@ def _hash_scalar(val, encoding='utf8', hash_key=None): dtype, val = infer_dtype_from_scalar(val) vals = np.array([val], dtype=dtype) - return hash_array(vals, hash_key=hash_key, encoding=encoding, - categorize=False) + return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False) diff --git a/pandas/core/window.py b/pandas/core/window.py index 8f888ba510b0e..27588249b1b3c 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -16,11 +16,24 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - ensure_float64, is_bool, is_float_dtype, is_integer, is_integer_dtype, - is_list_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion) + ensure_float64, + is_bool, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_scalar, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries, - ABCTimedeltaIndex) + ABCDataFrame, + ABCDateOffset, + ABCDatetimeIndex, + ABCPeriodIndex, + ABCSeries, + ABCTimedeltaIndex, +) from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com @@ -42,13 +55,29 @@ class _Window(PandasObject, SelectionMixin): - _attributes = ['window', 'min_periods', 'center', 'win_type', - 'axis', 'on', 'closed'] + _attributes = [ + "window", + "min_periods", + "center", + "win_type", + "axis", + "on", + "closed", + ] exclusions = set() # type: Set[str] - def __init__(self, obj, window=None, min_periods=None, - center=False, win_type=None, axis=0, on=None, closed=None, - **kwargs): + def __init__( + self, + obj, + window=None, + min_periods=None, + center=False, + win_type=None, + axis=0, + on=None, + closed=None, + **kwargs + ): self.__dict__.update(kwargs) self.blocks = [] @@ -77,18 +106,20 @@ def _on(self): @property def is_freq_type(self): - return self.win_type == 'freq' + 
return self.win_type == "freq" def validate(self): if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if (self.min_periods is not None and - not is_integer(self.min_periods)): + if self.min_periods is not None and not is_integer(self.min_periods): raise ValueError("min_periods must be an integer") - if (self.closed is not None and - self.closed not in ['right', 'both', 'left', 'neither']): - raise ValueError("closed must be 'right', 'left', 'both' or " - "'neither'") + if self.closed is not None and self.closed not in [ + "right", + "both", + "left", + "neither", + ]: + raise ValueError("closed must be 'right', 'left', 'both' or " "'neither'") def _convert_freq(self): """ @@ -110,8 +141,7 @@ def _create_blocks(self): # filter out the on from the object if self.on is not None: if obj.ndim == 2: - obj = obj.reindex(columns=obj.columns.difference([self.on]), - copy=False) + obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) blocks = obj._to_dict_of_blocks(copy=False).values() return blocks, obj, index @@ -145,8 +175,9 @@ def __getattr__(self, attr): if attr in self.obj: return self[attr] - raise AttributeError("%r object has no attribute %r" % - (type(self).__name__, attr)) + raise AttributeError( + "%r object has no attribute %r" % (type(self).__name__, attr) + ) def _dir_additions(self): return self.obj._dir_additions() @@ -163,15 +194,18 @@ def __repr__(self): Provide a nice str repr of our rolling object. """ - attrs = ("{k}={v}".format(k=k, v=getattr(self, k)) - for k in self._attributes - if getattr(self, k, None) is not None) - return "{klass} [{attrs}]".format(klass=self._window_type, - attrs=','.join(attrs)) + attrs = ( + "{k}={v}".format(k=k, v=getattr(self, k)) + for k in self._attributes + if getattr(self, k, None) is not None + ) + return "{klass} [{attrs}]".format( + klass=self._window_type, attrs=",".join(attrs) + ) def __iter__(self): - url = 'https://github.com/pandas-dev/pandas/issues/11704' - raise NotImplementedError('See issue #11704 {url}'.format(url=url)) + url = "https://github.com/pandas-dev/pandas/issues/11704" + raise NotImplementedError("See issue #11704 {url}".format(url=url)) def _get_index(self, index=None): """ @@ -191,7 +225,7 @@ def _get_index(self, index=None): def _prep_values(self, values=None, kill_inf=True): if values is None: - values = getattr(self._selected_obj, 'values', self._selected_obj) + values = getattr(self._selected_obj, "values", self._selected_obj) # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 @@ -200,17 +234,18 @@ def _prep_values(self, values=None, kill_inf=True): elif is_integer_dtype(values.dtype): values = ensure_float64(values) elif needs_i8_conversion(values.dtype): - raise NotImplementedError("ops for {action} for this " - "dtype {dtype} are not " - "implemented".format( - action=self._window_type, - dtype=values.dtype)) + raise NotImplementedError( + "ops for {action} for this " + "dtype {dtype} are not " + "implemented".format(action=self._window_type, dtype=values.dtype) + ) else: try: values = ensure_float64(values) except (ValueError, TypeError): - raise TypeError("cannot handle this type -> {0}" - "".format(values.dtype)) + raise TypeError( + "cannot handle this type -> {0}" "".format(values.dtype) + ) if kill_inf: values = values.copy() @@ -233,11 +268,14 @@ def _wrap_result(self, result, block=None, obj=None): if block is not None: if is_timedelta64_dtype(block.values.dtype): from pandas import 
to_timedelta - result = to_timedelta( - result.ravel(), unit='ns').values.reshape(result.shape) + + result = to_timedelta(result.ravel(), unit="ns").values.reshape( + result.shape + ) if result.ndim == 1: from pandas import Series + return Series(result, index, name=obj.name) return type(obj)(result, index=index, columns=block.columns) @@ -291,10 +329,10 @@ def _wrap_results(self, results, blocks, obj, exclude=None): columns = [c for c in columns if c not in exclude] if not columns: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if not len(final): - return obj.astype('float64') + return obj.astype("float64") return concat(final, axis=1).reindex(columns=columns, copy=False) def _center_window(self, result, window): @@ -302,8 +340,9 @@ def _center_window(self, result, window): Center the result in the window. """ if self.axis > result.ndim - 1: - raise ValueError("Requested axis is larger then no. of argument " - "dimensions") + raise ValueError( + "Requested axis is larger then no. of argument " "dimensions" + ) offset = _offset(window, True) if offset > 0: @@ -323,7 +362,8 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - _shared_docs['sum'] = dedent(""" + _shared_docs["sum"] = dedent( + """ Calculate %(name)s sum of given DataFrame or Series. Parameters @@ -396,9 +436,11 @@ def aggregate(self, func, *args, **kwargs): 2 6.0 14.0 3 9.0 29.0 4 12.0 50.0 - """) + """ + ) - _shared_docs['mean'] = dedent(""" + _shared_docs["mean"] = dedent( + """ Calculate the %(name)s mean of the values. Parameters @@ -440,7 +482,8 @@ def aggregate(self, func, *args, **kwargs): 2 2.0 3 3.0 dtype: float64 - """) + """ + ) class Window(_Window): @@ -605,17 +648,16 @@ def validate(self): if window <= 0: raise ValueError("window must be > 0 ") import_optional_dependency( - "scipy", - extra="Scipy is required to generate window weight." + "scipy", extra="Scipy is required to generate window weight." 
) import scipy.signal as sig if not isinstance(self.win_type, str): - raise ValueError('Invalid win_type {0}'.format(self.win_type)) + raise ValueError("Invalid win_type {0}".format(self.win_type)) if getattr(sig, self.win_type, None) is None: - raise ValueError('Invalid win_type {0}'.format(self.win_type)) + raise ValueError("Invalid win_type {0}".format(self.win_type)) else: - raise ValueError('Invalid window {0}'.format(window)) + raise ValueError("Invalid window {0}".format(window)) def _prep_window(self, **kwargs): """ @@ -631,16 +673,17 @@ def _prep_window(self, **kwargs): # the below may pop from kwargs def _validate_win_type(win_type, kwargs): - arg_map = {'kaiser': ['beta'], - 'gaussian': ['std'], - 'general_gaussian': ['power', 'width'], - 'slepian': ['width'], - 'exponential': ['tau'], - } + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } if win_type in arg_map: win_args = _pop_args(win_type, arg_map[win_type], kwargs) - if win_type == 'exponential': + if win_type == "exponential": # exponential window requires the first arg (center) # to be set to None (necessary for symmetric window) win_args.insert(0, None) @@ -650,7 +693,7 @@ def _validate_win_type(win_type, kwargs): return win_type def _pop_args(win_type, arg_names, kwargs): - msg = '%s window requires %%s' % win_type + msg = "%s window requires %%s" % win_type all_args = [] for n in arg_names: if n not in kwargs: @@ -694,7 +737,7 @@ def _apply_window(self, mean=True, **kwargs): del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -705,10 +748,12 @@ def _apply_window(self, mean=True, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return libwindow.roll_window(np.concatenate((arg, - additional_nans)) - if center else arg, window, minp, - avg=mean) + return libwindow.roll_window( + np.concatenate((arg, additional_nans)) if center else arg, + window, + minp, + avg=mean, + ) result = np.apply_along_axis(f, self.axis, values) @@ -718,14 +763,17 @@ def f(arg, *args, **kwargs): return self._wrap_results(results, block_list, obj, exclude) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.rolling.aggregate pandas.DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -755,14 +803,17 @@ def f(arg, *args, **kwargs): 7 0.906020 1.283573 0.085482 8 -0.096361 0.818139 0.472290 9 0.070889 0.134399 -0.031308 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/DataFrame', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/DataFrame", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): result, how = self._aggregate(arg, *args, **kwargs) if result is None: @@ -774,16 +825,16 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate - @Substitution(name='window') - @Appender(_shared_docs['sum']) + @Substitution(name="window") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_window_func('sum', args, kwargs) + nv.validate_window_func("sum", args, kwargs) return 
self._apply_window(mean=False, **kwargs) - @Substitution(name='window') - @Appender(_shared_docs['mean']) + @Substitution(name="window") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_window_func('mean', args, kwargs) + nv.validate_window_func("mean", args, kwargs) return self._apply_window(mean=True, **kwargs) @@ -793,8 +844,8 @@ class _GroupByMixin(GroupByMixin): """ def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop('parent', None) # noqa - groupby = kwargs.pop('groupby', None) + parent = kwargs.pop("parent", None) # noqa + groupby = kwargs.pop("groupby", None) if groupby is None: groupby, obj = obj, obj.obj self._groupby = groupby @@ -802,12 +853,13 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - count = GroupByMixin._dispatch('count') - corr = GroupByMixin._dispatch('corr', other=None, pairwise=None) - cov = GroupByMixin._dispatch('cov', other=None, pairwise=None) + count = GroupByMixin._dispatch("count") + corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) + cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) - def _apply(self, func, name=None, window=None, center=None, - check_minp=None, **kwargs): + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): """ Dispatch to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object. @@ -825,13 +877,13 @@ def f(x, name=name, *args): class _Rolling(_Window): - @property def _constructor(self): return Rolling - def _apply(self, func, name=None, window=None, center=None, - check_minp=None, **kwargs): + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): """ Rolling statistical measure using supplied function. 
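The window.py hunks above and below re-wrap the rolling/window plumbing (_apply_window, _apply, the shared docstrings) without changing behaviour. A short sketch of the public surface they implement (illustrative only; the win_type call assumes scipy is installed, as the import_optional_dependency check above requires):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10, dtype="float64"))

    # plain rolling aggregations dispatch to the libwindow "roll_*" kernels
    s.rolling(window=3, min_periods=1).sum()
    s.rolling(window=3).mean()

    # weighted windows go through Window._apply_window / scipy.signal
    s.rolling(window=3, win_type="gaussian").mean(std=1.0)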
@@ -874,7 +926,7 @@ def _apply(self, func, name=None, window=None, center=None, del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -884,15 +936,16 @@ def _apply(self, func, name=None, window=None, center=None, if isinstance(func, str): cfunc = getattr(libwindow, func, None) if cfunc is None: - raise ValueError("we do not support this function " - "in libwindow.{func}".format(func=func)) + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) def func(arg, window, min_periods=None, closed=None): minp = check_minp(min_periods, window) # ensure we are only rolling on floats arg = ensure_float64(arg) - return cfunc(arg, - window, minp, indexi, closed, **kwargs) + return cfunc(arg, window, minp, indexi, closed, **kwargs) # calculation function if center: @@ -900,16 +953,21 @@ def func(arg, window, min_periods=None, closed=None): additional_nans = np.array([np.NaN] * offset) def calc(x): - return func(np.concatenate((x, additional_nans)), - window, min_periods=self.min_periods, - closed=self.closed) + return func( + np.concatenate((x, additional_nans)), + window, + min_periods=self.min_periods, + closed=self.closed, + ) + else: def calc(x): - return func(x, window, min_periods=self.min_periods, - closed=self.closed) + return func( + x, window, min_periods=self.min_periods, closed=self.closed + ) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if values.ndim > 1: result = np.apply_along_axis(calc, self.axis, values) else: @@ -925,7 +983,8 @@ def calc(x): class _Rolling_and_Expanding(_Rolling): - _shared_docs['count'] = dedent(r""" + _shared_docs["count"] = dedent( + r""" The %(name)s count of any non-NaN observations inside the window. Returns @@ -961,7 +1020,8 @@ class _Rolling_and_Expanding(_Rolling): 2 2.0 3 3.0 dtype: float64 - """) + """ + ) def count(self): @@ -975,15 +1035,20 @@ def count(self): results = [] for b in blocks: result = b.notna().astype(int) - result = self._constructor(result, window=window, min_periods=0, - center=self.center, - axis=self.axis, - closed=self.closed).sum() + result = self._constructor( + result, + window=window, + min_periods=0, + center=self.center, + axis=self.axis, + closed=self.closed, + ).sum() results.append(result) return self._wrap_results(results, blocks, obj) - _shared_docs['apply'] = dedent(r""" + _shared_docs["apply"] = dedent( + r""" The %(name)s function's apply function. Parameters @@ -1015,13 +1080,14 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. - """) + """ + ) def apply(self, func, raw=None, args=(), kwargs={}): from pandas import Series # TODO: _level is unused? - _level = kwargs.pop('_level', None) # noqa + _level = kwargs.pop("_level", None) # noqa window = self._get_window() offset = _offset(window, self.center) index, indexi = self._get_index() @@ -1034,7 +1100,10 @@ def apply(self, func, raw=None, args=(), kwargs={}): "applied function. In the future, this will change to passing " "it as Series objects. 
You need to specify 'raw=True' to keep " "the current behaviour, and you can pass 'raw=False' to " - "silence this warning", FutureWarning, stacklevel=3) + "silence this warning", + FutureWarning, + stacklevel=3, + ) raw = True def f(arg, window, min_periods, closed): @@ -1042,30 +1111,32 @@ def f(arg, window, min_periods, closed): if not raw: arg = Series(arg, index=self.obj.index) return libwindow.roll_generic( - arg, window, minp, indexi, - closed, offset, func, raw, args, kwargs) + arg, window, minp, indexi, closed, offset, func, raw, args, kwargs + ) - return self._apply(f, func, args=args, kwargs=kwargs, - center=False, raw=raw) + return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw) def sum(self, *args, **kwargs): - nv.validate_window_func('sum', args, kwargs) - return self._apply('roll_sum', 'sum', **kwargs) + nv.validate_window_func("sum", args, kwargs) + return self._apply("roll_sum", "sum", **kwargs) - _shared_docs['max'] = dedent(""" + _shared_docs["max"] = dedent( + """ Calculate the %(name)s maximum. Parameters ---------- *args, **kwargs Arguments and keyword arguments to be passed into func. - """) + """ + ) def max(self, *args, **kwargs): - nv.validate_window_func('max', args, kwargs) - return self._apply('roll_max', 'max', **kwargs) + nv.validate_window_func("max", args, kwargs) + return self._apply("roll_max", "max", **kwargs) - _shared_docs['min'] = dedent(""" + _shared_docs["min"] = dedent( + """ Calculate the %(name)s minimum. Parameters @@ -1098,17 +1169,19 @@ def max(self, *args, **kwargs): 3 2.0 4 2.0 dtype: float64 - """) + """ + ) def min(self, *args, **kwargs): - nv.validate_window_func('min', args, kwargs) - return self._apply('roll_min', 'min', **kwargs) + nv.validate_window_func("min", args, kwargs) + return self._apply("roll_min", "min", **kwargs) def mean(self, *args, **kwargs): - nv.validate_window_func('mean', args, kwargs) - return self._apply('roll_mean', 'mean', **kwargs) + nv.validate_window_func("mean", args, kwargs) + return self._apply("roll_mean", "mean", **kwargs) - _shared_docs['median'] = dedent(""" + _shared_docs["median"] = dedent( + """ Calculate the %(name)s median. Parameters @@ -1141,12 +1214,14 @@ def mean(self, *args, **kwargs): 3 2.0 4 3.0 dtype: float64 - """) + """ + ) def median(self, **kwargs): - return self._apply('roll_median_c', 'median', **kwargs) + return self._apply("roll_median_c", "median", **kwargs) - _shared_docs['std'] = dedent(""" + _shared_docs["std"] = dedent( + """ Calculate %(name)s standard deviation. Normalized by N-1 by default. This can be changed using the `ddof` @@ -1202,22 +1277,26 @@ def median(self, **kwargs): 5 0.836660 6 0.786796 dtype: float64 - """) + """ + ) def std(self, ddof=1, *args, **kwargs): - nv.validate_window_func('std', args, kwargs) + nv.validate_window_func("std", args, kwargs) window = self._get_window() index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(libwindow.roll_var(arg, window, minp, indexi, - self.closed, ddof)) + return _zsqrt( + libwindow.roll_var(arg, window, minp, indexi, self.closed, ddof) + ) - return self._apply(f, 'std', check_minp=_require_min_periods(1), - ddof=ddof, **kwargs) + return self._apply( + f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + ) - _shared_docs['var'] = dedent(""" + _shared_docs["var"] = dedent( + """ Calculate unbiased %(name)s variance. Normalized by N-1 by default. 
This can be changed using the `ddof` @@ -1273,15 +1352,18 @@ def f(arg, *args, **kwargs): 5 0.700000 6 0.619048 dtype: float64 - """) + """ + ) def var(self, ddof=1, *args, **kwargs): - nv.validate_window_func('var', args, kwargs) - return self._apply('roll_var', 'var', - check_minp=_require_min_periods(1), ddof=ddof, - **kwargs) - - _shared_docs['skew'] = """ + nv.validate_window_func("var", args, kwargs) + return self._apply( + "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs + ) + + _shared_docs[ + "skew" + ] = """ Unbiased %(name)s skewness. Parameters @@ -1291,10 +1373,12 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): - return self._apply('roll_skew', 'skew', - check_minp=_require_min_periods(3), **kwargs) + return self._apply( + "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs + ) - _shared_docs['kurt'] = dedent(""" + _shared_docs["kurt"] = dedent( + """ Calculate unbiased %(name)s kurtosis. This function uses Fisher's definition of kurtosis without bias. @@ -1322,13 +1406,16 @@ def skew(self, **kwargs): Notes ----- A minimum of 4 periods is required for the %(name)s calculation. - """) + """ + ) def kurt(self, **kwargs): - return self._apply('roll_kurt', 'kurt', - check_minp=_require_min_periods(4), **kwargs) + return self._apply( + "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs + ) - _shared_docs['quantile'] = dedent(""" + _shared_docs["quantile"] = dedent( + """ Calculate the %(name)s quantile. Parameters @@ -1380,29 +1467,29 @@ def kurt(self, **kwargs): 2 2.5 3 3.5 dtype: float64 - """) + """ + ) - def quantile(self, quantile, interpolation='linear', **kwargs): + def quantile(self, quantile, interpolation="linear", **kwargs): window = self._get_window() index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) if quantile == 1.0: - return libwindow.roll_max(arg, window, minp, indexi, - self.closed) + return libwindow.roll_max(arg, window, minp, indexi, self.closed) elif quantile == 0.0: - return libwindow.roll_min(arg, window, minp, indexi, - self.closed) + return libwindow.roll_min(arg, window, minp, indexi, self.closed) else: - return libwindow.roll_quantile(arg, window, minp, indexi, - self.closed, quantile, - interpolation) + return libwindow.roll_quantile( + arg, window, minp, indexi, self.closed, quantile, interpolation + ) - return self._apply(f, 'quantile', quantile=quantile, - **kwargs) + return self._apply(f, "quantile", quantile=quantile, **kwargs) - _shared_docs['cov'] = """ + _shared_docs[ + "cov" + ] = """ Calculate the %(name)s sample covariance. 
Parameters @@ -1440,19 +1527,21 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data # to avoid potential overflow, cast the data to float64 - X = X.astype('float64') - Y = Y.astype('float64') - mean = lambda x: x.rolling(window, self.min_periods, - center=self.center).mean(**kwargs) - count = (X + Y).rolling(window=window, - center=self.center).count(**kwargs) + X = X.astype("float64") + Y = Y.astype("float64") + mean = lambda x: x.rolling( + window, self.min_periods, center=self.center + ).mean(**kwargs) + count = (X + Y).rolling(window=window, center=self.center).count(**kwargs) bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_cov, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) - _shared_docs['corr'] = dedent(""" + _shared_docs["corr"] = dedent( + """ Calculate %(name)s correlation. Parameters @@ -1559,7 +1648,8 @@ def _get_cov(X, Y): Y 0.626300 1.000000 4 X 1.000000 0.555368 Y 0.555368 1.000000 - """) + """ + ) def corr(self, other=None, pairwise=None, **kwargs): if other is None: @@ -1570,60 +1660,66 @@ def corr(self, other=None, pairwise=None, **kwargs): window = self._get_window(other) def _get_corr(a, b): - a = a.rolling(window=window, min_periods=self.min_periods, - center=self.center) - b = b.rolling(window=window, min_periods=self.min_periods, - center=self.center) + a = a.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) + b = b.rolling( + window=window, min_periods=self.min_periods, center=self.center + ) return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_corr, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) class Rolling(_Rolling_and_Expanding): - @cache_readonly def is_datetimelike(self): - return isinstance(self._on, - (ABCDatetimeIndex, - ABCTimedeltaIndex, - ABCPeriodIndex)) + return isinstance( + self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) + ) @cache_readonly def _on(self): if self.on is None: return self.obj.index - elif (isinstance(self.obj, ABCDataFrame) and - self.on in self.obj.columns): + elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: from pandas import Index + return Index(self.obj[self.on]) else: - raise ValueError("invalid on specified as {0}, " - "must be a column (if DataFrame) " - "or None".format(self.on)) + raise ValueError( + "invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on) + ) def validate(self): super().validate() # we allow rolling on a datetimelike index - if ((self.obj.empty or self.is_datetimelike) and - isinstance(self.window, (str, ABCDateOffset, timedelta))): + if (self.obj.empty or self.is_datetimelike) and isinstance( + self.window, (str, ABCDateOffset, timedelta) + ): self._validate_monotonic() freq = self._validate_freq() # we don't allow center if self.center: - raise NotImplementedError("center is not implemented " - "for datetimelike and offset " - "based windows") + raise NotImplementedError( + "center is not implemented " + "for datetimelike and offset " + "based windows" + ) # this will raise ValueError on non-fixed freqs self.win_freq = self.window 
self.window = freq.nanos - self.win_type = 'freq' + self.win_type = "freq" # min_periods must be an integer if self.min_periods is None: @@ -1635,38 +1731,44 @@ def validate(self): raise ValueError("window must be non-negative") if not self.is_datetimelike and self.closed is not None: - raise ValueError("closed only implemented for datetimelike " - "and offset based windows") + raise ValueError( + "closed only implemented for datetimelike " "and offset based windows" + ) def _validate_monotonic(self): """ Validate on is_monotonic. """ if not self._on.is_monotonic: - formatted = self.on or 'index' - raise ValueError("{0} must be " - "monotonic".format(formatted)) + formatted = self.on or "index" + raise ValueError("{0} must be " "monotonic".format(formatted)) def _validate_freq(self): """ Validate & return window frequency. """ from pandas.tseries.frequencies import to_offset + try: return to_offset(self.window) except (TypeError, ValueError): - raise ValueError("passed window {0} is not " - "compatible with a datetimelike " - "index".format(self.window)) + raise ValueError( + "passed window {0} is not " + "compatible with a datetimelike " + "index".format(self.window) + ) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- Series.rolling DataFrame.rolling - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1709,83 +1811,87 @@ def _validate_freq(self): 7 2.718061 -1.647453 8 -0.289082 -1.647453 9 0.212668 -1.647453 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate - @Substitution(name='rolling') - @Appender(_shared_docs['count']) + @Substitution(name="rolling") + @Appender(_shared_docs["count"]) def count(self): # different impl for freq counting if self.is_freq_type: - return self._apply('roll_count', 'count') + return self._apply("roll_count", "count") return super().count() - @Substitution(name='rolling') - @Appender(_shared_docs['apply']) + @Substitution(name="rolling") + @Appender(_shared_docs["apply"]) def apply(self, func, raw=None, args=(), kwargs={}): return super().apply(func, raw=raw, args=args, kwargs=kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['sum']) + @Substitution(name="rolling") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_rolling_func('sum', args, kwargs) + nv.validate_rolling_func("sum", args, kwargs) return super().sum(*args, **kwargs) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['max']) + @Appender(_shared_docs["max"]) def max(self, *args, **kwargs): - nv.validate_rolling_func('max', args, kwargs) + nv.validate_rolling_func("max", args, kwargs) return super().max(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['min']) + @Substitution(name="rolling") + @Appender(_shared_docs["min"]) def min(self, *args, **kwargs): - nv.validate_rolling_func('min', args, kwargs) + nv.validate_rolling_func("min", args, kwargs) return super().min(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['mean']) + 
@Substitution(name="rolling") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_rolling_func('mean', args, kwargs) + nv.validate_rolling_func("mean", args, kwargs) return super().mean(*args, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['median']) + @Substitution(name="rolling") + @Appender(_shared_docs["median"]) def median(self, **kwargs): return super().median(**kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['std']) + @Substitution(name="rolling") + @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): - nv.validate_rolling_func('std', args, kwargs) + nv.validate_rolling_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['var']) + @Substitution(name="rolling") + @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): - nv.validate_rolling_func('var', args, kwargs) + nv.validate_rolling_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['skew']) + @Appender(_shared_docs["skew"]) def skew(self, **kwargs): return super().skew(**kwargs) - _agg_doc = dedent(""" + _agg_doc = dedent( + """ Examples -------- @@ -1807,28 +1913,30 @@ def skew(self, **kwargs): 3 -1.200000 4 3.999946 dtype: float64 - """) + """ + ) @Appender(_agg_doc) - @Substitution(name='rolling') - @Appender(_shared_docs['kurt']) + @Substitution(name="rolling") + @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['quantile']) - def quantile(self, quantile, interpolation='linear', **kwargs): - return super().quantile(quantile=quantile, interpolation=interpolation, - **kwargs) + @Substitution(name="rolling") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) - @Substitution(name='rolling') + @Substitution(name="rolling") @Appender(_doc_template) - @Appender(_shared_docs['cov']) + @Appender(_shared_docs["cov"]) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name='rolling') - @Appender(_shared_docs['corr']) + @Substitution(name="rolling") + @Appender(_shared_docs["corr"]) def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) @@ -1840,6 +1948,7 @@ class RollingGroupby(_GroupByMixin, Rolling): .. 
versionadded:: 0.18.1 """ + @property def _constructor(self): return Rolling @@ -1913,12 +2022,10 @@ class Expanding(_Rolling_and_Expanding): 4 7.0 """ - _attributes = ['min_periods', 'center', 'axis'] + _attributes = ["min_periods", "center", "axis"] - def __init__(self, obj, min_periods=1, center=False, axis=0, - **kwargs): - super().__init__(obj=obj, min_periods=min_periods, center=center, - axis=axis) + def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) @property def _constructor(self): @@ -1945,15 +2052,18 @@ def _get_window(self, other=None): other = self.min_periods or -1 return max(length, other) - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- DataFrame.expanding.aggregate DataFrame.rolling.aggregate DataFrame.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -1983,79 +2093,82 @@ def _get_window(self, other=None): 7 0.680292 0.132049 0.548693 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) agg = aggregate - @Substitution(name='expanding') - @Appender(_shared_docs['count']) + @Substitution(name="expanding") + @Appender(_shared_docs["count"]) def count(self, **kwargs): return super().count(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['apply']) + @Substitution(name="expanding") + @Appender(_shared_docs["apply"]) def apply(self, func, raw=None, args=(), kwargs={}): - return super().apply( - func, raw=raw, args=args, kwargs=kwargs) + return super().apply(func, raw=raw, args=args, kwargs=kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['sum']) + @Substitution(name="expanding") + @Appender(_shared_docs["sum"]) def sum(self, *args, **kwargs): - nv.validate_expanding_func('sum', args, kwargs) + nv.validate_expanding_func("sum", args, kwargs) return super().sum(*args, **kwargs) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['max']) + @Appender(_shared_docs["max"]) def max(self, *args, **kwargs): - nv.validate_expanding_func('max', args, kwargs) + nv.validate_expanding_func("max", args, kwargs) return super().max(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['min']) + @Substitution(name="expanding") + @Appender(_shared_docs["min"]) def min(self, *args, **kwargs): - nv.validate_expanding_func('min', args, kwargs) + nv.validate_expanding_func("min", args, kwargs) return super().min(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['mean']) + @Substitution(name="expanding") + @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): - nv.validate_expanding_func('mean', args, kwargs) + nv.validate_expanding_func("mean", args, kwargs) return super().mean(*args, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['median']) + @Substitution(name="expanding") + @Appender(_shared_docs["median"]) def median(self, **kwargs): return 
super().median(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['std']) + @Substitution(name="expanding") + @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func('std', args, kwargs) + nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['var']) + @Substitution(name="expanding") + @Appender(_shared_docs["var"]) def var(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func('var', args, kwargs) + nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['skew']) + @Appender(_shared_docs["skew"]) def skew(self, **kwargs): return super().skew(**kwargs) - _agg_doc = dedent(""" + _agg_doc = dedent( + """ Examples -------- @@ -2077,29 +2190,30 @@ def skew(self, **kwargs): 3 -1.200000 4 4.999874 dtype: float64 - """) + """ + ) @Appender(_agg_doc) - @Substitution(name='expanding') - @Appender(_shared_docs['kurt']) + @Substitution(name="expanding") + @Appender(_shared_docs["kurt"]) def kurt(self, **kwargs): return super().kurt(**kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['quantile']) - def quantile(self, quantile, interpolation='linear', **kwargs): - return super().quantile(quantile=quantile, - interpolation=interpolation, - **kwargs) + @Substitution(name="expanding") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) - @Substitution(name='expanding') + @Substitution(name="expanding") @Appender(_doc_template) - @Appender(_shared_docs['cov']) + @Appender(_shared_docs["cov"]) def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - @Substitution(name='expanding') - @Appender(_shared_docs['corr']) + @Substitution(name="expanding") + @Appender(_shared_docs["corr"]) def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) @@ -2111,6 +2225,7 @@ class ExpandingGroupby(_GroupByMixin, Expanding): .. 
versionadded:: 0.18.1 """ + @property def _constructor(self): return Expanding @@ -2239,11 +2354,20 @@ class EWM(_Rolling): 3 1.615385 4 3.670213 """ - _attributes = ['com', 'min_periods', 'adjust', 'ignore_na', 'axis'] - - def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, - min_periods=0, adjust=True, ignore_na=False, - axis=0): + _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] + + def __init__( + self, + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): self.obj = obj self.com = _get_center_of_mass(com, span, halflife, alpha) self.min_periods = min_periods @@ -2256,13 +2380,16 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, def _constructor(self): return EWM - _agg_see_also_doc = dedent(""" + _agg_see_also_doc = dedent( + """ See Also -------- pandas.DataFrame.rolling.aggregate - """) + """ + ) - _agg_examples_doc = dedent(""" + _agg_examples_doc = dedent( + """ Examples -------- @@ -2292,14 +2419,17 @@ def _constructor(self): 7 0.680292 0.132049 0.548693 8 0.067236 0.948257 0.163353 9 -0.286980 0.618493 -0.694496 - """) - - @Substitution(see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded='', - klass='Series/Dataframe', - axis='') - @Appender(_shared_docs['aggregate']) + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) def aggregate(self, arg, *args, **kwargs): return super().aggregate(arg, *args, **kwargs) @@ -2333,7 +2463,7 @@ def _apply(self, func, **kwargs): del block_list[i] continue else: - raise DataError('No numeric types to aggregate') + raise DataError("No numeric types to aggregate") if values.size == 0: results.append(values.copy()) @@ -2343,18 +2473,25 @@ def _apply(self, func, **kwargs): if isinstance(func, str): cfunc = getattr(libwindow, func, None) if cfunc is None: - raise ValueError("we do not support this function " - "in libwindow.{func}".format(func=func)) + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) def func(arg): - return cfunc(arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods)) + return cfunc( + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + ) results.append(np.apply_along_axis(func, self.axis, values)) return self._wrap_results(results, block_list, obj, exclude) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) def mean(self, *args, **kwargs): """ @@ -2365,38 +2502,44 @@ def mean(self, *args, **kwargs): *args, **kwargs Arguments and keyword arguments to be passed into func. """ - nv.validate_window_func('mean', args, kwargs) - return self._apply('ewma', **kwargs) + nv.validate_window_func("mean", args, kwargs) + return self._apply("ewma", **kwargs) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_bias_template) def std(self, bias=False, *args, **kwargs): """ Exponential weighted moving stddev. """ - nv.validate_window_func('std', args, kwargs) + nv.validate_window_func("std", args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) vol = std - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_bias_template) def var(self, bias=False, *args, **kwargs): """ Exponential weighted moving variance. 
""" - nv.validate_window_func('var', args, kwargs) + nv.validate_window_func("var", args, kwargs) def f(arg): - return libwindow.ewmcov(arg, arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + return libwindow.ewmcov( + arg, + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) return self._apply(f, **kwargs) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_pairwise_template) def cov(self, other=None, pairwise=None, bias=False, **kwargs): @@ -2412,16 +2555,22 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = libwindow.ewmcov(X._prep_values(), Y._prep_values(), - self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + cov = libwindow.ewmcov( + X._prep_values(), + Y._prep_values(), + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) return X._wrap_result(cov) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_cov, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) - @Substitution(name='ewm') + @Substitution(name="ewm") @Appender(_doc_template) @Appender(_pairwise_template) def corr(self, other=None, pairwise=None, **kwargs): @@ -2439,35 +2588,47 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return libwindow.ewmcov(x, y, self.com, int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1) + return libwindow.ewmcov( + x, + y, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1, + ) x_values = X._prep_values() y_values = Y._prep_values() - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): cov = _cov(x_values, y_values) x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) corr = cov / _zsqrt(x_var * y_var) return X._wrap_result(corr) - return _flex_binary_moment(self._selected_obj, other._selected_obj, - _get_corr, pairwise=bool(pairwise)) + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) + # Helper Funcs def _flex_binary_moment(arg1, arg2, f, pairwise=False): - if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and - isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))): - raise TypeError("arguments to moment function must be of type " - "np.ndarray/Series/DataFrame") - - if (isinstance(arg1, (np.ndarray, ABCSeries)) and - isinstance(arg2, (np.ndarray, ABCSeries))): + if not ( + isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) + and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) + ): + raise TypeError( + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" + ) + + if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( + arg2, (np.ndarray, ABCSeries) + ): X, Y = _prep_binary(arg1, arg2) return f(X, Y) @@ -2495,7 +2656,7 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg2' columns are not unique") with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join='outer') + X, Y = arg1.align(arg2, join="outer") X = X + 0 * Y Y = Y + 0 * X @@ -2505,8 +2666,7 @@ def dataframe_from_int_dict(data, frame_template): for col in res_columns: if col in X and col in Y: results[col] = f(X[col], 
Y[col]) - return DataFrame(results, index=X.index, - columns=res_columns) + return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): @@ -2515,8 +2675,9 @@ def dataframe_from_int_dict(data, frame_template): # Symmetric case results[i][j] = results[j][i] else: - results[i][j] = f(*_prep_binary(arg1.iloc[:, i], - arg2.iloc[:, j])) + results[i][j] = f( + *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) from pandas import MultiIndex, concat @@ -2525,50 +2686,60 @@ def dataframe_from_int_dict(data, frame_template): # construct result frame result = concat( - [concat([results[i][j] - for j, c in enumerate(arg2.columns)], - ignore_index=True) - for i, c in enumerate(arg1.columns)], + [ + concat( + [results[i][j] for j, c in enumerate(arg2.columns)], + ignore_index=True, + ) + for i, c in enumerate(arg1.columns) + ], ignore_index=True, - axis=1) + axis=1, + ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index]) + arg2.columns.levels + [result_index] + ) result = result.reorder_levels([2, 0, 1]).sort_index() else: result.index = MultiIndex.from_product( - [range(len(arg2.columns)), - range(len(result_index))]) + [range(len(arg2.columns)), range(len(result_index))] + ) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product( - [result_index] + [arg2.columns]) + [result_index] + [arg2.columns] + ) else: # empty result result = DataFrame( - index=MultiIndex(levels=[arg1.index, arg2.columns], - codes=[[], []]), + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), columns=arg2.columns, - dtype='float64') + dtype="float64", + ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names - result.columns = result.columns.set_names( - arg1.columns.names) + result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names( - result_index.names + arg2.columns.names) + result_index.names + arg2.columns.names + ) return result else: raise ValueError("'pairwise' is not True/False") else: - results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns)} + results = { + i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns) + } return dataframe_from_int_dict(results, arg1) else: @@ -2578,8 +2749,7 @@ def dataframe_from_int_dict(data, frame_template): def _get_center_of_mass(comass, span, halflife, alpha): valid_count = com.count_not_none(comass, span, halflife, alpha) if valid_count > 1: - raise ValueError("comass, span, halflife, and alpha " - "are mutually exclusive") + raise ValueError("comass, span, halflife, and alpha " "are mutually exclusive") # Convert to center of mass; domain checks ensure 0 < alpha <= 1 if comass is not None: @@ -2588,7 +2758,7 @@ def _get_center_of_mass(comass, span, halflife, alpha): elif span is not None: if span < 1: raise ValueError("span must satisfy: span >= 1") - comass = (span - 1) / 2. + comass = (span - 1) / 2.0 elif halflife is not None: if halflife <= 0: raise ValueError("halflife must satisfy: halflife > 0") @@ -2607,7 +2777,7 @@ def _get_center_of_mass(comass, span, halflife, alpha): def _offset(window, center): if not is_integer(window): window = len(window) - offset = (window - 1) / 2. 
if center else 0 + offset = (window - 1) / 2.0 if center else 0 try: return int(offset) except TypeError: @@ -2632,7 +2802,7 @@ def _use_window(minp, window): def _zsqrt(x): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = np.sqrt(x) mask = x < 0 @@ -2648,7 +2818,7 @@ def _zsqrt(x): def _prep_binary(arg1, arg2): if not isinstance(arg2, type(arg1)): - raise Exception('Input arrays must be of the same type!') + raise Exception("Input arrays must be of the same type!") # mask out values, this also makes a common index... X = arg1 + 0 * arg2 @@ -2662,7 +2832,7 @@ def _prep_binary(arg1, arg2): def rolling(obj, win_type=None, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) if win_type is not None: return Window(obj, win_type=win_type, **kwds) @@ -2675,7 +2845,7 @@ def rolling(obj, win_type=None, **kwds): def expanding(obj, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) return Expanding(obj, **kwds) @@ -2685,7 +2855,7 @@ def expanding(obj, **kwds): def ewm(obj, **kwds): if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError("invalid type: %s" % type(obj)) return EWM(obj, **kwds) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 3b8904f4c1ef6..3177937ac4ba1 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -174,17 +174,18 @@ class AbstractMethodError(NotImplementedError): while keeping compatibility with Python 2 and Python 3. """ - def __init__(self, class_instance, methodtype='method'): - types = {'method', 'classmethod', 'staticmethod', 'property'} + def __init__(self, class_instance, methodtype="method"): + types = {"method", "classmethod", "staticmethod", "property"} if methodtype not in types: - msg = 'methodtype must be one of {}, got {} instead.'.format( - methodtype, types) + msg = "methodtype must be one of {}, got {} instead.".format( + methodtype, types + ) raise ValueError(msg) self.methodtype = methodtype self.class_instance = class_instance def __str__(self): - if self.methodtype == 'classmethod': + if self.methodtype == "classmethod": name = self.class_instance.__name__ else: name = self.class_instance.__class__.__name__ diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index e033d882a73f7..caa928731fb3a 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -23,15 +23,20 @@ This module does not work with PyGObject yet. """ -__version__ = '1.5.27' +__version__ = "1.5.27" import os import platform import subprocess from .clipboards import ( - init_klipper_clipboard, init_no_clipboard, init_osx_clipboard, - init_qt_clipboard, init_xclip_clipboard, init_xsel_clipboard) + init_klipper_clipboard, + init_no_clipboard, + init_osx_clipboard, + init_qt_clipboard, + init_xclip_clipboard, + init_xsel_clipboard, +) from .windows import init_windows_clipboard # `import qtpy` sys.exit()s if DISPLAY is not in the environment. 
@@ -42,20 +47,24 @@ def _executable_exists(name): - return subprocess.call([CHECK_CMD, name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 + return ( + subprocess.call( + [CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + == 0 + ) def determine_clipboard(): # Determine the OS/platform and set # the copy() and paste() functions accordingly. - if 'cygwin' in platform.system().lower(): + if "cygwin" in platform.system().lower(): # FIXME: pyperclip currently does not support Cygwin, # see https://github.com/asweigart/pyperclip/issues/55 pass - elif os.name == 'nt' or platform.system() == 'Windows': + elif os.name == "nt" or platform.system() == "Windows": return init_windows_clipboard() - if os.name == 'mac' or platform.system() == 'Darwin': + if os.name == "mac" or platform.system() == "Darwin": return init_osx_clipboard() if HAS_DISPLAY: # Determine which command/module is installed, if any. @@ -94,13 +103,15 @@ def determine_clipboard(): def set_clipboard(clipboard): global copy, paste - clipboard_types = {'osx': init_osx_clipboard, - 'qt': init_qt_clipboard, - 'xclip': init_xclip_clipboard, - 'xsel': init_xsel_clipboard, - 'klipper': init_klipper_clipboard, - 'windows': init_windows_clipboard, - 'no': init_no_clipboard} + clipboard_types = { + "osx": init_osx_clipboard, + "qt": init_qt_clipboard, + "xclip": init_xclip_clipboard, + "xsel": init_xsel_clipboard, + "klipper": init_klipper_clipboard, + "windows": init_windows_clipboard, + "no": init_no_clipboard, + } copy, paste = clipboard_types[clipboard]() diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py index 52abdeafb5ecc..cb4ed8ed549d0 100644 --- a/pandas/io/clipboard/clipboards.py +++ b/pandas/io/clipboard/clipboards.py @@ -9,15 +9,13 @@ def init_osx_clipboard(): def copy_osx(text): - p = subprocess.Popen(['pbcopy', 'w'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True) + p.communicate(input=text.encode("utf-8")) def paste_osx(): - p = subprocess.Popen(['pbpaste', 'r'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_osx, paste_osx @@ -51,30 +49,34 @@ def paste_qt(): def init_xclip_clipboard(): def copy_xclip(text): - p = subprocess.Popen(['xclip', '-selection', 'c'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen( + ["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode("utf-8")) def paste_xclip(): - p = subprocess.Popen(['xclip', '-selection', 'c', '-o'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen( + ["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True + ) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_xclip, paste_xclip def init_xsel_clipboard(): def copy_xsel(text): - p = subprocess.Popen(['xsel', '-b', '-i'], - stdin=subprocess.PIPE, close_fds=True) - p.communicate(input=text.encode('utf-8')) + p = subprocess.Popen( + ["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True + ) + p.communicate(input=text.encode("utf-8")) def paste_xsel(): - p = subprocess.Popen(['xsel', '-b', '-o'], - stdout=subprocess.PIPE, close_fds=True) + p = subprocess.Popen( + ["xsel", 
"-b", "-o"], stdout=subprocess.PIPE, close_fds=True + ) stdout, stderr = p.communicate() - return stdout.decode('utf-8') + return stdout.decode("utf-8") return copy_xsel, paste_xsel @@ -82,25 +84,34 @@ def paste_xsel(): def init_klipper_clipboard(): def copy_klipper(text): p = subprocess.Popen( - ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents', - text.encode('utf-8')], - stdin=subprocess.PIPE, close_fds=True) + [ + "qdbus", + "org.kde.klipper", + "/klipper", + "setClipboardContents", + text.encode("utf-8"), + ], + stdin=subprocess.PIPE, + close_fds=True, + ) p.communicate(input=None) def paste_klipper(): p = subprocess.Popen( - ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'], - stdout=subprocess.PIPE, close_fds=True) + ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"], + stdout=subprocess.PIPE, + close_fds=True, + ) stdout, stderr = p.communicate() # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874 # TODO: https://github.com/asweigart/pyperclip/issues/43 - clipboardContents = stdout.decode('utf-8') + clipboardContents = stdout.decode("utf-8") # even if blank, Klipper will append a newline at the end assert len(clipboardContents) > 0 # make sure that newline is there - assert clipboardContents.endswith('\n') - if clipboardContents.endswith('\n'): + assert clipboardContents.endswith("\n") + if clipboardContents.endswith("\n"): clipboardContents = clipboardContents[:-1] return clipboardContents @@ -109,7 +120,6 @@ def paste_klipper(): def init_no_clipboard(): class ClipboardUnavailable: - def __call__(self, *args, **kwargs): raise PyperclipException(EXCEPT_MSG) diff --git a/pandas/io/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py index 6276b06b9d7fe..eaf5578b5cd1b 100644 --- a/pandas/io/clipboard/exceptions.py +++ b/pandas/io/clipboard/exceptions.py @@ -6,7 +6,6 @@ class PyperclipException(RuntimeError): class PyperclipWindowsException(PyperclipException): - def __init__(self, message): message += " ({err})".format(err=ctypes.WinError()) super().__init__(message) diff --git a/pandas/io/clipboard/windows.py b/pandas/io/clipboard/windows.py index 72abc72966342..2935dfdc2ae19 100644 --- a/pandas/io/clipboard/windows.py +++ b/pandas/io/clipboard/windows.py @@ -10,7 +10,6 @@ class CheckedCall: - def __init__(self, f): super().__setattr__("f", f) @@ -25,15 +24,38 @@ def __setattr__(self, key, value): def init_windows_clipboard(): - from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND, - HINSTANCE, HMENU, BOOL, UINT, HANDLE) + from ctypes.wintypes import ( + HGLOBAL, + LPVOID, + DWORD, + LPCSTR, + INT, + HWND, + HINSTANCE, + HMENU, + BOOL, + UINT, + HANDLE, + ) windll = ctypes.windll - msvcrt = ctypes.CDLL('msvcrt') + msvcrt = ctypes.CDLL("msvcrt") safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA) - safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT, - INT, INT, HWND, HMENU, HINSTANCE, LPVOID] + safeCreateWindowExA.argtypes = [ + DWORD, + LPCSTR, + LPCSTR, + DWORD, + INT, + INT, + INT, + INT, + HWND, + HMENU, + HINSTANCE, + LPVOID, + ] safeCreateWindowExA.restype = HWND safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow) @@ -86,8 +108,9 @@ def window(): """ # we really just need the hwnd, so setting "STATIC" # as predefined lpClass is just fine. 
- hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0, - None, None, None, None) + hwnd = safeCreateWindowExA( + 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None + ) try: yield hwnd finally: @@ -135,12 +158,14 @@ def copy_windows(text): # the object must have been allocated using the # function with the GMEM_MOVEABLE flag. count = wcslen(text) + 1 - handle = safeGlobalAlloc(GMEM_MOVEABLE, - count * sizeof(c_wchar)) + handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar)) locked_handle = safeGlobalLock(handle) - ctypes.memmove(c_wchar_p(locked_handle), c_wchar_p(text), - count * sizeof(c_wchar)) + ctypes.memmove( + c_wchar_p(locked_handle), + c_wchar_p(text), + count * sizeof(c_wchar), + ) safeGlobalUnlock(handle) safeSetClipboardData(CF_UNICODETEXT, handle) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index dc30285895dd5..0006824f09fe7 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -7,7 +7,7 @@ from pandas import get_option, option_context -def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover +def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" Read text from clipboard and pass to read_csv. See read_csv for the full argument list @@ -22,22 +22,21 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover ------- parsed : DataFrame """ - encoding = kwargs.pop('encoding', 'utf-8') + encoding = kwargs.pop("encoding", "utf-8") # only utf-8 is valid for passed value because that's what clipboard # supports - if encoding is not None and encoding.lower().replace('-', '') != 'utf8': - raise NotImplementedError( - 'reading from clipboard only supports utf-8 encoding') + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise NotImplementedError("reading from clipboard only supports utf-8 encoding") from pandas.io.clipboard import clipboard_get from pandas.io.parsers import read_csv + text = clipboard_get() # Try to decode (if needed, as "text" might already be a string here). try: - text = text.decode(kwargs.get('encoding') - or get_option('display.encoding')) + text = text.decode(kwargs.get("encoding") or get_option("display.encoding")) except AttributeError: pass @@ -45,7 +44,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # inspect no more then the 10 first lines, if they # all contain an equal number (>0) of tabs, infer # that this came from excel and set 'sep' accordingly - lines = text[:10000].split('\n')[:-1][:10] + lines = text[:10000].split("\n")[:-1][:10] # Need to remove leading white space, since read_csv # accepts: @@ -53,21 +52,23 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = {x.lstrip().count('\t') for x in lines} + counts = {x.lstrip().count("\t") for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - sep = '\t' + sep = "\t" # Edge case where sep is specified to be None, return to default - if sep is None and kwargs.get('delim_whitespace') is None: - sep = r'\s+' + if sep is None and kwargs.get("delim_whitespace") is None: + sep = r"\s+" # Regex separator currently only works with python engine. 
# Default to python if separator is multi-character (regex) - if len(sep) > 1 and kwargs.get('engine') is None: - kwargs['engine'] = 'python' - elif len(sep) > 1 and kwargs.get('engine') == 'c': - warnings.warn('read_clipboard with regex separator does not work' - ' properly with c engine') + if len(sep) > 1 and kwargs.get("engine") is None: + kwargs["engine"] = "python" + elif len(sep) > 1 and kwargs.get("engine") == "c": + warnings.warn( + "read_clipboard with regex separator does not work" + " properly with c engine" + ) return read_csv(StringIO(text), sep=sep, **kwargs) @@ -95,37 +96,39 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover - Windows: - OS X: """ - encoding = kwargs.pop('encoding', 'utf-8') + encoding = kwargs.pop("encoding", "utf-8") # testing if an invalid encoding is passed to clipboard - if encoding is not None and encoding.lower().replace('-', '') != 'utf8': - raise ValueError('clipboard only supports utf-8 encoding') + if encoding is not None and encoding.lower().replace("-", "") != "utf8": + raise ValueError("clipboard only supports utf-8 encoding") from pandas.io.clipboard import clipboard_set + if excel is None: excel = True if excel: try: if sep is None: - sep = '\t' + sep = "\t" buf = StringIO() # clipboard_set (pyperclip) expects unicode - obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) + obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs) text = buf.getvalue() clipboard_set(text) return except TypeError: - warnings.warn('to_clipboard in excel mode requires a single ' - 'character separator.') + warnings.warn( + "to_clipboard in excel mode requires a single " "character separator." + ) elif sep is not None: - warnings.warn('to_clipboard with excel=False ignores the sep argument') + warnings.warn("to_clipboard with excel=False ignores the sep argument") if isinstance(obj, ABCDataFrame): # str(df) has various unhelpful defaults, like truncation - with option_context('display.max_colwidth', 999999): + with option_context("display.max_colwidth", 999999): objstr = obj.to_string(**kwargs) else: objstr = str(obj) diff --git a/pandas/io/common.py b/pandas/io/common.py index 34635ebf64ad6..9a9620e2d0663 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -12,14 +12,23 @@ import pathlib from urllib.error import URLError # noqa from urllib.parse import ( # noqa - urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params, - uses_relative) + urlencode, + urljoin, + urlparse as parse_url, + uses_netloc, + uses_params, + uses_relative, +) from urllib.request import pathname2url, urlopen import zipfile from pandas.errors import ( # noqa - AbstractMethodError, DtypeWarning, EmptyDataError, ParserError, - ParserWarning) + AbstractMethodError, + DtypeWarning, + EmptyDataError, + ParserError, + ParserWarning, +) from pandas.core.dtypes.common import is_file_like @@ -29,13 +38,29 @@ # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', - '-nan', ''} +_NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A N/A", + "#N/A", + "N/A", + "n/a", + "NA", + "#NA", + "NULL", + "null", + "NaN", + "-NaN", + "nan", + "-nan", + "", +} _VALID_URLS = set(uses_relative + uses_netloc + uses_params) -_VALID_URLS.discard('') +_VALID_URLS.discard("") class BaseIterator: @@ -88,10 +113,12 @@ def _expand_user(filepath_or_buffer): def 
_validate_header_arg(header): if isinstance(header, bool): - raise TypeError("Passing a bool to header is invalid. " - "Use header=None for no header or " - "header=int or list-like of ints to specify " - "the row(s) making up the column names") + raise TypeError( + "Passing a bool to header is invalid. " + "Use header=None for no header or " + "header=int or list-like of ints to specify " + "the row(s) making up the column names" + ) def _stringify_path(filepath_or_buffer): @@ -116,7 +143,7 @@ def _stringify_path(filepath_or_buffer): Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - if hasattr(filepath_or_buffer, '__fspath__'): + if hasattr(filepath_or_buffer, "__fspath__"): return filepath_or_buffer.__fspath__() elif isinstance(filepath_or_buffer, pathlib.Path): return str(filepath_or_buffer) @@ -126,7 +153,7 @@ def _stringify_path(filepath_or_buffer): def is_s3_url(url): """Check for an s3, s3n, or s3a url""" try: - return parse_url(url).scheme in ['s3', 's3n', 's3a'] + return parse_url(url).scheme in ["s3", "s3n", "s3a"] except Exception: return False @@ -134,13 +161,14 @@ def is_s3_url(url): def is_gcs_url(url): """Check for a gcs url""" try: - return parse_url(url).scheme in ['gcs', 'gs'] + return parse_url(url).scheme in ["gcs", "gs"] except Exception: return False -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -164,27 +192,27 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if _is_url(filepath_or_buffer): req = urlopen(filepath_or_buffer) - content_encoding = req.headers.get('Content-Encoding', None) - if content_encoding == 'gzip': + content_encoding = req.headers.get("Content-Encoding", None) + if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = 'gzip' + compression = "gzip" reader = BytesIO(req.read()) req.close() return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 - return s3.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression, - mode=mode) + + return s3.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) if is_gcs_url(filepath_or_buffer): from pandas.io import gcs - return gcs.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression, - mode=mode) + + return gcs.get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False @@ -208,15 +236,10 @@ def file_path_to_url(path): ------- a valid FILE URL """ - return urljoin('file:', pathname2url(path)) + return urljoin("file:", pathname2url(path)) -_compression_to_extension = { - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', -} +_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} def _infer_compression(filepath_or_buffer, compression): @@ -250,7 +273,7 @@ def _infer_compression(filepath_or_buffer, compression): return None # Infer compression - if compression == 'infer': + if compression == "infer": # Convert all path types (e.g. 
pathlib.Path) to strings filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): @@ -267,14 +290,15 @@ def _infer_compression(filepath_or_buffer, compression): if compression in _compression_to_extension: return compression - msg = 'Unrecognized compression type: {}'.format(compression) - valid = ['infer', None] + sorted(_compression_to_extension) - msg += '\nValid compression types are {}'.format(valid) + msg = "Unrecognized compression type: {}".format(compression) + valid = ["infer", None] + sorted(_compression_to_extension) + msg += "\nValid compression types are {}".format(valid) raise ValueError(msg) -def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False, is_text=True): +def _get_handle( + path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True +): """ Get file handle for given path/buffer and mode. @@ -304,6 +328,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, """ try: from s3fs import S3File + need_text_wrapping = (BytesIO, S3File) except ImportError: need_text_wrapping = (BytesIO,) @@ -321,45 +346,47 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, if compression: # GZ Compression - if compression == 'gzip': + if compression == "gzip": if is_path: f = gzip.open(path_or_buf, mode) else: f = gzip.GzipFile(fileobj=path_or_buf) # BZ Compression - elif compression == 'bz2': + elif compression == "bz2": if is_path: f = bz2.BZ2File(path_or_buf, mode) else: f = bz2.BZ2File(path_or_buf) # ZIP Compression - elif compression == 'zip': + elif compression == "zip": zf = BytesZipFile(path_or_buf, mode) # Ensure the container is closed as well. handles.append(zf) - if zf.mode == 'w': + if zf.mode == "w": f = zf - elif zf.mode == 'r': + elif zf.mode == "r": zip_names = zf.namelist() if len(zip_names) == 1: f = zf.open(zip_names.pop()) elif len(zip_names) == 0: - raise ValueError('Zero files found in ZIP file {}' - .format(path_or_buf)) + raise ValueError( + "Zero files found in ZIP file {}".format(path_or_buf) + ) else: - raise ValueError('Multiple files found in ZIP file.' - ' Only one file per ZIP: {}' - .format(zip_names)) + raise ValueError( + "Multiple files found in ZIP file." 
+ " Only one file per ZIP: {}".format(zip_names) + ) # XZ Compression - elif compression == 'xz': + elif compression == "xz": f = lzma.LZMAFile(path_or_buf, mode) # Unrecognized Compression else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) handles.append(f) @@ -370,7 +397,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, f = open(path_or_buf, mode, encoding=encoding, newline="") elif is_text: # No explicit encoding - f = open(path_or_buf, mode, errors='replace', newline="") + f = open(path_or_buf, mode, errors="replace", newline="") else: # Binary mode f = open(path_or_buf, mode) @@ -379,10 +406,11 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, # Convert BytesIO or file objects passed with an encoding if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding, newline='') + + f = TextIOWrapper(f, encoding=encoding, newline="") handles.append(f) - if memory_map and hasattr(f, 'fileno'): + if memory_map and hasattr(f, "fileno"): try: g = MMapWrapper(f) f.close() @@ -405,10 +433,11 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore BytesIO provides attributes of file-like object and ZipFile.writestr writes bytes strings into a member of the archive. """ + # GH 17778 def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): - if mode in ['wb', 'rb']: - mode = mode.replace('b', '') + if mode in ["wb", "rb"]: + mode = mode.replace("b", "") super().__init__(file, mode, compression, **kwargs) def write(self, data): @@ -446,12 +475,12 @@ def __next__(self): # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newline.decode('utf-8') + newline = newline.decode("utf-8") # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise # if that is returned - if newline == '': + if newline == "": raise StopIteration return newline diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 75f353f28549c..ab64bc14344f1 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -17,17 +17,16 @@ def parse_date_fields(year_col, month_col, day_col): return parsing.try_parse_year_month_day(year_col, month_col, day_col) -def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, - second_col): +def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) hour_col = _maybe_cast(hour_col) minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) - return parsing.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, - second_col) + return parsing.try_parse_datetime_components( + year_col, month_col, day_col, hour_col, minute_col, second_col + ) def generic_parser(parse_func, *cols): @@ -57,7 +56,9 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: - raise AssertionError('All columns must have the same length: {0}; ' - 'column {1} has length {2}'.format(N, i, n)) + raise AssertionError( + "All columns must have the same length: {0}; " + "column {1} has length {2}".format(N, i, n) + ) return N diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d10a40541bb6c..fae8f4203e9a0 
100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -11,21 +11,29 @@ from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_kwarg -from pandas.core.dtypes.common import ( - is_bool, is_float, is_integer, is_list_like) +from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame from pandas.io.common import ( - _NA_VALUES, _is_url, _stringify_path, _validate_header_arg, - get_filepath_or_buffer) + _NA_VALUES, + _is_url, + _stringify_path, + _validate_header_arg, + get_filepath_or_buffer, +) from pandas.io.excel._util import ( - _fill_mi_header, _get_default_writer, _maybe_convert_usecols, - _pop_header_name, get_writer) + _fill_mi_header, + _get_default_writer, + _maybe_convert_usecols, + _pop_header_name, + get_writer, +) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser -_read_excel_doc = """ +_read_excel_doc = ( + """ Read an Excel file into a pandas DataFrame. Support both `xls` and `xlsx` file extensions from a local filesystem or URL. @@ -124,8 +132,9 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join( - sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. + as NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. @@ -251,47 +260,53 @@ 1 string2 2.0 2 None NaN """ +) @Appender(_read_excel_doc) @deprecate_kwarg("skip_footer", "skipfooter") -def read_excel(io, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - keep_default_na=True, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skip_footer=0, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): - - for arg in ('sheet', 'sheetname', 'parse_cols'): +def read_excel( + io, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + keep_default_na=True, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skip_footer=0, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds +): + + for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError("read_excel() got an unexpected keyword argument " - "`{}`".format(arg)) + raise TypeError( + "read_excel() got an unexpected keyword argument " "`{}`".format(arg) + ) if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) elif engine and engine != io.engine: - raise ValueError("Engine should not be specified when passing " - "an ExcelFile - ExcelFile already has the engine set") + raise ValueError( + "Engine should not be specified when passing " + "an ExcelFile - ExcelFile already has the engine set" + ) return io.parse( sheet_name=sheet_name, @@ -316,19 +331,17 @@ def read_excel(io, skipfooter=skipfooter, convert_float=convert_float, 
mangle_dupe_cols=mangle_dupe_cols, - **kwds) + **kwds + ) class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if _is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) - elif not isinstance(filepath_or_buffer, - (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer) + elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -339,8 +352,9 @@ def __init__(self, filepath_or_buffer): elif isinstance(filepath_or_buffer, str): self.book = self.load_workbook(filepath_or_buffer) else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') + raise ValueError( + "Must explicitly set engine if not passing in" " buffer or path for io." + ) @property @abc.abstractmethod @@ -368,28 +382,30 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): pass - def parse(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - dtype=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - verbose=False, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + dtype=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + verbose=False, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds + ): _validate_header_arg(header) @@ -439,8 +455,7 @@ def parse(self, if is_integer(skiprows): row += skiprows - data[row], control_row = _fill_mi_header(data[row], - control_row) + data[row], control_row = _fill_mi_header(data[row], control_row) if index_col is not None: header_name, _ = _pop_header_name(data[row], index_col) @@ -460,7 +475,7 @@ def parse(self, last = data[offset][col] for row in range(offset + 1, len(data)): - if data[row][col] == '' or data[row][col] is None: + if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] @@ -469,33 +484,36 @@ def parse(self, # GH 12292 : error when read one empty column from excel file try: - parser = TextParser(data, - names=names, - header=header, - index_col=index_col, - has_index_names=has_index_names, - squeeze=squeeze, - dtype=dtype, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - usecols=usecols, - mangle_dupe_cols=mangle_dupe_cols, - **kwds) + parser = TextParser( + data, + names=names, + header=header, + index_col=index_col, + has_index_names=has_index_names, + squeeze=squeeze, + dtype=dtype, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + usecols=usecols, + 
mangle_dupe_cols=mangle_dupe_cols, + **kwds + ) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ - asheetname].columns.set_names(header_names) + asheetname + ].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame @@ -570,6 +588,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: ... df.to_excel(writer, sheet_name='Sheet3') """ + # Defining an ExcelWriter implementation (see abstract methods for more...) # - Mandatory @@ -595,21 +614,18 @@ def __new__(cls, path, engine=None, **kwargs): # only switch class if generic(ExcelWriter) if cls is ExcelWriter: - if engine is None or (isinstance(engine, str) and - engine == 'auto'): + if engine is None or (isinstance(engine, str) and engine == "auto"): if isinstance(path, str): ext = os.path.splitext(path)[-1][1:] else: - ext = 'xlsx' + ext = "xlsx" try: - engine = config.get_option('io.excel.{ext}.writer' - .format(ext=ext)) - if engine == 'auto': + engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'" - .format(ext=ext)) + raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) cls = get_writer(engine) return object.__new__(cls) @@ -632,8 +648,9 @@ def engine(self): pass @abc.abstractmethod - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): """ Write given formatted cells into Excel an excel sheet @@ -657,14 +674,20 @@ def save(self): """ pass - def __init__(self, path, engine=None, - date_format=None, datetime_format=None, mode='w', - **engine_kwargs): + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs + ): # validate that this engine can handle the extension if isinstance(path, str): ext = os.path.splitext(path)[-1] else: - ext = 'xls' if engine == 'xlwt' else 'xlsx' + ext = "xls" if engine == "xlwt" else "xlsx" self.check_extension(ext) @@ -673,11 +696,11 @@ def __init__(self, path, engine=None, self.cur_sheet = None if date_format is None: - self.date_format = 'YYYY-MM-DD' + self.date_format = "YYYY-MM-DD" else: self.date_format = date_format if datetime_format is None: - self.datetime_format = 'YYYY-MM-DD HH:MM:SS' + self.datetime_format = "YYYY-MM-DD HH:MM:SS" else: self.datetime_format = datetime_format @@ -690,8 +713,9 @@ def _get_sheet_name(self, sheet_name): if sheet_name is None: sheet_name = self.cur_sheet if sheet_name is None: # pragma: no cover - raise ValueError('Must pass explicit sheet_name or set ' - 'cur_sheet property') + raise ValueError( + "Must pass explicit sheet_name or set " "cur_sheet property" + ) return sheet_name def _value_with_fmt(self, val): @@ -721,7 +745,7 @@ def _value_with_fmt(self, val): fmt = self.date_format elif isinstance(val, timedelta): val = val.total_seconds() / float(86400) - fmt = '0' + fmt = "0" else: val = str(val) @@ -731,12 +755,12 @@ def _value_with_fmt(self, val): def check_extension(cls, ext): """checks that path's extension against the Writer's supported extensions. 
If it isn't supported, raises UnsupportedFiletypeError.""" - if ext.startswith('.'): + if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = ("Invalid extension for engine '{engine}': '{ext}'" - .format(engine=pprint_thing(cls.engine), - ext=pprint_thing(ext))) + msg = "Invalid extension for engine '{engine}': '{ext}'".format( + engine=pprint_thing(cls.engine), ext=pprint_thing(ext) + ) raise ValueError(msg) else: return True @@ -772,15 +796,11 @@ class ExcelFile: from pandas.io.excel._openpyxl import _OpenpyxlReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { - 'xlrd': _XlrdReader, - 'openpyxl': _OpenpyxlReader, - 'odf': _ODFReader, - } + _engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader} def __init__(self, io, engine=None): if engine is None: - engine = 'xlrd' + engine = "xlrd" if engine not in self._engines: raise ValueError("Unknown engine: {engine}".format(engine=engine)) @@ -795,27 +815,29 @@ def __init__(self, io, engine=None): def __fspath__(self): return self._io - def parse(self, - sheet_name=0, - header=0, - names=None, - index_col=None, - usecols=None, - squeeze=False, - converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, - na_values=None, - parse_dates=False, - date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=True, - mangle_dupe_cols=True, - **kwds): + def parse( + self, + sheet_name=0, + header=0, + names=None, + index_col=None, + usecols=None, + squeeze=False, + converters=None, + true_values=None, + false_values=None, + skiprows=None, + nrows=None, + na_values=None, + parse_dates=False, + date_parser=None, + thousands=None, + comment=None, + skipfooter=0, + convert_float=True, + mangle_dupe_cols=True, + **kwds + ): """ Parse specified sheet(s) into a DataFrame @@ -827,30 +849,33 @@ def parse(self, DataFrame or dict of DataFrames DataFrame from the passed in Excel file. 
""" - if 'chunksize' in kwds: - raise NotImplementedError("chunksize keyword of read_excel " - "is not implemented") - - return self._reader.parse(sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - **kwds) + if "chunksize" in kwds: + raise NotImplementedError( + "chunksize keyword of read_excel " "is not implemented" + ) + + return self._reader.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + **kwds + ) @property def book(self): @@ -862,7 +887,7 @@ def sheet_names(self): def close(self): """close io if necessary""" - if hasattr(self.io, 'close'): + if hasattr(self.io, "close"): self.io.close() def __enter__(self): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index c820c1497c3c9..3be36663bac79 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -16,6 +16,7 @@ class _ODFReader(_BaseExcelReader): filepath_or_buffer: string, path to be parsed or an open readable stream. """ + def __init__(self, filepath_or_buffer: FilePathOrBuffer): import_optional_dependency("odf") super().__init__(filepath_or_buffer) @@ -23,16 +24,18 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer): @property def _workbook_class(self): from odf.opendocument import OpenDocument + return OpenDocument def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from odf.opendocument import load + return load(filepath_or_buffer) @property def empty_value(self) -> str: """Property for compat with other readers.""" - return '' + return "" @property def sheet_names(self) -> List[str]: @@ -44,6 +47,7 @@ def sheet_names(self) -> List[str]: def get_sheet_by_index(self, index: int): from odf.table import Table + tables = self.book.getElementsByType(Table) return tables[index] @@ -74,8 +78,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: table = [] # type: List[List[Scalar]] for i, sheet_row in enumerate(sheet_rows): - sheet_cells = [x for x in sheet_row.childNodes - if x.qname in cell_names] + sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 table_row = [] # type: List[Scalar] @@ -122,12 +125,12 @@ def _get_row_repeat(self, row) -> int: """ from odf.namespaces import TABLENS - return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1)) + return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1)) def _get_column_repeat(self, cell) -> int: from odf.namespaces import TABLENS - return int(cell.attributes.get( - (TABLENS, 'number-columns-repeated'), 1)) + + return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1)) def _is_empty_row(self, row) -> bool: """Helper function to find empty rows @@ -140,18 +143,19 @@ def _is_empty_row(self, row) -> bool: def _get_cell_value(self, 
cell, convert_float: bool) -> Scalar: from odf.namespaces import OFFICENS - cell_type = cell.attributes.get((OFFICENS, 'value-type')) - if cell_type == 'boolean': + + cell_type = cell.attributes.get((OFFICENS, "value-type")) + if cell_type == "boolean": if str(cell) == "TRUE": return True return False if cell_type is None: return self.empty_value - elif cell_type == 'float': + elif cell_type == "float": # GH5394 - cell_value = float(cell.attributes.get((OFFICENS, 'value'))) + cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if cell_value == 0. and str(cell) != cell_value: # NA handling + if cell_value == 0.0 and str(cell) != cell_value: # NA handling return str(cell) if convert_float: @@ -159,18 +163,18 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: if val == cell_value: return val return cell_value - elif cell_type == 'percentage': - cell_value = cell.attributes.get((OFFICENS, 'value')) + elif cell_type == "percentage": + cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) - elif cell_type == 'string': + elif cell_type == "string": return str(cell) - elif cell_type == 'currency': - cell_value = cell.attributes.get((OFFICENS, 'value')) + elif cell_type == "currency": + cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) - elif cell_type == 'date': - cell_value = cell.attributes.get((OFFICENS, 'date-value')) + elif cell_type == "date": + cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.to_datetime(cell_value) - elif cell_type == 'time': + elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError('Unrecognized type {}'.format(cell_type)) + raise ValueError("Unrecognized type {}".format(cell_type)) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7b1e203bd33ad..d8f5da5ab5bc6 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -11,17 +11,18 @@ class _OpenpyxlWriter(ExcelWriter): - engine = 'openpyxl' - supported_extensions = ('.xlsx', '.xlsm') + engine = "openpyxl" + supported_extensions = (".xlsx", ".xlsm") - def __init__(self, path, engine=None, mode='w', **engine_kwargs): + def __init__(self, path, engine=None, mode="w", **engine_kwargs): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook super().__init__(path, mode=mode, **engine_kwargs) - if self.mode == 'a': # Load from existing workbook + if self.mode == "a": # Load from existing workbook from openpyxl import load_workbook + book = load_workbook(self.path) self.book = book else: @@ -52,12 +53,16 @@ def _convert_to_style(cls, style_dict): """ from openpyxl.style import Style + xls_style = Style() for key, value in style_dict.items(): for nk, nv in value.items(): if key == "borders": - (xls_style.borders.__getattribute__(nk) - .__setattr__('border_style', nv)) + ( + xls_style.borders.__getattribute__(nk).__setattr__( + "border_style", nv + ) + ) else: xls_style.__getattribute__(key).__setattr__(nk, nv) @@ -86,16 +91,13 @@ def _convert_to_style_kwargs(cls, style_dict): appropriate class. 
""" - _style_key_map = { - 'borders': 'border', - } + _style_key_map = {"borders": "border"} style_kwargs = {} for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k), - lambda x: None) + _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v @@ -160,19 +162,19 @@ def _convert_to_font(cls, font_dict): from openpyxl.styles import Font _font_key_map = { - 'sz': 'size', - 'b': 'bold', - 'i': 'italic', - 'u': 'underline', - 'strike': 'strikethrough', - 'vertalign': 'vertAlign', + "sz": "size", + "b": "bold", + "i": "italic", + "u": "underline", + "strike": "strikethrough", + "vertalign": "vertAlign", } font_kwargs = {} for k, v in font_dict.items(): if k in _font_key_map: k = _font_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) font_kwargs[k] = v @@ -222,17 +224,15 @@ def _convert_to_fill(cls, fill_dict): from openpyxl.styles import PatternFill, GradientFill _pattern_fill_key_map = { - 'patternType': 'fill_type', - 'patterntype': 'fill_type', - 'fgColor': 'start_color', - 'fgcolor': 'start_color', - 'bgColor': 'end_color', - 'bgcolor': 'end_color', + "patternType": "fill_type", + "patterntype": "fill_type", + "fgColor": "start_color", + "fgcolor": "start_color", + "bgColor": "end_color", + "bgcolor": "end_color", } - _gradient_fill_key_map = { - 'fill_type': 'type', - } + _gradient_fill_key_map = {"fill_type": "type"} pfill_kwargs = {} gfill_kwargs = {} @@ -242,9 +242,9 @@ def _convert_to_fill(cls, fill_dict): pk = _pattern_fill_key_map[k] if k in _gradient_fill_key_map: gk = _gradient_fill_key_map[k] - if pk in ['start_color', 'end_color']: + if pk in ["start_color", "end_color"]: v = cls._convert_to_color(v) - if gk == 'stop': + if gk == "stop": v = cls._convert_to_stop(v) if pk: pfill_kwargs[pk] = v @@ -277,9 +277,7 @@ def _convert_to_side(cls, side_spec): from openpyxl.styles import Side - _side_key_map = { - 'border_style': 'style', - } + _side_key_map = {"border_style": "style"} if isinstance(side_spec, str): return Side(style=side_spec) @@ -288,7 +286,7 @@ def _convert_to_side(cls, side_spec): for k, v in side_spec.items(): if k in _side_key_map: k = _side_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) side_kwargs[k] = v @@ -320,18 +318,15 @@ def _convert_to_border(cls, border_dict): from openpyxl.styles import Border - _border_key_map = { - 'diagonalup': 'diagonalUp', - 'diagonaldown': 'diagonalDown', - } + _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"} border_kwargs = {} for k, v in border_dict.items(): if k in _border_key_map: k = _border_key_map[k] - if k == 'color': + if k == "color": v = cls._convert_to_color(v) - if k in ['left', 'right', 'top', 'bottom', 'diagonal']: + if k in ["left", "right", "top", "bottom", "diagonal"]: v = cls._convert_to_side(v) border_kwargs[k] = v @@ -374,7 +369,7 @@ def _convert_to_number_format(cls, number_format_dict): ------- number_format : str """ - return number_format_dict['format_code'] + return number_format_dict["format_code"] @classmethod def _convert_to_protection(cls, protection_dict): @@ -394,8 +389,9 @@ def _convert_to_protection(cls, protection_dict): return Protection(**protection_dict) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame 
cells using openpyxl. sheet_name = self._get_sheet_name(sheet_name) @@ -409,13 +405,13 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, self.sheets[sheet_name] = wks if _validate_freeze_panes(freeze_panes): - wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1, - column=freeze_panes[1] + 1) + wks.freeze_panes = wks.cell( + row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 + ) for cell in cells: xcell = wks.cell( - row=startrow + cell.row + 1, - column=startcol + cell.col + 1 + row=startrow + cell.row + 1, column=startcol + cell.col + 1 ) xcell.value, fmt = self._value_with_fmt(cell.val) if fmt: @@ -439,7 +435,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, start_row=startrow + cell.row + 1, start_column=startcol + cell.col + 1, end_column=startcol + cell.mergeend + 1, - end_row=startrow + cell.mergestart + 1 + end_row=startrow + cell.mergestart + 1, ) # When cells are merged only the top-left cell is preserved @@ -462,7 +458,6 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: """Reader using openpyxl engine. @@ -477,12 +472,15 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: @property def _workbook_class(self): from openpyxl import Workbook + return Workbook def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): from openpyxl import load_workbook - return load_workbook(filepath_or_buffer, - read_only=True, data_only=True, keep_links=False) + + return load_workbook( + filepath_or_buffer, read_only=True, data_only=True, keep_links=False + ) @property def sheet_names(self) -> List[str]: @@ -499,13 +497,13 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: # TODO: replace with openpyxl constants if cell.is_date: return cell.value - elif cell.data_type == 'e': + elif cell.data_type == "e": return np.nan - elif cell.data_type == 'b': + elif cell.data_type == "b": return bool(cell.value) elif cell.value is None: - return '' # compat with xlrd - elif cell.data_type == 'n': + return "" # compat with xlrd + elif cell.data_type == "n": # GH5394 if convert_float: val = int(cell.value) @@ -519,7 +517,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: data = [] # type: List[List[Scalar]] for row in sheet.rows: - data.append( - [self._convert_cell(cell, convert_float) for cell in row]) + data.append([self._convert_cell(cell, convert_float) for cell in row]) return data diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 286efea9f120e..2ba3842d5c0c9 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -37,12 +37,12 @@ def _get_default_writer(ext): str The default engine for the extension. 
""" - _default_writers = {'xlsx': 'openpyxl', 'xlsm': 'openpyxl', 'xls': 'xlwt'} - xlsxwriter = import_optional_dependency("xlsxwriter", - raise_on_missing=False, - on_version="warn") + _default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"} + xlsxwriter = import_optional_dependency( + "xlsxwriter", raise_on_missing=False, on_version="warn" + ) if xlsxwriter: - _default_writers['xlsx'] = 'xlsxwriter' + _default_writers["xlsx"] = "xlsxwriter" return _default_writers[ext] @@ -50,8 +50,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'" - .format(engine=engine_name)) + raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) def _excel2num(x): @@ -137,10 +136,15 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): - warnings.warn(("Passing in an integer for `usecols` has been " - "deprecated. Please pass in a list of int from " - "0 to `usecols` inclusive instead."), - FutureWarning, stacklevel=2) + warnings.warn( + ( + "Passing in an integer for `usecols` has been " + "deprecated. Please pass in a list of int from " + "0 to `usecols` inclusive instead." + ), + FutureWarning, + stacklevel=2, + ) return list(range(usecols + 1)) if isinstance(usecols, str): @@ -151,14 +155,15 @@ def _maybe_convert_usecols(usecols): def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: - if ( - len(freeze_panes) == 2 and - all(isinstance(item, int) for item in freeze_panes) + if len(freeze_panes) == 2 and all( + isinstance(item, int) for item in freeze_panes ): return True - raise ValueError("freeze_panes must be of form (row, column)" - " where row and column are integers") + raise ValueError( + "freeze_panes must be of form (row, column)" + " where row and column are integers" + ) # freeze_panes wasn't specified, return False so it won't be applied # to output sheet @@ -168,7 +173,7 @@ def _validate_freeze_panes(freeze_panes): def _trim_excel_header(row): # trim header row so auto-index inference works # xlrd uses '' , openpyxl None - while len(row) > 0 and (row[0] == '' or row[0] is None): + while len(row) > 0 and (row[0] == "" or row[0] is None): row = row[1:] return row @@ -195,7 +200,7 @@ def _fill_mi_header(row, control_row): if not control_row[i]: last = row[i] - if row[i] == '' or row[i] is None: + if row[i] == "" or row[i] is None: row[i] = last else: control_row[i] = False @@ -228,4 +233,4 @@ def _pop_header_name(row, index_col): header_name = row[i] header_name = None if header_name == "" else header_name - return header_name, row[:i] + [''] + row[i + 1:] + return header_name, row[:i] + [""] + row[i + 1 :] diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index fcc432dc7a5ad..be1b78eeb146e 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -8,7 +8,6 @@ class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): """Reader using xlrd engine. 
@@ -24,10 +23,12 @@ def __init__(self, filepath_or_buffer): @property def _workbook_class(self): from xlrd import Book + return Book def load_workbook(self, filepath_or_buffer): from xlrd import open_workbook + if hasattr(filepath_or_buffer, "read"): data = filepath_or_buffer.read() return open_workbook(file_contents=data) @@ -45,9 +46,13 @@ def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) def get_sheet_data(self, sheet, convert_float): - from xlrd import (xldate, XL_CELL_DATE, - XL_CELL_ERROR, XL_CELL_BOOLEAN, - XL_CELL_NUMBER) + from xlrd import ( + xldate, + XL_CELL_DATE, + XL_CELL_ERROR, + XL_CELL_BOOLEAN, + XL_CELL_NUMBER, + ) epoch1904 = self.book.datemode @@ -59,8 +64,7 @@ def _parse_cell(cell_contents, cell_typ): # Use the newer xlrd datetime handling. try: - cell_contents = xldate.xldate_as_datetime( - cell_contents, epoch1904) + cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904) except OverflowError: return cell_contents @@ -68,12 +72,15 @@ def _parse_cell(cell_contents, cell_typ): # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) or - (epoch1904 and year == (1904, 1, 1))): - cell_contents = time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) + if (not epoch1904 and year == (1899, 12, 31)) or ( + epoch1904 and year == (1904, 1, 1) + ): + cell_contents = time( + cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond, + ) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan @@ -90,9 +97,10 @@ def _parse_cell(cell_contents, cell_typ): data = [] for i in range(sheet.nrows): - row = [_parse_cell(value, typ) - for value, typ in zip(sheet.row_values(i), - sheet.row_types(i))] + row = [ + _parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) + ] data.append(row) return data diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 2ddfcf3de5a8f..07bf265da4863 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -9,75 +9,69 @@ class _XlsxStyler: # Ordering necessary for both determinism and because some are keyed by # prefixes of others. 
STYLE_MAPPING = { - 'font': [ - (('name',), 'font_name'), - (('sz',), 'font_size'), - (('size',), 'font_size'), - (('color', 'rgb',), 'font_color'), - (('color',), 'font_color'), - (('b',), 'bold'), - (('bold',), 'bold'), - (('i',), 'italic'), - (('italic',), 'italic'), - (('u',), 'underline'), - (('underline',), 'underline'), - (('strike',), 'font_strikeout'), - (('vertAlign',), 'font_script'), - (('vertalign',), 'font_script'), + "font": [ + (("name",), "font_name"), + (("sz",), "font_size"), + (("size",), "font_size"), + (("color", "rgb"), "font_color"), + (("color",), "font_color"), + (("b",), "bold"), + (("bold",), "bold"), + (("i",), "italic"), + (("italic",), "italic"), + (("u",), "underline"), + (("underline",), "underline"), + (("strike",), "font_strikeout"), + (("vertAlign",), "font_script"), + (("vertalign",), "font_script"), ], - 'number_format': [ - (('format_code',), 'num_format'), - ((), 'num_format',), + "number_format": [(("format_code",), "num_format"), ((), "num_format")], + "protection": [(("locked",), "locked"), (("hidden",), "hidden")], + "alignment": [ + (("horizontal",), "align"), + (("vertical",), "valign"), + (("text_rotation",), "rotation"), + (("wrap_text",), "text_wrap"), + (("indent",), "indent"), + (("shrink_to_fit",), "shrink"), ], - 'protection': [ - (('locked',), 'locked'), - (('hidden',), 'hidden'), + "fill": [ + (("patternType",), "pattern"), + (("patterntype",), "pattern"), + (("fill_type",), "pattern"), + (("start_color", "rgb"), "fg_color"), + (("fgColor", "rgb"), "fg_color"), + (("fgcolor", "rgb"), "fg_color"), + (("start_color",), "fg_color"), + (("fgColor",), "fg_color"), + (("fgcolor",), "fg_color"), + (("end_color", "rgb"), "bg_color"), + (("bgColor", "rgb"), "bg_color"), + (("bgcolor", "rgb"), "bg_color"), + (("end_color",), "bg_color"), + (("bgColor",), "bg_color"), + (("bgcolor",), "bg_color"), ], - 'alignment': [ - (('horizontal',), 'align'), - (('vertical',), 'valign'), - (('text_rotation',), 'rotation'), - (('wrap_text',), 'text_wrap'), - (('indent',), 'indent'), - (('shrink_to_fit',), 'shrink'), - ], - 'fill': [ - (('patternType',), 'pattern'), - (('patterntype',), 'pattern'), - (('fill_type',), 'pattern'), - (('start_color', 'rgb',), 'fg_color'), - (('fgColor', 'rgb',), 'fg_color'), - (('fgcolor', 'rgb',), 'fg_color'), - (('start_color',), 'fg_color'), - (('fgColor',), 'fg_color'), - (('fgcolor',), 'fg_color'), - (('end_color', 'rgb',), 'bg_color'), - (('bgColor', 'rgb',), 'bg_color'), - (('bgcolor', 'rgb',), 'bg_color'), - (('end_color',), 'bg_color'), - (('bgColor',), 'bg_color'), - (('bgcolor',), 'bg_color'), - ], - 'border': [ - (('color', 'rgb',), 'border_color'), - (('color',), 'border_color'), - (('style',), 'border'), - (('top', 'color', 'rgb',), 'top_color'), - (('top', 'color',), 'top_color'), - (('top', 'style',), 'top'), - (('top',), 'top'), - (('right', 'color', 'rgb',), 'right_color'), - (('right', 'color',), 'right_color'), - (('right', 'style',), 'right'), - (('right',), 'right'), - (('bottom', 'color', 'rgb',), 'bottom_color'), - (('bottom', 'color',), 'bottom_color'), - (('bottom', 'style',), 'bottom'), - (('bottom',), 'bottom'), - (('left', 'color', 'rgb',), 'left_color'), - (('left', 'color',), 'left_color'), - (('left', 'style',), 'left'), - (('left',), 'left'), + "border": [ + (("color", "rgb"), "border_color"), + (("color",), "border_color"), + (("style",), "border"), + (("top", "color", "rgb"), "top_color"), + (("top", "color"), "top_color"), + (("top", "style"), "top"), + (("top",), "top"), + (("right", "color", 
"rgb"), "right_color"), + (("right", "color"), "right_color"), + (("right", "style"), "right"), + (("right",), "right"), + (("bottom", "color", "rgb"), "bottom_color"), + (("bottom", "color"), "bottom_color"), + (("bottom", "style"), "bottom"), + (("bottom",), "bottom"), + (("left", "color", "rgb"), "left_color"), + (("left", "color"), "left_color"), + (("left", "style"), "left"), + (("left",), "left"), ], } @@ -96,14 +90,14 @@ def convert(cls, style_dict, num_format_str=None): props = {} if num_format_str is not None: - props['num_format'] = num_format_str + props["num_format"] = num_format_str if style_dict is None: return props - if 'borders' in style_dict: + if "borders" in style_dict: style_dict = style_dict.copy() - style_dict['border'] = style_dict.pop('borders') + style_dict["border"] = style_dict.pop("borders") for style_group_key, style_group in style_dict.items(): for src, dst in cls.STYLE_MAPPING.get(style_group_key, []): @@ -120,51 +114,76 @@ def convert(cls, style_dict, num_format_str=None): else: props[dst] = v - if isinstance(props.get('pattern'), str): + if isinstance(props.get("pattern"), str): # TODO: support other fill patterns - props['pattern'] = 0 if props['pattern'] == 'none' else 1 + props["pattern"] = 0 if props["pattern"] == "none" else 1 - for k in ['border', 'top', 'right', 'bottom', 'left']: + for k in ["border", "top", "right", "bottom", "left"]: if isinstance(props.get(k), str): try: - props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted', - 'thick', 'double', 'hair', 'mediumDashed', - 'dashDot', 'mediumDashDot', 'dashDotDot', - 'mediumDashDotDot', - 'slantDashDot'].index(props[k]) + props[k] = [ + "none", + "thin", + "medium", + "dashed", + "dotted", + "thick", + "double", + "hair", + "mediumDashed", + "dashDot", + "mediumDashDot", + "dashDotDot", + "mediumDashDotDot", + "slantDashDot", + ].index(props[k]) except ValueError: props[k] = 2 - if isinstance(props.get('font_script'), str): - props['font_script'] = ['baseline', 'superscript', - 'subscript'].index(props['font_script']) + if isinstance(props.get("font_script"), str): + props["font_script"] = ["baseline", "superscript", "subscript"].index( + props["font_script"] + ) - if isinstance(props.get('underline'), str): - props['underline'] = {'none': 0, 'single': 1, 'double': 2, - 'singleAccounting': 33, - 'doubleAccounting': 34}[props['underline']] + if isinstance(props.get("underline"), str): + props["underline"] = { + "none": 0, + "single": 1, + "double": 2, + "singleAccounting": 33, + "doubleAccounting": 34, + }[props["underline"]] return props class _XlsxWriter(ExcelWriter): - engine = 'xlsxwriter' - supported_extensions = ('.xlsx',) - - def __init__(self, path, engine=None, - date_format=None, datetime_format=None, mode='w', - **engine_kwargs): + engine = "xlsxwriter" + supported_extensions = (".xlsx",) + + def __init__( + self, + path, + engine=None, + date_format=None, + datetime_format=None, + mode="w", + **engine_kwargs + ): # Use the xlsxwriter module as the Excel writer. 
import xlsxwriter - if mode == 'a': - raise ValueError('Append mode is not supported with xlsxwriter!') + if mode == "a": + raise ValueError("Append mode is not supported with xlsxwriter!") - super().__init__(path, engine=engine, - date_format=date_format, - datetime_format=datetime_format, - mode=mode, - **engine_kwargs) + super().__init__( + path, + engine=engine, + date_format=date_format, + datetime_format=datetime_format, + mode=mode, + **engine_kwargs + ) self.book = xlsxwriter.Workbook(path, **engine_kwargs) @@ -175,8 +194,9 @@ def save(self): return self.book.close() - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame cells using xlsxwriter. sheet_name = self._get_sheet_name(sheet_name) @@ -186,7 +206,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, wks = self.book.add_worksheet(sheet_name) self.sheets[sheet_name] = wks - style_dict = {'null': None} + style_dict = {"null": None} if _validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) @@ -201,17 +221,17 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, if stylekey in style_dict: style = style_dict[stylekey] else: - style = self.book.add_format( - _XlsxStyler.convert(cell.style, fmt)) + style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt)) style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: - wks.merge_range(startrow + cell.row, - startcol + cell.col, - startrow + cell.mergestart, - startcol + cell.mergeend, - val, style) + wks.merge_range( + startrow + cell.row, + startcol + cell.col, + startrow + cell.mergestart, + startcol + cell.mergeend, + val, + style, + ) else: - wks.write(startrow + cell.row, - startcol + cell.col, - val, style) + wks.write(startrow + cell.row, startcol + cell.col, val, style) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 62a57b99fe556..fe3d0a208de6a 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -5,22 +5,22 @@ class _XlwtWriter(ExcelWriter): - engine = 'xlwt' - supported_extensions = ('.xls',) + engine = "xlwt" + supported_extensions = (".xls",) - def __init__(self, path, engine=None, encoding=None, mode='w', - **engine_kwargs): + def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): # Use the xlwt module as the Excel writer. import xlwt - engine_kwargs['engine'] = engine - if mode == 'a': - raise ValueError('Append mode is not supported with xlwt!') + engine_kwargs["engine"] = engine + + if mode == "a": + raise ValueError("Append mode is not supported with xlwt!") super().__init__(path, mode=mode, **engine_kwargs) if encoding is None: - encoding = 'ascii' + encoding = "ascii" self.book = xlwt.Workbook(encoding=encoding) self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format) self.fm_date = xlwt.easyxf(num_format_str=self.date_format) @@ -31,8 +31,9 @@ def save(self): """ return self.book.save(self.path) - def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, - freeze_panes=None): + def write_cells( + self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None + ): # Write the frame cells using xlwt. 
sheet_name = self._get_sheet_name(sheet_name) @@ -64,19 +65,19 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0, style_dict[stylekey] = style if cell.mergestart is not None and cell.mergeend is not None: - wks.write_merge(startrow + cell.row, - startrow + cell.mergestart, - startcol + cell.col, - startcol + cell.mergeend, - val, style) + wks.write_merge( + startrow + cell.row, + startrow + cell.mergestart, + startcol + cell.col, + startcol + cell.mergeend, + val, + style, + ) else: - wks.write(startrow + cell.row, - startcol + cell.col, - val, style) + wks.write(startrow + cell.row, startcol + cell.col, val, style) @classmethod - def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', - line_sep=';'): + def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"): """helper which recursively generate an xlwt easy style string for example: @@ -91,17 +92,19 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',', border: top thin, right thin, bottom thin, left thin; \ align: horiz center; """ - if hasattr(item, 'items'): + if hasattr(item, "items"): if firstlevel: - it = ["{key}: {val}" - .format(key=key, val=cls._style_to_xlwt(value, False)) - for key, value in item.items()] + it = [ + "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + for key, value in item.items() + ] out = "{sep} ".format(sep=(line_sep).join(it)) return out else: - it = ["{key} {val}" - .format(key=key, val=cls._style_to_xlwt(value, False)) - for key, value in item.items()] + it = [ + "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + for key, value in item.items() + ] out = "{sep} ".format(sep=(field_sep).join(it)) return out else: @@ -123,7 +126,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None): if style_dict: xlwt_stylestr = cls._style_to_xlwt(style_dict) - style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';') + style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";") else: style = xlwt.XFStyle() if num_format_str is not None: diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 93252f3a09ceb..05608f69c0d9d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -28,7 +28,7 @@ def to_feather(df, path): if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") - valid_types = {'string', 'unicode'} + valid_types = {"string", "unicode"} # validate index # -------------- @@ -37,20 +37,24 @@ def to_feather(df, path): # raise on anything else as we don't serialize the index if not isinstance(df.index, Int64Index): - raise ValueError("feather does not support serializing {} " - "for the index; you can .reset_index()" - "to make the index into column(s)".format( - type(df.index))) + raise ValueError( + "feather does not support serializing {} " + "for the index; you can .reset_index()" + "to make the index into column(s)".format(type(df.index)) + ) if not df.index.equals(RangeIndex.from_range(range(len(df)))): - raise ValueError("feather does not support serializing a " - "non-default index for the index; you " - "can .reset_index() to make the index " - "into column(s)") + raise ValueError( + "feather does not support serializing a " + "non-default index for the index; you " + "can .reset_index() to make the index " + "into column(s)" + ) if df.index.name is not None: - raise ValueError("feather does not serialize index meta-data on a " - "default index") + raise ValueError( + "feather does not serialize index meta-data on 
a " "default index" + ) # validate columns # ---------------- @@ -62,7 +66,7 @@ def to_feather(df, path): feather.write_feather(df, path) -@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads') +@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads") def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path @@ -95,12 +99,10 @@ def read_feather(path, columns=None, use_threads=True): path = _stringify_path(path) - if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'): + if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"): int_use_threads = int(use_threads) if int_use_threads < 1: int_use_threads = 1 - return feather.read_feather(path, columns=columns, - nthreads=int_use_threads) + return feather.read_feather(path, columns=columns, nthreads=int_use_threads) - return feather.read_feather(path, columns=columns, - use_threads=bool(use_threads)) + return feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index 19c822e5dc270..7f8f2fbea2352 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -12,9 +12,9 @@ def get_console_size(): """ from pandas import get_option - display_width = get_option('display.width') + display_width = get_option("display.width") # deprecated. - display_height = get_option('display.max_rows') + display_height = get_option("display.max_rows") # Consider # interactive shell terminal, can detect term size @@ -31,8 +31,9 @@ def get_console_size(): # sane defaults for interactive non-shell terminal # match default for width,height in config_init from pandas._config.config import get_default_val - terminal_width = get_default_val('display.width') - terminal_height = get_default_val('display.max_rows') + + terminal_width = get_default_val("display.width") + terminal_height = get_default_val("display.max_rows") else: # pure terminal terminal_width, terminal_height = get_terminal_size() @@ -48,6 +49,7 @@ def get_console_size(): # ---------------------------------------------------------------------- # Detect our environment + def in_interactive_session(): """ check if we're running in an interactive shell @@ -59,9 +61,8 @@ def check_main(): try: import __main__ as main except ModuleNotFoundError: - return get_option('mode.sim_interactive') - return (not hasattr(main, '__file__') or - get_option('mode.sim_interactive')) + return get_option("mode.sim_interactive") + return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: return __IPYTHON__ or check_main() # noqa @@ -75,7 +76,7 @@ def in_ipython_frontend(): """ try: ip = get_ipython() # noqa - return 'zmq' in str(type(ip)).lower() + return "zmq" in str(type(ip)).lower() except NameError: pass diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 2527e45650ea3..92fe87cddb35b 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -7,6 +7,7 @@ class CSSWarning(UserWarning): """This CSS syntax cannot currently be parsed""" + pass @@ -63,9 +64,9 @@ def __call__(self, declarations_str, inherited=None): props[prop] = val for prop, val in list(props.items()): - if val == 'inherit': - val = inherited.get(prop, 'initial') - if val == 'initial': + if val == "inherit": + val = inherited.get(prop, "initial") + if val == "initial": val = None if val is None: @@ -75,90 +76,94 @@ def __call__(self, declarations_str, inherited=None): props[prop] = val # 2. 
resolve relative font size - if props.get('font-size'): - if 'font-size' in inherited: - em_pt = inherited['font-size'] - assert em_pt[-2:] == 'pt' + if props.get("font-size"): + if "font-size" in inherited: + em_pt = inherited["font-size"] + assert em_pt[-2:] == "pt" em_pt = float(em_pt[:-2]) else: em_pt = None - props['font-size'] = self.size_to_pt( - props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS) + props["font-size"] = self.size_to_pt( + props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + ) - font_size = float(props['font-size'][:-2]) + font_size = float(props["font-size"][:-2]) else: font_size = None # 3. TODO: resolve other font-relative units for side in self.SIDES: - prop = 'border-{side}-width'.format(side=side) + prop = "border-{side}-width".format(side=side) if prop in props: props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, - conversions=self.BORDER_WIDTH_RATIOS) - for prop in ['margin-{side}'.format(side=side), - 'padding-{side}'.format(side=side)]: + props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + ) + for prop in [ + "margin-{side}".format(side=side), + "padding-{side}".format(side=side), + ]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, - conversions=self.MARGIN_RATIOS) + props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + ) return props UNIT_RATIOS = { - 'rem': ('pt', 12), - 'ex': ('em', .5), + "rem": ("pt", 12), + "ex": ("em", 0.5), # 'ch': - 'px': ('pt', .75), - 'pc': ('pt', 12), - 'in': ('pt', 72), - 'cm': ('in', 1 / 2.54), - 'mm': ('in', 1 / 25.4), - 'q': ('mm', .25), - '!!default': ('em', 0), + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), } FONT_SIZE_RATIOS = UNIT_RATIOS.copy() - FONT_SIZE_RATIOS.update({ - '%': ('em', .01), - 'xx-small': ('rem', .5), - 'x-small': ('rem', .625), - 'small': ('rem', .8), - 'medium': ('rem', 1), - 'large': ('rem', 1.125), - 'x-large': ('rem', 1.5), - 'xx-large': ('rem', 2), - 'smaller': ('em', 1 / 1.2), - 'larger': ('em', 1.2), - '!!default': ('em', 1), - }) + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) MARGIN_RATIOS = UNIT_RATIOS.copy() - MARGIN_RATIOS.update({ - 'none': ('pt', 0), - }) + MARGIN_RATIOS.update({"none": ("pt", 0)}) BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() - BORDER_WIDTH_RATIOS.update({ - 'none': ('pt', 0), - 'thick': ('px', 4), - 'medium': ('px', 2), - 'thin': ('px', 1), - # Default: medium only if solid - }) + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): - warnings.warn('Unhandled size: {val!r}'.format(val=in_val), - CSSWarning) - return self.size_to_pt('1!!default', conversions=conversions) + warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning) + return self.size_to_pt("1!!default", conversions=conversions) try: - val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups() + val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() except AttributeError: return _error() - if val == '': + if 
val == "": # hack for 'large' etc. val = 1 else: @@ -167,13 +172,13 @@ def _error(): except ValueError: return _error() - while unit != 'pt': - if unit == 'em': + while unit != "pt": + if unit == "em": if em_pt is None: - unit = 'rem' + unit = "rem" else: val *= em_pt - unit = 'pt' + unit = "pt" continue try: @@ -184,14 +189,14 @@ def _error(): val = round(val, 5) if int(val) == val: - size_fmt = '{fmt:d}pt'.format(fmt=int(val)) + size_fmt = "{fmt:d}pt".format(fmt=int(val)) else: - size_fmt = '{fmt:f}pt'.format(fmt=val) + size_fmt = "{fmt:f}pt".format(fmt=val) return size_fmt def atomize(self, declarations): for prop, value in declarations: - attr = 'expand_' + prop.replace('-', '_') + attr = "expand_" + prop.replace("-", "_") try: expand = getattr(self, attr) except AttributeError: @@ -206,7 +211,7 @@ def atomize(self, declarations): 3: [0, 1, 2, 1], 4: [0, 1, 2, 3], } - SIDES = ('top', 'right', 'bottom', 'left') + SIDES = ("top", "right", "bottom", "left") def _side_expander(prop_fmt): def expand(self, prop, value): @@ -214,34 +219,39 @@ def expand(self, prop, value): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn('Could not expand "{prop}: {val}"' - .format(prop=prop, val=value), CSSWarning) + warnings.warn( + 'Could not expand "{prop}: {val}"'.format(prop=prop, val=value), + CSSWarning, + ) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] return expand - expand_border_color = _side_expander('border-{:s}-color') - expand_border_style = _side_expander('border-{:s}-style') - expand_border_width = _side_expander('border-{:s}-width') - expand_margin = _side_expander('margin-{:s}') - expand_padding = _side_expander('padding-{:s}') + expand_border_color = _side_expander("border-{:s}-color") + expand_border_style = _side_expander("border-{:s}-style") + expand_border_width = _side_expander("border-{:s}-width") + expand_margin = _side_expander("margin-{:s}") + expand_padding = _side_expander("padding-{:s}") def parse(self, declarations_str): """Generates (prop, value) pairs from declarations In a future version may generate parsed tokens from tinycss/tinycss2 """ - for decl in declarations_str.split(';'): + for decl in declarations_str.split(";"): if not decl.strip(): continue - prop, sep, val = decl.partition(':') + prop, sep, val = decl.partition(":") prop = prop.strip().lower() # TODO: don't lowercase case sensitive parts of values (strings) val = val.strip().lower() if sep: yield prop, val else: - warnings.warn('Ill-formatted attribute: expected a colon ' - 'in {decl!r}'.format(decl=decl), CSSWarning) + warnings.warn( + "Ill-formatted attribute: expected a colon " + "in {decl!r}".format(decl=decl), + CSSWarning, + ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e1d95862ec872..d86bf432b83c4 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -13,22 +13,45 @@ from pandas._libs import writers as libwriters from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex) + ABCDatetimeIndex, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, +) from pandas.core.dtypes.missing import notna from pandas.io.common import ( - UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer) + UnicodeWriter, + _get_handle, + _infer_compression, + get_filepath_or_buffer, +) class CSVFormatter: - - def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', - float_format=None, cols=None, header=True, index=True, - index_label=None, 
mode='w', encoding=None, - compression='infer', quoting=None, line_terminator='\n', - chunksize=None, quotechar='"', - date_format=None, doublequote=True, escapechar=None, - decimal='.'): + def __init__( + self, + obj, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + cols=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + line_terminator="\n", + chunksize=None, + quotechar='"', + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + ): self.obj = obj @@ -48,7 +71,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.index_label = index_label self.mode = mode if encoding is None: - encoding = 'utf-8' + encoding = "utf-8" self.encoding = encoding self.compression = _infer_compression(self.path_or_buf, compression) @@ -73,15 +96,18 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', # validate mi options if self.has_mi_columns: if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the " - "columns") + raise TypeError( + "cannot specify cols with a MultiIndex on the " "columns" + ) if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting) + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -90,10 +116,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting) + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) else: cols = list(cols) @@ -110,13 +138,17 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.chunksize = int(chunksize) self.data_index = obj.index - if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and - date_format is not None): + if ( + isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and date_format is not None + ): from pandas import Index - self.data_index = Index([x.strftime(date_format) if notna(x) else - '' for x in self.data_index]) - self.nlevels = getattr(self.data_index, 'nlevels', 1) + self.data_index = Index( + [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + ) + + self.nlevels = getattr(self.data_index, "nlevels", 1) if not index: self.nlevels = 0 @@ -125,15 +157,14 @@ def save(self): Create the writer & save """ # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, 'write'): - msg = ("compression has no effect when passing file-like " - "object as input.") + if self.compression and hasattr(self.path_or_buf, "write"): + msg = "compression has no effect when passing file-like " "object as input." warnings.warn(msg, RuntimeWarning, stacklevel=2) # when zip compression is called. is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, 'write') - and self.compression == 'zip') + not hasattr(self.path_or_buf, "write") and self.compression == "zip" + ) if is_zip: # zipfile doesn't support writing string to archive. 
uses string @@ -141,25 +172,31 @@ def save(self): # file handle. GH21241, GH21118 f = StringIO() close = False - elif hasattr(self.path_or_buf, 'write'): + elif hasattr(self.path_or_buf, "write"): f = self.path_or_buf close = False else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=self.compression, + ) close = True try: - writer_kwargs = dict(lineterminator=self.line_terminator, - delimiter=self.sep, quoting=self.quoting, - doublequote=self.doublequote, - escapechar=self.escapechar, - quotechar=self.quotechar) - if self.encoding == 'ascii': + writer_kwargs = dict( + lineterminator=self.line_terminator, + delimiter=self.sep, + quoting=self.quoting, + doublequote=self.doublequote, + escapechar=self.escapechar, + quotechar=self.quotechar, + ) + if self.encoding == "ascii": self.writer = csvlib.writer(f, **writer_kwargs) else: - writer_kwargs['encoding'] = self.encoding + writer_kwargs["encoding"] = self.encoding self.writer = UnicodeWriter(f, **writer_kwargs) self._save() @@ -168,12 +205,15 @@ def save(self): if is_zip: # GH17778 handles zip compression separately. buf = f.getvalue() - if hasattr(self.path_or_buf, 'write'): + if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + compression=self.compression, + ) f.write(buf) close = True if close: @@ -191,15 +231,17 @@ def _save_header(self): header = self.header encoded_labels = [] - has_aliases = isinstance(header, (tuple, list, np.ndarray, - ABCIndexClass)) + has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) if not (has_aliases or self.header): return if has_aliases: if len(header) != len(cols): - raise ValueError(('Writing {ncols} cols but got {nalias} ' - 'aliases'.format(ncols=len(cols), - nalias=len(header)))) + raise ValueError( + ( + "Writing {ncols} cols but got {nalias} " + "aliases".format(ncols=len(cols), nalias=len(header)) + ) + ) else: write_cols = header else: @@ -213,16 +255,17 @@ def _save_header(self): index_label = [] for i, name in enumerate(obj.index.names): if name is None: - name = '' + name = "" index_label.append(name) else: index_label = obj.index.name if index_label is None: - index_label = [''] + index_label = [""] else: index_label = [index_label] - elif not isinstance(index_label, - (list, tuple, np.ndarray, ABCIndexClass)): + elif not isinstance( + index_label, (list, tuple, np.ndarray, ABCIndexClass) + ): # given a string for a DF with Index index_label = [index_label] @@ -249,7 +292,7 @@ def _save_header(self): col_line.append(columns.names[i]) if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([''] * (len(index_label) - 1)) + col_line.extend([""] * (len(index_label) - 1)) col_line.extend(columns._get_level_values(i)) @@ -258,8 +301,8 @@ def _save_header(self): # Write out the index line if it's not empty. # Otherwise, we will print out an extraneous # blank line between the mi and the data rows. 
- if encoded_labels and set(encoded_labels) != {''}: - encoded_labels.extend([''] * len(columns)) + if encoded_labels and set(encoded_labels) != {""}: + encoded_labels.extend([""] * len(columns)) writer.writerow(encoded_labels) def _save(self): @@ -288,21 +331,26 @@ def _save_chunk(self, start_i, end_i): slicer = slice(start_i, end_i) for i in range(len(self.blocks)): b = self.blocks[i] - d = b.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) + d = b.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list self.data[col_loc] = col - ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting) + ix = data_index.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) - libwriters.write_csv_rows(self.data, ix, self.nlevels, - self.cols, self.writer) + libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 66a00bf9ab054..012d2d9358241 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -21,11 +21,10 @@ class ExcelCell: - __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend') + __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") __slots__ = __fields__ - def __init__(self, row, col, val, style=None, mergestart=None, - mergeend=None): + def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None): self.row = row self.col = col self.val = val @@ -50,6 +49,7 @@ class CSSToExcelConverter: CSS declarations understood to be the containing scope for the CSS processed by :meth:`__call__`. """ + # NB: Most of the methods here could be classmethods, as only __init__ # and __call__ make use of instance attributes. 
We leave them as # instancemethods so that users can easily experiment with extensions @@ -84,11 +84,11 @@ def __call__(self, declarations_str): def build_xlstyle(self, props): out = { - 'alignment': self.build_alignment(props), - 'border': self.build_border(props), - 'fill': self.build_fill(props), - 'font': self.build_font(props), - 'number_format': self.build_number_format(props), + "alignment": self.build_alignment(props), + "border": self.build_border(props), + "fill": self.build_fill(props), + "font": self.build_font(props), + "number_format": self.build_number_format(props), } # TODO: handle cell width and height: needs support in pandas.io.excel @@ -106,33 +106,40 @@ def remove_none(d): return out VERTICAL_MAP = { - 'top': 'top', - 'text-top': 'top', - 'middle': 'center', - 'baseline': 'bottom', - 'bottom': 'bottom', - 'text-bottom': 'bottom', + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", # OpenXML also has 'justify', 'distributed' } def build_alignment(self, props): # TODO: text-indent, padding-left -> alignment.indent - return {'horizontal': props.get('text-align'), - 'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')), - 'wrap_text': (None if props.get('white-space') is None else - props['white-space'] not in - ('nowrap', 'pre', 'pre-line')) - } + return { + "horizontal": props.get("text-align"), + "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), + "wrap_text": ( + None + if props.get("white-space") is None + else props["white-space"] not in ("nowrap", "pre", "pre-line") + ), + } def build_border(self, props): - return {side: { - 'style': self._border_style(props.get('border-{side}-style' - .format(side=side)), - props.get('border-{side}-width' - .format(side=side))), - 'color': self.color_to_excel( - props.get('border-{side}-color'.format(side=side))), - } for side in ['top', 'right', 'bottom', 'left']} + return { + side: { + "style": self._border_style( + props.get("border-{side}-style".format(side=side)), + props.get("border-{side}-width".format(side=side)), + ), + "color": self.color_to_excel( + props.get("border-{side}-color".format(side=side)) + ), + } + for side in ["top", "right", "bottom", "left"] + } def _border_style(self, style, width): # convert styles and widths to openxml, one of: @@ -151,61 +158,70 @@ def _border_style(self, style, width): # 'thin' if width is None and style is None: return None - if style == 'none' or style == 'hidden': + if style == "none" or style == "hidden": return None if width is None: - width = '2pt' + width = "2pt" width = float(width[:-2]) if width < 1e-5: return None elif width < 1.3: - width_name = 'thin' + width_name = "thin" elif width < 2.8: - width_name = 'medium' + width_name = "medium" else: - width_name = 'thick' + width_name = "thick" - if style in (None, 'groove', 'ridge', 'inset', 'outset'): + if style in (None, "groove", "ridge", "inset", "outset"): # not handled - style = 'solid' + style = "solid" - if style == 'double': - return 'double' - if style == 'solid': + if style == "double": + return "double" + if style == "solid": return width_name - if style == 'dotted': - if width_name in ('hair', 'thin'): - return 'dotted' - return 'mediumDashDotDot' - if style == 'dashed': - if width_name in ('hair', 'thin'): - return 'dashed' - return 'mediumDashed' + if style == "dotted": + if width_name in ("hair", "thin"): + return "dotted" + return "mediumDashDotDot" + if style == "dashed": + if width_name in ("hair", "thin"): 
+ return "dashed" + return "mediumDashed" def build_fill(self, props): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type - fill_color = props.get('background-color') - if fill_color not in (None, 'transparent', 'none'): - return { - 'fgColor': self.color_to_excel(fill_color), - 'patternType': 'solid', - } - - BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True, - '800': True, '900': True, - 'normal': False, 'lighter': False, '100': False, '200': False, - '300': False, '400': False, '500': False} - ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True} + fill_color = props.get("background-color") + if fill_color not in (None, "transparent", "none"): + return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} def build_font(self, props): - size = props.get('font-size') + size = props.get("font-size") if size is not None: - assert size.endswith('pt') + assert size.endswith("pt") size = float(size[:-2]) - font_names_tmp = re.findall(r'''(?x) + font_names_tmp = re.findall( + r"""(?x) ( "(?:[^"]|\\")+" | @@ -213,13 +229,15 @@ def build_font(self, props): | [^'",]+ )(?=,|\s*$) - ''', props.get('font-family', '')) + """, + props.get("font-family", ""), + ) font_names = [] for name in font_names_tmp: if name[:1] == '"': name = name[1:-1].replace('\\"', '"') - elif name[:1] == '\'': - name = name[1:-1].replace('\\\'', '\'') + elif name[:1] == "'": + name = name[1:-1].replace("\\'", "'") else: name = name.strip() if name: @@ -227,40 +245,40 @@ def build_font(self, props): family = None for name in font_names: - if name == 'serif': + if name == "serif": family = 1 # roman break - elif name == 'sans-serif': + elif name == "sans-serif": family = 2 # swiss break - elif name == 'cursive': + elif name == "cursive": family = 4 # script break - elif name == 'fantasy': + elif name == "fantasy": family = 5 # decorative break - decoration = props.get('text-decoration') + decoration = props.get("text-decoration") if decoration is not None: decoration = decoration.split() else: decoration = () return { - 'name': font_names[0] if font_names else None, - 'family': family, - 'size': size, - 'bold': self.BOLD_MAP.get(props.get('font-weight')), - 'italic': self.ITALIC_MAP.get(props.get('font-style')), - 'underline': ('single' if - 'underline' in decoration - else None), - 'strike': ('line-through' in decoration) or None, - 'color': self.color_to_excel(props.get('color')), + "name": font_names[0] if font_names else None, + "family": family, + "size": size, + "bold": self.BOLD_MAP.get(props.get("font-weight")), + "italic": self.ITALIC_MAP.get(props.get("font-style")), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), # shadow if nonzero digit before shadow color - 'shadow': (bool(re.search('^[^#(]*[1-9]', - props['text-shadow'])) - if 'text-shadow' in props else None), + "shadow": ( + bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + if "text-shadow" in props + else None + ), # 'vertAlign':, # 'charset': , # 'scheme': , @@ -269,43 +287,42 @@ def build_font(self, props): } NAMED_COLORS = { - 'maroon': '800000', - 
'brown': 'A52A2A', - 'red': 'FF0000', - 'pink': 'FFC0CB', - 'orange': 'FFA500', - 'yellow': 'FFFF00', - 'olive': '808000', - 'green': '008000', - 'purple': '800080', - 'fuchsia': 'FF00FF', - 'lime': '00FF00', - 'teal': '008080', - 'aqua': '00FFFF', - 'blue': '0000FF', - 'navy': '000080', - 'black': '000000', - 'gray': '808080', - 'grey': '808080', - 'silver': 'C0C0C0', - 'white': 'FFFFFF', + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", } def color_to_excel(self, val): if val is None: return None - if val.startswith('#') and len(val) == 7: + if val.startswith("#") and len(val) == 7: return val[1:].upper() - if val.startswith('#') and len(val) == 4: + if val.startswith("#") and len(val) == 4: return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() try: return self.NAMED_COLORS[val] except KeyError: - warnings.warn('Unhandled color format: {val!r}'.format(val=val), - CSSWarning) + warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning) def build_number_format(self, props): - return {'format_code': props.get('number-format')} + return {"format_code": props.get("number-format")} class ExcelFormatter: @@ -341,15 +358,25 @@ class ExcelFormatter: This is only called for body cells. """ - max_rows = 2**20 - max_cols = 2**14 - - def __init__(self, df, na_rep='', float_format=None, cols=None, - header=True, index=True, index_label=None, merge_cells=False, - inf_rep='inf', style_converter=None): + max_rows = 2 ** 20 + max_cols = 2 ** 14 + + def __init__( + self, + df, + na_rep="", + float_format=None, + cols=None, + header=True, + index=True, + index_label=None, + merge_cells=False, + inf_rep="inf", + style_converter=None, + ): self.rowcounter = 0 self.na_rep = na_rep - if hasattr(df, 'render'): + if hasattr(df, "render"): self.styler = df df = df.data if style_converter is None: @@ -362,8 +389,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, # all missing, raise if not len(Index(cols) & df.columns): - raise KeyError( - "passes columns are not ALL present dataframe") + raise KeyError("passes columns are not ALL present dataframe") # deprecatedin gh-17295 # 1 missing is ok (for now) @@ -371,7 +397,8 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, warnings.warn( "Not all names specified in 'columns' are found; " "this will raise a KeyError in the future", - FutureWarning) + FutureWarning, + ) self.df = df.reindex(columns=cols) self.columns = self.df.columns @@ -384,13 +411,16 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, @property def header_style(self): - return {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", - "vertical": "top"}} + return { + "font": {"bold": True}, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } def _format_value(self, val): if is_scalar(val) and missing.isna(val): @@ -399,30 +429,35 @@ def _format_value(self, val): if missing.isposinf_scalar(val): val = self.inf_rep elif missing.isneginf_scalar(val): - val = 
'-{inf}'.format(inf=self.inf_rep) + val = "-{inf}".format(inf=self.inf_rep) elif self.float_format is not None: val = float(self.float_format % val) - if getattr(val, 'tzinfo', None) is not None: - raise ValueError('Excel does not support datetimes with ' - 'timezones. Please ensure that datetimes ' - 'are timezone unaware before writing to Excel.') + if getattr(val, "tzinfo", None) is not None: + raise ValueError( + "Excel does not support datetimes with " + "timezones. Please ensure that datetimes " + "are timezone unaware before writing to Excel." + ) return val def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: - raise NotImplementedError("Writing to Excel with MultiIndex" - " columns and no index " - "('index'=False) is not yet " - "implemented.") + raise NotImplementedError( + "Writing to Excel with MultiIndex" + " columns and no index " + "('index'=False) is not yet " + "implemented." + ) has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) if not (has_aliases or self.header): return columns = self.columns - level_strs = columns.format(sparsify=self.merge_cells, adjoin=False, - names=False) + level_strs = columns.format( + sparsify=self.merge_cells, adjoin=False, names=False + ) level_lengths = get_level_lengths(level_strs) coloffset = 0 lnum = 0 @@ -436,17 +471,24 @@ def _format_header_mi(self): name = columns.names[lnum] yield ExcelCell(lnum, coloffset, name, self.header_style) - for lnum, (spans, levels, level_codes) in enumerate(zip( - level_lengths, columns.levels, columns.codes)): + for lnum, (spans, levels, level_codes) in enumerate( + zip(level_lengths, columns.levels, columns.codes) + ): values = levels.take(level_codes) for i in spans: if spans[i] > 1: - yield ExcelCell(lnum, coloffset + i + 1, values[i], - self.header_style, lnum, - coloffset + i + spans[i]) + yield ExcelCell( + lnum, + coloffset + i + 1, + values[i], + self.header_style, + lnum, + coloffset + i + spans[i], + ) else: - yield ExcelCell(lnum, coloffset + i + 1, values[i], - self.header_style) + yield ExcelCell( + lnum, coloffset + i + 1, values[i], self.header_style + ) else: # Format in legacy format with dots to indicate levels. 
for i, values in enumerate(zip(*level_strs)): @@ -468,15 +510,17 @@ def _format_header_regular(self): colnames = self.columns if has_aliases: if len(self.header) != len(self.columns): - raise ValueError('Writing {cols} cols but got {alias} ' - 'aliases'.format(cols=len(self.columns), - alias=len(self.header))) + raise ValueError( + "Writing {cols} cols but got {alias} " + "aliases".format(cols=len(self.columns), alias=len(self.header)) + ) else: colnames = self.header for colindex, colname in enumerate(colnames): - yield ExcelCell(self.rowcounter, colindex + coloffset, colname, - self.header_style) + yield ExcelCell( + self.rowcounter, colindex + coloffset, colname, self.header_style + ) def _format_header(self): if isinstance(self.columns, ABCMultiIndex): @@ -486,12 +530,14 @@ def _format_header(self): gen2 = () if self.df.index.names: - row = [x if x is not None else '' - for x in self.df.index.names] + [''] * len(self.columns) - if reduce(lambda x, y: x and y, map(lambda x: x != '', row)): - gen2 = (ExcelCell(self.rowcounter, colindex, val, - self.header_style) - for colindex, val in enumerate(row)) + row = [x if x is not None else "" for x in self.df.index.names] + [ + "" + ] * len(self.columns) + if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): + gen2 = ( + ExcelCell(self.rowcounter, colindex, val, self.header_style) + for colindex, val in enumerate(row) + ) self.rowcounter += 1 return itertools.chain(gen, gen2) @@ -511,9 +557,9 @@ def _format_regular_rows(self): if self.index: # check aliases # if list only take first as this is not a MultiIndex - if (self.index_label and - isinstance(self.index_label, (list, tuple, np.ndarray, - Index))): + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): index_label = self.index_label[0] # if string good to go elif self.index_label and isinstance(self.index_label, str): @@ -525,8 +571,7 @@ def _format_regular_rows(self): self.rowcounter += 1 if index_label and self.header is not False: - yield ExcelCell(self.rowcounter - 1, 0, index_label, - self.header_style) + yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style) # write index_values index_values = self.df.index @@ -534,8 +579,7 @@ def _format_regular_rows(self): index_values = self.df.index.to_timestamp() for idx, idxval in enumerate(index_values): - yield ExcelCell(self.rowcounter + idx, 0, idxval, - self.header_style) + yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style) coloffset = 1 else: @@ -554,9 +598,9 @@ def _format_hierarchical_rows(self): if self.index: index_labels = self.df.index.names # check for aliases - if (self.index_label and - isinstance(self.index_label, (list, tuple, np.ndarray, - Index))): + if self.index_label and isinstance( + self.index_label, (list, tuple, np.ndarray, Index) + ): index_labels = self.index_label # MultiIndex columns require an extra row @@ -570,40 +614,52 @@ def _format_hierarchical_rows(self): if com._any_not_none(*index_labels) and self.header is not False: for cidx, name in enumerate(index_labels): - yield ExcelCell(self.rowcounter - 1, cidx, name, - self.header_style) + yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style) if self.merge_cells: # Format hierarchical rows as merged cells. 
- level_strs = self.df.index.format(sparsify=True, adjoin=False, - names=False) + level_strs = self.df.index.format( + sparsify=True, adjoin=False, names=False + ) level_lengths = get_level_lengths(level_strs) - for spans, levels, level_codes in zip(level_lengths, - self.df.index.levels, - self.df.index.codes): + for spans, levels, level_codes in zip( + level_lengths, self.df.index.levels, self.df.index.codes + ): - values = levels.take(level_codes, - allow_fill=levels._can_hold_na, - fill_value=True) + values = levels.take( + level_codes, allow_fill=levels._can_hold_na, fill_value=True + ) for i in spans: if spans[i] > 1: - yield ExcelCell(self.rowcounter + i, gcolidx, - values[i], self.header_style, - self.rowcounter + i + spans[i] - 1, - gcolidx) + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx, + ) else: - yield ExcelCell(self.rowcounter + i, gcolidx, - values[i], self.header_style) + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + ) gcolidx += 1 else: # Format hierarchical rows with non-merged values. for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): - yield ExcelCell(self.rowcounter + idx, gcolidx, - indexcolval, self.header_style) + yield ExcelCell( + self.rowcounter + idx, + gcolidx, + indexcolval, + self.header_style, + ) gcolidx += 1 for cell in self._generate_body(gcolidx): @@ -623,18 +679,23 @@ def _generate_body(self, coloffset): series = self.df.iloc[:, colidx] for i, val in enumerate(series): if styles is not None: - xlstyle = self.style_converter(';'.join(styles[i, colidx])) - yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, - xlstyle) + xlstyle = self.style_converter(";".join(styles[i, colidx])) + yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) def get_formatted_cells(self): - for cell in itertools.chain(self._format_header(), - self._format_body()): + for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell - def write(self, writer, sheet_name='Sheet1', startrow=0, - startcol=0, freeze_panes=None, engine=None): + def write( + self, + writer, + sheet_name="Sheet1", + startrow=0, + startcol=0, + freeze_panes=None, + engine=None, + ): """ writer : string or ExcelWriter object File path or existing ExcelWriter @@ -657,10 +718,11 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, num_rows, num_cols = self.df.shape if num_rows > self.max_rows or num_cols > self.max_cols: - raise ValueError("This sheet is too large! Your sheet size is: " + - "{}, {} ".format(num_rows, num_cols) + - "Max sheet size is: {}, {}". - format(self.max_rows, self.max_cols)) + raise ValueError( + "This sheet is too large! 
Your sheet size is: " + + "{}, {} ".format(num_rows, num_cols) + + "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols) + ) if isinstance(writer, ExcelWriter): need_save = False @@ -669,8 +731,12 @@ def write(self, writer, sheet_name='Sheet1', startrow=0, need_save = True formatted_cells = self.get_formatted_cells() - writer.write_cells(formatted_cells, sheet_name, - startrow=startrow, startcol=startcol, - freeze_panes=freeze_panes) + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) if need_save: writer.save() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 98c31fbeb78e6..c4e3dd1c755cf 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -17,12 +17,26 @@ from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas.core.dtypes.common import ( - is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar, - is_timedelta64_dtype) + is_categorical_dtype, + is_complex_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_numeric_dtype, + is_scalar, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray) + ABCIndexClass, + ABCMultiIndex, + ABCSeries, + ABCSparseArray, +) from pandas.core.dtypes.missing import isna, notna from pandas.core.base import PandasObject @@ -92,9 +106,19 @@ .. versionadded:: 0.18.0 """ -_VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify", - "justify-all", "start", "end", "inherit", - "match-parent", "initial", "unset") +_VALID_JUSTIFY_PARAMETERS = ( + "left", + "right", + "center", + "justify", + "justify-all", + "start", + "end", + "inherit", + "match-parent", + "initial", + "unset", +) return_docstring = """ Returns @@ -105,9 +129,7 @@ class CategoricalFormatter: - - def __init__(self, categorical, buf=None, length=True, na_rep='NaN', - footer=True): + def __init__(self, categorical, buf=None, length=True, na_rep="NaN", footer=True): self.categorical = categorical self.buf = buf if buf is not None else StringIO("") self.na_rep = na_rep @@ -115,25 +137,29 @@ def __init__(self, categorical, buf=None, length=True, na_rep='NaN', self.footer = footer def _get_footer(self): - footer = '' + footer = "" if self.length: if footer: - footer += ', ' + footer += ", " footer += "Length: {length}".format(length=len(self.categorical)) level_info = self.categorical._repr_categories_info() # Levels are added in a newline if footer: - footer += '\n' + footer += "\n" footer += level_info return str(footer) def _get_formatted_values(self): - return format_array(self.categorical._internal_get_values(), None, - float_format=None, na_rep=self.na_rep) + return format_array( + self.categorical._internal_get_values(), + None, + float_format=None, + na_rep=self.na_rep, + ) def to_string(self): categorical = self.categorical @@ -142,27 +168,37 @@ def to_string(self): if self.footer: return self._get_footer() else: - return '' + return "" fmt_values = self._get_formatted_values() - result = ['{i}'.format(i=i) for i in fmt_values] + result = ["{i}".format(i=i) for i in fmt_values] result = [i.strip() for i in result] - result = ', '.join(result) - result = ['[' + result + ']'] + result = ", 
".join(result) + result = ["[" + result + "]"] if self.footer: footer = self._get_footer() if footer: result.append(footer) - return str('\n'.join(result)) + return str("\n".join(result)) class SeriesFormatter: - - def __init__(self, series, buf=None, length=True, header=True, index=True, - na_rep='NaN', name=False, float_format=None, dtype=True, - max_rows=None, min_rows=None): + def __init__( + self, + series, + buf=None, + length=True, + header=True, + index=True, + na_rep="NaN", + name=False, + float_format=None, + dtype=True, + max_rows=None, + min_rows=None, + ): self.series = series self.buf = buf if buf is not None else StringIO() self.name = name @@ -183,6 +219,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True, def _chk_truncate(self): from pandas.core.reshape.concat import concat + min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows @@ -199,8 +236,7 @@ def _chk_truncate(self): series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], - series.iloc[-row_num:])) + series = concat((series.iloc[:row_num], series.iloc[-row_num:])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -209,32 +245,31 @@ def _chk_truncate(self): def _get_footer(self): name = self.series.name - footer = '' + footer = "" - if getattr(self.series.index, 'freq', None) is not None: - footer += 'Freq: {freq}'.format(freq=self.series.index.freqstr) + if getattr(self.series.index, "freq", None) is not None: + footer += "Freq: {freq}".format(freq=self.series.index.freqstr) if self.name is not False and name is not None: if footer: - footer += ', ' + footer += ", " - series_name = pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - footer += (("Name: {sname}".format(sname=series_name)) - if name is not None else "") + series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n")) + footer += ( + ("Name: {sname}".format(sname=series_name)) if name is not None else "" + ) - if (self.length is True or - (self.length == 'truncate' and self.truncate_v)): + if self.length is True or (self.length == "truncate" and self.truncate_v): if footer: - footer += ', ' - footer += 'Length: {length}'.format(length=len(self.series)) + footer += ", " + footer += "Length: {length}".format(length=len(self.series)) if self.dtype is not False and self.dtype is not None: - name = getattr(self.tr_series.dtype, 'name', None) + name = getattr(self.tr_series.dtype, "name", None) if name: if footer: - footer += ', ' - footer += 'dtype: {typ}'.format(typ=pprint_thing(name)) + footer += ", " + footer += "dtype: {typ}".format(typ=pprint_thing(name)) # level infos are added to the end and in a new line, like it is done # for Categoricals @@ -260,8 +295,9 @@ def _get_formatted_index(self): def _get_formatted_values(self): values_to_format = self.tr_series._formatting_values() - return format_array(values_to_format, None, - float_format=self.float_format, na_rep=self.na_rep) + return format_array( + values_to_format, None, float_format=self.float_format, na_rep=self.na_rep + ) def to_string(self): series = self.tr_series @@ -269,7 +305,8 @@ def to_string(self): if len(series) == 0: return "{name}([], {footer})".format( - name=self.series.__class__.__name__, footer=footer) + name=self.series.__class__.__name__, footer=footer + ) fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() @@ -279,14 +316,14 @@ def to_string(self): row_num = self.tr_row_num width = 
self.adj.len(fmt_values[row_num - 1]) if width > 3: - dot_str = '...' + dot_str = "..." else: - dot_str = '..' + dot_str = ".." # Series uses mode=center because it has single value columns # DataFrame uses mode=left - dot_str = self.adj.justify([dot_str], width, mode='center')[0] + dot_str = self.adj.justify([dot_str], width, mode="center")[0] fmt_values.insert(row_num + n_header_rows, dot_str) - fmt_index.insert(row_num + 1, '') + fmt_index.insert(row_num + 1, "") if self.index: result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values]) @@ -294,32 +331,29 @@ def to_string(self): result = self.adj.adjoin(3, fmt_values) if self.header and have_header: - result = fmt_index[0] + '\n' + result + result = fmt_index[0] + "\n" + result if footer: - result += '\n' + footer + result += "\n" + footer - return str(''.join(result)) + return str("".join(result)) class TextAdjustment: - def __init__(self): self.encoding = get_option("display.encoding") def len(self, text): return len(text) - def justify(self, texts, max_len, mode='right'): + def justify(self, texts, max_len, mode="right"): return justify(texts, max_len, mode=mode) def adjoin(self, space, *lists, **kwargs): - return adjoin(space, *lists, strlen=self.len, - justfunc=self.justify, **kwargs) + return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs) class EastAsianTextAdjustment(TextAdjustment): - def __init__(self): super().__init__() if get_option("display.unicode.ambiguous_as_wide"): @@ -330,7 +364,7 @@ def __init__(self): # Definition of East Asian Width # http://unicode.org/reports/tr11/ # Ambiguous width can be changed by option - self._EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1} + self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1} def len(self, text): """ @@ -339,17 +373,18 @@ def len(self, text): if not isinstance(text, str): return len(text) - return sum(self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) - for c in text) + return sum( + self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text + ) - def justify(self, texts, max_len, mode='right'): + def justify(self, texts, max_len, mode="right"): # re-calculate padding space per str considering East Asian Width def _get_pad(t): return max_len - self.len(t) + len(t) - if mode == 'left': + if mode == "left": return [x.ljust(_get_pad(x)) for x in texts] - elif mode == 'center': + elif mode == "center": return [x.center(_get_pad(x)) for x in texts] else: return [x.rjust(_get_pad(x)) for x in texts] @@ -370,8 +405,9 @@ class TableFormatter: @property def should_show_dimensions(self): - return (self.show_dimensions is True or - (self.show_dimensions == 'truncate' and self.is_truncated)) + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): @@ -395,15 +431,33 @@ class DataFrameFormatter(TableFormatter): """ - __doc__ = __doc__ if __doc__ else '' + __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring - def __init__(self, frame, buf=None, columns=None, col_space=None, - header=True, index=True, na_rep='NaN', formatters=None, - justify=None, float_format=None, sparsify=None, - index_names=True, line_width=None, max_rows=None, - min_rows=None, max_cols=None, show_dimensions=False, - decimal='.', table_id=None, render_links=False, **kwds): + def __init__( + self, + frame, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + 
formatters=None, + justify=None, + float_format=None, + sparsify=None, + index_names=True, + line_width=None, + max_rows=None, + min_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + table_id=None, + render_links=False, + **kwds + ): self.frame = frame if buf is not None: self.buf = _expand_user(_stringify_path(buf)) @@ -427,8 +481,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols - self.max_rows_displayed = min(max_rows or len(self.frame), - len(self.frame)) + self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) self.show_dimensions = show_dimensions self.table_id = table_id self.render_links = render_links @@ -469,8 +522,7 @@ def _chk_truncate(self): prompt_row = 1 if self.show_dimensions: show_dimension_rows = 3 - n_add_rows = (self.header + dot_row + show_dimension_rows + - prompt_row) + n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data max_rows_adj = self.h - n_add_rows self.max_rows_adj = max_rows_adj @@ -482,13 +534,13 @@ def _chk_truncate(self): if max_rows == 0 and len(self.frame) > h: max_rows = h - if not hasattr(self, 'max_rows_adj'): + if not hasattr(self, "max_rows_adj"): if max_rows: if (len(self.frame) > max_rows) and self.min_rows: # if truncated, set max_rows showed to min_rows max_rows = min(self.min_rows, max_rows) self.max_rows_adj = max_rows - if not hasattr(self, 'max_cols_adj'): + if not hasattr(self, "max_cols_adj"): self.max_cols_adj = max_cols max_cols_adj = self.max_cols_adj @@ -505,9 +557,10 @@ def _chk_truncate(self): frame = frame.iloc[:, :max_cols] col_num = max_cols else: - col_num = (max_cols_adj // 2) - frame = concat((frame.iloc[:, :col_num], - frame.iloc[:, -col_num:]), axis=1) + col_num = max_cols_adj // 2 + frame = concat( + (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 + ) self.tr_col_num = col_num if truncate_v: if max_rows_adj == 1: @@ -515,8 +568,7 @@ def _chk_truncate(self): frame = frame.iloc[:max_rows, :] else: row_num = max_rows_adj // 2 - frame = concat((frame.iloc[:row_num, :], - frame.iloc[-row_num:, :])) + frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) self.tr_row_num = row_num else: self.tr_row_num = None @@ -539,37 +591,44 @@ def _to_str_columns(self): stringified = [] for i, c in enumerate(frame): fmt_values = self._format_col(i) - fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=(self.col_space or 0), - adj=self.adj) + fmt_values = _make_fixed_width( + fmt_values, + self.justify, + minimum=(self.col_space or 0), + adj=self.adj, + ) stringified.append(fmt_values) else: if is_list_like(self.header): if len(self.header) != len(self.columns): - raise ValueError(('Writing {ncols} cols but got {nalias} ' - 'aliases' - .format(ncols=len(self.columns), - nalias=len(self.header)))) + raise ValueError( + ( + "Writing {ncols} cols but got {nalias} " + "aliases".format( + ncols=len(self.columns), nalias=len(self.header) + ) + ) + ) str_columns = [[label] for label in self.header] else: str_columns = self._get_formatted_column_labels(frame) if self.show_row_idx_names: for x in str_columns: - x.append('') + x.append("") stringified = [] for i, c in enumerate(frame): cheader = str_columns[i] - header_colwidth = max(self.col_space or 0, - *(self.adj.len(x) for x in cheader)) + header_colwidth = max( + self.col_space or 0, *(self.adj.len(x) for x in cheader) + ) fmt_values = self._format_col(i) - 
fmt_values = _make_fixed_width(fmt_values, self.justify, - minimum=header_colwidth, - adj=self.adj) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) - max_len = max(max(self.adj.len(x) for x in fmt_values), - header_colwidth) + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) cheader = self.adj.justify(cheader, max_len, mode=self.justify) stringified.append(cheader + fmt_values) @@ -583,7 +642,7 @@ def _to_str_columns(self): if truncate_h: col_num = self.tr_col_num - strcols.insert(self.tr_col_num + 1, [' ...'] * (len(str_index))) + strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) if truncate_v: n_header_rows = len(str_index) - len(frame) row_num = self.tr_row_num @@ -594,17 +653,17 @@ def _to_str_columns(self): if truncate_h: is_dot_col = ix == col_num + 1 if cwidth > 3 or is_dot_col: - my_str = '...' + my_str = "..." else: - my_str = '..' + my_str = ".." if ix == 0: - dot_mode = 'left' + dot_mode = "left" elif is_dot_col: cwidth = 4 - dot_mode = 'right' + dot_mode = "right" else: - dot_mode = 'right' + dot_mode = "right" dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] strcols[ix].insert(row_num + n_header_rows, dot_str) return strcols @@ -618,10 +677,11 @@ def to_string(self): frame = self.frame if len(frame.columns) == 0 or len(frame.index) == 0: - info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}' - .format(name=type(self.frame).__name__, - col=pprint_thing(frame.columns), - idx=pprint_thing(frame.index))) + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=pprint_thing(frame.columns), + idx=pprint_thing(frame.index), + ) text = info_line else: @@ -629,27 +689,27 @@ def to_string(self): if self.line_width is None: # no need to wrap around just print # the whole frame text = self.adj.adjoin(1, *strcols) - elif (not isinstance(self.max_cols, int) or - self.max_cols > 0): # need to wrap around + elif ( + not isinstance(self.max_cols, int) or self.max_cols > 0 + ): # need to wrap around text = self._join_multiline(*strcols) else: # max_cols == 0. 
Try to fit frame to terminal - text = self.adj.adjoin(1, *strcols).split('\n') + text = self.adj.adjoin(1, *strcols).split("\n") max_len = Series(text).str.len().max() # plus truncate dot col dif = max_len - self.w # '+ 1' to avoid too wide repr (GH PR #17023) adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() - for ele in strcols]) + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) n_cols = len(col_lens) counter = 0 while adj_dif > 0 and n_cols > 1: counter += 1 - mid = int(round(n_cols / 2.)) + mid = int(round(n_cols / 2.0)) mid_ix = col_lens.index[mid] col_len = col_lens[mid_ix] # adjoin adds one - adj_dif -= (col_len + 1) + adj_dif -= col_len + 1 col_lens = col_lens.drop(mid_ix) n_cols = len(col_lens) # subtract index column @@ -666,8 +726,11 @@ def to_string(self): self.buf.writelines(text) if self.should_show_dimensions: - self.buf.write("\n\n[{nrows} rows x {ncols} columns]" - .format(nrows=len(frame), ncols=len(frame.columns))) + self.buf.write( + "\n\n[{nrows} rows x {ncols} columns]".format( + nrows=len(frame), ncols=len(frame.columns) + ) + ) def _join_multiline(self, *strcols): lwidth = self.line_width @@ -675,11 +738,12 @@ def _join_multiline(self, *strcols): strcols = list(strcols) if self.index: idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) - for x in idx]).max() + adjoin_width + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - col_widths = [np.array([self.adj.len(x) for x in col]).max() if - len(col) > 0 else 0 for col in strcols] + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) @@ -696,46 +760,62 @@ def _join_multiline(self, *strcols): row.insert(0, idx) if nbins > 1: if ed <= len(strcols) and i < nbins - 1: - row.append([' \\'] + [' '] * (nrows - 1)) + row.append([" \\"] + [" "] * (nrows - 1)) else: - row.append([' '] * nrows) + row.append([" "] * nrows) str_lst.append(self.adj.adjoin(adjoin_width, *row)) st = ed - return '\n\n'.join(str_lst) - - def to_latex(self, column_format=None, longtable=False, encoding=None, - multicolumn=False, multicolumn_format=None, multirow=False): + return "\n\n".join(str_lst) + + def to_latex( + self, + column_format=None, + longtable=False, + encoding=None, + multicolumn=False, + multicolumn_format=None, + multirow=False, + ): """ Render a DataFrame to a LaTeX tabular/longtable environment output. 
""" from pandas.io.formats.latex import LatexFormatter - latex_renderer = LatexFormatter(self, column_format=column_format, - longtable=longtable, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) + + latex_renderer = LatexFormatter( + self, + column_format=column_format, + longtable=longtable, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + ) if encoding is None: - encoding = 'utf-8' + encoding = "utf-8" - if hasattr(self.buf, 'write'): + if hasattr(self.buf, "write"): latex_renderer.write_result(self.buf) elif isinstance(self.buf, str): import codecs - with codecs.open(self.buf, 'w', encoding=encoding) as f: + + with codecs.open(self.buf, "w", encoding=encoding) as f: latex_renderer.write_result(f) else: - raise TypeError('buf is not a file name and it has no write ' - 'method') + raise TypeError("buf is not a file name and it has no write " "method") def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) values_to_format = frame.iloc[:, i]._formatting_values() - return format_array(values_to_format, formatter, - float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, decimal=self.decimal) + return format_array( + values_to_format, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space, + decimal=self.decimal, + ) def to_html(self, classes=None, notebook=False, border=None): """ @@ -755,16 +835,16 @@ def to_html(self, classes=None, notebook=False, border=None): .. versionadded:: 0.19.0 """ from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + Klass = NotebookFormatter if notebook else HTMLFormatter html = Klass(self, classes=classes, border=border).render() - if hasattr(self.buf, 'write'): + if hasattr(self.buf, "write"): buffer_put_lines(self.buf, html) elif isinstance(self.buf, str): - with open(self.buf, 'w') as f: + with open(self.buf, "w") as f: buffer_put_lines(f, html) else: - raise TypeError('buf is not a file name and it has no write ' - ' method') + raise TypeError("buf is not a file name and it has no write " " method") def _get_formatted_column_labels(self, frame): from pandas.core.index import _sparsify @@ -781,13 +861,17 @@ def _get_formatted_column_labels(self, frame): need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) def space_format(x, y): - if (y not in self.formatters and - need_leadsp[x] and not restrict_formatting): - return ' ' + y + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y return y - str_columns = list(zip(*[[space_format(x, y) for y in x] - for x in fmt_columns])) + str_columns = list( + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) + ) if self.sparsify and len(str_columns): str_columns = _sparsify(str_columns) @@ -796,10 +880,10 @@ def space_format(x, y): fmt_columns = columns.format() dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [[' ' + x if not self._get_formatter(i) and - need_leadsp[x] else x] - for i, (col, x) in enumerate(zip(columns, - fmt_columns))] + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, (col, x) in enumerate(zip(columns, fmt_columns)) + ] # self.str_columns = str_columns return str_columns @@ -813,43 +897,45 @@ def has_column_names(self): @property def show_row_idx_names(self): - return all((self.has_index_names, - self.index, - self.show_index_names)) + 
return all((self.has_index_names, self.index, self.show_index_names)) @property def show_col_idx_names(self): - return all((self.has_column_names, - self.show_index_names, - self.header)) + return all((self.has_column_names, self.show_index_names, self.header)) def _get_formatted_index(self, frame): # Note: this is only used by to_string() and to_latex(), not by # to_html(). index = frame.index columns = frame.columns - fmt = self._get_formatter('__index__') + fmt = self._get_formatter("__index__") if isinstance(index, ABCMultiIndex): fmt_index = index.format( - sparsify=self.sparsify, adjoin=False, - names=self.show_row_idx_names, formatter=fmt) + sparsify=self.sparsify, + adjoin=False, + names=self.show_row_idx_names, + formatter=fmt, + ) else: - fmt_index = [index.format( - name=self.show_row_idx_names, formatter=fmt)] + fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)] - fmt_index = [tuple(_make_fixed_width(list(x), justify='left', - minimum=(self.col_space or 0), - adj=self.adj)) for x in fmt_index] + fmt_index = [ + tuple( + _make_fixed_width( + list(x), justify="left", minimum=(self.col_space or 0), adj=self.adj + ) + ) + for x in fmt_index + ] - adjoined = self.adj.adjoin(1, *fmt_index).split('\n') + adjoined = self.adj.adjoin(1, *fmt_index).split("\n") # empty space for columns if self.show_col_idx_names: - col_header = ['{x}'.format(x=x) - for x in self._get_column_name_list()] + col_header = ["{x}".format(x=x) for x in self._get_column_name_list()] else: - col_header = [''] * columns.nlevels + col_header = [""] * columns.nlevels if self.header: return col_header + adjoined @@ -860,19 +946,27 @@ def _get_column_name_list(self): names = [] columns = self.frame.columns if isinstance(columns, ABCMultiIndex): - names.extend('' if name is None else name - for name in columns.names) + names.extend("" if name is None else name for name in columns.names) else: - names.append('' if columns.name is None else columns.name) + names.append("" if columns.name is None else columns.name) return names + # ---------------------------------------------------------------------- # Array formatters -def format_array(values, formatter, float_format=None, na_rep='NaN', - digits=None, space=None, justify='right', decimal='.', - leading_space=None): +def format_array( + values, + formatter, + float_format=None, + na_rep="NaN", + digits=None, + space=None, + justify="right", + decimal=".", + leading_space=None, +): """ Format an array for printing. 
@@ -924,19 +1018,36 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', if digits is None: digits = get_option("display.precision") - fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, - float_format=float_format, formatter=formatter, - space=space, justify=justify, decimal=decimal, - leading_space=leading_space) + fmt_obj = fmt_klass( + values, + digits=digits, + na_rep=na_rep, + float_format=float_format, + formatter=formatter, + space=space, + justify=justify, + decimal=decimal, + leading_space=leading_space, + ) return fmt_obj.get_result() class GenericArrayFormatter: - - def __init__(self, values, digits=7, formatter=None, na_rep='NaN', - space=12, float_format=None, justify='right', decimal='.', - quoting=None, fixed_width=True, leading_space=None): + def __init__( + self, + values, + digits=7, + formatter=None, + na_rep="NaN", + space=12, + float_format=None, + justify="right", + decimal=".", + quoting=None, + fixed_width=True, + leading_space=None, + ): self.values = values self.digits = digits self.na_rep = na_rep @@ -957,15 +1068,18 @@ def _format_strings(self): if self.float_format is None: float_format = get_option("display.float_format") if float_format is None: - fmt_str = ('{{x: .{prec:d}g}}' - .format(prec=get_option("display.precision"))) + fmt_str = "{{x: .{prec:d}g}}".format( + prec=get_option("display.precision") + ) float_format = lambda x: fmt_str.format(x=x) else: float_format = self.float_format formatter = ( - self.formatter if self.formatter is not None else - (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')))) + self.formatter + if self.formatter is not None + else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n"))) + ) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): @@ -973,18 +1087,18 @@ def _format(x): # try block for np.isnat specifically # determine na_rep if x is None or NaT-like if x is None: - return 'None' + return "None" elif x is NaT or np.isnat(x): - return 'NaT' + return "NaT" except (TypeError, ValueError): # np.isnat only handles datetime or timedelta objects pass return self.na_rep elif isinstance(x, PandasObject): - return '{x}'.format(x=x) + return "{x}".format(x=x) else: # object dtype - return '{x}'.format(x=formatter(x)) + return "{x}".format(x=formatter(x)) vals = self.values if isinstance(vals, Index): @@ -1000,16 +1114,16 @@ def _format(x): fmt_values = [] for i, v in enumerate(vals): if not is_float_type[i] and leading_space: - fmt_values.append(' {v}'.format(v=_format(v))) + fmt_values.append(" {v}".format(v=_format(v))) elif is_float_type[i]: fmt_values.append(float_format(v)) else: if leading_space is False: # False specifically, so that the default is # to include a space if we get here. 
- tpl = '{v}' + tpl = "{v}" else: - tpl = ' {v}' + tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) return fmt_values @@ -1047,15 +1161,20 @@ def _value_formatter(self, float_format=None, threshold=None): # when there is no float_format, we use str instead of '%g' # because str(0.0) = '0.0' while '%g' % 0.0 = '0' if float_format: + def base_formatter(v): return float_format(value=v) if notna(v) else self.na_rep + else: + def base_formatter(v): return str(v) if notna(v) else self.na_rep - if self.decimal != '.': + if self.decimal != ".": + def decimal_formatter(v): - return base_formatter(v).replace('.', self.decimal, 1) + return base_formatter(v).replace(".", self.decimal, 1) + else: decimal_formatter = base_formatter @@ -1093,8 +1212,8 @@ def format_values_with(float_format): # default formatter leaves a space to the left when formatting # floats, must be consistent for left-justifying NaNs (GH #25061) - if self.justify == 'left': - na_rep = ' ' + self.na_rep + if self.justify == "left": + na_rep = " " + self.na_rep else: na_rep = self.na_rep @@ -1102,13 +1221,14 @@ def format_values_with(float_format): values = self.values is_complex = is_complex_dtype(values) mask = isna(values) - if hasattr(values, 'to_dense'): # sparse numpy ndarray + if hasattr(values, "to_dense"): # sparse numpy ndarray values = values.to_dense() - values = np.array(values, dtype='object') + values = np.array(values, dtype="object") values[mask] = na_rep imask = (~mask).ravel() - values.flat[imask] = np.array([formatter(val) - for val in values.ravel()[imask]]) + values.flat[imask] = np.array( + [formatter(val) for val in values.ravel()[imask]] + ) if self.fixed_width: if is_complex: @@ -1122,8 +1242,9 @@ def format_values_with(float_format): # The default is otherwise to use str instead of a formatting string if self.float_format is None: if self.fixed_width: - float_format = partial('{value: .{digits:d}f}'.format, - digits=self.digits) + float_format = partial( + "{value: .{digits:d}f}".format, digits=self.digits + ) else: float_format = self.float_format else: @@ -1144,18 +1265,18 @@ def format_values_with(float_format): else: too_long = False - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): abs_vals = np.abs(self.values) # this is pretty arbitrary for now # large values: more that 8 characters including decimal symbol # and first digit, hence > 1e6 has_large_values = (abs_vals > 1e6).any() - has_small_values = ((abs_vals < 10**(-self.digits)) & - (abs_vals > 0)).any() + has_small_values = ( + (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0) + ).any() if has_small_values or (too_long and has_large_values): - float_format = partial('{value: .{digits:d}e}'.format, - digits=self.digits) + float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values @@ -1169,16 +1290,14 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): - def _format_strings(self): - formatter = self.formatter or (lambda x: '{x: d}'.format(x=x)) + formatter = self.formatter or (lambda x: "{x: d}".format(x=x)) fmt_values = [formatter(x) for x in self.values] return fmt_values class Datetime64Formatter(GenericArrayFormatter): - - def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs): + def __init__(self, values, nat_rep="NaT", date_format=None, **kwargs): super().__init__(values, **kwargs) self.nat_rep = nat_rep self.date_format = date_format @@ -1196,9 +1315,9 @@ def 
_format_strings(self): fmt_values = format_array_from_datetime( values.asi8.ravel(), - format=_get_format_datetime64_from_values(values, - self.date_format), - na_rep=self.nat_rep).reshape(values.shape) + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep, + ).reshape(values.shape) return fmt_values.tolist() @@ -1216,12 +1335,16 @@ def _format_strings(self): else: array = np.asarray(values) - fmt_values = format_array(array, - formatter, - float_format=self.float_format, - na_rep=self.na_rep, digits=self.digits, - space=self.space, justify=self.justify, - leading_space=self.leading_space) + fmt_values = format_array( + array, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + digits=self.digits, + space=self.space, + justify=self.justify, + leading_space=self.leading_space, + ) return fmt_values @@ -1261,9 +1384,12 @@ def format_percentiles(percentiles): percentiles = np.asarray(percentiles) # It checks for np.NaN as well - with np.errstate(invalid='ignore'): - if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ - or not np.all(percentiles <= 1): + with np.errstate(invalid="ignore"): + if ( + not is_numeric_dtype(percentiles) + or not np.all(percentiles >= 0) + or not np.all(percentiles <= 1) + ): raise ValueError("percentiles should all be in the interval [0,1]") percentiles = 100 * percentiles @@ -1271,21 +1397,21 @@ def format_percentiles(percentiles): if np.all(int_idx): out = percentiles.astype(int).astype(str) - return [i + '%' for i in out] + return [i + "%" for i in out] unique_pcts = np.unique(percentiles) to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None # Least precision that keeps percentiles unique after rounding - prec = -np.floor(np.log10(np.min( - np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end) - ))).astype(int) + prec = -np.floor( + np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end))) + ).astype(int) prec = max(1, prec) out = np.empty_like(percentiles, dtype=object) out[int_idx] = percentiles[int_idx].astype(int).astype(str) out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) - return [i + '%' for i in out] + return [i + "%" for i in out] def _is_dates_only(values): @@ -1298,20 +1424,21 @@ def _is_dates_only(values): values_int = values.asi8 consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, - values_int % int(one_day_nanos) != 0).sum() == 0 + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % int(one_day_nanos) != 0).sum() == 0 + ) if even_days: return True return False -def _format_datetime64(x, tz=None, nat_rep='NaT'): +def _format_datetime64(x, tz=None, nat_rep="NaT"): if x is None or (is_scalar(x) and isna(x)): return nat_rep if tz is not None or not isinstance(x, Timestamp): - if getattr(x, 'tzinfo', None) is not None: + if getattr(x, "tzinfo", None) is not None: x = Timestamp(x).tz_convert(tz) else: x = Timestamp(x).tz_localize(tz) @@ -1319,7 +1446,7 @@ def _format_datetime64(x, tz=None, nat_rep='NaT'): return str(x) -def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): +def _format_datetime64_dateonly(x, nat_rep="NaT", date_format=None): if x is None or (is_scalar(x) and isna(x)): return nat_rep @@ -1332,11 +1459,12 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None): return x._date_repr -def _get_format_datetime64(is_dates_only, 
nat_rep='NaT', date_format=None): +def _get_format_datetime64(is_dates_only, nat_rep="NaT", date_format=None): if is_dates_only: return lambda x, tz=None: _format_datetime64_dateonly( - x, nat_rep=nat_rep, date_format=date_format) + x, nat_rep=nat_rep, date_format=date_format + ) else: return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) @@ -1356,36 +1484,34 @@ def _get_format_datetime64_from_values(values, date_format): class Datetime64TZFormatter(Datetime64Formatter): - def _format_strings(self): """ we by definition have a TZ """ values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = (self.formatter or - _get_format_datetime64(is_dates_only, - date_format=self.date_format)) + formatter = self.formatter or _get_format_datetime64( + is_dates_only, date_format=self.date_format + ) fmt_values = [formatter(x) for x in values] return fmt_values class Timedelta64Formatter(GenericArrayFormatter): - - def __init__(self, values, nat_rep='NaT', box=False, **kwargs): + def __init__(self, values, nat_rep="NaT", box=False, **kwargs): super().__init__(values, **kwargs) self.nat_rep = nat_rep self.box = box def _format_strings(self): - formatter = (self.formatter or - _get_format_timedelta64(self.values, nat_rep=self.nat_rep, - box=self.box)) + formatter = self.formatter or _get_format_timedelta64( + self.values, nat_rep=self.nat_rep, box=self.box + ) fmt_values = np.array([formatter(x) for x in self.values]) return fmt_values -def _get_format_timedelta64(values, nat_rep='NaT', box=False): +def _get_format_timedelta64(values, nat_rep="NaT", box=False): """ Return a formatter function for a range of timedeltas. These will all have the same format argument @@ -1397,18 +1523,20 @@ def _get_format_timedelta64(values, nat_rep='NaT', box=False): consider_values = values_int != iNaT - one_day_nanos = (86400 * 1e9) - even_days = np.logical_and(consider_values, - values_int % one_day_nanos != 0).sum() == 0 - all_sub_day = np.logical_and( - consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 + one_day_nanos = 86400 * 1e9 + even_days = ( + np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0 + ) + all_sub_day = ( + np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0 + ) if even_days: format = None elif all_sub_day: - format = 'sub_day' + format = "sub_day" else: - format = 'long' + format = "long" def _formatter(x): if x is None or (is_scalar(x) and isna(x)): @@ -1424,9 +1552,9 @@ def _formatter(x): return _formatter -def _make_fixed_width(strings, justify='right', minimum=None, adj=None): +def _make_fixed_width(strings, justify="right", minimum=None, adj=None): - if len(strings) == 0 or justify == 'all': + if len(strings) == 0 or justify == "all": return strings if adj is None: @@ -1444,7 +1572,7 @@ def _make_fixed_width(strings, justify='right', minimum=None, adj=None): def just(x): if conf_max is not None: if (conf_max > 3) & (adj.len(x) > max_len): - x = x[:max_len - 3] + '...' + x = x[: max_len - 3] + "..." return x strings = [just(x) for x in strings] @@ -1452,41 +1580,46 @@ def just(x): return result -def _trim_zeros_complex(str_complexes, na_rep='NaN'): +def _trim_zeros_complex(str_complexes, na_rep="NaN"): """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. 
""" + def separate_and_trim(str_complex, na_rep): - num_arr = str_complex.split('+') - return (_trim_zeros_float([num_arr[0]], na_rep) + - ['+'] + - _trim_zeros_float([num_arr[1][:-1]], na_rep) + - ['j']) + num_arr = str_complex.split("+") + return ( + _trim_zeros_float([num_arr[0]], na_rep) + + ["+"] + + _trim_zeros_float([num_arr[1][:-1]], na_rep) + + ["j"] + ) - return [''.join(separate_and_trim(x, na_rep)) for x in str_complexes] + return ["".join(separate_and_trim(x, na_rep)) for x in str_complexes] -def _trim_zeros_float(str_floats, na_rep='NaN'): +def _trim_zeros_float(str_floats, na_rep="NaN"): """ Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats def _is_number(x): - return (x != na_rep and not x.endswith('inf')) + return x != na_rep and not x.endswith("inf") def _cond(values): finite = [x for x in values if _is_number(x)] - return (len(finite) > 0 and all(x.endswith('0') for x in finite) and - not (any(('e' in x) or ('E' in x) for x in finite))) + return ( + len(finite) > 0 + and all(x.endswith("0") for x in finite) + and not (any(("e" in x) or ("E" in x) for x in finite)) + ) while _cond(trimmed): trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. - return [x + "0" if x.endswith('.') and _is_number(x) else x - for x in trimmed] + return [x + "0" if x.endswith(".") and _is_number(x) else x for x in trimmed] def _has_names(index): @@ -1521,7 +1654,7 @@ class EngFormatter: 15: "P", 18: "E", 21: "Z", - 24: "Y" + 24: "Y", } def __init__(self, accuracy=None, use_eng_prefix=False): @@ -1551,13 +1684,14 @@ def __call__(self, num): """ import decimal import math + dnum = decimal.Decimal(str(num)) if decimal.Decimal.is_nan(dnum): - return 'NaN' + return "NaN" if decimal.Decimal.is_infinite(dnum): - return 'inf' + return "inf" sign = 1 @@ -1578,17 +1712,16 @@ def __call__(self, num): prefix = self.ENG_PREFIXES[int_pow10] else: if int_pow10 < 0: - prefix = 'E-{pow10:02d}'.format(pow10=-int_pow10) + prefix = "E-{pow10:02d}".format(pow10=-int_pow10) else: - prefix = 'E+{pow10:02d}'.format(pow10=int_pow10) + prefix = "E+{pow10:02d}".format(pow10=int_pow10) - mant = sign * dnum / (10**pow10) + mant = sign * dnum / (10 ** pow10) if self.accuracy is None: # pragma: no cover format_str = "{mant: g}{prefix}" else: - format_str = ("{{mant: .{acc:d}f}}{{prefix}}" - .format(acc=self.accuracy)) + format_str = "{{mant: .{acc:d}f}}{{prefix}}".format(acc=self.accuracy) formatted = format_str.format(mant=mant, prefix=prefix) @@ -1628,7 +1761,7 @@ def _binify(cols, line_width): return bins -def get_level_lengths(levels, sentinel=''): +def get_level_lengths(levels, sentinel=""): """For each index in each level the function returns lengths of indexes. 
Parameters @@ -1681,4 +1814,4 @@ def buffer_put_lines(buf, lines): """ if any(isinstance(x, str) for x in lines): lines = [str(x) for x in lines] - buf.write('\n'.join(lines)) + buf.write("\n".join(lines)) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 6fc36324092b5..e6aae44baa69b 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -35,17 +35,16 @@ def __init__(self, formatter, classes=None, border=None): self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] - self.bold_rows = self.fmt.kwds.get('bold_rows', False) - self.escape = self.fmt.kwds.get('escape', True) + self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.escape = self.fmt.kwds.get("escape", True) self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option('display.html.border') + border = get_option("display.html.border") self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links if isinstance(self.fmt.col_space, int): - self.fmt.col_space = ('{colspace}px' - .format(colspace=self.fmt.col_space)) + self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space) @property def show_row_idx_names(self): @@ -83,7 +82,7 @@ def ncols(self): def write(self, s, indent=0): rs = pprint_thing(s) - self.elements.append(' ' * indent + rs) + self.elements.append(" " * indent + rs) def write_th(self, s, header=False, indent=0, tags=None): """ @@ -109,25 +108,23 @@ def write_th(self, s, header=False, indent=0, tags=None): A written cell. """ if header and self.fmt.col_space is not None: - tags = (tags or "") - tags += ('style="min-width: {colspace};"' - .format(colspace=self.fmt.col_space)) + tags = tags or "" + tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space) - return self._write_cell(s, kind='th', indent=indent, tags=tags) + return self._write_cell(s, kind="th", indent=indent, tags=tags) def write_td(self, s, indent=0, tags=None): - return self._write_cell(s, kind='td', indent=indent, tags=tags) + return self._write_cell(s, kind="td", indent=indent, tags=tags) - def _write_cell(self, s, kind='td', indent=0, tags=None): + def _write_cell(self, s, kind="td", indent=0, tags=None): if tags is not None: - start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags) + start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags) else: - start_tag = '<{kind}>'.format(kind=kind) + start_tag = "<{kind}>".format(kind=kind) if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'), - ('>', r'&gt;')]) + esc = OrderedDict([("&", r"&amp;"), ("<", r"&lt;"), (">", r"&gt;")]) else: esc = {} @@ -135,25 +132,35 @@ def _write_cell(self, s, kind='td', indent=0, tags=None): rs = pprint_thing(s, escape_chars=esc).strip() if self.render_links and _is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() - start_tag += '<a href="{url}" target="_blank">'.format( - url=rs_unescaped) - end_a = '</a>' + start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped) + end_a = "</a>" else: - end_a = '' - - self.write('{start}{rs}{end_a}</{kind}>'.format( - start=start_tag, rs=rs, end_a=end_a, kind=kind), indent) - - def write_tr(self, line, indent=0, indent_delta=0, header=False, - align=None, tags=None, nindex_levels=0): + end_a = "" + + self.write( + "{start}{rs}{end_a}</{kind}>".format( + start=start_tag, rs=rs, end_a=end_a, kind=kind + ), + indent, + ) + + def write_tr( + self, + line, + indent=0, + indent_delta=0, + header=False, + align=None, + tags=None, + nindex_levels=0, + ): if tags is None: tags = {} if align is None: - self.write('<tr>', indent) +
self.write("", indent) else: - self.write('' - .format(align=align), indent) + self.write(''.format(align=align), indent) indent += indent_delta for i, s in enumerate(line): @@ -164,31 +171,34 @@ def write_tr(self, line, indent=0, indent_delta=0, header=False, self.write_td(s, indent, tags=val_tag) indent -= indent_delta - self.write('', indent) + self.write("", indent) def render(self): self._write_table() if self.should_show_dimensions: by = chr(215) # × - self.write('

{rows} rows {by} {cols} columns

' - .format(rows=len(self.frame), - by=by, - cols=len(self.frame.columns))) + self.write( + "

{rows} rows {by} {cols} columns

".format( + rows=len(self.frame), by=by, cols=len(self.frame.columns) + ) + ) return self.elements def _write_table(self, indent=0): - _classes = ['dataframe'] # Default class. + _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: - _classes.append('tex2jax_ignore') + _classes.append("tex2jax_ignore") if self.classes is not None: if isinstance(self.classes, str): self.classes = self.classes.split() if not isinstance(self.classes, (list, tuple)): - raise TypeError('classes must be a string, list, or tuple, ' - 'not {typ}'.format(typ=type(self.classes))) + raise TypeError( + "classes must be a string, list, or tuple, " + "not {typ}".format(typ=type(self.classes)) + ) _classes.extend(self.classes) if self.table_id is None: @@ -196,16 +206,19 @@ def _write_table(self, indent=0): else: id_section = ' id="{table_id}"'.format(table_id=self.table_id) - self.write('' - .format(border=self.border, cls=' '.join(_classes), - id_section=id_section), indent) + self.write( + '
'.format( + border=self.border, cls=" ".join(_classes), id_section=id_section + ), + indent, + ) if self.fmt.header or self.show_row_idx_names: self._write_header(indent + self.indent_delta) self._write_body(indent + self.indent_delta) - self.write('
', indent) + self.write("", indent) def _write_col_header(self, indent): truncate_h = self.fmt.truncate_h @@ -217,12 +230,10 @@ def _write_col_header(self, indent): sentinel = object() else: sentinel = False - levels = self.columns.format(sparsify=sentinel, adjoin=False, - names=False) + levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 - for lnum, (records, values) in enumerate(zip(level_lengths, - levels)): + for lnum, (records, values) in enumerate(zip(level_lengths, levels)): if truncate_h: # modify the header lines ins_col = self.fmt.tr_col_num @@ -235,21 +246,23 @@ def _write_col_header(self, indent): elif tag + span > ins_col: recs_new[tag] = span + 1 if lnum == inner_lvl: - values = (values[:ins_col] + ('...',) + - values[ins_col:]) + values = ( + values[:ins_col] + ("...",) + values[ins_col:] + ) else: # sparse col headers do not receive a ... - values = (values[:ins_col] + - (values[ins_col - 1], ) + - values[ins_col:]) + values = ( + values[:ins_col] + + (values[ins_col - 1],) + + values[ins_col:] + ) else: recs_new[tag] = span # if ins_col lies between tags, all col headers # get ... if tag + span == ins_col: recs_new[ins_col] = 1 - values = (values[:ins_col] + ('...',) + - values[ins_col:]) + values = values[:ins_col] + ("...",) + values[ins_col:] records = recs_new inner_lvl = len(level_lengths) - 1 if lnum == inner_lvl: @@ -263,8 +276,7 @@ def _write_col_header(self, indent): recs_new[tag] = span recs_new[ins_col] = 1 records = recs_new - values = (values[:ins_col] + ['...'] + - values[ins_col:]) + values = values[:ins_col] + ["..."] + values[ins_col:] # see gh-22579 # Column Offset Bug with to_html(index=False) with @@ -272,7 +284,7 @@ def _write_col_header(self, indent): # Initially fill row with blank cells before column names. # TODO: Refactor to remove code duplication with code # block below for standard columns index. - row = [''] * (self.row_levels - 1) + row = [""] * (self.row_levels - 1) if self.fmt.index or self.show_col_idx_names: # see gh-22747 # If to_html(index_names=False) do not show columns @@ -283,9 +295,9 @@ def _write_col_header(self, indent): # parity with DataFrameFormatter class. if self.fmt.show_index_names: name = self.columns.names[lnum] - row.append(pprint_thing(name or '')) + row.append(pprint_thing(name or "")) else: - row.append('') + row.append("") tags = {} j = len(row) @@ -297,8 +309,7 @@ def _write_col_header(self, indent): continue j += 1 row.append(v) - self.write_tr(row, indent, self.indent_delta, tags=tags, - header=True) + self.write_tr(row, indent, self.indent_delta, tags=tags, header=True) else: # see gh-22579 # Column misalignment also occurs for @@ -306,7 +317,7 @@ def _write_col_header(self, indent): # Initially fill row with blank cells before column names. # TODO: Refactor to remove code duplication with code block # above for columns MultiIndex. - row = [''] * (self.row_levels - 1) + row = [""] * (self.row_levels - 1) if self.fmt.index or self.show_col_idx_names: # see gh-22747 # If to_html(index_names=False) do not show columns @@ -314,27 +325,27 @@ def _write_col_header(self, indent): # TODO: Refactor to use _get_column_name_list from # DataFrameFormatter class. 
if self.fmt.show_index_names: - row.append(self.columns.name or '') + row.append(self.columns.name or "") else: - row.append('') + row.append("") row.extend(self._get_columns_formatted_values()) align = self.fmt.justify if truncate_h: ins_col = self.row_levels + self.fmt.tr_col_num - row.insert(ins_col, '...') + row.insert(ins_col, "...") - self.write_tr(row, indent, self.indent_delta, header=True, - align=align) + self.write_tr(row, indent, self.indent_delta, header=True, align=align) def _write_row_header(self, indent): truncate_h = self.fmt.truncate_h - row = ([x if x is not None else '' for x in self.frame.index.names] - + [''] * (self.ncols + (1 if truncate_h else 0))) + row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( + self.ncols + (1 if truncate_h else 0) + ) self.write_tr(row, indent, self.indent_delta, header=True) def _write_header(self, indent): - self.write('<thead>', indent) + self.write("<thead>", indent) if self.fmt.header: self._write_col_header(indent + self.indent_delta) @@ -342,27 +353,24 @@ def _write_header(self, indent): if self.show_row_idx_names: self._write_row_header(indent + self.indent_delta) - self.write('</thead>', indent) + self.write("</thead>", indent) def _get_formatted_values(self): - with option_context('display.max_colwidth', 999999): - fmt_values = {i: self.fmt._format_col(i) - for i in range(self.ncols)} + with option_context("display.max_colwidth", 999999): + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent): - self.write('<tbody>', indent) + self.write("<tbody>", indent) fmt_values = self._get_formatted_values() # write values if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): - self._write_hierarchical_rows( - fmt_values, indent + self.indent_delta) + self._write_hierarchical_rows(fmt_values, indent + self.indent_delta) else: - self._write_regular_rows( - fmt_values, indent + self.indent_delta) + self._write_regular_rows(fmt_values, indent + self.indent_delta) - self.write('</tbody>', indent) + self.write("</tbody>", indent) def _write_regular_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h @@ -371,7 +379,7 @@ def _write_regular_rows(self, fmt_values, indent): nrows = len(self.fmt.tr_frame) if self.fmt.index: - fmt = self.fmt._get_formatter('__index__') + fmt = self.fmt._get_formatter("__index__") if fmt is not None: index_values = self.fmt.tr_frame.index.map(fmt) else: @@ -381,9 +389,14 @@ def _write_regular_rows(self, fmt_values, indent): for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = ['...'] * len(row) - self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=self.row_levels) + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) row = [] if self.fmt.index: @@ -393,14 +406,15 @@ def _write_regular_rows(self, fmt_values, indent): # a standard index when the columns index is named. # Add blank cell before data cells.
elif self.show_col_idx_names: - row.append('') + row.append("") row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: dot_col_ix = self.fmt.tr_col_num + self.row_levels - row.insert(dot_col_ix, '...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=self.row_levels) + row.insert(dot_col_ix, "...") + self.write_tr( + row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels + ) def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' @@ -410,15 +424,13 @@ def _write_hierarchical_rows(self, fmt_values, indent): frame = self.fmt.tr_frame nrows = len(frame) - idx_values = frame.index.format(sparsify=False, adjoin=False, - names=False) + idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = list(zip(*idx_values)) if self.fmt.sparsify: # GH3547 sentinel = object() - levels = frame.index.format(sparsify=sentinel, adjoin=False, - names=False) + levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 @@ -438,12 +450,12 @@ def _write_hierarchical_rows(self, fmt_values, indent): # GH 14882 - Make sure insertion done once if not inserted: dot_row = list(idx_values[ins_row - 1]) - dot_row[-1] = '...' + dot_row[-1] = "..." idx_values.insert(ins_row, tuple(dot_row)) inserted = True else: dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = '...' + dot_row[inner_lvl - lnum] = "..." idx_values[ins_row] = tuple(dot_row) else: rec_new[tag] = span @@ -452,19 +464,20 @@ def _write_hierarchical_rows(self, fmt_values, indent): if tag + span == ins_row: rec_new[ins_row] = 1 if lnum == 0: - idx_values.insert(ins_row, tuple( - ['...'] * len(level_lengths))) + idx_values.insert( + ins_row, tuple(["..."] * len(level_lengths)) + ) # GH 14882 - Place ... in correct level elif inserted: dot_row = list(idx_values[ins_row]) - dot_row[inner_lvl - lnum] = '...' + dot_row[inner_lvl - lnum] = "..." idx_values[ins_row] = tuple(dot_row) level_lengths[lnum] = rec_new level_lengths[inner_lvl][ins_row] = 1 for ix_col in range(len(fmt_values)): - fmt_values[ix_col].insert(ins_row, '...') + fmt_values[ix_col].insert(ins_row, "...") nrows += 1 for i in range(nrows): @@ -486,27 +499,44 @@ def _write_hierarchical_rows(self, fmt_values, indent): row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: - row.insert(self.row_levels - sparse_offset + - self.fmt.tr_col_num, '...') - self.write_tr(row, indent, self.indent_delta, tags=tags, - nindex_levels=len(levels) - sparse_offset) + row.insert( + self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." 
+ ) + self.write_tr( + row, + indent, + self.indent_delta, + tags=tags, + nindex_levels=len(levels) - sparse_offset, + ) else: row = [] for i in range(len(frame)): if truncate_v and i == (self.fmt.tr_row_num): - str_sep_row = ['...'] * len(row) - self.write_tr(str_sep_row, indent, self.indent_delta, - tags=None, nindex_levels=self.row_levels) - - idx_values = list(zip(*frame.index.format( - sparsify=False, adjoin=False, names=False))) + str_sep_row = ["..."] * len(row) + self.write_tr( + str_sep_row, + indent, + self.indent_delta, + tags=None, + nindex_levels=self.row_levels, + ) + + idx_values = list( + zip(*frame.index.format(sparsify=False, adjoin=False, names=False)) + ) row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: - row.insert(self.row_levels + self.fmt.tr_col_num, '...') - self.write_tr(row, indent, self.indent_delta, tags=None, - nindex_levels=frame.index.nlevels) + row.insert(self.row_levels + self.fmt.tr_col_num, "...") + self.write_tr( + row, + indent, + self.indent_delta, + tags=None, + nindex_levels=frame.index.nlevels, + ) class NotebookFormatter(HTMLFormatter): @@ -534,34 +564,25 @@ def write_style(self): .dataframe %s { %s: %s; }""" - element_props = [('tbody tr th:only-of-type', - 'vertical-align', - 'middle'), - ('tbody tr th', - 'vertical-align', - 'top')] + element_props = [ + ("tbody tr th:only-of-type", "vertical-align", "middle"), + ("tbody tr th", "vertical-align", "top"), + ] if isinstance(self.columns, ABCMultiIndex): - element_props.append(('thead tr th', - 'text-align', - 'left')) + element_props.append(("thead tr th", "text-align", "left")) if self.show_row_idx_names: - element_props.append(('thead tr:last-of-type th', - 'text-align', - 'right')) + element_props.append( + ("thead tr:last-of-type th", "text-align", "right") + ) else: - element_props.append(('thead th', - 'text-align', - 'right')) - template_mid = '\n\n'.join(map(lambda t: template_select % t, - element_props)) - template = dedent('\n'.join((template_first, - template_mid, - template_last))) + element_props.append(("thead th", "text-align", "right")) + template_mid = "\n\n".join(map(lambda t: template_select % t, element_props)) + template = dedent("\n".join((template_first, template_mid, template_last))) self.write(template) def render(self): - self.write('
') + self.write("
") self.write_style() super().render() - self.write('
') + self.write("
") return self.elements diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 33bc413e9c3fe..dad099b747701 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -26,11 +26,18 @@ class LatexFormatter(TableFormatter): HTMLFormatter """ - def __init__(self, formatter, column_format=None, longtable=False, - multicolumn=False, multicolumn_format=None, multirow=False): + def __init__( + self, + formatter, + column_format=None, + longtable=False, + multicolumn=False, + multicolumn_format=None, + multirow=False, + ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get('bold_rows', False) + self.bold_rows = self.fmt.kwds.get("bold_rows", False) self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn @@ -44,25 +51,28 @@ def write_result(self, buf): # string representation of the columns if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}' - .format(name=type(self.frame).__name__, - col=self.frame.columns, - idx=self.frame.index)) + info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format( + name=type(self.frame).__name__, + col=self.frame.columns, + idx=self.frame.index, + ) strcols = [[info_line]] else: strcols = self.fmt._to_str_columns() def get_col_type(dtype): if issubclass(dtype.type, np.number): - return 'r' + return "r" else: - return 'l' + return "l" # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( - adjoin=False, sparsify=self.fmt.sparsify, - names=self.fmt.has_index_names, na_rep=self.fmt.na_rep + adjoin=False, + sparsify=self.fmt.sparsify, + names=self.fmt.has_index_names, + na_rep=self.fmt.na_rep, ) # index.format will sparsify repeated entries with empty strings @@ -71,17 +81,18 @@ def pad_empties(x): for pad in reversed(x): if pad: break - return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]] + return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] + out = (pad_empties(i) for i in out) # Add empty spaces for each column level clevels = self.frame.columns.nlevels - out = [[' ' * len(i[-1])] * clevels + i for i in out] + out = [[" " * len(i[-1])] * clevels + i for i in out] # Add the column names to the last index column cnames = self.frame.columns.names if any(cnames): - new_names = [i if i else '{}' for i in cnames] + new_names = [i if i else "{}" for i in cnames] out[self.frame.index.nlevels - 1][:clevels] = new_names # Get rid of old multiindex column and add new ones @@ -90,22 +101,22 @@ def pad_empties(x): column_format = self.column_format if column_format is None: dtypes = self.frame.dtypes._values - column_format = ''.join(map(get_col_type, dtypes)) + column_format = "".join(map(get_col_type, dtypes)) if self.fmt.index: - index_format = 'l' * self.frame.index.nlevels + index_format = "l" * self.frame.index.nlevels column_format = index_format + column_format elif not isinstance(column_format, str): # pragma: no cover - raise AssertionError('column_format must be str or unicode, ' - 'not {typ}'.format(typ=type(column_format))) + raise AssertionError( + "column_format must be str or unicode, " + "not {typ}".format(typ=type(column_format)) + ) if not self.longtable: - buf.write('\\begin{{tabular}}{{{fmt}}}\n' - .format(fmt=column_format)) - buf.write('\\toprule\n') + buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) + buf.write("\\toprule\n") else: 
- buf.write('\\begin{{longtable}}{{{fmt}}}\n' - .format(fmt=column_format)) - buf.write('\\toprule\n') + buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) + buf.write("\\toprule\n") ilevels = self.frame.index.nlevels clevels = self.frame.columns.nlevels @@ -117,50 +128,63 @@ def pad_empties(x): for i, row in enumerate(strrows): if i == nlevels and self.fmt.header: - buf.write('\\midrule\n') # End of header + buf.write("\\midrule\n") # End of header if self.longtable: - buf.write('\\endhead\n') - buf.write('\\midrule\n') - buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next ' - 'page}}}} \\\\\n'.format(n=len(row))) - buf.write('\\midrule\n') - buf.write('\\endfoot\n\n') - buf.write('\\bottomrule\n') - buf.write('\\endlastfoot\n') - if self.fmt.kwds.get('escape', True): + buf.write("\\endhead\n") + buf.write("\\midrule\n") + buf.write( + "\\multicolumn{{{n}}}{{r}}{{{{Continued on next " + "page}}}} \\\\\n".format(n=len(row)) + ) + buf.write("\\midrule\n") + buf.write("\\endfoot\n\n") + buf.write("\\bottomrule\n") + buf.write("\\endlastfoot\n") + if self.fmt.kwds.get("escape", True): # escape backslashes first - crow = [(x.replace('\\', '\\textbackslash ') - .replace('_', '\\_') - .replace('%', '\\%').replace('$', '\\$') - .replace('#', '\\#').replace('{', '\\{') - .replace('}', '\\}').replace('~', '\\textasciitilde ') - .replace('^', '\\textasciicircum ') - .replace('&', '\\&') - if (x and x != '{}') else '{}') for x in row] + crow = [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] else: - crow = [x if x else '{}' for x in row] + crow = [x if x else "{}" for x in row] if self.bold_rows and self.fmt.index: # bold row labels - crow = ['\\textbf{{{x}}}'.format(x=x) - if j < ilevels and x.strip() not in ['', '{}'] else x - for j, x in enumerate(crow)] + crow = [ + "\\textbf{{{x}}}".format(x=x) + if j < ilevels and x.strip() not in ["", "{}"] + else x + for j, x in enumerate(crow) + ] if i < clevels and self.fmt.header and self.multicolumn: # sum up columns to multicolumns crow = self._format_multicolumn(crow, ilevels) - if (i >= nlevels and self.fmt.index and self.multirow and - ilevels > 1): + if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: # sum up rows to multirows crow = self._format_multirow(crow, ilevels, i, strrows) - buf.write(' & '.join(crow)) - buf.write(' \\\\\n') + buf.write(" & ".join(crow)) + buf.write(" \\\\\n") if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) if not self.longtable: - buf.write('\\bottomrule\n') - buf.write('\\end{tabular}\n') + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") else: - buf.write('\\end{longtable}\n') + buf.write("\\end{longtable}\n") def _format_multicolumn(self, row, ilevels): r""" @@ -174,17 +198,20 @@ def _format_multicolumn(self, row, ilevels): """ row2 = list(row[:ilevels]) ncol = 1 - coltext = '' + coltext = "" def append_col(): # write multicolumn if needed if ncol > 1: - row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}' - .format(ncol=ncol, fmt=self.multicolumn_format, - txt=coltext.strip())) + row2.append( + "\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format( + ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip() + ) + ) # don't modify 
where not needed else: row2.append(coltext) + for c in row[ilevels:]: # if next col has text, write the previous if c.strip(): @@ -213,15 +240,16 @@ def _format_multirow(self, row, ilevels, i, rows): for j in range(ilevels): if row[j].strip(): nrow = 1 - for r in rows[i + 1:]: + for r in rows[i + 1 :]: if not r[j].strip(): nrow += 1 else: break if nrow > 1: # overwrite non-multirow entry - row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format( - nrow=nrow, row=row[j].strip()) + row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format( + nrow=nrow, row=row[j].strip() + ) # save when to end the current block with \cline self.clinebuf.append([i + nrow - 1, j + 1]) return row @@ -232,7 +260,6 @@ def _print_cline(self, buf, i, icol): """ for cl in self.clinebuf: if cl[0] == i: - buf.write('\\cline{{{cl:d}-{icol:d}}}\n' - .format(cl=cl[1], icol=icol)) + buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol)) # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 73d8586a0a8c9..4958d8246610e 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -25,8 +25,8 @@ def adjoin(space, *lists, **kwargs): justfunc : callable function used to justify str. Needed for unicode handling. """ - strlen = kwargs.pop('strlen', len) - justfunc = kwargs.pop('justfunc', justify) + strlen = kwargs.pop("strlen", len) + justfunc = kwargs.pop("justfunc", justify) out_lines = [] newLists = [] @@ -35,34 +35,33 @@ def adjoin(space, *lists, **kwargs): lengths.append(max(map(len, lists[-1]))) maxLen = max(map(len, lists)) for i, lst in enumerate(lists): - nl = justfunc(lst, lengths[i], mode='left') - nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) + nl = justfunc(lst, lengths[i], mode="left") + nl.extend([" " * lengths[i]] * (maxLen - len(lst))) newLists.append(nl) toJoin = zip(*newLists) for lines in toJoin: out_lines.append(_join_unicode(lines)) - return _join_unicode(out_lines, sep='\n') + return _join_unicode(out_lines, sep="\n") -def justify(texts, max_len, mode='right'): +def justify(texts, max_len, mode="right"): """ Perform ljust, center, rjust against string or list-like """ - if mode == 'left': + if mode == "left": return [x.ljust(max_len) for x in texts] - elif mode == 'center': + elif mode == "center": return [x.center(max_len) for x in texts] else: return [x.rjust(max_len) for x in texts] -def _join_unicode(lines, sep=''): +def _join_unicode(lines, sep=""): try: return sep.join(lines) except UnicodeDecodeError: sep = str(sep) - return sep.join([x.decode('utf-8') if isinstance(x, str) else x - for x in lines]) + return sep.join([x.decode("utf-8") if isinstance(x, str) else x for x in lines]) # Unicode consolidation @@ -99,7 +98,7 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): if isinstance(seq, set): fmt = "{{{body}}}" else: - fmt = "[{body}]" if hasattr(seq, '__setitem__') else "({body})" + fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})" if max_seq_items is False: nitems = len(seq) @@ -108,15 +107,16 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): s = iter(seq) # handle sets, no slicing - r = [pprint_thing(next(s), - _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) - for i in range(min(nitems, len(seq)))] + r = [ + pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq))) + ] body = ", ".join(r) if nitems < len(seq): body += 
", ..." elif isinstance(seq, tuple) and len(seq) == 1: - body += ',' + body += "," return fmt.format(body=body) @@ -139,10 +139,10 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): for k, v in list(seq.items())[:nitems]: pairs.append( pfmt.format( - key=pprint_thing(k, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds), - val=pprint_thing(v, _nest_lvl + 1, - max_seq_items=max_seq_items, **kwds))) + key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), + ) + ) if nitems < len(seq): return fmt.format(things=", ".join(pairs) + ", ...") @@ -150,8 +150,14 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): return fmt.format(things=", ".join(pairs)) -def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, - quote_strings=False, max_seq_items=None): +def pprint_thing( + thing, + _nest_lvl=0, + escape_chars=None, + default_escapes=False, + quote_strings=False, + max_seq_items=None, +): """ This function is the sanctioned way of converting objects to a unicode representation. @@ -188,9 +194,9 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): result = str(thing) # we should try this first except UnicodeDecodeError: # either utf-8 or we replace errors - result = str(thing).decode('utf-8', "replace") + result = str(thing).decode("utf-8", "replace") - translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', } + translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: translate.update(escape_chars) @@ -204,17 +210,22 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return str(result) - if hasattr(thing, '__next__'): + if hasattr(thing, "__next__"): return str(thing) - elif (isinstance(thing, dict) and - _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_dict(thing, _nest_lvl, quote_strings=True, - max_seq_items=max_seq_items) - elif (is_sequence(thing) and - _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, - quote_strings=quote_strings, - max_seq_items=max_seq_items) + elif isinstance(thing, dict) and _nest_lvl < get_option( + "display.pprint_nest_depth" + ): + result = _pprint_dict( + thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items + ) + elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"): + result = _pprint_seq( + thing, + _nest_lvl, + escape_chars=escape_chars, + quote_strings=quote_strings, + max_seq_items=max_seq_items, + ) elif isinstance(thing, str) and quote_strings: result = "'{thing}'".format(thing=as_escaped_unicode(thing)) else: @@ -223,16 +234,17 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return str(result) # always unicode -def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds): +def pprint_thing_encoded(object, encoding="utf-8", errors="replace", **kwds): value = pprint_thing(object) # get unicode representation of object return value.encode(encoding, errors, **kwds) def _enable_data_resource_formatter(enable): - if 'IPython' not in sys.modules: + if "IPython" not in sys.modules: # definitely not in IPython return from IPython import get_ipython + ip = get_ipython() if ip is None: # still not in IPython @@ -247,8 +259,9 @@ def _enable_data_resource_formatter(enable): from IPython.core.formatters import BaseFormatter class TableSchemaFormatter(BaseFormatter): - print_method = 
'_repr_data_resource_' + print_method = "_repr_data_resource_" _return_type = (dict,) + # register it: formatters[mimetype] = TableSchemaFormatter() # enable it if it's been disabled: @@ -259,13 +272,19 @@ class TableSchemaFormatter(BaseFormatter): formatters[mimetype].enabled = False -default_pprint = lambda x, max_seq_items=None: \ - pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True, - max_seq_items=max_seq_items) +default_pprint = lambda x, max_seq_items=None: pprint_thing( + x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items +) -def format_object_summary(obj, formatter, is_justify=True, name=None, - indent_for_name=True, line_break_each_value=False): +def format_object_summary( + obj, + formatter, + is_justify=True, + name=None, + indent_for_name=True, + line_break_each_value=False, +): """ Return the formatted obj as a unicode string @@ -299,14 +318,14 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, display_width, _ = get_console_size() if display_width is None: - display_width = get_option('display.width') or 80 + display_width = get_option("display.width") or 80 if name is None: name = obj.__class__.__name__ if indent_for_name: name_len = len(name) - space1 = "\n%s" % (' ' * (name_len + 1)) - space2 = "\n%s" % (' ' * (name_len + 2)) + space1 = "\n%s" % (" " * (name_len + 1)) + space2 = "\n%s" % (" " * (name_len + 2)) else: space1 = "\n" space2 = "\n " # space for the opening '[' @@ -315,10 +334,10 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, if line_break_each_value: # If we want to vertically align on each value of obj, we need to # separate values by a line break and indent the values - sep = ',\n ' + ' ' * len(name) + sep = ",\n " + " " * len(name) else: - sep = ',' - max_seq_items = get_option('display.max_seq_items') or n + sep = "," + max_seq_items = get_option("display.max_seq_items") or n # are we a truncated display is_truncated = n > max_seq_items @@ -328,8 +347,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None, def _extend_line(s, line, value, display_width, next_line_prefix): - if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >= - display_width): + if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width: s += line.rstrip() line = next_line_prefix line += value @@ -341,17 +359,17 @@ def best_len(values): else: return 0 - close = ', ' + close = ", " if n == 0: - summary = '[]{}'.format(close) + summary = "[]{}".format(close) elif n == 1 and not line_break_each_value: first = formatter(obj[0]) - summary = '[{}]{}'.format(first, close) + summary = "[{}]{}".format(first, close) elif n == 2 and not line_break_each_value: first = formatter(obj[0]) last = formatter(obj[-1]) - summary = '[{}, {}]{}'.format(first, last, close) + summary = "[{}, {}]{}".format(first, last, close) else: if n > max_seq_items: @@ -369,8 +387,10 @@ def best_len(values): # strings will right align when head and tail are stacked # vertically. 
head, tail = _justify(head, tail) - elif (is_truncated or not (len(', '.join(head)) < display_width and - len(', '.join(tail)) < display_width)): + elif is_truncated or not ( + len(", ".join(head)) < display_width + and len(", ".join(tail)) < display_width + ): # Each string in head and tail should align with each other max_length = max(best_len(head), best_len(tail)) head = [x.rjust(max_length) for x in head] @@ -396,37 +416,34 @@ def best_len(values): line = space2 for max_items in range(len(head)): - word = head[max_items] + sep + ' ' - summary, line = _extend_line(summary, line, word, - display_width, space2) + word = head[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) if is_truncated: # remove trailing space of last line - summary += line.rstrip() + space2 + '...' + summary += line.rstrip() + space2 + "..." line = space2 for max_items in range(len(tail) - 1): - word = tail[max_items] + sep + ' ' - summary, line = _extend_line(summary, line, word, - display_width, space2) + word = tail[max_items] + sep + " " + summary, line = _extend_line(summary, line, word, display_width, space2) # last value: no sep added + 1 space of width used for trailing ',' - summary, line = _extend_line(summary, line, tail[-1], - display_width - 2, space2) + summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2) summary += line # right now close is either '' or ', ' # Now we want to include the ']', but not the maybe space. - close = ']' + close.rstrip(' ') + close = "]" + close.rstrip(" ") summary += close if len(summary) > (display_width) or line_break_each_value: summary += space1 else: # one row - summary += ' ' + summary += " " # remove initial space - summary = '[' + summary[len(space2):] + summary = "[" + summary[len(space2) :] return summary @@ -461,10 +478,12 @@ def _justify(head, tail): max_length = [max(x, y) for x, y in zip(max_length, length)] # justify each item in each list-like in head and tail using max_length - head = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) - for seq in head] - tail = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) - for seq in tail] + head = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head + ] + tail = [ + tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail + ] return head, tail @@ -486,13 +505,13 @@ def format_object_attrs(obj, include_dtype=True): """ attrs = [] - if hasattr(obj, 'dtype') and include_dtype: - attrs.append(('dtype', "'{}'".format(obj.dtype))) - if getattr(obj, 'name', None) is not None: - attrs.append(('name', default_pprint(obj.name))) - elif getattr(obj, 'names', None) is not None and any(obj.names): - attrs.append(('names', default_pprint(obj.names))) - max_seq_items = get_option('display.max_seq_items') or len(obj) + if hasattr(obj, "dtype") and include_dtype: + attrs.append(("dtype", "'{}'".format(obj.dtype))) + if getattr(obj, "name", None) is not None: + attrs.append(("name", default_pprint(obj.name))) + elif getattr(obj, "names", None) is not None and any(obj.names): + attrs.append(("names", default_pprint(obj.names))) + max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: - attrs.append(('length', len(obj))) + attrs.append(("length", len(obj))) return attrs diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0d9b5fe4314a3..e7aa5d22995c6 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ 
-26,14 +26,13 @@ from pandas.core.generic import _shared_docs from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice -jinja2 = import_optional_dependency( - "jinja2", extra="DataFrame.style requires jinja2." -) +jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") try: import matplotlib.pyplot as plt from matplotlib import colors + has_mpl = True except ImportError: has_mpl = False @@ -108,15 +107,21 @@ class Styler: * Blank cells include ``blank`` * Data cells include ``data`` """ + loader = jinja2.PackageLoader("pandas", "io/formats/templates") - env = jinja2.Environment( - loader=loader, - trim_blocks=True, - ) + env = jinja2.Environment(loader=loader, trim_blocks=True) template = env.get_template("html.tpl") - def __init__(self, data, precision=None, table_styles=None, uuid=None, - caption=None, table_attributes=None, cell_ids=True): + def __init__( + self, + data, + precision=None, + table_styles=None, + uuid=None, + caption=None, + table_attributes=None, + cell_ids=True, + ): self.ctx = defaultdict(list) self._todo = [] @@ -135,7 +140,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self.table_styles = table_styles self.caption = caption if precision is None: - precision = get_option('display.precision') + precision = get_option("display.precision") self.precision = precision self.table_attributes = table_attributes self.hidden_index = False @@ -146,7 +151,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, def default_display_func(x): if is_float(x): - return '{:>.{precision}g}'.format(x, precision=self.precision) + return "{:>.{precision}g}".format(x, precision=self.precision) else: return x @@ -158,29 +163,59 @@ def _repr_html_(self): """ return self.render() - @Appender(_shared_docs['to_excel'] % dict( - axes='index, columns', klass='Styler', - axes_single_arg="{0 or 'index', 1 or 'columns'}", - optional_by=""" + @Appender( + _shared_docs["to_excel"] + % dict( + axes="index, columns", + klass="Styler", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + optional_by=""" by : str or list of str Name or list of names which refer to the axis items.""", - versionadded_to_excel='\n .. versionadded:: 0.20')) - def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', verbose=True, - freeze_panes=None): + versionadded_to_excel="\n .. 
versionadded:: 0.20", + ) + ) + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) + + formatter = ExcelFormatter( + self, + na_rep=na_rep, + cols=columns, + header=header, + float_format=float_format, + index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep, + ) + formatter.write( + excel_writer, + sheet_name=sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + engine=engine, + ) def _translate(self): """ @@ -227,29 +262,43 @@ def format_attr(pair): for r in range(n_clvls): # Blank for Index columns... - row_es = [{"type": "th", - "value": BLANK_VALUE, - "display_value": BLANK_VALUE, - "is_visible": not hidden_index, - "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1) + row_es = [ + { + "type": "th", + "value": BLANK_VALUE, + "display_value": BLANK_VALUE, + "is_visible": not hidden_index, + "class": " ".join([BLANK_CLASS]), + } + ] * (n_rlvls - 1) # ... except maybe the last for columns.names name = self.data.columns.names[r] - cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, - "level{lvl}".format(lvl=r)] + cs = [ + BLANK_CLASS if name is None else INDEX_NAME_CLASS, + "level{lvl}".format(lvl=r), + ] name = BLANK_VALUE if name is None else name - row_es.append({"type": "th", - "value": name, - "display_value": name, - "class": " ".join(cs), - "is_visible": not hidden_index}) + row_es.append( + { + "type": "th", + "value": name, + "display_value": name, + "class": " ".join(cs), + "is_visible": not hidden_index, + } + ) if clabels: for c, value in enumerate(clabels[r]): - cs = [COL_HEADING_CLASS, "level{lvl}".format(lvl=r), - "col{col}".format(col=c)] - cs.extend(cell_context.get( - "col_headings", {}).get(r, {}).get(c, [])) + cs = [ + COL_HEADING_CLASS, + "level{lvl}".format(lvl=r), + "col{col}".format(col=c), + ] + cs.extend( + cell_context.get("col_headings", {}).get(r, {}).get(c, []) + ) es = { "type": "th", "value": value, @@ -265,23 +314,24 @@ def format_attr(pair): row_es.append(es) head.append(row_es) - if (self.data.index.names and - com._any_not_none(*self.data.index.names) and - not hidden_index): + if ( + self.data.index.names + and com._any_not_none(*self.data.index.names) + and not hidden_index + ): index_header_row = [] for c, name in enumerate(self.data.index.names): - cs = [INDEX_NAME_CLASS, - "level{lvl}".format(lvl=c)] - name = '' if name is None else name - index_header_row.append({"type": "th", "value": name, - "class": " ".join(cs)}) + cs = [INDEX_NAME_CLASS, "level{lvl}".format(lvl=c)] + name = "" if name is None else name + index_header_row.append( + {"type": "th", "value": name, "class": " ".join(cs)} + ) index_header_row.extend( - [{"type": "th", - "value": BLANK_VALUE, - "class": " ".join([BLANK_CLASS]) - }] * (len(clabels[0]) - len(hidden_columns))) + [{"type": "th", "value": BLANK_VALUE, "class": " ".join([BLANK_CLASS])}] + * (len(clabels[0]) - len(hidden_columns)) + ) 
head.append(index_header_row) @@ -289,16 +339,18 @@ def format_attr(pair): for r, idx in enumerate(self.data.index): row_es = [] for c, value in enumerate(rlabels[r]): - rid = [ROW_HEADING_CLASS, "level{lvl}".format(lvl=c), - "row{row}".format(row=r)] + rid = [ + ROW_HEADING_CLASS, + "level{lvl}".format(lvl=c), + "row{row}".format(row=r), + ] es = { "type": "th", - "is_visible": (_is_visible(r, c, idx_lengths) and - not hidden_index), + "is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index), "value": value, "display_value": value, "id": "_".join(rid[1:]), - "class": " ".join(rid) + "class": " ".join(rid), } rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: @@ -308,19 +360,19 @@ def format_attr(pair): row_es.append(es) for c, col in enumerate(self.data.columns): - cs = [DATA_CLASS, "row{row}".format(row=r), - "col{col}".format(col=c)] + cs = [DATA_CLASS, "row{row}".format(row=r), "col{col}".format(col=c)] cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] - row_dict = {"type": "td", - "value": value, - "class": " ".join(cs), - "display_value": formatter(value), - "is_visible": (c not in hidden_columns)} + row_dict = { + "type": "td", + "value": value, + "class": " ".join(cs), + "display_value": formatter(value), + "is_visible": (c not in hidden_columns), + } # only add an id if the cell has a style - if (self.cell_ids or - not(len(ctx[r, c]) == 1 and ctx[r, c][0] == '')): + if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): row_dict["id"] = "_".join(cs[1:]) row_es.append(row_dict) props = [] @@ -329,25 +381,34 @@ def format_attr(pair): if x.count(":"): props.append(x.split(":")) else: - props.append(['', '']) - cellstyle.append({'props': props, - 'selector': "row{row}_col{col}" - .format(row=r, col=c)}) + props.append(["", ""]) + cellstyle.append( + { + "props": props, + "selector": "row{row}_col{col}".format(row=r, col=c), + } + ) body.append(row_es) table_attr = self.table_attributes use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: - table_attr = table_attr or '' + table_attr = table_attr or "" if 'class="' in table_attr: - table_attr = table_attr.replace('class="', - 'class="tex2jax_ignore ') + table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ') else: table_attr += ' class="tex2jax_ignore"' - return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, - precision=precision, table_styles=table_styles, - caption=caption, table_attributes=table_attr) + return dict( + head=head, + cellstyle=cellstyle, + body=body, + uuid=uuid, + precision=precision, + table_styles=table_styles, + caption=caption, + table_attributes=table_attr, + ) def format(self, formatter, subset=None): """ @@ -460,9 +521,8 @@ def render(self, **kwargs): # filter out empty styles, every cell will have a class # but the list of props may just be [['', '']]. 
# so we have the neested anys below - trimmed = [x for x in d['cellstyle'] - if any(any(y) for y in x['props'])] - d['cellstyle'] = trimmed + trimmed = [x for x in d["cellstyle"] if any(any(y) for y in x["props"])] + d["cellstyle"] = trimmed d.update(kwargs) return self.template.render(**d) @@ -485,9 +545,13 @@ def _update_ctx(self, attrs): self.ctx[(i, j)].append(pair) def _copy(self, deepcopy=False): - styler = Styler(self.data, precision=self.precision, - caption=self.caption, uuid=self.uuid, - table_styles=self.table_styles) + styler = Styler( + self.data, + precision=self.precision, + caption=self.caption, + uuid=self.uuid, + table_styles=self.table_styles, + ) if deepcopy: styler.ctx = copy.deepcopy(self.ctx) styler._todo = copy.deepcopy(self._todo) @@ -532,30 +596,34 @@ def _apply(self, func, axis=0, subset=None, **kwargs): subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: - result = data.apply(func, axis=axis, - result_type='expand', **kwargs) + result = data.apply(func, axis=axis, result_type="expand", **kwargs) result.columns = data.columns else: result = func(data, **kwargs) if not isinstance(result, pd.DataFrame): raise TypeError( "Function {func!r} must return a DataFrame when " - "passed to `Styler.apply` with axis=None" - .format(func=func)) - if not (result.index.equals(data.index) and - result.columns.equals(data.columns)): - msg = ('Result of {func!r} must have identical index and ' - 'columns as the input'.format(func=func)) + "passed to `Styler.apply` with axis=None".format(func=func) + ) + if not ( + result.index.equals(data.index) and result.columns.equals(data.columns) + ): + msg = ( + "Result of {func!r} must have identical index and " + "columns as the input".format(func=func) + ) raise ValueError(msg) result_shape = result.shape expected_shape = self.data.loc[subset].shape if result_shape != expected_shape: - msg = ("Function {func!r} returned the wrong shape.\n" - "Result has shape: {res}\n" - "Expected shape: {expect}".format(func=func, - res=result.shape, - expect=expected_shape)) + msg = ( + "Function {func!r} returned the wrong shape.\n" + "Result has shape: {res}\n" + "Expected shape: {expect}".format( + func=func, res=result.shape, expect=expected_shape + ) + ) raise ValueError(msg) self._update_ctx(result) return self @@ -605,8 +673,9 @@ def apply(self, func, axis=0, subset=None, **kwargs): >>> df = pd.DataFrame(np.random.randn(5, 2)) >>> df.style.apply(highlight_max) """ - self._todo.append((lambda instance: getattr(instance, '_apply'), - (func, axis, subset), kwargs)) + self._todo.append( + (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs) + ) return self def _applymap(self, func, subset=None, **kwargs): @@ -641,8 +710,9 @@ def applymap(self, func, subset=None, **kwargs): -------- Styler.where """ - self._todo.append((lambda instance: getattr(instance, '_applymap'), - (func, subset), kwargs)) + self._todo.append( + (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) + ) return self def where(self, cond, value, other=None, subset=None, **kwargs): @@ -677,10 +747,11 @@ def where(self, cond, value, other=None, subset=None, **kwargs): """ if other is None: - other = '' + other = "" - return self.applymap(lambda val: value if cond(val) else other, - subset=subset, **kwargs) + return self.applymap( + lambda val: value if cond(val) else other, subset=subset, **kwargs + ) def set_precision(self, precision): """ @@ -858,10 +929,11 @@ def hide_columns(self, subset): @staticmethod 
def _highlight_null(v, null_color): - return ('background-color: {color}'.format(color=null_color) - if pd.isna(v) else '') + return ( + "background-color: {color}".format(color=null_color) if pd.isna(v) else "" + ) - def highlight_null(self, null_color='red'): + def highlight_null(self, null_color="red"): """ Shade the background ``null_color`` for missing values. @@ -876,8 +948,15 @@ def highlight_null(self, null_color='red'): self.applymap(self._highlight_null, null_color=null_color) return self - def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, - subset=None, text_color_threshold=0.408): + def background_gradient( + self, + cmap="PuBu", + low=0, + high=0, + axis=0, + subset=None, + text_color_threshold=0.408, + ): """ Color the background in a gradient according to the data in each column (optionally row). @@ -921,19 +1000,26 @@ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0, """ subset = _maybe_numeric_slice(self.data, subset) subset = _non_reducing_slice(subset) - self.apply(self._background_gradient, cmap=cmap, subset=subset, - axis=axis, low=low, high=high, - text_color_threshold=text_color_threshold) + self.apply( + self._background_gradient, + cmap=cmap, + subset=subset, + axis=axis, + low=low, + high=high, + text_color_threshold=text_color_threshold, + ) return self @staticmethod - def _background_gradient(s, cmap='PuBu', low=0, high=0, - text_color_threshold=0.408): + def _background_gradient(s, cmap="PuBu", low=0, high=0, text_color_threshold=0.408): """ Color background in a range according to the data. """ - if (not isinstance(text_color_threshold, (float, int)) or - not 0 <= text_color_threshold <= 1): + if ( + not isinstance(text_color_threshold, (float, int)) + or not 0 <= text_color_threshold <= 1 + ): msg = "`text_color_threshold` must be a value from 0 to 1." raise ValueError(msg) @@ -971,8 +1057,8 @@ def relative_luminance(rgba): def css(rgba): dark = relative_luminance(rgba) < text_color_threshold - text_color = '#f1f1f1' if dark else '#000000' - return 'background-color: {b};color: {c};'.format( + text_color = "#f1f1f1" if dark else "#000000" + return "background-color: {b};color: {c};".format( b=colors.rgb2hex(rgba), c=text_color ) @@ -981,7 +1067,8 @@ def css(rgba): else: return pd.DataFrame( [[css(rgba) for rgba in row] for row in rgbas], - index=s.index, columns=s.columns + index=s.index, + columns=s.columns, ) def set_properties(self, subset=None, **kwargs): @@ -1006,8 +1093,7 @@ def set_properties(self, subset=None, **kwargs): >>> df.style.set_properties(color="white", align="right") >>> df.style.set_properties(**{'background-color': 'yellow'}) """ - values = ';'.join('{p}: {v}'.format(p=p, v=v) - for p, v in kwargs.items()) + values = ";".join("{p}: {v}".format(p=p, v=v) for p, v in kwargs.items()) f = lambda x: values return self.applymap(f, subset=subset) @@ -1023,10 +1109,10 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): smax = s.max() if vmax is None else vmax if isinstance(smax, ABCSeries): smax = smax.max() - if align == 'mid': + if align == "mid": smin = min(0, smin) smax = max(0, smax) - elif align == 'zero': + elif align == "zero": # For "zero" mode, we want the range to be symmetrical around zero. smax = max(abs(smin), abs(smax)) smin = -smax @@ -1038,26 +1124,26 @@ def css_bar(start, end, color): """ Generate CSS code to draw a bar from start to end. 
""" - css = 'width: 10em; height: 80%;' + css = "width: 10em; height: 80%;" if end > start: - css += 'background: linear-gradient(90deg,' + css += "background: linear-gradient(90deg," if start > 0: - css += ' transparent {s:.1f}%, {c} {s:.1f}%, '.format( + css += " transparent {s:.1f}%, {c} {s:.1f}%, ".format( s=start, c=color ) - css += '{c} {e:.1f}%, transparent {e:.1f}%)'.format( - e=min(end, width), c=color, + css += "{c} {e:.1f}%, transparent {e:.1f}%)".format( + e=min(end, width), c=color ) return css def css(x): if pd.isna(x): - return '' + return "" # avoid deprecated indexing `colors[x > zero]` color = colors[1] if x > zero else colors[0] - if align == 'left': + if align == "left": return css_bar(0, x, color) else: return css_bar(min(x, zero), max(x, zero), color) @@ -1067,11 +1153,20 @@ def css(x): else: return pd.DataFrame( [[css(x) for x in row] for row in normed], - index=s.index, columns=s.columns + index=s.index, + columns=s.columns, ) - def bar(self, subset=None, axis=0, color='#d65f5f', width=100, - align='left', vmin=None, vmax=None): + def bar( + self, + subset=None, + axis=0, + color="#d65f5f", + width=100, + align="left", + vmin=None, + vmax=None, + ): """ Draw bar chart in the cell backgrounds. @@ -1120,7 +1215,7 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100, ------- self : Styler """ - if align not in ('left', 'zero', 'mid'): + if align not in ("left", "zero", "mid"): raise ValueError("`align` must be one of {'left', 'zero',' mid'}") if not (is_list_like(color)): @@ -1128,19 +1223,28 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100, elif len(color) == 1: color = [color[0], color[0]] elif len(color) > 2: - raise ValueError("`color` must be string or a list-like" - " of length 2: [`color_neg`, `color_pos`]" - " (eg: color=['#d65f5f', '#5fba7d'])") + raise ValueError( + "`color` must be string or a list-like" + " of length 2: [`color_neg`, `color_pos`]" + " (eg: color=['#d65f5f', '#5fba7d'])" + ) subset = _maybe_numeric_slice(self.data, subset) subset = _non_reducing_slice(subset) - self.apply(self._bar, subset=subset, axis=axis, - align=align, colors=color, width=width, - vmin=vmin, vmax=vmax) + self.apply( + self._bar, + subset=subset, + axis=axis, + align=align, + colors=color, + width=width, + vmin=vmin, + vmax=vmax, + ) return self - def highlight_max(self, subset=None, color='yellow', axis=0): + def highlight_max(self, subset=None, color="yellow", axis=0): """ Highlight the maximum by shading the background. @@ -1158,10 +1262,9 @@ def highlight_max(self, subset=None, color='yellow', axis=0): ------- self : Styler """ - return self._highlight_handler(subset=subset, color=color, axis=axis, - max_=True) + return self._highlight_handler(subset=subset, color=color, axis=axis, max_=True) - def highlight_min(self, subset=None, color='yellow', axis=0): + def highlight_min(self, subset=None, color="yellow", axis=0): """ Highlight the minimum by shading the background. 
@@ -1179,35 +1282,37 @@ def highlight_min(self, subset=None, color='yellow', axis=0): ------- self : Styler """ - return self._highlight_handler(subset=subset, color=color, axis=axis, - max_=False) + return self._highlight_handler( + subset=subset, color=color, axis=axis, max_=False + ) - def _highlight_handler(self, subset=None, color='yellow', axis=None, - max_=True): + def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) - self.apply(self._highlight_extrema, color=color, axis=axis, - subset=subset, max_=max_) + self.apply( + self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ + ) return self @staticmethod - def _highlight_extrema(data, color='yellow', max_=True): + def _highlight_extrema(data, color="yellow", max_=True): """ Highlight the min or max in a Series or DataFrame. """ - attr = 'background-color: {0}'.format(color) + attr = "background-color: {0}".format(color) if data.ndim == 1: # Series from .apply if max_: extrema = data == data.max() else: extrema = data == data.min() - return [attr if v else '' for v in extrema] + return [attr if v else "" for v in extrema] else: # DataFrame from .tee if max_: extrema = data == data.max().max() else: extrema = data == data.min().min() - return pd.DataFrame(np.where(extrema, attr, ''), - index=data.index, columns=data.columns) + return pd.DataFrame( + np.where(extrema, attr, ""), index=data.index, columns=data.columns + ) @classmethod def from_custom_template(cls, searchpath, name): @@ -1227,10 +1332,7 @@ def from_custom_template(cls, searchpath, name): MyStyler : subclass of Styler Has the correct ``env`` and ``template`` class attributes set. """ - loader = jinja2.ChoiceLoader([ - jinja2.FileSystemLoader(searchpath), - cls.loader, - ]) + loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) class MyStyler(cls): env = jinja2.Environment(loader=loader) @@ -1333,27 +1435,28 @@ def _get_level_lengths(index, hidden_elements=None): lengths = {} if index.nlevels == 1: for i, value in enumerate(levels): - if(i not in hidden_elements): + if i not in hidden_elements: lengths[(0, i)] = 1 return lengths for i, lvl in enumerate(levels): for j, row in enumerate(lvl): - if not get_option('display.multi_sparse'): + if not get_option("display.multi_sparse"): lengths[(i, j)] = 1 elif (row != sentinel) and (j not in hidden_elements): last_label = j lengths[(i, last_label)] = 1 - elif (row != sentinel): + elif row != sentinel: # even if its hidden, keep track of it in case # length >1 and later elements are visible last_label = j lengths[(i, last_label)] = 0 - elif(j not in hidden_elements): + elif j not in hidden_elements: lengths[(i, last_label)] += 1 non_zero_lengths = { - element: length for element, length in lengths.items() if length >= 1} + element: length for element, length in lengths.items() if length >= 1 + } return non_zero_lengths @@ -1364,6 +1467,8 @@ def _maybe_wrap_formatter(formatter): elif callable(formatter): return formatter else: - msg = ("Expected a template string or callable, got {formatter} " - "instead".format(formatter=formatter)) + msg = ( + "Expected a template string or callable, got {formatter} " + "instead".format(formatter=formatter) + ) raise TypeError(msg) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index a9eff003f2249..d29078cad9318 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -9,17 +9,25 @@ def _try_import(): "pandas-gbq is required to load data from Google 
BigQuery. " "See the docs: https://pandas-gbq.readthedocs.io." ) - pandas_gbq = import_optional_dependency( - "pandas_gbq", - extra=msg, - ) + pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg) return pandas_gbq -def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, auth_local_webserver=False, dialect=None, - location=None, configuration=None, credentials=None, - use_bqstorage_api=None, private_key=None, verbose=None): +def read_gbq( + query, + project_id=None, + index_col=None, + col_order=None, + reauth=False, + auth_local_webserver=False, + dialect=None, + location=None, + configuration=None, + credentials=None, + use_bqstorage_api=None, + private_key=None, + verbose=None, +): """ Load data from Google BigQuery. @@ -155,21 +163,48 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, # END: deprecated kwargs return pandas_gbq.read_gbq( - query, project_id=project_id, index_col=index_col, - col_order=col_order, reauth=reauth, - auth_local_webserver=auth_local_webserver, dialect=dialect, - location=location, configuration=configuration, - credentials=credentials, **kwargs) + query, + project_id=project_id, + index_col=index_col, + col_order=col_order, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + dialect=dialect, + location=location, + configuration=configuration, + credentials=credentials, + **kwargs + ) -def to_gbq(dataframe, destination_table, project_id=None, chunksize=None, - reauth=False, if_exists='fail', auth_local_webserver=False, - table_schema=None, location=None, progress_bar=True, - credentials=None, verbose=None, private_key=None): +def to_gbq( + dataframe, + destination_table, + project_id=None, + chunksize=None, + reauth=False, + if_exists="fail", + auth_local_webserver=False, + table_schema=None, + location=None, + progress_bar=True, + credentials=None, + verbose=None, + private_key=None, +): pandas_gbq = _try_import() - pandas_gbq.to_gbq(dataframe, destination_table, project_id=project_id, - chunksize=chunksize, reauth=reauth, if_exists=if_exists, - auth_local_webserver=auth_local_webserver, - table_schema=table_schema, location=location, - progress_bar=progress_bar, credentials=credentials, - verbose=verbose, private_key=private_key) + pandas_gbq.to_gbq( + dataframe, + destination_table, + project_id=project_id, + chunksize=chunksize, + reauth=reauth, + if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, + private_key=private_key, + ) diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py index 862ccbb291c01..1f5e0faedc6d2 100644 --- a/pandas/io/gcs.py +++ b/pandas/io/gcs.py @@ -2,18 +2,17 @@ from pandas.compat._optional import import_optional_dependency gcsfs = import_optional_dependency( - "gcsfs", - extra="The gcsfs library is required to handle GCS files" + "gcsfs", extra="The gcsfs library is required to handle GCS files" ) -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): if mode is None: - mode = 'rb' + mode = "rb" fs = gcsfs.GCSFileSystem() - filepath_or_buffer = fs.open( - filepath_or_buffer, mode) + filepath_or_buffer = fs.open(filepath_or_buffer, mode) return filepath_or_buffer, None, compression, True diff --git a/pandas/io/html.py b/pandas/io/html.py index f080e1d1fc188..91f5e5a949ac3 100644 --- 
a/pandas/io/html.py +++ b/pandas/io/html.py @@ -35,16 +35,17 @@ def _importers(): return global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB - bs4 = import_optional_dependency("bs4", raise_on_missing=False, - on_version="ignore") + bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore") _HAS_BS4 = bs4 is not None - lxml = import_optional_dependency("lxml.etree", raise_on_missing=False, - on_version="ignore") + lxml = import_optional_dependency( + "lxml.etree", raise_on_missing=False, on_version="ignore" + ) _HAS_LXML = lxml is not None - html5lib = import_optional_dependency("html5lib", raise_on_missing=False, - on_version="ignore") + html5lib = import_optional_dependency( + "html5lib", raise_on_missing=False, on_version="ignore" + ) _HAS_HTML5LIB = html5lib is not None _IMPORTS = True @@ -53,7 +54,7 @@ def _importers(): ############# # READ HTML # ############# -_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}') +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") def _remove_whitespace(s, regex=_RE_WHITESPACE): @@ -72,7 +73,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): subd : str or unicode `s` with all extra whitespace replaced with a single space. """ - return regex.sub(' ', s.strip()) + return regex.sub(" ", s.strip()) def _get_skiprows(skiprows): @@ -100,8 +101,9 @@ def _get_skiprows(skiprows): return skiprows elif skiprows is None: return 0 - raise TypeError('%r is not a valid type for skipping rows' % - type(skiprows).__name__) + raise TypeError( + "%r is not a valid type for skipping rows" % type(skiprows).__name__ + ) def _read(obj): @@ -118,13 +120,13 @@ def _read(obj): if _is_url(obj): with urlopen(obj) as url: text = url.read() - elif hasattr(obj, 'read'): + elif hasattr(obj, "read"): text = obj.read() elif isinstance(obj, (str, bytes)): text = obj try: if os.path.isfile(text): - with open(text, 'rb') as f: + with open(text, "rb") as f: return f.read() except (TypeError, ValueError): pass @@ -397,8 +399,7 @@ def _parse_thead_tbody_tfoot(self, table_html): footer_rows = self._parse_tfoot_tr(table_html) def row_is_all_th(row): - return all(self._equals_tag(t, 'th') for t in - self._parse_td(row)) + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) if not header_rows: # The table has no . 
Move the top all- rows from @@ -449,14 +450,13 @@ def _expand_colspan_rowspan(self, rows): prev_i, prev_text, prev_rowspan = remainder.pop(0) texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) index += 1 # Append the text from this , colspan times text = _remove_whitespace(self._text_getter(td)) - rowspan = int(self._attr_getter(td, 'rowspan') or 1) - colspan = int(self._attr_getter(td, 'colspan') or 1) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) for _ in range(colspan): texts.append(text) @@ -468,8 +468,7 @@ def _expand_colspan_rowspan(self, rows): for prev_i, prev_text, prev_rowspan in remainder: texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) all_texts.append(texts) remainder = next_remainder @@ -482,8 +481,7 @@ def _expand_colspan_rowspan(self, rows): for prev_i, prev_text, prev_rowspan in remainder: texts.append(prev_text) if prev_rowspan > 1: - next_remainder.append((prev_i, prev_text, - prev_rowspan - 1)) + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) all_texts.append(texts) remainder = next_remainder @@ -508,8 +506,12 @@ def _handle_hidden_tables(self, tbl_list, attr_name): if not self.displayed_only: return tbl_list - return [x for x in tbl_list if "display:none" not in - getattr(x, attr_name).get('style', '').replace(" ", "")] + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): @@ -529,14 +531,15 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) from bs4 import SoupStrainer - self._strainer = SoupStrainer('table') + + self._strainer = SoupStrainer("table") def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) if not tables: - raise ValueError('No tables found') + raise ValueError("No tables found") result = [] unique_tables = set() @@ -544,18 +547,17 @@ def _parse_tables(self, doc, match, attrs): for table in tables: if self.displayed_only: - for elem in table.find_all( - style=re.compile(r"display:\s*none")): + for elem in table.find_all(style=re.compile(r"display:\s*none")): elem.decompose() - if (table not in unique_tables and - table.find(text=match) is not None): + if table not in unique_tables and table.find(text=match) is not None: result.append(table) unique_tables.add(table) if not result: - raise ValueError("No tables found matching pattern {patt!r}" - .format(patt=match.pattern)) + raise ValueError( + "No tables found matching pattern {patt!r}".format(patt=match.pattern) + ) return result def _text_getter(self, obj): @@ -565,31 +567,32 @@ def _equals_tag(self, obj, tag): return obj.name == tag def _parse_td(self, row): - return row.find_all(('td', 'th'), recursive=False) + return row.find_all(("td", "th"), recursive=False) def _parse_thead_tr(self, table): - return table.select('thead tr') + return table.select("thead tr") def _parse_tbody_tr(self, table): - from_tbody = table.select('tbody tr') - from_root = table.find_all('tr', recursive=False) + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) # HTML spec: at most one of 
these lists has content return from_tbody + from_root def _parse_tfoot_tr(self, table): - return table.select('tfoot tr') + return table.select("tfoot tr") def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise ValueError('No text parsed from document: {doc}' - .format(doc=self.io)) + raise ValueError("No text parsed from document: {doc}".format(doc=self.io)) return raw_text def _build_doc(self): from bs4 import BeautifulSoup - return BeautifulSoup(self._setup_build_doc(), features='html5lib', - from_encoding=self.encoding) + + return BeautifulSoup( + self._setup_build_doc(), features="html5lib", from_encoding=self.encoding + ) def _build_xpath_expr(attrs): @@ -607,15 +610,15 @@ def _build_xpath_expr(attrs): An XPath expression that checks for the given HTML attributes. """ # give class attribute as class_ because class is a python keyword - if 'class_' in attrs: - attrs['class'] = attrs.pop('class_') + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") s = ["@{key}={val!r}".format(key=k, val=v) for k, v in attrs.items()] - return '[{expr}]'.format(expr=' and '.join(s)) + return "[{expr}]".format(expr=" and ".join(s)) -_re_namespace = {'re': 'http://exslt.org/regular-expressions'} -_valid_schemes = 'http', 'file', 'ftp' +_re_namespace = {"re": "http://exslt.org/regular-expressions"} +_valid_schemes = "http", "file", "ftp" class _LxmlFrameParser(_HtmlFrameParser): @@ -645,14 +648,14 @@ def _text_getter(self, obj): def _parse_td(self, row): # Look for direct children only: the "row" element here may be a # or (see _parse_thead_tr). - return row.xpath('./td|./th') + return row.xpath("./td|./th") def _parse_tables(self, doc, match, kwargs): pattern = match.pattern # 1. check all descendants for the given pattern and only search tables # 2. go up the tree until we find a table - query = '//table//*[re:test(text(), {patt!r})]/ancestor::table' + query = "//table//*[re:test(text(), {patt!r})]/ancestor::table" xpath_expr = query.format(patt=pattern) # if any table attributes were given build an xpath expression to @@ -668,14 +671,14 @@ def _parse_tables(self, doc, match, kwargs): # lxml utilizes XPATH 1.0 which does not have regex # support. As a result, we find all elements with a style # attribute and iterate them to check for display:none - for elem in table.xpath('.//*[@style]'): - if "display:none" in elem.attrib.get( - "style", "").replace(" ", ""): + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): elem.getparent().remove(elem) if not tables: - raise ValueError("No tables found matching regex {patt!r}" - .format(patt=pattern)) + raise ValueError( + "No tables found matching regex {patt!r}".format(patt=pattern) + ) return tables def _equals_tag(self, obj, tag): @@ -699,6 +702,7 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + parser = HTMLParser(recover=True, encoding=self.encoding) try: @@ -724,15 +728,15 @@ def _build_doc(self): else: raise e else: - if not hasattr(r, 'text_content'): + if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r def _parse_thead_tr(self, table): rows = [] - for thead in table.xpath('.//thead'): - rows.extend(thead.xpath('./tr')) + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) # HACK: lxml does not clean up the clearly-erroneous # foobar. (Missing ). 
Add @@ -740,20 +744,20 @@ def _parse_thead_tr(self, table): # children as though it's a . # # Better solution would be to use html5lib. - elements_at_root = thead.xpath('./td|./th') + elements_at_root = thead.xpath("./td|./th") if elements_at_root: rows.append(thead) return rows def _parse_tbody_tr(self, table): - from_tbody = table.xpath('.//tbody//tr') - from_root = table.xpath('./tr') + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") # HTML spec: at most one of these lists has content return from_tbody + from_root def _parse_tfoot_tr(self, table): - return table.xpath('.//tfoot//tr') + return table.xpath(".//tfoot//tr") def _expand_elements(body): @@ -761,15 +765,15 @@ def _expand_elements(body): lens_max = lens.max() not_max = lens[lens != lens_max] - empty = [''] + empty = [""] for ind, length in not_max.items(): body[ind] += empty * (lens_max - length) def _data_to_frame(**kwargs): - head, body, foot = kwargs.pop('data') - header = kwargs.pop('header') - kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) if head: body = head + body @@ -779,8 +783,7 @@ def _data_to_frame(**kwargs): header = 0 else: # ignore all-empty-text rows - header = [i for i, row in enumerate(head) - if any(text for text in row)] + header = [i for i, row in enumerate(head) if any(text for text in row)] if foot: body += foot @@ -792,9 +795,12 @@ def _data_to_frame(**kwargs): return df -_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser, - 'html5lib': _BeautifulSoupHtml5LibFrameParser, - 'bs4': _BeautifulSoupHtml5LibFrameParser} +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} def _parser_dispatch(flavor): @@ -819,18 +825,18 @@ def _parser_dispatch(flavor): """ valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: - raise ValueError('{invalid!r} is not a valid flavor, valid flavors ' - 'are {valid}' - .format(invalid=flavor, valid=valid_parsers)) + raise ValueError( + "{invalid!r} is not a valid flavor, valid flavors " + "are {valid}".format(invalid=flavor, valid=valid_parsers) + ) - if flavor in ('bs4', 'html5lib'): + if flavor in ("bs4", "html5lib"): if not _HAS_HTML5LIB: raise ImportError("html5lib not found, please install it") if not _HAS_BS4: - raise ImportError( - "BeautifulSoup4 (bs4) not found, please install it") + raise ImportError("BeautifulSoup4 (bs4) not found, please install it") # Although we call this above, we want to raise here right before use. 
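# Illustrative sketch (not part of this formatting patch): how the flavor
# dispatch reformatted above surfaces through pandas.read_html. The HTML
# literal below is made up; "lxml", "bs4" and "html5lib" are the only valid
# flavors, and a missing backend raises the ImportError built just above.
import pandas as pd

html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
tables = pd.read_html(html, flavor="lxml")   # or flavor="bs4" / "html5lib"
# pd.read_html(html, flavor="pyquery")       # -> ValueError: not a valid flavor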
- bs4 = import_optional_dependency('bs4') # noqa:F841 + bs4 = import_optional_dependency("bs4") # noqa:F841 else: if not _HAS_LXML: @@ -839,23 +845,23 @@ def _parser_dispatch(flavor): def _print_as_set(s): - return ('{' + '{arg}'.format(arg=', '.join( - pprint_thing(el) for el in s)) + '}') + return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}" def _validate_flavor(flavor): if flavor is None: - flavor = 'lxml', 'bs4' + flavor = "lxml", "bs4" elif isinstance(flavor, str): - flavor = flavor, + flavor = (flavor,) elif isinstance(flavor, abc.Iterable): if not all(isinstance(flav, str) for flav in flavor): - raise TypeError('Object of type {typ!r} is not an iterable of ' - 'strings' - .format(typ=type(flavor).__name__)) + raise TypeError( + "Object of type {typ!r} is not an iterable of " + "strings".format(typ=type(flavor).__name__) + ) else: - fmt = '{flavor!r}' if isinstance(flavor, str) else '{flavor}' - fmt += ' is not a valid flavor' + fmt = "{flavor!r}" if isinstance(flavor, str) else "{flavor}" + fmt += " is not a valid flavor" raise ValueError(fmt.format(flavor=flavor)) flavor = tuple(flavor) @@ -863,10 +869,12 @@ def _validate_flavor(flavor): flavor_set = set(flavor) if not flavor_set & valid_flavors: - raise ValueError('{invalid} is not a valid set of flavors, valid ' - 'flavors are {valid}' - .format(invalid=_print_as_set(flavor_set), - valid=_print_as_set(valid_flavors))) + raise ValueError( + "{invalid} is not a valid set of flavors, valid " + "flavors are {valid}".format( + invalid=_print_as_set(flavor_set), valid=_print_as_set(valid_flavors) + ) + ) return flavor @@ -885,15 +893,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): except Exception as caught: # if `io` is an io-like object, check if it's seekable # and try to rewind it before trying the next parser - if hasattr(io, 'seekable') and io.seekable(): + if hasattr(io, "seekable") and io.seekable(): io.seek(0) - elif hasattr(io, 'seekable') and not io.seekable(): + elif hasattr(io, "seekable") and not io.seekable(): # if we couldn't rewind it, let the user know - raise ValueError('The flavor {} failed to parse your input. ' - 'Since you passed a non-rewindable file ' - 'object, we can\'t rewind it to try ' - 'another parser. Try read_html() with a ' - 'different flavor.'.format(flav)) + raise ValueError( + "The flavor {} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a " + "different flavor.".format(flav) + ) retained = caught else: @@ -910,11 +920,23 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): return ret -def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, attrs=None, parse_dates=False, - thousands=',', encoding=None, - decimal='.', converters=None, na_values=None, - keep_default_na=True, displayed_only=True): +def read_html( + io, + match=".+", + flavor=None, + header=None, + index_col=None, + skiprows=None, + attrs=None, + parse_dates=False, + thousands=",", + encoding=None, + decimal=".", + converters=None, + na_values=None, + keep_default_na=True, + displayed_only=True, +): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -1060,13 +1082,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, # Type check here. We don't want to parse only to fail because of an # invalid value of an integer skiprows. 
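# Illustrative sketch (not part of this formatting patch): the negative
# skiprows guard checked immediately below, using a made-up table. A
# non-negative integer (or a sequence/slice) is accepted; a negative integer
# triggers the ValueError reformatted below.
import pandas as pd

html = "<table><tr><th>a</th></tr><tr><td>junk</td></tr><tr><td>1</td></tr></table>"
df = pd.read_html(html, skiprows=1)[0]   # skip one parsed row
# pd.read_html(html, skiprows=-1)        # -> ValueError (negative skiprows)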
if isinstance(skiprows, numbers.Integral) and skiprows < 0: - raise ValueError('cannot skip rows starting from the end of the ' - 'data (you passed a negative value)') + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) _validate_header_arg(header) - return _parse(flavor=flavor, io=io, match=match, header=header, - index_col=index_col, skiprows=skiprows, - parse_dates=parse_dates, - thousands=thousands, attrs=attrs, encoding=encoding, - decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na, - displayed_only=displayed_only) + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + ) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index f14b615471ccc..f3f0f417acaab 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -14,8 +14,12 @@ from pandas.core.reshape.concat import concat from pandas.io.common import ( - BaseIterator, _get_handle, _infer_compression, _stringify_path, - get_filepath_or_buffer) + BaseIterator, + _get_handle, + _infer_compression, + _stringify_path, + get_filepath_or_buffer, +) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import _validate_integer @@ -25,27 +29,36 @@ loads = json.loads dumps = json.dumps -TABLE_SCHEMA_VERSION = '0.20.0' +TABLE_SCHEMA_VERSION = "0.20.0" # interface to/from -def to_json(path_or_buf, obj, orient=None, date_format='epoch', - double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None, lines=False, compression='infer', - index=True): - - if not index and orient not in ['split', 'table']: - raise ValueError("'index=False' is only valid when 'orient' is " - "'split' or 'table'") +def to_json( + path_or_buf, + obj, + orient=None, + date_format="epoch", + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression="infer", + index=True, +): + + if not index and orient not in ["split", "table"]: + raise ValueError( + "'index=False' is only valid when 'orient' is " "'split' or 'table'" + ) path_or_buf = _stringify_path(path_or_buf) - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + if lines and orient != "records": + raise ValueError("'lines' keyword only valid when 'orient' is records") - if orient == 'table' and isinstance(obj, Series): - obj = obj.to_frame(name=obj.name or 'values') - if orient == 'table' and isinstance(obj, DataFrame): + if orient == "table" and isinstance(obj, Series): + obj = obj.to_frame(name=obj.name or "values") + if orient == "table" and isinstance(obj, DataFrame): writer = JSONTableWriter elif isinstance(obj, Series): writer = SeriesWriter @@ -55,16 +68,21 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', raise NotImplementedError("'obj' should be a Series or a DataFrame") s = writer( - obj, orient=orient, date_format=date_format, - double_precision=double_precision, ensure_ascii=force_ascii, - date_unit=date_unit, default_handler=default_handler, - index=index).write() + obj, + orient=orient, + date_format=date_format, + double_precision=double_precision, + ensure_ascii=force_ascii, + date_unit=date_unit, + 
default_handler=default_handler, + index=index, + ).write() if lines: s = _convert_to_line_delimits(s) if isinstance(path_or_buf, str): - fh, handles = _get_handle(path_or_buf, 'w', compression=compression) + fh, handles = _get_handle(path_or_buf, "w", compression=compression) try: fh.write(s) finally: @@ -76,8 +94,17 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', class Writer: - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, default_handler=None): + def __init__( + self, + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=None, + ): self.obj = obj if orient is None: @@ -98,12 +125,26 @@ def _format_axes(self): raise AbstractMethodError(self) def write(self): - return self._write(self.obj, self.orient, self.double_precision, - self.ensure_ascii, self.date_unit, - self.date_format == 'iso', self.default_handler) + return self._write( + self.obj, + self.orient, + self.double_precision, + self.ensure_ascii, + self.date_unit, + self.date_format == "iso", + self.default_handler, + ) - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): return dumps( obj, orient=orient, @@ -111,91 +152,147 @@ def _write(self, obj, orient, double_precision, ensure_ascii, ensure_ascii=ensure_ascii, date_unit=date_unit, iso_dates=iso_dates, - default_handler=default_handler + default_handler=default_handler, ) class SeriesWriter(Writer): - _default_orient = 'index' + _default_orient = "index" def _format_axes(self): - if not self.obj.index.is_unique and self.orient == 'index': - raise ValueError("Series index must be unique for orient=" - "'{orient}'".format(orient=self.orient)) + if not self.obj.index.is_unique and self.orient == "index": + raise ValueError( + "Series index must be unique for orient=" + "'{orient}'".format(orient=self.orient) + ) - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - if not self.index and orient == 'split': + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + if not self.index and orient == "split": obj = {"name": obj.name, "data": obj.values} - return super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) class FrameWriter(Writer): - _default_orient = 'columns' + _default_orient = "columns" def _format_axes(self): """ Try to format axes if they are datelike. 
""" - if not self.obj.index.is_unique and self.orient in ( - 'index', 'columns'): - raise ValueError("DataFrame index must be unique for orient=" - "'{orient}'.".format(orient=self.orient)) + if not self.obj.index.is_unique and self.orient in ("index", "columns"): + raise ValueError( + "DataFrame index must be unique for orient=" + "'{orient}'.".format(orient=self.orient) + ) if not self.obj.columns.is_unique and self.orient in ( - 'index', 'columns', 'records'): - raise ValueError("DataFrame columns must be unique for orient=" - "'{orient}'.".format(orient=self.orient)) - - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - if not self.index and orient == 'split': - obj = obj.to_dict(orient='split') + "index", + "columns", + "records", + ): + raise ValueError( + "DataFrame columns must be unique for orient=" + "'{orient}'.".format(orient=self.orient) + ) + + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + if not self.index and orient == "split": + obj = obj.to_dict(orient="split") del obj["index"] - return super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + return super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) class JSONTableWriter(FrameWriter): - _default_orient = 'records' - - def __init__(self, obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, default_handler=None): + _default_orient = "records" + + def __init__( + self, + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=None, + ): """ Adds a `schema` attribute with the Table Schema, resets the index (can't do in caller, because the schema inference needs to know what the index is, forces orient to records, and forces date_format to 'iso'. """ - super().__init__(obj, orient, date_format, double_precision, - ensure_ascii, date_unit, index, - default_handler=default_handler) - - if date_format != 'iso': - msg = ("Trying to write with `orient='table'` and " - "`date_format='{fmt}'`. Table Schema requires dates " - "to be formatted with `date_format='iso'`" - .format(fmt=date_format)) + super().__init__( + obj, + orient, + date_format, + double_precision, + ensure_ascii, + date_unit, + index, + default_handler=default_handler, + ) + + if date_format != "iso": + msg = ( + "Trying to write with `orient='table'` and " + "`date_format='{fmt}'`. 
Table Schema requires dates " + "to be formatted with `date_format='iso'`".format(fmt=date_format) + ) raise ValueError(msg) self.schema = build_table_schema(obj, index=self.index) # NotImplemented on a column MultiIndex if obj.ndim == 2 and isinstance(obj.columns, MultiIndex): - raise NotImplementedError( - "orient='table' is not supported for MultiIndex") + raise NotImplementedError("orient='table' is not supported for MultiIndex") # TODO: Do this timedelta properly in objToJSON.c See GH #15137 - if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or - len(obj.columns & obj.index.names)): + if ( + (obj.ndim == 1) + and (obj.name in set(obj.index.names)) + or len(obj.columns & obj.index.names) + ): msg = "Overlapping names between the index and columns" raise ValueError(msg) obj = obj.copy() - timedeltas = obj.select_dtypes(include=['timedelta']).columns + timedeltas = obj.select_dtypes(include=["timedelta"]).columns if len(timedeltas): - obj[timedeltas] = obj[timedeltas].applymap( - lambda x: x.isoformat()) + obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat()) # Convert PeriodIndex to datetimes before serialzing if is_period_dtype(obj.index): obj.index = obj.index.to_timestamp() @@ -205,23 +302,51 @@ def __init__(self, obj, orient, date_format, double_precision, self.obj = obj.reset_index(drop=True) else: self.obj = obj.reset_index(drop=False) - self.date_format = 'iso' - self.orient = 'records' + self.date_format = "iso" + self.orient = "records" self.index = index - def _write(self, obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler): - data = super()._write(obj, orient, double_precision, ensure_ascii, - date_unit, iso_dates, default_handler) + def _write( + self, + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ): + data = super()._write( + obj, + orient, + double_precision, + ensure_ascii, + date_unit, + iso_dates, + default_handler, + ) serialized = '{{"schema": {schema}, "data": {data}}}'.format( - schema=dumps(self.schema), data=data) + schema=dumps(self.schema), data=data + ) return serialized -def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, - convert_axes=None, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None, encoding=None, - lines=False, chunksize=None, compression='infer'): +def read_json( + path_or_buf=None, + orient=None, + typ="frame", + dtype=None, + convert_axes=None, + convert_dates=True, + keep_default_dates=True, + numpy=False, + precise_float=False, + date_unit=None, + encoding=None, + lines=False, + chunksize=None, + compression="infer", +): """ Convert a JSON string to pandas object. 
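# Illustrative sketch (not part of this formatting patch): a round trip
# through the to_json / read_json pair whose signatures are reformatted here.
# The frame contents are made up; orient="split" serializes index, columns
# and data as separate keys.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
payload = df.to_json(orient="split")
restored = pd.read_json(payload, orient="split")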
@@ -414,27 +539,36 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None, {"index": "row 2", "col 1": "c", "col 2": "d"}]}' """ - if orient == 'table' and dtype: + if orient == "table" and dtype: raise ValueError("cannot pass both dtype and orient='table'") - if orient == 'table' and convert_axes: + if orient == "table" and convert_axes: raise ValueError("cannot pass both convert_axes and orient='table'") - if dtype is None and orient != 'table': + if dtype is None and orient != "table": dtype = True - if convert_axes is None and orient != 'table': + if convert_axes is None and orient != "table": convert_axes = True compression = _infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, + path_or_buf, encoding=encoding, compression=compression ) json_reader = JsonReader( - filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, - convert_axes=convert_axes, convert_dates=convert_dates, - keep_default_dates=keep_default_dates, numpy=numpy, - precise_float=precise_float, date_unit=date_unit, encoding=encoding, - lines=lines, chunksize=chunksize, compression=compression, + filepath_or_buffer, + orient=orient, + typ=typ, + dtype=dtype, + convert_axes=convert_axes, + convert_dates=convert_dates, + keep_default_dates=keep_default_dates, + numpy=numpy, + precise_float=precise_float, + date_unit=date_unit, + encoding=encoding, + lines=lines, + chunksize=chunksize, + compression=compression, ) if chunksize: @@ -457,9 +591,24 @@ class JsonReader(BaseIterator): ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the whole document. """ - def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes, - convert_dates, keep_default_dates, numpy, precise_float, - date_unit, encoding, lines, chunksize, compression): + + def __init__( + self, + filepath_or_buffer, + orient, + typ, + dtype, + convert_axes, + convert_dates, + keep_default_dates, + numpy, + precise_float, + date_unit, + encoding, + lines, + chunksize, + compression, + ): self.path_or_buf = filepath_or_buffer self.orient = orient @@ -494,9 +643,9 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, 'read') and not self.chunksize: + if hasattr(data, "read") and not self.chunksize: data = data.read() - if not hasattr(data, 'read') and self.chunksize: + if not hasattr(data, "read") and self.chunksize: data = StringIO(data) return data @@ -522,9 +671,12 @@ def _get_data_from_filepath(self, filepath_or_buffer): pass if exists or self.compression is not None: - data, _ = _get_handle(filepath_or_buffer, 'r', - encoding=self.encoding, - compression=self.compression) + data, _ = _get_handle( + filepath_or_buffer, + "r", + encoding=self.encoding, + compression=self.compression, + ) self.should_close = True self.open_stream = data @@ -535,7 +687,7 @@ def _combine_lines(self, lines): Combines a list of JSON objects into one JSON object. 
""" lines = filter(None, map(lambda x: x.strip(), lines)) - return '[' + ','.join(lines) + ']' + return "[" + ",".join(lines) + "]" def read(self): """ @@ -545,9 +697,7 @@ def read(self): obj = concat(self) elif self.lines: data = ensure_str(self.data) - obj = self._get_object_parser( - self._combine_lines(data.split('\n')) - ) + obj = self._get_object_parser(self._combine_lines(data.split("\n"))) else: obj = self._get_object_parser(self.data) self.close() @@ -560,19 +710,22 @@ def _get_object_parser(self, json): typ = self.typ dtype = self.dtype kwargs = { - "orient": self.orient, "dtype": self.dtype, + "orient": self.orient, + "dtype": self.dtype, "convert_axes": self.convert_axes, "convert_dates": self.convert_dates, - "keep_default_dates": self.keep_default_dates, "numpy": self.numpy, - "precise_float": self.precise_float, "date_unit": self.date_unit + "keep_default_dates": self.keep_default_dates, + "numpy": self.numpy, + "precise_float": self.precise_float, + "date_unit": self.date_unit, } obj = None - if typ == 'frame': + if typ == "frame": obj = FrameParser(json, **kwargs).parse() - if typ == 'series' or obj is None: + if typ == "series" or obj is None: if not isinstance(dtype, bool): - kwargs['dtype'] = dtype + kwargs["dtype"] = dtype obj = SeriesParser(json, **kwargs).parse() return obj @@ -608,16 +761,26 @@ def __next__(self): class Parser: - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') + _STAMP_UNITS = ("s", "ms", "us", "ns") _MIN_STAMPS = { - 's': 31536000, - 'ms': 31536000000, - 'us': 31536000000000, - 'ns': 31536000000000000} - - def __init__(self, json, orient, dtype=None, convert_axes=True, - convert_dates=True, keep_default_dates=False, numpy=False, - precise_float=False, date_unit=None): + "s": 31536000, + "ms": 31536000000, + "us": 31536000000000, + "ns": 31536000000000000, + } + + def __init__( + self, + json, + orient, + dtype=None, + convert_axes=True, + convert_dates=True, + keep_default_dates=False, + numpy=False, + precise_float=False, + date_unit=None, + ): self.json = json if orient is None: @@ -632,11 +795,12 @@ def __init__(self, json, orient, dtype=None, convert_axes=True, if date_unit is not None: date_unit = date_unit.lower() if date_unit not in self._STAMP_UNITS: - raise ValueError('date_unit must be one of {units}' - .format(units=self._STAMP_UNITS)) + raise ValueError( + "date_unit must be one of {units}".format(units=self._STAMP_UNITS) + ) self.min_stamp = self._MIN_STAMPS[date_unit] else: - self.min_stamp = self._MIN_STAMPS['s'] + self.min_stamp = self._MIN_STAMPS["s"] self.numpy = numpy self.precise_float = precise_float @@ -653,8 +817,11 @@ def check_keys_split(self, decoded): bad_keys = set(decoded.keys()).difference(set(self._split_keys)) if bad_keys: bad_keys = ", ".join(bad_keys) - raise ValueError("JSON data had unexpected key(s): {bad_keys}" - .format(bad_keys=pprint_thing(bad_keys))) + raise ValueError( + "JSON data had unexpected key(s): {bad_keys}".format( + bad_keys=pprint_thing(bad_keys) + ) + ) def parse(self): @@ -679,16 +846,15 @@ def _convert_axes(self): """ for axis in self.obj._AXIS_NUMBERS.keys(): new_axis, result = self._try_convert_data( - axis, self.obj._get_axis(axis), use_dtypes=False, - convert_dates=True) + axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True + ) if result: setattr(self.obj, axis, new_axis) def _try_convert_types(self): raise AbstractMethodError(self) - def _try_convert_data(self, name, data, use_dtypes=True, - convert_dates=True): + def _try_convert_data(self, name, data, use_dtypes=True, 
convert_dates=True): """ Try to parse a ndarray like into a column by inferring dtype. """ @@ -701,8 +867,9 @@ def _try_convert_data(self, name, data, use_dtypes=True, pass else: # dtype to force - dtype = (self.dtype.get(name) - if isinstance(self.dtype, dict) else self.dtype) + dtype = ( + self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype + ) if dtype is not None: try: dtype = np.dtype(dtype) @@ -717,32 +884,32 @@ def _try_convert_data(self, name, data, use_dtypes=True, result = False - if data.dtype == 'object': + if data.dtype == "object": # try float try: - data = data.astype('float64') + data = data.astype("float64") result = True except (TypeError, ValueError): pass - if data.dtype.kind == 'f': + if data.dtype.kind == "f": - if data.dtype != 'float64': + if data.dtype != "float64": # coerce floats to 64 try: - data = data.astype('float64') + data = data.astype("float64") result = True except (TypeError, ValueError): pass # don't coerce 0-len data - if len(data) and (data.dtype == 'float' or data.dtype == 'object'): + if len(data) and (data.dtype == "float" or data.dtype == "object"): # coerce ints if we can try: - new_data = data.astype('int64') + new_data = data.astype("int64") if (new_data == data).all(): data = new_data result = True @@ -750,11 +917,11 @@ def _try_convert_data(self, name, data, use_dtypes=True, pass # coerce ints to 64 - if data.dtype == 'int': + if data.dtype == "int": # coerce floats to 64 try: - data = data.astype('int64') + data = data.astype("int64") result = True except (TypeError, ValueError): pass @@ -774,24 +941,26 @@ def _try_convert_to_date(self, data): return data, False new_data = data - if new_data.dtype == 'object': + if new_data.dtype == "object": try: - new_data = data.astype('int64') + new_data = data.astype("int64") except (TypeError, ValueError, OverflowError): pass # ignore numbers that are out of range if issubclass(new_data.dtype.type, np.number): - in_range = (isna(new_data.values) | (new_data > self.min_stamp) | - (new_data.values == iNaT)) + in_range = ( + isna(new_data.values) + | (new_data > self.min_stamp) + | (new_data.values == iNaT) + ) if not in_range.all(): return data, False date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS for date_unit in date_units: try: - new_data = to_datetime(new_data, errors='raise', - unit=date_unit) + new_data = to_datetime(new_data, errors="raise", unit=date_unit) except ValueError: continue except Exception: @@ -804,52 +973,62 @@ def _try_convert_dates(self): class SeriesParser(Parser): - _default_orient = 'index' - _split_keys = ('name', 'index', 'data') + _default_orient = "index" + _split_keys = ("name", "index", "data") def _parse_no_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = {str(k): v for k, v in loads( - json, precise_float=self.precise_float).items()} + decoded = { + str(k): v + for k, v in loads(json, precise_float=self.precise_float).items() + } self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: - self.obj = Series( - loads(json, precise_float=self.precise_float), dtype=None) + self.obj = Series(loads(json, precise_float=self.precise_float), dtype=None) def _parse_numpy(self): json = self.json orient = self.orient if orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) + decoded = loads( + json, dtype=None, numpy=True, precise_float=self.precise_float + ) decoded = {str(k): v for k, v in decoded.items()} self.check_keys_split(decoded) 
self.obj = Series(**decoded) elif orient == "columns" or orient == "index": - self.obj = Series(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) + self.obj = Series( + *loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + ) else: - self.obj = Series(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) + self.obj = Series( + loads(json, dtype=None, numpy=True, precise_float=self.precise_float) + ) def _try_convert_types(self): if self.obj is None: return obj, result = self._try_convert_data( - 'data', self.obj, convert_dates=self.convert_dates) + "data", self.obj, convert_dates=self.convert_dates + ) if result: self.obj = obj class FrameParser(Parser): - _default_orient = 'columns' - _split_keys = ('columns', 'index', 'data') + _default_orient = "columns" + _split_keys = ("columns", "index", "data") def _parse_numpy(self): @@ -857,24 +1036,37 @@ def _parse_numpy(self): orient = self.orient if orient == "columns": - args = loads(json, dtype=None, numpy=True, labelled=True, - precise_float=self.precise_float) + args = loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) if len(args): args = (args[0].T, args[2], args[1]) self.obj = DataFrame(*args) elif orient == "split": - decoded = loads(json, dtype=None, numpy=True, - precise_float=self.precise_float) + decoded = loads( + json, dtype=None, numpy=True, precise_float=self.precise_float + ) decoded = {str(k): v for k, v in decoded.items()} self.check_keys_split(decoded) self.obj = DataFrame(**decoded) elif orient == "values": - self.obj = DataFrame(loads(json, dtype=None, numpy=True, - precise_float=self.precise_float)) + self.obj = DataFrame( + loads(json, dtype=None, numpy=True, precise_float=self.precise_float) + ) else: - self.obj = DataFrame(*loads(json, dtype=None, numpy=True, - labelled=True, - precise_float=self.precise_float)) + self.obj = DataFrame( + *loads( + json, + dtype=None, + numpy=True, + labelled=True, + precise_float=self.precise_float, + ) + ) def _parse_no_numpy(self): @@ -883,21 +1075,25 @@ def _parse_no_numpy(self): if orient == "columns": self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) + loads(json, precise_float=self.precise_float), dtype=None + ) elif orient == "split": - decoded = {str(k): v for k, v in loads( - json, precise_float=self.precise_float).items()} + decoded = { + str(k): v + for k, v in loads(json, precise_float=self.precise_float).items() + } self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None).T - elif orient == 'table': - self.obj = parse_table_schema(json, - precise_float=self.precise_float) + loads(json, precise_float=self.precise_float), dtype=None + ).T + elif orient == "table": + self.obj = parse_table_schema(json, precise_float=self.precise_float) else: self.obj = DataFrame( - loads(json, precise_float=self.precise_float), dtype=None) + loads(json, precise_float=self.precise_float), dtype=None + ) def _process_converter(self, f, filt=None): """ @@ -931,7 +1127,8 @@ def _try_convert_types(self): self._try_convert_dates() self._process_converter( - lambda col, c: self._try_convert_data(col, c, convert_dates=False)) + lambda col, c: self._try_convert_data(col, c, convert_dates=False) + ) def _try_convert_dates(self): if self.obj is None: @@ -951,16 +1148,20 @@ def is_ok(col): 
return False col_lower = col.lower() - if (col_lower.endswith('_at') or - col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or - col_lower.startswith('timestamp')): + if ( + col_lower.endswith("_at") + or col_lower.endswith("_time") + or col_lower == "modified" + or col_lower == "date" + or col_lower == "datetime" + or col_lower.startswith("timestamp") + ): return True return False self._process_converter( lambda col, c: self._try_convert_to_date(c), - lambda col, c: ((self.keep_default_dates and is_ok(col)) or - col in convert_dates)) + lambda col, c: ( + (self.keep_default_dates and is_ok(col)) or col in convert_dates + ), + ) diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 5c6018d399c82..c09dc177ccbd1 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -19,16 +19,20 @@ def _convert_to_line_delimits(s): # Determine we have a JSON list to turn to lines otherwise just return the # json object, only lists can - if not s[0] == '[' and s[-1] == ']': + if not s[0] == "[" and s[-1] == "]": return s s = s[1:-1] return convert_json_to_lines(s) -def nested_to_record(ds, prefix: str = "", - sep: str = ".", level: int = 0, - max_level: Optional[int] = None): +def nested_to_record( + ds, + prefix: str = "", + sep: str = ".", + level: int = 0, + max_level: Optional[int] = None, +): """ A simplified json_normalize @@ -90,16 +94,16 @@ def nested_to_record(ds, prefix: str = "", # current dict level < maximum level provided and # only dicts gets recurse-flattened # only at level>1 do we rename the rest of the keys - if (not isinstance(v, dict) or - (max_level is not None and level >= max_level)): + if not isinstance(v, dict) or ( + max_level is not None and level >= max_level + ): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, sep, level + 1, - max_level)) + new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level)) new_ds.append(new_d) if singleton: @@ -107,14 +111,16 @@ def nested_to_record(ds, prefix: str = "", return new_ds -def json_normalize(data: List[Dict], - record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, - meta_prefix: Optional[str] = None, - record_prefix: Optional[str] = None, - errors: Optional[str] = 'raise', - sep: str = '.', - max_level: Optional[int] = None): +def json_normalize( + data: List[Dict], + record_path: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List]] = None, + meta_prefix: Optional[str] = None, + record_prefix: Optional[str] = None, + errors: Optional[str] = "raise", + sep: str = ".", + max_level: Optional[int] = None, +): """ Normalize semi-structured JSON data into a flat table. @@ -230,6 +236,7 @@ def json_normalize(data: List[Dict], Returns normalized data with columns prefixed with the given string. 
""" + def _pull_field(js, spec): result = js if isinstance(spec, list): @@ -256,8 +263,7 @@ def _pull_field(js, spec): # # TODO: handle record value which are lists, at least error # reasonably - data = nested_to_record(data, sep=sep, - max_level=max_level) + data = nested_to_record(data, sep=sep, max_level=max_level) return DataFrame(data) elif not isinstance(record_path, list): record_path = [record_path] @@ -287,14 +293,16 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) - _recursive_extract(obj[path[0]], path[1:], - seen_meta, level=level + 1) + _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) - recs = [nested_to_record(r, sep=sep, - max_level=max_level) - if isinstance(r, dict) else r for r in recs] + recs = [ + nested_to_record(r, sep=sep, max_level=max_level) + if isinstance(r, dict) + else r + for r in recs + ] # For repeating the metadata later lengths.append(len(recs)) @@ -305,13 +313,14 @@ def _recursive_extract(data, path, seen_meta, level=0): try: meta_val = _pull_field(obj, val[level:]) except KeyError as e: - if errors == 'ignore': + if errors == "ignore": meta_val = np.nan else: - raise KeyError("Try running with " - "errors='ignore' as key " - "{err} is not always present" - .format(err=e)) + raise KeyError( + "Try running with " + "errors='ignore' as key " + "{err} is not always present".format(err=e) + ) meta_vals[key].append(meta_val) records.extend(recs) @@ -320,8 +329,7 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result = result.rename( - columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) + result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in meta_vals.items(): @@ -329,7 +337,9 @@ def _recursive_extract(data, path, seen_meta, level=0): k = meta_prefix + k if k in result: - raise ValueError('Conflicting metadata name {name}, ' - 'need distinguishing prefix '.format(name=k)) + raise ValueError( + "Conflicting metadata name {name}, " + "need distinguishing prefix ".format(name=k) + ) result[k] = np.array(v, dtype=object).repeat(lengths) return result diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index a54f5cdf723a3..045127c63af5c 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -8,9 +8,16 @@ import pandas._libs.json as json from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, - is_string_dtype, is_timedelta64_dtype) + is_bool_dtype, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_numeric_dtype, + is_period_dtype, + is_string_dtype, + is_timedelta64_dtype, +) from pandas import DataFrame from pandas.api.types import CategoricalDtype @@ -50,70 +57,71 @@ def as_json_table_type(x): =============== ================= """ if is_integer_dtype(x): - return 'integer' + return "integer" elif is_bool_dtype(x): - return 'boolean' + return "boolean" elif is_numeric_dtype(x): - return 'number' - elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or - is_period_dtype(x)): - return 'datetime' + return "number" + elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x): + return "datetime" elif is_timedelta64_dtype(x): - return 
'duration' + return "duration" elif is_categorical_dtype(x): - return 'any' + return "any" elif is_string_dtype(x): - return 'string' + return "string" else: - return 'any' + return "any" def set_default_names(data): """Sets index names to 'index' for regular, or 'level_x' for Multi""" if com._all_not_none(*data.index.names): nms = data.index.names - if len(nms) == 1 and data.index.name == 'index': + if len(nms) == 1 and data.index.name == "index": warnings.warn("Index name of 'index' is not round-trippable") - elif len(nms) > 1 and any(x.startswith('level_') for x in nms): - warnings.warn("Index names beginning with 'level_' are not " - "round-trippable") + elif len(nms) > 1 and any(x.startswith("level_") for x in nms): + warnings.warn( + "Index names beginning with 'level_' are not " "round-trippable" + ) return data data = data.copy() if data.index.nlevels > 1: - names = [name if name is not None else 'level_{}'.format(i) - for i, name in enumerate(data.index.names)] + names = [ + name if name is not None else "level_{}".format(i) + for i, name in enumerate(data.index.names) + ] data.index.names = names else: - data.index.name = data.index.name or 'index' + data.index.name = data.index.name or "index" return data def convert_pandas_type_to_json_field(arr, dtype=None): dtype = dtype or arr.dtype if arr.name is None: - name = 'values' + name = "values" else: name = arr.name - field = {'name': name, - 'type': as_json_table_type(dtype)} + field = {"name": name, "type": as_json_table_type(dtype)} if is_categorical_dtype(arr): - if hasattr(arr, 'categories'): + if hasattr(arr, "categories"): cats = arr.categories ordered = arr.ordered else: cats = arr.cat.categories ordered = arr.cat.ordered - field['constraints'] = {"enum": list(cats)} - field['ordered'] = ordered + field["constraints"] = {"enum": list(cats)} + field["ordered"] = ordered elif is_period_dtype(arr): - field['freq'] = arr.freqstr + field["freq"] = arr.freqstr elif is_datetime64tz_dtype(arr): - if hasattr(arr, 'dt'): - field['tz'] = arr.dt.tz.zone + if hasattr(arr, "dt"): + field["tz"] = arr.dt.tz.zone else: - field['tz'] = arr.tz.zone + field["tz"] = arr.tz.zone return field @@ -154,28 +162,29 @@ def convert_json_field_to_pandas_type(field): 'tz': 'US/Central'}) 'datetime64[ns, US/Central]' """ - typ = field['type'] - if typ == 'string': - return 'object' - elif typ == 'integer': - return 'int64' - elif typ == 'number': - return 'float64' - elif typ == 'boolean': - return 'bool' - elif typ == 'duration': - return 'timedelta64' - elif typ == 'datetime': - if field.get('tz'): - return 'datetime64[ns, {tz}]'.format(tz=field['tz']) + typ = field["type"] + if typ == "string": + return "object" + elif typ == "integer": + return "int64" + elif typ == "number": + return "float64" + elif typ == "boolean": + return "bool" + elif typ == "duration": + return "timedelta64" + elif typ == "datetime": + if field.get("tz"): + return "datetime64[ns, {tz}]".format(tz=field["tz"]) else: - return 'datetime64[ns]' - elif typ == 'any': - if 'constraints' in field and 'ordered' in field: - return CategoricalDtype(categories=field['constraints']['enum'], - ordered=field['ordered']) + return "datetime64[ns]" + elif typ == "any": + if "constraints" in field and "ordered" in field: + return CategoricalDtype( + categories=field["constraints"]["enum"], ordered=field["ordered"] + ) else: - return 'object' + return "object" raise ValueError("Unsupported or invalid field type: {}".format(typ)) @@ -245,17 +254,17 @@ def build_table_schema(data, index=True, 
primary_key=None, version=True): else: fields.append(convert_pandas_type_to_json_field(data)) - schema['fields'] = fields + schema["fields"] = fields if index and data.index.is_unique and primary_key is None: if data.index.nlevels == 1: - schema['primaryKey'] = [data.index.name] + schema["primaryKey"] = [data.index.name] else: - schema['primaryKey'] = data.index.names + schema["primaryKey"] = data.index.names elif primary_key is not None: - schema['primaryKey'] = primary_key + schema["primaryKey"] = primary_key if version: - schema['pandas_version'] = '0.20.0' + schema["pandas_version"] = "0.20.0" return schema @@ -296,31 +305,34 @@ def parse_table_schema(json, precise_float): pandas.read_json """ table = loads(json, precise_float=precise_float) - col_order = [field['name'] for field in table['schema']['fields']] - df = DataFrame(table['data'], columns=col_order)[col_order] + col_order = [field["name"] for field in table["schema"]["fields"]] + df = DataFrame(table["data"], columns=col_order)[col_order] - dtypes = {field['name']: convert_json_field_to_pandas_type(field) - for field in table['schema']['fields']} + dtypes = { + field["name"]: convert_json_field_to_pandas_type(field) + for field in table["schema"]["fields"] + } # Cannot directly use as_type with timezone data on object; raise for now - if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone ' - 'data') + if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): + raise NotImplementedError('table="orient" can not yet read timezone ' "data") # No ISO constructor for Timedelta as of yet, so need to raise - if 'timedelta64' in dtypes.values(): - raise NotImplementedError('table="orient" can not yet read ' - 'ISO-formatted Timedelta data') + if "timedelta64" in dtypes.values(): + raise NotImplementedError( + 'table="orient" can not yet read ' "ISO-formatted Timedelta data" + ) df = df.astype(dtypes) - if 'primaryKey' in table['schema']: - df = df.set_index(table['schema']['primaryKey']) + if "primaryKey" in table["schema"]: + df = df.set_index(table["schema"]["primaryKey"]) if len(df.index.names) == 1: - if df.index.name == 'index': + if df.index.name == "index": df.index.name = None else: - df.index.names = [None if x.startswith('level_') else x for x in - df.index.names] + df.index.names = [ + None if x.startswith("level_") else x for x in df.index.names + ] return df diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index f8feffcf49240..9b09cffd83f75 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -6,8 +6,9 @@ from pandas.io.msgpack._version import version # noqa -class ExtType(namedtuple('ExtType', 'code data')): +class ExtType(namedtuple("ExtType", "code data")): """ExtType represents ext type in msgpack.""" + def __new__(cls, code, data): if not isinstance(code, int): raise TypeError("code must be int") @@ -17,6 +18,7 @@ def __new__(cls, code, data): raise ValueError("code must be 0~127") return super().__new__(cls, code, data) + import os # noqa from pandas.io.msgpack._packer import Packer # noqa diff --git a/pandas/io/msgpack/exceptions.py b/pandas/io/msgpack/exceptions.py index ae0f74a6700bd..40f5a8af8f583 100644 --- a/pandas/io/msgpack/exceptions.py +++ b/pandas/io/msgpack/exceptions.py @@ -15,7 +15,6 @@ class UnpackValueError(UnpackException, ValueError): class ExtraData(ValueError): - def __init__(self, unpacked, extra): self.unpacked = unpacked self.extra = extra 
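# Illustrative sketch (not part of this formatting patch): the ExtType
# validation reformatted above in pandas/io/msgpack/__init__.py. Values are
# made up; code must be an int in 0..127 and data must be bytes.
from pandas.io.msgpack import ExtType

ext = ExtType(5, b"payload")
# ExtType(200, b"payload")   # -> ValueError("code must be 0~127")
# ExtType(5, "payload")      # -> TypeError("data must be bytes")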
diff --git a/pandas/io/packers.py b/pandas/io/packers.py index e43f94e28d4af..b0ce7a4ccb12a 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -49,16 +49,37 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import PerformanceWarning from pandas.util._move import ( - BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer) + BadMove as _BadMove, + move_into_mutable_buffer as _move_into_mutable_buffer, +) from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype, - needs_i8_conversion, pandas_dtype) + is_categorical_dtype, + is_datetime64tz_dtype, + is_object_dtype, + needs_i8_conversion, + pandas_dtype, +) from pandas import ( # noqa:F401 - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Period, - PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Index, + Int64Index, + Interval, + IntervalIndex, + MultiIndex, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + Timestamp, +) from pandas.core import internals from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray from pandas.core.arrays.sparse import BlockIndex, IntIndex @@ -95,19 +116,22 @@ def to_msgpack(path_or_buf, *args, **kwargs): compress : type of compressor (zlib or blosc), default to None (no compression) """ - warnings.warn("to_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.", - FutureWarning, stacklevel=3) + warnings.warn( + "to_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, + stacklevel=3, + ) global compressor - compressor = kwargs.pop('compress', None) - append = kwargs.pop('append', None) + compressor = kwargs.pop("compress", None) + append = kwargs.pop("append", None) if append: - mode = 'a+b' + mode = "a+b" else: - mode = 'wb' + mode = "wb" def writer(fh): for a in args: @@ -125,7 +149,7 @@ def writer(fh): writer(path_or_buf) -def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): +def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs): """ Load msgpack pandas object from the specified file path @@ -152,11 +176,14 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): read_msgpack is only guaranteed to be backwards compatible to pandas 0.20.3. 
""" - warnings.warn("The read_msgpack is deprecated and will be removed in a " - "future version.\n" - "It is recommended to use pyarrow for on-the-wire " - "transmission of pandas objects.", - FutureWarning, stacklevel=3) + warnings.warn( + "The read_msgpack is deprecated and will be removed in a " + "future version.\n" + "It is recommended to use pyarrow for on-the-wire " + "transmission of pandas objects.", + FutureWarning, + stacklevel=3, + ) path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: @@ -182,7 +209,7 @@ def read(fh): exists = False if exists: - with open(path_or_buf, 'rb') as fh: + with open(path_or_buf, "rb") as fh: return read(fh) if isinstance(path_or_buf, bytes): @@ -194,25 +221,25 @@ def read(fh): finally: if fh is not None: fh.close() - elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read): + elif hasattr(path_or_buf, "read") and callable(path_or_buf.read): # treat as a buffer like return read(path_or_buf) - raise ValueError('path_or_buf needs to be a string file path or file-like') + raise ValueError("path_or_buf needs to be a string file path or file-like") -dtype_dict = {21: np.dtype('M8[ns]'), - 'datetime64[ns]': np.dtype('M8[ns]'), - 'datetime64[us]': np.dtype('M8[us]'), - 22: np.dtype('m8[ns]'), - 'timedelta64[ns]': np.dtype('m8[ns]'), - 'timedelta64[us]': np.dtype('m8[us]'), - - # this is platform int, which we need to remap to np.int64 - # for compat on windows platforms - 7: np.dtype('int64'), - 'category': 'category' - } +dtype_dict = { + 21: np.dtype("M8[ns]"), + "datetime64[ns]": np.dtype("M8[ns]"), + "datetime64[us]": np.dtype("M8[us]"), + 22: np.dtype("m8[ns]"), + "timedelta64[ns]": np.dtype("m8[ns]"), + "timedelta64[us]": np.dtype("m8[us]"), + # this is platform int, which we need to remap to np.int64 + # for compat on windows platforms + 7: np.dtype("int64"), + "category": "category", +} def dtype_for(t): @@ -222,13 +249,11 @@ def dtype_for(t): return np.typeDict.get(t, t) -c2f_dict = {'complex': np.float64, - 'complex128': np.float64, - 'complex64': np.float32} +c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32} # windows (32 bit) compat -if hasattr(np, 'float128'): - c2f_dict['complex256'] = np.float128 +if hasattr(np, "float128"): + c2f_dict["complex256"] = np.float128 def c2f(r, i, ctype_name): @@ -252,13 +277,12 @@ def convert(values): return values.ravel().tolist() if needs_i8_conversion(dtype): - values = values.view('i8') + values = values.view("i8") v = values.ravel() - if compressor == 'zlib': + if compressor == "zlib": zlib = import_optional_dependency( - "zlib", - extra="zlib is required when `compress='zlib'`." + "zlib", extra="zlib is required when `compress='zlib'`." ) # return string arrays like they are @@ -269,10 +293,9 @@ def convert(values): v = v.tostring() return ExtType(0, zlib.compress(v)) - elif compressor == 'blosc': + elif compressor == "blosc": blosc = import_optional_dependency( - "blosc", - extra="zlib is required when `compress='blosc'`." + "blosc", extra="zlib is required when `compress='blosc'`." ) # return string arrays like they are @@ -303,19 +326,17 @@ def unconvert(values, dtype, compress=None): dtype = pandas_dtype(dtype).base if not as_is_ext: - values = values.encode('latin1') + values = values.encode("latin1") if compress: - if compress == 'zlib': + if compress == "zlib": zlib = import_optional_dependency( - "zlib", - extra="zlib is required when `compress='zlib'`." + "zlib", extra="zlib is required when `compress='zlib'`." 
) decompress = zlib.decompress - elif compress == 'blosc': + elif compress == "blosc": blosc = import_optional_dependency( - "blosc", - extra="zlib is required when `compress='blosc'`." + "blosc", extra="zlib is required when `compress='blosc'`." ) decompress = blosc.decompress else: @@ -323,8 +344,7 @@ def unconvert(values, dtype, compress=None): try: return np.frombuffer( - _move_into_mutable_buffer(decompress(values)), - dtype=dtype, + _move_into_mutable_buffer(decompress(values)), dtype=dtype ) except _BadMove as e: # Pull the decompressed data off of the `_BadMove` exception. @@ -338,8 +358,8 @@ def unconvert(values, dtype, compress=None): # warn even though we need to make a copy because we are only # copying at most 1 byte. warnings.warn( - 'copying data after decompressing; this may mean that' - ' decompress is caching its result', + "copying data after decompressing; this may mean that" + " decompress is caching its result", PerformanceWarning, ) # fall through to copying `np.fromstring` @@ -358,76 +378,87 @@ def encode(obj): tobj = type(obj) if isinstance(obj, Index): if isinstance(obj, RangeIndex): - return {'typ': 'range_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'start': obj._range.start, - 'stop': obj._range.stop, - 'step': obj._range.step, - } + return { + "typ": "range_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "start": obj._range.start, + "stop": obj._range.stop, + "step": obj._range.step, + } elif isinstance(obj, PeriodIndex): - return {'typ': 'period_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'freq': getattr(obj, 'freqstr', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'compress': compressor} + return { + "typ": "period_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "freq": getattr(obj, "freqstr", None), + "dtype": obj.dtype.name, + "data": convert(obj.asi8), + "compress": compressor, + } elif isinstance(obj, DatetimeIndex): - tz = getattr(obj, 'tz', None) + tz = getattr(obj, "tz", None) # store tz info and data as UTC if tz is not None: tz = tz.zone - obj = obj.tz_convert('UTC') - return {'typ': 'datetime_index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.asi8), - 'freq': getattr(obj, 'freqstr', None), - 'tz': tz, - 'compress': compressor} + obj = obj.tz_convert("UTC") + return { + "typ": "datetime_index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "dtype": obj.dtype.name, + "data": convert(obj.asi8), + "freq": getattr(obj, "freqstr", None), + "tz": tz, + "compress": compressor, + } elif isinstance(obj, (IntervalIndex, IntervalArray)): if isinstance(obj, IntervalIndex): - typ = 'interval_index' + typ = "interval_index" else: - typ = 'interval_array' - return {'typ': typ, - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'left': getattr(obj, 'left', None), - 'right': getattr(obj, 'right', None), - 'closed': getattr(obj, 'closed', None)} + typ = "interval_array" + return { + "typ": typ, + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "left": getattr(obj, "left", None), + "right": getattr(obj, "right", None), + "closed": getattr(obj, "closed", None), + } elif isinstance(obj, MultiIndex): - return {'typ': 'multi_index', - 'klass': obj.__class__.__name__, - 'names': getattr(obj, 'names', None), - 'dtype': obj.dtype.name, - 'data': 
convert(obj.values), - 'compress': compressor} + return { + "typ": "multi_index", + "klass": obj.__class__.__name__, + "names": getattr(obj, "names", None), + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } else: - return {'typ': 'index', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return { + "typ": "index", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } elif isinstance(obj, Categorical): - return {'typ': 'category', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'codes': obj.codes, - 'categories': obj.categories, - 'ordered': obj.ordered, - 'compress': compressor} + return { + "typ": "category", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "codes": obj.codes, + "categories": obj.categories, + "ordered": obj.ordered, + "compress": compressor, + } elif isinstance(obj, Series): if isinstance(obj, SparseSeries): - raise NotImplementedError( - 'msgpack sparse series is not implemented' - ) + raise NotImplementedError("msgpack sparse series is not implemented") # d = {'typ': 'sparse_series', # 'klass': obj.__class__.__name__, # 'dtype': obj.dtype.name, @@ -439,18 +470,18 @@ def encode(obj): # d[f] = getattr(obj, f, None) # return d else: - return {'typ': 'series', - 'klass': obj.__class__.__name__, - 'name': getattr(obj, 'name', None), - 'index': obj.index, - 'dtype': obj.dtype.name, - 'data': convert(obj.values), - 'compress': compressor} + return { + "typ": "series", + "klass": obj.__class__.__name__, + "name": getattr(obj, "name", None), + "index": obj.index, + "dtype": obj.dtype.name, + "data": convert(obj.values), + "compress": compressor, + } elif issubclass(tobj, NDFrame): if isinstance(obj, SparseDataFrame): - raise NotImplementedError( - 'msgpack sparse frame is not implemented' - ) + raise NotImplementedError("msgpack sparse frame is not implemented") # d = {'typ': 'sparse_dataframe', # 'klass': obj.__class__.__name__, # 'columns': obj.columns} @@ -466,19 +497,27 @@ def encode(obj): data = data.consolidate() # the block manager - return {'typ': 'block_manager', - 'klass': obj.__class__.__name__, - 'axes': data.axes, - 'blocks': [{'locs': b.mgr_locs.as_array, - 'values': convert(b.values), - 'shape': b.values.shape, - 'dtype': b.dtype.name, - 'klass': b.__class__.__name__, - 'compress': compressor} for b in data.blocks] + return { + "typ": "block_manager", + "klass": obj.__class__.__name__, + "axes": data.axes, + "blocks": [ + { + "locs": b.mgr_locs.as_array, + "values": convert(b.values), + "shape": b.values.shape, + "dtype": b.dtype.name, + "klass": b.__class__.__name__, + "compress": compressor, } - - elif isinstance(obj, (datetime, date, np.datetime64, timedelta, - np.timedelta64)) or obj is NaT: + for b in data.blocks + ], + } + + elif ( + isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64)) + or obj is NaT + ): if isinstance(obj, Timestamp): tz = obj.tzinfo if tz is not None: @@ -486,71 +525,73 @@ def encode(obj): freq = obj.freq if freq is not None: freq = freq.freqstr - return {'typ': 'timestamp', - 'value': obj.value, - 'freq': freq, - 'tz': tz} + return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz} if obj is NaT: - return {'typ': 'nat'} + return {"typ": "nat"} elif isinstance(obj, np.timedelta64): - return {'typ': 
'timedelta64', - 'data': obj.view('i8')} + return {"typ": "timedelta64", "data": obj.view("i8")} elif isinstance(obj, timedelta): - return {'typ': 'timedelta', - 'data': (obj.days, obj.seconds, obj.microseconds)} + return { + "typ": "timedelta", + "data": (obj.days, obj.seconds, obj.microseconds), + } elif isinstance(obj, np.datetime64): - return {'typ': 'datetime64', - 'data': str(obj)} + return {"typ": "datetime64", "data": str(obj)} elif isinstance(obj, datetime): - return {'typ': 'datetime', - 'data': obj.isoformat()} + return {"typ": "datetime", "data": obj.isoformat()} elif isinstance(obj, date): - return {'typ': 'date', - 'data': obj.isoformat()} - raise Exception( - "cannot encode this datetimelike object: {obj}".format(obj=obj)) + return {"typ": "date", "data": obj.isoformat()} + raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj)) elif isinstance(obj, Period): - return {'typ': 'period', - 'ordinal': obj.ordinal, - 'freq': obj.freqstr} + return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr} elif isinstance(obj, Interval): - return {'typ': 'interval', - 'left': obj.left, - 'right': obj.right, - 'closed': obj.closed} + return { + "typ": "interval", + "left": obj.left, + "right": obj.right, + "closed": obj.closed, + } elif isinstance(obj, BlockIndex): - return {'typ': 'block_index', - 'klass': obj.__class__.__name__, - 'blocs': obj.blocs, - 'blengths': obj.blengths, - 'length': obj.length} + return { + "typ": "block_index", + "klass": obj.__class__.__name__, + "blocs": obj.blocs, + "blengths": obj.blengths, + "length": obj.length, + } elif isinstance(obj, IntIndex): - return {'typ': 'int_index', - 'klass': obj.__class__.__name__, - 'indices': obj.indices, - 'length': obj.length} + return { + "typ": "int_index", + "klass": obj.__class__.__name__, + "indices": obj.indices, + "length": obj.length, + } elif isinstance(obj, np.ndarray): - return {'typ': 'ndarray', - 'shape': obj.shape, - 'ndim': obj.ndim, - 'dtype': obj.dtype.name, - 'data': convert(obj), - 'compress': compressor} + return { + "typ": "ndarray", + "shape": obj.shape, + "ndim": obj.ndim, + "dtype": obj.dtype.name, + "data": convert(obj), + "compress": compressor, + } elif isinstance(obj, np.number): if np.iscomplexobj(obj): - return {'typ': 'np_scalar', - 'sub_typ': 'np_complex', - 'dtype': obj.dtype.name, - 'real': np.real(obj).__repr__(), - 'imag': np.imag(obj).__repr__()} + return { + "typ": "np_scalar", + "sub_typ": "np_complex", + "dtype": obj.dtype.name, + "real": np.real(obj).__repr__(), + "imag": np.imag(obj).__repr__(), + } else: - return {'typ': 'np_scalar', - 'dtype': obj.dtype.name, - 'data': obj.__repr__()} + return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()} elif isinstance(obj, complex): - return {'typ': 'np_complex', - 'real': np.real(obj).__repr__(), - 'imag': np.imag(obj).__repr__()} + return { + "typ": "np_complex", + "real": np.real(obj).__repr__(), + "imag": np.imag(obj).__repr__(), + } return obj @@ -560,105 +601,101 @@ def decode(obj): Decoder for deserializing numpy data types. 
""" - typ = obj.get('typ') + typ = obj.get("typ") if typ is None: return obj - elif typ == 'timestamp': - freq = obj['freq'] if 'freq' in obj else obj['offset'] - return Timestamp(obj['value'], tz=obj['tz'], freq=freq) - elif typ == 'nat': + elif typ == "timestamp": + freq = obj["freq"] if "freq" in obj else obj["offset"] + return Timestamp(obj["value"], tz=obj["tz"], freq=freq) + elif typ == "nat": return NaT - elif typ == 'period': - return Period(ordinal=obj['ordinal'], freq=obj['freq']) - elif typ == 'index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) - return Index(data, dtype=dtype, name=obj['name']) - elif typ == 'range_index': - return RangeIndex(obj['start'], - obj['stop'], - obj['step'], - name=obj['name']) - elif typ == 'multi_index': - dtype = dtype_for(obj['dtype']) - data = unconvert(obj['data'], dtype, - obj.get('compress')) + elif typ == "period": + return Period(ordinal=obj["ordinal"], freq=obj["freq"]) + elif typ == "index": + dtype = dtype_for(obj["dtype"]) + data = unconvert(obj["data"], dtype, obj.get("compress")) + return Index(data, dtype=dtype, name=obj["name"]) + elif typ == "range_index": + return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"]) + elif typ == "multi_index": + dtype = dtype_for(obj["dtype"]) + data = unconvert(obj["data"], dtype, obj.get("compress")) data = [tuple(x) for x in data] - return MultiIndex.from_tuples(data, names=obj['names']) - elif typ == 'period_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) - freq = d.pop('freq', None) + return MultiIndex.from_tuples(data, names=obj["names"]) + elif typ == "period_index": + data = unconvert(obj["data"], np.int64, obj.get("compress")) + d = dict(name=obj["name"], freq=obj["freq"]) + freq = d.pop("freq", None) return PeriodIndex(PeriodArray(data, freq), **d) - elif typ == 'datetime_index': - data = unconvert(obj['data'], np.int64, obj.get('compress')) - d = dict(name=obj['name'], freq=obj['freq']) + elif typ == "datetime_index": + data = unconvert(obj["data"], np.int64, obj.get("compress")) + d = dict(name=obj["name"], freq=obj["freq"]) result = DatetimeIndex(data, **d) - tz = obj['tz'] + tz = obj["tz"] # reverse tz conversion if tz is not None: - result = result.tz_localize('UTC').tz_convert(tz) + result = result.tz_localize("UTC").tz_convert(tz) return result - elif typ in ('interval_index', 'interval_array'): - return globals()[obj['klass']].from_arrays(obj['left'], - obj['right'], - obj['closed'], - name=obj['name']) - elif typ == 'category': - from_codes = globals()[obj['klass']].from_codes - return from_codes(codes=obj['codes'], - categories=obj['categories'], - ordered=obj['ordered']) - - elif typ == 'interval': - return Interval(obj['left'], obj['right'], obj['closed']) - elif typ == 'series': - dtype = dtype_for(obj['dtype']) - index = obj['index'] - data = unconvert(obj['data'], dtype, obj['compress']) - return Series(data, index=index, dtype=dtype, name=obj['name']) - - elif typ == 'block_manager': - axes = obj['axes'] + elif typ in ("interval_index", "interval_array"): + return globals()[obj["klass"]].from_arrays( + obj["left"], obj["right"], obj["closed"], name=obj["name"] + ) + elif typ == "category": + from_codes = globals()[obj["klass"]].from_codes + return from_codes( + codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"] + ) + + elif typ == "interval": + return Interval(obj["left"], obj["right"], obj["closed"]) + elif typ 
== "series": + dtype = dtype_for(obj["dtype"]) + index = obj["index"] + data = unconvert(obj["data"], dtype, obj["compress"]) + return Series(data, index=index, dtype=dtype, name=obj["name"]) + + elif typ == "block_manager": + axes = obj["axes"] def create_block(b): - values = _safe_reshape(unconvert( - b['values'], dtype_for(b['dtype']), - b['compress']), b['shape']) + values = _safe_reshape( + unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"] + ) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 - if 'locs' in b: - placement = b['locs'] + if "locs" in b: + placement = b["locs"] else: - placement = axes[0].get_indexer(b['items']) + placement = axes[0].get_indexer(b["items"]) - if is_datetime64tz_dtype(b['dtype']): + if is_datetime64tz_dtype(b["dtype"]): assert isinstance(values, np.ndarray), type(values) - assert values.dtype == 'M8[ns]', values.dtype - values = DatetimeArray(values, dtype=b['dtype']) - - return make_block(values=values, - klass=getattr(internals, b['klass']), - placement=placement, - dtype=b['dtype']) - - blocks = [create_block(b) for b in obj['blocks']] - return globals()[obj['klass']](BlockManager(blocks, axes)) - elif typ == 'datetime': - return parse(obj['data']) - elif typ == 'datetime64': - return np.datetime64(parse(obj['data'])) - elif typ == 'date': - return parse(obj['data']).date() - elif typ == 'timedelta': - return timedelta(*obj['data']) - elif typ == 'timedelta64': - return np.timedelta64(int(obj['data'])) + assert values.dtype == "M8[ns]", values.dtype + values = DatetimeArray(values, dtype=b["dtype"]) + + return make_block( + values=values, + klass=getattr(internals, b["klass"]), + placement=placement, + dtype=b["dtype"], + ) + + blocks = [create_block(b) for b in obj["blocks"]] + return globals()[obj["klass"]](BlockManager(blocks, axes)) + elif typ == "datetime": + return parse(obj["data"]) + elif typ == "datetime64": + return np.datetime64(parse(obj["data"])) + elif typ == "date": + return parse(obj["data"]).date() + elif typ == "timedelta": + return timedelta(*obj["data"]) + elif typ == "timedelta64": + return np.timedelta64(int(obj["data"])) # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return SparseSeries( @@ -671,94 +708,129 @@ def create_block(b): # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind'] # ) - elif typ == 'block_index': - return globals()[obj['klass']](obj['length'], obj['blocs'], - obj['blengths']) - elif typ == 'int_index': - return globals()[obj['klass']](obj['length'], obj['indices']) - elif typ == 'ndarray': - return unconvert(obj['data'], np.typeDict[obj['dtype']], - obj.get('compress')).reshape(obj['shape']) - elif typ == 'np_scalar': - if obj.get('sub_typ') == 'np_complex': - return c2f(obj['real'], obj['imag'], obj['dtype']) + elif typ == "block_index": + return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"]) + elif typ == "int_index": + return globals()[obj["klass"]](obj["length"], obj["indices"]) + elif typ == "ndarray": + return unconvert( + obj["data"], np.typeDict[obj["dtype"]], obj.get("compress") + ).reshape(obj["shape"]) + elif typ == "np_scalar": + if obj.get("sub_typ") == "np_complex": + return c2f(obj["real"], obj["imag"], obj["dtype"]) else: - dtype = dtype_for(obj['dtype']) + dtype = dtype_for(obj["dtype"]) try: - return dtype(obj['data']) + return dtype(obj["data"]) except (ValueError, TypeError): - return dtype.type(obj['data']) - elif typ == 'np_complex': - return 
complex(obj['real'] + '+' + obj['imag'] + 'j') + return dtype.type(obj["data"]) + elif typ == "np_complex": + return complex(obj["real"] + "+" + obj["imag"] + "j") elif isinstance(obj, (dict, list, set)): return obj else: return obj -def pack(o, default=encode, - encoding='utf-8', unicode_errors='strict', use_single_float=False, - autoreset=1, use_bin_type=1): +def pack( + o, + default=encode, + encoding="utf-8", + unicode_errors="strict", + use_single_float=False, + autoreset=1, + use_bin_type=1, +): """ Pack an object and return the packed bytes. """ - return Packer(default=default, encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type).pack(o) - - -def unpack(packed, object_hook=decode, - list_hook=None, use_list=False, encoding='utf-8', - unicode_errors='strict', object_pairs_hook=None, - max_buffer_size=0, ext_hook=ExtType): + return Packer( + default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float, + autoreset=autoreset, + use_bin_type=use_bin_type, + ).pack(o) + + +def unpack( + packed, + object_hook=decode, + list_hook=None, + use_list=False, + encoding="utf-8", + unicode_errors="strict", + object_pairs_hook=None, + max_buffer_size=0, + ext_hook=ExtType, +): """ Unpack a packed object, return an iterator Note: packed lists will be returned as tuples """ - return Unpacker(packed, object_hook=object_hook, - list_hook=list_hook, - use_list=use_list, encoding=encoding, - unicode_errors=unicode_errors, - object_pairs_hook=object_pairs_hook, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook) + return Unpacker( + packed, + object_hook=object_hook, + list_hook=list_hook, + use_list=use_list, + encoding=encoding, + unicode_errors=unicode_errors, + object_pairs_hook=object_pairs_hook, + max_buffer_size=max_buffer_size, + ext_hook=ext_hook, + ) class Packer(_Packer): - - def __init__(self, default=encode, - encoding='utf-8', - unicode_errors='strict', - use_single_float=False, - autoreset=1, - use_bin_type=1): - super().__init__(default=default, encoding=encoding, - unicode_errors=unicode_errors, - use_single_float=use_single_float, - autoreset=autoreset, - use_bin_type=use_bin_type) + def __init__( + self, + default=encode, + encoding="utf-8", + unicode_errors="strict", + use_single_float=False, + autoreset=1, + use_bin_type=1, + ): + super().__init__( + default=default, + encoding=encoding, + unicode_errors=unicode_errors, + use_single_float=use_single_float, + autoreset=autoreset, + use_bin_type=use_bin_type, + ) class Unpacker(_Unpacker): - - def __init__(self, file_like=None, read_size=0, use_list=False, - object_hook=decode, - object_pairs_hook=None, list_hook=None, encoding='utf-8', - unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType): - super().__init__(file_like=file_like, - read_size=read_size, - use_list=use_list, - object_hook=object_hook, - object_pairs_hook=object_pairs_hook, - list_hook=list_hook, - encoding=encoding, - unicode_errors=unicode_errors, - max_buffer_size=max_buffer_size, - ext_hook=ext_hook) + def __init__( + self, + file_like=None, + read_size=0, + use_list=False, + object_hook=decode, + object_pairs_hook=None, + list_hook=None, + encoding="utf-8", + unicode_errors="strict", + max_buffer_size=0, + ext_hook=ExtType, + ): + super().__init__( + file_like=file_like, + read_size=read_size, + use_list=use_list, + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, + list_hook=list_hook, + encoding=encoding, 
+ unicode_errors=unicode_errors, + max_buffer_size=max_buffer_size, + ext_hook=ext_hook, + ) class Iterator: @@ -784,13 +856,13 @@ def __iter__(self): path_exists = False if path_exists: - fh = open(self.path, 'rb') + fh = open(self.path, "rb") else: fh = BytesIO(self.path) else: - if not hasattr(self.path, 'read'): + if not hasattr(self.path, "read"): fh = BytesIO(self.path) else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 9a846d1c7845c..3db05b94e5dce 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -13,10 +13,10 @@ def get_engine(engine): """ return our implementation """ - if engine == 'auto': - engine = get_option('io.parquet.engine') + if engine == "auto": + engine = get_option("io.parquet.engine") - if engine == 'auto': + if engine == "auto": # try engines in this order try: return PyArrowImpl() @@ -28,17 +28,19 @@ def get_engine(engine): except ImportError: pass - raise ImportError("Unable to find a usable engine; " - "tried using: 'pyarrow', 'fastparquet'.\n" - "pyarrow or fastparquet is required for parquet " - "support") + raise ImportError( + "Unable to find a usable engine; " + "tried using: 'pyarrow', 'fastparquet'.\n" + "pyarrow or fastparquet is required for parquet " + "support" + ) - if engine not in ['pyarrow', 'fastparquet']: + if engine not in ["pyarrow", "fastparquet"]: raise ValueError("engine must be one of 'pyarrow', 'fastparquet'") - if engine == 'pyarrow': + if engine == "pyarrow": return PyArrowImpl() - elif engine == 'fastparquet': + elif engine == "fastparquet": return FastParquetImpl() @@ -53,14 +55,12 @@ def validate_dataframe(df): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names (strings only) - if df.columns.inferred_type not in {'string', 'unicode'}: + if df.columns.inferred_type not in {"string", "unicode"}: raise ValueError("parquet must have string column names") # index level names must be strings valid_names = all( - isinstance(name, str) - for name in df.index.names - if name is not None + isinstance(name, str) for name in df.index.names if name is not None ) if not valid_names: raise ValueError("Index level names must be strings") @@ -73,42 +73,57 @@ def read(self, path, columns=None, **kwargs): class PyArrowImpl(BaseImpl): - def __init__(self): pyarrow = import_optional_dependency( - "pyarrow", - extra="pyarrow is required for parquet support." + "pyarrow", extra="pyarrow is required for parquet support." 
) import pyarrow.parquet + self.api = pyarrow - def write(self, df, path, compression='snappy', - coerce_timestamps='ms', index=None, partition_cols=None, - **kwargs): + def write( + self, + df, + path, + compression="snappy", + coerce_timestamps="ms", + index=None, + partition_cols=None, + **kwargs + ): self.validate_dataframe(df) - path, _, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") if index is None: from_pandas_kwargs = {} else: - from_pandas_kwargs = {'preserve_index': index} + from_pandas_kwargs = {"preserve_index": index} table = self.api.Table.from_pandas(df, **from_pandas_kwargs) if partition_cols is not None: self.api.parquet.write_to_dataset( - table, path, compression=compression, + table, + path, + compression=compression, coerce_timestamps=coerce_timestamps, - partition_cols=partition_cols, **kwargs) + partition_cols=partition_cols, + **kwargs + ) else: self.api.parquet.write_table( - table, path, compression=compression, - coerce_timestamps=coerce_timestamps, **kwargs) + table, + path, + compression=compression, + coerce_timestamps=coerce_timestamps, + **kwargs + ) def read(self, path, columns=None, **kwargs): path, _, _, should_close = get_filepath_or_buffer(path) - kwargs['use_pandas_metadata'] = True - result = self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + kwargs["use_pandas_metadata"] = True + result = self.api.parquet.read_table( + path, columns=columns, **kwargs + ).to_pandas() if should_close: try: path.close() @@ -119,47 +134,53 @@ def read(self, path, columns=None, **kwargs): class FastParquetImpl(BaseImpl): - def __init__(self): # since pandas is a dependency of fastparquet # we need to import on first use fastparquet = import_optional_dependency( - "fastparquet", - extra="fastparquet is required for parquet support." + "fastparquet", extra="fastparquet is required for parquet support." ) self.api = fastparquet - def write(self, df, path, compression='snappy', index=None, - partition_cols=None, **kwargs): + def write( + self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs + ): self.validate_dataframe(df) # thriftpy/protocol/compact.py:339: # DeprecationWarning: tostring() is deprecated. # Use tobytes() instead. - if 'partition_on' in kwargs and partition_cols is not None: - raise ValueError("Cannot use both partition_on and " - "partition_cols. Use partition_cols for " - "partitioning data") - elif 'partition_on' in kwargs: - partition_cols = kwargs.pop('partition_on') + if "partition_on" in kwargs and partition_cols is not None: + raise ValueError( + "Cannot use both partition_on and " + "partition_cols. Use partition_cols for " + "partitioning data" + ) + elif "partition_on" in kwargs: + partition_cols = kwargs.pop("partition_on") if partition_cols is not None: - kwargs['file_scheme'] = 'hive' + kwargs["file_scheme"] = "hive" if is_s3_url(path): # path is s3:// so we need to open the s3file in 'wb' mode. # TODO: Support 'ab' - path, _, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode="wb") # And pass the opened s3file to the fastparquet internal impl. 
- kwargs['open_with'] = lambda path, _: path + kwargs["open_with"] = lambda path, _: path else: path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): - self.api.write(path, df, compression=compression, - write_index=index, partition_on=partition_cols, - **kwargs) + self.api.write( + path, + df, + compression=compression, + write_index=index, + partition_on=partition_cols, + **kwargs + ) def read(self, path, columns=None, **kwargs): if is_s3_url(path): @@ -178,8 +199,15 @@ def read(self, path, columns=None, **kwargs): return parquet_file.to_pandas(columns=columns, **kwargs) -def to_parquet(df, path, engine='auto', compression='snappy', index=None, - partition_cols=None, **kwargs): +def to_parquet( + df, + path, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs +): """ Write a DataFrame to the parquet format. @@ -215,11 +243,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None, Additional keyword arguments passed to the engine """ impl = get_engine(engine) - return impl.write(df, path, compression=compression, index=index, - partition_cols=partition_cols, **kwargs) + return impl.write( + df, + path, + compression=compression, + index=index, + partition_cols=partition_cols, + **kwargs + ) -def read_parquet(path, engine='auto', columns=None, **kwargs): +def read_parquet(path, engine="auto", columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8fe0e466e7c0a..78440939ebc01 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -19,15 +19,30 @@ import pandas._libs.parsers as parsers from pandas._libs.tslibs import parsing from pandas.errors import ( - AbstractMethodError, EmptyDataError, ParserError, ParserWarning) + AbstractMethodError, + EmptyDataError, + ParserError, + ParserWarning, +) from pandas.util._decorators import Appender from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( - ensure_object, ensure_str, is_bool_dtype, is_categorical_dtype, - is_dtype_equal, is_extension_array_dtype, is_float, is_integer, - is_integer_dtype, is_list_like, is_object_dtype, is_scalar, - is_string_dtype, pandas_dtype) + ensure_object, + ensure_str, + is_bool_dtype, + is_categorical_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_float, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + is_scalar, + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -35,24 +50,31 @@ from pandas.core import algorithms from pandas.core.arrays import Categorical from pandas.core.frame import DataFrame -from pandas.core.index import ( - Index, MultiIndex, RangeIndex, ensure_index_from_sequences) +from pandas.core.index import Index, MultiIndex, RangeIndex, ensure_index_from_sequences from pandas.core.series import Series from pandas.core.tools import datetimes as tools from pandas.io.common import ( - _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle, - _infer_compression, _validate_header_arg, get_filepath_or_buffer, - is_file_like) + _NA_VALUES, + BaseIterator, + UnicodeReader, + UTF8Recoder, + _get_handle, + _infer_compression, + _validate_header_arg, + get_filepath_or_buffer, + is_file_like, +) from pandas.io.date_converters import generic_parser # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness # of a file 
(stream). Unfortunately, this marker screws up parsing, # so we need to remove it if we see it. -_BOM = '\ufeff' +_BOM = "\ufeff" -_doc_read_csv_and_table = r""" +_doc_read_csv_and_table = ( + r""" {summary} Also supports optionally iterating or breaking of the file @@ -168,8 +190,9 @@ na_values : scalar, str, list-like, or dict, optional Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as - NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), - 70, subsequent_indent=" ") + """'. + NaN: '""" + + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + + """'. keep_default_na : bool, default True Whether or not to include the default NaN values when parsing the data. Depending on whether `na_values` is passed in, the behavior is as follows: @@ -343,6 +366,7 @@ -------- >>> pd.{func_name}('data.csv') # doctest: +SKIP """ +) def _validate_integer(name, val, min_val=0): @@ -361,8 +385,9 @@ def _validate_integer(name, val, min_val=0): min_val : int Minimum allowed value (val < min_val will result in a ValueError) """ - msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name, - min_val=min_val) + msg = "'{name:s}' must be an integer >={min_val:d}".format( + name=name, min_val=min_val + ) if val is not None: if is_float(val): @@ -394,18 +419,18 @@ def _validate_names(names): if names is not None: if len(names) != len(set(names)): - raise ValueError('Duplicate names are not allowed.') + raise ValueError("Duplicate names are not allowed.") return names def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - encoding = kwds.get('encoding', None) + encoding = kwds.get("encoding", None) if encoding is not None: - encoding = re.sub('_', '-', encoding).lower() - kwds['encoding'] = encoding + encoding = re.sub("_", "-", encoding).lower() + kwds["encoding"] = encoding - compression = kwds.get('compression', 'infer') + compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) # TODO: get_filepath_or_buffer could return @@ -413,17 +438,18 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. # See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression) - kwds['compression'] = compression + filepath_or_buffer, encoding, compression + ) + kwds["compression"] = compression - if kwds.get('date_parser', None) is not None: - if isinstance(kwds['parse_dates'], bool): - kwds['parse_dates'] = True + if kwds.get("date_parser", None) is not None: + if isinstance(kwds["parse_dates"], bool): + kwds["parse_dates"] = True # Extract some of the arguments (pass chunksize on). - iterator = kwds.get('iterator', False) - chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1) - nrows = kwds.get('nrows', None) + iterator = kwds.get("iterator", False) + chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + nrows = kwds.get("nrows", None) # Check for duplicates in names. 
_validate_names(kwds.get("names", None)) @@ -449,147 +475,127 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _parser_defaults = { - 'delimiter': None, - - 'escapechar': None, - 'quotechar': '"', - 'quoting': csv.QUOTE_MINIMAL, - 'doublequote': True, - 'skipinitialspace': False, - 'lineterminator': None, - - 'header': 'infer', - 'index_col': None, - 'names': None, - 'prefix': None, - 'skiprows': None, - 'skipfooter': 0, - 'nrows': None, - 'na_values': None, - 'keep_default_na': True, - - 'true_values': None, - 'false_values': None, - 'converters': None, - 'dtype': None, - 'cache_dates': True, - - 'thousands': None, - 'comment': None, - 'decimal': b'.', - + "delimiter": None, + "escapechar": None, + "quotechar": '"', + "quoting": csv.QUOTE_MINIMAL, + "doublequote": True, + "skipinitialspace": False, + "lineterminator": None, + "header": "infer", + "index_col": None, + "names": None, + "prefix": None, + "skiprows": None, + "skipfooter": 0, + "nrows": None, + "na_values": None, + "keep_default_na": True, + "true_values": None, + "false_values": None, + "converters": None, + "dtype": None, + "cache_dates": True, + "thousands": None, + "comment": None, + "decimal": b".", # 'engine': 'c', - 'parse_dates': False, - 'keep_date_col': False, - 'dayfirst': False, - 'date_parser': None, - 'usecols': None, - + "parse_dates": False, + "keep_date_col": False, + "dayfirst": False, + "date_parser": None, + "usecols": None, # 'iterator': False, - 'chunksize': None, - 'verbose': False, - 'encoding': None, - 'squeeze': False, - 'compression': None, - 'mangle_dupe_cols': True, - 'infer_datetime_format': False, - 'skip_blank_lines': True + "chunksize": None, + "verbose": False, + "encoding": None, + "squeeze": False, + "compression": None, + "mangle_dupe_cols": True, + "infer_datetime_format": False, + "skip_blank_lines": True, } _c_parser_defaults = { - 'delim_whitespace': False, - 'na_filter': True, - 'low_memory': True, - 'memory_map': False, - 'error_bad_lines': True, - 'warn_bad_lines': True, - 'float_precision': None + "delim_whitespace": False, + "na_filter": True, + "low_memory": True, + "memory_map": False, + "error_bad_lines": True, + "warn_bad_lines": True, + "float_precision": None, } -_fwf_defaults = { - 'colspecs': 'infer', - 'infer_nrows': 100, - 'widths': None, -} +_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} -_c_unsupported = {'skipfooter'} -_python_unsupported = { - 'low_memory', - 'float_precision', -} +_c_unsupported = {"skipfooter"} +_python_unsupported = {"low_memory", "float_precision"} _deprecated_defaults = {} # type: Dict[str, Any] _deprecated_args = set() # type: Set[str] -def _make_parser_function(name, default_sep=','): - - def parser_f(filepath_or_buffer: FilePathOrBuffer, - sep=default_sep, - delimiter=None, - - # Column and Index Locations and Names - header='infer', - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - - # General Parsing Configuration - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - skipfooter=0, - nrows=None, - - # NA and Missing Data Handling - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - - # Datetime Handling - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - cache_dates=True, - - # Iteration - iterator=False, - chunksize=None, - - # Quoting, Compression, and File 
Format - compression='infer', - thousands=None, - decimal=b'.', - lineterminator=None, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - - # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - - # Internal - delim_whitespace=False, - low_memory=_c_parser_defaults['low_memory'], - memory_map=False, - float_precision=None): +def _make_parser_function(name, default_sep=","): + def parser_f( + filepath_or_buffer: FilePathOrBuffer, + sep=default_sep, + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal=b".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, + ): # gh-23761 # @@ -614,69 +620,68 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, delimiter = sep if delim_whitespace and delimiter != default_sep: - raise ValueError("Specified a delimiter with both sep and" - " delim_whitespace=True; you can only" - " specify one.") + raise ValueError( + "Specified a delimiter with both sep and" + " delim_whitespace=True; you can only" + " specify one." 
+ ) if engine is not None: engine_specified = True else: - engine = 'c' + engine = "c" engine_specified = False - kwds.update(delimiter=delimiter, - engine=engine, - dialect=dialect, - compression=compression, - engine_specified=engine_specified, - - doublequote=doublequote, - escapechar=escapechar, - quotechar=quotechar, - quoting=quoting, - skipinitialspace=skipinitialspace, - lineterminator=lineterminator, - - header=header, - index_col=index_col, - names=names, - prefix=prefix, - skiprows=skiprows, - skipfooter=skipfooter, - na_values=na_values, - true_values=true_values, - false_values=false_values, - keep_default_na=keep_default_na, - thousands=thousands, - comment=comment, - decimal=decimal, - - parse_dates=parse_dates, - keep_date_col=keep_date_col, - dayfirst=dayfirst, - date_parser=date_parser, - cache_dates=cache_dates, - - nrows=nrows, - iterator=iterator, - chunksize=chunksize, - converters=converters, - dtype=dtype, - usecols=usecols, - verbose=verbose, - encoding=encoding, - squeeze=squeeze, - memory_map=memory_map, - float_precision=float_precision, - - na_filter=na_filter, - delim_whitespace=delim_whitespace, - warn_bad_lines=warn_bad_lines, - error_bad_lines=error_bad_lines, - low_memory=low_memory, - mangle_dupe_cols=mangle_dupe_cols, - infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines) + kwds.update( + delimiter=delimiter, + engine=engine, + dialect=dialect, + compression=compression, + engine_specified=engine_specified, + doublequote=doublequote, + escapechar=escapechar, + quotechar=quotechar, + quoting=quoting, + skipinitialspace=skipinitialspace, + lineterminator=lineterminator, + header=header, + index_col=index_col, + names=names, + prefix=prefix, + skiprows=skiprows, + skipfooter=skipfooter, + na_values=na_values, + true_values=true_values, + false_values=false_values, + keep_default_na=keep_default_na, + thousands=thousands, + comment=comment, + decimal=decimal, + parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, + date_parser=date_parser, + cache_dates=cache_dates, + nrows=nrows, + iterator=iterator, + chunksize=chunksize, + converters=converters, + dtype=dtype, + usecols=usecols, + verbose=verbose, + encoding=encoding, + squeeze=squeeze, + memory_map=memory_map, + float_precision=float_precision, + na_filter=na_filter, + delim_whitespace=delim_whitespace, + warn_bad_lines=warn_bad_lines, + error_bad_lines=error_bad_lines, + low_memory=low_memory, + mangle_dupe_cols=mangle_dupe_cols, + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines, + ) return _read(filepath_or_buffer, kwds) @@ -685,27 +690,32 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer, return parser_f -read_csv = _make_parser_function('read_csv', default_sep=',') -read_csv = Appender(_doc_read_csv_and_table.format( - func_name='read_csv', - summary=('Read a comma-separated values (csv) file ' - 'into DataFrame.'), - _default_sep="','") - )(read_csv) - -read_table = _make_parser_function('read_table', default_sep='\t') -read_table = Appender(_doc_read_csv_and_table.format( - func_name='read_table', - summary='Read general delimited file into DataFrame.', - _default_sep=r"'\\t' (tab-stop)") - )(read_table) - - -def read_fwf(filepath_or_buffer: FilePathOrBuffer, - colspecs='infer', - widths=None, - infer_nrows=100, - **kwds): +read_csv = _make_parser_function("read_csv", default_sep=",") +read_csv = Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary=("Read a comma-separated values (csv) 
file " "into DataFrame."), + _default_sep="','", + ) +)(read_csv) + +read_table = _make_parser_function("read_table", default_sep="\t") +read_table = Appender( + _doc_read_csv_and_table.format( + func_name="read_table", + summary="Read general delimited file into DataFrame.", + _default_sep=r"'\\t' (tab-stop)", + ) +)(read_table) + + +def read_fwf( + filepath_or_buffer: FilePathOrBuffer, + colspecs="infer", + widths=None, + infer_nrows=100, + **kwds +): r""" Read a table of fixed-width formatted lines into DataFrame. @@ -765,9 +775,8 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, # Check input arguments. if colspecs is None and widths is None: raise ValueError("Must specify either colspecs or widths") - elif colspecs not in (None, 'infer') and widths is not None: - raise ValueError("You must specify only one of 'widths' and " - "'colspecs'") + elif colspecs not in (None, "infer") and widths is not None: + raise ValueError("You must specify only one of 'widths' and " "'colspecs'") # Compute 'colspecs' from 'widths', if specified. if widths is not None: @@ -776,9 +785,9 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer, colspecs.append((col, col + w)) col += w - kwds['colspecs'] = colspecs - kwds['infer_nrows'] = infer_nrows - kwds['engine'] = 'python-fwf' + kwds["colspecs"] = colspecs + kwds["infer_nrows"] = infer_nrows + kwds["engine"] = "python-fwf" return _read(filepath_or_buffer, kwds) @@ -796,25 +805,34 @@ def __init__(self, f, engine=None, **kwds): if engine is not None: engine_specified = True else: - engine = 'python' + engine = "python" engine_specified = False - self._engine_specified = kwds.get('engine_specified', engine_specified) + self._engine_specified = kwds.get("engine_specified", engine_specified) - if kwds.get('dialect') is not None: - dialect = kwds['dialect'] + if kwds.get("dialect") is not None: + dialect = kwds["dialect"] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) # Any valid dialect should have these attributes. # If any are missing, we will raise automatically. - for param in ('delimiter', 'doublequote', 'escapechar', - 'skipinitialspace', 'quotechar', 'quoting'): + for param in ( + "delimiter", + "doublequote", + "escapechar", + "skipinitialspace", + "quotechar", + "quoting", + ): try: dialect_val = getattr(dialect, param) except AttributeError: - raise ValueError("Invalid dialect '{dialect}' provided" - .format(dialect=kwds['dialect'])) + raise ValueError( + "Invalid dialect '{dialect}' provided".format( + dialect=kwds["dialect"] + ) + ) parser_default = _parser_defaults[param] provided = kwds.get(param, parser_default) @@ -825,21 +843,24 @@ def __init__(self, f, engine=None, **kwds): # Don't warn if the default parameter was passed in, # even if it conflicts with the dialect (gh-23761). if provided != parser_default and provided != dialect_val: - msg = ("Conflicting values for '{param}': '{val}' was " - "provided, but the dialect specifies '{diaval}'. " - "Using the dialect-specified value.".format( - param=param, val=provided, diaval=dialect_val)) + msg = ( + "Conflicting values for '{param}': '{val}' was " + "provided, but the dialect specifies '{diaval}'. " + "Using the dialect-specified value.".format( + param=param, val=provided, diaval=dialect_val + ) + ) # Annoying corner case for not warning about # conflicts between dialect and delimiter parameter. # Refer to the outer "_read_" function for more info. 
- if not (param == "delimiter" and - kwds.pop("sep_override", False)): + if not (param == "delimiter" and kwds.pop("sep_override", False)): conflict_msgs.append(msg) if conflict_msgs: - warnings.warn('\n\n'.join(conflict_msgs), ParserWarning, - stacklevel=2) + warnings.warn( + "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2 + ) kwds[param] = dialect_val if kwds.get("skipfooter"): @@ -848,8 +869,8 @@ def __init__(self, f, engine=None, **kwds): if kwds.get("nrows"): raise ValueError("'skipfooter' not supported with 'nrows'") - if kwds.get('header', 'infer') == 'infer': - kwds['header'] = 0 if kwds.get('names') is None else None + if kwds.get("header", "infer") == "infer": + kwds["header"] = 0 if kwds.get("names") is None else None self.orig_options = kwds @@ -860,16 +881,16 @@ def __init__(self, f, engine=None, **kwds): options = self._get_options_with_defaults(engine) - self.chunksize = options.pop('chunksize', None) - self.nrows = options.pop('nrows', None) - self.squeeze = options.pop('squeeze', False) + self.chunksize = options.pop("chunksize", None) + self.nrows = options.pop("nrows", None) + self.squeeze = options.pop("squeeze", False) # might mutate self.engine self.engine = self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) - if 'has_index_names' in kwds: - self.options['has_index_names'] = kwds['has_index_names'] + if "has_index_names" in kwds: + self.options["has_index_names"] = kwds["has_index_names"] self._make_engine(self.engine) @@ -885,9 +906,10 @@ def _get_options_with_defaults(self, engine): value = kwds.get(argname, default) # see gh-12935 - if argname == 'mangle_dupe_cols' and not value: - raise ValueError('Setting mangle_dupe_cols=False is ' - 'not supported yet') + if argname == "mangle_dupe_cols" and not value: + raise ValueError( + "Setting mangle_dupe_cols=False is " "not supported yet" + ) else: options[argname] = value @@ -895,21 +917,21 @@ def _get_options_with_defaults(self, engine): if argname in kwds: value = kwds[argname] - if engine != 'c' and value != default: - if ('python' in engine and - argname not in _python_unsupported): + if engine != "c" and value != default: + if "python" in engine and argname not in _python_unsupported: pass elif value == _deprecated_defaults.get(argname, default): pass else: raise ValueError( - 'The %r option is not supported with the' - ' %r engine' % (argname, engine)) + "The %r option is not supported with the" + " %r engine" % (argname, engine) + ) else: value = _deprecated_defaults.get(argname, default) options[argname] = value - if engine == 'python-fwf': + if engine == "python-fwf": for argname, default in _fwf_defaults.items(): options[argname] = kwds.get(argname, default) @@ -926,8 +948,7 @@ def _check_file_or_buffer(self, f, engine): # needs to have that attribute ("next" for Python 2.x, "__next__" # for Python 3.x) if engine != "c" and not hasattr(f, next_attr): - msg = ("The 'python' engine cannot iterate " - "through this file buffer.") + msg = "The 'python' engine cannot iterate " "through this file buffer." 
raise ValueError(msg) return engine @@ -938,36 +959,39 @@ def _clean_options(self, options, engine): engine_specified = self._engine_specified fallback_reason = None - sep = options['delimiter'] - delim_whitespace = options['delim_whitespace'] + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] # C engine not supported yet - if engine == 'c': - if options['skipfooter'] > 0: - fallback_reason = ("the 'c' engine does not support" - " skipfooter") - engine = 'python' + if engine == "c": + if options["skipfooter"] > 0: + fallback_reason = "the 'c' engine does not support" " skipfooter" + engine = "python" - encoding = sys.getfilesystemencoding() or 'utf-8' + encoding = sys.getfilesystemencoding() or "utf-8" if sep is None and not delim_whitespace: - if engine == 'c': - fallback_reason = ("the 'c' engine does not support" - " sep=None with delim_whitespace=False") - engine = 'python' + if engine == "c": + fallback_reason = ( + "the 'c' engine does not support" + " sep=None with delim_whitespace=False" + ) + engine = "python" elif sep is not None and len(sep) > 1: - if engine == 'c' and sep == r'\s+': - result['delim_whitespace'] = True - del result['delimiter'] - elif engine not in ('python', 'python-fwf'): + if engine == "c" and sep == r"\s+": + result["delim_whitespace"] = True + del result["delimiter"] + elif engine not in ("python", "python-fwf"): # wait until regex engine integrated - fallback_reason = ("the 'c' engine does not support" - " regex separators (separators > 1 char and" - r" different from '\s+' are" - " interpreted as regex)") - engine = 'python' + fallback_reason = ( + "the 'c' engine does not support" + " regex separators (separators > 1 char and" + r" different from '\s+' are" + " interpreted as regex)" + ) + engine = "python" elif delim_whitespace: - if 'python' in engine: - result['delimiter'] = r'\s+' + if "python" in engine: + result["delimiter"] = r"\s+" elif sep is not None: encodeable = True try: @@ -975,73 +999,85 @@ def _clean_options(self, options, engine): encodeable = False except UnicodeDecodeError: encodeable = False - if not encodeable and engine not in ('python', 'python-fwf'): - fallback_reason = ("the separator encoded in {encoding}" - " is > 1 char long, and the 'c' engine" - " does not support such separators" - .format(encoding=encoding)) - engine = 'python' - - quotechar = options['quotechar'] - if (quotechar is not None and - isinstance(quotechar, (str, bytes))): - if (len(quotechar) == 1 and ord(quotechar) > 127 and - engine not in ('python', 'python-fwf')): - fallback_reason = ("ord(quotechar) > 127, meaning the " - "quotechar is larger than one byte, " - "and the 'c' engine does not support " - "such quotechars") - engine = 'python' + if not encodeable and engine not in ("python", "python-fwf"): + fallback_reason = ( + "the separator encoded in {encoding}" + " is > 1 char long, and the 'c' engine" + " does not support such separators".format(encoding=encoding) + ) + engine = "python" + + quotechar = options["quotechar"] + if quotechar is not None and isinstance(quotechar, (str, bytes)): + if ( + len(quotechar) == 1 + and ord(quotechar) > 127 + and engine not in ("python", "python-fwf") + ): + fallback_reason = ( + "ord(quotechar) > 127, meaning the " + "quotechar is larger than one byte, " + "and the 'c' engine does not support " + "such quotechars" + ) + engine = "python" if fallback_reason and engine_specified: raise ValueError(fallback_reason) - if engine == 'c': + if engine == "c": for arg in _c_unsupported: del 
result[arg] - if 'python' in engine: + if "python" in engine: for arg in _python_unsupported: if fallback_reason and result[arg] != _c_parser_defaults[arg]: - msg = ("Falling back to the 'python' engine because" - " {reason}, but this causes {option!r} to be" - " ignored as it is not supported by the 'python'" - " engine.").format(reason=fallback_reason, - option=arg) + msg = ( + "Falling back to the 'python' engine because" + " {reason}, but this causes {option!r} to be" + " ignored as it is not supported by the 'python'" + " engine." + ).format(reason=fallback_reason, option=arg) raise ValueError(msg) del result[arg] if fallback_reason: - warnings.warn(("Falling back to the 'python' engine because" - " {0}; you can avoid this warning by specifying" - " engine='python'.").format(fallback_reason), - ParserWarning, stacklevel=5) + warnings.warn( + ( + "Falling back to the 'python' engine because" + " {0}; you can avoid this warning by specifying" + " engine='python'." + ).format(fallback_reason), + ParserWarning, + stacklevel=5, + ) - index_col = options['index_col'] - names = options['names'] - converters = options['converters'] - na_values = options['na_values'] - skiprows = options['skiprows'] + index_col = options["index_col"] + names = options["names"] + converters = options["converters"] + na_values = options["na_values"] + skiprows = options["skiprows"] - _validate_header_arg(options['header']) + _validate_header_arg(options["header"]) - depr_warning = '' + depr_warning = "" for arg in _deprecated_args: parser_default = _c_parser_defaults[arg] depr_default = _deprecated_defaults[arg] - msg = ("The '{arg}' argument has been deprecated " - "and will be removed in a future version." - .format(arg=arg)) + msg = ( + "The '{arg}' argument has been deprecated " + "and will be removed in a future version.".format(arg=arg) + ) if result.get(arg, depr_default) != depr_default: # raise Exception(result.get(arg, depr_default), depr_default) - depr_warning += msg + '\n\n' + depr_warning += msg + "\n\n" else: result[arg] = parser_default - if depr_warning != '': + if depr_warning != "": warnings.warn(depr_warning, FutureWarning, stacklevel=2) if index_col is True: @@ -1049,26 +1085,28 @@ def _clean_options(self, options, engine): if _is_index_col(index_col): if not isinstance(index_col, (list, tuple, np.ndarray)): index_col = [index_col] - result['index_col'] = index_col + result["index_col"] = index_col names = list(names) if names is not None else names # type conversion-related if converters is not None: if not isinstance(converters, dict): - raise TypeError('Type converters must be a dict or' - ' subclass, input was ' - 'a {0!r}'.format(type(converters).__name__)) + raise TypeError( + "Type converters must be a dict or" + " subclass, input was " + "a {0!r}".format(type(converters).__name__) + ) else: converters = {} # Converting values to NA - keep_default_na = options['keep_default_na'] + keep_default_na = options["keep_default_na"] na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the # c-engine, so only need for python parsers - if engine != 'c': + if engine != "c": if is_integer(skiprows): skiprows = list(range(skiprows)) if skiprows is None: @@ -1077,11 +1115,11 @@ def _clean_options(self, options, engine): skiprows = set(skiprows) # put stuff back - result['names'] = names - result['converters'] = converters - result['na_values'] = na_values - result['na_fvalues'] = na_fvalues - result['skiprows'] = skiprows + 
result["names"] = names + result["converters"] = converters + result["na_values"] = na_values + result["na_fvalues"] = na_fvalues + result["skiprows"] = skiprows return result, engine @@ -1092,25 +1130,27 @@ def __next__(self): self.close() raise - def _make_engine(self, engine='c'): - if engine == 'c': + def _make_engine(self, engine="c"): + if engine == "c": self._engine = CParserWrapper(self.f, **self.options) else: - if engine == 'python': + if engine == "python": klass = PythonParser - elif engine == 'python-fwf': + elif engine == "python-fwf": klass = FixedWidthFieldParser else: - raise ValueError('Unknown engine: {engine} (valid options are' - ' "c", "python", or' ' "python-fwf")'.format( - engine=engine)) + raise ValueError( + "Unknown engine: {engine} (valid options are" + ' "c", "python", or' + ' "python-fwf")'.format(engine=engine) + ) self._engine = klass(self.f, **self.options) def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer('nrows', nrows) + nrows = _validate_integer("nrows", nrows) ret = self._engine.read(nrows) # May alter columns / col_dict @@ -1166,8 +1206,11 @@ def _is_potential_multi_index(columns): ------- boolean : Whether or not columns could become a MultiIndex """ - return (len(columns) and not isinstance(columns, MultiIndex) and - all(isinstance(c, tuple) for c in columns)) + return ( + len(columns) + and not isinstance(columns, MultiIndex) + and all(isinstance(c, tuple) for c in columns) + ) def _evaluate_usecols(usecols, names): @@ -1271,8 +1314,10 @@ def _validate_usecols_arg(usecols): 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like is passed in or None if a callable or None is passed in. """ - msg = ("'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable.") + msg = ( + "'usecols' must either be list-like of all strings, all unicode, " + "all integers or a callable." + ) if usecols is not None: if callable(usecols): return usecols, None @@ -1285,8 +1330,7 @@ def _validate_usecols_arg(usecols): usecols_dtype = lib.infer_dtype(usecols, skipna=False) - if usecols_dtype not in ("empty", "integer", - "string", "unicode"): + if usecols_dtype not in ("empty", "integer", "string", "unicode"): raise ValueError(msg) usecols = set(usecols) @@ -1301,9 +1345,11 @@ def _validate_parse_dates_arg(parse_dates): is a non-boolean scalar. Raises a ValueError if that is the case. 
""" - msg = ("Only booleans, lists, and " - "dictionaries are accepted " - "for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and " + "dictionaries are accepted " + "for the 'parse_dates' parameter" + ) if parse_dates is not None: if is_scalar(parse_dates): @@ -1317,62 +1363,65 @@ def _validate_parse_dates_arg(parse_dates): class ParserBase: - def __init__(self, kwds): - self.names = kwds.get('names') + self.names = kwds.get("names") self.orig_names = None - self.prefix = kwds.pop('prefix', None) + self.prefix = kwds.pop("prefix", None) - self.index_col = kwds.get('index_col', None) + self.index_col = kwds.get("index_col", None) self.unnamed_cols = set() self.index_names = None self.col_names = None - self.parse_dates = _validate_parse_dates_arg( - kwds.pop('parse_dates', False)) - self.date_parser = kwds.pop('date_parser', None) - self.dayfirst = kwds.pop('dayfirst', False) - self.keep_date_col = kwds.pop('keep_date_col', False) + self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self.date_parser = kwds.pop("date_parser", None) + self.dayfirst = kwds.pop("dayfirst", False) + self.keep_date_col = kwds.pop("keep_date_col", False) - self.na_values = kwds.get('na_values') - self.na_fvalues = kwds.get('na_fvalues') - self.na_filter = kwds.get('na_filter', False) - self.keep_default_na = kwds.get('keep_default_na', True) + self.na_values = kwds.get("na_values") + self.na_fvalues = kwds.get("na_fvalues") + self.na_filter = kwds.get("na_filter", False) + self.keep_default_na = kwds.get("keep_default_na", True) - self.true_values = kwds.get('true_values') - self.false_values = kwds.get('false_values') - self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) - self.infer_datetime_format = kwds.pop('infer_datetime_format', False) - self.cache_dates = kwds.pop('cache_dates', True) + self.true_values = kwds.get("true_values") + self.false_values = kwds.get("false_values") + self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) + self.infer_datetime_format = kwds.pop("infer_datetime_format", False) + self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, infer_datetime_format=self.infer_datetime_format, - cache_dates=self.cache_dates + cache_dates=self.cache_dates, ) # validate header options for mi - self.header = kwds.get('header') + self.header = kwds.get("header") if isinstance(self.header, (list, tuple, np.ndarray)): if not all(map(is_integer, self.header)): raise ValueError("header must be integer or list of integers") - if kwds.get('usecols'): - raise ValueError("cannot specify usecols when " - "specifying a multi-index header") - if kwds.get('names'): - raise ValueError("cannot specify names when " - "specifying a multi-index header") + if kwds.get("usecols"): + raise ValueError( + "cannot specify usecols when " "specifying a multi-index header" + ) + if kwds.get("names"): + raise ValueError( + "cannot specify names when " "specifying a multi-index header" + ) # validate index_col that only contains integers if self.index_col is not None: - is_sequence = isinstance(self.index_col, (list, tuple, - np.ndarray)) - if not (is_sequence and - all(map(is_integer, self.index_col)) or - is_integer(self.index_col)): - raise ValueError("index_col must only contain row numbers " - "when specifying a multi-index header") + is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) + if not ( + is_sequence + and all(map(is_integer, self.index_col)) + or 
is_integer(self.index_col) + ): + raise ValueError( + "index_col must only contain row numbers " + "when specifying a multi-index header" + ) # GH 16338 elif self.header is not None and not is_integer(self.header): @@ -1392,10 +1441,11 @@ def close(self): @property def _has_complex_date_col(self): - return (isinstance(self.parse_dates, dict) or - (isinstance(self.parse_dates, list) and - len(self.parse_dates) > 0 and - isinstance(self.parse_dates[0], list))) + return isinstance(self.parse_dates, dict) or ( + isinstance(self.parse_dates, list) + and len(self.parse_dates) > 0 + and isinstance(self.parse_dates[0], list) + ) def _should_parse_dates(self, i): if isinstance(self.parse_dates, bool): @@ -1408,14 +1458,17 @@ def _should_parse_dates(self, i): j = self.index_col[i] if is_scalar(self.parse_dates): - return ((j == self.parse_dates) or - (name is not None and name == self.parse_dates)) + return (j == self.parse_dates) or ( + name is not None and name == self.parse_dates + ) else: - return ((j in self.parse_dates) or - (name is not None and name in self.parse_dates)) + return (j in self.parse_dates) or ( + name is not None and name in self.parse_dates + ) - def _extract_multi_indexer_columns(self, header, index_names, col_names, - passed_names=False): + def _extract_multi_indexer_columns( + self, header, index_names, col_names, passed_names=False + ): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ if len(header) < 2: @@ -1434,9 +1487,9 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, # clean the index_names index_names = header.pop(-1) - index_names, names, index_col = _clean_index_names(index_names, - self.index_col, - self.unnamed_cols) + index_names, names, index_col = _clean_index_names( + index_names, self.index_col, self.unnamed_cols + ) # extract the columns field_count = len(header[0]) @@ -1453,15 +1506,17 @@ def extract(r): if all(ensure_str(col[n]) in self.unnamed_cols for col in columns): raise ParserError( "Passed header=[{header}] are too many rows for this " - "multi_index of columns" - .format(header=','.join(str(x) for x in self.header)) + "multi_index of columns".format( + header=",".join(str(x) for x in self.header) + ) ) # Clean the column names (if we have an index_col). 
if len(ic): - col_names = [r[0] if (len(r[0]) and - r[0] not in self.unnamed_cols) else None - for r in header] + col_names = [ + r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None + for r in header + ] else: col_names = [None] * len(header) @@ -1487,11 +1542,11 @@ def _maybe_dedup_names(self, names): counts[col] = cur_count + 1 if is_potential_mi: - col = col[:-1] + ('{column}.{count}'.format( - column=col[-1], count=cur_count),) + col = col[:-1] + ( + "{column}.{count}".format(column=col[-1], count=cur_count), + ) else: - col = '{column}.{count}'.format( - column=col, count=cur_count) + col = "{column}.{count}".format(column=col, count=cur_count) cur_count = counts[col] names[i] = col @@ -1514,10 +1569,9 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): index = self._agg_index(index) elif self._has_complex_date_col: if not self._name_processed: - (self.index_names, _, - self.index_col) = _clean_index_names(list(columns), - self.index_col, - self.unnamed_cols) + (self.index_names, _, self.index_col) = _clean_index_names( + list(columns), self.index_col, self.unnamed_cols + ) self._name_processed = True index = self._get_complex_date_index(data, columns) index = self._agg_index(index, try_parse_dates=False) @@ -1538,7 +1592,7 @@ def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): return col - raise ValueError('Index {col} invalid'.format(col=col)) + raise ValueError("Index {col} invalid".format(col=col)) to_remove = [] index = [] @@ -1562,8 +1616,11 @@ def _get_name(icol): return icol if col_names is None: - raise ValueError(('Must supply column order to use {icol!s} ' - 'as index').format(icol=icol)) + raise ValueError( + ("Must supply column order to use {icol!s} " "as index").format( + icol=icol + ) + ) for i, c in enumerate(col_names): if i == icol: @@ -1603,8 +1660,8 @@ def _agg_index(self, index, try_parse_dates=True): col_name = self.index_names[i] if col_name is not None: col_na_values, col_na_fvalues = _get_na_values( - col_name, self.na_values, self.na_fvalues, - self.keep_default_na) + col_name, self.na_values, self.na_fvalues, self.keep_default_na + ) arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) @@ -1614,8 +1671,9 @@ def _agg_index(self, index, try_parse_dates=True): return index - def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, - converters=None, dtypes=None): + def _convert_to_ndarrays( + self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None + ): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) @@ -1627,50 +1685,61 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues, self.keep_default_na) + c, na_values, na_fvalues, self.keep_default_na + ) else: col_na_values, col_na_fvalues = set(), set() if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: - warnings.warn(("Both a converter and dtype were specified " - "for column {0} - only the converter will " - "be used").format(c), ParserWarning, - stacklevel=7) + warnings.warn( + ( + "Both a converter and dtype were specified " + "for column {0} - only the converter will " + "be used" + ).format(c), + ParserWarning, + stacklevel=7, + ) try: values = lib.map_infer(values, conv_f) except ValueError: - mask = algorithms.isin( - values, list(na_values)).view(np.uint8) + mask 
= algorithms.isin(values, list(na_values)).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool=False) + values, set(col_na_values) | col_na_fvalues, try_num_bool=False + ) else: - is_str_or_ea_dtype = (is_string_dtype(cast_type) - or is_extension_array_dtype(cast_type)) + is_str_or_ea_dtype = is_string_dtype( + cast_type + ) or is_extension_array_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( - values, set(col_na_values) | col_na_fvalues, - try_num_bool) + values, set(col_na_values) | col_na_fvalues, try_num_bool + ) # type specified in dtype param or cast_type is an EA - if cast_type and (not is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type)): + if cast_type and ( + not is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type) + ): try: - if (is_bool_dtype(cast_type) and - not is_categorical_dtype(cast_type) - and na_count > 0): - raise ValueError("Bool column has NA values in " - "column {column}" - .format(column=c)) + if ( + is_bool_dtype(cast_type) + and not is_categorical_dtype(cast_type) + and na_count > 0 + ): + raise ValueError( + "Bool column has NA values in " + "column {column}".format(column=c) + ) except (AttributeError, TypeError): # invalid input to is_bool_dtype pass @@ -1678,8 +1747,11 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, result[c] = cvals if verbose and na_count: - print('Filled {count} NA values in column {c!s}'.format( - count=na_count, c=c)) + print( + "Filled {count} NA values in column {c!s}".format( + count=na_count, c=c + ) + ) return result def _infer_types(self, values, na_values, try_num_bool=True): @@ -1715,17 +1787,18 @@ def _infer_types(self, values, na_values, try_num_bool=True): except Exception: result = values if values.dtype == np.object_: - na_count = parsers.sanitize_objects(result, - na_values, False) + na_count = parsers.sanitize_objects(result, na_values, False) else: result = values if values.dtype == np.object_: na_count = parsers.sanitize_objects(values, na_values, False) if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool(np.asarray(values), - true_values=self.true_values, - false_values=self.false_values) + result = libops.maybe_convert_bool( + np.asarray(values), + true_values=self.true_values, + false_values=self.false_values, + ) return result, na_count @@ -1747,8 +1820,10 @@ def _cast_types(self, values, cast_type, column): """ if is_categorical_dtype(cast_type): - known_cats = (isinstance(cast_type, CategoricalDtype) and - cast_type.categories is not None) + known_cats = ( + isinstance(cast_type, CategoricalDtype) + and cast_type.categories is not None + ) if not is_object_dtype(values) and not known_cats: # XXX this is for consistency with @@ -1758,8 +1833,8 @@ def _cast_types(self, values, cast_type, column): cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( - cats, cats.get_indexer(values), cast_type, - true_values=self.true_values) + cats, cats.get_indexer(values), cast_type, true_values=self.true_values + ) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): @@ -1767,23 +1842,22 @@ def _cast_types(self, values, cast_type, column): cast_type = pandas_dtype(cast_type) array_type = 
cast_type.construct_array_type() try: - return array_type._from_sequence_of_strings(values, - dtype=cast_type) + return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError: raise NotImplementedError( "Extension Array: {ea} must implement " "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type)) + "to be used in parser methods".format(ea=array_type) + ) else: try: - values = astype_nansafe(values, cast_type, - copy=True, skipna=True) + values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError: raise ValueError( "Unable to convert column {column} to type " - "{cast_type}".format( - column=column, cast_type=cast_type)) + "{cast_type}".format(column=column, cast_type=cast_type) + ) return values def _do_date_conversions(self, names, data): @@ -1791,8 +1865,14 @@ def _do_date_conversions(self, names, data): if self.parse_dates is not None: data, names = _process_date_conversion( - data, self._date_conv, self.parse_dates, self.index_col, - self.index_names, names, keep_date_col=self.keep_date_col) + data, + self._date_conv, + self.parse_dates, + self.index_col, + self.index_names, + names, + keep_date_col=self.keep_date_col, + ) return names, data @@ -1808,22 +1888,20 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if (kwds.get('compression') is None - and 'utf-16' in (kwds.get('encoding') or '')): + if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): # if source is utf-16 plain text, convert source to utf-8 if isinstance(src, str): - src = open(src, 'rb') + src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds['encoding']) - kwds['encoding'] = 'utf-8' + src = UTF8Recoder(src, kwds["encoding"]) + kwds["encoding"] = "utf-8" # #2442 - kwds['allow_leading_cols'] = self.index_col is not False + kwds["allow_leading_cols"] = self.index_col is not False # GH20529, validate usecol arg before TextReader - self.usecols, self.usecols_dtype = _validate_usecols_arg( - kwds['usecols']) - kwds['usecols'] = self.usecols + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols self._reader = parsers.TextReader(src, **kwds) self.unnamed_cols = self._reader.unnamed_cols @@ -1835,19 +1913,18 @@ def __init__(self, src, **kwds): else: if len(self._reader.header) > 1: # we have a multi index in the columns - self.names, self.index_names, self.col_names, passed_names = ( - self._extract_multi_indexer_columns( - self._reader.header, self.index_names, self.col_names, - passed_names - ) + self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns( + self._reader.header, self.index_names, self.col_names, passed_names ) else: self.names = list(self._reader.header[0]) if self.names is None: if self.prefix: - self.names = ['{prefix}{i}'.format(prefix=self.prefix, i=i) - for i in range(self._reader.table_width)] + self.names = [ + "{prefix}{i}".format(prefix=self.prefix, i=i) + for i in range(self._reader.table_width) + ] else: self.names = list(range(self._reader.table_width)) @@ -1865,19 +1942,23 @@ def __init__(self, src, **kwds): usecols = _evaluate_usecols(self.usecols, self.orig_names) # GH 14671 - if (self.usecols_dtype == 'string' and - not set(usecols).issubset(self.orig_names)): + if self.usecols_dtype == "string" and not set(usecols).issubset( + self.orig_names + ): _validate_usecols_names(usecols, self.orig_names) # GH 25623 # validate that column 
indices in usecols are not out of bounds - elif self.usecols_dtype == 'integer': + elif self.usecols_dtype == "integer": indices = range(self._reader.table_width) _validate_usecols_names(usecols, indices) if len(self.names) > len(usecols): - self.names = [n for i, n in enumerate(self.names) - if (i in usecols or n in usecols)] + self.names = [ + n + for i, n in enumerate(self.names) + if (i in usecols or n in usecols) + ] if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) @@ -1887,14 +1968,12 @@ def __init__(self, src, **kwds): self.orig_names = self.names if not self._has_complex_date_col: - if (self._reader.leading_cols == 0 and - _is_index_col(self.index_col)): + if self._reader.leading_cols == 0 and _is_index_col(self.index_col): self._name_processed = True - (index_names, self.names, - self.index_col) = _clean_index_names(self.names, - self.index_col, - self.unnamed_cols) + (index_names, self.names, self.index_col) = _clean_index_names( + self.names, self.index_col, self.unnamed_cols + ) if self.index_names is None: self.index_names = index_names @@ -1922,13 +2001,12 @@ def _set_noconvert_columns(self): undergo such conversions. """ names = self.orig_names - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": # A set of integers will be converted to a list in # the correct order every single time. usecols = list(self.usecols) usecols.sort() - elif (callable(self.usecols) or - self.usecols_dtype not in ('empty', None)): + elif callable(self.usecols) or self.usecols_dtype not in ("empty", None): # The names attribute should have the correct columns # in the proper order for indexing with parse_dates. usecols = self.names[:] @@ -1979,16 +2057,19 @@ def read(self, nrows=None): self._first_chunk = False names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names, - dtype=self.kwds.get('dtype')) - columns = self._maybe_make_multi_index_columns( - columns, self.col_names) + names, + self.index_col, + self.index_names, + dtype=self.kwds.get("dtype"), + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) if self.usecols is not None: columns = self._filter_usecols(columns) - col_dict = dict(filter(lambda item: item[0] in columns, - col_dict.items())) + col_dict = dict( + filter(lambda item: item[0] in columns, col_dict.items()) + ) return index, columns, col_dict @@ -2002,7 +2083,7 @@ def read(self, nrows=None): if self._reader.leading_cols: if self._has_complex_date_col: - raise NotImplementedError('file structure not yet supported') + raise NotImplementedError("file structure not yet supported") # implicit index, no index names arrays = [] @@ -2013,8 +2094,7 @@ def read(self, nrows=None): else: values = data.pop(self.index_col[i]) - values = self._maybe_parse_dates(values, i, - try_parse_dates=True) + values = self._maybe_parse_dates(values, i, try_parse_dates=True) arrays.append(values) index = ensure_index_from_sequences(arrays) @@ -2058,8 +2138,9 @@ def _filter_usecols(self, names): # hackish usecols = _evaluate_usecols(self.usecols, names) if usecols is not None and len(names) != len(usecols): - names = [name for i, name in enumerate(names) - if i in usecols or name in usecols] + names = [ + name for i, name in enumerate(names) if i in usecols or name in usecols + ] return names def _get_index_names(self): @@ -2067,9 +2148,9 @@ def _get_index_names(self): idx_names = None if self._reader.leading_cols == 0 and self.index_col is not None: - 
(idx_names, names, - self.index_col) = _clean_index_names(names, self.index_col, - self.unnamed_cols) + (idx_names, names, self.index_col) = _clean_index_names( + names, self.index_col, self.unnamed_cols + ) return names, idx_names @@ -2133,16 +2214,15 @@ def TextParser(*args, **kwds): 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. """ - kwds['engine'] = 'python' + kwds["engine"] = "python" return TextFileReader(*args, **kwds) def count_empty_vals(vals): - return sum(1 for v in vals if v == '' or v is None) + return sum(1 for v in vals if v == "" or v is None) class PythonParser(ParserBase): - def __init__(self, f, **kwds): """ Workhorse function for processing nested list into DataFrame @@ -2156,58 +2236,61 @@ def __init__(self, f, **kwds): self.pos = 0 self.line_pos = 0 - self.encoding = kwds['encoding'] - self.compression = kwds['compression'] - self.memory_map = kwds['memory_map'] - self.skiprows = kwds['skiprows'] + self.encoding = kwds["encoding"] + self.compression = kwds["compression"] + self.memory_map = kwds["memory_map"] + self.skiprows = kwds["skiprows"] if callable(self.skiprows): self.skipfunc = self.skiprows else: self.skipfunc = lambda x: x in self.skiprows - self.skipfooter = _validate_skipfooter_arg(kwds['skipfooter']) - self.delimiter = kwds['delimiter'] + self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"]) + self.delimiter = kwds["delimiter"] - self.quotechar = kwds['quotechar'] + self.quotechar = kwds["quotechar"] if isinstance(self.quotechar, str): self.quotechar = str(self.quotechar) - self.escapechar = kwds['escapechar'] - self.doublequote = kwds['doublequote'] - self.skipinitialspace = kwds['skipinitialspace'] - self.lineterminator = kwds['lineterminator'] - self.quoting = kwds['quoting'] - self.usecols, self.usecols_dtype = _validate_usecols_arg( - kwds['usecols']) - self.skip_blank_lines = kwds['skip_blank_lines'] + self.escapechar = kwds["escapechar"] + self.doublequote = kwds["doublequote"] + self.skipinitialspace = kwds["skipinitialspace"] + self.lineterminator = kwds["lineterminator"] + self.quoting = kwds["quoting"] + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + self.skip_blank_lines = kwds["skip_blank_lines"] - self.warn_bad_lines = kwds['warn_bad_lines'] - self.error_bad_lines = kwds['error_bad_lines'] + self.warn_bad_lines = kwds["warn_bad_lines"] + self.error_bad_lines = kwds["error_bad_lines"] - self.names_passed = kwds['names'] or None + self.names_passed = kwds["names"] or None self.has_index_names = False - if 'has_index_names' in kwds: - self.has_index_names = kwds['has_index_names'] + if "has_index_names" in kwds: + self.has_index_names = kwds["has_index_names"] - self.verbose = kwds['verbose'] - self.converters = kwds['converters'] + self.verbose = kwds["verbose"] + self.converters = kwds["converters"] - self.dtype = kwds['dtype'] - self.thousands = kwds['thousands'] - self.decimal = kwds['decimal'] + self.dtype = kwds["dtype"] + self.thousands = kwds["thousands"] + self.decimal = kwds["decimal"] - self.comment = kwds['comment'] + self.comment = kwds["comment"] self._comment_lines = [] - f, handles = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map) + f, handles = _get_handle( + f, + "r", + encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map, + ) self.handles.extend(handles) # Set self.data to something that can read lines. 
- if hasattr(f, 'readline'): + if hasattr(f, "readline"): self._make_reader(f) else: self.data = f @@ -2215,17 +2298,18 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - (self.columns, self.num_original_columns, - self.unnamed_cols) = self._infer_columns() + ( + self.columns, + self.num_original_columns, + self.unnamed_cols, + ) = self._infer_columns() # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. if len(self.columns) > 1: # we are processing a multi index column - self.columns, self.index_names, self.col_names, _ = ( - self._extract_multi_indexer_columns( - self.columns, self.index_names, self.col_names - ) + self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns( + self.columns, self.index_names, self.col_names ) # Update list of original names to include all indices. self.num_original_columns = len(self.columns) @@ -2239,8 +2323,9 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = ( - self._get_index_name(self.columns)) + (index_names, self.orig_names, self.columns) = self._get_index_name( + self.columns + ) self._name_processed = True if self.index_names is None: self.index_names = index_names @@ -2251,14 +2336,18 @@ def __init__(self, f, **kwds): self._no_thousands_columns = None if len(self.decimal) != 1: - raise ValueError('Only length-1 decimal markers supported') + raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: self.nonnum = re.compile( - r'[^-^0-9^{decimal}]+'.format(decimal=self.decimal)) + r"[^-^0-9^{decimal}]+".format(decimal=self.decimal) + ) else: - self.nonnum = re.compile(r'[^-^0-9^{thousands}^{decimal}]+'.format( - thousands=self.thousands, decimal=self.decimal)) + self.nonnum = re.compile( + r"[^-^0-9^{thousands}^{decimal}]+".format( + thousands=self.thousands, decimal=self.decimal + ) + ) def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -2301,8 +2390,9 @@ def _make_reader(self, f): if sep is None or len(sep) == 1: if self.lineterminator: - raise ValueError('Custom line terminators not supported in ' - 'python parser (yet)') + raise ValueError( + "Custom line terminators not supported in " "python parser (yet)" + ) class MyDialect(csv.Dialect): delimiter = self.delimiter @@ -2311,7 +2401,7 @@ class MyDialect(csv.Dialect): doublequote = self.doublequote skipinitialspace = self.skipinitialspace quoting = self.quoting - lineterminator = '\n' + lineterminator = "\n" dia = MyDialect @@ -2334,23 +2424,25 @@ class MyDialect(csv.Dialect): sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter if self.encoding is not None: - self.buf.extend(list( - UnicodeReader(StringIO(line), - dialect=dia, - encoding=self.encoding))) + self.buf.extend( + list( + UnicodeReader( + StringIO(line), dialect=dia, encoding=self.encoding + ) + ) + ) else: - self.buf.extend(list(csv.reader(StringIO(line), - dialect=dia))) + self.buf.extend(list(csv.reader(StringIO(line), dialect=dia))) if self.encoding is not None: - reader = UnicodeReader(f, dialect=dia, - encoding=self.encoding, - strict=True) + reader = UnicodeReader( + f, dialect=dia, encoding=self.encoding, strict=True + ) else: - reader = csv.reader(f, dialect=dia, - 
strict=True) + reader = csv.reader(f, dialect=dia, strict=True) else: + def _read(): line = f.readline() pat = re.compile(sep) @@ -2359,6 +2451,7 @@ def _read(): for line in f: yield pat.split(line.strip()) + reader = _read() self.data = reader @@ -2380,9 +2473,9 @@ def read(self, rows=None): # DataFrame with the right metadata, even though it's length 0 names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - names, self.index_col, self.index_names, self.dtype) - columns = self._maybe_make_multi_index_columns( - columns, self.col_names) + names, self.index_col, self.index_names, self.dtype + ) + columns = self._maybe_make_multi_index_columns(columns, self.col_names) return index, columns, col_dict # handle new style for names in index @@ -2462,9 +2555,14 @@ def _clean_mapping(mapping): clean_na_values = self.na_values clean_na_fvalues = self.na_fvalues - return self._convert_to_ndarrays(data, clean_na_values, - clean_na_fvalues, self.verbose, - clean_conv, clean_dtypes) + return self._convert_to_ndarrays( + data, + clean_na_values, + clean_na_fvalues, + self.verbose, + clean_conv, + clean_dtypes, + ) def _infer_columns(self): names = self.names @@ -2495,8 +2593,9 @@ def _infer_columns(self): except StopIteration: if self.line_pos < hr: raise ValueError( - 'Passed header={hr} but only {pos} lines in ' - 'file'.format(hr=hr, pos=(self.line_pos + 1))) + "Passed header={hr} but only {pos} lines in " + "file".format(hr=hr, pos=(self.line_pos + 1)) + ) # We have an empty file, so check # if columns are provided. That will @@ -2508,8 +2607,7 @@ def _infer_columns(self): return columns, num_original_columns, unnamed_cols if not self.names: - raise EmptyDataError( - "No columns to parse from file") + raise EmptyDataError("No columns to parse from file") line = self.names[:] @@ -2517,10 +2615,11 @@ def _infer_columns(self): this_unnamed_cols = [] for i, c in enumerate(line): - if c == '': + if c == "": if have_mi_columns: - col_name = ("Unnamed: {i}_level_{level}" - .format(i=i, level=level)) + col_name = "Unnamed: {i}_level_{level}".format( + i=i, level=level + ) else: col_name = "Unnamed: {i}".format(i=i) @@ -2537,8 +2636,7 @@ def _infer_columns(self): while cur_count > 0: counts[col] = cur_count + 1 - col = '{column}.{count}'.format( - column=col, count=cur_count) + col = "{column}.{count}".format(column=col, count=cur_count) cur_count = counts[col] this_columns[i] = col @@ -2550,8 +2648,7 @@ def _infer_columns(self): # line for the rest of the parsing code if hr == header[-1]: lc = len(this_columns) - ic = (len(self.index_col) - if self.index_col is not None else 0) + ic = len(self.index_col) if self.index_col is not None else 0 unnamed_count = len(this_unnamed_cols) if lc != unnamed_count and lc - ic > unnamed_count: @@ -2560,8 +2657,7 @@ def _infer_columns(self): self.buf = [self.buf[-1]] columns.append(this_columns) - unnamed_cols.update({this_columns[i] - for i in this_unnamed_cols}) + unnamed_cols.update({this_columns[i] for i in this_unnamed_cols}) if len(columns) == 1: num_original_columns = len(this_columns) @@ -2571,21 +2667,21 @@ def _infer_columns(self): # GH 25623 # validate that column indices in usecols are not out of bounds - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": for col in columns: indices = range(len(col)) _validate_usecols_names(self.usecols, indices) if names is not None: - if ((self.usecols is not None and - len(names) != len(self.usecols)) or - (self.usecols is None and - len(names) != 
len(columns[0]))): - raise ValueError('Number of passed names did not match ' - 'number of header fields in the file') + if (self.usecols is not None and len(names) != len(self.usecols)) or ( + self.usecols is None and len(names) != len(columns[0]) + ): + raise ValueError( + "Number of passed names did not match " + "number of header fields in the file" + ) if len(columns) > 1: - raise TypeError('Cannot pass names with multi-index ' - 'columns') + raise TypeError("Cannot pass names with multi-index " "columns") if self.usecols is not None: # Set _use_cols. We don't store columns because they are @@ -2603,8 +2699,7 @@ def _infer_columns(self): except StopIteration: if not names: - raise EmptyDataError( - "No columns to parse from file") + raise EmptyDataError("No columns to parse from file") line = names[:] @@ -2613,13 +2708,17 @@ def _infer_columns(self): # GH 25623 # validate that column indices in usecols are not out of bounds - if self.usecols_dtype == 'integer': + if self.usecols_dtype == "integer": _validate_usecols_names(self.usecols, range(ncols)) if not names: if self.prefix: - columns = [['{prefix}{idx}'.format( - prefix=self.prefix, idx=i) for i in range(ncols)]] + columns = [ + [ + "{prefix}{idx}".format(prefix=self.prefix, idx=i) + for i in range(ncols) + ] + ] else: columns = [list(range(ncols))] columns = self._handle_usecols(columns, columns[0]) @@ -2628,11 +2727,10 @@ def _infer_columns(self): columns = self._handle_usecols([names], names) num_original_columns = len(names) else: - if (not callable(self.usecols) and - len(names) != len(self.usecols)): + if not callable(self.usecols) and len(names) != len(self.usecols): raise ValueError( - 'Number of passed names did not match number of ' - 'header fields in the file' + "Number of passed names did not match number of " + "header fields in the file" ) # Ignore output but set used columns. self._handle_usecols([names], names) @@ -2652,8 +2750,9 @@ def _handle_usecols(self, columns, usecols_key): col_indices = _evaluate_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: - raise ValueError("If using multiple headers, usecols must " - "be integers.") + raise ValueError( + "If using multiple headers, usecols must " "be integers." + ) col_indices = [] for col in self.usecols: @@ -2667,8 +2766,10 @@ def _handle_usecols(self, columns, usecols_key): else: col_indices = self.usecols - columns = [[n for i, n in enumerate(column) if i in col_indices] - for column in columns] + columns = [ + [n for i, n in enumerate(column) if i in col_indices] + for column in columns + ] self._col_indices = col_indices return columns @@ -2724,7 +2825,7 @@ def _check_for_bom(self, first_row): # Extract any remaining data after the second # quotation mark. 
if len(first_row_bom) > end + 1: - new_row += first_row_bom[end + 1:] + new_row += first_row_bom[end + 1 :] return [new_row] + first_row[1:] elif len(first_row_bom) > 1: @@ -2759,9 +2860,9 @@ def _next_line(self): line = self._check_comments([self.data[self.pos]])[0] self.pos += 1 # either uncommented or blank to begin with - if (not self.skip_blank_lines and - (self._is_line_empty( - self.data[self.pos - 1]) or line)): + if not self.skip_blank_lines and ( + self._is_line_empty(self.data[self.pos - 1]) or line + ): break elif self.skip_blank_lines: ret = self._remove_empty_lines([line]) @@ -2819,8 +2920,8 @@ def _alert_malformed(self, msg, row_num): if self.error_bad_lines: raise ParserError(msg) elif self.warn_bad_lines: - base = 'Skipping line {row_num}: '.format(row_num=row_num) - sys.stderr.write(base + msg + '\n') + base = "Skipping line {row_num}: ".format(row_num=row_num) + sys.stderr.write(base + msg + "\n") def _next_iter_line(self, row_num): """ @@ -2841,19 +2942,23 @@ def _next_iter_line(self, row_num): if self.warn_bad_lines or self.error_bad_lines: msg = str(e) - if 'NULL byte' in msg: - msg = ('NULL byte detected. This byte ' - 'cannot be processed in Python\'s ' - 'native csv library at the moment, ' - 'so please pass in engine=\'c\' instead') + if "NULL byte" in msg: + msg = ( + "NULL byte detected. This byte " + "cannot be processed in Python's " + "native csv library at the moment, " + "so please pass in engine='c' instead" + ) if self.skipfooter > 0: - reason = ('Error could possibly be due to ' - 'parsing errors in the skipped footer rows ' - '(the skipfooter keyword is only applied ' - 'after Python\'s csv library has parsed ' - 'all rows).') - msg += '. ' + reason + reason = ( + "Error could possibly be due to " + "parsing errors in the skipped footer rows " + "(the skipfooter keyword is only applied " + "after Python's csv library has parsed " + "all rows)." + ) + msg += ". 
" + reason self._alert_malformed(msg, row_num) return None @@ -2865,11 +2970,10 @@ def _check_comments(self, lines): for l in lines: rl = [] for x in l: - if (not isinstance(x, str) or - self.comment not in x): + if not isinstance(x, str) or self.comment not in x: rl.append(x) else: - x = x[:x.find(self.comment)] + x = x[: x.find(self.comment)] if len(x) > 0: rl.append(x) break @@ -2895,8 +2999,11 @@ def _remove_empty_lines(self, lines): ret = [] for l in lines: # Remove empty lines and lines with only one whitespace value - if (len(l) > 1 or len(l) == 1 and - (not isinstance(l[0], str) or l[0].strip())): + if ( + len(l) > 1 + or len(l) == 1 + and (not isinstance(l[0], str) or l[0].strip()) + ): ret.append(l) return ret @@ -2904,20 +3011,21 @@ def _check_thousands(self, lines): if self.thousands is None: return lines - return self._search_replace_num_columns(lines=lines, - search=self.thousands, - replace='') + return self._search_replace_num_columns( + lines=lines, search=self.thousands, replace="" + ) def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] for i, x in enumerate(l): - if (not isinstance(x, str) or - search not in x or - (self._no_thousands_columns and - i in self._no_thousands_columns) or - self.nonnum.search(x.strip())): + if ( + not isinstance(x, str) + or search not in x + or (self._no_thousands_columns and i in self._no_thousands_columns) + or self.nonnum.search(x.strip()) + ): rl.append(x) else: rl.append(x.replace(search, replace)) @@ -2925,12 +3033,12 @@ def _search_replace_num_columns(self, lines, search, replace): return ret def _check_decimal(self, lines): - if self.decimal == _parser_defaults['decimal']: + if self.decimal == _parser_defaults["decimal"]: return lines - return self._search_replace_num_columns(lines=lines, - search=self.decimal, - replace='.') + return self._search_replace_num_columns( + lines=lines, search=self.decimal, replace="." + ) def _clear_buffer(self): self.buf = [] @@ -2995,9 +3103,9 @@ def _get_index_name(self, columns): else: # Case 2 - (index_name, columns_, - self.index_col) = _clean_index_names(columns, self.index_col, - self.unnamed_cols) + (index_name, columns_, self.index_col) = _clean_index_names( + columns, self.index_col, self.unnamed_cols + ) return index_name, orig_names, columns @@ -3012,9 +3120,7 @@ def _rows_to_cols(self, content): # Check that there are no rows with too many # elements in their row (rows with too few # elements are padded with NaN). - if (max_len > col_len and - self.index_col is not False and - self.usecols is None): + if max_len > col_len and self.index_col is not False and self.usecols is None: footers = self.skipfooter if self.skipfooter else 0 bad_lines = [] @@ -3037,32 +3143,43 @@ def _rows_to_cols(self, content): content.append(l) for row_num, actual_len in bad_lines: - msg = ('Expected {col_len} fields in line {line}, saw ' - '{length}'.format(col_len=col_len, line=(row_num + 1), - length=actual_len)) - if (self.delimiter and - len(self.delimiter) > 1 and - self.quoting != csv.QUOTE_NONE): + msg = ( + "Expected {col_len} fields in line {line}, saw " + "{length}".format( + col_len=col_len, line=(row_num + 1), length=actual_len + ) + ) + if ( + self.delimiter + and len(self.delimiter) > 1 + and self.quoting != csv.QUOTE_NONE + ): # see gh-13374 - reason = ('Error could possibly be due to quotes being ' - 'ignored when a multi-char delimiter is used.') - msg += '. 
' + reason + reason = ( + "Error could possibly be due to quotes being " + "ignored when a multi-char delimiter is used." + ) + msg += ". " + reason self._alert_malformed(msg, row_num + 1) # see gh-13320 - zipped_content = list(lib.to_object_array( - content, min_width=col_len).T) + zipped_content = list(lib.to_object_array(content, min_width=col_len).T) if self.usecols: if self._implicit_index: zipped_content = [ - a for i, a in enumerate(zipped_content) - if (i < len(self.index_col) or - i - len(self.index_col) in self._col_indices)] + a + for i, a in enumerate(zipped_content) + if ( + i < len(self.index_col) + or i - len(self.index_col) in self._col_indices + ) + ] else: - zipped_content = [a for i, a in enumerate(zipped_content) - if i in self._col_indices] + zipped_content = [ + a for i, a in enumerate(zipped_content) if i in self._col_indices + ] return zipped_content def _get_lines(self, rows=None): @@ -3084,16 +3201,19 @@ def _get_lines(self, rows=None): if self.pos > len(self.data): raise StopIteration if rows is None: - new_rows = self.data[self.pos:] + new_rows = self.data[self.pos :] new_pos = len(self.data) else: - new_rows = self.data[self.pos:self.pos + rows] + new_rows = self.data[self.pos : self.pos + rows] new_pos = self.pos + rows # Check for stop rows. n.b.: self.skiprows is a set. if self.skiprows: - new_rows = [row for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos)] + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] lines.extend(new_rows) self.pos = new_pos @@ -3109,8 +3229,7 @@ def _get_lines(self, rows=None): rows = 0 while True: - new_row = self._next_iter_line( - row_num=self.pos + rows + 1) + new_row = self._next_iter_line(row_num=self.pos + rows + 1) rows += 1 if new_row is not None: @@ -3118,8 +3237,11 @@ def _get_lines(self, rows=None): except StopIteration: if self.skiprows: - new_rows = [row for i, row in enumerate(new_rows) - if not self.skipfunc(i + self.pos)] + new_rows = [ + row + for i, row in enumerate(new_rows) + if not self.skipfunc(i + self.pos) + ] lines.extend(new_rows) if len(lines) == 0: raise @@ -3130,7 +3252,7 @@ def _get_lines(self, rows=None): lines = new_rows if self.skipfooter: - lines = lines[:-self.skipfooter] + lines = lines[: -self.skipfooter] lines = self._check_comments(lines) if self.skip_blank_lines: @@ -3139,8 +3261,9 @@ def _get_lines(self, rows=None): return self._check_decimal(lines) -def _make_date_converter(date_parser=None, dayfirst=False, - infer_datetime_format=False, cache_dates=True): +def _make_date_converter( + date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True +): def converter(*date_cols): if date_parser is None: strs = parsing._concat_date_cols(date_cols) @@ -3150,25 +3273,22 @@ def converter(*date_cols): ensure_object(strs), utc=None, dayfirst=dayfirst, - errors='ignore', + errors="ignore", infer_datetime_format=infer_datetime_format, - cache=cache_dates + cache=cache_dates, ).to_numpy() except ValueError: return tools.to_datetime( - parsing.try_parse_dates(strs, dayfirst=dayfirst), - cache=cache_dates + parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates ) else: try: result = tools.to_datetime( - date_parser(*date_cols), - errors='ignore', - cache=cache_dates + date_parser(*date_cols), errors="ignore", cache=cache_dates ) if isinstance(result, datetime.datetime): - raise Exception('scalar parser') + raise Exception("scalar parser") return result except Exception: try: @@ -3176,22 +3296,29 @@ def 
converter(*date_cols): parsing.try_parse_dates( parsing._concat_date_cols(date_cols), parser=date_parser, - dayfirst=dayfirst), - errors='ignore') + dayfirst=dayfirst, + ), + errors="ignore", + ) except Exception: return generic_parser(date_parser, *date_cols) return converter -def _process_date_conversion(data_dict, converter, parse_spec, - index_col, index_names, columns, - keep_date_col=False): +def _process_date_conversion( + data_dict, + converter, + parse_spec, + index_col, + index_names, + columns, + keep_date_col=False, +): def _isindex(colspec): - return ((isinstance(index_col, list) and - colspec in index_col) or - (isinstance(index_names, list) and - colspec in index_names)) + return (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ) new_cols = [] new_data = {} @@ -3215,11 +3342,12 @@ def _isindex(colspec): data_dict[colspec] = converter(data_dict[colspec]) else: new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names) + converter, colspec, data_dict, orig_names + ) if new_name in data_dict: raise ValueError( - 'New date column already in dict {name}'.format( - name=new_name)) + "New date column already in dict {name}".format(name=new_name) + ) new_data[new_name] = col new_cols.append(new_name) date_cols.update(old_names) @@ -3229,10 +3357,12 @@ def _isindex(colspec): for new_name, colspec in parse_spec.items(): if new_name in data_dict: raise ValueError( - 'Date column {name} already in dict'.format(name=new_name)) + "Date column {name} already in dict".format(name=new_name) + ) - _, col, old_names = _try_convert_dates(converter, colspec, - data_dict, orig_names) + _, col, old_names = _try_convert_dates( + converter, colspec, data_dict, orig_names + ) new_data[new_name] = col new_cols.append(new_name) @@ -3261,7 +3391,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns): else: colnames.append(c) - new_name = '_'.join(str(x) for x in colnames) + new_name = "_".join(str(x) for x in colnames) to_parse = [data_dict[c] for c in colnames if c in data_dict] new_col = parser(*to_parse) @@ -3377,8 +3507,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): for i, n in enumerate(index_col): columns.pop(n - i) - col_dict = {col_name: Series([], dtype=dtype[col_name]) - for col_name in columns} + col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns} return index, columns, col_dict @@ -3473,29 +3602,35 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. 
""" - def __init__(self, f, colspecs, delimiter, comment, skiprows=None, - infer_nrows=100): + def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100): self.f = f self.buffer = None - self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' + self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " self.comment = comment - if colspecs == 'infer': - self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows, - skiprows=skiprows) + if colspecs == "infer": + self.colspecs = self.detect_colspecs( + infer_nrows=infer_nrows, skiprows=skiprows + ) else: self.colspecs = colspecs if not isinstance(self.colspecs, (tuple, list)): - raise TypeError("column specifications must be a list or tuple, " - "input was a %r" % type(colspecs).__name__) + raise TypeError( + "column specifications must be a list or tuple, " + "input was a %r" % type(colspecs).__name__ + ) for colspec in self.colspecs: - if not (isinstance(colspec, (tuple, list)) and - len(colspec) == 2 and - isinstance(colspec[0], (int, np.integer, type(None))) and - isinstance(colspec[1], (int, np.integer, type(None)))): - raise TypeError('Each column specification must be ' - '2 element tuple or list of integers') + if not ( + isinstance(colspec, (tuple, list)) + and len(colspec) == 2 + and isinstance(colspec[0], (int, np.integer, type(None))) + and isinstance(colspec[1], (int, np.integer, type(None))) + ): + raise TypeError( + "Each column specification must be " + "2 element tuple or list of integers" + ) def get_rows(self, infer_nrows, skiprows=None): """ @@ -3537,8 +3672,8 @@ def get_rows(self, infer_nrows, skiprows=None): def detect_colspecs(self, infer_nrows=100, skiprows=None): # Regex escape the delimiters - delimiters = ''.join(r'\{}'.format(x) for x in self.delimiter) - pattern = re.compile('([^{}]+)'.format(delimiters)) + delimiters = "".join(r"\{}".format(x) for x in self.delimiter) + pattern = re.compile("([^{}]+)".format(delimiters)) rows = self.get_rows(infer_nrows, skiprows) if not rows: raise EmptyDataError("No rows from which to infer column width") @@ -3548,7 +3683,7 @@ def detect_colspecs(self, infer_nrows=100, skiprows=None): rows = [row.partition(self.comment)[0] for row in rows] for row in rows: for m in pattern.finditer(row): - mask[m.start():m.end()] = 1 + mask[m.start() : m.end()] = 1 shifted = np.roll(mask, 1) shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] @@ -3565,8 +3700,7 @@ def __next__(self): else: line = next(self.f) # Note: 'colspecs' is a sequence of half-open intervals. - return [line[fromm:to].strip(self.delimiter) - for (fromm, to) in self.colspecs] + return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] class FixedWidthFieldParser(PythonParser): @@ -3577,11 +3711,16 @@ class FixedWidthFieldParser(PythonParser): def __init__(self, f, **kwds): # Support iterators, convert to a list. 
- self.colspecs = kwds.pop('colspecs') - self.infer_nrows = kwds.pop('infer_nrows') + self.colspecs = kwds.pop("colspecs") + self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): - self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment, self.skiprows, - self.infer_nrows) + self.data = FixedWidthReader( + f, + self.colspecs, + self.delimiter, + self.comment, + self.skiprows, + self.infer_nrows, + ) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index afe1622d99eac..4e390de87fc60 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -10,8 +10,7 @@ from pandas.io.common import _get_handle, _stringify_path -def to_pickle(obj, path, compression='infer', - protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): """ Pickle (serialize) object to file. @@ -70,9 +69,7 @@ def to_pickle(obj, path, compression='infer', >>> os.remove("./dummy.pkl") """ path = _stringify_path(path) - f, fh = _get_handle(path, 'wb', - compression=compression, - is_text=False) + f, fh = _get_handle(path, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -83,7 +80,7 @@ def to_pickle(obj, path, compression='infer', _f.close() -def read_pickle(path, compression='infer'): +def read_pickle(path, compression="infer"): """ Load pickled pandas object (or any object) from file. @@ -145,7 +142,7 @@ def read_pickle(path, compression='infer'): >>> os.remove("./dummy.pkl") """ path = _stringify_path(path) - f, fh = _get_handle(path, 'rb', compression=compression, is_text=False) + f, fh = _get_handle(path, "rb", compression=compression, is_text=False) # 1) try standard libary Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -160,12 +157,13 @@ def read_pickle(path, compression='infer'): try: return pc.load(f, encoding=None) except Exception: # noqa: E722 - return pc.load(f, encoding='latin1') + return pc.load(f, encoding="latin1") finally: f.close() for _f in fh: _f.close() + # compat with sparse pickle / unpickle diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f439e365fbcf0..9206463e18fb3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -22,15 +22,31 @@ from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_type, is_list_like, - is_timedelta64_dtype) + ensure_object, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_type, + is_list_like, + is_timedelta64_dtype, +) from pandas.core.dtypes.missing import array_equivalent from pandas import ( - DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, PeriodIndex, - Series, SparseDataFrame, SparseSeries, TimedeltaIndex, concat, isna, - to_datetime) + DataFrame, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + PeriodIndex, + Series, + SparseDataFrame, + SparseSeries, + TimedeltaIndex, + concat, + isna, + to_datetime, +) from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.sparse import BlockIndex, IntIndex import pandas.core.common as com @@ -42,16 +58,16 @@ from pandas.io.formats.printing import adjoin, pprint_thing # versioning attribute -_version = '0.15.2' +_version = "0.15.2" # encoding -_default_encoding = 'UTF-8' +_default_encoding = "UTF-8" def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if 
isinstance(s, np.bytes_): - s = s.decode('UTF-8') + s = s.decode("UTF-8") return s @@ -145,12 +161,7 @@ class DuplicateWarning(Warning): """ # formats -_FORMAT_MAP = { - 'f': 'fixed', - 'fixed': 'fixed', - 't': 'table', - 'table': 'table', -} +_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} format_deprecate_doc = """ the table keyword has been deprecated @@ -163,38 +174,35 @@ class DuplicateWarning(Warning): # map object types _TYPE_MAP = { - - Series: 'series', - SparseSeries: 'sparse_series', - DataFrame: 'frame', - SparseDataFrame: 'sparse_frame', + Series: "series", + SparseSeries: "sparse_series", + DataFrame: "frame", + SparseDataFrame: "sparse_frame", } # storer class map _STORER_MAP = { - 'Series': 'LegacySeriesFixed', - 'DataFrame': 'LegacyFrameFixed', - 'DataMatrix': 'LegacyFrameFixed', - 'series': 'SeriesFixed', - 'sparse_series': 'SparseSeriesFixed', - 'frame': 'FrameFixed', - 'sparse_frame': 'SparseFrameFixed', + "Series": "LegacySeriesFixed", + "DataFrame": "LegacyFrameFixed", + "DataMatrix": "LegacyFrameFixed", + "series": "SeriesFixed", + "sparse_series": "SparseSeriesFixed", + "frame": "FrameFixed", + "sparse_frame": "SparseFrameFixed", } # table class map _TABLE_MAP = { - 'generic_table': 'GenericTable', - 'appendable_series': 'AppendableSeriesTable', - 'appendable_multiseries': 'AppendableMultiSeriesTable', - 'appendable_frame': 'AppendableFrameTable', - 'appendable_multiframe': 'AppendableMultiFrameTable', - 'worm': 'WORMTable', + "generic_table": "GenericTable", + "appendable_series": "AppendableSeriesTable", + "appendable_multiseries": "AppendableMultiSeriesTable", + "appendable_frame": "AppendableFrameTable", + "appendable_multiframe": "AppendableMultiFrameTable", + "worm": "WORMTable", } # axes map -_AXES_MAP = { - DataFrame: [0], -} +_AXES_MAP = {DataFrame: [0]} # register our configuration options dropna_doc = """ @@ -207,12 +215,13 @@ class DuplicateWarning(Warning): put will default to 'fixed' and append will default to 'table' """ -with config.config_prefix('io.hdf'): - config.register_option('dropna_table', False, dropna_doc, - validator=config.is_bool) +with config.config_prefix("io.hdf"): + config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) config.register_option( - 'default_format', None, format_doc, - validator=config.is_one_of_factory(['fixed', 'table', None]) + "default_format", + None, + format_doc, + validator=config.is_one_of_factory(["fixed", "table", None]), ) # oh the troubles to reduce import time @@ -225,6 +234,7 @@ def _tables(): global _table_file_open_policy_is_strict if _table_mod is None: import tables + _table_mod = tables # set the file open policy @@ -232,17 +242,27 @@ def _tables(): # depending on the HDF5 version try: _table_file_open_policy_is_strict = ( - tables.file._FILE_OPEN_POLICY == 'strict') + tables.file._FILE_OPEN_POLICY == "strict" + ) except AttributeError: pass return _table_mod + # interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, - append=None, **kwargs): +def to_hdf( + path_or_buf, + key, + value, + mode=None, + complevel=None, + complib=None, + append=None, + **kwargs +): """ store this object, close it if we opened it """ if append: @@ -252,14 +272,15 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - with HDFStore(path_or_buf, mode=mode, complevel=complevel, - complib=complib) as store: + with 
HDFStore( + path_or_buf, mode=mode, complevel=complevel, complib=complib + ) as store: f(store) else: f(path_or_buf) -def read_hdf(path_or_buf, key=None, mode='r', **kwargs): +def read_hdf(path_or_buf, key=None, mode="r", **kwargs): """ Read from the store, close it if we opened it. @@ -319,24 +340,27 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): >>> reread = pd.read_hdf('./store.h5') """ - if mode not in ['r', 'r+', 'a']: - raise ValueError('mode {0} is not allowed while performing a read. ' - 'Allowed modes are r, r+ and a.'.format(mode)) + if mode not in ["r", "r+", "a"]: + raise ValueError( + "mode {0} is not allowed while performing a read. " + "Allowed modes are r, r+ and a.".format(mode) + ) # grab the scope - if 'where' in kwargs: - kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) + if "where" in kwargs: + kwargs["where"] = _ensure_term(kwargs["where"], scope_level=1) if isinstance(path_or_buf, HDFStore): if not path_or_buf.is_open: - raise IOError('The HDFStore must be open for reading.') + raise IOError("The HDFStore must be open for reading.") store = path_or_buf auto_close = False else: path_or_buf = _stringify_path(path_or_buf) if not isinstance(path_or_buf, str): - raise NotImplementedError('Support for generic buffers has not ' - 'been implemented.') + raise NotImplementedError( + "Support for generic buffers has not " "been implemented." + ) try: exists = os.path.exists(path_or_buf) @@ -346,7 +370,8 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): if not exists: raise FileNotFoundError( - 'File {path} does not exist'.format(path=path_or_buf)) + "File {path} does not exist".format(path=path_or_buf) + ) store = HDFStore(path_or_buf, mode=mode, **kwargs) # can't auto open/close if we are using an iterator @@ -357,7 +382,7 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): if key is None: groups = store.groups() if len(groups) == 0: - raise ValueError('No dataset in HDF5 file.') + raise ValueError("No dataset in HDF5 file.") candidate_only_group = groups[0] # For the HDF file to have only one dataset, all other groups @@ -366,8 +391,10 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs): # before their children.) for group_to_check in groups[1:]: if not _is_metadata_of(group_to_check, candidate_only_group): - raise ValueError('key must be provided when HDF5 file ' - 'contains multiple datasets.') + raise ValueError( + "key must be provided when HDF5 file " + "contains multiple datasets." 
+ ) key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except (ValueError, TypeError, KeyError): @@ -388,7 +415,7 @@ def _is_metadata_of(group, parent_group): current = group while current._v_depth > 1: parent = current._v_parent - if parent == parent_group and current._v_name == 'meta': + if parent == parent_group and current._v_name == "meta": return True current = current._v_parent return False @@ -439,25 +466,28 @@ class HDFStore: >>> store.close() """ - def __init__(self, path, mode=None, complevel=None, complib=None, - fletcher32=False, **kwargs): + def __init__( + self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs + ): - if 'format' in kwargs: - raise ValueError('format is not a defined argument for HDFStore') + if "format" in kwargs: + raise ValueError("format is not a defined argument for HDFStore") tables = import_optional_dependency("tables") if complib is not None and complib not in tables.filters.all_complibs: raise ValueError( "complib only supports {libs} compression.".format( - libs=tables.filters.all_complibs)) + libs=tables.filters.all_complibs + ) + ) if complib is None and complevel is not None: complib = tables.filters.default_complib self._path = _stringify_path(path) if mode is None: - mode = 'a' + mode = "a" self._mode = mode self._handle = None self._complevel = complevel if complevel else 0 @@ -496,7 +526,9 @@ def __getattr__(self, name): pass raise AttributeError( "'{object}' object has no attribute '{name}'".format( - object=type(self).__name__, name=name)) + object=type(self).__name__, name=name + ) + ) def __contains__(self, key): """ check for existence of this key @@ -513,8 +545,9 @@ def __len__(self): return len(self.groups()) def __repr__(self): - return '{type}\nFile path: {path}\n'.format( - type=type(self), path=pprint_thing(self._path)) + return "{type}\nFile path: {path}\n".format( + type=type(self), path=pprint_thing(self._path) + ) def __enter__(self): return self @@ -546,7 +579,7 @@ def items(self): iteritems = items - def open(self, mode='a', **kwargs): + def open(self, mode="a", **kwargs): """ Open the file in the specified mode @@ -560,16 +593,15 @@ def open(self, mode='a', **kwargs): if self._mode != mode: # if we are changing a write mode to read, ok - if self._mode in ['a', 'w'] and mode in ['r', 'r+']: + if self._mode in ["a", "w"] and mode in ["r", "r+"]: pass - elif mode in ['w']: + elif mode in ["w"]: # this would truncate, raise here if self.is_open: raise PossibleDataLossError( "Re-opening the file [{0}] with mode [{1}] " - "will delete the current file!" 
- .format(self._path, self._mode) + "will delete the current file!".format(self._path, self._mode) ) self._mode = mode @@ -579,16 +611,16 @@ def open(self, mode='a', **kwargs): self.close() if self._complevel and self._complevel > 0: - self._filters = _tables().Filters(self._complevel, self._complib, - fletcher32=self._fletcher32) + self._filters = _tables().Filters( + self._complevel, self._complib, fletcher32=self._fletcher32 + ) try: self._handle = tables.open_file(self._path, self._mode, **kwargs) except (IOError) as e: # pragma: no cover - if 'can not be written' in str(e): - print( - 'Opening {path} in read-only mode'.format(path=self._path)) - self._handle = tables.open_file(self._path, 'r', **kwargs) + if "can not be written" in str(e): + print("Opening {path} in read-only mode".format(path=self._path)) + self._handle = tables.open_file(self._path, "r", **kwargs) else: raise @@ -596,7 +628,7 @@ def open(self, mode='a', **kwargs): # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message - if 'FILE_OPEN_POLICY' in str(e): + if "FILE_OPEN_POLICY" in str(e): e = ValueError( "PyTables [{version}] no longer supports opening multiple " "files\n" @@ -605,9 +637,11 @@ def open(self, mode='a', **kwargs): "and not open the same file multiple times at once,\n" "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " "which allows\n" - "files to be opened multiple times at once\n" - .format(version=tables.__version__, - hdf_version=tables.get_hdf5_version())) + "files to be opened multiple times at once\n".format( + version=tables.__version__, + hdf_version=tables.get_hdf5_version(), + ) + ) raise e @@ -615,7 +649,7 @@ def open(self, mode='a', **kwargs): # trying to read from a non-existent file causes an error which # is not part of IOError, make it one - if self._mode == 'r' and 'Unable to open/create file' in str(e): + if self._mode == "r" and "Unable to open/create file" in str(e): raise IOError(str(e)) raise @@ -674,11 +708,21 @@ def get(self, key): """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, - iterator=False, chunksize=None, auto_close=False, **kwargs): + def select( + self, + key, + where=None, + start=None, + stop=None, + columns=None, + iterator=False, + chunksize=None, + auto_close=False, + **kwargs + ): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -702,7 +746,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) # create the storer and axes where = _ensure_term(where, scope_level=1) @@ -711,19 +755,25 @@ def select(self, key, where=None, start=None, stop=None, columns=None, # function to call on iteration def func(_start, _stop, _where): - return s.read(start=_start, stop=_stop, - where=_where, - columns=columns) + return s.read(start=_start, stop=_stop, where=_where, columns=columns) # create the iterator - it = TableIterator(self, s, func, where=where, nrows=s.nrows, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, auto_close=auto_close) + it = TableIterator( + self, + s, + func, + where=where, + nrows=s.nrows, + start=start, + stop=stop, + 
iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) return it.get_result() - def select_as_coordinates( - self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as an Index @@ -735,8 +785,9 @@ def select_as_coordinates( stop : integer (defaults to None), row number to stop selection """ where = _ensure_term(where, scope_level=1) - return self.get_storer(key).read_coordinates(where=where, start=start, - stop=stop, **kwargs) + return self.get_storer(key).read_coordinates( + where=where, start=start, stop=stop, **kwargs + ) def select_column(self, key, column, **kwargs): """ @@ -758,9 +809,19 @@ def select_column(self, key, column, **kwargs): """ return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, - start=None, stop=None, iterator=False, - chunksize=None, auto_close=False, **kwargs): + def select_as_multiple( + self, + keys, + where=None, + selector=None, + columns=None, + start=None, + stop=None, + iterator=False, + chunksize=None, + auto_close=False, + **kwargs + ): """ Retrieve pandas objects from multiple tables Parameters @@ -786,9 +847,16 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, str): - return self.select(key=keys, where=where, columns=columns, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, **kwargs) + return self.select( + key=keys, + where=where, + columns=columns, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + **kwargs + ) if not isinstance(keys, (list, tuple)): raise TypeError("keys must be a list/tuple") @@ -817,8 +885,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if nrows is None: nrows = t.nrows elif t.nrows != nrows: - raise ValueError( - "all tables must have exactly the same nrows!") + raise ValueError("all tables must have exactly the same nrows!") # axis is the concentration axes axis = list({t.non_index_axes[0][0] for t in tbls})[0] @@ -827,17 +894,29 @@ def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here - objs = [t.read(where=_where, columns=columns, start=_start, - stop=_stop, **kwargs) for t in tbls] + objs = [ + t.read( + where=_where, columns=columns, start=_start, stop=_stop, **kwargs + ) + for t in tbls + ] # concat and return - return concat(objs, axis=axis, - verify_integrity=False)._consolidate() + return concat(objs, axis=axis, verify_integrity=False)._consolidate() # create the iterator - it = TableIterator(self, s, func, where=where, nrows=nrows, - start=start, stop=stop, iterator=iterator, - chunksize=chunksize, auto_close=auto_close) + it = TableIterator( + self, + s, + func, + where=where, + nrows=nrows, + start=start, + stop=stop, + iterator=iterator, + chunksize=chunksize, + auto_close=auto_close, + ) return it.get_result(coordinates=True) @@ -867,7 +946,7 @@ def put(self, key, value, format=None, append=False, **kwargs): the store settable by the option 'io.hdf.dropna_table' """ if format is None: - format = get_option("io.hdf.default_format") or 'fixed' + format = get_option("io.hdf.default_format") or "fixed" kwargs = self._validate_format(format, kwargs) self._write_to_group(key, value, append=append, **kwargs) @@ -902,7 +981,8 @@ def remove(self, key, where=None, 
start=None, stop=None): if where is not None: raise ValueError( - "trying to remove a node with a non-None where clause!") + "trying to remove a node with a non-None where clause!" + ) # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -918,11 +998,13 @@ def remove(self, key, where=None, start=None, stop=None): else: if not s.is_table: raise ValueError( - 'can only remove with where on objects written as tables') + "can only remove with where on objects written as tables" + ) return s.delete(where=where, start=start, stop=stop) - def append(self, key, value, format=None, append=True, columns=None, - dropna=None, **kwargs): + def append( + self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs + ): """ Append to Table in file. Node must already exist and be Table format. @@ -957,19 +1039,20 @@ def append(self, key, value, format=None, append=True, columns=None, data in the table, so be careful """ if columns is not None: - raise TypeError("columns is not a supported keyword in append, " - "try data_columns") + raise TypeError( + "columns is not a supported keyword in append, " "try data_columns" + ) if dropna is None: dropna = get_option("io.hdf.dropna_table") if format is None: - format = get_option("io.hdf.default_format") or 'table' + format = get_option("io.hdf.default_format") or "table" kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, dropna=dropna, - **kwargs) + self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, - axes=None, dropna=False, **kwargs): + def append_to_multiple( + self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs + ): """ Append to multiple tables @@ -992,9 +1075,11 @@ def append_to_multiple(self, d, value, selector, data_columns=None, """ if axes is not None: - raise TypeError("axes is currently not accepted as a parameter to" - " append_to_multiple; you can create the " - "tables independently instead") + raise TypeError( + "axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently instead" + ) if not isinstance(d, dict): raise ValueError( @@ -1035,7 +1120,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None, # ensure rows are synchronized across the tables if dropna: - idxs = (value[cols].dropna(how='all').index for cols in d.values()) + idxs = (value[cols].dropna(how="all").index for cols in d.values()) valid_index = next(idxs) for index in idxs: valid_index = valid_index.intersection(index) @@ -1069,8 +1154,7 @@ def create_table_index(self, key, **kwargs): return if not s.is_table: - raise TypeError( - "cannot create table index on a Fixed format store") + raise TypeError("cannot create table index on a Fixed format store") s.create_index(**kwargs) def groups(self): @@ -1084,12 +1168,16 @@ def groups(self): _tables() self._check_if_open() return [ - g for g in self._handle.walk_groups() - if (not isinstance(g, _table_mod.link.Link) and - (getattr(g._v_attrs, 'pandas_type', None) or - getattr(g, 'table', None) or - (isinstance(g, _table_mod.table.Table) and - g._v_name != 'table'))) + g + for g in self._handle.walk_groups() + if ( + not isinstance(g, _table_mod.link.Link) + and ( + getattr(g._v_attrs, "pandas_type", None) + or getattr(g, "table", None) + or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") + ) + ) ] def walk(self, where="/"): @@ 
-1123,27 +1211,27 @@ def walk(self, where="/"): _tables() self._check_if_open() for g in self._handle.walk_groups(where): - if getattr(g._v_attrs, 'pandas_type', None) is not None: + if getattr(g._v_attrs, "pandas_type", None) is not None: continue groups = [] leaves = [] for child in g._v_children.values(): - pandas_type = getattr(child._v_attrs, 'pandas_type', None) + pandas_type = getattr(child._v_attrs, "pandas_type", None) if pandas_type is None: if isinstance(child, _table_mod.group.Group): groups.append(child._v_name) else: leaves.append(child._v_name) - yield (g._v_pathname.rstrip('/'), groups, leaves) + yield (g._v_pathname.rstrip("/"), groups, leaves) def get_node(self, key): """ return the node with the key or None if it does not exist """ self._check_if_open() try: - if not key.startswith('/'): - key = '/' + key + if not key.startswith("/"): + key = "/" + key return self._handle.get_node(self.root, key) except _table_mod.exceptions.NoSuchNodeError: return None @@ -1152,14 +1240,23 @@ def get_storer(self, key): """ return the storer object for a key, raise if not in the file """ group = self.get_node(key) if group is None: - raise KeyError('No object named {key} in the file'.format(key=key)) + raise KeyError("No object named {key} in the file".format(key=key)) s = self._create_storer(group) s.infer_axes() return s - def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, - complevel=None, fletcher32=False, overwrite=True): + def copy( + self, + file, + mode="w", + propindexes=True, + keys=None, + complib=None, + complevel=None, + fletcher32=False, + overwrite=True, + ): """ copy the existing store to a new file, upgrading in place Parameters @@ -1176,11 +1273,8 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, """ new_store = HDFStore( - file, - mode=mode, - complib=complib, - complevel=complevel, - fletcher32=fletcher32) + file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 + ) if keys is None: keys = list(self.keys()) if not isinstance(keys, (tuple, list)): @@ -1200,9 +1294,11 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( - k, data, index=index, - data_columns=getattr(s, 'data_columns', None), - encoding=s.encoding + k, + data, + index=index, + data_columns=getattr(s, "data_columns", None), + encoding=s.encoding, ) else: new_store.put(k, data, encoding=s.encoding) @@ -1219,8 +1315,9 @@ def info(self): ------- str """ - output = '{type}\nFile path: {path}\n'.format( - type=type(self), path=pprint_thing(self._path)) + output = "{type}\nFile path: {path}\n".format( + type=type(self), path=pprint_thing(self._path) + ) if self.is_open: lkeys = sorted(list(self.keys())) if len(lkeys): @@ -1232,17 +1329,18 @@ def info(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append( - pprint_thing(s or 'invalid_HDFStore node')) + values.append(pprint_thing(s or "invalid_HDFStore node")) except Exception as detail: keys.append(k) values.append( "[invalid_HDFStore node: {detail}]".format( - detail=pprint_thing(detail))) + detail=pprint_thing(detail) + ) + ) output += adjoin(12, keys, values) else: - output += 'Empty' + output += "Empty" else: output += "File is CLOSED" @@ -1259,58 +1357,64 @@ def _validate_format(self, format, kwargs): # validate try: - kwargs['format'] = _FORMAT_MAP[format.lower()] + kwargs["format"] = _FORMAT_MAP[format.lower()] except KeyError: 
- raise TypeError("invalid HDFStore format specified [{0}]" - .format(format)) + raise TypeError("invalid HDFStore format specified [{0}]".format(format)) return kwargs - def _create_storer(self, group, format=None, value=None, append=False, - **kwargs): + def _create_storer(self, group, format=None, value=None, append=False, **kwargs): """ return a suitable class to operate """ def error(t): raise TypeError( "cannot properly create the storer for: [{t}] [group->" "{group},value->{value},format->{format},append->{append}," - "kwargs->{kwargs}]".format(t=t, group=group, - value=type(value), format=format, - append=append, kwargs=kwargs)) + "kwargs->{kwargs}]".format( + t=t, + group=group, + value=type(value), + format=format, + append=append, + kwargs=kwargs, + ) + ) - pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) - tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) + pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) + tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) # infer the pt from the passed value if pt is None: if value is None: _tables() - if (getattr(group, 'table', None) or - isinstance(group, _table_mod.table.Table)): - pt = 'frame_table' - tt = 'generic_table' + if getattr(group, "table", None) or isinstance( + group, _table_mod.table.Table + ): + pt = "frame_table" + tt = "generic_table" else: raise TypeError( "cannot create a storer if the object is not existing " - "nor a value are passed") + "nor a value are passed" + ) else: try: pt = _TYPE_MAP[type(value)] except KeyError: - error('_TYPE_MAP') + error("_TYPE_MAP") # we are actually a table - if format == 'table': - pt += '_table' + if format == "table": + pt += "_table" # a storer node - if 'table' not in pt: + if "table" not in pt: try: return globals()[_STORER_MAP[pt]](self, group, **kwargs) except KeyError: - error('_STORER_MAP') + error("_STORER_MAP") # existing node (and must be a table) if tt is None: @@ -1318,43 +1422,52 @@ def error(t): # if we are a writer, determine the tt if value is not None: - if pt == 'series_table': - index = getattr(value, 'index', None) + if pt == "series_table": + index = getattr(value, "index", None) if index is not None: if index.nlevels == 1: - tt = 'appendable_series' + tt = "appendable_series" elif index.nlevels > 1: - tt = 'appendable_multiseries' - elif pt == 'frame_table': - index = getattr(value, 'index', None) + tt = "appendable_multiseries" + elif pt == "frame_table": + index = getattr(value, "index", None) if index is not None: if index.nlevels == 1: - tt = 'appendable_frame' + tt = "appendable_frame" elif index.nlevels > 1: - tt = 'appendable_multiframe' - elif pt == 'wide_table': - tt = 'appendable_panel' - elif pt == 'ndim_table': - tt = 'appendable_ndim' + tt = "appendable_multiframe" + elif pt == "wide_table": + tt = "appendable_panel" + elif pt == "ndim_table": + tt = "appendable_ndim" else: # distinguish between a frame/table - tt = 'legacy_panel' + tt = "legacy_panel" try: fields = group.table._v_attrs.fields - if len(fields) == 1 and fields[0] == 'value': - tt = 'legacy_frame' + if len(fields) == 1 and fields[0] == "value": + tt = "legacy_frame" except IndexError: pass try: return globals()[_TABLE_MAP[tt]](self, group, **kwargs) except KeyError: - error('_TABLE_MAP') - - def _write_to_group(self, key, value, format, index=True, append=False, - complib=None, encoding=None, **kwargs): + error("_TABLE_MAP") + + def _write_to_group( + self, + key, + value, + format, + index=True, + append=False, + 
complib=None, + encoding=None, + **kwargs + ): group = self.get_node(key) # remove the node if we are not appending @@ -1364,43 +1477,41 @@ def _write_to_group(self, key, value, format, index=True, append=False, # we don't want to store a table node at all if are object is 0-len # as there are not dtypes - if getattr(value, 'empty', None) and (format == 'table' or append): + if getattr(value, "empty", None) and (format == "table" or append): return if group is None: - paths = key.split('/') + paths = key.split("/") # recursively create the groups - path = '/' + path = "/" for p in paths: if not len(p): continue new_path = path - if not path.endswith('/'): - new_path += '/' + if not path.endswith("/"): + new_path += "/" new_path += p group = self.get_node(new_path) if group is None: group = self._handle.create_group(path, p) path = new_path - s = self._create_storer(group, format, value, append=append, - encoding=encoding, **kwargs) + s = self._create_storer( + group, format, value, append=append, encoding=encoding, **kwargs + ) if append: # raise if we are trying to append to a Fixed format, # or a table that exists (and we are putting) - if (not s.is_table or - (s.is_table and format == 'fixed' and s.is_exists)): - raise ValueError('Can only append to Tables') + if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): + raise ValueError("Can only append to Tables") if not s.is_exists: s.set_object_info() else: s.set_object_info() if not s.is_table and complib: - raise ValueError( - 'Compression not supported on Fixed format stores' - ) + raise ValueError("Compression not supported on Fixed format stores") # write the object s.write(obj=value, append=append, complib=complib, **kwargs) @@ -1435,8 +1546,19 @@ class TableIterator: kwargs : the passed kwargs """ - def __init__(self, store, s, func, where, nrows, start=None, stop=None, - iterator=False, chunksize=None, auto_close=False): + def __init__( + self, + store, + s, + func, + where, + nrows, + start=None, + stop=None, + iterator=False, + chunksize=None, + auto_close=False, + ): self.store = store self.s = s self.func = func @@ -1491,8 +1613,7 @@ def get_result(self, coordinates=False): # return the actual iterator if self.chunksize is not None: if not self.s.is_table: - raise TypeError( - "can only use an iterator or chunksize on a table") + raise TypeError("can only use an iterator or chunksize on a table") self.coordinates = self.s.read_coordinates(where=self.where) @@ -1500,8 +1621,9 @@ def get_result(self, coordinates=False): # if specified read via coordinates (necessary for multiple selections if coordinates: - where = self.s.read_coordinates(where=self.where, start=self.start, - stop=self.stop) + where = self.s.read_coordinates( + where=self.where, start=self.start, stop=self.stop + ) else: where = self.where @@ -1525,13 +1647,27 @@ class IndexCol: pos : the position in the pytables """ + is_an_indexable = True is_data_indexable = True - _info_fields = ['freq', 'tz', 'index_name'] - - def __init__(self, values=None, kind=None, typ=None, cname=None, - itemsize=None, name=None, axis=None, kind_attr=None, - pos=None, freq=None, tz=None, index_name=None, **kwargs): + _info_fields = ["freq", "tz", "index_name"] + + def __init__( + self, + values=None, + kind=None, + typ=None, + cname=None, + itemsize=None, + name=None, + axis=None, + kind_attr=None, + pos=None, + freq=None, + tz=None, + index_name=None, + **kwargs + ): self.values = values self.kind = kind self.typ = typ @@ -1581,20 +1717,21 @@ def set_table(self, 
table): def __repr__(self): temp = tuple( - map(pprint_thing, - (self.name, - self.cname, - self.axis, - self.pos, - self.kind))) - return ','.join(("{key}->{value}".format(key=key, value=value) - for key, value in zip( - ['name', 'cname', 'axis', 'pos', 'kind'], temp))) + map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) + ) + return ",".join( + ( + "{key}->{value}".format(key=key, value=value) + for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) + ) + ) def __eq__(self, other): """ compare 2 col items """ - return all(getattr(self, a, None) == getattr(other, a, None) - for a in ['name', 'cname', 'axis', 'pos']) + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "axis", "pos"] + ) def __ne__(self, other): return not self.__eq__(other) @@ -1620,8 +1757,7 @@ def infer(self, handler): new_self.read_metadata(handler) return new_self - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """ set the values from this selection: take = take ownership """ # values is a recarray @@ -1632,9 +1768,9 @@ def convert(self, values, nan_rep, encoding, errors, start=None, kwargs = dict() if self.freq is not None: - kwargs['freq'] = _ensure_decoded(self.freq) + kwargs["freq"] = _ensure_decoded(self.freq) if self.index_name is not None: - kwargs['name'] = _ensure_decoded(self.index_name) + kwargs["name"] = _ensure_decoded(self.index_name) # making an Index instance could throw a number of different errors try: self.values = Index(values, **kwargs) @@ -1642,8 +1778,8 @@ def convert(self, values, nan_rep, encoding, errors, start=None, # if the output freq is different that what we recorded, # it should be None (see also 'doc example part 2') - if 'freq' in kwargs: - kwargs['freq'] = None + if "freq" in kwargs: + kwargs["freq"] = None self.values = Index(values, **kwargs) self.values = _set_tz(self.values, self.tz) @@ -1680,14 +1816,13 @@ def maybe_set_size(self, min_itemsize=None): """ maybe set a string col itemsize: min_itemsize can be an integer or a dict with this columns name with an integer size """ - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": if isinstance(min_itemsize, dict): min_itemsize = min_itemsize.get(self.name) if min_itemsize is not None and self.typ.itemsize < min_itemsize: - self.typ = _tables( - ).StringCol(itemsize=min_itemsize, pos=self.pos) + self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) def validate(self, handler, append): self.validate_names() @@ -1707,7 +1842,7 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": c = self.col if c is not None: if itemsize is None: @@ -1718,8 +1853,9 @@ def validate_col(self, itemsize=None): "[{cname}] column but\nthis column has a limit of " "[{c_itemsize}]!\nConsider using min_itemsize to " "preset the sizes on these columns".format( - itemsize=itemsize, cname=self.cname, - c_itemsize=c.itemsize)) + itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize + ) + ) return c.itemsize return None @@ -1731,8 +1867,8 @@ def validate_attr(self, append): if existing_kind is not None and existing_kind != self.kind: raise TypeError( "incompatible kind in col [{existing} - " - 
"{self_kind}]".format( - existing=existing_kind, self_kind=self.kind)) + "{self_kind}]".format(existing=existing_kind, self_kind=self.kind) + ) def update_info(self, info): """ set/update the info for this indexable with the key/value @@ -1747,7 +1883,7 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq', 'index_name']: + if key in ["freq", "index_name"]: ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning, stacklevel=6) @@ -1760,8 +1896,12 @@ def update_info(self, info): "invalid info for [{name}] for [{key}], " "existing_value [{existing_value}] conflicts with " "new value [{value}]".format( - name=self.name, key=key, - existing_value=existing_value, value=value)) + name=self.name, + key=key, + existing_value=existing_value, + value=value, + ) + ) else: if value is not None or existing_value is not None: idx[key] = value @@ -1788,13 +1928,18 @@ def read_metadata(self, handler): def validate_metadata(self, handler): """ validate that kind=category does not change the categories """ - if self.meta == 'category': + if self.meta == "category": new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) - if (new_metadata is not None and cur_metadata is not None and - not array_equivalent(new_metadata, cur_metadata)): - raise ValueError("cannot append a categorical with " - "different categories to the existing") + if ( + new_metadata is not None + and cur_metadata is not None + and not array_equivalent(new_metadata, cur_metadata) + ): + raise ValueError( + "cannot append a categorical with " + "different categories to the existing" + ) def write_metadata(self, handler): """ set the meta data """ @@ -1810,8 +1955,7 @@ class GenericIndexCol(IndexCol): def is_indexed(self): return False - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """ set the values from this selection: take = take ownership Parameters @@ -1829,8 +1973,7 @@ def convert(self, values, nan_rep, encoding, errors, start=None, """ start = start if start is not None else 0 - stop = (min(stop, self.table.nrows) - if stop is not None else self.table.nrows) + stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows self.values = Int64Index(np.arange(stop - start)) return self @@ -1855,17 +1998,17 @@ class DataCol(IndexCol): meta : a string description of the metadata metadata : the actual metadata """ + is_an_indexable = False is_data_indexable = False - _info_fields = ['tz', 'ordered'] + _info_fields = ["tz", "ordered"] @classmethod - def create_for_block( - cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: - cname = name or 'values_block_{idx}'.format(idx=i) + cname = name or "values_block_{idx}".format(idx=i) if name is None: name = cname @@ -1881,34 +2024,45 @@ def create_for_block( return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, - cname=None, data=None, meta=None, metadata=None, - block=None, **kwargs): - super().__init__(values=values, kind=kind, typ=typ, cname=cname, - **kwargs) + def __init__( + self, + values=None, + kind=None, + typ=None, + cname=None, + data=None, + meta=None, + metadata=None, + block=None, + **kwargs + ): + 
super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = '{name}_dtype'.format(name=self.name) + self.dtype_attr = "{name}_dtype".format(name=self.name) self.meta = meta - self.meta_attr = '{name}_meta'.format(name=self.name) + self.meta_attr = "{name}_meta".format(name=self.name) self.set_data(data) self.set_metadata(metadata) def __repr__(self): temp = tuple( - map(pprint_thing, - (self.name, - self.cname, - self.dtype, - self.kind, - self.shape))) - return ','.join(("{key}->{value}".format(key=key, value=value) - for key, value in zip( - ['name', 'cname', 'dtype', 'kind', 'shape'], temp))) + map( + pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) + ) + ) + return ",".join( + ( + "{key}->{value}".format(key=key, value=value) + for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) + ) + ) def __eq__(self, other): """ compare 2 col items """ - return all(getattr(self, a, None) == getattr(other, a, None) - for a in ['name', 'cname', 'dtype', 'pos']) + return all( + getattr(self, a, None) == getattr(other, a, None) + for a in ["name", "cname", "dtype", "pos"] + ) def set_data(self, data, dtype=None): self.data = data @@ -1937,39 +2091,49 @@ def set_kind(self): if self.dtype is not None: dtype = _ensure_decoded(self.dtype) - if dtype.startswith('string') or dtype.startswith('bytes'): - self.kind = 'string' - elif dtype.startswith('float'): - self.kind = 'float' - elif dtype.startswith('complex'): - self.kind = 'complex' - elif dtype.startswith('int') or dtype.startswith('uint'): - self.kind = 'integer' - elif dtype.startswith('date'): - self.kind = 'datetime' - elif dtype.startswith('timedelta'): - self.kind = 'timedelta' - elif dtype.startswith('bool'): - self.kind = 'bool' + if dtype.startswith("string") or dtype.startswith("bytes"): + self.kind = "string" + elif dtype.startswith("float"): + self.kind = "float" + elif dtype.startswith("complex"): + self.kind = "complex" + elif dtype.startswith("int") or dtype.startswith("uint"): + self.kind = "integer" + elif dtype.startswith("date"): + self.kind = "datetime" + elif dtype.startswith("timedelta"): + self.kind = "timedelta" + elif dtype.startswith("bool"): + self.kind = "bool" else: raise AssertionError( "cannot interpret dtype of [{dtype}] in [{obj}]".format( - dtype=dtype, obj=self)) + dtype=dtype, obj=self + ) + ) # set my typ if we need if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, block_items, existing_col, min_itemsize, - nan_rep, info, encoding=None, errors='strict'): + def set_atom( + self, + block, + block_items, + existing_col, + min_itemsize, + nan_rep, + info, + encoding=None, + errors="strict", + ): """ create and setup my atom from the block b """ self.values = list(block_items) # short-cut certain block types if block.is_categorical: - return self.set_atom_categorical(block, items=block_items, - info=info) + return self.set_atom_categorical(block, items=block_items, info=info) elif block.is_datetimetz: return self.set_atom_datetime64tz(block, info=info) elif block.is_datetime: @@ -1982,32 +2146,31 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, dtype = block.dtype.name inferred_type = lib.infer_dtype(block.values, skipna=False) - if inferred_type == 'date': - raise TypeError( - "[date] is not implemented as a table column") - elif inferred_type == 'datetime': + if inferred_type == "date": + raise TypeError("[date] is not implemented as a table column") 
+ elif inferred_type == "datetime": # after 8260 # this only would be hit for a mutli-timezone dtype # which is an error raise TypeError( - "too many timezones in this block, create separate " - "data columns" + "too many timezones in this block, create separate " "data columns" ) - elif inferred_type == 'unicode': - raise TypeError( - "[unicode] is not implemented as a table column") + elif inferred_type == "unicode": + raise TypeError("[unicode] is not implemented as a table column") # this is basically a catchall; if say a datetime64 has nans then will # end up here ### - elif inferred_type == 'string' or dtype == 'object': + elif inferred_type == "string" or dtype == "object": self.set_atom_string( - block, block_items, + block, + block_items, existing_col, min_itemsize, nan_rep, encoding, - errors) + errors, + ) # set as a data block else: @@ -2016,8 +2179,9 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, block_items, existing_col, min_itemsize, - nan_rep, encoding, errors): + def set_atom_string( + self, block, block_items, existing_col, min_itemsize, nan_rep, encoding, errors + ): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False) @@ -2027,7 +2191,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel(), skipna=False) - if inferred_type != 'string': + if inferred_type != "string": # we cannot serialize this data, so report an exception on a column # by column basis @@ -2035,11 +2199,12 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, col = block.iget(i) inferred_type = lib.infer_dtype(col.ravel(), skipna=False) - if inferred_type != 'string': + if inferred_type != "string": raise TypeError( "Cannot serialize the column [{item}] because\n" "its data contents are [{type}] object dtype".format( - item=item, type=inferred_type) + item=item, type=inferred_type + ) ) # itemsize is the maximum length of a string (along any dimension) @@ -2048,8 +2213,9 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, # specified min_itemsize? 
if isinstance(min_itemsize, dict): - min_itemsize = int(min_itemsize.get( - self.name) or min_itemsize.get('values') or 0) + min_itemsize = int( + min_itemsize.get(self.name) or min_itemsize.get("values") or 0 + ) itemsize = max(min_itemsize or 0, itemsize) # check for column in the values conflicts @@ -2059,16 +2225,17 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize, itemsize = eci self.itemsize = itemsize - self.kind = 'string' + self.kind = "string" self.typ = self.get_atom_string(block, itemsize) - self.set_data(data_converted.astype( - '|S{size}'.format(size=itemsize), copy=False)) + self.set_data( + data_converted.astype("|S{size}".format(size=itemsize), copy=False) + ) def get_atom_coltype(self, kind=None): """ return the PyTables column class for this column """ if kind is None: kind = self.kind - if self.kind.startswith('uint'): + if self.kind.startswith("uint"): col_name = "UInt{name}Col".format(name=kind[4:]) else: col_name = "{name}Col".format(name=kind.capitalize()) @@ -2080,9 +2247,8 @@ def get_atom_data(self, block, kind=None): def set_atom_complex(self, block): self.kind = block.dtype.name - itemsize = int(self.kind.split('complex')[-1]) // 8 - self.typ = _tables().ComplexCol( - itemsize=itemsize, shape=block.shape[0]) + itemsize = int(self.kind.split("complex")[-1]) // 8 + self.typ = _tables().ComplexCol(itemsize=itemsize, shape=block.shape[0]) self.set_data(block.values.astype(self.typ.type, copy=False)) def set_atom_data(self, block): @@ -2096,7 +2262,7 @@ def set_atom_categorical(self, block, items, info=None, values=None): values = block.values codes = values.codes - self.kind = 'integer' + self.kind = "integer" self.dtype = codes.dtype.name if values.ndim > 1: raise NotImplementedError("only support 1-d categoricals") @@ -2109,7 +2275,7 @@ def set_atom_categorical(self, block, items, info=None, values=None): self.set_data(_block_shape(codes)) # write the categories - self.meta = 'category' + self.meta = "category" self.set_metadata(block.values.categories) # update the info @@ -2119,11 +2285,11 @@ def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) def set_atom_datetime64(self, block, values=None): - self.kind = 'datetime64' + self.kind = "datetime64" self.typ = self.get_atom_datetime64(block) if values is None: - values = block.values.view('i8') - self.set_data(values, 'datetime64') + values = block.values.view("i8") + self.set_data(values, "datetime64") def set_atom_datetime64tz(self, block, info, values=None): @@ -2137,23 +2303,23 @@ def set_atom_datetime64tz(self, block, info, values=None): self.tz = _get_tz(block.values.tz) self.update_info(info) - self.kind = 'datetime64' + self.kind = "datetime64" self.typ = self.get_atom_datetime64(block) - self.set_data(values, 'datetime64') + self.set_data(values, "datetime64") def get_atom_timedelta64(self, block): return _tables().Int64Col(shape=block.shape[0]) def set_atom_timedelta64(self, block, values=None): - self.kind = 'timedelta64' + self.kind = "timedelta64" self.typ = self.get_atom_timedelta64(block) if values is None: - values = block.values.view('i8') - self.set_data(values, 'timedelta64') + values = block.values.view("i8") + self.set_data(values, "timedelta64") @property def shape(self): - return getattr(self.data, 'shape', None) + return getattr(self.data, "shape", None) @property def cvalues(self): @@ -2164,19 +2330,19 @@ def validate_attr(self, append): """validate that we have the same order as the existing & same dtype""" if append: 
existing_fields = getattr(self.attrs, self.kind_attr, None) - if (existing_fields is not None and - existing_fields != list(self.values)): - raise ValueError("appended items do not match existing items" - " in table!") + if existing_fields is not None and existing_fields != list(self.values): + raise ValueError( + "appended items do not match existing items" " in table!" + ) existing_dtype = getattr(self.attrs, self.dtype_attr, None) - if (existing_dtype is not None and - existing_dtype != self.dtype): - raise ValueError("appended items dtype do not match existing " - "items dtype in table!") + if existing_dtype is not None and existing_dtype != self.dtype: + raise ValueError( + "appended items dtype do not match existing " + "items dtype in table!" + ) - def convert(self, values, nan_rep, encoding, errors, start=None, - stop=None): + def convert(self, values, nan_rep, encoding, errors, start=None, stop=None): """set the data from this selection (and convert to the correct dtype if we can) """ @@ -2195,27 +2361,28 @@ def convert(self, values, nan_rep, encoding, errors, start=None, dtype = _ensure_decoded(self.dtype) # reverse converts - if dtype == 'datetime64': + if dtype == "datetime64": # recreate with tz if indicated self.data = _set_tz(self.data, self.tz, coerce=True) - elif dtype == 'timedelta64': - self.data = np.asarray(self.data, dtype='m8[ns]') - elif dtype == 'date': + elif dtype == "timedelta64": + self.data = np.asarray(self.data, dtype="m8[ns]") + elif dtype == "date": try: self.data = np.asarray( - [date.fromordinal(v) for v in self.data], dtype=object) + [date.fromordinal(v) for v in self.data], dtype=object + ) except ValueError: self.data = np.asarray( - [date.fromtimestamp(v) for v in self.data], - dtype=object) - elif dtype == 'datetime': + [date.fromtimestamp(v) for v in self.data], dtype=object + ) + elif dtype == "datetime": self.data = np.asarray( - [datetime.fromtimestamp(v) for v in self.data], - dtype=object) + [datetime.fromtimestamp(v) for v in self.data], dtype=object + ) - elif meta == 'category': + elif meta == "category": # we have a categorical categories = self.metadata @@ -2236,21 +2403,22 @@ def convert(self, values, nan_rep, encoding, errors, start=None, categories = categories[~mask] codes[codes != -1] -= mask.astype(int).cumsum().values - self.data = Categorical.from_codes(codes, - categories=categories, - ordered=self.ordered) + self.data = Categorical.from_codes( + codes, categories=categories, ordered=self.ordered + ) else: try: self.data = self.data.astype(dtype, copy=False) except TypeError: - self.data = self.data.astype('O', copy=False) + self.data = self.data.astype("O", copy=False) # convert nans / decode - if _ensure_decoded(self.kind) == 'string': + if _ensure_decoded(self.kind) == "string": self.data = _unconvert_string_array( - self.data, nan_rep=nan_rep, encoding=encoding, errors=errors) + self.data, nan_rep=nan_rep, encoding=encoding, errors=errors + ) return self @@ -2272,6 +2440,7 @@ def set_attr(self): class DataIndexableCol(DataCol): """ represent a data column that can be indexed """ + is_data_indexable = True def validate_names(self): @@ -2311,13 +2480,13 @@ class Fixed: parent : my parent HDFStore group : the group node where the table resides """ + pandas_kind = None # type: str obj_type = None # type: Type[Union[DataFrame, Series]] ndim = None # type: int is_table = False - def __init__(self, parent, group, encoding=None, errors='strict', - **kwargs): + def __init__(self, parent, group, encoding=None, errors="strict", 
**kwargs): self.parent = parent self.group = group self.encoding = _ensure_encoding(encoding) @@ -2326,15 +2495,13 @@ def __init__(self, parent, group, encoding=None, errors='strict', @property def is_old_version(self): - return (self.version[0] <= 0 and self.version[1] <= 10 and - self.version[2] < 1) + return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 def set_version(self): """ compute and set our version """ - version = _ensure_decoded( - getattr(self.group._v_attrs, 'pandas_version', None)) + version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) try: - self.version = tuple(int(x) for x in version.split('.')) + self.version = tuple(int(x) for x in version.split(".")) if len(self.version) == 2: self.version = self.version + (0,) except AttributeError: @@ -2342,12 +2509,11 @@ def set_version(self): @property def pandas_type(self): - return _ensure_decoded(getattr(self.group._v_attrs, - 'pandas_type', None)) + return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) @property def format_type(self): - return 'fixed' + return "fixed" def __repr__(self): """ return a pretty representation of myself """ @@ -2355,10 +2521,10 @@ def __repr__(self): s = self.shape if s is not None: if isinstance(s, (list, tuple)): - s = "[{shape}]".format( - shape=','.join(pprint_thing(x) for x in s)) + s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s)) return "{type:12.12} (shape->{shape})".format( - type=self.pandas_type, shape=s) + type=self.pandas_type, shape=s + ) return self.pandas_type def set_object_info(self): @@ -2426,7 +2592,7 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable, 'nrows', None) + return getattr(self.storable, "nrows", None) def validate(self, other): """ validate against an existing storable """ @@ -2450,11 +2616,13 @@ def infer_axes(self): def read(self, **kwargs): raise NotImplementedError( - "cannot read on an abstract storer: subclasses should implement") + "cannot read on an abstract storer: subclasses should implement" + ) def write(self, **kwargs): raise NotImplementedError( - "cannot write on an abstract storer: sublcasses should implement") + "cannot write on an abstract storer: sublcasses should implement" + ) def delete(self, where=None, start=None, stop=None, **kwargs): """ @@ -2471,13 +2639,14 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): """ a generified fixed version """ - _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} + + _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} _reverse_index_map = {v: k for k, v in _index_type_map.items()} attributes = [] # type: List[str] # indexer helpders def _class_to_alias(self, cls): - return self._index_type_map.get(cls, '') + return self._index_type_map.get(cls, "") def _alias_to_class(self, alias): if isinstance(alias, type): # pragma: no cover @@ -2487,17 +2656,20 @@ def _alias_to_class(self, alias): def _get_index_factory(self, klass): if klass == DatetimeIndex: + def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - result = DatetimeIndex._simple_new(values.values, name=None, - freq=freq) + result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) if tz is not None: - result = result.tz_localize('UTC').tz_convert(tz) + result = result.tz_localize("UTC").tz_convert(tz) return result + return f elif klass == PeriodIndex: + def f(values, freq=None, tz=None): return 
PeriodIndex._simple_new(values, name=None, freq=freq) + return f return klass @@ -2509,16 +2681,20 @@ def validate_read(self, kwargs): """ kwargs = copy.copy(kwargs) - columns = kwargs.pop('columns', None) + columns = kwargs.pop("columns", None) if columns is not None: - raise TypeError("cannot pass a column specification when reading " - "a Fixed format store. this store must be " - "selected in its entirety") - where = kwargs.pop('where', None) + raise TypeError( + "cannot pass a column specification when reading " + "a Fixed format store. this store must be " + "selected in its entirety" + ) + where = kwargs.pop("where", None) if where is not None: - raise TypeError("cannot pass a where specification when reading " - "from a Fixed format store. this store must be " - "selected in its entirety") + raise TypeError( + "cannot pass a where specification when reading " + "from a Fixed format store. this store must be " + "selected in its entirety" + ) return kwargs @property @@ -2532,8 +2708,8 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) - self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict')) + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) for n in self.attributes: setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) @@ -2543,16 +2719,17 @@ def write(self, obj, **kwargs): def read_array(self, key, start=None, stop=None): """ read an array for the specified node (off of group """ import tables + node = getattr(self.group, key) attrs = node._v_attrs - transposed = getattr(attrs, 'transposed', False) + transposed = getattr(attrs, "transposed", False) if isinstance(node, tables.VLArray): ret = node[0][start:stop] else: - dtype = getattr(attrs, 'value_type', None) - shape = getattr(attrs, 'shape', None) + dtype = getattr(attrs, "value_type", None) + shape = getattr(attrs, "shape", None) if shape is not None: # length 0 axis @@ -2560,13 +2737,13 @@ def read_array(self, key, start=None, stop=None): else: ret = node[start:stop] - if dtype == 'datetime64': + if dtype == "datetime64": # reconstruct a timezone if indicated - ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True) + ret = _set_tz(ret, getattr(attrs, "tz", None), coerce=True) - elif dtype == 'timedelta64': - ret = np.asarray(ret, dtype='m8[ns]') + elif dtype == "timedelta64": + ret = np.asarray(ret, dtype="m8[ns]") if transposed: return ret.T @@ -2574,37 +2751,37 @@ def read_array(self, key, start=None, stop=None): return ret def read_index(self, key, **kwargs): - variety = _ensure_decoded( - getattr(self.attrs, '{key}_variety'.format(key=key))) + variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key))) - if variety == 'multi': + if variety == "multi": return self.read_multi_index(key, **kwargs) - elif variety == 'block': + elif variety == "block": return self.read_block_index(key, **kwargs) - elif variety == 'sparseint': + elif variety == "sparseint": return self.read_sparse_intindex(key, **kwargs) - elif variety == 'regular': + elif variety == "regular": _, index = self.read_index_node(getattr(self.group, key), **kwargs) return index else: # pragma: no cover raise TypeError( - 'unrecognized index variety: {variety}'.format( - variety=variety)) + "unrecognized index variety: {variety}".format(variety=variety) + ) def write_index(self, key, index): if isinstance(index, MultiIndex): 
- setattr(self.attrs, '{key}_variety'.format(key=key), 'multi') + setattr(self.attrs, "{key}_variety".format(key=key), "multi") self.write_multi_index(key, index) elif isinstance(index, BlockIndex): - setattr(self.attrs, '{key}_variety'.format(key=key), 'block') + setattr(self.attrs, "{key}_variety".format(key=key), "block") self.write_block_index(key, index) elif isinstance(index, IntIndex): - setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint') + setattr(self.attrs, "{key}_variety".format(key=key), "sparseint") self.write_sparse_intindex(key, index) else: - setattr(self.attrs, '{key}_variety'.format(key=key), 'regular') - converted = _convert_index(index, self.encoding, self.errors, - self.format_type).set_name('index') + setattr(self.attrs, "{key}_variety".format(key=key), "regular") + converted = _convert_index( + index, self.encoding, self.errors, self.format_type + ).set_name("index") self.write_array(key, converted.values) @@ -2615,113 +2792,124 @@ def write_index(self, key, index): if isinstance(index, (DatetimeIndex, PeriodIndex)): node._v_attrs.index_class = self._class_to_alias(type(index)) - if hasattr(index, 'freq'): + if hasattr(index, "freq"): node._v_attrs.freq = index.freq - if hasattr(index, 'tz') and index.tz is not None: + if hasattr(index, "tz") and index.tz is not None: node._v_attrs.tz = _get_tz(index.tz) def write_block_index(self, key, index): - self.write_array('{key}_blocs'.format(key=key), index.blocs) - self.write_array('{key}_blengths'.format(key=key), index.blengths) - setattr(self.attrs, '{key}_length'.format(key=key), index.length) + self.write_array("{key}_blocs".format(key=key), index.blocs) + self.write_array("{key}_blengths".format(key=key), index.blengths) + setattr(self.attrs, "{key}_length".format(key=key), index.length) def read_block_index(self, key, **kwargs): - length = getattr(self.attrs, '{key}_length'.format(key=key)) - blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs) - blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs) + length = getattr(self.attrs, "{key}_length".format(key=key)) + blocs = self.read_array("{key}_blocs".format(key=key), **kwargs) + blengths = self.read_array("{key}_blengths".format(key=key), **kwargs) return BlockIndex(length, blocs, blengths) def write_sparse_intindex(self, key, index): - self.write_array('{key}_indices'.format(key=key), index.indices) - setattr(self.attrs, '{key}_length'.format(key=key), index.length) + self.write_array("{key}_indices".format(key=key), index.indices) + setattr(self.attrs, "{key}_length".format(key=key), index.length) def read_sparse_intindex(self, key, **kwargs): - length = getattr(self.attrs, '{key}_length'.format(key=key)) - indices = self.read_array('{key}_indices'.format(key=key), **kwargs) + length = getattr(self.attrs, "{key}_length".format(key=key)) + indices = self.read_array("{key}_indices".format(key=key), **kwargs) return IntIndex(length, indices) def write_multi_index(self, key, index): - setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels) + setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels) - for i, (lev, level_codes, name) in enumerate(zip(index.levels, - index.codes, - index.names)): + for i, (lev, level_codes, name) in enumerate( + zip(index.levels, index.codes, index.names) + ): # write the level if is_extension_type(lev): - raise NotImplementedError("Saving a MultiIndex with an " - "extension dtype is not supported.") - level_key = '{key}_level{idx}'.format(key=key, idx=i) - conv_level = 
_convert_index(lev, self.encoding, self.errors, - self.format_type).set_name(level_key) + raise NotImplementedError( + "Saving a MultiIndex with an " "extension dtype is not supported." + ) + level_key = "{key}_level{idx}".format(key=key, idx=i) + conv_level = _convert_index( + lev, self.encoding, self.errors, self.format_type + ).set_name(level_key) self.write_array(level_key, conv_level.values) node = getattr(self.group, level_key) node._v_attrs.kind = conv_level.kind node._v_attrs.name = name # write the name - setattr(node._v_attrs, '{key}_name{name}'.format( - key=key, name=name), name) + setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name) # write the labels - label_key = '{key}_label{idx}'.format(key=key, idx=i) + label_key = "{key}_label{idx}".format(key=key, idx=i) self.write_array(label_key, level_codes) def read_multi_index(self, key, **kwargs): - nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key)) + nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key)) levels = [] codes = [] names = [] for i in range(nlevels): - level_key = '{key}_level{idx}'.format(key=key, idx=i) - name, lev = self.read_index_node(getattr(self.group, level_key), - **kwargs) + level_key = "{key}_level{idx}".format(key=key, idx=i) + name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs) levels.append(lev) names.append(name) - label_key = '{key}_label{idx}'.format(key=key, idx=i) + label_key = "{key}_label{idx}".format(key=key, idx=i) level_codes = self.read_array(label_key, **kwargs) codes.append(level_codes) - return MultiIndex(levels=levels, codes=codes, names=names, - verify_integrity=True) + return MultiIndex( + levels=levels, codes=codes, names=names, verify_integrity=True + ) def read_index_node(self, node, start=None, stop=None): data = node[start:stop] # If the index was an empty array write_array_empty() will # have written a sentinel. Here we relace it with the original. 
- if ('shape' in node._v_attrs and - self._is_empty_array(getattr(node._v_attrs, 'shape'))): - data = np.empty(getattr(node._v_attrs, 'shape'), - dtype=getattr(node._v_attrs, 'value_type')) + if "shape" in node._v_attrs and self._is_empty_array( + getattr(node._v_attrs, "shape") + ): + data = np.empty( + getattr(node._v_attrs, "shape"), + dtype=getattr(node._v_attrs, "value_type"), + ) kind = _ensure_decoded(node._v_attrs.kind) name = None - if 'name' in node._v_attrs: + if "name" in node._v_attrs: name = _ensure_str(node._v_attrs.name) name = _ensure_decoded(name) - index_class = self._alias_to_class(_ensure_decoded( - getattr(node._v_attrs, 'index_class', ''))) + index_class = self._alias_to_class( + _ensure_decoded(getattr(node._v_attrs, "index_class", "")) + ) factory = self._get_index_factory(index_class) kwargs = {} - if 'freq' in node._v_attrs: - kwargs['freq'] = node._v_attrs['freq'] - - if 'tz' in node._v_attrs: - kwargs['tz'] = node._v_attrs['tz'] - - if kind in ('date', 'datetime'): - index = factory(_unconvert_index(data, kind, - encoding=self.encoding, - errors=self.errors), - dtype=object, **kwargs) + if "freq" in node._v_attrs: + kwargs["freq"] = node._v_attrs["freq"] + + if "tz" in node._v_attrs: + kwargs["tz"] = node._v_attrs["tz"] + + if kind in ("date", "datetime"): + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + dtype=object, + **kwargs + ) else: - index = factory(_unconvert_index(data, kind, - encoding=self.encoding, - errors=self.errors), **kwargs) + index = factory( + _unconvert_index( + data, kind, encoding=self.encoding, errors=self.errors + ), + **kwargs + ) index.name = name @@ -2749,11 +2937,13 @@ def write_array(self, key, value, items=None): transposed = False if is_categorical_dtype(value): - raise NotImplementedError('Cannot store a category dtype in ' - 'a HDF5 dataset that uses format=' - '"fixed". Use format="table".') + raise NotImplementedError( + "Cannot store a category dtype in " + "a HDF5 dataset that uses format=" + '"fixed". Use format="table".' + ) if not empty_array: - if hasattr(value, 'T'): + if hasattr(value, "T"): # ExtensionArrays (1d) may not have transpose. 
value = value.T transposed = True @@ -2769,9 +2959,9 @@ def write_array(self, key, value, items=None): if atom is not None: # create an empty chunked array and fill it from value if not empty_array: - ca = self._handle.create_carray(self.group, key, atom, - value.shape, - filters=self._filters) + ca = self._handle.create_carray( + self.group, key, atom, value.shape, filters=self._filters + ) ca[:] = value getattr(self.group, key)._v_attrs.transposed = transposed @@ -2787,7 +2977,7 @@ def write_array(self, key, value, items=None): inferred_type = lib.infer_dtype(value.ravel(), skipna=False) if empty_array: pass - elif inferred_type == 'string': + elif inferred_type == "string": pass else: try: @@ -2797,32 +2987,26 @@ def write_array(self, key, value, items=None): ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning, stacklevel=7) - vlarr = self._handle.create_vlarray(self.group, key, - _tables().ObjectAtom()) + vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) vlarr.append(value) else: if empty_array: self.write_array_empty(key, value) else: if is_datetime64_dtype(value.dtype): - self._handle.create_array( - self.group, key, value.view('i8')) - getattr( - self.group, key)._v_attrs.value_type = 'datetime64' + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "datetime64" elif is_datetime64tz_dtype(value.dtype): # store as UTC # with a zone - self._handle.create_array(self.group, key, - value.asi8) + self._handle.create_array(self.group, key, value.asi8) node = getattr(self.group, key) node._v_attrs.tz = _get_tz(value.tz) - node._v_attrs.value_type = 'datetime64' + node._v_attrs.value_type = "datetime64" elif is_timedelta64_dtype(value.dtype): - self._handle.create_array( - self.group, key, value.view('i8')) - getattr( - self.group, key)._v_attrs.value_type = 'timedelta64' + self._handle.create_array(self.group, key, value.view("i8")) + getattr(self.group, key)._v_attrs.value_type = "timedelta64" else: self._handle.create_array(self.group, key, value) @@ -2830,117 +3014,122 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) data = node[start:stop] kind = node._v_attrs.kind - return _unconvert_index_legacy(data, kind, encoding=self.encoding, - errors=self.errors) + return _unconvert_index_legacy( + data, kind, encoding=self.encoding, errors=self.errors + ) class LegacySeriesFixed(LegacyFixed): - def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index_legacy('index') - values = self.read_array('values') + index = self.read_index_legacy("index") + values = self.read_array("values") return Series(values, index=index) class LegacyFrameFixed(LegacyFixed): - def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index_legacy('index') - columns = self.read_index_legacy('columns') - values = self.read_array('values') + index = self.read_index_legacy("index") + columns = self.read_index_legacy("columns") + values = self.read_array("values") return DataFrame(values, index=index, columns=columns) class SeriesFixed(GenericFixed): - pandas_kind = 'series' - attributes = ['name'] + pandas_kind = "series" + attributes = ["name"] @property def shape(self): try: - return len(getattr(self.group, 'values')), + return (len(getattr(self.group, "values")),) except (TypeError, AttributeError): return None def 
read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index('index', **kwargs) - values = self.read_array('values', **kwargs) + index = self.read_index("index", **kwargs) + values = self.read_array("values", **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): super().write(obj, **kwargs) - self.write_index('index', obj.index) - self.write_array('values', obj.values) + self.write_index("index", obj.index) + self.write_array("values", obj.values) self.attrs.name = obj.name class SparseFixed(GenericFixed): - def validate_read(self, kwargs): """ we don't support start, stop kwds in Sparse """ kwargs = super().validate_read(kwargs) - if 'start' in kwargs or 'stop' in kwargs: - raise NotImplementedError("start and/or stop are not supported " - "in fixed Sparse reading") + if "start" in kwargs or "stop" in kwargs: + raise NotImplementedError( + "start and/or stop are not supported " "in fixed Sparse reading" + ) return kwargs class SparseSeriesFixed(SparseFixed): - pandas_kind = 'sparse_series' - attributes = ['name', 'fill_value', 'kind'] + pandas_kind = "sparse_series" + attributes = ["name", "fill_value", "kind"] def read(self, **kwargs): kwargs = self.validate_read(kwargs) - index = self.read_index('index') - sp_values = self.read_array('sp_values') - sp_index = self.read_index('sp_index') - return SparseSeries(sp_values, index=index, sparse_index=sp_index, - kind=self.kind or 'block', - fill_value=self.fill_value, - name=self.name) + index = self.read_index("index") + sp_values = self.read_array("sp_values") + sp_index = self.read_index("sp_index") + return SparseSeries( + sp_values, + index=index, + sparse_index=sp_index, + kind=self.kind or "block", + fill_value=self.fill_value, + name=self.name, + ) def write(self, obj, **kwargs): super().write(obj, **kwargs) - self.write_index('index', obj.index) - self.write_index('sp_index', obj.sp_index) - self.write_array('sp_values', obj.sp_values) + self.write_index("index", obj.index) + self.write_index("sp_index", obj.sp_index) + self.write_array("sp_values", obj.sp_values) self.attrs.name = obj.name self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind class SparseFrameFixed(SparseFixed): - pandas_kind = 'sparse_frame' - attributes = ['default_kind', 'default_fill_value'] + pandas_kind = "sparse_frame" + attributes = ["default_kind", "default_fill_value"] def read(self, **kwargs): kwargs = self.validate_read(kwargs) - columns = self.read_index('columns') + columns = self.read_index("columns") sdict = {} for c in columns: - key = 'sparse_series_{columns}'.format(columns=c) + key = "sparse_series_{columns}".format(columns=c) s = SparseSeriesFixed(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() - return SparseDataFrame(sdict, columns=columns, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value) + return SparseDataFrame( + sdict, + columns=columns, + default_kind=self.default_kind, + default_fill_value=self.default_fill_value, + ) def write(self, obj, **kwargs): """ write it as a collection of individual sparse series """ super().write(obj, **kwargs) for name, ss in obj.items(): - key = 'sparse_series_{name}'.format(name=name) + key = "sparse_series_{name}".format(name=name) if key not in self.group._v_children: node = self._handle.create_group(self.group, key) else: @@ -2949,11 +3138,11 @@ def write(self, obj, **kwargs): s.write(ss) self.attrs.default_fill_value = obj.default_fill_value self.attrs.default_kind = 
obj.default_kind - self.write_index('columns', obj.columns) + self.write_index("columns", obj.columns) class BlockManagerFixed(GenericFixed): - attributes = ['ndim', 'nblocks'] + attributes = ["ndim", "nblocks"] is_shape_reversed = False @property @@ -2964,16 +3153,16 @@ def shape(self): # items items = 0 for i in range(self.nblocks): - node = getattr(self.group, 'block{idx}_items'.format(idx=i)) - shape = getattr(node, 'shape', None) + node = getattr(self.group, "block{idx}_items".format(idx=i)) + shape = getattr(node, "shape", None) if shape is not None: items += shape[0] # data shape - node = getattr(self.group, 'block0_values') - shape = getattr(node, 'shape', None) + node = getattr(self.group, "block0_values") + shape = getattr(node, "shape", None) if shape is not None: - shape = list(shape[0:(ndim - 1)]) + shape = list(shape[0 : (ndim - 1)]) else: shape = [] @@ -2997,19 +3186,18 @@ def read(self, start=None, stop=None, **kwargs): for i in range(self.ndim): _start, _stop = (start, stop) if i == select_axis else (None, None) - ax = self.read_index('axis{idx}'.format( - idx=i), start=_start, stop=_stop) + ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in range(self.nblocks): - blk_items = self.read_index('block{idx}_items'.format(idx=i)) - values = self.read_array('block{idx}_values'.format(idx=i), - start=_start, stop=_stop) - blk = make_block(values, - placement=items.get_indexer(blk_items)) + blk_items = self.read_index("block{idx}_items".format(idx=i)) + values = self.read_array( + "block{idx}_values".format(idx=i), start=_start, stop=_stop + ) + blk = make_block(values, placement=items.get_indexer(blk_items)) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -3024,22 +3212,22 @@ def write(self, obj, **kwargs): for i, ax in enumerate(data.axes): if i == 0: if not ax.is_unique: - raise ValueError( - "Columns index has to be unique for fixed format") - self.write_index('axis{idx}'.format(idx=i), ax) + raise ValueError("Columns index has to be unique for fixed format") + self.write_index("axis{idx}".format(idx=i), ax) # Supporting mixed-type DataFrame objects...nontrivial self.attrs.nblocks = len(data.blocks) for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 blk_items = data.items.take(blk.mgr_locs) - self.write_array('block{idx}_values'.format(idx=i), - blk.values, items=blk_items) - self.write_index('block{idx}_items'.format(idx=i), blk_items) + self.write_array( + "block{idx}_values".format(idx=i), blk.values, items=blk_items + ) + self.write_index("block{idx}_items".format(idx=i), blk_items) class FrameFixed(BlockManagerFixed): - pandas_kind = 'frame' + pandas_kind = "frame" obj_type = DataFrame @@ -3068,7 +3256,8 @@ class Table(Fixed): metadata : the names of the metadata columns """ - pandas_kind = 'wide_table' + + pandas_kind = "wide_table" table_type = None # type: str levels = 1 is_table = True @@ -3087,31 +3276,35 @@ def __init__(self, *args, **kwargs): @property def table_type_short(self): - return self.table_type.split('_')[0] + return self.table_type.split("_")[0] @property def format_type(self): - return 'table' + return "table" def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[{columns}]".format(columns=(','.join( - self.data_columns) if len(self.data_columns) else '')) + dc = ",dc->[{columns}]".format( + columns=(",".join(self.data_columns) if len(self.data_columns) else 
"") + ) - ver = '' + ver = "" if self.is_old_version: - ver = "[{version}]".format( - version='.'.join(str(x) for x in self.version)) + ver = "[{version}]".format(version=".".join(str(x) for x in self.version)) return ( "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows}," "ncols->{ncols},indexers->[{index_axes}]{dc})".format( - pandas_type=self.pandas_type, ver=ver, - table_type=self.table_type_short, nrows=self.nrows, + pandas_type=self.pandas_type, + ver=ver, + table_type=self.table_type_short, + nrows=self.nrows, ncols=self.ncols, - index_axes=(','.join(a.name for a in self.index_axes)), dc=dc - )) + index_axes=(",".join(a.name for a in self.index_axes)), + dc=dc, + ) + ) def __getitem__(self, c): """ return the axis for c """ @@ -3129,9 +3322,11 @@ def validate(self, other): raise TypeError( "incompatible table_type with existing " "[{other} - {self}]".format( - other=other.table_type, self=self.table_type)) + other=other.table_type, self=self.table_type + ) + ) - for c in ['index_axes', 'non_index_axes', 'values_axes']: + for c in ["index_axes", "non_index_axes", "values_axes"]: sv = getattr(self, c, None) ov = getattr(other, c, None) if sv != ov: @@ -3143,12 +3338,15 @@ def validate(self, other): raise ValueError( "invalid combinate of [{c}] on appending data " "[{sax}] vs current table [{oax}]".format( - c=c, sax=sax, oax=oax)) + c=c, sax=sax, oax=oax + ) + ) # should never get here raise Exception( "invalid combinate of [{c}] on appending data [{sv}] vs " - "current table [{ov}]".format(c=c, sv=sv, ov=ov)) + "current table [{ov}]".format(c=c, sv=sv, ov=ov) + ) @property def is_multi_index(self): @@ -3157,20 +3355,22 @@ def is_multi_index(self): def validate_metadata(self, existing): """ create / validate metadata """ - self.metadata = [ - c.name for c in self.values_axes if c.metadata is not None] + self.metadata = [c.name for c in self.values_axes if c.metadata is not None] def validate_multiindex(self, obj): """validate that we can store the multi-index; reset and return the new object """ - levels = [l if l is not None else "level_{0}".format(i) - for i, l in enumerate(obj.index.names)] + levels = [ + l if l is not None else "level_{0}".format(i) + for i, l in enumerate(obj.index.names) + ] try: return obj.reset_index(), levels except ValueError: - raise ValueError("duplicate names/columns in the multi-index when " - "storing as a table") + raise ValueError( + "duplicate names/columns in the multi-index when " "storing as a table" + ) @property def nrows_expected(self): @@ -3180,11 +3380,11 @@ def nrows_expected(self): @property def is_exists(self): """ has this table been created """ - return 'table' in self.group + return "table" in self.group @property def storable(self): - return getattr(self.group, 'table', None) + return getattr(self.group, "table", None) @property def table(self): @@ -3215,19 +3415,28 @@ def is_transposed(self): @property def data_orientation(self): """return a tuple of my permutated axes, non_indexable at the front""" - return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes], - [int(a.axis) for a in self.index_axes])) + return tuple( + itertools.chain( + [int(a[0]) for a in self.non_index_axes], + [int(a.axis) for a in self.index_axes], + ) + ) def queryables(self): """ return a dict of the kinds allowable columns for this object """ # compute the values_axes queryables return dict( - [(a.cname, a) for a in self.index_axes] + - [(self.storage_obj_type._AXIS_NAMES[axis], None) - for axis, values in self.non_index_axes] + - 
[(v.cname, v) for v in self.values_axes - if v.name in set(self.data_columns)] + [(a.cname, a) for a in self.index_axes] + + [ + (self.storage_obj_type._AXIS_NAMES[axis], None) + for axis, values in self.non_index_axes + ] + + [ + (v.cname, v) + for v in self.values_axes + if v.name in set(self.data_columns) + ] ) def index_cols(self): @@ -3240,8 +3449,7 @@ def values_cols(self): def _get_metadata_path(self, key): """ return the metadata pathname for this key """ - return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, - key=key) + return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key) def write_metadata(self, key, values): """ @@ -3254,13 +3462,18 @@ def write_metadata(self, key, values): """ values = Series(values) - self.parent.put(self._get_metadata_path(key), values, format='table', - encoding=self.encoding, errors=self.errors, - nan_rep=self.nan_rep) + self.parent.put( + self._get_metadata_path(key), + values, + format="table", + encoding=self.encoding, + errors=self.errors, + nan_rep=self.nan_rep, + ) def read_metadata(self, key): """ return the meta data array for this key """ - if getattr(getattr(self.group, 'meta', None), key, None) is not None: + if getattr(getattr(self.group, "meta", None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None @@ -3284,34 +3497,24 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr( - self.attrs, 'non_index_axes', None) or [] - self.data_columns = getattr( - self.attrs, 'data_columns', None) or [] - self.info = getattr( - self.attrs, 'info', None) or dict() - self.nan_rep = getattr(self.attrs, 'nan_rep', None) - self.encoding = _ensure_encoding( - getattr(self.attrs, 'encoding', None)) - self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict')) - self.levels = getattr( - self.attrs, 'levels', None) or [] - self.index_axes = [ - a.infer(self) for a in self.indexables if a.is_an_indexable - ] + self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] + self.data_columns = getattr(self.attrs, "data_columns", None) or [] + self.info = getattr(self.attrs, "info", None) or dict() + self.nan_rep = getattr(self.attrs, "nan_rep", None) + self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) + self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) + self.levels = getattr(self.attrs, "levels", None) or [] + self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] self.values_axes = [ a.infer(self) for a in self.indexables if not a.is_an_indexable ] - self.metadata = getattr( - self.attrs, 'metadata', None) or [] + self.metadata = getattr(self.attrs, "metadata", None) or [] def validate_version(self, where=None): """ are we trying to operate on an old version? 
""" if where is not None: - if (self.version[0] <= 0 and self.version[1] <= 10 and - self.version[2] < 1): - ws = incompatibility_doc % '.'.join( - [str(x) for x in self.version]) + if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: + ws = incompatibility_doc % ".".join([str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): @@ -3327,12 +3530,13 @@ def validate_min_itemsize(self, min_itemsize): for k, v in min_itemsize.items(): # ok, apply generally - if k == 'values': + if k == "values": continue if k not in q: raise ValueError( "min_itemsize has the key [{key}] which is not an axis or " - "data_column".format(key=k)) + "data_column".format(key=k) + ) @property def indexables(self): @@ -3342,10 +3546,12 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([ - IndexCol(name=name, axis=axis, pos=i) - for i, (axis, name) in enumerate(self.attrs.index_cols) - ]) + self._indexables.extend( + [ + IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols) + ] + ) # values columns dc = set(self.data_columns) @@ -3355,11 +3561,13 @@ def f(i, c): klass = DataCol if c in dc: klass = DataIndexableCol - return klass.create_for_block(i=i, name=c, pos=base_pos + i, - version=self.version) + return klass.create_for_block( + i=i, name=c, pos=base_pos + i, version=self.version + ) self._indexables.extend( - [f(i, c) for i, c in enumerate(self.attrs.values_cols)]) + [f(i, c) for i, c in enumerate(self.attrs.values_cols)] + ) return self._indexables @@ -3395,9 +3603,9 @@ def create_index(self, columns=None, optlevel=None, kind=None): kw = dict() if optlevel is not None: - kw['optlevel'] = optlevel + kw["optlevel"] = optlevel if kind is not None: - kw['kind'] = kind + kw["kind"] = kind table = self.table for c in columns: @@ -3413,23 +3621,24 @@ def create_index(self, columns=None, optlevel=None, kind=None): if kind is not None and cur_kind != kind: v.remove_index() else: - kw['kind'] = cur_kind + kw["kind"] = cur_kind if optlevel is not None and cur_optlevel != optlevel: v.remove_index() else: - kw['optlevel'] = cur_optlevel + kw["optlevel"] = cur_optlevel # create the index if not v.is_indexed: - if v.type.startswith('complex'): + if v.type.startswith("complex"): raise TypeError( - 'Columns containing complex values can be stored ' - 'but cannot' - ' be indexed when using table format. Either use ' - 'fixed format, set index=False, or do not include ' - 'the columns containing complex values to ' - 'data_columns when initializing the table.') + "Columns containing complex values can be stored " + "but cannot" + " be indexed when using table format. Either use " + "fixed format, set index=False, or do not include " + "the columns containing complex values to " + "data_columns when initializing the table." + ) v.create_index(**kw) def read_axes(self, where, **kwargs): @@ -3453,9 +3662,14 @@ def read_axes(self, where, **kwargs): a.set_info(self.info) # `kwargs` may contain `start` and `stop` arguments if passed to # `store.select()`. If set they determine the index size. 
- a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding, - errors=self.errors, start=kwargs.get('start'), - stop=kwargs.get('stop')) + a.convert( + values, + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + start=kwargs.get("start"), + stop=kwargs.get("stop"), + ) return True @@ -3473,9 +3687,11 @@ def validate_data_columns(self, data_columns, min_itemsize): axis, axis_labels = self.non_index_axes[0] info = self.info.get(axis, dict()) - if info.get('type') == 'MultiIndex' and data_columns: - raise ValueError("cannot use a multi-index on axis [{0}] with " - "data_columns {1}".format(axis, data_columns)) + if info.get("type") == "MultiIndex" and data_columns: + raise ValueError( + "cannot use a multi-index on axis [{0}] with " + "data_columns {1}".format(axis, data_columns) + ) # evaluate the passed data_columns, True == use all columns # take only valide axis labels @@ -3488,16 +3704,27 @@ def validate_data_columns(self, data_columns, min_itemsize): if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ - k for k in min_itemsize.keys() - if k != 'values' and k not in existing_data_columns - ]) + data_columns.extend( + [ + k + for k in min_itemsize.keys() + if k != "values" and k not in existing_data_columns + ] + ) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, - data_columns=None, min_itemsize=None, **kwargs): + def create_axes( + self, + axes, + obj, + validate=True, + nan_rep=None, + data_columns=None, + min_itemsize=None, + **kwargs + ): """ create and return the axes legacy tables create an indexable column, indexable index, non-indexable fields @@ -3524,8 +3751,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, except KeyError: raise TypeError( "cannot properly create the storer for: [group->{group}," - "value->{value}]".format( - group=self.group._v_name, value=type(obj))) + "value->{value}]".format(group=self.group._v_name, value=type(obj)) + ) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -3546,7 +3773,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, # currently support on ndim-1 axes if len(axes) != self.ndim - 1: raise ValueError( - "currently only support ndim-1 indexers in an AppendableTable") + "currently only support ndim-1 indexers in an AppendableTable" + ) # create according to the new data self.non_index_axes = [] @@ -3554,7 +3782,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, # nan_representation if nan_rep is None: - nan_rep = 'nan' + nan_rep = "nan" self.nan_rep = nan_rep @@ -3564,9 +3792,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if i in axes: name = obj._AXIS_NAMES[i] - index_axes_map[i] = _convert_index( - a, self.encoding, self.errors, self.format_type - ).set_name(name).set_axis(i) + index_axes_map[i] = ( + _convert_index(a, self.encoding, self.errors, self.format_type) + .set_name(name) + .set_axis(i) + ) else: # we might be able to change the axes on the appending data if @@ -3575,18 +3805,20 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, if existing_table is not None: indexer = len(self.non_index_axes) exist_axis = existing_table.non_index_axes[indexer][1] - if not array_equivalent(np.array(append_axis), - np.array(exist_axis)): + if not array_equivalent( + np.array(append_axis), np.array(exist_axis) + ): # ahah! 
-> reindex - if array_equivalent(np.array(sorted(append_axis)), - np.array(sorted(exist_axis))): + if array_equivalent( + np.array(sorted(append_axis)), np.array(sorted(exist_axis)) + ): append_axis = exist_axis # the non_index_axes info info = _get_info(self.info, i) - info['names'] = list(a.names) - info['type'] = a.__class__.__name__ + info["names"] = list(a.names) + info["type"] = a.__class__.__name__ self.non_index_axes.append((i, append_axis)) @@ -3614,12 +3846,10 @@ def get_blk_items(mgr, blocks): blk_items = get_blk_items(block_obj._data, blocks) if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns( - data_columns, min_itemsize) + data_columns = self.validate_data_columns(data_columns, min_itemsize) if len(data_columns): mgr = block_obj.reindex( - Index(axis_labels).difference(Index(data_columns)), - axis=axis + Index(axis_labels).difference(Index(data_columns)), axis=axis )._data blocks = list(mgr.blocks) @@ -3631,8 +3861,10 @@ def get_blk_items(mgr, blocks): # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = {tuple(b_items.tolist()): (b, b_items) - for b, b_items in zip(blocks, blk_items)} + by_items = { + tuple(b_items.tolist()): (b, b_items) + for b, b_items in zip(blocks, blk_items) + } new_blocks = [] new_blk_items = [] for ea in existing_table.values_axes: @@ -3645,8 +3877,9 @@ def get_blk_items(mgr, blocks): raise ValueError( "cannot match existing table structure for [{items}] " "on appending data".format( - items=(','.join(pprint_thing(item) for - item in items)))) + items=(",".join(pprint_thing(item) for item in items)) + ) + ) blocks = new_blocks blk_items = new_blk_items @@ -3659,8 +3892,7 @@ def get_blk_items(mgr, blocks): name = None # we have a data_column - if (data_columns and len(b_items) == 1 and - b_items[0] in data_columns): + if data_columns and len(b_items) == 1 and b_items[0] in data_columns: klass = DataIndexableCol name = b_items[0] self.data_columns.append(name) @@ -3674,21 +3906,24 @@ def get_blk_items(mgr, blocks): raise ValueError( "Incompatible appended table [{blocks}]" "with existing table [{table}]".format( - blocks=blocks, - table=existing_table.values_axes)) + blocks=blocks, table=existing_table.values_axes + ) + ) else: existing_col = None try: - col = klass.create_for_block( - i=i, name=name, version=self.version) - col.set_atom(block=b, block_items=b_items, - existing_col=existing_col, - min_itemsize=min_itemsize, - nan_rep=nan_rep, - encoding=self.encoding, - errors=self.errors, - info=self.info) + col = klass.create_for_block(i=i, name=name, version=self.version) + col.set_atom( + block=b, + block_items=b_items, + existing_col=existing_col, + min_itemsize=min_itemsize, + nan_rep=nan_rep, + encoding=self.encoding, + errors=self.errors, + info=self.info, + ) col.set_pos(j) self.values_axes.append(col) @@ -3698,7 +3933,9 @@ def get_blk_items(mgr, blocks): raise Exception( "cannot find the correct atom type -> " "[dtype->{name},items->{items}] {detail!s}".format( - name=b.dtype.name, items=b_items, detail=detail)) + name=b.dtype.name, items=b_items, detail=detail + ) + ) j += 1 # validate our min_itemsize @@ -3747,8 +3984,7 @@ def process_filter(field, filt): filt = filt.union(Index(self.levels)) takers = op(axis_values, filt) - return obj.loc._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: @@ 
-3761,38 +3997,42 @@ def process_filter(field, filt): if isinstance(obj, DataFrame): axis_number = 1 - axis_number takers = op(values, filt) - return obj.loc._getitem_axis(takers, - axis=axis_number) + return obj.loc._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [{field}] for " - "filtering!".format(field=field)) + raise ValueError( + "cannot find the field [{field}] for " + "filtering!".format(field=field) + ) obj = process_filter(field, filt) return obj - def create_description(self, complib=None, complevel=None, - fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None + ): """ create the description of the table from the axes & values """ # provided expected rows if its passed if expectedrows is None: expectedrows = max(self.nrows_expected, 10000) - d = dict(name='table', expectedrows=expectedrows) + d = dict(name="table", expectedrows=expectedrows) # description from the axes & values - d['description'] = {a.cname: a.typ for a in self.axes} + d["description"] = {a.cname: a.typ for a in self.axes} if complib: if complevel is None: complevel = self._complevel or 9 filters = _tables().Filters( - complevel=complevel, complib=complib, - fletcher32=fletcher32 or self._fletcher32) - d['filters'] = filters + complevel=complevel, + complib=complib, + fletcher32=fletcher32 or self._fletcher32, + ) + d["filters"] = filters elif self._filters is not None: - d['filters'] = self._filters + d["filters"] = self._filters return d @@ -3809,15 +4049,14 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection( - self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) coords = self.selection.select_coords() if self.selection.filter is not None: for field, op, filt in self.selection.filter.format(): data = self.read_column( - field, start=coords.min(), stop=coords.max() + 1) - coords = coords[ - op(data.iloc[coords - coords.min()], filt).values] + field, start=coords.min(), stop=coords.max() + 1 + ) + coords = coords[op(data.iloc[coords - coords.min()], filt).values] return Index(coords) @@ -3834,8 +4073,7 @@ def read_column(self, column, where=None, start=None, stop=None): return False if where is not None: - raise TypeError("read_column does not currently accept a where " - "clause") + raise TypeError("read_column does not currently accept a where " "clause") # find the axes for a in self.axes: @@ -3844,20 +4082,27 @@ def read_column(self, column, where=None, start=None, stop=None): if not a.is_data_indexable: raise ValueError( "column [{column}] can not be extracted individually; " - "it is not data indexable".format(column=column)) + "it is not data indexable".format(column=column) + ) # column must be an indexable or a data column c = getattr(self.table.cols, column) a.set_info(self.info) - return Series(_set_tz(a.convert(c[start:stop], - nan_rep=self.nan_rep, - encoding=self.encoding, - errors=self.errors - ).take_data(), - a.tz, True), name=column) + return Series( + _set_tz( + a.convert( + c[start:stop], + nan_rep=self.nan_rep, + encoding=self.encoding, + errors=self.errors, + ).take_data(), + a.tz, + True, + ), + name=column, + ) - raise KeyError( - "column [{column}] not found in the table".format(column=column)) + raise KeyError("column [{column}] not found in the table".format(column=column)) class WORMTable(Table): @@ 
-3866,7 +4111,8 @@ class WORMTable(Table): table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk """ - table_type = 'worm' + + table_type = "worm" def read(self, **kwargs): """ read the indices and the indexing array, calculate offset rows and @@ -3889,12 +4135,13 @@ class LegacyTable(Table): that can be easily searched """ + _indexables = [ - IndexCol(name='index', axis=1, pos=0), - IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'), - DataCol(name='fields', cname='values', kind_attr='fields', pos=2) + IndexCol(name="index", axis=1, pos=0), + IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"), + DataCol(name="fields", cname="values", kind_attr="fields", pos=2), ] # type: Optional[List[IndexCol]] - table_type = 'legacy' + table_type = "legacy" ndim = 3 def write(self, **kwargs): @@ -3911,20 +4158,32 @@ def read(self, where=None, columns=None, **kwargs): class AppendableTable(LegacyTable): """ support the new appendable table formats """ - _indexables = None - table_type = 'appendable' - def write(self, obj, axes=None, append=False, complib=None, - complevel=None, fletcher32=None, min_itemsize=None, - chunksize=None, expectedrows=None, dropna=False, **kwargs): + _indexables = None + table_type = "appendable" + + def write( + self, + obj, + axes=None, + append=False, + complib=None, + complevel=None, + fletcher32=None, + min_itemsize=None, + chunksize=None, + expectedrows=None, + dropna=False, + **kwargs + ): if not append and self.is_exists: - self._handle.remove_node(self.group, 'table') + self._handle.remove_node(self.group, "table") # create the axes - self.create_axes(axes=axes, obj=obj, validate=append, - min_itemsize=min_itemsize, - **kwargs) + self.create_axes( + axes=axes, obj=obj, validate=append, min_itemsize=min_itemsize, **kwargs + ) for a in self.axes: a.validate(self, append) @@ -3932,10 +4191,12 @@ def write(self, obj, axes=None, append=False, complib=None, if not self.is_exists: # create the table - options = self.create_description(complib=complib, - complevel=complevel, - fletcher32=fletcher32, - expectedrows=expectedrows) + options = self.create_description( + complib=complib, + complevel=complevel, + fletcher32=fletcher32, + expectedrows=expectedrows, + ) # set the table attributes self.set_attrs() @@ -3973,7 +4234,7 @@ def write_data(self, chunksize, dropna=False): # column, otherwise ignore the mask mask = isna(a.data).all(axis=0) if isinstance(mask, np.ndarray): - masks.append(mask.astype('u1', copy=False)) + masks.append(mask.astype("u1", copy=False)) # consolidate masks if len(masks): @@ -3992,13 +4253,13 @@ def write_data(self, chunksize, dropna=False): # broadcast to all other indexes except myself if i > 0 and i < nindexes: - repeater = np.prod( - [indexes[bi].shape[0] for bi in range(0, i)]) + repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)]) idx = np.tile(idx, repeater) if i < nindexes - 1: - repeater = np.prod([indexes[bi].shape[0] - for bi in range(i + 1, nindexes)]) + repeater = np.prod( + [indexes[bi].shape[0] for bi in range(i + 1, nindexes)] + ) idx = np.repeat(idx, repeater) bindexes.append(idx) @@ -4006,8 +4267,7 @@ def write_data(self, chunksize, dropna=False): # transpose the values so first dimension is last # reshape the values if needed values = [a.take_data() for a in self.values_axes] - values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) - for v in values] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in 
values] bvalues = [] for i, v in enumerate(values): new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape @@ -4029,7 +4289,8 @@ def write_data(self, chunksize, dropna=False): rows, indexes=[a[start_i:end_i] for a in bindexes], mask=mask[start_i:end_i] if mask is not None else None, - values=[v[start_i:end_i] for v in bvalues]) + values=[v[start_i:end_i] for v in bvalues], + ) def write_data_chunk(self, rows, indexes, mask, values): """ @@ -4068,8 +4329,7 @@ def write_data_chunk(self, rows, indexes, mask, values): rows = rows[m] except Exception as detail: - raise Exception( - "cannot create row-data -> {detail}".format(detail=detail)) + raise Exception("cannot create row-data -> {detail}".format(detail=detail)) try: if len(rows): @@ -4077,8 +4337,8 @@ def write_data_chunk(self, rows, indexes, mask, values): self.table.flush() except Exception as detail: raise TypeError( - "tables cannot write this data -> {detail}".format( - detail=detail)) + "tables cannot write this data -> {detail}".format(detail=detail) + ) def delete(self, where=None, start=None, stop=None, **kwargs): @@ -4101,8 +4361,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection( - self, where, start=start, stop=stop, **kwargs) + self.selection = Selection(self, where, start=start, stop=stop, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -4131,8 +4390,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs): pg = groups.pop() for g in reversed(groups): rows = sorted_series.take(range(g, pg)) - table.remove_rows(start=rows[rows.index[0] - ], stop=rows[rows.index[-1]] + 1) + table.remove_rows( + start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 + ) pg = g self.table.flush() @@ -4143,8 +4403,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class AppendableFrameTable(AppendableTable): """ support the new appendable table formats """ - pandas_kind = 'frame_table' - table_type = 'appendable_frame' + + pandas_kind = "frame_table" + table_type = "appendable_frame" ndim = 2 obj_type = DataFrame # type: Type[Union[DataFrame, Series]] @@ -4163,29 +4424,32 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - info = (self.info.get(self.non_index_axes[0][0], dict()) - if len(self.non_index_axes) else dict()) + info = ( + self.info.get(self.non_index_axes[0][0], dict()) + if len(self.non_index_axes) + else dict() + ) index = self.index_axes[0].values frames = [] for a in self.values_axes: # we could have a multi-index constructor here # ensure_index doesn't recognized our list-of-tuples here - if info.get('type') == 'MultiIndex': + if info.get("type") == "MultiIndex": cols = MultiIndex.from_tuples(a.values) else: cols = Index(a.values) - names = info.get('names') + names = info.get("names") if names is not None: cols.set_names(names, inplace=True) if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index, name=getattr(index, 'name', None)) + cols_ = Index(index, name=getattr(index, "name", None)) else: values = a.cvalues.T - index_ = Index(index, name=getattr(index, 'name', None)) + index_ = Index(index, name=getattr(index, "name", None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -4209,8 +4473,9 @@ def read(self, where=None, columns=None, **kwargs): class AppendableSeriesTable(AppendableFrameTable): """ support the new appendable table formats """ - 
pandas_kind = 'series_table' - table_type = 'appendable_series' + + pandas_kind = "series_table" + table_type = "appendable_series" ndim = 2 obj_type = Series storage_obj_type = DataFrame @@ -4225,11 +4490,10 @@ def get_object(self, obj): def write(self, obj, data_columns=None, **kwargs): """ we are going to write this as a frame table """ if not isinstance(obj, DataFrame): - name = obj.name or 'values' + name = obj.name or "values" obj = DataFrame({name: obj}, index=obj.index) obj.columns = [name] - return super().write(obj=obj, data_columns=obj.columns.tolist(), - **kwargs) + return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) def read(self, columns=None, **kwargs): @@ -4245,19 +4509,20 @@ def read(self, columns=None, **kwargs): s = s.iloc[:, 0] # remove the default name - if s.name == 'values': + if s.name == "values": s.name = None return s class AppendableMultiSeriesTable(AppendableSeriesTable): """ support the new appendable table formats """ - pandas_kind = 'series_table' - table_type = 'appendable_multiseries' + + pandas_kind = "series_table" + table_type = "appendable_multiseries" def write(self, obj, **kwargs): """ we are going to write this as a frame table """ - name = obj.name or 'values' + name = obj.name or "values" obj, self.levels = self.validate_multiindex(obj) cols = list(self.levels) cols.append(name) @@ -4267,8 +4532,9 @@ def write(self, obj, **kwargs): class GenericTable(AppendableFrameTable): """ a table that read/writes the generic pytables table format """ - pandas_kind = 'frame_table' - table_type = 'generic_table' + + pandas_kind = "frame_table" + table_type = "generic_table" ndim = 2 obj_type = DataFrame @@ -4278,7 +4544,7 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group, 'table', None) or self.group + return getattr(self.group, "table", None) or self.group def get_attrs(self): """ retrieve our attributes """ @@ -4286,10 +4552,10 @@ def get_attrs(self): self.nan_rep = None self.levels = [] - self.index_axes = [a.infer(self) - for a in self.indexables if a.is_an_indexable] - self.values_axes = [a.infer(self) - for a in self.indexables if not a.is_an_indexable] + self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable] + self.values_axes = [ + a.infer(self) for a in self.indexables if not a.is_an_indexable + ] self.data_columns = [a.name for a in self.values_axes] @property @@ -4300,12 +4566,13 @@ def indexables(self): d = self.description # the index columns is just a simple index - self._indexables = [GenericIndexCol(name='index', axis=0)] + self._indexables = [GenericIndexCol(name="index", axis=0)] for i, n in enumerate(d._v_names): dc = GenericDataIndexableCol( - name=n, pos=i, values=[n], version=self.version) + name=n, pos=i, values=[n], version=self.version + ) self._indexables.append(dc) return self._indexables @@ -4317,14 +4584,15 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): """ a frame with a multi-index """ - table_type = 'appendable_multiframe' + + table_type = "appendable_multiframe" obj_type = DataFrame ndim = 2 _re_levels = re.compile(r"^level_\d+$") @property def table_type_short(self): - return 'appendable_multi' + return "appendable_multi" def write(self, obj, data_columns=None, **kwargs): if data_columns is None: @@ -4343,9 +4611,9 @@ def read(self, **kwargs): df = df.set_index(self.levels) # remove names for 'level_%d' - df.index = df.index.set_names([ - None if self._re_levels.search(l) else l for l in df.index.names - ]) + 
df.index = df.index.set_names( + [None if self._re_levels.search(l) else l for l in df.index.names] + ) return df @@ -4379,6 +4647,7 @@ def _get_info(info, name): idx = info[name] = dict() return idx + # tz to/from coercion @@ -4404,146 +4673,172 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ if tz is not None: - name = getattr(values, 'name', None) + name = getattr(values, "name", None) values = values.ravel() tz = timezones.get_timezone(_ensure_decoded(tz)) values = DatetimeIndex(values, name=name) if values.tz is None: - values = values.tz_localize('UTC').tz_convert(tz) + values = values.tz_localize("UTC").tz_convert(tz) if preserve_UTC: - if tz == 'UTC': + if tz == "UTC": values = list(values) elif coerce: - values = np.asarray(values, dtype='M8[ns]') + values = np.asarray(values, dtype="M8[ns]") return values -def _convert_index(index, encoding=None, errors='strict', format_type=None): - index_name = getattr(index, 'name', None) +def _convert_index(index, encoding=None, errors="strict", format_type=None): + index_name = getattr(index, "name", None) if isinstance(index, DatetimeIndex): converted = index.asi8 - return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - tz=getattr(index, 'tz', None), - index_name=index_name) + return IndexCol( + converted, + "datetime64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + tz=getattr(index, "tz", None), + index_name=index_name, + ) elif isinstance(index, TimedeltaIndex): converted = index.asi8 - return IndexCol(converted, 'timedelta64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol( + converted, + "timedelta64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + index_name=index_name, + ) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._ndarray_values, 'integer', atom, - freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol( + index._ndarray_values, + "integer", + atom, + freq=getattr(index, "freq", None), + index_name=index_name, + ) if isinstance(index, MultiIndex): - raise TypeError('MultiIndex not supported here!') + raise TypeError("MultiIndex not supported here!") inferred_type = lib.infer_dtype(index, skipna=False) values = np.asarray(index) - if inferred_type == 'datetime64': - converted = values.view('i8') - return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - tz=getattr(index, 'tz', None), - index_name=index_name) - elif inferred_type == 'timedelta64': - converted = values.view('i8') - return IndexCol(converted, 'timedelta64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), - index_name=index_name) - elif inferred_type == 'datetime': - converted = np.asarray([(time.mktime(v.timetuple()) + - v.microsecond / 1E6) for v in values], - dtype=np.float64) - return IndexCol(converted, 'datetime', _tables().Time64Col(), - index_name=index_name) - elif inferred_type == 'date': - converted = np.asarray([v.toordinal() for v in values], - dtype=np.int32) - return IndexCol(converted, 'date', _tables().Time32Col(), - index_name=index_name) - elif inferred_type == 'string': + if inferred_type == "datetime64": + converted = values.view("i8") + return IndexCol( + converted, + "datetime64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + 
tz=getattr(index, "tz", None), + index_name=index_name, + ) + elif inferred_type == "timedelta64": + converted = values.view("i8") + return IndexCol( + converted, + "timedelta64", + _tables().Int64Col(), + freq=getattr(index, "freq", None), + index_name=index_name, + ) + elif inferred_type == "datetime": + converted = np.asarray( + [(time.mktime(v.timetuple()) + v.microsecond / 1e6) for v in values], + dtype=np.float64, + ) + return IndexCol( + converted, "datetime", _tables().Time64Col(), index_name=index_name + ) + elif inferred_type == "date": + converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) + return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name) + elif inferred_type == "string": # atom = _tables().ObjectAtom() # return np.asarray(values, dtype='O'), 'object', atom converted = _convert_string_array(values, encoding, errors) itemsize = converted.dtype.itemsize return IndexCol( - converted, 'string', _tables().StringCol(itemsize), - itemsize=itemsize, index_name=index_name + converted, + "string", + _tables().StringCol(itemsize), + itemsize=itemsize, + index_name=index_name, ) - elif inferred_type == 'unicode': - if format_type == 'fixed': + elif inferred_type == "unicode": + if format_type == "fixed": atom = _tables().ObjectAtom() - return IndexCol(np.asarray(values, dtype='O'), 'object', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype="O"), "object", atom, index_name=index_name + ) raise TypeError( - "[unicode] is not supported as a in index type for [{0}] formats" - .format(format_type) + "[unicode] is not supported as a in index type for [{0}] formats".format( + format_type + ) ) - elif inferred_type == 'integer': + elif inferred_type == "integer": # take a guess for now, hope the values fit atom = _tables().Int64Col() - return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom, - index_name=index_name) - elif inferred_type == 'floating': + return IndexCol( + np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name + ) + elif inferred_type == "floating": atom = _tables().Float64Col() - return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name + ) else: # pragma: no cover atom = _tables().ObjectAtom() - return IndexCol(np.asarray(values, dtype='O'), 'object', atom, - index_name=index_name) + return IndexCol( + np.asarray(values, dtype="O"), "object", atom, index_name=index_name + ) -def _unconvert_index(data, kind, encoding=None, errors='strict'): +def _unconvert_index(data, kind, encoding=None, errors="strict"): kind = _ensure_decoded(kind) - if kind == 'datetime64': + if kind == "datetime64": index = DatetimeIndex(data) - elif kind == 'timedelta64': + elif kind == "timedelta64": index = TimedeltaIndex(data) - elif kind == 'datetime': - index = np.asarray([datetime.fromtimestamp(v) for v in data], - dtype=object) - elif kind == 'date': + elif kind == "datetime": + index = np.asarray([datetime.fromtimestamp(v) for v in data], dtype=object) + elif kind == "date": try: - index = np.asarray( - [date.fromordinal(v) for v in data], dtype=object) + index = np.asarray([date.fromordinal(v) for v in data], dtype=object) except (ValueError): - index = np.asarray( - [date.fromtimestamp(v) for v in data], dtype=object) - elif kind in ('integer', 'float'): + index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) + elif kind in ("integer", 
"float"): index = np.asarray(data) - elif kind in ('string'): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, - errors=errors) - elif kind == 'object': + elif kind in ("string"): + index = _unconvert_string_array( + data, nan_rep=None, encoding=encoding, errors=errors + ) + elif kind == "object": index = np.asarray(data[0]) else: # pragma: no cover - raise ValueError('unrecognized index type {kind}'.format(kind=kind)) + raise ValueError("unrecognized index type {kind}".format(kind=kind)) return index -def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, - errors='strict'): +def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"): kind = _ensure_decoded(kind) - if kind == 'datetime': + if kind == "datetime": index = to_datetime(data) - elif kind in ('integer'): + elif kind in ("integer"): index = np.asarray(data, dtype=object) - elif kind in ('string'): - index = _unconvert_string_array(data, nan_rep=None, encoding=encoding, - errors=errors) + elif kind in ("string"): + index = _unconvert_string_array( + data, nan_rep=None, encoding=encoding, errors=errors + ) else: # pragma: no cover - raise ValueError('unrecognized index type {kind}'.format(kind=kind)) + raise ValueError("unrecognized index type {kind}".format(kind=kind)) return index @@ -4566,8 +4861,9 @@ def _convert_string_array(data, encoding, errors, itemsize=None): # encode if needed if encoding is not None and len(data): - data = Series(data.ravel()).str.encode( - encoding, errors).values.reshape(data.shape) + data = ( + Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape) + ) # create the sized dtype if itemsize is None: @@ -4578,8 +4874,7 @@ def _convert_string_array(data, encoding, errors, itemsize=None): return data -def _unconvert_string_array(data, nan_rep=None, encoding=None, - errors='strict'): +def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"): """ inverse of _convert_string_array @@ -4612,7 +4907,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None, data = data.astype(dtype, copy=False).astype(object, copy=False) if nan_rep is None: - nan_rep = 'nan' + nan_rep = "nan" data = libwriters.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) @@ -4628,20 +4923,19 @@ def _maybe_convert(values, val_kind, encoding, errors): def _get_converter(kind, encoding, errors): kind = _ensure_decoded(kind) - if kind == 'datetime64': - return lambda x: np.asarray(x, dtype='M8[ns]') - elif kind == 'datetime': + if kind == "datetime64": + return lambda x: np.asarray(x, dtype="M8[ns]") + elif kind == "datetime": return lambda x: to_datetime(x, cache=True).to_pydatetime() - elif kind == 'string': - return lambda x: _unconvert_string_array(x, encoding=encoding, - errors=errors) + elif kind == "string": + return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors) else: # pragma: no cover - raise ValueError('invalid kind {kind}'.format(kind=kind)) + raise ValueError("invalid kind {kind}".format(kind=kind)) def _need_convert(kind): kind = _ensure_decoded(kind) - if kind in ('datetime', 'datetime64', 'string'): + if kind in ("datetime", "datetime64", "string"): return True return False @@ -4674,7 +4968,7 @@ def __init__(self, table, where=None, start=None, stop=None): # see if we have a passed coordinate like try: inferred = lib.infer_dtype(where, skipna=False) - if inferred == 'integer' or inferred == 'boolean': + if inferred == "integer" or inferred == "boolean": where = 
np.asarray(where) if where.dtype == np.bool_: start, stop = self.start, self.stop @@ -4684,13 +4978,11 @@ def __init__(self, table, where=None, start=None, stop=None): stop = self.table.nrows self.coordinates = np.arange(start, stop)[where] elif issubclass(where.dtype.type, np.integer): - if ((self.start is not None and - (where < self.start).any()) or - (self.stop is not None and - (where >= self.stop).any())): + if (self.start is not None and (where < self.start).any()) or ( + self.stop is not None and (where >= self.stop).any() + ): raise ValueError( - "where must have index locations >= start and " - "< stop" + "where must have index locations >= start and " "< stop" ) self.coordinates = where @@ -4723,8 +5015,9 @@ def generate(self, where): "reference to\n" " an axis (e.g. 'index' or 'columns'), or a " "data_column\n" - " The currently defined references are: {1}\n" - .format(where, ','.join(q.keys())) + " The currently defined references are: {1}\n".format( + where, ",".join(q.keys()) + ) ) def select(self): @@ -4732,9 +5025,9 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.read_where(self.condition.format(), - start=self.start, - stop=self.stop) + return self.table.table.read_where( + self.condition.format(), start=self.start, stop=self.stop + ) elif self.coordinates is not None: return self.table.table.read_coordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4755,9 +5048,9 @@ def select_coords(self): stop += nrows if self.condition is not None: - return self.table.table.get_where_list(self.condition.format(), - start=start, stop=stop, - sort=True) + return self.table.table.get_where_list( + self.condition.format(), start=start, stop=stop, sort=True + ) elif self.coordinates is not None: return self.coordinates diff --git a/pandas/io/s3.py b/pandas/io/s3.py index d784e8d473aac..0a7c082fec51c 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -4,8 +4,7 @@ from pandas.compat._optional import import_optional_dependency s3fs = import_optional_dependency( - "s3fs", - extra="The s3fs package is required to handle s3 files." + "s3fs", extra="The s3fs package is required to handle s3 files." ) @@ -15,12 +14,13 @@ def _strip_schema(url): return result.netloc + result.path -def get_filepath_or_buffer(filepath_or_buffer, encoding=None, - compression=None, mode=None): +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): from botocore.exceptions import NoCredentialsError if mode is None: - mode = 'rb' + mode = "rb" fs = s3fs.S3FileSystem(anon=False) try: @@ -33,6 +33,5 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # A NoCredentialsError is raised if you don't have creds # for that bucket. fs = s3fs.S3FileSystem(anon=True) - filepath_or_buffer = fs.open( - _strip_schema(filepath_or_buffer), mode) + filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) return filepath_or_buffer, None, compression, True diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 634bdfa93ba2e..7cc9dc11a8ccc 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -65,9 +65,17 @@ class SAS7BDATReader(BaseIterator): bytes. 
""" - def __init__(self, path_or_buf, index=None, convert_dates=True, - blank_missing=True, chunksize=None, encoding=None, - convert_text=True, convert_header_text=True): + def __init__( + self, + path_or_buf, + index=None, + convert_dates=True, + blank_missing=True, + chunksize=None, + encoding=None, + convert_text=True, + convert_header_text=True, + ): self.index = index self.convert_dates = convert_dates @@ -96,7 +104,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, str): - self._path_or_buf = open(self._path_or_buf, 'rb') + self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf self._get_properties() @@ -113,7 +121,7 @@ def column_data_offsets(self): def column_types(self): """Returns a numpy character array of the column types: s (string) or d (double)""" - return np.asarray(self._column_types, dtype=np.dtype('S1')) + return np.asarray(self._column_types, dtype=np.dtype("S1")) def close(self): try: @@ -126,7 +134,7 @@ def _get_properties(self): # Check magic number self._path_or_buf.seek(0) self._cached_page = self._path_or_buf.read(288) - if self._cached_page[0:len(const.magic)] != const.magic: + if self._cached_page[0 : len(const.magic)] != const.magic: self.close() raise ValueError("magic number mismatch (not a SAS file?)") @@ -150,9 +158,8 @@ def _get_properties(self): total_align = align1 + align2 # Get endianness information - buf = self._read_bytes(const.endianness_offset, - const.endianness_length) - if buf == b'\x01': + buf = self._read_bytes(const.endianness_offset, const.endianness_length) + if buf == b"\x01": self.byte_order = "<" else: self.byte_order = ">" @@ -166,36 +173,39 @@ def _get_properties(self): # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) - if buf == b'1': + if buf == b"1": self.platform = "unix" - elif buf == b'2': + elif buf == b"2": self.platform = "windows" else: self.platform = "unknown" buf = self._read_bytes(const.dataset_offset, const.dataset_length) - self.name = buf.rstrip(b'\x00 ') + self.name = buf.rstrip(b"\x00 ") if self.convert_header_text: - self.name = self.name.decode( - self.encoding or self.default_encoding) + self.name = self.name.decode(self.encoding or self.default_encoding) buf = self._read_bytes(const.file_type_offset, const.file_type_length) - self.file_type = buf.rstrip(b'\x00 ') + self.file_type = buf.rstrip(b"\x00 ") if self.convert_header_text: self.file_type = self.file_type.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) # Timestamp is epoch 01/01/1960 epoch = datetime(1960, 1, 1) - x = self._read_float(const.date_created_offset + align1, - const.date_created_length) - self.date_created = epoch + pd.to_timedelta(x, unit='s') - x = self._read_float(const.date_modified_offset + align1, - const.date_modified_length) - self.date_modified = epoch + pd.to_timedelta(x, unit='s') - - self.header_length = self._read_int(const.header_size_offset + align1, - const.header_size_length) + x = self._read_float( + const.date_created_offset + align1, const.date_created_length + ) + self.date_created = epoch + pd.to_timedelta(x, unit="s") + x = self._read_float( + const.date_modified_offset + align1, const.date_modified_length + ) + self.date_modified = epoch + pd.to_timedelta(x, unit="s") + + self.header_length = self._read_int( + const.header_size_offset + align1, const.header_size_length + ) # Read the 
rest of the header into cached_page. buf = self._path_or_buf.read(self.header_length - 288) @@ -204,44 +214,53 @@ def _get_properties(self): self.close() raise ValueError("The SAS7BDAT file appears to be truncated.") - self._page_length = self._read_int(const.page_size_offset + align1, - const.page_size_length) - self._page_count = self._read_int(const.page_count_offset + align1, - const.page_count_length) - - buf = self._read_bytes(const.sas_release_offset + total_align, - const.sas_release_length) - self.sas_release = buf.rstrip(b'\x00 ') + self._page_length = self._read_int( + const.page_size_offset + align1, const.page_size_length + ) + self._page_count = self._read_int( + const.page_count_offset + align1, const.page_count_length + ) + + buf = self._read_bytes( + const.sas_release_offset + total_align, const.sas_release_length + ) + self.sas_release = buf.rstrip(b"\x00 ") if self.convert_header_text: self.sas_release = self.sas_release.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.sas_server_type_offset + total_align, - const.sas_server_type_length) - self.server_type = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.sas_server_type_offset + total_align, const.sas_server_type_length + ) + self.server_type = buf.rstrip(b"\x00 ") if self.convert_header_text: self.server_type = self.server_type.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.os_version_number_offset + total_align, - const.os_version_number_length) - self.os_version = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.os_version_number_offset + total_align, const.os_version_number_length + ) + self.os_version = buf.rstrip(b"\x00 ") if self.convert_header_text: self.os_version = self.os_version.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) - buf = self._read_bytes(const.os_name_offset + total_align, - const.os_name_length) - buf = buf.rstrip(b'\x00 ') + buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length) + buf = buf.rstrip(b"\x00 ") if len(buf) > 0: self.os_name = buf.decode(self.encoding or self.default_encoding) else: - buf = self._read_bytes(const.os_maker_offset + total_align, - const.os_maker_length) - self.os_name = buf.rstrip(b'\x00 ') + buf = self._read_bytes( + const.os_maker_offset + total_align, const.os_maker_length + ) + self.os_name = buf.rstrip(b"\x00 ") if self.convert_header_text: self.os_name = self.os_name.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) def __next__(self): da = self.read(nrows=self.chunksize or 1) @@ -281,7 +300,7 @@ def _read_bytes(self, offset, length): if offset + length > len(self._cached_page): self.close() raise ValueError("The cached page is too small.") - return self._cached_page[offset:offset + length] + return self._cached_page[offset : offset + length] def _parse_metadata(self): done = False @@ -291,8 +310,7 @@ def _parse_metadata(self): break if len(self._cached_page) != self._page_length: self.close() - raise ValueError( - "Failed to read a meta data page from the SAS file.") + raise ValueError("Failed to read a meta data page from the SAS file.") done = self._process_page_meta() def _process_page_meta(self): @@ -302,43 +320,45 @@ def _process_page_meta(self): self._process_page_metadata() is_data_page = self._current_page_type & const.page_data_type is_mix_page = self._current_page_type 
in const.page_mix_types - return (is_data_page or is_mix_page - or self._current_page_data_subheader_pointers != []) + return ( + is_data_page + or is_mix_page + or self._current_page_data_subheader_pointers != [] + ) def _read_page_header(self): bit_offset = self._page_bit_offset tx = const.page_type_offset + bit_offset self._current_page_type = self._read_int(tx, const.page_type_length) tx = const.block_count_offset + bit_offset - self._current_page_block_count = self._read_int( - tx, const.block_count_length) + self._current_page_block_count = self._read_int(tx, const.block_count_length) tx = const.subheader_count_offset + bit_offset - self._current_page_subheaders_count = ( - self._read_int(tx, const.subheader_count_length)) + self._current_page_subheaders_count = self._read_int( + tx, const.subheader_count_length + ) def _process_page_metadata(self): bit_offset = self._page_bit_offset for i in range(self._current_page_subheaders_count): pointer = self._process_subheader_pointers( - const.subheader_pointers_offset + bit_offset, i) + const.subheader_pointers_offset + bit_offset, i + ) if pointer.length == 0: continue if pointer.compression == const.truncated_subheader_id: continue - subheader_signature = self._read_subheader_signature( - pointer.offset) - subheader_index = ( - self._get_subheader_index(subheader_signature, - pointer.compression, pointer.ptype)) + subheader_signature = self._read_subheader_signature(pointer.offset) + subheader_index = self._get_subheader_index( + subheader_signature, pointer.compression, pointer.ptype + ) self._process_subheader(subheader_index, pointer) def _get_subheader_index(self, signature, compression, ptype): index = const.subheader_signature_to_index.get(signature) if index is None: - f1 = ((compression == const.compressed_subheader_id) or - (compression == 0)) - f2 = (ptype == const.compressed_subheader_type) + f1 = (compression == const.compressed_subheader_id) or (compression == 0) + f2 = ptype == const.compressed_subheader_type if (self.compression != "") and f1 and f2: index = const.SASIndex.data_subheader_index else: @@ -349,8 +369,7 @@ def _get_subheader_index(self, signature, compression, ptype): def _process_subheader_pointers(self, offset, subheader_pointer_index): subheader_pointer_length = self._subheader_pointer_length - total_offset = (offset + - subheader_pointer_length * subheader_pointer_index) + total_offset = offset + subheader_pointer_length * subheader_pointer_index subheader_offset = self._read_int(total_offset, self._int_length) total_offset += self._int_length @@ -416,13 +435,17 @@ def _process_rowsize_subheader(self, offset, length): lcp_offset += 378 self.row_length = self._read_int( - offset + const.row_length_offset_multiplier * int_len, int_len) + offset + const.row_length_offset_multiplier * int_len, int_len + ) self.row_count = self._read_int( - offset + const.row_count_offset_multiplier * int_len, int_len) + offset + const.row_count_offset_multiplier * int_len, int_len + ) self.col_count_p1 = self._read_int( - offset + const.col_count_p1_multiplier * int_len, int_len) + offset + const.col_count_p1_multiplier * int_len, int_len + ) self.col_count_p2 = self._read_int( - offset + const.col_count_p2_multiplier * int_len, int_len) + offset + const.col_count_p2_multiplier * int_len, int_len + ) mx = const.row_count_on_mix_page_offset_multiplier * int_len self._mix_page_row_count = self._read_int(offset + mx, int_len) self._lcs = self._read_int(lcs_offset, 2) @@ -432,13 +455,15 @@ def _process_columnsize_subheader(self, 
offset, length): int_len = self._int_length offset += int_len self.column_count = self._read_int(offset, int_len) - if (self.col_count_p1 + self.col_count_p2 != - self.column_count): + if self.col_count_p1 + self.col_count_p2 != self.column_count: print( "Warning: column count mismatch ({p1} + {p2} != " "{column_count})\n".format( - p1=self.col_count_p1, p2=self.col_count_p2, - column_count=self.column_count)) + p1=self.col_count_p1, + p2=self.col_count_p2, + column_count=self.column_count, + ) + ) # Unknown purpose def _process_subheader_counts(self, offset, length): @@ -476,60 +501,74 @@ def _process_columntext_subheader(self, offset, length): if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcp) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] elif compression_literal == const.rle_compression: offset1 = offset + 40 if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcp) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] elif self._lcs > 0: self._lcp = 0 offset1 = offset + 16 if self.U64: offset1 += 4 buf = self._read_bytes(offset1, self._lcs) - self.creator_proc = buf[0:self._lcp] + self.creator_proc = buf[0 : self._lcp] if self.convert_header_text: if hasattr(self, "creator_proc"): self.creator_proc = self.creator_proc.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) def _process_columnname_subheader(self, offset, length): int_len = self._int_length offset += int_len column_name_pointers_count = (length - 2 * int_len - 12) // 8 for i in range(column_name_pointers_count): - text_subheader = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_text_subheader_offset - col_name_offset = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_offset_offset - col_name_length = offset + const.column_name_pointer_length * \ - (i + 1) + const.column_name_length_offset + text_subheader = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_text_subheader_offset + ) + col_name_offset = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_offset_offset + ) + col_name_length = ( + offset + + const.column_name_pointer_length * (i + 1) + + const.column_name_length_offset + ) idx = self._read_int( - text_subheader, const.column_name_text_subheader_length) + text_subheader, const.column_name_text_subheader_length + ) col_offset = self._read_int( - col_name_offset, const.column_name_offset_length) - col_len = self._read_int( - col_name_length, const.column_name_length_length) + col_name_offset, const.column_name_offset_length + ) + col_len = self._read_int(col_name_length, const.column_name_length_length) name_str = self.column_names_strings[idx] - self.column_names.append(name_str[col_offset:col_offset + col_len]) + self.column_names.append(name_str[col_offset : col_offset + col_len]) def _process_columnattributes_subheader(self, offset, length): int_len = self._int_length - column_attributes_vectors_count = ( - length - 2 * int_len - 12) // (int_len + 8) + column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8) for i in range(column_attributes_vectors_count): - col_data_offset = (offset + int_len + - const.column_data_offset_offset + - i * (int_len + 8)) - col_data_len = (offset + 2 * int_len + - const.column_data_length_offset + - i * (int_len + 8)) - col_types = (offset + 2 * int_len + - const.column_type_offset + i * (int_len + 8)) + col_data_offset 
= ( + offset + int_len + const.column_data_offset_offset + i * (int_len + 8) + ) + col_data_len = ( + offset + + 2 * int_len + + const.column_data_length_offset + + i * (int_len + 8) + ) + col_types = ( + offset + 2 * int_len + const.column_type_offset + i * (int_len + 8) + ) x = self._read_int(col_data_offset, int_len) self._column_data_offsets.append(x) @@ -538,7 +577,7 @@ def _process_columnattributes_subheader(self, offset, length): self._column_data_lengths.append(x) x = self._read_int(col_types, const.column_type_length) - self._column_types.append(b'd' if x == 1 else b's') + self._column_types.append(b"d" if x == 1 else b"s") def _process_columnlist_subheader(self, offset, length): # unknown purpose @@ -547,47 +586,38 @@ def _process_columnlist_subheader(self, offset, length): def _process_format_subheader(self, offset, length): int_len = self._int_length text_subheader_format = ( - offset + - const.column_format_text_subheader_index_offset + - 3 * int_len) - col_format_offset = (offset + - const.column_format_offset_offset + - 3 * int_len) - col_format_len = (offset + - const.column_format_length_offset + - 3 * int_len) + offset + const.column_format_text_subheader_index_offset + 3 * int_len + ) + col_format_offset = offset + const.column_format_offset_offset + 3 * int_len + col_format_len = offset + const.column_format_length_offset + 3 * int_len text_subheader_label = ( - offset + - const.column_label_text_subheader_index_offset + - 3 * int_len) - col_label_offset = (offset + - const.column_label_offset_offset + - 3 * int_len) + offset + const.column_label_text_subheader_index_offset + 3 * int_len + ) + col_label_offset = offset + const.column_label_offset_offset + 3 * int_len col_label_len = offset + const.column_label_length_offset + 3 * int_len - x = self._read_int(text_subheader_format, - const.column_format_text_subheader_index_length) + x = self._read_int( + text_subheader_format, const.column_format_text_subheader_index_length + ) format_idx = min(x, len(self.column_names_strings) - 1) format_start = self._read_int( - col_format_offset, const.column_format_offset_length) - format_len = self._read_int( - col_format_len, const.column_format_length_length) + col_format_offset, const.column_format_offset_length + ) + format_len = self._read_int(col_format_len, const.column_format_length_length) label_idx = self._read_int( - text_subheader_label, - const.column_label_text_subheader_index_length) + text_subheader_label, const.column_label_text_subheader_index_length + ) label_idx = min(label_idx, len(self.column_names_strings) - 1) - label_start = self._read_int( - col_label_offset, const.column_label_offset_length) - label_len = self._read_int(col_label_len, - const.column_label_length_length) + label_start = self._read_int(col_label_offset, const.column_label_offset_length) + label_len = self._read_int(col_label_len, const.column_label_length_length) label_names = self.column_names_strings[label_idx] - column_label = label_names[label_start: label_start + label_len] + column_label = label_names[label_start : label_start + label_len] format_names = self.column_names_strings[format_idx] - column_format = format_names[format_start: format_start + format_len] + column_format = format_names[format_start : format_start + format_len] current_column_number = len(self.columns) col = _column() @@ -619,8 +649,8 @@ def read(self, nrows=None): if nrows > m: nrows = m - nd = self._column_types.count(b'd') - ns = self._column_types.count(b's') + nd = self._column_types.count(b"d") + ns = 
self._column_types.count(b"s") self._string_chunk = np.empty((ns, nrows), dtype=np.object) self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) @@ -642,10 +672,8 @@ def _read_next_page(self): return True elif len(self._cached_page) != self._page_length: self.close() - msg = ("failed to read complete page from file " - "(read {:d} of {:d} bytes)") - raise ValueError(msg.format(len(self._cached_page), - self._page_length)) + msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)" + raise ValueError(msg.format(len(self._cached_page), self._page_length)) self._read_page_header() page_type = self._current_page_type @@ -671,32 +699,34 @@ def _chunk_to_dataframe(self): name = self.column_names[j] - if self._column_types[j] == b'd': - rslt[name] = self._byte_chunk[jb, :].view( - dtype=self.byte_order + 'd') + if self._column_types[j] == b"d": + rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d") rslt[name] = np.asarray(rslt[name], dtype=np.float64) if self.convert_dates: unit = None if self.column_formats[j] in const.sas_date_formats: - unit = 'd' + unit = "d" elif self.column_formats[j] in const.sas_datetime_formats: - unit = 's' + unit = "s" if unit: - rslt[name] = pd.to_datetime(rslt[name], unit=unit, - origin="1960-01-01") + rslt[name] = pd.to_datetime( + rslt[name], unit=unit, origin="1960-01-01" + ) jb += 1 - elif self._column_types[j] == b's': + elif self._column_types[j] == b"s": rslt[name] = self._string_chunk[js, :] if self.convert_text and (self.encoding is not None): rslt[name] = rslt[name].str.decode( - self.encoding or self.default_encoding) + self.encoding or self.default_encoding + ) if self.blank_missing: ii = rslt[name].str.len() == 0 rslt.loc[ii, name] = np.nan js += 1 else: self.close() - raise ValueError("unknown column type {type}".format( - type=self._column_types[j])) + raise ValueError( + "unknown column type {type}".format(type=self._column_types[j]) + ) return rslt diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index c37a26cd62ad2..23b23a1bf09c0 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -1,13 +1,15 @@ -magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" + - b"\x00\x00\x00\x00\xc2\xea\x81\x60" + - b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + - b"\x09\xc7\x31\x8c\x18\x1f\x10\x11") +magic = ( + b"\x00\x00\x00\x00\x00\x00\x00\x00" + + b"\x00\x00\x00\x00\xc2\xea\x81\x60" + + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" + + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11" +) -align_1_checker_value = b'3' +align_1_checker_value = b"3" align_1_offset = 32 align_1_length = 1 align_1_value = 4 -u64_byte_checker_value = b'3' +u64_byte_checker_value = b"3" align_2_offset = 35 align_2_length = 1 align_2_value = 4 @@ -91,15 +93,22 @@ column_label_offset_length = 2 column_label_length_offset = 32 column_label_length_length = 2 -rle_compression = b'SASYZCRL' -rdc_compression = b'SASYZCR2' +rle_compression = b"SASYZCRL" +rdc_compression = b"SASYZCR2" compression_literals = [rle_compression, rdc_compression] # Incomplete list of encodings, using SAS nomenclature: # http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm -encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2", - 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"} +encoding_names = { + 29: "latin1", + 20: "utf-8", + 33: "cyrillic", + 60: "wlatin2", + 61: "wcyrillic", + 62: "wlatin1", + 90: "ebcdic870", +} class SASIndex: @@ -144,28 +153,101 @@ class SASIndex: b"\xFE\xFF\xFF\xFF": 
SASIndex.column_list_index, b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index, b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index, - b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index} + b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index, +} # List of frequently used SAS date and datetime formats # http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm # https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java -sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN", - "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS", - "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR", - "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV", - "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD", - "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ", - "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC", - "YYQRD", "YYQRP", "YYQRS", "YYQRN", - "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC", - "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN", - "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB", - "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS", - "MINGUO") +sas_date_formats = ( + "DATE", + "DAY", + "DDMMYY", + "DOWNAME", + "JULDAY", + "JULIAN", + "MMDDYY", + "MMYY", + "MMYYC", + "MMYYD", + "MMYYP", + "MMYYS", + "MMYYN", + "MONNAME", + "MONTH", + "MONYY", + "QTR", + "QTRR", + "NENGO", + "WEEKDATE", + "WEEKDATX", + "WEEKDAY", + "WEEKV", + "WORDDATE", + "WORDDATX", + "YEAR", + "YYMM", + "YYMMC", + "YYMMD", + "YYMMP", + "YYMMS", + "YYMMN", + "YYMON", + "YYMMDD", + "YYQ", + "YYQC", + "YYQD", + "YYQP", + "YYQS", + "YYQN", + "YYQR", + "YYQRC", + "YYQRD", + "YYQRP", + "YYQRS", + "YYQRN", + "YYMMDDP", + "YYMMDDC", + "E8601DA", + "YYMMDDN", + "MMDDYYC", + "MMDDYYS", + "MMDDYYD", + "YYMMDDS", + "B8601DA", + "DDMMYYN", + "YYMMDDD", + "DDMMYYB", + "DDMMYYP", + "MMDDYYP", + "YYMMDDB", + "MMDDYYN", + "DDMMYYC", + "DDMMYYD", + "DDMMYYS", + "MINGUO", +) -sas_datetime_formats = ("DATETIME", "DTWKDATX", - "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX", - "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX", - "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX", - "DTYEAR", "TOD", "MDYAMPM") +sas_datetime_formats = ( + "DATETIME", + "DTWKDATX", + "B8601DN", + "B8601DT", + "B8601DX", + "B8601DZ", + "B8601LX", + "E8601DN", + "E8601DT", + "E8601DX", + "E8601DZ", + "E8601LX", + "DATEAMPM", + "DTDATE", + "DTMONYY", + "DTMONYY", + "DTWKDATX", + "DTYEAR", + "TOD", + "MDYAMPM", +) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 0dbea452230d6..34b93d72d0e29 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -21,17 +21,39 @@ from pandas.io.common import BaseIterator, get_filepath_or_buffer -_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" - "000000000000000001600000000") -_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!" - "000000000000000000000000000000 ") -_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label', - 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform', - 'nifl', 'nifd', 'npos', '_'] +_correct_line1 = ( + "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!" 
+ "000000000000000000000000000000 " +) +_correct_header1 = ( + "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" "000000000000000001600000000" +) +_correct_header2 = ( + "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_correct_obs_header = ( + "HEADER RECORD*******OBS HEADER RECORD!!!!!!!" + "000000000000000000000000000000 " +) +_fieldkeys = [ + "ntype", + "nhfun", + "field_length", + "nvar0", + "name", + "label", + "nform", + "nfl", + "num_decimals", + "nfj", + "nfill", + "niform", + "nifl", + "nifd", + "npos", + "_", +] _base_params_doc = """\ @@ -80,10 +102,12 @@ >>> for chunk in itr: >>> do_something(chunk) -""" % {"_base_params_doc": _base_params_doc, - "_format_params_doc": _format_params_doc, - "_params2_doc": _params2_doc, - "_iterator_doc": _iterator_doc} +""" % { + "_base_params_doc": _base_params_doc, + "_format_params_doc": _format_params_doc, + "_params2_doc": _params2_doc, + "_iterator_doc": _iterator_doc, +} _xport_reader_doc = """\ @@ -98,8 +122,10 @@ Contains information about the file fields : list Contains information about the variables in the file -""" % {"_base_params_doc": _base_params_doc, - "_params2_doc": _params2_doc} +""" % { + "_base_params_doc": _base_params_doc, + "_params2_doc": _params2_doc, +} _read_method_doc = """\ @@ -142,9 +168,9 @@ def _split_line(s, parts): out = {} start = 0 for name, length in parts: - out[name] = s[start:start + length].strip() + out[name] = s[start : start + length].strip() start += length - del out['_'] + del out["_"] return out @@ -158,10 +184,10 @@ def _handle_truncated_float_vec(vec, nbytes): # The R "foreign" library if nbytes != 8: - vec1 = np.zeros(len(vec), np.dtype('S8')) - dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes)) + vec1 = np.zeros(len(vec), np.dtype("S8")) + dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes)) vec2 = vec1.view(dtype=dtype) - vec2['f0'] = vec + vec2["f0"] = vec return vec2 return vec @@ -173,14 +199,14 @@ def _parse_float_vec(vec): native 8 byte floats. """ - dtype = np.dtype('>u4,>u4') + dtype = np.dtype(">u4,>u4") vec1 = vec.view(dtype=dtype) - xport1 = vec1['f0'] - xport2 = vec1['f1'] + xport1 = vec1["f0"] + xport2 = vec1["f1"] # Start by setting first half of ieee number to first half of IBM # number sans exponent - ieee1 = xport1 & 0x00ffffff + ieee1 = xport1 & 0x00FFFFFF # The fraction bit to the left of the binary point in the ieee # format was set and the number was shifted 0, 1, 2, or 3 @@ -203,7 +229,7 @@ def _parse_float_vec(vec): ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift))) # clear the 1 bit to the left of the binary point - ieee1 &= 0xffefffff + ieee1 &= 0xFFEFFFFF # set the exponent of the ieee number to be the actual exponent # plus the shift count + 1023. Or this into the first half of the @@ -212,14 +238,15 @@ def _parse_float_vec(vec): # incremented by 1 and the fraction bits left 4 positions to the # right of the radix point. 
(had to add >> 24 because C treats & # 0x7f as 0x7f000000 and Python doesn't) - ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) + - shift + 1023) << 20) | (xport1 & 0x80000000) + ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | ( + xport1 & 0x80000000 + ) - ieee = np.empty((len(ieee1),), dtype='>u4,>u4') - ieee['f0'] = ieee1 - ieee['f1'] = ieee2 - ieee = ieee.view(dtype='>f8') - ieee = ieee.astype('f8') + ieee = np.empty((len(ieee1),), dtype=">u4,>u4") + ieee["f0"] = ieee1 + ieee["f1"] = ieee2 + ieee = ieee.view(dtype=">f8") + ieee = ieee.astype("f8") return ieee @@ -227,8 +254,9 @@ def _parse_float_vec(vec): class XportReader(BaseIterator): __doc__ = _xport_reader_doc - def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', - chunksize=None): + def __init__( + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + ): self._encoding = encoding self._lines_read = 0 @@ -236,12 +264,15 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - (filepath_or_buffer, encoding, - compression, should_close) = get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding) + ( + filepath_or_buffer, + encoding, + compression, + should_close, + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, bytes)): - self.filepath_or_buffer = open(filepath_or_buffer, 'rb') + self.filepath_or_buffer = open(filepath_or_buffer, "rb") else: # Copy to BytesIO, and ensure no encoding contents = filepath_or_buffer.read() @@ -269,23 +300,22 @@ def _read_header(self): raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() - fif = [['prefix', 24], ['version', 8], ['OS', 8], - ['_', 24], ['created', 16]] + fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]] file_info = _split_line(line2, fif) - if file_info['prefix'] != "SAS SAS SASLIB": + if file_info["prefix"] != "SAS SAS SASLIB": self.close() raise ValueError("Header record has invalid prefix.") - file_info['created'] = _parse_date(file_info['created']) + file_info["created"] = _parse_date(file_info["created"]) self.file_info = file_info line3 = self._get_row() - file_info['modified'] = _parse_date(line3[:16]) + file_info["modified"] = _parse_date(line3[:16]) # read member header header1 = self._get_row() header2 = self._get_row() headflag1 = header1.startswith(_correct_header1) - headflag2 = (header2 == _correct_header2) + headflag2 = header2 == _correct_header2 if not (headflag1 and headflag2): self.close() raise ValueError("Member header not found") @@ -293,17 +323,24 @@ def _read_header(self): fieldnamelength = int(header1[-5:-2]) # member info - mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8], - ['version', 8], ['OS', 8], ['_', 24], ['created', 16]] + mem = [ + ["prefix", 8], + ["set_name", 8], + ["sasdata", 8], + ["version", 8], + ["OS", 8], + ["_", 24], + ["created", 16], + ] member_info = _split_line(self._get_row(), mem) - mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]] + mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]] member_info.update(_split_line(self._get_row(), mem)) - member_info['modified'] = _parse_date(member_info['modified']) - member_info['created'] = _parse_date(member_info['created']) + member_info["modified"] = _parse_date(member_info["modified"]) + member_info["created"] = _parse_date(member_info["created"]) self.member_info = member_info # read field 
names - types = {1: 'numeric', 2: 'char'} + types = {1: "numeric", 2: "char"} fieldcount = int(self._get_row()[54:58]) datalength = fieldnamelength * fieldcount # round up to nearest 80 @@ -314,19 +351,21 @@ def _read_header(self): obs_length = 0 while len(fielddata) >= fieldnamelength: # pull data for one field - field, fielddata = (fielddata[:fieldnamelength], - fielddata[fieldnamelength:]) + field, fielddata = ( + fielddata[:fieldnamelength], + fielddata[fieldnamelength:], + ) # rest at end gets ignored, so if field is short, pad out # to match struct pattern below field = field.ljust(140) - fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field) + fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field) field = dict(zip(_fieldkeys, fieldstruct)) - del field['_'] - field['ntype'] = types[field['ntype']] - fl = field['field_length'] - if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): + del field["_"] + field["ntype"] = types[field["ntype"]] + fl = field["field_length"] + if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): self.close() msg = "Floating field width {0} is not between 2 and 8." raise TypeError(msg.format(fl)) @@ -337,7 +376,7 @@ def _read_header(self): except AttributeError: pass - obs_length += field['field_length'] + obs_length += field["field_length"] fields += [field] header = self._get_row() @@ -350,11 +389,13 @@ def _read_header(self): self.record_start = self.filepath_or_buffer.tell() self.nobs = self._record_count() - self.columns = [x['name'].decode() for x in self.fields] + self.columns = [x["name"].decode() for x in self.fields] # Setup the dtype. - dtypel = [('s' + str(i), "S" + str(field['field_length'])) - for i, field in enumerate(self.fields)] + dtypel = [ + ("s" + str(i), "S" + str(field["field_length"])) + for i, field in enumerate(self.fields) + ] dtype = np.dtype(dtypel) self._dtype = dtype @@ -372,8 +413,7 @@ def _record_count(self): """ self.filepath_or_buffer.seek(0, 2) - total_records_length = (self.filepath_or_buffer.tell() - - self.record_start) + total_records_length = self.filepath_or_buffer.tell() - self.record_start if total_records_length % 80 != 0: warnings.warn("xport file may be corrupted") @@ -416,10 +456,13 @@ def get_chunk(self, size=None): return self.read(nrows=size) def _missing_double(self, vec): - v = vec.view(dtype='u1,u1,u2,u4') - miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0) - miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) | - (v['f0'] == 0x5f) | (v['f0'] == 0x2e)) + v = vec.view(dtype="u1,u1,u2,u4") + miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0) + miss1 = ( + ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A)) + | (v["f0"] == 0x5F) + | (v["f0"] == 0x2E) + ) miss &= miss1 return miss @@ -439,15 +482,14 @@ def read(self, nrows=None): df = pd.DataFrame(index=range(read_lines)) for j, x in enumerate(self.columns): - vec = data['s%d' % j] - ntype = self.fields[j]['ntype'] + vec = data["s%d" % j] + ntype = self.fields[j]["ntype"] if ntype == "numeric": - vec = _handle_truncated_float_vec( - vec, self.fields[j]['field_length']) + vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"]) miss = self._missing_double(vec) v = _parse_float_vec(vec) v[miss] = np.nan - elif self.fields[j]['ntype'] == 'char': + elif self.fields[j]["ntype"] == "char": v = [y.rstrip() for y in vec] if self._encoding is not None: diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 0726e17e3bbab..680425f421eec 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ 
-4,8 +4,14 @@ from pandas.io.common import _stringify_path -def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, - chunksize=None, iterator=False): +def read_sas( + filepath_or_buffer, + format=None, + index=None, + encoding=None, + chunksize=None, + iterator=False, +): """ Read SAS files stored as either XPORT or SAS7BDAT format files. @@ -31,9 +37,11 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, or XportReader """ if format is None: - buffer_error_msg = ("If this is a buffer object rather " - "than a string name, you must specify " - "a format string") + buffer_error_msg = ( + "If this is a buffer object rather " + "than a string name, you must specify " + "a format string" + ) filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) @@ -45,18 +53,20 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, else: raise ValueError("unable to infer format of SAS file") - if format.lower() == 'xport': + if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader - reader = XportReader(filepath_or_buffer, index=index, - encoding=encoding, - chunksize=chunksize) - elif format.lower() == 'sas7bdat': + + reader = XportReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) + elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader - reader = SAS7BDATReader(filepath_or_buffer, index=index, - encoding=encoding, - chunksize=chunksize) + + reader = SAS7BDATReader( + filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ) else: - raise ValueError('unknown SAS format') + raise ValueError("unknown SAS format") if iterator or chunksize: return reader diff --git a/pandas/io/spss.py b/pandas/io/spss.py index b1b92fc2b8439..983ac1c818c42 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,9 +7,11 @@ from pandas.core.api import DataFrame -def read_spss(path: Union[str, Path], - usecols: Optional[Sequence[str]] = None, - convert_categoricals: bool = True) -> DataFrame: +def read_spss( + path: Union[str, Path], + usecols: Optional[Sequence[str]] = None, + convert_categoricals: bool = True, +) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. 
@@ -36,6 +38,7 @@ def read_spss(path: Union[str, Path], else: usecols = list(usecols) # pyreadstat requires a list - df, _ = pyreadstat.read_sav(path, usecols=usecols, - apply_value_formats=convert_categoricals) + df, _ = pyreadstat.read_sav( + path, usecols=usecols, apply_value_formats=convert_categoricals + ) return df diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 6cb57077be76a..211571c7dbaa1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -14,8 +14,7 @@ import pandas._libs.lib as lib from pandas.compat import raise_with_traceback -from pandas.core.dtypes.common import ( - is_datetime64tz_dtype, is_dict_like, is_list_like) +from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -43,12 +42,14 @@ def _is_sqlalchemy_connectable(con): if _SQLALCHEMY_INSTALLED is None: try: import sqlalchemy + _SQLALCHEMY_INSTALLED = True except ImportError: _SQLALCHEMY_INSTALLED = False if _SQLALCHEMY_INSTALLED: import sqlalchemy # noqa: F811 + return isinstance(con, sqlalchemy.engine.Connectable) else: return False @@ -58,7 +59,7 @@ def _convert_params(sql, params): """Convert SQL and params args to DBAPI2.0 compliant format.""" args = [sql] if params is not None: - if hasattr(params, 'keys'): # test if params is a mapping + if hasattr(params, "keys"): # test if params is a mapping args += [params] else: args += [list(params)] @@ -71,28 +72,30 @@ def _process_parse_dates_argument(parse_dates): if parse_dates is True or parse_dates is None or parse_dates is False: parse_dates = [] - elif not hasattr(parse_dates, '__iter__'): + elif not hasattr(parse_dates, "__iter__"): parse_dates = [parse_dates] return parse_dates def _handle_date_column(col, utc=None, format=None): if isinstance(format, dict): - return to_datetime(col, errors='ignore', **format) + return to_datetime(col, errors="ignore", **format) else: # Allow passing of formatting string for integers # GH17855 - if format is None and (issubclass(col.dtype.type, np.floating) or - issubclass(col.dtype.type, np.integer)): - format = 's' - if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']: - return to_datetime(col, errors='coerce', unit=format, utc=utc) + if format is None and ( + issubclass(col.dtype.type, np.floating) + or issubclass(col.dtype.type, np.integer) + ): + format = "s" + if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]: + return to_datetime(col, errors="coerce", unit=format, utc=utc) elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return to_datetime(col, utc=True) else: - return to_datetime(col, errors='coerce', format=format, utc=utc) + return to_datetime(col, errors="coerce", format=format, utc=utc) def _parse_date_columns(data_frame, parse_dates): @@ -116,12 +119,10 @@ def _parse_date_columns(data_frame, parse_dates): return data_frame -def _wrap_result(data, columns, index_col=None, coerce_float=True, - parse_dates=None): +def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None): """Wrap result set of query in a DataFrame.""" - frame = DataFrame.from_records(data, columns=columns, - coerce_float=coerce_float) + frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float) frame = _parse_date_columns(frame, parse_dates) @@ -162,9 +163,17 @@ def execute(sql, con, cur=None, params=None): # ----------------------------------------------------------------------------- # -- Read and write to DataFrames -def 
read_sql_table(table_name, con, schema=None, index_col=None, - coerce_float=True, parse_dates=None, columns=None, - chunksize=None): + +def read_sql_table( + table_name, + con, + schema=None, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + chunksize=None, +): """ Read SQL database table into a DataFrame. @@ -223,10 +232,12 @@ def read_sql_table(table_name, con, schema=None, index_col=None, con = _engine_builder(con) if not _is_sqlalchemy_connectable(con): - raise NotImplementedError("read_sql_table only supported for " - "SQLAlchemy connectable.") + raise NotImplementedError( + "read_sql_table only supported for " "SQLAlchemy connectable." + ) import sqlalchemy from sqlalchemy.schema import MetaData + meta = MetaData(con, schema=schema) try: meta.reflect(only=[table_name], views=True) @@ -235,8 +246,13 @@ def read_sql_table(table_name, con, schema=None, index_col=None, pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( - table_name, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, chunksize=chunksize) + table_name, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) if table is not None: return table @@ -244,8 +260,15 @@ def read_sql_table(table_name, con, schema=None, index_col=None, raise ValueError("Table {name} not found".format(name=table_name), con) -def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None, chunksize=None): +def read_sql_query( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, +): """Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query @@ -301,12 +324,25 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, """ pandas_sql = pandasSQL_builder(con) return pandas_sql.read_query( - sql, index_col=index_col, params=params, coerce_float=coerce_float, - parse_dates=parse_dates, chunksize=chunksize) - - -def read_sql(sql, con, index_col=None, coerce_float=True, params=None, - parse_dates=None, columns=None, chunksize=None): + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) + + +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, +): """ Read SQL query or database table into a DataFrame. 
@@ -366,9 +402,13 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, if isinstance(pandas_sql, SQLiteDatabase): return pandas_sql.read_query( - sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates, - chunksize=chunksize) + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) try: _is_table_name = pandas_sql.has_table(sql) @@ -379,17 +419,36 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, if _is_table_name: pandas_sql.meta.reflect(only=[sql]) return pandas_sql.read_table( - sql, index_col=index_col, coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, chunksize=chunksize) + sql, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) else: return pandas_sql.read_query( - sql, index_col=index_col, params=params, - coerce_float=coerce_float, parse_dates=parse_dates, - chunksize=chunksize) + sql, + index_col=index_col, + params=params, + coerce_float=coerce_float, + parse_dates=parse_dates, + chunksize=chunksize, + ) -def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, - index_label=None, chunksize=None, dtype=None, method=None): +def to_sql( + frame, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, +): """ Write records stored in a DataFrame to a SQL database. @@ -435,7 +494,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, .. versionadded:: 0.24.0 """ - if if_exists not in ('fail', 'replace', 'append'): + if if_exists not in ("fail", "replace", "append"): raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) pandas_sql = pandasSQL_builder(con, schema=schema) @@ -443,12 +502,21 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True, if isinstance(frame, Series): frame = frame.to_frame() elif not isinstance(frame, DataFrame): - raise NotImplementedError("'frame' argument should be either a " - "Series or a DataFrame") + raise NotImplementedError( + "'frame' argument should be either a " "Series or a DataFrame" + ) - pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, - index_label=index_label, schema=schema, - chunksize=chunksize, dtype=dtype, method=method) + pandas_sql.to_sql( + frame, + name, + if_exists=if_exists, + index=index, + index_label=index_label, + schema=schema, + chunksize=chunksize, + dtype=dtype, + method=method, + ) def has_table(table_name, con, schema=None): @@ -496,8 +564,7 @@ def _engine_builder(con): return con -def pandasSQL_builder(con, schema=None, meta=None, - is_cursor=False): +def pandasSQL_builder(con, schema=None, meta=None, is_cursor=False): """ Convenience function to return the correct PandasSQL subclass based on the provided parameters. @@ -521,11 +588,22 @@ class SQLTable(PandasObject): Also holds various flags needed to avoid having to pass them between functions all the time. 
""" + # TODO: support for multiIndex - def __init__(self, name, pandas_sql_engine, frame=None, index=True, - if_exists='fail', prefix='pandas', index_label=None, - schema=None, keys=None, dtype=None): + def __init__( + self, + name, + pandas_sql_engine, + frame=None, + index=True, + if_exists="fail", + prefix="pandas", + index_label=None, + schema=None, + keys=None, + dtype=None, + ): self.name = name self.pd_sql = pandas_sql_engine self.prefix = prefix @@ -544,14 +622,14 @@ def __init__(self, name, pandas_sql_engine, frame=None, index=True, self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError( - "Could not init table '{name}'".format(name=name)) + raise ValueError("Could not init table '{name}'".format(name=name)) def exists(self): return self.pd_sql.has_table(self.name, self.schema) def sql_schema(self): from sqlalchemy.schema import CreateTable + return str(CreateTable(self.table).compile(self.pd_sql.connectable)) def _execute_create(self): @@ -561,17 +639,19 @@ def _execute_create(self): def create(self): if self.exists(): - if self.if_exists == 'fail': + if self.if_exists == "fail": raise ValueError( - "Table '{name}' already exists.".format(name=self.name)) - elif self.if_exists == 'replace': + "Table '{name}' already exists.".format(name=self.name) + ) + elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() - elif self.if_exists == 'append': + elif self.if_exists == "append": pass else: raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists)) + "'{0}' is not valid for if_exists".format(self.if_exists) + ) else: self._execute_create() @@ -606,8 +686,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError( - "duplicate name in index/columns: {0}".format(err)) + raise ValueError("duplicate name in index/columns: {0}".format(err)) else: temp = self.frame @@ -626,7 +705,7 @@ def insert_data(self): d = np.atleast_2d(d) else: # convert to microsecond resolution for datetime.datetime - d = b.values.astype('M8[us]').astype(object) + d = b.values.astype("M8[us]").astype(object) else: d = np.array(b.get_values(), dtype=object) @@ -645,12 +724,12 @@ def insert(self, chunksize=None, method=None): # set insert method if method is None: exec_insert = self._execute_insert - elif method == 'multi': + elif method == "multi": exec_insert = self._execute_insert_multi elif callable(method): exec_insert = partial(method, self) else: - raise ValueError('Invalid parameter `method`: {}'.format(method)) + raise ValueError("Invalid parameter `method`: {}".format(method)) keys, data_list = self.insert_data() @@ -662,7 +741,7 @@ def insert(self, chunksize=None, method=None): if chunksize is None: chunksize = nrows elif chunksize == 0: - raise ValueError('chunksize argument should be non-zero') + raise ValueError("chunksize argument should be non-zero") chunks = int(nrows / chunksize) + 1 @@ -676,8 +755,9 @@ def insert(self, chunksize=None, method=None): chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) exec_insert(conn, keys, chunk_iter) - def _query_iterator(self, result, chunksize, columns, coerce_float=True, - parse_dates=None): + def _query_iterator( + self, result, chunksize, columns, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set.""" while True: @@ -686,7 +766,8 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True, break else: self.frame = DataFrame.from_records( 
- data, columns=columns, coerce_float=coerce_float) + data, columns=columns, coerce_float=coerce_float + ) self._harmonize_columns(parse_dates=parse_dates) @@ -695,11 +776,11 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True, yield self.frame - def read(self, coerce_float=True, parse_dates=None, columns=None, - chunksize=None): + def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None): if columns is not None and len(columns) > 0: from sqlalchemy import select + cols = [self.table.c[n] for n in columns] if self.index is not None: [cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] @@ -711,13 +792,18 @@ def read(self, coerce_float=True, parse_dates=None, columns=None, column_names = result.keys() if chunksize is not None: - return self._query_iterator(result, chunksize, column_names, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + result, + chunksize, + column_names, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = result.fetchall() self.frame = DataFrame.from_records( - data, columns=column_names, coerce_float=coerce_float) + data, columns=column_names, coerce_float=coerce_float + ) self._harmonize_columns(parse_dates=parse_dates) @@ -737,16 +823,22 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels)) + "levels, which is {0}".format(nlevels) + ) else: return index_label # return the used column labels for the index columns - if (nlevels == 1 and 'index' not in self.frame.columns and - self.frame.index.name is None): - return ['index'] + if ( + nlevels == 1 + and "index" not in self.frame.columns + and self.frame.index.name is None + ): + return ["index"] else: - return [l if l is not None else "level_{0}".format(i) - for i, l in enumerate(self.frame.index.names)] + return [ + l if l is not None else "level_{0}".format(i) + for i, l in enumerate(self.frame.index.names) + ] # for reading: index=(list of) string to specify column to set as index elif isinstance(index, str): @@ -760,14 +852,11 @@ def _get_column_names_and_types(self, dtype_mapper): column_names_and_types = [] if self.index is not None: for i, idx_label in enumerate(self.index): - idx_type = dtype_mapper( - self.frame.index._get_level_values(i)) + idx_type = dtype_mapper(self.frame.index._get_level_values(i)) column_names_and_types.append((str(idx_label), idx_type, True)) column_names_and_types += [ - (str(self.frame.columns[i]), - dtype_mapper(self.frame.iloc[:, i]), - False) + (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False) for i in range(len(self.frame.columns)) ] @@ -776,19 +865,19 @@ def _get_column_names_and_types(self, dtype_mapper): def _create_table_setup(self): from sqlalchemy import Table, Column, PrimaryKeyConstraint - column_names_and_types = self._get_column_names_and_types( - self._sqlalchemy_type - ) + column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) - columns = [Column(name, typ, index=is_index) - for name, typ, is_index in column_names_and_types] + columns = [ + Column(name, typ, index=is_index) + for name, typ, is_index in column_names_and_types + ] if self.keys is not None: if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys - pkc = PrimaryKeyConstraint(*keys, name=self.name + '_pk') + pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk") columns.append(pkc) schema = 
self.schema or self.pd_sql.meta.schema @@ -796,6 +885,7 @@ def _create_table_setup(self): # At this point, attach to new metadata, only attach to self.meta # once table is created. from sqlalchemy.schema import MetaData + meta = MetaData(self.pd_sql, schema=schema) return Table(self.name, meta, *columns, schema=schema) @@ -826,15 +916,17 @@ def _harmonize_columns(self, parse_dates=None): fmt = parse_dates[col_name] except TypeError: fmt = None - self.frame[col_name] = _handle_date_column( - df_col, format=fmt) + self.frame[col_name] = _handle_date_column(df_col, format=fmt) continue # the type the dataframe column should have col_type = self._get_dtype(sql_col.type) - if (col_type is datetime or col_type is date or - col_type is DatetimeTZDtype): + if ( + col_type is datetime + or col_type is date + or col_type is DatetimeTZDtype + ): # Convert tz-aware Datetime SQL columns to UTC utc = col_type is DatetimeTZDtype self.frame[col_name] = _handle_date_column(df_col, utc=utc) @@ -844,9 +936,8 @@ def _harmonize_columns(self, parse_dates=None): elif len(df_col) == df_col.count(): # No NA values, can convert ints and bools - if col_type is np.dtype('int64') or col_type is bool: - self.frame[col_name] = df_col.astype( - col_type, copy=False) + if col_type is np.dtype("int64") or col_type is bool: + self.frame[col_name] = df_col.astype(col_type, copy=False) except KeyError: pass # this column not in results @@ -860,11 +951,19 @@ def _sqlalchemy_type(self, col): # Needed for inserting typed data containing NULLs, GH 8778. col_type = lib.infer_dtype(col, skipna=True) - from sqlalchemy.types import (BigInteger, Integer, Float, - Text, Boolean, - DateTime, Date, Time, TIMESTAMP) + from sqlalchemy.types import ( + BigInteger, + Integer, + Float, + Text, + Boolean, + DateTime, + Date, + Time, + TIMESTAMP, + ) - if col_type == 'datetime64' or col_type == 'datetime': + if col_type == "datetime64" or col_type == "datetime": # GH 9086: TIMESTAMP is the suggested type if the column contains # timezone information try: @@ -875,41 +974,44 @@ def _sqlalchemy_type(self, col): if col.tz is not None: return TIMESTAMP(timezone=True) return DateTime - if col_type == 'timedelta64': - warnings.warn("the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", UserWarning, stacklevel=8) + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) return BigInteger - elif col_type == 'floating': - if col.dtype == 'float32': + elif col_type == "floating": + if col.dtype == "float32": return Float(precision=23) else: return Float(precision=53) - elif col_type == 'integer': - if col.dtype == 'int32': + elif col_type == "integer": + if col.dtype == "int32": return Integer else: return BigInteger - elif col_type == 'boolean': + elif col_type == "boolean": return Boolean - elif col_type == 'date': + elif col_type == "date": return Date - elif col_type == 'time': + elif col_type == "time": return Time - elif col_type == 'complex': - raise ValueError('Complex datatypes not supported') + elif col_type == "complex": + raise ValueError("Complex datatypes not supported") return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import (Integer, Float, Boolean, DateTime, - Date, TIMESTAMP) + from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP if isinstance(sqltype, Float): return float elif 
isinstance(sqltype, Integer): # TODO: Refine integer size. - return np.dtype('int64') + return np.dtype("int64") elif isinstance(sqltype, TIMESTAMP): # we have a timezone capable type if not sqltype.timezone: @@ -931,12 +1033,16 @@ class PandasSQL(PandasObject): """ def read_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or sqlite connection") + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) def to_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or sqlite connection") + raise ValueError( + "PandasSQL must be created with an SQLAlchemy " + "connectable or sqlite connection" + ) class SQLDatabase(PandasSQL): @@ -963,6 +1069,7 @@ def __init__(self, engine, schema=None, meta=None): self.connectable = engine if not meta: from sqlalchemy.schema import MetaData + meta = MetaData(self.connectable, schema=schema) self.meta = meta @@ -970,7 +1077,7 @@ def __init__(self, engine, schema=None, meta=None): @contextmanager def run_transaction(self): with self.connectable.begin() as tx: - if hasattr(tx, 'execute'): + if hasattr(tx, "execute"): yield tx else: yield self.connectable @@ -979,9 +1086,16 @@ def execute(self, *args, **kwargs): """Simple passthrough to SQLAlchemy connectable""" return self.connectable.execute(*args, **kwargs) - def read_table(self, table_name, index_col=None, coerce_float=True, - parse_dates=None, columns=None, schema=None, - chunksize=None): + def read_table( + self, + table_name, + index_col=None, + coerce_float=True, + parse_dates=None, + columns=None, + schema=None, + chunksize=None, + ): """Read SQL database table into a DataFrame. Parameters @@ -1024,13 +1138,17 @@ def read_table(self, table_name, index_col=None, coerce_float=True, """ table = SQLTable(table_name, self, index=index_col, schema=schema) - return table.read(coerce_float=coerce_float, - parse_dates=parse_dates, columns=columns, - chunksize=chunksize) + return table.read( + coerce_float=coerce_float, + parse_dates=parse_dates, + columns=columns, + chunksize=chunksize, + ) @staticmethod - def _query_iterator(result, chunksize, columns, index_col=None, - coerce_float=True, parse_dates=None): + def _query_iterator( + result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set""" while True: @@ -1038,12 +1156,23 @@ def _query_iterator(result, chunksize, columns, index_col=None, if not data: break else: - yield _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) - def read_query(self, sql, index_col=None, coerce_float=True, - parse_dates=None, params=None, chunksize=None): + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + parse_dates=None, + params=None, + chunksize=None, + ): """Read SQL query into a DataFrame. 
Parameters @@ -1090,22 +1219,39 @@ def read_query(self, sql, index_col=None, coerce_float=True, columns = result.keys() if chunksize is not None: - return self._query_iterator(result, chunksize, columns, - index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + result, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = result.fetchall() - frame = _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) return frame read_sql = read_query - def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None, schema=None, chunksize=None, dtype=None, - method=None): + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. @@ -1152,24 +1298,33 @@ def to_sql(self, frame, name, if_exists='fail', index=True, if dtype is not None: from sqlalchemy.types import to_instance, TypeEngine + for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError('The type of {column} is not a ' - 'SQLAlchemy type '.format(column=col)) + raise ValueError( + "The type of {column} is not a " + "SQLAlchemy type ".format(column=col) + ) - table = SQLTable(name, self, frame=frame, index=index, - if_exists=if_exists, index_label=index_label, - schema=schema, dtype=dtype) + table = SQLTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + schema=schema, + dtype=dtype, + ) table.create() table.insert(chunksize, method=method) - if (not name.isdigit() and not name.islower()): + if not name.isdigit() and not name.islower(): # check for potentially case sensitivity issues (GH7815) # Only check when name is not a number and name is not lower case engine = self.connectable.engine with self.connectable.connect() as conn: table_names = engine.table_names( - schema=schema or self.meta.schema, - connection=conn, + schema=schema or self.meta.schema, connection=conn ) if name not in table_names: msg = ( @@ -1186,20 +1341,19 @@ def tables(self): def has_table(self, name, schema=None): return self.connectable.run_callable( - self.connectable.dialect.has_table, - name, - schema or self.meta.schema, + self.connectable.dialect.has_table, name, schema or self.meta.schema ) def get_table(self, table_name, schema=None): schema = schema or self.meta.schema if schema: - tbl = self.meta.tables.get('.'.join([schema, table_name])) + tbl = self.meta.tables.get(".".join([schema, table_name])) else: tbl = self.meta.tables.get(table_name) # Avoid casting double-precision floats into decimals from sqlalchemy import Numeric + for column in tbl.columns: if isinstance(column.type, Numeric): column.type.asdecimal = False @@ -1214,8 +1368,9 @@ def drop_table(self, table_name, schema=None): self.meta.clear() def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): - table = SQLTable(table_name, self, frame=frame, index=False, keys=keys, - dtype=dtype) + table = SQLTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) return str(table.sql_schema()) @@ -1223,13 +1378,13 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): # sqlite-specific sql 
strings and handler class # dictionary used for readability purposes _SQL_TYPES = { - 'string': 'TEXT', - 'floating': 'REAL', - 'integer': 'INTEGER', - 'datetime': 'TIMESTAMP', - 'date': 'DATE', - 'time': 'TIME', - 'boolean': 'INTEGER', + "string": "TEXT", + "floating": "REAL", + "integer": "INTEGER", + "datetime": "TIMESTAMP", + "date": "DATE", + "time": "TIME", + "boolean": "INTEGER", } @@ -1238,7 +1393,8 @@ def _get_unicode_name(name): uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name)) + "Cannot convert identifier to UTF-8: '{name}'".format(name=name) + ) return uname @@ -1256,13 +1412,15 @@ def _get_valid_sqlite_name(name): nul_index = uname.find("\x00") if nul_index >= 0: - raise ValueError('SQLite identifier cannot contain NULs') + raise ValueError("SQLite identifier cannot contain NULs") return '"' + uname.replace('"', '""') + '"' -_SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. " - "In pandas versions < 0.14, spaces were converted to " - "underscores.") +_SAFE_NAMES_WARNING = ( + "The spaces in these column names will not be changed. " + "In pandas versions < 0.14, spaces were converted to " + "underscores." +) class SQLiteTable(SQLTable): @@ -1275,6 +1433,7 @@ def __init__(self, *args, **kwargs): # GH 8341 # register an adapter callable for datetime.time object import sqlite3 + # this will transform time(12,34,56,789) into '12:34:56.000789' # (this is what sqlalchemy does) sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f")) @@ -1290,18 +1449,18 @@ def _execute_create(self): def insert_statement(self): names = list(map(str, self.frame.columns)) - wld = '?' # wildcard char + wld = "?" # wildcard char escape = _get_valid_sqlite_name if self.index is not None: [names.insert(0, idx) for idx in self.index[::-1]] bracketed_names = [escape(column) for column in names] - col_names = ','.join(bracketed_names) - wildcards = ','.join([wld] * len(names)) - insert_statement = \ - 'INSERT INTO {table} ({columns}) VALUES ({wld})'.format( - table=escape(self.name), columns=col_names, wld=wildcards) + col_names = ",".join(bracketed_names) + wildcards = ",".join([wld] * len(names)) + insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( + table=escape(self.name), columns=col_names, wld=wildcards + ) return insert_statement def _execute_insert(self, conn, keys, data_iter): @@ -1314,19 +1473,18 @@ def _create_table_setup(self): structure of a DataFrame. The first entry will be a CREATE TABLE statement while the rest will be CREATE INDEX statements. 
""" - column_names_and_types = self._get_column_names_and_types( - self._sql_type_name - ) + column_names_and_types = self._get_column_names_and_types(self._sql_type_name) - pat = re.compile(r'\s+') + pat = re.compile(r"\s+") column_names = [col_name for col_name, _, _ in column_names_and_types] if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) escape = _get_valid_sqlite_name - create_tbl_stmts = [escape(cname) + ' ' + ctype - for cname, ctype, _ in column_names_and_types] + create_tbl_stmts = [ + escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types + ] if self.keys is not None and len(self.keys): if not is_list_like(self.keys): @@ -1336,19 +1494,31 @@ def _create_table_setup(self): cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br)) - - create_stmts = ["CREATE TABLE " + escape(self.name) + " (\n" + - ',\n '.join(create_tbl_stmts) + "\n)"] + tbl=self.name, cnames_br=cnames_br + ) + ) + + create_stmts = [ + "CREATE TABLE " + + escape(self.name) + + " (\n" + + ",\n ".join(create_tbl_stmts) + + "\n)" + ] - ix_cols = [cname for cname, _, is_index in column_names_and_types - if is_index] + ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index] if len(ix_cols): cnames = "_".join(ix_cols) cnames_br = ",".join(escape(c) for c in ix_cols) create_stmts.append( - "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) + - "ON " + escape(self.name) + " (" + cnames_br + ")") + "CREATE INDEX " + + escape("ix_" + self.name + "_" + cnames) + + "ON " + + escape(self.name) + + " (" + + cnames_br + + ")" + ) return create_stmts @@ -1361,10 +1531,14 @@ def _sql_type_name(self, col): # Needed for inserting typed data containing NULLs, GH 8778. 
col_type = lib.infer_dtype(col, skipna=True) - if col_type == 'timedelta64': - warnings.warn("the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", UserWarning, stacklevel=8) + if col_type == "timedelta64": + warnings.warn( + "the 'timedelta' type is not supported, and will be " + "written as integer values (ns frequency) to the " + "database.", + UserWarning, + stacklevel=8, + ) col_type = "integer" elif col_type == "datetime64": @@ -1374,7 +1548,7 @@ def _sql_type_name(self, col): col_type = "string" elif col_type == "complex": - raise ValueError('Complex datatypes not supported') + raise ValueError("Complex datatypes not supported") if col_type not in _SQL_TYPES: col_type = "string" @@ -1426,17 +1600,19 @@ def execute(self, *args, **kwargs): except Exception: # pragma: no cover ex = DatabaseError( "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc)) + "to rollback".format(sql=args[0], exc=exc) + ) raise_with_traceback(ex) ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format( - sql=args[0], exc=exc)) + "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) + ) raise_with_traceback(ex) @staticmethod - def _query_iterator(cursor, chunksize, columns, index_col=None, - coerce_float=True, parse_dates=None): + def _query_iterator( + cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None + ): """Return generator through chunked result set""" while True: @@ -1447,29 +1623,48 @@ def _query_iterator(cursor, chunksize, columns, index_col=None, cursor.close() break else: - yield _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + yield _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) - def read_query(self, sql, index_col=None, coerce_float=True, params=None, - parse_dates=None, chunksize=None): + def read_query( + self, + sql, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + chunksize=None, + ): args = _convert_params(sql, params) cursor = self.execute(*args) columns = [col_desc[0] for col_desc in cursor.description] if chunksize is not None: - return self._query_iterator(cursor, chunksize, columns, - index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + return self._query_iterator( + cursor, + chunksize, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) else: data = self._fetchall_as_list(cursor) cursor.close() - frame = _wrap_result(data, columns, index_col=index_col, - coerce_float=coerce_float, - parse_dates=parse_dates) + frame = _wrap_result( + data, + columns, + index_col=index_col, + coerce_float=coerce_float, + parse_dates=parse_dates, + ) return frame def _fetchall_as_list(self, cur): @@ -1478,9 +1673,18 @@ def _fetchall_as_list(self, cur): result = list(result) return result - def to_sql(self, frame, name, if_exists='fail', index=True, - index_label=None, schema=None, chunksize=None, dtype=None, - method=None): + def to_sql( + self, + frame, + name, + if_exists="fail", + index=True, + index_label=None, + schema=None, + chunksize=None, + dtype=None, + method=None, + ): """ Write records stored in a DataFrame to a SQL database. 
@@ -1527,12 +1731,21 @@ def to_sql(self, frame, name, if_exists='fail', index=True, if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError('{column} ({type!s}) not a string'.format( - column=col, type=my_type)) + raise ValueError( + "{column} ({type!s}) not a string".format( + column=col, type=my_type + ) + ) - table = SQLiteTable(name, self, frame=frame, index=index, - if_exists=if_exists, index_label=index_label, - dtype=dtype) + table = SQLiteTable( + name, + self, + frame=frame, + index=index, + if_exists=if_exists, + index_label=index_label, + dtype=dtype, + ) table.create() table.insert(chunksize, method) @@ -1541,23 +1754,24 @@ def has_table(self, name, schema=None): # escape = _get_valid_sqlite_name # esc_name = escape(name) - wld = '?' - query = ("SELECT name FROM sqlite_master " - "WHERE type='table' AND name={wld};").format(wld=wld) + wld = "?" + query = ( + "SELECT name FROM sqlite_master " "WHERE type='table' AND name={wld};" + ).format(wld=wld) - return len(self.execute(query, [name, ]).fetchall()) > 0 + return len(self.execute(query, [name]).fetchall()) > 0 def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format( - name=_get_valid_sqlite_name(name)) + drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): - table = SQLiteTable(table_name, self, frame=frame, index=False, - keys=keys, dtype=dtype) + table = SQLiteTable( + table_name, self, frame=frame, index=False, keys=keys, dtype=dtype + ) return str(table.sql_schema()) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 00b7a29b27b63..7087d2ee963cb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -26,20 +26,31 @@ from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( - ensure_object, is_categorical_dtype, is_datetime64_dtype) + ensure_object, + is_categorical_dtype, + is_datetime64_dtype, +) from pandas import ( - Categorical, DatetimeIndex, NaT, Timestamp, concat, isna, to_datetime, - to_timedelta) + Categorical, + DatetimeIndex, + NaT, + Timestamp, + concat, + isna, + to_datetime, + to_timedelta, +) from pandas.core.frame import DataFrame from pandas.core.series import Series -from pandas.io.common import ( - BaseIterator, _stringify_path, get_filepath_or_buffer) +from pandas.io.common import BaseIterator, _stringify_path, get_filepath_or_buffer -_version_error = ("Version of given Stata file is not 104, 105, 108, " - "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " - "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)") +_version_error = ( + "Version of given Stata file is not 104, 105, 108, " + "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " + "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" +) _statafile_processing_params1 = """\ convert_dates : boolean, defaults to True @@ -111,9 +122,13 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... 
do_something(chunk) -""" % (_statafile_processing_params1, _encoding_params, - _statafile_processing_params2, _chunksize_params, - _iterator_params) +""" % ( + _statafile_processing_params1, + _encoding_params, + _statafile_processing_params2, + _chunksize_params, + _iterator_params, +) _data_method_doc = """\ Read observations from Stata file, converting them into a dataframe @@ -129,7 +144,10 @@ Returns ------- DataFrame -""" % (_statafile_processing_params1, _statafile_processing_params2) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, +) _read_method_doc = """\ Reads observations from Stata file, converting them into a dataframe @@ -144,7 +162,10 @@ Returns ------- DataFrame -""" % (_statafile_processing_params1, _statafile_processing_params2) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, +) _stata_reader_doc = """\ @@ -161,26 +182,42 @@ %s %s %s -""" % (_statafile_processing_params1, _statafile_processing_params2, - _encoding_params, _chunksize_params) +""" % ( + _statafile_processing_params1, + _statafile_processing_params2, + _encoding_params, + _chunksize_params, +) @Appender(_read_stata_doc) -@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) -@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') -def read_stata(filepath_or_buffer, convert_dates=True, - convert_categoricals=True, encoding=None, index_col=None, - convert_missing=False, preserve_dtypes=True, columns=None, - order_categoricals=True, chunksize=None, iterator=False): - - reader = StataReader(filepath_or_buffer, - convert_dates=convert_dates, - convert_categoricals=convert_categoricals, - index_col=index_col, convert_missing=convert_missing, - preserve_dtypes=preserve_dtypes, - columns=columns, - order_categoricals=order_categoricals, - chunksize=chunksize) +@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) +@deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") +def read_stata( + filepath_or_buffer, + convert_dates=True, + convert_categoricals=True, + encoding=None, + index_col=None, + convert_missing=False, + preserve_dtypes=True, + columns=None, + order_categoricals=True, + chunksize=None, + iterator=False, +): + + reader = StataReader( + filepath_or_buffer, + convert_dates=convert_dates, + convert_categoricals=convert_categoricals, + index_col=index_col, + convert_missing=convert_missing, + preserve_dtypes=preserve_dtypes, + columns=columns, + order_categoricals=order_categoricals, + chunksize=chunksize, + ) if iterator or chunksize: data = reader @@ -261,12 +298,12 @@ def convert_year_month_safe(year, month): using datetime. 
""" if year.max() < MAX_YEAR and year.min() > MIN_YEAR: - return to_datetime(100 * year + month, format='%Y%m') + return to_datetime(100 * year + month, format="%Y%m") else: - index = getattr(year, 'index', None) + index = getattr(year, "index", None) return Series( - [datetime.datetime(y, m, 1) for y, m in zip(year, month)], - index=index) + [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index + ) def convert_year_days_safe(year, days): """ @@ -274,12 +311,13 @@ def convert_year_days_safe(year, days): datetime or datetime64 Series """ if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: - return (to_datetime(year, format='%Y') + - to_timedelta(days, unit='d')) + return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") else: - index = getattr(year, 'index', None) - value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) - for y, d in zip(year, days)] + index = getattr(year, "index", None) + value = [ + datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) + for y, d in zip(year, days) + ] return Series(value, index=index) def convert_delta_safe(base, deltas, unit): @@ -288,18 +326,19 @@ def convert_delta_safe(base, deltas, unit): versions if the deltas satisfy restrictions required to be expressed as dates in pandas. """ - index = getattr(deltas, 'index', None) - if unit == 'd': + index = getattr(deltas, "index", None) + if unit == "d": if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: values = [base + relativedelta(days=int(d)) for d in deltas] return Series(values, index=index) - elif unit == 'ms': + elif unit == "ms": if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [base + relativedelta(microseconds=(int(d) * 1000)) - for d in deltas] + values = [ + base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas + ] return Series(values, index=index) else: - raise ValueError('format not understood') + raise ValueError("format not understood") base = to_datetime(base) deltas = to_timedelta(deltas, unit=unit) return base + deltas @@ -317,11 +356,10 @@ def convert_delta_safe(base, deltas, unit): if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates - conv_dates = convert_delta_safe(base, ms, 'ms') + conv_dates = convert_delta_safe(base, ms, "ms") elif fmt.startswith(("%tC", "tC")): - warnings.warn("Encountered %tC format. Leaving in Stata " - "Internal Format.") + warnings.warn("Encountered %tC format. Leaving in Stata " "Internal Format.") conv_dates = Series(dates, dtype=np.object) if has_bad_values: conv_dates[bad_locs] = NaT @@ -330,7 +368,7 @@ def convert_delta_safe(base, deltas, unit): elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates - conv_dates = convert_delta_safe(base, days, 'd') + conv_dates = convert_delta_safe(base, days, "d") # does not count leap days - 7 days is a week. 
# 52nd week may have more than 7 days elif fmt.startswith(("%tw", "tw")): @@ -383,34 +421,35 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): if is_datetime64_dtype(dates.values): if delta: delta = dates - stata_epoch - d['delta'] = delta.values.astype( - np.int64) // 1000 # microseconds + d["delta"] = delta.values.astype(np.int64) // 1000 # microseconds if days or year: dates = DatetimeIndex(dates) - d['year'], d['month'] = dates.year, dates.month + d["year"], d["month"] = dates.year, dates.month if days: - days = (dates.astype(np.int64) - - to_datetime(d['year'], format='%Y').astype(np.int64)) - d['days'] = days // NS_PER_DAY + days = dates.astype(np.int64) - to_datetime( + d["year"], format="%Y" + ).astype(np.int64) + d["days"] = days // NS_PER_DAY - elif infer_dtype(dates, skipna=False) == 'datetime': + elif infer_dtype(dates, skipna=False) == "datetime": if delta: delta = dates.values - stata_epoch - f = lambda x: \ - US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds + f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds v = np.vectorize(f) - d['delta'] = v(delta) + d["delta"] = v(delta) if year: year_month = dates.apply(lambda x: 100 * x.year + x.month) - d['year'] = year_month.values // 100 - d['month'] = (year_month.values - d['year'] * 100) + d["year"] = year_month.values // 100 + d["month"] = year_month.values - d["year"] * 100 if days: f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days v = np.vectorize(f) - d['days'] = v(dates) + d["days"] = v(dates) else: - raise ValueError('Columns containing dates must contain either ' - 'datetime64, datetime.datetime or null values.') + raise ValueError( + "Columns containing dates must contain either " + "datetime64, datetime.datetime or null values." 
+ ) return DataFrame(d, index=index) @@ -434,26 +473,26 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): conv_dates = d.delta // US_PER_DAY elif fmt in ["%tw", "tw"]: d = parse_dates_safe(dates, year=True, days=True) - conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7) + conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 elif fmt in ["%tm", "tm"]: d = parse_dates_safe(dates, year=True) - conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1) + conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1 elif fmt in ["%tq", "tq"]: d = parse_dates_safe(dates, year=True) conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 elif fmt in ["%th", "th"]: d = parse_dates_safe(dates, year=True) - conv_dates = (2 * (d.year - stata_epoch.year) + - (d.month > 6).astype(np.int)) + conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int) elif fmt in ["%ty", "ty"]: d = parse_dates_safe(dates, year=True) conv_dates = d.year else: raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt)) + "Format {fmt} is not a known Stata date format".format(fmt=fmt) + ) conv_dates = Series(conv_dates, dtype=np.float64) - missing_value = struct.unpack('= 2 ** 53: - ws = precision_loss_doc % ('uint64', 'float64') + ws = precision_loss_doc % ("uint64", "float64") data[col] = data[col].astype(dtype) @@ -561,28 +602,31 @@ def _cast_to_stata_types(data): if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) elif dtype == np.int64: - if (data[col].max() <= 2147483620 and - data[col].min() >= -2147483647): + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53: - ws = precision_loss_doc % ('int64', 'float64') + ws = precision_loss_doc % ("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): - raise ValueError('Column {col} has a maximum value of ' - 'infinity which is outside the range ' - 'supported by Stata.'.format(col=col)) + raise ValueError( + "Column {col} has a maximum value of " + "infinity which is outside the range " + "supported by Stata.".format(col=col) + ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: - raise ValueError('Column {col} has a maximum value ' - '({val}) outside the range supported by ' - 'Stata ({float64_max})' - .format(col=col, val=value, - float64_max=float64_max)) + raise ValueError( + "Column {col} has a maximum value " + "({val}) outside the range supported by " + "Stata ({float64_max})".format( + col=col, val=value, float64_max=float64_max + ) + ) if ws: warnings.warn(ws, PossiblePrecisionLoss) @@ -630,8 +674,10 @@ def __init__(self, catarray): category = vl[1] if not isinstance(category, str): category = str(category) - warnings.warn(value_label_mismatch_doc.format(catarray.name), - ValueLabelTypeMismatch) + warnings.warn( + value_label_mismatch_doc.format(catarray.name), + ValueLabelTypeMismatch, + ) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding @@ -640,9 +686,11 @@ def __init__(self, catarray): self.n += 1 if self.text_len > 32000: - raise ValueError('Stata value labels for a single variable must ' - 'have a combined length less than 32,000 ' - 'characters.') + raise ValueError( + "Stata value labels for a 
single variable must " + "have a combined length less than 32,000 " + "characters." + ) # Ensure int32 self.off = np.array(self.off, dtype=np.int32) @@ -674,11 +722,11 @@ def generate_value_label(self, byteorder, encoding): self._encoding = encoding bio = BytesIO() - null_string = '\x00' - null_byte = b'\x00' + null_string = "\x00" + null_byte = b"\x00" # len - bio.write(struct.pack(byteorder + 'i', self.len)) + bio.write(struct.pack(byteorder + "i", self.len)) # labname labname = self._encode(_pad_bytes(self.labname[:32], 33)) @@ -686,22 +734,22 @@ def generate_value_label(self, byteorder, encoding): # padding - 3 bytes for i in range(3): - bio.write(struct.pack('c', null_byte)) + bio.write(struct.pack("c", null_byte)) # value_label_table # n - int32 - bio.write(struct.pack(byteorder + 'i', self.n)) + bio.write(struct.pack(byteorder + "i", self.n)) # textlen - int32 - bio.write(struct.pack(byteorder + 'i', self.text_len)) + bio.write(struct.pack(byteorder + "i", self.text_len)) # off - int32 array (n elements) for offset in self.off: - bio.write(struct.pack(byteorder + 'i', offset)) + bio.write(struct.pack(byteorder + "i", offset)) # val - int32 array (n elements) for value in self.val: - bio.write(struct.pack(byteorder + 'i', value)) + bio.write(struct.pack(byteorder + "i", value)) # txt - Text labels, null terminated for text in self.txt: @@ -760,36 +808,37 @@ class StataMissingValue: bases = (101, 32741, 2147483621) for b in bases: # Conversion to long to avoid hash issues on 32 bit platforms #8968 - MISSING_VALUES[b] = '.' + MISSING_VALUES[b] = "." for i in range(1, 27): - MISSING_VALUES[i + b] = '.' + chr(96 + i) + MISSING_VALUES[i + b] = "." + chr(96 + i) - float32_base = b'\x00\x00\x00\x7f' - increment = struct.unpack(' 0: MISSING_VALUES[value] += chr(96 + i) - int_value = struct.unpack(' 0: MISSING_VALUES[value] += chr(96 + i) - int_value = struct.unpack('q', struct.pack(' 0 + self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 # calculate size of a data record self.col_sizes = [self._calcsize(typ) for typ in self.typlist] @@ -1038,10 +1135,9 @@ def _read_new_header(self, first_char): raise ValueError(_version_error) self._set_encoding() self.path_or_buf.read(21) #
- self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<' + self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" self.path_or_buf.read(15) # - self.nvar = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] self.path_or_buf.read(7) # self.nobs = self._get_nobs() @@ -1053,27 +1149,35 @@ def _read_new_header(self, first_char): self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of - self._seek_vartypes = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16 - self._seek_varnames = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - self._seek_sortlist = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10 - self._seek_formats = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9 - self._seek_value_label_names = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19 + self._seek_vartypes = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16 + ) + self._seek_varnames = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 + ) + self._seek_sortlist = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 + ) + self._seek_formats = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9 + ) + self._seek_value_label_names = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19 + ) # Requires version-specific treatment self._seek_variable_labels = self._get_seek_variable_labels() self.path_or_buf.read(8) # - self.data_location = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6 - self.seek_strls = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7 - self.seek_value_labels = struct.unpack( - self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14 + self.data_location = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6 + ) + self.seek_strls = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7 + ) + self.seek_value_labels = ( + struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14 + ) self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes) @@ -1082,8 +1186,8 @@ def _read_new_header(self, first_char): self.path_or_buf.seek(self._seek_sortlist) self.srtlist = struct.unpack( - self.byteorder + ('h' * (self.nvar + 1)), - self.path_or_buf.read(2 * (self.nvar + 1)) + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), )[:-1] self.path_or_buf.seek(self._seek_formats) @@ -1099,9 +1203,10 @@ def _read_new_header(self, first_char): def _get_dtypes(self, seek_vartypes): self.path_or_buf.seek(seek_vartypes) - raw_typlist = [struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] - for i in range(self.nvar)] + raw_typlist = [ + struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] + for i in range(self.nvar) + ] def f(typ): if typ <= 2045: @@ -1109,8 +1214,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]". 
- format(typ)) + raise ValueError("cannot convert stata types [{0}]".format(typ)) typlist = [f(x) for x in raw_typlist] @@ -1120,8 +1224,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]" - .format(typ)) + raise ValueError("cannot convert stata dtype [{0}]".format(typ)) dtyplist = [f(x) for x in raw_typlist] @@ -1133,8 +1236,7 @@ def _get_varlist(self): elif self.format_version == 118: b = 129 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the format list def _get_fmtlist(self): @@ -1147,8 +1249,7 @@ def _get_fmtlist(self): else: b = 7 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the label list def _get_lbllist(self): @@ -1158,36 +1259,35 @@ def _get_lbllist(self): b = 33 else: b = 9 - return [self._decode(self.path_or_buf.read(b)) - for i in range(self.nvar)] + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] def _get_variable_labels(self): if self.format_version == 118: - vlblist = [self._decode(self.path_or_buf.read(321)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(321)) for i in range(self.nvar) + ] elif self.format_version > 105: - vlblist = [self._decode(self.path_or_buf.read(81)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(81)) for i in range(self.nvar) + ] else: - vlblist = [self._decode(self.path_or_buf.read(32)) - for i in range(self.nvar)] + vlblist = [ + self._decode(self.path_or_buf.read(32)) for i in range(self.nvar) + ] return vlblist def _get_nobs(self): if self.format_version == 118: - return struct.unpack(self.byteorder + 'Q', - self.path_or_buf.read(8))[0] + return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] else: - return struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] + return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] def _get_data_label(self): if self.format_version == 118: - strlen = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version == 117: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 105: return self._decode(self.path_or_buf.read(81)) @@ -1196,10 +1296,10 @@ def _get_data_label(self): def _get_time_stamp(self): if self.format_version == 118: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self.path_or_buf.read(strlen).decode("utf-8") elif self.format_version == 117: - strlen = struct.unpack('b', self.path_or_buf.read(1))[0] + strlen = struct.unpack("b", self.path_or_buf.read(1))[0] return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 104: return self._decode(self.path_or_buf.read(18)) @@ -1214,23 +1314,22 @@ def _get_seek_variable_labels(self): # variable, 20 for the closing tag and 17 for the opening tag return self._seek_value_label_names + (33 * self.nvar) + 20 + 17 elif self.format_version == 118: - return struct.unpack(self.byteorder + 'q', - self.path_or_buf.read(8))[0] + 17 + return 
struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17 else: raise ValueError() def _read_old_header(self, first_char): - self.format_version = struct.unpack('b', first_char)[0] + self.format_version = struct.unpack("b", first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: raise ValueError(_version_error) self._set_encoding() - self.byteorder = struct.unpack( - 'b', self.path_or_buf.read(1))[0] == 0x1 and '>' or '<' - self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] + self.byteorder = ( + struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" + ) + self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0] self.path_or_buf.read(1) # unused - self.nvar = struct.unpack(self.byteorder + 'H', - self.path_or_buf.read(2))[0] + self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] self.nobs = self._get_nobs() self._data_label = self._get_data_label() @@ -1239,8 +1338,7 @@ def _read_old_header(self, first_char): # descriptors if self.format_version > 108: - typlist = [ord(self.path_or_buf.read(1)) - for i in range(self.nvar)] + typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)] else: buf = self.path_or_buf.read(self.nvar) typlistb = np.frombuffer(buf, dtype=np.uint8) @@ -1254,23 +1352,31 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError("cannot convert stata types [{0}]" - .format(','.join(str(x) for x in typlist))) + raise ValueError( + "cannot convert stata types [{0}]".format( + ",".join(str(x) for x in typlist) + ) + ) try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError("cannot convert stata dtypes [{0}]" - .format(','.join(str(x) for x in typlist))) + raise ValueError( + "cannot convert stata dtypes [{0}]".format( + ",".join(str(x) for x in typlist) + ) + ) if self.format_version > 108: - self.varlist = [self._decode(self.path_or_buf.read(33)) - for i in range(self.nvar)] + self.varlist = [ + self._decode(self.path_or_buf.read(33)) for i in range(self.nvar) + ] else: - self.varlist = [self._decode(self.path_or_buf.read(9)) - for i in range(self.nvar)] + self.varlist = [ + self._decode(self.path_or_buf.read(9)) for i in range(self.nvar) + ] self.srtlist = struct.unpack( - self.byteorder + ('h' * (self.nvar + 1)), - self.path_or_buf.read(2 * (self.nvar + 1)) + self.byteorder + ("h" * (self.nvar + 1)), + self.path_or_buf.read(2 * (self.nvar + 1)), )[:-1] self.fmtlist = self._get_fmtlist() @@ -1286,14 +1392,17 @@ def _read_old_header(self, first_char): if self.format_version > 104: while True: - data_type = struct.unpack(self.byteorder + 'b', - self.path_or_buf.read(1))[0] + data_type = struct.unpack( + self.byteorder + "b", self.path_or_buf.read(1) + )[0] if self.format_version > 108: - data_len = struct.unpack(self.byteorder + 'i', - self.path_or_buf.read(4))[0] + data_len = struct.unpack( + self.byteorder + "i", self.path_or_buf.read(4) + )[0] else: - data_len = struct.unpack(self.byteorder + 'h', - self.path_or_buf.read(2))[0] + data_len = struct.unpack( + self.byteorder + "h", self.path_or_buf.read(2) + )[0] if data_type == 0: break self.path_or_buf.read(data_len) @@ -1309,18 +1418,16 @@ def _setup_dtype(self): dtype = [] # Convert struct data types to numpy data type for i, typ in enumerate(self.typlist): if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + - self.NUMPY_TYPE_MAP[typ])) + dtype.append(("s" + 
str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) else: - dtype.append(('s' + str(i), 'S' + str(typ))) + dtype.append(("s" + str(i), "S" + str(typ))) dtype = np.dtype(dtype) self._dtype = dtype return self._dtype def _calcsize(self, fmt): - return (type(fmt) is int and fmt or - struct.calcsize(self.byteorder + fmt)) + return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt) def _decode(self, s): # have bytes not strings, so must decode @@ -1336,7 +1443,7 @@ def _decode(self, s): has been incorrectly encoded by Stata or some other software. You should verify the string values returned are correct.""" warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) - return s.decode('latin-1') + return s.decode("latin-1") def _read_value_labels(self): if self._value_labels_read: @@ -1359,7 +1466,7 @@ def _read_value_labels(self): while True: if self.format_version >= 117: - if self.path_or_buf.read(5) == b' + if self.path_or_buf.read(5) == b" break # end of value label table slength = self.path_or_buf.read(4) @@ -1371,16 +1478,14 @@ def _read_value_labels(self): labname = self._decode(self.path_or_buf.read(129)) self.path_or_buf.read(3) # padding - n = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] - txtlen = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] - off = np.frombuffer(self.path_or_buf.read(4 * n), - dtype=self.byteorder + "i4", - count=n) - val = np.frombuffer(self.path_or_buf.read(4 * n), - dtype=self.byteorder + "i4", - count=n) + n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] + off = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) + val = np.frombuffer( + self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n + ) ii = np.argsort(off) off = off[ii] val = val[ii] @@ -1388,8 +1493,7 @@ def _read_value_labels(self): self.value_label_dict[labname] = dict() for i in range(n): end = off[i + 1] if i < n - 1 else txtlen - self.value_label_dict[labname][val[i]] = \ - self._decode(txt[off[i]:end]) + self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end]) if self.format_version >= 117: self.path_or_buf.read(6) # self._value_labels_read = True @@ -1397,25 +1501,23 @@ def _read_value_labels(self): def _read_strls(self): self.path_or_buf.seek(self.seek_strls) # Wrap v_o in a string to allow uint64 values as keys on 32bit OS - self.GSO = {'0': ''} + self.GSO = {"0": ""} while True: - if self.path_or_buf.read(3) != b'GSO': + if self.path_or_buf.read(3) != b"GSO": break if self.format_version == 117: - v_o = struct.unpack(self.byteorder + 'Q', - self.path_or_buf.read(8))[0] + v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] else: buf = self.path_or_buf.read(12) # Only tested on little endian file on little endian machine. 
- if self.byteorder == '<': + if self.byteorder == "<": buf = buf[0:2] + buf[4:10] else: buf = buf[0:2] + buf[6:] - v_o = struct.unpack('Q', buf)[0] - typ = struct.unpack('B', self.path_or_buf.read(1))[0] - length = struct.unpack(self.byteorder + 'I', - self.path_or_buf.read(4))[0] + v_o = struct.unpack("Q", buf)[0] + typ = struct.unpack("B", self.path_or_buf.read(1))[0] + length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] va = self.path_or_buf.read(length) if typ == 130: va = va[0:-1].decode(self._encoding) @@ -1455,11 +1557,18 @@ def get_chunk(self, size=None): return self.read(nrows=size) @Appender(_read_method_doc) - @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col') - def read(self, nrows=None, convert_dates=None, - convert_categoricals=None, index_col=None, - convert_missing=None, preserve_dtypes=None, - columns=None, order_categoricals=None): + @deprecate_kwarg(old_arg_name="index", new_arg_name="index_col") + def read( + self, + nrows=None, + convert_dates=None, + convert_categoricals=None, + index_col=None, + convert_missing=None, + preserve_dtypes=None, + columns=None, + order_categoricals=None, + ): # Handle empty file or chunk. If reading incrementally raise # StopIteration. If reading the whole thing return an empty # data frame. @@ -1507,8 +1616,9 @@ def read(self, nrows=None, convert_dates=None, offset = self._lines_read * dtype.itemsize self.path_or_buf.seek(self.data_location + offset) read_lines = min(nrows, self.nobs - self._lines_read) - data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype, - count=read_lines) + data = np.frombuffer( + self.path_or_buf.read(read_len), dtype=dtype, count=read_lines + ) self._lines_read += read_lines if self._lines_read == self.nobs: @@ -1543,8 +1653,7 @@ def read(self, nrows=None, convert_dates=None, # Decode strings for col, typ in zip(data, self.typlist): if type(typ) is int: - data[col] = data[col].apply( - self._decode, convert_dtype=True) + data[col] = data[col].apply(self._decode, convert_dtype=True) data = self._insert_strls(data) @@ -1561,7 +1670,8 @@ def read(self, nrows=None, convert_dates=None, if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( - (col, Series(data[col], ix, self.dtyplist[i]))) + (col, Series(data[col], ix, self.dtyplist[i])) + ) else: data_formatted.append((col, data[col])) if requires_type_conversion: @@ -1571,24 +1681,25 @@ def read(self, nrows=None, convert_dates=None, data = self._do_convert_missing(data, convert_missing) if convert_dates: + def any_startswith(x: str) -> bool: return any(x.startswith(fmt) for fmt in _date_formats) + cols = np.where([any_startswith(x) for x in self.fmtlist])[0] for i in cols: col = data.columns[i] try: data[col] = _stata_elapsed_date_to_datetime_vec( - data[col], - self.fmtlist[i]) + data[col], self.fmtlist[i] + ) except ValueError: self.close() raise if convert_categoricals and self.format_version > 108: - data = self._do_convert_categoricals(data, - self.value_label_dict, - self.lbllist, - order_categoricals) + data = self._do_convert_categoricals( + data, self.value_label_dict, self.lbllist, order_categoricals + ) if not preserve_dtypes: retyped_data = [] @@ -1628,8 +1739,7 @@ def _do_convert_missing(self, data, convert_missing): if convert_missing: # Replacement follows Stata notation missing_loc = np.argwhere(missing._ndarray_values) - umissing, umissing_loc = np.unique(series[missing], - return_inverse=True) + umissing, umissing_loc = 
np.unique(series[missing], return_inverse=True) replacement = Series(series, dtype=np.object) for j, um in enumerate(umissing): missing_value = StataMissingValue(um) @@ -1646,16 +1756,15 @@ def _do_convert_missing(self, data, convert_missing): if replacements: columns = data.columns replacements = DataFrame(replacements) - data = concat([data.drop(replacements.columns, 1), - replacements], 1) + data = concat([data.drop(replacements.columns, 1), replacements], 1) data = data[columns] return data def _insert_strls(self, data): - if not hasattr(self, 'GSO') or len(self.GSO) == 0: + if not hasattr(self, "GSO") or len(self.GSO) == 0: return data for i, typ in enumerate(self.typlist): - if typ != 'Q': + if typ != "Q": continue # Wrap v_o in a string to allow uint64 values as keys on 32bit OS data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] @@ -1666,12 +1775,13 @@ def _do_select_columns(self, data, columns): if not self._column_selector_set: column_set = set(columns) if len(column_set) != len(columns): - raise ValueError('columns contains duplicate entries') + raise ValueError("columns contains duplicate entries") unmatched = column_set.difference(data.columns) if unmatched: - raise ValueError('The following columns were not found in the ' - 'Stata data set: ' + - ', '.join(list(unmatched))) + raise ValueError( + "The following columns were not found in the " + "Stata data set: " + ", ".join(list(unmatched)) + ) # Copy information for retained columns for later processing dtyplist = [] typlist = [] @@ -1692,8 +1802,9 @@ def _do_select_columns(self, data, columns): return data[columns] - def _do_convert_categoricals(self, data, value_label_dict, lbllist, - order_categoricals): + def _do_convert_categoricals( + self, data, value_label_dict, lbllist, order_categoricals + ): """ Converts categorical columns to Categorical type. """ @@ -1714,7 +1825,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, except ValueError: vc = Series(categories).value_counts() repeats = list(vc.index[vc > 1]) - repeats = '-' * 80 + '\n' + '\n'.join(repeats) + repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 msg = """ Value labels for column {col} are not unique. These cannot be converted to @@ -1784,7 +1895,7 @@ def _open_file_binary_write(fname): own : bool True if the file was created, otherwise False """ - if hasattr(fname, 'write'): + if hasattr(fname, "write"): # if 'b' not in fname.mode: return fname, False return open(fname, "wb"), True @@ -1796,8 +1907,7 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError( - "Endianness {endian} not understood".format(endian=endianness)) + raise ValueError("Endianness {endian} not understood".format(endian=endianness)) def _pad_bytes(name, length): @@ -1811,12 +1921,25 @@ def _convert_datetime_to_stata_type(fmt): """ Convert from one of the stata date formats to a type in TYPE_MAP. 
""" - if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", - "%tq", "th", "%th", "ty", "%ty"]: + if fmt in [ + "tc", + "%tc", + "td", + "%td", + "tw", + "%tw", + "tm", + "%tm", + "tq", + "%tq", + "th", + "%th", + "ty", + "%ty", + ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError( - "Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1828,8 +1951,7 @@ def _maybe_convert_to_int_keys(convert_dates, varlist): new_dict.update({varlist.index(key): convert_dates[key]}) else: if not isinstance(key, int): - raise ValueError("convert_dates key must be a " - "column or an integer") + raise ValueError("convert_dates key must be a " "column or an integer") new_dict.update({key: convert_dates[key]}) return new_dict @@ -1868,11 +1990,11 @@ def _dtype_to_stata_type(dtype, column): return 251 else: # pragma : no cover raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype)) + "Data type {dtype} not supported.".format(dtype=dtype) + ) -def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, - force_strl=False): +def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): """ Map numpy dtype to stata's default format for this type. Not terribly important since users can change this in Stata. Semantics are @@ -1894,23 +2016,24 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, else: max_str_len = 2045 if force_strl: - return '%9s' + return "%9s" if dtype.type == np.object_: inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ('string', 'unicode') or - len(column) == 0): - raise ValueError('Column `{col}` cannot be exported.\n\nOnly ' - 'string-like object arrays containing all ' - 'strings or a mix of strings and None can be ' - 'exported. Object arrays containing only null ' - 'values are prohibited. Other object types' - 'cannot be exported and must first be converted ' - 'to one of the supported ' - 'types.'.format(col=column.name)) + if not (inferred_dtype in ("string", "unicode") or len(column) == 0): + raise ValueError( + "Column `{col}` cannot be exported.\n\nOnly " + "string-like object arrays containing all " + "strings or a mix of strings and None can be " + "exported. Object arrays containing only null " + "values are prohibited. 
Other object types" + "cannot be exported and must first be converted " + "to one of the supported " + "types.".format(col=column.name) + ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: - return '%9s' + return "%9s" else: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1924,7 +2047,8 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, return "%8.0g" else: # pragma : no cover raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype)) + "Data type {dtype} not supported.".format(dtype=dtype) + ) class StataWriter(StataParser): @@ -1998,14 +2122,23 @@ class StataWriter(StataParser): _max_string_length = 244 - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = 'latin-1' + self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2024,8 +2157,7 @@ def _write(self, to_write): """ Helper to call encode before writing to file for Python 3 compat. """ - self._file.write(to_write.encode(self._encoding or - self._default_encoding)) + self._file.write(to_write.encode(self._encoding or self._default_encoding)) def _prepare_categoricals(self, data): """Check for categorical columns, retain categorical information for @@ -2044,8 +2176,10 @@ def _prepare_categoricals(self, data): self._value_labels.append(StataValueLabel(data[col])) dtype = data[col].cat.codes.dtype if dtype == np.int64: - raise ValueError('It is not possible to export ' - 'int64-based categorical data to Stata.') + raise ValueError( + "It is not possible to export " + "int64-based categorical data to Stata." 
+ ) values = data[col].cat.codes.values.copy() # Upcast if needed so that correct missing values can be set @@ -2073,9 +2207,9 @@ def _replace_nans(self, data): dtype = data[c].dtype if dtype in (np.float32, np.float64): if dtype == np.float32: - replacement = self.MISSING_VALUES['f'] + replacement = self.MISSING_VALUES["f"] else: - replacement = self.MISSING_VALUES['d'] + replacement = self.MISSING_VALUES["d"] data[c] = data[c].fillna(replacement) return data @@ -2108,26 +2242,30 @@ def _check_column_names(self, data): name = str(name) for c in name: - if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and - (c < '0' or c > '9') and c != '_'): - name = name.replace(c, '_') + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") # Variable name must not be a reserved word if name in self.RESERVED_WORDS: - name = '_' + name + name = "_" + name # Variable name may not start with a number - if name[0] >= '0' and name[0] <= '9': - name = '_' + name + if name[0] >= "0" and name[0] <= "9": + name = "_" + name - name = name[:min(len(name), 32)] + name = name[: min(len(name), 32)] if not name == orig_name: # check for duplicates while columns.count(name) > 0: # prepend ascending number to avoid duplicates - name = '_' + str(duplicate_var_id) + name - name = name[:min(len(name), 32)] + name = "_" + str(duplicate_var_id) + name + name = name[: min(len(name), 32)] duplicate_var_id += 1 converted_names[orig_name] = name @@ -2147,13 +2285,13 @@ def _check_column_names(self, data): for orig_name, name in converted_names.items(): # need to possibly encode the orig name if its unicode try: - orig_name = orig_name.encode('utf-8') + orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = '{0} -> {1}'.format(orig_name, name) + msg = "{0} -> {1}".format(orig_name, name) conversion_warning.append(msg) - ws = invalid_name_doc.format('\n '.join(conversion_warning)) + ws = invalid_name_doc.format("\n ".join(conversion_warning)) warnings.warn(ws, InvalidColumnName) self._converted_names = converted_names @@ -2201,14 +2339,13 @@ def _prepare_pandas(self, data): if col in self._convert_dates: continue if is_datetime64_dtype(data[col]): - self._convert_dates[col] = 'tc' + self._convert_dates[col] = "tc" - self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, - self.varlist) + self._convert_dates = _maybe_convert_to_int_keys( + self._convert_dates, self.varlist + ) for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type( - self._convert_dates[key] - ) + new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) self._set_formats_and_types(data, dtypes) @@ -2221,8 +2358,7 @@ def _prepare_pandas(self, data): def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, - data_label=self._data_label) + self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) self._write_map() self._write_variable_types() self._write_varnames() @@ -2244,9 +2380,12 @@ def write_file(self): if self._own_file: os.unlink(self._fname) except Exception: - warnings.warn('This save was not successful but {0} could not ' - 'be deleted. This file is not ' - 'valid.'.format(self._fname), ResourceWarning) + warnings.warn( + "This save was not successful but {0} could not " + "be deleted. 
This file is not " + "valid.".format(self._fname), + ResourceWarning, + ) raise exc else: self._close() @@ -2290,8 +2429,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, - self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2311,9 +2449,7 @@ def _write_header(self, data_label=None, time_stamp=None): if data_label is None: self._file.write(self._null_terminate(_pad_bytes("", 80))) else: - self._file.write( - self._null_terminate(_pad_bytes(data_label[:80], 80)) - ) + self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm if time_stamp is None: @@ -2322,17 +2458,31 @@ def _write_header(self, data_label=None, time_stamp=None): raise ValueError("time_stamp should be datetime type") # GH #13856 # Avoid locale-specific month conversion - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', - 'Sep', 'Oct', 'Nov', 'Dec'] + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] month_lookup = {i + 1: month for i, month in enumerate(months)} - ts = (time_stamp.strftime("%d ") + - month_lookup[time_stamp.month] + - time_stamp.strftime(" %Y %H:%M")) + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) self._file.write(self._null_terminate(ts)) def _write_variable_types(self): for typ in self.typlist: - self._file.write(struct.pack('B', typ)) + self._file.write(struct.pack("B", typ)) def _write_varnames(self): # varlist names are checked by _check_column_names @@ -2366,7 +2516,7 @@ def _write_value_label_names(self): def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination - blank = _pad_bytes('', 81) + blank = _pad_bytes("", 81) if self._variable_labels is None: for i in range(self.nvar): @@ -2377,13 +2527,16 @@ def _write_variable_labels(self): if col in self._variable_labels: label = self._variable_labels[col] if len(label) > 80: - raise ValueError('Variable labels must be 80 characters ' - 'or fewer') + raise ValueError( + "Variable labels must be 80 characters " "or fewer" + ) is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: - raise ValueError('Variable labels must contain only ' - 'characters that can be encoded in ' - 'Latin-1') + raise ValueError( + "Variable labels must contain only " + "characters that can be encoded in " + "Latin-1" + ) self._write(_pad_bytes(label, 81)) else: self._write(blank) @@ -2400,8 +2553,9 @@ def _prepare_data(self): if self._convert_dates is not None: for i, col in enumerate(data): if i in convert_dates: - data[col] = _datetime_to_stata_elapsed_vec(data[col], - self.fmtlist[i]) + data[col] = _datetime_to_stata_elapsed_vec( + data[col], self.fmtlist[i] + ) # 2. 
Convert strls data = self._convert_strls(data) @@ -2411,8 +2565,8 @@ def _prepare_data(self): for i, col in enumerate(data): typ = typlist[i] if typ <= self._max_string_length: - data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,)) - stype = 'S{type}'.format(type=typ) + data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) + stype = "S{type}".format(type=typ) dtypes[col] = stype data[col] = data[col].str.encode(self._encoding).astype(stype) else: @@ -2428,7 +2582,7 @@ def _write_data(self): self._file.write(data.tobytes()) def _null_terminate(self, s, as_string=False): - null_byte = '\x00' + null_byte = "\x00" s += null_byte if not as_string: @@ -2484,8 +2638,8 @@ def _pad_bytes_new(name, length): Takes a bytes instance and pads it with null bytes until it's length chars. """ if isinstance(name, str): - name = bytes(name, 'utf-8') - return name + b'\x00' * (length - len(name)) + name = bytes(name, "utf-8") + return name + b"\x00" * (length - len(name)) class StataStrLWriter: @@ -2519,23 +2673,23 @@ class StataStrLWriter: def __init__(self, df, columns, version=117, byteorder=None): if version not in (117, 118, 119): - raise ValueError('Only dta versions 117, 118 and 119 supported') + raise ValueError("Only dta versions 117, 118 and 119 supported") self._dta_ver = version self.df = df self.columns = columns - self._gso_table = OrderedDict((('', (0, 0)),)) + self._gso_table = OrderedDict((("", (0, 0)),)) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - gso_v_type = 'I' # uint32 - gso_o_type = 'Q' # uint64 - self._encoding = 'utf-8' + gso_v_type = "I" # uint32 + gso_o_type = "Q" # uint64 + self._encoding = "utf-8" if version == 117: o_size = 4 - gso_o_type = 'I' # 117 used uint32 - self._encoding = 'latin-1' + gso_o_type = "I" # 117 used uint32 + self._encoding = "latin-1" elif version == 118: o_size = 6 else: # version == 119 @@ -2588,7 +2742,7 @@ def generate_table(self): for j, (col, v) in enumerate(col_index): val = row[col] # Allow columns with mixed str and None (GH 23633) - val = '' if val is None else val + val = "" if val is None else val key = gso_table.get(val, None) if key is None: # Stata prefers human numbers @@ -2636,12 +2790,12 @@ def generate_blob(self, gso_table): # 3 u4 u8 u1 u4 string + null term bio = BytesIO() - gso = bytes('GSO', 'ascii') - gso_type = struct.pack(self._byteorder + 'B', 130) - null = struct.pack(self._byteorder + 'B', 0) + gso = bytes("GSO", "ascii") + gso_type = struct.pack(self._byteorder + "B", 130) + null = struct.pack(self._byteorder + "B", 0) v_type = self._byteorder + self._gso_v_type o_type = self._byteorder + self._gso_o_type - len_type = self._byteorder + 'I' + len_type = self._byteorder + "I" for strl, vo in gso_table.items(): if vo == (0, 0): continue @@ -2660,7 +2814,7 @@ def generate_blob(self, gso_table): bio.write(gso_type) # llll - utf8_string = bytes(strl, 'utf-8') + utf8_string = bytes(strl, "utf-8") bio.write(struct.pack(len_type, len(utf8_string) + 1)) # xxx...xxx @@ -2748,17 +2902,33 @@ class StataWriter117(StataWriter): _max_string_length = 2045 - @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None) - def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None, variable_labels=None, convert_strl=None): + @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None) + def __init__( + self, + fname, + data, + convert_dates=None, + write_index=True, + encoding="latin-1", + 
byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + convert_strl=None, + ): # Shallow copy since convert_strl might be modified later self._convert_strl = [] if convert_strl is None else convert_strl[:] - super().__init__(fname, data, convert_dates, write_index, - byteorder=byteorder, time_stamp=time_stamp, - data_label=data_label, - variable_labels=variable_labels) + super().__init__( + fname, + data, + convert_dates, + write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + ) self._map = None self._strl_blob = None @@ -2766,9 +2936,8 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, def _tag(val, tag): """Surround val with """ if isinstance(val, str): - val = bytes(val, 'utf-8') - return (bytes('<' + tag + '>', 'utf-8') + val + - bytes('', 'utf-8')) + val = bytes(val, "utf-8") + return bytes("<" + tag + ">", "utf-8") + val + bytes("", "utf-8") def _update_map(self, tag): """Update map location for tag with file position""" @@ -2777,22 +2946,22 @@ def _update_map(self, tag): def _write_header(self, data_label=None, time_stamp=None): """Write the file header""" byteorder = self._byteorder - self._file.write(bytes('', 'utf-8')) + self._file.write(bytes("", "utf-8")) bio = BytesIO() # ds_format - 117 - bio.write(self._tag(bytes('117', 'utf-8'), 'release')) + bio.write(self._tag(bytes("117", "utf-8"), "release")) # byteorder - bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", 'byteorder')) + bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) # number of vars, 2 bytes assert self.nvar < 2 ** 16 - bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), 'K')) + bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) # number of obs, 4 bytes - bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), 'N')) + bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N")) # data label 81 bytes, char, null terminated - label = data_label[:80] if data_label is not None else '' + label = data_label[:80] if data_label is not None else "" label_len = struct.pack(byteorder + "B", len(label)) - label = label_len + bytes(label, 'utf-8') - bio.write(self._tag(label, 'label')) + label = label_len + bytes(label, "utf-8") + bio.write(self._tag(label, "label")) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm if time_stamp is None: @@ -2800,155 +2969,176 @@ def _write_header(self, data_label=None, time_stamp=None): elif not isinstance(time_stamp, datetime.datetime): raise ValueError("time_stamp should be datetime type") # Avoid locale-specific month conversion - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', - 'Sep', 'Oct', 'Nov', 'Dec'] + months = [ + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ] month_lookup = {i + 1: month for i, month in enumerate(months)} - ts = (time_stamp.strftime("%d ") + - month_lookup[time_stamp.month] + - time_stamp.strftime(" %Y %H:%M")) + ts = ( + time_stamp.strftime("%d ") + + month_lookup[time_stamp.month] + + time_stamp.strftime(" %Y %H:%M") + ) # '\x11' added due to inspection of Stata file - ts = b'\x11' + bytes(ts, 'utf8') - bio.write(self._tag(ts, 'timestamp')) + ts = b"\x11" + bytes(ts, "utf8") + bio.write(self._tag(ts, "timestamp")) bio.seek(0) - self._file.write(self._tag(bio.read(), 'header')) + self._file.write(self._tag(bio.read(), "header")) def _write_map(self): """Called twice during file write. 
The first populates the values in the map with 0s. The second call writes the final map locations when all blocks have been written.""" if self._map is None: - self._map = OrderedDict((('stata_data', 0), - ('map', self._file.tell()), - ('variable_types', 0), - ('varnames', 0), - ('sortlist', 0), - ('formats', 0), - ('value_label_names', 0), - ('variable_labels', 0), - ('characteristics', 0), - ('data', 0), - ('strls', 0), - ('value_labels', 0), - ('stata_data_close', 0), - ('end-of-file', 0))) + self._map = OrderedDict( + ( + ("stata_data", 0), + ("map", self._file.tell()), + ("variable_types", 0), + ("varnames", 0), + ("sortlist", 0), + ("formats", 0), + ("value_label_names", 0), + ("variable_labels", 0), + ("characteristics", 0), + ("data", 0), + ("strls", 0), + ("value_labels", 0), + ("stata_data_close", 0), + ("end-of-file", 0), + ) + ) # Move to start of map - self._file.seek(self._map['map']) + self._file.seek(self._map["map"]) bio = BytesIO() for val in self._map.values(): - bio.write(struct.pack(self._byteorder + 'Q', val)) + bio.write(struct.pack(self._byteorder + "Q", val)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'map')) + self._file.write(self._tag(bio.read(), "map")) def _write_variable_types(self): - self._update_map('variable_types') + self._update_map("variable_types") bio = BytesIO() for typ in self.typlist: - bio.write(struct.pack(self._byteorder + 'H', typ)) + bio.write(struct.pack(self._byteorder + "H", typ)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'variable_types')) + self._file.write(self._tag(bio.read(), "variable_types")) def _write_varnames(self): - self._update_map('varnames') + self._update_map("varnames") bio = BytesIO() for name in self.varlist: name = self._null_terminate(name, True) name = _pad_bytes_new(name[:32], 33) bio.write(name) bio.seek(0) - self._file.write(self._tag(bio.read(), 'varnames')) + self._file.write(self._tag(bio.read(), "varnames")) def _write_sortlist(self): - self._update_map('sortlist') - self._file.write(self._tag(b'\x00\00' * (self.nvar + 1), 'sortlist')) + self._update_map("sortlist") + self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist")) def _write_formats(self): - self._update_map('formats') + self._update_map("formats") bio = BytesIO() for fmt in self.fmtlist: bio.write(_pad_bytes_new(fmt, 49)) bio.seek(0) - self._file.write(self._tag(bio.read(), 'formats')) + self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): - self._update_map('value_label_names') + self._update_map("value_label_names") bio = BytesIO() for i in range(self.nvar): # Use variable name when categorical - name = '' # default name + name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) name = _pad_bytes_new(name[:32], 33) bio.write(name) bio.seek(0) - self._file.write(self._tag(bio.read(), 'value_label_names')) + self._file.write(self._tag(bio.read(), "value_label_names")) def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination - self._update_map('variable_labels') + self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new('', 81) + blank = _pad_bytes_new("", 81) if self._variable_labels is None: for _ in range(self.nvar): bio.write(blank) bio.seek(0) - self._file.write(self._tag(bio.read(), 'variable_labels')) + self._file.write(self._tag(bio.read(), "variable_labels")) return for col in self.data: if col in self._variable_labels: label = self._variable_labels[col] if 
len(label) > 80: - raise ValueError('Variable labels must be 80 characters ' - 'or fewer') + raise ValueError( + "Variable labels must be 80 characters " "or fewer" + ) is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: - raise ValueError('Variable labels must contain only ' - 'characters that can be encoded in ' - 'Latin-1') + raise ValueError( + "Variable labels must contain only " + "characters that can be encoded in " + "Latin-1" + ) bio.write(_pad_bytes_new(label, 81)) else: bio.write(blank) bio.seek(0) - self._file.write(self._tag(bio.read(), 'variable_labels')) + self._file.write(self._tag(bio.read(), "variable_labels")) def _write_characteristics(self): - self._update_map('characteristics') - self._file.write(self._tag(b'', 'characteristics')) + self._update_map("characteristics") + self._file.write(self._tag(b"", "characteristics")) def _write_data(self): - self._update_map('data') + self._update_map("data") data = self.data - self._file.write(b'<data>') + self._file.write(b"<data>") self._file.write(data.tobytes()) - self._file.write(b'</data>') + self._file.write(b"</data>") def _write_strls(self): - self._update_map('strls') - strls = b'' + self._update_map("strls") + strls = b"" if self._strl_blob is not None: strls = self._strl_blob - self._file.write(self._tag(strls, 'strls')) + self._file.write(self._tag(strls, "strls")) def _write_expansion_fields(self): """No-op in dta 117+""" pass def _write_value_labels(self): - self._update_map('value_labels') + self._update_map("value_labels") bio = BytesIO() for vl in self._value_labels: lab = vl.generate_value_label(self._byteorder, self._encoding) - lab = self._tag(lab, 'lbl') + lab = self._tag(lab, "lbl") bio.write(lab) bio.seek(0) - self._file.write(self._tag(bio.read(), 'value_labels')) + self._file.write(self._tag(bio.read(), "value_labels")) def _write_file_close_tag(self): - self._update_map('stata_data_close') - self._file.write(bytes('</stata_dta>', 'utf-8')) - self._update_map('end-of-file') + self._update_map("stata_data_close") + self._file.write(bytes("</stata_dta>", "utf-8")) + self._update_map("end-of-file") def _update_strl_names(self): """Update column names for conversion to strl if they might have been @@ -2963,8 +3153,10 @@ def _convert_strls(self, data): """Convert columns to StrLs if either very large or in the convert_strl variable""" convert_cols = [ - col for i, col in enumerate(data) - if self.typlist[i] == 32768 or col in self._convert_strl] + col + for i, col in enumerate(data) + if self.typlist[i] == 32768 or col in self._convert_strl + ] if convert_cols: ssw = StataStrLWriter(data, convert_cols) @@ -2978,9 +3170,8 @@ def _set_formats_and_types(self, data, dtypes): self.fmtlist = [] for col, dtype in dtypes.iteritems(): force_strl = col in self._convert_strl - fmt = _dtype_to_default_stata_fmt(dtype, data[col], - dta_version=117, - force_strl=force_strl) + fmt = _dtype_to_default_stata_fmt( + dtype, data[col], dta_version=117, force_strl=force_strl + ) self.fmtlist.append(fmt) - self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], - force_strl)) + self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl)) diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index 57a45f0f18d90..ebe047c58b889 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -57,17 +57,43 @@ https://github.com/pandas-dev/pandas/issues/26747. 
""" from pandas.plotting._core import ( - PlotAccessor, boxplot, boxplot_frame, boxplot_frame_groupby, hist_frame, - hist_series) + PlotAccessor, + boxplot, + boxplot_frame, + boxplot_frame_groupby, + hist_frame, + hist_series, +) from pandas.plotting._misc import ( - andrews_curves, autocorrelation_plot, bootstrap_plot, - deregister as deregister_matplotlib_converters, lag_plot, - parallel_coordinates, plot_params, radviz, - register as register_matplotlib_converters, scatter_matrix, table) + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + deregister as deregister_matplotlib_converters, + lag_plot, + parallel_coordinates, + plot_params, + radviz, + register as register_matplotlib_converters, + scatter_matrix, + table, +) -__all__ = ['PlotAccessor', 'boxplot', 'boxplot_frame', 'boxplot_frame_groupby', - 'hist_frame', 'hist_series', 'scatter_matrix', 'radviz', - 'andrews_curves', 'bootstrap_plot', 'parallel_coordinates', - 'lag_plot', 'autocorrelation_plot', 'table', 'plot_params', - 'register_matplotlib_converters', - 'deregister_matplotlib_converters'] +__all__ = [ + "PlotAccessor", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "hist_frame", + "hist_series", + "scatter_matrix", + "radviz", + "andrews_curves", + "bootstrap_plot", + "parallel_coordinates", + "lag_plot", + "autocorrelation_plot", + "table", + "plot_params", + "register_matplotlib_converters", + "deregister_matplotlib_converters", +] diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 2f46df2985703..5e67d9a587914 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -19,9 +19,19 @@ pass -def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, - bins=10, **kwds): +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds +): """ Draw histogram of the input series using matplotlib. @@ -61,15 +71,38 @@ def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, matplotlib.axes.Axes.hist : Plot a histogram using matplotlib. """ plot_backend = _get_plot_backend() - return plot_backend.hist_series(self, by=by, ax=ax, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - figsize=figsize, bins=bins, **kwds) - - -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): + return plot_backend.hist_series( + self, + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + **kwds + ) + + +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds +): """ Make a histogram of the DataFrame's. 
@@ -148,17 +181,38 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend() - return plot_backend.hist_frame(data, column=column, by=by, grid=grid, - xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, - ax=ax, sharex=sharex, sharey=sharey, - figsize=figsize, layout=layout, bins=bins, - **kwds) - - -def boxplot(data, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): + return plot_backend.hist_frame( + data, + column=column, + by=by, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + bins=bins, + **kwds + ) + + +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): """ Make a box plot from DataFrame columns. @@ -322,26 +376,65 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, """ plot_backend = _get_plot_backend() - return plot_backend.boxplot(data, column=column, by=by, ax=ax, - fontsize=fontsize, rot=rot, grid=grid, - figsize=figsize, layout=layout, - return_type=return_type, **kwds) + return plot_backend.boxplot( + data, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) @Appender(boxplot.__doc__) -def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, - return_type=None, **kwds): +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): plot_backend = _get_plot_backend() - return plot_backend.boxplot_frame(self, column=column, by=by, ax=ax, - fontsize=fontsize, rot=rot, grid=grid, - figsize=figsize, layout=layout, - return_type=return_type, **kwds) - - -def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, - rot=0, grid=True, ax=None, figsize=None, - layout=None, sharex=False, sharey=True, **kwds): + return plot_backend.boxplot_frame( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) + + +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + **kwds +): """ Make box plots from DataFrameGroupBy data. @@ -393,9 +486,19 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, """ plot_backend = _get_plot_backend() return plot_backend.boxplot_frame_groupby( - grouped, subplots=subplots, column=column, fontsize=fontsize, rot=rot, - grid=grid, ax=ax, figsize=figsize, layout=layout, sharex=sharex, - sharey=sharey, **kwds) + grouped, + subplots=subplots, + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + sharex=sharex, + sharey=sharey, + **kwds + ) class PlotAccessor(PandasObject): @@ -500,11 +603,11 @@ class PlotAccessor(PandasObject): From 0 (left/bottom-end) to 1 (right/top-end). 
Default is 0.5 (center) """ - _common_kinds = ('line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', - 'box') - _series_kinds = ('pie',) - _dataframe_kinds = ('scatter', 'hexbin') - _kind_aliases = {'density': 'kde'} + + _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box") + _series_kinds = ("pie",) + _dataframe_kinds = ("scatter", "hexbin") + _kind_aliases = {"density": "kde"} _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds def __init__(self, data): @@ -521,63 +624,106 @@ def _get_call_args(backend_name, data, args, kwargs): """ if isinstance(data, ABCSeries): arg_def = [ - ('kind', 'line'), ('ax', None), ('figsize', None), - ('use_index', True), ('title', None), ('grid', None), - ('legend', False), ('style', None), ('logx', False), - ('logy', False), ('loglog', False), ('xticks', None), - ('yticks', None), ('xlim', None), ('ylim', None), - ('rot', None), ('fontsize', None), ('colormap', None), - ('table', False), ('yerr', None), ('xerr', None), - ('label', None), ('secondary_y', False)] + ("kind", "line"), + ("ax", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", False), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("label", None), + ("secondary_y", False), + ] elif isinstance(data, ABCDataFrame): arg_def = [ - ('x', None), ('y', None), ('kind', 'line'), ('ax', None), - ('subplots', False), ('sharex', None), ('sharey', False), - ('layout', None), ('figsize', None), ('use_index', True), - ('title', None), ('grid', None), ('legend', True), - ('style', None), ('logx', False), ('logy', False), - ('loglog', False), ('xticks', None), ('yticks', None), - ('xlim', None), ('ylim', None), ('rot', None), - ('fontsize', None), ('colormap', None), ('table', False), - ('yerr', None), ('xerr', None), ('secondary_y', False), - ('sort_columns', False)] + ("x", None), + ("y", None), + ("kind", "line"), + ("ax", None), + ("subplots", False), + ("sharex", None), + ("sharey", False), + ("layout", None), + ("figsize", None), + ("use_index", True), + ("title", None), + ("grid", None), + ("legend", True), + ("style", None), + ("logx", False), + ("logy", False), + ("loglog", False), + ("xticks", None), + ("yticks", None), + ("xlim", None), + ("ylim", None), + ("rot", None), + ("fontsize", None), + ("colormap", None), + ("table", False), + ("yerr", None), + ("xerr", None), + ("secondary_y", False), + ("sort_columns", False), + ] else: - raise TypeError(('Called plot accessor for type {}, expected ' - 'Series or DataFrame').format( - type(data).__name__)) + raise TypeError( + ( + "Called plot accessor for type {}, expected " "Series or DataFrame" + ).format(type(data).__name__) + ) if args and isinstance(data, ABCSeries): - msg = ('`Series.plot()` should not be called with positional ' - 'arguments, only keyword arguments. The order of ' - 'positional arguments will change in the future. ' - 'Use `Series.plot({})` instead of `Series.plot({})`.') + msg = ( + "`Series.plot()` should not be called with positional " + "arguments, only keyword arguments. The order of " + "positional arguments will change in the future. " + "Use `Series.plot({})` instead of `Series.plot({})`." 
+ ) positional_args = str(args)[1:-1] - keyword_args = ', '.join('{}={!r}'.format(name, value) - for (name, default), value - in zip(arg_def, args)) - warnings.warn(msg.format(keyword_args, positional_args), - FutureWarning, stacklevel=3) + keyword_args = ", ".join( + "{}={!r}".format(name, value) + for (name, default), value in zip(arg_def, args) + ) + warnings.warn( + msg.format(keyword_args, positional_args), FutureWarning, stacklevel=3 + ) pos_args = {name: value for value, (name, _) in zip(args, arg_def)} - if backend_name == 'pandas.plotting._matplotlib': + if backend_name == "pandas.plotting._matplotlib": kwargs = dict(arg_def, **pos_args, **kwargs) else: kwargs = dict(pos_args, **kwargs) - x = kwargs.pop('x', None) - y = kwargs.pop('y', None) - kind = kwargs.pop('kind', 'line') + x = kwargs.pop("x", None) + y = kwargs.pop("y", None) + kind = kwargs.pop("kind", "line") return x, y, kind, kwargs def __call__(self, *args, **kwargs): plot_backend = _get_plot_backend() - x, y, kind, kwargs = self._get_call_args(plot_backend.__name__, - self._parent, args, kwargs) + x, y, kind, kwargs = self._get_call_args( + plot_backend.__name__, self._parent, args, kwargs + ) kind = self._kind_aliases.get(kind, kind) if kind not in self._all_kinds: - raise ValueError('{} is not a valid plot kind'.format(kind)) + raise ValueError("{} is not a valid plot kind".format(kind)) # The original data structured can be transformed before passed to the # backend. For example, for DataFrame is common to set the index as the @@ -585,22 +731,22 @@ def __call__(self, *args, **kwargs): data = self._parent.copy() if isinstance(data, pandas.core.dtypes.generic.ABCSeries): - kwargs['reuse_plot'] = True + kwargs["reuse_plot"] = True if kind in self._dataframe_kinds: if isinstance(data, ABCDataFrame): return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs) else: - raise ValueError(("plot kind {} can only be used for " - "data frames").format(kind)) + raise ValueError( + ("plot kind {} can only be used for " "data frames").format(kind) + ) elif kind in self._series_kinds: if isinstance(data, ABCDataFrame): - if y is None and kwargs.get('subplots') is False: + if y is None and kwargs.get("subplots") is False: msg = "{} requires either y column or 'subplots=True'" raise ValueError(msg.format(kind)) elif y is not None: - if (is_integer(y) - and not data.columns.holds_integer()): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] # converted to series actually. 
copy to not modify data = data[y].copy() @@ -620,11 +766,11 @@ def __call__(self, *args, **kwargs): if int_y_arg and not data.columns.holds_integer(): y = data_cols[y] - label_kw = kwargs['label'] if 'label' in kwargs else False - for kw in ['xerr', 'yerr']: - if (kw in kwargs and - (isinstance(kwargs[kw], str) - or is_integer(kwargs[kw]))): + label_kw = kwargs["label"] if "label" in kwargs else False + for kw in ["xerr", "yerr"]: + if kw in kwargs and ( + isinstance(kwargs[kw], str) or is_integer(kwargs[kw]) + ): try: kwargs[kw] = data[kwargs[kw]] except (IndexError, KeyError, TypeError): @@ -640,7 +786,8 @@ def __call__(self, *args, **kwargs): match = is_list_like(label_kw) and len(label_kw) == len(y) if label_kw and not match: raise ValueError( - "label should be list-like and same length as y") + "label should be list-like and same length as y" + ) label_name = label_kw or data.columns data.columns = label_name @@ -713,7 +860,7 @@ def line(self, x=None, y=None, **kwargs): >>> lines = df.plot.line(x='pig', y='horse') """ - return self(kind='line', x=x, y=y, **kwargs) + return self(kind="line", x=x, y=y, **kwargs) def bar(self, x=None, y=None, **kwargs): """ @@ -798,7 +945,7 @@ def bar(self, x=None, y=None, **kwargs): >>> ax = df.plot.bar(x='lifespan', rot=0) """ - return self(kind='bar', x=x, y=y, **kwargs) + return self(kind="bar", x=x, y=y, **kwargs) def barh(self, x=None, y=None, **kwargs): """ @@ -878,7 +1025,7 @@ def barh(self, x=None, y=None, **kwargs): ... 'lifespan': lifespan}, index=index) >>> ax = df.plot.barh(x='lifespan') """ - return self(kind='barh', x=x, y=y, **kwargs) + return self(kind="barh", x=x, y=y, **kwargs) def box(self, by=None, **kwargs): r""" @@ -928,7 +1075,7 @@ def box(self, by=None, **kwargs): >>> df = pd.DataFrame(data, columns=list('ABCD')) >>> ax = df.plot.box() """ - return self(kind='box', by=by, **kwargs) + return self(kind="box", by=by, **kwargs) def hist(self, by=None, bins=10, **kwargs): """ @@ -975,7 +1122,7 @@ def hist(self, by=None, bins=10, **kwargs): >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) """ - return self(kind='hist', by=by, bins=bins, **kwargs) + return self(kind="hist", by=by, bins=bins, **kwargs) def kde(self, bw_method=None, ind=None, **kwargs): """ @@ -1083,7 +1230,7 @@ def kde(self, bw_method=None, ind=None, **kwargs): >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ - return self(kind='kde', bw_method=bw_method, ind=ind, **kwargs) + return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs) density = kde @@ -1158,7 +1305,7 @@ def area(self, x=None, y=None, **kwargs): ... }) >>> ax = df.plot.area(x='day') """ - return self(kind='area', x=x, y=y, **kwargs) + return self(kind="area", x=x, y=y, **kwargs) def pie(self, **kwargs): """ @@ -1207,11 +1354,13 @@ def pie(self, **kwargs): >>> plot = df.plot.pie(subplots=True, figsize=(6, 3)) """ - if (isinstance(self._parent, ABCDataFrame) - and kwargs.get('y', None) is None - and not kwargs.get('subplots', False)): + if ( + isinstance(self._parent, ABCDataFrame) + and kwargs.get("y", None) is None + and not kwargs.get("subplots", False) + ): raise ValueError("pie requires either y column or 'subplots=True'") - return self(kind='pie', **kwargs) + return self(kind="pie", **kwargs) def scatter(self, x, y, s=None, c=None, **kwargs): """ @@ -1292,10 +1441,9 @@ def scatter(self, x, y, s=None, c=None, **kwargs): ... c='species', ... 
colormap='viridis') """ - return self(kind='scatter', x=x, y=y, s=s, c=c, **kwargs) + return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs) - def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, - **kwargs): + def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs): """ Generate a hexagonal binning plot. @@ -1378,11 +1526,11 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, ... cmap="viridis") """ if reduce_C_function is not None: - kwargs['reduce_C_function'] = reduce_C_function + kwargs["reduce_C_function"] = reduce_C_function if gridsize is not None: - kwargs['gridsize'] = gridsize + kwargs["gridsize"] = gridsize - return self(kind='hexbin', x=x, y=y, C=C, **kwargs) + return self(kind="hexbin", x=x, y=y, C=C, **kwargs) def _get_plot_backend(): @@ -1398,7 +1546,7 @@ def _get_plot_backend(): The backend is imported lazily, as matplotlib is a soft dependency, and pandas can be used without it being installed. """ - backend_str = pandas.get_option('plotting.backend') - if backend_str == 'matplotlib': - backend_str = 'pandas.plotting._matplotlib' + backend_str = pandas.get_option("plotting.backend") + if backend_str == "matplotlib": + backend_str = "pandas.plotting._matplotlib" return importlib.import_module(backend_str) diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 8eac6897add0e..d3b7a34b6c923 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,28 +1,46 @@ from pandas._config import get_option from pandas.plotting._matplotlib.boxplot import ( - BoxPlot, boxplot, boxplot_frame, boxplot_frame_groupby) + BoxPlot, + boxplot, + boxplot_frame, + boxplot_frame_groupby, +) from pandas.plotting._matplotlib.converter import deregister, register from pandas.plotting._matplotlib.core import ( - AreaPlot, BarhPlot, BarPlot, HexBinPlot, LinePlot, PiePlot, ScatterPlot) -from pandas.plotting._matplotlib.hist import ( - HistPlot, KdePlot, hist_frame, hist_series) + AreaPlot, + BarhPlot, + BarPlot, + HexBinPlot, + LinePlot, + PiePlot, + ScatterPlot, +) +from pandas.plotting._matplotlib.hist import HistPlot, KdePlot, hist_frame, hist_series from pandas.plotting._matplotlib.misc import ( - andrews_curves, autocorrelation_plot, bootstrap_plot, lag_plot, - parallel_coordinates, radviz, scatter_matrix) + andrews_curves, + autocorrelation_plot, + bootstrap_plot, + lag_plot, + parallel_coordinates, + radviz, + scatter_matrix, +) from pandas.plotting._matplotlib.timeseries import tsplot from pandas.plotting._matplotlib.tools import table -PLOT_CLASSES = {'line': LinePlot, - 'bar': BarPlot, - 'barh': BarhPlot, - 'box': BoxPlot, - 'hist': HistPlot, - 'kde': KdePlot, - 'area': AreaPlot, - 'pie': PiePlot, - 'scatter': ScatterPlot, - 'hexbin': HexBinPlot} +PLOT_CLASSES = { + "line": LinePlot, + "bar": BarPlot, + "barh": BarhPlot, + "box": BoxPlot, + "hist": HistPlot, + "kde": KdePlot, + "area": AreaPlot, + "pie": PiePlot, + "scatter": ScatterPlot, + "hexbin": HexBinPlot, +} if get_option("plotting.matplotlib.register_converters"): register(explicit=False) @@ -33,20 +51,35 @@ def plot(data, kind, **kwargs): # registered) causes problems in matplotlib 2 (converters seem to not # work) import matplotlib.pyplot as plt - if kwargs.pop('reuse_plot', False): - ax = kwargs.get('ax') + + if kwargs.pop("reuse_plot", False): + ax = kwargs.get("ax") if ax is None and len(plt.get_fignums()) > 0: with plt.rc_context(): ax = plt.gca() - kwargs['ax'] = getattr(ax, 
'left_ax', ax) + kwargs["ax"] = getattr(ax, "left_ax", ax) plot_obj = PLOT_CLASSES[kind](data, **kwargs) plot_obj.generate() plot_obj.draw() return plot_obj.result -__all__ = ['plot', 'hist_series', 'hist_frame', 'boxplot', 'boxplot_frame', - 'boxplot_frame_groupby', 'tsplot', 'table', 'andrews_curves', - 'autocorrelation_plot', 'bootstrap_plot', 'lag_plot', - 'parallel_coordinates', 'radviz', 'scatter_matrix', 'register', - 'deregister'] +__all__ = [ + "plot", + "hist_series", + "hist_frame", + "boxplot", + "boxplot_frame", + "boxplot_frame_groupby", + "tsplot", + "table", + "andrews_curves", + "autocorrelation_plot", + "bootstrap_plot", + "lag_plot", + "parallel_coordinates", + "radviz", + "scatter_matrix", + "register", + "deregister", +] diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index f8bc531e3c344..8ff7441df5354 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -17,18 +17,17 @@ class BoxPlot(LinePlot): - _kind = 'box' - _layout_type = 'horizontal' + _kind = "box" + _layout_type = "horizontal" - _valid_return_types = (None, 'axes', 'dict', 'both') + _valid_return_types = (None, "axes", "dict", "both") # namedtuple to hold results - BP = namedtuple("Boxplot", ['ax', 'lines']) + BP = namedtuple("Boxplot", ["ax", "lines"]) - def __init__(self, data, return_type='axes', **kwargs): + def __init__(self, data, return_type="axes", **kwargs): # Do not call LinePlot.__init__ which may fill nan if return_type not in self._valid_return_types: - raise ValueError( - "return_type must be {None, 'axes', 'dict', 'both'}") + raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") self.return_type = return_type MPLPlot.__init__(self, data, **kwargs) @@ -37,13 +36,13 @@ def _args_adjust(self): if self.subplots: # Disable label ax sharing. Otherwise, all subplots shows last # column label - if self.orientation == 'vertical': + if self.orientation == "vertical": self.sharex = False else: self.sharey = False @classmethod - def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): + def _plot(cls, ax, y, column_num=None, return_type="axes", **kwds): if y.ndim == 2: y = [remove_na_arraylike(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN @@ -54,52 +53,53 @@ def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): y = remove_na_arraylike(y) bp = ax.boxplot(y, **kwds) - if return_type == 'dict': + if return_type == "dict": return bp, bp - elif return_type == 'both': + elif return_type == "both": return cls.BP(ax=ax, lines=bp), bp else: return ax, bp def _validate_color_args(self): - if 'color' in self.kwds: + if "color" in self.kwds: if self.colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") - self.color = self.kwds.pop('color') + warnings.warn( + "'color' and 'colormap' cannot be used " + "simultaneously. 
Using 'color'" + ) + self.color = self.kwds.pop("color") if isinstance(self.color, dict): - valid_keys = ['boxes', 'whiskers', 'medians', 'caps'] + valid_keys = ["boxes", "whiskers", "medians", "caps"] for key, values in self.color.items(): if key not in valid_keys: - raise ValueError("color dict contains invalid " - "key '{0}' " - "The key must be either {1}" - .format(key, valid_keys)) + raise ValueError( + "color dict contains invalid " + "key '{0}' " + "The key must be either {1}".format(key, valid_keys) + ) else: self.color = None # get standard colors for default - colors = _get_standard_colors(num_colors=3, - colormap=self.colormap, - color=None) + colors = _get_standard_colors(num_colors=3, colormap=self.colormap, color=None) # use 2 colors by default, for box/whisker and median # flier colors isn't needed here # because it can be specified by ``sym`` kw self._boxes_c = colors[0] self._whiskers_c = colors[0] self._medians_c = colors[2] - self._caps_c = 'k' # mpl default + self._caps_c = "k" # mpl default - def _get_colors(self, num_colors=None, color_kwds='color'): + def _get_colors(self, num_colors=None, color_kwds="color"): pass def maybe_color_bp(self, bp): if isinstance(self.color, dict): - boxes = self.color.get('boxes', self._boxes_c) - whiskers = self.color.get('whiskers', self._whiskers_c) - medians = self.color.get('medians', self._medians_c) - caps = self.color.get('caps', self._caps_c) + boxes = self.color.get("boxes", self._boxes_c) + whiskers = self.color.get("whiskers", self._whiskers_c) + medians = self.color.get("medians", self._medians_c) + caps = self.color.get("caps", self._caps_c) else: # Other types are forwarded to matplotlib # If None, use default colors @@ -108,10 +108,10 @@ def maybe_color_bp(self, bp): medians = self.color or self._medians_c caps = self.color or self._caps_c - setp(bp['boxes'], color=boxes, alpha=1) - setp(bp['whiskers'], color=whiskers, alpha=1) - setp(bp['medians'], color=medians, alpha=1) - setp(bp['caps'], color=caps, alpha=1) + setp(bp["boxes"], color=boxes, alpha=1) + setp(bp["whiskers"], color=whiskers, alpha=1) + setp(bp["medians"], color=medians, alpha=1) + setp(bp["caps"], color=caps, alpha=1) def _make_plot(self): if self.subplots: @@ -121,8 +121,9 @@ def _make_plot(self): ax = self._get_ax(i) kwds = self.kwds.copy() - ret, bp = self._plot(ax, y, column_num=i, - return_type=self.return_type, **kwds) + ret, bp = self._plot( + ax, y, column_num=i, return_type=self.return_type, **kwds + ) self.maybe_color_bp(bp) self._return_obj[label] = ret @@ -133,8 +134,9 @@ def _make_plot(self): ax = self._get_ax(0) kwds = self.kwds.copy() - ret, bp = self._plot(ax, y, column_num=0, - return_type=self.return_type, **kwds) + ret, bp = self._plot( + ax, y, column_num=0, return_type=self.return_type, **kwds + ) self.maybe_color_bp(bp) self._return_obj = ret @@ -145,7 +147,7 @@ def _make_plot(self): self._set_ticklabels(ax, labels) def _set_ticklabels(self, ax, labels): - if self.orientation == 'vertical': + if self.orientation == "vertical": ax.set_xticklabels(labels) else: ax.set_yticklabels(labels) @@ -158,10 +160,10 @@ def _post_plot_logic(self, ax, data): @property def orientation(self): - if self.kwds.get('vert', True): - return 'vertical' + if self.kwds.get("vert", True): + return "vertical" else: - return 'horizontal' + return "horizontal" @property def result(self): @@ -171,18 +173,28 @@ def result(self): return self._return_obj -def _grouped_plot_by_column(plotf, data, columns=None, by=None, - numeric_only=True, grid=False, - 
figsize=None, ax=None, layout=None, - return_type=None, **kwargs): +def _grouped_plot_by_column( + plotf, + data, + columns=None, + by=None, + numeric_only=True, + grid=False, + figsize=None, + ax=None, + layout=None, + return_type=None, + **kwargs +): grouped = data.groupby(by) if columns is None: if not isinstance(by, (list, tuple)): by = [by] columns = data._get_numeric_data().columns.difference(by) naxes = len(columns) - fig, axes = _subplots(naxes=naxes, sharex=True, sharey=True, - figsize=figsize, ax=ax, layout=layout) + fig, axes = _subplots( + naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout + ) _axes = _flatten(axes) @@ -205,52 +217,63 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, result = axes byline = by[0] if len(by) == 1 else by - fig.suptitle('Boxplot grouped by {byline}'.format(byline=byline)) + fig.suptitle("Boxplot grouped by {byline}".format(byline=byline)) fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) return result -def boxplot(data, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): +def boxplot( + data, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): import matplotlib.pyplot as plt + # validate return_type: if return_type not in BoxPlot._valid_return_types: raise ValueError("return_type must be {'axes', 'dict', 'both'}") if isinstance(data, ABCSeries): - data = data.to_frame('x') - column = 'x' + data = data.to_frame("x") + column = "x" def _get_colors(): # num_colors=3 is required as method maybe_color_bp takes the colors # in positions 0 and 2. - return _get_standard_colors(color=kwds.get('color'), num_colors=3) + return _get_standard_colors(color=kwds.get("color"), num_colors=3) def maybe_color_bp(bp): - if 'color' not in kwds: - setp(bp['boxes'], color=colors[0], alpha=1) - setp(bp['whiskers'], color=colors[0], alpha=1) - setp(bp['medians'], color=colors[2], alpha=1) + if "color" not in kwds: + setp(bp["boxes"], color=colors[0], alpha=1) + setp(bp["whiskers"], color=colors[0], alpha=1) + setp(bp["medians"], color=colors[2], alpha=1) def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] values = [np.asarray(remove_na_arraylike(v)) for v in values] bp = ax.boxplot(values, **kwds) if fontsize is not None: - ax.tick_params(axis='both', labelsize=fontsize) - if kwds.get('vert', 1): + ax.tick_params(axis="both", labelsize=fontsize) + if kwds.get("vert", 1): ax.set_xticklabels(keys, rotation=rot) else: ax.set_yticklabels(keys, rotation=rot) maybe_color_bp(bp) # Return axes in multiplot case, maybe revisit later # 985 - if return_type == 'dict': + if return_type == "dict": return bp - elif return_type == 'both': + elif return_type == "both": return BoxPlot.BP(ax=ax, lines=bp) else: return ax @@ -267,19 +290,27 @@ def plot_group(keys, values, ax): if by is not None: # Prefer array return type for 2-D plots to match the subplot layout # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580 - result = _grouped_plot_by_column(plot_group, data, columns=columns, - by=by, grid=grid, figsize=figsize, - ax=ax, layout=layout, - return_type=return_type) + result = _grouped_plot_by_column( + plot_group, + data, + columns=columns, + by=by, + grid=grid, + figsize=figsize, + ax=ax, + layout=layout, + return_type=return_type, + ) else: if return_type is None: - return_type = 'axes' + return_type = "axes" if layout is 
not None: - raise ValueError("The 'layout' keyword is not supported when " - "'by' is None") + raise ValueError( + "The 'layout' keyword is not supported when " "'by' is None" + ) if ax is None: - rc = {'figure.figsize': figsize} if figsize is not None else {} + rc = {"figure.figsize": figsize} if figsize is not None else {} with plt.rc_context(rc): ax = plt.gca() data = data._get_numeric_data() @@ -294,37 +325,75 @@ def plot_group(keys, values, ax): return result -def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, - grid=True, figsize=None, layout=None, - return_type=None, **kwds): +def boxplot_frame( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds +): import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots - ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, - grid=grid, rot=rot, figsize=figsize, layout=layout, - return_type=return_type, **kwds) + ax = boxplot( + self, + column=column, + by=by, + ax=ax, + fontsize=fontsize, + grid=grid, + rot=rot, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds + ) plt.draw_if_interactive() return ax -def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, - rot=0, grid=True, ax=None, figsize=None, - layout=None, sharex=False, sharey=True, **kwds): +def boxplot_frame_groupby( + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + sharex=False, + sharey=True, + **kwds +): converter._WARN = False # no warning for pandas plots if subplots is True: naxes = len(grouped) - fig, axes = _subplots(naxes=naxes, squeeze=False, - ax=ax, sharex=sharex, sharey=sharey, - figsize=figsize, layout=layout) + fig, axes = _subplots( + naxes=naxes, + squeeze=False, + ax=ax, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) axes = _flatten(axes) ret = pd.Series() for (key, group), ax in zip(grouped, axes): - d = group.boxplot(ax=ax, column=column, fontsize=fontsize, - rot=rot, grid=grid, **kwds) + d = group.boxplot( + ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds + ) ax.set_title(pprint_thing(key)) ret.loc[key] = d - fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, - right=0.9, wspace=0.2) + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: keys, frames = zip(*grouped) if grouped.axis == 0: @@ -334,7 +403,14 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, df = frames[0].join(frames[1::]) else: df = frames[0] - ret = df.boxplot(column=column, fontsize=fontsize, rot=rot, - grid=grid, ax=ax, figsize=figsize, - layout=layout, **kwds) + ret = df.boxplot( + column=column, + fontsize=fontsize, + rot=rot, + grid=grid, + ax=ax, + figsize=figsize, + layout=layout, + **kwds + ) return ret diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index 36bbe0f4ec174..e7855068334f7 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -9,12 +9,14 @@ def inner(): import matplotlib as mpl except ImportError: return False - return (op(LooseVersion(mpl.__version__), LooseVersion(version)) and - str(mpl.__version__)[0] != '0') + return ( + op(LooseVersion(mpl.__version__), LooseVersion(version)) + and str(mpl.__version__)[0] != "0" + ) return inner -_mpl_ge_2_2_3 = _mpl_version('2.2.3', operator.ge) -_mpl_ge_3_0_0 = 
_mpl_version('3.0.0', operator.ge) -_mpl_ge_3_1_0 = _mpl_version('3.1.0', operator.ge) +_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) +_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) +_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 30ef7a64dec4a..b20dd3212c7cb 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -14,8 +14,13 @@ from pandas._libs.tslibs.frequencies import FreqGroup, get_freq from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, is_float, is_float_dtype, is_integer, - is_integer_dtype, is_nested_list_like) + is_datetime64_ns_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_nested_list_like, +) from pandas.core.dtypes.generic import ABCSeries import pandas.core.common as com @@ -25,9 +30,9 @@ import pandas.core.tools.datetimes as tools # constants -HOURS_PER_DAY = 24. -MIN_PER_HOUR = 60. -SEC_PER_MIN = 60. +HOURS_PER_DAY = 24.0 +MIN_PER_HOUR = 60.0 +SEC_PER_MIN = 60.0 SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY @@ -75,8 +80,7 @@ def deregister(): # restore the old keys for unit, formatter in _mpl_units.items(): - if type(formatter) not in {DatetimeConverter, PeriodConverter, - TimeConverter}: + if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}: # make it idempotent by excluding ours. units.registry[unit] = formatter @@ -85,21 +89,22 @@ def _check_implicitly_registered(): global _WARN if _WARN: - msg = ("Using an implicitly registered datetime converter for a " - "matplotlib plotting method. The converter was registered " - "by pandas on import. Future versions of pandas will require " - "you to explicitly register matplotlib converters.\n\n" - "To register the converters:\n\t" - ">>> from pandas.plotting import register_matplotlib_converters" - "\n\t" - ">>> register_matplotlib_converters()") + msg = ( + "Using an implicitly registered datetime converter for a " + "matplotlib plotting method. The converter was registered " + "by pandas on import. 
Future versions of pandas will require " + "you to explicitly register matplotlib converters.\n\n" + "To register the converters:\n\t" + ">>> from pandas.plotting import register_matplotlib_converters" + "\n\t" + ">>> register_matplotlib_converters()" + ) warnings.warn(msg, FutureWarning) _WARN = False def _to_ordinalf(tm): - tot_sec = (tm.hour * 3600 + tm.minute * 60 + tm.second + - float(tm.microsecond / 1e6)) + tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + float(tm.microsecond / 1e6) return tot_sec @@ -107,7 +112,7 @@ def time2num(d): if isinstance(d, str): parsed = tools.to_datetime(d) if not isinstance(parsed, datetime): - raise ValueError('Could not parse time {d}'.format(d=d)) + raise ValueError("Could not parse time {d}".format(d=d)) return _to_ordinalf(parsed.time()) if isinstance(d, pydt.time): return _to_ordinalf(d) @@ -115,12 +120,10 @@ def time2num(d): class TimeConverter(units.ConversionInterface): - @staticmethod def convert(value, unit, axis): valid_types = (str, pydt.time) - if (isinstance(value, valid_types) or is_integer(value) or - is_float(value)): + if isinstance(value, valid_types) or is_integer(value) or is_float(value): return time2num(value) if isinstance(value, Index): return value.map(time2num) @@ -130,21 +133,20 @@ def convert(value, unit, axis): @staticmethod def axisinfo(unit, axis): - if unit != 'time': + if unit != "time": return None majloc = AutoLocator() majfmt = TimeFormatter(majloc) - return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='time') + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @staticmethod def default_units(x, axis): - return 'time' + return "time" # time formatter class TimeFormatter(Formatter): - def __init__(self, locs): self.locs = locs @@ -166,7 +168,7 @@ def __call__(self, x, pos=0): A string in HH:MM:SS.mmmuuu format. Microseconds, milliseconds and seconds are only displayed if non-zero. 
""" - fmt = '%H:%M:%S.%f' + fmt = "%H:%M:%S.%f" s = int(x) msus = int(round((x - s) * 1e6)) ms = msus // 1000 @@ -179,39 +181,35 @@ def __call__(self, x, pos=0): elif ms != 0: return pydt.time(h, m, s, msus).strftime(fmt)[:-3] elif s != 0: - return pydt.time(h, m, s).strftime('%H:%M:%S') + return pydt.time(h, m, s).strftime("%H:%M:%S") - return pydt.time(h, m).strftime('%H:%M') + return pydt.time(h, m).strftime("%H:%M") # Period Conversion class PeriodConverter(dates.DateConverter): - @staticmethod def convert(values, units, axis): if is_nested_list_like(values): - values = [PeriodConverter._convert_1d(v, units, axis) - for v in values] + values = [PeriodConverter._convert_1d(v, units, axis) for v in values] else: values = PeriodConverter._convert_1d(values, units, axis) return values @staticmethod def _convert_1d(values, units, axis): - if not hasattr(axis, 'freq'): - raise TypeError('Axis must have `freq` set to convert to Periods') - valid_types = (str, datetime, Period, pydt.date, pydt.time, - np.datetime64) - if (isinstance(values, valid_types) or is_integer(values) or - is_float(values)): + if not hasattr(axis, "freq"): + raise TypeError("Axis must have `freq` set to convert to Periods") + valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64) + if isinstance(values, valid_types) or is_integer(values) or is_float(values): return get_datevalue(values, axis.freq) elif isinstance(values, PeriodIndex): return values.asfreq(axis.freq)._ndarray_values elif isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - elif lib.infer_dtype(values, skipna=False) == 'period': + elif lib.infer_dtype(values, skipna=False) == "period": # https://github.com/pandas-dev/pandas/issues/24304 # convert ndarray[period] -> PeriodIndex return PeriodIndex(values, freq=axis.freq)._ndarray_values @@ -223,11 +221,13 @@ def _convert_1d(values, units, axis): def get_datevalue(date, freq): if isinstance(date, Period): return date.asfreq(freq).ordinal - elif isinstance(date, (str, datetime, pydt.date, pydt.time, - np.datetime64)): + elif isinstance(date, (str, datetime, pydt.date, pydt.time, np.datetime64)): return Period(date, freq).ordinal - elif (is_integer(date) or is_float(date) or - (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): + elif ( + is_integer(date) + or is_float(date) + or (isinstance(date, (np.ndarray, Index)) and (date.size == 1)) + ): return date elif date is None: return None @@ -240,9 +240,8 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if (isinstance(dt, (np.ndarray, Index, ABCSeries) - ) and is_datetime64_ns_dtype(dt)): - base = dates.epoch2num(dt.asi8 / 1.0E9) + if isinstance(dt, (np.ndarray, Index, ABCSeries)) and is_datetime64_ns_dtype(dt): + base = dates.epoch2num(dt.asi8 / 1.0e9) else: base = dates.date2num(dt) return base @@ -250,14 +249,12 @@ def _dt_to_float_ordinal(dt): # Datetime Conversion class DatetimeConverter(dates.DateConverter): - @staticmethod def convert(values, unit, axis): # values might be a 1-d array, or a list-like of arrays. 
_check_implicitly_registered() if is_nested_list_like(values): - values = [DatetimeConverter._convert_1d(v, unit, axis) - for v in values] + values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] else: values = DatetimeConverter._convert_1d(values, unit, axis) return values @@ -276,7 +273,7 @@ def try_parse(values): return _dt_to_float_ordinal(tslibs.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) - elif (is_integer(values) or is_float(values)): + elif is_integer(values) or is_float(values): return values elif isinstance(values, str): return try_parse(values) @@ -319,13 +316,13 @@ def axisinfo(unit, axis): datemin = pydt.date(2000, 1, 1) datemax = pydt.date(2010, 1, 1) - return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='', - default_limits=(datemin, datemax)) + return units.AxisInfo( + majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax) + ) class PandasAutoDateFormatter(dates.AutoDateFormatter): - - def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): + def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): dates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt) # matplotlib.dates._UTC has no _utcoffset called by pandas if self._tz is dates.UTC: @@ -333,15 +330,14 @@ def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): class PandasAutoDateLocator(dates.AutoDateLocator): - def get_locator(self, dmin, dmax): - 'Pick the best locator based on a distance.' + "Pick the best locator based on a distance." _check_implicitly_registered() delta = relativedelta(dmax, dmin) num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds - tot_sec = num_days * 86400. + num_sec + tot_sec = num_days * 86400.0 + num_sec if abs(tot_sec) < self.minticks: self._freq = -1 @@ -360,11 +356,11 @@ def _get_unit(self): class MilliSecondLocator(dates.DateLocator): - UNIT = 1. / (24 * 3600 * 1000) + UNIT = 1.0 / (24 * 3600 * 1000) def __init__(self, tz): dates.DateLocator.__init__(self, tz) - self._interval = 1. + self._interval = 1.0 def _get_unit(self): return self.get_unit_generic(-1) @@ -411,24 +407,25 @@ def __call__(self): break else: # We went through the whole loop without breaking, default to 1 - self._interval = 1000. 
+ self._interval = 1000.0 estimate = (nmax - nmin) / (self._get_unit() * self._get_interval()) if estimate > self.MAXTICKS * 2: - raise RuntimeError(('MillisecondLocator estimated to generate ' - '{estimate:d} ticks from {dmin} to {dmax}: ' - 'exceeds Locator.MAXTICKS' - '* 2 ({arg:d}) ').format( - estimate=estimate, dmin=dmin, dmax=dmax, - arg=self.MAXTICKS * 2)) - - freq = '%dL' % self._get_interval() + raise RuntimeError( + ( + "MillisecondLocator estimated to generate " + "{estimate:d} ticks from {dmin} to {dmax}: " + "exceeds Locator.MAXTICKS" + "* 2 ({arg:d}) " + ).format(estimate=estimate, dmin=dmin, dmax=dmax, arg=self.MAXTICKS * 2) + ) + + freq = "%dL" % self._get_interval() tz = self.tz.tzname(None) st = _from_ordinal(dates.date2num(dmin)) # strip tz ed = _from_ordinal(dates.date2num(dmax)) - all_dates = date_range(start=st, end=ed, - freq=freq, tz=tz).astype(object) + all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: if len(all_dates) > 0: @@ -485,8 +482,9 @@ def _from_ordinal(x, tz=None): microsecond = int(1e6 * remainder) if microsecond < 10: microsecond = 0 # compensate for rounding errors - dt = datetime(dt.year, dt.month, dt.day, int(hour), int(minute), - int(second), microsecond) + dt = datetime( + dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond + ) if tz is not None: dt = dt.astimezone(tz) @@ -495,6 +493,7 @@ def _from_ordinal(x, tz=None): return dt + # Fixed frequency dynamic tick locators and formatters # ------------------------------------------------------------------------- @@ -548,9 +547,9 @@ def has_level_label(label_flags, vmin): if the minimum view limit is not an exact integer, then the first tick label won't be shown, so we must adjust for that. """ - if label_flags.size == 0 or (label_flags.size == 1 and - label_flags[0] == 0 and - vmin % 1 > 0.0): + if label_flags.size == 0 or ( + label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0 + ): return False else: return True @@ -592,33 +591,34 @@ def _daily_finder(vmin, vmax, freq): # save this for later usage vmin_orig = vmin - (vmin, vmax) = (Period(ordinal=int(vmin), freq=freq), - Period(ordinal=int(vmax), freq=freq)) + (vmin, vmax) = ( + Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq), + ) span = vmax.ordinal - vmin.ordinal + 1 dates_ = period_range(start=vmin, end=vmax, freq=freq) # Initialize the output - info = np.zeros(span, - dtype=[('val', np.int64), ('maj', bool), - ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._ndarray_values - info['fmt'][:] = '' - info['maj'][[0, -1]] = True + info = np.zeros( + span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")] + ) + info["val"][:] = dates_._ndarray_values + info["fmt"][:] = "" + info["maj"][[0, -1]] = True # .. and set some shortcuts - info_maj = info['maj'] - info_min = info['min'] - info_fmt = info['fmt'] + info_maj = info["maj"] + info_min = info["min"] + info_fmt = info["fmt"] def first_label(label_flags): - if (label_flags[0] == 0) and (label_flags.size > 1) and \ - ((vmin_orig % 1) > 0.0): + if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0): return label_flags[1] else: return label_flags[0] # Case 1. 
Less than a month if span <= periodspermonth: - day_start = period_break(dates_, 'day') - month_start = period_break(dates_, 'month') + day_start = period_break(dates_, "day") + month_start = period_break(dates_, "month") def _hour_finder(label_interval, force_year_start): _hour = dates_.hour @@ -626,39 +626,38 @@ def _hour_finder(label_interval, force_year_start): hour_start = (_hour - _prev_hour) != 0 info_maj[day_start] = True info_min[hour_start & (_hour % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt[hour_start & (_hour % label_interval == 0)] = '%H:%M' - info_fmt[day_start] = '%H:%M\n%d-%b' - info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" if force_year_start and not has_level_label(year_start, vmin_orig): - info_fmt[first_label(day_start)] = '%H:%M\n%d-%b\n%Y' + info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y" def _minute_finder(label_interval): - hour_start = period_break(dates_, 'hour') + hour_start = period_break(dates_, "hour") _minute = dates_.minute _prev_minute = (dates_ - 1 * dates_.freq).minute minute_start = (_minute - _prev_minute) != 0 info_maj[hour_start] = True info_min[minute_start & (_minute % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[minute_start & (_minute % label_interval == 0)] = '%H:%M' - info_fmt[day_start] = '%H:%M\n%d-%b' - info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M" + info_fmt[day_start] = "%H:%M\n%d-%b" + info_fmt[year_start] = "%H:%M\n%d-%b\n%Y" def _second_finder(label_interval): - minute_start = period_break(dates_, 'minute') + minute_start = period_break(dates_, "minute") _second = dates_.second _prev_second = (dates_ - 1 * dates_.freq).second second_start = (_second - _prev_second) != 0 - info['maj'][minute_start] = True - info['min'][second_start & (_second % label_interval == 0)] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[second_start & (_second % - label_interval == 0)] = '%H:%M:%S' - info_fmt[day_start] = '%H:%M:%S\n%d-%b' - info_fmt[year_start] = '%H:%M:%S\n%d-%b\n%Y' + info["maj"][minute_start] = True + info["min"][second_start & (_second % label_interval == 0)] = True + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S" + info_fmt[day_start] = "%H:%M:%S\n%d-%b" + info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y" if span < periodsperday / 12000.0: _second_finder(1) @@ -695,81 +694,81 @@ def _second_finder(label_interval): else: info_maj[month_start] = True info_min[day_start] = True - year_start = period_break(dates_, 'year') - info_fmt = info['fmt'] - info_fmt[day_start] = '%d' - info_fmt[month_start] = '%d\n%b' - info_fmt[year_start] = '%d\n%b\n%Y' + year_start = period_break(dates_, "year") + info_fmt = info["fmt"] + info_fmt[day_start] = "%d" + info_fmt[month_start] = "%d\n%b" + info_fmt[year_start] = "%d\n%b\n%Y" if not has_level_label(year_start, vmin_orig): if not has_level_label(month_start, vmin_orig): - info_fmt[first_label(day_start)] = '%d\n%b\n%Y' + info_fmt[first_label(day_start)] = "%d\n%b\n%Y" else: - info_fmt[first_label(month_start)] = '%d\n%b\n%Y' + 
info_fmt[first_label(month_start)] = "%d\n%b\n%Y" # Case 2. Less than three months elif span <= periodsperyear // 4: - month_start = period_break(dates_, 'month') + month_start = period_break(dates_, "month") info_maj[month_start] = True if freq < FreqGroup.FR_HR: - info['min'] = True + info["min"] = True else: - day_start = period_break(dates_, 'day') - info['min'][day_start] = True - week_start = period_break(dates_, 'week') - year_start = period_break(dates_, 'year') - info_fmt[week_start] = '%d' - info_fmt[month_start] = '\n\n%b' - info_fmt[year_start] = '\n\n%b\n%Y' + day_start = period_break(dates_, "day") + info["min"][day_start] = True + week_start = period_break(dates_, "week") + year_start = period_break(dates_, "year") + info_fmt[week_start] = "%d" + info_fmt[month_start] = "\n\n%b" + info_fmt[year_start] = "\n\n%b\n%Y" if not has_level_label(year_start, vmin_orig): if not has_level_label(month_start, vmin_orig): - info_fmt[first_label(week_start)] = '\n\n%b\n%Y' + info_fmt[first_label(week_start)] = "\n\n%b\n%Y" else: - info_fmt[first_label(month_start)] = '\n\n%b\n%Y' + info_fmt[first_label(month_start)] = "\n\n%b\n%Y" # Case 3. Less than 14 months ............... elif span <= 1.15 * periodsperyear: - year_start = period_break(dates_, 'year') - month_start = period_break(dates_, 'month') - week_start = period_break(dates_, 'week') + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") + week_start = period_break(dates_, "week") info_maj[month_start] = True info_min[week_start] = True info_min[year_start] = False info_min[month_start] = False - info_fmt[month_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[month_start] = "%b" + info_fmt[year_start] = "%b\n%Y" if not has_level_label(year_start, vmin_orig): - info_fmt[first_label(month_start)] = '%b\n%Y' + info_fmt[first_label(month_start)] = "%b\n%Y" # Case 4. Less than 2.5 years ............... elif span <= 2.5 * periodsperyear: - year_start = period_break(dates_, 'year') - quarter_start = period_break(dates_, 'quarter') - month_start = period_break(dates_, 'month') + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") + month_start = period_break(dates_, "month") info_maj[quarter_start] = True info_min[month_start] = True - info_fmt[quarter_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" # Case 4. Less than 4 years ................. elif span <= 4 * periodsperyear: - year_start = period_break(dates_, 'year') - month_start = period_break(dates_, 'month') + year_start = period_break(dates_, "year") + month_start = period_break(dates_, "month") info_maj[year_start] = True info_min[month_start] = True info_min[year_start] = False month_break = dates_[month_start].month jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] - info_fmt[jan_or_jul] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" # Case 5. Less than 11 years ................ elif span <= 11 * periodsperyear: - year_start = period_break(dates_, 'year') - quarter_start = period_break(dates_, 'quarter') + year_start = period_break(dates_, "year") + quarter_start = period_break(dates_, "quarter") info_maj[year_start] = True info_min[quarter_start] = True info_min[year_start] = False - info_fmt[year_start] = '%Y' + info_fmt[year_start] = "%Y" # Case 6. More than 12 years ................ 
else: - year_start = period_break(dates_, 'year') + year_start = period_break(dates_, "year") year_break = dates_[year_start].year nyears = span / periodsperyear (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) @@ -777,7 +776,7 @@ def _second_finder(label_interval): info_maj[major_idx] = True minor_idx = year_start[(year_break % min_anndef == 0)] info_min[minor_idx] = True - info_fmt[major_idx] = '%Y' + info_fmt[major_idx] = "%Y" return info @@ -790,54 +789,54 @@ def _monthly_finder(vmin, vmax, freq): span = vmax - vmin + 1 # Initialize the output - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - dates_ = info['val'] - info['fmt'] = '' + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + dates_ = info["val"] + info["fmt"] = "" year_start = (dates_ % 12 == 0).nonzero()[0] - info_maj = info['maj'] - info_fmt = info['fmt'] + info_maj = info["maj"] + info_fmt = info["fmt"] if span <= 1.15 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True - info_fmt[:] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[:] = "%b" + info_fmt[year_start] = "%b\n%Y" if not has_level_label(year_start, vmin_orig): if dates_.size > 1: idx = 1 else: idx = 0 - info_fmt[idx] = '%b\n%Y' + info_fmt[idx] = "%b\n%Y" elif span <= 2.5 * periodsperyear: quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True # TODO: Check the following : is it really info['fmt'] ? - info['fmt'][quarter_start] = True - info['min'] = True + info["fmt"][quarter_start] = True + info["min"] = True - info_fmt[quarter_start] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[quarter_start] = "%b" + info_fmt[year_start] = "%b\n%Y" elif span <= 4 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) - info_fmt[jan_or_jul] = '%b' - info_fmt[year_start] = '%b\n%Y' + info_fmt[jan_or_jul] = "%b" + info_fmt[year_start] = "%b\n%Y" elif span <= 11 * periodsperyear: quarter_start = (dates_ % 3 == 0).nonzero() info_maj[year_start] = True - info['min'][quarter_start] = True + info["min"][quarter_start] = True - info_fmt[year_start] = '%Y' + info_fmt[year_start] = "%Y" else: nyears = span / periodsperyear @@ -845,9 +844,9 @@ def _monthly_finder(vmin, vmax, freq): years = dates_[year_start] // 12 + 1 major_idx = year_start[(years % maj_anndef == 0)] info_maj[major_idx] = True - info['min'][year_start[(years % min_anndef == 0)]] = True + info["min"][year_start[(years % min_anndef == 0)]] = True - info_fmt[major_idx] = '%Y' + info_fmt[major_idx] = "%Y" return info @@ -858,33 +857,33 @@ def _quarterly_finder(vmin, vmax, freq): (vmin, vmax) = (int(vmin), int(vmax)) span = vmax - vmin + 1 - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - info['fmt'] = '' - dates_ = info['val'] - info_maj = info['maj'] - info_fmt = info['fmt'] + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] + info_maj = info["maj"] + info_fmt = info["fmt"] year_start = (dates_ % 4 == 0).nonzero()[0] if span <= 3.5 * periodsperyear: info_maj[year_start] = True - info['min'] = True + info["min"] = True - info_fmt[:] = 'Q%q' - info_fmt[year_start] = 
'Q%q\n%F' + info_fmt[:] = "Q%q" + info_fmt[year_start] = "Q%q\n%F" if not has_level_label(year_start, vmin_orig): if dates_.size > 1: idx = 1 else: idx = 0 - info_fmt[idx] = 'Q%q\n%F' + info_fmt[idx] = "Q%q\n%F" elif span <= 11 * periodsperyear: info_maj[year_start] = True - info['min'] = True - info_fmt[year_start] = '%F' + info["min"] = True + info_fmt[year_start] = "%F" else: years = dates_[year_start] // 4 + 1 @@ -892,8 +891,8 @@ def _quarterly_finder(vmin, vmax, freq): (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) major_idx = year_start[(years % maj_anndef == 0)] info_maj[major_idx] = True - info['min'][year_start[(years % min_anndef == 0)]] = True - info_fmt[major_idx] = '%F' + info["min"][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = "%F" return info @@ -902,18 +901,18 @@ def _annual_finder(vmin, vmax, freq): (vmin, vmax) = (int(vmin), int(vmax + 1)) span = vmax - vmin + 1 - info = np.zeros(span, - dtype=[('val', int), ('maj', bool), ('min', bool), - ('fmt', '|S8')]) - info['val'] = np.arange(vmin, vmax + 1) - info['fmt'] = '' - dates_ = info['val'] + info = np.zeros( + span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")] + ) + info["val"] = np.arange(vmin, vmax + 1) + info["fmt"] = "" + dates_ = info["val"] (min_anndef, maj_anndef) = _get_default_annual_spacing(span) major_idx = dates_ % maj_anndef == 0 - info['maj'][major_idx] = True - info['min'][(dates_ % min_anndef == 0)] = True - info['fmt'][major_idx] = '%Y' + info["maj"][major_idx] = True + info["min"][(dates_ % min_anndef == 0)] = True + info["fmt"][major_idx] = "%Y" return info @@ -929,7 +928,7 @@ def get_finder(freq): return _quarterly_finder elif freq == FreqGroup.FR_MTH: return _monthly_finder - elif ((freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK): + elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover errmsg = "Unsupported frequency: {freq}".format(freq=freq) @@ -954,8 +953,17 @@ class TimeSeries_DateLocator(Locator): day : {int}, optional """ - def __init__(self, freq, minor_locator=False, dynamic_mode=True, - base=1, quarter=1, month=1, day=1, plot_obj=None): + def __init__( + self, + freq, + minor_locator=False, + dynamic_mode=True, + base=1, + quarter=1, + month=1, + day=1, + plot_obj=None, + ): if isinstance(freq, str): freq = get_freq(freq) self.freq = freq @@ -976,11 +984,11 @@ def _get_default_locs(self, vmin, vmax): locator = self.plot_obj.date_axis_info if self.isminor: - return np.compress(locator['min'], locator['val']) - return np.compress(locator['maj'], locator['val']) + return np.compress(locator["min"], locator["val"]) + return np.compress(locator["maj"], locator["val"]) def __call__(self): - 'Return the locations of the ticks.' + "Return the locations of the ticks." # axis calls Locator.set_axis inside set_m_formatter _check_implicitly_registered() @@ -1015,6 +1023,7 @@ def autoscale(self): vmax += 1 return nonsingular(vmin, vmax) + # ------------------------------------------------------------------------- # --- Formatter --- # ------------------------------------------------------------------------- @@ -1035,8 +1044,7 @@ class TimeSeries_DateFormatter(Formatter): Whether the formatter works in dynamic mode or not. 
""" - def __init__(self, freq, minor_locator=False, dynamic_mode=True, - plot_obj=None): + def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): if isinstance(freq, str): freq = get_freq(freq) self.format = None @@ -1057,15 +1065,14 @@ def _set_default_format(self, vmin, vmax): info = self.plot_obj.date_axis_info if self.isminor: - format = np.compress(info['min'] & np.logical_not(info['maj']), - info) + format = np.compress(info["min"] & np.logical_not(info["maj"]), info) else: - format = np.compress(info['maj'], info) + format = np.compress(info["maj"], info) self.formatdict = {x: f for (x, _, _, f) in format} return self.formatdict def set_locs(self, locs): - 'Sets the locations of the ticks' + "Sets the locations of the ticks" # don't actually use the locs. This is just needed to work with # matplotlib. Force to use vmin, vmax _check_implicitly_registered() @@ -1084,9 +1091,9 @@ def __call__(self, x, pos=0): _check_implicitly_registered() if self.formatdict is None: - return '' + return "" else: - fmt = self.formatdict.pop(x, '') + fmt = self.formatdict.pop(x, "") return Period(ordinal=int(x), freq=self.freq).strftime(fmt) @@ -1104,12 +1111,12 @@ def format_timedelta_ticks(x, pos, n_decimals): m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) - decimals = int(ns * 10**(n_decimals - 9)) - s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) if n_decimals > 0: - s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) if d != 0: - s = '{:d} days '.format(int(d)) + s + s = "{:d} days ".format(int(d)) + s return s def __call__(self, x, pos=0): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5fb4d201223bd..d25715e6d167b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -10,9 +10,19 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_hashable, is_integer, is_iterator, is_list_like, is_number) + is_hashable, + is_integer, + is_iterator, + is_list_like, + is_number, +) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex, ABCSeries) + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCPeriodIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna, notna import pandas.core.common as com @@ -22,10 +32,16 @@ from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( - _flatten, _get_all_lines, _get_xlim, _handle_shared_axes, _subplots, - format_date_labels, table) - -if get_option('plotting.matplotlib.register_converters'): + _flatten, + _get_all_lines, + _get_xlim, + _handle_shared_axes, + _subplots, + format_date_labels, + table, +) + +if get_option("plotting.matplotlib.register_converters"): converter.register(explicit=False) @@ -38,29 +54,63 @@ class MPLPlot: data : """ + @property def _kind(self): """Specify kind str. 
Must be overridden in child class""" raise NotImplementedError - _layout_type = 'vertical' + _layout_type = "vertical" _default_rot = 0 orientation = None # type: Optional[str] - _pop_attributes = ['label', 'style', 'logy', 'logx', 'loglog', - 'mark_right', 'stacked'] - _attr_defaults = {'logy': False, 'logx': False, 'loglog': False, - 'mark_right': True, 'stacked': False} - - def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, - sharey=False, use_index=True, - figsize=None, grid=None, legend=True, rot=None, - ax=None, fig=None, title=None, xlim=None, ylim=None, - xticks=None, yticks=None, - sort_columns=False, fontsize=None, - secondary_y=False, colormap=None, - table=False, layout=None, **kwds): + _pop_attributes = [ + "label", + "style", + "logy", + "logx", + "loglog", + "mark_right", + "stacked", + ] + _attr_defaults = { + "logy": False, + "logx": False, + "loglog": False, + "mark_right": True, + "stacked": False, + } + + def __init__( + self, + data, + kind=None, + by=None, + subplots=False, + sharex=None, + sharey=False, + use_index=True, + figsize=None, + grid=None, + legend=True, + rot=None, + ax=None, + fig=None, + title=None, + xlim=None, + ylim=None, + xticks=None, + yticks=None, + sort_columns=False, + fontsize=None, + secondary_y=False, + colormap=None, + table=False, + layout=None, + **kwds + ): import matplotlib.pyplot as plt + converter._WARN = False # no warning for pandas plots self.data = data self.by = by @@ -104,7 +154,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.rot = self._default_rot if grid is None: - grid = False if secondary_y else plt.rcParams['axes.grid'] + grid = False if secondary_y else plt.rcParams["axes.grid"] self.grid = grid self.legend = legend @@ -120,22 +170,23 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self.axes = None # parse errorbar input if given - xerr = kwds.pop('xerr', None) - yerr = kwds.pop('yerr', None) - self.errors = {kw: self._parse_errorbars(kw, err) - for kw, err in zip(['xerr', 'yerr'], [xerr, yerr])} - - if not isinstance(secondary_y, (bool, tuple, list, - np.ndarray, ABCIndexClass)): + xerr = kwds.pop("xerr", None) + yerr = kwds.pop("yerr", None) + self.errors = { + kw: self._parse_errorbars(kw, err) + for kw, err in zip(["xerr", "yerr"], [xerr, yerr]) + } + + if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndexClass)): secondary_y = [secondary_y] self.secondary_y = secondary_y # ugly TypeError if user passes matplotlib's `cmap` name. # Probably better to accept either. - if 'cmap' in kwds and colormap: + if "cmap" in kwds and colormap: raise TypeError("Only specify one of `cmap` and `colormap`.") - elif 'cmap' in kwds: - self.colormap = kwds.pop('cmap') + elif "cmap" in kwds: + self.colormap = kwds.pop("cmap") else: self.colormap = colormap @@ -146,40 +197,54 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, self._validate_color_args() def _validate_color_args(self): - if 'color' not in self.kwds and 'colors' in self.kwds: - warnings.warn(("'colors' is being deprecated. Please use 'color'" - "instead of 'colors'")) - colors = self.kwds.pop('colors') - self.kwds['color'] = colors - - if ('color' in self.kwds and self.nseries == 1 and - not is_list_like(self.kwds['color'])): + if "color" not in self.kwds and "colors" in self.kwds: + warnings.warn( + ( + "'colors' is being deprecated. 
Please use 'color'" + "instead of 'colors'" + ) + ) + colors = self.kwds.pop("colors") + self.kwds["color"] = colors + + if ( + "color" in self.kwds + and self.nseries == 1 + and not is_list_like(self.kwds["color"]) + ): # support series.plot(color='green') - self.kwds['color'] = [self.kwds['color']] - - if ('color' in self.kwds and isinstance(self.kwds['color'], tuple) and - self.nseries == 1 and len(self.kwds['color']) in (3, 4)): + self.kwds["color"] = [self.kwds["color"]] + + if ( + "color" in self.kwds + and isinstance(self.kwds["color"], tuple) + and self.nseries == 1 + and len(self.kwds["color"]) in (3, 4) + ): # support RGB and RGBA tuples in series plot - self.kwds['color'] = [self.kwds['color']] + self.kwds["color"] = [self.kwds["color"]] - if ('color' in self.kwds or 'colors' in self.kwds) and \ - self.colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") + if ( + "color" in self.kwds or "colors" in self.kwds + ) and self.colormap is not None: + warnings.warn( + "'color' and 'colormap' cannot be used " "simultaneously. Using 'color'" + ) - if 'color' in self.kwds and self.style is not None: + if "color" in self.kwds and self.style is not None: if is_list_like(self.style): styles = self.style else: styles = [self.style] # need only a single match for s in styles: - if re.match('^[a-z]+?', s) is not None: + if re.match("^[a-z]+?", s) is not None: raise ValueError( "Cannot pass 'style' string with a color " "symbol and 'color' keyword argument. Please" " use one or the other or pass 'style' " - "without a color symbol") + "without a color symbol" + ) def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -227,19 +292,17 @@ def _args_adjust(self): def _has_plotted_object(self, ax): """check whether ax has data""" - return (len(ax.lines) != 0 or - len(ax.artists) != 0 or - len(ax.containers) != 0) + return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 def _maybe_right_yaxis(self, ax, axes_num): if not self.on_right(axes_num): # secondary axes may be passed via ax kw return self._get_ax_layer(ax) - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): # if it has right_ax proparty, ``ax`` must be left axes return ax.right_ax - elif hasattr(ax, 'left_ax'): + elif hasattr(ax, "left_ax"): # if it has left_ax proparty, ``ax`` must be right axes return ax else: @@ -254,18 +317,22 @@ def _maybe_right_yaxis(self, ax, axes_num): orig_ax.get_yaxis().set_visible(False) if self.logy is True or self.loglog is True: - new_ax.set_yscale('log') - elif self.logy == 'sym' or self.loglog == 'sym': - new_ax.set_yscale('symlog') + new_ax.set_yscale("log") + elif self.logy == "sym" or self.loglog == "sym": + new_ax.set_yscale("symlog") return new_ax def _setup_subplots(self): if self.subplots: - fig, axes = _subplots(naxes=self.nseries, - sharex=self.sharex, sharey=self.sharey, - figsize=self.figsize, ax=self.ax, - layout=self.layout, - layout_type=self._layout_type) + fig, axes = _subplots( + naxes=self.nseries, + sharex=self.sharex, + sharey=self.sharey, + figsize=self.figsize, + ax=self.ax, + layout=self.layout, + layout_type=self._layout_type, + ) else: if self.ax is None: fig = self.plt.figure(figsize=self.figsize) @@ -278,7 +345,7 @@ def _setup_subplots(self): axes = _flatten(axes) - valid_log = {False, True, 'sym', None} + valid_log = {False, True, "sym", None} input_log = {self.logx, self.logy, self.loglog} if input_log - valid_log: invalid_log = next(iter((input_log - valid_log))) 
@@ -288,14 +355,14 @@ def _setup_subplots(self): ) if self.logx is True or self.loglog is True: - [a.set_xscale('log') for a in axes] - elif self.logx == 'sym' or self.loglog == 'sym': - [a.set_xscale('symlog') for a in axes] + [a.set_xscale("log") for a in axes] + elif self.logx == "sym" or self.loglog == "sym": + [a.set_xscale("symlog") for a in axes] if self.logy is True or self.loglog is True: - [a.set_yscale('log') for a in axes] - elif self.logy == 'sym' or self.loglog == 'sym': - [a.set_yscale('symlog') for a in axes] + [a.set_yscale("log") for a in axes] + elif self.logy == "sym" or self.loglog == "sym": + [a.set_yscale("symlog") for a in axes] self.fig = fig self.axes = axes @@ -312,9 +379,10 @@ def result(self): return self.axes else: sec_true = isinstance(self.secondary_y, bool) and self.secondary_y - all_sec = (is_list_like(self.secondary_y) and - len(self.secondary_y) == self.nseries) - if (sec_true or all_sec): + all_sec = ( + is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries + ) + if sec_true or all_sec: # if all data is plotted on secondary, return right axes return self._get_ax_layer(self.axes[0], primary=False) else: @@ -326,16 +394,15 @@ def _compute_plot_data(self): if isinstance(data, ABCSeries): label = self.label if label is None and data.name is None: - label = 'None' + label = "None" data = data.to_frame(name=label) # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) - numeric_data = data.select_dtypes(include=[np.number, - "datetime", - "datetimetz", - "timedelta"]) + numeric_data = data.select_dtypes( + include=[np.number, "datetime", "datetimetz", "timedelta"] + ) try: is_empty = numeric_data.empty @@ -344,7 +411,7 @@ def _compute_plot_data(self): # no non-numeric frames or series allowed if is_empty: - raise TypeError('no numeric data to plot') + raise TypeError("no numeric data to plot") # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to # np.ndarray before plot. 
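The _compute_plot_data hunk directly above keeps only numeric, datetime, datetimetz and timedelta columns and raises TypeError("no numeric data to plot") when nothing usable remains. A minimal sketch of that same filtering step outside the plotting code, using the exact select_dtypes call shown in the hunk; illustrative only, not part of the patch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "a": [1.0, 2.0, 3.0],                     # numeric, kept
            "b": pd.to_datetime(["2019-07-01"] * 3),  # datetime, kept
            "c": ["x", "y", "z"],                     # object, dropped
        }
    )

    numeric_data = df.select_dtypes(
        include=[np.number, "datetime", "datetimetz", "timedelta"]
    )
    if numeric_data.empty:
        raise TypeError("no numeric data to plot")
    print(list(numeric_data.columns))  # ['a', 'b']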
@@ -370,23 +437,19 @@ def _add_table(self): def _post_plot_logic_common(self, ax, data): """Common post process for each axes""" - if self.orientation == 'vertical' or self.orientation is None: - self._apply_axis_properties(ax.xaxis, rot=self.rot, - fontsize=self.fontsize) + if self.orientation == "vertical" or self.orientation is None: + self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) - if hasattr(ax, 'right_ax'): - self._apply_axis_properties(ax.right_ax.yaxis, - fontsize=self.fontsize) + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) - elif self.orientation == 'horizontal': - self._apply_axis_properties(ax.yaxis, rot=self.rot, - fontsize=self.fontsize) + elif self.orientation == "horizontal": + self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize) - if hasattr(ax, 'right_ax'): - self._apply_axis_properties(ax.right_ax.yaxis, - fontsize=self.fontsize) + if hasattr(ax, "right_ax"): + self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize) else: # pragma no cover raise ValueError @@ -399,10 +462,15 @@ def _adorn_subplots(self): if len(self.axes) > 0: all_axes = self._get_subplots() nrows, ncols = self._get_axes_layout() - _handle_shared_axes(axarr=all_axes, nplots=len(all_axes), - naxes=nrows * ncols, nrows=nrows, - ncols=ncols, sharex=self.sharex, - sharey=self.sharey) + _handle_shared_axes( + axarr=all_axes, + nplots=len(all_axes), + naxes=nrows * ncols, + nrows=nrows, + ncols=ncols, + sharex=self.sharex, + sharey=self.sharey, + ) for ax in self.axes: if self.yticks is not None: @@ -423,12 +491,13 @@ def _adorn_subplots(self): if self.subplots: if is_list_like(self.title): if len(self.title) != self.nseries: - msg = ('The length of `title` must equal the number ' - 'of columns if using `title` of type `list` ' - 'and `subplots=True`.\n' - 'length of title = {}\n' - 'number of columns = {}').format( - len(self.title), self.nseries) + msg = ( + "The length of `title` must equal the number " + "of columns if using `title` of type `list` " + "and `subplots=True`.\n" + "length of title = {}\n" + "number of columns = {}" + ).format(len(self.title), self.nseries) raise ValueError(msg) for (ax, title) in zip(self.axes, self.title): @@ -437,8 +506,10 @@ def _adorn_subplots(self): self.fig.suptitle(self.title) else: if is_list_like(self.title): - msg = ('Using `title` of type `list` is not supported ' - 'unless `subplots=True` is passed') + msg = ( + "Using `title` of type `list` is not supported " + "unless `subplots=True` is passed" + ) raise ValueError(msg) self.axes[0].set_title(self.title) @@ -466,15 +537,14 @@ def legend_title(self): name = pprint_thing(name) return name else: - stringified = map(pprint_thing, - self.data.columns.names) - return ','.join(stringified) + stringified = map(pprint_thing, self.data.columns.names) + return ",".join(stringified) def _add_legend_handle(self, handle, label, index=None): if label is not None: if self.mark_right and index is not None: if self.on_right(index): - label = label + ' (right)' + label = label + " (right)" self.legend_handles.append(handle) self.legend_labels.append(label) @@ -483,7 +553,7 @@ def _make_legend(self): handles = [] labels = [] - title = '' + title = "" if not self.subplots: if leg is not None: @@ -492,7 +562,7 @@ def _make_legend(self): labels = [x.get_text() for x in leg.get_texts()] if 
self.legend: - if self.legend == 'reverse': + if self.legend == "reverse": self.legend_handles = reversed(self.legend_handles) self.legend_labels = reversed(self.legend_labels) @@ -502,17 +572,16 @@ def _make_legend(self): title = self.legend_title if len(handles) > 0: - ax.legend(handles, labels, loc='best', title=title) + ax.legend(handles, labels, loc="best", title=title) elif self.subplots and self.legend: for ax in self.axes: if ax.get_visible(): - ax.legend(loc='best') + ax.legend(loc="best") def _get_ax_legend(self, ax): leg = ax.get_legend() - other_ax = (getattr(ax, 'left_ax', None) or - getattr(ax, 'right_ax', None)) + other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None) other_leg = None if other_ax is not None: other_leg = other_ax.get_legend() @@ -524,14 +593,14 @@ def _get_ax_legend(self, ax): @cache_readonly def plt(self): import matplotlib.pyplot as plt + return plt _need_to_set_index = False def _get_xticks(self, convert_period=False): index = self.data.index - is_datetype = index.inferred_type in ('datetime', 'date', - 'datetime64', 'time') + is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") if self.use_index: if convert_period and isinstance(index, ABCPeriodIndex): @@ -568,10 +637,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): x = x._mpl_repr() if is_errorbar: - if 'xerr' in kwds: - kwds['xerr'] = np.array(kwds.get('xerr')) - if 'yerr' in kwds: - kwds['yerr'] = np.array(kwds.get('yerr')) + if "xerr" in kwds: + kwds["xerr"] = np.array(kwds.get("xerr")) + if "yerr" in kwds: + kwds["yerr"] = np.array(kwds.get("yerr")) return ax.errorbar(x, y, **kwds) else: # prevent style kwarg from going to errorbar, where it is @@ -586,7 +655,7 @@ def _get_index_name(self): if isinstance(self.data.index, ABCMultiIndex): name = self.data.index.names if com._any_not_none(*name): - name = ','.join(pprint_thing(x) for x in name) + name = ",".join(pprint_thing(x) for x in name) else: name = None else: @@ -600,9 +669,9 @@ def _get_index_name(self): def _get_ax_layer(cls, ax, primary=True): """get left (primary) or right (secondary) axes""" if primary: - return getattr(ax, 'left_ax', ax) + return getattr(ax, "left_ax", ax) else: - return getattr(ax, 'right_ax', ax) + return getattr(ax, "right_ax", ax) def _get_ax(self, i): # get the twinx ax if appropriate @@ -630,8 +699,7 @@ def on_right(self, i): if isinstance(self.secondary_y, bool): return self.secondary_y - if isinstance(self.secondary_y, (tuple, list, - np.ndarray, ABCIndexClass)): + if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndexClass)): return self.data.columns[i] in self.secondary_y def _apply_style_colors(self, colors, kwds, col_num, label): @@ -651,19 +719,21 @@ def _apply_style_colors(self, colors, kwds, col_num, label): else: style = self.style - has_color = 'color' in kwds or self.colormap is not None - nocolor_style = style is None or re.match('[a-z]+', style) is None + has_color = "color" in kwds or self.colormap is not None + nocolor_style = style is None or re.match("[a-z]+", style) is None if (has_color or self.subplots) and nocolor_style: - kwds['color'] = colors[col_num % len(colors)] + kwds["color"] = colors[col_num % len(colors)] return style, kwds - def _get_colors(self, num_colors=None, color_kwds='color'): + def _get_colors(self, num_colors=None, color_kwds="color"): if num_colors is None: num_colors = self.nseries - return _get_standard_colors(num_colors=num_colors, - colormap=self.colormap, - color=self.kwds.get(color_kwds)) + 
return _get_standard_colors( + num_colors=num_colors, + colormap=self.colormap, + color=self.kwds.get(color_kwds), + ) def _parse_errorbars(self, label, err): """ @@ -719,12 +789,15 @@ def match_labels(data, e): # asymmetrical error bars if err.ndim == 3: - if (err_shape[0] != self.nseries) or \ - (err_shape[1] != 2) or \ - (err_shape[2] != len(self.data)): - msg = "Asymmetrical error bars should be provided " + \ - "with the shape (%u, 2, %u)" % \ - (self.nseries, len(self.data)) + if ( + (err_shape[0] != self.nseries) + or (err_shape[1] != 2) + or (err_shape[2] != len(self.data)) + ): + msg = ( + "Asymmetrical error bars should be provided " + + "with the shape (%u, 2, %u)" % (self.nseries, len(self.data)) + ) raise ValueError(msg) # broadcast errors to each data series @@ -743,7 +816,7 @@ def match_labels(data, e): def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): errors = {} - for kw, flag in zip(['xerr', 'yerr'], [xerr, yerr]): + for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]): if flag: err = self.errors[kw] # user provided label-matched dataframe of errors @@ -761,8 +834,10 @@ def _get_errorbars(self, label=None, index=None, xerr=True, yerr=True): def _get_subplots(self): from matplotlib.axes import Subplot - return [ax for ax in self.axes[0].get_figure().get_axes() - if isinstance(ax, Subplot)] + + return [ + ax for ax in self.axes[0].get_figure().get_axes() if isinstance(ax, Subplot) + ] def _get_axes_layout(self): axes = self._get_subplots() @@ -781,20 +856,20 @@ class PlanePlot(MPLPlot): Abstract class for plotting on plane, currently scatter and hexbin. """ - _layout_type = 'single' + _layout_type = "single" def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: - raise ValueError(self._kind + ' requires an x and y column') + raise ValueError(self._kind + " requires an x and y column") if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + ' requires x column to be numeric') + raise ValueError(self._kind + " requires x column to be numeric") if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + ' requires y column to be numeric') + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y @@ -830,10 +905,14 @@ def _plot_colorbar(self, ax, **kwds): points = ax.get_position().get_points() cbar_points = cbar.ax.get_position().get_points() - cbar.ax.set_position([cbar_points[0, 0], - points[0, 1], - cbar_points[1, 0] - cbar_points[0, 0], - points[1, 1] - points[0, 1]]) + cbar.ax.set_position( + [ + cbar_points[0, 0], + points[0, 1], + cbar_points[1, 0] - cbar_points[0, 0], + points[1, 1] - points[0, 1], + ] + ) # To see the discrepancy in axis heights uncomment # the following two lines: # print(points[1, 1] - points[0, 1]) @@ -841,7 +920,7 @@ def _plot_colorbar(self, ax, **kwds): class ScatterPlot(PlanePlot): - _kind = 'scatter' + _kind = "scatter" def __init__(self, data, x, y, s=None, c=None, **kwargs): if s is None: @@ -860,16 +939,16 @@ def _make_plot(self): c_is_column = is_hashable(c) and c in self.data.columns # plot a colorbar only if a colormap is provided or necessary - cb = self.kwds.pop('colorbar', self.colormap or c_is_column) + cb = self.kwds.pop("colorbar", self.colormap or c_is_column) # pandas uses colormap, matplotlib uses cmap. 
- cmap = self.colormap or 'Greys' + cmap = self.colormap or "Greys" cmap = self.plt.cm.get_cmap(cmap) color = self.kwds.pop("color", None) if c is not None and color is not None: - raise TypeError('Specify exactly one of `c` and `color`') + raise TypeError("Specify exactly one of `c` and `color`") elif c is None and color is None: - c_values = self.plt.rcParams['patch.facecolor'] + c_values = self.plt.rcParams["patch.facecolor"] elif color is not None: c_values = color elif c_is_column: @@ -877,14 +956,20 @@ def _make_plot(self): else: c_values = c - if self.legend and hasattr(self, 'label'): + if self.legend and hasattr(self, "label"): label = self.label else: label = None - scatter = ax.scatter(data[x].values, data[y].values, c=c_values, - label=label, cmap=cmap, **self.kwds) + scatter = ax.scatter( + data[x].values, + data[y].values, + c=c_values, + label=label, + cmap=cmap, + **self.kwds + ) if cb: - cbar_label = c if c_is_column else '' + cbar_label = c if c_is_column else "" self._plot_colorbar(ax, label=cbar_label) if label is not None: @@ -896,13 +981,12 @@ def _make_plot(self): errors_y = self._get_errorbars(label=y, index=0, xerr=False) if len(errors_x) > 0 or len(errors_y) > 0: err_kwds = dict(errors_x, **errors_y) - err_kwds['ecolor'] = scatter.get_facecolor()[0] - ax.errorbar(data[x].values, data[y].values, - linestyle='none', **err_kwds) + err_kwds["ecolor"] = scatter.get_facecolor()[0] + ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds) class HexBinPlot(PlanePlot): - _kind = 'hexbin' + _kind = "hexbin" def __init__(self, data, x, y, C=None, **kwargs): super().__init__(data, x, y, **kwargs) @@ -914,17 +998,16 @@ def _make_plot(self): x, y, data, C = self.x, self.y, self.data, self.C ax = self.axes[0] # pandas uses colormap, matplotlib uses cmap. 
- cmap = self.colormap or 'BuGn' + cmap = self.colormap or "BuGn" cmap = self.plt.cm.get_cmap(cmap) - cb = self.kwds.pop('colorbar', True) + cb = self.kwds.pop("colorbar", True) if C is None: c_values = None else: c_values = data[C].values - ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, - **self.kwds) + ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds) if cb: self._plot_colorbar(ax) @@ -933,18 +1016,19 @@ def _make_legend(self): class LinePlot(MPLPlot): - _kind = 'line' + _kind = "line" _default_rot = 0 - orientation = 'vertical' + orientation = "vertical" def __init__(self, data, **kwargs): from pandas.plotting import plot_params + MPLPlot.__init__(self, data, **kwargs) if self.stacked: self.data = self.data.fillna(value=0) - self.x_compat = plot_params['x_compat'] - if 'x_compat' in self.kwds: - self.x_compat = bool(self.kwds.pop('x_compat')) + self.x_compat = plot_params["x_compat"] + if "x_compat" in self.kwds: + self.x_compat = bool(self.kwds.pop("x_compat")) def _is_ts_plot(self): # this is slightly deceptive @@ -952,15 +1036,16 @@ def _is_ts_plot(self): def _use_dynamic_x(self): from pandas.plotting._matplotlib.timeseries import _use_dynamic_x + return _use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self): if self._is_ts_plot(): - from pandas.plotting._matplotlib.timeseries import ( - _maybe_convert_index) + from pandas.plotting._matplotlib.timeseries import _maybe_convert_index + data = _maybe_convert_index(self._get_ax(0), self.data) - x = data.index # dummy, not used + x = data.index # dummy, not used plotf = self._ts_plot it = self._iter_data(data=data, keep_index=True) else: @@ -981,12 +1066,18 @@ def _make_plot(self): kwds = dict(kwds, **errors) label = pprint_thing(label) # .encode('utf-8') - kwds['label'] = label - - newlines = plotf(ax, x, y, style=style, column_num=i, - stacking_id=stacking_id, - is_errorbar=is_errorbar, - **kwds) + kwds["label"] = label + + newlines = plotf( + ax, + x, + y, + style=style, + column_num=i, + stacking_id=stacking_id, + is_errorbar=is_errorbar, + **kwds + ) self._add_legend_handle(newlines[0], label, index=i) lines = _get_all_lines(ax) @@ -994,22 +1085,24 @@ def _make_plot(self): ax.set_xlim(left, right) @classmethod - def _plot(cls, ax, x, y, style=None, column_num=None, - stacking_id=None, **kwds): + def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): # column_num is used to get the target column from protf in line and # area plots if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) - y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) cls._update_stacker(ax, stacking_id, y) return lines @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): - from pandas.plotting._matplotlib.timeseries import (_maybe_resample, - _decorate_axes, - format_dateaxis) + from pandas.plotting._matplotlib.timeseries import ( + _maybe_resample, + _decorate_axes, + format_dateaxis, + ) + # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose @@ -1018,9 +1111,9 @@ def _ts_plot(cls, ax, x, data, style=None, **kwds): # Set ax with freq info _decorate_axes(ax, freq, kwds) # digging deeper - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): _decorate_axes(ax.left_ax, freq, kwds) - if hasattr(ax, 
'right_ax'): + if hasattr(ax, "right_ax"): _decorate_axes(ax.right_ax, freq, kwds) ax._plot_data.append((data, cls._kind, kwds)) @@ -1039,9 +1132,9 @@ def _get_stacking_id(self): def _initialize_stacker(cls, ax, stacking_id, n): if stacking_id is None: return - if not hasattr(ax, '_stacker_pos_prior'): + if not hasattr(ax, "_stacker_pos_prior"): ax._stacker_pos_prior = {} - if not hasattr(ax, '_stacker_neg_prior'): + if not hasattr(ax, "_stacker_neg_prior"): ax._stacker_neg_prior = {} ax._stacker_pos_prior[stacking_id] = np.zeros(n) ax._stacker_neg_prior[stacking_id] = np.zeros(n) @@ -1050,7 +1143,7 @@ def _initialize_stacker(cls, ax, stacking_id, n): def _get_stacked_values(cls, ax, stacking_id, values, label): if stacking_id is None: return values - if not hasattr(ax, '_stacker_pos_prior'): + if not hasattr(ax, "_stacker_pos_prior"): # stacker may not be initialized for subplots cls._initialize_stacker(ax, stacking_id, len(values)) @@ -1059,10 +1152,11 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): elif (values <= 0).all(): return ax._stacker_neg_prior[stacking_id] + values - raise ValueError('When stacked is True, each column must be either ' - 'all positive or negative.' - '{0} contains both positive and negative values' - .format(label)) + raise ValueError( + "When stacked is True, each column must be either " + "all positive or negative." + "{0} contains both positive and negative values".format(label) + ) @classmethod def _update_stacker(cls, ax, stacking_id, values): @@ -1080,7 +1174,7 @@ def get_label(i): try: return pprint_thing(data.index[i]) except Exception: - return '' + return "" if self._need_to_set_index: xticks = ax.get_xticks() @@ -1088,10 +1182,12 @@ def get_label(i): ax.set_xticklabels(xticklabels) ax.xaxis.set_major_locator(FixedLocator(xticks)) - condition = (not self._use_dynamic_x() and - data.index.is_all_dates and - not self.subplots or - (self.subplots and self.sharex)) + condition = ( + not self._use_dynamic_x() + and data.index.is_all_dates + and not self.subplots + or (self.subplots and self.sharex) + ) index_name = self._get_index_name() @@ -1107,31 +1203,40 @@ def get_label(i): class AreaPlot(LinePlot): - _kind = 'area' + _kind = "area" def __init__(self, data, **kwargs): - kwargs.setdefault('stacked', True) + kwargs.setdefault("stacked", True) data = data.fillna(value=0) LinePlot.__init__(self, data, **kwargs) if not self.stacked: # use smaller alpha to distinguish overlap - self.kwds.setdefault('alpha', 0.5) + self.kwds.setdefault("alpha", 0.5) if self.logy or self.loglog: raise ValueError("Log-y scales are not supported in area plot") @classmethod - def _plot(cls, ax, x, y, style=None, column_num=None, - stacking_id=None, is_errorbar=False, **kwds): + def _plot( + cls, + ax, + x, + y, + style=None, + column_num=None, + stacking_id=None, + is_errorbar=False, + **kwds + ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) - y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) + y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"]) # need to remove label, because subplots uses mpl legend as it is line_kwds = kwds.copy() - line_kwds.pop('label') + line_kwds.pop("label") lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds) # get data from the line to get coordinates for fill_between @@ -1147,8 +1252,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, else: start = np.zeros(len(y)) - if 'color' not in kwds: - kwds['color'] = lines[0].get_color() + if "color" not in kwds: + 
kwds["color"] = lines[0].get_color() rect = ax.fill_between(xdata, start, y_values, **kwds) cls._update_stacker(ax, stacking_id, y) @@ -1168,33 +1273,33 @@ def _post_plot_logic(self, ax, data): class BarPlot(MPLPlot): - _kind = 'bar' + _kind = "bar" _default_rot = 90 - orientation = 'vertical' + orientation = "vertical" def __init__(self, data, **kwargs): # we have to treat a series differently than a # 1-column DataFrame w.r.t. color handling self._is_series = isinstance(data, ABCSeries) - self.bar_width = kwargs.pop('width', 0.5) - pos = kwargs.pop('position', 0.5) - kwargs.setdefault('align', 'center') + self.bar_width = kwargs.pop("width", 0.5) + pos = kwargs.pop("position", 0.5) + kwargs.setdefault("align", "center") self.tick_pos = np.arange(len(data)) - self.bottom = kwargs.pop('bottom', 0) - self.left = kwargs.pop('left', 0) + self.bottom = kwargs.pop("bottom", 0) + self.left = kwargs.pop("left", 0) - self.log = kwargs.pop('log', False) + self.log = kwargs.pop("log", False) MPLPlot.__init__(self, data, **kwargs) if self.stacked or self.subplots: self.tickoffset = self.bar_width * pos - if kwargs['align'] == 'edge': + if kwargs["align"] == "edge": self.lim_offset = self.bar_width / 2 else: self.lim_offset = 0 else: - if kwargs['align'] == 'edge': + if kwargs["align"] == "edge": w = self.bar_width / self.nseries self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5 self.lim_offset = w * 0.5 @@ -1231,18 +1336,17 @@ def _make_plot(self): ax = self._get_ax(i) kwds = self.kwds.copy() if self._is_series: - kwds['color'] = colors + kwds["color"] = colors else: - kwds['color'] = colors[i % ncolors] + kwds["color"] = colors[i % ncolors] errors = self._get_errorbars(label=label, index=i) kwds = dict(kwds, **errors) label = pprint_thing(label) - if (('yerr' in kwds) or ('xerr' in kwds)) \ - and (kwds.get('ecolor') is None): - kwds['ecolor'] = mpl.rcParams['xtick.color'] + if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None): + kwds["ecolor"] = mpl.rcParams["xtick.color"] start = 0 if self.log and (y >= 1).all(): @@ -1251,24 +1355,45 @@ def _make_plot(self): if self.subplots: w = self.bar_width / 2 - rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds + ) ax.set_title(label) elif self.stacked: mask = y > 0 start = np.where(mask, pos_prior, neg_prior) + self._start_base w = self.bar_width / 2 - rect = self._plot(ax, self.ax_pos + w, y, self.bar_width, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + w, + y, + self.bar_width, + start=start, + label=label, + log=self.log, + **kwds + ) pos_prior = pos_prior + np.where(mask, y, 0) neg_prior = neg_prior + np.where(mask, 0, y) else: w = self.bar_width / K - rect = self._plot(ax, self.ax_pos + (i + 0.5) * w, y, w, - start=start, label=label, - log=self.log, **kwds) + rect = self._plot( + ax, + self.ax_pos + (i + 0.5) * w, + y, + w, + start=start, + label=label, + log=self.log, + **kwds + ) self._add_legend_handle(rect, label, index=i) def _post_plot_logic(self, ax, data): @@ -1292,9 +1417,9 @@ def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): class BarhPlot(BarPlot): - _kind = 'barh' + _kind = "barh" _default_rot = 0 - orientation = 'horizontal' + orientation = "horizontal" @property def _start_base(self): @@ -1314,8 +1439,8 @@ def _decorate_ticks(self, ax, name, ticklabels, 
start_edge, end_edge): class PiePlot(MPLPlot): - _kind = 'pie' - _layout_type = 'horizontal' + _kind = "pie" + _layout_type = "horizontal" def __init__(self, data, kind=None, **kwargs): data = data.fillna(value=0) @@ -1333,9 +1458,8 @@ def _validate_color_args(self): pass def _make_plot(self): - colors = self._get_colors( - num_colors=len(self.data), color_kwds='colors') - self.kwds.setdefault('colors', colors) + colors = self._get_colors(num_colors=len(self.data), color_kwds="colors") + self.kwds.setdefault("colors", colors) for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) @@ -1347,23 +1471,22 @@ def _make_plot(self): def blank_labeler(label, value): if value == 0: - return '' + return "" else: return label idx = [pprint_thing(v) for v in self.data.index] - labels = kwds.pop('labels', idx) + labels = kwds.pop("labels", idx) # labels is used for each wedge's labels # Blank out labels for values of 0 so they don't overlap # with nonzero wedges if labels is not None: - blabels = [blank_labeler(l, value) for - l, value in zip(labels, y)] + blabels = [blank_labeler(l, value) for l, value in zip(labels, y)] else: blabels = None results = ax.pie(y, labels=blabels, **kwds) - if kwds.get('autopct', None) is not None: + if kwds.get("autopct", None) is not None: patches, texts, autotexts = results else: patches, texts = results diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index d34c0cb6a3889..5213e09f14067 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -11,15 +11,14 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.tools import ( - _flatten, _set_ticks_props, _subplots) +from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots class HistPlot(LinePlot): - _kind = 'hist' + _kind = "hist" def __init__(self, data, bins=10, bottom=0, **kwargs): - self.bins = bins # use mpl default + self.bins = bins # use mpl default self.bottom = bottom # Do not call LinePlot.__init__ which may fill nan MPLPlot.__init__(self, data, **kwargs) @@ -27,28 +26,38 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): def _args_adjust(self): if is_integer(self.bins): # create common bin edge - values = (self.data._convert(datetime=True)._get_numeric_data()) + values = self.data._convert(datetime=True)._get_numeric_data() values = np.ravel(values) values = values[~isna(values)] hist, self.bins = np.histogram( - values, bins=self.bins, - range=self.kwds.get('range', None), - weights=self.kwds.get('weights', None)) + values, + bins=self.bins, + range=self.kwds.get("range", None), + weights=self.kwds.get("weights", None), + ) if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @classmethod - def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, - stacking_id=None, **kwds): + def _plot( + cls, + ax, + y, + style=None, + bins=None, + bottom=0, + column_num=0, + stacking_id=None, + **kwds + ): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) y = y[~isna(y)] base = np.zeros(len(bins) - 1) - bottom = bottom + \ - cls._get_stacked_values(ax, stacking_id, base, kwds['label']) + bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"]) # ignore style n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds) cls._update_stacker(ax, stacking_id, n) @@ -64,41 +73,40 @@ 
def _make_plot(self): kwds = self.kwds.copy() label = pprint_thing(label) - kwds['label'] = label + kwds["label"] = label style, kwds = self._apply_style_colors(colors, kwds, i, label) if style is not None: - kwds['style'] = style + kwds["style"] = style kwds = self._make_plot_keywords(kwds, y) - artists = self._plot(ax, y, column_num=i, - stacking_id=stacking_id, **kwds) + artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) self._add_legend_handle(artists[0], label, index=i) def _make_plot_keywords(self, kwds, y): """merge BoxPlot/KdePlot properties to passed kwds""" # y is required for KdePlot - kwds['bottom'] = self.bottom - kwds['bins'] = self.bins + kwds["bottom"] = self.bottom + kwds["bins"] = self.bins return kwds def _post_plot_logic(self, ax, data): - if self.orientation == 'horizontal': - ax.set_xlabel('Frequency') + if self.orientation == "horizontal": + ax.set_xlabel("Frequency") else: - ax.set_ylabel('Frequency') + ax.set_ylabel("Frequency") @property def orientation(self): - if self.kwds.get('orientation', None) == 'horizontal': - return 'horizontal' + if self.kwds.get("orientation", None) == "horizontal": + return "horizontal" else: - return 'vertical' + return "vertical" class KdePlot(HistPlot): - _kind = 'kde' - orientation = 'vertical' + _kind = "kde" + orientation = "vertical" def __init__(self, data, bw_method=None, ind=None, **kwargs): MPLPlot.__init__(self, data, **kwargs) @@ -112,19 +120,34 @@ def _get_ind(self, y): if self.ind is None: # np.nanmax() and np.nanmin() ignores the missing values sample_range = np.nanmax(y) - np.nanmin(y) - ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, - np.nanmax(y) + 0.5 * sample_range, 1000) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + 1000, + ) elif is_integer(self.ind): sample_range = np.nanmax(y) - np.nanmin(y) - ind = np.linspace(np.nanmin(y) - 0.5 * sample_range, - np.nanmax(y) + 0.5 * sample_range, self.ind) + ind = np.linspace( + np.nanmin(y) - 0.5 * sample_range, + np.nanmax(y) + 0.5 * sample_range, + self.ind, + ) else: ind = self.ind return ind @classmethod - def _plot(cls, ax, y, style=None, bw_method=None, ind=None, - column_num=None, stacking_id=None, **kwds): + def _plot( + cls, + ax, + y, + style=None, + bw_method=None, + ind=None, + column_num=None, + stacking_id=None, + **kwds + ): from scipy.stats import gaussian_kde y = remove_na_arraylike(y) @@ -135,22 +158,36 @@ def _plot(cls, ax, y, style=None, bw_method=None, ind=None, return lines def _make_plot_keywords(self, kwds, y): - kwds['bw_method'] = self.bw_method - kwds['ind'] = self._get_ind(y) + kwds["bw_method"] = self.bw_method + kwds["ind"] = self._get_ind(y) return kwds def _post_plot_logic(self, ax, data): - ax.set_ylabel('Density') - - -def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, - figsize=None, sharex=True, sharey=True, layout=None, - rot=0, ax=None, **kwargs): - - if figsize == 'default': + ax.set_ylabel("Density") + + +def _grouped_plot( + plotf, + data, + column=None, + by=None, + numeric_only=True, + figsize=None, + sharex=True, + sharey=True, + layout=None, + rot=0, + ax=None, + **kwargs +): + + if figsize == "default": # allowed to specify mpl default with 'default' - warnings.warn("figsize='default' is deprecated. Specify figure " - "size by tuple instead", FutureWarning, stacklevel=5) + warnings.warn( + "figsize='default' is deprecated. 
Specify figure " "size by tuple instead", + FutureWarning, + stacklevel=5, + ) figsize = None grouped = data.groupby(by) @@ -158,9 +195,9 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, grouped = grouped[column] naxes = len(grouped) - fig, axes = _subplots(naxes=naxes, figsize=figsize, - sharex=sharex, sharey=sharey, ax=ax, - layout=layout) + fig, axes = _subplots( + naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout + ) _axes = _flatten(axes) @@ -174,10 +211,24 @@ def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, return fig, axes -def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, - layout=None, sharex=False, sharey=False, rot=90, grid=True, - xlabelsize=None, xrot=None, ylabelsize=None, yrot=None, - **kwargs): +def _grouped_hist( + data, + column=None, + by=None, + ax=None, + bins=50, + figsize=None, + layout=None, + sharex=False, + sharey=False, + rot=90, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + **kwargs +): """ Grouped histogram @@ -200,76 +251,139 @@ def _grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None, ------- collection of Matplotlib Axes """ + def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) converter._WARN = False # no warning for pandas plots xrot = xrot or rot - fig, axes = _grouped_plot(plot_group, data, column=column, - by=by, sharex=sharex, sharey=sharey, ax=ax, - figsize=figsize, layout=layout, rot=rot) - - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) - - fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, - hspace=0.5, wspace=0.3) + fig, axes = _grouped_plot( + plot_group, + data, + column=column, + by=by, + sharex=sharex, + sharey=sharey, + ax=ax, + figsize=figsize, + layout=layout, + rot=rot, + ) + + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) + + fig.subplots_adjust( + bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3 + ) return axes -def hist_series(self, by=None, ax=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, figsize=None, - bins=10, **kwds): +def hist_series( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds +): import matplotlib.pyplot as plt + if by is None: - if kwds.get('layout', None) is not None: - raise ValueError("The 'layout' keyword is not supported when " - "'by' is None") + if kwds.get("layout", None) is not None: + raise ValueError( + "The 'layout' keyword is not supported when " "'by' is None" + ) # hack until the plotting interface is a bit more unified - fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else - plt.figure(figsize=figsize)) - if (figsize is not None and tuple(figsize) != - tuple(fig.get_size_inches())): + fig = kwds.pop( + "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize) + ) + if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()): fig.set_size_inches(*figsize, forward=True) if ax is None: ax = fig.gca() elif ax.get_figure() != fig: - raise AssertionError('passed axis not bound to passed figure') + raise AssertionError("passed axis not bound to passed figure") values = self.dropna().values ax.hist(values, bins=bins, **kwds) ax.grid(grid) axes = np.array([ax]) - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, 
yrot=yrot) + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) else: - if 'figure' in kwds: - raise ValueError("Cannot pass 'figure' when using the " - "'by' argument, since a new 'Figure' instance " - "will be created") - axes = _grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize, - bins=bins, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot, **kwds) - - if hasattr(axes, 'ndim'): + if "figure" in kwds: + raise ValueError( + "Cannot pass 'figure' when using the " + "'by' argument, since a new 'Figure' instance " + "will be created" + ) + axes = _grouped_hist( + self, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds + ) + + if hasattr(axes, "ndim"): if axes.ndim == 1 and len(axes) == 1: return axes[0] return axes -def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, - xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False, - sharey=False, figsize=None, layout=None, bins=10, **kwds): +def hist_frame( + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds +): converter._WARN = False # no warning for pandas plots if by is not None: - axes = _grouped_hist(data, column=column, by=by, ax=ax, grid=grid, - figsize=figsize, sharex=sharex, sharey=sharey, - layout=layout, bins=bins, xlabelsize=xlabelsize, - xrot=xrot, ylabelsize=ylabelsize, - yrot=yrot, **kwds) + axes = _grouped_hist( + data, + column=column, + by=by, + ax=ax, + grid=grid, + figsize=figsize, + sharex=sharex, + sharey=sharey, + layout=layout, + bins=bins, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + **kwds + ) return axes if column is not None: @@ -280,12 +394,17 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, naxes = len(data.columns) if naxes == 0: - raise ValueError("hist method requires numerical columns, " - "nothing to plot.") - - fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False, - sharex=sharex, sharey=sharey, figsize=figsize, - layout=layout) + raise ValueError("hist method requires numerical columns, " "nothing to plot.") + + fig, axes = _subplots( + naxes=naxes, + ax=ax, + squeeze=False, + sharex=sharex, + sharey=sharey, + figsize=figsize, + layout=layout, + ) _axes = _flatten(axes) for i, col in enumerate(com.try_sort(data.columns)): @@ -294,8 +413,9 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, ax.set_title(col) ax.grid(grid) - _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot, - ylabelsize=ylabelsize, yrot=yrot) + _set_ticks_props( + axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot + ) fig.subplots_adjust(wspace=0.3, hspace=0.3) return axes diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 663a3c5153fac..6d5a94c4d5ff8 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -11,14 +11,23 @@ from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots -def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, - diagonal='hist', marker='.', density_kwds=None, - hist_kwds=None, range_padding=0.05, **kwds): +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + 
hist_kwds=None, + range_padding=0.05, + **kwds +): df = frame._get_numeric_data() n = df.columns.size naxes = n * n - fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, - squeeze=False) + fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) @@ -31,13 +40,13 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, density_kwds = density_kwds or {} # GH 14855 - kwds.setdefault('edgecolors', 'none') + kwds.setdefault("edgecolors", "none") boundaries_list = [] for a in df.columns: values = df[a].values[mask[a].values] rmin_, rmax_ = np.min(values), np.max(values) - rdelta_ext = (rmax_ - rmin_) * range_padding / 2. + rdelta_ext = (rmax_ - rmin_) * range_padding / 2.0 boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) for i, a in enumerate(df.columns): @@ -48,11 +57,12 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, values = df[a].values[mask[a].values] # Deal with the diagonal by drawing a histogram there. - if diagonal == 'hist': + if diagonal == "hist": ax.hist(values, **hist_kwds) - elif diagonal in ('kde', 'density'): + elif diagonal in ("kde", "density"): from scipy.stats import gaussian_kde + y = values gkde = gaussian_kde(y) ind = np.linspace(y.min(), y.max(), 1000) @@ -63,8 +73,9 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, else: common = (mask[a] & mask[b]).values - ax.scatter(df[b][common], df[a][common], - marker=marker, alpha=alpha, **kwds) + ax.scatter( + df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds + ) ax.set_xlim(boundaries_list[j]) ax.set_ylim(boundaries_list[i]) @@ -99,7 +110,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, def _get_marker_compat(marker): if marker not in mlines.lineMarkers: - return 'o' + return "o" return marker @@ -120,16 +131,20 @@ def normalize(series): ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) to_plot = {} - colors = _get_standard_colors(num_colors=len(classes), colormap=colormap, - color_type='random', color=color) + colors = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) for kls in classes: to_plot[kls] = [[], []] m = len(frame.columns) - 1 - s = np.array([(np.cos(t), np.sin(t)) - for t in [2.0 * np.pi * (i / float(m)) - for i in range(m)]]) + s = np.array( + [ + (np.cos(t), np.sin(t)) + for t in [2.0 * np.pi * (i / float(m)) for i in range(m)] + ] + ) for i in range(n): row = df.iloc[i].values @@ -140,35 +155,50 @@ def normalize(series): to_plot[kls][1].append(y[1]) for i, kls in enumerate(classes): - ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i], - label=pprint_thing(kls), **kwds) + ax.scatter( + to_plot[kls][0], + to_plot[kls][1], + color=colors[i], + label=pprint_thing(kls), + **kwds + ) ax.legend() - ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none')) + ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) for xy, name in zip(s, df.columns): - ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray')) + ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) if xy[0] < 0.0 and xy[1] < 0.0: - ax.text(xy[0] - 0.025, xy[1] - 0.025, name, - ha='right', va='top', size='small') + ax.text( + xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" + ) elif xy[0] < 0.0 and xy[1] >= 0.0: - ax.text(xy[0] - 0.025, xy[1] + 0.025, name, - ha='right', va='bottom', size='small') + ax.text( + xy[0] - 
0.025, + xy[1] + 0.025, + name, + ha="right", + va="bottom", + size="small", + ) elif xy[0] >= 0.0 and xy[1] < 0.0: - ax.text(xy[0] + 0.025, xy[1] - 0.025, name, - ha='left', va='top', size='small') + ax.text( + xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" + ) elif xy[0] >= 0.0 and xy[1] >= 0.0: - ax.text(xy[0] + 0.025, xy[1] + 0.025, name, - ha='left', va='bottom', size='small') + ax.text( + xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small" + ) - ax.axis('equal') + ax.axis("equal") return ax -def andrews_curves(frame, class_column, ax=None, samples=200, color=None, - colormap=None, **kwds): +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds +): import matplotlib.pyplot as plt def function(amplitudes): @@ -187,10 +217,13 @@ def f(t): harmonics = np.arange(0, coeffs.shape[0]) + 1 trig_args = np.outer(harmonics, t) - result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) + - coeffs[:, 1, np.newaxis] * np.cos(trig_args), - axis=0) + result += np.sum( + coeffs[:, 0, np.newaxis] * np.sin(trig_args) + + coeffs[:, 1, np.newaxis] * np.cos(trig_args), + axis=0, + ) return result + return f n = len(frame) @@ -200,9 +233,9 @@ def f(t): t = np.linspace(-np.pi, np.pi, samples) used_legends = set() - color_values = _get_standard_colors(num_colors=len(classes), - colormap=colormap, color_type='random', - color=color) + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) colors = dict(zip(classes, color_values)) if ax is None: ax = plt.gca(xlim=(-np.pi, np.pi)) @@ -218,7 +251,7 @@ def f(t): else: ax.plot(t, y, color=colors[kls], **kwds) - ax.legend(loc='upper right') + ax.legend(loc="upper right") ax.grid() return ax @@ -226,14 +259,16 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): import matplotlib.pyplot as plt + # random.sample(ndarray, int) fails on python 3.3, sigh data = list(series.values) samplings = [random.sample(data, size) for _ in range(samples)] means = np.array([np.mean(sampling) for sampling in samplings]) medians = np.array([np.median(sampling) for sampling in samplings]) - midranges = np.array([(min(sampling) + max(sampling)) * 0.5 - for sampling in samplings]) + midranges = np.array( + [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings] + ) if fig is None: fig = plt.figure() x = list(range(samples)) @@ -268,13 +303,24 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): return fig -def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, - use_columns=False, xticks=None, colormap=None, - axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds): +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwds +): import matplotlib.pyplot as plt + if axvlines_kwds is None: - axvlines_kwds = {'linewidth': 1, 'color': 'black'} + axvlines_kwds = {"linewidth": 1, "color": "black"} n = len(frame) classes = frame[class_column].drop_duplicates() @@ -292,13 +338,13 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, # determine values to use for xticks if use_columns is True: if not np.all(np.isreal(list(df.columns))): - raise ValueError('Columns must be numeric to be used as xticks') + raise ValueError("Columns must be numeric to be used as xticks") x = 
df.columns elif xticks is not None: if not np.all(np.isreal(xticks)): - raise ValueError('xticks specified must be numeric') + raise ValueError("xticks specified must be numeric") elif len(xticks) != ncols: - raise ValueError('Length of xticks must match number of columns') + raise ValueError("Length of xticks must match number of columns") x = xticks else: x = list(range(ncols)) @@ -306,9 +352,9 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, if ax is None: ax = plt.gca() - color_values = _get_standard_colors(num_colors=len(classes), - colormap=colormap, color_type='random', - color=color) + color_values = _get_standard_colors( + num_colors=len(classes), colormap=colormap, color_type="random", color=color + ) if sort_labels: classes = sorted(classes) @@ -332,7 +378,7 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, ax.set_xticks(x) ax.set_xticklabels(df.columns) ax.set_xlim(x[0], x[-1]) - ax.legend(loc='upper right') + ax.legend(loc="upper right") ax.grid() return ax @@ -340,7 +386,8 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, def lag_plot(series, lag=1, ax=None, **kwds): # workaround because `c='b'` is hardcoded in matplotlibs scatter method import matplotlib.pyplot as plt - kwds.setdefault('c', plt.rcParams['patch.facecolor']) + + kwds.setdefault("c", plt.rcParams["patch.facecolor"]) data = series.values y1 = data[:-lag] @@ -364,21 +411,21 @@ def autocorrelation_plot(series, ax=None, **kwds): c0 = np.sum((data - mean) ** 2) / float(n) def r(h): - return ((data[:n - h] - mean) * - (data[h:] - mean)).sum() / float(n) / c0 + return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + x = np.arange(n) + 1 y = [r(loc) for loc in x] z95 = 1.959963984540054 z99 = 2.5758293035489004 - ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey') - ax.axhline(y=z95 / np.sqrt(n), color='grey') - ax.axhline(y=0.0, color='black') - ax.axhline(y=-z95 / np.sqrt(n), color='grey') - ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey') + ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey") + ax.axhline(y=z95 / np.sqrt(n), color="grey") + ax.axhline(y=0.0, color="black") + ax.axhline(y=-z95 / np.sqrt(n), color="grey") + ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey") ax.set_xlabel("Lag") ax.set_ylabel("Autocorrelation") ax.plot(x, y, **kwds) - if 'label' in kwds: + if "label" in kwds: ax.legend() ax.grid() return ax diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 8c9e3ea330dd3..e1bba5856e271 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -10,9 +10,11 @@ import pandas.core.common as com -def _get_standard_colors(num_colors=None, colormap=None, color_type='default', - color=None): +def _get_standard_colors( + num_colors=None, colormap=None, color_type="default", color=None +): import matplotlib.pyplot as plt + if color is None and colormap is not None: if isinstance(colormap, str): cmap = colormap @@ -22,24 +24,23 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', colors = [colormap(num) for num in np.linspace(0, 1, num=num_colors)] elif color is not None: if colormap is not None: - warnings.warn("'color' and 'colormap' cannot be used " - "simultaneously. Using 'color'") + warnings.warn( + "'color' and 'colormap' cannot be used " "simultaneously. 
Using 'color'" + ) colors = list(color) if is_list_like(color) else color else: - if color_type == 'default': + if color_type == "default": # need to call list() on the result to copy so we don't # modify the global rcParams below try: - colors = [c['color'] - for c in list(plt.rcParams['axes.prop_cycle'])] + colors = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] except KeyError: - colors = list(plt.rcParams.get('axes.color_cycle', - list('bgrcmyk'))) + colors = list(plt.rcParams.get("axes.color_cycle", list("bgrcmyk"))) if isinstance(colors, str): colors = list(colors) colors = colors[0:num_colors] - elif color_type == 'random': + elif color_type == "random": def random_color(column): """ Returns a random color represented as a list of length 3""" @@ -66,8 +67,7 @@ def _maybe_valid_colors(colors): # check whether each character can be convertible to colors maybe_color_cycle = _maybe_valid_colors(list(colors)) if maybe_single_color and maybe_color_cycle and len(colors) > 1: - hex_color = [c['color'] - for c in list(plt.rcParams['axes.prop_cycle'])] + hex_color = [c["color"] for c in list(plt.rcParams["axes.prop_cycle"])] colors = [hex_color[int(colors[1])]] elif maybe_single_color: colors = [colors] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index c3b548a6dfa85..f3fcb090e9883 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -6,16 +6,26 @@ import numpy as np from pandas._libs.tslibs.frequencies import ( - FreqGroup, get_base_alias, get_freq, is_subperiod, is_superperiod) + FreqGroup, + get_base_alias, + get_freq, + is_subperiod, + is_superperiod, +) from pandas._libs.tslibs.period import Period from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCPeriodIndex, ABCTimedeltaIndex) + ABCDatetimeIndex, + ABCPeriodIndex, + ABCTimedeltaIndex, +) from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.converter import ( - TimeSeries_DateFormatter, TimeSeries_DateLocator, - TimeSeries_TimedeltaFormatter) + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + TimeSeries_TimedeltaFormatter, +) import pandas.tseries.frequencies as frequencies from pandas.tseries.offsets import DateOffset @@ -41,9 +51,13 @@ def tsplot(series, plotf, ax=None, **kwargs): Use Series.plot() instead """ import matplotlib.pyplot as plt - warnings.warn("'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, stacklevel=2) + + warnings.warn( + "'tsplot' is deprecated and will be removed in a " + "future version. 
Please use Series.plot() instead.", + FutureWarning, + stacklevel=2, + ) # Used inferred freq is possible, need a test case for inferred if ax is None: @@ -66,7 +80,7 @@ def _maybe_resample(series, ax, kwargs): freq, ax_freq = _get_freq(ax, series) if freq is None: # pragma: no cover - raise ValueError('Cannot use dynamic axis without frequency info') + raise ValueError("Cannot use dynamic axis without frequency info") # Convert DatetimeIndex to PeriodIndex if isinstance(series.index, ABCDatetimeIndex): @@ -75,28 +89,30 @@ def _maybe_resample(series, ax, kwargs): if ax_freq is not None and freq != ax_freq: if is_superperiod(freq, ax_freq): # upsample input series = series.copy() - series.index = series.index.asfreq(ax_freq, how='s') + series.index = series.index.asfreq(ax_freq, how="s") freq = ax_freq elif _is_sup(freq, ax_freq): # one is weekly - how = kwargs.pop('how', 'last') - series = getattr(series.resample('D'), how)().dropna() + how = kwargs.pop("how", "last") + series = getattr(series.resample("D"), how)().dropna() series = getattr(series.resample(ax_freq), how)().dropna() freq = ax_freq elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq): _upsample_others(ax, freq, kwargs) else: # pragma: no cover - raise ValueError('Incompatible frequency conversion') + raise ValueError("Incompatible frequency conversion") return freq, series def _is_sub(f1, f2): - return ((f1.startswith('W') and is_subperiod('D', f2)) or - (f2.startswith('W') and is_subperiod(f1, 'D'))) + return (f1.startswith("W") and is_subperiod("D", f2)) or ( + f2.startswith("W") and is_subperiod(f1, "D") + ) def _is_sup(f1, f2): - return ((f1.startswith('W') and is_superperiod('D', f2)) or - (f2.startswith('W') and is_superperiod(f1, 'D'))) + return (f1.startswith("W") and is_superperiod("D", f2)) or ( + f2.startswith("W") and is_superperiod(f1, "D") + ) def _upsample_others(ax, freq, kwargs): @@ -105,9 +121,9 @@ def _upsample_others(ax, freq, kwargs): _replot_ax(ax, freq, kwargs) other_ax = None - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): other_ax = ax.left_ax - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): other_ax = ax.right_ax if other_ax is not None: @@ -115,16 +131,15 @@ def _upsample_others(ax, freq, kwargs): lines.extend(rlines) labels.extend(rlabels) - if (legend is not None and kwargs.get('legend', True) and - len(lines) > 0): + if legend is not None and kwargs.get("legend", True) and len(lines) > 0: title = legend.get_title().get_text() - if title == 'None': + if title == "None": title = None - ax.legend(lines, labels, loc='best', title=title) + ax.legend(lines, labels, loc="best", title=title) def _replot_ax(ax, freq, kwargs): - data = getattr(ax, '_plot_data', None) + data = getattr(ax, "_plot_data", None) # clear current axes and data ax._plot_data = [] @@ -137,17 +152,17 @@ def _replot_ax(ax, freq, kwargs): if data is not None: for series, plotf, kwds in data: series = series.copy() - idx = series.index.asfreq(freq, how='S') + idx = series.index.asfreq(freq, how="S") series.index = idx ax._plot_data.append((series, plotf, kwds)) # for tsplot if isinstance(plotf, str): from pandas.plotting._matplotlib import PLOT_CLASSES + plotf = PLOT_CLASSES[plotf]._plot - lines.append(plotf(ax, series.index._mpl_repr(), - series.values, **kwds)[0]) + lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0]) labels.append(pprint_thing(series.name)) return lines, labels @@ -155,16 +170,16 @@ def _replot_ax(ax, freq, kwargs): def _decorate_axes(ax, freq, kwargs): 
"""Initialize axes for time-series plotting""" - if not hasattr(ax, '_plot_data'): + if not hasattr(ax, "_plot_data"): ax._plot_data = [] ax.freq = freq xaxis = ax.get_xaxis() xaxis.freq = freq - if not hasattr(ax, 'legendlabels'): - ax.legendlabels = [kwargs.get('label', None)] + if not hasattr(ax, "legendlabels"): + ax.legendlabels = [kwargs.get("label", None)] else: - ax.legendlabels.append(kwargs.get('label', None)) + ax.legendlabels.append(kwargs.get("label", None)) ax.view_interval = None ax.date_axis_info = None @@ -175,19 +190,19 @@ def _get_ax_freq(ax): Also checks shared axes (eg when using secondary yaxis, sharex=True or twinx) """ - ax_freq = getattr(ax, 'freq', None) + ax_freq = getattr(ax, "freq", None) if ax_freq is None: # check for left/right ax in case of secondary yaxis - if hasattr(ax, 'left_ax'): - ax_freq = getattr(ax.left_ax, 'freq', None) - elif hasattr(ax, 'right_ax'): - ax_freq = getattr(ax.right_ax, 'freq', None) + if hasattr(ax, "left_ax"): + ax_freq = getattr(ax.left_ax, "freq", None) + elif hasattr(ax, "right_ax"): + ax_freq = getattr(ax.right_ax, "freq", None) if ax_freq is None: # check if a shared ax (sharex/twinx) has already freq set shared_axes = ax.get_shared_x_axes().get_siblings(ax) if len(shared_axes) > 1: for shared_ax in shared_axes: - ax_freq = getattr(shared_ax, 'freq', None) + ax_freq = getattr(shared_ax, "freq", None) if ax_freq is not None: break return ax_freq @@ -195,9 +210,9 @@ def _get_ax_freq(ax): def _get_freq(ax, series): # get frequency from data - freq = getattr(series.index, 'freq', None) + freq = getattr(series.index, "freq", None) if freq is None: - freq = getattr(series.index, 'inferred_freq', None) + freq = getattr(series.index, "inferred_freq", None) ax_freq = _get_ax_freq(ax) @@ -241,17 +256,17 @@ def _use_dynamic_x(ax, data): if isinstance(data.index, ABCDatetimeIndex): base = get_freq(freq) x = data.index - if (base <= FreqGroup.FR_DAY): + if base <= FreqGroup.FR_DAY: return x[:1].is_normalized return Period(x[0], freq).to_timestamp(tz=x.tz) == x[0] return True def _get_index_freq(data): - freq = getattr(data.index, 'freq', None) + freq = getattr(data.index, "freq", None) if freq is None: - freq = getattr(data.index, 'inferred_freq', None) - if freq == 'B': + freq = getattr(data.index, "inferred_freq", None) + if freq == "B": weekdays = np.unique(data.index.dayofweek) if (5 in weekdays) or (6 in weekdays): freq = None @@ -262,10 +277,10 @@ def _maybe_convert_index(ax, data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): - freq = getattr(data.index, 'freq', None) + freq = getattr(data.index, "freq", None) if freq is None: - freq = getattr(data.index, 'inferred_freq', None) + freq = getattr(data.index, "inferred_freq", None) if isinstance(freq, DateOffset): freq = freq.rule_code @@ -273,7 +288,7 @@ def _maybe_convert_index(ax, data): freq = _get_ax_freq(ax) if freq is None: - raise ValueError('Could not get frequency alias for plotting') + raise ValueError("Could not get frequency alias for plotting") freq = get_base_alias(freq) freq = frequencies.get_period_alias(freq) @@ -288,6 +303,7 @@ def _maybe_convert_index(ax, data): # Patch methods for subplot. Only format_dateaxis is currently used. # Do we need the rest for convenience? 
+ def format_timedelta_ticks(x, pos, n_decimals): """ Convert seconds to 'D days HH:MM:SS.F' @@ -296,12 +312,12 @@ def format_timedelta_ticks(x, pos, n_decimals): m, s = divmod(s, 60) h, m = divmod(m, 60) d, h = divmod(h, 24) - decimals = int(ns * 10**(n_decimals - 9)) - s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)) + decimals = int(ns * 10 ** (n_decimals - 9)) + s = r"{:02d}:{:02d}:{:02d}".format(int(h), int(m), int(s)) if n_decimals > 0: - s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals) + s += ".{{:0{:0d}d}}".format(n_decimals).format(decimals) if d != 0: - s = '{:d} days '.format(int(d)) + s + s = "{:d} days ".format(int(d)) + s return s @@ -325,21 +341,21 @@ def format_dateaxis(subplot, freq, index): # interface. DatetimeIndex uses matplotlib.date directly if isinstance(index, ABCPeriodIndex): - majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=False, - plot_obj=subplot) - minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, - minor_locator=True, - plot_obj=subplot) + majlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minlocator = TimeSeries_DateLocator( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) subplot.xaxis.set_major_locator(majlocator) subplot.xaxis.set_minor_locator(minlocator) - majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, - minor_locator=False, - plot_obj=subplot) - minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, - minor_locator=True, - plot_obj=subplot) + majformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot + ) + minformatter = TimeSeries_DateFormatter( + freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot + ) subplot.xaxis.set_major_formatter(majformatter) subplot.xaxis.set_minor_formatter(minformatter) @@ -347,9 +363,8 @@ def format_dateaxis(subplot, freq, index): subplot.format_coord = functools.partial(_format_coord, freq) elif isinstance(index, ABCTimedeltaIndex): - subplot.xaxis.set_major_formatter( - TimeSeries_TimedeltaFormatter()) + subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter()) else: - raise TypeError('index type not supported') + raise TypeError("index type not supported") pylab.draw_if_interactive() diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index acb5ab7b8e04b..8472eb3a3d887 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -14,7 +14,7 @@ def format_date_labels(ax, rot): # mini version of autofmt_xdate try: for label in ax.get_xticklabels(): - label.set_ha('right') + label.set_ha("right") label.set_rotation(rot) fig = ax.get_figure() fig.subplots_adjust(bottom=0.2) @@ -28,7 +28,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): elif isinstance(data, ABCDataFrame): pass else: - raise ValueError('Input data must be DataFrame or Series') + raise ValueError("Input data must be DataFrame or Series") if rowLabels is None: rowLabels = data.index @@ -38,16 +38,16 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): cellText = data.values - table = matplotlib.table.table(ax, cellText=cellText, - rowLabels=rowLabels, - colLabels=colLabels, **kwargs) + table = matplotlib.table.table( + ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs + ) return table -def _get_layout(nplots, layout=None, layout_type='box'): +def _get_layout(nplots, layout=None, layout_type="box"): if layout is 
not None: if not isinstance(layout, (tuple, list)) or len(layout) != 2: - raise ValueError('Layout must be a tuple of (rows, columns)') + raise ValueError("Layout must be a tuple of (rows, columns)") nrows, ncols = layout @@ -62,17 +62,20 @@ def _get_layout(nplots, layout=None, layout_type='box'): raise ValueError(msg) if nrows * ncols < nplots: - raise ValueError('Layout of {nrows}x{ncols} must be larger ' - 'than required size {nplots}'.format( - nrows=nrows, ncols=ncols, nplots=nplots)) + raise ValueError( + "Layout of {nrows}x{ncols} must be larger " + "than required size {nplots}".format( + nrows=nrows, ncols=ncols, nplots=nplots + ) + ) return layout - if layout_type == 'single': + if layout_type == "single": return (1, 1) - elif layout_type == 'horizontal': + elif layout_type == "horizontal": return (1, nplots) - elif layout_type == 'vertical': + elif layout_type == "vertical": return (nplots, 1) layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)} @@ -88,12 +91,21 @@ def _get_layout(nplots, layout=None, layout_type='box'): else: return k, k + # copied from matplotlib/pyplot.py and modified for pandas.plotting -def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, - subplot_kw=None, ax=None, layout=None, layout_type='box', - **fig_kw): +def _subplots( + naxes=None, + sharex=False, + sharey=False, + squeeze=True, + subplot_kw=None, + ax=None, + layout=None, + layout_type="box", + **fig_kw +): """Create a figure with a set of subplots already made. This utility wrapper makes it convenient to create common layouts of @@ -168,6 +180,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ import matplotlib.pyplot as plt + if subplot_kw is None: subplot_kw = {} @@ -177,19 +190,26 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if is_list_like(ax): ax = _flatten(ax) if layout is not None: - warnings.warn("When passing multiple axes, layout keyword is " - "ignored", UserWarning) + warnings.warn( + "When passing multiple axes, layout keyword is " "ignored", + UserWarning, + ) if sharex or sharey: - warnings.warn("When passing multiple axes, sharex and sharey " - "are ignored. These settings must be specified " - "when creating axes", UserWarning, - stacklevel=4) + warnings.warn( + "When passing multiple axes, sharex and sharey " + "are ignored. 
These settings must be specified " + "when creating axes", + UserWarning, + stacklevel=4, + ) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax else: - raise ValueError("The number of passed axes must be {0}, the " - "same as the output plot".format(naxes)) + raise ValueError( + "The number of passed axes must be {0}, the " + "same as the output plot".format(naxes) + ) fig = ax.get_figure() # if ax is passed and a number of subplots is 1, return ax as it is @@ -199,9 +219,12 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, else: return fig, _flatten(ax) else: - warnings.warn("To output multiple subplots, the figure containing " - "the passed axes is being cleared", UserWarning, - stacklevel=4) + warnings.warn( + "To output multiple subplots, the figure containing " + "the passed axes is being cleared", + UserWarning, + stacklevel=4, + ) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) @@ -215,9 +238,9 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) if sharex: - subplot_kw['sharex'] = ax0 + subplot_kw["sharex"] = ax0 if sharey: - subplot_kw['sharey'] = ax0 + subplot_kw["sharey"] = ax0 axarr[0] = ax0 # Note off-by-one counting because add_subplot uses the MATLAB 1-based @@ -228,8 +251,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, # interfere with proper axis limits on the visible axes if # they share axes e.g. issue #7528 if i >= naxes: - kwds['sharex'] = None - kwds['sharey'] = None + kwds["sharex"] = None + kwds["sharey"] = None ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) axarr[i] = ax @@ -264,10 +287,10 @@ def _remove_labels_from_axis(axis): if isinstance(axis.get_minor_locator(), ticker.NullLocator): axis.set_minor_locator(ticker.AutoLocator()) if isinstance(axis.get_minor_formatter(), ticker.NullFormatter): - axis.set_minor_formatter(ticker.FormatStrFormatter('')) + axis.set_minor_formatter(ticker.FormatStrFormatter("")) for t in axis.get_minorticklabels(): t.set_visible(False) - except Exception: # pragma no cover + except Exception: # pragma no cover raise axis.get_label().set_visible(False) @@ -289,8 +312,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): # the last in the column, because below is no subplot/gap. 
if not layout[ax.rowNum + 1, ax.colNum]: continue - if sharex or len(ax.get_shared_x_axes() - .get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) except IndexError: @@ -299,8 +321,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): for ax in axarr: if ax.is_last_row(): continue - if sharex or len(ax.get_shared_x_axes() - .get_siblings(ax)) > 1: + if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: _remove_labels_from_axis(ax.xaxis) if ncols > 1: @@ -325,10 +346,10 @@ def _flatten(axes): def _get_all_lines(ax): lines = ax.get_lines() - if hasattr(ax, 'right_ax'): + if hasattr(ax, "right_ax"): lines += ax.right_ax.get_lines() - if hasattr(ax, 'left_ax'): + if hasattr(ax, "left_ax"): lines += ax.left_ax.get_lines() return lines @@ -343,9 +364,9 @@ def _get_xlim(lines): return left, right -def _set_ticks_props(axes, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None): +def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): import matplotlib.pyplot as plt + for ax in _flatten(axes): if xlabelsize is not None: plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index f240faf45dfce..435562f7d1262 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -25,8 +25,9 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): matplotlib table object """ plot_backend = _get_plot_backend() - return plot_backend.table(ax=ax, data=data, rowLabels=None, colLabels=None, - **kwargs) + return plot_backend.table( + ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs + ) def register(explicit=True): @@ -70,9 +71,19 @@ def deregister(): plot_backend.deregister() -def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, - diagonal='hist', marker='.', density_kwds=None, - hist_kwds=None, range_padding=0.05, **kwds): +def scatter_matrix( + frame, + alpha=0.5, + figsize=None, + ax=None, + grid=False, + diagonal="hist", + marker=".", + density_kwds=None, + hist_kwds=None, + range_padding=0.05, + **kwds +): """ Draw a matrix of scatter plots. 
@@ -115,9 +126,18 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, """ plot_backend = _get_plot_backend() return plot_backend.scatter_matrix( - frame=frame, alpha=alpha, figsize=figsize, ax=ax, grid=grid, - diagonal=diagonal, marker=marker, density_kwds=density_kwds, - hist_kwds=hist_kwds, range_padding=range_padding, **kwds) + frame=frame, + alpha=alpha, + figsize=figsize, + ax=ax, + grid=grid, + diagonal=diagonal, + marker=marker, + density_kwds=density_kwds, + hist_kwds=hist_kwds, + range_padding=range_padding, + **kwds + ) def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): @@ -183,13 +203,20 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): >>> rad_viz = pd.plotting.radviz(df, 'Category') # doctest: +SKIP """ plot_backend = _get_plot_backend() - return plot_backend.radviz(frame=frame, class_column=class_column, ax=ax, - color=color, colormap=colormap, **kwds) - - -@deprecate_kwarg(old_arg_name='data', new_arg_name='frame') -def andrews_curves(frame, class_column, ax=None, samples=200, color=None, - colormap=None, **kwds): + return plot_backend.radviz( + frame=frame, + class_column=class_column, + ax=ax, + color=color, + colormap=colormap, + **kwds + ) + + +@deprecate_kwarg(old_arg_name="data", new_arg_name="frame") +def andrews_curves( + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds +): """ Generate a matplotlib plot of Andrews curves, for visualising clusters of multivariate data. @@ -223,9 +250,15 @@ def andrews_curves(frame, class_column, ax=None, samples=200, color=None, class:`matplotlip.axis.Axes` """ plot_backend = _get_plot_backend() - return plot_backend.andrews_curves(frame=frame, class_column=class_column, - ax=ax, samples=samples, color=color, - colormap=colormap, **kwds) + return plot_backend.andrews_curves( + frame=frame, + class_column=class_column, + ax=ax, + samples=samples, + color=color, + colormap=colormap, + **kwds + ) def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): @@ -275,16 +308,27 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP """ plot_backend = _get_plot_backend() - return plot_backend.bootstrap_plot(series=series, fig=fig, size=size, - samples=samples, **kwds) - - -@deprecate_kwarg(old_arg_name='colors', new_arg_name='color') -@deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3) -def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, - use_columns=False, xticks=None, colormap=None, - axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds): + return plot_backend.bootstrap_plot( + series=series, fig=fig, size=size, samples=samples, **kwds + ) + + +@deprecate_kwarg(old_arg_name="colors", new_arg_name="color") +@deprecate_kwarg(old_arg_name="data", new_arg_name="frame", stacklevel=3) +def parallel_coordinates( + frame, + class_column, + cols=None, + ax=None, + color=None, + use_columns=False, + xticks=None, + colormap=None, + axvlines=True, + axvlines_kwds=None, + sort_labels=False, + **kwds +): """Parallel coordinates plotting. 
Parameters @@ -332,10 +376,19 @@ def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None, """ plot_backend = _get_plot_backend() return plot_backend.parallel_coordinates( - frame=frame, class_column=class_column, cols=cols, ax=ax, color=color, - use_columns=use_columns, xticks=xticks, colormap=colormap, - axvlines=axvlines, axvlines_kwds=axvlines_kwds, - sort_labels=sort_labels, **kwds) + frame=frame, + class_column=class_column, + cols=cols, + ax=ax, + color=color, + use_columns=use_columns, + xticks=xticks, + colormap=colormap, + axvlines=axvlines, + axvlines_kwds=axvlines_kwds, + sort_labels=sort_labels, + **kwds + ) def lag_plot(series, lag=1, ax=None, **kwds): @@ -392,9 +445,12 @@ def tsplot(series, plotf, ax=None, **kwargs): .. deprecated:: 0.23.0 Use Series.plot() instead """ - warnings.warn("'tsplot' is deprecated and will be removed in a " - "future version. Please use Series.plot() instead.", - FutureWarning, stacklevel=2) + warnings.warn( + "'tsplot' is deprecated and will be removed in a " + "future version. Please use Series.plot() instead.", + FutureWarning, + stacklevel=2, + ) plot_backend = _get_plot_backend() return plot_backend.tsplot(series=series, plotf=plotf, ax=ax, **kwargs) @@ -408,19 +464,20 @@ class _Options(dict): """ # alias so the names are same as plotting method parameter names - _ALIASES = {'x_compat': 'xaxis.compat'} - _DEFAULT_KEYS = ['xaxis.compat'] + _ALIASES = {"x_compat": "xaxis.compat"} + _DEFAULT_KEYS = ["xaxis.compat"] def __init__(self, deprecated=False): self._deprecated = deprecated # self['xaxis.compat'] = False - super().__setitem__('xaxis.compat', False) + super().__setitem__("xaxis.compat", False) def __getitem__(self, key): key = self._get_canonical_key(key) if key not in self: raise ValueError( - '{key} is not a valid pandas plotting option'.format(key=key)) + "{key} is not a valid pandas plotting option".format(key=key) + ) return super().__getitem__(key) def __setitem__(self, key, value): @@ -430,8 +487,7 @@ def __setitem__(self, key, value): def __delitem__(self, key): key = self._get_canonical_key(key) if key in self._DEFAULT_KEYS: - raise ValueError( - 'Cannot remove default parameter {key}'.format(key=key)) + raise ValueError("Cannot remove default parameter {key}".format(key=key)) return super().__delitem__(key) def __contains__(self, key): diff --git a/pandas/testing.py b/pandas/testing.py index dbea1ecc7362a..acae47367d997 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -5,4 +5,7 @@ """ from pandas.util.testing import ( - assert_frame_equal, assert_index_equal, assert_series_equal) + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 614e3172d9d48..326bef7f4b480 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -4,13 +4,12 @@ class Base: - def check(self, namespace, expected, ignored=None): # see which names are in the namespace, minus optional # ignored ones # compare vs the expected - result = sorted(f for f in dir(namespace) if not f.startswith('__')) + result = sorted(f for f in dir(namespace) if not f.startswith("__")) if ignored is not None: result = sorted(list(set(result) - set(ignored))) @@ -22,33 +21,74 @@ class TestPDApi(Base): # these are optionally imported based on testing # & need to be ignored - ignored = ['tests', 'locale', 'conftest'] + ignored = ["tests", "locale", "conftest"] # top-level sub-packages - lib = ['api', 'arrays', 'compat', 'core', 'errors', 
'pandas', - 'plotting', 'test', 'testing', 'tseries', - 'util', 'options', 'io'] + lib = [ + "api", + "arrays", + "compat", + "core", + "errors", + "pandas", + "plotting", + "test", + "testing", + "tseries", + "util", + "options", + "io", + ] # these are already deprecated; awaiting removal deprecated_modules = [] # misc - misc = ['IndexSlice', 'NaT'] + misc = ["IndexSlice", "NaT"] # top-level classes - classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', - 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', - 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', - 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', - 'Series', 'SparseArray', 'SparseDataFrame', 'SparseDtype', - 'SparseSeries', 'Timedelta', - 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex', - 'CategoricalDtype', 'PeriodDtype', 'IntervalDtype', - 'DatetimeTZDtype', - 'Int8Dtype', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', - 'UInt8Dtype', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', - 'NamedAgg', - ] + classes = [ + "Categorical", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "ExcelFile", + "ExcelWriter", + "Float64Index", + "Grouper", + "HDFStore", + "Index", + "Int64Index", + "MultiIndex", + "Period", + "PeriodIndex", + "RangeIndex", + "UInt64Index", + "Series", + "SparseArray", + "SparseDataFrame", + "SparseDtype", + "SparseSeries", + "Timedelta", + "TimedeltaIndex", + "Timestamp", + "Interval", + "IntervalIndex", + "CategoricalDtype", + "PeriodDtype", + "IntervalDtype", + "DatetimeTZDtype", + "Int8Dtype", + "Int16Dtype", + "Int32Dtype", + "Int64Dtype", + "UInt8Dtype", + "UInt16Dtype", + "UInt32Dtype", + "UInt64Dtype", + "NamedAgg", + ] if not compat.PY37: classes.append("Panel") @@ -59,35 +99,77 @@ class TestPDApi(Base): deprecated_classes_in_future = [] # external modules exposed in pandas namespace - modules = ['np', 'datetime'] + modules = ["np", "datetime"] # top-level functions - funcs = ['array', 'bdate_range', 'concat', 'crosstab', 'cut', - 'date_range', 'interval_range', 'eval', - 'factorize', 'get_dummies', - 'infer_freq', 'isna', 'isnull', 'lreshape', - 'melt', 'notna', 'notnull', 'offsets', - 'merge', 'merge_ordered', 'merge_asof', - 'period_range', - 'pivot', 'pivot_table', 'qcut', - 'show_versions', 'timedelta_range', 'unique', - 'value_counts', 'wide_to_long'] + funcs = [ + "array", + "bdate_range", + "concat", + "crosstab", + "cut", + "date_range", + "interval_range", + "eval", + "factorize", + "get_dummies", + "infer_freq", + "isna", + "isnull", + "lreshape", + "melt", + "notna", + "notnull", + "offsets", + "merge", + "merge_ordered", + "merge_asof", + "period_range", + "pivot", + "pivot_table", + "qcut", + "show_versions", + "timedelta_range", + "unique", + "value_counts", + "wide_to_long", + ] # top-level option funcs - funcs_option = ['reset_option', 'describe_option', 'get_option', - 'option_context', 'set_option', - 'set_eng_float_format'] + funcs_option = [ + "reset_option", + "describe_option", + "get_option", + "option_context", + "set_option", + "set_eng_float_format", + ] # top-level read_* funcs - funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf', - 'read_gbq', 'read_hdf', 'read_html', 'read_json', - 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', - 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet', 'read_spss'] + funcs_read = [ + "read_clipboard", + "read_csv", + "read_excel", + "read_fwf", + "read_gbq", + "read_hdf", + "read_html", + "read_json", + 
"read_msgpack", + "read_pickle", + "read_sas", + "read_sql", + "read_sql_query", + "read_sql_table", + "read_stata", + "read_table", + "read_feather", + "read_parquet", + "read_spss", + ] # top-level to_* funcs - funcs_to = ['to_datetime', 'to_msgpack', - 'to_numeric', 'to_pickle', 'to_timedelta'] + funcs_to = ["to_datetime", "to_msgpack", "to_numeric", "to_pickle", "to_timedelta"] # top-level to deprecate in the future deprecated_funcs_in_future = [] @@ -96,28 +178,45 @@ class TestPDApi(Base): deprecated_funcs = [] # private modules in pandas namespace - private_modules = ['_config', '_hashtable', '_lib', '_libs', - '_np_version_under1p14', '_np_version_under1p15', - '_np_version_under1p16', '_np_version_under1p17', - '_tslib', '_typing', '_version'] + private_modules = [ + "_config", + "_hashtable", + "_lib", + "_libs", + "_np_version_under1p14", + "_np_version_under1p15", + "_np_version_under1p16", + "_np_version_under1p17", + "_tslib", + "_typing", + "_version", + ] def test_api(self): - self.check(pd, - self.lib + self.misc + - self.modules + self.deprecated_modules + - self.classes + self.deprecated_classes + - self.deprecated_classes_in_future + - self.funcs + self.funcs_option + - self.funcs_read + self.funcs_to + - self.deprecated_funcs_in_future + - self.deprecated_funcs + self.private_modules, - self.ignored) + self.check( + pd, + self.lib + + self.misc + + self.modules + + self.deprecated_modules + + self.classes + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.funcs + + self.funcs_option + + self.funcs_read + + self.funcs_to + + self.deprecated_funcs_in_future + + self.deprecated_funcs + + self.private_modules, + self.ignored, + ) class TestApi(Base): - allowed = ['types', 'extensions'] + allowed = ["types", "extensions"] def test_api(self): @@ -126,10 +225,10 @@ def test_api(self): class TestTesting(Base): - funcs = ['assert_frame_equal', 'assert_series_equal', - 'assert_index_equal'] + funcs = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] def test_testing(self): from pandas import testing + self.check(testing, self.funcs) diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index d6090225c0a7f..e2ff77715e301 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -6,28 +6,53 @@ class TestTypes(Base): - allowed = ['is_bool', 'is_bool_dtype', - 'is_categorical', 'is_categorical_dtype', 'is_complex', - 'is_complex_dtype', 'is_datetime64_any_dtype', - 'is_datetime64_dtype', 'is_datetime64_ns_dtype', - 'is_datetime64tz_dtype', 'is_dtype_equal', - 'is_extension_type', 'is_float', 'is_float_dtype', - 'is_int64_dtype', 'is_integer', - 'is_integer_dtype', 'is_number', 'is_numeric_dtype', - 'is_object_dtype', 'is_scalar', 'is_sparse', - 'is_string_dtype', 'is_signed_integer_dtype', - 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', - 'is_unsigned_integer_dtype', - 'is_period_dtype', 'is_interval', 'is_interval_dtype', - 'is_re', 'is_re_compilable', - 'is_dict_like', 'is_iterator', 'is_file_like', - 'is_list_like', 'is_hashable', 'is_array_like', - 'is_named_tuple', - 'pandas_dtype', 'union_categoricals', 'infer_dtype', - 'is_extension_array_dtype'] - deprecated = ['is_period', 'is_datetimetz'] - dtypes = ['CategoricalDtype', 'DatetimeTZDtype', - 'PeriodDtype', 'IntervalDtype'] + allowed = [ + "is_bool", + "is_bool_dtype", + "is_categorical", + "is_categorical_dtype", + "is_complex", + "is_complex_dtype", + "is_datetime64_any_dtype", + "is_datetime64_dtype", + "is_datetime64_ns_dtype", 
+ "is_datetime64tz_dtype", + "is_dtype_equal", + "is_extension_type", + "is_float", + "is_float_dtype", + "is_int64_dtype", + "is_integer", + "is_integer_dtype", + "is_number", + "is_numeric_dtype", + "is_object_dtype", + "is_scalar", + "is_sparse", + "is_string_dtype", + "is_signed_integer_dtype", + "is_timedelta64_dtype", + "is_timedelta64_ns_dtype", + "is_unsigned_integer_dtype", + "is_period_dtype", + "is_interval", + "is_interval_dtype", + "is_re", + "is_re_compilable", + "is_dict_like", + "is_iterator", + "is_file_like", + "is_list_like", + "is_hashable", + "is_array_like", + "is_named_tuple", + "pandas_dtype", + "union_categoricals", + "infer_dtype", + "is_extension_array_dtype", + ] + deprecated = ["is_period", "is_datetimetz"] + dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"] def test_types(self): @@ -36,6 +61,5 @@ def test_types(self): def test_deprecated_from_api_types(self): for t in self.deprecated: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): getattr(types, t)(1) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index c6547c32f3ce7..c67a67bb31d62 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -11,24 +11,26 @@ def id_func(x): if isinstance(x, tuple): assert len(x) == 2 - return x[0].__name__ + '-' + str(x[1]) + return x[0].__name__ + "-" + str(x[1]) else: return x.__name__ # ------------------------------------------------------------------ + @pytest.fixture(params=[1, np.array(1, dtype=np.int64)]) def one(request): # zero-dim integer array behaves like an integer return request.param -zeros = [box_cls([0] * 5, dtype=dtype) - for box_cls in [pd.Index, np.array] - for dtype in [np.int64, np.uint64, np.float64]] -zeros.extend([np.array(0, dtype=dtype) - for dtype in [np.int64, np.uint64, np.float64]]) +zeros = [ + box_cls([0] * 5, dtype=dtype) + for box_cls in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) zeros.extend([0, 0.0]) @@ -42,11 +44,16 @@ def zero(request): # ------------------------------------------------------------------ # Vector Fixtures -@pytest.fixture(params=[pd.Float64Index(np.arange(5, dtype='float64')), - pd.Int64Index(np.arange(5, dtype='int64')), - pd.UInt64Index(np.arange(5, dtype='uint64')), - pd.RangeIndex(5)], - ids=lambda x: type(x).__name__) + +@pytest.fixture( + params=[ + pd.Float64Index(np.arange(5, dtype="float64")), + pd.Int64Index(np.arange(5, dtype="int64")), + pd.UInt64Index(np.arange(5, dtype="uint64")), + pd.RangeIndex(5), + ], + ids=lambda x: type(x).__name__, +) def numeric_idx(request): """ Several types of numeric-dtypes Index objects @@ -57,10 +64,15 @@ def numeric_idx(request): # ------------------------------------------------------------------ # Scalar Fixtures -@pytest.fixture(params=[pd.Timedelta('5m4s').to_pytimedelta(), - pd.Timedelta('5m4s'), - pd.Timedelta('5m4s').to_timedelta64()], - ids=lambda x: type(x).__name__) + +@pytest.fixture( + params=[ + pd.Timedelta("5m4s").to_pytimedelta(), + pd.Timedelta("5m4s"), + pd.Timedelta("5m4s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, +) def scalar_td(request): """ Several variants of Timedelta scalars representing 5 minutes and 4 seconds @@ -68,13 +80,17 @@ def scalar_td(request): return request.param 
-@pytest.fixture(params=[pd.offsets.Day(3), - pd.offsets.Hour(72), - pd.Timedelta(days=3).to_pytimedelta(), - pd.Timedelta('72:00:00'), - np.timedelta64(3, 'D'), - np.timedelta64(72, 'h')], - ids=lambda x: type(x).__name__) +@pytest.fixture( + params=[ + pd.offsets.Day(3), + pd.offsets.Hour(72), + pd.Timedelta(days=3).to_pytimedelta(), + pd.Timedelta("72:00:00"), + np.timedelta64(3, "D"), + np.timedelta64(72, "h"), + ], + ids=lambda x: type(x).__name__, +) def three_days(request): """ Several timedelta-like and DateOffset objects that each represent @@ -83,13 +99,17 @@ def three_days(request): return request.param -@pytest.fixture(params=[pd.offsets.Hour(2), - pd.offsets.Minute(120), - pd.Timedelta(hours=2).to_pytimedelta(), - pd.Timedelta(seconds=2 * 3600), - np.timedelta64(2, 'h'), - np.timedelta64(120, 'm')], - ids=lambda x: type(x).__name__) +@pytest.fixture( + params=[ + pd.offsets.Hour(2), + pd.offsets.Minute(120), + pd.Timedelta(hours=2).to_pytimedelta(), + pd.Timedelta(seconds=2 * 3600), + np.timedelta64(2, "h"), + np.timedelta64(120, "m"), + ], + ids=lambda x: type(x).__name__, +) def two_hours(request): """ Several timedelta-like and DateOffset objects that each represent @@ -98,14 +118,21 @@ def two_hours(request): return request.param -_common_mismatch = [pd.offsets.YearBegin(2), - pd.offsets.MonthBegin(1), - pd.offsets.Minute()] +_common_mismatch = [ + pd.offsets.YearBegin(2), + pd.offsets.MonthBegin(1), + pd.offsets.Minute(), +] -@pytest.fixture(params=[pd.Timedelta(minutes=30).to_pytimedelta(), - np.timedelta64(30, 's'), - pd.Timedelta(seconds=30)] + _common_mismatch) +@pytest.fixture( + params=[ + pd.Timedelta(minutes=30).to_pytimedelta(), + np.timedelta64(30, "s"), + pd.Timedelta(seconds=30), + ] + + _common_mismatch +) def not_hourly(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -114,9 +141,14 @@ def not_hourly(request): return request.param -@pytest.fixture(params=[np.timedelta64(4, 'h'), - pd.Timedelta(hours=23).to_pytimedelta(), - pd.Timedelta('23:00:00')] + _common_mismatch) +@pytest.fixture( + params=[ + np.timedelta64(4, "h"), + pd.Timedelta(hours=23).to_pytimedelta(), + pd.Timedelta("23:00:00"), + ] + + _common_mismatch +) def not_daily(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -125,9 +157,14 @@ def not_daily(request): return request.param -@pytest.fixture(params=[np.timedelta64(365, 'D'), - pd.Timedelta(days=365).to_pytimedelta(), - pd.Timedelta(days=365)] + _common_mismatch) +@pytest.fixture( + params=[ + np.timedelta64(365, "D"), + pd.Timedelta(days=365).to_pytimedelta(), + pd.Timedelta(days=365), + ] + + _common_mismatch +) def mismatched_freq(request): """ Several timedelta-like and DateOffset instances that are _not_ @@ -138,8 +175,8 @@ def mismatched_freq(request): # ------------------------------------------------------------------ -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], - ids=id_func) + +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) def box(request): """ Several array-like containers that should have effectively identical @@ -148,11 +185,10 @@ def box(request): return request.param -@pytest.fixture(params=[pd.Index, - pd.Series, - pytest.param(pd.DataFrame, - marks=pytest.mark.xfail)], - ids=id_func) +@pytest.fixture( + params=[pd.Index, pd.Series, pytest.param(pd.DataFrame, marks=pytest.mark.xfail)], + ids=id_func, +) def box_df_fail(request): """ Fixture equivalent to `box` fixture but xfailing the DataFrame case. 
@@ -160,12 +196,15 @@ def box_df_fail(request): return request.param -@pytest.fixture(params=[(pd.Index, False), - (pd.Series, False), - (pd.DataFrame, False), - pytest.param((pd.DataFrame, True), - marks=pytest.mark.xfail)], - ids=id_func) +@pytest.fixture( + params=[ + (pd.Index, False), + (pd.Series, False), + (pd.DataFrame, False), + pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), + ], + ids=id_func, +) def box_transpose_fail(request): """ Fixture similar to `box` but testing both transpose cases for DataFrame, @@ -175,8 +214,7 @@ def box_transpose_fail(request): return request.param -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], - ids=id_func) +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) def box_with_array(request): """ Fixture to test behavior for Index, Series, DataFrame, and pandas Array diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 908e197ec1d28..6037273450a1c 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -17,8 +17,15 @@ import pandas as pd from pandas import ( - DatetimeIndex, NaT, Period, Series, Timedelta, TimedeltaIndex, Timestamp, - date_range) + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + date_range, +) from pandas.core.indexes.datetimes import _to_M8 import pandas.util.testing as tm @@ -37,6 +44,7 @@ def assert_all(obj): # ------------------------------------------------------------------ # Comparisons + class TestDatetime64ArrayLikeComparisons: # Comparison tests for datetime64 vectors fully parametrized over # DataFrame/Series/DatetimeIndex/DateteimeArray. Ideally all comparison @@ -47,7 +55,7 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture box = box_with_array xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - dti = date_range('20130101', periods=3, tz=tz) + dti = date_range("20130101", periods=3, tz=tz) other = np.array(dti.to_numpy()[0]) @@ -60,13 +68,17 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): class TestDatetime64DataFrameComparison: - @pytest.mark.parametrize('timestamps', [ - [pd.Timestamp('2012-01-01 13:00:00+00:00')] * 2, - [pd.Timestamp('2012-01-01 13:00:00')] * 2]) + @pytest.mark.parametrize( + "timestamps", + [ + [pd.Timestamp("2012-01-01 13:00:00+00:00")] * 2, + [pd.Timestamp("2012-01-01 13:00:00")] * 2, + ], + ) def test_tz_aware_scalar_comparison(self, timestamps): # GH#15966 - df = pd.DataFrame({'test': timestamps}) - expected = pd.DataFrame({'test': [False, False]}) + df = pd.DataFrame({"test": timestamps}) + expected = pd.DataFrame({"test": [False, False]}) tm.assert_frame_equal(df == -1, expected) def test_dt64_nat_comparison(self): @@ -82,21 +94,26 @@ def test_dt64_nat_comparison(self): class TestDatetime64SeriesComparison: # TODO: moved from tests.series.test_operators; needs cleanup - @pytest.mark.parametrize('pair', [ - ([pd.Timestamp('2011-01-01'), NaT, pd.Timestamp('2011-01-03')], - [NaT, NaT, pd.Timestamp('2011-01-03')]), - - ([pd.Timedelta('1 days'), NaT, pd.Timedelta('3 days')], - [NaT, NaT, pd.Timedelta('3 days')]), - - ([pd.Period('2011-01', freq='M'), NaT, - pd.Period('2011-03', freq='M')], - [NaT, NaT, pd.Period('2011-03', freq='M')]), - - ]) - @pytest.mark.parametrize('reverse', [True, False]) - @pytest.mark.parametrize('box', [Series, pd.Index]) - @pytest.mark.parametrize('dtype', [None, object]) + 
@pytest.mark.parametrize( + "pair", + [ + ( + [pd.Timestamp("2011-01-01"), NaT, pd.Timestamp("2011-01-03")], + [NaT, NaT, pd.Timestamp("2011-01-03")], + ), + ( + [pd.Timedelta("1 days"), NaT, pd.Timedelta("3 days")], + [NaT, NaT, pd.Timedelta("3 days")], + ), + ( + [pd.Period("2011-01", freq="M"), NaT, pd.Period("2011-03", freq="M")], + [NaT, NaT, pd.Period("2011-03", freq="M")], + ), + ], + ) + @pytest.mark.parametrize("reverse", [True, False]) + @pytest.mark.parametrize("box", [Series, pd.Index]) + @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons(self, dtype, box, reverse, pair): l, r = pair if reverse: @@ -131,7 +148,7 @@ def test_comparison_invalid(self, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray ser = Series(range(5)) - ser2 = Series(pd.date_range('20010101', periods=5)) + ser2 = Series(pd.date_range("20010101", periods=5)) ser = tm.box_expected(ser, box_with_array) ser2 = tm.box_expected(ser2, box_with_array) @@ -145,7 +162,7 @@ def test_comparison_invalid(self, box_with_array): result = x != y expected = tm.box_expected([True] * 5, xbox) tm.assert_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): x >= y with pytest.raises(TypeError, match=msg): @@ -155,12 +172,15 @@ def test_comparison_invalid(self, box_with_array): with pytest.raises(TypeError, match=msg): x <= y - @pytest.mark.parametrize('data', [ - [Timestamp('2011-01-01'), NaT, Timestamp('2011-01-03')], - [Timedelta('1 days'), NaT, Timedelta('3 days')], - [Period('2011-01', freq='M'), NaT, Period('2011-03', freq='M')] - ]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize( + "data", + [ + [Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")], + [Timedelta("1 days"), NaT, Timedelta("3 days")], + [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")], + ], + ) + @pytest.mark.parametrize("dtype", [None, object]) def test_nat_comparisons_scalar(self, dtype, data, box_with_array): if box_with_array is tm.to_array and dtype is object: # dont bother testing ndarray comparison methods as this fails @@ -195,7 +215,7 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): tm.assert_equal(NaT <= left, expected) def test_series_comparison_scalars(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) val = datetime(2000, 1, 4) result = series > val @@ -210,12 +230,12 @@ def test_series_comparison_scalars(self): def test_dt64_ser_cmp_date_warning(self): # https://github.com/pandas-dev/pandas/issues/21359 # Remove this test and enble invalid test below - ser = pd.Series(pd.date_range('20010101', periods=10), name='dates') + ser = pd.Series(pd.date_range("20010101", periods=10), name="dates") date = ser.iloc[0].to_pydatetime().date() with tm.assert_produces_warning(FutureWarning) as m: result = ser == date - expected = pd.Series([True] + [False] * 9, name='dates') + expected = pd.Series([True] + [False] * 9, name="dates") tm.assert_series_equal(result, expected) assert "Comparing Series of datetimes " in str(m[0].message) assert "will not compare equal" in str(m[0].message) @@ -232,18 +252,17 @@ def test_dt64_ser_cmp_date_warning(self): with tm.assert_produces_warning(FutureWarning) as m: result = ser < date - tm.assert_series_equal(result, pd.Series([False] * 10, name='dates')) + tm.assert_series_equal(result, pd.Series([False] * 10, name="dates")) assert "a TypeError 
will be raised" in str(m[0].message) with tm.assert_produces_warning(FutureWarning) as m: result = ser >= date - tm.assert_series_equal(result, pd.Series([True] * 10, name='dates')) + tm.assert_series_equal(result, pd.Series([True] * 10, name="dates")) assert "a TypeError will be raised" in str(m[0].message) with tm.assert_produces_warning(FutureWarning) as m: result = ser > date - tm.assert_series_equal(result, pd.Series([False] + [True] * 9, - name='dates')) + tm.assert_series_equal(result, pd.Series([False] + [True] * 9, name="dates")) assert "a TypeError will be raised" in str(m[0].message) @pytest.mark.skip(reason="GH#21359") @@ -252,7 +271,7 @@ def test_dt64ser_cmp_date_invalid(self, box_with_array): # match DatetimeIndex/Timestamp. This also matches the behavior # of stdlib datetime.datetime - ser = pd.date_range('20010101', periods=10) + ser = pd.date_range("20010101", periods=10) date = ser.iloc[0].to_pydatetime().date() ser = tm.box_expected(ser, box_with_array) @@ -267,12 +286,9 @@ def test_dt64ser_cmp_date_invalid(self, box_with_array): with pytest.raises(TypeError): ser <= date - @pytest.mark.parametrize("left,right", [ - ("lt", "gt"), - ("le", "ge"), - ("eq", "eq"), - ("ne", "ne"), - ]) + @pytest.mark.parametrize( + "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")] + ) def test_timestamp_compare_series(self, left, right): # see gh-4982 # Make sure we can compare Timestamps on the right AND left hand side. @@ -309,7 +325,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array): # GH#11034 xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - ser = pd.Series([pd.Timestamp('2000-01-29 01:59:00'), 'NaT']) + ser = pd.Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) ser = tm.box_expected(ser, box_with_array) result = ser != ser @@ -336,19 +352,20 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, False], xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat(self, op): # GH#18162 - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") # Check that there isn't a problem aware-aware and naive-naive do not # raise naive_series = Series(dr) aware_series = Series(dz) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dz, naive_series) with pytest.raises(TypeError, match=msg): @@ -361,10 +378,10 @@ def test_comparison_tzawareness_compat(self, op): class TestDatetimeIndexComparisons: # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate - @pytest.mark.parametrize("op", [ - operator.eq, operator.ne, operator.gt, operator.lt, - operator.ge, operator.le - ]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.lt, operator.ge, operator.le], + ) def test_comparators(self, op): index = tm.makeDateIndex(100) element = index[len(index) // 2] @@ -377,12 +394,13 @@ def test_comparators(self, op): assert isinstance(index_result, np.ndarray) tm.assert_numpy_array_equal(arr_result, index_result) - @pytest.mark.parametrize('other', [datetime(2016, 1, 1), - Timestamp('2016-01-01'), - 
np.datetime64('2016-01-01')]) + @pytest.mark.parametrize( + "other", + [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) if tz is not None: if isinstance(other, np.datetime64): # no tzaware version available @@ -413,7 +431,7 @@ def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): # GH#19301 by convention datetime.date is not considered comparable # to Timestamp or DatetimeIndex. This may change in the future. tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) other = datetime(2016, 1, 1).date() @@ -428,28 +446,29 @@ def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): with pytest.raises(TypeError): dtarr >= other - @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) def test_dti_eq_null_scalar(self, other, tz_naive_fixture): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) assert not (dti == other).any() - @pytest.mark.parametrize('other', [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) def test_dti_ne_null_scalar(self, other, tz_naive_fixture): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) assert (dti != other).all() - @pytest.mark.parametrize('other', [None, np.nan]) - def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other, - box_with_array): + @pytest.mark.parametrize("other", [None, np.nan]) + def test_dti_cmp_null_scalar_inequality( + self, tz_naive_fixture, other, box_with_array + ): # GH#19301 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): dtarr < other with pytest.raises(TypeError, match=msg): @@ -459,7 +478,7 @@ def test_dti_cmp_null_scalar_inequality(self, tz_naive_fixture, other, with pytest.raises(TypeError, match=msg): dtarr >= other - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_dti_cmp_nat(self, dtype, box_with_array): if box_with_array is tm.to_array and dtype is object: # dont bother testing ndarray comparison methods as this fails @@ -468,9 +487,10 @@ def test_dti_cmp_nat(self, dtype, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, - pd.Timestamp('2011-01-03')]) - right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + left = pd.DatetimeIndex( + [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] + ) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp("2011-01-03")]) left = tm.box_expected(left, box_with_array) right = tm.box_expected(right, box_with_array) @@ -508,15 +528,22 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0]) fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0]) - didx1 = 
pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) - didx2 = pd.DatetimeIndex(['2014-02-01', '2014-03-01', pd.NaT, pd.NaT, - '2014-06-01', '2014-07-01']) - darr = np.array([np_datetime64_compat('2014-02-01 00:00Z'), - np_datetime64_compat('2014-03-01 00:00Z'), - np_datetime64_compat('nat'), np.datetime64('nat'), - np_datetime64_compat('2014-06-01 00:00Z'), - np_datetime64_compat('2014-07-01 00:00Z')]) + didx1 = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) + didx2 = pd.DatetimeIndex( + ["2014-02-01", "2014-03-01", pd.NaT, pd.NaT, "2014-06-01", "2014-07-01"] + ) + darr = np.array( + [ + np_datetime64_compat("2014-02-01 00:00Z"), + np_datetime64_compat("2014-03-01 00:00Z"), + np_datetime64_compat("nat"), + np.datetime64("nat"), + np_datetime64_compat("2014-06-01 00:00Z"), + np_datetime64_compat("2014-07-01 00:00Z"), + ] + ) cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)] @@ -593,20 +620,21 @@ def test_dti_cmp_nat_behaves_like_float_cmp_nan(self): expected = np.array([True, True, False, True, True, True]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat(self, op, box_df_fail): # GH#18162 box = box_df_fail - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") dr = tm.box_expected(dr, box) dz = tm.box_expected(dz, box) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dr, dz) @@ -636,23 +664,24 @@ def test_comparison_tzawareness_compat(self, op, box_df_fail): assert (dr == list(dr)).all() assert (dz == list(dz)).all() - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): # GH#18162 - dr = pd.date_range('2016-01-01', periods=6) - dz = dr.tz_localize('US/Pacific') + dr = pd.date_range("2016-01-01", periods=6) + dz = dr.tz_localize("US/Pacific") dr = tm.box_expected(dr, box_with_array) dz = tm.box_expected(dz, box_with_array) # Check comparisons against scalar Timestamps - ts = pd.Timestamp('2000-03-14 01:59') - ts_tz = pd.Timestamp('2000-03-14 01:59', tz='Europe/Amsterdam') + ts = pd.Timestamp("2000-03-14 01:59") + ts_tz = pd.Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam") assert_all(dr > ts) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dr, ts_tz) @@ -664,49 +693,54 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): with pytest.raises(TypeError, match=msg): op(ts, dz) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) - @pytest.mark.parametrize('other', [datetime(2016, 1, 1), - Timestamp('2016-01-01'), - np.datetime64('2016-01-01')]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) + @pytest.mark.parametrize( + "other", 
+ [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")], + ) # Bug in NumPy? https://github.com/numpy/numpy/issues/13841 # Raising in __eq__ will fallback to NumPy, which warns, fails, # then re-raises the original exception. So we just need to ignore. @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") - def test_scalar_comparison_tzawareness(self, op, other, tz_aware_fixture, - box_with_array): + def test_scalar_comparison_tzawareness( + self, op, other, tz_aware_fixture, box_with_array + ): tz = tz_aware_fixture - dti = pd.date_range('2016-01-01', periods=2, tz=tz) + dti = pd.date_range("2016-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): op(dtarr, other) with pytest.raises(TypeError, match=msg): op(other, dtarr) - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.gt, operator.ge, - operator.lt, operator.le]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], + ) def test_nat_comparison_tzawareness(self, op): # GH#19276 # tzaware DatetimeIndex should not raise when compared to NaT - dti = pd.DatetimeIndex(['2014-01-01', pd.NaT, '2014-03-01', pd.NaT, - '2014-05-01', '2014-07-01']) + dti = pd.DatetimeIndex( + ["2014-01-01", pd.NaT, "2014-03-01", pd.NaT, "2014-05-01", "2014-07-01"] + ) expected = np.array([op == operator.ne] * len(dti)) result = op(dti, pd.NaT) tm.assert_numpy_array_equal(result, expected) - result = op(dti.tz_localize('US/Pacific'), pd.NaT) + result = op(dti.tz_localize("US/Pacific"), pd.NaT) tm.assert_numpy_array_equal(result, expected) def test_dti_cmp_str(self, tz_naive_fixture): # GH#22074 # regardless of tz, we expect these comparisons are valid tz = tz_naive_fixture - rng = date_range('1/1/2000', periods=10, tz=tz) - other = '1/1/2000' + rng = date_range("1/1/2000", periods=10, tz=tz) + other = "1/1/2000" result = rng == other expected = np.array([True] + [False] * 9) @@ -732,15 +766,13 @@ def test_dti_cmp_str(self, tz_naive_fixture): expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other', ['foo', 99, 4.0, - object(), timedelta(days=2)]) - def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, - box_with_array): + @pytest.mark.parametrize("other", ["foo", 99, 4.0, object(), timedelta(days=2)]) + def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): # GH#22074 tz = tz_naive_fixture xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - rng = date_range('1/1/2000', periods=10, tz=tz) + rng = date_range("1/1/2000", periods=10, tz=tz) rng = tm.box_expected(rng, box_with_array) result = rng == other @@ -752,7 +784,7 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, expected = np.array([True] * 10) expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): rng < other with pytest.raises(TypeError, match=msg): @@ -763,22 +795,26 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, rng >= other def test_dti_cmp_list(self): - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) result = rng == list(rng) expected = rng == rng tm.assert_numpy_array_equal(result, expected) - 
@pytest.mark.parametrize('other', [ - pd.timedelta_range('1D', periods=10), - pd.timedelta_range('1D', periods=10).to_series(), - pd.timedelta_range('1D', periods=10).asi8.view('m8[ns]') - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [ + pd.timedelta_range("1D", periods=10), + pd.timedelta_range("1D", periods=10).to_series(), + pd.timedelta_range("1D", periods=10).asi8.view("m8[ns]"), + ], + ids=lambda x: type(x).__name__, + ) def test_dti_cmp_tdi_tzawareness(self, other): # GH#22074 # reversion test that we _don't_ call _assert_tzawareness_compat # when comparing against TimedeltaIndex - dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo') + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") result = dti == other expected = np.array([False] * 10) @@ -787,7 +823,7 @@ def test_dti_cmp_tdi_tzawareness(self, other): result = dti != other expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - msg = 'Invalid comparison between' + msg = "Invalid comparison between" with pytest.raises(TypeError, match=msg): dti < other with pytest.raises(TypeError, match=msg): @@ -799,16 +835,16 @@ def test_dti_cmp_tdi_tzawareness(self, other): def test_dti_cmp_object_dtype(self): # GH#22074 - dti = date_range('2000-01-01', periods=10, tz='Asia/Tokyo') + dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo") - other = dti.astype('O') + other = dti.astype("O") result = dti == other expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) other = dti.tz_localize(None) - msg = 'Cannot compare tz-naive and tz-aware' + msg = "Cannot compare tz-naive and tz-aware" with pytest.raises(TypeError, match=msg): # tzawareness failure dti != other @@ -825,6 +861,7 @@ def test_dti_cmp_object_dtype(self): # ------------------------------------------------------------------ # Arithmetic + class TestDatetime64Arithmetic: # This class is intended for "finished" tests that are fully parametrized # over DataFrame/Series/Index/DatetimeArray @@ -832,14 +869,14 @@ class TestDatetime64Arithmetic: # ------------------------------------------------------------- # Addition/Subtraction of timedelta-like - def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_add_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): # GH#22005, GH#22163 check DataFrame doesn't raise TypeError tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -847,13 +884,13 @@ def test_dt64arr_add_timedeltalike_scalar(self, tz_naive_fixture, result = rng + two_hours tm.assert_equal(result, expected) - def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_iadd_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('2000-01-01 02:00', - '2000-02-01 02:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, 
box_with_array) @@ -861,13 +898,13 @@ def test_dt64arr_iadd_timedeltalike_scalar(self, tz_naive_fixture, rng += two_hours tm.assert_equal(rng, expected) - def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_sub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -875,13 +912,13 @@ def test_dt64arr_sub_timedeltalike_scalar(self, tz_naive_fixture, result = rng - two_hours tm.assert_equal(result, expected) - def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture, - two_hours, box_with_array): + def test_dt64arr_isub_timedeltalike_scalar( + self, tz_naive_fixture, two_hours, box_with_array + ): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - expected = pd.date_range('1999-12-31 22:00', - '2000-01-31 22:00', tz=tz) + rng = pd.date_range("2000-01-01", "2000-02-01", tz=tz) + expected = pd.date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -892,26 +929,28 @@ def test_dt64arr_isub_timedeltalike_scalar(self, tz_naive_fixture, def test_dt64arr_add_td64_scalar(self, box_with_array): # scalar timedeltas/np.timedelta64 objects # operate with np.timedelta64 correctly - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) - expected = Series([Timestamp('20130101 9:01:01'), - Timestamp('20130101 9:02:01')]) + expected = Series( + [Timestamp("20130101 9:01:01"), Timestamp("20130101 9:02:01")] + ) dtarr = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = dtarr + np.timedelta64(1, 's') + result = dtarr + np.timedelta64(1, "s") tm.assert_equal(result, expected) - result = np.timedelta64(1, 's') + dtarr + result = np.timedelta64(1, "s") + dtarr tm.assert_equal(result, expected) - expected = Series([Timestamp('20130101 9:01:00.005'), - Timestamp('20130101 9:02:00.005')]) + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) expected = tm.box_expected(expected, box_with_array) - result = dtarr + np.timedelta64(5, 'ms') + result = dtarr + np.timedelta64(5, "ms") tm.assert_equal(result, expected) - result = np.timedelta64(5, 'ms') + dtarr + result = np.timedelta64(5, "ms") + dtarr tm.assert_equal(result, expected) def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): @@ -933,19 +972,18 @@ def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture): tm.assert_equal(result, expected) result = obj - other tm.assert_equal(result, expected) - msg = 'cannot subtract' + msg = "cannot subtract" with pytest.raises(TypeError, match=msg): other - obj - def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) + 
tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values - expected = pd.date_range('2015-12-31', periods=3, tz=tz) + expected = pd.date_range("2015-12-31", periods=3, tz=tz) dtarr = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -955,28 +993,32 @@ def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, result = tdarr + dtarr tm.assert_equal(result, expected) - expected = pd.date_range('2016-01-02', periods=3, tz=tz) + expected = pd.date_range("2016-01-02", periods=3, tz=tz) expected = tm.box_expected(expected, box_with_array) result = dtarr - tdarr tm.assert_equal(result, expected) - msg = 'cannot subtract|bad operand type for unary -' + msg = "cannot subtract|bad operand type for unary -" with pytest.raises(TypeError, match=msg): tdarr - dtarr # ----------------------------------------------------------------- # Subtraction of datetime-like scalars - @pytest.mark.parametrize('ts', [ - pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-01').to_pydatetime(), - pd.Timestamp('2013-01-01').to_datetime64()]) + @pytest.mark.parametrize( + "ts", + [ + pd.Timestamp("2013-01-01"), + pd.Timestamp("2013-01-01").to_pydatetime(), + pd.Timestamp("2013-01-01").to_datetime64(), + ], + ) def test_dt64arr_sub_dtscalar(self, box_with_array, ts): # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype - idx = pd.date_range('2013-01-01', periods=3) + idx = pd.date_range("2013-01-01", periods=3) idx = tm.box_expected(idx, box_with_array) - expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days']) + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) expected = tm.box_expected(expected, box_with_array) result = idx - ts @@ -985,13 +1027,13 @@ def test_dt64arr_sub_dtscalar(self, box_with_array, ts): def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano # for DataFrame operation - dt64 = np.datetime64('2013-01-01') - assert dt64.dtype == 'datetime64[D]' + dt64 = np.datetime64("2013-01-01") + assert dt64.dtype == "datetime64[D]" - dti = pd.date_range('20130101', periods=3) + dti = pd.date_range("20130101", periods=3) dtarr = tm.box_expected(dti, box_with_array) - expected = pd.TimedeltaIndex(['0 Days', '1 Day', '2 Days']) + expected = pd.TimedeltaIndex(["0 Days", "1 Day", "2 Days"]) expected = tm.box_expected(expected, box_with_array) result = dtarr - dt64 @@ -1001,14 +1043,12 @@ def test_dt64arr_sub_datetime64_not_ns(self, box_with_array): tm.assert_equal(result, -expected) def test_dt64arr_sub_timestamp(self, box_with_array): - ser = pd.date_range('2014-03-17', periods=2, freq='D', - tz='US/Eastern') + ser = pd.date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern") ts = ser[0] ser = tm.box_expected(ser, box_with_array) - delta_series = pd.Series([np.timedelta64(0, 'D'), - np.timedelta64(1, 'D')]) + delta_series = pd.Series([np.timedelta64(0, "D"), np.timedelta64(1, "D")]) expected = tm.box_expected(delta_series, box_with_array) tm.assert_equal(ser - ts, expected) @@ -1016,19 +1056,19 @@ def test_dt64arr_sub_timestamp(self, box_with_array): def test_dt64arr_sub_NaT(self, box_with_array): # GH#18808 - dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp('19900315')]) + dti = pd.DatetimeIndex([pd.NaT, pd.Timestamp("19900315")]) ser = tm.box_expected(dti, box_with_array) result = ser - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = 
tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) - dti_tz = dti.tz_localize('Asia/Tokyo') + dti_tz = dti.tz_localize("Asia/Tokyo") ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - pd.NaT - expected = pd.Series([pd.NaT, pd.NaT], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, pd.NaT], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1036,7 +1076,7 @@ def test_dt64arr_sub_NaT(self, box_with_array): # Subtraction of datetime-like array-like def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3, tz=None) + dti = pd.date_range("2016-01-01", periods=3, tz=None) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) @@ -1047,15 +1087,16 @@ def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array): result = dt64vals - dtarr tm.assert_equal(result, expected) - def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, - box_with_array): + def test_dt64arr_aware_sub_dt64ndarray_raises( + self, tz_aware_fixture, box_with_array + ): tz = tz_aware_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = 'subtraction must have the same timezones or' + msg = "subtraction must have the same timezones or" with pytest.raises(TypeError, match=msg): dtarr - dt64vals with pytest.raises(TypeError, match=msg): @@ -1064,15 +1105,14 @@ def test_dt64arr_aware_sub_dt64ndarray_raises(self, tz_aware_fixture, # ------------------------------------------------------------- # Addition of datetime-like others (invalid) - def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, box_with_array): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) dt64vals = dti.values dtarr = tm.box_expected(dti, box_with_array) - msg = 'cannot add' + msg = "cannot add" with pytest.raises(TypeError, match=msg): dtarr + dt64vals with pytest.raises(TypeError, match=msg): @@ -1080,22 +1120,22 @@ def test_dt64arr_add_dt64ndarray_raises(self, tz_naive_fixture, def test_dt64arr_add_timestamp_raises(self, box_with_array): # GH#22163 ensure DataFrame doesn't cast Timestamp to i8 - idx = DatetimeIndex(['2011-01-01', '2011-01-02']) + idx = DatetimeIndex(["2011-01-01", "2011-01-02"]) idx = tm.box_expected(idx, box_with_array) - msg = 'cannot add' + msg = "cannot add" with pytest.raises(TypeError, match=msg): - idx + Timestamp('2011-01-01') + idx + Timestamp("2011-01-01") with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01') + idx + Timestamp("2011-01-01") + idx # ------------------------------------------------------------- # Other Invalid Addition/Subtraction - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) def test_dt64arr_add_sub_float(self, other, box_with_array): - dti = DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') + dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") dtarr = tm.box_expected(dti, box_with_array) - msg = '|'.join(['unsupported operand type', 'cannot (add|subtract)']) + msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) with pytest.raises(TypeError, match=msg): dtarr + other with pytest.raises(TypeError, 
match=msg): @@ -1105,18 +1145,25 @@ def test_dt64arr_add_sub_float(self, other, box_with_array): with pytest.raises(TypeError, match=msg): other - dtarr - @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) - @pytest.mark.parametrize('dti_freq', [None, 'D']) - def test_dt64arr_add_sub_parr(self, dti_freq, pi_freq, - box_with_array, box_with_array2): + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("dti_freq", [None, "D"]) + def test_dt64arr_add_sub_parr( + self, dti_freq, pi_freq, box_with_array, box_with_array2 + ): # GH#20049 subtracting PeriodIndex should raise TypeError - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq) + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) pi = dti.to_period(pi_freq) dtarr = tm.box_expected(dti, box_with_array) parr = tm.box_expected(pi, box_with_array2) - msg = '|'.join(['cannot (add|subtract)', 'unsupported operand', - 'descriptor.*requires', 'ufunc.*cannot use operands']) + msg = "|".join( + [ + "cannot (add|subtract)", + "unsupported operand", + "descriptor.*requires", + "ufunc.*cannot use operands", + ] + ) with pytest.raises(TypeError, match=msg): dtarr + parr with pytest.raises(TypeError, match=msg): @@ -1126,15 +1173,15 @@ def test_dt64arr_add_sub_parr(self, dti_freq, pi_freq, with pytest.raises(TypeError, match=msg): parr - dtarr - @pytest.mark.parametrize('dti_freq', [None, 'D']) + @pytest.mark.parametrize("dti_freq", [None, "D"]) def test_dt64arr_add_sub_period_scalar(self, dti_freq, box_with_array): # GH#13078 # not supported, check TypeError - per = pd.Period('2011-01-01', freq='D') + per = pd.Period("2011-01-01", freq="D") - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=dti_freq) + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq) dtarr = tm.box_expected(idx, box_with_array) - msg = '|'.join(['unsupported operand type', 'cannot (add|subtract)']) + msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) with pytest.raises(TypeError, match=msg): dtarr + per with pytest.raises(TypeError, match=msg): @@ -1154,9 +1201,10 @@ class TestDatetime64DateOffsetArithmetic: def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): # GH#4532 # operate with pd.offsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) - expected = Series([Timestamp('20130101 9:01:05'), - Timestamp('20130101 9:02:05')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:01:05"), Timestamp("20130101 9:02:05")] + ) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1170,9 +1218,10 @@ def test_dt64arr_series_add_tick_DateOffset(self, box_with_array): def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): # GH#4532 # operate with pd.offsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) - expected = Series([Timestamp('20130101 9:00:55'), - Timestamp('20130101 9:01:55')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) + expected = Series( + [Timestamp("20130101 9:00:55"), Timestamp("20130101 9:01:55")] + ) ser = tm.box_expected(ser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1186,13 +1235,13 @@ def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array): with pytest.raises(TypeError, match=msg): pd.offsets.Second(5) - ser - @pytest.mark.parametrize('cls_name', ['Day', 'Hour', 'Minute', 'Second', - 'Milli', 
'Micro', 'Nano']) - def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, - box_with_array): + @pytest.mark.parametrize( + "cls_name", ["Day", "Hour", "Minute", "Second", "Milli", "Micro", "Nano"] + ) + def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, box_with_array): # GH#4532 # smoke tests for valid DateOffsets - ser = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) ser = tm.box_expected(ser, box_with_array) offset_cls = getattr(pd.offsets, cls_name) @@ -1203,15 +1252,17 @@ def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype tz = tz_aware_fixture - if tz == 'US/Pacific': - dates = date_range('2012-11-01', periods=3, tz=tz) + if tz == "US/Pacific": + dates = date_range("2012-11-01", periods=3, tz=tz) offset = dates + pd.offsets.Hour(5) assert dates[0] + pd.offsets.Hour(5) == offset[0] - dates = date_range('2010-11-01 00:00', - periods=3, tz=tz, freq='H') - expected = DatetimeIndex(['2010-11-01 05:00', '2010-11-01 06:00', - '2010-11-01 07:00'], freq='H', tz=tz) + dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="H") + expected = DatetimeIndex( + ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"], + freq="H", + tz=tz, + ) dates = tm.box_expected(dates, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1219,7 +1270,7 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): # TODO: parametrize over the scalar being added? radd? sub? offset = dates + pd.offsets.Hour(5) tm.assert_equal(offset, expected) - offset = dates + np.timedelta64(5, 'h') + offset = dates + np.timedelta64(5, "h") tm.assert_equal(offset, expected) offset = dates + timedelta(hours=5) tm.assert_equal(offset, expected) @@ -1229,21 +1280,31 @@ def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array): def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): # GH#10699 - vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-03-31'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) vec = tm.box_expected(vec, box_with_array) vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec # DateOffset relativedelta fastpath - relative_kwargs = [('years', 2), ('months', 5), ('days', 3), - ('hours', 5), ('minutes', 10), ('seconds', 2), - ('microseconds', 5)] + relative_kwargs = [ + ("years", 2), + ("months", 5), + ("days", 3), + ("hours", 5), + ("minutes", 10), + ("seconds", 2), + ("microseconds", 5), + ] for i, kwd in enumerate(relative_kwargs): off = pd.DateOffset(**dict([kwd])) @@ -1255,7 +1316,7 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(expected, vec - off) - off = pd.DateOffset(**dict(relative_kwargs[:i + 1])) + off = pd.DateOffset(**dict(relative_kwargs[: i + 1])) expected = DatetimeIndex([x + off for x in vec_items]) expected = tm.box_expected(expected, box_with_array) @@ 
-1273,30 +1334,57 @@ def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array): # TODO: redundant with test_dt64arr_add_sub_DateOffset? that includes # tz-aware cases which this does not - @pytest.mark.parametrize('cls_and_kwargs', [ - 'YearBegin', ('YearBegin', {'month': 5}), - 'YearEnd', ('YearEnd', {'month': 5}), - 'MonthBegin', 'MonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'Week', ('Week', {'weekday': 3}), - 'Week', ('Week', {'weekday': 6}), - 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', - 'CustomBusinessDay', 'CDay', 'CBMonthEnd', - 'CBMonthBegin', 'BMonthBegin', 'BMonthEnd', - 'BusinessHour', 'BYearBegin', 'BYearEnd', - 'BQuarterBegin', ('LastWeekOfMonth', {'weekday': 2}), - ('FY5253Quarter', {'qtr_with_extra_week': 1, - 'startingMonth': 1, - 'weekday': 2, - 'variation': 'nearest'}), - ('FY5253', {'weekday': 0, 'startingMonth': 2, 'variation': 'nearest'}), - ('WeekOfMonth', {'weekday': 2, 'week': 2}), - 'Easter', ('DateOffset', {'day': 4}), - ('DateOffset', {'month': 5})]) - @pytest.mark.parametrize('normalize', [True, False]) - @pytest.mark.parametrize('n', [0, 5]) - def test_dt64arr_add_sub_DateOffsets(self, box_with_array, - n, normalize, cls_and_kwargs): + @pytest.mark.parametrize( + "cls_and_kwargs", + [ + "YearBegin", + ("YearBegin", {"month": 5}), + "YearEnd", + ("YearEnd", {"month": 5}), + "MonthBegin", + "MonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + "Week", + ("Week", {"weekday": 3}), + "Week", + ("Week", {"weekday": 6}), + "BusinessDay", + "BDay", + "QuarterEnd", + "QuarterBegin", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "BMonthBegin", + "BMonthEnd", + "BusinessHour", + "BYearBegin", + "BYearEnd", + "BQuarterBegin", + ("LastWeekOfMonth", {"weekday": 2}), + ( + "FY5253Quarter", + { + "qtr_with_extra_week": 1, + "startingMonth": 1, + "weekday": 2, + "variation": "nearest", + }, + ), + ("FY5253", {"weekday": 0, "startingMonth": 2, "variation": "nearest"}), + ("WeekOfMonth", {"weekday": 2, "week": 2}), + "Easter", + ("DateOffset", {"day": 4}), + ("DateOffset", {"month": 5}), + ], + ) + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [0, 5]) + def test_dt64arr_add_sub_DateOffsets( + self, box_with_array, n, normalize, cls_and_kwargs + ): # GH#10699 # assert vectorized operation matches pointwise operations @@ -1308,19 +1396,27 @@ def test_dt64arr_add_sub_DateOffsets(self, box_with_array, cls_name = cls_and_kwargs kwargs = {} - if n == 0 and cls_name in ['WeekOfMonth', 'LastWeekOfMonth', - 'FY5253Quarter', 'FY5253']: + if n == 0 and cls_name in [ + "WeekOfMonth", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + ]: # passing n = 0 is invalid for these offset classes return - vec = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-03-31'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31'), - Timestamp('2000-05-15'), - Timestamp('2001-06-15')]) + vec = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-03-31"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + Timestamp("2000-05-15"), + Timestamp("2001-06-15"), + ] + ) vec = tm.box_expected(vec, box_with_array) vec_items = vec.squeeze() if box_with_array is pd.DataFrame else vec @@ -1351,40 +1447,58 @@ def test_dt64arr_add_sub_DateOffsets(self, box_with_array, def test_dt64arr_add_sub_DateOffset(self, box_with_array): # GH#10699 - s = date_range('2000-01-01', 
'2000-01-31', name='a') + s = date_range("2000-01-01", "2000-01-31", name="a") s = tm.box_expected(s, box_with_array) result = s + pd.DateOffset(years=1) result2 = pd.DateOffset(years=1) + s - exp = date_range('2001-01-01', '2001-01-31', name='a') + exp = date_range("2001-01-01", "2001-01-31", name="a") exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) result = s - pd.DateOffset(years=1) - exp = date_range('1999-01-01', '1999-01-31', name='a') + exp = date_range("1999-01-01", "1999-01-31", name="a") exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) - s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) s = tm.box_expected(s, box_with_array) result = s + pd.offsets.Day() result2 = pd.offsets.Day() + s - exp = DatetimeIndex([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')], - name='a') + exp = DatetimeIndex( + [ + Timestamp("2000-01-16 00:15:00", tz="US/Central"), + Timestamp("2000-02-16", tz="US/Central"), + ], + name="a", + ) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = DatetimeIndex([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = DatetimeIndex( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) s = tm.box_expected(s, box_with_array) result = s + pd.offsets.MonthEnd() result2 = pd.offsets.MonthEnd() + s - exp = DatetimeIndex([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], - name='a') + exp = DatetimeIndex( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -1393,90 +1507,114 @@ def test_dt64arr_add_sub_DateOffset(self, box_with_array): def test_dt64arr_add_mixed_offset_array(self, box_with_array): # GH#10699 # array of offsets - s = DatetimeIndex([Timestamp('2000-1-1'), Timestamp('2000-2-1')]) + s = DatetimeIndex([Timestamp("2000-1-1"), Timestamp("2000-2-1")]) s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): - other = pd.Index([pd.offsets.DateOffset(years=1), - pd.offsets.MonthEnd()]) + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): + other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other - exp = DatetimeIndex([Timestamp('2001-1-1'), - Timestamp('2000-2-29')]) + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2000-2-29")]) exp = tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) # same offset - other = pd.Index([pd.offsets.DateOffset(years=1), - pd.offsets.DateOffset(years=1)]) + other = pd.Index( + [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)] + ) other = tm.box_expected(other, box_with_array) result = s + other - exp = DatetimeIndex([Timestamp('2001-1-1'), - Timestamp('2001-2-1')]) + exp = DatetimeIndex([Timestamp("2001-1-1"), Timestamp("2001-2-1")]) exp 
= tm.box_expected(exp, box_with_array) tm.assert_equal(result, exp) # TODO: overlap with test_dt64arr_add_mixed_offset_array? - def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, - box_with_array): + def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): # GH#18849 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz) + dti = pd.date_range("2017-01-01", periods=2, tz=tz) dtarr = tm.box_expected(dti, box_with_array) other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res = dtarr + other - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))], - name=dti.name, freq='infer') + expected = DatetimeIndex( + [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning(warn, clear=[pd.core.arrays.datetimelike]): res = dtarr - other - expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))], - name=dti.name, freq='infer') + expected = DatetimeIndex( + [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" + ) expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - @pytest.mark.parametrize("op, offset, exp, exp_freq", [ - ('__add__', pd.DateOffset(months=3, days=10), - [Timestamp('2014-04-11'), Timestamp('2015-04-11'), - Timestamp('2016-04-11'), Timestamp('2017-04-11')], - None), - ('__add__', pd.DateOffset(months=3), - [Timestamp('2014-04-01'), Timestamp('2015-04-01'), - Timestamp('2016-04-01'), Timestamp('2017-04-01')], - "AS-APR"), - ('__sub__', pd.DateOffset(months=3, days=10), - [Timestamp('2013-09-21'), Timestamp('2014-09-21'), - Timestamp('2015-09-21'), Timestamp('2016-09-21')], - None), - ('__sub__', pd.DateOffset(months=3), - [Timestamp('2013-10-01'), Timestamp('2014-10-01'), - Timestamp('2015-10-01'), Timestamp('2016-10-01')], - "AS-OCT") - ]) - def test_dti_add_sub_nonzero_mth_offset(self, op, offset, - exp, exp_freq, - tz_aware_fixture, - box_with_array): + @pytest.mark.parametrize( + "op, offset, exp, exp_freq", + [ + ( + "__add__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2014-04-11"), + Timestamp("2015-04-11"), + Timestamp("2016-04-11"), + Timestamp("2017-04-11"), + ], + None, + ), + ( + "__add__", + pd.DateOffset(months=3), + [ + Timestamp("2014-04-01"), + Timestamp("2015-04-01"), + Timestamp("2016-04-01"), + Timestamp("2017-04-01"), + ], + "AS-APR", + ), + ( + "__sub__", + pd.DateOffset(months=3, days=10), + [ + Timestamp("2013-09-21"), + Timestamp("2014-09-21"), + Timestamp("2015-09-21"), + Timestamp("2016-09-21"), + ], + None, + ), + ( + "__sub__", + pd.DateOffset(months=3), + [ + Timestamp("2013-10-01"), + Timestamp("2014-10-01"), + Timestamp("2015-10-01"), + Timestamp("2016-10-01"), + ], + "AS-OCT", + ), + ], + ) + def test_dti_add_sub_nonzero_mth_offset( + self, op, offset, exp, exp_freq, tz_aware_fixture, box_with_array + ): # GH 26258 tz = tz_aware_fixture - date = 
date_range(start='01 Jan 2014', end='01 Jan 2017', freq='AS', - tz=tz) + date = date_range(start="01 Jan 2014", end="01 Jan 2017", freq="AS", tz=tz) date = tm.box_expected(date, box_with_array, False) mth = getattr(date, op) result = mth(offset) @@ -1491,7 +1629,7 @@ class TestDatetime64OverflowHandling: def test_dt64_overflow_masking(self, box_with_array): # GH#25317 - left = Series([Timestamp('1969-12-31')]) + left = Series([Timestamp("1969-12-31")]) right = Series([NaT]) left = tm.box_expected(left, box_with_array) @@ -1505,11 +1643,11 @@ def test_dt64_overflow_masking(self, box_with_array): def test_dt64_series_arith_overflow(self): # GH#12534, fixed by GH#19024 - dt = pd.Timestamp('1700-01-31') - td = pd.Timedelta('20000 Days') - dti = pd.date_range('1949-09-30', freq='100Y', periods=4) + dt = pd.Timestamp("1700-01-31") + td = pd.Timedelta("20000 Days") + dti = pd.date_range("1949-09-30", freq="100Y", periods=4) ser = pd.Series(dti) - msg = 'Overflow in int64 addition' + msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): ser - dt with pytest.raises(OverflowError, match=msg): @@ -1520,37 +1658,43 @@ def test_dt64_series_arith_overflow(self): td + ser ser.iloc[-1] = pd.NaT - expected = pd.Series(['2004-10-03', '2104-10-04', '2204-10-04', 'NaT'], - dtype='datetime64[ns]') + expected = pd.Series( + ["2004-10-03", "2104-10-04", "2204-10-04", "NaT"], dtype="datetime64[ns]" + ) res = ser + td tm.assert_series_equal(res, expected) res = td + ser tm.assert_series_equal(res, expected) ser.iloc[1:] = pd.NaT - expected = pd.Series(['91279 Days', 'NaT', 'NaT', 'NaT'], - dtype='timedelta64[ns]') + expected = pd.Series( + ["91279 Days", "NaT", "NaT", "NaT"], dtype="timedelta64[ns]" + ) res = ser - dt tm.assert_series_equal(res, expected) res = dt - ser tm.assert_series_equal(res, -expected) def test_datetimeindex_sub_timestamp_overflow(self): - dtimax = pd.to_datetime(['now', pd.Timestamp.max]) - dtimin = pd.to_datetime(['now', pd.Timestamp.min]) - - tsneg = Timestamp('1950-01-01') - ts_neg_variants = [tsneg, - tsneg.to_pydatetime(), - tsneg.to_datetime64().astype('datetime64[ns]'), - tsneg.to_datetime64().astype('datetime64[D]')] - - tspos = Timestamp('1980-01-01') - ts_pos_variants = [tspos, - tspos.to_pydatetime(), - tspos.to_datetime64().astype('datetime64[ns]'), - tspos.to_datetime64().astype('datetime64[D]')] - msg = 'Overflow in int64 addition' + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] + msg = "Overflow in int64 addition" for variant in ts_neg_variants: with pytest.raises(OverflowError, match=msg): dtimax - variant @@ -1571,11 +1715,11 @@ def test_datetimeindex_sub_timestamp_overflow(self): def test_datetimeindex_sub_datetimeindex_overflow(self): # GH#22492, GH#22508 - dtimax = pd.to_datetime(['now', pd.Timestamp.max]) - dtimin = pd.to_datetime(['now', pd.Timestamp.min]) + dtimax = pd.to_datetime(["now", pd.Timestamp.max]) + dtimin = pd.to_datetime(["now", pd.Timestamp.min]) - ts_neg = pd.to_datetime(['1950-01-01', '1950-01-01']) - ts_pos = pd.to_datetime(['1980-01-01', '1980-01-01']) + ts_neg = 
pd.to_datetime(["1950-01-01", "1950-01-01"]) + ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]) # General tests expected = pd.Timestamp.max.value - ts_pos[1].value @@ -1585,7 +1729,7 @@ def test_datetimeindex_sub_datetimeindex_overflow(self): expected = pd.Timestamp.min.value - ts_neg[1].value result = dtimin - ts_neg assert result[1].value == expected - msg = 'Overflow in int64 addition' + msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): dtimax - ts_neg @@ -1594,26 +1738,25 @@ def test_datetimeindex_sub_datetimeindex_overflow(self): # Edge cases tmin = pd.to_datetime([pd.Timestamp.min]) - t1 = tmin + pd.Timedelta.max + pd.Timedelta('1us') + t1 = tmin + pd.Timedelta.max + pd.Timedelta("1us") with pytest.raises(OverflowError, match=msg): t1 - tmin tmax = pd.to_datetime([pd.Timestamp.max]) - t2 = tmax + pd.Timedelta.min - pd.Timedelta('1us') + t2 = tmax + pd.Timedelta.min - pd.Timedelta("1us") with pytest.raises(OverflowError, match=msg): tmax - t2 class TestTimestampSeriesArithmetic: - def test_empty_series_add_sub(self): # GH#13844 - a = Series(dtype='M8[ns]') - b = Series(dtype='m8[ns]') + a = Series(dtype="M8[ns]") + b = Series(dtype="m8[ns]") tm.assert_series_equal(a, a + b) tm.assert_series_equal(a, a - b) tm.assert_series_equal(a, b + a) - msg = 'cannot subtract' + msg = "cannot subtract" with pytest.raises(TypeError, match=msg): b - a @@ -1624,11 +1767,21 @@ def test_operators_datetimelike(self): td1.iloc[2] = np.nan # ## datetime64 ### - dt1 = Series([pd.Timestamp('20111230'), pd.Timestamp('20120101'), - pd.Timestamp('20120103')]) + dt1 = Series( + [ + pd.Timestamp("20111230"), + pd.Timestamp("20120101"), + pd.Timestamp("20120103"), + ] + ) dt1.iloc[2] = np.nan - dt2 = Series([pd.Timestamp('20111231'), pd.Timestamp('20120102'), - pd.Timestamp('20120104')]) + dt2 = Series( + [ + pd.Timestamp("20111231"), + pd.Timestamp("20120102"), + pd.Timestamp("20120104"), + ] + ) dt1 - dt2 dt2 - dt1 @@ -1648,7 +1801,7 @@ def test_dt64ser_sub_datetime_dtype(self): dt = datetime(1993, 6, 22, 13, 30) ser = Series([ts]) result = pd.to_timedelta(np.abs(ser - dt)) - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" # ------------------------------------------------------------- # TODO: This next block of tests came from tests.series.test_operators, @@ -1666,8 +1819,9 @@ def check(get_ser, test_ser): op = getattr(get_ser, op_str, None) # Previously, _validate_for_numeric_binop in core/indexes/base.py # did this for us. - with pytest.raises(TypeError, - match='operate|[cC]annot|unsupported operand'): + with pytest.raises( + TypeError, match="operate|[cC]annot|unsupported operand" + ): op(test_ser) # ## timedelta64 ### @@ -1675,50 +1829,51 @@ def check(get_ser, test_ser): td1.iloc[2] = np.nan # ## datetime64 ### - dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), - Timestamp('20120103')]) + dt1 = Series( + [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + ) dt1.iloc[2] = np.nan - dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), - Timestamp('20120104')]) - if op_str not in ['__sub__', '__rsub__']: + dt2 = Series( + [Timestamp("20111231"), Timestamp("20120102"), Timestamp("20120104")] + ) + if op_str not in ["__sub__", "__rsub__"]: check(dt1, dt2) # ## datetime64 with timetimedelta ### # TODO(jreback) __rsub__ should raise? 
- if op_str not in ['__add__', '__radd__', '__sub__']: + if op_str not in ["__add__", "__radd__", "__sub__"]: check(dt1, td1) # 8260, 10763 # datetime64 with tz - tz = 'US/Eastern' - dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz=tz), name='foo') + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H')) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) td2 = td1.copy() td2.iloc[1] = np.nan - if op_str not in ['__add__', '__radd__', '__sub__', '__rsub__']: + if op_str not in ["__add__", "__radd__", "__sub__", "__rsub__"]: check(dt2, td2) def test_sub_single_tz(self): # GH#12290 - s1 = Series([pd.Timestamp('2016-02-10', tz='America/Sao_Paulo')]) - s2 = Series([pd.Timestamp('2016-02-08', tz='America/Sao_Paulo')]) + s1 = Series([pd.Timestamp("2016-02-10", tz="America/Sao_Paulo")]) + s2 = Series([pd.Timestamp("2016-02-08", tz="America/Sao_Paulo")]) result = s1 - s2 - expected = Series([Timedelta('2days')]) + expected = Series([Timedelta("2days")]) tm.assert_series_equal(result, expected) result = s2 - s1 - expected = Series([Timedelta('-2days')]) + expected = Series([Timedelta("-2days")]) tm.assert_series_equal(result, expected) def test_dt64tz_series_sub_dtitz(self): # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series # (with same tz) raises, fixed by #19024 - dti = pd.date_range('1999-09-30', periods=10, tz='US/Pacific') + dti = pd.date_range("1999-09-30", periods=10, tz="US/Pacific") ser = pd.Series(dti) - expected = pd.Series(pd.TimedeltaIndex(['0days'] * 10)) + expected = pd.Series(pd.TimedeltaIndex(["0days"] * 10)) res = dti - ser tm.assert_series_equal(res, expected) @@ -1729,68 +1884,78 @@ def test_sub_datetime_compat(self): # see GH#14088 s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT]) dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) - exp = Series([Timedelta('1 days'), pd.NaT]) + exp = Series([Timedelta("1 days"), pd.NaT]) tm.assert_series_equal(s - dt, exp) tm.assert_series_equal(s - Timestamp(dt), exp) def test_dt64_series_add_mixed_tick_DateOffset(self): # GH#4532 # operate with pd.offsets - s = Series([Timestamp('20130101 9:01'), Timestamp('20130101 9:02')]) + s = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]) result = s + pd.offsets.Milli(5) result2 = pd.offsets.Milli(5) + s - expected = Series([Timestamp('20130101 9:01:00.005'), - Timestamp('20130101 9:02:00.005')]) + expected = Series( + [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")] + ) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5) - expected = Series([Timestamp('20130101 9:06:00.005'), - Timestamp('20130101 9:07:00.005')]) + expected = Series( + [Timestamp("20130101 9:06:00.005"), Timestamp("20130101 9:07:00.005")] + ) tm.assert_series_equal(result, expected) def test_datetime64_ops_nat(self): # GH#11349 - datetime_series = Series([NaT, Timestamp('19900315')]) - nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]') - single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]') + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") # subtraction - tm.assert_series_equal(-NaT + datetime_series, - 
nat_series_dtype_timestamp) - msg = 'Unary negative expects' + tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) + msg = "Unary negative expects" with pytest.raises(TypeError, match=msg): -single_nat_dtype_datetime + datetime_series - tm.assert_series_equal(-NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) + tm.assert_series_equal( + -NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) with pytest.raises(TypeError, match=msg): -single_nat_dtype_datetime + nat_series_dtype_timestamp # addition - tm.assert_series_equal(nat_series_dtype_timestamp + NaT, - nat_series_dtype_timestamp) - tm.assert_series_equal(NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timestamp + NaT, - nat_series_dtype_timestamp) - tm.assert_series_equal(NaT + nat_series_dtype_timestamp, - nat_series_dtype_timestamp) + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp + ) # ------------------------------------------------------------- # Invalid Operations # TODO: this block also needs to be de-duplicated and parametrized - @pytest.mark.parametrize('dt64_series', [ - Series([Timestamp('19900315'), Timestamp('19900315')]), - Series([pd.NaT, Timestamp('19900315')]), - Series([pd.NaT, pd.NaT], dtype='datetime64[ns]')]) - @pytest.mark.parametrize('one', [1, 1.0, np.array(1)]) + @pytest.mark.parametrize( + "dt64_series", + [ + Series([Timestamp("19900315"), Timestamp("19900315")]), + Series([pd.NaT, Timestamp("19900315")]), + Series([pd.NaT, pd.NaT], dtype="datetime64[ns]"), + ], + ) + @pytest.mark.parametrize("one", [1, 1.0, np.array(1)]) def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): # multiplication - msg = 'cannot perform .* with this index type' + msg = "cannot perform .* with this index type" with pytest.raises(TypeError, match=msg): dt64_series * one with pytest.raises(TypeError, match=msg): @@ -1802,21 +1967,24 @@ def test_dt64_mul_div_numeric_invalid(self, one, dt64_series): with pytest.raises(TypeError, match=msg): one / dt64_series - @pytest.mark.parametrize('op', ['__add__', '__radd__', - '__sub__', '__rsub__']) - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + @pytest.mark.parametrize("op", ["__add__", "__radd__", "__sub__", "__rsub__"]) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_dt64_series_add_intlike(self, tz, op): # GH#19123 - dti = pd.DatetimeIndex(['2016-01-02', '2016-02-03', 'NaT'], tz=tz) + dti = pd.DatetimeIndex(["2016-01-02", "2016-02-03", "NaT"], tz=tz) ser = Series(dti) - other = Series([20, 30, 40], dtype='uint8') + other = Series([20, 30, 40], dtype="uint8") method = getattr(ser, op) - msg = '|'.join(['incompatible type for a .* operation', - 'cannot evaluate a numeric op', - 'ufunc .* cannot use operands', - 'cannot (add|subtract)']) + msg = "|".join( + [ + "incompatible type for a .* operation", + "cannot evaluate a numeric op", + "ufunc .* cannot use operands", + "cannot (add|subtract)", + ] + ) with pytest.raises(TypeError, match=msg): method(1) with pytest.raises(TypeError, match=msg): @@ -1830,13 +1998,12 @@ def test_dt64_series_add_intlike(self, tz, op): # Timezone-Centric Tests def 
test_operators_datetimelike_with_timezones(self): - tz = 'US/Eastern' - dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz=tz), name='foo') + tz = "US/Eastern" + dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo") dt2 = dt1.copy() dt2.iloc[2] = np.nan - td1 = Series(pd.timedelta_range('1 days 1 min', periods=5, freq='H')) + td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="H")) td2 = td1.copy() td2.iloc[1] = np.nan @@ -1885,7 +2052,7 @@ def test_operators_datetimelike_with_timezones(self): result = dt2 - td2 exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz) tm.assert_series_equal(result, exp) - msg = 'cannot (add|subtract)' + msg = "cannot (add|subtract)" with pytest.raises(TypeError, match=msg): td1 - dt1 with pytest.raises(TypeError, match=msg): @@ -1900,40 +2067,32 @@ class TestDatetimeIndexArithmetic: def test_dti_add_int(self, tz_naive_fixture, one): # Variants of `one` for #19012 tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = rng + one - expected = pd.date_range('2000-01-01 10:00', freq='H', - periods=10, tz=tz) + expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(result, expected) def test_dti_iadd_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) - expected = pd.date_range('2000-01-01 10:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) + expected = pd.date_range("2000-01-01 10:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rng += one tm.assert_index_equal(rng, expected) def test_dti_sub_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = rng - one - expected = pd.date_range('2000-01-01 08:00', freq='H', - periods=10, tz=tz) + expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(result, expected) def test_dti_isub_int(self, tz_naive_fixture, one): tz = tz_naive_fixture - rng = pd.date_range('2000-01-01 09:00', freq='H', - periods=10, tz=tz) - expected = pd.date_range('2000-01-01 08:00', freq='H', - periods=10, tz=tz) + rng = pd.date_range("2000-01-01 09:00", freq="H", periods=10, tz=tz) + expected = pd.date_range("2000-01-01 08:00", freq="H", periods=10, tz=tz) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rng -= one tm.assert_index_equal(rng, expected) @@ -1941,16 +2100,15 @@ def test_dti_isub_int(self, tz_naive_fixture, one): # ------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('freq', ['H', 'D']) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("freq", ["H", "D"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_tick(self, int_holder, freq): # GH#19959 - dti = pd.date_range('2016-01-01', periods=2, freq=freq) + dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) with tm.assert_produces_warning(FutureWarning, 
check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] - for n in range(len(dti))]) + expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) result = dti + other tm.assert_index_equal(result, expected) @@ -1958,16 +2116,15 @@ def test_dti_add_intarray_tick(self, int_holder, freq): result = other + dti tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', ['W', 'M', 'MS', 'Q']) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("freq", ["W", "M", "MS", "Q"]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_non_tick(self, int_holder, freq): # GH#19959 - dti = pd.date_range('2016-01-01', periods=2, freq=freq) + dti = pd.date_range("2016-01-01", periods=2, freq=freq) other = int_holder([4, -1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = DatetimeIndex([dti[n] + other[n] - for n in range(len(dti))]) + expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))]) # tm.assert_produces_warning does not handle cases where we expect # two warnings, in this case PerformanceWarning and FutureWarning. @@ -1982,13 +2139,13 @@ def test_dti_add_intarray_non_tick(self, int_holder, freq): result = other + dti tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_dti_add_intarray_no_freq(self, int_holder): # GH#19959 - dti = pd.DatetimeIndex(['2016-01-01', 'NaT', '2017-04-05 06:07:08']) + dti = pd.DatetimeIndex(["2016-01-01", "NaT", "2017-04-05 06:07:08"]) other = int_holder([9, 4, -1]) - nfmsg = 'Cannot shift with no freq' - tmsg = 'cannot subtract DatetimeArray from' + nfmsg = "Cannot shift with no freq" + tmsg = "cannot subtract DatetimeArray from" with pytest.raises(NullFrequencyError, match=nfmsg): dti + other with pytest.raises(NullFrequencyError, match=nfmsg): @@ -2004,9 +2161,9 @@ def test_dti_add_intarray_no_freq(self, int_holder): def test_dti_add_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz) + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) # add with TimdeltaIndex result = dti + tdi @@ -2025,40 +2182,40 @@ def test_dti_add_tdi(self, tz_naive_fixture): def test_dti_iadd_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz) + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz) # iadd with TimdeltaIndex - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result += tdi tm.assert_index_equal(result, expected) - result = pd.timedelta_range('0 days', periods=10) + result = pd.timedelta_range("0 days", periods=10) result += dti tm.assert_index_equal(result, expected) # iadd with timedelta64 array - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result += tdi.values 
tm.assert_index_equal(result, expected) - result = pd.timedelta_range('0 days', periods=10) + result = pd.timedelta_range("0 days", periods=10) result += dti tm.assert_index_equal(result, expected) def test_dti_sub_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D') + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") # sub with TimedeltaIndex result = dti - tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract .*TimedeltaArray' + msg = "cannot subtract .*TimedeltaArray" with pytest.raises(TypeError, match=msg): tdi - dti @@ -2066,34 +2223,38 @@ def test_dti_sub_tdi(self, tz_naive_fixture): result = dti - tdi.values tm.assert_index_equal(result, expected) - msg = 'cannot subtract DatetimeArray from' + msg = "cannot subtract DatetimeArray from" with pytest.raises(TypeError, match=msg): tdi.values - dti def test_dti_isub_tdi(self, tz_naive_fixture): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) - tdi = pd.timedelta_range('0 days', periods=10) - expected = pd.date_range('2017-01-01', periods=10, tz=tz, freq='-1D') + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + tdi = pd.timedelta_range("0 days", periods=10) + expected = pd.date_range("2017-01-01", periods=10, tz=tz, freq="-1D") # isub with TimedeltaIndex - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result -= tdi tm.assert_index_equal(result, expected) - msg = 'cannot subtract .* from a TimedeltaArray' + msg = "cannot subtract .* from a TimedeltaArray" with pytest.raises(TypeError, match=msg): tdi -= dti # isub with timedelta64 array - result = DatetimeIndex([Timestamp('2017-01-01', tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) result -= tdi.values tm.assert_index_equal(result, expected) - msg = '|'.join(['cannot perform __neg__ with this index type:', - 'ufunc subtract cannot use operands with types', - 'cannot subtract DatetimeArray from']) + msg = "|".join( + [ + "cannot perform __neg__ with this index type:", + "ufunc subtract cannot use operands with types", + "cannot subtract DatetimeArray from", + ] + ) with pytest.raises(TypeError, match=msg): tdi.values -= dti @@ -2102,20 +2263,24 @@ def test_dti_isub_tdi(self, tz_naive_fixture): # TODO: A couple other tests belong in this section. Move them in # A PR where there isn't already a giant diff. 
- @pytest.mark.parametrize('addend', [ - datetime(2011, 1, 1), - DatetimeIndex(['2011-01-01', '2011-01-02']), - DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize('US/Eastern'), - np.datetime64('2011-01-01'), - Timestamp('2011-01-01') - ], ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize( + "addend", + [ + datetime(2011, 1, 1), + DatetimeIndex(["2011-01-01", "2011-01-02"]), + DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize("US/Eastern"), + np.datetime64("2011-01-01"), + Timestamp("2011-01-01"), + ], + ids=lambda x: type(x).__name__, + ) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_add_datetimelike_and_dti(self, addend, tz): # GH#9631 - dti = DatetimeIndex(['2011-01-01', '2011-01-02']).tz_localize(tz) - msg = ('cannot add DatetimeArray and {0}' - .format(type(addend).__name__)).replace('DatetimeIndex', - 'DatetimeArray') + dti = DatetimeIndex(["2011-01-01", "2011-01-02"]).tz_localize(tz) + msg = ( + "cannot add DatetimeArray and {0}".format(type(addend).__name__) + ).replace("DatetimeIndex", "DatetimeArray") with pytest.raises(TypeError, match=msg): dti + addend with pytest.raises(TypeError, match=msg): @@ -2127,9 +2292,9 @@ def test_sub_dti_dti(self): # previously performed setop (deprecated in 0.16.0), now changed to # return subtraction -> TimeDeltaIndex (GH ...) - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + dti = date_range("20130101", periods=3) + dti_tz = date_range("20130101", periods=3).tz_localize("US/Eastern") + dti_tz2 = date_range("20130101", periods=3).tz_localize("UTC") expected = TimedeltaIndex([0, 0, 0]) result = dti - dti @@ -2137,7 +2302,7 @@ def test_sub_dti_dti(self): result = dti_tz - dti_tz tm.assert_index_equal(result, expected) - msg = 'DatetimeArray subtraction must have the same timezones or' + msg = "DatetimeArray subtraction must have the same timezones or" with pytest.raises(TypeError, match=msg): dti_tz - dti @@ -2152,16 +2317,16 @@ def test_sub_dti_dti(self): tm.assert_index_equal(dti, expected) # different length raises ValueError - dti1 = date_range('20130101', periods=3) - dti2 = date_range('20130101', periods=4) - msg = 'cannot add indices of unequal length' + dti1 = date_range("20130101", periods=3) + dti2 = date_range("20130101", periods=4) + msg = "cannot add indices of unequal length" with pytest.raises(ValueError, match=msg): dti1 - dti2 # NaN propagation - dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) - dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) - expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]) + dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]) + expected = TimedeltaIndex(["1 days", np.nan, np.nan]) result = dti2 - dti1 tm.assert_index_equal(result, expected) @@ -2169,14 +2334,18 @@ def test_sub_dti_dti(self): # TODO: Most of this block is moved from series or frame tests, needs # cleanup, box-parametrization, and de-duplication - @pytest.mark.parametrize('op', [operator.add, operator.sub]) + @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_timedelta64_equal_timedelta_supported_ops(self, op): - ser = Series([Timestamp('20130301'), - Timestamp('20130228 23:00:00'), - Timestamp('20130228 22:00:00'), - Timestamp('20130228 21:00:00')]) + ser = Series( + [ + Timestamp("20130301"), + Timestamp("20130228 
23:00:00"), + Timestamp("20130228 22:00:00"), + Timestamp("20130228 21:00:00"), + ] + ) - intervals = ['D', 'h', 'm', 's', 'us'] + intervals = ["D", "h", "m", "s", "us"] # TODO: unused # npy16_mappings = {'D': 24 * 60 * 60 * 1000000, @@ -2191,8 +2360,7 @@ def timedelta64(*args): for d, h, m, s, us in product(*([range(2)] * 5)): nptd = timedelta64(d, h, m, s, us) - pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, - microseconds=us) + pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, microseconds=us) lhs = op(ser, nptd) rhs = op(ser, pytd) @@ -2200,106 +2368,119 @@ def timedelta64(*args): def test_ops_nat_mixed_datetime64_timedelta64(self): # GH#11349 - timedelta_series = Series([NaT, Timedelta('1s')]) - datetime_series = Series([NaT, Timestamp('19900315')]) - nat_series_dtype_timedelta = Series([NaT, NaT], - dtype='timedelta64[ns]') - nat_series_dtype_timestamp = Series([NaT, NaT], dtype='datetime64[ns]') - single_nat_dtype_datetime = Series([NaT], dtype='datetime64[ns]') - single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]') + timedelta_series = Series([NaT, Timedelta("1s")]) + datetime_series = Series([NaT, Timestamp("19900315")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") + single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") # subtraction - tm.assert_series_equal(datetime_series - single_nat_dtype_datetime, - nat_series_dtype_timedelta) + tm.assert_series_equal( + datetime_series - single_nat_dtype_datetime, nat_series_dtype_timedelta + ) - tm.assert_series_equal(datetime_series - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(-single_nat_dtype_timedelta + datetime_series, - nat_series_dtype_timestamp) + tm.assert_series_equal( + datetime_series - single_nat_dtype_timedelta, nat_series_dtype_timestamp + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + datetime_series, nat_series_dtype_timestamp + ) # without a Series wrapping the NaT, it is ambiguous # whether it is a datetime64 or timedelta64 # defaults to interpreting it as timedelta64 - tm.assert_series_equal(nat_series_dtype_timestamp - - single_nat_dtype_datetime, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timestamp - - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(-single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - msg = 'cannot subtract a datelike' + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_datetime, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp - single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + msg = "cannot subtract a datelike" with pytest.raises(TypeError, match=msg): timedelta_series - single_nat_dtype_datetime # addition - tm.assert_series_equal(nat_series_dtype_timestamp + - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timestamp + - single_nat_dtype_timedelta, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timestamp, - 
nat_series_dtype_timestamp) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_datetime, - nat_series_dtype_timestamp) - tm.assert_series_equal(single_nat_dtype_datetime + - nat_series_dtype_timedelta, - nat_series_dtype_timestamp) + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timestamp + single_nat_dtype_timedelta, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timestamp, + nat_series_dtype_timestamp, + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_datetime, + nat_series_dtype_timestamp, + ) + tm.assert_series_equal( + single_nat_dtype_datetime + nat_series_dtype_timedelta, + nat_series_dtype_timestamp, + ) def test_ufunc_coercions(self): - idx = date_range('2011-01-01', periods=3, freq='2D', name='x') + idx = date_range("2011-01-01", periods=3, freq="2D", name="x") - delta = np.timedelta64(1, 'D') + delta = np.timedelta64(1, "D") for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range('2011-01-02', periods=3, freq='2D', name='x') + exp = date_range("2011-01-02", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '2D' + assert result.freq == "2D" for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range('2010-12-31', periods=3, freq='2D', name='x') + exp = date_range("2010-12-31", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '2D' + assert result.freq == "2D" - delta = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D'), - np.timedelta64(3, 'D')]) + delta = np.array( + [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] + ) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-08'], - freq='3D', name='x') + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == '3D' + assert result.freq == "3D" for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex(['2010-12-31', '2011-01-01', '2011-01-02'], - freq='D', name='x') + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == 'D' + assert result.freq == "D" - @pytest.mark.parametrize('names', [('foo', None, None), - ('baz', 'bar', None), - ('bar', 'bar', 'bar')]) - @pytest.mark.parametrize('tz', [None, 'America/Chicago']) + @pytest.mark.parametrize( + "names", [("foo", None, None), ("baz", "bar", None), ("bar", "bar", "bar")] + ) + @pytest.mark.parametrize("tz", [None, "America/Chicago"]) def test_dti_add_series(self, tz, names): # GH#13905 - index = DatetimeIndex(['2016-06-28 05:30', '2016-06-28 05:31'], - tz=tz, name=names[0]) - ser = Series([Timedelta(seconds=5)] * 2, - index=index, name=names[1]) - expected = Series(index + Timedelta(seconds=5), - index=index, name=names[2]) + index = DatetimeIndex( + ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0] + ) + ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1]) 
+ expected = Series(index + Timedelta(seconds=5), index=index, name=names[2]) # passing name arg isn't enough when names[2] is None expected.name = names[2] @@ -2315,89 +2496,99 @@ def test_dti_add_series(self, tz, names): result4 = index + ser.values tm.assert_index_equal(result4, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_add_offset_index(self, tz_naive_fixture, names): # GH#18849, GH#19744 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti + other - expected = DatetimeIndex([dti[n] + other[n] for n in range(len(dti))], - name=names[2], freq='infer') + expected = DatetimeIndex( + [dti[n] + other[n] for n in range(len(dti))], name=names[2], freq="infer" + ) tm.assert_index_equal(res, expected) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res2 = other + dti tm.assert_index_equal(res2, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_sub_offset_index(self, tz_naive_fixture, names): # GH#18824, GH#19744 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = pd.Index([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti - other - expected = DatetimeIndex([dti[n] - other[n] for n in range(len(dti))], - name=names[2], freq='infer') + expected = DatetimeIndex( + [dti[n] - other[n] for n in range(len(dti))], name=names[2], freq="infer" + ) tm.assert_index_equal(res, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_dti_with_offset_series(self, tz_naive_fixture, names): # GH#18849 tz = tz_naive_fixture - dti = pd.date_range('2017-01-01', periods=2, tz=tz, name=names[0]) - other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], - name=names[1]) + dti = pd.date_range("2017-01-01", periods=2, tz=tz, name=names[0]) + other = Series([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1]) - expected_add = Series([dti[n] + other[n] for n in range(len(dti))], - name=names[2]) + expected_add = Series( + [dti[n] + other[n] for n in range(len(dti))], name=names[2] + ) - with tm.assert_produces_warning(PerformanceWarning, - 
clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res = dti + other tm.assert_series_equal(res, expected_add) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res2 = other + dti tm.assert_series_equal(res2, expected_add) - expected_sub = Series([dti[n] - other[n] for n in range(len(dti))], - name=names[2]) + expected_sub = Series( + [dti[n] - other[n] for n in range(len(dti))], name=names[2] + ) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.arrays.datetimelike]): + with tm.assert_produces_warning( + PerformanceWarning, clear=[pd.core.arrays.datetimelike] + ): res3 = dti - other tm.assert_series_equal(res3, expected_sub) -@pytest.mark.parametrize('years', [-1, 0, 1]) -@pytest.mark.parametrize('months', [-2, 0, 2]) +@pytest.mark.parametrize("years", [-1, 0, 1]) +@pytest.mark.parametrize("months", [-2, 0, 2]) def test_shift_months(years, months): - dti = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), - Timestamp('2000-01-31 00:23:00'), - Timestamp('2000-01-01'), - Timestamp('2000-02-29'), - Timestamp('2000-12-31')]) + dti = DatetimeIndex( + [ + Timestamp("2000-01-05 00:15:00"), + Timestamp("2000-01-31 00:23:00"), + Timestamp("2000-01-01"), + Timestamp("2000-02-29"), + Timestamp("2000-12-31"), + ] + ) actual = DatetimeIndex(shift_months(dti.asi8, years * 12 + months)) - raw = [x + pd.offsets.DateOffset(years=years, months=months) - for x in dti] + raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] expected = DatetimeIndex(raw) tm.assert_index_equal(actual, expected) @@ -2406,12 +2597,13 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("lh,rh", [ - (SubDatetime(2000, 1, 1), - Timedelta(hours=1)), - (Timedelta(hours=1), - SubDatetime(2000, 1, 1)) -]) +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) def test_dt_subclass_add_timedelta(lh, rh): # GH 25851 # ensure that subclassed datetime works for diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 31c7f47bcf5bd..7dcd0cc820061 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -34,7 +34,7 @@ def test_operator_series_comparison_zerorank(self): def test_df_numeric_cmp_dt64_raises(self): # GH#8932, GH#22163 ts = pd.Timestamp.now() - df = pd.DataFrame({'x': range(5)}) + df = pd.DataFrame({"x": range(5)}) with pytest.raises(TypeError): df > ts with pytest.raises(TypeError): @@ -52,30 +52,46 @@ def test_compare_invalid(self): # ops testing a = pd.Series(np.random.randn(5), name=0) b = pd.Series(np.random.randn(5)) - b.name = pd.Timestamp('2000-01-01') + b.name = pd.Timestamp("2000-01-01") tm.assert_series_equal(a / b, 1 / (b / a)) # ------------------------------------------------------------------ # Numeric dtypes Arithmetic with Timedelta Scalar + class TestNumericArraylikeArithmeticWithTimedeltaLike: # TODO: also check name retentention - @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) - @pytest.mark.parametrize('left', [ - pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) - for dtype in ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8', - 'f2', 'f4', 'f8'] - for cls in [pd.Series, pd.Index]], - ids=lambda x: type(x).__name__ + 
str(x.dtype)) + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", + [pd.RangeIndex(10, 40, 10)] + + [ + cls([10, 20, 30], dtype=dtype) + for dtype in [ + "i1", + "i2", + "i4", + "i8", + "u1", + "u2", + "u4", + "u8", + "f2", + "f4", + "f8", + ] + for cls in [pd.Series, pd.Index] + ], + ids=lambda x: type(x).__name__ + str(x.dtype), + ) def test_mul_td64arr(self, left, box_cls): # GH#22390 - right = np.array([1, 2, 3], dtype='m8[s]') + right = np.array([1, 2, 3], dtype="m8[s]") right = box_cls(right) - expected = pd.TimedeltaIndex(['10s', '40s', '90s']) + expected = pd.TimedeltaIndex(["10s", "40s", "90s"]) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) @@ -86,20 +102,35 @@ def test_mul_td64arr(self, left, box_cls): tm.assert_equal(result, expected) # TODO: also check name retentention - @pytest.mark.parametrize('box_cls', [np.array, pd.Index, pd.Series]) - @pytest.mark.parametrize('left', [ - pd.RangeIndex(10, 40, 10)] + [cls([10, 20, 30], dtype=dtype) - for dtype in ['i1', 'i2', 'i4', 'i8', - 'u1', 'u2', 'u4', 'u8', - 'f2', 'f4', 'f8'] - for cls in [pd.Series, pd.Index]], - ids=lambda x: type(x).__name__ + str(x.dtype)) + @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) + @pytest.mark.parametrize( + "left", + [pd.RangeIndex(10, 40, 10)] + + [ + cls([10, 20, 30], dtype=dtype) + for dtype in [ + "i1", + "i2", + "i4", + "i8", + "u1", + "u2", + "u4", + "u8", + "f2", + "f4", + "f8", + ] + for cls in [pd.Series, pd.Index] + ], + ids=lambda x: type(x).__name__ + str(x.dtype), + ) def test_div_td64arr(self, left, box_cls): # GH#22390 - right = np.array([10, 40, 90], dtype='m8[s]') + right = np.array([10, 40, 90], dtype="m8[s]") right = box_cls(right) - expected = pd.TimedeltaIndex(['1s', '2s', '3s']) + expected = pd.TimedeltaIndex(["1s", "2s", "3s"]) if isinstance(left, pd.Series) or box_cls is pd.Series: expected = pd.Series(expected) @@ -118,24 +149,28 @@ def test_div_td64arr(self, left, box_cls): # TODO: de-duplicate with test_numeric_arr_mul_tdscalar def test_ops_series(self): # regression test for G#H8813 - td = Timedelta('1 day') + td = Timedelta("1 day") other = pd.Series([1, 2]) - expected = pd.Series(pd.to_timedelta(['1 day', '2 days'])) + expected = pd.Series(pd.to_timedelta(["1 day", "2 days"])) tm.assert_series_equal(expected, td * other) tm.assert_series_equal(expected, other * td) # TODO: also test non-nanosecond timedelta64 and Tick objects; # see test_numeric_arr_rdiv_tdscalar for note on these failing - @pytest.mark.parametrize('scalar_td', [ - Timedelta(days=1), - Timedelta(days=1).to_timedelta64(), - Timedelta(days=1).to_pytimedelta()], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "scalar_td", + [ + Timedelta(days=1), + Timedelta(days=1).to_timedelta64(), + Timedelta(days=1).to_pytimedelta(), + ], + ids=lambda x: type(x).__name__, + ) def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): # GH#19333 index = numeric_idx - expected = pd.timedelta_range('0 days', '4 days') + expected = pd.timedelta_range("0 days", "4 days") index = tm.box_expected(index, box) expected = tm.box_expected(expected, box) @@ -149,7 +184,7 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): index = numeric_idx[1:3] - expected = TimedeltaIndex(['3 Days', '36 Hours']) + expected = TimedeltaIndex(["3 Days", "36 Hours"]) index = tm.box_expected(index, box) expected = 
tm.box_expected(expected, box) @@ -160,15 +195,19 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): with pytest.raises(TypeError): index / three_days - @pytest.mark.parametrize('other', [ - pd.Timedelta(hours=31), - pd.Timedelta(hours=31).to_pytimedelta(), - pd.Timedelta(hours=31).to_timedelta64(), - pd.Timedelta(hours=31).to_timedelta64().astype('m8[h]'), - np.timedelta64('NaT'), - np.timedelta64('NaT', 'D'), - pd.offsets.Minute(3), - pd.offsets.Second(0)]) + @pytest.mark.parametrize( + "other", + [ + pd.Timedelta(hours=31), + pd.Timedelta(hours=31).to_pytimedelta(), + pd.Timedelta(hours=31).to_timedelta64(), + pd.Timedelta(hours=31).to_timedelta64().astype("m8[h]"), + np.timedelta64("NaT"), + np.timedelta64("NaT", "D"), + pd.offsets.Minute(3), + pd.offsets.Second(0), + ], + ) def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): left = tm.box_expected(numeric_idx, box) with pytest.raises(TypeError): @@ -184,46 +223,41 @@ def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): # ------------------------------------------------------------------ # Arithmetic -class TestDivisionByZero: +class TestDivisionByZero: def test_div_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) result = idx / zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') / np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_floordiv_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) + expected = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) result = idx // zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') // np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_mod_zero(self, zero, numeric_idx): idx = numeric_idx - expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], - dtype=np.float64) + expected = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) result = idx % zero tm.assert_index_equal(result, expected) - ser_compat = Series(idx).astype('i8') % np.array(zero).astype('i8') + ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8") tm.assert_series_equal(ser_compat, Series(result)) def test_divmod_zero(self, zero, numeric_idx): idx = numeric_idx - exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], - dtype=np.float64) - exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], - dtype=np.float64) + exleft = pd.Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64) + exright = pd.Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64) result = divmod(idx, zero) tm.assert_index_equal(result[0], exleft) @@ -231,19 +265,34 @@ def test_divmod_zero(self, zero, numeric_idx): # ------------------------------------------------------------------ - @pytest.mark.parametrize('dtype2', [ - np.int64, np.int32, np.int16, np.int8, - np.float64, np.float32, np.float16, - np.uint64, np.uint32, np.uint16, np.uint8]) - @pytest.mark.parametrize('dtype1', [np.int64, np.float64, np.uint64]) + @pytest.mark.parametrize( + "dtype2", + [ + np.int64, + np.int32, + np.int16, + np.int8, 
+ np.float64, + np.float32, + np.float16, + np.uint64, + np.uint32, + np.uint16, + np.uint8, + ], + ) + @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) def test_ser_div_ser(self, dtype1, dtype2): # no longer do integer div for any ops, but deal with the 0's - first = Series([3, 4, 5, 8], name='first').astype(dtype1) - second = Series([0, 0, 0, 3], name='second').astype(dtype2) - - with np.errstate(all='ignore'): - expected = Series(first.values.astype(np.float64) / second.values, - dtype='float64', name=None) + first = Series([3, 4, 5, 8], name="first").astype(dtype1) + second = Series([0, 0, 0, 3], name="second").astype(dtype2) + + with np.errstate(all="ignore"): + expected = Series( + first.values.astype(np.float64) / second.values, + dtype="float64", + name=None, + ) expected.iloc[0:3] = np.inf result = first / second @@ -254,7 +303,7 @@ def test_rdiv_zero_compat(self): # GH#8674 zero_array = np.array([0] * 5) data = np.random.randn(5) - expected = Series([0.] * 5) + expected = Series([0.0] * 5) result = zero_array / Series(data) tm.assert_series_equal(result, expected) @@ -267,55 +316,54 @@ def test_rdiv_zero_compat(self): def test_div_zero_inf_signs(self): # GH#9144, inf signing - ser = Series([-1, 0, 1], name='first') - expected = Series([-np.inf, np.nan, np.inf], name='first') + ser = Series([-1, 0, 1], name="first") + expected = Series([-np.inf, np.nan, np.inf], name="first") result = ser / 0 tm.assert_series_equal(result, expected) def test_rdiv_zero(self): # GH#9144 - ser = Series([-1, 0, 1], name='first') - expected = Series([0.0, np.nan, 0.0], name='first') + ser = Series([-1, 0, 1], name="first") + expected = Series([0.0, np.nan, 0.0], name="first") result = 0 / ser tm.assert_series_equal(result, expected) def test_floordiv_div(self): # GH#9144 - ser = Series([-1, 0, 1], name='first') + ser = Series([-1, 0, 1], name="first") result = ser // 0 - expected = Series([-np.inf, np.nan, np.inf], name='first') + expected = Series([-np.inf, np.nan, np.inf], name="first") tm.assert_series_equal(result, expected) def test_df_div_zero_df(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df / df first = pd.Series([1.0, 1.0, 1.0, 1.0]) second = pd.Series([np.nan, np.nan, np.nan, 1]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) tm.assert_frame_equal(result, expected) def test_df_div_zero_array(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) first = pd.Series([1.0, 1.0, 1.0, 1.0]) second = pd.Series([np.nan, np.nan, np.nan, 1]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) - with np.errstate(all='ignore'): - arr = df.values.astype('float') / df.values - result = pd.DataFrame(arr, index=df.index, - columns=df.columns) + with np.errstate(all="ignore"): + arr = df.values.astype("float") / df.values + result = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) def test_df_div_zero_int(self): # integer div, but deal with the 0's (GH#9144) - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df / 0 
expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns) @@ -323,10 +371,9 @@ def test_df_div_zero_int(self): tm.assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): - arr = df.values.astype('float64') / 0 - result2 = pd.DataFrame(arr, index=df.index, - columns=df.columns) + with np.errstate(all="ignore"): + arr = df.values.astype("float64") / 0 + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result2, expected) def test_df_div_zero_series_does_not_commute(self): @@ -342,45 +389,44 @@ def test_df_div_zero_series_does_not_commute(self): def test_df_mod_zero_df(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) # this is technically wrong, as the integer portion is coerced to float # ### - first = pd.Series([0, 0, 0, 0], dtype='float64') + first = pd.Series([0, 0, 0, 0], dtype="float64") second = pd.Series([np.nan, np.nan, np.nan, 0]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) result = df % df tm.assert_frame_equal(result, expected) def test_df_mod_zero_array(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) # this is technically wrong, as the integer portion is coerced to float # ### - first = pd.Series([0, 0, 0, 0], dtype='float64') + first = pd.Series([0, 0, 0, 0], dtype="float64") second = pd.Series([np.nan, np.nan, np.nan, 0]) - expected = pd.DataFrame({'first': first, 'second': second}) + expected = pd.DataFrame({"first": first, "second": second}) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): arr = df.values % df.values - result2 = pd.DataFrame(arr, index=df.index, - columns=df.columns, dtype='float64') + result2 = pd.DataFrame(arr, index=df.index, columns=df.columns, dtype="float64") result2.iloc[0:3, 1] = np.nan tm.assert_frame_equal(result2, expected) def test_df_mod_zero_int(self): # GH#3590, modulo as ints - df = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) result = df % 0 expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatment - with np.errstate(all='ignore'): - arr = df.values.astype('float64') % 0 + with np.errstate(all="ignore"): + arr = df.values.astype("float64") % 0 result2 = pd.DataFrame(arr, index=df.index, columns=df.columns) tm.assert_frame_equal(result2, expected) @@ -398,14 +444,20 @@ class TestMultiplicationDivision: # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ # for non-timestamp/timedelta/period dtypes - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Index.__div__ always " - "raises", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail( + reason="Index.__div__ always " "raises", raises=TypeError + ), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_divide_decimal(self, box): # resolves issue GH#9787 ser = Series([Decimal(10)]) @@ -425,8 +477,8 @@ def 
test_div_equiv_binop(self): # Test Series.div as well as Series.__div__ # float/integer issue # GH#7785 - first = Series([1, 0], name='first') - second = Series([-0.01, -0.02], name='second') + first = Series([1, 0], name="first") + second = Series([-0.01, -0.02], name="second") expected = Series([-0.01, -np.inf]) result = second.div(first) @@ -438,14 +490,14 @@ def test_div_equiv_binop(self): def test_div_int(self, numeric_idx): idx = numeric_idx result = idx / 1 - expected = idx.astype('float64') + expected = idx.astype("float64") tm.assert_index_equal(result, expected) result = idx / 2 expected = Index(idx.values / 2) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul, operator.floordiv]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) def test_mul_int_identity(self, op, numeric_idx, box): idx = numeric_idx idx = tm.box_expected(idx, box) @@ -457,10 +509,10 @@ def test_mul_int_array(self, numeric_idx): idx = numeric_idx didx = idx * idx - result = idx * np.array(5, dtype='int64') + result = idx * np.array(5, dtype="int64") tm.assert_index_equal(result, idx * 5) - arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64' + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" result = idx * np.arange(5, dtype=arr_dtype) tm.assert_index_equal(result, didx) @@ -468,13 +520,13 @@ def test_mul_int_series(self, numeric_idx): idx = numeric_idx didx = idx * idx - arr_dtype = 'uint64' if isinstance(idx, pd.UInt64Index) else 'int64' + arr_dtype = "uint64" if isinstance(idx, pd.UInt64Index) else "int64" result = idx * Series(np.arange(5, dtype=arr_dtype)) tm.assert_series_equal(result, Series(didx)) def test_mul_float_series(self, numeric_idx): idx = numeric_idx - rng5 = np.arange(5, dtype='float64') + rng5 = np.arange(5, dtype="float64") result = idx * Series(rng5 + 0.1) expected = Series(rng5 * (rng5 + 0.1)) @@ -490,7 +542,7 @@ def test_mul_index(self, numeric_idx): def test_mul_datelike_raises(self, numeric_idx): idx = numeric_idx with pytest.raises(TypeError): - idx * pd.date_range('20130101', periods=5) + idx * pd.date_range("20130101", periods=5) def test_mul_size_mismatch_raises(self, numeric_idx): idx = numeric_idx @@ -499,7 +551,7 @@ def test_mul_size_mismatch_raises(self, numeric_idx): with pytest.raises(ValueError): idx * np.array([1, 2]) - @pytest.mark.parametrize('op', [operator.pow, ops.rpow]) + @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) def test_pow_float(self, op, numeric_idx, box): # test power calculations both ways, GH#14973 idx = numeric_idx @@ -526,7 +578,7 @@ def test_divmod_scalar(self, numeric_idx): idx = numeric_idx result = divmod(idx, 2) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, 2) expected = Index(div), Index(mod) @@ -538,7 +590,7 @@ def test_divmod_ndarray(self, numeric_idx): other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 result = divmod(idx, other) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, other) expected = Index(div), Index(mod) @@ -550,18 +602,19 @@ def test_divmod_series(self, numeric_idx): other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2 result = divmod(idx, Series(other)) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): div, mod = divmod(idx.values, other) expected = Series(div), Series(mod) for r, e in zip(result, expected): tm.assert_series_equal(r, e) - @pytest.mark.parametrize('other', 
[np.nan, 7, -23, 2.718, -3.14, np.inf]) + @pytest.mark.parametrize("other", [np.nan, 7, -23, 2.718, -3.14, np.inf]) def test_ops_np_scalar(self, other): vals = np.random.randn(5, 3) - f = lambda x: pd.DataFrame(x, index=list('ABCDE'), - columns=['jim', 'joe', 'jolie']) + f = lambda x: pd.DataFrame( + x, index=list("ABCDE"), columns=["jim", "joe", "jolie"] + ) df = f(vals) @@ -574,44 +627,39 @@ def test_ops_np_scalar(self, other): def test_operators_frame(self): # rpow does not work with DataFrame ts = tm.makeTimeSeries() - ts.name = 'ts' + ts.name = "ts" - df = pd.DataFrame({'A': ts}) + df = pd.DataFrame({"A": ts}) - tm.assert_series_equal(ts + ts, ts + df['A'], - check_names=False) - tm.assert_series_equal(ts ** ts, ts ** df['A'], - check_names=False) - tm.assert_series_equal(ts < ts, ts < df['A'], - check_names=False) - tm.assert_series_equal(ts / ts, ts / df['A'], - check_names=False) + tm.assert_series_equal(ts + ts, ts + df["A"], check_names=False) + tm.assert_series_equal(ts ** ts, ts ** df["A"], check_names=False) + tm.assert_series_equal(ts < ts, ts < df["A"], check_names=False) + tm.assert_series_equal(ts / ts, ts / df["A"], check_names=False) # TODO: this came from tests.series.test_analytics, needs cleanup and # de-duplication with test_modulo above def test_modulo2(self): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # GH#3590, modulo as ints - p = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values, - dtype='float64') + p = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values, dtype="float64") expected.iloc[0:3] = np.nan tm.assert_series_equal(result, expected) - result = p['first'] % 0 - expected = Series(np.nan, index=p.index, name='first') + result = p["first"] % 0 + expected = Series(np.nan, index=p.index, name="first") tm.assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values) + p = p.astype("float64") + result = p["first"] % p["second"] + expected = Series(p["first"].values % p["second"].values) tm.assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - result2 = p['second'] % p['first'] + p = p.astype("float64") + result = p["first"] % p["second"] + result2 = p["second"] % p["first"] assert not result.equals(result2) # GH#9144 @@ -633,30 +681,26 @@ class TestAdditionSubtraction: # TODO: This came from series.test.test_operators, needs cleanup def test_arith_ops_df_compat(self): # GH#1134 - s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - exp = pd.Series([3.0, 4.0, np.nan, np.nan], - index=list('ABCD'), name='x') + exp = pd.Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x") tm.assert_series_equal(s1 + s2, exp) tm.assert_series_equal(s2 + s1, exp) - exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")) tm.assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) tm.assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) # different length - s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s4 = pd.Series([2, 
2, 2, 2], index=list('ABCD'), name='x') + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - exp = pd.Series([3, 4, 5, np.nan], - index=list('ABCD'), name='x') + exp = pd.Series([3, 4, 5, np.nan], index=list("ABCD"), name="x") tm.assert_series_equal(s3 + s4, exp) tm.assert_series_equal(s4 + s3, exp) - exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")) tm.assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) tm.assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) @@ -664,17 +708,17 @@ def test_arith_ops_df_compat(self): def test_series_frame_radd_bug(self): # GH#353 vals = pd.Series(tm.rands_array(5, 10)) - result = 'foo_' + vals - expected = vals.map(lambda x: 'foo_' + x) + result = "foo_" + vals + expected = vals.map(lambda x: "foo_" + x) tm.assert_series_equal(result, expected) - frame = pd.DataFrame({'vals': vals}) - result = 'foo_' + frame - expected = pd.DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) + frame = pd.DataFrame({"vals": vals}) + result = "foo_" + frame + expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)}) tm.assert_frame_equal(result, expected) ts = tm.makeTimeSeries() - ts.name = 'ts' + ts.name = "ts" # really raise this time now = pd.Timestamp.now().to_pydatetime() @@ -694,8 +738,10 @@ def test_datetime64_with_index(self): # GH#4629 # arithmetic datetime64 ops with an index - ser = pd.Series(pd.date_range('20130101', periods=5), - index=pd.date_range('20130101', periods=5)) + ser = pd.Series( + pd.date_range("20130101", periods=5), + index=pd.date_range("20130101", periods=5), + ) expected = ser - ser.index.to_series() result = ser - ser.index tm.assert_series_equal(result, expected) @@ -704,17 +750,18 @@ def test_datetime64_with_index(self): # GH#18850 result = ser - ser.index.to_period() - df = pd.DataFrame(np.random.randn(5, 2), - index=pd.date_range('20130101', periods=5)) - df['date'] = pd.Timestamp('20130102') - df['expected'] = df['date'] - df.index.to_series() - df['result'] = df['date'] - df.index - tm.assert_series_equal(df['result'], df['expected'], check_names=False) + df = pd.DataFrame( + np.random.randn(5, 2), index=pd.date_range("20130101", periods=5) + ) + df["date"] = pd.Timestamp("20130102") + df["expected"] = df["date"] - df.index.to_series() + df["result"] = df["date"] - df.index + tm.assert_series_equal(df["result"], df["expected"], check_names=False) # TODO: taken from tests.frame.test_operators, needs cleanup def test_frame_operators(self, float_frame): frame = float_frame - frame2 = pd.DataFrame(float_frame, columns=['D', 'C', 'B', 'A']) + frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"]) garbage = np.random.random(4) colSeries = pd.Series(garbage, index=np.array(frame.columns)) @@ -742,15 +789,14 @@ def test_frame_operators(self, float_frame): expected = frame2 * 2 tm.assert_frame_equal(added, expected) - df = pd.DataFrame({'a': ['a', None, 'b']}) - tm.assert_frame_equal(df + df, - pd.DataFrame({'a': ['aa', np.nan, 'bb']})) + df = pd.DataFrame({"a": ["a", None, "b"]}) + tm.assert_frame_equal(df + df, pd.DataFrame({"a": ["aa", np.nan, "bb"]})) # Test for issue #10181 - for dtype in ('float', 'int64'): + for dtype in ("float", "int64"): frames = [ pd.DataFrame(dtype=dtype), - pd.DataFrame(columns=['A'], dtype=dtype), + pd.DataFrame(columns=["A"], dtype=dtype), pd.DataFrame(index=[0], dtype=dtype), ] for df in frames: @@ -765,11 +811,10 @@ def _check_op(series, 
other, op, pos_only=False, check_dtype=True): cython_or_numpy = op(left, right) python = left.combine(right, op) - tm.assert_series_equal(cython_or_numpy, python, - check_dtype=check_dtype) + tm.assert_series_equal(cython_or_numpy, python, check_dtype=check_dtype) def check(series, other): - simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod'] + simple_ops = ["add", "sub", "mul", "truediv", "floordiv", "mod"] for opname in simple_ops: _check_op(series, other, getattr(operator, opname)) @@ -781,11 +826,10 @@ def check(series, other): _check_op(series, other, lambda x, y: operator.truediv(y, x)) _check_op(series, other, lambda x, y: operator.floordiv(y, x)) _check_op(series, other, lambda x, y: operator.mul(y, x)) - _check_op(series, other, lambda x, y: operator.pow(y, x), - pos_only=True) + _check_op(series, other, lambda x, y: operator.pow(y, x), pos_only=True) _check_op(series, other, lambda x, y: operator.mod(y, x)) - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) check(tser, tser * 0) check(tser, tser[::2]) @@ -815,7 +859,7 @@ def check(series, other): else: other_np = other other_np = np.asarray(other_np) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expecteds = divmod(series.values, np.asarray(other_np)) for result, expected in zip(results, expecteds): @@ -825,7 +869,7 @@ def check(series, other): assert result.name == series.name tm.assert_index_equal(result.index, series.index) - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") check(tser, tser * 2) check(tser, tser * 0) check(tser, tser[::2]) @@ -833,88 +877,90 @@ def check(series, other): class TestUFuncCompat: - - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.RangeIndex, - pd.Series]) + @pytest.mark.parametrize( + "holder", + [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.RangeIndex, pd.Series], + ) def test_ufunc_compat(self, holder): box = pd.Series if holder is pd.Series else pd.Index if holder is pd.RangeIndex: idx = pd.RangeIndex(0, 5) else: - idx = holder(np.arange(5, dtype='int64')) + idx = holder(np.arange(5, dtype="int64")) result = np.sin(idx) - expected = box(np.sin(np.arange(5, dtype='int64'))) + expected = box(np.sin(np.arange(5, dtype="int64"))) tm.assert_equal(result, expected) - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.Series]) + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) def test_ufunc_coercions(self, holder): - idx = holder([1, 2, 3, 4, 5], name='x') + idx = holder([1, 2, 3, 4, 5], name="x") box = pd.Series if holder is pd.Series else pd.Index result = np.sqrt(idx) - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name='x') + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index(np.sqrt(np.array([1, 2, 3, 4, 5])), name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = np.divide(idx, 2.) - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + result = np.divide(idx, 2.0) + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) # _evaluate_numeric_binop - result = idx + 2. 
- assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([3., 4., 5., 6., 7.], name='x') + result = idx + 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([3.0, 4.0, 5.0, 6.0, 7.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx - 2. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([-1., 0., 1., 2., 3.], name='x') + result = idx - 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([-1.0, 0.0, 1.0, 2.0, 3.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx * 1. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([1., 2., 3., 4., 5.], name='x') + result = idx * 1.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([1.0, 2.0, 3.0, 4.0, 5.0], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - result = idx / 2. - assert result.dtype == 'f8' and isinstance(result, box) - exp = pd.Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') + result = idx / 2.0 + assert result.dtype == "f8" and isinstance(result, box) + exp = pd.Float64Index([0.5, 1.0, 1.5, 2.0, 2.5], name="x") exp = tm.box_expected(exp, box) tm.assert_equal(result, exp) - @pytest.mark.parametrize('holder', [pd.Int64Index, pd.UInt64Index, - pd.Float64Index, pd.Series]) + @pytest.mark.parametrize( + "holder", [pd.Int64Index, pd.UInt64Index, pd.Float64Index, pd.Series] + ) def test_ufunc_multiple_return_values(self, holder): - obj = holder([1, 2, 3], name='x') + obj = holder([1, 2, 3], name="x") box = pd.Series if holder is pd.Series else pd.Index result = np.modf(obj) assert isinstance(result, tuple) - exp1 = pd.Float64Index([0., 0., 0.], name='x') - exp2 = pd.Float64Index([1., 2., 3.], name='x') + exp1 = pd.Float64Index([0.0, 0.0, 0.0], name="x") + exp2 = pd.Float64Index([1.0, 2.0, 3.0], name="x") tm.assert_equal(result[0], tm.box_expected(exp1, box)) tm.assert_equal(result[1], tm.box_expected(exp2, box)) def test_ufunc_at(self): - s = pd.Series([0, 1, 2], index=[1, 2, 3], name='x') + s = pd.Series([0, 1, 2], index=[1, 2, 3], name="x") np.add.at(s, [0, 2], 10) - expected = pd.Series([10, 1, 12], index=[1, 2, 3], name='x') + expected = pd.Series([10, 1, 12], index=[1, 2, 3], name="x") tm.assert_series_equal(s, expected) class TestObjectDtypeEquivalence: # Tests that arithmetic operations match operations executed elementwise - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_nan(self, dtype, box): ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) @@ -928,7 +974,7 @@ def test_numarr_with_dtype_add_nan(self, dtype, box): result = ser + np.nan tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_numarr_with_dtype_add_int(self, dtype, box): ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([2, 3, 4], dtype=dtype) @@ -943,14 +989,16 @@ def test_numarr_with_dtype_add_int(self, dtype, box): tm.assert_equal(result, expected) # TODO: moved from tests.series.test_operators; needs cleanup - @pytest.mark.parametrize('op', [operator.add, operator.sub, operator.mul, - operator.truediv, operator.floordiv]) + @pytest.mark.parametrize( + "op", + [operator.add, operator.sub, operator.mul, operator.truediv, operator.floordiv], + ) def 
test_operators_reverse_object(self, op): # GH#56 arr = pd.Series(np.random.randn(10), index=np.arange(10), dtype=object) - result = op(1., arr) - expected = op(1., arr.astype(float)) + result = op(1.0, arr) + expected = op(1.0, arr.astype(float)) tm.assert_series_equal(result.astype(float), expected) @@ -971,11 +1019,20 @@ def check_binop(self, ops, scalars, idxs): tm.assert_index_equal(result, expected) def test_binops(self): - ops = [operator.add, operator.sub, operator.mul, operator.floordiv, - operator.truediv] + ops = [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + ] scalars = [-1, 1, 2] - idxs = [pd.RangeIndex(0, 10, 1), pd.RangeIndex(0, 20, 2), - pd.RangeIndex(-10, 10, 2), pd.RangeIndex(5, -5, -1)] + idxs = [ + pd.RangeIndex(0, 10, 1), + pd.RangeIndex(0, 20, 2), + pd.RangeIndex(-10, 10, 2), + pd.RangeIndex(5, -5, -1), + ] self.check_binop(ops, scalars, idxs) def test_binops_pow(self): @@ -987,9 +1044,17 @@ def test_binops_pow(self): self.check_binop(ops, scalars, idxs) # TODO: mod, divmod? - @pytest.mark.parametrize('op', [operator.add, operator.sub, - operator.mul, operator.floordiv, - operator.truediv, operator.pow]) + @pytest.mark.parametrize( + "op", + [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + operator.pow, + ], + ) def test_arithmetic_with_frame_or_series(self, op): # check that we return NotImplemented when operating with Series # or DataFrame @@ -1024,7 +1089,7 @@ def test_numeric_compat2(self): tm.assert_index_equal(result, expected, exact=True) result = idx / 2 - expected = pd.RangeIndex(0, 5, 1).astype('float64') + expected = pd.RangeIndex(0, 5, 1).astype("float64") tm.assert_index_equal(result, expected, exact=True) result = idx / 4 @@ -1050,19 +1115,22 @@ def test_numeric_compat2(self): cases_exact = [ (pd.RangeIndex(0, 1000, 2), 2, pd.RangeIndex(0, 500, 1)), (pd.RangeIndex(-99, -201, -3), -3, pd.RangeIndex(33, 67, 1)), - (pd.RangeIndex(0, 1000, 1), 2, - pd.RangeIndex(0, 1000, 1)._int64index // 2), - (pd.RangeIndex(0, 100, 1), 2.0, - pd.RangeIndex(0, 100, 1)._int64index // 2.0), + (pd.RangeIndex(0, 1000, 1), 2, pd.RangeIndex(0, 1000, 1)._int64index // 2), + ( + pd.RangeIndex(0, 100, 1), + 2.0, + pd.RangeIndex(0, 100, 1)._int64index // 2.0, + ), (pd.RangeIndex(0), 50, pd.RangeIndex(0)), (pd.RangeIndex(2, 4, 2), 3, pd.RangeIndex(0, 1, 1)), (pd.RangeIndex(-5, -10, -6), 4, pd.RangeIndex(-2, -1, 1)), - (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0))] + (pd.RangeIndex(-100, -200, 3), 2, pd.RangeIndex(0)), + ] for idx, div, expected in cases_exact: tm.assert_index_equal(idx // div, expected, exact=True) - @pytest.mark.parametrize('dtype', [np.int64, np.float64]) - @pytest.mark.parametrize('delta', [1, 0, -1]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("delta", [1, 0, -1]) def test_addsub_arithmetic(self, dtype, delta): # GH#8142 delta = dtype(delta) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index dd931939ddf51..f7f6ba8b114e7 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -17,12 +17,11 @@ class TestObjectComparisons: - def test_comparison_object_numeric_nas(self): ser = Series(np.random.randn(10), dtype=object) shifted = ser.shift(2) - ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + ops = ["lt", "le", "gt", "ge", "eq", "ne"] for op in ops: func = getattr(operator, op) @@ -31,24 +30,24 @@ def test_comparison_object_numeric_nas(self): 
tm.assert_series_equal(result, expected) def test_object_comparisons(self): - ser = Series(['a', 'b', np.nan, 'c', 'a']) + ser = Series(["a", "b", np.nan, "c", "a"]) - result = ser == 'a' + result = ser == "a" expected = Series([True, False, False, False, True]) tm.assert_series_equal(result, expected) - result = ser < 'a' + result = ser < "a" expected = Series([False, False, False, False, False]) tm.assert_series_equal(result, expected) - result = ser != 'a' - expected = -(ser == 'a') + result = ser != "a" + expected = -(ser == "a") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_more_na_comparisons(self, dtype): - left = Series(['a', np.nan, 'c'], dtype=dtype) - right = Series(['a', np.nan, 'd'], dtype=dtype) + left = Series(["a", np.nan, "c"], dtype=dtype) + right = Series(["a", np.nan, "d"], dtype=dtype) result = left == right expected = Series([True, False, False]) @@ -70,6 +69,7 @@ def test_more_na_comparisons(self, dtype): # ------------------------------------------------------------------ # Arithmetic + class TestArithmetic: # TODO: parametrize @@ -94,7 +94,7 @@ def test_add_extension_scalar(self, other, box, op): # Check that scalars satisfying is_extension_array_dtype(obj) # do not incorrectly try to dispatch to an ExtensionArray operation - arr = pd.Series(['a', 'b', 'c']) + arr = pd.Series(["a", "b", "c"]) expected = pd.Series([op(x, other) for x in arr]) arr = tm.box_expected(arr, box) @@ -103,60 +103,73 @@ def test_add_extension_scalar(self, other, box, op): result = op(arr, other) tm.assert_equal(result, expected) - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_objarr_add_str(self, box): - ser = pd.Series(['x', np.nan, 'x']) - expected = pd.Series(['xa', np.nan, 'xa']) + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["xa", np.nan, "xa"]) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - result = ser + 'a' + result = ser + "a" tm.assert_equal(result, expected) - @pytest.mark.parametrize('box', [ - pytest.param(pd.Index, - marks=pytest.mark.xfail(reason="Does not mask nulls", - raises=TypeError)), - pd.Series, - pd.DataFrame - ], ids=lambda x: x.__name__) + @pytest.mark.parametrize( + "box", + [ + pytest.param( + pd.Index, + marks=pytest.mark.xfail(reason="Does not mask nulls", raises=TypeError), + ), + pd.Series, + pd.DataFrame, + ], + ids=lambda x: x.__name__, + ) def test_objarr_radd_str(self, box): - ser = pd.Series(['x', np.nan, 'x']) - expected = pd.Series(['ax', np.nan, 'ax']) + ser = pd.Series(["x", np.nan, "x"]) + expected = pd.Series(["ax", np.nan, "ax"]) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) - result = 'a' + ser + result = "a" + ser tm.assert_equal(result, expected) - @pytest.mark.parametrize('data', [ - [1, 2, 3], - [1.1, 2.2, 3.3], - [Timestamp('2011-01-01'), Timestamp('2011-01-02'), pd.NaT], - ['x', 'y', 1]]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + [1.1, 2.2, 3.3], + [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT], + ["x", "y", 1], + ], + 
) + @pytest.mark.parametrize("dtype", [None, object]) def test_objarr_radd_str_invalid(self, dtype, data, box): ser = Series(data, dtype=dtype) ser = tm.box_expected(ser, box) with pytest.raises(TypeError): - 'foo_' + ser + "foo_" + ser - @pytest.mark.parametrize('op', [operator.add, ops.radd, - operator.sub, ops.rsub]) + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) def test_objarr_add_invalid(self, op, box): # invalid ops obj_ser = tm.makeObjectSeries() - obj_ser.name = 'objects' + obj_ser.name = "objects" obj_ser = tm.box_expected(obj_ser, box) with pytest.raises(Exception): @@ -166,70 +179,86 @@ def test_objarr_add_invalid(self, op, box): # TODO: Moved from tests.series.test_operators; needs cleanup def test_operators_na_handling(self): - ser = Series(['foo', 'bar', 'baz', np.nan]) - result = 'prefix_' + ser - expected = pd.Series(['prefix_foo', 'prefix_bar', - 'prefix_baz', np.nan]) + ser = Series(["foo", "bar", "baz", np.nan]) + result = "prefix_" + ser + expected = pd.Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan]) tm.assert_series_equal(result, expected) - result = ser + '_suffix' - expected = pd.Series(['foo_suffix', 'bar_suffix', - 'baz_suffix', np.nan]) + result = ser + "_suffix" + expected = pd.Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan]) tm.assert_series_equal(result, expected) # TODO: parametrize over box - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_series_with_dtype_radd_timedelta(self, dtype): # note this test is _not_ aimed at timedelta64-dtyped Series - ser = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days')], dtype=dtype) - expected = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), - pd.Timedelta('6 days')]) - - result = pd.Timedelta('3 days') + ser + ser = pd.Series( + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + dtype=dtype, + ) + expected = pd.Series( + [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")] + ) + + result = pd.Timedelta("3 days") + ser tm.assert_series_equal(result, expected) - result = ser + pd.Timedelta('3 days') + result = ser + pd.Timedelta("3 days") tm.assert_series_equal(result, expected) # TODO: cleanup & parametrize over box def test_mixed_timezone_series_ops_object(self): # GH#13043 - ser = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-01', tz='Asia/Tokyo')], - name='xxx') + ser = pd.Series( + [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-01", tz="Asia/Tokyo"), + ], + name="xxx", + ) assert ser.dtype == object - exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='Asia/Tokyo')], - name='xxx') - tm.assert_series_equal(ser + pd.Timedelta('1 days'), exp) - tm.assert_series_equal(pd.Timedelta('1 days') + ser, exp) + exp = pd.Series( + [ + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="Asia/Tokyo"), + ], + name="xxx", + ) + tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp) + tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp) # object series & object series - ser2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'), - pd.Timestamp('2015-01-05', tz='Asia/Tokyo')], - name='xxx') + ser2 = pd.Series( + [ + pd.Timestamp("2015-01-03", tz="US/Eastern"), + pd.Timestamp("2015-01-05", tz="Asia/Tokyo"), + ], + name="xxx", + ) assert ser2.dtype == object - exp = pd.Series([pd.Timedelta('2 
days'), pd.Timedelta('4 days')], - name='xxx') + exp = pd.Series([pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx") tm.assert_series_equal(ser2 - ser, exp) tm.assert_series_equal(ser - ser2, -exp) - ser = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')], - name='xxx', dtype=object) + ser = pd.Series( + [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")], + name="xxx", + dtype=object, + ) assert ser.dtype == object - exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')], - name='xxx') - tm.assert_series_equal(ser + pd.Timedelta('00:30:00'), exp) - tm.assert_series_equal(pd.Timedelta('00:30:00') + ser, exp) + exp = pd.Series( + [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")], name="xxx" + ) + tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp) + tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp) # TODO: cleanup & parametrize over box def test_iadd_preserves_name(self): # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name ser = pd.Series([1, 2, 3]) - ser.index.name = 'foo' + ser.index.name = "foo" ser.index += 1 assert ser.index.name == "foo" @@ -239,19 +268,19 @@ def test_iadd_preserves_name(self): def test_add_string(self): # from bug report - index = pd.Index(['a', 'b', 'c']) - index2 = index + 'foo' + index = pd.Index(["a", "b", "c"]) + index2 = index + "foo" - assert 'a' not in index2 - assert 'afoo' in index2 + assert "a" not in index2 + assert "afoo" in index2 def test_iadd_string(self): - index = pd.Index(['a', 'b', 'c']) + index = pd.Index(["a", "b", "c"]) # doesn't fail test unless there is a check before `+=` - assert 'a' in index + assert "a" in index - index += '_x' - assert 'a_x' in index + index += "_x" + assert "a_x" in index def test_add(self): index = tm.makeStringIndex(100) @@ -261,16 +290,16 @@ def test_add(self): tm.assert_index_equal(index.tolist() + index, expected) # test add and radd - index = pd.Index(list('abc')) - expected = pd.Index(['a1', 'b1', 'c1']) - tm.assert_index_equal(index + '1', expected) - expected = pd.Index(['1a', '1b', '1c']) - tm.assert_index_equal('1' + index, expected) + index = pd.Index(list("abc")) + expected = pd.Index(["a1", "b1", "c1"]) + tm.assert_index_equal(index + "1", expected) + expected = pd.Index(["1a", "1b", "1c"]) + tm.assert_index_equal("1" + index, expected) def test_sub_fail(self): index = tm.makeStringIndex(100) with pytest.raises(TypeError): - index - 'a' + index - "a" with pytest.raises(TypeError): index - index with pytest.raises(TypeError): @@ -290,10 +319,10 @@ def test_sub_object(self): tm.assert_index_equal(result, expected) with pytest.raises(TypeError): - index - 'foo' + index - "foo" with pytest.raises(TypeError): - index - np.array([2, 'foo']) + index - np.array([2, "foo"]) def test_rsub_object(self): # GH#19369 @@ -307,7 +336,7 @@ def test_rsub_object(self): tm.assert_index_equal(result, expected) with pytest.raises(TypeError): - 'foo' - index + "foo" - index with pytest.raises(TypeError): np.array([True, pd.Timestamp.now()]) - index diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 413d58d9429e7..bd21335a7f9c7 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -29,7 +29,7 @@ def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - pi = pd.period_range('2000', periods=4) + pi = pd.period_range("2000", periods=4) other = 
np.array(pi.to_numpy()[0]) pi = tm.box_expected(pi, box_with_array) @@ -44,14 +44,14 @@ class TestPeriodIndexComparisons: @pytest.mark.parametrize("other", ["2017", 2017]) def test_eq(self, other): - idx = PeriodIndex(['2017', '2017', '2018'], freq="D") + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") expected = np.array([True, True, False]) result = idx == other tm.assert_numpy_array_equal(result, expected) def test_pi_cmp_period(self): - idx = period_range('2007-01', periods=20, freq='M') + idx = period_range("2007-01", periods=20, freq="M") result = idx < idx[10] exp = idx.values < idx.values[10] @@ -61,9 +61,9 @@ def test_pi_cmp_period(self): def test_parr_cmp_period_scalar2(self, box_with_array): xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - pi = pd.period_range('2000-01-01', periods=10, freq='D') + pi = pd.period_range("2000-01-01", periods=10, freq="D") - val = Period('2000-01-04', freq='D') + val = Period("2000-01-04", freq="D") expected = [x > val for x in pi] ser = tm.box_expected(pi, box_with_array) @@ -77,15 +77,14 @@ def test_parr_cmp_period_scalar2(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_period_scalar(self, freq, box_with_array): # GH#13200 xbox = np.ndarray if box_with_array is pd.Index else box_with_array - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) - per = Period('2011-02', freq=freq) + per = Period("2011-02", freq=freq) exp = np.array([False, True, False, False]) exp = tm.box_expected(exp, xbox) @@ -117,18 +116,16 @@ def test_parr_cmp_period_scalar(self, freq, box_with_array): tm.assert_equal(base <= per, exp) tm.assert_equal(per >= base, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi(self, freq, box_with_array): # GH#13200 xbox = np.ndarray if box_with_array is pd.Index else box_with_array - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) # TODO: could also box idx? 
- idx = PeriodIndex(['2011-02', '2011-01', '2011-03', '2011-05'], - freq=freq) + idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq) exp = np.array([False, False, True, False]) exp = tm.box_expected(exp, xbox) @@ -154,25 +151,25 @@ def test_parr_cmp_pi(self, freq, box_with_array): exp = tm.box_expected(exp, xbox) tm.assert_equal(base <= idx, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # GH#13200 # different base freq - base = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq=freq) + base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) msg = "Input has different freq=A-DEC from " with pytest.raises(IncompatibleFrequency, match=msg): - base <= Period('2011', freq='A') + base <= Period("2011", freq="A") with pytest.raises(IncompatibleFrequency, match=msg): - Period('2011', freq='A') >= base + Period("2011", freq="A") >= base # TODO: Could parametrize over boxes for idx? - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') - rev_msg = (r'Input has different freq=(M|2M|3M) from ' - r'PeriodArray\(freq=A-DEC\)') + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") + rev_msg = ( + r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=A-DEC\)" + ) idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -180,41 +177,40 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # Different frequency msg = "Input has different freq=4M from " with pytest.raises(IncompatibleFrequency, match=msg): - base <= Period('2011', freq='4M') + base <= Period("2011", freq="4M") with pytest.raises(IncompatibleFrequency, match=msg): - Period('2011', freq='4M') >= base + Period("2011", freq="4M") >= base - idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') - rev_msg = (r'Input has different freq=(M|2M|3M) from ' - r'PeriodArray\(freq=4M\)') + idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") + rev_msg = r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=4M\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_pi_cmp_nat(self, freq): - idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) - result = idx1 > Period('2011-02', freq=freq) + result = idx1 > Period("2011-02", freq=freq) exp = np.array([False, False, False, True]) tm.assert_numpy_array_equal(result, exp) - result = Period('2011-02', freq=freq) < idx1 + result = Period("2011-02", freq=freq) < idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 == Period('NaT', freq=freq) + result = idx1 == Period("NaT", freq=freq) exp = np.array([False, False, False, False]) tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) == idx1 + result = Period("NaT", freq=freq) == idx1 tm.assert_numpy_array_equal(result, exp) - result = idx1 != Period('NaT', freq=freq) + result = idx1 != Period("NaT", freq=freq) exp = np.array([True, True, True, True]) tm.assert_numpy_array_equal(result, exp) - result = Period('NaT', freq=freq) != idx1 + result = Period("NaT", 
freq=freq) != idx1 tm.assert_numpy_array_equal(result, exp) - idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq) + idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq) result = idx1 < idx2 exp = np.array([True, False, False, False]) tm.assert_numpy_array_equal(result, exp) @@ -235,11 +231,11 @@ def test_pi_cmp_nat(self, freq): exp = np.array([False, False, True, False]) tm.assert_numpy_array_equal(result, exp) - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_pi_cmp_nat_mismatched_freq_raises(self, freq): - idx1 = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-05'], freq=freq) + idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq) - diff = PeriodIndex(['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M') + diff = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq="4M") msg = "Input has different freq=4M from Period(Array|Index)" with pytest.raises(IncompatibleFrequency, match=msg): idx1 > diff @@ -248,11 +244,12 @@ def test_pi_cmp_nat_mismatched_freq_raises(self, freq): idx1 == diff # TODO: De-duplicate with test_pi_cmp_nat - @pytest.mark.parametrize('dtype', [object, None]) + @pytest.mark.parametrize("dtype", [object, None]) def test_comp_nat(self, dtype): - left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, - pd.Period('2011-01-03')]) - right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + left = pd.PeriodIndex( + [pd.Period("2011-01-01"), pd.NaT, pd.Period("2011-01-03")] + ) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period("2011-01-03")]) if dtype is not None: left = left.astype(dtype) @@ -282,15 +279,23 @@ def test_comp_nat(self, dtype): class TestPeriodSeriesComparisons: def test_cmp_series_period_series_mixed_freq(self): # GH#13200 - base = Series([Period('2011', freq='A'), - Period('2011-02', freq='M'), - Period('2013', freq='A'), - Period('2011-04', freq='M')]) - - ser = Series([Period('2012', freq='A'), - Period('2011-01', freq='M'), - Period('2013', freq='A'), - Period('2011-05', freq='M')]) + base = Series( + [ + Period("2011", freq="A"), + Period("2011-02", freq="M"), + Period("2013", freq="A"), + Period("2011-04", freq="M"), + ] + ) + + ser = Series( + [ + Period("2012", freq="A"), + Period("2011-01", freq="M"), + Period("2013", freq="A"), + Period("2011-05", freq="M"), + ] + ) exp = Series([False, False, True, False]) tm.assert_series_equal(base == ser, exp) @@ -313,6 +318,7 @@ def test_cmp_series_period_series_mixed_freq(self): class TestPeriodIndexSeriesComparisonConsistency: """ Test PeriodIndex and Period Series Ops consistency """ + # TODO: needs parametrization+de-duplication def _check(self, values, func, expected): @@ -332,41 +338,43 @@ def _check(self, values, func, expected): tm.assert_series_equal(result, exp) def test_pi_comp_period(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - f = lambda x: x == pd.Period('2011-03', freq='M') + f = lambda x: x == pd.Period("2011-03", freq="M") exp = np.array([False, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') == x + f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) - f = lambda x: x != pd.Period('2011-03', freq='M') + f = lambda x: x != pd.Period("2011-03", freq="M") exp = np.array([True, True, False, True], dtype=np.bool) self._check(idx, f, 
exp) - f = lambda x: pd.Period('2011-03', freq='M') != x + f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, True, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: x > pd.Period('2011-03', freq='M') + f = lambda x: x > pd.Period("2011-03", freq="M") exp = np.array([False, False, False, True], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, True, True, False], dtype=np.bool) self._check(idx, f, exp) def test_pi_comp_period_nat(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) - f = lambda x: x == pd.Period('2011-03', freq='M') + f = lambda x: x == pd.Period("2011-03", freq="M") exp = np.array([False, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') == x + f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) f = lambda x: x == pd.NaT @@ -375,10 +383,10 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT == x self._check(idx, f, exp) - f = lambda x: x != pd.Period('2011-03', freq='M') + f = lambda x: x != pd.Period("2011-03", freq="M") exp = np.array([True, True, False, True], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') != x + f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) f = lambda x: x != pd.NaT @@ -387,11 +395,11 @@ def test_pi_comp_period_nat(self): f = lambda x: pd.NaT != x self._check(idx, f, exp) - f = lambda x: pd.Period('2011-03', freq='M') >= x + f = lambda x: pd.Period("2011-03", freq="M") >= x exp = np.array([True, False, True, False], dtype=np.bool) self._check(idx, f, exp) - f = lambda x: x < pd.Period('2011-03', freq='M') + f = lambda x: x < pd.Period("2011-03", freq="M") exp = np.array([True, False, False, False], dtype=np.bool) self._check(idx, f, exp) @@ -407,34 +415,46 @@ def test_pi_comp_period_nat(self): # ------------------------------------------------------------------ # Arithmetic -class TestPeriodFrameArithmetic: +class TestPeriodFrameArithmetic: def test_ops_frame_period(self): # GH#13043 - df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), - pd.Period('2015-02', freq='M')], - 'B': [pd.Period('2014-01', freq='M'), - pd.Period('2014-02', freq='M')]}) - assert df['A'].dtype == 'Period[M]' - assert df['B'].dtype == 'Period[M]' - - p = pd.Period('2015-03', freq='M') + df = pd.DataFrame( + { + "A": [pd.Period("2015-01", freq="M"), pd.Period("2015-02", freq="M")], + "B": [pd.Period("2014-01", freq="M"), pd.Period("2014-02", freq="M")], + } + ) + assert df["A"].dtype == "Period[M]" + assert df["B"].dtype == "Period[M]" + + p = pd.Period("2015-03", freq="M") off = p.freq # dtype will be object because of original dtype - exp = pd.DataFrame({'A': np.array([2 * off, 1 * off], dtype=object), - 'B': np.array([14 * off, 13 * off], dtype=object)}) + exp = pd.DataFrame( + { + "A": np.array([2 * off, 1 * off], dtype=object), + "B": np.array([14 * off, 13 * off], dtype=object), + } + ) tm.assert_frame_equal(p - df, exp) tm.assert_frame_equal(df - p, -1 * exp) - df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'), - pd.Period('2015-06', freq='M')], - 'B': [pd.Period('2015-05', freq='M'), - 
pd.Period('2015-06', freq='M')]}) - assert df2['A'].dtype == 'Period[M]' - assert df2['B'].dtype == 'Period[M]' - - exp = pd.DataFrame({'A': np.array([4 * off, 4 * off], dtype=object), - 'B': np.array([16 * off, 16 * off], dtype=object)}) + df2 = pd.DataFrame( + { + "A": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + "B": [pd.Period("2015-05", freq="M"), pd.Period("2015-06", freq="M")], + } + ) + assert df2["A"].dtype == "Period[M]" + assert df2["B"].dtype == "Period[M]" + + exp = pd.DataFrame( + { + "A": np.array([4 * off, 4 * off], dtype=object), + "B": np.array([16 * off, 16 * off], dtype=object), + } + ) tm.assert_frame_equal(df2 - df, exp) tm.assert_frame_equal(df - df2, -1 * exp) @@ -447,8 +467,8 @@ class TestPeriodIndexArithmetic: # and PeriodIndex (with matching freq) def test_parr_add_iadd_parr_raises(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) # TODO: parametrize over boxes for other? rng = tm.box_expected(rng, box_with_array) @@ -467,8 +487,8 @@ def test_pi_sub_isub_pi(self): # For historical reference see GH#14164, GH#13077. # PeriodIndex subtraction originally performed set difference, # then changed to raise TypeError before being implemented in GH#20049 - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="D", periods=5) off = rng.freq expected = pd.Index([-5 * off] * 5) @@ -479,7 +499,7 @@ def test_pi_sub_isub_pi(self): tm.assert_index_equal(rng, expected) def test_pi_sub_pi_with_nat(self): - rng = pd.period_range('1/1/2000', freq='D', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) other = rng[1:].insert(0, pd.NaT) assert other[1:].equals(rng[1:]) @@ -489,68 +509,77 @@ def test_pi_sub_pi_with_nat(self): tm.assert_index_equal(result, expected) def test_parr_sub_pi_mismatched_freq(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=5) - other = pd.period_range('1/6/2000', freq='H', periods=5) + rng = pd.period_range("1/1/2000", freq="D", periods=5) + other = pd.period_range("1/6/2000", freq="H", periods=5) # TODO: parametrize over boxes for other? 
rng = tm.box_expected(rng, box_with_array) with pytest.raises(IncompatibleFrequency): rng - other - @pytest.mark.parametrize('n', [1, 2, 3, 4]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) def test_sub_n_gt_1_ticks(self, tick_classes, n): # GH 23878 - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" p1 = pd.PeriodIndex([p1_d], freq=tick_classes(n)) p2 = pd.PeriodIndex([p2_d], freq=tick_classes(n)) - expected = (pd.PeriodIndex([p2_d], freq=p2.freq.base) - - pd.PeriodIndex([p1_d], freq=p1.freq.base)) + expected = pd.PeriodIndex([p2_d], freq=p2.freq.base) - pd.PeriodIndex( + [p1_d], freq=p1.freq.base + ) tm.assert_index_equal((p2 - p1), expected) - @pytest.mark.parametrize('n', [1, 2, 3, 4]) - @pytest.mark.parametrize('offset, kwd_name', [ - (pd.offsets.YearEnd, 'month'), - (pd.offsets.QuarterEnd, 'startingMonth'), - (pd.offsets.MonthEnd, None), - (pd.offsets.Week, 'weekday') - ]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) def test_sub_n_gt_1_offsets(self, offset, kwd_name, n): # GH 23878 kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" freq = offset(n, normalize=False, **kwds) p1 = pd.PeriodIndex([p1_d], freq=freq) p2 = pd.PeriodIndex([p2_d], freq=freq) result = p2 - p1 - expected = (pd.PeriodIndex([p2_d], freq=freq.base) - - pd.PeriodIndex([p1_d], freq=freq.base)) + expected = pd.PeriodIndex([p2_d], freq=freq.base) - pd.PeriodIndex( + [p1_d], freq=freq.base + ) tm.assert_index_equal(result, expected) # ------------------------------------------------------------- # Invalid Operations - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) - @pytest.mark.parametrize('op', [operator.add, ops.radd, - operator.sub, ops.rsub]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) def test_parr_add_sub_float_raises(self, op, other, box_with_array): - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq='D') - pi = dti.to_period('D') + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") + pi = dti.to_period("D") pi = tm.box_expected(pi, box_with_array) with pytest.raises(TypeError): op(pi, other) - @pytest.mark.parametrize('other', [pd.Timestamp.now(), - pd.Timestamp.now().to_pydatetime(), - pd.Timestamp.now().to_datetime64()]) + @pytest.mark.parametrize( + "other", + [ + pd.Timestamp.now(), + pd.Timestamp.now().to_pydatetime(), + pd.Timestamp.now().to_datetime64(), + ], + ) def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # GH#23215 - rng = pd.period_range('1/1/2000', freq='D', periods=3) + rng = pd.period_range("1/1/2000", freq="D", periods=3) rng = tm.box_expected(rng, box_with_array) with pytest.raises(TypeError): @@ -566,8 +595,8 @@ def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] def test_parr_add_sub_dt64_array_raises(self, box_with_array): - rng = pd.period_range('1/1/2000', freq='D', periods=3) - dti = pd.date_range('2016-01-01', periods=3) + rng = pd.period_range("1/1/2000", freq="D", periods=3) + dti = pd.date_range("2016-01-01", periods=3) dtarr = dti.values rng = tm.box_expected(rng, box_with_array) @@ -583,8 +612,8 @@ def 
test_parr_add_sub_dt64_array_raises(self, box_with_array): dtarr - rng def test_pi_add_sub_td64_array_non_tick_raises(self): - rng = pd.period_range('1/1/2000', freq='Q', periods=3) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + rng = pd.period_range("1/1/2000", freq="Q", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values with pytest.raises(IncompatibleFrequency): @@ -600,11 +629,11 @@ def test_pi_add_sub_td64_array_non_tick_raises(self): def test_pi_add_sub_td64_array_tick(self): # PeriodIndex + Timedelta-like is allowed only with # tick-like frequencies - rng = pd.period_range('1/1/2000', freq='90D', periods=3) - tdi = pd.TimedeltaIndex(['-1 Day', '-1 Day', '-1 Day']) + rng = pd.period_range("1/1/2000", freq="90D", periods=3) + tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) tdarr = tdi.values - expected = pd.period_range('12/31/1999', freq='90D', periods=3) + expected = pd.period_range("12/31/1999", freq="90D", periods=3) result = rng + tdi tm.assert_index_equal(result, expected) result = rng + tdarr @@ -614,7 +643,7 @@ def test_pi_add_sub_td64_array_tick(self): result = tdarr + rng tm.assert_index_equal(result, expected) - expected = pd.period_range('1/2/2000', freq='90D', periods=3) + expected = pd.period_range("1/2/2000", freq="90D", periods=3) result = rng - tdi tm.assert_index_equal(result, expected) @@ -630,13 +659,17 @@ def test_pi_add_sub_td64_array_tick(self): # ----------------------------------------------------------------- # operations with array/Index of DateOffset objects - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_pi_add_offset_array(self, box): # GH#18849 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - offs = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) - expected = pd.PeriodIndex([pd.Period('2015Q2'), pd.Period('2015Q4')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + offs = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) + expected = pd.PeriodIndex([pd.Period("2015Q2"), pd.Period("2015Q4")]) with tm.assert_produces_warning(PerformanceWarning): res = pi + offs @@ -646,8 +679,7 @@ def test_pi_add_offset_array(self, box): res2 = offs + pi tm.assert_index_equal(res2, expected) - unanchored = np.array([pd.offsets.Hour(n=1), - pd.offsets.Minute(n=-2)]) + unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) # addition/subtraction ops with incompatible offsets should issue # a PerformanceWarning and _then_ raise a TypeError. 
with pytest.raises(IncompatibleFrequency): @@ -657,12 +689,16 @@ def test_pi_add_offset_array(self, box): with tm.assert_produces_warning(PerformanceWarning): unanchored + pi - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_pi_sub_offset_array(self, box): # GH#18824 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('2016Q2')]) - other = box([pd.offsets.QuarterEnd(n=1, startingMonth=12), - pd.offsets.QuarterEnd(n=-2, startingMonth=12)]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("2016Q2")]) + other = box( + [ + pd.offsets.QuarterEnd(n=1, startingMonth=12), + pd.offsets.QuarterEnd(n=-2, startingMonth=12), + ] + ) expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))]) @@ -683,9 +719,9 @@ def test_pi_sub_offset_array(self, box): def test_pi_add_iadd_int(self, one): # Variants of `one` for #19012 - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) result = rng + one - expected = pd.period_range('2000-01-01 10:00', freq='H', periods=10) + expected = pd.period_range("2000-01-01 10:00", freq="H", periods=10) tm.assert_index_equal(result, expected) rng += one tm.assert_index_equal(rng, expected) @@ -695,16 +731,16 @@ def test_pi_sub_isub_int(self, one): PeriodIndex.__sub__ and __isub__ with several representations of the integer 1, e.g. int, np.int64, np.uint8, ... """ - rng = pd.period_range('2000-01-01 09:00', freq='H', periods=10) + rng = pd.period_range("2000-01-01 09:00", freq="H", periods=10) result = rng - one - expected = pd.period_range('2000-01-01 08:00', freq='H', periods=10) + expected = pd.period_range("2000-01-01 08:00", freq="H", periods=10) tm.assert_index_equal(result, expected) rng -= one tm.assert_index_equal(rng, expected) - @pytest.mark.parametrize('five', [5, np.array(5, dtype=np.int64)]) + @pytest.mark.parametrize("five", [5, np.array(5, dtype=np.int64)]) def test_pi_sub_intlike(self, five): - rng = period_range('2007-01', periods=50) + rng = period_range("2007-01", periods=50) result = rng - five exp = rng + (-five) @@ -713,16 +749,16 @@ def test_pi_sub_intlike(self, five): def test_pi_sub_isub_offset(self): # offset # DateOffset - rng = pd.period_range('2014', '2024', freq='A') + rng = pd.period_range("2014", "2024", freq="A") result = rng - pd.offsets.YearEnd(5) - expected = pd.period_range('2009', '2019', freq='A') + expected = pd.period_range("2009", "2019", freq="A") tm.assert_index_equal(result, expected) rng -= pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) - rng = pd.period_range('2014-01', '2016-12', freq='M') + rng = pd.period_range("2014-01", "2016-12", freq="M") result = rng - pd.offsets.MonthEnd(5) - expected = pd.period_range('2013-08', '2016-07', freq='M') + expected = pd.period_range("2013-08", "2016-07", freq="M") tm.assert_index_equal(result, expected) rng -= pd.offsets.MonthEnd(5) @@ -733,10 +769,10 @@ def test_pi_add_offset_n_gt1(self, box_transpose_fail): # add offset to PeriodIndex with freq.n > 1 box, transpose = box_transpose_fail - per = pd.Period('2016-01', freq='2M') + per = pd.Period("2016-01", freq="2M") pi = pd.PeriodIndex([per]) - expected = pd.PeriodIndex(['2016-03'], freq='2M') + expected = pd.PeriodIndex(["2016-03"], freq="2M") pi = tm.box_expected(pi, box, transpose=transpose) expected = tm.box_expected(expected, box, transpose=transpose) @@ -750,41 +786,41 @@ def test_pi_add_offset_n_gt1(self, box_transpose_fail): def 
test_pi_add_offset_n_gt1_not_divisible(self, box_with_array): # GH#23215 # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0 - pi = pd.PeriodIndex(['2016-01'], freq='2M') - expected = pd.PeriodIndex(['2016-04'], freq='2M') + pi = pd.PeriodIndex(["2016-01"], freq="2M") + expected = pd.PeriodIndex(["2016-04"], freq="2M") # FIXME: with transposing these tests fail pi = tm.box_expected(pi, box_with_array, transpose=False) expected = tm.box_expected(expected, box_with_array, transpose=False) - result = pi + to_offset('3M') + result = pi + to_offset("3M") tm.assert_equal(result, expected) - result = to_offset('3M') + pi + result = to_offset("3M") + pi tm.assert_equal(result, expected) # --------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_pi_add_intarray(self, int_holder, op): # GH#19959 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) other = int_holder([4, -1]) result = op(pi, other) - expected = pd.PeriodIndex([pd.Period('2016Q1'), pd.Period('NaT')]) + expected = pd.PeriodIndex([pd.Period("2016Q1"), pd.Period("NaT")]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('int_holder', [np.array, pd.Index]) + @pytest.mark.parametrize("int_holder", [np.array, pd.Index]) def test_pi_sub_intarray(self, int_holder): # GH#19959 - pi = pd.PeriodIndex([pd.Period('2015Q1'), pd.Period('NaT')]) + pi = pd.PeriodIndex([pd.Period("2015Q1"), pd.Period("NaT")]) other = int_holder([4, -1]) result = pi - other - expected = pd.PeriodIndex([pd.Period('2014Q1'), pd.Period('NaT')]) + expected = pd.PeriodIndex([pd.Period("2014Q1"), pd.Period("NaT")]) tm.assert_index_equal(result, expected) with pytest.raises(TypeError): @@ -800,10 +836,9 @@ def test_pi_add_timedeltalike_minute_gt1(self, three_days): # in test_pi_add_timedeltalike_tick_gt1, but here we write out the # expected result more explicitly. 
other = three_days - rng = pd.period_range('2014-05-01', periods=3, freq='2D') + rng = pd.period_range("2014-05-01", periods=3, freq="2D") - expected = pd.PeriodIndex(['2014-05-04', '2014-05-06', '2014-05-08'], - freq='2D') + expected = pd.PeriodIndex(["2014-05-04", "2014-05-06", "2014-05-08"], freq="2D") result = rng + other tm.assert_index_equal(result, expected) @@ -812,21 +847,19 @@ def test_pi_add_timedeltalike_minute_gt1(self, three_days): tm.assert_index_equal(result, expected) # subtraction - expected = pd.PeriodIndex(['2014-04-28', '2014-04-30', '2014-05-02'], - freq='2D') + expected = pd.PeriodIndex(["2014-04-28", "2014-04-30", "2014-05-02"], freq="2D") result = rng - other tm.assert_index_equal(result, expected) with pytest.raises(TypeError): other - rng - @pytest.mark.parametrize('freqstr', ['5ns', '5us', '5ms', - '5s', '5T', '5h', '5d']) + @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5T", "5h", "5d"]) def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr): # GH#23031 adding a time-delta-like offset to a PeriodArray that has # tick-like frequency with n != 1 other = three_days - rng = pd.period_range('2014-05-01', periods=6, freq=freqstr) + rng = pd.period_range("2014-05-01", periods=6, freq=freqstr) expected = pd.period_range(rng[0] + other, periods=6, freq=freqstr) @@ -847,8 +880,8 @@ def test_pi_add_timedeltalike_tick_gt1(self, three_days, freqstr): def test_pi_add_iadd_timedeltalike_daily(self, three_days): # Tick other = three_days - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - expected = pd.period_range('2014-05-04', '2014-05-18', freq='D') + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = pd.period_range("2014-05-04", "2014-05-18", freq="D") result = rng + other tm.assert_index_equal(result, expected) @@ -859,8 +892,8 @@ def test_pi_add_iadd_timedeltalike_daily(self, three_days): def test_pi_sub_isub_timedeltalike_daily(self, three_days): # Tick-like 3 Days other = three_days - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - expected = pd.period_range('2014-04-28', '2014-05-12', freq='D') + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + expected = pd.period_range("2014-04-28", "2014-05-12", freq="D") result = rng - other tm.assert_index_equal(result, expected) @@ -870,8 +903,8 @@ def test_pi_sub_isub_timedeltalike_daily(self, three_days): def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily): other = not_daily - rng = pd.period_range('2014-05-01', '2014-05-15', freq='D') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=D\\)' + rng = pd.period_range("2014-05-01", "2014-05-15", freq="D") + msg = "Input has different freq(=.+)? 
from Period.*?\\(freq=D\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -883,9 +916,8 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_daily(self, not_daily): def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): other = two_hours - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - expected = pd.period_range('2014-01-01 12:00', '2014-01-05 12:00', - freq='H') + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="H") result = rng + other tm.assert_index_equal(result, expected) @@ -895,8 +927,8 @@ def test_pi_add_iadd_timedeltalike_hourly(self, two_hours): def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): other = not_hourly - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=H\\)' + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=H\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other @@ -906,9 +938,8 @@ def test_pi_add_timedeltalike_mismatched_freq_hourly(self, not_hourly): def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): other = two_hours - rng = pd.period_range('2014-01-01 10:00', '2014-01-05 10:00', freq='H') - expected = pd.period_range('2014-01-01 08:00', '2014-01-05 08:00', - freq='H') + rng = pd.period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="H") + expected = pd.period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="H") result = rng - other tm.assert_index_equal(result, expected) @@ -919,19 +950,17 @@ def test_pi_sub_isub_timedeltalike_hourly(self, two_hours): def test_add_iadd_timedeltalike_annual(self): # offset # DateOffset - rng = pd.period_range('2014', '2024', freq='A') + rng = pd.period_range("2014", "2024", freq="A") result = rng + pd.offsets.YearEnd(5) - expected = pd.period_range('2019', '2029', freq='A') + expected = pd.period_range("2019", "2029", freq="A") tm.assert_index_equal(result, expected) rng += pd.offsets.YearEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, - mismatched_freq): + def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq): other = mismatched_freq - rng = pd.period_range('2014', '2024', freq='A') - msg = ('Input has different freq(=.+)? ' - 'from Period.*?\\(freq=A-DEC\\)') + rng = pd.period_range("2014", "2024", freq="A") + msg = "Input has different freq(=.+)? 
" "from Period.*?\\(freq=A-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -942,8 +971,8 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, rng -= other def test_pi_add_iadd_timedeltalike_M(self): - rng = pd.period_range('2014-01', '2016-12', freq='M') - expected = pd.period_range('2014-06', '2017-05', freq='M') + rng = pd.period_range("2014-01", "2016-12", freq="M") + expected = pd.period_range("2014-06", "2017-05", freq="M") result = rng + pd.offsets.MonthEnd(5) tm.assert_index_equal(result, expected) @@ -951,11 +980,10 @@ def test_pi_add_iadd_timedeltalike_M(self): rng += pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, - mismatched_freq): + def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): other = mismatched_freq - rng = pd.period_range('2014-01', '2016-12', freq='M') - msg = 'Input has different freq(=.+)? from Period.*?\\(freq=M\\)' + rng = pd.period_range("2014-01", "2016-12", freq="M") + msg = "Input has different freq(=.+)? from Period.*?\\(freq=M\\)" with pytest.raises(IncompatibleFrequency, match=msg): rng + other with pytest.raises(IncompatibleFrequency, match=msg): @@ -989,17 +1017,21 @@ def test_parr_add_sub_td64_nat(self, box_transpose_fail): class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): # GH#13043 - ser = pd.Series([pd.Period('2015-01-01', freq='D'), - pd.Period('2015-01-02', freq='D')], name='xxx') - assert ser.dtype == 'Period[D]' + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) + assert ser.dtype == "Period[D]" - expected = pd.Series([pd.Period('2015-01-02', freq='D'), - pd.Period('2015-01-03', freq='D')], name='xxx') + expected = pd.Series( + [pd.Period("2015-01-02", freq="D"), pd.Period("2015-01-03", freq="D")], + name="xxx", + ) - result = ser + pd.Timedelta('1 days') + result = ser + pd.Timedelta("1 days") tm.assert_series_equal(result, expected) - result = pd.Timedelta('1 days') + ser + result = pd.Timedelta("1 days") + ser tm.assert_series_equal(result, expected) result = ser + pd.tseries.offsets.Day() @@ -1010,22 +1042,26 @@ def test_ops_series_timedelta(self): def test_ops_series_period(self): # GH#13043 - ser = pd.Series([pd.Period('2015-01-01', freq='D'), - pd.Period('2015-01-02', freq='D')], name='xxx') + ser = pd.Series( + [pd.Period("2015-01-01", freq="D"), pd.Period("2015-01-02", freq="D")], + name="xxx", + ) assert ser.dtype == "Period[D]" - per = pd.Period('2015-01-10', freq='D') + per = pd.Period("2015-01-10", freq="D") off = per.freq # dtype will be object because of original dtype - expected = pd.Series([9 * off, 8 * off], name='xxx', dtype=object) + expected = pd.Series([9 * off, 8 * off], name="xxx", dtype=object) tm.assert_series_equal(per - ser, expected) tm.assert_series_equal(ser - per, -1 * expected) - s2 = pd.Series([pd.Period('2015-01-05', freq='D'), - pd.Period('2015-01-04', freq='D')], name='xxx') + s2 = pd.Series( + [pd.Period("2015-01-05", freq="D"), pd.Period("2015-01-04", freq="D")], + name="xxx", + ) assert s2.dtype == "Period[D]" - expected = pd.Series([4 * off, 2 * off], name='xxx', dtype=object) + expected = pd.Series([4 * off, 2 * off], name="xxx", dtype=object) tm.assert_series_equal(s2 - ser, expected) tm.assert_series_equal(ser - s2, -1 * expected) @@ -1045,52 +1081,62 @@ def _check(self, values, func, expected): tm.assert_series_equal(result, 
exp) def test_pi_ops(self): - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - expected = PeriodIndex(['2011-03', '2011-04', '2011-05', '2011-06'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-03", "2011-04", "2011-05", "2011-06"], freq="M", name="idx" + ) self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) self._check(idx + 2, lambda x: x - 2, idx) - result = idx - Period('2011-01', freq='M') + result = idx - Period("2011-01", freq="M") off = idx.freq - exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name='idx') + exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name="idx") tm.assert_index_equal(result, exp) - result = Period('2011-01', freq='M') - idx - exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name='idx') + result = Period("2011-01", freq="M") - idx + exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name="idx") tm.assert_index_equal(result, exp) @pytest.mark.parametrize("ng", ["str", 1.5]) - @pytest.mark.parametrize("func", [ - lambda obj, ng: obj + ng, - lambda obj, ng: ng + obj, - lambda obj, ng: obj - ng, - lambda obj, ng: ng - obj, - lambda obj, ng: np.add(obj, ng), - lambda obj, ng: np.add(ng, obj), - lambda obj, ng: np.subtract(obj, ng), - lambda obj, ng: np.subtract(ng, obj), - ]) + @pytest.mark.parametrize( + "func", + [ + lambda obj, ng: obj + ng, + lambda obj, ng: ng + obj, + lambda obj, ng: obj - ng, + lambda obj, ng: ng - obj, + lambda obj, ng: np.add(obj, ng), + lambda obj, ng: np.add(ng, obj), + lambda obj, ng: np.subtract(obj, ng), + lambda obj, ng: np.subtract(ng, obj), + ], + ) def test_parr_ops_errors(self, ng, func, box_with_array): - idx = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], - freq="M", name="idx") + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) obj = tm.box_expected(idx, box_with_array) - msg = (r"unsupported operand type\(s\)|can only concatenate|" - r"must be str|object to str implicitly") + msg = ( + r"unsupported operand type\(s\)|can only concatenate|" + r"must be str|object to str implicitly" + ) with pytest.raises(TypeError, match=msg): func(obj, ng) def test_pi_ops_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') - expected = PeriodIndex(['2011-03', '2011-04', 'NaT', '2011-06'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + expected = PeriodIndex( + ["2011-03", "2011-04", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) @@ -1100,10 +1146,12 @@ def test_pi_ops_nat(self): self._check(idx + 2, lambda x: np.subtract(x, 2), idx) # freq with mult - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='2M', name='idx') - expected = PeriodIndex(['2011-07', '2011-08', 'NaT', '2011-10'], - freq='2M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="2M", name="idx" + ) + expected = PeriodIndex( + ["2011-07", "2011-08", "NaT", "2011-10"], freq="2M", name="idx" + ) self._check(idx, lambda x: x + 3, expected) self._check(idx, lambda x: 3 + x, expected) @@ -1114,49 +1162,69 @@ def test_pi_ops_nat(self): def test_pi_ops_array_int(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + 
["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) f = lambda x: x + np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], - freq='M', name='idx') + exp = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: np.add(x, np.array([4, -1, 1, 2])) - exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], - freq='M', name='idx') + exp = PeriodIndex( + ["2011-05", "2011-01", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: x - np.array([1, 2, 3, 4]) - exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], - freq='M', name='idx') + exp = PeriodIndex( + ["2010-12", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) self._check(idx, f, exp) f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) - exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], - freq='M', name='idx') + exp = PeriodIndex( + ["2010-10", "2010-12", "NaT", "2011-06"], freq="M", name="idx" + ) self._check(idx, f, exp) def test_pi_ops_offset(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) f = lambda x: x + pd.offsets.Day() - exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', - '2011-04-02'], freq='D', name='idx') + exp = PeriodIndex( + ["2011-01-02", "2011-02-02", "2011-03-02", "2011-04-02"], + freq="D", + name="idx", + ) self._check(idx, f, exp) f = lambda x: x + pd.offsets.Day(2) - exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', - '2011-04-03'], freq='D', name='idx') + exp = PeriodIndex( + ["2011-01-03", "2011-02-03", "2011-03-03", "2011-04-03"], + freq="D", + name="idx", + ) self._check(idx, f, exp) f = lambda x: x - pd.offsets.Day(2) - exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', - '2011-03-30'], freq='D', name='idx') + exp = PeriodIndex( + ["2010-12-30", "2011-01-30", "2011-02-27", "2011-03-30"], + freq="D", + name="idx", + ) self._check(idx, f, exp) def test_pi_offset_errors(self): - idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', - '2011-04-01'], freq='D', name='idx') + idx = PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"], + freq="D", + name="idx", + ) ser = pd.Series(idx) # Series op is applied per Period instance, thus error is raised @@ -1175,50 +1243,53 @@ def test_pi_offset_errors(self): def test_pi_sub_period(self): # GH#13071 - idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx" + ) - result = idx - pd.Period('2012-01', freq='M') + result = idx - pd.Period("2012-01", freq="M") off = idx.freq - exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name='idx') + exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name="idx") tm.assert_index_equal(result, exp) - result = np.subtract(idx, pd.Period('2012-01', freq='M')) + result = np.subtract(idx, pd.Period("2012-01", freq="M")) tm.assert_index_equal(result, exp) - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name='idx') + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name="idx") tm.assert_index_equal(result, exp) - result = np.subtract(pd.Period('2012-01', freq='M'), idx) + result = 
np.subtract(pd.Period("2012-01", freq="M"), idx) tm.assert_index_equal(result, exp) - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) def test_pi_sub_pdnat(self): # GH#13071 - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') - exp = pd.TimedeltaIndex([pd.NaT] * 4, name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) + exp = pd.TimedeltaIndex([pd.NaT] * 4, name="idx") tm.assert_index_equal(pd.NaT - idx, exp) tm.assert_index_equal(idx - pd.NaT, exp) def test_pi_sub_period_nat(self): # GH#13071 - idx = PeriodIndex(['2011-01', 'NaT', '2011-03', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx" + ) - result = idx - pd.Period('2012-01', freq='M') + result = idx - pd.Period("2012-01", freq="M") off = idx.freq - exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name='idx') + exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name="idx") tm.assert_index_equal(result, exp) - result = pd.Period('2012-01', freq='M') - idx - exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name='idx') + result = pd.Period("2012-01", freq="M") - idx + exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name="idx") tm.assert_index_equal(result, exp) - exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') - tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) - tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name="idx") + tm.assert_index_equal(idx - pd.Period("NaT", freq="M"), exp) + tm.assert_index_equal(pd.Period("NaT", freq="M") - idx, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 0ae325cfce787..06c4a6ece4bcc 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -5,13 +5,19 @@ import numpy as np import pytest -from pandas.errors import ( - NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning) +from pandas.errors import NullFrequencyError, OutOfBoundsDatetime, PerformanceWarning import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, NaT, Series, Timedelta, TimedeltaIndex, - Timestamp, timedelta_range) + DataFrame, + DatetimeIndex, + NaT, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + timedelta_range, +) import pandas.util.testing as tm @@ -31,6 +37,7 @@ def get_upcast_box(box, vector): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons + class TestTimedelta64ArrayLikeComparisons: # Comparison tests for timedelta64[ns] vectors fully parametrized over # DataFrame/Series/TimedeltaIndex/TimedeltaArray. 
Ideally all comparison @@ -41,7 +48,7 @@ def test_compare_timedelta64_zerodim(self, box_with_array): box = box_with_array xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = pd.timedelta_range('2H', periods=4) + tdi = pd.timedelta_range("2H", periods=4) other = np.array(tdi.to_numpy()[0]) tdi = tm.box_expected(tdi, box) @@ -68,10 +75,10 @@ def test_compare_timedelta_series(self): def test_tdi_cmp_str_invalid(self, box_with_array): # GH#13624 xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = TimedeltaIndex(['1 day', '2 days']) + tdi = TimedeltaIndex(["1 day", "2 days"]) tdarr = tm.box_expected(tdi, box_with_array) - for left, right in [(tdarr, 'a'), ('a', tdarr)]: + for left, right in [(tdarr, "a"), ("a", tdarr)]: with pytest.raises(TypeError): left > right with pytest.raises(TypeError): @@ -91,11 +98,12 @@ def test_tdi_cmp_str_invalid(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("dtype", [None, object]) def test_comp_nat(self, dtype): - left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, - pd.Timedelta('3 days')]) - right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + left = pd.TimedeltaIndex( + [pd.Timedelta("1 days"), pd.NaT, pd.Timedelta("3 days")] + ) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta("3 days")]) lhs, rhs = left, right if dtype is object: @@ -122,15 +130,29 @@ def test_comp_nat(self, dtype): tm.assert_numpy_array_equal(pd.NaT > lhs, expected) def test_comparisons_nat(self): - tdidx1 = pd.TimedeltaIndex(['1 day', pd.NaT, '1 day 00:00:01', pd.NaT, - '1 day 00:00:01', '5 day 00:00:03']) - tdidx2 = pd.TimedeltaIndex(['2 day', '2 day', pd.NaT, pd.NaT, - '1 day 00:00:02', '5 days 00:00:03']) - tdarr = np.array([np.timedelta64(2, 'D'), - np.timedelta64(2, 'D'), np.timedelta64('nat'), - np.timedelta64('nat'), - np.timedelta64(1, 'D') + np.timedelta64(2, 's'), - np.timedelta64(5, 'D') + np.timedelta64(3, 's')]) + tdidx1 = pd.TimedeltaIndex( + [ + "1 day", + pd.NaT, + "1 day 00:00:01", + pd.NaT, + "1 day 00:00:01", + "5 day 00:00:03", + ] + ) + tdidx2 = pd.TimedeltaIndex( + ["2 day", "2 day", pd.NaT, pd.NaT, "1 day 00:00:02", "5 days 00:00:03"] + ) + tdarr = np.array( + [ + np.timedelta64(2, "D"), + np.timedelta64(2, "D"), + np.timedelta64("nat"), + np.timedelta64("nat"), + np.timedelta64(1, "D") + np.timedelta64(2, "s"), + np.timedelta64(5, "D") + np.timedelta64(3, "s"), + ] + ) cases = [(tdidx1, tdidx2), (tdidx1, tdarr)] @@ -163,7 +185,7 @@ def test_comparisons_nat(self): # TODO: better name def test_comparisons_coverage(self): - rng = timedelta_range('1 days', periods=10) + rng = timedelta_range("1 days", periods=10) result = rng < rng[3] expected = np.array([True, True, True] + [False] * 7) @@ -181,53 +203,49 @@ def test_comparisons_coverage(self): # ------------------------------------------------------------------ # Timedelta64[ns] dtype Arithmetic Operations + class TestTimedelta64ArithmeticUnsorted: # Tests moved from type-specific test files but not # yet sorted/parametrized/de-duplicated def test_ufunc_coercions(self): # normal ops are also tested in tseries/test_timedeltas.py - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") for result in [idx * 2, np.multiply(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['4H', '8H', 
'12H', '16H', '20H'], - freq='4H', name='x') + exp = TimedeltaIndex(["4H", "8H", "12H", "16H", "20H"], freq="4H", name="x") tm.assert_index_equal(result, exp) - assert result.freq == '4H' + assert result.freq == "4H" for result in [idx / 2, np.divide(idx, 2)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['1H', '2H', '3H', '4H', '5H'], - freq='H', name='x') + exp = TimedeltaIndex(["1H", "2H", "3H", "4H", "5H"], freq="H", name="x") tm.assert_index_equal(result, exp) - assert result.freq == 'H' + assert result.freq == "H" - idx = TimedeltaIndex(['2H', '4H', '6H', '8H', '10H'], - freq='2H', name='x') + idx = TimedeltaIndex(["2H", "4H", "6H", "8H", "10H"], freq="2H", name="x") for result in [-idx, np.negative(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['-2H', '-4H', '-6H', '-8H', '-10H'], - freq='-2H', name='x') + exp = TimedeltaIndex( + ["-2H", "-4H", "-6H", "-8H", "-10H"], freq="-2H", name="x" + ) tm.assert_index_equal(result, exp) - assert result.freq == '-2H' + assert result.freq == "-2H" - idx = TimedeltaIndex(['-2H', '-1H', '0H', '1H', '2H'], - freq='H', name='x') + idx = TimedeltaIndex(["-2H", "-1H", "0H", "1H", "2H"], freq="H", name="x") for result in [abs(idx), np.absolute(idx)]: assert isinstance(result, TimedeltaIndex) - exp = TimedeltaIndex(['2H', '1H', '0H', '1H', '2H'], - freq=None, name='x') + exp = TimedeltaIndex(["2H", "1H", "0H", "1H", "2H"], freq=None, name="x") tm.assert_index_equal(result, exp) assert result.freq is None def test_subtraction_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") msg = "cannot subtract a datelike from a TimedeltaArray" with pytest.raises(TypeError, match=msg): @@ -235,8 +253,10 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): tdi - dti - msg = (r"descriptor '__sub__' requires a 'datetime\.datetime' object" - " but received a 'Timedelta'") + msg = ( + r"descriptor '__sub__' requires a 'datetime\.datetime' object" + " but received a 'Timedelta'" + ) with pytest.raises(TypeError, match=msg): td - dt @@ -245,41 +265,40 @@ def test_subtraction_ops(self): td - dti result = dt - dti - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days'], name='bar') + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"], name="bar") tm.assert_index_equal(result, expected) result = dti - dt - expected = TimedeltaIndex(['0 days', '1 days', '2 days'], name='bar') + expected = TimedeltaIndex(["0 days", "1 days", "2 days"], name="bar") tm.assert_index_equal(result, expected) result = tdi - td - expected = TimedeltaIndex(['0 days', pd.NaT, '1 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "1 days"], name="foo") tm.assert_index_equal(result, expected, check_names=False) result = td - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '-1 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "-1 days"], name="foo") tm.assert_index_equal(result, expected, check_names=False) result = dti - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], name='bar') + expected = DatetimeIndex(["20121231", "20130101", "20130102"], name="bar") tm.assert_index_equal(result, expected, check_names=False) 
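None of the behaviour changes in these hunks, only the line wrapping; the subtraction semantics they assert are ordinary pandas datetime/timedelta arithmetic. A minimal standalone sketch of those semantics, written against the 0.25-era API used in this file (illustrative only, not part of the patch):

# --- illustrative sketch, not part of the patch ---
import pandas as pd
from pandas import Timedelta, TimedeltaIndex, Timestamp

# Timestamp minus DatetimeIndex yields a TimedeltaIndex; Timedelta arithmetic
# on a TimedeltaIndex stays timedelta64[ns] and propagates NaT.
dti = pd.date_range("20130101", periods=3, name="bar")
tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo")

assert (dti - Timestamp("20130101")).equals(
    TimedeltaIndex(["0 days", "1 days", "2 days"])
)
assert (tdi - Timedelta("1 days"))[0] == Timedelta("0 days")
assert pd.isna((tdi - Timedelta("1 days"))[1])
# --- end sketch ---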
result = dt - tdi - expected = DatetimeIndex(['20121231', pd.NaT, '20121230'], name='foo') + expected = DatetimeIndex(["20121231", pd.NaT, "20121230"], name="foo") tm.assert_index_equal(result, expected) def test_subtraction_ops_with_tz(self): # check that dt/dti subtraction ops with tz are validated - dti = pd.date_range('20130101', periods=3) - ts = Timestamp('20130101') + dti = pd.date_range("20130101", periods=3) + ts = Timestamp("20130101") dt = ts.to_pydatetime() - dti_tz = pd.date_range('20130101', periods=3).tz_localize('US/Eastern') - ts_tz = Timestamp('20130101').tz_localize('US/Eastern') - ts_tz2 = Timestamp('20130101').tz_localize('CET') + dti_tz = pd.date_range("20130101", periods=3).tz_localize("US/Eastern") + ts_tz = Timestamp("20130101").tz_localize("US/Eastern") + ts_tz2 = Timestamp("20130101").tz_localize("CET") dt_tz = ts_tz.to_pydatetime() - td = Timedelta('1 days') + td = Timedelta("1 days") def _check(result, expected): assert result == expected @@ -287,34 +306,31 @@ def _check(result, expected): # scalars result = ts - ts - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = dt_tz - ts_tz - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = ts_tz - dt_tz - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) # tz mismatches - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt_tz - dt - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): dt_tz - ts_tz2 msg = "can't subtract offset-naive and offset-aware datetimes" with pytest.raises(TypeError, match=msg): dt - dt_tz - msg = ("Timestamp subtraction must have the same timezones or no" - " timezones") + msg = "Timestamp subtraction must have the same timezones or no" " timezones" with pytest.raises(TypeError, match=msg): ts - dt_tz with pytest.raises(TypeError, match=msg): @@ -333,72 +349,71 @@ def _check(result, expected): dti_tz - ts_tz2 result = dti_tz - dt_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) tm.assert_index_equal(result, expected) result = dt_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) tm.assert_index_equal(result, expected) result = dti_tz - ts_tz - expected = TimedeltaIndex(['0 days', '1 days', '2 days']) + expected = TimedeltaIndex(["0 days", "1 days", "2 days"]) tm.assert_index_equal(result, expected) result = ts_tz - dti_tz - expected = TimedeltaIndex(['0 days', '-1 days', '-2 days']) + expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"]) tm.assert_index_equal(result, expected) result = td - td - expected = Timedelta('0 days') + expected = Timedelta("0 days") _check(result, expected) result = dti_tz - td - expected = DatetimeIndex( - ['20121231', '20130101', '20130102'], tz='US/Eastern') + expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern") tm.assert_index_equal(result, expected) def test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops 
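The timezone hunks above repeat one error message; the rule under test is that Timestamp subtraction needs both operands tz-naive or both carrying the same timezone. A short sketch of that rule, mirroring the values used in the test and kept outside the patch:

# --- illustrative sketch, not part of the patch ---
import pytest
from pandas import Timedelta, Timestamp

ts = Timestamp("20130101")                               # tz-naive
ts_tz = Timestamp("20130101").tz_localize("US/Eastern")  # tz-aware
ts_tz2 = Timestamp("20130101").tz_localize("CET")        # different zone
dt_tz = ts_tz.to_pydatetime()

# Same instant, same zone: the difference is zero.
assert dt_tz - ts_tz == Timedelta("0 days")

# Aware minus naive, or mismatched zones: rejected with a TypeError.
with pytest.raises(TypeError):
    dt_tz - ts
with pytest.raises(TypeError):
    dt_tz - ts_tz2
# --- end sketch ---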
- tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") # TODO(wesm): unused? # td = Timedelta('1 days') # dt = Timestamp('20130101') result = tdi - tdi - expected = TimedeltaIndex(['0 days', pd.NaT, '0 days'], name='foo') + expected = TimedeltaIndex(["0 days", pd.NaT, "0 days"], name="foo") tm.assert_index_equal(result, expected) result = tdi + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '4 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "4 days"], name="foo") tm.assert_index_equal(result, expected) result = dti - tdi # name will be reset - expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) + expected = DatetimeIndex(["20121231", pd.NaT, "20130101"]) tm.assert_index_equal(result, expected) def test_addition_ops(self): # with datetimes/timedelta and tdi/dti - tdi = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - dti = pd.date_range('20130101', periods=3, name='bar') - td = Timedelta('1 days') - dt = Timestamp('20130101') + tdi = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + dti = pd.date_range("20130101", periods=3, name="bar") + td = Timedelta("1 days") + dt = Timestamp("20130101") result = tdi + dt - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") tm.assert_index_equal(result, expected) result = dt + tdi - expected = DatetimeIndex(['20130102', pd.NaT, '20130103'], name='foo') + expected = DatetimeIndex(["20130102", pd.NaT, "20130103"], name="foo") tm.assert_index_equal(result, expected) result = td + tdi - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") tm.assert_index_equal(result, expected) result = tdi + td - expected = TimedeltaIndex(['2 days', pd.NaT, '3 days'], name='foo') + expected = TimedeltaIndex(["2 days", pd.NaT, "3 days"], name="foo") tm.assert_index_equal(result, expected) # unequal length @@ -416,32 +431,32 @@ def test_addition_ops(self): # pytest.raises(TypeError, lambda : Int64Index([1,2,3]) + tdi) result = tdi + dti # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) tm.assert_index_equal(result, expected) result = dti + tdi # name will be reset - expected = DatetimeIndex(['20130102', pd.NaT, '20130105']) + expected = DatetimeIndex(["20130102", pd.NaT, "20130105"]) tm.assert_index_equal(result, expected) result = dt + td - expected = Timestamp('20130102') + expected = Timestamp("20130102") assert result == expected result = td + dt - expected = Timestamp('20130102') + expected = Timestamp("20130102") assert result == expected # TODO: Needs more informative name, probably split up into # more targeted tests - @pytest.mark.parametrize('freq', ['D', 'B']) + @pytest.mark.parametrize("freq", ["D", "B"]) def test_timedelta(self, freq): - index = pd.date_range('1/1/2000', periods=50, freq=freq) + index = pd.date_range("1/1/2000", periods=50, freq=freq) shifted = index + timedelta(1) back = shifted + timedelta(-1) tm.assert_index_equal(index, back) - if freq == 'D': + if freq == "D": expected = pd.tseries.offsets.Day(1) assert index.freq == expected assert shifted.freq == expected @@ -456,7 +471,7 @@ def test_timedelta(self, freq): tm.assert_index_equal(result, 
expected) # GH#4134, buggy with timedeltas - rng = pd.date_range('2013', '2014') + rng = pd.date_range("2013", "2014") s = Series(rng) result1 = rng - pd.offsets.Hour(1) result2 = DatetimeIndex(s - np.timedelta64(100000000)) @@ -471,19 +486,23 @@ class TestAddSubNaTMasking: def test_tdi_add_timestamp_nat_masking(self): # GH#17991 checking for overflow-masking with NaT - tdinat = pd.to_timedelta(['24658 days 11:15:00', 'NaT']) - - tsneg = Timestamp('1950-01-01') - ts_neg_variants = [tsneg, - tsneg.to_pydatetime(), - tsneg.to_datetime64().astype('datetime64[ns]'), - tsneg.to_datetime64().astype('datetime64[D]')] - - tspos = Timestamp('1980-01-01') - ts_pos_variants = [tspos, - tspos.to_pydatetime(), - tspos.to_datetime64().astype('datetime64[ns]'), - tspos.to_datetime64().astype('datetime64[D]')] + tdinat = pd.to_timedelta(["24658 days 11:15:00", "NaT"]) + + tsneg = Timestamp("1950-01-01") + ts_neg_variants = [ + tsneg, + tsneg.to_pydatetime(), + tsneg.to_datetime64().astype("datetime64[ns]"), + tsneg.to_datetime64().astype("datetime64[D]"), + ] + + tspos = Timestamp("1980-01-01") + ts_pos_variants = [ + tspos, + tspos.to_pydatetime(), + tspos.to_datetime64().astype("datetime64[ns]"), + tspos.to_datetime64().astype("datetime64[D]"), + ] for variant in ts_neg_variants + ts_pos_variants: res = tdinat + variant @@ -493,36 +512,39 @@ def test_tdi_add_overflow(self): # See GH#14068 # preliminary test scalar analogue of vectorized tests below with pytest.raises(OutOfBoundsDatetime): - pd.to_timedelta(106580, 'D') + Timestamp('2000') + pd.to_timedelta(106580, "D") + Timestamp("2000") with pytest.raises(OutOfBoundsDatetime): - Timestamp('2000') + pd.to_timedelta(106580, 'D') + Timestamp("2000") + pd.to_timedelta(106580, "D") _NaT = int(pd.NaT) + 1 msg = "Overflow in int64 addition" with pytest.raises(OverflowError, match=msg): - pd.to_timedelta([106580], 'D') + Timestamp('2000') + pd.to_timedelta([106580], "D") + Timestamp("2000") with pytest.raises(OverflowError, match=msg): - Timestamp('2000') + pd.to_timedelta([106580], 'D') + Timestamp("2000") + pd.to_timedelta([106580], "D") with pytest.raises(OverflowError, match=msg): - pd.to_timedelta([_NaT]) - Timedelta('1 days') + pd.to_timedelta([_NaT]) - Timedelta("1 days") with pytest.raises(OverflowError, match=msg): - pd.to_timedelta(['5 days', _NaT]) - Timedelta('1 days') + pd.to_timedelta(["5 days", _NaT]) - Timedelta("1 days") with pytest.raises(OverflowError, match=msg): - (pd.to_timedelta([_NaT, '5 days', '1 hours']) - - pd.to_timedelta(['7 seconds', _NaT, '4 hours'])) + ( + pd.to_timedelta([_NaT, "5 days", "1 hours"]) + - pd.to_timedelta(["7 seconds", _NaT, "4 hours"]) + ) # These should not overflow! 
exp = TimedeltaIndex([pd.NaT]) - result = pd.to_timedelta([pd.NaT]) - Timedelta('1 days') + result = pd.to_timedelta([pd.NaT]) - Timedelta("1 days") tm.assert_index_equal(result, exp) - exp = TimedeltaIndex(['4 days', pd.NaT]) - result = pd.to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + exp = TimedeltaIndex(["4 days", pd.NaT]) + result = pd.to_timedelta(["5 days", pd.NaT]) - Timedelta("1 days") tm.assert_index_equal(result, exp) - exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) - result = (pd.to_timedelta([pd.NaT, '5 days', '1 hours']) + - pd.to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + exp = TimedeltaIndex([pd.NaT, pd.NaT, "5 hours"]) + result = pd.to_timedelta([pd.NaT, "5 days", "1 hours"]) + pd.to_timedelta( + ["7 seconds", pd.NaT, "4 hours"] + ) tm.assert_index_equal(result, exp) @@ -533,7 +555,7 @@ class TestTimedeltaArraylikeAddSubOps: def test_td64_df_add_int_frame(self): # GH#22696 Check that we don't dispatch to numpy implementation, # which treats int64 as m8[ns] - tdi = pd.timedelta_range('1', periods=3) + tdi = pd.timedelta_range("1", periods=3) df = tdi.to_frame() other = pd.DataFrame([1, 2, 3], index=tdi) # indexed like `df` with pytest.raises(TypeError): @@ -549,21 +571,21 @@ def test_td64_df_add_int_frame(self): # parametrization+de-duplication def test_timedelta_ops_with_missing_values(self): # setup - s1 = pd.to_timedelta(Series(['00:00:01'])) - s2 = pd.to_timedelta(Series(['00:00:02'])) + s1 = pd.to_timedelta(Series(["00:00:01"])) + s2 = pd.to_timedelta(Series(["00:00:02"])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # Passing datetime64-dtype data to TimedeltaIndex is deprecated sn = pd.to_timedelta(Series([pd.NaT])) - df1 = pd.DataFrame(['00:00:01']).apply(pd.to_timedelta) - df2 = pd.DataFrame(['00:00:02']).apply(pd.to_timedelta) + df1 = pd.DataFrame(["00:00:01"]).apply(pd.to_timedelta) + df2 = pd.DataFrame(["00:00:02"]).apply(pd.to_timedelta) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # Passing datetime64-dtype data to TimedeltaIndex is deprecated dfn = pd.DataFrame([pd.NaT]).apply(pd.to_timedelta) - scalar1 = pd.to_timedelta('00:00:01') - scalar2 = pd.to_timedelta('00:00:02') - timedelta_NaT = pd.to_timedelta('NaT') + scalar1 = pd.to_timedelta("00:00:01") + scalar2 = pd.to_timedelta("00:00:02") + timedelta_NaT = pd.to_timedelta("NaT") actual = scalar1 + scalar1 assert actual == scalar2 @@ -645,67 +667,64 @@ def test_timedelta_ops_with_missing_values(self): # de-duplication, box-parametrization... 
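For orientation, the to_timedelta hunks above all probe the same boundary: arithmetic that would leave the datetime64[ns]/timedelta64[ns] range raises, while NaT elements simply propagate. A compact sketch of that behaviour, written against the same pandas era and not part of the patch:

# --- illustrative sketch, not part of the patch ---
import pandas as pd
import pytest
from pandas import Timedelta, Timestamp
from pandas.errors import OutOfBoundsDatetime

# 106580 days after 2000-01-01 lies beyond the ns-resolution limit (~2262-04-11).
with pytest.raises(OutOfBoundsDatetime):
    Timestamp("2000") + pd.to_timedelta(106580, "D")

# NaT does not overflow; it stays NaT through the subtraction.
result = pd.to_timedelta([pd.NaT, "5 days"]) - Timedelta("1 days")
assert result.isna()[0] and result[1] == Timedelta("4 days")
# --- end sketch ---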
def test_operators_timedelta64(self): # series ops - v1 = pd.date_range('2012-1-1', periods=3, freq='D') - v2 = pd.date_range('2012-1-2', periods=3, freq='D') + v1 = pd.date_range("2012-1-1", periods=3, freq="D") + v2 = pd.date_range("2012-1-2", periods=3, freq="D") rs = Series(v2) - Series(v1) - xp = Series(1e9 * 3600 * 24, - rs.index).astype('int64').astype('timedelta64[ns]') + xp = Series(1e9 * 3600 * 24, rs.index).astype("int64").astype("timedelta64[ns]") tm.assert_series_equal(rs, xp) - assert rs.dtype == 'timedelta64[ns]' + assert rs.dtype == "timedelta64[ns]" df = DataFrame(dict(A=v1)) td = Series([timedelta(days=i) for i in range(3)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # series on the rhs - result = df['A'] - df['A'].shift() - assert result.dtype == 'timedelta64[ns]' + result = df["A"] - df["A"].shift() + assert result.dtype == "timedelta64[ns]" - result = df['A'] + td - assert result.dtype == 'M8[ns]' + result = df["A"] + td + assert result.dtype == "M8[ns]" # scalar Timestamp on rhs - maxa = df['A'].max() + maxa = df["A"].max() assert isinstance(maxa, Timestamp) - resultb = df['A'] - df['A'].max() - assert resultb.dtype == 'timedelta64[ns]' + resultb = df["A"] - df["A"].max() + assert resultb.dtype == "timedelta64[ns]" # timestamp on lhs - result = resultb + df['A'] - values = [Timestamp('20111230'), Timestamp('20120101'), - Timestamp('20120103')] - expected = Series(values, name='A') + result = resultb + df["A"] + values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")] + expected = Series(values, name="A") tm.assert_series_equal(result, expected) # datetimes on rhs - result = df['A'] - datetime(2001, 1, 1) - expected = Series( - [timedelta(days=4017 + i) for i in range(3)], name='A') + result = df["A"] - datetime(2001, 1, 1) + expected = Series([timedelta(days=4017 + i) for i in range(3)], name="A") tm.assert_series_equal(result, expected) - assert result.dtype == 'm8[ns]' + assert result.dtype == "m8[ns]" d = datetime(2001, 1, 1, 3, 4) - resulta = df['A'] - d - assert resulta.dtype == 'm8[ns]' + resulta = df["A"] - d + assert resulta.dtype == "m8[ns]" # roundtrip resultb = resulta + d - tm.assert_series_equal(df['A'], resultb) + tm.assert_series_equal(df["A"], resultb) # timedeltas on rhs td = timedelta(days=1) - resulta = df['A'] + td + resulta = df["A"] + td resultb = resulta - td - tm.assert_series_equal(resultb, df['A']) - assert resultb.dtype == 'M8[ns]' + tm.assert_series_equal(resultb, df["A"]) + assert resultb.dtype == "M8[ns]" # roundtrip td = timedelta(minutes=5, seconds=3) - resulta = df['A'] + td + resulta = df["A"] + td resultb = resulta - td - tm.assert_series_equal(df['A'], resultb) - assert resultb.dtype == 'M8[ns]' + tm.assert_series_equal(df["A"], resultb) + assert resultb.dtype == "M8[ns]" # inplace value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1)) @@ -714,100 +733,102 @@ def test_operators_timedelta64(self): def test_timedelta64_ops_nat(self): # GH 11349 - timedelta_series = Series([NaT, Timedelta('1s')]) - nat_series_dtype_timedelta = Series([NaT, NaT], - dtype='timedelta64[ns]') - single_nat_dtype_timedelta = Series([NaT], dtype='timedelta64[ns]') + timedelta_series = Series([NaT, Timedelta("1s")]) + nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]") + single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]") # subtraction - tm.assert_series_equal(timedelta_series - NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(-NaT + 
timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series - NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(-NaT + timedelta_series, nat_series_dtype_timedelta) - tm.assert_series_equal(timedelta_series - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(-single_nat_dtype_timedelta + timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal( + timedelta_series - single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + -single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) # addition - tm.assert_series_equal(nat_series_dtype_timedelta + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(timedelta_series + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + timedelta_series, - nat_series_dtype_timedelta) - - tm.assert_series_equal(timedelta_series + single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + timedelta_series, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + NaT, - nat_series_dtype_timedelta) - tm.assert_series_equal(NaT + nat_series_dtype_timedelta, - nat_series_dtype_timedelta) - - tm.assert_series_equal(nat_series_dtype_timedelta + - single_nat_dtype_timedelta, - nat_series_dtype_timedelta) - tm.assert_series_equal(single_nat_dtype_timedelta + - nat_series_dtype_timedelta, - nat_series_dtype_timedelta) + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) + + tm.assert_series_equal(timedelta_series + NaT, nat_series_dtype_timedelta) + tm.assert_series_equal(NaT + timedelta_series, nat_series_dtype_timedelta) + + tm.assert_series_equal( + timedelta_series + single_nat_dtype_timedelta, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) + + tm.assert_series_equal( + nat_series_dtype_timedelta + single_nat_dtype_timedelta, + nat_series_dtype_timedelta, + ) + tm.assert_series_equal( + single_nat_dtype_timedelta + nat_series_dtype_timedelta, + nat_series_dtype_timedelta, + ) # multiplication - tm.assert_series_equal(nat_series_dtype_timedelta * 1.0, - nat_series_dtype_timedelta) - tm.assert_series_equal(1.0 * nat_series_dtype_timedelta, - nat_series_dtype_timedelta) + tm.assert_series_equal( + nat_series_dtype_timedelta * 1.0, nat_series_dtype_timedelta + ) + tm.assert_series_equal( + 1.0 * nat_series_dtype_timedelta, nat_series_dtype_timedelta + ) tm.assert_series_equal(timedelta_series * 1, timedelta_series) tm.assert_series_equal(1 * timedelta_series, timedelta_series) - 
tm.assert_series_equal(timedelta_series * 1.5, - Series([NaT, Timedelta('1.5s')])) - tm.assert_series_equal(1.5 * timedelta_series, - Series([NaT, Timedelta('1.5s')])) + tm.assert_series_equal(timedelta_series * 1.5, Series([NaT, Timedelta("1.5s")])) + tm.assert_series_equal(1.5 * timedelta_series, Series([NaT, Timedelta("1.5s")])) - tm.assert_series_equal(timedelta_series * np.nan, - nat_series_dtype_timedelta) - tm.assert_series_equal(np.nan * timedelta_series, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series * np.nan, nat_series_dtype_timedelta) + tm.assert_series_equal(np.nan * timedelta_series, nat_series_dtype_timedelta) # division - tm.assert_series_equal(timedelta_series / 2, - Series([NaT, Timedelta('0.5s')])) - tm.assert_series_equal(timedelta_series / 2.0, - Series([NaT, Timedelta('0.5s')])) - tm.assert_series_equal(timedelta_series / np.nan, - nat_series_dtype_timedelta) + tm.assert_series_equal(timedelta_series / 2, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")])) + tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta) # ------------------------------------------------------------- # Invalid Operations def test_td64arr_add_str_invalid(self, box_with_array): # GH#13624 - tdi = TimedeltaIndex(['1 day', '2 days']) + tdi = TimedeltaIndex(["1 day", "2 days"]) tdi = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): - tdi + 'a' + tdi + "a" with pytest.raises(TypeError): - 'a' + tdi + "a" + tdi - @pytest.mark.parametrize('other', [3.14, np.array([2.0, 3.0])]) + @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) def test_td64arr_add_sub_float(self, box_with_array, other): - tdi = TimedeltaIndex(['-1 days', '-1 days']) + tdi = TimedeltaIndex(["-1 days", "-1 days"]) tdarr = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): @@ -819,12 +840,12 @@ def test_td64arr_add_sub_float(self, box_with_array, other): with pytest.raises(TypeError): other - tdarr - @pytest.mark.parametrize('freq', [None, 'H']) + @pytest.mark.parametrize("freq", [None, "H"]) def test_td64arr_sub_period(self, box_with_array, freq): # GH#13078 # not supported, check TypeError - p = pd.Period('2011-01-01', freq='D') - idx = TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + p = pd.Period("2011-01-01", freq="D") + idx = TimedeltaIndex(["1 hours", "2 hours"], freq=freq) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): @@ -833,12 +854,12 @@ def test_td64arr_sub_period(self, box_with_array, freq): with pytest.raises(TypeError): p - idx - @pytest.mark.parametrize('pi_freq', ['D', 'W', 'Q', 'H']) - @pytest.mark.parametrize('tdi_freq', [None, 'H']) + @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "H"]) + @pytest.mark.parametrize("tdi_freq", [None, "H"]) def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq): # GH#20049 subtracting PeriodIndex should raise TypeError - tdi = TimedeltaIndex(['1 hours', '2 hours'], freq=tdi_freq) - dti = Timestamp('2018-03-07 17:16:40') + tdi + tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) + dti = Timestamp("2018-03-07 17:16:40") + tdi pi = dti.to_period(pi_freq) # TODO: parametrize over box for pi? 
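The "Invalid Operations" hunks being rewrapped here all follow one pattern: box the timedelta data, then assert that an unsupported operand raises TypeError. A condensed sketch of that pattern on the unboxed index, outside the patch:

# --- illustrative sketch, not part of the patch ---
import pandas as pd
import pytest
from pandas import TimedeltaIndex

tdi = TimedeltaIndex(["1 hours", "2 hours"])
p = pd.Period("2011-01-01", freq="D")

with pytest.raises(TypeError):
    tdi - p        # Period is not a valid operand for timedelta64 subtraction
with pytest.raises(TypeError):
    p - tdi
with pytest.raises(TypeError):
    tdi + "a"      # strings are rejected as well (GH#13624)
# --- end sketch ---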
@@ -850,24 +871,26 @@ def test_td64arr_sub_pi(self, box_with_array, tdi_freq, pi_freq): # Binary operations td64 arraylike and datetime-like def test_td64arr_sub_timestamp_raises(self, box_with_array): - idx = TimedeltaIndex(['1 day', '2 day']) + idx = TimedeltaIndex(["1 day", "2 day"]) idx = tm.box_expected(idx, box_with_array) - msg = ("cannot subtract a datelike from|" - "Could not operate|" - "cannot perform operation") + msg = ( + "cannot subtract a datelike from|" + "Could not operate|" + "cannot perform operation" + ) with pytest.raises(TypeError, match=msg): - idx - Timestamp('2011-01-01') + idx - Timestamp("2011-01-01") def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): # GH#23215 # TODO: parametrize over scalar datetime types? tz = tz_naive_fixture - other = Timestamp('2011-01-01', tz=tz) + other = Timestamp("2011-01-01", tz=tz) - idx = TimedeltaIndex(['1 day', '2 day']) - expected = DatetimeIndex(['2011-01-02', '2011-01-03'], tz=tz) + idx = TimedeltaIndex(["1 day", "2 day"]) + expected = DatetimeIndex(["2011-01-02", "2011-01-03"], tz=tz) idx = tm.box_expected(idx, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -880,11 +903,11 @@ def test_td64arr_add_timestamp(self, box_with_array, tz_naive_fixture): def test_td64arr_add_sub_timestamp(self, box_with_array): # GH#11925 - ts = Timestamp('2012-01-01') + ts = Timestamp("2012-01-01") # TODO: parametrize over types of datetime scalar? - tdi = timedelta_range('1 day', periods=3) - expected = pd.date_range('2012-01-02', periods=3) + tdi = timedelta_range("1 day", periods=3) + expected = pd.date_range("2012-01-02", periods=3) tdarr = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -892,7 +915,7 @@ def test_td64arr_add_sub_timestamp(self, box_with_array): tm.assert_equal(ts + tdarr, expected) tm.assert_equal(tdarr + ts, expected) - expected2 = pd.date_range('2011-12-31', periods=3, freq='-1D') + expected2 = pd.date_range("2011-12-31", periods=3, freq="-1D") expected2 = tm.box_expected(expected2, box_with_array) tm.assert_equal(ts - tdarr, expected2) @@ -902,7 +925,7 @@ def test_td64arr_add_sub_timestamp(self, box_with_array): tdarr - ts def test_tdi_sub_dt64_array(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) dtarr = dti.values expected = pd.DatetimeIndex(dtarr) - tdi @@ -918,7 +941,7 @@ def test_tdi_sub_dt64_array(self, box_with_array): tm.assert_equal(result, expected) def test_tdi_add_dt64_array(self, box_with_array): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) dtarr = dti.values expected = pd.DatetimeIndex(dtarr) + tdi @@ -933,9 +956,9 @@ def test_tdi_add_dt64_array(self, box_with_array): def test_td64arr_add_datetime64_nat(self, box_with_array): # GH#23215 - other = np.datetime64('NaT') + other = np.datetime64("NaT") - tdi = timedelta_range('1 day', periods=3) + tdi = timedelta_range("1 day", periods=3) expected = pd.DatetimeIndex(["NaT", "NaT", "NaT"]) tdser = tm.box_expected(tdi, box_with_array) @@ -948,7 +971,7 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): # Operations with int-like others def test_td64arr_add_int_series_invalid(self, box): - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError if box is not pd.Index else 
NullFrequencyError int_ser = Series([2, 3, 4]) @@ -964,14 +987,14 @@ def test_td64arr_add_int_series_invalid(self, box): def test_td64arr_add_intlike(self, box_with_array): # GH#19123 - tdi = TimedeltaIndex(['59 days', '59 days', 'NaT']) + tdi = TimedeltaIndex(["59 days", "59 days", "NaT"]) ser = tm.box_expected(tdi, box_with_array) err = TypeError if box_with_array in [pd.Index, tm.to_array]: err = NullFrequencyError - other = Series([20, 30, 40], dtype='uint8') + other = Series([20, 30, 40], dtype="uint8") # TODO: separate/parametrize with pytest.raises(err): @@ -994,12 +1017,11 @@ def test_td64arr_add_intlike(self, box_with_array): with pytest.raises(err): ser - pd.Index(other) - @pytest.mark.parametrize('scalar', [1, 1.5, np.array(2)]) - def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, - scalar): + @pytest.mark.parametrize("scalar", [1, 1.5, np.array(2)]) + def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): box = box_with_array - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError if box in [pd.Index, tm.to_array] and not isinstance(scalar, float): @@ -1014,20 +1036,36 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, with pytest.raises(err): scalar - tdser - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vec', [ - np.array([1, 2, 3]), - pd.Index([1, 2, 3]), - Series([1, 2, 3]) - # TODO: Add DataFrame in here? - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + @pytest.mark.parametrize( + "vec", + [ + np.array([1, 2, 3]), + pd.Index([1, 2, 3]), + Series([1, 2, 3]) + # TODO: Add DataFrame in here? 
+ ], + ids=lambda x: type(x).__name__, + ) def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError - if box is pd.Index and not dtype.startswith('float'): + if box is pd.Index and not dtype.startswith("float"): err = NullFrequencyError vector = vec.astype(dtype) @@ -1044,9 +1082,14 @@ def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): # Operations with timedelta-like others # TODO: this was taken from tests.series.test_ops; de-duplicate - @pytest.mark.parametrize('scalar_td', [timedelta(minutes=5, seconds=4), - Timedelta(minutes=5, seconds=4), - Timedelta('5m4s').to_timedelta64()]) + @pytest.mark.parametrize( + "scalar_td", + [ + timedelta(minutes=5, seconds=4), + Timedelta(minutes=5, seconds=4), + Timedelta("5m4s").to_timedelta64(), + ], + ) def test_operators_timedelta64_with_timedelta(self, scalar_td): # smoke tests td1 = Series([timedelta(minutes=5, seconds=3)] * 3) @@ -1065,14 +1108,16 @@ def test_timedelta64_operations_with_timedeltas(self): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td2 = timedelta(minutes=5, seconds=4) result = td1 - td2 - expected = (Series([timedelta(seconds=0)] * 3) - - Series([timedelta(seconds=1)] * 3)) - assert result.dtype == 'm8[ns]' + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" tm.assert_series_equal(result, expected) result2 = td2 - td1 - expected = (Series([timedelta(seconds=1)] * 3) - - Series([timedelta(seconds=0)] * 3)) + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) tm.assert_series_equal(result2, expected) # roundtrip @@ -1080,24 +1125,26 @@ def test_timedelta64_operations_with_timedeltas(self): # Now again, using pd.to_timedelta, which should build # a Series or a scalar, depending on input. - td1 = Series(pd.to_timedelta(['00:05:03'] * 3)) - td2 = pd.to_timedelta('00:05:04') + td1 = Series(pd.to_timedelta(["00:05:03"] * 3)) + td2 = pd.to_timedelta("00:05:04") result = td1 - td2 - expected = (Series([timedelta(seconds=0)] * 3) - - Series([timedelta(seconds=1)] * 3)) - assert result.dtype == 'm8[ns]' + expected = Series([timedelta(seconds=0)] * 3) - Series( + [timedelta(seconds=1)] * 3 + ) + assert result.dtype == "m8[ns]" tm.assert_series_equal(result, expected) result2 = td2 - td1 - expected = (Series([timedelta(seconds=1)] * 3) - - Series([timedelta(seconds=0)] * 3)) + expected = Series([timedelta(seconds=1)] * 3) - Series( + [timedelta(seconds=0)] * 3 + ) tm.assert_series_equal(result2, expected) # roundtrip tm.assert_series_equal(result + td2, td1) def test_td64arr_add_td64_array(self, box): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1111,7 +1158,7 @@ def test_td64arr_add_td64_array(self, box): tm.assert_equal(result, expected) def test_td64arr_sub_td64_array(self, box): - dti = pd.date_range('2016-01-01', periods=3) + dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1125,20 +1172,28 @@ def test_td64arr_sub_td64_array(self, box): tm.assert_equal(result, expected) # TODO: parametrize over [add, sub, radd, rsub]? 
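Most of the churn in the hunks that follow comes from one black convention: a call that no longer fits on one line gets every argument on its own line, double quotes, and a trailing "magic" comma. A before/after sketch of the decorator style used throughout this file; the dummy function names are placeholders, not names from the patch:

# --- illustrative sketch, not part of the patch ---
import pytest

# Pre-black style (as on the removed lines): hand-wrapped, aligned continuations.
@pytest.mark.parametrize('names', [(None, None, None),
                                   ('Egon', 'Venkman', None),
                                   ('NCC1701D', 'NCC1701D', 'NCC1701D')])
def check_names_before(names):
    return names

# Post-black style: one element per line, double quotes, trailing comma.
@pytest.mark.parametrize(
    "names",
    [
        (None, None, None),
        ("Egon", "Venkman", None),
        ("NCC1701D", "NCC1701D", "NCC1701D"),
    ],
)
def check_names_after(names):
    return names
# --- end sketch ---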
- @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def test_td64arr_add_sub_tdi(self, box, names): # GH#17250 make sure result dtype is correct # GH#19043 make sure names are propagated correctly - if box is pd.DataFrame and names[1] == 'Venkman': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") + if box is pd.DataFrame and names[1] == "Venkman": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) - tdi = TimedeltaIndex(['0 days', '1 day'], name=names[0]) + tdi = TimedeltaIndex(["0 days", "1 day"], name=names[0]) ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) - expected = Series([Timedelta(hours=3), Timedelta(days=1, hours=4)], - name=names[2]) + expected = Series( + [Timedelta(hours=3), Timedelta(days=1, hours=4)], name=names[2] + ) ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1146,38 +1201,39 @@ def test_td64arr_add_sub_tdi(self, box, names): result = tdi + ser tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" result = ser + tdi tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" - expected = Series([Timedelta(hours=-3), Timedelta(days=1, hours=-4)], - name=names[2]) + expected = Series( + [Timedelta(hours=-3), Timedelta(days=1, hours=-4)], name=names[2] + ) expected = tm.box_expected(expected, box) result = tdi - ser tm.assert_equal(result, expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" result = ser - tdi tm.assert_equal(result, -expected) if box is not pd.DataFrame: - assert result.dtype == 'timedelta64[ns]' + assert result.dtype == "timedelta64[ns]" else: - assert result.dtypes[0] == 'timedelta64[ns]' + assert result.dtypes[0] == "timedelta64[ns]" def test_td64arr_add_sub_td64_nat(self, box): # GH#23320 special handling for timedelta64("NaT") - tdi = pd.TimedeltaIndex([NaT, Timedelta('1s')]) + tdi = pd.TimedeltaIndex([NaT, Timedelta("1s")]) other = np.timedelta64("NaT") expected = pd.TimedeltaIndex(["NaT"] * 2) @@ -1195,8 +1251,8 @@ def test_td64arr_add_sub_td64_nat(self, box): def test_td64arr_sub_NaT(self, box): # GH#18808 - ser = Series([NaT, Timedelta('1s')]) - expected = Series([NaT, NaT], dtype='timedelta64[ns]') + ser = Series([NaT, Timedelta("1s")]) + expected = Series([NaT, NaT], dtype="timedelta64[ns]") ser = tm.box_expected(ser, box) expected = tm.box_expected(expected, box) @@ -1206,9 +1262,8 @@ def test_td64arr_sub_NaT(self, box): def test_td64arr_add_timedeltalike(self, two_hours, box): # only test adding/sub offsets as + is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 
02:00:00", "10 days 02:00:00", freq="D") rng = tm.box_expected(rng, box) expected = tm.box_expected(expected, box) @@ -1217,8 +1272,8 @@ def test_td64arr_add_timedeltalike(self, two_hours, box): def test_td64arr_sub_timedeltalike(self, two_hours, box): # only test adding/sub offsets as - is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") rng = tm.box_expected(rng, box) expected = tm.box_expected(expected, box) @@ -1242,11 +1297,16 @@ def test_timedelta64_operations_with_DateOffset(self): tm.assert_series_equal(result, expected) with tm.assert_produces_warning(PerformanceWarning): - result = td + Series([pd.offsets.Minute(1), pd.offsets.Second(3), - pd.offsets.Hour(2)]) - expected = Series([timedelta(minutes=6, seconds=3), - timedelta(minutes=5, seconds=6), - timedelta(hours=2, minutes=5, seconds=3)]) + result = td + Series( + [pd.offsets.Minute(1), pd.offsets.Second(3), pd.offsets.Hour(2)] + ) + expected = Series( + [ + timedelta(minutes=6, seconds=3), + timedelta(minutes=5, seconds=6), + timedelta(hours=2, minutes=5, seconds=3), + ] + ) tm.assert_series_equal(result, expected) result = td + pd.offsets.Minute(1) + pd.offsets.Second(12) @@ -1254,30 +1314,30 @@ def test_timedelta64_operations_with_DateOffset(self): tm.assert_series_equal(result, expected) # valid DateOffsets - for do in ['Hour', 'Minute', 'Second', 'Day', 'Micro', 'Milli', - 'Nano']: + for do in ["Hour", "Minute", "Second", "Day", "Micro", "Milli", "Nano"]: op = getattr(pd.offsets, do) td + op(5) op(5) + td td - op(5) op(5) - td - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_add_offset_index(self, names, box): # GH#18849, GH#19744 - if box is pd.DataFrame and names[1] == 'bar': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") - - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) - - expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], - freq='infer', name=names[2]) + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) + + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) + + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=names[2] + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1296,11 +1356,12 @@ def test_td64arr_add_offset_index(self, names, box): # over second box? 
def test_td64arr_add_offset_array(self, box): # GH#18849 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) - expected = TimedeltaIndex([tdi[n] + other[n] for n in range(len(tdi))], - freq='infer') + expected = TimedeltaIndex( + [tdi[n] + other[n] for n in range(len(tdi))], freq="infer" + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1316,22 +1377,23 @@ def test_td64arr_add_offset_array(self, box): res2 = other + tdi tm.assert_equal(res2, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_sub_offset_index(self, names, box): # GH#18824, GH#19744 - if box is pd.DataFrame and names[1] == 'bar': - pytest.skip("Name propagation for DataFrame does not behave like " - "it does for Index/Series") + if box is pd.DataFrame and names[1] == "bar": + pytest.skip( + "Name propagation for DataFrame does not behave like " + "it does for Index/Series" + ) - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], - freq='infer', name=names[2]) + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=names[2] + ) tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, box) @@ -1345,11 +1407,12 @@ def test_td64arr_sub_offset_index(self, names, box): def test_td64arr_sub_offset_array(self, box_with_array): # GH#18824 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) - expected = TimedeltaIndex([tdi[n] - other[n] for n in range(len(tdi))], - freq='infer') + expected = TimedeltaIndex( + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer" + ) tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1361,21 +1424,20 @@ def test_td64arr_sub_offset_array(self, box_with_array): res = tdi - other tm.assert_equal(res, expected) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('foo', 'foo', 'foo')]) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] + ) def test_td64arr_with_offset_series(self, names, box_df_fail): # GH#18849 box = box_df_fail box2 = Series if box in [pd.Index, tm.to_array] else box - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00'], - name=names[0]) - other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], - name=names[1]) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) + other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], - name=names[2]) + expected_add = Series( + [tdi[n] + other[n] for n in range(len(tdi))], name=names[2] + ) tdi = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2) @@ -1388,19 +1450,19 @@ def 
test_td64arr_with_offset_series(self, names, box_df_fail): tm.assert_equal(res2, expected_add) # TODO: separate/parametrize add/sub test? - expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], - name=names[2]) + expected_sub = Series( + [tdi[n] - other[n] for n in range(len(tdi))], name=names[2] + ) expected_sub = tm.box_expected(expected_sub, box2) with tm.assert_produces_warning(PerformanceWarning): res3 = tdi - other tm.assert_equal(res3, expected_sub) - @pytest.mark.parametrize('obox', [np.array, pd.Index, pd.Series]) - def test_td64arr_addsub_anchored_offset_arraylike(self, obox, - box_with_array): + @pytest.mark.parametrize("obox", [np.array, pd.Index, pd.Series]) + def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # GH#18824 - tdi = TimedeltaIndex(['1 days 00:00:00', '3 days 04:00:00']) + tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) tdi = tm.box_expected(tdi, box_with_array) anchored = obox([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) @@ -1427,10 +1489,10 @@ class TestTimedeltaArraylikeMulDivOps: # TODO: Moved from tests.series.test_operators; needs cleanup @pytest.mark.parametrize("m", [1, 3, 10]) - @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_timedelta64_conversions(self, m, unit): - startdate = Series(pd.date_range('2013-01-01', '2013-01-03')) - enddate = Series(pd.date_range('2013-03-01', '2013-03-03')) + startdate = Series(pd.date_range("2013-01-01", "2013-01-03")) + enddate = Series(pd.date_range("2013-03-01", "2013-03-03")) ser = enddate - startdate ser[2] = np.nan @@ -1441,8 +1503,7 @@ def test_timedelta64_conversions(self, m, unit): tm.assert_series_equal(result, expected) # reverse op - expected = Series([Timedelta(np.timedelta64(m, unit)) / x - for x in ser]) + expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in ser]) result = np.timedelta64(m, unit) / ser tm.assert_series_equal(result, expected) @@ -1451,7 +1512,7 @@ def test_timedelta64_conversions(self, m, unit): # organized with scalar others first, then array-like def test_td64arr_mul_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx * 1 @@ -1461,24 +1522,24 @@ def test_td64arr_mul_int(self, box_with_array): tm.assert_equal(result, idx) def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array): - rng = timedelta_range('1 days', '10 days', name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) with pytest.raises(TypeError): rng * two_hours def test_tdi_mul_int_array_zerodim(self, box_with_array): - rng5 = np.arange(5, dtype='int64') + rng5 = np.arange(5, dtype="int64") idx = TimedeltaIndex(rng5) expected = TimedeltaIndex(rng5 * 5) idx = tm.box_expected(idx, box_with_array) expected = tm.box_expected(expected, box_with_array) - result = idx * np.array(5, dtype='int64') + result = idx * np.array(5, dtype="int64") tm.assert_equal(result, expected) def test_tdi_mul_int_array(self, box_with_array): - rng5 = np.arange(5, dtype='int64') + rng5 = np.arange(5, dtype="int64") idx = TimedeltaIndex(rng5) expected = TimedeltaIndex(rng5 ** 2) @@ -1492,23 +1553,23 @@ def test_tdi_mul_int_series(self, box_with_array): box = box_with_array xbox = pd.Series if box in [pd.Index, tm.to_array] else box - idx = TimedeltaIndex(np.arange(5, 
dtype='int64')) - expected = TimedeltaIndex(np.arange(5, dtype='int64') ** 2) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) + expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2) idx = tm.box_expected(idx, box) expected = tm.box_expected(expected, xbox) - result = idx * pd.Series(np.arange(5, dtype='int64')) + result = idx * pd.Series(np.arange(5, dtype="int64")) tm.assert_equal(result, expected) def test_tdi_mul_float_series(self, box_with_array): box = box_with_array xbox = pd.Series if box in [pd.Index, tm.to_array] else box - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box) - rng5f = np.arange(5, dtype='float64') + rng5f = np.arange(5, dtype="float64") expected = TimedeltaIndex(rng5f * (rng5f + 1.0)) expected = tm.box_expected(expected, xbox) @@ -1516,19 +1577,23 @@ def test_tdi_mul_float_series(self, box_with_array): tm.assert_equal(result, expected) # TODO: Put Series/DataFrame in others? - @pytest.mark.parametrize('other', [ - np.arange(1, 11), - pd.Int64Index(range(1, 11)), - pd.UInt64Index(range(1, 11)), - pd.Float64Index(range(1, 11)), - pd.RangeIndex(1, 11) - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [ + np.arange(1, 11), + pd.Int64Index(range(1, 11)), + pd.UInt64Index(range(1, 11)), + pd.Float64Index(range(1, 11)), + pd.RangeIndex(1, 11), + ], + ids=lambda x: type(x).__name__, + ) def test_tdi_rmul_arraylike(self, other, box_with_array): box = box_with_array xbox = get_upcast_box(box, other) - tdi = TimedeltaIndex(['1 Day'] * 10) - expected = timedelta_range('1 days', '10 days') + tdi = TimedeltaIndex(["1 Day"] * 10) + expected = timedelta_range("1 days", "10 days") expected._data.freq = None tdi = tm.box_expected(tdi, box) @@ -1544,21 +1609,20 @@ def test_tdi_rmul_arraylike(self, other, box_with_array): def test_td64arr_div_nat_invalid(self, box_with_array): # don't allow division by NaT (maybe could in the future) - rng = timedelta_range('1 days', '10 days', name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - with pytest.raises(TypeError, - match="'?true_divide'? cannot use operands"): + with pytest.raises(TypeError, match="'?true_divide'? 
cannot use operands"): rng / pd.NaT - with pytest.raises(TypeError, match='Cannot divide NaTType by'): + with pytest.raises(TypeError, match="Cannot divide NaTType by"): pd.NaT / rng def test_td64arr_div_td64nat(self, box_with_array): # GH#23829 - rng = timedelta_range('1 days', '10 days',) + rng = timedelta_range("1 days", "10 days") rng = tm.box_expected(rng, box_with_array) - other = np.timedelta64('NaT') + other = np.timedelta64("NaT") expected = np.array([np.nan] * 10) expected = tm.box_expected(expected, box_with_array) @@ -1570,20 +1634,20 @@ def test_td64arr_div_td64nat(self, box_with_array): tm.assert_equal(result, expected) def test_td64arr_div_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx / 1 tm.assert_equal(result, idx) - with pytest.raises(TypeError, match='Cannot divide'): + with pytest.raises(TypeError, match="Cannot divide"): # GH#23829 1 / idx def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): # GH#20088, GH#22163 ensure DataFrame returns correct dtype - rng = timedelta_range('1 days', '10 days', name='foo') - expected = pd.Float64Index((np.arange(10) + 1) * 12, name='foo') + rng = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Float64Index((np.arange(10) + 1) * 12, name="foo") rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1595,10 +1659,9 @@ def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): expected = 1 / expected tm.assert_equal(result, expected) - def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, - box_with_array): - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days'], name='foo') - expected = pd.Float64Index([12, np.nan, 24], name='foo') + def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") + expected = pd.Float64Index([12, np.nan, 24], name="foo") rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1612,13 +1675,13 @@ def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, def test_td64arr_div_td64_ndarray(self, box_with_array): # GH#22631 - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days']) + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) expected = pd.Float64Index([12, np.nan, 24]) rng = tm.box_expected(rng, box_with_array) expected = tm.box_expected(expected, box_with_array) - other = np.array([2, 4, 2], dtype='m8[h]') + other = np.array([2, 4, 2], dtype="m8[h]") result = rng / other tm.assert_equal(result, expected) @@ -1646,7 +1709,7 @@ def test_td64arr_div_td64_ndarray(self, box_with_array): tm.assert_equal(result, expected) def test_tdarr_div_length_mismatch(self, box_with_array): - rng = TimedeltaIndex(['1 days', pd.NaT, '2 days']) + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) mismatched = [1, 2, 3, 4] rng = tm.box_expected(rng, box_with_array) @@ -1687,8 +1750,7 @@ def test_td64arr_rfloordiv_tdscalar(self, box_with_array, scalar_td): result = scalar_td // td1 tm.assert_equal(result, expected) - def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, - scalar_td): + def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, scalar_td): # GH#18831 td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan @@ -1704,19 +1766,18 @@ def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, tm.assert_equal(result, 
expected) def test_td64arr_floordiv_int(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) result = idx // 1 tm.assert_equal(result, idx) - pattern = ('floor_divide cannot use operands|' - 'Cannot divide int by Timedelta*') + pattern = "floor_divide cannot use operands|" "Cannot divide int by Timedelta*" with pytest.raises(TypeError, match=pattern): 1 // idx def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): - tdi = timedelta_range('1 days', '10 days', name='foo') - expected = pd.Int64Index((np.arange(10) + 1) * 12, name='foo') + tdi = timedelta_range("1 days", "10 days", name="foo") + expected = pd.Int64Index((np.arange(10) + 1) * 12, name="foo") tdi = tm.box_expected(tdi, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1725,14 +1786,18 @@ def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): tm.assert_equal(result, expected) # TODO: Is this redundant with test_td64arr_floordiv_tdlike_scalar? - @pytest.mark.parametrize('scalar_td', [ - timedelta(minutes=10, seconds=7), - Timedelta('10m7s'), - Timedelta('10m7s').to_timedelta64() - ], ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "scalar_td", + [ + timedelta(minutes=10, seconds=7), + Timedelta("10m7s"), + Timedelta("10m7s").to_timedelta64(), + ], + ids=lambda x: type(x).__name__, + ) def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): # GH#19125 - tdi = TimedeltaIndex(['00:05:03', '00:05:03', pd.NaT], freq=None) + tdi = TimedeltaIndex(["00:05:03", "00:05:03", pd.NaT], freq=None) expected = pd.Index([2.0, 2.0, np.nan]) tdi = tm.box_expected(tdi, box_with_array, transpose=False) @@ -1753,10 +1818,10 @@ def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): # reversed ops def test_td64arr_mod_tdscalar(self, box_with_array, three_days): - tdi = timedelta_range('1 Day', '9 days') + tdi = timedelta_range("1 Day", "9 days") tdarr = tm.box_expected(tdi, box_with_array) - expected = TimedeltaIndex(['1 Day', '2 Days', '0 Days'] * 3) + expected = TimedeltaIndex(["1 Day", "2 Days", "0 Days"] * 3) expected = tm.box_expected(expected, box_with_array) result = tdarr % three_days @@ -1770,10 +1835,10 @@ def test_td64arr_mod_tdscalar(self, box_with_array, three_days): tm.assert_equal(result[0], tdarr // three_days) def test_td64arr_mod_int(self, box_with_array): - tdi = timedelta_range('1 ns', '10 ns', periods=10) + tdi = timedelta_range("1 ns", "10 ns", periods=10) tdarr = tm.box_expected(tdi, box_with_array) - expected = TimedeltaIndex(['1 ns', '0 ns'] * 5) + expected = TimedeltaIndex(["1 ns", "0 ns"] * 5) expected = tm.box_expected(expected, box_with_array) result = tdarr % 2 @@ -1790,10 +1855,10 @@ def test_td64arr_mod_int(self, box_with_array): tm.assert_equal(result[0], tdarr // 2) def test_td64arr_rmod_tdscalar(self, box_with_array, three_days): - tdi = timedelta_range('1 Day', '9 days') + tdi = timedelta_range("1 Day", "9 days") tdarr = tm.box_expected(tdi, box_with_array) - expected = ['0 Days', '1 Day', '0 Days'] + ['3 Days'] * 6 + expected = ["0 Days", "1 Day", "0 Days"] + ["3 Days"] * 6 expected = TimedeltaIndex(expected) expected = tm.box_expected(expected, box_with_array) @@ -1819,14 +1884,14 @@ def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td): # check that we are getting a TypeError # with 'operate' (from core/ops.py) for the ops that are not # defined - pattern = 
'operate|unsupported|cannot|not supported' + pattern = "operate|unsupported|cannot|not supported" with pytest.raises(TypeError, match=pattern): td1 * scalar_td with pytest.raises(TypeError, match=pattern): scalar_td * td1 def test_td64arr_mul_too_short_raises(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): idx * idx[:3] @@ -1834,7 +1899,7 @@ def test_td64arr_mul_too_short_raises(self, box_with_array): idx * np.array([1, 2]) def test_td64arr_mul_td64arr_raises(self, box_with_array): - idx = TimedeltaIndex(np.arange(5, dtype='int64')) + idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = tm.box_expected(idx, box_with_array) with pytest.raises(TypeError): idx * idx @@ -1842,13 +1907,12 @@ def test_td64arr_mul_td64arr_raises(self, box_with_array): # ------------------------------------------------------------------ # Operations with numeric others - @pytest.mark.parametrize('one', [1, np.array(1), 1.0, np.array(1.0)]) + @pytest.mark.parametrize("one", [1, np.array(1), 1.0, np.array(1.0)]) def test_td64arr_mul_numeric_scalar(self, box_with_array, one): # GH#4521 # divide/multiply by integers - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') - expected = Series(['-59 Days', '-59 Days', 'NaT'], - dtype='timedelta64[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1858,8 +1922,7 @@ def test_td64arr_mul_numeric_scalar(self, box_with_array, one): result = (-one) * tdser tm.assert_equal(result, expected) - expected = Series(['118 Days', '118 Days', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["118 Days", "118 Days", "NaT"], dtype="timedelta64[ns]") expected = tm.box_expected(expected, box_with_array) result = tdser * (2 * one) @@ -1867,12 +1930,12 @@ def test_td64arr_mul_numeric_scalar(self, box_with_array, one): result = (2 * one) * tdser tm.assert_equal(result, expected) - @pytest.mark.parametrize('two', [2, 2.0, np.array(2), np.array(2.0)]) + @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)]) def test_td64arr_div_numeric_scalar(self, box_with_array, two): # GH#4521 # divide/multiply by integers - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') - expected = Series(['29.5D', '29.5D', 'NaT'], dtype='timedelta64[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") + expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, box_with_array) @@ -1880,26 +1943,38 @@ def test_td64arr_div_numeric_scalar(self, box_with_array, two): result = tdser / two tm.assert_equal(result, expected) - with pytest.raises(TypeError, match='Cannot divide'): + with pytest.raises(TypeError, match="Cannot divide"): two / tdser - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + 
@pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(dtype) - expected = Series(['1180 Days', '1770 Days', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, xbox) @@ -1910,21 +1985,33 @@ def test_td64arr_rmul_numeric_array(self, box_with_array, vector, dtype): result = vector * tdser tm.assert_equal(result, expected) - @pytest.mark.parametrize('dtype', ['int64', 'int32', 'int16', - 'uint64', 'uint32', 'uint16', 'uint8', - 'float64', 'float32', 'float16']) - @pytest.mark.parametrize('vector', [np.array([20, 30, 40]), - pd.Index([20, 30, 40]), - Series([20, 30, 40])], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "dtype", + [ + "int64", + "int32", + "int16", + "uint64", + "uint32", + "uint16", + "uint8", + "float64", + "float32", + "float16", + ], + ) + @pytest.mark.parametrize( + "vector", + [np.array([20, 30, 40]), pd.Index([20, 30, 40]), Series([20, 30, 40])], + ids=lambda x: type(x).__name__, + ) def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): # GH#4521 # divide/multiply by integers xbox = get_upcast_box(box_with_array, vector) - tdser = pd.Series(['59 Days', '59 Days', 'NaT'], dtype='m8[ns]') + tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") vector = vector.astype(dtype) - expected = Series(['2.95D', '1D 23H 12m', 'NaT'], - dtype='timedelta64[ns]') + expected = Series(["2.95D", "1D 23H 12m", "NaT"], dtype="timedelta64[ns]") tdser = tm.box_expected(tdser, box_with_array) expected = tm.box_expected(expected, xbox) @@ -1932,11 +2019,13 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): result = tdser / vector tm.assert_equal(result, expected) - pattern = ('true_divide cannot use operands|' - 'cannot perform __div__|' - 'cannot perform __truediv__|' - 'unsupported operand|' - 'Cannot divide') + pattern = ( + "true_divide cannot use operands|" + "cannot perform __div__|" + "cannot perform __truediv__|" + "unsupported operand|" + "Cannot divide" + ) with pytest.raises(TypeError, match=pattern): vector / tdser @@ -1944,8 +2033,7 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): # Index.__rdiv__ won't try to operate elementwise, just raises result = tdser / vector.astype(object) if box_with_array is pd.DataFrame: - expected = [tdser.iloc[0, n] / vector[n] - for n in range(len(vector))] + expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))] else: expected = [tdser[n] / vector[n] for n in range(len(tdser))] expected = tm.box_expected(expected, xbox) @@ -1954,20 +2042,28 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): with pytest.raises(TypeError, match=pattern): vector.astype(object) / tdser - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def 
test_td64arr_mul_int_series(self, box_df_fail, names): # GH#19042 test for correct name attachment box = box_df_fail # broadcasts along wrong axis, but doesn't raise - tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) # TODO: Should we be parametrizing over types for `ser` too? ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1]) - expected = Series(['0days', '1day', '4days', '9days', '16days'], - dtype='timedelta64[ns]', - name=names[2]) + expected = Series( + ["0days", "1day", "4days", "9days", "16days"], + dtype="timedelta64[ns]", + name=names[2], + ) tdi = tm.box_expected(tdi, box) box = Series if (box is pd.Index and type(ser) is Series) else box @@ -1981,22 +2077,30 @@ def test_td64arr_mul_int_series(self, box_df_fail, names): tm.assert_equal(result, expected) # TODO: Should we be parametrizing over types for `ser` too? - @pytest.mark.parametrize('names', [(None, None, None), - ('Egon', 'Venkman', None), - ('NCC1701D', 'NCC1701D', 'NCC1701D')]) + @pytest.mark.parametrize( + "names", + [ + (None, None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ], + ) def test_float_series_rdiv_td64arr(self, box_with_array, names): # GH#19042 test for correct name attachment # TODO: the direct operation TimedeltaIndex / Series still # needs to be fixed. box = box_with_array - tdi = TimedeltaIndex(['0days', '1day', '2days', '3days', '4days'], - name=names[0]) + tdi = TimedeltaIndex( + ["0days", "1day", "2days", "3days", "4days"], name=names[0] + ) ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) xname = names[2] if box is not tm.to_array else names[1] - expected = Series([tdi[n] / ser[n] for n in range(len(ser))], - dtype='timedelta64[ns]', - name=xname) + expected = Series( + [tdi[n] / ser[n] for n in range(len(ser))], + dtype="timedelta64[ns]", + name=xname, + ) xbox = box if box in [pd.Index, tm.to_array] and type(ser) is Series: @@ -2014,7 +2118,6 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): class TestTimedeltaArraylikeInvalidArithmeticOps: - def test_td64arr_pow_invalid(self, scalar_td, box_with_array): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan @@ -2024,7 +2127,7 @@ def test_td64arr_pow_invalid(self, scalar_td, box_with_array): # check that we are getting a TypeError # with 'operate' (from core/ops.py) for the ops that are not # defined - pattern = 'operate|unsupported|cannot|not supported' + pattern = "operate|unsupported|cannot|not supported" with pytest.raises(TypeError, match=pattern): scalar_td ** td1 diff --git a/pandas/tests/arrays/categorical/common.py b/pandas/tests/arrays/categorical/common.py index cc4b53407107c..4ef9390656979 100644 --- a/pandas/tests/arrays/categorical/common.py +++ b/pandas/tests/arrays/categorical/common.py @@ -2,7 +2,7 @@ class TestCategorical: - def setup_method(self, method): - self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], - ordered=True) + self.factor = Categorical( + ["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True + ) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 6b75d06438889..1508fef86ae62 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -5,45 +5,42 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('ordered', [True, False]) 
-@pytest.mark.parametrize('categories', [ - ['b', 'a', 'c'], - ['a', 'b', 'c', 'd'], -]) +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]]) def test_factorize(categories, ordered): - cat = pd.Categorical(['b', 'b', 'a', 'c', None], - categories=categories, - ordered=ordered) + cat = pd.Categorical( + ["b", "b", "a", "c", None], categories=categories, ordered=ordered + ) labels, uniques = pd.factorize(cat) expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp) - expected_uniques = pd.Categorical(['b', 'a', 'c'], - categories=categories, - ordered=ordered) + expected_uniques = pd.Categorical( + ["b", "a", "c"], categories=categories, ordered=ordered + ) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort(): - cat = pd.Categorical(['b', 'b', None, 'a']) + cat = pd.Categorical(["b", "b", None, "a"]) labels, uniques = pd.factorize(cat, sort=True) expected_labels = np.array([1, 1, -1, 0], dtype=np.intp) - expected_uniques = pd.Categorical(['a', 'b']) + expected_uniques = pd.Categorical(["a", "b"]) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort_ordered(): - cat = pd.Categorical(['b', 'b', None, 'a'], - categories=['c', 'b', 'a'], - ordered=True) + cat = pd.Categorical( + ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True + ) labels, uniques = pd.factorize(cat, sort=True) expected_labels = np.array([0, 0, -1, 1], dtype=np.intp) - expected_uniques = pd.Categorical(['b', 'a'], - categories=['c', 'b', 'a'], - ordered=True) + expected_uniques = pd.Categorical( + ["b", "a"], categories=["c", "b", "a"], ordered=True + ) tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) @@ -75,49 +72,52 @@ class TestTake: # https://github.com/pandas-dev/pandas/issues/20664 def test_take_warns(self): - cat = pd.Categorical(['a', 'b']) + cat = pd.Categorical(["a", "b"]) with tm.assert_produces_warning(FutureWarning): cat.take([0, -1]) def test_take_positive_no_warning(self): - cat = pd.Categorical(['a', 'b']) + cat = pd.Categorical(["a", "b"]) with tm.assert_produces_warning(None): cat.take([0, 0]) def test_take_bounds(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 - cat = pd.Categorical(['a', 'b', 'a']) + cat = pd.Categorical(["a", "b", "a"]) with pytest.raises(IndexError): cat.take([4, 5], allow_fill=allow_fill) def test_take_empty(self, allow_fill): # https://github.com/pandas-dev/pandas/issues/20664 - cat = pd.Categorical([], categories=['a', 'b']) + cat = pd.Categorical([], categories=["a", "b"]) with pytest.raises(IndexError): cat.take([0], allow_fill=allow_fill) def test_positional_take(self, ordered_fixture): - cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'], - ordered=ordered_fixture) + cat = pd.Categorical( + ["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered_fixture + ) result = cat.take([0, 1, 2], allow_fill=False) - expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories, - ordered=ordered_fixture) + expected = pd.Categorical( + ["a", "a", "b"], categories=cat.categories, ordered=ordered_fixture + ) tm.assert_categorical_equal(result, expected) def test_positional_take_unobserved(self, ordered_fixture): - cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'], - ordered=ordered_fixture) + cat = pd.Categorical( + ["a", "b"], 
categories=["a", "b", "c"], ordered=ordered_fixture + ) result = cat.take([1, 0], allow_fill=False) - expected = pd.Categorical(['b', 'a'], categories=cat.categories, - ordered=ordered_fixture) + expected = pd.Categorical( + ["b", "a"], categories=cat.categories, ordered=ordered_fixture + ) tm.assert_categorical_equal(result, expected) def test_take_allow_fill(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'a', 'b']) + cat = pd.Categorical(["a", "a", "b"]) result = cat.take([0, -1, -1], allow_fill=True) - expected = pd.Categorical(['a', np.nan, np.nan], - categories=['a', 'b']) + expected = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) def test_take_fill_with_negative_one(self): @@ -129,14 +129,14 @@ def test_take_fill_with_negative_one(self): def test_take_fill_value(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'b', 'c']) - result = cat.take([0, 1, -1], fill_value='a', allow_fill=True) - expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']) + cat = pd.Categorical(["a", "b", "c"]) + result = cat.take([0, 1, -1], fill_value="a", allow_fill=True) + expected = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) def test_take_fill_value_new_raises(self): # https://github.com/pandas-dev/pandas/issues/23296 - cat = pd.Categorical(['a', 'b', 'c']) + cat = pd.Categorical(["a", "b", "c"]) xpr = r"'fill_value' \('d'\) is not in this Categorical's categories." with pytest.raises(TypeError, match=xpr): - cat.take([0, 1, -1], fill_value='d', allow_fill=True) + cat.take([0, 1, -1], fill_value="d", allow_fill=True) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 134b16d828746..d8831d7e6bf36 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -11,15 +11,14 @@ class TestCategoricalAnalytics: - def test_min_max(self): # unordered cats have no min/max cat = Categorical(["a", "b", "c", "d"], ordered=False) msg = "Categorical is not ordered for operation {}" - with pytest.raises(TypeError, match=msg.format('min')): + with pytest.raises(TypeError, match=msg.format("min")): cat.min() - with pytest.raises(TypeError, match=msg.format('max')): + with pytest.raises(TypeError, match=msg.format("max")): cat.max() cat = Categorical(["a", "b", "c", "d"], ordered=True) @@ -28,15 +27,17 @@ def test_min_max(self): assert _min == "a" assert _max == "d" - cat = Categorical(["a", "b", "c", "d"], - categories=['d', 'c', 'b', 'a'], ordered=True) + cat = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) _min = cat.min() _max = cat.max() assert _min == "d" assert _max == "a" - cat = Categorical([np.nan, "b", "c", np.nan], - categories=['d', 'c', 'b', 'a'], ordered=True) + cat = Categorical( + [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -47,8 +48,9 @@ def test_min_max(self): _max = cat.max(numeric_only=True) assert _max == "b" - cat = Categorical([np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], - ordered=True) + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -59,13 +61,17 @@ def test_min_max(self): _max = cat.max(numeric_only=True) assert _max == 1 
- @pytest.mark.parametrize("values,categories,exp_mode", [ - ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), - ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), - ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), - ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), - ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4])]) + @pytest.mark.parametrize( + "values,categories,exp_mode", + [ + ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), + ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), + ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), + ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), + ], + ) def test_mode(self, values, categories, exp_mode): s = Categorical(values, categories=categories, ordered=True) res = s.mode() @@ -76,34 +82,38 @@ def test_searchsorted(self): # https://github.com/pandas-dev/pandas/issues/8420 # https://github.com/pandas-dev/pandas/issues/14522 - c1 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=True) + c1 = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=True, + ) s1 = Series(c1) - c2 = Categorical(['cheese', 'milk', 'apple', 'bread', 'bread'], - categories=['cheese', 'milk', 'apple', 'bread'], - ordered=False) + c2 = Categorical( + ["cheese", "milk", "apple", "bread", "bread"], + categories=["cheese", "milk", "apple", "bread"], + ordered=False, + ) s2 = Series(c2) # Searching for single item argument, side='left' (default) - res_cat = c1.searchsorted('apple') + res_cat = c1.searchsorted("apple") assert res_cat == 2 assert is_scalar(res_cat) - res_ser = s1.searchsorted('apple') + res_ser = s1.searchsorted("apple") assert res_ser == 2 assert is_scalar(res_ser) # Searching for single item array, side='left' (default) - res_cat = c1.searchsorted(['bread']) - res_ser = s1.searchsorted(['bread']) + res_cat = c1.searchsorted(["bread"]) + res_ser = s1.searchsorted(["bread"]) exp = np.array([3], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) # Searching for several items array, side='right' - res_cat = c1.searchsorted(['apple', 'bread'], side='right') - res_ser = s1.searchsorted(['apple', 'bread'], side='right') + res_cat = c1.searchsorted(["apple", "bread"], side="right") + res_ser = s1.searchsorted(["apple", "bread"], side="right") exp = np.array([3, 5], dtype=np.intp) tm.assert_numpy_array_equal(res_cat, exp) tm.assert_numpy_array_equal(res_ser, exp) @@ -111,22 +121,22 @@ def test_searchsorted(self): # Searching for a single value that is not from the Categorical msg = r"Value\(s\) to be inserted must be in categories" with pytest.raises(KeyError, match=msg): - c1.searchsorted('cucumber') + c1.searchsorted("cucumber") with pytest.raises(KeyError, match=msg): - s1.searchsorted('cucumber') + s1.searchsorted("cucumber") # Searching for multiple values one of each is not from the Categorical with pytest.raises(KeyError, match=msg): - c1.searchsorted(['bread', 'cucumber']) + c1.searchsorted(["bread", "cucumber"]) with pytest.raises(KeyError, match=msg): - s1.searchsorted(['bread', 'cucumber']) + s1.searchsorted(["bread", "cucumber"]) # searchsorted call for unordered Categorical msg = "Categorical not ordered" with pytest.raises(ValueError, match=msg): - c2.searchsorted('apple') + 
c2.searchsorted("apple") with pytest.raises(ValueError, match=msg): - s2.searchsorted('apple') + s2.searchsorted("apple") def test_unique(self): # categories are reordered based on value when ordered=False @@ -141,17 +151,15 @@ def test_unique(self): tm.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, Categorical(exp)) - cat = Categorical(["c", "a", "b", "a", "a"], - categories=["a", "b", "c"]) + cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) exp = Index(["c", "a", "b"]) res = cat.unique() tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(exp, categories=['c', 'a', 'b']) + exp_cat = Categorical(exp, categories=["c", "a", "b"]) tm.assert_categorical_equal(res, exp_cat) # nan must be removed - cat = Categorical(["b", np.nan, "b", np.nan, "a"], - categories=["a", "b", "c"]) + cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) res = cat.unique() exp = Index(["b", "a"]) tm.assert_index_equal(res.categories, exp) @@ -160,29 +168,28 @@ def test_unique(self): def test_unique_ordered(self): # keep categories order when ordered=True - cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True) + cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) res = cat.unique() - exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True + ) res = cat.unique() - exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'], - ordered=True) + exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) res = cat.unique() - exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) + exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True + ) res = cat.unique() - exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'], - ordered=True) + exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) tm.assert_categorical_equal(res, exp_cat) def test_unique_index_series(self): @@ -211,18 +218,19 @@ def test_unique_index_series(self): def test_shift(self): # GH 9416 - cat = Categorical(['a', 'b', 'c', 'd', 'a']) + cat = Categorical(["a", "b", "c", "d", "a"]) # shift forward sp1 = cat.shift(1) - xp1 = Categorical([np.nan, 'a', 'b', 'c', 'd']) + xp1 = Categorical([np.nan, "a", "b", "c", "d"]) tm.assert_categorical_equal(sp1, xp1) tm.assert_categorical_equal(cat[:-1], sp1[1:]) # shift back sn2 = cat.shift(-2) - xp2 = Categorical(['c', 'd', 'a', np.nan, np.nan], - categories=['a', 'b', 'c', 'd']) + xp2 = Categorical( + ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"] + ) tm.assert_categorical_equal(sn2, xp2) tm.assert_categorical_equal(cat[2:], sn2[:-2]) @@ -241,7 +249,7 @@ def test_memory_usage(self): assert 0 < cat.nbytes <= cat.memory_usage() assert 0 < cat.nbytes <= cat.memory_usage(deep=True) - 
cat = Categorical(['foo', 'foo', 'bar']) + cat = Categorical(["foo", "foo", "bar"]) assert cat.memory_usage(deep=True) > cat.nbytes if not PYPY: @@ -251,14 +259,14 @@ def test_memory_usage(self): assert abs(diff) < 100 def test_map(self): - c = Categorical(list('ABABC'), categories=list('CBA'), ordered=True) + c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) result = c.map(lambda x: x.lower()) - exp = Categorical(list('ababc'), categories=list('cba'), ordered=True) + exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) tm.assert_categorical_equal(result, exp) - c = Categorical(list('ABABC'), categories=list('ABC'), ordered=False) + c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False) result = c.map(lambda x: x.lower()) - exp = Categorical(list('ababc'), categories=list('abc'), ordered=False) + exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) tm.assert_categorical_equal(result, exp) result = c.map(lambda x: 1) @@ -266,7 +274,7 @@ def test_map(self): tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) def test_validate_inplace(self): - cat = Categorical(['A', 'B', 'B', 'C', 'A']) + cat = Categorical(["A", "B", "B", "C", "A"]) invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: @@ -280,21 +288,19 @@ def test_validate_inplace(self): cat.as_unordered(inplace=value) with pytest.raises(ValueError): - cat.set_categories(['X', 'Y', 'Z'], rename=True, inplace=value) + cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) with pytest.raises(ValueError): - cat.rename_categories(['X', 'Y', 'Z'], inplace=value) + cat.rename_categories(["X", "Y", "Z"], inplace=value) with pytest.raises(ValueError): - cat.reorder_categories( - ['X', 'Y', 'Z'], ordered=True, inplace=value) + cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) with pytest.raises(ValueError): - cat.add_categories( - new_categories=['D', 'E', 'F'], inplace=value) + cat.add_categories(new_categories=["D", "E", "F"], inplace=value) with pytest.raises(ValueError): - cat.remove_categories(removals=['D', 'E', 'F'], inplace=value) + cat.remove_categories(removals=["D", "E", "F"], inplace=value) with pytest.raises(ValueError): cat.remove_unused_categories(inplace=value) diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index d2f63268e5a12..ab07b3c96a1db 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -8,23 +8,22 @@ class TestCategoricalAPI: - def test_ordered_api(self): # GH 9347 - cat1 = Categorical(list('acb'), ordered=False) - tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c'])) + cat1 = Categorical(list("acb"), ordered=False) + tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"])) assert not cat1.ordered - cat2 = Categorical(list('acb'), categories=list('bca'), ordered=False) - tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a'])) + cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False) + tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"])) assert not cat2.ordered - cat3 = Categorical(list('acb'), ordered=True) - tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c'])) + cat3 = Categorical(list("acb"), ordered=True) + tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"])) assert cat3.ordered - cat4 = Categorical(list('acb'), categories=list('bca'), ordered=True) - tm.assert_index_equal(cat4.categories, Index(['b', 'c', 
'a'])) + cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True) + tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"])) assert cat4.ordered def test_set_ordered(self): @@ -47,7 +46,7 @@ def test_set_ordered(self): assert not cat2.ordered # removed in 0.19.0 - msg = "can\'t set attribute" + msg = "can't set attribute" with pytest.raises(AttributeError, match=msg): cat.ordered = True with pytest.raises(AttributeError, match=msg): @@ -58,8 +57,9 @@ def test_rename_categories(self): # inplace=False: the old one must not be changed res = cat.rename_categories([1, 2, 3]) - tm.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1], - dtype=np.int64)) + tm.assert_numpy_array_equal( + res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) tm.assert_index_equal(res.categories, Index([1, 2, 3])) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) @@ -76,8 +76,9 @@ def test_rename_categories(self): # and now inplace res = cat.rename_categories([1, 2, 3], inplace=True) assert res is None - tm.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1], - dtype=np.int64)) + tm.assert_numpy_array_equal( + cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64) + ) tm.assert_index_equal(cat.categories, Index([1, 2, 3])) # Lengthen @@ -90,50 +91,49 @@ def test_rename_categories(self): def test_rename_categories_series(self): # https://github.com/pandas-dev/pandas/issues/17981 - c = Categorical(['a', 'b']) - result = c.rename_categories(Series([0, 1], index=['a', 'b'])) + c = Categorical(["a", "b"]) + result = c.rename_categories(Series([0, 1], index=["a", "b"])) expected = Categorical([0, 1]) tm.assert_categorical_equal(result, expected) def test_rename_categories_dict(self): # GH 17336 - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}) expected = Index([4, 3, 2, 1]) tm.assert_index_equal(res.categories, expected) # Test for inplace - res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1}, - inplace=True) + res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True) assert res is None tm.assert_index_equal(cat.categories, expected) # Test for dicts of smaller length - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 1, 'c': 3}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "c": 3}) - expected = Index([1, 'b', 3, 'd']) + expected = Index([1, "b", 3, "d"]) tm.assert_index_equal(res.categories, expected) # Test for dicts with bigger length - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3, - 'd': 4, 'e': 5, 'f': 6}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6}) expected = Index([1, 2, 3, 4]) tm.assert_index_equal(res.categories, expected) # Test for dicts with no items from old categories - cat = Categorical(['a', 'b', 'c', 'd']) - res = cat.rename_categories({'f': 1, 'g': 3}) + cat = Categorical(["a", "b", "c", "d"]) + res = cat.rename_categories({"f": 1, "g": 3}) - expected = Index(['a', 'b', 'c', 'd']) + expected = Index(["a", "b", "c", "d"]) tm.assert_index_equal(res.categories, expected) def test_reorder_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"], - ordered=True) + new = Categorical( 
+ ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True + ) # first inplace == False res = cat.reorder_categories(["c", "b", "a"]) @@ -164,8 +164,9 @@ def test_reorder_categories(self): def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", "c", "a"], - categories=["a", "b", "c", "d"], ordered=True) + new = Categorical( + ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True + ) # first inplace == False res = cat.add_categories("d") @@ -187,8 +188,7 @@ def test_add_categories(self): # GH 9927 cat = Categorical(list("abc"), ordered=True) - expected = Categorical( - list("abc"), categories=list("abcde"), ordered=True) + expected = Categorical(list("abc"), categories=list("abcde"), ordered=True) # test with Series, np.array, index, list res = cat.add_categories(Series(["d", "e"])) tm.assert_categorical_equal(res, expected) @@ -222,13 +222,11 @@ def test_set_categories(self): # np.nan cat = Categorical(["a", "b", "c", "a"], ordered=True) res = cat.set_categories(["a"]) - tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8)) # still not all "old" in "new" res = cat.set_categories(["a", "b", "d"]) - tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8)) tm.assert_index_equal(res.categories, Index(["a", "b", "d"])) # all "old" included in "new" @@ -238,8 +236,7 @@ def test_set_categories(self): # internals... c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8)) tm.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) @@ -249,8 +246,7 @@ def test_set_categories(self): c = c.set_categories([4, 3, 2, 1]) # positions are changed - tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8)) # categories are now in new order tm.assert_index_equal(c.categories, Index([4, 3, 2, 1])) @@ -273,31 +269,33 @@ def test_set_categories(self): tm.assert_numpy_array_equal(c.to_dense(), c2.to_dense()) - @pytest.mark.parametrize('values, categories, new_categories', [ - # No NaNs, same cats, same order - (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],), - # Same, unsorted - (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],), - # NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']), - (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - # Introduce NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a']), - (['a', 'b', 'c'], ['a', 'b'], ['b']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - # No overlap - (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']), - ]) - @pytest.mark.parametrize('ordered', [True, False]) - def test_set_categories_many(self, values, categories, new_categories, - ordered): + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", 
"b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_categories_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) result = c.set_categories(new_categories, ordered=ordered) @@ -305,28 +303,27 @@ def test_set_categories_many(self, values, categories, new_categories, def test_set_categories_rename_less(self): # GH 24675 - cat = Categorical(['A', 'B']) - result = cat.set_categories(['A'], rename=True) - expected = Categorical(['A', np.nan]) + cat = Categorical(["A", "B"]) + result = cat.set_categories(["A"], rename=True) + expected = Categorical(["A", np.nan]) tm.assert_categorical_equal(result, expected) def test_set_categories_private(self): - cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) - cat._set_categories(['a', 'c', 'd', 'e']) - expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"]) + expected = Categorical(["a", "c", "d"], categories=list("acde")) tm.assert_categorical_equal(cat, expected) # fastpath - cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) - cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True) - expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"]) + cat._set_categories(["a", "c", "d", "e"], fastpath=True) + expected = Categorical(["a", "c", "d"], categories=list("acde")) tm.assert_categorical_equal(cat, expected) def test_remove_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() - new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], - ordered=True) + new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True) # first inplace == False res = cat.remove_categories("c") @@ -347,8 +344,7 @@ def test_remove_categories(self): cat.remove_categories(["c"]) def test_remove_unused_categories(self): - c = Categorical(["a", "b", "c", "d", "a"], - categories=["a", "b", "c", "d", "e"]) + c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) exp_categories_all = Index(["a", "b", "c", "d", "e"]) exp_categories_dropped = Index(["a", "b", "c", "d"]) @@ -363,25 +359,23 @@ def test_remove_unused_categories(self): assert res is None # with NaN values (GH11599) - c = Categorical(["a", "b", "c", np.nan], - categories=["a", "b", "c", "d", "e"]) + c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - tm.assert_index_equal(res.categories, - Index(np.array(["a", "b", "c"]))) + tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"]))) exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) 
tm.assert_numpy_array_equal(res.codes, exp_codes) tm.assert_index_equal(c.categories, exp_categories_all) - val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] - cat = Categorical(values=val, categories=list('ABCDEFG')) + val = ["F", np.nan, "D", "B", "D", "F", np.nan] + cat = Categorical(values=val, categories=list("ABCDEFG")) out = cat.remove_unused_categories() - tm.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + tm.assert_index_equal(out.categories, Index(["B", "D", "F"])) exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) tm.assert_numpy_array_equal(out.codes, exp_codes) assert out.tolist() == val - alpha = list('abcdefghijklmnopqrstuvwxyz') - val = np.random.choice(alpha[::2], 10000).astype('object') + alpha = list("abcdefghijklmnopqrstuvwxyz") + val = np.random.choice(alpha[::2], 10000).astype("object") val[np.random.choice(len(val), 100)] = np.nan cat = Categorical(values=val, categories=alpha) @@ -390,16 +384,16 @@ def test_remove_unused_categories(self): class TestCategoricalAPIWithFactor(TestCategorical): - def test_describe(self): # string type desc = self.factor.describe() assert self.factor.ordered - exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories', - ordered=self.factor.ordered) - expected = DataFrame({'counts': [3, 2, 3], - 'freqs': [3 / 8., 2 / 8., 3 / 8.]}, - index=exp_index) + exp_index = CategoricalIndex( + ["a", "b", "c"], name="categories", ordered=self.factor.ordered + ) + expected = DataFrame( + {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index + ) tm.assert_frame_equal(desc, expected) # check unused categories @@ -408,51 +402,53 @@ def test_describe(self): desc = cat.describe() exp_index = CategoricalIndex( - list('abcd'), ordered=self.factor.ordered, name='categories') - expected = DataFrame({'counts': [3, 2, 3, 0], - 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]}, - index=exp_index) + list("abcd"), ordered=self.factor.ordered, name="categories" + ) + expected = DataFrame( + {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]}, + index=exp_index, + ) tm.assert_frame_equal(desc, expected) # check an integer one cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) desc = cat.describe() - exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, - name='categories') - expected = DataFrame({'counts': [5, 3, 3], - 'freqs': [5 / 11., 3 / 11., 3 / 11.]}, - index=exp_index) + exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") + expected = DataFrame( + {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]}, + index=exp_index, + ) tm.assert_frame_equal(desc, expected) # https://github.com/pandas-dev/pandas/issues/3678 # describe should work with NaN cat = Categorical([np.nan, 1, 2, 2]) desc = cat.describe() - expected = DataFrame({'counts': [1, 2, 1], - 'freqs': [1 / 4., 2 / 4., 1 / 4.]}, - index=CategoricalIndex([1, 2, np.nan], - categories=[1, 2], - name='categories')) + expected = DataFrame( + {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]}, + index=CategoricalIndex( + [1, 2, np.nan], categories=[1, 2], name="categories" + ), + ) tm.assert_frame_equal(desc, expected) def test_set_categories_inplace(self): cat = self.factor.copy() - cat.set_categories(['a', 'b', 'c', 'd'], inplace=True) - tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd'])) + cat.set_categories(["a", "b", "c", "d"], inplace=True) + tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"])) class TestPrivateCategoricalAPI: - def test_codes_immutable(self): # Codes 
should be read only c = Categorical(["a", "b", "c", "a", np.nan]) - exp = np.array([0, 1, 2, 0, -1], dtype='int8') + exp = np.array([0, 1, 2, 0, -1], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) # Assignments to codes should raise with pytest.raises(ValueError): - c.codes = np.array([0, 1, 2, 0, 1], dtype='int8') + c.codes = np.array([0, 1, 2, 0, 1], dtype="int8") # changes in the codes array should raise codes = c.codes @@ -463,26 +459,29 @@ def test_codes_immutable(self): # But even after getting the codes, the original array should still be # writeable! c[4] = "a" - exp = np.array([0, 1, 2, 0, 0], dtype='int8') + exp = np.array([0, 1, 2, 0, 0], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) c._codes[4] = 2 - exp = np.array([0, 1, 2, 0, 2], dtype='int8') + exp = np.array([0, 1, 2, 0, 2], dtype="int8") tm.assert_numpy_array_equal(c.codes, exp) - @pytest.mark.parametrize('codes, old, new, expected', [ - ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]), - ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]), - ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]), - ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]), - ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]), - ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]), - ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]), - ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]), - ([-1, -1], [], ['a', 'b'], [-1, -1]), - ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]), - ]) + @pytest.mark.parametrize( + "codes, old, new, expected", + [ + ([0, 1], ["a", "b"], ["a", "b"], [0, 1]), + ([0, 1], ["b", "a"], ["b", "a"], [0, 1]), + ([0, 1], ["a", "b"], ["b", "a"], [1, 0]), + ([0, 1], ["b", "a"], ["a", "b"], [1, 0]), + ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]), + ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]), + ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]), + ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]), + ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]), + ([-1, -1], [], ["a", "b"], [-1, -1]), + ([1, 0], ["b", "a"], ["a", "b"], [0, 1]), + ], + ) def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index a28977a87de8d..4bf31a52dcda8 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -8,14 +8,23 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DatetimeIndex, Index, Interval, - IntervalIndex, NaT, Series, Timestamp, date_range, period_range, - timedelta_range) + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + NaT, + Series, + Timestamp, + date_range, + period_range, + timedelta_range, +) import pandas.util.testing as tm class TestCategoricalConstructors: - def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" @@ -28,8 +37,9 @@ def test_validate_ordered(self): Categorical([1, 2, 3], ordered=ordered) with pytest.raises(exp_err, match=exp_msg): - Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], - ordered=ordered) + Categorical.from_codes( + [0, 0, 1], categories=["a", "b", "c"], ordered=ordered + ) def test_constructor_empty(self): # 
GH 17248 @@ -57,32 +67,42 @@ def test_constructor_tuples(self): def test_constructor_tuples_datetimes(self): # numpy will auto reshape when all of the tuples are the # same len, so add an extra one with 2 items and slice it off - values = np.array([(Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),), - (Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),), - ('a', 'b')], dtype=object)[:-1] + values = np.array( + [ + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + (Timestamp("2010-01-01"),), + (Timestamp("2010-01-02"),), + ("a", "b"), + ], + dtype=object, + )[:-1] result = Categorical(values) - expected = Index([(Timestamp('2010-01-01'),), - (Timestamp('2010-01-02'),)], tupleize_cols=False) + expected = Index( + [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)], + tupleize_cols=False, + ) tm.assert_index_equal(result.categories, expected) def test_constructor_unsortable(self): # it works! - arr = np.array([1, 2, 3, datetime.now()], dtype='O') + arr = np.array([1, 2, 3, datetime.now()], dtype="O") factor = Categorical(arr, ordered=False) assert not factor.ordered # this however will raise as cannot be sorted - msg = ("'values' is not ordered, please explicitly specify the " - "categories order by passing in a categories argument.") + msg = ( + "'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument." + ) with pytest.raises(TypeError, match=msg): Categorical(arr, ordered=True) def test_constructor_interval(self): - result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], - ordered=True) + result = Categorical( + [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True + ) ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)]) exp = Categorical(ii, ordered=True) tm.assert_categorical_equal(result, exp) @@ -143,8 +163,7 @@ def test_constructor(self): tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - c2 = Categorical(Series(["a", "b", "c", "a"]), - categories=["a", "b", "c", "d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) tm.assert_categorical_equal(c1, c2) # This should result in integer categories, not float! 
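The constructor hunks that follow assert how the dtype of the inferred categories depends on the input values ("integer categories, not float" versus "this should result in floats"). A minimal sketch of those two cases, offered as a reading aid only and not as part of the patch, assuming a standard pandas install:

import numpy as np
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype

# NaN never becomes a category; with otherwise-integer input the inferred
# categories stay integer, as the "not float!" assertion in the test states.
cat = pd.Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
assert (cat.codes == [-1, 0, 1, 2]).all()  # NaN is encoded as code -1

# A genuine float value in the input does push the categories to float.
cat = pd.Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)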
@@ -156,10 +175,10 @@ def test_constructor(self): assert is_integer_dtype(cat.categories) # this should result in floats - cat = Categorical([np.nan, 1, 2., 3]) + cat = Categorical([np.nan, 1, 2.0, 3]) assert is_float_dtype(cat.categories) - cat = Categorical([np.nan, 1., 2., 3.]) + cat = Categorical([np.nan, 1.0, 2.0, 3.0]) assert is_float_dtype(cat.categories) # This doesn't work -> this would probably need some kind of "remember @@ -193,12 +212,10 @@ def test_constructor(self): # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN with tm.assert_produces_warning(None): - c_old = Categorical([0, 1, 2, 0, 1, 2], - categories=["a", "b", "c"]) # noqa + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa with tm.assert_produces_warning(None): - c_old = Categorical([0, 1, 2, 0, 1, 2], # noqa - categories=[3, 4, 5]) + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa # the next one are from the old docs with tm.assert_produces_warning(None): @@ -207,8 +224,9 @@ def test_constructor(self): # this is a legitimate constructor with tm.assert_produces_warning(None): - c = Categorical(np.array([], dtype='int64'), # noqa - categories=[3, 2, 1], ordered=True) + c = Categorical( + np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True # noqa + ) def test_constructor_with_existing_categories(self): # GH25318: constructing with pd.Series used to bogusly skip recoding @@ -226,32 +244,32 @@ def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 msg = r"^Parameter 'categories' must be list-like, was" with pytest.raises(TypeError, match=msg): - Categorical(['a', 'b'], categories='a') + Categorical(["a", "b"], categories="a") def test_constructor_with_null(self): # Cannot have NaN in categories msg = "Categorial categories cannot be null" with pytest.raises(ValueError, match=msg): - Categorical([np.nan, "a", "b", "c"], - categories=[np.nan, "a", "b", "c"]) + Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) with pytest.raises(ValueError, match=msg): - Categorical([None, "a", "b", "c"], - categories=[None, "a", "b", "c"]) + Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) with pytest.raises(ValueError, match=msg): - Categorical(DatetimeIndex(['nat', '20160101']), - categories=[NaT, Timestamp('20160101')]) + Categorical( + DatetimeIndex(["nat", "20160101"]), + categories=[NaT, Timestamp("20160101")], + ) def test_constructor_with_index(self): - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) tm.assert_categorical_equal(ci.values, Categorical(ci)) - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - tm.assert_categorical_equal(ci.values, - Categorical(ci.astype(object), - categories=ci.categories)) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + tm.assert_categorical_equal( + ci.values, Categorical(ci.astype(object), categories=ci.categories) + ) def test_constructor_with_generator(self): # This was raising an Error in isna(single_val).any() because isna @@ -266,7 +284,8 @@ def test_constructor_with_generator(self): # This uses xrange internally from pandas.core.index import MultiIndex - MultiIndex.from_product([range(5), ['a', 'b', 'c']]) + + MultiIndex.from_product([range(5), ["a", "b", "c"]]) # check that categories accept generators and sequences cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2])) @@ -274,12 
+293,14 @@ def test_constructor_with_generator(self): cat = Categorical([0, 1, 2], categories=xrange(3)) tm.assert_categorical_equal(cat, exp) - @pytest.mark.parametrize("dtl", [ - date_range("1995-01-01 00:00:00", periods=5, freq="s"), - date_range("1995-01-01 00:00:00", periods=5, - freq="s", tz="US/Eastern"), - timedelta_range("1 day", periods=5, freq="s") - ]) + @pytest.mark.parametrize( + "dtl", + [ + date_range("1995-01-01 00:00:00", periods=5, freq="s"), + date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"), + timedelta_range("1 day", periods=5, freq="s"), + ], + ) def test_constructor_with_datetimelike(self, dtl): # see gh-12077 # constructor with a datetimelike and NaT @@ -310,8 +331,7 @@ def test_constructor_with_datetimelike(self, dtl): assert "NaT" in result def test_constructor_from_index_series_datetimetz(self): - idx = date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') + idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -319,7 +339,7 @@ def test_constructor_from_index_series_datetimetz(self): tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_timedelta(self): - idx = timedelta_range('1 days', freq='D', periods=3) + idx = timedelta_range("1 days", freq="D", periods=3) result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -327,7 +347,7 @@ def test_constructor_from_index_series_timedelta(self): tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_period(self): - idx = period_range('2015-01-01', freq='D', periods=3) + idx = period_range("2015-01-01", freq="D", periods=3) result = Categorical(idx) tm.assert_index_equal(result.categories, idx) @@ -337,50 +357,52 @@ def test_constructor_from_index_series_period(self): def test_constructor_invariant(self): # GH 14190 vals = [ - np.array([1., 1.2, 1.8, np.nan]), - np.array([1, 2, 3], dtype='int64'), - ['a', 'b', 'c', np.nan], - [pd.Period('2014-01'), pd.Period('2014-02'), NaT], - [Timestamp('2014-01-01'), Timestamp('2014-01-02'), NaT], - [Timestamp('2014-01-01', tz='US/Eastern'), - Timestamp('2014-01-02', tz='US/Eastern'), NaT], + np.array([1.0, 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype="int64"), + ["a", "b", "c", np.nan], + [pd.Period("2014-01"), pd.Period("2014-02"), NaT], + [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT], + [ + Timestamp("2014-01-01", tz="US/Eastern"), + Timestamp("2014-01-02", tz="US/Eastern"), + NaT, + ], ] for val in vals: c = Categorical(val) c2 = Categorical(c) tm.assert_categorical_equal(c, c2) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("ordered", [True, False]) def test_constructor_with_dtype(self, ordered): - categories = ['b', 'a', 'c'] + categories = ["b", "a", "c"] dtype = CategoricalDtype(categories, ordered=ordered) - result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype) - expected = Categorical(['a', 'b', 'a', 'c'], categories=categories, - ordered=ordered) + result = Categorical(["a", "b", "a", "c"], dtype=dtype) + expected = Categorical( + ["a", "b", "a", "c"], categories=categories, ordered=ordered + ) tm.assert_categorical_equal(result, expected) assert result.ordered is ordered def test_constructor_dtype_and_others_raises(self): - dtype = CategoricalDtype(['a', 'b'], ordered=True) + dtype = CategoricalDtype(["a", "b"], ordered=True) msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype) + Categorical(["a", "b"], categories=["a", "b"], dtype=dtype) with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], ordered=True, dtype=dtype) + Categorical(["a", "b"], ordered=True, dtype=dtype) with pytest.raises(ValueError, match=msg): - Categorical(['a', 'b'], ordered=False, dtype=dtype) + Categorical(["a", "b"], ordered=False, dtype=dtype) - @pytest.mark.parametrize('categories', [ - None, ['a', 'b'], ['a', 'c'], - ]) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]]) + @pytest.mark.parametrize("ordered", [True, False]) def test_constructor_str_category(self, categories, ordered): - result = Categorical(['a', 'b'], categories=categories, - ordered=ordered, dtype='category') - expected = Categorical(['a', 'b'], categories=categories, - ordered=ordered) + result = Categorical( + ["a", "b"], categories=categories, ordered=ordered, dtype="category" + ) + expected = Categorical(["a", "b"], categories=categories, ordered=ordered) tm.assert_categorical_equal(result, expected) def test_constructor_str_unknown(self): @@ -388,46 +410,48 @@ def test_constructor_str_unknown(self): Categorical([1, 2], dtype="foo") def test_constructor_from_categorical_with_dtype(self): - dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True) - values = Categorical(['a', 'b', 'd']) + dtype = CategoricalDtype(["a", "b", "c"], ordered=True) + values = Categorical(["a", "b", "d"]) result = Categorical(values, dtype=dtype) # We use dtype.categories, not values.categories - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], - ordered=True) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_constructor_from_categorical_with_unknown_dtype(self): dtype = CategoricalDtype(None, ordered=True) - values = Categorical(['a', 'b', 'd']) + values = Categorical(["a", "b", "d"]) result = Categorical(values, dtype=dtype) # We use values.categories, not dtype.categories - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'], - ordered=True) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "d"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_constructor_from_categorical_string(self): - values = Categorical(['a', 'b', 'd']) + values = Categorical(["a", "b", "d"]) # use categories, ordered - result = Categorical(values, categories=['a', 'b', 'c'], ordered=True, - dtype='category') - expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], - ordered=True) + result = Categorical( + values, categories=["a", "b", "c"], ordered=True, dtype="category" + ) + expected = Categorical( + ["a", "b", "d"], categories=["a", "b", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) # No string - result = Categorical(values, categories=['a', 'b', 'c'], ordered=True) + result = Categorical(values, categories=["a", "b", "c"], ordered=True) tm.assert_categorical_equal(result, expected) def test_constructor_with_categorical_categories(self): # GH17884 - expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) - result = Categorical( - ['a', 'b'], categories=Categorical(['a', 'b', 'c'])) + result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"])) tm.assert_categorical_equal(result, 
expected) - result = Categorical( - ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c'])) + result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"])) tm.assert_categorical_equal(result, expected) def test_from_codes(self): @@ -448,13 +472,11 @@ def test_from_codes(self): Categorical.from_codes(["a"], dtype=dtype) # no unique categories - with pytest.raises(ValueError, - match="Categorical categories must be unique"): + with pytest.raises(ValueError, match="Categorical categories must be unique"): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) # NaN categories included - with pytest.raises(ValueError, - match="Categorial categories cannot be null"): + with pytest.raises(ValueError, match="Categorial categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) # too negative @@ -474,103 +496,100 @@ def test_from_codes(self): def test_from_codes_with_categorical_categories(self): # GH17884 - expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(["a", "b"], categories=["a", "b", "c"]) - result = Categorical.from_codes( - [0, 1], categories=Categorical(['a', 'b', 'c'])) + result = Categorical.from_codes([0, 1], categories=Categorical(["a", "b", "c"])) tm.assert_categorical_equal(result, expected) result = Categorical.from_codes( - [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) + [0, 1], categories=CategoricalIndex(["a", "b", "c"]) + ) tm.assert_categorical_equal(result, expected) # non-unique Categorical still raises - with pytest.raises(ValueError, - match="Categorical categories must be unique"): - Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) + with pytest.raises(ValueError, match="Categorical categories must be unique"): + Categorical.from_codes([0, 1], Categorical(["a", "b", "a"])) def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] - dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + dtype = CategoricalDtype(categories=["a", "b", "c"]) + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, categories=dtype.categories) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_float(self): # GH21767 codes = [1.0, 2.0, 0] # integer, but in float dtype - dtype = CategoricalDtype(categories=['a', 'b', 'c']) + dtype = CategoricalDtype(categories=["a", "b", "c"]) with tm.assert_produces_warning(FutureWarning): cat = Categorical.from_codes(codes, dtype.categories) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) with tm.assert_produces_warning(FutureWarning): cat = Categorical.from_codes(codes, dtype=dtype) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="i1")) codes = [1.1, 2.0, 0] # non-integer - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype.categories) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): + with pytest.raises(ValueError, match="codes need to be array-like 
integers"): Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_dtype_raises(self): - msg = 'Cannot specify' + msg = "Cannot specify" with pytest.raises(ValueError, match=msg): - Categorical.from_codes([0, 1], categories=['a', 'b'], - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes( + [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"]) + ) with pytest.raises(ValueError, match=msg): - Categorical.from_codes([0, 1], ordered=True, - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes( + [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"]) + ) def test_from_codes_neither(self): msg = "Both were None" with pytest.raises(ValueError, match=msg): Categorical.from_codes([0, 1]) - @pytest.mark.parametrize('dtype', [None, 'category']) + @pytest.mark.parametrize("dtype", [None, "category"]) def test_from_inferred_categories(self, dtype): - cats = ['a', 'b'] - codes = np.array([0, 0, 1, 1], dtype='i8') + cats = ["a", "b"] + codes = np.array([0, 0, 1, 1], dtype="i8") result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical.from_codes(codes, cats) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('dtype', [None, 'category']) + @pytest.mark.parametrize("dtype", [None, "category"]) def test_from_inferred_categories_sorts(self, dtype): - cats = ['b', 'a'] - codes = np.array([0, 1, 1, 1], dtype='i8') + cats = ["b", "a"] + codes = np.array([0, 1, 1, 1], dtype="i8") result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) + expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"]) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_dtype(self): - cats = ['a', 'b', 'd'] - codes = np.array([0, 1, 0, 2], dtype='i8') - dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) + cats = ["a", "b", "d"] + codes = np.array([0, 1, 0, 2], dtype="i8") + dtype = CategoricalDtype(["c", "b", "a"], ordered=True) result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical(['a', 'b', 'a', 'd'], - categories=['c', 'b', 'a'], - ordered=True) + expected = Categorical( + ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_coerces(self): - cats = ['1', '2', 'bad'] - codes = np.array([0, 0, 1, 2], dtype='i8') + cats = ["1", "2", "bad"] + codes = np.array([0, 0, 1, 2], dtype="i8") dtype = CategoricalDtype([1, 2]) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('ordered', [None, True, False]) + @pytest.mark.parametrize("ordered", [None, True, False]) def test_construction_with_ordered(self, ordered): # GH 9347, 9190 cat = Categorical([0, 1, 2], ordered=ordered) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 14ad3c4d5e860..be64b1f28c733 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -8,24 +8,22 @@ class TestCategoricalDtypes: - def test_is_equal_dtype(self): # test dtype comparisons between cats - c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False) - c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False) - c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True) + c1 = 
Categorical(list("aabca"), categories=list("abc"), ordered=False) + c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False) + c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True) assert c1.is_dtype_equal(c1) assert c2.is_dtype_equal(c2) assert c3.is_dtype_equal(c3) assert c1.is_dtype_equal(c2) assert not c1.is_dtype_equal(c3) - assert not c1.is_dtype_equal(Index(list('aabca'))) + assert not c1.is_dtype_equal(Index(list("aabca"))) assert not c1.is_dtype_equal(c1.astype(object)) assert c1.is_dtype_equal(CategoricalIndex(c1)) - assert (c1.is_dtype_equal( - CategoricalIndex(c1, categories=list('cab')))) + assert c1.is_dtype_equal(CategoricalIndex(c1, categories=list("cab"))) assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) # GH 16659 @@ -40,83 +38,85 @@ def test_is_equal_dtype(self): assert not c1.is_dtype_equal(s1.astype(object)) def test_set_dtype_same(self): - c = Categorical(['a', 'b', 'c']) - result = c._set_dtype(CategoricalDtype(['a', 'b', 'c'])) + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(["a", "b", "c"])) tm.assert_categorical_equal(result, c) def test_set_dtype_new_categories(self): - c = Categorical(['a', 'b', 'c']) - result = c._set_dtype(CategoricalDtype(list('abcd'))) + c = Categorical(["a", "b", "c"]) + result = c._set_dtype(CategoricalDtype(list("abcd"))) tm.assert_numpy_array_equal(result.codes, c.codes) - tm.assert_index_equal(result.dtype.categories, Index(list('abcd'))) - - @pytest.mark.parametrize('values, categories, new_categories', [ - # No NaNs, same cats, same order - (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],), - # Same, unsorted - (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],), - # No NaNs, same cats, different order - (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],), - # NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']), - (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']), - # Introduce NaNs - (['a', 'b', 'c'], ['a', 'b'], ['a']), - (['a', 'b', 'c'], ['a', 'b'], ['b']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - (['b', 'a', 'c'], ['a', 'b'], ['a']), - # No overlap - (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']), - ]) - @pytest.mark.parametrize('ordered', [True, False]) - def test_set_dtype_many(self, values, categories, new_categories, - ordered): + tm.assert_index_equal(result.dtype.categories, Index(list("abcd"))) + + @pytest.mark.parametrize( + "values, categories, new_categories", + [ + # No NaNs, same cats, same order + (["a", "b", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["a", "b", "a"], ["a", "b"], ["b", "a"]), + # Same, unsorted + (["b", "a", "a"], ["a", "b"], ["a", "b"]), + # No NaNs, same cats, different order + (["b", "a", "a"], ["a", "b"], ["b", "a"]), + # NaNs + (["a", "b", "c"], ["a", "b"], ["a", "b"]), + (["a", "b", "c"], ["a", "b"], ["b", "a"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + (["b", "a", "c"], ["a", "b"], ["a", "b"]), + # Introduce NaNs + (["a", "b", "c"], ["a", "b"], ["a"]), + (["a", "b", "c"], ["a", "b"], ["b"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + (["b", "a", "c"], ["a", "b"], ["a"]), + # No overlap + (["a", "b", "c"], ["a", "b"], ["d", "e"]), + ], + ) + @pytest.mark.parametrize("ordered", [True, False]) + def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, 
ordered) result = c._set_dtype(expected.dtype) tm.assert_categorical_equal(result, expected) def test_set_dtype_no_overlap(self): - c = Categorical(['a', 'b', 'c'], ['d', 'e']) - result = c._set_dtype(CategoricalDtype(['a', 'b'])) - expected = Categorical([None, None, None], categories=['a', 'b']) + c = Categorical(["a", "b", "c"], ["d", "e"]) + result = c._set_dtype(CategoricalDtype(["a", "b"])) + expected = Categorical([None, None, None], categories=["a", "b"]) tm.assert_categorical_equal(result, expected) def test_codes_dtypes(self): # GH 8453 - result = Categorical(['foo', 'bar', 'baz']) - assert result.codes.dtype == 'int8' + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" - result = Categorical(['foo%05d' % i for i in range(400)]) - assert result.codes.dtype == 'int16' + result = Categorical(["foo%05d" % i for i in range(400)]) + assert result.codes.dtype == "int16" - result = Categorical(['foo%05d' % i for i in range(40000)]) - assert result.codes.dtype == 'int32' + result = Categorical(["foo%05d" % i for i in range(40000)]) + assert result.codes.dtype == "int32" # adding cats - result = Categorical(['foo', 'bar', 'baz']) - assert result.codes.dtype == 'int8' - result = result.add_categories(['foo%05d' % i for i in range(400)]) - assert result.codes.dtype == 'int16' + result = Categorical(["foo", "bar", "baz"]) + assert result.codes.dtype == "int8" + result = result.add_categories(["foo%05d" % i for i in range(400)]) + assert result.codes.dtype == "int16" # removing cats - result = result.remove_categories(['foo%05d' % i for i in range(300)]) - assert result.codes.dtype == 'int8' + result = result.remove_categories(["foo%05d" % i for i in range(300)]) + assert result.codes.dtype == "int8" - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("ordered", [True, False]) def test_astype(self, ordered): # string - cat = Categorical(list('abbaaccc'), ordered=ordered) + cat = Categorical(list("abbaaccc"), ordered=ordered) result = cat.astype(object) expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = 'could not convert string to float' + msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): cat.astype(float) @@ -134,37 +134,36 @@ def test_astype(self, ordered): expected = np.array(cat, dtype=np.float) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('cat_ordered', [True, False]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("cat_ordered", [True, False]) def test_astype_category(self, dtype_ordered, cat_ordered): # GH 10696/18593 - data = list('abcaacbab') - cat = Categorical(data, categories=list('bac'), ordered=cat_ordered) + data = list("abcaacbab") + cat = Categorical(data, categories=list("bac"), ordered=cat_ordered) # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = cat.astype(dtype) - expected = Categorical( - data, categories=cat.categories, ordered=dtype_ordered) + expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered) tm.assert_categorical_equal(result, expected) # non-standard categories - dtype = CategoricalDtype(list('adc'), dtype_ordered) + dtype = CategoricalDtype(list("adc"), dtype_ordered) result = cat.astype(dtype) expected = Categorical(data, dtype=dtype) tm.assert_categorical_equal(result, expected) if dtype_ordered is False: # dtype='category' can't specify ordered, so only test 
once - result = cat.astype('category') + result = cat.astype("category") expected = cat tm.assert_categorical_equal(result, expected) def test_astype_category_ordered_none_deprecated(self): # GH 26336 - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb')) - cat = Categorical(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb")) + cat = Categorical(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(FutureWarning): cat.astype(cdt2) @@ -175,7 +174,6 @@ def test_iter_python_types(self): assert isinstance(cat.tolist()[0], int) def test_iter_python_types_datetime(self): - cat = Categorical([Timestamp('2017-01-01'), - Timestamp('2017-01-02')]) + cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")]) assert isinstance(list(cat)[0], Timestamp) assert isinstance(cat.tolist()[0], Timestamp) diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 98f8ccb48c44b..6edd7fd00b707 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -9,73 +9,74 @@ class TestCategoricalIndexingWithFactor(TestCategorical): - def test_getitem(self): - assert self.factor[0] == 'a' - assert self.factor[-1] == 'c' + assert self.factor[0] == "a" + assert self.factor[-1] == "c" subf = self.factor[[0, 1, 2]] - tm.assert_numpy_array_equal(subf._codes, - np.array([0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8)) - subf = self.factor[np.asarray(self.factor) == 'c'] - tm.assert_numpy_array_equal(subf._codes, - np.array([2, 2, 2], dtype=np.int8)) + subf = self.factor[np.asarray(self.factor) == "c"] + tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8)) def test_setitem(self): # int/positional c = self.factor.copy() - c[0] = 'b' - assert c[0] == 'b' - c[-1] = 'a' - assert c[-1] == 'a' + c[0] = "b" + assert c[0] == "b" + c[-1] = "a" + assert c[-1] == "a" # boolean c = self.factor.copy() - indexer = np.zeros(len(c), dtype='bool') + indexer = np.zeros(len(c), dtype="bool") indexer[0] = True indexer[-1] = True - c[indexer] = 'c' - expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], - ordered=True) + c[indexer] = "c" + expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(c, expected) - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a']), - pd.Categorical(['b', 'a'], categories=['b', 'a']), - ]) + @pytest.mark.parametrize( + "other", + [pd.Categorical(["b", "a"]), pd.Categorical(["b", "a"], categories=["b", "a"])], + ) def test_setitem_same_but_unordered(self, other): # GH-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + target = pd.Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) target[mask] = other[mask] - expected = pd.Categorical(['b', 'b'], categories=['a', 'b']) + expected = pd.Categorical(["b", "b"], categories=["a", "b"]) tm.assert_categorical_equal(target, expected) - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a'], categories=['b', 'a', 'c']), - pd.Categorical(['b', 'a'], categories=['a', 'b', 'c']), - pd.Categorical(['a', 'a'], categories=['a']), - pd.Categorical(['b', 'b'], categories=['b']), - ]) + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"], categories=["b", "a", "c"]), + 
pd.Categorical(["b", "a"], categories=["a", "b", "c"]), + pd.Categorical(["a", "a"], categories=["a"]), + pd.Categorical(["b", "b"], categories=["b"]), + ], + ) def test_setitem_different_unordered_raises(self, other): # GH-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b']) + target = pd.Categorical(["a", "b"], categories=["a", "b"]) mask = np.array([True, False]) with pytest.raises(ValueError): target[mask] = other[mask] - @pytest.mark.parametrize('other', [ - pd.Categorical(['b', 'a']), - pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True), - pd.Categorical(['b', 'a'], categories=['a', 'b', 'c'], ordered=True), - ]) + @pytest.mark.parametrize( + "other", + [ + pd.Categorical(["b", "a"]), + pd.Categorical(["b", "a"], categories=["b", "a"], ordered=True), + pd.Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True), + ], + ) def test_setitem_same_ordered_rasies(self, other): # Gh-24142 - target = pd.Categorical(['a', 'b'], categories=['a', 'b'], - ordered=True) + target = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True) mask = np.array([True, False]) with pytest.raises(ValueError): @@ -83,7 +84,6 @@ def test_setitem_same_ordered_rasies(self, other): class TestCategoricalIndexing: - def test_getitem_listlike(self): # GH 9469 @@ -95,31 +95,53 @@ def test_getitem_listlike(self): tm.assert_numpy_array_equal(result, expected) def test_periodindex(self): - idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', - '2014-03', '2014-03'], freq='M') + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) cat1 = Categorical(idx1) str(cat1) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) - exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") tm.assert_numpy_array_equal(cat1._codes, exp_arr) tm.assert_index_equal(cat1.categories, exp_idx) - idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', - '2014-03', '2014-01'], freq='M') + idx2 = PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) cat2 = Categorical(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) - exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") tm.assert_numpy_array_equal(cat2._codes, exp_arr) tm.assert_index_equal(cat2.categories, exp_idx2) - idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', - '2013-08', '2013-07', '2013-05'], freq='M') + idx3 = PeriodIndex( + [ + "2013-12", + "2013-11", + "2013-10", + "2013-09", + "2013-08", + "2013-07", + "2013-05", + ], + freq="M", + ) cat3 = Categorical(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) - exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', - '2013-10', '2013-11', '2013-12'], freq='M') + exp_idx = PeriodIndex( + [ + "2013-05", + "2013-07", + "2013-08", + "2013-09", + "2013-10", + "2013-11", + "2013-12", + ], + freq="M", + ) tm.assert_numpy_array_equal(cat3._codes, exp_arr) tm.assert_index_equal(cat3.categories, exp_idx) @@ -139,8 +161,9 @@ def test_categories_assigments(self): s.categories = [1, 2] # Combinations of sorted/unique: - @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], - [1, 3, 3, 4], [1, 2, 2, 4]]) + @pytest.mark.parametrize( + "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]] + ) # Combinations of missing/unique 
@pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) @@ -148,7 +171,7 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): # GH 21448 key = key_class(key_values, categories=range(1, 5)) # Test for flat index and CategoricalIndex with same/different cats: - for dtype in None, 'category', key.dtype: + for dtype in None, "category", key.dtype: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) @@ -157,57 +180,51 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(exp_miss, res_miss) def test_where_unobserved_nan(self): - ser = pd.Series(pd.Categorical(['a', 'b'])) + ser = pd.Series(pd.Categorical(["a", "b"])) result = ser.where([True, False]) - expected = pd.Series(pd.Categorical(['a', None], - categories=['a', 'b'])) + expected = pd.Series(pd.Categorical(["a", None], categories=["a", "b"])) tm.assert_series_equal(result, expected) # all NA - ser = pd.Series(pd.Categorical(['a', 'b'])) + ser = pd.Series(pd.Categorical(["a", "b"])) result = ser.where([False, False]) - expected = pd.Series(pd.Categorical([None, None], - categories=['a', 'b'])) + expected = pd.Series(pd.Categorical([None, None], categories=["a", "b"])) tm.assert_series_equal(result, expected) def test_where_unobserved_categories(self): - ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) - ) - result = ser.where([True, True, False], other='b') + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + result = ser.where([True, True, False], other="b") expected = pd.Series( - Categorical(['a', 'b', 'b'], categories=ser.cat.categories) + Categorical(["a", "b", "b"], categories=ser.cat.categories) ) tm.assert_series_equal(result, expected) def test_where_other_categorical(self): - ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a']) - ) - other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd']) + ser = pd.Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"])) + other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"]) result = ser.where([True, False, True], other) - expected = pd.Series(Categorical(['a', 'c', 'c'], dtype=ser.dtype)) + expected = pd.Series(Categorical(["a", "c", "c"], dtype=ser.dtype)) tm.assert_series_equal(result, expected) def test_where_warns(self): - ser = pd.Series(Categorical(['a', 'b', 'c'])) + ser = pd.Series(Categorical(["a", "b", "c"])) with tm.assert_produces_warning(FutureWarning): - result = ser.where([True, False, True], 'd') + result = ser.where([True, False, True], "d") - expected = pd.Series(np.array(['a', 'd', 'c'], dtype='object')) + expected = pd.Series(np.array(["a", "d", "c"], dtype="object")) tm.assert_series_equal(result, expected) def test_where_ordered_differs_rasies(self): ser = pd.Series( - Categorical(['a', 'b', 'c'], categories=['d', 'c', 'b', 'a'], - ordered=True) + Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True) + ) + other = Categorical( + ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True ) - other = Categorical(['b', 'c', 'a'], categories=['a', 'c', 'b', 'd'], - ordered=True) with tm.assert_produces_warning(FutureWarning): result = ser.where([True, False, True], other) - expected = pd.Series(np.array(['a', 'c', 'c'], dtype=object)) + expected = 
pd.Series(np.array(["a", "c", "c"], dtype=object)) tm.assert_series_equal(result, expected) @@ -220,7 +237,7 @@ def test_mask_with_boolean(index): assert com.is_bool_indexer(idx) result = s[idx] - expected = s[idx.astype('object')] + expected = s[idx.astype("object")] tm.assert_series_equal(result, expected) @@ -231,7 +248,7 @@ def test_mask_with_boolean_raises(index): if index: idx = CategoricalIndex(idx) - with pytest.raises(ValueError, match='NA / NaN'): + with pytest.raises(ValueError, match="NA / NaN"): s[idx] @@ -256,7 +273,7 @@ def array(self, dtype=None): def test_series_at(non_coercible_categorical): - arr = Categorical(['a', 'b', 'c']) + arr = Categorical(["a", "b", "c"]) ser = Series(arr) result = ser.at[0] - assert result == 'a' + assert result == "a" diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index eaf6606b5d63f..1b62479530d24 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -10,7 +10,6 @@ class TestCategoricalMissing: - def test_na_flags_int_categories(self): # #1457 @@ -28,25 +27,21 @@ def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a", "b", np.nan, "a"]) tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) # Adding nan to categories should make assigned nan point to the # category! c = Categorical(["a", "b", np.nan, "a"]) tm.assert_index_equal(c.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], - dtype=np.int8)) + tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) def test_set_dtype_nans(self): - c = Categorical(['a', 'b', np.nan]) - result = c._set_dtype(CategoricalDtype(['a', 'c'])) - tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], - dtype='int8')) + c = Categorical(["a", "b", np.nan]) + result = c._set_dtype(CategoricalDtype(["a", "c"])) + tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8")) def test_set_item_nan(self): cat = Categorical([1, 2, 3]) @@ -55,14 +50,17 @@ def test_set_item_nan(self): exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp) - @pytest.mark.parametrize('fillna_kwargs, msg', [ - (dict(value=1, method='ffill'), - "Cannot specify both 'value' and 'method'."), - (dict(), - "Must specify a fill 'value' or 'method'."), - (dict(method='bad'), - "Invalid fill method. Expecting .* bad"), - ]) + @pytest.mark.parametrize( + "fillna_kwargs, msg", + [ + ( + dict(value=1, method="ffill"), + "Cannot specify both 'value' and 'method'.", + ), + (dict(), "Must specify a fill 'value' or 'method'."), + (dict(method="bad"), "Invalid fill method. 
Expecting .* bad"), + ], + ) def test_fillna_raises(self, fillna_kwargs, msg): # https://github.com/pandas-dev/pandas/issues/19682 cat = Categorical([1, 2, 3]) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index af1d3ca0f9ad4..cd8ec7fcb787d 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -11,35 +11,33 @@ class TestCategoricalOpsWithFactor(TestCategorical): - def test_categories_none_comparisons(self): - factor = Categorical(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], ordered=True) + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, self.factor) def test_comparisons(self): - result = self.factor[self.factor == 'a'] - expected = self.factor[np.asarray(self.factor) == 'a'] + result = self.factor[self.factor == "a"] + expected = self.factor[np.asarray(self.factor) == "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor != 'a'] - expected = self.factor[np.asarray(self.factor) != 'a'] + result = self.factor[self.factor != "a"] + expected = self.factor[np.asarray(self.factor) != "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor < 'c'] - expected = self.factor[np.asarray(self.factor) < 'c'] + result = self.factor[self.factor < "c"] + expected = self.factor[np.asarray(self.factor) < "c"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor > 'a'] - expected = self.factor[np.asarray(self.factor) > 'a'] + result = self.factor[self.factor > "a"] + expected = self.factor[np.asarray(self.factor) > "a"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor >= 'b'] - expected = self.factor[np.asarray(self.factor) >= 'b'] + result = self.factor[self.factor >= "b"] + expected = self.factor[np.asarray(self.factor) >= "b"] tm.assert_categorical_equal(result, expected) - result = self.factor[self.factor <= 'b'] - expected = self.factor[np.asarray(self.factor) <= 'b'] + result = self.factor[self.factor <= "b"] + expected = self.factor[np.asarray(self.factor) <= "b"] tm.assert_categorical_equal(result, expected) n = len(self.factor) @@ -49,18 +47,17 @@ def test_comparisons(self): expected = np.asarray(self.factor) == np.asarray(other) tm.assert_numpy_array_equal(result, expected) - result = self.factor == 'd' + result = self.factor == "d" expected = np.repeat(False, len(self.factor)) tm.assert_numpy_array_equal(result, expected) # comparisons with categoricals - cat_rev = Categorical( - ["a", "b", "c"], categories=["c", "b", "a"], ordered=True) + cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True) cat_rev_base = Categorical( - ["b", "b", "b"], categories=["c", "b", "a"], ordered=True) + ["b", "b", "b"], categories=["c", "b", "a"], ordered=True + ) cat = Categorical(["a", "b", "c"], ordered=True) - cat_base = Categorical( - ["b", "b", "b"], categories=cat.categories, ordered=True) + cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True) # comparisons need to take categories ordering into account res_rev = cat_rev > cat_rev_base @@ -79,8 +76,7 @@ def test_comparisons(self): with pytest.raises(TypeError): cat > cat_rev - cat_rev_base2 = Categorical( - ["b", "b", "b"], categories=["c", "b", "a", "d"]) + cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"]) with pytest.raises(TypeError): cat_rev > cat_rev_base2 @@ 
-94,8 +90,10 @@ def test_comparisons(self): # comparison (in both directions) with Series will raise s = Series(["b", "b", "b"]) - msg = ("Cannot compare a Categorical for op __gt__ with type" - r" ") + msg = ( + "Cannot compare a Categorical for op __gt__ with type" + r" " + ) with pytest.raises(TypeError, match=msg): cat > s with pytest.raises(TypeError, match=msg): @@ -115,8 +113,7 @@ def test_comparisons(self): # Make sure that unequal comparison take the categories order in # account - cat_rev = Categorical( - list("abc"), categories=list("cba"), ordered=True) + cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True) exp = np.array([True, False, False]) res = cat_rev > "b" tm.assert_numpy_array_equal(res, exp) @@ -127,7 +124,6 @@ def test_comparisons(self): class TestCategoricalOps: - def test_compare_frame(self): # GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame data = ["a", "b", 2, "a"] @@ -135,8 +131,14 @@ def test_compare_frame(self): df = DataFrame(cat) - for op in [operator.eq, operator.ne, operator.ge, - operator.gt, operator.le, operator.lt]: + for op in [ + operator.eq, + operator.ne, + operator.ge, + operator.gt, + operator.le, + operator.lt, + ]: with pytest.raises(ValueError): # alignment raises unless we transpose op(cat, df) @@ -150,19 +152,15 @@ def test_compare_frame(self): tm.assert_frame_equal(result, expected) def test_datetime_categorical_comparison(self): - dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True) - tm.assert_numpy_array_equal(dt_cat > dt_cat[0], - np.array([False, True, True])) - tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, - np.array([False, True, True])) + dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True) + tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True])) def test_reflected_comparison_with_scalars(self): # GH8658 cat = Categorical([1, 2, 3], ordered=True) - tm.assert_numpy_array_equal(cat > cat[0], - np.array([False, True, True])) - tm.assert_numpy_array_equal(cat[0] < cat, - np.array([False, True, True])) + tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True])) + tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True])) def test_comparison_with_unknown_scalars(self): # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057 @@ -170,24 +168,25 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - msg = ("Cannot compare a Categorical for op __{}__ with a scalar," - " which is not a category") - with pytest.raises(TypeError, match=msg.format('lt')): + msg = ( + "Cannot compare a Categorical for op __{}__ with a scalar," + " which is not a category" + ) + with pytest.raises(TypeError, match=msg.format("lt")): cat < 4 - with pytest.raises(TypeError, match=msg.format('gt')): + with pytest.raises(TypeError, match=msg.format("gt")): cat > 4 - with pytest.raises(TypeError, match=msg.format('gt')): + with pytest.raises(TypeError, match=msg.format("gt")): 4 < cat - with pytest.raises(TypeError, match=msg.format('lt')): + with pytest.raises(TypeError, match=msg.format("lt")): 4 > cat - tm.assert_numpy_array_equal(cat == 4, - np.array([False, False, False])) - tm.assert_numpy_array_equal(cat != 4, - np.array([True, True, True])) + tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) + tm.assert_numpy_array_equal(cat != 
4, np.array([True, True, True])) def test_comparison_of_ordered_categorical_with_nan_to_scalar( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne + ): # https://github.com/pandas-dev/pandas/issues/26504 # BUG: fix ordered categorical comparison with missing values (#26504 ) # and following comparisons with scalars in categories with missing @@ -197,13 +196,13 @@ def test_comparison_of_ordered_categorical_with_nan_to_scalar( scalar = 2 with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - expected = getattr(np.array(cat), - compare_operators_no_eq_ne)(scalar) + expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar) actual = getattr(cat, compare_operators_no_eq_ne)(scalar) tm.assert_numpy_array_equal(actual, expected) def test_comparison_of_ordered_categorical_with_nan_to_listlike( - self, compare_operators_no_eq_ne): + self, compare_operators_no_eq_ne + ): # https://github.com/pandas-dev/pandas/issues/26504 # and following comparisons of missing values in ordered Categorical # with listlike should be evaluated as False @@ -216,18 +215,17 @@ def test_comparison_of_ordered_categorical_with_nan_to_listlike( actual = getattr(cat, compare_operators_no_eq_ne)(other) tm.assert_numpy_array_equal(actual, expected) - @pytest.mark.parametrize('data,reverse,base', [ - (list("abc"), list("cba"), list("bbb")), - ([1, 2, 3], [3, 2, 1], [2, 2, 2])] + @pytest.mark.parametrize( + "data,reverse,base", + [(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])], ) def test_comparisons(self, data, reverse, base): - cat_rev = Series( - Categorical(data, categories=reverse, ordered=True)) - cat_rev_base = Series( - Categorical(base, categories=reverse, ordered=True)) + cat_rev = Series(Categorical(data, categories=reverse, ordered=True)) + cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True)) cat = Series(Categorical(data, ordered=True)) cat_base = Series( - Categorical(base, categories=cat.cat.categories, ordered=True)) + Categorical(base, categories=cat.cat.categories, ordered=True) + ) s = Series(base) a = np.array(base) @@ -262,8 +260,10 @@ def test_comparisons(self, data, reverse, base): # categorical cannot be compared to Series or numpy array, and also # not the other way around - msg = ("Cannot compare a Categorical for op __gt__ with type" - r" ") + msg = ( + "Cannot compare a Categorical for op __gt__ with type" + r" " + ) with pytest.raises(TypeError, match=msg): cat > s with pytest.raises(TypeError, match=msg): @@ -283,40 +283,42 @@ def test_comparisons(self, data, reverse, base): with pytest.raises(TypeError, match=msg): a < cat_rev - @pytest.mark.parametrize('ctor', [ - lambda *args, **kwargs: Categorical(*args, **kwargs), - lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), - ]) + @pytest.mark.parametrize( + "ctor", + [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ], + ) def test_unordered_different_order_equal(self, ctor): # https://github.com/pandas-dev/pandas/issues/16014 - c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) assert (c1 == c2).all() - c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "b"], categories=["a", "b"], 
ordered=False) + c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False) assert (c1 != c2).all() - c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) - c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False) assert (c1 != c2).all() - c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) - c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False) + c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False) result = c1 == c2 tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) def test_unordered_different_categories_raises(self): - c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False) + c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False) - with pytest.raises(TypeError, match=("Categoricals can " - "only be compared")): + with pytest.raises(TypeError, match=("Categoricals can " "only be compared")): c1 == c2 def test_compare_different_lengths(self): - c1 = Categorical([], categories=['a', 'b']) - c2 = Categorical([], categories=['a']) + c1 = Categorical([], categories=["a", "b"]) + c2 = Categorical([], categories=["a"]) msg = "Categories are different lengths" with pytest.raises(TypeError, match=msg): @@ -325,33 +327,36 @@ def test_compare_different_lengths(self): def test_compare_unordered_different_order(self): # https://github.com/pandas-dev/pandas/issues/16603#issuecomment- # 349290078 - a = pd.Categorical(['a'], categories=['a', 'b']) - b = pd.Categorical(['b'], categories=['b', 'a']) + a = pd.Categorical(["a"], categories=["a", "b"]) + b = pd.Categorical(["b"], categories=["b", "a"]) assert not a.equals(b) def test_numeric_like_ops(self): - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) # numeric ops should not succeed - for op, str_rep in [('__add__', r'\+'), - ('__sub__', '-'), - ('__mul__', r'\*'), - ('__truediv__', '/')]: + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: msg = r"Series cannot perform the operation {}".format(str_rep) with pytest.raises(TypeError, match=msg): getattr(df, op)(df) # reduction ops should not succeed (unless specifically defined, e.g. 
# min/max) - s = df['value_group'] - for op in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']: + s = df["value_group"] + for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: msg = "Categorical cannot perform the operation {}".format(op) with pytest.raises(TypeError, match=msg): getattr(s, op)(numeric_only=False) @@ -364,10 +369,12 @@ def test_numeric_like_ops(self): np.sum(s) # numeric ops on a Series - for op, str_rep in [('__add__', r'\+'), - ('__sub__', '-'), - ('__mul__', r'\*'), - ('__truediv__', '/')]: + for op, str_rep in [ + ("__add__", r"\+"), + ("__sub__", "-"), + ("__mul__", r"\*"), + ("__truediv__", "/"), + ]: msg = r"Series cannot perform the operation {}".format(str_rep) with pytest.raises(TypeError, match=msg): getattr(s, op)(2) @@ -378,10 +385,10 @@ def test_numeric_like_ops(self): def test_contains(self): # GH21508 - c = pd.Categorical(list('aabbca'), categories=list('cab')) + c = pd.Categorical(list("aabbca"), categories=list("cab")) - assert 'b' in c - assert 'z' not in c + assert "b" in c + assert "z" not in c assert np.nan not in c with pytest.raises(TypeError): assert [1] in c @@ -390,16 +397,21 @@ def test_contains(self): assert 0 not in c assert 1 not in c - c = pd.Categorical(list('aabbca') + [np.nan], categories=list('cab')) + c = pd.Categorical(list("aabbca") + [np.nan], categories=list("cab")) assert np.nan in c - @pytest.mark.parametrize('item, expected', [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ('a', False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False)], ids=str) + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) def test_contains_interval(self, item, expected): # GH 23705 cat = Categorical(pd.IntervalIndex.from_breaks(range(3))) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d4ae50dcdaa5d..9321813b42b33 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,28 +1,33 @@ import numpy as np from pandas import ( - Categorical, CategoricalIndex, Series, date_range, option_context, - period_range, timedelta_range) + Categorical, + CategoricalIndex, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) from pandas.tests.arrays.categorical.common import TestCategorical class TestCategoricalReprWithFactor(TestCategorical): - def test_print(self): - expected = ["[a, b, b, a, a, c, c, c]", - "Categories (3, object): [a < b < c]"] + expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"] expected = "\n".join(expected) actual = repr(self.factor) assert actual == expected class TestCategoricalRepr: - def test_big_print(self): - factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ['a', 'b', 'c'], - fastpath=True) - expected = ["[a, b, c, a, b, ..., b, c, a, b, c]", "Length: 600", - "Categories (3, object): [a, b, c]"] + factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True) + expected = [ + "[a, b, c, a, b, ..., b, c, a, b, c]", + "Length: 600", + "Categories (3, object): [a, b, c]", + ] expected = "\n".join(expected) actual = repr(factor) @@ -31,31 +36,33 @@ def test_big_print(self): def test_empty_print(self): factor = Categorical([], ["a", "b", "c"]) - expected = ("[], Categories (3, object): [a, b, c]") + expected = "[], Categories (3, 
object): [a, b, c]" actual = repr(factor) assert actual == expected assert expected == actual factor = Categorical([], ["a", "b", "c"], ordered=True) - expected = ("[], Categories (3, object): [a < b < c]") + expected = "[], Categories (3, object): [a < b < c]" actual = repr(factor) assert expected == actual factor = Categorical([], []) - expected = ("[], Categories (0, object): []") + expected = "[], Categories (0, object): []" assert expected == repr(factor) def test_print_none_width(self): # GH10087 a = Series(Categorical([1, 2, 3, 4])) - exp = ("0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) with option_context("display.width", None): assert exp == repr(a) def test_unicode_print(self): - c = Categorical(['aaaaa', 'bb', 'cccc'] * 20) + c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ [aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc] Length: 60 @@ -63,7 +70,7 @@ def test_unicode_print(self): assert repr(c) == expected - c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20) + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """\ [ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] Length: 60 @@ -73,9 +80,9 @@ def test_unicode_print(self): # unicode option should not affect to Categorical, as it doesn't care # the repr width - with option_context('display.unicode.east_asian_width', True): + with option_context("display.unicode.east_asian_width", True): - c = Categorical(['ああああ', 'いいいいい', 'ううううううう'] * 20) + c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] Length: 60 Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa @@ -137,7 +144,7 @@ def test_categorical_repr_ordered(self): assert repr(c) == exp def test_categorical_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) # TODO(wesm): exceeding 80 characters in the console is not good @@ -148,7 +155,9 @@ def test_categorical_repr_datetime(self): "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" " 2011-01-01 12:00:00, " - "2011-01-01 13:00:00]""") + "2011-01-01 13:00:00]" + "" + ) assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx) @@ -160,12 +169,12 @@ def test_categorical_repr_datetime(self): "Categories (5, datetime64[ns]): [2011-01-01 09:00:00, " "2011-01-01 10:00:00, 2011-01-01 11:00:00,\n" " 2011-01-01 12:00:00, " - "2011-01-01 13:00:00]") + "2011-01-01 13:00:00]" + ) assert repr(c) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") c = Categorical(idx) exp = ( "[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, " @@ -176,7 +185,8 @@ def test_categorical_repr_datetime(self): " " "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" " " - "2011-01-01 13:00:00-05:00]") + "2011-01-01 13:00:00-05:00]" + ) assert repr(c) == exp @@ -192,12 +202,13 @@ def test_categorical_repr_datetime(self): " " "2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n" " " - "2011-01-01 13:00:00-05:00]") + "2011-01-01 13:00:00-05:00]" + ) assert repr(c) == exp def test_categorical_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = 
date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < @@ -212,8 +223,7 @@ def test_categorical_repr_datetime_ordered(self): assert repr(c) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < @@ -242,7 +252,7 @@ def test_categorical_repr_int_with_nan(self): assert repr(s) == s_exp def test_categorical_repr_period(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, @@ -257,7 +267,7 @@ def test_categorical_repr_period(self): assert repr(c) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) c = Categorical(idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" @@ -271,7 +281,7 @@ def test_categorical_repr_period(self): assert repr(c) == exp def test_categorical_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < @@ -286,7 +296,7 @@ def test_categorical_repr_period_ordered(self): assert repr(c) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) c = Categorical(idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" @@ -300,7 +310,7 @@ def test_categorical_repr_period_ordered(self): assert repr(c) == exp def test_categorical_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) c = Categorical(idx) exp = """[1 days, 2 days, 3 days, 4 days, 5 days] Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" @@ -313,7 +323,7 @@ def test_categorical_repr_timedelta(self): assert repr(c) == exp - idx = timedelta_range('1 hours', periods=20) + idx = timedelta_range("1 hours", periods=20) c = Categorical(idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] Length: 20 @@ -333,7 +343,7 @@ def test_categorical_repr_timedelta(self): assert repr(c) == exp def test_categorical_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) c = Categorical(idx, ordered=True) exp = """[1 days, 
2 days, 3 days, 4 days, 5 days] Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa @@ -346,7 +356,7 @@ def test_categorical_repr_timedelta_ordered(self): assert repr(c) == exp - idx = timedelta_range('1 hours', periods=20) + idx = timedelta_range("1 hours", periods=20) c = Categorical(idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] Length: 20 @@ -384,7 +394,7 @@ def test_categorical_index_repr_ordered(self): assert repr(i) == exp def test_categorical_index_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -393,8 +403,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -404,7 +413,7 @@ def test_categorical_index_repr_datetime(self): assert repr(i) == exp def test_categorical_index_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', @@ -413,8 +422,7 @@ def test_categorical_index_repr_datetime_ordered(self): assert repr(i) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', @@ -435,22 +443,22 @@ def test_categorical_index_repr_datetime_ordered(self): def test_categorical_index_repr_period(self): # test all length - idx = period_range('2011-01-01 09:00', freq='H', periods=1) + idx = period_range("2011-01-01 09:00", freq="H", periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=2) + idx = period_range("2011-01-01 09:00", freq="H", periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=3) + idx = period_range("2011-01-01 09:00", freq="H", periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", 
freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -467,13 +475,13 @@ def test_categorical_index_repr_period(self): assert repr(i) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], @@ -481,18 +489,18 @@ def test_categorical_index_repr_period_ordered(self): assert repr(i) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa assert repr(i) == exp def test_categorical_index_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', @@ -503,12 +511,12 @@ def test_categorical_index_repr_timedelta(self): assert repr(i) == exp def test_categorical_index_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days 00:00:00, 2 days 00:00:00, 3 days 00:00:00, 4 days 00:00:00, 5 days 00:00:00], ordered=True, dtype='category')""" # noqa assert repr(i) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00', '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index 26d8da3bf33f1..a0b09e19ece6e 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -6,27 +6,28 @@ class TestCategoricalSort: - def test_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) expected = np.array([2, 4, 1, 3, 0]) - tm.assert_numpy_array_equal(c.argsort(ascending=True), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + c.argsort(ascending=True), expected, 
check_dtype=False + ) expected = expected[::-1] - tm.assert_numpy_array_equal(c.argsort(ascending=False), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + c.argsort(ascending=False), expected, check_dtype=False + ) def test_numpy_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) expected = np.array([2, 4, 1, 3, 0]) - tm.assert_numpy_array_equal(np.argsort(c), expected, - check_dtype=False) + tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False) - tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected, - check_dtype=False) + tm.assert_numpy_array_equal( + np.argsort(c, kind="mergesort"), expected, check_dtype=False + ) msg = "the 'axis' parameter is not supported" with pytest.raises(ValueError, match=msg): @@ -34,7 +35,7 @@ def test_numpy_argsort(self): msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(c, order='C') + np.argsort(c, order="C") def test_sort_values(self): @@ -50,8 +51,9 @@ def test_sort_values(self): tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, cat.categories) - cat = Categorical(["a", "c", "b", "d"], - categories=["a", "b", "c", "d"], ordered=True) + cat = Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=object) tm.assert_numpy_array_equal(res.__array__(), exp) @@ -88,34 +90,34 @@ def test_sort_values_na_position(self): tm.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) - res = cat.sort_values(ascending=True, na_position='first') + res = cat.sort_values(ascending=True, na_position="first") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) - res = cat.sort_values(ascending=False, na_position='first') + res = cat.sort_values(ascending=False, na_position="first") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) - res = cat.sort_values(ascending=True, na_position='last') + res = cat.sort_values(ascending=True, na_position="last") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) - res = cat.sort_values(ascending=False, na_position='last') + res = cat.sort_values(ascending=False, na_position="last") tm.assert_numpy_array_equal(res.__array__(), exp) tm.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='last') + res = cat.sort_values(ascending=False, na_position="last") exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) exp_categories = Index(["a", "b", "c", "d"]) tm.assert_numpy_array_equal(res.__array__(), exp_val) tm.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) - res = cat.sort_values(ascending=False, na_position='first') + res = cat.sort_values(ascending=False, na_position="first") exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) exp_categories = Index(["a", "b", "c", "d"]) tm.assert_numpy_array_equal(res.__array__(), exp_val) diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index 
a8f0a348b3ecc..cfc7b8541302f 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -3,21 +3,20 @@ class TestCategoricalSubclassing: - def test_constructor(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) + sc = tm.SubclassedCategorical(["a", "b", "c"]) assert isinstance(sc, tm.SubclassedCategorical) - tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"])) def test_from_codes(self): - sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"]) assert isinstance(sc, tm.SubclassedCategorical) - exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"]) tm.assert_categorical_equal(sc, exp) def test_map(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) + sc = tm.SubclassedCategorical(["a", "b", "c"]) res = sc.map(lambda x: x.upper()) assert isinstance(res, tm.SubclassedCategorical) - exp = Categorical(['A', 'B', 'C']) + exp = Categorical(["A", "B", "C"]) tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 79634f581f366..53733770ed954 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -7,23 +7,23 @@ class TestCategoricalWarnings: def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = Categorical([])" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('c.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("c.", 1)) def test_CategoricalAccessor_categorical_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.categorical + pd.Series(["a", "b"], dtype="category").cat.categorical def test_CategoricalAccessor_name_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.name + pd.Series(["a", "b"], dtype="category").cat.name def test_CategoricalAccessor_index_deprecation(self): with tm.assert_produces_warning(FutureWarning): - pd.Series(['a', 'b'], dtype='category').cat.index + pd.Series(["a", "b"], dtype="category").cat.index diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 4a7962d88a44e..82409df5b46f7 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -3,21 +3,31 @@ import pandas as pd from pandas import ( - Index, Interval, IntervalIndex, Timedelta, Timestamp, date_range, - timedelta_range) + Index, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + timedelta_range, +) from pandas.core.arrays import IntervalArray import pandas.util.testing as tm -@pytest.fixture(params=[ - (Index([0, 2, 4]), Index([1, 3, 5])), - (Index([0., 1., 2.]), Index([1., 2., 3.])), - (timedelta_range('0 days', periods=3), - timedelta_range('1 day', periods=3)), - (date_range('20170101', periods=3), date_range('20170102', periods=3)), - (date_range('20170101', periods=3, 
tz='US/Eastern'), - date_range('20170102', periods=3, tz='US/Eastern'))], - ids=lambda x: str(x[0].dtype)) +@pytest.fixture( + params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0.0, 1.0, 2.0]), Index([1.0, 2.0, 3.0])), + (timedelta_range("0 days", periods=3), timedelta_range("1 day", periods=3)), + (date_range("20170101", periods=3), date_range("20170102", periods=3)), + ( + date_range("20170101", periods=3, tz="US/Eastern"), + date_range("20170102", periods=3, tz="US/Eastern"), + ), + ], + ids=lambda x: str(x[0].dtype), +) def left_right_dtypes(request): """ Fixture for building an IntervalArray from various dtypes @@ -26,26 +36,30 @@ def left_right_dtypes(request): class TestAttributes: - @pytest.mark.parametrize('left, right', [ - (0, 1), - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timestamp('2018-01-02')), - pytest.param(Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-02', tz='US/Eastern'), - marks=pytest.mark.xfail(strict=True, reason='GH 27011'))]) - @pytest.mark.parametrize('constructor', [IntervalArray, IntervalIndex]) + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + pytest.param( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + marks=pytest.mark.xfail(strict=True, reason="GH 27011"), + ), + ], + ) + @pytest.mark.parametrize("constructor", [IntervalArray, IntervalIndex]) def test_is_empty(self, constructor, left, right, closed): # GH27219 tuples = [(left, left), (left, right), np.nan] - expected = np.array([closed != 'both', False, False]) + expected = np.array([closed != "both", False, False]) result = constructor.from_tuples(tuples, closed=closed).is_empty tm.assert_numpy_array_equal(result, expected) class TestMethods: - - @pytest.mark.parametrize('new_closed', [ - 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) def test_set_closed(self, closed, new_closed): # GH 21670 array = IntervalArray.from_breaks(range(10), closed=closed) @@ -53,20 +67,21 @@ def test_set_closed(self, closed, new_closed): expected = IntervalArray.from_breaks(range(10), closed=new_closed) tm.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('other', [ - Interval(0, 1, closed='right'), - IntervalArray.from_breaks([1, 2, 3, 4], closed='right'), - ]) + @pytest.mark.parametrize( + "other", + [ + Interval(0, 1, closed="right"), + IntervalArray.from_breaks([1, 2, 3, 4], closed="right"), + ], + ) def test_where_raises(self, other): - ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], - closed='left')) + ser = pd.Series(IntervalArray.from_breaks([1, 2, 3, 4], closed="left")) match = "'value.closed' is 'right', expected 'left'." 
with pytest.raises(ValueError, match=match): ser.where([True, False, True], other=other) class TestSetitem: - def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, right) diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index 7f53c40d7bf4e..43601ea301568 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -15,10 +15,14 @@ def constructor(request): return request.param -@pytest.fixture(params=[ - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timedelta('1 day')), - (0, 1)], ids=lambda x: type(x[0]).__name__) +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) def start_shift(request): """ Fixture for generating intervals of different types from a start value @@ -28,28 +32,27 @@ def start_shift(request): class TestOverlaps: - - def test_overlaps_interval( - self, constructor, start_shift, closed, other_closed): + def test_overlaps_interval(self, constructor, start_shift, closed, other_closed): start, shift = start_shift interval = Interval(start, start + 3 * shift, other_closed) # intervals: identical, nested, spanning, partial, adjacent, disjoint - tuples = [(start, start + 3 * shift), - (start + shift, start + 2 * shift), - (start - shift, start + 4 * shift), - (start + 2 * shift, start + 4 * shift), - (start + 3 * shift, start + 4 * shift), - (start + 4 * shift, start + 5 * shift)] + tuples = [ + (start, start + 3 * shift), + (start + shift, start + 2 * shift), + (start - shift, start + 4 * shift), + (start + 2 * shift, start + 4 * shift), + (start + 3 * shift, start + 4 * shift), + (start + 4 * shift, start + 5 * shift), + ] interval_container = constructor.from_tuples(tuples, closed) - adjacent = (interval.closed_right and interval_container.closed_left) + adjacent = interval.closed_right and interval_container.closed_left expected = np.array([True, True, True, True, adjacent, False]) result = interval_container.overlaps(interval) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other_constructor', [ - IntervalArray, IntervalIndex]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, IntervalIndex]) def test_overlaps_interval_container(self, constructor, other_constructor): # TODO: modify this test when implemented interval_container = constructor.from_breaks(range(5)) @@ -62,21 +65,26 @@ def test_overlaps_na(self, constructor, start_shift): start, shift = start_shift interval = Interval(start, start + shift) - tuples = [(start, start + shift), - np.nan, - (start + 2 * shift, start + 3 * shift)] + tuples = [ + (start, start + shift), + np.nan, + (start + 2 * shift, start + 3 * shift), + ] interval_container = constructor.from_tuples(tuples) expected = np.array([True, False, False]) result = interval_container.overlaps(interval) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('other', [ - 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) def test_overlaps_invalid_type(self, constructor, other): interval_container = constructor.from_breaks(range(5)) - msg = '`other` must be Interval-like, got {other}'.format( - 
other=type(other).__name__) + msg = "`other` must be Interval-like, got {other}".format( + other=type(other).__name__ + ) with pytest.raises(TypeError, match=msg): interval_container.overlaps(other) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index d0a188a8aff3c..eab174862818c 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -12,25 +12,21 @@ class TestSeriesAccessor: # TODO: collect other Series accessor tests def test_to_dense(self): - s = pd.Series([0, 1, 0, 10], dtype='Sparse[int64]') + s = pd.Series([0, 1, 0, 10], dtype="Sparse[int64]") result = s.sparse.to_dense() expected = pd.Series([0, 1, 0, 10]) tm.assert_series_equal(result, expected) class TestFrameAccessor: - def test_accessor_raises(self): df = pd.DataFrame({"A": [0, 1]}) - with pytest.raises(AttributeError, match='sparse'): + with pytest.raises(AttributeError, match="sparse"): df.sparse - @pytest.mark.parametrize('format', ['csc', 'csr', 'coo']) - @pytest.mark.parametrize("labels", [ - None, - list(string.ascii_letters[:10]), - ]) - @pytest.mark.parametrize('dtype', ['float64', 'int64']) + @pytest.mark.parametrize("format", ["csc", "csr", "coo"]) + @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])]) + @pytest.mark.parametrize("dtype", ["float64", "int64"]) @td.skip_if_no_scipy def test_from_spmatrix(self, format, labels, dtype): import scipy.sparse @@ -38,85 +34,76 @@ def test_from_spmatrix(self, format, labels, dtype): sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) - result = pd.DataFrame.sparse.from_spmatrix( - mat, index=labels, columns=labels - ) + result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) expected = pd.DataFrame( - np.eye(10, dtype=dtype), - index=labels, - columns=labels, + np.eye(10, dtype=dtype), index=labels, columns=labels ).astype(sp_dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("columns", [ - ['a', 'b'], - pd.MultiIndex.from_product([['A'], ['a', 'b']]), - ['a', 'a'], - ]) + @pytest.mark.parametrize( + "columns", + [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]], + ) @td.skip_if_no_scipy def test_from_spmatrix_columns(self, columns): import scipy.sparse - dtype = pd.SparseDtype('float64', 0.0) + dtype = pd.SparseDtype("float64", 0.0) mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) - expected = pd.DataFrame( - mat.toarray(), columns=columns - ).astype(dtype) + expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype) tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy def test_to_coo(self): import scipy.sparse - df = pd.DataFrame({ - "A": [0, 1, 0], - "B": [1, 0, 0], - }, dtype='Sparse[int64, 0]') + df = pd.DataFrame({"A": [0, 1, 0], "B": [1, 0, 0]}, dtype="Sparse[int64, 0]") result = df.sparse.to_coo() expected = scipy.sparse.coo_matrix(np.asarray(df)) assert (result != expected).nnz == 0 def test_to_dense(self): - df = pd.DataFrame({ - "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 0)), - "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype('int64', 1)), - "C": pd.SparseArray([1., 0.], - dtype=pd.SparseDtype('float64', 0.0)), - }, index=['b', 'a']) + df = pd.DataFrame( + { + "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)), + "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)), + "C": 
pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=["b", "a"], + ) result = df.sparse.to_dense() - expected = pd.DataFrame({ - 'A': [1, 0], - 'B': [1, 0], - 'C': [1.0, 0.0], - }, index=['b', 'a']) + expected = pd.DataFrame( + {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=["b", "a"] + ) tm.assert_frame_equal(result, expected) def test_density(self): - df = pd.DataFrame({ - 'A': pd.SparseArray([1, 0, 2, 1], fill_value=0), - 'B': pd.SparseArray([0, 1, 1, 1], fill_value=0), - }) + df = pd.DataFrame( + { + "A": pd.SparseArray([1, 0, 2, 1], fill_value=0), + "B": pd.SparseArray([0, 1, 1, 1], fill_value=0), + } + ) res = df.sparse.density expected = 0.75 assert res == expected - @pytest.mark.parametrize("dtype", ['int64', 'float64']) + @pytest.mark.parametrize("dtype", ["int64", "float64"]) @pytest.mark.parametrize("dense_index", [True, False]) @td.skip_if_no_scipy def test_series_from_coo(self, dtype, dense_index): import scipy.sparse - A = scipy.sparse.eye(3, format='coo', dtype=dtype) + A = scipy.sparse.eye(3, format="coo", dtype=dtype) result = pd.Series.sparse.from_coo(A, dense_index=dense_index) index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - expected = pd.Series(pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), - index=index) + expected = pd.Series( + pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index + ) if dense_index: - expected = expected.reindex( - pd.MultiIndex.from_product(index.levels) - ) + expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) tm.assert_series_equal(result, expected) @@ -124,8 +111,9 @@ def test_series_from_coo(self, dtype, dense_index): def test_series_from_coo_incorrect_format_raises(self): # gh-26554 import scipy.sparse + m = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 0]])) - with pytest.raises(TypeError, - match='Expected coo_matrix. Got csr_matrix instead.' - ): + with pytest.raises( + TypeError, match="Expected coo_matrix. Got csr_matrix instead." + ): pd.Series.sparse.from_coo(m) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 31a8f13571d16..7bfedff217719 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -19,7 +19,7 @@ def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) def _check_numeric_ops(self, a, b, a_dense, b_dense): - with np.errstate(invalid='ignore', divide='ignore'): + with np.errstate(invalid="ignore", divide="ignore"): # Unfortunately, trying to wrap the computation of each expected # value is with np.errstate() is too tedious. 
@@ -38,8 +38,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense): self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and - a.dtype.subtype == np.dtype('int64')): + if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): self._assert((a // b).to_dense(), a_dense // b_dense) self._assert((b // a).to_dense(), b_dense // a_dense) @@ -64,8 +63,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense): self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) # ToDo: FIXME in GH 13843 - if not (self._base == pd.Series and - a.dtype.subtype == np.dtype('int64')): + if not (self._base == pd.Series and a.dtype.subtype == np.dtype("int64")): self._assert((a // b_dense).to_dense(), a_dense // b_dense) self._assert((b_dense // a).to_dense(), b_dense // a_dense) @@ -82,7 +80,7 @@ def _check_bool_result(self, res): assert isinstance(res.fill_value, bool) def _check_comparison_ops(self, a, b, a_dense, b_dense): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): # Unfortunately, trying to wrap the computation of each expected # value is with np.errstate() is too tedious. # @@ -141,7 +139,7 @@ def _check_logical_ops(self, a, b, a_dense, b_dense): def test_float_scalar(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) self._check_numeric_ops(a, 1, values, 1) self._check_numeric_ops(a, 0, values, 0) @@ -160,7 +158,7 @@ def test_float_scalar(self): def test_float_scalar_comparison(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) self._check_comparison_ops(a, 1, values, 1) self._check_comparison_ops(a, 0, values, 0) @@ -178,7 +176,7 @@ def test_float_scalar_comparison(self): def test_float_same_index(self): # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) @@ -186,8 +184,8 @@ def test_float_same_index(self): b = self._klass(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues) - values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) a = self._klass(values, kind=kind, fill_value=0) b = self._klass(rvalues, kind=kind, fill_value=0) @@ -195,7 +193,7 @@ def test_float_same_index(self): def test_float_same_index_comparison(self): # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) @@ -203,8 +201,8 @@ def test_float_same_index_comparison(self): b = self._klass(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) - values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) + rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) a = self._klass(values, kind=kind, 
fill_value=0) b = self._klass(rvalues, kind=kind, fill_value=0) @@ -214,7 +212,7 @@ def test_float_array(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) self._check_numeric_ops(a, b, values, rvalues) @@ -236,28 +234,28 @@ def test_float_array_different_kind(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - a = self._klass(values, kind='integer') - b = self._klass(rvalues, kind='block') + a = self._klass(values, kind="integer") + b = self._klass(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues) self._check_numeric_ops(a, b * 0, values, rvalues * 0) - a = self._klass(values, kind='integer', fill_value=0) - b = self._klass(rvalues, kind='block') + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block") self._check_numeric_ops(a, b, values, rvalues) - a = self._klass(values, kind='integer', fill_value=0) - b = self._klass(rvalues, kind='block', fill_value=0) + a = self._klass(values, kind="integer", fill_value=0) + b = self._klass(rvalues, kind="block", fill_value=0) self._check_numeric_ops(a, b, values, rvalues) - a = self._klass(values, kind='integer', fill_value=1) - b = self._klass(rvalues, kind='block', fill_value=2) + a = self._klass(values, kind="integer", fill_value=1) + b = self._klass(rvalues, kind="block", fill_value=2) self._check_numeric_ops(a, b, values, rvalues) def test_float_array_comparison(self): values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) self._check_comparison_ops(a, b, values, rvalues) @@ -282,7 +280,7 @@ def test_int_array(self): values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, dtype=dtype, kind=kind) assert a.dtype == SparseDtype(dtype) b = self._klass(rvalues, dtype=dtype, kind=kind) @@ -313,11 +311,11 @@ def test_int_array(self): def test_int_array_comparison(self): # int32 NI ATM - for dtype in ['int64']: + for dtype in ["int64"]: values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, dtype=dtype, kind=kind) b = self._klass(rvalues, dtype=dtype, kind=kind) self._check_comparison_ops(a, b, values, rvalues) @@ -338,40 +336,38 @@ def test_int_array_comparison(self): def test_bool_same_index(self): # GH 14000 # when sp_index are the same - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: values = self._base([True, False, True, True], dtype=np.bool) rvalues = self._base([True, False, True, True], dtype=np.bool) for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, - fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, - fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass( 
+ rvalues, kind=kind, dtype=np.bool, fill_value=fill_value + ) self._check_logical_ops(a, b, values, rvalues) def test_bool_array_logical(self): # GH 14000 # when sp_index are the same - for kind in ['integer', 'block']: - values = self._base([True, False, True, False, True, True], - dtype=np.bool) - rvalues = self._base([True, False, False, True, False, True], - dtype=np.bool) + for kind in ["integer", "block"]: + values = self._base([True, False, True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) for fill_value in [True, False, np.nan]: - a = self._klass(values, kind=kind, dtype=np.bool, - fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, - fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) + b = self._klass( + rvalues, kind=kind, dtype=np.bool, fill_value=fill_value + ) self._check_logical_ops(a, b, values, rvalues) def test_mixed_array_float_int(self): - for rdtype in ['int64']: + for rdtype in ["int64"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) @@ -397,11 +393,11 @@ def test_mixed_array_float_int(self): def test_mixed_array_comparison(self): # int32 NI ATM - for rdtype in ['int64']: + for rdtype in ["int64"]: values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: a = self._klass(values, kind=kind) b = self._klass(rvalues, kind=kind) assert b.dtype == SparseDtype(rdtype) @@ -438,33 +434,34 @@ def test_alignment(self): db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], - dtype=np.int64, fill_value=0) + sb = pd.SparseSeries( + np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=0 + ) self._check_numeric_ops(sa, sb, da, db) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], - dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries( + np.arange(4), index=[1, 2, 3, 4], dtype=np.int64, fill_value=np.nan + ) self._check_numeric_ops(sa, sb, da, db) da = pd.Series(np.arange(4)) db = pd.Series(np.arange(4), index=[10, 11, 12, 13]) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) - sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], - dtype=np.int64, fill_value=0) + sb = pd.SparseSeries( + np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=0 + ) self._check_numeric_ops(sa, sb, da, db) sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) - sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], - dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries( + np.arange(4), index=[10, 11, 12, 13], dtype=np.int64, fill_value=np.nan + ) self._check_numeric_ops(sa, sb, da, db) -@pytest.mark.parametrize("op", [ - operator.eq, - operator.add, -]) +@pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): arr = pd.SparseArray([0, 1], fill_value=0) result = op(arr, [0, 1]) @@ -472,13 +469,10 @@ def test_with_list(op): tm.assert_sp_array_equal(result, expected) 
-@pytest.mark.parametrize('ufunc', [ - np.abs, np.exp, -]) -@pytest.mark.parametrize('arr', [ - pd.SparseArray([0, 0, -1, 1]), - pd.SparseArray([None, None, -1, 1]), -]) +@pytest.mark.parametrize("ufunc", [np.abs, np.exp]) +@pytest.mark.parametrize( + "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] +) def test_ufuncs(ufunc, arr): result = ufunc(arr) fill_value = ufunc(arr.fill_value) @@ -486,17 +480,17 @@ def test_ufuncs(ufunc, arr): tm.assert_sp_array_equal(result, expected) -@pytest.mark.parametrize("a, b", [ - (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), -]) -@pytest.mark.parametrize("ufunc", [ - np.add, - np.greater, -]) +@pytest.mark.parametrize( + "a, b", + [ + (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + ], +) +@pytest.mark.parametrize("ufunc", [np.add, np.greater]) def test_binary_ufuncs(ufunc, a, b): # can't say anything about fill value here. result = ufunc(a, b) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8a51704732d7f..c76b4d9600526 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -21,10 +21,8 @@ def kind(request): class TestSparseArray: - def setup_method(self, method): - self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, - np.nan, 4, 5, np.nan, 6]) + self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) self.arr = SparseArray(self.arr_data) self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) @@ -59,32 +57,31 @@ def test_constructor_dtype(self): assert arr.fill_value == 0 def test_constructor_dtype_str(self): - result = SparseArray([1, 2, 3], dtype='int') + result = SparseArray([1, 2, 3], dtype="int") expected = SparseArray([1, 2, 3], dtype=int) tm.assert_sp_array_equal(result, expected) def test_constructor_sparse_dtype(self): - result = SparseArray([1, 0, 0, 1], dtype=SparseDtype('int64', -1)) + result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1)) expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64) tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype('int64') + assert result.sp_values.dtype == np.dtype("int64") def test_constructor_sparse_dtype_str(self): - result = SparseArray([1, 0, 0, 1], dtype='Sparse[int32]') + result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]") expected = SparseArray([1, 0, 0, 1], dtype=np.int32) tm.assert_sp_array_equal(result, expected) - assert result.sp_values.dtype == np.dtype('int32') + assert result.sp_values.dtype == np.dtype("int32") def test_constructor_object_dtype(self): # GH 11856 - arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) + arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object) assert arr.dtype == SparseDtype(np.object) assert np.isnan(arr.fill_value) - arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, - fill_value='A') - assert arr.dtype == SparseDtype(np.object, 'A') - assert arr.fill_value == 'A' + arr = SparseArray(["A", 
"A", np.nan, "B"], dtype=np.object, fill_value="A") + assert arr.dtype == SparseDtype(np.object, "A") + assert arr.fill_value == "A" # GH 17574 data = [False, 0, 100.0, 0.0] @@ -104,37 +101,42 @@ def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) # XXX: Behavior change: specifying SparseIndex no longer changes the # fill_value - expected = SparseArray([0, 1, 2, 0], kind='integer') + expected = SparseArray([0, 1, 2, 0], kind="integer") tm.assert_sp_array_equal(arr, expected) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=np.int64, fill_value=0) + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, + fill_value=0, + ) exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), - fill_value=0, dtype=np.int64) + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64 + ) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - arr = SparseArray(data=[1, 2, 3], - sparse_index=IntIndex(4, [1, 2, 3]), - dtype=None, fill_value=0) + arr = SparseArray( + data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, + fill_value=0, + ) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - @pytest.mark.parametrize("sparse_index", [ - None, IntIndex(1, [0]), - ]) + @pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])]) def test_constructor_spindex_dtype_scalar(self, sparse_index): # scalar input arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None) @@ -150,19 +152,23 @@ def test_constructor_spindex_dtype_scalar(self, sparse_index): assert arr.fill_value == 0 def test_constructor_spindex_dtype_scalar_broadcasts(self): - arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), - fill_value=0, dtype=None) + arr = SparseArray( + data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None + ) exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None) tm.assert_sp_array_equal(arr, exp) assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - @pytest.mark.parametrize('data, fill_value', [ - (np.array([1, 2]), 0), - (np.array([1.0, 2.0]), np.nan), - ([True, False], False), - ([pd.Timestamp('2017-01-01')], pd.NaT), - ]) + @pytest.mark.parametrize( + "data, fill_value", + [ + (np.array([1, 2]), 0), + (np.array([1.0, 2.0]), np.nan), + ([True, False], False), + ([pd.Timestamp("2017-01-01")], pd.NaT), + ], + ) def test_constructor_inferred_fill_value(self, data, fill_value): result = SparseArray(data).fill_value @@ -171,13 +177,11 @@ def test_constructor_inferred_fill_value(self, data, fill_value): else: assert result == fill_value - @pytest.mark.parametrize('format', ['coo', 'csc', 'csr']) - @pytest.mark.parametrize('size', [ - pytest.param(0, - marks=td.skip_if_np_lt("1.16", - reason='NumPy-11383')), - 10 - ]) + @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) + @pytest.mark.parametrize( + "size", + [pytest.param(0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10], + ) @td.skip_if_no_scipy def test_from_spmatrix(self, 
size, format): import scipy.sparse @@ -193,16 +197,20 @@ def test_from_spmatrix(self, size, format): def test_from_spmatrix_raises(self): import scipy.sparse - mat = scipy.sparse.eye(5, 4, format='csc') + mat = scipy.sparse.eye(5, 4, format="csc") with pytest.raises(ValueError, match="not '4'"): SparseArray.from_spmatrix(mat) - @pytest.mark.parametrize('scalar,dtype', [ - (False, SparseDtype(bool, False)), - (0.0, SparseDtype('float64', 0)), - (1, SparseDtype('int64', 1)), - ('z', SparseDtype('object', 'z'))]) + @pytest.mark.parametrize( + "scalar,dtype", + [ + (False, SparseDtype(bool, False)), + (0.0, SparseDtype("float64", 0)), + (1, SparseDtype("int64", 1)), + ("z", SparseDtype("object", "z")), + ], + ) def test_scalar_with_index_infer_dtype(self, scalar, dtype): # GH 19163 arr = SparseArray(scalar, index=[1, 2, 3], fill_value=scalar) @@ -217,13 +225,13 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip(self, kind, fill): # see gh-13999 - arr = SparseArray([np.nan, 1, np.nan, 2, 3], - kind=kind, fill_value=fill) + arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, fill_value=fill) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) - arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64, - kind=kind, fill_value=fill) + arr = SparseArray( + [0, 0, 0, 1, 1, 2], dtype=np.int64, kind=kind, fill_value=fill + ) res = SparseArray(SparseSeries(arr), dtype=np.int64) tm.assert_sp_array_equal(arr, res) @@ -234,8 +242,9 @@ def test_sparse_series_round_trip(self, kind, fill): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_sparse_series_round_trip2(self, kind, fill): # see gh-13999 - arr = SparseArray([True, False, True, True], dtype=np.bool, - kind=kind, fill_value=fill) + arr = SparseArray( + [True, False, True, True], dtype=np.bool, kind=kind, fill_value=fill + ) res = SparseArray(SparseSeries(arr)) tm.assert_sp_array_equal(arr, res) @@ -291,16 +300,14 @@ def test_take_negative(self): exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) - @pytest.mark.parametrize('fill_value', [0, None, np.nan]) + @pytest.mark.parametrize("fill_value", [0, None, np.nan]) def test_shift_fill_value(self, fill_value): # GH #24128 - sparse = SparseArray(np.array([1, 0, 0, 3, 0]), - fill_value=8.0) + sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0) res = sparse.shift(1, fill_value=fill_value) if isna(fill_value): fill_value = res.dtype.na_value - exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), - fill_value=8.0) + exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0) tm.assert_sp_array_equal(res, exp) def test_bad_take(self): @@ -320,8 +327,7 @@ def test_take_filling(self): tm.assert_sp_array_equal(result, expected) # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), - allow_fill=False, fill_value=True) + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([np.nan, np.nan, 4]) tm.assert_sp_array_equal(result, expected) @@ -355,12 +361,11 @@ def test_take_filling_fill_value(self): tm.assert_sp_array_equal(result, expected) # allow_fill=False - result = sparse.take(np.array([1, 0, -1]), - allow_fill=False, fill_value=True) + result = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = SparseArray([0, np.nan, 4], fill_value=0) tm.assert_sp_array_equal(result, expected) - msg = ("Invalid 
value in 'indices'.") + msg = "Invalid value in 'indices'." with pytest.raises(ValueError, match=msg): sparse.take(np.array([1, 0, -2]), allow_fill=True) with pytest.raises(ValueError, match=msg): @@ -377,11 +382,11 @@ def test_take_filling_all_nan(self): sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) # XXX: did the default kind from take change? result = sparse.take(np.array([1, 0, -1])) - expected = SparseArray([np.nan, np.nan, np.nan], kind='block') + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) result = sparse.take(np.array([1, 0, -1]), fill_value=True) - expected = SparseArray([np.nan, np.nan, np.nan], kind='block') + expected = SparseArray([np.nan, np.nan, np.nan], kind="block") tm.assert_sp_array_equal(result, expected) with pytest.raises(IndexError): @@ -431,8 +436,7 @@ def test_constructor_bool(self): tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True])) # Behavior change: np.asarray densifies. # tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal(arr.sp_index.indices, - np.array([2, 3], np.int32)) + tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32)) dense = arr.to_dense() assert dense.dtype == bool @@ -453,16 +457,16 @@ def test_constructor_bool_fill_value(self): def test_constructor_float32(self): # GH 10648 - data = np.array([1., np.nan, 3], dtype=np.float32) + data = np.array([1.0, np.nan, 3], dtype=np.float32) arr = SparseArray(data, dtype=np.float32) assert arr.dtype == SparseDtype(np.float32) - tm.assert_numpy_array_equal(arr.sp_values, - np.array([1, 3], dtype=np.float32)) + tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32)) # Behavior change: np.asarray densifies. 
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr)) - tm.assert_numpy_array_equal(arr.sp_index.indices, - np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal( + arr.sp_index.indices, np.array([0, 2], dtype=np.int32) + ) dense = arr.to_dense() assert dense.dtype == np.float32 @@ -472,39 +476,38 @@ def test_astype(self): # float -> float arr = SparseArray([None, None, 0, 2]) result = arr.astype("Sparse[float32]") - expected = SparseArray([None, None, 0, 2], dtype=np.dtype('float32')) + expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32")) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("float64", fill_value=0) result = arr.astype(dtype) - expected = SparseArray._simple_new(np.array([0., 2.], - dtype=dtype.subtype), - IntIndex(4, [2, 3]), - dtype) + expected = SparseArray._simple_new( + np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype + ) tm.assert_sp_array_equal(result, expected) dtype = SparseDtype("int64", 0) result = arr.astype(dtype) - expected = SparseArray._simple_new(np.array([0, 2], dtype=np.int64), - IntIndex(4, [2, 3]), - dtype) + expected = SparseArray._simple_new( + np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype + ) tm.assert_sp_array_equal(result, expected) arr = SparseArray([0, np.nan, 0, 1], fill_value=0) - with pytest.raises(ValueError, match='NA'): - arr.astype('Sparse[i8]') + with pytest.raises(ValueError, match="NA"): + arr.astype("Sparse[i8]") def test_astype_bool(self): a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) - expected = SparseArray([True, 0, 0, True], - dtype=SparseDtype(bool, 0)) + expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) # update fill value result = a.astype(SparseDtype(bool, False)) - expected = SparseArray([True, False, False, True], - dtype=SparseDtype(bool, False)) + expected = SparseArray( + [True, False, False, True], dtype=SparseDtype(bool, False) + ) tm.assert_sp_array_equal(result, expected) def test_astype_all(self, any_real_dtype): @@ -515,39 +518,55 @@ def test_astype_all(self, any_real_dtype): assert res.dtype == SparseDtype(typ, 1) assert res.sp_values.dtype == typ - tm.assert_numpy_array_equal(np.asarray(res.to_dense()), - vals.astype(typ)) - - @pytest.mark.parametrize('array, dtype, expected', [ - (SparseArray([0, 1]), 'float', - SparseArray([0., 1.], dtype=SparseDtype(float, 0.0))), - (SparseArray([0, 1]), bool, SparseArray([False, True])), - (SparseArray([0, 1], fill_value=1), bool, - SparseArray([False, True], dtype=SparseDtype(bool, True))), - pytest.param( - SparseArray([0, 1]), 'datetime64[ns]', - SparseArray(np.array([0, 1], dtype='datetime64[ns]'), - dtype=SparseDtype('datetime64[ns]', - pd.Timestamp('1970'))), - marks=[pytest.mark.xfail(reason="NumPy-7619")], - ), - (SparseArray([0, 1, 10]), str, - SparseArray(['0', '1', '10'], dtype=SparseDtype(str, '0'))), - (SparseArray(['10', '20']), float, SparseArray([10.0, 20.0])), - (SparseArray([0, 1, 0]), object, - SparseArray([0, 1, 0], dtype=SparseDtype(object, 0))), - ]) + tm.assert_numpy_array_equal(np.asarray(res.to_dense()), vals.astype(typ)) + + @pytest.mark.parametrize( + "array, dtype, expected", + [ + ( + SparseArray([0, 1]), + "float", + SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)), + ), + (SparseArray([0, 1]), bool, SparseArray([False, True])), + ( + SparseArray([0, 1], fill_value=1), + bool, + SparseArray([False, True], dtype=SparseDtype(bool, True)), + ), + pytest.param( + 
SparseArray([0, 1]), + "datetime64[ns]", + SparseArray( + np.array([0, 1], dtype="datetime64[ns]"), + dtype=SparseDtype("datetime64[ns]", pd.Timestamp("1970")), + ), + marks=[pytest.mark.xfail(reason="NumPy-7619")], + ), + ( + SparseArray([0, 1, 10]), + str, + SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + ), + (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), + ( + SparseArray([0, 1, 0]), + object, + SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)), + ), + ], + ) def test_astype_more(self, array, dtype, expected): result = array.astype(dtype) tm.assert_sp_array_equal(result, expected) def test_astype_nan_raises(self): arr = SparseArray([1.0, np.nan]) - with pytest.raises(ValueError, match='Cannot convert non-finite'): + with pytest.raises(ValueError, match="Cannot convert non-finite"): arr.astype(int) def test_set_fill_value(self): - arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) + arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan) arr.fill_value = 2 assert arr.fill_value == 2 @@ -599,22 +618,28 @@ def test_copy(self): def test_values_asarray(self): assert_almost_equal(self.arr.to_dense(), self.arr_data) - @pytest.mark.parametrize('data,shape,dtype', [ - ([0, 0, 0, 0, 0], (5,), None), - ([], (0,), None), - ([0], (1,), None), - (['A', 'A', np.nan, 'B'], (4,), np.object) - ]) + @pytest.mark.parametrize( + "data,shape,dtype", + [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (["A", "A", np.nan, "B"], (4,), np.object), + ], + ) def test_shape(self, data, shape, dtype): # GH 21126 out = SparseArray(data, dtype=dtype) assert out.shape == shape - @pytest.mark.parametrize("vals", [ - [np.nan, np.nan, np.nan, np.nan, np.nan], - [1, np.nan, np.nan, 3, np.nan], - [1, np.nan, 0, 3, 0], - ]) + @pytest.mark.parametrize( + "vals", + [ + [np.nan, np.nan, np.nan, np.nan, np.nan], + [1, np.nan, np.nan, 3, np.nan], + [1, np.nan, 0, 3, 0], + ], + ) @pytest.mark.parametrize("fill_value", [None, 0]) def test_dense_repr(self, vals, fill_value): vals = np.array(vals) @@ -664,13 +689,13 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[4:, ] - exp = SparseArray(dense[4:, ]) + res = sparse[4:,] + exp = SparseArray(dense[4:,]) tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[4:, ] - exp = SparseArray(dense[4:, ], fill_value=0) + res = sparse[4:,] + exp = SparseArray(dense[4:,], fill_value=0) tm.assert_sp_array_equal(res, exp) with pytest.raises(IndexError): @@ -685,8 +710,7 @@ def test_boolean_slice_empty(self): res = arr[[False, False, False]] assert res.dtype == arr.dtype - @pytest.mark.parametrize("op", ["add", "sub", "mul", - "truediv", "floordiv", "pow"]) + @pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"]) def test_binary_operators(self, op): op = getattr(operator, op) data1 = np.random.randn(20) @@ -705,8 +729,9 @@ def test_binary_operators(self, op): def _check_op(op, first, second): res = op(first, second) - exp = SparseArray(op(first.to_dense(), second.to_dense()), - fill_value=first.fill_value) + exp = SparseArray( + op(first.to_dense(), second.to_dense()), fill_value=first.fill_value + ) assert isinstance(res, SparseArray) assert_almost_equal(res.to_dense(), exp.to_dense()) @@ -746,10 +771,8 @@ def _check_roundtrip(obj): def test_generator_warnings(self): sp_arr = SparseArray([1, 2, 3]) with warnings.catch_warnings(record=True) as w: - 
warnings.filterwarnings(action='always', - category=DeprecationWarning) - warnings.filterwarnings(action='always', - category=PendingDeprecationWarning) + warnings.filterwarnings(action="always", category=DeprecationWarning) + warnings.filterwarnings(action="always", category=PendingDeprecationWarning) for _ in sp_arr: pass assert len(w) == 0 @@ -786,9 +809,9 @@ def test_fillna(self): tm.assert_sp_array_equal(res, exp) # float dtype's fill_value is np.nan, replaced by -1 - s = SparseArray([0., 0., 0., 0.]) + s = SparseArray([0.0, 0.0, 0.0, 0.0]) res = s.fillna(-1) - exp = SparseArray([0., 0., 0., 0.], fill_value=-1) + exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1) tm.assert_sp_array_equal(res, exp) # int dtype shouldn't have missing. No changes. @@ -829,13 +852,7 @@ def test_fillna_overlap(self): def test_nonzero(self): # Tests regression #21172. - sa = pd.SparseArray([ - float('nan'), - float('nan'), - 1, 0, 0, - 2, 0, 0, 0, - 3, 0, 0 - ]) + sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) result, = sa.nonzero() tm.assert_numpy_array_equal(expected, result) @@ -846,12 +863,14 @@ def test_nonzero(self): class TestSparseArrayAnalytics: - - @pytest.mark.parametrize('data,pos,neg', [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) def test_all(self, data, pos, neg): # GH 17570 out = SparseArray(data).all() @@ -867,11 +886,14 @@ def test_all(self, data, pos, neg): out = SparseArray(data, fill_value=pos).all() assert not out - @pytest.mark.parametrize('data,pos,neg', [ - ([True, True, True], True, False), - ([1, 2, 1], 1, 0), - ([1.0, 2.0, 1.0], 1.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([True, True, True], True, False), + ([1, 2, 1], 1, 0), + ([1.0, 2.0, 1.0], 1.0, 0.0), + ], + ) @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_all(self, data, pos, neg): # GH 17570 @@ -889,15 +911,18 @@ def test_numpy_all(self, data, pos, neg): assert not out # raises with a different message on py2. 
- msg = "the \'out\' parameter is not supported" + msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.all(SparseArray(data), out=np.array([])) - @pytest.mark.parametrize('data,pos,neg', [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) def test_any(self, data, pos, neg): # GH 17570 out = SparseArray(data).any() @@ -913,11 +938,14 @@ def test_any(self, data, pos, neg): out = SparseArray(data, fill_value=pos).any() assert not out - @pytest.mark.parametrize('data,pos,neg', [ - ([False, True, False], True, False), - ([0, 2, 0], 2, 0), - ([0.0, 2.0, 0.0], 2.0, 0.0) - ]) + @pytest.mark.parametrize( + "data,pos,neg", + [ + ([False, True, False], True, False), + ([0, 2, 0], 2, 0), + ([0.0, 2.0, 0.0], 2.0, 0.0), + ], + ) @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_any(self, data, pos, neg): # GH 17570 @@ -934,7 +962,7 @@ def test_numpy_any(self, data, pos, neg): out = np.any(SparseArray(data, fill_value=pos)) assert not out - msg = "the \'out\' parameter is not supported" + msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.any(SparseArray(data), out=out) @@ -970,12 +998,19 @@ def test_numpy_sum(self): with pytest.raises(ValueError, match=msg): np.sum(SparseArray(data), out=out) - @pytest.mark.parametrize("data,expected", [ - (np.array([1, 2, 3, 4, 5], dtype=float), # non-null data - SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0]))), - (np.array([1, 2, np.nan, 4, 5], dtype=float), # null data - SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))) - ]) + @pytest.mark.parametrize( + "data,expected", + [ + ( + np.array([1, 2, 3, 4, 5], dtype=float), # non-null data + SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])), + ), + ( + np.array([1, 2, np.nan, 4, 5], dtype=float), # null data + SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])), + ), + ], + ) @pytest.mark.parametrize("numpy", [True, False]) def test_cumsum(self, data, expected, numpy): cumsum = np.cumsum if numpy else lambda s: s.cumsum() @@ -1037,14 +1072,12 @@ def test_ufunc(self): tm.assert_sp_array_equal(np.abs(sparse), result) sparse = SparseArray([1, -1, 2, -2], fill_value=1) - result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, - fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) tm.assert_sp_array_equal(abs(sparse), result) tm.assert_sp_array_equal(np.abs(sparse), result) sparse = SparseArray([1, -1, 2, -2], fill_value=-1) - result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, - fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, fill_value=1) tm.assert_sp_array_equal(abs(sparse), result) tm.assert_sp_array_equal(np.abs(sparse), result) @@ -1074,33 +1107,30 @@ def test_ufunc_args(self): result = SparseArray([2, 0, 1, -1], fill_value=1) tm.assert_sp_array_equal(np.add(sparse, 1), result) - @pytest.mark.parametrize('fill_value', [0.0, np.nan]) + @pytest.mark.parametrize("fill_value", [0.0, np.nan]) def test_modf(self, fill_value): # https://github.com/pandas-dev/pandas/issues/26946 - sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], - fill_value=fill_value) + sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) r1, r2 = np.modf(sparse) e1, e2 = np.modf(np.asarray(sparse)) tm.assert_sp_array_equal(r1, pd.SparseArray(e1, 
fill_value=fill_value)) tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) def test_nbytes_integer(self): - arr = SparseArray([1, 0, 0, 0, 2], kind='integer') + arr = SparseArray([1, 0, 0, 0, 2], kind="integer") result = arr.nbytes # (2 * 8) + 2 * 4 assert result == 24 def test_nbytes_block(self): - arr = SparseArray([1, 2, 0, 0, 0], kind='block') + arr = SparseArray([1, 2, 0, 0, 0], kind="block") result = arr.nbytes # (2 * 8) + 4 + 4 # sp_values, blocs, blenghts assert result == 24 def test_asarray_datetime64(self): - s = pd.SparseArray( - pd.to_datetime(['2012', None, None, '2013']) - ) + s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"])) np.asarray(s) def test_density(self): @@ -1114,10 +1144,7 @@ def test_npoints(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestAccessor: - - @pytest.mark.parametrize('attr', [ - 'npoints', 'density', 'fill_value', 'sp_values', - ]) + @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"]) def test_get_attributes(self, attr): arr = SparseArray([0, 1]) ser = pd.Series(arr) @@ -1137,29 +1164,31 @@ def test_from_coo(self): result = pd.Series.sparse.from_coo(sp_array) index = pd.MultiIndex.from_arrays([[0, 0, 1, 3], [0, 2, 1, 3]]) - expected = pd.Series([4, 9, 7, 5], index=index, dtype='Sparse[int]') + expected = pd.Series([4, 9, 7, 5], index=index, dtype="Sparse[int]") tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_to_coo(self): import scipy.sparse - ser = pd.Series([1, 2, 3], - index=pd.MultiIndex.from_product([[0], [1, 2, 3]], - names=['a', 'b']), - dtype='Sparse[int]') + + ser = pd.Series( + [1, 2, 3], + index=pd.MultiIndex.from_product([[0], [1, 2, 3]], names=["a", "b"]), + dtype="Sparse[int]", + ) A, _, _ = ser.sparse.to_coo() assert isinstance(A, scipy.sparse.coo.coo_matrix) def test_non_sparse_raises(self): ser = pd.Series([1, 2, 3]) - with pytest.raises(AttributeError, match='.sparse'): + with pytest.raises(AttributeError, match=".sparse"): ser.sparse.density def test_setting_fill_value_fillna_still_works(): # This is why letting users update fill_value / dtype is bad # astype has the same problem. 
- arr = SparseArray([1., np.nan, 1.0], fill_value=0.0) + arr = SparseArray([1.0, np.nan, 1.0], fill_value=0.0) arr.fill_value = np.nan result = arr.isna() # Can't do direct comparison, since the sp_index will be different @@ -1183,27 +1212,26 @@ def test_setting_fill_value_updates(): tm.assert_sp_array_equal(arr, expected) -@pytest.mark.parametrize("arr, loc", [ - ([None, 1, 2], 0), - ([0, None, 2], 1), - ([0, 1, None], 2), - ([0, 1, 1, None, None], 3), - ([1, 1, 1, 2], -1), - ([], -1), -]) +@pytest.mark.parametrize( + "arr, loc", + [ + ([None, 1, 2], 0), + ([0, None, 2], 1), + ([0, 1, None], 2), + ([0, 1, 1, None, None], 3), + ([1, 1, 1, 2], -1), + ([], -1), + ], +) def test_first_fill_value_loc(arr, loc): result = SparseArray(arr)._first_fill_value_loc() assert result == loc -@pytest.mark.parametrize('arr', [ - [1, 2, np.nan, np.nan], - [1, np.nan, 2, np.nan], - [1, 2, np.nan], -]) -@pytest.mark.parametrize("fill_value", [ - np.nan, 0, 1 -]) +@pytest.mark.parametrize( + "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]] +) +@pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): a = pd.SparseArray(arr, fill_value=fill_value).unique() b = pd.Series(arr).unique() diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 2d386de0d31a3..db8f62962f0b0 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -5,14 +5,17 @@ from pandas.core.sparse.api import SparseDtype -@pytest.mark.parametrize("dtype, fill_value", [ - ('int', 0), - ('float', np.nan), - ('bool', False), - ('object', np.nan), - ('datetime64[ns]', pd.NaT), - ('timedelta64[ns]', pd.NaT), -]) +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", 0), + ("float", np.nan), + ("bool", False), + ("object", np.nan), + ("datetime64[ns]", pd.NaT), + ("timedelta64[ns]", pd.NaT), + ], +) def test_inferred_dtype(dtype, fill_value): sparse_dtype = SparseDtype(dtype) result = sparse_dtype.fill_value @@ -23,28 +26,31 @@ def test_inferred_dtype(dtype, fill_value): def test_from_sparse_dtype(): - dtype = SparseDtype('float', 0) + dtype = SparseDtype("float", 0) result = SparseDtype(dtype) assert result.fill_value == 0 def test_from_sparse_dtype_fill_value(): - dtype = SparseDtype('int', 1) + dtype = SparseDtype("int", 1) result = SparseDtype(dtype, fill_value=2) - expected = SparseDtype('int', 2) + expected = SparseDtype("int", 2) assert result == expected -@pytest.mark.parametrize('dtype, fill_value', [ - ('int', None), - ('float', None), - ('bool', None), - ('object', None), - ('datetime64[ns]', None), - ('timedelta64[ns]', None), - ('int', np.nan), - ('float', 0), -]) +@pytest.mark.parametrize( + "dtype, fill_value", + [ + ("int", None), + ("float", None), + ("bool", None), + ("object", None), + ("datetime64[ns]", None), + ("timedelta64[ns]", None), + ("int", np.nan), + ("float", 0), + ], +) def test_equal(dtype, fill_value): a = SparseDtype(dtype, fill_value) b = SparseDtype(dtype, fill_value) @@ -53,64 +59,76 @@ def test_equal(dtype, fill_value): def test_nans_equal(): - a = SparseDtype(float, float('nan')) + a = SparseDtype(float, float("nan")) b = SparseDtype(float, np.nan) assert a == b assert b == a -@pytest.mark.parametrize('a, b', [ - (SparseDtype('float64'), SparseDtype('float32')), - (SparseDtype('float64'), SparseDtype('float64', 0)), - (SparseDtype('float64'), SparseDtype('datetime64[ns]', np.nan)), - (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), - 
(SparseDtype('float64'), np.dtype('float64')), -]) +@pytest.mark.parametrize( + "a, b", + [ + (SparseDtype("float64"), SparseDtype("float32")), + (SparseDtype("float64"), SparseDtype("float64", 0)), + (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), + (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), + (SparseDtype("float64"), np.dtype("float64")), + ], +) def test_not_equal(a, b): assert a != b def test_construct_from_string_raises(): with pytest.raises(TypeError): - SparseDtype.construct_from_string('not a dtype') - - -@pytest.mark.parametrize("dtype, expected", [ - (SparseDtype(int), True), - (SparseDtype(float), True), - (SparseDtype(bool), True), - (SparseDtype(object), False), - (SparseDtype(str), False), -]) + SparseDtype.construct_from_string("not a dtype") + + +@pytest.mark.parametrize( + "dtype, expected", + [ + (SparseDtype(int), True), + (SparseDtype(float), True), + (SparseDtype(bool), True), + (SparseDtype(object), False), + (SparseDtype(str), False), + ], +) def test_is_numeric(dtype, expected): assert dtype._is_numeric is expected def test_str_uses_object(): result = SparseDtype(str).subtype - assert result == np.dtype('object') - - -@pytest.mark.parametrize("string, expected", [ - ('Sparse[float64]', SparseDtype(np.dtype('float64'))), - ('Sparse[float32]', SparseDtype(np.dtype('float32'))), - ('Sparse[int]', SparseDtype(np.dtype('int'))), - ('Sparse[str]', SparseDtype(np.dtype('str'))), - ('Sparse[datetime64[ns]]', SparseDtype(np.dtype('datetime64[ns]'))), - ("Sparse", SparseDtype(np.dtype("float"), np.nan)) -]) + assert result == np.dtype("object") + + +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[float64]", SparseDtype(np.dtype("float64"))), + ("Sparse[float32]", SparseDtype(np.dtype("float32"))), + ("Sparse[int]", SparseDtype(np.dtype("int"))), + ("Sparse[str]", SparseDtype(np.dtype("str"))), + ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))), + ("Sparse", SparseDtype(np.dtype("float"), np.nan)), + ], +) def test_construct_from_string(string, expected): result = SparseDtype.construct_from_string(string) assert result == expected -@pytest.mark.parametrize("a, b, expected", [ - (SparseDtype(float, 0.0), SparseDtype(np.dtype('float'), 0.0), True), - (SparseDtype(int, 0), SparseDtype(int, 0), True), - (SparseDtype(float, float('nan')), SparseDtype(float, np.nan), True), - (SparseDtype(float, 0), SparseDtype(float, np.nan), False), - (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False), -]) +@pytest.mark.parametrize( + "a, b, expected", + [ + (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True), + (SparseDtype(int, 0), SparseDtype(int, 0), True), + (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True), + (SparseDtype(float, 0), SparseDtype(float, np.nan), False), + (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False), + ], +) def test_hash_equal(a, b, expected): result = a == b assert result is expected @@ -119,43 +137,47 @@ def test_hash_equal(a, b, expected): assert result is expected -@pytest.mark.parametrize('string, expected', [ - ('Sparse[int]', 'int'), - ('Sparse[int, 0]', 'int'), - ('Sparse[int64]', 'int64'), - ('Sparse[int64, 0]', 'int64'), - ('Sparse[datetime64[ns], 0]', 'datetime64[ns]'), -]) +@pytest.mark.parametrize( + "string, expected", + [ + ("Sparse[int]", "int"), + ("Sparse[int, 0]", "int"), + ("Sparse[int64]", "int64"), + ("Sparse[int64, 0]", "int64"), + ("Sparse[datetime64[ns], 0]", "datetime64[ns]"), + ], +) def test_parse_subtype(string, expected): subtype, _ 
= SparseDtype._parse_subtype(string) assert subtype == expected -@pytest.mark.parametrize("string", [ - "Sparse[int, 1]", - "Sparse[float, 0.0]", - "Sparse[bool, True]", -]) +@pytest.mark.parametrize( + "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"] +) def test_construct_from_string_fill_value_raises(string): - with pytest.raises(TypeError, match='fill_value in the string is not'): + with pytest.raises(TypeError, match="fill_value in the string is not"): SparseDtype.construct_from_string(string) -@pytest.mark.parametrize('original, dtype, expected', [ - (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), - (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, '1')), - (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), -]) +@pytest.mark.parametrize( + "original, dtype, expected", + [ + (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), + (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), + (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), + ], +) def test_update_dtype(original, dtype, expected): result = original.update_dtype(dtype) assert result == expected -@pytest.mark.parametrize("original, dtype", [ - (SparseDtype(float, np.nan), int), - (SparseDtype(str, 'abc'), int), -]) +@pytest.mark.parametrize( + "original, dtype", + [(SparseDtype(float, np.nan), int), (SparseDtype(str, "abc"), int)], +) def test_update_dtype_raises(original, dtype): with pytest.raises(ValueError): original.update_dtype(dtype) diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 44bda995f9a78..183eaada16452 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -12,24 +12,54 @@ TEST_LENGTH = 20 -plain_case = dict(xloc=[0, 7, 15], xlen=[3, 5, 5], yloc=[2, 9, 14], - ylen=[2, 3, 5], intersect_loc=[2, 9, 15], - intersect_len=[1, 3, 4]) -delete_blocks = dict(xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], - intersect_loc=[1], intersect_len=[3]) -split_blocks = dict(xloc=[0], xlen=[10], yloc=[0, 5], ylen=[3, 7], - intersect_loc=[0, 5], intersect_len=[3, 5]) -skip_block = dict(xloc=[10], xlen=[5], yloc=[0, 12], ylen=[5, 3], - intersect_loc=[12], intersect_len=[3]) - -no_intersect = dict(xloc=[0, 10], xlen=[4, 6], yloc=[5, 17], ylen=[4, 2], - intersect_loc=[], intersect_len=[]) +plain_case = dict( + xloc=[0, 7, 15], + xlen=[3, 5, 5], + yloc=[2, 9, 14], + ylen=[2, 3, 5], + intersect_loc=[2, 9, 15], + intersect_len=[1, 3, 4], +) +delete_blocks = dict( + xloc=[0, 5], xlen=[4, 4], yloc=[1], ylen=[4], intersect_loc=[1], intersect_len=[3] +) +split_blocks = dict( + xloc=[0], + xlen=[10], + yloc=[0, 5], + ylen=[3, 7], + intersect_loc=[0, 5], + intersect_len=[3, 5], +) +skip_block = dict( + xloc=[10], + xlen=[5], + yloc=[0, 12], + ylen=[5, 3], + intersect_loc=[12], + intersect_len=[3], +) + +no_intersect = dict( + xloc=[0, 10], + xlen=[4, 6], + yloc=[5, 17], + ylen=[4, 2], + intersect_loc=[], + intersect_len=[], +) def check_cases(_check_case): def _check_case_dict(case): - _check_case(case['xloc'], case['xlen'], case['yloc'], case['ylen'], - case['intersect_loc'], case['intersect_len']) + _check_case( + case["xloc"], + case["xlen"], + case["yloc"], + case["ylen"], + case["intersect_loc"], + case["intersect_len"], + ) _check_case_dict(plain_case) _check_case_dict(delete_blocks) @@ -43,24 +73,22 @@ def _check_case_dict(case): class TestSparseIndexUnion: - def 
test_index_make_union(self): def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) yindex = BlockIndex(TEST_LENGTH, yloc, ylen) bresult = xindex.make_union(yindex) - assert (isinstance(bresult, BlockIndex)) - tm.assert_numpy_array_equal(bresult.blocs, - np.array(eloc, dtype=np.int32)) - tm.assert_numpy_array_equal(bresult.blengths, - np.array(elen, dtype=np.int32)) + assert isinstance(bresult, BlockIndex) + tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32)) + tm.assert_numpy_array_equal( + bresult.blengths, np.array(elen, dtype=np.int32) + ) ixindex = xindex.to_int_index() iyindex = yindex.to_int_index() iresult = ixindex.make_union(iyindex) - assert (isinstance(iresult, IntIndex)) - tm.assert_numpy_array_equal(iresult.indices, - bresult.to_int_index().indices) + assert isinstance(iresult, IntIndex) + tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices) """ x: ---- @@ -191,12 +219,11 @@ def test_int_index_make_union(self): class TestSparseIndexIntersect: - @td.skip_if_windows def test_intersect(self): def _check_correct(a, b, expected): result = a.intersect(b) - assert (result.equals(expected)) + assert result.equals(expected) def _check_length_exc(a, longer): msg = "Indices must reference same underlying length" @@ -210,12 +237,12 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen) _check_correct(xindex, yindex, expected) - _check_correct(xindex.to_int_index(), yindex.to_int_index(), - expected.to_int_index()) + _check_correct( + xindex.to_int_index(), yindex.to_int_index(), expected.to_int_index() + ) _check_length_exc(xindex, longer_index) - _check_length_exc(xindex.to_int_index(), - longer_index.to_int_index()) + _check_length_exc(xindex.to_int_index(), longer_index.to_int_index()) check_cases(_check_case) @@ -231,10 +258,12 @@ def test_intersect_empty(self): assert yindex.intersect(xindex).equals(xindex) def test_intersect_identical(self): - cases = [IntIndex(5, np.array([1, 2], dtype=np.int32)), - IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), - IntIndex(0, np.array([], dtype=np.int32)), - IntIndex(5, np.array([], dtype=np.int32))] + cases = [ + IntIndex(5, np.array([1, 2], dtype=np.int32)), + IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), + IntIndex(0, np.array([], dtype=np.int32)), + IntIndex(5, np.array([], dtype=np.int32)), + ] for case in cases: assert case.intersect(case).equals(case) @@ -243,64 +272,49 @@ def test_intersect_identical(self): class TestSparseIndexCommon: - def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.indices, - np.array([2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.indices, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='integer') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 4 - 
tm.assert_numpy_array_equal(idx.indices, - np.array([0, 1, 2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.blocs, - np.array([2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.blocs, - np.array([], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([4], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0, 2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([1, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) def test_lookup(self): - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == -1 @@ -314,8 +328,7 @@ def test_lookup(self): for i in range(-1, 5): assert idx.lookup(i) == -1 - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == 1 @@ -323,8 +336,7 @@ def test_lookup(self): assert idx.lookup(3) == 3 assert idx.lookup(4) == -1 - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == -1 @@ -333,7 +345,7 @@ def test_lookup(self): assert idx.lookup(4) == -1 def test_lookup_array(self): - for kind in ['integer', 'block']: + for kind in ["integer", "block"]: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) @@ -348,8 +360,7 @@ def test_lookup_array(self): res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) exp = np.array([-1, -1, -1, -1], dtype=np.int32) - idx = 
_make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) exp = np.array([-1, 0, 2], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -358,8 +369,7 @@ def test_lookup_array(self): exp = np.array([-1, 2, 1, 3], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), - kind=kind) + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) exp = np.array([1, -1, 2, 0], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -370,16 +380,16 @@ def test_lookup_array(self): def test_lookup_basics(self): def _check(index): - assert (index.lookup(0) == -1) - assert (index.lookup(5) == 0) - assert (index.lookup(7) == 2) - assert (index.lookup(8) == -1) - assert (index.lookup(9) == -1) - assert (index.lookup(10) == -1) - assert (index.lookup(11) == -1) - assert (index.lookup(12) == 3) - assert (index.lookup(17) == 8) - assert (index.lookup(18) == -1) + assert index.lookup(0) == -1 + assert index.lookup(5) == 0 + assert index.lookup(7) == 2 + assert index.lookup(8) == -1 + assert index.lookup(9) == -1 + assert index.lookup(10) == -1 + assert index.lookup(11) == -1 + assert index.lookup(12) == 3 + assert index.lookup(17) == 8 + assert index.lookup(18) == -1 bindex = BlockIndex(20, [5, 12], [3, 6]) iindex = bindex.to_int_index() @@ -391,50 +401,38 @@ def _check(index): class TestBlockIndex: - def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.blocs, - np.array([2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.blocs, - np.array([], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='block') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([4], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind='block') + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 - tm.assert_numpy_array_equal(idx.blocs, - np.array([0, 2], dtype=np.int32)) - tm.assert_numpy_array_equal(idx.blengths, - np.array([1, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], 
dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32)) def test_make_block_boundary(self): for i in [5, 10, 100, 101]: - idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), - kind='block') + idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") exp = np.arange(0, i, 2, dtype=np.int32) tm.assert_numpy_array_equal(idx.blocs, exp) - tm.assert_numpy_array_equal(idx.blengths, - np.ones(len(exp), dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32)) def test_equals(self): index = BlockIndex(10, [0, 4], [2, 5]) @@ -469,8 +467,7 @@ def test_to_int_index(self): block = BlockIndex(20, locs, lengths) dense = block.to_int_index() - tm.assert_numpy_array_equal(dense.indices, - np.array(exp_inds, dtype=np.int32)) + tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32)) def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) @@ -478,7 +475,6 @@ def test_to_block_index(self): class TestIntIndex: - def test_check_integrity(self): # Too many indices than specified in self.length @@ -518,24 +514,20 @@ def test_check_integrity(self): IntIndex(length=5, indices=[1, 3, 3]) def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 - tm.assert_numpy_array_equal(idx.indices, - np.array([2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 - tm.assert_numpy_array_equal(idx.indices, - np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), - kind='integer') + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 4 - tm.assert_numpy_array_equal(idx.indices, - np.array([0, 1, 2, 3], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) def test_equals(self): index = IntIndex(10, [0, 1, 2, 3, 4]) @@ -543,7 +535,6 @@ def test_equals(self): assert not index.equals(IntIndex(10, [0, 1, 2, 3])) def test_to_block_index(self): - def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) yindex = BlockIndex(TEST_LENGTH, yloc, ylen) @@ -563,7 +554,6 @@ def test_to_int_index(self): class TestSparseOperators: - def _op_tests(self, sparse_op, python_op): def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xindex = BlockIndex(TEST_LENGTH, xloc, xlen) @@ -572,16 +562,18 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xdindex = xindex.to_int_index() ydindex = yindex.to_int_index() - x = np.arange(xindex.npoints) * 10. + 1 - y = np.arange(yindex.npoints) * 100. 
+ 1 + x = np.arange(xindex.npoints) * 10.0 + 1 + y = np.arange(yindex.npoints) * 100.0 + 1 xfill = 0 yfill = 2 - result_block_vals, rb_index, bfill = sparse_op(x, xindex, xfill, y, - yindex, yfill) - result_int_vals, ri_index, ifill = sparse_op(x, xdindex, xfill, y, - ydindex, yfill) + result_block_vals, rb_index, bfill = sparse_op( + x, xindex, xfill, y, yindex, yfill + ) + result_int_vals, ri_index, ifill = sparse_op( + x, xdindex, xfill, y, ydindex, yfill + ) assert rb_index.to_int_index().equals(ri_index) tm.assert_numpy_array_equal(result_block_vals, result_int_vals) @@ -597,15 +589,13 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): series_result = python_op(xseries, yseries) series_result = series_result.reindex(ri_index.indices) - tm.assert_numpy_array_equal(result_block_vals, - series_result.values) + tm.assert_numpy_array_equal(result_block_vals, series_result.values) tm.assert_numpy_array_equal(result_int_vals, series_result.values) check_cases(_check_case) - @pytest.mark.parametrize('opname', - ['add', 'sub', 'mul', 'truediv', 'floordiv']) + @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"]) def test_op(self, opname): - sparse_op = getattr(splib, 'sparse_%s_float64' % opname) + sparse_op = getattr(splib, "sparse_%s_float64" % opname) python_op = getattr(operator, opname) self._op_tests(sparse_op, python_op) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index d097141cd8c73..e8d9ecfac61e4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -11,95 +11,126 @@ from pandas.api.extensions import register_extension_dtype from pandas.api.types import is_scalar from pandas.core.arrays import PandasArray, integer_array, period_array -from pandas.tests.extension.decimal import ( - DecimalArray, DecimalDtype, to_decimal) +from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal import pandas.util.testing as tm -@pytest.mark.parametrize("data, dtype, expected", [ - # Basic NumPy defaults. 
- ([1, 2], None, PandasArray(np.array([1, 2]))), - ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), - ([1, 2], np.dtype('float32'), - PandasArray(np.array([1., 2.0], dtype=np.dtype('float32')))), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), - - # String alias passes through to NumPy - ([1, 2], 'float32', PandasArray(np.array([1, 2], dtype='float32'))), - - # Period alias - ([pd.Period('2000', 'D'), pd.Period('2001', 'D')], 'Period[D]', - period_array(['2000', '2001'], freq='D')), - - # Period dtype - ([pd.Period('2000', 'D')], pd.PeriodDtype('D'), - period_array(['2000'], freq='D')), - - # Datetime (naive) - ([1, 2], np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype='datetime64[ns]'))), - - (np.array([1, 2], dtype='datetime64[ns]'), None, - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype='datetime64[ns]'))), - - (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (pd.DatetimeIndex(['2000', '2001']), None, - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (['2000', '2001'], np.dtype('datetime64[ns]'), - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - # Datetime (tz-aware) - (['2000', '2001'], pd.DatetimeTZDtype(tz="CET"), - pd.arrays.DatetimeArray._from_sequence( - ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz="CET"))), - - # Timedelta - (['1H', '2H'], np.dtype('timedelta64[ns]'), - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (pd.TimedeltaIndex(['1H', '2H']), np.dtype('timedelta64[ns]'), - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (pd.TimedeltaIndex(['1H', '2H']), None, - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - # Category - (['a', 'b'], 'category', pd.Categorical(['a', 'b'])), - (['a', 'b'], pd.CategoricalDtype(None, ordered=True), - pd.Categorical(['a', 'b'], ordered=True)), - - # Interval - ([pd.Interval(1, 2), pd.Interval(3, 4)], 'interval', - pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)])), - - # Sparse - ([0, 1], 'Sparse[int64]', pd.SparseArray([0, 1], dtype='int64')), - - # IntegerNA - ([1, None], 'Int16', integer_array([1, None], dtype='Int16')), - (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), - - # Index - (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), - - # Series[EA] returns the EA - (pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), - None, - pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])), - - # "3rd party" EAs work - ([decimal.Decimal(0), decimal.Decimal(1)], 'decimal', to_decimal([0, 1])), - - # pass an ExtensionArray, but a different dtype - (period_array(['2000', '2001'], freq='D'), - 'category', - pd.Categorical([pd.Period('2000', 'D'), pd.Period('2001', 'D')])), -]) +@pytest.mark.parametrize( + "data, dtype, expected", + [ + # Basic NumPy defaults. 
+ ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), + ( + [1, 2], + np.dtype("float32"), + PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), + ), + (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + # String alias passes through to NumPy + ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), + # Period alias + ( + [pd.Period("2000", "D"), pd.Period("2001", "D")], + "Period[D]", + period_array(["2000", "2001"], freq="D"), + ), + # Period dtype + ( + [pd.Period("2000", "D")], + pd.PeriodDtype("D"), + period_array(["2000"], freq="D"), + ), + # Datetime (naive) + ( + [1, 2], + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence( + np.array([1, 2], dtype="datetime64[ns]") + ), + ), + ( + np.array([1, 2], dtype="datetime64[ns]"), + None, + pd.arrays.DatetimeArray._from_sequence( + np.array([1, 2], dtype="datetime64[ns]") + ), + ), + ( + pd.DatetimeIndex(["2000", "2001"]), + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + pd.DatetimeIndex(["2000", "2001"]), + None, + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + ["2000", "2001"], + np.dtype("datetime64[ns]"), + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + # Datetime (tz-aware) + ( + ["2000", "2001"], + pd.DatetimeTZDtype(tz="CET"), + pd.arrays.DatetimeArray._from_sequence( + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") + ), + ), + # Timedelta + ( + ["1H", "2H"], + np.dtype("timedelta64[ns]"), + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + pd.TimedeltaIndex(["1H", "2H"]), + np.dtype("timedelta64[ns]"), + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + pd.TimedeltaIndex(["1H", "2H"]), + None, + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + # Category + (["a", "b"], "category", pd.Categorical(["a", "b"])), + ( + ["a", "b"], + pd.CategoricalDtype(None, ordered=True), + pd.Categorical(["a", "b"], ordered=True), + ), + # Interval + ( + [pd.Interval(1, 2), pd.Interval(3, 4)], + "interval", + pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]), + ), + # Sparse + ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")), + # IntegerNA + ([1, None], "Int16", integer_array([1, None], dtype="Int16")), + (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # Index + (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # Series[EA] returns the EA + ( + pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])), + None, + pd.Categorical(["a", "b"], categories=["a", "b", "c"]), + ), + # "3rd party" EAs work + ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])), + # pass an ExtensionArray, but a different dtype + ( + period_array(["2000", "2001"], freq="D"), + "category", + pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]), + ), + ], +) def test_array(data, dtype, expected): result = pd.array(data, dtype=dtype) tm.assert_equal(result, expected) @@ -123,85 +154,101 @@ def test_array_copy(): cet = pytz.timezone("CET") -@pytest.mark.parametrize('data, expected', [ - # period - ([pd.Period("2000", "D"), pd.Period("2001", "D")], - period_array(["2000", "2001"], freq="D")), - - # interval - ([pd.Interval(0, 1), pd.Interval(1, 2)], - pd.arrays.IntervalArray.from_breaks([0, 1, 2])), - - # datetime - ([pd.Timestamp('2000',), pd.Timestamp('2001')], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - 
- ([datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), - - (np.array([1, 2], dtype='M8[ns]'), - pd.arrays.DatetimeArray(np.array([1, 2], dtype='M8[ns]'))), - - (np.array([1, 2], dtype='M8[us]'), - pd.arrays.DatetimeArray(np.array([1000, 2000], dtype='M8[ns]'))), - - # datetimetz - ([pd.Timestamp('2000', tz='CET'), pd.Timestamp('2001', tz='CET')], - pd.arrays.DatetimeArray._from_sequence( - ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz='CET'))), - - ([datetime.datetime(2000, 1, 1, tzinfo=cet), - datetime.datetime(2001, 1, 1, tzinfo=cet)], - pd.arrays.DatetimeArray._from_sequence(['2000', '2001'], - tz=cet)), - - # timedelta - ([pd.Timedelta('1H'), pd.Timedelta('2H')], - pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), - - (np.array([1, 2], dtype='m8[ns]'), - pd.arrays.TimedeltaArray(np.array([1, 2], dtype='m8[ns]'))), - - (np.array([1, 2], dtype='m8[us]'), - pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype='m8[ns]'))), - -]) +@pytest.mark.parametrize( + "data, expected", + [ + # period + ( + [pd.Period("2000", "D"), pd.Period("2001", "D")], + period_array(["2000", "2001"], freq="D"), + ), + # interval + ( + [pd.Interval(0, 1), pd.Interval(1, 2)], + pd.arrays.IntervalArray.from_breaks([0, 1, 2]), + ), + # datetime + ( + [pd.Timestamp("2000"), pd.Timestamp("2001")], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + ), + ( + np.array([1, 2], dtype="M8[ns]"), + pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + ), + ( + np.array([1, 2], dtype="M8[us]"), + pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + ), + # datetimetz + ( + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], + pd.arrays.DatetimeArray._from_sequence( + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") + ), + ), + ( + [ + datetime.datetime(2000, 1, 1, tzinfo=cet), + datetime.datetime(2001, 1, 1, tzinfo=cet), + ], + pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + ), + # timedelta + ( + [pd.Timedelta("1H"), pd.Timedelta("2H")], + pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + ), + ( + np.array([1, 2], dtype="m8[ns]"), + pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + ), + ( + np.array([1, 2], dtype="m8[us]"), + pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), + ), + ], +) def test_array_inference(data, expected): result = pd.array(data) tm.assert_equal(result, expected) -@pytest.mark.parametrize('data', [ - # mix of frequencies - [pd.Period("2000", "D"), pd.Period("2001", "A")], - # mix of closed - [pd.Interval(0, 1, closed='left'), pd.Interval(1, 2, closed='right')], - # Mix of timezones - [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")], - # Mix of tz-aware and tz-naive - [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")], - np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')]), -]) +@pytest.mark.parametrize( + "data", + [ + # mix of frequencies + [pd.Period("2000", "D"), pd.Period("2001", "A")], + # mix of closed + [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")], + # Mix of timezones + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")], + # Mix of tz-aware and tz-naive + [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")], + np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]), + ], +) def 
test_array_inference_fails(data): result = pd.array(data) expected = PandasArray(np.array(data, dtype=object)) tm.assert_extension_array_equal(result, expected) -@pytest.mark.parametrize("data", [ - np.array([[1, 2], [3, 4]]), - [[1, 2], [3, 4]], -]) +@pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): - with pytest.raises(ValueError, match='PandasArray must be 1-dimensional'): + with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): pd.array(data) def test_scalar_raises(): - with pytest.raises(ValueError, - match="Cannot pass scalar '1'"): + with pytest.raises(ValueError, match="Cannot pass scalar '1'"): pd.array(1) + # --------------------------------------------------------------------------- # A couple dummy classes to ensure that Series and Indexes are unboxed before # getting to the EA classes. @@ -209,7 +256,7 @@ def test_scalar_raises(): @register_extension_dtype class DecimalDtype2(DecimalDtype): - name = 'decimal2' + name = "decimal2" @classmethod def construct_array_type(cls): @@ -227,12 +274,12 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_array_unboxes(box): - data = box([decimal.Decimal('1'), decimal.Decimal('2')]) + data = box([decimal.Decimal("1"), decimal.Decimal("2")]) # make sure it works with pytest.raises(TypeError): DecimalArray2._from_sequence(data) - result = pd.array(data, dtype='decimal2') + result = pd.array(data, dtype="decimal2") expected = DecimalArray2._from_sequence(data.values) tm.assert_equal(result, expected) @@ -247,8 +294,8 @@ def registry_without_decimal(): def test_array_not_registered(registry_without_decimal): # check we aren't on it - assert registry.find('decimal') is None - data = [decimal.Decimal('1'), decimal.Decimal('2')] + assert registry.find("decimal") is None + data = [decimal.Decimal("1"), decimal.Decimal("2")] result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) @@ -257,13 +304,13 @@ def test_array_not_registered(registry_without_decimal): class TestArrayAnalytics: def test_searchsorted(self, string_dtype): - arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + arr = pd.array(["a", "b", "c"], dtype=string_dtype) - result = arr.searchsorted('a', side='left') + result = arr.searchsorted("a", side="left") assert is_scalar(result) assert result == 0 - result = arr.searchsorted('a', side='right') + result = arr.searchsorted("a", side="right") assert is_scalar(result) assert result == 1 @@ -283,13 +330,23 @@ def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): expected = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('arr, val', [ - [pd.date_range('20120101', periods=10, freq='2D'), - pd.Timestamp('20120102')], - [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), - pd.Timestamp('20120102', tz='Asia/Hong_Kong')], - [pd.timedelta_range(start='1 day', end='10 days', periods=10), - pd.Timedelta('2 days')]]) + @pytest.mark.parametrize( + "arr, val", + [ + [ + pd.date_range("20120101", periods=10, freq="2D"), + pd.Timestamp("20120102"), + ], + [ + pd.date_range("20120101", periods=10, freq="2D", tz="Asia/Hong_Kong"), + pd.Timestamp("20120102", tz="Asia/Hong_Kong"), + ], + [ + pd.timedelta_range(start="1 day", end="10 days", periods=10), + pd.Timedelta("2 days"), + ], + ], + ) def test_search_sorted_datetime64_scalar(self, arr, val): arr = pd.array(arr) result = 
arr.searchsorted(val) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 2337d8363155c..34fae1f4b1ab4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -7,7 +7,7 @@ # TODO: more freq variants -@pytest.fixture(params=['D', 'B', 'W', 'M', 'Q', 'Y']) +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) def period_index(request): """ A fixture to provide PeriodIndex objects with different frequencies. @@ -18,13 +18,11 @@ def period_index(request): """ freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates - pi = pd.period_range(start=pd.Timestamp('2000-01-01'), - periods=100, - freq=freqstr) + pi = pd.period_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi -@pytest.fixture(params=['D', 'B', 'W', 'M', 'Q', 'Y']) +@pytest.fixture(params=["D", "B", "W", "M", "Q", "Y"]) def datetime_index(request): """ A fixture to provide DatetimeIndex objects with different frequencies. @@ -35,9 +33,7 @@ def datetime_index(request): """ freqstr = request.param # TODO: non-monotone indexes; NaTs, different start dates, timezones - pi = pd.date_range(start=pd.Timestamp('2000-01-01'), - periods=100, - freq=freqstr) + pi = pd.date_range(start=pd.Timestamp("2000-01-01"), periods=100, freq=freqstr) return pi @@ -50,7 +46,7 @@ def timedelta_index(request): the TimedeltaIndex behavior. """ # TODO: flesh this out - return pd.TimedeltaIndex(['1 Day', '3 Hours', 'NaT']) + return pd.TimedeltaIndex(["1 Day", "3 Hours", "NaT"]) class SharedTests: @@ -59,9 +55,9 @@ class SharedTests: def test_compare_len1_raises(self): # make sure we raise when comparing with different lengths, specific # to the case where one has length-1, which numpy would broadcast - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) with pytest.raises(ValueError, match="Lengths must match"): @@ -72,10 +68,10 @@ def test_compare_len1_raises(self): idx <= idx[[0]] def test_take(self): - data = np.arange(100, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) takers = [1, 4, 94] @@ -91,9 +87,9 @@ def test_take(self): tm.assert_index_equal(self.index_cls(result), expected) def test_take_fill(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D') + idx = self.index_cls._simple_new(data, freq="D") arr = self.array_cls(idx) result = arr.take([-1, 1], allow_fill=True, fill_value=None) @@ -112,13 +108,12 @@ def test_take_fill(self): arr.take([0, 1], allow_fill=True, fill_value=2.0) with pytest.raises(ValueError): - arr.take([0, 1], allow_fill=True, - fill_value=pd.Timestamp.now().time) + arr.take([0, 1], allow_fill=True, fill_value=pd.Timestamp.now().time) def test_concat_same_type(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq='D').insert(0, pd.NaT) + idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) arr = self.array_cls(idx) result = arr._concat_same_type([arr[:-1], arr[1:], arr]) @@ 
-127,8 +122,8 @@ def test_concat_same_type(self): tm.assert_index_equal(self.index_cls(result), expected) def test_unbox_scalar(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") result = arr._unbox_scalar(arr[0]) assert isinstance(result, int) @@ -136,36 +131,36 @@ def test_unbox_scalar(self): assert isinstance(result, int) with pytest.raises(ValueError): - arr._unbox_scalar('foo') + arr._unbox_scalar("foo") def test_check_compatible_with(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr._check_compatible_with(arr[0]) arr._check_compatible_with(arr[:1]) arr._check_compatible_with(pd.NaT) def test_scalar_from_string(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") result = arr._scalar_from_string(str(arr[0])) assert result == arr[0] def test_reduce_invalid(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") - with pytest.raises(TypeError, match='cannot perform'): + with pytest.raises(TypeError, match="cannot perform"): arr._reduce("not a method") - @pytest.mark.parametrize('method', ['pad', 'backfill']) + @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_method_doesnt_change_orig(self, method): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr[4] = pd.NaT - fill_value = arr[3] if method == 'pad' else arr[5] + fill_value = arr[3] if method == "pad" else arr[5] result = arr.fillna(method=method) assert result[4] == fill_value @@ -174,8 +169,8 @@ def test_fillna_method_doesnt_change_orig(self, method): assert arr[4] is pd.NaT def test_searchsorted(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") # scalar result = arr.searchsorted(arr[1]) @@ -199,11 +194,11 @@ def test_searchsorted(self): assert result == 0 def test_setitem(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") arr[0] = arr[1] - expected = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 + expected = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 expected[0] = expected[1] tm.assert_numpy_array_equal(arr.asi8, expected) @@ -213,8 +208,8 @@ def test_setitem(self): tm.assert_numpy_array_equal(arr.asi8, expected) def test_setitem_raises(self): - data = np.arange(10, dtype='i8') * 24 * 3600 * 10**9 - arr = self.array_cls(data, freq='D') + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") val = arr[0] with pytest.raises(IndexError, match="index 12 is out of bounds"): @@ -231,9 +226,9 @@ class TestDatetimeArray(SharedTests): def test_round(self, tz_naive_fixture): # GH#24064 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01 01:01:00', periods=3, freq='H', 
tz=tz) + dti = pd.date_range("2016-01-01 01:01:00", periods=3, freq="H", tz=tz) - result = dti.round(freq='2T') + result = dti.round(freq="2T") expected = dti - pd.Timedelta(minutes=1) tm.assert_index_equal(result, expected) @@ -250,14 +245,14 @@ def test_array_interface(self, datetime_index): tm.assert_numpy_array_equal(result, expected) # specifying M8[ns] gives the same result as default - result = np.asarray(arr, dtype='datetime64[ns]') + result = np.asarray(arr, dtype="datetime64[ns]") expected = arr._data assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]', copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=False) assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]') + result = np.array(arr, dtype="datetime64[ns]") assert result is not expected tm.assert_numpy_array_equal(result, expected) @@ -267,14 +262,14 @@ def test_array_interface(self, datetime_index): tm.assert_numpy_array_equal(result, expected) # to other dtype always copies - result = np.asarray(arr, dtype='int64') + result = np.asarray(arr, dtype="int64") assert result is not arr.asi8 assert not np.may_share_memory(arr, result) expected = arr.asi8.copy() tm.assert_numpy_array_equal(result, expected) # other dtypes handled by numpy - for dtype in ['float64', str]: + for dtype in ["float64", str]: result = np.asarray(arr, dtype=dtype) expected = np.asarray(arr).astype(dtype) tm.assert_numpy_array_equal(result, expected) @@ -282,7 +277,7 @@ def test_array_interface(self, datetime_index): def test_array_object_dtype(self, tz_naive_fixture): # GH#23524 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) expected = np.array(list(dti)) @@ -297,44 +292,44 @@ def test_array_object_dtype(self, tz_naive_fixture): def test_array_tz(self, tz_naive_fixture): # GH#23524 tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) - expected = dti.asi8.view('M8[ns]') - result = np.array(arr, dtype='M8[ns]') + expected = dti.asi8.view("M8[ns]") + result = np.array(arr, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='datetime64[ns]') + result = np.array(arr, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # check that we are not making copies when setting copy=False - result = np.array(arr, dtype='M8[ns]', copy=False) + result = np.array(arr, dtype="M8[ns]", copy=False) assert result.base is expected.base assert result.base is not None - result = np.array(arr, dtype='datetime64[ns]', copy=False) + result = np.array(arr, dtype="datetime64[ns]", copy=False) assert result.base is expected.base assert result.base is not None def test_array_i8_dtype(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) expected = dti.asi8 - result = np.array(arr, dtype='i8') + result = np.array(arr, dtype="i8") tm.assert_numpy_array_equal(result, expected) result = np.array(arr, dtype=np.int64) tm.assert_numpy_array_equal(result, expected) # check that we are still making copies when setting copy=False - result = np.array(arr, dtype='i8', copy=False) + result = np.array(arr, dtype="i8", copy=False) assert result.base is not 
expected.base assert result.base is None def test_from_array_keeps_base(self): # Ensure that DatetimeArray._data.base isn't lost. - arr = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]') + arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") dta = DatetimeArray(arr) assert dta._data is arr @@ -343,7 +338,7 @@ def test_from_array_keeps_base(self): def test_from_dti(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) assert list(dti) == list(arr) @@ -354,14 +349,14 @@ def test_from_dti(self, tz_naive_fixture): def test_astype_object(self, tz_naive_fixture): tz = tz_naive_fixture - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) arr = DatetimeArray(dti) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(dti) - @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y']) + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_perioddelta(self, datetime_index, freqstr): # GH#23113 dti = datetime_index @@ -375,7 +370,7 @@ def test_to_perioddelta(self, datetime_index, freqstr): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('freqstr', ['D', 'B', 'W', 'M', 'Q', 'Y']) + @pytest.mark.parametrize("freqstr", ["D", "B", "W", "M", "Q", "Y"]) def test_to_period(self, datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) @@ -388,7 +383,7 @@ def test_to_period(self, datetime_index, freqstr): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('propname', pd.DatetimeIndex._bool_ops) + @pytest.mark.parametrize("propname", pd.DatetimeIndex._bool_ops) def test_bool_properties(self, datetime_index, propname): # in this case _bool_ops is just `is_leap_year` dti = datetime_index @@ -400,7 +395,7 @@ def test_bool_properties(self, datetime_index, propname): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('propname', pd.DatetimeIndex._field_ops) + @pytest.mark.parametrize("propname", pd.DatetimeIndex._field_ops) def test_int_properties(self, datetime_index, propname): dti = datetime_index arr = DatetimeArray(dti) @@ -424,9 +419,9 @@ def test_take_fill_valid(self, datetime_index, tz_naive_fixture): with pytest.raises(ValueError): # fill_value Period invalid - arr.take([-1, 1], allow_fill=True, fill_value=pd.Period('2014Q1')) + arr.take([-1, 1], allow_fill=True, fill_value=pd.Period("2014Q1")) - tz = None if dti.tz is not None else 'US/Eastern' + tz = None if dti.tz is not None else "US/Eastern" now = pd.Timestamp.now().tz_localize(tz) with pytest.raises(TypeError): # Timestamp with mismatched tz-awareness @@ -442,7 +437,7 @@ def test_concat_same_type_invalid(self, datetime_index): arr = DatetimeArray(dti) if arr.tz is None: - other = arr.tz_localize('UTC') + other = arr.tz_localize("UTC") else: other = arr.tz_localize(None) @@ -451,15 +446,19 @@ def test_concat_same_type_invalid(self, datetime_index): def test_concat_same_type_different_freq(self): # we *can* concatenate DTI with different freqs. 
- a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', - tz='US/Central')) - b = DatetimeArray(pd.date_range('2000', periods=2, freq='H', - tz='US/Central')) + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) + b = DatetimeArray(pd.date_range("2000", periods=2, freq="H", tz="US/Central")) result = DatetimeArray._concat_same_type([a, b]) - expected = DatetimeArray(pd.to_datetime([ - '2000-01-01 00:00:00', '2000-01-02 00:00:00', - '2000-01-01 00:00:00', '2000-01-01 01:00:00', - ]).tz_localize("US/Central")) + expected = DatetimeArray( + pd.to_datetime( + [ + "2000-01-01 00:00:00", + "2000-01-02 00:00:00", + "2000-01-01 00:00:00", + "2000-01-01 01:00:00", + ] + ).tz_localize("US/Central") + ) tm.assert_datetime_array_equal(result, expected) @@ -469,7 +468,7 @@ class TestTimedeltaArray(SharedTests): array_cls = TimedeltaArray def test_from_tdi(self): - tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) arr = TimedeltaArray(tdi) assert list(arr) == list(tdi) @@ -479,11 +478,11 @@ def test_from_tdi(self): assert list(tdi2) == list(arr) def test_astype_object(self): - tdi = pd.TimedeltaIndex(['1 Day', '3 Hours']) + tdi = pd.TimedeltaIndex(["1 Day", "3 Hours"]) arr = TimedeltaArray(tdi) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(tdi) def test_to_pytimedelta(self, timedelta_index): @@ -504,7 +503,7 @@ def test_total_seconds(self, timedelta_index): tm.assert_numpy_array_equal(result, expected.values) - @pytest.mark.parametrize('propname', pd.TimedeltaIndex._field_ops) + @pytest.mark.parametrize("propname", pd.TimedeltaIndex._field_ops) def test_int_properties(self, timedelta_index, propname): tdi = timedelta_index arr = TimedeltaArray(tdi) @@ -527,14 +526,14 @@ def test_array_interface(self, timedelta_index): tm.assert_numpy_array_equal(result, expected) # specifying m8[ns] gives the same result as default - result = np.asarray(arr, dtype='timedelta64[ns]') + result = np.asarray(arr, dtype="timedelta64[ns]") expected = arr._data assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='timedelta64[ns]', copy=False) + result = np.array(arr, dtype="timedelta64[ns]", copy=False) assert result is expected tm.assert_numpy_array_equal(result, expected) - result = np.array(arr, dtype='timedelta64[ns]') + result = np.array(arr, dtype="timedelta64[ns]") assert result is not expected tm.assert_numpy_array_equal(result, expected) @@ -544,14 +543,14 @@ def test_array_interface(self, timedelta_index): tm.assert_numpy_array_equal(result, expected) # to other dtype always copies - result = np.asarray(arr, dtype='int64') + result = np.asarray(arr, dtype="int64") assert result is not arr.asi8 assert not np.may_share_memory(arr, result) expected = arr.asi8.copy() tm.assert_numpy_array_equal(result, expected) # other dtypes handled by numpy - for dtype in ['float64', str]: + for dtype in ["float64", str]: result = np.asarray(arr, dtype=dtype) expected = np.asarray(arr).astype(dtype) tm.assert_numpy_array_equal(result, expected) @@ -571,7 +570,7 @@ def test_take_fill_valid(self, timedelta_index): with pytest.raises(ValueError): # fill_value Period invalid - arr.take([0, 1], allow_fill=True, fill_value=now.to_period('D')) + arr.take([0, 1], allow_fill=True, fill_value=now.to_period("D")) class TestPeriodArray(SharedTests): @@ -591,12 +590,12 @@ def test_from_pi(self, 
period_index): def test_astype_object(self, period_index): pi = period_index arr = PeriodArray(pi) - asobj = arr.astype('O') + asobj = arr.astype("O") assert isinstance(asobj, np.ndarray) - assert asobj.dtype == 'O' + assert asobj.dtype == "O" assert list(asobj) == list(pi) - @pytest.mark.parametrize('how', ['S', 'E']) + @pytest.mark.parametrize("how", ["S", "E"]) def test_to_timestamp(self, how, period_index): pi = period_index arr = PeriodArray(pi) @@ -609,7 +608,7 @@ def test_to_timestamp(self, how, period_index): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) - @pytest.mark.parametrize('propname', PeriodArray._bool_ops) + @pytest.mark.parametrize("propname", PeriodArray._bool_ops) def test_bool_properties(self, period_index, propname): # in this case _bool_ops is just `is_leap_year` pi = period_index @@ -620,7 +619,7 @@ def test_bool_properties(self, period_index, propname): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('propname', PeriodArray._field_ops) + @pytest.mark.parametrize("propname", PeriodArray._field_ops) def test_int_properties(self, period_index, propname): pi = period_index arr = PeriodArray(pi) @@ -644,11 +643,11 @@ def test_array_interface(self, period_index): # to other dtypes with pytest.raises(TypeError): - np.asarray(arr, dtype='int64') + np.asarray(arr, dtype="int64") with pytest.raises(TypeError): - np.asarray(arr, dtype='float64') + np.asarray(arr, dtype="float64") - result = np.asarray(arr, dtype='S20') - expected = np.asarray(arr).astype('S20') + result = np.asarray(arr, dtype="S20") + expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index c7c0e1180ce46..58c2f3fc65bb2 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -15,9 +15,8 @@ class TestDatetimeArrayConstructor: - def test_only_1dim_accepted(self): - arr = np.array([0, 1, 2, 3], dtype='M8[h]').astype('M8[ns]') + arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): # 2-dim @@ -30,24 +29,33 @@ def test_only_1dim_accepted(self): def test_freq_validation(self): # GH#24623 check that invalid instances cannot be created with the # public constructor - arr = np.arange(5, dtype=np.int64) * 3600 * 10**9 + arr = np.arange(5, dtype=np.int64) * 3600 * 10 ** 9 - msg = ("Inferred frequency H from passed values does not " - "conform to passed frequency W-SUN") + msg = ( + "Inferred frequency H from passed values does not " + "conform to passed frequency W-SUN" + ) with pytest.raises(ValueError, match=msg): DatetimeArray(arr, freq="W") - @pytest.mark.parametrize('meth', [DatetimeArray._from_sequence, - sequence_to_dt64ns, - pd.to_datetime, - pd.DatetimeIndex]) + @pytest.mark.parametrize( + "meth", + [ + DatetimeArray._from_sequence, + sequence_to_dt64ns, + pd.to_datetime, + pd.DatetimeIndex, + ], + ) def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 - arr = np.array([pd.Timestamp('2000'), pd.Timestamp('2000', tz='CET')]) + arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ('Cannot mix tz-aware with tz-naive values|' - 'Tz-aware datetime.datetime cannot be converted ' - 'to datetime64 unless utc=True') + msg = ( + "Cannot mix tz-aware with tz-naive values|" + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True" + ) for obj in [arr, 
arr[::-1]]: # check that we raise regardless of whether naive is found @@ -56,39 +64,42 @@ def test_mixing_naive_tzaware_raises(self, meth): meth(obj) def test_from_pandas_array(self): - arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9 + arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10 ** 9 - result = DatetimeArray._from_sequence(arr, freq='infer') + result = DatetimeArray._from_sequence(arr, freq="infer") - expected = pd.date_range('1970-01-01', periods=5, freq='H')._data + expected = pd.date_range("1970-01-01", periods=5, freq="H")._data tm.assert_datetime_array_equal(result, expected) def test_mismatched_timezone_raises(self): - arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz='US/Central')) - dtype = DatetimeTZDtype(tz='US/Eastern') - with pytest.raises(TypeError, match='Timezone of the array'): + arr = DatetimeArray( + np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"), + dtype=DatetimeTZDtype(tz="US/Central"), + ) + dtype = DatetimeTZDtype(tz="US/Eastern") + with pytest.raises(TypeError, match="Timezone of the array"): DatetimeArray(arr, dtype=dtype) def test_non_array_raises(self): - with pytest.raises(ValueError, match='list'): + with pytest.raises(ValueError, match="list"): DatetimeArray([1, 2, 3]) def test_other_type_raises(self): - with pytest.raises(ValueError, - match="The dtype of 'values' is incorrect.*bool"): - DatetimeArray(np.array([1, 2, 3], dtype='bool')) + with pytest.raises( + ValueError, match="The dtype of 'values' is incorrect.*bool" + ): + DatetimeArray(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): - DatetimeArray(np.array([1, 2, 3], dtype='i8'), dtype='category') + DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category") def test_freq_infer_raises(self): - with pytest.raises(ValueError, match='Frequency inference'): - DatetimeArray(np.array([1, 2, 3], dtype='i8'), freq="infer") + with pytest.raises(ValueError, match="Frequency inference"): + DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer") def test_copy(self): - data = np.array([1, 2, 3], dtype='M8[ns]') + data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False) assert arr._data is data @@ -102,10 +113,10 @@ class TestDatetimeArrayComparisons: def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): # arbitrary tz-naive DatetimeIndex - opname = all_compare_operators.strip('_') + opname = all_compare_operators.strip("_") op = getattr(operator, opname) - dti = pd.date_range('2016-01-1', freq='MS', periods=9, tz=None) + dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None) arr = DatetimeArray(dti) assert arr.freq == dti.freq assert arr.tz == dti.tz @@ -113,7 +124,7 @@ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): right = dti expected = np.ones(len(arr), dtype=bool) - if opname in ['ne', 'gt', 'lt']: + if opname in ["ne", "gt", "lt"]: # for these the comparisons should be all-False expected = ~expected @@ -131,50 +142,45 @@ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): class TestDatetimeArray: def test_astype_to_same(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr - @pytest.mark.parametrize("dtype", [ - int, np.int32, np.int64, 'uint32', 'uint64', - ]) + 
@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = DatetimeArray._from_sequence([pd.Timestamp('2000'), - pd.Timestamp('2001')]) + arr = DatetimeArray._from_sequence([pd.Timestamp("2000"), pd.Timestamp("2001")]) result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype tm.assert_numpy_array_equal(result, expected) def test_tz_setter_raises(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - with pytest.raises(AttributeError, match='tz_localize'): - arr.tz = 'UTC' + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(AttributeError, match="tz_localize"): + arr.tz = "UTC" def test_setitem_different_tz_raises(self): - data = np.array([1, 2, 3], dtype='M8[ns]') - arr = DatetimeArray(data, copy=False, - dtype=DatetimeTZDtype(tz="US/Central")) + data = np.array([1, 2, 3], dtype="M8[ns]") + arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) with pytest.raises(ValueError, match="None"): - arr[0] = pd.Timestamp('2000') + arr[0] = pd.Timestamp("2000") with pytest.raises(ValueError, match="US/Central"): - arr[0] = pd.Timestamp('2000', tz="US/Eastern") + arr[0] = pd.Timestamp("2000", tz="US/Eastern") def test_setitem_clears_freq(self): - a = DatetimeArray(pd.date_range('2000', periods=2, freq='D', - tz='US/Central')) + a = DatetimeArray(pd.date_range("2000", periods=2, freq="D", tz="US/Central")) a[0] = pd.Timestamp("2000", tz="US/Central") assert a.freq is None def test_repeat_preserves_tz(self): - dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central') + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti) repeated = arr.repeat([1, 1]) @@ -184,7 +190,7 @@ def test_repeat_preserves_tz(self): tm.assert_equal(repeated, expected) def test_value_counts_preserves_tz(self): - dti = pd.date_range('2000', periods=2, freq='D', tz='US/Central') + dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central") arr = DatetimeArray(dti).repeat([4, 3]) result = arr.value_counts() @@ -194,20 +200,18 @@ def test_value_counts_preserves_tz(self): arr[-2] = pd.NaT result = arr.value_counts() - expected = pd.Series([1, 4, 2], - index=[pd.NaT, dti[0], dti[1]]) + expected = pd.Series([1, 4, 2], index=[pd.NaT, dti[0], dti[1]]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('method', ['pad', 'backfill']) + @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): - dti = pd.date_range('2000-01-01', periods=5, freq='D', tz='US/Central') + dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") arr = DatetimeArray(dti, copy=True) arr[2] = pd.NaT - fill_val = dti[1] if method == 'pad' else dti[3] + fill_val = dti[1] if method == "pad" else dti[3] expected = DatetimeArray._from_sequence( - [dti[0], dti[1], fill_val, dti[3], dti[4]], - freq=None, tz='US/Central' + [dti[0], dti[1], fill_val, dti[3], dti[4]], freq=None, tz="US/Central" ) result = arr.fillna(method=method) @@ -215,75 +219,82 @@ def test_fillna_preserves_tz(self, method): # assert that arr and dti were not modified in-place assert arr[2] is pd.NaT - assert dti[2] == pd.Timestamp('2000-01-03', tz='US/Central') + assert dti[2] == 
pd.Timestamp("2000-01-03", tz="US/Central") def test_array_interface_tz(self): tz = "US/Central" - data = DatetimeArray(pd.date_range('2017', periods=2, tz=tz)) + data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) result = np.asarray(data) - expected = np.array([pd.Timestamp('2017-01-01T00:00:00', tz=tz), - pd.Timestamp('2017-01-02T00:00:00', tz=tz)], - dtype=object) + expected = np.array( + [ + pd.Timestamp("2017-01-01T00:00:00", tz=tz), + pd.Timestamp("2017-01-02T00:00:00", tz=tz), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) result = np.asarray(data, dtype=object) tm.assert_numpy_array_equal(result, expected) - result = np.asarray(data, dtype='M8[ns]') + result = np.asarray(data, dtype="M8[ns]") - expected = np.array(['2017-01-01T06:00:00', - '2017-01-02T06:00:00'], dtype="M8[ns]") + expected = np.array( + ["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]" + ) tm.assert_numpy_array_equal(result, expected) def test_array_interface(self): - data = DatetimeArray(pd.date_range('2017', periods=2)) - expected = np.array(['2017-01-01T00:00:00', '2017-01-02T00:00:00'], - dtype='datetime64[ns]') + data = DatetimeArray(pd.date_range("2017", periods=2)) + expected = np.array( + ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]" + ) result = np.asarray(data) tm.assert_numpy_array_equal(result, expected) result = np.asarray(data, dtype=object) - expected = np.array([pd.Timestamp('2017-01-01T00:00:00'), - pd.Timestamp('2017-01-02T00:00:00')], - dtype=object) + expected = np.array( + [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) class TestSequenceToDT64NS: - def test_tz_dtype_mismatch_raises(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - with pytest.raises(TypeError, match='data is already tz-aware'): + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + with pytest.raises(TypeError, match="data is already tz-aware"): sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC")) def test_tz_dtype_matches(self): - arr = DatetimeArray._from_sequence(['2000'], tz='US/Central') - result, _, _ = sequence_to_dt64ns( - arr, dtype=DatetimeTZDtype(tz="US/Central")) + arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central")) tm.assert_numpy_array_equal(arr._data, result) class TestReductions: - @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_min_max(self, tz): - arr = DatetimeArray._from_sequence([ - '2000-01-03', - '2000-01-03', - 'NaT', - '2000-01-02', - '2000-01-05', - '2000-01-04', - ], tz=tz) + arr = DatetimeArray._from_sequence( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + tz=tz, + ) result = arr.min() - expected = pd.Timestamp('2000-01-02', tz=tz) + expected = pd.Timestamp("2000-01-02", tz=tz) assert result == expected result = arr.max() - expected = pd.Timestamp('2000-01-05', tz=tz) + expected = pd.Timestamp("2000-01-05", tz=tz) assert result == expected result = arr.min(skipna=False) @@ -293,7 +304,7 @@ def test_min_max(self, tz): assert result is pd.NaT @pytest.mark.parametrize("tz", [None, "US/Central"]) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna, tz): arr = DatetimeArray._from_sequence([], tz=tz) result = arr.min(skipna=skipna) diff --git 
a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index fb62a90a6007e..c01b52456ff87 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -7,22 +7,35 @@ from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar from pandas.core.arrays import IntegerArray, integer_array from pandas.core.arrays.integer import ( - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) from pandas.tests.extension.base import BaseOpsUtil import pandas.util.testing as tm def make_data(): - return (list(range(8)) + - [np.nan] + - list(range(10, 98)) + - [np.nan] + - [99, 100]) - - -@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) + return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) def dtype(request): return request.param() @@ -37,12 +50,12 @@ def data_missing(dtype): return integer_array([np.nan, 1], dtype=dtype) -@pytest.fixture(params=['data', 'data_missing']) +@pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture giving 'data' and 'data_missing'""" - if request.param == 'data': + if request.param == "data": return data - elif request.param == 'data_missing': + elif request.param == "data_missing": return data_missing @@ -50,33 +63,32 @@ def test_dtypes(dtype): # smoke tests on auto dtype construction if dtype.is_signed_integer: - assert np.dtype(dtype.type).kind == 'i' + assert np.dtype(dtype.type).kind == "i" else: - assert np.dtype(dtype.type).kind == 'u' + assert np.dtype(dtype.type).kind == "u" assert dtype.name is not None -@pytest.mark.parametrize('dtype, expected', [ - (Int8Dtype(), 'Int8Dtype()'), - (Int16Dtype(), 'Int16Dtype()'), - (Int32Dtype(), 'Int32Dtype()'), - (Int64Dtype(), 'Int64Dtype()'), - (UInt8Dtype(), 'UInt8Dtype()'), - (UInt16Dtype(), 'UInt16Dtype()'), - (UInt32Dtype(), 'UInt32Dtype()'), - (UInt64Dtype(), 'UInt64Dtype()'), -]) +@pytest.mark.parametrize( + "dtype, expected", + [ + (Int8Dtype(), "Int8Dtype()"), + (Int16Dtype(), "Int16Dtype()"), + (Int32Dtype(), "Int32Dtype()"), + (Int64Dtype(), "Int64Dtype()"), + (UInt8Dtype(), "UInt8Dtype()"), + (UInt16Dtype(), "UInt16Dtype()"), + (UInt32Dtype(), "UInt32Dtype()"), + (UInt64Dtype(), "UInt64Dtype()"), + ], +) def test_repr_dtype(dtype, expected): assert repr(dtype) == expected def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = ( - '\n' - '[1, NaN, 3]\n' - 'Length: 3, dtype: Int64' - ) + expected = "\n" "[1, NaN, 3]\n" "Length: 3, dtype: Int64" assert result == expected @@ -94,14 +106,13 @@ def test_repr_array_long(): class TestConstructors: - def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype('float'), dtype=str(dtype)) + result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) tm.assert_series_equal(result, expected) # from int / list @@ -117,7 +128,6 @@ def test_from_dtype_from_float(self, data): class TestArithmeticOps(BaseOpsUtil): - def _check_divmod_op(self, s, op, other, exc=None): 
super()._check_divmod_op(s, op, other, None) @@ -137,23 +147,25 @@ def _check_op(self, s, op_name, other, exc=None): # other array is an Integer if isinstance(other, IntegerArray): - omask = getattr(other, 'mask', None) - mask = getattr(other, 'data', other) + omask = getattr(other, "mask", None) + mask = getattr(other, "data", other) if omask is not None: mask |= omask # 1 ** na is na, so need to unmask those - if op_name == '__pow__': + if op_name == "__pow__": mask = np.where(s == 1, False, mask) - elif op_name == '__rpow__': + elif op_name == "__rpow__": mask = np.where(other == 1, False, mask) # float result type or float op - if ((is_float_dtype(other) or is_float(other) or - op_name in ['__rtruediv__', '__truediv__', - '__rdiv__', '__div__'])): - rs = s.astype('float') + if ( + is_float_dtype(other) + or is_float(other) + or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] + ): + rs = s.astype("float") expected = op(rs, other) self._check_op_float(result, expected, mask, s, op_name, other) @@ -183,15 +195,14 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # mod/rmod turn floating 0 into NaN while # integer works as expected (no nan) - if op_name in ['__mod__', '__rmod__']: + if op_name in ["__mod__", "__rmod__"]: if is_scalar(other): if other == 0: expected[s.values == 0] = 0 else: expected = expected.fillna(0) else: - expected[(s.values == 0) & - ((expected == 0) | expected.isna())] = 0 + expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 try: expected[(expected == np.inf) | (expected == -np.inf)] = fill_value original = expected @@ -213,13 +224,13 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # we need to fill with 0's to emulate what an astype('int') does # (truncation) for certain ops - if op_name in ['__rtruediv__', '__rdiv__']: + if op_name in ["__rtruediv__", "__rdiv__"]: mask |= original.isna() - original = original.fillna(0).astype('int') + original = original.fillna(0).astype("int") - original = original.astype('float') + original = original.astype("float") original[mask] = np.nan - tm.assert_series_equal(original, expected.astype('float')) + tm.assert_series_equal(original, expected.astype("float")) # assert our expected result tm.assert_series_equal(result, expected) @@ -246,7 +257,7 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({'A': data}) + df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): @@ -265,15 +276,15 @@ def test_arith_coerce_scalar(self, data, all_arithmetic_operators): other = 0.01 self._check_op(s, op, other) - @pytest.mark.parametrize("other", [1., 1.0, np.array(1.), np.array([1.])]) + @pytest.mark.parametrize("other", [1.0, 1.0, np.array(1.0), np.array([1.0])]) def test_arithmetic_conversion(self, all_arithmetic_operators, other): # if we have a float operand we should have a float result # if that is equal to an integer op = self.get_op_from_name(all_arithmetic_operators) - s = pd.Series([1, 2, 3], dtype='Int64') + s = pd.Series([1, 2, 3], dtype="Int64") result = op(s, other) - assert result.dtype is np.dtype('float') + assert result.dtype is np.dtype("float") @pytest.mark.parametrize("other", [0, 0.5]) def test_arith_zero_dim_ndarray(self, other): @@ -292,23 +303,23 @@ def test_error(self, data, all_arithmetic_operators): # invalid scalars with pytest.raises(TypeError): - 
ops('foo') + ops("foo") with pytest.raises(TypeError): - ops(pd.Timestamp('20180101')) + ops(pd.Timestamp("20180101")) # invalid array-likes with pytest.raises(TypeError): - ops(pd.Series('foo', index=s.index)) + ops(pd.Series("foo", index=s.index)) - if op != '__rpow__': + if op != "__rpow__": # TODO(extension) # rpow with a datetimelike coerces the integer array incorrectly with pytest.raises(TypeError): - ops(pd.Series(pd.date_range('20180101', periods=len(s)))) + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) # 2d with pytest.raises(NotImplementedError): - opa(pd.DataFrame({'A': s})) + opa(pd.DataFrame({"A": s})) with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) @@ -329,7 +340,6 @@ def test_rpow_one_to_na(self): class TestComparisonOps(BaseOpsUtil): - def _compare_other(self, data, op_name, other): op = self.get_op_from_name(op_name) @@ -338,7 +348,7 @@ def _compare_other(self, data, op_name, other): expected = pd.Series(op(data._data, other)) # fill the nan locations - expected[data._mask] = op_name == '__ne__' + expected[data._mask] = op_name == "__ne__" tm.assert_series_equal(result, expected) @@ -350,7 +360,7 @@ def _compare_other(self, data, op_name, other): expected = op(expected, other) # fill the nan locations - expected[data._mask] = op_name == '__ne__' + expected[data._mask] = op_name == "__ne__" tm.assert_series_equal(result, expected) @@ -367,7 +377,7 @@ def test_compare_array(self, data, all_compare_operators): class TestCasting: pass - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_construct_index(self, all_data, dropna): # ensure that we do not coerce to Float64Index, rather # keep as Index @@ -383,7 +393,7 @@ def test_construct_index(self, all_data, dropna): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_astype_index(self, all_data, dropna): # as an int/uint index to Index @@ -423,8 +433,7 @@ def test_astype(self, all_data): # coerce to same numpy_dtype - ints s = pd.Series(ints) result = s.astype(all_data.dtype.numpy_dtype) - expected = pd.Series(ints._data.astype( - all_data.dtype.numpy_dtype)) + expected = pd.Series(ints._data.astype(all_data.dtype.numpy_dtype)) tm.assert_series_equal(result, expected) # coerce to same type - mixed @@ -446,19 +455,18 @@ def test_astype(self, all_data): # coerce to object s = pd.Series(mixed) - result = s.astype('object') + result = s.astype("object") expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', [Int8Dtype(), 'Int8', - UInt32Dtype(), 'UInt32']) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): - s = pd.Series([1, 2, 3], dtype='Int64') + s = pd.Series([1, 2, 3], dtype="Int64") result = s.astype(dtype) expected = pd.Series([1, 2, 3], dtype=dtype) tm.assert_series_equal(result, expected) - s = pd.Series([1, 2, 3, None], dtype='Int64') + s = pd.Series([1, 2, 3, None], dtype="Int64") result = s.astype(dtype) expected = pd.Series([1, 2, 3, None], dtype=dtype) tm.assert_series_equal(result, expected) @@ -483,24 +491,24 @@ def test_construct_cast_invalid(self, dtype): def test_frame_repr(data_missing): - df = pd.DataFrame({'A': data_missing}) + df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = ' A\n0 NaN\n1 1' + expected = " A\n0 NaN\n1 1" assert result == 
expected def test_conversions(data_missing): # astype to object series - df = pd.DataFrame({'A': data_missing}) - result = df['A'].astype('object') - expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A') + df = pd.DataFrame({"A": data_missing}) + result = df["A"].astype("object") + expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A") tm.assert_series_equal(result, expected) # convert to object ndarray # we assert that we are exactly equal # including type conversions of scalars - result = df['A'].astype('object').values + result = df["A"].astype("object").values expected = np.array([np.nan, 1], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -516,11 +524,11 @@ def test_conversions(data_missing): def test_integer_array_constructor(): - values = np.array([1, 2, 3, 4], dtype='int64') - mask = np.array([False, False, False, True], dtype='bool') + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") result = IntegerArray(values, mask) - expected = integer_array([1, 2, 3, np.nan], dtype='int64') + expected = integer_array([1, 2, 3, np.nan], dtype="int64") tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError): @@ -536,12 +544,15 @@ def test_integer_array_constructor(): IntegerArray(values) -@pytest.mark.parametrize('a, b', [ - ([1, None], [1, np.nan]), - ([None], [np.nan]), - ([None, np.nan], [np.nan, np.nan]), - ([np.nan, np.nan], [np.nan, np.nan]), -]) +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, np.nan]), + ([None], [np.nan]), + ([None, np.nan], [np.nan, np.nan]), + ([np.nan, np.nan], [np.nan, np.nan]), + ], +) def test_integer_array_constructor_none_is_nan(a, b): result = integer_array(a) expected = integer_array(b) @@ -549,8 +560,8 @@ def test_integer_array_constructor_none_is_nan(a, b): def test_integer_array_constructor_copy(): - values = np.array([1, 2, 3, 4], dtype='int64') - mask = np.array([False, False, False, True], dtype='bool') + values = np.array([1, 2, 3, 4], dtype="int64") + mask = np.array([False, False, False, True], dtype="bool") result = IntegerArray(values, mask) assert result._data is values @@ -562,17 +573,19 @@ def test_integer_array_constructor_copy(): @pytest.mark.parametrize( - 'values', + "values", [ - ['foo', 'bar'], - ['1', '2'], - 'foo', + ["foo", "bar"], + ["1", "2"], + "foo", 1, 1.0, - pd.date_range('20130101', periods=2), - np.array(['foo']), + pd.date_range("20130101", periods=2), + np.array(["foo"]), [[1, 2], [3, 4]], - [np.nan, {'a': 1}]]) + [np.nan, {"a": 1}], + ], +) def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays with pytest.raises(TypeError): @@ -581,9 +594,9 @@ def test_to_integer_array_error(values): def test_to_integer_array_inferred_dtype(): # if values has dtype -> respect it - result = integer_array(np.array([1, 2], dtype='int8')) + result = integer_array(np.array([1, 2], dtype="int8")) assert result.dtype == Int8Dtype() - result = integer_array(np.array([1, 2], dtype='int32')) + result = integer_array(np.array([1, 2], dtype="int32")) assert result.dtype == Int32Dtype() # if values have no dtype -> always int64 @@ -592,34 +605,36 @@ def test_to_integer_array_inferred_dtype(): def test_to_integer_array_dtype_keyword(): - result = integer_array([1, 2], dtype='int8') + result = integer_array([1, 2], dtype="int8") assert result.dtype == Int8Dtype() # if values has dtype -> override it - result = integer_array(np.array([1, 2], dtype='int8'), dtype='int32') + result = 
integer_array(np.array([1, 2], dtype="int8"), dtype="int32") assert result.dtype == Int32Dtype() def test_to_integer_array_float(): - result = integer_array([1., 2.]) + result = integer_array([1.0, 2.0]) expected = integer_array([1, 2]) tm.assert_extension_array_equal(result, expected) with pytest.raises(TypeError, match="cannot safely cast non-equivalent"): - integer_array([1.5, 2.]) + integer_array([1.5, 2.0]) # for float dtypes, the itemsize is not preserved - result = integer_array(np.array([1., 2.], dtype='float32')) + result = integer_array(np.array([1.0, 2.0], dtype="float32")) assert result.dtype == Int64Dtype() @pytest.mark.parametrize( - 'bool_values, int_values, target_dtype, expected_dtype', - [([False, True], [0, 1], Int64Dtype(), Int64Dtype()), - ([False, True], [0, 1], 'Int64', Int64Dtype()), - ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype())]) -def test_to_integer_array_bool(bool_values, int_values, target_dtype, - expected_dtype): + "bool_values, int_values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Int64Dtype(), Int64Dtype()), + ([False, True], [0, 1], "Int64", Int64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Int64Dtype(), Int64Dtype()), + ], +) +def test_to_integer_array_bool(bool_values, int_values, target_dtype, expected_dtype): result = integer_array(bool_values, dtype=target_dtype) assert result.dtype == expected_dtype expected = integer_array(int_values, dtype=target_dtype) @@ -627,11 +642,13 @@ def test_to_integer_array_bool(bool_values, int_values, target_dtype, @pytest.mark.parametrize( - 'values, to_dtype, result_dtype', + "values, to_dtype, result_dtype", [ - (np.array([1], dtype='int64'), None, Int64Dtype), + (np.array([1], dtype="int64"), None, Int64Dtype), (np.array([1, np.nan]), None, Int64Dtype), - (np.array([1, np.nan]), 'int8', Int8Dtype)]) + (np.array([1, np.nan]), "int8", Int8Dtype), + ], +) def test_to_integer_array(values, to_dtype, result_dtype): # convert existing arrays to IntegerArrays result = integer_array(values, dtype=to_dtype) @@ -642,12 +659,16 @@ def test_to_integer_array(values, to_dtype, result_dtype): def test_cross_type_arithmetic(): - df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': pd.Series([1, np.nan, 3], dtype='UInt8'), - 'C': [1, 2, 3]}) + df = pd.DataFrame( + { + "A": pd.Series([1, 2, np.nan], dtype="Int64"), + "B": pd.Series([1, np.nan, 3], dtype="UInt8"), + "C": [1, 2, 3], + } + ) result = df.A + df.C - expected = pd.Series([2, 4, np.nan], dtype='Int64') + expected = pd.Series([2, 4, np.nan], dtype="Int64") tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 @@ -655,20 +676,22 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = df.A + df.B - expected = pd.Series([2, np.nan, np.nan], dtype='Int64') + expected = pd.Series([2, np.nan, np.nan], dtype="Int64") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod']) +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) def test_preserve_dtypes(op): # TODO(#22346): preserve Int64 dtype # for ops that enable (mean would actually work here # but generally it is a float return value) - df = pd.DataFrame({ - "A": ['a', 'b', 'b'], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype='Int64'), - }) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) # op result = getattr(df.C, op)() @@ -677,22 +700,24 @@ def 
test_preserve_dtypes(op): # groupby result = getattr(df.groupby("A"), op)() - expected = pd.DataFrame({ - "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64") - }, index=pd.Index(['a', 'b'], name='A')) + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op', ['mean']) +@pytest.mark.parametrize("op", ["mean"]) def test_reduce_to_float(op): # some reduce ops always return float, even if the result # is a rounded number - df = pd.DataFrame({ - "A": ['a', 'b', 'b'], - "B": [1, None, 3], - "C": integer_array([1, None, 3], dtype='Int64'), - }) + df = pd.DataFrame( + { + "A": ["a", "b", "b"], + "B": [1, None, 3], + "C": integer_array([1, None, 3], dtype="Int64"), + } + ) # op result = getattr(df.C, op)() @@ -701,10 +726,10 @@ def test_reduce_to_float(op): # groupby result = getattr(df.groupby("A"), op)() - expected = pd.DataFrame({ - "B": np.array([1.0, 3.0]), - "C": integer_array([1, 3], dtype="Int64") - }, index=pd.Index(['a', 'b'], name='A')) + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": integer_array([1, 3], dtype="Int64")}, + index=pd.Index(["a", "b"], name="A"), + ) tm.assert_frame_equal(result, expected) @@ -714,11 +739,10 @@ def test_astype_nansafe(): msg = "cannot convert float NaN to integer" with pytest.raises(ValueError, match=msg): - arr.astype('uint32') + arr.astype("uint32") -@pytest.mark.parametrize( - 'ufunc', [np.abs, np.sign]) +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) def test_ufuncs_single_int(ufunc): a = integer_array([1, 2, -3, np.nan]) result = ufunc(a) @@ -731,24 +755,22 @@ def test_ufuncs_single_int(ufunc): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - 'ufunc', [np.log, np.exp, np.sin, np.cos, np.sqrt]) +@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) def test_ufuncs_single_float(ufunc): a = integer_array([1, 2, -3, np.nan]) - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = ufunc(a) expected = ufunc(a.astype(float)) tm.assert_numpy_array_equal(result, expected) s = pd.Series(a) - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): result = ufunc(s) expected = ufunc(s.astype(float)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - 'ufunc', [np.add, np.subtract]) +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) def test_ufuncs_binary_int(ufunc): # two IntegerArrays a = integer_array([1, 2, -3, np.nan]) @@ -776,9 +798,7 @@ def test_ufuncs_binary_int(ufunc): tm.assert_extension_array_equal(result, expected) -@pytest.mark.parametrize('values', [ - [0, 1], [0, None] -]) +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) def test_ufunc_reduce_raises(values): a = integer_array(values) with pytest.raises(NotImplementedError): diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 5e4f6e376c1d3..c4c1696ede6e6 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -11,15 +11,17 @@ import pandas.util.testing as tm -@pytest.fixture(params=[ - np.array(['a', 'b'], dtype=object), - np.array([0, 1], dtype=float), - np.array([0, 1], dtype=int), - np.array([0, 1 + 2j], dtype=complex), - np.array([True, False], dtype=bool), - np.array([0, 1], dtype='datetime64[ns]'), - np.array([0, 1], dtype='timedelta64[ns]'), -]) +@pytest.fixture( + params=[ + np.array(["a", "b"], 
dtype=object), + np.array([0, 1], dtype=float), + np.array([0, 1], dtype=int), + np.array([0, 1 + 2j], dtype=complex), + np.array([True, False], dtype=bool), + np.array([0, 1], dtype="datetime64[ns]"), + np.array([0, 1], dtype="timedelta64[ns]"), + ] +) def any_numpy_array(request): """ Parametrized fixture for NumPy arrays with different dtypes. @@ -32,35 +34,42 @@ def any_numpy_array(request): # ---------------------------------------------------------------------------- # PandasDtype -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', True), - ('uint', True), - ('float', True), - ('complex', True), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False), -]) + +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", True), + ("uint", True), + ("float", True), + ("complex", True), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) def test_is_numeric(dtype, expected): dtype = PandasDtype(dtype) assert dtype._is_numeric is expected -@pytest.mark.parametrize('dtype, expected', [ - ('bool', True), - ('int', False), - ('uint', False), - ('float', False), - ('complex', False), - ('str', False), - ('bytes', False), - ('datetime64[ns]', False), - ('object', False), - ('void', False) -]) +@pytest.mark.parametrize( + "dtype, expected", + [ + ("bool", True), + ("int", False), + ("uint", False), + ("float", False), + ("complex", False), + ("str", False), + ("bytes", False), + ("datetime64[ns]", False), + ("object", False), + ("void", False), + ], +) def test_is_boolean(dtype, expected): dtype = PandasDtype(dtype) assert dtype._is_boolean is expected @@ -80,8 +89,9 @@ def test_constructor_from_string(): # ---------------------------------------------------------------------------- # Construction + def test_constructor_no_coercion(): - with pytest.raises(ValueError, match='NumPy array'): + with pytest.raises(ValueError, match="NumPy array"): PandasArray([1, 2, 3]) @@ -100,9 +110,9 @@ def test_series_constructor_with_astype(): def test_from_sequence_dtype(): - arr = np.array([1, 2, 3], dtype='int64') - result = PandasArray._from_sequence(arr, dtype='uint64') - expected = PandasArray(np.array([1, 2, 3], dtype='uint64')) + arr = np.array([1, 2, 3], dtype="int64") + result = PandasArray._from_sequence(arr, dtype="uint64") + expected = PandasArray(np.array([1, 2, 3], dtype="uint64")) tm.assert_extension_array_equal(result, expected) @@ -122,6 +132,7 @@ def test_constructor_with_data(any_numpy_array): # ---------------------------------------------------------------------------- # Conversion + def test_to_numpy(): arr = PandasArray(np.array([1, 2, 3])) result = arr.to_numpy() @@ -130,14 +141,15 @@ def test_to_numpy(): result = arr.to_numpy(copy=True) assert result is not arr._ndarray - result = arr.to_numpy(dtype='f8') - expected = np.array([1, 2, 3], dtype='f8') + result = arr.to_numpy(dtype="f8") + expected = np.array([1, 2, 3], dtype="f8") tm.assert_numpy_array_equal(result, expected) # ---------------------------------------------------------------------------- # Setitem + def test_setitem_series(): ser = pd.Series([1, 2, 3]) ser.array[0] = 10 @@ -158,8 +170,9 @@ def test_setitem(any_numpy_array): # ---------------------------------------------------------------------------- # Reductions + def test_bad_reduce_raises(): - arr = np.array([1, 2, 3], dtype='int64') + arr = np.array([1, 2, 3], dtype="int64") arr = PandasArray(arr) msg = "cannot 
perform not_a_method with type int" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index c27200e3273ee..fab59d312fb9d 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -20,21 +20,24 @@ def test_registered(): expected = PeriodDtype("D") assert result == expected + # ---------------------------------------------------------------------------- # period_array -@pytest.mark.parametrize("data, freq, expected", [ - ([pd.Period("2017", "D")], None, [17167]), - ([pd.Period("2017", "D")], "D", [17167]), - ([2017], "D", [17167]), - (["2017"], "D", [17167]), - ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]), - ([pd.Period("2017", "D"), None], None, [17167, iNaT]), - (pd.Series(pd.date_range("2017", periods=3)), None, - [17167, 17168, 17169]), - (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), -]) +@pytest.mark.parametrize( + "data, freq, expected", + [ + ([pd.Period("2017", "D")], None, [17167]), + ([pd.Period("2017", "D")], "D", [17167]), + ([2017], "D", [17167]), + (["2017"], "D", [17167]), + ([pd.Period("2017", "D")], pd.tseries.offsets.Day(), [17167]), + ([pd.Period("2017", "D"), None], None, [17167, iNaT]), + (pd.Series(pd.date_range("2017", periods=3)), None, [17167, 17168, 17169]), + (pd.date_range("2017", periods=3), None, [17167, 17168, 17169]), + ], +) def test_period_array_ok(data, freq, expected): result = period_array(data, freq=freq).asi8 expected = np.asarray(expected, dtype=np.int64) @@ -43,8 +46,8 @@ def test_period_array_ok(data, freq, expected): def test_period_array_readonly_object(): # https://github.com/pandas-dev/pandas/issues/25403 - pa = period_array([pd.Period('2019-01-01')]) - arr = np.asarray(pa, dtype='object') + pa = period_array([pd.Period("2019-01-01")]) + arr = np.asarray(pa, dtype="object") arr.setflags(write=False) result = period_array(arr) @@ -61,20 +64,21 @@ def test_from_datetime64_freq_changes(): # https://github.com/pandas-dev/pandas/issues/23438 arr = pd.date_range("2017", periods=3, freq="D") result = PeriodArray._from_datetime64(arr, freq="M") - expected = period_array(['2017-01-01', '2017-01-01', '2017-01-01'], - freq="M") + expected = period_array(["2017-01-01", "2017-01-01", "2017-01-01"], freq="M") tm.assert_period_array_equal(result, expected) -@pytest.mark.parametrize("data, freq, msg", [ - ([pd.Period('2017', 'D'), - pd.Period('2017', 'A')], - None, - "Input has different freq"), - ([pd.Period('2017', 'D')], - "A", - "Input has different freq"), -]) +@pytest.mark.parametrize( + "data, freq, msg", + [ + ( + [pd.Period("2017", "D"), pd.Period("2017", "A")], + None, + "Input has different freq", + ), + ([pd.Period("2017", "D")], "A", "Input has different freq"), + ], +) def test_period_array_raises(data, freq, msg): with pytest.raises(IncompatibleFrequency, match=msg): period_array(data, freq) @@ -82,48 +86,45 @@ def test_period_array_raises(data, freq, msg): def test_period_array_non_period_series_raies(): ser = pd.Series([1, 2, 3]) - with pytest.raises(TypeError, match='dtype'): - PeriodArray(ser, freq='D') + with pytest.raises(TypeError, match="dtype"): + PeriodArray(ser, freq="D") def test_period_array_freq_mismatch(): - arr = period_array(['2000', '2001'], freq='D') - with pytest.raises(IncompatibleFrequency, match='freq'): - PeriodArray(arr, freq='M') + arr = period_array(["2000", "2001"], freq="D") + with pytest.raises(IncompatibleFrequency, match="freq"): + PeriodArray(arr, freq="M") - with 
pytest.raises(IncompatibleFrequency, match='freq'): + with pytest.raises(IncompatibleFrequency, match="freq"): PeriodArray(arr, freq=pd.tseries.offsets.MonthEnd()) def test_asi8(): - result = period_array(['2000', '2001', None], freq='D').asi8 + result = period_array(["2000", "2001", None], freq="D").asi8 expected = np.array([10957, 11323, iNaT]) tm.assert_numpy_array_equal(result, expected) def test_take_raises(): - arr = period_array(['2000', '2001'], freq='D') - with pytest.raises(IncompatibleFrequency, match='freq'): - arr.take([0, -1], allow_fill=True, - fill_value=pd.Period('2000', freq='W')) + arr = period_array(["2000", "2001"], freq="D") + with pytest.raises(IncompatibleFrequency, match="freq"): + arr.take([0, -1], allow_fill=True, fill_value=pd.Period("2000", freq="W")) - with pytest.raises(ValueError, match='foo'): - arr.take([0, -1], allow_fill=True, fill_value='foo') + with pytest.raises(ValueError, match="foo"): + arr.take([0, -1], allow_fill=True, fill_value="foo") -@pytest.mark.parametrize('dtype', [ - int, np.int32, np.int64, 'uint32', 'uint64', -]) +@pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype(dtype): # We choose to ignore the sign and size of integers for # Period/Datetime/Timedelta astype - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype @@ -131,7 +132,7 @@ def test_astype(dtype): def test_astype_copies(): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(np.int64, copy=False) # Add the `.base`, since we now use `.asi8` which returns a view. # We could maybe override it in PeriodArray to return ._data directly. @@ -139,42 +140,40 @@ def test_astype_copies(): result = arr.astype(np.int64, copy=True) assert result is not arr._data - tm.assert_numpy_array_equal(result, arr._data.view('i8')) + tm.assert_numpy_array_equal(result, arr._data.view("i8")) def test_astype_categorical(): - arr = period_array(['2000', '2001', '2001', None], freq='D') - result = arr.astype('category') - categories = pd.PeriodIndex(['2000', '2001'], freq='D') + arr = period_array(["2000", "2001", "2001", None], freq="D") + result = arr.astype("category") + categories = pd.PeriodIndex(["2000", "2001"], freq="D") expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories) tm.assert_categorical_equal(result, expected) def test_astype_period(): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") result = arr.astype(PeriodDtype("M")) - expected = period_array(['2000', '2001', None], freq='M') + expected = period_array(["2000", "2001", None], freq="M") tm.assert_period_array_equal(result, expected) -@pytest.mark.parametrize('other', [ - 'datetime64[ns]', 'timedelta64[ns]', -]) +@pytest.mark.parametrize("other", ["datetime64[ns]", "timedelta64[ns]"]) def test_astype_datetime(other): - arr = period_array(['2000', '2001', None], freq='D') + arr = period_array(["2000", "2001", None], freq="D") # slice off the [ns] so that the regex matches. 
with pytest.raises(TypeError, match=other[:-4]): arr.astype(other) def test_fillna_raises(): - arr = period_array(['2000', '2001', '2002'], freq='D') - with pytest.raises(ValueError, match='Length'): + arr = period_array(["2000", "2001", "2002"], freq="D") + with pytest.raises(ValueError, match="Length"): arr.fillna(arr[:2]) def test_fillna_copies(): - arr = period_array(['2000', '2001', '2002'], freq='D') + arr = period_array(["2000", "2001", "2002"], freq="D") result = arr.fillna(pd.Period("2000", "D")) assert result is not arr @@ -182,16 +181,21 @@ def test_fillna_copies(): # ---------------------------------------------------------------------------- # setitem -@pytest.mark.parametrize('key, value, expected', [ - ([0], pd.Period("2000", "D"), [10957, 1, 2]), - ([0], None, [iNaT, 1, 2]), - ([0], np.nan, [iNaT, 1, 2]), - ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3), - ([0, 1, 2], [pd.Period("2000", "D"), - pd.Period("2001", "D"), - pd.Period("2002", "D")], - [10957, 11323, 11688]), -]) + +@pytest.mark.parametrize( + "key, value, expected", + [ + ([0], pd.Period("2000", "D"), [10957, 1, 2]), + ([0], None, [iNaT, 1, 2]), + ([0], np.nan, [iNaT, 1, 2]), + ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3), + ( + [0, 1, 2], + [pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")], + [10957, 11323, 11688], + ), + ], +) def test_setitem(key, value, expected): arr = PeriodArray(np.arange(3), freq="D") expected = PeriodArray(expected, freq="D") @@ -204,7 +208,7 @@ def test_setitem_raises_incompatible_freq(): with pytest.raises(IncompatibleFrequency, match="freq"): arr[0] = pd.Period("2000", freq="A") - other = period_array(['2000', '2001'], freq='A') + other = period_array(["2000", "2001"], freq="A") with pytest.raises(IncompatibleFrequency, match="freq"): arr[[0, 1]] = other @@ -224,8 +228,9 @@ def test_setitem_raises_type(): # ---------------------------------------------------------------------------- # Ops + def test_sub_period(): - arr = period_array(['2000', '2001'], freq='D') + arr = period_array(["2000", "2001"], freq="D") other = pd.Period("2000", freq="M") with pytest.raises(IncompatibleFrequency, match="freq"): arr - other @@ -234,12 +239,13 @@ def test_sub_period(): # ---------------------------------------------------------------------------- # Methods -@pytest.mark.parametrize('other', [ - pd.Period('2000', freq='H'), - period_array(['2000', '2001', '2000'], freq='H') -]) + +@pytest.mark.parametrize( + "other", + [pd.Period("2000", freq="H"), period_array(["2000", "2001", "2000"], freq="H")], +) def test_where_different_freq_raises(other): - ser = pd.Series(period_array(['2000', '2001', '2002'], freq='D')) + ser = pd.Series(period_array(["2000", "2001", "2002"], freq="D")) cond = np.array([True, False, True]) with pytest.raises(IncompatibleFrequency, match="freq"): ser.where(cond, other) @@ -248,19 +254,18 @@ def test_where_different_freq_raises(other): # ---------------------------------------------------------------------------- # Printing + def test_repr_small(): - arr = period_array(['2000', '2001'], freq='D') + arr = period_array(["2000", "2001"], freq="D") result = str(arr) expected = ( - "\n" - "['2000-01-01', '2001-01-01']\n" - "Length: 2, dtype: period[D]" + "\n" "['2000-01-01', '2001-01-01']\n" "Length: 2, dtype: period[D]" ) assert result == expected def test_repr_large(): - arr = period_array(['2000', '2001'] * 500, freq='D') + arr = period_array(["2000", "2001"] * 500, freq="D") result = str(arr) expected = ( "\n" @@ -281,24 +286,27 @@ def 
test_repr_large(): # ---------------------------------------------------------------------------- # Reductions -class TestReductions: +class TestReductions: def test_min_max(self): - arr = period_array([ - '2000-01-03', - '2000-01-03', - 'NaT', - '2000-01-02', - '2000-01-05', - '2000-01-04', - ], freq='D') + arr = period_array( + [ + "2000-01-03", + "2000-01-03", + "NaT", + "2000-01-02", + "2000-01-05", + "2000-01-04", + ], + freq="D", + ) result = arr.min() - expected = pd.Period('2000-01-02', freq='D') + expected = pd.Period("2000-01-02", freq="D") assert result == expected result = arr.max() - expected = pd.Period('2000-01-05', freq='D') + expected = pd.Period("2000-01-05", freq="D") assert result == expected result = arr.min(skipna=False) @@ -307,9 +315,9 @@ def test_min_max(self): result = arr.max(skipna=False) assert result is pd.NaT - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna): - arr = period_array([], freq='D') + arr = period_array([], freq="D") result = arr.min(skipna=skipna) assert result is pd.NaT diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 87f32ef101fa9..5825f9f150eb8 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -9,7 +9,7 @@ class TestTimedeltaArrayConstructor: def test_only_1dim_accepted(self): # GH#25282 - arr = np.array([0, 1, 2, 3], dtype='m8[h]').astype('m8[ns]') + arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): # 2-dim @@ -21,37 +21,38 @@ def test_only_1dim_accepted(self): def test_freq_validation(self): # ensure that the public constructor cannot create an invalid instance - arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10**9 + arr = np.array([0, 0, 1], dtype=np.int64) * 3600 * 10 ** 9 - msg = ("Inferred frequency None from passed values does not " - "conform to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not " + "conform to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - TimedeltaArray(arr.view('timedelta64[ns]'), freq="D") + TimedeltaArray(arr.view("timedelta64[ns]"), freq="D") def test_non_array_raises(self): - with pytest.raises(ValueError, match='list'): + with pytest.raises(ValueError, match="list"): TimedeltaArray([1, 2, 3]) def test_other_type_raises(self): - with pytest.raises(ValueError, - match="dtype bool cannot be converted"): - TimedeltaArray(np.array([1, 2, 3], dtype='bool')) + with pytest.raises(ValueError, match="dtype bool cannot be converted"): + TimedeltaArray(np.array([1, 2, 3], dtype="bool")) def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? 
- with pytest.raises(ValueError, - match=r'category cannot be converted ' - r'to timedelta64\[ns\]'): - TimedeltaArray(np.array([1, 2, 3], dtype='i8'), dtype='category') + with pytest.raises( + ValueError, match=r"category cannot be converted " r"to timedelta64\[ns\]" + ): + TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") - with pytest.raises(ValueError, - match=r"dtype int64 cannot be converted " - r"to timedelta64\[ns\]"): - TimedeltaArray(np.array([1, 2, 3], dtype='i8'), - dtype=np.dtype("int64")) + with pytest.raises( + ValueError, + match=r"dtype int64 cannot be converted " r"to timedelta64\[ns\]", + ): + TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) def test_copy(self): - data = np.array([1, 2, 3], dtype='m8[ns]') + data = np.array([1, 2, 3], dtype="m8[ns]") arr = TimedeltaArray(data, copy=False) assert arr._data is data @@ -63,7 +64,7 @@ def test_copy(self): class TestTimedeltaArray: def test_np_sum(self): # GH#25282 - vals = np.arange(5, dtype=np.int64).view('m8[h]').astype('m8[ns]') + vals = np.arange(5, dtype=np.int64).view("m8[h]").astype("m8[ns]") arr = TimedeltaArray(vals) result = np.sum(arr) assert result == vals.sum() @@ -77,27 +78,27 @@ def test_from_sequence_dtype(self): TimedeltaArray._from_sequence([], dtype=object) def test_abs(self): - vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") arr = TimedeltaArray(vals) - evals = np.array([3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + evals = np.array([3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") expected = TimedeltaArray(evals) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) def test_neg(self): - vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') + vals = np.array([-3600 * 10 ** 9, "NaT", 7200 * 10 ** 9], dtype="m8[ns]") arr = TimedeltaArray(vals) - evals = np.array([3600 * 10**9, 'NaT', -7200 * 10**9], dtype='m8[ns]') + evals = np.array([3600 * 10 ** 9, "NaT", -7200 * 10 ** 9], dtype="m8[ns]") expected = TimedeltaArray(evals) result = -arr tm.assert_timedelta_array_equal(result, expected) def test_neg_freq(self): - tdi = pd.timedelta_range('2 Days', periods=4, freq='H') + tdi = pd.timedelta_range("2 Days", periods=4, freq="H") arr = TimedeltaArray(tdi, freq=tdi.freq) expected = TimedeltaArray(-tdi._data, freq=-tdi.freq) @@ -105,42 +106,36 @@ def test_neg_freq(self): result = -arr tm.assert_timedelta_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [ - int, np.int32, np.int64, 'uint32', 'uint64', - ]) + @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"]) def test_astype_int(self, dtype): - arr = TimedeltaArray._from_sequence([pd.Timedelta('1H'), - pd.Timedelta('2H')]) + arr = TimedeltaArray._from_sequence([pd.Timedelta("1H"), pd.Timedelta("2H")]) result = arr.astype(dtype) - if np.dtype(dtype).kind == 'u': - expected_dtype = np.dtype('uint64') + if np.dtype(dtype).kind == "u": + expected_dtype = np.dtype("uint64") else: - expected_dtype = np.dtype('int64') + expected_dtype = np.dtype("int64") expected = arr.astype(expected_dtype) assert result.dtype == expected_dtype tm.assert_numpy_array_equal(result, expected) def test_setitem_clears_freq(self): - a = TimedeltaArray(pd.timedelta_range('1H', periods=2, freq='H')) + a = TimedeltaArray(pd.timedelta_range("1H", periods=2, freq="H")) a[0] = pd.Timedelta("1H") assert a.freq is None class TestReductions: - def test_min_max(self): - arr = 
TimedeltaArray._from_sequence([ - '3H', '3H', 'NaT', '2H', '5H', '4H', - ]) + arr = TimedeltaArray._from_sequence(["3H", "3H", "NaT", "2H", "5H", "4H"]) result = arr.min() - expected = pd.Timedelta('2H') + expected = pd.Timedelta("2H") assert result == expected result = arr.max() - expected = pd.Timedelta('5H') + expected = pd.Timedelta("5H") assert result == expected result = arr.min(skipna=False) @@ -149,7 +144,7 @@ def test_min_max(self): result = arr.max(skipna=False) assert result is pd.NaT - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna): arr = TimedeltaArray._from_sequence([]) result = arr.min(skipna=skipna) diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index 3b01851bd39ca..b3fbd8c17d8bf 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -13,10 +13,12 @@ def test_compat(): # test we have compat with our version of nu from pandas.core.computation.check import _NUMEXPR_INSTALLED + try: import numexpr as ne + ver = ne.__version__ - if LooseVersion(ver) < LooseVersion(VERSIONS['numexpr']): + if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]): assert not _NUMEXPR_INSTALLED else: assert _NUMEXPR_INSTALLED @@ -24,22 +26,21 @@ def test_compat(): pytest.skip("not testing numexpr version compat") -@pytest.mark.parametrize('engine', _engines) -@pytest.mark.parametrize('parser', expr._parsers) +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", expr._parsers) def test_invalid_numexpr_version(engine, parser): def testit(): a, b = 1, 2 # noqa - res = pd.eval('a + b', engine=engine, parser=parser) + res = pd.eval("a + b", engine=engine, parser=parser) assert res == 3 - if engine == 'numexpr': + if engine == "numexpr": try: import numexpr as ne except ImportError: pytest.skip("no numexpr") else: - if (LooseVersion(ne.__version__) < - LooseVersion(VERSIONS['numexpr'])): + if LooseVersion(ne.__version__) < LooseVersion(VERSIONS["numexpr"]): with pytest.raises(ImportError): testit() else: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index ca78e2e40ec74..2fd7c8f04c8be 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -20,26 +20,40 @@ from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr from pandas.core.computation.expr import PandasExprVisitor, PythonExprVisitor -from pandas.core.computation.expressions import ( - _NUMEXPR_INSTALLED, _USE_NUMEXPR) +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR from pandas.core.computation.ops import ( - _arith_ops_syms, _binary_math_ops, _binary_ops_dict, - _special_case_arith_ops_syms, _unary_math_ops) + _arith_ops_syms, + _binary_math_ops, + _binary_ops_dict, + _special_case_arith_ops_syms, + _unary_math_ops, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, assert_numpy_array_equal, assert_produces_warning, - assert_series_equal, makeCustomDataframe as mkdf, randbool) - - -@pytest.fixture(params=( - pytest.param(engine, - marks=pytest.mark.skipif( - engine == 'numexpr' and not _USE_NUMEXPR, - reason='numexpr enabled->{enabled}, ' - 'installed->{installed}'.format( - enabled=_USE_NUMEXPR, - installed=_NUMEXPR_INSTALLED))) - for engine in _engines)) # noqa + assert_frame_equal, + assert_numpy_array_equal, + 
assert_produces_warning, + assert_series_equal, + makeCustomDataframe as mkdf, + randbool, +) + + +@pytest.fixture( + params=( + pytest.param( + engine, + marks=pytest.mark.skipif( + engine == "numexpr" and not _USE_NUMEXPR, + reason="numexpr enabled->{enabled}, " + "installed->{installed}".format( + enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED + ), + ), + ) + for engine in _engines + ) +) # noqa def engine(request): return request.param @@ -51,19 +65,18 @@ def parser(request): @pytest.fixture def ne_lt_2_6_9(): - if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion('2.6.9'): + if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"): pytest.skip("numexpr is >= 2.6.9") - return 'numexpr' + return "numexpr" @pytest.fixture def unary_fns_for_ne(): if _NUMEXPR_INSTALLED: - if _NUMEXPR_VERSION >= LooseVersion('2.6.9'): + if _NUMEXPR_VERSION >= LooseVersion("2.6.9"): return _unary_math_ops else: - return tuple(x for x in _unary_math_ops - if x not in ("floor", "ceil")) + return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) else: pytest.skip("numexpr is not present") @@ -78,23 +91,24 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): try: return c(lhs, rhs) except ValueError as e: - if str(e).startswith('negative number cannot be ' - 'raised to a fractional power'): + if str(e).startswith( + "negative number cannot be " "raised to a fractional power" + ): return np.nan raise return c(lhs, rhs) def _series_and_2d_ndarray(lhs, rhs): - return ((isinstance(lhs, Series) and - isinstance(rhs, np.ndarray) and rhs.ndim > 1) or - (isinstance(rhs, Series) and - isinstance(lhs, np.ndarray) and lhs.ndim > 1)) + return ( + isinstance(lhs, Series) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + ) or (isinstance(rhs, Series) and isinstance(lhs, np.ndarray) and lhs.ndim > 1) def _series_and_frame(lhs, rhs): - return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or - (isinstance(rhs, Series) and isinstance(lhs, DataFrame))) + return (isinstance(lhs, Series) and isinstance(rhs, DataFrame)) or ( + isinstance(rhs, Series) and isinstance(lhs, DataFrame) + ) def _bool_and_frame(lhs, rhs): @@ -102,8 +116,7 @@ def _bool_and_frame(lhs, rhs): def _is_py3_complex_incompat(result, expected): - return (isinstance(expected, (complex, np.complexfloating)) and - np.isnan(result)) + return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result) _good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms) @@ -111,18 +124,18 @@ def _is_py3_complex_incompat(result, expected): @td.skip_if_no_ne class TestEvalNumexprPandas: - @classmethod def setup_class(cls): import numexpr as ne + cls.ne = ne - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" @classmethod def teardown_class(cls): del cls.engine, cls.parser - if hasattr(cls, 'ne'): + if hasattr(cls, "ne"): del cls.ne def setup_data(self): @@ -131,12 +144,20 @@ def setup_data(self): nan_df2 = DataFrame(rand(10, 5)) nan_df2[nan_df2 > 0.5] = np.nan - self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), nan_df1) - self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)), - Series([1, 2, np.nan, np.nan, 5]), nan_df2) - self.scalar_lhses = randn(), - self.scalar_rhses = randn(), + self.pandas_lhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), + nan_df1, + ) + self.pandas_rhses = ( + DataFrame(randn(10, 5)), + Series(randn(5)), + Series([1, 2, np.nan, np.nan, 
5]), + nan_df2, + ) + self.scalar_lhses = (randn(),) + self.scalar_rhses = (randn(),) self.lhses = self.pandas_lhses + self.scalar_lhses self.rhses = self.pandas_rhses + self.scalar_rhses @@ -147,7 +168,7 @@ def setup_ops(self): self.bin_ops = expr._bool_ops_syms self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops - self.unary_ops = '-', '~', 'not ' + self.unary_ops = "-", "~", "not " def setup_method(self, method): self.setup_ops() @@ -159,27 +180,35 @@ def teardown_method(self, method): del self.pandas_rhses, self.pandas_lhses, self.current_engines @pytest.mark.slow - @pytest.mark.parametrize('cmp1', ['!=', '==', '<=', '>=', '<', '>'], - ids=['ne', 'eq', 'le', 'ge', 'lt', 'gt']) - @pytest.mark.parametrize('cmp2', ['>', '<'], ids=['gt', 'lt']) + @pytest.mark.parametrize( + "cmp1", + ["!=", "==", "<=", ">=", "<", ">"], + ids=["ne", "eq", "le", "ge", "lt", "gt"], + ) + @pytest.mark.parametrize("cmp2", [">", "<"], ids=["gt", "lt"]) def test_complex_cmp_ops(self, cmp1, cmp2): - for lhs, rhs, binop in product( - self.lhses, self.rhses, self.bin_ops): + for lhs, rhs, binop in product(self.lhses, self.rhses, self.bin_ops): lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) - expected = _eval_single_bin( - lhs_new, binop, rhs_new, self.engine) + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) - ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format( - cmp1=cmp1, binop=binop, cmp2=cmp2) + ex = "(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)".format( + cmp1=cmp1, binop=binop, cmp2=cmp2 + ) result = pd.eval(ex, engine=self.engine, parser=self.parser) self.check_equal(result, expected) def test_simple_cmp_ops(self): - bool_lhses = (DataFrame(randbool(size=(10, 5))), - Series(randbool((5,))), randbool()) - bool_rhses = (DataFrame(randbool(size=(10, 5))), - Series(randbool((5,))), randbool()) + bool_lhses = ( + DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), + randbool(), + ) + bool_rhses = ( + DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), + randbool(), + ) for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): self.check_simple_cmp_op(lhs, cmp_op, rhs) @@ -190,17 +219,17 @@ def test_binary_arith_ops(self): def test_modulus(self): for lhs, rhs in product(self.lhses, self.rhses): - self.check_modulus(lhs, '%', rhs) + self.check_modulus(lhs, "%", rhs) def test_floor_division(self): for lhs, rhs in product(self.lhses, self.rhses): - self.check_floor_division(lhs, '//', rhs) + self.check_floor_division(lhs, "//", rhs) @td.skip_if_windows def test_pow(self): # odd failure on win32 platform, so skip for lhs, rhs in product(self.lhses, self.rhses): - self.check_pow(lhs, '**', rhs) + self.check_pow(lhs, "**", rhs) @pytest.mark.slow def test_single_invert_op(self): @@ -215,9 +244,10 @@ def test_compound_invert_op(self): @pytest.mark.slow def test_chained_cmp_op(self): mids = self.lhses - cmp_ops = '<', '>' - for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, - mids, cmp_ops, self.rhses): + cmp_ops = "<", ">" + for lhs, cmp1, mid, cmp2, rhs in product( + self.lhses, cmp_ops, mids, cmp_ops, self.rhses + ): self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) def check_equal(self, result, expected): @@ -231,7 +261,6 @@ def check_equal(self, result, expected): assert result == expected def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - def check_operands(left, right, cmp_op): return _eval_single_bin(left, cmp_op, right, self.engine) @@ -239,42 
+268,46 @@ def check_operands(left, right, cmp_op): rhs_new = check_operands(mid, rhs, cmp2) if lhs_new is not None and rhs_new is not None: - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) + ex2 = "lhs {0} mid and mid {1} rhs".format(cmp1, cmp2) + ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp1, cmp2) + expected = _eval_single_bin(lhs_new, "&", rhs_new, self.engine) for ex in (ex1, ex2, ex3): - result = pd.eval(ex, engine=self.engine, - parser=self.parser) + result = pd.eval(ex, engine=self.engine, parser=self.parser) tm.assert_almost_equal(result, expected) def check_simple_cmp_op(self, lhs, cmp1, rhs): - ex = 'lhs {0} rhs'.format(cmp1) - msg = (r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')bool(\]|')|" - "argument of type 'bool' is not iterable") - if cmp1 in ('in', 'not in') and not is_list_like(rhs): + ex = "lhs {0} rhs".format(cmp1) + msg = ( + r"only list-like( or dict-like)? objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')bool(\]|')|" + "argument of type 'bool' is not iterable" + ) + if cmp1 in ("in", "not in") and not is_list_like(rhs): with pytest.raises(TypeError, match=msg): - pd.eval(ex, engine=self.engine, parser=self.parser, - local_dict={'lhs': lhs, 'rhs': rhs}) + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) self.check_equal(result, expected) def check_binary_arith_op(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) tm.assert_almost_equal(result, expected) - ex = 'lhs {0} rhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) - nlhs = _eval_single_bin(lhs, arith1, rhs, - self.engine) + nlhs = _eval_single_bin(lhs, arith1, rhs, self.engine) self.check_alignment(result, nlhs, rhs, arith1) def check_alignment(self, result, nlhs, ghs, op): @@ -287,44 +320,51 @@ def check_alignment(self, result, nlhs, ghs, op): else: # direct numpy comparison - expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) + expected = self.ne.evaluate("nlhs {0} ghs".format(op)) tm.assert_numpy_array_equal(result.values, expected) # modulus, pow, and floor division require special casing def check_modulus(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs tm.assert_almost_equal(result, expected) - expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) + expected = self.ne.evaluate("expected {0} rhs".format(arith1)) if isinstance(result, (DataFrame, Series)): tm.assert_almost_equal(result.values, expected) else: tm.assert_almost_equal(result, expected.item()) def check_floor_division(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) - if self.engine == 'python': + if self.engine == "python": res = pd.eval(ex, engine=self.engine, parser=self.parser) expected = 
lhs // rhs self.check_equal(res, expected) else: - msg = (r"unsupported operand type\(s\) for //: 'VariableNode' and" - " 'VariableNode'") + msg = ( + r"unsupported operand type\(s\) for //: 'VariableNode' and" + " 'VariableNode'" + ) with pytest.raises(TypeError, match=msg): - pd.eval(ex, local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) + pd.eval( + ex, + local_dict={"lhs": lhs, "rhs": rhs}, + engine=self.engine, + parser=self.parser, + ) def get_expected_pow_result(self, lhs, rhs): try: - expected = _eval_single_bin(lhs, '**', rhs, self.engine) + expected = _eval_single_bin(lhs, "**", rhs, self.engine) except ValueError as e: - if str(e).startswith('negative number cannot be ' - 'raised to a fractional power'): - if self.engine == 'python': + if str(e).startswith( + "negative number cannot be " "raised to a fractional power" + ): + if self.engine == "python": pytest.skip(str(e)) else: expected = np.nan @@ -333,21 +373,25 @@ def get_expected_pow_result(self, lhs, rhs): return expected def check_pow(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) - if (is_scalar(lhs) and is_scalar(rhs) and - _is_py3_complex_incompat(result, expected)): + if ( + is_scalar(lhs) + and is_scalar(rhs) + and _is_py3_complex_incompat(result, expected) + ): with pytest.raises(AssertionError): tm.assert_numpy_array_equal(result, expected) else: tm.assert_almost_equal(result, expected) - ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + ex = "(lhs {0} rhs) {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = self.get_expected_pow_result( - self.get_expected_pow_result(lhs, rhs), rhs) + self.get_expected_pow_result(lhs, rhs), rhs + ) tm.assert_almost_equal(result, expected) def check_single_invert_op(self, lhs, cmp1, rhs): @@ -358,25 +402,32 @@ def check_single_invert_op(self, lhs, cmp1, rhs): except AttributeError: elb = np.array([bool(el)]) expected = ~elb - result = pd.eval('~elb', engine=self.engine, parser=self.parser) + result = pd.eval("~elb", engine=self.engine, parser=self.parser) tm.assert_almost_equal(expected, result) for engine in self.current_engines: - tm.assert_almost_equal(result, pd.eval('~elb', engine=engine, - parser=self.parser)) + tm.assert_almost_equal( + result, pd.eval("~elb", engine=engine, parser=self.parser) + ) def check_compound_invert_op(self, lhs, cmp1, rhs): - skip_these = 'in', 'not in' - ex = '~(lhs {0} rhs)'.format(cmp1) - - msg = (r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')float(\]|')|" - "argument of type 'float' is not iterable") + skip_these = "in", "not in" + ex = "~(lhs {0} rhs)".format(cmp1) + + msg = ( + r"only list-like( or dict-like)? 
objects are allowed to be" + r" passed to (DataFrame\.)?isin\(\), you passed a" + r" (\[|')float(\]|')|" + "argument of type 'float' is not iterable" + ) if is_scalar(rhs) and cmp1 in skip_these: with pytest.raises(TypeError, match=msg): - pd.eval(ex, engine=self.engine, parser=self.parser, - local_dict={'lhs': lhs, 'rhs': rhs}) + pd.eval( + ex, + engine=self.engine, + parser=self.parser, + local_dict={"lhs": lhs, "rhs": rhs}, + ) else: # compound if is_scalar(lhs) and is_scalar(rhs): @@ -394,17 +445,17 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): ev = pd.eval(ex, engine=self.engine, parser=self.parser) tm.assert_almost_equal(ev, result) - def ex(self, op, var_name='lhs'): - return '{0}{1}'.format(op, var_name) + def ex(self, op, var_name="lhs"): + return "{0}{1}".format(op, var_name) def test_frame_invert(self): - expr = self.ex('~') + expr = self.ex("~") # ~ ## # frame # float always raises lhs = DataFrame(randn(5, 2)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -413,7 +464,7 @@ def test_frame_invert(self): # int raises on numexpr lhs = DataFrame(randint(5, size=(5, 2))) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -428,8 +479,8 @@ def test_frame_invert(self): assert_frame_equal(expect, result) # object raises - lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) - if self.engine == 'numexpr': + lhs = DataFrame({"b": ["a", 1, 2.0], "c": rand(3) > 0.5}) + if self.engine == "numexpr": with pytest.raises(ValueError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -438,12 +489,12 @@ def test_frame_invert(self): def test_series_invert(self): # ~ #### - expr = self.ex('~') + expr = self.ex("~") # series # float raises lhs = Series(randn(5)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -452,7 +503,7 @@ def test_series_invert(self): # int raises on numexpr lhs = Series(randint(5, size=5)) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -471,8 +522,8 @@ def test_series_invert(self): # bool # object - lhs = Series(['a', 1, 2.0]) - if self.engine == 'numexpr': + lhs = Series(["a", 1, 2.0]) + if self.engine == "numexpr": with pytest.raises(ValueError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -480,7 +531,7 @@ def test_series_invert(self): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_frame_negate(self): - expr = self.ex('-') + expr = self.ex("-") # float lhs = DataFrame(randn(5, 2)) @@ -496,7 +547,7 @@ def test_frame_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = DataFrame(rand(5, 2) > 0.5) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -505,7 +556,7 @@ def test_frame_negate(self): assert_frame_equal(expect, result) def test_series_negate(self): - expr = self.ex('-') + expr = self.ex("-") # float lhs = Series(randn(5)) @@ -521,7 +572,7 @@ def test_series_negate(self): # bool doesn't work with numexpr but works elsewhere lhs = Series(rand(5) > 
0.5) - if self.engine == 'numexpr': + if self.engine == "numexpr": with pytest.raises(NotImplementedError): result = pd.eval(expr, engine=self.engine, parser=self.parser) else: @@ -530,7 +581,7 @@ def test_series_negate(self): assert_series_equal(expect, result) def test_frame_pos(self): - expr = self.ex('+') + expr = self.ex("+") # float lhs = DataFrame(randn(5, 2)) @@ -551,7 +602,7 @@ def test_frame_pos(self): assert_frame_equal(expect, result) def test_series_pos(self): - expr = self.ex('+') + expr = self.ex("+") # float lhs = Series(randn(5)) @@ -573,57 +624,63 @@ def test_series_pos(self): def test_scalar_unary(self): with pytest.raises(TypeError): - pd.eval('~1.0', engine=self.engine, parser=self.parser) - - assert pd.eval('-1.0', parser=self.parser, - engine=self.engine) == -1.0 - assert pd.eval('+1.0', parser=self.parser, - engine=self.engine) == +1.0 - assert pd.eval('~1', parser=self.parser, - engine=self.engine) == ~1 - assert pd.eval('-1', parser=self.parser, - engine=self.engine) == -1 - assert pd.eval('+1', parser=self.parser, - engine=self.engine) == +1 - assert pd.eval('~True', parser=self.parser, - engine=self.engine) == ~True - assert pd.eval('~False', parser=self.parser, - engine=self.engine) == ~False - assert pd.eval('-True', parser=self.parser, - engine=self.engine) == -True - assert pd.eval('-False', parser=self.parser, - engine=self.engine) == -False - assert pd.eval('+True', parser=self.parser, - engine=self.engine) == +True - assert pd.eval('+False', parser=self.parser, - engine=self.engine) == +False + pd.eval("~1.0", engine=self.engine, parser=self.parser) + + assert pd.eval("-1.0", parser=self.parser, engine=self.engine) == -1.0 + assert pd.eval("+1.0", parser=self.parser, engine=self.engine) == +1.0 + assert pd.eval("~1", parser=self.parser, engine=self.engine) == ~1 + assert pd.eval("-1", parser=self.parser, engine=self.engine) == -1 + assert pd.eval("+1", parser=self.parser, engine=self.engine) == +1 + assert pd.eval("~True", parser=self.parser, engine=self.engine) == ~True + assert pd.eval("~False", parser=self.parser, engine=self.engine) == ~False + assert pd.eval("-True", parser=self.parser, engine=self.engine) == -True + assert pd.eval("-False", parser=self.parser, engine=self.engine) == -False + assert pd.eval("+True", parser=self.parser, engine=self.engine) == +True + assert pd.eval("+False", parser=self.parser, engine=self.engine) == +False def test_unary_in_array(self): # GH 11235 assert_numpy_array_equal( - pd.eval('[-True, True, ~True, +True,' - '-False, False, ~False, +False,' - '-37, 37, ~37, +37]'), - np.array([-True, True, ~True, +True, - -False, False, ~False, +False, - -37, 37, ~37, +37], dtype=np.object_)) - - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + pd.eval( + "[-True, True, ~True, +True," + "-False, False, ~False, +False," + "-37, 37, ~37, +37]" + ), + np.array( + [ + -True, + True, + ~True, + +True, + -False, + False, + ~False, + +False, + -37, + 37, + ~37, + +37, + ], + dtype=np.object_, + ), + ) + + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_float_comparison_bin_op(self, dtype): # GH 16363 - df = pd.DataFrame({'x': np.array([0], dtype=dtype)}) - res = df.eval('x < -0.1') + df = pd.DataFrame({"x": np.array([0], dtype=dtype)}) + res = df.eval("x < -0.1") assert res.values == np.array([False]) - res = df.eval('-5 > x') + res = df.eval("-5 > x") assert res.values == np.array([False]) def test_disallow_scalar_bool_ops(self): - exprs = '1 or 2', '1 and 2' - exprs += 'a and b', 'a or b' - exprs += 
'1 or 2 and (3 + 2) > 3', - exprs += '2 * x > 2 or 1 and 2', - exprs += '2 * df > 3 and 1 or a', + exprs = "1 or 2", "1 and 2" + exprs += "a and b", "a or b" + exprs += ("1 or 2 and (3 + 2) > 3",) + exprs += ("2 * x > 2 or 1 and 2",) + exprs += ("2 * df > 3 and 1 or a",) x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) # noqa for ex in exprs: @@ -633,35 +690,35 @@ def test_disallow_scalar_bool_ops(self): def test_identical(self): # see gh-10546 x = 1 - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert result == 1 assert is_scalar(result) x = 1.5 - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert result == 1.5 assert is_scalar(result) x = False - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) assert not result assert is_bool(result) assert is_scalar(result) x = np.array([1]) - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([1])) - assert result.shape == (1, ) + assert result.shape == (1,) x = np.array([1.5]) - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([1.5])) - assert result.shape == (1, ) + assert result.shape == (1,) x = np.array([False]) # noqa - result = pd.eval('x', engine=self.engine, parser=self.parser) + result = pd.eval("x", engine=self.engine, parser=self.parser) tm.assert_numpy_array_equal(result, np.array([False])) - assert result.shape == (1, ) + assert result.shape == (1,) def test_line_continuation(self): # GH 11149 @@ -672,14 +729,12 @@ def test_line_continuation(self): def test_float_truncation(self): # GH 14241 - exp = '1000000000.006' + exp = "1000000000.006" result = pd.eval(exp, engine=self.engine, parser=self.parser) expected = np.float64(exp) assert result == expected - df = pd.DataFrame({'A': [1000000000.0009, - 1000000000.0011, - 1000000000.0015]}) + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) cutoff = 1000000000.0006 result = df.query("A < %.4f" % cutoff) assert result.empty @@ -690,60 +745,59 @@ def test_float_truncation(self): tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = df.query('A == %.4f' % exact) + result = df.query("A == %.4f" % exact) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) def test_disallow_python_keywords(self): # GH 18221 - df = pd.DataFrame([[0, 0, 0]], columns=['foo', 'bar', 'class']) + df = pd.DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"]) msg = "Python keyword not valid identifier in numexpr query" with pytest.raises(SyntaxError, match=msg): - df.query('class == 0') + df.query("class == 0") df = pd.DataFrame() - df.index.name = 'lambda' + df.index.name = "lambda" with pytest.raises(SyntaxError, match=msg): - df.query('lambda == 0') + df.query("lambda == 0") @td.skip_if_no_ne class TestEvalNumexprPython(TestEvalNumexprPandas): - @classmethod def setup_class(cls): super().setup_class() import numexpr as ne + cls.ne = ne - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" def setup_ops(self): - self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'), - expr._cmp_ops_syms)) + self.cmp_ops = list( + 
filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms) + ) self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [s for s in expr._bool_ops_syms - if s not in ('and', 'or')] + self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")] self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops - self.unary_ops = '+', '-', '~' + self.unary_ops = "+", "-", "~" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex1 = "lhs {0} mid {1} rhs".format(cmp1, cmp2) with pytest.raises(NotImplementedError): pd.eval(ex1, engine=self.engine, parser=self.parser) class TestEvalPythonPython(TestEvalNumexprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'python' + cls.engine = "python" + cls.parser = "python" def check_modulus(self, lhs, arith1, rhs): - ex = 'lhs {0} rhs'.format(arith1) + ex = "lhs {0} rhs".format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs @@ -760,21 +814,19 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: - expected = eval('nlhs {0} ghs'.format(op)) + expected = eval("nlhs {0} ghs".format(op)) tm.assert_almost_equal(result, expected) class TestEvalPythonPandas(TestEvalPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, - rhs) + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs) f = lambda *args, **kwargs: np.random.randn() @@ -785,19 +837,19 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): class TestTypeCasting: - @pytest.mark.parametrize('op', ['+', '-', '*', '**', '/']) + @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"]) # maybe someday... 
numexpr has too many upcasting rules now # chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])) - @pytest.mark.parametrize('dt', [np.float32, np.float64]) + @pytest.mark.parametrize("dt", [np.float32, np.float64]) def test_binop_typecasting(self, engine, parser, op, dt): df = mkdf(5, 3, data_gen_f=f, dtype=dt) - s = 'df {} 3'.format(op) + s = "df {} 3".format(op) res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt assert_frame_equal(res, eval(s)) - s = '3 {} df'.format(op) + s = "3 {} df".format(op) res = pd.eval(s, engine=engine, parser=parser) assert df.values.dtype == dt assert res.values.dtype == dt @@ -807,65 +859,68 @@ def test_binop_typecasting(self, engine, parser, op, dt): # ------------------------------------- # Basic and complex alignment + def _is_datetime(x): return issubclass(x.dtype.type, np.datetime64) def should_warn(*args): - not_mono = not any(map(operator.attrgetter('is_monotonic'), args)) + not_mono = not any(map(operator.attrgetter("is_monotonic"), args)) only_one_dt = reduce(operator.xor, map(_is_datetime, args)) return not_mono and only_one_dt class TestAlignment: - index_types = 'i', 'u', 'dt' - lhs_index_types = index_types + ('s',) # 'p' + index_types = "i", "u", "dt" + lhs_index_types = index_types + ("s",) # 'p' def test_align_nested_unary_op(self, engine, parser): - s = 'df * ~2' + s = "df * ~2" df = mkdf(5, 3, data_gen_f=f) res = pd.eval(s, engine=engine, parser=parser) assert_frame_equal(res, df * ~2) def test_basic_frame_alignment(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, - self.index_types) + args = product(self.lhs_index_types, self.index_types, self.index_types) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for lr_idx_type, rr_idx_type, c_idx_type in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, - c_idx_type=c_idx_type) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=lr_idx_type, c_idx_type=c_idx_type + ) + df2 = mkdf( + 20, 10, data_gen_f=f, r_idx_type=rr_idx_type, c_idx_type=c_idx_type + ) # only warns if not monotonic and not sortable if should_warn(df.index, df2.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + df2', engine=engine, parser=parser) + res = pd.eval("df + df2", engine=engine, parser=parser) else: - res = pd.eval('df + df2', engine=engine, parser=parser) + res = pd.eval("df + df2", engine=engine, parser=parser) assert_frame_equal(res, df + df2) def test_frame_comparison(self, engine, parser): args = product(self.lhs_index_types, repeat=2) for r_idx_type, c_idx_type in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - res = pd.eval('df < 2', engine=engine, parser=parser) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) + res = pd.eval("df < 2", engine=engine, parser=parser) assert_frame_equal(res, df < 2) - df3 = DataFrame(randn(*df.shape), index=df.index, - columns=df.columns) - res = pd.eval('df < df3', engine=engine, parser=parser) + df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) + res = pd.eval("df < df3", engine=engine, parser=parser) assert_frame_equal(res, df < df3) @pytest.mark.slow def test_medium_complex_frame_alignment(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, - 
self.index_types, self.index_types) + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r1, c1, r2, c2 in args: df = mkdf(3, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) @@ -873,59 +928,56 @@ def test_medium_complex_frame_alignment(self, engine, parser): df3 = mkdf(5, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) if should_warn(df.index, df2.index, df3.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + df2 + df3', engine=engine, - parser=parser) + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) else: - res = pd.eval('df + df2 + df3', - engine=engine, parser=parser) + res = pd.eval("df + df2 + df3", engine=engine, parser=parser) assert_frame_equal(res, df + df2 + df3) def test_basic_frame_series_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) if should_warn(df.index, s.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df + s', engine=engine, parser=parser) + res = pd.eval("df + s", engine=engine, parser=parser) else: - res = pd.eval('df + s', engine=engine, parser=parser) + res = pd.eval("df + s", engine=engine, parser=parser) - if r_idx_type == 'dt' or c_idx_type == 'dt': - expected = df.add(s) if engine == 'numexpr' else df + s + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else df + s else: expected = df + s assert_frame_equal(res, expected) - args = product(self.lhs_index_types, self.index_types, - ('index', 'columns')) + args = product(self.lhs_index_types, self.index_types, ("index", "columns")) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) def test_basic_series_frame_alignment(self, engine, parser): def testit(r_idx_type, c_idx_type, index_name): - df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) if should_warn(s.index, df.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('s + df', engine=engine, parser=parser) + res = pd.eval("s + df", engine=engine, parser=parser) else: - res = pd.eval('s + df', engine=engine, parser=parser) + res = pd.eval("s + df", engine=engine, parser=parser) - if r_idx_type == 'dt' or c_idx_type == 'dt': - expected = df.add(s) if engine == 'numexpr' else s + df + if r_idx_type == "dt" or c_idx_type == "dt": + expected = df.add(s) if engine == "numexpr" else s + df else: expected = s + df assert_frame_equal(res, expected) # only test dt with dt, otherwise weird joins result - args = product(['i', 'u', 's'], ['i', 'u', 's'], ('index', 'columns')) + args = product(["i", "u", "s"], ["i", "u", "s"], ("index", "columns")) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) @@ -934,7 +986,7 @@ def testit(r_idx_type, 
c_idx_type, index_name): testit(r_idx_type, c_idx_type, index_name) # dt with dt - args = product(['dt'], ['dt'], ('index', 'columns')) + args = product(["dt"], ["dt"], ("index", "columns")) with warnings.catch_warnings(record=True): # avoid warning about comparing strings and ints warnings.simplefilter("ignore", RuntimeWarning) @@ -943,19 +995,21 @@ def testit(r_idx_type, c_idx_type, index_name): testit(r_idx_type, c_idx_type, index_name) def test_series_frame_commutativity(self, engine, parser): - args = product(self.lhs_index_types, self.index_types, ('+', '*'), - ('index', 'columns')) + args = product( + self.lhs_index_types, self.index_types, ("+", "*"), ("index", "columns") + ) with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r_idx_type, c_idx_type, op, index_name in args: - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + df = mkdf( + 10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - lhs = 's {0} df'.format(op) - rhs = 'df {0} s'.format(op) + lhs = "s {0} df".format(op) + rhs = "df {0} s".format(op) if should_warn(df.index, s.index): with tm.assert_produces_warning(RuntimeWarning): a = pd.eval(lhs, engine=engine, parser=parser) @@ -965,40 +1019,42 @@ def test_series_frame_commutativity(self, engine, parser): a = pd.eval(lhs, engine=engine, parser=parser) b = pd.eval(rhs, engine=engine, parser=parser) - if r_idx_type != 'dt' and c_idx_type != 'dt': - if engine == 'numexpr': + if r_idx_type != "dt" and c_idx_type != "dt": + if engine == "numexpr": assert_frame_equal(a, b) @pytest.mark.slow def test_complex_series_frame_alignment(self, engine, parser): import random - args = product(self.lhs_index_types, self.index_types, - self.index_types, self.index_types) + + args = product( + self.lhs_index_types, self.index_types, self.index_types, self.index_types + ) n = 3 m1 = 5 m2 = 2 * m1 with warnings.catch_warnings(record=True): - warnings.simplefilter('always', RuntimeWarning) + warnings.simplefilter("always", RuntimeWarning) for r1, r2, c1, c2 in args: - index_name = random.choice(['index', 'columns']) - obj_name = random.choice(['df', 'df2']) + index_name = random.choice(["index", "columns"]) + obj_name = random.choice(["df", "df2"]) df = mkdf(m1, n, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) df2 = mkdf(m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) index = getattr(locals().get(obj_name), index_name) s = Series(np.random.randn(n), index[:n]) - if r2 == 'dt' or c2 == 'dt': - if engine == 'numexpr': + if r2 == "dt" or c2 == "dt": + if engine == "numexpr": expected2 = df2.add(s) else: expected2 = df2 + s else: expected2 = df2 + s - if r1 == 'dt' or c1 == 'dt': - if engine == 'numexpr': + if r1 == "dt" or c1 == "dt": + if engine == "numexpr": expected = expected2.add(df) else: expected = expected2 + df @@ -1007,37 +1063,36 @@ def test_complex_series_frame_alignment(self, engine, parser): if should_warn(df2.index, s.index, df.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval('df2 + s + df', engine=engine, - parser=parser) + res = pd.eval("df2 + s + df", engine=engine, parser=parser) else: - res = pd.eval('df2 + s + df', engine=engine, parser=parser) + res = pd.eval("df2 + s + df", engine=engine, parser=parser) assert res.shape == expected.shape assert_frame_equal(res, expected) def test_performance_warning_for_poor_alignment(self, engine, 
parser): df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) - if engine == 'numexpr': + if engine == "numexpr": seen = PerformanceWarning else: seen = False with assert_produces_warning(seen): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) s = Series(randn(1000)) with assert_produces_warning(False): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) df = DataFrame(randn(10, 10000)) s = Series(randn(10000)) with assert_produces_warning(False): - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) df = DataFrame(randn(10, 10)) s = Series(randn(10000)) - is_python_engine = engine == 'python' + is_python_engine = engine == "python" if not is_python_engine: wrn = PerformanceWarning @@ -1045,28 +1100,30 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): wrn = False with assert_produces_warning(wrn) as w: - pd.eval('df + s', engine=engine, parser=parser) + pd.eval("df + s", engine=engine, parser=parser) if not is_python_engine: assert len(w) == 1 msg = str(w[0].message) - expected = ("Alignment difference on axis {0} is larger" - " than an order of magnitude on term {1!r}, " - "by more than {2:.4g}; performance may suffer" - "".format(1, 'df', np.log10(s.size - df.shape[1]))) + expected = ( + "Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(1, "df", np.log10(s.size - df.shape[1])) + ) assert msg == expected # ------------------------------------ # Slightly more complex ops + @td.skip_if_no_ne class TestOperationsNumExprPandas: - @classmethod def setup_class(cls): - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms @classmethod @@ -1074,20 +1131,20 @@ def teardown_class(cls): del cls.engine, cls.parser def eval(self, *args, **kwargs): - kwargs['engine'] = self.engine - kwargs['parser'] = self.parser - kwargs['level'] = kwargs.pop('level', 0) + 1 + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): ops = self.arith_ops - for op in filter(lambda x: x != '//', ops): - ex = '1 {0} 1'.format(op) - ex2 = 'x {0} 1'.format(op) - ex3 = '1 {0} (x + 1)'.format(op) + for op in filter(lambda x: x != "//", ops): + ex = "1 {0} 1".format(op) + ex2 = "x {0} 1".format(op) + ex3 = "1 {0} (x + 1)".format(op) - if op in ('in', 'not in'): + if op in ("in", "not in"): msg = "argument of type 'int' is not iterable" with pytest.raises(TypeError, match=msg): pd.eval(ex, engine=self.engine, parser=self.parser) @@ -1097,27 +1154,29 @@ def test_simple_arith_ops(self): assert x == expec expec = _eval_single_bin(x, op, 1, self.engine) - y = self.eval(ex2, local_dict={'x': x}, engine=self.engine, - parser=self.parser) + y = self.eval( + ex2, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) assert y == expec expec = _eval_single_bin(1, op, x + 1, self.engine) - y = self.eval(ex3, local_dict={'x': x}, - engine=self.engine, parser=self.parser) + y = self.eval( + ex3, local_dict={"x": x}, engine=self.engine, parser=self.parser + ) assert y == expec def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), - (True, False)): - ex = '{0} {1} {2}'.format(lhs, op, rhs) + for op, 
lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = "{0} {1} {2}".format(lhs, op, rhs) res = self.eval(ex) exp = eval(ex) assert res == exp def test_bool_ops_with_constants(self): - for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), - ('True', 'False')): - ex = '{0} {1} {2}'.format(lhs, op, rhs) + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = "{0} {1} {2}".format(lhs, op, rhs) res = self.eval(ex) exp = eval(ex) assert res == exp @@ -1126,22 +1185,21 @@ def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) with pytest.raises(NotImplementedError): - self.eval('x + y', - local_dict={'x': x, 'y': y}) + self.eval("x + y", local_dict={"x": x, "y": y}) def test_constant(self): - x = self.eval('1') + x = self.eval("1") assert x == 1 def test_single_variable(self): df = DataFrame(randn(10, 2)) - df2 = self.eval('df', local_dict={'df': df}) + df2 = self.eval("df", local_dict={"df": df}) assert_frame_equal(df, df2) def test_truediv(self): s = np.array([1]) - ex = 's / 1' - d = {'s': s} # noqa + ex = "s / 1" + d = {"s": s} # noqa res = self.eval(ex, truediv=False) tm.assert_numpy_array_equal(res, np.array([1.0])) @@ -1149,94 +1207,94 @@ def test_truediv(self): res = self.eval(ex, truediv=True) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval('1 / 2', truediv=True) + res = self.eval("1 / 2", truediv=True) expec = 0.5 assert res == expec - res = self.eval('1 / 2', truediv=False) + res = self.eval("1 / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval('s / 2', truediv=False) + res = self.eval("s / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval('s / 2', truediv=True) + res = self.eval("s / 2", truediv=True) expec = 0.5 assert res == expec def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) # noqa with pytest.raises(NameError): - self.eval('df[x > 2] > 2') + self.eval("df[x > 2] > 2") def test_lhs_expression_subscript(self): df = DataFrame(np.random.randn(5, 3)) - result = self.eval('(df + 1)[df > 2]', local_dict={'df': df}) + result = self.eval("(df + 1)[df > 2]", local_dict={"df": df}) expected = (df + 1)[df > 2] assert_frame_equal(result, expected) def test_attr_expression(self): - df = DataFrame(np.random.randn(5, 3), columns=list('abc')) - expr1 = 'df.a < df.b' + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) + expr1 = "df.a < df.b" expec1 = df.a < df.b - expr2 = 'df.a + df.b + df.c' + expr2 = "df.a + df.b + df.c" expec2 = df.a + df.b + df.c - expr3 = 'df.a + df.b + df.c[df.b < 0]' + expr3 = "df.a + df.b + df.c[df.b < 0]" expec3 = df.a + df.b + df.c[df.b < 0] exprs = expr1, expr2, expr3 expecs = expec1, expec2, expec3 for e, expec in zip(exprs, expecs): - assert_series_equal(expec, self.eval(e, local_dict={'df': df})) + assert_series_equal(expec, self.eval(e, local_dict={"df": df})) def test_assignment_fails(self): - df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + df = DataFrame(np.random.randn(5, 3), columns=list("abc")) df2 = DataFrame(np.random.randn(5, 3)) - expr1 = 'df = df2' + expr1 = "df = df2" msg = "cannot assign without a target object" with pytest.raises(ValueError, match=msg): - self.eval(expr1, local_dict={'df': df, 'df2': df2}) + self.eval(expr1, local_dict={"df": df, "df2": df2}) def test_assignment_column(self): - df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) orig_df = 
df.copy() # multiple assignees with pytest.raises(SyntaxError, match="invalid syntax"): - df.eval('d c = a + b') + df.eval("d c = a + b") # invalid assignees msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): - df.eval('d,c = a + b') + df.eval("d,c = a + b") msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') # single assignment - existing variable expected = orig_df.copy() - expected['a'] = expected['a'] + expected['b'] + expected["a"] = expected["a"] + expected["b"] df = orig_df.copy() - df.eval('a = a + b', inplace=True) + df.eval("a = a + b", inplace=True) assert_frame_equal(df, expected) # single assignment - new variable expected = orig_df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] df = orig_df.copy() - df.eval('c = a + b', inplace=True) + df.eval("c = a + b", inplace=True) assert_frame_equal(df, expected) # with a local name overlap def f(): df = orig_df.copy() a = 1 # noqa - df.eval('a = 1 + b', inplace=True) + df.eval("a = 1 + b", inplace=True) return df df = f() expected = orig_df.copy() - expected['a'] = 1 + expected['b'] + expected["a"] = 1 + expected["b"] assert_frame_equal(df, expected) df = orig_df.copy() @@ -1244,7 +1302,7 @@ def f(): def f(): a = 1 # noqa old_a = df.a.copy() - df.eval('a = a + b', inplace=True) + df.eval("a = a + b", inplace=True) result = old_a + df.b assert_series_equal(result, df.a, check_names=False) assert result.name is None @@ -1253,146 +1311,169 @@ def f(): # multiple assignment df = orig_df.copy() - df.eval('c = a + b', inplace=True) + df.eval("c = a + b", inplace=True) msg = "can only assign a single expression" with pytest.raises(SyntaxError, match=msg): - df.eval('c = a = b') + df.eval("c = a = b") # explicit targets df = orig_df.copy() - self.eval('c = df.a + df.b', local_dict={'df': df}, - target=df, inplace=True) + self.eval("c = df.a + df.b", local_dict={"df": df}, target=df, inplace=True) expected = orig_df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] assert_frame_equal(df, expected) def test_column_in(self): # GH 11235 - df = DataFrame({'a': [11], 'b': [-32]}) - result = df.eval('a in [11, -32]') + df = DataFrame({"a": [11], "b": [-32]}) + result = df.eval("a in [11, -32]") expected = Series([True]) assert_series_equal(result, expected) def assignment_not_inplace(self): # see gh-9297 - df = DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = DataFrame(np.random.randn(5, 2), columns=list("ab")) - actual = df.eval('c = a + b', inplace=False) + actual = df.eval("c = a + b", inplace=False) assert actual is not None expected = df.copy() - expected['c'] = expected['a'] + expected['b'] + expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) def test_multi_line_expression(self): # GH 11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected['c'] = expected['a'] + expected['b'] - expected['d'] = expected['c'] + expected['b'] - ans = df.eval(""" + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + ans = df.eval( + """ c = a + b - d = c + b""", inplace=True) + d = c + b""", + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None - expected['a'] = expected['a'] - 1 - expected['e'] = expected['a'] + 2 - ans = df.eval(""" + 
expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + ans = df.eval( + """ a = a - 1 - e = a + 2""", inplace=True) + e = a + 2""", + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None # multi-line not valid if not all assignments with pytest.raises(ValueError): - df.eval(""" + df.eval( + """ a = b + 2 - b - 2""", inplace=False) + b - 2""", + inplace=False, + ) def test_multi_line_expression_not_inplace(self): # GH 11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected['c'] = expected['a'] + expected['b'] - expected['d'] = expected['c'] + expected['b'] - df = df.eval(""" + expected["c"] = expected["a"] + expected["b"] + expected["d"] = expected["c"] + expected["b"] + df = df.eval( + """ c = a + b - d = c + b""", inplace=False) + d = c + b""", + inplace=False, + ) assert_frame_equal(expected, df) - expected['a'] = expected['a'] - 1 - expected['e'] = expected['a'] + 2 - df = df.eval(""" + expected["a"] = expected["a"] - 1 + expected["e"] = expected["a"] + 2 + df = df.eval( + """ a = a - 1 - e = a + 2""", inplace=False) + e = a + 2""", + inplace=False, + ) assert_frame_equal(expected, df) def test_multi_line_expression_local_variable(self): # GH 15342 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() local_var = 7 - expected['c'] = expected['a'] * local_var - expected['d'] = expected['c'] + local_var - ans = df.eval(""" + expected["c"] = expected["a"] * local_var + expected["d"] = expected["c"] + local_var + ans = df.eval( + """ c = a * @local_var d = c + @local_var - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_multi_line_expression_callable_local_variable(self): # 26426 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b expected = df.copy() - expected['c'] = expected['a'] * local_func(1, 7) - expected['d'] = expected['c'] + local_func(1, 7) - ans = df.eval(""" + expected["c"] = expected["a"] * local_func(1, 7) + expected["d"] = expected["c"] + local_func(1, 7) + ans = df.eval( + """ c = a * @local_func(1, 7) d = c + @local_func(1, 7) - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_multi_line_expression_callable_local_variable_with_kwargs(self): # 26426 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) def local_func(a, b): return b expected = df.copy() - expected['c'] = expected['a'] * local_func(b=7, a=1) - expected['d'] = expected['c'] + local_func(b=7, a=1) - ans = df.eval(""" + expected["c"] = expected["a"] * local_func(b=7, a=1) + expected["d"] = expected["c"] + local_func(b=7, a=1) + ans = df.eval( + """ c = a * @local_func(b=7, a=1) d = c + @local_func(b=7, a=1) - """, inplace=True) + """, + inplace=True, + ) assert_frame_equal(expected, df) assert ans is None def test_assignment_in_query(self): # GH 8664 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() with pytest.raises(ValueError): - df.query('a = 1') + df.query("a = 1") assert_frame_equal(df, df_orig) def test_query_inplace(self): # see gh-11149 - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) expected = df.copy() - expected = 
expected[expected['a'] == 2] - df.query('a == 2', inplace=True) + expected = expected[expected["a"] == 2] + df.query("a == 2", inplace=True) assert_frame_equal(expected, df) df = {} @@ -1401,8 +1482,7 @@ def test_query_inplace(self): self.eval("a = 1 + 2", target=df, inplace=True) tm.assert_dict_equal(df, expected) - @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], - np.array([]), (1, 3)]) + @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], np.array([]), (1, 3)]) @pytest.mark.filterwarnings("ignore::FutureWarning") def test_cannot_item_assign(self, invalid_target): msg = "Cannot assign expression output to target" @@ -1423,8 +1503,7 @@ def test_cannot_copy_item(self, invalid_target): with pytest.raises(ValueError, match=msg): self.eval(expression, target=invalid_target, inplace=False) - @pytest.mark.parametrize("target", [1, "cat", [1, 2], - np.array([]), (1, 3), {1: 2}]) + @pytest.mark.parametrize("target", [1, "cat", [1, 2], np.array([]), (1, 3), {1: 2}]) def test_inplace_no_assignment(self, target): expression = "1 + 2" @@ -1435,109 +1514,106 @@ def test_inplace_no_assignment(self, target): self.eval(expression, target=target, inplace=True) def test_basic_period_index_boolean_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") e = df < 2 - r = self.eval('df < 2', local_dict={'df': df}) + r = self.eval("df < 2", local_dict={"df": df}) x = df < 2 assert_frame_equal(r, e) assert_frame_equal(x, e) def test_basic_period_index_subscript_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = self.eval('df[df < 2 + 3]', local_dict={'df': df}) + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df < 2 + 3]", local_dict={"df": df}) e = df[df < 2 + 3] assert_frame_equal(r, e) def test_nested_period_index_subscript_expression(self): - df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df}) + df = mkdf(2, 2, data_gen_f=f, c_idx_type="p", r_idx_type="i") + r = self.eval("df[df[df < 2] < 2] + df * 2", local_dict={"df": df}) e = df[df[df < 2] < 2] + df * 2 assert_frame_equal(r, e) def test_date_boolean(self): df = DataFrame(randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - res = self.eval('df.dates1 < 20130101', local_dict={'df': df}, - engine=self.engine, parser=self.parser) - expec = df.dates1 < '20130101' + df["dates1"] = date_range("1/1/2012", periods=5) + res = self.eval( + "df.dates1 < 20130101", + local_dict={"df": df}, + engine=self.engine, + parser=self.parser, + ) + expec = df.dates1 < "20130101" assert_series_equal(res, expec, check_names=False) def test_simple_in_ops(self): - if self.parser != 'python': - res = pd.eval('1 in [1, 2]', engine=self.engine, - parser=self.parser) + if self.parser != "python": + res = pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('2 in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('3 in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) assert not res - res = pd.eval('3 not in (1, 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[3] not in (1, 2)', engine=self.engine, - 
parser=self.parser) + res = pd.eval("[3] not in (1, 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[3] in ([3], 2)', engine=self.engine, - parser=self.parser) + res = pd.eval("[3] in ([3], 2)", engine=self.engine, parser=self.parser) assert res - res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine, - parser=self.parser) + res = pd.eval("[[3]] in [[[3]], 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('(3,) in [(3,), 2]', engine=self.engine, - parser=self.parser) + res = pd.eval("(3,) in [(3,), 2]", engine=self.engine, parser=self.parser) assert res - res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine, - parser=self.parser) + res = pd.eval( + "(3,) not in [(3,), 2]", engine=self.engine, parser=self.parser + ) assert not res - res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine, - parser=self.parser) + res = pd.eval( + "[(3,)] in [[(3,)], 2]", engine=self.engine, parser=self.parser + ) assert res else: with pytest.raises(NotImplementedError): - pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser) + pd.eval("1 in [1, 2]", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser) + pd.eval("2 in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser) + pd.eval("3 in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('3 not in (1, 2)', engine=self.engine, - parser=self.parser) + pd.eval("3 not in (1, 2)", engine=self.engine, parser=self.parser) with pytest.raises(NotImplementedError): - pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine, - parser=self.parser) + pd.eval( + "[(3,)] in (1, 2, [(3,)])", engine=self.engine, parser=self.parser + ) with pytest.raises(NotImplementedError): - pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine, - parser=self.parser) + pd.eval( + "[3] not in (1, 2, [[3]])", engine=self.engine, parser=self.parser + ) @td.skip_if_no_ne class TestOperationsNumExprPython(TestOperationsNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), - cls.arith_ops) + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) def test_check_many_exprs(self): a = 1 # noqa - expr = ' * '.join('a' * 33) + expr = " * ".join("a" * 33) expected = 1 res = pd.eval(expr, engine=self.engine, parser=self.parser) assert res == expected @@ -1546,40 +1622,53 @@ def test_fails_and(self): df = DataFrame(np.random.randn(5, 3)) msg = "'BoolOp' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('df > 2 and df > 3', local_dict={'df': df}, - parser=self.parser, engine=self.engine) + pd.eval( + "df > 2 and df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_or(self): df = DataFrame(np.random.randn(5, 3)) msg = "'BoolOp' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('df > 2 or df > 3', local_dict={'df': df}, - parser=self.parser, engine=self.engine) + pd.eval( + "df > 2 or df > 3", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_not(self): df = DataFrame(np.random.randn(5, 3)) msg = 
"'Not' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - pd.eval('not df > 2', local_dict={'df': df}, parser=self.parser, - engine=self.engine) + pd.eval( + "not df > 2", + local_dict={"df": df}, + parser=self.parser, + engine=self.engine, + ) def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) # noqa - ex = '(df + 2)[df > 1] > 0 & (df > 0)' + ex = "(df + 2)[df > 1] > 0 & (df > 0)" with pytest.raises(NotImplementedError): pd.eval(ex, parser=self.parser, engine=self.engine) def test_fails_pipe(self): df = DataFrame(np.random.randn(5, 3)) # noqa - ex = '(df + 2)[df > 1] > 0 | (df > 0)' + ex = "(df + 2)[df > 1] > 0 | (df > 0)" with pytest.raises(NotImplementedError): pd.eval(ex, parser=self.parser, engine=self.engine) def test_bool_ops_with_constants(self): - for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), - ('True', 'False')): - ex = '{0} {1} {2}'.format(lhs, op, rhs) - if op in ('and', 'or'): + for op, lhs, rhs in product( + expr._bool_ops_syms, ("True", "False"), ("True", "False") + ): + ex = "{0} {1} {2}".format(lhs, op, rhs) + if op in ("and", "or"): with pytest.raises(NotImplementedError): self.eval(ex) else: @@ -1588,10 +1677,9 @@ def test_bool_ops_with_constants(self): assert res == exp def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), - (True, False)): - ex = 'lhs {0} rhs'.format(op) - if op in ('and', 'or'): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + ex = "lhs {0} rhs".format(op) + if op in ("and", "or"): with pytest.raises(NotImplementedError): pd.eval(ex, engine=self.engine, parser=self.parser) else: @@ -1601,33 +1689,29 @@ def test_simple_bool_ops(self): class TestOperationsPythonPython(TestOperationsNumExprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = cls.parser = 'python' + cls.engine = cls.parser = "python" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), - cls.arith_ops) + cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) class TestOperationsPythonPandas(TestOperationsNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms @td.skip_if_no_ne class TestMathPythonPython: - @classmethod def setup_class(cls): - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.unary_fns = _unary_math_ops cls.binary_fns = _binary_math_ops @@ -1636,69 +1720,63 @@ def teardown_class(cls): del cls.engine, cls.parser def eval(self, *args, **kwargs): - kwargs['engine'] = self.engine - kwargs['parser'] = self.parser - kwargs['level'] = kwargs.pop('level', 0) + 1 + kwargs["engine"] = self.engine + kwargs["parser"] = self.parser + kwargs["level"] = kwargs.pop("level", 0) + 1 return pd.eval(*args, **kwargs) def test_unary_functions(self, unary_fns_for_ne): - df = DataFrame({'a': np.random.randn(10)}) + df = DataFrame({"a": np.random.randn(10)}) a = df.a for fn in unary_fns_for_ne: expr = "{0}(a)".format(fn) got = self.eval(expr) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = getattr(np, fn)(a) tm.assert_series_equal(got, expect, check_names=False) - def test_floor_and_ceil_functions_raise_error(self, - ne_lt_2_6_9, - unary_fns_for_ne): - for fn in ('floor', 'ceil'): - msg = 
"\"{0}\" is not a supported function".format(fn) + def test_floor_and_ceil_functions_raise_error(self, ne_lt_2_6_9, unary_fns_for_ne): + for fn in ("floor", "ceil"): + msg = '"{0}" is not a supported function'.format(fn) with pytest.raises(ValueError, match=msg): expr = "{0}(100)".format(fn) self.eval(expr) def test_binary_functions(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) a = df.a b = df.b for fn in self.binary_fns: expr = "{0}(a, b)".format(fn) got = self.eval(expr) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - df.eval("e = arctan2(sin(a), b)", - engine=self.engine, - parser=self.parser, inplace=True) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval( + "e = arctan2(sin(a), b)", + engine=self.engine, + parser=self.parser, + inplace=True, + ) got = df.e expect = np.arctan2(np.sin(df.a), df.b) tm.assert_series_equal(got, expect, check_names=False) def test_df_arithmetic_subexpression(self): - df = DataFrame({'a': np.random.randn(10), - 'b': np.random.randn(10)}) - df.eval("e = sin(a + b)", - engine=self.engine, - parser=self.parser, inplace=True) + df = DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("e = sin(a + b)", engine=self.engine, parser=self.parser, inplace=True) got = df.e expect = np.sin(df.a + df.b) tm.assert_series_equal(got, expect, check_names=False) def check_result_type(self, dtype, expect_dtype): - df = DataFrame({'a': np.random.randn(10).astype(dtype)}) + df = DataFrame({"a": np.random.randn(10).astype(dtype)}) assert df.a.dtype == dtype - df.eval("b = sin(a)", - engine=self.engine, - parser=self.parser, inplace=True) + df.eval("b = sin(a)", engine=self.engine, parser=self.parser, inplace=True) got = df.b expect = np.sin(df.a) assert expect.dtype == got.dtype @@ -1720,101 +1798,97 @@ def test_result_types2(self): self.check_result_type(np.complex128, np.complex128) def test_undefined_func(self): - df = DataFrame({'a': np.random.randn(10)}) - msg = "\"mysin\" is not a supported function" + df = DataFrame({"a": np.random.randn(10)}) + msg = '"mysin" is not a supported function' with pytest.raises(ValueError, match=msg): - df.eval("mysin(a)", - engine=self.engine, - parser=self.parser) + df.eval("mysin(a)", engine=self.engine, parser=self.parser) def test_keyword_arg(self): - df = DataFrame({'a': np.random.randn(10)}) - msg = "Function \"sin\" does not support keyword arguments" + df = DataFrame({"a": np.random.randn(10)}) + msg = 'Function "sin" does not support keyword arguments' with pytest.raises(TypeError, match=msg): - df.eval("sin(x=a)", - engine=self.engine, - parser=self.parser) + df.eval("sin(x=a)", engine=self.engine, parser=self.parser) class TestMathPythonPandas(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" class TestMathNumExprPandas(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" class TestMathNumExprPython(TestMathPythonPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 
'python' + cls.engine = "numexpr" + cls.parser = "python" _var_s = randn(10) class TestScope: - def test_global_scope(self, engine, parser): - e = '_var_s * 2' - tm.assert_numpy_array_equal(_var_s * 2, pd.eval(e, engine=engine, - parser=parser)) + e = "_var_s * 2" + tm.assert_numpy_array_equal( + _var_s * 2, pd.eval(e, engine=engine, parser=parser) + ) def test_no_new_locals(self, engine, parser): x = 1 # noqa lcls = locals().copy() - pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) + pd.eval("x + 1", local_dict=lcls, engine=engine, parser=parser) lcls2 = locals().copy() - lcls2.pop('lcls') + lcls2.pop("lcls") assert lcls == lcls2 def test_no_new_globals(self, engine, parser): x = 1 # noqa gbls = globals().copy() - pd.eval('x + 1', engine=engine, parser=parser) + pd.eval("x + 1", engine=engine, parser=parser) gbls2 = globals().copy() assert gbls == gbls2 @td.skip_if_no_ne def test_invalid_engine(): - msg = 'Invalid engine \'asdf\' passed' + msg = "Invalid engine 'asdf' passed" with pytest.raises(KeyError, match=msg): - pd.eval('x + y', local_dict={'x': 1, 'y': 2}, engine='asdf') + pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf") @td.skip_if_no_ne def test_invalid_parser(): - msg = 'Invalid parser \'asdf\' passed' + msg = "Invalid parser 'asdf' passed" with pytest.raises(KeyError, match=msg): - pd.eval('x + y', local_dict={'x': 1, 'y': 2}, parser='asdf') + pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf") -_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, - 'pandas': PandasExprVisitor} +_parsers = { + "python": PythonExprVisitor, + "pytables": pytables.ExprVisitor, + "pandas": PandasExprVisitor, +} -@pytest.mark.parametrize('engine', _engines) -@pytest.mark.parametrize('parser', _parsers) +@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("parser", _parsers) def test_disallowed_nodes(engine, parser): VisitorClass = _parsers[parser] uns_ops = VisitorClass.unsupported_nodes - inst = VisitorClass('x + 1', engine, parser) + inst = VisitorClass("x + 1", engine, parser) for ops in uns_ops: with pytest.raises(NotImplementedError): @@ -1822,23 +1896,23 @@ def test_disallowed_nodes(engine, parser): def test_syntax_error_exprs(engine, parser): - e = 's +' + e = "s +" with pytest.raises(SyntaxError): pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(engine, parser): - e = 's + t' + e = "s + t" with pytest.raises(NameError): pd.eval(e, engine=engine, parser=parser) def test_invalid_local_variable_reference(engine, parser): a, b = 1, 2 # noqa - exprs = 'a + @b', '@a + b', '@a + @b' + exprs = "a + @b", "@a + b", "@a + @b" for _expr in exprs: - if parser != 'pandas': + if parser != "pandas": with pytest.raises(SyntaxError, match="The '@' prefix is only"): pd.eval(_expr, engine=engine, parser=parser) else: @@ -1848,37 +1922,35 @@ def test_invalid_local_variable_reference(engine, parser): def test_numexpr_builtin_raises(engine, parser): sin, dotted_line = 1, 2 - if engine == 'numexpr': - msg = 'Variables in expression .+' + if engine == "numexpr": + msg = "Variables in expression .+" with pytest.raises(NumExprClobberingError, match=msg): - pd.eval('sin + dotted_line', engine=engine, parser=parser) + pd.eval("sin + dotted_line", engine=engine, parser=parser) else: - res = pd.eval('sin + dotted_line', engine=engine, parser=parser) + res = pd.eval("sin + dotted_line", engine=engine, parser=parser) assert res == sin + dotted_line def test_bad_resolver_raises(engine, parser): cannot_resolve = 42, 3.0 - with 
pytest.raises(TypeError, match='Resolver of type .+'): - pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, - parser=parser) + with pytest.raises(TypeError, match="Resolver of type .+"): + pd.eval("1 + 2", resolvers=cannot_resolve, engine=engine, parser=parser) def test_empty_string_raises(engine, parser): # GH 13139 with pytest.raises(ValueError, match="expr cannot be an empty string"): - pd.eval('', engine=engine, parser=parser) + pd.eval("", engine=engine, parser=parser) def test_more_than_one_expression_raises(engine, parser): - with pytest.raises(SyntaxError, match=("only a single expression " - "is allowed")): - pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) + with pytest.raises(SyntaxError, match=("only a single expression " "is allowed")): + pd.eval("1 + 1; 2 + 2", engine=engine, parser=parser) -@pytest.mark.parametrize('cmp', ('and', 'or')) -@pytest.mark.parametrize('lhs', (int, float)) -@pytest.mark.parametrize('rhs', (int, float)) +@pytest.mark.parametrize("cmp", ("and", "or")) +@pytest.mark.parametrize("lhs", (int, float)) +@pytest.mark.parametrize("rhs", (int, float)) def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): gen = {int: lambda: np.random.randint(10), float: np.random.randn} @@ -1886,38 +1958,37 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): lhs = gen[lhs]() # noqa rhs = gen[rhs]() # noqa - ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) + ex1 = "lhs {0} mid {1} rhs".format(cmp, cmp) + ex2 = "lhs {0} mid and mid {1} rhs".format(cmp, cmp) + ex3 = "(lhs {0} mid) & (mid {1} rhs)".format(cmp, cmp) for ex in (ex1, ex2, ex3): with pytest.raises(NotImplementedError): pd.eval(ex, engine=engine, parser=parser) def test_inf(engine, parser): - s = 'inf + 1' + s = "inf + 1" expected = np.inf result = pd.eval(s, engine=engine, parser=parser) assert result == expected def test_negate_lt_eq_le(engine, parser): - df = pd.DataFrame([[0, 10], [1, 20]], columns=['cat', 'count']) + df = pd.DataFrame([[0, 10], [1, 20]], columns=["cat", "count"]) expected = df[~(df.cat > 0)] - result = df.query('~(cat > 0)', engine=engine, parser=parser) + result = df.query("~(cat > 0)", engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - if parser == 'python': + if parser == "python": with pytest.raises(NotImplementedError): - df.query('not (cat > 0)', engine=engine, parser=parser) + df.query("not (cat > 0)", engine=engine, parser=parser) else: - result = df.query('not (cat > 0)', engine=engine, parser=parser) + result = df.query("not (cat > 0)", engine=engine, parser=parser) tm.assert_frame_equal(result, expected) class TestValidate: - def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index 7e0b22cb3b414..3f12d1d7a292d 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -9,41 +9,39 @@ class TestConfig: - @classmethod def setup_class(cls): from copy import deepcopy cls.cf = cf - cls.gc = deepcopy(getattr(cls.cf, '_global_config')) - cls.do = deepcopy(getattr(cls.cf, '_deprecated_options')) - cls.ro = deepcopy(getattr(cls.cf, '_registered_options')) + cls.gc = deepcopy(getattr(cls.cf, "_global_config")) + cls.do = deepcopy(getattr(cls.cf, "_deprecated_options")) + cls.ro = deepcopy(getattr(cls.cf, "_registered_options")) def setup_method(self, method): - setattr(self.cf, 
'_global_config', {}) - setattr(self.cf, 'options', self.cf.DictWrapper( - self.cf._global_config)) - setattr(self.cf, '_deprecated_options', {}) - setattr(self.cf, '_registered_options', {}) + setattr(self.cf, "_global_config", {}) + setattr(self.cf, "options", self.cf.DictWrapper(self.cf._global_config)) + setattr(self.cf, "_deprecated_options", {}) + setattr(self.cf, "_registered_options", {}) # Our test fixture in conftest.py sets "chained_assignment" # to "raise" only after all test methods have been setup. # However, after this setup, there is no longer any # "chained_assignment" option, so re-register it. - self.cf.register_option('chained_assignment', 'raise') + self.cf.register_option("chained_assignment", "raise") def teardown_method(self, method): - setattr(self.cf, '_global_config', self.gc) - setattr(self.cf, '_deprecated_options', self.do) - setattr(self.cf, '_registered_options', self.ro) + setattr(self.cf, "_global_config", self.gc) + setattr(self.cf, "_deprecated_options", self.do) + setattr(self.cf, "_registered_options", self.ro) def test_api(self): # the pandas object exposes the user API - assert hasattr(pd, 'get_option') - assert hasattr(pd, 'set_option') - assert hasattr(pd, 'reset_option') - assert hasattr(pd, 'describe_option') + assert hasattr(pd, "get_option") + assert hasattr(pd, "set_option") + assert hasattr(pd, "reset_option") + assert hasattr(pd, "describe_option") def test_is_one_of_factory(self): v = self.cf.is_one_of_factory([None, 12]) @@ -55,128 +53,128 @@ def test_is_one_of_factory(self): v(1.1) def test_register_option(self): - self.cf.register_option('a', 1, 'doc') + self.cf.register_option("a", 1, "doc") # can't register an already registered option msg = "Option 'a' has already been registered" with pytest.raises(OptionError, match=msg): - self.cf.register_option('a', 1, 'doc') + self.cf.register_option("a", 1, "doc") # can't register an already registered option msg = "Path prefix to option 'a' is already an option" with pytest.raises(OptionError, match=msg): - self.cf.register_option('a.b.c.d1', 1, 'doc') + self.cf.register_option("a.b.c.d1", 1, "doc") with pytest.raises(OptionError, match=msg): - self.cf.register_option('a.b.c.d2', 1, 'doc') + self.cf.register_option("a.b.c.d2", 1, "doc") # no python keywords msg = "for is a python keyword" with pytest.raises(ValueError, match=msg): - self.cf.register_option('for', 0) + self.cf.register_option("for", 0) with pytest.raises(ValueError, match=msg): - self.cf.register_option('a.for.b', 0) + self.cf.register_option("a.for.b", 0) # must be valid identifier (ensure attribute access works) msg = "oh my goddess! 
is not a valid identifier" with pytest.raises(ValueError, match=msg): - self.cf.register_option('Oh my Goddess!', 0) + self.cf.register_option("Oh my Goddess!", 0) # we can register options several levels deep # without predefining the intermediate steps # and we can define differently named options # in the same namespace - self.cf.register_option('k.b.c.d1', 1, 'doc') - self.cf.register_option('k.b.c.d2', 1, 'doc') + self.cf.register_option("k.b.c.d1", 1, "doc") + self.cf.register_option("k.b.c.d2", 1, "doc") def test_describe_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b', 1, 'doc2') - self.cf.deprecate_option('b') - - self.cf.register_option('c.d.e1', 1, 'doc3') - self.cf.register_option('c.d.e2', 1, 'doc4') - self.cf.register_option('f', 1) - self.cf.register_option('g.h', 1) - self.cf.register_option('k', 2) - self.cf.deprecate_option('g.h', rkey="k") - self.cf.register_option('l', "foo") + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b", 1, "doc2") + self.cf.deprecate_option("b") + + self.cf.register_option("c.d.e1", 1, "doc3") + self.cf.register_option("c.d.e2", 1, "doc4") + self.cf.register_option("f", 1) + self.cf.register_option("g.h", 1) + self.cf.register_option("k", 2) + self.cf.deprecate_option("g.h", rkey="k") + self.cf.register_option("l", "foo") # non-existent keys raise KeyError msg = r"No such keys\(s\)" with pytest.raises(OptionError, match=msg): - self.cf.describe_option('no.such.key') + self.cf.describe_option("no.such.key") # we can get the description for any key we registered - assert 'doc' in self.cf.describe_option('a', _print_desc=False) - assert 'doc2' in self.cf.describe_option('b', _print_desc=False) - assert 'precated' in self.cf.describe_option('b', _print_desc=False) - assert 'doc3' in self.cf.describe_option('c.d.e1', _print_desc=False) - assert 'doc4' in self.cf.describe_option('c.d.e2', _print_desc=False) + assert "doc" in self.cf.describe_option("a", _print_desc=False) + assert "doc2" in self.cf.describe_option("b", _print_desc=False) + assert "precated" in self.cf.describe_option("b", _print_desc=False) + assert "doc3" in self.cf.describe_option("c.d.e1", _print_desc=False) + assert "doc4" in self.cf.describe_option("c.d.e2", _print_desc=False) # if no doc is specified we get a default message # saying "description not available" - assert 'vailable' in self.cf.describe_option('f', _print_desc=False) - assert 'vailable' in self.cf.describe_option('g.h', _print_desc=False) - assert 'precated' in self.cf.describe_option('g.h', _print_desc=False) - assert 'k' in self.cf.describe_option('g.h', _print_desc=False) + assert "vailable" in self.cf.describe_option("f", _print_desc=False) + assert "vailable" in self.cf.describe_option("g.h", _print_desc=False) + assert "precated" in self.cf.describe_option("g.h", _print_desc=False) + assert "k" in self.cf.describe_option("g.h", _print_desc=False) # default is reported - assert 'foo' in self.cf.describe_option('l', _print_desc=False) + assert "foo" in self.cf.describe_option("l", _print_desc=False) # current value is reported - assert 'bar' not in self.cf.describe_option('l', _print_desc=False) + assert "bar" not in self.cf.describe_option("l", _print_desc=False) self.cf.set_option("l", "bar") - assert 'bar' in self.cf.describe_option('l', _print_desc=False) + assert "bar" in self.cf.describe_option("l", _print_desc=False) def test_case_insensitive(self): - self.cf.register_option('KanBAN', 1, 'doc') + self.cf.register_option("KanBAN", 1, "doc") - assert 
'doc' in self.cf.describe_option('kanbaN', _print_desc=False) - assert self.cf.get_option('kanBaN') == 1 - self.cf.set_option('KanBan', 2) - assert self.cf.get_option('kAnBaN') == 2 + assert "doc" in self.cf.describe_option("kanbaN", _print_desc=False) + assert self.cf.get_option("kanBaN") == 1 + self.cf.set_option("KanBan", 2) + assert self.cf.get_option("kAnBaN") == 2 # gets of non-existent keys fail msg = r"No such keys\(s\): 'no_such_option'" with pytest.raises(OptionError, match=msg): - self.cf.get_option('no_such_option') - self.cf.deprecate_option('KanBan') + self.cf.get_option("no_such_option") + self.cf.deprecate_option("KanBan") - assert self.cf._is_deprecated('kAnBaN') + assert self.cf._is_deprecated("kAnBaN") def test_get_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") # gets of existing keys succeed - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None # gets of non-existent keys fail msg = r"No such keys\(s\): 'no_such_option'" with pytest.raises(OptionError, match=msg): - self.cf.get_option('no_such_option') + self.cf.get_option("no_such_option") def test_set_option(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("b.b", None, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - self.cf.set_option('b.b', 1.1) + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + self.cf.set_option("b.b", 1.1) - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' - assert self.cf.get_option('b.b') == 1.1 + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + assert self.cf.get_option("b.b") == 1.1 msg = r"No such keys\(s\): 'no.such.key'" with pytest.raises(OptionError, match=msg): - self.cf.set_option('no.such.key', None) + self.cf.set_option("no.such.key", None) def test_set_option_empty_args(self): msg = "Must provide an even number of non-keyword arguments" @@ -186,7 +184,7 @@ def test_set_option_empty_args(self): def test_set_option_uneven_args(self): msg = "Must provide an even number of non-keyword arguments" with pytest.raises(ValueError, match=msg): - self.cf.set_option('a.b', 2, 'b.c') + self.cf.set_option("a.b", 2, "b.c") def test_set_option_invalid_single_argument_type(self): msg = "Must provide an even number of non-keyword arguments" @@ -194,175 +192,168 @@ def test_set_option_invalid_single_argument_type(self): self.cf.set_option(2) def test_set_option_multiple(self): - self.cf.register_option('a', 1, 'doc') - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('b.b', None, 'doc2') + self.cf.register_option("a", 1, "doc") + self.cf.register_option("b.c", "hullo", 
"doc2") + self.cf.register_option("b.b", None, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - assert self.cf.get_option('b.b') is None + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + assert self.cf.get_option("b.b") is None - self.cf.set_option('a', '2', 'b.c', None, 'b.b', 10.0) + self.cf.set_option("a", "2", "b.c", None, "b.b", 10.0) - assert self.cf.get_option('a') == '2' - assert self.cf.get_option('b.c') is None - assert self.cf.get_option('b.b') == 10.0 + assert self.cf.get_option("a") == "2" + assert self.cf.get_option("b.c") is None + assert self.cf.get_option("b.b") == 10.0 def test_validation(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_text) + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_text) msg = "Value must have type ''" with pytest.raises(ValueError, match=msg): - self.cf.register_option( - 'a.b.c.d2', 'NO', 'doc', validator=self.cf.is_int) + self.cf.register_option("a.b.c.d2", "NO", "doc", validator=self.cf.is_int) - self.cf.set_option('a', 2) # int is_int - self.cf.set_option('b.c', 'wurld') # str is_str + self.cf.set_option("a", 2) # int is_int + self.cf.set_option("b.c", "wurld") # str is_str # None not is_int with pytest.raises(ValueError, match=msg): - self.cf.set_option('a', None) + self.cf.set_option("a", None) with pytest.raises(ValueError, match=msg): - self.cf.set_option('a', 'ab') + self.cf.set_option("a", "ab") msg = r"Value must be an instance of \|" with pytest.raises(ValueError, match=msg): - self.cf.set_option('b.c', 1) + self.cf.set_option("b.c", 1) validator = self.cf.is_one_of_factory([None, self.cf.is_callable]) - self.cf.register_option('b', lambda: None, 'doc', - validator=validator) - self.cf.set_option('b', '%.1f'.format) # Formatter is callable - self.cf.set_option('b', None) # Formatter is none (default) + self.cf.register_option("b", lambda: None, "doc", validator=validator) + self.cf.set_option("b", "%.1f".format) # Formatter is callable + self.cf.set_option("b", None) # Formatter is none (default) with pytest.raises(ValueError, match="Value must be a callable"): - self.cf.set_option('b', '%.1f') + self.cf.set_option("b", "%.1f") def test_reset_option(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_str) - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' - - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' - - self.cf.reset_option('a') - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'wurld' - self.cf.reset_option('b.c') - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" + + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" + + self.cf.reset_option("a") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "wurld" + self.cf.reset_option("b.c") + 
assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" def test_reset_option_all(self): - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2', - validator=self.cf.is_str) - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_str) + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" - self.cf.set_option('a', 2) - self.cf.set_option('b.c', 'wurld') - assert self.cf.get_option('a') == 2 - assert self.cf.get_option('b.c') == 'wurld' + self.cf.set_option("a", 2) + self.cf.set_option("b.c", "wurld") + assert self.cf.get_option("a") == 2 + assert self.cf.get_option("b.c") == "wurld" self.cf.reset_option("all") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b.c') == 'hullo' + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b.c") == "hullo" def test_deprecate_option(self): # we can deprecate non-existent options - self.cf.deprecate_option('foo') + self.cf.deprecate_option("foo") - assert self.cf._is_deprecated('foo') + assert self.cf._is_deprecated("foo") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - with pytest.raises( - KeyError, - match="No such keys.s.: 'foo'"): - self.cf.get_option('foo') + warnings.simplefilter("always") + with pytest.raises(KeyError, match="No such keys.s.: 'foo'"): + self.cf.get_option("foo") assert len(w) == 1 # should have raised one warning - assert 'deprecated' in str(w[-1]) # we get the default message + assert "deprecated" in str(w[-1]) # we get the default message - self.cf.register_option('a', 1, 'doc', validator=self.cf.is_int) - self.cf.register_option('b.c', 'hullo', 'doc2') - self.cf.register_option('foo', 'hullo', 'doc2') + self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("b.c", "hullo", "doc2") + self.cf.register_option("foo", "hullo", "doc2") - self.cf.deprecate_option('a', removal_ver='nifty_ver') + self.cf.deprecate_option("a", removal_ver="nifty_ver") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.get_option('a') + warnings.simplefilter("always") + self.cf.get_option("a") assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the default message - assert 'nifty_ver' in str(w[-1]) # with the removal_ver quoted + assert "eprecated" in str(w[-1]) # we get the default message + assert "nifty_ver" in str(w[-1]) # with the removal_ver quoted msg = "Option 'a' has already been defined as deprecated" with pytest.raises(OptionError, match=msg): - self.cf.deprecate_option('a') + self.cf.deprecate_option("a") - self.cf.deprecate_option('b.c', 'zounds!') + self.cf.deprecate_option("b.c", "zounds!") with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.get_option('b.c') + warnings.simplefilter("always") + self.cf.get_option("b.c") assert len(w) == 1 # should have raised one warning - assert 'zounds!' in str(w[-1]) # we get the custom message + assert "zounds!" 
in str(w[-1]) # we get the custom message # test rerouting keys - self.cf.register_option('d.a', 'foo', 'doc2') - self.cf.register_option('d.dep', 'bar', 'doc2') - assert self.cf.get_option('d.a') == 'foo' - assert self.cf.get_option('d.dep') == 'bar' + self.cf.register_option("d.a", "foo", "doc2") + self.cf.register_option("d.dep", "bar", "doc2") + assert self.cf.get_option("d.a") == "foo" + assert self.cf.get_option("d.dep") == "bar" - self.cf.deprecate_option('d.dep', rkey='d.a') # reroute d.dep to d.a + self.cf.deprecate_option("d.dep", rkey="d.a") # reroute d.dep to d.a with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - assert self.cf.get_option('d.dep') == 'foo' + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "foo" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - self.cf.set_option('d.dep', 'baz') # should overwrite "d.a" + warnings.simplefilter("always") + self.cf.set_option("d.dep", "baz") # should overwrite "d.a" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - assert self.cf.get_option('d.dep') == 'baz' + warnings.simplefilter("always") + assert self.cf.get_option("d.dep") == "baz" assert len(w) == 1 # should have raised one warning - assert 'eprecated' in str(w[-1]) # we get the custom message + assert "eprecated" in str(w[-1]) # we get the custom message def test_config_prefix(self): with self.cf.config_prefix("base"): - self.cf.register_option('a', 1, "doc1") - self.cf.register_option('b', 2, "doc2") - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b') == 2 + self.cf.register_option("a", 1, "doc1") + self.cf.register_option("b", 2, "doc2") + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 - self.cf.set_option('a', 3) - self.cf.set_option('b', 4) - assert self.cf.get_option('a') == 3 - assert self.cf.get_option('b') == 4 + self.cf.set_option("a", 3) + self.cf.set_option("b", 4) + assert self.cf.get_option("a") == 3 + assert self.cf.get_option("b") == 4 - assert self.cf.get_option('base.a') == 3 - assert self.cf.get_option('base.b') == 4 - assert 'doc1' in self.cf.describe_option('base.a', _print_desc=False) - assert 'doc2' in self.cf.describe_option('base.b', _print_desc=False) + assert self.cf.get_option("base.a") == 3 + assert self.cf.get_option("base.b") == 4 + assert "doc1" in self.cf.describe_option("base.a", _print_desc=False) + assert "doc2" in self.cf.describe_option("base.b", _print_desc=False) - self.cf.reset_option('base.a') - self.cf.reset_option('base.b') + self.cf.reset_option("base.a") + self.cf.reset_option("base.b") with self.cf.config_prefix("base"): - assert self.cf.get_option('a') == 1 - assert self.cf.get_option('b') == 2 + assert self.cf.get_option("a") == 1 + assert self.cf.get_option("b") == 2 def test_callback(self): k = [None] @@ -372,8 +363,8 @@ def callback(key): k.append(key) v.append(self.cf.get_option(key)) - self.cf.register_option('d.a', 'foo', cb=callback) - self.cf.register_option('d.b', 'foo', cb=callback) + self.cf.register_option("d.a", "foo", cb=callback) + self.cf.register_option("d.b", "foo", cb=callback) del k[-1], 
v[-1] self.cf.set_option("d.a", "fooz") @@ -393,7 +384,7 @@ def test_set_ContextManager(self): def eq(val): assert self.cf.get_option("a") == val - self.cf.register_option('a', 0) + self.cf.register_option("a", 0) eq(0) with self.cf.option_context("a", 15): eq(15) @@ -411,8 +402,8 @@ def test_attribute_access(self): def f3(key): holder.append(True) - self.cf.register_option('a', 0) - self.cf.register_option('c', 0, cb=f3) + self.cf.register_option("a", 0) + self.cf.register_option("c", 0, cb=f3) options = self.cf.options assert options.a == 0 @@ -442,7 +433,7 @@ def test_option_context_scope(self): original_value = 60 context_value = 10 - option_name = 'a' + option_name = "a" self.cf.register_option(option_name, original_value) @@ -462,4 +453,4 @@ def test_dictwrapper_getattr(self): # GH 19789 with pytest.raises(OptionError, match="No such option"): options.bananas - assert not hasattr(options, 'bananas') + assert not hasattr(options, "bananas") diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index c63465ff0c464..20a5be0c8a289 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -12,11 +12,13 @@ _current_locale = locale.getlocale() # Don't run any of these tests if we are on Windows or have no locales. -pytestmark = pytest.mark.skipif(is_platform_windows() or not _all_locales, - reason="Need non-Windows and locales") +pytestmark = pytest.mark.skipif( + is_platform_windows() or not _all_locales, reason="Need non-Windows and locales" +) _skip_if_only_one_locale = pytest.mark.skipif( - len(_all_locales) <= 1, reason="Need multiple locales for meaningful test") + len(_all_locales) <= 1, reason="Need multiple locales for meaningful test" +) def test_can_set_locale_valid_set(): diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index c8a35e692e2b7..4ff3be5dfaa72 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -15,6 +15,6 @@ def test_cast_1d_array_like_from_scalar_categorical(): expected = Categorical(["a", "a"], categories=cats) result = construct_1d_arraylike_from_scalar("a", len(expected), cat_type) - tm.assert_categorical_equal(result, expected, - check_category_order=True, - check_dtype=True) + tm.assert_categorical_equal( + result, expected, check_category_order=True, check_dtype=True + ) diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index 8653206be9156..da3789a87aa07 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -6,13 +6,16 @@ from pandas.util import testing as tm -@pytest.mark.parametrize('values, dtype, expected', [ - ([1, 2, 3], None, np.array([1, 2, 3])), - (np.array([1, 2, 3]), None, np.array([1, 2, 3])), - (['1', '2', None], None, np.array(['1', '2', None])), - (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), - ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), -]) +@pytest.mark.parametrize( + "values, dtype, expected", + [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (["1", "2", None], None, np.array(["1", "2", None])), + (["1", "2", None], np.dtype("str"), np.array(["1", "2", None])), + ([1, 2, None], np.dtype("str"), np.array(["1", "2", None])), + ], +) def test_construct_1d_ndarray_preserving_na(values, 
dtype, expected): result = construct_1d_ndarray_preserving_na(values, dtype=dtype) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_construct_object_arr.py b/pandas/tests/dtypes/cast/test_construct_object_arr.py index 15277b198f675..cb44f91f34dec 100644 --- a/pandas/tests/dtypes/cast/test_construct_object_arr.py +++ b/pandas/tests/dtypes/cast/test_construct_object_arr.py @@ -3,8 +3,8 @@ from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -@pytest.mark.parametrize("datum1", [1, 2., "3", (4, 5), [6, 7], None]) -@pytest.mark.parametrize("datum2", [8, 9., "10", (11, 12), [13, 14], None]) +@pytest.mark.parametrize("datum1", [1, 2.0, "3", (4, 5), [6, 7], None]) +@pytest.mark.parametrize("datum2", [8, 9.0, "10", (11, 12), [13, 14], None]) def test_cast_1d_array(datum1, datum2): data = [datum1, datum2] result = construct_1d_object_array_from_listlike(data) @@ -14,7 +14,7 @@ def test_cast_1d_array(datum1, datum2): assert list(result) == data -@pytest.mark.parametrize("val", [1, 2., None]) +@pytest.mark.parametrize("val", [1, 2.0, None]) def test_cast_1d_array_invalid_scalar(val): with pytest.raises(TypeError, match="has no len()"): construct_1d_object_array_from_listlike(val) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 61aba9ed41f59..d574b03a8c724 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -7,16 +7,26 @@ from pandas.util import testing as tm -@pytest.mark.parametrize("arr,dtype,expected", [ - (np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), "infer", - np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995])), - - (np.array([8., 8., 8., 8., 8.9999999999995]), "infer", - np.array([8, 8, 8, 8, 9], dtype=np.int64)), - - (np.array([8., 8., 8., 8., 9.0000000000005]), "infer", - np.array([8, 8, 8, 8, 9], dtype=np.int64)), -]) +@pytest.mark.parametrize( + "arr,dtype,expected", + [ + ( + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + "infer", + np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 8.9999999999995]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ( + np.array([8.0, 8.0, 8.0, 8.0, 9.0000000000005]), + "infer", + np.array([8, 8, 8, 8, 9], dtype=np.int64), + ), + ], +) def test_downcast(arr, expected, dtype): result = maybe_downcast_to_dtype(arr, dtype) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index c48657bb272cc..ac7a5221d3469 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -2,61 +2,63 @@ import pytest from pandas.core.dtypes.cast import find_common_type -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, PeriodDtype) - - -@pytest.mark.parametrize("source_dtypes,expected_common_dtype", [ - ((np.int64,), np.int64), - ((np.uint64,), np.uint64), - ((np.float32,), np.float32), - ((np.object,), np.object), - - # Into ints. - ((np.int16, np.int64), np.int64), - ((np.int32, np.uint32), np.int64), - ((np.uint16, np.uint64), np.uint64), - - # Into floats. - ((np.float16, np.float32), np.float32), - ((np.float16, np.int16), np.float32), - ((np.float32, np.int16), np.float32), - ((np.uint64, np.int64), np.float64), - ((np.int16, np.float64), np.float64), - ((np.float16, np.int64), np.float64), - - # Into others. 
- ((np.complex128, np.int32), np.complex128), - ((np.object, np.float32), np.object), - ((np.object, np.int16), np.object), - - # Bool with int. - ((np.dtype("bool"), np.int64), np.object), - ((np.dtype("bool"), np.int32), np.object), - ((np.dtype("bool"), np.int16), np.object), - ((np.dtype("bool"), np.int8), np.object), - ((np.dtype("bool"), np.uint64), np.object), - ((np.dtype("bool"), np.uint32), np.object), - ((np.dtype("bool"), np.uint16), np.object), - ((np.dtype("bool"), np.uint8), np.object), - - # Bool with float. - ((np.dtype("bool"), np.float64), np.object), - ((np.dtype("bool"), np.float32), np.object), - - ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), - np.dtype("datetime64[ns]")), - ((np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), - np.dtype("timedelta64[ns]")), - - ((np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), - np.dtype("datetime64[ns]")), - ((np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), - np.dtype("timedelta64[ns]")), - - ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), - ((np.dtype("datetime64[ns]"), np.int64), np.object) -]) +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype + + +@pytest.mark.parametrize( + "source_dtypes,expected_common_dtype", + [ + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + # Into ints. + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + # Into floats. + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + # Into others. + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + # Bool with int. + ((np.dtype("bool"), np.int64), np.object), + ((np.dtype("bool"), np.int32), np.object), + ((np.dtype("bool"), np.int16), np.object), + ((np.dtype("bool"), np.int8), np.object), + ((np.dtype("bool"), np.uint64), np.object), + ((np.dtype("bool"), np.uint32), np.object), + ((np.dtype("bool"), np.uint16), np.object), + ((np.dtype("bool"), np.uint8), np.object), + # Bool with float. 
+ ((np.dtype("bool"), np.float64), np.object), + ((np.dtype("bool"), np.float32), np.object), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ns]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ( + (np.dtype("datetime64[ns]"), np.dtype("datetime64[ms]")), + np.dtype("datetime64[ns]"), + ), + ( + (np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), + np.dtype("timedelta64[ns]"), + ), + ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), + ((np.dtype("datetime64[ns]"), np.int64), np.object), + ], +) def test_numpy_dtypes(source_dtypes, expected_common_dtype): assert find_common_type(source_dtypes) == expected_common_dtype @@ -66,11 +68,14 @@ def test_raises_empty_input(): find_common_type([]) -@pytest.mark.parametrize("dtypes,exp_type", [ - ([CategoricalDtype()], "category"), - ([np.object, CategoricalDtype()], np.object), - ([CategoricalDtype(), CategoricalDtype()], "category"), -]) +@pytest.mark.parametrize( + "dtypes,exp_type", + [ + ([CategoricalDtype()], "category"), + ([np.object, CategoricalDtype()], np.object), + ([CategoricalDtype(), CategoricalDtype()], "category"), + ], +) def test_categorical_dtype(dtypes, exp_type): assert find_common_type(dtypes) == exp_type @@ -80,10 +85,15 @@ def test_datetimetz_dtype_match(): assert find_common_type([dtype, dtype]) == "datetime64[ns, US/Eastern]" -@pytest.mark.parametrize("dtype2", [ - DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), - np.dtype("datetime64[ns]"), np.object, np.int64 -]) +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) def test_datetimetz_dtype_mismatch(dtype2): dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") assert find_common_type([dtype, dtype2]) == np.object @@ -95,11 +105,17 @@ def test_period_dtype_match(): assert find_common_type([dtype, dtype]) == "period[D]" -@pytest.mark.parametrize("dtype2", [ - DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), - PeriodDtype(freq="2D"), PeriodDtype(freq="H"), - np.dtype("datetime64[ns]"), np.object, np.int64 -]) +@pytest.mark.parametrize( + "dtype2", + [ + DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), + PeriodDtype(freq="2D"), + PeriodDtype(freq="H"), + np.dtype("datetime64[ns]"), + np.object, + np.int64, + ], +) def test_period_dtype_mismatch(dtype2): dtype = PeriodDtype(freq="D") assert find_common_type([dtype, dtype2]) == np.object diff --git a/pandas/tests/dtypes/cast/test_infer_datetimelike.py b/pandas/tests/dtypes/cast/test_infer_datetimelike.py index 3ff7b02b81342..f4253e9d9e37b 100644 --- a/pandas/tests/dtypes/cast/test_infer_datetimelike.py +++ b/pandas/tests/dtypes/cast/test_infer_datetimelike.py @@ -4,11 +4,14 @@ from pandas import DataFrame, NaT, Series, Timestamp -@pytest.mark.parametrize("data,exp_size", [ - # see gh-16362. - ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), - ([[NaT, "a", 0], [NaT, "b", 1]], 6) -]) +@pytest.mark.parametrize( + "data,exp_size", + [ + # see gh-16362. 
+ ([[NaT, "a", "b", 0], [NaT, "b", "c", 1]], 8), + ([[NaT, "a", 0], [NaT, "b", 1]], 6), + ], +) def test_maybe_infer_to_datetimelike_df_construct(data, exp_size): result = DataFrame(np.array(data)) assert result.size == exp_size diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 88c91243fcd74..602b2f26eaa4a 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -4,11 +4,13 @@ import pytest from pandas.core.dtypes.cast import ( - cast_scalar_to_array, infer_dtype_from_array, infer_dtype_from_scalar) + cast_scalar_to_array, + infer_dtype_from_array, + infer_dtype_from_scalar, +) from pandas.core.dtypes.common import is_dtype_equal -from pandas import ( - Categorical, Period, Series, Timedelta, Timestamp, date_range) +from pandas import Categorical, Period, Series, Timedelta, Timestamp, date_range from pandas.util import testing as tm @@ -33,9 +35,7 @@ def test_infer_dtype_from_float_scalar(float_dtype): assert dtype == float_dtype -@pytest.mark.parametrize("data,exp_dtype", [ - (12, np.int64), (np.float(12), np.float64) -]) +@pytest.mark.parametrize("data,exp_dtype", [(12, np.int64), (np.float(12), np.float64)]) def test_infer_dtype_from_python_scalar(data, exp_dtype): dtype, val = infer_dtype_from_scalar(data) assert dtype == exp_dtype @@ -53,15 +53,15 @@ def test_infer_dtype_from_complex(complex_dtype): assert dtype == np.complex_ -@pytest.mark.parametrize("data", [np.datetime64(1, "ns"), Timestamp(1), - datetime(2000, 1, 1, 0, 0)]) +@pytest.mark.parametrize( + "data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)] +) def test_infer_dtype_from_datetime(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == "M8[ns]" -@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), - timedelta(1)]) +@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)]) def test_infer_dtype_from_timedelta(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == "m8[ns]" @@ -83,8 +83,9 @@ def test_infer_dtype_from_period(freq, pandas_dtype): assert val == exp_val -@pytest.mark.parametrize("data", [date(2000, 1, 1), "foo", - Timestamp(1, tz="US/Eastern")]) +@pytest.mark.parametrize( + "data", [date(2000, 1, 1), "foo", Timestamp(1, tz="US/Eastern")] +) def test_infer_dtype_misc(data): dtype, val = infer_dtype_from_scalar(data) assert dtype == np.object_ @@ -115,39 +116,51 @@ def test_infer_dtype_from_scalar_errors(): @pytest.mark.parametrize( "arr, expected, pandas_dtype", - [("foo", np.object_, False), - (b"foo", np.object_, False), - (1, np.int_, False), - (1.5, np.float_, False), - ([1], np.int_, False), - (np.array([1], dtype=np.int64), np.int64, False), - ([np.nan, 1, ""], np.object_, False), - (np.array([[1.0, 2.0]]), np.float_, False), - (Categorical(list("aabc")), np.object_, False), - (Categorical([1, 2, 3]), np.int64, False), - (Categorical(list("aabc")), "category", True), - (Categorical([1, 2, 3]), "category", True), - (Timestamp("20160101"), np.object_, False), - (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False), - (date_range("20160101", periods=3), - np.dtype("=M8[ns]"), False), - (date_range("20160101", periods=3, tz="US/Eastern"), - "datetime64[ns, US/Eastern]", True), - (Series([1., 2, 3]), np.float64, False), - (Series(list("abc")), np.object_, False), - (Series(date_range("20160101", periods=3, tz="US/Eastern")), - "datetime64[ns, US/Eastern]", True)]) + [ + ("foo", np.object_, False), + 
(b"foo", np.object_, False), + (1, np.int_, False), + (1.5, np.float_, False), + ([1], np.int_, False), + (np.array([1], dtype=np.int64), np.int64, False), + ([np.nan, 1, ""], np.object_, False), + (np.array([[1.0, 2.0]]), np.float_, False), + (Categorical(list("aabc")), np.object_, False), + (Categorical([1, 2, 3]), np.int64, False), + (Categorical(list("aabc")), "category", True), + (Categorical([1, 2, 3]), "category", True), + (Timestamp("20160101"), np.object_, False), + (np.datetime64("2016-01-01"), np.dtype("=M8[D]"), False), + (date_range("20160101", periods=3), np.dtype("=M8[ns]"), False), + ( + date_range("20160101", periods=3, tz="US/Eastern"), + "datetime64[ns, US/Eastern]", + True, + ), + (Series([1.0, 2, 3]), np.float64, False), + (Series(list("abc")), np.object_, False), + ( + Series(date_range("20160101", periods=3, tz="US/Eastern")), + "datetime64[ns, US/Eastern]", + True, + ), + ], +) def test_infer_dtype_from_array(arr, expected, pandas_dtype): dtype, _ = infer_dtype_from_array(arr, pandas_dtype=pandas_dtype) assert is_dtype_equal(dtype, expected) -@pytest.mark.parametrize("obj,dtype", [ - (1, np.int64), (1.1, np.float64), - (Timestamp("2011-01-01"), "datetime64[ns]"), - (Timestamp("2011-01-01", tz="US/Eastern"), np.object), - (Period("2011-01-01", freq="D"), np.object) -]) +@pytest.mark.parametrize( + "obj,dtype", + [ + (1, np.int64), + (1.1, np.float64), + (Timestamp("2011-01-01"), "datetime64[ns]"), + (Timestamp("2011-01-01", tz="US/Eastern"), np.object), + (Period("2011-01-01", freq="D"), np.object), + ], +) def test_cast_scalar_to_array(obj, dtype): shape = (3, 2) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 5a5b5d47b3ccc..44aebd4d277f2 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -12,17 +12,38 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, - is_float_dtype, is_integer_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype) + is_complex_dtype, + is_datetime64_dtype, + is_datetime_or_timedelta_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + is_scalar, + is_string_dtype, + is_timedelta64_dtype, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype import pandas as pd -@pytest.fixture(params=[bool, 'uint8', 'int32', 'uint64', 'float32', 'float64', - 'complex64', 'complex128', 'M8[ns]', 'm8[ns]', str, - bytes, object]) +@pytest.fixture( + params=[ + bool, + "uint8", + "int32", + "uint64", + "float32", + "float64", + "complex64", + "complex128", + "M8[ns]", + "m8[ns]", + str, + bytes, + object, + ] +) def any_numpy_dtype_reduced(request): """ Parameterized fixture for numpy dtypes, reduced from any_numpy_dtype. @@ -43,8 +64,10 @@ def any_numpy_dtype_reduced(request): return request.param -@pytest.fixture(params=[(True, None), (True, object), (False, None)], - ids=['True-None', 'True-object', 'False-None']) +@pytest.fixture( + params=[(True, None), (True, object), (False, None)], + ids=["True-None", "True-object", "False-None"], +) def box(request): """ Parametrized fixture determining whether/how to transform fill_value. 
@@ -81,8 +104,15 @@ def _safe_dtype_assert(left_dtype, right_dtype): assert left_dtype == right_dtype -def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar=None, exp_val_for_array=None): +def _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar=None, + exp_val_for_array=None, +): """ Auxiliary function to unify testing of scalar/array promotion. @@ -129,17 +159,18 @@ def _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, # for equal values, also check type (relevant e.g. for int vs float, resp. # for different datetimes and timedeltas) - match_value = (result_fill_value == expected_fill_value - # disabled type check due to too many xfails; GH 23982/25425 - # and type(result_fill_value) == type(expected_fill_value) - ) + match_value = ( + result_fill_value + == expected_fill_value + # disabled type check due to too many xfails; GH 23982/25425 + # and type(result_fill_value) == type(expected_fill_value) + ) # for missing values, None == None and iNaT == iNaT (which is checked # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT - match_missing = ((result_fill_value is np.nan - and expected_fill_value is np.nan) - or (result_fill_value is NaT - and expected_fill_value is NaT)) + match_missing = (result_fill_value is np.nan and expected_fill_value is np.nan) or ( + result_fill_value is NaT and expected_fill_value is NaT + ) assert match_value or match_missing @@ -150,14 +181,14 @@ def test_maybe_promote_int_with_int(): # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (False, None)]) +@pytest.mark.parametrize("box", [(True, None), (False, None)]) def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): dtype = np.dtype(any_int_dtype) fill_dtype = np.dtype(float_dtype) boxed, box_dtype = box # read from parametrized fixture - if float_dtype == 'float32' and not boxed: - pytest.xfail('falsely upcasts to float64') + if float_dtype == "float32" and not boxed: + pytest.xfail("falsely upcasts to float64") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -168,12 +199,19 @@ def test_maybe_promote_int_with_float(any_int_dtype, float_dtype, box): exp_val_for_scalar = np.float64(fill_value) exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (False, None)]) +@pytest.mark.parametrize("box", [(True, None), (False, None)]) def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): dtype = np.dtype(float_dtype) @@ -190,8 +228,15 @@ def test_maybe_promote_float_with_int(float_dtype, any_int_dtype, box): exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_float_with_float(): @@ -205,10 +250,9 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): boxed, box_dtype = box # read from parametrized 
fixture if boxed and fill_dtype == bool: - pytest.xfail('falsely upcasts to object') - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrongly casts fill_value') + pytest.xfail("falsely upcasts to object") + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrongly casts fill_value") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -218,8 +262,15 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): exp_val_for_scalar = fill_value exp_val_for_array = np.nan if fill_dtype != bool else None - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): @@ -228,11 +279,11 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): boxed, box_dtype = box # read from parametrized fixture if boxed and dtype == bool: - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") if boxed and dtype not in (str, object) and box_dtype is None: - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # filling anything but bool with bool casts to object expected_dtype = np.dtype(object) if dtype != bool else dtype @@ -240,8 +291,15 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0] exp_val_for_array = np.nan if dtype != bool else None - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_bytes_with_any(): @@ -260,38 +318,49 @@ def test_maybe_promote_datetime64_with_any(): # override parametrization of box to add special case for dt_dtype -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype - # (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) -@pytest.mark.parametrize('fill_value', [ - pd.Timestamp('now'), np.datetime64('now'), - datetime.datetime.now(), datetime.date.today() -], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) -def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, - datetime64_dtype, fill_value, box): +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'dt_dtype'), # fill_value in array with explicit datetime dtype + # (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) +def 
test_maybe_promote_any_with_datetime64( + any_numpy_dtype_reduced, datetime64_dtype, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if is_datetime64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype is None - and not is_datetime64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and not is_datetime64_dtype(type(fill_value))) + ): + pytest.xfail("falsely upcasts to object") else: - if (boxed and (box_dtype == 'dt_dtype' - or (box_dtype is None - and is_datetime64_dtype(type(fill_value))))): - pytest.xfail('mix of lack of upcasting, resp. wrong missing value') + if boxed and ( + box_dtype == "dt_dtype" + or (box_dtype is None and is_datetime64_dtype(type(fill_value))) + ): + pytest.xfail("mix of lack of upcasting, resp. wrong missing value") if not boxed and is_timedelta64_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # special case for box_dtype - box_dtype = (np.dtype(datetime64_dtype) if box_dtype == 'dt_dtype' - else box_dtype) + box_dtype = np.dtype(datetime64_dtype) if box_dtype == "dt_dtype" else box_dtype # filling datetime with anything but datetime casts to object if is_datetime64_dtype(dtype): @@ -304,20 +373,28 @@ def test_maybe_promote_any_with_datetime64(any_numpy_dtype_reduced, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, object)]) +@pytest.mark.parametrize("box", [(True, object)]) def test_maybe_promote_datetimetz_with_any_numpy_dtype( - tz_aware_fixture, any_numpy_dtype_reduced, box): + tz_aware_fixture, any_numpy_dtype_reduced, box +): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if box_dtype != object: - pytest.xfail('does not upcast correctly') + pytest.xfail("does not upcast correctly") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -327,25 +404,34 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(True, None), (True, object)]) -def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, - tz_aware_fixture2, box): +@pytest.mark.parametrize("box", [(True, None), (True, object)]) +def test_maybe_promote_datetimetz_with_datetimetz( + tz_aware_fixture, tz_aware_fixture2, box +): dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) boxed, box_dtype = box # read from parametrized fixture from dateutil.tz import tzlocal + if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail('Cannot process fill_value with this dtype, see GH 24310') + pytest.xfail("Cannot process fill_value with 
this dtype, see GH 24310") if dtype.tz == fill_dtype.tz and boxed: - pytest.xfail('falsely upcasts') + pytest.xfail("falsely upcasts") if dtype.tz != fill_dtype.tz and not boxed: - pytest.xfail('falsely upcasts') + pytest.xfail("falsely upcasts") # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -359,49 +445,72 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, expected_dtype = np.dtype(object) exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], - ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize( + "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] +) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(False, None)]) +@pytest.mark.parametrize("box", [(False, None)]) def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture - if (boxed and (box_dtype == object - or (box_dtype is None - and (fill_value is None or fill_value is NaT)))): - pytest.xfail('false upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and (fill_value is None or fill_value is NaT)) + ): + pytest.xfail("false upcasts to object") # takes the opinion that DatetimeTZ should have single na-marker # using iNaT would lead to errors elsewhere -> NaT if not boxed and fill_value == iNaT: - pytest.xfail('wrong missing value marker') + pytest.xfail("wrong missing value marker") expected_dtype = dtype # DatetimeTZDtype does not use iNaT as missing value marker exp_val_for_scalar = NaT exp_val_for_array = NaT - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) - - -@pytest.mark.parametrize('fill_value', [ - pd.Timestamp('now'), np.datetime64('now'), - datetime.datetime.now(), datetime.date.today() -], ids=['pd.Timestamp', 'np.datetime64', 'datetime.datetime', 'datetime.date']) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) + + +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("now"), + np.datetime64("now"), + datetime.datetime.now(), + datetime.date.today(), + ], + ids=["pd.Timestamp", "np.datetime64", "datetime.datetime", "datetime.date"], +) def test_maybe_promote_any_numpy_dtype_with_datetimetz( - any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box): + any_numpy_dtype_reduced, tz_aware_fixture, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture if is_datetime_or_timedelta_dtype(dtype) and not boxed: - pytest.xfail('raises error') + pytest.xfail("raises error") fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] @@ -410,8 +519,15 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + 
exp_val_for_scalar, + exp_val_for_array, + ) def test_maybe_promote_timedelta64_with_any(): @@ -419,44 +535,55 @@ def test_maybe_promote_timedelta64_with_any(): pass -@pytest.mark.parametrize('fill_value', [ - pd.Timedelta(days=1), np.timedelta64(24, 'h'), datetime.timedelta(1) -], ids=['pd.Timedelta', 'np.timedelta64', 'datetime.timedelta']) +@pytest.mark.parametrize( + "fill_value", + [pd.Timedelta(days=1), np.timedelta64(24, "h"), datetime.timedelta(1)], + ids=["pd.Timedelta", "np.timedelta64", "datetime.timedelta"], +) # override parametrization of box to add special case for td_dtype -@pytest.mark.parametrize('box', [ - (True, None), # fill_value wrapped in array with default dtype - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) +@pytest.mark.parametrize( + "box", + [ + (True, None), # fill_value wrapped in array with default dtype + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, 'td_dtype'), # fill_value in array with explicit timedelta dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) def test_maybe_promote_any_with_timedelta64( - any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box): + any_numpy_dtype_reduced, timedelta64_dtype, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if is_timedelta64_dtype(dtype): - if (boxed and (box_dtype == object - or (box_dtype is None - and not is_timedelta64_dtype(type(fill_value))))): - pytest.xfail('falsely upcasts to object') + if boxed and ( + box_dtype == object + or (box_dtype is None and not is_timedelta64_dtype(type(fill_value))) + ): + pytest.xfail("falsely upcasts to object") else: - if (boxed and box_dtype is None - and is_timedelta64_dtype(type(fill_value))): - pytest.xfail('does not upcast correctly') - if (not boxed and is_timedelta64_dtype(type(fill_value)) and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) + if boxed and box_dtype is None and is_timedelta64_dtype(type(fill_value)): + pytest.xfail("does not upcast correctly") + if ( + not boxed + and is_timedelta64_dtype(type(fill_value)) + and ( + is_integer_dtype(dtype) + or is_float_dtype(dtype) or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') - if box_dtype == 'td_dtype': - pytest.xfail('falsely upcasts') + or issubclass(dtype.type, np.bytes_) + ) + ): + pytest.xfail("does not upcast correctly") + if box_dtype == "td_dtype": + pytest.xfail("falsely upcasts") if not boxed and is_datetime64_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # special case for box_dtype - box_dtype = (np.dtype(timedelta64_dtype) if box_dtype == 'td_dtype' - else box_dtype) + box_dtype = np.dtype(timedelta64_dtype) if box_dtype == "td_dtype" else box_dtype # filling anything but timedelta with timedelta casts to object if is_timedelta64_dtype(dtype): @@ -469,19 +596,24 @@ def test_maybe_promote_any_with_timedelta64( exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def 
test_maybe_promote_string_with_any(string_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, box): dtype = np.dtype(string_dtype) fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrong missing value marker") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -491,56 +623,76 @@ def test_maybe_promote_string_with_any(string_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) # override parametrization of box to add special case for str -@pytest.mark.parametrize('box', [ - # disabled due to too many xfails; see GH 23982 / 25425 - # (True, None), # fill_value wrapped in array with default dtype - # (True, 'str'), # fill_value wrapped in array with generic string-dtype - (True, object), # fill_value wrapped in array with object dtype - (False, None) # fill_value passed on as scalar -]) -def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, - string_dtype, box): +@pytest.mark.parametrize( + "box", + [ + # disabled due to too many xfails; see GH 23982 / 25425 + # (True, None), # fill_value wrapped in array with default dtype + # (True, 'str'), # fill_value wrapped in array with generic string-dtype + (True, object), # fill_value wrapped in array with object dtype + (False, None), # fill_value passed on as scalar + ], +) +def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) fill_dtype = np.dtype(string_dtype) boxed, box_dtype = box # read from parametrized fixture if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: - pytest.xfail('does not upcast or raises') - if (boxed and box_dtype in (None, 'str') and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) + pytest.xfail("does not upcast or raises") + if ( + boxed + and box_dtype in (None, "str") + and ( + is_integer_dtype(dtype) + or is_float_dtype(dtype) or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_))): - pytest.xfail('does not upcast correctly') + or issubclass(dtype.type, np.bytes_) + ) + ): + pytest.xfail("does not upcast correctly") # create array of given dtype - fill_value = 'abc' + fill_value = "abc" # special case for box_dtype (cannot use fixture in parametrization) - box_dtype = fill_dtype if box_dtype == 'str' else box_dtype + box_dtype = fill_dtype if box_dtype == "str" else box_dtype # filling anything with a string casts to object expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def test_maybe_promote_object_with_any(object_dtype, - any_numpy_dtype_reduced, box): +def test_maybe_promote_object_with_any(object_dtype, any_numpy_dtype_reduced, box): dtype = np.dtype(object_dtype) fill_dtype = 
np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (boxed and box_dtype is None - and is_datetime_or_timedelta_dtype(fill_dtype)): - pytest.xfail('wrong missing value marker') + if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + pytest.xfail("wrong missing value marker") # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -550,17 +702,23 @@ def test_maybe_promote_object_with_any(object_dtype, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, - object_dtype, box): +def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype, box): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if not boxed and is_datetime_or_timedelta_dtype(dtype): - pytest.xfail('raises error') + pytest.xfail("raises error") # create array of object dtype from a scalar value (i.e. passing # dtypes.common.is_scalar), which can however not be cast to int/float etc. @@ -571,42 +729,61 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, exp_val_for_scalar = fill_value exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('fill_value', [None, np.nan, NaT, iNaT], - ids=['None', 'np.nan', 'pd.NaT', 'iNaT']) +@pytest.mark.parametrize( + "fill_value", [None, np.nan, NaT, iNaT], ids=["None", "np.nan", "pd.NaT", "iNaT"] +) # override parametrization due to to many xfails; see GH 23982 / 25425 -@pytest.mark.parametrize('box', [(False, None)]) -def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, - fill_value, box): +@pytest.mark.parametrize("box", [(False, None)]) +def test_maybe_promote_any_numpy_dtype_with_na( + any_numpy_dtype_reduced, fill_value, box +): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if (dtype == bytes and not boxed - and fill_value is not None and fill_value is not NaT): - pytest.xfail('does not upcast to object') - elif dtype == 'uint64' and not boxed and fill_value == iNaT: - pytest.xfail('does not upcast correctly') + if ( + dtype == bytes + and not boxed + and fill_value is not None + and fill_value is not NaT + ): + pytest.xfail("does not upcast to object") + elif dtype == "uint64" and not boxed and fill_value == iNaT: + pytest.xfail("does not upcast correctly") elif is_datetime_or_timedelta_dtype(dtype) and boxed: - pytest.xfail('falsely upcasts to object') - elif (boxed and (is_integer_dtype(dtype) or is_float_dtype(dtype) - or is_complex_dtype(dtype)) - and fill_value is not NaT and dtype != 'uint64'): - pytest.xfail('falsely upcasts to object') - elif (boxed and dtype == 'uint64' - and (fill_value is np.nan or fill_value is None)): - pytest.xfail('falsely upcasts to object') + pytest.xfail("falsely upcasts to object") + elif ( + boxed + and ( + is_integer_dtype(dtype) or is_float_dtype(dtype) or is_complex_dtype(dtype) + ) + and fill_value is not NaT + and dtype != "uint64" + ): + 
pytest.xfail("falsely upcasts to object") + elif boxed and dtype == "uint64" and (fill_value is np.nan or fill_value is None): + pytest.xfail("falsely upcasts to object") # below: opinionated that iNaT should be interpreted as missing value - elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) - and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - elif ((is_string_dtype(dtype) or dtype == bool) - and not boxed and fill_value == iNaT): - pytest.xfail('does not cast to missing value marker correctly') - - if is_integer_dtype(dtype) and dtype == 'uint64' and fill_value == iNaT: + elif ( + not boxed + and (is_float_dtype(dtype) or is_complex_dtype(dtype)) + and fill_value == iNaT + ): + pytest.xfail("does not cast to missing value marker correctly") + elif (is_string_dtype(dtype) or dtype == bool) and not boxed and fill_value == iNaT: + pytest.xfail("does not cast to missing value marker correctly") + + if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT: # uint64 + negative int casts to object; iNaT is considered as missing expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan @@ -649,11 +826,18 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, else: # expected_dtype = float / complex / object exp_val_for_array = np.nan - _check_promote(dtype, fill_value, boxed, box_dtype, expected_dtype, - exp_val_for_scalar, exp_val_for_array) + _check_promote( + dtype, + fill_value, + boxed, + box_dtype, + expected_dtype, + exp_val_for_scalar, + exp_val_for_array, + ) -@pytest.mark.parametrize('dim', [0, 2, 3]) +@pytest.mark.parametrize("dim", [0, 2, 3]) def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): dtype = np.dtype(any_numpy_dtype_reduced) @@ -666,12 +850,13 @@ def test_maybe_promote_dimensions(any_numpy_dtype_reduced, dim): # test against 1-dimensional case expected_dtype, expected_missing_value = maybe_promote( - dtype, np.array([1], dtype=dtype)) + dtype, np.array([1], dtype=dtype) + ) result_dtype, result_missing_value = maybe_promote(dtype, fill_array) assert result_dtype == expected_dtype # None == None, iNaT == iNaT, but np.nan != np.nan - assert ((result_missing_value == expected_missing_value) - or (result_missing_value is np.nan - and expected_missing_value is np.nan)) + assert (result_missing_value == expected_missing_value) or ( + result_missing_value is np.nan and expected_missing_value is np.nan + ) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index 8d5f21806cf46..f076bcd578001 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -7,11 +7,7 @@ from pandas.util import testing as tm -@pytest.mark.parametrize("result", [ - Series([10, 11, 12]), - [10, 11, 12], - (10, 11, 12) -]) +@pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) def test_upcast_error(result): # GH23823 mask = np.array([False, True, False]) @@ -20,18 +16,26 @@ def test_upcast_error(result): result, _ = maybe_upcast_putmask(result, mask, other) -@pytest.mark.parametrize("arr, other, exp_changed, expected", [ - (np.arange(1, 6), np.array([61, 62, 63]), - False, np.array([1, 61, 3, 62, 63])), - (np.arange(1, 6), np.array([61.1, 62.2, 63.3]), - True, np.array([1, 61.1, 3, 62.2, 63.3])), - (np.arange(1, 6), np.nan, - True, np.array([1, np.nan, 3, np.nan, np.nan])), - (np.arange(10, 15), np.array([61, 62]), - False, np.array([10, 61, 12, 62, 61])), - (np.arange(10, 15), 
np.array([61, np.nan]), - True, np.array([10, 61, 12, np.nan, 61])) -]) +@pytest.mark.parametrize( + "arr, other, exp_changed, expected", + [ + (np.arange(1, 6), np.array([61, 62, 63]), False, np.array([1, 61, 3, 62, 63])), + ( + np.arange(1, 6), + np.array([61.1, 62.2, 63.3]), + True, + np.array([1, 61.1, 3, 62.2, 63.3]), + ), + (np.arange(1, 6), np.nan, True, np.array([1, np.nan, 3, np.nan, np.nan])), + (np.arange(10, 15), np.array([61, 62]), False, np.array([10, 61, 12, 62, 61])), + ( + np.arange(10, 15), + np.array([61, np.nan]), + True, + np.array([10, 61, 12, np.nan, 61]), + ), + ], +) def test_upcast(arr, other, exp_changed, expected): # GH23823 mask = np.array([False, True, False, True, True]) @@ -41,20 +45,44 @@ def test_upcast(arr, other, exp_changed, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("arr, other, exp_changed, expected", [ - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), - np.arange('2018-01-01', '2018-01-04', dtype='datetime64[D]'), - False, np.array(['2019-01-01', '2018-01-01', '2019-01-03', - '2018-01-02', '2018-01-03'], dtype='datetime64[D]')), - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), np.nan, - False, np.array(['2019-01-01', np.datetime64('NaT'), - '2019-01-03', np.datetime64('NaT'), - np.datetime64('NaT')], dtype='datetime64[D]')), - (np.arange('2019-01-01', '2019-01-06', dtype='datetime64[D]'), - np.arange('2018-01-01', '2018-01-03', dtype='datetime64[D]'), - False, np.array(['2019-01-01', '2018-01-01', '2019-01-03', - '2018-01-02', '2018-01-01'], dtype='datetime64[D]')) -]) +@pytest.mark.parametrize( + "arr, other, exp_changed, expected", + [ + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-04", dtype="datetime64[D]"), + False, + np.array( + ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-03"], + dtype="datetime64[D]", + ), + ), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.nan, + False, + np.array( + [ + "2019-01-01", + np.datetime64("NaT"), + "2019-01-03", + np.datetime64("NaT"), + np.datetime64("NaT"), + ], + dtype="datetime64[D]", + ), + ), + ( + np.arange("2019-01-01", "2019-01-06", dtype="datetime64[D]"), + np.arange("2018-01-01", "2018-01-03", dtype="datetime64[D]"), + False, + np.array( + ["2019-01-01", "2018-01-01", "2019-01-03", "2018-01-02", "2018-01-01"], + dtype="datetime64[D]", + ), + ), + ], +) def test_upcast_datetime(arr, other, exp_changed, expected): # GH23823 mask = np.array([False, True, False, True, True]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 675abec661b5a..27ae918b015fe 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -5,25 +5,32 @@ import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( - CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, IntervalDtype, - PeriodDtype) + CategoricalDtype, + CategoricalDtypeType, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) import pandas as pd from pandas.conftest import ( - ALL_EA_INT_DTYPES, ALL_INT_DTYPES, SIGNED_EA_INT_DTYPES, SIGNED_INT_DTYPES, - UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES) + ALL_EA_INT_DTYPES, + ALL_INT_DTYPES, + SIGNED_EA_INT_DTYPES, + SIGNED_INT_DTYPES, + UNSIGNED_EA_INT_DTYPES, + UNSIGNED_INT_DTYPES, +) from pandas.core.sparse.api import SparseDtype import pandas.util.testing as tm -ignore_sparse_warning = pytest.mark.filterwarnings( - "ignore:Sparse:FutureWarning" -) 
+ignore_sparse_warning = pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") # EA & Actual Dtypes def to_ea_dtypes(dtypes): """ convert list of string dtypes to EA dtype """ - return [getattr(pd, dt + 'Dtype') for dt in dtypes] + return [getattr(pd, dt + "Dtype") for dt in dtypes] def to_numpy_dtypes(dtypes): @@ -35,64 +42,84 @@ class TestPandasDtype: # Passing invalid dtype, both as a string or object, must raise TypeError # Per issue GH15520 - @pytest.mark.parametrize('box', [pd.Timestamp, 'pd.Timestamp', list]) + @pytest.mark.parametrize("box", [pd.Timestamp, "pd.Timestamp", list]) def test_invalid_dtype_error(self, box): - with pytest.raises(TypeError, match='not understood'): + with pytest.raises(TypeError, match="not understood"): com.pandas_dtype(box) - @pytest.mark.parametrize('dtype', [ - object, 'float64', np.object_, np.dtype('object'), 'O', - np.float64, float, np.dtype('float64')]) + @pytest.mark.parametrize( + "dtype", + [ + object, + "float64", + np.object_, + np.dtype("object"), + "O", + np.float64, + float, + np.dtype("float64"), + ], + ) def test_pandas_dtype_valid(self, dtype): assert com.pandas_dtype(dtype) == dtype - @pytest.mark.parametrize('dtype', [ - 'M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']) + @pytest.mark.parametrize( + "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"] + ) def test_numpy_dtype(self, dtype): assert com.pandas_dtype(dtype) == np.dtype(dtype) def test_numpy_string_dtype(self): # do not parse freq-like string as period dtype - assert com.pandas_dtype('U') == np.dtype('U') - assert com.pandas_dtype('S') == np.dtype('S') - - @pytest.mark.parametrize('dtype', [ - 'datetime64[ns, US/Eastern]', - 'datetime64[ns, Asia/Tokyo]', - 'datetime64[ns, UTC]']) + assert com.pandas_dtype("U") == np.dtype("U") + assert com.pandas_dtype("S") == np.dtype("S") + + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[ns, US/Eastern]", + "datetime64[ns, Asia/Tokyo]", + "datetime64[ns, UTC]", + ], + ) def test_datetimetz_dtype(self, dtype): - assert (com.pandas_dtype(dtype) == - DatetimeTZDtype.construct_from_string(dtype)) + assert com.pandas_dtype(dtype) == DatetimeTZDtype.construct_from_string(dtype) assert com.pandas_dtype(dtype) == dtype def test_categorical_dtype(self): - assert com.pandas_dtype('category') == CategoricalDtype() - - @pytest.mark.parametrize('dtype', [ - 'period[D]', 'period[3M]', 'period[U]', - 'Period[D]', 'Period[3M]', 'Period[U]']) + assert com.pandas_dtype("category") == CategoricalDtype() + + @pytest.mark.parametrize( + "dtype", + [ + "period[D]", + "period[3M]", + "period[U]", + "Period[D]", + "Period[3M]", + "Period[U]", + ], + ) def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) is PeriodDtype(dtype) assert com.pandas_dtype(dtype) == PeriodDtype(dtype) assert com.pandas_dtype(dtype) == dtype -dtypes = dict(datetime_tz=com.pandas_dtype('datetime64[ns, US/Eastern]'), - datetime=com.pandas_dtype('datetime64[ns]'), - timedelta=com.pandas_dtype('timedelta64[ns]'), - period=PeriodDtype('D'), - integer=np.dtype(np.int64), - float=np.dtype(np.float64), - object=np.dtype(np.object), - category=com.pandas_dtype('category')) +dtypes = dict( + datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"), + datetime=com.pandas_dtype("datetime64[ns]"), + timedelta=com.pandas_dtype("timedelta64[ns]"), + period=PeriodDtype("D"), + integer=np.dtype(np.int64), + float=np.dtype(np.float64), + object=np.dtype(np.object), + category=com.pandas_dtype("category"), +) -@pytest.mark.parametrize('name1,dtype1', - 
list(dtypes.items()), - ids=lambda x: str(x)) -@pytest.mark.parametrize('name2,dtype2', - list(dtypes.items()), - ids=lambda x: str(x)) +@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) +@pytest.mark.parametrize("name2,dtype2", list(dtypes.items()), ids=lambda x: str(x)) def test_dtype_equal(name1, dtype1, name2, dtype2): # match equal to self, but not equal to other @@ -101,16 +128,21 @@ def test_dtype_equal(name1, dtype1, name2, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) -@pytest.mark.parametrize("dtype1,dtype2", [ - (np.int8, np.int64), - (np.int16, np.int64), - (np.int32, np.int64), - (np.float32, np.float64), - (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType - (com.pandas_dtype("datetime64[ns, US/Eastern]"), - com.pandas_dtype("datetime64[ns, CET]")), # Datetime - (None, None) # gh-15941: no exception should be raised. -]) +@pytest.mark.parametrize( + "dtype1,dtype2", + [ + (np.int8, np.int64), + (np.int16, np.int64), + (np.int32, np.int64), + (np.float32, np.float64), + (PeriodDtype("D"), PeriodDtype("2D")), # PeriodType + ( + com.pandas_dtype("datetime64[ns, US/Eastern]"), + com.pandas_dtype("datetime64[ns, CET]"), + ), # Datetime + (None, None), # gh-15941: no exception should be raised. + ], +) def test_dtype_equal_strict(dtype1, dtype2): assert not com.is_dtype_equal(dtype1, dtype2) @@ -122,14 +154,11 @@ def get_is_dtype_funcs(): """ - fnames = [f for f in dir(com) if (f.startswith('is_') and - f.endswith('dtype'))] + fnames = [f for f in dir(com) if (f.startswith("is_") and f.endswith("dtype"))] return [getattr(com, fname) for fname in fnames] -@pytest.mark.parametrize('func', - get_is_dtype_funcs(), - ids=lambda x: x.__name__) +@pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__) def test_get_dtype_error_catch(func): # see gh-15941 # @@ -147,9 +176,9 @@ def test_is_object(): assert not com.is_object_dtype([1, 2, 3]) -@pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) -]) +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) @ignore_sparse_warning def test_is_sparse(check_scipy): assert com.is_sparse(pd.SparseArray([1, 2, 3])) @@ -159,6 +188,7 @@ def test_is_sparse(check_scipy): if check_scipy: import scipy.sparse + assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3])) @@ -166,6 +196,7 @@ def test_is_sparse(check_scipy): @ignore_sparse_warning def test_is_scipy_sparse(): from scipy.sparse import bsr_matrix + assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) @@ -213,8 +244,7 @@ def test_is_datetime64tz_dtype(): assert not com.is_datetime64tz_dtype(object) assert not com.is_datetime64tz_dtype([1, 2, 3]) assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) - assert com.is_datetime64tz_dtype(pd.DatetimeIndex(['2000'], - tz="US/Eastern")) + assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) def test_is_timedelta64_dtype(): @@ -222,14 +252,14 @@ def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(None) assert not com.is_timedelta64_dtype([1, 2, 3]) assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) - assert not com.is_timedelta64_dtype('0 days') + assert not com.is_timedelta64_dtype("0 days") assert not com.is_timedelta64_dtype("0 days 00:00:00") assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) assert not com.is_timedelta64_dtype("NO DATE") assert 
com.is_timedelta64_dtype(np.timedelta64) assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) - assert com.is_timedelta64_dtype(pd.to_timedelta(['0 days', '1 days'])) + assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"])) def test_is_period_dtype(): @@ -267,7 +297,7 @@ def test_is_string_dtype(): assert com.is_string_dtype(str) assert com.is_string_dtype(object) - assert com.is_string_dtype(np.array(['a', 'b'])) + assert com.is_string_dtype(np.array(["a", "b"])) def test_is_period_arraylike(): @@ -298,77 +328,119 @@ def test_is_datetimelike(): @pytest.mark.parametrize( - 'dtype', [ - pd.Series([1, 2])] + - ALL_INT_DTYPES + to_numpy_dtypes(ALL_INT_DTYPES) + - ALL_EA_INT_DTYPES + to_ea_dtypes(ALL_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2])] + + ALL_INT_DTYPES + + to_numpy_dtypes(ALL_INT_DTYPES) + + ALL_EA_INT_DTYPES + + to_ea_dtypes(ALL_EA_INT_DTYPES), +) def test_is_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', [str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)]) + "dtype", + [ + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ], +) def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', [ - pd.Series([1, 2])] + - SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + - SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2])] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) def test_is_signed_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)] + - UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + - UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES)) + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', - [pd.Series([1, 2], dtype=np.uint32)] + - UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + - UNSIGNED_EA_INT_DTYPES + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES)) + "dtype", + [pd.Series([1, 2], dtype=np.uint32)] + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), +) def test_is_unsigned_integer_dtype(dtype): assert com.is_unsigned_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.datetime64, np.timedelta64, - pd.Index([1, 2.]), np.array(['a', 'b']), - np.array([], dtype=np.timedelta64)] + - SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + - SIGNED_EA_INT_DTYPES + to_ea_dtypes(SIGNED_EA_INT_DTYPES)) + str, + float, + np.datetime64, + np.timedelta64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([], dtype=np.timedelta64), + ] + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), +) def 
test_is_not_unsigned_integer_dtype(dtype): assert not com.is_unsigned_integer_dtype(dtype) @pytest.mark.parametrize( - 'dtype', - [np.int64, np.array([1, 2], dtype=np.int64), 'Int64', pd.Int64Dtype]) + "dtype", [np.int64, np.array([1, 2], dtype=np.int64), "Int64", pd.Int64Dtype] +) def test_is_int64_dtype(dtype): assert com.is_int64_dtype(dtype) @pytest.mark.parametrize( - 'dtype', + "dtype", [ - str, float, np.int32, np.uint64, pd.Index([1, 2.]), - np.array(['a', 'b']), np.array([1, 2], dtype=np.uint32), - 'int8', 'Int8', pd.Int8Dtype]) + str, + float, + np.int32, + np.uint64, + pd.Index([1, 2.0]), + np.array(["a", "b"]), + np.array([1, 2], dtype=np.uint32), + "int8", + "Int8", + pd.Int8Dtype, + ], +) def test_is_not_int64_dtype(dtype): assert not com.is_int64_dtype(dtype) @@ -377,13 +449,14 @@ def test_is_datetime64_any_dtype(): assert not com.is_datetime64_any_dtype(int) assert not com.is_datetime64_any_dtype(str) assert not com.is_datetime64_any_dtype(np.array([1, 2])) - assert not com.is_datetime64_any_dtype(np.array(['a', 'b'])) + assert not com.is_datetime64_any_dtype(np.array(["a", "b"])) assert com.is_datetime64_any_dtype(np.datetime64) assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64)) assert com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime64_any_dtype( - pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")) + pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]") + ) def test_is_datetime64_ns_dtype(): @@ -391,7 +464,7 @@ def test_is_datetime64_ns_dtype(): assert not com.is_datetime64_ns_dtype(str) assert not com.is_datetime64_ns_dtype(np.datetime64) assert not com.is_datetime64_ns_dtype(np.array([1, 2])) - assert not com.is_datetime64_ns_dtype(np.array(['a', 'b'])) + assert not com.is_datetime64_ns_dtype(np.array(["a", "b"])) assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64)) # This datetime array has the wrong unit (ps instead of ns) @@ -399,34 +472,31 @@ def test_is_datetime64_ns_dtype(): assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime64_ns_dtype( - pd.DatetimeIndex([1, 2, 3], dtype=np.dtype('datetime64[ns]'))) + pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]")) + ) def test_is_timedelta64_ns_dtype(): - assert not com.is_timedelta64_ns_dtype(np.dtype('m8[ps]')) - assert not com.is_timedelta64_ns_dtype( - np.array([1, 2], dtype=np.timedelta64)) + assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]")) + assert not com.is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64)) - assert com.is_timedelta64_ns_dtype(np.dtype('m8[ns]')) - assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]')) + assert com.is_timedelta64_ns_dtype(np.dtype("m8[ns]")) + assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]")) def test_is_datetime_or_timedelta_dtype(): assert not com.is_datetime_or_timedelta_dtype(int) assert not com.is_datetime_or_timedelta_dtype(str) assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2])) - assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b'])) + assert not com.is_datetime_or_timedelta_dtype(np.array(["a", "b"])) # TODO(jreback), this is slightly suspect - assert not com.is_datetime_or_timedelta_dtype( - DatetimeTZDtype("ns", "US/Eastern")) + assert not com.is_datetime_or_timedelta_dtype(DatetimeTZDtype("ns", "US/Eastern")) assert com.is_datetime_or_timedelta_dtype(np.datetime64) assert com.is_datetime_or_timedelta_dtype(np.timedelta64) - assert com.is_datetime_or_timedelta_dtype( 
- np.array([], dtype=np.timedelta64)) - assert com.is_datetime_or_timedelta_dtype( - np.array([], dtype=np.datetime64)) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64)) + assert com.is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64)) def test_is_numeric_v_string_like(): @@ -434,8 +504,7 @@ def test_is_numeric_v_string_like(): assert not com.is_numeric_v_string_like(1, "foo") assert not com.is_numeric_v_string_like("foo", "foo") assert not com.is_numeric_v_string_like(np.array([1]), np.array([2])) - assert not com.is_numeric_v_string_like( - np.array(["foo"]), np.array(["foo"])) + assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"])) assert com.is_numeric_v_string_like(np.array([1]), "foo") assert com.is_numeric_v_string_like("foo", np.array([1])) @@ -479,26 +548,25 @@ def test_needs_i8_conversion(): assert not com.needs_i8_conversion(str) assert not com.needs_i8_conversion(np.int64) assert not com.needs_i8_conversion(pd.Series([1, 2])) - assert not com.needs_i8_conversion(np.array(['a', 'b'])) + assert not com.needs_i8_conversion(np.array(["a", "b"])) assert com.needs_i8_conversion(np.datetime64) assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) - assert com.needs_i8_conversion(pd.DatetimeIndex( - ["2000"], tz="US/Eastern")) + assert com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern")) def test_is_numeric_dtype(): assert not com.is_numeric_dtype(str) assert not com.is_numeric_dtype(np.datetime64) assert not com.is_numeric_dtype(np.timedelta64) - assert not com.is_numeric_dtype(np.array(['a', 'b'])) + assert not com.is_numeric_dtype(np.array(["a", "b"])) assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64)) assert com.is_numeric_dtype(int) assert com.is_numeric_dtype(float) assert com.is_numeric_dtype(np.uint64) assert com.is_numeric_dtype(pd.Series([1, 2])) - assert com.is_numeric_dtype(pd.Index([1, 2.])) + assert com.is_numeric_dtype(pd.Index([1, 2.0])) def test_is_string_like_dtype(): @@ -506,25 +574,25 @@ def test_is_string_like_dtype(): assert not com.is_string_like_dtype(pd.Series([1, 2])) assert com.is_string_like_dtype(str) - assert com.is_string_like_dtype(np.array(['a', 'b'])) + assert com.is_string_like_dtype(np.array(["a", "b"])) def test_is_float_dtype(): assert not com.is_float_dtype(str) assert not com.is_float_dtype(int) assert not com.is_float_dtype(pd.Series([1, 2])) - assert not com.is_float_dtype(np.array(['a', 'b'])) + assert not com.is_float_dtype(np.array(["a", "b"])) assert com.is_float_dtype(float) - assert com.is_float_dtype(pd.Index([1, 2.])) + assert com.is_float_dtype(pd.Index([1, 2.0])) def test_is_bool_dtype(): assert not com.is_bool_dtype(int) assert not com.is_bool_dtype(str) assert not com.is_bool_dtype(pd.Series([1, 2])) - assert not com.is_bool_dtype(np.array(['a', 'b'])) - assert not com.is_bool_dtype(pd.Index(['a', 'b'])) + assert not com.is_bool_dtype(np.array(["a", "b"])) + assert not com.is_bool_dtype(pd.Index(["a", "b"])) assert com.is_bool_dtype(bool) assert com.is_bool_dtype(np.bool) @@ -532,9 +600,9 @@ def test_is_bool_dtype(): assert com.is_bool_dtype(pd.Index([True, False])) -@pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) -]) +@pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] +) @ignore_sparse_warning def test_is_extension_type(check_scipy): assert not com.is_extension_type([1, 2, 3]) @@ -546,7 +614,7 @@ def 
test_is_extension_type(check_scipy): assert com.is_extension_type(pd.Series(cat)) assert com.is_extension_type(pd.SparseArray([1, 2, 3])) assert com.is_extension_type(pd.SparseSeries([1, 2, 3])) - assert com.is_extension_type(pd.DatetimeIndex(['2000'], tz="US/Eastern")) + assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") s = pd.Series([], dtype=dtype) @@ -554,6 +622,7 @@ def test_is_extension_type(check_scipy): if check_scipy: import scipy.sparse + assert not com.is_extension_type(scipy.sparse.bsr_matrix([1, 2, 3])) @@ -561,15 +630,14 @@ def test_is_complex_dtype(): assert not com.is_complex_dtype(int) assert not com.is_complex_dtype(str) assert not com.is_complex_dtype(pd.Series([1, 2])) - assert not com.is_complex_dtype(np.array(['a', 'b'])) + assert not com.is_complex_dtype(np.array(["a", "b"])) assert com.is_complex_dtype(np.complex) assert com.is_complex_dtype(np.array([1 + 1j, 5])) def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), - pd.offsets.Nano()])) + assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) assert com.is_offsetlike(pd.offsets.MonthEnd()) assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) @@ -580,95 +648,103 @@ def test_is_offsetlike(): assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) -@pytest.mark.parametrize('input_param,result', [ - (int, np.dtype(int)), - ('int32', np.dtype('int32')), - (float, np.dtype(float)), - ('float64', np.dtype('float64')), - (np.dtype('float64'), np.dtype('float64')), - (str, np.dtype(str)), - (pd.Series([1, 2], dtype=np.dtype('int16')), np.dtype('int16')), - (pd.Series(['a', 'b']), np.dtype(object)), - (pd.Index([1, 2]), np.dtype('int64')), - (pd.Index(['a', 'b']), np.dtype(object)), - ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), - (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), - (CategoricalDtype(), CategoricalDtype()), - (CategoricalDtype(['a', 'b']), CategoricalDtype()), - (pd.DatetimeIndex([1, 2]), np.dtype('=M8[ns]')), - (pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')), - (' df.two.sum() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 7acfc5ff7f0c5..6824266c9282b 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -20,15 +20,37 @@ from pandas.core.dtypes import inference from pandas.core.dtypes.common import ( - ensure_categorical, ensure_int32, is_bool, is_datetime64_any_dtype, - is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_float, is_integer, is_number, is_scalar, is_scipy_sparse, - is_timedelta64_dtype, is_timedelta64_ns_dtype) + ensure_categorical, + ensure_int32, + is_bool, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64_ns_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_number, + is_scalar, + is_scipy_sparse, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, +) import pandas as pd from pandas import ( - Categorical, DataFrame, DateOffset, DatetimeIndex, Index, Interval, Period, - Series, Timedelta, TimedeltaIndex, Timestamp, isna) + Categorical, + DataFrame, + DateOffset, + DatetimeIndex, + Index, + Interval, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) from pandas.util 
import testing as tm @@ -40,46 +62,46 @@ def coerce(request): # collect all objects to be tested for list-like-ness; use tuples of objects, # whether they are list-like or not (special casing for sets), and their ID ll_params = [ - ([1], True, 'list'), # noqa: E241 - ([], True, 'list-empty'), # noqa: E241 - ((1, ), True, 'tuple'), # noqa: E241 - (tuple(), True, 'tuple-empty'), # noqa: E241 - ({'a': 1}, True, 'dict'), # noqa: E241 - (dict(), True, 'dict-empty'), # noqa: E241 - ({'a', 1}, 'set', 'set'), # noqa: E241 - (set(), 'set', 'set-empty'), # noqa: E241 - (frozenset({'a', 1}), 'set', 'frozenset'), # noqa: E241 - (frozenset(), 'set', 'frozenset-empty'), # noqa: E241 - (iter([1, 2]), True, 'iterator'), # noqa: E241 - (iter([]), True, 'iterator-empty'), # noqa: E241 - ((x for x in [1, 2]), True, 'generator'), # noqa: E241 - ((x for x in []), True, 'generator-empty'), # noqa: E241 - (Series([1]), True, 'Series'), # noqa: E241 - (Series([]), True, 'Series-empty'), # noqa: E241 - (Series(['a']).str, True, 'StringMethods'), # noqa: E241 - (Series([], dtype='O').str, True, 'StringMethods-empty'), # noqa: E241 - (Index([1]), True, 'Index'), # noqa: E241 - (Index([]), True, 'Index-empty'), # noqa: E241 - (DataFrame([[1]]), True, 'DataFrame'), # noqa: E241 - (DataFrame(), True, 'DataFrame-empty'), # noqa: E241 - (np.ndarray((2,) * 1), True, 'ndarray-1d'), # noqa: E241 - (np.array([]), True, 'ndarray-1d-empty'), # noqa: E241 - (np.ndarray((2,) * 2), True, 'ndarray-2d'), # noqa: E241 - (np.array([[]]), True, 'ndarray-2d-empty'), # noqa: E241 - (np.ndarray((2,) * 3), True, 'ndarray-3d'), # noqa: E241 - (np.array([[[]]]), True, 'ndarray-3d-empty'), # noqa: E241 - (np.ndarray((2,) * 4), True, 'ndarray-4d'), # noqa: E241 - (np.array([[[[]]]]), True, 'ndarray-4d-empty'), # noqa: E241 - (np.array(2), False, 'ndarray-0d'), # noqa: E241 - (1, False, 'int'), # noqa: E241 - (b'123', False, 'bytes'), # noqa: E241 - (b'', False, 'bytes-empty'), # noqa: E241 - ('123', False, 'string'), # noqa: E241 - ('', False, 'string-empty'), # noqa: E241 - (str, False, 'string-type'), # noqa: E241 - (object(), False, 'object'), # noqa: E241 - (np.nan, False, 'NaN'), # noqa: E241 - (None, False, 'None') # noqa: E241 + ([1], True, "list"), # noqa: E241 + ([], True, "list-empty"), # noqa: E241 + ((1,), True, "tuple"), # noqa: E241 + (tuple(), True, "tuple-empty"), # noqa: E241 + ({"a": 1}, True, "dict"), # noqa: E241 + (dict(), True, "dict-empty"), # noqa: E241 + ({"a", 1}, "set", "set"), # noqa: E241 + (set(), "set", "set-empty"), # noqa: E241 + (frozenset({"a", 1}), "set", "frozenset"), # noqa: E241 + (frozenset(), "set", "frozenset-empty"), # noqa: E241 + (iter([1, 2]), True, "iterator"), # noqa: E241 + (iter([]), True, "iterator-empty"), # noqa: E241 + ((x for x in [1, 2]), True, "generator"), # noqa: E241 + ((x for x in []), True, "generator-empty"), # noqa: E241 + (Series([1]), True, "Series"), # noqa: E241 + (Series([]), True, "Series-empty"), # noqa: E241 + (Series(["a"]).str, True, "StringMethods"), # noqa: E241 + (Series([], dtype="O").str, True, "StringMethods-empty"), # noqa: E241 + (Index([1]), True, "Index"), # noqa: E241 + (Index([]), True, "Index-empty"), # noqa: E241 + (DataFrame([[1]]), True, "DataFrame"), # noqa: E241 + (DataFrame(), True, "DataFrame-empty"), # noqa: E241 + (np.ndarray((2,) * 1), True, "ndarray-1d"), # noqa: E241 + (np.array([]), True, "ndarray-1d-empty"), # noqa: E241 + (np.ndarray((2,) * 2), True, "ndarray-2d"), # noqa: E241 + (np.array([[]]), True, "ndarray-2d-empty"), # noqa: E241 + 
(np.ndarray((2,) * 3), True, "ndarray-3d"), # noqa: E241 + (np.array([[[]]]), True, "ndarray-3d-empty"), # noqa: E241 + (np.ndarray((2,) * 4), True, "ndarray-4d"), # noqa: E241 + (np.array([[[[]]]]), True, "ndarray-4d-empty"), # noqa: E241 + (np.array(2), False, "ndarray-0d"), # noqa: E241 + (1, False, "int"), # noqa: E241 + (b"123", False, "bytes"), # noqa: E241 + (b"", False, "bytes-empty"), # noqa: E241 + ("123", False, "string"), # noqa: E241 + ("", False, "string-empty"), # noqa: E241 + (str, False, "string-type"), # noqa: E241 + (object(), False, "object"), # noqa: E241 + (np.nan, False, "NaN"), # noqa: E241 + (None, False, "None"), # noqa: E241 ] objs, expected, ids = zip(*ll_params) @@ -91,29 +113,28 @@ def maybe_list_like(request): def test_is_list_like(maybe_list_like): obj, expected = maybe_list_like - expected = True if expected == 'set' else expected + expected = True if expected == "set" else expected assert inference.is_list_like(obj) == expected def test_is_list_like_disallow_sets(maybe_list_like): obj, expected = maybe_list_like - expected = False if expected == 'set' else expected + expected = False if expected == "set" else expected assert inference.is_list_like(obj, allow_sets=False) == expected def test_is_sequence(): is_seq = inference.is_sequence - assert (is_seq((1, 2))) - assert (is_seq([1, 2])) - assert (not is_seq("abcd")) - assert (not is_seq(np.int64)) + assert is_seq((1, 2)) + assert is_seq([1, 2]) + assert not is_seq("abcd") + assert not is_seq(np.int64) class A: - def __getitem__(self): return 1 - assert (not is_seq(A())) + assert not is_seq(A()) def test_is_array_like(): @@ -133,37 +154,66 @@ class DtypeList(list): assert not inference.is_array_like(123) -@pytest.mark.parametrize('inner', [ - [], [1], (1, ), (1, 2), {'a': 1}, {1, 'a'}, Series([1]), - Series([]), Series(['a']).str, (x for x in range(5)) -]) -@pytest.mark.parametrize('outer', [ - list, Series, np.array, tuple -]) +@pytest.mark.parametrize( + "inner", + [ + [], + [1], + (1,), + (1, 2), + {"a": 1}, + {1, "a"}, + Series([1]), + Series([]), + Series(["a"]).str, + (x for x in range(5)), + ], +) +@pytest.mark.parametrize("outer", [list, Series, np.array, tuple]) def test_is_nested_list_like_passes(inner, outer): result = outer([inner for _ in range(5)]) assert inference.is_list_like(result) -@pytest.mark.parametrize('obj', [ - 'abc', [], [1], (1,), ['a'], 'a', {'a'}, - [1, 2, 3], Series([1]), DataFrame({"A": [1]}), - ([1, 2] for _ in range(5)), -]) +@pytest.mark.parametrize( + "obj", + [ + "abc", + [], + [1], + (1,), + ["a"], + "a", + {"a"}, + [1, 2, 3], + Series([1]), + DataFrame({"A": [1]}), + ([1, 2] for _ in range(5)), + ], +) def test_is_nested_list_like_fails(obj): assert not inference.is_nested_list_like(obj) -@pytest.mark.parametrize( - "ll", [{}, {'A': 1}, Series([1]), collections.defaultdict()]) +@pytest.mark.parametrize("ll", [{}, {"A": 1}, Series([1]), collections.defaultdict()]) def test_is_dict_like_passes(ll): assert inference.is_dict_like(ll) -@pytest.mark.parametrize("ll", [ - '1', 1, [1, 2], (1, 2), range(2), Index([1]), - dict, collections.defaultdict, Series -]) +@pytest.mark.parametrize( + "ll", + [ + "1", + 1, + [1, 2], + (1, 2), + range(2), + Index([1]), + dict, + collections.defaultdict, + Series, + ], +) def test_is_dict_like_fails(ll): assert not inference.is_dict_like(ll) @@ -177,14 +227,17 @@ def __init__(self, d): self.d = d if has_keys: + def keys(self): return self.d.keys() if has_getitem: + def __getitem__(self, key): return self.d.__getitem__(key) if has_contains: + 
def __contains__(self, key): return self.d.__contains__(key) @@ -235,14 +288,12 @@ class MockFile: assert not is_file(data) -@pytest.mark.parametrize( - "ll", [collections.namedtuple('Test', list('abc'))(1, 2, 3)]) +@pytest.mark.parametrize("ll", [collections.namedtuple("Test", list("abc"))(1, 2, 3)]) def test_is_names_tuple_passes(ll): assert inference.is_named_tuple(ll) -@pytest.mark.parametrize( - "ll", [(1, 2, 3), 'a', Series({'pi': 3.14})]) +@pytest.mark.parametrize("ll", [(1, 2, 3), "a", Series({"pi": 3.14})]) def test_is_names_tuple_fails(ll): assert not inference.is_named_tuple(ll) @@ -257,19 +308,12 @@ class UnhashableClass1: __hash__ = None class UnhashableClass2: - def __hash__(self): raise TypeError("Not hashable") - hashable = (1, - 3.14, - np.float64(3.14), - 'a', - tuple(), - (1, ), - HashableClass(), ) - not_hashable = ([], UnhashableClass1(), ) - abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) + hashable = (1, 3.14, np.float64(3.14), "a", tuple(), (1,), HashableClass()) + not_hashable = ([], UnhashableClass1()) + abc_hashable_not_really_hashable = (([],), UnhashableClass2()) for i in hashable: assert inference.is_hashable(i) @@ -284,41 +328,34 @@ def __hash__(self): assert not inference.is_hashable(np.array([])) -@pytest.mark.parametrize( - "ll", [re.compile('ad')]) +@pytest.mark.parametrize("ll", [re.compile("ad")]) def test_is_re_passes(ll): assert inference.is_re(ll) -@pytest.mark.parametrize( - "ll", ['x', 2, 3, object()]) +@pytest.mark.parametrize("ll", ["x", 2, 3, object()]) def test_is_re_fails(ll): assert not inference.is_re(ll) @pytest.mark.parametrize( - "ll", [r'a', 'x', - r'asdf', - re.compile('adsf'), - r'\u2233\s*', - re.compile(r'')]) + "ll", [r"a", "x", r"asdf", re.compile("adsf"), r"\u2233\s*", re.compile(r"")] +) def test_is_recompilable_passes(ll): assert inference.is_re_compilable(ll) -@pytest.mark.parametrize( - "ll", [1, [], object()]) +@pytest.mark.parametrize("ll", [1, [], object()]) def test_is_recompilable_fails(ll): assert not inference.is_re_compilable(ll) class TestInference: - def test_infer_dtype_bytes(self): - compare = 'bytes' + compare = "bytes" # string array of bytes - arr = np.array(list('abc'), dtype='S1') + arr = np.array(list("abc"), dtype="S1") assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes @@ -326,64 +363,65 @@ def test_infer_dtype_bytes(self): assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes with missing values - assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + assert lib.infer_dtype([b"a", np.nan, b"c"], skipna=True) == compare def test_isinf_scalar(self): # GH 11352 - assert libmissing.isposinf_scalar(float('inf')) + assert libmissing.isposinf_scalar(float("inf")) assert libmissing.isposinf_scalar(np.inf) assert not libmissing.isposinf_scalar(-np.inf) assert not libmissing.isposinf_scalar(1) - assert not libmissing.isposinf_scalar('a') + assert not libmissing.isposinf_scalar("a") - assert libmissing.isneginf_scalar(float('-inf')) + assert libmissing.isneginf_scalar(float("-inf")) assert libmissing.isneginf_scalar(-np.inf) assert not libmissing.isneginf_scalar(np.inf) assert not libmissing.isneginf_scalar(1) - assert not libmissing.isneginf_scalar('a') + assert not libmissing.isneginf_scalar("a") def test_maybe_convert_numeric_infinities(self): # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = {'', 'NULL', 'nan'} + infinities = ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"] 
+ na_values = {"", "NULL", "nan"} - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) + pos = np.array(["inf"], dtype=np.float64) + neg = np.array(["-inf"], dtype=np.float64) msg = "Unable to parse string" for infinity in infinities: for maybe_int in (True, False): out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) + np.array([infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) + np.array(["-" + infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, neg) out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) + np.array([infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) + np.array(["+" + infinity], dtype=object), na_values, maybe_int + ) tm.assert_numpy_array_equal(out, pos) # too many characters with pytest.raises(ValueError, match=msg): lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) + np.array(["foo_" + infinity], dtype=object), + na_values, + maybe_int, + ) def test_maybe_convert_numeric_post_floatify_nan(self, coerce): # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + data = np.array(["1.200", "-999.000", "4.500"], dtype=object) expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) nan_values = {-999, -999.0} @@ -391,74 +429,81 @@ def test_maybe_convert_numeric_post_floatify_nan(self, coerce): tm.assert_numpy_array_equal(out, expected) def test_convert_infs(self): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') + arr = np.array(["inf", "inf", "inf"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + arr = np.array(["-inf", "-inf", "-inf"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False) assert result.dtype == np.float64 def test_scientific_no_exponent(self): # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + arr = np.array(["42E", "2E", "99e", "6e"], dtype="O") result = lib.maybe_convert_numeric(arr, set(), False, True) assert np.all(np.isnan(result)) def test_convert_non_hashable(self): # GH13324 # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, 'apple']) + arr = np.array([[10.0, 2], 1.0, "apple"]) result = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) def test_convert_numeric_uint64(self): - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - arr = np.array([str(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([str(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp) - 
@pytest.mark.parametrize("arr", [ - np.array([2**63, np.nan], dtype=object), - np.array([str(2**63), np.nan], dtype=object), - np.array([np.nan, 2**63], dtype=object), - np.array([np.nan, str(2**63)], dtype=object)]) + @pytest.mark.parametrize( + "arr", + [ + np.array([2 ** 63, np.nan], dtype=object), + np.array([str(2 ** 63), np.nan], dtype=object), + np.array([np.nan, 2 ** 63], dtype=object), + np.array([np.nan, str(2 ** 63)], dtype=object), + ], + ) def test_convert_numeric_uint64_nan(self, coerce, arr): expected = arr.astype(float) if coerce else arr.copy() - result = lib.maybe_convert_numeric(arr, set(), - coerce_numeric=coerce) + result = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) def test_convert_numeric_uint64_nan_values(self, coerce): - arr = np.array([2**63, 2**63 + 1], dtype=object) - na_values = {2**63} + arr = np.array([2 ** 63, 2 ** 63 + 1], dtype=object) + na_values = {2 ** 63} - expected = (np.array([np.nan, 2**63 + 1], dtype=float) - if coerce else arr.copy()) - result = lib.maybe_convert_numeric(arr, na_values, - coerce_numeric=coerce) + expected = ( + np.array([np.nan, 2 ** 63 + 1], dtype=float) if coerce else arr.copy() + ) + result = lib.maybe_convert_numeric(arr, na_values, coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("case", [ - np.array([2**63, -1], dtype=object), - np.array([str(2**63), -1], dtype=object), - np.array([str(2**63), str(-1)], dtype=object), - np.array([-1, 2**63], dtype=object), - np.array([-1, str(2**63)], dtype=object), - np.array([str(-1), str(2**63)], dtype=object)]) + @pytest.mark.parametrize( + "case", + [ + np.array([2 ** 63, -1], dtype=object), + np.array([str(2 ** 63), -1], dtype=object), + np.array([str(2 ** 63), str(-1)], dtype=object), + np.array([-1, 2 ** 63], dtype=object), + np.array([-1, str(2 ** 63)], dtype=object), + np.array([str(-1), str(2 ** 63)], dtype=object), + ], + ) def test_convert_numeric_int64_uint64(self, case, coerce): expected = case.astype(float) if coerce else case.copy() result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("value", [-2**63 - 1, 2**64]) + @pytest.mark.parametrize("value", [-2 ** 63 - 1, 2 ** 64]) def test_convert_int_overflow(self, value): # see gh-18584 arr = np.array([value], dtype=object) @@ -467,29 +512,28 @@ def test_convert_int_overflow(self, value): def test_maybe_convert_objects_uint64(self): # see gh-4471 - arr = np.array([2**63], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([2 ** 63], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) # NumPy bug: can't compare uint64 to int64, as that # results in both casting to float64, so we should # make sure that this function is robust against it - arr = np.array([np.uint64(2**63)], dtype=object) - exp = np.array([2**63], dtype=np.uint64) + arr = np.array([np.uint64(2 ** 63)], dtype=object) + exp = np.array([2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) arr = np.array([2, -1], dtype=object) exp = np.array([2, -1], dtype=np.int64) tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) - arr = np.array([2**63, -1], dtype=object) - exp = np.array([2**63, -1], dtype=object) + arr = np.array([2 ** 63, -1], dtype=object) + exp = np.array([2 ** 63, -1], dtype=object) 
tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) def test_mixed_dtypes_remain_object_array(self): # GH14956 - array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], - dtype=object) + array = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) result = lib.maybe_convert_objects(array, convert_datetime=1) tm.assert_numpy_array_equal(result, array) @@ -507,32 +551,31 @@ def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype): # make sure the inferred dtype of the fixture is as requested assert inferred_dtype == lib.infer_dtype(values, skipna=True) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_length_zero(self, skipna): - result = lib.infer_dtype(np.array([], dtype='i4'), skipna=skipna) - assert result == 'integer' + result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna) + assert result == "integer" result = lib.infer_dtype([], skipna=skipna) - assert result == 'empty' + assert result == "empty" # GH 18004 - arr = np.array([np.array([], dtype=object), - np.array([], dtype=object)]) + arr = np.array([np.array([], dtype=object), np.array([], dtype=object)]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'empty' + assert result == "empty" def test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'integer' + assert result == "integer" - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed-integer' + assert result == "mixed-integer" - arr = np.array([1, 2, 3, 4, 5], dtype='i4') + arr = np.array([1, 2, 3, 4, 5], dtype="i4") result = lib.infer_dtype(arr, skipna=True) - assert result == 'integer' + assert result == "integer" def test_deprecation(self): # GH 24050 @@ -540,121 +583,123 @@ def test_deprecation(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = lib.infer_dtype(arr) # default: skipna=None -> warn - assert result == 'integer' + assert result == "integer" def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') + arr = np.array([True, False, True, True, True], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + arr = np.array([np.bool_(True), np.bool_(False)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([True, False, True, 'foo'], dtype='O') + arr = np.array([True, False, True, "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed' + assert result == "mixed" arr = np.array([True, False, True], dtype=bool) result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" - arr = np.array([True, np.nan, False], dtype='O') + arr = np.array([True, np.nan, False], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'boolean' + assert result == "boolean" result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + arr = np.array([1.0, 2.0, 3.0, np.float64(4), 
np.float32(5)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), "foo"], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed-integer' + assert result == "mixed-integer" - arr = np.array([1, 2, 3, 4, 5], dtype='f4') + arr = np.array([1, 2, 3, 4, 5], dtype="f4") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" - arr = np.array([1, 2, 3, 4, 5], dtype='f8') + arr = np.array([1, 2, 3, 4, 5], dtype="f8") result = lib.infer_dtype(arr, skipna=True) - assert result == 'floating' + assert result == "floating" def test_decimals(self): # GH15690 arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" arr = np.array([1.0, 2.0, Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'mixed' + assert result == "mixed" - arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)]) result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" - arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype="O") result = lib.infer_dtype(arr, skipna=True) - assert result == 'decimal' + assert result == "decimal" # complex is compatible with nan, so skipna has no effect - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_complex(self, skipna): # gets cast to complex on array construction arr = np.array([1.0, 2.0, 1 + 1j]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" - arr = np.array([1.0, 2.0, 1 + 1j], dtype='O') + arr = np.array([1.0, 2.0, 1 + 1j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'mixed' + assert result == "mixed" # gets cast to complex on array construction arr = np.array([1, np.nan, 1 + 1j]) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" - arr = np.array([1.0, np.nan, 1 + 1j], dtype='O') + arr = np.array([1.0, np.nan, 1 + 1j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'mixed' + assert result == "mixed" # complex with nans stays complex - arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype='O') + arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype="O") result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" # test smaller complex dtype; will pass through _try_infer_map fastpath arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64) result = lib.infer_dtype(arr, skipna=skipna) - assert result == 'complex' + assert result == "complex" def test_string(self): pass def test_unicode(self): - arr = ['a', np.nan, 'c'] + arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" - arr = ['a', np.nan, 'c'] + arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = 'string' + expected = "string" assert result == expected - @pytest.mark.parametrize('dtype, missing, skipna, expected', [ - (float, np.nan, False, 'floating'), - (float, np.nan, True, 'floating'), - (object, np.nan, False, 'floating'), - (object, 
np.nan, True, 'empty'), - (object, None, False, 'mixed'), - (object, None, True, 'empty') - ]) - @pytest.mark.parametrize('box', [pd.Series, np.array]) + @pytest.mark.parametrize( + "dtype, missing, skipna, expected", + [ + (float, np.nan, False, "floating"), + (float, np.nan, True, "floating"), + (object, np.nan, False, "floating"), + (object, np.nan, True, "empty"), + (object, None, False, "mixed"), + (object, None, True, "empty"), + ], + ) + @pytest.mark.parametrize("box", [pd.Series, np.array]) def test_object_empty(self, box, missing, dtype, skipna, expected): # GH 23421 arr = box([missing, missing], dtype=dtype) @@ -666,154 +711,150 @@ def test_datetime(self): dates = [datetime(2012, 1, x) for x in range(1, 20)] index = Index(dates) - assert index.inferred_type == 'datetime64' + assert index.inferred_type == "datetime64" def test_infer_dtype_datetime(self): - arr = np.array([Timestamp('2011-01-01'), - Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([np.datetime64('2011-01-01'), - np.datetime64('2011-01-01')], dtype=object) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array( + [np.datetime64("2011-01-01"), np.datetime64("2011-01-01")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([n, pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, np.datetime64('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array([n, np.datetime64("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([n, datetime(2011, 1, 1)]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([n, pd.Timestamp("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" - arr = np.array([n, np.datetime64('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime64' + arr = np.array([n, np.datetime64("2011-01-02"), n]) + assert lib.infer_dtype(arr, skipna=True) == "datetime64" arr = np.array([n, datetime(2011, 1, 1), n]) - assert lib.infer_dtype(arr, skipna=True) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == "datetime" # different type of nat - arr = np.array([np.timedelta64('nat'), - np.datetime64('2011-01-02')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.timedelta64("nat"), np.datetime64("2011-01-02")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.datetime64('2011-01-02'), - np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.datetime64("2011-01-02"), np.timedelta64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" # mixed datetime - arr = np.array([datetime(2011, 1, 1), - pd.Timestamp('2011-01-02')]) - assert 
lib.infer_dtype(arr, skipna=True) == 'datetime' + arr = np.array([datetime(2011, 1, 1), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "datetime" # should be datetime? - arr = np.array([np.datetime64('2011-01-01'), - pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.datetime64("2011-01-01"), pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([pd.Timestamp('2011-01-02'), - np.datetime64('2011-01-01')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([pd.Timestamp("2011-01-02"), np.datetime64("2011-01-01")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer' + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed-integer" - arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.nan, pd.Timestamp("2011-01-02"), 1.1]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" - arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=True) == 'mixed' + arr = np.array([np.nan, "2011-01-01", pd.Timestamp("2011-01-02")]) + assert lib.infer_dtype(arr, skipna=True) == "mixed" def test_infer_dtype_timedelta(self): - arr = np.array([pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([pd.Timedelta("1 days"), pd.Timedelta("2 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([np.timedelta64(1, 'D'), - np.timedelta64(2, 'D')], dtype=object) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([np.timedelta64(1, "D"), np.timedelta64(2, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([timedelta(1), timedelta(2)]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, Timedelta('1 days')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, Timedelta("1 days")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, np.timedelta64(1, 'D')]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, np.timedelta64(1, "D")]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([n, timedelta(1)]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, pd.Timedelta('1 days'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, pd.Timedelta("1 days"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" - arr = np.array([n, np.timedelta64(1, 'D'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + arr = np.array([n, np.timedelta64(1, "D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "timedelta" arr = np.array([n, timedelta(1), n]) - assert lib.infer_dtype(arr, skipna=True) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == "timedelta" # different type of nat - arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.datetime64("nat"), 
np.timedelta64(1, "D")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.timedelta64(1, "D"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" def test_infer_dtype_period(self): # GH 13664 - arr = np.array([pd.Period('2011-01', freq='D'), - pd.Period('2011-02', freq='D')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" - arr = np.array([pd.Period('2011-01', freq='D'), - pd.Period('2011-02', freq='M')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([pd.Period("2011-01", freq="D"), pd.Period("2011-02", freq="M")]) + assert lib.infer_dtype(arr, skipna=True) == "period" # starts with nan for n in [pd.NaT, np.nan]: - arr = np.array([n, pd.Period('2011-01', freq='D')]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([n, pd.Period("2011-01", freq="D")]) + assert lib.infer_dtype(arr, skipna=True) == "period" - arr = np.array([n, pd.Period('2011-01', freq='D'), n]) - assert lib.infer_dtype(arr, skipna=True) == 'period' + arr = np.array([n, pd.Period("2011-01", freq="D"), n]) + assert lib.infer_dtype(arr, skipna=True) == "period" # different type of nat - arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [np.datetime64("nat"), pd.Period("2011-01", freq="M")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array( + [pd.Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object + ) + assert lib.infer_dtype(arr, skipna=False) == "mixed" @pytest.mark.parametrize( "data", [ [datetime(2017, 6, 12, 19, 30), datetime(2017, 3, 11, 1, 15)], [Timestamp("20170612"), Timestamp("20170311")], - [Timestamp("20170612", tz='US/Eastern'), - Timestamp("20170311", tz='US/Eastern')], - [date(2017, 6, 12), - Timestamp("20170311", tz='US/Eastern')], + [ + Timestamp("20170612", tz="US/Eastern"), + Timestamp("20170311", tz="US/Eastern"), + ], + [date(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], [np.datetime64("2017-06-12"), np.datetime64("2017-03-11")], - [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)] - ] + [np.datetime64("2017-06-12"), datetime(2017, 3, 11, 1, 15)], + ], ) def test_infer_datetimelike_array_datetime(self, data): assert lib.infer_datetimelike_array(data) == "datetime" @@ -824,8 +865,8 @@ def test_infer_datetimelike_array_datetime(self, data): [timedelta(2017, 6, 12), timedelta(2017, 3, 11)], [timedelta(2017, 6, 12), date(2017, 3, 11)], [np.timedelta64(2017, "D"), np.timedelta64(6, "s")], - [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)] - ] + [np.timedelta64(2017, "D"), timedelta(2017, 3, 11)], + ], ) def test_infer_datetimelike_array_timedelta(self, data): assert lib.infer_datetimelike_array(data) == "timedelta" @@ -841,10 +882,10 @@ def test_infer_datetimelike_array_date(self): [20170612, 20170311], [20170612.5, 20170311.8], [Dummy(), Dummy()], - [Timestamp("20170612"), Timestamp("20170311", tz='US/Eastern')], + [Timestamp("20170612"), Timestamp("20170311", tz="US/Eastern")], 
[Timestamp("20170612"), 20170311], - [timedelta(2017, 6, 12), Timestamp("20170311", tz='US/Eastern')] - ] + [timedelta(2017, 6, 12), Timestamp("20170311", tz="US/Eastern")], + ], ) def test_infer_datetimelike_array_mixed(self, data): assert lib.infer_datetimelike_array(data) == "mixed" @@ -859,87 +900,83 @@ def test_infer_datetimelike_array_mixed(self, data): [[np.datetime64("2017-06-12"), pd.NaT], "datetime"], [[date(2017, 6, 12), pd.NaT], "date"], [[timedelta(2017, 6, 12), pd.NaT], "timedelta"], - [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"] - ] + [[np.timedelta64(2017, "D"), pd.NaT], "timedelta"], + ], ) @pytest.mark.parametrize("second", [None, np.nan]) - def test_infer_datetimelike_array_nan_nat_like(self, first, second, - expected): + def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): first.append(second) assert lib.infer_datetimelike_array(first) == expected def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) - assert lib.infer_dtype(arr, skipna=True) == 'floating' + assert lib.infer_dtype(arr, skipna=True) == "floating" # nan and None mix are result in mixed arr = np.array([np.nan, np.nan, None]) - assert lib.infer_dtype(arr, skipna=True) == 'empty' - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" arr = np.array([None, np.nan, np.nan]) - assert lib.infer_dtype(arr, skipna=True) == 'empty' - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == "empty" + assert lib.infer_dtype(arr, skipna=False) == "mixed" # pd.NaT arr = np.array([pd.NaT]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([pd.NaT, np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([np.nan, pd.NaT]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([np.nan, pd.NaT, np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" arr = np.array([None, pd.NaT, None]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=False) == "datetime" # np.datetime64(nat) - arr = np.array([np.datetime64('nat')]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([np.datetime64("nat")]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" - arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + arr = np.array([pd.NaT, n, np.datetime64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "datetime64" - arr = np.array([np.timedelta64('nat')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([np.timedelta64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" for n in [np.nan, pd.NaT, None]: - arr = np.array([n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([n, np.timedelta64("nat"), n]) + assert 
lib.infer_dtype(arr, skipna=False) == "timedelta" - arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + arr = np.array([pd.NaT, n, np.timedelta64("nat"), n]) + assert lib.infer_dtype(arr, skipna=False) == "timedelta" # datetime / timedelta mixed - arr = np.array([pd.NaT, np.datetime64('nat'), - np.timedelta64('nat'), np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([pd.NaT, np.datetime64("nat"), np.timedelta64("nat"), np.nan]) + assert lib.infer_dtype(arr, skipna=False) == "mixed" - arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], - dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + arr = np.array([np.timedelta64("nat"), np.datetime64("nat")], dtype=object) + assert lib.infer_dtype(arr, skipna=False) == "mixed" def test_is_datetimelike_array_all_nan_nat_like(self): - arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + arr = np.array([np.nan, pd.NaT, np.datetime64("nat")]) assert lib.is_datetime_array(arr) assert lib.is_datetime64_array(arr) assert not lib.is_timedelta_or_timedelta64_array(arr) - arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + arr = np.array([np.nan, pd.NaT, np.timedelta64("nat")]) assert not lib.is_datetime_array(arr) assert not lib.is_datetime64_array(arr) assert lib.is_timedelta_or_timedelta64_array(arr) - arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), - np.timedelta64('nat')]) + arr = np.array([np.nan, pd.NaT, np.datetime64("nat"), np.timedelta64("nat")]) assert not lib.is_datetime_array(arr) assert not lib.is_datetime64_array(arr) assert not lib.is_timedelta_or_timedelta64_array(arr) @@ -955,28 +992,40 @@ def test_is_datetimelike_array_all_nan_nat_like(self): assert not lib.is_timedelta_or_timedelta64_array(arr) assert lib.is_datetime_with_singletz_array( - np.array([pd.Timestamp('20130101', tz='US/Eastern'), - pd.Timestamp('20130102', tz='US/Eastern')], - dtype=object)) + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="US/Eastern"), + ], + dtype=object, + ) + ) assert not lib.is_datetime_with_singletz_array( - np.array([pd.Timestamp('20130101', tz='US/Eastern'), - pd.Timestamp('20130102', tz='CET')], - dtype=object)) + np.array( + [ + pd.Timestamp("20130101", tz="US/Eastern"), + pd.Timestamp("20130102", tz="CET"), + ], + dtype=object, + ) + ) @pytest.mark.parametrize( "func", [ - 'is_datetime_array', - 'is_datetime64_array', - 'is_bool_array', - 'is_timedelta_or_timedelta64_array', - 'is_date_array', - 'is_time_array', - 'is_interval_array', - 'is_period_array']) + "is_datetime_array", + "is_datetime64_array", + "is_bool_array", + "is_timedelta_or_timedelta64_array", + "is_date_array", + "is_time_array", + "is_interval_array", + "is_period_array", + ], + ) def test_other_dtypes_for_array(self, func): func = getattr(lib, func) - arr = np.array(['foo', 'bar']) + arr = np.array(["foo", "bar"]) assert not func(arr) arr = np.array([1, 2]) @@ -986,14 +1035,14 @@ def test_date(self): dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) - assert index.inferred_type == 'date' + assert index.inferred_type == "date" dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] result = lib.infer_dtype(dates, skipna=False) - assert result == 'mixed' + assert result == "mixed" result = lib.infer_dtype(dates, skipna=True) - assert result == 'date' + assert result == "date" def test_is_numeric_array(self): @@ -1006,11 +1055,13 @@ def test_is_numeric_array(self): def 
test_is_string_array(self): - assert lib.is_string_array(np.array(['foo', 'bar'])) + assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(['foo', 'bar', np.nan], dtype=object), skipna=False) + np.array(["foo", "bar", np.nan], dtype=object), skipna=False + ) assert lib.is_string_array( - np.array(['foo', 'bar', np.nan], dtype=object), skipna=True) + np.array(["foo", "bar", np.nan], dtype=object), skipna=True + ) assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): @@ -1021,7 +1072,8 @@ def test_to_object_array_tuples(self): try: # make sure record array works from collections import namedtuple - record = namedtuple('record', 'x y') + + record = namedtuple("record", "x y") r = record(5, 6) values = [r] result = lib.to_object_array_tuples(values) # noqa @@ -1032,11 +1084,11 @@ def test_object(self): # GH 7431 # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') + arr = np.array([None], dtype="O") result = lib.infer_dtype(arr, skipna=False) - assert result == 'mixed' + assert result == "mixed" result = lib.infer_dtype(arr, skipna=True) - assert result == 'empty' + assert result == "empty" def test_to_object_array_width(self): # see gh-13320 @@ -1050,15 +1102,16 @@ def test_to_object_array_width(self): out = lib.to_object_array(rows, min_width=1) tm.assert_numpy_array_equal(out, expected) - expected = np.array([[1, 2, 3, None, None], - [4, 5, 6, None, None]], dtype=object) + expected = np.array( + [[1, 2, 3, None, None], [4, 5, 6, None, None]], dtype=object + ) out = lib.to_object_array(rows, min_width=5) tm.assert_numpy_array_equal(out, expected) def test_is_period(self): - assert lib.is_period(pd.Period('2011-01', freq='M')) - assert not lib.is_period(pd.PeriodIndex(['2011-01'], freq='M')) - assert not lib.is_period(pd.Timestamp('2011-01')) + assert lib.is_period(pd.Period("2011-01", freq="M")) + assert not lib.is_period(pd.PeriodIndex(["2011-01"], freq="M")) + assert not lib.is_period(pd.Timestamp("2011-01")) assert not lib.is_period(1) assert not lib.is_period(np.nan) @@ -1066,23 +1119,23 @@ def test_categorical(self): # GH 8974 from pandas import Categorical, Series - arr = Categorical(list('abc')) + + arr = Categorical(list("abc")) result = lib.infer_dtype(arr, skipna=True) - assert result == 'categorical' + assert result == "categorical" result = lib.infer_dtype(Series(arr), skipna=True) - assert result == 'categorical' + assert result == "categorical" - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + arr = Categorical(list("abc"), categories=["cegfab"], ordered=True) result = lib.infer_dtype(arr, skipna=True) - assert result == 'categorical' + assert result == "categorical" result = lib.infer_dtype(Series(arr), skipna=True) - assert result == 'categorical' + assert result == "categorical" class TestNumberScalar: - def test_is_number(self): assert is_number(True) @@ -1096,17 +1149,17 @@ def test_is_number(self): assert is_number(np.nan) assert not is_number(None) - assert not is_number('x') + assert not is_number("x") assert not is_number(datetime(2011, 1, 1)) - assert not is_number(np.datetime64('2011-01-01')) - assert not is_number(Timestamp('2011-01-01')) - assert not is_number(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_number(np.datetime64("2011-01-01")) + assert not is_number(Timestamp("2011-01-01")) + assert not is_number(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_number(timedelta(1000)) - assert not 
is_number(Timedelta('1 days')) + assert not is_number(Timedelta("1 days")) # questionable assert not is_number(np.bool_(False)) - assert is_number(np.timedelta64(1, 'D')) + assert is_number(np.timedelta64(1, "D")) def test_is_bool(self): assert is_bool(True) @@ -1121,14 +1174,14 @@ def test_is_bool(self): assert not is_bool(np.complex128(1 + 3j)) assert not is_bool(np.nan) assert not is_bool(None) - assert not is_bool('x') + assert not is_bool("x") assert not is_bool(datetime(2011, 1, 1)) - assert not is_bool(np.datetime64('2011-01-01')) - assert not is_bool(Timestamp('2011-01-01')) - assert not is_bool(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_bool(np.datetime64("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01")) + assert not is_bool(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_bool(timedelta(1000)) - assert not is_bool(np.timedelta64(1, 'D')) - assert not is_bool(Timedelta('1 days')) + assert not is_bool(np.timedelta64(1, "D")) + assert not is_bool(Timedelta("1 days")) def test_is_integer(self): assert is_integer(1) @@ -1143,16 +1196,16 @@ def test_is_integer(self): assert not is_integer(np.complex128(1 + 3j)) assert not is_integer(np.nan) assert not is_integer(None) - assert not is_integer('x') + assert not is_integer("x") assert not is_integer(datetime(2011, 1, 1)) - assert not is_integer(np.datetime64('2011-01-01')) - assert not is_integer(Timestamp('2011-01-01')) - assert not is_integer(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_integer(np.datetime64("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01")) + assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_integer(timedelta(1000)) - assert not is_integer(Timedelta('1 days')) + assert not is_integer(Timedelta("1 days")) # questionable - assert is_integer(np.timedelta64(1, 'D')) + assert is_integer(np.timedelta64(1, "D")) def test_is_float(self): assert is_float(1.1) @@ -1167,75 +1220,74 @@ def test_is_float(self): assert not is_float(np.int64(1)) assert not is_float(np.complex128(1 + 3j)) assert not is_float(None) - assert not is_float('x') + assert not is_float("x") assert not is_float(datetime(2011, 1, 1)) - assert not is_float(np.datetime64('2011-01-01')) - assert not is_float(Timestamp('2011-01-01')) - assert not is_float(Timestamp('2011-01-01', tz='US/Eastern')) + assert not is_float(np.datetime64("2011-01-01")) + assert not is_float(Timestamp("2011-01-01")) + assert not is_float(Timestamp("2011-01-01", tz="US/Eastern")) assert not is_float(timedelta(1000)) - assert not is_float(np.timedelta64(1, 'D')) - assert not is_float(Timedelta('1 days')) + assert not is_float(np.timedelta64(1, "D")) + assert not is_float(Timedelta("1 days")) def test_is_datetime_dtypes(self): - ts = pd.date_range('20130101', periods=3) - tsa = pd.date_range('20130101', periods=3, tz='US/Eastern') + ts = pd.date_range("20130101", periods=3) + tsa = pd.date_range("20130101", periods=3, tz="US/Eastern") - assert is_datetime64_dtype('datetime64') - assert is_datetime64_dtype('datetime64[ns]') + assert is_datetime64_dtype("datetime64") + assert is_datetime64_dtype("datetime64[ns]") assert is_datetime64_dtype(ts) assert not is_datetime64_dtype(tsa) - assert not is_datetime64_ns_dtype('datetime64') - assert is_datetime64_ns_dtype('datetime64[ns]') + assert not is_datetime64_ns_dtype("datetime64") + assert is_datetime64_ns_dtype("datetime64[ns]") assert is_datetime64_ns_dtype(ts) assert is_datetime64_ns_dtype(tsa) - assert is_datetime64_any_dtype('datetime64') - assert 
is_datetime64_any_dtype('datetime64[ns]') + assert is_datetime64_any_dtype("datetime64") + assert is_datetime64_any_dtype("datetime64[ns]") assert is_datetime64_any_dtype(ts) assert is_datetime64_any_dtype(tsa) - assert not is_datetime64tz_dtype('datetime64') - assert not is_datetime64tz_dtype('datetime64[ns]') + assert not is_datetime64tz_dtype("datetime64") + assert not is_datetime64tz_dtype("datetime64[ns]") assert not is_datetime64tz_dtype(ts) assert is_datetime64tz_dtype(tsa) - for tz in ['US/Eastern', 'UTC']: - dtype = 'datetime64[ns, {}]'.format(tz) + for tz in ["US/Eastern", "UTC"]: + dtype = "datetime64[ns, {}]".format(tz) assert not is_datetime64_dtype(dtype) assert is_datetime64tz_dtype(dtype) assert is_datetime64_ns_dtype(dtype) assert is_datetime64_any_dtype(dtype) def test_is_timedelta(self): - assert is_timedelta64_dtype('timedelta64') - assert is_timedelta64_dtype('timedelta64[ns]') - assert not is_timedelta64_ns_dtype('timedelta64') - assert is_timedelta64_ns_dtype('timedelta64[ns]') + assert is_timedelta64_dtype("timedelta64") + assert is_timedelta64_dtype("timedelta64[ns]") + assert not is_timedelta64_ns_dtype("timedelta64") + assert is_timedelta64_ns_dtype("timedelta64[ns]") - tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64[ns]') + tdi = TimedeltaIndex([1e14, 2e14], dtype="timedelta64[ns]") assert is_timedelta64_dtype(tdi) assert is_timedelta64_ns_dtype(tdi) - assert is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]')) + assert is_timedelta64_ns_dtype(tdi.astype("timedelta64[ns]")) # Conversion to Int64Index: - assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64')) - assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')) + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64")) + assert not is_timedelta64_ns_dtype(tdi.astype("timedelta64[h]")) class TestIsScalar: - def test_is_scalar_builtin_scalars(self): assert is_scalar(None) assert is_scalar(True) assert is_scalar(False) assert is_scalar(Number()) assert is_scalar(Fraction()) - assert is_scalar(0.) 
+ assert is_scalar(0.0) assert is_scalar(np.nan) - assert is_scalar('foobar') - assert is_scalar(b'foobar') + assert is_scalar("foobar") + assert is_scalar(b"foobar") assert is_scalar(datetime(2014, 1, 1)) assert is_scalar(date(2014, 1, 1)) assert is_scalar(time(12, 0)) @@ -1247,26 +1299,29 @@ def test_is_scalar_builtin_nonscalars(self): assert not is_scalar([]) assert not is_scalar([1]) assert not is_scalar(()) - assert not is_scalar((1, )) + assert not is_scalar((1,)) assert not is_scalar(slice(None)) assert not is_scalar(Ellipsis) def test_is_scalar_numpy_array_scalars(self): assert is_scalar(np.int64(1)) - assert is_scalar(np.float64(1.)) + assert is_scalar(np.float64(1.0)) assert is_scalar(np.int32(1)) - assert is_scalar(np.object_('foobar')) - assert is_scalar(np.str_('foobar')) - assert is_scalar(np.unicode_('foobar')) - assert is_scalar(np.bytes_(b'foobar')) - assert is_scalar(np.datetime64('2014-01-01')) - assert is_scalar(np.timedelta64(1, 'h')) + assert is_scalar(np.object_("foobar")) + assert is_scalar(np.str_("foobar")) + assert is_scalar(np.unicode_("foobar")) + assert is_scalar(np.bytes_(b"foobar")) + assert is_scalar(np.datetime64("2014-01-01")) + assert is_scalar(np.timedelta64(1, "h")) def test_is_scalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: + for zerodim in [ + np.array(1), + np.array("foobar"), + np.array(np.datetime64("2014-01-01")), + np.array(np.timedelta64(1, "h")), + np.array(np.datetime64("NaT")), + ]: assert not is_scalar(zerodim) assert is_scalar(lib.item_from_zerodim(zerodim)) @@ -1274,12 +1329,12 @@ def test_is_scalar_numpy_zerodim_arrays(self): def test_is_scalar_numpy_arrays(self): assert not is_scalar(np.array([])) assert not is_scalar(np.array([[]])) - assert not is_scalar(np.matrix('1; 2')) + assert not is_scalar(np.matrix("1; 2")) def test_is_scalar_pandas_scalars(self): - assert is_scalar(Timestamp('2014-01-01')) + assert is_scalar(Timestamp("2014-01-01")) assert is_scalar(Timedelta(hours=1)) - assert is_scalar(Period('2014-01-01')) + assert is_scalar(Period("2014-01-01")) assert is_scalar(Interval(left=0, right=1)) assert is_scalar(DateOffset(days=1)) @@ -1293,26 +1348,25 @@ def test_is_scalar_pandas_containers(self): def test_datetimeindex_from_empty_datetime64_array(): - for unit in ['ms', 'us', 'ns']: - idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) - assert (len(idx) == 0) + for unit in ["ms", "us", "ns"]: + idx = DatetimeIndex(np.array([], dtype="datetime64[%s]" % unit)) + assert len(idx) == 0 def test_nan_to_nat_conversions(): - df = DataFrame(dict({ - 'A': np.asarray(range(10), dtype='float64'), - 'B': Timestamp('20010101') - })) + df = DataFrame( + dict({"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}) + ) df.iloc[3:6, :] = np.nan - result = df.loc[4, 'B'].value - assert (result == iNaT) + result = df.loc[4, "B"].value + assert result == iNaT - s = df['B'].copy() + s = df["B"].copy() s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert (isna(s[8])) + assert isna(s[8]) - assert (s[8].value == np.datetime64('NaT').astype(np.int64)) + assert s[8].value == np.datetime64("NaT").astype(np.int64) @td.skip_if_no_scipy @@ -1325,17 +1379,17 @@ def test_is_scipy_sparse(spmatrix): # noqa: F811 def test_ensure_int32(): values = np.arange(10, dtype=np.int32) result = ensure_int32(values) - assert (result.dtype == np.int32) + assert 
result.dtype == np.int32 values = np.arange(10, dtype=np.int64) result = ensure_int32(values) - assert (result.dtype == np.int32) + assert result.dtype == np.int32 def test_ensure_categorical(): values = np.arange(10, dtype=np.int32) result = ensure_categorical(values) - assert (result.dtype == 'category') + assert result.dtype == "category" values = Categorical(values) result = ensure_categorical(values) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 64bfc050da02f..a688dec50bc95 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -11,20 +11,24 @@ from pandas._libs.tslibs import iNaT, is_null_datetimelike from pandas.core.dtypes.common import is_scalar -from pandas.core.dtypes.dtypes import ( - DatetimeTZDtype, IntervalDtype, PeriodDtype) +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype from pandas.core.dtypes.missing import ( - array_equivalent, isna, isnull, na_value_for_dtype, notna, notnull) + array_equivalent, + isna, + isnull, + na_value_for_dtype, + notna, + notnull, +) import pandas as pd -from pandas import ( - DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range) +from pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range from pandas.util import testing as tm -@pytest.mark.parametrize('notna_f', [notna, notnull]) +@pytest.mark.parametrize("notna_f", [notna, notnull]) def test_notna_notnull(notna_f): - assert notna_f(1.) + assert notna_f(1.0) assert not notna_f(None) assert not notna_f(np.NaN) @@ -45,14 +49,17 @@ def test_notna_notnull(notna_f): assert result.sum() == 2 with cf.option_context("mode.use_inf_as_na", False): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(notna_f(s), Series)) + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: + assert isinstance(notna_f(s), Series) class TestIsNA: - def test_0d_array(self): assert isna(np.array(np.nan)) assert not isna(np.array(0.0)) @@ -70,24 +77,31 @@ def test_empty_object(self): expected = np.ones(shape=shape, dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('isna_f', [isna, isnull]) + @pytest.mark.parametrize("isna_f", [isna, isnull]) def test_isna_isnull(self, isna_f): - assert not isna_f(1.) 
+ assert not isna_f(1.0) assert isna_f(None) assert isna_f(np.NaN) - assert float('nan') + assert float("nan") assert not isna_f(np.inf) assert not isna_f(-np.inf) # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: + for s in [ + tm.makeFloatSeries(), + tm.makeStringSeries(), + tm.makeObjectSeries(), + tm.makeTimeSeries(), + tm.makePeriodSeries(), + ]: assert isinstance(isna_f(s), Series) # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: + for df in [ + tm.makeTimeDataFrame(), + tm.makePeriodFrame(), + tm.makeMixedDataFrame(), + ]: result = isna_f(df) expected = df.apply(isna_f) tm.assert_frame_equal(result, expected) @@ -102,16 +116,16 @@ def test_isna_lists(self): tm.assert_numpy_array_equal(result, exp) # list of strings / unicode - result = isna(['foo', 'bar']) + result = isna(["foo", "bar"]) exp = np.array([False, False]) tm.assert_numpy_array_equal(result, exp) - result = isna(['foo', 'bar']) + result = isna(["foo", "bar"]) exp = np.array([False, False]) tm.assert_numpy_array_equal(result, exp) # GH20675 - result = isna([np.NaN, 'world']) + result = isna([np.NaN, "world"]) exp = np.array([True, False]) tm.assert_numpy_array_equal(result, exp) @@ -125,8 +139,14 @@ def test_isna_nat(self): tm.assert_numpy_array_equal(result, exp) def test_isna_numpy_nat(self): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) + arr = np.array( + [ + NaT, + np.datetime64("NaT"), + np.timedelta64("NaT"), + np.datetime64("NaT", "s"), + ] + ) result = isna(arr) expected = np.array([True] * 4) tm.assert_numpy_array_equal(result, expected) @@ -135,7 +155,7 @@ def test_isna_datetime(self): assert not isna(datetime.now()) assert notna(datetime.now()) - idx = date_range('1/1/1990', periods=20) + idx = date_range("1/1/1990", periods=20) exp = np.ones(len(idx), dtype=bool) tm.assert_numpy_array_equal(notna(idx), exp) @@ -148,7 +168,7 @@ def test_isna_datetime(self): tm.assert_numpy_array_equal(mask, exp) # GH 9129 - pidx = idx.to_period(freq='M') + pidx = idx.to_period(freq="M") mask = isna(pidx) assert mask[0] exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) @@ -160,14 +180,20 @@ def test_isna_datetime(self): @pytest.mark.parametrize( "value, expected", - [(np.complex128(np.nan), True), - (np.float64(1), False), - (np.array([1, 1 + 0j, np.nan, 3]), - np.array([False, False, True, False])), - (np.array([1, 1 + 0j, np.nan, 3], dtype=object), - np.array([False, False, True, False])), - (np.array([1, 1 + 0j, np.nan, 3]).astype(object), - np.array([False, False, True, False]))]) + [ + (np.complex128(np.nan), True), + (np.float64(1), False), + (np.array([1, 1 + 0j, np.nan, 3]), np.array([False, False, True, False])), + ( + np.array([1, 1 + 0j, np.nan, 3], dtype=object), + np.array([False, False, True, False]), + ), + ( + np.array([1, 1 + 0j, np.nan, 3]).astype(object), + np.array([False, False, True, False]), + ), + ], + ) def test_complex(self, value, expected): result = isna(value) if is_scalar(result): @@ -176,16 +202,22 @@ def test_complex(self, value, expected): tm.assert_numpy_array_equal(result, expected) def test_datetime_other_units(self): - idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02']) + idx = pd.DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"]) exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) 
tm.assert_numpy_array_equal(isna(idx.values), exp) tm.assert_numpy_array_equal(notna(idx.values), ~exp) - for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', - 'datetime64[ns]']: + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: values = idx.values.astype(dtype) exp = np.array([False, True, False]) @@ -201,16 +233,22 @@ def test_datetime_other_units(self): tm.assert_series_equal(notna(s), ~exp) def test_timedelta_other_units(self): - idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days']) + idx = pd.TimedeltaIndex(["1 days", "NaT", "2 days"]) exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) tm.assert_numpy_array_equal(isna(idx.values), exp) tm.assert_numpy_array_equal(notna(idx.values), ~exp) - for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', - 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', - 'timedelta64[ns]']: + for dtype in [ + "timedelta64[D]", + "timedelta64[h]", + "timedelta64[m]", + "timedelta64[s]", + "timedelta64[ms]", + "timedelta64[us]", + "timedelta64[ns]", + ]: values = idx.values.astype(dtype) exp = np.array([False, True, False]) @@ -226,7 +264,7 @@ def test_timedelta_other_units(self): tm.assert_series_equal(notna(s), ~exp) def test_period(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2012-01"], freq="M") exp = np.array([False, True, False]) tm.assert_numpy_array_equal(isna(idx), exp) tm.assert_numpy_array_equal(notna(idx), ~exp) @@ -241,95 +279,112 @@ def test_period(self): def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), - np.array([np.nan, np.nan])) - assert array_equivalent(np.array([np.nan, 1, np.nan]), - np.array([np.nan, 1, np.nan])) - assert array_equivalent(np.array([np.nan, None], dtype='object'), - np.array([np.nan, None], dtype='object')) - assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), - np.array([np.nan, 1 + 1j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1 + 1j], dtype='complex'), np.array( - [np.nan, 1 + 2j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) - assert not array_equivalent( - np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - assert array_equivalent(Float64Index([0, np.nan]), - Float64Index([0, np.nan])) + assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan])) + assert array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan]) + ) + assert array_equivalent( + np.array([np.nan, None], dtype="object"), + np.array([np.nan, None], dtype="object"), + ) + assert array_equivalent( + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 1j], dtype="complex"), + ) assert not array_equivalent( - Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), - DatetimeIndex([0, np.nan])) + np.array([np.nan, 1 + 1j], dtype="complex"), + np.array([np.nan, 1 + 2j], dtype="complex"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), - TimedeltaIndex([0, np.nan])) + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]) + ) + assert not array_equivalent(np.array(["a", 
"b", "c", "d"]), np.array(["e", "e"])) + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) + ) with catch_warnings(): filterwarnings("ignore", "Converting timezone", FutureWarning) - assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), - DatetimeIndex([0, np.nan], tz='US/Eastern')) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( - [1, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) def test_array_equivalent_compat(): # see gh-13388 - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - assert (array_equivalent(m, n, strict_nan=True)) - assert (array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + assert array_equivalent(m, n, strict_nan=True) + assert array_equivalent(m, n, strict_nan=False) - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) - assert (not array_equivalent(m, n, strict_nan=True)) - assert (not array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (4, 3)], dtype=[("a", int), ("b", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) - m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) - n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) - assert (not array_equivalent(m, n, strict_nan=True)) - assert (not array_equivalent(m, n, strict_nan=False)) + m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)]) + n = np.array([(1, 2), (3, 4)], dtype=[("b", int), ("a", float)]) + assert not array_equivalent(m, n, strict_nan=True) + assert not array_equivalent(m, n, strict_nan=False) def test_array_equivalent_str(): - for dtype in ['O', 'S', 'U']: - assert array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'B'], dtype=dtype)) - assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'X'], dtype=dtype)) - - -@pytest.mark.parametrize('dtype, na_value', [ - # Datetime-like 
-    (np.dtype("M8[ns]"), NaT),
-    (np.dtype("m8[ns]"), NaT),
-    (DatetimeTZDtype.construct_from_string('datetime64[ns, US/Eastern]'), NaT),
-    (PeriodDtype("M"), NaT),
-    # Integer
-    ('u1', 0), ('u2', 0), ('u4', 0), ('u8', 0),
-    ('i1', 0), ('i2', 0), ('i4', 0), ('i8', 0),
-    # Bool
-    ('bool', False),
-    # Float
-    ('f2', np.nan), ('f4', np.nan), ('f8', np.nan),
-    # Object
-    ('O', np.nan),
-    # Interval
-    (IntervalDtype(), np.nan),
-])
+    for dtype in ["O", "S", "U"]:
+        assert array_equivalent(
+            np.array(["A", "B"], dtype=dtype), np.array(["A", "B"], dtype=dtype)
+        )
+        assert not array_equivalent(
+            np.array(["A", "B"], dtype=dtype), np.array(["A", "X"], dtype=dtype)
+        )
+
+
+@pytest.mark.parametrize(
+    "dtype, na_value",
+    [
+        # Datetime-like
+        (np.dtype("M8[ns]"), NaT),
+        (np.dtype("m8[ns]"), NaT),
+        (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT),
+        (PeriodDtype("M"), NaT),
+        # Integer
+        ("u1", 0),
+        ("u2", 0),
+        ("u4", 0),
+        ("u8", 0),
+        ("i1", 0),
+        ("i2", 0),
+        ("i4", 0),
+        ("i8", 0),
+        # Bool
+        ("bool", False),
+        # Float
+        ("f2", np.nan),
+        ("f4", np.nan),
+        ("f8", np.nan),
+        # Object
+        ("O", np.nan),
+        # Interval
+        (IntervalDtype(), np.nan),
+    ],
+)
 def test_na_value_for_dtype(dtype, na_value):
     result = na_value_for_dtype(dtype)
     assert result is na_value
@@ -337,8 +392,8 @@ def test_na_value_for_dtype(dtype, na_value):


 class TestNAObj:
-    _1d_methods = ['isnaobj', 'isnaobj_old']
-    _2d_methods = ['isnaobj2d', 'isnaobj2d_old']
+    _1d_methods = ["isnaobj", "isnaobj_old"]
+    _2d_methods = ["isnaobj2d", "isnaobj2d_old"]

     def _check_behavior(self, arr, expected):
         for method in TestNAObj._1d_methods:
@@ -353,7 +408,7 @@ def _check_behavior(self, arr, expected):
             tm.assert_numpy_array_equal(result, expected)

     def test_basic(self):
-        arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan])
+        arr = np.array([1, None, "foo", -5.1, pd.NaT, np.nan])
         expected = np.array([False, True, False, False, True, True])

         self._check_behavior(arr, expected)
@@ -384,32 +439,31 @@ def test_empty_like(self):
         self._check_behavior(arr, expected)


-m8_units = ['as', 'ps', 'ns', 'us', 'ms', 's',
-            'm', 'h', 'D', 'W', 'M', 'Y']
-
-na_vals = [
-    None,
-    NaT,
-    float('NaN'),
-    complex('NaN'),
-    np.nan,
-    np.float64('NaN'),
-    np.float32('NaN'),
-    np.complex64(np.nan),
-    np.complex128(np.nan),
-    np.datetime64('NaT'),
-    np.timedelta64('NaT'),
-] + [
-    np.datetime64('NaT', unit) for unit in m8_units
-] + [
-    np.timedelta64('NaT', unit) for unit in m8_units
-]
+m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"]
+
+na_vals = (
+    [
+        None,
+        NaT,
+        float("NaN"),
+        complex("NaN"),
+        np.nan,
+        np.float64("NaN"),
+        np.float32("NaN"),
+        np.complex64(np.nan),
+        np.complex128(np.nan),
+        np.datetime64("NaT"),
+        np.timedelta64("NaT"),
+    ]
+    + [np.datetime64("NaT", unit) for unit in m8_units]
+    + [np.timedelta64("NaT", unit) for unit in m8_units]
+)

 inf_vals = [
-    float('inf'),
-    float('-inf'),
-    complex('inf'),
-    complex('-inf'),
+    float("inf"),
+    float("-inf"),
+    complex("inf"),
+    complex("-inf"),
     np.inf,
     np.NINF,
 ]
@@ -420,14 +474,12 @@ def test_empty_like(self):
     int(NaT.value),
 ]

-sometimes_na_vals = [
-    Decimal('NaN'),
-]
+sometimes_na_vals = [Decimal("NaN")]

 never_na_vals = [
     # float/complex values that when viewed as int64 match iNaT
     -0.0,
-    np.float64('-0.0'),
+    np.float64("-0.0"),
     -0j,
     np.complex64(-0j),
 ]
diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py
index 0d6396033fac7..ee043a6bb837c 100644
--- a/pandas/tests/extension/arrow/bool.py
+++ 
b/pandas/tests/extension/arrow/bool.py @@ -13,15 +13,19 @@ import pandas as pd from pandas.api.extensions import ( - ExtensionArray, ExtensionDtype, register_extension_dtype, take) + ExtensionArray, + ExtensionDtype, + register_extension_dtype, + take, +) @register_extension_dtype class ArrowBoolDtype(ExtensionDtype): type = np.bool_ - kind = 'b' - name = 'arrow_bool' + kind = "b" + name = "arrow_bool" na_value = pa.NULL @classmethod @@ -29,8 +33,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) @classmethod def construct_array_type(cls): @@ -90,9 +93,12 @@ def dtype(self): @property def nbytes(self): - return sum(x.size for chunk in self._data.chunks - for x in chunk.buffers() - if x is not None) + return sum( + x.size + for chunk in self._data.chunks + for x in chunk.buffers() + if x is not None + ) def isna(self): nas = pd.isna(self._data.to_pandas()) @@ -104,8 +110,7 @@ def take(self, indices, allow_fill=False, fill_value=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indices, fill_value=fill_value, - allow_fill=allow_fill) + result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result, dtype=self.dtype) def copy(self): @@ -113,15 +118,12 @@ def copy(self): @classmethod def _concat_same_type(cls, to_concat): - chunks = list(itertools.chain.from_iterable(x._data.chunks - for x in to_concat)) + chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat)) arr = pa.chunked_array(chunks) return cls(arr) def __invert__(self): - return type(self).from_scalars( - ~self._data.to_pandas() - ) + return type(self).from_scalars(~self._data.to_pandas()) def _reduce(self, method, skipna=True, **kwargs): if skipna: diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 21ce5e999334e..205edf5da5b74 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -5,7 +5,7 @@ from pandas.tests.extension import base import pandas.util.testing as tm -pytest.importorskip('pyarrow', minversion="0.10.0") +pytest.importorskip("pyarrow", minversion="0.10.0") from .bool import ArrowBoolArray, ArrowBoolDtype # isort:skip @@ -47,7 +47,7 @@ def test_from_dtype(self, data): pytest.skip("GH-22666") # seems like some bug in isna on empty BoolArray returning floats. 
- @pytest.mark.xfail(reason='bad is-na for empty data') + @pytest.mark.xfail(reason="bad is-na for empty data") def test_from_sequence_from_cls(self, data): super().test_from_sequence_from_cls(data) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 0b3f2b860c127..090df35bd94c9 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -49,10 +49,12 @@ class TestMyDtype(BaseDtypeTests): from .io import BaseParsingTests # noqa from .methods import BaseMethodsTests # noqa from .missing import BaseMissingTests # noqa -from .ops import ( # noqa - BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil) +from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa from .printing import BasePrintingTests # noqa from .reduce import ( # noqa - BaseBooleanReduceTests, BaseNoReduceTests, BaseNumericReduceTests) + BaseBooleanReduceTests, + BaseNoReduceTests, + BaseNumericReduceTests, +) from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index 55cfbea479c47..2f808d20acd31 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -6,6 +6,4 @@ class BaseExtensionTests: assert_equal = staticmethod(tm.assert_equal) assert_series_equal = staticmethod(tm.assert_series_equal) assert_frame_equal = staticmethod(tm.assert_frame_equal) - assert_extension_array_equal = staticmethod( - tm.assert_extension_array_equal - ) + assert_extension_array_equal = staticmethod(tm.assert_extension_array_equal) diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 231a1f648f8e8..7262a85b1fe00 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -8,7 +8,6 @@ class BaseConstructorsTests(BaseExtensionTests): - def test_from_sequence_from_cls(self, data): result = type(data)._from_sequence(data, dtype=data.dtype) self.assert_extension_array_equal(result, data) @@ -39,7 +38,7 @@ def test_dataframe_constructor_from_dict(self, data, from_series): if from_series: data = pd.Series(data) result = pd.DataFrame({"A": data}) - assert result.dtypes['A'] == data.dtype + assert result.dtypes["A"] == data.dtype assert result.shape == (len(data), 1) assert isinstance(result._data.blocks[0], ExtensionBlock) @@ -50,7 +49,7 @@ def test_dataframe_from_series(self, data): assert isinstance(result._data.blocks[0], ExtensionBlock) def test_series_given_mismatched_index_raises(self, data): - msg = 'Length of passed values is 3, index implies 5' + msg = "Length of passed values is 3, index implies 5" with pytest.raises(ValueError, match=msg): pd.Series(data[:3], index=[0, 1, 2, 3, 4]) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 7b9dedceb00d4..a5040c8cfc2fc 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -15,7 +15,7 @@ def test_name(self, dtype): assert isinstance(dtype.name, str) def test_kind(self, dtype): - valid = set('biufcmMOSUV') + valid = set("biufcmMOSUV") if dtype.kind is not None: assert dtype.kind in valid @@ -46,10 +46,10 @@ def test_is_not_object_type(self, dtype): def test_eq_with_str(self, dtype): assert dtype == dtype.name - assert dtype != dtype.name + '-suffix' + assert dtype != dtype.name + "-suffix" def test_eq_with_numpy_object(self, dtype): - 
assert dtype != np.dtype('object') + assert dtype != np.dtype("object") def test_eq_with_self(self, dtype): assert dtype == dtype @@ -62,18 +62,16 @@ def test_check_dtype(self, data): dtype = data.dtype # check equivalency for using .dtypes - df = pd.DataFrame({'A': pd.Series(data, dtype=dtype), - 'B': data, - 'C': 'foo', 'D': 1}) + df = pd.DataFrame( + {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} + ) # np.dtype('int64') == 'Int64' == 'int64' # so can't distinguish - if dtype.name == 'Int64': - expected = pd.Series([True, True, False, True], - index=list('ABCD')) + if dtype.name == "Int64": + expected = pd.Series([True, True, False, True], index=list("ABCD")) else: - expected = pd.Series([True, True, False, False], - index=list('ABCD')) + expected = pd.Series([True, True, False, False], index=list("ABCD")) # XXX: This should probably be *fixed* not ignored. # See libops.scalar_compare @@ -83,8 +81,7 @@ def test_check_dtype(self, data): self.assert_series_equal(result, expected) - expected = pd.Series([True, True, False, False], - index=list('ABCD')) + expected = pd.Series([True, True, False, False], index=list("ABCD")) result = df.dtypes.apply(str) == str(dtype) self.assert_series_equal(result, expected) @@ -96,10 +93,10 @@ def test_str(self, dtype): def test_eq(self, dtype): assert dtype == dtype.name - assert dtype != 'anonther_type' + assert dtype != "anonther_type" def test_construct_from_string(self, dtype): dtype_instance = dtype.__class__.construct_from_string(dtype.name) assert isinstance(dtype_instance, dtype.__class__) with pytest.raises(TypeError): - dtype.__class__.construct_from_string('another_type') + dtype.__class__.construct_from_string("another_type") diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 6a5507b51b3ba..e02586eacfea7 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -19,8 +19,7 @@ def test_iloc_series(self, data): self.assert_series_equal(result, expected) def test_iloc_frame(self, data): - df = pd.DataFrame({"A": data, 'B': - np.arange(len(data), dtype='int64')}) + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame @@ -31,7 +30,7 @@ def test_iloc_frame(self, data): result = df.iloc[[0, 1, 2, 3], [0]] self.assert_frame_equal(result, expected) - expected = pd.Series(data[:4], name='A') + expected = pd.Series(data[:4], name="A") # slice -> series result = df.iloc[:4, 0] @@ -51,26 +50,25 @@ def test_loc_series(self, data): self.assert_series_equal(result, expected) def test_loc_frame(self, data): - df = pd.DataFrame({"A": data, - 'B': np.arange(len(data), dtype='int64')}) + df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")}) expected = pd.DataFrame({"A": data[:4]}) # slice -> frame - result = df.loc[:3, ['A']] + result = df.loc[:3, ["A"]] self.assert_frame_equal(result, expected) # sequence -> frame - result = df.loc[[0, 1, 2, 3], ['A']] + result = df.loc[[0, 1, 2, 3], ["A"]] self.assert_frame_equal(result, expected) - expected = pd.Series(data[:4], name='A') + expected = pd.Series(data[:4], name="A") # slice -> series - result = df.loc[:3, 'A'] + result = df.loc[:3, "A"] self.assert_series_equal(result, expected) # sequence -> series - result = df.loc[:3, 'A'] + result = df.loc[:3, "A"] self.assert_series_equal(result, expected) def test_loc_iloc_frame_single_dtype(self, data): @@ -82,8 +80,9 @@ def test_loc_iloc_frame_single_dtype(self, data): 
result = df.loc[2] self.assert_series_equal(result, expected) - expected = pd.Series([data[-1]], index=["A"], name=len(data) - 1, - dtype=data.dtype) + expected = pd.Series( + [data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype + ) result = df.iloc[-1] self.assert_series_equal(result, expected) @@ -146,14 +145,14 @@ def test_get(self, data): assert s.get(-1) is None assert s.get(s.index.max() + 1) is None - s = pd.Series(data[:6], index=list('abcdef')) - assert s.get('c') == s.iloc[2] + s = pd.Series(data[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] - result = s.get(slice('b', 'd')) + result = s.get(slice("b", "d")) expected = s.iloc[[1, 2, 3]] self.assert_series_equal(result, expected) - result = s.get('Z') + result = s.get("Z") assert result is None assert s.get(4) == s.iloc[4] @@ -216,7 +215,7 @@ def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError): data.take([0, -2], fill_value=na_value, allow_fill=True) - @pytest.mark.parametrize('allow_fill', [True, False]) + @pytest.mark.parametrize("allow_fill", [True, False]) def test_take_out_of_bounds_raises(self, data, allow_fill): arr = data[:3] with pytest.raises(IndexError): @@ -227,7 +226,8 @@ def test_take_series(self, data): result = s.take([0, -1]) expected = pd.Series( data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype), - index=[0, len(data) - 1]) + index=[0, len(data) - 1], + ) self.assert_series_equal(result, expected) def test_reindex(self, data, na_value): @@ -239,15 +239,15 @@ def test_reindex(self, data, na_value): n = len(data) result = s.reindex([-1, 0, n]) expected = pd.Series( - data._from_sequence([na_value, data[0], na_value], - dtype=s.dtype), - index=[-1, 0, n]) + data._from_sequence([na_value, data[0], na_value], dtype=s.dtype), + index=[-1, 0, n], + ) self.assert_series_equal(result, expected) result = s.reindex([n, n + 1]) - expected = pd.Series(data._from_sequence([na_value, na_value], - dtype=s.dtype), - index=[n, n + 1]) + expected = pd.Series( + data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1] + ) self.assert_series_equal(result, expected) def test_reindex_non_na_fill_value(self, data_missing): diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index daeec5923888c..dc926d2ff6ab4 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -10,20 +10,18 @@ class BaseGroupbyTests(BaseExtensionTests): """Groupby-specific tests.""" def test_grouping_grouper(self, data_for_grouping): - df = pd.DataFrame({ - "A": ["B", "B", None, None, "A", "A", "B", "C"], - "B": data_for_grouping - }) + df = pd.DataFrame( + {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping} + ) gr1 = df.groupby("A").grouper.groupings[0] gr2 = df.groupby("B").grouper.groupings[0] tm.assert_numpy_array_equal(gr1.grouper, df.A.values) tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) - @pytest.mark.parametrize('as_index', [True, False]) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() _, index = pd.factorize(data_for_grouping, sort=True) @@ -36,8 +34,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): self.assert_frame_equal(result, 
expected) def test_groupby_extension_no_sort(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) @@ -47,44 +44,48 @@ def test_groupby_extension_no_sort(self, data_for_grouping): def test_groupby_extension_transform(self, data_for_grouping): valid = data_for_grouping[~data_for_grouping.isna()] - df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], - "B": valid}) + df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid}) result = df.groupby("B").A.transform(len) expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") self.assert_series_equal(result, expected) - def test_groupby_extension_apply( - self, data_for_grouping, groupby_apply_op): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) df.groupby("B").apply(groupby_apply_op) df.groupby("B").A.apply(groupby_apply_op) df.groupby("A").apply(groupby_apply_op) df.groupby("A").B.apply(groupby_apply_op) def test_groupby_apply_identity(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping}) - result = df.groupby('A').B.apply(lambda x: x.array) - expected = pd.Series([df.B.iloc[[0, 1, 6]].array, - df.B.iloc[[2, 3]].array, - df.B.iloc[[4, 5]].array, - df.B.iloc[[7]].array], - index=pd.Index([1, 2, 3, 4], name='A'), - name='B') + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("A").B.apply(lambda x: x.array) + expected = pd.Series( + [ + df.B.iloc[[0, 1, 6]].array, + df.B.iloc[[2, 3]].array, + df.B.iloc[[4, 5]].array, + df.B.iloc[[7]].array, + ], + index=pd.Index([1, 2, 3, 4], name="A"), + name="B", + ) self.assert_series_equal(result, expected) def test_in_numeric_groupby(self, data_for_grouping): - df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], - "B": data_for_grouping, - "C": [1, 1, 1, 1, 1, 1, 1, 1]}) + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1], + } + ) result = df.groupby("A").sum().columns if data_for_grouping.dtype._is_numeric: - expected = pd.Index(['B', 'C']) + expected = pd.Index(["B", "C"]) else: - expected = pd.Index(['C']) + expected = pd.Index(["C"]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index fd47ae6f31290..dee8021f5375f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -11,6 +11,7 @@ class BaseInterfaceTests(BaseExtensionTests): """Tests that the basic interface is satisfied.""" + # ------------------------------------------------------------------------ # Interface # ------------------------------------------------------------------------ @@ -47,8 +48,8 @@ def test_is_extension_array_dtype(self, data): def test_no_values_attribute(self, data): # GH-20735: EA's with .values attribute give problems with internal # code, disallowing this for now until solved - assert not hasattr(data, 'values') - assert not hasattr(data, '_values') + assert not hasattr(data, "values") + assert not hasattr(data, "_values") def test_is_numeric_honored(self, data): result = pd.Series(data) @@ -59,10 +60,10 @@ def 
test_isna_extension_array(self, data_missing): # _reduce. At the *very* least, you must implement any and all na = data_missing.isna() if is_extension_array_dtype(na): - assert na._reduce('any') + assert na._reduce("any") assert na.any() - assert not na._reduce('all') + assert not na._reduce("all") assert not na.all() assert na.dtype._is_boolean diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 22787c38b66fb..3de752a8c682a 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -9,15 +9,12 @@ class BaseParsingTests(BaseExtensionTests): - - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - df = pd.DataFrame({ - 'with_dtype': pd.Series(data, dtype=str(data.dtype)) - }) + df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) csv_output = df.to_csv(index=False, na_rep=np.nan) - result = pd.read_csv(StringIO(csv_output), dtype={ - 'with_dtype': str(data.dtype) - }, engine=engine) + result = pd.read_csv( + StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine + ) expected = df self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 9b154a8afeabc..6d47b0c1d1f77 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -11,7 +11,7 @@ class BaseMethodsTests(BaseExtensionTests): """Various Series and DataFrame methods.""" - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: @@ -20,14 +20,13 @@ def test_value_counts(self, all_data, dropna): other = all_data result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts( - dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() self.assert_series_equal(result, expected) def test_count(self, data_missing): df = pd.DataFrame({"A": data_missing}) - result = df.count(axis='columns') + result = df.count(axis="columns") expected = pd.Series([0, 1]) self.assert_series_equal(result, expected) @@ -60,16 +59,19 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) - @pytest.mark.parametrize('na_position, expected', [ - ('last', np.array([2, 0, 1], dtype=np.dtype('intp'))), - ('first', np.array([1, 2, 0], dtype=np.dtype('intp'))) - ]) + @pytest.mark.parametrize( + "na_position, expected", + [ + ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))), + ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))), + ], + ) def test_nargsort(self, data_missing_for_sorting, na_position, expected): # GH 25439 result = nargsort(data_missing_for_sorting, na_position=na_position) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): ser = pd.Series(data_for_sorting) result = ser.sort_values(ascending=ascending) @@ -79,7 +81,7 @@ def test_sort_values(self, data_for_sorting, ascending): self.assert_series_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def 
test_sort_values_missing(self, data_missing_for_sorting, ascending): ser = pd.Series(data_missing_for_sorting) result = ser.sort_values(ascending=ascending) @@ -89,18 +91,17 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending): expected = ser.iloc[[0, 2, 1]] self.assert_series_equal(result, expected) - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_frame(self, data_for_sorting, ascending): - df = pd.DataFrame({"A": [1, 2, 1], - "B": data_for_sorting}) - result = df.sort_values(['A', 'B']) - expected = pd.DataFrame({"A": [1, 1, 2], - 'B': data_for_sorting.take([2, 0, 1])}, - index=[2, 0, 1]) + df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting}) + result = df.sort_values(["A", "B"]) + expected = pd.DataFrame( + {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1] + ) self.assert_frame_equal(result, expected) - @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): duplicated = box(data._from_sequence([data[0], data[0]])) @@ -110,19 +111,18 @@ def test_unique(self, data, box, method): assert isinstance(result, type(data)) assert result[0] == duplicated[0] - @pytest.mark.parametrize('na_sentinel', [-1, -2]) + @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - labels, uniques = pd.factorize(data_for_grouping, - na_sentinel=na_sentinel) - expected_labels = np.array([0, 0, na_sentinel, - na_sentinel, 1, 1, 0, 2], - dtype=np.intp) + labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_labels = np.array( + [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp + ) expected_uniques = data_for_grouping.take([0, 4, 7]) tm.assert_numpy_array_equal(labels, expected_labels) self.assert_extension_array_equal(uniques, expected_uniques) - @pytest.mark.parametrize('na_sentinel', [-1, -2]) + @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) @@ -169,8 +169,9 @@ def test_combine_le(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) - expected = pd.Series([a <= b for (a, b) in - zip(list(orig_data1), list(orig_data2))]) + expected = pd.Series( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) self.assert_series_equal(result, expected) val = s1.iloc[0] @@ -184,17 +185,19 @@ def test_combine_add(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 + x2) - with np.errstate(over='ignore'): + with np.errstate(over="ignore"): expected = pd.Series( - orig_data1._from_sequence([a + b for (a, b) in - zip(list(orig_data1), - list(orig_data2))])) + orig_data1._from_sequence( + [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))] + ) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 + x2) expected = pd.Series( - orig_data1._from_sequence([a + val for a in list(orig_data1)])) + orig_data1._from_sequence([a + val for a in list(orig_data1)]) + ) 
self.assert_series_equal(result, expected) def test_combine_first(self, data): @@ -205,24 +208,22 @@ def test_combine_first(self, data): expected = pd.Series(data[:5]) self.assert_series_equal(result, expected) - @pytest.mark.parametrize('frame', [True, False]) - @pytest.mark.parametrize('periods, indices', [ - (-2, [2, 3, 4, -1, -1]), - (0, [0, 1, 2, 3, 4]), - (2, [-1, -1, 0, 1, 2]), - ]) + @pytest.mark.parametrize("frame", [True, False]) + @pytest.mark.parametrize( + "periods, indices", + [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], + ) def test_container_shift(self, data, frame, periods, indices): # https://github.com/pandas-dev/pandas/issues/22386 subset = data[:5] - data = pd.Series(subset, name='A') - expected = pd.Series(subset.take(indices, allow_fill=True), name='A') + data = pd.Series(subset, name="A") + expected = pd.Series(subset.take(indices, allow_fill=True), name="A") if frame: - result = data.to_frame(name='A').assign(B=1).shift(periods) - expected = pd.concat([ - expected, - pd.Series([1] * 5, name='B').shift(periods) - ], axis=1) + result = data.to_frame(name="A").assign(B=1).shift(periods) + expected = pd.concat( + [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1 + ) compare = self.assert_frame_equal else: result = data.shift(periods) @@ -230,13 +231,10 @@ def test_container_shift(self, data, frame, periods, indices): compare(result, expected) - @pytest.mark.parametrize('periods, indices', [ - [-4, [-1, -1]], - [-1, [1, -1]], - [0, [0, 1]], - [1, [-1, 0]], - [4, [-1, -1]] - ]) + @pytest.mark.parametrize( + "periods, indices", + [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]], + ) def test_shift_non_empty_array(self, data, periods, indices): # https://github.com/pandas-dev/pandas/issues/23911 subset = data[:2] @@ -244,9 +242,7 @@ def test_shift_non_empty_array(self, data, periods, indices): expected = subset.take(indices, allow_fill=True) self.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('periods', [ - -4, -1, 0, 1, 4 - ]) + @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4]) def test_shift_empty_array(self, data, periods): # https://github.com/pandas-dev/pandas/issues/23911 empty = data[:0] @@ -307,15 +303,16 @@ def test_where_series(self, data, na_value, as_frame): cond = np.array([True, True, False, False]) if as_frame: - ser = ser.to_frame(name='a') + ser = ser.to_frame(name="a") cond = cond.reshape(-1, 1) result = ser.where(cond) - expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], - dtype=data.dtype)) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype) + ) if as_frame: - expected = expected.to_frame(name='a') + expected = expected.to_frame(name="a") self.assert_equal(result, expected) # array other @@ -325,10 +322,9 @@ def test_where_series(self, data, na_value, as_frame): other = pd.DataFrame({"a": other}) cond = pd.DataFrame({"a": cond}) result = ser.where(cond, other) - expected = pd.Series(cls._from_sequence([a, b, b, b], - dtype=data.dtype)) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) if as_frame: - expected = expected.to_frame(name='a') + expected = expected.to_frame(name="a") self.assert_equal(result, expected) @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) @@ -347,11 +343,15 @@ def test_repeat(self, data, repeats, as_series, use_numpy): self.assert_equal(result, expected) - @pytest.mark.parametrize('repeats, kwargs, error, msg', [ - (2, dict(axis=1), ValueError, 
"'axis"), - (-1, dict(), ValueError, "negative"), - ([1, 2], dict(), ValueError, "shape"), - (2, dict(foo='bar'), TypeError, "'foo'")]) + @pytest.mark.parametrize( + "repeats, kwargs, error, msg", + [ + (2, dict(axis=1), ValueError, "'axis"), + (-1, dict(), ValueError, "negative"), + ([1, 2], dict(), ValueError, "shape"), + (2, dict(foo="bar"), TypeError, "'foo'"), + ], + ) def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy): with pytest.raises(error, match=msg): if use_numpy: diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 834f49f0461f0..21bbb365ab0f3 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -42,13 +42,12 @@ def test_dropna_frame(self, data_missing): self.assert_frame_equal(result, expected) # axis = 1 - result = df.dropna(axis='columns') + result = df.dropna(axis="columns") expected = pd.DataFrame(index=[0, 1]) self.assert_frame_equal(result, expected) # multiple - df = pd.DataFrame({"A": data_missing, - "B": [1, np.nan]}) + df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]}) result = df.dropna() expected = df.iloc[:0] self.assert_frame_equal(result, expected) @@ -61,13 +60,13 @@ def test_fillna_scalar(self, data_missing): def test_fillna_limit_pad(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) - result = pd.Series(arr).fillna(method='ffill', limit=2) + result = pd.Series(arr).fillna(method="ffill", limit=2) expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) self.assert_series_equal(result, expected) def test_fillna_limit_backfill(self, data_missing): arr = data_missing.take([1, 0, 0, 0, 1]) - result = pd.Series(arr).fillna(method='backfill', limit=2) + result = pd.Series(arr).fillna(method="backfill", limit=2) expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) self.assert_series_equal(result, expected) @@ -76,8 +75,11 @@ def test_fillna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.fillna(fill_value) - expected = pd.Series(data_missing._from_sequence( - [fill_value, fill_value], dtype=data_missing.dtype)) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) self.assert_series_equal(result, expected) # Fill with a series @@ -91,40 +93,37 @@ def test_fillna_series(self, data_missing): def test_fillna_series_method(self, data_missing, fillna_method): fill_value = data_missing[1] - if fillna_method == 'ffill': + if fillna_method == "ffill": data_missing = data_missing[::-1] result = pd.Series(data_missing).fillna(method=fillna_method) - expected = pd.Series(data_missing._from_sequence( - [fill_value, fill_value], dtype=data_missing.dtype)) + expected = pd.Series( + data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ) + ) self.assert_series_equal(result, expected) def test_fillna_frame(self, data_missing): fill_value = data_missing[1] - result = pd.DataFrame({ - "A": data_missing, - "B": [1, 2] - }).fillna(fill_value) + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) - expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value], - dtype=data_missing.dtype), - "B": [1, 2], - }) + expected = pd.DataFrame( + { + "A": data_missing._from_sequence( + [fill_value, fill_value], dtype=data_missing.dtype + ), + "B": [1, 2], + } + ) self.assert_frame_equal(result, expected) def test_fillna_fill_other(self, data): - result = pd.DataFrame({ - "A": data, - "B": [np.nan] * 
len(data) - }).fillna({"B": 0.0}) - - expected = pd.DataFrame({ - "A": data, - "B": [0.0] * len(result), - }) + result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0}) + + expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 708eb9c7c8c43..e35464964f432 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -9,9 +9,8 @@ class BaseOpsUtil(BaseExtensionTests): - def get_op_from_name(self, op_name): - short_opname = op_name.strip('_') + short_opname = op_name.strip("_") try: op = getattr(operator, short_opname) except AttributeError: @@ -61,6 +60,7 @@ class BaseArithmeticOpsTests(BaseOpsUtil): * series_array_exc = TypeError * divmod_exc = TypeError """ + series_scalar_exc = TypeError frame_scalar_exc = TypeError series_array_exc = TypeError @@ -76,15 +76,16 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op_name = all_arithmetic_operators - df = pd.DataFrame({'A': data}) + df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0], exc=self.frame_scalar_exc) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators s = pd.Series(data) - self.check_opname(s, op_name, pd.Series([s.iloc[0]] * len(s)), - exc=self.series_array_exc) + self.check_opname( + s, op_name, pd.Series([s.iloc[0]] * len(s)), exc=self.series_array_exc + ) def test_divmod(self, data): s = pd.Series(data) @@ -117,7 +118,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # EAs should return NotImplemented for ops with Series. # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) - if hasattr(data, '__add__'): + if hasattr(data, "__add__"): result = data.__add__(other) assert result is NotImplemented else: @@ -131,10 +132,10 @@ class BaseComparisonOpsTests(BaseOpsUtil): def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) - if op_name == '__eq__': + if op_name == "__eq__": assert getattr(data, op_name)(other) is NotImplemented assert not op(s, other).all() - elif op_name == '__ne__': + elif op_name == "__ne__": assert getattr(data, op_name)(other) is NotImplemented assert op(s, other).all() @@ -163,7 +164,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # EAs should return NotImplemented for ops with Series. # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) - if hasattr(data, '__eq__'): + if hasattr(data, "__eq__"): result = data.__eq__(other) assert result is NotImplemented else: diff --git a/pandas/tests/extension/base/printing.py b/pandas/tests/extension/base/printing.py index 8b33ce173c786..0f10efbf32a49 100644 --- a/pandas/tests/extension/base/printing.py +++ b/pandas/tests/extension/base/printing.py @@ -19,10 +19,10 @@ def test_array_repr(self, data, size): result = repr(data) assert data.__class__.__name__ in result - assert 'Length: {}'.format(len(data)) in result + assert "Length: {}".format(len(data)) in result assert str(data.dtype) in result - if size == 'big': - assert '...' in result + if size == "big": + assert "..." 
in result def test_array_repr_unicode(self, data): result = str(data) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c4b70f2013265..8766bb771f8a2 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -13,16 +13,17 @@ class BaseReduceTests(BaseExtensionTests): Reduction specific tests. Generally these only make sense for numeric/boolean operations. """ + def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) - expected = getattr(s.astype('float64'), op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) tm.assert_almost_equal(result, expected) class BaseNoReduceTests(BaseReduceTests): """ we don't define any reductions """ - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) @@ -30,7 +31,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): with pytest.raises(TypeError): getattr(s, op_name)(skipna=skipna) - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions s = pd.Series(data) @@ -40,8 +41,7 @@ def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): class BaseNumericReduceTests(BaseReduceTests): - - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions s = pd.Series(data) @@ -53,8 +53,7 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna): class BaseBooleanReduceTests(BaseReduceTests): - - @pytest.mark.parametrize('skipna', [True, False]) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions s = pd.Series(data) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 4ea78a4239e6e..90e607343297d 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -11,7 +11,8 @@ class BaseReshapingTests(BaseExtensionTests): """Tests for reshaping and concatenation.""" - @pytest.mark.parametrize('in_frame', [True, False]) + + @pytest.mark.parametrize("in_frame", [True, False]) def test_concat(self, data, in_frame): wrapped = pd.Series(data) if in_frame: @@ -28,7 +29,7 @@ def test_concat(self, data, in_frame): assert dtype == data.dtype assert isinstance(result._data.blocks[0], ExtensionBlock) - @pytest.mark.parametrize('in_frame', [True, False]) + @pytest.mark.parametrize("in_frame", [True, False]) def test_concat_all_na_block(self, data_missing, in_frame): valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1]) na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3]) @@ -45,9 +46,9 @@ def test_concat_all_na_block(self, data_missing, in_frame): def test_concat_mixed_dtypes(self, data): # https://github.com/pandas-dev/pandas/issues/20762 - df1 = pd.DataFrame({'A': data[:3]}) + df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"A": [1, 2, 3]}) - df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category') + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") dfs = [df1, df2, df3] # 
dataframes @@ -56,40 +57,41 @@ def test_concat_mixed_dtypes(self, data): self.assert_frame_equal(result, expected) # series - result = pd.concat([x['A'] for x in dfs]) - expected = pd.concat([x['A'].astype(object) for x in dfs]) + result = pd.concat([x["A"] for x in dfs]) + expected = pd.concat([x["A"].astype(object) for x in dfs]) self.assert_series_equal(result, expected) # simple test for just EA and one other result = pd.concat([df1, df2]) - expected = pd.concat([df1.astype('object'), df2.astype('object')]) + expected = pd.concat([df1.astype("object"), df2.astype("object")]) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['A']]) - expected = pd.concat([df1['A'].astype('object'), - df2['A'].astype('object')]) + result = pd.concat([df1["A"], df2["A"]]) + expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) self.assert_series_equal(result, expected) def test_concat_columns(self, data, na_value): - df1 = pd.DataFrame({'A': data[:3]}) - df2 = pd.DataFrame({'B': [1, 2, 3]}) + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}) - expected = pd.DataFrame({'A': data[:3], 'B': [1, 2, 3]}) + expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]}) result = pd.concat([df1, df2], axis=1) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['B']], axis=1) + result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) # non-aligned - df2 = pd.DataFrame({'B': [1, 2, 3]}, index=[1, 2, 3]) - expected = pd.DataFrame({ - 'A': data._from_sequence(list(data[:3]) + [na_value], - dtype=data.dtype), - 'B': [np.nan, 1, 2, 3]}) + df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3]) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": [np.nan, 1, 2, 3], + } + ) result = pd.concat([df1, df2], axis=1) self.assert_frame_equal(result, expected) - result = pd.concat([df1['A'], df2['B']], axis=1) + result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) def test_align(self, data, na_value): @@ -98,80 +100,88 @@ def test_align(self, data, na_value): r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.Series(data._from_sequence(list(a) + [na_value], - dtype=data.dtype)) - e2 = pd.Series(data._from_sequence([na_value] + list(b), - dtype=data.dtype)) + e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype)) + e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype)) self.assert_series_equal(r1, e1) self.assert_series_equal(r2, e2) def test_align_frame(self, data, na_value): a = data[:3] b = data[2:5] - r1, r2 = pd.DataFrame({'A': a}).align( - pd.DataFrame({'A': b}, index=[1, 2, 3]) - ) + r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3])) # Assumes that the ctor can take a list of scalars of the type - e1 = pd.DataFrame({'A': data._from_sequence(list(a) + [na_value], - dtype=data.dtype)}) - e2 = pd.DataFrame({'A': data._from_sequence([na_value] + list(b), - dtype=data.dtype)}) + e1 = pd.DataFrame( + {"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)} + ) + e2 = pd.DataFrame( + {"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)} + ) self.assert_frame_equal(r1, e1) self.assert_frame_equal(r2, e2) def test_align_series_frame(self, data, na_value): # https://github.com/pandas-dev/pandas/issues/20576 - ser = pd.Series(data, 
name='a') + ser = pd.Series(data, name="a") df = pd.DataFrame({"col": np.arange(len(ser) + 1)}) r1, r2 = ser.align(df) - e1 = pd.Series(data._from_sequence(list(data) + [na_value], - dtype=data.dtype), - name=ser.name) + e1 = pd.Series( + data._from_sequence(list(data) + [na_value], dtype=data.dtype), + name=ser.name, + ) self.assert_series_equal(r1, e1) self.assert_frame_equal(r2, df) def test_set_frame_expand_regular_with_extension(self, data): df = pd.DataFrame({"A": [1] * len(data)}) - df['B'] = data + df["B"] = data expected = pd.DataFrame({"A": [1] * len(data), "B": data}) self.assert_frame_equal(df, expected) def test_set_frame_expand_extension_with_regular(self, data): - df = pd.DataFrame({'A': data}) - df['B'] = [1] * len(data) + df = pd.DataFrame({"A": data}) + df["B"] = [1] * len(data) expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) self.assert_frame_equal(df, expected) def test_set_frame_overwrite_object(self, data): # https://github.com/pandas-dev/pandas/issues/20555 df = pd.DataFrame({"A": [1] * len(data)}, dtype=object) - df['A'] = data - assert df.dtypes['A'] == data.dtype + df["A"] = data + assert df.dtypes["A"] == data.dtype def test_merge(self, data, na_value): # GH-20743 - df1 = pd.DataFrame({'ext': data[:3], 'int1': [1, 2, 3], - 'key': [0, 1, 2]}) - df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) + df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]}) + df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]}) res = pd.merge(df1, df2) exp = pd.DataFrame( - {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], - 'ext': data._from_sequence([data[0], data[0], data[1]], - dtype=data.dtype)}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + { + "int1": [1, 1, 2], + "int2": [1, 2, 3], + "key": [0, 0, 1], + "ext": data._from_sequence( + [data[0], data[0], data[1]], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) - res = pd.merge(df1, df2, how='outer') + res = pd.merge(df1, df2, how="outer") exp = pd.DataFrame( - {'int1': [1, 1, 2, 3, np.nan], 'int2': [1, 2, 3, np.nan, 4], - 'key': [0, 0, 1, 2, 3], - 'ext': data._from_sequence( - [data[0], data[0], data[1], data[2], na_value], - dtype=data.dtype)}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + { + "int1": [1, 1, 2, 3, np.nan], + "int2": [1, 2, 3, np.nan, 4], + "key": [0, 0, 1, 2, 3], + "ext": data._from_sequence( + [data[0], data[0], data[1], data[2], na_value], dtype=data.dtype + ), + } + ) + self.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]]) def test_merge_on_extension_array(self, data): # GH 23020 @@ -179,14 +189,12 @@ def test_merge_on_extension_array(self, data): key = type(data)._from_sequence([a, b], dtype=data.dtype) df = pd.DataFrame({"key": key, "val": [1, 2]}) - result = pd.merge(df, df, on='key') - expected = pd.DataFrame({"key": key, - "val_x": [1, 2], - "val_y": [1, 2]}) + result = pd.merge(df, df, on="key") + expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]}) self.assert_frame_equal(result, expected) # order - result = pd.merge(df.iloc[[1, 0]], df, on='key') + result = pd.merge(df.iloc[[1, 0]], df, on="key") expected = expected.iloc[[1, 0]].reset_index(drop=True) self.assert_frame_equal(result, expected) @@ -197,19 +205,25 @@ def test_merge_on_extension_array_duplicates(self, data): df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]}) - result = pd.merge(df1, df2, on='key') - 
expected = pd.DataFrame({ - "key": key.take([0, 0, 0, 0, 1]), - "val_x": [1, 1, 3, 3, 2], - "val_y": [1, 3, 1, 3, 2], - }) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame( + { + "key": key.take([0, 0, 0, 0, 1]), + "val_x": [1, 1, 3, 3, 2], + "val_y": [1, 3, 1, 3, 2], + } + ) self.assert_frame_equal(result, expected) - @pytest.mark.parametrize("columns", [ - ["A", "B"], - pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b')], - names=['outer', 'inner']), - ]) + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) def test_stack(self, data, columns): df = pd.DataFrame({"A": data[:5], "B": data[:5]}) df.columns = columns @@ -227,27 +241,29 @@ def test_stack(self, data, columns): result = result.astype(object) self.assert_equal(result, expected) - @pytest.mark.parametrize("index", [ - # Two levels, uniform. - pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), - names=['a', 'b']), - - # non-uniform - pd.MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('B', 'b')]), - - # three levels, non-uniform - pd.MultiIndex.from_product([('A', 'B'), ('a', 'b', 'c'), (0, 1, 2)]), - pd.MultiIndex.from_tuples([ - ('A', 'a', 1), - ('A', 'b', 0), - ('A', 'a', 0), - ('B', 'a', 0), - ('B', 'c', 1), - ]), - ]) + @pytest.mark.parametrize( + "index", + [ + # Two levels, uniform. + pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]), + # non-uniform + pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]), + # three levels, non-uniform + pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]), + pd.MultiIndex.from_tuples( + [ + ("A", "a", 1), + ("A", "b", 0), + ("A", "a", 0), + ("B", "a", 0), + ("B", "c", 1), + ] + ), + ], + ) @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, data, index, obj): - data = data[:len(index)] + data = data[: len(index)] if obj == "series": ser = pd.Series(data, index=index) else: @@ -263,8 +279,9 @@ def test_unstack(self, data, index, obj): for level in combinations: result = ser.unstack(level=level) - assert all(isinstance(result[col].array, type(data)) - for col in result.columns) + assert all( + isinstance(result[col].array, type(data)) for col in result.columns + ) expected = ser.astype(object).unstack(level=level) result = result.astype(object) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index db6328e39e6cc..bb6bb02b462e2 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -31,13 +31,13 @@ def test_setitem_sequence_mismatched_length_raises(self, data, as_array): if as_array: value = data._from_sequence(value) - xpr = 'cannot set using a {} indexer with a different length' - with pytest.raises(ValueError, match=xpr.format('list-like')): + xpr = "cannot set using a {} indexer with a different length" + with pytest.raises(ValueError, match=xpr.format("list-like")): ser[[0, 1]] = value # Ensure no modifications made before the exception self.assert_series_equal(ser, original) - with pytest.raises(ValueError, match=xpr.format('slice')): + with pytest.raises(ValueError, match=xpr.format("slice")): ser[slice(3)] = value self.assert_series_equal(ser, original) @@ -55,7 +55,7 @@ def test_setitem_sequence_broadcasts(self, data, box_in_series): assert data[0] == data[2] assert data[1] == data[2] - @pytest.mark.parametrize('setter', ['loc', 'iloc']) + @pytest.mark.parametrize("setter", ["loc", "iloc"]) def 
test_setitem_scalar(self, data, setter): arr = pd.Series(data) setter = getattr(arr, setter) @@ -64,36 +64,36 @@ def test_setitem_scalar(self, data, setter): def test_setitem_loc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - df.loc[0, 'B'] = data[1] - assert df.loc[0, 'B'] == data[1] + df.loc[0, "B"] = data[1] + assert df.loc[0, "B"] == data[1] def test_setitem_loc_scalar_single(self, data): df = pd.DataFrame({"B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_loc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - df.loc[10, 'B'] = data[1] - assert df.loc[10, 'B'] == data[1] + df.loc[10, "B"] = data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_iloc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) df.iloc[0, 1] = data[1] - assert df.loc[0, 'B'] == data[1] + assert df.loc[0, "B"] == data[1] def test_setitem_iloc_scalar_single(self, data): df = pd.DataFrame({"B": data}) df.iloc[10, 0] = data[1] - assert df.loc[10, 'B'] == data[1] + assert df.loc[10, "B"] == data[1] def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) df.iloc[10, 1] = data[1] - assert df.loc[10, 'B'] == data[1] + assert df.loc[10, "B"] == data[1] - @pytest.mark.parametrize('as_callable', [True, False]) - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("as_callable", [True, False]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_aligned(self, data, as_callable, setter): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) @@ -117,13 +117,13 @@ def test_setitem_mask_aligned(self, data, as_callable, setter): assert ser[0] == data[5] assert ser[1] == data[6] - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_broadcast(self, data, setter): ser = pd.Series(data) mask = np.zeros(len(data), dtype=bool) mask[:2] = True - if setter: # loc + if setter: # loc target = getattr(ser, setter) else: # __setitem__ target = ser @@ -135,35 +135,35 @@ def test_setitem_mask_broadcast(self, data, setter): def test_setitem_expand_columns(self, data): df = pd.DataFrame({"A": data}) result = df.copy() - result['B'] = 1 + result["B"] = 1 expected = pd.DataFrame({"A": data, "B": [1] * len(data)}) self.assert_frame_equal(result, expected) result = df.copy() - result.loc[:, 'B'] = 1 + result.loc[:, "B"] = 1 self.assert_frame_equal(result, expected) # overwrite with new type - result['B'] = data + result["B"] = data expected = pd.DataFrame({"A": data, "B": data}) self.assert_frame_equal(result, expected) def test_setitem_expand_with_extension(self, data): df = pd.DataFrame({"A": [1] * len(data)}) result = df.copy() - result['B'] = data + result["B"] = data expected = pd.DataFrame({"A": [1] * len(data), "B": data}) self.assert_frame_equal(result, expected) result = df.copy() - result.loc[:, 'B'] = data + result.loc[:, "B"] = data self.assert_frame_equal(result, expected) def test_setitem_frame_invalid_length(self, data): df = pd.DataFrame({"A": [1] * len(data)}) xpr = "Length of values does not match length of index" with pytest.raises(ValueError, match=xpr): - df['B'] = data[:5] + df["B"] = data[:5] @pytest.mark.xfail(reason="GH#20441: setitem on extension types.") def test_setitem_tuple_index(self, data): diff --git 
a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 6fbd43e46495f..d37638d37e4d6 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -33,12 +33,12 @@ def data_missing(): raise NotImplementedError -@pytest.fixture(params=['data', 'data_missing']) +@pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture giving 'data' and 'data_missing'""" - if request.param == 'data': + if request.param == "data": return data - elif request.param == 'data_missing': + elif request.param == "data_missing": return data_missing @@ -57,9 +57,11 @@ def data_repeated(data): A callable that takes a `count` argument and returns a generator yielding `count` datasets. """ + def gen(count): for _ in range(count): yield data + return gen @@ -118,12 +120,15 @@ def box_in_series(request): return request.param -@pytest.fixture(params=[ - lambda x: 1, - lambda x: [1] * len(x), - lambda x: Series([1] * len(x)), - lambda x: x, -], ids=['scalar', 'list', 'series', 'object']) +@pytest.fixture( + params=[ + lambda x: 1, + lambda x: [1] * len(x), + lambda x: Series([1] * len(x)), + lambda x: x, + ], + ids=["scalar", "list", "series", "object"], +) def groupby_apply_op(request): """ Functions to test groupby.apply(). @@ -156,7 +161,7 @@ def use_numpy(request): return request.param -@pytest.fixture(params=['ffill', 'bfill']) +@pytest.fixture(params=["ffill", "bfill"]) def fillna_method(request): """ Parametrized fixture giving method parameters 'ffill' and 'bfill' for diff --git a/pandas/tests/extension/decimal/__init__.py b/pandas/tests/extension/decimal/__init__.py index 7c48e7e71503e..8194327f8812e 100644 --- a/pandas/tests/extension/decimal/__init__.py +++ b/pandas/tests/extension/decimal/__init__.py @@ -1,3 +1,3 @@ from .array import DecimalArray, DecimalDtype, make_data, to_decimal -__all__ = ['DecimalArray', 'DecimalDtype', 'to_decimal', 'make_data'] +__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index d097a599730b8..90e6a91fbb91a 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -15,15 +15,15 @@ @register_extension_dtype class DecimalDtype(ExtensionDtype): type = decimal.Decimal - name = 'decimal' - na_value = decimal.Decimal('NaN') - _metadata = ('context',) + name = "decimal" + na_value = decimal.Decimal("NaN") + _metadata = ("context",) def __init__(self, context=None): self.context = context or decimal.getcontext() def __repr__(self): - return 'DecimalDtype(context={})'.format(self.context) + return "DecimalDtype(context={})".format(self.context) @classmethod def construct_array_type(cls): @@ -40,8 +40,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) @property def _is_numeric(self): @@ -54,8 +53,7 @@ class DecimalArray(ExtensionArray, ExtensionScalarOpsMixin): def __init__(self, values, dtype=None, copy=False, context=None): for val in values: if not isinstance(val, decimal.Decimal): - raise TypeError("All values must be of type " + - str(decimal.Decimal)) + raise TypeError("All values must be of type " + str(decimal.Decimal)) values = np.asarray(values, dtype=object) self._data = values @@ -77,8 +75,7 @@ def _from_sequence(cls, 
scalars, dtype=None, copy=False): @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence([decimal.Decimal(x) for x in strings], - dtype, copy) + return cls._from_sequence([decimal.Decimal(x) for x in strings], dtype, copy) @classmethod def _from_factorized(cls, values, original): @@ -88,12 +85,12 @@ def _from_factorized(cls, values, original): def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # - if not all(isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) - for t in inputs): + if not all( + isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs + ): return NotImplemented - inputs = tuple(x._data if isinstance(x, DecimalArray) else x - for x in inputs) + inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs) result = getattr(ufunc, method)(*inputs, **kwargs) def reconstruct(x): @@ -120,8 +117,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indexer, fill_value=fill_value, - allow_fill=allow_fill) + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) def copy(self): @@ -156,7 +152,7 @@ def isna(self): @property def _na_value(self): - return decimal.Decimal('NaN') + return decimal.Decimal("NaN") def _formatter(self, boxed=False): if boxed: @@ -175,8 +171,9 @@ def _reduce(self, name, skipna=True, **kwargs): try: op = getattr(self.data, name) except AttributeError: - raise NotImplementedError("decimal does not support " - "the {} operation".format(name)) + raise NotImplementedError( + "decimal does not support " "the {} operation".format(name) + ) return op(axis=0) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 80885e4045e64..272936f6ec9f0 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -29,21 +29,21 @@ def data_for_twos(): @pytest.fixture def data_missing(): - return DecimalArray([decimal.Decimal('NaN'), decimal.Decimal(1)]) + return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)]) @pytest.fixture def data_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('2'), - decimal.Decimal('0')]) + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("2"), decimal.Decimal("0")] + ) @pytest.fixture def data_missing_for_sorting(): - return DecimalArray([decimal.Decimal('1'), - decimal.Decimal('NaN'), - decimal.Decimal('0')]) + return DecimalArray( + [decimal.Decimal("1"), decimal.Decimal("NaN"), decimal.Decimal("0")] + ) @pytest.fixture @@ -58,15 +58,14 @@ def na_value(): @pytest.fixture def data_for_grouping(): - b = decimal.Decimal('1.0') - a = decimal.Decimal('0.0') - c = decimal.Decimal('2.0') - na = decimal.Decimal('NaN') + b = decimal.Decimal("1.0") + a = decimal.Decimal("0.0") + c = decimal.Decimal("2.0") + na = decimal.Decimal("NaN") return DecimalArray([b, b, na, na, a, a, b, c]) class BaseDecimal: - def assert_series_equal(self, left, right, *args, **kwargs): def convert(x): # need to convert array([Decimal(NaN)], dtype='object') to np.NaN @@ -77,35 +76,34 @@ def convert(x): except TypeError: return False - if left.dtype == 'object': + if left.dtype == "object": left_na = left.apply(convert) else: left_na = left.isna() - if right.dtype == 'object': + if right.dtype == "object": right_na = right.apply(convert) else: right_na = right.isna() 
tm.assert_series_equal(left_na, right_na) - return tm.assert_series_equal(left[~left_na], - right[~right_na], - *args, **kwargs) + return tm.assert_series_equal(left[~left_na], right[~right_na], *args, **kwargs) def assert_frame_equal(self, left, right, *args, **kwargs): # TODO(EA): select_dtypes tm.assert_index_equal( - left.columns, right.columns, - exact=kwargs.get('check_column_type', 'equiv'), - check_names=kwargs.get('check_names', True), - check_exact=kwargs.get('check_exact', False), - check_categorical=kwargs.get('check_categorical', True), - obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + ) - decimals = (left.dtypes == 'decimal').index + decimals = (left.dtypes == "decimal").index for col in decimals: - self.assert_series_equal(left[col], right[col], - *args, **kwargs) + self.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=decimals) right = right.drop(columns=decimals) @@ -122,7 +120,6 @@ class TestInterface(BaseDecimal, base.BaseInterfaceTests): class TestConstructors(BaseDecimal, base.BaseConstructorsTests): - @pytest.mark.skip(reason="not implemented constructor from dtype") def test_from_dtype(self, data): # construct from our dtype & string dtype @@ -134,14 +131,10 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - def test_take_na_value_other_decimal(self): - arr = DecimalArray([decimal.Decimal('1.0'), - decimal.Decimal('2.0')]) - result = arr.take([0, -1], allow_fill=True, - fill_value=decimal.Decimal('-1.0')) - expected = DecimalArray([decimal.Decimal('1.0'), - decimal.Decimal('-1.0')]) + arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) + result = arr.take([0, -1], allow_fill=True, fill_value=decimal.Decimal("-1.0")) + expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")]) self.assert_extension_array_equal(result, expected) @@ -150,10 +143,9 @@ class TestMissing(BaseDecimal, base.BaseMissingTests): class Reduce: - def check_reduce(self, s, op_name, skipna): - if skipna or op_name in ['median', 'skew', 'kurt']: + if skipna or op_name in ["median", "skew", "kurt"]: with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna) @@ -172,7 +164,7 @@ class TestBooleanReduce(Reduce, base.BaseBooleanReduceTests): class TestMethods(BaseDecimal, base.BaseMethodsTests): - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] @@ -192,9 +184,9 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): class TestGroupby(BaseDecimal, base.BaseGroupbyTests): - @pytest.mark.xfail( - reason="needs to correctly define __eq__ to handle nans, xref #27081.") + reason="needs to correctly define __eq__ to handle nans, xref #27081." 
+ ) def test_groupby_apply_identity(self, data_for_grouping): super().test_groupby_apply_identity(data_for_grouping) @@ -204,7 +196,6 @@ class TestSetitem(BaseDecimal, base.BaseSetitemTests): class TestPrinting(BaseDecimal, base.BasePrintingTests): - def test_series_repr(self, data): # Overriding this base test to explicitly test that # the custom _formatter is used @@ -214,36 +205,40 @@ def test_series_repr(self, data): # TODO(extension) -@pytest.mark.xfail(reason=( - "raising AssertionError as this is not implemented, " - "though easy enough to do")) +@pytest.mark.xfail( + reason=( + "raising AssertionError as this is not implemented, " "though easy enough to do" + ) +) def test_series_constructor_coerce_data_to_extension_dtype_raises(): - xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " - "extension array directly.") + xpr = ( + "Cannot cast data to extension dtype 'decimal'. Pass the " + "extension array directly." + ) with pytest.raises(ValueError, match=xpr): pd.Series([0, 1, 2], dtype=DecimalDtype()) def test_series_constructor_with_dtype(): - arr = DecimalArray([decimal.Decimal('10.0')]) + arr = DecimalArray([decimal.Decimal("10.0")]) result = pd.Series(arr, dtype=DecimalDtype()) expected = pd.Series(arr) tm.assert_series_equal(result, expected) - result = pd.Series(arr, dtype='int64') + result = pd.Series(arr, dtype="int64") expected = pd.Series([10]) tm.assert_series_equal(result, expected) def test_dataframe_constructor_with_dtype(): - arr = DecimalArray([decimal.Decimal('10.0')]) + arr = DecimalArray([decimal.Decimal("10.0")]) result = pd.DataFrame({"A": arr}, dtype=DecimalDtype()) expected = pd.DataFrame({"A": arr}) tm.assert_frame_equal(result, expected) - arr = DecimalArray([decimal.Decimal('10.0')]) - result = pd.DataFrame({"A": arr}, dtype='int64') + arr = DecimalArray([decimal.Decimal("10.0")]) + result = pd.DataFrame({"A": arr}, dtype="int64") expected = pd.DataFrame({"A": [10]}) tm.assert_frame_equal(result, expected) @@ -254,7 +249,7 @@ def test_astype_dispatches(frame): # gets all the way through to ExtensionArray.astype # Designing a reliable smoke test that works for arbitrary data types # is difficult. 
- data = pd.Series(DecimalArray([decimal.Decimal(2)]), name='a') + data = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a") ctx = decimal.Context() ctx.prec = 5 @@ -264,13 +259,12 @@ def test_astype_dispatches(frame): result = data.astype(DecimalDtype(ctx)) if frame: - result = result['a'] + result = result["a"] assert result.dtype.context.prec == ctx.prec class TestArithmeticOps(BaseDecimal, base.BaseArithmeticOpsTests): - def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -305,7 +299,6 @@ def test_error(self): class TestComparisonOps(BaseDecimal, base.BaseComparisonOpsTests): - def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -323,13 +316,13 @@ def test_compare_array(self, data, all_compare_operators): alter = np.random.choice([-1, 0, 1], len(data)) # Randomly double, halve or keep same value - other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) - for i in alter] + other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) for i in alter] self._compare_other(s, data, op_name, other) class DecimalArrayWithoutFromSequence(DecimalArray): """Helper class for testing error handling in _from_sequence.""" + def _from_sequence(cls, scalars, dtype=None, copy=False): raise KeyError("For the test") @@ -345,37 +338,38 @@ def _create_arithmetic_method(cls, op): def test_combine_from_sequence_raises(): # https://github.com/pandas-dev/pandas/issues/22850 - ser = pd.Series(DecimalArrayWithoutFromSequence([ - decimal.Decimal("1.0"), - decimal.Decimal("2.0") - ])) + ser = pd.Series( + DecimalArrayWithoutFromSequence( + [decimal.Decimal("1.0"), decimal.Decimal("2.0")] + ) + ) result = ser.combine(ser, operator.add) # note: object dtype - expected = pd.Series([decimal.Decimal("2.0"), - decimal.Decimal("4.0")], dtype="object") + expected = pd.Series( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("class_", [DecimalArrayWithoutFromSequence, - DecimalArrayWithoutCoercion]) +@pytest.mark.parametrize( + "class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion] +) def test_scalar_ops_from_sequence_raises(class_): # op(EA, EA) should return an EA, or an ndarray if it's not possible # to return an EA with the return values. 
- arr = class_([ - decimal.Decimal("1.0"), - decimal.Decimal("2.0") - ]) + arr = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")]) result = arr + arr - expected = np.array([decimal.Decimal("2.0"), decimal.Decimal("4.0")], - dtype="object") + expected = np.array( + [decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object" + ) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("reverse, expected_div, expected_mod", [ - (False, [0, 1, 1, 2], [1, 0, 1, 0]), - (True, [2, 1, 0, 0], [0, 0, 2, 2]), -]) +@pytest.mark.parametrize( + "reverse, expected_div, expected_mod", + [(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])], +) def test_divmod_array(reverse, expected_div, expected_mod): # https://github.com/pandas-dev/pandas/issues/22930 arr = to_decimal([1, 2, 3, 4]) @@ -403,10 +397,9 @@ class DecimalArray2(DecimalArray): def _formatting_values(self): return np.array(self) - ser = pd.Series(DecimalArray2([decimal.Decimal('1.0')])) + ser = pd.Series(DecimalArray2([decimal.Decimal("1.0")])) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): repr(ser) diff --git a/pandas/tests/extension/json/__init__.py b/pandas/tests/extension/json/__init__.py index f2679d087c841..e205c7ee50974 100644 --- a/pandas/tests/extension/json/__init__.py +++ b/pandas/tests/extension/json/__init__.py @@ -1,3 +1,3 @@ from .array import JSONArray, JSONDtype, make_data -__all__ = ['JSONArray', 'JSONDtype', 'make_data'] +__all__ = ["JSONArray", "JSONDtype", "make_data"] diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 1b5009830303b..ece1924b1b228 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -26,7 +26,7 @@ class JSONDtype(ExtensionDtype): type = abc.Mapping - name = 'json' + name = "json" na_value = UserDict() @classmethod @@ -44,8 +44,7 @@ def construct_from_string(cls, string): if string == cls.name: return cls() else: - raise TypeError("Cannot construct a '{}' from " - "'{}'".format(cls, string)) + raise TypeError("Cannot construct a '{}' from " "'{}'".format(cls, string)) class JSONArray(ExtensionArray): @@ -55,8 +54,7 @@ class JSONArray(ExtensionArray): def __init__(self, values, dtype=None, copy=False): for val in values: if not isinstance(val, self.dtype.type): - raise TypeError("All values must be of type " + - str(self.dtype.type)) + raise TypeError("All values must be of type " + str(self.dtype.type)) self.data = values # Some aliases for common attribute names to ensure pandas supports @@ -77,7 +75,7 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == 'bool': + elif isinstance(item, np.ndarray) and item.dtype == "bool": return self._from_sequence([x for x, m in zip(self, item) if m]) elif isinstance(item, abc.Iterable): # fancy indexing @@ -94,7 +92,7 @@ def __setitem__(self, key, value): # broadcast value value = itertools.cycle([value]) - if isinstance(key, np.ndarray) and key.dtype == 'bool': + if isinstance(key, np.ndarray) and key.dtype == "bool": # masking for i, (k, v) in enumerate(zip(key, value)): if k: @@ -113,16 +111,17 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self.dtype.na_value for x in self.data], - dtype=bool) + return np.array([x == self.dtype.na_value for x in 
self.data], dtype=bool) def take(self, indexer, allow_fill=False, fill_value=None): # re-implement here, since NumPy has trouble setting # sized objects like UserDicts into scalar slots of # an ndarary. indexer = np.asarray(indexer) - msg = ("Index is out of bounds or cannot do a " - "non-empty take from an empty array.") + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) if allow_fill: if fill_value is None: @@ -131,8 +130,9 @@ def take(self, indexer, allow_fill=False, fill_value=None): if (indexer < -1).any(): raise ValueError try: - output = [self.data[loc] if loc != -1 else fill_value - for loc in indexer] + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] except IndexError: raise IndexError(msg) else: @@ -161,9 +161,9 @@ def astype(self, dtype, copy=True): def unique(self): # Parent method doesn't work since np.array will try to infer # a 2-dim object. - return type(self)([ - dict(x) for x in list({tuple(d.items()) for d in self.data}) - ]) + return type(self)( + [dict(x) for x in list({tuple(d.items()) for d in self.data})] + ) @classmethod def _concat_same_type(cls, to_concat): @@ -187,6 +187,12 @@ def _values_for_argsort(self): def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer - return [UserDict([ - (random.choice(string.ascii_letters), random.randint(0, 100)) - for _ in range(random.randint(0, 10))]) for _ in range(100)] + return [ + UserDict( + [ + (random.choice(string.ascii_letters), random.randint(0, 100)) + for _ in range(random.randint(0, 10)) + ] + ) + for _ in range(100) + ] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 89d30b0a3cc06..bc75ec6aeb2df 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -37,17 +37,17 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {'a': 10}]) + return JSONArray([{}, {"a": 10}]) @pytest.fixture def data_for_sorting(): - return JSONArray([{'b': 1}, {'c': 4}, {'a': 2, 'c': 3}]) + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{'b': 1}, {}, {'a': 4}]) + return JSONArray([{"b": 1}, {}, {"a": 4}]) @pytest.fixture @@ -62,13 +62,18 @@ def na_cmp(): @pytest.fixture def data_for_grouping(): - return JSONArray([ - {'b': 1}, {'b': 1}, - {}, {}, - {'a': 0, 'c': 2}, {'a': 0, 'c': 2}, - {'b': 1}, - {'c': 2}, - ]) + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) class BaseJSON: @@ -77,28 +82,33 @@ class BaseJSON: # Series.values, which raises. We work around it by # converting the UserDicts to dicts. 
def assert_series_equal(self, left, right, **kwargs): - if left.dtype.name == 'json': + if left.dtype.name == "json": assert left.dtype == right.dtype - left = pd.Series(JSONArray(left.values.astype(object)), - index=left.index, name=left.name) - right = pd.Series(JSONArray(right.values.astype(object)), - index=right.index, name=right.name) + left = pd.Series( + JSONArray(left.values.astype(object)), index=left.index, name=left.name + ) + right = pd.Series( + JSONArray(right.values.astype(object)), + index=right.index, + name=right.name, + ) tm.assert_series_equal(left, right, **kwargs) def assert_frame_equal(self, left, right, *args, **kwargs): tm.assert_index_equal( - left.columns, right.columns, - exact=kwargs.get('check_column_type', 'equiv'), - check_names=kwargs.get('check_names', True), - check_exact=kwargs.get('check_exact', False), - check_categorical=kwargs.get('check_categorical', True), - obj='{obj}.columns'.format(obj=kwargs.get('obj', 'DataFrame'))) + left.columns, + right.columns, + exact=kwargs.get("check_column_type", "equiv"), + check_names=kwargs.get("check_names", True), + check_exact=kwargs.get("check_exact", False), + check_categorical=kwargs.get("check_categorical", True), + obj="{obj}.columns".format(obj=kwargs.get("obj", "DataFrame")), + ) - jsons = (left.dtypes == 'json').index + jsons = (left.dtypes == "json").index for col in jsons: - self.assert_series_equal(left[col], right[col], - *args, **kwargs) + self.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=jsons) right = right.drop(columns=jsons) @@ -113,9 +123,13 @@ class TestInterface(BaseJSON, base.BaseInterfaceTests): def test_custom_asserts(self): # This would always trigger the KeyError from trying to put # an array of equal-length UserDicts inside an ndarray. 
- data = JSONArray([collections.UserDict({'a': 1}), - collections.UserDict({'b': 2}), - collections.UserDict({'c': 3})]) + data = JSONArray( + [ + collections.UserDict({"a": 1}), + collections.UserDict({"b": 2}), + collections.UserDict({"c": 3}), + ] + ) a = pd.Series(data) self.assert_series_equal(a, a) self.assert_frame_equal(a.to_frame(), a.to_frame()) @@ -129,7 +143,6 @@ def test_custom_asserts(self): class TestConstructors(BaseJSON, base.BaseConstructorsTests): - @pytest.mark.skip(reason="not implemented constructor from dtype") def test_from_dtype(self, data): # construct from our dtype & string dtype @@ -137,7 +150,6 @@ def test_from_dtype(self, data): class TestReshaping(BaseJSON, base.BaseReshapingTests): - @pytest.mark.skip(reason="Different definitions of NA") def test_stack(self): """ @@ -168,8 +180,9 @@ def test_fillna_frame(self): unhashable = pytest.mark.skip(reason="Unhashable") -unstable = pytest.mark.skipif(not PY36, # 3.6 or higher - reason="Dictionary order unstable") +unstable = pytest.mark.skipif( + not PY36, reason="Dictionary order unstable" # 3.6 or higher +) class TestReduce(base.BaseNoReduceTests): @@ -195,12 +208,12 @@ def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) @unstable - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending): super().test_sort_values(data_for_sorting, ascending) @unstable - @pytest.mark.parametrize('ascending', [True, False]) + @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super().test_sort_values_missing(data_missing_for_sorting, ascending) @@ -246,7 +259,6 @@ def test_astype_str(self): class TestGroupby(BaseJSON, base.BaseGroupbyTests): - @unhashable def test_groupby_extension_transform(self): """ @@ -269,7 +281,7 @@ def test_groupby_extension_apply(self): """ @unstable - @pytest.mark.parametrize('as_index', [True, False]) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 046dcc1c74a03..f7456d24ad6d3 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -54,19 +54,17 @@ def data(): @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return Categorical([np.nan, 'A']) + return Categorical([np.nan, "A"]) @pytest.fixture def data_for_sorting(): - return Categorical(['A', 'B', 'C'], categories=['C', 'A', 'B'], - ordered=True) + return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True) @pytest.fixture def data_missing_for_sorting(): - return Categorical(['A', None, 'B'], categories=['B', 'A'], - ordered=True) + return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True) @pytest.fixture @@ -76,7 +74,7 @@ def na_value(): @pytest.fixture def data_for_grouping(): - return Categorical(['a', 'a', None, None, 'b', 'b', 'a', 'c']) + return Categorical(["a", "a", None, None, "b", "b", "a", "c"]) class TestDtype(base.BaseDtypeTests): @@ -95,7 +93,6 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - def test_ravel(self, data): # GH#27199 Categorical.ravel returns self until after deprecation cycle with 
tm.assert_produces_warning(FutureWarning): @@ -155,7 +152,6 @@ class TestSetitem(base.BaseSetitemTests): class TestMissing(base.BaseMissingTests): - @pytest.mark.skip(reason="Not implemented") def test_fillna_limit_pad(self, data_missing): super().test_fillna_limit_pad(data_missing) @@ -181,8 +177,9 @@ def test_combine_add(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 + x2) - expected = pd.Series(([a + b for (a, b) in - zip(list(orig_data1), list(orig_data2))])) + expected = pd.Series( + ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] @@ -204,14 +201,13 @@ class TestCasting(base.BaseCastingTests): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): op_name = all_arithmetic_operators - if op_name != '__rmod__': + if op_name != "__rmod__": super().test_arith_series_with_scalar(data, op_name) else: - pytest.skip('rmod never called when string is first argument') + pytest.skip("rmod never called when string is first argument") def test_add_series_with_extension_array(self, data): ser = pd.Series(data) @@ -228,15 +224,14 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): class TestComparisonOps(base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) - if op_name == '__eq__': + if op_name == "__eq__": result = op(s, other) expected = s.combine(other, lambda x, y: x == y) assert (result == expected).all() - elif op_name == '__ne__': + elif op_name == "__ne__": result = op(s, other) expected = s.combine(other, lambda x, y: x != y) assert (result == expected).all() diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 14db04e1bcd61..9b5f9d64f6b67 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -14,7 +14,6 @@ class DummyDtype(dtypes.ExtensionDtype): class DummyArray(ExtensionArray): - def __init__(self, data): self.data = data @@ -36,21 +35,20 @@ def astype(self, dtype, copy=True): class TestExtensionArrayDtype: - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(np.array([1, 2])), - ]) + @pytest.mark.parametrize( + "values", + [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ], + ) def test_is_extension_array_dtype(self, values): assert is_extension_array_dtype(values) - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) + @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))]) def test_is_not_extension_array_dtype(self, values): assert not is_extension_array_dtype(values) @@ -63,7 +61,7 @@ def test_astype(): result = arr.astype(object) tm.assert_numpy_array_equal(result, expected) - result = arr.astype('object') + result = arr.astype("object") tm.assert_numpy_array_equal(result, expected) @@ -77,10 +75,7 @@ def test_astype_no_copy(): assert arr is not result -@pytest.mark.parametrize('dtype', [ - dtypes.CategoricalDtype(), - dtypes.IntervalDtype(), -]) +@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()]) def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert is_extension_array_dtype(dtype) 
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index baee04c3b79eb..9a7a43cff0c27 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -15,34 +15,30 @@ def dtype(request): @pytest.fixture def data(dtype): - data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), - dtype=dtype) + data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype) return data @pytest.fixture def data_missing(dtype): return DatetimeArray( - np.array(['NaT', '2000-01-01'], dtype='datetime64[ns]'), - dtype=dtype + np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype ) @pytest.fixture def data_for_sorting(dtype): - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - c = pd.Timestamp('2000-01-03') - return DatetimeArray(np.array([b, c, a], dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + return DatetimeArray(np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype) @pytest.fixture def data_missing_for_sorting(dtype): - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - return DatetimeArray(np.array([b, 'NaT', a], dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + return DatetimeArray(np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype) @pytest.fixture @@ -52,19 +48,20 @@ def data_for_grouping(dtype): Where A < B < C and NA is missing """ - a = pd.Timestamp('2000-01-01') - b = pd.Timestamp('2000-01-02') - c = pd.Timestamp('2000-01-03') - na = 'NaT' - return DatetimeArray(np.array([b, b, na, na, a, a, b, c], - dtype='datetime64[ns]'), - dtype=dtype) + a = pd.Timestamp("2000-01-01") + b = pd.Timestamp("2000-01-02") + c = pd.Timestamp("2000-01-03") + na = "NaT" + return DatetimeArray( + np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype + ) @pytest.fixture def na_cmp(): def cmp(a, b): return a is pd.NaT and a is b + return cmp @@ -103,7 +100,6 @@ def test_combine_add(self, data_repeated): class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): - def test_array_interface(self, data): if data.tz: # np.asarray(DTA) is currently always tz-naive. @@ -113,34 +109,30 @@ def test_array_interface(self, data): class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): - implements = {'__sub__', '__rsub__'} + implements = {"__sub__", "__rsub__"} def test_arith_series_with_scalar(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar(data, - all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_add_series_with_extension_array(self, data): # Datetime + Datetime not implemented s = pd.Series(data) - msg = 'cannot add DatetimeArray and DatetimeArray' + msg = "cannot add DatetimeArray and DatetimeArray" with pytest.raises(TypeError, match=msg): s + data def test_arith_series_with_array(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... 
but not the rest. - super().test_arith_series_with_scalar(data, - all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_error(self, data, all_arithmetic_operators): pass @@ -157,8 +149,7 @@ def test_direct_arith_with_series_returns_not_implemented(self, data): # tests/arithmetic/test_datetime64::TestTimestampSeriesArithmetic:: # test_dt64_seris_add_intlike return super( - TestArithmeticOps, - self + TestArithmeticOps, self ).test_direct_arith_with_series_returns_not_implemented(data) @@ -167,7 +158,6 @@ class TestCasting(BaseDatetimeTests, base.BaseCastingTests): class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): # the base test is not appropriate for us. We raise on comparison # with (some) integers, depending on the value. @@ -176,8 +166,7 @@ def _compare_other(self, s, data, op_name, other): @pytest.mark.xfail(reason="different implementation", strict=False) def test_direct_arith_with_series_returns_not_implemented(self, data): return super( - TestComparisonOps, - self + TestComparisonOps, self ).test_direct_arith_with_series_returns_not_implemented(data) @@ -186,7 +175,6 @@ class TestMissing(BaseDatetimeTests, base.BaseMissingTests): class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): - @pytest.mark.skip(reason="We have DatetimeTZBlock") def test_concat(self, data, in_frame): pass @@ -200,29 +188,31 @@ def test_concat_mixed_dtypes(self, data): @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, obj): # GH-13287: can't use base test, since building the expected fails. - data = DatetimeArray._from_sequence(['2000', '2001', '2002', '2003'], - tz='US/Central') - index = pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), - names=['a', 'b']) + data = DatetimeArray._from_sequence( + ["2000", "2001", "2002", "2003"], tz="US/Central" + ) + index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) if obj == "series": ser = pd.Series(data, index=index) - expected = pd.DataFrame({ - "A": data.take([0, 1]), - "B": data.take([2, 3]) - }, index=pd.Index(['a', 'b'], name='b')) - expected.columns.name = 'a' + expected = pd.DataFrame( + {"A": data.take([0, 1]), "B": data.take([2, 3])}, + index=pd.Index(["a", "b"], name="b"), + ) + expected.columns.name = "a" else: ser = pd.DataFrame({"A": data, "B": data}, index=index) expected = pd.DataFrame( - {("A", "A"): data.take([0, 1]), - ("A", "B"): data.take([2, 3]), - ("B", "A"): data.take([0, 1]), - ("B", "B"): data.take([2, 3])}, - index=pd.Index(['a', 'b'], name='b') + { + ("A", "A"): data.take([0, 1]), + ("A", "B"): data.take([2, 3]), + ("B", "A"): data.take([0, 1]), + ("B", "B"): data.take([2, 3]), + }, + index=pd.Index(["a", "b"], name="b"), ) - expected.columns.names = [None, 'a'] + expected.columns.names = [None, "a"] result = ser.unstack(0) self.assert_equal(result, expected) diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 7f68babdb8aa5..1a4f84e2c0fd2 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -20,39 +20,40 @@ def concat_same_type(self, to_concat, placement=None): """ values = np.concatenate([blk.values for blk in to_concat]) return self.make_block_same_class( - values, placement=placement or slice(0, len(values), 1)) + values, placement=placement or slice(0, len(values), 1) + ) @pytest.fixture def df(): - df1 = 
pd.DataFrame({'a': [1, 2, 3]}) + df1 = pd.DataFrame({"a": [1, 2, 3]}) blocks = df1._data.blocks - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") custom_block = CustomBlock(values, placement=slice(1, 2)) blocks = blocks + (custom_block,) - block_manager = BlockManager(blocks, [pd.Index(['a', 'b']), df1.index]) + block_manager = BlockManager(blocks, [pd.Index(["a", "b"]), df1.index]) return pd.DataFrame(block_manager) def test_custom_repr(): - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") # series block = CustomBlock(values, placement=slice(0, 3)) s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3))) - assert repr(s) == '0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64' + assert repr(s) == "0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64" # dataframe block = CustomBlock(values, placement=slice(0, 1)) - blk_mgr = BlockManager([block], [['col'], range(3)]) + blk_mgr = BlockManager([block], [["col"], range(3)]) df = pd.DataFrame(blk_mgr) - assert repr(df) == ' col\n0 Val: 0\n1 Val: 1\n2 Val: 2' + assert repr(df) == " col\n0 Val: 0\n1 Val: 1\n2 Val: 2" def test_concat_series(): # GH17728 - values = np.arange(3, dtype='int64') + values = np.arange(3, dtype="int64") block = CustomBlock(values, placement=slice(0, 3)) s = pd.Series(block, pd.RangeIndex(3), fastpath=True) @@ -68,6 +69,6 @@ def test_concat_dataframe(df): def test_concat_axis1(df): # GH17954 - df2 = pd.DataFrame({'c': [.1, .2, .3]}) + df2 = pd.DataFrame({"c": [0.1, 0.2, 0.3]}) res = pd.concat([df, df2], axis=1) assert isinstance(res._data.blocks[1], CustomBlock) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 22bb086a919ca..d051345fdd12d 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -21,18 +21,34 @@ import pandas as pd from pandas.core.arrays import integer_array from pandas.core.arrays.integer import ( - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, +) from pandas.tests.extension import base def make_data(): - return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) - + [np.nan] + [99, 100]) - - -@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) + return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + + +@pytest.fixture( + params=[ + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ] +) def dtype(request): return request.param() @@ -83,7 +99,6 @@ def data_for_grouping(dtype): class TestDtype(base.BaseDtypeTests): - @pytest.mark.skip(reason="using multiple dtypes") def test_is_dtype_unboxes_dtype(self): # we have multiple dtypes, so skip @@ -91,20 +106,21 @@ def test_is_dtype_unboxes_dtype(self): class TestArithmeticOps(base.BaseArithmeticOpsTests): - def check_opname(self, s, op_name, other, exc=None): # overwriting to indicate ops don't raise an error super().check_opname(s, op_name, other, exc=None) def _check_op(self, s, op, other, op_name, exc=NotImplementedError): if exc is None: - if s.dtype.is_unsigned_integer and (op_name == '__rsub__'): + if s.dtype.is_unsigned_integer and (op_name == "__rsub__"): # TODO see https://github.com/pandas-dev/pandas/issues/22023 pytest.skip("unsigned subtraction gives negative values") - if (hasattr(other, 
'dtype') - and not is_extension_array_dtype(other.dtype) - and pd.api.types.is_integer_dtype(other.dtype)): + if ( + hasattr(other, "dtype") + and not is_extension_array_dtype(other.dtype) + and pd.api.types.is_integer_dtype(other.dtype) + ): # other is np.int64 and would therefore always result in # upcasting, so keeping other as same numpy_dtype other = other.astype(s.dtype.numpy_dtype) @@ -112,12 +128,12 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): result = op(s, other) expected = s.combine(other, op) - if op_name in ('__rtruediv__', '__truediv__', '__div__'): + if op_name in ("__rtruediv__", "__truediv__", "__div__"): expected = expected.astype(float) - if op_name == '__rtruediv__': + if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) - elif op_name.startswith('__r'): + elif op_name.startswith("__r"): # TODO reverse operators result in object dtype # see https://github.com/pandas-dev/pandas/issues/22024 expected = expected.astype(s.dtype) @@ -126,7 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass - if (op_name == '__rpow__') and isinstance(other, pd.Series): + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 result = result.fillna(1) @@ -146,7 +162,6 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): - def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -183,8 +198,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - - @pytest.mark.parametrize('dropna', [True, False]) + @pytest.mark.parametrize("dropna", [True, False]) def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: @@ -193,8 +207,7 @@ def test_value_counts(self, all_data, dropna): other = all_data result = pd.Series(all_data).value_counts(dropna=dropna).sort_index() - expected = pd.Series(other).value_counts( - dropna=dropna).sort_index() + expected = pd.Series(other).value_counts(dropna=dropna).sort_index() expected.index = expected.index.astype(all_data.dtype) self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index f1f90b298ffe2..1aab71286b4a6 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -103,8 +103,7 @@ class TestReduce(base.BaseNoReduceTests): class TestMethods(BaseInterval, base.BaseMethodsTests): - - @pytest.mark.skip(reason='addition is not defined for intervals') + @pytest.mark.skip(reason="addition is not defined for intervals") def test_combine_add(self, data_repeated): pass @@ -155,8 +154,8 @@ def test_array_repr(self, data, size): class TestParsing(BaseInterval, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 74ca296d23295..221cf0787d839 
100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -10,7 +10,7 @@ from . import base -@pytest.fixture(params=['float', 'object']) +@pytest.fixture(params=["float", "object"]) def dtype(request): return PandasDtype(np.dtype(request.param)) @@ -33,13 +33,13 @@ def allow_in_pandas(monkeypatch): check. """ with monkeypatch.context() as m: - m.setattr(PandasArray, '_typ', 'extension') + m.setattr(PandasArray, "_typ", "extension") yield @pytest.fixture def data(allow_in_pandas, dtype): - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": return pd.Series([(i,) for i in range(100)]).array return PandasArray(np.arange(1, 101, dtype=dtype._dtype)) @@ -48,7 +48,7 @@ def data(allow_in_pandas, dtype): def data_missing(allow_in_pandas, dtype): # For NumPy <1.16, np.array([np.nan, (1,)]) raises # ValueError: setting an array element with a sequence. - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": if _np_version_under1p16: raise pytest.skip("Skipping for NumPy <1.16") return PandasArray(np.array([np.nan, (1,)])) @@ -64,6 +64,7 @@ def na_value(): def na_cmp(): def cmp(a, b): return np.isnan(a) and np.isnan(b) + return cmp @@ -74,15 +75,11 @@ def data_for_sorting(allow_in_pandas, dtype): This should be three items [B, C, A] with A < B < C """ - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": # Use an empty tuple for first element, then remove, # to disable np.array's shape inference. - return PandasArray( - np.array([(), (2,), (3,), (1,)])[1:] - ) - return PandasArray( - np.array([1, 2, 0]) - ) + return PandasArray(np.array([(), (2,), (3,), (1,)])[1:]) + return PandasArray(np.array([1, 2, 0])) @pytest.fixture @@ -92,13 +89,9 @@ def data_missing_for_sorting(allow_in_pandas, dtype): This should be three items [B, NA, A] with A < B and NA missing. """ - if dtype.numpy_dtype == 'object': - return PandasArray( - np.array([(1,), np.nan, (0,)]) - ) - return PandasArray( - np.array([1, np.nan, 0]) - ) + if dtype.numpy_dtype == "object": + return PandasArray(np.array([(1,), np.nan, (0,)])) + return PandasArray(np.array([1, np.nan, 0])) @pytest.fixture @@ -109,13 +102,11 @@ def data_for_grouping(allow_in_pandas, dtype): Where A < B < C and NA is missing """ - if dtype.numpy_dtype == 'object': + if dtype.numpy_dtype == "object": a, b, c = (1,), (2,), (3,) else: a, b, c = np.arange(3) - return PandasArray(np.array( - [b, b, np.nan, np.nan, a, a, b, c] - )) + return PandasArray(np.array([b, b, np.nan, np.nan, a, a, b, c])) @pytest.fixture @@ -129,11 +120,11 @@ def skip_numpy_object(dtype): This fixture allows these tests to be skipped when used as a usefixtures marker to either an individual test or a test class. """ - if dtype == 'object': + if dtype == "object": raise pytest.skip("Skipping for object dtype.") -skip_nested = pytest.mark.usefixtures('skip_numpy_object') +skip_nested = pytest.mark.usefixtures("skip_numpy_object") class BaseNumPyTests: @@ -141,7 +132,6 @@ class BaseNumPyTests: class TestCasting(BaseNumPyTests, base.BaseCastingTests): - @skip_nested def test_astype_str(self, data): # ValueError: setting an array element with a sequence @@ -161,7 +151,6 @@ def test_array_from_scalars(self, data): class TestDtype(BaseNumPyTests, base.BaseDtypeTests): - @pytest.mark.skip(reason="Incorrect expected.") # we unsurprisingly clash with a NumPy name. 
def test_check_dtype(self, data): @@ -169,7 +158,6 @@ def test_check_dtype(self, data): class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): - @skip_nested def test_getitem_scalar(self, data): # AssertionError @@ -187,11 +175,9 @@ def test_loc_iloc_frame_single_dtype(self, data): class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): @skip_nested - def test_groupby_extension_apply( - self, data_for_grouping, groupby_apply_op): + def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): # ValueError: Names should be list-like for a MultiIndex - super().test_groupby_extension_apply(data_for_grouping, - groupby_apply_op) + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): @@ -202,7 +188,6 @@ def test_array_interface(self, data): class TestMethods(BaseNumPyTests, base.BaseMethodsTests): - @pytest.mark.skip(reason="TODO: remove?") def test_value_counts(self, all_data, dropna): pass @@ -224,8 +209,8 @@ def test_shift_fill_value(self, data): super().test_shift_fill_value(data) @skip_nested - @pytest.mark.parametrize('box', [pd.Series, lambda x: x]) - @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique]) + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method): # Fails creating expected super().test_unique(data, box, method) @@ -290,7 +275,6 @@ class TestPrinting(BaseNumPyTests, base.BasePrintingTests): @skip_nested class TestNumericReduce(BaseNumPyTests, base.BaseNumericReduceTests): - def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) # avoid coercing int -> float. Just cast to the actual numpy type. @@ -304,7 +288,6 @@ class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests): class TestMissing(BaseNumPyTests, base.BaseMissingTests): - @skip_nested def test_fillna_scalar(self, data_missing): # Non-scalar "scalar" values. @@ -313,8 +296,7 @@ def test_fillna_scalar(self, data_missing): @skip_nested def test_fillna_series_method(self, data_missing, fillna_method): # Non-scalar "scalar" values. - super().test_fillna_series_method( - data_missing, fillna_method) + super().test_fillna_series_method(data_missing, fillna_method) @skip_nested def test_fillna_series(self, data_missing): @@ -328,7 +310,6 @@ def test_fillna_frame(self, data_missing): class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): - @pytest.mark.skip("Incorrect parent test") # not actually a mixed concat, since we concat int and int. 
def test_concat_mixed_dtypes(self, data): @@ -351,7 +332,6 @@ def test_merge_on_extension_array_duplicates(self, data): class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): - @skip_nested def test_setitem_scalar_series(self, data, box_in_series): # AssertionError @@ -395,7 +375,7 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): super().test_setitem_iloc_scalar_multiple_homogoneous(data) @skip_nested - @pytest.mark.parametrize('setter', ['loc', None]) + @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_broadcast(self, data, setter): # ValueError: cannot set using a list-like indexer with a different # length than the value diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b988dcb211dd0..8a500e1be766e 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -12,7 +12,7 @@ @pytest.fixture def dtype(): - return PeriodDtype(freq='D') + return PeriodDtype(freq="D") @pytest.fixture @@ -71,7 +71,6 @@ class TestGetitem(BasePeriodTests, base.BaseGetitemTests): class TestMethods(BasePeriodTests, base.BaseMethodsTests): - def test_combine_add(self, data_repeated): # Period + Period is not defined. pass @@ -83,28 +82,24 @@ class TestInterface(BasePeriodTests, base.BaseInterfaceTests): class TestArithmeticOps(BasePeriodTests, base.BaseArithmeticOpsTests): - implements = {'__sub__', '__rsub__'} + implements = {"__sub__", "__rsub__"} def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # we implement substitution... if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar( - data, all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def test_arith_series_with_array(self, data, all_arithmetic_operators): if all_arithmetic_operators in self.implements: s = pd.Series(data) - self.check_opname(s, all_arithmetic_operators, s.iloc[0], - exc=None) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], exc=None) else: # ... but not the rest. - super().test_arith_series_with_scalar( - data, all_arithmetic_operators) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) def _check_divmod_op(self, s, op, other, exc=NotImplementedError): super()._check_divmod_op(s, op, other, exc=TypeError) @@ -112,8 +107,10 @@ def _check_divmod_op(self, s, op, other, exc=NotImplementedError): def test_add_series_with_extension_array(self, data): # we don't implement + for Period s = pd.Series(data) - msg = (r"unsupported operand type\(s\) for \+: " - r"\'PeriodArray\' and \'PeriodArray\'") + msg = ( + r"unsupported operand type\(s\) for \+: " + r"\'PeriodArray\' and \'PeriodArray\'" + ) with pytest.raises(TypeError, match=msg): s + data @@ -132,7 +129,6 @@ class TestCasting(BasePeriodTests, base.BaseCastingTests): class TestComparisonOps(BasePeriodTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): # the base test is not appropriate for us. We raise on comparison # with (some) integers, depending on the value. 
@@ -160,8 +156,8 @@ class TestPrinting(BasePeriodTests, base.BasePrintingTests): class TestParsing(BasePeriodTests, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 8ce53270b7ba8..84d59902d2aa7 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -29,8 +29,7 @@ def dtype(): @pytest.fixture(params=[0, np.nan]) def data(request): """Length-100 PeriodArray for semantics test.""" - res = SparseArray(make_data(request.param), - fill_value=request.param) + res = SparseArray(make_data(request.param), fill_value=request.param) return res @@ -48,10 +47,11 @@ def data_missing(request): @pytest.fixture(params=[0, np.nan]) def data_repeated(request): """Return different versions of data for count times""" + def gen(count): for _ in range(count): - yield SparseArray(make_data(request.param), - fill_value=request.param) + yield SparseArray(make_data(request.param), fill_value=request.param) + yield gen @@ -77,8 +77,7 @@ def na_cmp(): @pytest.fixture(params=[0, np.nan]) def data_for_grouping(request): - return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], - fill_value=request.param) + return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], fill_value=request.param) class BaseSparseTests: @@ -92,7 +91,6 @@ def test_ravel(self, data): class TestDtype(BaseSparseTests, base.BaseDtypeTests): - def test_array_type_with_arg(self, data, dtype): assert dtype.construct_array_type() is SparseArray @@ -111,19 +109,19 @@ class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): class TestReshaping(BaseSparseTests, base.BaseReshapingTests): - def test_concat_mixed_dtypes(self, data): # https://github.com/pandas-dev/pandas/issues/20762 # This should be the same, aside from concat([sparse, float]) - df1 = pd.DataFrame({'A': data[:3]}) + df1 = pd.DataFrame({"A": data[:3]}) df2 = pd.DataFrame({"A": [1, 2, 3]}) - df3 = pd.DataFrame({"A": ['a', 'b', 'c']}).astype('category') + df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category") dfs = [df1, df2, df3] # dataframes result = pd.concat(dfs) - expected = pd.concat([x.apply(lambda s: np.asarray(s).astype(object)) - for x in dfs]) + expected = pd.concat( + [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs] + ) self.assert_frame_equal(result, expected) def test_concat_columns(self, data, na_value): @@ -148,7 +146,6 @@ def test_merge(self, data, na_value): class TestGetitem(BaseSparseTests, base.BaseGetitemTests): - def test_get(self, data): s = pd.Series(data, index=[2 * i for i in range(len(data))]) if np.isnan(s.values.fill_value): @@ -164,11 +161,10 @@ def test_reindex(self, data, na_value): # Skipping TestSetitem, since we don't implement it. 
-class TestMissing(BaseSparseTests, base.BaseMissingTests): +class TestMissing(BaseSparseTests, base.BaseMissingTests): def test_isna(self, data_missing): - expected_dtype = SparseDtype(bool, - pd.isna(data_missing.dtype.fill_value)) + expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value)) expected = SparseArray([True, False], dtype=expected_dtype) result = pd.isna(data_missing) @@ -204,27 +200,24 @@ def test_fillna_frame(self, data_missing): # Have to override to specify that fill_value will change. fill_value = data_missing[1] - result = pd.DataFrame({ - "A": data_missing, - "B": [1, 2] - }).fillna(fill_value) + result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value) if pd.isna(data_missing.fill_value): dtype = SparseDtype(data_missing.dtype, fill_value) else: dtype = data_missing.dtype - expected = pd.DataFrame({ - "A": data_missing._from_sequence([fill_value, fill_value], - dtype=dtype), - "B": [1, 2], - }) + expected = pd.DataFrame( + { + "A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype), + "B": [1, 2], + } + ) self.assert_frame_equal(result, expected) class TestMethods(BaseSparseTests, base.BaseMethodsTests): - def test_combine_le(self, data_repeated): # We return a Series[SparseArray].__le__ returns a # Series[Sparse[bool]] @@ -233,17 +226,19 @@ def test_combine_le(self, data_repeated): s1 = pd.Series(orig_data1) s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) - expected = pd.Series(pd.SparseArray([ - a <= b for (a, b) in - zip(list(orig_data1), list(orig_data2)) - ], fill_value=False)) + expected = pd.Series( + pd.SparseArray( + [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], + fill_value=False, + ) + ) self.assert_series_equal(result, expected) val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) - expected = pd.Series(pd.SparseArray([ - a <= val for a in list(orig_data1) - ], fill_value=False)) + expected = pd.Series( + pd.SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + ) self.assert_series_equal(result, expected) def test_fillna_copy_frame(self, data_missing): @@ -280,20 +275,20 @@ def test_where_series(self, data, na_value): cond = np.array([True, True, False, False]) result = ser.where(cond) - new_dtype = SparseDtype('float', 0.0) - expected = pd.Series(cls._from_sequence([a, a, na_value, na_value], - dtype=new_dtype)) + new_dtype = SparseDtype("float", 0.0) + expected = pd.Series( + cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype) + ) self.assert_series_equal(result, expected) other = cls._from_sequence([a, b, a, b], dtype=data.dtype) cond = np.array([True, False, True, True]) result = ser.where(cond, other) - expected = pd.Series(cls._from_sequence([a, b, b, b], - dtype=data.dtype)) + expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype)) self.assert_series_equal(result, expected) def test_combine_first(self, data): - if data.dtype.subtype == 'int': + if data.dtype.subtype == "int": # Right now this is upcasted to float, just like combine_first # for Series[int] pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") @@ -334,7 +329,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): class TestComparisonOps(BaseSparseTests, base.BaseComparisonOpsTests): - def _compare_other(self, s, data, op_name, other): op = self.get_op_from_name(op_name) @@ -344,12 +338,14 @@ def _compare_other(self, s, data, op_name, other): # is in general. 
# Rely on tests in `tests/sparse` to validate that. assert isinstance(result.dtype, SparseDtype) - assert result.dtype.subtype == np.dtype('bool') + assert result.dtype.subtype == np.dtype("bool") - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): expected = pd.Series( - pd.SparseArray(op(np.asarray(data), np.asarray(other)), - fill_value=result.values.fill_value) + pd.SparseArray( + op(np.asarray(data), np.asarray(other)), + fill_value=result.values.fill_value, + ) ) tm.assert_series_equal(result, expected) @@ -361,14 +357,14 @@ def _compare_other(self, s, data, op_name, other): class TestPrinting(BaseSparseTests, base.BasePrintingTests): - @pytest.mark.xfail(reason='Different repr', strict=True) + @pytest.mark.xfail(reason="Different repr", strict=True) def test_array_repr(self, data, size): super().test_array_repr(data, size) class TestParsing(BaseSparseTests, base.BaseParsingTests): - @pytest.mark.parametrize('engine', ['c', 'python']) + @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): - expected_msg = r'.*must implement _from_sequence_of_strings.*' + expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 4b71405e20d32..281028b971d1e 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -9,17 +9,16 @@ _tsd = tm.getTimeSeriesData() _frame = pd.DataFrame(_seriesd) -_frame2 = pd.DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_frame2 = pd.DataFrame(_seriesd, columns=["D", "C", "B", "A"]) _intframe = pd.DataFrame({k: v.astype(int) for k, v in _seriesd.items()}) _tsframe = pd.DataFrame(_tsd) _mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' +_mixed_frame["foo"] = "bar" class TestData: - @cache_readonly def frame(self): return _frame.copy() @@ -31,8 +30,7 @@ def frame2(self): @cache_readonly def intframe(self): # force these all to int64 to avoid platform testing issues - return pd.DataFrame({c: s for c, s in _intframe.items()}, - dtype=np.int64) + return pd.DataFrame({c: s for c, s in _intframe.items()}, dtype=np.int64) @cache_readonly def tsframe(self): @@ -44,39 +42,59 @@ def mixed_frame(self): @cache_readonly def mixed_float(self): - return pd.DataFrame({'A': _frame['A'].copy().astype('float32'), - 'B': _frame['B'].copy().astype('float32'), - 'C': _frame['C'].copy().astype('float16'), - 'D': _frame['D'].copy().astype('float64')}) + return pd.DataFrame( + { + "A": _frame["A"].copy().astype("float32"), + "B": _frame["B"].copy().astype("float32"), + "C": _frame["C"].copy().astype("float16"), + "D": _frame["D"].copy().astype("float64"), + } + ) @cache_readonly def mixed_float2(self): - return pd.DataFrame({'A': _frame2['A'].copy().astype('float32'), - 'B': _frame2['B'].copy().astype('float32'), - 'C': _frame2['C'].copy().astype('float16'), - 'D': _frame2['D'].copy().astype('float64')}) + return pd.DataFrame( + { + "A": _frame2["A"].copy().astype("float32"), + "B": _frame2["B"].copy().astype("float32"), + "C": _frame2["C"].copy().astype("float16"), + "D": _frame2["D"].copy().astype("float64"), + } + ) @cache_readonly def mixed_int(self): - return pd.DataFrame({'A': _intframe['A'].copy().astype('int32'), - 'B': np.ones(len(_intframe['B']), dtype='uint64'), - 'C': _intframe['C'].copy().astype('uint8'), - 'D': _intframe['D'].copy().astype('int64')}) + return pd.DataFrame( + { + "A": _intframe["A"].copy().astype("int32"), 
+                "B": np.ones(len(_intframe["B"]), dtype="uint64"),
+                "C": _intframe["C"].copy().astype("uint8"),
+                "D": _intframe["D"].copy().astype("int64"),
+            }
+        )
 
     @cache_readonly
     def all_mixed(self):
-        return pd.DataFrame({'a': 1., 'b': 2, 'c': 'foo',
-                             'float32': np.array([1.] * 10, dtype='float32'),
-                             'int32': np.array([1] * 10, dtype='int32')},
-                            index=np.arange(10))
+        return pd.DataFrame(
+            {
+                "a": 1.0,
+                "b": 2,
+                "c": "foo",
+                "float32": np.array([1.0] * 10, dtype="float32"),
+                "int32": np.array([1] * 10, dtype="int32"),
+            },
+            index=np.arange(10),
+        )
 
     @cache_readonly
     def tzframe(self):
-        result = pd.DataFrame({'A': pd.date_range('20130101', periods=3),
-                               'B': pd.date_range('20130101', periods=3,
-                                                  tz='US/Eastern'),
-                               'C': pd.date_range('20130101', periods=3,
-                                                  tz='CET')})
+        result = pd.DataFrame(
+            {
+                "A": pd.date_range("20130101", periods=3),
+                "B": pd.date_range("20130101", periods=3, tz="US/Eastern"),
+                "C": pd.date_range("20130101", periods=3, tz="CET"),
+            }
+        )
         result.iloc[1, 1] = pd.NaT
         result.iloc[1, 2] = pd.NaT
         return result
@@ -95,12 +113,10 @@ def ts2(self):
 
     @cache_readonly
     def simple(self):
-        arr = np.array([[1., 2., 3.],
-                        [4., 5., 6.],
-                        [7., 8., 9.]])
+        arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
+
+        return pd.DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
-        return pd.DataFrame(arr, columns=['one', 'two', 'three'],
-                            index=['a', 'b', 'c'])
 
 # self.ts3 = tm.makeTimeSeries()[-5:]
 # self.ts4 = tm.makeTimeSeries()[1:-1]
@@ -108,32 +124,32 @@ def simple(self):
 
 
 def _check_mixed_float(df, dtype=None):
     # float16 are most likely to be upcasted to float32
-    dtypes = dict(A='float32', B='float32', C='float16', D='float64')
+    dtypes = dict(A="float32", B="float32", C="float16", D="float64")
     if isinstance(dtype, str):
         dtypes = {k: dtype for k, v in dtypes.items()}
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
-    if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
-    if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
-    if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
-    if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+    if dtypes.get("A"):
+        assert df.dtypes["A"] == dtypes["A"]
+    if dtypes.get("B"):
+        assert df.dtypes["B"] == dtypes["B"]
+    if dtypes.get("C"):
+        assert df.dtypes["C"] == dtypes["C"]
+    if dtypes.get("D"):
+        assert df.dtypes["D"] == dtypes["D"]
 
 
 def _check_mixed_int(df, dtype=None):
-    dtypes = dict(A='int32', B='uint64', C='uint8', D='int64')
+    dtypes = dict(A="int32", B="uint64", C="uint8", D="int64")
     if isinstance(dtype, str):
         dtypes = {k: dtype for k, v in dtypes.items()}
     elif isinstance(dtype, dict):
         dtypes.update(dtype)
-    if dtypes.get('A'):
-        assert(df.dtypes['A'] == dtypes['A'])
-    if dtypes.get('B'):
-        assert(df.dtypes['B'] == dtypes['B'])
-    if dtypes.get('C'):
-        assert(df.dtypes['C'] == dtypes['C'])
-    if dtypes.get('D'):
-        assert(df.dtypes['D'] == dtypes['D'])
+    if dtypes.get("A"):
+        assert df.dtypes["A"] == dtypes["A"]
+    if dtypes.get("B"):
+        assert df.dtypes["B"] == dtypes["B"]
+    if dtypes.get("C"):
+        assert df.dtypes["C"] == dtypes["C"]
+    if dtypes.get("D"):
+        assert df.dtypes["D"] == dtypes["D"]
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
index 3232c400bd8ce..915d6edcd8367 100644
--- a/pandas/tests/frame/conftest.py
+++ b/pandas/tests/frame/conftest.py
@@ -159,7 +159,7 @@ def float_string_frame():
     [30 rows x 5 columns]
     """
     df = DataFrame(tm.getSeriesData())
-    df['foo'] = 'bar'
+    df["foo"] = "bar"
     return df
 
 
@@ -190,10 +190,10 @@ def
mixed_float_frame(): [30 rows x 4 columns] """ df = DataFrame(tm.getSeriesData()) - df.A = df.A.astype('float32') - df.B = df.B.astype('float32') - df.C = df.C.astype('float16') - df.D = df.D.astype('float64') + df.A = df.A.astype("float32") + df.B = df.B.astype("float32") + df.C = df.C.astype("float16") + df.D = df.D.astype("float64") return df @@ -224,10 +224,10 @@ def mixed_int_frame(): [30 rows x 4 columns] """ df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()}) - df.A = df.A.astype('int32') - df.B = np.ones(len(df.B), dtype='uint64') - df.C = df.C.astype('uint8') - df.D = df.C.astype('int64') + df.A = df.A.astype("int32") + df.B = np.ones(len(df.B), dtype="uint64") + df.C = df.C.astype("uint8") + df.D = df.C.astype("int64") return df @@ -237,10 +237,16 @@ def mixed_type_frame(): Fixture for DataFrame of float/int/string columns with RangeIndex Columns are ['a', 'b', 'c', 'float32', 'int32']. """ - return DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'float32': np.array([1.] * 10, dtype='float32'), - 'int32': np.array([1] * 10, dtype='int32')}, - index=np.arange(10)) + return DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "float32": np.array([1.0] * 10, dtype="float32"), + "int32": np.array([1] * 10, dtype="int32"), + }, + index=np.arange(10), + ) @pytest.fixture @@ -255,11 +261,13 @@ def timezone_frame(): 1 2013-01-02 NaT NaT 2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00 """ - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': date_range('20130101', periods=3, - tz='US/Eastern'), - 'C': date_range('20130101', periods=3, - tz='CET')}) + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": date_range("20130101", periods=3, tz="US/Eastern"), + "C": date_range("20130101", periods=3, tz="CET"), + } + ) df.iloc[1, 1] = NaT df.iloc[1, 2] = NaT return df @@ -272,8 +280,9 @@ def uint64_frame(): Columns are ['A', 'B'] """ - return DataFrame({'A': np.arange(3), 'B': [2**63, 2**63 + 5, 2**63 + 10]}, - dtype=np.uint64) + return DataFrame( + {"A": np.arange(3), "B": [2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10]}, dtype=np.uint64 + ) @pytest.fixture @@ -288,12 +297,9 @@ def simple_frame(): b 4.0 5.0 6.0 c 7.0 8.0 9.0 """ - arr = np.array([[1., 2., 3.], - [4., 5., 6.], - [7., 8., 9.]]) + arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]) - return DataFrame(arr, columns=['one', 'two', 'three'], - index=['a', 'b', 'c']) + return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"]) @pytest.fixture @@ -311,10 +317,14 @@ def frame_of_index_cols(): 3 bar one d 0.234246 1.085675 0.718445 4 bar two e 0.533841 -0.005702 -3.533912 """ - df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'three', 'one', 'two'], - 'C': ['a', 'b', 'c', 'd', 'e'], - 'D': np.random.randn(5), - 'E': np.random.randn(5), - ('tuple', 'as', 'label'): np.random.randn(5)}) + df = DataFrame( + { + "A": ["foo", "foo", "foo", "bar", "bar"], + "B": ["one", "two", "three", "one", "two"], + "C": ["a", "b", "c", "d", "e"], + "D": np.random.randn(5), + "E": np.random.randn(5), + ("tuple", "as", "label"): np.random.randn(5), + } + ) return df diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e7b4c2c65b842..229713a5af11a 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -5,24 +5,37 @@ import pytest from pandas.core.dtypes.common import ( - is_categorical_dtype, is_interval_dtype, is_object_dtype) + is_categorical_dtype, + 
is_interval_dtype, + is_object_dtype, +) from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, IntervalIndex, MultiIndex, - RangeIndex, Series, Timestamp, cut, date_range, to_datetime) + Categorical, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + Timestamp, + cut, + date_range, + to_datetime, +) import pandas.util.testing as tm @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestDataFrameAlterAxes: - def test_set_index_directly(self, float_string_frame): df = float_string_frame idx = Index(np.arange(len(df))[::-1]) df.index = idx tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): df.index = idx[::2] def test_set_index(self, float_string_frame): @@ -31,23 +44,22 @@ def test_set_index(self, float_string_frame): df = df.set_index(idx) tm.assert_index_equal(df.index, idx) - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): df.set_index(idx[::2]) def test_set_index_cast(self): # issue casting an index then set_index - df = DataFrame({'A': [1.1, 2.2, 3.3], 'B': [5.0, 6.1, 7.2]}, - index=[2010, 2011, 2012]) + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2]}, index=[2010, 2011, 2012] + ) df2 = df.set_index(df.index.astype(np.int32)) tm.assert_frame_equal(df, df2) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('inplace', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_drop_inplace(self, frame_of_index_cols, - drop, inplace, keys): + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys): df = frame_of_index_cols if isinstance(keys, list): @@ -66,15 +78,15 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) def test_set_index_append(self, frame_of_index_cols, drop, keys): df = frame_of_index_cols keys = keys if isinstance(keys, list) else [keys] - idx = MultiIndex.from_arrays([df.index] + [df[x] for x in keys], - names=[None] + keys) + idx = MultiIndex.from_arrays( + [df.index] + [df[x] for x in keys], names=[None] + keys + ) expected = df.drop(keys, axis=1) if drop else df.copy() expected.index = idx @@ -83,17 +95,14 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_append_to_multiindex(self, frame_of_index_cols, - drop, keys): + @pytest.mark.parametrize("keys", ["A", "C", ["A", "B"], ("tuple", "as", "label")]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): # append to existing multiindex - df = 
frame_of_index_cols.set_index(['D'], drop=drop, append=True) + df = frame_of_index_cols.set_index(["D"], drop=drop, append=True) keys = keys if isinstance(keys, list) else [keys] - expected = frame_of_index_cols.set_index(['D'] + keys, - drop=drop, append=True) + expected = frame_of_index_cols.set_index(["D"] + keys, drop=drop, append=True) result = df.set_index(keys, drop=drop, append=True) @@ -101,29 +110,38 @@ def test_set_index_append_to_multiindex(self, frame_of_index_cols, def test_set_index_after_mutation(self): # GH1590 - df = DataFrame({'val': [0, 1, 2], 'key': ['a', 'b', 'c']}) - expected = DataFrame({'val': [1, 2]}, - Index(['b', 'c'], name='key')) + df = DataFrame({"val": [0, 1, 2], "key": ["a", "b", "c"]}) + expected = DataFrame({"val": [1, 2]}, Index(["b", "c"], name="key")) df2 = df.loc[df.index.map(lambda indx: indx >= 1)] - result = df2.set_index('key') + result = df2.set_index("key") tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) - @pytest.mark.parametrize('box', [Series, Index, np.array, - list, lambda x: [list(x)], - lambda x: MultiIndex.from_arrays([x])]) - @pytest.mark.parametrize('append, index_name', [(True, None), - (True, 'B'), (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_single_array(self, frame_of_index_cols, - drop, append, index_name, box): + @pytest.mark.parametrize( + "box", + [ + Series, + Index, + np.array, + list, + lambda x: [list(x)], + lambda x: MultiIndex.from_arrays([x]), + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "B"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_single_array( + self, frame_of_index_cols, drop, append, index_name, box + ): df = frame_of_index_cols df.index.name = index_name - key = box(df['B']) + key = box(df["B"]) if box == list: # list of strings gets interpreted as list of keys msg = "['one', 'two', 'three', 'one', 'two']" @@ -131,41 +149,44 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, df.set_index(key, drop=drop, append=append) else: # np.array/list-of-list "forget" the name of B - name_mi = getattr(key, 'names', None) - name = [getattr(key, 'name', None)] if name_mi is None else name_mi + name_mi = getattr(key, "names", None) + name = [getattr(key, "name", None)] if name_mi is None else name_mi result = df.set_index(key, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, nothing is dropped - expected = df.set_index(['B'], drop=False, append=append) + expected = df.set_index(["B"], drop=False, append=append) expected.index.names = [index_name] + name if append else name tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, list, - lambda x: MultiIndex.from_arrays([x])]) - @pytest.mark.parametrize('append, index_name', - [(True, None), (True, 'A'), (True, 'B'), - (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_arrays(self, frame_of_index_cols, - drop, append, index_name, box): + @pytest.mark.parametrize( + "box", [Series, Index, np.array, list, lambda 
x: MultiIndex.from_arrays([x])] + ) + @pytest.mark.parametrize( + "append, index_name", + [(True, None), (True, "A"), (True, "B"), (True, "test"), (False, None)], + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays( + self, frame_of_index_cols, drop, append, index_name, box + ): df = frame_of_index_cols df.index.name = index_name - keys = ['A', box(df['B'])] + keys = ["A", box(df["B"])] # np.array/list "forget" the name of B - names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] + names = ["A", None if box in [np.array, list, tuple, iter] else "B"] result = df.set_index(keys, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, only A is dropped, if at all - expected = df.set_index(['A', 'B'], drop=False, append=append) - expected = expected.drop('A', axis=1) if drop else expected + expected = df.set_index(["A", "B"], drop=False, append=append) + expected = expected.drop("A", axis=1) if drop else expected expected.index.names = [index_name] + names if append else names tm.assert_frame_equal(result, expected) @@ -173,31 +194,52 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, iter, - lambda x: MultiIndex.from_arrays([x]), - lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, iter, - lambda x: MultiIndex.from_arrays([x]), - lambda x: x.name]) - @pytest.mark.parametrize('append, index_name', [(True, None), - (True, 'A'), (True, 'test'), (False, None)]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, - append, index_name, box1, box2): + @pytest.mark.parametrize( + "box2", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "box1", + [ + Series, + Index, + np.array, + list, + iter, + lambda x: MultiIndex.from_arrays([x]), + lambda x: x.name, + ], + ) + @pytest.mark.parametrize( + "append, index_name", [(True, None), (True, "A"), (True, "test"), (False, None)] + ) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_arrays_duplicate( + self, frame_of_index_cols, drop, append, index_name, box1, box2 + ): df = frame_of_index_cols df.index.name = index_name - keys = [box1(df['A']), box2(df['A'])] + keys = [box1(df["A"]), box2(df["A"])] result = df.set_index(keys, drop=drop, append=append) # if either box is iter, it has been consumed; re-read - keys = [box1(df['A']), box2(df['A'])] + keys = [box1(df["A"]), box2(df["A"])] # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers - first_drop = False if ( - keys[0] is 'A' and keys[1] is 'A') else drop # noqa: F632 + first_drop = ( + False if (keys[0] is "A" and keys[1] is "A") else drop + ) # noqa: F632 # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise @@ -206,85 +248,83 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, expected = expected.set_index([keys[1]], drop=drop, append=True) tm.assert_frame_equal(result, expected) - 
@pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_pass_multiindex(self, frame_of_index_cols, - drop, append): + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_pass_multiindex(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - keys = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) + keys = MultiIndex.from_arrays([df["A"], df["B"]], names=["A", "B"]) result = df.set_index(keys, drop=drop, append=append) # setting with a MultiIndex will never drop columns - expected = df.set_index(['A', 'B'], drop=False, append=append) + expected = df.set_index(["A", "B"], drop=False, append=append) tm.assert_frame_equal(result, expected) def test_set_index_verify_integrity(self, frame_of_index_cols): df = frame_of_index_cols - with pytest.raises(ValueError, match='Index has duplicate keys'): - df.set_index('A', verify_integrity=True) + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index("A", verify_integrity=True) # with MultiIndex - with pytest.raises(ValueError, match='Index has duplicate keys'): - df.set_index([df['A'], df['A']], verify_integrity=True) + with pytest.raises(ValueError, match="Index has duplicate keys"): + df.set_index([df["A"], df["A"]], verify_integrity=True) - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df = frame_of_index_cols with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): # column names are A-E, as well as one tuple - df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) + df.set_index(["foo", "bar", "baz"], drop=drop, append=append) # non-existent key in list with arrays - with pytest.raises(KeyError, match='X'): - df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) + with pytest.raises(KeyError, match="X"): + df.set_index([df["A"], df["B"], "X"], drop=drop, append=append) msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" # tuples always raise KeyError with pytest.raises(KeyError, match=msg): - df.set_index(tuple(df['A']), drop=drop, append=append) + df.set_index(tuple(df["A"]), drop=drop, append=append) # also within a list with pytest.raises(KeyError, match=msg): - df.set_index(['A', df['A'], tuple(df['A'])], - drop=drop, append=append) - - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - @pytest.mark.parametrize('box', [set], ids=['set']) - def test_set_index_raise_on_type(self, frame_of_index_cols, box, - drop, append): + df.set_index(["A", df["A"], tuple(df["A"])], drop=drop, append=append) + + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + @pytest.mark.parametrize("box", [set], ids=["set"]) + def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' # forbidden type, e.g. set with pytest.raises(TypeError, match=msg): - df.set_index(box(df['A']), drop=drop, append=append) + df.set_index(box(df["A"]), drop=drop, append=append) # forbidden type in list, e.g. 
set with pytest.raises(TypeError, match=msg): - df.set_index(['A', df['A'], box(df['A'])], - drop=drop, append=append) + df.set_index(["A", df["A"], box(df["A"])], drop=drop, append=append) # MultiIndex constructor does not work directly on Series -> lambda - @pytest.mark.parametrize('box', [Series, Index, np.array, iter, - lambda x: MultiIndex.from_arrays([x])], - ids=['Series', 'Index', 'np.array', - 'iter', 'MultiIndex']) - @pytest.mark.parametrize('length', [4, 6], ids=['too_short', 'too_long']) - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, - drop, append): + @pytest.mark.parametrize( + "box", + [Series, Index, np.array, iter, lambda x: MultiIndex.from_arrays([x])], + ids=["Series", "Index", "np.array", "iter", "MultiIndex"], + ) + @pytest.mark.parametrize("length", [4, 6], ids=["too_short", "too_long"]) + @pytest.mark.parametrize("append", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_set_index_raise_on_len( + self, frame_of_index_cols, box, length, drop, append + ): # GH 24984 df = frame_of_index_cols # has length 5 values = np.random.randint(0, 10, (length,)) - msg = 'Length mismatch: Expected 5 rows, received array of length.*' + msg = "Length mismatch: Expected 5 rows, received array of length.*" # wrong length directly with pytest.raises(ValueError, match=msg): @@ -292,7 +332,7 @@ def test_set_index_raise_on_len(self, frame_of_index_cols, box, length, # wrong length in list with pytest.raises(ValueError, match=msg): - df.set_index(['A', df.A, box(values)], drop=drop, append=append) + df.set_index(["A", df.A, box(values)], drop=drop, append=append) def test_set_index_custom_label_type(self): # GH 24969 @@ -308,11 +348,10 @@ def __str__(self): # necessary for pretty KeyError __repr__ = __str__ - thing1 = Thing('One', 'red') - thing2 = Thing('Two', 'blue') + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) - expected = DataFrame({thing1: [0, 1]}, - index=Index([2, 3], name=thing2)) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) # use custom label directly result = df.set_index(thing2) @@ -323,7 +362,7 @@ def __str__(self): tm.assert_frame_equal(result, expected) # missing key - thing3 = Thing('Three', 'pink') + thing3 = Thing("Three", "pink") msg = "" with pytest.raises(KeyError, match=msg): # missing label directly @@ -345,13 +384,12 @@ class Thing(frozenset): def __repr__(self): tmp = sorted(list(self)) # double curly brace prints one brace in format string - return "frozenset({{{}}})".format(', '.join(map(repr, tmp))) + return "frozenset({{{}}})".format(", ".join(map(repr, tmp))) - thing1 = Thing(['One', 'red']) - thing2 = Thing(['Two', 'blue']) + thing1 = Thing(["One", "red"]) + thing2 = Thing(["Two", "blue"]) df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) - expected = DataFrame({thing1: [0, 1]}, - index=Index([2, 3], name=thing2)) + expected = DataFrame({thing1: [0, 1]}, index=Index([2, 3], name=thing2)) # use custom label directly result = df.set_index(thing2) @@ -362,7 +400,7 @@ def __repr__(self): tm.assert_frame_equal(result, expected) # missing key - thing3 = Thing(['Three', 'pink']) + thing3 = Thing(["Three", "pink"]) msg = r"frozenset\(\{'Three', 'pink'\}\)" with pytest.raises(KeyError, match=msg): # missing label directly @@ -384,8 +422,8 @@ def __init__(self, name, color): def __str__(self): return "" % (self.name,) - 
thing1 = Thing('One', 'red') - thing2 = Thing('Two', 'blue') + thing1 = Thing("One", "red") + thing2 = Thing("Two", "blue") df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) msg = 'The parameter "keys" may be a column key, .*' @@ -400,53 +438,59 @@ def __str__(self): def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) - ci.name = 'B' + ci.name = "B" # with Categorical - df = DataFrame({'A': np.random.randn(10), - 'B': ci.values}) - idf = df.set_index('B') + df = DataFrame({"A": np.random.randn(10), "B": ci.values}) + idf = df.set_index("B") tm.assert_index_equal(idf.index, ci) # from a CategoricalIndex - df = DataFrame({'A': np.random.randn(10), - 'B': ci}) - idf = df.set_index('B') + df = DataFrame({"A": np.random.randn(10), "B": ci}) + idf = df.set_index("B") tm.assert_index_equal(idf.index, ci) # round-trip - idf = idf.reset_index().set_index('B') + idf = idf.reset_index().set_index("B") tm.assert_index_equal(idf.index, ci) def test_set_index_cast_datetimeindex(self): - df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i) - for i in range(1000)], - 'B': np.random.randn(1000)}) + df = DataFrame( + { + "A": [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], + "B": np.random.randn(1000), + } + ) - idf = df.set_index('A') + idf = df.set_index("A") assert isinstance(idf.index, DatetimeIndex) def test_convert_dti_to_series(self): # don't cast a DatetimeIndex WITH a tz, leave as object # GH 6032 - idx = DatetimeIndex(to_datetime(['2013-1-1 13:00', - '2013-1-2 14:00']), - name='B').tz_localize('US/Pacific') - df = DataFrame(np.random.randn(2, 1), columns=['A']) - - expected = Series(np.array([Timestamp('2013-01-01 13:00:00-0800', - tz='US/Pacific'), - Timestamp('2013-01-02 14:00:00-0800', - tz='US/Pacific')], - dtype="object"), name='B') + idx = DatetimeIndex( + to_datetime(["2013-1-1 13:00", "2013-1-2 14:00"]), name="B" + ).tz_localize("US/Pacific") + df = DataFrame(np.random.randn(2, 1), columns=["A"]) + + expected = Series( + np.array( + [ + Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ], + dtype="object", + ), + name="B", + ) # convert index to series result = Series(idx) tm.assert_series_equal(result, expected) # assign to frame - df['B'] = idx - result = df['B'] + df["B"] = idx + result = df["B"] tm.assert_series_equal(result, expected) # convert to series while keeping the timezone @@ -455,18 +499,19 @@ def test_convert_dti_to_series(self): # convert to utc with tm.assert_produces_warning(FutureWarning): - df['B'] = idx.to_series(keep_tz=False, index=[0, 1]) - result = df['B'] - comp = Series(DatetimeIndex(expected.values).tz_localize(None), - name='B') + df["B"] = idx.to_series(keep_tz=False, index=[0, 1]) + result = df["B"] + comp = Series(DatetimeIndex(expected.values).tz_localize(None), name="B") tm.assert_series_equal(result, comp) with tm.assert_produces_warning(FutureWarning) as m: result = idx.to_series(index=[0, 1]) tm.assert_series_equal(result, expected.dt.tz_convert(None)) - msg = ("The default of the 'keep_tz' keyword in " - "DatetimeIndex.to_series will change to True in a future " - "release.") + msg = ( + "The default of the 'keep_tz' keyword in " + "DatetimeIndex.to_series will change to True in a future " + "release." 
+ ) assert msg in str(m[0].message) with tm.assert_produces_warning(FutureWarning): @@ -474,82 +519,83 @@ def test_convert_dti_to_series(self): tm.assert_series_equal(result, expected.dt.tz_convert(None)) # list of datetimes with a tz - df['B'] = idx.to_pydatetime() - result = df['B'] + df["B"] = idx.to_pydatetime() + result = df["B"] tm.assert_series_equal(result, expected) # GH 6785 # set the index manually import pytz - df = DataFrame( - [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}]) - expected = df.set_index('ts') - df.index = df['ts'] - df.pop('ts') + + df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=pytz.utc), "foo": 1}]) + expected = df.set_index("ts") + df.index = df["ts"] + df.pop("ts") tm.assert_frame_equal(df, expected) def test_reset_index_tz(self, tz_aware_fixture): # GH 3950 # reset_index with single level tz = tz_aware_fixture - idx = date_range('1/1/2011', periods=5, - freq='D', tz=tz, name='idx') - df = DataFrame({'a': range(5), 'b': ['A', 'B', 'C', 'D', 'E']}, - index=idx) - - expected = DataFrame({'idx': [datetime(2011, 1, 1), - datetime(2011, 1, 2), - datetime(2011, 1, 3), - datetime(2011, 1, 4), - datetime(2011, 1, 5)], - 'a': range(5), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx', 'a', 'b']) - expected['idx'] = expected['idx'].apply(lambda d: Timestamp(d, tz=tz)) + idx = date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx") + df = DataFrame({"a": range(5), "b": ["A", "B", "C", "D", "E"]}, index=idx) + + expected = DataFrame( + { + "idx": [ + datetime(2011, 1, 1), + datetime(2011, 1, 2), + datetime(2011, 1, 3), + datetime(2011, 1, 4), + datetime(2011, 1, 5), + ], + "a": range(5), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx", "a", "b"], + ) + expected["idx"] = expected["idx"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) def test_set_index_timezone(self): # GH 12358 # tz-aware Series should retain the tz - idx = to_datetime(["2014-01-01 10:10:10"], - utc=True).tz_convert('Europe/Rome') - df = DataFrame({'A': idx}) + idx = to_datetime(["2014-01-01 10:10:10"], utc=True).tz_convert("Europe/Rome") + df = DataFrame({"A": idx}) assert df.set_index(idx).index[0].hour == 11 assert DatetimeIndex(Series(df.A))[0].hour == 11 assert df.set_index(df.A).index[0].hour == 11 def test_set_index_dst(self): - di = date_range('2006-10-29 00:00:00', periods=3, - freq='H', tz='US/Pacific') + di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") - df = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, - index=di).reset_index() + df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() # single level - res = df.set_index('index') - exp = DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, - index=Index(di, name='index')) + res = df.set_index("index") + exp = DataFrame( + data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index") + ) tm.assert_frame_equal(res, exp) # GH 12920 - res = df.set_index(['index', 'a']) - exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], - names=['index', 'a']) - exp = DataFrame({'b': [3, 4, 5]}, index=exp_index) + res = df.set_index(["index", "a"]) + exp_index = MultiIndex.from_arrays([di, [0, 1, 2]], names=["index", "a"]) + exp = DataFrame({"b": [3, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp) def test_reset_index_with_intervals(self): - idx = IntervalIndex.from_breaks(np.arange(11), name='x') - original = DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']] + idx = IntervalIndex.from_breaks(np.arange(11), name="x") + 
original = DataFrame({"x": idx, "y": np.arange(10)})[["x", "y"]] - result = original.set_index('x') - expected = DataFrame({'y': np.arange(10)}, index=idx) + result = original.set_index("x") + expected = DataFrame({"y": np.arange(10)}, index=idx) tm.assert_frame_equal(result, expected) result2 = result.reset_index() tm.assert_frame_equal(result2, original) def test_set_index_multiindexcolumns(self): - columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) + columns = MultiIndex.from_tuples([("foo", 1), ("foo", 2), ("bar", 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) result = df.set_index(df.columns[0]) expected = df.iloc[:, 1:] @@ -559,30 +605,32 @@ def test_set_index_multiindexcolumns(self): def test_set_index_empty_column(self): # GH 1971 - df = DataFrame([ - {'a': 1, 'p': 0}, - {'a': 2, 'm': 10}, - {'a': 3, 'm': 11, 'p': 20}, - {'a': 4, 'm': 12, 'p': 21} - ], columns=('a', 'm', 'p', 'x')) - - result = df.set_index(['a', 'x']) - expected = df[['m', 'p']] - expected.index = MultiIndex.from_arrays([df['a'], df['x']], - names=['a', 'x']) + df = DataFrame( + [ + {"a": 1, "p": 0}, + {"a": 2, "m": 10}, + {"a": 3, "m": 11, "p": 20}, + {"a": 4, "m": 12, "p": 21}, + ], + columns=("a", "m", "p", "x"), + ) + + result = df.set_index(["a", "x"]) + expected = df[["m", "p"]] + expected.index = MultiIndex.from_arrays([df["a"], df["x"]], names=["a", "x"]) tm.assert_frame_equal(result, expected) def test_set_columns(self, float_string_frame): cols = Index(np.arange(len(float_string_frame.columns))) float_string_frame.columns = cols - with pytest.raises(ValueError, match='Length mismatch'): + with pytest.raises(ValueError, match="Length mismatch"): float_string_frame.columns = cols[::2] def test_dti_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') - idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern") + idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -591,13 +639,13 @@ def test_dti_set_index_reindex(self): # GH 11314 # with tz - index = date_range(datetime(2015, 10, 1), - datetime(2015, 10, 1, 23), - freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) - new_index = date_range(datetime(2015, 10, 2), - datetime(2015, 10, 2, 23), - freq='H', tz='US/Eastern') + index = date_range( + datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern" + ) + df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index) + new_index = date_range( + datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern" + ) result = df.set_index(new_index) assert result.index.freq == index.freq @@ -605,65 +653,58 @@ def test_dti_set_index_reindex(self): # Renaming def test_rename(self, float_frame): - mapping = { - 'A': 'a', - 'B': 'b', - 'C': 'c', - 'D': 'd' - } + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} renamed = float_frame.rename(columns=mapping) renamed2 = float_frame.rename(columns=str.lower) tm.assert_frame_equal(renamed, renamed2) - tm.assert_frame_equal(renamed2.rename(columns=str.upper), - float_frame, check_names=False) + tm.assert_frame_equal( + renamed2.rename(columns=str.upper), float_frame, check_names=False + ) # index - data = { - 'A': {'foo': 0, 'bar': 1} - } + data = {"A": {"foo": 0, "bar": 1}} # gets sorted alphabetical df = DataFrame(data) - 
renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal(renamed.index, Index(['foo', 'bar'])) + renamed = df.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["foo", "bar"])) renamed = df.rename(index=str.upper) - tm.assert_index_equal(renamed.index, Index(['BAR', 'FOO'])) + tm.assert_index_equal(renamed.index, Index(["BAR", "FOO"])) # have to pass something with pytest.raises(TypeError, match="must pass an index to rename"): float_frame.rename() # partial columns - renamed = float_frame.rename(columns={'C': 'foo', 'D': 'bar'}) - tm.assert_index_equal(renamed.columns, Index(['A', 'B', 'foo', 'bar'])) + renamed = float_frame.rename(columns={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.columns, Index(["A", "B", "foo", "bar"])) # other axis - renamed = float_frame.T.rename(index={'C': 'foo', 'D': 'bar'}) - tm.assert_index_equal(renamed.index, Index(['A', 'B', 'foo', 'bar'])) + renamed = float_frame.T.rename(index={"C": "foo", "D": "bar"}) + tm.assert_index_equal(renamed.index, Index(["A", "B", "foo", "bar"])) # index with name - index = Index(['foo', 'bar'], name='name') + index = Index(["foo", "bar"], name="name") renamer = DataFrame(data, index=index) - renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) - tm.assert_index_equal(renamed.index, - Index(['bar', 'foo'], name='name')) + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + tm.assert_index_equal(renamed.index, Index(["bar", "foo"], name="name")) assert renamed.index.name == renamer.index.name def test_rename_axis_inplace(self, float_frame): # GH 15704 - expected = float_frame.rename_axis('foo') + expected = float_frame.rename_axis("foo") result = float_frame.copy() - no_return = result.rename_axis('foo', inplace=True) + no_return = result.rename_axis("foo", inplace=True) assert no_return is None tm.assert_frame_equal(result, expected) - expected = float_frame.rename_axis('bar', axis=1) + expected = float_frame.rename_axis("bar", axis=1) result = float_frame.copy() - no_return = result.rename_axis('bar', axis=1, inplace=True) + no_return = result.rename_axis("bar", axis=1, inplace=True) assert no_return is None tm.assert_frame_equal(result, expected) @@ -681,65 +722,67 @@ def test_rename_axis_raises(self): df.rename_axis(id, axis=1) with pytest.raises(ValueError, match="Use `.rename`"): - df['A'].rename_axis(id) + df["A"].rename_axis(id) def test_rename_axis_mapper(self): # GH 19978 - mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['ll', 'nn']) - df = DataFrame({'x': [i for i in range(len(mi))], - 'y': [i * 10 for i in range(len(mi))]}, - index=mi) + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) + df = DataFrame( + {"x": [i for i in range(len(mi))], "y": [i * 10 for i in range(len(mi))]}, + index=mi, + ) # Test for rename of the Index object of columns - result = df.rename_axis('cols', axis=1) - tm.assert_index_equal(result.columns, - Index(['x', 'y'], name='cols')) + result = df.rename_axis("cols", axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="cols")) # Test for rename of the Index object of columns using dict - result = result.rename_axis(columns={'cols': 'new'}, axis=1) - tm.assert_index_equal(result.columns, - Index(['x', 'y'], name='new')) + result = result.rename_axis(columns={"cols": "new"}, axis=1) + tm.assert_index_equal(result.columns, Index(["x", "y"], name="new")) # Test for renaming index using dict - result = df.rename_axis(index={'ll': 'foo'}) - assert 
result.index.names == ['foo', 'nn'] + result = df.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] # Test for renaming index using a function result = df.rename_axis(index=str.upper, axis=0) - assert result.index.names == ['LL', 'NN'] + assert result.index.names == ["LL", "NN"] # Test for renaming index providing complete list - result = df.rename_axis(index=['foo', 'goo']) - assert result.index.names == ['foo', 'goo'] + result = df.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] # Test for changing index and columns at same time - sdf = df.reset_index().set_index('nn').drop(columns=['ll', 'y']) - result = sdf.rename_axis(index='foo', columns='meh') - assert result.index.name == 'foo' - assert result.columns.name == 'meh' + sdf = df.reset_index().set_index("nn").drop(columns=["ll", "y"]) + result = sdf.rename_axis(index="foo", columns="meh") + assert result.index.name == "foo" + assert result.columns.name == "meh" # Test different error cases - with pytest.raises(TypeError, match='Must pass'): - df.rename_axis(index='wrong') + with pytest.raises(TypeError, match="Must pass"): + df.rename_axis(index="wrong") - with pytest.raises(ValueError, match='Length of names'): - df.rename_axis(index=['wrong']) + with pytest.raises(ValueError, match="Length of names"): + df.rename_axis(index=["wrong"]) - with pytest.raises(TypeError, match='bogus'): + with pytest.raises(TypeError, match="bogus"): df.rename_axis(bogus=None) - @pytest.mark.parametrize('kwargs, rename_index, rename_columns', [ - ({'mapper': None, 'axis': 0}, True, False), - ({'mapper': None, 'axis': 1}, False, True), - ({'index': None}, True, False), - ({'columns': None}, False, True), - ({'index': None, 'columns': None}, True, True), - ({}, False, False)]) + @pytest.mark.parametrize( + "kwargs, rename_index, rename_columns", + [ + ({"mapper": None, "axis": 0}, True, False), + ({"mapper": None, "axis": 1}, False, True), + ({"index": None}, True, False), + ({"columns": None}, False, True), + ({"index": None, "columns": None}, True, True), + ({}, False, False), + ], + ) def test_rename_axis_none(self, kwargs, rename_index, rename_columns): # GH 25034 - index = Index(list('abc'), name='foo') - columns = Index(['col1', 'col2'], name='bar') + index = Index(list("abc"), name="foo") + columns = Index(["col1", "col2"], name="bar") data = np.arange(6).reshape(3, 2) df = DataFrame(data, index, columns) @@ -751,24 +794,25 @@ def test_rename_axis_none(self, kwargs, rename_index, rename_columns): def test_rename_multiindex(self): - tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] - tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] - index = MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) - columns = MultiIndex.from_tuples( - tuples_columns, names=['fizz', 'buzz']) + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) df = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) # # without specifying level -> across all levels - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - new_index = MultiIndex.from_tuples([('foo3', 'bar1'), - ('foo2', 'bar3')], - names=['foo', 'bar']) - new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), - ('fizz2', 'buzz3')], - names=['fizz', 'buzz']) + renamed = df.rename( + 
index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar3")], names=["foo", "bar"] + ) + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) tm.assert_index_equal(renamed.index, new_index) tm.assert_index_equal(renamed.columns, new_columns) assert renamed.index.names == df.index.names @@ -778,212 +822,213 @@ def test_rename_multiindex(self): # with specifying a level (GH13766) # dict - new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), - ('fizz2', 'buzz2')], - names=['fizz', 'buzz']) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=0) + new_columns = MultiIndex.from_tuples( + [("fizz3", "buzz1"), ("fizz2", "buzz2")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='fizz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") tm.assert_index_equal(renamed.columns, new_columns) - new_columns = MultiIndex.from_tuples([('fizz1', 'buzz1'), - ('fizz2', 'buzz3')], - names=['fizz', 'buzz']) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level=1) + new_columns = MultiIndex.from_tuples( + [("fizz1", "buzz1"), ("fizz2", "buzz3")], names=["fizz", "buzz"] + ) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, - level='buzz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # function func = str.upper - new_columns = MultiIndex.from_tuples([('FIZZ1', 'buzz1'), - ('FIZZ2', 'buzz2')], - names=['fizz', 'buzz']) + new_columns = MultiIndex.from_tuples( + [("FIZZ1", "buzz1"), ("FIZZ2", "buzz2")], names=["fizz", "buzz"] + ) renamed = df.rename(columns=func, level=0) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns=func, level='fizz') + renamed = df.rename(columns=func, level="fizz") tm.assert_index_equal(renamed.columns, new_columns) - new_columns = MultiIndex.from_tuples([('fizz1', 'BUZZ1'), - ('fizz2', 'BUZZ2')], - names=['fizz', 'buzz']) + new_columns = MultiIndex.from_tuples( + [("fizz1", "BUZZ1"), ("fizz2", "BUZZ2")], names=["fizz", "buzz"] + ) renamed = df.rename(columns=func, level=1) tm.assert_index_equal(renamed.columns, new_columns) - renamed = df.rename(columns=func, level='buzz') + renamed = df.rename(columns=func, level="buzz") tm.assert_index_equal(renamed.columns, new_columns) # index - new_index = MultiIndex.from_tuples([('foo3', 'bar1'), - ('foo2', 'bar2')], - names=['foo', 'bar']) - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, - level=0) + new_index = MultiIndex.from_tuples( + [("foo3", "bar1"), ("foo2", "bar2")], names=["foo", "bar"] + ) + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(renamed.index, new_index) def test_rename_nocopy(self, float_frame): - renamed = float_frame.rename(columns={'C': 'foo'}, copy=False) - renamed['foo'] = 1. 
- assert (float_frame['C'] == 1.).all() + renamed = float_frame.rename(columns={"C": "foo"}, copy=False) + renamed["foo"] = 1.0 + assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): - float_frame.rename(columns={'C': 'foo'}) - assert 'C' in float_frame - assert 'foo' not in float_frame + float_frame.rename(columns={"C": "foo"}) + assert "C" in float_frame + assert "foo" not in float_frame - c_id = id(float_frame['C']) + c_id = id(float_frame["C"]) float_frame = float_frame.copy() - float_frame.rename(columns={'C': 'foo'}, inplace=True) + float_frame.rename(columns={"C": "foo"}, inplace=True) - assert 'C' not in float_frame - assert 'foo' in float_frame - assert id(float_frame['foo']) != c_id + assert "C" not in float_frame + assert "foo" in float_frame + assert id(float_frame["foo"]) != c_id def test_rename_bug(self): # GH 5344 # rename set ref_locs, and set_index was not resetting - df = DataFrame({0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}) - df = df.rename(columns={0: 'a'}) - df = df.rename(columns={1: 'b'}) - df = df.set_index(['a', 'b']) - df.columns = ['2001-01-01'] - expected = DataFrame([[1], [2]], - index=MultiIndex.from_tuples( - [('foo', 'bah'), ('bar', 'bas')], - names=['a', 'b']), - columns=['2001-01-01']) + df = DataFrame({0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) + df = df.set_index(["a", "b"]) + df.columns = ["2001-01-01"] + expected = DataFrame( + [[1], [2]], + index=MultiIndex.from_tuples( + [("foo", "bah"), ("bar", "bas")], names=["a", "b"] + ), + columns=["2001-01-01"], + ) tm.assert_frame_equal(df, expected) def test_rename_bug2(self): # GH 19497 # rename was changing Index to MultiIndex if Index contained tuples - df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], - columns=["a"]) + df = DataFrame(data=np.arange(3), index=[(0, 0), (1, 1), (2, 2)], columns=["a"]) df = df.rename({(1, 1): (5, 4)}, axis="index") - expected = DataFrame(data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], - columns=["a"]) + expected = DataFrame( + data=np.arange(3), index=[(0, 0), (5, 4), (2, 2)], columns=["a"] + ) tm.assert_frame_equal(df, expected) def test_rename_errors_raises(self): - df = DataFrame(columns=['A', 'B', 'C', 'D']) - with pytest.raises(KeyError, match='\'E\'] not found in axis'): - df.rename(columns={'A': 'a', 'E': 'e'}, errors='raise') - - @pytest.mark.parametrize('mapper, errors, expected_columns', [ - ({'A': 'a', 'E': 'e'}, 'ignore', ['a', 'B', 'C', 'D']), - ({'A': 'a'}, 'raise', ['a', 'B', 'C', 'D']), - (str.lower, 'raise', ['a', 'b', 'c', 'd'])]) + df = DataFrame(columns=["A", "B", "C", "D"]) + with pytest.raises(KeyError, match="'E'] not found in axis"): + df.rename(columns={"A": "a", "E": "e"}, errors="raise") + + @pytest.mark.parametrize( + "mapper, errors, expected_columns", + [ + ({"A": "a", "E": "e"}, "ignore", ["a", "B", "C", "D"]), + ({"A": "a"}, "raise", ["a", "B", "C", "D"]), + (str.lower, "raise", ["a", "b", "c", "d"]), + ], + ) def test_rename_errors(self, mapper, errors, expected_columns): # GH 13473 # rename now works with errors parameter - df = DataFrame(columns=['A', 'B', 'C', 'D']) + df = DataFrame(columns=["A", "B", "C", "D"]) result = df.rename(columns=mapper, errors=errors) expected = DataFrame(columns=expected_columns) tm.assert_frame_equal(result, expected) def test_reorder_levels(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 
1, 0, 1]], - names=['L0', 'L1', 'L2']) - df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) + df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index) # no change, position result = df.reorder_levels([0, 1, 2]) tm.assert_frame_equal(df, result) # no change, labels - result = df.reorder_levels(['L0', 'L1', 'L2']) + result = df.reorder_levels(["L0", "L1", "L2"]) tm.assert_frame_equal(df, result) # rotate, position result = df.reorder_levels([1, 2, 0]) - e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - codes=[[0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], - names=['L1', 'L2', 'L0']) - expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, - index=e_idx) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) tm.assert_frame_equal(result, expected) result = df.reorder_levels([0, 0, 0]) - e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], - codes=[[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]], - names=['L0', 'L0', 'L0']) - expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, - index=e_idx) + e_idx = MultiIndex( + levels=[["bar"], ["bar"], ["bar"]], + codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], + names=["L0", "L0", "L0"], + ) + expected = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=e_idx) tm.assert_frame_equal(result, expected) - result = df.reorder_levels(['L0', 'L0', 'L0']) + result = df.reorder_levels(["L0", "L0", "L0"]) tm.assert_frame_equal(result, expected) def test_reset_index(self, float_frame): stacked = float_frame.stack()[::2] - stacked = DataFrame({'foo': stacked, 'bar': stacked}) + stacked = DataFrame({"foo": stacked, "bar": stacked}) - names = ['first', 'second'] + names = ["first", "second"] stacked.index.names = names deleveled = stacked.reset_index() - for i, (lev, level_codes) in enumerate(zip(stacked.index.levels, - stacked.index.codes)): + for i, (lev, level_codes) in enumerate( + zip(stacked.index.levels, stacked.index.codes) + ): values = lev.take(level_codes) name = names[i] tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() - tm.assert_series_equal(deleveled['first'], deleveled2['level_0'], - check_names=False) - tm.assert_series_equal(deleveled['second'], deleveled2['level_1'], - check_names=False) + tm.assert_series_equal( + deleveled["first"], deleveled2["level_0"], check_names=False + ) + tm.assert_series_equal( + deleveled["second"], deleveled2["level_1"], check_names=False + ) # default name assigned rdf = float_frame.reset_index() - exp = Series(float_frame.index.values, name='index') - tm.assert_series_equal(rdf['index'], exp) + exp = Series(float_frame.index.values, name="index") + tm.assert_series_equal(rdf["index"], exp) # default name assigned, corner case df = float_frame.copy() - df['index'] = 'foo' + df["index"] = "foo" rdf = df.reset_index() - exp = Series(float_frame.index.values, name='level_0') - tm.assert_series_equal(rdf['level_0'], exp) + exp = Series(float_frame.index.values, name="level_0") + tm.assert_series_equal(rdf["level_0"], exp) # but this is ok - 
float_frame.index.name = 'index' + float_frame.index.name = "index" deleveled = float_frame.reset_index() - tm.assert_series_equal(deleveled['index'], Series(float_frame.index)) - tm.assert_index_equal(deleveled.index, - Index(np.arange(len(deleveled)))) + tm.assert_series_equal(deleveled["index"], Series(float_frame.index)) + tm.assert_index_equal(deleveled.index, Index(np.arange(len(deleveled)))) # preserve column names - float_frame.columns.name = 'columns' + float_frame.columns.name = "columns" resetted = float_frame.reset_index() - assert resetted.columns.name == 'columns' + assert resetted.columns.name == "columns" # only remove certain columns - df = float_frame.reset_index().set_index(['index', 'A', 'B']) - rs = df.reset_index(['A', 'B']) + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index(["A", "B"]) # TODO should reset_index check_names ? tm.assert_frame_equal(rs, float_frame, check_names=False) - rs = df.reset_index(['index', 'A', 'B']) + rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) - rs = df.reset_index(['index', 'A', 'B']) + rs = df.reset_index(["index", "A", "B"]) tm.assert_frame_equal(rs, float_frame.reset_index(), check_names=False) - rs = df.reset_index('A') - xp = float_frame.reset_index().set_index(['index', 'B']) + rs = df.reset_index("A") + xp = float_frame.reset_index().set_index(["index", "B"]) tm.assert_frame_equal(rs, xp, check_names=False) # test resetting in place @@ -992,179 +1037,198 @@ def test_reset_index(self, float_frame): df.reset_index(inplace=True) tm.assert_frame_equal(df, resetted, check_names=False) - df = float_frame.reset_index().set_index(['index', 'A', 'B']) - rs = df.reset_index('A', drop=True) + df = float_frame.reset_index().set_index(["index", "A", "B"]) + rs = df.reset_index("A", drop=True) xp = float_frame.copy() - del xp['A'] - xp = xp.set_index(['B'], append=True) + del xp["A"] + xp = xp.set_index(["B"], append=True) tm.assert_frame_equal(rs, xp, check_names=False) def test_reset_index_name(self): - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'C', 'D'], - index=Index(range(2), name='x')) + df = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "C", "D"], + index=Index(range(2), name="x"), + ) assert df.reset_index().index.name is None assert df.reset_index(drop=True).index.name is None df.reset_index(inplace=True) assert df.index.name is None def test_reset_index_level(self): - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=['A', 'B', 'C', 'D']) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) - for levels in ['A', 'B'], [0, 1]: + for levels in ["A", "B"], [0, 1]: # With MultiIndex - result = df.set_index(['A', 'B']).reset_index(level=levels[0]) - tm.assert_frame_equal(result, df.set_index('B')) + result = df.set_index(["A", "B"]).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index("B")) - result = df.set_index(['A', 'B']).reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df.set_index('B')) + result = df.set_index(["A", "B"]).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index("B")) - result = df.set_index(['A', 'B']).reset_index(level=levels) + result = df.set_index(["A", "B"]).reset_index(level=levels) tm.assert_frame_equal(result, df) - result = df.set_index(['A', 'B']).reset_index(level=levels, - drop=True) - tm.assert_frame_equal(result, df[['C', 'D']]) + result = df.set_index(["A", 
"B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C", "D"]]) # With single-level Index (GH 16263) - result = df.set_index('A').reset_index(level=levels[0]) + result = df.set_index("A").reset_index(level=levels[0]) tm.assert_frame_equal(result, df) - result = df.set_index('A').reset_index(level=levels[:1]) + result = df.set_index("A").reset_index(level=levels[:1]) tm.assert_frame_equal(result, df) - result = df.set_index(['A']).reset_index(level=levels[0], - drop=True) - tm.assert_frame_equal(result, df[['B', 'C', 'D']]) + result = df.set_index(["A"]).reset_index(level=levels[0], drop=True) + tm.assert_frame_equal(result, df[["B", "C", "D"]]) # Missing levels - for both MultiIndex and single-level Index: - for idx_lev in ['A', 'B'], ['A']: - with pytest.raises(KeyError, match='Level E '): - df.set_index(idx_lev).reset_index(level=['A', 'E']) - with pytest.raises(IndexError, match='Too many levels'): + for idx_lev in ["A", "B"], ["A"]: + with pytest.raises(KeyError, match="Level E "): + df.set_index(idx_lev).reset_index(level=["A", "E"]) + with pytest.raises(IndexError, match="Too many levels"): df.set_index(idx_lev).reset_index(level=[0, 1, 2]) def test_reset_index_right_dtype(self): time = np.arange(0.0, 10, np.sqrt(2) / 2) - s1 = Series((9.81 * time ** 2) / 2, - index=Index(time, name='time'), - name='speed') + s1 = Series( + (9.81 * time ** 2) / 2, index=Index(time, name="time"), name="speed" + ) df = DataFrame(s1) resetted = s1.reset_index() - assert resetted['time'].dtype == np.float64 + assert resetted["time"].dtype == np.float64 resetted = df.reset_index() - assert resetted['time'].dtype == np.float64 + assert resetted["time"].dtype == np.float64 def test_reset_index_multiindex_col(self): vals = np.random.randn(3, 3).astype(object) - idx = ['x', 'y', 'z'] + idx = ["x", "y", "z"] full = np.hstack(([[x] for x in idx], vals)) - df = DataFrame(vals, Index(idx, name='a'), - columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) + df = DataFrame( + vals, + Index(idx, name="a"), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) rs = df.reset_index() - xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], - ['', 'mean', 'median', 'mean']]) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) rs = df.reset_index(col_fill=None) - xp = DataFrame(full, columns=[['a', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + xp = DataFrame( + full, columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) - rs = df.reset_index(col_level=1, col_fill='blah') - xp = DataFrame(full, columns=[['blah', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index(col_level=1, col_fill="blah") + xp = DataFrame( + full, columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]] + ) tm.assert_frame_equal(rs, xp) - df = DataFrame(vals, - MultiIndex.from_arrays([[0, 1, 2], ['x', 'y', 'z']], - names=['d', 'a']), - columns=[['b', 'b', 'c'], ['mean', 'median', 'mean']]) - rs = df.reset_index('a', ) - xp = DataFrame(full, Index([0, 1, 2], name='d'), - columns=[['a', 'b', 'b', 'c'], - ['', 'mean', 'median', 'mean']]) + df = DataFrame( + vals, + MultiIndex.from_arrays([[0, 1, 2], ["x", "y", "z"]], names=["d", "a"]), + columns=[["b", "b", "c"], ["mean", "median", "mean"]], + ) + rs = df.reset_index("a") + xp = DataFrame( + full, + Index([0, 1, 2], name="d"), + columns=[["a", "b", "b", "c"], ["", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, 
xp) - rs = df.reset_index('a', col_fill=None) - xp = DataFrame(full, Index(range(3), name='d'), - columns=[['a', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index("a", col_fill=None) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["a", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, xp) - rs = df.reset_index('a', col_fill='blah', col_level=1) - xp = DataFrame(full, Index(range(3), name='d'), - columns=[['blah', 'b', 'b', 'c'], - ['a', 'mean', 'median', 'mean']]) + rs = df.reset_index("a", col_fill="blah", col_level=1) + xp = DataFrame( + full, + Index(range(3), name="d"), + columns=[["blah", "b", "b", "c"], ["a", "mean", "median", "mean"]], + ) tm.assert_frame_equal(rs, xp) def test_reset_index_multiindex_nan(self): # GH6322, testing reset_index on MultiIndexes # when we have a nan or all nan - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [0, 1, np.nan], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + {"A": ["a", "b", "c"], "B": [0, 1, np.nan], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': [np.nan, 'b', 'c'], - 'B': [0, 1, 2], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + {"A": [np.nan, "b", "c"], "B": [0, 1, 2], "C": np.random.rand(3)} + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [0, 1, 2], - 'C': [np.nan, 1.1, 2.2]}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame({"A": ["a", "b", "c"], "B": [0, 1, 2], "C": [np.nan, 1.1, 2.2]}) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) - df = DataFrame({'A': ['a', 'b', 'c'], - 'B': [np.nan, np.nan, np.nan], - 'C': np.random.rand(3)}) - rs = df.set_index(['A', 'B']).reset_index() + df = DataFrame( + { + "A": ["a", "b", "c"], + "B": [np.nan, np.nan, np.nan], + "C": np.random.rand(3), + } + ) + rs = df.set_index(["A", "B"]).reset_index() tm.assert_frame_equal(rs, df) def test_reset_index_with_datetimeindex_cols(self): # GH5818 # - df = DataFrame([[1, 2], [3, 4]], - columns=date_range('1/1/2013', '1/2/2013'), - index=['A', 'B']) + df = DataFrame( + [[1, 2], [3, 4]], + columns=date_range("1/1/2013", "1/2/2013"), + index=["A", "B"], + ) result = df.reset_index() - expected = DataFrame([['A', 1, 2], ['B', 3, 4]], - columns=['index', datetime(2013, 1, 1), - datetime(2013, 1, 2)]) + expected = DataFrame( + [["A", 1, 2], ["B", 3, 4]], + columns=["index", datetime(2013, 1, 1), datetime(2013, 1, 2)], + ) tm.assert_frame_equal(result, expected) def test_reset_index_range(self): # GH 12071 - df = DataFrame([[0, 0], [1, 1]], columns=['A', 'B'], - index=RangeIndex(stop=2)) + df = DataFrame([[0, 0], [1, 1]], columns=["A", "B"], index=RangeIndex(stop=2)) result = df.reset_index() assert isinstance(result.index, RangeIndex) - expected = DataFrame([[0, 0, 0], [1, 1, 1]], - columns=['index', 'A', 'B'], - index=RangeIndex(stop=2)) + expected = DataFrame( + [[0, 0, 0], [1, 1, 1]], + columns=["index", "A", "B"], + index=RangeIndex(stop=2), + ) tm.assert_frame_equal(result, expected) def test_set_index_names(self): df = tm.makeDataFrame() - df.index.name = 'name' + df.index.name = "name" - assert df.set_index(df.index).index.names == ['name'] + assert df.set_index(df.index).index.names == ["name"] - mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) - mi2 = 
MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, - names=['A', 'B', 'C', 'D']) + mi = MultiIndex.from_arrays(df[["A", "B"]].T.values, names=["A", "B"]) + mi2 = MultiIndex.from_arrays( + df[["A", "B", "A", "B"]].T.values, names=["A", "B", "C", "D"] + ) - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) - assert df.set_index(df.index).index.names == ['A', 'B'] + assert df.set_index(df.index).index.names == ["A", "B"] # Check that set_index isn't converting a MultiIndex into an Index assert isinstance(df.set_index(df.index).index, MultiIndex) @@ -1172,7 +1236,7 @@ def test_set_index_names(self): # Check actual equality tm.assert_index_equal(df.set_index(df.index).index, mi) - idx2 = df.index.rename(['C', 'D']) + idx2 = df.index.rename(["C", "D"]) # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather # than a pair of tuples @@ -1184,55 +1248,56 @@ def test_set_index_names(self): def test_rename_objects(self, float_string_frame): renamed = float_string_frame.rename(columns=str.upper) - assert 'FOO' in renamed - assert 'foo' not in renamed + assert "FOO" in renamed + assert "foo" not in renamed def test_rename_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['X', 'Y']) - expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y']) + df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["X", "Y"]) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) result = df.rename(str.lower, axis=1) tm.assert_frame_equal(result, expected) - result = df.rename(str.lower, axis='columns') + result = df.rename(str.lower, axis="columns") tm.assert_frame_equal(result, expected) - result = df.rename({"A": 'a', 'B': 'b'}, axis=1) + result = df.rename({"A": "a", "B": "b"}, axis=1) tm.assert_frame_equal(result, expected) - result = df.rename({"A": 'a', 'B': 'b'}, axis='columns') + result = df.rename({"A": "a", "B": "b"}, axis="columns") tm.assert_frame_equal(result, expected) # Index - expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y']) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) result = df.rename(str.lower, axis=0) tm.assert_frame_equal(result, expected) - result = df.rename(str.lower, axis='index') + result = df.rename(str.lower, axis="index") tm.assert_frame_equal(result, expected) - result = df.rename({'X': 'x', 'Y': 'y'}, axis=0) + result = df.rename({"X": "x", "Y": "y"}, axis=0) tm.assert_frame_equal(result, expected) - result = df.rename({'X': 'x', 'Y': 'y'}, axis='index') + result = df.rename({"X": "x", "Y": "y"}, axis="index") tm.assert_frame_equal(result, expected) - result = df.rename(mapper=str.lower, axis='index') + result = df.rename(mapper=str.lower, axis="index") tm.assert_frame_equal(result, expected) def test_rename_mapper_multi(self): - df = DataFrame({"A": ['a', 'b'], "B": ['c', 'd'], - 'C': [1, 2]}).set_index(["A", "B"]) + df = DataFrame({"A": ["a", "b"], "B": ["c", "d"], "C": [1, 2]}).set_index( + ["A", "B"] + ) result = df.rename(str.upper) expected = df.rename(index=str.upper) tm.assert_frame_equal(result, expected) def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=['X', 'Y']) + df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) result = df.rename(str.lower, columns=str.upper) - expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=['x', 'y']) + expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) 
tm.assert_frame_equal(result, expected) def test_rename_axis_style_raises(self): @@ -1240,8 +1305,7 @@ def test_rename_axis_style_raises(self): df = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["0", "1"]) # Named target and axis - over_spec_msg = ("Cannot specify both 'axis' and " - "any of 'index' or 'columns'") + over_spec_msg = "Cannot specify both 'axis' and " "any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): df.rename(index=str.lower, axis=1) @@ -1269,62 +1333,62 @@ def test_rename_axis_style_raises(self): def test_reindex_api_equivalence(self): # equivalence of the labels/axis and index/columns API's - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.reindex(['b', 'a']) - res2 = df.reindex(index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a']) - res4 = df.reindex(labels=['b', 'a'], axis=0) - res5 = df.reindex(['b', 'a'], axis=0) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(columns=['e', 'd']) - res2 = df.reindex(['e', 'd'], axis=1) - res3 = df.reindex(labels=['e', 'd'], axis=1) + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(index=['b', 'a'], columns=['e', 'd']) - res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], - axis=1) + res1 = df.reindex(index=["b", "a"], columns=["e", "d"]) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) def test_rename_positional(self): - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) with tm.assert_produces_warning(FutureWarning) as rec: result = df.rename(None, str.lower) - expected = DataFrame(columns=['a', 'b']) + expected = DataFrame(columns=["a", "b"]) tm.assert_frame_equal(result, expected) assert len(rec) == 1 message = str(rec[0].message) - assert 'rename' in message - assert 'Use named arguments' in message + assert "rename" in message + assert "Use named arguments" in message def test_assign_columns(self, float_frame): - float_frame['hi'] = 'there' + float_frame["hi"] = "there" df = float_frame.copy() - df.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] - tm.assert_series_equal(float_frame['C'], df['baz'], check_names=False) - tm.assert_series_equal(float_frame['hi'], df['foo2'], - check_names=False) + df.columns = ["foo", "bar", "baz", "quux", "foo2"] + tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False) + tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False) def test_set_index_preserve_categorical_dtype(self): # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) - for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]: + df = DataFrame( + { + "A": [1, 2, 1, 1, 
2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) + for cols in ["C1", "C2", ["A", "C1"], ["A", "C2"], ["C1", "C2"]]: result = df.set_index(cols).reset_index() result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) @@ -1340,63 +1404,78 @@ def test_ambiguous_warns(self): def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) - assert parameters == {"self", "mapper", "index", "columns", "axis", - "inplace", "copy", "level", "errors"} + assert parameters == { + "self", + "mapper", + "index", + "columns", + "axis", + "inplace", + "copy", + "level", + "errors", + } def test_reindex_signature(self): sig = inspect.signature(DataFrame.reindex) parameters = set(sig.parameters) - assert parameters == {"self", "labels", "index", "columns", "axis", - "limit", "copy", "level", "method", - "fill_value", "tolerance"} + assert parameters == { + "self", + "labels", + "index", + "columns", + "axis", + "limit", + "copy", + "level", + "method", + "fill_value", + "tolerance", + } def test_droplevel(self): # GH20342 - df = DataFrame([ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12] - ]) - df = df.set_index([0, 1]).rename_axis(['a', 'b']) - df.columns = MultiIndex.from_tuples([('c', 'e'), ('d', 'f')], - names=['level_1', 'level_2']) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + df = df.set_index([0, 1]).rename_axis(["a", "b"]) + df.columns = MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) # test that dropping of a level in index works - expected = df.reset_index('a', drop=True) - result = df.droplevel('a', axis='index') + expected = df.reset_index("a", drop=True) + result = df.droplevel("a", axis="index") tm.assert_frame_equal(result, expected) # test that dropping of a level in columns works expected = df.copy() - expected.columns = Index(['c', 'd'], name='level_1') - result = df.droplevel('level_2', axis='columns') + expected.columns = Index(["c", "d"], name="level_1") + result = df.droplevel("level_2", axis="columns") tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestIntervalIndex: - def test_setitem(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = cut(df.A, 5) assert isinstance(s.cat.categories, IntervalIndex) # B & D end up as Categoricals # the remainer are converted to in-line objects # contining an IntervalIndex.values - df['B'] = s - df['C'] = np.array(s) - df['D'] = s.values - df['E'] = np.array(s.values) + df["B"] = s + df["C"] = np.array(s) + df["D"] = s.values + df["E"] = np.array(s.values) - assert is_categorical_dtype(df['B']) - assert is_interval_dtype(df['B'].cat.categories) - assert is_categorical_dtype(df['D']) - assert is_interval_dtype(df['D'].cat.categories) + assert is_categorical_dtype(df["B"]) + assert is_interval_dtype(df["B"].cat.categories) + assert is_categorical_dtype(df["D"]) + assert is_interval_dtype(df["D"].cat.categories) - assert is_object_dtype(df['C']) - assert is_object_dtype(df['E']) + assert is_object_dtype(df["C"]) + assert is_object_dtype(df["E"]) # they compare equal as Index # when converted to numpy objects @@ -1407,77 +1486,75 @@ def test_setitem(self): tm.assert_index_equal(c(df.B), c(df.D), check_names=False) # B & D are the same Series - tm.assert_series_equal(df['B'], df['B'], check_names=False) - 
tm.assert_series_equal(df['B'], df['D'], check_names=False) + tm.assert_series_equal(df["B"], df["B"], check_names=False) + tm.assert_series_equal(df["B"], df["D"], check_names=False) # C & E are the same Series - tm.assert_series_equal(df['C'], df['C'], check_names=False) - tm.assert_series_equal(df['C'], df['E'], check_names=False) + tm.assert_series_equal(df["C"], df["C"], check_names=False) + tm.assert_series_equal(df["C"], df["E"], check_names=False) def test_set_reset_index(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = cut(df.A, 5) - df['B'] = s - df = df.set_index('B') + df["B"] = s + df = df.set_index("B") df = df.reset_index() def test_set_axis_inplace(self): # GH14636 - df = DataFrame({'A': [1.1, 2.2, 3.3], - 'B': [5.0, 6.1, 7.2], - 'C': [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012]) - - expected = {0: df.copy(), - 1: df.copy()} - expected[0].index = list('abc') - expected[1].columns = list('abc') - expected['index'] = expected[0] - expected['columns'] = expected[1] + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + + expected = {0: df.copy(), 1: df.copy()} + expected[0].index = list("abc") + expected[1].columns = list("abc") + expected["index"] = expected[0] + expected["columns"] = expected[1] for axis in expected: # inplace=True # The FutureWarning comes from the fact that we would like to have # inplace default to False some day for inplace, warn in (None, FutureWarning), (True, None): - kwargs = {'inplace': inplace} + kwargs = {"inplace": inplace} result = df.copy() with tm.assert_produces_warning(warn): - result.set_axis(list('abc'), axis=axis, **kwargs) + result.set_axis(list("abc"), axis=axis, **kwargs) tm.assert_frame_equal(result, expected[axis]) # inplace=False - result = df.set_axis(list('abc'), axis=axis, inplace=False) + result = df.set_axis(list("abc"), axis=axis, inplace=False) tm.assert_frame_equal(expected[axis], result) # omitting the "axis" parameter with tm.assert_produces_warning(None): - result = df.set_axis(list('abc'), inplace=False) + result = df.set_axis(list("abc"), inplace=False) tm.assert_frame_equal(result, expected[0]) # wrong values for the "axis" parameter - for axis in 3, 'foo': - with pytest.raises(ValueError, match='No axis named'): - df.set_axis(list('abc'), axis=axis, inplace=False) + for axis in 3, "foo": + with pytest.raises(ValueError, match="No axis named"): + df.set_axis(list("abc"), axis=axis, inplace=False) def test_set_axis_prior_to_deprecation_signature(self): - df = DataFrame({'A': [1.1, 2.2, 3.3], - 'B': [5.0, 6.1, 7.2], - 'C': [4.4, 5.5, 6.6]}, - index=[2010, 2011, 2012]) - - expected = {0: df.copy(), - 1: df.copy()} - expected[0].index = list('abc') - expected[1].columns = list('abc') - expected['index'] = expected[0] - expected['columns'] = expected[1] + df = DataFrame( + {"A": [1.1, 2.2, 3.3], "B": [5.0, 6.1, 7.2], "C": [4.4, 5.5, 6.6]}, + index=[2010, 2011, 2012], + ) + + expected = {0: df.copy(), 1: df.copy()} + expected[0].index = list("abc") + expected[1].columns = list("abc") + expected["index"] = expected[0] + expected["columns"] = expected[1] # old signature for axis in expected: with tm.assert_produces_warning(FutureWarning): - result = df.set_axis(axis, list('abc'), inplace=False) + result = df.set_axis(axis, list("abc"), inplace=False) tm.assert_frame_equal(result, expected[axis]) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9921d91d6de8c..8c1534aa515e8 100644 --- 
a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -10,16 +10,32 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, date_range, isna, - notna, to_datetime, to_timedelta) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, + to_datetime, + to_timedelta, +) import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops import pandas.util.testing as tm -def assert_stat_op_calc(opname, alternative, frame, has_skipna=True, - check_dtype=True, check_dates=False, - check_less_precise=False, skipna_alternative=None): +def assert_stat_op_calc( + opname, + alternative, + frame, + has_skipna=True, + check_dtype=True, + check_dates=False, + check_less_precise=False, + skipna_alternative=None, +): """ Check that operator opname works as advertised on frame @@ -49,43 +65,53 @@ def assert_stat_op_calc(opname, alternative, frame, has_skipna=True, f = getattr(frame, opname) if check_dates: - df = DataFrame({'b': date_range('1/1/2001', periods=2)}) + df = DataFrame({"b": date_range("1/1/2001", periods=2)}) result = getattr(df, opname)() assert isinstance(result, Series) - df['a'] = range(len(df)) + df["a"] = range(len(df)) result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) if has_skipna: + def wrapper(x): return alternative(x.values) - skipna_wrapper = tm._make_skipna_wrapper(alternative, - skipna_alternative) + skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) - tm.assert_series_equal(result0, frame.apply(wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result0, + frame.apply(wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) # HACK: win32 - tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result1, + frame.apply(wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise, + ) else: skipna_wrapper = alternative result0 = f(axis=0) result1 = f(axis=1) - tm.assert_series_equal(result0, frame.apply(skipna_wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) - - if opname in ['sum', 'prod']: + tm.assert_series_equal( + result0, + frame.apply(skipna_wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise, + ) + + if opname in ["sum", "prod"]: expected = frame.apply(skipna_wrapper, axis=1) - tm.assert_series_equal(result1, expected, check_dtype=False, - check_less_precise=check_less_precise) + tm.assert_series_equal( + result1, expected, check_dtype=False, check_less_precise=check_less_precise + ) # check dtypes if check_dtype: @@ -94,7 +120,7 @@ def wrapper(x): assert lcd_dtype == result1.dtype # bad axis - with pytest.raises(ValueError, match='No axis named 2'): + with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case @@ -102,16 +128,15 @@ def wrapper(x): all_na = frame * np.NaN r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) - if opname in ['sum', 'prod']: - unit = 1 if opname == 'prod' else 0 # result for empty sum/prod + if opname in ["sum", "prod"]: + unit = 1 if opname == "prod" else 0 # result for empty sum/prod expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) tm.assert_series_equal(r0, expected) expected = pd.Series(unit, 
index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) -def assert_stat_op_api(opname, float_frame, float_string_frame, - has_numeric_only=False): +def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False): """ Check that API for operator opname works as advertised on frame @@ -158,6 +183,7 @@ def assert_bool_op_calc(opname, alternative, frame, has_skipna=True): f = getattr(frame, opname) if has_skipna: + def skipna_wrapper(x): nona = x.dropna().values return alternative(nona) @@ -169,8 +195,9 @@ def wrapper(x): result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper)) - tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False) # HACK: win32 + tm.assert_series_equal( + result1, frame.apply(wrapper, axis=1), check_dtype=False + ) # HACK: win32 else: skipna_wrapper = alternative wrapper = alternative @@ -179,11 +206,12 @@ def wrapper(x): result1 = f(axis=1) tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) - tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), - check_dtype=False) + tm.assert_series_equal( + result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False + ) # bad axis - with pytest.raises(ValueError, match='No axis named 2'): + with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case @@ -191,7 +219,7 @@ def wrapper(x): all_na = frame * np.NaN r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) - if opname == 'any': + if opname == "any": assert not r0.any() assert not r1.any() else: @@ -199,8 +227,9 @@ def wrapper(x): assert r1.all() -def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, - has_bool_only=False): +def assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=False +): """ Check that API for boolean operator opname works as advertised on frame @@ -217,7 +246,7 @@ def assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, """ # make sure op works on mixed-type frame mixed = float_string_frame - mixed['_bool_'] = np.random.randn(len(mixed)) > 0.5 + mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5 getattr(mixed, opname)(axis=0) getattr(mixed, opname)(axis=1) @@ -235,62 +264,69 @@ class TestDataFrameAnalytics: @td.skip_if_no_scipy def test_corr_pearson(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'pearson') + self._check_method(float_frame, "pearson") @td.skip_if_no_scipy def test_corr_kendall(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'kendall') + self._check_method(float_frame, "kendall") @td.skip_if_no_scipy def test_corr_spearman(self, float_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan - self._check_method(float_frame, 'spearman') + self._check_method(float_frame, "spearman") - def _check_method(self, frame, method='pearson'): + def _check_method(self, frame, method="pearson"): correls = frame.corr(method=method) - expected = frame['A'].corr(frame['C'], method=method) - tm.assert_almost_equal(correls['A']['C'], expected) + expected = frame["A"].corr(frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) @td.skip_if_no_scipy def 
test_corr_non_numeric(self, float_frame, float_string_frame): - float_frame['A'][:5] = np.nan - float_frame['B'][5:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][5:10] = np.nan # exclude non-numeric types result = float_string_frame.corr() - expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].corr() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr() tm.assert_frame_equal(result, expected) @td.skip_if_no_scipy - @pytest.mark.parametrize('meth', ['pearson', 'kendall', 'spearman']) + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) def test_corr_nooverlap(self, meth): # nothing in common - df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 1, 1.5, 1], - 'C': [np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]}) + df = DataFrame( + { + "A": [1, 1.5, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1.5, 1], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) rs = df.corr(meth) - assert isna(rs.loc['A', 'B']) - assert isna(rs.loc['B', 'A']) - assert rs.loc['A', 'A'] == 1 - assert rs.loc['B', 'B'] == 1 - assert isna(rs.loc['C', 'C']) + assert isna(rs.loc["A", "B"]) + assert isna(rs.loc["B", "A"]) + assert rs.loc["A", "A"] == 1 + assert rs.loc["B", "B"] == 1 + assert isna(rs.loc["C", "C"]) @td.skip_if_no_scipy - @pytest.mark.parametrize('meth', ['pearson', 'spearman']) + @pytest.mark.parametrize("meth", ["pearson", "spearman"]) def test_corr_constant(self, meth): # constant --> all NA - df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan], - 'B': [np.nan, np.nan, np.nan, 1, 1, 1]}) + df = DataFrame( + { + "A": [1, 1, 1, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, np.nan, 1, 1, 1], + } + ) rs = df.corr(meth) assert isna(rs.values).all() @@ -308,9 +344,8 @@ def test_corr_int_and_boolean(self): # so it need to be properly handled df = DataFrame({"a": [True, False], "b": [1, 0]}) - expected = DataFrame(np.ones((2, 2)), index=[ - 'a', 'b'], columns=['a', 'b']) - for meth in ['pearson', 'kendall', 'spearman']: + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + for meth in ["pearson", "kendall", "spearman"]: with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) @@ -319,9 +354,8 @@ def test_corr_int_and_boolean(self): def test_corr_cov_independent_index_column(self): # GH 14617 - df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), - columns=list("abcd")) - for method in ['cov', 'corr']: + df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4), columns=list("abcd")) + for method in ["cov", "corr"]: result = getattr(df, method)() assert result.index is not result.columns assert result.index.equals(result.columns) @@ -329,8 +363,9 @@ def test_corr_cov_independent_index_column(self): def test_corr_invalid_method(self): # GH 22298 df = pd.DataFrame(np.random.normal(size=(10, 2))) - msg = ("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, ") + msg = ( + "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " + ) with pytest.raises(ValueError, match=msg): df.corr(method="____") @@ -346,36 +381,39 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - frame['A'][:5] = np.nan - frame['B'][5:10] = np.nan + frame["A"][:5] = np.nan + frame["B"][5:10] = np.nan result = float_frame.cov(min_periods=len(float_frame) - 8) expected = float_frame.cov() - expected.loc['A', 'B'] = np.nan - expected.loc['B', 'A'] = np.nan + 
expected.loc["A", "B"] = np.nan + expected.loc["B", "A"] = np.nan # regular - float_frame['A'][:5] = np.nan - float_frame['B'][:10] = np.nan + float_frame["A"][:5] = np.nan + float_frame["B"][:10] = np.nan cov = float_frame.cov() - tm.assert_almost_equal(cov['A']['C'], - float_frame['A'].cov(float_frame['C'])) + tm.assert_almost_equal(cov["A"]["C"], float_frame["A"].cov(float_frame["C"])) # exclude non-numeric types result = float_string_frame.cov() - expected = float_string_frame.loc[:, ['A', 'B', 'C', 'D']].cov() + expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov() tm.assert_frame_equal(result, expected) # Single column frame df = DataFrame(np.linspace(0.0, 1.0, 10)) result = df.cov() - expected = DataFrame(np.cov(df.values.T).reshape((1, 1)), - index=df.columns, columns=df.columns) + expected = DataFrame( + np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns + ) tm.assert_frame_equal(result, expected) df.loc[0] = np.nan result = df.cov() - expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)), - index=df.columns, columns=df.columns) + expected = DataFrame( + np.cov(df.values[1:].T).reshape((1, 1)), + index=df.columns, + columns=df.columns, + ) tm.assert_frame_equal(result, expected) def test_corrwith(self, datetime_frame): @@ -386,39 +424,37 @@ def test_corrwith(self, datetime_frame): # make sure order does not matter b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) - del b['B'] + del b["B"] colcorr = a.corrwith(b, axis=0) - tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) + tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"])) rowcorr = a.corrwith(b, axis=1) tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) dropped = a.corrwith(b, axis=0, drop=True) - tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) - assert 'B' not in dropped + tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"])) + assert "B" not in dropped dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index # non time-series data - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns) - df2 = DataFrame(np.random.randn(4, 4), - index=index[:4], columns=columns) + df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) correls = df1.corrwith(df2, axis=1) for row in index[:4]: - tm.assert_almost_equal(correls[row], - df1.loc[row].corr(df2.loc[row])) + tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) def test_corrwith_with_objects(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() - cols = ['A', 'B', 'C', 'D'] + cols = ["A", "B", "C", "D"] - df1['obj'] = 'foo' - df2['obj'] = 'bar' + df1["obj"] = "foo" + df2["obj"] = "bar" result = df1.corrwith(df2) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) @@ -429,45 +465,42 @@ def test_corrwith_with_objects(self): tm.assert_series_equal(result, expected) def test_corrwith_series(self, datetime_frame): - result = datetime_frame.corrwith(datetime_frame['A']) - expected = datetime_frame.apply(datetime_frame['A'].corr) + result = datetime_frame.corrwith(datetime_frame["A"]) + expected = datetime_frame.apply(datetime_frame["A"].corr) tm.assert_series_equal(result, expected) def test_corrwith_matches_corrcoef(self): - df1 = DataFrame(np.arange(10000), columns=['a']) - df2 = DataFrame(np.arange(10000) ** 2, columns=['a']) - c1 = df1.corrwith(df2)['a'] - 
c2 = np.corrcoef(df1['a'], df2['a'])[0][1] + df1 = DataFrame(np.arange(10000), columns=["a"]) + df2 = DataFrame(np.arange(10000) ** 2, columns=["a"]) + c1 = df1.corrwith(df2)["a"] + c2 = np.corrcoef(df1["a"], df2["a"])[0][1] tm.assert_almost_equal(c1, c2) assert c1 < 1 def test_corrwith_mixed_dtypes(self): # GH 18570 - df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3], - 'c': ['a', 'b', 'c', 'd']}) + df = pd.DataFrame( + {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} + ) s = pd.Series([0, 6, 7, 3]) result = df.corrwith(s) - corrs = [df['a'].corr(s), df['b'].corr(s)] - expected = pd.Series(data=corrs, index=['a', 'b']) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = pd.Series(data=corrs, index=["a", "b"]) tm.assert_series_equal(result, expected) def test_corrwith_index_intersection(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), - columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), - columns=["a", "b", "c"]) + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=True).index.sort_values() expected = df1.columns.intersection(df2.columns).sort_values() tm.assert_index_equal(result, expected) def test_corrwith_index_union(self): - df1 = pd.DataFrame(np.random.random(size=(10, 2)), - columns=["a", "b"]) - df2 = pd.DataFrame(np.random.random(size=(10, 3)), - columns=["a", "b", "c"]) + df1 = pd.DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"]) result = df1.corrwith(df2, drop=False).index.sort_values() expected = df1.columns.union(df2.columns).sort_values() @@ -487,7 +520,7 @@ def test_corrwith_dup_cols(self): def test_corrwith_spearman(self): # GH 21925 df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df**2, method="spearman") + result = df.corrwith(df ** 2, method="spearman") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @@ -495,7 +528,7 @@ def test_corrwith_spearman(self): def test_corrwith_kendall(self): # GH 21925 df = pd.DataFrame(np.random.random(size=(100, 3))) - result = df.corrwith(df**2, method="kendall") + result = df.corrwith(df ** 2, method="kendall") expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) @@ -503,34 +536,40 @@ def test_corrwith_kendall(self): # Describe def test_bool_describe_in_mixed_frame(self): - df = DataFrame({ - 'string_data': ['a', 'b', 'c', 'd', 'e'], - 'bool_data': [True, True, False, False, False], - 'int_data': [10, 20, 30, 40, 50], - }) + df = DataFrame( + { + "string_data": ["a", "b", "c", "d", "e"], + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + } + ) # Integer data are included in .describe() output, # Boolean and string data are not. 
result = df.describe() - expected = DataFrame({'int_data': [5, 30, df.int_data.std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = DataFrame( + {"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) # Top value is a boolean value that is False - result = df.describe(include=['bool']) + result = df.describe(include=["bool"]) - expected = DataFrame({'bool_data': [5, 2, False, 3]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"] + ) tm.assert_frame_equal(result, expected) def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 df = pd.DataFrame({"A": [None, None]}, dtype=object) result = df.describe() - expected = pd.DataFrame({"A": [0, 0, np.nan, np.nan]}, dtype=object, - index=['count', 'unique', 'top', 'freq']) + expected = pd.DataFrame( + {"A": [0, 0, np.nan, np.nan]}, + dtype=object, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) result = df.iloc[:0].describe() @@ -538,45 +577,51 @@ def test_describe_empty_object(self): def test_describe_bool_frame(self): # GH 13891 - df = pd.DataFrame({ - 'bool_data_1': [False, False, True, True], - 'bool_data_2': [False, True, True, True] - }) + df = pd.DataFrame( + { + "bool_data_1": [False, False, True, True], + "bool_data_2": [False, True, True, True], + } + ) result = df.describe() - expected = DataFrame({'bool_data_1': [4, 2, True, 2], - 'bool_data_2': [4, 2, True, 3]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data_1": [4, 2, True, 2], "bool_data_2": [4, 2, True, 3]}, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({ - 'bool_data': [False, False, True, True, False], - 'int_data': [0, 1, 2, 3, 4] - }) + df = pd.DataFrame( + { + "bool_data": [False, False, True, True, False], + "int_data": [0, 1, 2, 3, 4], + } + ) result = df.describe() - expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1, - 2, 3, 4]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = DataFrame( + {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({ - 'bool_data': [False, False, True, True], - 'str_data': ['a', 'b', 'c', 'a'] - }) + df = pd.DataFrame( + {"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]} + ) result = df.describe() - expected = DataFrame({'bool_data': [4, 2, True, 2], - 'str_data': [4, 3, 'a', 2]}, - index=['count', 'unique', 'top', 'freq']) + expected = DataFrame( + {"bool_data": [4, 2, True, 2], "str_data": [4, 3, "a", 2]}, + index=["count", "unique", "top", "freq"], + ) tm.assert_frame_equal(result, expected) def test_describe_categorical(self): - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, 
range(0, 10500, 500), right=False, labels=cat_labels + ) cat = df # Categoricals should not show up together with numerical columns @@ -586,12 +631,12 @@ def test_describe_categorical(self): # In a frame, describe() for the cat should be the same as for string # arrays (count, unique, top, freq) - cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'], - ordered=True) + cat = Categorical( + ["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True + ) s = Series(cat) result = s.describe() - expected = Series([4, 2, "b", 3], - index=['count', 'unique', 'top', 'freq']) + expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"]) tm.assert_series_equal(result, expected) cat = Series(Categorical(["a", "b", "c", "c"])) @@ -605,9 +650,11 @@ def test_describe_empty_categorical_column(self): # also contains (count, unique, top, freq) df = pd.DataFrame({"empty_col": Categorical([])}) result = df.describe() - expected = DataFrame({'empty_col': [0, 0, np.nan, np.nan]}, - index=['count', 'unique', 'top', 'freq'], - dtype='object') + expected = DataFrame( + {"empty_col": [0, 0, np.nan, np.nan]}, + index=["count", "unique", "top", "freq"], + dtype="object", + ) tm.assert_frame_equal(result, expected) # ensure NaN, not None assert np.isnan(result.iloc[2, 0]) @@ -615,86 +662,113 @@ def test_describe_empty_categorical_column(self): def test_describe_categorical_columns(self): # GH 11558 - columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], - ordered=True, name='XXX') - df = DataFrame({'int1': [10, 20, 30, 40, 50], - 'int2': [10, 20, 30, 40, 50], - 'obj': ['A', 0, None, 'X', 1]}, - columns=columns) + columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX") + df = DataFrame( + { + "int1": [10, 20, 30, 40, 50], + "int2": [10, 20, 30, 40, 50], + "obj": ["A", 0, None, "X", 1], + }, + columns=columns, + ) result = df.describe() - exp_columns = pd.CategoricalIndex(['int1', 'int2'], - categories=['int1', 'int2', 'obj'], - ordered=True, name='XXX') - expected = DataFrame({'int1': [5, 30, df.int1.std(), - 10, 20, 30, 40, 50], - 'int2': [5, 30, df.int2.std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max'], - columns=exp_columns) + exp_columns = pd.CategoricalIndex( + ["int1", "int2"], + categories=["int1", "int2", "obj"], + ordered=True, + name="XXX", + ) + expected = DataFrame( + { + "int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50], + "int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + columns=exp_columns, + ) tm.assert_frame_equal(result, expected) - tm.assert_categorical_equal(result.columns.values, - expected.columns.values) + tm.assert_categorical_equal(result.columns.values, expected.columns.values) def test_describe_datetime_columns(self): - columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - freq='MS', tz='US/Eastern', name='XXX') - df = DataFrame({0: [10, 20, 30, 40, 50], - 1: [10, 20, 30, 40, 50], - 2: ['A', 0, None, 'X', 1]}) + columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], + freq="MS", + tz="US/Eastern", + name="XXX", + ) + df = DataFrame( + { + 0: [10, 20, 30, 40, 50], + 1: [10, 20, 30, 40, 50], + 2: ["A", 0, None, "X", 1], + } + ) df.columns = columns result = df.describe() - exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'], - freq='MS', tz='US/Eastern', name='XXX') - expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(), - 10, 20, 30, 40, 50], - 1: [5, 30, 
df.iloc[:, 1].std(), - 10, 20, 30, 40, 50]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + exp_columns = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX" + ) + expected = DataFrame( + { + 0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50], + 1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) expected.columns = exp_columns tm.assert_frame_equal(result, expected) - assert result.columns.freq == 'MS' + assert result.columns.freq == "MS" assert result.columns.tz == expected.columns.tz def test_describe_timedelta_values(self): # GH 6145 - t1 = pd.timedelta_range('1 days', freq='D', periods=5) - t2 = pd.timedelta_range('1 hours', freq='H', periods=5) - df = pd.DataFrame({'t1': t1, 't2': t2}) - - expected = DataFrame({'t1': [5, pd.Timedelta('3 days'), - df.iloc[:, 0].std(), - pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days'), - pd.Timedelta('4 days'), - pd.Timedelta('5 days')], - 't2': [5, pd.Timedelta('3 hours'), - df.iloc[:, 1].std(), - pd.Timedelta('1 hours'), - pd.Timedelta('2 hours'), - pd.Timedelta('3 hours'), - pd.Timedelta('4 hours'), - pd.Timedelta('5 hours')]}, - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + t1 = pd.timedelta_range("1 days", freq="D", periods=5) + t2 = pd.timedelta_range("1 hours", freq="H", periods=5) + df = pd.DataFrame({"t1": t1, "t2": t2}) + + expected = DataFrame( + { + "t1": [ + 5, + pd.Timedelta("3 days"), + df.iloc[:, 0].std(), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.Timedelta("4 days"), + pd.Timedelta("5 days"), + ], + "t2": [ + 5, + pd.Timedelta("3 hours"), + df.iloc[:, 1].std(), + pd.Timedelta("1 hours"), + pd.Timedelta("2 hours"), + pd.Timedelta("3 hours"), + pd.Timedelta("4 hours"), + pd.Timedelta("5 hours"), + ], + }, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) result = df.describe() tm.assert_frame_equal(result, expected) - exp_repr = (" t1 t2\n" - "count 5 5\n" - "mean 3 days 00:00:00 0 days 03:00:00\n" - "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" - "min 1 days 00:00:00 0 days 01:00:00\n" - "25% 2 days 00:00:00 0 days 02:00:00\n" - "50% 3 days 00:00:00 0 days 03:00:00\n" - "75% 4 days 00:00:00 0 days 04:00:00\n" - "max 5 days 00:00:00 0 days 05:00:00") + exp_repr = ( + " t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00" + ) assert repr(result) == exp_repr def test_describe_tz_values(self, tz_naive_fixture): @@ -704,62 +778,121 @@ def test_describe_tz_values(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s2 = Series(date_range(start, end, tz=tz)) - df = pd.DataFrame({'s1': s1, 's2': s2}) - - expected = DataFrame({'s1': [5, np.nan, np.nan, np.nan, np.nan, np.nan, - 2, 1.581139, 0, 1, 2, 3, 4], - 's2': [5, 5, s2.value_counts().index[0], 1, - start.tz_localize(tz), - end.tz_localize(tz), np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan, np.nan]}, - index=['count', 'unique', 'top', 'freq', 'first', - 'last', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] - ) - result = df.describe(include='all') + df = pd.DataFrame({"s1": s1, "s2": s2}) + + expected = DataFrame( + { 
+ "s1": [ + 5, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2, + 1.581139, + 0, + 1, + 2, + 3, + 4, + ], + "s2": [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + }, + index=[ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ], + ) + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): # Issue 26660 - df = pd.DataFrame({'x': [1]}) + df = pd.DataFrame({"x": [1]}) pct = np.linspace(0, 1, 10 + 1) result = df.describe(percentiles=pct) expected = DataFrame( - {'x': [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, - index=['count', 'mean', 'std', 'min', '0%', '10%', '20%', '30%', - '40%', '50%', '60%', '70%', '80%', '90%', '100%', 'max']) + {"x": [1.0, 1.0, np.NaN, 1.0, *[1.0 for _ in pct], 1.0]}, + index=[ + "count", + "mean", + "std", + "min", + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + "max", + ], + ) tm.assert_frame_equal(result, expected) + # --------------------------------------------------------------------- # Reductions def test_stat_op_api(self, float_frame, float_string_frame): - assert_stat_op_api('count', float_frame, float_string_frame, - has_numeric_only=True) - assert_stat_op_api('sum', float_frame, float_string_frame, - has_numeric_only=True) - - assert_stat_op_api('nunique', float_frame, float_string_frame) - assert_stat_op_api('mean', float_frame, float_string_frame) - assert_stat_op_api('product', float_frame, float_string_frame) - assert_stat_op_api('median', float_frame, float_string_frame) - assert_stat_op_api('min', float_frame, float_string_frame) - assert_stat_op_api('max', float_frame, float_string_frame) - assert_stat_op_api('mad', float_frame, float_string_frame) - assert_stat_op_api('var', float_frame, float_string_frame) - assert_stat_op_api('std', float_frame, float_string_frame) - assert_stat_op_api('sem', float_frame, float_string_frame) - assert_stat_op_api('median', float_frame, float_string_frame) + assert_stat_op_api( + "count", float_frame, float_string_frame, has_numeric_only=True + ) + assert_stat_op_api( + "sum", float_frame, float_string_frame, has_numeric_only=True + ) + + assert_stat_op_api("nunique", float_frame, float_string_frame) + assert_stat_op_api("mean", float_frame, float_string_frame) + assert_stat_op_api("product", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) + assert_stat_op_api("min", float_frame, float_string_frame) + assert_stat_op_api("max", float_frame, float_string_frame) + assert_stat_op_api("mad", float_frame, float_string_frame) + assert_stat_op_api("var", float_frame, float_string_frame) + assert_stat_op_api("std", float_frame, float_string_frame) + assert_stat_op_api("sem", float_frame, float_string_frame) + assert_stat_op_api("median", float_frame, float_string_frame) try: from scipy.stats import skew, kurtosis # noqa:F401 - assert_stat_op_api('skew', float_frame, float_string_frame) - assert_stat_op_api('kurt', float_frame, float_string_frame) + + assert_stat_op_api("skew", float_frame, float_string_frame) + assert_stat_op_api("kurt", float_frame, float_string_frame) except ImportError: pass def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame): - def count(s): return notna(s).sum() @@ -780,43 +913,61 @@ def sem(x): def 
skewness(x): from scipy.stats import skew # noqa:F811 + if len(x) < 3: return np.nan return skew(x, bias=False) def kurt(x): from scipy.stats import kurtosis # noqa:F811 + if len(x) < 4: return np.nan return kurtosis(x, bias=False) - assert_stat_op_calc('nunique', nunique, float_frame_with_na, - has_skipna=False, check_dtype=False, - check_dates=True) + assert_stat_op_calc( + "nunique", + nunique, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) # mixed types (with upcasting happening) - assert_stat_op_calc('sum', np.sum, mixed_float_frame.astype('float32'), - check_dtype=False, check_less_precise=True) - - assert_stat_op_calc('sum', np.sum, float_frame_with_na, - skipna_alternative=np.nansum) - assert_stat_op_calc('mean', np.mean, float_frame_with_na, - check_dates=True) - assert_stat_op_calc('product', np.prod, float_frame_with_na) - - assert_stat_op_calc('mad', mad, float_frame_with_na) - assert_stat_op_calc('var', var, float_frame_with_na) - assert_stat_op_calc('std', std, float_frame_with_na) - assert_stat_op_calc('sem', sem, float_frame_with_na) + assert_stat_op_calc( + "sum", + np.sum, + mixed_float_frame.astype("float32"), + check_dtype=False, + check_less_precise=True, + ) - assert_stat_op_calc('count', count, float_frame_with_na, - has_skipna=False, check_dtype=False, - check_dates=True) + assert_stat_op_calc( + "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum + ) + assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) + assert_stat_op_calc("product", np.prod, float_frame_with_na) + + assert_stat_op_calc("mad", mad, float_frame_with_na) + assert_stat_op_calc("var", var, float_frame_with_na) + assert_stat_op_calc("std", std, float_frame_with_na) + assert_stat_op_calc("sem", sem, float_frame_with_na) + + assert_stat_op_calc( + "count", + count, + float_frame_with_na, + has_skipna=False, + check_dtype=False, + check_dates=True, + ) try: from scipy import skew, kurtosis # noqa:F401 - assert_stat_op_calc('skew', skewness, float_frame_with_na) - assert_stat_op_calc('kurt', kurt, float_frame_with_na) + + assert_stat_op_calc("skew", skewness, float_frame_with_na) + assert_stat_op_calc("kurt", kurt, float_frame_with_na) except ImportError: pass @@ -828,89 +979,100 @@ def wrapper(x): return np.nan return np.median(x) - assert_stat_op_calc('median', wrapper, float_frame_with_na, - check_dates=True) - assert_stat_op_calc('median', wrapper, int_frame, check_dtype=False, - check_dates=True) + assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True) + assert_stat_op_calc( + "median", wrapper, int_frame, check_dtype=False, check_dates=True + ) - @pytest.mark.parametrize('method', ['sum', 'mean', 'prod', 'var', - 'std', 'skew', 'min', 'max']) + @pytest.mark.parametrize( + "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] + ) def test_stat_operators_attempt_obj_array(self, method): # GH#676 data = { - 'a': [-0.00049987540199591344, -0.0016467257772919831, - 0.00067695870775883013], - 'b': [-0, -0, 0.0], - 'c': [0.00031111847529610595, 0.0014902627951905339, - -0.00094099200035979691] + "a": [ + -0.00049987540199591344, + -0.0016467257772919831, + 0.00067695870775883013, + ], + "b": [-0, -0, 0.0], + "c": [ + 0.00031111847529610595, + 0.0014902627951905339, + -0.00094099200035979691, + ], } - df1 = DataFrame(data, index=['foo', 'bar', 'baz'], dtype='O') + df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O") - df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], - 2: [np.nan, 
4]}, dtype=object) + df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) for df in [df1, df2]: assert df.values.dtype == np.object_ result = getattr(df, method)(1) - expected = getattr(df.astype('f8'), method)(1) + expected = getattr(df.astype("f8"), method)(1) - if method in ['sum', 'prod']: + if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('op', ['mean', 'std', 'var', - 'skew', 'kurt', 'sem']) + @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 - df = DataFrame({'int': [1, 2, 3, 4], - 'float': [1., 2., 3., 4.], - 'str': ['a', 'b', 'c', 'd']}) + df = DataFrame( + { + "int": [1, 2, 3, 4], + "float": [1.0, 2.0, 3.0, 4.0], + "str": ["a", "b", "c", "d"], + } + ) result = getattr(df, op)() assert len(result) == 2 - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = getattr(df, op)() assert len(result) == 2 def test_reduce_mixed_frame(self): # GH 6806 - df = DataFrame({ - 'bool_data': [True, True, False, False, False], - 'int_data': [10, 20, 30, 40, 50], - 'string_data': ['a', 'b', 'c', 'd', 'e'], - }) - df.reindex(columns=['bool_data', 'int_data', 'string_data']) + df = DataFrame( + { + "bool_data": [True, True, False, False, False], + "int_data": [10, 20, 30, 40, 50], + "string_data": ["a", "b", "c", "d", "e"], + } + ) + df.reindex(columns=["bool_data", "int_data", "string_data"]) test = df.sum(axis=0) - tm.assert_numpy_array_equal(test.values, - np.array([2, 150, 'abcde'], dtype=object)) + tm.assert_numpy_array_equal( + test.values, np.array([2, 150, "abcde"], dtype=object) + ) tm.assert_series_equal(test, df.T.sum(axis=1)) def test_nunique(self): - df = DataFrame({'A': [1, 1, 1], - 'B': [1, 2, 3], - 'C': [1, np.nan, 3]}) - tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2})) - tm.assert_series_equal(df.nunique(dropna=False), - Series({'A': 1, 'B': 3, 'C': 3})) + df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) + tm.assert_series_equal( + df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) + ) tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) - tm.assert_series_equal(df.nunique(axis=1, dropna=False), - Series({0: 1, 1: 3, 2: 2})) + tm.assert_series_equal( + df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) + ) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 - df = pd.DataFrame({"A": [1, 1], - "B": [pd.Timestamp('2000', tz=tz)] * 2}) + df = pd.DataFrame({"A": [1, 1], "B": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() - expected = pd.Series([1.0], index=['A']) + expected = pd.Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_excludeds_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. 
- df = pd.DataFrame({"A": [pd.Timestamp('2000', tz=tz)] * 2}) + df = pd.DataFrame({"A": [pd.Timestamp("2000", tz=tz)] * 2}) result = df.mean() expected = pd.Series() tm.assert_series_equal(result, expected) @@ -928,28 +1090,27 @@ def test_var_std(self, datetime_frame): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() - @pytest.mark.parametrize( - "meth", ['sem', 'var', 'std']) + @pytest.mark.parametrize("meth", ["sem", "var", "std"]) def test_numeric_only_flag(self, meth): # GH 9201 - df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz']) + df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a number in str format - df1.loc[0, 'foo'] = '100' + df1.loc[0, "foo"] = "100" - df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz']) + df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a non-number str - df2.loc[0, 'foo'] = 'a' + df2.loc[0, "foo"] = "a" result = getattr(df1, meth)(axis=1, numeric_only=True) - expected = getattr(df1[['bar', 'baz']], meth)(axis=1) + expected = getattr(df1[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) result = getattr(df2, meth)(axis=1, numeric_only=True) - expected = getattr(df2[['bar', 'baz']], meth)(axis=1) + expected = getattr(df2[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside @@ -962,82 +1123,104 @@ def test_numeric_only_flag(self, meth): def test_sem(self, datetime_frame): result = datetime_frame.sem(ddof=4) - expected = datetime_frame.apply( - lambda x: x.std(ddof=4) / np.sqrt(len(x))) + expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nansem(arr, axis=0) assert not (result < 0).any() - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() @td.skip_if_no_scipy def test_kurt(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], - [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) df = DataFrame(np.random.randn(6, 3), index=index) kurt = df.kurt() - kurt2 = df.kurt(level=0).xs('bar') + kurt2 = df.kurt(level=0).xs("bar") tm.assert_series_equal(kurt, kurt2, check_names=False) assert kurt.name is None - assert kurt2.name == 'bar' - - @pytest.mark.parametrize("dropna, expected", [ - (True, {'A': [12], - 'B': [10.0], - 'C': [1.0], - 'D': ['a'], - 'E': Categorical(['a'], categories=['a']), - 'F': to_datetime(['2000-1-2']), - 'G': to_timedelta(['1 days'])}), - (False, {'A': [12], - 'B': [10.0], - 'C': [np.nan], - 'D': np.array([np.nan], dtype=object), - 'E': Categorical([np.nan], categories=['a']), - 'F': [pd.NaT], - 'G': to_timedelta([pd.NaT])}), - (True, {'H': [8, 9, np.nan, np.nan], - 'I': [8, 9, np.nan, np.nan], - 'J': [1, np.nan, np.nan, np.nan], - 'K': Categorical(['a', np.nan, np.nan, np.nan], - categories=['a']), - 'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']), - 'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']), - 'N': [0, 1, 2, 3]}), - 
(False, {'H': [8, 9, np.nan, np.nan], - 'I': [8, 9, np.nan, np.nan], - 'J': [1, np.nan, np.nan, np.nan], - 'K': Categorical([np.nan, 'a', np.nan, np.nan], - categories=['a']), - 'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - 'M': to_timedelta(['nan', '1 days', 'nan', 'nan']), - 'N': [0, 1, 2, 3]}) - ]) + assert kurt2.name == "bar" + + @pytest.mark.parametrize( + "dropna, expected", + [ + ( + True, + { + "A": [12], + "B": [10.0], + "C": [1.0], + "D": ["a"], + "E": Categorical(["a"], categories=["a"]), + "F": to_datetime(["2000-1-2"]), + "G": to_timedelta(["1 days"]), + }, + ), + ( + False, + { + "A": [12], + "B": [10.0], + "C": [np.nan], + "D": np.array([np.nan], dtype=object), + "E": Categorical([np.nan], categories=["a"]), + "F": [pd.NaT], + "G": to_timedelta([pd.NaT]), + }, + ), + ( + True, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), + "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ( + False, + { + "H": [8, 9, np.nan, np.nan], + "I": [8, 9, np.nan, np.nan], + "J": [1, np.nan, np.nan, np.nan], + "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), + "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["nan", "1 days", "nan", "nan"]), + "N": [0, 1, 2, 3], + }, + ), + ], + ) def test_mode_dropna(self, dropna, expected): - df = DataFrame({"A": [12, 12, 19, 11], - "B": [10, 10, np.nan, 3], - "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, 'a', np.nan], - "E": Categorical([np.nan, np.nan, 'a', np.nan]), - "F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']), - "G": to_timedelta(['1 days', 'nan', 'nan', 'nan']), - "H": [8, 8, 9, 9], - "I": [9, 9, 8, 8], - "J": [1, 1, np.nan, np.nan], - "K": Categorical(['a', np.nan, 'a', np.nan]), - "L": to_datetime(['2000-1-2', '2000-1-2', - 'NaT', 'NaT']), - "M": to_timedelta(['1 days', 'nan', - '1 days', 'nan']), - "N": np.arange(4, dtype='int64')}) + df = DataFrame( + { + "A": [12, 12, 19, 11], + "B": [10, 10, np.nan, 3], + "C": [1, np.nan, np.nan, np.nan], + "D": [np.nan, np.nan, "a", np.nan], + "E": Categorical([np.nan, np.nan, "a", np.nan]), + "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), + "G": to_timedelta(["1 days", "nan", "nan", "nan"]), + "H": [8, 8, 9, 9], + "I": [9, 9, 8, 8], + "J": [1, 1, np.nan, np.nan], + "K": Categorical(["a", np.nan, "a", np.nan]), + "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), + "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), + "N": np.arange(4, dtype="int64"), + } + ) result = df[sorted(list(expected.keys()))].mode(dropna=dropna) expected = DataFrame(expected) @@ -1047,89 +1230,101 @@ def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']}) - expected = DataFrame({'A': ['a', np.nan]}) + df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) + expected = DataFrame({"A": ["a", np.nan]}) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = df.mode(dropna=False) - result = result.sort_values(by='A').reset_index(drop=True) + result = result.sort_values(by="A").reset_index(drop=True) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): - df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'), - B=date_range('2012-1-2', periods=3, freq='D'), - C=Timestamp('20120101') - - 
timedelta(minutes=5, seconds=5))) + df = DataFrame( + dict( + A=date_range("2012-1-1", periods=3, freq="D"), + B=date_range("2012-1-2", periods=3, freq="D"), + C=Timestamp("20120101") - timedelta(minutes=5, seconds=5), + ) + ) - diffs = DataFrame(dict(A=df['A'] - df['C'], - B=df['A'] - df['B'])) + diffs = DataFrame(dict(A=df["A"] - df["C"], B=df["A"] - df["B"])) # min result = diffs.min() - assert result[0] == diffs.loc[0, 'A'] - assert result[1] == diffs.loc[0, 'B'] + assert result[0] == diffs.loc[0, "A"] + assert result[1] == diffs.loc[0, "B"] result = diffs.min(axis=1) - assert (result == diffs.loc[0, 'B']).all() + assert (result == diffs.loc[0, "B"]).all() # max result = diffs.max() - assert result[0] == diffs.loc[2, 'A'] - assert result[1] == diffs.loc[2, 'B'] + assert result[0] == diffs.loc[2, "A"] + assert result[1] == diffs.loc[2, "B"] result = diffs.max(axis=1) - assert (result == diffs['A']).all() + assert (result == diffs["A"]).all() # abs result = diffs.abs() result2 = abs(diffs) - expected = DataFrame(dict(A=df['A'] - df['C'], - B=df['B'] - df['A'])) + expected = DataFrame(dict(A=df["A"] - df["C"], B=df["B"] - df["A"])) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) # mixed frame mixed = diffs.copy() - mixed['C'] = 'foo' - mixed['D'] = 1 - mixed['E'] = 1. - mixed['F'] = Timestamp('20130101') + mixed["C"] = "foo" + mixed["D"] = 1 + mixed["E"] = 1.0 + mixed["F"] = Timestamp("20130101") # results in an object array result = mixed.min() - expected = Series([pd.Timedelta(timedelta(seconds=5 * 60 + 5)), - pd.Timedelta(timedelta(days=-1)), - 'foo', 1, 1.0, - Timestamp('20130101')], - index=mixed.columns) + expected = Series( + [ + pd.Timedelta(timedelta(seconds=5 * 60 + 5)), + pd.Timedelta(timedelta(days=-1)), + "foo", + 1, + 1.0, + Timestamp("20130101"), + ], + index=mixed.columns, + ) tm.assert_series_equal(result, expected) # excludes numeric result = mixed.min(axis=1) - expected = Series([1, 1, 1.], index=[0, 1, 2]) + expected = Series([1, 1, 1.0], index=[0, 1, 2]) tm.assert_series_equal(result, expected) # works when only those columns are selected - result = mixed[['A', 'B']].min(1) + result = mixed[["A", "B"]].min(1) expected = Series([timedelta(days=-1)] * 3) tm.assert_series_equal(result, expected) - result = mixed[['A', 'B']].min() - expected = Series([timedelta(seconds=5 * 60 + 5), - timedelta(days=-1)], index=['A', 'B']) + result = mixed[["A", "B"]].min() + expected = Series( + [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"] + ) tm.assert_series_equal(result, expected) # GH 3106 - df = DataFrame({'time': date_range('20130102', periods=5), - 'time2': date_range('20130105', periods=5)}) - df['off1'] = df['time2'] - df['time'] - assert df['off1'].dtype == 'timedelta64[ns]' + df = DataFrame( + { + "time": date_range("20130102", periods=5), + "time2": date_range("20130105", periods=5), + } + ) + df["off1"] = df["time2"] - df["time"] + assert df["off1"].dtype == "timedelta64[ns]" - df['off2'] = df['time'] - df['time2'] + df["off2"] = df["time"] - df["time2"] df._consolidate_inplace() - assert df['off1'].dtype == 'timedelta64[ns]' - assert df['off2'].dtype == 'timedelta64[ns]' + assert df["off1"].dtype == "timedelta64[ns]" + assert df["off2"].dtype == "timedelta64[ns]" def test_sum_corner(self): empty_frame = DataFrame() @@ -1141,18 +1336,15 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 - @pytest.mark.parametrize('method, unit', [ - ('sum', 0), - ('prod', 1), - ]) + 
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) def test_sum_prod_nanops(self, method, unit): - idx = ['a', 'b', 'c'] - df = pd.DataFrame({"a": [unit, unit], - "b": [unit, np.nan], - "c": [np.nan, np.nan]}) + idx = ["a", "b", "c"] + df = pd.DataFrame( + {"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]} + ) # The default result = getattr(df, method) - expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") # min_count=1 result = getattr(df, method)(min_count=1) @@ -1161,7 +1353,7 @@ def test_sum_prod_nanops(self, method, unit): # min_count=0 result = getattr(df, method)(min_count=0) - expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + expected = pd.Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) result = getattr(df.iloc[1:], method)(min_count=1) @@ -1171,25 +1363,23 @@ def test_sum_prod_nanops(self, method, unit): # min_count > 1 df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) result = getattr(df, method)(min_count=5) - expected = pd.Series(result, index=['A', 'B']) + expected = pd.Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) result = getattr(df, method)(min_count=6) - expected = pd.Series(result, index=['A', 'B']) + expected = pd.Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) def test_sum_nanops_timedelta(self): # prod isn't defined on timedeltas - idx = ['a', 'b', 'c'] - df = pd.DataFrame({"a": [0, 0], - "b": [0, np.nan], - "c": [np.nan, np.nan]}) + idx = ["a", "b", "c"] + df = pd.DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) df2 = df.apply(pd.to_timedelta) # 0 by default result = df2.sum() - expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) + expected = pd.Series([0, 0, 0], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) # min_count=0 @@ -1198,13 +1388,12 @@ def test_sum_nanops_timedelta(self): # min_count=1 result = df2.sum(min_count=1) - expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx) + expected = pd.Series([0, 0, np.nan], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) def test_sum_object(self, float_frame): values = float_frame.values.astype(int) - frame = DataFrame(values, index=float_frame.index, - columns=float_frame.columns) + frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns) deltas = frame * timedelta(1) deltas.sum() @@ -1227,49 +1416,49 @@ def test_mean_corner(self, float_frame, float_string_frame): tm.assert_index_equal(the_sum.index, the_mean.index) # take mean of boolean column - float_frame['bool'] = float_frame['A'] > 0 + float_frame["bool"] = float_frame["A"] > 0 means = float_frame.mean(0) - assert means['bool'] == float_frame['bool'].values.mean() + assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True - df = pd.DataFrame({ - 'A': np.arange(3), - 'B': pd.date_range('2016-01-01', periods=3), - 'C': pd.timedelta_range('1D', periods=3), - 'D': pd.period_range('2016', periods=3, freq='A') - }) + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) result = df.mean(numeric_only=True) - expected = pd.Series({'A': 1.}) + 
expected = pd.Series({"A": 1.0}) tm.assert_series_equal(result, expected) result = df.mean() - expected = pd.Series({ - 'A': 1., - 'C': df.loc[1, 'C'] - }) + expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="casts to object-dtype and then tries to " - "add timestamps", - raises=TypeError, strict=True) + @pytest.mark.xfail( + reason="casts to object-dtype and then tries to " "add timestamps", + raises=TypeError, + strict=True, + ) def test_mean_datetimelike_numeric_only_false(self): - df = pd.DataFrame({ - 'A': np.arange(3), - 'B': pd.date_range('2016-01-01', periods=3), - 'C': pd.timedelta_range('1D', periods=3), - 'D': pd.period_range('2016', periods=3, freq='A') - }) + df = pd.DataFrame( + { + "A": np.arange(3), + "B": pd.date_range("2016-01-01", periods=3), + "C": pd.timedelta_range("1D", periods=3), + "D": pd.period_range("2016", periods=3, freq="A"), + } + ) result = df.mean(numeric_only=False) - expected = pd.Series({ - 'A': 1, - 'B': df.loc[1, 'B'], - 'C': df.loc[1, 'C'], - 'D': df.loc[1, 'D'] - }) + expected = pd.Series( + {"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"], "D": df.loc[1, "D"]} + ) tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1288,8 +1477,7 @@ def test_sum_bools(self): # Cumulative Reductions - cumsum, cummax, ... def test_cumsum_corner(self): - dm = DataFrame(np.arange(20).reshape(4, 5), - index=range(4), columns=range(5)) + dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) # ?(wesm) result = dm.cumsum() # noqa @@ -1309,7 +1497,7 @@ def test_cumsum(self, datetime_frame): tm.assert_frame_equal(cumsum, expected) # works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cumsum() # noqa # fix issue @@ -1361,7 +1549,7 @@ def test_cummin(self, datetime_frame): tm.assert_frame_equal(cummin, expected) # it works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummin() # noqa # fix issue @@ -1384,7 +1572,7 @@ def test_cummax(self, datetime_frame): tm.assert_frame_equal(cummax, expected) # it works - df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) + df = DataFrame({"A": np.arange(20)}, index=np.arange(20)) result = df.cummax() # noqa # fix issue @@ -1428,17 +1616,16 @@ def test_count_objects(self, float_string_frame): def test_pct_change(self): # GH#11150 - pnl = DataFrame([np.arange(0, 40, 10), - np.arange(0, 40, 10), - np.arange(0, 40, 10)]).astype(np.float64) + pnl = DataFrame( + [np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) pnl.iat[1, 0] = np.nan pnl.iat[1, 1] = np.nan pnl.iat[2, 3] = 60 for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift( - axis=axis) - 1 - result = pnl.pct_change(axis=axis, fill_method='pad') + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") tm.assert_frame_equal(result, expected) @@ -1453,12 +1640,10 @@ def test_idxmin(self, float_frame, int_frame): for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmin(axis=axis, skipna=skipna) - expected = df.apply(Series.idxmin, axis=axis, - skipna=skipna) + expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No 
axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) @@ -1470,40 +1655,43 @@ def test_idxmax(self, float_frame, int_frame): for axis in [0, 1]: for df in [frame, int_frame]: result = df.idxmax(axis=axis, skipna=skipna) - expected = df.apply(Series.idxmax, axis=axis, - skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) # ---------------------------------------------------------------------- # Logical reductions - @pytest.mark.parametrize('opname', ['any', 'all']) + @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all(self, opname, bool_frame_with_na, float_string_frame): - assert_bool_op_calc(opname, getattr(np, opname), bool_frame_with_na, - has_skipna=True) - assert_bool_op_api(opname, bool_frame_with_na, float_string_frame, - has_bool_only=True) + assert_bool_op_calc( + opname, getattr(np, opname), bool_frame_with_na, has_skipna=True + ) + assert_bool_op_api( + opname, bool_frame_with_na, float_string_frame, has_bool_only=True + ) def test_any_all_extra(self): - df = DataFrame({ - 'A': [True, False, False], - 'B': [True, True, False], - 'C': [True, True, True], - }, index=['a', 'b', 'c']) - result = df[['A', 'B']].any(1) - expected = Series([True, True, False], index=['a', 'b', 'c']) + df = DataFrame( + { + "A": [True, False, False], + "B": [True, True, False], + "C": [True, True, True], + }, + index=["a", "b", "c"], + ) + result = df[["A", "B"]].any(1) + expected = Series([True, True, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) - result = df[['A', 'B']].any(1, bool_only=True) + result = df[["A", "B"]].any(1, bool_only=True) tm.assert_series_equal(result, expected) result = df.all(1) - expected = Series([True, False, False], index=['a', 'b', 'c']) + expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df.all(1, bool_only=True) @@ -1516,21 +1704,20 @@ def test_any_all_extra(self): result = df.any(axis=None).item() assert result is True - result = df[['C']].all(axis=None).item() + result = df[["C"]].all(axis=None).item() assert result is True def test_any_datetime(self): # GH 23070 float_data = [1, np.nan, 3, np.nan] - datetime_data = [pd.Timestamp('1960-02-15'), - pd.Timestamp('1960-02-16'), - pd.NaT, - pd.NaT] - df = DataFrame({ - "A": float_data, - "B": datetime_data - }) + datetime_data = [ + pd.Timestamp("1960-02-15"), + pd.Timestamp("1960-02-16"), + pd.NaT, + pd.NaT, + ] + df = DataFrame({"A": float_data, "B": datetime_data}) result = df.any(1) expected = Series([True, True, True, False]) @@ -1539,72 +1726,107 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 - df = DataFrame({"col1": [1, 2, 3], - "col2": [4, 5, 6], - "col3": [None, None, None]}) + df = DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + ) result = df.all(bool_only=True) expected = Series(dtype=np.bool) tm.assert_series_equal(result, expected) - df = DataFrame({"col1": [1, 2, 3], - "col2": [4, 5, 6], - "col3": [None, None, None], - "col4": [False, False, True]}) + df = DataFrame( + { + "col1": [1, 2, 3], + "col2": [4, 5, 6], + "col3": [None, None, None], + "col4": [False, False, True], + } + ) result = df.all(bool_only=True) expected = Series({"col4": False}) tm.assert_series_equal(result, 
expected) - @pytest.mark.parametrize('func, data, expected', [ - (np.any, {}, False), - (np.all, {}, True), - (np.any, {'A': []}, False), - (np.all, {'A': []}, True), - (np.any, {'A': [False, False]}, False), - (np.all, {'A': [False, False]}, False), - (np.any, {'A': [True, False]}, True), - (np.all, {'A': [True, False]}, False), - (np.any, {'A': [True, True]}, True), - (np.all, {'A': [True, True]}, True), - - (np.any, {'A': [False], 'B': [False]}, False), - (np.all, {'A': [False], 'B': [False]}, False), - - (np.any, {'A': [False, False], 'B': [False, True]}, True), - (np.all, {'A': [False, False], 'B': [False, True]}, False), - - # other types - (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), - (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), - (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), - (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), - pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, - marks=[td.skip_if_np_lt("1.15")]), - (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), - (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), - (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), - (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), - - # # Mix - # GH 21484 - # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), - # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), - ]) + @pytest.mark.parametrize( + "func, data, expected", + [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {"A": []}, False), + (np.all, {"A": []}, True), + (np.any, {"A": [False, False]}, False), + (np.all, {"A": [False, False]}, False), + (np.any, {"A": [True, False]}, True), + (np.all, {"A": [True, False]}, False), + (np.any, {"A": [True, True]}, True), + (np.all, {"A": [True, True]}, True), + (np.any, {"A": [False], "B": [False]}, False), + (np.all, {"A": [False], "B": [False]}, False), + (np.any, {"A": [False, False], "B": [False, True]}, True), + (np.all, {"A": [False, False], "B": [False, True]}, False), + # other types + (np.all, {"A": pd.Series([0.0, 1.0], dtype="float")}, False), + (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), + (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), + (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), + pytest.param( + np.all, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="M8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([0, 1], 
dtype="m8[ns]")}, + False, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([0, 1], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.all, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + pytest.param( + np.any, + {"A": pd.Series([1, 2], dtype="m8[ns]")}, + True, + marks=[td.skip_if_np_lt("1.15")], + ), + (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), + (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), + (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), + (np.any, {"A": pd.Series([1, 2], dtype="category")}, True), + # # Mix + # GH 21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ], + ) def test_any_all_np_func(self, func, data, expected): # GH 19976 data = DataFrame(data) @@ -1619,32 +1841,38 @@ def test_any_all_np_func(self, func, data, expected): def test_any_all_object(self): # GH 19976 - result = np.all(DataFrame(columns=['a', 'b'])).item() + result = np.all(DataFrame(columns=["a", "b"])).item() assert result is True - result = np.any(DataFrame(columns=['a', 'b'])).item() + result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False - @pytest.mark.parametrize('method', ['any', 'all']) + @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): df = DataFrame( {"A": 1}, - index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], - names=['out', 'in']) + index=MultiIndex.from_product( + [["A", "B"], ["a", "b"]], names=["out", "in"] + ), ) xpr = "Must specify 'axis' when aggregating by level." with pytest.raises(ValueError, match=xpr): - getattr(df, method)(axis=None, level='out') + getattr(df, method)(axis=None, level="out") # ---------------------------------------------------------------------- # Isin def test_isin(self): # GH 4211 - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}, - index=['foo', 'bar', 'baz', 'qux']) - other = ['a', 'b', 'c'] + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) + other = ["a", "b", "c"] result = df.isin(other) expected = DataFrame([df.loc[s].isin(other) for s in df.index]) @@ -1653,86 +1881,92 @@ def test_isin(self): @pytest.mark.parametrize("empty", [[], Series(), np.array([])]) def test_isin_empty(self, empty): # GH 16991 - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) expected = DataFrame(False, df.index, df.columns) result = df.isin(empty) tm.assert_frame_equal(result, expected) def test_isin_dict(self): - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - d = {'A': ['a']} + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + d = {"A": ["a"]} expected = DataFrame(False, df.index, df.columns) - expected.loc[0, 'A'] = True + expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) # non unique columns - df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) - df.columns = ['A', 'A'] + df = DataFrame({"A": ["a", "b", "c"], "B": ["a", "e", "f"]}) + df.columns = ["A", "A"] expected = DataFrame(False, df.index, df.columns) - expected.loc[0, 'A'] = True + expected.loc[0, "A"] = True result = df.isin(d) tm.assert_frame_equal(result, expected) def test_isin_with_string_scalar(self): # GH 
4763 - df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'], - 'ids2': ['a', 'n', 'c', 'n']}, - index=['foo', 'bar', 'baz', 'qux']) + df = DataFrame( + { + "vals": [1, 2, 3, 4], + "ids": ["a", "b", "f", "n"], + "ids2": ["a", "n", "c", "n"], + }, + index=["foo", "bar", "baz", "qux"], + ) with pytest.raises(TypeError): - df.isin('a') + df.isin("a") with pytest.raises(TypeError): - df.isin('aaa') + df.isin("aaa") def test_isin_df(self): - df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) - df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]}) + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) + df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) expected = DataFrame(False, df1.index, df1.columns) result = df1.isin(df2) - expected['A'].loc[[1, 3]] = True - expected['B'].loc[[0, 2]] = True + expected["A"].loc[[1, 3]] = True + expected["B"].loc[[0, 2]] = True tm.assert_frame_equal(result, expected) # partial overlapping columns - df2.columns = ['A', 'C'] + df2.columns = ["A", "C"] result = df1.isin(df2) - expected['B'] = False + expected["B"] = False tm.assert_frame_equal(result, expected) def test_isin_tuples(self): # GH 16394 - df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) - df['C'] = list(zip(df['A'], df['B'])) - result = df['C'].isin([(1, 'a')]) - tm.assert_series_equal(result, - Series([True, False, False], name="C")) + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "f"]}) + df["C"] = list(zip(df["A"], df["B"])) + result = df["C"].isin([(1, "a")]) + tm.assert_series_equal(result, Series([True, False, False], name="C")) def test_isin_df_dupe_values(self): - df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) + df1 = DataFrame({"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}) # just cols duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=['B', 'B']) + df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], columns=["B", "B"]) with pytest.raises(ValueError): df1.isin(df2) # just index duped - df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]], - columns=['A', 'B'], index=[0, 0, 1, 1]) + df2 = DataFrame( + [[0, 2], [12, 4], [2, np.nan], [4, 5]], + columns=["A", "B"], + index=[0, 0, 1, 1], + ) with pytest.raises(ValueError): df1.isin(df2) # cols and index: - df2.columns = ['B', 'B'] + df2.columns = ["B", "B"] with pytest.raises(ValueError): df1.isin(df2) def test_isin_dupe_self(self): - other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]}) - df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A']) + other = DataFrame({"A": [1, 0, 1, 0], "B": [1, 1, 0, 0]}) + df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=["A", "A"]) result = df.isin(other) expected = DataFrame(False, index=df.index, columns=df.columns) expected.loc[0] = True @@ -1740,26 +1974,40 @@ def test_isin_dupe_self(self): tm.assert_frame_equal(result, expected) def test_isin_against_series(self): - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, - index=['a', 'b', 'c', 'd']) - s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd']) + df = pd.DataFrame( + {"A": [1, 2, 3, 4], "B": [2, np.nan, 4, 4]}, index=["a", "b", "c", "d"] + ) + s = pd.Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) - expected['A'].loc['a'] = True - expected.loc['d'] = True + expected["A"].loc["a"] = True + expected.loc["d"] = True result = df.isin(s) tm.assert_frame_equal(result, expected) def test_isin_multiIndex(self): - idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 
'bar'), - (0, 'b', 'bar'), (0, 'b', 'baz'), - (2, 'a', 'foo'), (2, 'a', 'bar'), - (2, 'c', 'bar'), (2, 'c', 'baz'), - (1, 'b', 'foo'), (1, 'b', 'bar'), - (1, 'c', 'bar'), (1, 'c', 'baz')]) - df1 = DataFrame({'A': np.ones(12), - 'B': np.zeros(12)}, index=idx) - df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], - 'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]}) + idx = MultiIndex.from_tuples( + [ + (0, "a", "foo"), + (0, "a", "bar"), + (0, "b", "bar"), + (0, "b", "baz"), + (2, "a", "foo"), + (2, "a", "bar"), + (2, "c", "bar"), + (2, "c", "baz"), + (1, "b", "foo"), + (1, "b", "bar"), + (1, "c", "bar"), + (1, "c", "baz"), + ] + ) + df1 = DataFrame({"A": np.ones(12), "B": np.zeros(12)}, index=idx) + df2 = DataFrame( + { + "A": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + "B": [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1], + } + ) # against regular index expected = DataFrame(False, index=df1.index, columns=df1.columns) result = df1.isin(df2) @@ -1768,21 +2016,19 @@ def test_isin_multiIndex(self): df2.index = idx expected = df2.values.astype(np.bool) expected[:, 1] = ~expected[:, 1] - expected = DataFrame(expected, columns=['A', 'B'], index=idx) + expected = DataFrame(expected, columns=["A", "B"], index=idx) result = df1.isin(df2) tm.assert_frame_equal(result, expected) def test_isin_empty_datetimelike(self): # GH 15473 - df1_ts = DataFrame({'date': - pd.to_datetime(['2014-01-01', '2014-01-02'])}) - df1_td = DataFrame({'date': - [pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]}) - df2 = DataFrame({'date': []}) + df1_ts = DataFrame({"date": pd.to_datetime(["2014-01-01", "2014-01-02"])}) + df1_td = DataFrame({"date": [pd.Timedelta(1, "s"), pd.Timedelta(2, "s")]}) + df2 = DataFrame({"date": []}) df3 = DataFrame() - expected = DataFrame({'date': [False, False]}) + expected = DataFrame({"date": [False, False]}) result = df1_ts.isin(df2) tm.assert_frame_equal(result, expected) @@ -1805,18 +2051,17 @@ def test_round(self): tm.assert_frame_equal(df, df.round()) # Here's the test frame we'll be working with - df = DataFrame({'col1': [1.123, 2.123, 3.123], - 'col2': [1.234, 2.234, 3.234]}) + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. 
decimals=0) - expected_rounded = DataFrame( - {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) tm.assert_frame_equal(df.round(), expected_rounded) # Round with an integer decimals = 2 - expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12], - 'col2': [1.23, 2.23, 3.23]}) + expected_rounded = DataFrame( + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) tm.assert_frame_equal(df.round(decimals), expected_rounded) # This should also work with np.round (since np.round dispatches to @@ -1830,29 +2075,29 @@ def test_round(self): # Round with a dictionary expected_rounded = DataFrame( - {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]}) - round_dict = {'col1': 1, 'col2': 2} + {"col1": [1.1, 2.1, 3.1], "col2": [1.23, 2.23, 3.23]} + ) + round_dict = {"col1": 1, "col2": 2} tm.assert_frame_equal(df.round(round_dict), expected_rounded) # Incomplete dict expected_partially_rounded = DataFrame( - {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]}) - partial_round_dict = {'col2': 1} - tm.assert_frame_equal(df.round(partial_round_dict), - expected_partially_rounded) + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} + ) + partial_round_dict = {"col2": 1} + tm.assert_frame_equal(df.round(partial_round_dict), expected_partially_rounded) # Dict with unknown elements - wrong_round_dict = {'col3': 2, 'col2': 1} - tm.assert_frame_equal(df.round(wrong_round_dict), - expected_partially_rounded) + wrong_round_dict = {"col3": 2, "col2": 1} + tm.assert_frame_equal(df.round(wrong_round_dict), expected_partially_rounded) # float input to `decimals` - non_int_round_dict = {'col1': 1, 'col2': 0.5} + non_int_round_dict = {"col1": 1, "col2": 0.5} with pytest.raises(TypeError): df.round(non_int_round_dict) # String input - non_int_round_dict = {'col1': 1, 'col2': 'foo'} + non_int_round_dict = {"col1": 1, "col2": "foo"} with pytest.raises(TypeError): df.round(non_int_round_dict) @@ -1861,7 +2106,7 @@ def test_round(self): df.round(non_int_round_Series) # List input - non_int_round_dict = {'col1': 1, 'col2': [1, 2]} + non_int_round_dict = {"col1": 1, "col2": [1, 2]} with pytest.raises(TypeError): df.round(non_int_round_dict) @@ -1879,47 +2124,46 @@ def test_round(self): df.round(non_int_round_Series) # Negative numbers - negative_round_dict = {'col1': -1, 'col2': -2} + negative_round_dict = {"col1": -1, "col2": -2} big_df = df * 100 expected_neg_rounded = DataFrame( - {'col1': [110., 210, 310], 'col2': [100., 200, 300]}) - tm.assert_frame_equal(big_df.round(negative_round_dict), - expected_neg_rounded) + {"col1": [110.0, 210, 310], "col2": [100.0, 200, 300]} + ) + tm.assert_frame_equal(big_df.round(negative_round_dict), expected_neg_rounded) # nan in Series round - nan_round_Series = Series({'col1': np.nan, 'col2': 1}) + nan_round_Series = Series({"col1": np.nan, "col2": 1}) # TODO(wesm): unused? 
- expected_nan_round = DataFrame({ # noqa - 'col1': [1.123, 2.123, 3.123], - 'col2': [1.2, 2.2, 3.2]}) + expected_nan_round = DataFrame( + {"col1": [1.123, 2.123, 3.123], "col2": [1.2, 2.2, 3.2]} # noqa + ) with pytest.raises(TypeError): df.round(nan_round_Series) # Make sure this doesn't break existing Series.round - tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1']) + tm.assert_series_equal(df["col1"].round(1), expected_rounded["col1"]) # named columns # GH 11986 decimals = 2 expected_rounded = DataFrame( - {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]}) + {"col1": [1.12, 2.12, 3.12], "col2": [1.23, 2.23, 3.23]} + ) df.columns.name = "cols" expected_rounded.columns.name = "cols" tm.assert_frame_equal(df.round(decimals), expected_rounded) # interaction of named columns & series - tm.assert_series_equal(df['col1'].round(decimals), - expected_rounded['col1']) - tm.assert_series_equal(df.round(decimals)['col1'], - expected_rounded['col1']) + tm.assert_series_equal(df["col1"].round(decimals), expected_rounded["col1"]) + tm.assert_series_equal(df.round(decimals)["col1"], expected_rounded["col1"]) def test_numpy_round(self): # GH 12600 df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) out = np.round(df, decimals=0) - expected = DataFrame([[2., 1.], [0., 7.]]) + expected = DataFrame([[2.0, 1.0], [0.0, 7.0]]) tm.assert_frame_equal(out, expected) msg = "the 'out' parameter is not supported" @@ -1931,35 +2175,46 @@ def test_numpy_round_nan(self): df = Series([1.53, np.nan, 0.06]).to_frame() with tm.assert_produces_warning(None): result = df.round() - expected = Series([2., np.nan, 0.]).to_frame() + expected = Series([2.0, np.nan, 0.0]).to_frame() tm.assert_frame_equal(result, expected) def test_round_mixed_type(self): # GH 11885 - df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4], - 'col2': ['1', 'a', 'c', 'f'], - 'col3': date_range('20111111', periods=4)}) - round_0 = DataFrame({'col1': [1., 2., 3., 4.], - 'col2': ['1', 'a', 'c', 'f'], - 'col3': date_range('20111111', periods=4)}) + df = DataFrame( + { + "col1": [1.1, 2.2, 3.3, 4.4], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) + round_0 = DataFrame( + { + "col1": [1.0, 2.0, 3.0, 4.0], + "col2": ["1", "a", "c", "f"], + "col3": date_range("20111111", periods=4), + } + ) tm.assert_frame_equal(df.round(), round_0) tm.assert_frame_equal(df.round(1), df) - tm.assert_frame_equal(df.round({'col1': 1}), df) - tm.assert_frame_equal(df.round({'col1': 0}), round_0) - tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0) - tm.assert_frame_equal(df.round({'col3': 1}), df) + tm.assert_frame_equal(df.round({"col1": 1}), df) + tm.assert_frame_equal(df.round({"col1": 0}), round_0) + tm.assert_frame_equal(df.round({"col1": 0, "col2": 1}), round_0) + tm.assert_frame_equal(df.round({"col3": 1}), df) def test_round_issue(self): # GH 11611 - df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'], - index=['first', 'second', 'third']) + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) dfs = pd.concat((df, df), axis=1) rounded = dfs.round() tm.assert_index_equal(rounded.index, dfs.index) - decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A']) + decimals = pd.Series([1, 0, 2], index=["A", "B", "A"]) msg = "Index of decimals must be unique" with pytest.raises(ValueError, match=msg): df.round(decimals) @@ -1967,18 +2222,16 @@ def test_round_issue(self): def test_built_in_round(self): # GH 11763 # Here's the test frame we'll 
be working with - df = DataFrame( - {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]}) + df = DataFrame({"col1": [1.123, 2.123, 3.123], "col2": [1.234, 2.234, 3.234]}) # Default round to integer (i.e. decimals=0) - expected_rounded = DataFrame( - {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) + expected_rounded = DataFrame({"col1": [1.0, 2.0, 3.0], "col2": [1.0, 2.0, 3.0]}) tm.assert_frame_equal(round(df), expected_rounded) def test_round_nonunique_categorical(self): # See GH21809 - idx = pd.CategoricalIndex(['low'] * 3 + ['hi'] * 3) - df = pd.DataFrame(np.random.rand(6, 3), columns=list('abc')) + idx = pd.CategoricalIndex(["low"] * 3 + ["hi"] * 3) + df = pd.DataFrame(np.random.rand(6, 3), columns=list("abc")) expected = df.round(3) expected.index = idx @@ -2048,16 +2301,13 @@ def test_clip_mixed_numeric(self): # TODO(jreback) # clip on mixed integer or floats # with integer clippers coerces to float - df = DataFrame({'A': [1, 2, 3], - 'B': [1., np.nan, 3.]}) + df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]}) result = df.clip(1, 2) - expected = DataFrame({'A': [1, 2, 2], - 'B': [1., np.nan, 2.]}) + expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]}) tm.assert_frame_equal(result, expected, check_like=True) # GH 24162, clipping now preserves numeric types per column - df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], - columns=['foo', 'bar', 'baz']) + df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"]) expected = df.dtypes result = df.clip(upper=3).dtypes tm.assert_series_equal(result, expected) @@ -2093,21 +2343,20 @@ def test_clip_against_series(self, inplace): @pytest.mark.parametrize("inplace", [True, False]) @pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])]) - @pytest.mark.parametrize("axis,res", [ - (0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]), - (1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]]) - ]) - def test_clip_against_list_like(self, simple_frame, - inplace, lower, axis, res): + @pytest.mark.parametrize( + "axis,res", + [ + (0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]), + (1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]), + ], + ) + def test_clip_against_list_like(self, simple_frame, inplace, lower, axis, res): # GH 15390 original = simple_frame.copy(deep=True) - result = original.clip(lower=lower, upper=[5, 6, 7], - axis=axis, inplace=inplace) + result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace) - expected = pd.DataFrame(res, - columns=original.columns, - index=original.index) + expected = pd.DataFrame(res, columns=original.columns, index=original.index) if inplace: result = original tm.assert_frame_equal(result, expected, check_exact=True) @@ -2130,16 +2379,15 @@ def test_clip_against_frame(self, axis): def test_clip_against_unordered_columns(self): # GH 20911 - df1 = DataFrame(np.random.randn(1000, 4), columns=['A', 'B', 'C', 'D']) - df2 = DataFrame(np.random.randn(1000, 4), columns=['D', 'A', 'B', 'C']) - df3 = DataFrame(df2.values - 1, columns=['B', 'D', 'C', 'A']) + df1 = DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"]) + df2 = DataFrame(np.random.randn(1000, 4), columns=["D", "A", "B", "C"]) + df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"]) result_upper = df1.clip(lower=0, upper=df2) expected_upper = df1.clip(lower=0, upper=df2[df1.columns]) result_lower = df1.clip(lower=df3, upper=3) expected_lower = df1.clip(lower=df3[df1.columns], upper=3) result_lower_upper = df1.clip(lower=df3, upper=df2) - expected_lower_upper = 
df1.clip(lower=df3[df1.columns], - upper=df2[df1.columns]) + expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns]) tm.assert_frame_equal(result_upper, expected_upper) tm.assert_frame_equal(result_lower, expected_lower) tm.assert_frame_equal(result_lower_upper, expected_lower_upper) @@ -2148,48 +2396,50 @@ def test_clip_with_na_args(self, float_frame): """Should process np.nan argument as None """ # GH 17276 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) - tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), - float_frame) + tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame) # GH 19992 - df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6], - 'col_2': [7, 8, 9]}) + df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}) result = df.clip(lower=[4, 5, np.nan], axis=0) - expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan], - 'col_2': [7, 8, np.nan]}) + expected = DataFrame( + {"col_0": [4, 5, np.nan], "col_1": [4, 5, np.nan], "col_2": [7, 8, np.nan]} + ) tm.assert_frame_equal(result, expected) result = df.clip(lower=[4, 5, np.nan], axis=1) - expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6], - 'col_2': [np.nan, np.nan, np.nan]}) + expected = DataFrame( + {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [np.nan, np.nan, np.nan]} + ) tm.assert_frame_equal(result, expected) # --------------------------------------------------------------------- # Matrix-like def test_dot(self): - a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], - columns=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], - columns=['one', 'two']) + a = DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] + ) result = a.dot(b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) # Check alignment b1 = b.reindex(index=reversed(b.index)) result = a.dot(b) tm.assert_frame_equal(result, expected) # Check series argument - result = a.dot(b['one']) - tm.assert_series_equal(result, expected['one'], check_names=False) + result = a.dot(b["one"]) + tm.assert_series_equal(result, expected["one"], check_names=False) assert result.name is None - result = a.dot(b1['one']) - tm.assert_series_equal(result, expected['one'], check_names=False) + result = a.dot(b1["one"]) + tm.assert_series_equal(result, expected["one"], check_names=False) assert result.name is None # can pass correct-length arrays @@ -2199,7 +2449,7 @@ def test_dot(self): expected = a.dot(a.iloc[0]) tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match='Dot product shape mismatch'): + with pytest.raises(ValueError, match="Dot product shape mismatch"): a.dot(row[:-1]) a = np.random.rand(1, 5) @@ -2213,32 +2463,31 @@ def test_dot(self): result = A.dot(b) # unaligned - df = DataFrame(np.random.randn(3, 4), - index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), - index=range(5), columns=[1, 2, 3]) + df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) + df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - with pytest.raises(ValueError, match='aligned'): + with pytest.raises(ValueError, match="aligned"): df.dot(df2) def 
test_matmul(self): # matmul test is for GH 10259 - a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], - columns=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], - columns=['one', 'two']) + a = DataFrame( + np.random.randn(3, 4), index=["a", "b", "c"], columns=["p", "q", "r", "s"] + ) + b = DataFrame( + np.random.randn(4, 2), index=["p", "q", "r", "s"], columns=["one", "two"] + ) # DataFrame @ DataFrame result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # DataFrame @ Series result = operator.matmul(a, b.one) - expected = Series(np.dot(a.values, b.one.values), - index=['a', 'b', 'c']) + expected = Series(np.dot(a.values, b.one.values), index=["a", "b", "c"]) tm.assert_series_equal(result, expected) # np.array @ DataFrame @@ -2251,168 +2500,192 @@ def test_matmul(self): # nested list @ DataFrame (__rmatmul__) result = operator.matmul(a.values.tolist(), b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_almost_equal(result.values, expected.values) # mixed dtype DataFrame @ DataFrame - a['q'] = a.q.round().astype(int) + a["q"] = a.q.round().astype(int) result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # different dtypes DataFrame @ DataFrame a = a.astype(int) result = operator.matmul(a, b) - expected = DataFrame(np.dot(a.values, b.values), - index=['a', 'b', 'c'], - columns=['one', 'two']) + expected = DataFrame( + np.dot(a.values, b.values), index=["a", "b", "c"], columns=["one", "two"] + ) tm.assert_frame_equal(result, expected) # unaligned - df = DataFrame(np.random.randn(3, 4), - index=[1, 2, 3], columns=range(4)) - df2 = DataFrame(np.random.randn(5, 3), - index=range(5), columns=[1, 2, 3]) + df = DataFrame(np.random.randn(3, 4), index=[1, 2, 3], columns=range(4)) + df2 = DataFrame(np.random.randn(5, 3), index=range(5), columns=[1, 2, 3]) - with pytest.raises(ValueError, match='aligned'): + with pytest.raises(ValueError, match="aligned"): operator.matmul(df, df2) @pytest.fixture def df_duplicates(): - return pd.DataFrame({'a': [1, 2, 3, 4, 4], - 'b': [1, 1, 1, 1, 1], - 'c': [0, 1, 2, 5, 4]}, - index=[0, 0, 1, 1, 1]) + return pd.DataFrame( + {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, + index=[0, 0, 1, 1, 1], + ) @pytest.fixture def df_strings(): - return pd.DataFrame({'a': np.random.permutation(10), - 'b': list(ascii_lowercase[:10]), - 'c': np.random.permutation(10).astype('float64')}) + return pd.DataFrame( + { + "a": np.random.permutation(10), + "b": list(ascii_lowercase[:10]), + "c": np.random.permutation(10).astype("float64"), + } + ) @pytest.fixture def df_main_dtypes(): return pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 
'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) class TestNLargestNSmallest: - dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot " - "use method {method!r} with this dtype") + dtype_error_msg_template = ( + "Column {column!r} has dtype {dtype}, cannot " + "use method {method!r} with this dtype" + ) # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize('order', [ - ['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c']]) - @pytest.mark.parametrize('n', range(1, 11)) + @pytest.mark.parametrize( + "order", + [ + ["a"], + ["c"], + ["a", "b"], + ["a", "c"], + ["b", "a"], + ["b", "c"], + ["a", "b", "c"], + ["c", "a", "b"], + ["c", "b", "a"], + ["b", "c", "a"], + ["b", "a", "c"], + # dups! + ["b", "c", "c"], + ], + ) + @pytest.mark.parametrize("n", range(1, 11)) def test_n(self, df_strings, nselect_method, n, order): # GH 10393 df = df_strings - if 'b' in order: + if "b" in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=nselect_method, dtype='object') + column="b", method=nselect_method, dtype="object" + ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(n, order) else: - ascending = nselect_method == 'nsmallest' + ascending = nselect_method == "nsmallest" result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('columns', [ - ['group', 'category_string'], ['group', 'string']]) + @pytest.mark.parametrize( + "columns", [["group", "category_string"], ["group", "string"]] + ) def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=col, method=nselect_method, dtype=df[col].dtype) + column=col, method=nselect_method, dtype=df[col].dtype + ) # escape some characters that may be in the repr - error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") - .replace("[", "\\[").replace("]", "\\]")) + error_msg = ( + error_msg.replace("(", "\\(") + .replace(")", "\\)") + .replace("[", "\\[") + .replace("]", "\\]") + ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes - df.nsmallest(2, list(set(df) - {'category_string', 'string'})) - df.nlargest(2, list(set(df) - {'category_string', 'string'})) - - @pytest.mark.parametrize('method,expected', [ - ('nlargest', - pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]}, - index=[2, 1, 0, 3])), - ('nsmallest', - pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]}, - index=[5, 4, 3, 0]))]) + df.nsmallest(2, 
list(set(df) - {"category_string", "string"})) + df.nlargest(2, list(set(df) - {"category_string", "string"})) + + @pytest.mark.parametrize( + "method,expected", + [ + ( + "nlargest", + pd.DataFrame( + {"a": [2, 2, 2, 1], "b": [3, 2, 1, 3]}, index=[2, 1, 0, 3] + ), + ), + ( + "nsmallest", + pd.DataFrame( + {"a": [1, 1, 1, 2], "b": [1, 2, 3, 1]}, index=[5, 4, 3, 0] + ), + ), + ], + ) def test_duplicates_on_starter_columns(self, method, expected): # regression test for #22752 - df = pd.DataFrame({ - 'a': [2, 2, 2, 1, 1, 1], - 'b': [1, 2, 3, 3, 2, 1] - }) + df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) - result = getattr(df, method)(4, columns=['a', 'b']) + result = getattr(df, method)(4, columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_n_identical_values(self): # GH 15297 - df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]}) + df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) - result = df.nlargest(3, 'a') - expected = pd.DataFrame( - {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2] - ) + result = df.nlargest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}, index=[0, 1, 2]) tm.assert_frame_equal(result, expected) - result = df.nsmallest(3, 'a') - expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) + result = df.nsmallest(3, "a") + expected = pd.DataFrame({"a": [1] * 3, "b": [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('order', [ - ['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']]) - @pytest.mark.parametrize('n', range(1, 6)) + @pytest.mark.parametrize( + "order", + [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], + ) + @pytest.mark.parametrize("n", range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 @@ -2427,38 +2700,45 @@ def test_n_duplicate_index(self, df_duplicates, n, order): def test_duplicate_keep_all_ties(self): # GH 16818 - df = pd.DataFrame({'a': [5, 4, 4, 2, 3, 3, 3, 3], - 'b': [10, 9, 8, 7, 5, 50, 10, 20]}) - result = df.nlargest(4, 'a', keep='all') - expected = pd.DataFrame({'a': {0: 5, 1: 4, 2: 4, 4: 3, - 5: 3, 6: 3, 7: 3}, - 'b': {0: 10, 1: 9, 2: 8, 4: 5, - 5: 50, 6: 10, 7: 20}}) + df = pd.DataFrame( + {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} + ) + result = df.nlargest(4, "a", keep="all") + expected = pd.DataFrame( + { + "a": {0: 5, 1: 4, 2: 4, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {0: 10, 1: 9, 2: 8, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) tm.assert_frame_equal(result, expected) - result = df.nsmallest(2, 'a', keep='all') - expected = pd.DataFrame({'a': {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, - 'b': {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}}) + result = df.nsmallest(2, "a", keep="all") + expected = pd.DataFrame( + { + "a": {3: 2, 4: 3, 5: 3, 6: 3, 7: 3}, + "b": {3: 7, 4: 5, 5: 50, 6: 10, 7: 20}, + } + ) tm.assert_frame_equal(result, expected) def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 df = DataFrame([1.0, 1.0, 1.0]) - df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]}) + df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]}) s = Series([1, 1, 1]) s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): with tm.assert_produces_warning(FutureWarning): df_nan.clip_lower(s, axis=0) - for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']: + for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) def test_series_nat_conversion(self): # GH 18521 # Check rank does not mutate DataFrame - df = DataFrame(np.random.randn(10, 
3), dtype='float64') + df = DataFrame(np.random.randn(10, 3), dtype="float64") expected = df.copy() df.rank() result = df @@ -2468,15 +2748,16 @@ def test_multiindex_column_lookup(self): # Check whether tuples are correctly treated as multi-level lookups. # GH 23033 df = pd.DataFrame( - columns=pd.MultiIndex.from_product([['x'], ['a', 'b']]), - data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]]) + columns=pd.MultiIndex.from_product([["x"], ["a", "b"]]), + data=[[0.33, 0.13], [0.86, 0.25], [0.25, 0.70], [0.85, 0.91]], + ) # nsmallest - result = df.nsmallest(3, ('x', 'a')) + result = df.nsmallest(3, ("x", "a")) expected = df.iloc[[2, 0, 3]] tm.assert_frame_equal(result, expected) # nlargest - result = df.nlargest(3, ('x', 'b')) + result = df.nlargest(3, ("x", "b")) expected = df.iloc[[3, 2, 1]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 6372029f2efe7..93508d7ddc50b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,11 +7,21 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, SparseDataFrame, SparseDtype, compat, - date_range, timedelta_range) + Categorical, + DataFrame, + Series, + SparseDataFrame, + SparseDtype, + compat, + date_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) class SharedWithSparse: @@ -22,6 +32,7 @@ class SharedWithSparse: ``self._assert_series_equal()`` which are implemented in sub-classes and dispatch correctly. """ + def _assert_frame_equal(self, left, right): """Dispatch to frame class dependent assertion""" raise NotImplementedError @@ -33,79 +44,78 @@ def _assert_series_equal(self, left, right): def test_copy_index_name_checking(self, float_frame): # don't want to be able to modify the index stored elsewhere after # making a copy - for attr in ('index', 'columns'): + for attr in ("index", "columns"): ind = getattr(float_frame, attr) ind.name = None cp = float_frame.copy() - getattr(cp, attr).name = 'foo' + getattr(cp, attr).name = "foo" assert getattr(float_frame, attr).name is None def test_getitem_pop_assign_name(self, float_frame): - s = float_frame['A'] - assert s.name == 'A' + s = float_frame["A"] + assert s.name == "A" - s = float_frame.pop('A') - assert s.name == 'A' + s = float_frame.pop("A") + assert s.name == "A" - s = float_frame.loc[:, 'B'] - assert s.name == 'B' + s = float_frame.loc[:, "B"] + assert s.name == "B" s2 = s.loc[:] - assert s2.name == 'B' + assert s2.name == "B" def test_get_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = float_frame.get_value(idx, col) expected = float_frame[col][idx] tm.assert_almost_equal(result, expected) def test_add_prefix_suffix(self, float_frame): - with_prefix = float_frame.add_prefix('foo#') - expected = pd.Index(['foo#%s' % c for c in float_frame.columns]) + with_prefix = float_frame.add_prefix("foo#") + expected = pd.Index(["foo#%s" % c for c in float_frame.columns]) tm.assert_index_equal(with_prefix.columns, expected) - with_suffix = float_frame.add_suffix('#foo') - expected = pd.Index(['%s#foo' % c for c in float_frame.columns]) + with_suffix = float_frame.add_suffix("#foo") + expected 
= pd.Index(["%s#foo" % c for c in float_frame.columns]) tm.assert_index_equal(with_suffix.columns, expected) - with_pct_prefix = float_frame.add_prefix('%') - expected = pd.Index(['%{}'.format(c) for c in float_frame.columns]) + with_pct_prefix = float_frame.add_prefix("%") + expected = pd.Index(["%{}".format(c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_prefix.columns, expected) - with_pct_suffix = float_frame.add_suffix('%') - expected = pd.Index(['{}%'.format(c) for c in float_frame.columns]) + with_pct_suffix = float_frame.add_suffix("%") + expected = pd.Index(["{}%".format(c) for c in float_frame.columns]) tm.assert_index_equal(with_pct_suffix.columns, expected) def test_get_axis(self, float_frame): f = float_frame assert f._get_axis_number(0) == 0 assert f._get_axis_number(1) == 1 - assert f._get_axis_number('index') == 0 - assert f._get_axis_number('rows') == 0 - assert f._get_axis_number('columns') == 1 + assert f._get_axis_number("index") == 0 + assert f._get_axis_number("rows") == 0 + assert f._get_axis_number("columns") == 1 - assert f._get_axis_name(0) == 'index' - assert f._get_axis_name(1) == 'columns' - assert f._get_axis_name('index') == 'index' - assert f._get_axis_name('rows') == 'index' - assert f._get_axis_name('columns') == 'columns' + assert f._get_axis_name(0) == "index" + assert f._get_axis_name(1) == "columns" + assert f._get_axis_name("index") == "index" + assert f._get_axis_name("rows") == "index" + assert f._get_axis_name("columns") == "columns" assert f._get_axis(0) is f.index assert f._get_axis(1) is f.columns - with pytest.raises(ValueError, match='No axis named'): + with pytest.raises(ValueError, match="No axis named"): f._get_axis_number(2) - with pytest.raises(ValueError, match='No axis.*foo'): - f._get_axis_name('foo') + with pytest.raises(ValueError, match="No axis.*foo"): + f._get_axis_name("foo") - with pytest.raises(ValueError, match='No axis.*None'): + with pytest.raises(ValueError, match="No axis.*None"): f._get_axis_name(None) - with pytest.raises(ValueError, match='No axis named'): + with pytest.raises(ValueError, match="No axis named"): f._get_axis_number(None) def test_keys(self, float_frame): @@ -118,28 +128,28 @@ def test_column_contains_raises(self, float_frame): def test_tab_completion(self): # DataFrame whose columns are identifiers shall have them in __dir__. - df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD')) - for key in list('ABCD'): + df = pd.DataFrame([list("abcd"), list("efgh")], columns=list("ABCD")) + for key in list("ABCD"): assert key in dir(df) - assert isinstance(df.__getitem__('A'), pd.Series) + assert isinstance(df.__getitem__("A"), pd.Series) # DataFrame whose first-level columns are identifiers shall have # them in __dir__. 
df = pd.DataFrame( - [list('abcd'), list('efgh')], - columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH')))) - for key in list('ABCD'): + [list("abcd"), list("efgh")], + columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))), + ) + for key in list("ABCD"): assert key in dir(df) - for key in list('EFGH'): + for key in list("EFGH"): assert key not in dir(df) - assert isinstance(df.__getitem__('A'), pd.DataFrame) + assert isinstance(df.__getitem__("A"), pd.DataFrame) def test_not_hashable(self): empty_frame = DataFrame() df = self.klass([1]) - msg = ("'(Sparse)?DataFrame' objects are mutable, thus they cannot be" - " hashed") + msg = "'(Sparse)?DataFrame' objects are mutable, thus they cannot be" " hashed" with pytest.raises(TypeError, match=msg): hash(df) with pytest.raises(TypeError, match=msg): @@ -148,11 +158,11 @@ def test_not_hashable(self): def test_new_empty_index(self): df1 = self.klass(np.random.randn(0, 3)) df2 = self.klass(np.random.randn(0, 3)) - df1.index.name = 'foo' + df1.index.name = "foo" assert df2.index.name is None def test_array_interface(self, float_frame): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = np.sqrt(float_frame) assert isinstance(result, type(float_frame)) assert result.index is float_frame.index @@ -179,20 +189,18 @@ def test_nonzero(self, float_frame, float_string_frame): assert not float_string_frame.empty # corner case - df = DataFrame({'A': [1., 2., 3.], - 'B': ['a', 'b', 'c']}, - index=np.arange(3)) - del df['A'] + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3)) + del df["A"] assert not df.empty def test_iteritems(self): - df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + df = self.klass([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) for k, v in df.items(): assert isinstance(v, self.klass._constructor_sliced) def test_items(self): # GH 17213, GH 13918 - cols = ['a', 'b', 'c'] + cols = ["a", "b", "c"] df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols) for c, (k, v) in zip(cols, df.items()): assert c == k @@ -214,11 +222,14 @@ def test_iterrows(self, float_frame, float_string_frame): def test_iterrows_iso8601(self): # GH 19671 if self.klass == SparseDataFrame: - pytest.xfail(reason='SparseBlock datetime type not implemented.') + pytest.xfail(reason="SparseBlock datetime type not implemented.") s = self.klass( - {'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'], - 'iso8601': date_range('2000-01-01', periods=4, freq='M')}) + { + "non_iso8601": ["M1701", "M1802", "M1903", "M2004"], + "iso8601": date_range("2000-01-01", periods=4, freq="M"), + } + ) for k, v in s.iterrows(): exp = s.loc[k] self._assert_series_equal(v, exp) @@ -226,11 +237,22 @@ def test_iterrows_iso8601(self): def test_iterrows_corner(self): # gh-12222 df = DataFrame( - {'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'c': [None], - 'd': [''], 'e': [[]], 'f': [set()], 'g': [{}]}) + { + "a": [datetime.datetime(2015, 1, 1)], + "b": [None], + "c": [None], + "d": [""], + "e": [[]], + "f": [set()], + "g": [{}], + } + ) expected = Series( - [datetime.datetime(2015, 1, 1), None, None, '', [], set(), {}], - index=list('abcdefg'), name=0, dtype='object') + [datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}], + index=list("abcdefg"), + name=0, + dtype="object", + ) _, result = next(df.iterrows()) tm.assert_series_equal(result, expected) @@ -241,46 +263,49 @@ def test_itertuples(self, float_frame): expected = float_frame.iloc[i, :].reset_index(drop=True) self._assert_series_equal(s, 
expected) - df = self.klass({'floats': np.random.randn(5), - 'ints': range(5)}, columns=['floats', 'ints']) + df = self.klass( + {"floats": np.random.randn(5), "ints": range(5)}, columns=["floats", "ints"] + ) for tup in df.itertuples(index=False): assert isinstance(tup[1], int) df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]}) - dfaa = df[['a', 'a']] + dfaa = df[["a", "a"]] - assert (list(dfaa.itertuples()) == - [(0, 1, 1), (1, 2, 2), (2, 3, 3)]) + assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)] # repr with int on 32-bit/windows if not (compat.is_platform_windows() or compat.is_platform_32bit()): - assert (repr(list(df.itertuples(name=None))) == - '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]') + assert ( + repr(list(df.itertuples(name=None))) + == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" + ) - tup = next(df.itertuples(name='TestName')) - assert tup._fields == ('Index', 'a', 'b') + tup = next(df.itertuples(name="TestName")) + assert tup._fields == ("Index", "a", "b") assert (tup.Index, tup.a, tup.b) == tup - assert type(tup).__name__ == 'TestName' + assert type(tup).__name__ == "TestName" - df.columns = ['def', 'return'] - tup2 = next(df.itertuples(name='TestName')) + df.columns = ["def", "return"] + tup2 = next(df.itertuples(name="TestName")) assert tup2 == (0, 1, 4) - assert tup2._fields == ('Index', '_1', '_2') + assert tup2._fields == ("Index", "_1", "_2") - df3 = DataFrame({'f' + str(i): [i] for i in range(1024)}) + df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert not hasattr(tup3, '_fields') + assert not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) def test_sequence_like_with_categorical(self): # GH 7839 # make sure can iterate - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) - df['grade'] = Categorical(df['raw_grade']) + df = DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + df["grade"] = Categorical(df["raw_grade"]) # basic sequencing testing result = list(df.grade.values) @@ -314,16 +339,16 @@ def test_values(self, float_frame, float_string_frame): assert value == frame[col][i] # mixed type - arr = float_string_frame[['foo', 'A']].values - assert arr[0, 0] == 'bar' + arr = float_string_frame[["foo", "A"]].values + assert arr[0, 0] == "bar" - df = self.klass({'complex': [1j, 2j, 3j], 'real': [1, 2, 3]}) + df = self.klass({"complex": [1j, 2j, 3j], "real": [1, 2, 3]}) arr = df.values assert arr[0, 0] == 1j # single block corner case - arr = float_frame[['A', 'B']].values - expected = float_frame.reindex(columns=['A', 'B']).values + arr = float_frame[["A", "B"]].values + expected = float_frame.reindex(columns=["A", "B"]).values assert_almost_equal(arr, expected) def test_to_numpy(self): @@ -368,8 +393,10 @@ def test_swapaxes(self): self._assert_frame_equal(df.T, df.swapaxes(0, 1)) self._assert_frame_equal(df.T, df.swapaxes(1, 0)) self._assert_frame_equal(df, df.swapaxes(0, 0)) - msg = ("No axis named 2 for object type" - r" ") + msg = ( + "No axis named 2 for object type" + r" " + ) with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) @@ -378,11 +405,11 @@ def test_axis_aliases(self, float_frame): # reg name expected = f.sum(axis=0) - result = f.sum(axis='index') + result = f.sum(axis="index") assert_series_equal(result, expected) expected = f.sum(axis=1) - result = f.sum(axis='columns') + result = f.sum(axis="columns") assert_series_equal(result, expected) def 
test_class_axis(self): @@ -396,10 +423,11 @@ def test_more_values(self, float_string_frame): assert values.shape[1] == len(float_string_frame.columns) def test_repr_with_mi_nat(self, float_string_frame): - df = self.klass({'X': [1, 2]}, - index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']]) + df = self.klass( + {"X": [1, 2]}, index=[[pd.NaT, pd.Timestamp("20130101")], ["a", "b"]] + ) result = repr(df) - expected = ' X\nNaT a 1\n2013-01-01 b 2' + expected = " X\nNaT a 1\n2013-01-01 b 2" assert result == expected def test_iteritems_names(self, float_string_frame): @@ -416,26 +444,32 @@ def test_empty_nonzero(self): assert not df.empty df = self.klass(index=[1], columns=[1]) assert not df.empty - df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna() + df = self.klass(index=["a", "b"], columns=["c", "d"]).dropna() assert df.empty assert df.T.empty - empty_frames = [self.klass(), - self.klass(index=[1]), - self.klass(columns=[1]), - self.klass({1: []})] + empty_frames = [ + self.klass(), + self.klass(index=[1]), + self.klass(columns=[1]), + self.klass({1: []}), + ] for df in empty_frames: assert df.empty assert df.T.empty def test_with_datetimelikes(self): - df = self.klass({'A': date_range('20130101', periods=10), - 'B': timedelta_range('1 day', periods=10)}) + df = self.klass( + { + "A": date_range("20130101", periods=10), + "B": timedelta_range("1 day", periods=10), + } + ) t = df.T result = t.dtypes.value_counts() if self.klass is DataFrame: - expected = Series({np.dtype('object'): 10}) + expected = Series({np.dtype("object"): 10}) else: expected = Series({SparseDtype(dtype=object): 10}) tm.assert_series_equal(result, expected) @@ -449,7 +483,7 @@ class TestDataFrameMisc(SharedWithSparse): _assert_series_equal = staticmethod(assert_series_equal) def test_values(self, float_frame): - float_frame.values[:, 0] = 5. 
+ float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() def test_as_matrix_deprecated(self, float_frame): @@ -462,10 +496,10 @@ def test_as_matrix_deprecated(self, float_frame): def test_deepcopy(self, float_frame): cp = deepcopy(float_frame) - series = cp['A'] + series = cp["A"] series[:] = 10 for idx, value in series.items(): - assert float_frame['A'][idx] != value + assert float_frame["A"][idx] != value def test_transpose_get_view(self, float_frame): dft = float_frame.T @@ -476,9 +510,9 @@ def test_transpose_get_view(self, float_frame): def test_inplace_return_self(self): # GH 1893 - data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'], - 'b': [0, 0, 1, 1], - 'c': [1, 2, 3, 4]}) + data = DataFrame( + {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]} + ) def _check_f(base, f): result = f(base) @@ -487,19 +521,19 @@ def _check_f(base, f): # -----DataFrame----- # set_index - f = lambda x: x.set_index('a', inplace=True) + f = lambda x: x.set_index("a", inplace=True) _check_f(data.copy(), f) # reset_index f = lambda x: x.reset_index(inplace=True) - _check_f(data.set_index('a'), f) + _check_f(data.set_index("a"), f) # drop_duplicates f = lambda x: x.drop_duplicates(inplace=True) _check_f(data.copy(), f) # sort - f = lambda x: x.sort_values('b', inplace=True) + f = lambda x: x.sort_values("b", inplace=True) _check_f(data.copy(), f) # sort_index @@ -515,15 +549,15 @@ def _check_f(base, f): _check_f(data.copy(), f) # rename - f = lambda x: x.rename({1: 'foo'}, inplace=True) + f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(data.copy(), f) # -----Series----- - d = data.copy()['c'] + d = data.copy()["c"] # reset_index f = lambda x: x.reset_index(inplace=True, drop=True) - _check_f(data.set_index('a')['c'], f) + _check_f(data.set_index("a")["c"], f) # fillna f = lambda x: x.fillna(0, inplace=True) @@ -534,22 +568,22 @@ def _check_f(base, f): _check_f(d.copy(), f) # rename - f = lambda x: x.rename({1: 'foo'}, inplace=True) + f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) def test_tab_complete_warning(self, ip): # GH 16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; df = pd.DataFrame()" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('df.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("df.", 1)) def test_get_values_deprecated(self): - df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) + df = DataFrame({"a": [1, 2], "b": [0.1, 0.2]}) with tm.assert_produces_warning(FutureWarning): res = df.get_values() tm.assert_numpy_array_equal(res, df.values) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 0c09956b3f2fb..92912ff9ec093 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -24,22 +24,23 @@ def int_frame_const_col(): Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3] """ - df = DataFrame(np.tile(np.arange(3, dtype='int64'), 6).reshape(6, -1) + 1, - columns=['A', 'B', 'C']) + df = DataFrame( + np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1, + columns=["A", "B", "C"], + ) return df class TestDataFrameApply: - def test_apply(self, float_frame): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): # ufunc applied = float_frame.apply(np.sqrt) - 
tm.assert_series_equal(np.sqrt(float_frame['A']), applied['A']) + tm.assert_series_equal(np.sqrt(float_frame["A"]), applied["A"]) # aggregator applied = float_frame.apply(np.mean) - assert applied['A'] == np.mean(float_frame['A']) + assert applied["A"] == np.mean(float_frame["A"]) d = float_frame.index[0] applied = float_frame.apply(np.mean, axis=1) @@ -47,25 +48,27 @@ def test_apply(self, float_frame): assert applied.index is float_frame.index # want this # invalid axis - df = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) with pytest.raises(ValueError): df.apply(lambda x: x, 2) # GH 9573 - df = DataFrame({'c0': ['A', 'A', 'B', 'B'], - 'c1': ['C', 'C', 'D', 'D']}) - df = df.apply(lambda ts: ts.astype('category')) + df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]}) + df = df.apply(lambda ts: ts.astype("category")) assert df.shape == (4, 2) - assert isinstance(df['c0'].dtype, CategoricalDtype) - assert isinstance(df['c1'].dtype, CategoricalDtype) + assert isinstance(df["c0"].dtype, CategoricalDtype) + assert isinstance(df["c1"].dtype, CategoricalDtype) def test_apply_mixed_datetimelike(self): # mixed datetimelike # GH 7778 - df = DataFrame({'A': date_range('20130101', periods=3), - 'B': pd.to_timedelta(np.arange(3), unit='s')}) + df = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": pd.to_timedelta(np.arange(3), unit="s"), + } + ) result = df.apply(lambda x: x, axis=1) assert_frame_equal(result, df) @@ -90,8 +93,8 @@ def test_apply_empty(self, float_frame): assert_series_equal(result, expected) # GH 2476 - expected = DataFrame(index=['a']) - result = expected.apply(lambda x: x['a'], axis=1) + expected = DataFrame(index=["a"]) + result = expected.apply(lambda x: x["a"], axis=1) assert_frame_equal(expected, result) def test_apply_with_reduce_empty(self): @@ -99,18 +102,16 @@ def test_apply_with_reduce_empty(self): empty_frame = DataFrame() x = [] - result = empty_frame.apply(x.append, axis=1, result_type='expand') + result = empty_frame.apply(x.append, axis=1, result_type="expand") assert_frame_equal(result, empty_frame) - result = empty_frame.apply(x.append, axis=1, result_type='reduce') - assert_series_equal(result, Series( - [], index=pd.Index([], dtype=object))) + result = empty_frame.apply(x.append, axis=1, result_type="reduce") + assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) - empty_with_cols = DataFrame(columns=['a', 'b', 'c']) - result = empty_with_cols.apply(x.append, axis=1, result_type='expand') + empty_with_cols = DataFrame(columns=["a", "b", "c"]) + result = empty_with_cols.apply(x.append, axis=1, result_type="expand") assert_frame_equal(result, empty_with_cols) - result = empty_with_cols.apply(x.append, axis=1, result_type='reduce') - assert_series_equal(result, Series( - [], index=pd.Index([], dtype=object))) + result = empty_with_cols.apply(x.append, axis=1, result_type="reduce") + assert_series_equal(result, Series([], index=pd.Index([], dtype=object))) # Ensure that x.append hasn't been called assert x == [] @@ -123,24 +124,26 @@ def test_apply_deprecate_reduce(self): empty_frame.apply(x.append, axis=1, reduce=True) def test_apply_standard_nonunique(self): - df = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) + df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"]) result = df.apply(lambda s: s[0], axis=1) - expected = Series([1, 4, 7], ['a', 'a', 'c']) + expected = 
Series([1, 4, 7], ["a", "a", "c"]) assert_series_equal(result, expected) result = df.T.apply(lambda s: s[0], axis=0) assert_series_equal(result, expected) - @pytest.mark.parametrize('func', ['sum', 'mean', 'min', 'max', 'std']) - @pytest.mark.parametrize('args,kwds', [ - pytest.param([], {}, id='no_args_or_kwds'), - pytest.param([1], {}, id='axis_from_args'), - pytest.param([], {'axis': 1}, id='axis_from_kwds'), - pytest.param([], {'numeric_only': True}, id='optional_kwds'), - pytest.param([1, None], {'numeric_only': True}, id='args_and_kwds') - ]) + @pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"]) + @pytest.mark.parametrize( + "args,kwds", + [ + pytest.param([], {}, id="no_args_or_kwds"), + pytest.param([1], {}, id="axis_from_args"), + pytest.param([], {"axis": 1}, id="axis_from_kwds"), + pytest.param([], {"numeric_only": True}, id="optional_kwds"), + pytest.param([1, None], {"numeric_only": True}, id="args_and_kwds"), + ], + ) def test_apply_with_string_funcs(self, float_frame, func, args, kwds): result = float_frame.apply(func, *args, **kwds) expected = getattr(float_frame, func)(*args, **kwds) @@ -153,11 +156,11 @@ def test_apply_broadcast_deprecated(self, float_frame): def test_apply_broadcast(self, float_frame, int_frame_const_col): # scalars - result = float_frame.apply(np.mean, result_type='broadcast') + result = float_frame.apply(np.mean, result_type="broadcast") expected = DataFrame([float_frame.mean()], index=float_frame.index) tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.mean, axis=1, result_type='broadcast') + result = float_frame.apply(np.mean, axis=1, result_type="broadcast") m = float_frame.mean(axis=1) expected = DataFrame({c: m for c in float_frame.columns}) tm.assert_frame_equal(result, expected) @@ -166,31 +169,39 @@ def test_apply_broadcast(self, float_frame, int_frame_const_col): result = float_frame.apply( lambda x: list(range(len(float_frame.columns))), axis=1, - result_type='broadcast') + result_type="broadcast", + ) m = list(range(len(float_frame.columns))) - expected = DataFrame([m] * len(float_frame.index), - dtype='float64', - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + [m] * len(float_frame.index), + dtype="float64", + index=float_frame.index, + columns=float_frame.columns, + ) tm.assert_frame_equal(result, expected) - result = float_frame.apply(lambda x: - list(range(len(float_frame.index))), - result_type='broadcast') + result = float_frame.apply( + lambda x: list(range(len(float_frame.index))), result_type="broadcast" + ) m = list(range(len(float_frame.index))) - expected = DataFrame({c: m for c in float_frame.columns}, - dtype='float64', - index=float_frame.index) + expected = DataFrame( + {c: m for c in float_frame.columns}, + dtype="float64", + index=float_frame.index, + ) tm.assert_frame_equal(result, expected) # preserve columns df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") tm.assert_frame_equal(result, df) df = int_frame_const_col - result = df.apply(lambda x: Series([1, 2, 3], index=list('abc')), - axis=1, result_type='broadcast') + result = df.apply( + lambda x: Series([1, 2, 3], index=list("abc")), + axis=1, + result_type="broadcast", + ) expected = df.copy() tm.assert_frame_equal(result, expected) @@ -199,15 +210,18 @@ def test_apply_broadcast_error(self, int_frame_const_col): # > 1 ndim with pytest.raises(ValueError): - df.apply(lambda 
x: np.array([1, 2]).reshape(-1, 2), - axis=1, result_type='broadcast') + df.apply( + lambda x: np.array([1, 2]).reshape(-1, 2), + axis=1, + result_type="broadcast", + ) # cannot broadcast with pytest.raises(ValueError): - df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') + df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") with pytest.raises(ValueError): - df.apply(lambda x: Series([1, 2]), axis=1, result_type='broadcast') + df.apply(lambda x: Series([1, 2]), axis=1, result_type="broadcast") def test_apply_raw(self, float_frame): result0 = float_frame.apply(np.mean, raw=True) @@ -230,38 +244,37 @@ def test_apply_axis1(self, float_frame): assert tapplied[d] == np.mean(float_frame.xs(d)) def test_apply_ignore_failures(self, float_string_frame): - result = frame_apply(float_string_frame, np.mean, 0, - ignore_failures=True).apply_standard() + result = frame_apply( + float_string_frame, np.mean, 0, ignore_failures=True + ).apply_standard() expected = float_string_frame._get_numeric_data().apply(np.mean) assert_series_equal(result, expected) def test_apply_mixed_dtype_corner(self): - df = DataFrame({'A': ['foo'], - 'B': [1.]}) + df = DataFrame({"A": ["foo"], "B": [1.0]}) result = df[:0].apply(np.mean, axis=1) # the result here is actually kind of ambiguous, should it be a Series # or a DataFrame? - expected = Series(np.nan, index=pd.Index([], dtype='int64')) + expected = Series(np.nan, index=pd.Index([], dtype="int64")) assert_series_equal(result, expected) - df = DataFrame({'A': ['foo'], - 'B': [1.]}) - result = df.apply(lambda x: x['A'], axis=1) - expected = Series(['foo'], index=[0]) + df = DataFrame({"A": ["foo"], "B": [1.0]}) + result = df.apply(lambda x: x["A"], axis=1) + expected = Series(["foo"], index=[0]) assert_series_equal(result, expected) - result = df.apply(lambda x: x['B'], axis=1) - expected = Series([1.], index=[0]) + result = df.apply(lambda x: x["B"], axis=1) + expected = Series([1.0], index=[0]) assert_series_equal(result, expected) def test_apply_empty_infer_type(self): - no_cols = DataFrame(index=['a', 'b', 'c']) - no_index = DataFrame(columns=['a', 'b', 'c']) + no_cols = DataFrame(index=["a", "b", "c"]) + no_index = DataFrame(columns=["a", "b", "c"]) def _check(df, f): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - test_res = f(np.array([], dtype='f8')) + test_res = f(np.array([], dtype="f8")) is_reduction = not isinstance(test_res, np.ndarray) def _checkit(axis=0, raw=False): @@ -278,13 +291,13 @@ def _checkit(axis=0, raw=False): _checkit(raw=True) _checkit(axis=0, raw=True) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): _check(no_cols, lambda x: x) _check(no_cols, lambda x: x.mean()) _check(no_index, lambda x: x) _check(no_index, lambda x: x.mean()) - result = no_cols.apply(lambda x: x.mean(), result_type='broadcast') + result = no_cols.apply(lambda x: x.mean(), result_type="broadcast") assert isinstance(result, DataFrame) def test_apply_with_args_kwds(self, float_frame): @@ -306,7 +319,7 @@ def subtract_and_divide(x, sub, divide=1): assert_series_equal(result, expected) result = float_frame.apply(subtract_and_divide, args=(2,), divide=2) - expected = float_frame.apply(lambda x: (x - 2.) / 2.) 
+ expected = float_frame.apply(lambda x: (x - 2.0) / 2.0) assert_frame_equal(result, expected) def test_apply_yield_list(self, float_frame): @@ -314,7 +327,7 @@ def test_apply_yield_list(self, float_frame): assert_frame_equal(result, float_frame) def test_apply_reduce_Series(self, float_frame): - float_frame.loc[::2, 'A'] = np.nan + float_frame.loc[::2, "A"] = np.nan expected = float_frame.mean(1) result = float_frame.apply(np.mean, axis=1) assert_series_equal(result, expected) @@ -330,86 +343,165 @@ def test_apply_differently_indexed(self): df = DataFrame(np.random.randn(20, 10)) result0 = df.apply(Series.describe, axis=0) - expected0 = DataFrame({i: v.describe() - for i, v in df.items()}, - columns=df.columns) + expected0 = DataFrame( + {i: v.describe() for i, v in df.items()}, columns=df.columns + ) assert_frame_equal(result0, expected0) result1 = df.apply(Series.describe, axis=1) - expected1 = DataFrame({i: v.describe() - for i, v in df.T.items()}, - columns=df.index).T + expected1 = DataFrame( + {i: v.describe() for i, v in df.T.items()}, columns=df.index + ).T assert_frame_equal(result1, expected1) def test_apply_modify_traceback(self): - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - data.loc[4, 'C'] = np.nan + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + data.loc[4, "C"] = np.nan def transform(row): - if row['C'].startswith('shin') and row['A'] == 'foo': - row['D'] = 7 + if row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 return row def transform2(row): - if (notna(row['C']) and row['C'].startswith('shin') and - row['A'] == 'foo'): - row['D'] = 7 + if notna(row["C"]) and row["C"].startswith("shin") and row["A"] == "foo": + row["D"] = 7 return row try: data.apply(transform, axis=1) except AttributeError as e: assert len(e.args) == 2 - assert e.args[1] == 'occurred at index 4' + assert e.args[1] == "occurred at index 4" assert e.args[0] == "'float' object has no attribute 'startswith'" def test_apply_bug(self): # GH 6125 - positions = pd.DataFrame([[1, 'ABC0', 50], [1, 'YUM0', 20], - [1, 'DEF0', 20], [2, 'ABC1', 50], - [2, 'YUM1', 20], [2, 'DEF1', 20]], - columns=['a', 'market', 'position']) + positions = pd.DataFrame( + [ + [1, "ABC0", 50], + [1, "YUM0", 20], + [1, "DEF0", 20], + [2, "ABC1", 50], + [2, "YUM1", 20], + [2, "DEF1", 20], + ], + columns=["a", "market", "position"], + ) def f(r): - return r['market'] + return r["market"] + expected = positions.apply(f, axis=1) - positions = DataFrame([[datetime(2013, 1, 1), 'ABC0', 50], - [datetime(2013, 1, 2), 'YUM0', 20], - [datetime(2013, 1, 3), 'DEF0', 20], - [datetime(2013, 1, 4), 'ABC1', 50], - [datetime(2013, 1, 5), 'YUM1', 20], - [datetime(2013, 1, 6), 'DEF1', 20]], - columns=['a', 'market', 'position']) + positions = DataFrame( + [ + [datetime(2013, 1, 
1), "ABC0", 50], + [datetime(2013, 1, 2), "YUM0", 20], + [datetime(2013, 1, 3), "DEF0", 20], + [datetime(2013, 1, 4), "ABC1", 50], + [datetime(2013, 1, 5), "YUM1", 20], + [datetime(2013, 1, 6), "DEF1", 20], + ], + columns=["a", "market", "position"], + ) result = positions.apply(f, axis=1) assert_series_equal(result, expected) def test_apply_convert_objects(self): - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) result = data.apply(lambda x: x, axis=1) assert_frame_equal(result._convert(datetime=True), data) @@ -425,44 +517,43 @@ def test_apply_attach_name(self, float_frame): # non-reductions result = float_frame.apply(lambda x: np.repeat(x.name, len(x))) - expected = DataFrame(np.tile(float_frame.columns, - (len(float_frame.index), 1)), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + np.tile(float_frame.columns, (len(float_frame.index), 1)), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(result, expected) - result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), - axis=1) - expected = Series(np.repeat(t[0], len(float_frame.columns)) - for t in float_frame.itertuples()) + result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1) + expected = Series( + np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples() + ) expected.index = float_frame.index assert_series_equal(result, expected) def test_apply_multi_index(self, float_frame): - index = MultiIndex.from_arrays([['a', 'a', 'b'], ['c', 'd', 'd']]) - s = DataFrame([[1, 2], [3, 4], [5, 6]], - index=index, - columns=['col1', 'col2']) - result = s.apply( - lambda x: Series({'min': min(x), 'max': max(x)}), 1) - expected = DataFrame([[1, 2], [3, 4], [5, 6]], - index=index, - columns=['min', 'max']) + index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]]) + s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"]) + result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1) + expected = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"] + ) assert_frame_equal(result, expected, check_like=True) def test_apply_dict(self): # GH 8735 - A = DataFrame([['foo', 'bar'], ['spam', 'eggs']]) - A_dicts = Series([dict([(0, 'foo'), (1, 'spam')]), - dict([(0, 'bar'), (1, 'eggs')])]) + A = DataFrame([["foo", "bar"], ["spam", "eggs"]]) + A_dicts = Series( + [dict([(0, "foo"), (1, "spam")]), dict([(0, "bar"), (1, "eggs")])] + ) B = DataFrame([[0, 1], [2, 3]]) B_dicts = Series([dict([(0, 0), (1, 2)]), dict([(0, 1), (1, 3)])]) fn = lambda x: x.to_dict() for df, dicts in [(A, A_dicts), (B, B_dicts)]: - reduce_true = df.apply(fn, result_type='reduce') - reduce_false = df.apply(fn, 
result_type='expand') + reduce_true = df.apply(fn, result_type="reduce") + reduce_false = df.apply(fn, result_type="expand") reduce_none = df.apply(fn) assert_series_equal(reduce_true, dicts) @@ -476,21 +567,21 @@ def test_applymap(self, float_frame): # GH 465: function returning tuples result = float_frame.applymap(lambda x: (x, x)) - assert isinstance(result['A'][0], tuple) + assert isinstance(result["A"][0], tuple) # GH 2909: object conversion to float in constructor? - df = DataFrame(data=[1, 'a']) + df = DataFrame(data=[1, "a"]) result = df.applymap(lambda x: x) assert result.dtypes[0] == object - df = DataFrame(data=[1., 'a']) + df = DataFrame(data=[1.0, "a"]) result = df.applymap(lambda x: x) assert result.dtypes[0] == object # GH 2786 df = DataFrame(np.random.random((3, 4))) df2 = df.copy() - cols = ['a', 'a', 'a', 'a'] + cols = ["a", "a", "a", "a"] df.columns = cols expected = df2.applymap(str) @@ -499,17 +590,19 @@ def test_applymap(self, float_frame): tm.assert_frame_equal(result, expected) # datetime/timedelta - df['datetime'] = Timestamp('20130101') - df['timedelta'] = pd.Timedelta('1 min') + df["datetime"] = Timestamp("20130101") + df["timedelta"] = pd.Timedelta("1 min") result = df.applymap(str) - for f in ['datetime', 'timedelta']: + for f in ["datetime", "timedelta"]: assert result.loc[0, f] == str(df.loc[0, f]) # GH 8222 - empty_frames = [pd.DataFrame(), - pd.DataFrame(columns=list('ABC')), - pd.DataFrame(index=list('ABC')), - pd.DataFrame({'A': [], 'B': [], 'C': []})] + empty_frames = [ + pd.DataFrame(), + pd.DataFrame(columns=list("ABC")), + pd.DataFrame(index=list("ABC")), + pd.DataFrame({"A": [], "B": [], "C": []}), + ] for frame in empty_frames: for func in [round, lambda x: x]: result = frame.applymap(func) @@ -517,7 +610,7 @@ def test_applymap(self, float_frame): def test_applymap_box_timestamps(self): # GH 2689, GH 2627 - ser = pd.Series(date_range('1/1/2000', periods=10)) + ser = pd.Series(date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) @@ -527,52 +620,65 @@ def func(x): def test_applymap_box(self): # ufunc will not be boxed. 
Same test cases as the test_map_box - df = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02')], - 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')], - 'c': [pd.Timedelta('1 days'), - pd.Timedelta('2 days')], - 'd': [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')]}) - - result = df.applymap(lambda x: '{0}'.format(x.__class__.__name__)) - expected = pd.DataFrame({'a': ['Timestamp', 'Timestamp'], - 'b': ['Timestamp', 'Timestamp'], - 'c': ['Timedelta', 'Timedelta'], - 'd': ['Period', 'Period']}) + df = pd.DataFrame( + { + "a": [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ], + "c": [pd.Timedelta("1 days"), pd.Timedelta("2 days")], + "d": [ + pd.Period("2011-01-01", freq="M"), + pd.Period("2011-01-02", freq="M"), + ], + } + ) + + result = df.applymap(lambda x: "{0}".format(x.__class__.__name__)) + expected = pd.DataFrame( + { + "a": ["Timestamp", "Timestamp"], + "b": ["Timestamp", "Timestamp"], + "c": ["Timedelta", "Timedelta"], + "d": ["Period", "Period"], + } + ) tm.assert_frame_equal(result, expected) def test_frame_apply_dont_convert_datetime64(self): from pandas.tseries.offsets import BDay - df = DataFrame({'x1': [datetime(1996, 1, 1)]}) + + df = DataFrame({"x1": [datetime(1996, 1, 1)]}) df = df.applymap(lambda x: x + BDay()) df = df.applymap(lambda x: x + BDay()) - assert df.x1.dtype == 'M8[ns]' + assert df.x1.dtype == "M8[ns]" def test_apply_non_numpy_dtype(self): # GH 12244 - df = DataFrame({'dt': pd.date_range( - "2015-01-01", periods=3, tz='Europe/Brussels')}) + df = DataFrame( + {"dt": pd.date_range("2015-01-01", periods=3, tz="Europe/Brussels")} + ) result = df.apply(lambda x: x) assert_frame_equal(result, df) - result = df.apply(lambda x: x + pd.Timedelta('1day')) - expected = DataFrame({'dt': pd.date_range( - "2015-01-02", periods=3, tz='Europe/Brussels')}) + result = df.apply(lambda x: x + pd.Timedelta("1day")) + expected = DataFrame( + {"dt": pd.date_range("2015-01-02", periods=3, tz="Europe/Brussels")} + ) assert_frame_equal(result, expected) - df = DataFrame({'dt': ['a', 'b', 'c', 'a']}, dtype='category') + df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category") result = df.apply(lambda x: x) assert_frame_equal(result, df) def test_apply_dup_names_multi_agg(self): # GH 21063 - df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a']) - expected = pd.DataFrame([[0, 1]], columns=['a', 'a'], index=['min']) - result = df.agg(['min']) + df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) + expected = pd.DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -594,80 +700,93 @@ def test_infer_row_shape(self): def test_with_dictlike_columns(self): # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1) - expected = Series([{'s': 3} for t in df.itertuples()]) + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) + expected = Series([{"s": 3} for t in df.itertuples()]) assert_series_equal(result, expected) - df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), - pd.Timestamp('2017-05-02 00:00:00')] - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1) + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = 
df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1) assert_series_equal(result, expected) # compose a series - result = (df['a'] + df['b']).apply(lambda x: {'s': x}) - expected = Series([{'s': 3}, {'s': 3}]) + result = (df["a"] + df["b"]).apply(lambda x: {"s": x}) + expected = Series([{"s": 3}, {"s": 3}]) assert_series_equal(result, expected) # GH 18775 df = DataFrame() df["author"] = ["X", "Y", "Z"] df["publisher"] = ["BBC", "NBC", "N24"] - df["date"] = pd.to_datetime(['17-10-2010 07:15:30', - '13-05-2011 08:20:35', - '15-01-2013 09:09:09']) + df["date"] = pd.to_datetime( + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ) result = df.apply(lambda x: {}, axis=1) expected = Series([{}, {}, {}]) assert_series_equal(result, expected) def test_with_dictlike_columns_with_infer(self): # GH 17602 - df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='expand') - expected = DataFrame({'s': [3, 3]}) + df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) + expected = DataFrame({"s": [3, 3]}) assert_frame_equal(result, expected) - df['tm'] = [pd.Timestamp('2017-05-01 00:00:00'), - pd.Timestamp('2017-05-02 00:00:00')] - result = df.apply(lambda x: {'s': x['a'] + x['b']}, - axis=1, result_type='expand') + df["tm"] = [ + pd.Timestamp("2017-05-01 00:00:00"), + pd.Timestamp("2017-05-02 00:00:00"), + ] + result = df.apply( + lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand" + ) assert_frame_equal(result, expected) def test_with_listlike_columns(self): # GH 17348 - df = DataFrame({'a': Series(np.random.randn(4)), - 'b': ['a', 'list', 'of', 'words'], - 'ts': date_range('2016-10-01', periods=4, freq='H')}) - - result = df[['a', 'b']].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[['a', 'b']].itertuples()]) + df = DataFrame( + { + "a": Series(np.random.randn(4)), + "b": ["a", "list", "of", "words"], + "ts": date_range("2016-10-01", periods=4, freq="H"), + } + ) + + result = df[["a", "b"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "b"]].itertuples()]) assert_series_equal(result, expected) - result = df[['a', 'ts']].apply(tuple, axis=1) - expected = Series([t[1:] for t in df[['a', 'ts']].itertuples()]) + result = df[["a", "ts"]].apply(tuple, axis=1) + expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()]) assert_series_equal(result, expected) # GH 18919 - df = DataFrame({'x': Series([['a', 'b'], ['q']]), - 'y': Series([['z'], ['q', 't']])}) - df.index = MultiIndex.from_tuples([('i0', 'j0'), ('i1', 'j1')]) + df = DataFrame( + {"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])} + ) + df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")]) - result = df.apply( - lambda row: [el for el in row['x'] if el in row['y']], - axis=1) - expected = Series([[], ['q']], index=df.index) + result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1) + expected = Series([[], ["q"]], index=df.index) assert_series_equal(result, expected) def test_infer_output_shape_columns(self): # GH 18573 - df = DataFrame({'number': [1., 2.], - 'string': ['foo', 'bar'], - 'datetime': [pd.Timestamp('2017-11-29 03:30:00'), - pd.Timestamp('2017-11-29 03:45:00')]}) + df = DataFrame( + { + "number": [1.0, 2.0], + "string": ["foo", "bar"], + "datetime": [ + pd.Timestamp("2017-11-29 03:30:00"), + pd.Timestamp("2017-11-29 03:45:00"), + ], + } + ) result = 
df.apply(lambda row: (row.number, row.string), axis=1) expected = Series([(t.number, t.string) for t in df.itertuples()]) assert_series_equal(result, expected) @@ -675,7 +794,7 @@ def test_infer_output_shape_columns(self): def test_infer_output_shape_listlike_columns(self): # GH 16353 - df = DataFrame(np.random.randn(6, 3), columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(6, 3), columns=["A", "B", "C"]) result = df.apply(lambda x: [1, 2, 3], axis=1) expected = Series([[1, 2, 3] for t in df.itertuples()]) @@ -686,26 +805,30 @@ def test_infer_output_shape_listlike_columns(self): assert_series_equal(result, expected) # GH 17970 - df = DataFrame({"a": [1, 2, 3]}, index=list('abc')) + df = DataFrame({"a": [1, 2, 3]}, index=list("abc")) result = df.apply(lambda row: np.ones(1), axis=1) - expected = Series([np.ones(1) for t in df.itertuples()], - index=df.index) + expected = Series([np.ones(1) for t in df.itertuples()], index=df.index) assert_series_equal(result, expected) result = df.apply(lambda row: np.ones(2), axis=1) - expected = Series([np.ones(2) for t in df.itertuples()], - index=df.index) + expected = Series([np.ones(2) for t in df.itertuples()], index=df.index) assert_series_equal(result, expected) # GH 17892 - df = pd.DataFrame({'a': [pd.Timestamp('2010-02-01'), - pd.Timestamp('2010-02-04'), - pd.Timestamp('2010-02-05'), - pd.Timestamp('2010-02-06')], - 'b': [9, 5, 4, 3], - 'c': [5, 3, 4, 2], - 'd': [1, 2, 3, 4]}) + df = pd.DataFrame( + { + "a": [ + pd.Timestamp("2010-02-01"), + pd.Timestamp("2010-02-04"), + pd.Timestamp("2010-02-05"), + pd.Timestamp("2010-02-06"), + ], + "b": [9, 5, 4, 3], + "c": [5, 3, 4, 2], + "d": [1, 2, 3, 4], + } + ) def fun(x): return (1, 2) @@ -717,7 +840,7 @@ def fun(x): def test_consistent_coerce_for_shapes(self): # we want column names to NOT be propagated # just because the shape matches the input shape - df = DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) result = df.apply(lambda x: [1, 2, 3], axis=1) expected = Series([[1, 2, 3] for t in df.itertuples()]) @@ -731,17 +854,16 @@ def test_consistent_names(self, int_frame_const_col): # if a Series is returned, we should use the resulting index names df = int_frame_const_col - result = df.apply(lambda x: Series([1, 2, 3], - index=['test', 'other', 'cols']), - axis=1) - expected = int_frame_const_col.rename(columns={'A': 'test', - 'B': 'other', - 'C': 'cols'}) + result = df.apply( + lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1 + ) + expected = int_frame_const_col.rename( + columns={"A": "test", "B": "other", "C": "cols"} + ) assert_frame_equal(result, expected) - result = df.apply(lambda x: Series([1, 2], index=['test', 'other']), - axis=1) - expected = expected[['test', 'other']] + result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1) + expected = expected[["test", "other"]] assert_frame_equal(result, expected) def test_result_type(self, int_frame_const_col): @@ -749,24 +871,25 @@ def test_result_type(self, int_frame_const_col): # path we take in the code df = int_frame_const_col - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='expand') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand") expected = df.copy() expected.columns = [0, 1, 2] assert_frame_equal(result, expected) - result = df.apply(lambda x: [1, 2], axis=1, result_type='expand') - expected = df[['A', 'B']].copy() + result = df.apply(lambda x: [1, 2], axis=1, result_type="expand") + 
expected = df[["A", "B"]].copy() expected.columns = [0, 1] assert_frame_equal(result, expected) # broadcast result - result = df.apply(lambda x: [1, 2, 3], axis=1, result_type='broadcast') + result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast") expected = df.copy() assert_frame_equal(result, expected) - columns = ['other', 'col', 'names'] - result = df.apply(lambda x: Series([1, 2, 3], index=columns), - axis=1, result_type='broadcast') + columns = ["other", "col", "names"] + result = df.apply( + lambda x: Series([1, 2, 3], index=columns), axis=1, result_type="broadcast" + ) expected = df.copy() assert_frame_equal(result, expected) @@ -776,13 +899,13 @@ def test_result_type(self, int_frame_const_col): assert_frame_equal(result, expected) # series result with other index - columns = ['other', 'col', 'names'] + columns = ["other", "col", "names"] result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1) expected = df.copy() expected.columns = columns assert_frame_equal(result, expected) - @pytest.mark.parametrize("result_type", ['foo', 1]) + @pytest.mark.parametrize("result_type", ["foo", 1]) def test_result_type_error(self, result_type, int_frame_const_col): # allowed result_type df = int_frame_const_col @@ -792,10 +915,9 @@ def test_result_type_error(self, result_type, int_frame_const_col): @pytest.mark.parametrize( "box", - [lambda x: list(x), - lambda x: tuple(x), - lambda x: np.array(x, dtype='int64')], - ids=['list', 'tuple', 'array']) + [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")], + ids=["list", "tuple", "array"], + ) def test_consistency_for_boxed(self, box, int_frame_const_col): # passing an array or list should not affect the output shape df = int_frame_const_col @@ -804,9 +926,8 @@ def test_consistency_for_boxed(self, box, int_frame_const_col): expected = Series([box([1, 2]) for t in df.itertuples()]) assert_series_equal(result, expected) - result = df.apply(lambda x: box([1, 2]), axis=1, result_type='expand') - expected = int_frame_const_col[['A', 'B']].rename(columns={'A': 0, - 'B': 1}) + result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand") + expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1}) assert_frame_equal(result, expected) @@ -830,11 +951,10 @@ def zip_frames(frames, axis=1): class TestDataFrameAggregate: - def test_agg_transform(self, axis, float_frame): - other_axis = 1 if axis in {0, 'index'} else 0 + other_axis = 1 if axis in {0, "index"} else 0 - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): f_abs = np.abs(float_frame) f_sqrt = np.sqrt(float_frame) @@ -853,12 +973,14 @@ def test_agg_transform(self, axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, 'index'}: + if axis in {0, "index"}: expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ['sqrt']]) + [float_frame.columns, ["sqrt"]] + ) else: expected.index = pd.MultiIndex.from_product( - [float_frame.index, ['sqrt']]) + [float_frame.index, ["sqrt"]] + ) assert_frame_equal(result, expected) result = float_frame.transform([np.sqrt], axis=axis) @@ -869,39 +991,39 @@ def test_agg_transform(self, axis, float_frame): # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ['absolute', 
'sqrt']]) + [float_frame.columns, ["absolute", "sqrt"]] + ) else: expected.index = pd.MultiIndex.from_product( - [float_frame.index, ['absolute', 'sqrt']]) + [float_frame.index, ["absolute", "sqrt"]] + ) assert_frame_equal(result, expected) - result = float_frame.transform([np.abs, 'sqrt'], axis=axis) + result = float_frame.transform([np.abs, "sqrt"], axis=axis) assert_frame_equal(result, expected) def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg with pytest.raises(ValueError): - float_frame.transform(['max', 'min'], axis=axis) + float_frame.transform(["max", "min"], axis=axis) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - float_frame.agg(['max', 'sqrt'], axis=axis) + with np.errstate(all="ignore"): + float_frame.agg(["max", "sqrt"], axis=axis) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - float_frame.transform(['max', 'sqrt'], axis=axis) + with np.errstate(all="ignore"): + float_frame.transform(["max", "sqrt"], axis=axis) - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) def f(): - with np.errstate(all='ignore'): - df.agg({'A': ['abs', 'sum'], 'B': ['mean', 'max']}, axis=axis) + with np.errstate(all="ignore"): + df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - @pytest.mark.parametrize('method', [ - 'abs', 'shift', 'pct_change', 'cumsum', 'rank', - ]) + @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) def test_transform_method_name(self, method): # GH 19760 df = pd.DataFrame({"A": [-1, 2]}) @@ -911,129 +1033,175 @@ def test_transform_method_name(self, method): def test_demo(self): # demonstration tests - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) - result = df.agg(['min', 'max']) - expected = DataFrame({'A': [0, 4], 'B': [5, 5]}, - columns=['A', 'B'], - index=['min', 'max']) + result = df.agg(["min", "max"]) + expected = DataFrame( + {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] + ) tm.assert_frame_equal(result, expected) - result = df.agg({'A': ['min', 'max'], 'B': ['sum', 'max']}) - expected = DataFrame({'A': [4.0, 0.0, np.nan], - 'B': [5.0, np.nan, 25.0]}, - columns=['A', 'B'], - index=['max', 'min', 'sum']) + result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) + expected = DataFrame( + {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]}, + columns=["A", "B"], + index=["max", "min", "sum"], + ) tm.assert_frame_equal(result.reindex_like(expected), expected) def test_agg_multiple_mixed_no_warning(self): # GH 20909 - mdf = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) - expected = pd.DataFrame({"A": [1, 6], 'B': [1.0, 6.0], - "C": ['bar', 'foobarbaz'], - "D": [pd.Timestamp('2013-01-01'), pd.NaT]}, - index=['min', 'sum']) + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) + expected = pd.DataFrame( + { + "A": [1, 6], + "B": [1.0, 6.0], + "C": ["bar", "foobarbaz"], + "D": [pd.Timestamp("2013-01-01"), pd.NaT], + }, + index=["min", "sum"], + ) # sorted index with tm.assert_produces_warning(None): - result = mdf.agg(['min', 'sum']) + result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) with tm.assert_produces_warning(None): - result = mdf[['D', 'C', 'B', 'A']].agg(['sum', 'min']) + result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # For 
backwards compatibility, the result's index is # still sorted by function name, so it's ['min', 'sum'] # not ['sum', 'min']. - expected = expected[['D', 'C', 'B', 'A']] + expected = expected[["D", "C", "B", "A"]] tm.assert_frame_equal(result, expected) def test_agg_dict_nested_renaming_depr(self): - df = pd.DataFrame({'A': range(5), 'B': 5}) + df = pd.DataFrame({"A": range(5), "B": 5}) # nested renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.agg({'A': {'foo': 'min'}, - 'B': {'bar': 'max'}}) + df.agg({"A": {"foo": "min"}, "B": {"bar": "max"}}) def test_agg_reduce(self, axis, float_frame): - other_axis = 1 if axis in {0, 'index'} else 0 + other_axis = 1 if axis in {0, "index"} else 0 name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values() # all reducers - expected = pd.concat([float_frame.mean(axis=axis), - float_frame.max(axis=axis), - float_frame.sum(axis=axis), - ], axis=1) - expected.columns = ['mean', 'max', 'sum'] - expected = expected.T if axis in {0, 'index'} else expected - - result = float_frame.agg(['mean', 'max', 'sum'], axis=axis) + expected = pd.concat( + [ + float_frame.mean(axis=axis), + float_frame.max(axis=axis), + float_frame.sum(axis=axis), + ], + axis=1, + ) + expected.columns = ["mean", "max", "sum"] + expected = expected.T if axis in {0, "index"} else expected + + result = float_frame.agg(["mean", "max", "sum"], axis=axis) assert_frame_equal(result, expected) # dict input with scalars - func = OrderedDict([(name1, 'mean'), (name2, 'sum')]) + func = OrderedDict([(name1, "mean"), (name2, "sum")]) result = float_frame.agg(func, axis=axis) - expected = Series([float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name2].sum()], - index=[name1, name2]) + expected = Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name2].sum(), + ], + index=[name1, name2], + ) assert_series_equal(result, expected) # dict input with lists - func = OrderedDict([(name1, ['mean']), (name2, ['sum'])]) + func = OrderedDict([(name1, ["mean"]), (name2, ["sum"])]) result = float_frame.agg(func, axis=axis) - expected = DataFrame({ - name1: Series([float_frame.loc(other_axis)[name1].mean()], - index=['mean']), - name2: Series([float_frame.loc(other_axis)[name2].sum()], - index=['sum'])}) - expected = expected.T if axis in {1, 'columns'} else expected + expected = DataFrame( + { + name1: Series( + [float_frame.loc(other_axis)[name1].mean()], index=["mean"] + ), + name2: Series( + [float_frame.loc(other_axis)[name2].sum()], index=["sum"] + ), + } + ) + expected = expected.T if axis in {1, "columns"} else expected assert_frame_equal(result, expected) # dict input with lists with multiple - func = OrderedDict([(name1, ['mean', 'sum']), (name2, ['sum', 'max'])]) + func = OrderedDict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])]) result = float_frame.agg(func, axis=axis) - expected = DataFrame(OrderedDict([ - (name1, Series([float_frame.loc(other_axis)[name1].mean(), - float_frame.loc(other_axis)[name1].sum()], - index=['mean', 'sum'])), - (name2, Series([float_frame.loc(other_axis)[name2].sum(), - float_frame.loc(other_axis)[name2].max()], - index=['sum', 'max'])), - ])) - expected = expected.T if axis in {1, 'columns'} else expected + expected = DataFrame( + OrderedDict( + [ + ( + name1, + Series( + [ + float_frame.loc(other_axis)[name1].mean(), + float_frame.loc(other_axis)[name1].sum(), + ], + index=["mean", "sum"], + ), + ), + ( + name2, + Series( + [ + 
float_frame.loc(other_axis)[name2].sum(), + float_frame.loc(other_axis)[name2].max(), + ], + index=["sum", "max"], + ), + ), + ] + ) + ) + expected = expected.T if axis in {1, "columns"} else expected assert_frame_equal(result, expected) def test_nuiscance_columns(self): # GH 15015 - df = DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) - - result = df.agg('min') - expected = Series([1, 1., 'bar', pd.Timestamp('20130101')], - index=df.columns) + df = DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) + + result = df.agg("min") + expected = Series([1, 1.0, "bar", pd.Timestamp("20130101")], index=df.columns) assert_series_equal(result, expected) - result = df.agg(['min']) - expected = DataFrame([[1, 1., 'bar', pd.Timestamp('20130101')]], - index=['min'], columns=df.columns) + result = df.agg(["min"]) + expected = DataFrame( + [[1, 1.0, "bar", pd.Timestamp("20130101")]], + index=["min"], + columns=df.columns, + ) assert_frame_equal(result, expected) - result = df.agg('sum') - expected = Series([6, 6., 'foobarbaz'], - index=['A', 'B', 'C']) + result = df.agg("sum") + expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) assert_series_equal(result, expected) - result = df.agg(['sum']) - expected = DataFrame([[6, 6., 'foobarbaz']], - index=['sum'], columns=['A', 'B', 'C']) + result = df.agg(["sum"]) + expected = DataFrame( + [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] + ) assert_frame_equal(result, expected) def test_non_callable_aggregates(self): @@ -1041,74 +1209,85 @@ def test_non_callable_aggregates(self): # GH 16405 # 'size' is a property of frame/series # validate that this is working - df = DataFrame({'A': [None, 2, 3], - 'B': [1.0, np.nan, 3.0], - 'C': ['foo', None, 'bar']}) + df = DataFrame( + {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]} + ) # Function aggregate - result = df.agg({'A': 'count'}) - expected = Series({'A': 2}) + result = df.agg({"A": "count"}) + expected = Series({"A": 2}) assert_series_equal(result, expected) # Non-function aggregate - result = df.agg({'A': 'size'}) - expected = Series({'A': 3}) + result = df.agg({"A": "size"}) + expected = Series({"A": 3}) assert_series_equal(result, expected) # Mix function and non-function aggs - result1 = df.agg(['count', 'size']) - result2 = df.agg({'A': ['count', 'size'], - 'B': ['count', 'size'], - 'C': ['count', 'size']}) - expected = pd.DataFrame({'A': {'count': 2, 'size': 3}, - 'B': {'count': 2, 'size': 3}, - 'C': {'count': 2, 'size': 3}}) + result1 = df.agg(["count", "size"]) + result2 = df.agg( + {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]} + ) + expected = pd.DataFrame( + { + "A": {"count": 2, "size": 3}, + "B": {"count": 2, "size": 3}, + "C": {"count": 2, "size": 3}, + } + ) assert_frame_equal(result1, result2, check_like=True) assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() - result = df.agg('count') + result = df.agg("count") expected = df.count() assert_series_equal(result, expected) # Just a string attribute arg same as calling df.arg - result = df.agg('size') + result = df.agg("size") expected = df.size assert result == expected - @pytest.mark.parametrize("df, func, expected", chain( - _get_cython_table_params( - DataFrame(), [ - ('sum', Series()), - ('max', Series()), - ('min', Series()), - ('all', 
Series(dtype=bool)), - ('any', Series(dtype=bool)), - ('mean', Series()), - ('prod', Series()), - ('std', Series()), - ('var', Series()), - ('median', Series()), - ]), - _get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), [ - ('sum', Series([1., 3])), - ('max', Series([1., 2])), - ('min', Series([1., 1])), - ('all', Series([True, True])), - ('any', Series([True, True])), - ('mean', Series([1, 1.5])), - ('prod', Series([1., 2])), - ('std', Series([np.nan, 0.707107])), - ('var', Series([np.nan, 0.5])), - ('median', Series([1, 1.5])), - ]), - )) + @pytest.mark.parametrize( + "df, func, expected", + chain( + _get_cython_table_params( + DataFrame(), + [ + ("sum", Series()), + ("max", Series()), + ("min", Series()), + ("all", Series(dtype=bool)), + ("any", Series(dtype=bool)), + ("mean", Series()), + ("prod", Series()), + ("std", Series()), + ("var", Series()), + ("median", Series()), + ], + ), + _get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("sum", Series([1.0, 3])), + ("max", Series([1.0, 2])), + ("min", Series([1.0, 1])), + ("all", Series([True, True])), + ("any", Series([True, True])), + ("mean", Series([1, 1.5])), + ("prod", Series([1.0, 2])), + ("std", Series([np.nan, 0.707107])), + ("var", Series([np.nan, 0.5])), + ("median", Series([1, 1.5])), + ], + ), + ), + ) def test_agg_cython_table(self, df, func, expected, axis): # GH 21224 # test reducing functions in @@ -1116,18 +1295,21 @@ def test_agg_cython_table(self, df, func, expected, axis): result = df.agg(func, axis=axis) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("df, func, expected", chain( - _get_cython_table_params( - DataFrame(), [ - ('cumprod', DataFrame()), - ('cumsum', DataFrame()), - ]), - _get_cython_table_params( - DataFrame([[np.nan, 1], [1, 2]]), [ - ('cumprod', DataFrame([[np.nan, 1], [1., 2.]])), - ('cumsum', DataFrame([[np.nan, 1], [1., 3.]])), - ]), - )) + @pytest.mark.parametrize( + "df, func, expected", + chain( + _get_cython_table_params( + DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())] + ), + _get_cython_table_params( + DataFrame([[np.nan, 1], [1, 2]]), + [ + ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), + ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ], + ), + ), + ) def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in @@ -1135,10 +1317,11 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("df, func, expected", _get_cython_table_params( - DataFrame([['a', 'b'], ['b', 'a']]), [ - ['cumprod', TypeError], - ]), + @pytest.mark.parametrize( + "df, func, expected", + _get_cython_table_params( + DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]] + ), ) def test_agg_cython_table_raises(self, df, func, expected, axis): # GH 21224 diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index bcbea9d7a2236..7c022106c9104 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -17,22 +17,25 @@ class TestFrameComparisons: # Specifically _not_ flex-comparisons def test_comparison_invalid(self): - def check(df, df2): for (x, y) in [(df, df2), (df2, df)]: # we expect the result to match Series comparisons for # == and !=, inequalities should raise result = x == y - expected = pd.DataFrame({col: x[col] == y[col] - for col in x.columns}, - index=x.index, columns=x.columns) + expected 
= pd.DataFrame( + {col: x[col] == y[col] for col in x.columns}, + index=x.index, + columns=x.columns, + ) tm.assert_frame_equal(result, expected) result = x != y - expected = pd.DataFrame({col: x[col] != y[col] - for col in x.columns}, - index=x.index, columns=x.columns) + expected = pd.DataFrame( + {col: x[col] != y[col] for col in x.columns}, + index=x.index, + columns=x.columns, + ) tm.assert_frame_equal(result, expected) with pytest.raises(TypeError): @@ -46,56 +49,62 @@ def check(df, df2): # GH4968 # invalid date/int comparisons - df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=['a']) - df['dates'] = pd.date_range('20010101', periods=len(df)) + df = pd.DataFrame(np.random.randint(10, size=(10, 1)), columns=["a"]) + df["dates"] = pd.date_range("20010101", periods=len(df)) df2 = df.copy() - df2['dates'] = df['a'] + df2["dates"] = df["a"] check(df, df2) - df = pd.DataFrame(np.random.randint(10, size=(10, 2)), - columns=['a', 'b']) - df2 = pd.DataFrame({'a': pd.date_range('20010101', periods=len(df)), - 'b': pd.date_range('20100101', periods=len(df))}) + df = pd.DataFrame(np.random.randint(10, size=(10, 2)), columns=["a", "b"]) + df2 = pd.DataFrame( + { + "a": pd.date_range("20010101", periods=len(df)), + "b": pd.date_range("20100101", periods=len(df)), + } + ) check(df, df2) def test_timestamp_compare(self): # make sure we can compare Timestamps on the right AND left hand side # GH#4982 - df = pd. DataFrame({'dates1': pd.date_range('20010101', periods=10), - 'dates2': pd.date_range('20010102', periods=10), - 'intcol': np.random.randint(1000000000, size=10), - 'floatcol': np.random.randn(10), - 'stringcol': list(tm.rands(10))}) - df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT - ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', - 'ne': 'ne'} + df = pd.DataFrame( + { + "dates1": pd.date_range("20010101", periods=10), + "dates2": pd.date_range("20010102", periods=10), + "intcol": np.random.randint(1000000000, size=10), + "floatcol": np.random.randn(10), + "stringcol": list(tm.rands(10)), + } + ) + df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT + ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"} for left, right in ops.items(): left_f = getattr(operator, left) right_f = getattr(operator, right) # no nats - if left in ['eq', 'ne']: - expected = left_f(df, pd.Timestamp('20010109')) - result = right_f(pd.Timestamp('20010109'), df) + if left in ["eq", "ne"]: + expected = left_f(df, pd.Timestamp("20010109")) + result = right_f(pd.Timestamp("20010109"), df) tm.assert_frame_equal(result, expected) else: with pytest.raises(TypeError): - left_f(df, pd.Timestamp('20010109')) + left_f(df, pd.Timestamp("20010109")) with pytest.raises(TypeError): - right_f(pd.Timestamp('20010109'), df) + right_f(pd.Timestamp("20010109"), df) # nats - expected = left_f(df, pd.Timestamp('nat')) - result = right_f(pd.Timestamp('nat'), df) + expected = left_f(df, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), df) tm.assert_frame_equal(result, expected) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError # (this appears to be fixed before GH#22163, not sure when) - df = pd.DataFrame([['1989-08-01', 1], ['1989-08-01', 2]]) - other = pd.DataFrame([['a', 'b'], ['c', 'd']]) + df = pd.DataFrame([["1989-08-01", 1], ["1989-08-01", 2]]) + other = pd.DataFrame([["a", "b"], ["c", "d"]]) result = df == other assert not result.any().any() @@ -109,9 +118,7 @@ def 
test_df_boolean_comparison_error(self): # len(df.columns) is supported as of GH#22800 df = pd.DataFrame(np.arange(6).reshape((3, 2))) - expected = pd.DataFrame([[False, False], - [True, False], - [False, False]]) + expected = pd.DataFrame([[False, False], [True, False], [False, False]]) result = df == (2, 2) tm.assert_frame_equal(result, expected) @@ -120,8 +127,9 @@ def test_df_boolean_comparison_error(self): tm.assert_frame_equal(result, expected) def test_df_float_none_comparison(self): - df = pd.DataFrame(np.random.randn(8, 3), index=range(8), - columns=['A', 'B', 'C']) + df = pd.DataFrame( + np.random.randn(8, 3), index=range(8), columns=["A", "B", "C"] + ) result = df.__eq__(None) assert not result.any().any() @@ -156,7 +164,7 @@ def _check_unaligned_frame(meth, op, df, other): # DataFrame assert df.eq(df).values.all() assert not df.ne(df).values.any() - for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: + for op in ["eq", "ne", "gt", "lt", "ge", "le"]: f = getattr(df, op) o = getattr(operator, op) # No NAs @@ -230,23 +238,23 @@ def _test_seq(df, idx_ser, col_ser): # complex arr = np.array([np.nan, 1, 6, np.nan]) arr2 = np.array([2j, np.nan, 7, None]) - df = pd.DataFrame({'a': arr}) - df2 = pd.DataFrame({'a': arr2}) + df = pd.DataFrame({"a": arr}) + df2 = pd.DataFrame({"a": arr2}) rs = df.gt(df2) assert not rs.values.any() rs = df.ne(df2) assert rs.values.all() arr3 = np.array([2j, np.nan, None]) - df3 = pd.DataFrame({'a': arr3}) + df3 = pd.DataFrame({"a": arr3}) rs = df3.gt(2j) assert not rs.values.any() # corner, dtype=object - df1 = pd.DataFrame({'col': ['foo', np.nan, 'bar']}) - df2 = pd.DataFrame({'col': ['foo', datetime.now(), 'bar']}) + df1 = pd.DataFrame({"col": ["foo", np.nan, "bar"]}) + df2 = pd.DataFrame({"col": ["foo", datetime.now(), "bar"]}) result = df1.ne(df2) - exp = pd.DataFrame({'col': [False, True, False]}) + exp = pd.DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) def test_flex_comparison_nat(self): @@ -267,19 +275,19 @@ def test_flex_comparison_nat(self): result = df.ne(pd.NaT) assert result.iloc[0, 0].item() is True - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types(self, opname): # GH 15077, non-empty DataFrame - df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 result = getattr(df, opname)(const).dtypes.value_counts() tm.assert_series_equal(result, pd.Series([2], index=[np.dtype(bool)])) - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_df_flex_cmp_constant_return_types_empty(self, opname): # GH 15077 empty DataFrame - df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) + df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}) const = 2 empty = df.iloc[:0] @@ -290,44 +298,48 @@ def test_df_flex_cmp_constant_return_types_empty(self, opname): # ------------------------------------------------------------------- # Arithmetic -class TestFrameFlexArithmetic: +class TestFrameFlexArithmetic: def test_df_add_td64_columnwise(self): # GH 22534 Check that column-wise addition broadcasts correctly - dti = pd.date_range('2016-01-01', periods=10) - tdi = pd.timedelta_range('1', periods=10) + dti = pd.date_range("2016-01-01", periods=10) + tdi = pd.timedelta_range("1", periods=10) tser = pd.Series(tdi) df = pd.DataFrame({0: 
dti, 1: tdi}) result = df.add(tser, axis=0) - expected = pd.DataFrame({0: dti + tdi, - 1: tdi + tdi}) + expected = pd.DataFrame({0: dti + tdi, 1: tdi + tdi}) tm.assert_frame_equal(result, expected) def test_df_add_flex_filled_mixed_dtypes(self): # GH 19611 - dti = pd.date_range('2016-01-01', periods=3) - ser = pd.Series(['1 Day', 'NaT', '2 Days'], dtype='timedelta64[ns]') - df = pd.DataFrame({'A': dti, 'B': ser}) - other = pd.DataFrame({'A': ser, 'B': ser}) + dti = pd.date_range("2016-01-01", periods=3) + ser = pd.Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]") + df = pd.DataFrame({"A": dti, "B": ser}) + other = pd.DataFrame({"A": ser, "B": ser}) fill = pd.Timedelta(days=1).to_timedelta64() result = df.add(other, fill_value=fill) expected = pd.DataFrame( - {'A': pd.Series(['2016-01-02', '2016-01-03', '2016-01-05'], - dtype='datetime64[ns]'), - 'B': ser * 2}) + { + "A": pd.Series( + ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]" + ), + "B": ser * 2, + } + ) tm.assert_frame_equal(result, expected) - def test_arith_flex_frame(self, all_arithmetic_operators, float_frame, - mixed_float_frame): + def test_arith_flex_frame( + self, all_arithmetic_operators, float_frame, mixed_float_frame + ): # one instance of parametrized fixture op = all_arithmetic_operators def f(x, y): # r-versions not in operator-stdlib; get op without "r" and invert - if op.startswith('__r'): - return getattr(operator, op.replace('__r', '__'))(y, x) + if op.startswith("__r"): + return getattr(operator, op.replace("__r", "__"))(y, x) return getattr(operator, op)(x, y) result = getattr(float_frame, op)(2 * float_frame) @@ -340,9 +352,10 @@ def f(x, y): tm.assert_frame_equal(result, expected) _check_mixed_float(result, dtype=dict(C=None)) - @pytest.mark.parametrize('op', ['__add__', '__sub__', '__mul__']) - def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, - mixed_float_frame): + @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"]) + def test_arith_flex_frame_mixed( + self, op, int_frame, mixed_int_frame, mixed_float_frame + ): f = getattr(operator, op) # vs mix int @@ -351,9 +364,9 @@ def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, # no overflow in the uint dtype = None - if op in ['__sub__']: - dtype = dict(B='uint64', C=None) - elif op in ['__add__', '__mul__']: + if op in ["__sub__"]: + dtype = dict(B="uint64", C=None) + elif op in ["__add__", "__mul__"]: dtype = dict(C=None) tm.assert_frame_equal(result, expected) _check_mixed_int(result, dtype=dtype) @@ -369,8 +382,7 @@ def test_arith_flex_frame_mixed(self, op, int_frame, mixed_int_frame, expected = f(int_frame, 2 * int_frame) tm.assert_frame_equal(result, expected) - def test_arith_flex_frame_raise(self, all_arithmetic_operators, - float_frame): + def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame): # one instance of parametrized fixture op = all_arithmetic_operators @@ -393,19 +405,19 @@ def test_arith_flex_frame_corner(self, float_frame): result = float_frame[:0].add(float_frame) tm.assert_frame_equal(result, float_frame * np.nan) - with pytest.raises(NotImplementedError, match='fill_value'): + with pytest.raises(NotImplementedError, match="fill_value"): float_frame.add(float_frame.iloc[0], fill_value=3) - with pytest.raises(NotImplementedError, match='fill_value'): - float_frame.add(float_frame.iloc[0], axis='index', fill_value=3) + with pytest.raises(NotImplementedError, match="fill_value"): + float_frame.add(float_frame.iloc[0], axis="index", 
fill_value=3) def test_arith_flex_series(self, simple_frame): df = simple_frame - row = df.xs('a') - col = df['two'] + row = df.xs("a") + col = df["two"] # after arithmetic refactor, add truediv here - ops = ['add', 'sub', 'mul', 'mod'] + ops = ["add", "sub", "mul", "mod"] for op in ops: f = getattr(df, op) op = getattr(operator, op) @@ -420,46 +432,47 @@ def test_arith_flex_series(self, simple_frame): tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T) # broadcasting issue in GH 7325 - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64') + df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="int64") expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) - result = df.div(df[0], axis='index') + result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) - df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64') + df = pd.DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype="float64") expected = pd.DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) - result = df.div(df[0], axis='index') + result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) def test_arith_flex_zero_len_raises(self): # GH 19522 passing fill_value to frame flex arith methods should # raise even in the zero-length special cases ser_len0 = pd.Series([]) - df_len0 = pd.DataFrame(columns=['A', 'B']) - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df_len0 = pd.DataFrame(columns=["A", "B"]) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - with pytest.raises(NotImplementedError, match='fill_value'): - df.add(ser_len0, fill_value='E') + with pytest.raises(NotImplementedError, match="fill_value"): + df.add(ser_len0, fill_value="E") - with pytest.raises(NotImplementedError, match='fill_value'): - df_len0.sub(df['A'], axis=None, fill_value=3) + with pytest.raises(NotImplementedError, match="fill_value"): + df_len0.sub(df["A"], axis=None, fill_value=3) class TestFrameArithmetic: def test_df_add_2d_array_rowlike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) - expected = pd.DataFrame([[2, 4], - [4, 6], - [6, 8]], - columns=df.columns, index=df.index, - # specify dtype explicitly to avoid failing - # on 32bit builds - dtype=arr.dtype) + expected = pd.DataFrame( + [[2, 4], [4, 6], [6, 8]], + columns=df.columns, + index=df.index, + # specify dtype explicitly to avoid failing + # on 32bit builds + dtype=arr.dtype, + ) result = df + rowlike tm.assert_frame_equal(result, expected) result = rowlike + df @@ -468,41 +481,43 @@ def test_df_add_2d_array_rowlike_broadcasts(self): def test_df_add_2d_array_collike_broadcasts(self): # GH#23000 arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) - expected = pd.DataFrame([[1, 2], - [5, 6], - [9, 10]], - columns=df.columns, index=df.index, - # specify dtype explicitly to avoid failing - # on 32bit builds - dtype=arr.dtype) + expected = pd.DataFrame( + [[1, 2], [5, 6], [9, 10]], + columns=df.columns, + index=df.index, + # specify dtype explicitly to avoid failing + # on 32bit builds + dtype=arr.dtype, + ) result = df + collike 
tm.assert_frame_equal(result, expected) result = collike + df tm.assert_frame_equal(result, expected) - def test_df_arith_2d_array_rowlike_broadcasts(self, - all_arithmetic_operators): + def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) rowlike = arr[[1], :] # shape --> (1, ncols) assert rowlike.shape == (1, df.shape[1]) - exvals = [getattr(df.loc['A'], opname)(rowlike.squeeze()), - getattr(df.loc['B'], opname)(rowlike.squeeze()), - getattr(df.loc['C'], opname)(rowlike.squeeze())] + exvals = [ + getattr(df.loc["A"], opname)(rowlike.squeeze()), + getattr(df.loc["B"], opname)(rowlike.squeeze()), + getattr(df.loc["C"], opname)(rowlike.squeeze()), + ] expected = pd.DataFrame(exvals, columns=df.columns, index=df.index) - if opname in ['__rmod__', '__rfloordiv__']: + if opname in ["__rmod__", "__rfloordiv__"]: # exvals will have dtypes [f8, i8, i8] so expected will be # all-f8, but the DataFrame operation will return mixed dtypes # use exvals[-1].dtype instead of "i8" for compat with 32-bit @@ -512,28 +527,28 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, result = getattr(df, opname)(rowlike) tm.assert_frame_equal(result, expected) - def test_df_arith_2d_array_collike_broadcasts(self, - all_arithmetic_operators): + def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators): # GH#23000 opname = all_arithmetic_operators arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr, columns=[True, False], index=['A', 'B', 'C']) + df = pd.DataFrame(arr, columns=[True, False], index=["A", "B", "C"]) collike = arr[:, [1]] # shape --> (nrows, 1) assert collike.shape == (df.shape[0], 1) - exvals = {True: getattr(df[True], opname)(collike.squeeze()), - False: getattr(df[False], opname)(collike.squeeze())} + exvals = { + True: getattr(df[True], opname)(collike.squeeze()), + False: getattr(df[False], opname)(collike.squeeze()), + } dtype = None - if opname in ['__rmod__', '__rfloordiv__']: + if opname in ["__rmod__", "__rfloordiv__"]: # Series ops may return mixed int/float dtypes in cases where # DataFrame op will return all-float. 
So we upcast `expected` dtype = np.common_type(*[x.values for x in exvals.values()]) - expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, - dtype=dtype) + expected = pd.DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype) result = getattr(df, opname)(collike) tm.assert_frame_equal(result, expected) @@ -547,24 +562,22 @@ def test_df_bool_mul_int(self): # On appveyor this comes back as np.int32 instead of np.int64, # so we check dtype.kind instead of just dtype kinds = result.dtypes.apply(lambda x: x.kind) - assert (kinds == 'i').all() + assert (kinds == "i").all() result = 1 * df kinds = result.dtypes.apply(lambda x: x.kind) - assert (kinds == 'i').all() + assert (kinds == "i").all() def test_arith_mixed(self): - left = pd.DataFrame({'A': ['a', 'b', 'c'], - 'B': [1, 2, 3]}) + left = pd.DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]}) result = left + left - expected = pd.DataFrame({'A': ['aa', 'bb', 'cc'], - 'B': [2, 4, 6]}) + expected = pd.DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]}) tm.assert_frame_equal(result, expected) def test_arith_getitem_commute(self): - df = pd.DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + df = pd.DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) def _test_op(df, op): result = op(df, 1) @@ -594,40 +607,38 @@ def _test_op(df, op): _test_op(df, lambda x, y: x / y) _test_op(df, lambda x, y: x ** y) - @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]), - range(1, 3), deque([1, 2])]) + @pytest.mark.parametrize( + "values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])] + ) def test_arith_alignment_non_pandas_object(self, values): # GH#17901 - df = pd.DataFrame({'A': [1, 1], 'B': [1, 1]}) - expected = pd.DataFrame({'A': [2, 2], 'B': [3, 3]}) + df = pd.DataFrame({"A": [1, 1], "B": [1, 1]}) + expected = pd.DataFrame({"A": [2, 2], "B": [3, 3]}) result = df + values tm.assert_frame_equal(result, expected) def test_arith_non_pandas_object(self): - df = pd.DataFrame(np.arange(1, 10, dtype='f8').reshape(3, 3), - columns=['one', 'two', 'three'], - index=['a', 'b', 'c']) - - val1 = df.xs('a').values - added = pd.DataFrame(df.values + val1, - index=df.index, columns=df.columns) + df = pd.DataFrame( + np.arange(1, 10, dtype="f8").reshape(3, 3), + columns=["one", "two", "three"], + index=["a", "b", "c"], + ) + + val1 = df.xs("a").values + added = pd.DataFrame(df.values + val1, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val1, added) - added = pd.DataFrame((df.values.T + val1).T, - index=df.index, columns=df.columns) + added = pd.DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val1, axis=0), added) - val2 = list(df['two']) + val2 = list(df["two"]) - added = pd.DataFrame(df.values + val2, - index=df.index, columns=df.columns) + added = pd.DataFrame(df.values + val2, index=df.index, columns=df.columns) tm.assert_frame_equal(df + val2, added) - added = pd.DataFrame((df.values.T + val2).T, index=df.index, - columns=df.columns) - tm.assert_frame_equal(df.add(val2, axis='index'), added) + added = pd.DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) + tm.assert_frame_equal(df.add(val2, axis="index"), added) val3 = np.random.rand(*df.shape) - added = pd.DataFrame(df.values + val3, - index=df.index, columns=df.columns) + added = pd.DataFrame(df.values + val3, index=df.index, columns=df.columns) tm.assert_frame_equal(df.add(val3), added) diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py index 
e7b9ff348bd57..9a7d806c79dc3 100644 --- a/pandas/tests/frame/test_asof.py +++ b/pandas/tests/frame/test_asof.py @@ -13,17 +13,16 @@ def date_range_frame(): Columns are ['A', 'B']. """ N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') - return DataFrame({'A': np.arange(N), 'B': np.arange(N)}, index=rng) + rng = date_range("1/1/1990", periods=N, freq="53s") + return DataFrame({"A": np.arange(N), "B": np.arange(N)}, index=rng) class TestFrameAsof: - def test_basic(self, date_range_frame): df = date_range_frame N = 50 - df.loc[15:30, 'A'] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = df.asof(dates) assert result.notna().all(1).all() @@ -41,23 +40,22 @@ def test_basic(self, date_range_frame): def test_subset(self, date_range_frame): N = 10 df = date_range_frame.iloc[:N].copy() - df.loc[4:8, 'A'] = np.nan - dates = date_range('1/1/1990', periods=N * 3, - freq='25s') + df.loc[4:8, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") # with a subset of A should be the same - result = df.asof(dates, subset='A') + result = df.asof(dates, subset="A") expected = df.asof(dates) tm.assert_frame_equal(result, expected) # same with A/B - result = df.asof(dates, subset=['A', 'B']) + result = df.asof(dates, subset=["A", "B"]) expected = df.asof(dates) tm.assert_frame_equal(result, expected) # B gives df.asof - result = df.asof(dates, subset='B') - expected = df.resample('25s', closed='right').ffill().reindex(dates) + result = df.asof(dates, subset="B") + expected = df.resample("25s", closed="right").ffill().reindex(dates) expected.iloc[20:] = 9 tm.assert_frame_equal(result, expected) @@ -67,14 +65,15 @@ def test_missing(self, date_range_frame): # no match found - `where` value before earliest date in index N = 10 df = date_range_frame.iloc[:N].copy() - result = df.asof('1989-12-31') + result = df.asof("1989-12-31") - expected = Series(index=['A', 'B'], name=Timestamp('1989-12-31')) + expected = Series(index=["A", "B"], name=Timestamp("1989-12-31")) tm.assert_series_equal(result, expected) - result = df.asof(to_datetime(['1989-12-31'])) - expected = DataFrame(index=to_datetime(['1989-12-31']), - columns=['A', 'B'], dtype='float64') + result = df.asof(to_datetime(["1989-12-31"])) + expected = DataFrame( + index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64" + ) tm.assert_frame_equal(result, expected) def test_all_nans(self, date_range_frame): @@ -87,41 +86,49 @@ def test_all_nans(self, date_range_frame): # testing non-default indexes, multiple inputs N = 150 rng = date_range_frame.index - dates = date_range('1/1/1990', periods=N, freq='25s') - result = DataFrame(np.nan, index=rng, columns=['A']).asof(dates) - expected = DataFrame(np.nan, index=dates, columns=['A']) + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A"]) tm.assert_frame_equal(result, expected) # testing multiple columns - dates = date_range('1/1/1990', periods=N, freq='25s') - result = DataFrame(np.nan, index=rng, - columns=['A', 'B', 'C']).asof(dates) - expected = DataFrame(np.nan, index=dates, columns=['A', 'B', 'C']) + dates = date_range("1/1/1990", periods=N, freq="25s") + result = DataFrame(np.nan, index=rng, columns=["A", "B", "C"]).asof(dates) + expected = DataFrame(np.nan, index=dates, columns=["A", "B", "C"]) tm.assert_frame_equal(result, 
expected) # testing scalar input - result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof([3]) - expected = DataFrame(np.nan, index=[3], columns=['A', 'B']) + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3]) + expected = DataFrame(np.nan, index=[3], columns=["A", "B"]) tm.assert_frame_equal(result, expected) - result = DataFrame(np.nan, index=[1, 2], columns=['A', 'B']).asof(3) - expected = Series(np.nan, index=['A', 'B'], name=3) + result = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3) + expected = Series(np.nan, index=["A", "B"], name=3) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "stamp,expected", - [(Timestamp('2018-01-01 23:22:43.325+00:00'), - Series(2.0, name=Timestamp('2018-01-01 23:22:43.325+00:00'))), - (Timestamp('2018-01-01 22:33:20.682+01:00'), - Series(1.0, name=Timestamp('2018-01-01 22:33:20.682+01:00'))), - ] + [ + ( + Timestamp("2018-01-01 23:22:43.325+00:00"), + Series(2.0, name=Timestamp("2018-01-01 23:22:43.325+00:00")), + ), + ( + Timestamp("2018-01-01 22:33:20.682+01:00"), + Series(1.0, name=Timestamp("2018-01-01 22:33:20.682+01:00")), + ), + ], ) def test_time_zone_aware_index(self, stamp, expected): # GH21194 # Testing awareness of DataFrame index considering different # UTC and timezone - df = DataFrame(data=[1, 2], - index=[Timestamp('2018-01-01 21:00:05.001+00:00'), - Timestamp('2018-01-01 22:35:10.550+00:00')]) + df = DataFrame( + data=[1, 2], + index=[ + Timestamp("2018-01-01 21:00:05.001+00:00"), + Timestamp("2018-01-01 22:35:10.550+00:00"), + ], + ) result = df.asof(stamp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 12ac373aa8f60..77be952506964 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -6,8 +6,7 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna) +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal @@ -17,101 +16,102 @@ class TestDataFrameSelectReindex: # test_indexing def test_drop_names(self): - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - df.index.name, df.columns.name = 'first', 'second' - df_dropped_b = df.drop('b') - df_dropped_e = df.drop('e', axis=1) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + df.index.name, df.columns.name = "first", "second" + df_dropped_b = df.drop("b") + df_dropped_e = df.drop("e", axis=1) df_inplace_b, df_inplace_e = df.copy(), df.copy() - df_inplace_b.drop('b', inplace=True) - df_inplace_e.drop('e', axis=1, inplace=True) + df_inplace_b.drop("b", inplace=True) + df_inplace_e.drop("e", axis=1, inplace=True) for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): - assert obj.index.name == 'first' - assert obj.columns.name == 'second' - assert list(df.columns) == ['d', 'e', 'f'] + assert obj.index.name == "first" + assert obj.columns.name == "second" + assert list(df.columns) == ["d", "e", "f"] msg = r"\['g'\] not found in axis" with pytest.raises(KeyError, match=msg): - df.drop(['g']) + df.drop(["g"]) with pytest.raises(KeyError, match=msg): - df.drop(['g'], 1) + df.drop(["g"], 1) # errors = 'ignore' - dropped = 
df.drop(['g'], errors='ignore') - expected = Index(['a', 'b', 'c'], name='first') + dropped = df.drop(["g"], errors="ignore") + expected = Index(["a", "b", "c"], name="first") tm.assert_index_equal(dropped.index, expected) - dropped = df.drop(['b', 'g'], errors='ignore') - expected = Index(['a', 'c'], name='first') + dropped = df.drop(["b", "g"], errors="ignore") + expected = Index(["a", "c"], name="first") tm.assert_index_equal(dropped.index, expected) - dropped = df.drop(['g'], axis=1, errors='ignore') - expected = Index(['d', 'e', 'f'], name='second') + dropped = df.drop(["g"], axis=1, errors="ignore") + expected = Index(["d", "e", "f"], name="second") tm.assert_index_equal(dropped.columns, expected) - dropped = df.drop(['d', 'g'], axis=1, errors='ignore') - expected = Index(['e', 'f'], name='second') + dropped = df.drop(["d", "g"], axis=1, errors="ignore") + expected = Index(["e", "f"], name="second") tm.assert_index_equal(dropped.columns, expected) # GH 16398 - dropped = df.drop([], errors='ignore') - expected = Index(['a', 'b', 'c'], name='first') + dropped = df.drop([], errors="ignore") + expected = Index(["a", "b", "c"], name="first") tm.assert_index_equal(dropped.index, expected) def test_drop_col_still_multiindex(self): - arrays = [['a', 'b', 'c', 'top'], - ['', '', '', 'OD'], - ['', '', '', 'wx']] + arrays = [["a", "b", "c", "top"], ["", "", "", "OD"], ["", "", "", "wx"]] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(3, 4), columns=index) - del df[('a', '', '')] - assert(isinstance(df.columns, MultiIndex)) + del df[("a", "", "")] + assert isinstance(df.columns, MultiIndex) def test_drop(self): simple = DataFrame({"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}) - assert_frame_equal(simple.drop("A", axis=1), simple[['B']]) - assert_frame_equal(simple.drop(["A", "B"], axis='columns'), - simple[[]]) + assert_frame_equal(simple.drop("A", axis=1), simple[["B"]]) + assert_frame_equal(simple.drop(["A", "B"], axis="columns"), simple[[]]) assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - assert_frame_equal(simple.drop( - [0, 3], axis='index'), simple.loc[[1, 2], :]) + assert_frame_equal(simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) with pytest.raises(KeyError, match=r"\[5\] not found in axis"): simple.drop(5) with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop('C', 1) + simple.drop("C", 1) with pytest.raises(KeyError, match=r"\[5\] not found in axis"): simple.drop([1, 5]) with pytest.raises(KeyError, match=r"\['C'\] not found in axis"): - simple.drop(['A', 'C'], 1) + simple.drop(["A", "C"], 1) # errors = 'ignore' - assert_frame_equal(simple.drop(5, errors='ignore'), simple) - assert_frame_equal(simple.drop([0, 5], errors='ignore'), - simple.loc[[1, 2, 3], :]) - assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple) - assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'), - simple[['B']]) + assert_frame_equal(simple.drop(5, errors="ignore"), simple) + assert_frame_equal( + simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + assert_frame_equal(simple.drop("C", axis=1, errors="ignore"), simple) + assert_frame_equal( + simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) # non-unique - wheee! 
- nu_df = DataFrame(list(zip(range(3), range(-3, 1), list('abc'))), - columns=['a', 'a', 'b']) - assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) - assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) + nu_df = DataFrame( + list(zip(range(3), range(-3, 1), list("abc"))), columns=["a", "a", "b"] + ) + assert_frame_equal(nu_df.drop("a", axis=1), nu_df[["b"]]) + assert_frame_equal(nu_df.drop("b", axis="columns"), nu_df["a"]) assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 - nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) - nu_df.columns = list('abc') - assert_frame_equal(nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :]) - assert_frame_equal(nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :]) + nu_df = nu_df.set_index(pd.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") + assert_frame_equal(nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + assert_frame_equal(nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) # inplace cache issue # GH 5628 - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) expected = df[~(df.b > 0)] df.drop(labels=df[df.b > 0].index, inplace=True) assert_frame_equal(df, expected) @@ -121,59 +121,63 @@ def test_drop_multiindex_not_lexsorted(self): # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') + index="a", columns=["b", "c"], values="d" + ) not_lexsorted_df = not_lexsorted_df.reset_index() assert not not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.drop('a', axis=1) + expected = lexsorted_df.drop("a", axis=1) with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.drop('a', axis=1) + result = not_lexsorted_df.drop("a", axis=1) tm.assert_frame_equal(result, expected) def test_drop_api_equivalence(self): # equivalence of the labels/axis and index/columns API's (GH12392) - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.drop('a') - res2 = df.drop(index='a') + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.drop("a") + res2 = df.drop(index="a") tm.assert_frame_equal(res1, res2) - res1 = df.drop('d', 1) - res2 = df.drop(columns='d') + res1 = df.drop("d", 1) + res2 = df.drop(columns="d") tm.assert_frame_equal(res1, res2) - res1 = df.drop(labels='e', axis=1) - res2 = df.drop(columns='e') + res1 = df.drop(labels="e", axis=1) + res2 = df.drop(columns="e") tm.assert_frame_equal(res1, res2) - res1 = df.drop(['a'], axis=0) - res2 = df.drop(index=['a']) + res1 = df.drop(["a"], axis=0) + res2 = df.drop(index=["a"]) tm.assert_frame_equal(res1, res2) - res1 = df.drop(['a'], axis=0).drop(['d'], axis=1) - res2 = df.drop(index=['a'], columns=['d']) + res1 = df.drop(["a"], axis=0).drop(["d"], axis=1) + res2 = df.drop(index=["a"], 
columns=["d"]) tm.assert_frame_equal(res1, res2) with pytest.raises(ValueError): - df.drop(labels='a', index='b') + df.drop(labels="a", index="b") with pytest.raises(ValueError): - df.drop(labels='a', columns='b') + df.drop(labels="a", columns="b") with pytest.raises(ValueError): df.drop(axis=1) @@ -182,25 +186,24 @@ def test_merge_join_different_levels(self): # GH 9455 # first dataframe - df1 = DataFrame(columns=['a', 'b'], data=[[1, 11], [0, 22]]) + df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]]) # second dataframe - columns = MultiIndex.from_tuples([('a', ''), ('c', 'c1')]) + columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")]) df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]]) # merge - columns = ['a', 'b', ('c', 'c1')] + columns = ["a", "b", ("c", "c1")] expected = DataFrame(columns=columns, data=[[1, 11, 33], [0, 22, 44]]) with tm.assert_produces_warning(UserWarning): - result = pd.merge(df1, df2, on='a') + result = pd.merge(df1, df2, on="a") tm.assert_frame_equal(result, expected) # join, see discussion in GH 12219 - columns = ['a', 'b', ('a', ''), ('c', 'c1')] - expected = DataFrame(columns=columns, - data=[[1, 11, 0, 44], [0, 22, 1, 33]]) + columns = ["a", "b", ("a", ""), ("c", "c1")] + expected = DataFrame(columns=columns, data=[[1, 11, 0, 44], [0, 22, 1, 33]]) with tm.assert_produces_warning(UserWarning): - result = df1.join(df2, on='a') + result = df1.join(df2, on="a") tm.assert_frame_equal(result, expected) def test_reindex(self, float_frame): @@ -266,69 +269,73 @@ def test_reindex(self, float_frame): assert result is not float_frame def test_reindex_nan(self): - df = pd.DataFrame([[1, 2], [3, 5], [7, 11], [9, 23]], - index=[2, np.nan, 1, 5], - columns=['joe', 'jim']) + df = pd.DataFrame( + [[1, 2], [3, 5], [7, 11], [9, 23]], + index=[2, np.nan, 1, 5], + columns=["joe", "jim"], + ) i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] assert_frame_equal(df.reindex(i), df.iloc[j]) - df.index = df.index.astype('object') + df.index = df.index.astype("object") assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) # GH10388 - df = pd.DataFrame({'other': ['a', 'b', np.nan, 'c'], - 'date': ['2015-03-22', np.nan, - '2012-01-08', np.nan], - 'amount': [2, 3, 4, 5]}) - - df['date'] = pd.to_datetime(df.date) - df['delta'] = (pd.to_datetime('2015-06-18') - df['date']).shift(1) - - left = df.set_index(['delta', 'other', 'date']).reset_index() - right = df.reindex(columns=['delta', 'other', 'date', 'amount']) + df = pd.DataFrame( + { + "other": ["a", "b", np.nan, "c"], + "date": ["2015-03-22", np.nan, "2012-01-08", np.nan], + "amount": [2, 3, 4, 5], + } + ) + + df["date"] = pd.to_datetime(df.date) + df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1) + + left = df.set_index(["delta", "other", "date"]).reset_index() + right = df.reindex(columns=["delta", "other", "date", "amount"]) assert_frame_equal(left, right) def test_reindex_name_remains(self): s = Series(np.random.rand(10)) df = DataFrame(s, index=np.arange(len(s))) - i = Series(np.arange(10), name='iname') + i = Series(np.arange(10), name="iname") df = df.reindex(i) - assert df.index.name == 'iname' + assert df.index.name == "iname" - df = df.reindex(Index(np.arange(10), name='tmpname')) - assert df.index.name == 'tmpname' + df = df.reindex(Index(np.arange(10), name="tmpname")) + assert df.index.name == "tmpname" s = Series(np.random.rand(10)) df = DataFrame(s.T, index=np.arange(len(s))) - i = Series(np.arange(10), name='iname') + i = Series(np.arange(10), 
name="iname") df = df.reindex(columns=i) - assert df.columns.name == 'iname' + assert df.columns.name == "iname" def test_reindex_int(self, int_frame): smaller = int_frame.reindex(int_frame.index[::2]) - assert smaller['A'].dtype == np.int64 + assert smaller["A"].dtype == np.int64 bigger = smaller.reindex(int_frame.index) - assert bigger['A'].dtype == np.float64 + assert bigger["A"].dtype == np.float64 - smaller = int_frame.reindex(columns=['A', 'B']) - assert smaller['A'].dtype == np.int64 + smaller = int_frame.reindex(columns=["A", "B"]) + assert smaller["A"].dtype == np.int64 def test_reindex_like(self, float_frame): - other = float_frame.reindex(index=float_frame.index[:10], - columns=['C', 'B']) + other = float_frame.reindex(index=float_frame.index[:10], columns=["C", "B"]) assert_frame_equal(other, float_frame.reindex_like(other)) def test_reindex_columns(self, float_frame): - new_frame = float_frame.reindex(columns=['A', 'B', 'E']) + new_frame = float_frame.reindex(columns=["A", "B", "E"]) - tm.assert_series_equal(new_frame['B'], float_frame['B']) - assert np.isnan(new_frame['E']).all() - assert 'C' not in new_frame + tm.assert_series_equal(new_frame["B"], float_frame["B"]) + assert np.isnan(new_frame["E"]).all() + assert "C" not in new_frame # Length zero new_frame = float_frame.reindex(columns=[]) @@ -337,55 +344,68 @@ def test_reindex_columns(self, float_frame): def test_reindex_columns_method(self): # GH 14992, reindexing over columns ignored method - df = DataFrame(data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], - index=[1, 2, 4], - columns=[1, 2, 4], - dtype=float) + df = DataFrame( + data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) # default method result = df.reindex(columns=range(6)) - expected = DataFrame(data=[[np.nan, 11, 12, np.nan, 13, np.nan], - [np.nan, 21, 22, np.nan, 23, np.nan], - [np.nan, 31, 32, np.nan, 33, np.nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + expected = DataFrame( + data=[ + [np.nan, 11, 12, np.nan, 13, np.nan], + [np.nan, 21, 22, np.nan, 23, np.nan], + [np.nan, 31, 32, np.nan, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) # method='ffill' - result = df.reindex(columns=range(6), method='ffill') - expected = DataFrame(data=[[np.nan, 11, 12, 12, 13, 13], - [np.nan, 21, 22, 22, 23, 23], - [np.nan, 31, 32, 32, 33, 33]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + result = df.reindex(columns=range(6), method="ffill") + expected = DataFrame( + data=[ + [np.nan, 11, 12, 12, 13, 13], + [np.nan, 21, 22, 22, 23, 23], + [np.nan, 31, 32, 32, 33, 33], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) # method='bfill' - result = df.reindex(columns=range(6), method='bfill') - expected = DataFrame(data=[[11, 11, 12, 13, 13, np.nan], - [21, 21, 22, 23, 23, np.nan], - [31, 31, 32, 33, 33, np.nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + result = df.reindex(columns=range(6), method="bfill") + expected = DataFrame( + data=[ + [11, 11, 12, 13, 13, np.nan], + [21, 21, 22, 23, 23, np.nan], + [31, 31, 32, 33, 33, np.nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) assert_frame_equal(result, expected) def test_reindex_axes(self): # GH 3317, reindexing by both axes loses freq of the index - df = DataFrame(np.ones((3, 3)), - index=[datetime(2012, 1, 1), - datetime(2012, 1, 2), - datetime(2012, 1, 3)], - columns=['a', 'b', 'c']) - 
time_freq = date_range('2012-01-01', '2012-01-03', freq='d') - some_cols = ['a', 'b'] + df = DataFrame( + np.ones((3, 3)), + index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], + columns=["a", "b", "c"], + ) + time_freq = date_range("2012-01-01", "2012-01-03", freq="d") + some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq - seq_freq = df.reindex(index=time_freq).reindex( - columns=some_cols).index.freq + seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq assert index_freq == both_freq assert index_freq == seq_freq @@ -401,9 +421,9 @@ def test_reindex_fill_value(self): assert_frame_equal(result, expected) # axis=1 - result = df.reindex(columns=range(5), fill_value=0.) + result = df.reindex(columns=range(5), fill_value=0.0) expected = df.copy() - expected[4] = 0. + expected[4] = 0.0 assert_frame_equal(result, expected) result = df.reindex(columns=range(5), fill_value=0) @@ -411,13 +431,13 @@ def test_reindex_fill_value(self): expected[4] = 0 assert_frame_equal(result, expected) - result = df.reindex(columns=range(5), fill_value='foo') + result = df.reindex(columns=range(5), fill_value="foo") expected = df.copy() - expected[4] = 'foo' + expected[4] = "foo" assert_frame_equal(result, expected) # other dtypes - df['foo'] = 'foo' + df["foo"] = "foo" result = df.reindex(range(15), fill_value=0) expected = df.reindex(range(15)).fillna(0) assert_frame_equal(result, expected) @@ -442,57 +462,57 @@ def test_reindex_dups(self): def test_reindex_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - expected = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, - index=[0, 1, 3]) + expected = pd.DataFrame( + {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3] + ) result = df.reindex([0, 1, 3]) assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis=0) assert_frame_equal(result, expected) - result = df.reindex([0, 1, 3], axis='index') + result = df.reindex([0, 1, 3], axis="index") assert_frame_equal(result, expected) def test_reindex_positional_warns(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - expected = pd.DataFrame({"A": [1., 2], 'B': [4., 5], - "C": [np.nan, np.nan]}) + expected = pd.DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]}) with tm.assert_produces_warning(FutureWarning): - result = df.reindex([0, 1], ['A', 'B', 'C']) + result = df.reindex([0, 1], ["A", "B", "C"]) assert_frame_equal(result, expected) def test_reindex_axis_style_raises(self): # https://github.com/pandas-dev/pandas/issues/12392 - df = pd.DataFrame({"A": [1, 2, 3], 'B': [4, 5, 6]}) + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex([0, 1], ['A'], axis=1) + df.reindex([0, 1], ["A"], axis=1) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex([0, 1], ['A'], axis='index') + df.reindex([0, 1], ["A"], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='index') + df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='columns') + df.reindex(index=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(columns=[0, 1], 
axis='columns') + df.reindex(columns=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], columns=[0, 1], axis='columns') + df.reindex(index=[0, 1], columns=[0, 1], axis="columns") - with pytest.raises(TypeError, match='Cannot specify all'): - df.reindex([0, 1], [0], ['A']) + with pytest.raises(TypeError, match="Cannot specify all"): + df.reindex([0, 1], [0], ["A"]) # Mixing styles with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='index') + df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): - df.reindex(index=[0, 1], axis='columns') + df.reindex(index=[0, 1], axis="columns") # Duplicates with pytest.raises(TypeError, match="multiple values"): @@ -501,37 +521,38 @@ def test_reindex_axis_style_raises(self): def test_reindex_single_named_indexer(self): # https://github.com/pandas-dev/pandas/issues/12392 df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) - result = df.reindex([0, 1], columns=['A']) + result = df.reindex([0, 1], columns=["A"]) expected = pd.DataFrame({"A": [1, 2]}) assert_frame_equal(result, expected) def test_reindex_api_equivalence(self): # https://github.com/pandas-dev/pandas/issues/12392 # equivalence of the labels/axis and index/columns API's - df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], - index=['a', 'b', 'c'], - columns=['d', 'e', 'f']) - - res1 = df.reindex(['b', 'a']) - res2 = df.reindex(index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a']) - res4 = df.reindex(labels=['b', 'a'], axis=0) - res5 = df.reindex(['b', 'a'], axis=0) + df = DataFrame( + [[1, 2, 3], [3, 4, 5], [5, 6, 7]], + index=["a", "b", "c"], + columns=["d", "e", "f"], + ) + + res1 = df.reindex(["b", "a"]) + res2 = df.reindex(index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"]) + res4 = df.reindex(labels=["b", "a"], axis=0) + res5 = df.reindex(["b", "a"], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) - res1 = df.reindex(columns=['e', 'd']) - res2 = df.reindex(['e', 'd'], axis=1) - res3 = df.reindex(labels=['e', 'd'], axis=1) + res1 = df.reindex(columns=["e", "d"]) + res2 = df.reindex(["e", "d"], axis=1) + res3 = df.reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) with tm.assert_produces_warning(FutureWarning) as m: - res1 = df.reindex(['b', 'a'], ['e', 'd']) - assert 'reindex' in str(m[0].message) - res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) - res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], - axis=1) + res1 = df.reindex(["b", "a"], ["e", "d"]) + assert "reindex" in str(m[0].message) + res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) + res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) @@ -556,7 +577,7 @@ def test_align_float(self, float_frame): diff_b_vals = bf.reindex(diff_b).values assert (diff_a_vals == -1).all() - af, bf = float_frame.align(other, join='right', axis=0) + af, bf = float_frame.align(other, join="right", axis=0) tm.assert_index_equal(bf.columns, other.columns) tm.assert_index_equal(bf.index, other.index) tm.assert_index_equal(af.index, other.index) @@ -578,23 +599,25 @@ def test_align_float(self, float_frame): assert (diff_a_vals == -1).all() - af, bf = float_frame.align(other, join='inner', axis=1) + af, bf = float_frame.align(other, join="inner", axis=1) tm.assert_index_equal(bf.columns, other.columns) - 
af, bf = float_frame.align(other, join='inner', axis=1, method='pad') + af, bf = float_frame.align(other, join="inner", axis=1, method="pad") tm.assert_index_equal(bf.columns, other.columns) - af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=None) + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None + ) tm.assert_index_equal(bf.index, Index([])) - af, bf = float_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) # Try to align DataFrame to Series along bad axis with pytest.raises(ValueError): - float_frame.align(af.iloc[0, :3], join='inner', axis=2) + float_frame.align(af.iloc[0, :3], join="inner", axis=2) # align dataframe to series with broadcast or not idx = float_frame.index @@ -608,51 +631,56 @@ def test_align_float(self, float_frame): left, right = float_frame.align(s, broadcast_axis=1) tm.assert_index_equal(left.index, float_frame.index) expected = {c: s for c in float_frame.columns} - expected = DataFrame(expected, index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + expected, index=float_frame.index, columns=float_frame.columns + ) tm.assert_frame_equal(right, expected) # see gh-9558 - df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) - result = df[df['a'] == 2] - expected = DataFrame([[2, 5]], index=[1], columns=['a', 'b']) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df[df["a"] == 2] + expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"]) tm.assert_frame_equal(result, expected) - result = df.where(df['a'] == 2, 0) - expected = DataFrame({'a': [0, 2, 0], 'b': [0, 5, 0]}) + result = df.where(df["a"] == 2, 0) + expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) def test_align_int(self, int_frame): # test other non-float types - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = int_frame.align(other, join='inner', axis=1, method='pad') + af, bf = int_frame.align(other, join="inner", axis=1, method="pad") tm.assert_index_equal(bf.columns, other.columns) def test_align_mixed_type(self, float_string_frame): - af, bf = float_string_frame.align(float_string_frame, - join='inner', axis=1, method='pad') + af, bf = float_string_frame.align( + float_string_frame, join="inner", axis=1, method="pad" + ) tm.assert_index_equal(bf.columns, float_string_frame.columns) def test_align_mixed_float(self, mixed_float_frame): # mixed floats/ints - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = mixed_float_frame.align(other.iloc[:, 0], join='inner', - axis=1, method=None, fill_value=0) + af, bf = mixed_float_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) def test_align_mixed_int(self, mixed_int_frame): - other = DataFrame(index=range(5), columns=['A', 'B', 'C']) + other = DataFrame(index=range(5), columns=["A", "B", "C"]) - af, bf = mixed_int_frame.align(other.iloc[:, 0], join='inner', axis=1, - method=None, fill_value=0) + af, bf = mixed_int_frame.align( + other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 + ) tm.assert_index_equal(bf.index, Index([])) def _check_align(self, a, b, axis, 
fill_axis, how, method, limit=None): - aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, - fill_axis=fill_axis) + aa, ab = a.align( + b, axis=axis, join=how, method=method, limit=limit, fill_axis=fill_axis + ) join_index, join_columns = None, None @@ -673,10 +701,10 @@ def _check_align(self, a, b, axis, fill_axis, how, method, limit=None): assert_frame_equal(aa, ea) assert_frame_equal(ab, eb) - @pytest.mark.parametrize('meth', ['pad', 'bfill']) - @pytest.mark.parametrize('ax', [0, 1, None]) - @pytest.mark.parametrize('fax', [0, 1]) - @pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right']) + @pytest.mark.parametrize("meth", ["pad", "bfill"]) + @pytest.mark.parametrize("ax", [0, 1, None]) + @pytest.mark.parametrize("fax", [0, 1]) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) def test_align_fill_method(self, how, meth, ax, fax, float_frame): df = float_frame self._check_align_fill(df, how, meth, ax, fax) @@ -686,36 +714,36 @@ def _check_align_fill(self, frame, kind, meth, ax, fax): right = frame.iloc[2:, 6:] empty = frame.iloc[:0, :0] - self._check_align(left, right, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(left, right, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(left, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # empty left - self._check_align(empty, right, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(empty, right, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(empty, right, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, right, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # empty right - self._check_align(left, empty, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(left, empty, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(left, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + left, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) # both empty - self._check_align(empty, empty, axis=ax, fill_axis=fax, - how=kind, method=meth) - self._check_align(empty, empty, axis=ax, fill_axis=fax, - how=kind, method=meth, limit=1) + self._check_align(empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth) + self._check_align( + empty, empty, axis=ax, fill_axis=fax, how=kind, method=meth, limit=1 + ) def test_align_int_fill_bug(self): # GH #910 - X = np.arange(10 * 10, dtype='float64').reshape(10, 10) + X = np.arange(10 * 10, dtype="float64").reshape(10, 10) Y = np.ones((10, 1), dtype=int) df1 = DataFrame(X) - df1['0.X'] = Y.squeeze() + df1["0.X"] = Y.squeeze() df2 = df1.astype(float) @@ -727,15 +755,16 @@ def test_align_multiindex(self): # GH 10665 # same test cases as test_align_multiindex in test_series.py - midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], - names=('a', 'b', 'c')) - idx = pd.Index(range(2), name='b') - df1 = pd.DataFrame(np.arange(12, dtype='int64'), index=midx) - df2 = pd.DataFrame(np.arange(2, dtype='int64'), index=idx) + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + df1 = pd.DataFrame(np.arange(12, dtype="int64"), index=midx) + df2 = pd.DataFrame(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) - res1l, res1r = df1.align(df2, 
join='left') - res2l, res2r = df2.align(df1, join='right') + res1l, res1r = df1.align(df2, join="left") + res2l, res2r = df2.align(df1, join="right") expl = df1 assert_frame_equal(expl, res1l) @@ -744,11 +773,12 @@ def test_align_multiindex(self): assert_frame_equal(expr, res1r) assert_frame_equal(expr, res2l) - res1l, res1r = df1.align(df2, join='right') - res2l, res2r = df2.align(df1, join='left') + res1l, res1r = df1.align(df2, join="right") + res2l, res2r = df2.align(df1, join="left") - exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], - names=('a', 'b', 'c')) + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) expl = pd.DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) assert_frame_equal(expl, res1l) assert_frame_equal(expl, res2r) @@ -757,17 +787,16 @@ def test_align_multiindex(self): assert_frame_equal(expr, res2l) def test_align_series_combinations(self): - df = pd.DataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - s = pd.Series([1, 2, 4], index=list('ABD'), name='x') + df = pd.DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = pd.Series([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], - index=list('ABCDE'), name='x') + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") tm.assert_frame_equal(res1, exp1) tm.assert_series_equal(res2, exp2) @@ -779,111 +808,114 @@ def test_align_series_combinations(self): def test_filter(self, float_frame, float_string_frame): # Items - filtered = float_frame.filter(['A', 'B', 'E']) + filtered = float_frame.filter(["A", "B", "E"]) assert len(filtered.columns) == 2 - assert 'E' not in filtered + assert "E" not in filtered - filtered = float_frame.filter(['A', 'B', 'E'], axis='columns') + filtered = float_frame.filter(["A", "B", "E"], axis="columns") assert len(filtered.columns) == 2 - assert 'E' not in filtered + assert "E" not in filtered # Other axis idx = float_frame.index[0:4] - filtered = float_frame.filter(idx, axis='index') + filtered = float_frame.filter(idx, axis="index") expected = float_frame.reindex(index=idx) tm.assert_frame_equal(filtered, expected) # like fcopy = float_frame.copy() - fcopy['AA'] = 1 + fcopy["AA"] = 1 - filtered = fcopy.filter(like='A') + filtered = fcopy.filter(like="A") assert len(filtered.columns) == 2 - assert 'AA' in filtered + assert "AA" in filtered # like with ints in column names - df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) - filtered = df.filter(like='_') + df = DataFrame(0.0, index=[0, 1, 2], columns=[0, 1, "_A", "_B"]) + filtered = df.filter(like="_") assert len(filtered.columns) == 2 # regex with ints in column names # from PR #10384 - df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C']) + df = DataFrame(0.0, index=[0, 1, 2], columns=["A1", 1, "B", 2, "C"]) expected = DataFrame( - 0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object)) - filtered = df.filter(regex='^[0-9]+$') + 0.0, index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object) + ) + filtered = df.filter(regex="^[0-9]+$") tm.assert_frame_equal(filtered, expected) - expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1']) + expected = DataFrame(0.0, index=[0, 1, 
2], columns=[0, "0", 1, "1"]) # shouldn't remove anything - filtered = expected.filter(regex='^[0-9]+$') + filtered = expected.filter(regex="^[0-9]+$") tm.assert_frame_equal(filtered, expected) # pass in None - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter() - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter(items=None) - with pytest.raises(TypeError, match='Must pass'): + with pytest.raises(TypeError, match="Must pass"): float_frame.filter(axis=1) # test mutually exclusive arguments - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$', like='bbi') - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$', axis=1) - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], regex='e$') - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], like='bbi', axis=0) - with pytest.raises(TypeError, match='mutually exclusive'): - float_frame.filter(items=['one', 'three'], like='bbi') + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", like="bbi") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$", axis=1) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], regex="e$") + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi", axis=0) + with pytest.raises(TypeError, match="mutually exclusive"): + float_frame.filter(items=["one", "three"], like="bbi") # objects - filtered = float_string_frame.filter(like='foo') - assert 'foo' in filtered + filtered = float_string_frame.filter(like="foo") + assert "foo" in filtered # unicode columns, won't ascii-encode - df = float_frame.rename(columns={'B': '\u2202'}) - filtered = df.filter(like='C') - assert 'C' in filtered + df = float_frame.rename(columns={"B": "\u2202"}) + filtered = df.filter(like="C") + assert "C" in filtered def test_filter_regex_search(self, float_frame): fcopy = float_frame.copy() - fcopy['AA'] = 1 + fcopy["AA"] = 1 # regex - filtered = fcopy.filter(regex='[A]+') + filtered = fcopy.filter(regex="[A]+") assert len(filtered.columns) == 2 - assert 'AA' in filtered + assert "AA" in filtered # doesn't have to be at beginning - df = DataFrame({'aBBa': [1, 2], - 'BBaBB': [1, 2], - 'aCCa': [1, 2], - 'aCCaBB': [1, 2]}) + df = DataFrame( + {"aBBa": [1, 2], "BBaBB": [1, 2], "aCCa": [1, 2], "aCCaBB": [1, 2]} + ) - result = df.filter(regex='BB') - exp = df[[x for x in df.columns if 'BB' in x]] + result = df.filter(regex="BB") + exp = df[[x for x in df.columns if "BB" in x]] assert_frame_equal(result, exp) - @pytest.mark.parametrize('name,expected', [ - ('a', DataFrame({'a': [1, 2]})), - ('a', DataFrame({'a': [1, 2]})), - ('あ', DataFrame({'あ': [3, 4]})) - ]) + @pytest.mark.parametrize( + "name,expected", + [ + ("a", DataFrame({"a": [1, 2]})), + ("a", DataFrame({"a": [1, 2]})), + ("あ", DataFrame({"あ": [3, 4]})), + ], + ) def test_filter_unicode(self, name, expected): # GH13101 - df = DataFrame({'a': [1, 2], 'あ': [3, 4]}) + df = DataFrame({"a": [1, 2], "あ": [3, 4]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), 
expected) - @pytest.mark.parametrize('name', ['a', 'a']) + @pytest.mark.parametrize("name", ["a", "a"]) def test_filter_bytestring(self, name): # GH13101 - df = DataFrame({b'a': [1, 2], b'b': [3, 4]}) - expected = DataFrame({b'a': [1, 2]}) + df = DataFrame({b"a": [1, 2], b"b": [3, 4]}) + expected = DataFrame({b"a": [1, 2]}) assert_frame_equal(df.filter(like=name), expected) assert_frame_equal(df.filter(regex=name), expected) @@ -894,7 +926,7 @@ def test_filter_corner(self): result = empty.filter([]) assert_frame_equal(result, empty) - result = empty.filter(like='foo') + result = empty.filter(like="foo") assert_frame_equal(result, empty) def test_take(self, float_frame): @@ -908,7 +940,7 @@ def test_take(self, float_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['D', 'B', 'C', 'A']] + expected = df.loc[:, ["D", "B", "C", "A"]] assert_frame_equal(result, expected, check_names=False) # negative indices @@ -924,7 +956,7 @@ def test_take(self, float_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['C', 'B', 'D']] + expected = df.loc[:, ["C", "B", "D"]] assert_frame_equal(result, expected, check_names=False) # illegal indices @@ -950,7 +982,7 @@ def test_take_mixed_type(self, float_string_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['foo', 'B', 'C', 'A', 'D']] + expected = df.loc[:, ["foo", "B", "C", "A", "D"]] assert_frame_equal(result, expected) # negative indices @@ -963,7 +995,7 @@ def test_take_mixed_type(self, float_string_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['foo', 'B', 'D']] + expected = df.loc[:, ["foo", "B", "D"]] assert_frame_equal(result, expected) def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): @@ -977,13 +1009,13 @@ def test_take_mixed_numeric(self, mixed_float_frame, mixed_int_frame): # axis = 1 result = df.take(order, axis=1) - expected = df.loc[:, ['B', 'C', 'A', 'D']] + expected = df.loc[:, ["B", "C", "A", "D"]] assert_frame_equal(result, expected) def test_reindex_boolean(self): - frame = DataFrame(np.ones((10, 2), dtype=bool), - index=np.arange(0, 20, 2), - columns=[0, 2]) + frame = DataFrame( + np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] + ) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ @@ -994,26 +1026,28 @@ def test_reindex_boolean(self): assert isna(reindexed[1]).all() def test_reindex_objects(self, float_string_frame): - reindexed = float_string_frame.reindex(columns=['foo', 'A', 'B']) - assert 'foo' in reindexed + reindexed = float_string_frame.reindex(columns=["foo", "A", "B"]) + assert "foo" in reindexed - reindexed = float_string_frame.reindex(columns=['A', 'B']) - assert 'foo' not in reindexed + reindexed = float_string_frame.reindex(columns=["A", "B"]) + assert "foo" not in reindexed def test_reindex_corner(self, int_frame): - index = Index(['a', 'b', 'c']) + index = Index(["a", "b", "c"]) dm = DataFrame({}).reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) tm.assert_index_equal(reindexed.columns, index) # ints are weird - smaller = int_frame.reindex(columns=['A', 'B', 'E']) - assert smaller['E'].dtype == np.float64 + smaller = int_frame.reindex(columns=["A", "B", "E"]) + assert smaller["E"].dtype == np.float64 def test_reindex_with_nans(self): - df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], - columns=['a', 'b'], - index=[100.0, 101.0, np.nan, 102.0, 103.0]) + df = DataFrame( + [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], 
[9, 10]], + columns=["a", "b"], + index=[100.0, 101.0, np.nan, 102.0, 103.0], + ) result = df.reindex(index=[101.0, 102.0, 103.0]) expected = df.iloc[[1, 3, 4]] @@ -1049,60 +1083,63 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) - df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c']) + df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"]) - result = df.reindex(index=[0, 1], columns=['a', 'b']) - expected = df.reindex([0, 1]).reindex(columns=['a', 'b']) + result = df.reindex(index=[0, 1], columns=["a", "b"]) + expected = df.reindex([0, 1]).reindex(columns=["a", "b"]) assert_frame_equal(result, expected) def test_reindex_multi_categorical_time(self): # https://github.com/pandas-dev/pandas/issues/21390 midx = pd.MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(date_range("2012-01-01", periods=3, freq='H'))]) - df = pd.DataFrame({'a': range(len(midx))}, index=midx) + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) + df = pd.DataFrame({"a": range(len(midx))}, index=midx) df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] result = df2.reindex(midx) - expected = pd.DataFrame( - {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + expected = pd.DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) assert_frame_equal(result, expected) data = [[1, 2, 3], [1, 2, 3]] - @pytest.mark.parametrize('actual', [ - DataFrame(data=data, index=['a', 'a']), - DataFrame(data=data, index=['a', 'b']), - DataFrame(data=data, index=['a', 'b']).set_index([0, 1]), - DataFrame(data=data, index=['a', 'a']).set_index([0, 1]) - ]) + @pytest.mark.parametrize( + "actual", + [ + DataFrame(data=data, index=["a", "a"]), + DataFrame(data=data, index=["a", "b"]), + DataFrame(data=data, index=["a", "b"]).set_index([0, 1]), + DataFrame(data=data, index=["a", "a"]).set_index([0, 1]), + ], + ) def test_raise_on_drop_duplicate_index(self, actual): # issue 19186 level = 0 if isinstance(actual.index, MultiIndex) else None with pytest.raises(KeyError): - actual.drop('c', level=level, axis=0) + actual.drop("c", level=level, axis=0) with pytest.raises(KeyError): - actual.T.drop('c', level=level, axis=1) - expected_no_err = actual.drop('c', axis=0, level=level, - errors='ignore') + actual.T.drop("c", level=level, axis=1) + expected_no_err = actual.drop("c", axis=0, level=level, errors="ignore") assert_frame_equal(expected_no_err, actual) - expected_no_err = actual.T.drop('c', axis=1, level=level, - errors='ignore') + expected_no_err = actual.T.drop("c", axis=1, level=level, errors="ignore") assert_frame_equal(expected_no_err.T, actual) - @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) - @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize("drop_labels", [[], [1], [2]]) def test_drop_empty_list(self, index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] frame = pd.DataFrame(index=index).drop(drop_labels) tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) - @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) - @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + @pytest.mark.parametrize("index", [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize("drop_labels", [[1, 4], [4, 5]]) def test_drop_non_empty_list(self, index, drop_labels): # GH 21494 - with pytest.raises(KeyError, match='not found in axis'): + with pytest.raises(KeyError, match="not found in 
axis"): pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index f1cbd7763474e..37b0d61ee31d9 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -7,14 +7,23 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, Timestamp, compat, date_range, - option_context) + Categorical, + DataFrame, + Series, + Timestamp, + compat, + date_range, + option_context, +) from pandas.core.arrays import IntervalArray, integer_array from pandas.core.internals import ObjectBlock from pandas.core.internals.blocks import IntBlock import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) # Segregated collection of methods that require the BlockManager internal data # structure @@ -25,17 +34,17 @@ def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 altering a datetime64tz column inplace invalidates the # `freq` attribute on the underlying DatetimeIndex - dti = date_range('20130101', periods=3, tz='US/Eastern') + dti = date_range("20130101", periods=3, tz="US/Eastern") ts = dti[1] - df = DataFrame({'B': dti}) - assert df['B']._values.freq == 'D' + df = DataFrame({"B": dti}) + assert df["B"]._values.freq == "D" df.iloc[1, 0] = pd.NaT - assert df['B']._values.freq is None + assert df["B"]._values.freq is None # check that the DatetimeIndex was not altered in place - assert dti.freq == 'D' + assert dti.freq == "D" assert dti[1] == ts def test_cast_internals(self, float_frame): @@ -48,7 +57,7 @@ def test_cast_internals(self, float_frame): assert_frame_equal(casted, expected) def test_consolidate(self, float_frame): - float_frame['E'] = 7. + float_frame["E"] = 7.0 consolidated = float_frame._consolidate() assert len(consolidated._data.blocks) == 1 @@ -57,7 +66,7 @@ def test_consolidate(self, float_frame): assert recons is not consolidated tm.assert_frame_equal(recons, consolidated) - float_frame['F'] = 8. + float_frame["F"] = 8.0 assert len(float_frame._data.blocks) == 3 float_frame._consolidate(inplace=True) @@ -67,11 +76,11 @@ def test_consolidate_inplace(self, float_frame): frame = float_frame.copy() # noqa # triggers in-place consolidation - for letter in range(ord('A'), ord('Z')): + for letter in range(ord("A"), ord("Z")): float_frame[chr(letter)] = chr(letter) def test_values_consolidate(self, float_frame): - float_frame['E'] = 7. + float_frame["E"] = 7.0 assert not float_frame._data.is_consolidated() _ = float_frame.values # noqa assert float_frame._data.is_consolidated() @@ -81,12 +90,12 @@ def test_modify_values(self, float_frame): assert (float_frame.values[5] == 5).all() # unconsolidated - float_frame['E'] = 7. + float_frame["E"] = 7.0 float_frame.values[6] = 6 assert (float_frame.values[6] == 6).all() def test_boolean_set_uncons(self, float_frame): - float_frame['E'] = 7. 
+ float_frame["E"] = 7.0 expected = float_frame.values.copy() expected[expected > 1] = 2 @@ -95,166 +104,174 @@ def test_boolean_set_uncons(self, float_frame): assert_almost_equal(expected, float_frame.values) def test_values_numeric_cols(self, float_frame): - float_frame['foo'] = 'bar' + float_frame["foo"] = "bar" - values = float_frame[['A', 'B', 'C', 'D']].values + values = float_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 def test_values_lcd(self, mixed_float_frame, mixed_int_frame): # mixed lcd - values = mixed_float_frame[['A', 'B', 'C', 'D']].values + values = mixed_float_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 - values = mixed_float_frame[['A', 'B', 'C']].values + values = mixed_float_frame[["A", "B", "C"]].values assert values.dtype == np.float32 - values = mixed_float_frame[['C']].values + values = mixed_float_frame[["C"]].values assert values.dtype == np.float16 # GH 10364 # B uint64 forces float because there are other signed int types - values = mixed_int_frame[['A', 'B', 'C', 'D']].values + values = mixed_int_frame[["A", "B", "C", "D"]].values assert values.dtype == np.float64 - values = mixed_int_frame[['A', 'D']].values + values = mixed_int_frame[["A", "D"]].values assert values.dtype == np.int64 # B uint64 forces float because there are other signed int types - values = mixed_int_frame[['A', 'B', 'C']].values + values = mixed_int_frame[["A", "B", "C"]].values assert values.dtype == np.float64 # as B and C are both unsigned, no forcing to float is needed - values = mixed_int_frame[['B', 'C']].values + values = mixed_int_frame[["B", "C"]].values assert values.dtype == np.uint64 - values = mixed_int_frame[['A', 'C']].values + values = mixed_int_frame[["A", "C"]].values assert values.dtype == np.int32 - values = mixed_int_frame[['C', 'D']].values + values = mixed_int_frame[["C", "D"]].values assert values.dtype == np.int64 - values = mixed_int_frame[['A']].values + values = mixed_int_frame[["A"]].values assert values.dtype == np.int32 - values = mixed_int_frame[['C']].values + values = mixed_int_frame[["C"]].values assert values.dtype == np.uint8 def test_constructor_with_convert(self): # this is actually mostly a test of lib.maybe_convert_objects # #2845 - df = DataFrame({'A': [2 ** 63 - 1]}) - result = df['A'] - expected = Series(np.asarray([2 ** 63 - 1], np.int64), name='A') + df = DataFrame({"A": [2 ** 63 - 1]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63 - 1], np.int64), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2 ** 63]}) - result = df['A'] - expected = Series(np.asarray([2 ** 63], np.uint64), name='A') + df = DataFrame({"A": [2 ** 63]}) + result = df["A"] + expected = Series(np.asarray([2 ** 63], np.uint64), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [datetime(2005, 1, 1), True]}) - result = df['A'] - expected = Series(np.asarray([datetime(2005, 1, 1), True], np.object_), - name='A') + df = DataFrame({"A": [datetime(2005, 1, 1), True]}) + result = df["A"] + expected = Series( + np.asarray([datetime(2005, 1, 1), True], np.object_), name="A" + ) assert_series_equal(result, expected) - df = DataFrame({'A': [None, 1]}) - result = df['A'] - expected = Series(np.asarray([np.nan, 1], np.float_), name='A') + df = DataFrame({"A": [None, 1]}) + result = df["A"] + expected = Series(np.asarray([np.nan, 1], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0, 2]}) - result = df['A'] - expected = Series(np.asarray([1.0, 2], 
np.float_), name='A') + df = DataFrame({"A": [1.0, 2]}) + result = df["A"] + expected = Series(np.asarray([1.0, 2], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, 3]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, 3]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, 3.0]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, 3.0]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, True]}) - result = df['A'] - expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, True]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0, None]}) - result = df['A'] - expected = Series(np.asarray([1.0, np.nan], np.float_), name='A') + df = DataFrame({"A": [1.0, None]}) + result = df["A"] + expected = Series(np.asarray([1.0, np.nan], np.float_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [1.0 + 2.0j, None]}) - result = df['A'] - expected = Series(np.asarray( - [1.0 + 2.0j, np.nan], np.complex_), name='A') + df = DataFrame({"A": [1.0 + 2.0j, None]}) + result = df["A"] + expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2.0, 1, True, None]}) - result = df['A'] - expected = Series(np.asarray( - [2.0, 1, True, None], np.object_), name='A') + df = DataFrame({"A": [2.0, 1, True, None]}) + result = df["A"] + expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A") assert_series_equal(result, expected) - df = DataFrame({'A': [2.0, 1, datetime(2006, 1, 1), None]}) - result = df['A'] - expected = Series(np.asarray([2.0, 1, datetime(2006, 1, 1), - None], np.object_), name='A') + df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]}) + result = df["A"] + expected = Series( + np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A" + ) assert_series_equal(result, expected) def test_construction_with_mixed(self, float_string_frame): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround - data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 1)]] + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] df = DataFrame(data) # check dtypes result = df.dtypes - expected = Series({'datetime64[ns]': 3}) + expected = Series({"datetime64[ns]": 3}) # mixed-type frames - float_string_frame['datetime'] = datetime.now() - float_string_frame['timedelta'] = timedelta(days=1, seconds=1) - assert float_string_frame['datetime'].dtype == 'M8[ns]' - assert float_string_frame['timedelta'].dtype == 'm8[ns]' + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) + assert float_string_frame["datetime"].dtype == "M8[ns]" + assert float_string_frame["timedelta"].dtype == "m8[ns]" result = float_string_frame.dtypes - expected = 
Series([np.dtype('float64')] * 4 + - [np.dtype('object'), - np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]')], - index=list('ABCD') + ['foo', 'datetime', - 'timedelta']) + expected = Series( + [np.dtype("float64")] * 4 + + [ + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + ], + index=list("ABCD") + ["foo", "datetime", "timedelta"], + ) assert_series_equal(result, expected) def test_construction_with_conversions(self): # convert from a numpy array of non-ns timedelta64 - arr = np.array([1, 2, 3], dtype='timedelta64[s]') + arr = np.array([1, 2, 3], dtype="timedelta64[s]") df = DataFrame(index=range(3)) - df['A'] = arr - expected = DataFrame({'A': pd.timedelta_range('00:00:01', periods=3, - freq='s')}, - index=range(3)) + df["A"] = arr + expected = DataFrame( + {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3) + ) assert_frame_equal(df, expected) - expected = DataFrame({ - 'dt1': Timestamp('20130101'), - 'dt2': date_range('20130101', periods=3), - # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), - }, index=range(3)) + expected = DataFrame( + { + "dt1": Timestamp("20130101"), + "dt2": date_range("20130101", periods=3), + # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'), + }, + index=range(3), + ) df = DataFrame(index=range(3)) - df['dt1'] = np.datetime64('2013-01-01') - df['dt2'] = np.array(['2013-01-01', '2013-01-02', '2013-01-03'], - dtype='datetime64[D]') + df["dt1"] = np.datetime64("2013-01-01") + df["dt2"] = np.array( + ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]" + ) # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]') @@ -266,36 +283,29 @@ def test_constructor_compound_dtypes(self): # compound dtypes should raise not-implementederror def f(dtype): - data = list(itertools.repeat((datetime(2001, 1, 1), - "aa", 20), 9)) - return DataFrame(data=data, - columns=["A", "B", "C"], - dtype=dtype) - - msg = ("compound dtypes are not implemented in the DataFrame" - " constructor") + data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9)) + return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype) + + msg = "compound dtypes are not implemented in the DataFrame" " constructor" with pytest.raises(NotImplementedError, match=msg): - f([("A", "datetime64[h]"), - ("B", "str"), - ("C", "int32")]) + f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) # these work (though results may be unexpected) - f('int64') - f('float64') + f("int64") + f("float64") # 10822 # invalid error message on dt inference if not compat.is_platform_windows(): - f('M8[ns]') + f("M8[ns]") def test_equals_different_blocks(self): # GH 9330 - df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], - "C": ["w", "z"]}) + df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] # this assert verifies that the above operations have # induced a block rearrangement - assert (df0._data.blocks[0].dtype != df1._data.blocks[0].dtype) + assert df0._data.blocks[0].dtype != df1._data.blocks[0].dtype # do the real tests assert_frame_equal(df0, df1) @@ -310,8 +320,7 @@ def test_copy_blocks(self, float_frame): # use the default copy=True, change a column # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): blocks = df.as_blocks() for dtype, _df in blocks.items(): if column in _df: @@ -328,8 
+337,7 @@ def test_no_copy_blocks(self, float_frame): # use the copy=False, change a column # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): blocks = df.as_blocks(copy=False) for dtype, _df in blocks.items(): if column in _df: @@ -340,8 +348,8 @@ def test_no_copy_blocks(self, float_frame): def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() - cop['E'] = cop['A'] - assert 'E' not in float_frame + cop["E"] = cop["A"] + assert "E" not in float_frame # copy objects copy = float_string_frame.copy() @@ -379,21 +387,20 @@ def test_consolidate_datetime64(self): ser_starting = df.starting ser_starting.index = ser_starting.values - ser_starting = ser_starting.tz_localize('US/Eastern') - ser_starting = ser_starting.tz_convert('UTC') - ser_starting.index.name = 'starting' + ser_starting = ser_starting.tz_localize("US/Eastern") + ser_starting = ser_starting.tz_convert("UTC") + ser_starting.index.name = "starting" ser_ending = df.ending ser_ending.index = ser_ending.values - ser_ending = ser_ending.tz_localize('US/Eastern') - ser_ending = ser_ending.tz_convert('UTC') - ser_ending.index.name = 'ending' + ser_ending = ser_ending.tz_localize("US/Eastern") + ser_ending = ser_ending.tz_convert("UTC") + ser_ending.index.name = "ending" df.starting = ser_starting.index df.ending = ser_ending.index - tm.assert_index_equal(pd.DatetimeIndex( - df.starting), ser_starting.index) + tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index) tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index) def test_is_mixed_type(self, float_frame, float_string_frame): @@ -405,40 +412,50 @@ def test_get_numeric_data(self): intname = np.dtype(np.int_).name # noqa floatname = np.dtype(np.float_).name # noqa - datetime64name = np.dtype('M8[ns]').name + datetime64name = np.dtype("M8[ns]").name objectname = np.dtype(np.object_).name - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'f': Timestamp('20010102')}, - index=np.arange(10)) + df = DataFrame( + {"a": 1.0, "b": 2, "c": "foo", "f": Timestamp("20010102")}, + index=np.arange(10), + ) result = df.dtypes - expected = Series([np.dtype('float64'), - np.dtype('int64'), - np.dtype(objectname), - np.dtype(datetime64name)], - index=['a', 'b', 'c', 'f']) + expected = Series( + [ + np.dtype("float64"), + np.dtype("int64"), + np.dtype(objectname), + np.dtype(datetime64name), + ], + index=["a", "b", "c", "f"], + ) assert_series_equal(result, expected) - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - 'd': np.array([1.] 
* 10, dtype='float32'), - 'e': np.array([1] * 10, dtype='int32'), - 'f': np.array([1] * 10, dtype='int16'), - 'g': Timestamp('20010102')}, - index=np.arange(10)) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + "d": np.array([1.0] * 10, dtype="float32"), + "e": np.array([1] * 10, dtype="int32"), + "f": np.array([1] * 10, dtype="int16"), + "g": Timestamp("20010102"), + }, + index=np.arange(10), + ) result = df._get_numeric_data() - expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']] + expected = df.loc[:, ["a", "b", "d", "e", "f"]] assert_frame_equal(result, expected) - only_obj = df.loc[:, ['c', 'g']] + only_obj = df.loc[:, ["c", "g"]] result = only_obj._get_numeric_data() expected = df.loc[:, []] assert_frame_equal(result, expected) - df = DataFrame.from_dict( - {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]}) + df = DataFrame.from_dict({"a": [1, 2], "b": ["foo", "bar"], "c": [np.pi, np.e]}) result = df._get_numeric_data() - expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]}) + expected = DataFrame.from_dict({"a": [1, 2], "c": [np.pi, np.e]}) assert_frame_equal(result, expected) df = result.copy() @@ -448,13 +465,16 @@ def test_get_numeric_data(self): def test_get_numeric_data_extension_dtype(self): # GH 22290 - df = DataFrame({ - 'A': integer_array([-10, np.nan, 0, 10, 20, 30], dtype='Int64'), - 'B': Categorical(list('abcabc')), - 'C': integer_array([0, 1, 2, 3, np.nan, 5], dtype='UInt8'), - 'D': IntervalArray.from_breaks(range(7))}) + df = DataFrame( + { + "A": integer_array([-10, np.nan, 0, 10, 20, 30], dtype="Int64"), + "B": Categorical(list("abcabc")), + "C": integer_array([0, 1, 2, 3, np.nan, 5], dtype="UInt8"), + "D": IntervalArray.from_breaks(range(7)), + } + ) result = df._get_numeric_data() - expected = df.loc[:, ['A', 'C']] + expected = df.loc[:, ["A", "C"]] assert_frame_equal(result, expected) def test_convert_objects(self, float_string_frame): @@ -462,100 +482,114 @@ def test_convert_objects(self, float_string_frame): oops = float_string_frame.T.T converted = oops._convert(datetime=True) assert_frame_equal(converted, float_string_frame) - assert converted['A'].dtype == np.float64 + assert converted["A"].dtype == np.float64 # force numeric conversion - float_string_frame['H'] = '1.' - float_string_frame['I'] = '1' + float_string_frame["H"] = "1." + float_string_frame["I"] = "1" # add in some items that will be nan length = len(float_string_frame) - float_string_frame['J'] = '1.' - float_string_frame['K'] = '1' - float_string_frame.loc[0:5, ['J', 'K']] = 'garbled' + float_string_frame["J"] = "1." 
+ float_string_frame["K"] = "1" + float_string_frame.loc[0:5, ["J", "K"]] = "garbled" converted = float_string_frame._convert(datetime=True, numeric=True) - assert converted['H'].dtype == 'float64' - assert converted['I'].dtype == 'int64' - assert converted['J'].dtype == 'float64' - assert converted['K'].dtype == 'float64' - assert len(converted['J'].dropna()) == length - 5 - assert len(converted['K'].dropna()) == length - 5 + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" + assert converted["J"].dtype == "float64" + assert converted["K"].dtype == "float64" + assert len(converted["J"].dropna()) == length - 5 + assert len(converted["K"].dropna()) == length - 5 # via astype converted = float_string_frame.copy() - converted['H'] = converted['H'].astype('float64') - converted['I'] = converted['I'].astype('int64') - assert converted['H'].dtype == 'float64' - assert converted['I'].dtype == 'int64' + converted["H"] = converted["H"].astype("float64") + converted["I"] = converted["I"].astype("int64") + assert converted["H"].dtype == "float64" + assert converted["I"].dtype == "int64" # via astype, but errors converted = float_string_frame.copy() - with pytest.raises(ValueError, match='invalid literal'): - converted['H'].astype('int32') + with pytest.raises(ValueError, match="invalid literal"): + converted["H"].astype("int32") # mixed in a single column - df = DataFrame(dict(s=Series([1, 'na', 3, 4]))) + df = DataFrame(dict(s=Series([1, "na", 3, 4]))) result = df._convert(datetime=True, numeric=True) expected = DataFrame(dict(s=Series([1, np.nan, 3, 4]))) assert_frame_equal(result, expected) def test_convert_objects_no_conversion(self): - mixed1 = DataFrame( - {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']}) + mixed1 = DataFrame({"a": [1, 2, 3], "b": [4.0, 5, 6], "c": ["x", "y", "z"]}) mixed2 = mixed1._convert(datetime=True) assert_frame_equal(mixed1, mixed2) def test_infer_objects(self): # GH 11221 - df = DataFrame({'a': ['a', 1, 2, 3], - 'b': ['b', 2.0, 3.0, 4.1], - 'c': ['c', datetime(2016, 1, 1), - datetime(2016, 1, 2), - datetime(2016, 1, 3)], - 'd': [1, 2, 3, 'd']}, - columns=['a', 'b', 'c', 'd']) + df = DataFrame( + { + "a": ["a", 1, 2, 3], + "b": ["b", 2.0, 3.0, 4.1], + "c": [ + "c", + datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3), + ], + "d": [1, 2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) df = df.iloc[1:].infer_objects() - assert df['a'].dtype == 'int64' - assert df['b'].dtype == 'float64' - assert df['c'].dtype == 'M8[ns]' - assert df['d'].dtype == 'object' - - expected = DataFrame({'a': [1, 2, 3], - 'b': [2.0, 3.0, 4.1], - 'c': [datetime(2016, 1, 1), - datetime(2016, 1, 2), - datetime(2016, 1, 3)], - 'd': [2, 3, 'd']}, - columns=['a', 'b', 'c', 'd']) + assert df["a"].dtype == "int64" + assert df["b"].dtype == "float64" + assert df["c"].dtype == "M8[ns]" + assert df["d"].dtype == "object" + + expected = DataFrame( + { + "a": [1, 2, 3], + "b": [2.0, 3.0, 4.1], + "c": [datetime(2016, 1, 1), datetime(2016, 1, 2), datetime(2016, 1, 3)], + "d": [2, 3, "d"], + }, + columns=["a", "b", "c", "d"], + ) # reconstruct frame to verify inference is same tm.assert_frame_equal(df.reset_index(drop=True), expected) def test_stale_cached_series_bug_473(self): # this is chained, but ok - with option_context('chained_assignment', None): - Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'), - columns=('e', 'f', 'g', 'h')) + with option_context("chained_assignment", None): + Y = DataFrame( + np.random.random((4, 4)), + 
index=("a", "b", "c", "d"), + columns=("e", "f", "g", "h"), + ) repr(Y) - Y['e'] = Y['e'].astype('object') - Y['g']['c'] = np.NaN + Y["e"] = Y["e"].astype("object") + Y["g"]["c"] = np.NaN repr(Y) result = Y.sum() # noqa - exp = Y['g'].sum() # noqa - assert pd.isna(Y['g']['c']) + exp = Y["g"].sum() # noqa + assert pd.isna(Y["g"]["c"]) def test_get_X_columns(self): # numeric and object columns - df = DataFrame({'a': [1, 2, 3], - 'b': [True, False, True], - 'c': ['foo', 'bar', 'baz'], - 'd': [None, None, None], - 'e': [3.14, 0.577, 2.773]}) + df = DataFrame( + { + "a": [1, 2, 3], + "b": [True, False, True], + "c": ["foo", "bar", "baz"], + "d": [None, None, None], + "e": [3.14, 0.577, 2.773], + } + ) - tm.assert_index_equal(df._get_numeric_data().columns, - pd.Index(['a', 'b', 'e'])) + tm.assert_index_equal(df._get_numeric_data().columns, pd.Index(["a", "b", "e"])) def test_strange_column_corruption_issue(self): # (wesm) Unclear how exactly this is related to internal matters @@ -591,10 +625,15 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 - df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd']}) - df['c'] = pd.array([1, 2, None, 3]) - df2 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': ['a', 'b', 'c', 'd'], - 'c': pd.array([1, 2, None, 3])}) - assert type(df['c']._data.blocks[0]) == ObjectBlock - assert type(df2['c']._data.blocks[0]) == ObjectBlock + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = pd.array([1, 2, None, 3]) + df2 = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "c": pd.array([1, 2, None, 3]), + } + ) + assert type(df["c"]._data.blocks[0]) == ObjectBlock + assert type(df2["c"]._data.blocks[0]) == ObjectBlock assert_frame_equal(df, df2) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index c1d057da91b8f..e38d214eadeb6 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -10,25 +10,27 @@ class TestDataFrameConcatCommon: - def test_concat_multiple_frames_dtypes(self): # GH 2759 - A = DataFrame(data=np.ones((10, 2)), columns=[ - 'foo', 'bar'], dtype=np.float64) + A = DataFrame(data=np.ones((10, 2)), columns=["foo", "bar"], dtype=np.float64) B = DataFrame(data=np.ones((10, 2)), dtype=np.float32) results = pd.concat((A, B), axis=1).dtypes - expected = Series([np.dtype('float64')] * 2 + - [np.dtype('float32')] * 2, - index=['foo', 'bar', 0, 1]) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("float32")] * 2, + index=["foo", "bar", 0, 1], + ) assert_series_equal(results, expected) - @pytest.mark.parametrize('data', [ - pd.date_range('2000', periods=4), - pd.date_range('2000', periods=4, tz="US/Central"), - pd.period_range('2000', periods=4), - pd.timedelta_range(0, periods=4), - ]) + @pytest.mark.parametrize( + "data", + [ + pd.date_range("2000", periods=4), + pd.date_range("2000", periods=4, tz="US/Central"), + pd.period_range("2000", periods=4), + pd.timedelta_range(0, periods=4), + ], + ) def test_combine_datetlike_udf(self, data): # https://github.com/pandas-dev/pandas/issues/23079 df = pd.DataFrame({"A": data}) @@ -44,9 +46,9 @@ def combiner(a, b): def test_concat_multiple_tzs(self): # GH 12467 # combining datetime tz-aware and naive DataFrames - ts1 = Timestamp('2015-01-01', tz=None) - ts2 = Timestamp('2015-01-01', tz='UTC') - ts3 = Timestamp('2015-01-01', tz='EST') + ts1 = Timestamp("2015-01-01", tz=None) + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = 
Timestamp("2015-01-01", tz="EST") df1 = DataFrame(dict(time=[ts1])) df2 = DataFrame(dict(time=[ts2])) @@ -65,17 +67,23 @@ def test_concat_multiple_tzs(self): assert_frame_equal(results, expected) @pytest.mark.parametrize( - 't1', + "t1", [ - '2015-01-01', - pytest.param(pd.NaT, marks=pytest.mark.xfail( - reason='GH23037 incorrect dtype when concatenating'))]) + "2015-01-01", + pytest.param( + pd.NaT, + marks=pytest.mark.xfail( + reason="GH23037 incorrect dtype when concatenating" + ), + ), + ], + ) def test_concat_tz_NaT(self, t1): # GH 22796 # Concating tz-aware multicolumn DataFrames - ts1 = Timestamp(t1, tz='UTC') - ts2 = Timestamp('2015-01-01', tz='UTC') - ts3 = Timestamp('2015-01-01', tz='UTC') + ts1 = Timestamp(t1, tz="UTC") + ts2 = Timestamp("2015-01-01", tz="UTC") + ts3 = Timestamp("2015-01-01", tz="UTC") df1 = DataFrame([[ts1, ts2]]) df2 = DataFrame([[ts3]]) @@ -91,45 +99,53 @@ def test_concat_tz_not_aligned(self): a = pd.DataFrame({"A": ts}) b = pd.DataFrame({"A": ts, "B": ts}) result = pd.concat([a, b], sort=True, ignore_index=True) - expected = pd.DataFrame({"A": list(ts) + list(ts), - "B": [pd.NaT, pd.NaT] + list(ts)}) + expected = pd.DataFrame( + {"A": list(ts) + list(ts), "B": [pd.NaT, pd.NaT] + list(ts)} + ) assert_frame_equal(result, expected) def test_concat_tuple_keys(self): # GH 14438 - df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB')) - df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB')) - results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')]) + df1 = pd.DataFrame(np.ones((2, 2)), columns=list("AB")) + df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list("AB")) + results = pd.concat((df1, df2), keys=[("bee", "bah"), ("bee", "boo")]) expected = pd.DataFrame( - {'A': {('bee', 'bah', 0): 1.0, - ('bee', 'bah', 1): 1.0, - ('bee', 'boo', 0): 2.0, - ('bee', 'boo', 1): 2.0, - ('bee', 'boo', 2): 2.0}, - 'B': {('bee', 'bah', 0): 1.0, - ('bee', 'bah', 1): 1.0, - ('bee', 'boo', 0): 2.0, - ('bee', 'boo', 1): 2.0, - ('bee', 'boo', 2): 2.0}}) + { + "A": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + "B": { + ("bee", "bah", 0): 1.0, + ("bee", "bah", 1): 1.0, + ("bee", "boo", 0): 2.0, + ("bee", "boo", 1): 2.0, + ("bee", "boo", 2): 2.0, + }, + } + ) assert_frame_equal(results, expected) def test_append_series_dict(self): - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) series = df.loc[4] - msg = 'Indexes have overlapping values' + msg = "Indexes have overlapping values" with pytest.raises(ValueError, match=msg): df.append(series, verify_integrity=True) series.name = None - msg = 'Can only append a Series if ignore_index=True' + msg = "Can only append a Series if ignore_index=True" with pytest.raises(TypeError, match=msg): df.append(series, verify_integrity=True) result = df.append(series[::-1], ignore_index=True) - expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T, - ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True + ) assert_frame_equal(result, expected) # dict @@ -137,8 +153,9 @@ def test_append_series_dict(self): assert_frame_equal(result, expected) result = df.append(series[::-1][:3], ignore_index=True) - expected = df.append(DataFrame({0: series[::-1][:3]}).T, - ignore_index=True, sort=True) + expected = df.append( + DataFrame({0: series[::-1][:3]}).T, ignore_index=True, 
sort=True + ) assert_frame_equal(result, expected.loc[:, result.columns]) # can append when name set @@ -149,8 +166,7 @@ def test_append_series_dict(self): assert_frame_equal(result, expected) def test_append_list_of_series_dicts(self): - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) dicts = [x.to_dict() for idx, x in df.iterrows()] @@ -159,8 +175,10 @@ def test_append_list_of_series_dicts(self): assert_frame_equal(result, expected) # different columns - dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, - {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + dicts = [ + {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, + {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, + ] result = df.append(dicts, ignore_index=True, sort=True) expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) assert_frame_equal(result, expected) @@ -170,10 +188,9 @@ def test_append_missing_cols(self): # exercise the conditional branch in append method where the data # to be appended is a list and does not contain all columns that are in # the target DataFrame - df = DataFrame(np.random.randn(5, 4), - columns=['foo', 'bar', 'baz', 'qux']) + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - dicts = [{'foo': 9}, {'bar': 10}] + dicts = [{"foo": 9}, {"bar": 10}] with tm.assert_produces_warning(None): result = df.append(dicts, ignore_index=True, sort=True) @@ -197,14 +214,14 @@ def test_append_empty_dataframe(self): assert_frame_equal(result, expected) # Empty df with columns append empty df - df1 = DataFrame(columns=['bar', 'foo']) + df1 = DataFrame(columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) # Non-Empty df with columns append empty df - df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo']) + df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() @@ -216,152 +233,151 @@ def test_append_dtypes(self): # row appends of different dtypes (so need to do by-item) # can sometimes infer the correct type - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(5)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) df2 = DataFrame() result = df1.append(df2) expected = df1.copy() assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': 'foo'}, index=range(1, 2)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) result = df1.append(df2) - expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']}) + expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': np.nan}, index=range(1, 2)) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')}) + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': np.nan}, index=range(1, 2), dtype=object) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = 
DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) result = df1.append(df2) expected = DataFrame( - {'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')}) + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': np.nan}, index=range(1)) - df2 = DataFrame({'bar': Timestamp('20130101')}, index=range(1, 2)) + df1 = DataFrame({"bar": np.nan}, index=range(1)) + df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) result = df1.append(df2) expected = DataFrame( - {'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')}) + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + ) assert_frame_equal(result, expected) - df1 = DataFrame({'bar': Timestamp('20130101')}, index=range(1)) - df2 = DataFrame({'bar': 1}, index=range(1, 2), dtype=object) + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) result = df1.append(df2) - expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])}) + expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) assert_frame_equal(result, expected) def test_update(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) df.update(other) - expected = DataFrame([[1.5, np.nan, 3], - [3.6, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) assert_frame_equal(df, expected) def test_update_dtypes(self): # gh 3016 - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) df.update(other) - expected = DataFrame([[45., 45., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + expected = DataFrame( + [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) assert_frame_equal(df, expected) def test_update_nooverwrite(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) df.update(other, overwrite=False) - expected = DataFrame([[1.5, np.nan, 3], - [1.5, 2, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 3.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 3.0]] + ) assert_frame_equal(df, expected) def test_update_filtered(self): - df = DataFrame([[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[3.6, 2., np.nan], - [np.nan, np.nan, 7]], index=[1, 3]) + other = DataFrame([[3.6, 2.0, np.nan], [np.nan, np.nan, 7]], index=[1, 3]) 
df.update(other, filter_func=lambda x: x > 2) - expected = DataFrame([[1.5, np.nan, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = DataFrame( + [[1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.0]] + ) assert_frame_equal(df, expected) - @pytest.mark.parametrize('bad_kwarg, exception, msg', [ - # errors must be 'ignore' or 'raise' - ({'errors': 'something'}, ValueError, 'The parameter errors must.*'), - ({'join': 'inner'}, NotImplementedError, 'Only left join is supported') - ]) + @pytest.mark.parametrize( + "bad_kwarg, exception, msg", + [ + # errors must be 'ignore' or 'raise' + ({"errors": "something"}, ValueError, "The parameter errors must.*"), + ({"join": "inner"}, NotImplementedError, "Only left join is supported"), + ], + ) def test_update_raise_bad_parameter(self, bad_kwarg, exception, msg): - df = DataFrame([[1.5, 1, 3.]]) + df = DataFrame([[1.5, 1, 3.0]]) with pytest.raises(exception, match=msg): df.update(df, **bad_kwarg) def test_update_raise_on_overlap(self): - df = DataFrame([[1.5, 1, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3], - [1.5, np.nan, 3]]) + df = DataFrame( + [[1.5, 1, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) - other = DataFrame([[2., np.nan], - [np.nan, 7]], index=[1, 3], columns=[1, 2]) + other = DataFrame([[2.0, np.nan], [np.nan, 7]], index=[1, 3], columns=[1, 2]) with pytest.raises(ValueError, match="Data overlaps"): - df.update(other, errors='raise') + df.update(other, errors="raise") - @pytest.mark.parametrize('raise_conflict', [True, False]) + @pytest.mark.parametrize("raise_conflict", [True, False]) def test_update_deprecation(self, raise_conflict): - df = DataFrame([[1.5, 1, 3.]]) + df = DataFrame([[1.5, 1, 3.0]]) other = DataFrame() with tm.assert_produces_warning(FutureWarning): df.update(other, raise_conflict=raise_conflict) def test_update_from_non_df(self): - d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])} + d = {"a": Series([1, 2, 3, 4]), "b": Series([5, 6, 7, 8])} df = DataFrame(d) - d['a'] = Series([5, 6, 7, 8]) + d["a"] = Series([5, 6, 7, 8]) df.update(d) expected = DataFrame(d) assert_frame_equal(df, expected) - d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]} + d = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} df = DataFrame(d) - d['a'] = [5, 6, 7, 8] + d["a"] = [5, 6, 7, 8] df.update(d) expected = DataFrame(d) @@ -370,93 +386,118 @@ def test_update_from_non_df(self): def test_update_datetime_tz(self): # GH 25807 - result = DataFrame([pd.Timestamp('2019', tz='UTC')]) + result = DataFrame([pd.Timestamp("2019", tz="UTC")]) result.update(result) - expected = DataFrame([pd.Timestamp('2019', tz='UTC')]) + expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) assert_frame_equal(result, expected) def test_join_str_datetime(self): - str_dates = ['20120209', '20120222'] + str_dates = ["20120209", "20120222"] dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] - A = DataFrame(str_dates, index=range(2), columns=['aa']) + A = DataFrame(str_dates, index=range(2), columns=["aa"]) C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates) - tst = A.join(C, on='aa') + tst = A.join(C, on="aa") assert len(tst.columns) == 3 def test_join_multiindex_leftright(self): # GH 10741 - df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908], - ['a', 'z', 0.563634], ['b', 'x', -0.353756], - ['b', 'y', 0.368062], ['b', 'z', -1.721840], - ['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]], - columns=['first', 'second', 'value1']) - .set_index(['first', 'second'])) - - df2 = 
(pd.DataFrame([['a', 10], ['b', 20]], - columns=['first', 'value2']) - .set_index(['first'])) - - exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], - [-0.353756, 20], [0.368062, 20], - [-1.721840, 20], - [1.000000, np.nan], [2.000000, np.nan], - [3.000000, np.nan]], - index=df1.index, columns=['value1', 'value2']) + df1 = pd.DataFrame( + [ + ["a", "x", 0.471780], + ["a", "y", 0.774908], + ["a", "z", 0.563634], + ["b", "x", -0.353756], + ["b", "y", 0.368062], + ["b", "z", -1.721840], + ["c", "x", 1], + ["c", "y", 2], + ["c", "z", 3], + ], + columns=["first", "second", "value1"], + ).set_index(["first", "second"]) + + df2 = pd.DataFrame( + [["a", 10], ["b", 20]], columns=["first", "value2"] + ).set_index(["first"]) + + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + [1.000000, np.nan], + [2.000000, np.nan], + [3.000000, np.nan], + ], + index=df1.index, + columns=["value1", "value2"], + ) # these must be the same results (but columns are flipped) - assert_frame_equal(df1.join(df2, how='left'), exp) - assert_frame_equal(df2.join(df1, how='right'), - exp[['value2', 'value1']]) - - exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']], - names=['first', 'second']) - exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10], - [-0.353756, 20], [0.368062, 20], [-1.721840, 20]], - index=exp_idx, columns=['value1', 'value2']) - - assert_frame_equal(df1.join(df2, how='right'), exp) - assert_frame_equal(df2.join(df1, how='left'), - exp[['value2', 'value1']]) + assert_frame_equal(df1.join(df2, how="left"), exp) + assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]]) + + exp_idx = pd.MultiIndex.from_product( + [["a", "b"], ["x", "y", "z"]], names=["first", "second"] + ) + exp = pd.DataFrame( + [ + [0.471780, 10], + [0.774908, 10], + [0.563634, 10], + [-0.353756, 20], + [0.368062, 20], + [-1.721840, 20], + ], + index=exp_idx, + columns=["value1", "value2"], + ) + + assert_frame_equal(df1.join(df2, how="right"), exp) + assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]]) def test_concat_named_keys(self): # GH 14252 - df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]}) - index = Index(['a', 'b'], name='baz') + df = pd.DataFrame({"foo": [1, 2], "bar": [0.1, 0.2]}) + index = Index(["a", "b"], name="baz") concatted_named_from_keys = pd.concat([df, df], keys=index) expected_named = pd.DataFrame( - {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]), - names=['baz', None])) + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=["baz", None]), + ) assert_frame_equal(concatted_named_from_keys, expected_named) - index_no_name = Index(['a', 'b'], name=None) + index_no_name = Index(["a", "b"], name=None) concatted_named_from_names = pd.concat( - [df, df], keys=index_no_name, names=['baz']) + [df, df], keys=index_no_name, names=["baz"] + ) assert_frame_equal(concatted_named_from_names, expected_named) concatted_unnamed = pd.concat([df, df], keys=index_no_name) expected_unnamed = pd.DataFrame( - {'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]}, - index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]), - names=[None, None])) + {"foo": [1, 2, 1, 2], "bar": [0.1, 0.2, 0.1, 0.2]}, + index=pd.MultiIndex.from_product((["a", "b"], [0, 1]), names=[None, None]), + ) assert_frame_equal(concatted_unnamed, expected_unnamed) def 
test_concat_axis_parameter(self): # GH 14369 - df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2)) - df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2)) + df1 = pd.DataFrame({"A": [0.1, 0.2]}, index=range(2)) + df2 = pd.DataFrame({"A": [0.3, 0.4]}, index=range(2)) # Index/row/0 DataFrame - expected_index = pd.DataFrame( - {'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) + expected_index = pd.DataFrame({"A": [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1]) - concatted_index = pd.concat([df1, df2], axis='index') + concatted_index = pd.concat([df1, df2], axis="index") assert_frame_equal(concatted_index, expected_index) - concatted_row = pd.concat([df1, df2], axis='rows') + concatted_row = pd.concat([df1, df2], axis="rows") assert_frame_equal(concatted_row, expected_index) concatted_0 = pd.concat([df1, df2], axis=0) @@ -464,9 +505,10 @@ def test_concat_axis_parameter(self): # Columns/1 DataFrame expected_columns = pd.DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A']) + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=["A", "A"] + ) - concatted_columns = pd.concat([df1, df2], axis='columns') + concatted_columns = pd.concat([df1, df2], axis="columns") assert_frame_equal(concatted_columns, expected_columns) concatted_1 = pd.concat([df1, df2], axis=1) @@ -476,13 +518,12 @@ def test_concat_axis_parameter(self): series2 = pd.Series([0.3, 0.4]) # Index/row/0 Series - expected_index_series = pd.Series( - [0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) + expected_index_series = pd.Series([0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1]) - concatted_index_series = pd.concat([series1, series2], axis='index') + concatted_index_series = pd.concat([series1, series2], axis="index") assert_series_equal(concatted_index_series, expected_index_series) - concatted_row_series = pd.concat([series1, series2], axis='rows') + concatted_row_series = pd.concat([series1, series2], axis="rows") assert_series_equal(concatted_row_series, expected_index_series) concatted_0_series = pd.concat([series1, series2], axis=0) @@ -490,61 +531,65 @@ def test_concat_axis_parameter(self): # Columns/1 Series expected_columns_series = pd.DataFrame( - [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1]) + [[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1] + ) - concatted_columns_series = pd.concat( - [series1, series2], axis='columns') + concatted_columns_series = pd.concat([series1, series2], axis="columns") assert_frame_equal(concatted_columns_series, expected_columns_series) concatted_1_series = pd.concat([series1, series2], axis=1) assert_frame_equal(concatted_1_series, expected_columns_series) # Testing ValueError - with pytest.raises(ValueError, match='No axis named'): - pd.concat([series1, series2], axis='something') + with pytest.raises(ValueError, match="No axis named"): + pd.concat([series1, series2], axis="something") def test_concat_numerical_names(self): # #15262 # #12223 - df = pd.DataFrame({'col': range(9)}, - dtype='int32', - index=(pd.MultiIndex - .from_product([['A0', 'A1', 'A2'], - ['B0', 'B1', 'B2']], - names=[1, 2]))) + df = pd.DataFrame( + {"col": range(9)}, + dtype="int32", + index=( + pd.MultiIndex.from_product( + [["A0", "A1", "A2"], ["B0", "B1", "B2"]], names=[1, 2] + ) + ), + ) result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :])) - expected = pd.DataFrame({'col': [0, 1, 7, 8]}, - dtype='int32', - index=pd.MultiIndex.from_tuples([('A0', 'B0'), - ('A0', 'B1'), - ('A2', 'B1'), - ('A2', 'B2')], - names=[1, 2])) + expected = pd.DataFrame( + {"col": [0, 1, 7, 8]}, + dtype="int32", + 
index=pd.MultiIndex.from_tuples( + [("A0", "B0"), ("A0", "B1"), ("A2", "B1"), ("A2", "B2")], names=[1, 2] + ), + ) tm.assert_frame_equal(result, expected) def test_concat_astype_dup_col(self): # gh 23049 - df = pd.DataFrame([{'a': 'b'}]) + df = pd.DataFrame([{"a": "b"}]) df = pd.concat([df, df], axis=1) - result = df.astype('category') - expected = pd.DataFrame(np.array(["b", "b"]).reshape(1, 2), - columns=["a", "a"]).astype("category") + result = df.astype("category") + expected = pd.DataFrame( + np.array(["b", "b"]).reshape(1, 2), columns=["a", "a"] + ).astype("category") tm.assert_frame_equal(result, expected) class TestDataFrameCombineFirst: - def test_combine_first_mixed(self): - a = Series(['a', 'b'], index=range(2)) + a = Series(["a", "b"], index=range(2)) b = Series(range(2), index=range(2)) - f = DataFrame({'A': a, 'B': b}) + f = DataFrame({"A": a, "B": b}) - a = Series(['a', 'b'], index=range(5, 7)) + a = Series(["a", "b"], index=range(5, 7)) b = Series(range(2), index=range(5, 7)) - g = DataFrame({'A': a, 'B': b}) + g = DataFrame({"A": a, "B": b}) - exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]}, - index=[0, 1, 5, 6]) + exp = pd.DataFrame( + {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] + ) combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) @@ -556,42 +601,42 @@ def test_combine_first(self, float_frame): reordered_frame = float_frame.reindex(combined.index) assert_frame_equal(combined, reordered_frame) assert tm.equalContents(combined.columns, float_frame.columns) - assert_series_equal(combined['A'], reordered_frame['A']) + assert_series_equal(combined["A"], reordered_frame["A"]) # same index fcopy = float_frame.copy() - fcopy['A'] = 1 - del fcopy['C'] + fcopy["A"] = 1 + del fcopy["C"] fcopy2 = float_frame.copy() - fcopy2['B'] = 0 - del fcopy2['D'] + fcopy2["B"] = 0 + del fcopy2["D"] combined = fcopy.combine_first(fcopy2) - assert (combined['A'] == 1).all() - assert_series_equal(combined['B'], fcopy['B']) - assert_series_equal(combined['C'], fcopy2['C']) - assert_series_equal(combined['D'], fcopy['D']) + assert (combined["A"] == 1).all() + assert_series_equal(combined["B"], fcopy["B"]) + assert_series_equal(combined["C"], fcopy2["C"]) + assert_series_equal(combined["D"], fcopy["D"]) # overlap head, tail = reordered_frame[:10].copy(), reordered_frame - head['A'] = 1 + head["A"] = 1 combined = head.combine_first(tail) - assert (combined['A'][:10] == 1).all() + assert (combined["A"][:10] == 1).all() # reverse overlap - tail['A'][:10] = 0 + tail["A"][:10] = 0 combined = tail.combine_first(head) - assert (combined['A'][:10] == 0).all() + assert (combined["A"][:10] == 0).all() # no overlap f = float_frame[:10] g = float_frame[10:] combined = f.combine_first(g) - assert_series_equal(combined['A'].reindex(f.index), f['A']) - assert_series_equal(combined['A'].reindex(g.index), g['A']) + assert_series_equal(combined["A"].reindex(f.index), f["A"]) + assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases comb = float_frame.combine_first(DataFrame()) @@ -604,72 +649,74 @@ def test_combine_first(self, float_frame): assert "faz" in comb.index # #2525 - df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame(columns=['b']) + df = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame(columns=["b"]) result = df.combine_first(df2) - assert 'b' in result + assert "b" in result def test_combine_first_mixed_bug(self): - idx = Index(['a', 'b', 'c', 'e']) - ser1 = Series([5.0, -9.0, 4.0, 100.], 
index=idx) - ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + idx = Index(["a", "b", "c", "e"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "e"], index=idx) ser3 = Series([12, 4, 5, 97], index=idx) - frame1 = DataFrame({"col0": ser1, - "col2": ser2, - "col3": ser3}) + frame1 = DataFrame({"col0": ser1, "col2": ser2, "col3": ser3}) - idx = Index(['a', 'b', 'c', 'f']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + idx = Index(["a", "b", "c", "f"]) + ser1 = Series([5.0, -9.0, 4.0, 100.0], index=idx) + ser2 = Series(["a", "b", "c", "f"], index=idx) ser3 = Series([12, 4, 5, 97], index=idx) - frame2 = DataFrame({"col1": ser1, - "col2": ser2, - "col5": ser3}) + frame2 = DataFrame({"col1": ser1, "col2": ser2, "col5": ser3}) combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 # gh 3016 (same as in update) - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) + df = DataFrame( + [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], + columns=["A", "B", "bool1", "bool2"], + ) - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) result = df.combine_first(other) assert_frame_equal(result, df) - df.loc[0, 'A'] = np.nan + df.loc[0, "A"] = np.nan result = df.combine_first(other) - df.loc[0, 'A'] = 45 + df.loc[0, "A"] = 45 assert_frame_equal(result, df) # doc example - df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) + df1 = DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) - df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + df2 = DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) result = df1.combine_first(df2) - expected = DataFrame( - {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) + expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) assert_frame_equal(result, expected) # GH3552, return object dtype with bools df1 = DataFrame( - [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) - df2 = DataFrame( - [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) + [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] + ) + df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) result = df1.combine_first(df2)[2] expected = Series([True, True, False], name=2) assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) df1 = DataFrame({"a": [None, None, None]}) df2 = df1.combine_first(df0) assert_frame_equal(df2, df0) @@ -677,9 +724,9 @@ def test_combine_first_mixed_bug(self): df2 = df0.combine_first(df1) assert_frame_equal(df2, df0) - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) + df0 = DataFrame( + {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} + ) df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) df2 = df1.combine_first(df0) result = df0.copy() @@ -691,164 +738,182 @@ def test_combine_first_mixed_bug(self): def test_combine_first_align_nan(self): # GH 7509 (not fixed) - dfa = 
pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]], - columns=['a', 'b']) - dfb = pd.DataFrame([[4], [5]], columns=['b']) - assert dfa['a'].dtype == 'datetime64[ns]' - assert dfa['b'].dtype == 'int64' + dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = pd.DataFrame([[4], [5]], columns=["b"]) + assert dfa["a"].dtype == "datetime64[ns]" + assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) - exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT], - 'b': [2., 5.]}, columns=['a', 'b']) + exp = pd.DataFrame( + {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, + columns=["a", "b"], + ) tm.assert_frame_equal(res, exp) - assert res['a'].dtype == 'datetime64[ns]' + assert res["a"].dtype == "datetime64[ns]" # ToDo: this must be int64 - assert res['b'].dtype == 'float64' + assert res["b"].dtype == "float64" res = dfa.iloc[:0].combine_first(dfb) - exp = pd.DataFrame({'a': [np.nan, np.nan], - 'b': [4, 5]}, columns=['a', 'b']) + exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # ToDo: this must be datetime64 - assert res['a'].dtype == 'float64' + assert res["a"].dtype == "float64" # ToDo: this must be int64 - assert res['b'].dtype == 'int64' + assert res["b"].dtype == "int64" def test_combine_first_timezone(self): # see gh-7630 - data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') - df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], - data=data1, - index=pd.date_range('20140627', periods=1)) - data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') - df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], - data=data2, - index=pd.date_range('20140628', periods=1)) - res = df2[['UTCdatetime']].combine_first(df1) - exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01', - tz='UTC'), - pd.Timestamp('2012-12-12 12:12', - tz='UTC')], - 'abc': [pd.Timestamp('2010-01-01 01:01:00', - tz='UTC'), pd.NaT]}, - columns=['UTCdatetime', 'abc'], - index=pd.date_range('20140627', periods=2, - freq='D')) + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + df1 = pd.DataFrame( + columns=["UTCdatetime", "abc"], + data=data1, + index=pd.date_range("20140627", periods=1), + ) + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + df2 = pd.DataFrame( + columns=["UTCdatetime", "xyz"], + data=data2, + index=pd.date_range("20140628", periods=1), + ) + res = df2[["UTCdatetime"]].combine_first(df1) + exp = pd.DataFrame( + { + "UTCdatetime": [ + pd.Timestamp("2010-01-01 01:01", tz="UTC"), + pd.Timestamp("2012-12-12 12:12", tz="UTC"), + ], + "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT], + }, + columns=["UTCdatetime", "abc"], + index=pd.date_range("20140627", periods=2, freq="D"), + ) tm.assert_frame_equal(res, exp) - assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]' - assert res['abc'].dtype == 'datetime64[ns, UTC]' + assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" + assert res["abc"].dtype == "datetime64[ns, UTC]" # see gh-10567 - dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['DATE'].dtype == 'datetime64[ns, UTC]' + assert 
res["DATE"].dtype == "datetime64[ns, UTC]" - dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', - '2011-01-04'], tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) - dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', - '2012-01-03'], tz='US/Eastern') - df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) + dts1 = pd.DatetimeIndex( + ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" + ) + df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex( + ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" + ) + df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT', - '2012-01-02', '2011-01-03', '2011-01-04'], - tz='US/Eastern') - exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.DatetimeIndex( + [ + "2011-01-01", + "2012-01-01", + "NaT", + "2012-01-02", + "2011-01-03", + "2011-01-04", + ], + tz="US/Eastern", + ) + exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz - dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-03', '2015-01-05') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-03", "2015-01-05") + df2 = pd.DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]' + assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" - dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') - df1 = pd.DataFrame({'DATE': dts1}) - dts2 = pd.date_range('2015-01-01', '2015-01-03') - df2 = pd.DataFrame({'DATE': dts2}) + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + df1 = pd.DataFrame({"DATE": dts1}) + dts2 = pd.date_range("2015-01-01", "2015-01-03") + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) - exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'), - pd.Timestamp('2015-01-02', tz='US/Eastern'), - pd.Timestamp('2015-01-03')] - exp = pd.DataFrame({'DATE': exp_dts}) + exp_dts = [ + pd.Timestamp("2015-01-01", tz="US/Eastern"), + pd.Timestamp("2015-01-02", tz="US/Eastern"), + pd.Timestamp("2015-01-03"), + ] + exp = pd.DataFrame({"DATE": exp_dts}) tm.assert_frame_equal(res, exp) - assert res['DATE'].dtype == 'object' + assert res["DATE"].dtype == "object" def test_combine_first_timedelta(self): - data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day']) - df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7]) - data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day']) - df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5]) + data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) + df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) + df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT', - '11 day', '3 day', '4 day']) - exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.TimedeltaIndex( + ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] + ) + exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['TD'].dtype == 'timedelta64[ns]' + assert 
res["TD"].dtype == "timedelta64[ns]" def test_combine_first_period(self): - data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03', - '2011-04'], freq='M') - df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7]) - data2 = pd.PeriodIndex(['2012-01-01', '2012-02', - '2012-03'], freq='M') - df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5]) + data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") + df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") + df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT', - '2012-02', '2011-03', '2011-04'], - freq='M') - exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = pd.PeriodIndex( + ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" + ) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['P'].dtype == data1.dtype + assert res["P"].dtype == data1.dtype # different freq - dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02', - '2012-01-03'], freq='D') - df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5]) + dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") + df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) - exp_dts = [pd.Period('2011-01', freq='M'), - pd.Period('2012-01-01', freq='D'), - pd.NaT, - pd.Period('2012-01-02', freq='D'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')] - exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp_dts = [ + pd.Period("2011-01", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.NaT, + pd.Period("2012-01-02", freq="D"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ] + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) - assert res['P'].dtype == 'object' + assert res["P"].dtype == "object" def test_combine_first_int(self): # GH14687 - integer series that do no align exactly - df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64') - df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64') + df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res['a'].dtype == 'int64' + assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 - df1 = pd.DataFrame({'isNum': [val]}) - df2 = pd.DataFrame({'isBool': [True]}) + df1 = pd.DataFrame({"isNum": [val]}) + df2 = pd.DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = pd.DataFrame({'isBool': [True], 'isNum': [val]}) + exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) @@ -856,35 +921,32 @@ def test_concat_datetime_datetime64_frame(self): # #2624 rows = [] rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), 'hi']) + rows.append([datetime(2010, 1, 2), "hi"]) - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) ind = date_range(start="2000/1/1", freq="D", periods=10) - df1 = DataFrame({'date': ind, 'test': range(10)}) + df1 = DataFrame({"date": ind, "test": range(10)}) # it works! 
pd.concat([df1, df2_obj]) class TestDataFrameUpdate: - def test_update_nan(self): # #15593 #15617 # test 1 - df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) - df2 = DataFrame({'A': [None, 2, 3]}) + df1 = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) expected = df1.copy() df1.update(df2, overwrite=False) tm.assert_frame_equal(df1, expected) # test 2 - df1 = DataFrame({'A': [1.0, None, 3], - 'B': date_range('2000', periods=3)}) - df2 = DataFrame({'A': [None, 2, 3]}) - expected = DataFrame({'A': [1.0, 2, 3], - 'B': date_range('2000', periods=3)}) + df1 = DataFrame({"A": [1.0, None, 3], "B": date_range("2000", periods=3)}) + df2 = DataFrame({"A": [None, 2, 3]}) + expected = DataFrame({"A": [1.0, 2, 3], "B": date_range("2000", periods=3)}) df1.update(df2, overwrite=False) tm.assert_frame_equal(df1, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 73a8720adb5cc..a16ca7045cfdd 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,32 +15,51 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, MultiIndex, RangeIndex, Series, Timedelta, - Timestamp, date_range, isna) + Categorical, + DataFrame, + Index, + MultiIndex, + RangeIndex, + Series, + Timedelta, + Timestamp, + date_range, + isna, +) import pandas.util.testing as tm -MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] -MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', - 'int32', 'int64'] +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] class TestDataFrameConstructors: - - @pytest.mark.parametrize('constructor', [ - lambda: DataFrame(), - lambda: DataFrame(None), - lambda: DataFrame({}), - lambda: DataFrame(()), - lambda: DataFrame([]), - lambda: DataFrame((x for x in [])), - lambda: DataFrame(range(0)), - lambda: DataFrame(data=None), - lambda: DataFrame(data={}), - lambda: DataFrame(data=()), - lambda: DataFrame(data=[]), - lambda: DataFrame(data=(x for x in [])), - lambda: DataFrame(data=range(0)), - ]) + @pytest.mark.parametrize( + "constructor", + [ + lambda: DataFrame(), + lambda: DataFrame(None), + lambda: DataFrame({}), + lambda: DataFrame(()), + lambda: DataFrame([]), + lambda: DataFrame((x for x in [])), + lambda: DataFrame(range(0)), + lambda: DataFrame(data=None), + lambda: DataFrame(data={}), + lambda: DataFrame(data=()), + lambda: DataFrame(data=[]), + lambda: DataFrame(data=(x for x in [])), + lambda: DataFrame(data=range(0)), + ], + ) def test_empty_constructor(self, constructor): expected = DataFrame() result = constructor() @@ -48,13 +67,15 @@ def test_empty_constructor(self, constructor): assert len(result.columns) == 0 tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('emptylike,expected_index,expected_columns', [ - ([[]], RangeIndex(1), RangeIndex(0)), - ([[], []], RangeIndex(2), RangeIndex(0)), - ([(x for x in [])], RangeIndex(1), RangeIndex(0)) - ]) - def test_emptylike_constructor( - self, emptylike, expected_index, expected_columns): + @pytest.mark.parametrize( + "emptylike,expected_index,expected_columns", + [ + ([[]], RangeIndex(1), RangeIndex(0)), + ([[], []], RangeIndex(2), RangeIndex(0)), + ([(x for x in [])], RangeIndex(1), RangeIndex(0)), + ], + ) + def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): 
expected = DataFrame(index=expected_index, columns=expected_columns) result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) @@ -66,36 +87,33 @@ def test_constructor_mixed(self, float_string_frame): indexed_frame = DataFrame(data, index=index) # noqa unindexed_frame = DataFrame(data) # noqa - assert float_string_frame['foo'].dtype == np.object_ + assert float_string_frame["foo"].dtype == np.object_ def test_constructor_cast_failure(self): - foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) - assert foo['a'].dtype == object + foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) + assert foo["a"].dtype == object # GH 3010, constructing with odd arrays df = DataFrame(np.ones((4, 2))) # this is ok - df['foo'] = np.ones((4, 2)).tolist() + df["foo"] = np.ones((4, 2)).tolist() # this is not ok msg = "Wrong number of items passed 2, placement implies 1" with pytest.raises(ValueError, match=msg): - df['test'] = np.ones((4, 2)) + df["test"] = np.ones((4, 2)) # this is ok - df['foo2'] = np.ones((4, 2)).tolist() + df["foo2"] = np.ones((4, 2)).tolist() def test_constructor_dtype_copy(self): - orig_df = DataFrame({ - 'col1': [1.], - 'col2': [2.], - 'col3': [3.]}) + orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]}) new_df = pd.DataFrame(orig_df, dtype=float, copy=True) - new_df['col1'] = 200. - assert orig_df['col1'][0] == 1. + new_df["col1"] = 200.0 + assert orig_df["col1"][0] == 1.0 def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) @@ -108,10 +126,9 @@ def test_constructor_dtype_nocast_view(self): assert df.values[0, 0] == 97 def test_constructor_dtype_list_data(self): - df = DataFrame([[1, '2'], - [None, 'a']], dtype=object) + df = DataFrame([[1, "2"], [None, "a"]], dtype=object) assert df.loc[1, 0] is None - assert df.loc[0, 1] == '2' + assert df.loc[0, 1] == "2" def test_constructor_list_frames(self): # see gh-3243 @@ -122,20 +139,19 @@ def test_constructor_list_frames(self): assert isinstance(result.iloc[0, 0], DataFrame) def test_constructor_mixed_dtypes(self): - def _make_mixed_dtypes_df(typ, ad=None): - if typ == 'int': + if typ == "int": dtypes = MIXED_INT_DTYPES - arrays = [np.array(np.random.rand(10), dtype=d) - for d in dtypes] - elif typ == 'float': + arrays = [np.array(np.random.rand(10), dtype=d) for d in dtypes] + elif typ == "float": dtypes = MIXED_FLOAT_DTYPES - arrays = [np.array(np.random.randint( - 10, size=10), dtype=d) for d in dtypes] + arrays = [ + np.array(np.random.randint(10, size=10), dtype=d) for d in dtypes + ] for d, a in zip(dtypes, arrays): - assert(a.dtype == d) + assert a.dtype == d if ad is None: ad = dict() ad.update({d: a for d, a in zip(dtypes, arrays)}) @@ -146,18 +162,18 @@ def _check_mixed_dtypes(df, dtypes=None): dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES for d in dtypes: if d in df: - assert(df.dtypes[d] == d) + assert df.dtypes[d] == d # mixed floating and integer coexist in the same frame - df = _make_mixed_dtypes_df('float') + df = _make_mixed_dtypes_df("float") _check_mixed_dtypes(df) # add lots of types - df = _make_mixed_dtypes_df('float', dict(A=1, B='foo', C='bar')) + df = _make_mixed_dtypes_df("float", dict(A=1, B="foo", C="bar")) _check_mixed_dtypes(df) # GH 622 - df = _make_mixed_dtypes_df('int') + df = _make_mixed_dtypes_df("int") _check_mixed_dtypes(df) def test_constructor_complex_dtypes(self): @@ -165,19 +181,19 @@ def test_constructor_complex_dtypes(self): a = np.random.rand(10).astype(np.complex64) b = np.random.rand(10).astype(np.complex128) - df = DataFrame({'a': a, 'b': 
b}) + df = DataFrame({"a": a, "b": b}) assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - df = DataFrame({'A': ['x', None]}, dtype=string_dtype) + df = DataFrame({"A": ["x", None]}, dtype=string_dtype) result = df.isna() expected = DataFrame({"A": [False, True]}) tm.assert_frame_equal(result, expected) assert df.iloc[1, 0] is None - df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) + df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype) assert np.isnan(df.iloc[1, 0]) def test_constructor_rec(self, float_frame): @@ -194,37 +210,46 @@ def test_constructor_rec(self, float_frame): tm.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] - df3 = DataFrame(rec, index=rng, columns=['C', 'B']) - expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B']) + df3 = DataFrame(rec, index=rng, columns=["C", "B"]) + expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"]) tm.assert_frame_equal(df3, expected) def test_constructor_bool(self): - df = DataFrame({0: np.ones(10, dtype=bool), - 1: np.zeros(10, dtype=bool)}) + df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)}) assert df.values.dtype == np.bool_ def test_constructor_overflow_int64(self): # see gh-14881 - values = np.array([2 ** 64 - i for i in range(1, 10)], - dtype=np.uint64) + values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) - result = DataFrame({'a': values}) - assert result['a'].dtype == np.uint64 + result = DataFrame({"a": values}) + assert result["a"].dtype == np.uint64 # see gh-2355 - data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), - (8921811264899370420, 45), (17019687244989530680, 270), - (9930107427299601010, 273)] - dtype = [('uid', 'u8'), ('score', 'u8')] + data_scores = [ + (6311132704823138710, 273), + (2685045978526272070, 23), + (8921811264899370420, 45), + (17019687244989530680, 270), + (9930107427299601010, 273), + ] + dtype = [("uid", "u8"), ("score", "u8")] data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) - assert df_crawls['uid'].dtype == np.uint64 - - @pytest.mark.parametrize("values", [np.array([2**64], dtype=object), - np.array([2**65]), [2**64 + 1], - np.array([-2**63 - 4], dtype=object), - np.array([-2**64 - 1]), [-2**65 - 2]]) + assert df_crawls["uid"].dtype == np.uint64 + + @pytest.mark.parametrize( + "values", + [ + np.array([2 ** 64], dtype=object), + np.array([2 ** 65]), + [2 ** 64 + 1], + np.array([-2 ** 63 - 4], dtype=object), + np.array([-2 ** 64 - 1]), + [-2 ** 65 - 2], + ], + ) def test_constructor_int_overflow(self, values): # see gh-18584 value = values[0] @@ -235,10 +260,11 @@ def test_constructor_int_overflow(self, values): def test_constructor_ordereddict(self): import random + nitems = 100 nums = list(range(nitems)) random.shuffle(nums) - expected = ['A%d' % i for i in nums] + expected = ["A%d" % i for i in nums] df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems))) assert expected == list(df.columns) @@ -247,27 +273,29 @@ def test_constructor_dict(self): # test expects index shifted by 5 datetime_series_short = tm.makeTimeSeries(nper=30)[5:] - frame = DataFrame({'col1': datetime_series, - 'col2': datetime_series_short}) + frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short}) # col2 is padded with NaN assert len(datetime_series) == 30 assert len(datetime_series_short) == 25 - 
tm.assert_series_equal(frame['col1'], datetime_series.rename('col1')) + tm.assert_series_equal(frame["col1"], datetime_series.rename("col1")) - exp = pd.Series(np.concatenate([[np.nan] * 5, - datetime_series_short.values]), - index=datetime_series.index, name='col2') - tm.assert_series_equal(exp, frame['col2']) + exp = pd.Series( + np.concatenate([[np.nan] * 5, datetime_series_short.values]), + index=datetime_series.index, + name="col2", + ) + tm.assert_series_equal(exp, frame["col2"]) - frame = DataFrame({'col1': datetime_series, - 'col2': datetime_series_short}, - columns=['col2', 'col3', 'col4']) + frame = DataFrame( + {"col1": datetime_series, "col2": datetime_series_short}, + columns=["col2", "col3", "col4"], + ) assert len(frame) == len(datetime_series_short) - assert 'col1' not in frame - assert isna(frame['col3']).all() + assert "col1" not in frame + assert isna(frame["col3"]).all() # Corner cases assert len(DataFrame()) == 0 @@ -275,11 +303,11 @@ def test_constructor_dict(self): # mix dict and array, wrong size - no spec for which error should raise # first with pytest.raises(ValueError): - DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # Length-one dict micro-optimization - frame = DataFrame({'A': {'1': 1, '2': 2}}) - tm.assert_index_equal(frame.index, pd.Index(['1', '2'])) + frame = DataFrame({"A": {"1": 1, "2": 2}}) + tm.assert_index_equal(frame.index, pd.Index(["1", "2"])) # empty dict plus index idx = Index([0, 1, 2]) @@ -294,97 +322,95 @@ def test_constructor_dict(self): assert len(frame._series) == 3 # with dict of empty list and Series - frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) + frame = DataFrame({"A": [], "B": []}, columns=["A", "B"]) tm.assert_index_equal(frame.index, Index([], dtype=np.int64)) # GH 14381 # Dict with None value frame_none = DataFrame(dict(a=None), index=[0]) frame_none_list = DataFrame(dict(a=[None]), index=[0]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert frame_none.get_value(0, 'a') is None - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert frame_none_list.get_value(0, 'a') is None + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert frame_none.get_value(0, "a") is None + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert frame_none_list.get_value(0, "a") is None tm.assert_frame_equal(frame_none, frame_none_list) # GH10856 # dict with scalar values should raise error, even if columns passed - msg = 'If using all scalar values, you must pass an index' + msg = "If using all scalar values, you must pass an index" with pytest.raises(ValueError, match=msg): - DataFrame({'a': 0.7}) + DataFrame({"a": 0.7}) with pytest.raises(ValueError, match=msg): - DataFrame({'a': 0.7}, columns=['a']) + DataFrame({"a": 0.7}, columns=["a"]) - @pytest.mark.parametrize("scalar", [2, np.nan, None, 'D']) + @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"]) def test_constructor_invalid_items_unused(self, scalar): # No error if invalid (scalar) value is in fact not used: - result = DataFrame({'a': scalar}, columns=['b']) - expected = DataFrame(columns=['b']) + result = DataFrame({"a": scalar}, columns=["b"]) + expected = DataFrame(columns=["b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def 
test_constructor_dict_nan_key(self, value): # GH 18455 cols = [1, value, 3] - idx = ['a', value] + idx = ["a", value] values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} - result = DataFrame(data).sort_values(1).sort_values('a', axis=1) - expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=idx, columns=cols) + result = DataFrame(data).sort_values(1).sort_values("a", axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) tm.assert_frame_equal(result, expected) - result = DataFrame(data, index=idx).sort_values('a', axis=1) + result = DataFrame(data, index=idx).sort_values("a", axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("value", [np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [np.nan, None, float("nan")]) def test_constructor_dict_nan_tuple_key(self, value): # GH 18455 cols = Index([(11, 21), (value, 22), (13, value)]) - idx = Index([('a', value), (value, 2)]) + idx = Index([("a", value), (value, 2)]) values = [[0, 3], [1, 4], [2, 5]] data = {cols[c]: Series(values[c], index=idx) for c in range(3)} - result = (DataFrame(data) - .sort_values((11, 21)) - .sort_values(('a', value), axis=1)) - expected = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=idx, columns=cols) + result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1) + expected = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols + ) tm.assert_frame_equal(result, expected) - result = DataFrame(data, index=idx).sort_values(('a', value), axis=1) + result = DataFrame(data, index=idx).sort_values(("a", value), axis=1) tm.assert_frame_equal(result, expected) result = DataFrame(data, index=idx, columns=cols) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') + @pytest.mark.skipif(not PY36, reason="Insertion order for Python>=3.6") def test_constructor_dict_order_insertion(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) # GH19018 # initialization ordering: by insertion order if python>= 3.6 - d = {'b': datetime_series_short, 'a': datetime_series} + d = {"b": datetime_series_short, "a": datetime_series} frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list('ba')) + expected = DataFrame(data=d, columns=list("ba")) tm.assert_frame_equal(frame, expected) - @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') + @pytest.mark.skipif(PY36, reason="order by value for Python<3.6") def test_constructor_dict_order_by_values(self): datetime_series = tm.makeTimeSeries(nper=30) datetime_series_short = tm.makeTimeSeries(nper=25) # GH19018 # initialization ordering: by value if python<3.6 - d = {'b': datetime_series_short, 'a': datetime_series} + d = {"b": datetime_series_short, "a": datetime_series} frame = DataFrame(data=d) - expected = DataFrame(data=d, columns=list('ab')) + expected = DataFrame(data=d, columns=list("ab")) tm.assert_frame_equal(frame, expected) def test_constructor_multi_index(self): @@ -404,20 +430,21 @@ def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. 
with pytest.raises(ValueError, match=msg): - DataFrame(np.empty(0), columns=list('abc')) + DataFrame(np.empty(0), columns=list("abc")) msg = "Mixing dicts with non-Series may lead to ambiguous ordering." # mix dict and array, wrong size with pytest.raises(ValueError, match=msg): - DataFrame({'A': {'a': 'a', 'b': 'b'}, - 'B': ['a', 'b', 'c']}) + DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]}) # wrong size ndarray, GH 3105 msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.arange(12).reshape((4, 3)), - columns=['foo', 'bar', 'baz'], - index=pd.date_range('2000-01-01', periods=3)) + DataFrame( + np.arange(12).reshape((4, 3)), + columns=["foo", "bar", "baz"], + index=pd.date_range("2000-01-01", periods=3), + ) arr = np.array([[4, 5, 6]]) msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)" @@ -430,37 +457,31 @@ def test_constructor_error_msgs(self): DataFrame(index=[0], columns=range(0, 4), data=arr) # higher dim raise exception - with pytest.raises(ValueError, match='Must pass 2-d input'): - DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels - msg = ("Shape of passed values " - r"is \(2, 3\), indices " - r"imply \(1, 3\)") + msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1]) + DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) - msg = ("Shape of passed values " - r"is \(2, 3\), indices " - r"imply \(2, 2\)") + msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2]) + DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) # gh-26429 msg = "2 columns passed, passed data had 10 columns" with pytest.raises(ValueError, match=msg): - DataFrame((range(10), range(10, 20)), columns=('ones', 'twos')) + DataFrame((range(10), range(10, 20)), columns=("ones", "twos")) - msg = ("If using all scalar " - "values, you must pass " - "an index") + msg = "If using all scalar " "values, you must pass " "an index" with pytest.raises(ValueError, match=msg): - DataFrame({'a': False, 'b': True}) + DataFrame({"a": False, "b": True}) def test_constructor_with_embedded_frames(self): # embedded data frames - df1 = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) + df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df2 = DataFrame([df1, df1 + 10]) df2.dtypes @@ -474,11 +495,12 @@ def test_constructor_with_embedded_frames(self): def test_constructor_subclass_dict(self, float_frame): # Test for passing dict subclass to constructor - data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in range(10)), - 'col2': tm.TestSubDict((x, 20.0 * x) for x in range(10))} + data = { + "col1": tm.TestSubDict((x, 10.0 * x) for x in range(10)), + "col2": tm.TestSubDict((x, 20.0 * x) for x in range(10)), + } df = DataFrame(data) - refdf = DataFrame({col: dict(val.items()) - for col, val in data.items()}) + refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(data.items()) @@ -487,8 +509,9 @@ def test_constructor_subclass_dict(self, float_frame): # try with defaultdict from collections import defaultdict + data = {} 
- float_frame['B'][:10] = np.nan + float_frame["B"][:10] = np.nan for k, v in float_frame.items(): dct = defaultdict(dict) dct.update(v.to_dict()) @@ -497,48 +520,47 @@ def test_constructor_subclass_dict(self, float_frame): tm.assert_frame_equal(float_frame.sort_index(), frame) def test_constructor_dict_block(self): - expected = np.array([[4., 3., 2., 1.]]) - df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]}, - columns=['d', 'c', 'b', 'a']) + expected = np.array([[4.0, 3.0, 2.0, 1.0]]) + df = DataFrame( + {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]}, + columns=["d", "c", "b", "a"], + ) tm.assert_numpy_array_equal(df.values, expected) def test_constructor_dict_cast(self): # cast float tests - test_data = { - 'A': {'1': 1, '2': 2}, - 'B': {'1': '1', '2': '2', '3': '3'}, - } + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) assert len(frame) == 3 - assert frame['B'].dtype == np.float64 - assert frame['A'].dtype == np.float64 + assert frame["B"].dtype == np.float64 + assert frame["A"].dtype == np.float64 frame = DataFrame(test_data) assert len(frame) == 3 - assert frame['B'].dtype == np.object_ - assert frame['A'].dtype == np.float64 + assert frame["B"].dtype == np.object_ + assert frame["A"].dtype == np.float64 # can't cast to float test_data = { - 'A': dict(zip(range(20), tm.makeStringIndex(20))), - 'B': dict(zip(range(15), np.random.randn(15))) + "A": dict(zip(range(20), tm.makeStringIndex(20))), + "B": dict(zip(range(15), np.random.randn(15))), } frame = DataFrame(test_data, dtype=float) assert len(frame) == 20 - assert frame['A'].dtype == np.object_ - assert frame['B'].dtype == np.float64 + assert frame["A"].dtype == np.object_ + assert frame["B"].dtype == np.float64 def test_constructor_dict_dont_upcast(self): - d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} + d = {"Col1": {"Row1": "A String", "Row2": np.nan}} df = DataFrame(d) - assert isinstance(df['Col1']['Row2'], float) + assert isinstance(df["Col1"]["Row2"], float) - dm = DataFrame([[1, 2], ['a', 'b']], index=[1, 2], columns=[1, 2]) + dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2]) assert isinstance(dm[1][1], int) def test_constructor_dict_of_tuples(self): # GH #1491 - data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + data = {"a": (1, 2, 3), "b": (4, 5, 6)} result = DataFrame(data) expected = DataFrame({k: list(v) for k, v in data.items()}) @@ -546,50 +568,56 @@ def test_constructor_dict_of_tuples(self): def test_constructor_dict_of_ranges(self): # GH 26356 - data = {'a': range(3), 'b': range(3, 6)} + data = {"a": range(3), "b": range(3, 6)} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [3, 4, 5]}) + expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_iterators(self): # GH 26349 - data = {'a': iter(range(3)), 'b': reversed(range(3))} + data = {"a": iter(range(3)), "b": reversed(range(3))} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [2, 1, 0]}) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, expected) def test_constructor_dict_of_generators(self): # GH 26349 - data = {'a': (i for i in (range(3))), - 'b': (i for i in reversed(range(3)))} + data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))} result = DataFrame(data) - expected = DataFrame({'a': [0, 1, 2], 'b': [2, 1, 0]}) + expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]}) tm.assert_frame_equal(result, 
expected) def test_constructor_dict_multiindex(self): def check(result, expected): - return tm.assert_frame_equal(result, expected, check_dtype=True, - check_index_type=True, - check_column_type=True, - check_names=True) - d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, - ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, - ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} + return tm.assert_frame_equal( + result, + expected, + check_dtype=True, + check_index_type=True, + check_column_type=True, + check_names=True, + ) + + d = { + ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2}, + ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4}, + ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9}, + } _d = sorted(d.items()) df = DataFrame(d) expected = DataFrame( - [x[1] for x in _d], - index=MultiIndex.from_tuples([x[0] for x in _d])).T + [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d]) + ).T expected.index = MultiIndex.from_tuples(expected.index) check(df, expected) - d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111} - _d.insert(0, ('z', d['z'])) + d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111} + _d.insert(0, ("z", d["z"])) expected = DataFrame( - [x[1] for x in _d], - index=Index([x[0] for x in _d], tupleize_cols=False)).T + [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False) + ).T expected.index = Index(expected.index, tupleize_cols=False) df = DataFrame(d) df = df.reindex(columns=expected.columns, index=expected.index) @@ -597,21 +625,24 @@ def check(result, expected): def test_constructor_dict_datetime64_index(self): # GH 10160 - dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] def create_data(constructor): - return {i: {constructor(s): 2 * i} - for i, s in enumerate(dates_as_str)} + return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)} data_datetime64 = create_data(np.datetime64) - data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) data_Timestamp = create_data(Timestamp) - expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}], - index=[Timestamp(dt) for dt in dates_as_str]) + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timestamp(dt) for dt in dates_as_str], + ) result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) @@ -625,18 +656,21 @@ def test_constructor_dict_timedelta64_index(self): td_as_int = [1, 2, 3, 4] def create_data(constructor): - return {i: {constructor(s): 2 * i} - for i, s in enumerate(td_as_int)} + return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} - data_timedelta64 = create_data(lambda x: np.timedelta64(x, 'D')) + data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D")) data_timedelta = create_data(lambda x: timedelta(days=x)) - data_Timedelta = create_data(lambda x: Timedelta(x, 'D')) + data_Timedelta = create_data(lambda x: Timedelta(x, "D")) - expected = DataFrame([{0: 0, 1: None, 2: None, 3: None}, - {0: None, 1: 2, 2: None, 3: None}, - {0: None, 1: None, 2: 4, 3: None}, - {0: None, 1: None, 2: None, 3: 6}], - 
index=[Timedelta(td, 'D') for td in td_as_int]) + expected = DataFrame( + [ + {0: 0, 1: None, 2: None, 3: None}, + {0: None, 1: 2, 2: None, 3: None}, + {0: None, 1: None, 2: 4, 3: None}, + {0: None, 1: None, 2: None, 3: 6}, + ], + index=[Timedelta(td, "D") for td in td_as_int], + ) result_timedelta64 = DataFrame(data_timedelta64) result_timedelta = DataFrame(data_timedelta) @@ -647,27 +681,27 @@ def create_data(constructor): def test_constructor_period(self): # PeriodIndex - a = pd.PeriodIndex(['2012-01', 'NaT', '2012-04'], freq='M') - b = pd.PeriodIndex(['2012-02-01', '2012-03-01', 'NaT'], freq='D') - df = pd.DataFrame({'a': a, 'b': b}) - assert df['a'].dtype == a.dtype - assert df['b'].dtype == b.dtype + a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") + b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") + df = pd.DataFrame({"a": a, "b": b}) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype # list of periods - df = pd.DataFrame({'a': a.astype(object).tolist(), - 'b': b.astype(object).tolist()}) - assert df['a'].dtype == a.dtype - assert df['b'].dtype == b.dtype + df = pd.DataFrame( + {"a": a.astype(object).tolist(), "b": b.astype(object).tolist()} + ) + assert df["a"].dtype == a.dtype + assert df["b"].dtype == b.dtype def test_nested_dict_frame_constructor(self): - rng = pd.period_range('1/1/2000', periods=5) + rng = pd.period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) data = {} for col in df.columns: for row in df.index: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(col, {})[row] = df.get_value(row, col) result = DataFrame(data, columns=rng) @@ -676,8 +710,7 @@ def test_nested_dict_frame_constructor(self): data = {} for col in df.columns: for row in df.index: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): data.setdefault(row, {})[col] = df.get_value(row, col) result = DataFrame(data, index=rng).T @@ -688,33 +721,31 @@ def _check_basic_constructor(self, empty): # objects mat = empty((2, 3), dtype=float) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 # 1-D input - frame = DataFrame(empty((3,)), columns=['A'], index=[1, 2, 3]) + frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3]) assert len(frame.index) == 3 assert len(frame.columns) == 1 # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.int64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # wrong size axis labels - msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): - DataFrame(mat, columns=['A', 'B', 'C'], index=[1]) - msg = r'Shape of passed values is \(2, 3\), indices imply \(2, 2\)' + DataFrame(mat, columns=["A", "B", "C"], index=[1]) + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame(mat, columns=['A', 'B'], index=[1, 2]) + DataFrame(mat, columns=["A", "B"], index=[1, 2]) # higher dim raise exception - with pytest.raises(ValueError, match='Must pass 2-d input'): - 
DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'], - index=[1]) + with pytest.raises(ValueError, match="Must pass 2-d input"): + DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # automatic labeling frame = DataFrame(mat) @@ -724,7 +755,7 @@ def _check_basic_constructor(self, empty): frame = DataFrame(mat, index=[1, 2]) tm.assert_index_equal(frame.columns, pd.Int64Index(range(3))) - frame = DataFrame(mat, columns=['A', 'B', 'C']) + frame = DataFrame(mat, columns=["A", "B", "C"]) tm.assert_index_equal(frame.index, pd.Int64Index(range(2))) # 0-length axis @@ -737,7 +768,7 @@ def _check_basic_constructor(self, empty): def test_constructor_ndarray(self): self._check_basic_constructor(np.ones) - frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) + frame = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"]) assert len(frame) == 2 def test_constructor_maskedarray(self): @@ -747,134 +778,132 @@ def test_constructor_maskedarray(self): mat = ma.masked_all((2, 3), dtype=float) mat[0, 0] = 1.0 mat[1, 2] = 2.0 - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1.0 == frame['A'][1] - assert 2.0 == frame['C'][2] + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) + assert 1.0 == frame["A"][1] + assert 2.0 == frame["C"][2] # what is this even checking?? mat = ma.masked_all((2, 3), dtype=float) - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert np.all(~np.asarray(frame == frame)) def test_constructor_maskedarray_nonfloat(self): # masked int promoted to float mat = ma.masked_all((2, 3), dtype=int) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.float64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64) assert frame.values.dtype == np.float64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1 == frame['A'][1] - assert 2 == frame['C'][2] + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"][1] + assert 2 == frame["C"][2] # masked np.datetime64 stays (use NaT as null) - mat = ma.masked_all((2, 3), dtype='M8[ns]') + mat = ma.masked_all((2, 3), dtype="M8[ns]") # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert isna(frame).values.all() # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=np.int64) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) assert frame.values.dtype == np.int64 # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = 1 mat2[1, 2] = 2 - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert 1 == frame['A'].view('i8')[1] - assert 2 == frame['C'].view('i8')[2] + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert 1 == frame["A"].view("i8")[1] + assert 2 == frame["C"].view("i8")[2] # masked bool promoted to object mat = ma.masked_all((2, 3), dtype=bool) # 2-D input - frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + frame = DataFrame(mat, 
columns=["A", "B", "C"], index=[1, 2]) assert len(frame.index) == 2 assert len(frame.columns) == 3 assert np.all(~np.asarray(frame == frame)) # cast type - frame = DataFrame(mat, columns=['A', 'B', 'C'], - index=[1, 2], dtype=object) + frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object) assert frame.values.dtype == object # Check non-masked values mat2 = ma.copy(mat) mat2[0, 0] = True mat2[1, 2] = False - frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) - assert frame['A'][1] is True - assert frame['C'][2] is False + frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2]) + assert frame["A"][1] is True + assert frame["C"][2] is False def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) - expected = pd.DataFrame({ - 'A': [np.nan, np.nan], - 'B': [np.nan, np.nan]}, - columns=['A', 'B'], + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, + columns=["A", "B"], index=[1, 2], - dtype=float) + dtype=float, + ) tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked mat_hard = ma.ones((2, 2), dtype=float).harden_mask() - result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) - expected = pd.DataFrame({ - 'A': [1.0, 1.0], - 'B': [1.0, 1.0]}, - columns=['A', 'B'], + result = pd.DataFrame(mat_hard, columns=["A", "B"], index=[1, 2]) + expected = pd.DataFrame( + {"A": [1.0, 1.0], "B": [1.0, 1.0]}, + columns=["A", "B"], index=[1, 2], - dtype=float) + dtype=float, + ) tm.assert_frame_equal(result, expected) def test_constructor_maskedrecarray_dtype(self): # Ensure constructor honors dtype data = np.ma.array( - np.ma.zeros(5, dtype=[('date', '0 - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', - floatname: np.array([1.] 
* 10, dtype=floatname), - intname: np.array([1] * 10, dtype=intname)}, - index=np.arange(10)) + df = DataFrame( + { + "a": 1.0, + "b": 2, + "c": "foo", + floatname: np.array([1.0] * 10, dtype=floatname), + intname: np.array([1] * 10, dtype=intname), + }, + index=np.arange(10), + ) result = df.dtypes - expected = Series([np.dtype('float64')] + - [np.dtype('int64')] + - [np.dtype('object')] + - [np.dtype('float64')] + - [np.dtype(intname)], - index=['a', 'b', 'c', floatname, intname]) + expected = Series( + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("object")] + + [np.dtype("float64")] + + [np.dtype(intname)], + index=["a", "b", "c", floatname, intname], + ) tm.assert_series_equal(result, expected) # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] datetime_s = Series(datetimes) - assert datetime_s.dtype == 'M8[ns]' + assert datetime_s.dtype == "M8[ns]" # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) datetimes = [ts.to_pydatetime() for ts in ind] dates = [ts.date() for ts in ind] - df = DataFrame(datetimes, columns=['datetimes']) - df['dates'] = dates + df = DataFrame(datetimes, columns=["datetimes"]) + df["dates"] = dates result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], - index=['datetimes', 'dates']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], + index=["datetimes", "dates"], + ) tm.assert_series_equal(result, expected) # GH 7594 # don't coerce tz-aware import pytz - tz = pytz.timezone('US/Eastern') + + tz = pytz.timezone("US/Eastern") dt = tz.localize(datetime(2012, 1, 1)) - df = DataFrame({'End Date': dt}, index=[0]) + df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt - tm.assert_series_equal(df.dtypes, Series( - {'End Date': 'datetime64[ns, US/Eastern]'})) + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) - df = DataFrame([{'End Date': dt}]) + df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt - tm.assert_series_equal(df.dtypes, Series( - {'End Date': 'datetime64[ns, US/Eastern]'})) + tm.assert_series_equal( + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + ) # tz-aware (UTC and other tz's) # GH 8411 - dr = date_range('20130101', periods=3) - df = DataFrame({'value': dr}) + dr = date_range("20130101", periods=3) + df = DataFrame({"value": dr}) assert df.iat[0, 0].tz is None - dr = date_range('20130101', periods=3, tz='UTC') - df = DataFrame({'value': dr}) - assert str(df.iat[0, 0].tz) == 'UTC' - dr = date_range('20130101', periods=3, tz='US/Eastern') - df = DataFrame({'value': dr}) - assert str(df.iat[0, 0].tz) == 'US/Eastern' + dr = date_range("20130101", periods=3, tz="UTC") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + df = DataFrame({"value": dr}) + assert str(df.iat[0, 0].tz) == "US/Eastern" # GH 7822 # preserver an index with a tz on dict construction - i = date_range('1/1/2011', periods=5, freq='10s', tz='US/Eastern') + i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern") - expected = DataFrame( - {'a': i.to_series(keep_tz=True).reset_index(drop=True)}) + expected = DataFrame({"a": i.to_series(keep_tz=True).reset_index(drop=True)}) df = DataFrame() - df['a'] = i + df["a"] = i tm.assert_frame_equal(df, expected) - df = DataFrame({'a': i}) + df = DataFrame({"a": i}) tm.assert_frame_equal(df, expected) # multiples - i_no_tz = 
date_range('1/1/2011', periods=5, freq='10s') - df = DataFrame({'a': i, 'b': i_no_tz}) - expected = DataFrame({'a': i.to_series(keep_tz=True) - .reset_index(drop=True), 'b': i_no_tz}) + i_no_tz = date_range("1/1/2011", periods=5, freq="10s") + df = DataFrame({"a": i, "b": i_no_tz}) + expected = DataFrame( + {"a": i.to_series(keep_tz=True).reset_index(drop=True), "b": i_no_tz} + ) tm.assert_frame_equal(df, expected) def test_constructor_datetimes_with_nulls(self): # gh-15869 - for arr in [np.array([None, None, None, None, - datetime.now(), None]), - np.array([None, None, datetime.now(), None])]: + for arr in [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + ]: result = DataFrame(arr).dtypes - expected = Series([np.dtype('datetime64[ns]')]) + expected = Series([np.dtype("datetime64[ns]")]) tm.assert_series_equal(result, expected) def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes - expected = Series([np.dtype('int64')] * 5) + expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) - df = DataFrame([np.array(np.arange(5), dtype='int32') - for x in range(5)]) + df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes - expected = Series([np.dtype('int64')] * 5) + expected = Series([np.dtype("int64")] * 5) tm.assert_series_equal(result, expected) # overflow issue? (we always expecte int64 upcasting here) - df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]}) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": [2 ** 31, 2 ** 31 + 1]}) + assert df.dtypes.iloc[0] == np.dtype("int64") # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) - assert df.dtypes.iloc[0] == np.dtype('int64') + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame([1., 2.]) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame([1.0, 2.0]) + assert df.dtypes.iloc[0] == np.dtype("float64") - df = DataFrame({'a': [1, 2]}) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": [1, 2]}) + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame({'a': [1., 2.]}) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame({"a": [1.0, 2.0]}) + assert df.dtypes.iloc[0] == np.dtype("float64") - df = DataFrame({'a': 1}, index=range(3)) - assert df.dtypes.iloc[0] == np.dtype('int64') + df = DataFrame({"a": 1}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("int64") - df = DataFrame({'a': 1.}, index=range(3)) - assert df.dtypes.iloc[0] == np.dtype('float64') + df = DataFrame({"a": 1.0}, index=range(3)) + assert df.dtypes.iloc[0] == np.dtype("float64") # with object list - df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3], - 'c': list('abcd'), - 'd': [datetime(2000, 1, 1) for i in range(4)], - 'e': [1., 2, 4., 7]}) + df = DataFrame( + { + "a": [1, 2, 4, 7], + "b": [1.2, 2.3, 5.1, 6.3], + "c": list("abcd"), + "d": [datetime(2000, 1, 1) for i in range(4)], + "e": [1.0, 2, 4.0, 7], + } + ) result = df.dtypes - expected = Series([np.dtype('int64'), - np.dtype('float64'), - np.dtype('object'), - np.dtype('datetime64[ns]'), - np.dtype('float64')], - index=list('abcde')) + expected = Series( + [ + np.dtype("int64"), + np.dtype("float64"), + np.dtype("object"), + np.dtype("datetime64[ns]"), + np.dtype("float64"), + ], + index=list("abcde"), + ) tm.assert_series_equal(result, expected) def 
test_constructor_frame_copy(self, float_frame): cop = DataFrame(float_frame, copy=True) - cop['A'] = 5 - assert (cop['A'] == 5).all() - assert not (float_frame['A'] == 5).all() + cop["A"] = 5 + assert (cop["A"] == 5).all() + assert not (float_frame["A"] == 5).all() def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) @@ -1759,10 +1846,10 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({'A': series['A']}) - df['A'][:] = 5 + df = DataFrame({"A": series["A"]}) + df["A"][:] = 5 - assert not (series['A'] == 5).all() + assert not (series["A"] == 5).all() def test_constructor_with_nas(self): # GH 5016 @@ -1776,19 +1863,19 @@ def check(df): # No NaN found -> error if len(indexer) == 0: - msg = ("cannot do label indexing on" - r" " - r" with these indexers \[nan\] of ") + msg = ( + "cannot do label indexing on" + r" " + r" with these indexers \[nan\] of " + ) with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] # single nan should result in Series elif len(indexer) == 1: - tm.assert_series_equal(df.iloc[:, indexer[0]], - df.loc[:, np.nan]) + tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) # multiple nans should result in DataFrame else: - tm.assert_frame_equal(df.iloc[:, indexer], - df.loc[:, np.nan]) + tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan]) df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]) check(df) @@ -1796,104 +1883,107 @@ def check(df): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]) check(df) - df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], - columns=[np.nan, 1.1, 2.2, np.nan]) + df = DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]) check(df) - df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], - columns=[np.nan, 1.1, 2.2, np.nan]) + df = DataFrame( + [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan] + ) check(df) # GH 21428 (non-unique columns) - df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], - columns=[np.nan, 1, 2, 2]) + df = DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]) check(df) def test_constructor_lists_to_object_dtype(self): # from #1074 - d = DataFrame({'a': [np.nan, False]}) - assert d['a'].dtype == np.object_ - assert not d['a'][1] + d = DataFrame({"a": [np.nan, False]}) + assert d["a"].dtype == np.object_ + assert not d["a"][1] def test_constructor_categorical(self): # GH8626 # dict creation - df = DataFrame({'A': list('abc')}, dtype='category') - expected = Series(list('abc'), dtype='category', name='A') - tm.assert_series_equal(df['A'], expected) + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) # to_frame - s = Series(list('abc'), dtype='category') + s = Series(list("abc"), dtype="category") result = s.to_frame() - expected = Series(list('abc'), dtype='category', name=0) + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(result[0], expected) - result = s.to_frame(name='foo') - expected = Series(list('abc'), dtype='category', name='foo') - tm.assert_series_equal(result['foo'], expected) + result = s.to_frame(name="foo") + expected = Series(list("abc"), dtype="category", name="foo") + tm.assert_series_equal(result["foo"], expected) # list-like creation - df = DataFrame(list('abc'), dtype='category') - expected = Series(list('abc'), dtype='category', name=0) + 
df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(df[0], expected) # ndim != 1 - df = DataFrame([Categorical(list('abc'))]) - expected = DataFrame({0: Series(list('abc'), dtype='category')}) + df = DataFrame([Categorical(list("abc"))]) + expected = DataFrame({0: Series(list("abc"), dtype="category")}) tm.assert_frame_equal(df, expected) - df = DataFrame([Categorical(list('abc')), Categorical(list('abd'))]) - expected = DataFrame({0: Series(list('abc'), dtype='category'), - 1: Series(list('abd'), dtype='category')}, - columns=[0, 1]) + df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))]) + expected = DataFrame( + { + 0: Series(list("abc"), dtype="category"), + 1: Series(list("abd"), dtype="category"), + }, + columns=[0, 1], + ) tm.assert_frame_equal(df, expected) # mixed - df = DataFrame([Categorical(list('abc')), list('def')]) - expected = DataFrame({0: Series(list('abc'), dtype='category'), - 1: list('def')}, columns=[0, 1]) + df = DataFrame([Categorical(list("abc")), list("def")]) + expected = DataFrame( + {0: Series(list("abc"), dtype="category"), 1: list("def")}, columns=[0, 1] + ) tm.assert_frame_equal(df, expected) # invalid (shape) msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" with pytest.raises(ValueError, match=msg): - DataFrame([Categorical(list('abc')), - Categorical(list('abdefg'))]) + DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) # ndim > 1 msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): - Categorical(np.array([list('abcd')])) + Categorical(np.array([list("abcd")])) def test_constructor_categorical_series(self): items = [1, 2, 3, 1] - exp = Series(items).astype('category') - res = Series(items, dtype='category') + exp = Series(items).astype("category") + res = Series(items, dtype="category") tm.assert_series_equal(res, exp) items = ["a", "b", "c", "a"] - exp = Series(items).astype('category') - res = Series(items, dtype='category') + exp = Series(items).astype("category") + res = Series(items, dtype="category") tm.assert_series_equal(res, exp) # insert into frame with different index # GH 8076 - index = date_range('20000101', periods=3) - expected = Series(Categorical(values=[np.nan, np.nan, np.nan], - categories=['a', 'b', 'c'])) + index = date_range("20000101", periods=3) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) expected.index = index - expected = DataFrame({'x': expected}) - df = DataFrame( - {'x': Series(['a', 'b', 'c'], dtype='category')}, index=index) + expected = DataFrame({"x": expected}) + df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index) tm.assert_frame_equal(df, expected) def test_from_records_to_records(self): # from numpy documentation - arr = np.zeros((2,), dtype=('i4,f4,a10')) - arr[:] = [(1, 2., 'Hello'), (2, 3., "World")] + arr = np.zeros((2,), dtype=("i4,f4,a10")) + arr[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa @@ -1907,11 +1997,11 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r'Shape of passed values is \(2, 3\), indices imply \(1, 3\)' + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) - indexed_frame = 
DataFrame.from_records(arr, index='f1') + indexed_frame = DataFrame.from_records(arr, index="f1") # what to do? records = indexed_frame.to_records() @@ -1919,46 +2009,51 @@ def test_from_records_to_records(self): records = indexed_frame.to_records(index=False) assert len(records.dtype.names) == 2 - assert 'index' not in records.dtype.names + assert "index" not in records.dtype.names def test_from_records_nones(self): - tuples = [(1, 2, None, 3), - (1, 2, None, 3), - (None, 2, 5, 3)] + tuples = [(1, 2, None, 3), (1, 2, None, 3), (None, 2, 5, 3)] - df = DataFrame.from_records(tuples, columns=['a', 'b', 'c', 'd']) - assert np.isnan(df['c'][0]) + df = DataFrame.from_records(tuples, columns=["a", "b", "c", "d"]) + assert np.isnan(df["c"][0]) def test_from_records_iterator(self): - arr = np.array([(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5., 5., 6, 6), - (7., 7., 8, 8)], - dtype=[('x', np.float64), ('u', np.float32), - ('y', np.int64), ('z', np.int32)]) + arr = np.array( + [(1.0, 1.0, 2, 2), (3.0, 3.0, 4, 4), (5.0, 5.0, 6, 6), (7.0, 7.0, 8, 8)], + dtype=[ + ("x", np.float64), + ("u", np.float32), + ("y", np.int64), + ("z", np.int32), + ], + ) df = DataFrame.from_records(iter(arr), nrows=2) - xp = DataFrame({'x': np.array([1.0, 3.0], dtype=np.float64), - 'u': np.array([1.0, 3.0], dtype=np.float32), - 'y': np.array([2, 4], dtype=np.int64), - 'z': np.array([2, 4], dtype=np.int32)}) + xp = DataFrame( + { + "x": np.array([1.0, 3.0], dtype=np.float64), + "u": np.array([1.0, 3.0], dtype=np.float32), + "y": np.array([2, 4], dtype=np.int64), + "z": np.array([2, 4], dtype=np.int32), + } + ) tm.assert_frame_equal(df.reindex_like(xp), xp) # no dtypes specified here, so just compare with the default - arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)] - df = DataFrame.from_records(iter(arr), columns=['x', 'y'], - nrows=2) - tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']), - check_dtype=False) + arr = [(1.0, 2), (3.0, 4), (5.0, 6), (7.0, 8)] + df = DataFrame.from_records(iter(arr), columns=["x", "y"], nrows=2) + tm.assert_frame_equal(df, xp.reindex(columns=["x", "y"]), check_dtype=False) def test_from_records_tuples_generator(self): def tuple_generator(length): for i in range(length): - letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield (i, letters[i % len(letters)], i / length) - columns_names = ['Integer', 'String', 'Float'] - columns = [[i[j] for i in tuple_generator( - 10)] for j in range(len(columns_names))] - data = {'Integer': columns[0], - 'String': columns[1], 'Float': columns[2]} + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in tuple_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": columns[2]} expected = DataFrame(data, columns=columns_names) generator = tuple_generator(10) @@ -1968,14 +2063,14 @@ def tuple_generator(length): def test_from_records_lists_generator(self): def list_generator(length): for i in range(length): - letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" yield [i, letters[i % len(letters)], i / length] - columns_names = ['Integer', 'String', 'Float'] - columns = [[i[j] for i in list_generator( - 10)] for j in range(len(columns_names))] - data = {'Integer': columns[0], - 'String': columns[1], 'Float': columns[2]} + columns_names = ["Integer", "String", "Float"] + columns = [ + [i[j] for i in list_generator(10)] for j in range(len(columns_names)) + ] + data = {"Integer": columns[0], "String": columns[1], "Float": 
columns[2]} expected = DataFrame(data, columns=columns_names) generator = list_generator(10) @@ -1983,111 +2078,112 @@ def list_generator(length): tm.assert_frame_equal(result, expected) def test_from_records_columns_not_modified(self): - tuples = [(1, 2, 3), - (1, 2, 3), - (2, 5, 3)] + tuples = [(1, 2, 3), (1, 2, 3), (2, 5, 3)] - columns = ['a', 'b', 'c'] + columns = ["a", "b", "c"] original_columns = list(columns) - df = DataFrame.from_records(tuples, columns=columns, index='a') # noqa + df = DataFrame.from_records(tuples, columns=columns, index="a") # noqa assert columns == original_columns def test_from_records_decimal(self): from decimal import Decimal - tuples = [(Decimal('1.5'),), (Decimal('2.5'),), (None,)] + tuples = [(Decimal("1.5"),), (Decimal("2.5"),), (None,)] - df = DataFrame.from_records(tuples, columns=['a']) - assert df['a'].dtype == object + df = DataFrame.from_records(tuples, columns=["a"]) + assert df["a"].dtype == object - df = DataFrame.from_records(tuples, columns=['a'], coerce_float=True) - assert df['a'].dtype == np.float64 - assert np.isnan(df['a'].values[-1]) + df = DataFrame.from_records(tuples, columns=["a"], coerce_float=True) + assert df["a"].dtype == np.float64 + assert np.isnan(df["a"].values[-1]) def test_from_records_duplicates(self): - result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], - columns=['a', 'b', 'a']) + result = DataFrame.from_records([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) - expected = DataFrame([(1, 2, 3), (4, 5, 6)], - columns=['a', 'b', 'a']) + expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "a"]) tm.assert_frame_equal(result, expected) def test_from_records_set_index_name(self): def create_dict(order_id): - return {'order_id': order_id, 'quantity': np.random.randint(1, 10), - 'price': np.random.randint(1, 10)} + return { + "order_id": order_id, + "quantity": np.random.randint(1, 10), + "price": np.random.randint(1, 10), + } + documents = [create_dict(i) for i in range(10)] # demo missing data - documents.append({'order_id': 10, 'quantity': 5}) + documents.append({"order_id": 10, "quantity": 5}) - result = DataFrame.from_records(documents, index='order_id') - assert result.index.name == 'order_id' + result = DataFrame.from_records(documents, index="order_id") + assert result.index.name == "order_id" # MultiIndex - result = DataFrame.from_records(documents, - index=['order_id', 'quantity']) - assert result.index.names == ('order_id', 'quantity') + result = DataFrame.from_records(documents, index=["order_id", "quantity"]) + assert result.index.names == ("order_id", "quantity") def test_from_records_misc_brokenness(self): # #2179 - data = {1: ['foo'], 2: ['bar']} + data = {1: ["foo"], 2: ["bar"]} - result = DataFrame.from_records(data, columns=['a', 'b']) - exp = DataFrame(data, columns=['a', 'b']) + result = DataFrame.from_records(data, columns=["a", "b"]) + exp = DataFrame(data, columns=["a", "b"]) tm.assert_frame_equal(result, exp) # overlap in index/index_names - data = {'a': [1, 2, 3], 'b': [4, 5, 6]} + data = {"a": [1, 2, 3], "b": [4, 5, 6]} - result = DataFrame.from_records(data, index=['a', 'b', 'c']) - exp = DataFrame(data, index=['a', 'b', 'c']) + result = DataFrame.from_records(data, index=["a", "b", "c"]) + exp = DataFrame(data, index=["a", "b", "c"]) tm.assert_frame_equal(result, exp) # GH 2623 rows = [] rows.append([datetime(2010, 1, 1), 1]) - rows.append([datetime(2010, 1, 2), 'hi']) # test col upconverts to obj - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + 
rows.append([datetime(2010, 1, 2), "hi"]) # test col upconverts to obj + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('object')], - index=['date', 'test']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("object")], index=["date", "test"] + ) tm.assert_series_equal(result, expected) rows = [] rows.append([datetime(2010, 1, 1), 1]) rows.append([datetime(2010, 1, 2), 1]) - df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) + df2_obj = DataFrame.from_records(rows, columns=["date", "test"]) result = df2_obj.dtypes - expected = Series([np.dtype('datetime64[ns]'), np.dtype('int64')], - index=['date', 'test']) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("int64")], index=["date", "test"] + ) tm.assert_series_equal(result, expected) def test_from_records_empty(self): # 3562 - result = DataFrame.from_records([], columns=['a', 'b', 'c']) - expected = DataFrame(columns=['a', 'b', 'c']) + result = DataFrame.from_records([], columns=["a", "b", "c"]) + expected = DataFrame(columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) - result = DataFrame.from_records([], columns=['a', 'b', 'b']) - expected = DataFrame(columns=['a', 'b', 'b']) + result = DataFrame.from_records([], columns=["a", "b", "b"]) + expected = DataFrame(columns=["a", "b", "b"]) tm.assert_frame_equal(result, expected) def test_from_records_empty_with_nonempty_fields_gh3682(self): - a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)]) - df = DataFrame.from_records(a, index='id') - tm.assert_index_equal(df.index, Index([1], name='id')) - assert df.index.name == 'id' - tm.assert_index_equal(df.columns, Index(['value'])) + a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(a, index="id") + tm.assert_index_equal(df.index, Index([1], name="id")) + assert df.index.name == "id" + tm.assert_index_equal(df.columns, Index(["value"])) - b = np.array([], dtype=[('id', np.int64), ('value', np.int64)]) - df = DataFrame.from_records(b, index='id') - tm.assert_index_equal(df.index, Index([], name='id')) - assert df.index.name == 'id' + b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) + df = DataFrame.from_records(b, index="id") + tm.assert_index_equal(df.index, Index([], name="id")) + assert df.index.name == "id" def test_from_records_with_datetimes(self): @@ -2098,10 +2194,10 @@ def test_from_records_with_datetimes(self): # construction with a null in a recarray # GH 6140 - expected = DataFrame({'EXPIRY': [datetime(2005, 3, 1, 0, 0), None]}) + expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [('EXPIRY', '\n' - 'To: \n' - 'Subject: Test message\n' - '\n' - 'Body would go here\n') + headers = Parser().parsestr( + "From: \n" + "To: \n" + "Subject: Test message\n" + "\n" + "Body would go here\n" + ) frame = DataFrame.from_records([headers]) - all(x in frame for x in ['Type', 'Subject', 'From']) + all(x in frame for x in ["Type", "Subject", "From"]) def test_to_records_floats(self): df = DataFrame(np.random.rand(10, 10)) @@ -132,25 +136,24 @@ def test_to_records_floats(self): def test_to_records_index_name(self): df = DataFrame(np.random.randn(3, 3)) - df.index.name = 'X' + df.index.name = "X" rs = df.to_records() - assert 'X' in rs.dtype.fields + assert "X" in rs.dtype.fields df = DataFrame(np.random.randn(3, 3)) rs = df.to_records() - 
assert 'index' in rs.dtype.fields + assert "index" in rs.dtype.fields - df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) - df.index.names = ['A', None] + df.index = MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]) + df.index.names = ["A", None] rs = df.to_records() - assert 'level_0' in rs.dtype.fields + assert "level_0" in rs.dtype.fields def test_to_records_with_unicode_index(self): # GH13172 # unicode_literals conflict with to_records - result = DataFrame([{'a': 'x', 'b': 'y'}]).set_index('a') \ - .to_records() - expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) + result = DataFrame([{"a": "x", "b": "y"}]).set_index("a").to_records() + expected = np.rec.array([("x", "y")], dtype=[("a", "O"), ("b", "O")]) tm.assert_almost_equal(result, expected) def test_to_records_with_unicode_column_names(self): @@ -163,8 +166,7 @@ def test_to_records_with_unicode_column_names(self): # to be specified using dictionary instead of list of tuples. expected = np.rec.array( [(0, 1.0)], - dtype={"names": ["index", "accented_name_é"], - "formats": ['=i8', '=f8']} + dtype={"names": ["index", "accented_name_é"], "formats": ["=i8", "=f8"]}, ) tm.assert_almost_equal(result, expected) @@ -173,117 +175,154 @@ def test_to_records_with_categorical(self): # GH8626 # dict creation - df = DataFrame({'A': list('abc')}, dtype='category') - expected = Series(list('abc'), dtype='category', name='A') - tm.assert_series_equal(df['A'], expected) + df = DataFrame({"A": list("abc")}, dtype="category") + expected = Series(list("abc"), dtype="category", name="A") + tm.assert_series_equal(df["A"], expected) # list-like creation - df = DataFrame(list('abc'), dtype='category') - expected = Series(list('abc'), dtype='category', name=0) + df = DataFrame(list("abc"), dtype="category") + expected = Series(list("abc"), dtype="category", name=0) tm.assert_series_equal(df[0], expected) # to record array # this coerces result = df.to_records() - expected = np.rec.array([(0, 'a'), (1, 'b'), (2, 'c')], - dtype=[('index', '=i8'), ('0', 'O')]) + expected = np.rec.array( + [(0, "a"), (1, "b"), (2, "c")], dtype=[("index", "=i8"), ("0", "O")] + ) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize("kwargs,expected", [ - # No dtypes --> default to array dtypes. - (dict(), - np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], - dtype=[("index", " default to array dtypes. 
+ ( + dict(), + np.rec.array( + [(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")], + dtype=[("index", " 0 + float_string_frame["bool"] = float_string_frame["A"] > 0 result = float_string_frame.dtypes - expected = Series({k: v.dtype for k, v in float_string_frame.items()}, - index=result.index) + expected = Series( + {k: v.dtype for k, v in float_string_frame.items()}, index=result.index + ) assert_series_equal(result, expected) # compat, GH 8722 - with option_context('use_inf_as_na', True): + with option_context("use_inf_as_na", True): df = DataFrame([[1]]) result = df.dtypes - assert_series_equal(result, Series({0: np.dtype('int64')})) + assert_series_equal(result, Series({0: np.dtype("int64")})) def test_ftypes(self, mixed_float_frame): frame = mixed_float_frame - expected = Series(dict(A='float32:dense', - B='float32:dense', - C='float16:dense', - D='float64:dense')).sort_values() + expected = Series( + dict( + A="float32:dense", + B="float32:dense", + C="float16:dense", + D="float64:dense", + ) + ).sort_values() # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): @@ -433,74 +493,78 @@ def test_ftypes(self, mixed_float_frame): def test_astype_float(self, float_frame): casted = float_frame.astype(int) - expected = DataFrame(float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) casted = float_frame.astype(np.int32) - expected = DataFrame(float_frame.values.astype(np.int32), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) - float_frame['foo'] = '5' + float_frame["foo"] = "5" casted = float_frame.astype(int) - expected = DataFrame(float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) assert_frame_equal(casted, expected) def test_astype_mixed_float(self, mixed_float_frame): # mixed casting - casted = mixed_float_frame.reindex( - columns=['A', 'B']).astype('float32') - _check_cast(casted, 'float32') + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") + _check_cast(casted, "float32") - casted = mixed_float_frame.reindex( - columns=['A', 'B']).astype('float16') - _check_cast(casted, 'float16') + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") + _check_cast(casted, "float16") def test_astype_mixed_type(self, mixed_type_frame): # mixed casting mn = mixed_type_frame._get_numeric_data().copy() - mn['little_float'] = np.array(12345., dtype='float16') - mn['big_float'] = np.array(123456789101112., dtype='float64') + mn["little_float"] = np.array(12345.0, dtype="float16") + mn["big_float"] = np.array(123456789101112.0, dtype="float64") - casted = mn.astype('float64') - _check_cast(casted, 'float64') + casted = mn.astype("float64") + _check_cast(casted, "float64") - casted = mn.astype('int64') - _check_cast(casted, 'int64') + casted = mn.astype("int64") + _check_cast(casted, "int64") - casted = mn.reindex(columns=['little_float']).astype('float16') - _check_cast(casted, 'float16') + casted = mn.reindex(columns=["little_float"]).astype("float16") + _check_cast(casted, "float16") - casted = mn.astype('float32') - 
_check_cast(casted, 'float32') + casted = mn.astype("float32") + _check_cast(casted, "float32") - casted = mn.astype('int32') - _check_cast(casted, 'int32') + casted = mn.astype("int32") + _check_cast(casted, "int32") # to object - casted = mn.astype('O') - _check_cast(casted, 'object') + casted = mn.astype("O") + _check_cast(casted, "object") def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) - df['string'] = 'foo' - casted = df.astype(int, errors='ignore') + df["string"] = "foo" + casted = df.astype(int, errors="ignore") - expected['string'] = 'foo' + expected["string"] = "foo" assert_frame_equal(casted, expected) df = float_frame.copy() expected = float_frame.astype(np.int32) - df['string'] = 'foo' - casted = df.astype(np.int32, errors='ignore') + df["string"] = "foo" + casted = df.astype(np.int32, errors="ignore") - expected['string'] = 'foo' + expected["string"] = "foo" assert_frame_equal(casted, expected) def test_astype_with_view_float(self, float_frame): @@ -515,7 +579,7 @@ def test_astype_with_view_float(self, float_frame): def test_astype_with_view_mixed_float(self, mixed_float_frame): - tf = mixed_float_frame.reindex(columns=['A', 'B', 'C']) + tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa @@ -545,16 +609,20 @@ def test_astype_str(self): # Datetime-like result = df.astype(str) - expected = DataFrame({ - "a": list(map(str, - map(lambda x: Timestamp(x)._date_repr, a._values))), - "b": list(map(str, map(Timestamp, b._values))), - "c": list(map(str, - map(lambda x: Timedelta(x)._repr_base(format="all"), - c._values))), - "d": list(map(str, d._values)), - "e": list(map(str, e._values)), - }) + expected = DataFrame( + { + "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), + "b": list(map(str, map(Timestamp, b._values))), + "c": list( + map( + str, + map(lambda x: Timedelta(x)._repr_base(format="all"), c._values), + ) + ), + "d": list(map(str, d._values)), + "e": list(map(str, e._values)), + } + ) assert_frame_equal(result, expected) @@ -568,54 +636,59 @@ def test_astype_str_float(self): # < 1.14 truncates # >= 1.14 preserves the full repr - val = ("1.12345678901" if _np_version_under1p14 - else "1.1234567890123457") + val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" expected = DataFrame([val]) assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, dtype_class): # GH7271 & GH16717 - a = Series(date_range('2010-01-04', periods=5)) + a = Series(date_range("2010-01-04", periods=5)) b = Series(range(5)) c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) - d = Series(['1.0', '2', '3.14', '4', '5.4']) - df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + d = Series(["1.0", "2", "3.14", "4", "5.4"]) + df = DataFrame({"a": a, "b": b, "c": c, "d": d}) original = df.copy(deep=True) # change type of a subset of columns - dt1 = dtype_class({'b': 'str', 'd': 'float32'}) + dt1 = dtype_class({"b": "str", "d": "float32"}) result = df.astype(dt1) - expected = DataFrame({ - 'a': a, - 'b': Series(['0', '1', '2', '3', '4']), - 'c': c, - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) + expected = DataFrame( + { + "a": a, + "b": Series(["0", "1", "2", "3", "4"]), + "c": c, + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), + } + ) assert_frame_equal(result, expected) assert_frame_equal(df, original) - dt2 = dtype_class({'b': np.float32, 'c': 
'float32', 'd': np.float64}) + dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) result = df.astype(dt2) - expected = DataFrame({ - 'a': a, - 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), - 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + expected = DataFrame( + { + "a": a, + "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), + "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), + } + ) assert_frame_equal(result, expected) assert_frame_equal(df, original) # change all columns - dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str}) - assert_frame_equal(df.astype(dt3), - df.astype(str)) + dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) + assert_frame_equal(df.astype(dt3), df.astype(str)) assert_frame_equal(df, original) # error should be raised when using something other than column labels # in the keys of the dtype dict - dt4 = dtype_class({'b': str, 2: str}) - dt5 = dtype_class({'e': str}) - msg = ("Only a column name can be used for the key in a dtype mappings" - " argument") + dt4 = dtype_class({"b": str, 2: str}) + dt5 = dtype_class({"e": str}) + msg = ( + "Only a column name can be used for the key in a dtype mappings" " argument" + ) with pytest.raises(KeyError, match=msg): df.astype(dt4) with pytest.raises(KeyError, match=msg): @@ -638,117 +711,125 @@ def test_astype_dict_like(self, dtype_class): assert_frame_equal(df, original) def test_astype_duplicate_col(self): - a1 = Series([1, 2, 3, 4, 5], name='a') - b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b') - a2 = Series([0, 1, 2, 3, 4], name='a') + a1 = Series([1, 2, 3, 4, 5], name="a") + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") + a2 = Series([0, 1, 2, 3, 4], name="a") df = concat([a1, b, a2], axis=1) result = df.astype(str) - a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a') - b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str, - name='b') - a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a') + a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") + b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") expected = concat([a1_str, b_str, a2_str], axis=1) assert_frame_equal(result, expected) - result = df.astype({'a': 'str'}) + result = df.astype({"a": "str"}) expected = concat([a1_str, b, a2_str], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - 'category', - CategoricalDtype(), - CategoricalDtype(ordered=True), - CategoricalDtype(ordered=False), - CategoricalDtype(categories=list('abcdef')), - CategoricalDtype(categories=list('edba'), ordered=False), - CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr) + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list("abcdef")), + CategoricalDtype(categories=list("edba"), ordered=False), + CategoricalDtype(categories=list("edcb"), ordered=True), + ], + ids=repr, + ) def test_astype_categorical(self, dtype): # GH 18099 - d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')} + d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} df = DataFrame(d) result = df.astype(dtype) expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) tm.assert_frame_equal(result, 
expected) - @pytest.mark.parametrize("cls", [ - pd.api.types.CategoricalDtype, - pd.api.types.DatetimeTZDtype, - pd.api.types.IntervalDtype - ]) + @pytest.mark.parametrize( + "cls", + [ + pd.api.types.CategoricalDtype, + pd.api.types.DatetimeTZDtype, + pd.api.types.IntervalDtype, + ], + ) def test_astype_categoricaldtype_class_raises(self, cls): - df = DataFrame({"A": ['a', 'a', 'b', 'c']}) + df = DataFrame({"A": ["a", "a", "b", "c"]}) xpr = "Expected an instance of {}".format(cls.__name__) with pytest.raises(TypeError, match=xpr): df.astype({"A": cls}) with pytest.raises(TypeError, match=xpr): - df['A'].astype(cls) + df["A"].astype(cls) - @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16']) + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes(self, dtype): # GH 22578 - df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b']) - - expected1 = pd.DataFrame({'a': integer_array([1, 3, 5], - dtype=dtype), - 'b': integer_array([2, 4, 6], - dtype=dtype)}) + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + + expected1 = pd.DataFrame( + { + "a": integer_array([1, 3, 5], dtype=dtype), + "b": integer_array([2, 4, 6], dtype=dtype), + } + ) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) - tm.assert_frame_equal(df.astype(dtype).astype('float64'), df) - - df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b']) - df['b'] = df['b'].astype(dtype) - expected2 = pd.DataFrame({'a': [1., 3., 5.], - 'b': integer_array([2, 4, 6], - dtype=dtype)}) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) + + df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + df["b"] = df["b"].astype(dtype) + expected2 = pd.DataFrame( + {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)} + ) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16']) + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) def test_astype_extension_dtypes_1d(self, dtype): # GH 22578 - df = pd.DataFrame({'a': [1., 2., 3.]}) + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) - expected1 = pd.DataFrame({'a': integer_array([1, 2, 3], - dtype=dtype)}) + expected1 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - df = pd.DataFrame({'a': [1., 2., 3.]}) - df['a'] = df['a'].astype(dtype) - expected2 = pd.DataFrame({'a': integer_array([1, 2, 3], - dtype=dtype)}) + df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) + df["a"] = df["a"].astype(dtype) + expected2 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) tm.assert_frame_equal(df, expected2) tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - @pytest.mark.parametrize("dtype", ['category', 'Int64']) + @pytest.mark.parametrize("dtype", ["category", "Int64"]) def test_astype_extension_dtypes_duplicate_col(self, dtype): # GH 24704 - a1 = Series([0, 
np.nan, 4], name='a') - a2 = Series([np.nan, 3, 5], name='a') + a1 = Series([0, np.nan, 4], name="a") + a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - {100: 'float64', 200: 'uint64'}, 'category', 'float64']) + @pytest.mark.parametrize( + "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] + ) def test_astype_column_metadata(self, dtype): # GH 19920 - columns = pd.UInt64Index([100, 200, 300], name='foo') + columns = pd.UInt64Index([100, 200, 300], name="foo") df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) df = df.astype(dtype) tm.assert_index_equal(df.columns, columns) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_from_datetimelike_to_objectt(self, dtype, unit): # tests astype to object dtype # gh-19223 / gh-12425 @@ -758,14 +839,14 @@ def test_astype_from_datetimelike_to_objectt(self, dtype, unit): result = df.astype(object) assert (result.dtypes == object).all() - if dtype.startswith('M8'): + if dtype.startswith("M8"): assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit) else: assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units from numeric origination # gh-19223 / gh-12425 @@ -777,7 +858,7 @@ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination # gh-19223 @@ -789,7 +870,7 @@ def test_astype_to_datetime_unit(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns']) + @pytest.mark.parametrize("unit", ["ns"]) def test_astype_to_timedelta_unit_ns(self, unit): # preserver the timedelta conversion # gh-19223 @@ -801,7 +882,7 @@ def test_astype_to_timedelta_unit_ns(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float # gh-19223 @@ -813,7 +894,7 @@ def test_astype_to_timedelta_unit(self, unit): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_incorrect_datetimelike(self, unit): # trying to astype a m to a M, or vice-versa # gh-19224 @@ -821,43 +902,58 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = "m8[{}]".format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[timedelta64\[{}\]\]").format(unit) + msg = ( + r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" + r" 
\[timedelta64\[{}\]\]" + ).format(unit) with pytest.raises(TypeError, match=msg): df.astype(other) - msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[datetime64\[{}\]\]").format(unit) + msg = ( + r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" + r" \[datetime64\[{}\]\]" + ).format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) def test_timedeltas(self): - df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, - freq='D')), - B=Series([timedelta(days=i) for i in range(3)]))) + df = DataFrame( + dict( + A=Series(date_range("2012-1-1", periods=3, freq="D")), + B=Series([timedelta(days=i) for i in range(3)]), + ) + ) result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]')], - index=list("AB")) + expected = Series( + [np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")], index=list("AB") + ) assert_series_equal(result, expected) - df['C'] = df['A'] + df['B'] + df["C"] = df["A"] + df["B"] result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]'), - np.dtype('datetime64[ns]')], - index=list("ABC")) + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + ], + index=list("ABC"), + ) assert_series_equal(result, expected) # mixed int types - df['D'] = 1 + df["D"] = 1 result = df.dtypes - expected = Series([np.dtype('datetime64[ns]'), - np.dtype('timedelta64[ns]'), - np.dtype('datetime64[ns]'), - np.dtype('int64')], - index=list("ABCD")) + expected = Series( + [ + np.dtype("datetime64[ns]"), + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + np.dtype("int64"), + ], + index=list("ABCD"), + ) assert_series_equal(result, expected) def test_arg_for_errors_in_astype(self): @@ -868,145 +964,208 @@ def test_arg_for_errors_in_astype(self): with pytest.raises(ValueError): df.astype(np.float64, errors=True) - df.astype(np.int8, errors='ignore') + df.astype(np.int8, errors="ignore") def test_arg_for_errors_in_astype_dictlist(self): # GH-25905 - df = pd.DataFrame([ - {'a': '1', 'b': '16.5%', 'c': 'test'}, - {'a': '2.2', 'b': '15.3', 'c': 'another_test'}]) - expected = pd.DataFrame([ - {'a': 1.0, 'b': '16.5%', 'c': 'test'}, - {'a': 2.2, 'b': '15.3', 'c': 'another_test'}]) - type_dict = {'a': 'float64', 'b': 'float64', 'c': 'object'} - - result = df.astype(dtype=type_dict, errors='ignore') + df = pd.DataFrame( + [ + {"a": "1", "b": "16.5%", "c": "test"}, + {"a": "2.2", "b": "15.3", "c": "another_test"}, + ] + ) + expected = pd.DataFrame( + [ + {"a": 1.0, "b": "16.5%", "c": "test"}, + {"a": 2.2, "b": "15.3", "c": "another_test"}, + ] + ) + type_dict = {"a": "float64", "b": "float64", "c": "object"} + + result = df.astype(dtype=type_dict, errors="ignore") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('input_vals', [ - ([1, 2]), - (['1', '2']), - (list(pd.date_range('1/1/2011', periods=2, freq='H'))), - (list(pd.date_range('1/1/2011', periods=2, freq='H', - tz='US/Eastern'))), - ([pd.Interval(left=0, right=5)]), - ]) + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - 
result = DataFrame({'A': input_vals}, dtype=string_dtype) - expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) + result = DataFrame({"A": input_vals}, dtype=string_dtype) + expected = DataFrame({"A": input_vals}).astype({"A": string_dtype}) assert_frame_equal(result, expected) def test_constructor_list_str_na(self, string_dtype): result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) - expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) assert_frame_equal(result, expected) - @pytest.mark.parametrize("data, expected", [ - # empty - (DataFrame(), True), - # multi-same - (DataFrame({"A": [1, 2], "B": [1, 2]}), True), - # multi-object - (DataFrame({"A": np.array([1, 2], dtype=object), - "B": np.array(["a", "b"], dtype=object)}), True), - # multi-extension - (DataFrame({"A": pd.Categorical(['a', 'b']), - "B": pd.Categorical(['a', 'b'])}), True), - # differ types - (DataFrame({"A": [1, 2], "B": [1., 2.]}), False), - # differ sizes - (DataFrame({"A": np.array([1, 2], dtype=np.int32), - "B": np.array([1, 2], dtype=np.int64)}), False), - # multi-extension differ - (DataFrame({"A": pd.Categorical(['a', 'b']), - "B": pd.Categorical(['b', 'c'])}), False), - - ]) + @pytest.mark.parametrize( + "data, expected", + [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + ( + DataFrame( + { + "A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object), + } + ), + True, + ), + # multi-extension + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["a", "b"])} + ), + True, + ), + # differ types + (DataFrame({"A": [1, 2], "B": [1.0, 2.0]}), False), + # differ sizes + ( + DataFrame( + { + "A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64), + } + ), + False, + ), + # multi-extension differ + ( + DataFrame( + {"A": pd.Categorical(["a", "b"]), "B": pd.Categorical(["b", "c"])} + ), + False, + ), + ], + ) def test_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected def test_asarray_homogenous(self): - df = pd.DataFrame({"A": pd.Categorical([1, 2]), - "B": pd.Categorical([1, 2])}) + df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) result = np.asarray(df) # may change from object in the future - expected = np.array([[1, 1], [2, 2]], dtype='object') + expected = np.array([[1, 1], [2, 2]], dtype="object") tm.assert_numpy_array_equal(result, expected) class TestDataFrameDatetimeWithTZ: - def test_interleave(self, timezone_frame): # interleave with object - result = timezone_frame.assign(D='foo').values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', tz='CET')], - ['foo', 'foo', 'foo']], dtype=object).T + result = timezone_frame.assign(D="foo").values + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 
00:00:00+0100", tz="CET"), + ], + ["foo", "foo", "foo"], + ], + dtype=object, + ).T tm.assert_numpy_array_equal(result, expected) # interleave with only datetime64[ns] result = timezone_frame.values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', - tz='CET')]], dtype=object).T + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T tm.assert_numpy_array_equal(result, expected) def test_astype(self, timezone_frame): # astype - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', - tz='CET')]], - dtype=object).T - expected = DataFrame(expected, - index=timezone_frame.index, - columns=timezone_frame.columns, dtype=object) + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + pd.NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + pd.NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + expected = DataFrame( + expected, + index=timezone_frame.index, + columns=timezone_frame.columns, + dtype=object, + ) result = timezone_frame.astype(object) assert_frame_equal(result, expected) - result = timezone_frame.astype('datetime64[ns]') - expected = DataFrame({'A': date_range('20130101', periods=3), - 'B': (date_range('20130101', periods=3, - tz='US/Eastern') - .tz_convert('UTC') - .tz_localize(None)), - 'C': (date_range('20130101', periods=3, - tz='CET') - .tz_convert('UTC') - .tz_localize(None))}) + result = timezone_frame.astype("datetime64[ns]") + expected = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": ( + date_range("20130101", periods=3, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ), + "C": ( + date_range("20130101", periods=3, tz="CET") + .tz_convert("UTC") + .tz_localize(None) + ), + } + ) expected.iloc[1, 1] = pd.NaT expected.iloc[1, 2] = pd.NaT assert_frame_equal(result, expected) @@ -1014,19 +1173,32 @@ def test_astype(self, timezone_frame): def test_astype_str(self, timezone_frame): # str formatting result = timezone_frame.astype(str) - expected = DataFrame([['2013-01-01', '2013-01-01 00:00:00-05:00', - '2013-01-01 00:00:00+01:00'], - ['2013-01-02', 'NaT', 'NaT'], - ['2013-01-03', '2013-01-03 00:00:00-05:00', - '2013-01-03 00:00:00+01:00']], - columns=timezone_frame.columns) + expected = DataFrame( + [ + [ + "2013-01-01", + "2013-01-01 00:00:00-05:00", + "2013-01-01 00:00:00+01:00", + ], + ["2013-01-02", "NaT", "NaT"], + 
[ + "2013-01-03", + "2013-01-03 00:00:00-05:00", + "2013-01-03 00:00:00+01:00", + ], + ], + columns=timezone_frame.columns, + ) tm.assert_frame_equal(result, expected) - with option_context('display.max_columns', 20): + with option_context("display.max_columns", 20): result = str(timezone_frame) - assert ('0 2013-01-01 2013-01-01 00:00:00-05:00 ' - '2013-01-01 00:00:00+01:00') in result - assert ('1 2013-01-02 ' - 'NaT NaT') in result - assert ('2 2013-01-03 2013-01-03 00:00:00-05:00 ' - '2013-01-03 00:00:00+01:00') in result + assert ( + "0 2013-01-01 2013-01-01 00:00:00-05:00 " "2013-01-01 00:00:00+01:00" + ) in result + assert ( + "1 2013-01-02 " "NaT NaT" + ) in result + assert ( + "2 2013-01-03 2013-01-03 00:00:00-05:00 " "2013-01-03 00:00:00+01:00" + ) in result diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 703d273b3ca24..0ea24777ae1f5 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -5,12 +5,10 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']]) +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) def test_duplicated_with_misspelled_column_name(subset): # GH 19730 - df = DataFrame({'A': [0, 0, 1], - 'B': [0, 0, 1], - 'C': [0, 0, 1]}) + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) with pytest.raises(KeyError): df.duplicated(subset) @@ -24,8 +22,9 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): # gh-21524 # Given the wide dataframe with a lot of columns # with different (important!) values - data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) - for i in range(100)} + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } df = DataFrame(data).T result = df.duplicated() @@ -36,37 +35,47 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): assert result.dtype == np.bool -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_keep(keep, expected): - df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_nan_none(keep, expected): - df = DataFrame({'C': [np.nan, 3, 3, None, np.nan]}, dtype=object) + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) result = df.duplicated(keep=keep) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('keep', ['first', 'last', False]) -@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) 
+@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) def test_duplicated_subset(subset, keep): - df = DataFrame({'A': [0, 1, 1, 2, 0], - 'B': ['a', 'b', 'b', 'c', 'a'], - 'C': [np.nan, 3, 3, None, np.nan]}) + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) if subset is None: subset = list(df.columns) @@ -81,77 +90,77 @@ def test_duplicated_subset(subset, keep): def test_drop_duplicates(): - df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('AAA') + result = df.drop_duplicates("AAA") expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep='last') + result = df.drop_duplicates("AAA", keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep=False) + result = df.drop_duplicates("AAA", keep=False) expected = df.loc[[]] tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates(np.array(['AAA', 'B'])) + result = df.drop_duplicates(np.array(["AAA", "B"])) tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B']) + result = df.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AAA', 'B'), keep='last') + result = df.drop_duplicates(("AAA", "B"), keep="last") expected = df.loc[[0, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AAA', 'B'), keep=False) + result = df.drop_duplicates(("AAA", "B"), keep=False) expected = df.loc[[0]] tm.assert_frame_equal(result, expected) # consider everything - df2 = df.loc[:, ['AAA', 'B', 'C']] + df2 = df.loc[:, ["AAA", "B", "C"]] result = df2.drop_duplicates() # in this case only - expected = df2.drop_duplicates(['AAA', 'B']) + expected = df2.drop_duplicates(["AAA", "B"]) tm.assert_frame_equal(result, expected) - result = df2.drop_duplicates(keep='last') - expected = df2.drop_duplicates(['AAA', 'B'], keep='last') + result = df2.drop_duplicates(keep="last") + expected = df2.drop_duplicates(["AAA", "B"], keep="last") tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) - expected = df2.drop_duplicates(['AAA', 'B'], keep=False) + expected = df2.drop_duplicates(["AAA", "B"], keep=False) tm.assert_frame_equal(result, expected) # integers - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) - df['E'] = df['C'].astype('int8') - result = df.drop_duplicates('E') + df["E"] = df["C"].astype("int8") + result = df.drop_duplicates("E") expected = df.iloc[[0, 2]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('E', keep='last') + result = df.drop_duplicates("E", keep="last") expected = df.iloc[[-2, -1]] tm.assert_frame_equal(result, expected) # GH 
11376 - df = DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], - 'y': [0, 6, 5, 5, 9, 1, 2]}) + df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] tm.assert_frame_equal(df.drop_duplicates(), expected) @@ -172,15 +181,15 @@ def test_drop_duplicates(): df = DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) - for keep in ['first', 'last', False]: + for keep in ["first", "last", False]: assert df.duplicated(keep=keep).sum() == 0 def test_duplicated_on_empty_frame(): # GH 25184 - df = DataFrame(columns=['a', 'b']) - dupes = df.duplicated('a') + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") result = df[dupes] expected = df.copy() @@ -189,90 +198,91 @@ def test_duplicated_on_empty_frame(): def test_drop_duplicates_with_duplicate_column_names(): # GH17836 - df = DataFrame([ - [1, 2, 5], - [3, 4, 6], - [3, 4, 7] - ], columns=['a', 'a', 'b']) + df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) result0 = df.drop_duplicates() tm.assert_frame_equal(result0, df) - result1 = df.drop_duplicates('a') + result1 = df.drop_duplicates("a") expected1 = df[:2] tm.assert_frame_equal(result1, expected1) def test_drop_duplicates_for_take_all(): - df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', - 'foo', 'bar', 'qux', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('AAA') + result = df.drop_duplicates("AAA") expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep='last') + result = df.drop_duplicates("AAA", keep="last") expected = df.iloc[[2, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('AAA', keep=False) + result = df.drop_duplicates("AAA", keep=False) expected = df.iloc[[2, 6]] tm.assert_frame_equal(result, expected) # multiple columns - result = df.drop_duplicates(['AAA', 'B']) + result = df.drop_duplicates(["AAA", "B"]) expected = df.iloc[[0, 1, 2, 3, 4, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B'], keep='last') + result = df.drop_duplicates(["AAA", "B"], keep="last") expected = df.iloc[[0, 1, 2, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['AAA', 'B'], keep=False) + result = df.drop_duplicates(["AAA", "B"], keep=False) expected = df.iloc[[0, 1, 2, 6]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_tuple(): - df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + df = DataFrame( + { + ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column - result = df.drop_duplicates(('AA', 'AB')) + result = df.drop_duplicates(("AA", "AB")) expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(('AA', 'AB'), keep='last') + result = df.drop_duplicates(("AA", "AB"), keep="last") expected = df.loc[[6, 7]] tm.assert_frame_equal(result, 
expected) - result = df.drop_duplicates(('AA', 'AB'), keep=False) + result = df.drop_duplicates(("AA", "AB"), keep=False) expected = df.loc[[]] # empty df assert len(result) == 0 tm.assert_frame_equal(result, expected) # multi column expected = df.loc[[0, 1, 2, 3]] - result = df.drop_duplicates((('AA', 'AB'), 'B')) + result = df.drop_duplicates((("AA", "AB"), "B")) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('df', [ - DataFrame(), - DataFrame(columns=[]), - DataFrame(columns=['A', 'B', 'C']), - DataFrame(index=[]), - DataFrame(index=['A', 'B', 'C']) -]) +@pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=[]), + DataFrame(columns=["A", "B", "C"]), + DataFrame(index=[]), + DataFrame(index=["A", "B", "C"]), + ], +) def test_drop_duplicates_empty(df): # GH 20516 result = df.drop_duplicates() @@ -285,134 +295,140 @@ def test_drop_duplicates_empty(df): def test_drop_duplicates_NA(): # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': range(8), - }) + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('A') + result = df.drop_duplicates("A") expected = df.loc[[0, 2, 3]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep='last') + result = df.drop_duplicates("A", keep="last") expected = df.loc[[1, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep=False) + result = df.drop_duplicates("A", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column - result = df.drop_duplicates(['A', 'B']) + result = df.drop_duplicates(["A", "B"]) expected = df.loc[[0, 2, 3, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['A', 'B'], keep='last') + result = df.drop_duplicates(["A", "B"], keep="last") expected = df.loc[[1, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['A', 'B'], keep=False) + result = df.drop_duplicates(["A", "B"], keep=False) expected = df.loc[[6]] tm.assert_frame_equal(result, expected) # nan - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], - 'D': range(8), - }) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], + "D": range(8), + } + ) # single column - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df[:2] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.loc[[3, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep=False) + result = df.drop_duplicates("C", keep=False) expected = df.loc[[]] # empty df tm.assert_frame_equal(result, expected) assert len(result) == 0 # multi column - result = df.drop_duplicates(['C', 'B']) + result = df.drop_duplicates(["C", "B"]) expected = df.loc[[0, 1, 2, 4]] tm.assert_frame_equal(result, 
expected) - result = df.drop_duplicates(['C', 'B'], keep='last') + result = df.drop_duplicates(["C", "B"], keep="last") expected = df.loc[[1, 3, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates(['C', 'B'], keep=False) + result = df.drop_duplicates(["C", "B"], keep=False) expected = df.loc[[1]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_NA_for_take_all(): # none - df = DataFrame({'A': [None, None, 'foo', 'bar', - 'foo', 'baz', 'bar', 'qux'], - 'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]}) + df = DataFrame( + { + "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], + "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], + } + ) # single column - result = df.drop_duplicates('A') + result = df.drop_duplicates("A") expected = df.iloc[[0, 2, 3, 5, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep='last') + result = df.drop_duplicates("A", keep="last") expected = df.iloc[[1, 4, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('A', keep=False) + result = df.drop_duplicates("A", keep=False) expected = df.iloc[[5, 7]] tm.assert_frame_equal(result, expected) # nan # single column - result = df.drop_duplicates('C') + result = df.drop_duplicates("C") expected = df.iloc[[0, 1, 5, 6]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep='last') + result = df.drop_duplicates("C", keep="last") expected = df.iloc[[3, 5, 6, 7]] tm.assert_frame_equal(result, expected) - result = df.drop_duplicates('C', keep=False) + result = df.drop_duplicates("C", keep=False) expected = df.iloc[[5, 6]] tm.assert_frame_equal(result, expected) def test_drop_duplicates_inplace(): - orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'bar', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': [1, 1, 2, 2, 2, 2, 1, 2], - 'D': range(8), - }) + orig = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": [1, 1, 2, 2, 2, 2, 1, 2], + "D": range(8), + } + ) # single column df = orig.copy() - df.drop_duplicates('A', inplace=True) + df.drop_duplicates("A", inplace=True) expected = orig[:2] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates('A', keep='last', inplace=True) + df.drop_duplicates("A", keep="last", inplace=True) expected = orig.loc[[6, 7]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates('A', keep=False, inplace=True) + df.drop_duplicates("A", keep=False, inplace=True) expected = orig.loc[[]] result = df tm.assert_frame_equal(result, expected) @@ -420,41 +436,41 @@ def test_drop_duplicates_inplace(): # multi column df = orig.copy() - df.drop_duplicates(['A', 'B'], inplace=True) + df.drop_duplicates(["A", "B"], inplace=True) expected = orig.loc[[0, 1, 2, 3]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates(['A', 'B'], keep='last', inplace=True) + df.drop_duplicates(["A", "B"], keep="last", inplace=True) expected = orig.loc[[0, 5, 6, 7]] result = df tm.assert_frame_equal(result, expected) df = orig.copy() - df.drop_duplicates(['A', 'B'], keep=False, inplace=True) + df.drop_duplicates(["A", "B"], keep=False, inplace=True) expected = orig.loc[[0]] result = df tm.assert_frame_equal(result, expected) # consider everything - orig2 = orig.loc[:, ['A', 'B', 'C']].copy() + orig2 = orig.loc[:, ["A", "B", "C"]].copy() df2 = 
orig2.copy() df2.drop_duplicates(inplace=True) # in this case only - expected = orig2.drop_duplicates(['A', 'B']) + expected = orig2.drop_duplicates(["A", "B"]) result = df2 tm.assert_frame_equal(result, expected) df2 = orig2.copy() - df2.drop_duplicates(keep='last', inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep='last') + df2.drop_duplicates(keep="last", inplace=True) + expected = orig2.drop_duplicates(["A", "B"], keep="last") result = df2 tm.assert_frame_equal(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep=False, inplace=True) - expected = orig2.drop_duplicates(['A', 'B'], keep=False) + expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index f8af942f67657..c2d38b2938fca 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -11,20 +11,31 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - Timestamp, date_range, isna, notna) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + notna, +) import pandas.core.common as com from pandas.core.indexing import IndexingError from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.tseries.offsets import BDay class TestDataFrameIndexing(TestData): - def test_getitem(self, float_frame): # Slicing sl = float_frame[:20] @@ -38,70 +49,81 @@ def test_getitem(self, float_frame): for key, _ in float_frame._series.items(): assert float_frame[key] is not None - assert 'random' not in float_frame - with pytest.raises(KeyError, match='random'): - float_frame['random'] + assert "random" not in float_frame + with pytest.raises(KeyError, match="random"): + float_frame["random"] df = float_frame.copy() - df['$10'] = np.random.randn(len(df)) + df["$10"] = np.random.randn(len(df)) ad = np.random.randn(len(df)) - df['@awesome_domain'] = ad + df["@awesome_domain"] = ad with pytest.raises(KeyError): df.__getitem__('df["$10"]') - res = df['@awesome_domain'] + res = df["@awesome_domain"] tm.assert_numpy_array_equal(ad, res.values) def test_getitem_dupe_cols(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) with pytest.raises(KeyError): - df[['baf']] + df[["baf"]] def test_get(self, float_frame): - b = float_frame.get('B') - assert_series_equal(b, float_frame['B']) - - assert float_frame.get('foo') is None - assert_series_equal(float_frame.get('foo', float_frame['B']), - float_frame['B']) - - @pytest.mark.parametrize("df", [ - DataFrame(), - DataFrame(columns=list("AB")), - DataFrame(columns=list("AB"), index=range(3)) - ]) + b = float_frame.get("B") + assert_series_equal(b, float_frame["B"]) + + assert float_frame.get("foo") is None + assert_series_equal(float_frame.get("foo", float_frame["B"]), float_frame["B"]) + + @pytest.mark.parametrize( + "df", + [ + DataFrame(), + DataFrame(columns=list("AB")), + DataFrame(columns=list("AB"), index=range(3)), + ], + ) def test_get_none(self, df): # see gh-5652 assert df.get(None) is None - @pytest.mark.parametrize('key_type', [iter, np.array, Series, Index]) + @pytest.mark.parametrize("key_type", [iter, 
np.array, Series, Index]) def test_loc_iterable(self, float_frame, key_type): - idx = key_type(['A', 'B', 'C']) + idx = key_type(["A", "B", "C"]) result = float_frame.loc[:, idx] - expected = float_frame.loc[:, ['A', 'B', 'C']] + expected = float_frame.loc[:, ["A", "B", "C"]] assert_frame_equal(result, expected) @pytest.mark.parametrize( "idx_type", - [list, iter, Index, set, - lambda l: dict(zip(l, range(len(l)))), - lambda l: dict(zip(l, range(len(l)))).keys()], - ids=["list", "iter", "Index", "set", "dict", "dict_keys"]) + [ + list, + iter, + Index, + set, + lambda l: dict(zip(l, range(len(l)))), + lambda l: dict(zip(l, range(len(l)))).keys(), + ], + ids=["list", "iter", "Index", "set", "dict", "dict_keys"], + ) @pytest.mark.parametrize("levels", [1, 2]) def test_getitem_listlike(self, idx_type, levels, float_frame): # GH 21294 if levels == 1: - frame, missing = float_frame, 'food' + frame, missing = float_frame, "food" else: # MultiIndex columns - frame = DataFrame(np.random.randn(8, 3), - columns=Index([('foo', 'bar'), ('baz', 'qux'), - ('peek', 'aboo')], - name=('sth', 'sth2'))) - missing = ('good', 'food') + frame = DataFrame( + np.random.randn(8, 3), + columns=Index( + [("foo", "bar"), ("baz", "qux"), ("peek", "aboo")], + name=("sth", "sth2"), + ), + ) + missing = ("good", "food") keys = [frame.columns[1], frame.columns[0]] idx = idx_type(keys) @@ -115,16 +137,15 @@ def test_getitem_listlike(self, idx_type, levels, float_frame): assert_frame_equal(result, expected) idx = idx_type(keys + [missing]) - with pytest.raises(KeyError, match='not in index'): + with pytest.raises(KeyError, match="not in index"): frame[idx] - @pytest.mark.parametrize("val,expected", [ - (2**63 - 1, Series([1])), - (2**63, Series([2])), - ]) + @pytest.mark.parametrize( + "val,expected", [(2 ** 63 - 1, Series([1])), (2 ** 63, Series([2]))] + ) def test_loc_uint64(self, val, expected): # see gh-19399 - df = DataFrame([1, 2], index=[2**63 - 1, 2**63]) + df = DataFrame([1, 2], index=[2 ** 63 - 1, 2 ** 63]) result = df.loc[val] expected.name = val @@ -132,11 +153,11 @@ def test_loc_uint64(self, val, expected): def test_getitem_callable(self, float_frame): # GH 12533 - result = float_frame[lambda x: 'A'] - tm.assert_series_equal(result, float_frame.loc[:, 'A']) + result = float_frame[lambda x: "A"] + tm.assert_series_equal(result, float_frame.loc[:, "A"]) - result = float_frame[lambda x: ['A', 'B']] - tm.assert_frame_equal(result, float_frame.loc[:, ['A', 'B']]) + result = float_frame[lambda x: ["A", "B"]] + tm.assert_frame_equal(result, float_frame.loc[:, ["A", "B"]]) df = float_frame[:3] result = df[lambda x: [True, False, True]] @@ -144,79 +165,77 @@ def test_getitem_callable(self, float_frame): def test_setitem_list(self, float_frame): - float_frame['E'] = 'foo' - data = float_frame[['A', 'B']] - float_frame[['B', 'A']] = data + float_frame["E"] = "foo" + data = float_frame[["A", "B"]] + float_frame[["B", "A"]] = data - assert_series_equal(float_frame['B'], data['A'], check_names=False) - assert_series_equal(float_frame['A'], data['B'], check_names=False) + assert_series_equal(float_frame["B"], data["A"], check_names=False) + assert_series_equal(float_frame["A"], data["B"], check_names=False) - msg = 'Columns must be same length as key' + msg = "Columns must be same length as key" with pytest.raises(ValueError, match=msg): - data[['A']] = float_frame[['A', 'B']] + data[["A"]] = float_frame[["A", "B"]] - msg = 'Length of values does not match length of index' + msg = "Length of values does not match length of 
index" with pytest.raises(ValueError, match=msg): - data['A'] = range(len(data.index) - 1) + data["A"] = range(len(data.index) - 1) - df = DataFrame(0, index=range(3), columns=['tt1', 'tt2'], - dtype=np.int_) - df.loc[1, ['tt1', 'tt2']] = [1, 2] + df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) + df.loc[1, ["tt1", "tt2"]] = [1, 2] - result = df.loc[df.index[1], ['tt1', 'tt2']] + result = df.loc[df.index[1], ["tt1", "tt2"]] expected = Series([1, 2], df.columns, dtype=np.int_, name=1) assert_series_equal(result, expected) - df['tt1'] = df['tt2'] = '0' - df.loc[df.index[1], ['tt1', 'tt2']] = ['1', '2'] - result = df.loc[df.index[1], ['tt1', 'tt2']] - expected = Series(['1', '2'], df.columns, name=1) + df["tt1"] = df["tt2"] = "0" + df.loc[df.index[1], ["tt1", "tt2"]] = ["1", "2"] + result = df.loc[df.index[1], ["tt1", "tt2"]] + expected = Series(["1", "2"], df.columns, name=1) assert_series_equal(result, expected) def test_setitem_list_not_dataframe(self, float_frame): data = np.random.randn(len(float_frame), 2) - float_frame[['A', 'B']] = data - assert_almost_equal(float_frame[['A', 'B']].values, data) + float_frame[["A", "B"]] = data + assert_almost_equal(float_frame[["A", "B"]].values, data) def test_setitem_list_of_tuples(self, float_frame): - tuples = list(zip(float_frame['A'], float_frame['B'])) - float_frame['tuples'] = tuples + tuples = list(zip(float_frame["A"], float_frame["B"])) + float_frame["tuples"] = tuples - result = float_frame['tuples'] - expected = Series(tuples, index=float_frame.index, name='tuples') + result = float_frame["tuples"] + expected = Series(tuples, index=float_frame.index, name="tuples") assert_series_equal(result, expected) def test_setitem_mulit_index(self): # GH7655, test that assigning to a sub-frame of a frame # with multi-index columns aligns both rows and columns - it = ['jim', 'joe', 'jolie'], ['first', 'last'], \ - ['left', 'center', 'right'] + it = ["jim", "joe", "jolie"], ["first", "last"], ["left", "center", "right"] cols = MultiIndex.from_product(it) - index = pd.date_range('20141006', periods=20) + index = pd.date_range("20141006", periods=20) vals = np.random.randint(1, 1000, (len(index), len(cols))) df = pd.DataFrame(vals, columns=cols, index=index) i, j = df.index.values.copy(), it[-1][:] np.random.shuffle(i) - df['jim'] = df['jolie'].loc[i, ::-1] - assert_frame_equal(df['jim'], df['jolie']) + df["jim"] = df["jolie"].loc[i, ::-1] + assert_frame_equal(df["jim"], df["jolie"]) np.random.shuffle(j) - df[('joe', 'first')] = df[('jolie', 'last')].loc[i, j] - assert_frame_equal(df[('joe', 'first')], df[('jolie', 'last')]) + df[("joe", "first")] = df[("jolie", "last")].loc[i, j] + assert_frame_equal(df[("joe", "first")], df[("jolie", "last")]) np.random.shuffle(j) - df[('joe', 'last')] = df[('jolie', 'first')].loc[i, j] - assert_frame_equal(df[('joe', 'last')], df[('jolie', 'first')]) + df[("joe", "last")] = df[("jolie", "first")].loc[i, j] + assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) def test_setitem_callable(self): # GH 12533 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}) - df[lambda x: 'A'] = [11, 12, 13, 14] + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}) + df[lambda x: "A"] = [11, 12, 13, 14] - exp = pd.DataFrame({'A': [11, 12, 13, 14], 'B': [5, 6, 7, 8]}) + exp = pd.DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) def test_setitem_other_callable(self): @@ -230,8 +249,9 @@ def inc(x): expected = pd.DataFrame([[-1, inc], [inc, -1]]) 
tm.assert_frame_equal(df, expected) - def test_getitem_boolean(self, float_string_frame, mixed_float_frame, - mixed_int_frame, datetime_frame): + def test_getitem_boolean( + self, float_string_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): # boolean indexing d = datetime_frame.index[10] indexer = datetime_frame.index > d @@ -241,13 +261,13 @@ def test_getitem_boolean(self, float_string_frame, mixed_float_frame, subframe = datetime_frame[indexer] tm.assert_index_equal(subindex, subframe.index) - with pytest.raises(ValueError, match='Item wrong length'): + with pytest.raises(ValueError, match="Item wrong length"): datetime_frame[indexer[:-1]] subframe_obj = datetime_frame[indexer_obj] assert_frame_equal(subframe_obj, subframe) - with pytest.raises(ValueError, match='boolean values only'): + with pytest.raises(ValueError, match="boolean values only"): datetime_frame[datetime_frame] # test that Series work @@ -266,16 +286,22 @@ def test_getitem_boolean(self, float_string_frame, mixed_float_frame, assert_frame_equal(subframe_obj, subframe) # test df[df > 0] - for df in [datetime_frame, float_string_frame, - mixed_float_frame, mixed_int_frame]: + for df in [ + datetime_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: continue data = df._get_numeric_data() bif = df[df > 0] - bifw = DataFrame({c: np.where(data[c] > 0, data[c], np.nan) - for c in data.columns}, - index=data.index, columns=data.columns) + bifw = DataFrame( + {c: np.where(data[c] > 0, data[c], np.nan) for c in data.columns}, + index=data.index, + columns=data.columns, + ) # add back other columns to compare for c in df.columns: @@ -292,31 +318,35 @@ def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() - df['E'] = 1 - df['E'] = df['E'].astype('int32') - df['E1'] = df['E'].copy() - df['F'] = 1 - df['F'] = df['F'].astype('int64') - df['F1'] = df['F'].copy() + df["E"] = 1 + df["E"] = df["E"].astype("int32") + df["E1"] = df["E"].copy() + df["F"] = 1 + df["F"] = df["F"].astype("int64") + df["F1"] = df["F"].copy() casted = df[df > 0] result = casted.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('int32')] * 2 + - [np.dtype('int64')] * 2, - index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] * 2 + + [np.dtype("int64")] * 2, + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) assert_series_equal(result, expected) # int block splitting - df.loc[df.index[1:3], ['E1', 'F1']] = 0 + df.loc[df.index[1:3], ["E1", "F1"]] = 0 casted = df[df > 0] result = casted.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('int32')] + - [np.dtype('float64')] + - [np.dtype('int64')] + - [np.dtype('float64')], - index=['A', 'B', 'C', 'D', 'E', 'E1', 'F', 'F1']) + expected = Series( + [np.dtype("float64")] * 4 + + [np.dtype("int32")] + + [np.dtype("float64")] + + [np.dtype("int64")] + + [np.dtype("float64")], + index=["A", "B", "C", "D", "E", "E1", "F", "F1"], + ) assert_series_equal(result, expected) # where dtype conversions @@ -344,7 +374,7 @@ def _checkit(lst): def test_getitem_boolean_iadd(self): arr = np.random.randn(5, 5) - df = DataFrame(arr.copy(), columns=['A', 'B', 'C', 'D', 'E']) + df = DataFrame(arr.copy(), columns=["A", "B", "C", "D", "E"]) df[df < 0] += 1 arr[arr < 0] += 1 @@ -353,8 +383,7 @@ def test_getitem_boolean_iadd(self): def test_boolean_index_empty_corner(self): # #2096 - blah = DataFrame(np.empty([0, 
1]), columns=['A'], - index=DatetimeIndex([])) + blah = DataFrame(np.empty([0, 1]), columns=["A"], index=DatetimeIndex([])) # both of these should succeed trivially k = np.array([], bool) @@ -363,8 +392,9 @@ def test_boolean_index_empty_corner(self): blah[k] = 0 def test_getitem_ix_mixed_integer(self): - df = DataFrame(np.random.randn(4, 3), - index=[1, 10, 'C', 'E'], columns=[1, 2, 3]) + df = DataFrame( + np.random.randn(4, 3), index=[1, 10, "C", "E"], columns=[1, 2, 3] + ) result = df.iloc[:-1] expected = df.loc[df.index[:-1]] @@ -377,11 +407,15 @@ def test_getitem_ix_mixed_integer(self): assert_frame_equal(result, expected) # 11320 - df = pd.DataFrame({"rna": (1.5, 2.2, 3.2, 4.5), - -1000: [11, 21, 36, 40], - 0: [10, 22, 43, 34], - 1000: [0, 10, 20, 30]}, - columns=['rna', -1000, 0, 1000]) + df = pd.DataFrame( + { + "rna": (1.5, 2.2, 3.2, 4.5), + -1000: [11, 21, 36, 40], + 0: [10, 22, 43, 34], + 1000: [0, 10, 20, 30], + }, + columns=["rna", -1000, 0, 1000], + ) result = df[[1000]] expected = df.iloc[:, [3]] assert_frame_equal(result, expected) @@ -393,22 +427,22 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, -1] - assert_series_equal(result, float_frame['D']) + assert_series_equal(result, float_frame["D"]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, [-1]] - assert_frame_equal(result, float_frame[['D']]) + assert_frame_equal(result, float_frame[["D"]]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = float_frame.ix[:, [-1, -2]] - assert_frame_equal(result, float_frame[['D', 'C']]) + assert_frame_equal(result, float_frame[["D", "C"]]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) float_frame.ix[:, [-1]] = 0 - assert (float_frame['D'] == 0).all() + assert (float_frame["D"] == 0).all() df = DataFrame(np.random.randn(8, 4)) # ix does label-based indexing when having an integer index @@ -423,8 +457,7 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): df.ix[:, [-1]] # #1942 - a = DataFrame(np.random.randn(20, 2), - index=[chr(x + 65) for x in range(20)]) + a = DataFrame(np.random.randn(20, 2), index=[chr(x + 65) for x in range(20)]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) a.ix[-1] = a.ix[-2] @@ -432,64 +465,64 @@ def test_getitem_setitem_ix_negative_integers(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_series_equal(a.ix[-1], a.ix[-2], check_names=False) - assert a.ix[-1].name == 'T' - assert a.ix[-2].name == 'S' + assert a.ix[-1].name == "T" + assert a.ix[-2].name == "S" def test_getattr(self, float_frame): - assert_series_equal(float_frame.A, float_frame['A']) + assert_series_equal(float_frame.A, float_frame["A"]) msg = "'DataFrame' object has no attribute 'NONEXISTENT_NAME'" with pytest.raises(AttributeError, match=msg): float_frame.NONEXISTENT_NAME def test_setattr_column(self): - df = DataFrame({'foobar': 1}, index=range(10)) + df = DataFrame({"foobar": 1}, index=range(10)) df.foobar = 5 assert (df.foobar == 5).all() def test_setitem(self, float_frame): # not sure what else to do here - series = float_frame['A'][::2] - float_frame['col5'] = series - assert 'col5' in float_frame + series = float_frame["A"][::2] + float_frame["col5"] = series + assert "col5" in float_frame assert len(series) == 15 assert len(float_frame) == 30 exp = 
np.ravel(np.column_stack((series.values, [np.nan] * 15))) - exp = Series(exp, index=float_frame.index, name='col5') - tm.assert_series_equal(float_frame['col5'], exp) + exp = Series(exp, index=float_frame.index, name="col5") + tm.assert_series_equal(float_frame["col5"], exp) - series = float_frame['A'] - float_frame['col6'] = series - tm.assert_series_equal(series, float_frame['col6'], check_names=False) + series = float_frame["A"] + float_frame["col6"] = series + tm.assert_series_equal(series, float_frame["col6"], check_names=False) with pytest.raises(KeyError): float_frame[np.random.randn(len(float_frame) + 1)] = 1 # set ndarray arr = np.random.randn(len(float_frame)) - float_frame['col9'] = arr - assert (float_frame['col9'] == arr).all() + float_frame["col9"] = arr + assert (float_frame["col9"] == arr).all() - float_frame['col7'] = 5 - assert((float_frame['col7'] == 5).all()) + float_frame["col7"] = 5 + assert (float_frame["col7"] == 5).all() - float_frame['col0'] = 3.14 - assert((float_frame['col0'] == 3.14).all()) + float_frame["col0"] = 3.14 + assert (float_frame["col0"] == 3.14).all() - float_frame['col8'] = 'foo' - assert((float_frame['col8'] == 'foo').all()) + float_frame["col8"] = "foo" + assert (float_frame["col8"] == "foo").all() # this is partially a view (e.g. some blocks are view) # so raise/warn smaller = float_frame[:2] with pytest.raises(com.SettingWithCopyError): - smaller['col10'] = ['1', '2'] + smaller["col10"] = ["1", "2"] - assert smaller['col10'].dtype == np.object_ - assert (smaller['col10'] == ['1', '2']).all() + assert smaller["col10"].dtype == np.object_ + assert (smaller["col10"] == ["1", "2"]).all() # dtype changing GH4204 df = DataFrame([[0, 0]]) @@ -509,27 +542,26 @@ def test_setitem_dtype(self, dtype, float_frame): assert float_frame[dtype].dtype.name == dtype def test_setitem_tuple(self, float_frame): - float_frame['A', 'B'] = float_frame['A'] - assert_series_equal(float_frame['A', 'B'], float_frame[ - 'A'], check_names=False) + float_frame["A", "B"] = float_frame["A"] + assert_series_equal(float_frame["A", "B"], float_frame["A"], check_names=False) def test_setitem_always_copy(self, float_frame): - s = float_frame['A'].copy() - float_frame['E'] = s + s = float_frame["A"].copy() + float_frame["E"] = s - float_frame['E'][5:10] = np.nan + float_frame["E"][5:10] = np.nan assert notna(s[5:10]).all() def test_setitem_boolean(self, float_frame): df = float_frame.copy() values = float_frame.values - df[df['A'] > 0] = 4 + df[df["A"] > 0] = 4 values[values[:, 0] > 0] = 4 assert_almost_equal(df.values, values) # test that column reindexing works - series = df['A'] == 4 + series = df["A"] == 4 series = series.reindex(df.index[::-1]) df[series] = 1 values[values[:, 0] == 4] = 1 @@ -572,9 +604,9 @@ def test_setitem_boolean(self, float_frame): @pytest.mark.parametrize( "mask_type", - [lambda df: df > np.abs(df) / 2, - lambda df: (df > np.abs(df) / 2).values], - ids=['dataframe', 'array']) + [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values], + ids=["dataframe", "array"], + ) def test_setitem_boolean_mask(self, mask_type, float_frame): # Test for issue #18582 @@ -590,63 +622,65 @@ def test_setitem_boolean_mask(self, mask_type, float_frame): assert_frame_equal(result, expected) def test_setitem_cast(self, float_frame): - float_frame['D'] = float_frame['D'].astype('i8') - assert float_frame['D'].dtype == np.int64 + float_frame["D"] = float_frame["D"].astype("i8") + assert float_frame["D"].dtype == np.int64 # #669, should not cast? 
# this is now set to int64, which means a replacement of the column to # the value dtype (and nothing to do with the existing dtype) - float_frame['B'] = 0 - assert float_frame['B'].dtype == np.int64 + float_frame["B"] = 0 + assert float_frame["B"].dtype == np.int64 # cast if pass array of course - float_frame['B'] = np.arange(len(float_frame)) - assert issubclass(float_frame['B'].dtype.type, np.integer) + float_frame["B"] = np.arange(len(float_frame)) + assert issubclass(float_frame["B"].dtype.type, np.integer) - float_frame['foo'] = 'bar' - float_frame['foo'] = 0 - assert float_frame['foo'].dtype == np.int64 + float_frame["foo"] = "bar" + float_frame["foo"] = 0 + assert float_frame["foo"].dtype == np.int64 - float_frame['foo'] = 'bar' - float_frame['foo'] = 2.5 - assert float_frame['foo'].dtype == np.float64 + float_frame["foo"] = "bar" + float_frame["foo"] = 2.5 + assert float_frame["foo"].dtype == np.float64 - float_frame['something'] = 0 - assert float_frame['something'].dtype == np.int64 - float_frame['something'] = 2 - assert float_frame['something'].dtype == np.int64 - float_frame['something'] = 2.5 - assert float_frame['something'].dtype == np.float64 + float_frame["something"] = 0 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2 + assert float_frame["something"].dtype == np.int64 + float_frame["something"] = 2.5 + assert float_frame["something"].dtype == np.float64 # GH 7704 # dtype conversion on setting - df = DataFrame(np.random.rand(30, 3), columns=tuple('ABC')) - df['event'] = np.nan - df.loc[10, 'event'] = 'foo' + df = DataFrame(np.random.rand(30, 3), columns=tuple("ABC")) + df["event"] = np.nan + df.loc[10, "event"] = "foo" result = df.dtypes - expected = Series([np.dtype('float64')] * 3 + [np.dtype('object')], - index=['A', 'B', 'C', 'event']) + expected = Series( + [np.dtype("float64")] * 3 + [np.dtype("object")], + index=["A", "B", "C", "event"], + ) assert_series_equal(result, expected) # Test that data type is preserved . #5782 - df = DataFrame({'one': np.arange(6, dtype=np.int8)}) - df.loc[1, 'one'] = 6 + df = DataFrame({"one": np.arange(6, dtype=np.int8)}) + df.loc[1, "one"] = 6 assert df.dtypes.one == np.dtype(np.int8) df.one = np.int8(7) assert df.dtypes.one == np.dtype(np.int8) def test_setitem_boolean_column(self, float_frame): expected = float_frame.copy() - mask = float_frame['A'] > 0 + mask = float_frame["A"] > 0 - float_frame.loc[mask, 'B'] = 0 + float_frame.loc[mask, "B"] = 0 expected.values[mask.values, 1] = 0 assert_frame_equal(float_frame, expected) def test_frame_setitem_timestamp(self): # GH#2155 - columns = date_range(start='1/1/2012', end='2/1/2012', freq=BDay()) + columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay()) data = DataFrame(columns=columns, index=range(10)) t = datetime(2012, 11, 1) ts = Timestamp(t) @@ -655,60 +689,60 @@ def test_frame_setitem_timestamp(self): def test_setitem_corner(self, float_frame): # corner case - df = DataFrame({'B': [1., 2., 3.], - 'C': ['a', 'b', 'c']}, - index=np.arange(3)) - del df['B'] - df['B'] = [1., 2., 3.] - assert 'B' in df + df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) + del df["B"] + df["B"] = [1.0, 2.0, 3.0] + assert "B" in df assert len(df.columns) == 2 - df['A'] = 'beginning' - df['E'] = 'foo' - df['D'] = 'bar' - df[datetime.now()] = 'date' - df[datetime.now()] = 5. 
+ df["A"] = "beginning" + df["E"] = "foo" + df["D"] = "bar" + df[datetime.now()] = "date" + df[datetime.now()] = 5.0 # what to do when empty frame with index dm = DataFrame(index=float_frame.index) - dm['A'] = 'foo' - dm['B'] = 'bar' + dm["A"] = "foo" + dm["B"] = "bar" assert len(dm.columns) == 2 assert dm.values.dtype == np.object_ # upcast - dm['C'] = 1 - assert dm['C'].dtype == np.int64 + dm["C"] = 1 + assert dm["C"].dtype == np.int64 - dm['E'] = 1. - assert dm['E'].dtype == np.float64 + dm["E"] = 1.0 + assert dm["E"].dtype == np.float64 # set existing column - dm['A'] = 'bar' - assert 'bar' == dm['A'][0] + dm["A"] = "bar" + assert "bar" == dm["A"][0] dm = DataFrame(index=np.arange(3)) - dm['A'] = 1 - dm['foo'] = 'bar' - del dm['foo'] - dm['foo'] = 'bar' - assert dm['foo'].dtype == np.object_ + dm["A"] = 1 + dm["foo"] = "bar" + del dm["foo"] + dm["foo"] = "bar" + assert dm["foo"].dtype == np.object_ - dm['coercable'] = ['1', '2', '3'] - assert dm['coercable'].dtype == np.object_ + dm["coercable"] = ["1", "2", "3"] + assert dm["coercable"].dtype == np.object_ def test_setitem_corner2(self): - data = {"title": ['foobar', 'bar', 'foobar'] + ['foobar'] * 17, - "cruft": np.random.random(20)} + data = { + "title": ["foobar", "bar", "foobar"] + ["foobar"] * 17, + "cruft": np.random.random(20), + } df = DataFrame(data) - ix = df[df['title'] == 'bar'].index + ix = df[df["title"] == "bar"].index - df.loc[ix, ['title']] = 'foobar' - df.loc[ix, ['cruft']] = 0 + df.loc[ix, ["title"]] = "foobar" + df.loc[ix, ["cruft"]] = 0 - assert df.loc[1, 'title'] == 'foobar' - assert df.loc[1, 'cruft'] == 0 + assert df.loc[1, "title"] == "foobar" + assert df.loc[1, "cruft"] == 0 def test_setitem_ambig(self): # Difficulties with mixed-type data @@ -717,9 +751,8 @@ def test_setitem_ambig(self): # Created as float type dm = DataFrame(index=range(3), columns=range(3)) - coercable_series = Series([Decimal(1) for _ in range(3)], - index=range(3)) - uncoercable_series = Series(['foo', 'bzr', 'baz'], index=range(3)) + coercable_series = Series([Decimal(1) for _ in range(3)], index=range(3)) + uncoercable_series = Series(["foo", "bzr", "baz"], index=range(3)) dm[0] = np.ones(3) assert len(dm.columns) == 3 @@ -733,46 +766,44 @@ def test_setitem_ambig(self): def test_setitem_clear_caches(self): # see gh-304 - df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, - index=[0, 1, 2, 3]) - df.insert(2, 'z', np.nan) + df = DataFrame( + {"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3] + ) + df.insert(2, "z", np.nan) # cache it - foo = df['z'] - df.loc[df.index[2:], 'z'] = 42 + foo = df["z"] + df.loc[df.index[2:], "z"] = 42 - expected = Series([np.nan, np.nan, 42, 42], index=df.index, name='z') + expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z") - assert df['z'] is not foo - tm.assert_series_equal(df['z'], expected) + assert df["z"] is not foo + tm.assert_series_equal(df["z"], expected) def test_setitem_None(self, float_frame): # GH #766 - float_frame[None] = float_frame['A'] + float_frame[None] = float_frame["A"] assert_series_equal( - float_frame.iloc[:, -1], float_frame['A'], check_names=False) - assert_series_equal(float_frame.loc[:, None], float_frame[ - 'A'], check_names=False) - assert_series_equal(float_frame[None], float_frame[ - 'A'], check_names=False) + float_frame.iloc[:, -1], float_frame["A"], check_names=False + ) + assert_series_equal( + float_frame.loc[:, None], float_frame["A"], check_names=False + ) + assert_series_equal(float_frame[None], 
float_frame["A"], check_names=False) repr(float_frame) def test_setitem_empty(self): # GH 9596 - df = pd.DataFrame({'a': ['1', '2', '3'], - 'b': ['11', '22', '33'], - 'c': ['111', '222', '333']}) + df = pd.DataFrame( + {"a": ["1", "2", "3"], "b": ["11", "22", "33"], "c": ["111", "222", "333"]} + ) result = df.copy() - result.loc[result.b.isna(), 'a'] = result.a + result.loc[result.b.isna(), "a"] = result.a assert_frame_equal(result, df) @pytest.mark.parametrize("dtype", ["float", "int64"]) - @pytest.mark.parametrize("kwargs", [ - dict(), - dict(index=[1]), - dict(columns=["A"]) - ]) + @pytest.mark.parametrize("kwargs", [dict(), dict(index=[1]), dict(columns=["A"])]) def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): # see gh-10126 kwargs["dtype"] = dtype @@ -785,8 +816,8 @@ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): def test_setitem_scalars_no_index(self): # GH16823 / 17894 df = DataFrame() - df['foo'] = 1 - expected = DataFrame(columns=['foo']).astype(np.int64) + df["foo"] = 1 + expected = DataFrame(columns=["foo"]).astype(np.int64) assert_frame_equal(df, expected) def test_getitem_empty_frame_with_boolean(self): @@ -798,11 +829,11 @@ def test_getitem_empty_frame_with_boolean(self): def test_delitem_corner(self, float_frame): f = float_frame.copy() - del f['D'] + del f["D"] assert len(f.columns) == 3 with pytest.raises(KeyError, match=r"^'D'$"): - del f['D'] - del f['B'] + del f["D"] + del f["B"] assert len(f.columns) == 2 def test_getitem_fancy_2d(self, float_frame): @@ -810,35 +841,35 @@ def test_getitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[:, ['B', 'A']], - f.reindex(columns=['B', 'A'])) + assert_frame_equal(f.ix[:, ["B", "A"]], f.reindex(columns=["B", "A"])) subidx = float_frame.index[[5, 4, 1]] with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[subidx, ['B', 'A']], - f.reindex(index=subidx, columns=['B', 'A'])) + assert_frame_equal( + f.ix[subidx, ["B", "A"]], f.reindex(index=subidx, columns=["B", "A"]) + ) # slicing rows, etc. with catch_warnings(record=True): simplefilter("ignore", FutureWarning) assert_frame_equal(f.ix[5:10], f[5:10]) assert_frame_equal(f.ix[5:10, :], f[5:10]) - assert_frame_equal(f.ix[:5, ['A', 'B']], - f.reindex(index=f.index[:5], - columns=['A', 'B'])) + assert_frame_equal( + f.ix[:5, ["A", "B"]], f.reindex(index=f.index[:5], columns=["A", "B"]) + ) # slice rows with labels, inclusive! with catch_warnings(record=True): simplefilter("ignore", FutureWarning) expected = f.ix[5:11] - result = f.ix[f.index[5]:f.index[10]] + result = f.ix[f.index[5] : f.index[10]] assert_frame_equal(expected, result) # slice columns with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_frame_equal(f.ix[:, :2], f.reindex(columns=['A', 'B'])) + assert_frame_equal(f.ix[:, :2], f.reindex(columns=["A", "B"])) # get view with catch_warnings(record=True): @@ -910,9 +941,9 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[:, ['B', 'A']] = 1 - expected['B'] = 1. - expected['A'] = 1. 
+ frame.ix[:, ["B", "A"]] = 1 + expected["B"] = 1.0 + expected["A"] = 1.0 assert_frame_equal(frame, expected) # case 2 @@ -926,11 +957,11 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[subidx, ['B', 'A']] = values - frame2.ix[[5, 4, 1], ['B', 'A']] = values + frame.ix[subidx, ["B", "A"]] = values + frame2.ix[[5, 4, 1], ["B", "A"]] = values - expected['B'].ix[subidx] = values[:, 0] - expected['A'].ix[subidx] = values[:, 1] + expected["B"].ix[subidx] = values[:, 0] + expected["A"].ix[subidx] = values[:, 1] assert_frame_equal(frame, expected) assert_frame_equal(frame2, expected) @@ -941,8 +972,8 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) expected1 = float_frame.copy() - frame.ix[5:10] = 1. - expected1.values[5:10] = 1. + frame.ix[5:10] = 1.0 + expected1.values[5:10] = 1.0 assert_frame_equal(frame, expected1) with catch_warnings(record=True): @@ -957,7 +988,7 @@ def test_setitem_fancy_2d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame = float_frame.copy() - frame.ix[5:10, :] = 1. + frame.ix[5:10, :] = 1.0 assert_frame_equal(frame, expected1) frame.ix[5:10, :] = arr assert_frame_equal(frame, expected2) @@ -971,9 +1002,9 @@ def test_setitem_fancy_2d(self, float_frame): expected = float_frame.copy() values = np.random.randn(5, 2) - frame.ix[:5, ['A', 'B']] = values - expected['A'][:5] = values[:, 0] - expected['B'][:5] = values[:, 1] + frame.ix[:5, ["A", "B"]] = values + expected["A"][:5] = values[:, 0] + expected["B"][:5] = values[:, 1] assert_frame_equal(frame, expected) with catch_warnings(record=True): @@ -987,7 +1018,7 @@ def test_setitem_fancy_2d(self, float_frame): frame = float_frame.copy() expected = float_frame.copy() - frame.ix[frame.index[5]:frame.index[10]] = 5. + frame.ix[frame.index[5] : frame.index[10]] = 5.0 expected.values[5:11] = 5 assert_frame_equal(frame, expected) @@ -999,36 +1030,34 @@ def test_setitem_fancy_2d(self, float_frame): expected = float_frame.copy() # slice indices - frame.ix[:, 1:3] = 4. - expected.values[:, 1:3] = 4. + frame.ix[:, 1:3] = 4.0 + expected.values[:, 1:3] = 4.0 assert_frame_equal(frame, expected) # slice with labels - frame.ix[:, 'B':'C'] = 4. + frame.ix[:, "B":"C"] = 4.0 assert_frame_equal(frame, expected) # new corner case of boolean slicing / setting - frame = DataFrame(zip([2, 3, 9, 6, 7], [np.nan] * 5), - columns=['a', 'b']) + frame = DataFrame(zip([2, 3, 9, 6, 7], [np.nan] * 5), columns=["a", "b"]) lst = [100] lst.extend([np.nan] * 4) - expected = DataFrame(zip([100, 3, 9, 6, 7], lst), - columns=['a', 'b']) - frame[frame['a'] == 2] = 100 + expected = DataFrame(zip([100, 3, 9, 6, 7], lst), columns=["a", "b"]) + frame[frame["a"] == 2] = 100 assert_frame_equal(frame, expected) def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): sliced = float_string_frame.iloc[:, -3:] - assert sliced['D'].dtype == np.float64 + assert sliced["D"].dtype == np.float64 # get view with single block # setting it triggers setting with copy sliced = float_frame.iloc[:, -3:] with pytest.raises(com.SettingWithCopyError): - sliced['C'] = 4. 
+ sliced["C"] = 4.0 - assert (float_frame['C'] == 4).all() + assert (float_frame["C"] == 4).all() def test_fancy_setitem_int_labels(self): # integer index defers to label-based indexing @@ -1100,14 +1129,18 @@ def test_fancy_index_int_labels_exceptions(self, float_frame): df.ix[[0, 1, 2], [2, 3, 4]] = 5 # try to set indices not contained in frame - msg = (r"None of \[Index\(\['foo', 'bar', 'baz'\]," - r" dtype='object'\)\] are in the \[index\]") + msg = ( + r"None of \[Index\(\['foo', 'bar', 'baz'\]," + r" dtype='object'\)\] are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): - float_frame.ix[['foo', 'bar', 'baz']] = 1 - msg = (r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" - r" \[columns\]") + float_frame.ix[["foo", "bar", "baz"]] = 1 + msg = ( + r"None of \[Index\(\['E'\], dtype='object'\)\] are in the" + r" \[columns\]" + ) with pytest.raises(KeyError, match=msg): - float_frame.ix[:, ['E']] = 1 + float_frame.ix[:, ["E"]] = 1 # partial setting now allows this GH2578 # pytest.raises(KeyError, float_frame.ix.__setitem__, @@ -1117,29 +1150,27 @@ def test_setitem_fancy_mixed_2d(self, float_string_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5 - result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] + self.mixed_frame.ix[:5, ["C", "B", "A"]] = 5 + result = self.mixed_frame.ix[:5, ["C", "B", "A"]] assert (result.values == 5).all() float_string_frame.ix[5] = np.nan assert isna(float_string_frame.ix[5]).all() float_string_frame.ix[5] = float_string_frame.ix[6] - assert_series_equal(float_string_frame.ix[5], - float_string_frame.ix[6], - check_names=False) + assert_series_equal( + float_string_frame.ix[5], float_string_frame.ix[6], check_names=False + ) # #1432 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - df = DataFrame({1: [1., 2., 3.], - 2: [3, 4, 5]}) + df = DataFrame({1: [1.0, 2.0, 3.0], 2: [3, 4, 5]}) assert df._is_mixed_type df.ix[1] = [5, 10] - expected = DataFrame({1: [1., 5., 3.], - 2: [3, 10, 5]}) + expected = DataFrame({1: [1.0, 5.0, 3.0], 2: [3, 10, 5]}) assert_frame_equal(df, expected) @@ -1249,40 +1280,42 @@ def test_ix_multi_take(self): """ def test_ix_multi_take_nonint_index(self): - df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], - columns=['a', 'b']) + df = DataFrame(np.random.randn(3, 2), index=["x", "y", "z"], columns=["a", "b"]) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) rs = df.ix[[0], [0]] - xp = df.reindex(['x'], columns=['a']) + xp = df.reindex(["x"], columns=["a"]) assert_frame_equal(rs, xp) def test_ix_multi_take_multiindex(self): - df = DataFrame(np.random.randn(3, 2), index=['x', 'y', 'z'], - columns=[['a', 'b'], ['1', '2']]) + df = DataFrame( + np.random.randn(3, 2), + index=["x", "y", "z"], + columns=[["a", "b"], ["1", "2"]], + ) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) rs = df.ix[[0], [0]] - xp = df.reindex(['x'], columns=[('a', '1')]) + xp = df.reindex(["x"], columns=[("a", "1")]) assert_frame_equal(rs, xp) def test_ix_dup(self): - idx = Index(['a', 'a', 'b', 'c', 'd', 'd']) + idx = Index(["a", "a", "b", "c", "d", "d"]) df = DataFrame(np.random.randn(len(idx), 3), idx) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix[:'d'] + sub = df.ix[:"d"] assert_frame_equal(sub, df) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix['a':'c'] + sub = df.ix["a":"c"] assert_frame_equal(sub, df.ix[0:4]) with 
catch_warnings(record=True): simplefilter("ignore", FutureWarning) - sub = df.ix['b':'d'] + sub = df.ix["b":"d"] assert_frame_equal(sub, df.ix[2:]) def test_getitem_fancy_1d(self, float_frame, float_string_frame): @@ -1296,8 +1329,8 @@ def test_getitem_fancy_1d(self, float_frame, float_string_frame): # low dimensional slice with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - xs1 = f.ix[2, ['C', 'B', 'A']] - xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) + xs1 = f.ix[2, ["C", "B", "A"]] + xs2 = f.xs(f.index[2]).reindex(["C", "B", "A"]) tm.assert_series_equal(xs1, xs2) with catch_warnings(record=True): @@ -1322,7 +1355,7 @@ def test_getitem_fancy_1d(self, float_frame, float_string_frame): # single column with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - assert_series_equal(f.ix[:, 'A'], f['A']) + assert_series_equal(f.ix[:, "A"], f["A"]) # return view with catch_warnings(record=True): @@ -1353,16 +1386,16 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] - expected['C'][2] = 1. - expected['B'][2] = 2. - expected['A'][2] = 3. + frame.ix[2, ["C", "B", "A"]] = [1.0, 2.0, 3.0] + expected["C"][2] = 1.0 + expected["B"][2] = 2.0 + expected["A"][2] = 3.0 assert_frame_equal(frame, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame2 = float_frame.copy() - frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] + frame2.ix[2, [3, 2, 1]] = [1.0, 2.0, 3.0] assert_frame_equal(frame, expected) # case 2, set a section of a column @@ -1379,7 +1412,7 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) frame2 = float_frame.copy() - frame2.ix[5:10, 'B'] = vals + frame2.ix[5:10, "B"] = vals assert_frame_equal(frame, expected) # case 3: full xs @@ -1388,14 +1421,14 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[4] = 5. - expected.values[4] = 5. + frame.ix[4] = 5.0 + expected.values[4] = 5.0 assert_frame_equal(frame, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[frame.index[4]] = 6. - expected.values[4] = 6. + frame.ix[frame.index[4]] = 6.0 + expected.values[4] = 6.0 assert_frame_equal(frame, expected) # single column @@ -1404,8 +1437,8 @@ def test_setitem_fancy_1d(self, float_frame): with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - frame.ix[:, 'A'] = 7. - expected['A'] = 7. 
+ frame.ix[:, "A"] = 7.0 + expected["A"] = 7.0 assert_frame_equal(frame, expected) def test_getitem_fancy_scalar(self, float_frame): @@ -1438,11 +1471,11 @@ def test_getitem_fancy_boolean(self, float_frame): f = float_frame ix = f.loc - expected = f.reindex(columns=['B', 'D']) + expected = f.reindex(columns=["B", "D"]) result = ix[:, [False, True, False, True]] assert_frame_equal(result, expected) - expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) + expected = f.reindex(index=f.index[5:10], columns=["B", "D"]) result = ix[f.index[5:10], [False, True, False, True]] assert_frame_equal(result, expected) @@ -1454,8 +1487,7 @@ def test_getitem_fancy_boolean(self, float_frame): assert_frame_equal(result, expected) result = ix[boolvec, f.columns[2:]] - expected = f.reindex(index=f.index[boolvec], - columns=['C', 'D']) + expected = f.reindex(index=f.index[boolvec], columns=["C", "D"]) assert_frame_equal(result, expected) def test_setitem_fancy_boolean(self, float_frame): @@ -1463,15 +1495,15 @@ def test_setitem_fancy_boolean(self, float_frame): frame = float_frame.copy() expected = float_frame.copy() - mask = frame['A'] > 0 - frame.loc[mask] = 0. - expected.values[mask.values] = 0. + mask = frame["A"] > 0 + frame.loc[mask] = 0.0 + expected.values[mask.values] = 0.0 assert_frame_equal(frame, expected) frame = float_frame.copy() expected = float_frame.copy() - frame.loc[mask, ['A', 'B']] = 0. - expected.values[mask.values, :2] = 0. + frame.loc[mask, ["A", "B"]] = 0.0 + expected.values[mask.values, :2] = 0.0 assert_frame_equal(frame, expected) def test_getitem_fancy_ints(self, float_frame): @@ -1485,7 +1517,7 @@ def test_getitem_fancy_ints(self, float_frame): def test_getitem_setitem_fancy_exceptions(self, float_frame): ix = float_frame.iloc - with pytest.raises(IndexingError, match='Too many indexers'): + with pytest.raises(IndexingError, match="Too many indexers"): ix[:, :, :] with pytest.raises(IndexingError): @@ -1493,7 +1525,7 @@ def test_getitem_setitem_fancy_exceptions(self, float_frame): def test_getitem_setitem_boolean_misaligned(self, float_frame): # boolean index misaligned labels - mask = float_frame['A'][::-1] > 1 + mask = float_frame["A"][::-1] > 1 result = float_frame.loc[mask] expected = float_frame.loc[mask[::-1]] @@ -1516,8 +1548,7 @@ def test_getitem_setitem_boolean_multi(self): assert_frame_equal(result, expected) expected = df.copy() - df.loc[np.array([True, False, True]), - np.array([False, True])] = 5 + df.loc[np.array([True, False, True]), np.array([False, True])] = 5 expected.loc[[0, 2], [1]] = 5 assert_frame_equal(df, expected) @@ -1554,9 +1585,11 @@ def test_getitem_setitem_float_labels(self): df = DataFrame(np.random.randn(5, 5), index=index) # positional slicing only via iloc! 
- msg = ("cannot do slice indexing on" - r" with" - r" these indexers \[1.0\] of ") + msg = ( + "cannot do slice indexing on" + r" with" + r" these indexers \[1.0\] of " + ) with pytest.raises(TypeError, match=msg): df.iloc[1.0:5] @@ -1608,38 +1641,44 @@ def test_getitem_setitem_float_labels(self): assert (result == 0).values.all() def test_setitem_single_column_mixed(self): - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['foo', 'bar', 'baz']) - df['str'] = 'qux' - df.loc[df.index[::2], 'str'] = np.nan - expected = np.array([np.nan, 'qux', np.nan, 'qux', np.nan], - dtype=object) - assert_almost_equal(df['str'].values, expected) + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["foo", "bar", "baz"], + ) + df["str"] = "qux" + df.loc[df.index[::2], "str"] = np.nan + expected = np.array([np.nan, "qux", np.nan, "qux", np.nan], dtype=object) + assert_almost_equal(df["str"].values, expected) def test_setitem_single_column_mixed_datetime(self): - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['foo', 'bar', 'baz']) + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["foo", "bar", "baz"], + ) - df['timestamp'] = Timestamp('20010102') + df["timestamp"] = Timestamp("20010102") # check our dtypes result = df.dtypes - expected = Series([np.dtype('float64')] * 3 + - [np.dtype('datetime64[ns]')], - index=['foo', 'bar', 'baz', 'timestamp']) + expected = Series( + [np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")], + index=["foo", "bar", "baz", "timestamp"], + ) assert_series_equal(result, expected) # set an allowable datetime64 type - df.loc['b', 'timestamp'] = iNaT - assert isna(df.loc['b', 'timestamp']) + df.loc["b", "timestamp"] = iNaT + assert isna(df.loc["b", "timestamp"]) # allow this syntax - df.loc['c', 'timestamp'] = np.nan - assert isna(df.loc['c', 'timestamp']) + df.loc["c", "timestamp"] = np.nan + assert isna(df.loc["c", "timestamp"]) # allow this syntax - df.loc['d', :] = np.nan - assert not isna(df.loc['c', :]).all() + df.loc["d", :] = np.nan + assert not isna(df.loc["c", :]).all() # as of GH 3216 this will now work! 
# try to set with a list like item @@ -1648,27 +1687,37 @@ def test_setitem_single_column_mixed_datetime(self): def test_setitem_mixed_datetime(self): # GH 9336 - expected = DataFrame({'a': [0, 0, 0, 0, 13, 14], - 'b': [pd.datetime(2012, 1, 1), - 1, - 'x', - 'y', - pd.datetime(2013, 1, 1), - pd.datetime(2014, 1, 1)]}) - df = pd.DataFrame(0, columns=list('ab'), index=range(6)) - df['b'] = pd.NaT - df.loc[0, 'b'] = pd.datetime(2012, 1, 1) - df.loc[1, 'b'] = 1 - df.loc[[2, 3], 'b'] = 'x', 'y' - A = np.array([[13, np.datetime64('2013-01-01T00:00:00')], - [14, np.datetime64('2014-01-01T00:00:00')]]) - df.loc[[4, 5], ['a', 'b']] = A + expected = DataFrame( + { + "a": [0, 0, 0, 0, 13, 14], + "b": [ + pd.datetime(2012, 1, 1), + 1, + "x", + "y", + pd.datetime(2013, 1, 1), + pd.datetime(2014, 1, 1), + ], + } + ) + df = pd.DataFrame(0, columns=list("ab"), index=range(6)) + df["b"] = pd.NaT + df.loc[0, "b"] = pd.datetime(2012, 1, 1) + df.loc[1, "b"] = 1 + df.loc[[2, 3], "b"] = "x", "y" + A = np.array( + [ + [13, np.datetime64("2013-01-01T00:00:00")], + [14, np.datetime64("2014-01-01T00:00:00")], + ] + ) + df.loc[[4, 5], ["a", "b"]] = A assert_frame_equal(df, expected) def test_setitem_frame_float(self, float_frame): - piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] - float_frame.loc[float_frame.index[-2]:, ['A', 'B']] = piece.values - result = float_frame.loc[float_frame.index[-2:], ['A', 'B']].values + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] + float_frame.loc[float_frame.index[-2] :, ["A", "B"]] = piece.values + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values expected = piece.values assert_almost_equal(result, expected) @@ -1677,83 +1726,80 @@ def test_setitem_frame_mixed(self, float_string_frame): # already aligned f = float_string_frame.copy() - piece = DataFrame([[1., 2.], [3., 4.]], - index=f.index[0:2], columns=['A', 'B']) - key = (slice(None, 2), ['A', 'B']) + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0]], index=f.index[0:2], columns=["A", "B"] + ) + key = (slice(None, 2), ["A", "B"]) f.loc[key] = piece - assert_almost_equal(f.loc[f.index[0:2], ['A', 'B']].values, - piece.values) + assert_almost_equal(f.loc[f.index[0:2], ["A", "B"]].values, piece.values) # rows unaligned f = float_string_frame.copy() - piece = DataFrame([[1., 2.], [3., 4.], [5., 6.], [7., 8.]], - index=list(f.index[0:2]) + ['foo', 'bar'], - columns=['A', 'B']) - key = (slice(None, 2), ['A', 'B']) + piece = DataFrame( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], + index=list(f.index[0:2]) + ["foo", "bar"], + columns=["A", "B"], + ) + key = (slice(None, 2), ["A", "B"]) f.loc[key] = piece - assert_almost_equal(f.loc[f.index[0:2:], ['A', 'B']].values, - piece.values[0:2]) + assert_almost_equal(f.loc[f.index[0:2:], ["A", "B"]].values, piece.values[0:2]) # key is unaligned with values f = float_string_frame.copy() - piece = f.loc[f.index[:2], ['A']] + piece = f.loc[f.index[:2], ["A"]] piece.index = f.index[-2:] - key = (slice(-2, None), ['A', 'B']) + key = (slice(-2, None), ["A", "B"]) f.loc[key] = piece - piece['B'] = np.nan - assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, - piece.values) + piece["B"] = np.nan + assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) # ndarray f = float_string_frame.copy() - piece = float_string_frame.loc[f.index[:2], ['A', 'B']] - key = (slice(-2, None), ['A', 'B']) + piece = float_string_frame.loc[f.index[:2], ["A", "B"]] + key = (slice(-2, None), ["A", "B"]) f.loc[key] = piece.values - 
assert_almost_equal(f.loc[f.index[-2:], ['A', 'B']].values, - piece.values) + assert_almost_equal(f.loc[f.index[-2:], ["A", "B"]].values, piece.values) def test_setitem_frame_upcast(self): # needs upcasting - df = DataFrame([[1, 2, 'foo'], [3, 4, 'bar']], columns=['A', 'B', 'C']) + df = DataFrame([[1, 2, "foo"], [3, 4, "bar"]], columns=["A", "B", "C"]) df2 = df.copy() - df2.loc[:, ['A', 'B']] = df.loc[:, ['A', 'B']] + 0.5 - expected = df.reindex(columns=['A', 'B']) + df2.loc[:, ["A", "B"]] = df.loc[:, ["A", "B"]] + 0.5 + expected = df.reindex(columns=["A", "B"]) expected += 0.5 - expected['C'] = df['C'] + expected["C"] = df["C"] assert_frame_equal(df2, expected) def test_setitem_frame_align(self, float_frame): - piece = float_frame.loc[float_frame.index[:2], ['A', 'B']] + piece = float_frame.loc[float_frame.index[:2], ["A", "B"]] piece.index = float_frame.index[-2:] - piece.columns = ['A', 'B'] - float_frame.loc[float_frame.index[-2:], ['A', 'B']] = piece - result = float_frame.loc[float_frame.index[-2:], ['A', 'B']].values + piece.columns = ["A", "B"] + float_frame.loc[float_frame.index[-2:], ["A", "B"]] = piece + result = float_frame.loc[float_frame.index[-2:], ["A", "B"]].values expected = piece.values assert_almost_equal(result, expected) def test_getitem_setitem_ix_duplicates(self): # #1201 - df = DataFrame(np.random.randn(5, 3), - index=['foo', 'foo', 'bar', 'baz', 'bar']) + df = DataFrame(np.random.randn(5, 3), index=["foo", "foo", "bar", "baz", "bar"]) - result = df.loc['foo'] + result = df.loc["foo"] expected = df[:2] assert_frame_equal(result, expected) - result = df.loc['bar'] + result = df.loc["bar"] expected = df.iloc[[2, 4]] assert_frame_equal(result, expected) - result = df.loc['baz'] + result = df.loc["baz"] expected = df.iloc[3] assert_series_equal(result, expected) def test_getitem_ix_boolean_duplicates_multiple(self): # #1201 - df = DataFrame(np.random.randn(5, 3), - index=['foo', 'foo', 'bar', 'baz', 'bar']) + df = DataFrame(np.random.randn(5, 3), index=["foo", "foo", "bar", "baz", "bar"]) - result = df.loc[['bar']] + result = df.loc[["bar"]] exp = df.iloc[[2, 4]] assert_frame_equal(result, exp) @@ -1767,7 +1813,7 @@ def test_getitem_ix_boolean_duplicates_multiple(self): def test_getitem_setitem_ix_bool_keyerror(self): # #2199 - df = DataFrame({'a': [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) with pytest.raises(KeyError, match=r"^False$"): df.loc[False] @@ -1782,11 +1828,11 @@ def test_getitem_setitem_ix_bool_keyerror(self): def test_getitem_list_duplicates(self): # #1943 - df = DataFrame(np.random.randn(4, 4), columns=list('AABC')) - df.columns.name = 'foo' + df = DataFrame(np.random.randn(4, 4), columns=list("AABC")) + df.columns.name = "foo" - result = df[['B', 'C']] - assert result.columns.name == 'foo' + result = df[["B", "C"]] + assert result.columns.name == "foo" expected = df.iloc[:, 2:] assert_frame_equal(result, expected) @@ -1794,8 +1840,7 @@ def test_getitem_list_duplicates(self): def test_get_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = float_frame.get_value(idx, col) expected = float_frame[col][idx] assert result == expected @@ -1815,131 +1860,125 @@ def test_lookup_mixed(self, float_string_frame): cols = list(df.columns) * len(df.index) result = df.lookup(rows, cols) - expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)], - dtype=np.object_) + 
expected = np.array( + [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ + ) tm.assert_almost_equal(result, expected) def test_lookup_bool(self): - df = DataFrame({'label': ['a', 'b', 'a', 'c'], - 'mask_a': [True, True, False, True], - 'mask_b': [True, False, False, False], - 'mask_c': [False, True, False, True]}) - df['mask'] = df.lookup(df.index, 'mask_' + df['label']) - - exp_mask = np.array([ - df.loc[r, c] for r, c in zip(df.index, 'mask_' + df['label'])]) - - tm.assert_series_equal(df['mask'], pd.Series(exp_mask, name='mask')) - assert df['mask'].dtype == np.bool_ + df = DataFrame( + { + "label": ["a", "b", "a", "c"], + "mask_a": [True, True, False, True], + "mask_b": [True, False, False, False], + "mask_c": [False, True, False, True], + } + ) + df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + + exp_mask = np.array( + [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] + ) + + tm.assert_series_equal(df["mask"], pd.Series(exp_mask, name="mask")) + assert df["mask"].dtype == np.bool_ def test_lookup_raises(self, float_frame): with pytest.raises(KeyError): - float_frame.lookup(['xyz'], ['A']) + float_frame.lookup(["xyz"], ["A"]) with pytest.raises(KeyError): - float_frame.lookup([float_frame.index[0]], ['xyz']) + float_frame.lookup([float_frame.index[0]], ["xyz"]) - with pytest.raises(ValueError, match='same size'): - float_frame.lookup(['a', 'b', 'c'], ['a']) + with pytest.raises(ValueError, match="same size"): + float_frame.lookup(["a", "b", "c"], ["a"]) def test_set_value(self, float_frame): for idx in float_frame.index: for col in float_frame.columns: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): float_frame.set_value(idx, col, 1) assert float_frame[col][idx] == 1 def test_set_value_resize(self, float_frame): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = float_frame.set_value('foobar', 'B', 0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = float_frame.set_value("foobar", "B", 0) assert res is float_frame - assert res.index[-1] == 'foobar' - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res.get_value('foobar', 'B') == 0 + assert res.index[-1] == "foobar" + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res.get_value("foobar", "B") == 0 - float_frame.loc['foobar', 'qux'] = 0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert float_frame.get_value('foobar', 'qux') == 0 + float_frame.loc["foobar", "qux"] = 0 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert float_frame.get_value("foobar", "qux") == 0 res = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', 'sam') - assert res3['baz'].dtype == np.object_ + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", "sam") + assert res3["baz"].dtype == np.object_ res = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', True) - assert res3['baz'].dtype == np.object_ + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", True) + assert res3["baz"].dtype == np.object_ res = float_frame.copy() - with 
tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res3 = res.set_value('foobar', 'baz', 5) - assert is_float_dtype(res3['baz']) - assert isna(res3['baz'].drop(['foobar'])).all() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res3 = res.set_value("foobar", "baz", 5) + assert is_float_dtype(res3["baz"]) + assert isna(res3["baz"].drop(["foobar"])).all() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "could not convert string to float: 'sam'" with pytest.raises(ValueError, match=msg): - res3.set_value('foobar', 'baz', 'sam') + res3.set_value("foobar", "baz", "sam") def test_set_value_with_index_dtype_change(self): - df_orig = DataFrame(np.random.randn(3, 3), - index=range(3), columns=list('ABC')) + df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) # this is actually ambiguous as the 2 is interpreted as a positional # so column is not created df = df_orig.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.set_value('C', 2, 1.0) - assert list(df.index) == list(df_orig.index) + ['C'] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.set_value("C", 2, 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] # assert list(df.columns) == list(df_orig.columns) + [2] df = df_orig.copy() - df.loc['C', 2] = 1.0 - assert list(df.index) == list(df_orig.index) + ['C'] + df.loc["C", 2] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] # assert list(df.columns) == list(df_orig.columns) + [2] # create both new df = df_orig.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - df.set_value('C', 'D', 1.0) - assert list(df.index) == list(df_orig.index) + ['C'] - assert list(df.columns) == list(df_orig.columns) + ['D'] + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df.set_value("C", "D", 1.0) + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] df = df_orig.copy() - df.loc['C', 'D'] = 1.0 - assert list(df.index) == list(df_orig.index) + ['C'] - assert list(df.columns) == list(df_orig.columns) + ['D'] + df.loc["C", "D"] = 1.0 + assert list(df.index) == list(df_orig.index) + ["C"] + assert list(df.columns) == list(df_orig.columns) + ["D"] def test_get_set_value_no_partial_indexing(self): # partial w/ MultiIndex raise exception index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) df = DataFrame(index=index, columns=range(4)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(KeyError, match=r"^0$"): df.get_value(0, 1) def test_single_element_ix_dont_upcast(self, float_frame): - float_frame['E'] = 1 - assert issubclass(float_frame['E'].dtype.type, (int, np.integer)) + float_frame["E"] = 1 + assert issubclass(float_frame["E"].dtype.type, (int, np.integer)) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = float_frame.ix[float_frame.index[5], 'E'] + result = float_frame.ix[float_frame.index[5], "E"] assert is_integer(result) - result = float_frame.loc[float_frame.index[5], 'E'] + result = float_frame.loc[float_frame.index[5], "E"] assert is_integer(result) # GH 11617 @@ -1953,7 +1992,7 @@ def test_single_element_ix_dont_upcast(self, float_frame): result = df.loc[0, "b"] assert 
is_integer(result) - expected = Series([666], [0], name='b') + expected = Series([666], [0], name="b") with catch_warnings(record=True): simplefilter("ignore", FutureWarning) result = df.ix[[0], "b"] @@ -1980,10 +2019,10 @@ def test_iloc_row(self): # verify slice is view # setting it makes it raise/warn with pytest.raises(com.SettingWithCopyError): - result[2] = 0. + result[2] = 0.0 exp_col = df[2].copy() - exp_col[4:8] = 0. + exp_col[4:8] = 0.0 assert_series_equal(df[2], exp_col) # list of integers @@ -2011,7 +2050,7 @@ def test_iloc_col(self): # verify slice is view # and that we are setting a copy with pytest.raises(com.SettingWithCopyError): - result[8] = 0. + result[8] = 0.0 assert (df[8] == 0).all() @@ -2022,8 +2061,7 @@ def test_iloc_col(self): def test_iloc_duplicates(self): - df = DataFrame(np.random.rand(3, 3), columns=list('ABC'), - index=list('aab')) + df = DataFrame(np.random.rand(3, 3), columns=list("ABC"), index=list("aab")) result = df.iloc[0] with catch_warnings(record=True): @@ -2042,9 +2080,11 @@ def test_iloc_duplicates(self): assert_series_equal(result, result2) # multiindex - df = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + df = DataFrame( + np.random.randn(3, 3), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) @@ -2074,11 +2114,12 @@ def test_loc_duplicates(self): # gh-17105 # insert a duplicate element to the index - trange = pd.date_range(start=pd.Timestamp(year=2017, month=1, day=1), - end=pd.Timestamp(year=2017, month=1, day=5)) + trange = pd.date_range( + start=pd.Timestamp(year=2017, month=1, day=1), + end=pd.Timestamp(year=2017, month=1, day=5), + ) - trange = trange.insert(loc=5, - item=pd.Timestamp(year=2017, month=1, day=5)) + trange = trange.insert(loc=5, item=pd.Timestamp(year=2017, month=1, day=5)) df = pd.DataFrame(0, index=trange, columns=["A", "B"]) bool_idx = np.array([False, False, False, False, False, True]) @@ -2086,9 +2127,9 @@ def test_loc_duplicates(self): # assignment df.loc[trange[bool_idx], "A"] = 6 - expected = pd.DataFrame({'A': [0, 0, 0, 0, 6, 6], - 'B': [0, 0, 0, 0, 0, 0]}, - index=trange) + expected = pd.DataFrame( + {"A": [0, 0, 0, 0, 6, 6], "B": [0, 0, 0, 0, 0, 0]}, index=trange + ) tm.assert_frame_equal(df, expected) # in-place @@ -2099,8 +2140,9 @@ def test_loc_duplicates(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_iloc_sparse_propegate_fill_value(self): from pandas.core.sparse.api import SparseDataFrame - df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999) - assert len(df['A'].sp_values) == len(df.iloc[:, 0].sp_values) + + df = SparseDataFrame({"A": [999, 1]}, default_fill_value=999) + assert len(df["A"].sp_values) == len(df.iloc[:, 0].sp_values) def test_iat(self, float_frame): @@ -2115,8 +2157,9 @@ def test_nested_exception(self): # (which may get fixed), it's just a way to trigger # the issue or reraising an outer exception without # a named argument - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], - "c": [7, 8, 9]}).set_index(["a", "b"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) index = list(df.index) index[0] = ["a", "b"] df.index = index @@ -2126,16 +2169,19 @@ def test_nested_exception(self): except Exception as e: assert type(e) != UnboundLocalError - @pytest.mark.parametrize("method,expected_values", [ - ("nearest", [0, 1, 1, 2]), - 
("pad", [np.nan, 0, 1, 1]), - ("backfill", [0, 1, 2, 2]) - ]) + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) def test_reindex_methods(self, method, expected_values): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) - expected = pd.DataFrame({'x': expected_values}, index=target) + expected = pd.DataFrame({"x": expected_values}, index=target) actual = df.reindex(target, method=method) assert_frame_equal(expected, actual) @@ -2158,9 +2204,9 @@ def test_reindex_methods(self, method, expected_values): actual = df.reindex(target[new_order], method=method) assert_frame_equal(e2, actual) - switched_method = ('pad' if method == 'backfill' - else 'backfill' if method == 'pad' - else method) + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) actual = df[::-1].reindex(target, method=switched_method) assert_frame_equal(expected, actual) @@ -2173,49 +2219,49 @@ def test_reindex_methods_nearest_special(self): assert_frame_equal(expected, actual) expected = pd.DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", - tolerance=[0.5, 0.01, 0.4, 0.1]) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) assert_frame_equal(expected, actual) def test_reindex_frame_add_nat(self): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') - df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) result = df.reindex(range(15)) - assert np.issubdtype(result['B'].dtype, np.dtype('M8[ns]')) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) - mask = com.isna(result)['B'] + mask = com.isna(result)["B"] assert mask[-5:].all() assert not mask[:-5].any() def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) - assert x[0].dtype == np.dtype('M8[ns]') + assert x[0].dtype == np.dtype("M8[ns]") def test_non_monotonic_reindex_methods(self): - dr = pd.date_range('2013-08-01', periods=6, freq='B') + dr = pd.date_range("2013-08-01", periods=6, freq="B") data = np.random.randn(6, 1) - df = pd.DataFrame(data, index=dr, columns=list('A')) - df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], - columns=list('A')) + df = pd.DataFrame(data, index=dr, columns=list("A")) + df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) # index is not monotonic increasing or decreasing msg = "index must be monotonic increasing or decreasing" with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='pad') + df_rev.reindex(df.index, method="pad") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='ffill') + df_rev.reindex(df.index, method="ffill") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='bfill') + df_rev.reindex(df.index, method="bfill") with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method='nearest') + df_rev.reindex(df.index, method="nearest") def test_reindex_level(self): from itertools import permutations - icol = ['jim', 'joe', 'jolie'] + + icol = ["jim", "joe", "jolie"] def verify_first_level(df, level, idx, check_index_type=True): def f(val): return np.nonzero((df[level] == val).to_numpy())[0] + i = np.concatenate(list(map(f, idx))) left = 
df.set_index(icol).reindex(idx, level=level) right = df.iloc[i].set_index(icol) @@ -2226,64 +2272,87 @@ def verify(df, level, idx, indexer, check_index_type=True): right = df.iloc[indexer].set_index(icol) assert_frame_equal(left, right, check_index_type=check_index_type) - df = pd.DataFrame({'jim': list('B' * 4 + 'A' * 2 + 'C' * 3), - 'joe': list('abcdeabcd')[::-1], - 'jolie': [10, 20, 30] * 3, - 'joline': np.random.randint(0, 1000, 9)}) - - target = [['C', 'B', 'A'], ['F', 'C', 'A', 'D'], ['A'], - ['A', 'B', 'C'], ['C', 'A', 'B'], ['C', 'B'], ['C', 'A'], - ['A', 'B'], ['B', 'A', 'C']] + df = pd.DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + + target = [ + ["C", "B", "A"], + ["F", "C", "A", "D"], + ["A"], + ["A", "B", "C"], + ["C", "A", "B"], + ["C", "B"], + ["C", "A"], + ["A", "B"], + ["B", "A", "C"], + ] for idx in target: - verify_first_level(df, 'jim', idx) + verify_first_level(df, "jim", idx) # reindex by these causes different MultiIndex levels - for idx in [['D', 'F'], ['A', 'C', 'B']]: - verify_first_level(df, 'jim', idx, check_index_type=False) - - verify(df, 'joe', list('abcde'), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, 'joe', list('abcd'), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, 'joe', list('abc'), [3, 2, 1, 8, 7, 6]) - verify(df, 'joe', list('eca'), [1, 3, 4, 6, 8]) - verify(df, 'joe', list('edc'), [0, 1, 4, 5, 6]) - verify(df, 'joe', list('eadbc'), [3, 0, 2, 1, 4, 5, 8, 7, 6]) - verify(df, 'joe', list('edwq'), [0, 4, 5]) - verify(df, 'joe', list('wq'), [], check_index_type=False) - - df = DataFrame({'jim': ['mid'] * 5 + ['btm'] * 8 + ['top'] * 7, - 'joe': ['3rd'] * 2 + ['1st'] * 3 + ['2nd'] * 3 + - ['1st'] * 2 + ['3rd'] * 3 + ['1st'] * 2 + - ['3rd'] * 3 + ['2nd'] * 2, - # this needs to be jointly unique with jim and joe or - # reindexing will fail ~1.5% of the time, this works - # out to needing unique groups of same size as joe - 'jolie': np.concatenate([ - np.random.choice(1000, x, replace=False) - for x in [2, 3, 3, 2, 3, 2, 3, 2]]), - 'joline': np.random.randn(20).round(3) * 10}) - - for idx in permutations(df['jim'].unique()): + for idx in [["D", "F"], ["A", "C", "B"]]: + verify_first_level(df, "jim", idx, check_index_type=False) + + verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) + verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) + verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) + verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, "joe", list("edwq"), [0, 4, 5]) + verify(df, "joe", list("wq"), [], check_index_type=False) + + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + + for idx in permutations(df["jim"].unique()): for i in range(3): - verify_first_level(df, 'jim', idx[:i + 1]) + verify_first_level(df, "jim", idx[: i + 1]) - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, - 11, 12, 13, 14, 18, 19, 15, 16, 17] - 
verify(df, 'joe', ['1st', '2nd', '3rd'], i) + i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] + verify(df, "joe", ["1st", "2nd", "3rd"], i) - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, - 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, 'joe', ['3rd', '2nd', '1st'], i) + i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] + verify(df, "joe", ["3rd", "2nd", "1st"], i) i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, 'joe', ['2nd', '3rd'], i) + verify(df, "joe", ["2nd", "3rd"], i) i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, 'joe', ['3rd', '1st'], i) + verify(df, "joe", ["3rd", "1st"], i) def test_getitem_ix_float_duplicates(self): - df = pd.DataFrame(np.random.randn(3, 3), - index=[0.1, 0.2, 0.2], columns=list('abc')) + df = pd.DataFrame( + np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") + ) expect = df.iloc[1:] assert_frame_equal(df.loc[0.2], expect) with catch_warnings(record=True): @@ -2291,7 +2360,7 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) df.index = [1, 0.2, 0.2] expect = df.iloc[1:] @@ -2301,10 +2370,11 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) - df = pd.DataFrame(np.random.randn(4, 3), - index=[1, 0.2, 0.2, 1], columns=list('abc')) + df = pd.DataFrame( + np.random.randn(4, 3), index=[1, 0.2, 0.2, 1], columns=list("abc") + ) expect = df.iloc[1:-1] assert_frame_equal(df.loc[0.2], expect) with catch_warnings(record=True): @@ -2312,7 +2382,7 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[1:-1, 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) df.index = [0.1, 0.2, 2, 0.2] expect = df.iloc[[1, -1]] @@ -2322,96 +2392,96 @@ def test_getitem_ix_float_duplicates(self): assert_frame_equal(df.ix[0.2], expect) expect = df.iloc[[1, -1], 0] - assert_series_equal(df.loc[0.2, 'a'], expect) + assert_series_equal(df.loc[0.2, "a"], expect) def test_getitem_sparse_column(self): # https://github.com/pandas-dev/pandas/issues/23559 data = pd.SparseArray([0, 1]) df = pd.DataFrame({"A": data}) expected = pd.Series(data, name="A") - result = df['A'] + result = df["A"] tm.assert_series_equal(result, expected) result = df.iloc[:, 0] tm.assert_series_equal(result, expected) - result = df.loc[:, 'A'] + result = df.loc[:, "A"] tm.assert_series_equal(result, expected) def test_setitem_with_sparse_value(self): # GH8131 - df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_array = pd.SparseArray([0, 0, 1]) - df['new_column'] = sp_array - assert_series_equal(df['new_column'], - pd.Series(sp_array, name='new_column'), - check_names=False) + df["new_column"] = sp_array + assert_series_equal( + df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False + ) def test_setitem_with_unaligned_sparse_value(self): - df = pd.DataFrame({'c_1': ['a', 'b', 'c'], 'n_1': [1., 2., 3.]}) + df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) - df['new_column'] = sp_series - exp = pd.Series(pd.SparseArray([1, 0, 0]), 
name='new_column') - assert_series_equal(df['new_column'], exp) + df["new_column"] = sp_series + exp = pd.Series(pd.SparseArray([1, 0, 0]), name="new_column") + assert_series_equal(df["new_column"], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. # Make sure timezone isn't lost - column = pd.Series(pd.date_range('2015-01-01', periods=3, tz='utc'), - name='dates') - df = pd.DataFrame({'dates': column}) - df['dates'] = column[[1, 0, 2]] - assert_series_equal(df['dates'], column) + column = pd.Series( + pd.date_range("2015-01-01", periods=3, tz="utc"), name="dates" + ) + df = pd.DataFrame({"dates": column}) + df["dates"] = column[[1, 0, 2]] + assert_series_equal(df["dates"], column) - df = pd.DataFrame({'dates': column}) - df.loc[[0, 1, 2], 'dates'] = column[[1, 0, 2]] - assert_series_equal(df['dates'], column) + df = pd.DataFrame({"dates": column}) + df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] + assert_series_equal(df["dates"], column) def test_setitem_datetime_coercion(self): # gh-1048 - df = pd.DataFrame({'c': [pd.Timestamp('2010-10-01')] * 3}) - df.loc[0:1, 'c'] = np.datetime64('2008-08-08') - assert pd.Timestamp('2008-08-08') == df.loc[0, 'c'] - assert pd.Timestamp('2008-08-08') == df.loc[1, 'c'] - df.loc[2, 'c'] = date(2005, 5, 5) - assert pd.Timestamp('2005-05-05') == df.loc[2, 'c'] + df = pd.DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) + df.loc[0:1, "c"] = np.datetime64("2008-08-08") + assert pd.Timestamp("2008-08-08") == df.loc[0, "c"] + assert pd.Timestamp("2008-08-08") == df.loc[1, "c"] + df.loc[2, "c"] = date(2005, 5, 5) + assert pd.Timestamp("2005-05-05") == df.loc[2, "c"] def test_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT one_hour = timedelta(hours=1) - df = DataFrame(index=date_range('20130101', periods=4)) - df['A'] = np.array([1 * one_hour] * 4, dtype='m8[ns]') - df.loc[:, 'B'] = np.array([2 * one_hour] * 4, dtype='m8[ns]') - df.loc[:3, 'C'] = np.array([3 * one_hour] * 3, dtype='m8[ns]') - df.loc[:, 'D'] = np.array([4 * one_hour] * 4, dtype='m8[ns]') - df.loc[df.index[:3], 'E'] = np.array([5 * one_hour] * 3, - dtype='m8[ns]') - df['F'] = np.timedelta64('NaT') - df.loc[df.index[:-1], 'F'] = np.array([6 * one_hour] * 3, - dtype='m8[ns]') - df.loc[df.index[-3]:, 'G'] = date_range('20130101', periods=3) - df['H'] = np.datetime64('NaT') + df = DataFrame(index=date_range("20130101", periods=4)) + df["A"] = np.array([1 * one_hour] * 4, dtype="m8[ns]") + df.loc[:, "B"] = np.array([2 * one_hour] * 4, dtype="m8[ns]") + df.loc[:3, "C"] = np.array([3 * one_hour] * 3, dtype="m8[ns]") + df.loc[:, "D"] = np.array([4 * one_hour] * 4, dtype="m8[ns]") + df.loc[df.index[:3], "E"] = np.array([5 * one_hour] * 3, dtype="m8[ns]") + df["F"] = np.timedelta64("NaT") + df.loc[df.index[:-1], "F"] = np.array([6 * one_hour] * 3, dtype="m8[ns]") + df.loc[df.index[-3] :, "G"] = date_range("20130101", periods=3) + df["H"] = np.datetime64("NaT") result = df.dtypes - expected = Series([np.dtype('timedelta64[ns]')] * 6 + - [np.dtype('datetime64[ns]')] * 2, - index=list('ABCDEFGH')) + expected = Series( + [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, + index=list("ABCDEFGH"), + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('idxer', ['var', ['var']]) + @pytest.mark.parametrize("idxer", ["var", ["var"]]) def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH 11365 tz = tz_naive_fixture - idx = 
date_range(start='2015-07-12', periods=3, freq='H', tz=tz) - expected = DataFrame(1.2, index=idx, columns=['var']) - result = DataFrame(index=idx, columns=['var']) + idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) + expected = DataFrame(1.2, index=idx, columns=["var"]) + result = DataFrame(index=idx, columns=["var"]) result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) def test_at_time_between_time_datetimeindex(self): - index = date_range("2012-01-01", "2012-01-05", freq='30min') + index = date_range("2012-01-01", "2012-01-05", freq="30min") df = DataFrame(np.random.randn(len(index), 5), index=index) akey = time(12, 0, 0) bkey = slice(time(13, 0, 0), time(14, 0, 0)) @@ -2466,86 +2536,84 @@ def test_xs(self, float_frame, datetime_frame): assert value == float_frame[item][idx] # mixed-type xs - test_data = { - 'A': {'1': 1, '2': 2}, - 'B': {'1': '1', '2': '2', '3': '3'}, - } + test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data) - xs = frame.xs('1') + xs = frame.xs("1") assert xs.dtype == np.object_ - assert xs['A'] == 1 - assert xs['B'] == '1' + assert xs["A"] == 1 + assert xs["B"] == "1" with pytest.raises(KeyError): datetime_frame.xs(datetime_frame.index[0] - BDay()) # xs get column - series = float_frame.xs('A', axis=1) - expected = float_frame['A'] + series = float_frame.xs("A", axis=1) + expected = float_frame["A"] assert_series_equal(series, expected) # view is returned if possible - series = float_frame.xs('A', axis=1) + series = float_frame.xs("A", axis=1) series[:] = 5 assert (expected == 5).all() def test_xs_corner(self): # pathological mixed-type reordering case df = DataFrame(index=[0]) - df['A'] = 1. - df['B'] = 'foo' - df['C'] = 2. - df['D'] = 'bar' - df['E'] = 3. 
+ df["A"] = 1.0 + df["B"] = "foo" + df["C"] = 2.0 + df["D"] = "bar" + df["E"] = 3.0 xs = df.xs(0) - exp = pd.Series([1., 'foo', 2., 'bar', 3.], - index=list('ABCDE'), name=0) + exp = pd.Series([1.0, "foo", 2.0, "bar", 3.0], index=list("ABCDE"), name=0) tm.assert_series_equal(xs, exp) # no columns but Index(dtype=object) - df = DataFrame(index=['a', 'b', 'c']) - result = df.xs('a') - expected = Series([], name='a', index=pd.Index([], dtype=object)) + df = DataFrame(index=["a", "b", "c"]) + result = df.xs("a") + expected = Series([], name="a", index=pd.Index([], dtype=object)) assert_series_equal(result, expected) def test_xs_duplicates(self): - df = DataFrame(np.random.randn(5, 2), index=['b', 'b', 'c', 'b', 'a']) + df = DataFrame(np.random.randn(5, 2), index=["b", "b", "c", "b", "a"]) - cross = df.xs('c') + cross = df.xs("c") exp = df.iloc[2] assert_series_equal(cross, exp) def test_xs_keep_level(self): - df = (DataFrame({'day': {0: 'sat', 1: 'sun'}, - 'flavour': {0: 'strawberry', 1: 'strawberry'}, - 'sales': {0: 10, 1: 12}, - 'year': {0: 2008, 1: 2008}}) - .set_index(['year', 'flavour', 'day'])) - result = df.xs('sat', level='day', drop_level=False) + df = DataFrame( + { + "day": {0: "sat", 1: "sun"}, + "flavour": {0: "strawberry", 1: "strawberry"}, + "sales": {0: 10, 1: 12}, + "year": {0: 2008, 1: 2008}, + } + ).set_index(["year", "flavour", "day"]) + result = df.xs("sat", level="day", drop_level=False) expected = df[:1] assert_frame_equal(result, expected) - result = df.xs([2008, 'sat'], level=['year', 'day'], drop_level=False) + result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) assert_frame_equal(result, expected) def test_xs_view(self): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent - dm = DataFrame(np.arange(20.).reshape(4, 5), - index=range(4), columns=range(5)) + dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) dm.xs(2)[:] = 10 assert (dm.xs(2) == 10).all() def test_index_namedtuple(self): from collections import namedtuple + IndexType = namedtuple("IndexType", ["a", "b"]) idx1 = IndexType("foo", "bar") idx2 = IndexType("baz", "bof") - index = Index([idx1, idx2], - name="composite_index", tupleize_cols=False) + index = Index([idx1, idx2], name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) with catch_warnings(record=True): @@ -2558,36 +2626,56 @@ def test_index_namedtuple(self): def test_boolean_indexing(self): idx = list(range(3)) - cols = ['A', 'B', 'C'] - df1 = DataFrame(index=idx, columns=cols, - data=np.array([[0.0, 0.5, 1.0], - [1.5, 2.0, 2.5], - [3.0, 3.5, 4.0]], - dtype=float)) - df2 = DataFrame(index=idx, columns=cols, - data=np.ones((len(idx), len(cols)))) - - expected = DataFrame(index=idx, columns=cols, - data=np.array([[0.0, 0.5, 1.0], - [1.5, 2.0, -1], - [-1, -1, -1]], dtype=float)) + cols = ["A", "B", "C"] + df1 = DataFrame( + index=idx, + columns=cols, + data=np.array( + [[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float + ), + ) + df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) + + expected = DataFrame( + index=idx, + columns=cols, + data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float), + ) df1[df1 > 2.0 * df2] = -1 assert_frame_equal(df1, expected) - with pytest.raises(ValueError, match='Item wrong length'): + with pytest.raises(ValueError, match="Item wrong length"): df1[df1.index[:-1] > 2] = -1 def test_boolean_indexing_mixed(self): - df = 
DataFrame({ - 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, - 1: {35: np.nan, 40: 0.32632316859446198, 43: np.nan, - 49: 0.32632316859446198, 50: 0.39114724480578139}, - 2: {35: np.nan, 40: np.nan, 43: 0.29012581014105987, 49: np.nan, - 50: np.nan}, - 3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, - 4: {35: 0.34215328467153283, 40: np.nan, 43: np.nan, 49: np.nan, - 50: np.nan}, - 'y': {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}}) + df = DataFrame( + { + 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 1: { + 35: np.nan, + 40: 0.32632316859446198, + 43: np.nan, + 49: 0.32632316859446198, + 50: 0.39114724480578139, + }, + 2: { + 35: np.nan, + 40: np.nan, + 43: 0.29012581014105987, + 49: np.nan, + 50: np.nan, + }, + 3: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, + 4: { + 35: 0.34215328467153283, + 40: np.nan, + 43: np.nan, + 49: np.nan, + 50: np.nan, + }, + "y": {35: 0, 40: 0, 43: 0, 49: 0, 50: 1}, + } + ) # mixed int/float ok df2 = df.copy() @@ -2599,33 +2687,33 @@ def test_boolean_indexing_mixed(self): expected.loc[35, 4] = 1 assert_frame_equal(df2, expected) - df['foo'] = 'test' + df["foo"] = "test" msg = "not supported between instances|unorderable types" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 - def test_where(self, float_string_frame, mixed_float_frame, - mixed_int_frame): - default_frame = DataFrame(np.random.randn(5, 3), - columns=['A', 'B', 'C']) + def test_where(self, float_string_frame, mixed_float_frame, mixed_int_frame): + default_frame = DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) def _safe_add(df): # only add to the numeric items def is_ok(s): - return (issubclass(s.dtype.type, (np.integer, np.floating)) and - s.dtype != 'uint8') + return ( + issubclass(s.dtype.type, (np.integer, np.floating)) + and s.dtype != "uint8" + ) - return DataFrame(dict((c, s + 1) if is_ok(s) else (c, s) - for c, s in df.items())) + return DataFrame( + dict((c, s + 1) if is_ok(s) else (c, s) for c, s in df.items()) + ) def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) rs = df.where(cond, other1) rs2 = df.where(cond.values, other1) for k, v in rs.iteritems(): - exp = Series( - np.where(cond[k], df[k], other1[k]), index=v.index) + exp = Series(np.where(cond[k], df[k], other1[k]), index=v.index) assert_series_equal(v, exp, check_names=False) assert_frame_equal(rs, rs2) @@ -2634,8 +2722,12 @@ def _check_get(df, cond, check_dtypes=True): assert (rs.dtypes == df.dtypes).all() # check getting - for df in [default_frame, float_string_frame, - mixed_float_frame, mixed_int_frame]: + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: with pytest.raises(TypeError): df > 0 @@ -2644,16 +2736,23 @@ def _check_get(df, cond, check_dtypes=True): _check_get(df, cond) # upcasting case (GH # 2794) - df = DataFrame({c: Series([1] * 3, dtype=c) - for c in ['float32', 'float64', - 'int32', 'int64']}) + df = DataFrame( + { + c: Series([1] * 3, dtype=c) + for c in ["float32", "float64", "int32", "int64"] + } + ) df.iloc[1, :] = 0 result = df.dtypes - expected = Series([np.dtype('float32'), - np.dtype('float64'), - np.dtype('int32'), - np.dtype('int64')], - index=['float32', 'float64', 'int32', 'int64']) + expected = Series( + [ + np.dtype("float32"), + np.dtype("float64"), + np.dtype("int32"), + np.dtype("int64"), + ], + index=["float32", "float64", "int32", "int64"], + ) # when we don't preserve boolean casts # @@ -2706,8 +2805,7 @@ def 
_check_align(df, cond, other, check_dtypes=True): # integers are upcast, so don't check the dtypes cond = df > 0 - check_dtypes = all(not issubclass(s.type, np.integer) - for s in df.dtypes) + check_dtypes = all(not issubclass(s.type, np.integer) for s in df.dtypes) _check_align(df, cond, np.nan, check_dtypes=check_dtypes) # invalid conditions @@ -2741,11 +2839,15 @@ def _check_set(df, cond, check_dtypes=True): if check_dtypes: for k, v in df.dtypes.items(): if issubclass(v.type, np.integer) and not cond[k].all(): - v = np.dtype('float64') + v = np.dtype("float64") assert dfi[k].dtype == v - for df in [default_frame, float_string_frame, mixed_float_frame, - mixed_int_frame]: + for df in [ + default_frame, + float_string_frame, + mixed_float_frame, + mixed_int_frame, + ]: if df is float_string_frame: with pytest.raises(TypeError): df > 0 @@ -2763,9 +2865,9 @@ def _check_set(df, cond, check_dtypes=True): # GH 10218 # test DataFrame.where with Series slicing - df = DataFrame({'a': range(3), 'b': range(4, 7)}) - result = df.where(df['a'] == 1) - expected = df[df['a'] == 1].reindex(df.index) + df = DataFrame({"a": range(3), "b": range(4, 7)}) + result = df.where(df["a"] == 1) + expected = df[df["a"] == 1].reindex(df.index) assert_frame_equal(result, expected) @pytest.mark.parametrize("klass", [list, tuple, np.array]) @@ -2785,14 +2887,16 @@ def test_where_array_like(self, klass): result = df.where(klass(cond)) assert_frame_equal(result, expected) - @pytest.mark.parametrize("cond", [ - [[1], [0], [1]], - Series([[2], [5], [7]]), - DataFrame({"a": [2, 5, 7]}), - [["True"], ["False"], ["True"]], - [[Timestamp("2017-01-01")], - [pd.NaT], [Timestamp("2017-01-02")]] - ]) + @pytest.mark.parametrize( + "cond", + [ + [[1], [0], [1]], + Series([[2], [5], [7]]), + DataFrame({"a": [2, 5, 7]}), + [["True"], ["False"], ["True"]], + [[Timestamp("2017-01-01")], [pd.NaT], [Timestamp("2017-01-02")]], + ], + ) def test_where_invalid_input_single(self, cond): # see gh-15414: only boolean arrays accepted df = DataFrame({"a": [1, 2, 3]}) @@ -2801,16 +2905,20 @@ def test_where_invalid_input_single(self, cond): with pytest.raises(ValueError, match=msg): df.where(cond) - @pytest.mark.parametrize("cond", [ - [[0, 1], [1, 0], [1, 1]], - Series([[0, 2], [5, 0], [4, 7]]), - [["False", "True"], ["True", "False"], - ["True", "True"]], - DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), - [[pd.NaT, Timestamp("2017-01-01")], - [Timestamp("2017-01-02"), pd.NaT], - [Timestamp("2017-01-03"), Timestamp("2017-01-03")]] - ]) + @pytest.mark.parametrize( + "cond", + [ + [[0, 1], [1, 0], [1, 1]], + Series([[0, 2], [5, 0], [4, 7]]), + [["False", "True"], ["True", "False"], ["True", "True"]], + DataFrame({"a": [2, 5, 7], "b": [4, 8, 9]}), + [ + [pd.NaT, Timestamp("2017-01-01")], + [Timestamp("2017-01-02"), pd.NaT], + [Timestamp("2017-01-03"), Timestamp("2017-01-03")], + ], + ], + ) def test_where_invalid_input_multiple(self, cond): # see gh-15414: only boolean arrays accepted df = DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]}) @@ -2857,10 +2965,13 @@ def test_where_ndframe_align(self): def test_where_bug(self): # see gh-2793 - df = DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [ - 4.0, 3.0, 2.0, 1.0]}, dtype='float64') - expected = DataFrame({'a': [np.nan, np.nan, 3.0, 4.0], 'b': [ - 4.0, 3.0, np.nan, np.nan]}, dtype='float64') + df = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [4.0, 3.0, 2.0, 1.0]}, dtype="float64" + ) + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) result = 
df.where(df > 2, np.nan) assert_frame_equal(result, expected) @@ -2870,13 +2981,17 @@ def test_where_bug(self): def test_where_bug_mixed(self, sint_dtype): # see gh-2793 - df = DataFrame({"a": np.array([1, 2, 3, 4], dtype=sint_dtype), - "b": np.array([4.0, 3.0, 2.0, 1.0], - dtype="float64")}) + df = DataFrame( + { + "a": np.array([1, 2, 3, 4], dtype=sint_dtype), + "b": np.array([4.0, 3.0, 2.0, 1.0], dtype="float64"), + } + ) - expected = DataFrame({"a": [np.nan, np.nan, 3.0, 4.0], - "b": [4.0, 3.0, np.nan, np.nan]}, - dtype="float64") + expected = DataFrame( + {"a": [np.nan, np.nan, 3.0, 4.0], "b": [4.0, 3.0, np.nan, np.nan]}, + dtype="float64", + ) result = df.where(df > 2, np.nan) assert_frame_equal(result, expected) @@ -2910,9 +3025,13 @@ def test_where_bug_transposition(self): def test_where_datetime(self): # GH 3311 - df = DataFrame(dict(A=date_range('20130102', periods=5), - B=date_range('20130104', periods=5), - C=np.random.randn(5))) + df = DataFrame( + dict( + A=date_range("20130102", periods=5), + B=date_range("20130104", periods=5), + C=np.random.randn(5), + ) + ) stamp = datetime(2013, 1, 3) with pytest.raises(TypeError): @@ -2921,23 +3040,28 @@ def test_where_datetime(self): result = df[df.iloc[:, :-1] > stamp] expected = df.copy() - expected.loc[[0, 1], 'A'] = np.nan - expected.loc[:, 'C'] = np.nan + expected.loc[[0, 1], "A"] = np.nan + expected.loc[:, "C"] = np.nan assert_frame_equal(result, expected) def test_where_none(self): # GH 4667 # setting with None changes dtype - df = DataFrame({'series': Series(range(10))}).astype(float) + df = DataFrame({"series": Series(range(10))}).astype(float) df[df > 7] = None expected = DataFrame( - {'series': Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])}) + {"series": Series([0, 1, 2, 3, 4, 5, 6, 7, np.nan, np.nan])} + ) assert_frame_equal(df, expected) # GH 7656 - df = DataFrame([{'A': 1, 'B': np.nan, 'C': 'Test'}, { - 'A': np.nan, 'B': 'Test', 'C': np.nan}]) - msg = 'boolean setting on mixed-type' + df = DataFrame( + [ + {"A": 1, "B": np.nan, "C": "Test"}, + {"A": np.nan, "B": "Test", "C": np.nan}, + ] + ) + msg = "boolean setting on mixed-type" with pytest.raises(TypeError, match=msg): df.where(~isna(df), None, inplace=True) @@ -2951,7 +3075,6 @@ def test_where_empty_df_and_empty_cond_having_non_bool_dtypes(self): tm.assert_frame_equal(result, df) def test_where_align(self): - def create(): df = DataFrame(np.random.randn(10, 3)) df.iloc[3:5, 0] = np.nan @@ -2962,31 +3085,31 @@ def create(): # series df = create() expected = df.fillna(df.mean()) - result = df.where(pd.notna(df), df.mean(), axis='columns') + result = df.where(pd.notna(df), df.mean(), axis="columns") assert_frame_equal(result, expected) - df.where(pd.notna(df), df.mean(), inplace=True, axis='columns') + df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") assert_frame_equal(df, expected) df = create().fillna(0) expected = df.apply(lambda x, y: x.where(x > 0, y), y=df[0]) - result = df.where(df > 0, df[0], axis='index') + result = df.where(df > 0, df[0], axis="index") assert_frame_equal(result, expected) - result = df.where(df > 0, df[0], axis='rows') + result = df.where(df > 0, df[0], axis="rows") assert_frame_equal(result, expected) # frame df = create() expected = df.fillna(1) - result = df.where(pd.notna(df), DataFrame( - 1, index=df.index, columns=df.columns)) + result = df.where( + pd.notna(df), DataFrame(1, index=df.index, columns=df.columns) + ) assert_frame_equal(result, expected) def test_where_complex(self): # GH 6345 - expected = DataFrame( - [[1 
+ 1j, 2], [np.nan, 4 + 1j]], columns=['a', 'b']) - df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=['a', 'b']) + expected = DataFrame([[1 + 1j, 2], [np.nan, 4 + 1j]], columns=["a", "b"]) + df = DataFrame([[1 + 1j, 2], [5 + 1j, 4 + 1j]], columns=["a", "b"]) df[df.abs() >= 5] = np.nan assert_frame_equal(df, expected) @@ -2996,72 +3119,80 @@ def test_where_axis(self): mask = DataFrame([[False, False], [False, False]]) s = Series([0, 1]) - expected = DataFrame([[0, 0], [1, 1]], dtype='float64') - result = df.where(mask, s, axis='index') + expected = DataFrame([[0, 0], [1, 1]], dtype="float64") + result = df.where(mask, s, axis="index") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='index', inplace=True) + result.where(mask, s, axis="index", inplace=True) assert_frame_equal(result, expected) - expected = DataFrame([[0, 1], [0, 1]], dtype='float64') - result = df.where(mask, s, axis='columns') + expected = DataFrame([[0, 1], [0, 1]], dtype="float64") + result = df.where(mask, s, axis="columns") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='columns', inplace=True) + result.where(mask, s, axis="columns", inplace=True) assert_frame_equal(result, expected) # Upcast needed - df = DataFrame([[1, 2], [3, 4]], dtype='int64') + df = DataFrame([[1, 2], [3, 4]], dtype="int64") mask = DataFrame([[False, False], [False, False]]) s = Series([0, np.nan]) - expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype='float64') - result = df.where(mask, s, axis='index') + expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") + result = df.where(mask, s, axis="index") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis='index', inplace=True) + result.where(mask, s, axis="index", inplace=True) assert_frame_equal(result, expected) expected = DataFrame([[0, np.nan], [0, np.nan]]) - result = df.where(mask, s, axis='columns') + result = df.where(mask, s, axis="columns") assert_frame_equal(result, expected) - expected = DataFrame({0: np.array([0, 0], dtype='int64'), - 1: np.array([np.nan, np.nan], dtype='float64')}) + expected = DataFrame( + { + 0: np.array([0, 0], dtype="int64"), + 1: np.array([np.nan, np.nan], dtype="float64"), + } + ) result = df.copy() - result.where(mask, s, axis='columns', inplace=True) + result.where(mask, s, axis="columns", inplace=True) assert_frame_equal(result, expected) # Multiple dtypes (=> multiple Blocks) - df = pd.concat([ - DataFrame(np.random.randn(10, 2)), - DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype='int64')], - ignore_index=True, axis=1) + df = pd.concat( + [ + DataFrame(np.random.randn(10, 2)), + DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype="int64"), + ], + ignore_index=True, + axis=1, + ) mask = DataFrame(False, columns=df.columns, index=df.index) s1 = Series(1, index=df.columns) s2 = Series(2, index=df.index) - result = df.where(mask, s1, axis='columns') + result = df.where(mask, s1, axis="columns") expected = DataFrame(1.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype('int64') - expected[3] = expected[3].astype('int64') + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s1, axis='columns', inplace=True) + result.where(mask, s1, axis="columns", inplace=True) assert_frame_equal(result, expected) - result = df.where(mask, s2, axis='index') + result = df.where(mask, s2, axis="index") expected = 
DataFrame(2.0, columns=df.columns, index=df.index) - expected[2] = expected[2].astype('int64') - expected[3] = expected[3].astype('int64') + expected[2] = expected[2].astype("int64") + expected[3] = expected[3].astype("int64") assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s2, axis='index', inplace=True) + result.where(mask, s2, axis="index", inplace=True) assert_frame_equal(result, expected) # DataFrame vs DataFrame @@ -3071,13 +3202,13 @@ def test_where_axis(self): result = df.where(mask, d1) assert_frame_equal(result, expected) - result = df.where(mask, d1, axis='index') + result = df.where(mask, d1, axis="index") assert_frame_equal(result, expected) result = df.copy() result.where(mask, d1, inplace=True) assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d1, inplace=True, axis='index') + result.where(mask, d1, inplace=True, axis="index") assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) @@ -3086,13 +3217,13 @@ def test_where_axis(self): result = df.where(mask, d2) assert_frame_equal(result, expected) - result = df.where(mask, d2, axis='columns') + result = df.where(mask, d2, axis="columns") assert_frame_equal(result, expected) result = df.copy() result.where(mask, d2, inplace=True) assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d2, inplace=True, axis='columns') + result.where(mask, d2, inplace=True, axis="columns") assert_frame_equal(result, expected) def test_where_callable(self): @@ -3113,20 +3244,22 @@ def test_where_callable(self): result = (df + 2).where(lambda x: x > 8, lambda x: x + 10) exp = DataFrame([[13, 14, 15], [16, 17, 18], [9, 10, 11]]) tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, - (df + 2).where((df + 2) > 8, (df + 2) + 10)) + tm.assert_frame_equal(result, (df + 2).where((df + 2) > 8, (df + 2) + 10)) def test_where_tz_values(self, tz_naive_fixture): - df1 = DataFrame(DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture), - columns=['date']) - df2 = DataFrame(DatetimeIndex(['20150103', '20150104', '20150105'], - tz=tz_naive_fixture), - columns=['date']) - mask = DataFrame([True, True, False], columns=['date']) - exp = DataFrame(DatetimeIndex(['20150101', '20150102', '20150105'], - tz=tz_naive_fixture), - columns=['date']) + df1 = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + columns=["date"], + ) + df2 = DataFrame( + DatetimeIndex(["20150103", "20150104", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) + mask = DataFrame([True, True, False], columns=["date"]) + exp = DataFrame( + DatetimeIndex(["20150101", "20150102", "20150105"], tz=tz_naive_fixture), + columns=["date"], + ) result = df1.where(mask, df2) assert_frame_equal(exp, result) @@ -3191,8 +3324,7 @@ def test_mask_callable(self): result = (df + 2).mask(lambda x: x > 8, lambda x: x + 10) exp = DataFrame([[3, 4, 5], [6, 7, 8], [19, 20, 21]]) tm.assert_frame_equal(result, exp) - tm.assert_frame_equal(result, - (df + 2).mask((df + 2) > 8, (df + 2) + 10)) + tm.assert_frame_equal(result, (df + 2).mask((df + 2) > 8, (df + 2) + 10)) def test_head_tail(self, float_frame): assert_frame_equal(float_frame.head(), float_frame[:5]) @@ -3221,69 +3353,67 @@ def test_head_tail(self, float_frame): def test_type_error_multiindex(self): # See gh-12218 - df = DataFrame(columns=['i', 'c', 'x', 'y'], - data=[[0, 0, 1, 2], [1, 0, 3, 4], - [0, 1, 1, 2], [1, 1, 3, 4]]) - dg = df.pivot_table(index='i', columns='c', - values=['x', 'y']) + df 
= DataFrame( + columns=["i", "c", "x", "y"], + data=[[0, 0, 1, 2], [1, 0, 3, 4], [0, 1, 1, 2], [1, 1, 3, 4]], + ) + dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) with pytest.raises(TypeError, match="is an invalid key"): str(dg[:, 0]) - index = Index(range(2), name='i') - columns = MultiIndex(levels=[['x', 'y'], [0, 1]], - codes=[[0, 1], [0, 0]], - names=[None, 'c']) + index = Index(range(2), name="i") + columns = MultiIndex( + levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"] + ) expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index) result = dg.loc[:, (slice(None), 0)] assert_frame_equal(result, expected) - name = ('x', 0) - index = Index(range(2), name='i') + name = ("x", 0) + index = Index(range(2), name="i") expected = Series([1, 3], index=index, name=name) - result = dg['x', 0] + result = dg["x", 0] assert_series_equal(result, expected) def test_interval_index(self): # GH 19977 index = pd.interval_range(start=0, periods=3) - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=index, - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=index, columns=["A", "B", "C"] + ) expected = 1 - result = df.loc[0.5, 'A'] + result = df.loc[0.5, "A"] assert_almost_equal(result, expected) - index = pd.interval_range(start=0, periods=3, closed='both') - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=index, - columns=['A', 'B', 'C']) + index = pd.interval_range(start=0, periods=3, closed="both") + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=index, columns=["A", "B", "C"] + ) - index_exp = pd.interval_range(start=0, periods=2, - freq=1, closed='both') - expected = pd.Series([1, 4], index=index_exp, name='A') - result = df.loc[1, 'A'] + index_exp = pd.interval_range(start=0, periods=2, freq=1, closed="both") + expected = pd.Series([1, 4], index=index_exp, name="A") + result = df.loc[1, "A"] assert_series_equal(result, expected) class TestDataFrameIndexingDatetimeWithTZ(TestData): - def test_setitem(self, timezone_frame): df = timezone_frame - idx = df['B'].rename('foo') + idx = df["B"].rename("foo") # setitem - df['C'] = idx - assert_series_equal(df['C'], Series(idx, name='C')) + df["C"] = idx + assert_series_equal(df["C"], Series(idx, name="C")) - df['D'] = 'foo' - df['D'] = idx - assert_series_equal(df['D'], Series(idx, name='D')) - del df['D'] + df["D"] = "foo" + df["D"] = idx + assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] # assert that A & C are not sharing the same base (e.g. 
they # are copies) @@ -3296,126 +3426,132 @@ def test_setitem(self, timezone_frame): df2 = df.copy() df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2['B'] - assert_series_equal(notna(result), Series( - [True, False, True], name='B')) + result = df2["B"] + assert_series_equal(notna(result), Series([True, False, True], name="B")) assert_series_equal(df2.dtypes, df.dtypes) def test_set_reset(self): - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") # set/reset - df = DataFrame({'A': [0, 1, 2]}, index=idx) + df = DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result['foo'].dtype, 'M8[ns, US/Eastern' + assert result["foo"].dtype, "M8[ns, US/Eastern" - df = result.set_index('foo') + df = result.set_index("foo") tm.assert_index_equal(df.index, idx) def test_transpose(self, timezone_frame): result = timezone_frame.T expected = DataFrame(timezone_frame.values.T) - expected.index = ['A', 'B', 'C'] + expected.index = ["A", "B", "C"] assert_frame_equal(result, expected) def test_scalar_assignment(self): # issue #19843 df = pd.DataFrame(index=(0, 1, 2)) - df['now'] = pd.Timestamp('20130101', tz='UTC') + df["now"] = pd.Timestamp("20130101", tz="UTC") expected = pd.DataFrame( - {'now': pd.Timestamp('20130101', tz='UTC')}, index=[0, 1, 2]) + {"now": pd.Timestamp("20130101", tz="UTC")}, index=[0, 1, 2] + ) tm.assert_frame_equal(df, expected) class TestDataFrameIndexingUInt64(TestData): - def test_setitem(self, uint64_frame): df = uint64_frame - idx = df['A'].rename('foo') + idx = df["A"].rename("foo") # setitem - df['C'] = idx - assert_series_equal(df['C'], Series(idx, name='C')) + df["C"] = idx + assert_series_equal(df["C"], Series(idx, name="C")) - df['D'] = 'foo' - df['D'] = idx - assert_series_equal(df['D'], Series(idx, name='D')) - del df['D'] + df["D"] = "foo" + df["D"] = idx + assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] # With NaN: because uint64 has no NaN element, # the column should be cast to object. 
df2 = df.copy() df2.iloc[1, 1] = pd.NaT df2.iloc[1, 2] = pd.NaT - result = df2['B'] - assert_series_equal(notna(result), Series( - [True, False, True], name='B')) - assert_series_equal(df2.dtypes, Series([np.dtype('uint64'), - np.dtype('O'), np.dtype('O')], - index=['A', 'B', 'C'])) + result = df2["B"] + assert_series_equal(notna(result), Series([True, False, True], name="B")) + assert_series_equal( + df2.dtypes, + Series( + [np.dtype("uint64"), np.dtype("O"), np.dtype("O")], + index=["A", "B", "C"], + ), + ) def test_set_reset(self): - idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo') + idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") # set/reset - df = DataFrame({'A': [0, 1, 2]}, index=idx) + df = DataFrame({"A": [0, 1, 2]}, index=idx) result = df.reset_index() - assert result['foo'].dtype == np.dtype('uint64') + assert result["foo"].dtype == np.dtype("uint64") - df = result.set_index('foo') + df = result.set_index("foo") tm.assert_index_equal(df.index, idx) def test_transpose(self, uint64_frame): result = uint64_frame.T expected = DataFrame(uint64_frame.values.T) - expected.index = ['A', 'B'] + expected.index = ["A", "B"] assert_frame_equal(result, expected) class TestDataFrameIndexingCategorical: - def test_assignment(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) - labels = Categorical(["{0} - {1}".format(i, i + 499) - for i in range(0, 10000, 500)]) - - df = df.sort_values(by=['value'], ascending=True) + df = DataFrame( + {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")} + ) + labels = Categorical( + ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + ) + + df = df.sort_values(by=["value"], ascending=True) s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) d = s.values - df['D'] = d + df["D"] = d str(df) result = df.dtypes expected = Series( - [np.dtype('int32'), CategoricalDtype(categories=labels, - ordered=False)], - index=['value', 'D']) + [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)], + index=["value", "D"], + ) tm.assert_series_equal(result, expected) - df['E'] = s + df["E"] = s str(df) result = df.dtypes - expected = Series([np.dtype('int32'), - CategoricalDtype(categories=labels, ordered=False), - CategoricalDtype(categories=labels, ordered=False)], - index=['value', 'D', 'E']) + expected = Series( + [ + np.dtype("int32"), + CategoricalDtype(categories=labels, ordered=False), + CategoricalDtype(categories=labels, ordered=False), + ], + index=["value", "D", "E"], + ) tm.assert_series_equal(result, expected) - result1 = df['D'] - result2 = df['E'] + result1 = df["D"] + result2 = df["E"] tm.assert_categorical_equal(result1._data._block.values, d) # sorting - s.name = 'E' + s.name = "E" tm.assert_series_equal(result2.sort_index(), s.sort_index()) cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) @@ -3438,44 +3574,37 @@ def test_assigning_ops(self): # assign a part of a column with dtype != categorical -> # exp_parts_cats_col - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], - categories=["a", "b"]) + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], - categories=["a", "b"]) + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], 
categories=["a", "b"]) idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, - "values": values1}, index=idx1) + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], - categories=["a", "b"]) + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, - "values": values2}, index=idx2) + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) # changed part of the cats column - cats3 = Categorical( - ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) values3 = [1, 1, 1, 1, 1, 1, 1] - exp_parts_cats_col = DataFrame({"cats": cats3, - "values": values3}, index=idx3) + exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) # changed single value in cats col - cats4 = Categorical( - ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) values4 = [1, 1, 1, 1, 1, 1, 1] - exp_single_cats_value = DataFrame({"cats": cats4, - "values": values4}, index=idx4) + exp_single_cats_value = DataFrame( + {"cats": cats4, "values": values4}, index=idx4 + ) # iloc # ############### @@ -3521,12 +3650,12 @@ def test_assigning_ops(self): with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() - df.iloc[2:4, 0] = Categorical(list('bb'), categories=list('abc')) + df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) with pytest.raises(ValueError): # different values df = orig.copy() - df.iloc[2:4, 0] = Categorical(list('cc'), categories=list('abc')) + df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3575,21 +3704,22 @@ def test_assigning_ops(self): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b"]) + df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"]) + ["b", "b"], categories=["a", "b", "c"] + ) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"]) + ["c", "c"], categories=["a", "b", "c"] + ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3638,21 +3768,22 @@ def test_assigning_ops(self): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b"]) + df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp_parts_cats_col) with pytest.raises(ValueError): # different categories -> not sure if this should fail or pass df = orig.copy() df.loc["j":"k", 
df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"]) + ["b", "b"], categories=["a", "b", "c"] + ) with pytest.raises(ValueError): # different values df = orig.copy() df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"]) + ["c", "c"], categories=["a", "b", "c"] + ) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3685,8 +3816,9 @@ def test_assigning_ops(self): df.at["j", "cats"] = "c" # fancy indexing - catsf = Categorical(["a", "a", "c", "c", "a", "a", "a"], - categories=["a", "b", "c"]) + catsf = Categorical( + ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] + ) idxf = Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3716,8 +3848,9 @@ def test_assigning_ops(self): tm.assert_frame_equal(df, exp) def test_functions_no_warnings(self): - df = DataFrame({'value': np.random.randint(0, 100, 20)}) + df = DataFrame({"value": np.random.randint(0, 100, 20)}) labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] with tm.assert_produces_warning(False): - df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, - labels=labels) + df["group"] = pd.cut( + df.value, range(0, 105, 10), right=False, labels=labels + ) diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index 3adc62609cc6a..adace5e4784ae 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -9,46 +9,64 @@ def frame_with_period_index(): return DataFrame( data=np.arange(20).reshape(4, 5), - columns=list('abcde'), - index=period_range(start='2000', freq='A', periods=4)) + columns=list("abcde"), + index=period_range(start="2000", freq="A", periods=4), + ) @pytest.fixture def left(): - return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture def right(): - return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) @pytest.mark.parametrize( "how, sort, expected", - [('inner', False, DataFrame({'a': [20, 10], - 'b': [200, 100]}, - index=[2, 1])), - ('inner', True, DataFrame({'a': [10, 20], - 'b': [100, 200]}, - index=[1, 2])), - ('left', False, DataFrame({'a': [20, 10, 0], - 'b': [200, 100, np.nan]}, - index=[2, 1, 0])), - ('left', True, DataFrame({'a': [0, 10, 20], - 'b': [np.nan, 100, 200]}, - index=[0, 1, 2])), - ('right', False, DataFrame({'a': [np.nan, 10, 20], - 'b': [300, 100, 200]}, - index=[3, 1, 2])), - ('right', True, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, - index=[1, 2, 3])), - ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3])), - ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]))]) + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]), + ), + ( + "right", + True, + DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, 
np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], +) def test_join(left, right, how, sort, expected): result = left.join(right, how=how, sort=sort) @@ -58,59 +76,59 @@ def test_join(left, right, how, sort, expected): def test_join_index(float_frame): # left / right - f = float_frame.loc[float_frame.index[:10], ['A', 'B']] - f2 = float_frame.loc[float_frame.index[5:], ['C', 'D']].iloc[::-1] + f = float_frame.loc[float_frame.index[:10], ["A", "B"]] + f2 = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1] joined = f.join(f2) tm.assert_index_equal(f.index, joined.index) - expected_columns = Index(['A', 'B', 'C', 'D']) + expected_columns = Index(["A", "B", "C", "D"]) tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='left') + joined = f.join(f2, how="left") tm.assert_index_equal(joined.index, f.index) tm.assert_index_equal(joined.columns, expected_columns) - joined = f.join(f2, how='right') + joined = f.join(f2, how="right") tm.assert_index_equal(joined.index, f2.index) tm.assert_index_equal(joined.columns, expected_columns) # inner - joined = f.join(f2, how='inner') + joined = f.join(f2, how="inner") tm.assert_index_equal(joined.index, f.index[5:10]) tm.assert_index_equal(joined.columns, expected_columns) # outer - joined = f.join(f2, how='outer') + joined = f.join(f2, how="outer") tm.assert_index_equal(joined.index, float_frame.index.sort_values()) tm.assert_index_equal(joined.columns, expected_columns) - with pytest.raises(ValueError, match='join method'): - f.join(f2, how='foo') + with pytest.raises(ValueError, match="join method"): + f.join(f2, how="foo") # corner case - overlapping columns - msg = 'columns overlap but no suffix' - for how in ('outer', 'left', 'inner'): + msg = "columns overlap but no suffix" + for how in ("outer", "left", "inner"): with pytest.raises(ValueError, match=msg): float_frame.join(float_frame, how=how) def test_join_index_more(float_frame): - af = float_frame.loc[:, ['A', 'B']] - bf = float_frame.loc[::2, ['C', 'D']] + af = float_frame.loc[:, ["A", "B"]] + bf = float_frame.loc[::2, ["C", "D"]] expected = af.copy() - expected['C'] = float_frame['C'][::2] - expected['D'] = float_frame['D'][::2] + expected["C"] = float_frame["C"][::2] + expected["D"] = float_frame["D"][::2] result = af.join(bf) tm.assert_frame_equal(result, expected) - result = af.join(bf, how='right') + result = af.join(bf, how="right") tm.assert_frame_equal(result, expected[::2]) - result = bf.join(af, how='right') + result = bf.join(af, how="right") tm.assert_frame_equal(result, expected.loc[:, result.columns]) @@ -123,19 +141,19 @@ def test_join_index_series(float_frame): tm.assert_frame_equal(joined, float_frame, check_names=False) s.name = None - with pytest.raises(ValueError, match='must have a name'): + with pytest.raises(ValueError, match="must have a name"): df.join(s) def test_join_overlap(float_frame): - df1 = float_frame.loc[:, ['A', 'B', 'C']] - df2 = float_frame.loc[:, ['B', 'C', 'D']] + df1 = float_frame.loc[:, ["A", "B", "C"]] + df2 = float_frame.loc[:, ["B", "C", "D"]] - joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') - df1_suf = df1.loc[:, ['B', 'C']].add_suffix('_df1') - df2_suf = df2.loc[:, ['B', 'C']].add_suffix('_df2') + joined = df1.join(df2, lsuffix="_df1", rsuffix="_df2") + df1_suf = df1.loc[:, ["B", "C"]].add_suffix("_df1") + df2_suf = df2.loc[:, ["B", 
"C"]].add_suffix("_df2") - no_overlap = float_frame.loc[:, ['A', 'D']] + no_overlap = float_frame.loc[:, ["A", "D"]] expected = df1_suf.join(df2_suf).join(no_overlap) # column order not necessarily sorted @@ -143,35 +161,35 @@ def test_join_overlap(float_frame): def test_join_period_index(frame_with_period_index): - other = frame_with_period_index.rename( - columns=lambda x: '{key}{key}'.format(key=x)) + other = frame_with_period_index.rename(columns=lambda x: "{key}{key}".format(key=x)) - joined_values = np.concatenate( - [frame_with_period_index.values] * 2, axis=1) + joined_values = np.concatenate([frame_with_period_index.values] * 2, axis=1) joined_cols = frame_with_period_index.columns.append(other.columns) joined = frame_with_period_index.join(other) expected = DataFrame( - data=joined_values, - columns=joined_cols, - index=frame_with_period_index.index) + data=joined_values, columns=joined_cols, index=frame_with_period_index.index + ) tm.assert_frame_equal(joined, expected) def test_join_left_sequence_non_unique_index(): # https://github.com/pandas-dev/pandas/issues/19607 - df1 = DataFrame({'a': [0, 10, 20]}, index=[1, 2, 3]) - df2 = DataFrame({'b': [100, 200, 300]}, index=[4, 3, 2]) - df3 = DataFrame({'c': [400, 500, 600]}, index=[2, 2, 4]) + df1 = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3]) + df2 = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2]) + df3 = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4]) - joined = df1.join([df2, df3], how='left') + joined = df1.join([df2, df3], how="left") - expected = DataFrame({ - 'a': [0, 10, 10, 20], - 'b': [np.nan, 300, 300, 200], - 'c': [np.nan, 400, 500, np.nan] - }, index=[1, 2, 2, 3]) + expected = DataFrame( + { + "a": [0, 10, 10, 20], + "b": [np.nan, 300, 300, 200], + "c": [np.nan, 400, 500, np.nan], + }, + index=[1, 2, 2, 3], + ) tm.assert_frame_equal(joined, expected) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index e40ae6dd5494d..c63a5ba64495f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -18,55 +18,55 @@ def _skip_if_no_pchip(): from scipy.interpolate import pchip_interpolate # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.pchip missing') + pytest.skip("scipy.interpolate.pchip missing") -class TestDataFrameMissingData: +class TestDataFrameMissingData: def test_dropEmptyRows(self, float_frame): N = len(float_frame.index) mat = np.random.randn(N) mat[:5] = np.nan - frame = DataFrame({'foo': mat}, index=float_frame.index) - original = Series(mat, index=float_frame.index, name='foo') + frame = DataFrame({"foo": mat}, index=float_frame.index) + original = Series(mat, index=float_frame.index, name="foo") expected = original.dropna() inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() - smaller_frame = frame.dropna(how='all') + smaller_frame = frame.dropna(how="all") # check that original was preserved - assert_series_equal(frame['foo'], original) - inplace_frame1.dropna(how='all', inplace=True) - assert_series_equal(smaller_frame['foo'], expected) - assert_series_equal(inplace_frame1['foo'], expected) + assert_series_equal(frame["foo"], original) + inplace_frame1.dropna(how="all", inplace=True) + assert_series_equal(smaller_frame["foo"], expected) + assert_series_equal(inplace_frame1["foo"], expected) - smaller_frame = frame.dropna(how='all', subset=['foo']) - inplace_frame2.dropna(how='all', subset=['foo'], inplace=True) - assert_series_equal(smaller_frame['foo'], expected) - 
assert_series_equal(inplace_frame2['foo'], expected) + smaller_frame = frame.dropna(how="all", subset=["foo"]) + inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + assert_series_equal(smaller_frame["foo"], expected) + assert_series_equal(inplace_frame2["foo"], expected) def test_dropIncompleteRows(self, float_frame): N = len(float_frame.index) mat = np.random.randn(N) mat[:5] = np.nan - frame = DataFrame({'foo': mat}, index=float_frame.index) - frame['bar'] = 5 - original = Series(mat, index=float_frame.index, name='foo') + frame = DataFrame({"foo": mat}, index=float_frame.index) + frame["bar"] = 5 + original = Series(mat, index=float_frame.index, name="foo") inp_frame1, inp_frame2 = frame.copy(), frame.copy() smaller_frame = frame.dropna() - assert_series_equal(frame['foo'], original) + assert_series_equal(frame["foo"], original) inp_frame1.dropna(inplace=True) - exp = Series(mat[5:], index=float_frame.index[5:], name='foo') - tm.assert_series_equal(smaller_frame['foo'], exp) - tm.assert_series_equal(inp_frame1['foo'], exp) + exp = Series(mat[5:], index=float_frame.index[5:], name="foo") + tm.assert_series_equal(smaller_frame["foo"], exp) + tm.assert_series_equal(inp_frame1["foo"], exp) - samesize_frame = frame.dropna(subset=['bar']) - assert_series_equal(frame['foo'], original) - assert (frame['bar'] == 5).all() - inp_frame2.dropna(subset=['bar'], inplace=True) + samesize_frame = frame.dropna(subset=["bar"]) + assert_series_equal(frame["foo"], original) + assert (frame["bar"] == 5).all() + inp_frame2.dropna(subset=["bar"], inplace=True) tm.assert_index_equal(samesize_frame.index, float_frame.index) tm.assert_index_equal(inp_frame2.index, float_frame.index) @@ -117,60 +117,63 @@ def test_dropna(self): assert_frame_equal(inp, df) # all - dropped = df.dropna(axis=1, how='all') + dropped = df.dropna(axis=1, how="all") assert_frame_equal(dropped, df) df[2] = np.nan - dropped = df.dropna(axis=1, how='all') + dropped = df.dropna(axis=1, how="all") expected = df.loc[:, [0, 1, 3]] assert_frame_equal(dropped, expected) # bad input - msg = ("No axis named 3 for object type" - " ") + msg = "No axis named 3 for object type" " " with pytest.raises(ValueError, match=msg): df.dropna(axis=3) def test_drop_and_dropna_caching(self): # tst that cacher updates - original = Series([1, 2, np.nan], name='A') - expected = Series([1, 2], dtype=original.dtype, name='A') - df = pd.DataFrame({'A': original.values.copy()}) + original = Series([1, 2, np.nan], name="A") + expected = Series([1, 2], dtype=original.dtype, name="A") + df = pd.DataFrame({"A": original.values.copy()}) df2 = df.copy() - df['A'].dropna() - assert_series_equal(df['A'], original) - df['A'].dropna(inplace=True) - assert_series_equal(df['A'], expected) - df2['A'].drop([1]) - assert_series_equal(df2['A'], original) - df2['A'].drop([1], inplace=True) - assert_series_equal(df2['A'], original.drop([1])) + df["A"].dropna() + assert_series_equal(df["A"], original) + df["A"].dropna(inplace=True) + assert_series_equal(df["A"], expected) + df2["A"].drop([1]) + assert_series_equal(df2["A"], original) + df2["A"].drop([1], inplace=True) + assert_series_equal(df2["A"], original.drop([1])) def test_dropna_corner(self, float_frame): # bad input msg = "invalid how option: foo" with pytest.raises(ValueError, match=msg): - float_frame.dropna(how='foo') + float_frame.dropna(how="foo") msg = "must specify how or thresh" with pytest.raises(TypeError, match=msg): float_frame.dropna(how=None) # non-existent column - 8303 with pytest.raises(KeyError, 
match=r"^\['X'\]$"): - float_frame.dropna(subset=['A', 'X']) + float_frame.dropna(subset=["A", "X"]) def test_dropna_multiple_axes(self): - df = DataFrame([[1, np.nan, 2, 3], - [4, np.nan, 5, 6], - [np.nan, np.nan, np.nan, np.nan], - [7, np.nan, 8, 9]]) + df = DataFrame( + [ + [1, np.nan, 2, 3], + [4, np.nan, 5, 6], + [np.nan, np.nan, np.nan, np.nan], + [7, np.nan, 8, 9], + ] + ) cp = df.copy() # GH20987 with tm.assert_produces_warning(FutureWarning): - result = df.dropna(how='all', axis=[0, 1]) + result = df.dropna(how="all", axis=[0, 1]) with tm.assert_produces_warning(FutureWarning): - result2 = df.dropna(how='all', axis=(0, 1)) - expected = df.dropna(how='all').dropna(how='all', axis=1) + result2 = df.dropna(how="all", axis=(0, 1)) + expected = df.dropna(how="all").dropna(how="all", axis=1) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -178,34 +181,30 @@ def test_dropna_multiple_axes(self): inp = df.copy() with tm.assert_produces_warning(FutureWarning): - inp.dropna(how='all', axis=(0, 1), inplace=True) + inp.dropna(how="all", axis=(0, 1), inplace=True) assert_frame_equal(inp, expected) def test_dropna_tz_aware_datetime(self): # GH13407 df = DataFrame() - dt1 = datetime.datetime(2015, 1, 1, - tzinfo=dateutil.tz.tzutc()) - dt2 = datetime.datetime(2015, 2, 2, - tzinfo=dateutil.tz.tzutc()) - df['Time'] = [dt1] + dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) + dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) + df["Time"] = [dt1] result = df.dropna(axis=0) - expected = DataFrame({'Time': [dt1]}) + expected = DataFrame({"Time": [dt1]}) assert_frame_equal(result, expected) # Ex2 - df = DataFrame({'Time': [dt1, None, np.nan, dt2]}) + df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) result = df.dropna(axis=0) - expected = DataFrame([dt1, dt2], - columns=['Time'], - index=[0, 3]) + expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) assert_frame_equal(result, expected) def test_dropna_categorical_interval_index(self): # GH 25087 ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) ci = pd.CategoricalIndex(ii) - df = pd.DataFrame({'A': list('abc')}, index=ci) + df = pd.DataFrame({"A": list("abc")}, index=ci) expected = df result = df.dropna() @@ -213,63 +212,65 @@ def test_dropna_categorical_interval_index(self): def test_fillna_datetime(self, datetime_frame): tf = datetime_frame - tf.loc[tf.index[:5], 'A'] = np.nan - tf.loc[tf.index[-5:], 'A'] = np.nan + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan zero_filled = datetime_frame.fillna(0) - assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all() + assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() - padded = datetime_frame.fillna(method='pad') - assert np.isnan(padded.loc[padded.index[:5], 'A']).all() - assert (padded.loc[padded.index[-5:], 'A'] == - padded.loc[padded.index[-5], 'A']).all() + padded = datetime_frame.fillna(method="pad") + assert np.isnan(padded.loc[padded.index[:5], "A"]).all() + assert ( + padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] + ).all() msg = "Must specify a fill 'value' or 'method'" with pytest.raises(ValueError, match=msg): datetime_frame.fillna() msg = "Cannot specify both 'value' and 'method'" with pytest.raises(ValueError, match=msg): - datetime_frame.fillna(5, method='ffill') + datetime_frame.fillna(5, method="ffill") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 
'A'] = np.nan + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan # TODO: make stronger assertion here, GH 25640 mf.fillna(value=0) - mf.fillna(method='pad') + mf.fillna(method="pad") def test_fillna_mixed_float(self, mixed_float_frame): # mixed numeric (but no float16) - mf = mixed_float_frame.reindex(columns=['A', 'B', 'D']) - mf.loc[mf.index[-10:], 'A'] = np.nan + mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan result = mf.fillna(value=0) _check_mixed_float(result, dtype=dict(C=None)) - result = mf.fillna(method='pad') + result = mf.fillna(method="pad") _check_mixed_float(result, dtype=dict(C=None)) def test_fillna_empty(self): # empty frame (GH #2778) - df = DataFrame(columns=['x']) - for m in ['pad', 'backfill']: + df = DataFrame(columns=["x"]) + for m in ["pad", "backfill"]: df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) def test_fillna_different_dtype(self): # with different dtype (GH#3386) - df = DataFrame([['a', 'a', np.nan, 'a'], [ - 'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']]) - - result = df.fillna({2: 'foo'}) - expected = DataFrame([['a', 'a', 'foo', 'a'], - ['b', 'b', 'foo', 'b'], - ['c', 'c', 'foo', 'c']]) + df = DataFrame( + [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] + ) + + result = df.fillna({2: "foo"}) + expected = DataFrame( + [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] + ) assert_frame_equal(result, expected) - df.fillna({2: 'foo'}, inplace=True) + df.fillna({2: "foo"}, inplace=True) assert_frame_equal(df, expected) def test_fillna_limit_and_value(self): @@ -287,42 +288,59 @@ def test_fillna_limit_and_value(self): def test_fillna_datelike(self): # with datelike # GH#6344 - df = DataFrame({ - 'Date': [pd.NaT, Timestamp("2014-1-1")], - 'Date2': [Timestamp("2013-1-1"), pd.NaT] - }) + df = DataFrame( + { + "Date": [pd.NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), pd.NaT], + } + ) expected = df.copy() - expected['Date'] = expected['Date'].fillna( - df.loc[df.index[0], 'Date2']) - result = df.fillna(value={'Date': df['Date2']}) + expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) + result = df.fillna(value={"Date": df["Date2"]}) assert_frame_equal(result, expected) def test_fillna_tzaware(self): # with timezone # GH#15855 - df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.NaT]}) - exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - assert_frame_equal(df.fillna(method='pad'), exp) - - df = pd.DataFrame({'A': [pd.NaT, - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]}) - assert_frame_equal(df.fillna(method='bfill'), exp) + df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + assert_frame_equal(df.fillna(method="pad"), exp) + + df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = pd.DataFrame( + { + "A": [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + assert_frame_equal(df.fillna(method="bfill"), exp) def test_fillna_tzaware_different_column(self): # with timezone in another column # GH#15522 - df = pd.DataFrame({'A': 
pd.date_range('20130101', periods=4, - tz='US/Eastern'), - 'B': [1, 2, np.nan, np.nan]}) - result = df.fillna(method='pad') - expected = pd.DataFrame({'A': pd.date_range('20130101', periods=4, - tz='US/Eastern'), - 'B': [1., 2., 2., 2.]}) + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1, 2, np.nan, np.nan], + } + ) + result = df.fillna(method="pad") + expected = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1.0, 2.0, 2.0, 2.0], + } + ) assert_frame_equal(result, expected) def test_na_actions_categorical(self): @@ -344,11 +362,10 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - with pytest.raises(ValueError, match=("fill value must " - "be in categories")): + with pytest.raises(ValueError, match=("fill value must " "be in categories")): df.fillna(value={"cats": 4, "vals": "c"}) - res = df.fillna(method='pad') + res = df.fillna(method="pad") tm.assert_frame_equal(res, df_exp_fill) # dropna @@ -376,8 +393,7 @@ def test_fillna_categorical_nan(self): df = DataFrame({"cats": cat, "vals": val}) res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] - df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, - dtype='category') + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") tm.assert_frame_equal(res, df_exp) result = df.cats.fillna(np.nan) @@ -385,40 +401,41 @@ def test_fillna_categorical_nan(self): result = df.vals.fillna(np.nan) tm.assert_series_equal(result, df.vals) - idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', - '2011-01-01 09:00', pd.NaT, pd.NaT]) - df = DataFrame({'a': Categorical(idx)}) + idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT] + ) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', - pd.NaT, pd.NaT], freq='M') - df = DataFrame({'a': Categorical(idx)}) + idx = pd.PeriodIndex( + ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M" + ) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - idx = pd.TimedeltaIndex(['1 days', '2 days', - '1 days', pd.NaT, pd.NaT]) - df = DataFrame({'a': Categorical(idx)}) + idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT]) + df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 - df = pd.DataFrame({'a': [1., np.nan]}) - result = df.fillna(0, downcast='infer') - expected = pd.DataFrame({'a': [1, 0]}) + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna(0, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) assert_frame_equal(result, expected) # infer int64 from float64 when fillna value is a dict - df = pd.DataFrame({'a': [1., np.nan]}) - result = df.fillna({'a': 0}, downcast='infer') - expected = pd.DataFrame({'a': [1, 0]}) + df = pd.DataFrame({"a": [1.0, np.nan]}) + result = df.fillna({"a": 0}, downcast="infer") + expected = pd.DataFrame({"a": [1, 0]}) assert_frame_equal(result, expected) def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes - expected = Series([np.dtype('object')] * 5, index=[1, 2, 3, 4, 5]) + expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) 
assert_series_equal(result, expected) result = df.fillna(1) @@ -426,75 +443,91 @@ def test_fillna_dtype_conversion(self): assert_frame_equal(result, expected) # empty block - df = DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - result = df.fillna('nan') - expected = DataFrame('nan', index=range(3), columns=['A', 'B']) + df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"]) assert_frame_equal(result, expected) # equiv of replace - df = DataFrame(dict(A=[1, np.nan], B=[1., 2.])) - for v in ['', 1, np.nan, 1.0]: + df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) + for v in ["", 1, np.nan, 1.0]: expected = df.replace(np.nan, v) result = df.fillna(v) assert_frame_equal(result, expected) def test_fillna_datetime_columns(self): # GH 7095 - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - result = df.fillna('?') - expected = pd.DataFrame({'A': [-1, -2, '?'], - 'B': date_range('20130101', periods=3), - 'C': ['foo', 'bar', '?'], - 'D': ['foo2', 'bar2', '?']}, - index=date_range('20130110', periods=3)) + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) tm.assert_frame_equal(result, expected) - df = pd.DataFrame({'A': [-1, -2, np.nan], - 'B': [pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-02'), pd.NaT], - 'C': ['foo', 'bar', None], - 'D': ['foo2', 'bar2', None]}, - index=date_range('20130110', periods=3)) - result = df.fillna('?') - expected = pd.DataFrame({'A': [-1, -2, '?'], - 'B': [pd.Timestamp('2013-01-01'), - pd.Timestamp('2013-01-02'), '?'], - 'C': ['foo', 'bar', '?'], - 'D': ['foo2', 'bar2', '?']}, - index=pd.date_range('20130110', periods=3)) + df = pd.DataFrame( + { + "A": [-1, -2, np.nan], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = pd.DataFrame( + { + "A": [-1, -2, "?"], + "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"], + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=pd.date_range("20130110", periods=3), + ) tm.assert_frame_equal(result, expected) def test_ffill(self, datetime_frame): - datetime_frame['A'][:5] = np.nan - datetime_frame['A'][-5:] = np.nan + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan - assert_frame_equal(datetime_frame.ffill(), - datetime_frame.fillna(method='ffill')) + assert_frame_equal( + datetime_frame.ffill(), datetime_frame.fillna(method="ffill") + ) def test_bfill(self, datetime_frame): - datetime_frame['A'][:5] = np.nan - datetime_frame['A'][-5:] = np.nan + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan - assert_frame_equal(datetime_frame.bfill(), - datetime_frame.fillna(method='bfill')) + assert_frame_equal( + datetime_frame.bfill(), datetime_frame.fillna(method="bfill") + ) def test_frame_pad_backfill_limit(self): index = np.arange(10) df = 
DataFrame(np.random.randn(10, 4), index=index) - result = df[:2].reindex(index, method='pad', limit=5) + result = df[:2].reindex(index, method="pad", limit=5) - expected = df[:2].reindex(index).fillna(method='pad') + expected = df[:2].reindex(index).fillna(method="pad") expected.values[-3:] = np.nan tm.assert_frame_equal(result, expected) - result = df[-2:].reindex(index, method='backfill', limit=5) + result = df[-2:].reindex(index, method="backfill", limit=5) - expected = df[-2:].reindex(index).fillna(method='backfill') + expected = df[-2:].reindex(index).fillna(method="backfill") expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -503,16 +536,16 @@ def test_frame_fillna_limit(self): df = DataFrame(np.random.randn(10, 4), index=index) result = df[:2].reindex(index) - result = result.fillna(method='pad', limit=5) + result = result.fillna(method="pad", limit=5) - expected = df[:2].reindex(index).fillna(method='pad') + expected = df[:2].reindex(index).fillna(method="pad") expected.values[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index) - result = result.fillna(method='backfill', limit=5) + result = result.fillna(method="backfill", limit=5) - expected = df[-2:].reindex(index).fillna(method='backfill') + expected = df[-2:].reindex(index).fillna(method="backfill") expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) @@ -556,26 +589,30 @@ def test_fillna_inplace(self): df[1][:4] = np.nan df[3][-4:] = np.nan - expected = df.fillna(method='ffill') + expected = df.fillna(method="ffill") assert expected is not df - df.fillna(method='ffill', inplace=True) + df.fillna(method="ffill", inplace=True) tm.assert_frame_equal(df, expected) def test_fillna_dict_series(self): - df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}) + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) - result = df.fillna({'a': 0, 'b': 5}) + result = df.fillna({"a": 0, "b": 5}) expected = df.copy() - expected['a'] = expected['a'].fillna(0) - expected['b'] = expected['b'].fillna(5) + expected["a"] = expected["a"].fillna(0) + expected["b"] = expected["b"].fillna(5) assert_frame_equal(result, expected) # it works - result = df.fillna({'a': 0, 'b': 5, 'd': 7}) + result = df.fillna({"a": 0, "b": 5, "d": 7}) # Series treated same as dict result = df.fillna(df.max()) @@ -583,29 +620,41 @@ def test_fillna_dict_series(self): assert_frame_equal(result, expected) # disable this for now - with pytest.raises(NotImplementedError, match='column by column'): + with pytest.raises(NotImplementedError, match="column by column"): df.fillna(df.max(1), axis=1) def test_fillna_dataframe(self): # GH 8377 - df = DataFrame({'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4]}, - index=list('VWXYZ')) + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) # df2 may have different index and columns - df2 = DataFrame({'a': [np.nan, 10, 20, 30, 40], - 'b': [50, 60, 70, 80, 90], - 'foo': ['bar'] * 5}, - index=list('VWXuZ')) + df2 = DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) result = df.fillna(df2) # only those columns and indices which are shared get filled - expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40], - 
'b': [1, 2, 3, np.nan, 90], - 'c': [np.nan, 1, 2, 3, 4]}, - index=list('VWXYZ')) + expected = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, 40], + "b": [1, 2, 3, np.nan, 90], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) assert_frame_equal(result, expected) @@ -613,31 +662,32 @@ def test_fillna_columns(self): df = DataFrame(np.random.randn(10, 10)) df.values[:, ::2] = np.nan - result = df.fillna(method='ffill', axis=1) - expected = df.T.fillna(method='pad').T + result = df.fillna(method="ffill", axis=1) + expected = df.T.fillna(method="pad").T assert_frame_equal(result, expected) - df.insert(6, 'foo', 5) - result = df.fillna(method='ffill', axis=1) - expected = df.astype(float).fillna(method='ffill', axis=1) + df.insert(6, "foo", 5) + result = df.fillna(method="ffill", axis=1) + expected = df.astype(float).fillna(method="ffill", axis=1) assert_frame_equal(result, expected) def test_fillna_invalid_method(self, float_frame): - with pytest.raises(ValueError, match='ffil'): - float_frame.fillna(method='ffil') + with pytest.raises(ValueError, match="ffil"): + float_frame.fillna(method="ffil") def test_fillna_invalid_value(self, float_frame): # list - msg = ("\"value\" parameter must be a scalar or dict, but you passed" - " a \"{}\"") - with pytest.raises(TypeError, match=msg.format('list')): + msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"' + with pytest.raises(TypeError, match=msg.format("list")): float_frame.fillna([1, 2]) # tuple - with pytest.raises(TypeError, match=msg.format('tuple')): + with pytest.raises(TypeError, match=msg.format("tuple")): float_frame.fillna((1, 2)) # frame with series - msg = ("\"value\" parameter must be a scalar, dict or Series, but you" - " passed a \"DataFrame\"") + msg = ( + '"value" parameter must be a scalar, dict or Series, but you' + ' passed a "DataFrame"' + ) with pytest.raises(TypeError, match=msg): float_frame.iloc[:, 0].fillna(float_frame) @@ -645,17 +695,17 @@ def test_fillna_col_reordering(self): cols = ["COL." 
+ str(i) for i in range(5, 0, -1)] data = np.random.rand(20, 5) df = DataFrame(index=range(20), columns=cols, data=data) - filled = df.fillna(method='ffill') + filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 'A'] = np.nan + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], 'foo'] == 0).all() - del float_string_frame['foo'] + assert (filled.loc[filled.index[5:20], "foo"] == 0).all() + del float_string_frame["foo"] empty_float = float_frame.reindex(columns=[]) @@ -664,8 +714,8 @@ def test_fill_corner(self, float_frame, float_string_frame): def test_fill_value_when_combine_const(self): # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') - df = DataFrame({'foo': dat}, index=range(6)) + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = DataFrame({"foo": dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) @@ -673,123 +723,144 @@ def test_fill_value_when_combine_const(self): class TestDataFrameInterpolate: - def test_interp_basic(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) - expected = DataFrame({'A': [1., 2., 3., 4.], - 'B': [1., 4., 9., 9.], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + expected = DataFrame( + { + "A": [1.0, 2.0, 3.0, 4.0], + "B": [1.0, 4.0, 9.0, 9.0], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) result = df.interpolate() assert_frame_equal(result, expected) - result = df.set_index('C').interpolate() - expected = df.set_index('C') - expected.loc[3, 'A'] = 3 - expected.loc[5, 'B'] = 9 + result = df.set_index("C").interpolate() + expected = df.set_index("C") + expected.loc[3, "A"] = 3 + expected.loc[5, "B"] = 9 assert_frame_equal(result, expected) def test_interp_bad_method(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) with pytest.raises(ValueError): - df.interpolate(method='not_a_method') + df.interpolate(method="not_a_method") def test_interp_combo(self): - df = DataFrame({'A': [1., 2., np.nan, 4.], - 'B': [1, 4, 9, np.nan], - 'C': [1, 2, 3, 5], - 'D': list('abcd')}) - - result = df['A'].interpolate() - expected = Series([1., 2., 3., 4.], name='A') + df = DataFrame( + { + "A": [1.0, 2.0, np.nan, 4.0], + "B": [1, 4, 9, np.nan], + "C": [1, 2, 3, 5], + "D": list("abcd"), + } + ) + + result = df["A"].interpolate() + expected = Series([1.0, 2.0, 3.0, 4.0], name="A") assert_series_equal(result, expected) - result = df['A'].interpolate(downcast='infer') - expected = Series([1, 2, 3, 4], name='A') + result = df["A"].interpolate(downcast="infer") + expected = Series([1, 2, 3, 4], name="A") assert_series_equal(result, expected) def test_interp_nan_idx(self): - df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) - df = df.set_index('A') + df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) + df = df.set_index("A") with pytest.raises(NotImplementedError): - df.interpolate(method='values') + df.interpolate(method="values") 
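(Illustrative aside, not part of the patch itself: the pair below shows, outside the diff syntax, the kind of rewrite black applies throughout these hunks. Both fragments are lifted from hunks elsewhere in this section -- the first from the fillna-with-DataFrame expectation in test_missing.py, the second from the uint64 set/reset test earlier on. Black normalizes single quotes to double quotes, explodes call arguments one per line with a trailing comma once a call no longer fits on one line at its default 88-character limit (assumed to be the limit used for this commit), and, in the black release used here, puts spaces around the ** operator. It changes only layout and quoting, which is why the test expectations themselves are untouched.)

    # before black
    expected = DataFrame({'a': [np.nan, 1, 2, np.nan, 40],
                          'b': [1, 2, 3, np.nan, 90],
                          'c': [np.nan, 1, 2, 3, 4]},
                         index=list('VWXYZ'))
    idx = Index([2**63, 2**63 + 5, 2**63 + 10], name='foo')

    # after black
    expected = DataFrame(
        {
            "a": [np.nan, 1, 2, np.nan, 40],
            "b": [1, 2, 3, np.nan, 90],
            "c": [np.nan, 1, 2, 3, 4],
        },
        index=list("VWXYZ"),
    )
    idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo")
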
@td.skip_if_no_scipy def test_interp_various(self): - df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], - 'C': [1, 2, 3, 5, 8, 13, 21]}) - df = df.set_index('C') + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + df = df.set_index("C") expected = df.copy() - result = df.interpolate(method='polynomial', order=1) + result = df.interpolate(method="polynomial", order=1) expected.A.loc[3] = 2.66666667 expected.A.loc[13] = 5.76923076 assert_frame_equal(result, expected) - result = df.interpolate(method='cubic') + result = df.interpolate(method="cubic") # GH #15662. expected.A.loc[3] = 2.81547781 expected.A.loc[13] = 5.52964175 assert_frame_equal(result, expected) - result = df.interpolate(method='nearest') + result = df.interpolate(method="nearest") expected.A.loc[3] = 2 expected.A.loc[13] = 5 assert_frame_equal(result, expected, check_dtype=False) - result = df.interpolate(method='quadratic') + result = df.interpolate(method="quadratic") expected.A.loc[3] = 2.82150771 expected.A.loc[13] = 6.12648668 assert_frame_equal(result, expected) - result = df.interpolate(method='slinear') + result = df.interpolate(method="slinear") expected.A.loc[3] = 2.66666667 expected.A.loc[13] = 5.76923077 assert_frame_equal(result, expected) - result = df.interpolate(method='zero') - expected.A.loc[3] = 2. + result = df.interpolate(method="zero") + expected.A.loc[3] = 2.0 expected.A.loc[13] = 5 assert_frame_equal(result, expected, check_dtype=False) @td.skip_if_no_scipy def test_interp_alt_scipy(self): - df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7], - 'C': [1, 2, 3, 5, 8, 13, 21]}) - result = df.interpolate(method='barycentric') + df = DataFrame( + {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]} + ) + result = df.interpolate(method="barycentric") expected = df.copy() - expected.loc[2, 'A'] = 3 - expected.loc[5, 'A'] = 6 + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6 assert_frame_equal(result, expected) - result = df.interpolate(method='barycentric', downcast='infer') + result = df.interpolate(method="barycentric", downcast="infer") assert_frame_equal(result, expected.astype(np.int64)) - result = df.interpolate(method='krogh') + result = df.interpolate(method="krogh") expectedk = df.copy() - expectedk['A'] = expected['A'] + expectedk["A"] = expected["A"] assert_frame_equal(result, expectedk) _skip_if_no_pchip() - result = df.interpolate(method='pchip') - expected.loc[2, 'A'] = 3 - expected.loc[5, 'A'] = 6.0 + result = df.interpolate(method="pchip") + expected.loc[2, "A"] = 3 + expected.loc[5, "A"] = 6.0 assert_frame_equal(result, expected) def test_interp_rowwise(self): - df = DataFrame({0: [1, 2, np.nan, 4], - 1: [2, 3, 4, np.nan], - 2: [np.nan, 4, 5, 6], - 3: [4, np.nan, 6, 7], - 4: [1, 2, 3, 4]}) + df = DataFrame( + { + 0: [1, 2, np.nan, 4], + 1: [2, 3, 4, np.nan], + 2: [np.nan, 4, 5, 6], + 3: [4, np.nan, 6, 7], + 4: [1, 2, 3, 4], + } + ) result = df.interpolate(axis=1) expected = df.copy() expected.loc[3, 1] = 5 @@ -798,7 +869,7 @@ def test_interp_rowwise(self): expected[4] = expected[4].astype(np.float64) assert_frame_equal(result, expected) - result = df.interpolate(axis=1, method='values') + result = df.interpolate(axis=1, method="values") assert_frame_equal(result, expected) result = df.interpolate(axis=0) @@ -806,84 +877,96 @@ def test_interp_rowwise(self): assert_frame_equal(result, expected) def test_rowwise_alt(self): - df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64], - 1: [1, 2, 3, 4, 3, 2, 1, 0, -1]}) + 
df = DataFrame( + { + 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64], + 1: [1, 2, 3, 4, 3, 2, 1, 0, -1], + } + ) df.interpolate(axis=0) - @pytest.mark.parametrize("check_scipy", [ - False, pytest.param(True, marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) def test_interp_leading_nans(self, check_scipy): - df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0], - "B": [np.nan, -3, -3.5, np.nan, -4]}) + df = DataFrame( + {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]} + ) result = df.interpolate() expected = df.copy() - expected['B'].loc[3] = -3.75 + expected["B"].loc[3] = -3.75 assert_frame_equal(result, expected) if check_scipy: - result = df.interpolate(method='polynomial', order=1) + result = df.interpolate(method="polynomial", order=1) assert_frame_equal(result, expected) def test_interp_raise_on_only_mixed(self): - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': [np.nan, 2, 5, 7], - 'D': [np.nan, np.nan, 9, 9], - 'E': [1, 2, 3, 4]}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": ["a", "b", "c", "d"], + "C": [np.nan, 2, 5, 7], + "D": [np.nan, np.nan, 9, 9], + "E": [1, 2, 3, 4], + } + ) with pytest.raises(TypeError): df.interpolate(axis=1) def test_interp_raise_on_all_object_dtype(self): # GH 22985 - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [4, 5, 6]}, - dtype='object') - msg = ("Cannot interpolate with all object-dtype columns " - "in the DataFrame. Try setting at least one " - "column to a numeric dtype.") + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object") + msg = ( + "Cannot interpolate with all object-dtype columns " + "in the DataFrame. Try setting at least one " + "column to a numeric dtype." 
+ ) with pytest.raises(TypeError, match=msg): df.interpolate() def test_interp_inplace(self): - df = DataFrame({'a': [1., 2., np.nan, 4.]}) - expected = DataFrame({'a': [1., 2., 3., 4.]}) + df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) + expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) result = df.copy() - result['a'].interpolate(inplace=True) + result["a"].interpolate(inplace=True) assert_frame_equal(result, expected) result = df.copy() - result['a'].interpolate(inplace=True, downcast='infer') - assert_frame_equal(result, expected.astype('int64')) + result["a"].interpolate(inplace=True, downcast="infer") + assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): # GH 10395 - result = DataFrame({'a': [1., 2., 3., 4.], - 'b': [np.nan, 2., 3., 4.], - 'c': [3, 2, 2, 2]}) - expected = result.interpolate(method='linear', axis=1, inplace=False) - result.interpolate(method='linear', axis=1, inplace=True) + result = DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} + ) + expected = result.interpolate(method="linear", axis=1, inplace=False) + result.interpolate(method="linear", axis=1, inplace=True) assert_frame_equal(result, expected) def test_interp_ignore_all_good(self): # GH - df = DataFrame({'A': [1, 2, np.nan, 4], - 'B': [1, 2, 3, 4], - 'C': [1., 2., np.nan, 4.], - 'D': [1., 2., 3., 4.]}) - expected = DataFrame({'A': np.array( - [1, 2, 3, 4], dtype='float64'), - 'B': np.array( - [1, 2, 3, 4], dtype='int64'), - 'C': np.array( - [1., 2., 3, 4.], dtype='float64'), - 'D': np.array( - [1., 2., 3., 4.], dtype='float64')}) + df = DataFrame( + { + "A": [1, 2, np.nan, 4], + "B": [1, 2, 3, 4], + "C": [1.0, 2.0, np.nan, 4.0], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) + expected = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="float64"), + "B": np.array([1, 2, 3, 4], dtype="int64"), + "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"), + "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"), + } + ) result = df.interpolate(downcast=None) assert_frame_equal(result, expected) # all good - result = df[['B', 'D']].interpolate(downcast=None) - assert_frame_equal(result, df[['B', 'D']]) + result = df[["B", "D"]].interpolate(downcast=None) + assert_frame_equal(result, df[["B", "D"]]) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index ffc2a515bc4b7..ed9eeb594f7f6 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -11,13 +11,12 @@ class TestDataFrameMutateColumns: - def test_assign(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) original = df.copy() result = df.assign(C=df.B / df.A) expected = df.copy() - expected['C'] = [4, 2.5, 2] + expected["C"] = [4, 2.5, 2] assert_frame_equal(result, expected) # lambda syntax @@ -34,13 +33,13 @@ def test_assign(self): assert_frame_equal(df, original) result = df.assign(B=df.B / df.A) - expected = expected.drop('B', axis=1).rename(columns={'C': 'B'}) + expected = expected.drop("B", axis=1).rename(columns={"C": "B"}) assert_frame_equal(result, expected) # overwrite result = df.assign(A=df.A + df.B) expected = df.copy() - expected['A'] = [5, 7, 9] + expected["A"] = [5, 7, 9] assert_frame_equal(result, expected) # lambda @@ -48,33 +47,31 @@ def test_assign(self): assert_frame_equal(result, expected) def test_assign_multiple(self): - df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) + df = DataFrame([[1, 4], [2, 5], [3, 6]], 
columns=["A", "B"]) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) - expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], - [3, 6, 9, 3, 6]], columns=list('ABCDE')) + expected = DataFrame( + [[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE") + ) assert_frame_equal(result, expected) def test_assign_order(self): # GH 9818 - df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) result = df.assign(D=df.A + df.B, C=df.A - df.B) if PY36: - expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], - columns=list('ABDC')) + expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC")) else: - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], - columns=list('ABCD')) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) assert_frame_equal(result, expected) result = df.assign(C=df.A - df.B, D=df.A + df.B) - expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], - columns=list('ABCD')) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD")) assert_frame_equal(result, expected) def test_assign_bad(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # non-keyword argument with pytest.raises(TypeError): @@ -82,52 +79,55 @@ def test_assign_bad(self): with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) - @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python - 3.6 and above""") + @pytest.mark.skipif( + PY36, + reason="""Issue #14207: valid for python + 3.6 and above""", + ) def test_assign_dependent_old_python(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # Key C does not exist at definition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, - D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) with pytest.raises(KeyError): - df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) - @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for - python 3.5 and below""") + @pytest.mark.skipif( + not PY36, + reason="""Issue #14207: not valid for + python 3.5 and below""", + ) def test_assign_dependent(self): - df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) - result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) - expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], - columns=list('ABCD')) + result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) assert_frame_equal(result, expected) - result = df.assign(C=lambda df: df.A, - D=lambda df: df['A'] + df['C']) - expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], - columns=list('ABCD')) + result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"]) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD")) assert_frame_equal(result, expected) def test_insert_error_msmgs(self): # GH 7432 - df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [ - 1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo') - s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [ - 'g', 'h', 'i', 'j']}).set_index('foo') - msg = 'cannot reindex from a duplicate axis' + df = DataFrame( + {"foo": ["a", "b", "c"], "bar": [1, 2, 3], "baz": ["d", "e", "f"]} + ).set_index("foo") + s = DataFrame( + {"foo": ["a", "b", "c", "a"], "fiz": ["g", "h", "i", "j"]} + 
).set_index("foo") + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - df['newcol'] = s + df["newcol"] = s # GH 4107, more descriptive error message - df = DataFrame(np.random.randint(0, 2, (4, 4)), - columns=['a', 'b', 'c', 'd']) + df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]) - msg = 'incompatible index of inserted column with frame index' + msg = "incompatible index of inserted column with frame index" with pytest.raises(TypeError, match=msg): - df['gr'] = df.groupby(['b', 'c']).count() + df["gr"] = df.groupby(["b", "c"]).count() def test_insert_benchmark(self): # from the vb_suite/frame_methods/frame_insert_columns @@ -137,120 +137,120 @@ def test_insert_benchmark(self): new_col = np.random.randn(N) for i in range(K): df[i] = new_col - expected = DataFrame(np.repeat(new_col, K).reshape(N, K), - index=range(N)) + expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N)) assert_frame_equal(df, expected) def test_insert(self): - df = DataFrame(np.random.randn(5, 3), index=np.arange(5), - columns=['c', 'b', 'a']) + df = DataFrame( + np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"] + ) - df.insert(0, 'foo', df['a']) - tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a'])) - tm.assert_series_equal(df['a'], df['foo'], check_names=False) + df.insert(0, "foo", df["a"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "b", "a"])) + tm.assert_series_equal(df["a"], df["foo"], check_names=False) - df.insert(2, 'bar', df['c']) - tm.assert_index_equal(df.columns, - Index(['foo', 'c', 'bar', 'b', 'a'])) - tm.assert_almost_equal(df['c'], df['bar'], check_names=False) + df.insert(2, "bar", df["c"]) + tm.assert_index_equal(df.columns, Index(["foo", "c", "bar", "b", "a"])) + tm.assert_almost_equal(df["c"], df["bar"], check_names=False) # diff dtype # new item - df['x'] = df['a'].astype('float32') + df["x"] = df["a"].astype("float32") result = df.dtypes - expected = Series([np.dtype('float64')] * 5 + [np.dtype('float32')], - index=['foo', 'c', 'bar', 'b', 'a', 'x']) + expected = Series( + [np.dtype("float64")] * 5 + [np.dtype("float32")], + index=["foo", "c", "bar", "b", "a", "x"], + ) tm.assert_series_equal(result, expected) # replacing current (in different block) - df['a'] = df['a'].astype('float32') + df["a"] = df["a"].astype("float32") result = df.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('float32')] * 2, - index=['foo', 'c', 'bar', 'b', 'a', 'x']) + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2, + index=["foo", "c", "bar", "b", "a", "x"], + ) tm.assert_series_equal(result, expected) - df['y'] = df['a'].astype('int32') + df["y"] = df["a"].astype("int32") result = df.dtypes - expected = Series([np.dtype('float64')] * 4 + - [np.dtype('float32')] * 2 + - [np.dtype('int32')], - index=['foo', 'c', 'bar', 'b', 'a', 'x', 'y']) + expected = Series( + [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")], + index=["foo", "c", "bar", "b", "a", "x", "y"], + ) tm.assert_series_equal(result, expected) - with pytest.raises(ValueError, match='already exists'): - df.insert(1, 'a', df['b']) + with pytest.raises(ValueError, match="already exists"): + df.insert(1, "a", df["b"]) msg = "cannot insert c, already exists" with pytest.raises(ValueError, match=msg): - df.insert(1, 'c', df['b']) + df.insert(1, "c", df["b"]) - df.columns.name = 'some_name' + df.columns.name = "some_name" # preserve columns name field - df.insert(0, 
'baz', df['c']) - assert df.columns.name == 'some_name' + df.insert(0, "baz", df["c"]) + assert df.columns.name == "some_name" # GH 13522 - df = DataFrame(index=['A', 'B', 'C']) - df['X'] = df.index - df['X'] = ['x', 'y', 'z'] - exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) + df = DataFrame(index=["A", "B", "C"]) + df["X"] = df.index + df["X"] = ["x", "y", "z"] + exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) assert_frame_equal(df, exp) def test_delitem(self, float_frame): - del float_frame['A'] - assert 'A' not in float_frame + del float_frame["A"] + assert "A" not in float_frame def test_delitem_multiindex(self): - midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) df = DataFrame(np.random.randn(4, 4), columns=midx) assert len(df.columns) == 4 - assert ('A', ) in df.columns - assert 'A' in df.columns + assert ("A",) in df.columns + assert "A" in df.columns - result = df['A'] + result = df["A"] assert isinstance(result, DataFrame) - del df['A'] + del df["A"] assert len(df.columns) == 2 # A still in the levels, BUT get a KeyError if trying # to delete - assert ('A', ) not in df.columns + assert ("A",) not in df.columns with pytest.raises(KeyError): - del df[('A',)] + del df[("A",)] # behavior of dropped/deleted MultiIndex levels changed from # GH 2770 to GH 19027: MultiIndex no longer '.__contains__' # levels which are dropped/deleted - assert 'A' not in df.columns + assert "A" not in df.columns with pytest.raises(KeyError): - del df['A'] + del df["A"] def test_pop(self, float_frame): - float_frame.columns.name = 'baz' + float_frame.columns.name = "baz" - float_frame.pop('A') - assert 'A' not in float_frame + float_frame.pop("A") + assert "A" not in float_frame - float_frame['foo'] = 'bar' - float_frame.pop('foo') - assert 'foo' not in float_frame - assert float_frame.columns.name == 'baz' + float_frame["foo"] = "bar" + float_frame.pop("foo") + assert "foo" not in float_frame + assert float_frame.columns.name == "baz" # gh-10912: inplace ops cause caching issue - a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[ - 'A', 'B', 'C'], index=['X', 'Y']) - b = a.pop('B') + a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"], index=["X", "Y"]) + b = a.pop("B") b += 1 # original frame - expected = DataFrame([[1, 3], [4, 6]], columns=[ - 'A', 'C'], index=['X', 'Y']) + expected = DataFrame([[1, 3], [4, 6]], columns=["A", "C"], index=["X", "Y"]) tm.assert_frame_equal(a, expected) # result - expected = Series([2, 5], index=['X', 'Y'], name='B') + 1 + expected = Series([2, 5], index=["X", "Y"], name="B") + 1 tm.assert_series_equal(b, expected) def test_pop_non_unique_cols(self): @@ -268,19 +268,18 @@ def test_pop_non_unique_cols(self): def test_insert_column_bug_4032(self): # GH4032, inserting a column and renaming causing errors - df = DataFrame({'b': [1.1, 2.2]}) + df = DataFrame({"b": [1.1, 2.2]}) df = df.rename(columns={}) - df.insert(0, 'a', [1, 2]) + df.insert(0, "a", [1, 2]) result = df.rename(columns={}) str(result) - expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b']) + expected = DataFrame([[1, 1.1], [2, 2.2]], columns=["a", "b"]) assert_frame_equal(result, expected) - df.insert(0, 'c', [1.3, 2.3]) + df.insert(0, "c", [1.3, 2.3]) result = df.rename(columns={}) str(result) - expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], - columns=['c', 'a', 'b']) + expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) assert_frame_equal(result, expected) diff 
--git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index e7583adff403b..4faa0d0e3f941 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -9,9 +9,7 @@ class TestDataFrameNonuniqueIndexes(TestData): - def test_column_dups_operations(self): - def check(result, expected=None): if expected is not None: assert_frame_equal(result, expected) @@ -22,253 +20,294 @@ def check(result, expected=None): # GH 3687 arr = np.random.randn(3, 2) idx = list(range(2)) - df = DataFrame(arr, columns=['A', 'A']) + df = DataFrame(arr, columns=["A", "A"]) df.columns = idx expected = DataFrame(arr, columns=idx) check(df, expected) - idx = date_range('20130101', periods=4, freq='Q-NOV') - df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], - columns=['a', 'a', 'a', 'a']) + idx = date_range("20130101", periods=4, freq="Q-NOV") + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"] + ) df.columns = idx - expected = DataFrame( - [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) + expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx) check(df, expected) # insert - df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], - columns=['foo', 'bar', 'foo', 'hello']) - df['string'] = 'bah' - expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'], - [2, 1, 3, 5, 'bah']], - columns=['foo', 'bar', 'foo', 'hello', 'string']) + df = DataFrame( + [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], + columns=["foo", "bar", "foo", "hello"], + ) + df["string"] = "bah" + expected = DataFrame( + [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]], + columns=["foo", "bar", "foo", "hello", "string"], + ) check(df, expected) - with pytest.raises(ValueError, match='Length of value'): - df.insert(0, 'AnotherColumn', range(len(df.index) - 1)) + with pytest.raises(ValueError, match="Length of value"): + df.insert(0, "AnotherColumn", range(len(df.index) - 1)) # insert same dtype - df['foo2'] = 3 - expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3], - [2, 1, 3, 5, 'bah', 3]], - columns=['foo', 'bar', 'foo', 'hello', - 'string', 'foo2']) + df["foo2"] = 3 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) check(df, expected) # set (non-dup) - df['foo2'] = 4 - expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4], - [2, 1, 3, 5, 'bah', 4]], - columns=['foo', 'bar', 'foo', 'hello', - 'string', 'foo2']) + df["foo2"] = 4 + expected = DataFrame( + [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]], + columns=["foo", "bar", "foo", "hello", "string", "foo2"], + ) check(df, expected) - df['foo2'] = 3 + df["foo2"] = 3 # delete (non dup) - del df['bar'] - expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3], - [2, 3, 5, 'bah', 3]], - columns=['foo', 'foo', 'hello', 'string', 'foo2']) + del df["bar"] + expected = DataFrame( + [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]], + columns=["foo", "foo", "hello", "string", "foo2"], + ) check(df, expected) # try to delete again (its not consolidated) - del df['hello'] - expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], - [2, 3, 'bah', 3]], - columns=['foo', 'foo', 'string', 'foo2']) + del df["hello"] + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", 
"foo2"], + ) check(df, expected) # consolidate df = df._consolidate() - expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3], - [2, 3, 'bah', 3]], - columns=['foo', 'foo', 'string', 'foo2']) + expected = DataFrame( + [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]], + columns=["foo", "foo", "string", "foo2"], + ) check(df, expected) # insert - df.insert(2, 'new_col', 5.) - expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3], - [2, 3, 5., 'bah', 3]], - columns=['foo', 'foo', 'new_col', 'string', - 'foo2']) + df.insert(2, "new_col", 5.0) + expected = DataFrame( + [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]], + columns=["foo", "foo", "new_col", "string", "foo2"], + ) check(df, expected) # insert a dup - with pytest.raises(ValueError, match='cannot insert'): - df.insert(2, 'new_col', 4.) - - df.insert(2, 'new_col', 4., allow_duplicates=True) - expected = DataFrame([[1, 1, 4., 5., 'bah', 3], - [1, 2, 4., 5., 'bah', 3], - [2, 3, 4., 5., 'bah', 3]], - columns=['foo', 'foo', 'new_col', - 'new_col', 'string', 'foo2']) + with pytest.raises(ValueError, match="cannot insert"): + df.insert(2, "new_col", 4.0) + + df.insert(2, "new_col", 4.0, allow_duplicates=True) + expected = DataFrame( + [ + [1, 1, 4.0, 5.0, "bah", 3], + [1, 2, 4.0, 5.0, "bah", 3], + [2, 3, 4.0, 5.0, "bah", 3], + ], + columns=["foo", "foo", "new_col", "new_col", "string", "foo2"], + ) check(df, expected) # delete (dup) - del df['foo'] - expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3], - [4., 5., 'bah', 3]], - columns=['new_col', 'new_col', 'string', 'foo2']) + del df["foo"] + expected = DataFrame( + [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]], + columns=["new_col", "new_col", "string", "foo2"], + ) assert_frame_equal(df, expected) # dup across dtypes - df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]], - columns=['foo', 'bar', 'foo', 'hello']) + df = DataFrame( + [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]], + columns=["foo", "bar", "foo", "hello"], + ) check(df) - df['foo2'] = 7. 
- expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.], - [2, 1, 3., 5, 7.]], - columns=['foo', 'bar', 'foo', 'hello', 'foo2']) + df["foo2"] = 7.0 + expected = DataFrame( + [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) check(df, expected) - result = df['foo'] - expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]], - columns=['foo', 'foo']) + result = df["foo"] + expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"]) check(result, expected) # multiple replacements - df['foo'] = 'string' - expected = DataFrame([['string', 1, 'string', 5, 7.], - ['string', 1, 'string', 5, 7.], - ['string', 1, 'string', 5, 7.]], - columns=['foo', 'bar', 'foo', 'hello', 'foo2']) + df["foo"] = "string" + expected = DataFrame( + [ + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ["string", 1, "string", 5, 7.0], + ], + columns=["foo", "bar", "foo", "hello", "foo2"], + ) check(df, expected) - del df['foo'] - expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[ - 'bar', 'hello', 'foo2']) + del df["foo"] + expected = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"] + ) check(df, expected) # values - df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x']) + df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"]) result = df.values expected = np.array([[1, 2.5], [3, 4.5]]) assert (result == expected).all().all() # rename, GH 4403 df4 = DataFrame( - {'RT': [0.0454], - 'TClose': [22.02], - 'TExg': [0.0422]}, - index=MultiIndex.from_tuples([(600809, 20130331)], - names=['STK_ID', 'RPT_Date'])) - - df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331], - 'STK_ID': [600809] * 3, - 'STK_Name': ['饡驦', '饡驦', '饡驦'], - 'TClose': [38.05, 41.66, 30.01]}, - index=MultiIndex.from_tuples( - [(600809, 20120930), - (600809, 20121231), - (600809, 20130331)], - names=['STK_ID', 'RPT_Date'])) - - k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True) - result = k.rename( - columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'}) + {"RT": [0.0454], "TClose": [22.02], "TExg": [0.0422]}, + index=MultiIndex.from_tuples( + [(600809, 20130331)], names=["STK_ID", "RPT_Date"] + ), + ) + + df5 = DataFrame( + { + "RPT_Date": [20120930, 20121231, 20130331], + "STK_ID": [600809] * 3, + "STK_Name": ["饡驦", "饡驦", "饡驦"], + "TClose": [38.05, 41.66, 30.01], + }, + index=MultiIndex.from_tuples( + [(600809, 20120930), (600809, 20121231), (600809, 20130331)], + names=["STK_ID", "RPT_Date"], + ), + ) + + k = pd.merge(df4, df5, how="inner", left_index=True, right_index=True) + result = k.rename(columns={"TClose_x": "TClose", "TClose_y": "QT_Close"}) str(result) result.dtypes - expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809, - '饡驦', 30.01]], - columns=['RT', 'TClose', 'TExg', - 'RPT_Date', 'STK_ID', 'STK_Name', - 'QT_Close']) - .set_index(['STK_ID', 'RPT_Date'], drop=False)) + expected = DataFrame( + [[0.0454, 22.02, 0.0422, 20130331, 600809, "饡驦", 30.01]], + columns=[ + "RT", + "TClose", + "TExg", + "RPT_Date", + "STK_ID", + "STK_Name", + "QT_Close", + ], + ).set_index(["STK_ID", "RPT_Date"], drop=False) assert_frame_equal(result, expected) # reindex is invalid! 
- df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], - columns=['bar', 'a', 'a']) + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - df.reindex(columns=['bar']) + df.reindex(columns=["bar"]) with pytest.raises(ValueError, match=msg): - df.reindex(columns=['bar', 'foo']) + df.reindex(columns=["bar", "foo"]) # drop - df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], - columns=['bar', 'a', 'a']) - result = df.drop(['a'], axis=1) - expected = DataFrame([[1], [1], [1]], columns=['bar']) + df = DataFrame( + [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] + ) + result = df.drop(["a"], axis=1) + expected = DataFrame([[1], [1], [1]], columns=["bar"]) check(result, expected) - result = df.drop('a', axis=1) + result = df.drop("a", axis=1) check(result, expected) # describe - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['bar', 'a', 'a'], dtype='float64') + df = DataFrame( + [[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=["bar", "a", "a"], + dtype="float64", + ) result = df.describe() s = df.iloc[:, 0].describe() expected = pd.concat([s, s, s], keys=df.columns, axis=1) check(result, expected) # check column dups with index equal and not equal to df's index - df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'A']) - for index in [df.index, pd.Index(list('edcba'))]: + df = DataFrame( + np.random.randn(5, 3), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "A"], + ) + for index in [df.index, pd.Index(list("edcba"))]: this_df = df.copy() expected_ser = pd.Series(index.values, index=this_df.index) - expected_df = DataFrame({'A': expected_ser, - 'B': this_df['B'], - 'A': expected_ser}, - columns=['A', 'B', 'A']) - this_df['A'] = index + expected_df = DataFrame( + {"A": expected_ser, "B": this_df["B"], "A": expected_ser}, + columns=["A", "B", "A"], + ) + this_df["A"] = index check(this_df, expected_df) # operations - for op in ['__add__', '__mul__', '__sub__', '__truediv__']: + for op in ["__add__", "__mul__", "__sub__", "__truediv__"]: df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10))) expected = getattr(df, op)(df) - expected.columns = ['A', 'A'] - df.columns = ['A', 'A'] + expected.columns = ["A", "A"] + df.columns = ["A", "A"] result = getattr(df, op)(df) check(result, expected) # multiple assignments that change dtypes # the location indexer is a slice # GH 6120 - df = DataFrame(np.random.randn(5, 2), columns=['that', 'that']) - expected = DataFrame(1.0, index=range(5), columns=['that', 'that']) + df = DataFrame(np.random.randn(5, 2), columns=["that", "that"]) + expected = DataFrame(1.0, index=range(5), columns=["that", "that"]) - df['that'] = 1.0 + df["that"] = 1.0 check(df, expected) - df = DataFrame(np.random.rand(5, 2), columns=['that', 'that']) - expected = DataFrame(1, index=range(5), columns=['that', 'that']) + df = DataFrame(np.random.rand(5, 2), columns=["that", "that"]) + expected = DataFrame(1, index=range(5), columns=["that", "that"]) - df['that'] = 1 + df["that"] = 1 check(df, expected) def test_column_dups2(self): # drop buggy GH 6240 - df = DataFrame({'A': np.random.randn(5), - 'B': np.random.randn(5), - 'C': np.random.randn(5), - 'D': ['a', 'b', 'c', 'd', 'e']}) + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) expected = df.take([0, 1, 1], axis=1) df2 = df.take([2, 0, 1, 
2, 1], axis=1) - result = df2.drop('C', axis=1) + result = df2.drop("C", axis=1) assert_frame_equal(result, expected) # dropna - df = DataFrame({'A': np.random.randn(5), - 'B': np.random.randn(5), - 'C': np.random.randn(5), - 'D': ['a', 'b', 'c', 'd', 'e']}) + df = DataFrame( + { + "A": np.random.randn(5), + "B": np.random.randn(5), + "C": np.random.randn(5), + "D": ["a", "b", "c", "d", "e"], + } + ) df.iloc[2, [0, 1, 2]] = np.nan df.iloc[0, 0] = np.nan df.iloc[1, 1] = np.nan df.iloc[:, 3] = np.nan - expected = df.dropna(subset=['A', 'B', 'C'], how='all') - expected.columns = ['A', 'A', 'B', 'C'] + expected = df.dropna(subset=["A", "B", "C"], how="all") + expected.columns = ["A", "A", "B", "C"] - df.columns = ['A', 'A', 'B', 'C'] + df.columns = ["A", "A", "B", "C"] - result = df.dropna(subset=['A', 'C'], how='all') + result = df.dropna(subset=["A", "C"], how="all") assert_frame_equal(result, expected) def test_column_dups_indexing(self): @@ -280,29 +319,28 @@ def check(result, expected=None): # boolean indexing # GH 4879 - dups = ['A', 'A', 'C', 'D'] - df = DataFrame(np.arange(12).reshape(3, 4), columns=[ - 'A', 'B', 'C', 'D'], dtype='float64') + dups = ["A", "A", "C", "D"] + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) expected = df[df.C > 6] expected.columns = dups - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") result = df[df.C > 6] check(result, expected) # where - df = DataFrame(np.arange(12).reshape(3, 4), columns=[ - 'A', 'B', 'C', 'D'], dtype='float64') + df = DataFrame( + np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64" + ) expected = df[df > 6] expected.columns = dups - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") result = df[df > 6] check(result, expected) # boolean with the duplicate raises - df = DataFrame(np.arange(12).reshape(3, 4), - columns=dups, dtype='float64') + df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64") msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): df[df.A > 6] @@ -316,10 +354,8 @@ def check(result, expected=None): assert_frame_equal(result, expected) # equality - df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], - columns=['A', 'B']) - df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], - columns=['A', 'A']) + df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"]) + df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"]) # not-comparing like-labelled msg = "Can only compare identically-labeled DataFrame objects" @@ -328,105 +364,116 @@ def check(result, expected=None): df1r = df1.reindex_like(df2) result = df1r == df2 - expected = DataFrame([[False, True], [True, False], [False, False], [ - True, False]], columns=['A', 'A']) + expected = DataFrame( + [[False, True], [True, False], [False, False], [True, False]], + columns=["A", "A"], + ) assert_frame_equal(result, expected) # mixed column selection # GH 5639 - dfbool = DataFrame({'one': Series([True, True, False], - index=['a', 'b', 'c']), - 'two': Series([False, False, True, False], - index=['a', 'b', 'c', 'd']), - 'three': Series([False, True, True, True], - index=['a', 'b', 'c', 'd'])}) - expected = pd.concat( - [dfbool['one'], dfbool['three'], dfbool['one']], axis=1) - result = dfbool[['one', 'three', 
'one']] + dfbool = DataFrame( + { + "one": Series([True, True, False], index=["a", "b", "c"]), + "two": Series([False, False, True, False], index=["a", "b", "c", "d"]), + "three": Series([False, True, True, True], index=["a", "b", "c", "d"]), + } + ) + expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1) + result = dfbool[["one", "three", "one"]] check(result, expected) # multi-axis dups # GH 6121 - df = DataFrame(np.arange(25.).reshape(5, 5), - index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'C', 'D', 'E']) - z = df[['A', 'C', 'A']].copy() - expected = z.loc[['a', 'c', 'a']] - - df = DataFrame(np.arange(25.).reshape(5, 5), - index=['a', 'b', 'c', 'd', 'e'], - columns=['A', 'B', 'C', 'D', 'E']) - z = df[['A', 'C', 'A']] - result = z.loc[['a', 'c', 'a']] + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]].copy() + expected = z.loc[["a", "c", "a"]] + + df = DataFrame( + np.arange(25.0).reshape(5, 5), + index=["a", "b", "c", "d", "e"], + columns=["A", "B", "C", "D", "E"], + ) + z = df[["A", "C", "A"]] + result = z.loc[["a", "c", "a"]] check(result, expected) def test_column_dups_indexing2(self): # GH 8363 # datetime ops with a non-unique index - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': np.arange(1, 6, dtype='int64')}, - index=[2, 2, 3, 3, 4]) + df = DataFrame( + {"A": np.arange(5, dtype="int64"), "B": np.arange(1, 6, dtype="int64")}, + index=[2, 2, 3, 3, 4], + ) result = df.B - df.A expected = Series(1, index=[2, 2, 3, 3, 4]) assert_series_equal(result, expected) - df = DataFrame({'A': date_range('20130101', periods=5), - 'B': date_range('20130101 09:00:00', periods=5)}, - index=[2, 2, 3, 3, 4]) + df = DataFrame( + { + "A": date_range("20130101", periods=5), + "B": date_range("20130101 09:00:00", periods=5), + }, + index=[2, 2, 3, 3, 4], + ) result = df.B - df.A - expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4]) + expected = Series(pd.Timedelta("9 hours"), index=[2, 2, 3, 3, 4]) assert_series_equal(result, expected) def test_columns_with_dups(self): # GH 3468 related # basic - df = DataFrame([[1, 2]], columns=['a', 'a']) - df.columns = ['a', 'a.1'] + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["a", "a.1"] str(df) - expected = DataFrame([[1, 2]], columns=['a', 'a.1']) + expected = DataFrame([[1, 2]], columns=["a", "a.1"]) assert_frame_equal(df, expected) - df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a']) - df.columns = ['b', 'a', 'a.1'] + df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"]) + df.columns = ["b", "a", "a.1"] str(df) - expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1']) + expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"]) assert_frame_equal(df, expected) # with a dup index - df = DataFrame([[1, 2]], columns=['a', 'a']) - df.columns = ['b', 'b'] + df = DataFrame([[1, 2]], columns=["a", "a"]) + df.columns = ["b", "b"] str(df) - expected = DataFrame([[1, 2]], columns=['b', 'b']) + expected = DataFrame([[1, 2]], columns=["b", "b"]) assert_frame_equal(df, expected) # multi-dtype - df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], - columns=['a', 'a', 'b', 'b', 'd', 'c', 'c']) - df.columns = list('ABCDEFG') + df = DataFrame( + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], + columns=["a", "a", "b", "b", "d", "c", "c"], + ) + df.columns = list("ABCDEFG") str(df) expected = DataFrame( - [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG')) + [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], 
columns=list("ABCDEFG") + ) assert_frame_equal(df, expected) - df = DataFrame([[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a']) - df.columns = ['a', 'a.1', 'a.2', 'a.3'] + df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"]) + df.columns = ["a", "a.1", "a.2", "a.3"] str(df) - expected = DataFrame([[1, 2, 'foo', 'bar']], - columns=['a', 'a.1', 'a.2', 'a.3']) + expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"]) assert_frame_equal(df, expected) # dups across blocks - df_float = DataFrame(np.random.randn(10, 3), dtype='float64') - df_int = DataFrame(np.random.randn(10, 3), dtype='int64') - df_bool = DataFrame(True, index=df_float.index, - columns=df_float.columns) - df_object = DataFrame('foo', index=df_float.index, - columns=df_float.columns) - df_dt = DataFrame(pd.Timestamp('20010101'), - index=df_float.index, - columns=df_float.columns) + df_float = DataFrame(np.random.randn(10, 3), dtype="float64") + df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) + df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) + df_dt = DataFrame( + pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns + ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) assert len(df._data._blknos) == len(df.columns) @@ -437,27 +484,26 @@ def test_columns_with_dups(self): df.iloc[:, i] # dup columns across dtype GH 2079/2194 - vals = [[1, -1, 2.], [2, -2, 3.]] - rs = DataFrame(vals, columns=['A', 'A', 'B']) + vals = [[1, -1, 2.0], [2, -2, 3.0]] + rs = DataFrame(vals, columns=["A", "A", "B"]) xp = DataFrame(vals) - xp.columns = ['A', 'A', 'B'] + xp.columns = ["A", "A", "B"] assert_frame_equal(rs, xp) def test_values_duplicates(self): - df = DataFrame([[1, 2, 'a', 'b'], - [1, 2, 'a', 'b']], - columns=['one', 'one', 'two', 'two']) + df = DataFrame( + [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"] + ) result = df.values - expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], - dtype=object) + expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_set_value_by_index(self): # See gh-12344 df = DataFrame(np.arange(9).reshape(3, 3).T) - df.columns = list('AAA') + df.columns = list("AAA") expected = df.iloc[:, 2] df.iloc[:, 0] = 3 @@ -473,9 +519,10 @@ def test_set_value_by_index(self): def test_insert_with_columns_dups(self): # GH 14291 df = pd.DataFrame() - df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True) - df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True) - df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True) - exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'], - ['c', 'f', 'i']], columns=['A', 'A', 'A']) + df.insert(0, "A", ["g", "h", "i"], allow_duplicates=True) + df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) + df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) + exp = pd.DataFrame( + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + ) assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 1e932879e9ad0..67482ddf657fb 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -10,79 +10,100 @@ from pandas.tests.frame.common import _check_mixed_float import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, assert_numpy_array_equal, 
assert_series_equal) + assert_frame_equal, + assert_numpy_array_equal, + assert_series_equal, +) class TestDataFrameUnaryOperators: # __pos__, __neg__, __inv__ - @pytest.mark.parametrize('df,expected', [ - (pd.DataFrame({'a': [-1, 1]}), pd.DataFrame({'a': [1, -1]})), - (pd.DataFrame({'a': [False, True]}), - pd.DataFrame({'a': [True, False]})), - (pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), - pd.DataFrame({'a': pd.Series(pd.to_timedelta([1, -1]))})) - ]) + @pytest.mark.parametrize( + "df,expected", + [ + (pd.DataFrame({"a": [-1, 1]}), pd.DataFrame({"a": [1, -1]})), + (pd.DataFrame({"a": [False, True]}), pd.DataFrame({"a": [True, False]})), + ( + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([1, -1]))}), + ), + ], + ) def test_neg_numeric(self, df, expected): assert_frame_equal(-df, expected) - assert_series_equal(-df['a'], expected['a']) - - @pytest.mark.parametrize('df, expected', [ - (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), - ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]), - ]) + assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df, expected", + [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]), + ], + ) def test_neg_object(self, df, expected): # GH#21380 - df = pd.DataFrame({'a': df}) - expected = pd.DataFrame({'a': expected}) + df = pd.DataFrame({"a": df}) + expected = pd.DataFrame({"a": expected}) assert_frame_equal(-df, expected) - assert_series_equal(-df['a'], expected['a']) - - @pytest.mark.parametrize('df', [ - pd.DataFrame({'a': ['a', 'b']}), - pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), - ]) + assert_series_equal(-df["a"], expected["a"]) + + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": ["a", "b"]}), + pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), + ], + ) def test_neg_raises(self, df): with pytest.raises(TypeError): - (- df) + (-df) with pytest.raises(TypeError): - (- df['a']) + (-df["a"]) def test_invert(self, float_frame): df = float_frame assert_frame_equal(-(df < 0), ~(df < 0)) - @pytest.mark.parametrize('df', [ - pd.DataFrame({'a': [-1, 1]}), - pd.DataFrame({'a': [False, True]}), - pd.DataFrame({'a': pd.Series(pd.to_timedelta([-1, 1]))}), - ]) + @pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [-1, 1]}), + pd.DataFrame({"a": [False, True]}), + pd.DataFrame({"a": pd.Series(pd.to_timedelta([-1, 1]))}), + ], + ) def test_pos_numeric(self, df): # GH#16073 assert_frame_equal(+df, df) - assert_series_equal(+df['a'], df['a']) - - @pytest.mark.parametrize('df', [ - # numpy changing behavior in the future - pytest.param(pd.DataFrame({'a': ['a', 'b']}), - marks=[pytest.mark.filterwarnings("ignore")]), - pd.DataFrame({'a': np.array([-1, 2], dtype=object)}), - pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}), - ]) + assert_series_equal(+df["a"], df["a"]) + + @pytest.mark.parametrize( + "df", + [ + # numpy changing behavior in the future + pytest.param( + pd.DataFrame({"a": ["a", "b"]}), + marks=[pytest.mark.filterwarnings("ignore")], + ), + pd.DataFrame({"a": np.array([-1, 2], dtype=object)}), + pd.DataFrame({"a": [Decimal("-1.0"), Decimal("2.0")]}), + ], + ) def test_pos_object(self, df): # GH#21380 assert_frame_equal(+df, df) - assert_series_equal(+df['a'], df['a']) + assert_series_equal(+df["a"], df["a"]) - @pytest.mark.parametrize('df', [ - pd.DataFrame({'a': 
pd.to_datetime(['2017-01-22', '1970-01-01'])}), - ]) + @pytest.mark.parametrize( + "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] + ) def test_pos_raises(self, df): with pytest.raises(TypeError): - (+ df) + (+df) with pytest.raises(TypeError): - (+ df['a']) + (+df["a"]) class TestDataFrameLogicalOperators: @@ -103,14 +124,14 @@ def test_logical_ops_empty_frame(self): result = df & df2 assert_frame_equal(result, df2) - dfa = DataFrame(index=[1], columns=['A']) + dfa = DataFrame(index=[1], columns=["A"]) result = dfa & dfa assert_frame_equal(result, dfa) def test_logical_ops_bool_frame(self): # GH#5808 - df1a_bool = DataFrame(True, index=[1], columns=['A']) + df1a_bool = DataFrame(True, index=[1], columns=["A"]) result = df1a_bool & df1a_bool assert_frame_equal(result, df1a_bool) @@ -120,8 +141,8 @@ def test_logical_ops_bool_frame(self): def test_logical_ops_int_frame(self): # GH#5808 - df1a_int = DataFrame(1, index=[1], columns=['A']) - df1a_bool = DataFrame(True, index=[1], columns=['A']) + df1a_int = DataFrame(1, index=[1], columns=["A"]) + df1a_bool = DataFrame(True, index=[1], columns=["A"]) result = df1a_int | df1a_bool assert_frame_equal(result, df1a_int) @@ -129,48 +150,46 @@ def test_logical_ops_int_frame(self): def test_logical_ops_invalid(self): # GH#5808 - df1 = DataFrame(1.0, index=[1], columns=['A']) - df2 = DataFrame(True, index=[1], columns=['A']) + df1 = DataFrame(1.0, index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) with pytest.raises(TypeError): df1 | df2 - df1 = DataFrame('foo', index=[1], columns=['A']) - df2 = DataFrame(True, index=[1], columns=['A']) + df1 = DataFrame("foo", index=[1], columns=["A"]) + df2 = DataFrame(True, index=[1], columns=["A"]) with pytest.raises(TypeError): df1 | df2 def test_logical_operators(self): - def _check_bin_op(op): result = op(df1, df2) - expected = DataFrame(op(df1.values, df2.values), index=df1.index, - columns=df1.columns) + expected = DataFrame( + op(df1.values, df2.values), index=df1.index, columns=df1.columns + ) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) def _check_unary_op(op): result = op(df1) - expected = DataFrame(op(df1.values), index=df1.index, - columns=df1.columns) + expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) - df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, - 'b': {'a': False, 'b': True, 'c': False, - 'd': False, 'e': False}, - 'c': {'a': False, 'b': False, 'c': True, - 'd': False, 'e': False}, - 'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, - 'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}} - - df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, - 'b': {'a': False, 'b': True, 'c': False, - 'd': False, 'e': False}, - 'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, - 'd': {'a': False, 'b': False, 'c': False, - 'd': True, 'e': False}, - 'e': {'a': False, 'b': False, 'c': False, - 'd': False, 'e': True}} + df1 = { + "a": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "b": {"a": False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": False, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": True, "b": False, "c": False, "d": True, "e": True}, + "e": {"a": True, "b": False, "c": False, "d": True, "e": True}, + } + + df2 = { + "a": {"a": True, "b": False, "c": True, "d": False, "e": False}, + "b": {"a": 
False, "b": True, "c": False, "d": False, "e": False}, + "c": {"a": True, "b": False, "c": True, "d": False, "e": False}, + "d": {"a": False, "b": False, "c": False, "d": True, "e": False}, + "e": {"a": False, "b": False, "c": False, "d": False, "e": True}, + } df1 = DataFrame(df1) df2 = DataFrame(df2) @@ -182,31 +201,32 @@ def _check_unary_op(op): _check_unary_op(operator.inv) # TODO: belongs elsewhere def test_logical_with_nas(self): - d = DataFrame({'a': [np.nan, False], 'b': [True, True]}) + d = DataFrame({"a": [np.nan, False], "b": [True, True]}) # GH4947 # bool comparisons should return bool - result = d['a'] | d['b'] + result = d["a"] | d["b"] expected = Series([False, True]) assert_series_equal(result, expected) # GH4604, automatic casting here - result = d['a'].fillna(False) | d['b'] + result = d["a"].fillna(False) | d["b"] expected = Series([True, True]) assert_series_equal(result, expected) - result = d['a'].fillna(False, downcast=False) | d['b'] + result = d["a"].fillna(False, downcast=False) | d["b"] expected = Series([True, True]) assert_series_equal(result, expected) class TestDataFrameOperators: - - @pytest.mark.parametrize('op', [operator.add, operator.sub, - operator.mul, operator.truediv]) + @pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] + ) def test_operators_none_as_na(self, op): - df = DataFrame({"col1": [2, 5.0, 123, None], - "col2": [1, 2, 3, 4]}, dtype=object) + df = DataFrame( + {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object + ) # since filling converts dtypes from object, changed expected to be # object @@ -227,13 +247,12 @@ def test_operators_none_as_na(self, op): result = op(df.fillna(7), df) assert_frame_equal(result, expected, check_dtype=False) - @pytest.mark.parametrize('op,res', [('__eq__', False), - ('__ne__', True)]) + @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)]) # TODO: not sure what's correct here. 
@pytest.mark.filterwarnings("ignore:elementwise:FutureWarning") def test_logical_typeerror_with_non_valid(self, op, res, float_frame): # we are comparing floats vs a string - result = getattr(float_frame, op)('foo') + result = getattr(float_frame, op)("foo") assert bool(result.all().all()) is res def test_binary_ops_align(self): @@ -241,105 +260,109 @@ def test_binary_ops_align(self): # test aligning binary ops # GH 6681 - index = MultiIndex.from_product([list('abc'), - ['one', 'two', 'three'], - [1, 2, 3]], - names=['first', 'second', 'third']) + index = MultiIndex.from_product( + [list("abc"), ["one", "two", "three"], [1, 2, 3]], + names=["first", "second", "third"], + ) - df = DataFrame(np.arange(27 * 3).reshape(27, 3), - index=index, - columns=['value1', 'value2', 'value3']).sort_index() + df = DataFrame( + np.arange(27 * 3).reshape(27, 3), + index=index, + columns=["value1", "value2", "value3"], + ).sort_index() idx = pd.IndexSlice - for op in ['add', 'sub', 'mul', 'div', 'truediv']: + for op in ["add", "sub", "mul", "div", "truediv"]: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) - result = getattr(df, op)(x, level='third', axis=0) + result = getattr(df, op)(x, level="third", axis=0) - expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) - for i, v in x.iteritems()]).sort_index() + expected = pd.concat( + [opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()] + ).sort_index() assert_frame_equal(result, expected) - x = Series([1.0, 10.0], ['two', 'three']) - result = getattr(df, op)(x, level='second', axis=0) + x = Series([1.0, 10.0], ["two", "three"]) + result = getattr(df, op)(x, level="second", axis=0) - expected = (pd.concat([opa(df.loc[idx[:, i], :], v) - for i, v in x.iteritems()]) - .reindex_like(df).sort_index()) + expected = ( + pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]) + .reindex_like(df) + .sort_index() + ) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) - midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']]) - df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx) - s = pd.Series({'a': 1, 'b': 2}) + midx = MultiIndex.from_product([["A", "B"], ["a", "b"]]) + df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx) + s = pd.Series({"a": 1, "b": 2}) df2 = df.copy() - df2.columns.names = ['lvl0', 'lvl1'] + df2.columns.names = ["lvl0", "lvl1"] s2 = s.copy() - s2.index.name = 'lvl1' + s2.index.name = "lvl1" # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) - res5 = df2.mul(s, axis=1, level='lvl1') - res6 = df2.mul(s2, axis=1, level='lvl1') + res5 = df2.mul(s, axis=1, level="lvl1") + res6 = df2.mul(s2, axis=1, level="lvl1") - exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'), - columns=midx) + exp = DataFrame( + np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx + ) for res in [res1, res2]: assert_frame_equal(res, exp) - exp.columns.names = ['lvl0', 'lvl1'] + exp.columns.names = ["lvl0", "lvl1"] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp) def test_dti_tz_convert_to_utc(self): - base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], tz='UTC') - idx1 = base.tz_convert('Asia/Tokyo')[:2] - idx2 = base.tz_convert('US/Eastern')[1:] - - df1 = DataFrame({'A': [1, 2]}, index=idx1) - df2 = DataFrame({'A': [1, 1]}, index=idx2) - exp = 
DataFrame({'A': [np.nan, 3, np.nan]}, index=base) + base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] + + df1 = DataFrame({"A": [1, 2]}, index=idx1) + df2 = DataFrame({"A": [1, 1]}, index=idx2) + exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base) assert_frame_equal(df1 + df2, exp) - def test_combineFrame(self, float_frame, mixed_float_frame, - mixed_int_frame): + def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): frame_copy = float_frame.reindex(float_frame.index[::2]) - del frame_copy['D'] - frame_copy['C'][:5] = np.nan + del frame_copy["D"] + frame_copy["C"][:5] = np.nan added = float_frame + frame_copy - indexer = added['A'].dropna().index - exp = (float_frame['A'] * 2).copy() + indexer = added["A"].dropna().index + exp = (float_frame["A"] * 2).copy() - tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer]) + tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer]) exp.loc[~exp.index.isin(indexer)] = np.nan - tm.assert_series_equal(added['A'], exp.loc[added['A'].index]) + tm.assert_series_equal(added["A"], exp.loc[added["A"].index]) - assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all() + assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all() # assert(False) - assert np.isnan(added['D']).all() + assert np.isnan(added["D"]).all() self_added = float_frame + float_frame tm.assert_index_equal(self_added.index, float_frame.index) added_rev = frame_copy + float_frame - assert np.isnan(added['D']).all() - assert np.isnan(added_rev['D']).all() + assert np.isnan(added["D"]).all() + assert np.isnan(added_rev["D"]).all() # corner cases @@ -360,9 +383,9 @@ def test_combineFrame(self, float_frame, mixed_float_frame, # mix vs float64, upcast added = float_frame + mixed_float_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") added = mixed_float_frame + float_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") # mix vs mix added = mixed_float_frame + mixed_float_frame @@ -370,10 +393,11 @@ def test_combineFrame(self, float_frame, mixed_float_frame, # with int added = float_frame + mixed_int_frame - _check_mixed_float(added, dtype='float64') + _check_mixed_float(added, dtype="float64") - def test_combineSeries(self, float_frame, mixed_float_frame, - mixed_int_frame, datetime_frame): + def test_combineSeries( + self, float_frame, mixed_float_frame, mixed_int_frame, datetime_frame + ): # Series series = float_frame.xs(float_frame.index[0]) @@ -384,23 +408,23 @@ def test_combineSeries(self, float_frame, mixed_float_frame, assert_series_equal(s, float_frame[key] + series[key]) larger_series = series.to_dict() - larger_series['E'] = 1 + larger_series["E"] = 1 larger_series = Series(larger_series) larger_added = float_frame + larger_series for key, s in float_frame.items(): assert_series_equal(larger_added[key], s + series[key]) - assert 'E' in larger_added - assert np.isnan(larger_added['E']).all() + assert "E" in larger_added + assert np.isnan(larger_added["E"]).all() # no upcast needed added = mixed_float_frame + series _check_mixed_float(added) # vs mix (upcast) as needed - added = mixed_float_frame + series.astype('float32') + added = mixed_float_frame + series.astype("float32") _check_mixed_float(added, dtype=dict(C=None)) - added = mixed_float_frame + series.astype('float16') + added = mixed_float_frame + series.astype("float16") 
_check_mixed_float(added, dtype=dict(C=None)) # these raise with numexpr.....as we are adding an int64 to an @@ -414,46 +438,48 @@ def test_combineSeries(self, float_frame, mixed_float_frame, # 'int32', D = 'int64')) # TimeSeries - ts = datetime_frame['A'] + ts = datetime_frame["A"] # 10890 # we no longer allow auto timeseries broadcasting # and require explicit broadcasting - added = datetime_frame.add(ts, axis='index') + added = datetime_frame.add(ts, axis="index") for key, col in datetime_frame.items(): result = col + ts assert_series_equal(added[key], result, check_names=False) assert added[key].name == key if col.name == ts.name: - assert result.name == 'A' + assert result.name == "A" else: assert result.name is None smaller_frame = datetime_frame[:-5] - smaller_added = smaller_frame.add(ts, axis='index') + smaller_added = smaller_frame.add(ts, axis="index") tm.assert_index_equal(smaller_added.index, datetime_frame.index) smaller_ts = ts[:-5] - smaller_added2 = datetime_frame.add(smaller_ts, axis='index') + smaller_added2 = datetime_frame.add(smaller_ts, axis="index") assert_frame_equal(smaller_added, smaller_added2) # length 0, result is all-nan - result = datetime_frame.add(ts[:0], axis='index') - expected = DataFrame(np.nan, index=datetime_frame.index, - columns=datetime_frame.columns) + result = datetime_frame.add(ts[:0], axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) assert_frame_equal(result, expected) # Frame is all-nan - result = datetime_frame[:0].add(ts, axis='index') - expected = DataFrame(np.nan, index=datetime_frame.index, - columns=datetime_frame.columns) + result = datetime_frame[:0].add(ts, axis="index") + expected = DataFrame( + np.nan, index=datetime_frame.index, columns=datetime_frame.columns + ) assert_frame_equal(result, expected) # empty but with non-empty index frame = datetime_frame[:1].reindex(columns=[]) - result = frame.mul(ts, axis='index') + result = frame.mul(ts, axis="index") assert len(result) == len(ts) def test_combineFunc(self, float_frame, mixed_float_frame): @@ -463,8 +489,7 @@ def test_combineFunc(self, float_frame, mixed_float_frame): # vs mix result = mixed_float_frame * 2 for c, s in result.items(): - tm.assert_numpy_array_equal( - s.values, mixed_float_frame[c].values * 2) + tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2) _check_mixed_float(result, dtype=dict(C=None)) result = DataFrame() * 2 @@ -475,26 +500,25 @@ def test_comparisons(self, simple_frame, float_frame): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() - row = simple_frame.xs('a') + row = simple_frame.xs("a") ndim_5 = np.ones(df1.shape + (1, 1, 1)) def test_comp(func): result = func(df1, df2) - tm.assert_numpy_array_equal(result.values, - func(df1.values, df2.values)) + tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) - with pytest.raises(ValueError, match='dim must be <= 2'): + with pytest.raises(ValueError, match="dim must be <= 2"): func(df1, ndim_5) result2 = func(simple_frame, row) - tm.assert_numpy_array_equal(result2.values, - func(simple_frame.values, row.values)) + tm.assert_numpy_array_equal( + result2.values, func(simple_frame.values, row.values) + ) result3 = func(float_frame, 0) - tm.assert_numpy_array_equal(result3.values, - func(float_frame.values, 0)) + tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0)) - msg = 'Can only compare identically-labeled DataFrame' + msg = "Can only compare identically-labeled DataFrame" with 
pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) @@ -507,10 +531,10 @@ def test_comp(func): def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() - missing_df.iloc[0]['A'] = np.nan - with np.errstate(invalid='ignore'): + missing_df.iloc[0]["A"] = np.nan + with np.errstate(invalid="ignore"): expected = missing_df.values < 0 - with np.errstate(invalid='raise'): + with np.errstate(invalid="raise"): result = (missing_df < 0).values tm.assert_numpy_array_equal(result, expected) @@ -533,9 +557,9 @@ def test_boolean_comparison(self): result = df.values > b assert_numpy_array_equal(result, expected.values) - msg1d = 'Unable to coerce to Series, length must be 2: given 3' - msg2d = 'Unable to coerce to DataFrame, shape must be' - msg2db = 'operands could not be broadcast together with shapes' + msg1d = "Unable to coerce to Series, length must be 2: given 3" + msg2d = "Unable to coerce to DataFrame, shape must be" + msg2db = "operands could not be broadcast together with shapes" with pytest.raises(ValueError, match=msg1d): # wrong shape df > lst @@ -581,8 +605,9 @@ def test_boolean_comparison(self): assert df.values.shape != b_c.shape # with alignment - df = DataFrame(np.arange(6).reshape((3, 2)), - columns=list('AB'), index=list('abc')) + df = DataFrame( + np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc") + ) expected.index = df.index expected.columns = df.columns @@ -594,18 +619,20 @@ def test_boolean_comparison(self): def test_combine_generic(self, float_frame): df1 = float_frame - df2 = float_frame.loc[float_frame.index[:-5], ['A', 'B', 'C']] + df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]] combined = df1.combine(df2, np.add) combined2 = df2.combine(df1, np.add) - assert combined['D'].isna().all() - assert combined2['D'].isna().all() + assert combined["D"].isna().all() + assert combined2["D"].isna().all() - chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']] - chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']] + chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]] + chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]] - exp = float_frame.loc[float_frame.index[:-5], - ['A', 'B', 'C']].reindex_like(chunk) * 2 + exp = ( + float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk) + * 2 + ) assert_frame_equal(chunk, exp) assert_frame_equal(chunk2, exp) @@ -614,13 +641,15 @@ def test_inplace_ops_alignment(self): # inplace ops / ops alignment # GH 8511 - columns = list('abcdefg') - X_orig = DataFrame(np.arange(10 * len(columns)) - .reshape(-1, len(columns)), - columns=columns, index=range(10)) + columns = list("abcdefg") + X_orig = DataFrame( + np.arange(10 * len(columns)).reshape(-1, len(columns)), + columns=columns, + index=range(10), + ) Z = 100 * X_orig.iloc[:, 1:-1].copy() - block1 = list('bedcf') - subs = list('bcdef') + block1 = list("bedcf") + subs = list("bcdef") # add X = X_orig.copy() @@ -697,42 +726,54 @@ def test_inplace_ops_identity(self): # mixed dtype arr = np.random.randint(0, 10, size=5) - df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'}) + df_orig = DataFrame({"A": arr.copy(), "B": "foo"}) df = df_orig.copy() df2 = df - df['A'] += 1 - expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'}) + df["A"] += 1 + expected = DataFrame({"A": arr.copy() + 1, "B": "foo"}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data df = df_orig.copy() df2 = df - df['A'] += 1.5 - expected = DataFrame({'A': arr.copy() + 
1.5, 'B': 'foo'}) + df["A"] += 1.5 + expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data - @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod', - 'mul', 'or', 'pow', 'sub', 'truediv', - 'xor']) + @pytest.mark.parametrize( + "op", + [ + "add", + "and", + "div", + "floordiv", + "mod", + "mul", + "or", + "pow", + "sub", + "truediv", + "xor", + ], + ) def test_inplace_ops_identity2(self, op): - if op == 'div': + if op == "div": return - df = DataFrame({'a': [1., 2., 3.], - 'b': [1, 2, 3]}) + df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]}) operand = 2 - if op in ('and', 'or', 'xor'): + if op in ("and", "or", "xor"): # cannot use floats for boolean ops - df['a'] = [True, False, True] + df["a"] = [True, False, True] df_copy = df.copy() - iop = '__i{}__'.format(op) - op = '__{}__'.format(op) + iop = "__i{}__".format(op) + op = "__{}__".format(op) # no id change and value is correct getattr(df, iop)(operand) @@ -742,55 +783,62 @@ def test_inplace_ops_identity2(self, op): assert id(df) == expected def test_alignment_non_pandas(self): - index = ['A', 'B', 'C'] - columns = ['X', 'Y', 'Z'] + index = ["A", "B", "C"] + columns = ["X", "Y", "Z"] df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) align = pd.core.ops._align_method_FRAME - for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64), - range(1, 4)]: - - tm.assert_series_equal(align(df, val, 'index'), - Series([1, 2, 3], index=df.index)) - tm.assert_series_equal(align(df, val, 'columns'), - Series([1, 2, 3], index=df.columns)) + for val in [ + [1, 2, 3], + (1, 2, 3), + np.array([1, 2, 3], dtype=np.int64), + range(1, 4), + ]: + + tm.assert_series_equal( + align(df, val, "index"), Series([1, 2, 3], index=df.index) + ) + tm.assert_series_equal( + align(df, val, "columns"), Series([1, 2, 3], index=df.columns) + ) # length mismatch - msg = 'Unable to coerce to Series, length must be 3: given 2' + msg = "Unable to coerce to Series, length must be 3: given 2" for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: with pytest.raises(ValueError, match=msg): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError, match=msg): - align(df, val, 'columns') + align(df, val, "columns") val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - tm.assert_frame_equal(align(df, val, 'index'), - DataFrame(val, index=df.index, - columns=df.columns)) - tm.assert_frame_equal(align(df, val, 'columns'), - DataFrame(val, index=df.index, - columns=df.columns)) + tm.assert_frame_equal( + align(df, val, "index"), DataFrame(val, index=df.index, columns=df.columns) + ) + tm.assert_frame_equal( + align(df, val, "columns"), + DataFrame(val, index=df.index, columns=df.columns), + ) # shape mismatch - msg = 'Unable to coerce to DataFrame, shape must be' + msg = "Unable to coerce to DataFrame, shape must be" val = np.array([[1, 2, 3], [4, 5, 6]]) with pytest.raises(ValueError, match=msg): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError, match=msg): - align(df, val, 'columns') + align(df, val, "columns") val = np.zeros((3, 3, 3)) with pytest.raises(ValueError): - align(df, val, 'index') + align(df, val, "index") with pytest.raises(ValueError): - align(df, val, 'columns') + align(df, val, "columns") def test_no_warning(self, all_arithmetic_operators): - df = pd.DataFrame({"A": [0., 0.], "B": [0., None]}) - b = df['B'] + df = pd.DataFrame({"A": [0.0, 0.0], "B": [0.0, 
None]}) + b = df["B"] with tm.assert_produces_warning(None): getattr(df, all_arithmetic_operators)(b, 0) @@ -798,7 +846,7 @@ def test_no_warning(self, all_arithmetic_operators): class TestTranspose: def test_transpose_tzaware_1col_single_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") df = pd.DataFrame(dti) assert (df.dtypes == dti.dtype).all() @@ -807,27 +855,27 @@ def test_transpose_tzaware_1col_single_tz(self): def test_transpose_tzaware_2col_single_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") - df3 = pd.DataFrame({'A': dti, 'B': dti}) + df3 = pd.DataFrame({"A": dti, "B": dti}) assert (df3.dtypes == dti.dtype).all() res3 = df3.T assert (res3.dtypes == dti.dtype).all() def test_transpose_tzaware_2col_mixed_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') - dti2 = dti.tz_convert('US/Pacific') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") - df4 = pd.DataFrame({'A': dti, 'B': dti2}) + df4 = pd.DataFrame({"A": dti, "B": dti2}) assert (df4.dtypes == [dti.dtype, dti2.dtype]).all() assert (df4.T.dtypes == object).all() tm.assert_frame_equal(df4.T.T, df4) def test_transpose_object_to_tzaware_mixed_tz(self): # GH#26825 - dti = pd.date_range('2016-04-05 04:30', periods=3, tz='UTC') - dti2 = dti.tz_convert('US/Pacific') + dti = pd.date_range("2016-04-05 04:30", periods=3, tz="UTC") + dti2 = dti.tz_convert("US/Pacific") # mixed all-tzaware dtypes df2 = pd.DataFrame([dti, dti2]) diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index e36f8107ba9fd..a545db3365e36 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, Timedelta, date_range, - period_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Timedelta, + date_range, + period_range, + to_datetime, +) import pandas.util.testing as tm @@ -15,9 +22,8 @@ def _permute(obj): class TestPeriodIndex: - def test_as_frame_columns(self): - rng = period_range('1/1/2000', periods=5) + rng = period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) ts = df[rng[0]] @@ -26,116 +32,119 @@ def test_as_frame_columns(self): # GH # 1211 repr(df) - ts = df['1/1/2000'] + ts = df["1/1/2000"] tm.assert_series_equal(ts, df.iloc[:, 0]) def test_frame_setitem(self): - rng = period_range('1/1/2000', periods=5, name='index') + rng = period_range("1/1/2000", periods=5, name="index") df = DataFrame(np.random.randn(5, 3), index=rng) - df['Index'] = rng - rs = Index(df['Index']) + df["Index"] = rng + rs = Index(df["Index"]) tm.assert_index_equal(rs, rng, check_names=False) - assert rs.name == 'Index' - assert rng.name == 'index' + assert rs.name == "Index" + assert rng.name == "index" - rs = df.reset_index().set_index('index') + rs = df.reset_index().set_index("index") assert isinstance(rs.index, PeriodIndex) tm.assert_index_equal(rs.index, rng) def test_frame_to_time_stamp(self): K = 5 - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") df = DataFrame(np.random.randn(len(index), K), index=index) - df['mix'] = 'a' + df["mix"] = "a" - exp_index = date_range('1/1/2001', end='12/31/2009', 
freq='A-DEC') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') - result = df.to_timestamp('D', 'end') + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end") tm.assert_index_equal(result.index, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start') + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start") tm.assert_index_equal(result.index, exp_index) - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) + def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end') + result = df.to_timestamp("H", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end') + result = df.to_timestamp("T", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - result = df.to_timestamp('S', 'end') + result = df.to_timestamp("S", "end") delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) # columns df = df.T - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') - result = df.to_timestamp('D', 'end', axis=1) + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") + result = df.to_timestamp("D", "end", axis=1) tm.assert_index_equal(result.columns, exp_index) tm.assert_numpy_array_equal(result.values, df.values) - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = df.to_timestamp('D', 'start', axis=1) + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = df.to_timestamp("D", "start", axis=1) tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23) - result = df.to_timestamp('H', 'end', axis=1) + result = df.to_timestamp("H", "end", axis=1) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) - result = df.to_timestamp('T', 'end', axis=1) + result = df.to_timestamp("T", "end", axis=1) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) - result = df.to_timestamp('S', 'end', axis=1) + result = df.to_timestamp("S", "end", axis=1) delta = timedelta(hours=23, minutes=59, seconds=59) exp_index 
= _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.columns, exp_index) # invalid axis - with pytest.raises(ValueError, match='axis'): + with pytest.raises(ValueError, match="axis"): df.to_timestamp(axis=2) - result1 = df.to_timestamp('5t', axis=1) - result2 = df.to_timestamp('t', axis=1) - expected = pd.date_range('2001-01-01', '2009-01-01', freq='AS') + result1 = df.to_timestamp("5t", axis=1) + result2 = df.to_timestamp("t", axis=1) + expected = pd.date_range("2001-01-01", "2009-01-01", freq="AS") assert isinstance(result1.columns, DatetimeIndex) assert isinstance(result2.columns, DatetimeIndex) tm.assert_numpy_array_equal(result1.columns.asi8, expected.asi8) tm.assert_numpy_array_equal(result2.columns.asi8, expected.asi8) # PeriodIndex.to_timestamp always use 'infer' - assert result1.columns.freqstr == 'AS-JAN' - assert result2.columns.freqstr == 'AS-JAN' + assert result1.columns.freqstr == "AS-JAN" + assert result2.columns.freqstr == "AS-JAN" def test_frame_index_to_string(self): - index = PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") frame = DataFrame(np.random.randn(3, 4), index=index) # it works! frame.to_string() def test_align_frame(self): - rng = period_range('1/1/2000', '1/1/2010', freq='A') + rng = period_range("1/1/2000", "1/1/2010", freq="A") ts = DataFrame(np.random.randn(len(rng), 3), index=rng) result = ts + ts[::2] diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 097477c42d249..bbb3395fb23af 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -8,92 +8,97 @@ class TestDataFrameQuantile: - def test_quantile(self, datetime_frame): from numpy import percentile df = datetime_frame q = df.quantile(0.1, axis=0) - assert q['A'] == percentile(df['A'], 10) + assert q["A"] == percentile(df["A"], 10) tm.assert_index_equal(q.index, df.columns) q = df.quantile(0.9, axis=1) - assert (q['2000-01-17'] == - percentile(df.loc['2000-01-17'], 90)) + assert q["2000-01-17"] == percentile(df.loc["2000-01-17"], 90) tm.assert_index_equal(q.index, df.index) # test degenerate case - q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) - assert(np.isnan(q['x']) and np.isnan(q['y'])) + q = DataFrame({"x": [], "y": []}).quantile(0.1, axis=0) + assert np.isnan(q["x"]) and np.isnan(q["y"]) # non-numeric exclusion - df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) + df = DataFrame({"col1": ["A", "A", "B", "B"], "col2": [1, 2, 3, 4]}) rs = df.quantile(0.5) xp = df.median().rename(0.5) assert_series_equal(rs, xp) # axis df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) - result = df.quantile([.5, .75], axis=1) - expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75], - 3: [3.5, 3.75]}, index=[0.5, 0.75]) + result = df.quantile([0.5, 0.75], axis=1) + expected = DataFrame( + {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]}, index=[0.5, 0.75] + ) assert_frame_equal(result, expected, check_index_type=True) # We may want to break API in the future to change this # so that we exclude non-numeric along the same axis # See GH #7312 - df = DataFrame([[1, 2, 3], - ['a', 'b', 4]]) - result = df.quantile(.5, axis=1) - expected = 
Series([3., 4.], index=[0, 1], name=0.5) + df = DataFrame([[1, 2, 3], ["a", "b", 4]]) + result = df.quantile(0.5, axis=1) + expected = Series([3.0, 4.0], index=[0, 1], name=0.5) assert_series_equal(result, expected) def test_quantile_axis_mixed(self): # mixed on axis=1 - df = DataFrame({"A": [1, 2, 3], - "B": [2., 3., 4.], - "C": pd.date_range('20130101', periods=3), - "D": ['foo', 'bar', 'baz']}) - result = df.quantile(.5, axis=1) + df = DataFrame( + { + "A": [1, 2, 3], + "B": [2.0, 3.0, 4.0], + "C": pd.date_range("20130101", periods=3), + "D": ["foo", "bar", "baz"], + } + ) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], name=0.5) assert_series_equal(result, expected) # must raise with pytest.raises(TypeError): - df.quantile(.5, axis=1, numeric_only=False) + df.quantile(0.5, axis=1, numeric_only=False) def test_quantile_axis_parameter(self): # GH 9543/9544 df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=0) + result = df.quantile(0.5, axis=0) - expected = Series([2., 3.], index=["A", "B"], name=0.5) + expected = Series([2.0, 3.0], index=["A", "B"], name=0.5) assert_series_equal(result, expected) - expected = df.quantile(.5, axis="index") + expected = df.quantile(0.5, axis="index") assert_series_equal(result, expected) - result = df.quantile(.5, axis=1) + result = df.quantile(0.5, axis=1) expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) - result = df.quantile(.5, axis="columns") + result = df.quantile(0.5, axis="columns") assert_series_equal(result, expected) - msg = ("No axis named -1 for object type" - " ") + msg = ( + "No axis named -1 for object type" " " + ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) - msg = ("No axis named column for object type" - " ") + msg = ( + "No axis named column for object type" + " " + ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") @@ -102,45 +107,53 @@ def test_quantile_interpolation(self): # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1, interpolation='nearest') + result = df.quantile(0.5, axis=1, interpolation="nearest") expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) # cross-check interpolation=nearest results in original dtype - exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, - axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64') + exp = np.percentile( + np.array([[1, 2, 3], [2, 3, 4]]), 0.5, axis=0, interpolation="nearest" + ) + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype="int64") tm.assert_series_equal(result, expected) # float - df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5) + df = DataFrame({"A": [1.0, 2.0, 3.0], "B": [2.0, 3.0, 4.0]}, index=[1, 2, 3]) + result = df.quantile(0.5, axis=1, interpolation="nearest") + expected = Series([1.0, 2.0, 3.0], index=[1, 2, 3], name=0.5) tm.assert_series_equal(result, expected) - exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, - axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64') + exp = np.percentile( + np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]]), + 0.5, + axis=0, + interpolation="nearest", + ) + expected = 
Series(exp, index=[1, 2, 3], name=0.5, dtype="float64") assert_series_equal(result, expected) # axis - result = df.quantile([.5, .75], axis=1, interpolation='lower') - expected = DataFrame({1: [1., 1.], 2: [2., 2.], - 3: [3., 3.]}, index=[0.5, 0.75]) + result = df.quantile([0.5, 0.75], axis=1, interpolation="lower") + expected = DataFrame( + {1: [1.0, 1.0], 2: [2.0, 2.0], 3: [3.0, 3.0]}, index=[0.5, 0.75] + ) assert_frame_equal(result, expected) # test degenerate case - df = DataFrame({'x': [], 'y': []}) - q = df.quantile(0.1, axis=0, interpolation='higher') - assert(np.isnan(q['x']) and np.isnan(q['y'])) + df = DataFrame({"x": [], "y": []}) + q = df.quantile(0.1, axis=0, interpolation="higher") + assert np.isnan(q["x"]) and np.isnan(q["y"]) # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5], interpolation='midpoint') + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5], interpolation="midpoint") # https://github.com/numpy/numpy/issues/7163 - expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], - index=[.25, .5], columns=['a', 'b', 'c']) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) assert_frame_equal(result, expected) def test_quantile_interpolation_datetime(self, datetime_frame): @@ -148,8 +161,8 @@ def test_quantile_interpolation_datetime(self, datetime_frame): # interpolation = linear (default case) df = datetime_frame - q = df.quantile(0.1, axis=0, interpolation='linear') - assert q['A'] == np.percentile(df['A'], 10) + q = df.quantile(0.1, axis=0, interpolation="linear") + assert q["A"] == np.percentile(df["A"], 10) def test_quantile_interpolation_int(self, int_frame): # see gh-10174 @@ -157,65 +170,73 @@ def test_quantile_interpolation_int(self, int_frame): df = int_frame # interpolation = linear (default case) q = df.quantile(0.1) - assert q['A'] == np.percentile(df['A'], 10) + assert q["A"] == np.percentile(df["A"], 10) # test with and without interpolation keyword - q1 = df.quantile(0.1, axis=0, interpolation='linear') - assert q1['A'] == np.percentile(df['A'], 10) + q1 = df.quantile(0.1, axis=0, interpolation="linear") + assert q1["A"] == np.percentile(df["A"], 10) tm.assert_series_equal(q, q1) def test_quantile_multi(self): - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5]) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=['a', 'b', 'c']) + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], columns=["a", "b", "c"]) + result = df.quantile([0.25, 0.5]) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[0.25, 0.5], + columns=["a", "b", "c"], + ) assert_frame_equal(result, expected) # axis = 1 - result = df.quantile([.25, .5], axis=1) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=[0, 1, 2]) + result = df.quantile([0.25, 0.5], axis=1) + expected = DataFrame( + [[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], index=[0.25, 0.5], columns=[0, 1, 2] + ) # empty - result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) - expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, - index=[.1, .9]) + result = DataFrame({"x": [], "y": []}).quantile([0.1, 0.9], axis=0) + expected = DataFrame( + {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9] + ) assert_frame_equal(result, expected) def test_quantile_datetime(self): - df = 
DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]}) + df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) # exclude datetime - result = df.quantile(.5) - expected = Series([2.5], index=['b']) + result = df.quantile(0.5) + expected = Series([2.5], index=["b"]) # datetime - result = df.quantile(.5, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b'], - name=0.5) + result = df.quantile(0.5, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), 2.5], index=["a", "b"], name=0.5 + ) assert_series_equal(result, expected) # datetime w/ multi - result = df.quantile([.5], numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], - index=[.5], columns=['a', 'b']) + result = df.quantile([0.5], numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"] + ) assert_frame_equal(result, expected) # axis = 1 - df['c'] = pd.to_datetime(['2011', '2012']) - result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')], - index=[0, 1], - name=0.5) + df["c"] = pd.to_datetime(["2011", "2012"]) + result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) + expected = Series( + [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], + index=[0, 1], + name=0.5, + ) assert_series_equal(result, expected) - result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')]], - index=[0.5], columns=[0, 1]) + result = df[["a", "c"]].quantile([0.5], axis=1, numeric_only=False) + expected = DataFrame( + [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], + index=[0.5], + columns=[0, 1], + ) assert_frame_equal(result, expected) # empty when numeric_only=True @@ -224,96 +245,146 @@ def test_quantile_datetime(self): # result = df[['a', 'c']].quantile([.5]) def test_quantile_invalid(self, datetime_frame): - msg = 'percentiles should all be in the interval \\[0, 1\\]' + msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): datetime_frame.quantile(invalid) def test_quantile_box(self): - df = DataFrame({'A': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'C': [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')]}) + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + } + ) res = df.quantile(0.5, numeric_only=False) - exp = pd.Series([pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days')], - name=0.5, index=['A', 'B', 'C']) + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=["A", "B", "C"], + ) tm.assert_series_equal(res, exp) res = df.quantile([0.5], 
numeric_only=False) - exp = pd.DataFrame([[pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days')]], - index=[0.5], columns=['A', 'B', 'C']) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(res, exp) # DatetimeBlock may be consolidated and contain NaT in different loc - df = DataFrame({'A': [pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - 'a': [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.NaT, - pd.Timestamp('2011-01-03')], - 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.NaT, - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.NaT, - pd.Timestamp('2011-01-03', tz='US/Eastern')], - 'C': [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days'), - pd.NaT], - 'c': [pd.NaT, - pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')]}, - columns=list('AaBbCc')) + df = DataFrame( + { + "A": [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + "a": [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.NaT, + pd.Timestamp("2011-01-03"), + ], + "B": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "b": [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.NaT, + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + "C": [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + pd.NaT, + ], + "c": [ + pd.NaT, + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ], + }, + columns=list("AaBbCc"), + ) res = df.quantile(0.5, numeric_only=False) - exp = pd.Series([pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days'), - pd.Timedelta('2 days')], - name=0.5, index=list('AaBbCc')) + exp = pd.Series( + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ], + name=0.5, + index=list("AaBbCc"), + ) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = pd.DataFrame([[pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timedelta('2 days'), - pd.Timedelta('2 days')]], - index=[0.5], columns=list('AaBbCc')) + exp = pd.DataFrame( + [ + [ + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timedelta("2 days"), + pd.Timedelta("2 days"), + ] + ], + index=[0.5], + columns=list("AaBbCc"), + ) tm.assert_frame_equal(res, exp) def test_quantile_nan(self): # GH 14357 - float block where some cols have missing values - df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df = DataFrame({"a": np.arange(1, 6.0), "b": np.arange(1, 6.0)}) df.iloc[-1, 1] = np.nan res = 
df.quantile(0.5) - exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + exp = Series([3.0, 2.5], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5, 0.75]) - exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [2.5, 3.25]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) res = df.quantile(0.5, axis=1) @@ -325,57 +396,62 @@ def test_quantile_nan(self): tm.assert_frame_equal(res, exp) # full-nan column - df['b'] = np.nan + df["b"] = np.nan res = df.quantile(0.5) - exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + exp = Series([3.0, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5, 0.75]) - exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, - index=[0.5, 0.75]) + exp = DataFrame({"a": [3.0, 4.0], "b": [np.nan, np.nan]}, index=[0.5, 0.75]) tm.assert_frame_equal(res, exp) def test_quantile_nat(self): # full NaT column - df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + df = DataFrame({"a": [pd.NaT, pd.NaT, pd.NaT]}) res = df.quantile(0.5, numeric_only=False) - exp = Series([pd.NaT], index=['a'], name=0.5) + exp = Series([pd.NaT], index=["a"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + exp = DataFrame({"a": [pd.NaT]}, index=[0.5]) tm.assert_frame_equal(res, exp) # mixed non-null / full null column - df = DataFrame({'a': [pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2012-01-03')], - 'b': [pd.NaT, pd.NaT, pd.NaT]}) + df = DataFrame( + { + "a": [ + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-03"), + ], + "b": [pd.NaT, pd.NaT, pd.NaT], + } + ) res = df.quantile(0.5, numeric_only=False) - exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], - name=0.5) + exp = Series([pd.Timestamp("2012-01-02"), pd.NaT], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) - exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], - columns=['a', 'b']) + exp = DataFrame( + [[pd.Timestamp("2012-01-02"), pd.NaT]], index=[0.5], columns=["a", "b"] + ) tm.assert_frame_equal(res, exp) def test_quantile_empty(self): # floats - df = DataFrame(columns=['a', 'b'], dtype='float64') + df = DataFrame(columns=["a", "b"], dtype="float64") res = df.quantile(0.5) - exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + exp = Series([np.nan, np.nan], index=["a", "b"], name=0.5) tm.assert_series_equal(res, exp) res = df.quantile([0.5]) - exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=["a", "b"], index=[0.5]) tm.assert_frame_equal(res, exp) # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) @@ -383,13 +459,13 @@ def test_quantile_empty(self): # res = df.quantile([0.5], axis=1) # ints - df = DataFrame(columns=['a', 'b'], dtype='int64') + df = DataFrame(columns=["a", "b"], dtype="int64") # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) # res = df.quantile(0.5) # datetimes - df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + df = DataFrame(columns=["a", "b"], dtype="datetime64[ns]") # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0ed484f678fdf..70c58471dd0d4 100644 --- 
a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -11,10 +11,13 @@ from pandas.core.computation.check import _NUMEXPR_INSTALLED from pandas.tests.frame.common import TestData from pandas.util.testing import ( - assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf) + assert_frame_equal, + assert_series_equal, + makeCustomDataframe as mkdf, +) -PARSERS = 'python', 'pandas' -ENGINES = 'python', pytest.param('numexpr', marks=td.skip_if_no_ne) +PARSERS = "python", "pandas" +ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @pytest.fixture(params=PARSERS, ids=lambda x: x) @@ -28,14 +31,13 @@ def engine(request): def skip_if_no_pandas_parser(parser): - if parser != 'pandas': + if parser != "pandas": pytest.skip("cannot evaluate with parser {0!r}".format(parser)) class TestCompat: - def setup_method(self, method): - self.df = DataFrame({'A': [1, 2, 3]}) + self.df = DataFrame({"A": [1, 2, 3]}) self.expected1 = self.df[self.df.A > 0] self.expected2 = self.df.A + 1 @@ -44,44 +46,43 @@ def test_query_default(self): # GH 12749 # this should always work, whether _NUMEXPR_INSTALLED or not df = self.df - result = df.query('A>0') + result = df.query("A>0") assert_frame_equal(result, self.expected1) - result = df.eval('A+1') + result = df.eval("A+1") assert_series_equal(result, self.expected2, check_names=False) def test_query_None(self): df = self.df - result = df.query('A>0', engine=None) + result = df.query("A>0", engine=None) assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine=None) + result = df.eval("A+1", engine=None) assert_series_equal(result, self.expected2, check_names=False) def test_query_python(self): df = self.df - result = df.query('A>0', engine='python') + result = df.query("A>0", engine="python") assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine='python') + result = df.eval("A+1", engine="python") assert_series_equal(result, self.expected2, check_names=False) def test_query_numexpr(self): df = self.df if _NUMEXPR_INSTALLED: - result = df.query('A>0', engine='numexpr') + result = df.query("A>0", engine="numexpr") assert_frame_equal(result, self.expected1) - result = df.eval('A+1', engine='numexpr') + result = df.eval("A+1", engine="numexpr") assert_series_equal(result, self.expected2, check_names=False) else: with pytest.raises(ImportError): - df.query('A>0', engine='numexpr') + df.query("A>0", engine="numexpr") with pytest.raises(ImportError): - df.eval('A+1', engine='numexpr') + df.eval("A+1", engine="numexpr") class TestDataFrameEval(TestData): - def test_ops(self): # tst ops and reversed ops in evaluation @@ -90,18 +91,20 @@ def test_ops(self): # smaller hits python, larger hits numexpr for n in [4, 4000]: - df = DataFrame(1, index=range(n), columns=list('abcd')) + df = DataFrame(1, index=range(n), columns=list("abcd")) df.iloc[0] = 2 m = df.mean() - for op_str, op, rop in [('+', '__add__', '__radd__'), - ('-', '__sub__', '__rsub__'), - ('*', '__mul__', '__rmul__'), - ('/', '__truediv__', '__rtruediv__')]: + for op_str, op, rop in [ + ("+", "__add__", "__radd__"), + ("-", "__sub__", "__rsub__"), + ("*", "__mul__", "__rmul__"), + ("/", "__truediv__", "__rtruediv__"), + ]: - base = (DataFrame(np.tile(m.values, n) # noqa - .reshape(n, -1), - columns=list('abcd'))) + base = DataFrame( + np.tile(m.values, n).reshape(n, -1), columns=list("abcd") # noqa + ) expected = eval("base{op}df".format(op=op_str)) @@ -110,25 +113,25 @@ def test_ops(self): assert_frame_equal(result, 
expected) # these are commutative - if op in ['+', '*']: + if op in ["+", "*"]: result = getattr(df, op)(m) assert_frame_equal(result, expected) # these are not - elif op in ['-', '/']: + elif op in ["-", "/"]: result = getattr(df, rop)(m) assert_frame_equal(result, expected) # GH7192 df = DataFrame(dict(A=np.random.randn(25000))) df.iloc[0:5] = np.nan - expected = (1 - np.isnan(df.iloc[0:25])) + expected = 1 - np.isnan(df.iloc[0:25]) result = (1 - np.isnan(df)).iloc[0:25] assert_frame_equal(result, expected) def test_query_non_str(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'b']}) + df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]}) msg = "expr must be a string to be evaluated" with pytest.raises(ValueError, match=msg): @@ -139,119 +142,116 @@ def test_query_non_str(self): def test_query_empty_string(self): # GH 13139 - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) msg = "expr cannot be an empty string" with pytest.raises(ValueError, match=msg): - df.query('') + df.query("") def test_eval_resolvers_as_list(self): # GH 14095 - df = DataFrame(np.random.randn(10, 2), columns=list('ab')) - dict1 = {'a': 1} - dict2 = {'b': 2} - assert (df.eval('a + b', resolvers=[dict1, dict2]) == - dict1['a'] + dict2['b']) - assert (pd.eval('a + b', resolvers=[dict1, dict2]) == - dict1['a'] + dict2['b']) + df = DataFrame(np.random.randn(10, 2), columns=list("ab")) + dict1 = {"a": 1} + dict2 = {"b": 2} + assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] class TestDataFrameQueryWithMultiIndex: - def test_query_with_named_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) - b = np.random.choice(['eggs', 'ham'], size=10) - index = MultiIndex.from_arrays([a, b], names=['color', 'food']) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) + index = MultiIndex.from_arrays([a, b], names=["color", "food"]) df = DataFrame(np.random.randn(10, 2), index=index) - ind = Series(df.index.get_level_values('color').values, index=index, - name='color') + ind = Series( + df.index.get_level_values("color").values, index=index, name="color" + ) # equality res1 = df.query('color == "red"', parser=parser, engine=engine) res2 = df.query('"red" == color', parser=parser, engine=engine) - exp = df[ind == 'red'] + exp = df[ind == "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('color != "red"', parser=parser, engine=engine) res2 = df.query('"red" != color', parser=parser, engine=engine) - exp = df[ind != 'red'] + exp = df[ind != "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('color == ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] == color', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('color != ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] != color', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["red"] in color', parser=parser, engine=engine) res2 = df.query('"red" in color', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = 
df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('["red"] not in color', parser=parser, engine=engine) res2 = df.query('"red" not in color', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) def test_query_with_unnamed_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) - b = np.random.choice(['eggs', 'ham'], size=10) + a = np.random.choice(["red", "green"], size=10) + b = np.random.choice(["eggs", "ham"], size=10) index = MultiIndex.from_arrays([a, b]) df = DataFrame(np.random.randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) - exp = df[ind == 'red'] + exp = df[ind == "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) - exp = df[ind != 'red'] + exp = df[ind != "red"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) - exp = df[ind.isin(['red'])] + exp = df[ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) - res1 = df.query('["red"] not in ilevel_0', parser=parser, - engine=engine) + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) - exp = df[~ind.isin(['red'])] + exp = df[~ind.isin(["red"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) @@ -259,60 +259,61 @@ def test_query_with_unnamed_multiindex(self, parser, engine): ind = Series(df.index.get_level_values(1).values, index=index) res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) - exp = df[ind == 'eggs'] + exp = df[ind == "eggs"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # inequality res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine) - exp = df[ind != 'eggs'] + exp = df[ind != "eggs"] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # list equality (really just set membership) res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) - exp = df[ind.isin(['eggs'])] + exp = df[ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) res2 = df.query('["eggs"] != ilevel_1', 
parser=parser, engine=engine) - exp = df[~ind.isin(['eggs'])] + exp = df[~ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) # in/not in ops res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) - exp = df[ind.isin(['eggs'])] + exp = df[ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) - res1 = df.query('["eggs"] not in ilevel_1', parser=parser, - engine=engine) + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) - exp = df[~ind.isin(['eggs'])] + exp = df[~ind.isin(["eggs"])] assert_frame_equal(res1, exp) assert_frame_equal(res2, exp) def test_query_with_partially_named_multiindex(self, parser, engine): skip_if_no_pandas_parser(parser) - a = np.random.choice(['red', 'green'], size=10) + a = np.random.choice(["red", "green"], size=10) b = np.arange(10) index = MultiIndex.from_arrays([a, b]) - index.names = [None, 'rating'] + index.names = [None, "rating"] df = DataFrame(np.random.randn(10, 2), index=index) - res = df.query('rating == 1', parser=parser, engine=engine) - ind = Series(df.index.get_level_values('rating').values, index=index, - name='rating') + res = df.query("rating == 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) exp = df[ind == 1] assert_frame_equal(res, exp) - res = df.query('rating != 1', parser=parser, engine=engine) - ind = Series(df.index.get_level_values('rating').values, index=index, - name='rating') + res = df.query("rating != 1", parser=parser, engine=engine) + ind = Series( + df.index.get_level_values("rating").values, index=index, name="rating" + ) exp = df[ind != 1] assert_frame_equal(res, exp) @@ -327,7 +328,7 @@ def test_query_with_partially_named_multiindex(self, parser, engine): assert_frame_equal(res, exp) def test_query_multiindex_get_index_resolvers(self): - df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) + df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"]) resolvers = df._get_index_resolvers() def to_series(mi, level): @@ -337,11 +338,13 @@ def to_series(mi, level): return s col_series = df.columns.to_series() - expected = {'index': df.index, - 'columns': col_series, - 'spam': to_series(df.index, 'spam'), - 'eggs': to_series(df.index, 'eggs'), - 'C0': col_series} + expected = { + "index": df.index, + "columns": col_series, + "spam": to_series(df.index, "spam"), + "eggs": to_series(df.index, "eggs"), + "C0": col_series, + } for k, v in resolvers.items(): if isinstance(v, Index): assert v.is_(expected[k]) @@ -353,11 +356,10 @@ def to_series(mi, level): @td.skip_if_no_ne class TestDataFrameQueryNumExprPandas: - @classmethod def setup_class(cls): - cls.engine = 'numexpr' - cls.parser = 'pandas' + cls.engine = "numexpr" + cls.parser = "pandas" @classmethod def teardown_class(cls): @@ -367,186 +369,190 @@ def test_date_query_with_attribute_access(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + 
df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates2'] = date_range('1/1/2013', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.set_index('dates1', inplace=True, drop=True) - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df.set_index("dates1", inplace=True, drop=True) + res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def 
test_date_index_query_with_NaT_duplicates(self): engine, parser = self.engine, self.parser n = 10 d = {} - d['dates1'] = date_range('1/1/2012', periods=n) - d['dates3'] = date_range('1/1/2014', periods=n) + d["dates1"] = date_range("1/1/2012", periods=n) + d["dates3"] = date_range("1/1/2014", periods=n) df = DataFrame(d) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.index.to_series() < '20130101') & - ('20130101' < df.dates3)] + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) + res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) + expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_non_date(self): engine, parser = self.engine, self.parser n = 10 - df = DataFrame({'dates': date_range('1/1/2012', periods=n), - 'nondate': np.arange(n)}) + df = DataFrame( + {"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)} + ) - result = df.query('dates == nondate', parser=parser, engine=engine) + result = df.query("dates == nondate", parser=parser, engine=engine) assert len(result) == 0 - result = df.query('dates != nondate', parser=parser, engine=engine) + result = df.query("dates != nondate", parser=parser, engine=engine) assert_frame_equal(result, df) - for op in ['<', '>', '<=', '>=']: + for op in ["<", ">", "<=", ">="]: with pytest.raises(TypeError): - df.query('dates %s nondate' % op, parser=parser, engine=engine) + df.query("dates %s nondate" % op, parser=parser, engine=engine) def test_query_syntax_error(self): engine, parser = self.engine, self.parser - df = DataFrame({"i": range(10), "+": range(3, 13), - "r": range(4, 14)}) + df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)}) with pytest.raises(SyntaxError): - df.query('i - +', engine=engine, parser=parser) + df.query("i - +", engine=engine, parser=parser) def test_query_scope(self): from pandas.core.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + df = DataFrame(np.random.randn(20, 2), columns=list("ab")) a, b = 1, 2 # noqa - res = df.query('a > b', engine=engine, parser=parser) + res = df.query("a > b", engine=engine, parser=parser) expected = df[df.a > df.b] assert_frame_equal(res, expected) - res = df.query('@a > b', engine=engine, parser=parser) + res = df.query("@a > b", engine=engine, parser=parser) expected = df[a > df.b] assert_frame_equal(res, expected) # no local variable c with pytest.raises(UndefinedVariableError): - df.query('@a > b > @c', engine=engine, parser=parser) + df.query("@a > b > @c", engine=engine, parser=parser) # no column named 'c' with pytest.raises(UndefinedVariableError): - df.query('@a > b > c', engine=engine, parser=parser) + df.query("@a > b > c", engine=engine, parser=parser) def test_query_doesnt_pickup_local(self): from pandas.core.computation.ops import UndefinedVariableError engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) # we don't pick up the local 'sin' with pytest.raises(UndefinedVariableError): - df.query('sin > 5', engine=engine, parser=parser) + df.query("sin > 5", engine=engine, 
parser=parser) def test_query_builtin(self): from pandas.core.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' - msg = 'Variables in expression.+' + df.index.name = "sin" + msg = "Variables in expression.+" with pytest.raises(NumExprClobberingError, match=msg): - df.query('sin > 5', engine=engine, parser=parser) + df.query("sin > 5", engine=engine, parser=parser) def test_query(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) - assert_frame_equal(df.query('a < b', engine=engine, parser=parser), - df[df.a < df.b]) - assert_frame_equal(df.query('a + b > b * c', engine=engine, - parser=parser), - df[df.a + df.b > df.b * df.c]) + assert_frame_equal( + df.query("a < b", engine=engine, parser=parser), df[df.a < df.b] + ) + assert_frame_equal( + df.query("a + b > b * c", engine=engine, parser=parser), + df[df.a + df.b > df.b * df.c], + ) def test_query_index_with_name(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=Index(range(10), name="blob"), + columns=["a", "b", "c"], + ) + res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser) expec = df[(df.index < 5) & (df.a < df.b)] assert_frame_equal(res, expec) - res = df.query('blob < b', engine=engine, parser=parser) + res = df.query("blob < b", engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec) def test_query_index_without_name(self): engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=range(10), columns=['a', 'b', 'c']) + df = DataFrame( + np.random.randint(10, size=(10, 3)), + index=range(10), + columns=["a", "b", "c"], + ) # "index" should refer to the index - res = df.query('index < b', engine=engine, parser=parser) + res = df.query("index < b", engine=engine, parser=parser) expec = df[df.index < df.b] assert_frame_equal(res, expec) # test against a scalar - res = df.query('index < 5', engine=engine, parser=parser) + res = df.query("index < 5", engine=engine, parser=parser) expec = df[df.index < 5] assert_frame_equal(res, expec) @@ -560,21 +566,20 @@ def test_nested_scope(self): df2 = DataFrame(np.random.randn(5, 3)) expected = df[(df > 0) & (df2 > 0)] - result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, - parser=parser) + result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, - parser=parser) + result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', - engine=engine, parser=parser) + result = pd.eval( + "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser + ) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(@df>0) & (@df2>0)', engine=engine, 
parser=parser) + result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser) + expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) assert_frame_equal(result, expected) def test_nested_raises_on_local_self_reference(self): @@ -584,104 +589,106 @@ def test_nested_raises_on_local_self_reference(self): # can't reference ourself b/c we're a local so @ is necessary with pytest.raises(UndefinedVariableError): - df.query('df > 0', engine=self.engine, parser=self.parser) + df.query("df > 0", engine=self.engine, parser=self.parser) def test_local_syntax(self): skip_if_no_pandas_parser(self.parser) engine, parser = self.engine, self.parser - df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij')) + df = DataFrame(np.random.randn(100, 10), columns=list("abcdefghij")) b = 1 expect = df[df.a < b] - result = df.query('a < @b', engine=engine, parser=parser) + result = df.query("a < @b", engine=engine, parser=parser) assert_frame_equal(result, expect) expect = df[df.a < df.b] - result = df.query('a < b', engine=engine, parser=parser) + result = df.query("a < b", engine=engine, parser=parser) assert_frame_equal(result, expect) def test_chained_cmp_and_in(self): skip_if_no_pandas_parser(self.parser) engine, parser = self.engine, self.parser - cols = list('abc') + cols = list("abc") df = DataFrame(np.random.randn(100, len(cols)), columns=cols) - res = df.query('a < b < c and a not in b not in c', engine=engine, - parser=parser) - ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) # noqa + res = df.query( + "a < b < c and a not in b not in c", engine=engine, parser=parser + ) + ind = ( + (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + ) # noqa expec = df[ind] assert_frame_equal(res, expec) def test_local_variable_with_in(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - a = Series(np.random.randint(3, size=15), name='a') - b = Series(np.random.randint(10, size=15), name='b') - df = DataFrame({'a': a, 'b': b}) + a = Series(np.random.randint(3, size=15), name="a") + b = Series(np.random.randint(10, size=15), name="b") + df = DataFrame({"a": a, "b": b}) expected = df.loc[(df.b - 1).isin(a)] - result = df.query('b - 1 in a', engine=engine, parser=parser) + result = df.query("b - 1 in a", engine=engine, parser=parser) assert_frame_equal(expected, result) - b = Series(np.random.randint(10, size=15), name='b') + b = Series(np.random.randint(10, size=15), name="b") expected = df.loc[(b - 1).isin(a)] - result = df.query('@b - 1 in a', engine=engine, parser=parser) + result = df.query("@b - 1 in a", engine=engine, parser=parser) assert_frame_equal(expected, result) def test_at_inside_string(self): engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) c = 1 # noqa - df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']}) + df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]}) result = df.query('a == "@c"', engine=engine, parser=parser) expected = df[df.a == "@c"] assert_frame_equal(result, expected) def test_query_undefined_local(self): from pandas.core.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser skip_if_no_pandas_parser(parser) - df = DataFrame(np.random.rand(10, 2), columns=list('ab')) + df = DataFrame(np.random.rand(10, 2), columns=list("ab")) msg = "local variable 'c' is not defined" with pytest.raises(UndefinedVariableError, match=msg): - df.query('a == @c', engine=engine, parser=parser) + df.query("a == @c", engine=engine, 
parser=parser) def test_index_resolvers_come_after_columns_with_the_same_name(self): n = 1 # noqa a = np.r_[20:101:20] - df = DataFrame({'index': a, 'b': np.random.randn(a.size)}) - df.index.name = 'index' - result = df.query('index > 5', engine=self.engine, parser=self.parser) - expected = df[df['index'] > 5] + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + df.index.name = "index" + result = df.query("index > 5", engine=self.engine, parser=self.parser) + expected = df[df["index"] > 5] assert_frame_equal(result, expected) - df = DataFrame({'index': a, - 'b': np.random.randn(a.size)}) - result = df.query('ilevel_0 > 5', engine=self.engine, - parser=self.parser) + df = DataFrame({"index": a, "b": np.random.randn(a.size)}) + result = df.query("ilevel_0 > 5", engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected) - df = DataFrame({'a': a, 'b': np.random.randn(a.size)}) - df.index.name = 'a' - result = df.query('a > 5', engine=self.engine, parser=self.parser) + df = DataFrame({"a": a, "b": np.random.randn(a.size)}) + df.index.name = "a" + result = df.query("a > 5", engine=self.engine, parser=self.parser) expected = df[df.a > 5] assert_frame_equal(result, expected) - result = df.query('index > 5', engine=self.engine, parser=self.parser) + result = df.query("index > 5", engine=self.engine, parser=self.parser) expected = df.loc[df.index[df.index > 5]] assert_frame_equal(result, expected) def test_inf(self): n = 10 - df = DataFrame({'a': np.random.rand(n), 'b': np.random.rand(n)}) + df = DataFrame({"a": np.random.rand(n), "b": np.random.rand(n)}) df.loc[::2, 0] = np.inf - ops = '==', '!=' + ops = "==", "!=" d = dict(zip(ops, (operator.eq, operator.ne))) for op, f in d.items(): - q = 'a %s inf' % op + q = "a %s inf" % op expected = df[f(df.a, np.inf)] result = df.query(q, engine=self.engine, parser=self.parser) assert_frame_equal(result, expected) @@ -689,82 +696,86 @@ def test_inf(self): @td.skip_if_no_ne class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'numexpr' - cls.parser = 'python' + cls.engine = "numexpr" + cls.parser = "python" cls.frame = TestData().frame def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(dates1 < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=5) + df["dates2"] = date_range("1/1/2013", periods=5) + df["dates3"] = date_range("1/1/2014", periods=5) + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates2'] = date_range('1/1/2013', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT - res = df.query('(dates1 < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.dates1 
< '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates2"] = date_range("1/1/2013", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT + res = df.query( + "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.set_index('dates1', inplace=True, drop=True) - res = df.query('(index < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) - res = df.query('(index < 20130101) & (20130101 < dates3)', - engine=engine, parser=parser) - expec = df[(df.index < '20130101') & ('20130101' < df.dates3)] + df.set_index("dates1", inplace=True, drop=True) + res = df.query( + "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser + ) + expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] assert_frame_equal(res, expec) def test_date_index_query_with_NaT_duplicates(self): engine, parser = self.engine, self.parser n = 10 df = DataFrame(np.random.randn(n, 3)) - df['dates1'] = date_range('1/1/2012', periods=n) - df['dates3'] = date_range('1/1/2014', periods=n) - df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT - df.set_index('dates1', inplace=True, drop=True) + df["dates1"] = date_range("1/1/2012", periods=n) + df["dates3"] = date_range("1/1/2014", periods=n) + df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT + df.set_index("dates1", inplace=True, drop=True) with pytest.raises(NotImplementedError): - df.query('index < 20130101 < dates3', engine=engine, parser=parser) + df.query("index < 20130101 < dates3", engine=engine, parser=parser) def test_nested_scope(self): from pandas.core.computation.ops import UndefinedVariableError + engine = self.engine parser = self.parser # smoke test x = 1 # noqa - result = pd.eval('x + 1', engine=engine, parser=parser) + result = pd.eval("x + 1", engine=engine, parser=parser) assert result == 2 df = DataFrame(np.random.randn(5, 3)) @@ -772,170 +783,175 @@ def test_nested_scope(self): # don't have the pandas parser with pytest.raises(SyntaxError): - df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) with pytest.raises(UndefinedVariableError): - df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + df.query("(df>0) & (df2>0)", engine=engine, parser=parser) 
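Illustrative sketch (not part of the patch): the test_nested_scope hunk above exercises how DataFrame.query resolves local variables under the two parsers. A minimal standalone example of that behaviour, assuming only numpy and pandas are installed (engine="python" avoids the numexpr dependency; the variable name `limit` is invented for the example):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(5, 3), columns=list("abc"))
limit = 0  # a local Python variable

# parser="pandas": the '@' prefix pulls local variables into the expression
res = df.query("a > @limit", engine="python", parser="pandas")
assert res.equals(df[df.a > limit])

# parser="python": the '@' prefix is not supported, so the same expression
# raises SyntaxError before any evaluation happens
try:
    df.query("a > @limit", engine="python", parser="python")
except SyntaxError:
    pass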
expected = df[(df > 0) & (df2 > 0)] - result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, - parser=parser) + result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser) assert_frame_equal(expected, result) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', - engine=engine, parser=parser) + result = pd.eval( + "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser + ) assert_frame_equal(expected, result) class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = 'python' - cls.parser = 'pandas' + cls.engine = "python" + cls.parser = "pandas" cls.frame = TestData().frame def test_query_builtin(self): engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' + df.index.name = "sin" expected = df[df.index > 5] - result = df.query('sin > 5', engine=engine, parser=parser) + result = df.query("sin > 5", engine=engine, parser=parser) assert_frame_equal(expected, result) class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): - @classmethod def setup_class(cls): super().setup_class() - cls.engine = cls.parser = 'python' + cls.engine = cls.parser = "python" cls.frame = TestData().frame def test_query_builtin(self): engine, parser = self.engine, self.parser n = m = 10 - df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc")) - df.index.name = 'sin' + df.index.name = "sin" expected = df[df.index > 5] - result = df.query('sin > 5', engine=engine, parser=parser) + result = df.query("sin > 5", engine=engine, parser=parser) assert_frame_equal(expected, result) class TestDataFrameQueryStrings: - def test_str_query_method(self, parser, engine): - df = DataFrame(np.random.randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - expect = df[df.strings == 'a'] + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings == "a"] - if parser != 'pandas': - col = 'strings' + if parser != "pandas": + col = "strings" lst = '"a"' lhs = [col] * 2 + [lst] * 2 rhs = lhs[::-1] - eq, ne = '==', '!=' + eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) for lhs, op, rhs in zip(lhs, ops, rhs): - ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) msg = r"'(Not)?In' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): - df.query(ex, engine=engine, parser=parser, - local_dict={'strings': df.strings}) + df.query( + ex, + engine=engine, + parser=parser, + local_dict={"strings": df.strings}, + ) else: res = df.query('"a" == strings', engine=engine, parser=parser) assert_frame_equal(res, expect) res = df.query('strings == "a"', engine=engine, parser=parser) assert_frame_equal(res, expect) - assert_frame_equal(res, df[df.strings.isin(['a'])]) + assert_frame_equal(res, df[df.strings.isin(["a"])]) - expect = df[df.strings != 'a'] + expect = df[df.strings != "a"] res = df.query('strings != "a"', engine=engine, parser=parser) assert_frame_equal(res, expect) res = df.query('"a" != strings', engine=engine, parser=parser) assert_frame_equal(res, expect) - assert_frame_equal(res, df[~df.strings.isin(['a'])]) + 
assert_frame_equal(res, df[~df.strings.isin(["a"])]) def test_str_list_query_method(self, parser, engine): - df = DataFrame(np.random.randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - expect = df[df.strings.isin(['a', 'b'])] + df = DataFrame(np.random.randn(10, 1), columns=["b"]) + df["strings"] = Series(list("aabbccddee")) + expect = df[df.strings.isin(["a", "b"])] - if parser != 'pandas': - col = 'strings' + if parser != "pandas": + col = "strings" lst = '["a", "b"]' lhs = [col] * 2 + [lst] * 2 rhs = lhs[::-1] - eq, ne = '==', '!=' + eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) for lhs, op, rhs in zip(lhs, ops, rhs): - ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs) with pytest.raises(NotImplementedError): df.query(ex, engine=engine, parser=parser) else: - res = df.query('strings == ["a", "b"]', engine=engine, - parser=parser) + res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) assert_frame_equal(res, expect) - res = df.query('["a", "b"] == strings', engine=engine, - parser=parser) + res = df.query('["a", "b"] == strings', engine=engine, parser=parser) assert_frame_equal(res, expect) - expect = df[~df.strings.isin(['a', 'b'])] + expect = df[~df.strings.isin(["a", "b"])] - res = df.query('strings != ["a", "b"]', engine=engine, - parser=parser) + res = df.query('strings != ["a", "b"]', engine=engine, parser=parser) assert_frame_equal(res, expect) - res = df.query('["a", "b"] != strings', engine=engine, - parser=parser) + res = df.query('["a", "b"] != strings', engine=engine, parser=parser) assert_frame_equal(res, expect) def test_query_with_string_columns(self, parser, engine): - df = DataFrame({'a': list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - if parser == 'pandas': - res = df.query('a in b', parser=parser, engine=engine) + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + if parser == "pandas": + res = df.query("a in b", parser=parser, engine=engine) expec = df[df.a.isin(df.b)] assert_frame_equal(res, expec) - res = df.query('a in b and c < d', parser=parser, engine=engine) + res = df.query("a in b and c < d", parser=parser, engine=engine) expec = df[df.a.isin(df.b) & (df.c < df.d)] assert_frame_equal(res, expec) else: with pytest.raises(NotImplementedError): - df.query('a in b', parser=parser, engine=engine) + df.query("a in b", parser=parser, engine=engine) with pytest.raises(NotImplementedError): - df.query('a in b and c < d', parser=parser, engine=engine) + df.query("a in b and c < d", parser=parser, engine=engine) def test_object_array_eq_ne(self, parser, engine): - df = DataFrame({'a': list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - res = df.query('a == b', parser=parser, engine=engine) + df = DataFrame( + { + "a": list("aaaabbbbcccc"), + "b": list("aabbccddeeff"), + "c": np.random.randint(5, size=12), + "d": np.random.randint(9, size=12), + } + ) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] assert_frame_equal(res, exp) - res = df.query('a != b', parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] assert_frame_equal(res, exp) @@ -955,126 +971,131 @@ def test_query_with_nested_strings(self, parser, engine): 
6 "page 3 load" 2/1/2014 1:02:01 6 "page 3 exit" 2/1/2014 1:02:31 """ - df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python', - parse_dates=['timestamp']) + df = pd.read_csv( + StringIO(raw), sep=r"\s{2,}", engine="python", parse_dates=["timestamp"] + ) expected = df[df.event == '"page 1 load"'] - res = df.query("""'"page 1 load"' in event""", parser=parser, - engine=engine) + res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine) assert_frame_equal(expected, res) def test_query_with_nested_special_character(self, parser, engine): skip_if_no_pandas_parser(parser) - df = DataFrame({'a': ['a', 'b', 'test & test'], - 'b': [1, 2, 3]}) + df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]}) res = df.query('a == "test & test"', parser=parser, engine=engine) - expec = df[df.a == 'test & test'] + expec = df[df.a == "test & test"] assert_frame_equal(res, expec) def test_query_lex_compare_strings(self, parser, engine): import operator as opr - a = Series(np.random.choice(list('abcde'), 20)) + a = Series(np.random.choice(list("abcde"), 20)) b = Series(np.arange(a.size)) - df = DataFrame({'X': a, 'Y': b}) + df = DataFrame({"X": a, "Y": b}) - ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge} + ops = {"<": opr.lt, ">": opr.gt, "<=": opr.le, ">=": opr.ge} for op, func in ops.items(): res = df.query('X %s "d"' % op, engine=engine, parser=parser) - expected = df[func(df.X, 'd')] + expected = df[func(df.X, "d")] assert_frame_equal(res, expected) def test_query_single_element_booleans(self, parser, engine): - columns = 'bid', 'bidsize', 'ask', 'asksize' + columns = "bid", "bidsize", "ask", "asksize" data = np.random.randint(2, size=(1, len(columns))).astype(bool) df = DataFrame(data, columns=columns) - res = df.query('bid & ask', engine=engine, parser=parser) + res = df.query("bid & ask", engine=engine, parser=parser) expected = df[df.bid & df.ask] assert_frame_equal(res, expected) def test_query_string_scalar_variable(self, parser, engine): skip_if_no_pandas_parser(parser) - df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'], - 'Price': [109.70, 109.72, 183.30, 183.35]}) - e = df[df.Symbol == 'BUD US'] - symb = 'BUD US' # noqa - r = df.query('Symbol == @symb', parser=parser, engine=engine) + df = pd.DataFrame( + { + "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"], + "Price": [109.70, 109.72, 183.30, 183.35], + } + ) + e = df[df.Symbol == "BUD US"] + symb = "BUD US" # noqa + r = df.query("Symbol == @symb", parser=parser, engine=engine) assert_frame_equal(e, r) class TestDataFrameEvalWithFrame: - def setup_method(self, method): - self.frame = DataFrame(np.random.randn(10, 3), columns=list('abc')) + self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc")) def teardown_method(self, method): del self.frame def test_simple_expr(self, parser, engine): - res = self.frame.eval('a + b', engine=engine, parser=parser) + res = self.frame.eval("a + b", engine=engine, parser=parser) expect = self.frame.a + self.frame.b assert_series_equal(res, expect) def test_bool_arith_expr(self, parser, engine): - res = self.frame.eval('a[a < 1] + b', engine=engine, parser=parser) + res = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser) expect = self.frame.a[self.frame.a < 1] + self.frame.b assert_series_equal(res, expect) - @pytest.mark.parametrize('op', ['+', '-', '*', '/']) + @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): - df = DataFrame({'a': [1, 2], 'b': 
['c', 'd']}) + df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" with pytest.raises(TypeError, match=msg): - df.eval('a {0} b'.format(op), engine=engine, parser=parser) + df.eval("a {0} b".format(op), engine=engine, parser=parser) class TestDataFrameQueryBacktickQuoting: - - @pytest.fixture(scope='class') + @pytest.fixture(scope="class") def df(self): - yield DataFrame({'A': [1, 2, 3], - 'B B': [3, 2, 1], - 'C C': [4, 5, 6], - 'C_C': [8, 9, 10], - 'D_D D': [11, 1, 101]}) + yield DataFrame( + { + "A": [1, 2, 3], + "B B": [3, 2, 1], + "C C": [4, 5, 6], + "C_C": [8, 9, 10], + "D_D D": [11, 1, 101], + } + ) def test_single_backtick_variable_query(self, df): - res = df.query('1 < `B B`') - expect = df[1 < df['B B']] + res = df.query("1 < `B B`") + expect = df[1 < df["B B"]] assert_frame_equal(res, expect) def test_two_backtick_variables_query(self, df): - res = df.query('1 < `B B` and 4 < `C C`') - expect = df[(1 < df['B B']) & (4 < df['C C'])] + res = df.query("1 < `B B` and 4 < `C C`") + expect = df[(1 < df["B B"]) & (4 < df["C C"])] assert_frame_equal(res, expect) def test_single_backtick_variable_expr(self, df): - res = df.eval('A + `B B`') - expect = df['A'] + df['B B'] + res = df.eval("A + `B B`") + expect = df["A"] + df["B B"] assert_series_equal(res, expect) def test_two_backtick_variables_expr(self, df): - res = df.eval('`B B` + `C C`') - expect = df['B B'] + df['C C'] + res = df.eval("`B B` + `C C`") + expect = df["B B"] + df["C C"] assert_series_equal(res, expect) def test_already_underscore_variable(self, df): - res = df.eval('`C_C` + A') - expect = df['C_C'] + df['A'] + res = df.eval("`C_C` + A") + expect = df["C_C"] + df["A"] assert_series_equal(res, expect) def test_same_name_but_underscores(self, df): - res = df.eval('C_C + `C C`') - expect = df['C_C'] + df['C C'] + res = df.eval("C_C + `C C`") + expect = df["C_C"] + df["C C"] assert_series_equal(res, expect) def test_mixed_underscores_and_spaces(self, df): - res = df.eval('A + `D_D D`') - expect = df['A'] + df['D_D D'] + res = df.eval("A + `D_D D`") + expect = df["A"] + df["D_D D"] assert_series_equal(res, expect) def backtick_quote_name_with_no_spaces(self, df): - res = df.eval('A + `C_C`') - expect = df['A'] + df['C_C'] + res = df.eval("A + `C_C`") + expect = df["A"] + df["C_C"] assert_series_equal(res, expect) diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index c93defe7c64a6..fd9c53c7d9f5b 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -10,18 +10,17 @@ class TestRank: s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) - df = DataFrame({'A': s, 'B': s}) + df = DataFrame({"A": s, "B": s}) results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, np.nan, - 3.5, 1.5, 8.0, np.nan, 5.5]), - 'min': np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), - 'max': np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), - 'first': np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), - 'dense': np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), + "average": np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5]), + "min": np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5]), + "max": np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6]), + "first": np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6]), + "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } - @pytest.fixture(params=['average', 'min', 'max', 'first', 'dense']) + @pytest.fixture(params=["average", "min", "max", "first", "dense"]) def 
method(self, request): """ Fixture for trying all rank methods @@ -29,12 +28,12 @@ def method(self, request): return request.param def test_rank(self, float_frame): - rankdata = pytest.importorskip('scipy.stats.rankdata') + rankdata = pytest.importorskip("scipy.stats.rankdata") - float_frame['A'][::2] = np.nan - float_frame['B'][::3] = np.nan - float_frame['C'][::4] = np.nan - float_frame['D'][::5] = np.nan + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan ranks0 = float_frame.rank() ranks1 = float_frame.rank(1) @@ -73,7 +72,7 @@ def test_rank2(self): result = df.rank(0, pct=True) tm.assert_frame_equal(result, expected) - df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) + df = DataFrame([["b", "c", "a"], ["a", "c", "b"]]) expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) result = df.rank(1, numeric_only=False) tm.assert_frame_equal(result, expected) @@ -82,7 +81,7 @@ def test_rank2(self): result = df.rank(0, numeric_only=False) tm.assert_frame_equal(result, expected) - df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) + df = DataFrame([["b", np.nan, "a"], ["a", "c", "b"]]) expected = DataFrame([[2.0, np.nan, 1.0], [1.0, 3.0, 2.0]]) result = df.rank(1, numeric_only=False) tm.assert_frame_equal(result, expected) @@ -92,46 +91,44 @@ def test_rank2(self): tm.assert_frame_equal(result, expected) # f7u12, this does not work without extensive workaround - data = [[datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], - [datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 1)]] + data = [ + [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)], + ] df = DataFrame(data) # check the rank - expected = DataFrame([[2., np.nan, 1.], - [2., 3., 1.]]) + expected = DataFrame([[2.0, np.nan, 1.0], [2.0, 3.0, 1.0]]) result = df.rank(1, numeric_only=False, ascending=True) tm.assert_frame_equal(result, expected) - expected = DataFrame([[1., np.nan, 2.], - [2., 1., 3.]]) + expected = DataFrame([[1.0, np.nan, 2.0], [2.0, 1.0, 3.0]]) result = df.rank(1, numeric_only=False, ascending=False) tm.assert_frame_equal(result, expected) - df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, - 1e60, 1e80, 1e-30]}) - exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}) + df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) + exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) tm.assert_frame_equal(df.rank(), exp) def test_rank_mixed_frame(self, float_string_frame): - float_string_frame['datetime'] = datetime.now() - float_string_frame['timedelta'] = timedelta(days=1, seconds=1) + float_string_frame["datetime"] = datetime.now() + float_string_frame["timedelta"] = timedelta(days=1, seconds=1) result = float_string_frame.rank(1) expected = float_string_frame.rank(1, numeric_only=True) tm.assert_frame_equal(result, expected) def test_rank_na_option(self, float_frame): - rankdata = pytest.importorskip('scipy.stats.rankdata') + rankdata = pytest.importorskip("scipy.stats.rankdata") - float_frame['A'][::2] = np.nan - float_frame['B'][::3] = np.nan - float_frame['C'][::4] = np.nan - float_frame['D'][::5] = np.nan + float_frame["A"][::2] = np.nan + float_frame["B"][::3] = np.nan + float_frame["C"][::4] = np.nan + float_frame["D"][::5] = np.nan # bottom - ranks0 = float_frame.rank(na_option='bottom') - ranks1 = float_frame.rank(1, na_option='bottom') + ranks0 = float_frame.rank(na_option="bottom") + ranks1 = float_frame.rank(1, 
na_option="bottom") fvals = float_frame.fillna(np.inf).values @@ -142,8 +139,8 @@ def test_rank_na_option(self, float_frame): tm.assert_almost_equal(ranks1.values, exp1) # top - ranks0 = float_frame.rank(na_option='top') - ranks1 = float_frame.rank(1, na_option='top') + ranks0 = float_frame.rank(na_option="top") + ranks1 = float_frame.rank(1, na_option="top") fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values fval1 = float_frame.T @@ -159,8 +156,8 @@ def test_rank_na_option(self, float_frame): # descending # bottom - ranks0 = float_frame.rank(na_option='top', ascending=False) - ranks1 = float_frame.rank(1, na_option='top', ascending=False) + ranks0 = float_frame.rank(na_option="top", ascending=False) + ranks1 = float_frame.rank(1, na_option="top", ascending=False) fvals = float_frame.fillna(np.inf).values @@ -173,8 +170,8 @@ def test_rank_na_option(self, float_frame): # descending # top - ranks0 = float_frame.rank(na_option='bottom', ascending=False) - ranks1 = float_frame.rank(1, na_option='bottom', ascending=False) + ranks0 = float_frame.rank(na_option="bottom", ascending=False) + ranks1 = float_frame.rank(1, na_option="bottom", ascending=False) fval0 = float_frame.fillna((float_frame.min() - 1).to_dict()).values fval1 = float_frame.T @@ -191,7 +188,7 @@ def test_rank_na_option(self, float_frame): msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - float_frame.rank(na_option='bad', ascending=False) + float_frame.rank(na_option="bad", ascending=False) # invalid type with pytest.raises(ValueError, match=msg): @@ -200,35 +197,34 @@ def test_rank_na_option(self, float_frame): def test_rank_axis(self): # check if using axes' names gives the same result df = DataFrame([[2, 1], [4, 3]]) - tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index')) - tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) def test_rank_methods_frame(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") xs = np.random.randint(0, 21, (100, 26)) xs = (xs - 10.0) / 10.0 - cols = [chr(ord('z') - i) for i in range(xs.shape[1])] + cols = [chr(ord("z") - i) for i in range(xs.shape[1])] for vals in [xs, xs + 1e6, xs * 1e-6]: df = DataFrame(vals, columns=cols) for ax in [0, 1]: - for m in ['average', 'min', 'max', 'first', 'dense']: + for m in ["average", "min", "max", "first", "dense"]: result = df.rank(axis=ax, method=m) sprank = np.apply_along_axis( - rankdata, ax, vals, - m if m != 'first' else 'ordinal') + rankdata, ax, vals, m if m != "first" else "ordinal" + ) sprank = sprank.astype(np.float64) - expected = DataFrame(sprank, - columns=cols).astype('float64') + expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) + @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) def test_rank_descending(self, method, dtype): - if 'i' in dtype: + if "i" in dtype: df = self.df.dropna() else: df = self.df.astype(dtype) @@ -237,27 +233,25 @@ def test_rank_descending(self, method, dtype): expected = (df.max() - df).rank() assert_frame_equal(res, expected) - if method == 'first' and dtype == 'O': + if method == "first" and dtype == "O": return expected = (df.max() - 
df).rank(method=method) - if dtype != 'O': - res2 = df.rank(method=method, ascending=False, - numeric_only=True) + if dtype != "O": + res2 = df.rank(method=method, ascending=False, numeric_only=True) assert_frame_equal(res2, expected) - res3 = df.rank(method=method, ascending=False, - numeric_only=False) + res3 = df.rank(method=method, ascending=False, numeric_only=False) assert_frame_equal(res3, expected) - @pytest.mark.parametrize('axis', [0, 1]) - @pytest.mark.parametrize('dtype', [None, object]) + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("dtype", [None, object]) def test_rank_2d_tie_methods(self, method, axis, dtype): df = self.df - def _check2d(df, expected, method='average', axis=0): - exp_df = DataFrame({'A': expected, 'B': expected}) + def _check2d(df, expected, method="average", axis=0): + exp_df = DataFrame({"A": expected, "B": expected}) if axis == 1: df = df.T @@ -266,33 +260,42 @@ def _check2d(df, expected, method='average', axis=0): result = df.rank(method=method, axis=axis) assert_frame_equal(result, exp_df) - disabled = {(object, 'first')} + disabled = {(object, "first")} if (dtype, method) in disabled: return frame = df if dtype is None else df.astype(dtype) _check2d(frame, self.results[method], method=method, axis=axis) @pytest.mark.parametrize( - "method,exp", [("dense", - [[1., 1., 1.], - [1., 0.5, 2. / 3], - [1., 0.5, 1. / 3]]), - ("min", - [[1. / 3, 1., 1.], - [1. / 3, 1. / 3, 2. / 3], - [1. / 3, 1. / 3, 1. / 3]]), - ("max", - [[1., 1., 1.], - [1., 2. / 3, 2. / 3], - [1., 2. / 3, 1. / 3]]), - ("average", - [[2. / 3, 1., 1.], - [2. / 3, 0.5, 2. / 3], - [2. / 3, 0.5, 1. / 3]]), - ("first", - [[1. / 3, 1., 1.], - [2. / 3, 1. / 3, 2. / 3], - [3. / 3, 2. / 3, 1. / 3]])]) + "method,exp", + [ + ("dense", [[1.0, 1.0, 1.0], [1.0, 0.5, 2.0 / 3], [1.0, 0.5, 1.0 / 3]]), + ( + "min", + [ + [1.0 / 3, 1.0, 1.0], + [1.0 / 3, 1.0 / 3, 2.0 / 3], + [1.0 / 3, 1.0 / 3, 1.0 / 3], + ], + ), + ( + "max", + [[1.0, 1.0, 1.0], [1.0, 2.0 / 3, 2.0 / 3], [1.0, 2.0 / 3, 1.0 / 3]], + ), + ( + "average", + [[2.0 / 3, 1.0, 1.0], [2.0 / 3, 0.5, 2.0 / 3], [2.0 / 3, 0.5, 1.0 / 3]], + ), + ( + "first", + [ + [1.0 / 3, 1.0, 1.0], + [2.0 / 3, 1.0 / 3, 2.0 / 3], + [3.0 / 3, 2.0 / 3, 1.0 / 3], + ], + ), + ], + ) def test_rank_pct_true(self, method, exp): # see gh-15630. 
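Illustrative sketch (not part of the patch): the rank hunks above validate DataFrame.rank against scipy.stats.rankdata, mapping pandas' "first" tie-breaking method to scipy's "ordinal". A minimal standalone version of that comparison, assuming scipy is installed:

import numpy as np
import pandas as pd
from scipy.stats import rankdata

df = pd.DataFrame(np.random.randn(6, 3), columns=list("abc"))

# Rank down each column; "first" breaks ties by order of appearance,
# which is what scipy calls the "ordinal" method.
result = df.rank(axis=0, method="first")
expected = pd.DataFrame(
    np.apply_along_axis(rankdata, 0, df.values, "ordinal"),
    index=df.index,
    columns=df.columns,
).astype("float64")

pd.testing.assert_frame_equal(result, expected)

With pct=True, df.rank divides these ranks by the number of non-missing values along the ranking axis, which is what the parametrized pct expectations in the hunk encode.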
@@ -306,7 +309,8 @@ def test_rank_pct_true(self, method, exp): @pytest.mark.high_memory def test_pct_max_many_rows(self): # GH 18271 - df = DataFrame({'A': np.arange(2**24 + 1), - 'B': np.arange(2**24 + 1, 0, -1)}) + df = DataFrame( + {"A": np.arange(2 ** 24 + 1), "B": np.arange(2 ** 24 + 1, 0, -1)} + ) result = df.rank(pct=True).max() assert (result == 1).all() diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2513508822fec..2862615ef8585 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -14,24 +14,18 @@ @pytest.fixture def mix_ab() -> Dict[str, list]: - return {'a': list(range(4)), - 'b': list('ab..'), - } + return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture def mix_abc() -> Dict[str, list]: - return {'a': list(range(4)), - 'b': list('ab..'), - 'c': ['a', 'b', np.nan, 'd'], - } + return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} class TestDataFrameReplace(TestData): - def test_replace_inplace(self): - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = np.nan + self.tsframe["A"][:5] = np.nan + self.tsframe["A"][-5:] = np.nan tsframe = self.tsframe.copy() tsframe.replace(np.nan, 0, inplace=True) @@ -39,8 +33,8 @@ def test_replace_inplace(self): # mixed type mf = self.mixed_frame - mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan - mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = self.mixed_frame.replace(np.nan, 0) expected = self.mixed_frame.fillna(value=0) @@ -51,72 +45,72 @@ def test_replace_inplace(self): assert_frame_equal(tsframe, self.tsframe.fillna(0)) def test_regex_replace_scalar(self, mix_ab): - obj = {'a': list('ab..'), 'b': list('efgh')} + obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) # simplest cases # regex -> value # obj frame - res = dfobj.replace(r'\s*\.\s*', np.nan, regex=True) - assert_frame_equal(dfobj, res.fillna('.')) + res = dfobj.replace(r"\s*\.\s*", np.nan, regex=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed - res = dfmix.replace(r'\s*\.\s*', np.nan, regex=True) - assert_frame_equal(dfmix, res.fillna('.')) + res = dfmix.replace(r"\s*\.\s*", np.nan, regex=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame - res = dfobj.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + res = dfobj.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed - res = dfmix.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True) + res = dfmix.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well - res = dfobj.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True) - assert_frame_equal(dfobj, res.fillna('.')) + res = dfobj.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed - res = dfmix.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True) - assert_frame_equal(dfmix, res.fillna('.')) + res = dfmix.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame - res = dfobj.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + res = 
dfobj.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed - res = dfmix.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1') + res = dfmix.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) - res = dfmix.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1') + res = dfmix.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) - res = dfmix.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1') + res = dfmix.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1") mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) def test_regex_replace_scalar_inplace(self, mix_ab): - obj = {'a': list('ab..'), 'b': list('efgh')} + obj = {"a": list("ab.."), "b": list("efgh")} dfobj = DataFrame(obj) dfmix = DataFrame(mix_ab) @@ -124,209 +118,229 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # regex -> value # obj frame res = dfobj.copy() - res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(r'\s*\.\s*', np.nan, regex=True, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(r'\s*(\.)\s*', r'\1\1\1', regex=True, inplace=True) + res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() - res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(re.compile(r'\s*\.\s*'), np.nan, regex=True, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, - inplace=True) + res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(re.compile(r'\s*(\.)\s*'), r'\1\1\1', regex=True, - inplace=True) + res.replace(re.compile(r"\s*(\.)\s*"), 
r"\1\1\1", regex=True, inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) res = dfobj.copy() - res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=r'\s*\.\s*', value=np.nan, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(regex=r'\s*(\.)\s*', value=r'\1\1\1', inplace=True) + res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) # everything with compiled regexs as well res = dfobj.copy() - res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True) - assert_frame_equal(dfobj, res.fillna('.')) + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=re.compile(r'\s*\.\s*'), value=np.nan, inplace=True) - assert_frame_equal(dfmix, res.fillna('.')) + res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', - inplace=True) + res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) objc = obj.copy() - objc['a'] = ['a', 'b', '...', '...'] + objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) assert_frame_equal(res, expec) # with mixed res = dfmix.copy() - res.replace(regex=re.compile(r'\s*(\.)\s*'), value=r'\1\1\1', - inplace=True) + res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) mixc = mix_ab.copy() - mixc['b'] = ['a', 'b', '...', '...'] + mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) assert_frame_equal(res, expec) def test_regex_replace_list_obj(self): - obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'e|f|g'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] res = dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], - 'b': ['crap'] * 3 + ['h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] res = 
dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['e_crap', 'f_crap', 'g_crap', 'h'], - 'c': ['h', 'e_crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.replace(value=values, regex=to_replace_res) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) def test_regex_replace_list_obj_inplace(self): # same as above with inplace=True # lists of regexes and values - obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')} + obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")} dfobj = DataFrame(obj) # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'e|f|g'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"e|f|g"] + values = [np.nan, "crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', np.nan, np.nan], - 'b': ['crap'] * 3 + ['h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", np.nan, np.nan], + "b": ["crap"] * 3 + ["h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(e|f|g)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] + values = [r"\1\1", r"\1_crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['e_crap', 'f_crap', 'g_crap', 'h'], - 'c': ['h', 'e_crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["e_crap", "f_crap", "g_crap", "h"], + "c": ["h", "e_crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = [r"\1\1", r"crap"] res = dfobj.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'e'] - values = [r'\1\1', r'crap'] + to_replace_res = [r"\s*(\.)\s*", r"e"] + values = 
[r"\1\1", r"crap"] res = dfobj.copy() res.replace(value=values, regex=to_replace_res, inplace=True) - expec = DataFrame({'a': ['a', 'b', '..', '..'], - 'b': ['crap', 'f', 'g', 'h'], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": ["a", "b", "..", ".."], + "b": ["crap", "f", "g", "h"], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) def test_regex_replace_list_mixed(self, mix_ab): @@ -335,42 +349,39 @@ def test_regex_replace_list_mixed(self, mix_ab): # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'a'] - values = [np.nan, 'crap'] - mix2 = {'a': list(range(4)), 'b': list('ab..'), 'c': list('halo')} + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] + mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")} dfmix2 = DataFrame(mix2) res = dfmix2.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix2['a'], - 'b': ['crap', 'b', np.nan, np.nan], - 'c': ['h', 'crap', 'l', 'o'], - }) + expec = DataFrame( + { + "a": mix2["a"], + "b": ["crap", "b", np.nan, np.nan], + "c": ["h", "crap", "l", "o"], + } + ) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['a_crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(to_replace_res, values, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.replace(regex=to_replace_res, value=values) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) def test_regex_replace_list_mixed_inplace(self, mix_ab): @@ -378,43 +389,35 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab): # the same inplace # lists of regexes and values # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN] - to_replace_res = [r'\s*\.\s*', r'a'] - values = [np.nan, 'crap'] + to_replace_res = [r"\s*\.\s*", r"a"] + values = [np.nan, "crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b', np.nan, np.nan], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [re1, re2, .., reN] - to_replace_res = [r'\s*(\.)\s*', r'(a|b)'] - values = [r'\1\1', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] + values = [r"\1\1", r"\1_crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = 
DataFrame({'a': mix_ab['a'], - 'b': ['a_crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN # or vN)] - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() res.replace(to_replace_res, values, inplace=True, regex=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) - to_replace_res = [r'\s*(\.)\s*', r'a', r'(b)'] - values = [r'\1\1', r'crap', r'\1_crap'] + to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] + values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() res.replace(regex=to_replace_res, value=values, inplace=True) - expec = DataFrame({'a': mix_ab['a'], - 'b': ['crap', 'b_crap', '..', '..'], - }) + expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) assert_frame_equal(res, expec) def test_regex_replace_dict_mixed(self, mix_abc): @@ -426,76 +429,67 @@ def test_regex_replace_dict_mixed(self, mix_abc): # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole # frame - res = dfmix.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) + res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, - inplace=True, regex=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the # whole frame - res = dfmix.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) res2 = dfmix.copy() - res2.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, inplace=True, - regex=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', '.ty', '.ty'], - 'c': mix_abc['c'], - }) + res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) - res = dfmix.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}) + res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) res2 = dfmix.copy() - res2.replace(regex={'b': r'\s*(\.)\s*'}, value={'b': r'\1ty'}, - inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', '.ty', '.ty'], - 'c': mix_abc['c'], - }) + res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) # scalar -> dict # to_replace regex, {value: value} - expec = DataFrame({'a': mix_abc['a'], - 'b': [np.nan, 'b', '.', '.'], - 'c': mix_abc['c'], - }) - res = dfmix.replace('a', {'b': np.nan}, regex=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) + res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = 
dfmix.copy() - res2.replace('a', {'b': np.nan}, regex=True, inplace=True) + res2.replace("a", {"b": np.nan}, regex=True, inplace=True) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) - res = dfmix.replace('a', {'b': np.nan}, regex=True) + res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace(regex='a', value={'b': np.nan}, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': [np.nan, 'b', '.', '.'], - 'c': mix_abc['c'], - }) + res2.replace(regex="a", value={"b": np.nan}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) def test_regex_replace_dict_nested(self, mix_abc): # nested dicts will not work until this is implemented for Series dfmix = DataFrame(mix_abc) - res = dfmix.replace({'b': {r'\s*\.\s*': np.nan}}, regex=True) + res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) res2 = dfmix.copy() res4 = dfmix.copy() - res2.replace({'b': {r'\s*\.\s*': np.nan}}, inplace=True, regex=True) - res3 = dfmix.replace(regex={'b': {r'\s*\.\s*': np.nan}}) - res4.replace(regex={'b': {r'\s*\.\s*': np.nan}}, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True) + res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) + res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) @@ -503,28 +497,31 @@ def test_regex_replace_dict_nested(self, mix_abc): def test_regex_replace_dict_nested_non_first_character(self): # GH 25259 - df = pd.DataFrame({'first': ['abc', 'bca', 'cab']}) - expected = pd.DataFrame({'first': ['.bc', 'bc.', 'c.b']}) - result = df.replace({'a': '.'}, regex=True) + df = pd.DataFrame({"first": ["abc", "bca", "cab"]}) + expected = pd.DataFrame({"first": [".bc", "bc.", "c.b"]}) + result = df.replace({"a": "."}, regex=True) assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): - df = pd.DataFrame({'Type': ['Q', 'T', 'Q', 'Q', 'T'], 'tmp': 2}) - expected = DataFrame({'Type': [0, 1, 0, 0, 1], 'tmp': 2}) - result = df.replace({'Type': {'Q': 0, 'T': 1}}) + df = pd.DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) + expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) + result = df.replace({"Type": {"Q": 0, "T": 1}}) assert_frame_equal(result, expected) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) - expec = DataFrame({'a': mix_abc['a'], - 'b': np.array([np.nan] * 4), - 'c': [np.nan, np.nan, np.nan, 'd'], - }) - res = df.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True) + expec = DataFrame( + { + "a": mix_abc["a"], + "b": np.array([np.nan] * 4), + "c": [np.nan, np.nan, np.nan, "d"], + } + ) + res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - res2.replace([r'\s*\.\s*', 'a|b'], np.nan, regex=True, inplace=True) - res3.replace(regex=[r'\s*\.\s*', 'a|b'], value=np.nan, inplace=True) + res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True) + res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) @@ -532,122 +529,122 @@ def 
test_regex_replace_list_to_scalar(self, mix_abc): def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? df = DataFrame(mix_abc) - res = df.replace(r'\s*\.\s*', 0, regex=True) + res = df.replace(r"\s*\.\s*", 0, regex=True) res2 = df.copy() - res2.replace(r'\s*\.\s*', 0, inplace=True, regex=True) + res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) res3 = df.copy() - res3.replace(regex=r'\s*\.\s*', value=0, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', 0, 0], - 'c': mix_abc['c'], - }) + res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) - res = df.replace([r'\s*\.\s*', 'b'], 0, regex=True) + res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) res2 = df.copy() - res2.replace([r'\s*\.\s*', 'b'], 0, regex=True, inplace=True) + res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) res3 = df.copy() - res3.replace(regex=[r'\s*\.\s*', 'b'], value=0, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 0, 0, 0], - 'c': ['a', 0, np.nan, 'd'], - }) + res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_series_of_regexes(self, mix_abc): df = DataFrame(mix_abc) - s1 = Series({'b': r'\s*\.\s*'}) - s2 = Series({'b': np.nan}) + s1 = Series({"b": r"\s*\.\s*"}) + s2 = Series({"b": np.nan}) res = df.replace(s1, s2, regex=True) res2 = df.copy() res2.replace(s1, s2, inplace=True, regex=True) res3 = df.copy() res3.replace(regex=s1, value=s2, inplace=True) - expec = DataFrame({'a': mix_abc['a'], - 'b': ['a', 'b', np.nan, np.nan], - 'c': mix_abc['c'], - }) + expec = DataFrame( + {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} + ) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) assert_frame_equal(res3, expec) def test_regex_replace_numeric_to_object_conversion(self, mix_abc): df = DataFrame(mix_abc) - expec = DataFrame({'a': ['a', 1, 2, 3], - 'b': mix_abc['b'], - 'c': mix_abc['c'], - }) - res = df.replace(0, 'a') + expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]}) + res = df.replace(0, "a") assert_frame_equal(res, expec) assert res.a.dtype == np.object_ - @pytest.mark.parametrize('metachar', ['[]', '()', r'\d', r'\w', r'\s']) + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): - df = DataFrame({'a': [metachar, 'else']}) - result = df.replace({'a': {metachar: 'paren'}}) - expected = DataFrame({'a': ['paren', 'else']}) + df = DataFrame({"a": [metachar, "else"]}) + result = df.replace({"a": {metachar: "paren"}}) + expected = DataFrame({"a": ["paren", "else"]}) assert_frame_equal(result, expected) def test_replace(self): - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = np.nan + self.tsframe["A"][:5] = np.nan + self.tsframe["A"][-5:] = np.nan zero_filled = self.tsframe.replace(np.nan, -1e8) assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8)) assert_frame_equal(zero_filled.replace(-1e8, np.nan), self.tsframe) - self.tsframe['A'][:5] = np.nan - self.tsframe['A'][-5:] = 
-        self.tsframe['B'][:5] = -1e8
+        self.tsframe["A"][:5] = np.nan
+        self.tsframe["A"][-5:] = np.nan
+        self.tsframe["B"][:5] = -1e8

         # empty
-        df = DataFrame(index=['a', 'b'])
+        df = DataFrame(index=["a", "b"])
         assert_frame_equal(df, df.replace(5, 7))

         # GH 11698
         # test for mixed data types.
-        df = pd.DataFrame([('-', pd.to_datetime('20150101')),
-                           ('a', pd.to_datetime('20150102'))])
-        df1 = df.replace('-', np.nan)
-        expected_df = pd.DataFrame([(np.nan, pd.to_datetime('20150101')),
-                                    ('a', pd.to_datetime('20150102'))])
+        df = pd.DataFrame(
+            [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
+        )
+        df1 = df.replace("-", np.nan)
+        expected_df = pd.DataFrame(
+            [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
+        )
         assert_frame_equal(df1, expected_df)

     def test_replace_list(self):
-        obj = {'a': list('ab..'), 'b': list('efgh'), 'c': list('helo')}
+        obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")}
         dfobj = DataFrame(obj)

         # lists of regexes and values
         # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
-        to_replace_res = [r'.', r'e']
-        values = [np.nan, 'crap']
+        to_replace_res = [r".", r"e"]
+        values = [np.nan, "crap"]
         res = dfobj.replace(to_replace_res, values)
-        expec = DataFrame({'a': ['a', 'b', np.nan, np.nan],
-                           'b': ['crap', 'f', 'g', 'h'],
-                           'c': ['h', 'crap', 'l', 'o'],
-                           })
+        expec = DataFrame(
+            {
+                "a": ["a", "b", np.nan, np.nan],
+                "b": ["crap", "f", "g", "h"],
+                "c": ["h", "crap", "l", "o"],
+            }
+        )
         assert_frame_equal(res, expec)

         # list of [v1, v2, ..., vN] -> [v1, v2, .., vN]
-        to_replace_res = [r'.', r'f']
-        values = [r'..', r'crap']
+        to_replace_res = [r".", r"f"]
+        values = [r"..", r"crap"]
         res = dfobj.replace(to_replace_res, values)
-        expec = DataFrame({'a': ['a', 'b', '..', '..'],
-                           'b': ['e', 'crap', 'g', 'h'],
-                           'c': ['h', 'e', 'l', 'o'],
-                           })
+        expec = DataFrame(
+            {
+                "a": ["a", "b", "..", ".."],
+                "b": ["e", "crap", "g", "h"],
+                "c": ["h", "e", "l", "o"],
+            }
+        )
         assert_frame_equal(res, expec)

     def test_replace_with_empty_list(self):
         # GH 21977
-        s = pd.Series([['a', 'b'], [], np.nan, [1]])
-        df = pd.DataFrame({'col': s})
+        s = pd.Series([["a", "b"], [], np.nan, [1]])
+        df = pd.DataFrame({"col": s})
         expected = df
         result = df.replace([], np.nan)
         assert_frame_equal(result, expected)
@@ -656,25 +653,23 @@ def test_replace_with_empty_list(self):
         with pytest.raises(ValueError, match="cannot assign mismatch"):
             df.replace({np.nan: []})
         with pytest.raises(ValueError, match="cannot assign mismatch"):
-            df.replace({np.nan: ['dummy', 'alt']})
+            df.replace({np.nan: ["dummy", "alt"]})

     def test_replace_series_dict(self):
         # from GH 3064
-        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
-        result = df.replace(0, {'zero': 0.5, 'one': 1.0})
-        expected = DataFrame(
-            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
+        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
+        result = df.replace(0, {"zero": 0.5, "one": 1.0})
+        expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
         assert_frame_equal(result, expected)

         result = df.replace(0, df.mean())
         assert_frame_equal(result, expected)

         # series to series/dict
-        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
-        s = Series({'zero': 0.0, 'one': 2.0})
-        result = df.replace(s, {'zero': 0.5, 'one': 1.0})
-        expected = DataFrame(
-            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
+        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
+        s = Series({"zero": 0.0, "one":
2.0}) + result = df.replace(s, {"zero": 0.5, "one": 1.0}) + expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}}) assert_frame_equal(result, expected) result = df.replace(s, df.mean()) @@ -682,8 +677,8 @@ def test_replace_series_dict(self): def test_replace_convert(self): # gh 3907 - df = DataFrame([['foo', 'bar', 'bah'], ['bar', 'foo', 'bah']]) - m = {'foo': 1, 'bar': 2, 'bah': 3} + df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) + m = {"foo": 1, "bar": 2, "bah": 3} rep = df.replace(m) expec = Series([np.int64] * 3) res = rep.dtypes @@ -691,8 +686,8 @@ def test_replace_convert(self): def test_replace_mixed(self): mf = self.mixed_frame - mf.iloc[5:20, mf.columns.get_loc('foo')] = np.nan - mf.iloc[-10:, mf.columns.get_loc('A')] = np.nan + mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan + mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan result = self.mixed_frame.replace(np.nan, -18) expected = self.mixed_frame.fillna(value=-18) @@ -705,10 +700,18 @@ def test_replace_mixed(self): assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame) # int block upcasting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64')}) - expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0.5, 1], dtype='float64')}) + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + } + ) result = df.replace(0, 0.5) assert_frame_equal(result, expected) @@ -716,57 +719,78 @@ def test_replace_mixed(self): assert_frame_equal(df, expected) # int block splitting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64'), - 'C': Series([1, 2], dtype='int64')}) - expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0.5, 1], dtype='float64'), - 'C': Series([1, 2], dtype='int64')}) + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + "C": Series([1, 2], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0.5, 1], dtype="float64"), + "C": Series([1, 2], dtype="int64"), + } + ) result = df.replace(0, 0.5) assert_frame_equal(result, expected) # to object block upcasting - df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'), - 'B': Series([0, 1], dtype='int64')}) - expected = DataFrame({'A': Series([1, 'foo'], dtype='object'), - 'B': Series([0, 1], dtype='int64')}) - result = df.replace(2, 'foo') + df = DataFrame( + { + "A": Series([1.0, 2.0], dtype="float64"), + "B": Series([0, 1], dtype="int64"), + } + ) + expected = DataFrame( + { + "A": Series([1, "foo"], dtype="object"), + "B": Series([0, 1], dtype="int64"), + } + ) + result = df.replace(2, "foo") assert_frame_equal(result, expected) - expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'), - 'B': Series([0, 'foo'], dtype='object')}) - result = df.replace([1, 2], ['foo', 'bar']) + expected = DataFrame( + { + "A": Series(["foo", "bar"], dtype="object"), + "B": Series([0, "foo"], dtype="object"), + } + ) + result = df.replace([1, 2], ["foo", "bar"]) assert_frame_equal(result, expected) # test case from - df = DataFrame({'A': Series([3, 0], dtype='int64'), - 'B': Series([0, 3], dtype='int64')}) + df = DataFrame( + {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")} + ) result = df.replace(3, 
df.mean().to_dict()) - expected = df.copy().astype('float64') + expected = df.copy().astype("float64") m = df.mean() expected.iloc[0, 0] = m[0] expected.iloc[1, 1] = m[1] assert_frame_equal(result, expected) def test_replace_simple_nested_dict(self): - df = DataFrame({'col': range(1, 5)}) - expected = DataFrame({'col': ['a', 2, 3, 'b']}) + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) - result = df.replace({'col': {1: 'a', 4: 'b'}}) + result = df.replace({"col": {1: "a", 4: "b"}}) assert_frame_equal(expected, result) # in this case, should be the same as the not nested version - result = df.replace({1: 'a', 4: 'b'}) + result = df.replace({1: "a", 4: "b"}) assert_frame_equal(expected, result) def test_replace_simple_nested_dict_with_nonexistent_value(self): - df = DataFrame({'col': range(1, 5)}) - expected = DataFrame({'col': ['a', 2, 3, 'b']}) + df = DataFrame({"col": range(1, 5)}) + expected = DataFrame({"col": ["a", 2, 3, "b"]}) - result = df.replace({-1: '-', 1: 'a', 4: 'b'}) + result = df.replace({-1: "-", 1: "a", 4: "b"}) assert_frame_equal(expected, result) - result = df.replace({'col': {-1: '-', 1: 'a', 4: 'b'}}) + result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) assert_frame_equal(expected, result) def test_replace_value_is_none(self): @@ -793,82 +817,129 @@ def test_replace_for_new_dtypes(self): # dtypes tsframe = self.tsframe.copy().astype(np.float32) - tsframe['A'][:5] = np.nan - tsframe['A'][-5:] = np.nan + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan zero_filled = tsframe.replace(np.nan, -1e8) assert_frame_equal(zero_filled, tsframe.fillna(-1e8)) assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe) - tsframe['A'][:5] = np.nan - tsframe['A'][-5:] = np.nan - tsframe['B'][:5] = -1e8 + tsframe["A"][:5] = np.nan + tsframe["A"][-5:] = np.nan + tsframe["B"][:5] = -1e8 - b = tsframe['B'] + b = tsframe["B"] b[b == -1e8] = np.nan - tsframe['B'] = b - result = tsframe.fillna(method='bfill') - assert_frame_equal(result, tsframe.fillna(method='bfill')) - - @pytest.mark.parametrize('frame, to_replace, value, expected', [ - (DataFrame({'ints': [1, 2, 3]}), 1, 0, - DataFrame({'ints': [0, 2, 3]})), - (DataFrame({'ints': [1, 2, 3]}, dtype=np.int32), 1, 0, - DataFrame({'ints': [0, 2, 3]}, dtype=np.int32)), - (DataFrame({'ints': [1, 2, 3]}, dtype=np.int16), 1, 0, - DataFrame({'ints': [0, 2, 3]}, dtype=np.int16)), - (DataFrame({'bools': [True, False, True]}), False, True, - DataFrame({'bools': [True, True, True]})), - (DataFrame({'complex': [1j, 2j, 3j]}), 1j, 0, - DataFrame({'complex': [0j, 2j, 3j]})), - (DataFrame({'datetime64': Index([datetime(2018, 5, 28), - datetime(2018, 7, 28), - datetime(2018, 5, 28)])}), - datetime(2018, 5, 28), datetime(2018, 7, 28), - DataFrame({'datetime64': Index([datetime(2018, 7, 28)] * 3)})), - # GH 20380 - (DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['foo']}), - 'foo', 'bar', - DataFrame({'dt': [datetime(3017, 12, 20)], 'str': ['bar']})), - (DataFrame({'A': date_range('20130101', periods=3, tz='US/Eastern'), - 'B': [0, np.nan, 2]}), - Timestamp('20130102', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]})) - ]) + tsframe["B"] = b + result = tsframe.fillna(method="bfill") + assert_frame_equal(result, tsframe.fillna(method="bfill")) + + @pytest.mark.parametrize( + "frame, to_replace, value, expected", + [ + 
(DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int32), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int32), + ), + ( + DataFrame({"ints": [1, 2, 3]}, dtype=np.int16), + 1, + 0, + DataFrame({"ints": [0, 2, 3]}, dtype=np.int16), + ), + ( + DataFrame({"bools": [True, False, True]}), + False, + True, + DataFrame({"bools": [True, True, True]}), + ), + ( + DataFrame({"complex": [1j, 2j, 3j]}), + 1j, + 0, + DataFrame({"complex": [0j, 2j, 3j]}), + ), + ( + DataFrame( + { + "datetime64": Index( + [ + datetime(2018, 5, 28), + datetime(2018, 7, 28), + datetime(2018, 5, 28), + ] + ) + } + ), + datetime(2018, 5, 28), + datetime(2018, 7, 28), + DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}), + ), + # GH 20380 + ( + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}), + "foo", + "bar", + DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}), + ), + ( + DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ), + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ), + ), + ], + ) def test_replace_dtypes(self, frame, to_replace, value, expected): - result = getattr(frame, 'replace')(to_replace, value) + result = getattr(frame, "replace")(to_replace, value) assert_frame_equal(result, expected) def test_replace_input_formats_listlike(self): # both dicts - to_rep = {'A': np.nan, 'B': 0, 'C': ''} - values = {'A': 0, 'B': -1, 'C': 'missing'} - df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + to_rep = {"A": np.nan, "B": 0, "C": ""} + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) filled = df.replace(to_rep, values) expected = {k: v.replace(to_rep[k], values[k]) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) result = df.replace([0, 2, 5], [5, 2, 0]) - expected = DataFrame({'A': [np.nan, 5, np.inf], 'B': [5, 2, 0], - 'C': ['', 'asdf', 'fd']}) + expected = DataFrame( + {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]} + ) assert_frame_equal(result, expected) # scalar to dict - values = {'A': 0, 'B': -1, 'C': 'missing'} - df = DataFrame({'A': [np.nan, 0, np.nan], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + values = {"A": 0, "B": -1, "C": "missing"} + df = DataFrame( + {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) filled = df.replace(np.nan, values) expected = {k: v.replace(np.nan, values[k]) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) # list to list - to_rep = [np.nan, 0, ''] - values = [-2, -1, 'missing'] + to_rep = [np.nan, 0, ""] + values = [-2, -1, "missing"] result = df.replace(to_rep, values) expected = df.copy() for i in range(len(to_rep)): @@ -880,21 +951,22 @@ def test_replace_input_formats_listlike(self): df.replace(to_rep, values[1:]) def test_replace_input_formats_scalar(self): - df = DataFrame({'A': [np.nan, 0, np.inf], 'B': [0, 2, 5], - 'C': ['', 'asdf', 'fd']}) + df = DataFrame( + {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} + ) # dict to scalar - to_rep = {'A': np.nan, 'B': 0, 'C': ''} + to_rep = {"A": np.nan, "B": 0, "C": ""} filled = df.replace(to_rep, 0) expected = {k: v.replace(to_rep[k], 
0) for k, v in df.items()} assert_frame_equal(filled, DataFrame(expected)) msg = "value argument must be scalar, dict, or Series" with pytest.raises(TypeError, match=msg): - df.replace(to_rep, [np.nan, 0, '']) + df.replace(to_rep, [np.nan, 0, ""]) # list to scalar - to_rep = [np.nan, 0, ''] + to_rep = [np.nan, 0, ""] result = df.replace(to_rep, -1) expected = df.copy() for i in range(len(to_rep)): @@ -905,19 +977,45 @@ def test_replace_limit(self): pass def test_replace_dict_no_regex(self): - answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: - 'Disagree', 4: 'Strongly Disagree'}) - weights = {'Agree': 4, 'Disagree': 2, 'Neutral': 3, 'Strongly Agree': - 5, 'Strongly Disagree': 1} + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) assert_series_equal(result, expected) def test_replace_series_no_regex(self): - answer = Series({0: 'Strongly Agree', 1: 'Agree', 2: 'Neutral', 3: - 'Disagree', 4: 'Strongly Disagree'}) - weights = Series({'Agree': 4, 'Disagree': 2, 'Neutral': 3, - 'Strongly Agree': 5, 'Strongly Disagree': 1}) + answer = Series( + { + 0: "Strongly Agree", + 1: "Agree", + 2: "Neutral", + 3: "Disagree", + 4: "Strongly Disagree", + } + ) + weights = Series( + { + "Agree": 4, + "Disagree": 2, + "Neutral": 3, + "Strongly Agree": 5, + "Strongly Disagree": 1, + } + ) expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1}) result = answer.replace(weights) assert_series_equal(result, expected) @@ -928,7 +1026,7 @@ def test_replace_dict_tuple_list_ordering_remains_the_same(self): res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0]) res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0]) - expected = DataFrame({'A': [0, -1e8]}) + expected = DataFrame({"A": [0, -1e8]}) assert_frame_equal(res1, res2) assert_frame_equal(res2, res3) assert_frame_equal(res3, expected) @@ -939,19 +1037,19 @@ def test_replace_doesnt_replace_without_regex(self): 1 2 vr 0 0 2 2 0 0 0 3 3 0 bt 0""" - df = pd.read_csv(StringIO(raw), sep=r'\s+') - res = df.replace({r'\D': 1}) + df = pd.read_csv(StringIO(raw), sep=r"\s+") + res = df.replace({r"\D": 1}) assert_frame_equal(df, res) def test_replace_bool_with_string(self): - df = DataFrame({'a': [True, False], 'b': list('ab')}) - result = df.replace(True, 'a') - expected = DataFrame({'a': ['a', False], 'b': df.b}) + df = DataFrame({"a": [True, False], "b": list("ab")}) + result = df.replace(True, "a") + expected = DataFrame({"a": ["a", False], "b": df.b}) assert_frame_equal(result, expected) def test_replace_pure_bool_with_string_no_op(self): df = DataFrame(np.random.rand(2, 2) > 0.5) - result = df.replace('asdf', 'fdsa') + result = df.replace("asdf", "fdsa") assert_frame_equal(df, result) def test_replace_bool_with_bool(self): @@ -962,93 +1060,102 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - with pytest.raises(TypeError, match='Cannot compare types .+'): - df.replace({'asdf': 'asdb', True: 'yes'}) + with pytest.raises(TypeError, match="Cannot compare types .+"): + df.replace({"asdf": "asdb", True: "yes"}) def test_replace_truthy(self): - df = DataFrame({'a': [True, True]}) + df = DataFrame({"a": [True, True]}) r = df.replace([np.inf, -np.inf], np.nan) e = df assert_frame_equal(r, e) def 
test_replace_int_to_int_chain(self): - df = DataFrame({'a': list(range(1, 5))}) + df = DataFrame({"a": list(range(1, 5))}) with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({'a': dict(zip(range(1, 5), range(2, 6)))}) + df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) def test_replace_str_to_str_chain(self): a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) - df = DataFrame({'a': astr}) + df = DataFrame({"a": astr}) with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({'a': dict(zip(astr, bstr))}) + df.replace({"a": dict(zip(astr, bstr))}) def test_replace_swapping_bug(self): - df = pd.DataFrame({'a': [True, False, True]}) - res = df.replace({'a': {True: 'Y', False: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + df = pd.DataFrame({"a": [True, False, True]}) + res = df.replace({"a": {True: "Y", False: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) assert_frame_equal(res, expect) - df = pd.DataFrame({'a': [0, 1, 0]}) - res = df.replace({'a': {0: 'Y', 1: 'N'}}) - expect = pd.DataFrame({'a': ['Y', 'N', 'Y']}) + df = pd.DataFrame({"a": [0, 1, 0]}) + res = df.replace({"a": {0: "Y", 1: "N"}}) + expect = pd.DataFrame({"a": ["Y", "N", "Y"]}) assert_frame_equal(res, expect) def test_replace_period(self): d = { - 'fname': { - 'out_augmented_AUG_2011.json': - pd.Period(year=2011, month=8, freq='M'), - 'out_augmented_JAN_2011.json': - pd.Period(year=2011, month=1, freq='M'), - 'out_augmented_MAY_2012.json': - pd.Period(year=2012, month=5, freq='M'), - 'out_augmented_SUBSIDY_WEEK.json': - pd.Period(year=2011, month=4, freq='M'), - 'out_augmented_AUG_2012.json': - pd.Period(year=2012, month=8, freq='M'), - 'out_augmented_MAY_2011.json': - pd.Period(year=2011, month=5, freq='M'), - 'out_augmented_SEP_2013.json': - pd.Period(year=2013, month=9, freq='M')}} - - df = pd.DataFrame(['out_augmented_AUG_2012.json', - 'out_augmented_SEP_2013.json', - 'out_augmented_SUBSIDY_WEEK.json', - 'out_augmented_MAY_2012.json', - 'out_augmented_MAY_2011.json', - 'out_augmented_AUG_2011.json', - 'out_augmented_JAN_2011.json'], columns=['fname']) - assert set(df.fname.values) == set(d['fname'].keys()) + "fname": { + "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"), + "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"), + "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"), + "out_augmented_SUBSIDY_WEEK.json": pd.Period( + year=2011, month=4, freq="M" + ), + "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"), + "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"), + "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) # We don't support converting object -> specialized EA in # replace yet. 
- expected = DataFrame({'fname': [d['fname'][k] - for k in df.fname.values]}, - dtype=object) + expected = DataFrame( + {"fname": [d["fname"][k] for k in df.fname.values]}, dtype=object + ) result = df.replace(d) assert_frame_equal(result, expected) def test_replace_datetime(self): - d = {'fname': - {'out_augmented_AUG_2011.json': pd.Timestamp('2011-08'), - 'out_augmented_JAN_2011.json': pd.Timestamp('2011-01'), - 'out_augmented_MAY_2012.json': pd.Timestamp('2012-05'), - 'out_augmented_SUBSIDY_WEEK.json': pd.Timestamp('2011-04'), - 'out_augmented_AUG_2012.json': pd.Timestamp('2012-08'), - 'out_augmented_MAY_2011.json': pd.Timestamp('2011-05'), - 'out_augmented_SEP_2013.json': pd.Timestamp('2013-09')}} - - df = pd.DataFrame(['out_augmented_AUG_2012.json', - 'out_augmented_SEP_2013.json', - 'out_augmented_SUBSIDY_WEEK.json', - 'out_augmented_MAY_2012.json', - 'out_augmented_MAY_2011.json', - 'out_augmented_AUG_2011.json', - 'out_augmented_JAN_2011.json'], columns=['fname']) - assert set(df.fname.values) == set(d['fname'].keys()) - expected = DataFrame({'fname': [d['fname'][k] - for k in df.fname.values]}) + d = { + "fname": { + "out_augmented_AUG_2011.json": pd.Timestamp("2011-08"), + "out_augmented_JAN_2011.json": pd.Timestamp("2011-01"), + "out_augmented_MAY_2012.json": pd.Timestamp("2012-05"), + "out_augmented_SUBSIDY_WEEK.json": pd.Timestamp("2011-04"), + "out_augmented_AUG_2012.json": pd.Timestamp("2012-08"), + "out_augmented_MAY_2011.json": pd.Timestamp("2011-05"), + "out_augmented_SEP_2013.json": pd.Timestamp("2013-09"), + } + } + + df = pd.DataFrame( + [ + "out_augmented_AUG_2012.json", + "out_augmented_SEP_2013.json", + "out_augmented_SUBSIDY_WEEK.json", + "out_augmented_MAY_2012.json", + "out_augmented_MAY_2011.json", + "out_augmented_AUG_2011.json", + "out_augmented_JAN_2011.json", + ], + columns=["fname"], + ) + assert set(df.fname.values) == set(d["fname"].keys()) + expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]}) result = df.replace(d) assert_frame_equal(result, expected) @@ -1056,56 +1163,83 @@ def test_replace_datetimetz(self): # GH 11326 # behaving poorly when presented with a datetime64[ns, tz] - df = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': [0, np.nan, 2]}) + df = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [0, np.nan, 2], + } + ) result = df.replace(np.nan, 1) - expected = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': Series([0, 1, 2], dtype='float64')}) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": Series([0, 1, 2], dtype="float64"), + } + ) assert_frame_equal(result, expected) result = df.fillna(1) assert_frame_equal(result, expected) result = df.replace(0, np.nan) - expected = DataFrame({'A': date_range('20130101', periods=3, - tz='US/Eastern'), - 'B': [np.nan, np.nan, 2]}) + expected = DataFrame( + { + "A": date_range("20130101", periods=3, tz="US/Eastern"), + "B": [np.nan, np.nan, 2], + } + ) assert_frame_equal(result, expected) - result = df.replace(Timestamp('20130102', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Eastern'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = df.replace( + Timestamp("20130102", tz="US/Eastern"), + Timestamp("20130104", tz="US/Eastern"), + ) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), 
+ Timestamp("20130104", tz="US/Eastern"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace( - {'A': pd.NaT}, Timestamp('20130104', tz='US/Eastern')) + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern")) assert_frame_equal(result, expected) # coerce to object result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace( - {'A': pd.NaT}, Timestamp('20130104', tz='US/Pacific')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104', tz='US/Pacific'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104", tz="US/Pacific"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) result = df.copy() result.iloc[1, 0] = np.nan - result = result.replace({'A': np.nan}, Timestamp('20130104')) - expected = DataFrame({'A': [Timestamp('20130101', tz='US/Eastern'), - Timestamp('20130104'), - Timestamp('20130103', tz='US/Eastern')], - 'B': [0, np.nan, 2]}) + result = result.replace({"A": np.nan}, Timestamp("20130104")) + expected = DataFrame( + { + "A": [ + Timestamp("20130101", tz="US/Eastern"), + Timestamp("20130104"), + Timestamp("20130103", tz="US/Eastern"), + ], + "B": [0, np.nan, 2], + } + ) assert_frame_equal(result, expected) def test_replace_with_empty_dictlike(self, mix_abc): @@ -1114,37 +1248,44 @@ def test_replace_with_empty_dictlike(self, mix_abc): assert_frame_equal(df, df.replace({})) assert_frame_equal(df, df.replace(Series([]))) - assert_frame_equal(df, df.replace({'b': {}})) - assert_frame_equal(df, df.replace(Series({'b': {}}))) - - @pytest.mark.parametrize("to_replace, method, expected", [ - (0, 'bfill', {'A': [1, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - (np.nan, 'bfill', {'A': [0, 1, 2], - 'B': [5.0, 7.0, 7.0], - 'C': ['a', 'b', 'c']}), - ('d', 'ffill', {'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ([0, 2], 'bfill', {'A': [1, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ([1, 2], 'pad', {'A': [0, 0, 0], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - ((1, 2), 'bfill', {'A': [0, 2, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}), - (['b', 'c'], 'ffill', {'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'a', 'a']}), - ]) + assert_frame_equal(df, df.replace({"b": {}})) + assert_frame_equal(df, df.replace(Series({"b": {}}))) + + @pytest.mark.parametrize( + "to_replace, method, expected", + [ + (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + np.nan, + "bfill", + {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]}, + ), + ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}), + ( + [0, 2], + "bfill", + {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + [1, 2], + "pad", + {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + (1, 2), + "bfill", + {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}, + ), + ( + ["b", "c"], + "ffill", + {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]}, + ), + ], + ) def test_replace_method(self, to_replace, method, expected): # GH 19632 - df = DataFrame({'A': [0, 1, 2], - 'B': [5, np.nan, 7], - 'C': ['a', 'b', 'c']}) + df 
= DataFrame({"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}) result = df.replace(to_replace=to_replace, value=None, method=method) expected = DataFrame(expected) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 24dba8cb964cc..c33b758d2d62c 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -11,7 +11,13 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Series, date_range, option_context, period_range) + Categorical, + DataFrame, + Series, + date_range, + option_context, + period_range, +) from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -22,7 +28,6 @@ class TestDataFrameReprInfoEtc(TestData): - def test_repr_empty(self): # empty foo = repr(self.empty) # noqa @@ -41,11 +46,11 @@ def test_repr_mixed(self): @pytest.mark.slow def test_repr_mixed_big(self): # big mixed - biggie = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=range(200)) - biggie.loc[:20, 'A'] = np.nan - biggie.loc[:20, 'B'] = np.nan + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, index=range(200) + ) + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan foo = repr(biggie) # noqa @@ -57,8 +62,8 @@ def test_repr(self): self.frame.info(verbose=False, buf=buf) # even smaller - self.frame.reindex(columns=['A']).info(verbose=False, buf=buf) - self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf) + self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) # exhausting cases in DataFrame.info @@ -75,45 +80,47 @@ def test_repr(self): assert "a\n" not in repr(df) def test_repr_dimensions(self): - df = DataFrame([[1, 2, ], [3, 4]]) - with option_context('display.show_dimensions', True): + df = DataFrame([[1, 2], [3, 4]]) + with option_context("display.show_dimensions", True): assert "2 rows x 2 columns" in repr(df) - with option_context('display.show_dimensions', False): + with option_context("display.show_dimensions", False): assert "2 rows x 2 columns" not in repr(df) - with option_context('display.show_dimensions', 'truncate'): + with option_context("display.show_dimensions", "truncate"): assert "2 rows x 2 columns" not in repr(df) @pytest.mark.slow def test_repr_big(self): # big one - biggie = DataFrame(np.zeros((200, 4)), columns=range(4), - index=range(200)) + biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) repr(biggie) def test_repr_unsortable(self): # columns are not sortable import warnings + warn_filters = warnings.filters - warnings.filterwarnings('ignore', - category=FutureWarning, - module=".*format") - - unsortable = DataFrame({'foo': [1] * 50, - datetime.today(): [1] * 50, - 'bar': ['bar'] * 50, - datetime.today() + timedelta(1): ['bar'] * 50}, - index=np.arange(50)) + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") + + unsortable = DataFrame( + { + "foo": [1] * 50, + datetime.today(): [1] * 50, + "bar": ["bar"] * 50, + datetime.today() + timedelta(1): ["bar"] * 50, + }, + index=np.arange(50), + ) repr(unsortable) - fmt.set_option('display.precision', 3, 'display.column_space', 10) + fmt.set_option("display.precision", 3, "display.column_space", 10) repr(self.frame) - fmt.set_option('display.max_rows', 10, 'display.max_columns', 2) + fmt.set_option("display.max_rows", 10, "display.max_columns", 2) repr(self.frame) - fmt.set_option('display.max_rows', 
1000, 'display.max_columns', 1000) + fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) repr(self.frame) tm.reset_display_options() @@ -121,51 +128,56 @@ def test_repr_unsortable(self): warnings.filters = warn_filters def test_repr_unicode(self): - uval = '\u03c3\u03c3\u03c3\u03c3' + uval = "\u03c3\u03c3\u03c3\u03c3" # TODO(wesm): is this supposed to be used? - bval = uval.encode('utf-8') # noqa + bval = uval.encode("utf-8") # noqa - df = DataFrame({'A': [uval, uval]}) + df = DataFrame({"A": [uval, uval]}) result = repr(df) - ex_top = ' A' - assert result.split('\n')[0].rstrip() == ex_top + ex_top = " A" + assert result.split("\n")[0].rstrip() == ex_top - df = DataFrame({'A': [uval, uval]}) + df = DataFrame({"A": [uval, uval]}) result = repr(df) - assert result.split('\n')[0].rstrip() == ex_top + assert result.split("\n")[0].rstrip() == ex_top def test_unicode_string_with_unicode(self): - df = DataFrame({'A': ["\u05d0"]}) + df = DataFrame({"A": ["\u05d0"]}) str(df) def test_str_to_bytes_raises(self): # GH 26447 - df = DataFrame({'A': ["abc"]}) + df = DataFrame({"A": ["abc"]}) msg = "^'str' object cannot be interpreted as an integer$" with pytest.raises(TypeError, match=msg): bytes(df) def test_very_wide_info_repr(self): - df = DataFrame(np.random.randn(10, 20), - columns=tm.rands_array(10, 20)) + df = DataFrame(np.random.randn(10, 20), columns=tm.rands_array(10, 20)) repr(df) def test_repr_column_name_unicode_truncation_bug(self): # #1906 - df = DataFrame({'Id': [7117434], - 'StringCol': ('Is it possible to modify drop plot code' - ' so that the output graph is displayed ' - 'in iphone simulator, Is it possible to ' - 'modify drop plot code so that the ' - 'output graph is \xe2\x80\xa8displayed ' - 'in iphone simulator.Now we are adding ' - 'the CSV file externally. I want to Call' - ' the File through the code..')}) - - with option_context('display.max_columns', 20): - assert 'StringCol' in repr(df) + df = DataFrame( + { + "Id": [7117434], + "StringCol": ( + "Is it possible to modify drop plot code" + " so that the output graph is displayed " + "in iphone simulator, Is it possible to " + "modify drop plot code so that the " + "output graph is \xe2\x80\xa8displayed " + "in iphone simulator.Now we are adding " + "the CSV file externally. I want to Call" + " the File through the code.." 
+ ), + } + ) + + with option_context("display.max_columns", 20): + assert "StringCol" in repr(df) def test_latex_repr(self): result = r"""\begin{tabular}{llll} @@ -177,9 +189,8 @@ def test_latex_repr(self): \bottomrule \end{tabular} """ - with option_context("display.latex.escape", False, - 'display.latex.repr', True): - df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]]) + with option_context("display.latex.escape", False, "display.latex.repr", True): + df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]]) assert result == df._repr_latex_() # GH 12182 @@ -197,25 +208,30 @@ def test_info(self): def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 - df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')}) + df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) buf = StringIO() df.info(buf=buf) result = buf.getvalue() bytes = float(df.memory_usage().sum()) - expected = textwrap.dedent("""\ + expected = textwrap.dedent( + """\ RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes - """.format(bytes)) + """.format( + bytes + ) + ) assert result == expected def test_info_wide(self): from pandas import set_option, reset_option + io = StringIO() df = DataFrame(np.random.randn(5, 101)) df.info(buf=io) @@ -226,35 +242,40 @@ def test_info_wide(self): assert len(rs.splitlines()) > 100 xp = rs - set_option('display.max_info_columns', 101) + set_option("display.max_info_columns", 101) io = StringIO() df.info(buf=io) assert rs == xp - reset_option('display.max_info_columns') + reset_option("display.max_info_columns") def test_info_duplicate_columns(self): io = StringIO() # it works! - frame = DataFrame(np.random.randn(1500, 4), - columns=['a', 'a', 'b', 'b']) + frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"]) frame.info(buf=io) def test_info_duplicate_columns_shows_correct_dtypes(self): # GH11761 io = StringIO() - frame = DataFrame([[1, 2.0]], - columns=['a', 'a']) + frame = DataFrame([[1, 2.0]], columns=["a", "a"]) frame.info(buf=io) io.seek(0) lines = io.readlines() - assert 'a 1 non-null int64\n' == lines[3] - assert 'a 1 non-null float64\n' == lines[4] + assert "a 1 non-null int64\n" == lines[3] + assert "a 1 non-null float64\n" == lines[4] def test_info_shows_column_dtypes(self): - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] data = {} n = 10 for i, dtype in enumerate(dtypes): @@ -264,47 +285,54 @@ def test_info_shows_column_dtypes(self): df.info(buf=buf) res = buf.getvalue() for i, dtype in enumerate(dtypes): - name = '%d %d non-null %s' % (i, n, dtype) + name = "%d %d non-null %s" % (i, n, dtype) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) for len_, verbose in [(5, None), (5, False), (10, True)]: # For verbose always ^ setting ^ summarize ^ full output - with option_context('max_info_columns', 4): + with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ for len_, verbose in [(10, None), (5, False), (10, True)]: # max_cols no exceeded - with option_context('max_info_columns', 5): + with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() - assert 
len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ for len_, max_cols in [(10, 5), (5, 4)]: # setting truncates - with option_context('max_info_columns', 4): + with option_context("max_info_columns", 4): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ # setting wouldn't truncate - with option_context('max_info_columns', 5): + with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, max_cols=max_cols) res = buf.getvalue() - assert len(res.strip().split('\n')) == len_ + assert len(res.strip().split("\n")) == len_ def test_info_memory_usage(self): # Ensure memory usage is displayed, when asserted, on the last line - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] data = {} n = 10 for i, dtype in enumerate(dtypes): @@ -335,7 +363,7 @@ def test_info_memory_usage(self): assert not re.match(r"memory usage: [^+]+\+", res[-1]) # Test a DataFrame with duplicate columns - dtypes = ['int64', 'int64', 'int64', 'float64'] + dtypes = ["int64", "int64", "int64", "float64"] data = {} n = 100 for i, dtype in enumerate(dtypes): @@ -343,12 +371,12 @@ def test_info_memory_usage(self): df = DataFrame(data) df.columns = dtypes - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) df_with_object_index.info(buf=buf, memory_usage=True) res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+\+", res[-1]) - df_with_object_index.info(buf=buf, memory_usage='deep') + df_with_object_index.info(buf=buf, memory_usage="deep") res = buf.getvalue().splitlines() assert re.match(r"memory usage: [^+]+$", res[-1]) @@ -366,15 +394,12 @@ def test_info_memory_usage(self): assert df.memory_usage().sum() == df.memory_usage(deep=True).sum() # test for validity - DataFrame(1, index=['a'], columns=['A'] - ).memory_usage(index=True) - DataFrame(1, index=['a'], columns=['A'] - ).index.nbytes + DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) + DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( data=1, - index=pd.MultiIndex.from_product( - [['a'], range(1000)]), - columns=['A'] + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], ) df.index.nbytes df.memory_usage(index=True) @@ -383,39 +408,34 @@ def test_info_memory_usage(self): mem = df.memory_usage(deep=True).sum() assert mem > 0 - @pytest.mark.skipif(PYPY, - reason="on PyPy deep=True doesn't change result") + @pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result") def test_info_memory_usage_deep_not_pypy(self): - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) - assert (df_with_object_index.memory_usage( - index=True, deep=True).sum() > - df_with_object_index.memory_usage( - index=True).sum()) - - df_object = pd.DataFrame({'a': ['a']}) - assert (df_object.memory_usage(deep=True).sum() > - df_object.memory_usage().sum()) - - @pytest.mark.skipif(not PYPY, - reason="on PyPy deep=True does not change result") + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + > df_with_object_index.memory_usage(index=True).sum() + ) + + df_object = pd.DataFrame({"a": ["a"]}) + assert 
df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum() + + @pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result") def test_info_memory_usage_deep_pypy(self): - df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo']) - assert (df_with_object_index.memory_usage( - index=True, deep=True).sum() == - df_with_object_index.memory_usage( - index=True).sum()) + df_with_object_index = pd.DataFrame({"a": [1]}, index=["foo"]) + assert ( + df_with_object_index.memory_usage(index=True, deep=True).sum() + == df_with_object_index.memory_usage(index=True).sum() + ) - df_object = pd.DataFrame({'a': ['a']}) - assert (df_object.memory_usage(deep=True).sum() == - df_object.memory_usage().sum()) + df_object = pd.DataFrame({"a": ["a"]}) + assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum() @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(self): df = DataFrame( data=1, - index=pd.MultiIndex.from_product( - [['a'], range(1000)]), - columns=['A'] + index=pd.MultiIndex.from_product([["a"], range(1000)]), + columns=["A"], ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with @@ -426,30 +446,32 @@ def test_usage_via_getsizeof(self): def test_info_memory_usage_qualified(self): buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=[1, 2, 3]) + df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) - assert '+' not in buf.getvalue() + assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=list('ABC')) + df = DataFrame(1, columns=list("ab"), index=list("ABC")) df.info(buf=buf) - assert '+' in buf.getvalue() + assert "+" in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=pd.MultiIndex.from_product( - [range(3), range(3)])) + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), range(3)]), + ) df.info(buf=buf) - assert '+' not in buf.getvalue() + assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list('ab'), - index=pd.MultiIndex.from_product( - [range(3), ['foo', 'bar']])) + df = DataFrame( + 1, + columns=list("ab"), + index=pd.MultiIndex.from_product([range(3), ["foo", "bar"]]), + ) df.info(buf=buf) - assert '+' in buf.getvalue() + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(self): # GH 14308 @@ -462,13 +484,13 @@ def memory_usage(f): N = 100 M = len(uppercase) - index = pd.MultiIndex.from_product([list(uppercase), - pd.date_range('20160101', - periods=N)], - names=['id', 'date']) - df = DataFrame({'value': np.random.randn(N * M)}, index=index) + index = pd.MultiIndex.from_product( + [list(uppercase), pd.date_range("20160101", periods=N)], + names=["id", "date"], + ) + df = DataFrame({"value": np.random.randn(N * M)}, index=index) - unstacked = df.unstack('id') + unstacked = df.unstack("id") assert df.values.nbytes == unstacked.values.nbytes assert memory_usage(df) > memory_usage(unstacked) @@ -477,7 +499,7 @@ def memory_usage(f): def test_info_categorical(self): # GH14298 - idx = pd.CategoricalIndex(['a', 'b']) + idx = pd.CategoricalIndex(["a", "b"]) df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) buf = StringIO() @@ -487,23 +509,23 @@ def test_info_categorical_column(self): # make sure it works n = 2500 - df = DataFrame({'int64': np.random.randint(100, size=n)}) - df['category'] = Series(np.array(list('abcdefghij')).take( - np.random.randint(0, 10, 
size=n))).astype('category') + df = DataFrame({"int64": np.random.randint(100, size=n)}) + df["category"] = Series( + np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n)) + ).astype("category") df.isna() buf = StringIO() df.info(buf=buf) - df2 = df[df['category'] == 'd'] + df2 = df[df["category"] == "d"] buf = StringIO() df2.info(buf=buf) def test_repr_categorical_dates_periods(self): # normal DataFrame - dt = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') - p = period_range('2011-01', freq='M', periods=5) - df = DataFrame({'dt': dt, 'p': p}) + dt = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") + p = period_range("2011-01", freq="M", periods=5) + df = DataFrame({"dt": dt, "p": p}) exp = """ dt p 0 2011-01-01 09:00:00-05:00 2011-01 1 2011-01-01 10:00:00-05:00 2011-02 @@ -513,14 +535,15 @@ def test_repr_categorical_dates_periods(self): assert repr(df) == exp - df2 = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)}) + df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)}) assert repr(df2) == exp - @pytest.mark.parametrize('arg', [np.datetime64, np.timedelta64]) - @pytest.mark.parametrize('box, expected', [ - [Series, '0 NaT\ndtype: object'], - [DataFrame, ' 0\n0 NaT']]) + @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64]) + @pytest.mark.parametrize( + "box, expected", + [[Series, "0 NaT\ndtype: object"], [DataFrame, " 0\n0 NaT"]], + ) def test_repr_np_nat_with_object(self, arg, box, expected): # GH 25445 - result = repr(box([arg('NaT')], dtype=object)) + result = repr(box([arg("NaT")], dtype=object)) assert result == expected diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 04c1375418e67..f3452e9a85fb3 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -5,53 +5,56 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range) +from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestDataFrameReshape(TestData): - def test_pivot(self): data = { - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.] 
+ "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } frame = DataFrame(data) - pivoted = frame.pivot( - index='index', columns='columns', values='values') + pivoted = frame.pivot(index="index", columns="columns", values="values") - expected = DataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.} - }) + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(pivoted, expected) # name tracking - assert pivoted.index.name == 'index' - assert pivoted.columns.name == 'columns' + assert pivoted.index.name == "index" + assert pivoted.columns.name == "columns" # don't specify values - pivoted = frame.pivot(index='index', columns='columns') - assert pivoted.index.name == 'index' - assert pivoted.columns.names == (None, 'columns') + pivoted = frame.pivot(index="index", columns="columns") + assert pivoted.index.name == "index" + assert pivoted.columns.names == (None, "columns") def test_pivot_duplicates(self): - data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'], - 'b': ['one', 'two', 'one', 'one', 'two'], - 'c': [1., 2., 3., 3., 4.]}) - with pytest.raises(ValueError, match='duplicate entries'): - data.pivot('a', 'b', 'c') + data = DataFrame( + { + "a": ["bar", "bar", "foo", "foo", "foo"], + "b": ["one", "two", "one", "one", "two"], + "c": [1.0, 2.0, 3.0, 3.0, 4.0], + } + ) + with pytest.raises(ValueError, match="duplicate entries"): + data.pivot("a", "b", "c") def test_pivot_empty(self): - df = DataFrame(columns=['a', 'b', 'c']) - result = df.pivot('a', 'b', 'c') + df = DataFrame(columns=["a", "b", "c"]) + result = df.pivot("a", "b", "c") expected = DataFrame() tm.assert_frame_equal(result, expected, check_names=False) @@ -60,40 +63,42 @@ def test_pivot_integer_bug(self): result = df.pivot(index=1, columns=0, values=2) repr(result) - tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0)) + tm.assert_index_equal(result.columns, Index(["A", "B"], name=0)) def test_pivot_index_none(self): # gh-3962 data = { - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.] 
+ "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], } - frame = DataFrame(data).set_index('index') - result = frame.pivot(columns='columns', values='values') - expected = DataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.} - }) + frame = DataFrame(data).set_index("index") + result = frame.pivot(columns="columns", values="values") + expected = DataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" assert_frame_equal(result, expected) # omit values - result = frame.pivot(columns='columns') + result = frame.pivot(columns="columns") - expected.columns = pd.MultiIndex.from_tuples([('values', 'One'), - ('values', 'Two')], - names=[None, 'columns']) - expected.index.name = 'index' + expected.columns = pd.MultiIndex.from_tuples( + [("values", "One"), ("values", "Two")], names=[None, "columns"] + ) + expected.index.name = "index" tm.assert_frame_equal(result, expected, check_names=False) - assert result.index.name == 'index' - assert result.columns.names == (None, 'columns') + assert result.index.name == "index" + assert result.columns.names == (None, "columns") expected.columns = expected.columns.droplevel(0) - result = frame.pivot(columns='columns', values='values') + result = frame.pivot(columns="columns", values="values") - expected.columns.name = 'columns' + expected.columns.name = "columns" tm.assert_frame_equal(result, expected) def test_stack_unstack(self): @@ -101,22 +106,22 @@ def test_stack_unstack(self): df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) stacked = df.stack() - stacked_df = DataFrame({'foo': stacked, 'bar': stacked}) + stacked_df = DataFrame({"foo": stacked, "bar": stacked}) unstacked = stacked.unstack() unstacked_df = stacked_df.unstack() assert_frame_equal(unstacked, df) - assert_frame_equal(unstacked_df['bar'], df) + assert_frame_equal(unstacked_df["bar"], df) unstacked_cols = stacked.unstack(0) unstacked_cols_df = stacked_df.unstack(0) assert_frame_equal(unstacked_cols.T, df) - assert_frame_equal(unstacked_cols_df['bar'].T, df) + assert_frame_equal(unstacked_cols_df["bar"].T, df) def test_stack_mixed_level(self): # GH 18310 - levels = [range(3), [3, 'a', 'b'], [1, 2]] + levels = [range(3), [3, "a", "b"], [1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) @@ -125,17 +130,16 @@ def test_stack_mixed_level(self): assert_series_equal(result, expected) # MultiIndex columns: - df = DataFrame(1, index=levels[0], - columns=MultiIndex.from_product(levels[1:])) + df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) result = df.stack(1) - expected = DataFrame(1, index=MultiIndex.from_product([levels[0], - levels[2]]), - columns=levels[1]) + expected = DataFrame( + 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1] + ) assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type - result = df[['a', 'b']].stack(1) - expected = expected[['a', 'b']] + result = df[["a", "b"]].stack(1) + expected = expected[["a", "b"]] assert_frame_equal(result, expected) def test_unstack_fill(self): @@ -146,31 +150,33 @@ def test_unstack_fill(self): # From a series data = Series([1, 2, 4, 5], dtype=np.int16) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), 
('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack(fill_value=-1) - expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, - index=['x', 'y', 'z'], dtype=np.int16) + expected = DataFrame( + {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16 + ) assert_frame_equal(result, expected) # From a series with incorrect data type for fill_value result = data.unstack(fill_value=0.5) - expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, - index=['x', 'y', 'z'], dtype=np.float) + expected = DataFrame( + {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=np.float + ) assert_frame_equal(result, expected) # GH #13971: fill_value when unstacking multiple levels: - df = DataFrame({'x': ['a', 'a', 'b'], - 'y': ['j', 'k', 'j'], - 'z': [0, 1, 2], - 'w': [0, 1, 2]}).set_index(['x', 'y', 'z']) - unstacked = df.unstack(['x', 'y'], fill_value=0) - key = ('w', 'b', 'j') + df = DataFrame( + {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]} + ).set_index(["x", "y", "z"]) + unstacked = df.unstack(["x", "y"], fill_value=0) + key = ("w", "b", "j") expected = unstacked[key] result = pd.Series([0, 0, 2], index=unstacked.index, name=key) assert_series_equal(result, expected) - stacked = unstacked.stack(['x', 'y']) + stacked = unstacked.stack(["x", "y"]) stacked.index = stacked.index.reorder_levels(df.index.names) # Workaround for GH #17886 (unnecessarily casts to float): stacked = stacked.astype(np.int64) @@ -178,63 +184,69 @@ def test_unstack_fill(self): assert_frame_equal(result, df) # From a series - s = df['w'] - result = s.unstack(['x', 'y'], fill_value=0) - expected = unstacked['w'] + s = df["w"] + result = s.unstack(["x", "y"], fill_value=0) + expected = unstacked["w"] assert_frame_equal(result, expected) def test_unstack_fill_frame(self): # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] - df = DataFrame(rows, columns=list('AB'), dtype=np.int32) + df = DataFrame(rows, columns=list("AB"), dtype=np.int32) df.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = df.unstack(fill_value=-1) rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] - expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) + expected = DataFrame(rows, index=list("xyz"), dtype=np.int32) expected.columns = MultiIndex.from_tuples( - [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) assert_frame_equal(result, expected) # From a mixed type dataframe - df['A'] = df['A'].astype(np.int16) - df['B'] = df['B'].astype(np.float64) + df["A"] = df["A"].astype(np.int16) + df["B"] = df["B"].astype(np.float64) result = df.unstack(fill_value=-1) - expected['A'] = expected['A'].astype(np.int16) - expected['B'] = expected['B'].astype(np.float64) + expected["A"] = expected["A"].astype(np.int16) + expected["B"] = expected["B"].astype(np.float64) assert_frame_equal(result, expected) # From a dataframe with incorrect data type for fill_value result = df.unstack(fill_value=0.5) rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] - expected = DataFrame(rows, index=list('xyz'), dtype=np.float) + expected = DataFrame(rows, index=list("xyz"), dtype=np.float) expected.columns = MultiIndex.from_tuples( - [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")] + ) assert_frame_equal(result, expected) def 
test_unstack_fill_frame_datetime(self): # Test unstacking with date times - dv = pd.date_range('2012-01-01', periods=4).values + dv = pd.date_range("2012-01-01", periods=4).values data = Series(dv) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]], - 'b': [dv[1], dv[2], pd.NaT]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=dv[0]) - expected = DataFrame({'a': [dv[0], dv[0], dv[3]], - 'b': [dv[1], dv[2], dv[0]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_timedelta(self): @@ -243,228 +255,263 @@ def test_unstack_fill_frame_timedelta(self): td = [Timedelta(days=i) for i in range(4)] data = Series(td) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [td[0], pd.NaT, td[3]], - 'b': [td[1], td[2], pd.NaT]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=td[1]) - expected = DataFrame({'a': [td[0], td[1], td[3]], - 'b': [td[1], td[2], td[1]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_period(self): # Test unstacking with period - periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), - Period('2012-04')] + periods = [ + Period("2012-01"), + Period("2012-02"), + Period("2012-03"), + Period("2012-04"), + ] data = Series(periods) data.index = MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) result = data.unstack() - expected = DataFrame({'a': [periods[0], None, periods[3]], - 'b': [periods[1], periods[2], None]}, - index=['x', 'y', 'z']) + expected = DataFrame( + {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]}, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) result = data.unstack(fill_value=periods[1]) - expected = DataFrame({'a': [periods[0], periods[1], periods[3]], - 'b': [periods[1], periods[2], periods[1]]}, - index=['x', 'y', 'z']) + expected = DataFrame( + { + "a": [periods[0], periods[1], periods[3]], + "b": [periods[1], periods[2], periods[1]], + }, + index=["x", "y", "z"], + ) assert_frame_equal(result, expected) def test_unstack_fill_frame_categorical(self): # Test unstacking with categorical - data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') + data = pd.Series(["a", "b", "c", "a"], dtype="category") data.index = pd.MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')], + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] ) # By default missing values will be NaN result = data.unstack() - expected = DataFrame({'a': pd.Categorical(list('axa'), - categories=list('abc')), - 'b': pd.Categorical(list('bcx'), - categories=list('abc'))}, - index=list('xyz')) + expected = DataFrame( + 
{ + "a": pd.Categorical(list("axa"), categories=list("abc")), + "b": pd.Categorical(list("bcx"), categories=list("abc")), + }, + index=list("xyz"), + ) assert_frame_equal(result, expected) # Fill with non-category results in a TypeError msg = r"'fill_value' \('d'\) is not in" with pytest.raises(TypeError, match=msg): - data.unstack(fill_value='d') + data.unstack(fill_value="d") # Fill with category value replaces missing values as expected - result = data.unstack(fill_value='c') - expected = DataFrame({'a': pd.Categorical(list('aca'), - categories=list('abc')), - 'b': pd.Categorical(list('bcc'), - categories=list('abc'))}, - index=list('xyz')) + result = data.unstack(fill_value="c") + expected = DataFrame( + { + "a": pd.Categorical(list("aca"), categories=list("abc")), + "b": pd.Categorical(list("bcc"), categories=list("abc")), + }, + index=list("xyz"), + ) assert_frame_equal(result, expected) def test_unstack_preserve_dtypes(self): # Checks fix for #11847 - df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'], - index=['a', 'b', 'c'], - some_categories=pd.Series(['a', 'b', 'c'] - ).astype('category'), - A=np.random.rand(3), - B=1, - C='foo', - D=pd.Timestamp('20010102'), - E=pd.Series([1.0, 50.0, 100.0] - ).astype('float32'), - F=pd.Series([3.0, 4.0, 5.0]).astype('float64'), - G=False, - H=pd.Series([1, 200, 923442], dtype='int8'))) + df = pd.DataFrame( + dict( + state=["IL", "MI", "NC"], + index=["a", "b", "c"], + some_categories=pd.Series(["a", "b", "c"]).astype("category"), + A=np.random.rand(3), + B=1, + C="foo", + D=pd.Timestamp("20010102"), + E=pd.Series([1.0, 50.0, 100.0]).astype("float32"), + F=pd.Series([3.0, 4.0, 5.0]).astype("float64"), + G=False, + H=pd.Series([1, 200, 923442], dtype="int8"), + ) + ) def unstack_and_compare(df, column_name): unstacked1 = df.unstack([column_name]) unstacked2 = df.unstack(column_name) assert_frame_equal(unstacked1, unstacked2) - df1 = df.set_index(['state', 'index']) - unstack_and_compare(df1, 'index') + df1 = df.set_index(["state", "index"]) + unstack_and_compare(df1, "index") - df1 = df.set_index(['state', 'some_categories']) - unstack_and_compare(df1, 'some_categories') + df1 = df.set_index(["state", "some_categories"]) + unstack_and_compare(df1, "some_categories") - df1 = df.set_index(['F', 'C']) - unstack_and_compare(df1, 'F') + df1 = df.set_index(["F", "C"]) + unstack_and_compare(df1, "F") - df1 = df.set_index(['G', 'B', 'state']) - unstack_and_compare(df1, 'B') + df1 = df.set_index(["G", "B", "state"]) + unstack_and_compare(df1, "B") - df1 = df.set_index(['E', 'A']) - unstack_and_compare(df1, 'E') + df1 = df.set_index(["E", "A"]) + unstack_and_compare(df1, "E") - df1 = df.set_index(['state', 'index']) - s = df1['A'] - unstack_and_compare(s, 'index') + df1 = df.set_index(["state", "index"]) + s = df1["A"] + unstack_and_compare(s, "index") def test_stack_ints(self): - columns = MultiIndex.from_tuples(list(itertools.product(range(3), - repeat=3))) + columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame(np.random.randn(30, 27), columns=columns) - assert_frame_equal(df.stack(level=[1, 2]), - df.stack(level=1).stack(level=1)) - assert_frame_equal(df.stack(level=[-2, -1]), - df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) - assert_frame_equal(df_named.stack(level=[1, 2]), - 
df_named.stack(level=1).stack(level=1)) + assert_frame_equal( + df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) + ) def test_stack_mixed_levels(self): columns = MultiIndex.from_tuples( - [('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) - animal_hair_stacked = df.stack(level=['animal', 'hair_length']) - exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) # GH #8584: Need to check that stacking works when a number # is passed that is both a level name and in the range of # the level numbers df2 = df.copy() - df2.columns.names = ['exp', 'animal', 1] - assert_frame_equal(df2.stack(level=['animal', 1]), - animal_hair_stacked, check_names=False) - assert_frame_equal(df2.stack(level=['exp', 1]), - exp_hair_stacked, check_names=False) + df2.columns.names = ["exp", "animal", 1] + assert_frame_equal( + df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False + ) # When mixed types are passed and the ints are not level # names, raise - msg = ("level should contain all level names or all level numbers, not" - " a mixture of the two") + msg = ( + "level should contain all level names or all level numbers, not" + " a mixture of the two" + ) with pytest.raises(ValueError, match=msg): - df2.stack(level=['animal', 0]) + df2.stack(level=["animal", 0]) # GH #8584: Having 0 in the level names could raise a # strange error about lexsort depth df3 = df.copy() - df3.columns.names = ['exp', 'animal', 0] - assert_frame_equal(df3.stack(level=['animal', 0]), - animal_hair_stacked, check_names=False) + df3.columns.names = ["exp", "animal", 0] + assert_frame_equal( + df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False + ) def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( - [('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) - exp_animal_stacked = df.stack(level=['exp', 'animal']) - animal_hair_stacked = df.stack(level=['animal', 'hair_length']) - exp_hair_stacked = df.stack(level=['exp', 'hair_length']) + exp_animal_stacked = df.stack(level=["exp", "animal"]) + animal_hair_stacked = df.stack(level=["animal", "hair_length"]) + exp_hair_stacked = df.stack(level=["exp", "hair_length"]) df2 = df.copy() df2.columns.names = [0, 1, 2] - assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, - check_names=False) - assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, - check_names=False) - assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, - check_names=False) + assert_frame_equal( + df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False + ) + assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # 
Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] - assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, - check_names=False) - assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, - check_names=False) - assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, - check_names=False) + assert_frame_equal( + df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False + ) + assert_frame_equal( + df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False + ) + assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False) def test_unstack_bool(self): - df = DataFrame([False, False], - index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]), - columns=['col']) + df = DataFrame( + [False, False], + index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]), + columns=["col"], + ) rs = df.unstack() - xp = DataFrame(np.array([[False, np.nan], [np.nan, False]], - dtype=object), - index=['a', 'b'], - columns=MultiIndex.from_arrays([['col', 'col'], - ['c', 'l']])) + xp = DataFrame( + np.array([[False, np.nan], [np.nan, False]], dtype=object), + index=["a", "b"], + columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), + ) assert_frame_equal(rs, xp) def test_unstack_level_binding(self): # GH9856 mi = pd.MultiIndex( - levels=[['foo', 'bar'], ['one', 'two'], - ['a', 'b']], + levels=[["foo", "bar"], ["one", "two"], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]], - names=['first', 'second', 'third']) + names=["first", "second", "third"], + ) s = pd.Series(0, index=mi) result = s.unstack([1, 2]).stack(0) expected_mi = pd.MultiIndex( - levels=[['foo', 'bar'], ['one', 'two']], + levels=[["foo", "bar"], ["one", "two"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=['first', 'second']) + names=["first", "second"], + ) - expected = pd.DataFrame(np.array([[np.nan, 0], - [0, np.nan], - [np.nan, 0], - [0, np.nan]], - dtype=np.float64), - index=expected_mi, - columns=pd.Index(['a', 'b'], name='third')) + expected = pd.DataFrame( + np.array( + [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64 + ), + index=expected_mi, + columns=pd.Index(["a", "b"], name="third"), + ) assert_frame_equal(result, expected) @@ -477,12 +524,14 @@ def test_unstack_to_series(self): assert_frame_equal(undo, self.frame) # check NA handling - data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) - data.index = Index(['a', 'b', 'c']) + data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + data.index = Index(["a", "b", "c"]) result = data.unstack() - midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']], - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + midx = MultiIndex( + levels=[["x", "y"], ["a", "b", "c"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + ) expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx) assert_series_equal(result, expected) @@ -496,89 +545,94 @@ def test_unstack_to_series(self): def test_unstack_dtypes(self): # GH 2929 - rows = [[1, 1, 3, 4], - [1, 2, 3, 4], - [2, 1, 3, 4], - [2, 2, 3, 4]] + rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] - df = DataFrame(rows, columns=list('ABCD')) + df = DataFrame(rows, columns=list("ABCD")) result = df.dtypes - expected = Series([np.dtype('int64')] * 4, - index=list('ABCD')) + expected = Series([np.dtype("int64")] * 4, index=list("ABCD")) assert_series_equal(result, expected) # single dtype - df2 = df.set_index(['A', 'B']) - df3 = df2.unstack('B') + df2 = df.set_index(["A", "B"]) + df3 = df2.unstack("B") result = 
df3.dtypes - expected = Series([np.dtype('int64')] * 4, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("int64")] * 4, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) # mixed - df2 = df.set_index(['A', 'B']) - df2['C'] = 3. - df3 = df2.unstack('B') + df2 = df.set_index(["A", "B"]) + df2["C"] = 3.0 + df3 = df2.unstack("B") result = df3.dtypes - expected = Series([np.dtype('float64')] * 2 + [np.dtype('int64')] * 2, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) - df2['D'] = 'foo' - df3 = df2.unstack('B') + df2["D"] = "foo" + df3 = df2.unstack("B") result = df3.dtypes - expected = Series([np.dtype('float64')] * 2 + [np.dtype('object')] * 2, - index=pd.MultiIndex.from_arrays([ - ['C', 'C', 'D', 'D'], - [1, 2, 1, 2] - ], names=(None, 'B'))) + expected = Series( + [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + index=pd.MultiIndex.from_arrays( + [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") + ), + ) assert_series_equal(result, expected) # GH7405 - for c, d in (np.zeros(5), np.zeros(5)), \ - (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')): - - df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d, - 'B': pd.date_range('2012-01-01', periods=5)}) + for c, d in ( + (np.zeros(5), np.zeros(5)), + (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")), + ): + + df = DataFrame( + { + "A": ["a"] * 5, + "C": c, + "D": d, + "B": pd.date_range("2012-01-01", periods=5), + } + ) right = df.iloc[:3].copy(deep=True) - df = df.set_index(['A', 'B']) - df['D'] = df['D'].astype('int64') + df = df.set_index(["A", "B"]) + df["D"] = df["D"].astype("int64") left = df.iloc[:3].unstack(0) - right = right.set_index(['A', 'B']).unstack(0) - right[('D', 'a')] = right[('D', 'a')].astype('int64') + right = right.set_index(["A", "B"]).unstack(0) + right[("D", "a")] = right[("D", "a")].astype("int64") assert left.shape == (3, 2) tm.assert_frame_equal(left, right) def test_unstack_non_unique_index_names(self): - idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], - names=['c1', 'c1']) + idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"]) df = DataFrame([1, 2], index=idx) with pytest.raises(ValueError): - df.unstack('c1') + df.unstack("c1") with pytest.raises(ValueError): - df.T.stack('c1') + df.T.stack("c1") def test_unstack_unused_levels(self): # GH 17845: unused codes in index make unstack() cast int to float - idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] + idx = pd.MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1] df = pd.DataFrame([[1, 0]] * 3, index=idx) result = df.unstack() - exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']]) - expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'], - columns=exp_col) + exp_col = pd.MultiIndex.from_product([[0, 1], ["A", "B", "C"]]) + expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col) tm.assert_frame_equal(result, expected) - assert((result.columns.levels[1] == idx.levels[1]).all()) + assert (result.columns.levels[1] == idx.levels[1]).all() # Unused items on both levels levels = [[0, 1, 7], [0, 1, 2, 3]] @@ -587,67 +641,73 @@ def 
test_unstack_unused_levels(self): block = np.arange(4).reshape(2, 2) df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx) result = df.unstack() - expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1], - axis=1), - columns=idx) + expected = pd.DataFrame( + np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx + ) tm.assert_frame_equal(result, expected) - assert((result.columns.levels[1] == idx.levels[1]).all()) + assert (result.columns.levels[1] == idx.levels[1]).all() # With mixed dtype and NaN - levels = [['a', 2, 'c'], [1, 3, 5, 7]] + levels = [["a", 2, "c"], [1, 3, 5, 7]] codes = [[0, -1, 1, 1], [0, 2, -1, 2]] idx = pd.MultiIndex(levels, codes) data = np.arange(8) df = pd.DataFrame(data.reshape(4, 2), index=idx) - cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11], - [np.nan, 'a', 2], [np.nan, 5, 1]), - (1, [8, 11, 1, 4, 12, 15, 13, 16], - [np.nan, 5, 1], [np.nan, 'a', 2])) + cases = ( + (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]), + (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]), + ) for level, idces, col_level, idx_level in cases: result = df.unstack(level=level) exp_data = np.zeros(18) * np.nan exp_data[idces] = data cols = pd.MultiIndex.from_product([[0, 1], col_level]) - expected = pd.DataFrame(exp_data.reshape(3, 6), - index=idx_level, columns=cols) + expected = pd.DataFrame( + exp_data.reshape(3, 6), index=idx_level, columns=cols + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("cols", [['A', 'C'], slice(None)]) + @pytest.mark.parametrize("cols", [["A", "C"], slice(None)]) def test_unstack_unused_level(self, cols): # GH 18562 : unused codes on the unstacked level - df = pd.DataFrame([[2010, 'a', 'I'], - [2011, 'b', 'II']], - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"] + ) - ind = df.set_index(['A', 'B', 'C'], drop=False) - selection = ind.loc[(slice(None), slice(None), 'I'), cols] + ind = df.set_index(["A", "B", "C"], drop=False) + selection = ind.loc[(slice(None), slice(None), "I"), cols] result = selection.unstack() expected = ind.iloc[[0]][cols] - expected.columns = MultiIndex.from_product([expected.columns, ['I']], - names=[None, 'C']) - expected.index = expected.index.droplevel('C') + expected.columns = MultiIndex.from_product( + [expected.columns, ["I"]], names=[None, "C"] + ) + expected.index = expected.index.droplevel("C") tm.assert_frame_equal(result, expected) def test_unstack_nan_index(self): # GH7466 - cast = lambda val: '{0:1}'.format('' if val != val else val) + cast = lambda val: "{0:1}".format("" if val != val else val) def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): - left = sorted(df.iloc[i, j].split('.')) + left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(list(map(cast, right))) assert left == right - df = DataFrame({'jim': ['a', 'b', np.nan, 'd'], - 'joe': ['w', 'x', 'y', 'z'], - 'jolie': ['a.w', 'b.x', ' .y', 'd.z']}) + df = DataFrame( + { + "jim": ["a", "b", np.nan, "d"], + "joe": ["w", "x", "y", "z"], + "jolie": ["a.w", "b.x", " .y", "d.z"], + } + ) - left = df.set_index(['jim', 'joe']).unstack()['jolie'] - right = df.set_index(['joe', 'jim']).unstack()['jolie'].T + left = df.set_index(["jim", "joe"]).unstack()["jolie"] + right = df.set_index(["joe", "jim"]).unstack()["jolie"].T assert_frame_equal(left, right) for idx in 
itertools.permutations(df.columns[:2]): @@ -655,154 +715,208 @@ def verify(df): for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) - verify(udf['jolie']) + verify(udf["jolie"]) - df = DataFrame({'1st': ['d'] * 3 + [np.nan] * 5 + ['a'] * 2 + - ['c'] * 3 + ['e'] * 2 + ['b'] * 5, - '2nd': ['y'] * 2 + ['w'] * 3 + [np.nan] * 3 + - ['z'] * 4 + [np.nan] * 3 + ['x'] * 3 + [np.nan] * 2, - '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, - 50, 62, 59, 76, 52, 14, 53, 60, 51]}) + df = DataFrame( + { + "1st": ["d"] * 3 + + [np.nan] * 5 + + ["a"] * 2 + + ["c"] * 3 + + ["e"] * 2 + + ["b"] * 5, + "2nd": ["y"] * 2 + + ["w"] * 3 + + [np.nan] * 3 + + ["z"] * 4 + + [np.nan] * 3 + + ["x"] * 3 + + [np.nan] * 2, + "3rd": [ + 67, + 39, + 53, + 72, + 57, + 80, + 31, + 18, + 11, + 30, + 59, + 50, + 62, + 59, + 76, + 52, + 14, + 53, + 60, + 51, + ], + } + ) - df['4th'], df['5th'] = \ - df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \ - df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1) + df["4th"], df["5th"] = ( + df.apply(lambda r: ".".join(map(cast, r)), axis=1), + df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), + ) - for idx in itertools.permutations(['1st', '2nd', '3rd']): + for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) - for col in ['4th', '5th']: + for col in ["4th", "5th"]: verify(udf[col]) # GH7403 - df = pd.DataFrame( - {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)}) + df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) - vals = [[3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7]] + vals = [ + [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], + ] vals = list(map(list, zip(*vals))) - idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name='B') - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 'A']) + idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, - 'C': range(8)}) + df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) df.iloc[2, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 'A']) - idx = Index([np.nan, 0, 1, 2, 3], name='B') + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2, - 'C': range(8)}) + df = pd.DataFrame( + {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)} + ) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack(0) + left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] - cols = MultiIndex(levels=[['C'], ['a', 'b']], - codes=[[0, 0], [0, 1]], - names=[None, 
'A']) - idx = Index([np.nan, 0, 1, 2, 3], name='B') + cols = MultiIndex( + levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] + ) + idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH7401 - df = pd.DataFrame({'A': list('aaaaabbbbb'), - 'B': (date_range('2012-01-01', periods=5) - .tolist() * 2), - 'C': np.arange(10)}) + df = pd.DataFrame( + { + "A": list("aaaaabbbbb"), + "B": (date_range("2012-01-01", periods=5).tolist() * 2), + "C": np.arange(10), + } + ) df.iloc[3, 1] = np.NaN - left = df.set_index(['A', 'B']).unstack() + left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) - idx = Index(['a', 'b'], name='A') - cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], - codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], - names=[None, 'B']) + idx = Index(["a", "b"], name="A") + cols = MultiIndex( + levels=[["C"], date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) # GH4862 - vals = [['Hg', np.nan, np.nan, 680585148], - ['U', 0.0, np.nan, 680585148], - ['Pb', 7.07e-06, np.nan, 680585148], - ['Sn', 2.3614e-05, 0.0133, 680607017], - ['Ag', 0.0, 0.0133, 680607017], - ['Hg', -0.00015, 0.0133, 680607017]] - df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'], - index=[17263, 17264, 17265, 17266, 17267, 17268]) + vals = [ + ["Hg", np.nan, np.nan, 680585148], + ["U", 0.0, np.nan, 680585148], + ["Pb", 7.07e-06, np.nan, 680585148], + ["Sn", 2.3614e-05, 0.0133, 680607017], + ["Ag", 0.0, 0.0133, 680607017], + ["Hg", -0.00015, 0.0133, 680607017], + ] + df = DataFrame( + vals, + columns=["agent", "change", "dosage", "s_id"], + index=[17263, 17264, 17265, 17266, 17267, 17268], + ) - left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack() + left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() - vals = [[np.nan, np.nan, 7.07e-06, np.nan, 0.0], - [0.0, -0.00015, np.nan, 2.3614e-05, np.nan]] + vals = [ + [np.nan, np.nan, 7.07e-06, np.nan, 0.0], + [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], + ] - idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], - codes=[[0, 1], [-1, 0]], - names=['s_id', 'dosage']) + idx = MultiIndex( + levels=[[680585148, 680607017], [0.0133]], + codes=[[0, 1], [-1, 0]], + names=["s_id", "dosage"], + ) - cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], - codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], - names=[None, 'agent']) + cols = MultiIndex( + levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], + codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], + names=[None, "agent"], + ) right = DataFrame(vals, columns=cols, index=idx) assert_frame_equal(left, right) - left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent']) + left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls - df = DataFrame({'1st': [1, 2, 1, 2, 1, 2], - '2nd': pd.date_range('2014-02-01', periods=6, - freq='D'), - 'jim': 100 + np.arange(6), - 'joe': (np.random.randn(6) * 10).round(2)}) + df = DataFrame( + { + "1st": [1, 2, 1, 2, 1, 2], + "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), + "jim": 100 + np.arange(6), + "joe": (np.random.randn(6) * 10).round(2), + } + ) - df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02') - df.loc[1, '2nd'] = df.loc[3, '2nd'] = 
np.nan - df.loc[1, '3rd'] = df.loc[4, '3rd'] = np.nan + df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") + df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan + df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan - left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd']) + left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) - for col in ['jim', 'joe']: + for col in ["jim", "joe"]: for _, r in df.iterrows(): - key = r['1st'], (col, r['2nd'], r['3rd']) + key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key] def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) - df = DataFrame( - [1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')])) + df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() - eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)]) - ecols = MultiIndex.from_tuples([(t, 'A')]) + eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) + ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) assert_frame_equal(result, expected) def test_stack_partial_multiIndex(self): # GH 8844 def _test_stack_with_multiindex(multiindex): - df = DataFrame(np.arange(3 * len(multiindex)) - .reshape(3, len(multiindex)), - columns=multiindex) + df = DataFrame( + np.arange(3 * len(multiindex)).reshape(3, len(multiindex)), + columns=multiindex, + ) for level in (-1, 0, 1, [0, 1], [1, 0]): result = df.stack(level=level, dropna=False) @@ -816,48 +930,57 @@ def _test_stack_with_multiindex(multiindex): else: assert_frame_equal(result, expected) - df.columns = MultiIndex.from_tuples(df.columns.to_numpy(), - names=df.columns.names) + df.columns = MultiIndex.from_tuples( + df.columns.to_numpy(), names=df.columns.names + ) expected = df.stack(level=level, dropna=False) if isinstance(expected, Series): assert_series_equal(result, expected) else: assert_frame_equal(result, expected) - full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'), - ('A', 'y'), - ('C', 'x'), ('C', 'u')], - names=['Upper', 'Lower']) - for multiindex_columns in ([0, 1, 2, 3, 4], - [0, 1, 2, 3], [0, 1, 2, 4], - [0, 1, 2], [1, 2, 3], [2, 3, 4], - [0, 1], [0, 2], [0, 3], - [0], [2], [4]): + full_multiindex = MultiIndex.from_tuples( + [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")], + names=["Upper", "Lower"], + ) + for multiindex_columns in ( + [0, 1, 2, 3, 4], + [0, 1, 2, 3], + [0, 1, 2, 4], + [0, 1, 2], + [1, 2, 3], + [2, 3, 4], + [0, 1], + [0, 2], + [0, 3], + [0], + [2], + [4], + ): _test_stack_with_multiindex(full_multiindex[multiindex_columns]) if len(multiindex_columns) > 1: multiindex_columns.reverse() - _test_stack_with_multiindex( - full_multiindex[multiindex_columns]) + _test_stack_with_multiindex(full_multiindex[multiindex_columns]) - df = DataFrame(np.arange(6).reshape(2, 3), - columns=full_multiindex[[0, 1, 3]]) + df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]]) result = df.stack(dropna=False) - expected = DataFrame([[0, 2], [1, np.nan], [3, 5], [4, np.nan]], - index=MultiIndex( - levels=[[0, 1], ['u', 'x', 'y', 'z']], - codes=[[0, 0, 1, 1], - [1, 3, 1, 3]], - names=[None, 'Lower']), - columns=Index(['B', 'C'], name='Upper'), - dtype=df.dtypes[0]) + expected = DataFrame( + [[0, 2], [1, np.nan], [3, 5], [4, np.nan]], + index=MultiIndex( + levels=[[0, 1], ["u", "x", "y", "z"]], + codes=[[0, 0, 1, 1], [1, 3, 1, 3]], + names=[None, "Lower"], + ), + columns=Index(["B", "C"], 
name="Upper"), + dtype=df.dtypes[0], + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('ordered', [False, True]) - @pytest.mark.parametrize('labels', [list("yxz"), list("yxy")]) + @pytest.mark.parametrize("ordered", [False, True]) + @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")]) def test_stack_preserve_categorical_dtype(self, ordered, labels): # GH13854 - cidx = pd.CategoricalIndex(labels, categories=list("xyz"), - ordered=ordered) + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() @@ -870,43 +993,51 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): def test_stack_preserve_categorical_dtype_values(self): # GH-23077 - cat = pd.Categorical(['a', 'a', 'b', 'c']) + cat = pd.Categorical(["a", "a", "b", "c"]) df = pd.DataFrame({"A": cat, "B": cat}) result = df.stack() - index = pd.MultiIndex.from_product([[0, 1, 2, 3], ['A', 'B']]) - expected = pd.Series(pd.Categorical(['a', 'a', 'a', 'a', - 'b', 'b', 'c', 'c']), - index=index) + index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) + expected = pd.Series( + pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index + ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('level', [0, 1]) + @pytest.mark.parametrize("level", [0, 1]) def test_unstack_mixed_extension_types(self, level): - index = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 1)], - names=['a', 'b']) - df = pd.DataFrame({"A": pd.core.arrays.integer_array([0, 1, None]), - "B": pd.Categorical(['a', 'a', 'b'])}, index=index) + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 1)], names=["a", "b"] + ) + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([0, 1, None]), + "B": pd.Categorical(["a", "a", "b"]), + }, + index=index, + ) result = df.unstack(level=level) expected = df.astype(object).unstack(level=level) - expected_dtypes = pd.Series([df.A.dtype] * 2 + [df.B.dtype] * 2, - index=result.columns) + expected_dtypes = pd.Series( + [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns + ) tm.assert_series_equal(result.dtypes, expected_dtypes) tm.assert_frame_equal(result.astype(object), expected) - @pytest.mark.parametrize("level", [0, 'baz']) + @pytest.mark.parametrize("level", [0, "baz"]) def test_unstack_swaplevel_sortlevel(self, level): # GH 20994 - mi = pd.MultiIndex.from_product([[0], ['d', 'c']], - names=['bar', 'baz']) - df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A']) - df.columns.name = 'foo' - - expected = pd.DataFrame([ - [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([ - ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[ - 'baz', 'foo'])) - expected.index.name = 'bar' + mi = pd.MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"]) + df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"]) + df.columns.name = "foo" + + expected = pd.DataFrame( + [[3, 1, 2, 0]], + columns=pd.MultiIndex.from_tuples( + [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"] + ), + ) + expected.index.name = "bar" result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) tm.assert_frame_equal(result, expected) @@ -914,55 +1045,61 @@ def test_unstack_swaplevel_sortlevel(self, level): def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
- data = pd.Series(['a', 'b', 'c', 'a'], dtype='object') + data = pd.Series(["a", "b", "c", "a"], dtype="object") data.index = pd.MultiIndex.from_tuples( - [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) + [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")] + ) # By default missing values will be NaN result = data.unstack() expected = pd.DataFrame( - {'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]}, - index=list('xyz') + {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") ) assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected - result = data.unstack(fill_value='d') + result = data.unstack(fill_value="d") expected = pd.DataFrame( - {'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']}, - index=list('xyz') + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") ) assert_frame_equal(result, expected) def test_unstack_timezone_aware_values(): # GH 18338 - df = pd.DataFrame({ - 'timestamp': [ - pd.Timestamp('2017-08-27 01:00:00.709949+0000', tz='UTC')], - 'a': ['a'], - 'b': ['b'], - 'c': ['c'], - }, columns=['timestamp', 'a', 'b', 'c']) - result = df.set_index(['a', 'b']).unstack() - expected = pd.DataFrame([[pd.Timestamp('2017-08-27 01:00:00.709949+0000', - tz='UTC'), - 'c']], - index=pd.Index(['a'], name='a'), - columns=pd.MultiIndex( - levels=[['timestamp', 'c'], ['b']], - codes=[[0, 1], [0, 0]], - names=[None, 'b'])) + df = pd.DataFrame( + { + "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")], + "a": ["a"], + "b": ["b"], + "c": ["c"], + }, + columns=["timestamp", "a", "b", "c"], + ) + result = df.set_index(["a", "b"]).unstack() + expected = pd.DataFrame( + [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]], + index=pd.Index(["a"], name="a"), + columns=pd.MultiIndex( + levels=[["timestamp", "c"], ["b"]], + codes=[[0, 1], [0, 0]], + names=[None, "b"], + ), + ) assert_frame_equal(result, expected) def test_stack_timezone_aware_values(): # GH 19420 - ts = pd.date_range(freq="D", start="20180101", end="20180103", - tz="America/New_York") + ts = pd.date_range( + freq="D", start="20180101", end="20180103", tz="America/New_York" + ) df = pd.DataFrame({"A": ts}, index=["a", "b", "c"]) result = df.stack() - expected = pd.Series(ts, - index=pd.MultiIndex(levels=[['a', 'b', 'c'], ['A']], - codes=[[0, 1, 2], [0, 0, 0]])) + expected = pd.Series( + ts, + index=pd.MultiIndex( + levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]] + ), + ) assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 3dca82a229b2b..2bcc115bcd09c 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -10,32 +10,34 @@ @pytest.fixture def df_none(): - return DataFrame({ - 'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 2, 2, 1, 1], - 'A': np.arange(6, 0, -1), - ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']}) + return DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 2, 2, 1, 1], + "A": np.arange(6, 0, -1), + ("B", 5): ["one", "one", "two", "two", "one", "one"], + } + ) -@pytest.fixture(params=[ - ['outer'], - ['outer', 'inner'] -]) +@pytest.fixture(params=[["outer"], ["outer", "inner"]]) def df_idx(request, df_none): levels = request.param return df_none.set_index(levels) -@pytest.fixture(params=[ - 'inner', # index level - ['outer'], # list of index level - 'A', # column - [('B', 5)], # list of column - 
['inner', 'outer'], # two index levels - [('B', 5), 'outer'], # index level and column - ['A', ('B', 5)], # Two columns - ['inner', 'outer'] # two index levels and column -]) +@pytest.fixture( + params=[ + "inner", # index level + ["outer"], # list of index level + "A", # column + [("B", 5)], # list of column + ["inner", "outer"], # two index levels + [("B", 5), "outer"], # index level and column + ["A", ("B", 5)], # Two columns + ["inner", "outer"], # two index levels and column + ] +) def sort_names(request): return request.param @@ -45,8 +47,7 @@ def ascending(request): return request.param -def test_sort_index_level_and_column_label( - df_none, df_idx, sort_names, ascending): +def test_sort_index_level_and_column_label(df_none, df_idx, sort_names, ascending): # GH 14353 @@ -54,20 +55,17 @@ def test_sort_index_level_and_column_label( levels = df_idx.index.names # Compute expected by sorting on columns and the setting index - expected = df_none.sort_values(by=sort_names, - ascending=ascending, - axis=0).set_index(levels) + expected = df_none.sort_values( + by=sort_names, ascending=ascending, axis=0 + ).set_index(levels) # Compute result sorting on mix on columns and index levels - result = df_idx.sort_values(by=sort_names, - ascending=ascending, - axis=0) + result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0) assert_frame_equal(result, expected) -def test_sort_column_level_and_index_label( - df_none, df_idx, sort_names, ascending): +def test_sort_column_level_and_index_label(df_none, df_idx, sort_names, ascending): # GH 14353 @@ -77,20 +75,19 @@ def test_sort_column_level_and_index_label( # Compute expected by sorting on axis=0, setting index levels, and then # transposing. For some cases this will result in a frame with # multiple column levels - expected = df_none.sort_values(by=sort_names, - ascending=ascending, - axis=0).set_index(levels).T + expected = ( + df_none.sort_values(by=sort_names, ascending=ascending, axis=0) + .set_index(levels) + .T + ) # Compute result by transposing and sorting on axis=1. 
- result = df_idx.T.sort_values(by=sort_names, - ascending=ascending, - axis=1) + result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1) if len(levels) > 1: # Accessing multi-level columns that are not lexsorted raises a # performance warning - with tm.assert_produces_warning(PerformanceWarning, - check_stacklevel=False): + with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False): assert_frame_equal(result, expected) else: assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 11de77f6779e6..b6442d8938843 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, IntervalIndex, MultiIndex, NaT, Series, Timestamp, - date_range) + Categorical, + DataFrame, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, +) from pandas.api.types import CategoricalDtype from pandas.tests.frame.common import TestData import pandas.util.testing as tm @@ -14,44 +21,43 @@ class TestDataFrameSorting(TestData): - def test_sort_values(self): - frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], - index=[1, 2, 3], columns=list('ABC')) + frame = DataFrame( + [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") + ) # by column (axis=0) - sorted_df = frame.sort_values(by='A') - indexer = frame['A'].argsort().values + sorted_df = frame.sort_values(by="A") + indexer = frame["A"].argsort().values expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by='A', ascending=False) + sorted_df = frame.sort_values(by="A", ascending=False) indexer = indexer[::-1] expected = frame.loc[frame.index[indexer]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by='A', ascending=False) + sorted_df = frame.sort_values(by="A", ascending=False) assert_frame_equal(sorted_df, expected) # GH4839 - sorted_df = frame.sort_values(by=['A'], ascending=[False]) + sorted_df = frame.sort_values(by=["A"], ascending=[False]) assert_frame_equal(sorted_df, expected) # multiple bys - sorted_df = frame.sort_values(by=['B', 'C']) + sorted_df = frame.sort_values(by=["B", "C"]) expected = frame.loc[[2, 1, 3]] assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=['B', 'C'], ascending=False) + sorted_df = frame.sort_values(by=["B", "C"], ascending=False) assert_frame_equal(sorted_df, expected[::-1]) - sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) + sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) assert_frame_equal(sorted_df, expected) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): - frame.sort_values(by=['A', 'B'], axis=2, inplace=True) + frame.sort_values(by=["A", "B"], axis=2, inplace=True) # by row (axis=1): GH 10806 sorted_df = frame.sort_values(by=3, axis=1) @@ -59,32 +65,32 @@ def test_sort_values(self): assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=3, axis=1, ascending=False) - expected = frame.reindex(columns=['C', 'B', 'A']) + expected = frame.reindex(columns=["C", "B", "A"]) assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=[1, 2], axis='columns') - expected = frame.reindex(columns=['B', 'A', 'C']) + sorted_df = frame.sort_values(by=[1, 2], axis="columns") + expected = frame.reindex(columns=["B", 
"A", "C"]) assert_frame_equal(sorted_df, expected) - sorted_df = frame.sort_values(by=[1, 3], axis=1, - ascending=[True, False]) + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) - expected = frame.reindex(columns=['C', 'B', 'A']) + expected = frame.reindex(columns=["C", "B", "A"]) assert_frame_equal(sorted_df, expected) - msg = r'Length of ascending \(5\) != length of by \(2\)' + msg = r"Length of ascending \(5\) != length of by \(2\)" with pytest.raises(ValueError, match=msg): - frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) + frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5) def test_sort_values_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) sorted_df = frame.copy() - sorted_df.sort_values(by='A', inplace=True) - expected = frame.sort_values(by='A') + sorted_df.sort_values(by="A", inplace=True) + expected = frame.sort_values(by="A") assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() @@ -93,190 +99,204 @@ def test_sort_values_inplace(self): assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by='A', ascending=False, inplace=True) - expected = frame.sort_values(by='A', ascending=False) + sorted_df.sort_values(by="A", ascending=False, inplace=True) + expected = frame.sort_values(by="A", ascending=False) assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) - expected = frame.sort_values(by=['A', 'B'], ascending=False) + sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True) + expected = frame.sort_values(by=["A", "B"], ascending=False) assert_frame_equal(sorted_df, expected) def test_sort_nan(self): # GH3917 nan = np.nan - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}) + df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) # sort one column only expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 9, 2, nan, 5, 5, 4]}, - index=[2, 0, 3, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A'], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A"], na_position="first") assert_frame_equal(sorted_df, expected) expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3]) - sorted_df = df.sort_values(['A'], na_position='first', ascending=False) + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values(["A"], na_position="first", ascending=False) assert_frame_equal(sorted_df, expected) - expected = df.reindex(columns=['B', 'A']) - sorted_df = df.sort_values(by=1, axis=1, na_position='first') + expected = df.reindex(columns=["B", "A"]) + sorted_df = df.sort_values(by=1, axis=1, na_position="first") assert_frame_equal(sorted_df, expected) # na_position='last', order expected = DataFrame( - {'A': [1, 1, 2, 4, 6, 8, nan], - 'B': [2, 9, nan, 5, 5, 4, 5]}, - index=[3, 0, 1, 6, 4, 5, 2]) - sorted_df = df.sort_values(['A', 'B']) + {"A": [1, 1, 2, 4, 6, 8, nan], "B": [2, 9, nan, 5, 5, 4, 5]}, + index=[3, 0, 1, 6, 4, 5, 2], + ) + sorted_df = df.sort_values(["A", 
"B"]) assert_frame_equal(sorted_df, expected) # na_position='first', order expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 2, 9, nan, 5, 5, 4]}, - index=[2, 3, 0, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A', 'B'], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, nan, 5, 5, 4]}, + index=[2, 3, 0, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], na_position="first") assert_frame_equal(sorted_df, expected) # na_position='first', not order expected = DataFrame( - {'A': [nan, 1, 1, 2, 4, 6, 8], - 'B': [5, 9, 2, nan, 5, 5, 4]}, - index=[2, 0, 3, 1, 6, 4, 5]) - sorted_df = df.sort_values(['A', 'B'], ascending=[ - 1, 0], na_position='first') + {"A": [nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, nan, 5, 5, 4]}, + index=[2, 0, 3, 1, 6, 4, 5], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first") assert_frame_equal(sorted_df, expected) # na_position='last', not order expected = DataFrame( - {'A': [8, 6, 4, 2, 1, 1, nan], - 'B': [4, 5, 5, nan, 2, 9, 5]}, - index=[5, 4, 6, 1, 3, 0, 2]) - sorted_df = df.sort_values(['A', 'B'], ascending=[ - 0, 1], na_position='last') + {"A": [8, 6, 4, 2, 1, 1, nan], "B": [4, 5, 5, nan, 2, 9, 5]}, + index=[5, 4, 6, 1, 3, 0, 2], + ) + sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last") assert_frame_equal(sorted_df, expected) # Test DataFrame with nan label - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan]) + df = DataFrame( + {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, nan], + ) # NaN label, ascending=True, na_position='last' - sorted_df = df.sort_index( - kind='quicksort', ascending=True, na_position='last') - expected = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}, - index=[1, 2, 3, 4, 5, 6, nan]) + sorted_df = df.sort_index(kind="quicksort", ascending=True, na_position="last") + expected = DataFrame( + {"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}, + index=[1, 2, 3, 4, 5, 6, nan], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=True, na_position='first' - sorted_df = df.sort_index(na_position='first') - expected = DataFrame({'A': [4, 1, 2, nan, 1, 6, 8], - 'B': [5, 9, nan, 5, 2, 5, 4]}, - index=[nan, 1, 2, 3, 4, 5, 6]) + sorted_df = df.sort_index(na_position="first") + expected = DataFrame( + {"A": [4, 1, 2, nan, 1, 6, 8], "B": [5, 9, nan, 5, 2, 5, 4]}, + index=[nan, 1, 2, 3, 4, 5, 6], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='last' - sorted_df = df.sort_index(kind='quicksort', ascending=False) - expected = DataFrame({'A': [8, 6, 1, nan, 2, 1, 4], - 'B': [4, 5, 2, 5, nan, 9, 5]}, - index=[6, 5, 4, 3, 2, 1, nan]) + sorted_df = df.sort_index(kind="quicksort", ascending=False) + expected = DataFrame( + {"A": [8, 6, 1, nan, 2, 1, 4], "B": [4, 5, 2, 5, nan, 9, 5]}, + index=[6, 5, 4, 3, 2, 1, nan], + ) assert_frame_equal(sorted_df, expected) # NaN label, ascending=False, na_position='first' sorted_df = df.sort_index( - kind='quicksort', ascending=False, na_position='first') - expected = DataFrame({'A': [4, 8, 6, 1, nan, 2, 1], - 'B': [5, 4, 5, 2, 5, nan, 9]}, - index=[nan, 6, 5, 4, 3, 2, 1]) + kind="quicksort", ascending=False, na_position="first" + ) + expected = DataFrame( + {"A": [4, 8, 6, 1, nan, 2, 1], "B": [5, 4, 5, 2, 5, nan, 9]}, + index=[nan, 6, 5, 4, 3, 2, 1], + ) assert_frame_equal(sorted_df, expected) def test_stable_descending_sort(self): # GH #6399 - df 
= DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], - columns=['sort_col', 'order']) - sorted_df = df.sort_values(by='sort_col', kind='mergesort', - ascending=False) + df = DataFrame( + [[2, "first"], [2, "second"], [1, "a"], [1, "b"]], + columns=["sort_col", "order"], + ) + sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False) assert_frame_equal(df, sorted_df) def test_stable_descending_multicolumn_sort(self): nan = np.nan - df = DataFrame({'A': [1, 2, nan, 1, 6, 8, 4], - 'B': [9, nan, 5, 2, 5, 4, 5]}) + df = DataFrame({"A": [1, 2, nan, 1, 6, 8, 4], "B": [9, nan, 5, 2, 5, 4, 5]}) # test stable mergesort expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 2, 9]}, - index=[2, 5, 4, 6, 1, 3, 0]) - sorted_df = df.sort_values(['A', 'B'], ascending=[0, 1], - na_position='first', - kind='mergesort') + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 2, 9]}, + index=[2, 5, 4, 6, 1, 3, 0], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort" + ) assert_frame_equal(sorted_df, expected) expected = DataFrame( - {'A': [nan, 8, 6, 4, 2, 1, 1], - 'B': [5, 4, 5, 5, nan, 9, 2]}, - index=[2, 5, 4, 6, 1, 0, 3]) - sorted_df = df.sort_values(['A', 'B'], ascending=[0, 0], - na_position='first', - kind='mergesort') + {"A": [nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, nan, 9, 2]}, + index=[2, 5, 4, 6, 1, 0, 3], + ) + sorted_df = df.sort_values( + ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort" + ) assert_frame_equal(sorted_df, expected) def test_sort_multi_index(self): # GH 25775, testing that sorting by index works with a multi-index. - df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0], - 'c': [0, 1, 2], 'd': list('abc')}) - result = df.set_index(list('abc')).sort_index(level=list('ba')) + df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + result = df.set_index(list("abc")).sort_index(level=list("ba")) - expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0], - 'c': [1, 2, 0], 'd': list('bca')}) - expected = expected.set_index(list('abc')) + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) tm.assert_frame_equal(result, expected) def test_stable_categorial(self): # GH 16793 - df = DataFrame({ - 'x': pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True) - }) + df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)}) expected = df.copy() - sorted_df = df.sort_values('x', kind='mergesort') + sorted_df = df.sort_values("x", kind="mergesort") assert_frame_equal(sorted_df, expected) def test_sort_datetimes(self): # GH 3461, argsort / lexsort differences for a datetime column - df = DataFrame(['a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'], - columns=['A'], - index=date_range('20130101', periods=9)) - dts = [Timestamp(x) - for x in ['2004-02-11', '2004-01-21', '2004-01-26', - '2005-09-20', '2010-10-04', '2009-05-12', - '2008-11-12', '2010-09-28', '2010-09-28']] - df['B'] = dts[::2] + dts[1::2] - df['C'] = 2. - df['A1'] = 3. 
- - df1 = df.sort_values(by='A') - df2 = df.sort_values(by=['A']) + df = DataFrame( + ["a", "a", "a", "b", "c", "d", "e", "f", "g"], + columns=["A"], + index=date_range("20130101", periods=9), + ) + dts = [ + Timestamp(x) + for x in [ + "2004-02-11", + "2004-01-21", + "2004-01-26", + "2005-09-20", + "2010-10-04", + "2009-05-12", + "2008-11-12", + "2010-09-28", + "2010-09-28", + ] + ] + df["B"] = dts[::2] + dts[1::2] + df["C"] = 2.0 + df["A1"] = 3.0 + + df1 = df.sort_values(by="A") + df2 = df.sort_values(by=["A"]) assert_frame_equal(df1, df2) - df1 = df.sort_values(by='B') - df2 = df.sort_values(by=['B']) + df1 = df.sort_values(by="B") + df2 = df.sort_values(by=["B"]) assert_frame_equal(df1, df2) - df1 = df.sort_values(by='B') + df1 = df.sort_values(by="B") - df2 = df.sort_values(by=['C', 'B']) + df2 = df.sort_values(by=["C", "B"]) assert_frame_equal(df1, df2) def test_frame_column_inplace_sort_exception(self): - s = self.frame['A'] + s = self.frame["A"] with pytest.raises(ValueError, match="This Series is a view"): s.sort_values(inplace=True) @@ -293,13 +313,15 @@ def test_sort_nat_values_in_int_column(self): int_values = (2, int(NaT)) float_values = (2.0, -1.797693e308) - df = DataFrame(dict(int=int_values, float=float_values), - columns=["int", "float"]) + df = DataFrame( + dict(int=int_values, float=float_values), columns=["int", "float"] + ) - df_reversed = DataFrame(dict(int=int_values[::-1], - float=float_values[::-1]), - columns=["int", "float"], - index=[1, 0]) + df_reversed = DataFrame( + dict(int=int_values[::-1], float=float_values[::-1]), + columns=["int", "float"], + index=[1, 0], + ) # NaT is not a "na" for int64 columns, so na_position must not # influence the result: @@ -315,13 +337,16 @@ def test_sort_nat_values_in_int_column(self): # and now check if NaT is still considered as "na" for datetime64 # columns: - df = DataFrame(dict(datetime=[Timestamp("2016-01-01"), NaT], - float=float_values), columns=["datetime", "float"]) + df = DataFrame( + dict(datetime=[Timestamp("2016-01-01"), NaT], float=float_values), + columns=["datetime", "float"], + ) - df_reversed = DataFrame(dict(datetime=[NaT, Timestamp("2016-01-01")], - float=float_values[::-1]), - columns=["datetime", "float"], - index=[1, 0]) + df_reversed = DataFrame( + dict(datetime=[NaT, Timestamp("2016-01-01")], float=float_values[::-1]), + columns=["datetime", "float"], + index=[1, 0], + ) df_sorted = df.sort_values(["datetime", "float"], na_position="first") assert_frame_equal(df_sorted, df_reversed) @@ -337,68 +362,70 @@ def test_sort_nat(self): # GH 16836 - d1 = [Timestamp(x) for x in ['2016-01-01', '2015-01-01', - np.nan, '2016-01-01']] - d2 = [Timestamp(x) for x in ['2017-01-01', '2014-01-01', - '2016-01-01', '2015-01-01']] - df = pd.DataFrame({'a': d1, 'b': d2}, index=[0, 1, 2, 3]) - - d3 = [Timestamp(x) for x in ['2015-01-01', '2016-01-01', - '2016-01-01', np.nan]] - d4 = [Timestamp(x) for x in ['2014-01-01', '2015-01-01', - '2017-01-01', '2016-01-01']] - expected = pd.DataFrame({'a': d3, 'b': d4}, index=[1, 3, 0, 2]) - sorted_df = df.sort_values(by=['a', 'b'], ) + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = pd.DataFrame({"a": d3, 
"b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) tm.assert_frame_equal(sorted_df, expected) class TestDataFrameSortIndexKinds(TestData): - def test_sort_index_multicolumn(self): A = np.arange(5).repeat(20) B = np.tile(np.arange(5), 20) random.shuffle(A) random.shuffle(B) - frame = DataFrame({'A': A, 'B': B, - 'C': np.random.randn(100)}) + frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['A', 'B']) - result = frame.sort_values(by=['A', 'B']) - indexer = np.lexsort((frame['B'], frame['A'])) + frame.sort_index(by=["A", "B"]) + result = frame.sort_values(by=["A", "B"]) + indexer = np.lexsort((frame["B"], frame["A"])) expected = frame.take(indexer) assert_frame_equal(result, expected) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['A', 'B'], ascending=False) - result = frame.sort_values(by=['A', 'B'], ascending=False) - indexer = np.lexsort((frame['B'].rank(ascending=False), - frame['A'].rank(ascending=False))) + frame.sort_index(by=["A", "B"], ascending=False) + result = frame.sort_values(by=["A", "B"], ascending=False) + indexer = np.lexsort( + (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False)) + ) expected = frame.take(indexer) assert_frame_equal(result, expected) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - frame.sort_index(by=['B', 'A']) - result = frame.sort_values(by=['B', 'A']) - indexer = np.lexsort((frame['A'], frame['B'])) + frame.sort_index(by=["B", "A"]) + result = frame.sort_values(by=["B", "A"]) + indexer = np.lexsort((frame["A"], frame["B"])) expected = frame.take(indexer) assert_frame_equal(result, expected) def test_sort_index_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) # axis=0 unordered = frame.loc[[3, 2, 4, 1]] - a_id = id(unordered['A']) + a_id = id(unordered["A"]) df = unordered.copy() df.sort_index(inplace=True) expected = frame assert_frame_equal(df, expected) - assert a_id != id(df['A']) + assert a_id != id(df["A"]) df = unordered.copy() df.sort_index(ascending=False, inplace=True) @@ -406,7 +433,7 @@ def test_sort_index_inplace(self): assert_frame_equal(df, expected) # axis=1 - unordered = frame.loc[:, ['D', 'B', 'C', 'A']] + unordered = frame.loc[:, ["D", "B", "C", "A"]] df = unordered.copy() df.sort_index(axis=1, inplace=True) expected = frame @@ -425,114 +452,115 @@ def test_sort_index_different_sortorder(self): A = A.take(indexer) B = B.take(indexer) - df = DataFrame({'A': A, 'B': B, - 'C': np.random.randn(100)}) + df = DataFrame({"A": A, "B": B, "C": np.random.randn(100)}) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=['A', 'B'], ascending=[1, 0]) - result = df.sort_values(by=['A', 'B'], ascending=[1, 0]) + df.sort_index(by=["A", "B"], ascending=[1, 0]) + result = df.sort_values(by=["A", "B"], ascending=[1, 0]) ex_indexer = np.lexsort((df.B.max() - df.B, df.A)) expected = df.take(ex_indexer) assert_frame_equal(result, expected) # test with multiindex, too - idf = df.set_index(['A', 'B']) + idf = df.set_index(["A", "B"]) result = idf.sort_index(ascending=[1, 0]) expected = idf.take(ex_indexer) assert_frame_equal(result, expected) # also, Series! 
- result = idf['C'].sort_index(ascending=[1, 0]) - assert_series_equal(result, expected['C']) + result = idf["C"].sort_index(ascending=[1, 0]) + assert_series_equal(result, expected["C"]) def test_sort_index_duplicates(self): # with 9816, these are all translated to .sort_values - df = DataFrame([range(5, 9), range(4)], - columns=['a', 'a', 'b', 'b']) + df = DataFrame([range(5, 9), range(4)], columns=["a", "a", "b", "b"]) - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by='a') - with pytest.raises(ValueError, match='not unique'): - df.sort_values(by='a') + df.sort_index(by="a") + with pytest.raises(ValueError, match="not unique"): + df.sort_values(by="a") - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=['a']) - with pytest.raises(ValueError, match='not unique'): - df.sort_values(by=['a']) + df.sort_index(by=["a"]) + with pytest.raises(ValueError, match="not unique"): + df.sort_values(by=["a"]) - with pytest.raises(ValueError, match='not unique'): + with pytest.raises(ValueError, match="not unique"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): # multi-column 'by' is separate codepath - df.sort_index(by=['a', 'b']) - with pytest.raises(ValueError, match='not unique'): + df.sort_index(by=["a", "b"]) + with pytest.raises(ValueError, match="not unique"): # multi-column 'by' is separate codepath - df.sort_values(by=['a', 'b']) + df.sort_values(by=["a", "b"]) # with multi-index # GH4370 - df = DataFrame(np.random.randn(4, 2), - columns=MultiIndex.from_tuples([('a', 0), ('a', 1)])) - with pytest.raises(ValueError, match='level'): + df = DataFrame( + np.random.randn(4, 2), columns=MultiIndex.from_tuples([("a", 0), ("a", 1)]) + ) + with pytest.raises(ValueError, match="level"): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by='a') - with pytest.raises(ValueError, match='level'): - df.sort_values(by='a') + df.sort_index(by="a") + with pytest.raises(ValueError, match="level"): + df.sort_values(by="a") # convert tuples to a list of tuples # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=[('a', 1)]) - expected = df.sort_values(by=[('a', 1)]) + df.sort_index(by=[("a", 1)]) + expected = df.sort_values(by=[("a", 1)]) # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): - df.sort_index(by=('a', 1)) - result = df.sort_values(by=('a', 1)) + df.sort_index(by=("a", 1)) + result = df.sort_values(by=("a", 1)) assert_frame_equal(result, expected) def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(level='A', sort_remaining=False) + result = df.sort_index(level="A", sort_remaining=False) expected = df assert_frame_equal(result, expected) - result = df.sort_index(level=['A', 'B'], sort_remaining=False) + result = df.sort_index(level=["A", "B"], sort_remaining=False) expected = df assert_frame_equal(result, expected) # Error thrown by sort_index when # first index is sorted last (#26053) - result = df.sort_index(level=['C', 'B', 'A']) + result = df.sort_index(level=["C", "B", "A"]) expected = df.iloc[[1, 
0]] assert_frame_equal(result, expected) - result = df.sort_index(level=['B', 'C', 'A']) + result = df.sort_index(level=["B", "C", "A"]) expected = df.iloc[[1, 0]] assert_frame_equal(result, expected) - result = df.sort_index(level=['C', 'A']) + result = df.sort_index(level=["C", "A"]) expected = df.iloc[[1, 0]] assert_frame_equal(result, expected) def test_sort_index_categorical_index(self): - df = (DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')) - .astype(CategoricalDtype(list('cab')))}) - .set_index('B')) + df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") result = df.sort_index() expected = df.iloc[[4, 0, 1, 5, 2, 3]] @@ -545,8 +573,11 @@ def test_sort_index_categorical_index(self): def test_sort_index(self): # GH13496 - frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) + frame = DataFrame( + np.arange(16).reshape(4, 4), + index=[1, 2, 3, 4], + columns=["A", "B", "C", "D"], + ) # axis=0 : sort rows by index labels unordered = frame.loc[[3, 2, 4, 1]] @@ -567,35 +598,28 @@ def test_sort_index(self): expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([ - [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples( + [[2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list("ABC") + ) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 2], - [2, 1, 3]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [3, 4], - [1, 2]], index=expected_mi) + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 2], [2, 1, 3]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [3, 4], [1, 2]], index=expected_mi) result = df.sort_index(level=level) assert_frame_equal(result, expected) # sort_remaining=False - expected_mi = MultiIndex.from_tuples([ - [1, 1, 1], - [2, 1, 3], - [2, 1, 2]], names=list('ABC')) - expected = pd.DataFrame([ - [5, 6], - [1, 2], - [3, 4]], index=expected_mi) + expected_mi = MultiIndex.from_tuples( + [[1, 1, 1], [2, 1, 3], [2, 1, 2]], names=list("ABC") + ) + expected = pd.DataFrame([[5, 6], [1, 2], [3, 4]], index=expected_mi) result = df.sort_index(level=level, sort_remaining=False) assert_frame_equal(result, expected) @@ -604,92 +628,112 @@ def test_sort_index_intervalindex(self): # confirming that we sort in the order of the bins y = Series(np.random.randn(100)) x1 = Series(np.sign(np.random.randn(100))) - x2 = pd.cut(Series(np.random.randn(100)), - bins=[-3, -0.5, 0, 0.5, 3]) - model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) + x2 = pd.cut(Series(np.random.randn(100)), bins=[-3, -0.5, 0, 0.5, 3]) + model = pd.concat([y, x1, x2], axis=1, keys=["Y", "X1", "X2"]) - result = model.groupby(['X1', 'X2'], observed=True).mean().unstack() + result = model.groupby(["X1", "X2"], observed=True).mean().unstack() expected = IntervalIndex.from_tuples( - [(-3.0, -0.5), (-0.5, 0.0), - (0.0, 0.5), (0.5, 3.0)], - closed='right') + [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], closed="right" + ) result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) def test_sort_index_na_position_with_categories(self): # GH 22556 # Positioning 
missing value properly when column is Categorical. - categories = ['A', 'B', 'C'] + categories = ["A", "B", "C"] category_indices = [0, 2, 4] list_of_nans = [np.nan, np.nan] na_indices = [1, 3] - na_position_first = 'first' - na_position_last = 'last' - column_name = 'c' + na_position_first = "first" + na_position_last = "last" + column_name = "c" reversed_categories = sorted(categories, reverse=True) reversed_category_indices = sorted(category_indices, reverse=True) reversed_na_indices = sorted(na_indices) - df = pd.DataFrame({ - column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=categories, - ordered=True)}) + df = pd.DataFrame( + { + column_name: pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True + ) + } + ) # sort ascending with na first - result = df.sort_values(by=column_name, - ascending=True, - na_position=na_position_first) - expected = DataFrame({ - column_name: Categorical(list_of_nans + categories, - categories=categories, - ordered=True) - }, index=na_indices + category_indices) + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + categories, categories=categories, ordered=True + ) + }, + index=na_indices + category_indices, + ) assert_frame_equal(result, expected) # sort ascending with na last - result = df.sort_values(by=column_name, - ascending=True, - na_position=na_position_last) - expected = DataFrame({ - column_name: Categorical(categories + list_of_nans, - categories=categories, - ordered=True) - }, index=category_indices + na_indices) + result = df.sort_values( + by=column_name, ascending=True, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + categories + list_of_nans, categories=categories, ordered=True + ) + }, + index=category_indices + na_indices, + ) assert_frame_equal(result, expected) # sort descending with na first - result = df.sort_values(by=column_name, - ascending=False, - na_position=na_position_first) - expected = DataFrame({ - column_name: Categorical(list_of_nans + reversed_categories, - categories=categories, - ordered=True) - }, index=reversed_na_indices + reversed_category_indices) + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_first + ) + expected = DataFrame( + { + column_name: Categorical( + list_of_nans + reversed_categories, + categories=categories, + ordered=True, + ) + }, + index=reversed_na_indices + reversed_category_indices, + ) assert_frame_equal(result, expected) # sort descending with na last - result = df.sort_values(by=column_name, - ascending=False, - na_position=na_position_last) - expected = DataFrame({ - column_name: Categorical(reversed_categories + list_of_nans, - categories=categories, - ordered=True) - }, index=reversed_category_indices + reversed_na_indices) + result = df.sort_values( + by=column_name, ascending=False, na_position=na_position_last + ) + expected = DataFrame( + { + column_name: Categorical( + reversed_categories + list_of_nans, + categories=categories, + ordered=True, + ) + }, + index=reversed_category_indices + reversed_na_indices, + ) assert_frame_equal(result, expected) def test_sort_index_na_position_with_categories_raises(self): - df = pd.DataFrame({ - 'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], - categories=['A', 'B', 'C'], - ordered=True)}) + df = pd.DataFrame( + { + "c": pd.Categorical( + ["A", np.nan, "B", np.nan, "C"], + categories=["A", 
"B", "C"], + ordered=True, + ) + } + ) with pytest.raises(ValueError): - df.sort_values(by='c', - ascending=False, - na_position='bad_position') + df.sort_values(by="c", ascending=False, na_position="bad_position") diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 45b13e5159bcd..c66a97c2b294b 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -8,19 +8,17 @@ class TestDataFrameSubclassing(TestData): - def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 class CustomSeries(Series): - @property def _constructor(self): return CustomSeries def custom_series_function(self): - return 'OK' + return "OK" class CustomDataFrame(DataFrame): """ @@ -38,10 +36,9 @@ def _constructor(self): _constructor_sliced = CustomSeries def custom_frame_function(self): - return 'OK' + return "OK" - data = {'col1': range(10), - 'col2': range(10)} + data = {"col1": range(10), "col2": range(10)} cdf = CustomDataFrame(data) # Did we get back our own DF class? @@ -50,34 +47,35 @@ def custom_frame_function(self): # Do we get back our own Series class after selecting a column? cdf_series = cdf.col1 assert isinstance(cdf_series, CustomSeries) - assert cdf_series.custom_series_function() == 'OK' + assert cdf_series.custom_series_function() == "OK" # Do we get back our own DF class after slicing row-wise? cdf_rows = cdf[1:5] assert isinstance(cdf_rows, CustomDataFrame) - assert cdf_rows.custom_frame_function() == 'OK' + assert cdf_rows.custom_frame_function() == "OK" # Make sure sliced part of multi-index frame is custom class - mcol = pd.MultiIndex.from_tuples([('A', 'A'), ('A', 'B')]) + mcol = pd.MultiIndex.from_tuples([("A", "A"), ("A", "B")]) cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) - assert isinstance(cdf_multi['A'], CustomDataFrame) + assert isinstance(cdf_multi["A"], CustomDataFrame) - mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')]) + mcol = pd.MultiIndex.from_tuples([("A", ""), ("B", "")]) cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) - assert isinstance(cdf_multi2['A'], CustomSeries) + assert isinstance(cdf_multi2["A"], CustomSeries) def test_dataframe_metadata(self): - df = tm.SubclassedDataFrame({'X': [1, 2, 3], 'Y': [1, 2, 3]}, - index=['a', 'b', 'c']) - df.testattr = 'XXX' + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"] + ) + df.testattr = "XXX" - assert df.testattr == 'XXX' - assert df[['X']].testattr == 'XXX' - assert df.loc[['a', 'b'], :].testattr == 'XXX' - assert df.iloc[[0, 1], :].testattr == 'XXX' + assert df.testattr == "XXX" + assert df[["X"]].testattr == "XXX" + assert df.loc[["a", "b"], :].testattr == "XXX" + assert df.iloc[[0, 1], :].testattr == "XXX" # see gh-9776 - assert df.iloc[0:1, :].testattr == 'XXX' + assert df.iloc[0:1, :].testattr == "XXX" # see gh-10553 unpickled = tm.round_trip_pickle(df) @@ -87,64 +85,67 @@ def test_dataframe_metadata(self): def test_indexing_sliced(self): # GH 11559 - df = tm.SubclassedDataFrame({'X': [1, 2, 3], - 'Y': [4, 5, 6], - 'Z': [7, 8, 9]}, - index=['a', 'b', 'c']) - res = df.loc[:, 'X'] - exp = tm.SubclassedSeries([1, 2, 3], index=list('abc'), name='X') + df = tm.SubclassedDataFrame( + {"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"] + ) + res = df.loc[:, "X"] + exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X") tm.assert_series_equal(res, exp) assert isinstance(res, 
tm.SubclassedSeries) res = df.iloc[:, 1] - exp = tm.SubclassedSeries([4, 5, 6], index=list('abc'), name='Y') + exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc[:, 'Z'] - exp = tm.SubclassedSeries([7, 8, 9], index=list('abc'), name='Z') + res = df.loc[:, "Z"] + exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc['a', :] - exp = tm.SubclassedSeries([1, 4, 7], index=list('XYZ'), name='a') + res = df.loc["a", :] + exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) res = df.iloc[1, :] - exp = tm.SubclassedSeries([2, 5, 8], index=list('XYZ'), name='b') + exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) - res = df.loc['c', :] - exp = tm.SubclassedSeries([3, 6, 9], index=list('XYZ'), name='c') + res = df.loc["c", :] + exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c") tm.assert_series_equal(res, exp) assert isinstance(res, tm.SubclassedSeries) def test_subclass_attr_err_propagation(self): # GH 11808 class A(DataFrame): - @property def bar(self): return self.i_dont_exist - with pytest.raises(AttributeError, match='.*i_dont_exist.*'): + + with pytest.raises(AttributeError, match=".*i_dont_exist.*"): A().bar def test_subclass_align(self): # GH 12983 - df1 = tm.SubclassedDataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - df2 = tm.SubclassedDataFrame({'c': [1, 2, 4], - 'd': [1, 2, 4]}, index=list('ABD')) + df1 = tm.SubclassedDataFrame( + {"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE") + ) + df2 = tm.SubclassedDataFrame( + {"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD") + ) res1, res2 = df1.align(df2, axis=0) - exp1 = tm.SubclassedDataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) - exp2 = tm.SubclassedDataFrame({'c': [1, 2, np.nan, 4, np.nan], - 'd': [1, 2, np.nan, 4, np.nan]}, - index=list('ABCDE')) + exp1 = tm.SubclassedDataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) + exp2 = tm.SubclassedDataFrame( + {"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]}, + index=list("ABCDE"), + ) assert isinstance(res1, tm.SubclassedDataFrame) tm.assert_frame_equal(res1, exp1) assert isinstance(res2, tm.SubclassedDataFrame) @@ -158,18 +159,17 @@ def test_subclass_align(self): def test_subclass_align_combinations(self): # GH 12983 - df = tm.SubclassedDataFrame({'a': [1, 3, 5], - 'b': [1, 3, 5]}, index=list('ACE')) - s = tm.SubclassedSeries([1, 2, 4], index=list('ABD'), name='x') + df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")) + s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x") # frame + series res1, res2 = df.align(s, axis=0) - exp1 = pd.DataFrame({'a': [1, np.nan, 3, np.nan, 5], - 'b': [1, np.nan, 3, np.nan, 5]}, - index=list('ABCDE')) + exp1 = pd.DataFrame( + {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]}, + index=list("ABCDE"), + ) # name is lost when - exp2 = pd.Series([1, 2, np.nan, 4, np.nan], - index=list('ABCDE'), name='x') + exp2 = pd.Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x") assert isinstance(res1, tm.SubclassedDataFrame) tm.assert_frame_equal(res1, exp1) @@ -185,7 
+185,7 @@ def test_subclass_align_combinations(self): def test_subclass_iterrows(self): # GH 13977 - df = tm.SubclassedDataFrame({'a': [1]}) + df = tm.SubclassedDataFrame({"a": [1]}) for i, row in df.iterrows(): assert isinstance(row, tm.SubclassedSeries) tm.assert_series_equal(row, df.loc[i]) @@ -196,274 +196,315 @@ def test_subclass_sparse_slice(self): ssdf = tm.SubclassedSparseDataFrame(rows) ssdf.testattr = "testattr" - tm.assert_sp_frame_equal(ssdf.loc[:2], - tm.SubclassedSparseDataFrame(rows[:3])) - tm.assert_sp_frame_equal(ssdf.iloc[:2], - tm.SubclassedSparseDataFrame(rows[:2])) - tm.assert_sp_frame_equal(ssdf[:2], - tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf.loc[:2], tm.SubclassedSparseDataFrame(rows[:3])) + tm.assert_sp_frame_equal(ssdf.iloc[:2], tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf[:2], tm.SubclassedSparseDataFrame(rows[:2])) assert ssdf.loc[:2].testattr == "testattr" assert ssdf.iloc[:2].testattr == "testattr" assert ssdf[:2].testattr == "testattr" - tm.assert_sp_series_equal(ssdf.loc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False) - tm.assert_sp_series_equal(ssdf.iloc[1], - tm.SubclassedSparseSeries(rows[1]), - check_names=False, - check_kind=False) + tm.assert_sp_series_equal( + ssdf.loc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False, + check_kind=False, + ) + tm.assert_sp_series_equal( + ssdf.iloc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False, + check_kind=False, + ) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_subclass_sparse_transpose(self): - ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], - [4, 5, 6]]) - essdf = tm.SubclassedSparseDataFrame([[1, 4], - [2, 5], - [3, 6]]) + ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], [4, 5, 6]]) + essdf = tm.SubclassedSparseDataFrame([[1, 4], [2, 5], [3, 6]]) tm.assert_sp_frame_equal(ossdf.T, essdf) def test_subclass_stack(self): # GH 15564 - df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'b', 'c'], - columns=['X', 'Y', 'Z']) + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) res = df.stack() exp = tm.SubclassedSeries( - [1, 2, 3, 4, 5, 6, 7, 8, 9], - index=[list('aaabbbccc'), list('XYZXYZXYZ')]) + [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")] + ) tm.assert_series_equal(res, exp) def test_subclass_stack_multi(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12, 13], - [20, 21, 22, 23], - [30, 31, 32, 33], - [40, 41, 42, 43]], + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 12], - [11, 13], - [20, 22], - [21, 23], - [30, 32], - [31, 33], - [40, 42], - [41, 43]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), - names=['aaa', 'ccc', 'yyy']), - columns=Index(['W', 'X'], name='www')) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12], + [11, 13], + [20, 22], + [21, 23], + [30, 32], + [31, 33], + [40, 42], + [41, 43], + ], + index=MultiIndex.from_tuples( + 
list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack('yyy') + res = df.stack("yyy") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 11], - [12, 13], - [20, 21], - [22, 23], - [30, 31], - [32, 33], - [40, 41], - [42, 43]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), - names=['aaa', 'ccc', 'www']), - columns=Index(['y', 'z'], name='yyy')) - - res = df.stack('www') + exp = tm.SubclassedDataFrame( + [ + [10, 11], + [12, 13], + [20, 21], + [22, 23], + [30, 31], + [32, 33], + [40, 41], + [42, 43], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_stack_multi_mixed(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12.0, 13.0], - [20, 21, 22.0, 23.0], - [30, 31, 32.0, 33.0], - [40, 41, 42.0, 43.0]], + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 12.0], - [11, 13.0], - [20, 22.0], - [21, 23.0], - [30, 32.0], - [31, 33.0], - [40, 42.0], - [41, 43.0]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('yzyzyzyz'))), - names=['aaa', 'ccc', 'yyy']), - columns=Index(['W', 'X'], name='www')) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 12.0], + [11, 13.0], + [20, 22.0], + [21, 23.0], + [30, 32.0], + [31, 33.0], + [40, 42.0], + [41, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))), + names=["aaa", "ccc", "yyy"], + ), + columns=Index(["W", "X"], name="www"), + ) res = df.stack() tm.assert_frame_equal(res, exp) - res = df.stack('yyy') + res = df.stack("yyy") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10.0, 11.0], - [12.0, 13.0], - [20.0, 21.0], - [22.0, 23.0], - [30.0, 31.0], - [32.0, 33.0], - [40.0, 41.0], - [42.0, 43.0]], - index=MultiIndex.from_tuples(list(zip( - list('AAAABBBB'), list('ccddccdd'), list('WXWXWXWX'))), - names=['aaa', 'ccc', 'www']), - columns=Index(['y', 'z'], name='yyy')) - - res = df.stack('www') + exp = tm.SubclassedDataFrame( + [ + [10.0, 11.0], + [12.0, 13.0], + [20.0, 21.0], + [22.0, 23.0], + [30.0, 31.0], + [32.0, 33.0], + [40.0, 41.0], + [42.0, 43.0], + ], + index=MultiIndex.from_tuples( + list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))), + names=["aaa", "ccc", "www"], + ), + columns=Index(["y", "z"], name="yyy"), + ) + + res = df.stack("www") tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): # GH 15564 - df = tm.SubclassedDataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=['a', 'b', 'c'], - columns=['X', 'Y', 'Z']) + df = tm.SubclassedDataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["a", "b", "c"], + columns=["X", "Y", "Z"], + ) res = df.unstack() exp = tm.SubclassedSeries( - [1, 4, 7, 2, 5, 8, 3, 6, 9], - 
index=[list('XXXYYYZZZ'), list('abcabcabc')]) + [1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")] + ) tm.assert_series_equal(res, exp) def test_subclass_unstack_multi(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12, 13], - [20, 21, 22, 23], - [30, 31, 32, 33], - [40, 41, 42, 43]], + df = tm.SubclassedDataFrame( + [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) - exp = tm.SubclassedDataFrame([ - [10, 20, 11, 21, 12, 22, 13, 23], - [30, 40, 31, 41, 32, 42, 33, 43]], - index=Index(['A', 'B'], name='aaa'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), - names=['www', 'yyy', 'ccc'])) + exp = tm.SubclassedDataFrame( + [[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) res = df.unstack() tm.assert_frame_equal(res, exp) - res = df.unstack('ccc') + res = df.unstack("ccc") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 30, 11, 31, 12, 32, 13, 33], - [20, 40, 21, 41, 22, 42, 23, 43]], - index=Index(['c', 'd'], name='ccc'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), - names=['www', 'yyy', 'aaa'])) + exp = tm.SubclassedDataFrame( + [[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) - res = df.unstack('aaa') + res = df.unstack("aaa") tm.assert_frame_equal(res, exp) def test_subclass_unstack_multi_mixed(self): # GH 15564 - df = tm.SubclassedDataFrame([ - [10, 11, 12.0, 13.0], - [20, 21, 22.0, 23.0], - [30, 31, 32.0, 33.0], - [40, 41, 42.0, 43.0]], + df = tm.SubclassedDataFrame( + [ + [10, 11, 12.0, 13.0], + [20, 21, 22.0, 23.0], + [30, 31, 32.0, 33.0], + [40, 41, 42.0, 43.0], + ], index=MultiIndex.from_tuples( - list(zip(list('AABB'), list('cdcd'))), - names=['aaa', 'ccc']), + list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"] + ), columns=MultiIndex.from_tuples( - list(zip(list('WWXX'), list('yzyz'))), - names=['www', 'yyy'])) - - exp = tm.SubclassedDataFrame([ - [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], - [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0]], - index=Index(['A', 'B'], name='aaa'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('cdcdcdcd'))), - names=['www', 'yyy', 'ccc'])) + list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"] + ), + ) + + exp = tm.SubclassedDataFrame( + [ + [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0], + [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0], + ], + index=Index(["A", "B"], name="aaa"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))), + names=["www", "yyy", "ccc"], + ), + ) res = df.unstack() tm.assert_frame_equal(res, exp) - res = df.unstack('ccc') + res = df.unstack("ccc") tm.assert_frame_equal(res, exp) - exp = tm.SubclassedDataFrame([ - [10, 30, 11, 31, 
12.0, 32.0, 13.0, 33.0], - [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0]], - index=Index(['c', 'd'], name='ccc'), - columns=MultiIndex.from_tuples(list(zip( - list('WWWWXXXX'), list('yyzzyyzz'), list('ABABABAB'))), - names=['www', 'yyy', 'aaa'])) + exp = tm.SubclassedDataFrame( + [ + [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0], + [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0], + ], + index=Index(["c", "d"], name="ccc"), + columns=MultiIndex.from_tuples( + list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))), + names=["www", "yyy", "aaa"], + ), + ) - res = df.unstack('aaa') + res = df.unstack("aaa") tm.assert_frame_equal(res, exp) def test_subclass_pivot(self): # GH 15564 - df = tm.SubclassedDataFrame({ - 'index': ['A', 'B', 'C', 'C', 'B', 'A'], - 'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'], - 'values': [1., 2., 3., 3., 2., 1.]}) + df = tm.SubclassedDataFrame( + { + "index": ["A", "B", "C", "C", "B", "A"], + "columns": ["One", "One", "One", "Two", "Two", "Two"], + "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0], + } + ) - pivoted = df.pivot( - index='index', columns='columns', values='values') + pivoted = df.pivot(index="index", columns="columns", values="values") - expected = tm.SubclassedDataFrame({ - 'One': {'A': 1., 'B': 2., 'C': 3.}, - 'Two': {'A': 1., 'B': 2., 'C': 3.}}) + expected = tm.SubclassedDataFrame( + { + "One": {"A": 1.0, "B": 2.0, "C": 3.0}, + "Two": {"A": 1.0, "B": 2.0, "C": 3.0}, + } + ) - expected.index.name, expected.columns.name = 'index', 'columns' + expected.index.name, expected.columns.name = "index", "columns" tm.assert_frame_equal(pivoted, expected) def test_subclassed_melt(self): # GH 15564 - cheese = tm.SubclassedDataFrame({ - 'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) - - melted = pd.melt(cheese, id_vars=['first', 'last']) - - expected = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 5.5], - ['Mary', 'Bo', 'height', 6.0], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + cheese = tm.SubclassedDataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) + + melted = pd.melt(cheese, id_vars=["first", "last"]) + + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) tm.assert_frame_equal(melted, expected) @@ -472,21 +513,26 @@ def test_subclassed_wide_to_long(self): np.random.seed(123) x = np.random.randn(3) - df = tm.SubclassedDataFrame({ - "A1970": {0: "a", 1: "b", 2: "c"}, - "A1980": {0: "d", 1: "e", 2: "f"}, - "B1970": {0: 2.5, 1: 1.2, 2: .7}, - "B1980": {0: 3.2, 1: 1.3, 2: .1}, - "X": dict(zip(range(3), x))}) + df = tm.SubclassedDataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = tm.SubclassedDataFrame(exp_data) - 
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(long_frame, expected) @@ -502,32 +548,34 @@ def strech(row): row["value"] += 0.5 return row - df = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 5.5], - ['Mary', 'Bo', 'height', 6.0], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + df = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 5.5], + ["Mary", "Bo", "height", 6.0], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) df.apply(lambda x: check_row_subclass(x)) df.apply(lambda x: check_row_subclass(x), axis=1) - expected = tm.SubclassedDataFrame([ - ['John', 'Doe', 'height', 6.0], - ['Mary', 'Bo', 'height', 6.5], - ['John', 'Doe', 'weight', 130], - ['Mary', 'Bo', 'weight', 150]], - columns=['first', 'last', 'variable', 'value']) + expected = tm.SubclassedDataFrame( + [ + ["John", "Doe", "height", 6.0], + ["Mary", "Bo", "height", 6.5], + ["John", "Doe", "weight", 130], + ["Mary", "Bo", "weight", 150], + ], + columns=["first", "last", "variable", "value"], + ) result = df.apply(lambda x: strech(x), axis=1) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) - expected = tm.SubclassedDataFrame([ - [1, 2, 3], - [1, 2, 3], - [1, 2, 3], - [1, 2, 3]]) + expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1) assert isinstance(result, tm.SubclassedDataFrame) @@ -537,11 +585,7 @@ def strech(row): assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) - expected = tm.SubclassedSeries([ - [1, 2, 3], - [1, 2, 3], - [1, 2, 3], - [1, 2, 3]]) + expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) result = df.apply(lambda x: [1, 2, 3], axis=1) assert not isinstance(result, tm.SubclassedDataFrame) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index bce9f70fdc20c..92801b02dee22 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -7,12 +7,23 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, MultiIndex, Series, Timestamp, date_range, - period_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + period_range, + to_datetime, +) from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_frame_equal, assert_index_equal, assert_series_equal) + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets @@ -23,56 +34,71 @@ def close_open_fixture(request): class TestDataFrameTimeSeriesMethods(TestData): - def test_diff(self): the_diff = self.tsframe.diff(1) - assert_series_equal(the_diff['A'], - self.tsframe['A'] - self.tsframe['A'].shift(1)) + assert_series_equal( + the_diff["A"], self.tsframe["A"] - self.tsframe["A"].shift(1) + ) # int dtype a = 10000000000000000 b = a + 1 s = Series([a, b]) - rs = DataFrame({'s': s}).diff() + rs = DataFrame({"s": s}).diff() assert rs.s[1] == 1 # mixed numeric - tf = self.tsframe.astype('float32') + tf = self.tsframe.astype("float32") the_diff = tf.diff(1) - 
assert_series_equal(the_diff['A'], - tf['A'] - tf['A'].shift(1)) + assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) # issue 10907 - df = pd.DataFrame({'y': pd.Series([2]), 'z': pd.Series([3])}) - df.insert(0, 'x', 1) + df = pd.DataFrame({"y": pd.Series([2]), "z": pd.Series([3])}) + df.insert(0, "x", 1) result = df.diff(axis=1) - expected = pd.DataFrame({'x': np.nan, 'y': pd.Series( - 1), 'z': pd.Series(1)}).astype('float64') + expected = pd.DataFrame( + {"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)} + ).astype("float64") assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis0(self, tz): # GH 18578 - df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), - 1: date_range('2010', freq='D', periods=2, tz=tz)}) + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) result = df.diff(axis=0) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', '1 days']), - 1: pd.TimedeltaIndex(['NaT', '1 days'])}) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "1 days"]), + 1: pd.TimedeltaIndex(["NaT", "1 days"]), + } + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis1(self, tz): # GH 18578 - df = DataFrame({0: date_range('2010', freq='D', periods=2, tz=tz), - 1: date_range('2010', freq='D', periods=2, tz=tz)}) + df = DataFrame( + { + 0: date_range("2010", freq="D", periods=2, tz=tz), + 1: date_range("2010", freq="D", periods=2, tz=tz), + } + ) if tz is None: result = df.diff(axis=1) - expected = DataFrame({0: pd.TimedeltaIndex(['NaT', 'NaT']), - 1: pd.TimedeltaIndex(['0 days', - '0 days'])}) + expected = DataFrame( + { + 0: pd.TimedeltaIndex(["NaT", "NaT"]), + 1: pd.TimedeltaIndex(["0 days", "0 days"]), + } + ) assert_frame_equal(result, expected) else: with pytest.raises(NotImplementedError): @@ -80,19 +106,22 @@ def test_diff_datetime_axis1(self, tz): def test_diff_timedelta(self): # GH 4533 - df = DataFrame(dict(time=[Timestamp('20130101 9:01'), - Timestamp('20130101 9:02')], - value=[1.0, 2.0])) + df = DataFrame( + dict( + time=[Timestamp("20130101 9:01"), Timestamp("20130101 9:02")], + value=[1.0, 2.0], + ) + ) res = df.diff() - exp = DataFrame([[pd.NaT, np.nan], - [pd.Timedelta('00:01:00'), 1]], - columns=['time', 'value']) + exp = DataFrame( + [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] + ) assert_frame_equal(res, exp) def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) - df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) + df["A"] = np.array([1, 2, 3, 4, 5], dtype=object) result = df.diff() assert result[0].dtype == np.float64 @@ -103,140 +132,136 @@ def test_diff_neg_n(self): assert_frame_equal(rs, xp) def test_diff_float_n(self): - rs = self.tsframe.diff(1.) 
+ rs = self.tsframe.diff(1.0) xp = self.tsframe.diff(1) assert_frame_equal(rs, xp) def test_diff_axis(self): # GH 9727 - df = DataFrame([[1., 2.], [3., 4.]]) - assert_frame_equal(df.diff(axis=1), DataFrame( - [[np.nan, 1.], [np.nan, 1.]])) - assert_frame_equal(df.diff(axis=0), DataFrame( - [[np.nan, np.nan], [2., 2.]])) + df = DataFrame([[1.0, 2.0], [3.0, 4.0]]) + assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])) + assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])) def test_pct_change(self): rs = self.tsframe.pct_change(fill_method=None) assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) rs = self.tsframe.pct_change(2) - filled = self.tsframe.fillna(method='pad') + filled = self.tsframe.fillna(method="pad") assert_frame_equal(rs, filled / filled.shift(2) - 1) - rs = self.tsframe.pct_change(fill_method='bfill', limit=1) - filled = self.tsframe.fillna(method='bfill', limit=1) + rs = self.tsframe.pct_change(fill_method="bfill", limit=1) + filled = self.tsframe.fillna(method="bfill", limit=1) assert_frame_equal(rs, filled / filled.shift(1) - 1) - rs = self.tsframe.pct_change(freq='5D') - filled = self.tsframe.fillna(method='pad') - assert_frame_equal(rs, - (filled / filled.shift(freq='5D') - 1) - .reindex_like(filled)) + rs = self.tsframe.pct_change(freq="5D") + filled = self.tsframe.fillna(method="pad") + assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) def test_pct_change_shift_over_nas(self): - s = Series([1., 1.5, np.nan, 2.5, 3.]) + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - df = DataFrame({'a': s, 'b': s}) + df = DataFrame({"a": s, "b": s}) chg = df.pct_change() - expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) - edf = DataFrame({'a': expected, 'b': expected}) + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) assert_frame_equal(chg, edf) - @pytest.mark.parametrize("freq, periods, fill_method, limit", - [('5B', 5, None, None), - ('3B', 3, None, None), - ('3B', 3, 'bfill', None), - ('7B', 7, 'pad', 1), - ('7B', 7, 'bfill', 3), - ('14B', 14, None, None)]) + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.tsframe.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = self.tsframe.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = self.tsframe.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = self.tsframe.pct_change( + periods, fill_method=fill_method, limit=limit + ) assert_frame_equal(rs_freq, rs_periods) - empty_ts = DataFrame(index=self.tsframe.index, - columns=self.tsframe.columns) - rs_freq = empty_ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = empty_ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_frame_equal(rs_freq, rs_periods) def test_frame_ctor_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = 
date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) - df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) - assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]')) + df = DataFrame({"A": np.random.randn(len(rng)), "B": dates}) + assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]")) def test_frame_append_datetime64_column(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") df = DataFrame(index=np.arange(len(rng))) - df['A'] = rng - assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]')) + df["A"] = rng + assert np.issubdtype(df["A"].dtype, np.dtype("M8[ns]")) def test_frame_datetime64_pre1900_repr(self): - df = DataFrame({'year': date_range('1/1/1700', periods=50, - freq='A-DEC')}) + df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="A-DEC")}) # it works! repr(df) def test_frame_append_datetime64_col_other_units(self): n = 100 - units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + units = ["h", "m", "s", "ms", "D", "M", "Y"] - ns_dtype = np.dtype('M8[ns]') + ns_dtype = np.dtype("M8[ns]") for unit in units: - dtype = np.dtype('M8[%s]' % unit) + dtype = np.dtype("M8[%s]" % unit) vals = np.arange(n, dtype=np.int64).view(dtype) - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) df[unit] = vals - ex_vals = to_datetime(vals.astype('O')).values + ex_vals = to_datetime(vals.astype("O")).values assert df[unit].dtype == ns_dtype assert (df[unit].values == ex_vals).all() # Test insertion into existing datetime64 column - df = DataFrame({'ints': np.arange(n)}, index=np.arange(n)) - df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + df = DataFrame({"ints": np.arange(n)}, index=np.arange(n)) + df["dates"] = np.arange(n, dtype=np.int64).view(ns_dtype) for unit in units: - dtype = np.dtype('M8[%s]' % unit) + dtype = np.dtype("M8[%s]" % unit) vals = np.arange(n, dtype=np.int64).view(dtype) tmp = df.copy() - tmp['dates'] = vals - ex_vals = to_datetime(vals.astype('O')).values + tmp["dates"] = vals + ex_vals = to_datetime(vals.astype("O")).values - assert (tmp['dates'].values == ex_vals).all() + assert (tmp["dates"].values == ex_vals).all() def test_shift(self): # naive shift shiftedFrame = self.tsframe.shift(5) tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) - shiftedSeries = self.tsframe['A'].shift(5) - assert_series_equal(shiftedFrame['A'], shiftedSeries) + shiftedSeries = self.tsframe["A"].shift(5) + assert_series_equal(shiftedFrame["A"], shiftedSeries) shiftedFrame = self.tsframe.shift(-5) tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) - shiftedSeries = self.tsframe['A'].shift(-5) - assert_series_equal(shiftedFrame['A'], shiftedSeries) + shiftedSeries = self.tsframe["A"].shift(-5) + assert_series_equal(shiftedFrame["A"], shiftedSeries) # shift by 0 unshifted = self.tsframe.shift(0) @@ -246,13 +271,14 @@ def test_shift(self): shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay()) assert len(shiftedFrame) == len(self.tsframe) - shiftedFrame2 = self.tsframe.shift(5, freq='B') + shiftedFrame2 = self.tsframe.shift(5, freq="B") assert_frame_equal(shiftedFrame, shiftedFrame2) d = self.tsframe.index[0] shifted_d = d + offsets.BDay(5) - assert_series_equal(self.tsframe.xs(d), - shiftedFrame.xs(shifted_d), check_names=False) + assert_series_equal( + self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False + ) # shift int frame int_shifted = 
self.intframe.shift(1) # noqa @@ -263,72 +289,78 @@ def test_shift(self): unshifted = shifted.shift(-1) tm.assert_index_equal(shifted.index, ps.index) tm.assert_index_equal(unshifted.index, ps.index) - tm.assert_numpy_array_equal(unshifted.iloc[:, 0].dropna().values, - ps.iloc[:-1, 0].values) + tm.assert_numpy_array_equal( + unshifted.iloc[:, 0].dropna().values, ps.iloc[:-1, 0].values + ) - shifted2 = ps.shift(1, 'B') + shifted2 = ps.shift(1, "B") shifted3 = ps.shift(1, offsets.BDay()) assert_frame_equal(shifted2, shifted3) - assert_frame_equal(ps, shifted2.shift(-1, 'B')) + assert_frame_equal(ps, shifted2.shift(-1, "B")) - msg = 'does not match PeriodIndex freq' + msg = "does not match PeriodIndex freq" with pytest.raises(ValueError, match=msg): - ps.shift(freq='D') + ps.shift(freq="D") # shift other axis # GH 6371 df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat([DataFrame(np.nan, index=df.index, - columns=[0]), - df.iloc[:, 0:-1]], - ignore_index=True, axis=1) + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) result = df.shift(1, axis=1) assert_frame_equal(result, expected) # shift named axis df = DataFrame(np.random.rand(10, 5)) - expected = pd.concat([DataFrame(np.nan, index=df.index, - columns=[0]), - df.iloc[:, 0:-1]], - ignore_index=True, axis=1) - result = df.shift(1, axis='columns') + expected = pd.concat( + [DataFrame(np.nan, index=df.index, columns=[0]), df.iloc[:, 0:-1]], + ignore_index=True, + axis=1, + ) + result = df.shift(1, axis="columns") assert_frame_equal(result, expected) def test_shift_bool(self): - df = DataFrame({'high': [True, False], - 'low': [False, False]}) + df = DataFrame({"high": [True, False], "low": [False, False]}) rs = df.shift(1) - xp = DataFrame(np.array([[np.nan, np.nan], - [True, False]], dtype=object), - columns=['high', 'low']) + xp = DataFrame( + np.array([[np.nan, np.nan], [True, False]], dtype=object), + columns=["high", "low"], + ) assert_frame_equal(rs, xp) def test_shift_categorical(self): # GH 9416 - s1 = pd.Series(['a', 'b', 'c'], dtype='category') - s2 = pd.Series(['A', 'B', 'C'], dtype='category') - df = DataFrame({'one': s1, 'two': s2}) + s1 = pd.Series(["a", "b", "c"], dtype="category") + s2 = pd.Series(["A", "B", "C"], dtype="category") + df = DataFrame({"one": s1, "two": s2}) rs = df.shift(1) - xp = DataFrame({'one': s1.shift(1), 'two': s2.shift(1)}) + xp = DataFrame({"one": s1.shift(1), "two": s2.shift(1)}) assert_frame_equal(rs, xp) def test_shift_fill_value(self): # GH #24128 - df = DataFrame([1, 2, 3, 4, 5], - index=date_range('1/1/2000', periods=5, freq='H')) - exp = DataFrame([0, 1, 2, 3, 4], - index=date_range('1/1/2000', periods=5, freq='H')) + df = DataFrame( + [1, 2, 3, 4, 5], index=date_range("1/1/2000", periods=5, freq="H") + ) + exp = DataFrame( + [0, 1, 2, 3, 4], index=date_range("1/1/2000", periods=5, freq="H") + ) result = df.shift(1, fill_value=0) assert_frame_equal(result, exp) - exp = DataFrame([0, 0, 1, 2, 3], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = DataFrame( + [0, 0, 1, 2, 3], index=date_range("1/1/2000", periods=5, freq="H") + ) result = df.shift(2, fill_value=0) assert_frame_equal(result, exp) def test_shift_empty(self): # Regression test for #8019 - df = DataFrame({'foo': []}) + df = DataFrame({"foo": []}) rs = df.shift(-1) assert_frame_equal(df, rs) @@ -349,7 +381,7 @@ def test_shift_duplicate_columns(self): # sanity check the base case nulls = shifted[0].isna().sum() - assert_series_equal(nulls, 
Series(range(1, 6), dtype='int64')) + assert_series_equal(nulls, Series(range(1, 6), dtype="int64")) # check all answers are the same assert_frame_equal(shifted[0], shifted[1]) @@ -363,14 +395,14 @@ def test_tshift(self): assert_frame_equal(unshifted, ps) - shifted2 = ps.tshift(freq='B') + shifted2 = ps.tshift(freq="B") assert_frame_equal(shifted, shifted2) shifted3 = ps.tshift(freq=offsets.BDay()) assert_frame_equal(shifted, shifted3) - with pytest.raises(ValueError, match='does not match'): - ps.tshift(freq='M') + with pytest.raises(ValueError, match="does not match"): + ps.tshift(freq="M") # DatetimeIndex shifted = self.tsframe.tshift(1) @@ -381,9 +413,11 @@ def test_tshift(self): shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq) assert_frame_equal(shifted, shifted2) - inferred_ts = DataFrame(self.tsframe.values, - Index(np.asarray(self.tsframe.index)), - columns=self.tsframe.columns) + inferred_ts = DataFrame( + self.tsframe.values, + Index(np.asarray(self.tsframe.index)), + columns=self.tsframe.columns, + ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) assert_frame_equal(shifted, self.tsframe.tshift(1)) @@ -435,104 +469,113 @@ def test_truncate(self): msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-02-04 00:00:00" with pytest.raises(ValueError, match=msg): - ts.truncate(before=ts.index[-1] - ts.index.freq, - after=ts.index[0] + ts.index.freq) + ts.truncate( + before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq + ) def test_truncate_copy(self): index = self.tsframe.index truncated = self.tsframe.truncate(index[5], index[10]) - truncated.values[:] = 5. + truncated.values[:] = 5.0 assert not (self.tsframe.values[5:11] == 5).any() def test_truncate_nonsortedindex(self): # GH 17935 - df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e']}, - index=[5, 3, 2, 9, 0]) - msg = 'truncate requires a sorted index' + df = pd.DataFrame({"A": ["a", "b", "c", "d", "e"]}, index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): df.truncate(before=3, after=9) - rng = pd.date_range('2011-01-01', '2012-01-01', freq='W') - ts = pd.DataFrame({'A': np.random.randn(len(rng)), - 'B': np.random.randn(len(rng))}, - index=rng) - msg = 'truncate requires a sorted index' + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") + ts = pd.DataFrame( + {"A": np.random.randn(len(rng)), "B": np.random.randn(len(rng))}, index=rng + ) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): - ts.sort_values('A', ascending=False).truncate(before='2011-11', - after='2011-12') - - df = pd.DataFrame({3: np.random.randn(5), - 20: np.random.randn(5), - 2: np.random.randn(5), - 0: np.random.randn(5)}, - columns=[3, 20, 2, 0]) - msg = 'truncate requires a sorted index' + ts.sort_values("A", ascending=False).truncate( + before="2011-11", after="2011-12" + ) + + df = pd.DataFrame( + { + 3: np.random.randn(5), + 20: np.random.randn(5), + 2: np.random.randn(5), + 0: np.random.randn(5), + }, + columns=[3, 20, 2, 0], + ) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): df.truncate(before=2, after=20, axis=1) def test_asfreq(self): offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd()) - rule_monthly = self.tsframe.asfreq('BM') + rule_monthly = self.tsframe.asfreq("BM") - tm.assert_almost_equal(offset_monthly['A'], rule_monthly['A']) + tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"]) - filled = rule_monthly.asfreq('B', method='pad') # noqa + filled = 
rule_monthly.asfreq("B", method="pad") # noqa # TODO: actually check that this worked. # don't forget! - filled_dep = rule_monthly.asfreq('B', method='pad') # noqa + filled_dep = rule_monthly.asfreq("B", method="pad") # noqa # test does not blow up on length-0 DataFrame zero_length = self.tsframe.reindex([]) - result = zero_length.asfreq('BM') + result = zero_length.asfreq("BM") assert result is not zero_length def test_asfreq_datetimeindex(self): - df = DataFrame({'A': [1, 2, 3]}, - index=[datetime(2011, 11, 1), datetime(2011, 11, 2), - datetime(2011, 11, 3)]) - df = df.asfreq('B') + df = DataFrame( + {"A": [1, 2, 3]}, + index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)], + ) + df = df.asfreq("B") assert isinstance(df.index, DatetimeIndex) - ts = df['A'].asfreq('B') + ts = df["A"].asfreq("B") assert isinstance(ts.index, DatetimeIndex) def test_asfreq_fillvalue(self): # test for fill value during upsampling, related to issue 3715 # setup - rng = pd.date_range('1/1/2016', periods=10, freq='2S') + rng = pd.date_range("1/1/2016", periods=10, freq="2S") ts = pd.Series(np.arange(len(rng)), index=rng) - df = pd.DataFrame({'one': ts}) + df = pd.DataFrame({"one": ts}) # insert pre-existing missing value - df.loc['2016-01-01 00:00:08', 'one'] = None + df.loc["2016-01-01 00:00:08", "one"] = None - actual_df = df.asfreq(freq='1S', fill_value=9.0) - expected_df = df.asfreq(freq='1S').fillna(9.0) - expected_df.loc['2016-01-01 00:00:08', 'one'] = None + actual_df = df.asfreq(freq="1S", fill_value=9.0) + expected_df = df.asfreq(freq="1S").fillna(9.0) + expected_df.loc["2016-01-01 00:00:08", "one"] = None assert_frame_equal(expected_df, actual_df) - expected_series = ts.asfreq(freq='1S').fillna(9.0) - actual_series = ts.asfreq(freq='1S', fill_value=9.0) + expected_series = ts.asfreq(freq="1S").fillna(9.0) + actual_series = ts.asfreq(freq="1S", fill_value=9.0) assert_series_equal(expected_series, actual_series) - @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ - ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), - ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), - ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), - ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), - ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), - ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) - def test_first_last_valid(self, data, idx, - expected_first, expected_last): + @pytest.mark.parametrize( + "data,idx,expected_first,expected_last", + [ + ({"A": [1, 2, 3]}, [1, 1, 2], 1, 2), + ({"A": [1, 2, 3]}, [1, 2, 2], 1, 2), + ({"A": [1, 2, 3, 4]}, ["d", "d", "d", "d"], "d", "d"), + ({"A": [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({"A": [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), + ], + ) + def test_first_last_valid(self, data, idx, expected_first, expected_last): N = len(self.frame.index) mat = np.random.randn(N) mat[:5] = np.nan mat[-5:] = np.nan - frame = DataFrame({'foo': mat}, index=self.frame.index) + frame = DataFrame({"foo": mat}, index=self.frame.index) index = frame.first_valid_index() assert index == frame.index[5] @@ -565,66 +608,66 @@ def test_first_last_valid(self, data, idx, assert expected_last == df.last_valid_index() def test_first_subset(self): - ts = tm.makeTimeDataFrame(freq='12h') - result = ts.first('10d') + ts = tm.makeTimeDataFrame(freq="12h") + result = ts.first("10d") assert len(result) == 20 - ts = tm.makeTimeDataFrame(freq='D') - result = ts.first('10d') + ts = tm.makeTimeDataFrame(freq="D") + result = ts.first("10d") assert len(result) == 10 - result = ts.first('3M') 
-        expected = ts[:'3/31/2000']
+        result = ts.first("3M")
+        expected = ts[:"3/31/2000"]
         assert_frame_equal(result, expected)

-        result = ts.first('21D')
+        result = ts.first("21D")
         expected = ts[:21]
         assert_frame_equal(result, expected)

-        result = ts[:0].first('3M')
+        result = ts[:0].first("3M")
         assert_frame_equal(result, ts[:0])

     def test_first_raises(self):
         # GH20725
         df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
         with pytest.raises(TypeError):  # index is not a DatetimeIndex
-            df.first('1D')
+            df.first("1D")

     def test_last_subset(self):
-        ts = tm.makeTimeDataFrame(freq='12h')
-        result = ts.last('10d')
+        ts = tm.makeTimeDataFrame(freq="12h")
+        result = ts.last("10d")
         assert len(result) == 20

-        ts = tm.makeTimeDataFrame(nper=30, freq='D')
-        result = ts.last('10d')
+        ts = tm.makeTimeDataFrame(nper=30, freq="D")
+        result = ts.last("10d")
         assert len(result) == 10

-        result = ts.last('21D')
-        expected = ts['2000-01-10':]
+        result = ts.last("21D")
+        expected = ts["2000-01-10":]
         assert_frame_equal(result, expected)

-        result = ts.last('21D')
+        result = ts.last("21D")
         expected = ts[-21:]
         assert_frame_equal(result, expected)

-        result = ts[:0].last('3M')
+        result = ts[:0].last("3M")
         assert_frame_equal(result, ts[:0])

     def test_last_raises(self):
         # GH20725
         df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
         with pytest.raises(TypeError):  # index is not a DatetimeIndex
-            df.last('1D')
+            df.last("1D")

     def test_at_time(self):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+        rng = date_range("1/1/2000", "1/5/2000", freq="5min")
         ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
         rs = ts.at_time(rng[1])
         assert (rs.index.hour == rng[1].hour).all()
         assert (rs.index.minute == rng[1].minute).all()
         assert (rs.index.second == rng[1].second).all()

-        result = ts.at_time('9:30')
+        result = ts.at_time("9:30")
         expected = ts.at_time(time(9, 30))
         assert_frame_equal(result, expected)
@@ -634,25 +677,26 @@ def test_at_time(self):
         assert_frame_equal(result, expected)

         # midnight, everything
-        rng = date_range('1/1/2000', '1/31/2000')
+        rng = date_range("1/1/2000", "1/31/2000")
         ts = DataFrame(np.random.randn(len(rng), 3), index=rng)

         result = ts.at_time(time(0, 0))
         assert_frame_equal(result, ts)

         # time doesn't exist
-        rng = date_range('1/1/2012', freq='23Min', periods=384)
+        rng = date_range("1/1/2012", freq="23Min", periods=384)
         ts = DataFrame(np.random.randn(len(rng), 2), rng)
-        rs = ts.at_time('16:00')
+        rs = ts.at_time("16:00")
         assert len(rs) == 0

-    @pytest.mark.parametrize('hour', ['1:00', '1:00AM', time(1),
-                                      time(1, tzinfo=pytz.UTC)])
+    @pytest.mark.parametrize(
+        "hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
+    )
     def test_at_time_errors(self, hour):
         # GH 24043
-        dti = pd.date_range('2018', periods=3, freq='H')
+        dti = pd.date_range("2018", periods=3, freq="H")
         df = pd.DataFrame(list(range(len(dti))), index=dti)
-        if getattr(hour, 'tzinfo', None) is None:
+        if getattr(hour, "tzinfo", None) is None:
             result = df.at_time(hour)
             expected = df.iloc[1:2]
             tm.assert_frame_equal(result, expected)
@@ -662,9 +706,9 @@ def test_at_time_errors(self, hour):

     def test_at_time_tz(self):
         # GH 24043
-        dti = pd.date_range('2018', periods=3, freq='H', tz='US/Pacific')
+        dti = pd.date_range("2018", periods=3, freq="H", tz="US/Pacific")
         df = pd.DataFrame(list(range(len(dti))), index=dti)
-        result = df.at_time(time(4, tzinfo=pytz.timezone('US/Eastern')))
+        result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern")))
         expected = df.iloc[1:2]
         tm.assert_frame_equal(result, expected)
@@ -672,27 +716,27 @@ def test_at_time_raises(self):
         # GH20725
         df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
         with pytest.raises(TypeError):  # index is not a DatetimeIndex
-            df.at_time('00:00')
+            df.at_time("00:00")

-    @pytest.mark.parametrize('axis', ['index', 'columns', 0, 1])
+    @pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
     def test_at_time_axis(self, axis):
         # issue 8839
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+        rng = date_range("1/1/2000", "1/5/2000", freq="5min")
         ts = DataFrame(np.random.randn(len(rng), len(rng)))
         ts.index, ts.columns = rng, rng

         indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
-        if axis in ['index', 0]:
+        if axis in ["index", 0]:
             expected = ts.loc[indices, :]
-        elif axis in ['columns', 1]:
+        elif axis in ["columns", 1]:
             expected = ts.loc[:, indices]

-        result = ts.at_time('9:30', axis=axis)
+        result = ts.at_time("9:30", axis=axis)
         assert_frame_equal(result, expected)

     def test_between_time(self, close_open_fixture):
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+        rng = date_range("1/1/2000", "1/5/2000", freq="5min")
         ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
         stime = time(0, 0)
         etime = time(1, 0)
@@ -718,12 +762,12 @@ def test_between_time(self, close_open_fixture):
             else:
                 assert t < etime

-        result = ts.between_time('00:00', '01:00')
+        result = ts.between_time("00:00", "01:00")
         expected = ts.between_time(stime, etime)
         assert_frame_equal(result, expected)

         # across midnight
-        rng = date_range('1/1/2000', '1/5/2000', freq='5min')
+        rng = date_range("1/1/2000", "1/5/2000", freq="5min")
         ts = DataFrame(np.random.randn(len(rng), 2), index=rng)
         stime = time(22, 0)
         etime = time(9, 0)
@@ -752,61 +796,60 @@ def test_between_time_raises(self):
         # GH20725
         df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
         with pytest.raises(TypeError):  # index is not a DatetimeIndex
-            df.between_time(start_time='00:00', end_time='12:00')
+            df.between_time(start_time="00:00", end_time="12:00")

     def test_between_time_axis(self, axis):
         # issue 8839
-        rng = date_range('1/1/2000', periods=100, freq='10min')
+        rng = date_range("1/1/2000", periods=100, freq="10min")
         ts = DataFrame(np.random.randn(len(rng), len(rng)))
-        stime, etime = ('08:00:00', '09:00:00')
+        stime, etime = ("08:00:00", "09:00:00")
         exp_len = 7

-        if axis in ['index', 0]:
+        if axis in ["index", 0]:
             ts.index = rng
             assert len(ts.between_time(stime, etime)) == exp_len
             assert len(ts.between_time(stime, etime, axis=0)) == exp_len

-        if axis in ['columns', 1]:
+        if axis in ["columns", 1]:
             ts.columns = rng
             selected = ts.between_time(stime, etime, axis=1).columns
             assert len(selected) == exp_len

     def test_between_time_axis_raises(self, axis):
         # issue 8839
-        rng = date_range('1/1/2000', periods=100, freq='10min')
+        rng = date_range("1/1/2000", periods=100, freq="10min")
         mask = np.arange(0, len(rng))
         rand_data = np.random.randn(len(rng), len(rng))
         ts = DataFrame(rand_data, index=rng, columns=rng)
-        stime, etime = ('08:00:00', '09:00:00')
+        stime, etime = ("08:00:00", "09:00:00")
         msg = "Index must be DatetimeIndex"

-        if axis in ['columns', 1]:
+        if axis in ["columns", 1]:
             ts.index = mask
             with pytest.raises(TypeError, match=msg):
                 ts.between_time(stime, etime)
             with pytest.raises(TypeError, match=msg):
                 ts.between_time(stime, etime, axis=0)

-        if axis in ['index', 0]:
+        if axis in ["index", 0]:
             ts.columns = mask
             with pytest.raises(TypeError, match=msg):
                 ts.between_time(stime, etime, axis=1)

     def test_operation_on_NaT(self):
         # Both NaT and Timestamp are in DataFrame.
- df = pd.DataFrame({'foo': [pd.NaT, pd.NaT, - pd.Timestamp('2012-05-01')]}) + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT, pd.Timestamp("2012-05-01")]}) res = df.min() - exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() - exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + exp = pd.Series([pd.Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) # GH12941, only NaTs are in DataFrame. - df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]}) + df = pd.DataFrame({"foo": [pd.NaT, pd.NaT]}) res = df.min() exp = pd.Series([pd.NaT], index=["foo"]) @@ -818,34 +861,36 @@ def test_operation_on_NaT(self): def test_datetime_assignment_with_NaT_and_diff_time_units(self): # GH 7492 - data_ns = np.array([1, 'nat'], dtype='datetime64[ns]') + data_ns = np.array([1, "nat"], dtype="datetime64[ns]") result = pd.Series(data_ns).to_frame() - result['new'] = data_ns - expected = pd.DataFrame({0: [1, None], - 'new': [1, None]}, dtype='datetime64[ns]') + result["new"] = data_ns + expected = pd.DataFrame( + {0: [1, None], "new": [1, None]}, dtype="datetime64[ns]" + ) tm.assert_frame_equal(result, expected) # OutOfBoundsDatetime error shouldn't occur - data_s = np.array([1, 'nat'], dtype='datetime64[s]') - result['new'] = data_s - expected = pd.DataFrame({0: [1, None], - 'new': [1e9, None]}, dtype='datetime64[ns]') + data_s = np.array([1, "nat"], dtype="datetime64[s]") + result["new"] = data_s + expected = pd.DataFrame( + {0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]" + ) tm.assert_frame_equal(result, expected) def test_frame_to_period(self): K = 5 - dr = date_range('1/1/2000', '1/1/2001') - pr = period_range('1/1/2000', '1/1/2001') + dr = date_range("1/1/2000", "1/1/2001") + pr = period_range("1/1/2000", "1/1/2001") df = DataFrame(np.random.randn(len(dr), K), index=dr) - df['mix'] = 'a' + df["mix"] = "a" pts = df.to_period() exp = df.copy() exp.index = pr assert_frame_equal(pts, exp) - pts = df.to_period('M') - tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + pts = df.to_period("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) df = df.T pts = df.to_period(axis=1) @@ -853,55 +898,53 @@ def test_frame_to_period(self): exp.columns = pr assert_frame_equal(pts, exp) - pts = df.to_period('M', axis=1) - tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) + pts = df.to_period("M", axis=1) + tm.assert_index_equal(pts.columns, exp.columns.asfreq("M")) - msg = ("No axis named 2 for object type" - " ") + msg = "No axis named 2 for object type" " " with pytest.raises(ValueError, match=msg): df.to_period(axis=2) - @pytest.mark.parametrize("fn", ['tz_localize', 'tz_convert']) + @pytest.mark.parametrize("fn", ["tz_localize", "tz_convert"]) def test_tz_convert_and_localize(self, fn): - l0 = date_range('20140701', periods=5, freq='D') - l1 = date_range('20140701', periods=5, freq='D') + l0 = date_range("20140701", periods=5, freq="D") + l1 = date_range("20140701", periods=5, freq="D") int_idx = Index(range(5)) - if fn == 'tz_convert': - l0 = l0.tz_localize('UTC') - l1 = l1.tz_localize('UTC') + if fn == "tz_convert": + l0 = l0.tz_localize("UTC") + l1 = l1.tz_localize("UTC") for idx in [l0, l1]: - l0_expected = getattr(idx, fn)('US/Pacific') - l1_expected = getattr(idx, fn)('US/Pacific') + l0_expected = getattr(idx, fn)("US/Pacific") + l1_expected = getattr(idx, fn)("US/Pacific") df1 = DataFrame(np.ones(5), index=l0) - df1 = getattr(df1, 
fn)('US/Pacific') + df1 = getattr(df1, fn)("US/Pacific") assert_index_equal(df1.index, l0_expected) # MultiIndex # GH7846 df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) - df3 = getattr(df2, fn)('US/Pacific', level=0) + df3 = getattr(df2, fn)("US/Pacific", level=0) assert not df3.index.levels[0].equals(l0) assert_index_equal(df3.index.levels[0], l0_expected) assert_index_equal(df3.index.levels[1], l1) assert not df3.index.levels[1].equals(l1_expected) - df3 = getattr(df2, fn)('US/Pacific', level=1) + df3 = getattr(df2, fn)("US/Pacific", level=1) assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) assert_index_equal(df3.index.levels[1], l1_expected) assert not df3.index.levels[1].equals(l1) - df4 = DataFrame(np.ones(5), - MultiIndex.from_arrays([int_idx, l0])) + df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) # TODO: untested - df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa + df5 = getattr(df4, fn)("US/Pacific", level=1) # noqa assert_index_equal(df3.index.levels[0], l0) assert not df3.index.levels[0].equals(l0_expected) @@ -911,17 +954,16 @@ def test_tz_convert_and_localize(self, fn): # Bad Inputs # Not DatetimeIndex / PeriodIndex - with pytest.raises(TypeError, match='DatetimeIndex'): + with pytest.raises(TypeError, match="DatetimeIndex"): df = DataFrame(index=int_idx) - df = getattr(df, fn)('US/Pacific') + df = getattr(df, fn)("US/Pacific") # Not DatetimeIndex / PeriodIndex - with pytest.raises(TypeError, match='DatetimeIndex'): - df = DataFrame(np.ones(5), - MultiIndex.from_arrays([int_idx, l0])) - df = getattr(df, fn)('US/Pacific', level=0) + with pytest.raises(TypeError, match="DatetimeIndex"): + df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) + df = getattr(df, fn)("US/Pacific", level=0) # Invalid level - with pytest.raises(ValueError, match='not valid'): + with pytest.raises(ValueError, match="not valid"): df = DataFrame(index=l0) - df = getattr(df, fn)('US/Pacific', level=1) + df = getattr(df, fn)("US/Pacific", level=1) diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index b7c73daae0002..3e110a4b040da 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -16,17 +16,18 @@ class TestDataFrameTimezones: - def test_frame_values_with_tz(self): tz = "US/Central" - df = DataFrame({"A": date_range('2000', periods=4, tz=tz)}) + df = DataFrame({"A": date_range("2000", periods=4, tz=tz)}) result = df.values - expected = np.array([ - [pd.Timestamp('2000-01-01', tz=tz)], - [pd.Timestamp('2000-01-02', tz=tz)], - [pd.Timestamp('2000-01-03', tz=tz)], - [pd.Timestamp('2000-01-04', tz=tz)], - ]) + expected = np.array( + [ + [pd.Timestamp("2000-01-01", tz=tz)], + [pd.Timestamp("2000-01-02", tz=tz)], + [pd.Timestamp("2000-01-03", tz=tz)], + [pd.Timestamp("2000-01-04", tz=tz)], + ] + ) tm.assert_numpy_array_equal(result, expected) # two columns, homogenous @@ -40,71 +41,78 @@ def test_frame_values_with_tz(self): est = "US/Eastern" df = df.assign(C=df.A.dt.tz_convert(est)) - new = np.array([ - [pd.Timestamp('2000-01-01T01:00:00', tz=est)], - [pd.Timestamp('2000-01-02T01:00:00', tz=est)], - [pd.Timestamp('2000-01-03T01:00:00', tz=est)], - [pd.Timestamp('2000-01-04T01:00:00', tz=est)], - ]) + new = np.array( + [ + [pd.Timestamp("2000-01-01T01:00:00", tz=est)], + [pd.Timestamp("2000-01-02T01:00:00", tz=est)], + [pd.Timestamp("2000-01-03T01:00:00", tz=est)], + [pd.Timestamp("2000-01-04T01:00:00", tz=est)], + ] + ) expected = 
np.concatenate([expected, new], axis=1) result = df.values tm.assert_numpy_array_equal(result, expected) def test_frame_from_records_utc(self): - rec = {'datum': 1.5, - 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} + rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} # it works - DataFrame.from_records([rec], index='begin_time') + DataFrame.from_records([rec], index="begin_time") def test_frame_tz_localize(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") - df = DataFrame({'a': 1}, index=rng) - result = df.tz_localize('utc') - expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) - assert result.index.tz.zone == 'UTC' + df = DataFrame({"a": 1}, index=rng) + result = df.tz_localize("utc") + expected = DataFrame({"a": 1}, rng.tz_localize("UTC")) + assert result.index.tz.zone == "UTC" tm.assert_frame_equal(result, expected) df = df.T - result = df.tz_localize('utc', axis=1) - assert result.columns.tz.zone == 'UTC' + result = df.tz_localize("utc", axis=1) + assert result.columns.tz.zone == "UTC" tm.assert_frame_equal(result, expected.T) def test_frame_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") - df = DataFrame({'a': 1}, index=rng) - result = df.tz_convert('Europe/Berlin') - expected = DataFrame({'a': 1}, rng.tz_convert('Europe/Berlin')) - assert result.index.tz.zone == 'Europe/Berlin' + df = DataFrame({"a": 1}, index=rng) + result = df.tz_convert("Europe/Berlin") + expected = DataFrame({"a": 1}, rng.tz_convert("Europe/Berlin")) + assert result.index.tz.zone == "Europe/Berlin" tm.assert_frame_equal(result, expected) df = df.T - result = df.tz_convert('Europe/Berlin', axis=1) - assert result.columns.tz.zone == 'Europe/Berlin' + result = df.tz_convert("Europe/Berlin", axis=1) + assert result.columns.tz.zone == "Europe/Berlin" tm.assert_frame_equal(result, expected.T) def test_frame_join_tzaware(self): - test1 = DataFrame(np.zeros((6, 3)), - index=date_range("2012-11-15 00:00:00", periods=6, - freq="100L", tz="US/Central")) - test2 = DataFrame(np.zeros((3, 3)), - index=date_range("2012-11-15 00:00:00", periods=3, - freq="250L", tz="US/Central"), - columns=range(3, 6)) - - result = test1.join(test2, how='outer') + test1 = DataFrame( + np.zeros((6, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=6, freq="100L", tz="US/Central" + ), + ) + test2 = DataFrame( + np.zeros((3, 3)), + index=date_range( + "2012-11-15 00:00:00", periods=3, freq="250L", tz="US/Central" + ), + columns=range(3, 6), + ) + + result = test1.join(test2, how="outer") ex_index = test1.index.union(test2.index) tm.assert_index_equal(result.index, ex_index) - assert result.index.tz.zone == 'US/Central' + assert result.index.tz.zone == "US/Central" def test_frame_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') - df = DataFrame(np.random.randn(len(rng)), index=rng, columns=['a']) + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") + df = DataFrame(np.random.randn(len(rng)), index=rng, columns=["a"]) - df_moscow = df.tz_convert('Europe/Moscow') + df_moscow = df.tz_convert("Europe/Moscow") result = df + df_moscow assert result.index.tz is pytz.utc @@ -112,8 +120,8 @@ def test_frame_add_tz_mismatch_converts_to_utc(self): assert result.index.tz is pytz.utc def test_frame_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', 
tz='US/Eastern') - idx2 = date_range('2001', periods=5, freq='2H', tz='US/Eastern') + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") + idx2 = date_range("2001", periods=5, freq="2H", tz="US/Eastern") df1 = DataFrame(np.random.randn(len(idx1), 3), idx1) df2 = DataFrame(np.random.randn(len(idx2), 3), idx2) new1, new2 = df1.align(df2) @@ -123,7 +131,7 @@ def test_frame_align_aware(self): # different timezones convert to UTC # frame with frame - df1_central = df1.tz_convert('US/Central') + df1_central = df1.tz_convert("US/Central") new1, new2 = df1.align(df1_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC @@ -137,72 +145,71 @@ def test_frame_align_aware(self): assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_no_datetime64_dtype(self, tz): # after GH#7822 # these retain the timezones on dict construction - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") dr_tz = dr.tz_localize(tz) - df = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - tz_expected = DatetimeTZDtype('ns', dr_tz.tzinfo) - assert df['B'].dtype == tz_expected + df = DataFrame({"A": "foo", "B": dr_tz}, index=dr) + tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo) + assert df["B"].dtype == tz_expected # GH#2810 (with timezones) datetimes_naive = [ts.to_pydatetime() for ts in dr] datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz] - df = DataFrame({'dr': dr}) - df['dr_tz'] = dr_tz - df['datetimes_naive'] = datetimes_naive - df['datetimes_with_tz'] = datetimes_with_tz + df = DataFrame({"dr": dr}) + df["dr_tz"] = dr_tz + df["datetimes_naive"] = datetimes_naive + df["datetimes_with_tz"] = datetimes_with_tz result = df.dtypes - expected = Series([ - np.dtype('datetime64[ns]'), - DatetimeTZDtype(tz=tz), - np.dtype('datetime64[ns]'), - DatetimeTZDtype(tz=tz) - ], - index=['dr', 'dr_tz', 'datetimes_naive', 'datetimes_with_tz']) + expected = Series( + [ + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + np.dtype("datetime64[ns]"), + DatetimeTZDtype(tz=tz), + ], + index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"], + ) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_frame_reset_index(self, tz): - dr = date_range('2012-06-02', periods=10, tz=tz) + dr = date_range("2012-06-02", periods=10, tz=tz) df = DataFrame(np.random.randn(len(dr)), dr) - roundtripped = df.reset_index().set_index('index') + roundtripped = df.reset_index().set_index("index") xp = df.index.tz rs = roundtripped.index.tz assert xp == rs - @pytest.mark.parametrize('tz', [None, 'America/New_York']) + @pytest.mark.parametrize("tz", [None, "America/New_York"]) def test_boolean_compare_transpose_tzindex_with_dst(self, tz): # GH 19970 - idx = date_range('20161101', '20161130', freq='4H', tz=tz) - df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))}, - index=idx) + idx = date_range("20161101", "20161130", freq="4H", tz=tz) + df = DataFrame({"a": range(len(idx)), "b": range(len(idx))}, index=idx) result = df.T == df.T - expected = DataFrame(True, index=list('ab'), columns=idx) + expected = DataFrame(True, index=list("ab"), columns=idx) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) - 
@pytest.mark.parametrize('method, tz', [ - ['tz_localize', None], - ['tz_convert', 'Europe/Berlin'] - ]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): # GH 6326 - result = DataFrame(np.arange(0, 5), - index=date_range('20131027', periods=5, - freq='1H', tz=tz)) - getattr(result, method)('UTC', copy=copy) - expected = DataFrame(np.arange(0, 5), - index=date_range('20131027', periods=5, - freq='1H', tz=tz)) + result = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = DataFrame( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) tm.assert_frame_equal(result, expected) def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): # GH 25843 tz = tz_aware_fixture - result = DataFrame({'d': [pd.Timestamp('2019', tz=tz)]}, - dtype='datetime64[ns]') - expected = DataFrame({'d': [pd.Timestamp('2019')]}) + result = DataFrame({"d": [pd.Timestamp("2019", tz=tz)]}, dtype="datetime64[ns]") + expected = DataFrame({"d": [pd.Timestamp("2019")]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index cfe9e00a47db5..33f29c6f8acb5 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -9,24 +9,42 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv, - to_datetime) + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + read_csv, + to_datetime, +) import pandas.core.common as com from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean, - makeCustomDataframe as mkdf) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + ensure_clean, + makeCustomDataframe as mkdf, +) from pandas.io.common import _get_handle -MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] -MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', - 'int32', 'int64'] +MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] +MIXED_INT_DTYPES = [ + "uint8", + "uint16", + "uint32", + "uint64", + "int8", + "int16", + "int32", + "int64", +] class TestDataFrameToCSV(TestData): - def read_csv(self, path, **kwargs): params = dict(index_col=0, parse_dates=True) params.update(**kwargs) @@ -35,11 +53,11 @@ def read_csv(self, path, **kwargs): def test_to_csv_from_csv1(self): - with ensure_clean('__tmp_to_csv_from_csv1__') as path: - self.frame['A'][:5] = np.nan + with ensure_clean("__tmp_to_csv_from_csv1__") as path: + self.frame["A"][:5] = np.nan self.frame.to_csv(path) - self.frame.to_csv(path, columns=['A', 'B']) + self.frame.to_csv(path, columns=["A", "B"]) self.frame.to_csv(path, header=False) self.frame.to_csv(path, index=False) @@ -48,10 +66,10 @@ def test_to_csv_from_csv1(self): recons = self.read_csv(path) assert_frame_equal(self.tsframe, recons) - self.tsframe.to_csv(path, index_label='index') + self.tsframe.to_csv(path, index_label="index") recons = self.read_csv(path, index_col=None) - assert(len(recons.columns) == len(self.tsframe.columns) + 1) + assert len(recons.columns) == len(self.tsframe.columns) + 1 # no index self.tsframe.to_csv(path, index=False) @@ -59,8 +77,12 @@ 
def test_to_csv_from_csv1(self): assert_almost_equal(self.tsframe.values, recons.values) # corner case - dm = DataFrame({'s1': Series(range(3), index=np.arange(3)), - 's2': Series(range(2), index=np.arange(2))}) + dm = DataFrame( + { + "s1": Series(range(3), index=np.arange(3)), + "s2": Series(range(2), index=np.arange(2)), + } + ) dm.to_csv(path) recons = self.read_csv(path) @@ -68,27 +90,25 @@ def test_to_csv_from_csv1(self): def test_to_csv_from_csv2(self): - with ensure_clean('__tmp_to_csv_from_csv2__') as path: + with ensure_clean("__tmp_to_csv_from_csv2__") as path: # duplicate index - df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'], - columns=['x', 'y', 'z']) + df = DataFrame( + np.random.randn(3, 3), index=["a", "a", "b"], columns=["x", "y", "z"] + ) df.to_csv(path) result = self.read_csv(path) assert_frame_equal(result, df) - midx = MultiIndex.from_tuples( - [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)]) - df = DataFrame(np.random.randn(3, 3), index=midx, - columns=['x', 'y', 'z']) + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + df = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) df.to_csv(path) - result = self.read_csv(path, index_col=[0, 1, 2], - parse_dates=False) + result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df, check_names=False) # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) + col_aliases = Index(["AA", "X", "Y", "Z"]) self.frame2.to_csv(path, header=col_aliases) rs = self.read_csv(path) @@ -98,16 +118,16 @@ def test_to_csv_from_csv2(self): msg = "Writing 4 cols but got 2 aliases" with pytest.raises(ValueError, match=msg): - self.frame2.to_csv(path, header=['AA', 'X']) + self.frame2.to_csv(path, header=["AA", "X"]) def test_to_csv_from_csv3(self): - with ensure_clean('__tmp_to_csv_from_csv3__') as path: + with ensure_clean("__tmp_to_csv_from_csv3__") as path: df1 = DataFrame(np.random.randn(3, 1)) df2 = DataFrame(np.random.randn(3, 1)) df1.to_csv(path) - df2.to_csv(path, mode='a', header=False) + df2.to_csv(path, mode="a", header=False) xp = pd.concat([df1, df2]) rs = pd.read_csv(path, index_col=0) rs.columns = [int(label) for label in rs.columns] @@ -116,34 +136,38 @@ def test_to_csv_from_csv3(self): def test_to_csv_from_csv4(self): - with ensure_clean('__tmp_to_csv_from_csv4__') as path: + with ensure_clean("__tmp_to_csv_from_csv4__") as path: # GH 10833 (TimedeltaIndex formatting) dt = pd.Timedelta(seconds=1) - df = pd.DataFrame({'dt_data': [i * dt for i in range(3)]}, - index=pd.Index([i * dt for i in range(3)], - name='dt_index')) + df = pd.DataFrame( + {"dt_data": [i * dt for i in range(3)]}, + index=pd.Index([i * dt for i in range(3)], name="dt_index"), + ) df.to_csv(path) - result = pd.read_csv(path, index_col='dt_index') + result = pd.read_csv(path, index_col="dt_index") result.index = pd.to_timedelta(result.index) # TODO: remove renaming when GH 10875 is solved - result.index = result.index.rename('dt_index') - result['dt_data'] = pd.to_timedelta(result['dt_data']) + result.index = result.index.rename("dt_index") + result["dt_data"] = pd.to_timedelta(result["dt_data"]) assert_frame_equal(df, result, check_index_type=True) def test_to_csv_from_csv5(self): # tz, 8260 - with ensure_clean('__tmp_to_csv_from_csv5__') as path: + with ensure_clean("__tmp_to_csv_from_csv5__") as path: self.tzframe.to_csv(path) - result = pd.read_csv(path, index_col=0, parse_dates=['A']) - - converter = lambda c: to_datetime(result[c]).dt.tz_convert( - 
'UTC').dt.tz_convert(self.tzframe[c].dt.tz) - result['B'] = converter('B') - result['C'] = converter('C') + result = pd.read_csv(path, index_col=0, parse_dates=["A"]) + + converter = ( + lambda c: to_datetime(result[c]) + .dt.tz_convert("UTC") + .dt.tz_convert(self.tzframe[c].dt.tz) + ) + result["B"] = converter("B") + result["C"] = converter("C") assert_frame_equal(result, self.tzframe) def test_to_csv_cols_reordering(self): @@ -178,8 +202,7 @@ def _check_df(df, cols=None): if df.columns.is_unique: rs_c.columns = cols else: - indexer, missing = df.columns.get_indexer_non_unique( - cols) + indexer, missing = df.columns.get_indexer_non_unique(cols) rs_c.columns = df.columns.take(indexer) for c in cols: @@ -188,8 +211,7 @@ def _check_df(df, cols=None): if isinstance(obj_df, Series): assert_series_equal(obj_df, obj_rs) else: - assert_frame_equal( - obj_df, obj_rs, check_names=False) + assert_frame_equal(obj_df, obj_rs, check_names=False) # wrote in the same order else: @@ -201,11 +223,11 @@ def _check_df(df, cols=None): # dupe cols df = mkdf(N, 3) - df.columns = ['a', 'a', 'b'] + df.columns = ["a", "a", "b"] _check_df(df, None) # dupe cols with selection - cols = ['b', 'a'] + cols = ["b", "a"] _check_df(df, cols) @pytest.mark.slow @@ -216,7 +238,7 @@ def test_to_csv_dtnat(self): def make_dtnat_arr(n, nnat=None): if nnat is None: nnat = int(n * 0.1) # 10% - s = list(date_range('2000', freq='5min', periods=n)) + s = list(date_range("2000", freq="5min", periods=n)) if nnat: for i in np.random.randint(0, len(s), nnat): s[i] = NaT @@ -231,154 +253,204 @@ def make_dtnat_arr(n, nnat=None): s2 = make_dtnat_arr(chunksize + 5, 0) # s3=make_dtnjat_arr(chunksize+5,0) - with ensure_clean('1.csv') as pth: + with ensure_clean("1.csv") as pth: df = DataFrame(dict(a=s1, b=s2)) df.to_csv(pth, chunksize=chunksize) - recons = self.read_csv(pth)._convert(datetime=True, - coerce=True) - assert_frame_equal(df, recons, check_names=False, - check_less_precise=True) + recons = self.read_csv(pth)._convert(datetime=True, coerce=True) + assert_frame_equal(df, recons, check_names=False, check_less_precise=True) @pytest.mark.slow def test_to_csv_moar(self): - - def _do_test(df, r_dtype=None, c_dtype=None, - rnlvl=None, cnlvl=None, dupe_col=False): + def _do_test( + df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False + ): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: - kwargs['index_col'] = list(range(rnlvl)) - kwargs['header'] = list(range(cnlvl)) + kwargs["index_col"] = list(range(rnlvl)) + kwargs["header"] = list(range(cnlvl)) - with ensure_clean('__tmp_to_csv_moar__') as path: - df.to_csv(path, encoding='utf8', - chunksize=chunksize) + with ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: - kwargs['header'] = 0 + kwargs["header"] = 0 - with ensure_clean('__tmp_to_csv_moar__') as path: - df.to_csv(path, encoding='utf8', chunksize=chunksize) + with ensure_clean("__tmp_to_csv_moar__") as path: + df.to_csv(path, encoding="utf8", chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, str): - return x.decode('utf8') + return x.decode("utf8") return x + if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. 
monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: - delta_lvl = [recons.iloc[ - :, i].values for i in range(rnlvl - 1)] + delta_lvl = [recons.iloc[:, i].values for i in range(rnlvl - 1)] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix - recons = recons.iloc[:, rnlvl - 1:] + recons = recons.iloc[:, rnlvl - 1 :] - type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') + type_map = dict(i="i", f="f", s="O", u="O", dt="O", p="O") if r_dtype: - if r_dtype == 'u': # unicode - r_dtype = 'O' + if r_dtype == "u": # unicode + r_dtype = "O" recons.index = np.array( - [_to_uni(label) for label in recons.index], - dtype=r_dtype) + [_to_uni(label) for label in recons.index], dtype=r_dtype + ) df.index = np.array( - [_to_uni(label) for label in df.index], dtype=r_dtype) - elif r_dtype == 'dt': # unicode - r_dtype = 'O' + [_to_uni(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "dt": # unicode + r_dtype = "O" recons.index = np.array( - [Timestamp(label) for label in recons.index], - dtype=r_dtype) + [Timestamp(label) for label in recons.index], dtype=r_dtype + ) df.index = np.array( - [Timestamp(label) for label in df.index], - dtype=r_dtype) - elif r_dtype == 'p': - r_dtype = 'O' + [Timestamp(label) for label in df.index], dtype=r_dtype + ) + elif r_dtype == "p": + r_dtype = "O" idx_list = to_datetime(recons.index) recons.index = np.array( - [Timestamp(label) for label in idx_list], - dtype=r_dtype) + [Timestamp(label) for label in idx_list], dtype=r_dtype + ) df.index = np.array( - list(map(Timestamp, df.index.to_timestamp())), - dtype=r_dtype) + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype + ) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: - if c_dtype == 'u': - c_dtype = 'O' + if c_dtype == "u": + c_dtype = "O" recons.columns = np.array( - [_to_uni(label) for label in recons.columns], - dtype=c_dtype) + [_to_uni(label) for label in recons.columns], dtype=c_dtype + ) df.columns = np.array( - [_to_uni(label) for label in df.columns], - dtype=c_dtype) - elif c_dtype == 'dt': - c_dtype = 'O' + [_to_uni(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "dt": + c_dtype = "O" recons.columns = np.array( - [Timestamp(label) for label in recons.columns], - dtype=c_dtype) + [Timestamp(label) for label in recons.columns], dtype=c_dtype + ) df.columns = np.array( - [Timestamp(label) for label in df.columns], - dtype=c_dtype) - elif c_dtype == 'p': - c_dtype = 'O' + [Timestamp(label) for label in df.columns], dtype=c_dtype + ) + elif c_dtype == "p": + c_dtype = "O" col_list = to_datetime(recons.columns) recons.columns = np.array( - [Timestamp(label) for label in col_list], - dtype=c_dtype) + [Timestamp(label) for label in col_list], dtype=c_dtype + ) col_list = df.columns.to_timestamp() df.columns = np.array( - [Timestamp(label) for label in col_list], - dtype=c_dtype) + [Timestamp(label) for label in col_list], dtype=c_dtype + ) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) - assert_frame_equal(df, recons, check_names=False, - check_less_precise=True) + assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 
2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type='dt', - c_idx_type='s'), 'dt', 's') + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") for ncols in [4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type='dt', - c_idx_type='s'), 'dt', 's') + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test(mkdf(nrows, ncols, r_idx_type="dt", c_idx_type="s"), "dt", "s") pass - for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), - ('p', 'p')]: + for r_idx_type, c_idx_type in [("i", "i"), ("s", "s"), ("u", "dt"), ("p", "p")]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: - _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type, - c_idx_type=c_idx_type), - r_idx_type, c_idx_type) + for nrows in [ + 2, + 10, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: + _do_test( + mkdf( + nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type + ), + r_idx_type, + c_idx_type, + ) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) - for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: @@ -397,20 +469,36 @@ def _to_uni(x): _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) - for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, - 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, - base - 1, base, base + 1]: + for nrows in [ + 10, + N - 2, + N - 1, + N, + N + 1, + N + 2, + 2 * N - 2, + 2 * N - 1, + 2 * N, + 2 * N + 1, + 2 * N + 2, + base - 1, + base, + base + 1, + ]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) - _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), - rnlvl=2, cnlvl=2) + _do_test( + mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), + rnlvl=2, + cnlvl=2, + ) def test_to_csv_from_csv_w_some_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame['G'] = np.nan - f = lambda x: [np.inf, np.nan][np.random.rand() < .5] - self.frame['H'] = self.frame.index.map(f) + self.frame["G"] = np.nan + f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5] + self.frame["H"] = self.frame.index.map(f) with ensure_clean() as path: self.frame.to_csv(path) @@ -418,14 +506,15 @@ def test_to_csv_from_csv_w_some_infs(self): # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) - assert_frame_equal(np.isinf(self.frame), - np.isinf(recons), check_names=False) + assert_frame_equal( + np.isinf(self.frame), np.isinf(recons), check_names=False + ) def 
test_to_csv_from_csv_w_all_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame['E'] = np.inf - self.frame['F'] = -np.inf + self.frame["E"] = np.inf + self.frame["F"] = -np.inf with ensure_clean() as path: self.frame.to_csv(path) @@ -433,17 +522,18 @@ def test_to_csv_from_csv_w_all_infs(self): # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) - assert_frame_equal(np.isinf(self.frame), - np.isinf(recons), check_names=False) + assert_frame_equal( + np.isinf(self.frame), np.isinf(recons), check_names=False + ) def test_to_csv_no_index(self): # GH 3624, after appending columns, to_csv fails - with ensure_clean('__tmp_to_csv_no_index__') as path: - df = DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6]}) + with ensure_clean("__tmp_to_csv_no_index__") as path: + df = DataFrame({"c1": [1, 2, 3], "c2": [4, 5, 6]}) df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) - df['c3'] = Series([7, 8, 9], dtype='int64') + df["c3"] = Series([7, 8, 9], dtype="int64") df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) @@ -452,23 +542,22 @@ def test_to_csv_with_mix_columns(self): # gh-11637: incorrect output when a mix of integer and string column # names passed as columns parameter in to_csv - df = DataFrame({0: ['a', 'b', 'c'], - 1: ['aa', 'bb', 'cc']}) - df['test'] = 'txt' - assert df.to_csv() == df.to_csv(columns=[0, 1, 'test']) + df = DataFrame({0: ["a", "b", "c"], 1: ["aa", "bb", "cc"]}) + df["test"] = "txt" + assert df.to_csv() == df.to_csv(columns=[0, 1, "test"]) def test_to_csv_headers(self): # GH6186, the presence or absence of `index` incorrectly # causes to_csv to have different header semantics. - from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) - to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y']) - with ensure_clean('__tmp_to_csv_headers__') as path: - from_df.to_csv(path, header=['X', 'Y']) + from_df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) + to_df = DataFrame([[1, 2], [3, 4]], columns=["X", "Y"]) + with ensure_clean("__tmp_to_csv_headers__") as path: + from_df.to_csv(path, header=["X", "Y"]) recons = self.read_csv(path) assert_frame_equal(to_df, recons) - from_df.to_csv(path, index=False, header=['X', 'Y']) + from_df.to_csv(path, index=False, header=["X", "Y"]) recons = self.read_csv(path) recons.reset_index(inplace=True) @@ -479,19 +568,18 @@ def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: frame.to_csv(path, header=False) - frame.to_csv(path, columns=['A', 'B']) + frame.to_csv(path, columns=["A", "B"]) # round trip frame.to_csv(path) - df = self.read_csv(path, index_col=[0, 1], - parse_dates=False) + df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) @@ -506,7 +594,7 @@ def test_to_csv_multiindex(self): new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) - tsframe.to_csv(path, index_label=['time', 'foo']) + tsframe.to_csv(path, index_label=["time", "foo"]) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name @@ 
-525,38 +613,36 @@ def test_to_csv_multiindex(self): # needed if setUp becomes class method self.tsframe.index = old_index - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: - names = ['first', 'second'] - return DataFrame(np.random.randint(0, 10, size=(3, 3)), - columns=MultiIndex.from_tuples( - [('bah', 'foo'), - ('bah', 'bar'), - ('ban', 'baz')], names=names), - dtype='int64') + names = ["first", "second"] + return DataFrame( + np.random.randint(0, 10, size=(3, 3)), + columns=MultiIndex.from_tuples( + [("bah", "foo"), ("bah", "bar"), ("ban", "baz")], names=names + ), + dtype="int64", + ) # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], - index_col=[0, 1]) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path) - result = read_csv( - path, header=[0, 1, 2, 3], index_col=0) + result = read_csv(path, header=[0, 1, 2, 3], index_col=0) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path) - result = read_csv(path, header=[0, 1, 2, 3], - index_col=[0, 1, 2]) + result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) assert_frame_equal(df, result) # writing with no index @@ -576,14 +662,12 @@ def _make_frame(names=None): # whatsnew example df = _make_frame() df.to_csv(path) - result = read_csv(path, header=[0, 1], - index_col=[0]) + result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path) - result = read_csv(path, header=[0, 1], - index_col=[0]) + result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) # invalid options @@ -591,16 +675,16 @@ def _make_frame(names=None): df.to_csv(path) for i in [6, 7]: - msg = 'len of {i}, but only 5 lines in file'.format(i=i) + msg = "len of {i}, but only 5 lines in file".format(i=i) with pytest.raises(ParserError, match=msg): read_csv(path, header=list(range(i)), index_col=0) # write with cols - msg = 'cannot specify cols with a MultiIndex' + msg = "cannot specify cols with a MultiIndex" with pytest.raises(TypeError, match=msg): - df.to_csv(path, columns=['foo', 'bar']) + df.to_csv(path, columns=["foo", "bar"]) - with ensure_clean('__tmp_to_csv_multiindex__') as path: + with ensure_clean("__tmp_to_csv_multiindex__") as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) @@ -615,38 +699,40 @@ def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan - with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path: + with ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: df.to_csv(path, na_rep=999) with open(path) as f: lines = f.readlines() - assert lines[1].split(',')[2] == '999' + assert lines[1].split(",")[2] == "999" def test_to_csv_withcommas(self): # Commas inside fields should be correctly escaped when saving as CSV. 
- df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']}) + df = DataFrame({"A": [1, 2, 3], "B": ["5,6", "7,8", "9,0"]}) - with ensure_clean('__tmp_to_csv_withcommas__.csv') as path: + with ensure_clean("__tmp_to_csv_withcommas__.csv") as path: df.to_csv(path) df2 = self.read_csv(path) assert_frame_equal(df2, df) def test_to_csv_mixed(self): - def create_cols(name): return ["%s%03d" % (name, i) for i in range(5)] - df_float = DataFrame(np.random.randn( - 100, 5), dtype='float64', columns=create_cols('float')) - df_int = DataFrame(np.random.randn(100, 5), - dtype='int64', columns=create_cols('int')) - df_bool = DataFrame(True, index=df_float.index, - columns=create_cols('bool')) - df_object = DataFrame('foo', index=df_float.index, - columns=create_cols('object')) - df_dt = DataFrame(Timestamp('20010101'), - index=df_float.index, columns=create_cols('date')) + df_float = DataFrame( + np.random.randn(100, 5), dtype="float64", columns=create_cols("float") + ) + df_int = DataFrame( + np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + ) + df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) + df_object = DataFrame( + "foo", index=df_float.index, columns=create_cols("object") + ) + df_dt = DataFrame( + Timestamp("20010101"), index=df_float.index, columns=create_cols("date") + ) # add in some nans df_float.loc[30:50, 1:3] = np.nan @@ -658,22 +744,29 @@ def create_cols(name): # dtype dtypes = dict() - for n, dtype in [('float', np.float64), ('int', np.int64), - ('bool', np.bool), ('object', np.object)]: + for n, dtype in [ + ("float", np.float64), + ("int", np.int64), + ("bool", np.bool), + ("object", np.object), + ]: for c in create_cols(n): dtypes[c] = dtype with ensure_clean() as filename: df.to_csv(filename) - rs = read_csv(filename, index_col=0, dtype=dtypes, - parse_dates=create_cols('date')) + rs = read_csv( + filename, index_col=0, dtype=dtypes, parse_dates=create_cols("date") + ) assert_frame_equal(rs, df) def test_to_csv_dups_cols(self): - df = DataFrame(np.random.randn(1000, 30), - columns=list(range(15)) + list(range(15)), - dtype='float64') + df = DataFrame( + np.random.randn(1000, 30), + columns=list(range(15)) + list(range(15)), + dtype="float64", + ) with ensure_clean() as filename: df.to_csv(filename) # single dtype, fine @@ -681,14 +774,14 @@ def test_to_csv_dups_cols(self): result.columns = df.columns assert_frame_equal(result, df) - df_float = DataFrame(np.random.randn(1000, 3), dtype='float64') - df_int = DataFrame(np.random.randn(1000, 3), dtype='int64') + df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") + df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) - df_object = DataFrame('foo', index=df_float.index, columns=range(3)) - df_dt = DataFrame(Timestamp('20010101'), - index=df_float.index, columns=range(3)) - df = pd.concat([df_float, df_int, df_bool, df_object, - df_dt], axis=1, ignore_index=True) + df_object = DataFrame("foo", index=df_float.index, columns=range(3)) + df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) + df = pd.concat( + [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True + ) cols = [] for i in range(5): @@ -700,7 +793,7 @@ def test_to_csv_dups_cols(self): result = read_csv(filename, index_col=0) # date cols - for i in ['0.4', '1.4', '2.4']: + for i in ["0.4", "1.4", "2.4"]: result[i] = to_datetime(result[i]) result.columns = df.columns @@ -711,22 +804,22 @@ def 
test_to_csv_dups_cols(self): N = 10 df = mkdf(N, 3) - df.columns = ['a', 'a', 'b'] + df.columns = ["a", "a", "b"] with ensure_clean() as filename: df.to_csv(filename) # read_csv will rename the dups columns result = read_csv(filename, index_col=0) - result = result.rename(columns={'a.1': 'a'}) + result = result.rename(columns={"a.1": "a"}) assert_frame_equal(result, df) def test_to_csv_chunking(self): - aa = DataFrame({'A': range(100000)}) - aa['B'] = aa.A + 1.0 - aa['C'] = aa.A + 2.0 - aa['D'] = aa.A + 3.0 + aa = DataFrame({"A": range(100000)}) + aa["B"] = aa.A + 1.0 + aa["C"] = aa.A + 2.0 + aa["D"] = aa.A + 3.0 for chunksize in [10000, 50000, 100000]: with ensure_clean() as filename: @@ -744,9 +837,9 @@ def test_to_csv_wide_frame_formatting(self): assert_frame_equal(rs, df) def test_to_csv_bug(self): - f1 = StringIO('a,1.0\nb,2.0') + f1 = StringIO("a,1.0\nb,2.0") df = self.read_csv(f1, header=None) - newdf = DataFrame({'t': df[df.columns[0]]}) + newdf = DataFrame({"t": df[df.columns[0]]}) with ensure_clean() as path: newdf.to_csv(path) @@ -757,29 +850,29 @@ def test_to_csv_bug(self): def test_to_csv_unicode(self): - df = DataFrame({'c/\u03c3': [1, 2, 3]}) + df = DataFrame({"c/\u03c3": [1, 2, 3]}) with ensure_clean() as path: - df.to_csv(path, encoding='UTF-8') - df2 = read_csv(path, index_col=0, encoding='UTF-8') + df.to_csv(path, encoding="UTF-8") + df2 = read_csv(path, index_col=0, encoding="UTF-8") assert_frame_equal(df, df2) - df.to_csv(path, encoding='UTF-8', index=False) - df2 = read_csv(path, index_col=None, encoding='UTF-8') + df.to_csv(path, encoding="UTF-8", index=False) + df2 = read_csv(path, index_col=None, encoding="UTF-8") assert_frame_equal(df, df2) def test_to_csv_unicode_index_col(self): - buf = StringIO('') + buf = StringIO("") df = DataFrame( [["\u05d0", "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], - columns=["\u05d0", - "\u05d1", "\u05d2", "\u05d3"], - index=["\u05d0", "\u05d1"]) + columns=["\u05d0", "\u05d1", "\u05d2", "\u05d3"], + index=["\u05d0", "\u05d1"], + ) - df.to_csv(buf, encoding='UTF-8') + df.to_csv(buf, encoding="UTF-8") buf.seek(0) - df2 = read_csv(buf, index_col=0, encoding='UTF-8') + df2 = read_csv(buf, index_col=0, encoding="UTF-8") assert_frame_equal(df, df2) def test_to_csv_stringio(self): @@ -792,93 +885,93 @@ def test_to_csv_stringio(self): def test_to_csv_float_format(self): - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) with ensure_clean() as filename: - df.to_csv(filename, float_format='%.2f') + df.to_csv(filename, float_format="%.2f") rs = read_csv(filename, index_col=0) - xp = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + xp = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) assert_frame_equal(rs, xp) def test_to_csv_unicodewriter_quoting(self): - df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) + df = DataFrame({"A": [1, 2, 3], "B": ["foo", "bar", "baz"]}) buf = StringIO() - df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, - encoding='utf-8') + df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding="utf-8") result = buf.getvalue() - expected_rows = ['"A","B"', - '1,"foo"', - '2,"bar"', - '3,"baz"'] + expected_rows = ['"A","B"', '1,"foo"', '2,"bar"', '3,"baz"'] 
expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_to_csv_quote_none(self): # GH4328 - df = DataFrame({'A': ['hello', '{"hello"}']}) - for encoding in (None, 'utf-8'): + df = DataFrame({"A": ["hello", '{"hello"}']}) + for encoding in (None, "utf-8"): buf = StringIO() - df.to_csv(buf, quoting=csv.QUOTE_NONE, - encoding=encoding, index=False) + df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) result = buf.getvalue() - expected_rows = ['A', - 'hello', - '{"hello"}'] + expected_rows = ["A", "hello", '{"hello"}'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_to_csv_index_no_leading_comma(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['one', 'two', 'three']) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) buf = StringIO() df.to_csv(buf, index_label=False) - expected_rows = ['A,B', - 'one,1,4', - 'two,2,5', - 'three,3,6'] + expected_rows = ["A,B", "one,1,4", "two,2,5", "three,3,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert buf.getvalue() == expected def test_to_csv_line_terminators(self): # see gh-20353 - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, - index=['one', 'two', 'three']) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["one", "two", "three"]) with ensure_clean() as path: # case 1: CRLF as line terminator - df.to_csv(path, line_terminator='\r\n') - expected = b',A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n' + df.to_csv(path, line_terminator="\r\n") + expected = b",A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n" - with open(path, mode='rb') as f: + with open(path, mode="rb") as f: assert f.read() == expected with ensure_clean() as path: # case 2: LF as line terminator - df.to_csv(path, line_terminator='\n') - expected = b',A,B\none,1,4\ntwo,2,5\nthree,3,6\n' + df.to_csv(path, line_terminator="\n") + expected = b",A,B\none,1,4\ntwo,2,5\nthree,3,6\n" - with open(path, mode='rb') as f: + with open(path, mode="rb") as f: assert f.read() == expected with ensure_clean() as path: # case 3: The default line terminator(=os.linesep)(gh-21406) df.to_csv(path) - os_linesep = os.linesep.encode('utf-8') - expected = (b',A,B' + os_linesep + b'one,1,4' + os_linesep + - b'two,2,5' + os_linesep + b'three,3,6' + os_linesep) - - with open(path, mode='rb') as f: + os_linesep = os.linesep.encode("utf-8") + expected = ( + b",A,B" + + os_linesep + + b"one,1,4" + + os_linesep + + b"two,2,5" + + os_linesep + + b"three,3,6" + + os_linesep + ) + + with open(path, mode="rb") as f: assert f.read() == expected def test_to_csv_from_csv_categorical(self): @@ -915,95 +1008,116 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - @pytest.mark.parametrize('df,encoding', [ - (DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']), None), - # GH 21241, 21118 - (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), - (DataFrame(5 * [[123, "你好", "世界"]], - columns=['X', 'Y', 'Z']), 'gb2312'), - (DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], - columns=['X', 'Y', 'Z']), 'cp737') - ]) + @pytest.mark.parametrize( + "df,encoding", + [ + ( + DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ), + None, + ), + # GH 21241, 21118 + (DataFrame([["abc", "def", "ghi"]], columns=["X", "Y", "Z"]), "ascii"), + 
(DataFrame(5 * [[123, "你好", "世界"]], columns=["X", "Y", "Z"]), "gb2312"), + ( + DataFrame(5 * [[123, "Γειά σου", "Κόσμε"]], columns=["X", "Y", "Z"]), + "cp737", + ), + ], + ) def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv - result = read_csv(filename, compression=compression, - index_col=0, encoding=encoding) + result = read_csv( + filename, compression=compression, index_col=0, encoding=encoding + ) assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle(filename, 'w', compression=compression, - encoding=encoding) + f, _handles = _get_handle( + filename, "w", compression=compression, encoding=encoding + ) with f: df.to_csv(f, encoding=encoding) - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_frame_equal(df, result) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or 'utf8') + text = fh.read().decode(encoding or "utf8") for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: - assert_frame_equal(df, read_csv(fh, - index_col=0, - encoding=encoding)) + assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) def test_to_csv_date_format(self): - with ensure_clean('__tmp_to_csv_date_format__') as path: + with ensure_clean("__tmp_to_csv_date_format__") as path: dt_index = self.tsframe.index datetime_frame = DataFrame( - {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) - datetime_frame.to_csv(path, date_format='%Y%m%d') + {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index + ) + datetime_frame.to_csv(path, date_format="%Y%m%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_int = datetime_frame.applymap( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) datetime_frame_int.index = datetime_frame_int.index.map( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) assert_frame_equal(test, datetime_frame_int) - datetime_frame.to_csv(path, date_format='%Y-%m-%d') + datetime_frame.to_csv(path, date_format="%Y-%m-%d") # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_str = datetime_frame.applymap( - lambda x: x.strftime('%Y-%m-%d')) + lambda x: x.strftime("%Y-%m-%d") + ) datetime_frame_str.index = datetime_frame_str.index.map( - lambda x: x.strftime('%Y-%m-%d')) + lambda x: x.strftime("%Y-%m-%d") + ) assert_frame_equal(test, datetime_frame_str) # Check that columns get converted datetime_frame_columns = datetime_frame.T - datetime_frame_columns.to_csv(path, date_format='%Y%m%d') + datetime_frame_columns.to_csv(path, date_format="%Y%m%d") test = read_csv(path, index_col=0) datetime_frame_columns = datetime_frame_columns.applymap( - lambda x: int(x.strftime('%Y%m%d'))) + lambda x: int(x.strftime("%Y%m%d")) + ) # Columns don't get converted to ints by read_csv - datetime_frame_columns.columns = ( - datetime_frame_columns.columns - .map(lambda x: x.strftime('%Y%m%d'))) + datetime_frame_columns.columns = datetime_frame_columns.columns.map( + lambda x: x.strftime("%Y%m%d") + ) assert_frame_equal(test, 
datetime_frame_columns) # test NaTs nat_index = to_datetime( - ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) - nat_frame = DataFrame({'A': nat_index}, index=nat_index) - nat_frame.to_csv(path, date_format='%Y-%m-%d') + ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ) + nat_frame = DataFrame({"A": nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format="%Y-%m-%d") test = read_csv(path, parse_dates=[0, 1], index_col=0) @@ -1011,57 +1125,65 @@ def test_to_csv_date_format(self): def test_to_csv_with_dst_transitions(self): - with ensure_clean('csv_date_format_with_dst') as path: + with ensure_clean("csv_date_format_with_dst") as path: # make sure we are not failing on transitions - times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous='infer') - - for i in [times, times + pd.Timedelta('10s')]: - time_range = np.array(range(len(i)), dtype='int64') - df = DataFrame({'A': time_range}, index=i) + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10s")]: + time_range = np.array(range(len(i)), dtype="int64") + df = DataFrame({"A": time_range}, index=i) df.to_csv(path, index=True) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) result.index = to_datetime(result.index, utc=True).tz_convert( - 'Europe/London') + "Europe/London" + ) assert_frame_equal(result, df) # GH11619 - idx = pd.date_range('2015-01-01', '2015-12-31', - freq='H', tz='Europe/Paris') - df = DataFrame({'values': 1, 'idx': idx}, - index=idx) - with ensure_clean('csv_date_format_with_dst') as path: + idx = pd.date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris") + df = DataFrame({"values": 1, "idx": idx}, index=idx) + with ensure_clean("csv_date_format_with_dst") as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) result.index = to_datetime(result.index, utc=True).tz_convert( - 'Europe/Paris') - result['idx'] = to_datetime(result['idx'], utc=True).astype( - 'datetime64[ns, Europe/Paris]') + "Europe/Paris" + ) + result["idx"] = to_datetime(result["idx"], utc=True).astype( + "datetime64[ns, Europe/Paris]" + ) assert_frame_equal(result, df) # assert working df.astype(str) - with ensure_clean('csv_date_format_with_dst') as path: + with ensure_clean("csv_date_format_with_dst") as path: df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) def test_to_csv_quoting(self): - df = DataFrame({ - 'c_bool': [True, False], - 'c_float': [1.0, 3.2], - 'c_int': [42, np.nan], - 'c_string': ['a', 'b,c'], - }) - - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,"b,c"'] + df = DataFrame( + { + "c_bool": [True, False], + "c_float": [1.0, 3.2], + "c_int": [42, np.nan], + "c_string": ["a", "b,c"], + } + ) + + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv() @@ -1070,17 +1192,21 @@ def test_to_csv_quoting(self): result = df.to_csv(quoting=None) assert result == expected - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,"b,c"'] + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + '1,False,3.2,,"b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_MINIMAL) assert 
result == expected - expected_rows = ['"","c_bool","c_float","c_int","c_string"', - '"0","True","1.0","42.0","a"', - '"1","False","3.2","","b,c"'] + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '"0","True","1.0","42.0","a"', + '"1","False","3.2","","b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_ALL) @@ -1088,9 +1214,11 @@ def test_to_csv_quoting(self): # see gh-12922, gh-13259: make sure changes to # the formatters do not break this behaviour - expected_rows = ['"","c_bool","c_float","c_int","c_string"', - '0,True,1.0,42.0,"a"', - '1,False,3.2,"","b,c"'] + expected_rows = [ + '"","c_bool","c_float","c_int","c_string"', + '0,True,1.0,42.0,"a"', + '1,False,3.2,"","b,c"', + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) assert result == expected @@ -1102,41 +1230,40 @@ def test_to_csv_quoting(self): with pytest.raises(csv.Error, match=msg): df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None) - expected_rows = [',c_bool,c_float,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,b!,c'] + expected_rows = [ + ",c_bool,c_float,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,b!,c", + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(quoting=csv.QUOTE_NONE, - escapechar='!') + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="!") assert result == expected - expected_rows = [',c_bool,c_ffloat,c_int,c_string', - '0,True,1.0,42.0,a', - '1,False,3.2,,bf,c'] + expected_rows = [ + ",c_bool,c_ffloat,c_int,c_string", + "0,True,1.0,42.0,a", + "1,False,3.2,,bf,c", + ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(quoting=csv.QUOTE_NONE, - escapechar='f') + result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar="f") assert result == expected # see gh-3503: quoting Windows line terminators # presents with encoding? 
- text_rows = ['a,b,c', - '1,"test \r\n",3'] + text_rows = ["a,b,c", '1,"test \r\n",3'] text = tm.convert_rows_list_to_csv_str(text_rows) df = pd.read_csv(StringIO(text)) buf = StringIO() - df.to_csv(buf, encoding='utf-8', index=False) + df.to_csv(buf, encoding="utf-8", index=False) assert buf.getvalue() == text # xref gh-7791: make sure the quoting parameter is passed through # with multi-indexes - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - df = df.set_index(['a', 'b']) + df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) + df = df.set_index(["a", "b"]) - expected_rows = ['"a","b","c"', - '"1","3","5"', - '"2","4","6"'] + expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"'] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(quoting=csv.QUOTE_ALL) == expected @@ -1149,20 +1276,14 @@ def test_period_index_date_overflow(self): df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() - expected_rows = [',0', - '1990-01-01,4', - '2000-01-01,5', - '3005-01-01,6'] + expected_rows = [",0", "1990-01-01,4", "2000-01-01,5", "3005-01-01,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected date_format = "%m-%d-%Y" result = df.to_csv(date_format=date_format) - expected_rows = [',0', - '01-01-1990,4', - '01-01-2000,5', - '01-01-3005,6'] + expected_rows = [",0", "01-01-1990,4", "01-01-2000,5", "01-01-3005,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected @@ -1173,26 +1294,20 @@ def test_period_index_date_overflow(self): df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() - expected_rows = [',0', - '1990-01-01,4', - ',5', - '3005-01-01,6'] + expected_rows = [",0", "1990-01-01,4", ",5", "3005-01-01,6"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_multi_index_header(self): # see gh-5539 - columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), - ("b", 1), ("b", 2)]) + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) df.columns = columns header = ["a", "b", "c", "d"] result = df.to_csv(header=header) - expected_rows = [',a,b,c,d', - '0,1,2,3,4', - '1,5,6,7,8'] + expected_rows = [",a,b,c,d", "0,1,2,3,4", "1,5,6,7,8"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected @@ -1202,17 +1317,17 @@ def test_to_csv_single_level_multi_index(self): df = pd.DataFrame([[1, 2, 3]], columns=index) df = df.reindex(columns=[(1,), (3,)]) expected = ",1,3\n0,1,3\n" - result = df.to_csv(line_terminator='\n') + result = df.to_csv(line_terminator="\n") assert_almost_equal(result, expected) def test_gz_lineend(self): # GH 25311 - df = pd.DataFrame({'a': [1, 2]}) - expected_rows = ['a', '1', '2'] + df = pd.DataFrame({"a": [1, 2]}) + expected_rows = ["a", "1", "2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - with ensure_clean('__test_gz_lineend.csv.gz') as path: + with ensure_clean("__test_gz_lineend.csv.gz") as path: df.to_csv(path, index=False) - with tm.decompress_file(path, compression='gzip') as f: - result = f.read().decode('utf-8') + with tm.decompress_file(path, compression="gzip") as f: + result = f.read().decode("utf-8") assert result == expected diff --git a/pandas/tests/frame/test_validate.py b/pandas/tests/frame/test_validate.py index 8597d91550c77..c7270322b980c 100644 --- a/pandas/tests/frame/test_validate.py +++ b/pandas/tests/frame/test_validate.py @@ -5,18 +5,27 @@ @pytest.fixture def dataframe(): 
- return DataFrame({'a': [1, 2], 'b': [3, 4]}) + return DataFrame({"a": [1, 2], "b": [3, 4]}) class TestDataFrameValidate: """Tests for error handling related to data types of method arguments.""" - @pytest.mark.parametrize("func", ["query", "eval", "set_index", - "reset_index", "dropna", - "drop_duplicates", "sort_values"]) + @pytest.mark.parametrize( + "func", + [ + "query", + "eval", + "set_index", + "reset_index", + "dropna", + "drop_duplicates", + "sort_values", + ], + ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, dataframe, func, inplace): - msg = "For argument \"inplace\" expected type bool" + msg = 'For argument "inplace" expected type bool' kwargs = dict(inplace=inplace) if func == "query": diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 4646c7c9196a3..ff9895cf14318 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -11,12 +11,16 @@ from pandas import DataFrame, MultiIndex, Series, date_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from .test_generic import Generic try: import xarray + _XARRAY_INSTALLED = True except ImportError: _XARRAY_INSTALLED = False @@ -27,32 +31,33 @@ class TestDataFrame(Generic): _comparator = lambda self, x, y: assert_frame_equal(x, y) def test_rename_mi(self): - df = DataFrame([ - 11, 21, 31 - ], index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]])) + df = DataFrame( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) df.rename(str.lower) def test_set_axis_name(self): df = pd.DataFrame([[1, 2], [3, 4]]) - funcs = ['_set_axis_name', 'rename_axis'] + funcs = ["_set_axis_name", "rename_axis"] for func in funcs: - result = methodcaller(func, 'foo')(df) + result = methodcaller(func, "foo")(df) assert df.index.name is None - assert result.index.name == 'foo' + assert result.index.name == "foo" - result = methodcaller(func, 'cols', axis=1)(df) + result = methodcaller(func, "cols", axis=1)(df) assert df.columns.name is None - assert result.columns.name == 'cols' + assert result.columns.name == "cols" def test_set_axis_name_mi(self): df = DataFrame( np.empty((3, 3)), - index=MultiIndex.from_tuples([("A", x) for x in list('aBc')]), - columns=MultiIndex.from_tuples([('C', x) for x in list('xyz')]) + index=MultiIndex.from_tuples([("A", x) for x in list("aBc")]), + columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]), ) - level_names = ['L1', 'L2'] - funcs = ['_set_axis_name', 'rename_axis'] + level_names = ["L1", "L2"] + funcs = ["_set_axis_name", "rename_axis"] for func in funcs: result = methodcaller(func, level_names)(df) assert result.index.names == level_names @@ -80,7 +85,7 @@ def test_nonzero_single_element(self): def test_get_numeric_data_preserve_dtype(self): # get the numeric data - o = DataFrame({'A': [1, '2', 3.]}) + o = DataFrame({"A": [1, "2", 3.0]}) result = o._get_numeric_data() expected = DataFrame(index=[0, 1, 2], dtype=object) self._compare(result, expected) @@ -89,17 +94,22 @@ def test_metadata_propagation_indiv(self): # groupby df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - result = df.groupby('A').sum() + { + "A": ["foo", "bar", 
"foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + result = df.groupby("A").sum() self.check_metadata(df, result) # resample - df = DataFrame(np.random.randn(1000, 2), - index=date_range('20130101', periods=1000, freq='s')) - result = df.resample('1T') + df = DataFrame( + np.random.randn(1000, 2), + index=date_range("20130101", periods=1000, freq="s"), + ) + result = df.resample("1T") self.check_metadata(df, result) # merging with override @@ -108,41 +118,40 @@ def test_metadata_propagation_indiv(self): _finalize = DataFrame.__finalize__ np.random.seed(10) - df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['a', 'b']) - df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=['c', 'd']) - DataFrame._metadata = ['filename'] - df1.filename = 'fname1.csv' - df2.filename = 'fname2.csv' + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["a", "b"]) + df2 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=["c", "d"]) + DataFrame._metadata = ["filename"] + df1.filename = "fname1.csv" + df2.filename = "fname2.csv" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'merge': + if method == "merge": left, right = other.left, other.right - value = getattr(left, name, '') + '|' + getattr(right, - name, '') + value = getattr(left, name, "") + "|" + getattr(right, name, "") object.__setattr__(self, name, value) else: - object.__setattr__(self, name, getattr(other, name, '')) + object.__setattr__(self, name, getattr(other, name, "")) return self DataFrame.__finalize__ = finalize - result = df1.merge(df2, left_on=['a'], right_on=['c'], how='inner') - assert result.filename == 'fname1.csv|fname2.csv' + result = df1.merge(df2, left_on=["a"], right_on=["c"], how="inner") + assert result.filename == "fname1.csv|fname2.csv" # concat # GH 6927 - DataFrame._metadata = ['filename'] - df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list('ab')) - df1.filename = 'foo' + DataFrame._metadata = ["filename"] + df1 = DataFrame(np.random.randint(0, 4, (3, 2)), columns=list("ab")) + df1.filename = "foo" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'concat': - value = '+'.join([getattr( - o, name) for o in other.objs if getattr(o, name, None) - ]) + if method == "concat": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, None)) @@ -152,7 +161,7 @@ def finalize(self, other, method=None, **kwargs): DataFrame.__finalize__ = finalize result = pd.concat([df1, df1]) - assert result.filename == 'foo+foo' + assert result.filename == "foo+foo" # reset DataFrame._metadata = _metadata @@ -161,48 +170,59 @@ def finalize(self, other, method=None, **kwargs): def test_set_attribute(self): # Test for consistent setattr behavior when an attribute and a column # have the same name (Issue #8994) - df = DataFrame({'x': [1, 2, 3]}) + df = DataFrame({"x": [1, 2, 3]}) df.y = 2 - df['y'] = [2, 4, 6] + df["y"] = [2, 4, 6] df.y = 5 assert df.y == 5 - assert_series_equal(df['y'], Series([2, 4, 6], name='y')) - - @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and - LooseVersion(xarray.__version__) < - LooseVersion('0.10.0'), - reason='xarray >= 0.10.0 required') + assert_series_equal(df["y"], Series([2, 4, 6], name="y")) + + @pytest.mark.skipif( + not 
_XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) @pytest.mark.parametrize( - "index", ['FloatIndex', 'IntIndex', - 'StringIndex', 'UnicodeIndex', - 'DateIndex', 'PeriodIndex', - 'CategoricalIndex', 'TimedeltaIndex']) + "index", + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "CategoricalIndex", + "TimedeltaIndex", + ], + ) def test_to_xarray_index_types(self, index): from xarray import Dataset - index = getattr(tm, 'make{}'.format(index)) - df = DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', - periods=3, - tz='US/Eastern')} - ) + index = getattr(tm, "make{}".format(index)) + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) df.index = index(3) - df.index.name = 'foo' - df.columns.name = 'bar' + df.index.name = "foo" + df.columns.name = "bar" result = df.to_xarray() - assert result.dims['foo'] == 3 + assert result.dims["foo"] == 3 assert len(result.coords) == 1 assert len(result.data_vars) == 8 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, Dataset) # idempotency @@ -210,58 +230,60 @@ def test_to_xarray_index_types(self, index): # datetimes w/tz are not preserved # column names are lost expected = df.copy() - expected['f'] = expected['f'].astype(object) - expected['h'] = expected['h'].astype('datetime64[ns]') + expected["f"] = expected["f"].astype(object) + expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None - assert_frame_equal(result.to_dataframe(), expected, - check_index_type=False, check_categorical=False) + assert_frame_equal( + result.to_dataframe(), + expected, + check_index_type=False, + check_categorical=False, + ) - @td.skip_if_no('xarray', min_version='0.7.0') + @td.skip_if_no("xarray", min_version="0.7.0") def test_to_xarray(self): from xarray import Dataset - df = DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', - periods=3, - tz='US/Eastern')} - ) - - df.index.name = 'foo' + df = DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + + df.index.name = "foo" result = df[0:0].to_xarray() - assert result.dims['foo'] == 0 + assert result.dims["foo"] == 0 assert isinstance(result, Dataset) # available in 0.7.1 # MultiIndex - df.index = pd.MultiIndex.from_product([['a'], range(3)], - names=['one', 'two']) + df.index = pd.MultiIndex.from_product([["a"], range(3)], names=["one", "two"]) result = df.to_xarray() - 
assert result.dims['one'] == 1 - assert result.dims['two'] == 3 + assert result.dims["one"] == 1 + assert result.dims["two"] == 3 assert len(result.coords) == 2 assert len(result.data_vars) == 8 - assert_almost_equal(list(result.coords.keys()), ['one', 'two']) + assert_almost_equal(list(result.coords.keys()), ["one", "two"]) assert isinstance(result, Dataset) result = result.to_dataframe() expected = df.copy() - expected['f'] = expected['f'].astype(object) - expected['h'] = expected['h'].astype('datetime64[ns]') + expected["f"] = expected["f"].astype(object) + expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None - assert_frame_equal(result, - expected, - check_index_type=False) + assert_frame_equal(result, expected, check_index_type=False) def test_deepcopy_empty(self): # This test covers empty frame copying with non-empty column sets # as reported in issue GH15370 - empty_frame = DataFrame(data=[], index=[], columns=['A']) + empty_frame = DataFrame(data=[], index=[], columns=["A"]) empty_frame_copy = deepcopy(empty_frame) self._compare(empty_frame_copy, empty_frame) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index e8343a1cf318b..aef6c3fe8070c 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -15,7 +15,6 @@ class Generic: - @property def _ndim(self): return self._typ._AXIS_LEN @@ -33,7 +32,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): shape = tuple([shape] * self._ndim) if value is not None: if is_scalar(value): - if value == 'empty': + if value == "empty": arr = None # remove the info axis @@ -59,7 +58,7 @@ def _compare(self, result, expected): def test_rename(self): # single axis - idx = list('ABCD') + idx = list("ABCD") # relabeling values passed into self.rename args = [ str.lower, @@ -75,7 +74,7 @@ def test_rename(self): # rename a single axis result = obj.rename(**{axis: arg}) expected = obj.copy() - setattr(expected, axis, list('abcd')) + setattr(expected, axis, list("abcd")) self._compare(result, expected) # multiple axes at once @@ -83,8 +82,7 @@ def test_rename(self): def test_get_numeric_data(self): n = 4 - kwargs = {self._typ._AXIS_NAMES[i]: list(range(n)) - for i in range(self._ndim)} + kwargs = {self._typ._AXIS_NAMES[i]: list(range(n)) for i in range(self._ndim)} # get the numeric data o = self._construct(n, **kwargs) @@ -93,7 +91,7 @@ def test_get_numeric_data(self): # non-inclusion result = o._get_bool_data() - expected = self._construct(n, value='empty', **kwargs) + expected = self._construct(n, value="empty", **kwargs) self._compare(result, expected) # get the bool data @@ -109,7 +107,7 @@ def test_get_default(self): # GH 7725 d0 = "a", "b", "c", "d" - d1 = np.arange(4, dtype='int64') + d1 = np.arange(4, dtype="int64") others = "e", 10 for data, index in ((d0, d1), (d1, d0)): @@ -127,8 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format( - self._typ.__name__) + msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -178,24 +175,24 @@ def test_downcast(self): o = self._construct(shape=4, value=9, dtype=np.int64) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, o) - o = self._construct(shape=4, 
value=9.) + o = self._construct(shape=4, value=9.0) expected = o.astype(np.int64) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, expected) o = self._construct(shape=4, value=9.5) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") self._compare(result, o) # are close o = self._construct(shape=4, value=9.000000000005) result = o.copy() - result._data = o._data.downcast(dtypes='infer') + result._data = o._data.downcast(dtypes="infer") expected = o.astype(np.int64) self._compare(result, expected) @@ -206,15 +203,16 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = ("compound dtypes are not implemented in the {} constructor" - .format(self._typ.__name__)) + msg = "compound dtypes are not implemented in the {} constructor".format( + self._typ.__name__ + ) with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) # these work (though results may be unexpected) - f('int64') - f('float64') - f('M8[ns]') + f("int64") + f("float64") + f("M8[ns]") def check_metadata(self, x, y=None): for m in x._metadata: @@ -228,26 +226,26 @@ def test_metadata_propagation(self): # check that the metadata matches up on the resulting ops o = self._construct(shape=3) - o.name = 'foo' + o.name = "foo" o2 = self._construct(shape=3) - o2.name = 'bar' + o2.name = "bar" # ---------- # preserving # ---------- # simple ops with scalars - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: result = getattr(o, op)(1) self.check_metadata(o, result) # ops with like - for op in ['__add__', '__sub__', '__truediv__', '__mul__']: + for op in ["__add__", "__sub__", "__truediv__", "__mul__"]: result = getattr(o, op)(o) self.check_metadata(o, result) # simple boolean - for op in ['__eq__', '__le__', '__ge__']: + for op in ["__eq__", "__le__", "__ge__"]: v1 = getattr(o, op)(o) self.check_metadata(o, v1) self.check_metadata(o, v1 & v1) @@ -266,7 +264,7 @@ def test_metadata_propagation(self): self.check_metadata(result) # simple boolean - for op in ['__eq__', '__le__', '__ge__']: + for op in ["__eq__", "__le__", "__ge__"]: # this is a name matching op v1 = getattr(o, op)(o) @@ -281,9 +279,14 @@ def test_head_tail(self): o = self._construct(shape=10) # check all index types - for index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeDateIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeFloatIndex, + tm.makeIntIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: axis = o._get_axis_name(0) setattr(o, axis, index(len(getattr(o, axis)))) @@ -318,19 +321,22 @@ def test_sample(self): for test in range(10): seed = np.random.randint(0, 100) self._compare( - o.sample(n=4, random_state=seed), o.sample(n=4, - random_state=seed)) + o.sample(n=4, random_state=seed), o.sample(n=4, random_state=seed) + ) self._compare( - o.sample(frac=0.7, random_state=seed), o.sample( - frac=0.7, random_state=seed)) + o.sample(frac=0.7, random_state=seed), + o.sample(frac=0.7, random_state=seed), + ) self._compare( o.sample(n=4, random_state=np.random.RandomState(test)), - o.sample(n=4, random_state=np.random.RandomState(test))) + o.sample(n=4, random_state=np.random.RandomState(test)), + ) self._compare( o.sample(frac=0.7, 
random_state=np.random.RandomState(test)), - o.sample(frac=0.7, random_state=np.random.RandomState(test))) + o.sample(frac=0.7, random_state=np.random.RandomState(test)), + ) os1, os2 = [], [] for _ in range(2): @@ -342,7 +348,7 @@ def test_sample(self): # Check for error when random_state argument invalid. with pytest.raises(ValueError): - o.sample(random_state='astring!') + o.sample(random_state="astring!") ### # Check behavior of `frac` and `N` @@ -412,14 +418,12 @@ def test_sample(self): # Check np.nan are replaced by zeros. weights_with_nan = [np.nan] * 10 weights_with_nan[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) + self._compare(o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) # Check None are also replaced by zeros. weights_with_None = [None] * 10 weights_with_None[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) + self._compare(o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) def test_size_compat(self): # GH8846 @@ -436,28 +440,28 @@ def test_split_compat(self): assert len(np.array_split(o, 2)) == 2 def test_unexpected_keyword(self): # GH8597 - df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe']) + df = DataFrame(np.random.randn(5, 2), columns=["jim", "joe"]) ca = pd.Categorical([0, 0, 2, 2, 3, np.nan]) - ts = df['joe'].copy() + ts = df["joe"].copy() ts[2] = np.nan - with pytest.raises(TypeError, match='unexpected keyword'): - df.drop('joe', axis=1, in_place=True) + with pytest.raises(TypeError, match="unexpected keyword"): + df.drop("joe", axis=1, in_place=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): df.reindex([1, 0], inplace=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): ca.fillna(0, inplace=True) - with pytest.raises(TypeError, match='unexpected keyword'): + with pytest.raises(TypeError, match="unexpected keyword"): ts.fillna(0, in_place=True) # See gh-12301 def test_stat_unexpected_keyword(self): obj = self._construct(5) - starwars = 'Star Wars' - errmsg = 'unexpected keyword' + starwars = "Star Wars" + errmsg = "unexpected keyword" with pytest.raises(TypeError, match=errmsg): obj.max(epic=starwars) # stat_function @@ -474,7 +478,7 @@ def test_api_compat(self): # compat for __name__, __qualname__ obj = self._construct(5) - for func in ['sum', 'cumsum', 'any', 'var']: + for func in ["sum", "cumsum", "any", "var"]: f = getattr(obj, func) assert f.__name__ == func assert f.__qualname__.endswith(func) @@ -498,29 +502,30 @@ def test_truncate_out_of_bounds(self): # small shape = [int(2e3)] + ([1] * (self._ndim - 1)) - small = self._construct(shape, dtype='int8', value=1) + small = self._construct(shape, dtype="int8", value=1) self._compare(small.truncate(), small) self._compare(small.truncate(before=0, after=3e3), small) self._compare(small.truncate(before=-1, after=2e3), small) # big shape = [int(2e6)] + ([1] * (self._ndim - 1)) - big = self._construct(shape, dtype='int8', value=1) + big = self._construct(shape, dtype="int8", value=1) self._compare(big.truncate(), big) self._compare(big.truncate(before=0, after=3e6), big) self._compare(big.truncate(before=-1, after=2e6), big) def test_validate_bool_args(self): - df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) invalid_values = [1, "True", [1, 2, 3], 5.0] for value in invalid_values: with 
pytest.raises(ValueError): - super(DataFrame, df).rename_axis(mapper={'a': 'x', 'b': 'y'}, - axis=1, inplace=value) + super(DataFrame, df).rename_axis( + mapper={"a": "x", "b": "y"}, axis=1, inplace=value + ) with pytest.raises(ValueError): - super(DataFrame, df).drop('a', axis=1, inplace=value) + super(DataFrame, df).drop("a", axis=1, inplace=value) with pytest.raises(ValueError): super(DataFrame, df).sort_index(inplace=value) @@ -532,8 +537,7 @@ def test_validate_bool_args(self): super(DataFrame, df).fillna(value=0, inplace=value) with pytest.raises(ValueError): - super(DataFrame, df).replace(to_replace=1, value=7, - inplace=value) + super(DataFrame, df).replace(to_replace=1, value=7, inplace=value) with pytest.raises(ValueError): super(DataFrame, df).interpolate(inplace=value) @@ -548,28 +552,33 @@ def test_copy_and_deepcopy(self): # GH 15444 for shape in [0, 1, 2]: obj = self._construct(shape) - for func in [copy, - deepcopy, - lambda x: x.copy(deep=False), - lambda x: x.copy(deep=True)]: + for func in [ + copy, + deepcopy, + lambda x: x.copy(deep=False), + lambda x: x.copy(deep=True), + ]: obj_copy = func(obj) assert obj_copy is not obj self._compare(obj_copy, obj) - @pytest.mark.parametrize("periods,fill_method,limit,exp", [ - (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), - (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), - (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), - (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), - (-1, "ffill", None, [np.nan, np.nan, -.5, -.5, -.6, 0, 0, np.nan]), - (-1, "ffill", 1, [np.nan, np.nan, -.5, -.5, -.6, 0, np.nan, np.nan]), - (-1, "bfill", None, [0, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]), - (-1, "bfill", 1, [np.nan, 0, -.5, -.5, -.6, np.nan, np.nan, np.nan]) - ]) + @pytest.mark.parametrize( + "periods,fill_method,limit,exp", + [ + (1, "ffill", None, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, 0]), + (1, "ffill", 1, [np.nan, np.nan, np.nan, 1, 1, 1.5, 0, np.nan]), + (1, "bfill", None, [np.nan, 0, 0, 1, 1, 1.5, np.nan, np.nan]), + (1, "bfill", 1, [np.nan, np.nan, 0, 1, 1, 1.5, np.nan, np.nan]), + (-1, "ffill", None, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, 0, np.nan]), + (-1, "ffill", 1, [np.nan, np.nan, -0.5, -0.5, -0.6, 0, np.nan, np.nan]), + (-1, "bfill", None, [0, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + (-1, "bfill", 1, [np.nan, 0, -0.5, -0.5, -0.6, np.nan, np.nan, np.nan]), + ], + ) def test_pct_change(self, periods, fill_method, limit, exp): vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] obj = self._typ(vals) - func = getattr(obj, 'pct_change') + func = getattr(obj, "pct_change") res = func(periods=periods, fill_method=fill_method, limit=limit) if type(obj) is DataFrame: tm.assert_frame_equal(res, DataFrame(exp)) @@ -588,60 +597,62 @@ def test_sample(sel): easy_weight_list = [0] * 10 easy_weight_list[5] = 1 - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10, - 'easyweights': easy_weight_list}) - sample1 = df.sample(n=1, weights='easyweights') + df = pd.DataFrame( + { + "col1": range(10, 20), + "col2": range(20, 30), + "colString": ["a"] * 10, + "easyweights": easy_weight_list, + } + ) + sample1 = df.sample(n=1, weights="easyweights") assert_frame_equal(sample1, df.iloc[5:6]) # Ensure proper error if string given as weight for Series or # DataFrame with axis = 1. 
s = Series(range(10)) with pytest.raises(ValueError): - s.sample(n=3, weights='weight_column') + s.sample(n=3, weights="weight_column") with pytest.raises(ValueError): - df.sample(n=1, weights='weight_column', axis=1) + df.sample(n=1, weights="weight_column", axis=1) # Check weighting key error with pytest.raises(KeyError): - df.sample(n=3, weights='not_a_real_column_name') + df.sample(n=3, weights="not_a_real_column_name") # Check that re-normalizes weights that don't sum to one. weights_less_than_1 = [0] * 10 weights_less_than_1[0] = 0.5 - tm.assert_frame_equal( - df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) + tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) ### # Test axis argument ### # Test axis argument - df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10}) + df = pd.DataFrame({"col1": range(10), "col2": ["a"] * 10}) second_column_weight = [0, 1] assert_frame_equal( - df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) + df.sample(n=1, axis=1, weights=second_column_weight), df[["col2"]] + ) # Different axis arg types - assert_frame_equal(df.sample(n=1, axis='columns', - weights=second_column_weight), - df[['col2']]) + assert_frame_equal( + df.sample(n=1, axis="columns", weights=second_column_weight), df[["col2"]] + ) weight = [0] * 10 weight[5] = 0.5 - assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), - df.iloc[5:6]) - assert_frame_equal(df.sample(n=1, axis='index', weights=weight), - df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis="rows", weights=weight), df.iloc[5:6]) + assert_frame_equal(df.sample(n=1, axis="index", weights=weight), df.iloc[5:6]) # Check out of range axis values with pytest.raises(ValueError): df.sample(n=1, axis=2) with pytest.raises(ValueError): - df.sample(n=1, axis='not_a_name') + df.sample(n=1, axis="not_a_name") with pytest.raises(ValueError): s = pd.Series(range(10)) @@ -655,21 +666,19 @@ def test_sample(sel): easy_weight_list = [0] * 3 easy_weight_list[2] = 1 - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10}) + df = pd.DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) - assert_frame_equal(sample1, df[['colString']]) + assert_frame_equal(sample1, df[["colString"]]) # Test default axes assert_frame_equal( - df.sample(n=3, random_state=42), df.sample(n=3, axis=0, - random_state=42)) + df.sample(n=3, random_state=42), df.sample(n=3, axis=0, random_state=42) + ) # Test that function aligns weights with frame - df = DataFrame( - {'col1': [5, 6, 7], - 'col2': ['a', 'b', 'c'], }, index=[9, 5, 3]) + df = DataFrame({"col1": [5, 6, 7], "col2": ["a", "b", "c"]}, index=[9, 5, 3]) s = Series([1, 0, 0], index=[3, 5, 9]) assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) @@ -689,39 +698,38 @@ def test_sample(sel): def test_squeeze(self): # noop - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: tm.assert_series_equal(s.squeeze(), s) for df in [tm.makeTimeDataFrame()]: tm.assert_frame_equal(df.squeeze(), df) # squeezing - df = tm.makeTimeDataFrame().reindex(columns=['A']) - tm.assert_series_equal(df.squeeze(), df['A']) + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(df.squeeze(), df["A"]) # don't fail with 0 length dimensions GH11229 & GH8999 - empty_series = Series([], name='five') + empty_series = 
Series([], name="five") empty_frame = DataFrame([empty_series]) - [tm.assert_series_equal(empty_series, higher_dim.squeeze()) - for higher_dim in [empty_series, empty_frame]] + [ + tm.assert_series_equal(empty_series, higher_dim.squeeze()) + for higher_dim in [empty_series, empty_frame] + ] # axis argument df = tm.makeTimeDataFrame(nper=1).iloc[:, :1] assert df.shape == (1, 1) tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0]) - tm.assert_series_equal(df.squeeze(axis='index'), df.iloc[0]) + tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0]) tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0]) - tm.assert_series_equal(df.squeeze(axis='columns'), df.iloc[:, 0]) + tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0]) assert df.squeeze() == df.iloc[0, 0] - msg = ("No axis named 2 for object type ") + msg = "No axis named 2 for object type " with pytest.raises(ValueError, match=msg): df.squeeze(axis=2) - msg = ("No axis named x for object type ") + msg = "No axis named x for object type " with pytest.raises(ValueError, match=msg): - df.squeeze(axis='x') + df.squeeze(axis="x") df = tm.makeTimeDataFrame(3) tm.assert_frame_equal(df.squeeze(axis=0), df) @@ -730,12 +738,11 @@ def test_numpy_squeeze(self): s = tm.makeFloatSeries() tm.assert_series_equal(np.squeeze(s), s) - df = tm.makeTimeDataFrame().reindex(columns=['A']) - tm.assert_series_equal(np.squeeze(df), df['A']) + df = tm.makeTimeDataFrame().reindex(columns=["A"]) + tm.assert_series_equal(np.squeeze(df), df["A"]) def test_transpose(self): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: # calls implementation in pandas/core/base.py tm.assert_series_equal(s.transpose(), s) for df in [tm.makeTimeDataFrame()]: @@ -758,17 +765,19 @@ def test_numpy_transpose(self): def test_take(self): indices = [1, 5, -2, 6, 3, -1] - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries()]: + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), tm.makeObjectSeries()]: out = s.take(indices) - expected = Series(data=s.values.take(indices), - index=s.index.take(indices), dtype=s.dtype) + expected = Series( + data=s.values.take(indices), index=s.index.take(indices), dtype=s.dtype + ) tm.assert_series_equal(out, expected) for df in [tm.makeTimeDataFrame()]: out = df.take(indices) - expected = DataFrame(data=df.values.take(indices, axis=0), - index=df.index.take(indices), - columns=df.columns) + expected = DataFrame( + data=df.values.take(indices, axis=0), + index=df.index.take(indices), + columns=df.columns, + ) tm.assert_frame_equal(out, expected) def test_take_invalid_kwargs(self): @@ -787,7 +796,7 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - obj.take(indices, mode='clip') + obj.take(indices, mode="clip") def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) @@ -805,34 +814,32 @@ def test_equals(self): s2[0] = 9.9 assert not s1.equals(s2) - idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) s1 = Series([1, 2, np.nan], index=idx) s2 = s1.copy() assert s1.equals(s2) # Add object dtype column with nans index = np.random.random(10) - df1 = DataFrame( - np.random.random(10, ), index=index, columns=['floats']) - df1['text'] = 'the sky is so blue. 
we could use more chocolate.'.split( - ) - df1['start'] = date_range('2000-1-1', periods=10, freq='T') - df1['end'] = date_range('2000-1-1', periods=10, freq='D') - df1['diff'] = df1['end'] - df1['start'] - df1['bool'] = (np.arange(10) % 3 == 0) + df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) + df1["text"] = "the sky is so blue. we could use more chocolate.".split() + df1["start"] = date_range("2000-1-1", periods=10, freq="T") + df1["end"] = date_range("2000-1-1", periods=10, freq="D") + df1["diff"] = df1["end"] - df1["start"] + df1["bool"] = np.arange(10) % 3 == 0 df1.loc[::2] = np.nan df2 = df1.copy() - assert df1['text'].equals(df2['text']) - assert df1['start'].equals(df2['start']) - assert df1['end'].equals(df2['end']) - assert df1['diff'].equals(df2['diff']) - assert df1['bool'].equals(df2['bool']) + assert df1["text"].equals(df2["text"]) + assert df1["start"].equals(df2["start"]) + assert df1["end"].equals(df2["end"]) + assert df1["diff"].equals(df2["diff"]) + assert df1["bool"].equals(df2["bool"]) assert df1.equals(df2) assert not df1.equals(object) # different dtype different = df1.copy() - different['floats'] = different['floats'].astype('float32') + different["floats"] = different["floats"].astype("float32") assert not df1.equals(different) # different index @@ -846,22 +853,22 @@ def test_equals(self): assert not df1.equals(different) # DatetimeIndex - index = pd.date_range('2000-1-1', periods=10, freq='T') + index = pd.date_range("2000-1-1", periods=10, freq="T") df1 = df1.set_index(index) df2 = df1.copy() assert df1.equals(df2) # MultiIndex - df3 = df1.set_index(['text'], append=True) - df2 = df1.set_index(['text'], append=True) + df3 = df1.set_index(["text"], append=True) + df2 = df1.set_index(["text"], append=True) assert df3.equals(df2) - df2 = df1.set_index(['floats'], append=True) + df2 = df1.set_index(["floats"], append=True) assert not df3.equals(df2) # NaN in index - df3 = df1.set_index(['floats'], append=True) - df2 = df1.set_index(['floats'], append=True) + df3 = df1.set_index(["floats"], append=True) + df2 = df1.set_index(["floats"], append=True) assert df3.equals(df2) # GH 8437 @@ -880,44 +887,45 @@ def test_equals(self): assert e.equals(f) def test_pipe(self): - df = DataFrame({'A': [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: x ** y result = df.pipe(f, 2) - expected = DataFrame({'A': [1, 4, 9]}) + expected = DataFrame({"A": [1, 4, 9]}) assert_frame_equal(result, expected) result = df.A.pipe(f, 2) assert_series_equal(result, expected.A) def test_pipe_tuple(self): - df = DataFrame({'A': [1, 2, 3]}) + df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: y - result = df.pipe((f, 'y'), 0) + result = df.pipe((f, "y"), 0) assert_frame_equal(result, df) - result = df.A.pipe((f, 'y'), 0) + result = df.A.pipe((f, "y"), 0) assert_series_equal(result, df.A) def test_pipe_tuple_error(self): df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: y with pytest.raises(ValueError): - df.pipe((f, 'y'), x=1, y=0) + df.pipe((f, "y"), x=1, y=0) with pytest.raises(ValueError): - df.A.pipe((f, 'y'), x=1, y=0) + df.A.pipe((f, "y"), x=1, y=0) - @pytest.mark.parametrize('box', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) def test_axis_classmethods(self, box): obj = box() - values = (list(box._AXIS_NAMES.keys()) + - list(box._AXIS_NUMBERS.keys()) + - list(box._AXIS_ALIASES.keys())) + values = ( + list(box._AXIS_NAMES.keys()) + + list(box._AXIS_NUMBERS.keys()) + + list(box._AXIS_ALIASES.keys()) + ) for v in 
values: assert obj._get_axis_number(v) == box._get_axis_number(v) assert obj._get_axis_name(v) == box._get_axis_name(v) - assert obj._get_block_manager_axis(v) == \ - box._get_block_manager_axis(v) + assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v) def test_deprecated_to_dense(self): # GH 26557: DEPR diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 1341837c46669..d3566f16ab49f 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -10,12 +10,10 @@ @pytest.fixture def df(): """DataFrame with columns 'L1', 'L2', and 'L3' """ - return pd.DataFrame({'L1': [1, 2, 3], - 'L2': [11, 12, 13], - 'L3': ['A', 'B', 'C']}) + return pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13], "L3": ["A", "B", "C"]}) -@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +@pytest.fixture(params=[[], ["L1"], ["L1", "L2"], ["L1", "L2", "L3"]]) def df_levels(request, df): """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ levels = request.param @@ -29,9 +27,9 @@ def df_levels(request, df): @pytest.fixture def df_ambig(df): """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ - df = df.set_index(['L1', 'L2']) + df = df.set_index(["L1", "L2"]) - df['L1'] = df['L3'] + df["L1"] = df["L3"] return df @@ -39,8 +37,8 @@ def df_ambig(df): @pytest.fixture def df_duplabels(df): """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ - df = df.set_index(['L1']) - df = pd.concat([df, df['L2']], axis=1) + df = df.set_index(["L1"]) + df = pd.concat([df, df["L2"]], axis=1) return df @@ -49,8 +47,7 @@ def df_duplabels(df): # ============================= def get_labels_levels(df_levels): expected_labels = list(df_levels.columns) - expected_levels = [name for name in df_levels.index.names - if name is not None] + expected_levels = [name for name in df_levels.index.names if name is not None] return expected_labels, expected_levels @@ -76,7 +73,7 @@ def test_is_level_or_label_reference_df_simple(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -87,19 +84,19 @@ def test_is_level_or_label_reference_df_simple(df_levels, axis): def test_is_level_reference_df_ambig(df_ambig, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_ambig = df_ambig.T # df has both an on-axis level and off-axis label named L1 # Therefore L1 should reference the label, not the level - assert_label_reference(df_ambig, ['L1'], axis=axis) + assert_label_reference(df_ambig, ["L1"], axis=axis) # df has an on-axis level named L2 and it is not ambiguous # Therefore L2 is an level reference - assert_level_reference(df_ambig, ['L2'], axis=axis) + assert_level_reference(df_ambig, ["L2"], axis=axis) # df has a column named L3 and it not an level reference - assert_label_reference(df_ambig, ['L3'], axis=axis) + assert_label_reference(df_ambig, ["L3"], axis=axis) # Series @@ -107,23 +104,23 @@ def test_is_level_reference_df_ambig(df_ambig, axis): def test_is_level_reference_series_simple_axis0(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_level_reference(s, ['L1'], axis=0) - assert not s._is_level_reference('L2') + s = df.set_index("L1").L2 + assert_level_reference(s, ["L1"], axis=0) + assert not s._is_level_reference("L2") # 
Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_level_reference(s, ['L1', 'L2'], axis=0) - assert not s._is_level_reference('L3') + s = df.set_index(["L1", "L2"]).L3 + assert_level_reference(s, ["L1", "L2"], axis=0) + assert not s._is_level_reference("L3") def test_is_level_reference_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._is_level_reference('L1', axis=1) + s._is_level_reference("L1", axis=1) # Test _check_label_or_level_ambiguity_df @@ -175,17 +172,17 @@ def test_check_label_or_level_ambiguity_series(df): def test_check_label_or_level_ambiguity_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._check_label_or_level_ambiguity('L1', axis=1) + s._check_label_or_level_ambiguity("L1", axis=1) # Test _get_label_or_level_values # =============================== def assert_label_values(frame, labels, axis): for label in labels: - if axis in {0, 'index'}: + if axis in {0, "index"}: expected = frame[label]._values else: expected = frame.loc[label]._values @@ -213,7 +210,7 @@ def test_get_label_or_level_values_df_simple(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -224,36 +221,36 @@ def test_get_label_or_level_values_df_simple(df_levels, axis): def test_get_label_or_level_values_df_ambig(df_ambig, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_ambig = df_ambig.T # df has an on-axis level named L2, and it is not ambiguous. - assert_level_values(df_ambig, ['L2'], axis=axis) + assert_level_values(df_ambig, ["L2"], axis=axis) # df has an off-axis label named L3, and it is not ambiguous. 
- assert_label_values(df_ambig, ['L3'], axis=axis) + assert_label_values(df_ambig, ["L3"], axis=axis) def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_duplabels = df_duplabels.T # df has unambiguous level 'L1' - assert_level_values(df_duplabels, ['L1'], axis=axis) + assert_level_values(df_duplabels, ["L1"], axis=axis) # df has unique label 'L3' - assert_label_values(df_duplabels, ['L3'], axis=axis) + assert_label_values(df_duplabels, ["L3"], axis=axis) # df has duplicate labels 'L2' - if axis in {0, 'index'}: + if axis in {0, "index"}: expected_msg = "The column label 'L2' is not unique" else: expected_msg = "The index label 'L2' is not unique" with pytest.raises(ValueError, match=expected_msg): - assert_label_values(df_duplabels, ['L2'], axis=axis) + assert_label_values(df_duplabels, ["L2"], axis=axis) # Series @@ -261,21 +258,21 @@ def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): def test_get_label_or_level_values_series_axis0(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_level_values(s, ['L1'], axis=0) + s = df.set_index("L1").L2 + assert_level_values(s, ["L1"], axis=0) # Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_level_values(s, ['L1', 'L2'], axis=0) + s = df.set_index(["L1", "L2"]).L3 + assert_level_values(s, ["L1", "L2"], axis=0) def test_get_label_or_level_values_series_axis1_error(df): # Make series with L1 as index - s = df.set_index('L1').L2 + s = df.set_index("L1").L2 with pytest.raises(ValueError, match="No axis named 1"): - s._get_label_or_level_values('L1', axis=1) + s._get_label_or_level_values("L1", axis=1) # Test _drop_labels_or_levels @@ -284,7 +281,7 @@ def assert_labels_dropped(frame, labels, axis): for label in labels: df_dropped = frame._drop_labels_or_levels(label, axis=axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: assert label in frame.columns assert label not in df_dropped.columns else: @@ -296,7 +293,7 @@ def assert_levels_dropped(frame, levels, axis): for level in levels: df_dropped = frame._drop_labels_or_levels(level, axis=axis) - if axis in {0, 'index'}: + if axis in {0, "index"}: assert level in frame.index.names assert level not in df_dropped.index.names else: @@ -312,7 +309,7 @@ def test_drop_labels_or_levels_df(df_levels, axis): expected_labels, expected_levels = get_labels_levels(df_levels) # Transpose frame if axis == 1 - if axis in {1, 'columns'}: + if axis in {1, "columns"}: df_levels = df_levels.T # Perform checks @@ -320,7 +317,7 @@ def test_drop_labels_or_levels_df(df_levels, axis): assert_levels_dropped(df_levels, expected_levels, axis=axis) with pytest.raises(ValueError, match="not valid labels or levels"): - df_levels._drop_labels_or_levels('L4', axis=axis) + df_levels._drop_labels_or_levels("L4", axis=axis) # Series @@ -328,15 +325,15 @@ def test_drop_labels_or_levels_df(df_levels, axis): def test_drop_labels_or_levels_series(df): # Make series with L1 as index - s = df.set_index('L1').L2 - assert_levels_dropped(s, ['L1'], axis=0) + s = df.set_index("L1").L2 + assert_levels_dropped(s, ["L1"], axis=0) with pytest.raises(ValueError, match="not valid labels or levels"): - s._drop_labels_or_levels('L4', axis=0) + s._drop_labels_or_levels("L4", axis=0) # Make series with L1 and L2 as index - s = df.set_index(['L1', 'L2']).L3 - assert_levels_dropped(s, ['L1', 'L2'], axis=0) + s = df.set_index(["L1", "L2"]).L3 + assert_levels_dropped(s, ["L1", 
"L2"], axis=0) with pytest.raises(ValueError, match="not valid labels or levels"): - s._drop_labels_or_levels('L4', axis=0) + s._drop_labels_or_levels("L4", axis=0) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index e29622cb8ac17..5c3c35832356e 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -15,6 +15,7 @@ try: import xarray + _XARRAY_INSTALLED = True except ImportError: _XARRAY_INSTALLED = False @@ -26,43 +27,46 @@ class TestSeries(Generic): def setup_method(self): self.ts = tm.makeTimeSeries() # Was at top level in test_series - self.ts.name = 'ts' + self.ts.name = "ts" self.series = tm.makeStringSeries() - self.series.name = 'series' + self.series.name = "series" def test_rename_mi(self): - s = Series([11, 21, 31], - index=MultiIndex.from_tuples( - [("A", x) for x in ["a", "B", "c"]])) + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples([("A", x) for x in ["a", "B", "c"]]), + ) s.rename(str.lower) def test_set_axis_name(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - funcs = ['rename_axis', '_set_axis_name'] - name = 'foo' + s = Series([1, 2, 3], index=["a", "b", "c"]) + funcs = ["rename_axis", "_set_axis_name"] + name = "foo" for func in funcs: result = methodcaller(func, name)(s) assert s.index.name is None assert result.index.name == name def test_set_axis_name_mi(self): - s = Series([11, 21, 31], index=MultiIndex.from_tuples( - [("A", x) for x in ["a", "B", "c"]], - names=['l1', 'l2']) + s = Series( + [11, 21, 31], + index=MultiIndex.from_tuples( + [("A", x) for x in ["a", "B", "c"]], names=["l1", "l2"] + ), ) - funcs = ['rename_axis', '_set_axis_name'] + funcs = ["rename_axis", "_set_axis_name"] for func in funcs: - result = methodcaller(func, ['L1', 'L2'])(s) + result = methodcaller(func, ["L1", "L2"])(s) assert s.index.name is None - assert s.index.names == ['l1', 'l2'] + assert s.index.names == ["l1", "l2"] assert result.index.name is None - assert result.index.names, ['L1', 'L2'] + assert result.index.names, ["L1", "L2"] def test_set_axis_name_raises(self): s = pd.Series([1]) with pytest.raises(ValueError): - s._set_axis_name(name='a', axis=1) + s._set_axis_name(name="a", axis=1) def test_get_numeric_data_preserve_dtype(self): @@ -71,7 +75,7 @@ def test_get_numeric_data_preserve_dtype(self): result = o._get_numeric_data() self._compare(result, o) - o = Series([1, '2', 3.]) + o = Series([1, "2", 3.0]) result = o._get_numeric_data() expected = Series([], dtype=object, index=pd.Index([], dtype=object)) self._compare(result, expected) @@ -84,9 +88,9 @@ def test_get_numeric_data_preserve_dtype(self): result = o._get_bool_data() self._compare(result, o) - o = Series(date_range('20130101', periods=3)) + o = Series(date_range("20130101", periods=3)) result = o._get_numeric_data() - expected = Series([], dtype='M8[ns]', index=pd.Index([], dtype=object)) + expected = Series([], dtype="M8[ns]", index=pd.Index([], dtype=object)) self._compare(result, expected) def test_nonzero_single_element(self): @@ -100,8 +104,7 @@ def test_nonzero_single_element(self): msg = "The truth value of a Series is ambiguous" # single item nan to raise - for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), - Series([False])]: + for s in [Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False])]: with pytest.raises(ValueError, match=msg): bool(s) @@ -119,7 +122,7 @@ def test_nonzero_single_element(self): s.bool() # single non-bool are an error - for s in [Series([1]), Series([0]), 
Series(['a']), Series([0.0])]: + for s in [Series([1]), Series([0]), Series(["a"]), Series([0.0])]: msg = "The truth value of a Series is ambiguous" with pytest.raises(ValueError, match=msg): bool(s) @@ -131,38 +134,40 @@ def test_metadata_propagation_indiv(self): # check that the metadata matches up on the resulting ops o = Series(range(3), range(3)) - o.name = 'foo' + o.name = "foo" o2 = Series(range(3), range(3)) - o2.name = 'bar' + o2.name = "bar" result = o.T self.check_metadata(o, result) # resample - ts = Series(np.random.rand(1000), - index=date_range('20130101', periods=1000, freq='s'), - name='foo') - result = ts.resample('1T').mean() + ts = Series( + np.random.rand(1000), + index=date_range("20130101", periods=1000, freq="s"), + name="foo", + ) + result = ts.resample("1T").mean() self.check_metadata(ts, result) - result = ts.resample('1T').min() + result = ts.resample("1T").min() self.check_metadata(ts, result) - result = ts.resample('1T').apply(lambda x: x.sum()) + result = ts.resample("1T").apply(lambda x: x.sum()) self.check_metadata(ts, result) _metadata = Series._metadata _finalize = Series.__finalize__ - Series._metadata = ['name', 'filename'] - o.filename = 'foo' - o2.filename = 'bar' + Series._metadata = ["name", "filename"] + o.filename = "foo" + o2.filename = "bar" def finalize(self, other, method=None, **kwargs): for name in self._metadata: - if method == 'concat' and name == 'filename': - value = '+'.join([getattr( - o, name) for o in other.objs if getattr(o, name, None) - ]) + if method == "concat" and name == "filename": + value = "+".join( + [getattr(o, name) for o in other.objs if getattr(o, name, None)] + ) object.__setattr__(self, name, value) else: object.__setattr__(self, name, getattr(other, name, None)) @@ -172,60 +177,70 @@ def finalize(self, other, method=None, **kwargs): Series.__finalize__ = finalize result = pd.concat([o, o2]) - assert result.filename == 'foo+bar' + assert result.filename == "foo+bar" assert result.name is None # reset Series._metadata = _metadata Series.__finalize__ = _finalize - @pytest.mark.skipif(not _XARRAY_INSTALLED or _XARRAY_INSTALLED and - LooseVersion(xarray.__version__) < - LooseVersion('0.10.0'), - reason='xarray >= 0.10.0 required') + @pytest.mark.skipif( + not _XARRAY_INSTALLED + or _XARRAY_INSTALLED + and LooseVersion(xarray.__version__) < LooseVersion("0.10.0"), + reason="xarray >= 0.10.0 required", + ) @pytest.mark.parametrize( "index", - ['FloatIndex', 'IntIndex', - 'StringIndex', 'UnicodeIndex', - 'DateIndex', 'PeriodIndex', - 'TimedeltaIndex', 'CategoricalIndex']) + [ + "FloatIndex", + "IntIndex", + "StringIndex", + "UnicodeIndex", + "DateIndex", + "PeriodIndex", + "TimedeltaIndex", + "CategoricalIndex", + ], + ) def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, 'make{}'.format(index)) + index = getattr(tm, "make{}".format(index)) s = Series(range(6), index=index(6)) - s.index.name = 'foo' + s.index.name = "foo" result = s.to_xarray() repr(result) assert len(result) == 6 assert len(result.coords) == 1 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, DataArray) # idempotency - assert_series_equal(result.to_series(), s, - check_index_type=False, - check_categorical=True) + assert_series_equal( + result.to_series(), s, check_index_type=False, check_categorical=True + ) - @td.skip_if_no('xarray', min_version='0.7.0') + @td.skip_if_no("xarray", min_version="0.7.0") def 
test_to_xarray(self): from xarray import DataArray s = Series([]) - s.index.name = 'foo' + s.index.name = "foo" result = s.to_xarray() assert len(result) == 0 assert len(result.coords) == 1 - assert_almost_equal(list(result.coords.keys()), ['foo']) + assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, DataArray) s = Series(range(6)) - s.index.name = 'foo' - s.index = pd.MultiIndex.from_product([['a', 'b'], range(3)], - names=['one', 'two']) + s.index.name = "foo" + s.index = pd.MultiIndex.from_product( + [["a", "b"], range(3)], names=["one", "two"] + ) result = s.to_xarray() assert len(result) == 2 - assert_almost_equal(list(result.coords.keys()), ['one', 'two']) + assert_almost_equal(list(result.coords.keys()), ["one", "two"]) assert isinstance(result, DataArray) assert_series_equal(result.to_series(), s) @@ -234,20 +249,20 @@ def test_valid_deprecated(self): with tm.assert_produces_warning(FutureWarning): pd.Series([]).valid() - @pytest.mark.parametrize("s", [ - Series([np.arange(5)]), - pd.date_range('1/1/2011', periods=24, freq='H'), - pd.Series(range(5), index=pd.date_range("2017", periods=5)) - ]) + @pytest.mark.parametrize( + "s", + [ + Series([np.arange(5)]), + pd.date_range("1/1/2011", periods=24, freq="H"), + pd.Series(range(5), index=pd.date_range("2017", periods=5)), + ], + ) @pytest.mark.parametrize("shift_size", [0, 1, 2]) def test_shift_always_copy(self, s, shift_size): # GH22397 assert s.shift(shift_size) is not s - @pytest.mark.parametrize("move_by_freq", [ - pd.Timedelta('1D'), - pd.Timedelta('1M'), - ]) + @pytest.mark.parametrize("move_by_freq", [pd.Timedelta("1D"), pd.Timedelta("1M")]) def test_datetime_shift_always_copy(self, move_by_freq): # GH22397 s = pd.Series(range(5), index=pd.date_range("2017", periods=5)) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ea59cde54f17b..52d4fa76bf879 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -23,7 +23,7 @@ def test_agg_regression1(tsframe): def test_agg_must_agg(df): - grouped = df.groupby('A')['C'] + grouped = df.groupby("A")["C"] msg = "Must produce aggregated value" with pytest.raises(Exception, match=msg): @@ -38,33 +38,57 @@ def test_agg_ser_multi_key(df): f = lambda x: x.sum() results = df.C.groupby([df.A, df.B]).aggregate(f) - expected = df.groupby(['A', 'B']).sum()['C'] + expected = df.groupby(["A", "B"]).sum()["C"] tm.assert_series_equal(results, expected) def test_groupby_aggregation_mixed_dtype(): # GH 6212 - expected = DataFrame({ - 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], - 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, - index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), - ('big', 'damp'), - ('blue', 'dry'), - ('red', 'red'), ('red', 'wet')], - names=['by1', 'by2'])) - - df = DataFrame({ - 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, - 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, - np.nan, np.nan] - }) - - g = df.groupby(['by1', 'by2']) - result = g[['v1', 'v2']].mean() + expected = DataFrame( + { + "v1": [5, 5, 7, np.nan, 3, 3, 4, 1], + "v2": [55, 55, 77, np.nan, 33, 33, 44, 11], + }, + index=MultiIndex.from_tuples( + [ + (1, 95), + (1, 99), + (2, 95), + (2, 99), + ("big", "damp"), + ("blue", "dry"), + ("red", "red"), + ("red", "wet"), + ], + names=["by1", "by2"], + ), + 
) + + df = DataFrame( + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) + + g = df.groupby(["by1", "by2"]) + result = g[["v1", "v2"]].mean() tm.assert_frame_equal(result, expected) @@ -74,21 +98,19 @@ def test_agg_apply_corner(ts, tsframe): assert ts.dtype == np.float64 # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, - index=pd.Index([], dtype=np.float64)) + exp = Series([], dtype=np.float64, index=pd.Index([], dtype=np.float64)) tm.assert_series_equal(grouped.sum(), exp) tm.assert_series_equal(grouped.agg(np.sum), exp) - tm.assert_series_equal(grouped.apply(np.sum), exp, - check_index_type=False) + tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) # DataFrame - grouped = tsframe.groupby(tsframe['A'] * np.nan) - exp_df = DataFrame(columns=tsframe.columns, dtype=float, - index=pd.Index([], dtype=np.float64)) + grouped = tsframe.groupby(tsframe["A"] * np.nan) + exp_df = DataFrame( + columns=tsframe.columns, dtype=float, index=pd.Index([], dtype=np.float64) + ) tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) def test_agg_grouping_is_list_tuple(ts): @@ -110,60 +132,66 @@ def test_agg_grouping_is_list_tuple(ts): def test_agg_python_multiindex(mframe): - grouped = mframe.groupby(['A', 'B']) + grouped = mframe.groupby(["A", "B"]) result = grouped.agg(np.mean) expected = grouped.mean() tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('groupbyfunc', [ - lambda x: x.weekday(), - [lambda x: x.month, lambda x: x.weekday()], -]) +@pytest.mark.parametrize( + "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]] +) def test_aggregate_str_func(tsframe, groupbyfunc): grouped = tsframe.groupby(groupbyfunc) # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() + result = grouped["A"].agg("std") + expected = grouped["A"].std() tm.assert_series_equal(result, expected) # group frame by function name - result = grouped.aggregate('var') + result = grouped.aggregate("var") expected = grouped.var() tm.assert_frame_equal(result, expected) # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], - ['B', 'std'], - ['C', 'mean'], - ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var()], - ['B', grouped['B'].std()], - ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) + result = grouped.agg( + OrderedDict([["A", "var"], ["B", "std"], ["C", "mean"], ["D", "sem"]]) + ) + expected = DataFrame( + OrderedDict( + [ + ["A", grouped["A"].var()], + ["B", grouped["B"].std()], + ["C", grouped["C"].mean()], + ["D", grouped["D"].sem()], + ] + ) + ) tm.assert_frame_equal(result, expected) def test_aggregate_item_by_item(df): - grouped = df.groupby('A') + grouped = df.groupby("A") aggfun = lambda ser: ser.size result = grouped.agg(aggfun) - foo = (df.A == 'foo').sum() - bar = (df.A == 'bar').sum() + foo = (df.A == "foo").sum() + bar = (df.A == "bar").sum() K = len(result.columns) # GH5782 # odd comparisons can result here, so cast 
to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) + exp = pd.Series( + np.array([foo] * K), index=list("BCD"), dtype=np.float64, name="foo" + ) + tm.assert_series_equal(result.xs("foo"), exp) - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) + exp = pd.Series( + np.array([bar] * K), index=list("BCD"), dtype=np.float64, name="bar" + ) + tm.assert_almost_equal(result.xs("bar"), exp) def aggfun(ser): return ser.size @@ -174,7 +202,7 @@ def aggfun(ser): def test_wrap_agg_out(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) def func(ser): if ser.dtype == np.object: @@ -183,53 +211,53 @@ def func(ser): return ser.sum() result = grouped.aggregate(func) - exp_grouped = three_group.loc[:, three_group.columns != 'C'] - expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + exp_grouped = three_group.loc[:, three_group.columns != "C"] + expected = exp_grouped.groupby(["A", "B"]).aggregate(func) tm.assert_frame_equal(result, expected) def test_agg_multiple_functions_maintain_order(df): # GH #610 - funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] - result = df.groupby('A')['C'].agg(funcs) - exp_cols = Index(['mean', 'max', 'min']) + funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)] + result = df.groupby("A")["C"].agg(funcs) + exp_cols = Index(["mean", "max", "min"]) tm.assert_index_equal(result.columns, exp_cols) def test_multiple_functions_tuples_and_non_tuples(df): # #1359 - funcs = [('foo', 'mean'), 'std'] - ex_funcs = [('foo', 'mean'), ('std', 'std')] + funcs = [("foo", "mean"), "std"] + ex_funcs = [("foo", "mean"), ("std", "std")] - result = df.groupby('A')['C'].agg(funcs) - expected = df.groupby('A')['C'].agg(ex_funcs) + result = df.groupby("A")["C"].agg(funcs) + expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) - result = df.groupby('A').agg(funcs) - expected = df.groupby('A').agg(ex_funcs) + result = df.groupby("A").agg(funcs) + expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) def test_more_flexible_frame_multi_function(df): - grouped = df.groupby('A') + grouped = df.groupby("A") - exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]])) - exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]])) + exmean = grouped.agg(OrderedDict([["C", np.mean], ["D", np.mean]])) + exstd = grouped.agg(OrderedDict([["C", np.std], ["D", np.std]])) - expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = concat([exmean, exstd], keys=["mean", "std"], axis=1) expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1) - d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]]) + d = OrderedDict([["C", [np.mean, np.std]], ["D", [np.mean, np.std]]]) result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # be careful - result = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) - expected = grouped.aggregate(OrderedDict([['C', np.mean], - ['D', [np.mean, np.std]]])) + result = grouped.aggregate(OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]])) + expected = grouped.aggregate( + OrderedDict([["C", np.mean], ["D", [np.mean, np.std]]]) + ) tm.assert_frame_equal(result, expected) def foo(x): @@ -240,12 +268,12 @@ def bar(x): # this uses column selection & renaming with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - d = OrderedDict([['C', np.mean], - ['D', OrderedDict([['foo', np.mean], - ['bar', np.std]])]]) + d = OrderedDict( + [["C", np.mean], ["D", OrderedDict([["foo", np.mean], ["bar", np.std]])]] + ) result = grouped.aggregate(d) - d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]]) + d = OrderedDict([["C", [np.mean]], ["D", [foo, bar]]]) expected = grouped.aggregate(d) tm.assert_frame_equal(result, expected) @@ -253,26 +281,29 @@ def bar(x): def test_multi_function_flexible_mix(df): # GH #1268 - grouped = df.groupby('A') + grouped = df.groupby("A") # Expected - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', {'sum': 'sum'}]]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", {"sum": "sum"}]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = grouped.aggregate(d) # Test 1 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', 'sum']]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", "sum"]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) tm.assert_frame_equal(result, expected) # Test 2 - d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])], - ['D', ['sum']]]) + d = OrderedDict( + [["C", OrderedDict([["foo", "mean"], ["bar", "std"]])], ["D", ["sum"]]] + ) # this uses column selection & renaming with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped.aggregate(d) @@ -281,65 +312,63 @@ def test_multi_function_flexible_mix(df): def test_groupby_agg_coercing_bools(): # issue 14873 - dat = pd.DataFrame( - {'a': [1, 1, 2, 2], 'b': [0, 1, 2, 3], 'c': [None, None, 1, 1]}) - gp = dat.groupby('a') + dat = pd.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]}) + gp = dat.groupby("a") - index = Index([1, 2], name='a') + index = Index([1, 2], name="a") - result = gp['b'].aggregate(lambda x: (x != 0).all()) - expected = Series([False, True], index=index, name='b') + result = gp["b"].aggregate(lambda x: (x != 0).all()) + expected = Series([False, True], index=index, name="b") tm.assert_series_equal(result, expected) - result = gp['c'].aggregate(lambda x: x.isnull().all()) - expected = Series([True, False], index=index, name='c') + result = gp["c"].aggregate(lambda x: x.isnull().all()) + expected = Series([True, False], index=index, name="c") tm.assert_series_equal(result, expected) def test_order_aggregate_multiple_funcs(): # GH 25692 - df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby('A').agg(['sum', 'max', 'mean', 'ohlc', 'min']) + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) result = res.columns.levels[1] - expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min']) + expected = pd.Index(["sum", "max", "mean", "ohlc", "min"]) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('dtype', [np.int64, np.uint64]) -@pytest.mark.parametrize('how', ['first', 'last', 'min', - 'max', 'mean', 'median']) +@pytest.mark.parametrize("dtype", [np.int64, np.uint64]) +@pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"]) def test_uint64_type_handling(dtype, how): # GH 26310 - df = pd.DataFrame({'x': 6903052872240755750, 'y': [1, 2]}) - 
expected = df.groupby('y').agg({'x': how}) + df = pd.DataFrame({"x": 6903052872240755750, "y": [1, 2]}) + expected = df.groupby("y").agg({"x": how}) df.x = df.x.astype(dtype) - result = df.groupby('y').agg({'x': how}) + result = df.groupby("y").agg({"x": how}) result.x = result.x.astype(np.int64) tm.assert_frame_equal(result, expected, check_exact=True) class TestNamedAggregationSeries: - def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) gr = df.groupby([0, 0, 1, 1]) - result = gr.agg(a='sum', b='min') - expected = pd.DataFrame({'a': [3, 7], 'b': [1, 3]}, - columns=['a', 'b'], index=[0, 1]) + result = gr.agg(a="sum", b="min") + expected = pd.DataFrame( + {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=[0, 1] + ) tm.assert_frame_equal(result, expected) - result = gr.agg(b='min', a='sum') + result = gr.agg(b="min", a="sum") # sort for 35 and earlier if compat.PY36: - expected = expected[['b', 'a']] + expected = expected[["b", "a"]] tm.assert_frame_equal(result, expected) def test_no_args_raises(self): gr = pd.Series([1, 2]).groupby([0, 1]) - with pytest.raises(TypeError, match='Must provide'): + with pytest.raises(TypeError, match="Must provide"): gr.agg() # but we do allow this @@ -352,61 +381,63 @@ def test_series_named_agg_duplicates_raises(self): # aggregate_multiple_funcs. It could maybe be lifted in the future. gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) with pytest.raises(SpecificationError): - gr.agg(a='sum', b='sum') + gr.agg(a="sum", b="sum") def test_mangled(self): gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) result = gr.agg(a=lambda x: 0, b=lambda x: 1) - expected = pd.DataFrame({'a': [0, 0], 'b': [1, 1]}) + expected = pd.DataFrame({"a": [0, 0], "b": [1, 1]}) tm.assert_frame_equal(result, expected) class TestNamedAggregationDataFrame: def test_agg_relabel(self): - df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) - result = df.groupby("group").agg( - a_max=("A", "max"), - b_max=("B", "max"), + df = pd.DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) + result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max")) + expected = pd.DataFrame( + {"a_max": [1, 3], "b_max": [6, 8]}, + index=pd.Index(["a", "b"], name="group"), + columns=["a_max", "b_max"], ) - expected = pd.DataFrame({"a_max": [1, 3], "b_max": [6, 8]}, - index=pd.Index(['a', 'b'], name='group'), - columns=['a_max', 'b_max']) tm.assert_frame_equal(result, expected) # order invariance p98 = functools.partial(np.percentile, q=98) - result = df.groupby('group').agg( + result = df.groupby("group").agg( b_min=("B", "min"), a_min=("A", min), a_mean=("A", np.mean), a_max=("A", "max"), b_max=("B", "max"), - a_98=("A", p98) + a_98=("A", p98), + ) + expected = pd.DataFrame( + { + "b_min": [5, 7], + "a_min": [0, 2], + "a_mean": [0.5, 2.5], + "a_max": [1, 3], + "b_max": [6, 8], + "a_98": [0.98, 2.98], + }, + index=pd.Index(["a", "b"], name="group"), + columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"], ) - expected = pd.DataFrame({"b_min": [5, 7], - "a_min": [0, 2], - "a_mean": [0.5, 2.5], - "a_max": [1, 3], - "b_max": [6, 8], - "a_98": [0.98, 2.98]}, - index=pd.Index(['a', 'b'], name='group'), - columns=['b_min', 'a_min', 'a_mean', - 'a_max', 'b_max', 'a_98']) if not compat.PY36: - expected = expected[['a_98', 'a_max', 'a_mean', - 'a_min', 'b_max', 'b_min']] + expected = expected[["a_98", "a_max", "a_mean", "a_min", "b_max", "b_min"]] tm.assert_frame_equal(result, expected) def 
test_agg_relabel_non_identifier(self): - df = pd.DataFrame({"group": ['a', 'a', 'b', 'b'], - "A": [0, 1, 2, 3], - "B": [5, 6, 7, 8]}) + df = pd.DataFrame( + {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} + ) - result = df.groupby("group").agg(**{'my col': ('A', 'max')}) - expected = pd.DataFrame({'my col': [1, 3]}, - index=pd.Index(['a', 'b'], name='group')) + result = df.groupby("group").agg(**{"my col": ("A", "max")}) + expected = pd.DataFrame( + {"my col": [1, 3]}, index=pd.Index(["a", "b"], name="group") + ) tm.assert_frame_equal(result, expected) def test_duplicate_raises(self): @@ -417,22 +448,22 @@ def test_duplicate_raises(self): df.groupby("A").agg(a=("A", "min"), b=("A", "min")) def test_agg_relabel_with_level(self): - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_product([['A', 'B'], - ['a', 'b']])) - result = df.groupby(level=0).agg(aa=('A', 'max'), bb=('A', 'min'), - cc=('B', 'mean')) - expected = pd.DataFrame({ - 'aa': [0, 1], - 'bb': [0, 1], - 'cc': [1.5, 3.5] - }, index=['A', 'B']) + df = pd.DataFrame( + {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product([["A", "B"], ["a", "b"]]), + ) + result = df.groupby(level=0).agg( + aa=("A", "max"), bb=("A", "min"), cc=("B", "mean") + ) + expected = pd.DataFrame( + {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"] + ) tm.assert_frame_equal(result, expected) def test_agg_relabel_other_raises(self): df = pd.DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}) grouped = df.groupby("A") - match = 'Must provide' + match = "Must provide" with pytest.raises(TypeError, match=match): grouped.agg(foo=1) @@ -440,74 +471,65 @@ def test_agg_relabel_other_raises(self): grouped.agg() with pytest.raises(TypeError, match=match): - grouped.agg(a=('B', 'max'), b=(1, 2, 3)) + grouped.agg(a=("B", "max"), b=(1, 2, 3)) def test_missing_raises(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) with pytest.raises(KeyError, match="Column 'C' does not exist"): - df.groupby("A").agg(c=('C', 'sum')) + df.groupby("A").agg(c=("C", "sum")) def test_agg_namedtuple(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) result = df.groupby("A").agg( - b=pd.NamedAgg("B", "sum"), - c=pd.NamedAgg(column="B", aggfunc="count") + b=pd.NamedAgg("B", "sum"), c=pd.NamedAgg(column="B", aggfunc="count") ) - expected = df.groupby("A").agg(b=("B", "sum"), - c=("B", "count")) + expected = df.groupby("A").agg(b=("B", "sum"), c=("B", "count")) tm.assert_frame_equal(result, expected) def test_mangled(self): df = pd.DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]}) - result = df.groupby("A").agg( - b=("B", lambda x: 0), - c=("C", lambda x: 1) + result = df.groupby("A").agg(b=("B", lambda x: 0), c=("C", lambda x: 1)) + expected = pd.DataFrame( + {"b": [0, 0], "c": [1, 1]}, index=pd.Index([0, 1], name="A") ) - expected = pd.DataFrame({"b": [0, 0], "c": [1, 1]}, - index=pd.Index([0, 1], name='A')) tm.assert_frame_equal(result, expected) class TestLambdaMangling: - def test_maybe_mangle_lambdas_passthrough(self): - assert _maybe_mangle_lambdas('mean') == 'mean' - assert _maybe_mangle_lambdas(lambda x: x).__name__ == '<lambda>' + assert _maybe_mangle_lambdas("mean") == "mean" + assert _maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>" # don't mangel single lambda.
- assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == '<lambda>' + assert _maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>" def test_maybe_mangle_lambdas_listlike(self): aggfuncs = [lambda x: 1, lambda x: 2] result = _maybe_mangle_lambdas(aggfuncs) - assert result[0].__name__ == '<lambda_0>' - assert result[1].__name__ == '<lambda_1>' + assert result[0].__name__ == "<lambda_0>" + assert result[1].__name__ == "<lambda_1>" assert aggfuncs[0](None) == result[0](None) assert aggfuncs[1](None) == result[1](None) def test_maybe_mangle_lambdas(self): - func = { - 'A': [lambda x: 0, lambda x: 1] - } + func = {"A": [lambda x: 0, lambda x: 1]} result = _maybe_mangle_lambdas(func) - assert result['A'][0].__name__ == '<lambda_0>' - assert result['A'][1].__name__ == '<lambda_1>' + assert result["A"][0].__name__ == "<lambda_0>" + assert result["A"][1].__name__ == "<lambda_1>" def test_maybe_mangle_lambdas_args(self): - func = { - 'A': [lambda x, a, b=1: (0, a, b), lambda x: 1] - } + func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]} result = _maybe_mangle_lambdas(func) - assert result['A'][0].__name__ == '<lambda_0>' - assert result['A'][1].__name__ == '<lambda_1>' + assert result["A"][0].__name__ == "<lambda_0>" + assert result["A"][1].__name__ == "<lambda_1>" - assert func['A'][0](0, 1) == (0, 1, 1) - assert func['A'][0](0, 1, 2) == (0, 1, 2) - assert func['A'][0](0, 2, b=3) == (0, 2, 3) + assert func["A"][0](0, 1) == (0, 1, 1) + assert func["A"][0](0, 1, 2) == (0, 1, 2) + assert func["A"][0](0, 2, b=3) == (0, 2, 3) def test_maybe_mangle_lambdas_named(self): - func = OrderedDict([('C', np.mean), - ('D', OrderedDict([('foo', np.mean), - ('bar', np.mean)]))]) + func = OrderedDict( + [("C", np.mean), ("D", OrderedDict([("foo", np.mean), ("bar", np.mean)]))] + ) result = _maybe_mangle_lambdas(func) assert result == func @@ -515,15 +537,16 @@ def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) result = df.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]}) - expected = pd.DataFrame({("B", "<lambda_0>"): [0, 0], - ("B", "<lambda_1>"): [1, 1]}, - index=pd.Index([0, 1], name='A')) + expected = pd.DataFrame( + {("B", "<lambda_0>"): [0, 0], ("B", "<lambda_1>"): [1, 1]}, + index=pd.Index([0, 1], name="A"), + ) tm.assert_frame_equal(result, expected) def test_mangle_series_groupby(self): gr = pd.Series([1, 2, 3, 4]).groupby([0, 0, 1, 1]) result = gr.agg([lambda x: 0, lambda x: 1]) - expected = pd.DataFrame({'<lambda_0>': [0, 0], '<lambda_1>': [1, 1]}) + expected = pd.DataFrame({"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]}) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="GH-26611.
kwargs for multi-agg.") @@ -531,9 +554,9 @@ def test_with_kwargs(self): f1 = lambda x, y, b=1: x.sum() + y + b f2 = lambda x, y, b=2: x.sum() + y * b result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0) - expected = pd.DataFrame({'<lambda_0>': [4], '<lambda_1>': [6]}) + expected = pd.DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]}) tm.assert_frame_equal(result, expected) result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) - expected = pd.DataFrame({'<lambda_0>': [13], '<lambda_1>': [30]}) + expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c2f98b11bb33e..5d50c044cf9f5 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -6,121 +6,135 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range) +from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range from pandas.core.groupby.groupby import DataError import pandas.util.testing as tm -@pytest.mark.parametrize('op_name', [ - 'count', - 'sum', - 'std', - 'var', - 'sem', - 'mean', - pytest.param('median', - # ignore mean of empty slice - # and all-NaN - marks=[pytest.mark.filterwarnings( - "ignore::RuntimeWarning" - )]), - 'prod', - 'min', - 'max', -]) +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + pytest.param( + "median", + # ignore mean of empty slice + # and all-NaN + marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")], + ), + "prod", + "min", + "max", + ], +) def test_cythonized_aggers(op_name): - data = {'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., np.nan, np.nan], - 'B': ['A', 'B'] * 6, - 'C': np.random.randn(12)} + data = { + "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan], + "B": ["A", "B"] * 6, + "C": np.random.randn(12), + } df = DataFrame(data) - df.loc[2:10:2, 'C'] = np.nan + df.loc[2:10:2, "C"] = np.nan op = lambda x: getattr(x, op_name)() # single column - grouped = df.drop(['B'], axis=1).groupby('A') - exp = {cat: op(group['C']) for cat, group in grouped} - exp = DataFrame({'C': exp}) - exp.index.name = 'A' + grouped = df.drop(["B"], axis=1).groupby("A") + exp = {cat: op(group["C"]) for cat, group in grouped} + exp = DataFrame({"C": exp}) + exp.index.name = "A" result = op(grouped) tm.assert_frame_equal(result, exp) # multiple columns - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) expd = {} for (cat1, cat2), group in grouped: - expd.setdefault(cat1, {})[cat2] = op(group['C']) + expd.setdefault(cat1, {})[cat2] = op(group["C"]) exp = DataFrame(expd).T.stack(dropna=False) - exp.index.names = ['A', 'B'] - exp.name = 'C' + exp.index.names = ["A", "B"] + exp.name = "C" - result = op(grouped)['C'] - if op_name in ['sum', 'prod']: + result = op(grouped)["C"] + if op_name in ["sum", "prod"]: tm.assert_series_equal(result, exp) def test_cython_agg_boolean(): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': np.random.randint(0, 2, 50).astype('bool')}) - result = frame.groupby('a')['b'].mean() - expected = frame.groupby('a')['b'].agg(np.mean) + frame = DataFrame( + { + "a": np.random.randint(0, 5, 50), + "b": np.random.randint(0, 2, 50).astype("bool"), + } + ) + result = frame.groupby("a")["b"].mean() + expected = frame.groupby("a")["b"].agg(np.mean) tm.assert_series_equal(result, expected) def test_cython_agg_nothing_to_agg(): - frame = DataFrame({'a': np.random.randint(0, 5, 50), -
'b': ['foo', 'bar'] * 25}) + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - frame.groupby('a')['b'].mean() + frame.groupby("a")["b"].mean() - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25}) + frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) with pytest.raises(DataError, match=msg): - frame[['b']].groupby(frame['a']).mean() + frame[["b"]].groupby(frame["a"]).mean() def test_cython_agg_nothing_to_agg_with_dates(): - frame = DataFrame({'a': np.random.randint(0, 5, 50), - 'b': ['foo', 'bar'] * 25, - 'dates': pd.date_range('now', periods=50, freq='T')}) + frame = DataFrame( + { + "a": np.random.randint(0, 5, 50), + "b": ["foo", "bar"] * 25, + "dates": pd.date_range("now", periods=50, freq="T"), + } + ) msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - frame.groupby('b').dates.mean() + frame.groupby("b").dates.mean() def test_cython_agg_frame_columns(): # #2113 - df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) + df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]}) - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() - df.groupby(level=0, axis='columns').mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() + df.groupby(level=0, axis="columns").mean() def test_cython_agg_return_dict(): # GH 16741 df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - ts = df.groupby('A')['B'].agg(lambda x: x.value_counts().to_dict()) - expected = Series([{'two': 1, 'one': 1, 'three': 1}, - {'two': 2, 'one': 2, 'three': 1}], - index=Index(['bar', 'foo'], name='A'), - name='B') + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + + ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict()) + expected = Series( + [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}], + index=Index(["bar", "foo"], name="A"), + name="B", + ) tm.assert_series_equal(ts, expected) def test_cython_fail_agg(): - dr = bdate_range('1/1/2000', periods=50) - ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + dr = bdate_range("1/1/2000", periods=50) + ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() @@ -128,17 +142,20 @@ def test_cython_fail_agg(): tm.assert_series_equal(summed, expected) -@pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', np.median), - ('var', np.var), - ('add', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), -]) +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", np.median), + ("var", np.var), + ("add", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ], +) def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) @@ -148,12 +165,15 @@ def 
test__cython_agg_general(op, targop): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op, targop', [ - ('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('min', np.min), - ('max', np.max), ] +@pytest.mark.parametrize( + "op, targop", + [ + ("mean", np.mean), + ("median", lambda x: np.median(x) if len(x) > 0 else np.nan), + ("var", lambda x: np.var(x, ddof=1)), + ("min", np.min), + ("max", np.max), + ], ) def test_cython_agg_empty_buckets(op, targop, observed): df = pd.DataFrame([11, 12, 13]) @@ -172,43 +192,47 @@ def test_cython_agg_empty_buckets(op, targop, observed): def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these - df = pd.DataFrame([11, 12, 13], columns=['a']) + df = pd.DataFrame([11, 12, 13], columns=["a"]) grps = range(0, 25, 5) # add / sum - result = df.groupby(pd.cut(df['a'], grps), - observed=observed)._cython_agg_general('add') + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "add" + ) intervals = pd.interval_range(0, 20, freq=5) expected = pd.DataFrame( {"a": [0, 0, 36, 0]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) if observed: expected = expected[expected.a != 0] tm.assert_frame_equal(result, expected) # prod - result = df.groupby(pd.cut(df['a'], grps), - observed=observed)._cython_agg_general('prod') + result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( + "prod" + ) expected = pd.DataFrame( {"a": [1, 1, 1716, 1]}, - index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + index=pd.CategoricalIndex(intervals, name="a", ordered=True), + ) if observed: expected = expected[expected.a != 1] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min']) -@pytest.mark.parametrize('data', [ - Timestamp('2016-10-14 21:00:44.557'), - Timedelta('17088 days 21:00:44.557'), ]) +@pytest.mark.parametrize("op", ["first", "last", "max", "min"]) +@pytest.mark.parametrize( + "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")] +) def test_cython_with_timestamp_and_nat(op, data): # https://github.com/pandas-dev/pandas/issues/19526 - df = DataFrame({'a': [0, 1], 'b': [data, NaT]}) - index = Index([0, 1], name='a') + df = DataFrame({"a": [0, 1], "b": [data, NaT]}) + index = Index([0, 1], name="a") # We will group by a and test the cython aggregations - expected = DataFrame({'b': [data, NaT]}, index=index) + expected = DataFrame({"b": [data, NaT]}, index=index) - result = df.groupby('a').aggregate(op) + result = df.groupby("a").aggregate(op) tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 903ffa23173cb..7905575a4a1a8 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -11,8 +11,14 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, PeriodIndex, Series, date_range, - period_range) + DataFrame, + Index, + MultiIndex, + PeriodIndex, + Series, + date_range, + period_range, +) from pandas.core.groupby.groupby import SpecificationError import pandas.util.testing as tm @@ -24,60 +30,73 @@ def test_agg_api(): # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different 
api for agg when passed custom function with mixed frame - df = DataFrame({'data1': np.random.randn(5), - 'data2': np.random.randn(5), - 'key1': ['a', 'a', 'b', 'b', 'a'], - 'key2': ['one', 'two', 'one', 'two', 'one']}) - grouped = df.groupby('key1') + df = DataFrame( + { + "data1": np.random.randn(5), + "data2": np.random.randn(5), + "key1": ["a", "a", "b", "b", "a"], + "key2": ["one", "two", "one", "two", "one"], + } + ) + grouped = df.groupby("key1") def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) - expected.columns = ['data1', 'data2'] + expected.columns = ["data1", "data2"] result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) def test_agg_datetimes_mixed(): - data = [[1, '2012-01-01', 1.0], - [2, '2012-01-02', 2.0], - [3, None, 3.0]] - - df1 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - data = [[row[0], - (dt.datetime.strptime(row[1], '%Y-%m-%d').date() - if row[1] else None), - row[2]] - for row in data] - - df2 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - df1['weights'] = df1['value'] / df1['value'].sum() - gb1 = df1.groupby('date').aggregate(np.sum) - - df2['weights'] = df1['value'] / df1['value'].sum() - gb2 = df2.groupby('date').aggregate(np.sum) - - assert (len(gb1) == len(gb2)) + data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]] + + df1 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + data = [ + [ + row[0], + (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None), + row[2], + ] + for row in data + ] + + df2 = DataFrame( + { + "key": [x[0] for x in data], + "date": [x[1] for x in data], + "value": [x[2] for x in data], + } + ) + + df1["weights"] = df1["value"] / df1["value"].sum() + gb1 = df1.groupby("date").aggregate(np.sum) + + df2["weights"] = df1["value"] / df1["value"].sum() + gb2 = df2.groupby("date").aggregate(np.sum) + + assert len(gb1) == len(gb2) def test_agg_period_index(): - prng = period_range('2012-1-1', freq='M', periods=3) + prng = period_range("2012-1-1", freq="M", periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() assert isinstance(rs.index, PeriodIndex) # GH 3579 - index = period_range(start='1999-01', periods=5, freq='M') + index = period_range(start="1999-01", periods=5, freq="M") s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) - series = [('s1', s1), ('s2', s2)] + series = [("s1", s1), ("s2", s2)] df = DataFrame.from_dict(OrderedDict(series)) grouped = df.groupby(df.index.month) list(grouped) @@ -86,38 +105,38 @@ def test_agg_period_index(): def test_agg_dict_parameter_cast_result_dtypes(): # GH 12821 - df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) - df.loc[[0, 1, 2, 5], 'time'] = None + df = DataFrame( + { + "class": ["A", "A", "B", "B", "C", "C", "D", "D"], + "time": date_range("1/1/2011", periods=8, freq="H"), + } + ) + df.loc[[0, 1, 2, 5], "time"] = None # test for `first` function - exp = df.loc[[0, 3, 4, 6]].set_index('class') - grouped = df.groupby('class') + exp = df.loc[[0, 3, 4, 6]].set_index("class") + grouped = df.groupby("class") tm.assert_frame_equal(grouped.first(), exp) - tm.assert_frame_equal(grouped.agg('first'), exp) - tm.assert_frame_equal(grouped.agg({'time': 
'first'}), exp) - tm.assert_series_equal(grouped.time.first(), exp['time']) - tm.assert_series_equal(grouped.time.agg('first'), exp['time']) + tm.assert_frame_equal(grouped.agg("first"), exp) + tm.assert_frame_equal(grouped.agg({"time": "first"}), exp) + tm.assert_series_equal(grouped.time.first(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("first"), exp["time"]) # test for `last` function - exp = df.loc[[0, 3, 4, 7]].set_index('class') - grouped = df.groupby('class') + exp = df.loc[[0, 3, 4, 7]].set_index("class") + grouped = df.groupby("class") tm.assert_frame_equal(grouped.last(), exp) - tm.assert_frame_equal(grouped.agg('last'), exp) - tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp) - tm.assert_series_equal(grouped.time.last(), exp['time']) - tm.assert_series_equal(grouped.time.agg('last'), exp['time']) + tm.assert_frame_equal(grouped.agg("last"), exp) + tm.assert_frame_equal(grouped.agg({"time": "last"}), exp) + tm.assert_series_equal(grouped.time.last(), exp["time"]) + tm.assert_series_equal(grouped.time.agg("last"), exp["time"]) # count - exp = pd.Series([2, 2, 2, 2], - index=Index(list('ABCD'), name='class'), - name='time') + exp = pd.Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.agg(len), exp) tm.assert_series_equal(grouped.time.size(), exp) - exp = pd.Series([0, 1, 1, 2], - index=Index(list('ABCD'), name='class'), - name='time') + exp = pd.Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time") tm.assert_series_equal(grouped.time.count(), exp) @@ -125,19 +144,17 @@ def test_agg_cast_results_dtypes(): # similar to GH12821 # xref #11444 u = [dt.datetime(2015, x + 1, 1) for x in range(12)] - v = list('aaabbbbbbccd') - df = pd.DataFrame({'X': v, 'Y': u}) + v = list("aaabbbbbbccd") + df = pd.DataFrame({"X": v, "Y": u}) - result = df.groupby('X')['Y'].agg(len) - expected = df.groupby('X')['Y'].count() + result = df.groupby("X")["Y"].agg(len) + expected = df.groupby("X")["Y"].count() tm.assert_series_equal(result, expected) def test_aggregate_float64_no_int64(): # see gh-11199 - df = DataFrame({"a": [1, 2, 3, 4, 5], - "b": [1, 2, 2, 4, 5], - "c": [1, 2, 3, 4, 5]}) + df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" @@ -145,8 +162,7 @@ def test_aggregate_float64_no_int64(): result = df.groupby("b")[["a"]].mean() tm.assert_frame_equal(result, expected) - expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, - index=[1, 2, 4, 5]) + expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" result = df.groupby("b")[["a", "c"]].mean() @@ -157,138 +173,133 @@ def test_aggregate_api_consistency(): # GH 9052 # make sure that the aggregates via dict # are consistent - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - grouped = df.groupby(['A', 'B']) - c_mean = grouped['C'].mean() - c_sum = grouped['C'].sum() - d_mean = grouped['D'].mean() - d_sum = grouped['D'].sum() - - result = grouped['D'].agg(['sum', 'mean']) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + grouped = df.groupby(["A", 
"B"]) + c_mean = grouped["C"].mean() + c_sum = grouped["C"].sum() + d_mean = grouped["D"].mean() + d_sum = grouped["D"].sum() + + result = grouped["D"].agg(["sum", "mean"]) expected = pd.concat([d_sum, d_mean], axis=1) - expected.columns = ['sum', 'mean'] + expected.columns = ["sum", "mean"] tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['sum', 'mean']]) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped[['D', 'C']].agg([np.sum, np.mean]) + result = grouped[["D", "C"]].agg([np.sum, np.mean]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], - ['sum', 'mean']]) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped.agg({'C': 'mean', 'D': 'sum'}) + result = grouped.agg({"C": "mean", "D": "sum"}) expected = pd.concat([d_sum, c_mean], axis=1) tm.assert_frame_equal(result, expected, check_like=True) - result = grouped.agg({'C': ['mean', 'sum'], - 'D': ['mean', 'sum']}) + result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]}) expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['mean', 'sum']]) + expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) + result = grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) - expected.columns = MultiIndex.from_product([['r', 'r2'], - ['D', 'C']]) + expected.columns = MultiIndex.from_product([["r", "r2"], ["D", "C"]]) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_dict_renaming_deprecation(): # 15931 - df = pd.DataFrame({'A': [1, 1, 1, 2, 2], - 'B': range(5), - 'C': range(5)}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w: - df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, - 'C': {'bar': ['count', 'min']}}) + df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: + df.groupby("A").agg( + {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}} + ) assert "using a dict with renaming" in str(w[0].message) assert "named aggregation" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) + df.groupby("A")[["B", "C"]].agg({"ma": "max"}) with tm.assert_produces_warning(FutureWarning) as w: - df.groupby('A').B.agg({'foo': 'count'}) + df.groupby("A").B.agg({"foo": "count"}) assert "using a dict on a Series for aggregation" in str(w[0].message) assert "named aggregation instead." 
in str(w[0].message) def test_agg_compat(): # GH 12334 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) - expected.columns = MultiIndex.from_tuples([('C', 'sum'), - ('C', 'std')]) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) + expected.columns = MultiIndex.from_tuples([("C", "sum"), ("C", "std")]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'C': ['sum', 'std']}) + result = g["D"].agg({"C": ["sum", "std"]}) tm.assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) - expected.columns = ['C', 'D'] + expected = pd.concat([g["D"].sum(), g["D"].std()], axis=1) + expected.columns = ["C", "D"] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'C': 'sum', 'D': 'std'}) + result = g["D"].agg({"C": "sum", "D": "std"}) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(): # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - msg = r'cannot perform renaming for r[1-2] with a nested dictionary' + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + g = df.groupby(["A", "B"]) + + msg = r"cannot perform renaming for r[1-2] with a nested dictionary" with pytest.raises(SpecificationError, match=msg): - g.aggregate({'r1': {'C': ['mean', 'sum']}, - 'r2': {'D': ['mean', 'sum']}}) + g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) - expected = pd.concat([g['C'].mean(), g['C'].std(), - g['D'].mean(), g['D'].std()], - axis=1) + result = g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}}) + expected = pd.concat( + [g["C"].mean(), g["C"].std(), g["D"].mean(), g["D"].std()], axis=1 + ) expected.columns = pd.MultiIndex.from_tuples( - [('ra', 'mean'), ('ra', 'std'), - ('rb', 'mean'), ('rb', 'std')]) + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) tm.assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) - expected = expected.rename(columns={'result1': 'D'}) + expected = g["D"].agg({"result1": np.sum, "result2": np.mean}) + expected = expected.rename(columns={"result1": "D"}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + result = g["D"].agg({"D": np.sum, 
"result2": np.mean}) tm.assert_frame_equal(result, expected, check_like=True) @@ -296,11 +307,11 @@ def test_agg_item_by_item_raise_typeerror(): df = DataFrame(np.random.randint(10, size=(20, 10))) def raiseException(df): - pprint_thing('----------------------------------------') + pprint_thing("----------------------------------------") pprint_thing(df.to_string()) - raise TypeError('test') + raise TypeError("test") - with pytest.raises(TypeError, match='test'): + with pytest.raises(TypeError, match="test"): df.groupby(0).agg(raiseException) @@ -315,22 +326,58 @@ def test_series_agg_multikey(): def test_series_agg_multi_pure_python(): data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) def bad(x): - assert (len(x.values.base) > 0) - return 'foo' + assert len(x.values.base) > 0 + return "foo" - result = data.groupby(['A', 'B']).agg(bad) - expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + result = data.groupby(["A", "B"]).agg(bad) + expected = data.groupby(["A", "B"]).agg(lambda x: "foo") tm.assert_frame_equal(result, expected) @@ -343,12 +390,20 @@ def P1(a): except Exception: return np.nan - df = DataFrame({'col1': [1, 2, 3, 4], - 'col2': [10, 25, 26, 31], - 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), - dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) - - g = df.groupby('date') + df = DataFrame( + { + "col1": [1, 2, 3, 4], + "col2": [10, 25, 26, 31], + "date": [ + dt.date(2013, 2, 10), + dt.date(2013, 2, 10), + dt.date(2013, 2, 11), + dt.date(2013, 2, 11), + ], + } + ) + + g = df.groupby("date") expected = g.agg([P1]) expected.columns = expected.columns.levels[0] @@ -359,74 +414,80 @@ def P1(a): def test_agg_callables(): # GH 7929 - df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) + df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64) class fn_class: - def __call__(self, x): return sum(x) - equiv_callables = [sum, - np.sum, - lambda x: sum(x), - lambda x: x.sum(), - partial(sum), - fn_class(), ] + equiv_callables = [ + sum, + np.sum, + lambda x: sum(x), + lambda x: x.sum(), + partial(sum), + fn_class(), + ] expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: - result = df.groupby('foo').agg(ecall) + result = df.groupby("foo").agg(ecall) tm.assert_frame_equal(result, expected) def test_agg_over_numpy_arrays(): # GH 3788 - df = pd.DataFrame([[1, np.array([10, 20, 30])], - [1, np.array([40, 50, 60])], - [2, np.array([20, 30, 40])]], - columns=['category', 'arraydata']) - result = df.groupby('category').agg(sum) + df = pd.DataFrame( + [ + [1, np.array([10, 20, 30])], + [1, np.array([40, 50, 60])], + [2, np.array([20, 30, 40])], + ], + columns=["category", "arraydata"], + ) + result = df.groupby("category").agg(sum) expected_data = [[np.array([50, 70, 90])], 
[np.array([20, 30, 40])]] - expected_index = pd.Index([1, 2], name='category') - expected_column = ['arraydata'] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) + expected_index = pd.Index([1, 2], name="category") + expected_column = ["arraydata"] + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) tm.assert_frame_equal(result, expected) def test_agg_timezone_round_trip(): # GH 15426 - ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') - df = pd.DataFrame({'a': 1, - 'b': [ts + dt.timedelta(minutes=nn) - for nn in range(10)]}) + ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") + df = pd.DataFrame( + {"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]} + ) - result1 = df.groupby('a')['b'].agg(np.min).iloc[0] - result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] - result3 = df.groupby('a')['b'].min().iloc[0] + result1 = df.groupby("a")["b"].agg(np.min).iloc[0] + result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0] + result3 = df.groupby("a")["b"].min().iloc[0] assert result1 == ts assert result2 == ts assert result3 == ts - dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') - for i in range(1, 5)] - df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) - grouped = df.groupby('A') + dates = [ + pd.Timestamp("2016-01-0%d 12:00:00" % i, tz="US/Pacific") for i in range(1, 5) + ] + df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) + grouped = df.groupby("A") - ts = df['B'].iloc[0] - assert ts == grouped.nth(0)['B'].iloc[0] - assert ts == grouped.head(1)['B'].iloc[0] - assert ts == grouped.first()['B'].iloc[0] + ts = df["B"].iloc[0] + assert ts == grouped.nth(0)["B"].iloc[0] + assert ts == grouped.head(1)["B"].iloc[0] + assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] - ts = df['B'].iloc[2] - assert ts == grouped.last()['B'].iloc[0] + ts = df["B"].iloc[2] + assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] @@ -438,68 +499,75 @@ def test_sum_uint64_overflow(): df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 - index = pd.Index([9223372036854775808, - 9223372036854775810, - 9223372036854775812], - dtype=np.uint64) - expected = pd.DataFrame({1: [9223372036854775809, - 9223372036854775811, - 9223372036854775813]}, - index=index) + index = pd.Index( + [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64 + ) + expected = pd.DataFrame( + {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, + index=index, + ) expected.index.name = 0 result = df.groupby(0).sum() tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("structure, expected", [ - (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), - (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), - (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), - (3, 4): (3, 4, 4)}})), - (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], - (3, 4): [3, 4, 4]}})) -]) +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), + (list, pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), + ( + lambda x: tuple(x), + pd.DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}), + ), + ( + lambda x: 
list(x), + pd.DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}), + ), + ], +) def test_agg_structs_dataframe(structure, expected): - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], - 'C': [1, 1, 1, 3, 4, 4]}) + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) - result = df.groupby(['A', 'B']).aggregate(structure) - expected.index.names = ['A', 'B'] + result = df.groupby(["A", "B"]).aggregate(structure) + expected.index.names = ["A", "B"] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("structure, expected", [ - (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), - (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), - (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], - index=[1, 3], name='C')), - (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], - index=[1, 3], name='C')) -]) +@pytest.mark.parametrize( + "structure, expected", + [ + (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")), + (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")), + ], +) def test_agg_structs_series(structure, expected): # Issue #18079 - df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], - 'B': [1, 1, 1, 4, 4, 4], - 'C': [1, 1, 1, 3, 4, 4]}) + df = pd.DataFrame( + {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]} + ) - result = df.groupby('A')['C'].aggregate(structure) - expected.index.name = 'A' + result = df.groupby("A")["C"].aggregate(structure) + expected.index.name = "A" tm.assert_series_equal(result, expected) def test_agg_category_nansum(observed): - categories = ['a', 'b', 'c'] - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=categories), - 'B': [1, 2, 3]}) + categories = ["a", "b", "c"] + df = pd.DataFrame( + {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]} + ) result = df.groupby("A", observed=observed).B.agg(np.nansum) - expected = pd.Series([3, 3, 0], - index=pd.CategoricalIndex(['a', 'b', 'c'], - categories=categories, - name='A'), - name='B') + expected = pd.Series( + [3, 3, 0], + index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"), + name="B", + ) if observed: expected = expected[expected != 0] tm.assert_series_equal(result, expected) @@ -507,25 +575,32 @@ def test_agg_category_nansum(observed): def test_agg_list_like_func(): # GH 18473 - df = pd.DataFrame({'A': [str(x) for x in range(3)], - 'B': [str(x) for x in range(3)]}) - grouped = df.groupby('A', as_index=False, sort=False) - result = grouped.agg({'B': lambda x: list(x)}) - expected = pd.DataFrame({'A': [str(x) for x in range(3)], - 'B': [[str(x)] for x in range(3)]}) + df = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]} + ) + grouped = df.groupby("A", as_index=False, sort=False) + result = grouped.agg({"B": lambda x: list(x)}) + expected = pd.DataFrame( + {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]} + ) tm.assert_frame_equal(result, expected) def test_agg_lambda_with_timezone(): # GH 23683 - df = pd.DataFrame({ - 'tag': [1, 1], - 'date': [ - pd.Timestamp('2018-01-01', tz='UTC'), - pd.Timestamp('2018-01-02', tz='UTC')] - }) - result = df.groupby('tag').agg({'date': lambda e: e.head(1)}) - expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')], - 
index=pd.Index([1], name='tag'), - columns=['date']) + df = pd.DataFrame( + { + "tag": [1, 1], + "date": [ + pd.Timestamp("2018-01-01", tz="UTC"), + pd.Timestamp("2018-01-02", tz="UTC"), + ], + } + ) + result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) + expected = pd.DataFrame( + [pd.Timestamp("2018-01-01", tz="UTC")], + index=pd.Index([1], name="tag"), + columns=["date"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 3b636c87dc584..bdf93756b7559 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -7,22 +7,24 @@ @pytest.fixture def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) @pytest.fixture def df(): return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) @pytest.fixture @@ -42,27 +44,61 @@ def tsframe(tsd): @pytest.fixture def df_mixed_floats(): - return DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) + return DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.array(np.random.randn(8), dtype="float32"), + } + ) @pytest.fixture def three_group(): - return DataFrame({'A': ['foo', 'foo', 'foo', - 'foo', 'bar', 'bar', - 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', - 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', - 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + return DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 8f57254eae219..44a583bf661e8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -25,9 +25,13 @@ def test_apply_issues(): 2011.05.18,04:00,1.40750 2011.05.18,05:00,1.40649""" - df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value'], - parse_dates=[['date', 
'time']]) - df = df.set_index('date_time') + df = pd.read_csv( + StringIO(s), + header=None, + names=["date", "time", "value"], + parse_dates=[["date", "time"]], + ) + df = df.set_index("date_time") expected = df.groupby(df.index.date).idxmax() result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) @@ -35,44 +39,45 @@ def test_apply_issues(): # GH 5789 # don't auto coerce dates - df = pd.read_csv(StringIO(s), header=None, names=['date', 'time', 'value']) + df = pd.read_csv(StringIO(s), header=None, names=["date", "time", "value"]) exp_idx = pd.Index( - ['2011.05.16', '2011.05.17', '2011.05.18' - ], dtype=object, name='date') - expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) - result = df.groupby('date').apply( - lambda x: x['time'][x['value'].idxmax()]) + ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date" + ) + expected = Series(["00:00", "02:00", "02:00"], index=exp_idx) + result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()]) tm.assert_series_equal(result, expected) def test_apply_trivial(): # GH 20066 # trivial apply: ignore input and return a constant dataframe. - df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df.iloc[1:]) + df = pd.DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + result = df.groupby([str(x) for x in df.dtypes], axis=1).apply( + lambda x: df.iloc[1:] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object.") +@pytest.mark.xfail( + reason="GH#20066; function passed into apply " + "returns a DataFrame with the same index " + "as the one to create GroupBy object." +) def test_apply_trivial_fail(): # GH 20066 # trivial apply fails if the constant dataframe has the same index # with the one used to create GroupBy object. 
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df, df], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df) + df = pd.DataFrame( + {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=["key", "data"], + ) + expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + result = df.groupby([str(x) for x in df.dtypes], axis=1).apply(lambda x: df) tm.assert_frame_equal(result, expected) @@ -85,15 +90,19 @@ def test_fast_apply(): N = 1000 labels = np.random.randint(0, 2000, size=N) labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + df = DataFrame( + { + "key": labels, + "key2": labels2, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) def f(g): return 1 - g = df.groupby(['key', 'key2']) + g = df.groupby(["key", "key2"]) grouper = g.grouper @@ -108,28 +117,33 @@ def f(g): @pytest.mark.parametrize( "df, group_names", [ - (DataFrame({"a": [1, 1, 1, 2, 3], - "b": ["a", "a", "a", "b", "c"]}), - [1, 2, 3]), - (DataFrame({"a": [0, 0, 1, 1], - "b": [0, 1, 0, 1]}), - [0, 1]), - (DataFrame({"a": [1]}), - [1]), - (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], - "b": range(8)}), - [1, 2]), - (DataFrame({"a": [1, 2, 3, 1, 2, 3], - "two": [4, 5, 6, 7, 8, 9]}), - [1, 2, 3]), - (DataFrame({"a": list("aaabbbcccc"), + (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]), + (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]), + (DataFrame({"a": [1]}), [1]), + (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]), + (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]), + ( + DataFrame( + { + "a": list("aaabbbcccc"), "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], - "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), - ["a", "b", "c"]), - (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), - [1, 2]), - ], ids=['GH2936', 'GH7739 & GH10519', 'GH10519', - 'GH2656', 'GH12155', 'GH20084', 'GH21417']) + "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8], + } + ), + ["a", "b", "c"], + ), + (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]), + ], + ids=[ + "GH2936", + "GH7739 & GH10519", + "GH10519", + "GH2656", + "GH12155", + "GH20084", + "GH21417", + ], +) def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 @@ -177,8 +191,12 @@ def f_constant_df(group): def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 - df = DataFrame({'foo1': np.random.randn(6), - 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) + df = DataFrame( + { + "foo1": np.random.randn(6), + "foo2": ["one", "two", "two", "three", "one", "two"], + } + ) result = df.apply(lambda x: x, axis=1).dtypes expected = df.dtypes tm.assert_series_equal(result, expected) @@ -193,12 +211,16 @@ def test_apply_with_mixed_dtype(): def test_groupby_as_index_apply(df): # GH #4648 and #3417 - df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], - 'user_id': [1, 2, 1, 1, 3, 1], - 'time': range(6)}) + df = DataFrame( + { + "item_id": ["b", "b", "a", "c", "a", "b"], + "user_id": [1, 2, 1, 1, 3, 1], + "time": range(6), + } + ) - g_as = df.groupby('user_id', as_index=True) - g_not_as = df.groupby('user_id', as_index=False) + g_as = 
df.groupby("user_id", as_index=True) + g_not_as = df.groupby("user_id", as_index=False) res_as = g_as.head(2).index res_not_as = g_not_as.head(2).index @@ -211,32 +233,31 @@ def test_groupby_as_index_apply(df): # apply doesn't maintain the original ordering # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), ( - 2, 4)]) + exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)]) tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None]) + exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None]) tm.assert_index_equal(res_as_apply, exp_as_apply) tm.assert_index_equal(res_not_as_apply, exp_not_as_apply) - ind = Index(list('abcde')) + ind = Index(list("abcde")) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) res = df.groupby(0, as_index=False).apply(lambda x: x).index tm.assert_index_equal(res, ind) def test_apply_concat_preserve_names(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) def desc(group): result = group.describe() - result.index.name = 'stat' + result.index.name = "stat" return result def desc2(group): result = group.describe() - result.index.name = 'stat' - result = result[:len(group)] + result.index.name = "stat" + result = result[: len(group)] # weirdo return result @@ -244,31 +265,31 @@ def desc3(group): result = group.describe() # names are different - result.index.name = 'stat_%d' % len(group) + result.index.name = "stat_%d" % len(group) - result = result[:len(group)] + result = result[: len(group)] # weirdo return result result = grouped.apply(desc) - assert result.index.names == ('A', 'B', 'stat') + assert result.index.names == ("A", "B", "stat") result2 = grouped.apply(desc2) - assert result2.index.names == ('A', 'B', 'stat') + assert result2.index.names == ("A", "B", "stat") result3 = grouped.apply(desc3) - assert result3.index.names == ('A', 'B', None) + assert result3.index.names == ("A", "B", None) def test_apply_series_to_frame(): def f(piece): - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): logged = np.log(piece) - return DataFrame({'value': piece, - 'demeaned': piece - piece.mean(), - 'logged': logged}) + return DataFrame( + {"value": piece, "demeaned": piece - piece.mean(), "logged": logged} + ) - dr = bdate_range('1/1/2000', periods=100) + dr = bdate_range("1/1/2000", periods=100) ts = Series(np.random.randn(100), index=dr) grouped = ts.groupby(lambda x: x.month) @@ -279,45 +300,49 @@ def f(piece): def test_apply_series_yield_constant(df): - result = df.groupby(['A', 'B'])['C'].apply(len) - assert result.index.names[:2] == ('A', 'B') + result = df.groupby(["A", "B"])["C"].apply(len) + assert result.index.names[:2] == ("A", "B") def test_apply_frame_yield_constant(df): # GH13568 - result = df.groupby(['A', 'B']).apply(len) + result = df.groupby(["A", "B"]).apply(len) assert isinstance(result, Series) assert result.name is None - result = df.groupby(['A', 'B'])[['C', 'D']].apply(len) + result = df.groupby(["A", "B"])[["C", "D"]].apply(len) assert isinstance(result, Series) assert result.name is None def test_apply_frame_to_series(df): - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) result = grouped.apply(len) - expected = grouped.count()['C'] + expected = grouped.count()["C"] tm.assert_index_equal(result.index, expected.index) tm.assert_numpy_array_equal(result.values, expected.values) def 
test_apply_frame_concat_series(): def trans(group): - return group.groupby('B')['C'].sum().sort_values()[:2] + return group.groupby("B")["C"].sum().sort_values()[:2] def trans2(group): - grouped = group.groupby(df.reindex(group.index)['B']) + grouped = group.groupby(df.reindex(group.index)["B"]) return grouped.sum().sort_values()[:2] - df = DataFrame({'A': np.random.randint(0, 5, 1000), - 'B': np.random.randint(0, 5, 1000), - 'C': np.random.randn(1000)}) + df = DataFrame( + { + "A": np.random.randint(0, 5, 1000), + "B": np.random.randint(0, 5, 1000), + "C": np.random.randn(1000), + } + ) - result = df.groupby('A').apply(trans) - exp = df.groupby('A')['C'].apply(trans2) + result = df.groupby("A").apply(trans) + exp = df.groupby("A")["C"].apply(trans2) tm.assert_series_equal(result, exp, check_names=False) - assert result.name == 'C' + assert result.name == "C" def test_apply_transform(ts): @@ -331,7 +356,7 @@ def test_apply_multikey_corner(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) def f(group): - return group.sort_values('A')[-5:] + return group.sort_values("A")[-5:] result = grouped.apply(f) for key, group in grouped: @@ -340,59 +365,69 @@ def f(group): def test_apply_chunk_view(): # Low level tinkering could be unsafe, make sure not - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'value': range(9)}) + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result = df.groupby('key', group_keys=False).apply(lambda x: x[:2]) + result = df.groupby("key", group_keys=False).apply(lambda x: x[:2]) expected = df.take([0, 1, 3, 4, 6, 7]) tm.assert_frame_equal(result, expected) def test_apply_no_name_column_conflict(): - df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], - 'value': range(9, -1, -1)}) + df = DataFrame( + { + "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + "value": range(9, -1, -1), + } + ) # it works! 
#2605 - grouped = df.groupby(['name', 'name2']) - grouped.apply(lambda x: x.sort_values('value', inplace=True)) + grouped = df.groupby(["name", "name2"]) + grouped.apply(lambda x: x.sort_values("value", inplace=True)) def test_apply_typecast_fail(): - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile( - ['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + } + ) def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby('d').apply(f) + result = df.groupby("d").apply(f) expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) + expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) def test_apply_multiindex_fail(): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) + v = group["v"] + group["v2"] = (v - v.min()) / (v.max() - v.min()) return group - result = df.groupby('d').apply(f) + result = df.groupby("d").apply(f) expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) + expected["v2"] = np.tile([0.0, 0.5, 1], 2) tm.assert_frame_equal(result, expected) @@ -407,24 +442,28 @@ def test_apply_without_copy(): # GH 5545 # returning a non-copy in an applied function fails - data = DataFrame({'id_field': [100, 100, 200, 300], - 'category': ['a', 'b', 'c', 'c'], - 'value': [1, 2, 3, 4]}) + data = DataFrame( + { + "id_field": [100, 100, 200, 300], + "category": ["a", "b", "c", "c"], + "value": [1, 2, 3, 4], + } + ) def filt1(x): if x.shape[0] == 1: return x.copy() else: - return x[x.category == 'c'] + return x[x.category == "c"] def filt2(x): if x.shape[0] == 1: return x else: - return x[x.category == 'c'] + return x[x.category == "c"] - expected = data.groupby('id_field').apply(filt1) - result = data.groupby('id_field').apply(filt2) + expected = data.groupby("id_field").apply(filt1) + result = data.groupby("id_field").apply(filt2) tm.assert_frame_equal(result, expected) @@ -433,18 +472,22 @@ def test_apply_corner_cases(): N = 1000 labels = np.random.randint(0, 100, size=N) - df = DataFrame({'key': labels, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + df = DataFrame( + { + "key": labels, + "value1": np.random.randn(N), + "value2": ["foo", "bar", "baz", "qux"] * (N // 4), + } + ) - grouped = df.groupby('key') + grouped = df.groupby("key") def f(g): - g['value3'] = g['value1'] * 2 + g["value3"] = g["value1"] * 2 return g result = grouped.apply(f) - assert 'value3' in result + assert "value3" in result def test_apply_numeric_coercion_when_datetime(): @@ -454,45 +497,49 @@ def test_apply_numeric_coercion_when_datetime(): # for which are here. 
# GH 15670 - df = pd.DataFrame({'Number': [1, 2], - 'Date': ["2017-03-02"] * 2, - 'Str': ["foo", "inf"]}) - expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) + df = pd.DataFrame( + {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]} + ) + expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) df.Date = pd.to_datetime(df.Date) - result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - tm.assert_series_equal(result['Str'], expected['Str']) + result = df.groupby(["Number"]).apply(lambda x: x.iloc[0]) + tm.assert_series_equal(result["Str"], expected["Str"]) # GH 15421 - df = pd.DataFrame({'A': [10, 20, 30], - 'B': ['foo', '3', '4'], - 'T': [pd.Timestamp("12:31:22")] * 3}) + df = pd.DataFrame( + {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3} + ) def get_B(g): - return g.iloc[0][['B']] - result = df.groupby('A').apply(get_B)['B'] + return g.iloc[0][["B"]] + + result = df.groupby("A").apply(get_B)["B"] expected = df.B expected.index = df.A tm.assert_series_equal(result, expected) # GH 14423 def predictions(tool): - out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) - if 'step1' in list(tool.State): - out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) - if 'step2' in list(tool.State): - out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) - out['useTime'] = str( - tool[tool.State == 'step2'].oTime.values[0]) + out = pd.Series(index=["p1", "p2", "useTime"], dtype=object) + if "step1" in list(tool.State): + out["p1"] = str(tool[tool.State == "step1"].Machine.values[0]) + if "step2" in list(tool.State): + out["p2"] = str(tool[tool.State == "step2"].Machine.values[0]) + out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0]) return out - df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], - 'State': ['step1', 'step2', 'step1', 'step2'], - 'oTime': ['', '2016-09-19 05:24:33', - '', '2016-09-19 23:59:04'], - 'Machine': ['23', '36L', '36R', '36R']}) + + df1 = pd.DataFrame( + { + "Key": ["B", "B", "A", "A"], + "State": ["step1", "step2", "step1", "step2"], + "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"], + "Machine": ["23", "36L", "36R", "36R"], + } + ) df2 = df1.copy() df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby('Key').apply(predictions).p1 - result = df2.groupby('Key').apply(predictions).p1 + expected = df1.groupby("Key").apply(predictions).p1 + result = df2.groupby("Key").apply(predictions).p1 tm.assert_series_equal(expected, result) @@ -502,23 +549,23 @@ def test_time_field_bug(): # that were not returned by the apply function, an exception would be # raised. 
- df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]}) + df = pd.DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]}) def func_with_no_date(batch): - return pd.Series({'c': 2}) + return pd.Series({"c": 2}) def func_with_date(batch): - return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) + return pd.Series({"b": datetime(2015, 1, 1), "c": 2}) - dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) - dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) - dfg_no_conversion_expected.index.name = 'a' + dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date) + dfg_no_conversion_expected = pd.DataFrame({"c": 2}, index=[1]) + dfg_no_conversion_expected.index.name = "a" - dfg_conversion = df.groupby(by=['a']).apply(func_with_date) + dfg_conversion = df.groupby(by=["a"]).apply(func_with_date) dfg_conversion_expected = pd.DataFrame( - {'b': datetime(2015, 1, 1), - 'c': 2}, index=[1]) - dfg_conversion_expected.index.name = 'a' + {"b": datetime(2015, 1, 1), "c": 2}, index=[1] + ) + dfg_conversion_expected.index.name = "a" tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected) @@ -527,14 +574,16 @@ def func_with_date(batch): def test_gb_apply_list_of_unequal_len_arrays(): # GH1738 - df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a', - 'b', 'b', 'b'], - 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd', - 'd', 'd', 'e'], - 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], - 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) + df = DataFrame( + { + "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"], + "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"], + "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], + "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3], + } + ) + df = df.set_index(["group1", "group2"]) + df_grouped = df.groupby(level=["group1", "group2"], sort=True) def noddy(value, weight): out = np.array(value * weight).repeat(3) @@ -552,61 +601,58 @@ def noddy(value, weight): def test_groupby_apply_all_none(): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. - test_df = DataFrame({'groups': [0, 0, 1, 1], - 'random_vars': [8, 7, 4, 5]}) + test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]}) def test_func(x): pass - result = test_df.groupby('groups').apply(test_func) + result = test_df.groupby("groups").apply(test_func) expected = DataFrame() tm.assert_frame_equal(result, expected) def test_groupby_apply_none_first(): # GH 12824. Tests if apply returns None first. 
- test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) - test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) + test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]}) + test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]}) def test_func(x): if x.shape[0] < 2: return None return x.iloc[[0, -1]] - result1 = test_df1.groupby('groups').apply(test_func) - result2 = test_df2.groupby('groups').apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], - names=['groups', None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], - names=['groups', None]) - expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, - index=index1) - expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, - index=index2) + result1 = test_df1.groupby("groups").apply(test_func) + result2 = test_df2.groupby("groups").apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None]) + expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1) + expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2) tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) def test_groupby_apply_return_empty_chunk(): # GH 22221: apply filter which returns some empty groups - df = pd.DataFrame(dict(value=[0, 1], group=['filled', 'empty'])) - groups = df.groupby('group') - result = groups.apply(lambda group: group[group.value != 1]['value']) - expected = pd.Series([0], name='value', - index=MultiIndex.from_product([['empty', 'filled'], - [0]], - names=['group', None] - ).drop('empty')) + df = pd.DataFrame(dict(value=[0, 1], group=["filled", "empty"])) + groups = df.groupby("group") + result = groups.apply(lambda group: group[group.value != 1]["value"]) + expected = pd.Series( + [0], + name="value", + index=MultiIndex.from_product( + [["empty", "filled"], [0]], names=["group", None] + ).drop("empty"), + ) tm.assert_series_equal(result, expected) def test_apply_with_mixed_types(): # gh-20949 - df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]}) - g = df.groupby('A') + df = pd.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}) + g = df.groupby("A") result = g.transform(lambda x: x / x.sum()) - expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]}) + expected = pd.DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) result = g.apply(lambda x: x / x.sum()) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 18bb9315b68c4..7c12b490f46d2 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -14,6 +14,7 @@ def test_series_grouper(): from pandas import Series + obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -31,6 +32,7 @@ def test_series_grouper(): def test_series_bin_grouper(): from pandas import Series + obj = Series(np.random.randn(10)) dummy = obj[:0] @@ -47,7 +49,6 @@ def test_series_bin_grouper(): class TestBinGroupers: - def setup_method(self, method): self.obj = np.random.randn(10, 1) self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) @@ -58,31 +59,31 @@ def test_generate_bins(self): binner = np.array([0, 3, 6, 9], dtype=np.int64) for func in [lib.generate_bins_dt64, generate_bins_generic]: - bins = func(values, binner, closed='left') - assert ((bins == np.array([2, 5, 6])).all()) + 
bins = func(values, binner, closed="left") + assert (bins == np.array([2, 5, 6])).all() - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6, 6])).all()) + bins = func(values, binner, closed="right") + assert (bins == np.array([3, 6, 6])).all() for func in [lib.generate_bins_dt64, generate_bins_generic]: values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) binner = np.array([0, 3, 6], dtype=np.int64) - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6])).all()) + bins = func(values, binner, closed="right") + assert (bins == np.array([3, 6])).all() msg = "Invalid length for values or for binner" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [], 'right') + generate_bins_generic(values, [], "right") with pytest.raises(ValueError, match=msg): - generate_bins_generic(values[:0], binner, 'right') + generate_bins_generic(values[:0], binner, "right") msg = "Values falls before first bin" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [4], 'right') + generate_bins_generic(values, [4], "right") msg = "Values falls after last bin" with pytest.raises(ValueError, match=msg): - generate_bins_generic(values, [-3, -1], 'right') + generate_bins_generic(values, [-3, -1], "right") def test_group_ohlc(): @@ -92,10 +93,9 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) + labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, 'group_ohlc_%s' % dtype) + func = getattr(groupby, "group_ohlc_%s" % dtype) func(out, counts, obj[:, None], labels) def _ohlc(group): @@ -103,20 +103,18 @@ def _ohlc(group): return np.repeat(nan, 4) return [group[0], group.max(), group.min(), group[-1]] - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), - _ohlc(obj[12:])]) + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])]) assert_almost_equal(out, expected) - tm.assert_numpy_array_equal(counts, - np.array([6, 6, 8], dtype=np.int64)) + tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64)) obj[:6] = nan func(out, counts, obj[:, None], labels) expected[0] = nan assert_almost_equal(out, expected) - _check('float32') - _check('float64') + _check("float32") + _check("float64") class TestMoments: @@ -124,7 +122,6 @@ class TestMoments: class TestReducer: - def test_int_index(self): from pandas.core.series import Series @@ -133,23 +130,23 @@ def test_int_index(self): expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.reduce(arr, np.sum, axis=1, - labels=Index(np.arange(100))) + result = reduction.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) expected = arr.sum(1) assert_almost_equal(result, expected) - dummy = Series(0., index=np.arange(100)) - result = reduction.reduce(arr, np.sum, dummy=dummy, - labels=Index(np.arange(4))) + dummy = Series(0.0, index=np.arange(100)) + result = reduction.reduce(arr, np.sum, dummy=dummy, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - dummy = Series(0., index=np.arange(4)) - result = reduction.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) + dummy = Series(0.0, index=np.arange(4)) + result = reduction.reduce( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = 
reduction.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) + result = reduction.reduce( + arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) + ) assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 58a43dc218d33..486b3b28b29a3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -8,10 +8,16 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut) + Categorical, + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + qcut, +) import pandas.util.testing as tm -from pandas.util.testing import ( - assert_equal, assert_frame_equal, assert_series_equal) +from pandas.util.testing import assert_equal, assert_frame_equal, assert_series_equal def cartesian_product_for_groupers(result, args, names): @@ -21,9 +27,9 @@ def cartesian_product_for_groupers(result, args, names): def f(a): if isinstance(a, (CategoricalIndex, Categorical)): categories = a.categories - a = Categorical.from_codes(np.arange(len(categories)), - categories=categories, - ordered=a.ordered) + a = Categorical.from_codes( + np.arange(len(categories)), categories=categories, ordered=a.ordered + ) return a index = MultiIndex.from_product(map(f, args), names=names) @@ -34,60 +40,64 @@ def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} + return { + "min": group.min(), + "max": group.max(), + "count": group.count(), + "mean": group.mean(), + } result = df.groupby(cats, observed=False).D.apply(get_stats) - assert result.index.names[0] == 'C' + assert result.index.names[0] == "C" def test_basic(): - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) - expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) + exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True) + expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) - exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) + exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) + expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name) + x = DataFrame( + [[1, "John P. 
Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) - g = x.groupby(['person_id'], observed=False) + g = x.groupby(["person_id"], observed=False) result = g.transform(lambda x: x) - tm.assert_frame_equal(result, x[['person_name']]) + tm.assert_frame_equal(result, x[["person_name"]]) - result = x.drop_duplicates('person_name') + result = x.drop_duplicates("person_name") expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): - return x.drop_duplicates('person_name').iloc[0] + return x.drop_duplicates("person_name").iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name='person_id') - expected['person_name'] = expected['person_name'].astype('object') + expected.index = Index([1, 2], name="person_id") + expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) # GH 9921 @@ -96,56 +106,47 @@ def f(x): c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) - tm.assert_series_equal(result, df['a']) + tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( - df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df['a']) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( - df.groupby(c, observed=False).transform(sum), - df[['a']]) - tm.assert_frame_equal( - df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), - df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]] + ) # Filter - tm.assert_series_equal( - df.a.groupby(c, observed=False).filter(np.all), - df['a']) - tm.assert_frame_equal( - df.groupby(c, observed=False).filter(np.all), - df) + tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"]) + tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) - tm.assert_series_equal(result, df['a']) + tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( - df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df['a']) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"] + ) + tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( - df.groupby(c, observed=False).transform(sum), - df[['a']]) - tm.assert_frame_equal( - df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), - df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]] + ) # GH 9603 - df = DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) + df = DataFrame({"a": [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) result = df.groupby(c, observed=False).apply(len) - exp_index = CategoricalIndex( - c.values.categories, ordered=c.values.ordered) + exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) - expected.index.name = 'a' + expected.index.name = "a" tm.assert_series_equal(result, expected) # more basic - levels = ['foo', 'bar', 'baz', 'qux'] + levels = ["foo", "bar", "baz", "qux"] codes = np.random.randint(0, 4, size=100) cats = 
Categorical.from_codes(codes, levels, ordered=True) @@ -155,8 +156,7 @@ def f(x): result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) + exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -168,73 +168,69 @@ def f(x): ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby( - exp_cats, sort=False, observed=False).describe() + exp_cats = Categorical( + ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"] + ) + expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_level_get_group(observed): # GH15155 - df = DataFrame(data=np.arange(2, 22, 2), - index=MultiIndex( - levels=[CategoricalIndex(["a", "b"]), range(10)], - codes=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"])) + df = DataFrame( + data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(10)], + codes=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"], + ), + ) g = df.groupby(level=["Index1"], observed=observed) # expected should equal test.loc[["a"]] # GH15166 - expected = DataFrame(data=np.arange(2, 12, 2), - index=MultiIndex(levels=[CategoricalIndex( - ["a", "b"]), range(5)], - codes=[[0] * 5, range(5)], - names=["Index1", "Index2"])) - result = g.get_group('a') + expected = DataFrame( + data=np.arange(2, 12, 2), + index=MultiIndex( + levels=[CategoricalIndex(["a", "b"]), range(5)], + codes=[[0] * 5, range(5)], + names=["Index1", "Index2"], + ), + ) + result = g.get_group("a") assert_frame_equal(result, expected) @pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636", strict=False) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 - dense = Categorical(list('abc'), ordered=ordered) + dense = Categorical(list("abc"), ordered=ordered) # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) + missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered) values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense'], observed=True) + df = DataFrame({"missing": missing, "dense": dense, "values": values}) + grouped = df.groupby(["missing", "dense"], observed=True) # missing category 'b' should still exist in the output index - idx = MultiIndex.from_arrays( - [missing, dense], names=['missing', 
'dense']) - expected = DataFrame([0, 1, 2.], - index=idx, - columns=['values']) + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) + expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"]) result = grouped.apply(lambda x: np.mean(x)) assert_frame_equal(result, expected) # we coerce back to ints - expected = expected.astype('int') + expected = expected.astype("int") result = grouped.mean() assert_frame_equal(result, expected) @@ -242,8 +238,7 @@ def test_apply(ordered): assert_frame_equal(result, expected) # but for transform we should still get back the original index - idx = MultiIndex.from_arrays([missing, dense], - names=['missing', 'dense']) + idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"]) expected = Series(1, index=idx) result = grouped.apply(lambda x: 1) assert_series_equal(result, expected) @@ -257,88 +252,81 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - df['C'] = ['foo', 'bar'] * 2 + df["C"] = ["foo", "bar"] * 2 # multiple groupers with a non-cat - gb = df.groupby(['A', 'B', 'C'], observed=observed) + gb = df.groupby(["A", "B", "C"], observed=observed) exp_index = MultiIndex.from_arrays( - [cat1, cat2, ['foo', 'bar'] * 2], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - [1, 2, 3, 4], index=exp_index)}).sort_index() + [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"] + ) + expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index() result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, - [cat1, cat2, ['foo', 'bar']], - list('ABC')) + expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + ) tm.assert_frame_equal(result, expected) - gb = df.groupby(['A', 'B'], observed=observed) - exp_index = MultiIndex.from_arrays( - [cat1, cat2], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, 3, 4]}, - index=exp_index) + gb = df.groupby(["A", "B"], observed=observed) + exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers( - expected, - [cat1, cat2], - list('AB')) + expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) tm.assert_frame_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} + d = { + "cat": Categorical( + ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 1, 2, 2], + "val": [10, 20, 30, 40], + } df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() - exp_index = CategoricalIndex(list('ab'), name="cat", - categories=list('abc'), - ordered=True) - expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, - index=exp_index) + exp_index = CategoricalIndex( + list("ab"), name="cat", categories=list("abc"), ordered=True + ) + expected = 
DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index) if not observed: - index = CategoricalIndex(list('abc'), name="cat", - categories=list('abc'), - ordered=True) + index = CategoricalIndex( + list("abc"), name="cat", categories=list("abc"), ordered=True + ) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) # Grouping on two columns groups_double_key = df.groupby(["cat", "ints"], observed=observed) - result = groups_double_key.agg('mean') + result = groups_double_key.agg("mean") expected = DataFrame( - {"val": [10, 30, 20, 40], - "cat": Categorical(['a', 'a', 'b', 'b'], - categories=['a', 'b', 'c'], - ordered=True), - "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) + { + "val": [10, 30, 20, 40], + "cat": Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True + ), + "ints": [1, 2, 1, 2], + } + ).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( - expected, - [df.cat.values, [1, 2]], - ['cat', 'ints']) + expected, [df.cat.values, [1, 2]], ["cat", "ints"] + ) tm.assert_frame_equal(result, expected) # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]: c, i = key result = groups_double_key.get_group(key) expected = df[(df.cat == c) & (df.ints == i)] @@ -346,37 +334,37 @@ def test_observed(observed): # gh-8869 # with as_index - d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], - 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} + d = { + "foo": [10, 8, 4, 8, 4, 1, 1], + "bar": [10, 20, 30, 40, 50, 60, 70], + "baz": ["d", "c", "e", "a", "a", "d", "c"], + } df = DataFrame(d) - cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) - df['range'] = cat - groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) - result = groups.agg('mean') + cat = pd.cut(df["foo"], np.linspace(0, 10, 3)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=False, observed=observed) + result = groups.agg("mean") - groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed) - expected = groups2.agg('mean').reset_index() + groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed) + expected = groups2.agg("mean").reset_index() tm.assert_frame_equal(result, expected) def test_observed_codes_remap(observed): - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]} df = DataFrame(d) - values = pd.cut(df['C1'], [1, 2, 3, 6]) + values = pd.cut(df["C1"], [1, 2, 3, 6]) values.name = "cat" - groups_double_key = df.groupby([values, 'C2'], observed=observed) + groups_double_key = df.groupby([values, "C2"], observed=observed) - idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], - names=["cat", "C2"]) - expected = DataFrame({"C1": [3, 3, 4, 5], - "C3": [10, 100, 200, 34]}, index=idx) + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) + expected = DataFrame({"C1": [3, 3, 4, 5], "C3": [10, 100, 200, 34]}, index=idx) if not observed: expected = cartesian_product_for_groupers( - expected, - [values.values, [1, 2, 3, 4]], - ['cat', 'C2']) + expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"] + ) - result = groups_double_key.agg('mean') + result = groups_double_key.agg("mean") tm.assert_frame_equal(result, expected) @@ -384,14 +372,17 @@ def test_observed_perf(): # we create a cartesian product, so this is # non-performant if we don't use observed values # gh-14942 - df = DataFrame({ - 
'cat': np.random.randint(0, 255, size=30000), - 'int_id': np.random.randint(0, 255, size=30000), - 'other_id': np.random.randint(0, 10000, size=30000), - 'foo': 0}) - df['cat'] = df.cat.astype(str).astype('category') - - grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True) + df = DataFrame( + { + "cat": np.random.randint(0, 255, size=30000), + "int_id": np.random.randint(0, 255, size=30000), + "other_id": np.random.randint(0, 10000, size=30000), + "foo": 0, + } + ) + df["cat"] = df.cat.astype(str).astype("category") + + grouped = df.groupby(["cat", "int_id", "other_id"], observed=True) result = grouped.count() assert result.index.levels[0].nunique() == df.cat.nunique() assert result.index.levels[1].nunique() == df.int_id.nunique() @@ -402,52 +393,61 @@ def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups - cat = Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) - df = DataFrame({'cat': cat, 'vals': [1, 2, 3]}) - g = df.groupby('cat', observed=observed) + cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"]) + df = DataFrame({"cat": cat, "vals": [1, 2, 3]}) + g = df.groupby("cat", observed=observed) result = g.groups if observed: - expected = {'a': Index([0, 2], dtype='int64'), - 'c': Index([1], dtype='int64')} + expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")} else: - expected = {'a': Index([0, 2], dtype='int64'), - 'b': Index([], dtype='int64'), - 'c': Index([1], dtype='int64')} + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "c": Index([1], dtype="int64"), + } tm.assert_dict_equal(result, expected) def test_observed_groups_with_nan(observed): # GH 24740 - df = DataFrame({'cat': Categorical(['a', np.nan, 'a'], - categories=['a', 'b', 'd']), - 'vals': [1, 2, 3]}) - g = df.groupby('cat', observed=observed) + df = DataFrame( + { + "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]), + "vals": [1, 2, 3], + } + ) + g = df.groupby("cat", observed=observed) result = g.groups if observed: - expected = {'a': Index([0, 2], dtype='int64')} + expected = {"a": Index([0, 2], dtype="int64")} else: - expected = {'a': Index([0, 2], dtype='int64'), - 'b': Index([], dtype='int64'), - 'd': Index([], dtype='int64')} + expected = { + "a": Index([0, 2], dtype="int64"), + "b": Index([], dtype="int64"), + "d": Index([], dtype="int64"), + } tm.assert_dict_equal(result, expected) def test_dataframe_categorical_with_nan(observed): # GH 21151 - s1 = Categorical([np.nan, 'a', np.nan, 'a'], - categories=['a', 'b', 'c']) + s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) s2 = Series([1, 2, 3, 4]) - df = DataFrame({'s1': s1, 's2': s2}) - result = df.groupby('s1', observed=observed).first().reset_index() + df = DataFrame({"s1": s1, "s2": s2}) + result = df.groupby("s1", observed=observed).first().reset_index() if observed: - expected = DataFrame({'s1': Categorical(['a'], - categories=['a', 'b', 'c']), 's2': [2]}) + expected = DataFrame( + {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]} + ) else: - expected = DataFrame({'s1': Categorical(['a', 'b', 'c'], - categories=['a', 'b', 'c']), - 's2': [2, np.nan, np.nan]}) + expected = DataFrame( + { + "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]), + "s2": [2, np.nan, np.nan], + } + ) tm.assert_frame_equal(result, expected) @@ -460,32 +460,35 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # Build a dataframe with cat having one unobserved 
category ('missing'), # and a Series with identical values - label = Categorical(['d', 'a', 'b', 'a', 'd', 'b'], - categories=['a', 'b', 'missing', 'd'], - ordered=ordered) - val = Series(['d', 'a', 'b', 'a', 'd', 'b']) - df = DataFrame({'label': label, 'val': val}) + label = Categorical( + ["d", "a", "b", "a", "d", "b"], + categories=["a", "b", "missing", "d"], + ordered=ordered, + ) + val = Series(["d", "a", "b", "a", "d", "b"]) + df = DataFrame({"label": label, "val": val}) # aggregate on the Categorical - result = (df.groupby('label', observed=observed, sort=sort)['val'] - .aggregate('first')) + result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first") # If ordering works, we expect index labels equal to aggregation results, # except for 'observed=False': label 'missing' has aggregation None - label = Series(result.index.array, dtype='object') + label = Series(result.index.array, dtype="object") aggr = Series(result.array) if not observed: - aggr[aggr.isna()] = 'missing' + aggr[aggr.isna()] = "missing" if not all(label == aggr): - msg = ('Labels and aggregation results not consistently sorted\n' + - 'for (ordered={}, observed={}, sort={})\n' + - 'Result:\n{}').format(ordered, observed, sort, result) + msg = ( + "Labels and aggregation results not consistently sorted\n" + + "for (ordered={}, observed={}, sort={})\n" + + "Result:\n{}" + ).format(ordered, observed, sort, result) assert False, msg def test_datetime(): # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) + levels = pd.date_range("2014-01-01", periods=4) codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) @@ -495,9 +498,9 @@ def test_datetime(): expected = data.groupby(np.asarray(cats), observed=False).mean() expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) + expected.index = CategoricalIndex( + expected.index, categories=expected.index, ordered=True + ) assert_frame_equal(result, expected) @@ -511,54 +514,50 @@ def test_datetime(): assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( - desc_result.index.get_level_values(0), - expected.index.get_level_values(0)) + desc_result.index.get_level_values(0), expected.index.get_level_values(0) + ) # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) + exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) + tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) def test_categorical_index(): s = np.random.RandomState(12345) - levels = ['foo', 'bar', 'baz', 'qux'] + levels = ["foo", "bar", "baz", "qux"] codes = s.randint(0, 4, size=20) cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats + df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd")) + df["cats"] = cats # with a cat index - 
result = df.set_index('cats').groupby(level=0, observed=False).sum() - expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() + result = df.set_index("cats").groupby(level=0, observed=False).sum() + expected = df[list("abcd")].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats" + ) assert_frame_equal(result, expected) # with a cat column, should produce a cat index - result = df.groupby('cats', observed=False).sum() - expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() + result = df.groupby("cats", observed=False).sum() + expected = df[list("abcd")].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats" + ) assert_frame_equal(result, expected) def test_describe_categorical_columns(): # GH 11558 - cats = CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) + cats = CategoricalIndex( + ["qux", "foo", "baz", "bar"], + categories=["foo", "bar", "baz", "qux"], + ordered=True, + ) df = DataFrame(np.random.randn(20, 4), columns=cats) result = df.groupby([1, 2, 3, 4] * 5).describe() @@ -568,22 +567,20 @@ def test_describe_categorical_columns(): def test_unstack_categorical(): # GH11558 (example is taken from the original issue) - df = DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') + df = DataFrame( + {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2} + ) + df["medium"] = df["medium"].astype("category") - gcat = df.groupby( - ['artist', 'medium'], observed=False)['a'].count().unstack() + gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack() result = gcat.describe() - exp_columns = CategoricalIndex(['A', 'B'], ordered=False, - name='medium') + exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium") tm.assert_index_equal(result.columns, exp_columns) tm.assert_categorical_equal(result.columns.values, exp_columns.values) - result = gcat['A'] + gcat['B'] - expected = Series([6, 4], index=Index(['X', 'Y'], name='artist')) + result = gcat["A"] + gcat["B"] + expected = Series([6, 4], index=Index(["X", "Y"], name="artist")) tm.assert_series_equal(result, expected) @@ -599,41 +596,54 @@ def test_bins_unequal_len(): def test_as_index(): # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum() + df = DataFrame( + { + "cat": Categorical([1, 2, 2], [1, 2, 3]), + "A": [10, 11, 11], + "B": [101, 102, 103], + } + ) + result = df.groupby(["cat", "A"], as_index=False, observed=True).sum() expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 11], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) tm.assert_frame_equal(result, expected) # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False, observed=True).sum() + f = lambda r: df.loc[r, "A"] + result = df.groupby(["cat", f], 
as_index=False, observed=True).sum() expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 22], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 22], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) tm.assert_frame_equal(result, expected) # another not in-axis grouper (conflicting names in index) - s = Series(['a', 'b', 'b'], name='cat') - result = df.groupby(['cat', s], as_index=False, observed=True).sum() + s = Series(["a", "b", "b"], name="cat") + result = df.groupby(["cat", s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) # is original index dropped? - group_columns = ['cat', 'A'] + group_columns = ["cat", "A"] expected = DataFrame( - {'cat': Categorical([1, 2], categories=df.cat.cat.categories), - 'A': [10, 11], - 'B': [101, 205]}, - columns=['cat', 'A', 'B']) - - for name in [None, 'X', 'B']: + { + "cat": Categorical([1, 2], categories=df.cat.cat.categories), + "A": [10, 11], + "B": [101, 205], + }, + columns=["cat", "A", "B"], + ) + + for name in [None, "X", "B"]: df.index = Index(list("abc"), name=name) result = df.groupby(group_columns, as_index=False, observed=True).sum() @@ -642,79 +652,76 @@ def test_as_index(): def test_preserve_categories(): # GH-13179 - categories = list('abc') + categories = list("abc") # ordered=True - df = DataFrame({'A': Categorical(list('ba'), - categories=categories, - ordered=True)}) + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)}) index = CategoricalIndex(categories, categories, ordered=True) tm.assert_index_equal( - df.groupby('A', sort=True, observed=False).first().index, index) + df.groupby("A", sort=True, observed=False).first().index, index + ) tm.assert_index_equal( - df.groupby('A', sort=False, observed=False).first().index, index) + df.groupby("A", sort=False, observed=False).first().index, index + ) # ordered=False - df = DataFrame({'A': Categorical(list('ba'), - categories=categories, - ordered=False)}) + df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)}) sort_index = CategoricalIndex(categories, categories, ordered=False) - nosort_index = CategoricalIndex(list('bac'), list('bac'), - ordered=False) + nosort_index = CategoricalIndex(list("bac"), list("bac"), ordered=False) tm.assert_index_equal( - df.groupby('A', sort=True, observed=False).first().index, - sort_index) + df.groupby("A", sort=True, observed=False).first().index, sort_index + ) tm.assert_index_equal( - df.groupby('A', sort=False, observed=False).first().index, - nosort_index) + df.groupby("A", sort=False, observed=False).first().index, nosort_index + ) def test_preserve_categorical_dtype(): # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) + df = DataFrame( + { + "A": [1, 2, 1, 1, 2], + "B": [10, 16, 22, 28, 34], + "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False), + "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True), + } + ) # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: + exp_full = 
DataFrame( + { + "A": [2.0, 1.0, np.nan], + "B": [25.0, 20.0, np.nan], + "C1": Categorical(list("bac"), categories=list("bac"), ordered=False), + "C2": Categorical(list("bac"), categories=list("bac"), ordered=True), + } + ) + for col in ["C1", "C2"]: result1 = df.groupby(by=col, as_index=False, observed=False).mean() - result2 = df.groupby( - by=col, as_index=True, observed=False).mean().reset_index() + result2 = df.groupby(by=col, as_index=True, observed=False).mean().reset_index() expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @pytest.mark.parametrize( - 'func, values', - [('first', ['second', 'first']), - ('last', ['fourth', 'third']), - ('min', ['fourth', 'first']), - ('max', ['second', 'third'])]) + "func, values", + [ + ("first", ["second", "first"]), + ("last", ["fourth", "third"]), + ("min", ["fourth", "first"]), + ("max", ["second", "third"]), + ], +) def test_preserve_on_ordered_ops(func, values): # gh-18502 # preserve the categoricals on ops - c = pd.Categorical(['first', 'second', 'third', 'fourth'], ordered=True) - df = pd.DataFrame( - {'payload': [-1, -2, -1, -2], - 'col': c}) - g = df.groupby('payload') + c = pd.Categorical(["first", "second", "third", "fourth"], ordered=True) + df = pd.DataFrame({"payload": [-1, -2, -1, -2], "col": c}) + g = df.groupby("payload") result = getattr(g, func)() expected = pd.DataFrame( - {'payload': [-2, -1], - 'col': pd.Series(values, dtype=c.dtype)}).set_index('payload') + {"payload": [-2, -1], "col": pd.Series(values, dtype=c.dtype)} + ).set_index("payload") tm.assert_frame_equal(result, expected) @@ -727,8 +734,9 @@ def test_categorical_no_compress(): result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean() - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) @@ -736,12 +744,16 @@ def test_categorical_no_compress(): result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) + exp.index = CategoricalIndex( + exp.index, categories=cats.categories, ordered=cats.ordered + ) assert_series_equal(result, exp) - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical( + ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) result = data.groupby("b", observed=False).mean() @@ -757,15 +769,16 @@ def test_sort(): # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + df = DataFrame({"value": np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) - res = 
df.groupby(['value_group'], observed=False)['value_group'].count() + res = df.groupby(["value_group"], observed=False)["value_group"].count() exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] exp.index = CategoricalIndex(exp.index, name=exp.index.name) tm.assert_series_equal(res, exp) @@ -773,20 +786,27 @@ def test_sort(): def test_sort2(): # dataframe groupby sort was being ignored # GH 8868 - df = DataFrame([['(7.5, 10]', 10, 10], - ['(7.5, 10]', 8, 20], - ['(2.5, 5]', 5, 30], - ['(5, 7.5]', 6, 40], - ['(2.5, 5]', 4, 50], - ['(0, 2.5]', 1, 60], - ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range', ordered=True) - expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - col = 'range' + df = DataFrame( + [ + ["(7.5, 10]", 10, 10], + ["(7.5, 10]", 8, 20], + ["(2.5, 5]", 5, 30], + ["(5, 7.5]", 6, 40], + ["(2.5, 5]", 4, 50], + ["(0, 2.5]", 1, 60], + ["(5, 7.5]", 7, 70], + ], + columns=["range", "foo", "bar"], + ) + df["range"] = Categorical(df["range"], ordered=True) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range", ordered=True + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + col = "range" result_sort = df.groupby(col, sort=True, observed=False).first() assert_frame_equal(result_sort, expected_sort) @@ -795,21 +815,24 @@ def test_sort2(): result_sort = df.groupby(col, sort=False, observed=False).first() assert_frame_equal(result_sort, expected_sort) - df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range') - expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', - '(0, 2.5]'], - categories=['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], - name='range') - expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - index=index, columns=['foo', 'bar']) - - col = 'range' + df["range"] = Categorical(df["range"], ordered=False) + index = CategoricalIndex( + ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"], name="range" + ) + expected_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"], index=index + ) + + index = CategoricalIndex( + ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + categories=["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"], + name="range", + ) + expected_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=["foo", "bar"] + ) + + col = "range" # this is an unordered categorical, but we allow this #### result_sort = df.groupby(col, sort=True, observed=False).first() @@ -824,165 +847,207 @@ def test_sort_datetimelike(): # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month - df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), - datetime(2011, 2, 1), datetime(2011, 5, 1), - datetime(2011, 2, 1), datetime(2011, 1, 1), - datetime(2011, 5, 1)], - 'foo': [10, 8, 5, 6, 4, 1, 7], - 'bar': [10, 20, 30, 40, 50, 60, 70]}, - columns=['dt', 'foo', 'bar']) + df = DataFrame( + { + "dt": [ + datetime(2011, 7, 1), + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 2, 1), + datetime(2011, 1, 1), 
+ datetime(2011, 5, 1), + ], + "foo": [10, 8, 5, 6, 4, 1, 7], + "bar": [10, 20, 30, 40, 50, 60, 70], + }, + columns=["dt", "foo", "bar"], + ) # ordered=True - df['dt'] = Categorical(df['dt'], ordered=True) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] + df["dt"] = Categorical(df["dt"], ordered=True) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt', ordered=True) - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt', ordered=True) - - col = 'dt' - assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt", ordered=True) + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex( + index, categories=index, name="dt", ordered=True + ) + + col = "dt" + assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) # when categories is ordered, group is ordered by category's order - assert_frame_equal( - result_sort, df.groupby(col, sort=False, observed=False).first()) + assert_frame_equal(result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False - df['dt'] = Categorical(df['dt'], ordered=False) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] + df["dt"] = Categorical(df["dt"], ordered=False) + index = [ + datetime(2011, 1, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 7, 1), + ] result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt') - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt') - - col = 'dt' + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"] + ) + result_sort.index = CategoricalIndex(index, name="dt") + + index = [ + datetime(2011, 7, 1), + datetime(2011, 2, 1), + datetime(2011, 5, 1), + datetime(2011, 1, 1), + ] + result_nosort = DataFrame( + [[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"] + ) + result_nosort.index = CategoricalIndex(index, categories=index, name="dt") + + col = "dt" + assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) assert_frame_equal( - result_sort, df.groupby(col, sort=True, observed=False).first()) - assert_frame_equal( - result_nosort, df.groupby(col, sort=False, observed=False).first()) + result_nosort, df.groupby(col, sort=False, observed=False).first() + ) def test_empty_sum(): # https://github.com/pandas-dev/pandas/issues/18678 - df = DataFrame({"A": Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = 
CategoricalIndex(['a', 'b', 'c'], name='A') + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") # 0 by default result = df.groupby("A", observed=False).B.sum() - expected = Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.sum(min_count=0) - expected = Series([3, 1, 0], expected_idx, name='B') + expected = Series([3, 1, 0], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.sum(min_count=1) - expected = Series([3, 1, np.nan], expected_idx, name='B') + expected = Series([3, 1, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count>1 result = df.groupby("A", observed=False).B.sum(min_count=2) - expected = Series([3, np.nan, np.nan], expected_idx, name='B') + expected = Series([3, np.nan, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) def test_empty_prod(): # https://github.com/pandas-dev/pandas/issues/18678 - df = DataFrame({"A": Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) + df = DataFrame( + {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]} + ) - expected_idx = CategoricalIndex(['a', 'b', 'c'], name='A') + expected_idx = CategoricalIndex(["a", "b", "c"], name="A") # 1 by default result = df.groupby("A", observed=False).B.prod() - expected = Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=0 result = df.groupby("A", observed=False).B.prod(min_count=0) - expected = Series([2, 1, 1], expected_idx, name='B') + expected = Series([2, 1, 1], expected_idx, name="B") tm.assert_series_equal(result, expected) # min_count=1 result = df.groupby("A", observed=False).B.prod(min_count=1) - expected = Series([2, 1, np.nan], expected_idx, name='B') + expected = Series([2, 1, np.nan], expected_idx, name="B") tm.assert_series_equal(result, expected) def test_groupby_multiindex_categorical_datetime(): # https://github.com/pandas-dev/pandas/issues/21390 - df = DataFrame({ - 'key1': Categorical(list('abcbabcba')), - 'key2': Categorical( - list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), - 'values': np.arange(9), - }) - result = df.groupby(['key1', 'key2']).mean() + df = DataFrame( + { + "key1": Categorical(list("abcbabcba")), + "key2": Categorical( + list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3 + ), + "values": np.arange(9), + } + ) + result = df.groupby(["key1", "key2"]).mean() idx = MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], - names=['key1', 'key2']) - expected = DataFrame( - {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + [ + Categorical(["a", "b", "c"]), + Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)), + ], + names=["key1", "key2"], + ) + expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) -@pytest.mark.parametrize("as_index, expected", [ - (True, Series( - index=MultiIndex.from_arrays( - [Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b'] +@pytest.mark.parametrize( + "as_index, expected", + [ + ( + True, + Series( + 
index=MultiIndex.from_arrays( + [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"] + ), + data=[1, 2, 3], + name="x", + ), ), - data=[1, 2, 3], name='x' - )), - (False, DataFrame({ - 'a': Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], - 'x': [1, 2, 3] - })) -]) + ( + False, + DataFrame( + { + "a": Series([1, 1, 2], dtype="category"), + "b": [1, 2, 2], + "x": [1, 2, 3], + } + ), + ), + ], +) def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - df = DataFrame({ - 'a': Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], - 'x': [1, 2, 3] - }) + df = DataFrame( + {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]} + ) - result = df.groupby( - ['a', 'b'], as_index=as_index, observed=True)['x'].sum() + result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum() assert_equal(result, expected) -@pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT]) +@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT]) def test_shift(fill_value): - ct = Categorical(['a', 'b', 'c', 'd'], - categories=['a', 'b', 'c', 'd'], ordered=False) - expected = Categorical([None, 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], ordered=False) + ct = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + expected = Categorical( + [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) res = ct.shift(1, fill_value=fill_value) assert_equal(res, expected) @@ -1005,66 +1070,98 @@ def df_cat(df): df_cat: DataFrame """ df_cat = df.copy()[:4] # leave out some groups - df_cat['A'] = df_cat['A'].astype('category') - df_cat['B'] = df_cat['B'].astype('category') - df_cat['C'] = Series([1, 2, 3, 4]) - df_cat = df_cat.drop(['D'], axis=1) + df_cat["A"] = df_cat["A"].astype("category") + df_cat["B"] = df_cat["B"].astype("category") + df_cat["C"] = Series([1, 2, 3, 4]) + df_cat = df_cat.drop(["D"], axis=1) return df_cat -@pytest.mark.parametrize('operation, kwargs', [ - ('agg', dict(dtype='category')), - ('apply', dict())]) +@pytest.mark.parametrize( + "operation, kwargs", [("agg", dict(dtype="category")), ("apply", dict())] +) def test_seriesgroupby_observed_true(df_cat, operation, kwargs): # GH 24880 index = MultiIndex.from_frame( - DataFrame({'A': ['foo', 'foo', 'bar', 'bar'], - 'B': ['one', 'two', 'one', 'three'] - }, **kwargs)) - expected = Series(data=[1, 3, 2, 4], index=index, name='C') - grouped = df_cat.groupby(['A', 'B'], observed=True)['C'] + DataFrame( + {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]}, + **kwargs + ) + ) + expected = Series(data=[1, 3, 2, 4], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=True)["C"] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize('operation', ['agg', 'apply']) -@pytest.mark.parametrize('observed', [False, None]) +@pytest.mark.parametrize("operation", ["agg", "apply"]) +@pytest.mark.parametrize("observed", [False, None]) def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): # GH 24880 index, _ = MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False)], - names=['A', 'B']).sortlevel() - - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], - index=index, name='C') - grouped = df_cat.groupby(['A', 'B'], observed=observed)['C'] + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], 
ordered=False), + ], + names=["A", "B"], + ).sortlevel() + + expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) assert_series_equal(result, expected) -@pytest.mark.parametrize("observed, index, data", [ - (True, MultiIndex.from_tuples( - [('foo', 'one', 'min'), ('foo', 'one', 'max'), - ('foo', 'two', 'min'), ('foo', 'two', 'max'), - ('bar', 'one', 'min'), ('bar', 'one', 'max'), - ('bar', 'three', 'min'), ('bar', 'three', 'max')], - names=['A', 'B', None]), [1, 1, 3, 3, 2, 2, 4, 4]), - (False, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3]), - (None, MultiIndex.from_product( - [CategoricalIndex(['bar', 'foo'], ordered=False), - CategoricalIndex(['one', 'three', 'two'], ordered=False), - Index(['min', 'max'])], - names=['A', 'B', None]), - [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3])]) +@pytest.mark.parametrize( + "observed, index, data", + [ + ( + True, + MultiIndex.from_tuples( + [ + ("foo", "one", "min"), + ("foo", "one", "max"), + ("foo", "two", "min"), + ("foo", "two", "max"), + ("bar", "one", "min"), + ("bar", "one", "max"), + ("bar", "three", "min"), + ("bar", "three", "max"), + ], + names=["A", "B", None], + ), + [1, 1, 3, 3, 2, 2, 4, 4], + ), + ( + False, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ( + None, + MultiIndex.from_product( + [ + CategoricalIndex(["bar", "foo"], ordered=False), + CategoricalIndex(["one", "three", "two"], ordered=False), + Index(["min", "max"]), + ], + names=["A", "B", None], + ), + [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], + ), + ], +) def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): # GH 24880 - expected = Series(data=data, index=index, name='C') - result = df_cat.groupby(['A', 'B'], observed=observed)['C'].apply( - lambda x: OrderedDict([('min', x.min()), ('max', x.max())])) + expected = Series(data=data, index=index, name="C") + result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply( + lambda x: OrderedDict([("min", x.min()), ("max", x.max())]) + ) assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 2dd26bac1e102..5a864b3ab8cb4 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -8,10 +8,9 @@ class TestCounting: - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"]) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3]) @@ -24,15 +23,16 @@ def test_cumcount_empty(self): se = Series().groupby(level=0) # edge case, as this is usually considered float - e = Series(dtype='int64') + e = Series(dtype="int64") assert_series_equal(e, ge.cumcount()) assert_series_equal(e, se.cumcount()) def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], 
["a"]], columns=["A"], index=[0] * 5 + ) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3], index=[0] * 5) @@ -42,9 +42,8 @@ def test_cumcount_dupe_index(self): def test_cumcount_mi(self): mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') + df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi) + g = df.groupby("A") sg = g.A expected = Series([0, 1, 2, 0, 3], index=mi) @@ -53,8 +52,9 @@ def test_cumcount_mi(self): assert_series_equal(expected, sg.cumcount()) def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) + df = DataFrame( + [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5 + ) g = df.groupby([0, 0, 0, 1, 0]) sg = g.A @@ -64,8 +64,8 @@ def test_cumcount_groupby_not_col(self): assert_series_equal(expected, sg.cumcount()) def test_ngroup(self): - df = DataFrame({'A': list('aaaba')}) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0]) @@ -74,18 +74,18 @@ def test_ngroup(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_distinct(self): - df = DataFrame({'A': list('abcde')}) - g = df.groupby('A') + df = DataFrame({"A": list("abcde")}) + g = df.groupby("A") sg = g.A - expected = Series(range(5), dtype='int64') + expected = Series(range(5), dtype="int64") assert_series_equal(expected, g.ngroup()) assert_series_equal(expected, sg.ngroup()) def test_ngroup_one_group(self): - df = DataFrame({'A': [0] * 5}) - g = df.groupby('A') + df = DataFrame({"A": [0] * 5}) + g = df.groupby("A") sg = g.A expected = Series([0] * 5) @@ -98,21 +98,20 @@ def test_ngroup_empty(self): se = Series().groupby(level=0) # edge case, as this is usually considered float - e = Series(dtype='int64') + e = Series(dtype="int64") assert_series_equal(e, ge.ngroup()) assert_series_equal(e, se.ngroup()) def test_ngroup_series_matches_frame(self): - df = DataFrame({'A': list('aaaba')}) - s = Series(list('aaaba')) + df = DataFrame({"A": list("aaaba")}) + s = Series(list("aaaba")) - assert_series_equal(df.groupby(s).ngroup(), - s.groupby(s).ngroup()) + assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup()) def test_ngroup_dupe_index(self): - df = DataFrame({'A': list('aaaba')}, index=[0] * 5) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0], index=[0] * 5) @@ -122,8 +121,8 @@ def test_ngroup_dupe_index(self): def test_ngroup_mi(self): mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame({'A': list('aaaba')}, index=mi) - g = df.groupby('A') + df = DataFrame({"A": list("aaaba")}, index=mi) + g = df.groupby("A") sg = g.A expected = Series([0, 0, 0, 1, 0], index=mi) @@ -131,7 +130,7 @@ def test_ngroup_mi(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_groupby_not_col(self): - df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + df = DataFrame({"A": list("aaaba")}, index=[0] * 5) g = df.groupby([0, 0, 0, 1, 0]) sg = g.A @@ -141,8 +140,8 @@ def test_ngroup_groupby_not_col(self): assert_series_equal(expected, sg.ngroup()) def test_ngroup_descending(self): - df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) - g = df.groupby(['A']) + df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"]) + g = df.groupby(["A"]) ascending = Series([0, 0, 1, 0, 
1]) descending = Series([1, 1, 0, 1, 0]) @@ -153,9 +152,11 @@ def test_ngroup_descending(self): def test_ngroup_matches_cumcount(self): # verify one manually-worked out case works - df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], - ['a', 'x'], ['b', 'y']], columns=['A', 'X']) - g = df.groupby(['A', 'X']) + df = DataFrame( + [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]], + columns=["A", "X"], + ) + g = df.groupby(["A", "X"]) g_ngroup = g.ngroup() g_cumcount = g.cumcount() expected_ngroup = Series([0, 1, 2, 0, 3]) @@ -167,8 +168,8 @@ def test_ngroup_matches_cumcount(self): def test_ngroup_cumcount_pair(self): # brute force comparison for all small series for p in product(range(3), repeat=4): - df = DataFrame({'a': p}) - g = df.groupby(['a']) + df = DataFrame({"a": p}) + g = df.groupby(["a"]) order = sorted(set(p)) ngroupd = [order.index(val) for val in p] @@ -179,43 +180,43 @@ def test_ngroup_cumcount_pair(self): def test_ngroup_respects_groupby_order(self): np.random.seed(0) - df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + df = DataFrame({"a": np.random.choice(list("abcdef"), 100)}) for sort_flag in (False, True): - g = df.groupby(['a'], sort=sort_flag) - df['group_id'] = -1 - df['group_index'] = -1 + g = df.groupby(["a"], sort=sort_flag) + df["group_id"] = -1 + df["group_index"] = -1 for i, (_, group) in enumerate(g): - df.loc[group.index, 'group_id'] = i + df.loc[group.index, "group_id"] = i for j, ind in enumerate(group.index): - df.loc[ind, 'group_index'] = j - - assert_series_equal(Series(df['group_id'].values), - g.ngroup()) - assert_series_equal(Series(df['group_index'].values), - g.cumcount()) - - @pytest.mark.parametrize('datetimelike', [ - [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)], - [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)], - [Timedelta(x, unit="h") for x in range(1, 4)], - [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]]) + df.loc[ind, "group_index"] = j + + assert_series_equal(Series(df["group_id"].values), g.ngroup()) + assert_series_equal(Series(df["group_index"].values), g.cumcount()) + + @pytest.mark.parametrize( + "datetimelike", + [ + [Timestamp("2016-05-%02d 20:09:25+00:00" % i) for i in range(1, 4)], + [Timestamp("2016-05-%02d 20:09:25" % i) for i in range(1, 4)], + [Timedelta(x, unit="h") for x in range(1, 4)], + [Period(freq="2W", year=2017, month=x) for x in range(1, 4)], + ], + ) def test_count_with_datetimelike(self, datetimelike): # test for #13393, where DataframeGroupBy.count() fails # when counting a datetimelike column. 
- df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike}) - res = df.groupby('x').count() - expected = DataFrame({'y': [2, 1]}, index=['a', 'b']) + df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike}) + res = df.groupby("x").count() + expected = DataFrame({"y": [2, 1]}, index=["a", "b"]) expected.index.name = "x" assert_frame_equal(expected, res) def test_count_with_only_nans_in_first_group(self): # GH21956 - df = DataFrame({'A': [np.nan, np.nan], 'B': ['a', 'b'], 'C': [1, 2]}) - result = df.groupby(['A', 'B']).C.count() - mi = MultiIndex(levels=[[], ['a', 'b']], - codes=[[], []], - names=['A', 'B']) - expected = Series([], index=mi, dtype=np.int64, name='C') + df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]}) + result = df.groupby(["A", "B"]).C.count() + mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"]) + expected = Series([], index=mi, dtype=np.int64, name="C") assert_series_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 52c4654ae8c73..2ce04fc774083 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -12,17 +12,17 @@ def test_filter_series(): expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) - tm.assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - tm.assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. tm.assert_series_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) + expected_odd.reindex(s.index), + ) tm.assert_series_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) + expected_even.reindex(s.index), + ) def test_filter_single_column_df(): @@ -31,36 +31,35 @@ def test_filter_single_column_df(): expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) grouper = df[0].apply(lambda x: x % 2) grouped = df.groupby(grouper) - tm.assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - tm.assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even) # Test dropna=False. 
tm.assert_frame_equal( grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) + expected_odd.reindex(df.index), + ) tm.assert_frame_equal( grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) + expected_even.reindex(df.index), + ) def test_filter_multi_column_df(): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + expected = pd.DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2]) tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) + grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected + ) def test_filter_mixed_df(): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) + expected = pd.DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2]) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected) def test_filter_out_all_groups(): @@ -68,11 +67,10 @@ def test_filter_out_all_groups(): grouper = s.apply(lambda x: x % 2) grouped = s.groupby(grouper) tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - tm.assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) + tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]]) def test_filter_out_no_groups(): @@ -81,25 +79,25 @@ def test_filter_out_no_groups(): grouped = s.groupby(grouper) filtered = grouped.filter(lambda x: x.mean() > 0) tm.assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) + df = pd.DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()}) + grouper = df["A"].apply(lambda x: x % 2) grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) + filtered = grouped.filter(lambda x: x["A"].mean() > 0) tm.assert_frame_equal(filtered, df) def test_filter_out_all_groups_in_df(): # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3}) + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=False) + expected = pd.DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3}) tm.assert_frame_equal(expected, res) - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]}) + res = df.groupby("a") + res = res.filter(lambda x: x["b"].sum() > 5, dropna=True) 
+ expected = pd.DataFrame({"a": [], "b": []}, dtype="int64") tm.assert_frame_equal(expected, res) @@ -121,20 +119,16 @@ def raise_if_sum_is_zero(x): def test_filter_with_axis_in_groupby(): # issue 11041 index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) + data = pd.DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64") + result = data.groupby(level=0, axis=1).filter(lambda x: x.iloc[0, 0] > 10) expected = data.iloc[:, 12:20] tm.assert_frame_equal(result, expected) def test_filter_bad_shapes(): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby("B") g_s = s.groupby(s) f = lambda x: x @@ -163,11 +157,9 @@ def test_filter_bad_shapes(): def test_filter_nan_is_false(): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + s = df["B"] + g_df = df.groupby(df["B"]) g_s = s.groupby(s) f = lambda x: np.nan @@ -183,7 +175,7 @@ def test_filter_against_workaround(): grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] + old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) @@ -192,53 +184,53 @@ def test_filter_against_workaround(): grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] + old_way = s[grouped.transform(f).astype("bool")] new_way = grouped.filter(f) tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) + df = DataFrame( + { + "ints": Series(np.random.randint(0, 100, N)), + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + grouped = df.groupby("ints") + old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] + old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) tm.assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. 
- transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + grouped = df.groupby("letters") + old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")] + new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20) tm.assert_frame_equal(new_way, old_way) def test_filter_using_len(): # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') + df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)}) + grouped = df.groupby("B") actual = grouped.filter(lambda x: len(x) > 2) expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)}, + index=np.arange(2, 6), + ) tm.assert_frame_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) @@ -246,10 +238,10 @@ def test_filter_using_len(): tm.assert_frame_equal(actual, expected) # Series have always worked properly, but we'll test anyway. - s = df['B'] + s = df["B"] grouped = s.groupby(s) actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + expected = Series(4 * ["b"], index=np.arange(2, 6), name="B") tm.assert_series_equal(actual, expected) actual = grouped.filter(lambda x: len(x) > 4) @@ -259,28 +251,29 @@ def test_filter_using_len(): def test_filter_maintains_ordering(): # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]} + ) + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) # Now index is sequentially decreasing. df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) @@ -288,13 +281,13 @@ def test_filter_maintains_ordering(): # Index is shuffled. 
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') + s = df["pid"] + grouped = df.groupby("tag") actual = grouped.filter(lambda x: len(x) > 1) expected = df.iloc[[1, 2, 4, 7]] tm.assert_frame_equal(actual, expected) - grouped = s.groupby(df['tag']) + grouped = s.groupby(df["tag"]) actual = grouped.filter(lambda x: len(x) > 1) expected = s.iloc[[1, 2, 4, 7]] tm.assert_series_equal(actual, expected) @@ -302,39 +295,45 @@ def test_filter_maintains_ordering(): def test_filter_multiple_timestamp(): # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) + df = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": ["foo", "bar", "foo", "bar", "bar"], + "C": Timestamp("20130101"), + } + ) - grouped = df.groupby(['B', 'C']) + grouped = df.groupby(["B", "C"]) - result = grouped['A'].filter(lambda x: True) - tm.assert_series_equal(df['A'], result) + result = grouped["A"].filter(lambda x: True) + tm.assert_series_equal(df["A"], result) - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') + result = grouped["A"].transform(len) + expected = Series([2, 3, 2, 3, 3], name="A") tm.assert_series_equal(result, expected) result = grouped.filter(lambda x: True) tm.assert_frame_equal(df, result) - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + result = grouped.transform("sum") + expected = DataFrame({"A": [2, 8, 2, 8, 8]}) tm.assert_frame_equal(result, expected) result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + expected = DataFrame({"A": [2, 3, 2, 3, 3]}) tm.assert_frame_equal(result, expected) def test_filter_and_transform_with_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -354,13 +353,13 @@ def test_filter_and_transform_with_non_unique_int_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! 
tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -371,11 +370,13 @@ def test_filter_and_transform_with_non_unique_int_index(): def test_filter_and_transform_with_multiple_non_unique_int_index(): # GH4620 index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -395,13 +396,13 @@ def test_filter_and_transform_with_multiple_non_unique_int_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -412,11 +413,13 @@ def test_filter_and_transform_with_multiple_non_unique_int_index(): def test_filter_and_transform_with_non_unique_float_index(): # GH4620 index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -436,13 +439,13 @@ def test_filter_and_transform_with_non_unique_float_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! 
tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -452,15 +455,17 @@ def test_filter_and_transform_with_non_unique_float_index(): def test_filter_and_transform_with_non_unique_timestamp_index(): # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') + t0 = Timestamp("2013-09-30 00:05:00") + t1 = Timestamp("2013-10-30 00:05:00") + t2 = Timestamp("2013-11-30 00:05:00") index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -480,13 +485,13 @@ def test_filter_and_transform_with_non_unique_timestamp_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -496,12 +501,14 @@ def test_filter_and_transform_with_non_unique_timestamp_index(): def test_filter_and_transform_with_non_unique_string_index(): # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) + index = list("bbbcbbab") + df = DataFrame( + {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}, + index=index, + ) + grouped_df = df.groupby("tag") + ser = df["pid"] + grouped_ser = ser.groupby(df["tag"]) expected_indexes = [1, 2, 4, 7] # Filter DataFrame @@ -521,13 +528,13 @@ def test_filter_and_transform_with_non_unique_string_index(): actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name="pid") # ^ made manually because this can get confusing! tm.assert_series_equal(actual, expected) # Transform Series actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid") tm.assert_series_equal(actual, expected) # Transform (a column from) DataFrameGroupBy @@ -536,39 +543,45 @@ def test_filter_and_transform_with_non_unique_string_index(): def test_filter_has_access_to_grouped_cols(): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") # previously didn't have access to col A #???? 
- filt = g.filter(lambda x: x['A'].sum() == 2) + filt = g.filter(lambda x: x["A"].sum() == 2) tm.assert_frame_equal(filt, df.iloc[[0, 1]]) def test_filter_enforces_scalarness(): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with pytest.raises(TypeError, match='filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') + df = pd.DataFrame( + [ + ["best", "a", "x"], + ["worst", "b", "y"], + ["best", "c", "x"], + ["best", "d", "y"], + ["worst", "d", "y"], + ["worst", "d", "y"], + ["best", "d", "z"], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("c").filter(lambda g: g["a"] == "best") def test_filter_non_bool_raises(): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with pytest.raises(TypeError, match='filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) + df = pd.DataFrame( + [ + ["best", "a", 1], + ["worst", "b", 1], + ["best", "c", 1], + ["best", "d", 1], + ["worst", "d", 1], + ["worst", "d", 1], + ["best", "d", 1], + ], + columns=["a", "b", "c"], + ) + with pytest.raises(TypeError, match="filter function returned a.*"): + df.groupby("a").filter(lambda g: g.c.mean()) def test_filter_dropna_with_empty_groups(): @@ -576,8 +589,7 @@ def test_filter_dropna_with_empty_groups(): data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) groupped = data.groupby(level=0) result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) - expected_false = pd.Series([np.nan] * 9, - index=np.repeat([1, 2, 3], 3)) + expected_false = pd.Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3)) tm.assert_series_equal(result_false, expected_false) result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e4303c0a07076..68e3db3a1ccb0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -9,58 +9,65 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm -@pytest.mark.parametrize("agg_func", ['any', 'all']) +@pytest.mark.parametrize("agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("vals", [ - ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], - [1, 2, 3], [1, 0, 0], [0, 0, 0], - [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], - [True, True, True], [True, False, False], [False, False, False], - [np.nan, np.nan, np.nan] -]) +@pytest.mark.parametrize( + "vals", + [ + ["foo", "bar", "baz"], + ["foo", "", ""], + ["", "", ""], + [1, 2, 3], + [1, 0, 0], + [0, 0, 0], + [1.0, 2.0, 3.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [True, True, True], + [True, False, False], + [False, False, False], + [np.nan, np.nan, np.nan], + ], +) def test_groupby_bool_aggs(agg_func, skipna, vals): - df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) + df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, 
"val": vals * 2}) # Figure out expectation using Python builtin exp = getattr(builtins, agg_func)(vals) # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == 'any': + if skipna and all(isna(vals)) and agg_func == "any": exp = False - exp_df = DataFrame([exp] * 2, columns=['val'], index=Index( - ['a', 'b'], name='key')) - result = getattr(df.groupby('key'), agg_func)(skipna=skipna) + exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key")) + result = getattr(df.groupby("key"), agg_func)(skipna=skipna) tm.assert_frame_equal(result, exp_df) def test_max_min_non_numeric(): # #2700 - aa = DataFrame({'nn': [11, 11, 22, 22], - 'ii': [1, 2, 3, 4], - 'ss': 4 * ['mama']}) + aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]}) - result = aa.groupby('nn').max() - assert 'ss' in result + result = aa.groupby("nn").max() + assert "ss" in result - result = aa.groupby('nn').max(numeric_only=False) - assert 'ss' in result + result = aa.groupby("nn").max(numeric_only=False) + assert "ss" in result - result = aa.groupby('nn').min() - assert 'ss' in result + result = aa.groupby("nn").min() + assert "ss" in result - result = aa.groupby('nn').min(numeric_only=False) - assert 'ss' in result + result = aa.groupby("nn").min(numeric_only=False) + assert "ss" in result def test_intercept_builtin_sum(): - s = Series([1., 2., np.nan, 3.]) + s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) result = grouped.agg(builtins.sum) @@ -73,35 +80,34 @@ def test_intercept_builtin_sum(): # @pytest.mark.parametrize("f", [max, min, sum]) # def test_builtins_apply(f): + @pytest.mark.parametrize("f", [max, min, sum]) -@pytest.mark.parametrize('keys', [ - "jim", # Single key - ["jim", "joe"] # Multi-key -]) +@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key def test_builtins_apply(keys, f): # see gh-8155 - df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), - columns=["jim", "joe"]) + df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"]) df["jolie"] = np.random.randn(1000) fname = f.__name__ result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = ("invalid frame shape: {} " - "(expected ({}, 3))".format(result.shape, ngroups)) + assert_msg = "invalid frame shape: {} " "(expected ({}, 3))".format( + result.shape, ngroups + ) assert result.shape == (ngroups, 3), assert_msg - tm.assert_frame_equal(result, # numpy's equivalent function - df.groupby(keys).apply(getattr(np, fname))) + tm.assert_frame_equal( + result, # numpy's equivalent function + df.groupby(keys).apply(getattr(np, fname)), + ) if f != sum: expected = df.groupby(keys).agg(fname).reset_index() expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) - tm.assert_series_equal(getattr(result, fname)(), - getattr(df, fname)()) + tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) def test_arg_passthru(): @@ -111,42 +117,54 @@ def test_arg_passthru(): # GH3668 # GH5724 df = pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 
'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - - expected_columns_numeric = Index(['int', 'float', 'category_int']) + { + "group": [1, 1, 2], + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "category_string": pd.Series(list("abc")).astype("category"), + "category_int": [7, 8, 9], + "datetime": pd.date_range("20130101", periods=3), + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + }, + columns=[ + "group", + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ], + ) + + expected_columns_numeric = Index(["int", "float", "category_int"]) # mean / median expected = pd.DataFrame( - {'category_int': [7.5, 9], - 'float': [4.5, 6.], - 'timedelta': [pd.Timedelta('1.5s'), - pd.Timedelta('3s')], - 'int': [1.5, 3], - 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), - pd.Timestamp('2013-01-03 00:00:00')], - 'datetimetz': [ - pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), - pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, - index=Index([1, 2], name='group'), - columns=['int', 'float', 'category_int', - 'datetime', 'datetimetz', 'timedelta']) - - for attr in ['mean', 'median']: - f = getattr(df.groupby('group'), attr) + { + "category_int": [7.5, 9], + "float": [4.5, 6.0], + "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")], + "int": [1.5, 3], + "datetime": [ + pd.Timestamp("2013-01-01 12:00:00"), + pd.Timestamp("2013-01-03 00:00:00"), + ], + "datetimetz": [ + pd.Timestamp("2013-01-01 12:00:00", tz="US/Eastern"), + pd.Timestamp("2013-01-03 00:00:00", tz="US/Eastern"), + ], + }, + index=Index([1, 2], name="group"), + columns=["int", "float", "category_int", "datetime", "datetimetz", "timedelta"], + ) + + for attr in ["mean", "median"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -155,43 +173,57 @@ def test_arg_passthru(): # TODO: min, max *should* handle # categorical (ordered) dtype - expected_columns = Index(['int', 'float', 'string', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['min', 'max']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + [ + "int", + "float", + "string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["min", "max"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['first', 'last']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + [ + "int", + "float", + "string", + "category_string", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + ) + for attr in ["first", "last"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'string', - 'category_int', 'timedelta']) - for attr in ['sum']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "string", "category_int", "timedelta"]) + for attr in ["sum"]: + f = 
getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'category_int']) - for attr in ['prod', 'cumprod']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "category_int"]) + for attr in ["prod", "cumprod"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -199,12 +231,11 @@ def test_arg_passthru(): tm.assert_index_equal(result.columns, expected_columns) # like min, max, but don't include strings - expected_columns = Index(['int', 'float', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['cummin', 'cummax']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index( + ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] + ) + for attr in ["cummin", "cummax"]: + f = getattr(df.groupby("group"), attr) result = f() # GH 15561: numeric_only=False set by default like min/max tm.assert_index_equal(result.columns, expected_columns) @@ -212,10 +243,9 @@ def test_arg_passthru(): result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) - expected_columns = Index(['int', 'float', 'category_int', - 'timedelta']) - for attr in ['cumsum']: - f = getattr(df.groupby('group'), attr) + expected_columns = Index(["int", "float", "category_int", "timedelta"]) + for attr in ["cumsum"]: + f = getattr(df.groupby("group"), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) @@ -229,54 +259,59 @@ def test_non_cython_api(): # non-cython calls should not include the grouper df = DataFrame( - [[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, 'baz']], - columns=['A', 'B', 'C']) - g = df.groupby('A') - gni = df.groupby('A', as_index=False) + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]], columns=["A", "B", "C"] + ) + g = df.groupby("A") + gni = df.groupby("A", as_index=False) # mad - expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" result = g.mad() tm.assert_frame_equal(result, expected) - expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], - index=[0, 1]) + expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1]) result = gni.mad() tm.assert_frame_equal(result, expected) # describe - expected_index = pd.Index([1, 3], name='A') - expected_col = pd.MultiIndex(levels=[['B'], - ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']], - codes=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]], - index=expected_index, - columns=expected_col) + expected_index = pd.Index([1, 3], name="A") + expected_col = pd.MultiIndex( + levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]], + codes=[[0] * 8, list(range(8))], + ) + expected = pd.DataFrame( + [ + [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], + [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + ], + index=expected_index, + columns=expected_col, + ) result = g.describe() tm.assert_frame_equal(result, expected) - expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 
3].describe().unstack().to_frame().T]) + expected = pd.concat( + [ + df[df.A == 1].describe().unstack().to_frame().T, + df[df.A == 3].describe().unstack().to_frame().T, + ] + ) expected.index = pd.Index([0, 1]) result = gni.describe() tm.assert_frame_equal(result, expected) # any - expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame( + [[True, True], [False, True]], columns=["B", "C"], index=[1, 3] + ) + expected.index.name = "A" result = g.any() tm.assert_frame_equal(result, expected) # idxmax - expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3]) + expected.index.name = "A" result = g.idxmax() tm.assert_frame_equal(result, expected) @@ -286,23 +321,20 @@ def test_cython_api2(): # this takes the fast apply path # cumsum (GH5614) - df = DataFrame( - [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] - ], columns=['A', 'B', 'C']) - expected = DataFrame( - [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum() + df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]) + expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"]) + result = df.groupby("A").cumsum() tm.assert_frame_equal(result, expected) # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum() + result = df.groupby("A", as_index=False).cumsum() tm.assert_frame_equal(result, expected) # GH 13994 - result = df.groupby('A').cumsum(axis=1) + result = df.groupby("A").cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) - result = df.groupby('A').cumprod(axis=1) + result = df.groupby("A").cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) @@ -335,63 +367,69 @@ def test_median_empty_bins(observed): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [ - 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) -@pytest.mark.parametrize("method,data", [ - ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}), - ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}) -]) +@pytest.mark.parametrize( + "dtype", ["int8", "int16", "int32", "int64", "float32", "float64"] +) +@pytest.mark.parametrize( + "method,data", + [ + ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}), + ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}), + ("nth", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}], "args": [1]}), + ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}), + ], +) def test_groupby_non_arithmetic_agg_types(dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( - [{'a': 1, 'b': 1}, - {'a': 1, 'b': 2}, - {'a': 2, 'b': 3}, - {'a': 2, 'b': 4}]) + [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}] + ) - df['b'] = df.b.astype(dtype) + df["b"] = df.b.astype(dtype) - if 'args' not in data: - data['args'] = [] + if "args" not in data: + data["args"] = [] - if 'out_type' in data: - out_type = data['out_type'] 
+ if "out_type" in data: + out_type = data["out_type"] else: out_type = dtype - exp = data['df'] + exp = data["df"] df_out = pd.DataFrame(exp) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + df_out["b"] = df_out.b.astype(out_type) + df_out.set_index("a", inplace=True) - grpd = df.groupby('a') - t = getattr(grpd, method)(*data['args']) + grpd = df.groupby("a") + t = getattr(grpd, method)(*data["args"]) tm.assert_frame_equal(t, df_out) -@pytest.mark.parametrize("i", [ - (Timestamp("2011-01-15 12:50:28.502376"), - Timestamp("2011-01-20 12:50:28.593448")), - (24650000000000001, 24650000000000002) -]) +@pytest.mark.parametrize( + "i", + [ + ( + Timestamp("2011-01-15 12:50:28.502376"), + Timestamp("2011-01-20 12:50:28.593448"), + ), + (24650000000000001, 24650000000000002), + ], +) def test_groupby_non_arithmetic_agg_int_like_precision(i): # see gh-6620, gh-9311 df = pd.DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}]) - grp_exp = {"first": {"expected": i[0]}, - "last": {"expected": i[1]}, - "min": {"expected": i[0]}, - "max": {"expected": i[1]}, - "nth": {"expected": i[1], - "args": [1]}, - "count": {"expected": 2}} + grp_exp = { + "first": {"expected": i[0]}, + "last": {"expected": i[1]}, + "min": {"expected": i[0]}, + "max": {"expected": i[1]}, + "nth": {"expected": i[1], "args": [1]}, + "count": {"expected": 2}, + } for method, data in grp_exp.items(): if "args" not in data: @@ -403,21 +441,28 @@ def test_groupby_non_arithmetic_agg_int_like_precision(i): assert res.iloc[0].b == data["expected"] -@pytest.mark.parametrize("func, values", [ - ("idxmin", {'c_int': [0, 2], 'c_float': [1, 3], 'c_date': [1, 2]}), - ("idxmax", {'c_int': [1, 3], 'c_float': [0, 2], 'c_date': [0, 3]}) -]) +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}), + ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}), + ], +) def test_idxmin_idxmax_returns_int_types(func, values): # GH 25444 - df = pd.DataFrame({'name': ['A', 'A', 'B', 'B'], - 'c_int': [1, 2, 3, 4], - 'c_float': [4.02, 3.03, 2.04, 1.05], - 'c_date': ['2019', '2018', '2016', '2017']}) - df['c_date'] = pd.to_datetime(df['c_date']) + df = pd.DataFrame( + { + "name": ["A", "A", "B", "B"], + "c_int": [1, 2, 3, 4], + "c_float": [4.02, 3.03, 2.04, 1.05], + "c_date": ["2019", "2018", "2016", "2017"], + } + ) + df["c_date"] = pd.to_datetime(df["c_date"]) - result = getattr(df.groupby('name'), func)() + result = getattr(df.groupby("name"), func)() - expected = pd.DataFrame(values, index=Index(['A', 'B'], name="name")) + expected = pd.DataFrame(values, index=Index(["A", "B"], name="name")) tm.assert_frame_equal(result, expected) @@ -427,59 +472,88 @@ def test_fill_consistency(): # GH9221 # pass thru keyword arguments to the generated wrapper # are set if the passed kw is None (only) - df = DataFrame(index=pd.MultiIndex.from_product( - [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), - columns=Index( - ['1', '2'], name='id')) - df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, - np.nan, 22, np.nan] - df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, - np.nan, 44, np.nan] - - expected = df.groupby(level=0, axis=0).fillna(method='ffill') - result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T + df = DataFrame( + index=pd.MultiIndex.from_product( + [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] + ), + columns=Index(["1", "2"], name="id"), + ) + df["1"] = [ + np.nan, + 1, + 
np.nan, + np.nan, + 11, + np.nan, + np.nan, + 2, + np.nan, + np.nan, + 22, + np.nan, + ] + df["2"] = [ + np.nan, + 3, + np.nan, + np.nan, + 33, + np.nan, + np.nan, + 4, + np.nan, + np.nan, + 44, + np.nan, + ] + + expected = df.groupby(level=0, axis=0).fillna(method="ffill") + result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T tm.assert_frame_equal(result, expected) def test_groupby_cumprod(): # GH 4095 - df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) + df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) - actual = df.groupby('key')['value'].cumprod() - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' + actual = df.groupby("key")["value"].cumprod() + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" tm.assert_series_equal(actual, expected) - df = pd.DataFrame({'key': ['b'] * 100, 'value': 2}) - actual = df.groupby('key')['value'].cumprod() + df = pd.DataFrame({"key": ["b"] * 100, "value": 2}) + actual = df.groupby("key")["value"].cumprod() # if overflows, groupby product casts to float # while numpy passes back invalid values - df['value'] = df['value'].astype(float) - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' + df["value"] = df["value"].astype(float) + expected = df.groupby("key")["value"].apply(lambda x: x.cumprod()) + expected.name = "value" tm.assert_series_equal(actual, expected) def scipy_sem(*args, **kwargs): from scipy.stats import sem + return sem(*args, ddof=1, **kwargs) @pytest.mark.parametrize( - 'op,targop', - [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), - pytest.param( - 'sem', scipy_sem, marks=td.skip_if_no_scipy)]) + "op,targop", + [ + ("mean", np.mean), + ("median", np.median), + ("std", np.std), + ("var", np.var), + ("sum", np.sum), + ("prod", np.prod), + ("min", np.min), + ("max", np.max), + ("first", lambda x: x.iloc[0]), + ("last", lambda x: x.iloc[-1]), + ("count", np.size), + pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy), + ], +) def test_ops_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) @@ -496,56 +570,58 @@ def test_max_nan_bug(): -05-07,2013-05-07 00:00:00,OE,xlsx""" df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby('Date') - r = gb[['File']].max() - e = gb['File'].max().to_frame() + gb = df.groupby("Date") + r = gb[["File"]].max() + e = gb["File"].max().to_frame() tm.assert_frame_equal(r, e) - assert not r['File'].isna().any() + assert not r["File"].isna().any() def test_nlargest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) + b = Series(list("a" * 5 + "b" * 5)) gb = a.groupby(b) r = gb.nlargest(3) - e = Series([ - 7, 5, 3, 10, 9, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) + e = Series( + [7, 5, 3, 10, 9, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]), + ) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) - e = Series([ - 3, 2, 1, 3, 3, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) - tm.assert_series_equal(gb.nlargest(3, keep='last'), e) + e = Series( + [3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]), + ) + 
tm.assert_series_equal(gb.nlargest(3, keep="last"), e) def test_nsmallest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) + b = Series(list("a" * 5 + "b" * 5)) gb = a.groupby(b) r = gb.nsmallest(3) - e = Series([ - 1, 2, 3, 0, 4, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) + e = Series( + [1, 2, 3, 0, 4, 6], + index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]), + ) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) - e = Series([ - 0, 1, 1, 0, 1, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) - tm.assert_series_equal(gb.nsmallest(3, keep='last'), e) + e = Series( + [0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]), + ) + tm.assert_series_equal(gb.nsmallest(3, keep="last"), e) -@pytest.mark.parametrize("func", [ - 'mean', 'var', 'std', 'cumprod', 'cumsum' -]) +@pytest.mark.parametrize("func", ["mean", "var", "std", "cumprod", "cumsum"]) def test_numpy_compat(func): # see gh-12811 - df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) - g = df.groupby('A') + df = pd.DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}) + g = df.groupby("A") msg = "numpy operations are not valid with groupby" @@ -558,12 +634,21 @@ def test_numpy_compat(func): def test_cummin_cummax(): # GH 15048 num_types = [np.int32, np.int64, np.float32, np.float64] - num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, - np.finfo(np.float32).min, np.finfo(np.float64).min] - num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, - np.finfo(np.float32).max, np.finfo(np.float64).max] - base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], - 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + num_mins = [ + np.iinfo(np.int32).min, + np.iinfo(np.int64).min, + np.finfo(np.float32).min, + np.finfo(np.float64).min, + ] + num_max = [ + np.iinfo(np.int32).max, + np.iinfo(np.int64).max, + np.finfo(np.float32).max, + np.finfo(np.float64).max, + ] + base_df = pd.DataFrame( + {"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} + ) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] @@ -571,154 +656,155 @@ def test_cummin_cummax(): df = base_df.astype(dtype) # cummin - expected = pd.DataFrame({'B': expected_mins}).astype(dtype) - result = df.groupby('A').cummin() + expected = pd.DataFrame({"B": expected_mins}).astype(dtype) + result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # Test cummin w/ min value for dtype - df.loc[[2, 6], 'B'] = min_val - expected.loc[[2, 3, 6, 7], 'B'] = min_val - result = df.groupby('A').cummin() + df.loc[[2, 6], "B"] = min_val + expected.loc[[2, 3, 6, 7], "B"] = min_val + result = df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # cummax - expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) - result = df.groupby('A').cummax() + expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) + result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() 
tm.assert_frame_equal(result, expected) # Test cummax w/ max value for dtype - df.loc[[2, 6], 'B'] = max_val - expected.loc[[2, 3, 6, 7], 'B'] = max_val - result = df.groupby('A').cummax() + df.loc[[2, 6], "B"] = max_val + expected.loc[[2, 3, 6, 7], "B"] = max_val + result = df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test nan in some values - base_df.loc[[0, 2, 4, 6], 'B'] = np.nan - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, - np.nan, 3, np.nan, 1]}) - result = base_df.groupby('A').cummin() + base_df.loc[[0, 2, 4, 6], "B"] = np.nan + expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) + result = base_df.groupby("A").cummin() tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummin()) - .to_frame()) + expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, - np.nan, 3, np.nan, 3]}) - result = base_df.groupby('A').cummax() + expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) + result = base_df.groupby("A").cummax() tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummax()) - .to_frame()) + expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test nan in entire column - base_df['B'] = np.nan - expected = pd.DataFrame({'B': [np.nan] * 8}) - result = base_df.groupby('A').cummin() + base_df["B"] = np.nan + expected = pd.DataFrame({"B": [np.nan] * 8}) + result = base_df.groupby("A").cummin() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').cummax() + result = base_df.groupby("A").cummax() tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(expected, result) # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) - expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') - for method in ['cummax', 'cummin']: - result = getattr(df.groupby('a')['b'], method)() + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) + expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") + for method in ["cummax", "cummin"]: + result = getattr(df.groupby("a")["b"], method)() tm.assert_series_equal(expected, result) # GH 15635 df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) - result = df.groupby('a').b.cummax() - expected = pd.Series([2, 1, 2], name='b') + result = df.groupby("a").b.cummax() + expected = pd.Series([2, 1, 2], name="b") tm.assert_series_equal(result, expected) df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) - result = df.groupby('a').b.cummin() - expected = pd.Series([1, 2, 1], name='b') + result = df.groupby("a").b.cummin() + expected = pd.Series([1, 2, 1], name="b") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('in_vals, out_vals', [ - - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly 
increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), -]) +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) def test_is_monotonic_increasing(in_vals, out_vals): # GH 17015 source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_increasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') + result = df.groupby("B").C.is_monotonic_increasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. - expected = ( - df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('in_vals, out_vals', [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), -]) +@pytest.mark.parametrize( + "in_vals, out_vals", + [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), + # Test with inf vals + ( + [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True], + ), + # Test with nan vals; should always be False + ( + [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False], + ), + ], +) def test_is_monotonic_decreasing(in_vals, out_vals): # GH 17015 source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} + "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], + "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"], + "C": in_vals, + } df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') + result = 
df.groupby("B").C.is_monotonic_decreasing + index = Index(list("abcd"), name="B") + expected = pd.Series(index=index, data=out_vals, name="C") tm.assert_series_equal(result, expected) # describe # -------------------------------- + def test_apply_describe_bug(mframe): - grouped = mframe.groupby(level='first') + grouped = mframe.groupby(level="first") grouped.describe() # it works! @@ -726,10 +812,9 @@ def test_series_describe_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() - tm.assert_series_equal(result['mean'], grouped.mean(), - check_names=False) - tm.assert_series_equal(result['std'], grouped.std(), check_names=False) - tm.assert_series_equal(result['min'], grouped.min(), check_names=False) + tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False) + tm.assert_series_equal(result["std"], grouped.std(), check_names=False) + tm.assert_series_equal(result["min"], grouped.min(), check_names=False) def test_series_describe_single(): @@ -741,9 +826,9 @@ def test_series_describe_single(): def test_series_index_name(df): - grouped = df.loc[:, ['C']].groupby(df['A']) + grouped = df.loc[:, ["C"]].groupby(df["A"]) result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' + assert result.index.name == "A" def test_frame_describe_multikey(tsframe): @@ -755,172 +840,184 @@ def test_frame_describe_multikey(tsframe): # GH 17464 - Remove duplicate MultiIndex levels group_col = pd.MultiIndex( levels=[[col], group.columns], - codes=[[0] * len(group.columns), range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) + codes=[[0] * len(group.columns), range(len(group.columns))], + ) + group = pd.DataFrame(group.values, columns=group_col, index=group.index) desc_groups.append(group) expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) - groupedT = tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) + groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T expected.index = pd.MultiIndex( levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))]) + codes=[[0, 0, 1, 1], range(len(expected.index))], + ) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) + df1 = DataFrame( + { + "x": [1, 2, 3, 4, 5] * 3, + "y": [10, 20, 30, 40, 50] * 3, + "z": [100, 200, 300, 400, 500] * 3, + } + ) + df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={"k": "key"}) msg = "Names should be list-like for a MultiIndex" with pytest.raises(ValueError, match=msg): - df1.groupby('k').describe() + df1.groupby("k").describe() with pytest.raises(ValueError, match=msg): - df2.groupby('key').describe() + df2.groupby("key").describe() def test_frame_describe_unstacked_format(): # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df 
= pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) + prices = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 24990, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 25499, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 25499, + } + volumes = { + pd.Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + pd.Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + pd.Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + } + df = pd.DataFrame({"PRICE": prices, "VOLUME": volumes}) + result = df.groupby("PRICE").VOLUME.describe() + data = [ + df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist(), + ] + expected = pd.DataFrame( + data, + index=pd.Index([24990, 25499], name="PRICE"), + columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_frame_equal(result, expected) # nunique # -------------------------------- -@pytest.mark.parametrize('n', 10 ** np.arange(2, 6)) -@pytest.mark.parametrize('m', [10, 100, 1000]) -@pytest.mark.parametrize('sort', [False, True]) -@pytest.mark.parametrize('dropna', [False, True]) -def test_series_groupby_nunique(n, m, sort, dropna): +@pytest.mark.parametrize("n", 10 ** np.arange(2, 6)) +@pytest.mark.parametrize("m", [10, 100, 1000]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("dropna", [False, True]) +def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) + left = gr["julie"].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) + right = gr["julie"].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) - days = date_range('2015-08-23', periods=10) + days = date_range("2015-08-23", periods=10) - frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n)}) + frame = DataFrame( + { + "jim": np.random.choice(list(ascii_lowercase), n), + "joe": np.random.choice(days, n), + "julie": np.random.randint(0, m, n), + } + ) - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None + frame.loc[1::17, "jim"] = None + frame.loc[3::37, "joe"] = None + frame.loc[7::19, "julie"] = None + frame.loc[8::19, "julie"] = None + frame.loc[9::19, "julie"] = None - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) + check_nunique(frame, ["jim"]) + check_nunique(frame, ["jim", "joe"]) + check_nunique(frame, ["jim"], as_index=False) + check_nunique(frame, ["jim", "joe"], as_index=False) def test_nunique(): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': 
list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', as_index=False).nunique() + df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")}) + + expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]}) + result = df.groupby("A", as_index=False).nunique() tm.assert_frame_equal(result, expected) # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() + expected.index = list("abc") + expected.index.name = "A" + result = df.groupby("A").nunique() tm.assert_frame_equal(result, expected) # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + result = df.replace({"x": None}).groupby("A").nunique(dropna=False) tm.assert_frame_equal(result, expected) # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() + expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc")) + expected.index.name = "A" + result = df.replace({"x": None}).groupby("A").nunique() tm.assert_frame_equal(result, expected) def test_nunique_with_object(): # GH 11077 data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] + [ + [100, 1, "Alice"], + [200, 2, "Bob"], + [300, 3, "Charlie"], + [-400, 4, "Dan"], + [500, 5, "Edith"], + ], + columns=["amount", "id", "name"], ) - result = data.groupby(['id', 'amount'])['name'].nunique() + result = data.groupby(["id", "amount"])["name"].nunique() index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) + expected = pd.Series([1] * 5, name="name", index=index) tm.assert_series_equal(result, expected) def test_nunique_with_empty_series(): # GH 12553 - data = pd.Series(name='name') + data = pd.Series(name="name") result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') + expected = pd.Series(name="name", dtype="int64") tm.assert_series_equal(result, expected) def test_nunique_with_timegrouper(): # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + Timestamp("2016-06-28 16:09:30"), + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ).set_index("time") + result = test.groupby(pd.Grouper(freq="h"))["data"].nunique() + expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) def test_nunique_preserves_column_level_names(): # GH 23222 - test = pd.DataFrame([1, 2, 2], - columns=pd.Index(['A'], name="level_0")) + test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) result = test.groupby([0, 0, 0]).nunique() expected = pd.DataFrame([2], columns=test.columns) tm.assert_frame_equal(result, expected) @@ -929,40 +1026,40 @@ def test_nunique_preserves_column_level_names(): # count # -------------------------------- + def test_groupby_timedelta_cython_count(): - df = DataFrame({'g': list('ab' * 2), - 'delt': 
np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() + df = DataFrame( + {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")} + ) + expected = Series([2, 2], index=pd.Index(["a", "b"], name="g"), name="delt") + result = df.groupby("g").delt.count() tm.assert_series_equal(expected, result) def test_count(): n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', '4th']): + dr = date_range("2015-08-30", periods=n // 10, freq="T") + + df = DataFrame( + { + "1st": np.random.choice(list(ascii_lowercase), n), + "2nd": np.random.randint(0, 5, n), + "3rd": np.random.randn(n).round(3), + "4th": np.random.randint(-10, 10, n), + "5th": np.random.choice(dr, n), + "6th": np.random.randn(n).round(3), + "7th": np.random.randn(n).round(3), + "8th": np.random.choice(dr, n) - np.random.choice(dr, 1), + "9th": np.random.choice(list(ascii_lowercase), n), + } + ) + + for col in df.columns.drop(["1st", "2nd", "4th"]): df.loc[np.random.choice(n, n // 10), col] = np.nan - df['9th'] = df['9th'].astype('category') + df["9th"] = df["9th"].astype("category") - for key in ['1st', '2nd', ['1st', '2nd']]: + for key in ["1st", "2nd", ["1st", "2nd"]]: left = df.groupby(key).count() right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) @@ -971,70 +1068,65 @@ def test_count(): def test_count_non_nulls(): # GH#5610 # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, np.nan]], - columns=['A', 'B', 'C']) + df = pd.DataFrame( + [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]], + columns=["A", "B", "C"], + ) - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() + count_as = df.groupby("A").count() + count_not_as = df.groupby("A", as_index=False).count() - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' + expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3]) + expected.index.name = "A" tm.assert_frame_equal(count_not_as, expected.reset_index()) tm.assert_frame_equal(count_as, expected) - count_B = df.groupby('A')['B'].count() - tm.assert_series_equal(count_B, expected['B']) + count_B = df.groupby("A")["B"].count() + tm.assert_series_equal(count_B, expected["B"]) def test_count_object(): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') + df = pd.DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([3, 3], index=pd.Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') + df = 
pd.DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3}) + result = df.groupby("c").a.count() + expected = pd.Series([1, 3], index=pd.Index([2, 3], name="c"), name="a") tm.assert_series_equal(result, expected) def test_count_cross_type(): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) + vals = np.hstack( + (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2))) + ) - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(vals, columns=["a", "b", "c", "d"]) df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() + expected = df.groupby(["c", "d"]).count() - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() + for t in ["float32", "object"]: + df["a"] = df["a"].astype(t) + df["b"] = df["b"].astype(t) + result = df.groupby(["c", "d"]).count() tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) + df = DataFrame( + { + "a": np.array([0, 1, 2, 100], np.int8), + "b": np.array([1, 2, 3, 6], np.uint32), + "c": np.array([4, 5, 6, 8], np.int16), + "grp": list("ab" * 2), + } + ) + result = df.groupby("grp").count() + expected = DataFrame( + {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=pd.Index(list("ab"), name="grp") + ) tm.assert_frame_equal(result, expected) @@ -1043,8 +1135,7 @@ class RaisingObjectException(Exception): pass class RaisingObject: - - def __init__(self, msg='I will raise inside Cython'): + def __init__(self, msg="I will raise inside Cython"): super().__init__() self.msg = msg @@ -1052,116 +1143,129 @@ def __eq__(self, other): # gets called in Cython to check that raising calls the method raise RaisingObjectException(self.msg) - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) + df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)}) + result = df.groupby("grp").count() + expected = DataFrame({"a": [2, 2]}, index=pd.Index(list("ab"), name="grp")) tm.assert_frame_equal(result, expected) # size # -------------------------------- + def test_size(df): - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.size() for key, group in grouped: assert result[key] == len(group) - grouped = df.groupby('B') + grouped = df.groupby("B") result = grouped.size() for key, group in grouped: assert result[key] == len(group) - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in product((False, True), ('a', 'b', ['a', 'b'])): + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list("abc")) + for sort, key in product((False, True), ("a", "b", ["a", "b"])): left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + right = df.groupby(key, sort=sort)["c"].apply(lambda a: a.shape[0]) tm.assert_series_equal(left, right, 
check_names=False) # GH11699 - df = DataFrame(columns=['A', 'B']) - out = Series(dtype='int64', index=Index([], name='A')) - tm.assert_series_equal(df.groupby('A').size(), out) + df = DataFrame(columns=["A", "B"]) + out = Series(dtype="int64", index=Index([], name="A")) + tm.assert_series_equal(df.groupby("A").size(), out) def test_size_groupby_all_null(): # GH23050 # Assert no 'Value Error : Length of passed values is 2, index implies 0' - df = DataFrame({'A': [None, None]}) # all-null groups - result = df.groupby('A').size() - expected = Series(dtype='int64', index=Index([], name='A')) + df = DataFrame({"A": [None, None]}) # all-null groups + result = df.groupby("A").size() + expected = Series(dtype="int64", index=Index([], name="A")) tm.assert_series_equal(result, expected) # quantile # -------------------------------- -@pytest.mark.parametrize("interpolation", [ - "linear", "lower", "higher", "nearest", "midpoint"]) -@pytest.mark.parametrize("a_vals,b_vals", [ - # Ints - ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), - ([1, 2, 3, 4], [4, 3, 2, 1]), - ([1, 2, 3, 4, 5], [4, 3, 2, 1]), - # Floats - ([1., 2., 3., 4., 5.], [5., 4., 3., 2., 1.]), - # Missing data - ([1., np.nan, 3., np.nan, 5.], [5., np.nan, 3., np.nan, 1.]), - ([np.nan, 4., np.nan, 2., np.nan], [np.nan, 4., np.nan, 2., np.nan]), - # Timestamps - ([x for x in pd.date_range('1/1/18', freq='D', periods=5)], - [x for x in pd.date_range('1/1/18', freq='D', periods=5)][::-1]), - # All NA - ([np.nan] * 5, [np.nan] * 5), -]) -@pytest.mark.parametrize('q', [0, .25, .5, .75, 1]) +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize( + "a_vals,b_vals", + [ + # Ints + ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), + ([1, 2, 3, 4], [4, 3, 2, 1]), + ([1, 2, 3, 4, 5], [4, 3, 2, 1]), + # Floats + ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]), + # Missing data + ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]), + ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]), + # Timestamps + ( + [x for x in pd.date_range("1/1/18", freq="D", periods=5)], + [x for x in pd.date_range("1/1/18", freq="D", periods=5)][::-1], + ), + # All NA + ([np.nan] * 5, [np.nan] * 5), + ], +) +@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1]) def test_quantile(interpolation, a_vals, b_vals, q): - if interpolation == 'nearest' and q == 0.5 and b_vals == [4, 3, 2, 1]: - pytest.skip("Unclear numpy expectation for nearest result with " - "equidistant data") + if interpolation == "nearest" and q == 0.5 and b_vals == [4, 3, 2, 1]: + pytest.skip( + "Unclear numpy expectation for nearest result with " "equidistant data" + ) a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation) b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation) - df = DataFrame({ - 'key': ['a'] * len(a_vals) + ['b'] * len(b_vals), - 'val': a_vals + b_vals}) + df = DataFrame( + {"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": a_vals + b_vals} + ) - expected = DataFrame([a_expected, b_expected], columns=['val'], - index=Index(['a', 'b'], name='key')) - result = df.groupby('key').quantile(q, interpolation=interpolation) + expected = DataFrame( + [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key") + ) + result = df.groupby("key").quantile(q, interpolation=interpolation) tm.assert_frame_equal(result, expected) def test_quantile_raises(): - df = pd.DataFrame([ - ['foo', 'a'], ['foo', 'b'], ['foo', 'c']], columns=['key', 'val']) + df = 
pd.DataFrame( + [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] + ) - with pytest.raises(TypeError, match="cannot be performed against " - "'object' dtypes"): - df.groupby('key').quantile() + with pytest.raises( + TypeError, match="cannot be performed against " "'object' dtypes" + ): + df.groupby("key").quantile() # pipe # -------------------------------- + def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 random_state = np.random.RandomState(1234567890) - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': random_state.randn(8), - 'C': random_state.randn(8)}) + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": random_state.randn(8), + "C": random_state.randn(8), + } + ) def f(dfgb): return dfgb.B.max() - dfgb.C.min().min() @@ -1174,11 +1278,10 @@ def square(srs): # Series -> Series # This then chains the GroupBy.pipe and the # NDFrame.pipe methods - result = df.groupby('A').pipe(f).pipe(square) + result = df.groupby("A").pipe(f).pipe(square) - index = Index(['bar', 'foo'], dtype='object', name='A') - expected = pd.Series([8.99110003361, 8.17516964785], name='B', - index=index) + index = Index(["bar", "foo"], dtype="object", name="A") + expected = pd.Series([8.99110003361, 8.17516964785], name="B", index=index) tm.assert_series_equal(expected, result) @@ -1187,13 +1290,18 @@ def test_pipe_args(): # Test passing args to the pipe method of DataFrameGroupBy. # Issue #17871 - df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], - 'x': [1.0, 2.0, 3.0, 2.0, 5.0], - 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + df = pd.DataFrame( + { + "group": ["A", "A", "B", "B", "C"], + "x": [1.0, 2.0, 3.0, 2.0, 5.0], + "y": [10.0, 100.0, 1000.0, -100.0, -1000.0], + } + ) def f(dfgb, arg1): - return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) - .groupby(dfgb.grouper)) + return dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False).groupby( + dfgb.grouper + ) def g(dfgb, arg2): return dfgb.sum() / dfgb.sum().sum() + arg2 @@ -1201,16 +1309,11 @@ def g(dfgb, arg2): def h(df, arg3): return df.x + df.y - arg3 - result = (df - .groupby('group') - .pipe(f, 0) - .pipe(g, 10) - .pipe(h, 100)) + result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100) # Assert the results here - index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, -80], - index=index) + index = pd.Index(["A", "B", "C"], name="group") + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) tm.assert_series_equal(expected, result) @@ -1225,8 +1328,10 @@ def h(df, arg3): def test_groupby_mean_no_overflow(): # Regression test for (#22487) - df = pd.DataFrame({ - "user": ["A", "A", "A", "A", "A"], - "connections": [4970, 4749, 4719, 4704, 18446744073699999744] - }) - assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840 + df = pd.DataFrame( + { + "user": ["A", "A", "A", "A", "A"], + "connections": [4970, 4749, 4719, 4704, 18446744073699999744], + } + ) + assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d13dddac79042..2379d25ebe5aa 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,22 +9,24 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) 
+from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) def test_repr(): # GH18203 - result = repr(pd.Grouper(key='A', level='B')) + result = repr(pd.Grouper(key="A", level="B")) expected = "Grouper(key='A', level='B', axis=0, sort=False)" assert result == expected -@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32']) +@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) def test_basic(dtype): data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) @@ -51,15 +53,13 @@ def test_basic(dtype): assert_series_equal(transformed, expected) value_grouped = data.groupby(data) - assert_series_equal(value_grouped.aggregate(np.mean), agged, - check_index_type=False) + assert_series_equal(value_grouped.aggregate(np.mean), agged, check_index_type=False) # complex agg agged = grouped.aggregate([np.mean, np.std]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - agged = grouped.aggregate({'one': np.mean, 'two': np.std}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + agged = grouped.aggregate({"one": np.mean, "two": np.std}) group_constants = {0: 10, 1: 20, 2: 30} agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) @@ -77,22 +77,22 @@ def test_groupby_nonobject_dtype(mframe, df_mixed_floats): grouped = mframe.groupby(key) result = grouped.sum() - expected = mframe.groupby(key.astype('O')).sum() + expected = mframe.groupby(key.astype("O")).sum() assert_frame_equal(result, expected) # GH 3911, mixed frame non-conversion df = df_mixed_floats.copy() - df['value'] = range(len(df)) + df["value"] = range(len(df)) def max_value(group): - return group.loc[group['value'].idxmax()] + return group.loc[group["value"].idxmax()] - applied = df.groupby('A').apply(max_value) + applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = Series([np.dtype('object')] * 2 + - [np.dtype('float64')] * 2 + - [np.dtype('int64')], - index=['A', 'B', 'C', 'D', 'value']) + expected = Series( + [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], + index=["A", "B", "C", "D", "value"], + ) assert_series_equal(result, expected) @@ -100,11 +100,13 @@ def test_groupby_return_type(): # GH2893, return a reduced type df1 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 2, "val2": 27}, - {"val1": 2, "val2": 12} - ]) + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12}, + ] + ) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -113,11 +115,13 @@ def func(dataf): assert isinstance(result, Series) df2 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 1, "val2": 27}, - {"val1": 1, "val2": 12} - ]) + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12}, + ] + ) def func(dataf): return dataf["val2"] - dataf["val2"].mean() @@ -126,72 +130,74 @@ def func(dataf): assert isinstance(result, Series) # GH3596, return a consistent type (regression in 0.11 from 0.10.1) - df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y']) - result = df.groupby('X', squeeze=False).count() + df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"]) + result = 
df.groupby("X", squeeze=False).count() assert isinstance(result, DataFrame) def test_inconsistent_return_type(): # GH5592 # inconsistent return type - df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb', - 'Pony', 'Pony'], - B=Series(np.arange(7), dtype='int64'), - C=date_range('20130101', periods=7))) + df = DataFrame( + dict( + A=["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], + B=Series(np.arange(7), dtype="int64"), + C=date_range("20130101", periods=7), + ) + ) def f(grp): return grp.iloc[0] - expected = df.groupby('A').first()[['B']] - result = df.groupby('A').apply(f)[['B']] + expected = df.groupby("A").first()[["B"]] + result = df.groupby("A").apply(f)[["B"]] assert_frame_equal(result, expected) def f(grp): - if grp.name == 'Tiger': + if grp.name == "Tiger": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['B']] + result = df.groupby("A").apply(f)[["B"]] e = expected.copy() - e.loc['Tiger'] = np.nan + e.loc["Tiger"] = np.nan assert_frame_equal(result, e) def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['B']] + result = df.groupby("A").apply(f)[["B"]] e = expected.copy() - e.loc['Pony'] = np.nan + e.loc["Pony"] = np.nan assert_frame_equal(result, e) # 5592 revisited, with datetimes def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None return grp.iloc[0] - result = df.groupby('A').apply(f)[['C']] - e = df.groupby('A').first()[['C']] - e.loc['Pony'] = pd.NaT + result = df.groupby("A").apply(f)[["C"]] + e = df.groupby("A").first()[["C"]] + e.loc["Pony"] = pd.NaT assert_frame_equal(result, e) # scalar outputs def f(grp): - if grp.name == 'Pony': + if grp.name == "Pony": return None - return grp.iloc[0].loc['C'] + return grp.iloc[0].loc["C"] - result = df.groupby('A').apply(f) - e = df.groupby('A').first()['C'].copy() - e.loc['Pony'] = np.nan + result = df.groupby("A").apply(f) + e = df.groupby("A").first()["C"].copy() + e.loc["Pony"] = np.nan e.name = None assert_series_equal(result, e) def test_pass_args_kwargs(ts, tsframe): - def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -203,7 +209,7 @@ def f(x, q=None, axis=0): apply_result = ts_grouped.apply(np.percentile, 80, axis=0) trans_result = ts_grouped.transform(np.percentile, 80, axis=0) - agg_expected = ts_grouped.quantile(.8) + agg_expected = ts_grouped.quantile(0.8) trans_expected = ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) @@ -220,21 +226,20 @@ def f(x, q=None, axis=0): # DataFrame df_grouped = tsframe.groupby(lambda x: x.month) agg_result = df_grouped.agg(np.percentile, 80, axis=0) - apply_result = df_grouped.apply(DataFrame.quantile, .8) - expected = df_grouped.quantile(.8) + apply_result = df_grouped.apply(DataFrame.quantile, 0.8) + expected = df_grouped.quantile(0.8) assert_frame_equal(apply_result, expected, check_names=False) assert_frame_equal(agg_result, expected) agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=.8) + apply_result = df_grouped.apply(DataFrame.quantile, q=0.8) assert_frame_equal(agg_result, expected) assert_frame_equal(apply_result, expected, check_names=False) def test_len(): df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) + grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) assert len(grouped) == len(df) grouped = df.groupby([lambda x: x.year, lambda x: x.month]) @@ -243,34 +248,37 @@ def 
test_len(): # issue 11016 df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) - assert len(df.groupby(('a'))) == 0 - assert len(df.groupby(('b'))) == 3 - assert len(df.groupby(['a', 'b'])) == 3 + assert len(df.groupby(("a"))) == 0 + assert len(df.groupby(("b"))) == 3 + assert len(df.groupby(["a", "b"])) == 3 def test_basic_regression(): # regression result = Series([1.0 * x for x in list(range(1, 10)) * 10]) - data = np.random.random(1100) * 10. + data = np.random.random(1100) * 10.0 groupings = Series(data) grouped = result.groupby(groupings) grouped.mean() -@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64', - 'int32', 'int16', 'int8']) +@pytest.mark.parametrize( + "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"] +) def test_with_na_groups(dtype): index = Index(np.arange(10)) values = Series(np.ones(10), index, dtype=dtype) - labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, - 'bar', 'bar', np.nan, 'foo'], index=index) + labels = Series( + [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"], + index=index, + ) # this SHOULD be an int grouped = values.groupby(labels) agged = grouped.agg(len) - expected = Series([4, 2], index=['bar', 'foo']) + expected = Series([4, 2], index=["bar", "foo"]) assert_series_equal(agged, expected, check_dtype=False) @@ -281,7 +289,7 @@ def f(x): return float(len(x)) agged = grouped.agg(f) - expected = Series([4, 2], index=['bar', 'foo']) + expected = Series([4, 2], index=["bar", "foo"]) assert_series_equal(agged, expected, check_dtype=False) assert issubclass(agged.dtype.type, np.dtype(dtype).type) @@ -294,12 +302,11 @@ def test_indices_concatenation_order(): def f1(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, - names=['b', 'c']) - res = DataFrame(columns=['a'], index=multiindex) + multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"]) + res = DataFrame(columns=["a"], index=multiindex) return res else: - y = y.set_index(['b', 'c']) + y = y.set_index(["b", "c"]) return y def f2(x): @@ -307,40 +314,41 @@ def f2(x): if y.empty: return DataFrame() else: - y = y.set_index(['b', 'c']) + y = y.set_index(["b", "c"]) return y def f3(x): y = x[(x.b % 2) == 1] ** 2 if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, - names=['foo', 'bar']) - res = DataFrame(columns=['a', 'b'], index=multiindex) + multiindex = MultiIndex( + levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"] + ) + res = DataFrame(columns=["a", "b"], index=multiindex) return res else: return y - df = DataFrame({'a': [1, 2, 2, 2], 'b': range(4), 'c': range(5, 9)}) + df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)}) - df2 = DataFrame({'a': [3, 2, 2, 2], 'b': range(4), 'c': range(5, 9)}) + df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - result1 = df.groupby('a').apply(f1) - result2 = df2.groupby('a').apply(f1) + result1 = df.groupby("a").apply(f1) + result2 = df2.groupby("a").apply(f1) assert_frame_equal(result1, result2) # should fail (not the same number of levels) msg = "Cannot concat indices that do not have the same number of levels" with pytest.raises(AssertionError, match=msg): - df.groupby('a').apply(f2) + df.groupby("a").apply(f2) with pytest.raises(AssertionError, match=msg): - df2.groupby('a').apply(f2) + df2.groupby("a").apply(f2) # should fail (incorrect shape) with pytest.raises(AssertionError, match=msg): - df.groupby('a').apply(f3) + df.groupby("a").apply(f3) with 
pytest.raises(AssertionError, match=msg): - df2.groupby('a').apply(f3) + df2.groupby("a").apply(f3) def test_attr_wrapper(ts): @@ -363,7 +371,7 @@ def test_attr_wrapper(ts): # make sure raises error msg = "'SeriesGroupBy' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - getattr(grouped, 'foo') + getattr(grouped, "foo") def test_frame_groupby(tsframe): @@ -376,8 +384,8 @@ def test_frame_groupby(tsframe): # by string tscopy = tsframe.copy() - tscopy['weekday'] = [x.weekday() for x in tscopy.index] - stragged = tscopy.groupby('weekday').aggregate(np.mean) + tscopy["weekday"] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby("weekday").aggregate(np.mean) assert_frame_equal(stragged, aggregated, check_names=False) # transform @@ -391,8 +399,7 @@ def test_frame_groupby(tsframe): for name, group in grouped: mean = group.mean() for idx in group.index: - tm.assert_series_equal(transformed.xs(idx), mean, - check_names=False) + tm.assert_series_equal(transformed.xs(idx), mean, check_names=False) # iterate for weekday, group in grouped: @@ -408,7 +415,7 @@ def test_frame_groupby(tsframe): def test_frame_groupby_columns(tsframe): - mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} + mapping = {"A": 0, "B": 0, "C": 1, "D": 1} grouped = tsframe.groupby(mapping, axis=1) # aggregate @@ -427,109 +434,149 @@ def test_frame_groupby_columns(tsframe): def test_frame_set_name_single(df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.mean() - assert result.index.name == 'A' + assert result.index.name == "A" - result = df.groupby('A', as_index=False).mean() - assert result.index.name != 'A' + result = df.groupby("A", as_index=False).mean() + assert result.index.name != "A" result = grouped.agg(np.mean) - assert result.index.name == 'A' + assert result.index.name == "A" - result = grouped.agg({'C': np.mean, 'D': np.std}) - assert result.index.name == 'A' + result = grouped.agg({"C": np.mean, "D": np.std}) + assert result.index.name == "A" - result = grouped['C'].mean() - assert result.index.name == 'A' - result = grouped['C'].agg(np.mean) - assert result.index.name == 'A' - result = grouped['C'].agg([np.mean, np.std]) - assert result.index.name == 'A' + result = grouped["C"].mean() + assert result.index.name == "A" + result = grouped["C"].agg(np.mean) + assert result.index.name == "A" + result = grouped["C"].agg([np.mean, np.std]) + assert result.index.name == "A" - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) - assert result.index.name == 'A' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = grouped["C"].agg({"foo": np.mean, "bar": np.std}) + assert result.index.name == "A" def test_multi_func(df): - col1 = df['A'] - col2 = df['B'] + col1 = df["A"] + col2 = df["B"] grouped = df.groupby([col1.get, col2.get]) agged = grouped.mean() - expected = df.groupby(['A', 'B']).mean() + expected = df.groupby(["A", "B"]).mean() # TODO groupby get drops names - assert_frame_equal(agged.loc[:, ['C', 'D']], - expected.loc[:, ['C', 'D']], - check_names=False) + assert_frame_equal( + agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False + ) # some "groups" with no data - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, - index=['one', 'two', 'three', 'four', 'five', 'six']) + df = DataFrame( + { + "v1": 
np.random.randn(6), + "v2": np.random.randn(6), + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + }, + index=["one", "two", "three", "four", "five", "six"], + ) # only verify that it works for now - grouped = df.groupby(['k1', 'k2']) + grouped = df.groupby(["k1", "k2"]) grouped.agg(np.sum) def test_multi_key_multiple_functions(df): - grouped = df.groupby(['A', 'B'])['C'] + grouped = df.groupby(["A", "B"])["C"] agged = grouped.agg([np.mean, np.std]) - expected = DataFrame({'mean': grouped.agg(np.mean), - 'std': grouped.agg(np.std)}) + expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)}) assert_frame_equal(agged, expected) def test_frame_multi_key_function_list(): data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - grouped = data.groupby(['A', 'B']) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) + + grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] agged = grouped.agg(funcs) - expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), - grouped['F'].agg(funcs)], - keys=['D', 'E', 'F'], axis=1) - assert (isinstance(agged.index, MultiIndex)) - assert (isinstance(expected.index, MultiIndex)) + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) + assert isinstance(agged.index, MultiIndex) + assert isinstance(expected.index, MultiIndex) assert_frame_equal(agged, expected) -@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()]) +@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()]) def test_groupby_multiple_columns(df, op): data = df - grouped = data.groupby(['A', 'B']) + grouped = data.groupby(["A", "B"]) result1 = op(grouped) keys = [] values = [] - for n1, gp1 in data.groupby('A'): - for n2, gp2 in gp1.groupby('B'): + for n1, gp1 in data.groupby("A"): + for n2, gp2 in gp1.groupby("B"): keys.append((n1, n2)) - values.append(op(gp2.loc[:, ['C', 'D']])) + values.append(op(gp2.loc[:, ["C", "D"]])) - mi = MultiIndex.from_tuples(keys, names=['A', 'B']) + mi = MultiIndex.from_tuples(keys, names=["A", "B"]) expected = pd.concat(values, axis=1).T expected.index = mi # a little bit crude - for col in ['C', 'D']: + for col in ["C", "D"]: result_col = op(grouped[col]) pivoted = result1[col] exp = expected[col] @@ -537,14 +584,14 @@ def test_groupby_multiple_columns(df, op): assert_series_equal(pivoted, exp) # test single series works the same - result = data['C'].groupby([data['A'], data['B']]).mean() - expected = data.groupby(['A', 'B']).mean()['C'] + result = data["C"].groupby([data["A"], data["B"]]).mean() + expected = data.groupby(["A", "B"]).mean()["C"] assert_series_equal(result, expected) def test_groupby_as_index_agg(df): - grouped = 
df.groupby('A', as_index=False) + grouped = df.groupby("A", as_index=False) # single-key @@ -552,48 +599,46 @@ def test_groupby_as_index_agg(df): expected = grouped.mean() assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] + expected2["D"] = grouped.sum()["D"] assert_frame_equal(result2, expected2) - grouped = df.groupby('A', as_index=True) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + grouped = df.groupby("A", as_index=True) + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result3 = grouped['C'].agg({'Q': np.sum}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result3 = grouped["C"].agg({"Q": np.sum}) assert_frame_equal(result3, expected3) # multi-key - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + result2 = grouped.agg(OrderedDict([["C", np.mean], ["D", np.sum]])) expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] + expected2["D"] = grouped.sum()["D"] assert_frame_equal(result2, expected2) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) - result3 = grouped['C'].agg({'Q': np.sum}) + expected3 = grouped["C"].sum() + expected3 = DataFrame(expected3).rename(columns={"C": "Q"}) + result3 = grouped["C"].agg({"Q": np.sum}) assert_frame_equal(result3, expected3) # GH7115 & GH8112 & GH8582 - df = DataFrame(np.random.randint(0, 100, (50, 3)), - columns=['jim', 'joe', 'jolie']) - ts = Series(np.random.randint(5, 10, 50), name='jim') + df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"]) + ts = Series(np.random.randint(5, 10, 50), name="jim") gr = df.groupby(ts) gr.nth(0) # invokes set_selection_from_grouper internally assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) - for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: + for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]: gr = df.groupby(ts, as_index=False) left = getattr(gr, attr)() @@ -604,69 +649,69 @@ def test_groupby_as_index_agg(df): def test_as_index_series_return_frame(df): - grouped = df.groupby('A', as_index=False) - grouped2 = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby("A", as_index=False) + grouped2 = df.groupby(["A", "B"], as_index=False) - result = grouped['C'].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ['A', 'C']] + result = grouped["C"].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ["A", "C"]] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) - result2 = grouped2['C'].agg(np.sum) - expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] + result2 = grouped2["C"].agg(np.sum) + expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) - result = grouped['C'].sum() - expected = grouped.sum().loc[:, ['A', 'C']] + result = grouped["C"].sum() + expected = grouped.sum().loc[:, ["A", "C"]] assert isinstance(result, DataFrame) assert_frame_equal(result, expected) - result2 = 
grouped2['C'].sum() - expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] + result2 = grouped2["C"].sum() + expected2 = grouped2.sum().loc[:, ["A", "B", "C"]] assert isinstance(result2, DataFrame) assert_frame_equal(result2, expected2) def test_as_index_series_column_slice_raises(df): # GH15072 - grouped = df.groupby('A', as_index=False) + grouped = df.groupby("A", as_index=False) msg = r"Column\(s\) C already selected" with pytest.raises(IndexError, match=msg): - grouped['C'].__getitem__('D') + grouped["C"].__getitem__("D") def test_groupby_as_index_cython(df): data = df # single-key - grouped = data.groupby('A', as_index=False) + grouped = data.groupby("A", as_index=False) result = grouped.mean() - expected = data.groupby(['A']).mean() - expected.insert(0, 'A', expected.index) + expected = data.groupby(["A"]).mean() + expected.insert(0, "A", expected.index) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) # multi-key - grouped = data.groupby(['A', 'B'], as_index=False) + grouped = data.groupby(["A", "B"], as_index=False) result = grouped.mean() - expected = data.groupby(['A', 'B']).mean() + expected = data.groupby(["A", "B"]).mean() arrays = list(zip(*expected.index.values)) - expected.insert(0, 'A', arrays[0]) - expected.insert(1, 'B', arrays[1]) + expected.insert(0, "A", arrays[0]) + expected.insert(1, "B", arrays[1]) expected.index = np.arange(len(expected)) assert_frame_equal(result, expected) def test_groupby_as_index_series_scalar(df): - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) # GH #421 - result = grouped['C'].agg(len) - expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] + result = grouped["C"].agg(len) + expected = grouped.agg(len).loc[:, ["A", "B", "C"]] assert_frame_equal(result, expected) @@ -682,14 +727,13 @@ def test_groupby_as_index_corner(df, ts): def test_groupby_multiple_key(df): df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) + grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) agged = grouped.sum() assert_almost_equal(df.values, agged.values) - grouped = df.T.groupby([lambda x: x.year, - lambda x: x.month, - lambda x: x.day], axis=1) + grouped = df.T.groupby( + [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1 + ) agged = grouped.agg(lambda x: x.sum()) tm.assert_index_equal(agged.index, df.columns) @@ -702,44 +746,46 @@ def test_groupby_multiple_key(df): def test_groupby_multi_corner(df): # test that having an all-NA column doesn't mess you up df = df.copy() - df['bad'] = np.nan - agged = df.groupby(['A', 'B']).mean() + df["bad"] = np.nan + agged = df.groupby(["A", "B"]).mean() - expected = df.groupby(['A', 'B']).mean() - expected['bad'] = np.nan + expected = df.groupby(["A", "B"]).mean() + expected["bad"] = np.nan assert_frame_equal(agged, expected) def test_omit_nuisance(df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = grouped.mean() - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() assert_frame_equal(result, expected) agged = grouped.agg(np.mean) exp = grouped.mean() assert_frame_equal(agged, exp) - df = df.loc[:, ['A', 'C', 'D']] - df['E'] = datetime.now() - grouped = df.groupby('A') + df = df.loc[:, ["A", "C", "D"]] + df["E"] = datetime.now() + grouped = df.groupby("A") result = grouped.agg(np.sum) expected = grouped.sum() assert_frame_equal(result, expected) # won't work with axis = 1 - 
grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) - msg = (r'\("unsupported operand type\(s\) for \+: ' - "'Timestamp' and 'float'\"" - r", 'occurred at index 0'\)") + grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) + msg = ( + r'\("unsupported operand type\(s\) for \+: ' + "'Timestamp' and 'float'\"" + r", 'occurred at index 0'\)" + ) with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) def test_omit_nuisance_python_multiple(three_group): - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) agged = grouped.agg(np.mean) exp = grouped.mean() @@ -748,41 +794,45 @@ def test_omit_nuisance_python_multiple(three_group): def test_empty_groups_corner(mframe): # handle empty groups - df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2']), - 'k3': ['foo', 'bar'] * 3, - 'v1': np.random.randn(6), - 'v2': np.random.randn(6)}) - - grouped = df.groupby(['k1', 'k2']) + df = DataFrame( + { + "k1": np.array(["b", "b", "b", "a", "a", "a"]), + "k2": np.array(["1", "1", "1", "2", "2", "2"]), + "k3": ["foo", "bar"] * 3, + "v1": np.random.randn(6), + "v2": np.random.randn(6), + } + ) + + grouped = df.groupby(["k1", "k2"]) result = grouped.agg(np.mean) expected = grouped.mean() assert_frame_equal(result, expected) grouped = mframe[3:5].groupby(level=0) agged = grouped.apply(lambda x: x.mean()) - agged_A = grouped['A'].apply(np.mean) - assert_series_equal(agged['A'], agged_A) - assert agged.index.name == 'first' + agged_A = grouped["A"].apply(np.mean) + assert_series_equal(agged["A"], agged_A) + assert agged.index.name == "first" def test_nonsense_func(): df = DataFrame([0]) msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'" with pytest.raises(TypeError, match=msg): - df.groupby(lambda x: x + 'foo') + df.groupby(lambda x: x + "foo") def test_wrap_aggregated_output_multindex(mframe): df = mframe.T - df['baz', 'two'] = 'peekaboo' + df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] agged = df.groupby(keys).agg(np.mean) assert isinstance(agged.columns, MultiIndex) def aggfun(ser): - if ser.name == ('foo', 'one'): + if ser.name == ("foo", "one"): raise TypeError else: return ser.sum() @@ -794,28 +844,28 @@ def aggfun(ser): def test_groupby_level_apply(mframe): result = mframe.groupby(level=0).count() - assert result.index.name == 'first' + assert result.index.name == "first" result = mframe.groupby(level=1).count() - assert result.index.name == 'second' + assert result.index.name == "second" - result = mframe['A'].groupby(level=0).count() - assert result.index.name == 'first' + result = mframe["A"].groupby(level=0).count() + assert result.index.name == "first" def test_groupby_level_mapper(mframe): deleveled = mframe.reset_index() - mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} - mapper1 = {'one': 0, 'two': 0, 'three': 1} + mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1} + mapper1 = {"one": 0, "two": 0, "three": 1} result0 = mframe.groupby(mapper0, level=0).sum() result1 = mframe.groupby(mapper1, level=1).sum() - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) + mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]]) expected0 = mframe.groupby(mapped_level0).sum() expected1 = mframe.groupby(mapped_level1).sum() - 
expected0.index.name, expected1.index.name = 'first', 'second' + expected0.index.name, expected1.index.name = "first", "second" assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -823,10 +873,8 @@ def test_groupby_level_mapper(mframe): def test_groupby_level_nonmulti(): # GH 1313, GH 13901 - s = Series([1, 2, 3, 10, 4, 5, 20, 6], - Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) - expected = Series([11, 22, 3, 4, 5, 6], - Index(range(1, 7), name='foo')) + s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo")) + expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo")) result = s.groupby(level=0).sum() tm.assert_series_equal(result, expected) @@ -871,25 +919,32 @@ def test_mutate_groups(): # GH3380 - df = DataFrame({ - 'cat1': ['a'] * 8 + ['b'] * 6, - 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + - ['d'] * 2 + ['e'] * 2, - 'cat3': ['g{}'.format(x) for x in range(1, 15)], - 'val': np.random.randint(100, size=14), - }) + df = DataFrame( + { + "cat1": ["a"] * 8 + ["b"] * 6, + "cat2": ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2 + + ["f"] * 2 + + ["c"] * 2 + + ["d"] * 2 + + ["e"] * 2, + "cat3": ["g{}".format(x) for x in range(1, 15)], + "val": np.random.randint(100, size=14), + } + ) def f_copy(x): x = x.copy() - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() def f_no_copy(x): - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() + x["rank"] = x.val.rank(method="min") + return x.groupby("cat2")["rank"].min() - grpby_copy = df.groupby('cat1').apply(f_copy) - grpby_no_copy = df.groupby('cat1').apply(f_no_copy) + grpby_copy = df.groupby("cat1").apply(f_copy) + grpby_no_copy = df.groupby("cat1").apply(f_no_copy) assert_series_equal(grpby_copy, grpby_no_copy) @@ -898,18 +953,21 @@ def test_no_mutate_but_looks_like(): # GH 8467 # first show's mutation indicator # second does not, but should yield the same results - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) + df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)}) - result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) + result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key) + result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key) assert_series_equal(result1, result2) def test_groupby_series_indexed_differently(): - s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], - index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) - s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], - index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) + s1 = Series( + [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], + index=Index(["a", "b", "c", "d", "e", "f", "g"]), + ) + s2 = Series( + [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"]) + ) grouped = s1.groupby(s2) agged = grouped.mean() @@ -918,12 +976,18 @@ def test_groupby_series_indexed_differently(): def test_groupby_with_hier_columns(): - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', - 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', - 'one', 'two']])) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) index = MultiIndex.from_tuples(tuples) - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( - 
'B', 'cat'), ('A', 'dog')]) + columns = MultiIndex.from_tuples( + [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")] + ) df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) result = df.groupby(level=0).mean() @@ -939,23 +1003,24 @@ def test_groupby_with_hier_columns(): tm.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) - tm.assert_index_equal(result.columns, Index(['A', 'B'])) + tm.assert_index_equal(result.columns, Index(["A", "B"])) tm.assert_index_equal(result.index, df.index) # add a nuisance column sorted_columns, _ = columns.sortlevel(0) - df['A', 'foo'] = 'bar' + df["A", "foo"] = "bar" result = df.groupby(level=0).mean() tm.assert_index_equal(result.columns, df.columns[:-1]) def test_grouping_ndarray(df): - grouped = df.groupby(df['A'].values) + grouped = df.groupby(df["A"].values) result = grouped.sum() - expected = df.groupby('A').sum() - assert_frame_equal(result, expected, check_names=False - ) # Note: no names when grouping by value + expected = df.groupby("A").sum() + assert_frame_equal( + result, expected, check_names=False + ) # Note: no names when grouping by value def test_groupby_wrong_multi_labels(): @@ -968,7 +1033,7 @@ def test_groupby_wrong_multi_labels(): data = read_csv(StringIO(data), index_col=0) - grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + grouped = data.groupby(["foo", "bar", "baz", "spam"]) result = grouped.agg(np.mean) expected = grouped.mean() @@ -976,40 +1041,41 @@ def test_groupby_wrong_multi_labels(): def test_groupby_series_with_name(df): - result = df.groupby(df['A']).mean() - result2 = df.groupby(df['A'], as_index=False).mean() - assert result.index.name == 'A' - assert 'A' in result2 + result = df.groupby(df["A"]).mean() + result2 = df.groupby(df["A"], as_index=False).mean() + assert result.index.name == "A" + assert "A" in result2 - result = df.groupby([df['A'], df['B']]).mean() - result2 = df.groupby([df['A'], df['B']], - as_index=False).mean() - assert result.index.names == ('A', 'B') - assert 'A' in result2 - assert 'B' in result2 + result = df.groupby([df["A"], df["B"]]).mean() + result2 = df.groupby([df["A"], df["B"]], as_index=False).mean() + assert result.index.names == ("A", "B") + assert "A" in result2 + assert "B" in result2 def test_seriesgroupby_name_attr(df): # GH 6265 - result = df.groupby('A')['C'] - assert result.count().name == 'C' - assert result.mean().name == 'C' + result = df.groupby("A")["C"] + assert result.count().name == "C" + assert result.mean().name == "C" testFunc = lambda x: np.sum(x) * 2 - assert result.agg(testFunc).name == 'C' + assert result.agg(testFunc).name == "C" def test_consistency_name(): # GH 12363 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - expected = df.groupby(['A']).B.count() + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "two", "two", "two", "one", "two"], + "C": np.random.randn(8) + 1.0, + "D": np.arange(8), + } + ) + + expected = df.groupby(["A"]).B.count() result = df.B.groupby(df.A).count() assert_series_equal(result, expected) @@ -1017,23 +1083,19 @@ def test_consistency_name(): def test_groupby_name_propagation(df): # GH 6124 def summarize(df, name=None): - return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name) + return Series({"count": 1, "mean": 2, "omissions": 
3}, name=name) def summarize_random_name(df): # Provide a different name for each Series. In this case, groupby # should not attempt to propagate the Series name since they are # inconsistent. - return Series({ - 'count': 1, - 'mean': 2, - 'omissions': 3, - }, name=df.iloc[0]['A']) + return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"]) - metrics = df.groupby('A').apply(summarize) + metrics = df.groupby("A").apply(summarize) assert metrics.columns.name is None - metrics = df.groupby('A').apply(summarize, 'metrics') - assert metrics.columns.name == 'metrics' - metrics = df.groupby('A').apply(summarize_random_name) + metrics = df.groupby("A").apply(summarize, "metrics") + assert metrics.columns.name == "metrics" + metrics = df.groupby("A").apply(summarize_random_name) assert metrics.columns.name is None @@ -1047,14 +1109,13 @@ def test_groupby_nonstring_columns(): def test_groupby_mixed_type_columns(): # GH 13432, unorderable types in py3 - df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) - expected = DataFrame([[1, 2]], columns=['B', 0], - index=Index([0], name='A')) + df = DataFrame([[0, 1, 2]], columns=["A", "B", 0]) + expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A")) - result = df.groupby('A').first() + result = df.groupby("A").first() tm.assert_frame_equal(result, expected) - result = df.groupby('A').sum() + result = df.groupby("A").sum() tm.assert_frame_equal(result, expected) @@ -1087,14 +1148,14 @@ def test_series_grouper_noncontig_index(): def test_convert_objects_leave_decimal_alone(): s = Series(range(5)) - labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') + labels = np.array(["a", "b", "c", "d", "e"], dtype="O") def convert_fast(x): return Decimal(str(x.mean())) def convert_force_pure(x): # base will be length 0 - assert (len(x.values.base) > 0) + assert len(x.values.base) > 0 return Decimal(str(x.mean())) grouped = s.groupby(labels) @@ -1110,43 +1171,39 @@ def convert_force_pure(x): def test_groupby_dtype_inference_empty(): # GH 6733 - df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) - assert df['x'].dtype == np.float64 + df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")}) + assert df["x"].dtype == np.float64 - result = df.groupby('x').first() - exp_index = Index([], name='x', dtype=np.float64) - expected = DataFrame({'range': Series( - [], index=exp_index, dtype='int64')}) + result = df.groupby("x").first() + exp_index = Index([], name="x", dtype=np.float64) + expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")}) assert_frame_equal(result, expected, by_blocks=True) def test_groupby_list_infer_array_like(df): - result = df.groupby(list(df['A'])).mean() - expected = df.groupby(df['A']).mean() + result = df.groupby(list(df["A"])).mean() + expected = df.groupby(df["A"]).mean() assert_frame_equal(result, expected, check_names=False) with pytest.raises(KeyError, match=r"^'foo'$"): - df.groupby(list(df['A'][:-1])) + df.groupby(list(df["A"][:-1])) # pathological case of ambiguity - df = DataFrame({'foo': [0, 1], - 'bar': [3, 4], - 'val': np.random.randn(2)}) + df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)}) - result = df.groupby(['foo', 'bar']).mean() - expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + result = df.groupby(["foo", "bar"]).mean() + expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]] def test_groupby_keys_same_size_as_index(): # GH 11185 - freq = 's' - index = 
pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), - periods=2, freq=freq) - df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ - 'metric', 'values' - ], index=index) - result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() - expected = df.set_index([df.index, 'metric']) + freq = "s" + index = pd.date_range( + start=pd.Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq + ) + df = pd.DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index) + result = df.groupby([pd.Grouper(level=0, freq=freq), "metric"]).mean() + expected = df.set_index([df.index, "metric"]) assert_frame_equal(result, expected) @@ -1154,23 +1211,33 @@ def test_groupby_keys_same_size_as_index(): def test_groupby_one_row(): # GH 11741 msg = r"^'Z'$" - df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) + df1 = pd.DataFrame(np.random.randn(1, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): - df1.groupby('Z') - df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) + df1.groupby("Z") + df2 = pd.DataFrame(np.random.randn(2, 4), columns=list("ABCD")) with pytest.raises(KeyError, match=msg): - df2.groupby('Z') + df2.groupby("Z") def test_groupby_nat_exclude(): # GH 6992 df = pd.DataFrame( - {'values': np.random.randn(8), - 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( - '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, - pd.Timestamp('2013-01-01')], - 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) - grouped = df.groupby('dt') + { + "values": np.random.randn(8), + "dt": [ + np.nan, + pd.Timestamp("2013-01-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-02-01"), + np.nan, + pd.Timestamp("2013-01-01"), + ], + "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"], + } + ) + grouped = df.groupby("dt") expected = [pd.Index([1, 7]), pd.Index([3, 5])] keys = sorted(grouped.groups.keys()) @@ -1185,27 +1252,26 @@ def test_groupby_nat_exclude(): assert grouped.ngroups == 2 expected = { - Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), - Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) + Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.int64), + Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.int64), } for k in grouped.indices: tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) + tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) - nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], - 'nat': [pd.NaT, pd.NaT, pd.NaT]}) - assert nan_df['nan'].dtype == 'float64' - assert nan_df['nat'].dtype == 'datetime64[ns]' + nan_df = DataFrame( + {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} + ) + assert nan_df["nan"].dtype == "float64" + assert nan_df["nat"].dtype == "datetime64[ns]" - for key in ['nan', 'nat']: + for key in ["nan", "nat"]: grouped = nan_df.groupby(key) assert grouped.groups == {} assert grouped.ngroups == 0 @@ -1218,56 +1284,55 @@ def test_groupby_nat_exclude(): def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) - d['group'] = ['g1', 'g2'] - d['zeros'] = [0, 0] - d['ones'] = [1, 1] - d['label'] = ['l1', 
'l2'] - tmp = d.groupby(['group']).mean() + d["group"] = ["g1", "g2"] + d["zeros"] = [0, 0] + d["ones"] = [1, 1] + d["label"] = ["l1", "l2"] + tmp = d.groupby(["group"]).mean() res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) - tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) tm.assert_numpy_array_equal(tmp.values, res_values) def test_int32_overflow(): - B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) - )) + B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000))) A = np.arange(25000) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': np.random.randn(25000)}) - - left = df.groupby(['A', 'B', 'C', 'D']).sum() - right = df.groupby(['D', 'C', 'B', 'A']).sum() + df = DataFrame({"A": A, "B": B, "C": A, "D": B, "E": np.random.randn(25000)}) + + left = df.groupby(["A", "B", "C", "D"]).sum() + right = df.groupby(["D", "C", "B", "A"]).sum() assert len(left) == len(right) def test_groupby_sort_multi(): - df = DataFrame({'a': ['foo', 'bar', 'baz'], - 'b': [3, 2, 1], - 'c': [0, 1, 2], - 'd': np.random.randn(3)}) - - tups = [tuple(row) for row in df[['a', 'b', 'c']].values] + df = DataFrame( + { + "a": ["foo", "bar", "baz"], + "b": [3, 2, 1], + "c": [0, 1, 2], + "d": np.random.randn(3), + } + ) + + tups = [tuple(row) for row in df[["a", "b", "c"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['a', 'b', 'c'], sort=True).sum() + result = df.groupby(["a", "b", "c"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) - tups = [tuple(row) for row in df[['c', 'a', 'b']].values] + tups = [tuple(row) for row in df[["c", "a", "b"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['c', 'a', 'b'], sort=True).sum() + result = df.groupby(["c", "a", "b"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups) - tups = [tuple(x) for x in df[['b', 'c', 'a']].values] + tups = [tuple(x) for x in df[["b", "c", "a"]].values] tups = com.asarray_tuplesafe(tups) - result = df.groupby(['b', 'c', 'a'], sort=True).sum() + result = df.groupby(["b", "c", "a"], sort=True).sum() tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) - df = DataFrame({'a': [0, 1, 2, 0, 1, 2], - 'b': [0, 0, 0, 1, 1, 1], - 'd': np.random.randn(6)}) - grouped = df.groupby(['a', 'b'])['d'] + df = DataFrame( + {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)} + ) + grouped = df.groupby(["a", "b"])["d"] result = grouped.sum() def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): @@ -1275,16 +1340,17 @@ def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = com.asarray_tuplesafe(tups) expected = f(df.groupby(tups)[field]) for k, v in expected.items(): - assert (result[k] == v) + assert result[k] == v - _check_groupby(df, result, ['a', 'b'], 'd') + _check_groupby(df, result, ["a", "b"], "d") def test_dont_clobber_name_column(): - df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], - 'name': ['foo', 'bar', 'baz'] * 2}) + df = DataFrame( + {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} + ) - result = df.groupby('key').apply(lambda x: x) + result = df.groupby("key").apply(lambda x: x) assert_frame_equal(result, df) @@ -1293,14 +1359,14 @@ def test_skip_group_keys(): tsf = tm.makeTimeDataFrame() grouped = tsf.groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) + result = grouped.apply(lambda 
x: x.sort_values(by="A")[:3]) - pieces = [group.sort_values(by='A')[:3] for key, group in grouped] + pieces = [group.sort_values(by="A")[:3] for key, group in grouped] expected = pd.concat(pieces) assert_frame_equal(result, expected) - grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False) result = grouped.apply(lambda x: x.sort_values()[:3]) pieces = [group.sort_values()[:3] for key, group in grouped] @@ -1311,39 +1377,39 @@ def test_skip_group_keys(): def test_no_nonsense_name(float_frame): # GH #995 - s = float_frame['C'].copy() + s = float_frame["C"].copy() s.name = None - result = s.groupby(float_frame['A']).agg(np.sum) + result = s.groupby(float_frame["A"]).agg(np.sum) assert result.name is None def test_multifunc_sum_bug(): # GH #1065 x = DataFrame(np.arange(9).reshape(3, 3)) - x['test'] = 0 - x['fl'] = [1.3, 1.5, 1.6] + x["test"] = 0 + x["fl"] = [1.3, 1.5, 1.6] - grouped = x.groupby('test') - result = grouped.agg({'fl': 'sum', 2: 'size'}) - assert result['fl'].dtype == np.float64 + grouped = x.groupby("test") + result = grouped.agg({"fl": "sum", 2: "size"}) + assert result["fl"].dtype == np.float64 def test_handle_dict_return_value(df): def f(group): - return {'max': group.max(), 'min': group.min()} + return {"max": group.max(), "min": group.min()} def g(group): - return Series({'max': group.max(), 'min': group.min()}) + return Series({"max": group.max(), "min": group.min()}) - result = df.groupby('A')['C'].apply(f) - expected = df.groupby('A')['C'].apply(g) + result = df.groupby("A")["C"].apply(f) + expected = df.groupby("A")["C"].apply(g) assert isinstance(result, Series) assert_series_equal(result, expected) -@pytest.mark.parametrize('grouper', ['A', ['A', 'B']]) +@pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) def test_set_group_name(df, grouper): def f(group): assert group.name is not None @@ -1361,25 +1427,26 @@ def foo(x): # make sure all these work grouped.apply(f) grouped.aggregate(freduce) - grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.aggregate({"C": freduce, "D": freduce}) grouped.transform(f) - grouped['C'].apply(f) - grouped['C'].aggregate(freduce) - grouped['C'].aggregate([freduce, foo]) - grouped['C'].transform(f) + grouped["C"].apply(f) + grouped["C"].aggregate(freduce) + grouped["C"].aggregate([freduce, foo]) + grouped["C"].transform(f) def test_group_name_available_in_inference_pass(): # gh-15062 - df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) names = [] def f(group): names.append(group.name) return group.copy() - df.groupby('a', sort=False, group_keys=False).apply(f) + + df.groupby("a", sort=False, group_keys=False).apply(f) expected_names = [0, 1, 2] assert names == expected_names @@ -1387,10 +1454,10 @@ def f(group): def test_no_dummy_key_names(df): # see gh-1291 - result = df.groupby(df['A'].values).sum() + result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df['A'].values, df['B'].values]).sum() + result = df.groupby([df["A"].values, df["B"].values]).sum() assert result.index.names == (None, None) @@ -1398,26 +1465,28 @@ def test_groupby_sort_multiindex_series(): # series multiindex groupby sort argument was not being passed through # _compress_group_index # GH 9444 - index = MultiIndex(levels=[[1, 2], [1, 2]], - codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], - names=['a', 'b']) + index = MultiIndex( + levels=[[1, 2], [1, 2]], + 
codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=["a", "b"], + ) mseries = Series([0, 1, 2, 3, 4, 5], index=index) - index = MultiIndex(levels=[[1, 2], [1, 2]], - codes=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) + index = MultiIndex( + levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"] + ) mseries_result = Series([0, 2, 4], index=index) - result = mseries.groupby(level=['a', 'b'], sort=False).first() + result = mseries.groupby(level=["a", "b"], sort=False).first() assert_series_equal(result, mseries_result) - result = mseries.groupby(level=['a', 'b'], sort=True).first() + result = mseries.groupby(level=["a", "b"], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) def test_groupby_reindex_inside_function(): periods = 1000 - ind = date_range(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange( - periods), 'low': np.arange(periods)}, index=ind) + ind = date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind) def agg_before(hour, func, fix=False): """ @@ -1425,8 +1494,7 @@ def agg_before(hour, func, fix=False): """ def _func(data): - d = data.loc[data.index.map( - lambda x: x.hour < 11)].dropna() + d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna() if fix: data[data.index[0]] if len(d) == 0: @@ -1440,24 +1508,29 @@ def afunc(data): return np.max(d) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) - closure_bad = grouped.agg({'high': agg_before(11, np.max)}) - closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + closure_bad = grouped.agg({"high": agg_before(11, np.max)}) + closure_good = grouped.agg({"high": agg_before(11, np.max, True)}) assert_frame_equal(closure_bad, closure_good) def test_groupby_multiindex_missing_pair(): # GH9049 - df = DataFrame({'group1': ['a', 'a', 'a', 'b'], - 'group2': ['c', 'c', 'd', 'c'], - 'value': [1, 1, 1, 5]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - res = df_grouped.agg('sum') + df = DataFrame( + { + "group1": ["a", "a", "a", "b"], + "group2": ["c", "c", "d", "c"], + "value": [1, 1, 1, 5], + } + ) + df = df.set_index(["group1", "group2"]) + df_grouped = df.groupby(level=["group1", "group2"], sort=True) + + res = df_grouped.agg("sum") idx = MultiIndex.from_tuples( - [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) - exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) + [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"] + ) + exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"]) tm.assert_frame_equal(res, exp) @@ -1467,43 +1540,47 @@ def test_groupby_multiindex_not_lexsorted(): # define the lexsorted version lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"] + ) lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) assert lexsorted_df.columns.is_lexsorted() # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) + not_lexsorted_df = DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') + index="a", columns=["b", "c"], values="d" + ) not_lexsorted_df = not_lexsorted_df.reset_index() assert not 
not_lexsorted_df.columns.is_lexsorted() # compare the results tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - expected = lexsorted_df.groupby('a').mean() + expected = lexsorted_df.groupby("a").mean() with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.groupby('a').mean() + result = not_lexsorted_df.groupby("a").mean() tm.assert_frame_equal(expected, result) # a transforming function should work regardless of sort # GH 14776 - df = DataFrame({'x': ['a', 'a', 'b', 'a'], - 'y': [1, 1, 2, 2], - 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) + df = DataFrame( + {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]} + ).set_index(["x", "y"]) assert not df.index.is_lexsorted() for level in [0, 1, [0, 1]]: for sort in [False, True]: - result = df.groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) + result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates) expected = df tm.assert_frame_equal(expected, result) - result = df.sort_index().groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) + result = ( + df.sort_index() + .groupby(level=level, sort=sort) + .apply(DataFrame.drop_duplicates) + ) expected = df.sort_index() tm.assert_frame_equal(expected, result) @@ -1511,27 +1588,27 @@ def test_groupby_multiindex_not_lexsorted(): def test_index_label_overlaps_location(): # checking we don't have any label/location confusion in the # the wake of GH5375 - df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) - g = df.groupby(list('ababb')) + df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1]) + g = df.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] - g = ser.groupby(list('ababb')) + g = ser.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) # ... 
and again, with a generic Index of floats df.index = df.index.astype(float) - g = df.groupby(list('ababb')) + g = df.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = df.iloc[[1, 3, 4]] assert_frame_equal(actual, expected) ser = df[0] - g = ser.groupby(list('ababb')) + g = ser.groupby(list("ababb")) actual = g.filter(lambda x: len(x) > 2) expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) @@ -1541,32 +1618,37 @@ def test_transform_doesnt_clobber_ints(): # GH 7972 n = 6 x = np.arange(n) - df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) - df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) + df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x}) + df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x}) - gb = df.groupby('a') - result = gb.transform('mean') + gb = df.groupby("a") + result = gb.transform("mean") - gb2 = df2.groupby('a') - expected = gb2.transform('mean') + gb2 = df2.groupby("a") + expected = gb2.transform("mean") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings', - ['ints', 'floats'], - ['ints', 'strings']]) -@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups', - ['int_groups', 'string_groups']]) +@pytest.mark.parametrize( + "sort_column", + ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]], +) +@pytest.mark.parametrize( + "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]] +) def test_groupby_preserves_sort(sort_column, group_column): # Test to ensure that groupby always preserves sort order of original # object. Issue #8588 and #9651 df = DataFrame( - {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], - 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], - 'ints': [8, 7, 4, 5, 2, 9, 1, 1], - 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], - 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) + { + "int_groups": [3, 1, 0, 1, 0, 3, 3, 3], + "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"], + "ints": [8, 7, 4, 5, 2, 9, 1, 1], + "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], + "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"], + } + ) # Try sorting on different types and with different group types @@ -1575,6 +1657,7 @@ def test_groupby_preserves_sort(sort_column, group_column): def test_sort(x): assert_frame_equal(x, x.sort_values(by=sort_column)) + g.apply(test_sort) @@ -1586,15 +1669,20 @@ def test_group_shift_with_null_key(): # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `g.grouper.group_info` exactly # at those places, where the group-by key is partially missing. 
- df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) + df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) g = df.groupby(["A", "B"]) - expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 - else np.nan) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) result = g.shift(-1) assert_frame_equal(result, expected) @@ -1603,15 +1691,20 @@ def test_group_shift_with_null_key(): def test_group_shift_with_fill_value(): # GH #24128 n_rows = 24 - df = DataFrame([(i % 12, i % 3, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) g = df.groupby(["A", "B"]) - expected = DataFrame([(i + 12 if i < n_rows - 12 - else 0) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) result = g.shift(-1, fill_value=0)[["Z"]] assert_frame_equal(result, expected) @@ -1619,48 +1712,58 @@ def test_group_shift_with_fill_value(): def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 - df = pd.DataFrame({'eventDate': - pd.date_range(pd.datetime.today(), - periods=20, freq='M').tolist(), - 'thename': range(0, 20)}) + df = pd.DataFrame( + { + "eventDate": pd.date_range( + pd.datetime.today(), periods=20, freq="M" + ).tolist(), + "thename": range(0, 20), + } + ) - df['year'] = df.set_index('eventDate').index.year - df['month'] = df.set_index('eventDate').index.month + df["year"] = df.set_index("eventDate").index.year + df["month"] = df.set_index("eventDate").index.month with pytest.raises(KeyError, match="'badname'"): - df.reset_index().pivot_table(index='year', columns='month', - values='badname', aggfunc='count') + df.reset_index().pivot_table( + index="year", columns="month", values="badname", aggfunc="count" + ) def test_empty_dataframe_groupby(): # GH8093 - df = DataFrame(columns=['A', 'B', 'C']) + df = DataFrame(columns=["A", "B", "C"]) - result = df.groupby('A').sum() - expected = DataFrame(columns=['B', 'C'], dtype=np.float64) - expected.index.name = 'A' + result = df.groupby("A").sum() + expected = DataFrame(columns=["B", "C"], dtype=np.float64) + expected.index.name = "A" assert_frame_equal(result, expected) def test_tuple_warns(): # https://github.com/pandas-dev/pandas/issues/18314 - df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], - 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + df = pd.DataFrame( + { + ("a", "b"): [1, 1, 2, 2], + "a": [1, 1, 1, 2], + "b": [1, 2, 2, 2], + "c": [1, 1, 1, 1], + } + ) with tm.assert_produces_warning(FutureWarning) as w: - df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + df[["a", "b", "c"]].groupby(("a", "b")).c.mean() assert "Interpreting tuple 'by' as a list" in str(w[0].message) with tm.assert_produces_warning(None): - df.groupby(('a', 'b')).c.mean() + df.groupby(("a", "b")).c.mean() def test_tuple_warns_unhashable(): # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, 
index=business_dates, columns=['a', 'b']) + business_dates = date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = DataFrame(1, index=business_dates, columns=["a", "b"]) with tm.assert_produces_warning(FutureWarning) as w: df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) @@ -1670,28 +1773,36 @@ def test_tuple_warns_unhashable(): def test_tuple_correct_keyerror(): # https://github.com/pandas-dev/pandas/issues/18798 - df = pd.DataFrame(1, index=range(3), - columns=pd.MultiIndex.from_product([[1, 2], - [3, 4]])) + df = pd.DataFrame( + 1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) + ) with pytest.raises(KeyError, match=r"^\(7, 8\)$"): df.groupby((7, 8)).mean() def test_groupby_agg_ohlc_non_first(): # GH 21716 - df = pd.DataFrame([[1], [1]], columns=['foo'], - index=pd.date_range('2018-01-01', periods=2, freq='D')) - - expected = pd.DataFrame([ - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1] - ], columns=pd.MultiIndex.from_tuples(( - ('foo', 'sum', 'foo'), ('foo', 'ohlc', 'open'), - ('foo', 'ohlc', 'high'), ('foo', 'ohlc', 'low'), - ('foo', 'ohlc', 'close'))), index=pd.date_range( - '2018-01-01', periods=2, freq='D')) - - result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) + df = pd.DataFrame( + [[1], [1]], + columns=["foo"], + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + expected = pd.DataFrame( + [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], + columns=pd.MultiIndex.from_tuples( + ( + ("foo", "sum", "foo"), + ("foo", "ohlc", "open"), + ("foo", "ohlc", "high"), + ("foo", "ohlc", "low"), + ("foo", "ohlc", "close"), + ) + ), + index=pd.date_range("2018-01-01", periods=2, freq="D"), + ) + + result = df.groupby(pd.Grouper(freq="D")).agg(["sum", "ohlc"]) tm.assert_frame_equal(result, expected) @@ -1699,23 +1810,23 @@ def test_groupby_agg_ohlc_non_first(): def test_groupby_multiindex_nat(): # GH 9236 values = [ - (pd.NaT, 'a'), - (datetime(2012, 1, 2), 'a'), - (datetime(2012, 1, 2), 'b'), - (datetime(2012, 1, 3), 'a') + (pd.NaT, "a"), + (datetime(2012, 1, 2), "a"), + (datetime(2012, 1, 2), "b"), + (datetime(2012, 1, 3), "a"), ] - mi = pd.MultiIndex.from_tuples(values, names=['date', None]) + mi = pd.MultiIndex.from_tuples(values, names=["date", None]) ser = pd.Series([3, 2, 2.5, 4], index=mi) result = ser.groupby(level=1).mean() - expected = pd.Series([3., 2.5], index=["a", "b"]) + expected = pd.Series([3.0, 2.5], index=["a", "b"]) assert_series_equal(result, expected) def test_groupby_empty_list_raises(): # GH 5289 values = zip(range(10), range(10)) - df = DataFrame(values, columns=['apple', 'b']) + df = DataFrame(values, columns=["apple", "b"]) msg = "Grouper and axis must be same length" with pytest.raises(ValueError, match=msg): df.groupby([[]]) @@ -1723,18 +1834,14 @@ def test_groupby_empty_list_raises(): def test_groupby_multiindex_series_keys_len_equal_group_axis(): # GH 25704 - index_array = [ - ['x', 'x'], - ['a', 'b'], - ['k', 'k'] - ] - index_names = ['first', 'second', 'third'] + index_array = [["x", "x"], ["a", "b"], ["k", "k"]] + index_names = ["first", "second", "third"] ri = pd.MultiIndex.from_arrays(index_array, names=index_names) s = pd.Series(data=[1, 2], index=ri) - result = s.groupby(['first', 'third']).sum() + result = s.groupby(["first", "third"]).sum() - index_array = [['x'], ['k']] - index_names = ['first', 'third'] + index_array = [["x"], ["k"]] + index_names = ["first", "third"] ei = pd.MultiIndex.from_arrays(index_array, names=index_names) expected = pd.Series([3], index=ei) @@ -1744,14 +1851,12 @@ def 
test_groupby_multiindex_series_keys_len_equal_group_axis(): def test_groupby_groups_in_BaseGrouper(): # GH 26326 # Test if DataFrame grouped with a pandas.Grouper has correct groups - mi = pd.MultiIndex.from_product([['A', 'B'], - ['C', 'D']], names=['alpha', 'beta']) - df = pd.DataFrame({'foo': [1, 2, 1, 2], 'bar': [1, 2, 3, 4]}, - index=mi) - result = df.groupby([pd.Grouper(level='alpha'), 'beta']) - expected = df.groupby(['alpha', 'beta']) - assert(result.groups == expected.groups) - - result = df.groupby(['beta', pd.Grouper(level='alpha')]) - expected = df.groupby(['beta', 'alpha']) - assert(result.groups == expected.groups) + mi = pd.MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"]) + df = pd.DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi) + result = df.groupby([pd.Grouper(level="alpha"), "beta"]) + expected = df.groupby(["alpha", "beta"]) + assert result.groups == expected.groups + + result = df.groupby(["beta", pd.Grouper(level="alpha")]) + expected = df.groupby(["beta", "alpha"]) + assert result.groups == expected.groups diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 5508c290b0429..1fd67caadf2e4 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -5,69 +5,81 @@ import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, Index, MultiIndex, Series, Timestamp, - date_range) + CategoricalIndex, + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, +) from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) # selection # -------------------------------- class TestSelection: - def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) - g = df.groupby('A') - with pytest.raises(KeyError, match='"Columns not found: \'C\'"'): - g[['C']] + df = DataFrame([[1, 2]], columns=["A", "B"]) + g = df.groupby("A") + with pytest.raises(KeyError, match="\"Columns not found: 'C'\""): + g[["C"]] - with pytest.raises(KeyError, match='^[^A]+$'): + with pytest.raises(KeyError, match="^[^A]+$"): # A should not be referenced as a bad column... # will have to rethink regex if you change message! 
- g[['A', 'C']] + g[["A", "C"]] def test_groupby_duplicated_column_errormsg(self): # GH7511 - df = DataFrame(columns=['A', 'B', 'A', 'C'], - data=[range(4), range(2, 6), range(0, 8, 2)]) + df = DataFrame( + columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)] + ) msg = "Grouper for 'A' not 1-dimensional" with pytest.raises(ValueError, match=msg): - df.groupby('A') + df.groupby("A") with pytest.raises(ValueError, match=msg): - df.groupby(['A', 'B']) + df.groupby(["A", "B"]) - grouped = df.groupby('B') + grouped = df.groupby("B") c = grouped.count() assert c.columns.nlevels == 1 assert c.columns.size == 3 def test_column_select_via_attr(self, df): - result = df.groupby('A').C.sum() - expected = df.groupby('A')['C'].sum() + result = df.groupby("A").C.sum() + expected = df.groupby("A")["C"].sum() assert_series_equal(result, expected) - df['mean'] = 1.5 - result = df.groupby('A').mean() - expected = df.groupby('A').agg(np.mean) + df["mean"] = 1.5 + result = df.groupby("A").mean() + expected = df.groupby("A").agg(np.mean) assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) - result = df.groupby('A')[['C', 'D']].mean() - result2 = df.groupby('A')['C', 'D'].mean() - result3 = df.groupby('A')[df.columns[2:4]].mean() + result = df.groupby("A")[["C", "D"]].mean() + result2 = df.groupby("A")["C", "D"].mean() + result3 = df.groupby("A")[df.columns[2:4]].mean() - expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() assert_frame_equal(result, expected) assert_frame_equal(result2, expected) @@ -75,10 +87,14 @@ def test_getitem_list_of_columns(self): def test_getitem_numeric_column_names(self): # GH #13731 - df = DataFrame({0: list('abcd') * 2, - 2: np.random.randn(8), - 4: np.random.randn(8), - 6: np.random.randn(8)}) + df = DataFrame( + { + 0: list("abcd") * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8), + } + ) result = df.groupby(0)[df.columns[1:3]].mean() result2 = df.groupby(0)[2, 4].mean() result3 = df.groupby(0)[[2, 4]].mean() @@ -93,110 +109,123 @@ def test_getitem_numeric_column_names(self): # grouping # -------------------------------- -class TestGrouping: +class TestGrouping: def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index - df = DataFrame(np.arange(10).reshape(5, 2), columns=list('AB')) - for index in [tm.makeFloatIndex, tm.makeStringIndex, - tm.makeUnicodeIndex, tm.makeIntIndex, tm.makeDateIndex, - tm.makePeriodIndex]: + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB")) + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: df.index = index(len(df)) - df.groupby(list('abcde')).apply(lambda x: x) + df.groupby(list("abcde")).apply(lambda x: x) df.index = list(reversed(df.index.tolist())) - df.groupby(list('abcde')).apply(lambda x: x) + df.groupby(list("abcde")).apply(lambda x: x) def test_grouper_multilevel_freq(self): # GH 7885 # with level 
and freq specified in a pd.Grouper from datetime import date, timedelta + d0 = date.today() - timedelta(days=14) dates = date_range(d0, date.today()) - date_index = pd.MultiIndex.from_product( - [dates, dates], names=['foo', 'bar']) + date_index = pd.MultiIndex.from_product([dates, dates], names=["foo", "bar"]) df = pd.DataFrame(np.random.randint(0, 100, 225), index=date_index) # Check string level - expected = df.reset_index().groupby([pd.Grouper( - key='foo', freq='W'), pd.Grouper(key='bar', freq='W')]).sum() + expected = ( + df.reset_index() + .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")]) + .sum() + ) # reset index changes columns dtype to object - expected.columns = pd.Index([0], dtype='int64') + expected.columns = pd.Index([0], dtype="int64") - result = df.groupby([pd.Grouper(level='foo', freq='W'), pd.Grouper( - level='bar', freq='W')]).sum() + result = df.groupby( + [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")] + ).sum() assert_frame_equal(result, expected) # Check integer level - result = df.groupby([pd.Grouper(level=0, freq='W'), pd.Grouper( - level=1, freq='W')]).sum() + result = df.groupby( + [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")] + ).sum() assert_frame_equal(result, expected) def test_grouper_creation_bug(self): # GH 8795 - df = DataFrame({'A': [0, 0, 1, 1, 2, 2], 'B': [1, 2, 3, 4, 5, 6]}) - g = df.groupby('A') + df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]}) + g = df.groupby("A") expected = g.sum() - g = df.groupby(pd.Grouper(key='A')) + g = df.groupby(pd.Grouper(key="A")) result = g.sum() assert_frame_equal(result, expected) result = g.apply(lambda x: x.sum()) assert_frame_equal(result, expected) - g = df.groupby(pd.Grouper(key='A', axis=0)) + g = df.groupby(pd.Grouper(key="A", axis=0)) result = g.sum() assert_frame_equal(result, expected) # GH14334 # pd.Grouper(key=...) 
may be passed in a list - df = DataFrame({'A': [0, 0, 0, 1, 1, 1], - 'B': [1, 1, 2, 2, 3, 3], - 'C': [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]} + ) # Group by single column - expected = df.groupby('A').sum() - g = df.groupby([pd.Grouper(key='A')]) + expected = df.groupby("A").sum() + g = df.groupby([pd.Grouper(key="A")]) result = g.sum() assert_frame_equal(result, expected) # Group by two columns # using a combination of strings and Grouper objects - expected = df.groupby(['A', 'B']).sum() + expected = df.groupby(["A", "B"]).sum() # Group with two Grouper objects - g = df.groupby([pd.Grouper(key='A'), pd.Grouper(key='B')]) + g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")]) result = g.sum() assert_frame_equal(result, expected) # Group with a string and a Grouper object - g = df.groupby(['A', pd.Grouper(key='B')]) + g = df.groupby(["A", pd.Grouper(key="B")]) result = g.sum() assert_frame_equal(result, expected) # Group with a Grouper object and a string - g = df.groupby([pd.Grouper(key='A'), 'B']) + g = df.groupby([pd.Grouper(key="A"), "B"]) result = g.sum() assert_frame_equal(result, expected) # GH8866 - s = Series(np.arange(8, dtype='int64'), - index=pd.MultiIndex.from_product( - [list('ab'), range(2), - date_range('20130101', periods=2)], - names=['one', 'two', 'three'])) - result = s.groupby(pd.Grouper(level='three', freq='M')).sum() - expected = Series([28], index=Index( - [Timestamp('2013-01-31')], freq='M', name='three')) + s = Series( + np.arange(8, dtype="int64"), + index=pd.MultiIndex.from_product( + [list("ab"), range(2), date_range("20130101", periods=2)], + names=["one", "two", "three"], + ), + ) + result = s.groupby(pd.Grouper(level="three", freq="M")).sum() + expected = Series( + [28], index=Index([Timestamp("2013-01-31")], freq="M", name="three") + ) assert_series_equal(result, expected) # just specifying a level breaks - result = s.groupby(pd.Grouper(level='one')).sum() - expected = s.groupby(level='one').sum() + result = s.groupby(pd.Grouper(level="one")).sum() + expected = s.groupby(level="one").sum() assert_series_equal(result, expected) def test_grouper_column_and_index(self): @@ -204,39 +233,41 @@ def test_grouper_column_and_index(self): # Grouping a multi-index frame by a column and an index level should # be equivalent to resetting the index and grouping by two columns - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3)]) - idx.names = ['outer', 'inner'] - df_multi = pd.DataFrame({"A": np.arange(6), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_multi.reset_index().groupby(['B', 'inner']).mean() + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)] + ) + idx.names = ["outer", "inner"] + df_multi = pd.DataFrame( + {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]}, + index=idx, + ) + result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_multi.reset_index().groupby(["B", "inner"]).mean() assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_multi.reset_index().groupby(['inner', 'B']).mean() + result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_multi.reset_index().groupby(["inner", "B"]).mean() assert_frame_equal(result, 
expected) # Grouping a single-index frame by a column and the index should # be equivalent to resetting the index and grouping by two columns - df_single = df_multi.reset_index('outer') - result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean() - expected = df_single.reset_index().groupby(['B', 'inner']).mean() + df_single = df_multi.reset_index("outer") + result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean() + expected = df_single.reset_index().groupby(["B", "inner"]).mean() assert_frame_equal(result, expected) # Test the reverse grouping order - result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean() - expected = df_single.reset_index().groupby(['inner', 'B']).mean() + result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean() + expected = df_single.reset_index().groupby(["inner", "B"]).mean() assert_frame_equal(result, expected) def test_groupby_levels_and_columns(self): # GH9344, GH9049 - idx_names = ['x', 'y'] + idx_names = ["x", "y"] idx = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names) + [(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names + ) df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx) by_levels = df.groupby(level=idx_names).mean() @@ -250,26 +281,18 @@ def test_groupby_levels_and_columns(self): def test_groupby_categorical_index_and_columns(self, observed): # GH18432, adapted for GH25871 - columns = ['A', 'B', 'A', 'B'] - categories = ['B', 'A'] - data = np.array([[1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2], - [1, 2, 1, 2]], int) - cat_columns = CategoricalIndex(columns, - categories=categories, - ordered=True) + columns = ["A", "B", "A", "B"] + categories = ["B", "A"] + data = np.array( + [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int + ) + cat_columns = CategoricalIndex(columns, categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) result = df.groupby(axis=1, level=0, observed=observed).sum() - expected_data = np.array([[4, 2], - [4, 2], - [4, 2], - [4, 2], - [4, 2]], int) - expected_columns = CategoricalIndex(categories, - categories=categories, - ordered=True) + expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int) + expected_columns = CategoricalIndex( + categories, categories=categories, ordered=True + ) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected) @@ -284,20 +307,26 @@ def test_grouper_getting_correct_binner(self): # GH 10063 # using a non-time-based grouper and a time-based grouper # and specifying levels - df = DataFrame({'A': 1}, index=pd.MultiIndex.from_product( - [list('ab'), date_range('20130101', periods=80)], names=['one', - 'two'])) - result = df.groupby([pd.Grouper(level='one'), pd.Grouper( - level='two', freq='M')]).sum() - expected = DataFrame({'A': [31, 28, 21, 31, 28, 21]}, - index=MultiIndex.from_product( - [list('ab'), - date_range('20130101', freq='M', periods=3)], - names=['one', 'two'])) + df = DataFrame( + {"A": 1}, + index=pd.MultiIndex.from_product( + [list("ab"), date_range("20130101", periods=80)], names=["one", "two"] + ), + ) + result = df.groupby( + [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")] + ).sum() + expected = DataFrame( + {"A": [31, 28, 21, 31, 28, 21]}, + index=MultiIndex.from_product( + [list("ab"), date_range("20130101", freq="M", periods=3)], + names=["one", "two"], + ), + ) assert_frame_equal(result, expected) def test_grouper_iter(self, df): - assert 
sorted(df.groupby('A').grouper) == ['bar', 'foo'] + assert sorted(df.groupby("A").grouper) == ["bar", "foo"] def test_empty_groups(self, df): # see gh-1048 @@ -305,7 +334,7 @@ def test_empty_groups(self, df): df.groupby([]) def test_groupby_grouper(self, df): - grouped = df.groupby('A') + grouped = df.groupby("A") result = df.groupby(grouped.grouper).mean() expected = grouped.mean() @@ -314,13 +343,14 @@ def test_groupby_grouper(self, df): def test_groupby_dict_mapping(self): # GH #679 from pandas import Series - s = Series({'T1': 5}) - result = s.groupby({'T1': 'T2'}).agg(sum) - expected = s.groupby(['T2']).agg(sum) + + s = Series({"T1": 5}) + result = s.groupby({"T1": "T2"}).agg(sum) + expected = s.groupby(["T2"]).agg(sum) assert_series_equal(result, expected) - s = Series([1., 2., 3., 4.], index=list('abcd')) - mapping = {'a': 0, 'b': 0, 'c': 1, 'd': 1} + s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")) + mapping = {"a": 0, "b": 0, "c": 1, "d": 1} result = s.groupby(mapping).mean() result2 = s.groupby(mapping).agg(np.mean) @@ -331,7 +361,7 @@ def test_groupby_dict_mapping(self): assert_series_equal(result, expected2) def test_groupby_grouper_f_sanity_checked(self): - dates = date_range('01-Jan-2013', periods=12, freq='MS') + dates = date_range("01-Jan-2013", periods=12, freq="MS") ts = Series(np.random.randn(12), index=dates) # GH3035 @@ -348,10 +378,9 @@ def test_groupby_grouper_f_sanity_checked(self): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): - msg = ("Grouper for ''" - " not 1-dimensional") + msg = "Grouper for ''" " not 1-dimensional" with pytest.raises(ValueError, match=msg): - Grouping(df.index, df[['A', 'A']]) + Grouping(df.index, df[["A", "A"]]) def test_multiindex_passthru(self): @@ -366,76 +395,77 @@ def test_multiindex_passthru(self): def test_multiindex_negative_level(self, mframe): # GH 13901 result = mframe.groupby(level=-1).sum() - expected = mframe.groupby(level='second').sum() + expected = mframe.groupby(level="second").sum() assert_frame_equal(result, expected) result = mframe.groupby(level=-2).sum() - expected = mframe.groupby(level='first').sum() + expected = mframe.groupby(level="first").sum() assert_frame_equal(result, expected) result = mframe.groupby(level=[-2, -1]).sum() expected = mframe assert_frame_equal(result, expected) - result = mframe.groupby(level=[-1, 'first']).sum() - expected = mframe.groupby(level=['second', 'first']).sum() + result = mframe.groupby(level=[-1, "first"]).sum() + expected = mframe.groupby(level=["second", "first"]).sum() assert_frame_equal(result, expected) def test_multifunc_select_col_integer_cols(self, df): df.columns = np.arange(len(df.columns)) # it works! 
- df.groupby(1, as_index=False)[2].agg({'Q': np.mean}) + df.groupby(1, as_index=False)[2].agg({"Q": np.mean}) def test_multiindex_columns_empty_level(self): - lst = [['count', 'values'], ['to filter', '']] + lst = [["count", "values"], ["to filter", ""]] midx = MultiIndex.from_tuples(lst) - df = DataFrame([[1, 'A']], columns=midx) + df = DataFrame([[1, "A"]], columns=midx) - grouped = df.groupby('to filter').groups - assert grouped['A'] == [0] + grouped = df.groupby("to filter").groups + assert grouped["A"] == [0] - grouped = df.groupby([('to filter', '')]).groups - assert grouped['A'] == [0] + grouped = df.groupby([("to filter", "")]).groups + assert grouped["A"] == [0] - df = DataFrame([[1, 'A'], [2, 'B']], columns=midx) + df = DataFrame([[1, "A"], [2, "B"]], columns=midx) - expected = df.groupby('to filter').groups - result = df.groupby([('to filter', '')]).groups + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups assert result == expected - df = DataFrame([[1, 'A'], [2, 'A']], columns=midx) + df = DataFrame([[1, "A"], [2, "A"]], columns=midx) - expected = df.groupby('to filter').groups - result = df.groupby([('to filter', '')]).groups + expected = df.groupby("to filter").groups + result = df.groupby([("to filter", "")]).groups tm.assert_dict_equal(result, expected) def test_groupby_multiindex_tuple(self): # GH 17979 - df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], - columns=pd.MultiIndex.from_arrays( - [['a', 'b', 'b', 'c'], - [1, 1, 2, 2]])) - expected = df.groupby([('b', 1)]).groups - result = df.groupby(('b', 1)).groups + df = pd.DataFrame( + [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]], + columns=pd.MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]), + ) + expected = df.groupby([("b", 1)]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df2 = pd.DataFrame(df.values, - columns=pd.MultiIndex.from_arrays( - [['a', 'b', 'b', 'c'], - ['d', 'd', 'e', 'e']])) - expected = df2.groupby([('b', 'd')]).groups - result = df.groupby(('b', 1)).groups + df2 = pd.DataFrame( + df.values, + columns=pd.MultiIndex.from_arrays( + [["a", "b", "b", "c"], ["d", "d", "e", "e"]] + ), + ) + expected = df2.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - df3 = pd.DataFrame(df.values, - columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c']) - expected = df3.groupby([('b', 'd')]).groups - result = df.groupby(('b', 1)).groups + df3 = pd.DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"]) + expected = df3.groupby([("b", "d")]).groups + result = df.groupby(("b", 1)).groups tm.assert_dict_equal(expected, result) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level(self, sort, mframe, df): # GH 17537 frame = mframe @@ -444,14 +474,14 @@ def test_groupby_level(self, sort, mframe, df): result0 = frame.groupby(level=0, sort=sort).sum() result1 = frame.groupby(level=1, sort=sort).sum() - expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() - expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() + expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum() + expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum() - expected0.index.name = 'first' - expected1.index.name = 'second' + expected0.index.name = "first" + expected1.index.name = "second" - assert result0.index.name == 'first' - assert result1.index.name == 
'second' + assert result0.index.name == "first" + assert result1.index.name == "second" assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -459,8 +489,8 @@ def test_groupby_level(self, sort, mframe, df): assert result1.index.name == frame.index.names[1] # groupby level name - result0 = frame.groupby(level='first', sort=sort).sum() - result1 = frame.groupby(level='second', sort=sort).sum() + result0 = frame.groupby(level="first", sort=sort).sum() + result1 = frame.groupby(level="second", sort=sort).sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) @@ -478,34 +508,37 @@ def test_groupby_level(self, sort, mframe, df): def test_groupby_level_index_names(self): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) - df = DataFrame({'exp': ['A'] * 3 + ['B'] * 3, - 'var1': range(6), }).set_index('exp') - df.groupby(level='exp') + df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index( + "exp" + ) + df.groupby(level="exp") msg = "level name foo is not the name of the index" with pytest.raises(ValueError, match=msg): - df.groupby(level='foo') + df.groupby(level="foo") - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_groupby_level_with_nas(self, sort): # GH 17537 - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, - 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) + s = Series(np.arange(8.0), index=index) result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 22.], index=[0, 1]) + expected = Series([6.0, 22.0], index=[0, 1]) assert_series_equal(result, expected) - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, - 1, 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) # factorizing doesn't confuse things - s = Series(np.arange(8.), index=index) + s = Series(np.arange(8.0), index=index) result = s.groupby(level=0, sort=sort).sum() - expected = Series([6., 18.], index=[0.0, 1.0]) + expected = Series([6.0, 18.0], index=[0.0, 1.0]) assert_series_equal(result, expected) def test_groupby_args(self, mframe): @@ -520,10 +553,13 @@ def test_groupby_args(self, mframe): with pytest.raises(TypeError, match=msg): frame.groupby(by=None, level=None) - @pytest.mark.parametrize('sort,labels', [ - [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], - [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] - ]) + @pytest.mark.parametrize( + "sort,labels", + [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]], + ], + ) def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 grouped = mframe.groupby(level=0, sort=sort) @@ -537,28 +573,28 @@ def test_grouping_labels(self, mframe): def test_list_grouper_with_nat(self): # GH 14715 - df = pd.DataFrame({'date': pd.date_range('1/1/2011', - periods=365, freq='D')}) + df = pd.DataFrame({"date": pd.date_range("1/1/2011", periods=365, freq="D")}) df.iloc[-1] = pd.NaT - grouper = pd.Grouper(key='date', freq='AS') + grouper = pd.Grouper(key="date", freq="AS") # Grouper in a list grouping result = df.groupby([grouper]) - expected = {pd.Timestamp('2011-01-01'): pd.Index(list(range(364)))} + expected = {pd.Timestamp("2011-01-01"): pd.Index(list(range(364)))} 
tm.assert_dict_equal(result.groups, expected) # Test case without a list result = df.groupby(grouper) - expected = {pd.Timestamp('2011-01-01'): 365} + expected = {pd.Timestamp("2011-01-01"): 365} tm.assert_dict_equal(result.groups, expected) @pytest.mark.parametrize( - 'func,expected', + "func,expected", [ - ('transform', pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))), - ('agg', pd.Series(name=2, index=pd.Float64Index([], name=1))), - ('apply', pd.Series(name=2, index=pd.Float64Index([], name=1))), - ]) + ("transform", pd.Series(name=2, index=pd.RangeIndex(0, 0, 1))), + ("agg", pd.Series(name=2, index=pd.Float64Index([], name=1))), + ("apply", pd.Series(name=2, index=pd.Float64Index([], name=1))), + ], + ) def test_evaluate_with_empty_groups(self, func, expected): # 26208 # test transform'ing empty groups @@ -571,7 +607,7 @@ def test_evaluate_with_empty_groups(self, func, expected): def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 - s = pd.Series([], name='name') + s = pd.Series([], name="name") gr = s.groupby([]) result = gr.mean() @@ -579,32 +615,46 @@ def test_groupby_empty(self): # check group properties assert len(gr.grouper.groupings) == 1 - tm.assert_numpy_array_equal(gr.grouper.group_info[0], - np.array([], dtype=np.dtype("intp"))) + tm.assert_numpy_array_equal( + gr.grouper.group_info[0], np.array([], dtype=np.dtype("intp")) + ) - tm.assert_numpy_array_equal(gr.grouper.group_info[1], - np.array([], dtype=np.dtype('int'))) + tm.assert_numpy_array_equal( + gr.grouper.group_info[1], np.array([], dtype=np.dtype("int")) + ) assert gr.grouper.group_info[2] == 0 # check name - assert s.groupby(s).grouper.names == ['name'] + assert s.groupby(s).grouper.names == ["name"] # get_group # -------------------------------- + class TestGetGroup: def test_get_group(self): # GH 5267 # be datelike friendly - df = DataFrame({'DATE': pd.to_datetime( - ['10-Oct-2013', '10-Oct-2013', '10-Oct-2013', '11-Oct-2013', - '11-Oct-2013', '11-Oct-2013']), - 'label': ['foo', 'foo', 'bar', 'foo', 'foo', 'bar'], - 'VAL': [1, 2, 3, 4, 5, 6]}) - - g = df.groupby('DATE') + df = DataFrame( + { + "DATE": pd.to_datetime( + [ + "10-Oct-2013", + "10-Oct-2013", + "10-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + "11-Oct-2013", + ] + ), + "label": ["foo", "foo", "bar", "foo", "foo", "bar"], + "VAL": [1, 2, 3, 4, 5, 6], + } + ) + + g = df.groupby("DATE") key = list(g.groups)[0] result1 = g.get_group(key) result2 = g.get_group(Timestamp(key).to_pydatetime()) @@ -612,7 +662,7 @@ def test_get_group(self): assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) - g = df.groupby(['DATE', 'label']) + g = df.groupby(["DATE", "label"]) key = list(g.groups)[0] result1 = g.get_group(key) @@ -624,13 +674,15 @@ def test_get_group(self): # must pass a same-length tuple with multiple keys msg = "must supply a tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): - g.get_group('foo') + g.get_group("foo") with pytest.raises(ValueError, match=msg): - g.get_group(('foo')) - msg = ("must supply a same-length tuple to get_group with multiple" - " grouping keys") + g.get_group(("foo")) + msg = ( + "must supply a same-length tuple to get_group with multiple" + " grouping keys" + ) with pytest.raises(ValueError, match=msg): - g.get_group(('foo', 'bar', 'baz')) + g.get_group(("foo", "bar", "baz")) def test_get_group_empty_bins(self, observed): @@ -650,71 +702,70 @@ def test_get_group_empty_bins(self, observed): def test_get_group_grouped_by_tuple(self): # GH 
8121 - df = DataFrame([[(1, ), (1, 2), (1, ), (1, 2)]], index=['ids']).T - gr = df.groupby('ids') - expected = DataFrame({'ids': [(1, ), (1, )]}, index=[0, 2]) - result = gr.get_group((1, )) + df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T + gr = df.groupby("ids") + expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2]) + result = gr.get_group((1,)) assert_frame_equal(result, expected) - dt = pd.to_datetime(['2010-01-01', '2010-01-02', '2010-01-01', - '2010-01-02']) - df = DataFrame({'ids': [(x, ) for x in dt]}) - gr = df.groupby('ids') - result = gr.get_group(('2010-01-01', )) - expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) + dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"]) + df = DataFrame({"ids": [(x,) for x in dt]}) + gr = df.groupby("ids") + result = gr.get_group(("2010-01-01",)) + expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2]) assert_frame_equal(result, expected) def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () series = pd.Series(data, index) - grouper = pd.Grouper(freq='D') + grouper = pd.Grouper(freq="D") grouped = series.groupby(grouper) assert next(iter(grouped), None) is None def test_groupby_with_single_column(self): - df = pd.DataFrame({'a': list('abssbab')}) - tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + df = pd.DataFrame({"a": list("abssbab")}) + tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = pd.DataFrame(index=pd.Index(['a', 'b', 's'], name='a')) - tm.assert_frame_equal(df.groupby('a').count(), exp) - tm.assert_frame_equal(df.groupby('a').sum(), exp) - tm.assert_frame_equal(df.groupby('a').nth(1), exp) + exp = pd.DataFrame(index=pd.Index(["a", "b", "s"], name="a")) + tm.assert_frame_equal(df.groupby("a").count(), exp) + tm.assert_frame_equal(df.groupby("a").sum(), exp) + tm.assert_frame_equal(df.groupby("a").nth(1), exp) def test_gb_key_len_equal_axis_len(self): # GH16843 # test ensures that index and column keys are recognized correctly # when number of keys equals axis length of groupby - df = pd.DataFrame([['foo', 'bar', 'B', 1], - ['foo', 'bar', 'B', 2], - ['foo', 'baz', 'C', 3]], - columns=['first', 'second', 'third', 'one']) - df = df.set_index(['first', 'second']) - df = df.groupby(['first', 'second', 'third']).size() - assert df.loc[('foo', 'bar', 'B')] == 2 - assert df.loc[('foo', 'baz', 'C')] == 1 + df = pd.DataFrame( + [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]], + columns=["first", "second", "third", "one"], + ) + df = df.set_index(["first", "second"]) + df = df.groupby(["first", "second", "third"]).size() + assert df.loc[("foo", "bar", "B")] == 2 + assert df.loc[("foo", "baz", "C")] == 1 # groups & iteration # -------------------------------- -class TestIteration: +class TestIteration: def test_groups(self, df): - grouped = df.groupby(['A']) + grouped = df.groupby(["A"]) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in grouped.groups.items(): - assert (df.loc[v]['A'] == k).all() + assert (df.loc[v]["A"] == k).all() - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in grouped.groups.items(): - assert (df.loc[v]['A'] == k[0]).all() - assert (df.loc[v]['B'] == k[1]).all() + assert (df.loc[v]["A"] == k[0]).all() + assert (df.loc[v]["B"] == k[1]).all() def test_grouping_is_iterable(self, tsframe): # this code path isn't 
used anywhere else @@ -727,14 +778,18 @@ def test_grouping_is_iterable(self, tsframe): def test_multi_iter(self): s = Series(np.arange(6)) - k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) + k1 = np.array(["a", "a", "a", "b", "b", "b"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) grouped = s.groupby([k1, k2]) iterated = list(grouped) - expected = [('a', '1', s[[0, 2]]), ('a', '2', s[[1]]), - ('b', '1', s[[4]]), ('b', '2', s[[3, 5]])] + expected = [ + ("a", "1", s[[0, 2]]), + ("a", "2", s[[1]]), + ("b", "1", s[[4]]), + ("b", "2", s[[3, 5]]), + ] for i, ((one, two), three) in enumerate(iterated): e1, e2, e3 = expected[i] assert e1 == one @@ -742,22 +797,24 @@ def test_multi_iter(self): assert_series_equal(three, e3) def test_multi_iter_frame(self, three_group): - k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - k2 = np.array(['1', '2', '1', '2', '1', '2']) - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': k1, 'k2': k2}, - index=['one', 'two', 'three', 'four', 'five', 'six']) + k1 = np.array(["b", "b", "b", "a", "a", "a"]) + k2 = np.array(["1", "2", "1", "2", "1", "2"]) + df = DataFrame( + {"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2}, + index=["one", "two", "three", "four", "five", "six"], + ) - grouped = df.groupby(['k1', 'k2']) + grouped = df.groupby(["k1", "k2"]) # things get sorted! iterated = list(grouped) idx = df.index - expected = [('a', '1', df.loc[idx[[4]]]), - ('a', '2', df.loc[idx[[3, 5]]]), - ('b', '1', df.loc[idx[[0, 2]]]), - ('b', '2', df.loc[idx[[1]]])] + expected = [ + ("a", "1", df.loc[idx[[4]]]), + ("a", "2", df.loc[idx[[3, 5]]]), + ("b", "1", df.loc[idx[[0, 2]]]), + ("b", "2", df.loc[idx[[1]]]), + ] for i, ((one, two), three) in enumerate(iterated): e1, e2, e3 = expected[i] assert e1 == one @@ -765,82 +822,83 @@ def test_multi_iter_frame(self, three_group): assert_frame_equal(three, e3) # don't iterate through groups with no data - df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) - df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) - grouped = df.groupby(['k1', 'k2']) + df["k1"] = np.array(["b", "b", "b", "a", "a", "a"]) + df["k2"] = np.array(["1", "1", "1", "2", "2", "2"]) + grouped = df.groupby(["k1", "k2"]) groups = {key: gp for key, gp in grouped} assert len(groups) == 2 # axis = 1 - three_levels = three_group.groupby(['A', 'B', 'C']).mean() + three_levels = three_group.groupby(["A", "B", "C"]).mean() grouped = three_levels.T.groupby(axis=1, level=(1, 2)) for key, group in grouped: pass def test_dictify(self, df): - dict(iter(df.groupby('A'))) - dict(iter(df.groupby(['A', 'B']))) - dict(iter(df['C'].groupby(df['A']))) - dict(iter(df['C'].groupby([df['A'], df['B']]))) - dict(iter(df.groupby('A')['C'])) - dict(iter(df.groupby(['A', 'B'])['C'])) + dict(iter(df.groupby("A"))) + dict(iter(df.groupby(["A", "B"]))) + dict(iter(df["C"].groupby(df["A"]))) + dict(iter(df["C"].groupby([df["A"], df["B"]]))) + dict(iter(df.groupby("A")["C"])) + dict(iter(df.groupby(["A", "B"])["C"])) def test_groupby_with_small_elem(self): # GH 8542 # length=2 - df = pd.DataFrame({'event': ['start', 'start'], - 'change': [1234, 5678]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start"], "change": [1234, 5678]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups 
== 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-09-15'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 2 assert grouped.ngroups == 2 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0, 2], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) # length=3 - df = pd.DataFrame({'event': ['start', 'start', 'start'], - 'change': [1234, 5678, 9123]}, - index=pd.DatetimeIndex(['2014-09-10', '2013-10-10', - '2014-08-05'])) - grouped = df.groupby([pd.Grouper(freq='M'), 'event']) + df = pd.DataFrame( + {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]}, + index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]), + ) + grouped = df.groupby([pd.Grouper(freq="M"), "event"]) assert len(grouped.groups) == 3 assert grouped.ngroups == 3 - assert (pd.Timestamp('2014-09-30'), 'start') in grouped.groups - assert (pd.Timestamp('2013-10-31'), 'start') in grouped.groups - assert (pd.Timestamp('2014-08-31'), 'start') in grouped.groups + assert (pd.Timestamp("2014-09-30"), "start") in grouped.groups + assert (pd.Timestamp("2013-10-31"), "start") in grouped.groups + assert (pd.Timestamp("2014-08-31"), "start") in grouped.groups - res = grouped.get_group((pd.Timestamp('2014-09-30'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-09-30"), "start")) tm.assert_frame_equal(res, df.iloc[[0], :]) - res = grouped.get_group((pd.Timestamp('2013-10-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2013-10-31"), "start")) tm.assert_frame_equal(res, df.iloc[[1], :]) - res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) + res = grouped.get_group((pd.Timestamp("2014-08-31"), "start")) tm.assert_frame_equal(res, df.iloc[[2], :]) def test_grouping_string_repr(self): # GH 13394 mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) df = DataFrame([[1, 2, 3]], columns=mi) - gr = df.groupby(df[('A', 'a')]) + gr = df.groupby(df[("A", "a")]) result = gr.grouper.groupings[0].__repr__() expected = "Grouping(('A', 'a'))" diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 
141381f84300b..71d545e960566 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -5,13 +5,17 @@ from pandas.util.testing import assert_frame_equal, assert_series_equal -@pytest.fixture(params=[['inner'], ['inner', 'outer']]) +@pytest.fixture(params=[["inner"], ["inner", "outer"]]) def frame(request): levels = request.param - df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 3, 1, 2, 3], - 'A': np.arange(6), - 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) if levels: df = df.set_index(levels) @@ -20,39 +24,49 @@ def frame(request): @pytest.fixture() def series(): - df = pd.DataFrame({'outer': ['a', 'a', 'a', 'b', 'b', 'b'], - 'inner': [1, 2, 3, 1, 2, 3], - 'A': np.arange(6), - 'B': ['one', 'one', 'two', 'two', 'one', 'one']}) - s = df.set_index(['outer', 'inner', 'B'])['A'] + df = pd.DataFrame( + { + "outer": ["a", "a", "a", "b", "b", "b"], + "inner": [1, 2, 3, 1, 2, 3], + "A": np.arange(6), + "B": ["one", "one", "two", "two", "one", "one"], + } + ) + s = df.set_index(["outer", "inner", "B"])["A"] return s -@pytest.mark.parametrize('key_strs,groupers', [ - ('inner', # Index name - pd.Grouper(level='inner') - ), - (['inner'], # List of index name - [pd.Grouper(level='inner')] - ), - (['B', 'inner'], # Column and index - ['B', pd.Grouper(level='inner')] - ), - (['inner', 'B'], # Index and column - [pd.Grouper(level='inner'), 'B'])]) +@pytest.mark.parametrize( + "key_strs,groupers", + [ + ("inner", pd.Grouper(level="inner")), # Index name + (["inner"], [pd.Grouper(level="inner")]), # List of index name + (["B", "inner"], ["B", pd.Grouper(level="inner")]), # Column and index + (["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column + ], +) def test_grouper_index_level_as_string(frame, key_strs, groupers): result = frame.groupby(key_strs).mean() expected = frame.groupby(groupers).mean() assert_frame_equal(result, expected) -@pytest.mark.parametrize('levels', [ - 'inner', 'outer', 'B', - ['inner'], ['outer'], ['B'], - ['inner', 'outer'], ['outer', 'inner'], - ['inner', 'outer', 'B'], ['B', 'outer', 'inner'] -]) +@pytest.mark.parametrize( + "levels", + [ + "inner", + "outer", + "B", + ["inner"], + ["outer"], + ["B"], + ["inner", "outer"], + ["outer", "inner"], + ["inner", "outer", "B"], + ["B", "outer", "inner"], + ], +) def test_grouper_index_level_as_string_series(series, levels): # Compute expected result diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index deb0f48b9cea2..2c4b56793580c 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -8,10 +8,10 @@ def test_first_last_nth(df): # tests for first / last / nth - grouped = df.groupby('A') + grouped = df.groupby("A") first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[1, 0], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(first, expected) @@ -19,122 +19,120 @@ def test_first_last_nth(df): assert_frame_equal(nth, expected) last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[5, 7], ["B", "C", "D"]] + expected.index = Index(["bar", "foo"], name="A") 
assert_frame_equal(last, expected) nth = grouped.nth(-1) assert_frame_equal(nth, expected) nth = grouped.nth(1) - expected = df.loc[[2, 3], ['B', 'C', 'D']].copy() - expected.index = Index(['foo', 'bar'], name='A') + expected = df.loc[[2, 3], ["B", "C", "D"]].copy() + expected.index = Index(["foo", "bar"], name="A") expected = expected.sort_index() assert_frame_equal(nth, expected) # it works! - grouped['B'].first() - grouped['B'].last() - grouped['B'].nth(0) + grouped["B"].first() + grouped["B"].last() + grouped["B"].nth(0) - df.loc[df['A'] == 'foo', 'B'] = np.nan - assert isna(grouped['B'].first()['foo']) - assert isna(grouped['B'].last()['foo']) - assert isna(grouped['B'].nth(0)['foo']) + df.loc[df["A"] == "foo", "B"] = np.nan + assert isna(grouped["B"].first()["foo"]) + assert isna(grouped["B"].last()["foo"]) + assert isna(grouped["B"].nth(0)["foo"]) # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") result = g.first() - expected = df.iloc[[1, 2]].set_index('A') + expected = df.iloc[[1, 2]].set_index("A") assert_frame_equal(result, expected) - expected = df.iloc[[1, 2]].set_index('A') - result = g.nth(0, dropna='any') + expected = df.iloc[[1, 2]].set_index("A") + result = g.nth(0, dropna="any") assert_frame_equal(result, expected) def test_first_last_nth_dtypes(df_mixed_floats): df = df_mixed_floats.copy() - df['E'] = True - df['F'] = 1 + df["E"] = True + df["F"] = 1 # tests for first / last / nth - grouped = df.groupby('A') + grouped = df.groupby("A") first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(first, expected) last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(last, expected) nth = grouped.nth(1) - expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') + expected = df.loc[[3, 2], ["B", "C", "D", "E", "F"]] + expected.index = Index(["bar", "foo"], name="A") expected = expected.sort_index() assert_frame_equal(nth, expected) # GH 2763, first/last shifting dtypes idx = list(range(10)) idx.append(9) - s = Series(data=range(11), index=idx, name='IntCol') - assert s.dtype == 'int64' + s = Series(data=range(11), index=idx, name="IntCol") + assert s.dtype == "int64" f = s.groupby(level=0).first() - assert f.dtype == 'int64' + assert f.dtype == "int64" def test_nth(): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) - assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) - assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) - assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) - assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), - df.loc[[0, 2], ['A', 'B']].set_index('A')) - 
- exp = df.set_index('A') - assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) - - exp['B'] = np.nan - assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index("A")) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index("A")) + assert_frame_equal(g.nth(2), df.loc[[]].set_index("A")) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index("A")) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index("A")) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index("A")) + assert_series_equal(g.B.nth(0), df.set_index("A").B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index("A").B.iloc[[1]]) + assert_frame_equal(g[["B"]].nth(0), df.loc[[0, 2], ["A", "B"]].set_index("A")) + + exp = df.set_index("A") + assert_frame_equal(g.nth(0, dropna="any"), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna="any"), exp.iloc[[1, 2]]) + + exp["B"] = np.nan + assert_frame_equal(g.nth(7, dropna="any"), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna="any"), exp.iloc[[1, 2]]) # out of bounds, regression from 0.13.1 # GH 6621 - df = DataFrame({'color': {0: 'green', - 1: 'green', - 2: 'red', - 3: 'red', - 4: 'red'}, - 'food': {0: 'ham', - 1: 'eggs', - 2: 'eggs', - 3: 'ham', - 4: 'pork'}, - 'two': {0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997}, - 'one': {0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997}}).set_index(['color', - 'food']) + df = DataFrame( + { + "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"}, + "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"}, + "two": { + 0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997, + }, + "one": { + 0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997, + }, + } + ).set_index(["color", "food"]) result = df.groupby(level=0, as_index=False).nth(2) expected = df.iloc[[-1]] @@ -146,7 +144,7 @@ def test_nth(): # GH 7559 # from the vbench - df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64") s = df[1] g = df[0] expected = s.groupby(g).first() @@ -164,93 +162,115 @@ def test_nth(): # as it keeps the order in the series (and not the group order) # related GH 7287 expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna='all') + result = s.groupby(g, sort=False).nth(0, dropna="all") assert_series_equal(result, expected) - with pytest.raises(ValueError, match='For a DataFrame groupby'): + with pytest.raises(ValueError, match="For a DataFrame groupby"): s.groupby(g, sort=False).nth(0, dropna=True) # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.B.nth(0, dropna='all') + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") + result = g.B.nth(0, dropna="all") expected = g.B.first() assert_series_equal(result, expected) # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], - columns=['A', 'B']) - g = df.groupby('A') - - 
assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) - - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"]) + g = df.groupby("A") + + assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index("A")) + assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index("A")) + assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index("A")) + assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index("A")) + assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index("A")) + + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, fourth and last two business days for each month key = [df.index.year, df.index.month] result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) expected_dates = pd.to_datetime( - ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', - '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', - '2014/6/27', '2014/6/30']) - expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) + [ + "2014/4/1", + "2014/4/4", + "2014/4/29", + "2014/4/30", + "2014/5/1", + "2014/5/6", + "2014/5/29", + "2014/5/30", + "2014/6/2", + "2014/6/5", + "2014/6/27", + "2014/6/30", + ] + ) + expected = DataFrame(1, columns=["a", "b"], index=expected_dates) assert_frame_equal(result, expected) def test_nth_multi_index(three_group): # PR 9090, related to issue 8979 # test nth on MultiIndex, should match .first() - grouped = three_group.groupby(['A', 'B']) + grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = grouped.first() assert_frame_equal(result, expected) -@pytest.mark.parametrize('data, expected_first, expected_last', [ - ({'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}, - {'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}, - {'id': ['A'], - 'time': Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - 'foo': [1]}), - ({'id': ['A', 'B', 'A'], - 'time': [Timestamp('2012-01-01 13:00:00', - tz='America/New_York'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central'), - Timestamp('2012-03-01 12:00:00', - tz='Europe/London')], - 'foo': [1, 2, 3]}, - {'id': ['A', 'B'], - 'time': [Timestamp('2012-01-01 13:00:00', - tz='America/New_York'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central')], - 'foo': [1, 2]}, - {'id': ['A', 'B'], - 'time': [Timestamp('2012-03-01 12:00:00', - tz='Europe/London'), - Timestamp('2012-02-01 14:00:00', - tz='US/Central')], - 'foo': [3, 2]}) -]) +@pytest.mark.parametrize( + "data, expected_first, expected_last", + [ + ( + { + "id": ["A"], + 
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + { + "id": ["A"], + "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"), + "foo": [1], + }, + ), + ( + { + "id": ["A", "B", "A"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + ], + "foo": [1, 2, 3], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-01-01 13:00:00", tz="America/New_York"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [1, 2], + }, + { + "id": ["A", "B"], + "time": [ + Timestamp("2012-03-01 12:00:00", tz="Europe/London"), + Timestamp("2012-02-01 14:00:00", tz="US/Central"), + ], + "foo": [3, 2], + }, + ), + ], +) def test_first_last_tz(data, expected_first, expected_last): # GH15884 # Test that the timezone is retained when calling first @@ -258,43 +278,50 @@ def test_first_last_tz(data, expected_first, expected_last): df = DataFrame(data) - result = df.groupby('id', as_index=False).first() + result = df.groupby("id", as_index=False).first() expected = DataFrame(expected_first) - cols = ['id', 'time', 'foo'] + cols = ["id", "time", "foo"] assert_frame_equal(result[cols], expected[cols]) - result = df.groupby('id', as_index=False)['time'].first() - assert_frame_equal(result, expected[['id', 'time']]) + result = df.groupby("id", as_index=False)["time"].first() + assert_frame_equal(result, expected[["id", "time"]]) - result = df.groupby('id', as_index=False).last() + result = df.groupby("id", as_index=False).last() expected = DataFrame(expected_last) - cols = ['id', 'time', 'foo'] + cols = ["id", "time", "foo"] assert_frame_equal(result[cols], expected[cols]) - result = df.groupby('id', as_index=False)['time'].last() - assert_frame_equal(result, expected[['id', 'time']]) + result = df.groupby("id", as_index=False)["time"].last() + assert_frame_equal(result, expected[["id", "time"]]) -@pytest.mark.parametrize('method, ts, alpha', [ - ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'], - ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b'] -]) +@pytest.mark.parametrize( + "method, ts, alpha", + [ + ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"], + ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"], + ], +) def test_first_last_tz_multi_column(method, ts, alpha): # GH 21603 - category_string = pd.Series(list('abc')).astype( - 'category') - df = pd.DataFrame({'group': [1, 1, 2], - 'category_string': category_string, - 'datetimetz': pd.date_range('20130101', periods=3, - tz='US/Eastern')}) - result = getattr(df.groupby('group'), method)() + category_string = pd.Series(list("abc")).astype("category") + df = pd.DataFrame( + { + "group": [1, 1, 2], + "category_string": category_string, + "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + } + ) + result = getattr(df.groupby("group"), method)() expected = pd.DataFrame( - {'category_string': pd.Categorical( - [alpha, 'c'], dtype=category_string.dtype), - 'datetimetz': [ts, - Timestamp('2013-01-03', - tz='US/Eastern')]}, - index=pd.Index([1, 2], name='group')) + { + "category_string": pd.Categorical( + [alpha, "c"], dtype=category_string.dtype + ), + "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")], + }, + index=pd.Index([1, 2], name="group"), + ) assert_frame_equal(result, expected) @@ -302,35 +329,74 @@ def 
test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny']}) - grouped = three_group.groupby(['A', 'B']) + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + } + ) + grouped = three_group.groupby(["A", "B"]) result = grouped.nth(0) expected = DataFrame( - {'C': ['dull', 'dull', 'dull', 'dull']}, - index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], - ['one', 'two', 'one', 'two']], - names=['A', 'B'])) + {"C": ["dull", "dull", "dull", "dull"]}, + index=MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo"], ["one", "two", "one", "two"]], + names=["A", "B"], + ), + ) assert_frame_equal(result, expected) def test_groupby_head_tail(): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + g_as = df.groupby("A", as_index=True) + g_not_as = df.groupby("A", as_index=False) # as_index= False, much easier assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - empty_not_as = DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + empty_not_as = DataFrame( + columns=df.columns, index=pd.Index([], dtype=df.index.dtype) + ) + empty_not_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_not_as["B"] = empty_not_as["B"].astype(df.B.dtype) assert_frame_equal(empty_not_as, g_not_as.head(0)) assert_frame_equal(empty_not_as, g_not_as.tail(0)) assert_frame_equal(empty_not_as, g_not_as.head(-1)) @@ -346,8 +412,8 @@ def test_groupby_head_tail(): assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + empty_as["A"] = empty_not_as["A"].astype(df.A.dtype) + empty_as["B"] = empty_not_as["B"].astype(df.B.dtype) assert_frame_equal(empty_as, g_as.head(0)) assert_frame_equal(empty_as, g_as.tail(0)) assert_frame_equal(empty_as, g_as.head(-1)) @@ -358,40 +424,40 @@ def test_groupby_head_tail(): # test with selection assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + assert_frame_equal(g_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + assert_frame_equal(g_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + assert_frame_equal(g_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), 
df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + assert_frame_equal(g_not_as[["A"]].head(1), df_as.loc[[0, 2], ["A"]]) + assert_frame_equal(g_not_as[["B"]].head(1), df_as.loc[[0, 2], ["B"]]) + assert_frame_equal(g_not_as[["A", "B"]].head(1), df_as.loc[[0, 2]]) def test_group_selection_cache(): # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + expected = df.iloc[[0, 2]].set_index("A") - g = df.groupby('A') + g = df.groupby("A") result1 = g.head(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) - g = df.groupby('A') + g = df.groupby("A") result1 = g.tail(n=2) result2 = g.nth(0) assert_frame_equal(result1, df) assert_frame_equal(result2, expected) - g = df.groupby('A') + g = df.groupby("A") result1 = g.nth(0) result2 = g.head(n=2) assert_frame_equal(result1, expected) assert_frame_equal(result2, df) - g = df.groupby('A') + g = df.groupby("A") result1 = g.nth(0) result2 = g.tail(n=2) assert_frame_equal(result1, expected) @@ -400,54 +466,48 @@ def test_group_selection_cache(): def test_nth_empty(): # GH 16064 - df = DataFrame(index=[0], columns=['a', 'b', 'c']) - result = df.groupby('a').nth(10) - expected = DataFrame(index=Index([], name='a'), columns=['b', 'c']) + df = DataFrame(index=[0], columns=["a", "b", "c"]) + result = df.groupby("a").nth(10) + expected = DataFrame(index=Index([], name="a"), columns=["b", "c"]) assert_frame_equal(result, expected) - result = df.groupby(['a', 'b']).nth(10) - expected = DataFrame(index=MultiIndex([[], []], [[], []], - names=['a', 'b']), - columns=['c']) + result = df.groupby(["a", "b"]).nth(10) + expected = DataFrame( + index=MultiIndex([[], []], [[], []], names=["a", "b"]), columns=["c"] + ) assert_frame_equal(result, expected) def test_nth_column_order(): # GH 20760 # Check that nth preserves column order - df = DataFrame([[1, 'b', 100], - [1, 'a', 50], - [1, 'a', np.nan], - [2, 'c', 200], - [2, 'd', 150]], - columns=['A', 'C', 'B']) - result = df.groupby('A').nth(0) - expected = DataFrame([['b', 100.0], - ['c', 200.0]], - columns=['C', 'B'], - index=Index([1, 2], name='A')) + df = DataFrame( + [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]], + columns=["A", "C", "B"], + ) + result = df.groupby("A").nth(0) + expected = DataFrame( + [["b", 100.0], ["c", 200.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) assert_frame_equal(result, expected) - result = df.groupby('A').nth(-1, dropna='any') - expected = DataFrame([['a', 50.0], - ['d', 150.0]], - columns=['C', 'B'], - index=Index([1, 2], name='A')) + result = df.groupby("A").nth(-1, dropna="any") + expected = DataFrame( + [["a", 50.0], ["d", 150.0]], columns=["C", "B"], index=Index([1, 2], name="A") + ) assert_frame_equal(result, expected) -@pytest.mark.parametrize("dropna", [None, 'any', 'all']) +@pytest.mark.parametrize("dropna", [None, "any", "all"]) def test_nth_nan_in_grouper(dropna): # GH 26011 - df = DataFrame([ - [np.nan, 0, 1], - ['abc', 2, 3], - [np.nan, 4, 5], - ['def', 6, 7], - [np.nan, 8, 9], - ], columns=list('abc')) - result = df.groupby('a').nth(0, dropna=dropna) - expected = pd.DataFrame([[2, 3], [6, 7]], columns=list('bc'), - index=Index(['abc', 'def'], name='a')) + df = DataFrame( + [[np.nan, 0, 1], ["abc", 2, 3], [np.nan, 4, 5], ["def", 6, 7], [np.nan, 8, 9]], + 
columns=list("abc"), + ) + result = df.groupby("a").nth(0, dropna=dropna) + expected = pd.DataFrame( + [[2, 3], [6, 7]], columns=list("bc"), index=Index(["abc", "def"], name="a") + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 9b0396bb530a1..a6ea793b53c41 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -12,275 +12,410 @@ def test_rank_apply(): lab1 = np.random.randint(0, 100, size=500) lab2 = np.random.randint(0, 130, size=500) - df = DataFrame({'value': np.random.randn(500), - 'key1': lev1.take(lab1), - 'key2': lev2.take(lab2)}) + df = DataFrame( + { + "value": np.random.randn(500), + "key1": lev1.take(lab1), + "key2": lev2.take(lab2), + } + ) - result = df.groupby(['key1', 'key2']).value.rank() + result = df.groupby(["key1", "key2"]).value.rank() - expected = [piece.value.rank() - for key, piece in df.groupby(['key1', 'key2'])] + expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])] expected = concat(expected, axis=0) expected = expected.reindex(result.index) tm.assert_series_equal(result, expected) - result = df.groupby(['key1', 'key2']).value.rank(pct=True) + result = df.groupby(["key1", "key2"]).value.rank(pct=True) - expected = [piece.value.rank(pct=True) - for key, piece in df.groupby(['key1', 'key2'])] + expected = [ + piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"]) + ] expected = concat(expected, axis=0) expected = expected.reindex(result.index) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06')]]) -@pytest.mark.parametrize("ties_method,ascending,pct,exp", [ - ('average', True, False, [2., 2., 5., 2., 4.]), - ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), - ('average', False, False, [4., 4., 1., 4., 2.]), - ('average', False, True, [.8, .8, .2, .8, .4]), - ('min', True, False, [1., 1., 5., 1., 4.]), - ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), - ('min', False, False, [3., 3., 1., 3., 2.]), - ('min', False, True, [.6, .6, .2, .6, .4]), - ('max', True, False, [3., 3., 5., 3., 4.]), - ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), - ('max', False, False, [5., 5., 1., 5., 2.]), - ('max', False, True, [1., 1., .2, 1., .4]), - ('first', True, False, [1., 2., 5., 3., 4.]), - ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), - ('first', False, False, [3., 4., 1., 5., 2.]), - ('first', False, True, [.6, .8, .2, 1., .4]), - ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]), - ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. 
/ 3.]), -]) +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, 8, 2, 6], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,pct,exp", + [ + ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]), + ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]), + ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]), + ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]), + ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]), + ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]), + ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]), + ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]), + ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]), + ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]), + ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]), + ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]), + ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]), + ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]), + ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]), + ], +) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, pct=pct) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) - exp_df = DataFrame(exp * len(grps), columns=['val']) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], -]) -@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ - ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), - ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), - ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), - ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), - ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), - ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), - ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), - ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), - ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), - ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), - ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), - ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), - ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), - ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), - ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), - ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), - ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), - ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), - ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), - ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), - ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), - ('first', False, 
'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), - ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), - ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), - ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), - ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), - ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), - ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), - ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), - ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) -]) +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]] +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,exp", + [ + ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]), + ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]), + ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]), + ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]), + ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]), + ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]), + ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]), + ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]), + ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]), + ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]), + ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]), + ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]), + ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]), + ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]), + ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]), + ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]), + ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]), + ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]), + ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]), + ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]), + ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]), + ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]), + ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]), + ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]), + ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]), + ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]), + ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]), + ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]), + ], +) def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): # GH 20561 key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option) - exp_df = DataFrame(exp * len(grps), columns=['val']) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option + ) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) -@pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - 
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06'), np.nan, np.nan] -]) -@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ - ('average', True, 'keep', False, - [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), - ('average', True, 'keep', True, - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), - ('average', False, 'keep', False, - [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), - ('average', False, 'keep', True, - [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), - ('min', True, 'keep', False, - [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), - ('min', True, 'keep', True, - [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), - ('min', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('min', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('max', True, 'keep', False, - [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), - ('max', True, 'keep', True, - [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('max', False, 'keep', False, - [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), - ('max', False, 'keep', True, - [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('first', True, 'keep', False, - [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), - ('first', True, 'keep', True, - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('first', False, 'keep', False, - [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), - ('first', False, 'keep', True, - [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('dense', True, 'keep', False, - [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), - ('dense', True, 'keep', True, - [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]), - ('dense', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('dense', False, 'keep', True, - [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. 
/ 3., np.nan, np.nan]), - ('average', True, 'bottom', False, [2., 2., 7., 5., 2., 4., 7., 7.]), - ('average', True, 'bottom', True, - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), - ('average', False, 'bottom', False, [4., 4., 7., 1., 4., 2., 7., 7.]), - ('average', False, 'bottom', True, - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), - ('min', True, 'bottom', False, [1., 1., 6., 5., 1., 4., 6., 6.]), - ('min', True, 'bottom', True, - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), - ('min', False, 'bottom', False, [3., 3., 6., 1., 3., 2., 6., 6.]), - ('min', False, 'bottom', True, - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), - ('max', True, 'bottom', False, [3., 3., 8., 5., 3., 4., 8., 8.]), - ('max', True, 'bottom', True, - [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), - ('max', False, 'bottom', False, [5., 5., 8., 1., 5., 2., 8., 8.]), - ('max', False, 'bottom', True, - [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), - ('first', True, 'bottom', False, [1., 2., 6., 5., 3., 4., 7., 8.]), - ('first', True, 'bottom', True, - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), - ('first', False, 'bottom', False, [3., 4., 6., 1., 5., 2., 7., 8.]), - ('first', False, 'bottom', True, - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), - ('dense', True, 'bottom', False, [1., 1., 4., 3., 1., 2., 4., 4.]), - ('dense', True, 'bottom', True, - [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]), - ('dense', False, 'bottom', False, [3., 3., 4., 1., 3., 2., 4., 4.]), - ('dense', False, 'bottom', True, - [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.]) -]) -def test_rank_args_missing(grps, vals, ties_method, ascending, - na_option, pct, exp): +@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]]) +@pytest.mark.parametrize( + "vals", + [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], + [ + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-02"), + np.nan, + pd.Timestamp("2018-01-08"), + pd.Timestamp("2018-01-02"), + pd.Timestamp("2018-01-06"), + np.nan, + np.nan, + ], + ], +) +@pytest.mark.parametrize( + "ties_method,ascending,na_option,pct,exp", + [ + ( + "average", + True, + "keep", + False, + [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan], + ), + ( + "average", + True, + "keep", + True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + False, + [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan], + ), + ( + "average", + False, + "keep", + True, + [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan], + ), + ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]), + ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ( + "min", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]), + ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ( + "max", + False, + "keep", + False, + [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]), + ( + "first", + True, + "keep", + False, + [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan], + ), + ( + "first", + True, + "keep", + True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + False, + [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan], + ), + ( + "first", + False, + "keep", + True, + [0.6, 
0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + False, + [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + True, + "keep", + True, + [ + 1.0 / 3.0, + 1.0 / 3.0, + np.nan, + 3.0 / 3.0, + 1.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ( + "dense", + False, + "keep", + False, + [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan], + ), + ( + "dense", + False, + "keep", + True, + [ + 3.0 / 3.0, + 3.0 / 3.0, + np.nan, + 1.0 / 3.0, + 3.0 / 3.0, + 2.0 / 3.0, + np.nan, + np.nan, + ], + ), + ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]), + ( + "average", + True, + "bottom", + True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875], + ), + ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]), + ( + "average", + False, + "bottom", + True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875], + ), + ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]), + ( + "min", + True, + "bottom", + True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75], + ), + ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]), + ( + "min", + False, + "bottom", + True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75], + ), + ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]), + ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]), + ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]), + ( + "max", + False, + "bottom", + True, + [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0], + ), + ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]), + ( + "first", + True, + "bottom", + True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0], + ), + ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]), + ( + "first", + False, + "bottom", + True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0], + ), + ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]), + ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]), + ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]), + ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]), + ], +) +def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): key = np.repeat(grps, len(vals)) vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df = DataFrame({"key": key, "val": vals}) + result = df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) - exp_df = DataFrame(exp * len(grps), columns=['val']) + exp_df = DataFrame(exp * len(grps), columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("pct,exp", [ - (False, [3., 3., 3., 3., 3.]), - (True, [.6, .6, .6, .6, .6])]) +@pytest.mark.parametrize( + "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])] +) def test_rank_resets_each_group(pct, exp): df = DataFrame( - {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], - 'val': [1] * 10} + {"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10} ) - result = df.groupby('key').rank(pct=pct) - exp_df = DataFrame(exp * 2, columns=['val']) + result = df.groupby("key").rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=["val"]) tm.assert_frame_equal(result, 
exp_df) def test_rank_avg_even_vals(): - df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) - result = df.groupby('key').rank() - exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + df = DataFrame({"key": ["a"] * 4, "val": [1] * 4}) + result = df.groupby("key").rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"]) tm.assert_frame_equal(result, exp_df) -@pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'] -]) -def test_rank_object_raises(ties_method, ascending, na_option, - pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) +@pytest.mark.parametrize( + "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]] +) +def test_rank_object_raises(ties_method, ascending, na_option, pct, vals): + df = DataFrame({"key": ["foo"] * 5, "val": vals}) with pytest.raises(TypeError, match="not callable"): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) @pytest.mark.parametrize("na_option", [True, "bad", 1]) -@pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'], - [1, np.nan, 2, np.nan, 3] -]) +@pytest.mark.parametrize( + "vals", + [ + ["bar", "bar", "foo", "bar", "baz"], + ["bar", np.nan, "foo", np.nan, "baz"], + [1, np.nan, 2, np.nan, 3], + ], +) def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + df = DataFrame({"key": ["foo"] * 5, "val": vals}) msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) + df.groupby("key").rank( + method=ties_method, ascending=ascending, na_option=na_option, pct=pct + ) def test_rank_empty_group(): # see gh-22519 column = "A" - df = DataFrame({ - "A": [0, 1, 0], - "B": [1., np.nan, 2.] 
- }) + df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]}) result = df.groupby(column).B.rank(pct=True) expected = Series([0.5, np.nan, 1.0], name="B") @@ -291,12 +426,15 @@ def test_rank_empty_group(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("input_key,input_value,output_value", [ - ([1, 2], [1, 1], [1.0, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), - ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), - ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]) -]) +@pytest.mark.parametrize( + "input_key,input_value,output_value", + [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]), + ], +) def test_rank_zero_div(input_key, input_value, output_value): # GH 23666 df = DataFrame({"A": input_key, "B": input_value}) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 4ca470d316e5c..d201b887739ec 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -17,45 +17,47 @@ class TestGroupBy: - def test_groupby_with_timegrouper(self): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name - df_original = DataFrame({ - 'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [ - datetime(2013, 9, 1, 13, 0), - datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 3, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 9, 2, 14, 0), - ] - }) + df_original = DataFrame( + { + "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) # GH 6908 change target column's order - df_reordered = df_original.sort_values(by='Quantity') + df_reordered = df_original.sort_values(by="Quantity") for df in [df_original, df_reordered]: - df = df.set_index(['Date']) + df = df.set_index(["Date"]) expected = DataFrame( - {'Quantity': 0}, - index=date_range('20130901', - '20131205', freq='5D', - name='Date', closed='left')) - expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') + {"Quantity": 0}, + index=date_range( + "20130901", "20131205", freq="5D", name="Date", closed="left" + ), + ) + expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample('5D') .sum() + result1 = df.resample("5D").sum() assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum() + result2 = df_sorted.groupby(pd.Grouper(freq="5D")).sum() assert_frame_equal(result2, expected) - result3 = df.groupby(pd.Grouper(freq='5D')).sum() + result3 = df.groupby(pd.Grouper(freq="5D")).sum() assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -63,25 +65,27 @@ def test_groupby_with_timegrouper_methods(self, should_sort): # GH 3881 # make sure API of timegrouper conforms - df = pd.DataFrame({ - 'Branch': 'A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 8, 9, 3], - 'Date': [ - datetime(2013, 1, 1, 13, 0), - datetime(2013, 1, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 12, 2, 
12, 0), - datetime(2013, 12, 2, 14, 0), - ] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A B".split(), + "Buyer": "Carl Mark Carl Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 8, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ) if should_sort: - df = df.sort_values(by='Quantity', ascending=False) + df = df.sort_values(by="Quantity", ascending=False) - df = df.set_index('Date', drop=False) - g = df.groupby(pd.Grouper(freq='6M')) + df = df.set_index("Date", drop=False) + g = df.groupby(pd.Grouper(freq="6M")) assert g.group_keys assert isinstance(g.grouper, BinGrouper) @@ -94,242 +98,291 @@ def test_timegrouper_with_reg_groups(self): # GH 3794 # allow combination of timegrouper/reg groups - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime(2013, 1, 1, 13, 0), - datetime(2013, 1, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 12, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) + df_original = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1, 13, 0), + datetime(2013, 1, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 12, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - datetime(2013, 12, 31, 0, 0), - datetime(2013, 12, 31, 0, 0), - datetime(2013, 12, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + datetime(2013, 12, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="A"), "Buyer"]).sum() assert_frame_equal(result, expected) - expected = DataFrame({ - 'Buyer': 'Carl Mark Carl Joe'.split(), - 'Quantity': [1, 3, 9, 18], - 'Date': [ - datetime(2013, 1, 1, 0, 0), - datetime(2013, 1, 1, 0, 0), - datetime(2013, 7, 1, 0, 0), - datetime(2013, 7, 1, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Mark Carl Joe".split(), + "Quantity": [1, 3, 9, 18], + "Date": [ + datetime(2013, 1, 1, 0, 0), + datetime(2013, 1, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + datetime(2013, 7, 1, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + result = df.groupby([pd.Grouper(freq="6MS"), "Buyer"]).sum() assert_frame_equal(result, expected) - df_original = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime(2013, 10, 1, 13, 0), - datetime(2013, 10, 1, 
13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 2, 12, 0), - datetime(2013, 10, 2, 14, 0), - ] - }).set_index('Date') - - df_sorted = df_original.sort_values(by='Quantity', ascending=False) + df_original = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 10, 1, 13, 0), + datetime(2013, 10, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 10, 2, 14, 0), + ], + } + ).set_index("Date") + + df_sorted = df_original.sort_values(by="Quantity", ascending=False) for df in [df_original, df_sorted]: - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark Carl Joe'.split(), - 'Quantity': [6, 8, 3, 4, 10], - 'Date': [ - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 1, 0, 0), - datetime(2013, 10, 2, 0, 0), - datetime(2013, 10, 2, 0, 0), - ] - }).set_index(['Date', 'Buyer']) - - result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark Carl Joe".split(), + "Quantity": [6, 8, 3, 4, 10], + "Date": [ + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 1, 0, 0), + datetime(2013, 10, 2, 0, 0), + datetime(2013, 10, 2, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) + + result = df.groupby([pd.Grouper(freq="1D"), "Buyer"]).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ - datetime(2013, 10, 31, 0, 0), - datetime(2013, 10, 31, 0, 0), - datetime(2013, 10, 31, 0, 0), - ] - }).set_index(['Date', 'Buyer']) + result = df.groupby([pd.Grouper(freq="1M"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + datetime(2013, 10, 31, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) assert_frame_equal(result, expected) # passing the name df = df.reset_index() - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() assert_frame_equal(result, expected) with pytest.raises(KeyError): - df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="foo"), "Buyer"]).sum() # passing the level - df = df.set_index('Date') - result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer' - ]).sum() + df = df.set_index("Date") + result = df.groupby([pd.Grouper(freq="1M", level="Date"), "Buyer"]).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum( - ) + result = df.groupby([pd.Grouper(freq="1M", level=0), "Buyer"]).sum() assert_frame_equal(result, expected) with pytest.raises(ValueError): - df.groupby([pd.Grouper(freq='1M', level='foo'), - 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", level="foo"), "Buyer"]).sum() # multi names df = df.copy() - df['Date'] = df.index + pd.offsets.MonthEnd(2) - result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer' - ]).sum() - expected = DataFrame({ - 'Buyer': 'Carl Joe Mark'.split(), - 'Quantity': [10, 18, 3], - 'Date': [ 
- datetime(2013, 11, 30, 0, 0), - datetime(2013, 11, 30, 0, 0), - datetime(2013, 11, 30, 0, 0), - ] - }).set_index(['Date', 'Buyer']) + df["Date"] = df.index + pd.offsets.MonthEnd(2) + result = df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() + expected = DataFrame( + { + "Buyer": "Carl Joe Mark".split(), + "Quantity": [10, 18, 3], + "Date": [ + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + datetime(2013, 11, 30, 0, 0), + ], + } + ).set_index(["Date", "Buyer"]) assert_frame_equal(result, expected) # error as we have both a level and a name! with pytest.raises(ValueError): - df.groupby([pd.Grouper(freq='1M', key='Date', - level='Date'), 'Buyer']).sum() + df.groupby( + [pd.Grouper(freq="1M", key="Date", level="Date"), "Buyer"] + ).sum() # single groupers - expected = DataFrame({'Quantity': [31], - 'Date': [datetime(2013, 10, 31, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M')).sum() + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 10, 31, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M")).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M')]).sum() + result = df.groupby([pd.Grouper(freq="1M")]).sum() assert_frame_equal(result, expected) - expected = DataFrame({'Quantity': [31], - 'Date': [datetime(2013, 11, 30, 0, 0) - ]}).set_index('Date') - result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum() + expected = DataFrame( + {"Quantity": [31], "Date": [datetime(2013, 11, 30, 0, 0)]} + ).set_index("Date") + result = df.groupby(pd.Grouper(freq="1M", key="Date")).sum() assert_frame_equal(result, expected) - result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() + result = df.groupby([pd.Grouper(freq="1M", key="Date")]).sum() assert_frame_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR']) + @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort - df = DataFrame({ - 'date': pd.to_datetime([ - '20121002', '20121007', '20130130', '20130202', '20130305', - '20121002', '20121207', '20130130', '20130202', '20130305', - '20130202', '20130305' - ]), - 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], - 'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, - 359, 801], - 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] - }).set_index('date') + df = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + 364, + 280, + 259, + 201, + 623, + 90, + 312, + 359, + 301, + 359, + 801, + ], + "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12], + } + ).set_index("date") expected = ( - df.groupby('user_id')['whole_cost'] - .resample(freq) - .sum(min_count=1) # XXX - .dropna() - .reorder_levels(['date', 'user_id']) - .sort_index() - .astype('int64') + df.groupby("user_id")["whole_cost"] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(["date", "user_id"]) + .sort_index() + .astype("int64") ) - expected.name = 'whole_cost' + expected.name = "whole_cost" - result1 = df.sort_index().groupby([pd.Grouper(freq=freq), - 'user_id'])['whole_cost'].sum() + result1 = ( + df.sort_index() + .groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"] + .sum() + ) 
assert_series_equal(result1, expected) - result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ - 'whole_cost'].sum() + result2 = df.groupby([pd.Grouper(freq=freq), "user_id"])["whole_cost"].sum() assert_series_equal(result2, expected) def test_timegrouper_get_group(self): # GH 6914 - df_original = DataFrame({ - 'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(), - 'Quantity': [18, 3, 5, 1, 9, 3], - 'Date': [datetime(2013, 9, 1, 13, 0), - datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 3, 10, 0), - datetime(2013, 12, 2, 12, 0), - datetime(2013, 9, 2, 14, 0), ] - }) - df_reordered = df_original.sort_values(by='Quantity') + df_original = DataFrame( + { + "Buyer": "Carl Joe Joe Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + df_reordered = df_original.sort_values(by="Quantity") # single grouping - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] - dt_list = ['2013-09-30', '2013-10-31', '2013-12-31'] + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] + dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"] for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M', key='Date')) + grouped = df.groupby(pd.Grouper(freq="M", key="Date")) for t, expected in zip(dt_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group(dt) assert_frame_equal(result, expected) # multiple grouping - expected_list = [df_original.iloc[[1]], df_original.iloc[[3]], - df_original.iloc[[4]]] - g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'), - ('Joe', '2013-12-31')] + expected_list = [ + df_original.iloc[[1]], + df_original.iloc[[3]], + df_original.iloc[[4]], + ] + g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")] for df in [df_original, df_reordered]: - grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')]) + grouped = df.groupby(["Buyer", pd.Grouper(freq="M", key="Date")]) for (b, t), expected in zip(g_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group((b, dt)) assert_frame_equal(result, expected) # with index - df_original = df_original.set_index('Date') - df_reordered = df_original.sort_values(by='Quantity') + df_original = df_original.set_index("Date") + df_reordered = df_original.sort_values(by="Quantity") - expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]], - df_original.iloc[[4]]] + expected_list = [ + df_original.iloc[[0, 1, 5]], + df_original.iloc[[2, 3]], + df_original.iloc[[4]], + ] for df in [df_original, df_reordered]: - grouped = df.groupby(pd.Grouper(freq='M')) + grouped = df.groupby(pd.Grouper(freq="M")) for t, expected in zip(dt_list, expected_list): dt = pd.Timestamp(t) result = grouped.get_group(dt) @@ -339,44 +392,43 @@ def test_timegrouper_apply_return_type_series(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. 
# Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) + df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_series(x): - return pd.Series([x['value'].sum()], ('sum',)) + return pd.Series([x["value"].sum()], ("sum",)) - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series) - result = (df_dt.groupby(pd.Grouper(freq='M', key='date')) - .apply(sumfunc_series)) - assert_frame_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_series) + result = df_dt.groupby(pd.Grouper(freq="M", key="date")).apply(sumfunc_series) + assert_frame_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) def test_timegrouper_apply_return_type_value(self): # Using `apply` with the `TimeGrouper` should give the # same return type as an `apply` with a `Grouper`. # Issue #11742 - df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'], - 'value': [10, 13]}) + df = pd.DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]}) df_dt = df.copy() - df_dt['date'] = pd.to_datetime(df_dt['date']) + df_dt["date"] = pd.to_datetime(df_dt["date"]) def sumfunc_value(x): return x.value.sum() - expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value) - result = (df_dt.groupby(Grouper(freq='M', key='date')) - .apply(sumfunc_value)) - assert_series_equal(result.reset_index(drop=True), - expected.reset_index(drop=True)) + expected = df.groupby(pd.Grouper(key="date")).apply(sumfunc_value) + result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value) + assert_series_equal( + result.reset_index(drop=True), expected.reset_index(drop=True) + ) def test_groupby_groups_datetimeindex(self): # GH#1430 periods = 1000 - ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange(periods), - 'low': np.arange(periods)}, index=ind) + ind = pd.date_range(start="2012/1/1", freq="5min", periods=periods) + df = DataFrame( + {"high": np.arange(periods), "low": np.arange(periods)}, index=ind + ) grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) # it works! 
@@ -384,76 +436,94 @@ def test_groupby_groups_datetimeindex(self): assert isinstance(list(groups.keys())[0], datetime) # GH#11442 - index = pd.date_range('2015/01/01', periods=5, name='date') - df = pd.DataFrame({'A': [5, 6, 7, 8, 9], - 'B': [1, 2, 3, 4, 5]}, index=index) - result = df.groupby(level='date').groups - dates = ['2015-01-05', '2015-01-04', '2015-01-03', - '2015-01-02', '2015-01-01'] - expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date') - for date in dates} + index = pd.date_range("2015/01/01", periods=5, name="date") + df = pd.DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index) + result = df.groupby(level="date").groups + dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"] + expected = { + pd.Timestamp(date): pd.DatetimeIndex([date], name="date") for date in dates + } tm.assert_dict_equal(result, expected) - grouped = df.groupby(level='date') + grouped = df.groupby(level="date") for date in dates: result = grouped.get_group(date) - data = [[df.loc[date, 'A'], df.loc[date, 'B']]] - expected_index = pd.DatetimeIndex([date], name='date') - expected = pd.DataFrame(data, - columns=list('AB'), - index=expected_index) + data = [[df.loc[date, "A"], df.loc[date, "B"]]] + expected_index = pd.DatetimeIndex([date], name="date") + expected = pd.DataFrame(data, columns=list("AB"), index=expected_index) tm.assert_frame_equal(result, expected) def test_groupby_groups_datetimeindex_tz(self): # GH 3950 - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'datetime': dates, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['datetime'] = df['datetime'].apply( - lambda d: Timestamp(d, tz='US/Pacific')) - - exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - tz='US/Pacific', name='datetime') - exp_idx2 = Index(['a', 'b'] * 3, name='label') + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": dates, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + + exp_idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Pacific", + name="datetime", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) - result = df.groupby(['datetime', 'label']).sum() + result = df.groupby(["datetime", "label"]).sum() assert_frame_equal(result, expected) # by level - didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) + didx = pd.DatetimeIndex(dates, tz="Asia/Tokyo") + df = DataFrame( + {"value1": np.arange(6, 
dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='Asia/Tokyo') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="Asia/Tokyo", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) result = df.groupby(level=0).sum() assert_frame_equal(result, expected) def test_frame_datetime64_handling_groupby(self): # it works! - df = DataFrame([(3, np.datetime64('2012-07-03')), - (3, np.datetime64('2012-07-04'))], - columns=['a', 'date']) - result = df.groupby('a').first() - assert result['date'][3] == Timestamp('2012-07-03') + df = DataFrame( + [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))], + columns=["a", "date"], + ) + result = df.groupby("a").first() + assert result["date"][3] == Timestamp("2012-07-03") def test_groupby_multi_timezone(self): @@ -465,77 +535,99 @@ def test_groupby_multi_timezone(self): 3,2000-01-31 16:50:00,America/Chicago 4,2000-01-01 16:50:00,America/New_York""" - df = pd.read_csv(StringIO(data), header=None, - names=['value', 'date', 'tz']) - result = df.groupby('tz').date.apply( - lambda x: pd.to_datetime(x).dt.tz_localize(x.name)) - - expected = Series([Timestamp('2000-01-28 16:47:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-29 16:48:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-30 16:49:00-0800', - tz='America/Los_Angeles'), - Timestamp('2000-01-31 16:50:00-0600', - tz='America/Chicago'), - Timestamp('2000-01-01 16:50:00-0500', - tz='America/New_York')], - name='date', - dtype=object) + df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"]) + result = df.groupby("tz").date.apply( + lambda x: pd.to_datetime(x).dt.tz_localize(x.name) + ) + + expected = Series( + [ + Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"), + Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"), + Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"), + Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"), + Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"), + ], + name="date", + dtype=object, + ) assert_series_equal(result, expected) - tz = 'America/Chicago' - res_values = df.groupby('tz').date.get_group(tz) + tz = "America/Chicago" + res_values = df.groupby("tz").date.get_group(tz) result = pd.to_datetime(res_values).dt.tz_localize(tz) - exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00', - '2000-01-31 16:50:00'], - index=[0, 1, 3], name='date') + exp_values = Series( + ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"], + index=[0, 1, 3], + name="date", + ) expected = pd.to_datetime(exp_values).dt.tz_localize(tz) assert_series_equal(result, expected) def test_groupby_groups_periods(self): - dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'period': [pd.Period(d, freq='H') for d in dates], - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - - exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 09:00:00'], - 
freq='H', name='period') - exp_idx2 = Index(['a', 'b'] * 3, name='label') + dates = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "period": [pd.Period(d, freq="H") for d in dates], + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + + exp_idx1 = pd.PeriodIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 09:00:00", + ], + freq="H", + name="period", + ) + exp_idx2 = Index(["a", "b"] * 3, name="label") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], - 'value2': [1, 2, 2, 1, 1, 2]}, - index=exp_idx, columns=['value1', 'value2']) + expected = DataFrame( + {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]}, + index=exp_idx, + columns=["value1", "value2"], + ) - result = df.groupby(['period', 'label']).sum() + result = df.groupby(["period", "label"]).sum() assert_frame_equal(result, expected) # by level - didx = pd.PeriodIndex(dates, freq='H') - df = DataFrame({'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2, 3, 1, 2, 3]}, - index=didx) + didx = pd.PeriodIndex(dates, freq="H") + df = DataFrame( + {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]}, + index=didx, + ) - exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], freq='H') - expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, - index=exp_idx, columns=['value1', 'value2']) + exp_idx = pd.PeriodIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + freq="H", + ) + expected = DataFrame( + {"value1": [3, 5, 7], "value2": [2, 4, 6]}, + index=exp_idx, + columns=["value1", "value2"], + ) result = df.groupby(level=0).sum() assert_frame_equal(result, expected) def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) - df[1] = df[1].view('M8[ns]') + df[1] = df[1].view("M8[ns]") assert issubclass(df[1].dtype.type, np.datetime64) @@ -550,50 +642,52 @@ def test_groupby_first_datetime64(self): def test_groupby_max_datetime64(self): # GH 5869 # datetimelike dtype conversion from int - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = df.groupby('A')['A'].apply(lambda x: x.max()) - result = df.groupby('A')['A'].max() + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + expected = df.groupby("A")["A"].apply(lambda x: x.max()) + result = df.groupby("A")["A"].max() assert_series_equal(result, expected) def test_groupby_datetime64_32_bit(self): # GH 6410 / numpy 4328 # 32-bit under 1.9-dev indexing issue - df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) + df = DataFrame({"A": range(2), "B": [pd.Timestamp("2000-01-1")] * 2}) result = df.groupby("A")["B"].transform(min) - expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') + expected = Series([pd.Timestamp("2000-01-1")] * 2, name="B") assert_series_equal(result, expected) def test_groupby_with_timezone_selection(self): # GH 11616 # Test that column selection returns output in correct timezone. 
np.random.seed(42) - df = pd.DataFrame({ - 'factor': np.random.randint(0, 3, size=60), - 'time': pd.date_range('01/01/2000 00:00', periods=60, - freq='s', tz='UTC') - }) - df1 = df.groupby('factor').max()['time'] - df2 = df.groupby('factor')['time'].max() + df = pd.DataFrame( + { + "factor": np.random.randint(0, 3, size=60), + "time": pd.date_range( + "01/01/2000 00:00", periods=60, freq="s", tz="UTC" + ), + } + ) + df1 = df.groupby("factor").max()["time"] + df2 = df.groupby("factor")["time"].max() tm.assert_series_equal(df1, df2) def test_timezone_info(self): # see gh-11682: Timezone info lost when broadcasting # scalar datetime to DataFrame - df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]}) - assert df['b'][0].tzinfo == pytz.utc - df = pd.DataFrame({'a': [1, 2, 3]}) - df['b'] = datetime.now(pytz.utc) - assert df['b'][0].tzinfo == pytz.utc + df = pd.DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]}) + assert df["b"][0].tzinfo == pytz.utc + df = pd.DataFrame({"a": [1, 2, 3]}) + df["b"] = datetime.now(pytz.utc) + assert df["b"][0].tzinfo == pytz.utc def test_datetime_count(self): - df = DataFrame({'a': [1, 2, 3] * 2, - 'dates': pd.date_range('now', periods=6, freq='T')}) - result = df.groupby('a').dates.count() - expected = Series([ - 2, 2, 2 - ], index=Index([1, 2, 3], name='a'), name='dates') + df = DataFrame( + {"a": [1, 2, 3] * 2, "dates": pd.date_range("now", periods=6, freq="T")} + ) + result = df.groupby("a").dates.count() + expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates") tm.assert_series_equal(result, expected) def test_first_last_max_min_on_time_data(self): @@ -601,16 +695,25 @@ def test_first_last_max_min_on_time_data(self): # Verify that NaT is not in the result of max, min, first and last on # Dataframe with datetime or timedelta values. 
from datetime import timedelta as td + df_test = DataFrame( - {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11', - '2015-07-23 12:12', nan], - 'td': [nan, td(days=1), td(days=2), td(days=3), nan]}) + { + "dt": [ + nan, + "2015-07-24 10:10", + "2015-07-25 11:11", + "2015-07-23 12:12", + nan, + ], + "td": [nan, td(days=1), td(days=2), td(days=3), nan], + } + ) df_test.dt = pd.to_datetime(df_test.dt) - df_test['group'] = 'A' + df_test["group"] = "A" df_ref = df_test[df_test.dt.notna()] - grouped_test = df_test.groupby('group') - grouped_ref = df_ref.groupby('group') + grouped_test = df_test.groupby("group") + grouped_ref = df_ref.groupby("group") assert_frame_equal(grouped_ref.max(), grouped_test.max()) assert_frame_equal(grouped_ref.min(), grouped_test.min()) @@ -619,28 +722,34 @@ def test_first_last_max_min_on_time_data(self): def test_nunique_with_timegrouper_and_nat(self): # GH 17575 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - pd.NaT, - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}) - - grouper = pd.Grouper(key='time', freq='h') - result = test.groupby(grouper)['data'].nunique() - expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() + test = pd.DataFrame( + { + "time": [ + Timestamp("2016-06-28 09:35:35"), + pd.NaT, + Timestamp("2016-06-28 16:46:28"), + ], + "data": ["1", "2", "3"], + } + ) + + grouper = pd.Grouper(key="time", freq="h") + result = test.groupby(grouper)["data"].nunique() + expected = test[test.time.notnull()].groupby(grouper)["data"].nunique() tm.assert_series_equal(result, expected) def test_scalar_call_versus_list_call(self): # Issue: 17530 data_frame = { - 'location': ['shanghai', 'beijing', 'shanghai'], - 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15', - '2017-08-11 22:23:15'], - dtype='datetime64[ns]'), - 'value': [1, 2, 3] + "location": ["shanghai", "beijing", "shanghai"], + "time": pd.Series( + ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"], + dtype="datetime64[ns]", + ), + "value": [1, 2, 3], } - data_frame = pd.DataFrame(data_frame).set_index('time') - grouper = pd.Grouper(freq='D') + data_frame = pd.DataFrame(data_frame).set_index("time") + grouper = pd.Grouper(freq="D") grouped = data_frame.groupby(grouper) result = grouped.count() diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 6ed2e178a7fc7..705e4080cf34e 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -10,7 +10,14 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timestamp, concat, date_range) + Categorical, + DataFrame, + MultiIndex, + Series, + Timestamp, + concat, + date_range, +) from pandas.core.groupby.groupby import DataError from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -36,79 +43,92 @@ def test_transform(): # make sure that we preserve the input order df = DataFrame( - np.arange(6, dtype='int64').reshape( - 3, 2), columns=["a", "b"], index=[0, 2, 1]) + np.arange(6, dtype="int64").reshape(3, 2), columns=["a", "b"], index=[0, 2, 1] + ) key = [0, 0, 1] - expected = df.sort_index().groupby(key).transform( - lambda x: x - x.mean()).groupby(key).mean() - result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( - key).mean() + expected = ( + df.sort_index() + .groupby(key) + .transform(lambda x: x - x.mean()) + .groupby(key) + .mean() + ) + result = df.groupby(key).transform(lambda x: x - 
x.mean()).groupby(key).mean() assert_frame_equal(result, expected) def demean(arr): return arr - arr.mean() - people = DataFrame(np.random.randn(5, 5), - columns=['a', 'b', 'c', 'd', 'e'], - index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) - key = ['one', 'two', 'one', 'two', 'one'] + people = DataFrame( + np.random.randn(5, 5), + columns=["a", "b", "c", "d", "e"], + index=["Joe", "Steve", "Wes", "Jim", "Travis"], + ) + key = ["one", "two", "one", "two", "one"] result = people.groupby(key).transform(demean).groupby(key).mean() expected = people.groupby(key).apply(demean).groupby(key).mean() assert_frame_equal(result, expected) # GH 8430 df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq='M')) + g = df.groupby(pd.Grouper(freq="M")) g.transform(lambda x: x - 1) # GH 9700 - df = DataFrame({'a': range(5, 10), 'b': range(5)}) - result = df.groupby('a').transform(max) - expected = DataFrame({'b': range(5)}) + df = DataFrame({"a": range(5, 10), "b": range(5)}) + result = df.groupby("a").transform(max) + expected = DataFrame({"b": range(5)}) tm.assert_frame_equal(result, expected) def test_transform_fast(): - df = DataFrame({'id': np.arange(100000) / 3, - 'val': np.random.randn(100000)}) + df = DataFrame({"id": np.arange(100000) / 3, "val": np.random.randn(100000)}) - grp = df.groupby('id')['val'] + grp = df.groupby("id")["val"] - values = np.repeat(grp.mean().values, - ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name='val') + values = np.repeat(grp.mean().values, ensure_platform_int(grp.count().values)) + expected = pd.Series(values, index=df.index, name="val") result = grp.transform(np.mean) assert_series_equal(result, expected) - result = grp.transform('mean') + result = grp.transform("mean") assert_series_equal(result, expected) # GH 12737 - df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], - 'd': pd.date_range('2014-1-1', '2014-1-4'), - 'i': [1, 2, 3, 4]}, - columns=['grouping', 'f', 'i', 'd']) - result = df.groupby('grouping').transform('first') - - dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] - expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], - 'd': dates, - 'i': [1, 2, 2, 4]}, - columns=['f', 'i', 'd']) + df = pd.DataFrame( + { + "grouping": [0, 1, 1, 3], + "f": [1.1, 2.1, 3.1, 4.5], + "d": pd.date_range("2014-1-1", "2014-1-4"), + "i": [1, 2, 3, 4], + }, + columns=["grouping", "f", "i", "d"], + ) + result = df.groupby("grouping").transform("first") + + dates = [ + pd.Timestamp("2014-1-1"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.DataFrame( + {"f": [1.1, 2.1, 2.1, 4.5], "d": dates, "i": [1, 2, 2, 4]}, + columns=["f", "i", "d"], + ) assert_frame_equal(result, expected) # selection - result = df.groupby('grouping')[['f', 'i']].transform('first') - expected = expected[['f', 'i']] + result = df.groupby("grouping")[["f", "i"]].transform("first") + expected = expected[["f", "i"]] assert_frame_equal(result, expected) # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) - result = df.groupby('g').transform('first') - expected = df.drop('g', axis=1) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["g", "a", "a"]) + result = df.groupby("g").transform("first") + expected = df.drop("g", axis=1) assert_frame_equal(result, expected) @@ -130,8 +150,7 @@ def test_transform_broadcast(tsframe, ts): assert_fp_equal(res[col], agged[col]) # group columns - grouped = 
tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis=1) + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) tm.assert_index_equal(result.columns, tsframe.columns) @@ -152,33 +171,32 @@ def test_transform_axis(tsframe): base = tsframe.iloc[0:5] r = len(base.index) c = len(base.columns) - tso = DataFrame(np.random.randn(r, c), - index=base.index, - columns=base.columns, - dtype='float64') + tso = DataFrame( + np.random.randn(r, c), index=base.index, columns=base.columns, dtype="float64" + ) # monotonic ts = tso grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) assert_frame_equal(result, expected) ts = ts.T grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) assert_frame_equal(result, expected) # non-monotonic ts = tso.iloc[[1, 0] + list(range(2, len(base)))] grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: x - x.mean()) assert_frame_equal(result, expected) ts = ts.T grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') + result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) assert_frame_equal(result, expected) @@ -187,7 +205,7 @@ def test_transform_dtype(): # GH 9807 # Check transform dtype output is preserved df = DataFrame([[1, 3], [2, 3]]) - result = df.groupby(1).transform('mean') + result = df.groupby(1).transform("mean") expected = DataFrame([[1.5], [1.5]]) assert_frame_equal(result, expected) @@ -195,63 +213,63 @@ def test_transform_dtype(): def test_transform_bug(): # GH 5712 # transforming on a datetime column - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - result = df.groupby('A')['B'].transform( - lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name='B') + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + result = df.groupby("A")["B"].transform(lambda x: x.rank(ascending=False)) + expected = Series(np.arange(5, 0, step=-1), name="B") assert_series_equal(result, expected) def test_transform_numeric_to_boolean(): # GH 16875 # inconsistency in transforming boolean values - expected = pd.Series([True, True], name='A') + expected = pd.Series([True, True], name="A") - df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) + df = pd.DataFrame({"A": [1.1, 2.2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) assert_series_equal(result, expected) - df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) + df = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + result = df.groupby("B").A.transform(lambda x: True) assert_series_equal(result, expected) def test_transform_datetime_to_timedelta(): # GH 15429 # transforming a datetime to timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = pd.Series([ - Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') + df = DataFrame(dict(A=Timestamp("20130101"), B=np.arange(5))) + expected = pd.Series([Timestamp("20130101") - Timestamp("20130101")] * 5, name="A") # this does date 
math without changing result type in transform - base_time = df['A'][0] - result = df.groupby('A')['A'].transform( - lambda x: x.max() - x.min() + base_time) - base_time + base_time = df["A"][0] + result = ( + df.groupby("A")["A"].transform(lambda x: x.max() - x.min() + base_time) + - base_time + ) assert_series_equal(result, expected) # this does date math and causes the transform to return timedelta - result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) + result = df.groupby("A")["A"].transform(lambda x: x.max() - x.min()) assert_series_equal(result, expected) def test_transform_datetime_to_numeric(): # GH 10972 # convert dt to float - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.mean() + ) - expected = Series([-0.5, 0.5], name='b') + expected = Series([-0.5, 0.5], name="b") assert_series_equal(result, expected) # convert dt to int - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) + df = DataFrame({"a": 1, "b": date_range("2015-01-01", periods=2, freq="D")}) + result = df.groupby("a").b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min() + ) - expected = Series([0, 1], name='b') + expected = Series([0, 1], name="b") assert_series_equal(result, expected) @@ -271,14 +289,14 @@ def test_transform_casting(): 9 B-053 b76cd912ff "2014-10-08 19:17:48" 10 B-065 b76cd912ff "2014-10-08 19:21:38" """ - df = pd.read_csv(StringIO(data), sep=r'\s+', - index_col=[0], parse_dates=['DATETIME']) + df = pd.read_csv( + StringIO(data), sep=r"\s+", index_col=[0], parse_dates=["DATETIME"] + ) - result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) + result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.dtype) - result = df[['ID3', 'DATETIME']].groupby('ID3').transform( - lambda x: x.diff()) + result = df[["ID3", "DATETIME"]].groupby("ID3").transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.DATETIME.dtype) @@ -294,18 +312,18 @@ def test_dispatch_transform(tsframe): grouped = df.groupby(lambda x: x.month) - filled = grouped.fillna(method='pad') - fillit = lambda x: x.fillna(method='pad') + filled = grouped.fillna(method="pad") + fillit = lambda x: x.fillna(method="pad") expected = df.groupby(lambda x: x.month).transform(fillit) assert_frame_equal(filled, expected) def test_transform_select_columns(df): f = lambda x: x.mean() - result = df.groupby('A')['C', 'D'].transform(f) + result = df.groupby("A")["C", "D"].transform(f) - selection = df[['C', 'D']] - expected = selection.groupby(df['A']).transform(f) + selection = df[["C", "D"]] + expected = selection.groupby(df["A"]).transform(f) assert_frame_equal(result, expected) @@ -315,48 +333,55 @@ def test_transform_exclude_nuisance(df): # this also tests orderings in transform between # series/frame to make sure it's consistent expected = {} - grouped = df.groupby('A') - expected['C'] = grouped['C'].transform(np.mean) - expected['D'] = grouped['D'].transform(np.mean) + grouped = df.groupby("A") + expected["C"] = grouped["C"].transform(np.mean) + expected["D"] = grouped["D"].transform(np.mean) expected = DataFrame(expected) - result = 
df.groupby('A').transform(np.mean) + result = df.groupby("A").transform(np.mean) assert_frame_equal(result, expected) def test_transform_function_aliases(df): - result = df.groupby('A').transform('mean') - expected = df.groupby('A').transform(np.mean) + result = df.groupby("A").transform("mean") + expected = df.groupby("A").transform(np.mean) assert_frame_equal(result, expected) - result = df.groupby('A')['C'].transform('mean') - expected = df.groupby('A')['C'].transform(np.mean) + result = df.groupby("A")["C"].transform("mean") + expected = df.groupby("A")["C"].transform(np.mean) assert_series_equal(result, expected) def test_series_fast_transform_date(): # GH 13191 - df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], - 'd': pd.date_range('2014-1-1', '2014-1-4')}) - result = df.groupby('grouping')['d'].transform('first') - dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-4')] - expected = pd.Series(dates, name='d') + df = pd.DataFrame( + {"grouping": [np.nan, 1, 1, 3], "d": pd.date_range("2014-1-1", "2014-1-4")} + ) + result = df.groupby("grouping")["d"].transform("first") + dates = [ + pd.NaT, + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-2"), + pd.Timestamp("2014-1-4"), + ] + expected = pd.Series(dates, name="d") assert_series_equal(result, expected) def test_transform_length(): # GH 9697 - df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) + df = pd.DataFrame({"col1": [1, 1, 2, 2], "col2": [1, 2, 3, np.nan]}) expected = pd.Series([3.0] * 4) def nsum(x): return np.nansum(x) - results = [df.groupby('col1').transform(sum)['col2'], - df.groupby('col1')['col2'].transform(sum), - df.groupby('col1').transform(nsum)['col2'], - df.groupby('col1')['col2'].transform(nsum)] + results = [ + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), + ] for result in results: assert_series_equal(result, expected, check_names=False) @@ -366,8 +391,8 @@ def test_transform_coercion(): # 14457 # when we are transforming be sure to not coerce # via assignment - df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) - g = df.groupby('A') + df = pd.DataFrame(dict(A=["a", "a"], B=[0, 1])) + g = df.groupby("A") expected = g.transform(np.mean) result = g.transform(lambda x: np.mean(x)) @@ -379,31 +404,33 @@ def test_groupby_transform_with_int(): # GH 3740, make sure that we might upcast on item-by-item transform # floats - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'), - C=Series( - [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=Series( - [-1, 0, 1, -1, 0, 1], dtype='float64'))) + df = DataFrame( + dict( + A=[1, 1, 1, 2, 2, 2], + B=Series(1, dtype="float64"), + C=Series([1, 2, 3, 1, 2, 3], dtype="float64"), + D="foo", + ) + ) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + expected = DataFrame( + dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) + ) assert_frame_equal(result, expected) # int case - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, - C=[1, 2, 3, 1, 2, 3], D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=[1, 2, 3, 1, 2, 3], D="foo")) + with 
np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) assert_frame_equal(result, expected) # int that needs float conversion s = Series([2, 3, 4, 10, 5, -1]) - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) + df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D="foo")) + with np.errstate(all="ignore"): + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) s1 = s.iloc[0:3] s1 = (s1 - s1.mean()) / s1.std() @@ -413,39 +440,43 @@ def test_groupby_transform_with_int(): assert_frame_equal(result, expected) # int downcasting - result = df.groupby('A').transform(lambda x: x * 2 / 2) + result = df.groupby("A").transform(lambda x: x * 2 / 2) expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) assert_frame_equal(result, expected) def test_groupby_transform_with_nan_group(): # GH 9941 - df = pd.DataFrame({'a': range(10), - 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)['a'].transform(max) - expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], - name='a') + df = pd.DataFrame({"a": range(10), "b": [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)["a"].transform(max) + expected = pd.Series( + [1.0, 1.0, 2.0, 3.0, np.nan, 6.0, 6.0, 9.0, 9.0, 9.0], name="a" + ) assert_series_equal(result, expected) def test_transform_mixed_type(): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]]) + df = DataFrame( + { + "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], + "c": np.tile(["a", "b", "c"], 2), + "v": np.arange(1.0, 7.0), + }, + index=index, + ) def f(group): - group['g'] = group['d'] * 2 + group["g"] = group["d"] * 2 return group[:1] - grouped = df.groupby('c') + grouped = df.groupby("c") result = grouped.apply(f) - assert result['d'].dtype == np.float64 + assert result["d"].dtype == np.float64 # this is by definition a mutating operation! 
- with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): for key, group in grouped: res = f(group) assert_frame_equal(res, result.loc[key]) @@ -474,8 +505,7 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): ngroups = 1 pd_op(ans, data, labels, ngroups, is_datetimelike) - tm.assert_numpy_array_equal(np_op(data), ans[:, 0], - check_dtype=False) + tm.assert_numpy_array_equal(np_op(data), ans[:, 0], check_dtype=False) def test_cython_group_transform_cumsum(any_real_dtype): @@ -500,38 +530,45 @@ def test_cython_group_transform_algos(): labels = np.array([0, 0, 0, 0, 0], dtype=np.int64) ngroups = 1 - data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64') + data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumprod_float64(actual, data, labels, ngroups, - is_datetimelike) - expected = np.array([1, 2, 6, np.nan, 24], dtype='float64') + groupby.group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) actual = np.zeros_like(data) actual.fill(np.nan) groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike) - expected = np.array([1, 3, 6, np.nan, 10], dtype='float64') + expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) # timedelta is_datetimelike = True - data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None] - actual = np.zeros_like(data, dtype='int64') - groupby.group_cumsum(actual, data.view('int64'), labels, - ngroups, is_datetimelike) - expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64( - 2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'), - np.timedelta64(5, 'ns')]) - tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected) + data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] + actual = np.zeros_like(data, dtype="int64") + groupby.group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + expected = np.array( + [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + ) + tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected) @pytest.mark.parametrize( "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) def test_cython_transform_series(op, args, targop): # GH 4095 s = Series(np.random.randn(1000)) @@ -544,64 +581,104 @@ def test_cython_transform_series(op, args, targop): # print(data.head()) expected = data.groupby(labels).transform(targop) - tm.assert_series_equal( - expected, - data.groupby(labels).transform(op, *args)) - tm.assert_series_equal(expected, getattr( - data.groupby(labels), op)(*args)) + tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) -@pytest.mark.parametrize("op", ['cumprod', 'cumsum']) +@pytest.mark.parametrize("op", ["cumprod", "cumsum"]) @pytest.mark.parametrize("skipna", [False, True]) -@pytest.mark.parametrize('input, 
exp', [ - # When everything is NaN - ({'key': ['b'] * 10, 'value': np.nan}, - pd.Series([np.nan] * 10, name='value')), - # When there is a single NaN - ({'key': ['b'] * 10 + ['a'] * 2, - 'value': [3] * 3 + [np.nan] + [3] * 8}, - {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], - ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729., - 2187., 6561., 19683., 3.0, 9.0], - ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], - ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18., - 21., 24., 27., 3.0, 6.0]})]) +@pytest.mark.parametrize( + "input, exp", + [ + # When everything is NaN + ({"key": ["b"] * 10, "value": np.nan}, pd.Series([np.nan] * 10, name="value")), + # When there is a single NaN + ( + {"key": ["b"] * 10 + ["a"] * 2, "value": [3] * 3 + [np.nan] + [3] * 8}, + { + ("cumprod", False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0], + ("cumprod", True): [ + 3.0, + 9.0, + 27.0, + np.nan, + 81.0, + 243.0, + 729.0, + 2187.0, + 6561.0, + 19683.0, + 3.0, + 9.0, + ], + ("cumsum", False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0], + ("cumsum", True): [ + 3.0, + 6.0, + 9.0, + np.nan, + 12.0, + 15.0, + 18.0, + 21.0, + 24.0, + 27.0, + 3.0, + 6.0, + ], + }, + ), + ], +) def test_groupby_cum_skipna(op, skipna, input, exp): df = pd.DataFrame(input) - result = df.groupby('key')['value'].transform(op, skipna=skipna) + result = df.groupby("key")["value"].transform(op, skipna=skipna) if isinstance(exp, dict): expected = exp[(op, skipna)] else: expected = exp - expected = pd.Series(expected, name='value') + expected = pd.Series(expected, name="value") tm.assert_series_equal(expected, result) @pytest.mark.parametrize( "op, args, targop", - [('cumprod', (), lambda x: x.cumprod()), - ('cumsum', (), lambda x: x.cumsum()), - ('shift', (-1, ), lambda x: x.shift(-1)), - ('shift', (1, ), lambda x: x.shift())]) + [ + ("cumprod", (), lambda x: x.cumprod()), + ("cumsum", (), lambda x: x.cumsum()), + ("shift", (-1,), lambda x: x.shift(-1)), + ("shift", (1,), lambda x: x.shift()), + ], +) def test_cython_transform_frame(op, args, targop): s = Series(np.random.randn(1000)) s_missing = s.copy() s_missing.iloc[2:10] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) - strings = list('qwertyuiopasdfghjklz') + strings = list("qwertyuiopasdfghjklz") strings_missing = strings[:] strings_missing[5] = np.nan - df = DataFrame({'float': s, - 'float_missing': s_missing, - 'int': [1, 1, 1, 1, 2] * 200, - 'datetime': pd.date_range('1990-1-1', periods=1000), - 'timedelta': pd.timedelta_range(1, freq='s', - periods=1000), - 'string': strings * 50, - 'string_missing': strings_missing * 50}, - columns=['float', 'float_missing', 'int', 'datetime', - 'timedelta', 'string', 'string_missing']) - df['cat'] = df['string'].astype('category') + df = DataFrame( + { + "float": s, + "float_missing": s_missing, + "int": [1, 1, 1, 1, 2] * 200, + "datetime": pd.date_range("1990-1-1", periods=1000), + "timedelta": pd.timedelta_range(1, freq="s", periods=1000), + "string": strings * 50, + "string_missing": strings_missing * 50, + }, + columns=[ + "float", + "float_missing", + "int", + "datetime", + "timedelta", + "string", + "string_missing", + ], + ) + df["cat"] = df["string"].astype("category") df2 = df.copy() df2.index = pd.MultiIndex.from_product([range(100), range(10)]) @@ -609,37 +686,35 @@ def test_cython_transform_frame(op, args, targop): # DataFrame - Single and MultiIndex, # group by values, index level, columns for df in [df, df2]: - for gb_target in [dict(by=labels), dict(level=0), 
dict(by='string') - ]: # dict(by='string_missing')]: + for gb_target in [ + dict(by=labels), + dict(level=0), + dict(by="string"), + ]: # dict(by='string_missing')]: # dict(by=['int','string'])]: gb = df.groupby(**gb_target) # whitelisted methods set the selection before applying # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior - if op == 'shift': + if op == "shift": gb._set_group_selection() - if op != 'shift' and 'int' not in gb_target: + if op != "shift" and "int" not in gb_target: # numeric apply fastpath promotes dtype so have # to apply separately and concat - i = gb[['int']].apply(targop) - f = gb[['float', 'float_missing']].apply(targop) + i = gb[["int"]].apply(targop) + f = gb[["float", "float_missing"]].apply(targop) expected = pd.concat([f, i], axis=1) else: expected = gb.apply(targop) expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, - gb.transform(op, *args).sort_index( - axis=1)) - tm.assert_frame_equal( - expected, - getattr(gb, op)(*args).sort_index(axis=1)) + tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) + tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: - if c not in ['float', 'int', 'float_missing' - ] and op != 'shift': + if c not in ["float", "int", "float_missing"] and op != "shift": msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): gb[c].transform(op) @@ -648,84 +723,102 @@ def test_cython_transform_frame(op, args, targop): else: expected = gb[c].apply(targop) expected.name = c - tm.assert_series_equal(expected, - gb[c].transform(op, *args)) - tm.assert_series_equal(expected, - getattr(gb[c], op)(*args)) + tm.assert_series_equal(expected, gb[c].transform(op, *args)) + tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) def test_transform_with_non_scalar_group(): # GH 10165 - cols = pd.MultiIndex.from_tuples([ - ('syn', 'A'), ('mis', 'A'), ('non', 'A'), - ('syn', 'C'), ('mis', 'C'), ('non', 'C'), - ('syn', 'T'), ('mis', 'T'), ('non', 'T'), - ('syn', 'G'), ('mis', 'G'), ('non', 'G')]) - df = pd.DataFrame(np.random.randint(1, 10, (4, 12)), - columns=cols, - index=['A', 'C', 'G', 'T']) - - msg = 'transform must return a scalar value for each group.*' + cols = pd.MultiIndex.from_tuples( + [ + ("syn", "A"), + ("mis", "A"), + ("non", "A"), + ("syn", "C"), + ("mis", "C"), + ("non", "C"), + ("syn", "T"), + ("mis", "T"), + ("non", "T"), + ("syn", "G"), + ("mis", "G"), + ("non", "G"), + ] + ) + df = pd.DataFrame( + np.random.randint(1, 10, (4, 12)), columns=cols, index=["A", "C", "G", "T"] + ) + + msg = "transform must return a scalar value for each group.*" with pytest.raises(ValueError, match=msg): - df.groupby(axis=1, level=1).transform( - lambda z: z.div(z.sum(axis=1), axis=0)) + df.groupby(axis=1, level=1).transform(lambda z: z.div(z.sum(axis=1), axis=0)) -@pytest.mark.parametrize('cols,exp,comp_func', [ - ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal), - (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}), - tm.assert_frame_equal) -]) -@pytest.mark.parametrize('agg_func', [ - 'count', 'rank', 'size']) +@pytest.mark.parametrize( + "cols,exp,comp_func", + [ + ("a", pd.Series([1, 1, 1], name="a"), tm.assert_series_equal), + ( + ["a", "c"], + pd.DataFrame({"a": [1, 1, 1], "c": [1, 1, 1]}), + tm.assert_frame_equal, + ), + ], +) +@pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) def test_transform_numeric_ret(cols, exp, comp_func, agg_func): - if 
agg_func == 'size' and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with " - "NDFrameGroupy") + if agg_func == "size" and isinstance(cols, list): + pytest.xfail("'size' transformation not supported with " "NDFrameGroupy") # GH 19200 df = pd.DataFrame( - {'a': pd.date_range('2018-01-01', periods=3), - 'b': range(3), - 'c': range(7, 10)}) + {"a": pd.date_range("2018-01-01", periods=3), "b": range(3), "c": range(7, 10)} + ) - result = df.groupby('b')[cols].transform(agg_func) + result = df.groupby("b")[cols].transform(agg_func) - if agg_func == 'rank': - exp = exp.astype('float') + if agg_func == "rank": + exp = exp.astype("float") comp_func(result, exp) @pytest.mark.parametrize("mix_groupings", [True, False]) @pytest.mark.parametrize("as_series", [True, False]) -@pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) -@pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) -]) -def test_group_fill_methods(mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): +@pytest.mark.parametrize("val1,val2", [("foo", "bar"), (1, 2), (1.0, 2.0)]) +@pytest.mark.parametrize( + "fill_method,limit,exp_vals", + [ + ( + "ffill", + None, + [np.nan, np.nan, "val1", "val1", "val1", "val2", "val2", "val2"], + ), + ("ffill", 1, [np.nan, np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan]), + ( + "bfill", + None, + ["val1", "val1", "val1", "val2", "val2", "val2", np.nan, np.nan], + ), + ("bfill", 1, [np.nan, "val1", "val1", np.nan, "val2", "val2", np.nan, np.nan]), + ], +) +def test_group_fill_methods( + mix_groupings, as_series, val1, val2, fill_method, limit, exp_vals +): vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] _exp_vals = list(exp_vals) # Overwrite placeholder values for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': + if exp_val == "val1": _exp_vals[index] = val1 - elif exp_val == 'val2': + elif exp_val == "val2": _exp_vals[index] = val2 # Need to modify values and expectations depending on the # Series / DataFrame that we ultimately want to generate if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) + keys = ["a", "b"] * len(vals) def interweave(list_obj): temp = list() @@ -737,82 +830,98 @@ def interweave(list_obj): _exp_vals = interweave(_exp_vals) vals = interweave(vals) else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) + keys = ["a"] * len(vals) + ["b"] * len(vals) _exp_vals = _exp_vals * 2 vals = vals * 2 - df = DataFrame({'key': keys, 'val': vals}) + df = DataFrame({"key": keys, "val": vals}) if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') + result = getattr(df.groupby("key")["val"], fill_method)(limit=limit) + exp = Series(_exp_vals, name="val") assert_series_equal(result, exp) else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'val': _exp_vals}) + result = getattr(df.groupby("key"), fill_method)(limit=limit) + exp = DataFrame({"val": _exp_vals}) assert_frame_equal(result, exp) -@pytest.mark.parametrize("fill_method", ['ffill', 'bfill']) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill"]) def test_pad_stable_sorting(fill_method): # GH 21207 x = [0] * 20 y = [np.nan] * 10 + [1] * 10 - if fill_method == 'bfill': + if fill_method == "bfill": y = y[::-1] - df = pd.DataFrame({'x': x, 'y': y}) - expected = df.drop('x', 1) + df = pd.DataFrame({"x": x, "y": y}) + expected = df.drop("x", 1) - result = getattr(df.groupby('x'), fill_method)() + result = getattr(df.groupby("x"), fill_method)() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("test_series", [True, False]) -@pytest.mark.parametrize("freq", [ - None, - pytest.param('D', marks=pytest.mark.xfail( - reason='GH#23918 before method uses freq in vectorized approach'))]) -@pytest.mark.parametrize("periods,fill_method,limit", [ - (1, 'ffill', None), (1, 'ffill', 1), - (1, 'bfill', None), (1, 'bfill', 1), - (-1, 'ffill', None), (-1, 'ffill', 1), - (-1, 'bfill', None), (-1, 'bfill', 1), -]) +@pytest.mark.parametrize( + "freq", + [ + None, + pytest.param( + "D", + marks=pytest.mark.xfail( + reason="GH#23918 before method uses freq in vectorized approach" + ), + ), + ], +) +@pytest.mark.parametrize( + "periods,fill_method,limit", + [ + (1, "ffill", None), + (1, "ffill", 1), + (1, "bfill", None), + (1, "bfill", 1), + (-1, "ffill", None), + (-1, "ffill", 1), + (-1, "bfill", None), + (-1, "bfill", 1), + ], +) def test_pct_change(test_series, freq, periods, fill_method, limit): # GH 21200, 21621 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] - keys = ['a', 'b'] + keys = ["a", "b"] key_v = np.repeat(keys, len(vals)) - df = DataFrame({'key': key_v, 'vals': vals * 2}) + df = DataFrame({"key": key_v, "vals": vals * 2}) - df_g = getattr(df.groupby('key'), fill_method)(limit=limit) + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) - expected = grp['vals'].obj / grp['vals'].shift(periods) - 1 + expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 if test_series: - result = df.groupby('key')['vals'].pct_change( - periods=periods, fill_method=fill_method, limit=limit, freq=freq) + result = df.groupby("key")["vals"].pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) tm.assert_series_equal(result, expected) else: - result = df.groupby('key').pct_change( - periods=periods, fill_method=fill_method, limit=limit, freq=freq) - tm.assert_frame_equal(result, expected.to_frame('vals')) + result = df.groupby("key").pct_change( + periods=periods, fill_method=fill_method, limit=limit, freq=freq + ) + tm.assert_frame_equal(result, expected.to_frame("vals")) @pytest.mark.parametrize("func", [np.any, np.all]) def test_any_all_np_func(func): # GH 20653 - df = pd.DataFrame([['foo', True], - [np.nan, 
True], - ['foo', True]], columns=['key', 'val']) + df = pd.DataFrame( + [["foo", True], [np.nan, True], ["foo", True]], columns=["key", "val"] + ) - exp = pd.Series([True, np.nan, True], name='val') + exp = pd.Series([True, np.nan, True], name="val") - res = df.groupby('key')['val'].transform(func) + res = df.groupby("key")["val"].transform(func) tm.assert_series_equal(res, exp) @@ -825,51 +934,52 @@ def demean_rename(x): return result result = result.rename( - columns={c: '{}_demeaned'.format(c) for c in result.columns}) + columns={c: "{}_demeaned".format(c) for c in result.columns} + ) return result - df = pd.DataFrame({'group': list('ababa'), - 'value': [1, 1, 1, 2, 2]}) - expected = pd.DataFrame({'value': [-1. / 3, -0.5, -1. / 3, 0.5, 2. / 3]}) + df = pd.DataFrame({"group": list("ababa"), "value": [1, 1, 1, 2, 2]}) + expected = pd.DataFrame({"value": [-1.0 / 3, -0.5, -1.0 / 3, 0.5, 2.0 / 3]}) - result = df.groupby('group').transform(demean_rename) + result = df.groupby("group").transform(demean_rename) tm.assert_frame_equal(result, expected) - result_single = df.groupby('group').value.transform(demean_rename) - tm.assert_series_equal(result_single, expected['value']) + result_single = df.groupby("group").value.transform(demean_rename) + tm.assert_series_equal(result_single, expected["value"]) -@pytest.mark.parametrize('func', [min, max, np.min, np.max, 'first', 'last']) +@pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) def test_groupby_transform_timezone_column(func): # GH 24198 - ts = pd.to_datetime('now', utc=True).tz_convert('Asia/Singapore') - result = pd.DataFrame({'end_time': [ts], 'id': [1]}) - result['max_end_time'] = result.groupby('id').end_time.transform(func) - expected = pd.DataFrame([[ts, 1, ts]], columns=['end_time', 'id', - 'max_end_time']) + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + result = pd.DataFrame({"end_time": [ts], "id": [1]}) + result["max_end_time"] = result.groupby("id").end_time.transform(func) + expected = pd.DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func, values", [ - ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), - ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]) -]) +@pytest.mark.parametrize( + "func, values", + [ + ("idxmin", ["1/1/2011"] * 2 + ["1/3/2011"] * 7 + ["1/10/2011"]), + ("idxmax", ["1/2/2011"] * 2 + ["1/9/2011"] * 7 + ["1/10/2011"]), + ], +) def test_groupby_transform_with_datetimes(func, values): # GH 15306 - dates = pd.date_range('1/1/2011', periods=10, freq='D') + dates = pd.date_range("1/1/2011", periods=10, freq="D") - stocks = pd.DataFrame({'price': np.arange(10.0)}, index=dates) - stocks['week_id'] = pd.to_datetime(stocks.index).week + stocks = pd.DataFrame({"price": np.arange(10.0)}, index=dates) + stocks["week_id"] = pd.to_datetime(stocks.index).week - result = stocks.groupby(stocks['week_id'])['price'].transform(func) + result = stocks.groupby(stocks["week_id"])["price"].transform(func) - expected = pd.Series(data=pd.to_datetime(values), - index=dates, name="price") + expected = pd.Series(data=pd.to_datetime(values), index=dates, name="price") tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('func', ['cumsum', 'cumprod', 'cummin', 'cummax']) +@pytest.mark.parametrize("func", ["cumsum", "cumprod", "cummin", "cummax"]) def test_transform_absent_categories(func): # GH 16771 # cython transforms with more groups than rows @@ -882,8 +992,8 
@@ def test_transform_absent_categories(func): assert_series_equal(result, expected) -@pytest.mark.parametrize('func', ['ffill', 'bfill', 'shift']) -@pytest.mark.parametrize('key, val', [('level', 0), ('by', Series([0]))]) +@pytest.mark.parametrize("func", ["ffill", "bfill", "shift"]) +@pytest.mark.parametrize("key, val", [("level", 0), ("by", Series([0]))]) def test_ffill_not_in_axis(func, key, val): # GH 21521 df = pd.DataFrame([[np.nan]]) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 2b5f87aa59a8d..c7b28822092a8 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -16,21 +16,22 @@ # our starting frame def seed_df(seed_nans, n, m): np.random.seed(1234) - days = date_range('2015-08-24', periods=10) + days = date_range("2015-08-24", periods=10) - frame = DataFrame({ - '1st': np.random.choice( - list('abcd'), n), - '2nd': np.random.choice(days, n), - '3rd': np.random.randint(1, m + 1, n) - }) + frame = DataFrame( + { + "1st": np.random.choice(list("abcd"), n), + "2nd": np.random.choice(days, n), + "3rd": np.random.randint(1, m + 1, n), + } + ) if seed_nans: - frame.loc[1::11, '1st'] = np.nan - frame.loc[3::17, '2nd'] = np.nan - frame.loc[7::19, '3rd'] = np.nan - frame.loc[8::19, '3rd'] = np.nan - frame.loc[9::19, '3rd'] = np.nan + frame.loc[1::11, "1st"] = np.nan + frame.loc[3::17, "2nd"] = np.nan + frame.loc[7::19, "3rd"] = np.nan + frame.loc[8::19, "3rd"] = np.nan + frame.loc[9::19, "3rd"] = np.nan return frame @@ -42,8 +43,8 @@ def seed_df(seed_nans, n, m): for n, m in product((100, 1000), (5, 20)): df = seed_df(seed_nans, n, m) - bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2) - keys = '1st', '2nd', ['1st', '2nd'] + bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2) + keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) ids.append("{}-{}-{}".format(k, n, m)) @@ -52,24 +53,27 @@ def seed_df(seed_nans, n, m): @pytest.mark.slow @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) def test_series_groupby_value_counts(df, keys, bins, n, m): - def rebuild_index(df): arr = list(map(df.index.get_level_values, range(df.index.nlevels))) df.index = MultiIndex.from_arrays(arr, names=df.index.names) return df - for isort, normalize, sort, ascending, dropna \ - in product((False, True), repeat=5): + for isort, normalize, sort, ascending, dropna in product((False, True), repeat=5): - kwargs = dict(normalize=normalize, sort=sort, - ascending=ascending, dropna=dropna, bins=bins) + kwargs = dict( + normalize=normalize, + sort=sort, + ascending=ascending, + dropna=dropna, + bins=bins, + ) gr = df.groupby(keys, sort=isort) - left = gr['3rd'].value_counts(**kwargs) + left = gr["3rd"].value_counts(**kwargs) gr = df.groupby(keys, sort=isort) - right = gr['3rd'].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ['3rd'] + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 2bd2f3fb00b56..03e10ff44c299 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -11,26 +11,37 @@ from pandas import DataFrame, Index, MultiIndex, Series, date_range from pandas.util import testing as 
tm -AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', - 'mad', 'std', 'var', 'sem'] -AGG_FUNCTIONS_WITH_SKIPNA = ['skew', 'mad'] +AGG_FUNCTIONS = [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "mad", + "std", + "var", + "sem", +] +AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"] df_whitelist = [ - 'quantile', - 'fillna', - 'mad', - 'take', - 'idxmax', - 'idxmin', - 'tshift', - 'skew', - 'plot', - 'hist', - 'dtypes', - 'corrwith', - 'corr', - 'cov', - 'diff', + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtypes", + "corrwith", + "corr", + "cov", + "diff", ] @@ -40,25 +51,25 @@ def df_whitelist_fixture(request): s_whitelist = [ - 'quantile', - 'fillna', - 'mad', - 'take', - 'idxmax', - 'idxmin', - 'tshift', - 'skew', - 'plot', - 'hist', - 'dtype', - 'corr', - 'cov', - 'diff', - 'unique', - 'nlargest', - 'nsmallest', - 'is_monotonic_increasing', - 'is_monotonic_decreasing', + "quantile", + "fillna", + "mad", + "take", + "idxmax", + "idxmin", + "tshift", + "skew", + "plot", + "hist", + "dtype", + "corr", + "cov", + "diff", + "unique", + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", ] @@ -69,22 +80,24 @@ def s_whitelist_fixture(request): @pytest.fixture def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) @pytest.fixture def df(): return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) @pytest.fixture @@ -92,8 +105,12 @@ def df_letters(): letters = np.array(list(ascii_lowercase)) N = 10 random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) + df = DataFrame( + { + "floats": N / 10 * Series(np.random.random(N)), + "letters": Series(random_letters), + } + ) return df @@ -104,7 +121,7 @@ def test_groupby_whitelist(df_letters, whitelist): # dataframe obj = df_letters else: - obj = df_letters['floats'] + obj = df_letters["floats"] gb = obj.groupby(df.letters) @@ -147,26 +164,25 @@ def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture): @pytest.fixture def raw_frame(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - raw_frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + raw_frame = DataFrame( + 
np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) raw_frame.iloc[1, [1, 2]] = np.nan raw_frame.iloc[7, [0, 1]] = np.nan return raw_frame -@pytest.mark.parametrize('op', AGG_FUNCTIONS) -@pytest.mark.parametrize('level', [0, 1]) -@pytest.mark.parametrize('axis', [0, 1]) -@pytest.mark.parametrize('skipna', [True, False]) -@pytest.mark.parametrize('sort', [True, False]) -def test_regression_whitelist_methods( - raw_frame, op, level, - axis, skipna, sort): +@pytest.mark.parametrize("op", AGG_FUNCTIONS) +@pytest.mark.parametrize("level", [0, 1]) +@pytest.mark.parametrize("axis", [0, 1]) +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort): # GH6944 # GH 17537 # explicitly test the whitelist methods @@ -179,8 +195,7 @@ def test_regression_whitelist_methods( if op in AGG_FUNCTIONS_WITH_SKIPNA: grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)(skipna=skipna) - expected = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) + expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) if sort: expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) @@ -198,21 +213,32 @@ def test_groupby_blacklist(df_letters): s = df_letters.floats blacklist = [ - 'eval', 'query', 'abs', 'where', - 'mask', 'align', 'groupby', 'clip', 'astype', - 'at', 'combine', 'consolidate', 'convert_objects', + "eval", + "query", + "abs", + "where", + "mask", + "align", + "groupby", + "clip", + "astype", + "at", + "combine", + "consolidate", + "convert_objects", ] - to_methods = [method for method in dir(df) if method.startswith('to_')] + to_methods = [method for method in dir(df) if method.startswith("to_")] blacklist.extend(to_methods) # e.g., to_csv - defined_but_not_allowed = ("(?:^Cannot.+{0!r}.+{1!r}.+try using the " - "'apply' method$)") + defined_but_not_allowed = ( + "(?:^Cannot.+{0!r}.+{1!r}.+try using the " "'apply' method$)" + ) # e.g., query, eval not_defined = "(?:^{1!r} object has no attribute {0!r}$)" - fmt = defined_but_not_allowed + '|' + not_defined + fmt = defined_but_not_allowed + "|" + not_defined for bl in blacklist: for obj in (df, s): gb = obj.groupby(df.letters) @@ -222,58 +248,117 @@ def test_groupby_blacklist(df_letters): def test_tab_completion(mframe): - grp = mframe.groupby(level='second') - results = {v for v in dir(grp) if not v.startswith('_')} + grp = mframe.groupby(level="second") + results = {v for v in dir(grp) if not v.startswith("_")} expected = { - 'A', 'B', 'C', 'agg', 'aggregate', 'apply', 'boxplot', 'filter', - 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', - 'mean', 'median', 'min', 'ngroups', 'nth', 'ohlc', 'plot', - 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'nunique', 'head', 'describe', 'cummax', 'quantile', - 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', - 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', - 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', - 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding', 'pipe', + "A", + "B", + "C", + "agg", + "aggregate", + "apply", + "boxplot", + "filter", + "first", + "get_group", + "groups", + "hist", + "indices", + "last", + "max", + "mean", + "median", + "min", + "ngroups", + "nth", + "ohlc", + "plot", + "prod", + "size", + "std", + "sum", + 
"transform", + "var", + "sem", + "count", + "nunique", + "head", + "describe", + "cummax", + "quantile", + "rank", + "cumprod", + "tail", + "resample", + "cummin", + "fillna", + "cumsum", + "cumcount", + "ngroup", + "all", + "shift", + "skew", + "take", + "tshift", + "pct_change", + "any", + "mad", + "corr", + "corrwith", + "cov", + "dtypes", + "ndim", + "diff", + "idxmax", + "idxmin", + "ffill", + "bfill", + "pad", + "backfill", + "rolling", + "expanding", + "pipe", } assert results == expected def test_groupby_function_rename(mframe): - grp = mframe.groupby(level='second') - for name in ['sum', 'prod', 'min', 'max', 'first', 'last']: + grp = mframe.groupby(level="second") + for name in ["sum", "prod", "min", "max", "first", "last"]: f = getattr(grp, name) assert f.__name__ == name def test_groupby_selection_with_methods(df): # some methods which require DatetimeIndex - rng = date_range('2014', periods=len(df)) + rng = date_range("2014", periods=len(df)) df.index = rng - g = df.groupby(['A'])[['C']] - g_exp = df[['C']].groupby(df['A']) + g = df.groupby(["A"])[["C"]] + g_exp = df[["C"]].groupby(df["A"]) # TODO check groupby with > 1 col ? # methods which are called as .foo() - methods = ['count', - 'corr', - 'cummax', - 'cummin', - 'cumprod', - 'describe', - 'rank', - 'quantile', - 'diff', - 'shift', - 'all', - 'any', - 'idxmin', - 'idxmax', - 'ffill', - 'bfill', - 'pct_change', - 'tshift'] + methods = [ + "count", + "corr", + "cummax", + "cummin", + "cumprod", + "describe", + "rank", + "quantile", + "diff", + "shift", + "all", + "any", + "idxmin", + "idxmax", + "ffill", + "bfill", + "pct_change", + "tshift", + ] for m in methods: res = getattr(g, m)() @@ -285,12 +370,11 @@ def test_groupby_selection_with_methods(df): # methods which aren't just .foo() tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) tm.assert_frame_equal(g.dtypes, g_exp.dtypes) - tm.assert_frame_equal(g.apply(lambda x: x.sum()), - g_exp.apply(lambda x: x.sum())) + tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum())) - tm.assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean()) - tm.assert_frame_equal(g.resample('D').ohlc(), - g_exp.resample('D').ohlc()) + tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean()) + tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc()) - tm.assert_frame_equal(g.filter(lambda x: len(x) == 3), - g_exp.filter(lambda x: len(x) == 3)) + tm.assert_frame_equal( + g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3) + ) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 674f600bc8693..9459069f0ea2d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,9 +7,19 @@ import pandas as pd from pandas import ( - CategoricalIndex, DatetimeIndex, Index, Int64Index, IntervalIndex, - MultiIndex, PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, - isna) + CategoricalIndex, + DatetimeIndex, + Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + isna, +) from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin import pandas.util.testing as tm @@ -17,8 +27,9 @@ class Base: """ base class for index sub-class tests """ + _holder = None - _compat_props = ['shape', 'ndim', 'size', 'nbytes'] + _compat_props = ["shape", "ndim", "size", "nbytes"] def setup_indices(self): for name, idx in self.indices.items(): @@ 
-26,10 +37,12 @@ def setup_indices(self): def test_pickle_compat_construction(self): # need an object to create with - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed|" - r"__new__\(\) missing 1 required positional argument: 'data'|" - r"__new__\(\) takes at least 2 arguments \(1 given\)") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed|" + r"__new__\(\) missing 1 required positional argument: 'data'|" + r"__new__\(\) takes at least 2 arguments \(1 given\)" + ) with pytest.raises(TypeError, match=msg): self._holder() @@ -55,7 +68,7 @@ def test_to_series_with_arguments(self): # name kwarg idx = self.create_index() - s = idx.to_series(name='__test') + s = idx.to_series(name="__test") assert s.values is not idx.values assert s.index is not idx @@ -83,8 +96,8 @@ def test_to_frame(self, name): def test_to_frame_datetime_tz(self): # GH 25809 - idx = pd.date_range(start='2019-01-01', end='2019-01-30', freq='D') - idx = idx.tz_localize('UTC') + idx = pd.date_range(start="2019-01-01", end="2019-01-30", freq="D") + idx = idx.tz_localize("UTC") result = idx.to_frame() expected = pd.DataFrame(idx, index=idx) tm.assert_frame_equal(result, expected) @@ -105,29 +118,52 @@ def test_create_index_existing_name(self): # specified, the new index should inherit the previous object name expected = self.create_index() if not isinstance(expected, MultiIndex): - expected.name = 'foo' + expected.name = "foo" result = pd.Index(expected) tm.assert_index_equal(result, expected) - result = pd.Index(expected, name='bar') - expected.name = 'bar' + result = pd.Index(expected, name="bar") + expected.name = "bar" tm.assert_index_equal(result, expected) else: - expected.names = ['foo', 'bar'] + expected.names = ["foo", "bar"] result = pd.Index(expected) tm.assert_index_equal( - result, Index(Index([('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object'), - names=['foo', 'bar'])) - - result = pd.Index(expected, names=['A', 'B']) + result, + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["foo", "bar"], + ), + ) + + result = pd.Index(expected, names=["A", "B"]) tm.assert_index_equal( result, - Index(Index([('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')], - dtype='object'), names=['A', 'B'])) + Index( + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", + ), + names=["A", "B"], + ), + ) def test_numeric_compat(self): @@ -141,7 +177,7 @@ def test_numeric_compat(self): with pytest.raises(TypeError, match=div_err): idx / 1 - div_err = div_err.replace(' __', ' __r') + div_err = div_err.replace(" __", " __r") with pytest.raises(TypeError, match=div_err): 1 / idx with pytest.raises(TypeError, match="cannot perform __floordiv__"): @@ -151,9 +187,9 @@ def test_numeric_compat(self): def test_logical_compat(self): idx = self.create_index() - with pytest.raises(TypeError, match='cannot perform all'): + with pytest.raises(TypeError, match="cannot perform all"): idx.all() - with pytest.raises(TypeError, match='cannot perform any'): + with pytest.raises(TypeError, match="cannot perform any"): idx.any() def test_boolean_context_compat(self): @@ -161,7 +197,7 @@ def test_boolean_context_compat(self): # boolean context compat idx = 
self.create_index() - with pytest.raises(ValueError, match='The truth value of a'): + with pytest.raises(ValueError, match="The truth value of a"): if idx: pass @@ -172,8 +208,8 @@ def test_reindex_base(self): actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) - with pytest.raises(ValueError, match='Invalid fill method'): - idx.get_indexer(idx, method='invalid') + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") def test_get_indexer_consistency(self): # See GH 16819 @@ -216,7 +252,7 @@ def test_str(self): # test the string repr idx = self.create_index() - idx.name = 'foo' + idx.name = "foo" assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) @@ -226,7 +262,7 @@ def test_repr_max_seq_item_setting(self): idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) - assert '...' not in str(idx) + assert "..." not in str(idx) def test_copy_name(self): # gh-12309: Check that the "name" argument @@ -236,7 +272,7 @@ def test_copy_name(self): if isinstance(index, MultiIndex): continue - first = index.__class__(index, copy=True, name='mario') + first = index.__class__(index, copy=True, name="mario") second = first.__class__(first, copy=False) # Even though "copy=False", we want a new object. @@ -245,8 +281,8 @@ def test_copy_name(self): # Not using tm.assert_index_equal() since names differ. assert index.equals(first) - assert first.name == 'mario' - assert second.name == 'mario' + assert first.name == "mario" + assert second.name == "mario" s1 = Series(2, index=first) s2 = Series(3, index=second[:-1]) @@ -254,7 +290,7 @@ def test_copy_name(self): if not isinstance(index, CategoricalIndex): # See gh-13365 s3 = s1 * s2 - assert s3.index.name == 'mario' + assert s3.index.name == "mario" def test_ensure_copied_data(self): # Check the "copy" argument of each Index.__new__ is honoured @@ -263,7 +299,7 @@ def test_ensure_copied_data(self): init_kwargs = {} if isinstance(index, PeriodIndex): # Needs "freq" specification: - init_kwargs['freq'] = index.freq + init_kwargs["freq"] = index.freq elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): # RangeIndex cannot be initialized from data # MultiIndex and CategoricalIndex are tested separately @@ -272,27 +308,27 @@ def test_ensure_copied_data(self): index_type = index.__class__ result = index_type(index.values, copy=True, **init_kwargs) tm.assert_index_equal(index, result) - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='copy') + tm.assert_numpy_array_equal( + index._ndarray_values, result._ndarray_values, check_same="copy" + ) if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - result = index_type(ordinal=index.asi8, copy=False, - **init_kwargs) - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='same') + result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + tm.assert_numpy_array_equal( + index._ndarray_values, result._ndarray_values, check_same="same" + ) elif isinstance(index, IntervalIndex): # checked in test_interval.py pass else: result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') - tm.assert_numpy_array_equal(index._ndarray_values, - result._ndarray_values, - check_same='same') + tm.assert_numpy_array_equal( + index.values, result.values, check_same="same" + ) + tm.assert_numpy_array_equal( + 
index._ndarray_values, result._ndarray_values, check_same="same" + ) def test_memory_usage(self): for name, index in self.indices.items(): @@ -307,7 +343,7 @@ def test_memory_usage(self): if not isinstance(index, (RangeIndex, IntervalIndex)): assert result2 > result - if index.inferred_type == 'object': + if index.inferred_type == "object": assert result3 > result2 else: @@ -319,7 +355,7 @@ def test_argsort(self): for k, ind in self.indices.items(): # separately tested - if k in ['catIndex']: + if k in ["catIndex"]: continue result = ind.argsort() @@ -346,26 +382,25 @@ def test_numpy_argsort(self): msg = "the 'kind' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(ind, kind='mergesort') + np.argsort(ind, kind="mergesort") msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(ind, order=('a', 'b')) + np.argsort(ind, order=("a", "b")) def test_take(self): indexer = [4, 3, 0, 2] for k, ind in self.indices.items(): # separate - if k in ['boolIndex', 'tuples', 'empty']: + if k in ["boolIndex", "tuples", "empty"]: continue result = ind.take(indexer) expected = ind[indexer] assert result.equals(expected) - if not isinstance(ind, - (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(ind, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): # GH 10791 with pytest.raises(AttributeError): ind.freq @@ -384,7 +419,7 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_repeat(self): rep = 2 @@ -407,7 +442,7 @@ def test_numpy_repeat(self): with pytest.raises(ValueError, match=msg): np.repeat(i, rep, axis=0) - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): i = self.create_index() @@ -422,8 +457,9 @@ def test_where(self, klass): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("case", [0.5, "xxx"]) - @pytest.mark.parametrize("method", ["intersection", "union", - "difference", "symmetric_difference"]) + @pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] + ) def test_set_ops_error_cases(self, case, method): for name, idx in self.indices.items(): # non-iterable input @@ -444,8 +480,7 @@ def test_intersection_base(self): assert tm.equalContents(intersect, second) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -467,8 +502,7 @@ def test_union_base(self): assert tm.equalContents(union, everything) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -495,15 +529,15 @@ def test_difference_base(self, sort): assert tm.equalContents(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass elif isinstance(idx, (DatetimeIndex, TimedeltaIndex)): assert result.__class__ == answer.__class__ - tm.assert_numpy_array_equal(result.sort_values().asi8, - answer.sort_values().asi8) + 
tm.assert_numpy_array_equal( + result.sort_values().asi8, answer.sort_values().asi8 + ) else: result = first.difference(case, sort) assert tm.equalContents(result, answer) @@ -525,8 +559,7 @@ def test_symmetric_difference(self): assert tm.equalContents(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: if isinstance(idx, CategoricalIndex): pass @@ -599,7 +632,7 @@ def test_equals_op(self): # GH9947, GH10637 index_a = self.create_index() if isinstance(index_a, PeriodIndex): - pytest.skip('Skip check for PeriodIndex') + pytest.skip("Skip check for PeriodIndex") n = len(index_a) index_b = index_a[0:-1] @@ -741,8 +774,7 @@ def test_nulls(self): for name, index in self.indices.items(): if len(index) == 0: - tm.assert_numpy_array_equal( - index.isna(), np.array([], dtype=bool)) + tm.assert_numpy_array_equal(index.isna(), np.array([], dtype=bool)) elif isinstance(index, MultiIndex): idx = index.copy() msg = "isna is not defined for MultiIndex" @@ -752,9 +784,11 @@ def test_nulls(self): if not index.hasnans: tm.assert_numpy_array_equal( - index.isna(), np.zeros(len(index), dtype=bool)) + index.isna(), np.zeros(len(index), dtype=bool) + ) tm.assert_numpy_array_equal( - index.notna(), np.ones(len(index), dtype=bool)) + index.notna(), np.ones(len(index), dtype=bool) + ) else: result = isna(index) tm.assert_numpy_array_equal(index.isna(), result) @@ -778,7 +812,7 @@ def test_map(self): # we don't infer UInt64 if isinstance(index, pd.UInt64Index): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -789,7 +823,9 @@ def test_map(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): index = self.create_index() @@ -800,7 +836,7 @@ def test_map_dictlike(self, mapper): # we don't infer to UInt64 for a dict if isinstance(index, pd.UInt64Index) and isinstance(identity, dict): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -823,11 +859,11 @@ def test_putmask_with_wrong_mask(self): index.putmask(np.ones(len(index) - 1, np.bool), 1) with pytest.raises(ValueError): - index.putmask('foo', 1) + index.putmask("foo", 1) - @pytest.mark.parametrize('copy', [True, False]) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('ordered', [True, False]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("ordered", [True, False]) def test_astype_category(self, copy, name, ordered): # GH 18630 index = self.create_index() @@ -848,7 +884,7 @@ def test_astype_category(self, copy, name, ordered): if ordered is False: # dtype='category' defaults to ordered=False, so only test once - result = index.astype('category', copy=copy) + result = index.astype("category", copy=copy) expected = CategoricalIndex(index.values, name=name) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 83f1f22b158b1..12c5fb8339549 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -5,22 +5,23 @@ from pandas.core.indexes.api import Index, MultiIndex import pandas.util.testing as tm -indices_list = [tm.makeUnicodeIndex(100), - tm.makeStringIndex(100), - 
tm.makeDateIndex(100), - tm.makePeriodIndex(100), - tm.makeTimedeltaIndex(100), - tm.makeIntIndex(100), - tm.makeUIntIndex(100), - tm.makeRangeIndex(100), - tm.makeFloatIndex(100), - Index([True, False]), - tm.makeCategoricalIndex(100), - tm.makeIntervalIndex(100), - Index([]), - MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - Index([0, 0, 1, 1, 2, 2])] +indices_list = [ + tm.makeUnicodeIndex(100), + tm.makeStringIndex(100), + tm.makeDateIndex(100), + tm.makePeriodIndex(100), + tm.makeTimedeltaIndex(100), + tm.makeIntIndex(100), + tm.makeUIntIndex(100), + tm.makeRangeIndex(100), + tm.makeFloatIndex(100), + Index([True, False]), + tm.makeCategoricalIndex(100), + tm.makeIntervalIndex(100), + Index([]), + MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + Index([0, 0, 1, 1, 2, 2]), +] @pytest.fixture(params=indices_list, ids=lambda x: type(x).__name__) @@ -34,11 +35,12 @@ def one(request): return request.param -zeros = [box([0] * 5, dtype=dtype) - for box in [pd.Index, np.array] - for dtype in [np.int64, np.uint64, np.float64]] -zeros.extend([np.array(0, dtype=dtype) - for dtype in [np.int64, np.uint64, np.float64]]) +zeros = [ + box([0] * 5, dtype=dtype) + for box in [pd.Index, np.array] + for dtype in [np.int64, np.uint64, np.float64] +] +zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]]) zeros.extend([0, 0.0]) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 180033c2d2619..1b3c4e65d252b 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -9,7 +9,6 @@ class DatetimeLike(Base): - def test_argmax_axis_invalid(self): # GH#23081 rng = self.create_index() @@ -36,21 +35,21 @@ def test_str(self): # test the string repr idx = self.create_index() - idx.name = 'foo' + idx.name = "foo" assert not "length=%s" % len(idx) in str(idx) assert "'foo'" in str(idx) assert idx.__class__.__name__ in str(idx) - if hasattr(idx, 'tz'): + if hasattr(idx, "tz"): if idx.tz is not None: assert idx.tz in str(idx) - if hasattr(idx, 'freq'): + if hasattr(idx, "freq"): assert "freq='%s'" % idx.freqstr in str(idx) def test_view(self): i = self.create_index() - i_view = i.view('i8') + i_view = i.view("i8") result = self._holder(i) tm.assert_index_equal(result, i) @@ -72,7 +71,9 @@ def test_map_callable(self): "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): expected = self.index + self.index.freq diff --git a/pandas/tests/indexes/datetimes/test_arithmetic.py b/pandas/tests/indexes/datetimes/test_arithmetic.py index e998f77582c85..4851dd5a55c1e 100644 --- a/pandas/tests/indexes/datetimes/test_arithmetic.py +++ b/pandas/tests/indexes/datetimes/test_arithmetic.py @@ -18,44 +18,56 @@ class TestDatetimeIndexArithmetic: def test_dti_shift_tzaware(self, tz_naive_fixture): # GH#9903 tz = tz_naive_fixture - idx = pd.DatetimeIndex([], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) - - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01 11:00', - '2011-01-01 12:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.DatetimeIndex(['2011-01-01 13:00', '2011-01-01 14:00', - '2011-01-01 15:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = 
pd.DatetimeIndex(['2011-01-01 07:00', '2011-01-01 08:00', - '2011-01-01 09:00'], name='xxx', tz=tz) - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + idx = pd.DatetimeIndex([], name="xxx", tz=tz) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) + + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", "2011-01-01 11:00", "2011-01-01 12:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.DatetimeIndex( + ["2011-01-01 13:00", "2011-01-01 14:00", "2011-01-01 15:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.DatetimeIndex( + ["2011-01-01 07:00", "2011-01-01 08:00", "2011-01-01 09:00"], + name="xxx", + tz=tz, + ) + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) def test_dti_shift_freqs(self): # test shift for DatetimeIndex and non DatetimeIndex # GH#8083 - drange = pd.date_range('20130101', periods=5) + drange = pd.date_range("20130101", periods=5) result = drange.shift(1) - expected = pd.DatetimeIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', - '2013-01-06'], freq='D') + expected = pd.DatetimeIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) tm.assert_index_equal(result, expected) result = drange.shift(-1) - expected = pd.DatetimeIndex(['2012-12-31', '2013-01-01', '2013-01-02', - '2013-01-03', '2013-01-04'], - freq='D') + expected = pd.DatetimeIndex( + ["2012-12-31", "2013-01-01", "2013-01-02", "2013-01-03", "2013-01-04"], + freq="D", + ) tm.assert_index_equal(result, expected) - result = drange.shift(3, freq='2D') - expected = pd.DatetimeIndex(['2013-01-07', '2013-01-08', '2013-01-09', - '2013-01-10', - '2013-01-11'], freq='D') + result = drange.shift(3, freq="2D") + expected = pd.DatetimeIndex( + ["2013-01-07", "2013-01-08", "2013-01-09", "2013-01-10", "2013-01-11"], + freq="D", + ) tm.assert_index_equal(result, expected) def test_dti_shift_int(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 @@ -73,36 +85,39 @@ def test_dti_shift_int(self): def test_dti_shift_no_freq(self): # GH#19147 - dti = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-01'], freq=None) + dti = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None) with pytest.raises(NullFrequencyError): dti.shift(2) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_shift_localized(self, tzstr): - dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') + dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI") dr_tz = dr.tz_localize(tzstr) - result = dr_tz.shift(1, '10T') + result = dr_tz.shift(1, "10T") assert result.tz == dr_tz.tz def test_dti_shift_across_dst(self): # GH 8616 - idx = date_range('2013-11-03', tz='America/Chicago', - periods=7, freq='H') + idx = date_range("2013-11-03", tz="America/Chicago", periods=7, freq="H") s = Series(index=idx[:-1]) - result = s.shift(freq='H') + result = s.shift(freq="H") expected = Series(index=idx[1:]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('shift, result_time', [ - [0, '2014-11-14 00:00:00'], - [-1, '2014-11-13 23:00:00'], - [1, '2014-11-14 01:00:00']]) + @pytest.mark.parametrize( + "shift, result_time", + [ + [0, "2014-11-14 00:00:00"], + [-1, "2014-11-13 23:00:00"], + [1, "2014-11-14 01:00:00"], + ], + ) def 
test_dti_shift_near_midnight(self, shift, result_time): # GH 8616 dt = datetime(2014, 11, 14, 0) - dt_est = pytz.timezone('EST').localize(dt) + dt_est = pytz.timezone("EST").localize(dt) s = Series(data=[1], index=[dt_est]) - result = s.shift(shift, freq='H') - expected = Series(1, index=DatetimeIndex([result_time], tz='EST')) + result = s.shift(shift, freq="H") + expected = Series(1, index=DatetimeIndex([result_time], tz="EST")) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 38a060bb0d1d3..eabf293ae915f 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -8,33 +8,40 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Int64Index, NaT, Period, Series, Timestamp, - date_range) + DatetimeIndex, + Index, + Int64Index, + NaT, + Period, + Series, + Timestamp, + date_range, +) import pandas.util.testing as tm class TestDatetimeIndex: - def test_astype(self): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) result = idx.astype(object) - expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object) + expected = Index([Timestamp("2016-05-16")] + [NaT] * 3, dtype=object) tm.assert_index_equal(result, expected) result = idx.astype(int) - expected = Int64Index([1463356800000000000] + - [-9223372036854775808] * 3, dtype=np.int64) + expected = Int64Index( + [1463356800000000000] + [-9223372036854775808] * 3, dtype=np.int64 + ) tm.assert_index_equal(result, expected) - rng = date_range('1/1/2000', periods=10) - result = rng.astype('i8') + rng = date_range("1/1/2000", periods=10) + result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8)) tm.assert_numpy_array_equal(result.values, rng.asi8) def test_astype_uint(self): - arr = date_range('2000', periods=2) + arr = date_range("2000", periods=2) expected = pd.UInt64Index( np.array([946684800000000000, 946771200000000000], dtype="uint64") ) @@ -45,139 +52,167 @@ def test_astype_uint(self): def test_astype_with_tz(self): # with tz - rng = date_range('1/1/2000', periods=10, tz='US/Eastern') - result = rng.astype('datetime64[ns]') - expected = (date_range('1/1/2000', periods=10, - tz='US/Eastern') - .tz_convert('UTC').tz_localize(None)) + rng = date_range("1/1/2000", periods=10, tz="US/Eastern") + result = rng.astype("datetime64[ns]") + expected = ( + date_range("1/1/2000", periods=10, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ) tm.assert_index_equal(result, expected) # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) - expected = pd.Series( - ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) + result = pd.Series(pd.date_range("2012-01-01", periods=3)).astype(str) + expected = pd.Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) tm.assert_series_equal(result, expected) - result = Series(pd.date_range('2012-01-01', periods=3, - tz='US/Eastern')).astype(str) - expected = Series(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - dtype=object) + result = Series(pd.date_range("2012-01-01", periods=3, tz="US/Eastern")).astype( + str + ) + expected = Series( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + dtype=object, + ) 
tm.assert_series_equal(result, expected) # GH 18951: tz-aware to tz-aware - idx = date_range('20170101', periods=4, tz='US/Pacific') - result = idx.astype('datetime64[ns, US/Eastern]') - expected = date_range('20170101 03:00:00', periods=4, tz='US/Eastern') + idx = date_range("20170101", periods=4, tz="US/Pacific") + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101 03:00:00", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) # GH 18951: tz-naive to tz-aware - idx = date_range('20170101', periods=4) - result = idx.astype('datetime64[ns, US/Eastern]') - expected = date_range('20170101', periods=4, tz='US/Eastern') + idx = date_range("20170101", periods=4) + result = idx.astype("datetime64[ns, US/Eastern]") + expected = date_range("20170101", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) def test_astype_str_compat(self): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) result = idx.astype(str) - expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object) + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) tm.assert_index_equal(result, expected) def test_astype_str(self): # test astype string - #10442 - result = date_range('2012-01-01', periods=4, - name='test_name').astype(str) - expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', - '2012-01-04'], name='test_name', dtype=object) + result = date_range("2012-01-01", periods=4, name="test_name").astype(str) + expected = Index( + ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with tz and name - result = date_range('2012-01-01', periods=3, name='test_name', - tz='US/Eastern').astype(str) - expected = Index(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - name='test_name', dtype=object) + result = date_range( + "2012-01-01", periods=3, name="test_name", tz="US/Eastern" + ).astype(str) + expected = Index( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with freqH and name - result = date_range('1/1/2011', periods=3, freq='H', - name='test_name').astype(str) - expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', - '2011-01-01 02:00:00'], - name='test_name', dtype=object) + result = date_range("1/1/2011", periods=3, freq="H", name="test_name").astype( + str + ) + expected = Index( + ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], + name="test_name", + dtype=object, + ) tm.assert_index_equal(result, expected) # test astype string with freqH and timezone - result = date_range('3/6/2012 00:00', periods=2, freq='H', - tz='Europe/London', name='test_name').astype(str) - expected = Index(['2012-03-06 00:00:00+00:00', - '2012-03-06 01:00:00+00:00'], - dtype=object, name='test_name') + result = date_range( + "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" + ).astype(str) + expected = Index( + ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], + dtype=object, + name="test_name", + ) tm.assert_index_equal(result, expected) def test_astype_datetime64(self): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, 
np.NaN]) + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) - result = idx.astype('datetime64[ns]') + result = idx.astype("datetime64[ns]") tm.assert_index_equal(result, idx) assert result is not idx - result = idx.astype('datetime64[ns]', copy=False) + result = idx.astype("datetime64[ns]", copy=False) tm.assert_index_equal(result, idx) assert result is idx - idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST') - result = idx_tz.astype('datetime64[ns]') - expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]') + idx_tz = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN], tz="EST") + result = idx_tz.astype("datetime64[ns]") + expected = DatetimeIndex( + ["2016-05-16 05:00:00", "NaT", "NaT", "NaT"], dtype="datetime64[ns]" + ) tm.assert_index_equal(result, expected) def test_astype_object(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) - casted = rng.astype('O') + casted = rng.astype("O") exp_values = list(rng) tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) assert casted.tolist() == exp_values - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo"]) def test_astype_object_tz(self, tz): - idx = pd.date_range(start='2013-01-01', periods=4, freq='M', - name='idx', tz=tz) - expected_list = [Timestamp('2013-01-31', tz=tz), - Timestamp('2013-02-28', tz=tz), - Timestamp('2013-03-31', tz=tz), - Timestamp('2013-04-30', tz=tz)] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = pd.date_range(start="2013-01-01", periods=4, freq="M", name="idx", tz=tz) + expected_list = [ + Timestamp("2013-01-31", tz=tz), + Timestamp("2013-02-28", tz=tz), + Timestamp("2013-03-31", tz=tz), + Timestamp("2013-04-30", tz=tz), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list def test_astype_object_with_nat(self): - idx = DatetimeIndex([datetime(2013, 1, 1), datetime(2013, 1, 2), - pd.NaT, datetime(2013, 1, 4)], name='idx') - expected_list = [Timestamp('2013-01-01'), - Timestamp('2013-01-02'), pd.NaT, - Timestamp('2013-01-04')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = DatetimeIndex( + [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, datetime(2013, 1, 4)], + name="idx", + ) + expected_list = [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + pd.NaT, + Timestamp("2013-01-04"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - @pytest.mark.parametrize('dtype', [ - float, 'timedelta64', 'timedelta64[ns]', 'datetime64', - 'datetime64[D]']) + @pytest.mark.parametrize( + "dtype", + [float, "timedelta64", "timedelta64[ns]", "datetime64", "datetime64[D]"], + ) def test_astype_raises(self, dtype): # GH 13149, GH 13209 - idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) - msg = 'Cannot cast DatetimeArray to dtype' + idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.NaN]) + msg = "Cannot cast DatetimeArray to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) @@ -190,9 +225,9 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') - rng_utc = date_range('20090415', '20090519', tz='utc') + rng = 
date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="US/Eastern") + rng_utc = date_range("20090415", "20090519", tz="utc") _check_rng(rng) _check_rng(rng_eastern) @@ -207,10 +242,9 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) - rng_utc = date_range('20090415', '20090519', tz=pytz.utc) + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) + rng_utc = date_range("20090415", "20090519", tz=pytz.utc) _check_rng(rng) _check_rng(rng_eastern) @@ -225,31 +259,32 @@ def _check_rng(rng): assert x == stamp.to_pydatetime() assert x.tzinfo == stamp.tzinfo - rng = date_range('20090415', '20090519') - rng_eastern = date_range('20090415', '20090519', - tz='dateutil/US/Eastern') - rng_utc = date_range('20090415', '20090519', tz=dateutil.tz.tzutc()) + rng = date_range("20090415", "20090519") + rng_eastern = date_range("20090415", "20090519", tz="dateutil/US/Eastern") + rng_utc = date_range("20090415", "20090519", tz=dateutil.tz.tzutc()) _check_rng(rng) _check_rng(rng_eastern) _check_rng(rng_utc) - @pytest.mark.parametrize('tz, dtype', [ - ['US/Pacific', 'datetime64[ns, US/Pacific]'], - [None, 'datetime64[ns]']]) + @pytest.mark.parametrize( + "tz, dtype", + [["US/Pacific", "datetime64[ns, US/Pacific]"], [None, "datetime64[ns]"]], + ) def test_integer_index_astype_datetime(self, tz, dtype): # GH 20997, 20964, 24559 - val = [pd.Timestamp('2018-01-01', tz=tz).value] + val = [pd.Timestamp("2018-01-01", tz=tz).value] result = pd.Index(val).astype(dtype) expected = pd.DatetimeIndex(["2018-01-01"], tz=tz) tm.assert_index_equal(result, expected) class TestToPeriod: - def setup_method(self, method): - data = [Timestamp('2007-01-01 10:11:12.123456Z'), - Timestamp('2007-01-01 10:11:13.789123Z')] + data = [ + Timestamp("2007-01-01 10:11:12.123456Z"), + Timestamp("2007-01-01 10:11:13.789123Z"), + ] self.index = DatetimeIndex(data) def test_to_period_millisecond(self): @@ -257,26 +292,27 @@ def test_to_period_millisecond(self): with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq='L') + period = index.to_period(freq="L") assert 2 == len(period) - assert period[0] == Period('2007-01-01 10:11:12.123Z', 'L') - assert period[1] == Period('2007-01-01 10:11:13.789Z', 'L') + assert period[0] == Period("2007-01-01 10:11:12.123Z", "L") + assert period[1] == Period("2007-01-01 10:11:13.789Z", "L") def test_to_period_microsecond(self): index = self.index with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - period = index.to_period(freq='U') + period = index.to_period(freq="U") assert 2 == len(period) - assert period[0] == Period('2007-01-01 10:11:12.123456Z', 'U') - assert period[1] == Period('2007-01-01 10:11:13.789123Z', 'U') + assert period[0] == Period("2007-01-01 10:11:12.123456Z", "U") + assert period[1] == Period("2007-01-01 10:11:13.789123Z", "U") - @pytest.mark.parametrize('tz', [ - 'US/Eastern', pytz.utc, tzlocal(), 'dateutil/US/Eastern', - dateutil.tz.tzutc()]) + @pytest.mark.parametrize( + "tz", + ["US/Eastern", pytz.utc, tzlocal(), "dateutil/US/Eastern", dateutil.tz.tzutc()], + ) def test_to_period_tz(self, tz): - ts = date_range('1/1/2000', '2/1/2000', tz=tz) + ts = date_range("1/1/2000", "2/1/2000", tz=tz) with 
tm.assert_produces_warning(UserWarning): # GH#21333 warning that timezone info will be lost @@ -285,7 +321,7 @@ def test_to_period_tz(self, tz): assert result == expected - expected = date_range('1/1/2000', '2/1/2000').to_period() + expected = date_range("1/1/2000", "2/1/2000").to_period() with tm.assert_produces_warning(UserWarning): # GH#21333 warning that timezone info will be lost @@ -293,45 +329,44 @@ def test_to_period_tz(self, tz): tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', ['Etc/GMT-1', 'Etc/GMT+1']) + @pytest.mark.parametrize("tz", ["Etc/GMT-1", "Etc/GMT+1"]) def test_to_period_tz_utc_offset_consistency(self, tz): # GH 22905 - ts = pd.date_range('1/1/2000', '2/1/2000', tz='Etc/GMT-1') + ts = pd.date_range("1/1/2000", "2/1/2000", tz="Etc/GMT-1") with tm.assert_produces_warning(UserWarning): result = ts.to_period()[0] expected = ts[0].to_period() assert result == expected def test_to_period_nofreq(self): - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) with pytest.raises(ValueError): idx.to_period() - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], - freq='infer') - assert idx.freqstr == 'D' - expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', - '2000-01-03'], freq='D') + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="infer") + assert idx.freqstr == "D" + expected = pd.PeriodIndex(["2000-01-01", "2000-01-02", "2000-01-03"], freq="D") tm.assert_index_equal(idx.to_period(), expected) # GH 7606 - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert idx.freqstr is None tm.assert_index_equal(idx.to_period(), expected) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_astype_category(self, tz): obj = pd.date_range("2000", periods=2, tz=tz) - result = obj.astype('category') - expected = pd.CategoricalIndex([pd.Timestamp('2000-01-01', tz=tz), - pd.Timestamp('2000-01-02', tz=tz)]) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) tm.assert_index_equal(result, expected) - result = obj._data.astype('category') + result = obj._data.astype("category") expected = expected.values tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_astype_array_fallback(self, tz): obj = pd.date_range("2000", periods=2, tz=tz) result = obj.astype(bool) diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index 56dfbfd485eb1..f22c820253ee5 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -11,26 +11,31 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Timestamp, date_range, datetime, offsets, - to_datetime) + DatetimeIndex, + Index, + Timestamp, + date_range, + datetime, + offsets, + to_datetime, +) from pandas.core.arrays import DatetimeArray, period_array import pandas.util.testing as tm class TestDatetimeIndex: - - @pytest.mark.parametrize('dt_cls', [DatetimeIndex, - DatetimeArray._from_sequence]) + @pytest.mark.parametrize("dt_cls", [DatetimeIndex, DatetimeArray._from_sequence]) def test_freq_validation_with_nat(self, dt_cls): # GH#11587 make sure 
we get a useful error message when generate_range # raises - msg = ("Inferred frequency None from passed values does not conform " - "to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not conform " + "to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, pd.Timestamp('2011-01-01')], freq='D') + dt_cls([pd.NaT, pd.Timestamp("2011-01-01")], freq="D") with pytest.raises(ValueError, match=msg): - dt_cls([pd.NaT, pd.Timestamp('2011-01-01').value], - freq='D') + dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D") def test_categorical_preserves_tz(self): # GH#18664 retain tz when going DTI-->Categorical-->DTI @@ -38,8 +43,8 @@ def test_categorical_preserves_tz(self): # once CategoricalIndex(DTA) works dti = pd.DatetimeIndex( - [pd.NaT, '2015-01-01', '1999-04-06 15:14:13', '2015-01-01'], - tz='US/Eastern') + [pd.NaT, "2015-01-01", "1999-04-06 15:14:13", "2015-01-01"], tz="US/Eastern" + ) ci = pd.CategoricalIndex(dti) carr = pd.Categorical(dti) @@ -51,7 +56,7 @@ def test_categorical_preserves_tz(self): def test_dti_with_period_data_raises(self): # GH#23675 - data = pd.PeriodIndex(['2016Q1', '2016Q2'], freq='Q') + data = pd.PeriodIndex(["2016Q1", "2016Q2"], freq="Q") with pytest.raises(TypeError, match="PeriodDtype data is invalid"): DatetimeIndex(data) @@ -67,59 +72,64 @@ def test_dti_with_period_data_raises(self): def test_dti_with_timedelta64_data_deprecation(self): # GH#23675 - data = np.array([0], dtype='m8[ns]') + data = np.array([0], dtype="m8[ns]") with tm.assert_produces_warning(FutureWarning): result = DatetimeIndex(data) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = to_datetime(data) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning): result = DatetimeIndex(pd.TimedeltaIndex(data)) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = to_datetime(pd.TimedeltaIndex(data)) - assert result[0] == Timestamp('1970-01-01') + assert result[0] == Timestamp("1970-01-01") def test_construction_caching(self): - df = pd.DataFrame({'dt': pd.date_range('20130101', periods=3), - 'dttz': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT, - pd.Timestamp('20130103')], - 'dtns': pd.date_range('20130101', periods=3, - freq='ns')}) - assert df.dttz.dtype.tz.zone == 'US/Eastern' - - @pytest.mark.parametrize('kwargs', [ - {'tz': 'dtype.tz'}, - {'dtype': 'dtype'}, - {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + df = pd.DataFrame( + { + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + assert df.dttz.dtype.tz.zone == "US/Eastern" + + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) def test_construction_with_alt(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} result = 
DatetimeIndex(i, **kwargs) tm.assert_index_equal(i, result) - @pytest.mark.parametrize('kwargs', [ - {'tz': 'dtype.tz'}, - {'dtype': 'dtype'}, - {'dtype': 'dtype', 'tz': 'dtype.tz'}]) + @pytest.mark.parametrize( + "kwargs", + [{"tz": "dtype.tz"}, {"dtype": "dtype"}, {"dtype": "dtype", "tz": "dtype.tz"}], + ) def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tz = tz_aware_fixture - i = pd.date_range('20130101', periods=5, freq='H', tz=tz) + i = pd.date_range("20130101", periods=5, freq="H", tz=tz) kwargs = {key: attrgetter(val)(i) for key, val in kwargs.items()} - if str(tz) in ('UTC', 'tzutc()', 'UTC+00:00'): + if str(tz) in ("UTC", "tzutc()", "UTC+00:00"): warn = None else: warn = FutureWarning @@ -130,81 +140,110 @@ def test_construction_with_alt_tz_localize(self, kwargs, tz_aware_fixture): tm.assert_index_equal(result, expected) # localize into the provided tz - i2 = DatetimeIndex(i.tz_localize(None).asi8, tz='UTC') - expected = i.tz_localize(None).tz_localize('UTC') + i2 = DatetimeIndex(i.tz_localize(None).asi8, tz="UTC") + expected = i.tz_localize(None).tz_localize("UTC") tm.assert_index_equal(i2, expected) # incompat tz/dtype msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(i.tz_localize(None).asi8, - dtype=i.dtype, tz='US/Pacific') + DatetimeIndex(i.tz_localize(None).asi8, dtype=i.dtype, tz="US/Pacific") def test_construction_index_with_mixed_timezones(self): # gh-11488: no tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01'), - Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # same tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], - name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # Different tz results in Index(dtype=object) - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = 
Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) - result = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - exp = Index([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) # length = 1 - result = Index([Timestamp('2011-01-01')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') + result = Index([Timestamp("2011-01-01")], name="idx") + exp = DatetimeIndex([Timestamp("2011-01-01")], name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # length = 1 with tz - result = Index( - [Timestamp('2011-01-01 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00')], tz='Asia/Tokyo', - name='idx') + result = Index([Timestamp("2011-01-01 10:00", tz="Asia/Tokyo")], name="idx") + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00")], tz="Asia/Tokyo", name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None @@ -212,70 +251,117 @@ def test_construction_index_with_mixed_timezones(self): def test_construction_index_with_mixed_timezones_with_NaT(self): # see gh-11488 - result = Index([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01'), - pd.NaT, Timestamp('2011-01-02')], name='idx') + result = Index( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) + exp = DatetimeIndex( + [pd.NaT, Timestamp("2011-01-01"), pd.NaT, Timestamp("2011-01-02")], + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # Same tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00"), + ], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # same tz results in DatetimeIndex (DST) - result = Index([Timestamp('2011-01-01 10:00', 
tz='US/Eastern'), - pd.NaT, - Timestamp('2011-08-01 10:00', tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), pd.NaT, - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = Index( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.NaT, + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), pd.NaT, Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is not None assert result.tz == exp.tz # different tz results in Index(dtype=object) - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) - result = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], name='idx') - exp = Index([pd.NaT, Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - dtype='object', name='idx') + result = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + dtype="object", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert not isinstance(result, DatetimeIndex) # all NaT - result = Index([pd.NaT, pd.NaT], name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') + result = Index([pd.NaT, pd.NaT], name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) assert result.tz is None # all NaT with tz - result = Index([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, pd.NaT], tz='Asia/Tokyo', name='idx') + result = Index([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") + exp = DatetimeIndex([pd.NaT, pd.NaT], tz="Asia/Tokyo", name="idx") tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) @@ -287,74 +373,109 @@ def test_construction_dti_with_mixed_timezones(self): # no tz results in DatetimeIndex result = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + [Timestamp("2011-01-01"), Timestamp("2011-01-02")], name="idx" + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # same tz results in DatetimeIndex - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', - tz='Asia/Tokyo')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 
10:00'), - Timestamp('2011-01-02 10:00')], - tz='Asia/Tokyo', name='idx') + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="Asia/Tokyo"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00")], + tz="Asia/Tokyo", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # same tz results in DatetimeIndex (DST) - result = DatetimeIndex([Timestamp('2011-01-01 10:00', tz='US/Eastern'), - Timestamp('2011-08-01 10:00', - tz='US/Eastern')], - name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-08-01 10:00')], - tz='US/Eastern', name='idx') + result = DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="US/Eastern"), + Timestamp("2011-08-01 10:00", tz="US/Eastern"), + ], + name="idx", + ) + exp = DatetimeIndex( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-08-01 10:00")], + tz="US/Eastern", + name="idx", + ) tm.assert_index_equal(result, exp, exact=True) assert isinstance(result, DatetimeIndex) # tz mismatch affecting to tz-aware raises TypeError/ValueError with pytest.raises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - name='idx') - - msg = 'cannot be converted to datetime64' + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + name="idx", + ) + + msg = "cannot be converted to datetime64" with pytest.raises(ValueError, match=msg): - DatetimeIndex([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) with pytest.raises(ValueError): - DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='US/Eastern', name='idx') + DatetimeIndex( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="US/Eastern", + name="idx", + ) with pytest.raises(ValueError, match=msg): # passing tz should results in DatetimeIndex, then mismatch raises # TypeError - Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') + Index( + [ + pd.NaT, + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-02 10:00", tz="US/Eastern"), + ], + tz="Asia/Tokyo", + name="idx", + ) def test_construction_base_constructor(self): - arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + arr = [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) - arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + arr = [np.nan, pd.NaT, pd.Timestamp("2011-01-03")] tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.DatetimeIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.DatetimeIndex(np.array(arr))) def test_construction_outofbounds(self): # GH 13663 - dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1)] + 
dates = [ + datetime(3000, 1, 1), + datetime(4000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + ] exp = Index(dates, dtype=object) # coerces to object tm.assert_index_equal(Index(dates), exp) @@ -365,186 +486,202 @@ def test_construction_outofbounds(self): def test_construction_with_ndarray(self): # GH 5152 - dates = [datetime(2013, 10, 7), - datetime(2013, 10, 8), - datetime(2013, 10, 9)] + dates = [datetime(2013, 10, 7), datetime(2013, 10, 8), datetime(2013, 10, 9)] data = DatetimeIndex(dates, freq=pd.offsets.BDay()).values result = DatetimeIndex(data, freq=pd.offsets.BDay()) - expected = DatetimeIndex(['2013-10-07', - '2013-10-08', - '2013-10-09'], - freq='B') + expected = DatetimeIndex(["2013-10-07", "2013-10-08", "2013-10-09"], freq="B") tm.assert_index_equal(result, expected) def test_verify_integrity_deprecated(self): # GH#23919 with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(['1/1/2000'], verify_integrity=False) + DatetimeIndex(["1/1/2000"], verify_integrity=False) def test_range_kwargs_deprecated(self): # GH#23919 with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start='1/1/2000', end='1/10/2000', freq='D') + DatetimeIndex(start="1/1/2000", end="1/10/2000", freq="D") def test_integer_values_and_tz_deprecated(self): # GH-24559 values = np.array([946684800000000000]) with tm.assert_produces_warning(FutureWarning): - result = DatetimeIndex(values, tz='US/Central') - expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central") + result = DatetimeIndex(values, tz="US/Central") + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") tm.assert_index_equal(result, expected) # but UTC is *not* deprecated. with tm.assert_produces_warning(None): - result = DatetimeIndex(values, tz='UTC') - expected = pd.DatetimeIndex(['2000-01-01T00:00:00'], tz="US/Central") + result = DatetimeIndex(values, tz="UTC") + expected = pd.DatetimeIndex(["2000-01-01T00:00:00"], tz="US/Central") def test_constructor_coverage(self): - rng = date_range('1/1/2000', periods=10.5) - exp = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10.5) + exp = date_range("1/1/2000", periods=10) tm.assert_index_equal(rng, exp) - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - date_range(start='1/1/2000', periods='foo', freq='D') + date_range(start="1/1/2000", periods="foo", freq="D") with pytest.raises(ValueError): with tm.assert_produces_warning(FutureWarning): - DatetimeIndex(start='1/1/2000', end='1/10/2000') + DatetimeIndex(start="1/1/2000", end="1/10/2000") with pytest.raises(TypeError): - DatetimeIndex('1/1/2000') + DatetimeIndex("1/1/2000") # generator expression gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) result = DatetimeIndex(gen) - expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) - for i in range(10)]) + expected = DatetimeIndex( + [datetime(2000, 1, 1) + timedelta(i) for i in range(10)] + ) tm.assert_index_equal(result, expected) # NumPy string array - strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) + strings = np.array(["2000-01-01", "2000-01-02", "2000-01-03"]) result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) + expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # string with NaT - strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) + strings = 
np.array(["2000-01-01", "2000-01-02", "NaT"]) result = DatetimeIndex(strings) - expected = DatetimeIndex(strings.astype('O')) + expected = DatetimeIndex(strings.astype("O")) tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) tm.assert_index_equal(from_ints, expected) # non-conforming - msg = ("Inferred frequency None from passed values does not conform" - " to passed frequency D") + msg = ( + "Inferred frequency None from passed values does not conform" + " to passed frequency D" + ) with pytest.raises(ValueError, match=msg): - DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04'], freq='D') + DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") - msg = ("Of the four parameters: start, end, periods, and freq, exactly" - " three must be specified") + msg = ( + "Of the four parameters: start, end, periods, and freq, exactly" + " three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start='2011-01-01', freq='b') + date_range(start="2011-01-01", freq="b") with pytest.raises(ValueError, match=msg): - date_range(end='2011-01-01', freq='B') + date_range(end="2011-01-01", freq="B") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq='D') + date_range(periods=10, freq="D") - @pytest.mark.parametrize('freq', ['AS', 'W-SUN']) + @pytest.mark.parametrize("freq", ["AS", "W-SUN"]) def test_constructor_datetime64_tzformat(self, freq): # see GH#6572: ISO 8601 format results in pytz.FixedOffset - idx = date_range('2013-01-01T00:00:00-05:00', - '2016-01-01T23:59:59-05:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) + idx = date_range( + "2013-01-01T00:00:00-05:00", "2016-01-01T23:59:59-05:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) tm.assert_index_equal(idx, expected) # Unable to use `US/Eastern` because of DST - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - idx = date_range('2013-01-01T00:00:00+09:00', - '2016-01-01T23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) + idx = date_range( + "2013-01-01T00:00:00+09:00", "2016-01-01T23:59:59+09:00", freq=freq + ) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) # Non ISO 8601 format results in dateutil.tz.tzoffset - idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', - freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(-300)) + idx = date_range("2013/1/1 0:00:00-5:00", "2016/1/1 23:59:59-5:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(-300), + ) tm.assert_index_equal(idx, expected) # Unable to use `US/Eastern` because of DST - expected_i8 = 
date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='America/Lima') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="America/Lima" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) - idx = date_range('2013/1/1 0:00:00+9:00', - '2016/1/1 23:59:59+09:00', freq=freq) - expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=pytz.FixedOffset(540)) + idx = date_range("2013/1/1 0:00:00+9:00", "2016/1/1 23:59:59+09:00", freq=freq) + expected = date_range( + "2013-01-01T00:00:00", + "2016-01-01T23:59:59", + freq=freq, + tz=pytz.FixedOffset(540), + ) tm.assert_index_equal(idx, expected) - expected_i8 = date_range('2013-01-01T00:00:00', - '2016-01-01T23:59:59', freq=freq, - tz='Asia/Tokyo') + expected_i8 = date_range( + "2013-01-01T00:00:00", "2016-01-01T23:59:59", freq=freq, tz="Asia/Tokyo" + ) tm.assert_numpy_array_equal(idx.asi8, expected_i8.asi8) def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - expected = DatetimeIndex(['2013-01-01', '2013-01-02'] - ).tz_localize('US/Eastern') + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + expected = DatetimeIndex(["2013-01-01", "2013-01-02"]).tz_localize("US/Eastern") tm.assert_index_equal(idx, expected) - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - tz='US/Eastern') + idx = DatetimeIndex(["2013-01-01", "2013-01-02"], tz="US/Eastern") tm.assert_index_equal(idx, expected) # if we already have a tz and its not the same, then raise - idx = DatetimeIndex(['2013-01-01', '2013-01-02'], - dtype='datetime64[ns, US/Eastern]') - - msg = ("cannot supply both a tz and a timezone-naive dtype" - r" \(i\.e\. datetime64\[ns\]\)") + idx = DatetimeIndex( + ["2013-01-01", "2013-01-02"], dtype="datetime64[ns, US/Eastern]" + ) + + msg = ( + "cannot supply both a tz and a timezone-naive dtype" + r" \(i\.e\. 
datetime64\[ns\]\)" + ) with pytest.raises(ValueError, match=msg): - DatetimeIndex(idx, dtype='datetime64[ns]') + DatetimeIndex(idx, dtype="datetime64[ns]") # this is effectively trying to convert tz's - msg = ("data is already tz-aware US/Eastern, unable to set specified" - " tz: CET") + msg = "data is already tz-aware US/Eastern, unable to set specified" " tz: CET" with pytest.raises(TypeError, match=msg): - DatetimeIndex(idx, dtype='datetime64[ns, CET]') + DatetimeIndex(idx, dtype="datetime64[ns, CET]") msg = "cannot supply both a tz and a dtype with a tz" with pytest.raises(ValueError, match=msg): - DatetimeIndex(idx, tz='CET', dtype='datetime64[ns, US/Eastern]') + DatetimeIndex(idx, tz="CET", dtype="datetime64[ns, US/Eastern]") - result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') + result = DatetimeIndex(idx, dtype="datetime64[ns, US/Eastern]") tm.assert_index_equal(idx, result) - @pytest.mark.parametrize('dtype', [object, np.int32, np.int64]) + @pytest.mark.parametrize("dtype", [object, np.int32, np.int64]) def test_constructor_invalid_dtype_raises(self, dtype): # GH 23986 with pytest.raises(ValueError): DatetimeIndex([1, 2], dtype=dtype) def test_constructor_name(self): - idx = date_range(start='2000-01-01', periods=1, freq='A', - name='TEST') - assert idx.name == 'TEST' + idx = date_range(start="2000-01-01", periods=1, freq="A", name="TEST") + assert idx.name == "TEST" def test_000constructor_resolution(self): # 2252 @@ -555,55 +692,68 @@ def test_000constructor_resolution(self): def test_disallow_setting_tz(self): # GH 3746 - dti = DatetimeIndex(['2010'], tz='UTC') + dti = DatetimeIndex(["2010"], tz="UTC") with pytest.raises(AttributeError): - dti.tz = pytz.timezone('US/Pacific') - - @pytest.mark.parametrize('tz', [ - None, 'America/Los_Angeles', pytz.timezone('America/Los_Angeles'), - Timestamp('2000', tz='America/Los_Angeles').tz]) + dti.tz = pytz.timezone("US/Pacific") + + @pytest.mark.parametrize( + "tz", + [ + None, + "America/Los_Angeles", + pytz.timezone("America/Los_Angeles"), + Timestamp("2000", tz="America/Los_Angeles").tz, + ], + ) def test_constructor_start_end_with_tz(self, tz): # GH 18595 - start = Timestamp('2013-01-01 06:00:00', tz='America/Los_Angeles') - end = Timestamp('2013-01-02 06:00:00', tz='America/Los_Angeles') - result = date_range(freq='D', start=start, end=end, tz=tz) - expected = DatetimeIndex(['2013-01-01 06:00:00', - '2013-01-02 06:00:00'], - tz='America/Los_Angeles') + start = Timestamp("2013-01-01 06:00:00", tz="America/Los_Angeles") + end = Timestamp("2013-01-02 06:00:00", tz="America/Los_Angeles") + result = date_range(freq="D", start=start, end=end, tz=tz) + expected = DatetimeIndex( + ["2013-01-01 06:00:00", "2013-01-02 06:00:00"], tz="America/Los_Angeles" + ) tm.assert_index_equal(result, expected) # Especially assert that the timezone is consistent for pytz - assert pytz.timezone('America/Los_Angeles') is result.tz + assert pytz.timezone("America/Los_Angeles") is result.tz - @pytest.mark.parametrize('tz', ['US/Pacific', 'US/Eastern', 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", ["US/Pacific", "US/Eastern", "Asia/Tokyo"]) def test_constructor_with_non_normalized_pytz(self, tz): # GH 18595 - non_norm_tz = Timestamp('2010', tz=tz).tz - result = DatetimeIndex(['2010'], tz=non_norm_tz) + non_norm_tz = Timestamp("2010", tz=tz).tz + result = DatetimeIndex(["2010"], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz def test_constructor_timestamp_near_dst(self): # GH 20854 - ts = [Timestamp('2016-10-30 03:00:00+0300', 
tz='Europe/Helsinki'), - Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')] + ts = [ + Timestamp("2016-10-30 03:00:00+0300", tz="Europe/Helsinki"), + Timestamp("2016-10-30 03:00:00+0200", tz="Europe/Helsinki"), + ] result = DatetimeIndex(ts) - expected = DatetimeIndex([ts[0].to_pydatetime(), - ts[1].to_pydatetime()]) + expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) # TODO(GH-24559): Remove the xfail for the tz-aware case. - @pytest.mark.parametrize('klass', [Index, DatetimeIndex]) - @pytest.mark.parametrize('box', [ - np.array, partial(np.array, dtype=object), list]) - @pytest.mark.parametrize('tz, dtype', [ - pytest.param('US/Pacific', 'datetime64[ns, US/Pacific]', - marks=[pytest.mark.xfail(), - pytest.mark.filterwarnings( - "ignore:\\n Passing:FutureWarning")]), - [None, 'datetime64[ns]'], - ]) + @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) + @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) + @pytest.mark.parametrize( + "tz, dtype", + [ + pytest.param( + "US/Pacific", + "datetime64[ns, US/Pacific]", + marks=[ + pytest.mark.xfail(), + pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning"), + ], + ), + [None, "datetime64[ns]"], + ], + ) def test_constructor_with_int_tz(self, klass, box, tz, dtype): # GH 20997, 20964 - ts = Timestamp('2018-01-01', tz=tz) + ts = Timestamp("2018-01-01", tz=tz) result = klass(box([ts.value]), dtype=dtype) expected = klass([ts]) assert result == expected @@ -621,56 +771,61 @@ def test_construction_int_rountrip(self, tz_naive_fixture): def test_construction_from_replaced_timestamps_with_dst(self): # GH 18785 - index = pd.date_range(pd.Timestamp(2000, 1, 1), - pd.Timestamp(2005, 1, 1), - freq='MS', tz='Australia/Melbourne') - test = pd.DataFrame({'data': range(len(index))}, index=index) - test = test.resample('Y').mean() - result = pd.DatetimeIndex([x.replace(month=6, day=1) - for x in test.index]) - expected = pd.DatetimeIndex(['2000-06-01 00:00:00', - '2001-06-01 00:00:00', - '2002-06-01 00:00:00', - '2003-06-01 00:00:00', - '2004-06-01 00:00:00', - '2005-06-01 00:00:00'], - tz='Australia/Melbourne') + index = pd.date_range( + pd.Timestamp(2000, 1, 1), + pd.Timestamp(2005, 1, 1), + freq="MS", + tz="Australia/Melbourne", + ) + test = pd.DataFrame({"data": range(len(index))}, index=index) + test = test.resample("Y").mean() + result = pd.DatetimeIndex([x.replace(month=6, day=1) for x in test.index]) + expected = pd.DatetimeIndex( + [ + "2000-06-01 00:00:00", + "2001-06-01 00:00:00", + "2002-06-01 00:00:00", + "2003-06-01 00:00:00", + "2004-06-01 00:00:00", + "2005-06-01 00:00:00", + ], + tz="Australia/Melbourne", + ) tm.assert_index_equal(result, expected) def test_construction_with_tz_and_tz_aware_dti(self): # GH 23579 - dti = date_range('2016-01-01', periods=3, tz='US/Central') + dti = date_range("2016-01-01", periods=3, tz="US/Central") with pytest.raises(TypeError): - DatetimeIndex(dti, tz='Asia/Tokyo') + DatetimeIndex(dti, tz="Asia/Tokyo") def test_construction_with_nat_and_tzlocal(self): tz = dateutil.tz.tzlocal() - result = DatetimeIndex(['2018', 'NaT'], tz=tz) - expected = DatetimeIndex([Timestamp('2018', tz=tz), pd.NaT]) + result = DatetimeIndex(["2018", "NaT"], tz=tz) + expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) tm.assert_index_equal(result, expected) def test_constructor_no_precision_warns(self): # GH-24753, GH-24739 - expected = pd.DatetimeIndex(['2000'], dtype='datetime64[ns]') + expected = 
pd.DatetimeIndex(["2000"], dtype="datetime64[ns]") # we set the stacklevel for DatetimeIndex with tm.assert_produces_warning(FutureWarning): - result = pd.DatetimeIndex(['2000'], dtype='datetime64') + result = pd.DatetimeIndex(["2000"], dtype="datetime64") tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = pd.Index(['2000'], dtype='datetime64') + result = pd.Index(["2000"], dtype="datetime64") tm.assert_index_equal(result, expected) def test_constructor_wrong_precision_raises(self): with pytest.raises(ValueError): - pd.DatetimeIndex(['2000'], dtype='datetime64[us]') + pd.DatetimeIndex(["2000"], dtype="datetime64[us]") class TestTimeSeries: - def test_dti_constructor_preserve_dti_freq(self): - rng = date_range('1/1/2000', '1/2/2000', freq='5min') + rng = date_range("1/1/2000", "1/2/2000", freq="5min") rng2 = DatetimeIndex(rng) assert rng.freq == rng2.freq @@ -678,43 +833,51 @@ def test_dti_constructor_preserve_dti_freq(self): def test_dti_constructor_years_only(self, tz_naive_fixture): tz = tz_naive_fixture # GH 6961 - rng1 = date_range('2014', '2015', freq='M', tz=tz) - expected1 = date_range('2014-01-31', '2014-12-31', freq='M', tz=tz) + rng1 = date_range("2014", "2015", freq="M", tz=tz) + expected1 = date_range("2014-01-31", "2014-12-31", freq="M", tz=tz) - rng2 = date_range('2014', '2015', freq='MS', tz=tz) - expected2 = date_range('2014-01-01', '2015-01-01', freq='MS', tz=tz) + rng2 = date_range("2014", "2015", freq="MS", tz=tz) + expected2 = date_range("2014-01-01", "2015-01-01", freq="MS", tz=tz) - rng3 = date_range('2014', '2020', freq='A', tz=tz) - expected3 = date_range('2014-12-31', '2019-12-31', freq='A', tz=tz) + rng3 = date_range("2014", "2020", freq="A", tz=tz) + expected3 = date_range("2014-12-31", "2019-12-31", freq="A", tz=tz) - rng4 = date_range('2014', '2020', freq='AS', tz=tz) - expected4 = date_range('2014-01-01', '2020-01-01', freq='AS', tz=tz) + rng4 = date_range("2014", "2020", freq="AS", tz=tz) + expected4 = date_range("2014-01-01", "2020-01-01", freq="AS", tz=tz) - for rng, expected in [(rng1, expected1), (rng2, expected2), - (rng3, expected3), (rng4, expected4)]: + for rng, expected in [ + (rng1, expected1), + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: tm.assert_index_equal(rng, expected) def test_dti_constructor_small_int(self, any_int_dtype): # see gh-13721 - exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', - '1970-01-01 00:00:00.00000001', - '1970-01-01 00:00:00.00000002']) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.00000000", + "1970-01-01 00:00:00.00000001", + "1970-01-01 00:00:00.00000002", + ] + ) arr = np.array([0, 10, 20], dtype=any_int_dtype) tm.assert_index_equal(DatetimeIndex(arr), exp) def test_ctor_str_intraday(self): - rng = DatetimeIndex(['1-1-2000 00:00:01']) + rng = DatetimeIndex(["1-1-2000 00:00:01"]) assert rng[0].second == 1 def test_is_(self): - dti = date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") assert dti.is_(dti) assert dti.is_(dti.view()) assert not dti.is_(dti.copy()) def test_index_cast_datetime64_other_units(self): - arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + arr = np.arange(0, 100, 10, dtype=np.int64).view("M8[D]") idx = Index(arr) assert (idx.values == conversion.ensure_datetime64ns(arr)).all() @@ -733,48 +896,48 @@ def test_constructor_int64_nocopy(self): arr[50:100] = -1 assert (index.asi8[50:100] != -1).all() - 
@pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', 'BH', - 'T', 'S', 'L', 'U', 'H', 'N', 'C']) + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "BH", "T", "S", "L", "U", "H", "N", "C"] + ) def test_from_freq_recreate_from_data(self, freq): - org = date_range(start='2001/02/01 09:00', freq=freq, periods=1) + org = date_range(start="2001/02/01 09:00", freq=freq, periods=1) idx = DatetimeIndex(org, freq=freq) tm.assert_index_equal(idx, org) - org = date_range(start='2001/02/01 09:00', freq=freq, - tz='US/Pacific', periods=1) - idx = DatetimeIndex(org, freq=freq, tz='US/Pacific') + org = date_range( + start="2001/02/01 09:00", freq=freq, tz="US/Pacific", periods=1 + ) + idx = DatetimeIndex(org, freq=freq, tz="US/Pacific") tm.assert_index_equal(idx, org) def test_datetimeindex_constructor_misc(self): - arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + arr = ["1/1/2005", "1/2/2005", "Jn 3, 2005", "2005-01-04"] msg = r"(\(')?Unknown string format(:', 'Jn 3, 2005'\))?" with pytest.raises(ValueError, match=msg): DatetimeIndex(arr) - arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + arr = ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] idx1 = DatetimeIndex(arr) - arr = [datetime(2005, 1, 1), '1/2/2005', '1/3/2005', '2005-01-04'] + arr = [datetime(2005, 1, 1), "1/2/2005", "1/3/2005", "2005-01-04"] idx2 = DatetimeIndex(arr) - arr = [Timestamp(datetime(2005, 1, 1)), '1/2/2005', '1/3/2005', - '2005-01-04'] + arr = [Timestamp(datetime(2005, 1, 1)), "1/2/2005", "1/3/2005", "2005-01-04"] idx3 = DatetimeIndex(arr) - arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', - '2005-01-04'], dtype='O') + arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) idx5 = DatetimeIndex(arr) - arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04' - ]) + arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) idx6 = DatetimeIndex(arr) - idx7 = DatetimeIndex(['12/05/2007', '25/01/2008'], dayfirst=True) - idx8 = DatetimeIndex(['2007/05/12', '2008/01/25'], dayfirst=False, - yearfirst=True) + idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx8 = DatetimeIndex( + ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True + ) tm.assert_index_equal(idx7, idx8) for other in [idx2, idx3, idx4, idx5, idx6]: @@ -782,30 +945,31 @@ def test_datetimeindex_constructor_misc(self): sdate = datetime(1999, 12, 25) edate = datetime(2000, 1, 1) - idx = date_range(start=sdate, freq='1B', periods=20) + idx = date_range(start=sdate, freq="1B", periods=20) assert len(idx) == 20 assert idx[0] == sdate + 0 * offsets.BDay() - assert idx.freq == 'B' + assert idx.freq == "B" - idx = date_range(end=edate, freq=('D', 5), periods=20) + idx = date_range(end=edate, freq=("D", 5), periods=20) assert len(idx) == 20 assert idx[-1] == edate - assert idx.freq == '5D' + assert idx.freq == "5D" - idx1 = date_range(start=sdate, end=edate, freq='W-SUN') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.Week(weekday=6)) + idx1 = date_range(start=sdate, end=edate, freq="W-SUN") + idx2 = date_range(start=sdate, end=edate, freq=offsets.Week(weekday=6)) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq - idx1 = date_range(start=sdate, end=edate, freq='QS') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.QuarterBegin(startingMonth=1)) + idx1 = 
date_range(start=sdate, end=edate, freq="QS") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.QuarterBegin(startingMonth=1) + ) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq - idx1 = date_range(start=sdate, end=edate, freq='BQ') - idx2 = date_range(start=sdate, end=edate, - freq=offsets.BQuarterEnd(startingMonth=12)) + idx1 = date_range(start=sdate, end=edate, freq="BQ") + idx2 = date_range( + start=sdate, end=edate, freq=offsets.BQuarterEnd(startingMonth=12) + ) assert len(idx1) == len(idx2) assert idx1.freq == idx2.freq diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 1545cc52eb1f4..54c931cd60d20 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -18,7 +18,13 @@ import pandas.util.testing as tm from pandas.tseries.offsets import ( - BDay, CDay, DateOffset, MonthEnd, generate_range, prefix_mapping) + BDay, + CDay, + DateOffset, + MonthEnd, + generate_range, + prefix_mapping, +) START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) @@ -30,50 +36,48 @@ class TestTimestampEquivDateRange: # pertinent cases. def test_date_range_timestamp_equiv(self): - rng = date_range('20090415', '20090519', tz='US/Eastern') + rng = date_range("20090415", "20090519", tz="US/Eastern") stamp = rng[0] - ts = Timestamp('20090415', tz='US/Eastern', freq='D') + ts = Timestamp("20090415", tz="US/Eastern", freq="D") assert ts == stamp def test_date_range_timestamp_equiv_dateutil(self): - rng = date_range('20090415', '20090519', tz='dateutil/US/Eastern') + rng = date_range("20090415", "20090519", tz="dateutil/US/Eastern") stamp = rng[0] - ts = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D') + ts = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") assert ts == stamp def test_date_range_timestamp_equiv_explicit_pytz(self): - rng = date_range('20090415', '20090519', - tz=pytz.timezone('US/Eastern')) + rng = date_range("20090415", "20090519", tz=pytz.timezone("US/Eastern")) stamp = rng[0] - ts = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D') + ts = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") assert ts == stamp @td.skip_if_windows_python_3 def test_date_range_timestamp_equiv_explicit_dateutil(self): from pandas._libs.tslibs.timezones import dateutil_gettz as gettz - rng = date_range('20090415', '20090519', tz=gettz('US/Eastern')) + rng = date_range("20090415", "20090519", tz=gettz("US/Eastern")) stamp = rng[0] - ts = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') + ts = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") assert ts == stamp def test_date_range_timestamp_equiv_from_datetime_instance(self): datetime_instance = datetime(2014, 3, 4) # build a timestamp with a frequency, since then it supports # addition/subtraction of integers - timestamp_instance = date_range(datetime_instance, periods=1, - freq='D')[0] + timestamp_instance = date_range(datetime_instance, periods=1, freq="D")[0] - ts = Timestamp(datetime_instance, freq='D') + ts = Timestamp(datetime_instance, freq="D") assert ts == timestamp_instance def test_date_range_timestamp_equiv_preserve_frequency(self): - timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] - ts = Timestamp('2014-03-05', freq='D') + timestamp_instance = date_range("2014-03-05", periods=1, freq="D")[0] + ts = Timestamp("2014-03-05", freq="D") assert timestamp_instance == ts @@ -83,9 +87,9 @@ def test_date_range_nat(self): # GH#11587 
msg = "Neither `start` nor `end` can be NaT" with pytest.raises(ValueError, match=msg): - date_range(start='2016-01-01', end=pd.NaT, freq='D') + date_range(start="2016-01-01", end=pd.NaT, freq="D") with pytest.raises(ValueError, match=msg): - date_range(start=pd.NaT, end='2016-01-01', freq='D') + date_range(start=pd.NaT, end="2016-01-01", freq="D") def test_date_range_multiplication_overflow(self): # GH#24255 @@ -93,25 +97,25 @@ def test_date_range_multiplication_overflow(self): # are caught with tm.assert_produces_warning(None): # we should _not_ be seeing a overflow RuntimeWarning - dti = date_range(start='1677-09-22', periods=213503, freq='D') + dti = date_range(start="1677-09-22", periods=213503, freq="D") - assert dti[0] == Timestamp('1677-09-22') + assert dti[0] == Timestamp("1677-09-22") assert len(dti) == 213503 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range('1969-05-04', periods=200000000, freq='30000D') + date_range("1969-05-04", periods=200000000, freq="30000D") def test_date_range_unsigned_overflow_handling(self): # GH#24255 # case where `addend = periods * stride` overflows int64 bounds # but not uint64 bounds - dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') + dti = date_range(start="1677-09-22", end="2262-04-11", freq="D") - dti2 = date_range(start=dti[0], periods=len(dti), freq='D') + dti2 = date_range(start=dti[0], periods=len(dti), freq="D") assert dti2.equals(dti) - dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') + dti3 = date_range(end=dti[-1], periods=len(dti), freq="D") assert dti3.equals(dti) def test_date_range_int64_overflow_non_recoverable(self): @@ -119,217 +123,257 @@ def test_date_range_int64_overflow_non_recoverable(self): # case with start later than 1970-01-01, overflow int64 but not uint64 msg = "Cannot generate range with" with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(start='1970-02-01', periods=106752 * 24, freq='H') + date_range(start="1970-02-01", periods=106752 * 24, freq="H") # case with end before 1970-01-01, overflow int64 but not uint64 with pytest.raises(OutOfBoundsDatetime, match=msg): - date_range(end='1969-11-14', periods=106752 * 24, freq='H') + date_range(end="1969-11-14", periods=106752 * 24, freq="H") def test_date_range_int64_overflow_stride_endpoint_different_signs(self): # cases where stride * periods overflow int64 and stride/endpoint # have different signs - start = Timestamp('2262-02-23') - end = Timestamp('1969-11-14') + start = Timestamp("2262-02-23") + end = Timestamp("1969-11-14") - expected = date_range(start=start, end=end, freq='-1H') + expected = date_range(start=start, end=end, freq="-1H") assert expected[0] == start assert expected[-1] == end - dti = date_range(end=end, periods=len(expected), freq='-1H') + dti = date_range(end=end, periods=len(expected), freq="-1H") tm.assert_index_equal(dti, expected) - start2 = Timestamp('1970-02-01') - end2 = Timestamp('1677-10-22') + start2 = Timestamp("1970-02-01") + end2 = Timestamp("1677-10-22") - expected2 = date_range(start=start2, end=end2, freq='-1H') + expected2 = date_range(start=start2, end=end2, freq="-1H") assert expected2[0] == start2 assert expected2[-1] == end2 - dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') + dti2 = date_range(start=start2, periods=len(expected2), freq="-1H") tm.assert_index_equal(dti2, expected2) def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): - date_range('2016-01-01', periods=100000, 
freq='D') + date_range("2016-01-01", periods=100000, freq="D") with pytest.raises(OutOfBoundsDatetime): - date_range(end='1763-10-12', periods=100000, freq='D') + date_range(end="1763-10-12", periods=100000, freq="D") def test_date_range_gen_error(self): - rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') + rng = date_range("1/1/2000 00:00", "1/1/2000 00:18", freq="5min") assert len(rng) == 4 @pytest.mark.parametrize("freq", ["AS", "YS"]) def test_begin_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-01-01", "2014-01-01", - "2015-01-01", "2016-01-01", - "2017-01-01"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-01-01", "2014-01-01", "2015-01-01", "2016-01-01", "2017-01-01"], + freq=freq, + ) tm.assert_index_equal(rng, exp) @pytest.mark.parametrize("freq", ["A", "Y"]) def test_end_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", - "2015-12-31", "2016-12-31"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-31"], freq=freq + ) tm.assert_index_equal(rng, exp) @pytest.mark.parametrize("freq", ["BA", "BY"]) def test_business_end_year_alias(self, freq): # see gh-9313 rng = date_range("1/1/2013", "7/1/2017", freq=freq) - exp = pd.DatetimeIndex(["2013-12-31", "2014-12-31", - "2015-12-31", "2016-12-30"], freq=freq) + exp = pd.DatetimeIndex( + ["2013-12-31", "2014-12-31", "2015-12-31", "2016-12-30"], freq=freq + ) tm.assert_index_equal(rng, exp) def test_date_range_negative_freq(self): # GH 11018 - rng = date_range('2011-12-31', freq='-2A', periods=3) - exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31', - '2007-12-31'], freq='-2A') + rng = date_range("2011-12-31", freq="-2A", periods=3) + exp = pd.DatetimeIndex(["2011-12-31", "2009-12-31", "2007-12-31"], freq="-2A") tm.assert_index_equal(rng, exp) - assert rng.freq == '-2A' + assert rng.freq == "-2A" - rng = date_range('2011-01-31', freq='-2M', periods=3) - exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30', - '2010-09-30'], freq='-2M') + rng = date_range("2011-01-31", freq="-2M", periods=3) + exp = pd.DatetimeIndex(["2011-01-31", "2010-11-30", "2010-09-30"], freq="-2M") tm.assert_index_equal(rng, exp) - assert rng.freq == '-2M' + assert rng.freq == "-2M" def test_date_range_bms_bug(self): # #1645 - rng = date_range('1/1/2000', periods=10, freq='BMS') + rng = date_range("1/1/2000", periods=10, freq="BMS") - ex_first = Timestamp('2000-01-03') + ex_first = Timestamp("2000-01-03") assert rng[0] == ex_first def test_date_range_normalize(self): snap = datetime.today() n = 50 - rng = date_range(snap, periods=n, normalize=False, freq='2D') + rng = date_range(snap, periods=n, normalize=False, freq="2D") offset = timedelta(2) values = DatetimeIndex([snap + i * offset for i in range(n)]) tm.assert_index_equal(rng, values) - rng = date_range('1/1/2000 08:15', periods=n, normalize=False, - freq='B') + rng = date_range("1/1/2000 08:15", periods=n, normalize=False, freq="B") the_time = time(8, 15) for val in rng: assert val.time() == the_time def test_date_range_fy5252(self): - dr = date_range(start="2013-01-01", periods=2, freq=offsets.FY5253( - startingMonth=1, weekday=3, variation="nearest")) - assert dr[0] == Timestamp('2013-01-31') - assert dr[1] == Timestamp('2014-01-30') + dr = date_range( + start="2013-01-01", + periods=2, + freq=offsets.FY5253(startingMonth=1, weekday=3, variation="nearest"), + ) + assert dr[0] == 
Timestamp("2013-01-31") + assert dr[1] == Timestamp("2014-01-30") def test_date_range_ambiguous_arguments(self): # #2538 start = datetime(2011, 1, 1, 5, 3, 40) end = datetime(2011, 1, 1, 8, 9, 40) - msg = ('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start, end, periods=10, freq='s') + date_range(start, end, periods=10, freq="s") def test_date_range_convenience_periods(self): # GH 20808 - result = date_range('2018-04-24', '2018-04-27', periods=3) - expected = DatetimeIndex(['2018-04-24 00:00:00', - '2018-04-25 12:00:00', - '2018-04-27 00:00:00'], freq=None) + result = date_range("2018-04-24", "2018-04-27", periods=3) + expected = DatetimeIndex( + ["2018-04-24 00:00:00", "2018-04-25 12:00:00", "2018-04-27 00:00:00"], + freq=None, + ) tm.assert_index_equal(result, expected) # Test if spacing remains linear if tz changes to dst in range - result = date_range('2018-04-01 01:00:00', - '2018-04-01 04:00:00', - tz='Australia/Sydney', - periods=3) - expected = DatetimeIndex([Timestamp('2018-04-01 01:00:00+1100', - tz='Australia/Sydney'), - Timestamp('2018-04-01 02:00:00+1000', - tz='Australia/Sydney'), - Timestamp('2018-04-01 04:00:00+1000', - tz='Australia/Sydney')]) + result = date_range( + "2018-04-01 01:00:00", + "2018-04-01 04:00:00", + tz="Australia/Sydney", + periods=3, + ) + expected = DatetimeIndex( + [ + Timestamp("2018-04-01 01:00:00+1100", tz="Australia/Sydney"), + Timestamp("2018-04-01 02:00:00+1000", tz="Australia/Sydney"), + Timestamp("2018-04-01 04:00:00+1000", tz="Australia/Sydney"), + ] + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start,end,result_tz', [ - ['20180101', '20180103', 'US/Eastern'], - [datetime(2018, 1, 1), datetime(2018, 1, 3), 'US/Eastern'], - [Timestamp('20180101'), Timestamp('20180103'), 'US/Eastern'], - [Timestamp('20180101', tz='US/Eastern'), - Timestamp('20180103', tz='US/Eastern'), 'US/Eastern'], - [Timestamp('20180101', tz='US/Eastern'), - Timestamp('20180103', tz='US/Eastern'), None]]) + @pytest.mark.parametrize( + "start,end,result_tz", + [ + ["20180101", "20180103", "US/Eastern"], + [datetime(2018, 1, 1), datetime(2018, 1, 3), "US/Eastern"], + [Timestamp("20180101"), Timestamp("20180103"), "US/Eastern"], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + "US/Eastern", + ], + [ + Timestamp("20180101", tz="US/Eastern"), + Timestamp("20180103", tz="US/Eastern"), + None, + ], + ], + ) def test_date_range_linspacing_tz(self, start, end, result_tz): # GH 20983 result = date_range(start, end, periods=3, tz=result_tz) - expected = date_range('20180101', periods=3, freq='D', tz='US/Eastern') + expected = date_range("20180101", periods=3, freq="D", tz="US/Eastern") tm.assert_index_equal(result, expected) def test_date_range_businesshour(self): - idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', - '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', - '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00'], - freq='BH') - rng = date_range('2014-07-04 09:00', '2014-07-04 16:00', freq='BH') + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-04 
16:00", freq="BH") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex( - ['2014-07-04 16:00', '2014-07-07 09:00'], freq='BH') - rng = date_range('2014-07-04 16:00', '2014-07-07 09:00', freq='BH') + idx = DatetimeIndex(["2014-07-04 16:00", "2014-07-07 09:00"], freq="BH") + rng = date_range("2014-07-04 16:00", "2014-07-07 09:00", freq="BH") tm.assert_index_equal(idx, rng) - idx = DatetimeIndex(['2014-07-04 09:00', '2014-07-04 10:00', - '2014-07-04 11:00', - '2014-07-04 12:00', '2014-07-04 13:00', - '2014-07-04 14:00', - '2014-07-04 15:00', '2014-07-04 16:00', - '2014-07-07 09:00', '2014-07-07 10:00', - '2014-07-07 11:00', - '2014-07-07 12:00', '2014-07-07 13:00', - '2014-07-07 14:00', - '2014-07-07 15:00', '2014-07-07 16:00', - '2014-07-08 09:00', '2014-07-08 10:00', - '2014-07-08 11:00', - '2014-07-08 12:00', '2014-07-08 13:00', - '2014-07-08 14:00', - '2014-07-08 15:00', '2014-07-08 16:00'], - freq='BH') - rng = date_range('2014-07-04 09:00', '2014-07-08 16:00', freq='BH') + idx = DatetimeIndex( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + freq="BH", + ) + rng = date_range("2014-07-04 09:00", "2014-07-08 16:00", freq="BH") tm.assert_index_equal(idx, rng) def test_range_misspecified(self): # GH #1095 - msg = ('Of the four parameters: start, end, periods, and ' - 'freq, exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and " + "freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - date_range(start='1/1/2000') + date_range(start="1/1/2000") with pytest.raises(ValueError, match=msg): - date_range(end='1/1/2000') + date_range(end="1/1/2000") with pytest.raises(ValueError, match=msg): date_range(periods=10) with pytest.raises(ValueError, match=msg): - date_range(start='1/1/2000', freq='H') + date_range(start="1/1/2000", freq="H") with pytest.raises(ValueError, match=msg): - date_range(end='1/1/2000', freq='H') + date_range(end="1/1/2000", freq="H") with pytest.raises(ValueError, match=msg): - date_range(periods=10, freq='H') + date_range(periods=10, freq="H") with pytest.raises(ValueError, match=msg): date_range() @@ -337,8 +381,9 @@ def test_range_misspecified(self): def test_compat_replace(self): # https://github.com/statsmodels/statsmodels/issues/3349 # replace should take ints/longs for compat - result = date_range(Timestamp('1960-04-01 00:00:00', freq='QS-JAN'), - periods=76, freq='QS-JAN') + result = date_range( + Timestamp("1960-04-01 00:00:00", freq="QS-JAN"), periods=76, freq="QS-JAN" + ) assert len(result) == 76 def test_catch_infinite_loop(self): @@ -346,42 +391,49 @@ def test_catch_infinite_loop(self): # blow up, don't loop forever msg = "Offset did not increment date" with pytest.raises(ValueError, match=msg): - date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), - freq=offset) + date_range(datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) - @pytest.mark.parametrize('periods', (1, 2)) + @pytest.mark.parametrize("periods", (1, 2)) def test_wom_len(self, periods): # 
https://github.com/pandas-dev/pandas/issues/20517 - res = date_range(start='20110101', periods=periods, freq='WOM-1MON') + res = date_range(start="20110101", periods=periods, freq="WOM-1MON") assert len(res) == periods def test_construct_over_dst(self): # GH 20854 - pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', - ambiguous=True) - pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', - ambiguous=False) - expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'), - pre_dst, - pst_dst] + pre_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=True + ) + pst_dst = Timestamp("2010-11-07 01:00:00").tz_localize( + "US/Pacific", ambiguous=False + ) + expect_data = [ + Timestamp("2010-11-07 00:00:00", tz="US/Pacific"), + pre_dst, + pst_dst, + ] expected = DatetimeIndex(expect_data) - result = date_range(start='2010-11-7', periods=3, - freq='H', tz='US/Pacific') + result = date_range(start="2010-11-7", periods=3, freq="H", tz="US/Pacific") tm.assert_index_equal(result, expected) def test_construct_with_different_start_end_string_format(self): # GH 12064 - result = date_range('2013-01-01 00:00:00+09:00', - '2013/01/01 02:00:00+09:00', freq='H') - expected = DatetimeIndex([Timestamp('2013-01-01 00:00:00+09:00'), - Timestamp('2013-01-01 01:00:00+09:00'), - Timestamp('2013-01-01 02:00:00+09:00')]) + result = date_range( + "2013-01-01 00:00:00+09:00", "2013/01/01 02:00:00+09:00", freq="H" + ) + expected = DatetimeIndex( + [ + Timestamp("2013-01-01 00:00:00+09:00"), + Timestamp("2013-01-01 01:00:00+09:00"), + Timestamp("2013-01-01 02:00:00+09:00"), + ] + ) tm.assert_index_equal(result, expected) def test_error_with_zero_monthends(self): - msg = r'Offset <0 \* MonthEnds> did not increment date' + msg = r"Offset <0 \* MonthEnds> did not increment date" with pytest.raises(ValueError, match=msg): - date_range('1/1/2000', '1/1/2001', freq=MonthEnd(0)) + date_range("1/1/2000", "1/1/2001", freq=MonthEnd(0)) def test_range_bug(self): # GH #770 @@ -394,7 +446,7 @@ def test_range_bug(self): def test_range_tz_pytz(self): # see gh-2906 - tz = timezone('US/Eastern') + tz = timezone("US/Eastern") start = tz.localize(datetime(2011, 1, 1)) end = tz.localize(datetime(2011, 1, 3)) @@ -413,25 +465,36 @@ def test_range_tz_pytz(self): assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize('start, end', [ - [Timestamp(datetime(2014, 3, 6), tz='US/Eastern'), - Timestamp(datetime(2014, 3, 12), tz='US/Eastern')], - [Timestamp(datetime(2013, 11, 1), tz='US/Eastern'), - Timestamp(datetime(2013, 11, 6), tz='US/Eastern')] - ]) + @pytest.mark.parametrize( + "start, end", + [ + [ + Timestamp(datetime(2014, 3, 6), tz="US/Eastern"), + Timestamp(datetime(2014, 3, 12), tz="US/Eastern"), + ], + [ + Timestamp(datetime(2013, 11, 1), tz="US/Eastern"), + Timestamp(datetime(2013, 11, 6), tz="US/Eastern"), + ], + ], + ) def test_range_tz_dst_straddle_pytz(self, start, end): - dr = date_range(start, end, freq='D') + dr = date_range(start, end, freq="D") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start, end, freq='D', tz='US/Eastern') + dr = date_range(start, end, freq="D", tz="US/Eastern") assert dr[0] == start assert dr[-1] == end assert np.all(dr.hour == 0) - dr = date_range(start.replace(tzinfo=None), end.replace( - tzinfo=None), freq='D', tz='US/Eastern') + dr = date_range( + start.replace(tzinfo=None), + end.replace(tzinfo=None), + freq="D", + tz="US/Eastern", + ) assert dr[0] == start assert dr[-1] == end assert 
np.all(dr.hour == 0) @@ -441,27 +504,28 @@ def test_range_tz_dateutil(self): # Use maybe_get_tz to fix filename in tz under dateutil. from pandas._libs.tslibs.timezones import maybe_get_tz - tz = lambda x: maybe_get_tz('dateutil/' + x) - start = datetime(2011, 1, 1, tzinfo=tz('US/Eastern')) - end = datetime(2011, 1, 3, tzinfo=tz('US/Eastern')) + tz = lambda x: maybe_get_tz("dateutil/" + x) + + start = datetime(2011, 1, 1, tzinfo=tz("US/Eastern")) + end = datetime(2011, 1, 3, tzinfo=tz("US/Eastern")) dr = date_range(start=start, periods=3) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end dr = date_range(end=end, periods=3) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end dr = date_range(start=start, end=end) - assert dr.tz == tz('US/Eastern') + assert dr.tz == tz("US/Eastern") assert dr[0] == start assert dr[2] == end - @pytest.mark.parametrize('freq', ["1D", "3D", "2M", "7W", "3H", "A"]) + @pytest.mark.parametrize("freq", ["1D", "3D", "2M", "7W", "3H", "A"]) def test_range_closed(self, freq): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) @@ -482,8 +546,8 @@ def test_range_closed(self, freq): def test_range_closed_with_tz_aware_start_end(self): # GH12409, GH12684 - begin = Timestamp('2011/1/1', tz='US/Eastern') - end = Timestamp('2014/1/1', tz='US/Eastern') + begin = Timestamp("2011/1/1", tz="US/Eastern") + end = Timestamp("2014/1/1", tz="US/Eastern") for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) @@ -500,18 +564,15 @@ def test_range_closed_with_tz_aware_start_end(self): tm.assert_index_equal(expected_left, left) tm.assert_index_equal(expected_right, right) - begin = Timestamp('2011/1/1') - end = Timestamp('2014/1/1') - begintz = Timestamp('2011/1/1', tz='US/Eastern') - endtz = Timestamp('2014/1/1', tz='US/Eastern') + begin = Timestamp("2011/1/1") + end = Timestamp("2014/1/1") + begintz = Timestamp("2011/1/1", tz="US/Eastern") + endtz = Timestamp("2014/1/1", tz="US/Eastern") for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: - closed = date_range(begin, end, closed=None, freq=freq, - tz='US/Eastern') - left = date_range(begin, end, closed="left", freq=freq, - tz='US/Eastern') - right = date_range(begin, end, closed="right", freq=freq, - tz='US/Eastern') + closed = date_range(begin, end, closed=None, freq=freq, tz="US/Eastern") + left = date_range(begin, end, closed="left", freq=freq, tz="US/Eastern") + right = date_range(begin, end, closed="right", freq=freq, tz="US/Eastern") expected_left = left expected_right = right @@ -523,20 +584,23 @@ def test_range_closed_with_tz_aware_start_end(self): tm.assert_index_equal(expected_left, left) tm.assert_index_equal(expected_right, right) - @pytest.mark.parametrize('closed', ['right', 'left', None]) + @pytest.mark.parametrize("closed", ["right", "left", None]) def test_range_closed_boundary(self, closed): # GH#11804 - right_boundary = date_range('2015-09-12', '2015-12-01', - freq='QS-MAR', closed=closed) - left_boundary = date_range('2015-09-01', '2015-09-12', - freq='QS-MAR', closed=closed) - both_boundary = date_range('2015-09-01', '2015-12-01', - freq='QS-MAR', closed=closed) + right_boundary = date_range( + "2015-09-12", "2015-12-01", freq="QS-MAR", closed=closed + ) + left_boundary = date_range( + "2015-09-01", "2015-09-12", freq="QS-MAR", closed=closed + ) + both_boundary = date_range( + "2015-09-01", "2015-12-01", freq="QS-MAR", closed=closed + ) 
expected_right = expected_left = expected_both = both_boundary - if closed == 'right': + if closed == "right": expected_left = both_boundary[1:] - if closed == 'left': + if closed == "left": expected_right = both_boundary[:-1] if closed is None: expected_right = both_boundary[1:] @@ -548,66 +612,65 @@ def test_range_closed_boundary(self, closed): def test_years_only(self): # GH 6961 - dr = date_range('2014', '2015', freq='M') + dr = date_range("2014", "2015", freq="M") assert dr[0] == datetime(2014, 1, 31) assert dr[-1] == datetime(2014, 12, 31) def test_freq_divides_end_in_nanos(self): # GH 10885 - result_1 = date_range('2005-01-12 10:00', '2005-01-12 16:00', - freq='345min') - result_2 = date_range('2005-01-13 10:00', '2005-01-13 16:00', - freq='345min') - expected_1 = DatetimeIndex(['2005-01-12 10:00:00', - '2005-01-12 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) - expected_2 = DatetimeIndex(['2005-01-13 10:00:00', - '2005-01-13 15:45:00'], - dtype='datetime64[ns]', freq='345T', - tz=None) + result_1 = date_range("2005-01-12 10:00", "2005-01-12 16:00", freq="345min") + result_2 = date_range("2005-01-13 10:00", "2005-01-13 16:00", freq="345min") + expected_1 = DatetimeIndex( + ["2005-01-12 10:00:00", "2005-01-12 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) + expected_2 = DatetimeIndex( + ["2005-01-13 10:00:00", "2005-01-13 15:45:00"], + dtype="datetime64[ns]", + freq="345T", + tz=None, + ) tm.assert_index_equal(result_1, expected_1) tm.assert_index_equal(result_2, expected_2) def test_cached_range_bug(self): - rng = date_range('2010-09-01 05:00:00', periods=50, - freq=DateOffset(hours=6)) + rng = date_range("2010-09-01 05:00:00", periods=50, freq=DateOffset(hours=6)) assert len(rng) == 50 assert rng[0] == datetime(2010, 9, 1, 5) def test_timezone_comparaison_bug(self): # smoke test - start = Timestamp('20130220 10:00', tz='US/Eastern') - result = date_range(start, periods=2, tz='US/Eastern') + start = Timestamp("20130220 10:00", tz="US/Eastern") + result = date_range(start, periods=2, tz="US/Eastern") assert len(result) == 2 def test_timezone_comparaison_assert(self): - start = Timestamp('20130220 10:00', tz='US/Eastern') - msg = 'Inferred time zone not equal to passed time zone' + start = Timestamp("20130220 10:00", tz="US/Eastern") + msg = "Inferred time zone not equal to passed time zone" with pytest.raises(AssertionError, match=msg): - date_range(start, periods=2, tz='Europe/Berlin') + date_range(start, periods=2, tz="Europe/Berlin") - def test_negative_non_tick_frequency_descending_dates(self, - tz_aware_fixture): + def test_negative_non_tick_frequency_descending_dates(self, tz_aware_fixture): # GH 23270 tz = tz_aware_fixture - result = pd.date_range(start='2011-06-01', end='2011-01-01', - freq='-1MS', tz=tz) - expected = pd.date_range(end='2011-06-01', start='2011-01-01', - freq='1MS', tz=tz)[::-1] + result = pd.date_range(start="2011-06-01", end="2011-01-01", freq="-1MS", tz=tz) + expected = pd.date_range( + end="2011-06-01", start="2011-01-01", freq="1MS", tz=tz + )[::-1] tm.assert_index_equal(result, expected) class TestGenRangeGeneration: - def test_generate(self): rng1 = list(generate_range(START, END, offset=BDay())) - rng2 = list(generate_range(START, END, offset='B')) + rng2 = list(generate_range(START, END, offset="B")) assert rng1 == rng2 def test_generate_cday(self): rng1 = list(generate_range(START, END, offset=CDay())) - rng2 = list(generate_range(START, END, offset='C')) + rng2 = list(generate_range(START, END, offset="C")) 
assert rng1 == rng2 def test_1(self): @@ -616,48 +679,62 @@ def test_1(self): assert rng == expected def test_2(self): - rng = list(generate_range(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3))) - expected = [datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)] + rng = list(generate_range(start=datetime(2008, 1, 1), end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), datetime(2008, 1, 2), datetime(2008, 1, 3)] assert rng == expected def test_3(self): - rng = list(generate_range(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6))) + rng = list(generate_range(start=datetime(2008, 1, 5), end=datetime(2008, 1, 6))) expected = [] assert rng == expected def test_precision_finer_than_offset(self): # GH#9907 - result1 = pd.date_range(start='2015-04-15 00:00:03', - end='2016-04-22 00:00:00', freq='Q') - result2 = pd.date_range(start='2015-04-15 00:00:03', - end='2015-06-22 00:00:04', freq='W') - expected1_list = ['2015-06-30 00:00:03', '2015-09-30 00:00:03', - '2015-12-31 00:00:03', '2016-03-31 00:00:03'] - expected2_list = ['2015-04-19 00:00:03', '2015-04-26 00:00:03', - '2015-05-03 00:00:03', '2015-05-10 00:00:03', - '2015-05-17 00:00:03', '2015-05-24 00:00:03', - '2015-05-31 00:00:03', '2015-06-07 00:00:03', - '2015-06-14 00:00:03', '2015-06-21 00:00:03'] - expected1 = DatetimeIndex(expected1_list, dtype='datetime64[ns]', - freq='Q-DEC', tz=None) - expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', - freq='W-SUN', tz=None) + result1 = pd.date_range( + start="2015-04-15 00:00:03", end="2016-04-22 00:00:00", freq="Q" + ) + result2 = pd.date_range( + start="2015-04-15 00:00:03", end="2015-06-22 00:00:04", freq="W" + ) + expected1_list = [ + "2015-06-30 00:00:03", + "2015-09-30 00:00:03", + "2015-12-31 00:00:03", + "2016-03-31 00:00:03", + ] + expected2_list = [ + "2015-04-19 00:00:03", + "2015-04-26 00:00:03", + "2015-05-03 00:00:03", + "2015-05-10 00:00:03", + "2015-05-17 00:00:03", + "2015-05-24 00:00:03", + "2015-05-31 00:00:03", + "2015-06-07 00:00:03", + "2015-06-14 00:00:03", + "2015-06-21 00:00:03", + ] + expected1 = DatetimeIndex( + expected1_list, dtype="datetime64[ns]", freq="Q-DEC", tz=None + ) + expected2 = DatetimeIndex( + expected2_list, dtype="datetime64[ns]", freq="W-SUN", tz=None + ) tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) - dt1, dt2 = '2017-01-01', '2017-01-01' - tz1, tz2 = 'US/Eastern', 'Europe/London' - - @pytest.mark.parametrize("start,end", [ - (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), - (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), - (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), - (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) - ]) + dt1, dt2 = "2017-01-01", "2017-01-01" + tz1, tz2 = "US/Eastern", "Europe/London" + + @pytest.mark.parametrize( + "start,end", + [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)), + ], + ) def test_mismatching_tz_raises_err(self, start, end): # issue 18488 with pytest.raises(TypeError): @@ -667,20 +744,19 @@ def test_mismatching_tz_raises_err(self, start, end): class TestBusinessDateRange: - def test_constructor(self): bdate_range(START, END, freq=BDay()) bdate_range(START, periods=20, freq=BDay()) bdate_range(end=START, periods=20, freq=BDay()) - msg = 'periods must be a number, got B' + msg = "periods must be a number, got B" with pytest.raises(TypeError, 
match=msg): - date_range('2011-1-1', '2012-1-1', 'B') + date_range("2011-1-1", "2012-1-1", "B") with pytest.raises(TypeError, match=msg): - bdate_range('2011-1-1', '2012-1-1', 'B') + bdate_range("2011-1-1", "2012-1-1", "B") - msg = 'freq must be specified for bdate_range; use date_range instead' + msg = "freq must be specified for bdate_range; use date_range instead" with pytest.raises(TypeError, match=msg): bdate_range(START, END, periods=10, freq=None) @@ -688,7 +764,7 @@ def test_naive_aware_conflicts(self): naive = bdate_range(START, END, freq=BDay(), tz=None) aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") - msg = 'tz-naive.*tz-aware' + msg = "tz-naive.*tz-aware" with pytest.raises(TypeError, match=msg): naive.join(aware) @@ -705,7 +781,7 @@ def test_misc(self): assert dr[-1] == end def test_date_parse_failure(self): - badly_formed_date = '2007/100/1' + badly_formed_date = "2007/100/1" with pytest.raises(ValueError): Timestamp(badly_formed_date) @@ -721,56 +797,55 @@ def test_date_parse_failure(self): def test_daterange_bug_456(self): # GH #456 - rng1 = bdate_range('12/5/2011', '12/5/2011') - rng2 = bdate_range('12/2/2011', '12/5/2011') + rng1 = bdate_range("12/5/2011", "12/5/2011") + rng2 = bdate_range("12/2/2011", "12/5/2011") rng2.freq = BDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) - @pytest.mark.parametrize('closed', ['left', 'right']) + @pytest.mark.parametrize("closed", ["left", "right"]) def test_bdays_and_open_boundaries(self, closed): # GH 6673 - start = '2018-07-21' # Saturday - end = '2018-07-29' # Sunday - result = pd.date_range(start, end, freq='B', closed=closed) + start = "2018-07-21" # Saturday + end = "2018-07-29" # Sunday + result = pd.date_range(start, end, freq="B", closed=closed) - bday_start = '2018-07-23' # Monday - bday_end = '2018-07-27' # Friday - expected = pd.date_range(bday_start, bday_end, freq='D') + bday_start = "2018-07-23" # Monday + bday_end = "2018-07-27" # Friday + expected = pd.date_range(bday_start, bday_end, freq="D") tm.assert_index_equal(result, expected) def test_bday_near_overflow(self): # GH#24252 avoid doing unnecessary addition that _would_ overflow start = pd.Timestamp.max.floor("D").to_pydatetime() - rng = pd.date_range(start, end=None, periods=1, freq='B') - expected = pd.DatetimeIndex([start], freq='B') + rng = pd.date_range(start, end=None, periods=1, freq="B") + expected = pd.DatetimeIndex([start], freq="B") tm.assert_index_equal(rng, expected) def test_bday_overflow_error(self): # GH#24252 check that we get OutOfBoundsDatetime and not OverflowError start = pd.Timestamp.max.floor("D").to_pydatetime() with pytest.raises(OutOfBoundsDatetime): - pd.date_range(start, periods=2, freq='B') + pd.date_range(start, periods=2, freq="B") class TestCustomDateRange: - def test_constructor(self): bdate_range(START, END, freq=CDay()) bdate_range(START, periods=20, freq=CDay()) bdate_range(end=START, periods=20, freq=CDay()) - msg = 'periods must be a number, got C' + msg = "periods must be a number, got C" with pytest.raises(TypeError, match=msg): - date_range('2011-1-1', '2012-1-1', 'C') + date_range("2011-1-1", "2012-1-1", "C") with pytest.raises(TypeError, match=msg): - bdate_range('2011-1-1', '2012-1-1', 'C') + bdate_range("2011-1-1", "2012-1-1", "C") def test_misc(self): end = datetime(2009, 5, 13) - dr = bdate_range(end=end, periods=20, freq='C') + dr = bdate_range(end=end, periods=20, freq="C") firstDate = end - 19 * CDay() assert len(dr) == 20 @@ -779,77 +854,95 @@ def test_misc(self): def 
test_daterange_bug_456(self): # GH #456 - rng1 = bdate_range('12/5/2011', '12/5/2011', freq='C') - rng2 = bdate_range('12/2/2011', '12/5/2011', freq='C') + rng1 = bdate_range("12/5/2011", "12/5/2011", freq="C") + rng2 = bdate_range("12/2/2011", "12/5/2011", freq="C") rng2.freq = CDay() result = rng1.union(rng2) assert isinstance(result, DatetimeIndex) def test_cdaterange(self): - result = bdate_range('2013-05-01', periods=3, freq='C') - expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) + result = bdate_range("2013-05-01", periods=3, freq="C") + expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-03"]) tm.assert_index_equal(result, expected) def test_cdaterange_weekmask(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - weekmask='Sun Mon Tue Wed Thu') - expected = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) + result = bdate_range( + "2013-05-01", periods=3, freq="C", weekmask="Sun Mon Tue Wed Thu" + ) + expected = DatetimeIndex(["2013-05-01", "2013-05-02", "2013-05-05"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu') + bdate_range("2013-05-01", periods=3, weekmask="Sun Mon Tue Wed Thu") def test_cdaterange_holidays(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - holidays=['2013-05-01']) - expected = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) + result = bdate_range("2013-05-01", periods=3, freq="C", holidays=["2013-05-01"]) + expected = DatetimeIndex(["2013-05-02", "2013-05-03", "2013-05-06"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) + bdate_range("2013-05-01", periods=3, holidays=["2013-05-01"]) def test_cdaterange_weekmask_and_holidays(self): - result = bdate_range('2013-05-01', periods=3, freq='C', - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - expected = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) + result = bdate_range( + "2013-05-01", + periods=3, + freq="C", + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + ) + expected = DatetimeIndex(["2013-05-02", "2013-05-05", "2013-05-06"]) tm.assert_index_equal(result, expected) # raise with non-custom freq - msg = ('a custom frequency string is required when holidays or ' - 'weekmask are passed, got frequency B') + msg = ( + "a custom frequency string is required when holidays or " + "weekmask are passed, got frequency B" + ) with pytest.raises(ValueError, match=msg): - bdate_range('2013-05-01', periods=3, - weekmask='Sun Mon Tue Wed Thu', - holidays=['2013-05-01']) - - @pytest.mark.parametrize('freq', [freq for freq in prefix_mapping - if freq.startswith('C')]) + bdate_range( + "2013-05-01", + periods=3, + weekmask="Sun Mon Tue Wed Thu", + holidays=["2013-05-01"], + ) + + @pytest.mark.parametrize( + "freq", [freq for freq in prefix_mapping if freq.startswith("C")] + ) def test_all_custom_freq(self, 
freq): # should not raise - bdate_range(START, END, freq=freq, weekmask='Mon Wed Fri', - holidays=['2009-03-14']) + bdate_range( + START, END, freq=freq, weekmask="Mon Wed Fri", holidays=["2009-03-14"] + ) - bad_freq = freq + 'FOO' - msg = 'invalid custom frequency string: {freq}' + bad_freq = freq + "FOO" + msg = "invalid custom frequency string: {freq}" with pytest.raises(ValueError, match=msg.format(freq=bad_freq)): bdate_range(START, END, freq=bad_freq) - @pytest.mark.parametrize('start_end', [ - ('2018-01-01T00:00:01.000Z', '2018-01-03T00:00:01.000Z'), - ('2018-01-01T00:00:00.010Z', '2018-01-03T00:00:00.010Z'), - ('2001-01-01T00:00:00.010Z', '2001-01-03T00:00:00.010Z')]) + @pytest.mark.parametrize( + "start_end", + [ + ("2018-01-01T00:00:01.000Z", "2018-01-03T00:00:01.000Z"), + ("2018-01-01T00:00:00.010Z", "2018-01-03T00:00:00.010Z"), + ("2001-01-01T00:00:00.010Z", "2001-01-03T00:00:00.010Z"), + ], + ) def test_range_with_millisecond_resolution(self, start_end): # https://github.com/pandas-dev/pandas/issues/24110 start, end = start_end - result = pd.date_range(start=start, end=end, periods=2, closed='left') + result = pd.date_range(start=start, end=end, periods=2, closed="left") expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 01649cb4646de..aeff489861f5d 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -5,8 +5,7 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets) +from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -14,20 +13,19 @@ class TestDatetimeIndex: - def test_roundtrip_pickle_with_tz(self): # GH 8367 # round-trip of timezone - index = date_range('20130101', periods=3, tz='US/Eastern', name='foo') + index = date_range("20130101", periods=3, tz="US/Eastern", name="foo") unpickled = tm.round_trip_pickle(index) tm.assert_index_equal(index, unpickled) def test_reindex_preserves_tz_if_target_is_empty_list_or_array(self): # GH7774 - index = date_range('20130101', periods=3, tz='US/Eastern') - assert str(index.reindex([])[0].tz) == 'US/Eastern' - assert str(index.reindex(np.array([]))[0].tz) == 'US/Eastern' + index = date_range("20130101", periods=3, tz="US/Eastern") + assert str(index.reindex([])[0].tz) == "US/Eastern" + assert str(index.reindex(np.array([]))[0].tz) == "US/Eastern" def test_time_loc(self): # GH8667 from datetime import time @@ -39,12 +37,11 @@ def test_time_loc(self): # GH8667 step = 24 * 3600 for n in ns: - idx = pd.date_range('2014-11-26', periods=n, freq='S') + idx = pd.date_range("2014-11-26", periods=n, freq="S") ts = pd.Series(np.random.randn(n), index=idx) i = np.arange(start, n, step) - tm.assert_numpy_array_equal(ts.index.get_loc(key), i, - check_dtype=False) + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, check_dtype=False) tm.assert_series_equal(ts[key], ts.iloc[i]) left, right = ts.copy(), ts.copy() @@ -61,10 +58,10 @@ def test_time_overflow_for_32bit_machines(self): # overflow. 
periods = np.int_(1000) - idx1 = pd.date_range(start='2000', periods=periods, freq='S') + idx1 = pd.date_range(start="2000", periods=periods, freq="S") assert len(idx1) == periods - idx2 = pd.date_range(end='2000', periods=periods, freq='S') + idx2 = pd.date_range(end="2000", periods=periods, freq="S") assert len(idx2) == periods def test_nat(self): @@ -85,62 +82,64 @@ def test_week_of_month_frequency(self): tm.assert_index_equal(result_union, expected) # GH 5115 - result = date_range("2013-1-1", periods=4, freq='WOM-1SAT') - dates = ['2013-01-05', '2013-02-02', '2013-03-02', '2013-04-06'] - expected = DatetimeIndex(dates, freq='WOM-1SAT') + result = date_range("2013-1-1", periods=4, freq="WOM-1SAT") + dates = ["2013-01-05", "2013-02-02", "2013-03-02", "2013-04-06"] + expected = DatetimeIndex(dates, freq="WOM-1SAT") tm.assert_index_equal(result, expected) def test_hash_error(self): - index = date_range('20010101', periods=10) - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + index = date_range("20010101", periods=10) + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(index) def test_stringified_slice_with_tz(self): # GH#2658 - start = '2013-01-07' - idx = date_range(start=start, freq="1d", periods=10, tz='US/Eastern') + start = "2013-01-07" + idx = date_range(start=start, freq="1d", periods=10, tz="US/Eastern") df = DataFrame(np.arange(10), index=idx) df["2013-01-14 23:44:34.437768-05:00":] # no exception here def test_append_join_nondatetimeindex(self): - rng = date_range('1/1/2000', periods=10) - idx = Index(['a', 'b', 'c', 'd']) + rng = date_range("1/1/2000", periods=10) + idx = Index(["a", "b", "c", "d"]) result = rng.append(idx) assert isinstance(result[0], Timestamp) # it works - rng.join(idx, how='outer') + rng.join(idx, how="outer") def test_map(self): - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) - f = lambda x: x.strftime('%Y%m%d') + f = lambda x: x.strftime("%Y%m%d") result = rng.map(f) - exp = Index([f(x) for x in rng], dtype='= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -253,28 +280,33 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_take_fill_value_with_timezone(self): - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', tz='US/Eastern') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', tz='US/Eastern') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, 
fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", tz="US/Eastern" + ) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -285,56 +317,72 @@ def test_take_fill_value_with_timezone(self): class TestDatetimeIndex: - @pytest.mark.parametrize('null', [None, np.nan, pd.NaT]) - @pytest.mark.parametrize('tz', [None, 'UTC', 'US/Eastern']) + @pytest.mark.parametrize("null", [None, np.nan, pd.NaT]) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_nat(self, tz, null): # GH#16537, GH#18295 (test missing) - idx = pd.DatetimeIndex(['2017-01-01'], tz=tz) - expected = pd.DatetimeIndex(['NaT', '2017-01-01'], tz=tz) + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + expected = pd.DatetimeIndex(["NaT", "2017-01-01"], tz=tz) res = idx.insert(0, null) tm.assert_index_equal(res, expected) def test_insert(self): - idx = DatetimeIndex( - ['2000-01-04', '2000-01-01', '2000-01-02'], name='idx') + idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") result = idx.insert(2, datetime(2000, 1, 5)) - exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05', - '2000-01-02'], name='idx') + exp = DatetimeIndex( + ["2000-01-04", "2000-01-01", "2000-01-05", "2000-01-02"], name="idx" + ) tm.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([datetime(2000, 1, 4), 'inserted', - datetime(2000, 1, 1), - datetime(2000, 1, 2)], name='idx') + result = idx.insert(1, "inserted") + expected = Index( + [ + datetime(2000, 1, 4), + "inserted", + datetime(2000, 1, 1), + datetime(2000, 1, 2), + ], + name="idx", + ) assert not isinstance(result, DatetimeIndex) tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = date_range('1/1/2000', periods=3, freq='M', name='idx') + idx = date_range("1/1/2000", periods=3, freq="M", name="idx") # preserve freq - expected_0 = DatetimeIndex(['1999-12-31', '2000-01-31', '2000-02-29', - '2000-03-31'], name='idx', freq='M') - expected_3 = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', - '2000-04-30'], name='idx', freq='M') + expected_0 = DatetimeIndex( + ["1999-12-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq="M", + ) + expected_3 = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30"], + name="idx", + freq="M", + ) # reset freq to None - expected_1_nofreq = DatetimeIndex(['2000-01-31', '2000-01-31', - '2000-02-29', - '2000-03-31'], name='idx', - freq=None) - expected_3_nofreq = DatetimeIndex(['2000-01-31', '2000-02-29', - '2000-03-31', - '2000-01-02'], name='idx', - freq=None) - - cases = [(0, datetime(1999, 12, 31), expected_0), - (-3, datetime(1999, 12, 31), expected_0), - (3, datetime(2000, 4, 30), expected_3), - (1, datetime(2000, 1, 31), expected_1_nofreq), - (3, datetime(2000, 1, 2), expected_3_nofreq)] + expected_1_nofreq = DatetimeIndex( + ["2000-01-31", "2000-01-31", "2000-02-29", "2000-03-31"], + name="idx", + freq=None, + ) + expected_3_nofreq = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) + + cases = [ + (0, datetime(1999, 12, 31), expected_0), 
+ (-3, datetime(1999, 12, 31), expected_0), + (3, datetime(2000, 4, 30), expected_3), + (1, datetime(2000, 1, 31), expected_1_nofreq), + (3, datetime(2000, 1, 2), expected_3_nofreq), + ] for n, d, expected in cases: result = idx.insert(n, d) @@ -344,33 +392,36 @@ def test_insert(self): # reset freq to None result = idx.insert(3, datetime(2000, 1, 2)) - expected = DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', - '2000-01-02'], name='idx', freq=None) + expected = DatetimeIndex( + ["2000-01-31", "2000-02-29", "2000-03-31", "2000-01-02"], + name="idx", + freq=None, + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq is None # see gh-7299 - idx = date_range('1/1/2000', periods=3, freq='D', tz='Asia/Tokyo', - name='idx') + idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") with pytest.raises(ValueError): - idx.insert(3, pd.Timestamp('2000-01-04')) + idx.insert(3, pd.Timestamp("2000-01-04")) with pytest.raises(ValueError): idx.insert(3, datetime(2000, 1, 4)) with pytest.raises(ValueError): - idx.insert(3, pd.Timestamp('2000-01-04', tz='US/Eastern')) + idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) with pytest.raises(ValueError): - idx.insert(3, datetime(2000, 1, 4, - tzinfo=pytz.timezone('US/Eastern'))) + idx.insert(3, datetime(2000, 1, 4, tzinfo=pytz.timezone("US/Eastern"))) - for tz in ['US/Pacific', 'Asia/Singapore']: - idx = date_range('1/1/2000 09:00', periods=6, freq='H', tz=tz, - name='idx') + for tz in ["US/Pacific", "Asia/Singapore"]: + idx = date_range("1/1/2000 09:00", periods=6, freq="H", tz=tz, name="idx") # preserve freq - expected = date_range('1/1/2000 09:00', periods=7, freq='H', tz=tz, - name='idx') - for d in [pd.Timestamp('2000-01-01 15:00', tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 15))]: + expected = date_range( + "1/1/2000 09:00", periods=7, freq="H", tz=tz, name="idx" + ) + for d in [ + pd.Timestamp("2000-01-01 15:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 15)), + ]: result = idx.insert(6, d) tm.assert_index_equal(result, expected) @@ -378,15 +429,25 @@ def test_insert(self): assert result.freq == expected.freq assert result.tz == expected.tz - expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', - '2000-01-01 12:00', '2000-01-01 13:00', - '2000-01-01 14:00', - '2000-01-01 10:00'], name='idx', - tz=tz, freq=None) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-01 14:00", + "2000-01-01 10:00", + ], + name="idx", + tz=tz, + freq=None, + ) # reset freq to None - for d in [pd.Timestamp('2000-01-01 10:00', tz=tz), - pytz.timezone(tz).localize(datetime(2000, 1, 1, 10))]: + for d in [ + pd.Timestamp("2000-01-01 10:00", tz=tz), + pytz.timezone(tz).localize(datetime(2000, 1, 1, 10)), + ]: result = idx.insert(6, d) tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -394,23 +455,26 @@ def test_insert(self): assert result.freq is None def test_delete(self): - idx = date_range(start='2000-01-01', periods=5, freq='M', name='idx') + idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") # prserve freq - expected_0 = date_range(start='2000-02-01', periods=4, freq='M', - name='idx') - expected_4 = date_range(start='2000-01-01', periods=4, freq='M', - name='idx') + expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") + expected_4 = 
date_range(start="2000-01-01", periods=4, freq="M", name="idx") # reset freq to None - expected_1 = DatetimeIndex(['2000-01-31', '2000-03-31', '2000-04-30', - '2000-05-31'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} + expected_1 = DatetimeIndex( + ["2000-01-31", "2000-03-31", "2000-04-30", "2000-05-31"], + freq=None, + name="idx", + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -421,43 +485,56 @@ def test_delete(self): # either depending on numpy version idx.delete(5) - for tz in [None, 'Asia/Tokyo', 'US/Pacific']: - idx = date_range(start='2000-01-01 09:00', periods=10, freq='H', - name='idx', tz=tz) + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + idx = date_range( + start="2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ) - expected = date_range(start='2000-01-01 10:00', periods=9, - freq='H', name='idx', tz=tz) + expected = date_range( + start="2000-01-01 10:00", periods=9, freq="H", name="idx", tz=tz + ) result = idx.delete(0) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == 'H' + assert result.freqstr == "H" assert result.tz == expected.tz - expected = date_range(start='2000-01-01 09:00', periods=9, - freq='H', name='idx', tz=tz) + expected = date_range( + start="2000-01-01 09:00", periods=9, freq="H", name="idx", tz=tz + ) result = idx.delete(-1) tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freqstr == 'H' + assert result.freqstr == "H" assert result.tz == expected.tz def test_delete_slice(self): - idx = date_range(start='2000-01-01', periods=10, freq='D', name='idx') + idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") # prserve freq - expected_0_2 = date_range(start='2000-01-04', periods=7, freq='D', - name='idx') - expected_7_9 = date_range(start='2000-01-01', periods=7, freq='D', - name='idx') + expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") + expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") # reset freq to None - expected_3_5 = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', - '2000-01-07', '2000-01-08', '2000-01-09', - '2000-01-10'], freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} + expected_3_5 = DatetimeIndex( + [ + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + ], + freq=None, + name="idx", + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -469,13 +546,18 @@ def test_delete_slice(self): assert result.name == expected.name assert result.freq == expected.freq - for tz in [None, 'Asia/Tokyo', 'US/Pacific']: - ts = pd.Series(1, index=pd.date_range( - '2000-01-01 09:00', periods=10, freq='H', name='idx', tz=tz)) + for tz in [None, "Asia/Tokyo", "US/Pacific"]: + ts = pd.Series( + 1, + index=pd.date_range( + "2000-01-01 09:00", periods=10, freq="H", name="idx", tz=tz + ), + ) # preserve freq result = ts.drop(ts.index[:5]).index - expected = pd.date_range('2000-01-01 14:00', periods=5, freq='H', - name='idx', tz=tz) + expected = pd.date_range( + "2000-01-01 14:00", 
periods=5, freq="H", name="idx", tz=tz + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq @@ -483,128 +565,155 @@ def test_delete_slice(self): # reset freq to None result = ts.drop(ts.index[[1, 3, 5, 7, 9]]).index - expected = DatetimeIndex(['2000-01-01 09:00', '2000-01-01 11:00', - '2000-01-01 13:00', - '2000-01-01 15:00', '2000-01-01 17:00'], - freq=None, name='idx', tz=tz) + expected = DatetimeIndex( + [ + "2000-01-01 09:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + "2000-01-01 15:00", + "2000-01-01 17:00", + ], + freq=None, + name="idx", + tz=tz, + ) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq assert result.tz == expected.tz def test_get_loc(self): - idx = pd.date_range('2000-01-01', periods=3) + idx = pd.date_range("2000-01-01", periods=3) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 assert idx.get_loc(idx[1].to_pydatetime(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 if method is not None: - assert idx.get_loc(idx[1], method, - tolerance=pd.Timedelta('0 days')) == 1 - - assert idx.get_loc('2000-01-01', method='nearest') == 0 - assert idx.get_loc('2000-01-01T12', method='nearest') == 1 - - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance='1 day') == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=pd.Timedelta('1D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=np.timedelta64(1, 'D')) == 1 - assert idx.get_loc('2000-01-01T12', method='nearest', - tolerance=timedelta(1)) == 1 - with pytest.raises(ValueError, match='unit abbreviation w/o a number'): - idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') + assert ( + idx.get_loc(idx[1], method, tolerance=pd.Timedelta("0 days")) == 1 + ) + + assert idx.get_loc("2000-01-01", method="nearest") == 0 + assert idx.get_loc("2000-01-01T12", method="nearest") == 1 + + assert idx.get_loc("2000-01-01T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-01T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-01T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc("2000-01-01T12", method="nearest", tolerance="foo") with pytest.raises(KeyError): - idx.get_loc('2000-01-01T03', method='nearest', tolerance='2 hours') + idx.get_loc("2000-01-01T03", method="nearest", tolerance="2 hours") with pytest.raises( - ValueError, - match='tolerance size must match target index size'): - idx.get_loc('2000-01-01', method='nearest', - tolerance=[pd.Timedelta('1day').to_timedelta64(), - pd.Timedelta('1day').to_timedelta64()]) - - assert idx.get_loc('2000', method='nearest') == slice(0, 3) - assert idx.get_loc('2000-01', method='nearest') == slice(0, 3) - - assert idx.get_loc('1999', method='nearest') == 0 - assert idx.get_loc('2001', method='nearest') == 2 + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-01", + method="nearest", + tolerance=[ + pd.Timedelta("1day").to_timedelta64(), + pd.Timedelta("1day").to_timedelta64(), + ], + ) + + assert idx.get_loc("2000", method="nearest") == slice(0, 3) + assert idx.get_loc("2000-01", 
method="nearest") == slice(0, 3) + + assert idx.get_loc("1999", method="nearest") == 0 + assert idx.get_loc("2001", method="nearest") == 2 with pytest.raises(KeyError): - idx.get_loc('1999', method='pad') + idx.get_loc("1999", method="pad") with pytest.raises(KeyError): - idx.get_loc('2001', method='backfill') + idx.get_loc("2001", method="backfill") with pytest.raises(KeyError): - idx.get_loc('foobar') + idx.get_loc("foobar") with pytest.raises(TypeError): idx.get_loc(slice(2)) - idx = pd.to_datetime(['2000-01-01', '2000-01-04']) - assert idx.get_loc('2000-01-02', method='nearest') == 0 - assert idx.get_loc('2000-01-03', method='nearest') == 1 - assert idx.get_loc('2000-01', method='nearest') == slice(0, 2) + idx = pd.to_datetime(["2000-01-01", "2000-01-04"]) + assert idx.get_loc("2000-01-02", method="nearest") == 0 + assert idx.get_loc("2000-01-03", method="nearest") == 1 + assert idx.get_loc("2000-01", method="nearest") == slice(0, 2) # time indexing - idx = pd.date_range('2000-01-01', periods=24, freq='H') - tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12]), check_dtype=False) - tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([]), check_dtype=False) + idx = pd.date_range("2000-01-01", periods=24, freq="H") + tm.assert_numpy_array_equal( + idx.get_loc(time(12)), np.array([12]), check_dtype=False + ) + tm.assert_numpy_array_equal( + idx.get_loc(time(12, 30)), np.array([]), check_dtype=False + ) with pytest.raises(NotImplementedError): - idx.get_loc(time(12, 30), method='pad') + idx.get_loc(time(12, 30), method="pad") def test_get_indexer(self): - idx = pd.date_range('2000-01-01', periods=3) + idx = pd.date_range("2000-01-01", periods=3) exp = np.array([0, 1, 2], dtype=np.intp) tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) - target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', - '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) + target = idx[0] + pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1], dtype=np.intp)) - tol_raw = [pd.Timedelta('1 hour'), - pd.Timedelta('1 hour'), - pd.Timedelta('1 hour').to_timedelta64(), ] + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=[np.timedelta64(x) for x in tol_raw]), - np.array([0, -1, 1], dtype=np.intp)) - tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), - pd.Timedelta('1 hour').to_timedelta64(), - 'foo', ] - with pytest.raises( - ValueError, match='abbreviation w/o a number'): - idx.get_indexer(target, 'nearest', tolerance=tol_bad) + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 hour")), + np.array([0, -1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour").to_timedelta64(), + ] + tm.assert_numpy_array_equal( + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + 
np.array([0, -1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + "foo", + ] + with pytest.raises(ValueError, match="abbreviation w/o a number"): + idx.get_indexer(target, "nearest", tolerance=tol_bad) with pytest.raises(ValueError): - idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') + idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") def test_reasonable_key_error(self): # GH#1062 - index = DatetimeIndex(['1/3/2000']) - with pytest.raises(KeyError, match='2000'): - index.get_loc('1/1/2000') + index = DatetimeIndex(["1/3/2000"]) + with pytest.raises(KeyError, match="2000"): + index.get_loc("1/1/2000") - @pytest.mark.parametrize('key', [pd.Timedelta(0), - pd.Timedelta(1), - timedelta(0)]) + @pytest.mark.parametrize("key", [pd.Timedelta(0), pd.Timedelta(1), timedelta(0)]) def test_timedelta_invalid_key(self, key): # GH#20464 - dti = pd.date_range('1970-01-01', periods=10) + dti = pd.date_range("1970-01-01", periods=10) with pytest.raises(TypeError): dti.get_loc(key) def test_get_loc_nat(self): # GH#20464 - index = DatetimeIndex(['1/3/2000', 'NaT']) + index = DatetimeIndex(["1/3/2000", "NaT"]) assert index.get_loc(pd.NaT) == 1 diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 91e614cd516b9..4ea32359b8d4a 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -6,16 +6,14 @@ import pytest import pandas as pd -from pandas import ( - DatetimeIndex, Index, Timestamp, date_range, datetime, offsets) +from pandas import DatetimeIndex, Index, Timestamp, date_range, datetime, offsets import pandas.util.testing as tm class TestTimeSeries: - def test_pass_datetimeindex_to_index(self): # Bugs in #1396 - rng = date_range('1/1/2000', '3/1/2000') + rng = date_range("1/1/2000", "3/1/2000") idx = Index(rng, dtype=object) expected = Index(rng.to_pydatetime(), dtype=object) @@ -24,78 +22,126 @@ def test_pass_datetimeindex_to_index(self): def test_range_edges(self): # GH#13672 - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'), - end=Timestamp('1970-01-01 00:00:00.000000004'), - freq='N') - exp = DatetimeIndex(['1970-01-01 00:00:00.000000001', - '1970-01-01 00:00:00.000000002', - '1970-01-01 00:00:00.000000003', - '1970-01-01 00:00:00.000000004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000004"), + freq="N", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000000001", + "1970-01-01 00:00:00.000000002", + "1970-01-01 00:00:00.000000003", + "1970-01-01 00:00:00.000000004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000004'), - end=Timestamp('1970-01-01 00:00:00.000000001'), - freq='N') + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000004"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) exp = DatetimeIndex([]) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.000000001'), - end=Timestamp('1970-01-01 00:00:00.000000001'), - freq='N') - exp = DatetimeIndex(['1970-01-01 00:00:00.000000001']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000000001"), + end=Timestamp("1970-01-01 00:00:00.000000001"), + freq="N", + ) + exp = DatetimeIndex(["1970-01-01 00:00:00.000000001"]) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 
00:00:00.000001'), - end=Timestamp('1970-01-01 00:00:00.000004'), - freq='U') - exp = DatetimeIndex(['1970-01-01 00:00:00.000001', - '1970-01-01 00:00:00.000002', - '1970-01-01 00:00:00.000003', - '1970-01-01 00:00:00.000004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.000001"), + end=Timestamp("1970-01-01 00:00:00.000004"), + freq="U", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.000001", + "1970-01-01 00:00:00.000002", + "1970-01-01 00:00:00.000003", + "1970-01-01 00:00:00.000004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:00.001'), - end=Timestamp('1970-01-01 00:00:00.004'), - freq='L') - exp = DatetimeIndex(['1970-01-01 00:00:00.001', - '1970-01-01 00:00:00.002', - '1970-01-01 00:00:00.003', - '1970-01-01 00:00:00.004']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:00.001"), + end=Timestamp("1970-01-01 00:00:00.004"), + freq="L", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:00.001", + "1970-01-01 00:00:00.002", + "1970-01-01 00:00:00.003", + "1970-01-01 00:00:00.004", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:00:01'), - end=Timestamp('1970-01-01 00:00:04'), freq='S') - exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02', - '1970-01-01 00:00:03', '1970-01-01 00:00:04']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:00:01"), + end=Timestamp("1970-01-01 00:00:04"), + freq="S", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + "1970-01-01 00:00:04", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 00:01'), - end=Timestamp('1970-01-01 00:04'), freq='T') - exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02', - '1970-01-01 00:03', '1970-01-01 00:04']) + idx = pd.date_range( + start=Timestamp("1970-01-01 00:01"), + end=Timestamp("1970-01-01 00:04"), + freq="T", + ) + exp = DatetimeIndex( + [ + "1970-01-01 00:01", + "1970-01-01 00:02", + "1970-01-01 00:03", + "1970-01-01 00:04", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01 01:00'), - end=Timestamp('1970-01-01 04:00'), freq='H') - exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00', - '1970-01-01 03:00', '1970-01-01 04:00']) + idx = pd.date_range( + start=Timestamp("1970-01-01 01:00"), + end=Timestamp("1970-01-01 04:00"), + freq="H", + ) + exp = DatetimeIndex( + [ + "1970-01-01 01:00", + "1970-01-01 02:00", + "1970-01-01 03:00", + "1970-01-01 04:00", + ] + ) tm.assert_index_equal(idx, exp) - idx = pd.date_range(start=Timestamp('1970-01-01'), - end=Timestamp('1970-01-04'), freq='D') - exp = DatetimeIndex(['1970-01-01', '1970-01-02', - '1970-01-03', '1970-01-04']) + idx = pd.date_range( + start=Timestamp("1970-01-01"), end=Timestamp("1970-01-04"), freq="D" + ) + exp = DatetimeIndex(["1970-01-01", "1970-01-02", "1970-01-03", "1970-01-04"]) tm.assert_index_equal(idx, exp) class TestDatetime64: - def test_datetimeindex_accessors(self): - dti_naive = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365) + dti_naive = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) # GH#13303 - dti_tz = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365, tz='US/Eastern') + dti_tz = pd.date_range( + freq="D", start=datetime(1998, 1, 1), periods=365, tz="US/Eastern" + ) for dti in [dti_naive, dti_tz]: assert dti.year[0] == 1998 @@ -156,14 +202,14 @@ def test_datetimeindex_accessors(self): assert 
len(dti.is_year_end) == 365 assert len(dti.weekday_name) == 365 - dti.name = 'name' + dti.name = "name" # non boolean accessors -> return Index for accessor in DatetimeIndex._field_ops: res = getattr(dti, accessor) assert len(res) == 365 assert isinstance(res, Index) - assert res.name == 'name' + assert res.name == "name" # boolean accessors -> return array for accessor in DatetimeIndex._bool_ops: @@ -176,11 +222,10 @@ def test_datetimeindex_accessors(self): exp = dti[[0, 90, 181, 273]] tm.assert_index_equal(res, exp) res = dti[dti.is_leap_year] - exp = DatetimeIndex([], freq='D', tz=dti.tz, name='name') + exp = DatetimeIndex([], freq="D", tz=dti.tz, name="name") tm.assert_index_equal(res, exp) - dti = pd.date_range(freq='BQ-FEB', start=datetime(1998, 1, 1), - periods=4) + dti = pd.date_range(freq="BQ-FEB", start=datetime(1998, 1, 1), periods=4) assert sum(dti.is_quarter_start) == 0 assert sum(dti.is_quarter_end) == 4 @@ -188,49 +233,50 @@ def test_datetimeindex_accessors(self): assert sum(dti.is_year_end) == 1 # Ensure is_start/end accessors throw ValueError for CustomBusinessDay, - bday_egypt = offsets.CustomBusinessDay(weekmask='Sun Mon Tue Wed Thu') + bday_egypt = offsets.CustomBusinessDay(weekmask="Sun Mon Tue Wed Thu") dti = date_range(datetime(2013, 4, 30), periods=5, freq=bday_egypt) msg = "Custom business days is not supported by is_month_start" with pytest.raises(ValueError, match=msg): dti.is_month_start - dti = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) + dti = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-03"]) assert dti.is_month_start[0] == 1 tests = [ - (Timestamp('2013-06-01', freq='M').is_month_start, 1), - (Timestamp('2013-06-01', freq='BM').is_month_start, 0), - (Timestamp('2013-06-03', freq='M').is_month_start, 0), - (Timestamp('2013-06-03', freq='BM').is_month_start, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), - (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), - (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), - (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0), - (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), - (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), - (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), - (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), - (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), - (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), - (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), - (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), - (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), - (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), - (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), - (Timestamp('2012-02-01').days_in_month, 29), - (Timestamp('2013-02-01').days_in_month, 28)] + (Timestamp("2013-06-01", freq="M").is_month_start, 1), + 
(Timestamp("2013-06-01", freq="BM").is_month_start, 0), + (Timestamp("2013-06-03", freq="M").is_month_start, 0), + (Timestamp("2013-06-03", freq="BM").is_month_start, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_month_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_quarter_end, 1), + (Timestamp("2013-02-28", freq="Q-FEB").is_year_end, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_month_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_quarter_start, 1), + (Timestamp("2013-03-01", freq="Q-FEB").is_year_start, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_month_end, 1), + (Timestamp("2013-03-31", freq="QS-FEB").is_quarter_end, 0), + (Timestamp("2013-03-31", freq="QS-FEB").is_year_end, 0), + (Timestamp("2013-02-01", freq="QS-FEB").is_month_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_quarter_start, 1), + (Timestamp("2013-02-01", freq="QS-FEB").is_year_start, 1), + (Timestamp("2013-06-30", freq="BQ").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQ").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_quarter_end, 1), + (Timestamp("2013-06-28", freq="BQ").is_year_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_month_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_quarter_end, 0), + (Timestamp("2013-06-30", freq="BQS-APR").is_year_end, 0), + (Timestamp("2013-06-28", freq="BQS-APR").is_month_end, 1), + (Timestamp("2013-06-28", freq="BQS-APR").is_quarter_end, 1), + (Timestamp("2013-03-29", freq="BQS-APR").is_year_end, 1), + (Timestamp("2013-11-01", freq="AS-NOV").is_year_start, 1), + (Timestamp("2013-10-31", freq="AS-NOV").is_year_end, 1), + (Timestamp("2012-02-01").days_in_month, 29), + (Timestamp("2013-02-01").days_in_month, 28), + ] for ts, value in tests: assert ts == value @@ -244,37 +290,59 @@ def test_datetimeindex_accessors(self): assert [d.weekofyear for d in dates] == expected # GH 12806 - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes - expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - expected_months = ['January', 'February', 'March', 'April', 'May', - 'June', 'July', 'August', 'September', - 'October', 'November', 'December'] + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] # GH#11128 - dti = pd.date_range(freq='D', start=datetime(1998, 1, 1), - periods=365) - english_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - for day, name, eng_name in zip(range(4, 11), - expected_days, - english_days): + dti = pd.date_range(freq="D", start=datetime(1998, 1, 1), periods=365) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + 
"Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert dti.weekday_name[day] == eng_name assert dti.day_name(locale=time_locale)[day] == name ts = Timestamp(datetime(2016, 4, day)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert ts.weekday_name == eng_name assert ts.day_name(locale=time_locale) == name dti = dti.append(DatetimeIndex([pd.NaT])) @@ -283,7 +351,7 @@ def test_datetime_name_accessors(self, time_locale): assert np.isnan(ts.day_name(locale=time_locale)) # GH#12805 - dti = pd.date_range(freq='M', start='2012', end='2013') + dti = pd.date_range(freq="M", start="2012", end="2013") result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) @@ -308,5 +376,4 @@ def test_datetime_name_accessors(self, time_locale): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - tm.assert_index_equal(dti.nanosecond, - pd.Index(np.arange(10, dtype=np.int64))) + tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 5a6f2fa86b11f..6d94319b33b02 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -5,48 +5,58 @@ class TestDatetimeIndex: - - @pytest.mark.parametrize('tz', ['US/Eastern', 'Asia/Tokyo']) + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_fillna_datetime64(self, tz): # GH 11343 - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00']) + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"]) - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00']) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"] + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) # tz mismatch - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00', tz=tz), - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00", tz=tz), + pd.Timestamp("2011-01-01 11:00"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00'), 'x', - pd.Timestamp('2011-01-01 11:00')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) - - idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], tz=tz) - - exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], tz=tz) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) - - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal( - idx.fillna(pd.Timestamp('2011-01-01 10:00')), exp) + exp = pd.Index( + [pd.Timestamp("2011-01-01 09:00"), "x", pd.Timestamp("2011-01-01 11:00")], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + idx = pd.DatetimeIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 
11:00"], tz=tz) + + exp = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], tz=tz + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00", tz=tz)), exp) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Timestamp("2011-01-01 10:00")), exp) # object - exp = pd.Index([pd.Timestamp('2011-01-01 09:00', tz=tz), - 'x', - pd.Timestamp('2011-01-01 11:00', tz=tz)], - dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01 09:00", tz=tz), + "x", + pd.Timestamp("2011-01-01 11:00", tz=tz), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index 4a8f691987f8e..d4dff2cbce89b 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -8,8 +8,14 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, PeriodIndex, Series, Timestamp, bdate_range, - date_range) + DatetimeIndex, + Index, + PeriodIndex, + Series, + Timestamp, + bdate_range, + date_range, +) from pandas.tests.test_base import Ops import pandas.util.testing as tm @@ -19,11 +25,9 @@ class TestDatetimeIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) - mask = lambda x: (isinstance(x, DatetimeIndex) or - isinstance(x, PeriodIndex)) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) self.is_valid_objs = [o for o in self.objs if mask(o)] self.not_valid_objs = [o for o in self.objs if not mask(o)] @@ -38,7 +42,7 @@ def test_ops_properties_basic(self): # sanity check that the behavior didn't change # GH#7206 msg = "'Series' object has no attribute '{}'" - for op in ['year', 'day', 'second', 'weekday']: + for op in ["year", "day", "second", "weekday"]: with pytest.raises(AttributeError, match=msg.format(op)): getattr(self.dt_series, op) @@ -53,32 +57,43 @@ def test_ops_properties_basic(self): def test_repeat_range(self, tz_naive_fixture): tz = tz_naive_fixture - rng = date_range('1/1/2000', '1/1/2001') + rng = date_range("1/1/2000", "1/1/2001") result = rng.repeat(5) assert result.freq is None assert len(result) == 5 * len(rng) - index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-02', '2001-01-02'], tz=tz) + index = pd.date_range("2001-01-01", periods=2, freq="D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-02", "2001-01-02"], tz=tz + ) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', - '2001-01-03', '2001-01-03'], tz=tz) + index = pd.date_range("2001-01-01", periods=2, freq="2D", tz=tz) + exp = pd.DatetimeIndex( + ["2001-01-01", "2001-01-01", "2001-01-03", "2001-01-03"], tz=tz + ) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], - tz=tz) - exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', - 'NaT', 'NaT', 'NaT', - '2003-01-01', '2003-01-01', '2003-01-01'], - tz=tz) + index = pd.DatetimeIndex(["2001-01-01", "NaT", "2003-01-01"], tz=tz) + exp = 
pd.DatetimeIndex( + [ + "2001-01-01", + "2001-01-01", + "2001-01-01", + "NaT", + "NaT", + "NaT", + "2003-01-01", + "2003-01-01", + "2003-01-01", + ], + tz=tz, + ) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None @@ -88,15 +103,16 @@ def test_repeat(self, tz_naive_fixture): reps = 2 msg = "the 'axis' parameter is not supported" - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) + rng = pd.date_range(start="2016-01-01", periods=2, freq="30Min", tz=tz) - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), - ]) + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:30:00", tz=tz, freq="30T"), + ] + ) res = rng.repeat(reps) tm.assert_index_equal(res, expected_rng) @@ -108,75 +124,93 @@ def test_repeat(self, tz_naive_fixture): def test_resolution(self, tz_naive_fixture): tz = tz_naive_fixture - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', - 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', 'hour', - 'minute', 'second', 'millisecond', - 'microsecond']): - idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, - tz=tz) + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + idx = pd.date_range(start="2013-04-01", periods=30, freq=freq, tz=tz) assert idx.resolution == expected def test_value_counts_unique(self, tz_naive_fixture): tz = tz_naive_fixture # GH 7735 - idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + idx = pd.date_range("2011-01-01 09:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), - tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) - exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, - tz=tz) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + exp_idx = pd.date_range("2011-01-01 18:00", freq="-1H", periods=10, tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, - tz=tz) + expected = pd.date_range("2011-01-01 09:00", freq="H", periods=10, tz=tz) tm.assert_index_equal(idx.unique(), expected) - idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 08:00', pd.NaT], tz=tz) - - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - tz=tz) + idx = DatetimeIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + pd.NaT, + ], + tz=tz, + ) + + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00"], tz=tz) expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', - pd.NaT], tz=tz) + exp_idx = DatetimeIndex(["2013-01-01 09:00", "2013-01-01 08:00", 
pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: - tm.assert_series_equal(obj.value_counts(dropna=False), - expected) + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) def test_nonunique_contains(self): # GH 9512 - for idx in map(DatetimeIndex, - ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['2015', '2015', '2016'], ['2015', '2015', '2014'])): + for idx in map( + DatetimeIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["2015", "2015", "2016"], + ["2015", "2015", "2014"], + ), + ): assert idx[0] in idx - @pytest.mark.parametrize('idx', - [ - DatetimeIndex( - ['2011-01-01', - '2011-01-02', - '2011-01-03'], - freq='D', name='idx'), - DatetimeIndex( - ['2011-01-01 09:00', - '2011-01-01 10:00', - '2011-01-01 11:00'], - freq='H', name='tzidx', tz='Asia/Tokyo') - ]) + @pytest.mark.parametrize( + "idx", + [ + DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq="D", name="idx" + ), + DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], + freq="H", + name="tzidx", + tz="Asia/Tokyo", + ), + ], + ) def test_order_with_freq(self, idx): ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -190,41 +224,39 @@ def test_order_with_freq(self, idx): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) expected = idx[::-1] tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, - np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) assert ordered.freq == expected.freq assert ordered.freq.n == -1 - @pytest.mark.parametrize('index_dates,expected_dates', [ - (['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - ['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05']), - (['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - ['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05']), - ([pd.NaT, '2011-01-03', '2011-01-05', - '2011-01-02', pd.NaT], - [pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - '2011-01-05']) - ]) - def test_order_without_freq(self, index_dates, expected_dates, - tz_naive_fixture): + @pytest.mark.parametrize( + "index_dates,expected_dates", + [ + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ( + [pd.NaT, "2011-01-03", "2011-01-05", "2011-01-02", pd.NaT], + [pd.NaT, pd.NaT, "2011-01-02", "2011-01-03", "2011-01-05"], + ), + ], + ) + def test_order_without_freq(self, index_dates, expected_dates, tz_naive_fixture): tz = tz_naive_fixture # without freq - index = DatetimeIndex(index_dates, tz=tz, name='idx') - expected = DatetimeIndex(expected_dates, tz=tz, name='idx') + index = DatetimeIndex(index_dates, tz=tz, name="idx") + expected = DatetimeIndex(expected_dates, tz=tz, name="idx") ordered = index.sort_values() 
tm.assert_index_equal(ordered, expected) @@ -241,8 +273,7 @@ def test_order_without_freq(self, index_dates, expected_dates, tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - ordered, indexer = index.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = index.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) @@ -251,7 +282,7 @@ def test_order_without_freq(self, index_dates, expected_dates, def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -264,7 +295,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + base = pd.date_range("2011-01-01", "2011-01-31", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -272,10 +303,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -283,14 +314,34 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - @pytest.mark.parametrize('freq', [ - 'A', '2A', '-2A', 'Q', '-1Q', 'M', '-1M', 'D', '3D', - '-3D', 'W', '-1W', 'H', '2H', '-2H', 'T', '2T', 'S', - '-3S']) + @pytest.mark.parametrize( + "freq", + [ + "A", + "2A", + "-2A", + "Q", + "-1Q", + "M", + "-1M", + "D", + "3D", + "-3D", + "W", + "-1W", + "H", + "2H", + "-2H", + "T", + "2T", + "S", + "-3S", + ], + ) def test_infer_freq(self, freq): # GH 11018 - idx = pd.date_range('2011-01-01 09:00:00', freq=freq, periods=10) - result = pd.DatetimeIndex(idx.asi8, freq='infer') + idx = pd.date_range("2011-01-01 09:00:00", freq=freq, periods=10) + result = pd.DatetimeIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) assert result.freq == freq @@ -299,25 +350,23 @@ def test_nat(self, tz_naive_fixture): assert pd.DatetimeIndex._na_value is pd.NaT assert pd.DatetimeIndex([])._na_value is pd.NaT - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + idx = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + idx = pd.DatetimeIndex(["2011-01-01", "NaT"], tz=tz) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) def test_equals(self): # GH 13107 - idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + idx = 
pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"]) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -326,8 +375,7 @@ def test_equals(self): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], - tz='US/Pacific') + idx2 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "NaT"], tz="US/Pacific") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -336,7 +384,7 @@ def test_equals(self): assert not idx.equals(pd.Series(idx2)) # same internal, different tz - idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz="US/Pacific") tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) assert not idx.equals(idx3) assert not idx.equals(idx3.copy()) @@ -345,11 +393,9 @@ def test_equals(self): assert not idx.equals(list(idx3)) assert not idx.equals(pd.Series(idx3)) - @pytest.mark.parametrize('values', [ - ['20180101', '20180103', '20180105'], []]) - @pytest.mark.parametrize('freq', [ - '2D', Day(2), '2B', BDay(2), '48H', Hour(48)]) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize("values", [["20180101", "20180103", "20180105"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "2B", BDay(2), "48H", Hour(48)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_freq_setter(self, values, freq, tz): # GH 20678 idx = DatetimeIndex(values, tz=tz) @@ -365,21 +411,23 @@ def test_freq_setter(self, values, freq, tz): def test_freq_setter_errors(self): # GH 20678 - idx = DatetimeIndex(['20180101', '20180103', '20180105']) + idx = DatetimeIndex(["20180101", "20180103", "20180105"]) # setting with an incompatible freq - msg = ('Inferred frequency 2D from passed values does not conform to ' - 'passed frequency 5D') + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) with pytest.raises(ValueError, match=msg): - idx.freq = '5D' + idx.freq = "5D" # setting with non-freq string - with pytest.raises(ValueError, match='Invalid frequency'): - idx.freq = 'foo' + with pytest.raises(ValueError, match="Invalid frequency"): + idx.freq = "foo" def test_offset_deprecated(self): # GH 20716 - idx = pd.DatetimeIndex(['20180101', '20180102']) + idx = pd.DatetimeIndex(["20180101", "20180102"]) # getter deprecated with tm.assert_produces_warning(FutureWarning): @@ -391,7 +439,6 @@ def test_offset_deprecated(self): class TestBusinessDatetimeIndex: - def setup_method(self, method): self.rng = bdate_range(START, END) @@ -437,10 +484,10 @@ def test_identical(self): assert t1.identical(t2) # name - t1 = t1.rename('foo') + t1 = t1.rename("foo") assert t1.equals(t2) assert not t1.identical(t2) - t2 = t2.rename('foo') + t2 = t2.rename("foo") assert t1.identical(t2) # freq @@ -451,7 +498,7 @@ def test_identical(self): class TestCustomDatetimeIndex: def setup_method(self, method): - self.rng = bdate_range(START, END, freq='C') + self.rng = bdate_range(START, END, freq="C") def test_comparison(self): d = self.rng[10] @@ -490,8 +537,7 @@ def test_shift_periods(self): idx = pd.date_range(start=START, end=END, periods=3) tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): tm.assert_index_equal(idx.shift(n=0), idx) def 
test_pickle_unpickle(self): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 6ec8568ce7242..3095bf9657277 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -8,37 +8,43 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range) + DataFrame, + DatetimeIndex, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) from pandas.core.indexing import IndexingError from pandas.util import testing as tm class TestSlicing: def test_dti_slicing(self): - dti = date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = date_range(start="1/1/2005", end="12/1/2005", freq="M") dti2 = dti[[1, 3, 5]] v1 = dti2[0] v2 = dti2[1] v3 = dti2[2] - assert v1 == Timestamp('2/28/2005') - assert v2 == Timestamp('4/30/2005') - assert v3 == Timestamp('6/30/2005') + assert v1 == Timestamp("2/28/2005") + assert v2 == Timestamp("4/30/2005") + assert v3 == Timestamp("6/30/2005") # don't carry freq through irregular slicing assert dti2.freq is None def test_slice_keeps_name(self): # GH4226 - st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles') - et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles') - dr = pd.date_range(st, et, freq='H', name='timebucket') + st = pd.Timestamp("2013-07-01 00:00:00", tz="America/Los_Angeles") + et = pd.Timestamp("2013-07-02 00:00:00", tz="America/Los_Angeles") + dr = pd.date_range(st, et, freq="H", name="timebucket") assert dr[1:].name == dr.name def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -46,50 +52,51 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1]) + assert_slices_equivalent(SLC[Timestamp("2014-10-01") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10-01"::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[: Timestamp("2014-10-01") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10-01":-1], SLC[:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp( - '2014-10-01'):-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1], - SLC[13:8:-1]) + assert_slices_equivalent(SLC["2015-02-01":"2014-10-01":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC["2015-02-01" : Timestamp("2014-10-01") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent( + SLC[Timestamp("2015-02-01") : "2014-10-01" : -1], SLC[13:8:-1] + ) - assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0]) + assert_slices_equivalent(SLC["2014-10-01":"2015-02-01":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = 
Series(np.arange(20), - date_range('2014-01-01', periods=20, freq='MS')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), date_range("2014-01-01", periods=20, freq="MS")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] def test_slice_bounds_empty(self): # GH#14354 - empty_idx = date_range(freq='1H', periods=0, end='2015') + empty_idx = date_range(freq="1H", periods=0, end="2015") - right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') - exp = Timestamp('2015-01-02 23:59:59.999999999') + right = empty_idx._maybe_cast_slice_bound("2015-01-02", "right", "loc") + exp = Timestamp("2015-01-02 23:59:59.999999999") assert right == exp - left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') - exp = Timestamp('2015-01-02 00:00:00') + left = empty_idx._maybe_cast_slice_bound("2015-01-02", "left", "loc") + exp = Timestamp("2015-01-02 00:00:00") assert left == exp def test_slice_duplicate_monotonic(self): # https://github.com/pandas-dev/pandas/issues/16515 - idx = pd.DatetimeIndex(['2017', '2017']) - result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') - expected = Timestamp('2017-01-01') + idx = pd.DatetimeIndex(["2017", "2017"]) + result = idx._maybe_cast_slice_bound("2017-01-01", "left", "loc") + expected = Timestamp("2017-01-01") assert result == expected def test_monotone_DTI_indexing_bug(self): @@ -98,136 +105,139 @@ def test_monotone_DTI_indexing_bug(self): # partial string indexing. 
df = pd.DataFrame(list(range(5))) - date_list = ['2018-01-02', '2017-02-10', '2016-03-10', - '2015-03-15', '2014-03-16'] + date_list = [ + "2018-01-02", + "2017-02-10", + "2016-03-10", + "2015-03-15", + "2014-03-16", + ] date_index = pd.to_datetime(date_list) - df['date'] = date_index - expected = pd.DataFrame({0: list(range(5)), 'date': date_index}) + df["date"] = date_index + expected = pd.DataFrame({0: list(range(5)), "date": date_index}) tm.assert_frame_equal(df, expected) - df = pd.DataFrame({'A': [1, 2, 3]}, - index=pd.date_range('20170101', - periods=3)[::-1]) - expected = pd.DataFrame({'A': 1}, - index=pd.date_range('20170103', - periods=1)) - tm.assert_frame_equal(df.loc['2017-01-03'], expected) + df = pd.DataFrame( + {"A": [1, 2, 3]}, index=pd.date_range("20170101", periods=3)[::-1] + ) + expected = pd.DataFrame({"A": 1}, index=pd.date_range("20170103", periods=1)) + tm.assert_frame_equal(df.loc["2017-01-03"], expected) def test_slice_year(self): - dti = date_range(freq='B', start=datetime(2005, 1, 1), periods=500) + dti = date_range(freq="B", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - result = s['2005'] + result = s["2005"] expected = s[s.index.year == 2005] tm.assert_series_equal(result, expected) df = DataFrame(np.random.rand(len(dti), 5), index=dti) - result = df.loc['2005'] + result = df.loc["2005"] expected = df[df.index.year == 2005] tm.assert_frame_equal(result, expected) - rng = date_range('1/1/2000', '1/1/2010') + rng = date_range("1/1/2000", "1/1/2010") - result = rng.get_loc('2009') + result = rng.get_loc("2009") expected = slice(3288, 3653) assert result == expected def test_slice_quarter(self): - dti = date_range(freq='D', start=datetime(2000, 6, 1), periods=500) + dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - assert len(s['2001Q1']) == 90 + assert len(s["2001Q1"]) == 90 df = DataFrame(np.random.rand(len(dti), 5), index=dti) - assert len(df.loc['1Q01']) == 90 + assert len(df.loc["1Q01"]) == 90 def test_slice_month(self): - dti = date_range(freq='D', start=datetime(2005, 1, 1), periods=500) + dti = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(dti)), index=dti) - assert len(s['2005-11']) == 30 + assert len(s["2005-11"]) == 30 df = DataFrame(np.random.rand(len(dti), 5), index=dti) - assert len(df.loc['2005-11']) == 30 + assert len(df.loc["2005-11"]) == 30 - tm.assert_series_equal(s['2005-11'], s['11-2005']) + tm.assert_series_equal(s["2005-11"], s["11-2005"]) def test_partial_slice(self): - rng = date_range(freq='D', start=datetime(2005, 1, 1), periods=500) + rng = date_range(freq="D", start=datetime(2005, 1, 1), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-05':'2006-02'] - expected = s['20050501':'20060228'] + result = s["2005-05":"2006-02"] + expected = s["20050501":"20060228"] tm.assert_series_equal(result, expected) - result = s['2005-05':] - expected = s['20050501':] + result = s["2005-05":] + expected = s["20050501":] tm.assert_series_equal(result, expected) - result = s[:'2006-02'] - expected = s[:'20060228'] + result = s[:"2006-02"] + expected = s[:"20060228"] tm.assert_series_equal(result, expected) - result = s['2005-1-1'] + result = s["2005-1-1"] assert result == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31'$"): - s['2004-12-31'] + s["2004-12-31"] def test_partial_slice_daily(self): - rng = date_range(freq='H', start=datetime(2005, 1, 31), periods=500) + rng = 
date_range(freq="H", start=datetime(2005, 1, 31), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-31'] + result = s["2005-1-31"] tm.assert_series_equal(result, s.iloc[:24]) with pytest.raises(KeyError, match=r"^'2004-12-31 00'$"): - s['2004-12-31 00'] + s["2004-12-31 00"] def test_partial_slice_hourly(self): - rng = date_range(freq='T', start=datetime(2005, 1, 1, 20, 0, 0), - periods=500) + rng = date_range(freq="T", start=datetime(2005, 1, 1, 20, 0, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-1'] - tm.assert_series_equal(result, s.iloc[:60 * 4]) + result = s["2005-1-1"] + tm.assert_series_equal(result, s.iloc[: 60 * 4]) - result = s['2005-1-1 20'] + result = s["2005-1-1 20"] tm.assert_series_equal(result, s.iloc[:60]) - assert s['2005-1-1 20:00'] == s.iloc[0] + assert s["2005-1-1 20:00"] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:15'$"): - s['2004-12-31 00:15'] + s["2004-12-31 00:15"] def test_partial_slice_minutely(self): - rng = date_range(freq='S', start=datetime(2005, 1, 1, 23, 59, 0), - periods=500) + rng = date_range(freq="S", start=datetime(2005, 1, 1, 23, 59, 0), periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['2005-1-1 23:59'] + result = s["2005-1-1 23:59"] tm.assert_series_equal(result, s.iloc[:60]) - result = s['2005-1-1'] + result = s["2005-1-1"] tm.assert_series_equal(result, s.iloc[:60]) - assert s[Timestamp('2005-1-1 23:59:00')] == s.iloc[0] + assert s[Timestamp("2005-1-1 23:59:00")] == s.iloc[0] with pytest.raises(KeyError, match=r"^'2004-12-31 00:00:00'$"): - s['2004-12-31 00:00:00'] + s["2004-12-31 00:00:00"] def test_partial_slice_second_precision(self): - rng = date_range(start=datetime(2005, 1, 1, 0, 0, 59, - microsecond=999990), - periods=20, freq='US') + rng = date_range( + start=datetime(2005, 1, 1, 0, 0, 59, microsecond=999990), + periods=20, + freq="US", + ) s = Series(np.arange(20), rng) - tm.assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10]) - tm.assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00"], s.iloc[:10]) + tm.assert_series_equal(s["2005-1-1 00:00:59"], s.iloc[:10]) - tm.assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:]) - tm.assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01"], s.iloc[10:]) + tm.assert_series_equal(s["2005-1-1 00:01:00"], s.iloc[10:]) - assert s[Timestamp('2005-1-1 00:00:59.999990')] == s.iloc[0] - with pytest.raises(KeyError, match='2005-1-1 00:00:00'): - s['2005-1-1 00:00:00'] + assert s[Timestamp("2005-1-1 00:00:59.999990")] == s.iloc[0] + with pytest.raises(KeyError, match="2005-1-1 00:00:00"): + s["2005-1-1 00:00:00"] def test_partial_slicing_dataframe(self): # GH14856 @@ -237,17 +247,22 @@ def test_partial_slicing_dataframe(self): # string is considered a slice # - If string resolution is equal to or more precise than index # resolution, string is considered an exact match - formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H', - '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S'] - resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second'] + formats = [ + "%Y", + "%Y-%m", + "%Y-%m-%d", + "%Y-%m-%d %H", + "%Y-%m-%d %H:%M", + "%Y-%m-%d %H:%M:%S", + ] + resolutions = ["year", "month", "day", "hour", "minute", "second"] for rnum, resolution in enumerate(resolutions[2:], 2): # we check only 'day', 'hour', 'minute' and 'second' unit = Timedelta("1 " + resolution) middate = datetime(2012, 1, 1, 0, 0, 0) - index = 
DatetimeIndex([middate - unit, - middate, middate + unit]) + index = DatetimeIndex([middate - unit, middate, middate + unit]) values = [1, 2, 3] - df = DataFrame({'a': values}, index, dtype=np.int64) + df = DataFrame({"a": values}, index, dtype=np.int64) assert df.index.resolution == resolution # Timestamp with the same resolution as index @@ -256,7 +271,7 @@ def test_partial_slicing_dataframe(self): for timestamp, expected in zip(index, values): ts_string = timestamp.strftime(formats[rnum]) # make ts_string as precise as index - result = df['a'][ts_string] + result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == expected msg = r"^'{}'$".format(ts_string) @@ -265,13 +280,12 @@ def test_partial_slicing_dataframe(self): # Timestamp with resolution less precise than index for fmt in formats[:rnum]: - for element, theslice in [[0, slice(None, 1)], - [1, slice(1, None)]]: + for element, theslice in [[0, slice(None, 1)], [1, slice(1, None)]]: ts_string = index[element].strftime(fmt) # Series should return slice - result = df['a'][ts_string] - expected = df['a'][theslice] + result = df["a"][ts_string] + expected = df["a"][theslice] tm.assert_series_equal(result, expected) # Frame should return slice as well @@ -283,9 +297,9 @@ def test_partial_slicing_dataframe(self): # Compatible with existing key # Should return scalar for Series # and raise KeyError for Frame - for fmt in formats[rnum + 1:]: + for fmt in formats[rnum + 1 :]: ts_string = index[1].strftime(fmt) - result = df['a'][ts_string] + result = df["a"][ts_string] assert isinstance(result, np.int64) assert result == 2 msg = r"^'{}'$".format(ts_string) @@ -294,12 +308,12 @@ def test_partial_slicing_dataframe(self): # Not compatible with existing key # Should raise KeyError - for fmt, res in list(zip(formats, resolutions))[rnum + 1:]: + for fmt, res in list(zip(formats, resolutions))[rnum + 1 :]: ts = index[1] + Timedelta("1 " + res) ts_string = ts.strftime(fmt) msg = r"^'{}'$".format(ts_string) with pytest.raises(KeyError, match=msg): - df['a'][ts_string] + df["a"][ts_string] with pytest.raises(KeyError, match=msg): df[ts_string] @@ -307,109 +321,140 @@ def test_partial_slicing_with_multiindex(self): # GH 4758 # partial string indexing with a multi-index buggy - df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], - 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"], - 'val': [1, 2, 3, 4]}, - index=date_range("2013-06-19 09:30:00", - periods=4, freq='5T')) - df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True) - - expected = DataFrame([ - [1] - ], index=Index(['ABC'], name='TICKER'), columns=['val']) - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')] + df = DataFrame( + { + "ACCOUNT": ["ACCT1", "ACCT1", "ACCT1", "ACCT2"], + "TICKER": ["ABC", "MNP", "XYZ", "XYZ"], + "val": [1, 2, 3, 4], + }, + index=date_range("2013-06-19 09:30:00", periods=4, freq="5T"), + ) + df_multi = df.set_index(["ACCOUNT", "TICKER"], append=True) + + expected = DataFrame( + [[1]], index=Index(["ABC"], name="TICKER"), columns=["val"] + ) + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1")] tm.assert_frame_equal(result, expected) expected = df_multi.loc[ - (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')] - result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')] + (pd.Timestamp("2013-06-19 09:30:00", tz=None), "ACCT1", "ABC") + ] + result = df_multi.loc[("2013-06-19 09:30:00", "ACCT1", "ABC")] tm.assert_series_equal(result, expected) # this is an IndexingError as we don't do partial string selection on # 
multi-levels. msg = "Too many indexers" with pytest.raises(IndexingError, match=msg): - df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')] + df_multi.loc[("2013-06-19", "ACCT1", "ABC")] # GH 4294 # partial slice on a series mi - s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range( - '2000-1-1', periods=1000)).stack() + s = pd.DataFrame( + np.random.rand(1000, 1000), index=pd.date_range("2000-1-1", periods=1000) + ).stack() s2 = s[:-1].copy() - expected = s2['2000-1-4'] - result = s2[pd.Timestamp('2000-1-4')] + expected = s2["2000-1-4"] + result = s2[pd.Timestamp("2000-1-4")] tm.assert_series_equal(result, expected) - result = s[pd.Timestamp('2000-1-4')] - expected = s['2000-1-4'] + result = s[pd.Timestamp("2000-1-4")] + expected = s["2000-1-4"] tm.assert_series_equal(result, expected) df2 = pd.DataFrame(s) - expected = df2.xs('2000-1-4') - result = df2.loc[pd.Timestamp('2000-1-4')] + expected = df2.xs("2000-1-4") + result = df2.loc[pd.Timestamp("2000-1-4")] tm.assert_frame_equal(result, expected) def test_partial_slice_doesnt_require_monotonicity(self): # For historical reasons. - s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10)) + s = pd.Series(np.arange(10), pd.date_range("2014-01-01", periods=10)) nonmonotonic = s[[3, 5, 4]] expected = nonmonotonic.iloc[:0] - timestamp = pd.Timestamp('2014-01-10') + timestamp = pd.Timestamp("2014-01-10") - tm.assert_series_equal(nonmonotonic['2014-01-10':], expected) - with pytest.raises(KeyError, - match=r"Timestamp\('2014-01-10 00:00:00'\)"): + tm.assert_series_equal(nonmonotonic["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic[timestamp:] - tm.assert_series_equal(nonmonotonic.loc['2014-01-10':], expected) - with pytest.raises(KeyError, - match=r"Timestamp\('2014-01-10 00:00:00'\)"): + tm.assert_series_equal(nonmonotonic.loc["2014-01-10":], expected) + with pytest.raises(KeyError, match=r"Timestamp\('2014-01-10 00:00:00'\)"): nonmonotonic.loc[timestamp:] def test_loc_datetime_length_one(self): # GH16071 - df = pd.DataFrame(columns=['1'], - index=pd.date_range('2016-10-01T00:00:00', - '2016-10-01T23:59:59')) - result = df.loc[datetime(2016, 10, 1):] + df = pd.DataFrame( + columns=["1"], + index=pd.date_range("2016-10-01T00:00:00", "2016-10-01T23:59:59"), + ) + result = df.loc[datetime(2016, 10, 1) :] tm.assert_frame_equal(result, df) - result = df.loc['2016-10-01T00:00:00':] + result = df.loc["2016-10-01T00:00:00":] tm.assert_frame_equal(result, df) - @pytest.mark.parametrize('datetimelike', [ - Timestamp('20130101'), datetime(2013, 1, 1), - np.datetime64('2013-01-01T00:00', 'ns')]) - @pytest.mark.parametrize('op,expected', [ - (op.lt, [True, False, False, False]), - (op.le, [True, True, False, False]), - (op.eq, [False, True, False, False]), - (op.gt, [False, False, False, True])]) + @pytest.mark.parametrize( + "datetimelike", + [ + Timestamp("20130101"), + datetime(2013, 1, 1), + np.datetime64("2013-01-01T00:00", "ns"), + ], + ) + @pytest.mark.parametrize( + "op,expected", + [ + (op.lt, [True, False, False, False]), + (op.le, [True, True, False, False]), + (op.eq, [False, True, False, False]), + (op.gt, [False, False, False, True]), + ], + ) def test_selection_by_datetimelike(self, datetimelike, op, expected): # GH issue #17965, test for ability to compare datetime64[ns] columns # to datetimelike - df = DataFrame({'A': [pd.Timestamp('20120101'), - pd.Timestamp('20130101'), - np.nan, pd.Timestamp('20130103')]}) + df = DataFrame( + { + "A": [ + 
pd.Timestamp("20120101"), + pd.Timestamp("20130101"), + np.nan, + pd.Timestamp("20130103"), + ] + } + ) result = op(df.A, datetimelike) - expected = Series(expected, name='A') + expected = Series(expected, name="A") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('start', [ - '2018-12-02 21:50:00+00:00', pd.Timestamp('2018-12-02 21:50:00+00:00'), - pd.Timestamp('2018-12-02 21:50:00+00:00').to_pydatetime() - ]) - @pytest.mark.parametrize('end', [ - '2018-12-02 21:52:00+00:00', pd.Timestamp('2018-12-02 21:52:00+00:00'), - pd.Timestamp('2018-12-02 21:52:00+00:00').to_pydatetime() - ]) + @pytest.mark.parametrize( + "start", + [ + "2018-12-02 21:50:00+00:00", + pd.Timestamp("2018-12-02 21:50:00+00:00"), + pd.Timestamp("2018-12-02 21:50:00+00:00").to_pydatetime(), + ], + ) + @pytest.mark.parametrize( + "end", + [ + "2018-12-02 21:52:00+00:00", + pd.Timestamp("2018-12-02 21:52:00+00:00"), + pd.Timestamp("2018-12-02 21:52:00+00:00").to_pydatetime(), + ], + ) def test_getitem_with_datestring_with_UTC_offset(self, start, end): # GH 24076 - idx = pd.date_range(start='2018-12-02 14:50:00-07:00', - end='2018-12-02 14:50:00-07:00', freq='1min') - df = pd.DataFrame(1, index=idx, columns=['A']) + idx = pd.date_range( + start="2018-12-02 14:50:00-07:00", + end="2018-12-02 14:50:00-07:00", + freq="1min", + ) + df = pd.DataFrame(1, index=idx, columns=["A"]) result = df[start:end] expected = df.iloc[0:3, :] tm.assert_frame_equal(result, expected) @@ -418,7 +463,7 @@ def test_getitem_with_datestring_with_UTC_offset(self, start, end): start = str(start) end = str(end) with pytest.raises(ValueError, match="Both dates must"): - df[start:end[:-4] + '1:00'] + df[start : end[:-4] + "1:00"] with pytest.raises(ValueError, match="The index must be timezone"): df = df.tz_localize(None) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index ae14396c75398..00310f4fba7c7 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -17,38 +17,48 @@ class TestDatetimeIndexOps: def test_dti_time(self): - rng = date_range('1/1/2000', freq='12min', periods=10) + rng = date_range("1/1/2000", freq="12min", periods=10) result = pd.Index(rng).time expected = [t.time() for t in rng] assert (result == expected).all() def test_dti_date(self): - rng = date_range('1/1/2000', freq='12H', periods=10) + rng = date_range("1/1/2000", freq="12H", periods=10) result = pd.Index(rng).date expected = [t.date() for t in rng] assert (result == expected).all() - @pytest.mark.parametrize('data', [ - ['1400-01-01'], - [datetime(1400, 1, 1)]]) + @pytest.mark.parametrize("data", [["1400-01-01"], [datetime(1400, 1, 1)]]) def test_dti_date_out_of_range(self, data): # GH#1475 msg = "Out of bounds nanosecond timestamp: 1400-01-01 00:00:00" with pytest.raises(OutOfBoundsDatetime, match=msg): DatetimeIndex(data) - @pytest.mark.parametrize('field', [ - 'dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter', - 'days_in_month', 'is_month_start', 'is_month_end', - 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'weekday_name']) + @pytest.mark.parametrize( + "field", + [ + "dayofweek", + "dayofyear", + "week", + "weekofyear", + "quarter", + "days_in_month", + "is_month_start", + "is_month_end", + "is_quarter_start", + "is_quarter_end", + "is_year_start", + "is_year_end", + "weekday_name", + ], + ) def test_dti_timestamp_fields(self, field): # extra fields from DatetimeIndex like 
quarter and week idx = tm.makeDateIndex(100) expected = getattr(idx, field)[-1] - if field == 'weekday_name': - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + if field == "weekday_name": + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = getattr(Timestamp(idx[-1]), field) else: result = getattr(Timestamp(idx[-1]), field) @@ -65,117 +75,141 @@ def test_dti_timestamp_freq_fields(self): # DatetimeIndex.round def test_round_daily(self): - dti = date_range('20130101 09:10:11', periods=5) - result = dti.round('D') - expected = date_range('20130101', periods=5) + dti = date_range("20130101 09:10:11", periods=5) + result = dti.round("D") + expected = date_range("20130101", periods=5) tm.assert_index_equal(result, expected) - dti = dti.tz_localize('UTC').tz_convert('US/Eastern') - result = dti.round('D') - expected = date_range('20130101', - periods=5).tz_localize('US/Eastern') + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") + result = dti.round("D") + expected = date_range("20130101", periods=5).tz_localize("US/Eastern") tm.assert_index_equal(result, expected) - result = dti.round('s') + result = dti.round("s") tm.assert_index_equal(result, dti) - @pytest.mark.parametrize('freq, error_msg', [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]) + @pytest.mark.parametrize( + "freq, error_msg", + [ + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ], + ) def test_round_invalid(self, freq, error_msg): - dti = date_range('20130101 09:10:11', periods=5) - dti = dti.tz_localize('UTC').tz_convert('US/Eastern') + dti = date_range("20130101 09:10:11", periods=5) + dti = dti.tz_localize("UTC").tz_convert("US/Eastern") with pytest.raises(ValueError, match=error_msg): dti.round(freq) def test_round(self, tz_naive_fixture): tz = tz_naive_fixture - rng = date_range(start='2016-01-01', periods=5, - freq='30Min', tz=tz) + rng = date_range(start="2016-01-01", periods=5, freq="30Min", tz=tz) elt = rng[1] - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), - ]) + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 00:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 01:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + Timestamp("2016-01-01 02:00:00", tz=tz, freq="30T"), + ] + ) expected_elt = expected_rng[1] - tm.assert_index_equal(rng.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt + tm.assert_index_equal(rng.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - rng.round(freq='foo') + rng.round(freq="foo") with pytest.raises(ValueError, match=msg): - elt.round(freq='foo') + elt.round(freq="foo") msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - rng.round(freq='M') + rng.round(freq="M") with pytest.raises(ValueError, match=msg): - elt.round(freq='M') + elt.round(freq="M") # GH#14440 & GH#15578 - index = DatetimeIndex(['2016-10-17 12:00:00.0015'], tz=tz) - result = index.round('ms') - 
expected = DatetimeIndex(['2016-10-17 12:00:00.002000'], tz=tz) + index = DatetimeIndex(["2016-10-17 12:00:00.0015"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.002000"], tz=tz) tm.assert_index_equal(result, expected) - for freq in ['us', 'ns']: + for freq in ["us", "ns"]: tm.assert_index_equal(index, index.round(freq)) - index = DatetimeIndex(['2016-10-17 12:00:00.00149'], tz=tz) - result = index.round('ms') - expected = DatetimeIndex(['2016-10-17 12:00:00.001000'], tz=tz) + index = DatetimeIndex(["2016-10-17 12:00:00.00149"], tz=tz) + result = index.round("ms") + expected = DatetimeIndex(["2016-10-17 12:00:00.001000"], tz=tz) tm.assert_index_equal(result, expected) - index = DatetimeIndex(['2016-10-17 12:00:00.001501031']) - result = index.round('10ns') - expected = DatetimeIndex(['2016-10-17 12:00:00.001501030']) + index = DatetimeIndex(["2016-10-17 12:00:00.001501031"]) + result = index.round("10ns") + expected = DatetimeIndex(["2016-10-17 12:00:00.001501030"]) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(False): - ts = '2016-10-17 12:00:00.001501031' - DatetimeIndex([ts]).round('1010ns') + ts = "2016-10-17 12:00:00.001501031" + DatetimeIndex([ts]).round("1010ns") def test_no_rounding_occurs(self, tz_naive_fixture): # GH 21262 tz = tz_naive_fixture - rng = date_range(start='2016-01-01', periods=5, - freq='2Min', tz=tz) - - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'), - Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'), - ]) - - tm.assert_index_equal(rng.round(freq='2T'), expected_rng) - - @pytest.mark.parametrize('test_input, rounder, freq, expected', [ - (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), - (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), - (['2117-01-01 00:00:45.000000012'], 'floor', '10ns', - ['2117-01-01 00:00:45.000000010']), - (['1823-01-01 00:00:01.000000012'], 'ceil', '10ns', - ['1823-01-01 00:00:01.000000020']), - (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), - (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), - (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']), - (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']), - (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']), - (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']), - (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', - ('NaT', '1823-01-01 00:00:01')), - (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', - ('NaT', '1823-01-01 00:00:01')) - ]) + rng = date_range(start="2016-01-01", periods=5, freq="2Min", tz=tz) + + expected_rng = DatetimeIndex( + [ + Timestamp("2016-01-01 00:00:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:02:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:04:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:06:00", tz=tz, freq="2T"), + Timestamp("2016-01-01 00:08:00", tz=tz, freq="2T"), + ] + ) + + tm.assert_index_equal(rng.round(freq="2T"), expected_rng) + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + (["2117-01-01 00:00:45"], "floor", "15s", ["2117-01-01 00:00:45"]), + (["2117-01-01 00:00:45"], "ceil", "15s", ["2117-01-01 00:00:45"]), + ( + ["2117-01-01 00:00:45.000000012"], + "floor", + "10ns", + ["2117-01-01 00:00:45.000000010"], + ), + ( + ["1823-01-01 
00:00:01.000000012"], + "ceil", + "10ns", + ["1823-01-01 00:00:01.000000020"], + ), + (["1823-01-01 00:00:01"], "floor", "1s", ["1823-01-01 00:00:01"]), + (["1823-01-01 00:00:01"], "ceil", "1s", ["1823-01-01 00:00:01"]), + (["2018-01-01 00:15:00"], "ceil", "15T", ["2018-01-01 00:15:00"]), + (["2018-01-01 00:15:00"], "floor", "15T", ["2018-01-01 00:15:00"]), + (["1823-01-01 03:00:00"], "ceil", "3H", ["1823-01-01 03:00:00"]), + (["1823-01-01 03:00:00"], "floor", "3H", ["1823-01-01 03:00:00"]), + ( + ("NaT", "1823-01-01 00:00:01"), + "floor", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ( + ("NaT", "1823-01-01 00:00:01"), + "ceil", + "1s", + ("NaT", "1823-01-01 00:00:01"), + ), + ], + ) def test_ceil_floor_edge(self, test_input, rounder, freq, expected): dt = DatetimeIndex(list(test_input)) func = getattr(dt, rounder) @@ -183,17 +217,34 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = DatetimeIndex(list(expected)) assert expected.equals(result) - @pytest.mark.parametrize('start, index_freq, periods', [ - ('2018-01-01', '12H', 25), - ('2018-01-01 0:0:0.124999', '1ns', 1000), - ]) - @pytest.mark.parametrize('round_freq', [ - '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', - '250ns', '500ns', '750ns', - '1us', '19us', '250us', '500us', '750us', - '1s', '2s', '3s', - '12H', '1D', - ]) + @pytest.mark.parametrize( + "start, index_freq, periods", + [("2018-01-01", "12H", 25), ("2018-01-01 0:0:0.124999", "1ns", 1000)], + ) + @pytest.mark.parametrize( + "round_freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "12H", + "1D", + ], + ) def test_round_int64(self, start, index_freq, periods, round_freq): dt = date_range(start=start, freq=index_freq, periods=periods) unit = to_offset(round_freq).nanos @@ -227,19 +278,21 @@ def test_round_int64(self, start, index_freq, periods, round_freq): # DatetimeIndex.normalize def test_normalize(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D') + rng = date_range("1/1/2000 9:30", periods=10, freq="D") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D') + expected = date_range("1/1/2000", periods=10, freq="D") tm.assert_index_equal(result, expected) - arr_ns = np.array([1380585623454345752, - 1380585612343234312]).astype("datetime64[ns]") + arr_ns = np.array([1380585623454345752, 1380585612343234312]).astype( + "datetime64[ns]" + ) rng_ns = DatetimeIndex(arr_ns) rng_ns_normalized = rng_ns.normalize() - arr_ns = np.array([1380585600000000000, - 1380585600000000000]).astype("datetime64[ns]") + arr_ns = np.array([1380585600000000000, 1380585600000000000]).astype( + "datetime64[ns]" + ) expected = DatetimeIndex(arr_ns) tm.assert_index_equal(rng_ns_normalized, expected) @@ -247,44 +300,43 @@ def test_normalize(self): assert not rng.is_normalized def test_normalize_nat(self): - dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')]) + dti = DatetimeIndex([pd.NaT, Timestamp("2018-01-01 01:00:00")]) result = dti.normalize() - expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')]) + expected = DatetimeIndex([pd.NaT, Timestamp("2018-01-01")]) tm.assert_index_equal(result, expected) class TestDateTimeIndexToJulianDate: - def test_1700(self): - dr = date_range(start=Timestamp('1710-10-01'), periods=5, freq='D') + dr = date_range(start=Timestamp("1710-10-01"), periods=5, freq="D") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert 
isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_2000(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='D') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="D") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_hour(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='H') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="H") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_minute(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='T') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="T") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) tm.assert_index_equal(r1, r2) def test_second(self): - dr = date_range(start=Timestamp('2000-02-27'), periods=5, freq='S') + dr = date_range(start=Timestamp("2000-02-27"), periods=5, freq="S") r1 = pd.Index([x.to_julian_date() for x in dr]) r2 = dr.to_julian_date() assert isinstance(r2, pd.Float64Index) diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index fd666f3d56c9d..67fc70c17d7bc 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -7,8 +7,15 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, Int64Index, Series, bdate_range, - date_range, to_datetime) + DataFrame, + DatetimeIndex, + Index, + Int64Index, + Series, + bdate_range, + date_range, + to_datetime, +) import pandas.util.testing as tm from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd @@ -17,8 +24,14 @@ class TestDatetimeIndexSetOps: - tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', 'dateutil/Asia/Singapore', - 'dateutil/US/Pacific'] + tz = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Singapore", + "dateutil/US/Pacific", + ] # TODO: moved from test_datetimelike; dedup with version below @pytest.mark.parametrize("sort", [None, False]) @@ -37,9 +50,9 @@ def test_union3(self, sort, box): second = everything[5:] # GH 10149 - expected = first.astype('O').union( - pd.Index(second.values, dtype='O') - ).astype('O') + expected = ( + first.astype("O").union(pd.Index(second.values, dtype="O")).astype("O") + ) case = box(second.values) result = first.union(case, sort=sort) tm.assert_index_equal(result, expected) @@ -47,27 +60,26 @@ def test_union3(self, sort, box): @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) def test_union(self, tz, sort): - rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) - expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + rng1 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) + expected1 = pd.date_range("1/1/2000", freq="D", periods=10, tz=tz) expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) - rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) - expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + rng2 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", 
periods=5, tz=tz) + expected2 = pd.date_range("1/1/2000", freq="D", periods=8, tz=tz) expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) - rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + rng3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) - expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + expected3 = pd.date_range("1/1/2000", freq="D", periods=5, tz=tz) expected3_notsorted = rng3 - for rng, other, exp, exp_notsorted in [(rng1, other1, expected1, - expected1_notsorted), - (rng2, other2, expected2, - expected2_notsorted), - (rng3, other3, expected3, - expected3_notsorted)]: + for rng, other, exp, exp_notsorted in [ + (rng1, other1, expected1, expected1_notsorted), + (rng2, other2, expected2, expected2_notsorted), + (rng3, other3, expected3, expected3_notsorted), + ]: result_union = rng.union(other, sort=sort) tm.assert_index_equal(result_union, exp) @@ -80,8 +92,8 @@ def test_union(self, tz, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_coverage(self, sort): - idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) - ordered = DatetimeIndex(idx.sort_values(), freq='infer') + idx = DatetimeIndex(["2000-01-03", "2000-01-01", "2000-01-02"]) + ordered = DatetimeIndex(idx.sort_values(), freq="infer") result = ordered.union(idx, sort=sort) tm.assert_index_equal(result, ordered) @@ -91,8 +103,8 @@ def test_union_coverage(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1730(self, sort): - rng_a = date_range('1/1/2012', periods=4, freq='3H') - rng_b = date_range('1/1/2012', periods=4, freq='4H') + rng_a = date_range("1/1/2012", periods=4, freq="3H") + rng_b = date_range("1/1/2012", periods=4, freq="4H") result = rng_a.union(rng_b, sort=sort) exp = list(rng_a) + list(rng_b[1:]) @@ -104,16 +116,24 @@ def test_union_bug_1730(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_1745(self, sort): - left = DatetimeIndex(['2012-05-11 15:19:49.695000']) - right = DatetimeIndex(['2012-05-29 13:04:21.322000', - '2012-05-11 15:27:24.873000', - '2012-05-11 15:31:05.350000']) + left = DatetimeIndex(["2012-05-11 15:19:49.695000"]) + right = DatetimeIndex( + [ + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) result = left.union(right, sort=sort) - exp = DatetimeIndex(['2012-05-11 15:19:49.695000', - '2012-05-29 13:04:21.322000', - '2012-05-11 15:27:24.873000', - '2012-05-11 15:31:05.350000']) + exp = DatetimeIndex( + [ + "2012-05-11 15:19:49.695000", + "2012-05-29 13:04:21.322000", + "2012-05-11 15:27:24.873000", + "2012-05-11 15:31:05.350000", + ] + ) if sort is None: exp = exp.sort_values() tm.assert_index_equal(result, exp) @@ -121,6 +141,7 @@ def test_union_bug_1745(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_bug_4564(self, sort): from pandas import DateOffset + left = date_range("2013-01-01", "2013-02-01") right = left + DateOffset(minutes=15) @@ -135,7 +156,7 @@ def test_union_bug_4564(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_freq_both_none(self, sort): # GH11086 - expected = bdate_range('20150101', periods=10) + expected = bdate_range("20150101", periods=10) expected.freq = None result = expected.union(expected, sort=sort) @@ -143,20 +164,20 @@ def test_union_freq_both_none(self, sort): assert result.freq is None def test_union_dataframe_index(self): - rng1 = date_range('1/1/1999', '1/1/2012', 
freq='MS') + rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") s1 = Series(np.random.randn(len(rng1)), rng1) - rng2 = date_range('1/1/1980', '12/1/2001', freq='MS') + rng2 = date_range("1/1/1980", "12/1/2001", freq="MS") s2 = Series(np.random.randn(len(rng2)), rng2) - df = DataFrame({'s1': s1, 's2': s2}) + df = DataFrame({"s1": s1, "s2": s2}) - exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') + exp = pd.date_range("1/1/1980", "1/1/2012", freq="MS") tm.assert_index_equal(df.index, exp) @pytest.mark.parametrize("sort", [None, False]) def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) - i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D') + i2 = date_range(start="2012-01-03 00:00:00", periods=10, freq="D") # Works i1.union(i2, sort=sort) # Fails with "AttributeError: can't set attribute" @@ -175,31 +196,35 @@ def test_intersection2(self): result = first.intersection(case) assert tm.equalContents(result, second) - third = Index(['a', 'b', 'c']) + third = Index(["a", "b", "c"]) result = first.intersection(third) expected = pd.Index([], dtype=object) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific']) + @pytest.mark.parametrize( + "tz", [None, "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, tz, sort): # GH 4690 (with tz) - base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') + base = date_range("6/1/2000", "6/30/2000", freq="D", name="idx") # if target has the same name, it is preserved - rng2 = date_range('5/15/2000', '6/20/2000', freq='D', name='idx') - expected2 = date_range('6/1/2000', '6/20/2000', freq='D', name='idx') + rng2 = date_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = date_range("6/1/2000", "6/20/2000", freq="D", name="idx") # if target name is different, it will be reset - rng3 = date_range('5/15/2000', '6/20/2000', freq='D', name='other') - expected3 = date_range('6/1/2000', '6/20/2000', freq='D', name=None) + rng3 = date_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = date_range("6/1/2000", "6/20/2000", freq="D", name=None) - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = DatetimeIndex([], name='idx') + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = DatetimeIndex([], name="idx") - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng) tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -207,29 +232,31 @@ def test_intersection(self, tz, sort): assert result.tz == expected.tz # non-monotonic - base = DatetimeIndex(['2011-01-05', '2011-01-04', - '2011-01-02', '2011-01-03'], - tz=tz, name='idx') - - rng2 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='idx') - expected2 = DatetimeIndex(['2011-01-04', '2011-01-02'], - tz=tz, name='idx') - - rng3 = DatetimeIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - tz=tz, name='other') - expected3 = DatetimeIndex(['2011-01-04', '2011-01-02'], - tz=tz, name=None) + base = DatetimeIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], tz=tz, name="idx" + ) + + rng2 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], tz=tz, name="idx" + 
) + expected2 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name="idx") + + rng3 = DatetimeIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + tz=tz, + name="other", + ) + expected3 = DatetimeIndex(["2011-01-04", "2011-01-02"], tz=tz, name=None) # GH 7880 - rng4 = date_range('7/1/2000', '7/31/2000', freq='D', tz=tz, - name='idx') - expected4 = DatetimeIndex([], tz=tz, name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + rng4 = date_range("7/1/2000", "7/31/2000", freq="D", tz=tz, name="idx") + expected4 = DatetimeIndex([], tz=tz, name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() @@ -240,7 +267,7 @@ def test_intersection(self, tz, sort): def test_intersection_empty(self): # empty same freq GH2129 - rng = date_range('6/1/2000', '6/15/2000', freq='T') + rng = date_range("6/1/2000", "6/15/2000", freq="T") result = rng[0:0].intersection(rng) assert len(result) == 0 @@ -249,7 +276,8 @@ def test_intersection_empty(self): def test_intersection_bug_1708(self): from pandas import DateOffset - index_1 = date_range('1/1/2012', periods=4, freq='12H') + + index_1 = date_range("1/1/2012", periods=4, freq="12H") index_2 = index_1 + DateOffset(hours=1) result = index_1 & index_2 @@ -258,24 +286,25 @@ def test_intersection_bug_1708(self): @pytest.mark.parametrize("tz", tz) @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, tz, sort): - rng_dates = ['1/2/2000', '1/3/2000', '1/1/2000', '1/4/2000', - '1/5/2000'] + rng_dates = ["1/2/2000", "1/3/2000", "1/1/2000", "1/4/2000", "1/5/2000"] rng1 = pd.DatetimeIndex(rng_dates, tz=tz) - other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) + other1 = pd.date_range("1/6/2000", freq="D", periods=5, tz=tz) expected1 = pd.DatetimeIndex(rng_dates, tz=tz) rng2 = pd.DatetimeIndex(rng_dates, tz=tz) - other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) + other2 = pd.date_range("1/4/2000", freq="D", periods=5, tz=tz) expected2 = pd.DatetimeIndex(rng_dates[:3], tz=tz) rng3 = pd.DatetimeIndex(rng_dates, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) expected3 = pd.DatetimeIndex(rng_dates, tz=tz) - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + ]: result_diff = rng.difference(other, sort) if sort is None: expected = expected.sort_values() @@ -290,29 +319,27 @@ def test_difference_freq(self, sort): expected = DatetimeIndex(["20160920", "20160925"], freq=None) idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = date_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) expected = DatetimeIndex(["20160920", "20160921"], freq=None) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_diff(self, sort): - dti1 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=100) - dti2 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31), - periods=98) + dti1 = date_range(freq="Q-JAN", start=datetime(1997, 12, 
31), periods=100) + dti2 = date_range(freq="Q-JAN", start=datetime(1997, 12, 31), periods=98) assert len(dti1.difference(dti2, sort)) == 2 @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_union_join_empty(self, sort): - dti = date_range(start='1/1/2001', end='2/1/2001', freq='D') + dti = date_range(start="1/1/2001", end="2/1/2001", freq="D") empty = Index([]) result = dti.union(empty, sort=sort) - expected = dti.astype('O') + expected = dti.astype("O") tm.assert_index_equal(result, expected) result = dti.join(empty) @@ -320,16 +347,13 @@ def test_datetimeindex_union_join_empty(self, sort): tm.assert_index_equal(result, dti) def test_join_nonunique(self): - idx1 = to_datetime(['2012-11-06 16:00:11.477563', - '2012-11-06 16:00:11.477563']) - idx2 = to_datetime(['2012-11-06 15:11:09.006507', - '2012-11-06 15:11:09.006507']) - rs = idx1.join(idx2, how='outer') + idx1 = to_datetime(["2012-11-06 16:00:11.477563", "2012-11-06 16:00:11.477563"]) + idx2 = to_datetime(["2012-11-06 15:11:09.006507", "2012-11-06 15:11:09.006507"]) + rs = idx1.join(idx2, how="outer") assert rs.is_monotonic class TestBusinessDatetimeIndex: - def setup_method(self, method): self.rng = bdate_range(START, END) @@ -376,14 +400,14 @@ def test_outer_join(self): left = self.rng[:10] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @@ -391,19 +415,19 @@ def test_outer_join(self): left = self.rng[:5] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_join = self.rng.join(rng, how='outer') + the_join = self.rng.join(rng, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @pytest.mark.parametrize("sort", [None, False]) def test_union_not_cacheable(self, sort): - rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng = date_range("1/1/2000", periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2, sort=sort) @@ -420,7 +444,7 @@ def test_union_not_cacheable(self, sort): tm.assert_index_equal(the_union, expected) def test_intersection(self): - rng = date_range('1/1/2000', periods=50, freq=Minute()) + rng = date_range("1/1/2000", periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_int = rng1.intersection(rng2) @@ -439,15 +463,16 @@ def test_intersection(self): def test_intersection_bug(self): # GH #771 - a = bdate_range('11/30/2011', '12/31/2011') - b = bdate_range('12/10/2011', '12/20/2011') + a = bdate_range("11/30/2011", "12/31/2011") + b = bdate_range("12/10/2011", "12/20/2011") result = a.intersection(b) tm.assert_index_equal(result, b) @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_pytz(self, sort): from pytz import timezone - tz = timezone('US/Eastern') + + tz = timezone("US/Eastern") early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) @@ -455,10 +480,8 @@ def test_month_range_union_tz_pytz(self, sort): late_start = datetime(2011, 3, 1) late_end = datetime(2011, 5, 1) - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = 
date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) early_dr.union(late_dr, sort=sort) @@ -466,7 +489,8 @@ def test_month_range_union_tz_pytz(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz - tz = dateutil_gettz('US/Eastern') + + tz = dateutil_gettz("US/Eastern") early_start = datetime(2011, 1, 1) early_end = datetime(2011, 3, 1) @@ -474,18 +498,15 @@ def test_month_range_union_tz_dateutil(self, sort): late_start = datetime(2011, 3, 1) late_end = datetime(2011, 5, 1) - early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=MonthEnd()) - late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=MonthEnd()) + early_dr = date_range(start=early_start, end=early_end, tz=tz, freq=MonthEnd()) + late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) early_dr.union(late_dr, sort=sort) class TestCustomDatetimeIndex: - def setup_method(self, method): - self.rng = bdate_range(START, END, freq='C') + self.rng = bdate_range(START, END, freq="C") @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): @@ -527,14 +548,14 @@ def test_outer_join(self): left = self.rng[:10] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None @@ -542,19 +563,19 @@ def test_outer_join(self): left = self.rng[:5] right = self.rng[5:10] - the_join = left.join(right, how='outer') + the_join = left.join(right, how="outer") assert isinstance(the_join, DatetimeIndex) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_join = self.rng.join(rng, how='outer') + the_join = self.rng.join(rng, how="outer") assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None def test_intersection_bug(self): # GH #771 - a = bdate_range('11/30/2011', '12/31/2011', freq='C') - b = bdate_range('12/10/2011', '12/20/2011', freq='C') + a = bdate_range("11/30/2011", "12/31/2011", freq="C") + b = bdate_range("12/10/2011", "12/20/2011", freq="C") result = a.intersection(b) tm.assert_index_equal(result, b) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index af0183379790a..059dbb00019d8 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -14,8 +14,14 @@ import pandas as pd from pandas import ( - DatetimeIndex, Index, Timestamp, bdate_range, date_range, isna, - to_datetime) + DatetimeIndex, + Index, + Timestamp, + bdate_range, + date_range, + isna, + to_datetime, +) import pandas.util.testing as tm @@ -36,7 +42,7 @@ def dst(self, dt): return timedelta(0) -fixed_off = FixedOffset(-420, '-07:00') +fixed_off = FixedOffset(-420, "-07:00") fixed_off_no_name = FixedOffset(-330, None) @@ -47,43 +53,43 @@ def test_tz_convert_nat(self): # GH#5546 dates = [pd.NaT] idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = 
idx.tz_convert('US/Eastern') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) - idx = idx.tz_convert('UTC') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) - - dates = ['2010-12-01 00:00', '2010-12-02 00:00', pd.NaT] + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Eastern")) + idx = idx.tz_convert("UTC") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="UTC")) + + dates = ["2010-12-01 00:00", "2010-12-02 00:00", pd.NaT] idx = DatetimeIndex(dates) - idx = idx.tz_localize('US/Pacific') - tm.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 03:00', '2010-12-02 03:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_localize("US/Pacific") + tm.assert_index_equal(idx, DatetimeIndex(dates, tz="US/Pacific")) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 03:00", "2010-12-02 03:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) idx = idx + pd.offsets.Hour(5) - expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) - idx = idx.tz_convert('US/Pacific') - expected = ['2010-12-01 05:00', '2010-12-02 05:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) + idx = idx.tz_convert("US/Pacific") + expected = ["2010-12-01 05:00", "2010-12-02 05:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - idx = idx + np.timedelta64(3, 'h') - expected = ['2010-12-01 08:00', '2010-12-02 08:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) + idx = idx + np.timedelta64(3, "h") + expected = ["2010-12-01 08:00", "2010-12-02 08:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Pacific")) - idx = idx.tz_convert('US/Eastern') - expected = ['2010-12-01 11:00', '2010-12-02 11:00', pd.NaT] - tm.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) + idx = idx.tz_convert("US/Eastern") + expected = ["2010-12-01 11:00", "2010-12-02 11:00", pd.NaT] + tm.assert_index_equal(idx, DatetimeIndex(expected, tz="US/Eastern")) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_tz_convert_compat_timestamp(self, prefix): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - idx = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + idx = DatetimeIndex(strdates, tz=prefix + "US/Eastern") - conv = idx[0].tz_convert(prefix + 'US/Pacific') - expected = idx.tz_convert(prefix + 'US/Pacific')[0] + conv = idx[0].tz_convert(prefix + "US/Pacific") + expected = idx.tz_convert(prefix + "US/Pacific")[0] assert conv == expected @@ -92,196 +98,203 @@ def test_dti_tz_convert_hour_overflow_dst(self): # https://github.com/pandas-dev/pandas/issues/13306 # sorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2009-05-12 09:50:32'] - tt = DatetimeIndex(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2009-05-12 09:50:32"] + tt = 
DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # sorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2009-05-12 13:50:32'] - tt = DatetimeIndex(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2009-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) # unsorted case US/Eastern -> UTC - ts = ['2008-05-12 09:50:00', - '2008-12-12 09:50:35', - '2008-05-12 09:50:32'] - tt = DatetimeIndex(ts).tz_localize('US/Eastern') - ut = tt.tz_convert('UTC') + ts = ["2008-05-12 09:50:00", "2008-12-12 09:50:35", "2008-05-12 09:50:32"] + tt = DatetimeIndex(ts).tz_localize("US/Eastern") + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # unsorted case UTC -> US/Eastern - ts = ['2008-05-12 13:50:00', - '2008-12-12 14:50:35', - '2008-05-12 13:50:32'] - tt = DatetimeIndex(ts).tz_localize('UTC') - ut = tt.tz_convert('US/Eastern') + ts = ["2008-05-12 13:50:00", "2008-12-12 14:50:35", "2008-05-12 13:50:32"] + tt = DatetimeIndex(ts).tz_localize("UTC") + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_convert_hour_overflow_dst_timestamps(self, tz): # Regression test for GH#13306 # sorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2009-05-12 09:50:32', tz=tz)] + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2009-05-12 09:50:32", tz=tz), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('UTC') + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # sorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2009-05-12 13:50:32', tz='UTC')] + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2009-05-12 13:50:32", tz="UTC"), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('US/Eastern') + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) # unsorted case US/Eastern -> UTC - ts = [Timestamp('2008-05-12 09:50:00', tz=tz), - Timestamp('2008-12-12 09:50:35', tz=tz), - Timestamp('2008-05-12 09:50:32', tz=tz)] + ts = [ + Timestamp("2008-05-12 09:50:00", tz=tz), + Timestamp("2008-12-12 09:50:35", tz=tz), + Timestamp("2008-05-12 09:50:32", tz=tz), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('UTC') + ut = tt.tz_convert("UTC") expected = Index([13, 14, 13]) tm.assert_index_equal(ut.hour, expected) # unsorted case UTC -> US/Eastern - ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), - Timestamp('2008-12-12 14:50:35', tz='UTC'), - Timestamp('2008-05-12 13:50:32', tz='UTC')] + ts = [ + Timestamp("2008-05-12 13:50:00", tz="UTC"), + Timestamp("2008-12-12 14:50:35", tz="UTC"), + Timestamp("2008-05-12 13:50:32", tz="UTC"), + ] tt = DatetimeIndex(ts) - ut = tt.tz_convert('US/Eastern') + ut = tt.tz_convert("US/Eastern") expected = Index([9, 9, 9]) tm.assert_index_equal(ut.hour, expected) - 
@pytest.mark.parametrize('freq, n', [('H', 1), ('T', 60), ('S', 3600)]) + @pytest.mark.parametrize("freq, n", [("H", 1), ("T", 60), ("S", 3600)]) def test_dti_tz_convert_trans_pos_plus_1__bug(self, freq, n): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pandas-dev/pandas/issues/4496 for details. - idx = date_range(datetime(2011, 3, 26, 23), - datetime(2011, 3, 27, 1), freq=freq) - idx = idx.tz_localize('UTC') - idx = idx.tz_convert('Europe/Moscow') + idx = date_range(datetime(2011, 3, 26, 23), datetime(2011, 3, 27, 1), freq=freq) + idx = idx.tz_localize("UTC") + idx = idx.tz_convert("Europe/Moscow") expected = np.repeat(np.array([3, 4, 5]), np.array([n, n, 1])) tm.assert_index_equal(idx.hour, Index(expected)) def test_dti_tz_convert_dst(self): - for freq, n in [('H', 1), ('T', 60), ('S', 3600)]: + for freq, n in [("H", 1), ("T", 60), ("S", 3600)]: # Start DST - idx = date_range('2014-03-08 23:00', '2014-03-09 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([18, 19, 20, 21, 22, 23, - 0, 1, 3, 4, 5]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-03-08 23:00", "2014-03-09 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([18, 19, 20, 21, 22, 23, 0, 1, 3, 4, 5]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) - idx = date_range('2014-03-08 18:00', '2014-03-09 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-03-08 18:00", "2014-03-09 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) # End DST - idx = date_range('2014-11-01 23:00', '2014-11-02 09:00', freq=freq, - tz='UTC') - idx = idx.tz_convert('US/Eastern') - expected = np.repeat(np.array([19, 20, 21, 22, 23, - 0, 1, 1, 2, 3, 4]), - np.array([n, n, n, n, n, n, n, n, n, n, 1])) + idx = date_range( + "2014-11-01 23:00", "2014-11-02 09:00", freq=freq, tz="UTC" + ) + idx = idx.tz_convert("US/Eastern") + expected = np.repeat( + np.array([19, 20, 21, 22, 23, 0, 1, 1, 2, 3, 4]), + np.array([n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) - idx = date_range('2014-11-01 18:00', '2014-11-02 05:00', freq=freq, - tz='US/Eastern') - idx = idx.tz_convert('UTC') - expected = np.repeat(np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10]), - np.array([n, n, n, n, n, n, n, n, n, - n, n, n, 1])) + idx = date_range( + "2014-11-01 18:00", "2014-11-02 05:00", freq=freq, tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") + expected = np.repeat( + np.array([22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + np.array([n, n, n, n, n, n, n, n, n, n, n, n, 1]), + ) tm.assert_index_equal(idx.hour, Index(expected)) # daily # Start DST - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') + idx = date_range("2014-03-08 00:00", "2014-03-09 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") tm.assert_index_equal(idx.hour, Index([19, 19])) - idx = date_range('2014-03-08 00:00', '2014-03-09 00:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') + idx = date_range( + "2014-03-08 00:00", 
"2014-03-09 00:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") tm.assert_index_equal(idx.hour, Index([5, 5])) # End DST - idx = date_range('2014-11-01 00:00', '2014-11-02 00:00', freq='D', - tz='UTC') - idx = idx.tz_convert('US/Eastern') + idx = date_range("2014-11-01 00:00", "2014-11-02 00:00", freq="D", tz="UTC") + idx = idx.tz_convert("US/Eastern") tm.assert_index_equal(idx.hour, Index([20, 20])) - idx = date_range('2014-11-01 00:00', '2014-11-02 000:00', freq='D', - tz='US/Eastern') - idx = idx.tz_convert('UTC') + idx = date_range( + "2014-11-01 00:00", "2014-11-02 000:00", freq="D", tz="US/Eastern" + ) + idx = idx.tz_convert("UTC") tm.assert_index_equal(idx.hour, Index([4, 4])) def test_tz_convert_roundtrip(self, tz_aware_fixture): tz = tz_aware_fixture - idx1 = date_range(start='2014-01-01', end='2014-12-31', freq='M', - tz='UTC') - exp1 = date_range(start='2014-01-01', end='2014-12-31', freq='M') + idx1 = date_range(start="2014-01-01", end="2014-12-31", freq="M", tz="UTC") + exp1 = date_range(start="2014-01-01", end="2014-12-31", freq="M") - idx2 = date_range(start='2014-01-01', end='2014-12-31', freq='D', - tz='UTC') - exp2 = date_range(start='2014-01-01', end='2014-12-31', freq='D') + idx2 = date_range(start="2014-01-01", end="2014-12-31", freq="D", tz="UTC") + exp2 = date_range(start="2014-01-01", end="2014-12-31", freq="D") - idx3 = date_range(start='2014-01-01', end='2014-03-01', freq='H', - tz='UTC') - exp3 = date_range(start='2014-01-01', end='2014-03-01', freq='H') + idx3 = date_range(start="2014-01-01", end="2014-03-01", freq="H", tz="UTC") + exp3 = date_range(start="2014-01-01", end="2014-03-01", freq="H") - idx4 = date_range(start='2014-08-01', end='2014-10-31', freq='T', - tz='UTC') - exp4 = date_range(start='2014-08-01', end='2014-10-31', freq='T') + idx4 = date_range(start="2014-08-01", end="2014-10-31", freq="T", tz="UTC") + exp4 = date_range(start="2014-08-01", end="2014-10-31", freq="T") - for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), - (idx4, exp4)]: + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3), (idx4, exp4)]: converted = idx.tz_convert(tz) reset = converted.tz_convert(None) tm.assert_index_equal(reset, expected) assert reset.tzinfo is None - expected = converted.tz_convert('UTC').tz_localize(None) + expected = converted.tz_convert("UTC").tz_localize(None) tm.assert_index_equal(reset, expected) def test_dti_tz_convert_tzlocal(self): # GH#13583 # tz_convert doesn't affect to internal - dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti = date_range(start="2001-01-01", end="2001-03-01", tz="UTC") dti2 = dti.tz_convert(dateutil.tz.tzlocal()) tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) dti2 = dti.tz_convert(None) tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', - pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) def test_dti_tz_convert_utc_to_local_no_modify(self, tz): - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tz) # Values are unmodified @@ -289,9 +302,9 @@ def 
test_dti_tz_convert_utc_to_local_no_modify(self, tz): assert timezones.tz_compare(rng_eastern.tz, timezones.maybe_get_tz(tz)) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_convert_unsorted(self, tzstr): - dr = date_range('2012-03-09', freq='H', periods=100, tz='utc') + dr = date_range("2012-03-09", freq="H", periods=100, tz="utc") dr = dr.tz_convert(tzstr) result = dr[::-1].hour @@ -303,80 +316,80 @@ def test_tz_convert_unsorted(self, tzstr): def test_dti_tz_localize_nonexistent_raise_coerce(self): # GH#13057 - times = ['2015-03-08 01:00', '2015-03-08 02:00', '2015-03-08 03:00'] + times = ["2015-03-08 01:00", "2015-03-08 02:00", "2015-03-08 03:00"] index = DatetimeIndex(times) - tz = 'US/Eastern' + tz = "US/Eastern" with pytest.raises(pytz.NonExistentTimeError): index.tz_localize(tz=tz) with pytest.raises(pytz.NonExistentTimeError): with tm.assert_produces_warning(FutureWarning): - index.tz_localize(tz=tz, errors='raise') - - with tm.assert_produces_warning(FutureWarning, - clear=FutureWarning, - check_stacklevel=False): - result = index.tz_localize(tz=tz, errors='coerce') - test_times = ['2015-03-08 01:00-05:00', 'NaT', - '2015-03-08 03:00-04:00'] + index.tz_localize(tz=tz, errors="raise") + + with tm.assert_produces_warning( + FutureWarning, clear=FutureWarning, check_stacklevel=False + ): + result = index.tz_localize(tz=tz, errors="coerce") + test_times = ["2015-03-08 01:00-05:00", "NaT", "2015-03-08 03:00-04:00"] dti = to_datetime(test_times, utc=True) - expected = dti.tz_convert('US/Eastern') + expected = dti.tz_convert("US/Eastern") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_infer(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour()) with pytest.raises(pytz.AmbiguousTimeError): dr.tz_localize(tz) # With repeated hours, we can infer the transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + dr = date_range( + datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='infer') + localized = di.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous="infer")) # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) localized = dr.tz_localize(tz) - localized_infer = dr.tz_localize(tz, ambiguous='infer') + localized_infer = dr.tz_localize(tz, ambiguous="infer") tm.assert_index_equal(localized, localized_infer) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - 
gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_times(self, tz): # March 13, 2011, spring forward, skip from 2 AM to 3 AM - dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, freq=pd.offsets.Hour()) with pytest.raises(pytz.NonExistentTimeError): dr.tz_localize(tz) # after dst transition, it works - dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=pd.offsets.Hour(), tz=tz) + dr = date_range( + datetime(2011, 3, 13, 3, 30), periods=3, freq=pd.offsets.Hour(), tz=tz + ) # November 6, 2011, fall back, repeat 2 AM hour - dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, freq=pd.offsets.Hour()) with pytest.raises(pytz.AmbiguousTimeError): dr.tz_localize(tz) # UTC is OK - dr = date_range(datetime(2011, 3, 13), periods=48, - freq=pd.offsets.Minute(30), tz=pytz.utc) + dr = date_range( + datetime(2011, 3, 13), periods=48, freq=pd.offsets.Minute(30), tz=pytz.utc + ) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] idx = DatetimeIndex(strdates) conv = idx.tz_localize(tzstr) @@ -386,47 +399,51 @@ def test_dti_tz_localize_pass_dates_to_utc(self, tzstr): assert conv.tz == fromdates.tz tm.assert_numpy_array_equal(conv.values, fromdates.values) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_tz_localize(self, prefix): - tzstr = prefix + 'US/Eastern' - dti = pd.date_range(start='1/1/2005', end='1/1/2005 0:00:30.256', - freq='L') + tzstr = prefix + "US/Eastern" + dti = pd.date_range(start="1/1/2005", end="1/1/2005 0:00:30.256", freq="L") dti2 = dti.tz_localize(tzstr) - dti_utc = pd.date_range(start='1/1/2005 05:00', - end='1/1/2005 5:00:30.256', freq='L', tz='utc') + dti_utc = pd.date_range( + start="1/1/2005 05:00", end="1/1/2005 5:00:30.256", freq="L", tz="utc" + ) tm.assert_numpy_array_equal(dti2.values, dti_utc.values) - dti3 = dti2.tz_convert(prefix + 'US/Pacific') + dti3 = dti2.tz_convert(prefix + "US/Pacific") tm.assert_numpy_array_equal(dti3.values, dti_utc.values) - dti = pd.date_range(start='11/6/2011 1:59', end='11/6/2011 2:00', - freq='L') + dti = pd.date_range(start="11/6/2011 1:59", end="11/6/2011 2:00", freq="L") with pytest.raises(pytz.AmbiguousTimeError): dti.tz_localize(tzstr) - dti = pd.date_range(start='3/13/2011 1:59', end='3/13/2011 2:00', - freq='L') + dti = pd.date_range(start="3/13/2011 1:59", end="3/13/2011 2:00", freq="L") with pytest.raises(pytz.NonExistentTimeError): dti.tz_localize(tzstr) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern', - pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize( + "tz", + [ + "US/Eastern", + "dateutil/US/Eastern", + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + ], + ) def test_dti_tz_localize_utc_conversion(self, tz): # Localizing to time zone should: # 1) check for DST ambiguities # 2) convert to UTC - rng = date_range('3/10/2012', '3/11/2012', freq='30T') + rng = date_range("3/10/2012", "3/11/2012", freq="30T") converted = rng.tz_localize(tz) expected_naive = rng + 
pd.offsets.Hour(5) tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail - rng = date_range('3/11/2012', '3/12/2012', freq='30T') + rng = date_range("3/11/2012", "3/12/2012", freq="30T") # Is this really how it should fail?? with pytest.raises(pytz.NonExistentTimeError): rng.tz_localize(tz) @@ -435,7 +452,7 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): # note: this tz tests that a tz-naive index can be localized # and de-localized successfully, when there are no DST transitions # in the range. - idx = date_range(start='2014-06-01', end='2014-08-30', freq='15T') + idx = date_range(start="2014-06-01", end="2014-08-30", freq="15T") tz = tz_aware_fixture localized = idx.tz_localize(tz) # cant localize a tz-aware object @@ -446,10 +463,10 @@ def test_dti_tz_localize_roundtrip(self, tz_aware_fixture): tm.assert_index_equal(reset, idx) def test_dti_tz_localize_naive(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") - conv = rng.tz_localize('US/Pacific') - exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + conv = rng.tz_localize("US/Pacific") + exp = date_range("1/1/2011", periods=100, freq="H", tz="US/Pacific") tm.assert_index_equal(conv, exp) @@ -458,55 +475,66 @@ def test_dti_tz_localize_tzlocal(self): offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) offset = int(offset.total_seconds() * 1000000000) - dti = date_range(start='2001-01-01', end='2001-03-01') + dti = date_range(start="2001-01-01", end="2001-03-01") dti2 = dti.tz_localize(dateutil.tz.tzlocal()) tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) - dti = date_range(start='2001-01-01', end='2001-03-01', - tz=dateutil.tz.tzlocal()) + dti = date_range(start="2001-01-01", end="2001-03-01", tz=dateutil.tz.tzlocal()) dti2 = dti.tz_localize(None) tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_nat(self, tz): - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] di = DatetimeIndex(times) - localized = di.tz_localize(tz, ambiguous='NaT') + localized = di.tz_localize(tz, ambiguous="NaT") - times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', - '11/06/2011 03:00'] - di_test = DatetimeIndex(times, tz='US/Eastern') + times = [ + "11/06/2011 00:00", + np.NaN, + np.NaN, + "11/06/2011 02:00", + "11/06/2011 03:00", + ] + di_test = DatetimeIndex(times, tz="US/Eastern") # left dtype is datetime64[ns, US/Eastern] # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] tm.assert_numpy_array_equal(di_test.values, localized.values) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_tz_localize_ambiguous_flags(self, tz): # November 6, 2011, fall back, repeat 2 AM hour # Pass in flags to determine right dst transition - dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=pd.offsets.Hour(), tz=tz) - times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', - '11/06/2011 02:00', '11/06/2011 03:00'] + dr = date_range( + 
datetime(2011, 11, 6, 0), periods=5, freq=pd.offsets.Hour(), tz=tz + ) + times = [ + "11/06/2011 00:00", + "11/06/2011 01:00", + "11/06/2011 01:00", + "11/06/2011 02:00", + "11/06/2011 03:00", + ] # Test tz_localize di = DatetimeIndex(times) is_dst = [1, 1, 0, 0, 0] localized = di.tz_localize(tz, ambiguous=is_dst) tm.assert_index_equal(dr, localized) - tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) + tm.assert_index_equal(dr, DatetimeIndex(times, tz=tz, ambiguous=is_dst)) localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) tm.assert_index_equal(dr, localized) - localized = di.tz_localize(tz, - ambiguous=np.array(is_dst).astype('bool')) + localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype("bool")) tm.assert_index_equal(dr, localized) # Test constructor @@ -528,76 +556,81 @@ def test_dti_tz_localize_ambiguous_flags(self, tz): tm.assert_index_equal(dr, localized) # When there is no dst transition, nothing special happens - dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=pd.offsets.Hour()) + dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=pd.offsets.Hour()) is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) tm.assert_index_equal(localized, localized_is_dst) # TODO: belongs outside tz_localize tests? - @pytest.mark.parametrize('tz', ['Europe/London', 'dateutil/Europe/London']) + @pytest.mark.parametrize("tz", ["Europe/London", "dateutil/Europe/London"]) def test_dti_construction_ambiguous_endpoint(self, tz): # construction with an ambiguous end-point # GH#11626 with pytest.raises(pytz.AmbiguousTimeError): - date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", freq="H") + date_range( + "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H" + ) - times = date_range("2013-10-26 23:00", "2013-10-27 01:00", freq="H", - tz=tz, ambiguous='infer') - assert times[0] == Timestamp('2013-10-26 23:00', tz=tz, freq="H") + times = date_range( + "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" + ) + assert times[0] == Timestamp("2013-10-26 23:00", tz=tz, freq="H") - if str(tz).startswith('dateutil'): + if str(tz).startswith("dateutil"): # fixed ambiguous behavior # see GH#14621 - assert times[-1] == Timestamp('2013-10-27 01:00:00+0100', - tz=tz, freq="H") + assert times[-1] == Timestamp("2013-10-27 01:00:00+0100", tz=tz, freq="H") else: - assert times[-1] == Timestamp('2013-10-27 01:00:00+0000', - tz=tz, freq="H") - - @pytest.mark.parametrize('tz, option, expected', [ - ['US/Pacific', 'shift_forward', "2019-03-10 03:00"], - ['dateutil/US/Pacific', 'shift_forward', "2019-03-10 03:00"], - ['US/Pacific', 'shift_backward', "2019-03-10 01:00"], - pytest.param('dateutil/US/Pacific', 'shift_backward', - "2019-03-10 01:00", - marks=pytest.mark.xfail(reason="GH 24329")), - ['US/Pacific', timedelta(hours=1), "2019-03-10 03:00"] - ]) + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H") + + @pytest.mark.parametrize( + "tz, option, expected", + [ + ["US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["dateutil/US/Pacific", "shift_forward", "2019-03-10 03:00"], + ["US/Pacific", "shift_backward", "2019-03-10 01:00"], + pytest.param( + "dateutil/US/Pacific", + "shift_backward", + "2019-03-10 01:00", + marks=pytest.mark.xfail(reason="GH 24329"), + ), + ["US/Pacific", timedelta(hours=1), "2019-03-10 03:00"], + ], + ) def test_dti_construction_nonexistent_endpoint(self, tz, option, expected): # construction 
with an nonexistent end-point with pytest.raises(pytz.NonExistentTimeError): - date_range("2019-03-10 00:00", "2019-03-10 02:00", - tz="US/Pacific", freq="H") + date_range( + "2019-03-10 00:00", "2019-03-10 02:00", tz="US/Pacific", freq="H" + ) - times = date_range("2019-03-10 00:00", "2019-03-10 02:00", freq="H", - tz=tz, nonexistent=option) + times = date_range( + "2019-03-10 00:00", "2019-03-10 02:00", freq="H", tz=tz, nonexistent=option + ) assert times[-1] == Timestamp(expected, tz=tz, freq="H") def test_dti_tz_localize_bdate_range(self): - dr = pd.bdate_range('1/1/2009', '1/1/2010') - dr_utc = pd.bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + dr = pd.bdate_range("1/1/2009", "1/1/2010") + dr_utc = pd.bdate_range("1/1/2009", "1/1/2010", tz=pytz.utc) localized = dr.tz_localize(pytz.utc) tm.assert_index_equal(dr_utc, localized) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) - @pytest.mark.parametrize('method, exp', [ - ['NaT', pd.NaT], - ['raise', None], - ['foo', 'invalid'] - ]) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", [["NaT", pd.NaT], ["raise", None], ["foo", "invalid"]] + ) def test_dti_tz_localize_nonexistent(self, tz, method, exp): # GH 8917 n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') - if method == 'raise': + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") + if method == "raise": with pytest.raises(pytz.NonExistentTimeError): dti.tz_localize(tz, nonexistent=method) - elif exp == 'invalid': + elif exp == "invalid": with pytest.raises(ValueError): dti.tz_localize(tz, nonexistent=method) else: @@ -605,106 +638,135 @@ def test_dti_tz_localize_nonexistent(self, tz, method, exp): expected = DatetimeIndex([exp] * n, tz=tz) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [ - ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00', - 'forward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:59:59.999999999', 'backward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 03:20:00', timedelta(hours=1)], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:20:00', timedelta(hours=-1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00', - 'forward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999', - 'backward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00', - timedelta(hours=1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00', - timedelta(hours=-1)] - ]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_dti_tz_localize_nonexistent_shift(self, start_ts, tz, - end_ts, shift, - tz_type): + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", 
+ timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_dti_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): # GH 8917 tz = tz_type + tz if isinstance(shift, str): - shift = 'shift_' + shift + shift = "shift_" + shift dti = DatetimeIndex([Timestamp(start_ts)]) result = dti.tz_localize(tz, nonexistent=shift) expected = DatetimeIndex([Timestamp(end_ts)]).tz_localize(tz) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('offset', [-1, 1]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) def test_dti_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): # GH 8917 - tz = tz_type + 'Europe/Warsaw' - dti = DatetimeIndex([Timestamp('2015-03-29 02:20:00')]) + tz = tz_type + "Europe/Warsaw" + dti = DatetimeIndex([Timestamp("2015-03-29 02:20:00")]) msg = "The provided timedelta will relocalize on a nonexistent time" with pytest.raises(ValueError, match=msg): dti.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_dti_tz_localize_errors_deprecation(self): # GH 22644 - tz = 'Europe/Warsaw' + tz = "Europe/Warsaw" n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(ValueError): - dti.tz_localize(tz, errors='foo') + dti.tz_localize(tz, errors="foo") # make sure errors='coerce' gets mapped correctly to nonexistent - result = dti.tz_localize(tz, errors='coerce') - expected = dti.tz_localize(tz, nonexistent='NaT') + result = dti.tz_localize(tz, errors="coerce") + expected = dti.tz_localize(tz, nonexistent="NaT") tm.assert_index_equal(result, expected) # ------------------------------------------------------------- # DatetimeIndex.normalize def test_normalize_tz(self): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz='US/Eastern') + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="US/Eastern") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz='US/Eastern') + expected = date_range("1/1/2000", periods=10, freq="D", tz="US/Eastern") tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz='UTC') + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz="UTC") result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') + expected = date_range("1/1/2000", periods=10, freq="D", tz="UTC") tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized - rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) tm.assert_index_equal(result, expected) assert result.is_normalized assert not rng.is_normalized @td.skip_if_windows - @pytest.mark.parametrize('timezone', ['US/Pacific', 'US/Eastern', 'UTC', - 'Asia/Kolkata', 'Asia/Shanghai', - 'Australia/Canberra']) + @pytest.mark.parametrize( + "timezone", + [ + "US/Pacific", + 
"US/Eastern", + "UTC", + "Asia/Kolkata", + "Asia/Shanghai", + "Australia/Canberra", + ], + ) def test_normalize_tz_local(self, timezone): # GH#13459 with tm.set_timezone(timezone): - rng = date_range('1/1/2000 9:30', periods=10, freq='D', - tz=tzlocal()) + rng = date_range("1/1/2000 9:30", periods=10, freq="D", tz=tzlocal()) result = rng.normalize() - expected = date_range('1/1/2000', periods=10, freq='D', - tz=tzlocal()) + expected = date_range("1/1/2000", periods=10, freq="D", tz=tzlocal()) tm.assert_index_equal(result, expected) assert result.is_normalized @@ -713,15 +775,15 @@ def test_normalize_tz_local(self, timezone): # ------------------------------------------------------------ # DatetimeIndex.__new__ - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_dti_constructor_static_tzinfo(self, prefix): # it works! - index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + 'EST') + index = DatetimeIndex([datetime(2012, 1, 1)], tz=prefix + "EST") index.hour index[0] def test_dti_constructor_with_fixed_tz(self): - off = FixedOffset(420, '+07:00') + off = FixedOffset(420, "+07:00") start = datetime(2012, 3, 11, 5, 0, 0, tzinfo=off) end = datetime(2012, 6, 11, 5, 0, 0, tzinfo=off) rng = date_range(start=start, end=end) @@ -730,44 +792,39 @@ def test_dti_constructor_with_fixed_tz(self): rng2 = date_range(start, periods=len(rng), tz=off) tm.assert_index_equal(rng, rng2) - rng3 = date_range('3/11/2012 05:00:00+07:00', - '6/11/2012 05:00:00+07:00') + rng3 = date_range("3/11/2012 05:00:00+07:00", "6/11/2012 05:00:00+07:00") assert (rng.values == rng3.values).all() - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_convert_datetime_list(self, tzstr): - dr = date_range('2012-06-02', periods=10, - tz=tzstr, name='foo') - dr2 = DatetimeIndex(list(dr), name='foo') + dr = date_range("2012-06-02", periods=10, tz=tzstr, name="foo") + dr2 = DatetimeIndex(list(dr), name="foo") tm.assert_index_equal(dr, dr2) assert dr.tz == dr2.tz - assert dr2.name == 'foo' + assert dr2.name == "foo" def test_dti_construction_univalent(self): - rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', - tz='US/Eastern') - rng2 = DatetimeIndex(data=rng, tz='US/Eastern') + rng = date_range("03/12/2012 00:00", periods=10, freq="W-FRI", tz="US/Eastern") + rng2 = DatetimeIndex(data=rng, tz="US/Eastern") tm.assert_index_equal(rng, rng2) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_from_tzaware_datetime(self, tz): d = [datetime(2012, 8, 19, tzinfo=tz)] index = DatetimeIndex(d) assert timezones.tz_compare(index.tz, tz) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_constructors(self, tzstr): """ Test different DatetimeIndex constructions with timezone Follow-up of GH#4229 """ - arr = ['11/10/2005 08:00:00', '11/10/2005 09:00:00'] + arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] idx1 = to_datetime(arr).tz_localize(tzstr) - idx2 = pd.date_range(start="2005-11-10 08:00:00", freq='H', periods=2, - tz=tzstr) + idx2 = pd.date_range(start="2005-11-10 08:00:00", freq="H", periods=2, tz=tzstr) idx3 = DatetimeIndex(arr, tz=tzstr) idx4 = DatetimeIndex(np.array(arr), tz=tzstr) @@ -778,10 +835,10 @@ 
def test_dti_tz_constructors(self, tzstr): # Unsorted def test_join_utc_convert(self, join_type): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") - left = rng.tz_convert('US/Eastern') - right = rng.tz_convert('Europe/Berlin') + left = rng.tz_convert("US/Eastern") + right = rng.tz_convert("Europe/Berlin") result = left.join(left[:-5], how=join_type) assert isinstance(result, DatetimeIndex) @@ -789,30 +846,30 @@ def test_join_utc_convert(self, join_type): result = left.join(right[:-5], how=join_type) assert isinstance(result, DatetimeIndex) - assert result.tz.zone == 'UTC' + assert result.tz.zone == "UTC" - @pytest.mark.parametrize("dtype", [ - None, 'datetime64[ns, CET]', - 'datetime64[ns, EST]', 'datetime64[ns, UTC]' - ]) + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) def test_date_accessor(self, dtype): # Regression test for GH#21230 expected = np.array([date(2018, 6, 4), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype) + index = DatetimeIndex(["2018-06-04 10:00:00", pd.NaT], dtype=dtype) result = index.date tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dtype", [ - None, 'datetime64[ns, CET]', - 'datetime64[ns, EST]', 'datetime64[ns, UTC]' - ]) + @pytest.mark.parametrize( + "dtype", + [None, "datetime64[ns, CET]", "datetime64[ns, EST]", "datetime64[ns, UTC]"], + ) def test_time_accessor(self, dtype): # Regression test for GH#21267 expected = np.array([time(10, 20, 30), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype) + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], dtype=dtype) result = index.time tm.assert_numpy_array_equal(result, expected) @@ -823,7 +880,7 @@ def test_timetz_accessor(self, tz_naive_fixture): expected = np.array([time(10, 20, 30, tzinfo=tz), pd.NaT]) - index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], tz=tz) + index = DatetimeIndex(["2018-06-04 10:20:30", pd.NaT], tz=tz) result = index.timetz tm.assert_numpy_array_equal(result, expected) @@ -837,10 +894,10 @@ def test_dti_drop_dont_lose_tz(self): def test_dti_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t3 = DatetimeIndex(['2019-01-01 10:00'], freq='H') + t3 = DatetimeIndex(["2019-01-01 10:00"], freq="H") assert t3.tz_localize(tz=tz_naive_fixture).freq == t3.freq - t4 = DatetimeIndex(['2019-01-02 12:00'], tz='UTC', freq='T') - assert t4.tz_convert(tz='UTC').freq == t4.freq + t4 = DatetimeIndex(["2019-01-02 12:00"], tz="UTC", freq="T") + assert t4.tz_convert(tz="UTC").freq == t4.freq def test_drop_dst_boundary(self): # see gh-18031 @@ -851,32 +908,52 @@ def test_drop_dst_boundary(self): end = pd.Timestamp("201710290300", tz=tz) index = pd.date_range(start=start, end=end, freq=freq) - expected = DatetimeIndex(["201710290115", "201710290130", - "201710290145", "201710290200", - "201710290215", "201710290230", - "201710290245", "201710290200", - "201710290215", "201710290230", - "201710290245", "201710290300"], - tz=tz, freq=freq, - ambiguous=[True, True, True, True, - True, True, True, False, - False, False, False, False]) + expected = DatetimeIndex( + [ + "201710290115", + "201710290130", + "201710290145", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290200", + "201710290215", + "201710290230", + "201710290245", + "201710290300", + ], + tz=tz, + freq=freq, + ambiguous=[ + True, + True, + True, + True, + True, + True, + True, 
+ False, + False, + False, + False, + False, + ], + ) result = index.drop(index[0]) tm.assert_index_equal(result, expected) def test_date_range_localize(self): - rng = date_range('3/11/2012 03:00', periods=15, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], - tz='US/Eastern') - rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') - rng3 = rng3.tz_localize('US/Eastern') + rng = date_range("3/11/2012 03:00", periods=15, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 03:00", "3/11/2012 04:00"], tz="US/Eastern") + rng3 = date_range("3/11/2012 03:00", periods=15, freq="H") + rng3 = rng3.tz_localize("US/Eastern") tm.assert_index_equal(rng, rng3) # DST transition time val = rng[0] - exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 03:00", tz="US/Eastern") assert val.hour == 3 assert exp.hour == 3 @@ -884,26 +961,23 @@ def test_date_range_localize(self): tm.assert_index_equal(rng[:2], rng2) # Right before the DST transition - rng = date_range('3/11/2012 00:00', periods=2, freq='H', - tz='US/Eastern') - rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], - tz='US/Eastern') + rng = date_range("3/11/2012 00:00", periods=2, freq="H", tz="US/Eastern") + rng2 = DatetimeIndex(["3/11/2012 00:00", "3/11/2012 01:00"], tz="US/Eastern") tm.assert_index_equal(rng, rng2) - exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 00:00", tz="US/Eastern") assert exp.hour == 0 assert rng[0] == exp - exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + exp = Timestamp("3/11/2012 01:00", tz="US/Eastern") assert exp.hour == 1 assert rng[1] == exp - rng = date_range('3/11/2012 00:00', periods=10, freq='H', - tz='US/Eastern') + rng = date_range("3/11/2012 00:00", periods=10, freq="H", tz="US/Eastern") assert rng[2].hour == 3 def test_timestamp_equality_different_timezones(self): - utc_range = date_range('1/1/2000', periods=20, tz='UTC') - eastern_range = utc_range.tz_convert('US/Eastern') - berlin_range = utc_range.tz_convert('Europe/Berlin') + utc_range = date_range("1/1/2000", periods=20, tz="UTC") + eastern_range = utc_range.tz_convert("US/Eastern") + berlin_range = utc_range.tz_convert("Europe/Berlin") for a, b, c in zip(utc_range, eastern_range, berlin_range): assert a == b @@ -915,7 +989,7 @@ def test_timestamp_equality_different_timezones(self): assert (berlin_range == eastern_range).all() def test_dti_intersection(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") left = rng[10:90][::-1] right = rng[20:80][::-1] @@ -925,24 +999,24 @@ def test_dti_intersection(self): assert result.tz == left.tz def test_dti_equals_with_tz(self): - left = date_range('1/1/2011', periods=100, freq='H', tz='utc') - right = date_range('1/1/2011', periods=100, freq='H', tz='US/Eastern') + left = date_range("1/1/2011", periods=100, freq="H", tz="utc") + right = date_range("1/1/2011", periods=100, freq="H", tz="US/Eastern") assert not left.equals(right) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_nat(self, tzstr): idx = DatetimeIndex([Timestamp("2013-1-1", tz=tzstr), pd.NaT]) assert isna(idx[1]) assert idx[0].tzinfo is not None - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def 
test_dti_astype_asobject_tzinfos(self, tzstr): # GH#1345 # dates around a dst transition - rng = date_range('2/13/2010', '5/6/2010', tz=tzstr) + rng = date_range("2/13/2010", "5/6/2010", tz=tzstr) objs = rng.astype(object) for i, x in enumerate(objs): @@ -956,28 +1030,28 @@ def test_dti_astype_asobject_tzinfos(self, tzstr): assert x == exval assert x.tzinfo == exval.tzinfo - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_with_timezone_repr(self, tzstr): - rng = date_range('4/13/2010', '5/6/2010') + rng = date_range("4/13/2010", "5/6/2010") rng_eastern = rng.tz_localize(tzstr) rng_repr = repr(rng_eastern) - assert '2010-04-13 00:00:00' in rng_repr + assert "2010-04-13 00:00:00" in rng_repr - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_take_dont_lose_meta(self, tzstr): - rng = date_range('1/1/2000', periods=20, tz=tzstr) + rng = date_range("1/1/2000", periods=20, tz=tzstr) result = rng.take(range(5)) assert result.tz == rng.tz assert result.freq == rng.freq - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_utc_box_timestamp_and_localize(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng = date_range("3/11/2012", "3/12/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tzstr) expected = rng[-1].astimezone(tz) @@ -987,15 +1061,16 @@ def test_utc_box_timestamp_and_localize(self, tzstr): assert stamp.tzinfo == expected.tzinfo # right tzinfo - rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng = date_range("3/13/2012", "3/14/2012", freq="H", tz="utc") rng_eastern = rng.tz_convert(tzstr) # test not valid for dateutil timezones. 
# assert 'EDT' in repr(rng_eastern[0].tzinfo) - assert ('EDT' in repr(rng_eastern[0].tzinfo) or - 'tzfile' in repr(rng_eastern[0].tzinfo)) + assert "EDT" in repr(rng_eastern[0].tzinfo) or "tzfile" in repr( + rng_eastern[0].tzinfo + ) def test_dti_to_pydatetime(self): - dt = dateutil.parser.parse('2012-06-13T01:39:00Z') + dt = dateutil.parser.parse("2012-06-13T01:39:00Z") dt = dt.replace(tzinfo=tzlocal()) arr = np.array([dt], dtype=object) @@ -1003,15 +1078,19 @@ def test_dti_to_pydatetime(self): result = to_datetime(arr, utc=True) assert result.tz is pytz.utc - rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) + rng = date_range("2012-11-03 03:00", "2012-11-05 03:00", tz=tzlocal()) arr = rng.to_pydatetime() result = to_datetime(arr, utc=True) assert result.tz is pytz.utc def test_dti_to_pydatetime_fizedtz(self): - dates = np.array([datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)]) + dates = np.array( + [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] + ) dti = DatetimeIndex(dates) result = dti.to_pydatetime() @@ -1020,8 +1099,7 @@ def test_dti_to_pydatetime_fizedtz(self): result = dti._mpl_repr() tm.assert_numpy_array_equal(dates, result) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Central'), - gettz('US/Central')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Central"), gettz("US/Central")]) def test_with_tz(self, tz): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) @@ -1029,8 +1107,8 @@ def test_with_tz(self, tz): assert dr.tz is pytz.utc # DateRange with naive datetimes - dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) - dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + dr = bdate_range("1/1/2005", "1/1/2009", tz=pytz.utc) + dr = bdate_range("1/1/2005", "1/1/2009", tz=tz) # normalized central = dr.tz_convert(tz) @@ -1045,31 +1123,30 @@ def test_with_tz(self, tz): assert central[0].tz is comp # datetimes with tzinfo set - dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - datetime(2009, 1, 1, tzinfo=pytz.utc)) + dr = bdate_range( + datetime(2005, 1, 1, tzinfo=pytz.utc), datetime(2009, 1, 1, tzinfo=pytz.utc) + ) with pytest.raises(Exception): - bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', - tz=tz) + bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), "1/1/2009", tz=tz) - @pytest.mark.parametrize('prefix', ['', 'dateutil/']) + @pytest.mark.parametrize("prefix", ["", "dateutil/"]) def test_field_access_localize(self, prefix): - strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] - rng = DatetimeIndex(strdates, tz=prefix + 'US/Eastern') + strdates = ["1/1/2012", "3/1/2012", "4/1/2012"] + rng = DatetimeIndex(strdates, tz=prefix + "US/Eastern") assert (rng.hour == 0).all() # a more unusual time zone, #1946 - dr = date_range('2011-10-02 00:00', freq='h', periods=10, - tz=prefix + 'America/Atikokan') + dr = date_range( + "2011-10-02 00:00", freq="h", periods=10, tz=prefix + "America/Atikokan" + ) expected = Index(np.arange(10, dtype=np.int64)) tm.assert_index_equal(dr.hour, expected) - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern')]) + @pytest.mark.parametrize("tz", [pytz.timezone("US/Eastern"), gettz("US/Eastern")]) def test_dti_convert_tz_aware_datetime_datetime(self, tz): # GH#1581 - dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)] + dates = [datetime(2000, 1, 1), datetime(2000, 1, 2), 
datetime(2000, 1, 3)] dates_aware = [conversion.localize_pydatetime(x, tz) for x in dates] result = DatetimeIndex(dates_aware) @@ -1082,70 +1159,74 @@ def test_dti_convert_tz_aware_datetime_datetime(self, tz): def test_dti_union_aware(self): # non-overlapping - rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", - tz="US/Central") + rng = date_range("2012-11-15 00:00:00", periods=6, freq="H", tz="US/Central") - rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", - tz="US/Eastern") + rng2 = date_range("2012-11-15 12:00:00", periods=6, freq="H", tz="US/Eastern") result = rng.union(rng2) - expected = rng.astype('O').union(rng2.astype('O')) + expected = rng.astype("O").union(rng2.astype("O")) tm.assert_index_equal(result, expected) - assert result[0].tz.zone == 'US/Central' - assert result[-1].tz.zone == 'US/Eastern' + assert result[0].tz.zone == "US/Central" + assert result[-1].tz.zone == "US/Eastern" def test_dti_union_mixed(self): # GH 21671 - rng = DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT]) - rng2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz='Asia/Tokyo') + rng = DatetimeIndex([pd.Timestamp("2011-01-01"), pd.NaT]) + rng2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="Asia/Tokyo") result = rng.union(rng2) - expected = Index([pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), - pd.Timestamp('2012-01-02', tz='Asia/Tokyo')], - dtype=object) + expected = Index( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), + pd.Timestamp("2012-01-02", tz="Asia/Tokyo"), + ], + dtype=object, + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC', "US/Central", - dateutil.tz.tzoffset(None, -28800)]) + @pytest.mark.parametrize( + "tz", [None, "UTC", "US/Central", dateutil.tz.tzoffset(None, -28800)] + ) @pytest.mark.usefixtures("datetime_tz_utc") def test_iteration_preserves_nanoseconds(self, tz): # GH 19603 - index = DatetimeIndex(["2018-02-08 15:00:00.168456358", - "2018-02-08 15:00:00.168456359"], tz=tz) + index = DatetimeIndex( + ["2018-02-08 15:00:00.168456358", "2018-02-08 15:00:00.168456359"], tz=tz + ) for i, ts in enumerate(index): assert ts == index[i] class TestDateRange: """Tests for date_range with timezones""" + def test_hongkong_tz_convert(self): # GH#1673 smoke test - dr = date_range('2012-01-01', '2012-01-10', freq='D', tz='Hongkong') + dr = date_range("2012-01-01", "2012-01-10", freq="D", tz="Hongkong") # it works! 
dr.hour - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_span_dst_transition(self, tzstr): # GH#1778 # Standard -> Daylight Savings Time - dr = date_range('03/06/2012 00:00', periods=200, freq='W-FRI', - tz='US/Eastern') + dr = date_range("03/06/2012 00:00", periods=200, freq="W-FRI", tz="US/Eastern") assert (dr.hour == 0).all() - dr = date_range('2012-11-02', periods=10, tz=tzstr) + dr = date_range("2012-11-02", periods=10, tz=tzstr) result = dr.hour expected = Index([0] * 10) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_timezone_str_argument(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - result = date_range('1/1/2000', periods=10, tz=tzstr) - expected = date_range('1/1/2000', periods=10, tz=tz) + result = date_range("1/1/2000", periods=10, tz=tzstr) + expected = date_range("1/1/2000", periods=10, tz=tz) tm.assert_index_equal(result, expected) @@ -1159,29 +1240,30 @@ def test_date_range_with_fixedoffset_noname(self): idx = Index([start, end]) assert off == idx.tz - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_date_range_with_tz(self, tzstr): - stamp = Timestamp('3/11/2012 05:00', tz=tzstr) + stamp = Timestamp("3/11/2012 05:00", tz=tzstr) assert stamp.hour == 5 - rng = date_range('3/11/2012 04:00', periods=10, freq='H', - tz=tzstr) + rng = date_range("3/11/2012 04:00", periods=10, freq="H", tz=tzstr) assert stamp == rng[1] class TestToDatetime: """Tests for the to_datetime constructor with timezones""" + def test_to_datetime_utc(self): - arr = np.array([dateutil.parser.parse('2012-06-13T01:39:00Z')], - dtype=object) + arr = np.array([dateutil.parser.parse("2012-06-13T01:39:00Z")], dtype=object) result = to_datetime(arr, utc=True) assert result.tz is pytz.utc def test_to_datetime_fixed_offset(self): - dates = [datetime(2000, 1, 1, tzinfo=fixed_off), - datetime(2000, 1, 2, tzinfo=fixed_off), - datetime(2000, 1, 3, tzinfo=fixed_off)] + dates = [ + datetime(2000, 1, 1, tzinfo=fixed_off), + datetime(2000, 1, 2, tzinfo=fixed_off), + datetime(2000, 1, 3, tzinfo=fixed_off), + ] result = to_datetime(dates) assert result.tz == fixed_off diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 784633b2512ce..ec4310dbc8396 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -19,8 +19,16 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, NaT, Series, Timestamp, date_range, isna, - to_datetime) + DataFrame, + DatetimeIndex, + Index, + NaT, + Series, + Timestamp, + date_range, + isna, + to_datetime, +) from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools from pandas.util import testing as tm @@ -28,23 +36,21 @@ class TestTimeConversionFormats: - - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format(self, cache): - values = ['1/1/2000', '1/2/2000', '1/3/2000'] - - results1 = [Timestamp('20000101'), Timestamp('20000201'), - Timestamp('20000301')] - results2 = [Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')] - for vals, expecteds in 
[(values, (Index(results1), Index(results2))), - (Series(values), - (Series(results1), Series(results2))), - (values[0], (results1[0], results2[0])), - (values[1], (results1[1], results2[1])), - (values[2], (results1[2], results2[2]))]: - - for i, fmt in enumerate(['%d/%m/%Y', '%m/%d/%Y']): + values = ["1/1/2000", "1/2/2000", "1/3/2000"] + + results1 = [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")] + results2 = [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")] + for vals, expecteds in [ + (values, (Index(results1), Index(results2))), + (Series(values), (Series(results1), Series(results2))), + (values[0], (results1[0], results2[0])), + (values[1], (results1[1], results2[1])), + (values[2], (results1[2], results2[2])), + ]: + + for i, fmt in enumerate(["%d/%m/%Y", "%m/%d/%Y"]): result = to_datetime(vals, format=fmt, cache=cache) expected = expecteds[i] @@ -55,123 +61,138 @@ def test_to_datetime_format(self, cache): else: tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_YYYYMMDD(self, cache): s = Series([19801222, 19801222] + [19810105] * 5) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m%d', cache=cache) + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) - result = to_datetime(s.apply(str), format='%Y%m%d', cache=cache) + result = to_datetime(s.apply(str), format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # with NaT - expected = Series([Timestamp("19801222"), Timestamp("19801222")] + - [Timestamp("19810105")] * 5) + expected = Series( + [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + ) expected[2] = np.nan s[2] = np.nan - result = to_datetime(s, format='%Y%m%d', cache=cache) + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # string with NaT s = s.apply(str) - s[2] = 'nat' - result = to_datetime(s, format='%Y%m%d', cache=cache) + s[2] = "nat" + result = to_datetime(s, format="%Y%m%d", cache=cache) assert_series_equal(result, expected) # coercion # GH 7930 s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', - cache=cache) - expected = Series([datetime(2012, 12, 31), - datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object) + result = pd.to_datetime(s, format="%Y%m%d", errors="ignore", cache=cache) + expected = Series( + [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object, + ) tm.assert_series_equal(result, expected) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') + result = pd.to_datetime(s, format="%Y%m%d", errors="coerce", cache=cache) + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("input_s, expected", [ - # NaN before strings with invalid date values - [Series(['19801222', np.nan, '20010012', '10019999']), - Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], - # NaN after strings with invalid date values - [Series(['19801222', '20010012', '10019999', np.nan]), - Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], - # NaN before integers with invalid date values - [Series([20190813, np.nan, 20010012, 20019999]), - Series([Timestamp('20190813'), np.nan, 
np.nan, np.nan])], - # NaN after integers with invalid date values - [Series([20190813, 20010012, np.nan, 20019999]), - Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]]) + @pytest.mark.parametrize( + "input_s, expected", + [ + # NaN before strings with invalid date values + [ + Series(["19801222", np.nan, "20010012", "10019999"]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN after strings with invalid date values + [ + Series(["19801222", "20010012", "10019999", np.nan]), + Series([Timestamp("19801222"), np.nan, np.nan, np.nan]), + ], + # NaN before integers with invalid date values + [ + Series([20190813, np.nan, 20010012, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + # NaN after integers with invalid date values + [ + Series([20190813, 20010012, np.nan, 20019999]), + Series([Timestamp("20190813"), np.nan, np.nan, np.nan]), + ], + ], + ) def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): # GH 25512 # format='%Y%m%d', errors='coerce' - result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce') + result = pd.to_datetime(input_s, format="%Y%m%d", errors="coerce") assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_integer(self, cache): # GH 10178 s = Series([2000, 2001, 2002]) expected = Series([Timestamp(x) for x in s.apply(str)]) - result = to_datetime(s, format='%Y', cache=cache) + result = to_datetime(s, format="%Y", cache=cache) assert_series_equal(result, expected) s = Series([200001, 200105, 200206]) - expected = Series([Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) - ]) + expected = Series([Timestamp(x[:4] + "-" + x[4:]) for x in s.apply(str)]) - result = to_datetime(s, format='%Y%m', cache=cache) + result = to_datetime(s, format="%Y%m", cache=cache) assert_series_equal(result, expected) - @pytest.mark.parametrize('int_date, expected', [ - # valid date, length == 8 - [20121030, datetime(2012, 10, 30)], - # short valid date, length == 6 - [199934, datetime(1999, 3, 4)], - # long integer date partially parsed to datetime(2012,1,1), length > 8 - [2012010101, 2012010101], - # invalid date partially parsed to datetime(2012,9,9), length == 8 - [20129930, 20129930], - # short integer date partially parsed to datetime(2012,9,9), length < 8 - [2012993, 2012993], - # short invalid date, length == 4 - [2121, 2121]]) - def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, - expected): + @pytest.mark.parametrize( + "int_date, expected", + [ + # valid date, length == 8 + [20121030, datetime(2012, 10, 30)], + # short valid date, length == 6 + [199934, datetime(1999, 3, 4)], + # long integer date partially parsed to datetime(2012,1,1), length > 8 + [2012010101, 2012010101], + # invalid date partially parsed to datetime(2012,9,9), length == 8 + [20129930, 20129930], + # short integer date partially parsed to datetime(2012,9,9), length < 8 + [2012993, 2012993], + # short invalid date, length == 4 + [2121, 2121], + ], + ) + def test_int_to_datetime_format_YYYYMMDD_typeerror(self, int_date, expected): # GH 26583 - result = to_datetime(int_date, format='%Y%m%d', errors='ignore') + result = to_datetime(int_date, format="%Y%m%d", errors="ignore") assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_microsecond(self, cache): # these are locale dependent lang, _ = 
locale.getlocale() month_abbr = calendar.month_abbr[4] - val = '01-{}-2011 00:00:01.978'.format(month_abbr) + val = "01-{}-2011 00:00:01.978".format(month_abbr) - format = '%d-%b-%Y %H:%M:%S.%f' + format = "%d-%b-%Y %H:%M:%S.%f" result = to_datetime(val, format=format, cache=cache) exp = datetime.strptime(val, format) assert result == exp - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_time(self, cache): data = [ - ['01/10/2010 15:20', '%m/%d/%Y %H:%M', - Timestamp('2010-01-10 15:20')], - ['01/10/2010 05:43', '%m/%d/%Y %I:%M', - Timestamp('2010-01-10 05:43')], - ['01/10/2010 13:56:01', '%m/%d/%Y %H:%M:%S', - Timestamp('2010-01-10 13:56:01')] # , + ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")], + ["01/10/2010 05:43", "%m/%d/%Y %I:%M", Timestamp("2010-01-10 05:43")], + [ + "01/10/2010 13:56:01", + "%m/%d/%Y %H:%M:%S", + Timestamp("2010-01-10 13:56:01"), + ] # , # ['01/10/2010 08:14 PM', '%m/%d/%Y %I:%M %p', # Timestamp('2010-01-10 20:14')], # ['01/10/2010 07:40 AM', '%m/%d/%Y %I:%M %p', @@ -183,75 +204,97 @@ def test_to_datetime_format_time(self, cache): assert to_datetime(s, format=format, cache=cache) == dt @td.skip_if_has_locale - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_non_exact(self, cache): # GH 10834 # 8904 # exact kw - s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00', - '19MAY11 00:00:00Z']) - result = to_datetime(s, format='%d%b%y', exact=False, cache=cache) - expected = to_datetime(s.str.extract(r'(\d+\w+\d+)', expand=False), - format='%d%b%y', cache=cache) + s = Series( + ["19MAY11", "foobar19MAY11", "19MAY11:00:00:00", "19MAY11 00:00:00Z"] + ) + result = to_datetime(s, format="%d%b%y", exact=False, cache=cache) + expected = to_datetime( + s.str.extract(r"(\d+\w+\d+)", expand=False), format="%d%b%y", cache=cache + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parse_nanoseconds_with_formula(self, cache): # GH8989 # truncating the nanoseconds when a format was provided - for v in ["2012-01-01 09:00:00.000000001", - "2012-01-01 09:00:00.000001", - "2012-01-01 09:00:00.001", - "2012-01-01 09:00:00.001000", - "2012-01-01 09:00:00.001000000", ]: + for v in [ + "2012-01-01 09:00:00.000000001", + "2012-01-01 09:00:00.000001", + "2012-01-01 09:00:00.001", + "2012-01-01 09:00:00.001000", + "2012-01-01 09:00:00.001000000", + ]: expected = pd.to_datetime(v, cache=cache) - result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", - cache=cache) + result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f", cache=cache) assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_format_weeks(self, cache): data = [ - ['2009324', '%Y%W%w', Timestamp('2009-08-13')], - ['2013020', '%Y%U%w', Timestamp('2013-01-13')] + ["2009324", "%Y%W%w", Timestamp("2009-08-13")], + ["2013020", "%Y%U%w", Timestamp("2013-01-13")], ] for s, format, dt in data: assert to_datetime(s, format=format, cache=cache) == dt - @pytest.mark.parametrize("fmt,dates,expected_dates", [ - ['%Y-%m-%d %H:%M:%S %Z', - ['2010-01-01 12:00:00 UTC'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2], - ['%Y-%m-%d %H:%M:%S %Z', - ['2010-01-01 12:00:00 UTC', - '2010-01-01 12:00:00 GMT', - '2010-01-01 12:00:00 US/Pacific'], - 
[pd.Timestamp('2010-01-01 12:00:00', tz='UTC'), - pd.Timestamp('2010-01-01 12:00:00', tz='GMT'), - pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]], - ['%Y-%m-%d %H:%M:%S%z', - ['2010-01-01 12:00:00+0100'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60))] * 2], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 +0100'] * 2, - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60))] * 2], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'], - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(60)), - pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(-60))]], - ['%Y-%m-%d %H:%M:%S %z', - ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'], - [pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(0)), # pytz coerces to UTC - pd.Timestamp('2010-01-01 12:00:00', - tzinfo=pytz.FixedOffset(0))]]]) - def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, - expected_dates): + @pytest.mark.parametrize( + "fmt,dates,expected_dates", + [ + [ + "%Y-%m-%d %H:%M:%S %Z", + ["2010-01-01 12:00:00 UTC"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tz="UTC")] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %Z", + [ + "2010-01-01 12:00:00 UTC", + "2010-01-01 12:00:00 GMT", + "2010-01-01 12:00:00 US/Pacific", + ], + [ + pd.Timestamp("2010-01-01 12:00:00", tz="UTC"), + pd.Timestamp("2010-01-01 12:00:00", tz="GMT"), + pd.Timestamp("2010-01-01 12:00:00", tz="US/Pacific"), + ], + ], + [ + "%Y-%m-%d %H:%M:%S%z", + ["2010-01-01 12:00:00+0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100"] * 2, + [pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60))] * 2, + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100"], + [ + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(60)), + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(-60)), + ], + ], + [ + "%Y-%m-%d %H:%M:%S %z", + ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"], + [ + pd.Timestamp( + "2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0) + ), # pytz coerces to UTC + pd.Timestamp("2010-01-01 12:00:00", tzinfo=pytz.FixedOffset(0)), + ], + ], + ], + ) + def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates): # GH 13486 result = pd.to_datetime(dates, format=fmt) expected = pd.Index(expected_dates) @@ -260,69 +303,120 @@ def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, with pytest.raises(ValueError): pd.to_datetime(dates, format=fmt, utc=True) - @pytest.mark.parametrize('offset', [ - '+0', '-1foo', 'UTCbar', ':10', '+01:000:01', '']) + @pytest.mark.parametrize( + "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""] + ) def test_to_datetime_parse_timezone_malformed(self, offset): - fmt = '%Y-%m-%d %H:%M:%S %z' - date = '2010-01-01 12:00:00 ' + offset + fmt = "%Y-%m-%d %H:%M:%S %z" + date = "2010-01-01 12:00:00 " + offset with pytest.raises(ValueError): pd.to_datetime([date], format=fmt) def test_to_datetime_parse_timezone_keeps_name(self): # GH 21697 - fmt = '%Y-%m-%d %H:%M:%S %z' - arg = pd.Index(['2010-01-01 12:00:00 Z'], name='foo') + fmt = "%Y-%m-%d %H:%M:%S %z" + arg = pd.Index(["2010-01-01 12:00:00 Z"], name="foo") result = pd.to_datetime(arg, format=fmt) - expected = pd.DatetimeIndex(['2010-01-01 12:00:00'], tz='UTC', - name='foo') + expected = pd.DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo") tm.assert_index_equal(result, 
expected) class TestToDatetime: - @pytest.mark.parametrize("s, _format, dt", [ - ['2015-1-1', '%G-%V-%u', datetime(2014, 12, 29, 0, 0)], - ['2015-1-4', '%G-%V-%u', datetime(2015, 1, 1, 0, 0)], - ['2015-1-7', '%G-%V-%u', datetime(2015, 1, 4, 0, 0)] - ]) + @pytest.mark.parametrize( + "s, _format, dt", + [ + ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)], + ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)], + ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)], + ], + ) def test_to_datetime_iso_week_year_format(self, s, _format, dt): # See GH#16607 assert to_datetime(s, format=_format) == dt - @pytest.mark.parametrize("msg, s, _format", [ - ["ISO week directive '%V' must be used with the ISO year directive " - "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 50", - "%Y %V"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 51", - "%G %V"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 " - "Monday", "%G %A"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 Mon", - "%G %a"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", - "%G %w"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", - "%G %u"], - ["ISO year directive '%G' must be used with the ISO week directive " - "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "2051", - "%G"], - ["Day of the year directive '%j' is not compatible with ISO year " - "directive '%G'. Use '%Y' instead.", "1999 51 6 256", "%G %V %u %j"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sunday", "%Y %V %A"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sun", "%Y %V %a"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %w"], - ["ISO week directive '%V' is incompatible with the year directive " - "'%Y'. 
Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %u"], - ["ISO week directive '%V' must be used with the ISO year directive " - "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "20", "%V"] - ]) + @pytest.mark.parametrize( + "msg, s, _format", + [ + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 50", + "%Y %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 51", + "%G %V", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 " "Monday", + "%G %A", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 Mon", + "%G %a", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G %w", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "1999 6", + "%G %u", + ], + [ + "ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", + "2051", + "%G", + ], + [ + "Day of the year directive '%j' is not compatible with ISO year " + "directive '%G'. Use '%Y' instead.", + "1999 51 6 256", + "%G %V %u %j", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sunday", + "%Y %V %A", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 Sun", + "%Y %V %a", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %w", + ], + [ + "ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", + "1999 51 1", + "%Y %V %u", + ], + [ + "ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", + "20", + "%V", + ], + ], + ) def test_error_iso_week_year(self, msg, s, _format): # See GH#16607 # This test checks for errors thrown when giving the wrong format @@ -331,15 +425,17 @@ def test_error_iso_week_year(self, msg, s, _format): # locale specific, but the test data is in english. # Therefore, the tests only run when locale is not overwritten, # as a sort of solution to this problem. 
- if (locale.getlocale() != ('zh_CN', 'UTF-8') and - locale.getlocale() != ('it_IT', 'UTF-8')): + if locale.getlocale() != ("zh_CN", "UTF-8") and locale.getlocale() != ( + "it_IT", + "UTF-8", + ): with pytest.raises(ValueError, match=msg): to_datetime(s, format=_format) - @pytest.mark.parametrize('tz', [None, 'US/Central']) + @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_to_datetime_dtarr(self, tz): # DatetimeArray - dti = date_range('1965-04-03', periods=19, freq='2W', tz=tz) + dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz) arr = DatetimeArray(dti) result = to_datetime(arr) @@ -353,21 +449,21 @@ def test_to_datetime_pydatetime(self): assert actual == datetime(2008, 1, 15) def test_to_datetime_YYYYMMDD(self): - actual = pd.to_datetime('20080115') + actual = pd.to_datetime("20080115") assert actual == datetime(2008, 1, 15) def test_to_datetime_unparseable_ignore(self): # unparseable - s = 'Month 1, 1999' - assert pd.to_datetime(s, errors='ignore') == s + s = "Month 1, 1999" + assert pd.to_datetime(s, errors="ignore") == s @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): # See GH#18666 - with tm.set_timezone('US/Eastern'): - npnow = np.datetime64('now').astype('datetime64[ns]') - pdnow = pd.to_datetime('now') - pdnow2 = pd.to_datetime(['now'])[0] + with tm.set_timezone("US/Eastern"): + npnow = np.datetime64("now").astype("datetime64[ns]") + pdnow = pd.to_datetime("now") + pdnow2 = pd.to_datetime(["now"])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -385,13 +481,12 @@ def test_to_datetime_today(self): # Unfortunately this test between 12 and 1 AM Samoa time # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. 
- with tm.set_timezone('Pacific/Auckland'): # 12-13 hours ahead of UTC - nptoday = np.datetime64('today')\ - .astype('datetime64[ns]').astype(np.int64) - pdtoday = pd.to_datetime('today') - pdtoday2 = pd.to_datetime(['today'])[0] + with tm.set_timezone("Pacific/Auckland"): # 12-13 hours ahead of UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] - tstoday = pd.Timestamp('today') + tstoday = pd.Timestamp("today") tstoday2 = pd.Timestamp.today() # These should all be equal with infinite perf; this gives @@ -404,11 +499,10 @@ def test_to_datetime_today(self): assert pdtoday.tzinfo is None assert pdtoday2.tzinfo is None - with tm.set_timezone('US/Samoa'): # 11 hours behind UTC - nptoday = np.datetime64('today')\ - .astype('datetime64[ns]').astype(np.int64) - pdtoday = pd.to_datetime('today') - pdtoday2 = pd.to_datetime(['today'])[0] + with tm.set_timezone("US/Samoa"): # 11 hours behind UTC + nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + pdtoday = pd.to_datetime("today") + pdtoday2 = pd.to_datetime(["today"])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -419,175 +513,190 @@ def test_to_datetime_today(self): assert pdtoday2.tzinfo is None def test_to_datetime_today_now_unicode_bytes(self): - to_datetime(['now']) - to_datetime(['today']) + to_datetime(["now"]) + to_datetime(["today"]) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s(self, cache): - in_bound_dts = [ - np.datetime64('2000-01-01'), - np.datetime64('2000-01-02'), - ] + in_bound_dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] for dt in in_bound_dts: assert pd.to_datetime(dt, cache=cache) == Timestamp(dt) - @pytest.mark.parametrize('dt', [np.datetime64('1000-01-01'), - np.datetime64('5000-01-02')]) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")] + ) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_dt64s_out_of_bounds(self, cache, dt): msg = "Out of bounds nanosecond timestamp: {}".format(dt) with pytest.raises(OutOfBoundsDatetime, match=msg): - pd.to_datetime(dt, errors='raise') + pd.to_datetime(dt, errors="raise") with pytest.raises(OutOfBoundsDatetime, match=msg): Timestamp(dt) - assert pd.to_datetime(dt, errors='coerce', cache=cache) is NaT + assert pd.to_datetime(dt, errors="coerce", cache=cache) is NaT - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_array_of_dt64s(self, cache): - dts = [np.datetime64('2000-01-01'), np.datetime64('2000-01-02'), ] + dts = [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing tm.assert_index_equal( pd.to_datetime(dts, cache=cache), - pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]) + pd.DatetimeIndex([Timestamp(x).asm8 for x in dts]), ) # A list of datetimes where the last one is out of bounds - dts_with_oob = dts + [np.datetime64('9999-01-01')] + dts_with_oob = dts + [np.datetime64("9999-01-01")] msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00" with pytest.raises(OutOfBoundsDatetime, match=msg): - pd.to_datetime(dts_with_oob, errors='raise') + pd.to_datetime(dts_with_oob, errors="raise") 
tm.assert_index_equal( - pd.to_datetime(dts_with_oob, errors='coerce', - cache=cache), + pd.to_datetime(dts_with_oob, errors="coerce", cache=cache), pd.DatetimeIndex( [ Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8, - pd.NaT + pd.NaT, ] - ) + ), ) # With errors='ignore', out of bounds datetime64s # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date tm.assert_index_equal( - pd.to_datetime(dts_with_oob, errors='ignore', - cache=cache), - pd.Index( - [dt.item() for dt in dts_with_oob] - ) + pd.to_datetime(dts_with_oob, errors="ignore", cache=cache), + pd.Index([dt.item() for dt in dts_with_oob]), ) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz(self, cache): # xref 8260 # uniform returns a DatetimeIndex - arr = [pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')] + arr = [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] result = pd.to_datetime(arr, cache=cache) expected = DatetimeIndex( - ['2013-01-01 13:00:00', '2013-01-02 14:00:00'], tz='US/Pacific') + ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" + ) tm.assert_index_equal(result, expected) # mixed tzs will raise - arr = [pd.Timestamp('2013-01-01 13:00:00', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00', tz='US/Eastern')] - msg = ("Tz-aware datetime.datetime cannot be converted to datetime64" - " unless utc=True") + arr = [ + pd.Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), + ] + msg = ( + "Tz-aware datetime.datetime cannot be converted to datetime64" + " unless utc=True" + ) with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_different_offsets(self, cache): # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark # see GH-26097 for more - ts_string_1 = 'March 1, 2018 12:00:00+0400' - ts_string_2 = 'March 1, 2018 12:00:00+0500' + ts_string_1 = "March 1, 2018 12:00:00+0400" + ts_string_2 = "March 1, 2018 12:00:00+0500" arr = [ts_string_1] * 5 + [ts_string_2] * 5 expected = pd.Index([parse(x) for x in arr]) result = pd.to_datetime(arr, cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz_pytz(self, cache): # see gh-8260 - us_eastern = pytz.timezone('US/Eastern') - arr = np.array([us_eastern.localize(datetime(year=2000, month=1, day=1, - hour=3, minute=0)), - us_eastern.localize(datetime(year=2000, month=6, day=1, - hour=3, minute=0))], - dtype=object) + us_eastern = pytz.timezone("US/Eastern") + arr = np.array( + [ + us_eastern.localize( + datetime(year=2000, month=1, day=1, hour=3, minute=0) + ), + us_eastern.localize( + datetime(year=2000, month=6, day=1, hour=3, minute=0) + ), + ], + dtype=object, + ) result = pd.to_datetime(arr, utc=True, cache=cache) - expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', - '2000-06-01 07:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) tm.assert_index_equal(result, expected) - 
@pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize("init_constructor, end_constructor, test_method", - [(Index, DatetimeIndex, tm.assert_index_equal), - (list, DatetimeIndex, tm.assert_index_equal), - (np.array, DatetimeIndex, tm.assert_index_equal), - (Series, Series, tm.assert_series_equal)]) - def test_to_datetime_utc_true(self, - cache, - init_constructor, - end_constructor, - test_method): + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "init_constructor, end_constructor, test_method", + [ + (Index, DatetimeIndex, tm.assert_index_equal), + (list, DatetimeIndex, tm.assert_index_equal), + (np.array, DatetimeIndex, tm.assert_index_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_to_datetime_utc_true( + self, cache, init_constructor, end_constructor, test_method + ): # See gh-11934 & gh-6415 - data = ['20100102 121314', '20100102 121315'] - expected_data = [pd.Timestamp('2010-01-02 12:13:14', tz='utc'), - pd.Timestamp('2010-01-02 12:13:15', tz='utc')] - - result = pd.to_datetime(init_constructor(data), - format='%Y%m%d %H%M%S', - utc=True, - cache=cache) + data = ["20100102 121314", "20100102 121315"] + expected_data = [ + pd.Timestamp("2010-01-02 12:13:14", tz="utc"), + pd.Timestamp("2010-01-02 12:13:15", tz="utc"), + ] + + result = pd.to_datetime( + init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache + ) expected = end_constructor(expected_data) test_method(result, expected) # Test scalar case as well for scalar, expected in zip(data, expected_data): - result = pd.to_datetime(scalar, format='%Y%m%d %H%M%S', utc=True, - cache=cache) + result = pd.to_datetime( + scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache + ) assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_single_value(self, cache): # GH 15760 UTC=True with Series ts = 1.5e18 result = pd.to_datetime(pd.Series([ts]), utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(ts, tz='utc')]) + expected = pd.Series([pd.Timestamp(ts, tz="utc")]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_utc_true_with_series_tzaware_string(self, cache): - ts = '2013-01-01 00:00:00-01:00' - expected_ts = '2013-01-01 01:00:00' + ts = "2013-01-01 00:00:00-01:00" + expected_ts = "2013-01-01 01:00:00" data = pd.Series([ts] * 3) result = pd.to_datetime(data, utc=True, cache=cache) - expected = pd.Series([pd.Timestamp(expected_ts, tz='utc')] * 3) + expected = pd.Series([pd.Timestamp(expected_ts, tz="utc")] * 3) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize('date, dtype', - [('2013-01-01 01:00:00', 'datetime64[ns]'), - ('2013-01-01 01:00:00', 'datetime64[ns, UTC]')]) - def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, - dtype): - expected = pd.Series([pd.Timestamp('2013-01-01 01:00:00', tz='UTC')]) - result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, - cache=cache) + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "date, dtype", + [ + ("2013-01-01 01:00:00", "datetime64[ns]"), + ("2013-01-01 01:00:00", "datetime64[ns, UTC]"), + ], + ) + def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): + expected = pd.Series([pd.Timestamp("2013-01-01 01:00:00", 
tz="UTC")]) + result = pd.to_datetime(pd.Series([date], dtype=dtype), utc=True, cache=cache) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 @@ -599,36 +708,43 @@ def test_to_datetime_tz_psycopg2(self, cache): # misc cases tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None) tz2 = psycopg2.tz.FixedOffsetTimezone(offset=-240, name=None) - arr = np.array([datetime(2000, 1, 1, 3, 0, tzinfo=tz1), - datetime(2000, 6, 1, 3, 0, tzinfo=tz2)], - dtype=object) - - result = pd.to_datetime(arr, errors='coerce', utc=True, cache=cache) - expected = DatetimeIndex(['2000-01-01 08:00:00+00:00', - '2000-06-01 07:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + arr = np.array( + [ + datetime(2000, 1, 1, 3, 0, tzinfo=tz1), + datetime(2000, 6, 1, 3, 0, tzinfo=tz2), + ], + dtype=object, + ) + + result = pd.to_datetime(arr, errors="coerce", utc=True, cache=cache) + expected = DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], + dtype="datetime64[ns, UTC]", + freq=None, + ) tm.assert_index_equal(result, expected) # dtype coercion - i = pd.DatetimeIndex([ - '2000-01-01 08:00:00' - ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) + i = pd.DatetimeIndex( + ["2000-01-01 08:00:00"], + tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None), + ) assert is_datetime64_ns_dtype(i) # tz coercion - result = pd.to_datetime(i, errors='coerce', cache=cache) + result = pd.to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) - result = pd.to_datetime(i, errors='coerce', utc=True, cache=cache) - expected = pd.DatetimeIndex(['2000-01-01 13:00:00'], - dtype='datetime64[ns, UTC]') + result = pd.to_datetime(i, errors="coerce", utc=True, cache=cache) + expected = pd.DatetimeIndex( + ["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]" + ) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - 'cache', - [pytest.param(True, - marks=pytest.mark.skipif(True, reason="GH 18111")), - False]) + "cache", + [pytest.param(True, marks=pytest.mark.skipif(True, reason="GH 18111")), False], + ) def test_datetime_bool(self, cache): # GH13176 with pytest.raises(TypeError): @@ -642,13 +758,13 @@ def test_datetime_bool(self, cache): with pytest.raises(TypeError): to_datetime([False, datetime.today()], cache=cache) with pytest.raises(TypeError): - to_datetime(['20130101', True], cache=cache) - tm.assert_index_equal(to_datetime([0, False, NaT, 0.0], - errors="coerce", cache=cache), - DatetimeIndex([to_datetime(0, cache=cache), - NaT, - NaT, - to_datetime(0, cache=cache)])) + to_datetime(["20130101", True], cache=cache) + tm.assert_index_equal( + to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), + DatetimeIndex( + [to_datetime(0, cache=cache), NaT, NaT, to_datetime(0, cache=cache)] + ), + ) def test_datetime_invalid_datatype(self): # GH13176 @@ -658,69 +774,78 @@ def test_datetime_invalid_datatype(self): with pytest.raises(TypeError): pd.to_datetime(pd.to_datetime) - @pytest.mark.parametrize('value', ["a", "00:01:99"]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("value", ["a", "00:01:99"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_invalid_scalar(self, value, format, infer): # GH24763 - res = pd.to_datetime(value, 
errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) assert res == value - res = pd.to_datetime(value, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) assert res is pd.NaT with pytest.raises(ValueError): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) - @pytest.mark.parametrize('value', ["3000/12/11 00:00:00"]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_outofbounds_scalar(self, value, format, infer): # GH24763 - res = pd.to_datetime(value, errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="ignore", format=format, infer_datetime_format=infer + ) assert res == value - res = pd.to_datetime(value, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + value, errors="coerce", format=format, infer_datetime_format=infer + ) assert res is pd.NaT if format is not None: with pytest.raises(ValueError): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) else: with pytest.raises(OutOfBoundsDatetime): - pd.to_datetime(value, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + value, errors="raise", format=format, infer_datetime_format=infer + ) - @pytest.mark.parametrize('values', [["a"], ["00:01:99"], - ["a", "b", "99:00:00"]]) - @pytest.mark.parametrize('infer', [True, False]) - @pytest.mark.parametrize('format', [None, 'H%:M%:S%']) + @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) + @pytest.mark.parametrize("infer", [True, False]) + @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) def test_datetime_invalid_index(self, values, format, infer): # GH24763 - res = pd.to_datetime(values, errors='ignore', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + values, errors="ignore", format=format, infer_datetime_format=infer + ) tm.assert_index_equal(res, pd.Index(values)) - res = pd.to_datetime(values, errors='coerce', format=format, - infer_datetime_format=infer) + res = pd.to_datetime( + values, errors="coerce", format=format, infer_datetime_format=infer + ) tm.assert_index_equal(res, pd.DatetimeIndex([pd.NaT] * len(values))) with pytest.raises(ValueError): - pd.to_datetime(values, errors='raise', format=format, - infer_datetime_format=infer) + pd.to_datetime( + values, errors="raise", format=format, infer_datetime_format=infer + ) @pytest.mark.parametrize("utc", [True, None]) - @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) @pytest.mark.parametrize("constructor", [list, tuple, np.array, pd.Index]) def test_to_datetime_cache(self, utc, format, constructor): - date = '20130101 00:00:00' - test_dates = [date] * 10**5 + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 data = constructor(test_dates) result = pd.to_datetime(data, utc=utc, 
format=format, cache=True) @@ -729,28 +854,32 @@ def test_to_datetime_cache(self, utc, format, constructor): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("utc", [True, None]) - @pytest.mark.parametrize("format", ['%Y%m%d %H:%M:%S', None]) + @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) def test_to_datetime_cache_series(self, utc, format): - date = '20130101 00:00:00' - test_dates = [date] * 10**5 + date = "20130101 00:00:00" + test_dates = [date] * 10 ** 5 data = pd.Series(test_dates) result = pd.to_datetime(data, utc=utc, format=format, cache=True) expected = pd.to_datetime(data, utc=utc, format=format, cache=False) tm.assert_series_equal(result, expected) def test_to_datetime_cache_scalar(self): - date = '20130101 00:00:00' + date = "20130101 00:00:00" result = pd.to_datetime(date, cache=True) - expected = pd.Timestamp('20130101 00:00:00') + expected = pd.Timestamp("20130101 00:00:00") assert result == expected - @pytest.mark.parametrize('date, format', - [('2017-20', '%Y-%W'), - ('20 Sunday', '%W %A'), - ('20 Sun', '%W %a'), - ('2017-21', '%Y-%U'), - ('20 Sunday', '%U %A'), - ('20 Sun', '%U %a')]) + @pytest.mark.parametrize( + "date, format", + [ + ("2017-20", "%Y-%W"), + ("20 Sunday", "%W %A"), + ("20 Sun", "%W %a"), + ("2017-21", "%Y-%U"), + ("20 Sunday", "%U %A"), + ("20 Sun", "%U %a"), + ], + ) def test_week_without_day_and_calendar_year(self, date, format): # GH16774 @@ -760,15 +889,19 @@ def test_week_without_day_and_calendar_year(self, date, format): def test_to_datetime_coerce(self): # GH 26122 - ts_strings = ['March 1, 2018 12:00:00+0400', - 'March 1, 2018 12:00:00+0500', - '20100240'] - result = to_datetime(ts_strings, errors='coerce') - expected = Index([datetime(2018, 3, 1, 12, 0, - tzinfo=tzoffset(None, 14400)), - datetime(2018, 3, 1, 12, 0, - tzinfo=tzoffset(None, 18000)), - NaT]) + ts_strings = [ + "March 1, 2018 12:00:00+0400", + "March 1, 2018 12:00:00+0500", + "20100240", + ] + result = to_datetime(ts_strings, errors="coerce") + expected = Index( + [ + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 14400)), + datetime(2018, 3, 1, 12, 0, tzinfo=tzoffset(None, 18000)), + NaT, + ] + ) tm.assert_index_equal(result, expected) def test_iso_8601_strings_with_same_offset(self): @@ -787,93 +920,109 @@ def test_iso_8601_strings_with_same_offset(self): def test_iso_8601_strings_same_offset_no_box(self): # GH 22446 - data = ['2018-01-04 09:01:00+09:00', '2018-01-04 09:02:00+09:00'] + data = ["2018-01-04 09:01:00+09:00", "2018-01-04 09:02:00+09:00"] with tm.assert_produces_warning(FutureWarning): result = pd.to_datetime(data, box=False) - expected = np.array([ - datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), - datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)) - ], - dtype=object) + expected = np.array( + [ + datetime(2018, 1, 4, 9, 1, tzinfo=pytz.FixedOffset(540)), + datetime(2018, 1, 4, 9, 2, tzinfo=pytz.FixedOffset(540)), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) def test_iso_8601_strings_with_different_offsets(self): # GH 17697, 11736 - ts_strings = ["2015-11-18 15:30:00+05:30", - "2015-11-18 16:30:00+06:30", - NaT] + ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT] result = to_datetime(ts_strings) - expected = np.array([datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 16, 30, - tzinfo=tzoffset(None, 23400)), - NaT], - dtype=object) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + 
datetime(2015, 11, 18, 16, 30, tzinfo=tzoffset(None, 23400)), + NaT, + ], + dtype=object, + ) # GH 21864 expected = Index(expected) tm.assert_index_equal(result, expected) result = to_datetime(ts_strings, utc=True) - expected = DatetimeIndex([Timestamp(2015, 11, 18, 10), - Timestamp(2015, 11, 18, 10), - NaT], tz='UTC') + expected = DatetimeIndex( + [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" + ) tm.assert_index_equal(result, expected) def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 - result = pd.to_datetime([ - '2018-11-28T00:00:00', - '2018-11-28T00:00:00+12:00', - '2018-11-28T00:00:00', - '2018-11-28T00:00:00+06:00', - '2018-11-28T00:00:00' - ], utc=True) - expected = pd.to_datetime([ - '2018-11-28T00:00:00', - '2018-11-27T12:00:00', - '2018-11-28T00:00:00', - '2018-11-27T18:00:00', - '2018-11-28T00:00:00' - ], utc=True) + result = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+12:00", + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+06:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) + expected = pd.to_datetime( + [ + "2018-11-28T00:00:00", + "2018-11-27T12:00:00", + "2018-11-28T00:00:00", + "2018-11-27T18:00:00", + "2018-11-28T00:00:00", + ], + utc=True, + ) tm.assert_index_equal(result, expected) - items = ['2018-11-28T00:00:00+12:00', '2018-11-28T00:00:00'] + items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] result = pd.to_datetime(items, utc=True) expected = pd.to_datetime(list(reversed(items)), utc=True)[::-1] tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 - s = pd.Series([ - 'nan', - pd.Timestamp("1990-01-01"), - "2015-03-14T16:15:14.123-08:00", - "2019-03-04T21:56:32.620-07:00", - None, - ]) + s = pd.Series( + [ + "nan", + pd.Timestamp("1990-01-01"), + "2015-03-14T16:15:14.123-08:00", + "2019-03-04T21:56:32.620-07:00", + None, + ] + ) with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): pd.to_datetime(s) def test_non_iso_strings_with_tz_offset(self): - result = to_datetime(['March 1, 2018 12:00:00+0400'] * 2) - expected = DatetimeIndex([datetime(2018, 3, 1, 12, - tzinfo=pytz.FixedOffset(240))] * 2) + result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) + expected = DatetimeIndex( + [datetime(2018, 3, 1, 12, tzinfo=pytz.FixedOffset(240))] * 2 + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('ts, expected', [ - (Timestamp('2018-01-01'), - Timestamp('2018-01-01', tz='UTC')), - (Timestamp('2018-01-01', tz='US/Pacific'), - Timestamp('2018-01-01 08:00', tz='UTC'))]) + @pytest.mark.parametrize( + "ts, expected", + [ + (Timestamp("2018-01-01"), Timestamp("2018-01-01", tz="UTC")), + ( + Timestamp("2018-01-01", tz="US/Pacific"), + Timestamp("2018-01-01 08:00", tz="UTC"), + ), + ], + ) def test_timestamp_utc_true(self, ts, expected): # GH 24415 result = to_datetime(ts, utc=True) assert result == expected def test_to_datetime_box_deprecated(self): - expected = np.datetime64('2018-09-09') + expected = np.datetime64("2018-09-09") # Deprecated - see GH24416 with tm.assert_produces_warning(FutureWarning): @@ -884,85 +1033,89 @@ def test_to_datetime_box_deprecated(self): class TestToDatetimeUnit: - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit(self, cache): # GH 11758 # test proper behavior with erros with pytest.raises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d', cache=cache) - - values = [11111111, 1, 1.0, iNaT, NaT, np.nan, - 
'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore', cache=cache) - expected = Index([11111111, Timestamp('1970-01-02'), - Timestamp('1970-01-02'), NaT, - NaT, NaT, NaT, NaT], - dtype=object) + to_datetime([1], unit="D", format="%Y%m%d", cache=cache) + + values = [11111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""] + result = to_datetime(values, unit="D", errors="ignore", cache=cache) + expected = Index( + [ + 11111111, + Timestamp("1970-01-02"), + Timestamp("1970-01-02"), + NaT, + NaT, + NaT, + NaT, + NaT, + ], + dtype=object, + ) tm.assert_index_equal(result, expected) - result = to_datetime(values, unit='D', errors='coerce', cache=cache) - expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', - 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + result = to_datetime(values, unit="D", errors="coerce", cache=cache) + expected = DatetimeIndex( + ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"] + ) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise', cache=cache) + to_datetime(values, unit="D", errors="raise", cache=cache) - values = [1420043460000, iNaT, NaT, np.nan, 'NaT'] + values = [1420043460000, iNaT, NaT, np.nan, "NaT"] - result = to_datetime(values, errors='ignore', unit='s', cache=cache) - expected = Index([1420043460000, NaT, NaT, - NaT, NaT], dtype=object) + result = to_datetime(values, errors="ignore", unit="s", cache=cache) + expected = Index([1420043460000, NaT, NaT, NaT, NaT], dtype=object) tm.assert_index_equal(result, expected) - result = to_datetime(values, errors='coerce', unit='s', cache=cache) - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) + result = to_datetime(values, errors="coerce", unit="s", cache=cache) + expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"]) tm.assert_index_equal(result, expected) with pytest.raises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s', cache=cache) + to_datetime(values, errors="raise", unit="s", cache=cache) # if we have a string, then we raise a ValueError # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: + for val in ["foo", Timestamp("20130101")]: try: - to_datetime(val, errors='raise', unit='s', cache=cache) + to_datetime(val, errors="raise", unit="s", cache=cache) except tslib.OutOfBoundsDatetime: raise AssertionError("incorrect exception raised") except ValueError: pass - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_consistency(self, cache): # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise', - cache=cache) + expected = Timestamp("1970-05-09 14:25:11") + result = pd.to_datetime(11111111, unit="s", errors="raise", cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='coerce', - cache=cache) + result = pd.to_datetime(11111111, unit="s", errors="coerce", cache=cache) assert result == expected assert isinstance(result, Timestamp) - result = pd.to_datetime(11111111, unit='s', errors='ignore', - cache=cache) + result = pd.to_datetime(11111111, unit="s", errors="ignore", cache=cache) assert result == expected assert isinstance(result, Timestamp) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_with_numeric(self, cache): # GH 13180 # coercions from floats/ints 
are ok - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr1 = [1.434692e+18, 1.432766e+18] - arr2 = np.array(arr1).astype('int64') - for errors in ['ignore', 'raise', 'coerce']: + expected = DatetimeIndex(["2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr1 = [1.434692e18, 1.432766e18] + arr2 = np.array(arr1).astype("int64") + for errors in ["ignore", "raise", "coerce"]: result = pd.to_datetime(arr1, errors=errors, cache=cache) tm.assert_index_equal(result, expected) @@ -971,127 +1124,137 @@ def test_unit_with_numeric(self, cache): # but we want to make sure that we are coercing # if we have ints/strings - expected = DatetimeIndex(['NaT', - '2015-06-19 05:33:20', - '2015-05-27 22:33:20']) - arr = ['foo', 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"]) + arr = ["foo", 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - expected = DatetimeIndex(['2015-06-19 05:33:20', - '2015-05-27 22:33:20', - 'NaT', - 'NaT']) - arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex( + ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"] + ) + arr = [1.434692e18, 1.432766e18, "foo", "NaT"] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_mixed(self, cache): # mixed integers/datetimes - expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) - arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["2013-01-01", "NaT", "NaT"]) + arr = [pd.Timestamp("20130101"), 1.434692e18, 1.432766e18] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise', cache=cache) + pd.to_datetime(arr, errors="raise", cache=cache) - expected = DatetimeIndex(['NaT', - 'NaT', - '2013-01-01']) - arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] - result = pd.to_datetime(arr, errors='coerce', cache=cache) + expected = DatetimeIndex(["NaT", "NaT", "2013-01-01"]) + arr = [1.434692e18, 1.432766e18, pd.Timestamp("20130101")] + result = pd.to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) with pytest.raises(ValueError): - pd.to_datetime(arr, errors='raise', cache=cache) + pd.to_datetime(arr, errors="raise", cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_rounding(self, cache): # GH 14156: argument will incur floating point errors but no # premature rounding - result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache) - expected = pd.Timestamp('2015-06-19 19:55:31.877000093') + result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache) + expected = pd.Timestamp("2015-06-19 19:55:31.877000093") assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_unit_ignore_keeps_name(self, cache): # GH 21697 - expected = pd.Index([15e9] * 2, name='name') - result = pd.to_datetime(expected, errors='ignore', unit='s', - cache=cache) + expected = 
pd.Index([15e9] * 2, name="name") + result = pd.to_datetime(expected, errors="ignore", unit="s", cache=cache) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dataframe(self, cache): - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [6, 7], - 'minute': [58, 59], - 'second': [10, 11], - 'ms': [1, 1], - 'us': [2, 2], - 'ns': [3, 3]}) - - result = to_datetime({'year': df['year'], - 'month': df['month'], - 'day': df['day']}, cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:0:00')]) + df = DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "hour": [6, 7], + "minute": [58, 59], + "second": [10, 11], + "ms": [1, 1], + "us": [2, 2], + "ns": [3, 3], + } + ) + + result = to_datetime( + {"year": df["year"], "month": df["month"], "day": df["day"]}, cache=cache + ) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:0:00")] + ) assert_series_equal(result, expected) # dict-like - result = to_datetime(df[['year', 'month', 'day']].to_dict(), - cache=cache) + result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache) assert_series_equal(result, expected) # dict but with constructable - df2 = df[['year', 'month', 'day']].to_dict() - df2['month'] = 2 + df2 = df[["year", "month", "day"]].to_dict() + df2["month"] = 2 result = to_datetime(df2, cache=cache) - expected2 = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160205 00:0:00')]) + expected2 = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")] + ) assert_series_equal(result, expected2) # unit mappings - units = [{'year': 'years', - 'month': 'months', - 'day': 'days', - 'hour': 'hours', - 'minute': 'minutes', - 'second': 'seconds'}, - {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second'}, - ] + units = [ + { + "year": "years", + "month": "months", + "day": "days", + "hour": "hours", + "minute": "minutes", + "second": "seconds", + }, + { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + }, + ] for d in units: - result = to_datetime(df[list(d.keys())].rename(columns=d), - cache=cache) - expected = Series([Timestamp('20150204 06:58:10'), - Timestamp('20160305 07:59:11')]) + result = to_datetime(df[list(d.keys())].rename(columns=d), cache=cache) + expected = Series( + [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")] + ) assert_series_equal(result, expected) - d = {'year': 'year', - 'month': 'month', - 'day': 'day', - 'hour': 'hour', - 'minute': 'minute', - 'second': 'second', - 'ms': 'ms', - 'us': 'us', - 'ns': 'ns'} + d = { + "year": "year", + "month": "month", + "day": "day", + "hour": "hour", + "minute": "minute", + "second": "second", + "ms": "ms", + "us": "us", + "ns": "ns", + } result = to_datetime(df.rename(columns=d), cache=cache) - expected = Series([Timestamp('20150204 06:58:10.001002003'), - Timestamp('20160305 07:59:11.001002003')]) + expected = Series( + [ + Timestamp("20150204 06:58:10.001002003"), + Timestamp("20160305 07:59:11.001002003"), + ] + ) assert_series_equal(result, expected) # coerce back to int @@ -1099,110 +1262,103 @@ def test_dataframe(self, cache): assert_series_equal(result, expected) # passing coerce - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) + df2 = DataFrame({"year": [2015, 
2016], "month": [2, 20], "day": [4, 5]}) - msg = ("cannot assemble the datetimes: time data .+ does not " - r"match format '%Y%m%d' \(match\)") + msg = ( + "cannot assemble the datetimes: time data .+ does not " + r"match format '%Y%m%d' \(match\)" + ) with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - result = to_datetime(df2, errors='coerce', cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - NaT]) + result = to_datetime(df2, errors="coerce", cache=cache) + expected = Series([Timestamp("20150204 00:00:00"), NaT]) assert_series_equal(result, expected) # extra columns - msg = ("extra keys have been passed to the datetime assemblage: " - r"\[foo\]") + msg = "extra keys have been passed to the datetime assemblage: " r"\[foo\]" with pytest.raises(ValueError, match=msg): df2 = df.copy() - df2['foo'] = 1 + df2["foo"] = 1 to_datetime(df2, cache=cache) # not enough - msg = (r'to assemble mappings requires at least that \[year, month, ' - r'day\] be specified: \[.+\] is missing') - for c in [['year'], - ['year', 'month'], - ['year', 'month', 'second'], - ['month', 'day'], - ['year', 'day', 'second']]: + msg = ( + r"to assemble mappings requires at least that \[year, month, " + r"day\] be specified: \[.+\] is missing" + ) + for c in [ + ["year"], + ["year", "month"], + ["year", "month", "second"], + ["month", "day"], + ["year", "day", "second"], + ]: with pytest.raises(ValueError, match=msg): to_datetime(df[c], cache=cache) # duplicates - msg = 'cannot assemble with duplicate keys' - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5]}) - df2.columns = ['year', 'year', 'day'] + msg = "cannot assemble with duplicate keys" + df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) + df2.columns = ["year", "year", "day"] with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - df2 = DataFrame({'year': [2015, 2016], - 'month': [2, 20], - 'day': [4, 5], - 'hour': [4, 5]}) - df2.columns = ['year', 'month', 'day', 'day'] + df2 = DataFrame( + {"year": [2015, 2016], "month": [2, 20], "day": [4, 5], "hour": [4, 5]} + ) + df2.columns = ["year", "month", "day", "day"] with pytest.raises(ValueError, match=msg): to_datetime(df2, cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dataframe_dtypes(self, cache): # #13451 - df = DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) # int16 - result = to_datetime(df.astype('int16'), cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) + result = to_datetime(df.astype("int16"), cache=cache) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) assert_series_equal(result, expected) # mixed dtypes - df['month'] = df['month'].astype('int8') - df['day'] = df['day'].astype('int8') + df["month"] = df["month"].astype("int8") + df["day"] = df["day"].astype("int8") result = to_datetime(df, cache=cache) - expected = Series([Timestamp('20150204 00:00:00'), - Timestamp('20160305 00:00:00')]) + expected = Series( + [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")] + ) assert_series_equal(result, expected) # float - df = DataFrame({'year': [2000, 2001], - 'month': [1.5, 1], - 'day': [1, 1]}) + df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) with pytest.raises(ValueError): to_datetime(df, cache=cache) 
def test_dataframe_box_false(self): # GH 23760 - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) with tm.assert_produces_warning(FutureWarning): result = pd.to_datetime(df, box=False) - expected = np.array(['2015-02-04', '2016-03-05'], - dtype='datetime64[ns]') + expected = np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) def test_dataframe_utc_true(self): # GH 23760 - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5]}) + df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = pd.to_datetime(df, utc=True) - expected = pd.Series(np.array(['2015-02-04', '2016-03-05'], - dtype='datetime64[ns]')).dt.tz_localize('UTC') + expected = pd.Series( + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) def test_to_datetime_errors_ignore_utc_true(self): # GH 23758 - result = pd.to_datetime([1], unit='s', utc=True, errors='ignore') - expected = DatetimeIndex(['1970-01-01 00:00:01'], tz='UTC') + result = pd.to_datetime([1], unit="s", utc=True, errors="ignore") + expected = DatetimeIndex(["1970-01-01 00:00:01"], tz="UTC") tm.assert_index_equal(result, expected) @@ -1211,24 +1367,24 @@ def test_to_datetime_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime - arr = np.array(['2262-04-11 23:47:16.854775808'], dtype=object) + arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object) with pytest.raises(OutOfBoundsDatetime): to_datetime(arr) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601(self, cache): result = to_datetime(["2012-01-01 00:00:00"], cache=cache) exp = Timestamp("2012-01-01 00:00:00") assert result[0] == exp - result = to_datetime(['20121001'], cache=cache) # bad iso 8601 - exp = Timestamp('2012-10-01') + result = to_datetime(["20121001"], cache=cache) # bad iso 8601 + exp = Timestamp("2012-10-01") assert result[0] == exp - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_default(self, cache): - rs = to_datetime('2001', cache=cache) + rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) assert rs == xp @@ -1238,63 +1394,59 @@ def test_to_datetime_default(self, cache): # pytest.raises(ValueError, to_datetime('01-13-2012', # dayfirst=True)) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_on_datetime64_series(self, cache): # #2699 - s = Series(date_range('1/1/2000', periods=10)) + s = Series(date_range("1/1/2000", periods=10)) result = to_datetime(s, cache=cache) assert result[0] == s[0] - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_space_in_series(self, cache): # GH 6428 - s = Series(['10/18/2006', '10/18/2008', ' ']) + s = Series(["10/18/2006", "10/18/2008", " "]) msg = r"(\(')?String does not contain a date(:', ' '\))?" 
with pytest.raises(ValueError, match=msg): - to_datetime(s, errors='raise', cache=cache) - result_coerce = to_datetime(s, errors='coerce', cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), - datetime(2008, 10, 18), - NaT]) + to_datetime(s, errors="raise", cache=cache) + result_coerce = to_datetime(s, errors="coerce", cache=cache) + expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) tm.assert_series_equal(result_coerce, expected_coerce) - result_ignore = to_datetime(s, errors='ignore', cache=cache) + result_ignore = to_datetime(s, errors="ignore", cache=cache) tm.assert_series_equal(result_ignore, s) @td.skip_if_has_locale - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_with_apply(self, cache): # this is only locale tested with US/None locales # GH 5195 # with a format and coerce a single item to_datetime fails - td = Series(['May 04', 'Jun 02', 'Dec 11'], index=[1, 2, 3]) - expected = pd.to_datetime(td, format='%b %y', cache=cache) - result = td.apply(pd.to_datetime, format='%b %y', cache=cache) + td = Series(["May 04", "Jun 02", "Dec 11"], index=[1, 2, 3]) + expected = pd.to_datetime(td, format="%b %y", cache=cache) + result = td.apply(pd.to_datetime, format="%b %y", cache=cache) assert_series_equal(result, expected) - td = pd.Series(['May 04', 'Jun 02', ''], index=[1, 2, 3]) + td = pd.Series(["May 04", "Jun 02", ""], index=[1, 2, 3]) msg = r"time data '' does not match format '%b %y' \(match\)" with pytest.raises(ValueError, match=msg): - pd.to_datetime(td, format='%b %y', errors='raise', cache=cache) + pd.to_datetime(td, format="%b %y", errors="raise", cache=cache) with pytest.raises(ValueError, match=msg): - td.apply(pd.to_datetime, format='%b %y', - errors='raise', cache=cache) - expected = pd.to_datetime(td, format='%b %y', errors='coerce', - cache=cache) + td.apply(pd.to_datetime, format="%b %y", errors="raise", cache=cache) + expected = pd.to_datetime(td, format="%b %y", errors="coerce", cache=cache) result = td.apply( - lambda x: pd.to_datetime(x, format='%b %y', errors='coerce', - cache=cache)) + lambda x: pd.to_datetime(x, format="%b %y", errors="coerce", cache=cache) + ) assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_types(self, cache): # empty string - result = to_datetime('', cache=cache) + result = to_datetime("", cache=cache) assert result is NaT - result = to_datetime(['', ''], cache=cache) + result = to_datetime(["", ""], cache=cache) assert isna(result).all() # ints @@ -1303,12 +1455,12 @@ def test_to_datetime_types(self, cache): assert result == expected # GH 3888 (strings) - expected = to_datetime(['2012'], cache=cache)[0] - result = to_datetime('2012', cache=cache) + expected = to_datetime(["2012"], cache=cache)[0] + result = to_datetime("2012", cache=cache) assert result == expected # array = ['2012','20120101','20120101 12:01:01'] - array = ['20120101', '20120101 12:01:01'] + array = ["20120101", "20120101 12:01:01"] expected = list(to_datetime(array, cache=cache)) result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) @@ -1318,31 +1470,31 @@ def test_to_datetime_types(self, cache): # expected = to_datetime('2012') # assert result == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_unprocessable_input(self, cache): # 
GH 4928 # GH 21864 - result = to_datetime([1, '1'], errors='ignore', cache=cache) + result = to_datetime([1, "1"], errors="ignore", cache=cache) - expected = Index(np.array([1, '1'], dtype='O')) + expected = Index(np.array([1, "1"], dtype="O")) tm.assert_equal(result, expected) msg = "invalid string coercion to datetime" with pytest.raises(TypeError, match=msg): - to_datetime([1, '1'], errors='raise', cache=cache) + to_datetime([1, "1"], errors="raise", cache=cache) def test_to_datetime_other_datetime64_units(self): # 5/25/2012 - scalar = np.int64(1337904000000000).view('M8[us]') - as_obj = scalar.astype('O') + scalar = np.int64(1337904000000000).view("M8[us]") + as_obj = scalar.astype("O") index = DatetimeIndex([scalar]) - assert index[0] == scalar.astype('O') + assert index[0] == scalar.astype("O") value = Timestamp(scalar) assert value == as_obj def test_to_datetime_list_of_integers(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) rng = DatetimeIndex(rng.values) ints = list(rng.asi8) @@ -1356,16 +1508,17 @@ def test_to_datetime_overflow(self): # we are overflowing Timedelta range here with pytest.raises(OverflowError): - date_range(start='1/1/1700', freq='B', periods=100000) + date_range(start="1/1/1700", freq="B", periods=100000) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_string_na_nat_conversion(self, cache): # GH #999, #858 - strings = np.array(['1/1/2000', '1/2/2000', np.nan, - '1/4/2000, 12:34:56'], dtype=object) + strings = np.array( + ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object + ) - expected = np.empty(4, dtype='M8[ns]') + expected = np.empty(4, dtype="M8[ns]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -1379,33 +1532,41 @@ def test_string_na_nat_conversion(self, cache): assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) - malformed = np.array(['1/100/2000', np.nan], dtype=object) + malformed = np.array(["1/100/2000", np.nan], dtype=object) # GH 10636, default is now 'raise' - msg = (r"Unknown string format:|day is out of range for month") + msg = r"Unknown string format:|day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors='raise', cache=cache) + to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors='ignore', cache=cache) + result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 expected = Index(malformed) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors='raise', cache=cache) + to_datetime(malformed, errors="raise", cache=cache) - idx = ['a', 'b', 'c', 'd', 'e'] - series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, - '1/5/2000'], index=idx, name='foo') - dseries = Series([to_datetime('1/1/2000', cache=cache), np.nan, - to_datetime('1/3/2000', cache=cache), np.nan, - to_datetime('1/5/2000', cache=cache)], - index=idx, name='foo') + idx = ["a", "b", "c", "d", "e"] + series = Series( + ["1/1/2000", np.nan, "1/3/2000", np.nan, "1/5/2000"], index=idx, name="foo" + ) + dseries = Series( + [ + to_datetime("1/1/2000", cache=cache), + np.nan, + to_datetime("1/3/2000", cache=cache), + np.nan, + to_datetime("1/5/2000", cache=cache), + ], + index=idx, + name="foo", + ) result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype='M8[ns]'), 
index=idx) + expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) for i in range(5): x = series[i] if isna(x): @@ -1414,32 +1575,41 @@ def test_string_na_nat_conversion(self, cache): expected[i] = to_datetime(x, cache=cache) assert_series_equal(result, expected, check_names=False) - assert result.name == 'foo' + assert result.name == "foo" assert_series_equal(dresult, expected, check_names=False) - assert dresult.name == 'foo' + assert dresult.name == "foo" - @pytest.mark.parametrize('dtype', [ - 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', - 'datetime64[us]', 'datetime64[ns]']) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "dtype", + [ + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], + ) + @pytest.mark.parametrize("cache", [True, False]) def test_dti_constructor_numpy_timeunits(self, cache, dtype): # GH 9114 - base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT'], - cache=cache) + base = pd.to_datetime( + ["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache + ) values = base.values.astype(dtype) tm.assert_index_equal(DatetimeIndex(values), base) tm.assert_index_equal(to_datetime(values, cache=cache), base) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_dayfirst(self, cache): # GH 5917 - arr = ['10/02/2014', '11/02/2014', '12/02/2014'] - expected = DatetimeIndex([datetime(2014, 2, 10), datetime(2014, 2, 11), - datetime(2014, 2, 12)]) + arr = ["10/02/2014", "11/02/2014", "12/02/2014"] + expected = DatetimeIndex( + [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] + ) idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -1455,218 +1625,237 @@ def test_dayfirst(self, cache): class TestGuessDatetimeFormat: - @td.skip_if_not_us_locale def test_guess_datetime_format_for_array(self): - expected_format = '%Y-%m-%d %H:%M:%S.%f' + expected_format = "%Y-%m-%d %H:%M:%S.%f" dt_string = datetime(2011, 12, 30, 0, 0, 0).strftime(expected_format) test_arrays = [ - np.array([dt_string, dt_string, dt_string], dtype='O'), - np.array([np.nan, np.nan, dt_string], dtype='O'), - np.array([dt_string, 'random_string'], dtype='O'), + np.array([dt_string, dt_string, dt_string], dtype="O"), + np.array([np.nan, np.nan, dt_string], dtype="O"), + np.array([dt_string, "random_string"], dtype="O"), ] for test_array in test_arrays: - assert tools._guess_datetime_format_for_array( - test_array) == expected_format + assert tools._guess_datetime_format_for_array(test_array) == expected_format format_for_string_of_nans = tools._guess_datetime_format_for_array( - np.array( - [np.nan, np.nan, np.nan], dtype='O')) + np.array([np.nan, np.nan, np.nan], dtype="O") + ) assert format_for_string_of_nans is None class TestToDatetimeInferFormat: - - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_consistent_format(self, cache): - s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) + s = pd.Series(pd.date_range("20000101", periods=50, freq="H")) - test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', - '%Y-%m-%dT%H:%M:%S.%f'] + test_formats = ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"] for test_format in test_formats: s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) - 
with_format = pd.to_datetime(s_as_dt_strings, format=test_format, - cache=cache) - no_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=False, - cache=cache) - yes_infer = pd.to_datetime(s_as_dt_strings, - infer_datetime_format=True, - cache=cache) + with_format = pd.to_datetime( + s_as_dt_strings, format=test_format, cache=cache + ) + no_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=False, cache=cache + ) + yes_infer = pd.to_datetime( + s_as_dt_strings, infer_datetime_format=True, cache=cache + ) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same tm.assert_series_equal(with_format, no_infer) tm.assert_series_equal(no_infer, yes_infer) - @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, - cache): - s = pd.Series(np.array(['01/01/2011 00:00:00', - '01-02-2011 00:00:00', - '2011-01-03T00:00:00'])) + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache): + s = pd.Series( + np.array( + ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] + ) + ) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) - s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) + s = pd.Series(np.array(["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): - s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, - '01/03/2011 00:00:00', np.nan])) - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_infer_datetime_format_series_start_with_nans(self, - cache): - s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', - '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - - tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False, - cache=cache), - pd.to_datetime(s, infer_datetime_format=True, - cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) + s = pd.Series( + np.array(["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan]) + ) + tm.assert_series_equal( + pd.to_datetime(s, infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) + def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): + s = pd.Series( + np.array( + [ + np.nan, + np.nan, + "01/01/2011 00:00:00", + "01/02/2011 00:00:00", + "01/03/2011 00:00:00", + ] + ) + ) + + tm.assert_series_equal( + pd.to_datetime(s, 
infer_datetime_format=False, cache=cache), + pd.to_datetime(s, infer_datetime_format=True, cache=cache), + ) + + @pytest.mark.parametrize("cache", [True, False]) def test_to_datetime_iso8601_noleading_0s(self, cache): # GH 11871 - s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) - expected = pd.Series([pd.Timestamp('2014-01-01'), - pd.Timestamp('2014-02-02'), - pd.Timestamp('2015-03-03')]) + s = pd.Series(["2014-1-1", "2014-2-2", "2015-3-3"]) + expected = pd.Series( + [ + pd.Timestamp("2014-01-01"), + pd.Timestamp("2014-02-02"), + pd.Timestamp("2015-03-03"), + ] + ) tm.assert_series_equal(pd.to_datetime(s, cache=cache), expected) - tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d', - cache=cache), expected) + tm.assert_series_equal( + pd.to_datetime(s, format="%Y-%m-%d", cache=cache), expected + ) class TestDaysInMonth: # tests for issue #10154 - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_coerce(self, cache): - assert isna(to_datetime('2015-02-29', errors='coerce', cache=cache)) - assert isna(to_datetime('2015-02-29', format="%Y-%m-%d", - errors='coerce', cache=cache)) - assert isna(to_datetime('2015-02-32', format="%Y-%m-%d", - errors='coerce', cache=cache)) - assert isna(to_datetime('2015-04-31', format="%Y-%m-%d", - errors='coerce', cache=cache)) - - @pytest.mark.parametrize('cache', [True, False]) + assert isna(to_datetime("2015-02-29", errors="coerce", cache=cache)) + assert isna( + to_datetime("2015-02-29", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-02-32", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + assert isna( + to_datetime("2015-04-31", format="%Y-%m-%d", errors="coerce", cache=cache) + ) + + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-29', errors='raise', cache=cache) + to_datetime("2015-02-29", errors="raise", cache=cache) msg = "time data 2015-02-29 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-29', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-02-29", errors="raise", format="%Y-%m-%d", cache=cache) msg = "time data 2015-02-32 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-02-32', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-02-32", errors="raise", format="%Y-%m-%d", cache=cache) msg = "time data 2015-04-31 doesn't match format specified" with pytest.raises(ValueError, match=msg): - to_datetime('2015-04-31', errors='raise', format="%Y-%m-%d", - cache=cache) + to_datetime("2015-04-31", errors="raise", format="%Y-%m-%d", cache=cache) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_day_not_in_month_ignore(self, cache): - assert to_datetime('2015-02-29', errors='ignore', - cache=cache) == '2015-02-29' - assert to_datetime('2015-02-29', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-02-29' - assert to_datetime('2015-02-32', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-02-32' - assert to_datetime('2015-04-31', errors='ignore', - format="%Y-%m-%d", cache=cache) == '2015-04-31' + assert to_datetime("2015-02-29", errors="ignore", cache=cache) == "2015-02-29" + assert ( + to_datetime("2015-02-29", errors="ignore", format="%Y-%m-%d", cache=cache) + == 
"2015-02-29" + ) + assert ( + to_datetime("2015-02-32", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-02-32" + ) + assert ( + to_datetime("2015-04-31", errors="ignore", format="%Y-%m-%d", cache=cache) + == "2015-04-31" + ) class TestDatetimeParsingWrappers: - - @pytest.mark.parametrize('date_str,expected', list({ - '2011-01-01': datetime(2011, 1, 1), - '2Q2005': datetime(2005, 4, 1), - '2Q05': datetime(2005, 4, 1), - '2005Q1': datetime(2005, 1, 1), - '05Q1': datetime(2005, 1, 1), - '2011Q3': datetime(2011, 7, 1), - '11Q3': datetime(2011, 7, 1), - '3Q2011': datetime(2011, 7, 1), - '3Q11': datetime(2011, 7, 1), - - # quarterly without space - '2000Q4': datetime(2000, 10, 1), - '00Q4': datetime(2000, 10, 1), - '4Q2000': datetime(2000, 10, 1), - '4Q00': datetime(2000, 10, 1), - '2000q4': datetime(2000, 10, 1), - '2000-Q4': datetime(2000, 10, 1), - '00-Q4': datetime(2000, 10, 1), - '4Q-2000': datetime(2000, 10, 1), - '4Q-00': datetime(2000, 10, 1), - '00q4': datetime(2000, 10, 1), - '2005': datetime(2005, 1, 1), - '2005-11': datetime(2005, 11, 1), - '2005 11': datetime(2005, 11, 1), - '11-2005': datetime(2005, 11, 1), - '11 2005': datetime(2005, 11, 1), - '200511': datetime(2020, 5, 11), - '20051109': datetime(2005, 11, 9), - '20051109 10:15': datetime(2005, 11, 9, 10, 15), - '20051109 08H': datetime(2005, 11, 9, 8, 0), - '2005-11-09 10:15': datetime(2005, 11, 9, 10, 15), - '2005-11-09 08H': datetime(2005, 11, 9, 8, 0), - '2005/11/09 10:15': datetime(2005, 11, 9, 10, 15), - '2005/11/09 08H': datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), - "Thu Sep 25 2003": datetime(2003, 9, 25), - "Sep 25 2003": datetime(2003, 9, 25), - "January 1 2014": datetime(2014, 1, 1), - - # GHE10537 - '2014-06': datetime(2014, 6, 1), - '06-2014': datetime(2014, 6, 1), - '2014-6': datetime(2014, 6, 1), - '6-2014': datetime(2014, 6, 1), - - '20010101 12': datetime(2001, 1, 1, 12), - '20010101 1234': datetime(2001, 1, 1, 12, 34), - '20010101 123456': datetime(2001, 1, 1, 12, 34, 56)}.items())) - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize( + "date_str,expected", + list( + { + "2011-01-01": datetime(2011, 1, 1), + "2Q2005": datetime(2005, 4, 1), + "2Q05": datetime(2005, 4, 1), + "2005Q1": datetime(2005, 1, 1), + "05Q1": datetime(2005, 1, 1), + "2011Q3": datetime(2011, 7, 1), + "11Q3": datetime(2011, 7, 1), + "3Q2011": datetime(2011, 7, 1), + "3Q11": datetime(2011, 7, 1), + # quarterly without space + "2000Q4": datetime(2000, 10, 1), + "00Q4": datetime(2000, 10, 1), + "4Q2000": datetime(2000, 10, 1), + "4Q00": datetime(2000, 10, 1), + "2000q4": datetime(2000, 10, 1), + "2000-Q4": datetime(2000, 10, 1), + "00-Q4": datetime(2000, 10, 1), + "4Q-2000": datetime(2000, 10, 1), + "4Q-00": datetime(2000, 10, 1), + "00q4": datetime(2000, 10, 1), + "2005": datetime(2005, 1, 1), + "2005-11": datetime(2005, 11, 1), + "2005 11": datetime(2005, 11, 1), + "11-2005": datetime(2005, 11, 1), + "11 2005": datetime(2005, 11, 1), + "200511": datetime(2020, 5, 11), + "20051109": datetime(2005, 11, 9), + "20051109 10:15": datetime(2005, 11, 9, 10, 15), + "20051109 08H": datetime(2005, 11, 9, 8, 0), + "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), + "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), + "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), + "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), + "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), + "Thu Sep 25 2003": datetime(2003, 9, 25), + "Sep 25 2003": datetime(2003, 9, 25), + 
"January 1 2014": datetime(2014, 1, 1), + # GHE10537 + "2014-06": datetime(2014, 6, 1), + "06-2014": datetime(2014, 6, 1), + "2014-6": datetime(2014, 6, 1), + "6-2014": datetime(2014, 6, 1), + "20010101 12": datetime(2001, 1, 1, 12), + "20010101 1234": datetime(2001, 1, 1, 12, 34), + "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), + }.items() + ), + ) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers(self, date_str, expected, cache): # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _, _ = parsing.parse_time_string(date_str, - yearfirst=yearfirst) + result1, _, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below - result4 = to_datetime(np.array([date_str], dtype=object), - yearfirst=yearfirst, cache=cache) + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -1682,31 +1871,30 @@ def test_parsers(self, date_str, expected, cache): if not yearfirst: result5 = Timestamp(date_str) assert result5 == expected - result7 = date_range(date_str, freq='S', periods=1, - yearfirst=yearfirst) + result7 = date_range(date_str, freq="S", periods=1, yearfirst=yearfirst) assert result7 == expected - @pytest.mark.parametrize('cache', [True, False]) - def test_na_values_with_cache(self, cache, unique_nulls_fixture, - unique_nulls_fixture2): + @pytest.mark.parametrize("cache", [True, False]) + def test_na_values_with_cache( + self, cache, unique_nulls_fixture, unique_nulls_fixture2 + ): # GH22305 - expected = Index([NaT, NaT], dtype='datetime64[ns]') - result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], - cache=cache) + expected = Index([NaT, NaT], dtype="datetime64[ns]") + result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) def test_parsers_nat(self): # Test that each of several string-accepting methods return pd.NaT - result1, _, _ = parsing.parse_time_string('NaT') - result2 = to_datetime('NaT') - result3 = Timestamp('NaT') - result4 = DatetimeIndex(['NaT'])[0] + result1, _, _ = parsing.parse_time_string("NaT") + result2 = to_datetime("NaT") + result3 = Timestamp("NaT") + result4 = DatetimeIndex(["NaT"])[0] assert result1 is NaT assert result2 is NaT assert result3 is NaT assert result4 is NaT - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers_dayfirst_yearfirst(self, cache): # OK # 2.5.1 10-11-12 [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00 @@ -1749,55 +1937,58 @@ def test_parsers_dayfirst_yearfirst(self, cache): # 2.5.3 20/12/21 [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00 # str : dayfirst, yearfirst, expected - cases = {'10-11-12': [(False, False, - datetime(2012, 10, 11)), - (True, False, - datetime(2012, 11, 10)), - (False, True, - datetime(2010, 11, 12)), - (True, True, - datetime(2010, 12, 11))], - '20/12/21': [(False, False, - datetime(2021, 12, 20)), - (True, False, - datetime(2021, 12, 20)), - (False, True, - datetime(2020, 12, 21)), - (True, True, - datetime(2020, 12, 21))]} + cases = { + "10-11-12": [ + (False, False, datetime(2012, 10, 11)), + (True, False, datetime(2012, 11, 10)), + (False, True, datetime(2010, 
11, 12)), + (True, True, datetime(2010, 12, 11)), + ], + "20/12/21": [ + (False, False, datetime(2021, 12, 20)), + (True, False, datetime(2021, 12, 20)), + (False, True, datetime(2020, 12, 21)), + (True, True, datetime(2020, 12, 21)), + ], + } for date_str, values in cases.items(): for dayfirst, yearfirst, expected in values: # compare with dateutil result - dateutil_result = parse(date_str, dayfirst=dayfirst, - yearfirst=yearfirst) + dateutil_result = parse( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) assert dateutil_result == expected - result1, _, _ = parsing.parse_time_string(date_str, - dayfirst=dayfirst, - yearfirst=yearfirst) + result1, _, _ = parsing.parse_time_string( + date_str, dayfirst=dayfirst, yearfirst=yearfirst + ) # we don't support dayfirst/yearfirst here: if not dayfirst and not yearfirst: result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime(date_str, dayfirst=dayfirst, - yearfirst=yearfirst, cache=cache) + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) - result4 = DatetimeIndex([date_str], dayfirst=dayfirst, - yearfirst=yearfirst)[0] + result4 = DatetimeIndex( + [date_str], dayfirst=dayfirst, yearfirst=yearfirst + )[0] assert result1 == expected assert result3 == expected assert result4 == expected - @pytest.mark.parametrize('cache', [True, False]) + @pytest.mark.parametrize("cache", [True, False]) def test_parsers_timestring(self, cache): # must be the same as dateutil result - cases = {'10:15': (parse('10:15'), datetime(1, 1, 1, 10, 15)), - '9:05': (parse('9:05'), datetime(1, 1, 1, 9, 5))} + cases = { + "10:15": (parse("10:15"), datetime(1, 1, 1, 10, 15)), + "9:05": (parse("9:05"), datetime(1, 1, 1, 9, 5)), + } for date_str, (exp_now, exp_def) in cases.items(): result1, _, _ = parsing.parse_time_string(date_str) @@ -1817,8 +2008,17 @@ def test_parsers_timestring(self, cache): @td.skip_if_has_locale def test_parsers_time(self): # GH11818 - strings = ["14:15", "1415", "2:15pm", "0215pm", "14:15:00", "141500", - "2:15:00pm", "021500pm", time(14, 15)] + strings = [ + "14:15", + "1415", + "2:15pm", + "0215pm", + "14:15:00", + "141500", + "2:15:00pm", + "021500pm", + time(14, 15), + ] expected = time(14, 15) for time_string in strings: @@ -1835,8 +2035,7 @@ def test_parsers_time(self): assert tools.to_time(arg) == expected_arr assert tools.to_time(arg, format="%H:%M") == expected_arr assert tools.to_time(arg, infer_time_format=True) == expected_arr - assert tools.to_time(arg, format="%I:%M%p", - errors="coerce") == [None, None] + assert tools.to_time(arg, format="%I:%M%p", errors="coerce") == [None, None] res = tools.to_time(arg, format="%I:%M%p", errors="ignore") tm.assert_numpy_array_equal(res, np.array(arg, dtype=np.object_)) @@ -1844,30 +2043,42 @@ def test_parsers_time(self): with pytest.raises(ValueError): tools.to_time(arg, format="%I:%M%p", errors="raise") - tm.assert_series_equal(tools.to_time(Series(arg, name="test")), - Series(expected_arr, name="test")) + tm.assert_series_equal( + tools.to_time(Series(arg, name="test")), Series(expected_arr, name="test") + ) res = tools.to_time(np.array(arg)) assert isinstance(res, list) assert res == expected_arr - @pytest.mark.parametrize('cache', [True, False]) - @pytest.mark.parametrize('dt_string, tz, dt_string_repr', [ - ('2013-01-01 05:45+0545', pytz.FixedOffset(345), - "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')"), - ('2013-01-01 05:30+0530', pytz.FixedOffset(330), - "Timestamp('2013-01-01 05:30:00+0530', 
tz='pytz.FixedOffset(330)')")]) - def test_parsers_timezone_minute_offsets_roundtrip(self, cache, dt_string, - tz, dt_string_repr): + @pytest.mark.parametrize("cache", [True, False]) + @pytest.mark.parametrize( + "dt_string, tz, dt_string_repr", + [ + ( + "2013-01-01 05:45+0545", + pytz.FixedOffset(345), + "Timestamp('2013-01-01 05:45:00+0545', tz='pytz.FixedOffset(345)')", + ), + ( + "2013-01-01 05:30+0530", + pytz.FixedOffset(330), + "Timestamp('2013-01-01 05:30:00+0530', tz='pytz.FixedOffset(330)')", + ), + ], + ) + def test_parsers_timezone_minute_offsets_roundtrip( + self, cache, dt_string, tz, dt_string_repr + ): # GH11708 base = to_datetime("2013-01-01 00:00:00", cache=cache) - base = base.tz_localize('UTC').tz_convert(tz) + base = base.tz_localize("UTC").tz_convert(tz) dt_time = to_datetime(dt_string, cache=cache) assert base == dt_time assert dt_string_repr == repr(dt_time) -@pytest.fixture(params=['D', 's', 'ms', 'us', 'ns']) +@pytest.fixture(params=["D", "s", "ms", "us", "ns"]) def units(request): """Day and some time units. @@ -1883,7 +2094,7 @@ def units(request): @pytest.fixture def epoch_1960(): """Timestamp at 1960-01-01.""" - return Timestamp('1960-01-01') + return Timestamp("1960-01-01") @pytest.fixture @@ -1891,7 +2102,7 @@ def units_from_epochs(): return list(range(5)) -@pytest.fixture(params=['timestamp', 'pydatetime', 'datetime64', 'str_1960']) +@pytest.fixture(params=["timestamp", "pydatetime", "datetime64", "str_1960"]) def epochs(epoch_1960, request): """Timestamp at 1960-01-01 in various forms. @@ -1900,11 +2111,10 @@ def epochs(epoch_1960, request): * numpy.datetime64 * str """ - assert request.param in {'timestamp', 'pydatetime', 'datetime64', - "str_1960"} - if request.param == 'timestamp': + assert request.param in {"timestamp", "pydatetime", "datetime64", "str_1960"} + if request.param == "timestamp": return epoch_1960 - elif request.param == 'pydatetime': + elif request.param == "pydatetime": return epoch_1960.to_pydatetime() elif request.param == "datetime64": return epoch_1960.to_datetime64() @@ -1914,50 +2124,47 @@ def epochs(epoch_1960, request): @pytest.fixture def julian_dates(): - return pd.date_range('2014-1-1', periods=10).to_julian_date().values + return pd.date_range("2014-1-1", periods=10).to_julian_date().values class TestOrigin: - def test_to_basic(self, julian_dates): # gh-11276, gh-11745 # for origin as julian - result = Series(pd.to_datetime( - julian_dates, unit='D', origin='julian')) - expected = Series(pd.to_datetime( - julian_dates - pd.Timestamp(0).to_julian_date(), unit='D')) + result = Series(pd.to_datetime(julian_dates, unit="D", origin="julian")) + expected = Series( + pd.to_datetime(julian_dates - pd.Timestamp(0).to_julian_date(), unit="D") + ) assert_series_equal(result, expected) - result = Series(pd.to_datetime( - [0, 1, 2], unit='D', origin='unix')) - expected = Series([Timestamp('1970-01-01'), - Timestamp('1970-01-02'), - Timestamp('1970-01-03')]) + result = Series(pd.to_datetime([0, 1, 2], unit="D", origin="unix")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) assert_series_equal(result, expected) # default - result = Series(pd.to_datetime( - [0, 1, 2], unit='D')) - expected = Series([Timestamp('1970-01-01'), - Timestamp('1970-01-02'), - Timestamp('1970-01-03')]) + result = Series(pd.to_datetime([0, 1, 2], unit="D")) + expected = Series( + [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ) assert_series_equal(result, expected) def 
test_julian_round_trip(self): - result = pd.to_datetime(2456658, origin='julian', unit='D') + result = pd.to_datetime(2456658, origin="julian", unit="D") assert result.to_julian_date() == 2456658 # out-of-bounds with pytest.raises(ValueError): - pd.to_datetime(1, origin="julian", unit='D') + pd.to_datetime(1, origin="julian", unit="D") def test_invalid_unit(self, units, julian_dates): # checking for invalid combination of origin='julian' and unit != D - if units != 'D': + if units != "D": with pytest.raises(ValueError): - pd.to_datetime(julian_dates, unit=units, origin='julian') + pd.to_datetime(julian_dates, unit=units, origin="julian") def test_invalid_origin(self): @@ -1966,38 +2173,37 @@ def test_invalid_origin(self): pd.to_datetime("2005-01-01", origin="1960-01-01") with pytest.raises(ValueError): - pd.to_datetime("2005-01-01", origin="1960-01-01", unit='D') + pd.to_datetime("2005-01-01", origin="1960-01-01", unit="D") def test_epoch(self, units, epochs, epoch_1960, units_from_epochs): expected = Series( - [pd.Timedelta(x, unit=units) + - epoch_1960 for x in units_from_epochs]) + [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs] + ) - result = Series(pd.to_datetime( - units_from_epochs, unit=units, origin=epochs)) + result = Series(pd.to_datetime(units_from_epochs, unit=units, origin=epochs)) assert_series_equal(result, expected) - @pytest.mark.parametrize("origin, exc", - [('random_string', ValueError), - ('epoch', ValueError), - ('13-24-1990', ValueError), - (datetime(1, 1, 1), tslib.OutOfBoundsDatetime)]) + @pytest.mark.parametrize( + "origin, exc", + [ + ("random_string", ValueError), + ("epoch", ValueError), + ("13-24-1990", ValueError), + (datetime(1, 1, 1), tslib.OutOfBoundsDatetime), + ], + ) def test_invalid_origins(self, origin, exc, units, units_from_epochs): with pytest.raises(exc): - pd.to_datetime(units_from_epochs, unit=units, - origin=origin) + pd.to_datetime(units_from_epochs, unit=units, origin=origin) def test_invalid_origins_tzinfo(self): # GH16842 with pytest.raises(ValueError): - pd.to_datetime(1, unit='D', - origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) + pd.to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=pytz.utc)) - @pytest.mark.parametrize("format", [ - None, "%Y-%m-%d %H:%M:%S" - ]) + @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) def test_to_datetime_out_of_bounds_with_format_arg(self, format): # see gh-23830 msg = "Out of bounds nanosecond timestamp" @@ -2008,45 +2214,53 @@ def test_processing_order(self): # make sure we handle out-of-bounds *before* # constructing the dates - result = pd.to_datetime(200 * 365, unit='D') - expected = Timestamp('2169-11-13 00:00:00') + result = pd.to_datetime(200 * 365, unit="D") + expected = Timestamp("2169-11-13 00:00:00") assert result == expected - result = pd.to_datetime(200 * 365, unit='D', origin='1870-01-01') - expected = Timestamp('2069-11-13 00:00:00') + result = pd.to_datetime(200 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2069-11-13 00:00:00") assert result == expected - result = pd.to_datetime(300 * 365, unit='D', origin='1870-01-01') - expected = Timestamp('2169-10-20 00:00:00') + result = pd.to_datetime(300 * 365, unit="D", origin="1870-01-01") + expected = Timestamp("2169-10-20 00:00:00") assert result == expected - @pytest.mark.parametrize('offset,utc,exp', [ - ["Z", True, "2019-01-01T00:00:00.000Z"], - ["Z", None, "2019-01-01T00:00:00.000Z"], - ["-01:00", True, "2019-01-01T01:00:00.000Z"], - ["-01:00", None, 
"2019-01-01T00:00:00.000-01:00"], - ]) + @pytest.mark.parametrize( + "offset,utc,exp", + [ + ["Z", True, "2019-01-01T00:00:00.000Z"], + ["Z", None, "2019-01-01T00:00:00.000Z"], + ["-01:00", True, "2019-01-01T01:00:00.000Z"], + ["-01:00", None, "2019-01-01T00:00:00.000-01:00"], + ], + ) def test_arg_tz_ns_unit(self, offset, utc, exp): # GH 25546 arg = "2019-01-01T00:00:00.000" + offset - result = to_datetime([arg], unit='ns', utc=utc) + result = to_datetime([arg], unit="ns", utc=utc) expected = to_datetime([exp]) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('listlike,do_caching', [ - ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), - ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True) -]) +@pytest.mark.parametrize( + "listlike,do_caching", + [([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False), ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True)], +) def test_should_cache(listlike, do_caching): - assert tools.should_cache(listlike, check_count=len(listlike), - unique_share=0.7) == do_caching - - -@pytest.mark.parametrize('unique_share,check_count, err_message', [ - (0.5, 11, r'check_count must be in next bounds: \[0; len\(arg\)\]'), - (10, 2, r'unique_share must be in next bounds: \(0; 1\)') -]) + assert ( + tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7) + == do_caching + ) + + +@pytest.mark.parametrize( + "unique_share,check_count, err_message", + [ + (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"), + (10, 2, r"unique_share must be in next bounds: \(0; 1\)"), + ], +) def test_should_cache_errors(unique_share, check_count, err_message): arg = [5] * 10 diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 16bcb459a22f9..91022fef16521 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -4,8 +4,14 @@ from pandas.core.dtypes.dtypes import CategoricalDtype, IntervalDtype from pandas import ( - CategoricalIndex, Index, IntervalIndex, NaT, Timedelta, Timestamp, - interval_range) + CategoricalIndex, + Index, + IntervalIndex, + NaT, + Timedelta, + Timestamp, + interval_range, +) import pandas.util.testing as tm @@ -13,7 +19,7 @@ class Base: """Tests common to IntervalIndex with any subtype""" def test_astype_idempotent(self, index): - result = index.astype('interval') + result = index.astype("interval") tm.assert_index_equal(result, index) result = index.astype(index.dtype) @@ -21,12 +27,12 @@ def test_astype_idempotent(self, index): def test_astype_object(self, index): result = index.astype(object) - expected = Index(index.values, dtype='object') + expected = Index(index.values, dtype="object") tm.assert_index_equal(result, expected) assert not result.equals(index) def test_astype_category(self, index): - result = index.astype('category') + result = index.astype("category") expected = CategoricalIndex(index.values) tm.assert_index_equal(result, expected) @@ -37,64 +43,77 @@ def test_astype_category(self, index): categories = index.dropna().unique().values[:-1] dtype = CategoricalDtype(categories=categories, ordered=True) result = index.astype(dtype) - expected = CategoricalIndex( - index.values, categories=categories, ordered=True) + expected = CategoricalIndex(index.values, categories=categories, ordered=True) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - 'int64', 'uint64', 'float64', 'complex128', 'period[M]', - 'timedelta64', 'timedelta64[ns]', 'datetime64', 'datetime64[ns]', - 'datetime64[ns, US/Eastern]']) + 
@pytest.mark.parametrize( + "dtype", + [ + "int64", + "uint64", + "float64", + "complex128", + "period[M]", + "timedelta64", + "timedelta64[ns]", + "datetime64", + "datetime64[ns]", + "datetime64[ns, US/Eastern]", + ], + ) def test_astype_cannot_cast(self, index, dtype): - msg = 'Cannot cast IntervalIndex to dtype' + msg = "Cannot cast IntervalIndex to dtype" with pytest.raises(TypeError, match=msg): index.astype(dtype) def test_astype_invalid_dtype(self, index): msg = "data type 'fake_dtype' not understood" with pytest.raises(TypeError, match=msg): - index.astype('fake_dtype') + index.astype("fake_dtype") class TestIntSubtype(Base): """Tests specific to IntervalIndex with integer-like subtype""" indexes = [ - IntervalIndex.from_breaks(np.arange(-10, 11, dtype='int64')), - IntervalIndex.from_breaks( - np.arange(100, dtype='uint64'), closed='left'), + IntervalIndex.from_breaks(np.arange(-10, 11, dtype="int64")), + IntervalIndex.from_breaks(np.arange(100, dtype="uint64"), closed="left"), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', [ - 'float64', 'datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize( + "subtype", ["float64", "datetime64[ns]", "timedelta64[ns]"] + ) def test_subtype_conversion(self, index, subtype): dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('subtype_start, subtype_end', [ - ('int64', 'uint64'), ('uint64', 'int64')]) + @pytest.mark.parametrize( + "subtype_start, subtype_end", [("int64", "uint64"), ("uint64", "int64")] + ) def test_subtype_integer(self, subtype_start, subtype_end): index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) dtype = IntervalDtype(subtype_end) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype_end), - index.right.astype(subtype_end), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype_end), + index.right.astype(subtype_end), + closed=index.closed, + ) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(reason='GH#15832') + @pytest.mark.xfail(reason="GH#15832") def test_subtype_integer_errors(self): # int64 -> uint64 fails with negative values index = interval_range(-10, 10) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) @@ -103,53 +122,53 @@ class TestFloatSubtype(Base): """Tests specific to IntervalIndex with float subtype""" indexes = [ - interval_range(-10.0, 10.0, closed='neither'), - IntervalIndex.from_arrays([-1.5, np.nan, 0., 0., 1.5], - [-0.5, np.nan, 1., 1., 3.], - closed='both'), + interval_range(-10.0, 10.0, closed="neither"), + IntervalIndex.from_arrays( + [-1.5, np.nan, 0.0, 0.0, 1.5], [-0.5, np.nan, 1.0, 1.0, 3.0], closed="both" + ), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', ['int64', 'uint64']) + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, subtype): index = interval_range(0.0, 10.0) dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - 
index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) # raises with NA - msg = 'Cannot convert NA to integer' + msg = "Cannot convert NA to integer" with pytest.raises(ValueError, match=msg): index.insert(0, np.nan).astype(dtype) - @pytest.mark.xfail(reason='GH#15832') + @pytest.mark.xfail(reason="GH#15832") def test_subtype_integer_errors(self): # float64 -> uint64 fails with negative values index = interval_range(-10.0, 10.0) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) # float64 -> integer-like fails with non-integer valued floats index = interval_range(0.0, 10.0, freq=0.25) - dtype = IntervalDtype('int64') + dtype = IntervalDtype("int64") with pytest.raises(ValueError): index.astype(dtype) - dtype = IntervalDtype('uint64') + dtype = IntervalDtype("uint64") with pytest.raises(ValueError): index.astype(dtype) - @pytest.mark.parametrize('subtype', ['datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_subtype_datetimelike(self, index, subtype): dtype = IntervalDtype(subtype) - msg = 'Cannot convert .* to .*; subtypes are incompatible' + msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) @@ -158,47 +177,47 @@ class TestDatetimelikeSubtype(Base): """Tests specific to IntervalIndex with datetime-like subtype""" indexes = [ - interval_range(Timestamp('2018-01-01'), periods=10, closed='neither'), - interval_range(Timestamp('2018-01-01'), periods=10).insert(2, NaT), - interval_range(Timestamp('2018-01-01', tz='US/Eastern'), periods=10), - interval_range(Timedelta('0 days'), periods=10, closed='both'), - interval_range(Timedelta('0 days'), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01"), periods=10, closed="neither"), + interval_range(Timestamp("2018-01-01"), periods=10).insert(2, NaT), + interval_range(Timestamp("2018-01-01", tz="US/Eastern"), periods=10), + interval_range(Timedelta("0 days"), periods=10, closed="both"), + interval_range(Timedelta("0 days"), periods=10).insert(2, NaT), ] @pytest.fixture(params=indexes) def index(self, request): return request.param - @pytest.mark.parametrize('subtype', ['int64', 'uint64']) + @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, index, subtype): dtype = IntervalDtype(subtype) result = index.astype(dtype) - expected = IntervalIndex.from_arrays(index.left.astype(subtype), - index.right.astype(subtype), - closed=index.closed) + expected = IntervalIndex.from_arrays( + index.left.astype(subtype), index.right.astype(subtype), closed=index.closed + ) tm.assert_index_equal(result, expected) def test_subtype_float(self, index): - dtype = IntervalDtype('float64') - msg = 'Cannot convert .* to .*; subtypes are incompatible' + dtype = IntervalDtype("float64") + msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) def test_subtype_datetimelike(self): # datetime -> timedelta raises - dtype = IntervalDtype('timedelta64[ns]') - msg = 'Cannot convert .* to .*; subtypes are incompatible' + dtype = IntervalDtype("timedelta64[ns]") + msg = "Cannot convert .* to .*; subtypes are incompatible" - index = interval_range(Timestamp('2018-01-01'), periods=10) + index = 
interval_range(Timestamp("2018-01-01"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) - index = interval_range(Timestamp('2018-01-01', tz='CET'), periods=10) + index = interval_range(Timestamp("2018-01-01", tz="CET"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) # timedelta -> datetime raises - dtype = IntervalDtype('datetime64[ns]') - index = interval_range(Timedelta('0 days'), periods=10) + dtype = IntervalDtype("datetime64[ns]") + index = interval_range(Timedelta("0 days"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index aabaaa0f297f9..e2abb4531525a 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -7,14 +7,24 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas import ( - Categorical, CategoricalIndex, Float64Index, Index, Int64Index, Interval, - IntervalIndex, date_range, notna, period_range, timedelta_range) + Categorical, + CategoricalIndex, + Float64Index, + Index, + Int64Index, + Interval, + IntervalIndex, + date_range, + notna, + period_range, + timedelta_range, +) from pandas.core.arrays import IntervalArray import pandas.core.common as com import pandas.util.testing as tm -@pytest.fixture(params=[None, 'foo']) +@pytest.fixture(params=[None, "foo"]) def name(request): return request.param @@ -26,31 +36,39 @@ class Base: get_kwargs_from_breaks to the expected format. """ - @pytest.mark.parametrize('breaks', [ - [3, 14, 15, 92, 653], - np.arange(10, dtype='int64'), - Int64Index(range(-10, 11)), - Float64Index(np.arange(20, 30, 0.5)), - date_range('20180101', periods=10), - date_range('20180101', periods=10, tz='US/Eastern'), - timedelta_range('1 day', periods=10)]) + @pytest.mark.parametrize( + "breaks", + [ + [3, 14, 15, 92, 653], + np.arange(10, dtype="int64"), + Int64Index(range(-10, 11)), + Float64Index(np.arange(20, 30, 0.5)), + date_range("20180101", periods=10), + date_range("20180101", periods=10, tz="US/Eastern"), + timedelta_range("1 day", periods=10), + ], + ) def test_constructor(self, constructor, breaks, closed, name): result_kwargs = self.get_kwargs_from_breaks(breaks, closed) result = constructor(closed=closed, name=name, **result_kwargs) assert result.closed == closed assert result.name == name - assert result.dtype.subtype == getattr(breaks, 'dtype', 'int64') + assert result.dtype.subtype == getattr(breaks, "dtype", "int64") tm.assert_index_equal(result.left, Index(breaks[:-1])) tm.assert_index_equal(result.right, Index(breaks[1:])) - @pytest.mark.parametrize('breaks, subtype', [ - (Int64Index([0, 1, 2, 3, 4]), 'float64'), - (Int64Index([0, 1, 2, 3, 4]), 'datetime64[ns]'), - (Int64Index([0, 1, 2, 3, 4]), 'timedelta64[ns]'), - (Float64Index([0, 1, 2, 3, 4]), 'int64'), - (date_range('2017-01-01', periods=5), 'int64'), - (timedelta_range('1 day', periods=5), 'int64')]) + @pytest.mark.parametrize( + "breaks, subtype", + [ + (Int64Index([0, 1, 2, 3, 4]), "float64"), + (Int64Index([0, 1, 2, 3, 4]), "datetime64[ns]"), + (Int64Index([0, 1, 2, 3, 4]), "timedelta64[ns]"), + (Float64Index([0, 1, 2, 3, 4]), "int64"), + (date_range("2017-01-01", periods=5), "int64"), + (timedelta_range("1 day", periods=5), "int64"), + ], + ) def test_constructor_dtype(self, constructor, breaks, subtype): # GH 19262: conversion via dtype parameter expected_kwargs = 
self.get_kwargs_from_breaks(breaks.astype(subtype)) @@ -62,8 +80,7 @@ def test_constructor_dtype(self, constructor, breaks, subtype): result = constructor(dtype=dtype, **result_kwargs) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('breaks', [ - [np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) + @pytest.mark.parametrize("breaks", [[np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) def test_constructor_nan(self, constructor, breaks, closed): # GH 18421 result_kwargs = self.get_kwargs_from_breaks(breaks) @@ -76,46 +93,55 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.dtype.subtype == expected_subtype tm.assert_numpy_array_equal(result._ndarray_values, expected_values) - @pytest.mark.parametrize('breaks', [ - [], - np.array([], dtype='int64'), - np.array([], dtype='float64'), - np.array([], dtype='datetime64[ns]'), - np.array([], dtype='timedelta64[ns]')]) + @pytest.mark.parametrize( + "breaks", + [ + [], + np.array([], dtype="int64"), + np.array([], dtype="float64"), + np.array([], dtype="datetime64[ns]"), + np.array([], dtype="timedelta64[ns]"), + ], + ) def test_constructor_empty(self, constructor, breaks, closed): # GH 18421 result_kwargs = self.get_kwargs_from_breaks(breaks) result = constructor(closed=closed, **result_kwargs) expected_values = np.array([], dtype=object) - expected_subtype = getattr(breaks, 'dtype', np.int64) + expected_subtype = getattr(breaks, "dtype", np.int64) assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype tm.assert_numpy_array_equal(result._ndarray_values, expected_values) - @pytest.mark.parametrize('breaks', [ - tuple('0123456789'), - list('abcdefghij'), - np.array(list('abcdefghij'), dtype=object), - np.array(list('abcdefghij'), dtype=' self.index tm.assert_numpy_array_equal(actual, expected) - actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], "left") tm.assert_numpy_array_equal(actual, expected) actual = self.index == self.index.values @@ -718,9 +785,9 @@ def test_comparison(self): actual = self.index == self.index.left tm.assert_numpy_array_equal(actual, np.array([False, False])) - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): self.index > 0 - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): self.index <= 0 msg = r"unorderable types: Interval\(\) > int\(\)" with pytest.raises(TypeError, match=msg): @@ -730,20 +797,22 @@ def test_comparison(self): self.index > np.arange(3) def test_missing_values(self, closed): - idx = Index([np.nan, Interval(0, 1, closed=closed), - Interval(1, 2, closed=closed)]) - idx2 = IntervalIndex.from_arrays( - [np.nan, 0, 1], [np.nan, 1, 2], closed=closed) + idx = Index( + [np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed)] + ) + idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) - msg = ("missing values must be missing in the same location both left" - " and right sides") + msg = ( + "missing values must be missing in the same location both left" + " and right sides" + ) with pytest.raises(ValueError, match=msg): IntervalIndex.from_arrays( - [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed + ) - tm.assert_numpy_array_equal(isna(idx), - np.array([True, False, False])) + 
tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) def test_sort_values(self, closed): index = self.create_index(closed=closed) @@ -765,44 +834,46 @@ def test_sort_values(self, closed): expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) def test_datetime(self, tz): - start = Timestamp('2000-01-01', tz=tz) + start = Timestamp("2000-01-01", tz=tz) dates = date_range(start=start, periods=10) index = IntervalIndex.from_breaks(dates) # test mid - start = Timestamp('2000-01-01T12:00', tz=tz) + start = Timestamp("2000-01-01T12:00", tz=tz) expected = date_range(start=start, periods=9) tm.assert_index_equal(index.mid, expected) # __contains__ doesn't check individual points - assert Timestamp('2000-01-01', tz=tz) not in index - assert Timestamp('2000-01-01T12', tz=tz) not in index - assert Timestamp('2000-01-02', tz=tz) not in index - iv_true = Interval(Timestamp('2000-01-02', tz=tz), - Timestamp('2000-01-03', tz=tz)) - iv_false = Interval(Timestamp('1999-12-31', tz=tz), - Timestamp('2000-01-01', tz=tz)) + assert Timestamp("2000-01-01", tz=tz) not in index + assert Timestamp("2000-01-01T12", tz=tz) not in index + assert Timestamp("2000-01-02", tz=tz) not in index + iv_true = Interval( + Timestamp("2000-01-02", tz=tz), Timestamp("2000-01-03", tz=tz) + ) + iv_false = Interval( + Timestamp("1999-12-31", tz=tz), Timestamp("2000-01-01", tz=tz) + ) assert iv_true in index assert iv_false not in index # .contains does check individual points - assert not index.contains(Timestamp('2000-01-01', tz=tz)).any() - assert index.contains(Timestamp('2000-01-01T12', tz=tz)).any() - assert index.contains(Timestamp('2000-01-02', tz=tz)).any() + assert not index.contains(Timestamp("2000-01-01", tz=tz)).any() + assert index.contains(Timestamp("2000-01-01T12", tz=tz)).any() + assert index.contains(Timestamp("2000-01-02", tz=tz)).any() # test get_indexer - start = Timestamp('1999-12-31T12:00', tz=tz) - target = date_range(start=start, periods=7, freq='12H') + start = Timestamp("1999-12-31T12:00", tz=tz) + target = date_range(start=start, periods=7, freq="12H") actual = index.get_indexer(target) - expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype='intp') + expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp") tm.assert_numpy_array_equal(actual, expected) - start = Timestamp('2000-01-08T18:00', tz=tz) - target = date_range(start=start, periods=7, freq='6H') + start = Timestamp("2000-01-08T18:00", tz=tz) + target = date_range(start=start, periods=7, freq="6H") actual = index.get_indexer(target) - expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype='intp') + expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp") tm.assert_numpy_array_equal(actual, expected) def test_append(self, closed): @@ -811,20 +882,23 @@ def test_append(self, closed): index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) result = index1.append(index2) - expected = IntervalIndex.from_arrays( - [0, 1, 1, 2], [1, 2, 2, 3], closed=closed) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) result = index1.append([index1, index2]) expected = IntervalIndex.from_arrays( - [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed) + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed + ) tm.assert_index_equal(result, expected) - msg = ('can only append two IntervalIndex objects that are closed 
' - 'on the same side') - for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + msg = ( + "can only append two IntervalIndex objects that are closed " + "on the same side" + ) + for other_closed in {"left", "right", "both", "neither"} - {closed}: index_other_closed = IntervalIndex.from_arrays( - [0, 1], [1, 2], closed=other_closed) + [0, 1], [1, 2], closed=other_closed + ) with pytest.raises(ValueError, match=msg): index1.append(index_other_closed) @@ -854,24 +928,27 @@ def test_is_non_overlapping_monotonic(self, closed): assert idx.is_non_overlapping_monotonic is False # Should be False for closed='both', otherwise True (GH16560) - if closed == 'both': + if closed == "both": idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is False else: idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True - @pytest.mark.parametrize('start, shift, na_value', [ - (0, 1, np.nan), - (Timestamp('2018-01-01'), Timedelta('1 day'), pd.NaT), - (Timedelta('0 days'), Timedelta('1 day'), pd.NaT)]) + @pytest.mark.parametrize( + "start, shift, na_value", + [ + (0, 1, np.nan), + (Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT), + (Timedelta("0 days"), Timedelta("1 day"), pd.NaT), + ], + ) def test_is_overlapping(self, start, shift, na_value, closed): # GH 23309 # see test_interval_tree.py for extensive tests; interface tests here # non-overlapping - tuples = [(start + n * shift, start + (n + 1) * shift) - for n in (0, 2, 4)] + tuples = [(start + n * shift, start + (n + 1) * shift) for n in (0, 2, 4)] index = IntervalIndex.from_tuples(tuples, closed=closed) assert index.is_overlapping is False @@ -881,8 +958,7 @@ def test_is_overlapping(self, start, shift, na_value, closed): assert index.is_overlapping is False # overlapping - tuples = [(start + n * shift, start + (n + 2) * shift) - for n in range(3)] + tuples = [(start + n * shift, start + (n + 2) * shift) for n in range(3)] index = IntervalIndex.from_tuples(tuples, closed=closed) assert index.is_overlapping is True @@ -892,11 +968,10 @@ def test_is_overlapping(self, start, shift, na_value, closed): assert index.is_overlapping is True # common endpoints - tuples = [(start + n * shift, start + (n + 1) * shift) - for n in range(3)] + tuples = [(start + n * shift, start + (n + 1) * shift) for n in range(3)] index = IntervalIndex.from_tuples(tuples, closed=closed) result = index.is_overlapping - expected = closed == 'both' + expected = closed == "both" assert result is expected # common endpoints with NA @@ -905,12 +980,24 @@ def test_is_overlapping(self, start, shift, na_value, closed): result = index.is_overlapping assert result is expected - @pytest.mark.parametrize('tuples', [ - list(zip(range(10), range(1, 11))), - list(zip(date_range('20170101', periods=10), - date_range('20170101', periods=10))), - list(zip(timedelta_range('0 days', periods=10), - timedelta_range('1 day', periods=10)))]) + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))), + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ), + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ), + ], + ) def test_to_tuples(self, tuples): # GH 18756 idx = IntervalIndex.from_tuples(tuples) @@ -918,13 +1005,27 @@ def test_to_tuples(self, tuples): expected = Index(com.asarray_tuplesafe(tuples)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tuples', [ - 
list(zip(range(10), range(1, 11))) + [np.nan], - list(zip(date_range('20170101', periods=10), - date_range('20170101', periods=10))) + [np.nan], - list(zip(timedelta_range('0 days', periods=10), - timedelta_range('1 day', periods=10))) + [np.nan]]) - @pytest.mark.parametrize('na_tuple', [True, False]) + @pytest.mark.parametrize( + "tuples", + [ + list(zip(range(10), range(1, 11))) + [np.nan], + list( + zip( + date_range("20170101", periods=10), + date_range("20170101", periods=10), + ) + ) + + [np.nan], + list( + zip( + timedelta_range("0 days", periods=10), + timedelta_range("1 day", periods=10), + ) + ) + + [np.nan], + ], + ) + @pytest.mark.parametrize("na_tuple", [True, False]) def test_to_tuples_na(self, tuples, na_tuple): # GH 18756 idx = IntervalIndex.from_tuples(tuples) @@ -946,8 +1047,8 @@ def test_to_tuples_na(self, tuples, na_tuple): def test_nbytes(self): # GH 19209 - left = np.arange(0, 4, dtype='i8') - right = np.arange(1, 5, dtype='i8') + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") result = IntervalIndex.from_arrays(left, right).nbytes expected = 64 # 4 * 8 * 2 @@ -955,8 +1056,8 @@ def test_nbytes(self): def test_itemsize(self): # GH 19209 - left = np.arange(0, 4, dtype='i8') - right = np.arange(1, 5, dtype='i8') + left = np.arange(0, 4, dtype="i8") + right = np.arange(1, 5, dtype="i8") expected = 16 # 8 * 2 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -964,8 +1065,7 @@ def test_itemsize(self): assert result == expected - @pytest.mark.parametrize('new_closed', [ - 'left', 'right', 'both', 'neither']) + @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"]) def test_set_closed(self, name, closed, new_closed): # GH 21670 index = interval_range(0, 5, closed=closed, name=name) @@ -973,7 +1073,7 @@ def test_set_closed(self, name, closed, new_closed): expected = interval_range(0, 5, closed=new_closed, name=name) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('bad_closed', ['foo', 10, 'LEFT', True, False]) + @pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False]) def test_set_closed_errors(self, bad_closed): # GH 21670 index = interval_range(0, 5) @@ -983,7 +1083,8 @@ def test_set_closed_errors(self, bad_closed): def test_is_all_dates(self): # GH 23576 - year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'), - pd.Timestamp('2018-01-01 00:00:00')) + year_2017 = pd.Interval( + pd.Timestamp("2017-01-01 00:00:00"), pd.Timestamp("2018-01-01 00:00:00") + ) year_2017_index = pd.IntervalIndex([year_2017]) assert not year_2017_index.is_all_dates diff --git a/pandas/tests/indexes/interval/test_interval_new.py b/pandas/tests/indexes/interval/test_interval_new.py index cef230e98a6ee..ab9f7ef1c3e26 100644 --- a/pandas/tests/indexes/interval/test_interval_new.py +++ b/pandas/tests/indexes/interval/test_interval_new.py @@ -7,14 +7,12 @@ class TestIntervalIndex: - - @pytest.mark.parametrize("side", ['right', 'left', 'both', 'neither']) + @pytest.mark.parametrize("side", ["right", "left", "both", "neither"]) def test_get_loc_interval(self, closed, side): idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) - for bound in [[0, 1], [1, 2], [2, 3], [3, 4], - [0, 2], [2.5, 3], [-1, 4]]: + for bound in [[0, 1], [1, 2], [2, 3], [3, 4], [0, 2], [2.5, 3], [-1, 4]]: # if get_loc is supplied an interval, it should only search # for exact matches, not overlaps or covers, else KeyError. 
if closed == side: @@ -34,10 +32,12 @@ def test_get_loc_scalar(self, closed, scalar): # correct = {side: {query: answer}}. # If query is not in the dict, that query should raise a KeyError - correct = {'right': {0.5: 0, 1: 0, 2.5: 1, 3: 1}, - 'left': {0: 0, 0.5: 0, 2: 1, 2.5: 1}, - 'both': {0: 0, 0.5: 0, 1: 0, 2: 1, 2.5: 1, 3: 1}, - 'neither': {0.5: 0, 2.5: 1}} + correct = { + "right": {0.5: 0, 1: 0, 2.5: 1, 3: 1}, + "left": {0: 0, 0.5: 0, 2: 1, 2.5: 1}, + "both": {0: 0, 0.5: 0, 1: 0, 2: 1, 2.5: 1, 3: 1}, + "neither": {0.5: 0, 2.5: 1}, + } idx = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) @@ -54,35 +54,29 @@ def test_slice_locs_with_interval(self): # increasing monotonically index = IntervalIndex.from_tuples([(0, 2), (1, 3), (2, 4)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 1) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 1) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 1) # decreasing monotonically index = IntervalIndex.from_tuples([(2, 4), (1, 3), (0, 2)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (2, 1) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (2, 1) assert index.slice_locs(start=Interval(0, 2)) == (2, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 1) assert index.slice_locs(end=Interval(0, 2)) == (0, 3) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (0, 3) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (0, 3) # sorted duplicates index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 3) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 2) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) # unsorted duplicates index = IntervalIndex.from_tuples([(0, 2), (2, 4), (0, 2)]) @@ -104,13 +98,11 @@ def test_slice_locs_with_interval(self): # another unsorted duplicates index = IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4), (1, 3)]) - assert index.slice_locs( - start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) + assert index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) == (0, 3) assert index.slice_locs(start=Interval(0, 2)) == (0, 4) assert index.slice_locs(end=Interval(2, 4)) == (0, 3) assert index.slice_locs(end=Interval(0, 2)) == (0, 2) - assert index.slice_locs( - start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) + assert index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) == (2, 2) def test_slice_locs_with_ints_and_floats_succeeds(self): @@ -133,108 +125,125 @@ def test_slice_locs_with_ints_and_floats_succeeds(self): assert index.slice_locs(3, 4) == (1, 1) assert index.slice_locs(0, 4) == (3, 1) - @pytest.mark.parametrize("query", [ - [0, 1], [0, 2], [0, 3], [0, 4]]) - @pytest.mark.parametrize("tuples", [ - [(0, 2), (1, 3), (2, 4)], - [(2, 4), (1, 3), (0, 2)], - [(0, 2), (0, 2), (2, 4)], - [(0, 2), (2, 4), (0, 2)], - [(0, 2), (0, 
2), (2, 4), (1, 3)]]) + @pytest.mark.parametrize("query", [[0, 1], [0, 2], [0, 3], [0, 4]]) + @pytest.mark.parametrize( + "tuples", + [ + [(0, 2), (1, 3), (2, 4)], + [(2, 4), (1, 3), (0, 2)], + [(0, 2), (0, 2), (2, 4)], + [(0, 2), (2, 4), (0, 2)], + [(0, 2), (0, 2), (2, 4), (1, 3)], + ], + ) def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): start, stop = query index = IntervalIndex.from_tuples(tuples) with pytest.raises(KeyError): index.slice_locs(start, stop) - @pytest.mark.parametrize('query, expected', [ - ([Interval(2, 4, closed='right')], [1]), - ([Interval(2, 4, closed='left')], [-1]), - ([Interval(2, 4, closed='both')], [-1]), - ([Interval(2, 4, closed='neither')], [-1]), - ([Interval(1, 4, closed='right')], [-1]), - ([Interval(0, 4, closed='right')], [-1]), - ([Interval(0.5, 1.5, closed='right')], [-1]), - ([Interval(2, 4, closed='right'), Interval(0, 1, closed='right')], - [1, -1]), - ([Interval(2, 4, closed='right'), Interval(2, 4, closed='right')], - [1, 1]), - ([Interval(5, 7, closed='right'), Interval(2, 4, closed='right')], - [2, 1]), - ([Interval(2, 4, closed='right'), Interval(2, 4, closed='left')], - [1, -1])]) + @pytest.mark.parametrize( + "query, expected", + [ + ([Interval(2, 4, closed="right")], [1]), + ([Interval(2, 4, closed="left")], [-1]), + ([Interval(2, 4, closed="both")], [-1]), + ([Interval(2, 4, closed="neither")], [-1]), + ([Interval(1, 4, closed="right")], [-1]), + ([Interval(0, 4, closed="right")], [-1]), + ([Interval(0.5, 1.5, closed="right")], [-1]), + ([Interval(2, 4, closed="right"), Interval(0, 1, closed="right")], [1, -1]), + ([Interval(2, 4, closed="right"), Interval(2, 4, closed="right")], [1, 1]), + ([Interval(5, 7, closed="right"), Interval(2, 4, closed="right")], [2, 1]), + ([Interval(2, 4, closed="right"), Interval(2, 4, closed="left")], [1, -1]), + ], + ) def test_get_indexer_with_interval(self, query, expected): tuples = [(0, 2), (2, 4), (5, 7)] - index = IntervalIndex.from_tuples(tuples, closed='right') + index = IntervalIndex.from_tuples(tuples, closed="right") result = index.get_indexer(query) - expected = np.array(expected, dtype='intp') + expected = np.array(expected, dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('query, expected', [ - ([-0.5], [-1]), - ([0], [-1]), - ([0.5], [0]), - ([1], [0]), - ([1.5], [1]), - ([2], [1]), - ([2.5], [-1]), - ([3], [-1]), - ([3.5], [2]), - ([4], [2]), - ([4.5], [-1]), - ([1, 2], [0, 1]), - ([1, 2, 3], [0, 1, -1]), - ([1, 2, 3, 4], [0, 1, -1, 2]), - ([1, 2, 3, 4, 2], [0, 1, -1, 2, 1])]) + @pytest.mark.parametrize( + "query, expected", + [ + ([-0.5], [-1]), + ([0], [-1]), + ([0.5], [0]), + ([1], [0]), + ([1.5], [1]), + ([2], [1]), + ([2.5], [-1]), + ([3], [-1]), + ([3.5], [2]), + ([4], [2]), + ([4.5], [-1]), + ([1, 2], [0, 1]), + ([1, 2, 3], [0, 1, -1]), + ([1, 2, 3, 4], [0, 1, -1, 2]), + ([1, 2, 3, 4, 2], [0, 1, -1, 2, 1]), + ], + ) def test_get_indexer_with_int_and_float(self, query, expected): tuples = [(0, 1), (1, 2), (3, 4)] - index = IntervalIndex.from_tuples(tuples, closed='right') + index = IntervalIndex.from_tuples(tuples, closed="right") result = index.get_indexer(query) - expected = np.array(expected, dtype='intp') + expected = np.array(expected, dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('tuples, closed', [ - ([(0, 2), (1, 3), (3, 4)], 'neither'), - ([(0, 5), (1, 4), (6, 7)], 'left'), - ([(0, 1), (0, 1), (1, 2)], 'right'), - ([(0, 1), (2, 3), (3, 4)], 'both')]) + @pytest.mark.parametrize( + 
"tuples, closed", + [ + ([(0, 2), (1, 3), (3, 4)], "neither"), + ([(0, 5), (1, 4), (6, 7)], "left"), + ([(0, 1), (0, 1), (1, 2)], "right"), + ([(0, 1), (2, 3), (3, 4)], "both"), + ], + ) def test_get_indexer_errors(self, tuples, closed): # IntervalIndex needs non-overlapping for uniqueness when querying index = IntervalIndex.from_tuples(tuples, closed=closed) - msg = ('cannot handle overlapping indices; use ' - 'IntervalIndex.get_indexer_non_unique') + msg = ( + "cannot handle overlapping indices; use " + "IntervalIndex.get_indexer_non_unique" + ) with pytest.raises(InvalidIndexError, match=msg): index.get_indexer([0, 2]) - @pytest.mark.parametrize('query, expected', [ - ([-0.5], ([-1], [0])), - ([0], ([0], [])), - ([0.5], ([0], [])), - ([1], ([0, 1], [])), - ([1.5], ([0, 1], [])), - ([2], ([0, 1, 2], [])), - ([2.5], ([1, 2], [])), - ([3], ([2], [])), - ([3.5], ([2], [])), - ([4], ([-1], [0])), - ([4.5], ([-1], [0])), - ([1, 2], ([0, 1, 0, 1, 2], [])), - ([1, 2, 3], ([0, 1, 0, 1, 2, 2], [])), - ([1, 2, 3, 4], ([0, 1, 0, 1, 2, 2, -1], [3])), - ([1, 2, 3, 4, 2], ([0, 1, 0, 1, 2, 2, -1, 0, 1, 2], [3]))]) + @pytest.mark.parametrize( + "query, expected", + [ + ([-0.5], ([-1], [0])), + ([0], ([0], [])), + ([0.5], ([0], [])), + ([1], ([0, 1], [])), + ([1.5], ([0, 1], [])), + ([2], ([0, 1, 2], [])), + ([2.5], ([1, 2], [])), + ([3], ([2], [])), + ([3.5], ([2], [])), + ([4], ([-1], [0])), + ([4.5], ([-1], [0])), + ([1, 2], ([0, 1, 0, 1, 2], [])), + ([1, 2, 3], ([0, 1, 0, 1, 2, 2], [])), + ([1, 2, 3, 4], ([0, 1, 0, 1, 2, 2, -1], [3])), + ([1, 2, 3, 4, 2], ([0, 1, 0, 1, 2, 2, -1, 0, 1, 2], [3])), + ], + ) def test_get_indexer_non_unique_with_int_and_float(self, query, expected): tuples = [(0, 2.5), (1, 3), (2, 4)] - index = IntervalIndex.from_tuples(tuples, closed='left') + index = IntervalIndex.from_tuples(tuples, closed="left") result_indexer, result_missing = index.get_indexer_non_unique(query) - expected_indexer = np.array(expected[0], dtype='intp') - expected_missing = np.array(expected[1], dtype='intp') + expected_indexer = np.array(expected[0], dtype="intp") + expected_missing = np.array(expected[1], dtype="intp") tm.assert_numpy_array_equal(result_indexer, expected_indexer) tm.assert_numpy_array_equal(result_missing, expected_missing) @@ -244,17 +253,17 @@ def test_get_indexer_non_unique_with_int_and_float(self, query, expected): def test_contains_dunder(self): - index = IntervalIndex.from_arrays([0, 1], [1, 2], closed='right') + index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right") # __contains__ requires perfect matches to intervals. 
assert 0 not in index assert 1 not in index assert 2 not in index - assert Interval(0, 1, closed='right') in index - assert Interval(0, 2, closed='right') not in index - assert Interval(0, 0.5, closed='right') not in index - assert Interval(3, 5, closed='right') not in index - assert Interval(-1, 0, closed='left') not in index - assert Interval(0, 1, closed='left') not in index - assert Interval(0, 1, closed='both') not in index + assert Interval(0, 1, closed="right") in index + assert Interval(0, 2, closed="right") not in index + assert Interval(0, 0.5, closed="right") not in index + assert Interval(3, 5, closed="right") not in index + assert Interval(-1, 0, closed="left") not in index + assert Interval(0, 1, closed="left") not in index + assert Interval(0, 1, closed="both") not in index diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 572fe5fbad100..b102444b4ec9c 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -6,22 +6,27 @@ from pandas.core.dtypes.common import is_integer from pandas import ( - DateOffset, Interval, IntervalIndex, Timedelta, Timestamp, date_range, - interval_range, timedelta_range) + DateOffset, + Interval, + IntervalIndex, + Timedelta, + Timestamp, + date_range, + interval_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.tseries.offsets import Day -@pytest.fixture(scope='class', params=[None, 'foo']) +@pytest.fixture(scope="class", params=[None, "foo"]) def name(request): return request.param class TestIntervalRange: - - @pytest.mark.parametrize('freq, periods', [ - (1, 100), (2.5, 40), (5, 20), (25, 4)]) + @pytest.mark.parametrize("freq, periods", [(1, 100), (2.5, 40), (5, 20), (25, 4)]) def test_constructor_numeric(self, closed, name, freq, periods): start, end = 0, 100 breaks = np.arange(101, step=freq) @@ -29,106 +34,127 @@ def test_constructor_numeric(self, closed, name, freq, periods): # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods result = interval_range( - start=start, end=end, periods=periods, name=name, closed=closed) + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'US/Eastern']) - @pytest.mark.parametrize('freq, periods', [ - ('D', 364), ('2D', 182), ('22D18H', 16), ('M', 11)]) + @pytest.mark.parametrize("tz", [None, "US/Eastern"]) + @pytest.mark.parametrize( + "freq, periods", [("D", 364), ("2D", 182), ("22D18H", 16), ("M", 11)] + ) def test_constructor_timestamp(self, closed, name, freq, periods, tz): - start, end = Timestamp('20180101', tz=tz), Timestamp('20181231', tz=tz) + start, end = Timestamp("20180101", tz=tz), Timestamp("20181231", tz=tz) breaks = date_range(start=start, 
end=end, freq=freq) expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods if not breaks.freq.isAnchored() and tz is None: # matches expected only for non-anchored offsets and tz naive # (anchored/DST transitions cause unequal spacing in expected) - result = interval_range(start=start, end=end, periods=periods, - name=name, closed=closed) + result = interval_range( + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq, periods', [ - ('D', 100), ('2D12H', 40), ('5D', 20), ('25D', 4)]) + @pytest.mark.parametrize( + "freq, periods", [("D", 100), ("2D12H", 40), ("5D", 20), ("25D", 4)] + ) def test_constructor_timedelta(self, closed, name, freq, periods): - start, end = Timedelta('0 days'), Timedelta('100 days') + start, end = Timedelta("0 days"), Timedelta("100 days") breaks = timedelta_range(start=start, end=end, freq=freq) expected = IntervalIndex.from_breaks(breaks, name=name, closed=closed) # defined from start/end/freq result = interval_range( - start=start, end=end, freq=freq, name=name, closed=closed) + start=start, end=end, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from start/periods/freq result = interval_range( - start=start, periods=periods, freq=freq, name=name, closed=closed) + start=start, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # defined from end/periods/freq result = interval_range( - end=end, periods=periods, freq=freq, name=name, closed=closed) + end=end, periods=periods, freq=freq, name=name, closed=closed + ) tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods result = interval_range( - start=start, end=end, periods=periods, name=name, closed=closed) + start=start, end=end, periods=periods, name=name, closed=closed + ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start, end, freq, expected_endpoint', [ - (0, 10, 3, 9), - (0, 10, 1.5, 9), - (0.5, 10, 3, 9.5), - (Timedelta('0D'), Timedelta('10D'), '2D4H', Timedelta('8D16H')), - (Timestamp('2018-01-01'), - Timestamp('2018-02-09'), - 'MS', - Timestamp('2018-02-01')), - (Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-20', tz='US/Eastern'), - '5D12H', - Timestamp('2018-01-17 12:00:00', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "start, end, freq, expected_endpoint", + [ + (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), + (Timedelta("0D"), Timedelta("10D"), "2D4H", Timedelta("8D16H")), + ( + Timestamp("2018-01-01"), + Timestamp("2018-02-09"), + "MS", + Timestamp("2018-02-01"), + ), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-20", tz="US/Eastern"), + "5D12H", + 
Timestamp("2018-01-17 12:00:00", tz="US/Eastern"), + ), + ], + ) def test_early_truncation(self, start, end, freq, expected_endpoint): # index truncates early if freq causes end to be skipped result = interval_range(start=start, end=end, freq=freq) result_endpoint = result.right[-1] assert result_endpoint == expected_endpoint - @pytest.mark.parametrize('start, end, freq', [ - (0.5, None, None), - (None, 4.5, None), - (0.5, None, 1.5), - (None, 6.5, 1.5)]) + @pytest.mark.parametrize( + "start, end, freq", + [(0.5, None, None), (None, 4.5, None), (0.5, None, 1.5), (None, 6.5, 1.5)], + ) def test_no_invalid_float_truncation(self, start, end, freq): # GH 21161 if freq is None: @@ -140,13 +166,21 @@ def test_no_invalid_float_truncation(self, start, end, freq): result = interval_range(start=start, end=end, periods=4, freq=freq) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('start, mid, end', [ - (Timestamp('2018-03-10', tz='US/Eastern'), - Timestamp('2018-03-10 23:30:00', tz='US/Eastern'), - Timestamp('2018-03-12', tz='US/Eastern')), - (Timestamp('2018-11-03', tz='US/Eastern'), - Timestamp('2018-11-04 00:30:00', tz='US/Eastern'), - Timestamp('2018-11-05', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "start, mid, end", + [ + ( + Timestamp("2018-03-10", tz="US/Eastern"), + Timestamp("2018-03-10 23:30:00", tz="US/Eastern"), + Timestamp("2018-03-12", tz="US/Eastern"), + ), + ( + Timestamp("2018-11-03", tz="US/Eastern"), + Timestamp("2018-11-04 00:30:00", tz="US/Eastern"), + Timestamp("2018-11-05", tz="US/Eastern"), + ), + ], + ) def test_linspace_dst_transition(self, start, mid, end): # GH 20976: linspace behavior defined from start/end/periods # accounts for the hour gained/lost during DST transition @@ -154,9 +188,9 @@ def test_linspace_dst_transition(self, start, mid, end): expected = IntervalIndex.from_breaks([start, mid, end]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', [2, 2.0]) - @pytest.mark.parametrize('end', [10, 10.0]) - @pytest.mark.parametrize('start', [0, 0.0]) + @pytest.mark.parametrize("freq", [2, 2.0]) + @pytest.mark.parametrize("end", [10, 10.0]) + @pytest.mark.parametrize("start", [0, 0.0]) def test_float_subtype(self, start, end, freq): # Has float subtype if any of start/end/freq are float, even if all # resulting endpoints can safely be upcast to integers @@ -164,25 +198,25 @@ def test_float_subtype(self, start, end, freq): # defined from start/end/freq index = interval_range(start=start, end=end, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(start + end + freq) else 'float64' + expected = "int64" if is_integer(start + end + freq) else "float64" assert result == expected # defined from start/periods/freq index = interval_range(start=start, periods=5, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(start + freq) else 'float64' + expected = "int64" if is_integer(start + freq) else "float64" assert result == expected # defined from end/periods/freq index = interval_range(end=end, periods=5, freq=freq) result = index.dtype.subtype - expected = 'int64' if is_integer(end + freq) else 'float64' + expected = "int64" if is_integer(end + freq) else "float64" assert result == expected # GH 20976: linspace behavior defined from start/end/periods index = interval_range(start=start, end=end, periods=5) result = index.dtype.subtype - expected = 'int64' if is_integer(start + end) else 'float64' + expected = "int64" if is_integer(start + end) else "float64" assert result == expected 
def test_constructor_coverage(self): @@ -192,19 +226,23 @@ def test_constructor_coverage(self): tm.assert_index_equal(result, expected) # equivalent timestamp-like start/end - start, end = Timestamp('2017-01-01'), Timestamp('2017-01-15') + start, end = Timestamp("2017-01-01"), Timestamp("2017-01-15") expected = interval_range(start=start, end=end) - result = interval_range(start=start.to_pydatetime(), - end=end.to_pydatetime()) + result = interval_range(start=start.to_pydatetime(), end=end.to_pydatetime()) tm.assert_index_equal(result, expected) result = interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) # equivalent freq with timestamp - equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1), - DateOffset(days=1)] + equiv_freq = [ + "D", + Day(), + Timedelta(days=1), + timedelta(days=1), + DateOffset(days=1), + ] for freq in equiv_freq: result = interval_range(start=start, end=end, freq=freq) tm.assert_index_equal(result, expected) @@ -213,23 +251,24 @@ def test_constructor_coverage(self): start, end = Timedelta(days=1), Timedelta(days=10) expected = interval_range(start=start, end=end) - result = interval_range(start=start.to_pytimedelta(), - end=end.to_pytimedelta()) + result = interval_range(start=start.to_pytimedelta(), end=end.to_pytimedelta()) tm.assert_index_equal(result, expected) result = interval_range(start=start.asm8, end=end.asm8) tm.assert_index_equal(result, expected) # equivalent freq with timedelta - equiv_freq = ['D', Day(), Timedelta(days=1), timedelta(days=1)] + equiv_freq = ["D", Day(), Timedelta(days=1), timedelta(days=1)] for freq in equiv_freq: result = interval_range(start=start, end=end, freq=freq) tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the four parameters: start, end, periods, and freq, ' - 'exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): interval_range(start=0) @@ -248,67 +287,69 @@ def test_errors(self): interval_range(start=0, end=5, periods=6, freq=1.5) # mixed units - msg = 'start, end, freq need to be type compatible' + msg = "start, end, freq need to be type compatible" with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=Timestamp('20130101'), freq=2) + interval_range(start=0, end=Timestamp("20130101"), freq=2) with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=Timedelta('1 day'), freq=2) + interval_range(start=0, end=Timedelta("1 day"), freq=2) with pytest.raises(TypeError, match=msg): - interval_range(start=0, end=10, freq='D') + interval_range(start=0, end=10, freq="D") with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), end=10, freq='D') + interval_range(start=Timestamp("20130101"), end=10, freq="D") with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), - end=Timedelta('1 day'), freq='D') + interval_range( + start=Timestamp("20130101"), end=Timedelta("1 day"), freq="D" + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timestamp('20130101'), - end=Timestamp('20130110'), freq=2) + interval_range( + start=Timestamp("20130101"), end=Timestamp("20130110"), freq=2 + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timedelta('1 day'), end=10, freq='D') + interval_range(start=Timedelta("1 day"), end=10, freq="D") with pytest.raises(TypeError, match=msg): - 
interval_range(start=Timedelta('1 day'), - end=Timestamp('20130110'), freq='D') + interval_range( + start=Timedelta("1 day"), end=Timestamp("20130110"), freq="D" + ) with pytest.raises(TypeError, match=msg): - interval_range(start=Timedelta('1 day'), - end=Timedelta('10 days'), freq=2) + interval_range(start=Timedelta("1 day"), end=Timedelta("10 days"), freq=2) # invalid periods - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - interval_range(start=0, periods='foo') + interval_range(start=0, periods="foo") # invalid start - msg = 'start must be numeric or datetime-like, got foo' + msg = "start must be numeric or datetime-like, got foo" with pytest.raises(ValueError, match=msg): - interval_range(start='foo', periods=10) + interval_range(start="foo", periods=10) # invalid end - msg = r'end must be numeric or datetime-like, got \(0, 1\]' + msg = r"end must be numeric or datetime-like, got \(0, 1\]" with pytest.raises(ValueError, match=msg): interval_range(end=Interval(0, 1), periods=10) # invalid freq for datetime-like - msg = 'freq must be numeric or convertible to DateOffset, got foo' + msg = "freq must be numeric or convertible to DateOffset, got foo" with pytest.raises(ValueError, match=msg): - interval_range(start=0, end=10, freq='foo') + interval_range(start=0, end=10, freq="foo") with pytest.raises(ValueError, match=msg): - interval_range(start=Timestamp('20130101'), periods=10, freq='foo') + interval_range(start=Timestamp("20130101"), periods=10, freq="foo") with pytest.raises(ValueError, match=msg): - interval_range(end=Timedelta('1 day'), periods=10, freq='foo') + interval_range(end=Timedelta("1 day"), periods=10, freq="foo") # mixed tz - start = Timestamp('2017-01-01', tz='US/Eastern') - end = Timestamp('2017-01-07', tz='US/Pacific') - msg = 'Start and end cannot both be tz-aware with different timezones' + start = Timestamp("2017-01-01", tz="US/Eastern") + end = Timestamp("2017-01-07", tz="US/Pacific") + msg = "Start and end cannot both be tz-aware with different timezones" with pytest.raises(TypeError, match=msg): interval_range(start=start, end=end) diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index a3868a5675177..b7104242b5ccc 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -14,13 +14,15 @@ def skipif_32bit(param): Skip parameters in a parametrize on 32bit systems. Specifically used here to skip leaf_size parameters related to GH 23440. 
""" - marks = pytest.mark.skipif(compat.is_platform_32bit(), - reason='GH 23440: int type mismatch on 32bit') + marks = pytest.mark.skipif( + compat.is_platform_32bit(), reason="GH 23440: int type mismatch on 32bit" + ) return pytest.param(param, marks=marks) @pytest.fixture( - scope='class', params=['int32', 'int64', 'float32', 'float64', 'uint64']) + scope="class", params=["int32", "int64", "float32", "float64", "uint64"] +) def dtype(request): return request.param @@ -34,28 +36,30 @@ def leaf_size(request): return request.param -@pytest.fixture(params=[ - np.arange(5, dtype='int64'), - np.arange(5, dtype='int32'), - np.arange(5, dtype='uint64'), - np.arange(5, dtype='float64'), - np.arange(5, dtype='float32'), - np.array([0, 1, 2, 3, 4, np.nan], dtype='float64'), - np.array([0, 1, 2, 3, 4, np.nan], dtype='float32')]) +@pytest.fixture( + params=[ + np.arange(5, dtype="int64"), + np.arange(5, dtype="int32"), + np.arange(5, dtype="uint64"), + np.arange(5, dtype="float64"), + np.arange(5, dtype="float32"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), + np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"), + ] +) def tree(request, leaf_size): left = request.param return IntervalTree(left, left + 2, leaf_size=leaf_size) class TestIntervalTree: - def test_get_loc(self, tree): result = tree.get_loc(1) - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(tree.get_loc(2)) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): @@ -63,30 +67,29 @@ def test_get_loc(self, tree): def test_get_indexer(self, tree): result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) - expected = np.array([0, 4, -1], dtype='intp') + expected = np.array([0, 4, -1], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): tree.get_indexer(np.array([3.0])) def test_get_indexer_non_unique(self, tree): - indexer, missing = tree.get_indexer_non_unique( - np.array([1.0, 2.0, 6.5])) + indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) result = indexer[:1] - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(indexer[1:3]) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = np.sort(indexer[3:]) - expected = np.array([-1], dtype='intp') + expected = np.array([-1], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = missing - expected = np.array([2], dtype='intp') + expected = np.array([2], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_duplicates(self, dtype): @@ -94,7 +97,7 @@ def test_duplicates(self, dtype): tree = IntervalTree(left, left + 1) result = np.sort(tree.get_loc(0.5)) - expected = np.array([0, 1, 2], dtype='intp') + expected = np.array([0, 1, 2], dtype="intp") tm.assert_numpy_array_equal(result, expected) with pytest.raises(KeyError): @@ -102,31 +105,31 @@ def test_duplicates(self, dtype): indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) result = np.sort(indexer) - expected = np.array([0, 1, 2], dtype='intp') + expected = np.array([0, 1, 2], dtype="intp") tm.assert_numpy_array_equal(result, expected) result = missing - expected = np.array([], dtype='intp') + expected = np.array([], dtype="intp") 
tm.assert_numpy_array_equal(result, expected) def test_get_loc_closed(self, closed): tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), - (1, tree.open_right)]: + for p, errors in [(0, tree.open_left), (1, tree.open_right)]: if errors: with pytest.raises(KeyError): tree.get_loc(p) else: result = tree.get_loc(p) - expected = np.array([0], dtype='intp') + expected = np.array([0], dtype="intp") tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('leaf_size', [ - skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000]) + @pytest.mark.parametrize( + "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] + ) def test_get_indexer_closed(self, closed, leaf_size): - x = np.arange(1000, dtype='float64') - found = x.astype('intp') - not_found = (-1 * np.ones(1000)).astype('intp') + x = np.arange(1000, dtype="float64") + found = x.astype("intp") + not_found = (-1 * np.ones(1000)).astype("intp") tree = IntervalTree(x, x + 0.5, closed=closed, leaf_size=leaf_size) tm.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) @@ -137,40 +140,48 @@ def test_get_indexer_closed(self, closed, leaf_size): expected = found if tree.closed_right else not_found tm.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) - @pytest.mark.parametrize('left, right, expected', [ - (np.array([0, 1, 4]), np.array([2, 3, 5]), True), - (np.array([0, 1, 2]), np.array([5, 4, 3]), True), - (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), - (np.array([0, 2, 4]), np.array([1, 3, 5]), False), - (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False)]) - @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + @pytest.mark.parametrize( + "left, right, expected", + [ + (np.array([0, 1, 4]), np.array([2, 3, 5]), True), + (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), + (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), + ], + ) + @pytest.mark.parametrize("order", map(list, permutations(range(3)))) def test_is_overlapping(self, closed, order, left, right, expected): # GH 23309 tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping assert result is expected - @pytest.mark.parametrize('order', map(list, permutations(range(3)))) + @pytest.mark.parametrize("order", map(list, permutations(range(3)))) def test_is_overlapping_endpoints(self, closed, order): """shared endpoints are marked as overlapping""" # GH 23309 left, right = np.arange(3), np.arange(1, 4) tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping - expected = closed == 'both' + expected = closed == "both" assert result is expected - @pytest.mark.parametrize('left, right', [ - (np.array([], dtype='int64'), np.array([], dtype='int64')), - (np.array([0], dtype='int64'), np.array([1], dtype='int64')), - (np.array([np.nan]), np.array([np.nan])), - (np.array([np.nan] * 3), np.array([np.nan] * 3))]) + @pytest.mark.parametrize( + "left, right", + [ + (np.array([], dtype="int64"), np.array([], dtype="int64")), + (np.array([0], dtype="int64"), np.array([1], dtype="int64")), + (np.array([np.nan]), np.array([np.nan])), + (np.array([np.nan] * 3), np.array([np.nan] * 3)), + ], + ) def test_is_overlapping_trivial(self, closed, left, right): # GH 23309 tree = IntervalTree(left, right, closed=closed) assert tree.is_overlapping is False - 
@pytest.mark.skipif(compat.is_platform_32bit(), reason='GH 23440') + @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") def test_construction_overflow(self): # GH 25485 left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 9ab0d15cbe6a3..89e733c30b1e3 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -5,7 +5,7 @@ import pandas.util.testing as tm -@pytest.fixture(scope='class', params=[None, 'foo']) +@pytest.fixture(scope="class", params=[None, "foo"]) def name(request): return request.param @@ -15,17 +15,15 @@ def sort(request): return request.param -def monotonic_index(start, end, dtype='int64', closed='right'): - return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), - closed=closed) +def monotonic_index(start, end, dtype="int64", closed="right"): + return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) -def empty_index(dtype='int64', closed='right'): +def empty_index(dtype="int64", closed="right"): return IntervalIndex(np.array([], dtype=dtype), closed=closed) class TestIntervalIndex: - def test_union(self, closed, sort): index = monotonic_index(0, 11, closed=closed) other = monotonic_index(5, 13, closed=closed) @@ -45,12 +43,12 @@ def test_union(self, closed, sort): tm.assert_index_equal(index.union(index[:1], sort=sort), index) # GH 19101: empty result, same dtype - index = empty_index(dtype='int64', closed=closed) + index = empty_index(dtype="int64", closed=closed) result = index.union(index, sort=sort) tm.assert_index_equal(result, index) # GH 19101: empty result, different dtypes - other = empty_index(dtype='float64', closed=closed) + other = empty_index(dtype="float64", closed=closed) result = index.union(other, sort=sort) tm.assert_index_equal(result, index) @@ -73,12 +71,12 @@ def test_intersection(self, closed, sort): # GH 19101: empty result, same dtype other = monotonic_index(300, 314, closed=closed) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes - other = monotonic_index(300, 314, dtype='float64', closed=closed) + other = monotonic_index(300, 314, dtype="float64", closed=closed) result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) @@ -111,9 +109,7 @@ def test_intersection(self, closed, sort): tm.assert_index_equal(result, expected) def test_difference(self, closed, sort): - index = IntervalIndex.from_arrays([1, 0, 3, 2], - [1, 2, 3, 4], - closed=closed) + index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) result = index.difference(index[:1], sort=sort) expected = index[1:] if sort is None: @@ -122,12 +118,13 @@ def test_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.difference(index, sort=sort) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) result = index.difference(other, sort=sort) tm.assert_index_equal(result, expected) 
@@ -141,19 +138,21 @@ def test_symmetric_difference(self, closed, sort): # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) - expected = empty_index(dtype='int64', closed=closed) + expected = empty_index(dtype="int64", closed=closed) if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) # GH 19101: empty result, different dtypes - other = IntervalIndex.from_arrays(index.left.astype('float64'), - index.right, closed=closed) + other = IntervalIndex.from_arrays( + index.left.astype("float64"), index.right, closed=closed + ) result = index.symmetric_difference(other, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('op_name', [ - 'union', 'intersection', 'difference', 'symmetric_difference']) + @pytest.mark.parametrize( + "op_name", ["union", "intersection", "difference", "symmetric_difference"] + ) @pytest.mark.parametrize("sort", [None, False]) def test_set_incompatible_types(self, closed, op_name, sort): index = monotonic_index(0, 11, closed=closed) @@ -161,24 +160,28 @@ def test_set_incompatible_types(self, closed, op_name, sort): # TODO: standardize return type of non-union setops type(self vs other) # non-IntervalIndex - if op_name == 'difference': + if op_name == "difference": expected = index else: - expected = getattr(index.astype('O'), op_name)(Index([1, 2, 3])) + expected = getattr(index.astype("O"), op_name)(Index([1, 2, 3])) result = set_op(Index([1, 2, 3]), sort=sort) tm.assert_index_equal(result, expected) # mixed closed - msg = ('can only do set operations between two IntervalIndex objects ' - 'that are closed on the same side') - for other_closed in {'right', 'left', 'both', 'neither'} - {closed}: + msg = ( + "can only do set operations between two IntervalIndex objects " + "that are closed on the same side" + ) + for other_closed in {"right", "left", "both", "neither"} - {closed}: other = monotonic_index(0, 11, closed=other_closed) with pytest.raises(ValueError, match=msg): set_op(other, sort=sort) # GH 19016: incompatible dtypes - other = interval_range(Timestamp('20180101'), periods=9, closed=closed) - msg = ('can only do {op} between two IntervalIndex objects that have ' - 'compatible dtypes').format(op=op_name) + other = interval_range(Timestamp("20180101"), periods=9, closed=closed) + msg = ( + "can only do {op} between two IntervalIndex objects that have " + "compatible dtypes" + ).format(op=op_name) with pytest.raises(TypeError, match=msg): set_op(other, sort=sort) diff --git a/pandas/tests/indexes/multi/conftest.py b/pandas/tests/indexes/multi/conftest.py index 307772347e8f5..acaea4ff96ff5 100644 --- a/pandas/tests/indexes/multi/conftest.py +++ b/pandas/tests/indexes/multi/conftest.py @@ -9,30 +9,36 @@ def idx(): # a MultiIndex used to test the general functionality of the # general functionality of this object - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ['first', 'second'] - mi = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, verify_integrity=False) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) return mi @pytest.fixture def idx_dup(): # compare 
tests/indexes/multi/conftest.py - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 1, 0, 1, 1]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index_names = ['first', 'second'] - mi = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=index_names, verify_integrity=False) + index_names = ["first", "second"] + mi = MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=index_names, + verify_integrity=False, + ) return mi @@ -40,7 +46,7 @@ def idx_dup(): def index_names(): # names that match those in the idx fixture for testing equality of # names assigned to the idx - return ['first', 'second'] + return ["first", "second"] @pytest.fixture @@ -52,7 +58,7 @@ def holder(): @pytest.fixture def compat_props(): # a MultiIndex must have these properties associated with it - return ['shape', 'ndim', 'size'] + return ["shape", "ndim", "size"] @pytest.fixture @@ -61,10 +67,9 @@ def narrow_multi_index(): Return a MultiIndex that is narrower than the display (<80 characters). """ n = 1000 - ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) - dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) - return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], - names=['a', 'b', 'dti']) + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) + return pd.MultiIndex.from_arrays([ci, ci.codes + 9, dti], names=["a", "b", "dti"]) @pytest.fixture @@ -73,8 +78,8 @@ def wide_multi_index(): Return a MultiIndex that is wider than the display (>80 characters). """ n = 1000 - ci = pd.CategoricalIndex(list('a' * n) + (['abc'] * n)) - dti = pd.date_range('2000-01-01', freq='s', periods=n * 2) + ci = pd.CategoricalIndex(list("a" * n) + (["abc"] * n)) + dti = pd.date_range("2000-01-01", freq="s", periods=n * 2) levels = [ci, ci.codes + 9, dti, dti, dti] - names = ['a', 'b', 'dti_1', 'dti_2', 'dti_3'] + names = ["a", "b", "dti_1", "dti_2", "dti_3"] return pd.MultiIndex.from_arrays(levels, names=names) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index f886d78da6da2..7f5d57db8da88 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -37,11 +37,12 @@ def test_truncate(): major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) result = index.truncate(before=1) - assert 'foo' not in result.levels[0] + assert "foo" not in result.levels[0] assert 1 in result.levels[0] result = index.truncate(after=1) @@ -57,16 +58,16 @@ def test_truncate(): def test_where(): - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) msg = r"\.where is not supported for MultiIndex operations" with pytest.raises(NotImplementedError, match=msg): i.where(True) -@pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) +@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where_array_like(klass): - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) cond = [False, True] msg = r"\.where is not supported for 
MultiIndex operations" with pytest.raises(NotImplementedError, match=msg): @@ -78,19 +79,17 @@ def test_where_array_like(klass): def test_reorder_levels(idx): # this blows up - with pytest.raises(IndexError, match='^Too many levels'): + with pytest.raises(IndexError, match="^Too many levels"): idx.reorder_levels([2, 1, 0]) def test_numpy_repeat(): reps = 2 numbers = [1, 2, 3] - names = np.array(['foo', 'bar']) + names = np.array(["foo", "bar"]) - m = MultiIndex.from_product([ - numbers, names], names=names) - expected = MultiIndex.from_product([ - numbers, names.repeat(reps)], names=names) + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) tm.assert_index_equal(np.repeat(m, reps), expected) msg = "the 'axis' parameter is not supported" @@ -100,36 +99,50 @@ def test_numpy_repeat(): def test_append_mixed_dtypes(): # GH 13660 - dti = date_range('2011-01-01', freq='M', periods=3, ) - dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern') - pi = period_range('2011-01', freq='M', periods=3) - - mi = MultiIndex.from_arrays([[1, 2, 3], - [1.1, np.nan, 3.3], - ['a', 'b', 'c'], - dti, dti_tz, pi]) + dti = date_range("2011-01-01", freq="M", periods=3) + dti_tz = date_range("2011-01-01", freq="M", periods=3, tz="US/Eastern") + pi = period_range("2011-01", freq="M", periods=3) + + mi = MultiIndex.from_arrays( + [[1, 2, 3], [1.1, np.nan, 3.3], ["a", "b", "c"], dti, dti_tz, pi] + ) assert mi.nlevels == 6 res = mi.append(mi) - exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], - [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], - ['a', 'b', 'c', 'a', 'b', 'c'], - dti.append(dti), - dti_tz.append(dti_tz), - pi.append(pi)]) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ["a", "b", "c", "a", "b", "c"], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi), + ] + ) tm.assert_index_equal(res, exp) - other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 'z'], - ['x', 'y', 'z'], ['x', 'y', 'z'], - ['x', 'y', 'z'], ['x', 'y', 'z']]) + other = MultiIndex.from_arrays( + [ + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ["x", "y", "z"], + ] + ) res = mi.append(other) - exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], - [1.1, np.nan, 3.3, 'x', 'y', 'z'], - ['a', 'b', 'c', 'x', 'y', 'z'], - dti.append(pd.Index(['x', 'y', 'z'])), - dti_tz.append(pd.Index(['x', 'y', 'z'])), - pi.append(pd.Index(['x', 'y', 'z']))]) + exp = MultiIndex.from_arrays( + [ + [1, 2, 3, "x", "y", "z"], + [1.1, np.nan, 3.3, "x", "y", "z"], + ["a", "b", "c", "x", "y", "z"], + dti.append(pd.Index(["x", "y", "z"])), + dti_tz.append(pd.Index(["x", "y", "z"])), + pi.append(pd.Index(["x", "y", "z"])), + ] + ) tm.assert_index_equal(res, exp) @@ -162,41 +175,46 @@ def test_take_invalid_kwargs(idx): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_take_fill_value(): # GH 12631 - vals = [['A', 'B'], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]] - idx = pd.MultiIndex.from_product(vals, names=['str', 'dt']) + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) result = idx.take(np.array([1, 0, -1])) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - ('B', pd.Timestamp('2011-01-02'))] - expected = 
pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - (np.nan, pd.NaT)] - expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + (np.nan, pd.NaT), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - exp_vals = [('A', pd.Timestamp('2011-01-02')), - ('A', pd.Timestamp('2011-01-01')), - ('B', pd.Timestamp('2011-01-02'))] - expected = pd.MultiIndex.from_tuples(exp_vals, names=['str', 'dt']) + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + exp_vals = [ + ("A", pd.Timestamp("2011-01-02")), + ("A", pd.Timestamp("2011-01-01")), + ("B", pd.Timestamp("2011-01-02")), + ] + expected = pd.MultiIndex.from_tuples(exp_vals, names=["str", "dt"]) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -209,8 +227,14 @@ def test_take_fill_value(): def test_iter(idx): result = list(idx) - expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] + expected = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] assert result == expected @@ -237,7 +261,7 @@ def test_map(idx): # we don't infer UInt64 if isinstance(index, pd.UInt64Index): - expected = index.astype('int64') + expected = index.astype("int64") else: expected = index @@ -249,7 +273,9 @@ def test_map(idx): "mapper", [ lambda values, idx: {i: e for e, i in zip(values, idx)}, - lambda values, idx: pd.Series(values, idx)]) + lambda values, idx: pd.Series(values, idx), + ], +) def test_map_dictlike(idx, mapper): if isinstance(idx, (pd.CategoricalIndex, pd.IntervalIndex)): @@ -259,7 +285,7 @@ def test_map_dictlike(idx, mapper): # we don't infer to UInt64 for a dict if isinstance(idx, pd.UInt64Index) and isinstance(identity, dict): - expected = idx.astype('int64') + expected = idx.astype("int64") else: expected = idx @@ -272,13 +298,34 @@ def test_map_dictlike(idx, mapper): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('func', [ - np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10, - np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin, - np.arccos, np.arctan, np.sinh, np.cosh, np.tanh, - np.arcsinh, np.arccosh, np.arctanh, np.deg2rad, - np.rad2deg -], ids=lambda func: func.__name__) +@pytest.mark.parametrize( + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda func: func.__name__, +) def 
test_numpy_ufuncs(idx, func): # test ufuncs of numpy. see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -288,18 +335,24 @@ def test_numpy_ufuncs(idx, func): msg = "'tuple' object has no attribute '{}'".format(func.__name__) else: expected_exception = TypeError - msg = ("loop of ufunc does not support argument 0 of type tuple which" - " has no callable {} method").format(func.__name__) + msg = ( + "loop of ufunc does not support argument 0 of type tuple which" + " has no callable {} method" + ).format(func.__name__) with pytest.raises(expected_exception, match=msg): func(idx) -@pytest.mark.parametrize('func', [ - np.isfinite, np.isinf, np.isnan, np.signbit -], ids=lambda func: func.__name__) +@pytest.mark.parametrize( + "func", + [np.isfinite, np.isinf, np.isnan, np.signbit], + ids=lambda func: func.__name__, +) def test_numpy_type_funcs(idx, func): - msg = ("ufunc '{}' not supported for the input types, and the inputs" - " could not be safely coerced to any supported types according to" - " the casting rule ''safe''").format(func.__name__) + msg = ( + "ufunc '{}' not supported for the input types, and the inputs" + " could not be safely coerced to any supported types according to" + " the casting rule ''safe''" + ).format(func.__name__) with pytest.raises(TypeError, match=msg): func(idx) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 5ee44fde730d7..4adcdd0112b26 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -8,7 +8,7 @@ def test_astype(idx): expected = idx.copy() - actual = idx.astype('O') + actual = idx.astype("O") assert_copy(actual.levels, expected.levels) assert_copy(actual.codes, expected.codes) assert [level.name for level in actual.levels] == list(expected.names) @@ -17,14 +17,14 @@ def test_astype(idx): idx.astype(np.dtype(int)) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_astype_category(idx, ordered): # GH 18630 - msg = '> 1 ndim Categorical are not supported at this time' + msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): idx.astype(CategoricalDtype(ordered=ordered)) if ordered is False: # dtype='category' defaults to ordered=False, so only test once with pytest.raises(NotImplementedError, match=msg): - idx.astype('category') + idx.astype("category") diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index bd194cab8e204..b02f87dc4aacb 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -45,8 +45,8 @@ def test_boolean_context_compat2(): # boolean context compat # GH7897 - i1 = MultiIndex.from_tuples([('A', 1), ('A', 2)]) - i2 = MultiIndex.from_tuples([('A', 1), ('A', 3)]) + i1 = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + i2 = MultiIndex.from_tuples([("A", 1), ("A", 3)]) common = i1.intersection(i2) with pytest.raises(ValueError): @@ -54,8 +54,8 @@ def test_boolean_context_compat2(): def test_inplace_mutation_resets_values(): - levels = [['a', 'b', 'c'], [4]] - levels2 = [[1, 2, 3], ['a']] + levels = [["a", "b", "c"], [4]] + levels2 = [[1, 2, 3], ["a"]] codes = [[0, 1, 0, 2, 2, 0], [0, 0, 0, 0, 0, 0]] mi1 = MultiIndex(levels=levels, codes=codes) @@ -82,7 +82,7 @@ def test_inplace_mutation_resets_values(): # Make sure label setting works too codes2 = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]] exp_values = np.empty((6,), 
dtype=object) - exp_values[:] = [(1, 'a')] * 6 + exp_values[:] = [(1, "a")] * 6 # Must be 1d array of tuples assert exp_values.shape == (6,) diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 7cab05660ac49..1b6177ede30ec 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -13,12 +13,13 @@ def test_constructor_single_level(): - result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + result = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) assert isinstance(result, MultiIndex) - expected = Index(['foo', 'bar', 'baz', 'qux'], name='first') + expected = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_index_equal(result.levels[0], expected) - assert result.names == ['first'] + assert result.names == ["first"] def test_constructor_no_levels(): @@ -35,18 +36,20 @@ def test_constructor_no_levels(): def test_constructor_nonhashable_names(): # GH 20527 - levels = [[1, 2], ['one', 'two']] + levels = [[1, 2], ["one", "two"]] codes = [[0, 0, 1, 1], [0, 1, 0, 1]] - names = (['foo'], ['bar']) + names = (["foo"], ["bar"]) msg = r"MultiIndex\.name must be a hashable type" with pytest.raises(TypeError, match=msg): MultiIndex(levels=levels, codes=codes, names=names) # With .rename() - mi = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=('foo', 'bar')) - renamed = [['foor'], ['barr']] + mi = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=("foo", "bar"), + ) + renamed = [["foor"], ["barr"]] with pytest.raises(TypeError, match=msg): mi.rename(names=renamed) @@ -63,70 +66,76 @@ def test_constructor_mismatched_codes_levels(idx): with pytest.raises(ValueError, match=msg): MultiIndex(levels=levels, codes=codes) - length_error = (r"On level 0, code max \(3\) >= length of level \(1\)\." - " NOTE: this index is in an inconsistent state") + length_error = ( + r"On level 0, code max \(3\) >= length of level \(1\)\." + " NOTE: this index is in an inconsistent state" + ) label_error = r"Unequal code lengths: \[4, 2\]" code_value_error = r"On level 0, code value \(-2\) < -1" # important to check that it's looking at the right thing. 
with pytest.raises(ValueError, match=length_error): - MultiIndex(levels=[['a'], ['b']], - codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 1, 2, 3], [0, 3, 4, 1]]) with pytest.raises(ValueError, match=label_error): - MultiIndex(levels=[['a'], ['b']], codes=[[0, 0, 0, 0], [0, 0]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, 0, 0, 0], [0, 0]]) # external API with pytest.raises(ValueError, match=length_error): - idx.copy().set_levels([['a'], ['b']]) + idx.copy().set_levels([["a"], ["b"]]) with pytest.raises(ValueError, match=label_error): idx.copy().set_codes([[0, 0, 0, 0], [0, 0]]) # test set_codes with verify_integrity=False # the setting should not raise any value error - idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], - verify_integrity=False) + idx.copy().set_codes(codes=[[0, 0, 0, 0], [0, 0]], verify_integrity=False) # code value smaller than -1 with pytest.raises(ValueError, match=code_value_error): - MultiIndex(levels=[['a'], ['b']], codes=[[0, -2], [0, 0]]) + MultiIndex(levels=[["a"], ["b"]], codes=[[0, -2], [0, 0]]) def test_na_levels(): # GH26408 # test if codes are re-assigned value -1 for levels # with mising values (NaN, NaT, None) - result = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[0, -1, 1, 2, 3, 4]]) - expected = MultiIndex(levels=[[np.nan, None, pd.NaT, 128, 2]], - codes=[[-1, -1, -1, -1, 3, 4]]) + result = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, None, pd.NaT, 128, 2]], codes=[[-1, -1, -1, -1, 3, 4]] + ) tm.assert_index_equal(result, expected) - result = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[0, -1, 1, 2, 3, 4]]) - expected = MultiIndex(levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[-1, -1, 1, -1, 3, -1]]) + result = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[0, -1, 1, 2, 3, 4]] + ) + expected = MultiIndex( + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[-1, -1, 1, -1, 3, -1]] + ) tm.assert_index_equal(result, expected) # verify set_levels and set_codes result = MultiIndex( - levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]]).set_levels( - [[np.nan, 's', pd.NaT, 128, None]]) + levels=[[1, 2, 3, 4, 5]], codes=[[0, -1, 1, 2, 3, 4]] + ).set_levels([[np.nan, "s", pd.NaT, 128, None]]) tm.assert_index_equal(result, expected) result = MultiIndex( - levels=[[np.nan, 's', pd.NaT, 128, None]], - codes=[[1, 2, 2, 2, 2, 2]]).set_codes( - [[0, -1, 1, 2, 3, 4]]) + levels=[[np.nan, "s", pd.NaT, 128, None]], codes=[[1, 2, 2, 2, 2, 2]] + ).set_codes([[0, -1, 1, 2, 3, 4]]) tm.assert_index_equal(result, expected) def test_labels_deprecated(idx): # GH23752 with tm.assert_produces_warning(FutureWarning): - MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - labels=[[0, 1, 2, 3]], names=['first']) + MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], + labels=[[0, 1, 2, 3]], + names=["first"], + ) with tm.assert_produces_warning(FutureWarning): idx.labels @@ -135,8 +144,7 @@ def test_copy_in_constructor(): levels = np.array(["a", "b", "c"]) codes = np.array([1, 1, 2, 0, 0, 1, 1]) val = codes[0] - mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], - copy=True) + mi = MultiIndex(levels=[levels, levels], codes=[codes, codes], copy=True) assert mi.codes[0][0] == val codes[0] = 15 assert mi.codes[0][0] == val @@ -149,24 +157,27 @@ def test_copy_in_constructor(): # from_arrays # ---------------------------------------------------------------------------- def 
test_from_arrays(idx): - arrays = [np.asarray(lev).take(level_codes) - for lev, level_codes in zip(idx.levels, idx.codes)] + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] # list of arrays as input result = MultiIndex.from_arrays(arrays, names=idx.names) tm.assert_index_equal(result, idx) # infer correctly - result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')], - ['a', 'b']]) - assert result.levels[0].equals(Index([Timestamp('20130101')])) - assert result.levels[1].equals(Index(['a', 'b'])) + result = MultiIndex.from_arrays([[pd.NaT, Timestamp("20130101")], ["a", "b"]]) + assert result.levels[0].equals(Index([Timestamp("20130101")])) + assert result.levels[1].equals(Index(["a", "b"])) def test_from_arrays_iterator(idx): # GH 18434 - arrays = [np.asarray(lev).take(level_codes) - for lev, level_codes in zip(idx.levels, idx.codes)] + arrays = [ + np.asarray(lev).take(level_codes) + for lev, level_codes in zip(idx.levels, idx.codes) + ] # iterator as input result = MultiIndex.from_arrays(iter(arrays), names=idx.names) @@ -179,8 +190,10 @@ def test_from_arrays_iterator(idx): def test_from_arrays_tuples(idx): - arrays = tuple(tuple(np.asarray(lev).take(level_codes)) - for lev, level_codes in zip(idx.levels, idx.codes)) + arrays = tuple( + tuple(np.asarray(lev).take(level_codes)) + for lev, level_codes in zip(idx.levels, idx.codes) + ) # tuple of tuples as input result = MultiIndex.from_arrays(arrays, names=idx.names) @@ -188,10 +201,8 @@ def test_from_arrays_tuples(idx): def test_from_arrays_index_series_datetimetz(): - idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') - idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3, - tz='Asia/Tokyo') + idx1 = pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3, tz="Asia/Tokyo") result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -204,8 +215,8 @@ def test_from_arrays_index_series_datetimetz(): def test_from_arrays_index_series_timedelta(): - idx1 = pd.timedelta_range('1 days', freq='D', periods=3) - idx2 = pd.timedelta_range('2 hours', freq='H', periods=3) + idx1 = pd.timedelta_range("1 days", freq="D", periods=3) + idx2 = pd.timedelta_range("2 hours", freq="H", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -218,8 +229,8 @@ def test_from_arrays_index_series_timedelta(): def test_from_arrays_index_series_period(): - idx1 = pd.period_range('2011-01-01', freq='D', periods=3) - idx2 = pd.period_range('2015-01-01', freq='H', periods=3) + idx1 = pd.period_range("2011-01-01", freq="D", periods=3) + idx2 = pd.period_range("2015-01-01", freq="H", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) tm.assert_index_equal(result.get_level_values(1), idx2) @@ -232,11 +243,10 @@ def test_from_arrays_index_series_period(): def test_from_arrays_index_datetimelike_mixed(): - idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern') - idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3) - idx3 = pd.timedelta_range('1 days', freq='D', periods=3) - idx4 = pd.period_range('2011-01-01', freq='D', periods=3) + idx1 = pd.date_range("2015-01-01 10:00", 
freq="D", periods=3, tz="US/Eastern") + idx2 = pd.date_range("2015-01-01 10:00", freq="H", periods=3) + idx3 = pd.timedelta_range("1 days", freq="D", periods=3) + idx4 = pd.period_range("2011-01-01", freq="D", periods=3) result = pd.MultiIndex.from_arrays([idx1, idx2, idx3, idx4]) tm.assert_index_equal(result.get_level_values(0), idx1) @@ -244,10 +254,9 @@ def test_from_arrays_index_datetimelike_mixed(): tm.assert_index_equal(result.get_level_values(2), idx3) tm.assert_index_equal(result.get_level_values(3), idx4) - result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), - pd.Series(idx2), - pd.Series(idx3), - pd.Series(idx4)]) + result2 = pd.MultiIndex.from_arrays( + [pd.Series(idx1), pd.Series(idx2), pd.Series(idx3), pd.Series(idx4)] + ) tm.assert_index_equal(result2.get_level_values(0), idx1) tm.assert_index_equal(result2.get_level_values(1), idx2) tm.assert_index_equal(result2.get_level_values(2), idx3) @@ -258,10 +267,8 @@ def test_from_arrays_index_datetimelike_mixed(): def test_from_arrays_index_series_categorical(): # GH13743 - idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=False) - idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=True) + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=True) result = pd.MultiIndex.from_arrays([idx1, idx2]) tm.assert_index_equal(result.get_level_values(0), idx1) @@ -283,41 +290,60 @@ def test_from_arrays_empty(): MultiIndex.from_arrays(arrays=[]) # 1 level - result = MultiIndex.from_arrays(arrays=[[]], names=['A']) + result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) assert isinstance(result, MultiIndex) - expected = Index([], name='A') + expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) # N levels for N in [2, 3]: arrays = [[]] * N - names = list('ABC')[:N] + names = list("ABC")[:N] result = MultiIndex.from_arrays(arrays=arrays, names=names) - expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, - names=names) + expected = MultiIndex(levels=[[]] * N, codes=[[]] * N, names=names) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('invalid_sequence_of_arrays', [ - 1, [1], [1, 2], [[1], 2], [1, [2]], 'a', ['a'], ['a', 'b'], [['a'], 'b'], - (1,), (1, 2), ([1], 2), (1, [2]), 'a', ('a',), ('a', 'b'), (['a'], 'b'), - [(1,), 2], [1, (2,)], [('a',), 'b'], - ((1,), 2), (1, (2,)), (('a',), 'b') -]) +@pytest.mark.parametrize( + "invalid_sequence_of_arrays", + [ + 1, + [1], + [1, 2], + [[1], 2], + [1, [2]], + "a", + ["a"], + ["a", "b"], + [["a"], "b"], + (1,), + (1, 2), + ([1], 2), + (1, [2]), + "a", + ("a",), + ("a", "b"), + (["a"], "b"), + [(1,), 2], + [1, (2,)], + [("a",), "b"], + ((1,), 2), + (1, (2,)), + (("a",), "b"), + ], +) def test_from_arrays_invalid_input(invalid_sequence_of_arrays): msg = "Input must be a list / sequence of array-likes" with pytest.raises(TypeError, match=msg): MultiIndex.from_arrays(arrays=invalid_sequence_of_arrays) -@pytest.mark.parametrize('idx1, idx2', [ - ([1, 2, 3], ['a', 'b']), - ([], ['a', 'b']), - ([1, 2, 3], []) -]) +@pytest.mark.parametrize( + "idx1, idx2", [([1, 2, 3], ["a", "b"]), ([], ["a", "b"]), ([1, 2, 3], [])] +) def test_from_arrays_different_lengths(idx1, idx2): # see gh-13599 - msg = '^all arrays must be same length$' + msg = "^all arrays must be same length$" with pytest.raises(ValueError, match=msg): MultiIndex.from_arrays([idx1, idx2]) @@ -326,40 +352,39 @@ def 
test_from_arrays_different_lengths(idx1, idx2): # from_tuples # ---------------------------------------------------------------------------- def test_from_tuples(): - msg = 'Cannot infer number of levels from empty list' + msg = "Cannot infer number of levels from empty list" with pytest.raises(TypeError, match=msg): MultiIndex.from_tuples([]) - expected = MultiIndex(levels=[[1, 3], [2, 4]], - codes=[[0, 1], [0, 1]], - names=['a', 'b']) + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) # input tuples - result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) + result = MultiIndex.from_tuples(((1, 2), (3, 4)), names=["a", "b"]) tm.assert_index_equal(result, expected) def test_from_tuples_iterator(): # GH 18434 # input iterator for tuples - expected = MultiIndex(levels=[[1, 3], [2, 4]], - codes=[[0, 1], [0, 1]], - names=['a', 'b']) + expected = MultiIndex( + levels=[[1, 3], [2, 4]], codes=[[0, 1], [0, 1]], names=["a", "b"] + ) - result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=['a', 'b']) + result = MultiIndex.from_tuples(zip([1, 3], [2, 4]), names=["a", "b"]) tm.assert_index_equal(result, expected) # input non-iterables - msg = 'Input must be a list / sequence of tuple-likes.' + msg = "Input must be a list / sequence of tuple-likes." with pytest.raises(TypeError, match=msg): MultiIndex.from_tuples(0) def test_from_tuples_empty(): # GH 16777 - result = MultiIndex.from_tuples([], names=['a', 'b']) - expected = MultiIndex.from_arrays(arrays=[[], []], - names=['a', 'b']) + result = MultiIndex.from_tuples([], names=["a", "b"]) + expected = MultiIndex.from_arrays(arrays=[[], []], names=["a", "b"]) tm.assert_index_equal(result, expected) @@ -374,17 +399,18 @@ def test_tuples_with_name_string(): li = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] msg = "Names should be list-like for a MultiIndex" with pytest.raises(ValueError, match=msg): - pd.Index(li, name='abc') + pd.Index(li, name="abc") with pytest.raises(ValueError, match=msg): - pd.Index(li, name='a') + pd.Index(li, name="a") def test_from_tuples_with_tuple_label(): # GH 15457 - expected = pd.DataFrame([[2, 1, 2], [4, (1, 2), 3]], - columns=['a', 'b', 'c']).set_index(['a', 'b']) - idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=('a', 'b')) - result = pd.DataFrame([2, 3], columns=['c'], index=idx) + expected = pd.DataFrame( + [[2, 1, 2], [4, (1, 2), 3]], columns=["a", "b", "c"] + ).set_index(["a", "b"]) + idx = pd.MultiIndex.from_tuples([(2, 1), (4, (1, 2))], names=("a", "b")) + result = pd.DataFrame([2, 3], columns=["c"], index=idx) tm.assert_frame_equal(expected, result) @@ -399,79 +425,64 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): - result = MultiIndex.from_product([[]], names=['A']) - expected = pd.Index([], name='A') + result = MultiIndex.from_product([[]], names=["A"]) + expected = pd.Index([], name="A") tm.assert_index_equal(result.levels[0], expected) -@pytest.mark.parametrize('first, second', [ - ([], []), - (['foo', 'bar', 'baz'], []), - ([], ['a', 'b', 'c']), -]) +@pytest.mark.parametrize( + "first, second", [([], []), (["foo", "bar", "baz"], []), ([], ["a", "b", "c"])] +) def test_from_product_empty_two_levels(first, second): - names = ['A', 'B'] + names = ["A", "B"] result = MultiIndex.from_product([first, second], names=names) - expected = MultiIndex(levels=[first, second], - codes=[[], []], names=names) + expected = MultiIndex(levels=[first, second], codes=[[], []], names=names) tm.assert_index_equal(result, expected) 
-@pytest.mark.parametrize('N', list(range(4))) +@pytest.mark.parametrize("N", list(range(4))) def test_from_product_empty_three_levels(N): # GH12258 - names = ['A', 'B', 'C'] + names = ["A", "B", "C"] lvl2 = list(range(N)) result = MultiIndex.from_product([[], lvl2, []], names=names) - expected = MultiIndex(levels=[[], lvl2, []], - codes=[[], [], []], names=names) + expected = MultiIndex(levels=[[], lvl2, []], codes=[[], [], []], names=names) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('invalid_input', [ - 1, - [1], - [1, 2], - [[1], 2], - 'a', - ['a'], - ['a', 'b'], - [['a'], 'b'], -]) +@pytest.mark.parametrize( + "invalid_input", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] +) def test_from_product_invalid_input(invalid_input): - msg = (r"Input must be a list / sequence of iterables|" - "Input must be list-like") + msg = r"Input must be a list / sequence of iterables|" "Input must be list-like" with pytest.raises(TypeError, match=msg): MultiIndex.from_product(iterables=invalid_input) def test_from_product_datetimeindex(): - dt_index = date_range('2000-01-01', periods=2) + dt_index = date_range("2000-01-01", periods=2) mi = pd.MultiIndex.from_product([[1, 2], dt_index]) - etalon = construct_1d_object_array_from_listlike([ - (1, pd.Timestamp('2000-01-01')), - (1, pd.Timestamp('2000-01-02')), - (2, pd.Timestamp('2000-01-01')), - (2, pd.Timestamp('2000-01-02')), - ]) + etalon = construct_1d_object_array_from_listlike( + [ + (1, pd.Timestamp("2000-01-01")), + (1, pd.Timestamp("2000-01-02")), + (2, pd.Timestamp("2000-01-01")), + (2, pd.Timestamp("2000-01-02")), + ] + ) tm.assert_numpy_array_equal(mi.values, etalon) -@pytest.mark.parametrize('ordered', [False, True]) -@pytest.mark.parametrize('f', [ - lambda x: x, - lambda x: pd.Series(x), - lambda x: x.values -]) +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize("f", [lambda x: x, lambda x: pd.Series(x), lambda x: x.values]) def test_from_product_index_series_categorical(ordered, f): # GH13743 - first = ['foo', 'bar'] + first = ["foo", "bar"] - idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), - ordered=ordered) - expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"), - categories=list("bac"), - ordered=ordered) + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), ordered=ordered) + expected = pd.CategoricalIndex( + list("abcaab") + list("abcaab"), categories=list("bac"), ordered=ordered + ) result = pd.MultiIndex.from_product([first, f(idx)]) tm.assert_index_equal(result.get_level_values(1), expected) @@ -479,14 +490,22 @@ def test_from_product_index_series_categorical(ordered, f): def test_from_product(): - first = ['foo', 'bar', 'buz'] - second = ['a', 'b', 'c'] - names = ['first', 'second'] + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] result = MultiIndex.from_product([first, second], names=names) - tuples = [('foo', 'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), - ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), - ('buz', 'c')] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] expected = MultiIndex.from_tuples(tuples, names=names) tm.assert_index_equal(result, expected) @@ -494,12 +513,20 @@ def test_from_product(): def test_from_product_iterator(): # GH 18434 - first = ['foo', 'bar', 'buz'] - second = ['a', 'b', 'c'] - names = ['first', 'second'] - tuples = [('foo', 
'a'), ('foo', 'b'), ('foo', 'c'), ('bar', 'a'), - ('bar', 'b'), ('bar', 'c'), ('buz', 'a'), ('buz', 'b'), - ('buz', 'c')] + first = ["foo", "bar", "buz"] + second = ["a", "b", "c"] + names = ["first", "second"] + tuples = [ + ("foo", "a"), + ("foo", "b"), + ("foo", "c"), + ("bar", "a"), + ("bar", "b"), + ("bar", "c"), + ("buz", "a"), + ("buz", "b"), + ("buz", "c"), + ] expected = MultiIndex.from_tuples(tuples, names=names) # iterator as input @@ -517,28 +544,38 @@ def test_create_index_existing_name(idx): # GH11193, when an existing index is passed, and a new name is not # specified, the new index should inherit the previous object name index = idx - index.names = ['foo', 'bar'] + index.names = ["foo", "bar"] result = pd.Index(index) expected = Index( - Index([ - ('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object' + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", ), - names=['foo', 'bar'] + names=["foo", "bar"], ) tm.assert_index_equal(result, expected) - result = pd.Index(index, names=['A', 'B']) + result = pd.Index(index, names=["A", "B"]) expected = Index( - Index([ - ('foo', 'one'), ('foo', 'two'), - ('bar', 'one'), ('baz', 'two'), - ('qux', 'one'), ('qux', 'two')], - dtype='object' + Index( + [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ], + dtype="object", ), - names=['A', 'B'] + names=["A", "B"], ) tm.assert_index_equal(result, expected) @@ -548,45 +585,56 @@ def test_create_index_existing_name(idx): # ---------------------------------------------------------------------------- def test_from_frame(): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=['L1', 'L2']) - expected = pd.MultiIndex.from_tuples([('a', 'a'), ('a', 'b'), - ('b', 'a'), ('b', 'b')], - names=['L1', 'L2']) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], columns=["L1", "L2"] + ) + expected = pd.MultiIndex.from_tuples( + [("a", "a"), ("a", "b"), ("b", "a"), ("b", "b")], names=["L1", "L2"] + ) result = pd.MultiIndex.from_frame(df) tm.assert_index_equal(expected, result) -@pytest.mark.parametrize('non_frame', [ - pd.Series([1, 2, 3, 4]), - [1, 2, 3, 4], - [[1, 2], [3, 4], [5, 6]], - pd.Index([1, 2, 3, 4]), - np.array([[1, 2], [3, 4], [5, 6]]), - 27 -]) +@pytest.mark.parametrize( + "non_frame", + [ + pd.Series([1, 2, 3, 4]), + [1, 2, 3, 4], + [[1, 2], [3, 4], [5, 6]], + pd.Index([1, 2, 3, 4]), + np.array([[1, 2], [3, 4], [5, 6]]), + 27, + ], +) def test_from_frame_error(non_frame): # GH 22420 - with pytest.raises(TypeError, match='Input must be a DataFrame'): + with pytest.raises(TypeError, match="Input must be a DataFrame"): pd.MultiIndex.from_frame(non_frame) def test_from_frame_dtype_fidelity(): # GH 22420 - df = pd.DataFrame(OrderedDict([ - ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), - ('a', [1, 1, 1, 2, 2, 2]), - ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), - ('c', ['x', 'x', 'y', 'z', 'x', 'y']) - ])) + df = pd.DataFrame( + OrderedDict( + [ + ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), + ("a", [1, 1, 1, 2, 2, 2]), + ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), + ("c", ["x", "x", "y", "z", "x", "y"]), + ] + ) + ) original_dtypes = df.dtypes.to_dict() - expected_mi = pd.MultiIndex.from_arrays([ - pd.date_range('19910905', periods=6, 
tz='US/Eastern'), - [1, 1, 1, 2, 2, 2], - pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - ['x', 'x', 'y', 'z', 'x', 'y'] - ], names=['dates', 'a', 'b', 'c']) + expected_mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) mi = pd.MultiIndex.from_frame(df) mi_dtypes = {name: mi.levels[i].dtype for i, name in enumerate(mi.names)} @@ -594,28 +642,31 @@ def test_from_frame_dtype_fidelity(): assert original_dtypes == mi_dtypes -@pytest.mark.parametrize('names_in,names_out', [ - (None, [('L1', 'x'), ('L2', 'y')]), - (['x', 'y'], ['x', 'y']), -]) +@pytest.mark.parametrize( + "names_in,names_out", [(None, [("L1", "x"), ("L2", "y")]), (["x", "y"], ["x", "y"])] +) def test_from_frame_valid_names(names_in, names_out): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), - ('L2', 'y')])) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) mi = pd.MultiIndex.from_frame(df, names=names_in) assert mi.names == names_out -@pytest.mark.parametrize('names,expected_error_msg', [ - ('bad_input', "Names should be list-like for a MultiIndex"), - (['a', 'b', 'c'], - "Length of names must match number of levels in MultiIndex") -]) +@pytest.mark.parametrize( + "names,expected_error_msg", + [ + ("bad_input", "Names should be list-like for a MultiIndex"), + (["a", "b", "c"], "Length of names must match number of levels in MultiIndex"), + ], +) def test_from_frame_invalid_names(names, expected_error_msg): # GH 22420 - df = pd.DataFrame([['a', 'a'], ['a', 'b'], ['b', 'a'], ['b', 'b']], - columns=pd.MultiIndex.from_tuples([('L1', 'x'), - ('L2', 'y')])) + df = pd.DataFrame( + [["a", "a"], ["a", "b"], ["b", "a"], ["b", "b"]], + columns=pd.MultiIndex.from_tuples([("L1", "x"), ("L2", "y")]), + ) with pytest.raises(ValueError, match=expected_error_msg): pd.MultiIndex.from_frame(df, names=names) diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 4b6934d445fd0..21b71613f00f0 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -9,44 +9,42 @@ def test_contains_top_level(): - midx = MultiIndex.from_product([['A', 'B'], [1, 2]]) - assert 'A' in midx - assert 'A' not in midx._engine + midx = MultiIndex.from_product([["A", "B"], [1, 2]]) + assert "A" in midx + assert "A" not in midx._engine def test_contains_with_nat(): # MI with a NaT - mi = MultiIndex(levels=[['C'], - pd.date_range('2012-01-01', periods=5)], - codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], - names=[None, 'B']) - assert ('C', pd.Timestamp('2012-01-01')) in mi + mi = MultiIndex( + levels=[["C"], pd.date_range("2012-01-01", periods=5)], + codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, "B"], + ) + assert ("C", pd.Timestamp("2012-01-01")) in mi for val in mi.values: assert val in mi def test_contains(idx): - assert ('foo', 'two') in idx - assert ('bar', 'two') not in idx + assert ("foo", "two") in idx + assert ("bar", "two") not in idx assert None not in idx @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") def test_isin_nan_pypy(): - idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) - 
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), - np.array([False, True])) - tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), - np.array([False, True])) + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, True]) + ) def test_isin(): - values = [('foo', 2), ('bar', 3), ('quux', 4)] + values = [("foo", 2), ("bar", 3), ("quux", 4)] - idx = MultiIndex.from_arrays([ - ['qux', 'baz', 'foo', 'bar'], - np.arange(4) - ]) + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) result = idx.isin(values) expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) @@ -60,18 +58,17 @@ def test_isin(): @pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") def test_isin_nan_not_pypy(): - idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]]) - tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]), - np.array([False, False])) - tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]), - np.array([False, False])) + idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) + tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False])) + tm.assert_numpy_array_equal( + idx.isin([("bar", float("nan"))]), np.array([False, False]) + ) def test_isin_level_kwarg(): - idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange( - 4)]) + idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) - vals_0 = ['foo', 'bar', 'quux'] + vals_0 = ["foo", "bar", "quux"] vals_1 = [2, 3, 10] expected = np.array([False, False, True, True]) @@ -84,8 +81,7 @@ def test_isin_level_kwarg(): msg = "Too many levels: Index has only 2 levels, not 6" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=5) - msg = ("Too many levels: Index has only 2 levels, -5 is not a valid level" - " number") + msg = "Too many levels: Index has only 2 levels, -5 is not a valid level" " number" with pytest.raises(IndexError, match=msg): idx.isin(vals_0, level=-5) @@ -94,11 +90,11 @@ def test_isin_level_kwarg(): with pytest.raises(KeyError, match=r"'Level -1\.0 not found'"): idx.isin(vals_1, level=-1.0) with pytest.raises(KeyError, match="'Level A not found'"): - idx.isin(vals_1, level='A') + idx.isin(vals_1, level="A") - idx.names = ['A', 'B'] - tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level='A')) - tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level='B')) + idx.names = ["A", "B"] + tm.assert_numpy_array_equal(expected, idx.isin(vals_0, level="A")) + tm.assert_numpy_array_equal(expected, idx.isin(vals_1, level="B")) with pytest.raises(KeyError, match="'Level C not found'"): - idx.isin(vals_1, level='C') + idx.isin(vals_1, level="C") diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 877904a91b953..3fc73dd05bc72 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -21,7 +21,7 @@ def test_to_numpy(idx): def test_to_frame(): - tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False) @@ -32,11 +32,11 @@ def test_to_frame(): expected.index = index tm.assert_frame_equal(result, expected) - tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 
'two')] - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] + index = MultiIndex.from_tuples(tuples, names=["first", "second"]) result = index.to_frame(index=False) expected = DataFrame(tuples) - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) result = index.to_frame() @@ -45,31 +45,33 @@ def test_to_frame(): # See GH-22580 index = MultiIndex.from_tuples(tuples) - result = index.to_frame(index=False, name=['first', 'second']) + result = index.to_frame(index=False, name=["first", "second"]) expected = DataFrame(tuples) - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) - result = index.to_frame(name=['first', 'second']) + result = index.to_frame(name=["first", "second"]) expected.index = index - expected.columns = ['first', 'second'] + expected.columns = ["first", "second"] tm.assert_frame_equal(result, expected) msg = "'name' must be a list / sequence of column names." with pytest.raises(TypeError, match=msg): - index.to_frame(name='first') + index.to_frame(name="first") msg = "'name' should have same length as number of levels on index." with pytest.raises(ValueError, match=msg): - index.to_frame(name=['first']) + index.to_frame(name=["first"]) # Tests for datetime index - index = MultiIndex.from_product([range(5), - pd.date_range('20130101', periods=3)]) + index = MultiIndex.from_product([range(5), pd.date_range("20130101", periods=3)]) result = index.to_frame(index=False) expected = DataFrame( - {0: np.repeat(np.arange(5, dtype='int64'), 3), - 1: np.tile(pd.date_range('20130101', periods=3), 5)}) + { + 0: np.repeat(np.arange(5, dtype="int64"), 3), + 1: np.tile(pd.date_range("20130101", periods=3), 5), + } + ) tm.assert_frame_equal(result, expected) result = index.to_frame() @@ -77,34 +79,43 @@ def test_to_frame(): tm.assert_frame_equal(result, expected) # See GH-22580 - result = index.to_frame(index=False, name=['first', 'second']) + result = index.to_frame(index=False, name=["first", "second"]) expected = DataFrame( - {'first': np.repeat(np.arange(5, dtype='int64'), 3), - 'second': np.tile(pd.date_range('20130101', periods=3), 5)}) + { + "first": np.repeat(np.arange(5, dtype="int64"), 3), + "second": np.tile(pd.date_range("20130101", periods=3), 5), + } + ) tm.assert_frame_equal(result, expected) - result = index.to_frame(name=['first', 'second']) + result = index.to_frame(name=["first", "second"]) expected.index = index tm.assert_frame_equal(result, expected) def test_to_frame_dtype_fidelity(): # GH 22420 - mi = pd.MultiIndex.from_arrays([ - pd.date_range('19910905', periods=6, tz='US/Eastern'), - [1, 1, 1, 2, 2, 2], - pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True), - ['x', 'x', 'y', 'z', 'x', 'y'] - ], names=['dates', 'a', 'b', 'c']) - original_dtypes = {name: mi.levels[i].dtype - for i, name in enumerate(mi.names)} - - expected_df = pd.DataFrame(OrderedDict([ - ('dates', pd.date_range('19910905', periods=6, tz='US/Eastern')), - ('a', [1, 1, 1, 2, 2, 2]), - ('b', pd.Categorical(['a', 'a', 'b', 'b', 'c', 'c'], ordered=True)), - ('c', ['x', 'x', 'y', 'z', 'x', 'y']) - ])) + mi = pd.MultiIndex.from_arrays( + [ + pd.date_range("19910905", periods=6, tz="US/Eastern"), + [1, 1, 1, 2, 2, 2], + pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True), + ["x", "x", "y", "z", "x", "y"], + ], + names=["dates", "a", "b", "c"], + ) + original_dtypes = {name: 
mi.levels[i].dtype for i, name in enumerate(mi.names)} + + expected_df = pd.DataFrame( + OrderedDict( + [ + ("dates", pd.date_range("19910905", periods=6, tz="US/Eastern")), + ("a", [1, 1, 1, 2, 2, 2]), + ("b", pd.Categorical(["a", "a", "b", "b", "c", "c"], ordered=True)), + ("c", ["x", "x", "y", "z", "x", "y"]), + ] + ) + ) df = mi.to_frame(index=False) df_dtypes = df.dtypes.to_dict() @@ -114,48 +125,61 @@ def test_to_frame_dtype_fidelity(): def test_to_frame_resulting_column_order(): # GH 22420 - expected = ['z', 0, 'a'] - mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z'], - ['q', 'w', 'e']], names=expected) + expected = ["z", 0, "a"] + mi = pd.MultiIndex.from_arrays( + [["a", "b", "c"], ["x", "y", "z"], ["q", "w", "e"]], names=expected + ) result = mi.to_frame().columns.tolist() assert result == expected def test_to_hierarchical(): - index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( - 2, 'two')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + index = MultiIndex.from_tuples([(1, "one"), (1, "two"), (2, "one"), (2, "two")]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(3) - expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]]) + expected = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[ + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1], + ], + ) tm.assert_index_equal(result, expected) assert result.names == index.names # K > 1 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(3, 2) - expected = MultiIndex(levels=[[1, 2], ['one', 'two']], - codes=[[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]) + expected = MultiIndex( + levels=[[1, 2], ["one", "two"]], + codes=[ + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], + ], + ) tm.assert_index_equal(result, expected) assert result.names == index.names # non-sorted - index = MultiIndex.from_tuples([(2, 'c'), (1, 'b'), - (2, 'a'), (2, 'b')], - names=['N1', 'N2']) + index = MultiIndex.from_tuples( + [(2, "c"), (1, "b"), (2, "a"), (2, "b")], names=["N1", "N2"] + ) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.to_hierarchical(2) - expected = MultiIndex.from_tuples([(2, 'c'), (2, 'c'), (1, 'b'), - (1, 'b'), - (2, 'a'), (2, 'a'), - (2, 'b'), (2, 'b')], - names=['N1', 'N2']) + expected = MultiIndex.from_tuples( + [ + (2, "c"), + (2, "c"), + (1, "b"), + (1, "b"), + (2, "a"), + (2, "a"), + (2, "b"), + (2, "b"), + ], + names=["N1", "N2"], + ) tm.assert_index_equal(result, expected) assert result.names == index.names @@ -166,9 +190,9 @@ def test_roundtrip_pickle_with_tz(): # GH 8367 # round-trip of timezone index = MultiIndex.from_product( - [[1, 2], ['a', 'b'], date_range('20130101', periods=3, - tz='US/Eastern') - ], names=['one', 'two', 'three']) + [[1, 2], ["a", "b"], date_range("20130101", periods=3, tz="US/Eastern")], + names=["one", "two", "three"], + ) unpickled = tm.round_trip_pickle(index) assert index.equal_levels(unpickled) @@ -178,7 +202,7 @@ def test_pickle(indices): unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) - original_name, indices.name = indices.name, 'foo' 
+ original_name, indices.name = indices.name, "foo" unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) indices.name = original_name @@ -205,7 +229,7 @@ def test_to_series_with_arguments(idx): # name kwarg idx = idx - s = idx.to_series(name='__test') + s = idx.to_series(name="__test") assert s.values is not idx.values assert s.index is not idx @@ -213,8 +237,16 @@ def test_to_series_with_arguments(idx): def test_to_flat_index(idx): - expected = pd.Index((('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two')), - tupleize_cols=False) + expected = pd.Index( + ( + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ), + tupleize_cols=False, + ) result = idx.to_flat_index() tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 17e81a348f186..35a5cccc0ec45 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -46,46 +46,49 @@ def test_view(idx): assert_multiindex_copied(i_view, idx) -@pytest.mark.parametrize('func', [copy, deepcopy]) +@pytest.mark.parametrize("func", [copy, deepcopy]) def test_copy_and_deepcopy(func): idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) idx_copy = func(idx) assert idx_copy is not idx assert idx_copy.equals(idx) -@pytest.mark.parametrize('deep', [True, False]) +@pytest.mark.parametrize("deep", [True, False]) def test_copy_method(deep): idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) idx_copy = idx.copy(deep=deep) assert idx_copy.equals(idx) -@pytest.mark.parametrize('deep', [True, False]) -@pytest.mark.parametrize('kwarg, value', [ - ('names', ['thrid', 'fourth']), - ('levels', [['foo2', 'bar2'], ['fizz2', 'buzz2']]), - ('codes', [[1, 0, 0, 0], [1, 1, 0, 0]]) -]) +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize( + "kwarg, value", + [ + ("names", ["thrid", "fourth"]), + ("levels", [["foo2", "bar2"], ["fizz2", "buzz2"]]), + ("codes", [[1, 0, 0, 0], [1, 1, 0, 0]]), + ], +) def test_copy_method_kwargs(deep, kwarg, value): # gh-12309: Check that the "name" argument as well other kwargs are honored idx = MultiIndex( - levels=[['foo', 'bar'], ['fizz', 'buzz']], + levels=[["foo", "bar"], ["fizz", "buzz"]], codes=[[0, 0, 0, 1], [0, 0, 1, 1]], - names=['first', 'second'] + names=["first", "second"], ) return - idx_copy = idx.copy(**{kwarg: value, 'deep': deep}) - if kwarg == 'names': + idx_copy = idx.copy(**{kwarg: value, "deep": deep}) + if kwarg == "names": assert getattr(idx_copy, kwarg) == value else: assert [list(i) for i in getattr(idx_copy, kwarg)] == value diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 555ed948f82a1..2c24c5bd57085 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -9,78 +9,81 @@ def test_drop(idx): - dropped = idx.drop([('foo', 'two'), ('qux', 'one')]) + dropped = idx.drop([("foo", "two"), ("qux", "one")]) - index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')]) + index = MultiIndex.from_tuples([("foo", "two"), ("qux", "one")]) dropped2 = idx.drop(index) 
expected = idx[[0, 2, 3, 5]] tm.assert_index_equal(dropped, expected) tm.assert_index_equal(dropped2, expected) - dropped = idx.drop(['bar']) + dropped = idx.drop(["bar"]) expected = idx[[0, 1, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop('foo') + dropped = idx.drop("foo") expected = idx[[2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - index = MultiIndex.from_tuples([('bar', 'two')]) + index = MultiIndex.from_tuples([("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): - idx.drop([('bar', 'two')]) + idx.drop([("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): idx.drop(index) with pytest.raises(KeyError, match=r"^'two'$"): - idx.drop(['foo', 'two']) + idx.drop(["foo", "two"]) # partially correct argument - mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')]) + mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) with pytest.raises(KeyError, match=r"^10$"): idx.drop(mixed_index) # error='ignore' - dropped = idx.drop(index, errors='ignore') + dropped = idx.drop(index, errors="ignore") expected = idx[[0, 1, 2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop(mixed_index, errors='ignore') + dropped = idx.drop(mixed_index, errors="ignore") expected = idx[[0, 1, 2, 3, 5]] tm.assert_index_equal(dropped, expected) - dropped = idx.drop(['foo', 'two'], errors='ignore') + dropped = idx.drop(["foo", "two"], errors="ignore") expected = idx[[2, 3, 4, 5]] tm.assert_index_equal(dropped, expected) # mixed partial / full drop - dropped = idx.drop(['foo', ('qux', 'one')]) + dropped = idx.drop(["foo", ("qux", "one")]) expected = idx[[2, 3, 5]] tm.assert_index_equal(dropped, expected) # mixed partial / full drop / error='ignore' - mixed_index = ['foo', ('qux', 'one'), 'two'] + mixed_index = ["foo", ("qux", "one"), "two"] with pytest.raises(KeyError, match=r"^'two'$"): idx.drop(mixed_index) - dropped = idx.drop(mixed_index, errors='ignore') + dropped = idx.drop(mixed_index, errors="ignore") expected = idx[[2, 3, 5]] tm.assert_index_equal(dropped, expected) def test_droplevel_with_names(idx): - index = idx[idx.get_loc('foo')] + index = idx[idx.get_loc("foo")] dropped = index.droplevel(0) - assert dropped.name == 'second' + assert dropped.name == "second" index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - names=['one', 'two', 'three']) + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) dropped = index.droplevel(0) - assert dropped.names == ('two', 'three') + assert dropped.names == ("two", "three") - dropped = index.droplevel('two') + dropped = index.droplevel("two") expected = index.droplevel(1) assert dropped.equals(expected) @@ -88,12 +91,15 @@ def test_droplevel_with_names(idx): def test_droplevel_list(): index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - names=['one', 'two', 'three']) - - dropped = index[:2].droplevel(['three', 'one']) + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + names=["one", "two", "three"], + ) + + dropped = index[:2].droplevel(["three", "one"]) expected = 
index[:2].droplevel(2).droplevel(0) assert dropped.equals(expected) @@ -101,27 +107,30 @@ def test_droplevel_list(): expected = index[:2] assert dropped.equals(expected) - msg = ("Cannot remove 3 levels from an index with 3 levels: at least one" - " level must be left") + msg = ( + "Cannot remove 3 levels from an index with 3 levels: at least one" + " level must be left" + ) with pytest.raises(ValueError, match=msg): - index[:2].droplevel(['one', 'two', 'three']) + index[:2].droplevel(["one", "two", "three"]) with pytest.raises(KeyError, match="'Level four not found'"): - index[:2].droplevel(['one', 'four']) + index[:2].droplevel(["one", "four"]) def test_drop_not_lexsorted(): # GH 12078 # define the lexsorted version of the multi-index - tuples = [('a', ''), ('b1', 'c1'), ('b2', 'c2')] - lexsorted_mi = MultiIndex.from_tuples(tuples, names=['b', 'c']) + tuples = [("a", ""), ("b1", "c1"), ("b2", "c2")] + lexsorted_mi = MultiIndex.from_tuples(tuples, names=["b", "c"]) assert lexsorted_mi.is_lexsorted() # and the not-lexsorted version - df = pd.DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], [1, 'b2', 'c2', 4]]) - df = df.pivot_table(index='a', columns=['b', 'c'], values='d') + df = pd.DataFrame( + columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]] + ) + df = df.pivot_table(index="a", columns=["b", "c"], values="d") df = df.reset_index() not_lexsorted_mi = df.columns assert not not_lexsorted_mi.is_lexsorted() @@ -129,5 +138,4 @@ def test_drop_not_lexsorted(): # compare the results tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): - tm.assert_index_equal(lexsorted_mi.drop('a'), - not_lexsorted_mi.drop('a')) + tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index b1eff00d07484..518bd093b23b1 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('names', [None, ['first', 'second']]) +@pytest.mark.parametrize("names", [None, ["first", "second"]]) def test_unique(names): mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names) @@ -17,15 +17,14 @@ def test_unique(names): exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names) tm.assert_index_equal(res, exp) - mi = MultiIndex.from_arrays([list('aaaa'), list('abab')], - names=names) + mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names) res = mi.unique() - exp = MultiIndex.from_arrays([list('aa'), list('ab')], names=mi.names) + exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names) tm.assert_index_equal(res, exp) - mi = MultiIndex.from_arrays([list('aaaa'), list('aaaa')], names=names) + mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names) res = mi.unique() - exp = MultiIndex.from_arrays([['a'], ['a']], names=mi.names) + exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names) tm.assert_index_equal(res, exp) # GH #20568 - empty MI @@ -35,22 +34,24 @@ def test_unique(names): def test_unique_datetimelike(): - idx1 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01', - '2015-01-01', 'NaT', 'NaT']) - idx2 = DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02', - '2015-01-02', 'NaT', '2015-01-01'], - tz='Asia/Tokyo') + idx1 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", 
"NaT"] + ) + idx2 = DatetimeIndex( + ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"], + tz="Asia/Tokyo", + ) result = MultiIndex.from_arrays([idx1, idx2]).unique() - eidx1 = DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT']) - eidx2 = DatetimeIndex(['2015-01-01', '2015-01-02', - 'NaT', '2015-01-01'], - tz='Asia/Tokyo') + eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"]) + eidx2 = DatetimeIndex( + ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo" + ) exp = MultiIndex.from_arrays([eidx1, eidx2]) tm.assert_index_equal(result, exp) -@pytest.mark.parametrize('level', [0, 'first', 1, 'second']) +@pytest.mark.parametrize("level", [0, "first", 1, "second"]) def test_unique_level(idx, level): # GH #17896 - with level= argument result = idx.unique(level=level) @@ -58,19 +59,18 @@ def test_unique_level(idx, level): tm.assert_index_equal(result, expected) # With already unique level - mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], - names=['first', 'second']) + mi = MultiIndex.from_arrays([[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"]) result = mi.unique(level=level) expected = mi.get_level_values(level) tm.assert_index_equal(result, expected) # With empty MI - mi = MultiIndex.from_arrays([[], []], names=['first', 'second']) + mi = MultiIndex.from_arrays([[], []], names=["first", "second"]) result = mi.unique(level=level) expected = mi.get_level_values(level) -@pytest.mark.parametrize('dropna', [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) def test_get_unique_index(idx, dropna): mi = idx[[0, 1, 0, 1, 1, 0, 0]] expected = mi._shallow_copy(mi[[0, 1]]) @@ -84,18 +84,15 @@ def test_duplicate_multiindex_codes(): # GH 17464 # Make sure that a MultiIndex with duplicate levels throws a ValueError with pytest.raises(ValueError): - mi = MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) + mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)]) # And that using set_levels with duplicate levels fails - mi = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], - [1, 2, 1, 2, 3]]) + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) with pytest.raises(ValueError): - mi.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], - inplace=True) + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) -@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], - [1, 'a', 1]]) +@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) def test_duplicate_level_names(names): # GH18872, GH19029 mi = MultiIndex.from_product([[0, 1]] * 3, names=names) @@ -115,14 +112,15 @@ def test_duplicate_level_names(names): def test_duplicate_meta_data(): # GH 10115 mi = MultiIndex( - levels=[[0, 1], [0, 1, 2]], - codes=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) - - for idx in [mi, - mi.set_names([None, None]), - mi.set_names([None, 'Num']), - mi.set_names(['Upper', 'Num']), ]: + levels=[[0, 1], [0, 1, 2]], codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) + + for idx in [ + mi, + mi.set_names([None, None]), + mi.set_names([None, "Num"]), + mi.set_names(["Upper", "Num"]), + ]: assert idx.has_duplicates assert idx.drop_duplicates().names == idx.names @@ -134,45 +132,49 @@ def test_has_duplicates(idx, idx_dup): assert idx_dup.is_unique is False assert idx_dup.has_duplicates is True - mi = MultiIndex(levels=[[0, 1], [0, 1, 2]], - codes=[[0, 0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 0, 1, 2]]) + mi = MultiIndex( + levels=[[0, 1], [0, 1, 2]], 
codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]] + ) assert mi.is_unique is False assert mi.has_duplicates is True # single instance of NaN - mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]], - codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]) + mi_nan = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]] + ) assert mi_nan.is_unique is True assert mi_nan.has_duplicates is False # multiple instances of NaN - mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]], - codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]) + mi_nan_dup = MultiIndex( + levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]] + ) assert mi_nan_dup.is_unique is False assert mi_nan_dup.has_duplicates is True def test_has_duplicates_from_tuples(): # GH 9075 - t = [('x', 'out', 'z', 5, 'y', 'in', 'z', 169), - ('x', 'out', 'z', 7, 'y', 'in', 'z', 119), - ('x', 'out', 'z', 9, 'y', 'in', 'z', 135), - ('x', 'out', 'z', 13, 'y', 'in', 'z', 145), - ('x', 'out', 'z', 14, 'y', 'in', 'z', 158), - ('x', 'out', 'z', 16, 'y', 'in', 'z', 122), - ('x', 'out', 'z', 17, 'y', 'in', 'z', 160), - ('x', 'out', 'z', 18, 'y', 'in', 'z', 180), - ('x', 'out', 'z', 20, 'y', 'in', 'z', 143), - ('x', 'out', 'z', 21, 'y', 'in', 'z', 128), - ('x', 'out', 'z', 22, 'y', 'in', 'z', 129), - ('x', 'out', 'z', 25, 'y', 'in', 'z', 111), - ('x', 'out', 'z', 28, 'y', 'in', 'z', 114), - ('x', 'out', 'z', 29, 'y', 'in', 'z', 121), - ('x', 'out', 'z', 31, 'y', 'in', 'z', 126), - ('x', 'out', 'z', 32, 'y', 'in', 'z', 155), - ('x', 'out', 'z', 33, 'y', 'in', 'z', 123), - ('x', 'out', 'z', 12, 'y', 'in', 'z', 144)] + t = [ + ("x", "out", "z", 5, "y", "in", "z", 169), + ("x", "out", "z", 7, "y", "in", "z", 119), + ("x", "out", "z", 9, "y", "in", "z", 135), + ("x", "out", "z", 13, "y", "in", "z", 145), + ("x", "out", "z", 14, "y", "in", "z", 158), + ("x", "out", "z", 16, "y", "in", "z", 122), + ("x", "out", "z", 17, "y", "in", "z", 160), + ("x", "out", "z", 18, "y", "in", "z", 180), + ("x", "out", "z", 20, "y", "in", "z", 143), + ("x", "out", "z", 21, "y", "in", "z", 128), + ("x", "out", "z", 22, "y", "in", "z", 129), + ("x", "out", "z", 25, "y", "in", "z", 111), + ("x", "out", "z", 28, "y", "in", "z", 114), + ("x", "out", "z", 29, "y", "in", "z", 121), + ("x", "out", "z", 31, "y", "in", "z", 126), + ("x", "out", "z", 32, "y", "in", "z", 155), + ("x", "out", "z", 33, "y", "in", "z", 123), + ("x", "out", "z", 12, "y", "in", "z", 144), + ] mi = MultiIndex.from_tuples(t) assert not mi.has_duplicates @@ -202,8 +204,10 @@ def check(nlevels, with_nulls): # with a dup if with_nulls: + def f(a): return np.insert(a, 1000, a[0]) + codes = list(map(f, codes)) mi = MultiIndex(levels=levels, codes=codes) else: @@ -221,17 +225,20 @@ def f(a): check(8, True) -@pytest.mark.parametrize('keep, expected', [ - ('first', np.array([False, False, False, True, True, False])), - ('last', np.array([False, True, True, False, False, False])), - (False, np.array([False, True, True, True, True, False])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", np.array([False, False, False, True, True, False])), + ("last", np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])), + ], +) def test_duplicated(idx_dup, keep, expected): result = idx_dup.duplicated(keep=keep) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize('keep', ['first', 'last', False]) +@pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated_large(keep): # GH 9125 n, k 
= 200, 5000 @@ -254,22 +261,23 @@ def test_get_duplicates(): # Deprecated - see GH20239 assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal(mi.duplicated(), - np.zeros(2, dtype='bool')) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool")) for n in range(1, 6): # 1st level shape for m in range(1, 5): # 2nd level shape # all possible unique combinations, including nan codes = product(range(-1, n), range(-1, m)) - mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]], - codes=np.random.permutation(list(codes)).T) + mi = MultiIndex( + levels=[list("abcde")[:n], list("WXYZ")[:m]], + codes=np.random.permutation(list(codes)).T, + ) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 - assert mi.get_duplicates().equals(MultiIndex.from_arrays( - [[], []])) + assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) - tm.assert_numpy_array_equal(mi.duplicated(), - np.zeros(len(mi), dtype='bool')) + tm.assert_numpy_array_equal( + mi.duplicated(), np.zeros(len(mi), dtype="bool") + ) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 3bdccbb8ab38d..f61ba0132ab97 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -94,13 +94,14 @@ def test_equals_multi(idx): assert not idx.equals(idx[-1]) # different number of levels - index = MultiIndex(levels=[Index(list(range(4))), - Index(list(range(4))), - Index(list(range(4)))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(list(range(4))), Index(list(range(4))), Index(list(range(4)))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) index2 = MultiIndex(levels=index.levels[:-1], codes=index.codes[:-1]) assert not index.equals(index2) @@ -113,20 +114,22 @@ def test_equals_multi(idx): major_codes = np.array([0, 0, 1, 2, 2, 3]) minor_codes = np.array([0, 1, 0, 0, 1, 0]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert not idx.equals(index) assert not idx.equal_levels(index) # some of the labels are different - major_axis = Index(['foo', 'bar', 'baz', 'qux']) - minor_axis = Index(['one', 'two']) + major_axis = Index(["foo", "bar", "baz", "qux"]) + minor_axis = Index(["one", "two"]) major_codes = np.array([0, 0, 2, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert not idx.equals(index) @@ -135,11 +138,11 @@ def test_identical(idx): mi2 = idx.copy() assert mi.identical(mi2) - mi = mi.set_names(['new1', 'new2']) + mi = mi.set_names(["new1", "new2"]) assert mi.equals(mi2) assert not mi.identical(mi2) - mi2 = mi2.set_names(['new1', 'new2']) + mi2 = mi2.set_names(["new1", "new2"]) assert mi.identical(mi2) mi3 = Index(mi.tolist(), names=mi.names) @@ -156,8 +159,7 @@ def test_equals_operator(idx): def test_equals_missing_values(): # make sure take is not using -1 - i = pd.MultiIndex.from_tuples([(0, pd.NaT), - (0, pd.Timestamp('20130101'))]) + i = 
pd.MultiIndex.from_tuples([(0, pd.NaT), (0, pd.Timestamp("20130101"))]) result = i[0:1].equals(i[0]) assert not result result = i[1:2].equals(i[1]) diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 8413fc1318d0b..a7f58b9ea78bd 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -20,20 +20,20 @@ def test_format(idx): def test_format_integer_names(): - index = MultiIndex(levels=[[0, 1], [0, 1]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) + index = MultiIndex( + levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1] + ) index.format(names=True) def test_format_sparse_config(idx): warn_filters = warnings.filters - warnings.filterwarnings('ignore', category=FutureWarning, - module=".*format") + warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") # GH1538 - pd.set_option('display.multi_sparse', False) + pd.set_option("display.multi_sparse", False) result = idx.format() - assert result[1] == 'foo two' + assert result[1] == "foo two" tm.reset_display_options() @@ -41,24 +41,29 @@ def test_format_sparse_config(idx): def test_format_sparse_display(): - index = MultiIndex(levels=[[0, 1], [0, 1], [0, 1], [0]], - codes=[[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1], - [0, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]]) + index = MultiIndex( + levels=[[0, 1], [0, 1], [0, 1], [0]], + codes=[ + [0, 0, 0, 1, 1, 1], + [0, 0, 1, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0], + ], + ) result = index.format() - assert result[3] == '1 0 0 0' + assert result[3] == "1 0 0 0" def test_repr_with_unicode_data(): - with pd.option_context("display.encoding", 'UTF-8'): + with pd.option_context("display.encoding", "UTF-8"): d = {"a": ["\u05d0", 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} index = pd.DataFrame(d).set_index(["a", "b"]).index assert "\\" not in repr(index) # we don't want unicode-escaped def test_repr_roundtrip_raises(): - mi = MultiIndex.from_product([list('ab'), range(3)], - names=['first', 'second']) + mi = MultiIndex.from_product([list("ab"), range(3)], names=["first", "second"]) with pytest.raises(TypeError): eval(repr(mi)) @@ -74,11 +79,10 @@ def test_repr_max_seq_item_setting(idx): idx = idx.repeat(50) with pd.option_context("display.max_seq_items", None): repr(idx) - assert '...' not in str(idx) + assert "..." 
not in str(idx) class TestRepr: - def test_repr(self, idx): result = idx[:1].__repr__() expected = """\ @@ -97,7 +101,7 @@ def test_repr(self, idx): names=['first', 'second'])""" assert result == expected - with pd.option_context('display.max_seq_items', 5): + with pd.option_context("display.max_seq_items", 5): result = idx.__repr__() expected = """\ MultiIndex([('foo', 'one'), diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index e07f1ce7c4e92..5ab817d8468c3 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -23,68 +23,66 @@ def test_get_level_number_integer(idx): msg = "Too many levels: Index has only 2 levels, not 3" with pytest.raises(IndexError, match=msg): idx._get_level_number(2) - with pytest.raises(KeyError, match='Level fourth not found'): - idx._get_level_number('fourth') + with pytest.raises(KeyError, match="Level fourth not found"): + idx._get_level_number("fourth") def test_get_level_values(idx): result = idx.get_level_values(0) - expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], - name='first') + expected = Index(["foo", "foo", "bar", "baz", "qux", "qux"], name="first") tm.assert_index_equal(result, expected) - assert result.name == 'first' + assert result.name == "first" - result = idx.get_level_values('first') + result = idx.get_level_values("first") expected = idx.get_level_values(0) tm.assert_index_equal(result, expected) # GH 10460 index = MultiIndex( - levels=[CategoricalIndex(['A', 'B']), - CategoricalIndex([1, 2, 3])], - codes=[np.array([0, 0, 0, 1, 1, 1]), - np.array([0, 1, 2, 0, 1, 2])]) + levels=[CategoricalIndex(["A", "B"]), CategoricalIndex([1, 2, 3])], + codes=[np.array([0, 0, 0, 1, 1, 1]), np.array([0, 1, 2, 0, 1, 2])], + ) - exp = CategoricalIndex(['A', 'A', 'A', 'B', 'B', 'B']) + exp = CategoricalIndex(["A", "A", "A", "B", "B", "B"]) tm.assert_index_equal(index.get_level_values(0), exp) exp = CategoricalIndex([1, 2, 3, 1, 2, 3]) tm.assert_index_equal(index.get_level_values(1), exp) def test_get_value_duplicates(): - index = MultiIndex(levels=[['D', 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) + index = MultiIndex( + levels=[["D", "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) - assert index.get_loc('D') == slice(0, 3) + assert index.get_loc("D") == slice(0, 3) with pytest.raises(KeyError, match=r"^'D'$"): - index._engine.get_value(np.array([]), 'D') + index._engine.get_value(np.array([]), "D") def test_get_level_values_all_na(): # GH 17924 when level entirely consists of nan - arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) expected = pd.Index([np.nan, np.nan, np.nan], dtype=np.float64) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(['a', np.nan, 1], dtype=object) + expected = pd.Index(["a", np.nan, 1], dtype=object) tm.assert_index_equal(result, expected) def test_get_level_values_int_with_na(): # GH 17924 - arrays = [['a', 'b', 'b'], [1, np.nan, 2]] + arrays = [["a", "b", "b"], [1, np.nan, 2]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = Index([1, np.nan, 2]) tm.assert_index_equal(result, expected) - arrays = 
[['a', 'b', 'b'], [np.nan, np.nan, 2]] + arrays = [["a", "b", "b"], [np.nan, np.nan, 2]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = Index([np.nan, np.nan, 2]) @@ -92,17 +90,17 @@ def test_get_level_values_int_with_na(): def test_get_level_values_na(): - arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]] + arrays = [[np.nan, np.nan, np.nan], ["a", np.nan, 1]] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(0) expected = pd.Index([np.nan, np.nan, np.nan]) tm.assert_index_equal(result, expected) result = index.get_level_values(1) - expected = pd.Index(['a', np.nan, 1]) + expected = pd.Index(["a", np.nan, 1]) tm.assert_index_equal(result, expected) - arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])] + arrays = [["a", "b", "b"], pd.DatetimeIndex([0, 1, pd.NaT])] index = pd.MultiIndex.from_arrays(arrays) result = index.get_level_values(1) expected = pd.DatetimeIndex([0, 1, pd.NaT]) @@ -153,7 +151,7 @@ def test_set_levels_codes_directly(idx): # setting levels/codes directly raises AttributeError levels = idx.levels - new_levels = [[lev + 'a' for lev in level] for level in levels] + new_levels = [[lev + "a" for lev in level] for level in levels] codes = idx.codes major_codes, minor_codes = codes @@ -172,7 +170,7 @@ def test_set_levels(idx): # side note - you probably wouldn't want to use levels and codes # directly like this - but it is possible. levels = idx.levels - new_levels = [[lev + 'a' for lev in level] for level in levels] + new_levels = [[lev + "a" for lev in level] for level in levels] # level changing [w/o mutation] ind2 = idx.set_levels(new_levels) @@ -214,8 +212,7 @@ def test_set_levels(idx): # level changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, level=[0, 1], - inplace=True) + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) assert_matching(idx.levels, levels) @@ -225,25 +222,20 @@ def test_set_levels(idx): original_index = idx.copy() for inplace in [True, False]: with pytest.raises(ValueError, match="^On"): - idx.set_levels(['c'], level=0, inplace=inplace) - assert_matching(idx.levels, original_index.levels, - check_dtype=True) + idx.set_levels(["c"], level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(ValueError, match="^On"): - idx.set_codes([0, 1, 2, 3, 4, 5], level=0, - inplace=inplace) - assert_matching(idx.codes, original_index.codes, - check_dtype=True) + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): - idx.set_levels('c', level=0, inplace=inplace) - assert_matching(idx.levels, original_index.levels, - check_dtype=True) + idx.set_levels("c", level=0, inplace=inplace) + assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(TypeError, match="^Codes"): idx.set_codes(1, level=0, inplace=inplace) - assert_matching(idx.codes, original_index.codes, - check_dtype=True) + assert_matching(idx.codes, original_index.codes, check_dtype=True) def test_set_codes(idx): @@ -295,8 +287,7 @@ def test_set_codes(idx): # codes changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, level=[0, 1], - inplace=True) + inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) assert 
inplace_return is None assert_matching(ind2.codes, new_codes) assert_matching(idx.codes, codes) @@ -304,8 +295,7 @@ def test_set_codes(idx): # label changing for levels of different magnitude of categories ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_codes = range(129, -1, -1) - expected = pd.MultiIndex.from_tuples( - [(0, i) for i in new_codes]) + expected = pd.MultiIndex.from_tuples([(0, i) for i in new_codes]) # [w/o mutation] result = ind.set_codes(codes=new_codes, level=1) @@ -324,8 +314,7 @@ def test_set_labels_deprecated(): # GH23752 ind = pd.MultiIndex.from_tuples([(0, i) for i in range(130)]) new_labels = range(129, -1, -1) - expected = pd.MultiIndex.from_tuples( - [(0, i) for i in new_labels]) + expected = pd.MultiIndex.from_tuples([(0, i) for i in new_labels]) # [w/o mutation] with tm.assert_produces_warning(FutureWarning): @@ -343,59 +332,57 @@ def test_set_levels_codes_names_bad_input(idx): levels, codes = idx.levels, idx.codes names = idx.names - with pytest.raises(ValueError, match='Length of levels'): + with pytest.raises(ValueError, match="Length of levels"): idx.set_levels([levels[0]]) - with pytest.raises(ValueError, match='Length of codes'): + with pytest.raises(ValueError, match="Length of codes"): idx.set_codes([codes[0]]) - with pytest.raises(ValueError, match='Length of names'): + with pytest.raises(ValueError, match="Length of names"): idx.set_names([names[0]]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_levels(levels[0]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_codes(codes[0]) # shouldn't scalar data error, instead should demand list-like - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_names(names[0]) # should have equal lengths - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_levels(levels[0], level=[0, 1]) - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_levels(levels, level=0) # should have equal lengths - with pytest.raises(TypeError, match='list of lists-like'): + with pytest.raises(TypeError, match="list of lists-like"): idx.set_codes(codes[0], level=[0, 1]) - with pytest.raises(TypeError, match='list-like'): + with pytest.raises(TypeError, match="list-like"): idx.set_codes(codes, level=0) # should have equal lengths - with pytest.raises(ValueError, match='Length of names'): + with pytest.raises(ValueError, match="Length of names"): idx.set_names(names[0], level=[0, 1]) - with pytest.raises(TypeError, match='Names must be a'): + with pytest.raises(TypeError, match="Names must be a"): idx.set_names(names, level=0) -@pytest.mark.parametrize('inplace', [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) def test_set_names_with_nlevel_1(inplace): # GH 21149 # Ensure that .set_names for MultiIndex with # nlevels == 1 does not raise any errors - expected = pd.MultiIndex(levels=[[0, 1]], - codes=[[0, 1]], - names=['first']) + expected = pd.MultiIndex(levels=[[0, 1]], codes=[[0, 1]], names=["first"]) m = pd.MultiIndex.from_product([[0, 1]]) - result = m.set_names('first', level=0, inplace=inplace) + result = m.set_names("first", 
level=0, inplace=inplace) if inplace: result = m @@ -403,50 +390,49 @@ def test_set_names_with_nlevel_1(inplace): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) def test_set_levels_categorical(ordered): # GH13854 index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) cidx = CategoricalIndex(list("bac"), ordered=ordered) result = index.set_levels(cidx, 0) - expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], - codes=index.codes) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], codes=index.codes) tm.assert_index_equal(result, expected) result_lvl = result.get_level_values(0) - expected_lvl = CategoricalIndex(list("bacb"), - categories=cidx.categories, - ordered=cidx.ordered) + expected_lvl = CategoricalIndex( + list("bacb"), categories=cidx.categories, ordered=cidx.ordered + ) tm.assert_index_equal(result_lvl, expected_lvl) def test_set_value_keeps_names(): # motivating example from #3742 - lev1 = ['hans', 'hans', 'hans', 'grethe', 'grethe', 'grethe'] - lev2 = ['1', '2', '3'] * 2 - idx = pd.MultiIndex.from_arrays([lev1, lev2], names=['Name', 'Number']) + lev1 = ["hans", "hans", "hans", "grethe", "grethe", "grethe"] + lev2 = ["1", "2", "3"] * 2 + idx = pd.MultiIndex.from_arrays([lev1, lev2], names=["Name", "Number"]) df = pd.DataFrame( - np.random.randn(6, 4), - columns=['one', 'two', 'three', 'four'], - index=idx) + np.random.randn(6, 4), columns=["one", "two", "three", "four"], index=idx + ) df = df.sort_index() assert df._is_copy is None - assert df.index.names == ('Name', 'Number') - df.at[('grethe', '4'), 'one'] = 99.34 + assert df.index.names == ("Name", "Number") + df.at[("grethe", "4"), "one"] = 99.34 assert df._is_copy is None - assert df.index.names == ('Name', 'Number') + assert df.index.names == ("Name", "Number") def test_set_levels_with_iterable(): # GH23273 sizes = [1, 2, 3] - colors = ['black'] * 3 - index = pd.MultiIndex.from_arrays([sizes, colors], names=['size', 'color']) + colors = ["black"] * 3 + index = pd.MultiIndex.from_arrays([sizes, colors], names=["size", "color"]) - result = index.set_levels(map(int, ['3', '2', '1']), level='size') + result = index.set_levels(map(int, ["3", "2", "1"]), level="size") expected_sizes = [3, 2, 1] - expected = pd.MultiIndex.from_arrays([expected_sizes, colors], - names=['size', 'color']) + expected = pd.MultiIndex.from_arrays( + [expected_sizes, colors], names=["size", "color"] + ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 3acd194b28a05..75dea68eadbf7 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -5,8 +5,13 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, Index, IntervalIndex, MultiIndex, - date_range) + Categorical, + CategoricalIndex, + Index, + IntervalIndex, + MultiIndex, + date_range, +) from pandas.core.indexes.base import InvalidIndexError import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -15,16 +20,16 @@ def test_slice_locs_partial(idx): sorted_idx, _ = idx.sortlevel(0) - result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one')) + result = sorted_idx.slice_locs(("foo", "two"), ("qux", "one")) assert result == (1, 5) - result = sorted_idx.slice_locs(None, ('qux', 'one')) + result = sorted_idx.slice_locs(None, ("qux", "one")) assert result == (0, 5) - result = sorted_idx.slice_locs(('foo', 
'two'), None) + result = sorted_idx.slice_locs(("foo", "two"), None) assert result == (1, len(sorted_idx)) - result = sorted_idx.slice_locs('bar', 'baz') + result = sorted_idx.slice_locs("bar", "baz") assert result == (2, 4) @@ -38,8 +43,11 @@ def test_slice_locs(): expected = df[5:16].stack() tm.assert_almost_equal(sliced.values, expected.values) - slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30), - df.index[15] - timedelta(seconds=30))) + slob = slice( + *idx.slice_locs( + df.index[5] + timedelta(seconds=30), df.index[15] - timedelta(seconds=30) + ) + ) sliced = stacked[slob] expected = df[6:15].stack() tm.assert_almost_equal(sliced.values, expected.values) @@ -49,28 +57,29 @@ def test_slice_locs_with_type_mismatch(): df = tm.makeTimeDataFrame() stacked = df.stack() idx = stacked.index - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs((1, 3)) - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[5] + timedelta(seconds=30), (5, 2)) df = tm.makeCustomDataframe(5, 5) stacked = df.stack() idx = stacked.index - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(timedelta(seconds=30)) # TODO: Try creating a UnicodeDecodeError in exception message - with pytest.raises(TypeError, match='^Level type mismatch'): + with pytest.raises(TypeError, match="^Level type mismatch"): idx.slice_locs(df.index[1], (16, "a")) def test_slice_locs_not_sorted(): - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) msg = "[Kk]ey length.*greater than MultiIndex lexsort depth" with pytest.raises(KeyError, match=msg): index.slice_locs((1, 0, 1), (2, 1, 0)) @@ -84,9 +93,11 @@ def test_slice_locs_not_sorted(): def test_slice_locs_not_contained(): # some searchsorted action - index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], - codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], - [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0) + index = MultiIndex( + levels=[[0, 2, 4, 6], [0, 2, 4]], + codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]], + sortorder=0, + ) result = index.slice_locs((1, 0), (5, 2)) assert result == (3, 6) @@ -118,7 +129,7 @@ def test_putmask_with_wrong_mask(idx): idx.putmask(np.ones(len(idx) - 1, np.bool), 1) with pytest.raises(ValueError, match=msg): - idx.putmask('foo', 1) + idx.putmask("foo", 1) def test_get_indexer(): @@ -128,32 +139,33 @@ def test_get_indexer(): major_codes = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) minor_codes = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) idx1 = index[:5] idx2 = index[[1, 3, 5]] r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) - r1 = idx2.get_indexer(idx1, method='pad') + r1 = idx2.get_indexer(idx1, method="pad") e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) 
assert_almost_equal(r1, e1) - r2 = idx2.get_indexer(idx1[::-1], method='pad') + r2 = idx2.get_indexer(idx1[::-1], method="pad") assert_almost_equal(r2, e1[::-1]) - rffill1 = idx2.get_indexer(idx1, method='ffill') + rffill1 = idx2.get_indexer(idx1, method="ffill") assert_almost_equal(r1, rffill1) - r1 = idx2.get_indexer(idx1, method='backfill') + r1 = idx2.get_indexer(idx1, method="backfill") e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) - r2 = idx2.get_indexer(idx1[::-1], method='backfill') + r2 = idx2.get_indexer(idx1[::-1], method="backfill") assert_almost_equal(r2, e1[::-1]) - rbfill1 = idx2.get_indexer(idx1, method='bfill') + rbfill1 = idx2.get_indexer(idx1, method="bfill") assert_almost_equal(r1, rbfill1) # pass non-MultiIndex @@ -174,19 +186,20 @@ def test_get_indexer(): def test_get_indexer_nearest(): - midx = MultiIndex.from_tuples([('a', 1), ('b', 2)]) - msg = ("method='nearest' not implemented yet for MultiIndex; see GitHub" - " issue 9365") + midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = ( + "method='nearest' not implemented yet for MultiIndex; see GitHub" " issue 9365" + ) with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(['a'], method='nearest') + midx.get_indexer(["a"], method="nearest") msg = "tolerance not implemented yet for MultiIndex" with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(['a'], method='pad', tolerance=2) + midx.get_indexer(["a"], method="pad", tolerance=2) def test_getitem(idx): # scalar - assert idx[2] == ('bar', 'one') + assert idx[2] == ("bar", "one") # slice result = idx[2:5] @@ -203,8 +216,8 @@ def test_getitem(idx): def test_getitem_group_select(idx): sorted_idx, _ = idx.sortlevel(0) - assert sorted_idx.get_loc('baz') == slice(3, 4) - assert sorted_idx.get_loc('foo') == slice(0, 2) + assert sorted_idx.get_loc("baz") == slice(3, 4) + assert sorted_idx.get_loc("foo") == slice(0, 2) def test_get_indexer_consistency(idx): @@ -226,54 +239,55 @@ def test_get_indexer_consistency(idx): assert indexer.dtype == np.intp -@pytest.mark.parametrize('ind1', [[True] * 5, pd.Index([True] * 5)]) -@pytest.mark.parametrize('ind2', [[True, False, True, False, False], - pd.Index([True, False, True, False, - False])]) +@pytest.mark.parametrize("ind1", [[True] * 5, pd.Index([True] * 5)]) +@pytest.mark.parametrize( + "ind2", + [[True, False, True, False, False], pd.Index([True, False, True, False, False])], +) def test_getitem_bool_index_all(ind1, ind2): # GH#22533 - idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), - (40, 4), (50, 5)]) + idx = MultiIndex.from_tuples([(10, 1), (20, 2), (30, 3), (40, 4), (50, 5)]) tm.assert_index_equal(idx[ind1], idx) expected = MultiIndex.from_tuples([(10, 1), (30, 3)]) tm.assert_index_equal(idx[ind2], expected) -@pytest.mark.parametrize('ind1', [[True], pd.Index([True])]) -@pytest.mark.parametrize('ind2', [[False], pd.Index([False])]) +@pytest.mark.parametrize("ind1", [[True], pd.Index([True])]) +@pytest.mark.parametrize("ind2", [[False], pd.Index([False])]) def test_getitem_bool_index_single(ind1, ind2): # GH#22533 idx = MultiIndex.from_tuples([(10, 1)]) tm.assert_index_equal(idx[ind1], idx) - expected = pd.MultiIndex(levels=[np.array([], dtype=np.int64), - np.array([], dtype=np.int64)], - codes=[[], []]) + expected = pd.MultiIndex( + levels=[np.array([], dtype=np.int64), np.array([], dtype=np.int64)], + codes=[[], []], + ) tm.assert_index_equal(idx[ind2], expected) def test_get_loc(idx): - assert idx.get_loc(('foo', 'two')) == 1 - assert 
idx.get_loc(('baz', 'two')) == 3 + assert idx.get_loc(("foo", "two")) == 1 + assert idx.get_loc(("baz", "two")) == 3 with pytest.raises(KeyError, match=r"^10$"): - idx.get_loc(('bar', 'two')) + idx.get_loc(("bar", "two")) with pytest.raises(KeyError, match=r"^'quux'$"): - idx.get_loc('quux') + idx.get_loc("quux") - msg = ("only the default get_loc method is currently supported for" - " MultiIndex") + msg = "only the default get_loc method is currently supported for" " MultiIndex" with pytest.raises(NotImplementedError, match=msg): - idx.get_loc('foo', method='nearest') + idx.get_loc("foo", method="nearest") # 3 levels - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) with pytest.raises(KeyError, match=r"^\(1, 1\)$"): index.get_loc((1, 1)) assert index.get_loc((2, 0)) == slice(3, 5) @@ -286,20 +300,21 @@ def test_get_loc_duplicates(): assert result == expected # pytest.raises(Exception, index.get_loc, 2) - index = Index(['c', 'a', 'a', 'b', 'b']) - rs = index.get_loc('c') + index = Index(["c", "a", "a", "b", "b"]) + rs = index.get_loc("c") xp = 0 assert rs == xp def test_get_loc_level(): - index = MultiIndex(levels=[Index(np.arange(4)), - Index(np.arange(4)), - Index(np.arange(4))], - codes=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), - np.array([0, 1, 0, 0, 0, 1, 0, 1]), - np.array([1, 0, 1, 1, 0, 0, 1, 0])], - ) + index = MultiIndex( + levels=[Index(np.arange(4)), Index(np.arange(4)), Index(np.arange(4))], + codes=[ + np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0]), + ], + ) loc, new_index = index.get_loc_level((0, 1)) expected = slice(1, 2) exp_index = index[expected].droplevel(0).droplevel(0) @@ -320,32 +335,31 @@ def test_get_loc_level(): with pytest.raises(KeyError, match=r"^2$"): index.drop(1, level=2).get_loc_level(2, level=2) - index = MultiIndex(levels=[[2000], list(range(4))], - codes=[np.array([0, 0, 0, 0]), - np.array([0, 1, 2, 3])], - ) + index = MultiIndex( + levels=[[2000], list(range(4))], + codes=[np.array([0, 0, 0, 0]), np.array([0, 1, 2, 3])], + ) result, new_index = index.get_loc_level((2000, slice(None, None))) expected = slice(None, None) assert result == expected assert new_index.equals(index.droplevel(0)) -@pytest.mark.parametrize('dtype1', [int, float, bool, str]) -@pytest.mark.parametrize('dtype2', [int, float, bool, str]) +@pytest.mark.parametrize("dtype1", [int, float, bool, str]) +@pytest.mark.parametrize("dtype2", [int, float, bool, str]) def test_get_loc_multiple_dtypes(dtype1, dtype2): # GH 18520 - levels = [np.array([0, 1]).astype(dtype1), - np.array([0, 1]).astype(dtype2)] + levels = [np.array([0, 1]).astype(dtype1), np.array([0, 1]).astype(dtype2)] idx = pd.MultiIndex.from_product(levels) assert idx.get_loc(idx[2]) == 2 -@pytest.mark.parametrize('level', [0, 1]) -@pytest.mark.parametrize('dtypes', [[int, float], [float, int]]) +@pytest.mark.parametrize("level", [0, 1]) +@pytest.mark.parametrize("dtypes", [[int, float], [float, int]]) def test_get_loc_implicit_cast(level, dtypes): # GH 18818, GH 15994 : as flat index, cast int to float and vice-versa - levels = [['a', 'b'], ['c', 'd']] - key = ['b', 'd'] + 
levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] lev_dtype, key_dtype = dtypes levels[level] = np.array([0, 1], dtype=lev_dtype) key[level] = key_dtype(1) @@ -355,7 +369,7 @@ def test_get_loc_implicit_cast(level, dtypes): def test_get_loc_cast_bool(): # GH 19086 : int is casted to bool, but not vice-versa - levels = [[False, True], np.arange(2, dtype='int64')] + levels = [[False, True], np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) assert idx.get_loc((0, 1)) == 1 @@ -367,11 +381,11 @@ def test_get_loc_cast_bool(): idx.get_loc((True, False)) -@pytest.mark.parametrize('level', [0, 1]) +@pytest.mark.parametrize("level", [0, 1]) def test_get_loc_nan(level, nulls_fixture): # GH 18485 : NaN in MultiIndex - levels = [['a', 'b'], ['c', 'd']] - key = ['b', 'd'] + levels = [["a", "b"], ["c", "d"]] + key = ["b", "d"] levels[level] = np.array([0, nulls_fixture], dtype=type(nulls_fixture)) key[level] = nulls_fixture idx = MultiIndex.from_product(levels) @@ -393,28 +407,37 @@ def test_get_loc_missing_nan(): def test_get_indexer_categorical_time(): # https://github.com/pandas-dev/pandas/issues/21390 midx = MultiIndex.from_product( - [Categorical(['a', 'b', 'c']), - Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) result = midx.get_indexer(midx) tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) def test_timestamp_multiindex_indexer(): # https://github.com/pandas-dev/pandas/issues/26944 - idx = pd.MultiIndex.from_product([ - pd.date_range("2019-01-01T00:15:33", periods=100, freq="H", - name="date"), - ['x'], - [3] - ]) - df = pd.DataFrame({'foo': np.arange(len(idx))}, idx) - result = df.loc[pd.IndexSlice['2019-1-2':, "x", :], 'foo'] - qidx = pd.MultiIndex.from_product([ - pd.date_range(start="2019-01-02T00:15:33", end='2019-01-05T02:15:33', - freq="H", name="date"), - ['x'], - [3] - ]) - should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, - name="foo") + idx = pd.MultiIndex.from_product( + [ + pd.date_range("2019-01-01T00:15:33", periods=100, freq="H", name="date"), + ["x"], + [3], + ] + ) + df = pd.DataFrame({"foo": np.arange(len(idx))}, idx) + result = df.loc[pd.IndexSlice["2019-1-2":, "x", :], "foo"] + qidx = pd.MultiIndex.from_product( + [ + pd.date_range( + start="2019-01-02T00:15:33", + end="2019-01-05T02:15:33", + freq="H", + name="date", + ), + ["x"], + [3], + ] + ) + should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index a0bdb4c98e6c7..dba75b6247a20 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -13,28 +13,31 @@ def test_labels_dtypes(): # GH 8456 - i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) - assert i.codes[0].dtype == 'int8' - assert i.codes[1].dtype == 'int8' - - i = MultiIndex.from_product([['a'], range(40)]) - assert i.codes[1].dtype == 'int8' - i = MultiIndex.from_product([['a'], range(400)]) - assert i.codes[1].dtype == 'int16' - i = MultiIndex.from_product([['a'], range(40000)]) - assert i.codes[1].dtype == 'int32' - - i = pd.MultiIndex.from_product([['a'], range(1000)]) + i = MultiIndex.from_tuples([("A", 1), ("A", 2)]) + assert i.codes[0].dtype == "int8" + assert i.codes[1].dtype == "int8" + + i = MultiIndex.from_product([["a"], range(40)]) + assert i.codes[1].dtype == 
"int8" + i = MultiIndex.from_product([["a"], range(400)]) + assert i.codes[1].dtype == "int16" + i = MultiIndex.from_product([["a"], range(40000)]) + assert i.codes[1].dtype == "int32" + + i = pd.MultiIndex.from_product([["a"], range(1000)]) assert (i.codes[0] >= 0).all() assert (i.codes[1] >= 0).all() def test_values_boxed(): - tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT), - (3, pd.Timestamp('2000-01-03')), - (1, pd.Timestamp('2000-01-04')), - (2, pd.Timestamp('2000-01-02')), - (3, pd.Timestamp('2000-01-03'))] + tuples = [ + (1, pd.Timestamp("2000-01-01")), + (2, pd.NaT), + (3, pd.Timestamp("2000-01-03")), + (1, pd.Timestamp("2000-01-04")), + (2, pd.Timestamp("2000-01-02")), + (3, pd.Timestamp("2000-01-03")), + ] result = pd.MultiIndex.from_tuples(tuples) expected = construct_1d_object_array_from_listlike(tuples) tm.assert_numpy_array_equal(result.values, expected) @@ -48,7 +51,7 @@ def test_values_multiindex_datetimeindex(): naive = pd.DatetimeIndex(ints) # TODO(GH-24559): Remove the FutureWarning with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - aware = pd.DatetimeIndex(ints, tz='US/Central') + aware = pd.DatetimeIndex(ints, tz="US/Central") idx = pd.MultiIndex.from_arrays([naive, aware]) result = idx.values @@ -72,7 +75,7 @@ def test_values_multiindex_datetimeindex(): def test_values_multiindex_periodindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) - pidx = pd.PeriodIndex(ints, freq='D') + pidx = pd.PeriodIndex(ints, freq="D") idx = pd.MultiIndex.from_arrays([ints, pidx]) result = idx.values @@ -102,14 +105,16 @@ def test_consistency(): minor_codes = np.repeat(range(10), 7000) # the fact that is works means it's consistent - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) # inconsistent major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) - index = MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes]) + index = MultiIndex( + levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + ) assert index.is_unique is False @@ -117,11 +122,11 @@ def test_consistency(): def test_hash_collisions(): # non-smoke test that we don't get hash collisions - index = MultiIndex.from_product([np.arange(1000), np.arange(1000)], - names=['one', 'two']) + index = MultiIndex.from_product( + [np.arange(1000), np.arange(1000)], names=["one", "two"] + ) result = index.get_indexer(index.values) - tm.assert_numpy_array_equal(result, np.arange( - len(index), dtype='intp')) + tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp")) for i in [0, 1, len(index) - 2, len(index) - 1]: result = index.get_loc(index[i]) @@ -133,9 +138,8 @@ def test_dims(): def take_invalid_kwargs(): - vals = [['A', 'B'], - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')]] - idx = pd.MultiIndex.from_product(vals, names=['str', 'dt']) + vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]] + idx = pd.MultiIndex.from_product(vals, names=["str", "dt"]) indices = [1, 2] msg = r"take\(\) got an unexpected keyword argument 'foo'" @@ -148,7 +152,7 @@ def take_invalid_kwargs(): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') + idx.take(indices, mode="clip") def test_isna_behavior(idx): @@ -163,30 +167,31 @@ def 
test_isna_behavior(idx): def test_large_multiindex_error(): # GH12527 df_below_1000000 = pd.DataFrame( - 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]), - columns=['dest']) + 1, index=pd.MultiIndex.from_product([[1, 2], range(499999)]), columns=["dest"] + ) with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_below_1000000.loc[(-1, 0), 'dest'] + df_below_1000000.loc[(-1, 0), "dest"] with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_below_1000000.loc[(3, 0), 'dest'] + df_below_1000000.loc[(3, 0), "dest"] df_above_1000000 = pd.DataFrame( - 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]), - columns=['dest']) + 1, index=pd.MultiIndex.from_product([[1, 2], range(500001)]), columns=["dest"] + ) with pytest.raises(KeyError, match=r"^\(-1, 0\)$"): - df_above_1000000.loc[(-1, 0), 'dest'] + df_above_1000000.loc[(-1, 0), "dest"] with pytest.raises(KeyError, match=r"^\(3, 0\)$"): - df_above_1000000.loc[(3, 0), 'dest'] + df_above_1000000.loc[(3, 0), "dest"] def test_million_record_attribute_error(): # GH 18165 r = list(range(1000000)) - df = pd.DataFrame({'a': r, 'b': r}, - index=pd.MultiIndex.from_tuples([(x, x) for x in r])) + df = pd.DataFrame( + {"a": r, "b": r}, index=pd.MultiIndex.from_tuples([(x, x) for x in r]) + ) msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): - df['a'].foo() + df["a"].foo() def test_can_hold_identifiers(idx): @@ -197,7 +202,7 @@ def test_can_hold_identifiers(idx): def test_metadata_immutable(idx): levels, codes = idx.levels, idx.codes # shouldn't be able to set at either the top level or base level - mutable_regex = re.compile('does not support mutable operations') + mutable_regex = re.compile("does not support mutable operations") with pytest.raises(TypeError, match=mutable_regex): levels[0] = levels[0] with pytest.raises(TypeError, match=mutable_regex): @@ -214,11 +219,9 @@ def test_metadata_immutable(idx): def test_level_setting_resets_attributes(): - ind = pd.MultiIndex.from_arrays([ - ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] - ]) + ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert ind.is_monotonic - ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) + ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. 
assert not ind.is_monotonic @@ -227,30 +230,30 @@ def test_rangeindex_fallback_coercion_bug(): # GH 12893 foo = pd.DataFrame(np.arange(100).reshape((10, 10))) bar = pd.DataFrame(np.arange(100).reshape((10, 10))) - df = pd.concat({'foo': foo.stack(), 'bar': bar.stack()}, axis=1) - df.index.names = ['fizz', 'buzz'] + df = pd.concat({"foo": foo.stack(), "bar": bar.stack()}, axis=1) + df.index.names = ["fizz", "buzz"] str(df) - expected = pd.DataFrame({'bar': np.arange(100), - 'foo': np.arange(100)}, - index=pd.MultiIndex.from_product( - [range(10), range(10)], - names=['fizz', 'buzz'])) + expected = pd.DataFrame( + {"bar": np.arange(100), "foo": np.arange(100)}, + index=pd.MultiIndex.from_product( + [range(10), range(10)], names=["fizz", "buzz"] + ), + ) tm.assert_frame_equal(df, expected, check_like=True) - result = df.index.get_level_values('fizz') - expected = pd.Int64Index(np.arange(10), name='fizz').repeat(10) + result = df.index.get_level_values("fizz") + expected = pd.Int64Index(np.arange(10), name="fizz").repeat(10) tm.assert_index_equal(result, expected) - result = df.index.get_level_values('buzz') - expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz') + result = df.index.get_level_values("buzz") + expected = pd.Int64Index(np.tile(np.arange(10), 10), name="buzz") tm.assert_index_equal(result, expected) def test_hash_error(indices): index = indices - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + with pytest.raises(TypeError, match=("unhashable type: %r" % type(index).__name__)): hash(indices) @@ -279,7 +282,7 @@ def test_memory_usage(idx): if not isinstance(idx, (RangeIndex, IntervalIndex)): assert result2 > result - if idx.inferred_type == 'object': + if idx.inferred_type == "object": assert result3 > result2 else: diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index ea26f210daaf6..42d8cf761842e 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -6,30 +6,27 @@ import pandas.util.testing as tm -@pytest.mark.parametrize('other', [ - Index(['three', 'one', 'two']), - Index(['one']), - Index(['one', 'three']), -]) +@pytest.mark.parametrize( + "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])] +) def test_join_level(idx, other, join_type): - join_index, lidx, ridx = other.join(idx, how=join_type, - level='second', - return_indexers=True) + join_index, lidx, ridx = other.join( + idx, how=join_type, level="second", return_indexers=True + ) exp_level = other.join(idx.levels[1], how=join_type) assert join_index.levels[0].equals(idx.levels[0]) assert join_index.levels[1].equals(exp_level) # pare down levels - mask = np.array( - [x[1] in exp_level for x in idx], dtype=bool) + mask = np.array([x[1] in exp_level for x in idx], dtype=bool) exp_values = idx.values[mask] tm.assert_numpy_array_equal(join_index.values, exp_values) - if join_type in ('outer', 'inner'): - join_index2, ridx2, lidx2 = \ - idx.join(other, how=join_type, level='second', - return_indexers=True) + if join_type in ("outer", "inner"): + join_index2, ridx2, lidx2 = idx.join( + other, how=join_type, level="second", return_indexers=True + ) assert join_index.equals(join_index2) tm.assert_numpy_array_equal(lidx, lidx2) @@ -39,8 +36,8 @@ def test_join_level(idx, other, join_type): def test_join_level_corner_case(idx): # some corner cases - index = Index(['three', 'one', 'two']) - result = index.join(idx, level='second') + index = Index(["three", 
"one", "two"]) + result = index.join(idx, level="second") assert isinstance(result, MultiIndex) with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"): @@ -54,34 +51,33 @@ def test_join_self(idx, join_type): def test_join_multi(): # GH 10665 - midx = pd.MultiIndex.from_product( - [np.arange(4), np.arange(4)], names=['a', 'b']) - idx = pd.Index([1, 2, 5], name='b') + midx = pd.MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"]) + idx = pd.Index([1, 2, 5], name="b") # inner - jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True) - exp_idx = pd.MultiIndex.from_product( - [np.arange(4), [1, 2]], names=['a', 'b']) + jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True) + exp_idx = pd.MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"]) exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp) exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp) tm.assert_index_equal(jidx, exp_idx) tm.assert_numpy_array_equal(lidx, exp_lidx) tm.assert_numpy_array_equal(ridx, exp_ridx) # flip - jidx, ridx, lidx = idx.join(midx, how='inner', return_indexers=True) + jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True) tm.assert_index_equal(jidx, exp_idx) tm.assert_numpy_array_equal(lidx, exp_lidx) tm.assert_numpy_array_equal(ridx, exp_ridx) # keep MultiIndex - jidx, lidx, ridx = midx.join(idx, how='left', return_indexers=True) - exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, - 1, -1], dtype=np.intp) + jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True) + exp_ridx = np.array( + [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp + ) tm.assert_index_equal(jidx, midx) assert lidx is None tm.assert_numpy_array_equal(ridx, exp_ridx) # flip - jidx, ridx, lidx = idx.join(midx, how='right', return_indexers=True) + jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True) tm.assert_index_equal(jidx, midx) assert lidx is None tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 1928c303a1bcd..15bbd2ce97c3c 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -13,7 +13,7 @@ def test_fillna(idx): # GH 11343 # TODO: Remove or Refactor. 
Not Implemented for MultiIndex - for name, index in [('idx', idx), ]: + for name, index in [("idx", idx)]: if len(index) == 0: pass elif isinstance(index, MultiIndex): @@ -54,39 +54,42 @@ def test_fillna(idx): def test_dropna(): # GH 6194 - idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5], - [1, 2, np.nan, np.nan, 5], - ['a', 'b', 'c', np.nan, 'e']]) - - exp = pd.MultiIndex.from_arrays([[1, 5], - [1, 5], - ['a', 'e']]) + idx = pd.MultiIndex.from_arrays( + [ + [1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ["a", "b", "c", np.nan, "e"], + ] + ) + + exp = pd.MultiIndex.from_arrays([[1, 5], [1, 5], ["a", "e"]]) tm.assert_index_equal(idx.dropna(), exp) - tm.assert_index_equal(idx.dropna(how='any'), exp) + tm.assert_index_equal(idx.dropna(how="any"), exp) - exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5], - [1, 2, np.nan, 5], - ['a', 'b', 'c', 'e']]) - tm.assert_index_equal(idx.dropna(how='all'), exp) + exp = pd.MultiIndex.from_arrays( + [[1, np.nan, 3, 5], [1, 2, np.nan, 5], ["a", "b", "c", "e"]] + ) + tm.assert_index_equal(idx.dropna(how="all"), exp) msg = "invalid how option: xxx" with pytest.raises(ValueError, match=msg): - idx.dropna(how='xxx') + idx.dropna(how="xxx") # GH26408 # test if missing values are dropped for multiindex constructed # from codes and values - idx = MultiIndex(levels=[[np.nan, None, pd.NaT, "128", 2], - [np.nan, None, pd.NaT, "128", 2]], - codes=[[0, -1, 1, 2, 3, 4], - [0, -1, 3, 3, 3, 4]]) + idx = MultiIndex( + levels=[[np.nan, None, pd.NaT, "128", 2], [np.nan, None, pd.NaT, "128", 2]], + codes=[[0, -1, 1, 2, 3, 4], [0, -1, 3, 3, 3, 4]], + ) expected = MultiIndex.from_arrays([["128", 2], ["128", 2]]) tm.assert_index_equal(idx.dropna(), expected) - tm.assert_index_equal(idx.dropna(how='any'), expected) + tm.assert_index_equal(idx.dropna(how="any"), expected) - expected = MultiIndex.from_arrays([[np.nan, np.nan, "128", 2], - ["128", "128", "128", 2]]) - tm.assert_index_equal(idx.dropna(how='all'), expected) + expected = MultiIndex.from_arrays( + [[np.nan, np.nan, "128", 2], ["128", "128", "128", 2]] + ) + tm.assert_index_equal(idx.dropna(how="all"), expected) def test_nulls(idx): @@ -123,13 +126,11 @@ def test_hasnans_isnans(idx): def test_nan_stays_float(): # GH 7031 - idx0 = pd.MultiIndex(levels=[["A", "B"], []], - codes=[[1, 0], [-1, -1]], - names=[0, 1]) - idx1 = pd.MultiIndex(levels=[["C"], ["D"]], - codes=[[0], [0]], - names=[0, 1]) - idxm = idx0.join(idx1, how='outer') + idx0 = pd.MultiIndex( + levels=[["A", "B"], []], codes=[[1, 0], [-1, -1]], names=[0, 1] + ) + idx1 = pd.MultiIndex(levels=[["C"], ["D"]], codes=[[0], [0]], names=[0, 1]) + idxm = idx0.join(idx1, how="outer") assert pd.isna(idx0.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(idxm.get_level_values(1)[:-1]).all() diff --git a/pandas/tests/indexes/multi/test_monotonic.py b/pandas/tests/indexes/multi/test_monotonic.py index a160dc2678157..b5c73d5e97745 100644 --- a/pandas/tests/indexes/multi/test_monotonic.py +++ b/pandas/tests/indexes/multi/test_monotonic.py @@ -7,50 +7,50 @@ def test_is_monotonic_increasing(): - i = MultiIndex.from_product([np.arange(10), - np.arange(10)], names=['one', 'two']) + i = MultiIndex.from_product([np.arange(10), np.arange(10)], names=["one", "two"]) assert i.is_monotonic is True assert i._is_strictly_monotonic_increasing is True assert Index(i.values).is_monotonic is True assert i._is_strictly_monotonic_increasing is True - i = MultiIndex.from_product([np.arange(10, 0, -1), - np.arange(10)], names=['one', 'two']) + i = 
MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex.from_product([np.arange(10), - np.arange(10, 0, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) + i = MultiIndex.from_product([[1.0, np.nan, 2.0], ["a", "b", "c"]]) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values).is_monotonic is False assert Index(i.values)._is_strictly_monotonic_increasing is False # string ordering - i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic is False assert Index(i.values).is_monotonic is False assert i._is_strictly_monotonic_increasing is False assert Index(i.values)._is_strictly_monotonic_increasing is False - i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], - ['mom', 'next', 'zenith']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["bar", "baz", "foo", "qux"], ["mom", "next", "zenith"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic is True assert Index(i.values).is_monotonic is True assert i._is_strictly_monotonic_increasing is True @@ -58,11 +58,19 @@ def test_is_monotonic_increasing(): # mixed levels, hits the TypeError i = MultiIndex( - levels=[[1, 2, 3, 4], ['gb00b03mlx29', 'lu0197800237', - 'nl0000289783', - 'nl0000289965', 'nl0000301109']], + levels=[ + [1, 2, 3, 4], + [ + "gb00b03mlx29", + "lu0197800237", + "nl0000289783", + "nl0000289965", + "nl0000301109", + ], + ], codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], - names=['household_id', 'asset_id']) + names=["household_id", "asset_id"], + ) assert i.is_monotonic is False assert i._is_strictly_monotonic_increasing is False @@ -76,51 +84,52 @@ def test_is_monotonic_increasing(): def test_is_monotonic_decreasing(): - i = MultiIndex.from_product([np.arange(9, -1, -1), - np.arange(9, -1, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(9, -1, -1), np.arange(9, -1, -1)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True - i = MultiIndex.from_product([np.arange(10), - np.arange(10, 0, -1)], - names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10), np.arange(10, 0, -1)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert 
Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex.from_product([np.arange(10, 0, -1), - np.arange(10)], names=['one', 'two']) + i = MultiIndex.from_product( + [np.arange(10, 0, -1), np.arange(10)], names=["one", "two"] + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex.from_product([[2.0, np.nan, 1.0], ['c', 'b', 'a']]) + i = MultiIndex.from_product([[2.0, np.nan, 1.0], ["c", "b", "a"]]) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False # string ordering - i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], - ['three', 'two', 'one']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["three", "two", "one"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic_decreasing is False assert Index(i.values).is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False assert Index(i.values)._is_strictly_monotonic_decreasing is False - i = MultiIndex(levels=[['qux', 'foo', 'baz', 'bar'], - ['zenith', 'next', 'mom']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) + i = MultiIndex( + levels=[["qux", "foo", "baz", "bar"], ["zenith", "next", "mom"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) assert i.is_monotonic_decreasing is True assert Index(i.values).is_monotonic_decreasing is True assert i._is_strictly_monotonic_decreasing is True @@ -128,11 +137,19 @@ def test_is_monotonic_decreasing(): # mixed levels, hits the TypeError i = MultiIndex( - levels=[[4, 3, 2, 1], ['nl0000301109', 'nl0000289965', - 'nl0000289783', 'lu0197800237', - 'gb00b03mlx29']], + levels=[ + [4, 3, 2, 1], + [ + "nl0000301109", + "nl0000289965", + "nl0000289783", + "lu0197800237", + "gb00b03mlx29", + ], + ], codes=[[0, 1, 1, 2, 2, 2, 3], [4, 2, 0, 0, 1, 3, -1]], - names=['household_id', 'asset_id']) + names=["household_id", "asset_id"], + ) assert i.is_monotonic_decreasing is False assert i._is_strictly_monotonic_decreasing is False @@ -146,15 +163,17 @@ def test_is_monotonic_decreasing(): def test_is_strictly_monotonic_increasing(): - idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']], - codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) + idx = pd.MultiIndex( + levels=[["bar", "baz"], ["mom", "next"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) assert idx.is_monotonic_increasing is True assert idx._is_strictly_monotonic_increasing is False def test_is_strictly_monotonic_decreasing(): - idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']], - codes=[[0, 0, 1, 1], [0, 0, 0, 1]]) + idx = pd.MultiIndex( + levels=[["baz", "bar"], ["next", "mom"]], codes=[[0, 0, 1, 1], [0, 0, 0, 1]] + ) assert idx.is_monotonic_decreasing is True assert idx._is_strictly_monotonic_decreasing is False @@ -180,32 +199,32 @@ def test_searchsorted_monotonic(indices): # test _searchsorted_monotonic in all cases # test searchsorted only for increasing if indices.is_monotonic_increasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + 
ssm_left = indices._searchsorted_monotonic(value, side="left") assert is_scalar(ssm_left) assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert is_scalar(ssm_right) assert expected_right == ssm_right - ss_left = indices.searchsorted(value, side='left') + ss_left = indices.searchsorted(value, side="left") assert is_scalar(ss_left) assert expected_left == ss_left - ss_right = indices.searchsorted(value, side='right') + ss_right = indices.searchsorted(value, side="right") assert is_scalar(ss_right) assert expected_right == ss_right elif indices.is_monotonic_decreasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert is_scalar(ssm_left) assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert is_scalar(ssm_right) assert expected_right == ssm_right else: # non-monotonic should raise. with pytest.raises(ValueError): - indices._searchsorted_monotonic(value, side='left') + indices._searchsorted_monotonic(value, side="left") diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 4785e1590b75d..5856cb56b307b 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -10,22 +10,19 @@ def check_level_names(index, names): def test_slice_keep_name(): - x = MultiIndex.from_tuples([('a', 'b'), (1, 2), ('c', 'd')], - names=['x', 'y']) + x = MultiIndex.from_tuples([("a", "b"), (1, 2), ("c", "d")], names=["x", "y"]) assert x[1:].names == x.names def test_index_name_retained(): # GH9857 - result = pd.DataFrame({'x': [1, 2, 6], - 'y': [2, 2, 8], - 'z': [-5, 0, 5]}) - result = result.set_index('z') + result = pd.DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}) + result = result.set_index("z") result.loc[10] = [9, 10] - df_expected = pd.DataFrame({'x': [1, 2, 6, 9], - 'y': [2, 2, 8, 10], - 'z': [-5, 0, 5, 10]}) - df_expected = df_expected.set_index('z') + df_expected = pd.DataFrame( + {"x": [1, 2, 6, 9], "y": [2, 2, 8, 10], "z": [-5, 0, 5, 10]} + ) + df_expected = df_expected.set_index("z") tm.assert_frame_equal(result, df_expected) @@ -62,24 +59,24 @@ def test_take_preserve_name(idx): def test_copy_names(): # Check that adding a "names" parameter to the copy is honored # GH14302 - multi_idx = pd.Index([(1, 2), (3, 4)], names=['MyName1', 'MyName2']) + multi_idx = pd.Index([(1, 2), (3, 4)], names=["MyName1", "MyName2"]) multi_idx1 = multi_idx.copy() assert multi_idx.equals(multi_idx1) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx1.names == ['MyName1', 'MyName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx1.names == ["MyName1", "MyName2"] - multi_idx2 = multi_idx.copy(names=['NewName1', 'NewName2']) + multi_idx2 = multi_idx.copy(names=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx2) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx2.names == ['NewName1', 'NewName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx2.names == ["NewName1", "NewName2"] - multi_idx3 = multi_idx.copy(name=['NewName1', 'NewName2']) + multi_idx3 = multi_idx.copy(name=["NewName1", "NewName2"]) assert multi_idx.equals(multi_idx3) - assert multi_idx.names == ['MyName1', 'MyName2'] - assert multi_idx3.names == 
['NewName1', 'NewName2'] + assert multi_idx.names == ["MyName1", "MyName2"] + assert multi_idx3.names == ["NewName1", "NewName2"] def test_names(idx, index_names): @@ -100,13 +97,17 @@ def test_names(idx, index_names): major_axis, minor_axis = idx.levels major_codes, minor_codes = idx.codes with pytest.raises(ValueError, match="^Length of names"): - MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=['first']) + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first"], + ) with pytest.raises(ValueError, match="^Length of names"): - MultiIndex(levels=[major_axis, minor_axis], - codes=[major_codes, minor_codes], - names=['first', 'second', 'third']) + MultiIndex( + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["first", "second", "third"], + ) # names are assigned index.names = ["a", "b"] @@ -117,6 +118,6 @@ def test_names(idx, index_names): def test_duplicate_level_names_access_raises(idx): # GH19029 - idx.names = ['foo', 'foo'] - with pytest.raises(ValueError, match='name foo occurs multiple times'): - idx._get_level_number('foo') + idx.names = ["foo", "foo"] + with pytest.raises(ValueError, match="name foo occurs multiple times"): + idx._get_level_number("foo") diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index b75396a313666..d6799e86683a9 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -8,10 +8,10 @@ def test_partial_string_timestamp_multiindex(): # GH10331 - dr = pd.date_range('2016-01-01', '2016-01-03', freq='12H') - abc = ['a', 'b', 'c'] + dr = pd.date_range("2016-01-01", "2016-01-03", freq="12H") + abc = ["a", "b", "c"] ix = pd.MultiIndex.from_product([dr, abc]) - df = pd.DataFrame({'c1': range(0, 15)}, index=ix) + df = pd.DataFrame({"c1": range(0, 15)}, index=ix) idx = pd.IndexSlice # c1 @@ -32,23 +32,21 @@ def test_partial_string_timestamp_multiindex(): # c 14 # partial string matching on a single index - for df_swap in (df.swaplevel(), - df.swaplevel(0), - df.swaplevel(0, 1)): + for df_swap in (df.swaplevel(), df.swaplevel(0), df.swaplevel(0, 1)): df_swap = df_swap.sort_index() - just_a = df_swap.loc['a'] - result = just_a.loc['2016-01-01'] - expected = df.loc[idx[:, 'a'], :].iloc[0:2] + just_a = df_swap.loc["a"] + result = just_a.loc["2016-01-01"] + expected = df.loc[idx[:, "a"], :].iloc[0:2] expected.index = expected.index.droplevel(1) tm.assert_frame_equal(result, expected) # indexing with IndexSlice - result = df.loc[idx['2016-01-01':'2016-02-01', :], :] + result = df.loc[idx["2016-01-01":"2016-02-01", :], :] expected = df tm.assert_frame_equal(result, expected) # match on secondary index - result = df_swap.loc[idx[:, '2016-01-01':'2016-01-01'], :] + result = df_swap.loc[idx[:, "2016-01-01":"2016-01-01"], :] expected = df_swap.iloc[[0, 1, 5, 6, 10, 11]] tm.assert_frame_equal(result, expected) @@ -57,42 +55,42 @@ def test_partial_string_timestamp_multiindex(): # in multi-indexes. This would amount to selecting a scalar from a # column. 
with pytest.raises(KeyError): - df['2016-01-01'] + df["2016-01-01"] # partial string match on year only - result = df.loc['2016'] + result = df.loc["2016"] expected = df tm.assert_frame_equal(result, expected) # partial string match on date - result = df.loc['2016-01-01'] + result = df.loc["2016-01-01"] expected = df.iloc[0:6] tm.assert_frame_equal(result, expected) # partial string match on date and hour, from middle - result = df.loc['2016-01-02 12'] + result = df.loc["2016-01-02 12"] expected = df.iloc[9:12] tm.assert_frame_equal(result, expected) # partial string match on secondary index - result = df_swap.loc[idx[:, '2016-01-02'], :] + result = df_swap.loc[idx[:, "2016-01-02"], :] expected = df_swap.iloc[[2, 3, 7, 8, 12, 13]] tm.assert_frame_equal(result, expected) # tuple selector with partial string match on date - result = df.loc[('2016-01-01', 'a'), :] + result = df.loc[("2016-01-01", "a"), :] expected = df.iloc[[0, 3]] tm.assert_frame_equal(result, expected) # Slicing date on first level should break (of course) with pytest.raises(KeyError): - df_swap.loc['2016-01-01'] + df_swap.loc["2016-01-01"] # GH12685 (partial string with daily resolution or below) - dr = date_range('2013-01-01', periods=100, freq='D') - ix = MultiIndex.from_product([dr, ['a', 'b']]) - df = DataFrame(np.random.randn(200, 1), columns=['A'], index=ix) + dr = date_range("2013-01-01", periods=100, freq="D") + ix = MultiIndex.from_product([dr, ["a", "b"]]) + df = DataFrame(np.random.randn(200, 1), columns=["A"], index=ix) - result = df.loc[idx['2013-03':'2013-03', :], :] + result = df.loc[idx["2013-03":"2013-03", :], :] expected = df.iloc[118:180] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 6f13a374f6cc9..88de4d1e80386 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -22,13 +22,13 @@ def test_reindex(idx): def test_reindex_level(idx): - index = Index(['one']) + index = Index(["one"]) - target, indexer = idx.reindex(index, level='second') - target2, indexer2 = index.reindex(idx, level='second') + target, indexer = idx.reindex(index, level="second") + target2, indexer2 = index.reindex(idx, level="second") - exp_index = idx.join(index, level='second', how='right') - exp_index2 = idx.join(index, level='second', how='left') + exp_index = idx.join(index, level="second", how="right") + exp_index2 = idx.join(index, level="second", how="left") assert target.equals(exp_index) exp_indexer = np.array([0, 2, 4]) @@ -39,10 +39,10 @@ def test_reindex_level(idx): tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False) with pytest.raises(TypeError, match="Fill method not supported"): - idx.reindex(idx, method='pad', level='second') + idx.reindex(idx, method="pad", level="second") with pytest.raises(TypeError, match="Fill method not supported"): - index.reindex(index, method='bfill', level='first') + index.reindex(index, method="bfill", level="first") def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): @@ -61,26 +61,25 @@ def test_reindex_preserves_names_when_target_is_list_or_ndarray(idx): assert idx.reindex(other_dtype.tolist())[0].names == [None, None] assert idx.reindex(other_dtype.values)[0].names == [None, None] - idx.names = ['foo', 'bar'] - assert idx.reindex([])[0].names == ['foo', 'bar'] - assert idx.reindex(np.array([]))[0].names == ['foo', 'bar'] - assert idx.reindex(target.tolist())[0].names == ['foo', 'bar'] - assert 
idx.reindex(target.values)[0].names == ['foo', 'bar'] - assert idx.reindex(other_dtype.tolist())[0].names == ['foo', 'bar'] - assert idx.reindex(other_dtype.values)[0].names == ['foo', 'bar'] + idx.names = ["foo", "bar"] + assert idx.reindex([])[0].names == ["foo", "bar"] + assert idx.reindex(np.array([]))[0].names == ["foo", "bar"] + assert idx.reindex(target.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(target.values)[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.tolist())[0].names == ["foo", "bar"] + assert idx.reindex(other_dtype.values)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_names_when_target_is_list_or_array(): # GH7774 - idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']], - names=['foo', 'bar']) - assert idx.reindex([], level=0)[0].names == ['foo', 'bar'] - assert idx.reindex([], level=1)[0].names == ['foo', 'bar'] + idx = pd.MultiIndex.from_product([[0, 1], ["a", "b"]], names=["foo", "bar"]) + assert idx.reindex([], level=0)[0].names == ["foo", "bar"] + assert idx.reindex([], level=1)[0].names == ["foo", "bar"] def test_reindex_lvl_preserves_type_if_target_is_empty_list_or_array(): # GH7774 - idx = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) assert idx.reindex([], level=0)[0].levels[0].dtype.type == np.int64 assert idx.reindex([], level=1)[0].levels[1].dtype.type == np.object_ @@ -92,8 +91,8 @@ def test_reindex_base(idx): actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) - with pytest.raises(ValueError, match='Invalid fill method'): - idx.get_indexer(idx, method='invalid') + with pytest.raises(ValueError, match="Invalid fill method"): + idx.get_indexer(idx, method="invalid") def test_reindex_non_unique(): @@ -101,6 +100,6 @@ def test_reindex_non_unique(): a = pd.Series(np.arange(4), index=idx) new_idx = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - msg = 'cannot handle a non-unique multi-index!' + msg = "cannot handle a non-unique multi-index!" 
with pytest.raises(ValueError, match=msg): a.reindex(new_idx) diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 87a2751631fe6..a30e6f33d1499 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -8,67 +8,75 @@ def test_insert(idx): # key contained in all levels - new_index = idx.insert(0, ('bar', 'two')) + new_index = idx.insert(0, ("bar", "two")) assert new_index.equal_levels(idx) - assert new_index[0] == ('bar', 'two') + assert new_index[0] == ("bar", "two") # key not contained in all levels - new_index = idx.insert(0, ('abc', 'three')) + new_index = idx.insert(0, ("abc", "three")) - exp0 = Index(list(idx.levels[0]) + ['abc'], name='first') + exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) - exp1 = Index(list(idx.levels[1]) + ['three'], name='second') + exp1 = Index(list(idx.levels[1]) + ["three"], name="second") tm.assert_index_equal(new_index.levels[1], exp1) - assert new_index[0] == ('abc', 'three') + assert new_index[0] == ("abc", "three") # key wrong length msg = "Item must have length equal to number of levels" with pytest.raises(ValueError, match=msg): - idx.insert(0, ('foo2',)) - - left = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1]], - columns=['1st', '2nd', '3rd']) - left.set_index(['1st', '2nd'], inplace=True) - ts = left['3rd'].copy(deep=True) - - left.loc[('b', 'x'), '3rd'] = 2 - left.loc[('b', 'a'), '3rd'] = -1 - left.loc[('b', 'b'), '3rd'] = 3 - left.loc[('a', 'x'), '3rd'] = 4 - left.loc[('a', 'w'), '3rd'] = 5 - left.loc[('a', 'a'), '3rd'] = 6 - - ts.loc[('b', 'x')] = 2 - ts.loc['b', 'a'] = -1 - ts.loc[('b', 'b')] = 3 - ts.loc['a', 'x'] = 4 - ts.loc[('a', 'w')] = 5 - ts.loc['a', 'a'] = 6 - - right = pd.DataFrame([['a', 'b', 0], ['b', 'd', 1], ['b', 'x', 2], - ['b', 'a', -1], ['b', 'b', 3], ['a', 'x', 4], - ['a', 'w', 5], ['a', 'a', 6]], - columns=['1st', '2nd', '3rd']) - right.set_index(['1st', '2nd'], inplace=True) + idx.insert(0, ("foo2",)) + + left = pd.DataFrame([["a", "b", 0], ["b", "d", 1]], columns=["1st", "2nd", "3rd"]) + left.set_index(["1st", "2nd"], inplace=True) + ts = left["3rd"].copy(deep=True) + + left.loc[("b", "x"), "3rd"] = 2 + left.loc[("b", "a"), "3rd"] = -1 + left.loc[("b", "b"), "3rd"] = 3 + left.loc[("a", "x"), "3rd"] = 4 + left.loc[("a", "w"), "3rd"] = 5 + left.loc[("a", "a"), "3rd"] = 6 + + ts.loc[("b", "x")] = 2 + ts.loc["b", "a"] = -1 + ts.loc[("b", "b")] = 3 + ts.loc["a", "x"] = 4 + ts.loc[("a", "w")] = 5 + ts.loc["a", "a"] = 6 + + right = pd.DataFrame( + [ + ["a", "b", 0], + ["b", "d", 1], + ["b", "x", 2], + ["b", "a", -1], + ["b", "b", 3], + ["a", "x", 4], + ["a", "w", 5], + ["a", "a", 6], + ], + columns=["1st", "2nd", "3rd"], + ) + right.set_index(["1st", "2nd"], inplace=True) # FIXME data types changes to float because # of intermediate nan insertion; tm.assert_frame_equal(left, right, check_dtype=False) - tm.assert_series_equal(ts, right['3rd']) + tm.assert_series_equal(ts, right["3rd"]) # GH9250 - idx = [('test1', i) for i in range(5)] + \ - [('test2', i) for i in range(6)] + \ - [('test', 17), ('test', 18)] + idx = ( + [("test1", i) for i in range(5)] + + [("test2", i) for i in range(6)] + + [("test", 17), ("test", 18)] + ) - left = pd.Series(np.linspace(0, 10, 11), - pd.MultiIndex.from_tuples(idx[:-2])) + left = pd.Series(np.linspace(0, 10, 11), pd.MultiIndex.from_tuples(idx[:-2])) - left.loc[('test', 17)] = 11 - left.loc[('test', 18)] = 12 + left.loc[("test", 17)] = 11 + 
left.loc[("test", 18)] = 12 - right = pd.Series(np.linspace(0, 12, 13), - pd.MultiIndex.from_tuples(idx)) + right = pd.Series(np.linspace(0, 12, 13), pd.MultiIndex.from_tuples(idx)) tm.assert_series_equal(left, right) @@ -89,12 +97,10 @@ def test_append(idx): def test_repeat(): reps = 2 numbers = [1, 2, 3] - names = np.array(['foo', 'bar']) + names = np.array(["foo", "bar"]) - m = MultiIndex.from_product([ - numbers, names], names=names) - expected = MultiIndex.from_product([ - numbers, names.repeat(reps)], names=names) + m = MultiIndex.from_product([numbers, names], names=names) + expected = MultiIndex.from_product([numbers, names.repeat(reps)], names=names) tm.assert_index_equal(m.repeat(reps), expected) diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 640e121d32b3a..835784054261e 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -8,8 +8,9 @@ @pytest.mark.parametrize("case", [0.5, "xxx"]) @pytest.mark.parametrize("sort", [None, False]) -@pytest.mark.parametrize("method", ["intersection", "union", - "difference", "symmetric_difference"]) +@pytest.mark.parametrize( + "method", ["intersection", "union", "difference", "symmetric_difference"] +) def test_set_ops_error_cases(idx, case, sort, method): # non-iterable input msg = "Input must be Index or array-like" @@ -28,8 +29,7 @@ def test_intersection_base(idx, sort): assert tm.equalContents(intersect, second) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case, sort=sort) if sort is None: @@ -52,8 +52,7 @@ def test_union_base(idx, sort): assert tm.equalContents(union, everything) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.union(case, sort=sort) if sort is None: @@ -78,8 +77,7 @@ def test_difference_base(idx, sort): tm.assert_index_equal(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = idx.difference(case, sort=sort) tm.assert_index_equal(result, answer) @@ -102,8 +100,7 @@ def test_symmetric_difference(idx, sort): tm.assert_index_equal(result, answer) # GH 10149 - cases = [klass(second.values) - for klass in [np.array, Series, list]] + cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: result = first.symmetric_difference(case, sort=sort) tm.assert_index_equal(result, answer) @@ -129,9 +126,7 @@ def test_difference(idx, sort): if sort is None: vals = sorted(vals) - expected = MultiIndex.from_tuples(vals, - sortorder=0, - names=idx.names) + expected = MultiIndex.from_tuples(vals, sortorder=0, names=idx.names) assert isinstance(result, MultiIndex) assert result.equals(expected) @@ -158,7 +153,7 @@ def test_difference(idx, sort): # names not the same chunklet = idx[-3:] - chunklet.names = ['foo', 'baz'] + chunklet.names = ["foo", "baz"] result = first.difference(chunklet, sort=sort) assert result.names == (None, None) @@ -176,9 +171,10 @@ def test_difference(idx, sort): assert first.names == result.names # name from non-empty array - result = first.difference([('foo', 'one')], sort=sort) - expected = pd.MultiIndex.from_tuples([('bar', 'one'), ('baz', 
'two'), ( - 'foo', 'two'), ('qux', 'one'), ('qux', 'two')]) + result = first.difference([("foo", "one")], sort=sort) + expected = pd.MultiIndex.from_tuples( + [("bar", "one"), ("baz", "two"), ("foo", "two"), ("qux", "one"), ("qux", "two")] + ) expected.names = first.names assert first.names == result.names @@ -189,7 +185,7 @@ def test_difference(idx, sort): def test_difference_sort_special(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) # sort=None, the default result = idx.difference([]) tm.assert_index_equal(result, idx) @@ -198,19 +194,17 @@ def test_difference_sort_special(): @pytest.mark.xfail(reason="Not implemented.") def test_difference_sort_special_true(): # TODO decide on True behaviour - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) result = idx.difference([], sort=True) - expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + expected = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(result, expected) def test_difference_sort_incomparable(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], - ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) - other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], - ['c', 'd']]) + other = pd.MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) # sort=None, the default # MultiIndex.difference deviates here from other difference # implementations in not catching the TypeError @@ -226,10 +220,8 @@ def test_difference_sort_incomparable(): def test_difference_sort_incomparable_true(): # TODO decide on True behaviour # # sort=True, raises - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], - ['a', 'b']]) - other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], - ['c', 'd']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000"), 2], ["a", "b"]]) + other = pd.MultiIndex.from_product([[3, pd.Timestamp("2000"), 4], ["c", "d"]]) with pytest.raises(TypeError): idx.difference(other, sort=True) @@ -299,7 +291,7 @@ def test_intersection(idx, sort): def test_intersect_equal_sort(): # GH-24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) @@ -307,15 +299,15 @@ def test_intersect_equal_sort(): @pytest.mark.xfail(reason="Not implemented.") def test_intersect_equal_sort_true(): # TODO decide on True behaviour - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) - sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) -@pytest.mark.parametrize('slice_', [slice(None), slice(0)]) +@pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_other_empty(slice_): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) # default, sort=None other = idx[slice_] @@ -331,16 +323,16 @@ def test_union_sort_other_empty(slice_): def test_union_sort_other_empty_sort(slice_): # TODO decide on True behaviour # # sort=True - idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + 
idx = pd.MultiIndex.from_product([[1, 0], ["a", "b"]]) other = idx[:0] result = idx.union(other, sort=True) - expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + expected = pd.MultiIndex.from_product([[0, 1], ["a", "b"]]) tm.assert_index_equal(result, expected) def test_union_sort_other_incomparable(): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) # default, sort=None result = idx.union(idx[:1]) @@ -355,16 +347,17 @@ def test_union_sort_other_incomparable(): def test_union_sort_other_incomparable_sort(): # TODO decide on True behaviour # # sort=True - idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) - with pytest.raises(TypeError, match='Cannot compare'): + idx = pd.MultiIndex.from_product([[1, pd.Timestamp("2000")], ["a", "b"]]) + with pytest.raises(TypeError, match="Cannot compare"): idx.union(idx[:1], sort=True) -@pytest.mark.parametrize("method", ['union', 'intersection', 'difference', - 'symmetric_difference']) +@pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] +) def test_setops_disallow_true(method): - idx1 = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) - idx2 = pd.MultiIndex.from_product([['b', 'c'], [1, 2]]) + idx1 = pd.MultiIndex.from_product([["a", "b"], [1, 2]]) + idx2 = pd.MultiIndex.from_product([["b", "c"], [1, 2]]) with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index b3c0bd69475e3..c62bc80cfb53f 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -33,14 +33,20 @@ def test_sortlevel(idx): def test_sortlevel_not_sort_remaining(): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - sorted_idx, _ = mi.sortlevel('A', sort_remaining=False) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + sorted_idx, _ = mi.sortlevel("A", sort_remaining=False) assert sorted_idx.equals(mi) def test_sortlevel_deterministic(): - tuples = [('bar', 'one'), ('foo', 'two'), ('qux', 'two'), - ('foo', 'one'), ('baz', 'two'), ('qux', 'one')] + tuples = [ + ("bar", "one"), + ("foo", "two"), + ("qux", "two"), + ("foo", "one"), + ("baz", "two"), + ("qux", "one"), + ] index = MultiIndex.from_tuples(tuples) @@ -84,55 +90,55 @@ def test_numpy_argsort(idx): msg = "the 'kind' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(idx, kind='mergesort') + np.argsort(idx, kind="mergesort") msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(idx, order=('a', 'b')) + np.argsort(idx, order=("a", "b")) def test_unsortedindex(): # GH 11897 - mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), - ('x', 'b'), ('y', 'a'), ('z', 'b')], - names=['one', 'two']) - df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, - columns=['one', 'two']) + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) + df = pd.DataFrame([[i, 10 * i] for i in range(6)], index=mi, columns=["one", "two"]) # GH 16734: not sorted, but no real slicing - result = df.loc(axis=0)['z', 'a'] + result = df.loc(axis=0)["z", "a"] expected = df.iloc[0] 
tm.assert_series_equal(result, expected) with pytest.raises(UnsortedIndexError): - df.loc(axis=0)['z', slice('a')] + df.loc(axis=0)["z", slice("a")] df.sort_index(inplace=True) - assert len(df.loc(axis=0)['z', :]) == 2 + assert len(df.loc(axis=0)["z", :]) == 2 with pytest.raises(KeyError): - df.loc(axis=0)['q', :] + df.loc(axis=0)["q", :] def test_unsortedindex_doc_examples(): # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa - dfm = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) + dfm = DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) - dfm = dfm.set_index(['jim', 'joe']) + dfm = dfm.set_index(["jim", "joe"]) with tm.assert_produces_warning(PerformanceWarning): - dfm.loc[(1, 'z')] + dfm.loc[(1, "z")] with pytest.raises(UnsortedIndexError): - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(0, "y"):(1, "z")] assert not dfm.index.is_lexsorted() assert dfm.index.lexsort_depth == 1 # sort it dfm = dfm.sort_index() - dfm.loc[(1, 'z')] - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(1, "z")] + dfm.loc[(0, "y"):(1, "z")] assert dfm.index.is_lexsorted() assert dfm.index.lexsort_depth == 2 @@ -141,9 +147,7 @@ def test_unsortedindex_doc_examples(): def test_reconstruct_sort(): # starts off lexsorted & monotonic - mi = MultiIndex.from_arrays([ - ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] - ]) + mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert mi.is_lexsorted() assert mi.is_monotonic @@ -156,9 +160,10 @@ def test_reconstruct_sort(): assert Index(mi.values).equals(Index(recons.values)) # cannot convert to lexsorted - mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'), - ('x', 'b'), ('y', 'a'), ('z', 'b')], - names=['one', 'two']) + mi = pd.MultiIndex.from_tuples( + [("z", "a"), ("x", "a"), ("y", "b"), ("x", "b"), ("y", "a"), ("z", "b")], + names=["one", "two"], + ) assert not mi.is_lexsorted() assert not mi.is_monotonic @@ -170,9 +175,11 @@ def test_reconstruct_sort(): assert Index(mi.values).equals(Index(recons.values)) # cannot convert to lexsorted - mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]], - codes=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=['col1', 'col2']) + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) assert not mi.is_lexsorted() assert not mi.is_monotonic @@ -186,25 +193,27 @@ def test_reconstruct_sort(): def test_reconstruct_remove_unused(): # xref to GH 2770 - df = DataFrame([['deleteMe', 1, 9], - ['keepMe', 2, 9], - ['keepMeToo', 3, 9]], - columns=['first', 'second', 'third']) - df2 = df.set_index(['first', 'second'], drop=False) - df2 = df2[df2['first'] != 'deleteMe'] + df = DataFrame( + [["deleteMe", 1, 9], ["keepMe", 2, 9], ["keepMeToo", 3, 9]], + columns=["first", "second", "third"], + ) + df2 = df.set_index(["first", "second"], drop=False) + df2 = df2[df2["first"] != "deleteMe"] # removed levels are there - expected = MultiIndex(levels=[['deleteMe', 'keepMe', 'keepMeToo'], - [1, 2, 3]], - codes=[[1, 2], [1, 2]], - names=['first', 'second']) + expected = MultiIndex( + levels=[["deleteMe", "keepMe", "keepMeToo"], [1, 2, 3]], + codes=[[1, 2], [1, 2]], + names=["first", "second"], + ) result = df2.index tm.assert_index_equal(result, expected) - expected = MultiIndex(levels=[['keepMe', 'keepMeToo'], - [2, 3]], - codes=[[0, 1], [0, 1]], - names=['first', 'second']) + expected = MultiIndex( + levels=[["keepMe", "keepMeToo"], [2, 3]], + codes=[[0, 1], 
[0, 1]], + names=["first", "second"], + ) result = df2.index.remove_unused_levels() tm.assert_index_equal(result, expected) @@ -214,10 +223,9 @@ def test_reconstruct_remove_unused(): assert result2.is_(result) -@pytest.mark.parametrize('first_type,second_type', [ - ('int64', 'int64'), - ('datetime64[D]', 'str') -]) +@pytest.mark.parametrize( + "first_type,second_type", [("int64", "int64"), ("datetime64[D]", "str")] +) def test_remove_unused_levels_large(first_type, second_type): # GH16556 @@ -227,11 +235,14 @@ def test_remove_unused_levels_large(first_type, second_type): rng = np.random.RandomState(4) # seed is arbitrary value that works size = 1 << 16 - df = DataFrame(dict( - first=rng.randint(0, 1 << 13, size).astype(first_type), - second=rng.randint(0, 1 << 10, size).astype(second_type), - third=rng.rand(size))) - df = df.groupby(['first', 'second']).sum() + df = DataFrame( + dict( + first=rng.randint(0, 1 << 13, size).astype(first_type), + second=rng.randint(0, 1 << 10, size).astype(second_type), + third=rng.rand(size), + ) + ) + df = df.groupby(["first", "second"]).sum() df = df[df.third < 0.1] result = df.index.remove_unused_levels() @@ -239,23 +250,24 @@ def test_remove_unused_levels_large(first_type, second_type): assert len(result.levels[1]) < len(df.index.levels[1]) assert result.equals(df.index) - expected = df.reset_index().set_index(['first', 'second']).index + expected = df.reset_index().set_index(["first", "second"]).index tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('level0', [['a', 'd', 'b'], - ['a', 'd', 'b', 'unused']]) -@pytest.mark.parametrize('level1', [['w', 'x', 'y', 'z'], - ['w', 'x', 'y', 'z', 'unused']]) +@pytest.mark.parametrize("level0", [["a", "d", "b"], ["a", "d", "b", "unused"]]) +@pytest.mark.parametrize( + "level1", [["w", "x", "y", "z"], ["w", "x", "y", "z", "unused"]] +) def test_remove_unused_nan(level0, level1): # GH 18417 - mi = pd.MultiIndex(levels=[level0, level1], - codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]]) + mi = pd.MultiIndex( + levels=[level0, level1], codes=[[0, 2, -1, 1, -1], [0, 1, 2, 3, 2]] + ) result = mi.remove_unused_levels() tm.assert_index_equal(result, mi) for level in 0, 1: - assert('unused' not in result.levels[level]) + assert "unused" not in result.levels[level] def test_argsort(idx): diff --git a/pandas/tests/indexes/period/test_arithmetic.py b/pandas/tests/indexes/period/test_arithmetic.py index d41e33a241141..1057ca7bbd662 100644 --- a/pandas/tests/indexes/period/test_arithmetic.py +++ b/pandas/tests/indexes/period/test_arithmetic.py @@ -11,96 +11,111 @@ class TestPeriodIndexArithmetic: # PeriodIndex.shift is used by __add__ and __sub__ def test_pi_shift_ndarray(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex(['2011-02', '2010-12', 'NaT', '2010-12'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2010-12", "NaT", "2010-12"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) def test_shift(self): - pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='A', start='1/1/2002', 
end='12/1/2010') + pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="A", start="1/1/2002", end="12/1/2010") tm.assert_index_equal(pi1.shift(0), pi1) assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='A', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='A', start='1/1/2000', end='12/1/2008') + pi1 = period_range(freq="A", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="A", start="1/1/2000", end="12/1/2008") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) - pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='M', start='2/1/2001', end='1/1/2010') + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="2/1/2001", end="1/1/2010") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='M', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='M', start='12/1/2000', end='11/1/2009') + pi1 = period_range(freq="M", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="M", start="12/1/2000", end="11/1/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) - pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='D', start='1/2/2001', end='12/2/2009') + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="1/2/2001", end="12/2/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(1), pi2) - pi1 = period_range(freq='D', start='1/1/2001', end='12/1/2009') - pi2 = period_range(freq='D', start='12/31/2000', end='11/30/2009') + pi1 = period_range(freq="D", start="1/1/2001", end="12/1/2009") + pi2 = period_range(freq="D", start="12/31/2000", end="11/30/2009") assert len(pi1) == len(pi2) tm.assert_index_equal(pi1.shift(-1), pi2) def test_shift_corner_cases(self): # GH#9903 - idx = pd.PeriodIndex([], name='xxx', freq='H') + idx = pd.PeriodIndex([], name="xxx", freq="H") with pytest.raises(TypeError): # period shift doesn't accept freq - idx.shift(1, freq='H') + idx.shift(1, freq="H") tm.assert_index_equal(idx.shift(0), idx) tm.assert_index_equal(idx.shift(3), idx) - idx = pd.PeriodIndex(['2011-01-01 10:00', '2011-01-01 11:00' - '2011-01-01 12:00'], name='xxx', freq='H') + idx = pd.PeriodIndex( + ["2011-01-01 10:00", "2011-01-01 11:00" "2011-01-01 12:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(0), idx) - exp = pd.PeriodIndex(['2011-01-01 13:00', '2011-01-01 14:00' - '2011-01-01 15:00'], name='xxx', freq='H') + exp = pd.PeriodIndex( + ["2011-01-01 13:00", "2011-01-01 14:00" "2011-01-01 15:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(3), exp) - exp = pd.PeriodIndex(['2011-01-01 07:00', '2011-01-01 08:00' - '2011-01-01 09:00'], name='xxx', freq='H') + exp = pd.PeriodIndex( + ["2011-01-01 07:00", "2011-01-01 08:00" "2011-01-01 09:00"], + name="xxx", + freq="H", + ) tm.assert_index_equal(idx.shift(-3), exp) def test_shift_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], - freq='M', name='idx') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx" + ) result = idx.shift(1) - expected = PeriodIndex(['2011-02', '2011-03', 'NaT', '2011-05'], - freq='M', name='idx') + expected = PeriodIndex( + ["2011-02", "2011-03", "NaT", "2011-05"], freq="M", name="idx" + ) tm.assert_index_equal(result, expected) assert result.name == 
expected.name def test_shift_gh8083(self): # test shift for PeriodIndex # GH#8083 - drange = pd.period_range('20130101', periods=5, freq='D') + drange = pd.period_range("20130101", periods=5, freq="D") result = drange.shift(1) - expected = PeriodIndex(['2013-01-02', '2013-01-03', '2013-01-04', - '2013-01-05', '2013-01-06'], freq='D') + expected = PeriodIndex( + ["2013-01-02", "2013-01-03", "2013-01-04", "2013-01-05", "2013-01-06"], + freq="D", + ) tm.assert_index_equal(result, expected) def test_shift_periods(self): # GH #22458 : argument 'n' was deprecated in favor of 'periods' - idx = period_range(freq='A', start='1/1/2001', end='12/1/2009') + idx = period_range(freq="A", start="1/1/2001", end="12/1/2009") tm.assert_index_equal(idx.shift(periods=0), idx) tm.assert_index_equal(idx.shift(0), idx) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): tm.assert_index_equal(idx.shift(n=0), idx) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index 373f42b930425..2a6e84da229e1 100644 --- a/pandas/tests/indexes/period/test_asfreq.py +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -7,148 +7,143 @@ class TestPeriodIndex: - def test_asfreq(self): - pi1 = period_range(freq='A', start='1/1/2001', end='1/1/2001') - pi2 = period_range(freq='Q', start='1/1/2001', end='1/1/2001') - pi3 = period_range(freq='M', start='1/1/2001', end='1/1/2001') - pi4 = period_range(freq='D', start='1/1/2001', end='1/1/2001') - pi5 = period_range(freq='H', start='1/1/2001', end='1/1/2001 00:00') - pi6 = period_range(freq='Min', start='1/1/2001', end='1/1/2001 00:00') - pi7 = period_range(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') - - assert pi1.asfreq('Q', 'S') == pi2 - assert pi1.asfreq('Q', 's') == pi2 - assert pi1.asfreq('M', 'start') == pi3 - assert pi1.asfreq('D', 'StarT') == pi4 - assert pi1.asfreq('H', 'beGIN') == pi5 - assert pi1.asfreq('Min', 'S') == pi6 - assert pi1.asfreq('S', 'S') == pi7 - - assert pi2.asfreq('A', 'S') == pi1 - assert pi2.asfreq('M', 'S') == pi3 - assert pi2.asfreq('D', 'S') == pi4 - assert pi2.asfreq('H', 'S') == pi5 - assert pi2.asfreq('Min', 'S') == pi6 - assert pi2.asfreq('S', 'S') == pi7 - - assert pi3.asfreq('A', 'S') == pi1 - assert pi3.asfreq('Q', 'S') == pi2 - assert pi3.asfreq('D', 'S') == pi4 - assert pi3.asfreq('H', 'S') == pi5 - assert pi3.asfreq('Min', 'S') == pi6 - assert pi3.asfreq('S', 'S') == pi7 - - assert pi4.asfreq('A', 'S') == pi1 - assert pi4.asfreq('Q', 'S') == pi2 - assert pi4.asfreq('M', 'S') == pi3 - assert pi4.asfreq('H', 'S') == pi5 - assert pi4.asfreq('Min', 'S') == pi6 - assert pi4.asfreq('S', 'S') == pi7 - - assert pi5.asfreq('A', 'S') == pi1 - assert pi5.asfreq('Q', 'S') == pi2 - assert pi5.asfreq('M', 'S') == pi3 - assert pi5.asfreq('D', 'S') == pi4 - assert pi5.asfreq('Min', 'S') == pi6 - assert pi5.asfreq('S', 'S') == pi7 - - assert pi6.asfreq('A', 'S') == pi1 - assert pi6.asfreq('Q', 'S') == pi2 - assert pi6.asfreq('M', 'S') == pi3 - assert pi6.asfreq('D', 'S') == pi4 - assert pi6.asfreq('H', 'S') == pi5 - assert pi6.asfreq('S', 'S') == pi7 - - assert pi7.asfreq('A', 'S') == pi1 - assert pi7.asfreq('Q', 'S') == pi2 - assert pi7.asfreq('M', 'S') == pi3 - assert pi7.asfreq('D', 'S') == pi4 - assert pi7.asfreq('H', 'S') == pi5 - assert pi7.asfreq('Min', 'S') == pi6 + pi1 = period_range(freq="A", start="1/1/2001", end="1/1/2001") + pi2 = period_range(freq="Q", start="1/1/2001", end="1/1/2001") + pi3 = 
period_range(freq="M", start="1/1/2001", end="1/1/2001") + pi4 = period_range(freq="D", start="1/1/2001", end="1/1/2001") + pi5 = period_range(freq="H", start="1/1/2001", end="1/1/2001 00:00") + pi6 = period_range(freq="Min", start="1/1/2001", end="1/1/2001 00:00") + pi7 = period_range(freq="S", start="1/1/2001", end="1/1/2001 00:00:00") + + assert pi1.asfreq("Q", "S") == pi2 + assert pi1.asfreq("Q", "s") == pi2 + assert pi1.asfreq("M", "start") == pi3 + assert pi1.asfreq("D", "StarT") == pi4 + assert pi1.asfreq("H", "beGIN") == pi5 + assert pi1.asfreq("Min", "S") == pi6 + assert pi1.asfreq("S", "S") == pi7 + + assert pi2.asfreq("A", "S") == pi1 + assert pi2.asfreq("M", "S") == pi3 + assert pi2.asfreq("D", "S") == pi4 + assert pi2.asfreq("H", "S") == pi5 + assert pi2.asfreq("Min", "S") == pi6 + assert pi2.asfreq("S", "S") == pi7 + + assert pi3.asfreq("A", "S") == pi1 + assert pi3.asfreq("Q", "S") == pi2 + assert pi3.asfreq("D", "S") == pi4 + assert pi3.asfreq("H", "S") == pi5 + assert pi3.asfreq("Min", "S") == pi6 + assert pi3.asfreq("S", "S") == pi7 + + assert pi4.asfreq("A", "S") == pi1 + assert pi4.asfreq("Q", "S") == pi2 + assert pi4.asfreq("M", "S") == pi3 + assert pi4.asfreq("H", "S") == pi5 + assert pi4.asfreq("Min", "S") == pi6 + assert pi4.asfreq("S", "S") == pi7 + + assert pi5.asfreq("A", "S") == pi1 + assert pi5.asfreq("Q", "S") == pi2 + assert pi5.asfreq("M", "S") == pi3 + assert pi5.asfreq("D", "S") == pi4 + assert pi5.asfreq("Min", "S") == pi6 + assert pi5.asfreq("S", "S") == pi7 + + assert pi6.asfreq("A", "S") == pi1 + assert pi6.asfreq("Q", "S") == pi2 + assert pi6.asfreq("M", "S") == pi3 + assert pi6.asfreq("D", "S") == pi4 + assert pi6.asfreq("H", "S") == pi5 + assert pi6.asfreq("S", "S") == pi7 + + assert pi7.asfreq("A", "S") == pi1 + assert pi7.asfreq("Q", "S") == pi2 + assert pi7.asfreq("M", "S") == pi3 + assert pi7.asfreq("D", "S") == pi4 + assert pi7.asfreq("H", "S") == pi5 + assert pi7.asfreq("Min", "S") == pi6 msg = "How must be one of S or E" with pytest.raises(ValueError, match=msg): - pi7.asfreq('T', 'foo') - result1 = pi1.asfreq('3M') - result2 = pi1.asfreq('M') - expected = period_range(freq='M', start='2001-12', end='2001-12') + pi7.asfreq("T", "foo") + result1 = pi1.asfreq("3M") + result2 = pi1.asfreq("M") + expected = period_range(freq="M", start="2001-12", end="2001-12") tm.assert_numpy_array_equal(result1.asi8, expected.asi8) - assert result1.freqstr == '3M' + assert result1.freqstr == "3M" tm.assert_numpy_array_equal(result2.asi8, expected.asi8) - assert result2.freqstr == 'M' + assert result2.freqstr == "M" def test_asfreq_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') - result = idx.asfreq(freq='Q') - expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') + idx = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-04"], freq="M") + result = idx.asfreq(freq="Q") + expected = PeriodIndex(["2011Q1", "2011Q1", "NaT", "2011Q2"], freq="Q") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', '3D']) + @pytest.mark.parametrize("freq", ["D", "3D"]) def test_asfreq_mult_pi(self, freq): - pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') + pi = PeriodIndex(["2001-01", "2001-02", "NaT", "2001-03"], freq="2M") result = pi.asfreq(freq) - exp = PeriodIndex(['2001-02-28', '2001-03-31', 'NaT', - '2001-04-30'], freq=freq) + exp = PeriodIndex(["2001-02-28", "2001-03-31", "NaT", "2001-04-30"], freq=freq) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - 
result = pi.asfreq(freq, how='S') - exp = PeriodIndex(['2001-01-01', '2001-02-01', 'NaT', - '2001-03-01'], freq=freq) + result = pi.asfreq(freq, how="S") + exp = PeriodIndex(["2001-01-01", "2001-02-01", "NaT", "2001-03-01"], freq=freq) tm.assert_index_equal(result, exp) assert result.freq == exp.freq def test_asfreq_combined_pi(self): - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): + pi = pd.PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="25H") + for freq, how in zip(["1D1H", "1H1D"], ["S", "E"]): result = pi.asfreq(freq, how=how) tm.assert_index_equal(result, exp) assert result.freq == exp.freq - for freq in ['1D1H', '1H1D']: - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H') - exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], - freq='H') + for freq in ["1D1H", "1H1D"]: + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H") + exp = PeriodIndex(["2001-01-02 00:00", "2001-01-03 02:00", "NaT"], freq="H") tm.assert_index_equal(result, exp) assert result.freq == exp.freq - pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', - 'NaT'], freq=freq) - result = pi.asfreq('H', how='S') - exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], - freq='H') + pi = pd.PeriodIndex( + ["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq=freq + ) + result = pi.asfreq("H", how="S") + exp = PeriodIndex(["2001-01-01 00:00", "2001-01-02 02:00", "NaT"], freq="H") tm.assert_index_equal(result, exp) assert result.freq == exp.freq def test_asfreq_ts(self): - index = period_range(freq='A', start='1/1/2001', end='12/31/2010') + index = period_range(freq="A", start="1/1/2001", end="12/31/2010") ts = Series(np.random.randn(len(index)), index=index) df = DataFrame(np.random.randn(len(index), 3), index=index) - result = ts.asfreq('D', how='end') - df_result = df.asfreq('D', how='end') - exp_index = index.asfreq('D', how='end') + result = ts.asfreq("D", how="end") + df_result = df.asfreq("D", how="end") + exp_index = index.asfreq("D", how="end") assert len(result) == len(ts) tm.assert_index_equal(result.index, exp_index) tm.assert_index_equal(df_result.index, exp_index) - result = ts.asfreq('D', how='start') + result = ts.asfreq("D", how="start") assert len(result) == len(ts) - tm.assert_index_equal(result.index, index.asfreq('D', how='start')) + tm.assert_index_equal(result.index, index.asfreq("D", how="start")) def test_astype_asfreq(self): - pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D') - exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') - tm.assert_index_equal(pi1.asfreq('M'), exp) - tm.assert_index_equal(pi1.astype('period[M]'), exp) - - exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M') - tm.assert_index_equal(pi1.asfreq('3M'), exp) - tm.assert_index_equal(pi1.astype('period[3M]'), exp) + pi1 = PeriodIndex(["2011-01-01", "2011-02-01", "2011-03-01"], freq="D") + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + tm.assert_index_equal(pi1.asfreq("M"), exp) + tm.assert_index_equal(pi1.astype("period[M]"), exp) + + exp = PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="3M") + tm.assert_index_equal(pi1.asfreq("3M"), exp) + 
tm.assert_index_equal(pi1.astype("period[3M]"), exp) diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py index bcf1109358d22..fa57ec2b1f7ca 100644 --- a/pandas/tests/indexes/period/test_astype.py +++ b/pandas/tests/indexes/period/test_astype.py @@ -7,73 +7,73 @@ class TestPeriodIndexAsType: - @pytest.mark.parametrize('dtype', [ - float, 'timedelta64', 'timedelta64[ns]']) + @pytest.mark.parametrize("dtype", [float, "timedelta64", "timedelta64[ns]"]) def test_astype_raises(self, dtype): # GH#13149, GH#13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') - msg = 'Cannot cast PeriodArray to dtype' + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") + msg = "Cannot cast PeriodArray to dtype" with pytest.raises(TypeError, match=msg): idx.astype(dtype) def test_astype_conversion(self): # GH#13149, GH#13209 - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq="D") result = idx.astype(object) - expected = Index([Period('2016-05-16', freq='D')] + - [Period(NaT, freq='D')] * 3, dtype='object') + expected = Index( + [Period("2016-05-16", freq="D")] + [Period(NaT, freq="D")] * 3, + dtype="object", + ) tm.assert_index_equal(result, expected) result = idx.astype(np.int64) - expected = Int64Index([16937] + [-9223372036854775808] * 3, - dtype=np.int64) + expected = Int64Index([16937] + [-9223372036854775808] * 3, dtype=np.int64) tm.assert_index_equal(result, expected) result = idx.astype(str) expected = Index(str(x) for x in idx) tm.assert_index_equal(result, expected) - idx = period_range('1990', '2009', freq='A') - result = idx.astype('i8') + idx = period_range("1990", "2009", freq="A") + result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8)) tm.assert_numpy_array_equal(result.values, idx.asi8) def test_astype_uint(self): - arr = period_range('2000', periods=2) - expected = pd.UInt64Index(np.array([10957, 10958], dtype='uint64')) + arr = period_range("2000", periods=2) + expected = pd.UInt64Index(np.array([10957, 10958], dtype="uint64")) tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) def test_astype_object(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=object) tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") tm.assert_numpy_array_equal(idx.astype(object).values, exp) tm.assert_numpy_array_equal(idx._mpl_repr(), exp) # TODO: de-duplicate this version (from test_ops) with the one above # (from test_period) def test_astype_object2(self): - idx = pd.period_range(start='2013-01-01', periods=4, freq='M', - name='idx') - expected_list = [pd.Period('2013-01-31', freq='M'), - pd.Period('2013-02-28', freq='M'), - pd.Period('2013-03-31', 
freq='M'), - pd.Period('2013-04-30', freq='M')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = pd.period_range(start="2013-01-01", periods=4, freq="M", name="idx") + expected_list = [ + pd.Period("2013-01-31", freq="M"), + pd.Period("2013-02-28", freq="M"), + pd.Period("2013-03-31", freq="M"), + pd.Period("2013-04-30", freq="M"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -81,13 +81,16 @@ def test_astype_object2(self): assert result.name == expected.name assert idx.tolist() == expected_list - idx = PeriodIndex(['2013-01-01', '2013-01-02', 'NaT', - '2013-01-04'], freq='D', name='idx') - expected_list = [pd.Period('2013-01-01', freq='D'), - pd.Period('2013-01-02', freq='D'), - pd.Period('NaT', freq='D'), - pd.Period('2013-01-04', freq='D')] - expected = pd.Index(expected_list, dtype=object, name='idx') + idx = PeriodIndex( + ["2013-01-01", "2013-01-02", "NaT", "2013-01-04"], freq="D", name="idx" + ) + expected_list = [ + pd.Period("2013-01-01", freq="D"), + pd.Period("2013-01-02", freq="D"), + pd.Period("NaT", freq="D"), + pd.Period("2013-01-04", freq="D"), + ] + expected = pd.Index(expected_list, dtype=object, name="idx") result = idx.astype(object) assert isinstance(result, Index) assert result.dtype == object @@ -104,12 +107,13 @@ def test_astype_object2(self): def test_astype_category(self): obj = pd.period_range("2000", periods=2) - result = obj.astype('category') - expected = pd.CategoricalIndex([pd.Period('2000-01-01', freq="D"), - pd.Period('2000-01-02', freq="D")]) + result = obj.astype("category") + expected = pd.CategoricalIndex( + [pd.Period("2000-01-01", freq="D"), pd.Period("2000-01-02", freq="D")] + ) tm.assert_index_equal(result, expected) - result = obj._data.astype('category') + result = obj._data.astype("category") expected = expected.values tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 2b420dd725998..7c10239faad42 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -6,43 +6,38 @@ from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd -from pandas import ( - Index, Period, PeriodIndex, Series, date_range, offsets, period_range) +from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range import pandas.core.indexes.period as period import pandas.util.testing as tm class TestPeriodIndex: - def setup_method(self, method): pass def test_construction_base_constructor(self): # GH 13664 - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='M')] + arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")] tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) - arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + arr = [np.nan, pd.NaT, pd.Period("2011-03", freq="M")] tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.PeriodIndex(np.array(arr))) + tm.assert_index_equal(pd.Index(np.array(arr)), pd.PeriodIndex(np.array(arr))) - arr = [pd.Period('2011-01', freq='M'), pd.NaT, - pd.Period('2011-03', freq='D')] + arr = [pd.Period("2011-01", freq="M"), 
pd.NaT, pd.Period("2011-03", freq="D")] tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) - tm.assert_index_equal(pd.Index(np.array(arr)), - pd.Index(np.array(arr), dtype=object)) + tm.assert_index_equal( + pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object) + ) def test_constructor_use_start_freq(self): # GH #1118 - p = Period('4/2/2012', freq='B') + p = Period("4/2/2012", freq="B") with tm.assert_produces_warning(FutureWarning): index = PeriodIndex(start=p, periods=10) - expected = period_range(start='4/2/2012', periods=10, freq='B') + expected = period_range(start="4/2/2012", periods=10, freq="B") tm.assert_index_equal(index, expected) index = period_range(start=p, periods=10) @@ -54,11 +49,11 @@ def test_constructor_field_arrays(self): years = np.arange(1990, 2010).repeat(4)[2:-2] quarters = np.tile(np.arange(1, 5), 20)[2:-2] - index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') - expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') + index = PeriodIndex(year=years, quarter=quarters, freq="Q-DEC") + expected = period_range("1990Q3", "2009Q2", freq="Q-DEC") tm.assert_index_equal(index, expected) - index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') + index2 = PeriodIndex(year=years, quarter=quarters, freq="2Q-DEC") tm.assert_numpy_array_equal(index.asi8, index2.asi8) index = PeriodIndex(year=years, quarter=quarters) @@ -69,33 +64,40 @@ def test_constructor_field_arrays(self): msg = "Mismatched Period array lengths" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='M') + PeriodIndex(year=years, month=months, freq="M") with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='2M') + PeriodIndex(year=years, month=months, freq="2M") msg = "Can either instantiate from fields or endpoints, but not both" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=years, month=months, freq='M', - start=Period('2007-01', freq='M')) + PeriodIndex( + year=years, month=months, freq="M", start=Period("2007-01", freq="M") + ) years = [2007, 2007, 2007] months = [1, 2, 3] - idx = PeriodIndex(year=years, month=months, freq='M') - exp = period_range('2007-01', periods=3, freq='M') + idx = PeriodIndex(year=years, month=months, freq="M") + exp = period_range("2007-01", periods=3, freq="M") tm.assert_index_equal(idx, exp) def test_constructor_U(self): # U was used as undefined period with pytest.raises(ValueError, match="Invalid frequency: X"): - period_range('2007-1-1', periods=500, freq='X') + period_range("2007-1-1", periods=500, freq="X") def test_constructor_nano(self): - idx = period_range(start=Period(ordinal=1, freq='N'), - end=Period(ordinal=4, freq='N'), freq='N') - exp = PeriodIndex([Period(ordinal=1, freq='N'), - Period(ordinal=2, freq='N'), - Period(ordinal=3, freq='N'), - Period(ordinal=4, freq='N')], freq='N') + idx = period_range( + start=Period(ordinal=1, freq="N"), end=Period(ordinal=4, freq="N"), freq="N" + ) + exp = PeriodIndex( + [ + Period(ordinal=1, freq="N"), + Period(ordinal=2, freq="N"), + Period(ordinal=3, freq="N"), + Period(ordinal=4, freq="N"), + ], + freq="N", + ) tm.assert_index_equal(idx, exp) def test_constructor_arrays_negative_year(self): @@ -110,34 +112,35 @@ def test_constructor_arrays_negative_year(self): def test_constructor_invalid_quarters(self): msg = "Quarter must be 1 <= q <= 4" with pytest.raises(ValueError, match=msg): - PeriodIndex(year=range(2000, 2004), quarter=list(range(4)), - freq='Q-DEC') + PeriodIndex(year=range(2000, 2004), 
quarter=list(range(4)), freq="Q-DEC") def test_constructor_corner(self): msg = "Not enough parameters to construct Period range" with pytest.raises(ValueError, match=msg): - PeriodIndex(periods=10, freq='A') + PeriodIndex(periods=10, freq="A") - start = Period('2007', freq='A-JUN') - end = Period('2010', freq='A-DEC') + start = Period("2007", freq="A-JUN") + end = Period("2010", freq="A-DEC") msg = "start and end must have same freq" with pytest.raises(ValueError, match=msg): PeriodIndex(start=start, end=end) - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): PeriodIndex(start=start) with pytest.raises(ValueError, match=msg): PeriodIndex(end=end) - result = period_range('2007-01', periods=10.5, freq='M') - exp = period_range('2007-01', periods=10, freq='M') + result = period_range("2007-01", periods=10.5, freq="M") + exp = period_range("2007-01", periods=10, freq="M") tm.assert_index_equal(result, exp) def test_constructor_fromarraylike(self): - idx = period_range('2007-01', periods=20, freq='M') + idx = period_range("2007-01", periods=20, freq="M") # values is an array of Period, thus can retrieve freq tm.assert_index_equal(PeriodIndex(idx.values), idx) @@ -151,7 +154,7 @@ def test_constructor_fromarraylike(self): msg = "'Period' object is not iterable" with pytest.raises(TypeError, match=msg): - PeriodIndex(data=Period('2007', freq='A')) + PeriodIndex(data=Period("2007", freq="A")) result = PeriodIndex(iter(idx)) tm.assert_index_equal(result, idx) @@ -159,329 +162,344 @@ def test_constructor_fromarraylike(self): result = PeriodIndex(idx) tm.assert_index_equal(result, idx) - result = PeriodIndex(idx, freq='M') + result = PeriodIndex(idx, freq="M") tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq=offsets.MonthEnd()) tm.assert_index_equal(result, idx) - assert result.freq == 'M' + assert result.freq == "M" - result = PeriodIndex(idx, freq='2M') - tm.assert_index_equal(result, idx.asfreq('2M')) - assert result.freq == '2M' + result = PeriodIndex(idx, freq="2M") + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) - tm.assert_index_equal(result, idx.asfreq('2M')) - assert result.freq == '2M' + tm.assert_index_equal(result, idx.asfreq("2M")) + assert result.freq == "2M" - result = PeriodIndex(idx, freq='D') - exp = idx.asfreq('D', 'e') + result = PeriodIndex(idx, freq="D") + exp = idx.asfreq("D", "e") tm.assert_index_equal(result, exp) def test_constructor_datetime64arr(self): vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) - vals = vals.view(np.dtype('M8[us]')) + vals = vals.view(np.dtype("M8[us]")) msg = r"Wrong dtype: datetime64\[us\]" with pytest.raises(ValueError, match=msg): - PeriodIndex(vals, freq='D') + PeriodIndex(vals, freq="D") - @pytest.mark.parametrize('box', [None, 'series', 'index']) + @pytest.mark.parametrize("box", [None, "series", "index"]) def test_constructor_datetime64arr_ok(self, box): # https://github.com/pandas-dev/pandas/issues/23438 - data = pd.date_range('2017', periods=4, freq="M") + data = pd.date_range("2017", periods=4, freq="M") if box is None: data = data._values - elif box == 'series': + elif box == "series": data = pd.Series(data) - result = PeriodIndex(data, freq='D') - expected = PeriodIndex([ - '2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30' - ], 
freq="D") + result = PeriodIndex(data, freq="D") + expected = PeriodIndex( + ["2017-01-31", "2017-02-28", "2017-03-31", "2017-04-30"], freq="D" + ) tm.assert_index_equal(result, expected) def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-03'], freq='M') + idx = PeriodIndex(["2013-01", "2013-03"], dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-03"], freq="M") tm.assert_index_equal(idx, exp) - assert idx.dtype == 'period[M]' + assert idx.dtype == "period[M]" - idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') - exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') + idx = PeriodIndex(["2013-01-05", "2013-03-05"], dtype="period[3D]") + exp = PeriodIndex(["2013-01-05", "2013-03-05"], freq="3D") tm.assert_index_equal(idx, exp) - assert idx.dtype == 'period[3D]' + assert idx.dtype == "period[3D]" # if we already have a freq and its not the same, then asfreq # (not changed) - idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') + idx = PeriodIndex(["2013-01-01", "2013-01-02"], freq="D") - res = PeriodIndex(idx, dtype='period[M]') - exp = PeriodIndex(['2013-01', '2013-01'], freq='M') + res = PeriodIndex(idx, dtype="period[M]") + exp = PeriodIndex(["2013-01", "2013-01"], freq="M") tm.assert_index_equal(res, exp) - assert res.dtype == 'period[M]' + assert res.dtype == "period[M]" - res = PeriodIndex(idx, freq='M') + res = PeriodIndex(idx, freq="M") tm.assert_index_equal(res, exp) - assert res.dtype == 'period[M]' + assert res.dtype == "period[M]" - msg = 'specified freq and dtype are different' + msg = "specified freq and dtype are different" with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(['2011-01'], freq='M', dtype='period[D]') + PeriodIndex(["2011-01"], freq="M", dtype="period[D]") def test_constructor_empty(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") assert isinstance(idx, PeriodIndex) assert len(idx) == 0 - assert idx.freq == 'M' + assert idx.freq == "M" - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): pd.PeriodIndex([]) def test_constructor_pi_nat(self): - idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='M')])) + idx = PeriodIndex( + np.array([Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="M")]) + ) tm.assert_index_equal(idx, exp) - idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') + idx = PeriodIndex( + [pd.NaT, pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="M")] + ) + exp = PeriodIndex(["NaT", "NaT", "2011-01", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(np.array([pd.NaT, pd.NaT, - Period('2011-01', freq='M'), - Period('2011-01', freq='M')])) + idx = PeriodIndex( + np.array( + [ + pd.NaT, + pd.NaT, + Period("2011-01", freq="M"), + Period("2011-01", freq="M"), + ] + ) + ) tm.assert_index_equal(idx, exp) - idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', 
'2011-01'], freq='M') + idx = PeriodIndex([pd.NaT, pd.NaT, "2011-01", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): PeriodIndex([pd.NaT, pd.NaT]) - with pytest.raises(ValueError, match='freq not specified'): + with pytest.raises(ValueError, match="freq not specified"): PeriodIndex(np.array([pd.NaT, pd.NaT])) - with pytest.raises(ValueError, match='freq not specified'): - PeriodIndex(['NaT', 'NaT']) + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(["NaT", "NaT"]) - with pytest.raises(ValueError, match='freq not specified'): - PeriodIndex(np.array(['NaT', 'NaT'])) + with pytest.raises(ValueError, match="freq not specified"): + PeriodIndex(np.array(["NaT", "NaT"])) def test_constructor_incompat_freq(self): msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')]) + PeriodIndex( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, - Period('2011-01', freq='D')])) + PeriodIndex( + np.array( + [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] + ) + ) # first element is pd.NaT with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')]) + PeriodIndex( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) with pytest.raises(period.IncompatibleFrequency, match=msg): - PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), - Period('2011-01', freq='D')])) + PeriodIndex( + np.array( + [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] + ) + ) def test_constructor_mixed(self): - idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex(["2011-01", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["2011-01", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) - exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') + idx = PeriodIndex(["NaT", pd.NaT, Period("2011-01", freq="M")]) + exp = PeriodIndex(["NaT", "NaT", "2011-01"], freq="M") tm.assert_index_equal(idx, exp) - idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, - '2012-01-01']) - exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') + idx = PeriodIndex([Period("2011-01-01", freq="D"), pd.NaT, "2012-01-01"]) + exp = PeriodIndex(["2011-01-01", "NaT", "2012-01-01"], freq="D") tm.assert_index_equal(idx, exp) def test_constructor_simple_new(self): - idx = period_range('2007-01', name='p', periods=2, freq='M') - result = idx._simple_new(idx, name='p', freq=idx.freq) + idx = period_range("2007-01", name="p", periods=2, freq="M") + result = idx._simple_new(idx, name="p", freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype('i8'), name='p', freq=idx.freq) + result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) tm.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): # GH13079 - idx = PeriodIndex([], freq='M', name='p') - result = idx._simple_new(idx, name='p', freq='M') + idx = PeriodIndex([], freq="M", 
name="p") + result = idx._simple_new(idx, name="p", freq="M") tm.assert_index_equal(result, idx) - @pytest.mark.parametrize('floats', [[1.1, 2.1], np.array([1.1, 2.1])]) + @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, floats): msg = r"PeriodIndex\._simple_new does not accept floats" with pytest.raises(TypeError, match=msg): - pd.PeriodIndex._simple_new(floats, freq='M') + pd.PeriodIndex._simple_new(floats, freq="M") msg = "PeriodIndex does not allow floating point in construction" with pytest.raises(TypeError, match=msg): - pd.PeriodIndex(floats, freq='M') + pd.PeriodIndex(floats, freq="M") def test_constructor_nat(self): msg = "start and end must not be NaT" with pytest.raises(ValueError, match=msg): - period_range(start='NaT', end='2011-01-01', freq='M') + period_range(start="NaT", end="2011-01-01", freq="M") with pytest.raises(ValueError, match=msg): - period_range(start='2011-01-01', end='NaT', freq='M') + period_range(start="2011-01-01", end="NaT", freq="M") def test_constructor_year_and_quarter(self): year = pd.Series([2001, 2002, 2003]) quarter = year - 2000 idx = PeriodIndex(year=year, quarter=quarter) - strs = ['%dQ%d' % t for t in zip(quarter, year)] + strs = ["%dQ%d" % t for t in zip(quarter, year)] lops = list(map(Period, strs)) p = PeriodIndex(lops) tm.assert_index_equal(p, idx) - @pytest.mark.parametrize('func, warning', [ - (PeriodIndex, FutureWarning), - (period_range, None) - ]) + @pytest.mark.parametrize( + "func, warning", [(PeriodIndex, FutureWarning), (period_range, None)] + ) def test_constructor_freq_mult(self, func, warning): # GH #7811 with tm.assert_produces_warning(warning): # must be the same, but for sure... - pidx = func(start='2014-01', freq='2M', periods=4) - expected = PeriodIndex(['2014-01', '2014-03', - '2014-05', '2014-07'], freq='2M') + pidx = func(start="2014-01", freq="2M", periods=4) + expected = PeriodIndex(["2014-01", "2014-03", "2014-05", "2014-07"], freq="2M") tm.assert_index_equal(pidx, expected) with tm.assert_produces_warning(warning): - pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') - expected = PeriodIndex(['2014-01-02', '2014-01-05', - '2014-01-08', '2014-01-11', - '2014-01-14'], freq='3D') + pidx = func(start="2014-01-02", end="2014-01-15", freq="3D") + expected = PeriodIndex( + ["2014-01-02", "2014-01-05", "2014-01-08", "2014-01-11", "2014-01-14"], + freq="3D", + ) tm.assert_index_equal(pidx, expected) with tm.assert_produces_warning(warning): - pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) - expected = PeriodIndex(['2014-01-01 09:00', '2014-01-01 13:00', - '2014-01-01 17:00'], freq='4H') + pidx = func(end="2014-01-01 17:00", freq="4H", periods=3) + expected = PeriodIndex( + ["2014-01-01 09:00", "2014-01-01 13:00", "2014-01-01 17:00"], freq="4H" + ) tm.assert_index_equal(pidx, expected) - msg = ('Frequency must be positive, because it' - ' represents span: -1M') + msg = "Frequency must be positive, because it" " represents span: -1M" with pytest.raises(ValueError, match=msg): - PeriodIndex(['2011-01'], freq='-1M') + PeriodIndex(["2011-01"], freq="-1M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" with pytest.raises(ValueError, match=msg): - PeriodIndex(['2011-01'], freq='0M') + PeriodIndex(["2011-01"], freq="0M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" 
with pytest.raises(ValueError, match=msg): - period_range('2011-01', periods=3, freq='0M') + period_range("2011-01", periods=3, freq="0M") - @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'T', 'S']) - @pytest.mark.parametrize('mult', [1, 2, 3, 4, 5]) + @pytest.mark.parametrize("freq", ["A", "M", "D", "T", "S"]) + @pytest.mark.parametrize("mult", [1, 2, 3, 4, 5]) def test_constructor_freq_mult_dti_compat(self, mult, freq): freqstr = str(mult) + freq - pidx = period_range(start='2014-04-01', freq=freqstr, periods=10) - expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freqstr) + pidx = period_range(start="2014-04-01", freq=freqstr, periods=10) + expected = date_range(start="2014-04-01", freq=freqstr, periods=10).to_period( + freqstr + ) tm.assert_index_equal(pidx, expected) def test_constructor_freq_combined(self): - for freq in ['1D1H', '1H1D']: - pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], - freq='25H') - for freq in ['1D1H', '1H1D']: - pidx = period_range(start='2016-01-01', periods=2, freq=freq) - expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], - freq='25H') + for freq in ["1D1H", "1H1D"]: + pidx = PeriodIndex(["2016-01-01", "2016-01-02"], freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 00:00"], freq="25H") + for freq in ["1D1H", "1H1D"]: + pidx = period_range(start="2016-01-01", periods=2, freq=freq) + expected = PeriodIndex(["2016-01-01 00:00", "2016-01-02 01:00"], freq="25H") tm.assert_index_equal(pidx, expected) def test_constructor_range_based_deprecated(self): with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi = PeriodIndex(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 def test_constructor_range_based_deprecated_different_freq(self): with tm.assert_produces_warning(FutureWarning) as m: - PeriodIndex(start='2000', periods=2) + PeriodIndex(start="2000", periods=2) warning, = m assert 'freq="A-DEC"' in str(warning.message) def test_constructor(self): - pi = period_range(freq='A', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") assert len(pi) == 4 * 9 - pi = period_range(freq='M', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") assert len(pi) == 12 * 9 - pi = period_range(freq='D', start='1/1/2001', end='12/31/2009') + pi = period_range(freq="D", start="1/1/2001", end="12/31/2009") assert len(pi) == 365 * 9 + 2 - pi = period_range(freq='B', start='1/1/2001', end='12/31/2009') + pi = period_range(freq="B", start="1/1/2001", end="12/31/2009") assert len(pi) == 261 * 9 - pi = period_range(freq='H', start='1/1/2001', end='12/31/2001 23:00') + pi = period_range(freq="H", start="1/1/2001", end="12/31/2001 23:00") assert len(pi) == 365 * 24 - pi = period_range(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + pi = period_range(freq="Min", start="1/1/2001", end="1/1/2001 23:59") assert len(pi) == 24 * 60 - pi = period_range(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + pi = period_range(freq="S", start="1/1/2001", end="1/1/2001 23:59:59") assert len(pi) == 24 * 60 * 60 - start = Period('02-Apr-2005', 'B') + start = Period("02-Apr-2005", "B") i1 = period_range(start=start, periods=20) assert len(i1) 
== 20 assert i1.freq == start.freq assert i1[0] == start - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) assert len(i1) == 10 assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period('2006-12-31', '1w') + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2006-12-31', ('w', 1)) + end_intv = Period("2006-12-31", ("w", 1)) i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2005-05-01', 'B') + end_intv = Period("2005-05-01", "B") i1 = period_range(start=start, end=end_intv) # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) assert len(i2) == 2 assert i2[0] == end_intv - i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) assert len(i2) == 2 assert i2[0] == end_intv # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) @@ -490,28 +508,31 @@ def test_constructor(self): PeriodIndex(vals) def test_constructor_error(self): - start = Period('02-Apr-2005', 'B') - end_intv = Period('2006-12-31', ('w', 1)) + start = Period("02-Apr-2005", "B") + end_intv = Period("2006-12-31", ("w", 1)) - msg = 'start and end must have same freq' + msg = "start and end must have same freq" with pytest.raises(ValueError, match=msg): PeriodIndex(start=start, end=end_intv) - msg = ('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) with pytest.raises(ValueError, match=msg): PeriodIndex(start=start) - @pytest.mark.parametrize('freq', ['M', 'Q', 'A', 'D', 'B', - 'T', 'S', 'L', 'U', 'N', 'H']) + @pytest.mark.parametrize( + "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] + ) def test_recreate_from_data(self, freq): - org = period_range(start='2001/04/01', freq=freq, periods=1) + org = period_range(start="2001/04/01", freq=freq, periods=1) idx = PeriodIndex(org.values, freq=freq) tm.assert_index_equal(idx, org) def test_map_with_string_constructor(self): raw = [2005, 2007, 2009] - index = PeriodIndex(raw, freq='A') + index = PeriodIndex(raw, freq="A") expected = Index([str(num) for num in raw]) res = index.map(str) @@ -527,18 +548,15 @@ def test_map_with_string_constructor(self): class TestSeriesPeriod: - def setup_method(self, method): - self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) def test_constructor_cant_cast_period(self): msg = "Cannot cast PeriodArray to dtype float64" with pytest.raises(TypeError, match=msg): - Series(period_range('2000-01-01', periods=10, freq='D'), - dtype=float) + Series(period_range("2000-01-01", periods=10, freq="D"), dtype=float) def test_constructor_cast_object(self): - s = Series(period_range('1/1/2000', periods=10), - dtype=PeriodDtype("D")) - exp = Series(period_range('1/1/2000', periods=10)) + s = Series(period_range("1/1/2000", periods=10), 
dtype=PeriodDtype("D")) + exp = Series(period_range("1/1/2000", periods=10)) tm.assert_series_equal(s, exp) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index e1d1dd307ea76..c5566f74af11e 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -7,125 +7,122 @@ def test_to_native_types(): - index = PeriodIndex(['2017-01-01', '2017-01-02', - '2017-01-03'], freq='D') + index = PeriodIndex(["2017-01-01", "2017-01-02", "2017-01-03"], freq="D") # First, with no arguments. - expected = np.array(['2017-01-01', '2017-01-02', - '2017-01-03'], dtype='=U10') + expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype="=U10") result = index.to_native_types() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index.to_native_types(na_rep='pandas') + result = index.to_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure slicing works - expected = np.array(['2017-01-01', '2017-01-03'], dtype='=U10') + expected = np.array(["2017-01-01", "2017-01-03"], dtype="=U10") result = index.to_native_types([0, 2]) tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works - expected = np.array(['01-2017-01', '01-2017-02', - '01-2017-03'], dtype='=U10') + expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype="=U10") - result = index.to_native_types(date_format='%m-%Y-%d') + result = index.to_native_types(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work - index = PeriodIndex(['2017-01-01', pd.NaT, '2017-01-03'], freq='D') - expected = np.array(['2017-01-01', 'NaT', '2017-01-03'], dtype=object) + index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") + expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) result = index.to_native_types() tm.assert_numpy_array_equal(result, expected) - expected = np.array(['2017-01-01', 'pandas', - '2017-01-03'], dtype=object) + expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index.to_native_types(na_rep='pandas') + result = index.to_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) class TestPeriodIndexRendering: - def test_frame_repr(self): - df = pd.DataFrame({"A": [1, 2, 3]}, - index=pd.date_range('2000', periods=3)) + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.date_range("2000", periods=3)) result = repr(df) - expected = ( - ' A\n' - '2000-01-01 1\n' - '2000-01-02 2\n' - '2000-01-03 3') + expected = " A\n" "2000-01-01 1\n" "2000-01-02 2\n" "2000-01-03 3" assert result == expected - @pytest.mark.parametrize('method', ['__repr__', '__str__']) + @pytest.mark.parametrize("method", ["__repr__", "__str__"]) def test_representation(self, method): # GH#7601 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") - idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') + idx1 = PeriodIndex([], freq="D") + idx2 = 
PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") + idx10 = PeriodIndex(["2011-01-01", "2011-02-01"], freq="3D") exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" - exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " - "freq='D')") - - exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='period[D]', freq='D')") - - exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " - "freq='A-DEC')") - - exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='period[H]', freq='H')") - - exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " - "freq='Q-DEC')") - - exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " - "dtype='period[Q-DEC]', freq='Q-DEC')") - - exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " - "dtype='period[3D]', freq='3D')") - - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9, idx10], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9, exp10]): + exp3 = ( + "PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " "freq='D')" + ) + + exp4 = ( + "PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " + "dtype='period[D]', freq='D')" + ) + + exp5 = ( + "PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " + "freq='A-DEC')" + ) + + exp6 = ( + "PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " + "dtype='period[H]', freq='H')" + ) + + exp7 = "PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " "freq='Q-DEC')" + + exp8 = ( + "PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " "freq='Q-DEC')" + ) + + exp9 = ( + "PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')" + ) + + exp10 = ( + "PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')" + ) + + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9, idx10], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9, exp10], + ): result = getattr(idx, method)() assert result == expected def test_representation_to_series(self): # GH#10971 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], 
freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") exp1 = """Series([], dtype: period[D])""" @@ -163,27 +160,25 @@ def test_representation_to_series(self): 2 2013Q3 dtype: period[Q-DEC]""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): result = repr(pd.Series(idx)) assert result == expected def test_summary(self): # GH#9116 - idx1 = PeriodIndex([], freq='D') - idx2 = PeriodIndex(['2011-01-01'], freq='D') - idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq='D') - idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], - freq='H') - - idx7 = pd.period_range('2013Q1', periods=1, freq="Q") - idx8 = pd.period_range('2013Q1', periods=2, freq="Q") - idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx1 = PeriodIndex([], freq="D") + idx2 = PeriodIndex(["2011-01-01"], freq="D") + idx3 = PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") + idx4 = PeriodIndex(["2011-01-01", "2011-01-02", "2011-01-03"], freq="D") + idx5 = PeriodIndex(["2011", "2012", "2013"], freq="A") + idx6 = PeriodIndex(["2011-01-01 09:00", "2012-02-01 10:00", "NaT"], freq="H") + + idx7 = pd.period_range("2013Q1", periods=1, freq="Q") + idx8 = pd.period_range("2013Q1", periods=2, freq="Q") + idx9 = pd.period_range("2013Q1", periods=3, freq="Q") exp1 = """PeriodIndex: 0 entries Freq: D""" @@ -212,9 +207,9 @@ def test_summary(self): exp9 = """PeriodIndex: 3 entries, 2013Q1 to 2013Q3 Freq: Q-DEC""" - for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], - [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + for idx, expected in zip( + [idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9], + [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9], + ): result = idx._summary() assert result == expected diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 27a690e58b70f..3f66891caddc3 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -6,102 +6,101 @@ from pandas._libs.tslibs import period as libperiod import pandas as pd -from pandas import ( - DatetimeIndex, Period, PeriodIndex, Series, notna, period_range) +from pandas import DatetimeIndex, Period, PeriodIndex, Series, notna, period_range from pandas.util import testing as tm class TestGetItem: def test_ellipsis(self): # GH#21282 - idx = period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx = period_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx[...] 
assert result.equals(idx) assert result is not idx def test_getitem(self): - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") for idx in [idx1]: result = idx[0] - assert result == pd.Period('2011-01-01', freq='D') + assert result == pd.Period("2011-01-01", freq="D") result = idx[-1] - assert result == pd.Period('2011-01-31', freq='D') + assert result == pd.Period("2011-01-31", freq="D") result = idx[0:5] - expected = pd.period_range('2011-01-01', '2011-01-05', freq='D', - name='idx') + expected = pd.period_range("2011-01-01", "2011-01-05", freq="D", name="idx") tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[0:10:2] - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05', - '2011-01-07', '2011-01-09'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-07", "2011-01-09"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[-20:-5:3] - expected = pd.PeriodIndex(['2011-01-12', '2011-01-15', - '2011-01-18', - '2011-01-21', '2011-01-24'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-12", "2011-01-15", "2011-01-18", "2011-01-21", "2011-01-24"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx[4::-1] - expected = PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-03', - '2011-01-02', '2011-01-01'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-03", "2011-01-02", "2011-01-01"], + freq="D", + name="idx", + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" def test_getitem_index(self): - idx = period_range('2007-01', periods=10, freq='M', name='x') + idx = period_range("2007-01", periods=10, freq="M", name="x") result = idx[[1, 3, 5]] - exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], - freq='M', name='x') + exp = pd.PeriodIndex(["2007-02", "2007-04", "2007-06"], freq="M", name="x") tm.assert_index_equal(result, exp) - result = idx[[True, True, False, False, False, - True, True, False, False, False]] - exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], - freq='M', name='x') + result = idx[[True, True, False, False, False, True, True, False, False, False]] + exp = pd.PeriodIndex( + ["2007-01", "2007-02", "2007-06", "2007-07"], freq="M", name="x" + ) tm.assert_index_equal(result, exp) def test_getitem_partial(self): - rng = period_range('2007-01', periods=50, freq='M') + rng = period_range("2007-01", periods=50, freq="M") ts = Series(np.random.randn(len(rng)), rng) with pytest.raises(KeyError, match=r"^'2006'$"): - ts['2006'] + ts["2006"] - result = ts['2008'] + result = ts["2008"] assert (result.index.year == 2008).all() - result = ts['2008':'2009'] + result = ts["2008":"2009"] assert len(result) == 24 - result = ts['2008-1':'2009-12'] + result = ts["2008-1":"2009-12"] assert len(result) == 24 - result = ts['2008Q1':'2009Q4'] + result = ts["2008Q1":"2009Q4"] assert len(result) == 24 - result = ts[:'2009'] + result = ts[:"2009"] assert len(result) == 36 - result = ts['2009':] + result = ts["2009":] assert len(result) == 50 - 
24 exp = result @@ -111,10 +110,10 @@ def test_getitem_partial(self): ts = ts[10:].append(ts[10:]) msg = "left slice bound for non-unique label: '2008'" with pytest.raises(KeyError, match=msg): - ts[slice('2008', '2009')] + ts[slice("2008", "2009")] def test_getitem_datetime(self): - rng = period_range(start='2012-01-01', periods=10, freq='W-MON') + rng = period_range(start="2012-01-01", periods=10, freq="W-MON") ts = Series(range(len(rng)), index=rng) dt1 = datetime(2011, 10, 2) @@ -124,36 +123,38 @@ def test_getitem_datetime(self): tm.assert_series_equal(rs, ts) def test_getitem_nat(self): - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') - assert idx[0] == pd.Period('2011-01', freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") + assert idx[0] == pd.Period("2011-01", freq="M") assert idx[1] is pd.NaT s = pd.Series([0, 1, 2], index=idx) assert s[pd.NaT] == 1 s = pd.Series(idx, index=idx) - assert (s[pd.Period('2011-01', freq='M')] == - pd.Period('2011-01', freq='M')) + assert s[pd.Period("2011-01", freq="M")] == pd.Period("2011-01", freq="M") assert s[pd.NaT] is pd.NaT def test_getitem_list_periods(self): # GH 7710 - rng = period_range(start='2012-01-01', periods=10, freq='D') + rng = period_range(start="2012-01-01", periods=10, freq="D") ts = Series(range(len(rng)), index=rng) exp = ts.iloc[[1]] - tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + tm.assert_series_equal(ts[[Period("2012-01-02", freq="D")]], exp) def test_getitem_seconds(self): # GH#6716 - didx = pd.date_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) - pidx = period_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) + didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) for idx in [didx, pidx]: # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: # GH7116 # these show deprecations as we are trying @@ -163,21 +164,26 @@ def test_getitem_seconds(self): continue s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) - tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) - for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s["2013/01/01 10:00"], s[3600:3660]) + tm.assert_series_equal(s["2013/01/01 9H"], s[:3600]) + for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(s[d], s) def test_getitem_day(self): # GH#6716 # Confirm DatetimeIndex and PeriodIndex works identically - didx = pd.date_range(start='2013/01/01', freq='D', periods=400) - pidx = period_range(start='2013/01/01', freq='D', periods=400) + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) for idx in [didx, pidx]: # getitem against index should raise ValueError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: # GH7116 @@ -188,46 +194,44 @@ def test_getitem_day(self): continue s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01'], s[0:31]) - tm.assert_series_equal(s['2013/02'], s[31:59]) - tm.assert_series_equal(s['2014'], s[365:]) + 
tm.assert_series_equal(s["2013/01"], s[0:31]) + tm.assert_series_equal(s["2013/02"], s[31:59]) + tm.assert_series_equal(s["2014"], s[365:]) - invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: with pytest.raises(KeyError): s[v] class TestWhere: - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): - i = period_range('20130101', periods=5, freq='D') + i = period_range("20130101", periods=5, freq="D") cond = [True] * len(i) expected = i result = i.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) - expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') + expected = PeriodIndex([pd.NaT] + i[1:].tolist(), freq="D") result = i.where(klass(cond)) tm.assert_index_equal(result, expected) def test_where_other(self): - i = period_range('20130101', periods=5, freq='D') + i = period_range("20130101", periods=5, freq="D") for arr in [np.nan, pd.NaT]: result = i.where(notna(i), other=np.nan) expected = i tm.assert_index_equal(result, expected) i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") result = i.where(notna(i2), i2) tm.assert_index_equal(result, i2) i2 = i.copy() - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), freq="D") result = i.where(notna(i2), i2.values) tm.assert_index_equal(result, i2) @@ -235,58 +239,65 @@ def test_where_other(self): class TestTake: def test_take(self): # GH#10295 - idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + idx1 = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") for idx in [idx1]: result = idx.take([0]) - assert result == pd.Period('2011-01-01', freq='D') + assert result == pd.Period("2011-01-01", freq="D") result = idx.take([5]) - assert result == pd.Period('2011-01-06', freq='D') + assert result == pd.Period("2011-01-06", freq="D") result = idx.take([0, 1, 2]) - expected = pd.period_range('2011-01-01', '2011-01-03', freq='D', - name='idx') + expected = pd.period_range("2011-01-01", "2011-01-03", freq="D", name="idx") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" assert result.freq == expected.freq result = idx.take([0, 2, 4]) - expected = pd.PeriodIndex(['2011-01-01', '2011-01-03', - '2011-01-05'], freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([7, 4, 1]) - expected = pd.PeriodIndex(['2011-01-08', '2011-01-05', - '2011-01-02'], - freq='D', name='idx') + expected = pd.PeriodIndex( + ["2011-01-08", "2011-01-05", "2011-01-02"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([3, 2, 5]) - expected = PeriodIndex(['2011-01-04', '2011-01-03', '2011-01-06'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-04", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" result = idx.take([-3, 2, 
5]) - expected = PeriodIndex(['2011-01-29', '2011-01-03', '2011-01-06'], - freq='D', name='idx') + expected = PeriodIndex( + ["2011-01-29", "2011-01-03", "2011-01-06"], freq="D", name="idx" + ) tm.assert_index_equal(result, expected) assert result.freq == expected.freq - assert result.freq == 'D' + assert result.freq == "D" def test_take_misc(self): - index = period_range(start='1/1/10', end='12/31/12', freq='D', - name='idx') - expected = PeriodIndex([datetime(2010, 1, 6), datetime(2010, 1, 7), - datetime(2010, 1, 9), datetime(2010, 1, 13)], - freq='D', name='idx') + index = period_range(start="1/1/10", end="12/31/12", freq="D", name="idx") + expected = PeriodIndex( + [ + datetime(2010, 1, 6), + datetime(2010, 1, 7), + datetime(2010, 1, 9), + datetime(2010, 1, 13), + ], + freq="D", + name="idx", + ) taken1 = index.take([5, 6, 8, 12]) taken2 = index[[5, 6, 8, 12]] @@ -299,28 +310,33 @@ def test_take_misc(self): def test_take_fill_value(self): # GH#12631 - idx = pd.PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx', freq='D') + idx = pd.PeriodIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx", freq="D" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx', freq='D') + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "NaT"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.PeriodIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx', freq='D') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.PeriodIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx", freq="D" + ) tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -332,10 +348,9 @@ def test_take_fill_value(self): class TestIndexing: - def test_get_loc_msg(self): - idx = period_range('2000-1-1', freq='A', periods=10) - bad_period = Period('2012', 'A') + idx = period_range("2000-1-1", freq="A", periods=10) + bad_period = Period("2012", "A") with pytest.raises(KeyError, match=r"^Period\('2012', 'A-DEC'\)$"): idx.get_loc(bad_period) @@ -345,21 +360,21 @@ def test_get_loc_msg(self): assert inst.args[0] == bad_period def test_get_loc_nat(self): - didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) - pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + didx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-03"]) + pidx = PeriodIndex(["2011-01-01", "NaT", "2011-01-03"], freq="M") # check DatetimeIndex compat for idx in [didx, pidx]: assert idx.get_loc(pd.NaT) == 1 assert idx.get_loc(None) == 1 - assert idx.get_loc(float('nan')) == 1 + assert idx.get_loc(float("nan")) == 1 assert idx.get_loc(np.nan) == 1 def test_get_loc(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = 
pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") # get the location of p1/p2 from # monotonic increasing PeriodIndex with non-duplicate @@ -374,12 +389,14 @@ def test_get_loc(self): msg = "Cannot interpret 'foo' as period" with pytest.raises(KeyError, match=msg): - idx0.get_loc('foo') + idx0.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.1$"): idx0.get_loc(1.1) - msg = (r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key") + msg = ( + r"'PeriodIndex\(\['2017-09-01', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) with pytest.raises(TypeError, match=msg): idx0.get_loc(idx0) @@ -396,13 +413,15 @@ def test_get_loc(self): msg = "Cannot interpret 'foo' as period" with pytest.raises(KeyError, match=msg): - idx1.get_loc('foo') + idx1.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.1$"): idx1.get_loc(1.1) - msg = (r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," - r" dtype='period\[D\]', freq='D'\)' is an invalid key") + msg = ( + r"'PeriodIndex\(\['2017-09-02', '2017-09-02', '2017-09-03'\]," + r" dtype='period\[D\]', freq='D'\)' is an invalid key" + ) with pytest.raises(TypeError, match=msg): idx1.get_loc(idx1) @@ -419,9 +438,9 @@ def test_get_loc(self): def test_is_monotonic_increasing(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx_inc0 = pd.PeriodIndex([p0, p1, p2]) idx_inc1 = pd.PeriodIndex([p0, p1, p1]) @@ -437,9 +456,9 @@ def test_is_monotonic_increasing(self): def test_is_monotonic_decreasing(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx_inc0 = pd.PeriodIndex([p0, p1, p2]) idx_inc1 = pd.PeriodIndex([p0, p1, p1]) @@ -455,10 +474,10 @@ def test_is_monotonic_decreasing(self): def test_contains(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') - p3 = pd.Period('2017-09-04') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 = pd.Period("2017-09-04") ps0 = [p0, p1, p2] idx0 = pd.PeriodIndex(ps0) @@ -467,17 +486,17 @@ def test_contains(self): assert p in idx0 assert str(p) in idx0 - assert '2017-09-01 00:00:01' in idx0 + assert "2017-09-01 00:00:01" in idx0 - assert '2017-09' in idx0 + assert "2017-09" in idx0 assert p3 not in idx0 def test_get_value(self): # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') + p0 = pd.Period("2017-09-01") + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") idx0 = pd.PeriodIndex([p0, p1, p2]) input0 = np.array([1, 2, 3]) @@ -502,39 +521,41 @@ def test_get_value(self): def test_get_indexer(self): # GH 17717 - p1 = pd.Period('2017-09-01') - p2 = pd.Period('2017-09-04') - p3 = pd.Period('2017-09-07') + p1 = pd.Period("2017-09-01") + p2 = pd.Period("2017-09-04") + p3 = pd.Period("2017-09-07") - tp0 = pd.Period('2017-08-31') - tp1 = pd.Period('2017-09-02') - tp2 = pd.Period('2017-09-05') - tp3 = pd.Period('2017-09-09') + tp0 = pd.Period("2017-08-31") + tp1 = pd.Period("2017-09-02") + tp2 = pd.Period("2017-09-05") + tp3 = pd.Period("2017-09-09") idx = pd.PeriodIndex([p1, p2, p3]) 
- tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) target = pd.PeriodIndex([tp0, tp1, tp2, tp3]) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2, -1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 0, 1, 2], dtype=np.intp)) - - res = idx.get_indexer(target, 'nearest', - tolerance=pd.Timedelta('1 day')) - tm.assert_numpy_array_equal(res, - np.array([0, 0, 1, -1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2, -1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 0, 1, 2], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=pd.Timedelta("1 day")) + tm.assert_numpy_array_equal(res, np.array([0, 0, 1, -1], dtype=np.intp)) def test_get_indexer_non_unique(self): # GH 17717 - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') - p3 = pd.Period('2017-09-04') - p4 = pd.Period('2017-09-05') + p1 = pd.Period("2017-09-02") + p2 = pd.Period("2017-09-03") + p3 = pd.Period("2017-09-04") + p4 = pd.Period("2017-09-05") idx1 = pd.PeriodIndex([p1, p2, p1]) idx2 = pd.PeriodIndex([p2, p1, p3, p4]) @@ -548,85 +569,108 @@ def test_get_indexer_non_unique(self): # TODO: This method came from test_period; de-dup with version above def test_get_loc2(self): - idx = pd.period_range('2000-01-01', periods=3) + idx = pd.period_range("2000-01-01", periods=3) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 - assert idx.get_loc(idx[1].asfreq('H', how='start'), method) == 1 + assert idx.get_loc(idx[1].asfreq("H", how="start"), method) == 1 assert idx.get_loc(idx[1].to_timestamp(), method) == 1 - assert idx.get_loc(idx[1].to_timestamp() - .to_pydatetime(), method) == 1 + assert idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 - idx = pd.period_range('2000-01-01', periods=5)[::2] - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance='1 day') == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=pd.Timedelta('1D')) == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=np.timedelta64(1, 'D')) == 1 - assert idx.get_loc('2000-01-02T12', method='nearest', - tolerance=timedelta(1)) == 1 - - msg = 'unit abbreviation w/o a number' + idx = pd.period_range("2000-01-01", periods=5)[::2] + assert idx.get_loc("2000-01-02T12", method="nearest", tolerance="1 day") == 1 + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=pd.Timedelta("1D")) + == 1 + ) + assert ( + idx.get_loc( + "2000-01-02T12", method="nearest", tolerance=np.timedelta64(1, "D") + ) + == 1 + ) + assert ( + idx.get_loc("2000-01-02T12", method="nearest", tolerance=timedelta(1)) == 1 + ) + + msg = "unit abbreviation w/o a number" with pytest.raises(ValueError, match=msg): - idx.get_loc('2000-01-10', method='nearest', tolerance='foo') + idx.get_loc("2000-01-10", method="nearest", tolerance="foo") - msg = 'Input has different freq=None from PeriodArray\\(freq=D\\)' + msg = "Input has 
different freq=None from PeriodArray\\(freq=D\\)" with pytest.raises(ValueError, match=msg): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour') + idx.get_loc("2000-01-10", method="nearest", tolerance="1 hour") with pytest.raises(KeyError, match=r"^Period\('2000-01-10', 'D'\)$"): - idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + idx.get_loc("2000-01-10", method="nearest", tolerance="1 day") with pytest.raises( - ValueError, - match='list-like tolerance size must match target index size'): - idx.get_loc('2000-01-10', method='nearest', - tolerance=[pd.Timedelta('1 day').to_timedelta64(), - pd.Timedelta('1 day').to_timedelta64()]) + ValueError, match="list-like tolerance size must match target index size" + ): + idx.get_loc( + "2000-01-10", + method="nearest", + tolerance=[ + pd.Timedelta("1 day").to_timedelta64(), + pd.Timedelta("1 day").to_timedelta64(), + ], + ) # TODO: This method came from test_period; de-dup with version above def test_get_indexer2(self): - idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', - '2000-01-02T01'], freq='H') - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 hour'), - np.array([0, -1, 1], dtype=np.intp)) - - msg = 'Input has different freq=None from PeriodArray\\(freq=H\\)' + idx = pd.period_range("2000-01-01", periods=3).asfreq("H", how="start") + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.PeriodIndex( + ["1999-12-31T23", "2000-01-01T12", "2000-01-02T01"], freq="H" + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 hour"), + np.array([0, -1, 1], dtype=np.intp), + ) + + msg = "Input has different freq=None from PeriodArray\\(freq=H\\)" with pytest.raises(ValueError, match=msg): - idx.get_indexer(target, 'nearest', tolerance='1 minute') - - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', - tolerance='1 day'), - np.array([0, 1, 1], dtype=np.intp)) - tol_raw = [pd.Timedelta('1 hour'), - pd.Timedelta('1 hour'), - np.timedelta64(1, 'D'), ] + idx.get_indexer(target, "nearest", tolerance="1 minute") + + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest", tolerance="1 day"), + np.array([0, 1, 1], dtype=np.intp), + ) + tol_raw = [ + pd.Timedelta("1 hour"), + pd.Timedelta("1 hour"), + np.timedelta64(1, "D"), + ] tm.assert_numpy_array_equal( - idx.get_indexer(target, 'nearest', - tolerance=[np.timedelta64(x) for x in tol_raw]), - np.array([0, -1, 1], dtype=np.intp)) - tol_bad = [pd.Timedelta('2 hour').to_timedelta64(), - pd.Timedelta('1 hour').to_timedelta64(), - np.timedelta64(1, 'M'), ] + idx.get_indexer( + target, "nearest", tolerance=[np.timedelta64(x) for x in tol_raw] + ), + np.array([0, 
-1, 1], dtype=np.intp), + ) + tol_bad = [ + pd.Timedelta("2 hour").to_timedelta64(), + pd.Timedelta("1 hour").to_timedelta64(), + np.timedelta64(1, "M"), + ] with pytest.raises( - libperiod.IncompatibleFrequency, - match='Input has different freq=None from'): - idx.get_indexer(target, 'nearest', tolerance=tol_bad) + libperiod.IncompatibleFrequency, match="Input has different freq=None from" + ): + idx.get_indexer(target, "nearest", tolerance=tol_bad) def test_indexing(self): # GH 4390, iat incorrectly indexing - index = period_range('1/1/2001', periods=10) + index = period_range("1/1/2001", periods=10) s = Series(np.random.randn(10), index=index) expected = s[index[0]] result = s.iat[0] @@ -634,7 +678,7 @@ def test_indexing(self): def test_period_index_indexer(self): # GH4125 - idx = pd.period_range('2002-01', '2003-12', freq='M') + idx = pd.period_range("2002-01", "2003-12", freq="M") df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) tm.assert_frame_equal(df, df.loc[idx]) tm.assert_frame_equal(df, df.loc[list(idx)]) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index 766919735c191..96042f4dbaba2 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -10,11 +9,9 @@ class TestPeriodIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) - mask = lambda x: (isinstance(x, DatetimeIndex) or - isinstance(x, PeriodIndex)) + mask = lambda x: (isinstance(x, DatetimeIndex) or isinstance(x, PeriodIndex)) self.is_valid_objs = [o for o in self.objs if mask(o)] self.not_valid_objs = [o for o in self.objs if not mask(o)] @@ -25,50 +22,72 @@ def test_ops_properties(self): self.check_ops_properties(PeriodArray._bool_ops, f) def test_resolution(self): - for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', - 'T', 'S', 'L', 'U'], - ['day', 'day', 'day', 'day', - 'hour', 'minute', 'second', - 'millisecond', 'microsecond']): - - idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) + for freq, expected in zip( + ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], + [ + "day", + "day", + "day", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + ], + ): + + idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) assert idx.resolution == expected def test_value_counts_unique(self): # GH 7735 - idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + idx = pd.period_range("2011-01-01 09:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), - freq='H') - - exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', - '2011-01-01 16:00', '2011-01-01 15:00', - '2011-01-01 14:00', '2011-01-01 13:00', - '2011-01-01 12:00', '2011-01-01 11:00', - '2011-01-01 10:00', - '2011-01-01 09:00'], freq='H') - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + idx = PeriodIndex(np.repeat(idx._values, range(1, len(idx) + 1)), freq="H") + + exp_idx = PeriodIndex( + [ + "2011-01-01 18:00", + "2011-01-01 17:00", + "2011-01-01 16:00", + "2011-01-01 15:00", + "2011-01-01 14:00", + "2011-01-01 13:00", + "2011-01-01 12:00", + "2011-01-01 11:00", + "2011-01-01 10:00", + "2011-01-01 09:00", + ], + freq="H", + ) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = 
pd.period_range('2011-01-01 09:00', freq='H', - periods=10) + expected = pd.period_range("2011-01-01 09:00", freq="H", periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', - '2013-01-01 09:00', '2013-01-01 08:00', - '2013-01-01 08:00', NaT], freq='H') + idx = PeriodIndex( + [ + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 09:00", + "2013-01-01 08:00", + "2013-01-01 08:00", + NaT, + ], + freq="H", + ) - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], - freq='H') + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00"], freq="H") expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', - NaT], freq='H') + exp_idx = PeriodIndex(["2013-01-01 09:00", "2013-01-01 08:00", NaT], freq="H") expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: @@ -78,7 +97,7 @@ def test_value_counts_unique(self): def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.period_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -90,8 +109,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.period_range('2011-01-01', '2011-01-31', freq='D', - name='idx') + base = pd.period_range("2011-01-01", "2011-01-31", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -99,10 +117,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -115,9 +133,9 @@ def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): assert index.freq == expected_index.freq - pidx = PeriodIndex(['2011', '2012', '2013'], name='pidx', freq='A') + pidx = PeriodIndex(["2011", "2012", "2013"], name="pidx", freq="A") # for compatibility check - iidx = Index([2011, 2012, 2013], name='idx') + iidx = Index([2011, 2012, 2013], name="idx") for idx in [pidx, iidx]: ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -129,24 +147,23 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) _check_freq(ordered, idx) - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, idx[::-1]) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) _check_freq(ordered, idx[::-1]) - pidx = PeriodIndex(['2011', '2013', '2015', '2012', - '2011'], name='pidx', freq='A') + pidx = PeriodIndex( + ["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A" + ) 
pexpected = PeriodIndex( - ['2011', '2011', '2012', '2013', '2015'], name='pidx', freq='A') + ["2011", "2011", "2012", "2013", "2015"], name="pidx", freq="A" + ) # for compatibility check - iidx = Index([2011, 2013, 2015, 2012, 2011], name='idx') - iexpected = Index([2011, 2011, 2012, 2013, 2015], name='idx') + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + iexpected = Index([2011, 2011, 2012, 2013, 2015], name="idx") for idx, expected in [(pidx, pexpected), (iidx, iexpected)]: ordered = idx.sort_values() tm.assert_index_equal(ordered, expected) @@ -163,33 +180,30 @@ def _check_freq(index, expected_index): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) - pidx = PeriodIndex(['2011', '2013', 'NaT', '2011'], name='pidx', - freq='D') + pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") result = pidx.sort_values() - expected = PeriodIndex(['NaT', '2011', '2011', '2013'], - name='pidx', freq='D') + expected = PeriodIndex(["NaT", "2011", "2011", "2013"], name="pidx", freq="D") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" result = pidx.sort_values(ascending=False) - expected = PeriodIndex( - ['2013', '2011', '2011', 'NaT'], name='pidx', freq='D') + expected = PeriodIndex(["2013", "2011", "2011", "NaT"], name="pidx", freq="D") tm.assert_index_equal(result, expected) - assert result.freq == 'D' + assert result.freq == "D" def test_order(self): - for freq in ['D', '2D', '4D']: - idx = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - freq=freq, name='idx') + for freq in ["D", "2D", "4D"]: + idx = PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], freq=freq, name="idx" + ) ordered = idx.sort_values() tm.assert_index_equal(ordered, idx) @@ -203,60 +217,68 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq assert ordered.freq == freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) expected = idx[::-1] tm.assert_index_equal(ordered, expected) - tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([2, 1, 0]), check_dtype=False) assert ordered.freq == expected.freq assert ordered.freq == freq - idx1 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], freq='D', name='idx1') - exp1 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], freq='D', name='idx1') + idx1 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx1", + ) + exp1 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx1", + ) - idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - freq='D', name='idx2') - exp2 = PeriodIndex(['2011-01-01', 
'2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], - freq='D', name='idx2') + idx2 = PeriodIndex( + ["2011-01-01", "2011-01-03", "2011-01-05", "2011-01-02", "2011-01-01"], + freq="D", + name="idx2", + ) + exp2 = PeriodIndex( + ["2011-01-01", "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-05"], + freq="D", + name="idx2", + ) - idx3 = PeriodIndex([NaT, '2011-01-03', '2011-01-05', - '2011-01-02', NaT], freq='D', name='idx3') - exp3 = PeriodIndex([NaT, NaT, '2011-01-02', '2011-01-03', - '2011-01-05'], freq='D', name='idx3') + idx3 = PeriodIndex( + [NaT, "2011-01-03", "2011-01-05", "2011-01-02", NaT], freq="D", name="idx3" + ) + exp3 = PeriodIndex( + [NaT, NaT, "2011-01-02", "2011-01-03", "2011-01-05"], freq="D", name="idx3" + ) for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: ordered = idx.sort_values() tm.assert_index_equal(ordered, expected) - assert ordered.freq == 'D' + assert ordered.freq == "D" ordered = idx.sort_values(ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - assert ordered.freq == 'D' + assert ordered.freq == "D" ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, expected) exp = np.array([0, 4, 3, 1, 2]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == 'D' + assert ordered.freq == "D" - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) - assert ordered.freq == 'D' + assert ordered.freq == "D" def test_shift(self): # This is tested in test_arithmetic @@ -264,29 +286,26 @@ def test_shift(self): def test_nat(self): assert pd.PeriodIndex._na_value is NaT - assert pd.PeriodIndex([], freq='M')._na_value is NaT + assert pd.PeriodIndex([], freq="M")._na_value is NaT - idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02"], freq="D") assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D') + idx = pd.PeriodIndex(["2011-01-01", "NaT"], freq="D") assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) - @pytest.mark.parametrize('freq', ['D', 'M']) + @pytest.mark.parametrize("freq", ["D", "M"]) def test_equals(self, freq): # GH#13107 - idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq=freq) + idx = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq=freq) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -295,8 +314,7 @@ def test_equals(self, freq): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], - freq='H') + idx2 = pd.PeriodIndex(["2011-01-01", "2011-01-02", "NaT"], freq="H") assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -318,7 +336,7 @@ def test_equals(self, freq): def 
test_freq_setter_deprecated(self): # GH 20678 - idx = pd.period_range('2018Q1', periods=4, freq='Q') + idx = pd.period_range("2018Q1", periods=4, freq="Q") # no warning for getter with tm.assert_produces_warning(None): diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index dbde7ecf3826d..00b9803980bc8 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -7,13 +7,11 @@ class TestPeriodIndex: - def setup_method(self, method): pass def test_slice_with_negative_step(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -21,112 +19,117 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) - assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) + assert_slices_equivalent(SLC[Period("2014-10") :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC["2014-10"::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:Period('2014-10'):-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[:'2014-10':-1], SLC[:8:-1]) + assert_slices_equivalent(SLC[: Period("2014-10") : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[:"2014-10":-1], SLC[:8:-1]) - assert_slices_equivalent(SLC['2015-02':'2014-10':-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC['2015-02':Period('2014-10'):-1], - SLC[13:8:-1]) - assert_slices_equivalent(SLC[Period('2015-02'):'2014-10':-1], - SLC[13:8:-1]) + assert_slices_equivalent(SLC["2015-02":"2014-10":-1], SLC[13:8:-1]) + assert_slices_equivalent( + SLC[Period("2015-02") : Period("2014-10") : -1], SLC[13:8:-1] + ) + assert_slices_equivalent(SLC["2015-02" : Period("2014-10") : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[Period("2015-02") : "2014-10" : -1], SLC[13:8:-1]) - assert_slices_equivalent(SLC['2014-10':'2015-02':-1], SLC[:0]) + assert_slices_equivalent(SLC["2014-10":"2015-02":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), - period_range('2014-01', periods=20, freq='M')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), period_range("2014-01", periods=20, freq="M")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] def test_slice_keep_name(self): - idx = period_range('20010101', periods=10, freq='D', name='bob') + idx = period_range("20010101", periods=10, freq="D", name="bob") assert idx.name == idx[1:].name def test_pindex_slice_index(self): - pi = period_range(start='1/1/10', end='12/31/12', freq='M') + pi = period_range(start="1/1/10", end="12/31/12", freq="M") s = Series(np.random.rand(len(pi)), index=pi) - res = s['2010'] + res = s["2010"] exp = s[0:12] tm.assert_series_equal(res, exp) - res = s['2011'] + res = s["2011"] exp = s[12:24] tm.assert_series_equal(res, exp) def test_range_slice_day(self): # GH#6716 - 
didx = pd.date_range(start='2013/01/01', freq='D', periods=400) - pidx = period_range(start='2013/01/01', freq='D', periods=400) + didx = pd.date_range(start="2013/01/01", freq="D", periods=400) + pidx = period_range(start="2013/01/01", freq="D", periods=400) for idx in [didx, pidx]: # slices against index should raise IndexError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: with pytest.raises(TypeError): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/02':], s[1:]) - tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) - tm.assert_series_equal(s['2013/02':], s[31:]) - tm.assert_series_equal(s['2014':], s[365:]) + tm.assert_series_equal(s["2013/01/02":], s[1:]) + tm.assert_series_equal(s["2013/01/02":"2013/01/05"], s[1:5]) + tm.assert_series_equal(s["2013/02":], s[31:]) + tm.assert_series_equal(s["2014":], s[365:]) - invalid = ['2013/02/01 9H', '2013/02/01 09:00'] + invalid = ["2013/02/01 9H", "2013/02/01 09:00"] for v in invalid: with pytest.raises(TypeError): idx[v:] def test_range_slice_seconds(self): # GH#6716 - didx = pd.date_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) - pidx = period_range(start='2013/01/01 09:00:00', freq='S', - periods=4000) + didx = pd.date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) + pidx = period_range(start="2013/01/01 09:00:00", freq="S", periods=4000) for idx in [didx, pidx]: # slices against index should raise IndexError - values = ['2014', '2013/02', '2013/01/02', '2013/02/01 9H', - '2013/02/01 09:00'] + values = [ + "2014", + "2013/02", + "2013/01/02", + "2013/02/01 9H", + "2013/02/01 09:00", + ] for v in values: with pytest.raises(TypeError): idx[v:] s = Series(np.random.rand(len(idx)), index=idx) - tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], - s[300:660]) - tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], - s[3600:3960]) - tm.assert_series_equal(s['2013/01/01 10H':], s[3600:]) - tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) - for d in ['2013/01/01', '2013/01', '2013']: + tm.assert_series_equal(s["2013/01/01 09:05":"2013/01/01 09:10"], s[300:660]) + tm.assert_series_equal( + s["2013/01/01 10:00":"2013/01/01 10:05"], s[3600:3960] + ) + tm.assert_series_equal(s["2013/01/01 10H":], s[3600:]) + tm.assert_series_equal(s[:"2013/01/01 09:30"], s[:1860]) + for d in ["2013/01/01", "2013/01", "2013"]: tm.assert_series_equal(s[d:], s) def test_range_slice_outofbounds(self): # GH#5407 - didx = pd.date_range(start='2013/10/01', freq='D', periods=10) - pidx = period_range(start='2013/10/01', freq='D', periods=10) + didx = pd.date_range(start="2013/10/01", freq="D", periods=10) + pidx = period_range(start="2013/10/01", freq="D", periods=10) for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) - empty = DataFrame(index=idx.__class__([], freq='D'), - columns=['units']) - empty['units'] = empty['units'].astype('int64') - - tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) - tm.assert_frame_equal(df['2013/09/30':'2013/10/02'], df.iloc[:2]) - tm.assert_frame_equal(df['2013/10/01':'2013/10/02'], df.iloc[:2]) - tm.assert_frame_equal(df['2013/10/02':'2013/09/30'], empty) - tm.assert_frame_equal(df['2013/10/15':'2013/10/17'], empty) - tm.assert_frame_equal(df['2013-06':'2013-09'], empty) - tm.assert_frame_equal(df['2013-11':'2013-12'], empty) + empty 
= DataFrame(index=idx.__class__([], freq="D"), columns=["units"]) + empty["units"] = empty["units"].astype("int64") + + tm.assert_frame_equal(df["2013/09/01":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/09/30":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/01":"2013/10/02"], df.iloc[:2]) + tm.assert_frame_equal(df["2013/10/02":"2013/09/30"], empty) + tm.assert_frame_equal(df["2013/10/15":"2013/10/17"], empty) + tm.assert_frame_equal(df["2013-06":"2013-09"], empty) + tm.assert_frame_equal(df["2013-11":"2013-12"], empty) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index b33982f3d62f3..8b3b66bd1ee6b 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -6,8 +6,17 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex, Series, - date_range, offsets, period_range) + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + Series, + date_range, + offsets, + period_range, +) from pandas.util import testing as tm from ..datetimelike import DatetimeLike @@ -17,20 +26,21 @@ class TestPeriodIndex(DatetimeLike): _holder = PeriodIndex def setup_method(self, method): - self.indices = dict(index=tm.makePeriodIndex(10), - index_dec=period_range('20130101', periods=10, - freq='D')[::-1]) + self.indices = dict( + index=tm.makePeriodIndex(10), + index_dec=period_range("20130101", periods=10, freq="D")[::-1], + ) self.setup_indices() def create_index(self): - return period_range('20130101', periods=5, freq='D') + return period_range("20130101", periods=5, freq="D") def test_pickle_compat_construction(self): pass - @pytest.mark.parametrize('freq', ['D', 'M', 'A']) + @pytest.mark.parametrize("freq", ["D", "M", "A"]) def test_pickle_round_trip(self, freq): - idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq=freq) + idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.NaN], freq=freq) result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) @@ -38,11 +48,15 @@ def test_where(self): # This is handled in test_indexing pass - @pytest.mark.parametrize('use_numpy', [True, False]) - @pytest.mark.parametrize('index', [ - pd.period_range('2000-01-01', periods=3, freq='D'), - pd.period_range('2001-01-01', periods=3, freq='2D'), - pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M')]) + @pytest.mark.parametrize("use_numpy", [True, False]) + @pytest.mark.parametrize( + "index", + [ + pd.period_range("2000-01-01", periods=3, freq="D"), + pd.period_range("2001-01-01", periods=3, freq="2D"), + pd.PeriodIndex(["2001-01", "NaT", "2003-01"], freq="M"), + ], + ) def test_repeat_freqstr(self, index, use_numpy): # GH10183 expected = PeriodIndex([p for p in index for _ in range(3)]) @@ -52,23 +66,32 @@ def test_repeat_freqstr(self, index, use_numpy): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, - '2011-01-01 11:00'], freq='H') - - exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', - '2011-01-01 11:00'], freq='H') - tm.assert_index_equal( - idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), 'x', - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) - - exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), - pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-01 11:00', freq='H')], dtype=object) - tm.assert_index_equal(idx.fillna( - 
pd.Period('2011-01-01', freq='D')), exp) + idx = pd.PeriodIndex(["2011-01-01 09:00", pd.NaT, "2011-01-01 11:00"], freq="H") + + exp = pd.PeriodIndex( + ["2011-01-01 09:00", "2011-01-01 10:00", "2011-01-01 11:00"], freq="H" + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01 10:00", freq="H")), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + "x", + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna("x"), exp) + + exp = pd.Index( + [ + pd.Period("2011-01-01 09:00", freq="H"), + pd.Period("2011-01-01", freq="D"), + pd.Period("2011-01-01 11:00", freq="H"), + ], + dtype=object, + ) + tm.assert_index_equal(idx.fillna(pd.Period("2011-01-01", freq="D")), exp) def test_no_millisecond_field(self): msg = "type object 'DatetimeIndex' has no attribute 'millisecond'" @@ -87,32 +110,32 @@ def test_difference_freq(self, sort): index = period_range("20160920", "20160925", freq="D") other = period_range("20160921", "20160924", freq="D") - expected = PeriodIndex(["20160920", "20160925"], freq='D') + expected = PeriodIndex(["20160920", "20160925"], freq="D") idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = period_range("20160922", "20160925", freq="D") idx_diff = index.difference(other, sort) - expected = PeriodIndex(["20160920", "20160921"], freq='D') + expected = PeriodIndex(["20160920", "20160921"], freq="D") tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) def test_hash_error(self): - index = period_range('20010101', periods=10) + index = period_range("20010101", periods=10) msg = "unhashable type: '{}'".format(type(index).__name__) with pytest.raises(TypeError, match=msg): hash(index) def test_make_time_series(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") series = Series(1, index=index) assert isinstance(series, Series) def test_shallow_copy_empty(self): # GH13067 - idx = PeriodIndex([], freq='M') + idx = PeriodIndex([], freq="M") result = idx._shallow_copy() expected = idx @@ -131,36 +154,36 @@ def test_shallow_copy_changing_freq_raises(self): pi._shallow_copy(pi, freq="H") def test_dtype_str(self): - pi = pd.PeriodIndex([], freq='M') + pi = pd.PeriodIndex([], freq="M") with tm.assert_produces_warning(FutureWarning): - assert pi.dtype_str == 'period[M]' + assert pi.dtype_str == "period[M]" assert pi.dtype_str == str(pi.dtype) with tm.assert_produces_warning(FutureWarning): - pi = pd.PeriodIndex([], freq='3M') - assert pi.dtype_str == 'period[3M]' + pi = pd.PeriodIndex([], freq="3M") + assert pi.dtype_str == "period[3M]" assert pi.dtype_str == str(pi.dtype) def test_view_asi8(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) exp = np.array([14975, -9223372036854775808], 
dtype=np.int64) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') - tm.assert_numpy_array_equal(idx.view('i8'), exp) + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") + tm.assert_numpy_array_equal(idx.view("i8"), exp) tm.assert_numpy_array_equal(idx.asi8, exp) def test_values(self): - idx = pd.PeriodIndex([], freq='M') + idx = pd.PeriodIndex([], freq="M") exp = np.array([], dtype=np.object) tm.assert_numpy_array_equal(idx.values, exp) @@ -170,52 +193,51 @@ def test_values(self): exp = np.array([], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) - idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + idx = pd.PeriodIndex(["2011-01", pd.NaT], freq="M") - exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + exp = np.array([pd.Period("2011-01", freq="M"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) - idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + idx = pd.PeriodIndex(["2011-01-01", pd.NaT], freq="D") - exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], - dtype=object) + exp = np.array([pd.Period("2011-01-01", freq="D"), pd.NaT], dtype=object) tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.to_numpy(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): - pi = period_range(freq='A', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2009") assert len(pi) == 9 - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2009") assert len(pi) == 4 * 9 - pi = period_range(freq='M', start='1/1/2001', end='12/1/2009') + pi = period_range(freq="M", start="1/1/2001", end="12/1/2009") assert len(pi) == 12 * 9 - start = Period('02-Apr-2005', 'B') + start = Period("02-Apr-2005", "B") i1 = period_range(start=start, periods=20) assert len(i1) == 20 assert i1.freq == start.freq assert i1[0] == start - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) assert len(i1) == 10 assert i1.freq == end_intv.freq assert i1[-1] == end_intv - end_intv = Period('2006-12-31', '1w') + end_intv = Period("2006-12-31", "1w") i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period('2006-12-31', ('w', 1)) + end_intv = Period("2006-12-31", ("w", 1)) i2 = period_range(end=end_intv, periods=10) assert len(i1) == len(i2) assert (i1 == i2).all() @@ -225,25 +247,27 @@ def test_period_index_length(self): with pytest.raises(ValueError, match=msg): period_range(start=start, end=end_intv) - end_intv = Period('2005-05-01', 'B') + end_intv = Period("2005-05-01", "B") i1 = period_range(start=start, end=end_intv) - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): period_range(start=start) # infer freq from first element - i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + i2 = PeriodIndex([end_intv, Period("2005-05-05", "B")]) assert len(i2) == 2 assert i2[0] == end_intv - i2 = 
PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + i2 = PeriodIndex(np.array([end_intv, Period("2005-05-05", "B")])) assert len(i2) == 2 assert i2[0] == end_intv # Mixed freq should fail - vals = [end_intv, Period('2006-12-31', 'w')] + vals = [end_intv, Period("2006-12-31", "w")] msg = r"Input has different freq=W-SUN from PeriodIndex\(freq=B\)" with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) @@ -255,39 +279,52 @@ def test_fields(self): # year, month, day, hour, minute # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter # qyear - pi = period_range(freq='A', start='1/1/2001', end='12/1/2005') + pi = period_range(freq="A", start="1/1/2001", end="12/1/2005") self._check_all_fields(pi) - pi = period_range(freq='Q', start='1/1/2001', end='12/1/2002') + pi = period_range(freq="Q", start="1/1/2001", end="12/1/2002") self._check_all_fields(pi) - pi = period_range(freq='M', start='1/1/2001', end='1/1/2002') + pi = period_range(freq="M", start="1/1/2001", end="1/1/2002") self._check_all_fields(pi) - pi = period_range(freq='D', start='12/1/2001', end='6/1/2001') + pi = period_range(freq="D", start="12/1/2001", end="6/1/2001") self._check_all_fields(pi) - pi = period_range(freq='B', start='12/1/2001', end='6/1/2001') + pi = period_range(freq="B", start="12/1/2001", end="6/1/2001") self._check_all_fields(pi) - pi = period_range(freq='H', start='12/31/2001', end='1/1/2002 23:00') + pi = period_range(freq="H", start="12/31/2001", end="1/1/2002 23:00") self._check_all_fields(pi) - pi = period_range(freq='Min', start='12/31/2001', end='1/1/2002 00:20') + pi = period_range(freq="Min", start="12/31/2001", end="1/1/2002 00:20") self._check_all_fields(pi) - pi = period_range(freq='S', start='12/31/2001 00:00:00', - end='12/31/2001 00:05:00') + pi = period_range( + freq="S", start="12/31/2001 00:00:00", end="12/31/2001 00:05:00" + ) self._check_all_fields(pi) - end_intv = Period('2006-12-31', 'W') + end_intv = Period("2006-12-31", "W") i1 = period_range(end=end_intv, periods=10) self._check_all_fields(i1) def _check_all_fields(self, periodindex): - fields = ['year', 'month', 'day', 'hour', 'minute', 'second', - 'weekofyear', 'week', 'dayofweek', 'dayofyear', - 'quarter', 'qyear', 'days_in_month'] + fields = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "weekofyear", + "week", + "dayofweek", + "dayofyear", + "quarter", + "qyear", + "days_in_month", + ] periods = list(periodindex) s = pd.Series(periodindex) @@ -309,8 +346,8 @@ def _check_all_fields(self, periodindex): def test_period_set_index_reindex(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = period_range('2011/01/01', periods=6, freq='M') - idx2 = period_range('2013', periods=6, freq='A') + idx1 = period_range("2011/01/01", periods=6, freq="M") + idx2 = period_range("2013", periods=6, freq="A") df = df.set_index(idx1) tm.assert_index_equal(df.index, idx1) @@ -318,11 +355,12 @@ def test_period_set_index_reindex(self): tm.assert_index_equal(df.index, idx2) def test_factorize(self): - idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', - '2014-03', '2014-03'], freq='M') + idx1 = PeriodIndex( + ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M" + ) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M") arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) @@ -332,8 +370,9 @@ def 
test_factorize(self): tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', - '2014-03', '2014-01'], freq='M') + idx2 = pd.PeriodIndex( + ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M" + ) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) arr, idx = idx2.factorize(sort=True) @@ -341,14 +380,13 @@ def test_factorize(self): tm.assert_index_equal(idx, exp_idx) exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) - exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') + exp_idx = PeriodIndex(["2014-03", "2014-02", "2014-01"], freq="M") arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) def test_is_(self): - create_index = lambda: period_range(freq='A', start='1/1/2001', - end='12/1/2009') + create_index = lambda: period_range(freq="A", start="1/1/2001", end="12/1/2009") index = create_index() assert index.is_(index) assert not index.is_(create_index()) @@ -359,55 +397,57 @@ def test_is_(self): index.name = "Apple" assert ind2.is_(index) assert not index.is_(index[:]) - assert not index.is_(index.asfreq('M')) - assert not index.is_(index.asfreq('A')) + assert not index.is_(index.asfreq("M")) + assert not index.is_(index.asfreq("A")) assert not index.is_(index - 2) assert not index.is_(index - 0) def test_contains(self): - rng = period_range('2007-01', freq='M', periods=10) + rng = period_range("2007-01", freq="M", periods=10) - assert Period('2007-01', freq='M') in rng - assert not Period('2007-01', freq='D') in rng - assert not Period('2007-01', freq='2M') in rng + assert Period("2007-01", freq="M") in rng + assert not Period("2007-01", freq="D") in rng + assert not Period("2007-01", freq="2M") in rng def test_contains_nat(self): # see gh-13582 - idx = period_range('2007-01', freq='M', periods=10) + idx = period_range("2007-01", freq="M", periods=10) assert pd.NaT not in idx assert None not in idx - assert float('nan') not in idx + assert float("nan") not in idx assert np.nan not in idx - idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + idx = pd.PeriodIndex(["2011-01", "NaT", "2011-02"], freq="M") assert pd.NaT in idx assert None in idx - assert float('nan') in idx + assert float("nan") in idx assert np.nan in idx def test_periods_number_check(self): - msg = ("Of the three parameters: start, end, and periods, exactly two" - " must be specified") + msg = ( + "Of the three parameters: start, end, and periods, exactly two" + " must be specified" + ) with pytest.raises(ValueError, match=msg): - period_range('2011-1-1', '2012-1-1', 'B') + period_range("2011-1-1", "2012-1-1", "B") def test_start_time(self): # GH 17157 - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="MS") tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): # GH 17157 - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - expected_index = expected_index.shift(1, freq='D').shift(-1, freq='ns') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + expected_index = expected_index.shift(1, 
freq="D").shift(-1, freq="ns") tm.assert_index_equal(index.end_time, expected_index) def test_index_duplicate_periods(self): # monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) result = ts[2007] @@ -417,7 +457,7 @@ def test_index_duplicate_periods(self): assert (ts[1:3] == 1).all() # not monotonic - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) result = ts[2007] @@ -425,15 +465,13 @@ def test_index_duplicate_periods(self): tm.assert_series_equal(result, expected) def test_index_unique(self): - idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 - idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', - tz='US/Eastern') - expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', - tz='US/Eastern') + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN", tz="US/Eastern") + expected = PeriodIndex([2000, 2007, 2009], freq="A-JUN", tz="US/Eastern") tm.assert_index_equal(idx.unique(), expected) assert idx.nunique() == 3 @@ -446,68 +484,71 @@ def test_ndarray_compat_properties(self): super().test_ndarray_compat_properties() def test_negative_ordinals(self): - Period(ordinal=-1000, freq='A') - Period(ordinal=0, freq='A') + Period(ordinal=-1000, freq="A") + Period(ordinal=0, freq="A") - idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') - idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') + idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq="A") + idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq="A") tm.assert_index_equal(idx1, idx2) def test_pindex_fieldaccessor_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', - '2012-03', '2012-04'], freq='D', name='name') + idx = PeriodIndex( + ["2011-01", "2011-02", "NaT", "2012-03", "2012-04"], freq="D", name="name" + ) - exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name='name') + exp = Index([2011, 2011, -1, 2012, 2012], dtype=np.int64, name="name") tm.assert_index_equal(idx.year, exp) - exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name='name') + exp = Index([1, 2, -1, 3, 4], dtype=np.int64, name="name") tm.assert_index_equal(idx.month, exp) def test_pindex_qaccess(self): - pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + pi = PeriodIndex(["2Q05", "3Q05", "4Q05", "1Q06", "2Q06"], freq="Q") s = Series(np.random.rand(len(pi)), index=pi).cumsum() # Todo: fix these accessors! 
- assert s['05Q4'] == s[2] + assert s["05Q4"] == s[2] def test_pindex_multiples(self): with tm.assert_produces_warning(FutureWarning): - pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M') - expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07', - '2011-09', '2011-11'], freq='2M') + pi = PeriodIndex(start="1/1/11", end="12/31/11", freq="2M") + expected = PeriodIndex( + ["2011-01", "2011-03", "2011-05", "2011-07", "2011-09", "2011-11"], + freq="2M", + ) tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" - pi = period_range(start='1/1/11', end='12/31/11', freq='2M') + pi = period_range(start="1/1/11", end="12/31/11", freq="2M") tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" - pi = period_range(start='1/1/11', periods=6, freq='2M') + pi = period_range(start="1/1/11", periods=6, freq="2M") tm.assert_index_equal(pi, expected) assert pi.freq == offsets.MonthEnd(2) - assert pi.freqstr == '2M' + assert pi.freqstr == "2M" def test_iteration(self): - index = period_range(start='1/1/10', periods=4, freq='B') + index = period_range(start="1/1/10", periods=4, freq="B") result = list(index) assert isinstance(result[0], Period) assert result[0].freq == index.freq def test_is_full(self): - index = PeriodIndex([2005, 2007, 2009], freq='A') + index = PeriodIndex([2005, 2007, 2009], freq="A") assert not index.is_full - index = PeriodIndex([2005, 2006, 2007], freq='A') + index = PeriodIndex([2005, 2006, 2007], freq="A") assert index.is_full - index = PeriodIndex([2005, 2005, 2007], freq='A') + index = PeriodIndex([2005, 2005, 2007], freq="A") assert not index.is_full - index = PeriodIndex([2005, 2005, 2006], freq='A') + index = PeriodIndex([2005, 2005, 2006], freq="A") assert index.is_full - index = PeriodIndex([2006, 2005, 2005], freq='A') + index = PeriodIndex([2006, 2005, 2005], freq="A") with pytest.raises(ValueError, match="Index is not monotonic"): index.is_full @@ -515,8 +556,8 @@ def test_is_full(self): def test_with_multi_index(self): # #1705 - index = date_range('1/1/2012', periods=4, freq='12H') - index_as_arrays = [index.to_period(freq='D'), index.hour] + index = date_range("1/1/2012", periods=4, freq="12H") + index_as_arrays = [index.to_period(freq="D"), index.hour] s = Series([0, 1, 2, 3], index_as_arrays) @@ -525,7 +566,7 @@ def test_with_multi_index(self): assert isinstance(s.index.values[0][0], Period) def test_convert_array_of_periods(self): - rng = period_range('1/1/2000', periods=20, freq='D') + rng = period_range("1/1/2000", periods=20, freq="D") periods = list(rng) result = pd.Index(periods) @@ -533,8 +574,8 @@ def test_convert_array_of_periods(self): def test_append_concat(self): # #1815 - d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') - d2 = date_range('12/31/2000', '12/31/2009', freq='A-DEC') + d1 = date_range("12/31/1990", "12/31/1999", freq="A-DEC") + d2 = date_range("12/31/2000", "12/31/2009", freq="A-DEC") s1 = Series(np.random.randn(10), d1) s2 = Series(np.random.randn(10), d2) @@ -549,35 +590,36 @@ def test_append_concat(self): def test_pickle_freq(self): # GH2891 - prng = period_range('1/1/2011', '1/1/2012', freq='M') + prng = period_range("1/1/2011", "1/1/2012", freq="M") new_prng = tm.round_trip_pickle(prng) assert new_prng.freq == offsets.MonthEnd() - assert new_prng.freqstr == 'M' + assert new_prng.freqstr == "M" def test_map(self): # test_map_dictlike generally tests - index = 
PeriodIndex([2005, 2007, 2009], freq='A') + index = PeriodIndex([2005, 2007, 2009], freq="A") result = index.map(lambda x: x.ordinal) exp = Index([x.ordinal for x in index]) tm.assert_index_equal(result, exp) def test_join_self(self, join_type): - index = period_range('1/1/2000', periods=10) + index = period_range("1/1/2000", periods=10) joined = index.join(index, how=join_type) assert index is joined def test_insert(self): # GH 18295 (test missing) expected = PeriodIndex( - ['2017Q1', pd.NaT, '2017Q2', '2017Q3', '2017Q4'], freq='Q') + ["2017Q1", pd.NaT, "2017Q2", "2017Q3", "2017Q4"], freq="Q" + ) for na in (np.nan, pd.NaT, None): - result = period_range('2017Q1', periods=4, freq='Q').insert(1, na) + result = period_range("2017Q1", periods=4, freq="Q").insert(1, na) tm.assert_index_equal(result, expected) def test_maybe_convert_timedelta(): - pi = PeriodIndex(['2000', '2001'], freq='D') + pi = PeriodIndex(["2000", "2001"], freq="D") offset = offsets.Day(2) assert pi._maybe_convert_timedelta(offset) == 2 assert pi._maybe_convert_timedelta(2) == 2 diff --git a/pandas/tests/indexes/period/test_period_range.py b/pandas/tests/indexes/period/test_period_range.py index ca75635e56161..828fab08daceb 100644 --- a/pandas/tests/indexes/period/test_period_range.py +++ b/pandas/tests/indexes/period/test_period_range.py @@ -5,71 +5,75 @@ class TestPeriodRange: - - @pytest.mark.parametrize('freq', ['D', 'W', 'M', 'Q', 'A']) + @pytest.mark.parametrize("freq", ["D", "W", "M", "Q", "A"]) def test_construction_from_string(self, freq): # non-empty - expected = date_range(start='2017-01-01', periods=5, - freq=freq, name='foo').to_period() + expected = date_range( + start="2017-01-01", periods=5, freq=freq, name="foo" + ).to_period() start, end = str(expected[0]), str(expected[-1]) - result = period_range(start=start, end=end, freq=freq, name='foo') + result = period_range(start=start, end=end, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=start, periods=5, freq=freq, name='foo') + result = period_range(start=start, periods=5, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=5, freq=freq, name='foo') + result = period_range(end=end, periods=5, freq=freq, name="foo") tm.assert_index_equal(result, expected) # empty - expected = PeriodIndex([], freq=freq, name='foo') + expected = PeriodIndex([], freq=freq, name="foo") - result = period_range(start=start, periods=0, freq=freq, name='foo') + result = period_range(start=start, periods=0, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=0, freq=freq, name='foo') + result = period_range(end=end, periods=0, freq=freq, name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=end, end=start, freq=freq, name='foo') + result = period_range(start=end, end=start, freq=freq, name="foo") tm.assert_index_equal(result, expected) def test_construction_from_period(self): # upsampling - start, end = Period('2017Q1', freq='Q'), Period('2018Q1', freq='Q') - expected = date_range(start='2017-03-31', end='2018-03-31', freq='M', - name='foo').to_period() - result = period_range(start=start, end=end, freq='M', name='foo') + start, end = Period("2017Q1", freq="Q"), Period("2018Q1", freq="Q") + expected = date_range( + start="2017-03-31", end="2018-03-31", freq="M", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="M", name="foo") tm.assert_index_equal(result, expected) # downsampling 
- start, end = Period('2017-1', freq='M'), Period('2019-12', freq='M') - expected = date_range(start='2017-01-31', end='2019-12-31', freq='Q', - name='foo').to_period() - result = period_range(start=start, end=end, freq='Q', name='foo') + start, end = Period("2017-1", freq="M"), Period("2019-12", freq="M") + expected = date_range( + start="2017-01-31", end="2019-12-31", freq="Q", name="foo" + ).to_period() + result = period_range(start=start, end=end, freq="Q", name="foo") tm.assert_index_equal(result, expected) # empty - expected = PeriodIndex([], freq='W', name='foo') + expected = PeriodIndex([], freq="W", name="foo") - result = period_range(start=start, periods=0, freq='W', name='foo') + result = period_range(start=start, periods=0, freq="W", name="foo") tm.assert_index_equal(result, expected) - result = period_range(end=end, periods=0, freq='W', name='foo') + result = period_range(end=end, periods=0, freq="W", name="foo") tm.assert_index_equal(result, expected) - result = period_range(start=end, end=start, freq='W', name='foo') + result = period_range(start=end, end=start, freq="W", name="foo") tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the three parameters: start, end, and periods, ' - 'exactly two must be specified') + msg = ( + "Of the three parameters: start, end, and periods, " + "exactly two must be specified" + ) with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1') + period_range(start="2017Q1") with pytest.raises(ValueError, match=msg): - period_range(end='2017Q1') + period_range(end="2017Q1") with pytest.raises(ValueError, match=msg): period_range(periods=5) @@ -79,17 +83,17 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1', end='2018Q1', periods=8, freq='Q') + period_range(start="2017Q1", end="2018Q1", periods=8, freq="Q") # start/end NaT - msg = 'start and end must not be NaT' + msg = "start and end must not be NaT" with pytest.raises(ValueError, match=msg): - period_range(start=NaT, end='2018Q1') + period_range(start=NaT, end="2018Q1") with pytest.raises(ValueError, match=msg): - period_range(start='2017Q1', end=NaT) + period_range(start="2017Q1", end=NaT) # invalid periods param - msg = 'periods must be a number, got foo' + msg = "periods must be a number, got foo" with pytest.raises(TypeError, match=msg): - period_range(start='2017Q1', periods='foo') + period_range(start="2017Q1", periods="foo") diff --git a/pandas/tests/indexes/period/test_scalar_compat.py b/pandas/tests/indexes/period/test_scalar_compat.py index ac01b4aad81c9..7956b9f26e6ef 100644 --- a/pandas/tests/indexes/period/test_scalar_compat.py +++ b/pandas/tests/indexes/period/test_scalar_compat.py @@ -6,12 +6,12 @@ class TestPeriodIndexOps: def test_start_time(self): - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="MS") tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): - index = period_range(freq='M', start='2016-01-01', end='2016-05-31') - expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - expected_index += Timedelta(1, 'D') - Timedelta(1, 'ns') + index = period_range(freq="M", start="2016-01-01", end="2016-05-31") + expected_index = date_range("2016-01-01", end="2016-05-31", freq="M") + 
expected_index += Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(index.end_time, expected_index) diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index a9102aeec060c..94b061330002f 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -12,9 +12,8 @@ def _permute(obj): class TestPeriodIndex: - def test_joins(self, join_type): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") joined = index.join(index[:-5], how=join_type) @@ -22,86 +21,140 @@ def test_joins(self, join_type): assert joined.freq == index.freq def test_join_self(self, join_type): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") res = index.join(index, how=join_type) assert index is res def test_join_does_not_recur(self): df = tm.makeCustomDataframe( - 3, 2, data_gen_f=lambda *args: np.random.randint(2), - c_idx_type='p', r_idx_type='dt') + 3, + 2, + data_gen_f=lambda *args: np.random.randint(2), + c_idx_type="p", + r_idx_type="dt", + ) s = df.iloc[:2, 0] - res = s.index.join(df.columns, how='outer') - expected = Index([s.index[0], s.index[1], - df.columns[0], df.columns[1]], object) + res = s.index.join(df.columns, how="outer") + expected = Index([s.index[0], s.index[1], df.columns[0], df.columns[1]], object) tm.assert_index_equal(res, expected) @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # union - other1 = pd.period_range('1/1/2000', freq='D', periods=5) - rng1 = pd.period_range('1/6/2000', freq='D', periods=5) - expected1 = pd.PeriodIndex(['2000-01-06', '2000-01-07', - '2000-01-08', '2000-01-09', - '2000-01-10', '2000-01-01', - '2000-01-02', '2000-01-03', - '2000-01-04', '2000-01-05'], - freq='D') - - rng2 = pd.period_range('1/1/2000', freq='D', periods=5) - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.period_range('1/1/2000', freq='D', periods=8) - - rng3 = pd.period_range('1/1/2000', freq='D', periods=5) - other3 = pd.PeriodIndex([], freq='D') - expected3 = pd.period_range('1/1/2000', freq='D', periods=5) - - rng4 = pd.period_range('2000-01-01 09:00', freq='H', periods=5) - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) - expected4 = pd.PeriodIndex(['2000-01-01 09:00', '2000-01-01 10:00', - '2000-01-01 11:00', '2000-01-01 12:00', - '2000-01-01 13:00', '2000-01-02 09:00', - '2000-01-02 10:00', '2000-01-02 11:00', - '2000-01-02 12:00', '2000-01-02 13:00'], - freq='H') - - rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' - '2000-01-01 09:08'], - freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', - '2000-01-01 09:05', '2000-01-01 09:08'], - freq='T') - - rng6 = pd.period_range('2000-01-01', freq='M', periods=7) - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.period_range('2000-01-01', freq='M', periods=10) - - rng7 = pd.period_range('2003-01-01', freq='A', periods=5) - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.PeriodIndex(['2003', '2004', '2005', '2006', '2007', - '1998', '1999', '2000', '2001', '2002'], - freq='A') - - rng8 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000', - '1/5/2000', '1/4/2000'], freq='D') - other8 = pd.period_range('1/6/2000', freq='D', periods=5) - expected8 = 
pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000', - '1/5/2000', '1/4/2000', '1/6/2000', - '1/7/2000', '1/8/2000', '1/9/2000', - '1/10/2000'], freq='D') - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), - (rng4, other4, expected4), - (rng5, other5, expected5), - (rng6, other6, expected6), - (rng7, other7, expected7), - (rng8, other8, expected8)]: + other1 = pd.period_range("1/1/2000", freq="D", periods=5) + rng1 = pd.period_range("1/6/2000", freq="D", periods=5) + expected1 = pd.PeriodIndex( + [ + "2000-01-06", + "2000-01-07", + "2000-01-08", + "2000-01-09", + "2000-01-10", + "2000-01-01", + "2000-01-02", + "2000-01-03", + "2000-01-04", + "2000-01-05", + ], + freq="D", + ) + + rng2 = pd.period_range("1/1/2000", freq="D", periods=5) + other2 = pd.period_range("1/4/2000", freq="D", periods=5) + expected2 = pd.period_range("1/1/2000", freq="D", periods=8) + + rng3 = pd.period_range("1/1/2000", freq="D", periods=5) + other3 = pd.PeriodIndex([], freq="D") + expected3 = pd.period_range("1/1/2000", freq="D", periods=5) + + rng4 = pd.period_range("2000-01-01 09:00", freq="H", periods=5) + other4 = pd.period_range("2000-01-02 09:00", freq="H", periods=5) + expected4 = pd.PeriodIndex( + [ + "2000-01-01 09:00", + "2000-01-01 10:00", + "2000-01-01 11:00", + "2000-01-01 12:00", + "2000-01-01 13:00", + "2000-01-02 09:00", + "2000-01-02 10:00", + "2000-01-02 11:00", + "2000-01-02 12:00", + "2000-01-02 13:00", + ], + freq="H", + ) + + rng5 = pd.PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:03", "2000-01-01 09:05"], freq="T" + ) + other5 = pd.PeriodIndex( + ["2000-01-01 09:01", "2000-01-01 09:05" "2000-01-01 09:08"], freq="T" + ) + expected5 = pd.PeriodIndex( + [ + "2000-01-01 09:01", + "2000-01-01 09:03", + "2000-01-01 09:05", + "2000-01-01 09:08", + ], + freq="T", + ) + + rng6 = pd.period_range("2000-01-01", freq="M", periods=7) + other6 = pd.period_range("2000-04-01", freq="M", periods=7) + expected6 = pd.period_range("2000-01-01", freq="M", periods=10) + + rng7 = pd.period_range("2003-01-01", freq="A", periods=5) + other7 = pd.period_range("1998-01-01", freq="A", periods=8) + expected7 = pd.PeriodIndex( + [ + "2003", + "2004", + "2005", + "2006", + "2007", + "1998", + "1999", + "2000", + "2001", + "2002", + ], + freq="A", + ) + + rng8 = pd.PeriodIndex( + ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"], freq="D" + ) + other8 = pd.period_range("1/6/2000", freq="D", periods=5) + expected8 = pd.PeriodIndex( + [ + "1/3/2000", + "1/2/2000", + "1/1/2000", + "1/5/2000", + "1/4/2000", + "1/6/2000", + "1/7/2000", + "1/8/2000", + "1/9/2000", + "1/10/2000", + ], + freq="D", + ) + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + (rng8, other8, expected8), + ]: result_union = rng.union(other, sort=sort) if sort is None: @@ -110,7 +163,7 @@ def test_union(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_union_misc(self, sort): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") result = index[:-5].union(index[10:], sort=sort) tm.assert_index_equal(result, index) @@ -122,29 +175,29 @@ def test_union_misc(self, sort): assert tm.equalContents(result, index) # raise if different frequencies - index = period_range('1/1/2000', '1/20/2000', freq='D') - index2 = 
period_range('1/1/2000', '1/20/2000', freq='W-WED') + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") with pytest.raises(period.IncompatibleFrequency): index.union(index2, sort=sort) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + index3 = period_range("1/1/2000", "1/20/2000", freq="2D") with pytest.raises(period.IncompatibleFrequency): index.join(index3) def test_union_dataframe_index(self): - rng1 = pd.period_range('1/1/1999', '1/1/2012', freq='M') + rng1 = pd.period_range("1/1/1999", "1/1/2012", freq="M") s1 = pd.Series(np.random.randn(len(rng1)), rng1) - rng2 = pd.period_range('1/1/1980', '12/1/2001', freq='M') + rng2 = pd.period_range("1/1/1980", "12/1/2001", freq="M") s2 = pd.Series(np.random.randn(len(rng2)), rng2) - df = pd.DataFrame({'s1': s1, 's2': s2}) + df = pd.DataFrame({"s1": s1, "s2": s2}) - exp = pd.period_range('1/1/1980', '1/1/2012', freq='M') + exp = pd.period_range("1/1/1980", "1/1/2012", freq="M") tm.assert_index_equal(df.index, exp) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): - index = period_range('1/1/2000', '1/20/2000', freq='D') + index = period_range("1/1/2000", "1/20/2000", freq="D") result = index[:-5].intersection(index[10:], sort=sort) tm.assert_index_equal(result, index[10:-5]) @@ -158,69 +211,78 @@ def test_intersection(self, sort): assert tm.equalContents(result, index[10:-5]) # raise if different frequencies - index = period_range('1/1/2000', '1/20/2000', freq='D') - index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + index = period_range("1/1/2000", "1/20/2000", freq="D") + index2 = period_range("1/1/2000", "1/20/2000", freq="W-WED") with pytest.raises(period.IncompatibleFrequency): index.intersection(index2, sort=sort) - index3 = period_range('1/1/2000', '1/20/2000', freq='2D') + index3 = period_range("1/1/2000", "1/20/2000", freq="2D") with pytest.raises(period.IncompatibleFrequency): index.intersection(index3, sort=sort) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_cases(self, sort): - base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx') + base = period_range("6/1/2000", "6/30/2000", freq="D", name="idx") # if target has the same name, it is preserved - rng2 = period_range('5/15/2000', '6/20/2000', freq='D', name='idx') - expected2 = period_range('6/1/2000', '6/20/2000', freq='D', - name='idx') + rng2 = period_range("5/15/2000", "6/20/2000", freq="D", name="idx") + expected2 = period_range("6/1/2000", "6/20/2000", freq="D", name="idx") # if target name is different, it will be reset - rng3 = period_range('5/15/2000', '6/20/2000', freq='D', name='other') - expected3 = period_range('6/1/2000', '6/20/2000', freq='D', - name=None) + rng3 = period_range("5/15/2000", "6/20/2000", freq="D", name="other") + expected3 = period_range("6/1/2000", "6/20/2000", freq="D", name=None) - rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = PeriodIndex([], name='idx', freq='D') + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], name="idx", freq="D") - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) tm.assert_index_equal(result, expected) assert result.name == expected.name assert result.freq == expected.freq # non-monotonic - base = 
PeriodIndex(['2011-01-05', '2011-01-04', '2011-01-02', - '2011-01-03'], freq='D', name='idx') - - rng2 = PeriodIndex(['2011-01-04', '2011-01-02', - '2011-02-02', '2011-02-03'], - freq='D', name='idx') - expected2 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', - name='idx') - - rng3 = PeriodIndex(['2011-01-04', '2011-01-02', '2011-02-02', - '2011-02-03'], - freq='D', name='other') - expected3 = PeriodIndex(['2011-01-04', '2011-01-02'], freq='D', - name=None) - - rng4 = period_range('7/1/2000', '7/31/2000', freq='D', name='idx') - expected4 = PeriodIndex([], freq='D', name='idx') - - for (rng, expected) in [(rng2, expected2), (rng3, expected3), - (rng4, expected4)]: + base = PeriodIndex( + ["2011-01-05", "2011-01-04", "2011-01-02", "2011-01-03"], + freq="D", + name="idx", + ) + + rng2 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="idx", + ) + expected2 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name="idx") + + rng3 = PeriodIndex( + ["2011-01-04", "2011-01-02", "2011-02-02", "2011-02-03"], + freq="D", + name="other", + ) + expected3 = PeriodIndex(["2011-01-04", "2011-01-02"], freq="D", name=None) + + rng4 = period_range("7/1/2000", "7/31/2000", freq="D", name="idx") + expected4 = PeriodIndex([], freq="D", name="idx") + + for (rng, expected) in [ + (rng2, expected2), + (rng3, expected3), + (rng4, expected4), + ]: result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert result.name == expected.name - assert result.freq == 'D' + assert result.freq == "D" # empty same freq - rng = date_range('6/1/2000', '6/15/2000', freq='T') + rng = date_range("6/1/2000", "6/15/2000", freq="T") result = rng[0:0].intersection(rng) assert len(result) == 0 @@ -230,54 +292,63 @@ def test_intersection_cases(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, sort): # diff - period_rng = ['1/3/2000', '1/2/2000', '1/1/2000', '1/5/2000', - '1/4/2000'] - rng1 = pd.PeriodIndex(period_rng, freq='D') - other1 = pd.period_range('1/6/2000', freq='D', periods=5) + period_rng = ["1/3/2000", "1/2/2000", "1/1/2000", "1/5/2000", "1/4/2000"] + rng1 = pd.PeriodIndex(period_rng, freq="D") + other1 = pd.period_range("1/6/2000", freq="D", periods=5) expected1 = rng1 - rng2 = pd.PeriodIndex(period_rng, freq='D') - other2 = pd.period_range('1/4/2000', freq='D', periods=5) - expected2 = pd.PeriodIndex(['1/3/2000', '1/2/2000', '1/1/2000'], - freq='D') + rng2 = pd.PeriodIndex(period_rng, freq="D") + other2 = pd.period_range("1/4/2000", freq="D", periods=5) + expected2 = pd.PeriodIndex(["1/3/2000", "1/2/2000", "1/1/2000"], freq="D") - rng3 = pd.PeriodIndex(period_rng, freq='D') - other3 = pd.PeriodIndex([], freq='D') + rng3 = pd.PeriodIndex(period_rng, freq="D") + other3 = pd.PeriodIndex([], freq="D") expected3 = rng3 - period_rng = ['2000-01-01 10:00', '2000-01-01 09:00', - '2000-01-01 12:00', '2000-01-01 11:00', - '2000-01-01 13:00'] - rng4 = pd.PeriodIndex(period_rng, freq='H') - other4 = pd.period_range('2000-01-02 09:00', freq='H', periods=5) + period_rng = [ + "2000-01-01 10:00", + "2000-01-01 09:00", + "2000-01-01 12:00", + "2000-01-01 11:00", + "2000-01-01 13:00", + ] + rng4 = pd.PeriodIndex(period_rng, freq="H") + other4 = pd.period_range("2000-01-02 09:00", freq="H", periods=5) expected4 = rng4 - rng5 = pd.PeriodIndex(['2000-01-01 09:03', '2000-01-01 09:01', - '2000-01-01 09:05'], freq='T') - other5 = pd.PeriodIndex( - ['2000-01-01 09:01', 
'2000-01-01 09:05'], freq='T') - expected5 = pd.PeriodIndex(['2000-01-01 09:03'], freq='T') - - period_rng = ['2000-02-01', '2000-01-01', '2000-06-01', - '2000-07-01', '2000-05-01', '2000-03-01', - '2000-04-01'] - rng6 = pd.PeriodIndex(period_rng, freq='M') - other6 = pd.period_range('2000-04-01', freq='M', periods=7) - expected6 = pd.PeriodIndex(['2000-02-01', '2000-01-01', '2000-03-01'], - freq='M') - - period_rng = ['2003', '2007', '2006', '2005', '2004'] - rng7 = pd.PeriodIndex(period_rng, freq='A') - other7 = pd.period_range('1998-01-01', freq='A', periods=8) - expected7 = pd.PeriodIndex(['2007', '2006'], freq='A') - - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3), - (rng4, other4, expected4), - (rng5, other5, expected5), - (rng6, other6, expected6), - (rng7, other7, expected7), ]: + rng5 = pd.PeriodIndex( + ["2000-01-01 09:03", "2000-01-01 09:01", "2000-01-01 09:05"], freq="T" + ) + other5 = pd.PeriodIndex(["2000-01-01 09:01", "2000-01-01 09:05"], freq="T") + expected5 = pd.PeriodIndex(["2000-01-01 09:03"], freq="T") + + period_rng = [ + "2000-02-01", + "2000-01-01", + "2000-06-01", + "2000-07-01", + "2000-05-01", + "2000-03-01", + "2000-04-01", + ] + rng6 = pd.PeriodIndex(period_rng, freq="M") + other6 = pd.period_range("2000-04-01", freq="M", periods=7) + expected6 = pd.PeriodIndex(["2000-02-01", "2000-01-01", "2000-03-01"], freq="M") + + period_rng = ["2003", "2007", "2006", "2005", "2004"] + rng7 = pd.PeriodIndex(period_rng, freq="A") + other7 = pd.period_range("1998-01-01", freq="A", periods=8) + expected7 = pd.PeriodIndex(["2007", "2006"], freq="A") + + for rng, other, expected in [ + (rng1, other1, expected1), + (rng2, other2, expected2), + (rng3, other3, expected3), + (rng4, other4, expected4), + (rng5, other5, expected5), + (rng6, other6, expected6), + (rng7, other7, expected7), + ]: result_difference = rng.difference(other, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 7d69723b4a118..e52954a114578 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -7,8 +7,16 @@ import pandas as pd from pandas import ( - DatetimeIndex, Period, PeriodIndex, Series, Timedelta, Timestamp, - date_range, period_range, to_datetime) + DatetimeIndex, + Period, + PeriodIndex, + Series, + Timedelta, + Timestamp, + date_range, + period_range, + to_datetime, +) import pandas.core.indexes.period as period import pandas.util.testing as tm @@ -25,20 +33,19 @@ def _check_freq(self, freq, base_date): tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): - self._check_freq('A', 1970) + self._check_freq("A", 1970) def test_monthly(self): - self._check_freq('M', '1970-01') + self._check_freq("M", "1970-01") - @pytest.mark.parametrize('freq', ['W-THU', 'D', 'B', 'H', 'T', - 'S', 'L', 'U', 'N']) + @pytest.mark.parametrize("freq", ["W-THU", "D", "B", "H", "T", "S", "L", "U", "N"]) def test_freq(self, freq): - self._check_freq(freq, '1970-01-01') + self._check_freq(freq, "1970-01-01") def test_negone_ordinals(self): - freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] + freqs = ["A", "M", "Q", "D", "H", "T", "S"] - period = Period(ordinal=-1, freq='D') + period = Period(ordinal=-1, freq="D") for freq in freqs: repr(period.asfreq(freq)) @@ -47,188 +54,195 @@ def test_negone_ordinals(self): repr(period) assert period.year == 1969 - period = Period(ordinal=-1, freq='B') + 
period = Period(ordinal=-1, freq="B") repr(period) - period = Period(ordinal=-1, freq='W') + period = Period(ordinal=-1, freq="W") repr(period) class TestPeriodIndex: def test_to_timestamp(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') - series = Series(1, index=index, name='foo') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") + series = Series(1, index=index, name="foo") - exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') - result = series.to_timestamp(how='end') - exp_index = exp_index + Timedelta(1, 'D') - Timedelta(1, 'ns') + exp_index = date_range("1/1/2001", end="12/31/2009", freq="A-DEC") + result = series.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - assert result.name == 'foo' + assert result.name == "foo" - exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') - result = series.to_timestamp(how='start') + exp_index = date_range("1/1/2001", end="1/1/2009", freq="AS-JAN") + result = series.to_timestamp(how="start") tm.assert_index_equal(result.index, exp_index) - def _get_with_delta(delta, freq='A-DEC'): - return date_range(to_datetime('1/1/2001') + delta, - to_datetime('12/31/2009') + delta, freq=freq) + def _get_with_delta(delta, freq="A-DEC"): + return date_range( + to_datetime("1/1/2001") + delta, + to_datetime("12/31/2009") + delta, + freq=freq, + ) delta = timedelta(hours=23) - result = series.to_timestamp('H', 'end') + result = series.to_timestamp("H", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'h') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) - result = series.to_timestamp('T', 'end') + result = series.to_timestamp("T", "end") exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 'm') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "m") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - result = series.to_timestamp('S', 'end') + result = series.to_timestamp("S", "end") delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - index = period_range(freq='H', start='1/1/2001', end='1/2/2001') - series = Series(1, index=index, name='foo') + index = period_range(freq="H", start="1/1/2001", end="1/2/2001") + series = Series(1, index=index, name="foo") - exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', - freq='H') - result = series.to_timestamp(how='end') - exp_index = exp_index + Timedelta(1, 's') - Timedelta(1, 'ns') + exp_index = date_range("1/1/2001 00:59:59", end="1/2/2001 00:59:59", freq="H") + result = series.to_timestamp(how="end") + exp_index = exp_index + Timedelta(1, "s") - Timedelta(1, "ns") tm.assert_index_equal(result.index, exp_index) - assert result.name == 'foo' + assert result.name == "foo" def test_to_timestamp_freq(self): - idx = pd.period_range('2017', periods=12, freq="A-DEC") + idx = pd.period_range("2017", periods=12, freq="A-DEC") result = idx.to_timestamp() expected = pd.date_range("2017", periods=12, freq="AS-JAN") tm.assert_index_equal(result, expected) def test_to_timestamp_repr_is_code(self): - zs = [Timestamp('99-04-17 00:00:00', tz='UTC'), - 
Timestamp('2001-04-17 00:00:00', tz='UTC'), - Timestamp('2001-04-17 00:00:00', tz='America/Los_Angeles'), - Timestamp('2001-04-17 00:00:00', tz=None)] + zs = [ + Timestamp("99-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="UTC"), + Timestamp("2001-04-17 00:00:00", tz="America/Los_Angeles"), + Timestamp("2001-04-17 00:00:00", tz=None), + ] for z in zs: assert eval(repr(z)) == z def test_to_timestamp_to_period_astype(self): - idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') + idx = DatetimeIndex([pd.NaT, "2011-01-01", "2011-02-01"], name="idx") - res = idx.astype('period[M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + res = idx.astype("period[M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") tm.assert_index_equal(res, exp) - res = idx.astype('period[3M]') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + res = idx.astype("period[3M]") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") tm.assert_index_equal(res, exp) def test_dti_to_period(self): - dti = pd.date_range(start='1/1/2005', end='12/1/2005', freq='M') + dti = pd.date_range(start="1/1/2005", end="12/1/2005", freq="M") pi1 = dti.to_period() - pi2 = dti.to_period(freq='D') - pi3 = dti.to_period(freq='3D') - - assert pi1[0] == Period('Jan 2005', freq='M') - assert pi2[0] == Period('1/31/2005', freq='D') - assert pi3[0] == Period('1/31/2005', freq='3D') - - assert pi1[-1] == Period('Nov 2005', freq='M') - assert pi2[-1] == Period('11/30/2005', freq='D') - assert pi3[-1], Period('11/30/2005', freq='3D') - - tm.assert_index_equal(pi1, period_range('1/1/2005', '11/1/2005', - freq='M')) - tm.assert_index_equal(pi2, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('D')) - tm.assert_index_equal(pi3, period_range('1/1/2005', '11/1/2005', - freq='M').asfreq('3D')) - - @pytest.mark.parametrize('month', MONTHS) + pi2 = dti.to_period(freq="D") + pi3 = dti.to_period(freq="3D") + + assert pi1[0] == Period("Jan 2005", freq="M") + assert pi2[0] == Period("1/31/2005", freq="D") + assert pi3[0] == Period("1/31/2005", freq="3D") + + assert pi1[-1] == Period("Nov 2005", freq="M") + assert pi2[-1] == Period("11/30/2005", freq="D") + assert pi3[-1], Period("11/30/2005", freq="3D") + + tm.assert_index_equal(pi1, period_range("1/1/2005", "11/1/2005", freq="M")) + tm.assert_index_equal( + pi2, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("D") + ) + tm.assert_index_equal( + pi3, period_range("1/1/2005", "11/1/2005", freq="M").asfreq("3D") + ) + + @pytest.mark.parametrize("month", MONTHS) def test_to_period_quarterly(self, month): # make sure we can make the round trip - freq = 'Q-%s' % month - rng = period_range('1989Q3', '1991Q3', freq=freq) + freq = "Q-%s" % month + rng = period_range("1989Q3", "1991Q3", freq=freq) stamps = rng.to_timestamp() result = stamps.to_period(freq) tm.assert_index_equal(rng, result) - @pytest.mark.parametrize('off', ['BQ', 'QS', 'BQS']) + @pytest.mark.parametrize("off", ["BQ", "QS", "BQS"]) def test_to_period_quarterlyish(self, off): - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == 'Q-DEC' + assert prng.freq == "Q-DEC" - @pytest.mark.parametrize('off', ['BA', 'AS', 'BAS']) + @pytest.mark.parametrize("off", ["BA", "AS", "BAS"]) def test_to_period_annualish(self, off): - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, 
freq=off) prng = rng.to_period() - assert prng.freq == 'A-DEC' + assert prng.freq == "A-DEC" def test_to_period_monthish(self): - offsets = ['MS', 'BM'] + offsets = ["MS", "BM"] for off in offsets: - rng = date_range('01-Jan-2012', periods=8, freq=off) + rng = date_range("01-Jan-2012", periods=8, freq=off) prng = rng.to_period() - assert prng.freq == 'M' + assert prng.freq == "M" - rng = date_range('01-Jan-2012', periods=8, freq='M') + rng = date_range("01-Jan-2012", periods=8, freq="M") prng = rng.to_period() - assert prng.freq == 'M' + assert prng.freq == "M" msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - date_range('01-Jan-2012', periods=8, freq='EOM') + date_range("01-Jan-2012", periods=8, freq="EOM") def test_period_dt64_round_trip(self): - dti = date_range('1/1/2000', '1/7/2002', freq='B') + dti = date_range("1/1/2000", "1/7/2002", freq="B") pi = dti.to_period() tm.assert_index_equal(pi.to_timestamp(), dti) - dti = date_range('1/1/2000', '1/7/2002', freq='B') - pi = dti.to_period(freq='H') + dti = date_range("1/1/2000", "1/7/2002", freq="B") + pi = dti.to_period(freq="H") tm.assert_index_equal(pi.to_timestamp(), dti) def test_combine_first(self): # GH#3367 - didx = pd.date_range(start='1950-01-31', end='1950-07-31', freq='M') - pidx = pd.period_range(start=pd.Period('1950-1'), - end=pd.Period('1950-7'), freq='M') + didx = pd.date_range(start="1950-01-31", end="1950-07-31", freq="M") + pidx = pd.period_range( + start=pd.Period("1950-1"), end=pd.Period("1950-7"), freq="M" + ) # check to be consistent with DatetimeIndex for idx in [didx, pidx]: a = pd.Series([1, np.nan, np.nan, 4, 5, np.nan, 7], index=idx) b = pd.Series([9, 9, 9, 9, 9, 9, 9], index=idx) result = a.combine_first(b) - expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, - dtype=np.float64) + expected = pd.Series([1, 9, 9, 4, 5, 9, 7], index=idx, dtype=np.float64) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('freq', ['D', '2D']) + @pytest.mark.parametrize("freq", ["D", "2D"]) def test_searchsorted(self, freq): - pidx = pd.PeriodIndex(['2014-01-01', '2014-01-02', '2014-01-03', - '2014-01-04', '2014-01-05'], freq=freq) + pidx = pd.PeriodIndex( + ["2014-01-01", "2014-01-02", "2014-01-03", "2014-01-04", "2014-01-05"], + freq=freq, + ) - p1 = pd.Period('2014-01-01', freq=freq) + p1 = pd.Period("2014-01-01", freq=freq) assert pidx.searchsorted(p1) == 0 - p2 = pd.Period('2014-01-04', freq=freq) + p2 = pd.Period("2014-01-04", freq=freq) assert pidx.searchsorted(p2) == 3 msg = "Input has different freq=H from PeriodIndex" with pytest.raises(period.IncompatibleFrequency, match=msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='H')) + pidx.searchsorted(pd.Period("2014-01-01", freq="H")) msg = "Input has different freq=5D from PeriodIndex" with pytest.raises(period.IncompatibleFrequency, match=msg): - pidx.searchsorted(pd.Period('2014-01-01', freq='5D')) + pidx.searchsorted(pd.Period("2014-01-01", freq="5D")) class TestPeriodIndexConversion: def test_tolist(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009') + index = period_range(freq="A", start="1/1/2001", end="12/1/2009") rs = index.tolist() for x in rs: assert isinstance(x, Period) @@ -238,37 +252,34 @@ def test_tolist(self): def test_to_timestamp_pi_nat(self): # GH#7228 - index = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', - name='idx') + index = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="M", name="idx") - result = index.to_timestamp('D') - expected = 
DatetimeIndex([pd.NaT, datetime(2011, 1, 1), - datetime(2011, 2, 1)], name='idx') + result = index.to_timestamp("D") + expected = DatetimeIndex( + [pd.NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name="idx" + ) tm.assert_index_equal(result, expected) - assert result.name == 'idx' + assert result.name == "idx" - result2 = result.to_period(freq='M') + result2 = result.to_period(freq="M") tm.assert_index_equal(result2, index) - assert result2.name == 'idx' + assert result2.name == "idx" - result3 = result.to_period(freq='3M') - exp = PeriodIndex(['NaT', '2011-01', '2011-02'], - freq='3M', name='idx') + result3 = result.to_period(freq="3M") + exp = PeriodIndex(["NaT", "2011-01", "2011-02"], freq="3M", name="idx") tm.assert_index_equal(result3, exp) - assert result3.freqstr == '3M' + assert result3.freqstr == "3M" - msg = ('Frequency must be positive, because it' - ' represents span: -2A') + msg = "Frequency must be positive, because it" " represents span: -2A" with pytest.raises(ValueError, match=msg): - result.to_period(freq='-2A') + result.to_period(freq="-2A") def test_to_timestamp_preserve_name(self): - index = period_range(freq='A', start='1/1/2001', end='12/1/2009', - name='foo') - assert index.name == 'foo' + index = period_range(freq="A", start="1/1/2001", end="12/1/2009", name="foo") + assert index.name == "foo" - conv = index.to_timestamp('D') - assert conv.name == 'foo' + conv = index.to_timestamp("D") + assert conv.name == "foo" def test_to_timestamp_quarterly_bug(self): years = np.arange(1960, 2000).repeat(4) @@ -276,69 +287,66 @@ def test_to_timestamp_quarterly_bug(self): pindex = PeriodIndex(year=years, quarter=quarters) - stamps = pindex.to_timestamp('D', 'end') - expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + stamps = pindex.to_timestamp("D", "end") + expected = DatetimeIndex([x.to_timestamp("D", "end") for x in pindex]) tm.assert_index_equal(stamps, expected) def test_to_timestamp_pi_mult(self): - idx = PeriodIndex(['2011-01', 'NaT', '2011-02'], - freq='2M', name='idx') + idx = PeriodIndex(["2011-01", "NaT", "2011-02"], freq="2M", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(['2011-01-01', 'NaT', '2011-02-01'], - name='idx') + expected = DatetimeIndex(["2011-01-01", "NaT", "2011-02-01"], name="idx") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex(['2011-02-28', 'NaT', '2011-03-31'], - name='idx') - expected = expected + Timedelta(1, 'D') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E") + expected = DatetimeIndex(["2011-02-28", "NaT", "2011-03-31"], name="idx") + expected = expected + Timedelta(1, "D") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) def test_to_timestamp_pi_combined(self): - idx = period_range(start='2011', periods=2, freq='1D1H', name='idx') + idx = period_range(start="2011", periods=2, freq="1D1H", name="idx") result = idx.to_timestamp() - expected = DatetimeIndex(['2011-01-01 00:00', '2011-01-02 01:00'], - name='idx') + expected = DatetimeIndex(["2011-01-01 00:00", "2011-01-02 01:00"], name="idx") tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E') - expected = DatetimeIndex(['2011-01-02 00:59:59', - '2011-01-03 01:59:59'], - name='idx') - expected = expected + Timedelta(1, 's') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E") + expected = DatetimeIndex( + ["2011-01-02 00:59:59", "2011-01-03 01:59:59"], name="idx" + ) + expected = expected + Timedelta(1, "s") - Timedelta(1, "ns") 
tm.assert_index_equal(result, expected) - result = idx.to_timestamp(how='E', freq='H') - expected = DatetimeIndex(['2011-01-02 00:00', '2011-01-03 01:00'], - name='idx') - expected = expected + Timedelta(1, 'h') - Timedelta(1, 'ns') + result = idx.to_timestamp(how="E", freq="H") + expected = DatetimeIndex(["2011-01-02 00:00", "2011-01-03 01:00"], name="idx") + expected = expected + Timedelta(1, "h") - Timedelta(1, "ns") tm.assert_index_equal(result, expected) def test_period_astype_to_timestamp(self): - pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) - tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) - exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns') - tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) - - exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - tz='US/Eastern') - res = pi.astype('datetime64[ns, US/Eastern]') - tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) - - exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], - tz='US/Eastern') - exp = exp + Timedelta(1, 'D') - Timedelta(1, 'ns') - res = pi.astype('datetime64[ns, US/Eastern]', how='end') + pi = pd.PeriodIndex(["2011-01", "2011-02", "2011-03"], freq="M") + + exp = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) + tm.assert_index_equal(pi.astype("datetime64[ns]"), exp) + + exp = pd.DatetimeIndex(["2011-01-31", "2011-02-28", "2011-03-31"]) + exp = exp + Timedelta(1, "D") - Timedelta(1, "ns") + tm.assert_index_equal(pi.astype("datetime64[ns]", how="end"), exp) + + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-03-01"], tz="US/Eastern" + ) + res = pi.astype("datetime64[ns, US/Eastern]") + tm.assert_index_equal(pi.astype("datetime64[ns, US/Eastern]"), exp) + + exp = pd.DatetimeIndex( + ["2011-01-31", "2011-02-28", "2011-03-31"], tz="US/Eastern" + ) + exp = exp + Timedelta(1, "D") - Timedelta(1, "ns") + res = pi.astype("datetime64[ns, US/Eastern]", how="end") tm.assert_index_equal(res, exp) def test_to_timestamp_1703(self): - index = period_range('1/1/2012', periods=4, freq='D') + index = period_range("1/1/2012", periods=4, freq="D") result = index.to_timestamp() - assert result[0] == Timestamp('1/1/2012') + assert result[0] == Timestamp("1/1/2012") diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a3563838e048d..f0382a040e063 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -19,11 +19,25 @@ import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, Int64Index, - PeriodIndex, RangeIndex, Series, TimedeltaIndex, UInt64Index, date_range, - isna, period_range) + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + date_range, + isna, + period_range, +) from pandas.core.index import ( - _get_combined_index, ensure_index, ensure_index_from_sequences) + _get_combined_index, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.api import Index, MultiIndex from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base @@ -35,25 +49,26 @@ class TestIndex(Base): _holder = Index def setup_method(self, method): - self.indices = dict(unicodeIndex=tm.makeUnicodeIndex(100), - strIndex=tm.makeStringIndex(100), - 
dateIndex=tm.makeDateIndex(100), - periodIndex=tm.makePeriodIndex(100), - tdIndex=tm.makeTimedeltaIndex(100), - intIndex=tm.makeIntIndex(100), - uintIndex=tm.makeUIntIndex(100), - rangeIndex=tm.makeRangeIndex(100), - floatIndex=tm.makeFloatIndex(100), - boolIndex=Index([True, False]), - catIndex=tm.makeCategoricalIndex(100), - empty=Index([]), - tuples=MultiIndex.from_tuples(zip( - ['foo', 'bar', 'baz'], [1, 2, 3])), - repeats=Index([0, 0, 1, 1, 2, 2])) + self.indices = dict( + unicodeIndex=tm.makeUnicodeIndex(100), + strIndex=tm.makeStringIndex(100), + dateIndex=tm.makeDateIndex(100), + periodIndex=tm.makePeriodIndex(100), + tdIndex=tm.makeTimedeltaIndex(100), + intIndex=tm.makeIntIndex(100), + uintIndex=tm.makeUIntIndex(100), + rangeIndex=tm.makeRangeIndex(100), + floatIndex=tm.makeFloatIndex(100), + boolIndex=Index([True, False]), + catIndex=tm.makeCategoricalIndex(100), + empty=Index([]), + tuples=MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), + repeats=Index([0, 0, 1, 1, 2, 2]), + ) self.setup_indices() def create_index(self): - return Index(list('abcde')) + return Index(list("abcde")) def generate_index_types(self, skip_index_keys=[]): """ @@ -76,9 +91,9 @@ def test_new_axis(self): def test_copy_and_deepcopy(self): new_copy2 = self.intIndex.copy(dtype=int) - assert new_copy2.dtype.kind == 'i' + assert new_copy2.dtype.kind == "i" - @pytest.mark.parametrize("attr", ['strIndex', 'dateIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "dateIndex"]) def test_constructor_regular(self, attr): # regular instance creation index = getattr(self, attr) @@ -94,9 +109,9 @@ def test_constructor_casting(self): def test_constructor_copy(self): # copy arr = np.array(self.strIndex) - index = Index(arr, copy=True, name='name') + index = Index(arr, copy=True, name="name") assert isinstance(index, Index) - assert index.name == 'name' + assert index.name == "name" tm.assert_numpy_array_equal(arr, index.values) arr[0] = "SOMEBIGLONGSTRING" assert index[0] != "SOMEBIGLONGSTRING" @@ -107,13 +122,14 @@ def test_constructor_copy(self): def test_constructor_corner(self): # corner case - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - " kind, 0 was passed") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + " kind, 0 was passed" + ) with pytest.raises(TypeError, match=msg): Index(0) - @pytest.mark.parametrize("index_vals", [ - [('A', 1), 'B'], ['B', ('A', 1)]]) + @pytest.mark.parametrize("index_vals", [[("A", 1), "B"], ["B", ("A", 1)]]) def test_construction_list_mixed_tuples(self, index_vals): # see gh-10697: if we are constructing from a mixed list of tuples, # make sure that we are independent of the sorting order. 
@@ -121,23 +137,31 @@ def test_construction_list_mixed_tuples(self, index_vals): assert isinstance(index, Index) assert not isinstance(index, MultiIndex) - @pytest.mark.parametrize('na_value', [None, np.nan]) - @pytest.mark.parametrize('vtype', [list, tuple, iter]) + @pytest.mark.parametrize("na_value", [None, np.nan]) + @pytest.mark.parametrize("vtype", [list, tuple, iter]) def test_construction_list_tuples_nan(self, na_value, vtype): # GH 18505 : valid tuples containing NaN - values = [(1, 'two'), (3., na_value)] + values = [(1, "two"), (3.0, na_value)] result = Index(vtype(values)) expected = MultiIndex.from_tuples(values) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("cast_as_obj", [True, False]) - @pytest.mark.parametrize("index", [ - pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern', name='Green Eggs & Ham'), # DTI with tz - pd.date_range('2015-01-01 10:00', freq='D', periods=3), # DTI no tz - pd.timedelta_range('1 days', freq='D', periods=3), # td - pd.period_range('2015-01-01', freq='D', periods=3) # period - ]) + @pytest.mark.parametrize( + "index", + [ + pd.date_range( + "2015-01-01 10:00", + freq="D", + periods=3, + tz="US/Eastern", + name="Green Eggs & Ham", + ), # DTI with tz + pd.date_range("2015-01-01 10:00", freq="D", periods=3), # DTI no tz + pd.timedelta_range("1 days", freq="D", periods=3), # td + pd.period_range("2015-01-01", freq="D", periods=3), # period + ], + ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: result = pd.Index(index.astype(object)) @@ -157,12 +181,17 @@ def test_constructor_from_index_dtlike(self, cast_as_obj, index): assert result.dtype == np.object_ assert list(result) == list(index) - @pytest.mark.parametrize("index,has_tz", [ - (pd.date_range('2015-01-01 10:00', freq='D', periods=3, - tz='US/Eastern'), True), # datetimetz - (pd.timedelta_range('1 days', freq='D', periods=3), False), # td - (pd.period_range('2015-01-01', freq='D', periods=3), False) # period - ]) + @pytest.mark.parametrize( + "index,has_tz", + [ + ( + pd.date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern"), + True, + ), # datetimetz + (pd.timedelta_range("1 days", freq="D", periods=3), False), # td + (pd.period_range("2015-01-01", freq="D", periods=3), False), # period + ], + ) def test_constructor_from_series_dtlike(self, index, has_tz): result = pd.Index(pd.Series(index)) tm.assert_index_equal(result, index) @@ -172,50 +201,56 @@ def test_constructor_from_series_dtlike(self, index, has_tz): @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) def test_constructor_from_series(self, klass): - expected = DatetimeIndex([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')]) - s = Series([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')]) + expected = DatetimeIndex( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + ) + s = Series( + [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")] + ) result = klass(s) tm.assert_index_equal(result, expected) def test_constructor_from_series_freq(self): # GH 6273 # create from a series, passing a freq - dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] - expected = DatetimeIndex(dts, freq='MS') + dts = ["1-1-1990", "2-1-1990", "3-1-1990", "4-1-1990", "5-1-1990"] + expected = DatetimeIndex(dts, freq="MS") s = Series(pd.to_datetime(dts)) - result = DatetimeIndex(s, freq='MS') + result = DatetimeIndex(s, freq="MS") tm.assert_index_equal(result, expected) def 
test_constructor_from_frame_series_freq(self): # GH 6273 # create from a series, passing a freq - dts = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] - expected = DatetimeIndex(dts, freq='MS') + dts = ["1-1-1990", "2-1-1990", "3-1-1990", "4-1-1990", "5-1-1990"] + expected = DatetimeIndex(dts, freq="MS") df = pd.DataFrame(np.random.rand(5, 3)) - df['date'] = dts - result = DatetimeIndex(df['date'], freq='MS') + df["date"] = dts + result = DatetimeIndex(df["date"], freq="MS") - assert df['date'].dtype == object - expected.name = 'date' + assert df["date"].dtype == object + expected.name = "date" tm.assert_index_equal(result, expected) - expected = pd.Series(dts, name='date') - tm.assert_series_equal(df['date'], expected) + expected = pd.Series(dts, name="date") + tm.assert_series_equal(df["date"], expected) # GH 6274 # infer freq of same - freq = pd.infer_freq(df['date']) - assert freq == 'MS' + freq = pd.infer_freq(df["date"]) + assert freq == "MS" - @pytest.mark.parametrize("array", [ - np.arange(5), np.array(['a', 'b', 'c']), date_range( - '2000-01-01', periods=3).values - ]) + @pytest.mark.parametrize( + "array", + [ + np.arange(5), + np.array(["a", "b", "c"]), + date_range("2000-01-01", periods=3).values, + ], + ) def test_constructor_ndarray_like(self, array): # GH 5460#issuecomment-44474502 # it should be possible to convert any object that satisfies the numpy @@ -231,9 +266,10 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('dtype', [ - int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32', - 'uint16', 'uint8']) + @pytest.mark.parametrize( + "dtype", + [int, "int64", "int32", "int16", "int8", "uint64", "uint32", "uint16", "uint8"], + ) def test_constructor_int_dtype_float(self, dtype): # GH 18400 if is_unsigned_integer_dtype(dtype): @@ -242,17 +278,17 @@ def test_constructor_int_dtype_float(self, dtype): index_type = Int64Index expected = index_type([0, 1, 2, 3]) - result = Index([0., 1., 2., 3.], dtype=dtype) + result = Index([0.0, 1.0, 2.0, 3.0], dtype=dtype) tm.assert_index_equal(result, expected) def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] expected = Float64Index(data) - result = Index(data, dtype='float') + result = Index(data, dtype="float") tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("dtype", ['int64', 'uint64']) + @pytest.mark.parametrize("dtype", ["int64", "uint64"]) def test_constructor_int_dtype_nan_raises(self, dtype): # see gh-15187 data = [np.nan] @@ -266,10 +302,13 @@ def test_constructor_no_pandas_array(self): expected = pd.Index([1, 2, 3]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass,dtype,na_val", [ - (pd.Float64Index, np.float64, np.nan), - (pd.DatetimeIndex, 'datetime64[ns]', pd.NaT) - ]) + @pytest.mark.parametrize( + "klass,dtype,na_val", + [ + (pd.Float64Index, np.float64, np.nan), + (pd.DatetimeIndex, "datetime64[ns]", pd.NaT), + ], + ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): # GH 13467 na_list = [na_val, na_val] @@ -283,12 +322,14 @@ def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("pos", [0, 1]) - @pytest.mark.parametrize("klass,dtype,ctor", [ - (pd.DatetimeIndex, 'datetime64[ns]', np.datetime64('nat')), - (pd.TimedeltaIndex, 'timedelta64[ns]', np.timedelta64('nat')) - ]) - def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, - nulls_fixture): + 
@pytest.mark.parametrize( + "klass,dtype,ctor", + [ + (pd.DatetimeIndex, "datetime64[ns]", np.datetime64("nat")), + (pd.TimedeltaIndex, "timedelta64[ns]", np.timedelta64("nat")), + ], + ) + def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, nulls_fixture): expected = klass([pd.NaT, pd.NaT]) assert expected.dtype == dtype data = [ctor] @@ -303,7 +344,7 @@ def test_index_ctor_infer_nat_dt_like(self, pos, klass, dtype, ctor, @pytest.mark.parametrize("swap_objs", [True, False]) def test_index_ctor_nat_result(self, swap_objs): # mixed np.datetime64/timedelta64 nat results in object - data = [np.datetime64('nat'), np.timedelta64('nat')] + data = [np.datetime64("nat"), np.timedelta64("nat")] if swap_objs: data = data[::-1] @@ -312,41 +353,57 @@ def test_index_ctor_nat_result(self, swap_objs): tm.assert_index_equal(Index(np.array(data, dtype=object)), expected) def test_index_ctor_infer_periodindex(self): - xp = period_range('2012-1-1', freq='M', periods=3) + xp = period_range("2012-1-1", freq="M", periods=3) rs = Index(xp) tm.assert_index_equal(rs, xp) assert isinstance(rs, PeriodIndex) - @pytest.mark.parametrize("vals,dtype", [ - ([1, 2, 3, 4, 5], 'int'), ([1.1, np.nan, 2.2, 3.0], 'float'), - (['A', 'B', 'C', np.nan], 'obj') - ]) + @pytest.mark.parametrize( + "vals,dtype", + [ + ([1, 2, 3, 4, 5], "int"), + ([1.1, np.nan, 2.2, 3.0], "float"), + (["A", "B", "C", np.nan], "obj"), + ], + ) def test_constructor_simple_new(self, vals, dtype): index = Index(vals, name=dtype) result = index._simple_new(index.values, dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("vals", [ - [1, 2, 3], np.array([1, 2, 3]), np.array([1, 2, 3], dtype=int), - # below should coerce - [1., 2., 3.], np.array([1., 2., 3.], dtype=float) - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3]), + np.array([1, 2, 3], dtype=int), + # below should coerce + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) def test_constructor_dtypes_to_int64(self, vals): index = Index(vals, dtype=int) assert isinstance(index, Int64Index) - @pytest.mark.parametrize("vals", [ - [1, 2, 3], [1., 2., 3.], np.array([1., 2., 3.]), - np.array([1, 2, 3], dtype=int), np.array([1., 2., 3.], dtype=float) - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + np.array([1, 2, 3], dtype=int), + np.array([1.0, 2.0, 3.0], dtype=float), + ], + ) def test_constructor_dtypes_to_float64(self, vals): index = Index(vals, dtype=float) assert isinstance(index, Float64Index) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - [True, False, True], np.array([True, False, True], dtype=bool) - ]) + @pytest.mark.parametrize( + "vals", [[True, False, True], np.array([True, False, True], dtype=bool)] + ) def test_constructor_dtypes_to_object(self, cast_index, vals): if cast_index: index = Index(vals, dtype=bool) @@ -356,23 +413,36 @@ def test_constructor_dtypes_to_object(self, cast_index, vals): assert isinstance(index, Index) assert index.dtype == object - @pytest.mark.parametrize("vals", [ - [1, 2, 3], np.array([1, 2, 3], dtype=int), - np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), - [datetime(2011, 1, 1), datetime(2011, 1, 2)] - ]) + @pytest.mark.parametrize( + "vals", + [ + [1, 2, 3], + np.array([1, 2, 3], dtype=int), + np.array( + [np_datetime64_compat("2011-01-01"), np_datetime64_compat("2011-01-02")] + ), + [datetime(2011, 1, 1), datetime(2011, 1, 2)], + ], + 
) def test_constructor_dtypes_to_categorical(self, vals): - index = Index(vals, dtype='category') + index = Index(vals, dtype="category") assert isinstance(index, CategoricalIndex) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')])), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]) - - ]) + @pytest.mark.parametrize( + "vals", + [ + Index( + np.array( + [ + np_datetime64_compat("2011-01-01"), + np_datetime64_compat("2011-01-02"), + ] + ) + ), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)]), + ], + ) def test_constructor_dtypes_to_datetime(self, cast_index, vals): if cast_index: index = Index(vals, dtype=object) @@ -383,10 +453,13 @@ def test_constructor_dtypes_to_datetime(self, cast_index, vals): assert isinstance(index, DatetimeIndex) @pytest.mark.parametrize("cast_index", [True, False]) - @pytest.mark.parametrize("vals", [ - np.array([np.timedelta64(1, 'D'), np.timedelta64(1, 'D')]), - [timedelta(1), timedelta(1)] - ]) + @pytest.mark.parametrize( + "vals", + [ + np.array([np.timedelta64(1, "D"), np.timedelta64(1, "D")]), + [timedelta(1), timedelta(1)], + ], + ) def test_constructor_dtypes_to_timedelta(self, cast_index, vals): if cast_index: index = Index(vals, dtype=object) @@ -396,24 +469,24 @@ def test_constructor_dtypes_to_timedelta(self, cast_index, vals): index = Index(vals) assert isinstance(index, TimedeltaIndex) - @pytest.mark.parametrize("attr, utc", [ - ['values', False], - ['asi8', True]]) + @pytest.mark.parametrize("attr, utc", [["values", False], ["asi8", True]]) @pytest.mark.parametrize("klass", [pd.Index, pd.DatetimeIndex]) - def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, - klass): + def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, klass): # Test constructing with a datetimetz dtype # .values produces numpy datetimes, so these are considered naive # .asi8 produces integers, so these are considered epoch timestamps # ^the above will be true in a later version. Right now we `.view` # the i8 values as NS_DTYPE, effectively treating them as wall times. 
- index = pd.date_range('2011-01-01', periods=5) + index = pd.date_range("2011-01-01", periods=5) arg = getattr(index, attr) index = index.tz_localize(tz_naive_fixture) dtype = index.dtype - if (tz_naive_fixture and attr == "asi8" and - str(tz_naive_fixture) not in ('UTC', 'tzutc()', 'UTC+00:00')): + if ( + tz_naive_fixture + and attr == "asi8" + and str(tz_naive_fixture) not in ("UTC", "tzutc()", "UTC+00:00") + ): ex_warn = FutureWarning else: ex_warn = None @@ -436,10 +509,10 @@ def test_constructor_dtypes_datetime(self, tz_naive_fixture, attr, utc, result = klass(list(arg), dtype=dtype) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("attr", ['values', 'asi8']) + @pytest.mark.parametrize("attr", ["values", "asi8"]) @pytest.mark.parametrize("klass", [pd.Index, pd.TimedeltaIndex]) def test_constructor_dtypes_timedelta(self, attr, klass): - index = pd.timedelta_range('1 days', periods=5) + index = pd.timedelta_range("1 days", periods=5) dtype = index.dtype values = getattr(index, attr) @@ -451,119 +524,136 @@ def test_constructor_dtypes_timedelta(self, attr, klass): tm.assert_index_equal(result, index) @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) - @pytest.mark.parametrize("klass", - [Index, Float64Index, Int64Index, UInt64Index, - CategoricalIndex, DatetimeIndex, TimedeltaIndex]) + @pytest.mark.parametrize( + "klass", + [ + Index, + Float64Index, + Int64Index, + UInt64Index, + CategoricalIndex, + DatetimeIndex, + TimedeltaIndex, + ], + ) def test_constructor_empty(self, value, klass): empty = klass(value) assert isinstance(empty, klass) assert not len(empty) - @pytest.mark.parametrize("empty,klass", [ - (PeriodIndex([], freq='B'), PeriodIndex), - (PeriodIndex(iter([]), freq='B'), PeriodIndex), - (PeriodIndex((x for x in []), freq='B'), PeriodIndex), - (RangeIndex(step=1), pd.RangeIndex), - (MultiIndex(levels=[[1, 2], ['blue', 'red']], - codes=[[], []]), MultiIndex) - ]) + @pytest.mark.parametrize( + "empty,klass", + [ + (PeriodIndex([], freq="B"), PeriodIndex), + (PeriodIndex(iter([]), freq="B"), PeriodIndex), + (PeriodIndex((x for x in []), freq="B"), PeriodIndex), + (RangeIndex(step=1), pd.RangeIndex), + (MultiIndex(levels=[[1, 2], ["blue", "red"]], codes=[[], []]), MultiIndex), + ], + ) def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) def test_constructor_overflow_int64(self): # see gh-15832 - msg = ("The elements provided in the data cannot " - "all be casted to the dtype int64") + msg = ( + "The elements provided in the data cannot " + "all be casted to the dtype int64" + ) with pytest.raises(OverflowError, match=msg): Index([np.iinfo(np.uint64).max - 1], dtype="int64") - @pytest.mark.xfail(reason="see GH#21311: Index " - "doesn't enforce dtype argument") + @pytest.mark.xfail(reason="see GH#21311: Index " "doesn't enforce dtype argument") def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Index(["a", "b", "c"], dtype=float) def test_view_with_args(self): - restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', - 'empty'] + restricted = ["unicodeIndex", "strIndex", "catIndex", "boolIndex", "empty"] for i in list(set(self.indices.keys()) - set(restricted)): ind = self.indices[i] - ind.view('i8') - - @pytest.mark.parametrize('index_type', [ - 'unicodeIndex', - 'strIndex', - pytest.param('catIndex', marks=pytest.mark.xfail(reason="gh-25464")), - 'boolIndex', - 'empty']) + ind.view("i8") + + 
@pytest.mark.parametrize( + "index_type", + [ + "unicodeIndex", + "strIndex", + pytest.param("catIndex", marks=pytest.mark.xfail(reason="gh-25464")), + "boolIndex", + "empty", + ], + ) def test_view_with_args_object_array_raises(self, index_type): ind = self.indices[index_type] msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): - ind.view('i8') + ind.view("i8") def test_astype(self): - casted = self.intIndex.astype('i8') + casted = self.intIndex.astype("i8") # it works! casted.get_loc(5) # pass on name - self.intIndex.name = 'foobar' - casted = self.intIndex.astype('i8') - assert casted.name == 'foobar' + self.intIndex.name = "foobar" + casted = self.intIndex.astype("i8") + assert casted.name == "foobar" def test_equals_object(self): # same - assert Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c'])) + assert Index(["a", "b", "c"]).equals(Index(["a", "b", "c"])) - @pytest.mark.parametrize("comp", [ - Index(['a', 'b']), Index(['a', 'b', 'd']), ['a', 'b', 'c']]) + @pytest.mark.parametrize( + "comp", [Index(["a", "b"]), Index(["a", "b", "d"]), ["a", "b", "c"]] + ) def test_not_equals_object(self, comp): - assert not Index(['a', 'b', 'c']).equals(comp) + assert not Index(["a", "b", "c"]).equals(comp) def test_insert(self): # GH 7256 # validate neg/pos inserts - result = Index(['b', 'c', 'd']) + result = Index(["b", "c", "d"]) # test 0th element - tm.assert_index_equal(Index(['a', 'b', 'c', 'd']), - result.insert(0, 'a')) + tm.assert_index_equal(Index(["a", "b", "c", "d"]), result.insert(0, "a")) # test Nth element that follows Python list behavior - tm.assert_index_equal(Index(['b', 'c', 'e', 'd']), - result.insert(-1, 'e')) + tm.assert_index_equal(Index(["b", "c", "e", "d"]), result.insert(-1, "e")) # test loc +/- neq (0, -1) - tm.assert_index_equal(result.insert(1, 'z'), result.insert(-2, 'z')) + tm.assert_index_equal(result.insert(1, "z"), result.insert(-2, "z")) # test empty null_index = Index([]) - tm.assert_index_equal(Index(['a']), null_index.insert(0, 'a')) + tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a")) def test_insert_missing(self, nulls_fixture): # GH 22295 # test there is no mangling of NA values - expected = Index(['a', nulls_fixture, 'b', 'c']) - result = Index(list('abc')).insert(1, nulls_fixture) + expected = Index(["a", nulls_fixture, "b", "c"]) + result = Index(list("abc")).insert(1, nulls_fixture) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("pos,expected", [ - (0, Index(['b', 'c', 'd'], name='index')), - (-1, Index(['a', 'b', 'c'], name='index')) - ]) + @pytest.mark.parametrize( + "pos,expected", + [ + (0, Index(["b", "c", "d"], name="index")), + (-1, Index(["a", "b", "c"], name="index")), + ], + ) def test_delete(self, pos, expected): - index = Index(['a', 'b', 'c', 'd'], name='index') + index = Index(["a", "b", "c", "d"], name="index") result = index.delete(pos) tm.assert_index_equal(result, expected) assert result.name == expected.name def test_delete_raises(self): - index = Index(['a', 'b', 'c', 'd'], name='index') + index = Index(["a", "b", "c", "d"], name="index") msg = "index 5 is out of bounds for axis 0 with size 4" with pytest.raises(IndexError, match=msg): index.delete(5) @@ -571,20 +661,20 @@ def test_delete_raises(self): def test_identical(self): # index - i1 = Index(['a', 'b', 'c']) - i2 = Index(['a', 'b', 'c']) + i1 = Index(["a", "b", "c"]) + i2 = Index(["a", "b", "c"]) assert i1.identical(i2) - i1 = i1.rename('foo') + i1 = i1.rename("foo") assert i1.equals(i2) assert not 
i1.identical(i2) - i2 = i2.rename('foo') + i2 = i2.rename("foo") assert i1.identical(i2) - i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')]) - i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False) + i3 = Index([("a", "a"), ("a", "b"), ("b", "a")]) + i4 = Index([("a", "a"), ("a", "b"), ("b", "a")], tupleize_cols=False) assert not i3.identical(i4) def test_is_(self): @@ -600,7 +690,7 @@ def test_is_(self): # quasi-implementation dependent assert ind.is_(ind.view()) ind2 = ind.view() - ind2.name = 'bob' + ind2.name = "bob" assert ind.is_(ind2) assert ind2.is_(ind) # doesn't matter if Indices are *actually* views of underlying data, @@ -622,14 +712,14 @@ def test_asof(self): assert isinstance(self.dateIndex.asof(d), Timestamp) def test_asof_datetime_partial(self): - index = pd.date_range('2010-01-01', periods=2, freq='m') - expected = Timestamp('2010-02-28') - result = index.asof('2010-02') + index = pd.date_range("2010-01-01", periods=2, freq="m") + expected = Timestamp("2010-02-28") + result = index.asof("2010-02") assert result == expected assert not isinstance(result, Index) def test_nanosecond_index_access(self): - s = Series([Timestamp('20130101')]).values.view('i8')[0] + s = Series([Timestamp("20130101")]).values.view("i8")[0] r = DatetimeIndex([s + 50 + i for i in range(100)]) x = Series(np.random.randn(100), index=r) @@ -638,8 +728,9 @@ def test_nanosecond_index_access(self): # this does not yet work, as parsing strings is done via dateutil # assert first_value == x['2013-01-01 00:00:00.000000050+0000'] - expected_ts = np_datetime64_compat('2013-01-01 00:00:00.000000050+' - '0000', 'ns') + expected_ts = np_datetime64_compat( + "2013-01-01 00:00:00.000000050+" "0000", "ns" + ) assert first_value == x[Timestamp(expected_ts)] def test_booleanindex(self): @@ -660,8 +751,7 @@ def test_fancy(self): for i in sl: assert i == sl[sl.get_loc(i)] - @pytest.mark.parametrize("attr", [ - 'strIndex', 'intIndex', 'floatIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) def test_empty_fancy(self, attr, dtype): empty_arr = np.array([], dtype=dtype) @@ -671,8 +761,7 @@ def test_empty_fancy(self, attr, dtype): assert index[[]].identical(empty_index) assert index[empty_arr].identical(empty_index) - @pytest.mark.parametrize("attr", [ - 'strIndex', 'intIndex', 'floatIndex']) + @pytest.mark.parametrize("attr", ["strIndex", "intIndex", "floatIndex"]) def test_empty_fancy_raises(self, attr): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. 
@@ -699,27 +788,34 @@ def test_intersection(self, sort): inter = first.intersection(first, sort=sort) assert inter is first - @pytest.mark.parametrize("index2,keeps_name", [ - (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name - (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names - (Index([3, 4, 5, 6, 7]), False)]) + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name + (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names + (Index([3, 4, 5, 6, 7]), False), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation(self, index2, keeps_name, sort): - index1 = Index([1, 2, 3, 4, 5], name='index') + index1 = Index([1, 2, 3, 4, 5], name="index") expected = Index([3, 4, 5]) result = index1.intersection(index2, sort) if keeps_name: - expected.name = 'index' + expected.name = "index" assert result.name == expected.name tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("first_name,second_name,expected_name", [ - ('A', 'A', 'A'), ('A', 'B', None), (None, 'B', None)]) + @pytest.mark.parametrize( + "first_name,second_name,expected_name", + [("A", "A", "A"), ("A", "B", None), (None, "B", None)], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_name_preservation2(self, first_name, second_name, - expected_name, sort): + def test_intersection_name_preservation2( + self, first_name, second_name, expected_name, sort + ): first = self.strIndex[5:20] second = self.strIndex[:10] first.name = first_name @@ -727,12 +823,16 @@ def test_intersection_name_preservation2(self, first_name, second_name, intersect = first.intersection(second, sort=sort) assert intersect.name == expected_name - @pytest.mark.parametrize("index2,keeps_name", [ - (Index([4, 7, 6, 5, 3], name='index'), True), - (Index([4, 7, 6, 5, 3], name='other'), False)]) + @pytest.mark.parametrize( + "index2,keeps_name", + [ + (Index([4, 7, 6, 5, 3], name="index"), True), + (Index([4, 7, 6, 5, 3], name="other"), False), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_monotonic(self, index2, keeps_name, sort): - index1 = Index([5, 3, 2, 4, 1], name='index') + index1 = Index([5, 3, 2, 4, 1], name="index") expected = Index([5, 3, 4]) if keeps_name: @@ -743,15 +843,15 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index2,expected_arr", [ - (Index(['B', 'D']), ['B']), - (Index(['B', 'D', 'A']), ['A', 'B', 'A'])]) + @pytest.mark.parametrize( + "index2,expected_arr", + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, - sort): + def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique - index1 = Index(['A', 'B', 'A', 'C']) - expected = Index(expected_arr, dtype='object') + index1 = Index(["A", "B", "A", "C"]) + expected = Index(expected_arr, dtype="object") result = index1.intersection(index2, sort=sort) if sort is None: expected = expected.sort_values() @@ -762,41 +862,41 @@ def test_intersect_str_dates(self, sort): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] i1 = Index(dt_dates, dtype=object) - i2 = Index(['aa'], dtype=object) + i2 = Index(["aa"], dtype=object) result = i2.intersection(i1, sort=sort) 
assert len(result) == 0 def test_intersect_nosort(self): - result = pd.Index(['c', 'b', 'a']).intersection(['b', 'a']) - expected = pd.Index(['b', 'a']) + result = pd.Index(["c", "b", "a"]).intersection(["b", "a"]) + expected = pd.Index(["b", "a"]) tm.assert_index_equal(result, expected) def test_intersection_equal_sort(self): - idx = pd.Index(['c', 'a', 'b']) + idx = pd.Index(["c", "a", "b"]) tm.assert_index_equal(idx.intersection(idx, sort=False), idx) tm.assert_index_equal(idx.intersection(idx, sort=None), idx) @pytest.mark.xfail(reason="Not implemented") def test_intersection_equal_sort_true(self): # TODO decide on True behaviour - idx = pd.Index(['c', 'a', 'b']) - sorted_ = pd.Index(['a', 'b', 'c']) + idx = pd.Index(["c", "a", "b"]) + sorted_ = pd.Index(["a", "b", "c"]) tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) @pytest.mark.parametrize("sort", [None, False]) def test_chained_union(self, sort): # Chained unions handles names correctly - i1 = Index([1, 2], name='i1') - i2 = Index([5, 6], name='i2') - i3 = Index([3, 4], name='i3') + i1 = Index([1, 2], name="i1") + i2 = Index([5, 6], name="i2") + i3 = Index([3, 4], name="i3") union = i1.union(i2.union(i3, sort=sort), sort=sort) expected = i1.union(i2, sort=sort).union(i3, sort=sort) tm.assert_index_equal(union, expected) - j1 = Index([1, 2], name='j1') - j2 = Index([], name='j2') - j3 = Index([], name='j3') + j1 = Index([1, 2], name="j1") + j2 = Index([], name="j2") + j3 = Index([], name="j3") union = j1.union(j2.union(j3, sort=sort), sort=sort) expected = j1.union(j2, sort=sort).union(j3, sort=sort) tm.assert_index_equal(union, expected) @@ -813,7 +913,7 @@ def test_union(self, sort): tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) - @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_other_special(self, slice_): # https://github.com/pandas-dev/pandas/issues/24959 @@ -827,7 +927,7 @@ def test_union_sort_other_special(self, slice_): tm.assert_index_equal(idx.union(other, sort=False), idx) @pytest.mark.xfail(reason="Not implemented") - @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + @pytest.mark.parametrize("slice_", [slice(None), slice(0)]) def test_union_sort_special_true(self, slice_): # TODO decide on True behaviour # sort=True @@ -841,7 +941,7 @@ def test_union_sort_special_true(self, slice_): def test_union_sort_other_incomparable(self): # https://github.com/pandas-dev/pandas/issues/24959 - idx = pd.Index([1, pd.Timestamp('2000')]) + idx = pd.Index([1, pd.Timestamp("2000")]) # default (sort=None) with tm.assert_produces_warning(RuntimeWarning): result = idx.union(idx[:1]) @@ -861,12 +961,11 @@ def test_union_sort_other_incomparable(self): def test_union_sort_other_incomparable_true(self): # TODO decide on True behaviour # sort=True - idx = pd.Index([1, pd.Timestamp('2000')]) - with pytest.raises(TypeError, match='.*'): + idx = pd.Index([1, pd.Timestamp("2000")]) + with pytest.raises(TypeError, match=".*"): idx.union(idx[:1], sort=True) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("sort", [None, False]) def test_union_from_iterables(self, klass, sort): # GH 10149 @@ -898,13 +997,16 @@ def test_union_identity(self, sort): union = Index([]).union(first, sort=sort) assert (union is first) is (not sort) - @pytest.mark.parametrize("first_list", [list('ba'), 
list()]) - @pytest.mark.parametrize("second_list", [list('ab'), list()]) - @pytest.mark.parametrize("first_name, second_name, expected_name", [ - ('A', 'B', None), (None, 'B', None), ('A', None, None)]) + @pytest.mark.parametrize("first_list", [list("ba"), list()]) + @pytest.mark.parametrize("second_list", [list("ab"), list()]) + @pytest.mark.parametrize( + "first_name, second_name, expected_name", + [("A", "B", None), (None, "B", None), ("A", None, None)], + ) @pytest.mark.parametrize("sort", [None, False]) - def test_union_name_preservation(self, first_list, second_list, first_name, - second_name, expected_name, sort): + def test_union_name_preservation( + self, first_list, second_list, first_name, second_name, expected_name, sort + ): first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -927,7 +1029,7 @@ def test_union_dt_as_obj(self, sort): if self.dateIndex.dtype == np.object_: appended = np.append(self.strIndex, self.dateIndex) else: - appended = np.append(self.strIndex, self.dateIndex.astype('O')) + appended = np.append(self.strIndex, self.dateIndex.astype("O")) assert tm.equalContents(firstCat, appended) assert tm.equalContents(secondCat, self.strIndex) @@ -935,11 +1037,12 @@ def test_union_dt_as_obj(self, sort): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) - @pytest.mark.parametrize("method", ['union', 'intersection', 'difference', - 'symmetric_difference']) + @pytest.mark.parametrize( + "method", ["union", "intersection", "difference", "symmetric_difference"] + ) def test_setops_disallow_true(self, method): - idx1 = pd.Index(['a', 'b']) - idx2 = pd.Index(['b', 'c']) + idx1 = pd.Index(["a", "b"]) + idx2 = pd.Index(["b", "c"]) with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) @@ -969,13 +1072,14 @@ def test_map_with_tuples(self): def test_map_with_tuples_mi(self): # Test that returning a single object from a MultiIndex # returns an Index. 
- first_level = ['foo', 'bar', 'baz'] + first_level = ["foo", "bar", "baz"] multi_index = MultiIndex.from_tuples(zip(first_level, [1, 2, 3])) reduced_index = multi_index.map(lambda x: x[0]) tm.assert_index_equal(reduced_index, Index(first_level)) - @pytest.mark.parametrize("attr", [ - 'makeDateIndex', 'makePeriodIndex', 'makeTimedeltaIndex']) + @pytest.mark.parametrize( + "attr", ["makeDateIndex", "makePeriodIndex", "makeTimedeltaIndex"] + ) def test_map_tseries_indices_return_index(self, attr): index = getattr(tm, attr)(10) expected = Index([1] * 10) @@ -983,28 +1087,30 @@ def test_map_tseries_indices_return_index(self, attr): tm.assert_index_equal(expected, result) def test_map_tseries_indices_accsr_return_index(self): - date_index = tm.makeDateIndex(24, freq='h', name='hourly') - expected = Index(range(24), name='hourly') + date_index = tm.makeDateIndex(24, freq="h", name="hourly") + expected = Index(range(24), name="hourly") tm.assert_index_equal(expected, date_index.map(lambda x: x.hour)) @pytest.mark.parametrize( "mapper", [ lambda values, index: {i: e for e, i in zip(values, index)}, - lambda values, index: pd.Series(values, index)]) + lambda values, index: pd.Series(values, index), + ], + ) def test_map_dictlike(self, mapper): # GH 12756 - expected = Index(['foo', 'bar', 'baz']) + expected = Index(["foo", "bar", "baz"]) index = tm.makeIntIndex(3) result = index.map(mapper(expected.values, index)) tm.assert_index_equal(result, expected) # TODO: replace with fixture for name in self.indices.keys(): - if name == 'catIndex': + if name == "catIndex": # Tested in test_categorical continue - elif name == 'repeats': + elif name == "repeats": # Cannot map duplicated index continue @@ -1012,18 +1118,19 @@ def test_map_dictlike(self, mapper): expected = Index(np.arange(len(index), 0, -1)) # to match proper result coercion for uints - if name == 'empty': + if name == "empty": expected = Index([]) result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("mapper", [ - Series(['foo', 2., 'baz'], index=[0, 2, -1]), - {0: 'foo', 2: 2.0, -1: 'baz'}]) + @pytest.mark.parametrize( + "mapper", + [Series(["foo", 2.0, "baz"], index=[0, 2, -1]), {0: "foo", 2: 2.0, -1: "baz"}], + ) def test_map_with_non_function_missing_values(self, mapper): # GH 12756 - expected = Index([2., np.nan, 'foo']) + expected = Index([2.0, np.nan, "foo"]) result = Index([2, 1, 0]).map(mapper) tm.assert_index_equal(expected, result) @@ -1031,20 +1138,20 @@ def test_map_with_non_function_missing_values(self, mapper): def test_map_na_exclusion(self): index = Index([1.5, np.nan, 3, np.nan, 5]) - result = index.map(lambda x: x * 2, na_action='ignore') + result = index.map(lambda x: x * 2, na_action="ignore") expected = index * 2 tm.assert_index_equal(result, expected) def test_map_defaultdict(self): index = Index([1, 2, 3]) - default_dict = defaultdict(lambda: 'blank') - default_dict[1] = 'stuff' + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" result = index.map(default_dict) - expected = Index(['stuff', 'blank', 'blank']) + expected = Index(["stuff", "blank", "blank"]) tm.assert_index_equal(result, expected) def test_append_multiple(self): - index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + index = Index(["a", "b", "c", "d", "e", "f"]) foos = [index[:2], index[2:4], index[4:]] result = foos[0].append(foos[1:]) @@ -1054,17 +1161,15 @@ def test_append_multiple(self): result = index.append([]) tm.assert_index_equal(result, index) - 
@pytest.mark.parametrize("name,expected", [ - ('foo', 'foo'), ('bar', None)]) + @pytest.mark.parametrize("name,expected", [("foo", "foo"), ("bar", None)]) def test_append_empty_preserve_name(self, name, expected): - left = Index([], name='foo') + left = Index([], name="foo") right = Index([1, 2, 3], name=name) result = left.append(right) assert result.name == expected - @pytest.mark.parametrize("second_name,expected", [ - (None, None), ('name', 'name')]) + @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) @pytest.mark.parametrize("sort", [None, False]) def test_difference_name_preservation(self, second_name, expected, sort): # TODO: replace with fixturesult @@ -1072,7 +1177,7 @@ def test_difference_name_preservation(self, second_name, expected, sort): second = self.strIndex[:10] answer = self.strIndex[10:20] - first.name = 'name' + first.name = "name" second.name = second_name result = first.difference(second, sort=sort) @@ -1086,7 +1191,7 @@ def test_difference_name_preservation(self, second_name, expected, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference_empty_arg(self, sort): first = self.strIndex[5:20] - first.name == 'name' + first.name == "name" result = first.difference([], sort) assert tm.equalContents(result, first) @@ -1095,7 +1200,7 @@ def test_difference_empty_arg(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_difference_identity(self, sort): first = self.strIndex[5:20] - first.name == 'name' + first.name == "name" result = first.difference(first, sort) assert len(result) == 0 @@ -1117,7 +1222,7 @@ def test_difference_sort(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, sort): # smoke - index1 = Index([5, 2, 3, 4], name='index1') + index1 = Index([5, 2, 3, 4], name="index1") index2 = Index([2, 3, 4, 1]) result = index1.symmetric_difference(index2, sort=sort) expected = Index([5, 1]) @@ -1132,16 +1237,16 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None - @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) def test_difference_incomparable(self, opname): - a = pd.Index([3, pd.Timestamp('2000'), 1]) - b = pd.Index([2, pd.Timestamp('1999'), 1]) + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b) # sort=None, the default result = op(a) - expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) - if opname == 'difference': + expected = pd.Index([3, pd.Timestamp("2000"), 2, pd.Timestamp("1999")]) + if opname == "difference": expected = expected[:2] tm.assert_index_equal(result, expected) @@ -1151,31 +1256,35 @@ def test_difference_incomparable(self, opname): tm.assert_index_equal(result, expected) @pytest.mark.xfail(reason="Not implemented") - @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + @pytest.mark.parametrize("opname", ["difference", "symmetric_difference"]) def test_difference_incomparable_true(self, opname): # TODO decide on True behaviour # # sort=True, raises - a = pd.Index([3, pd.Timestamp('2000'), 1]) - b = pd.Index([2, pd.Timestamp('1999'), 1]) + a = pd.Index([3, pd.Timestamp("2000"), 1]) + b = pd.Index([2, pd.Timestamp("1999"), 1]) op = operator.methodcaller(opname, b, sort=True) - with pytest.raises(TypeError, match='Cannot compare'): + with 
pytest.raises(TypeError, match="Cannot compare"): op(a) @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(self.tuples) - index2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + index2 = MultiIndex.from_tuples([("foo", 1), ("bar", 3)]) result = index1.symmetric_difference(index2, sort=sort) - expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + expected = MultiIndex.from_tuples([("bar", 2), ("baz", 3), ("bar", 3)]) if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) - @pytest.mark.parametrize("index2,expected", [ - (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), - (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0]))]) + @pytest.mark.parametrize( + "index2,expected", + [ + (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), + (Index([0, 1]), Index([np.nan, 2.0, 3.0, 0.0])), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_missing(self, index2, expected, sort): # GH 13514 change: {nan} - {nan} == {} @@ -1189,24 +1298,23 @@ def test_symmetric_difference_missing(self, index2, expected, sort): @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_non_index(self, sort): - index1 = Index([1, 2, 3, 4], name='index1') + index1 = Index([1, 2, 3, 4], name="index1") index2 = np.array([2, 3, 4, 5]) expected = Index([1, 5]) result = index1.symmetric_difference(index2, sort=sort) assert tm.equalContents(result, expected) - assert result.name == 'index1' + assert result.name == "index1" - result = index1.symmetric_difference(index2, result_name='new_name', - sort=sort) + result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) assert tm.equalContents(result, expected) - assert result.name == 'new_name' + assert result.name == "new_name" @pytest.mark.parametrize("sort", [None, False]) def test_difference_type(self, sort): # GH 20040 # If taking difference of a set and itself, it # needs to preserve the type of the index - skip_index_keys = ['repeats'] + skip_index_keys = ["repeats"] for key, index in self.generate_index_types(skip_index_keys): result = index.difference(index, sort=sort) expected = index.drop(index) @@ -1218,42 +1326,66 @@ def test_intersection_difference(self, sort): # Test that the intersection of an index with an # empty index produces the same index as the difference # of an index with itself. 
Test for all types - skip_index_keys = ['repeats'] + skip_index_keys = ["repeats"] for key, index in self.generate_index_types(skip_index_keys): inter = index.intersection(index.drop(index)) diff = index.difference(index, sort=sort) tm.assert_index_equal(inter, diff) - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', False), ('boolIndex', False), ('catIndex', False), - ('intIndex', True), ('dateIndex', False), ('floatIndex', True)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", False), + ("boolIndex", False), + ("catIndex", False), + ("intIndex", True), + ("dateIndex", False), + ("floatIndex", True), + ], + ) def test_is_numeric(self, attr, expected): assert getattr(self, attr).is_numeric() == expected - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', True), ('boolIndex', True), ('catIndex', False), - ('intIndex', False), ('dateIndex', False), ('floatIndex', False)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", True), + ("boolIndex", True), + ("catIndex", False), + ("intIndex", False), + ("dateIndex", False), + ("floatIndex", False), + ], + ) def test_is_object(self, attr, expected): assert getattr(self, attr).is_object() == expected - @pytest.mark.parametrize("attr,expected", [ - ('strIndex', False), ('boolIndex', False), ('catIndex', False), - ('intIndex', False), ('dateIndex', True), ('floatIndex', False)]) + @pytest.mark.parametrize( + "attr,expected", + [ + ("strIndex", False), + ("boolIndex", False), + ("catIndex", False), + ("intIndex", False), + ("dateIndex", True), + ("floatIndex", False), + ], + ) def test_is_all_dates(self, attr, expected): assert getattr(self, attr).is_all_dates == expected def test_summary(self): self._check_method_works(Index._summary) # GH3869 - ind = Index(['{other}%s', "~:{range}:0"], name='A') + ind = Index(["{other}%s", "~:{range}:0"], name="A") result = ind._summary() # shouldn't be formatted accidentally. 
- assert '~:{range}:0' in result - assert '{other}%s' in result + assert "~:{range}:0" in result + assert "{other}%s" in result # GH18217 def test_summary_deprecated(self): - ind = Index(['{other}%s', "~:{range}:0"], name='A') + ind = Index(["{other}%s", "~:{range}:0"], name="A") with tm.assert_produces_warning(FutureWarning): ind.summary() @@ -1274,8 +1406,7 @@ def test_format(self): self.strIndex[:0].format() - @pytest.mark.parametrize("vals", [ - [1, 2.0 + 3.0j, 4.], ['a', 'b', 'c']]) + @pytest.mark.parametrize("vals", [[1, 2.0 + 3.0j, 4.0], ["a", "b", "c"]]) def test_format_missing(self, vals, nulls_fixture): # 2845 vals = list(vals) # Copy for each iteration @@ -1283,7 +1414,7 @@ def test_format_missing(self, vals, nulls_fixture): index = Index(vals) formatted = index.format() - expected = [str(index[0]), str(index[1]), str(index[2]), 'NaN'] + expected = [str(index[0]), str(index[1]), str(index[2]), "NaN"] assert formatted == expected assert index[3] is nulls_fixture @@ -1291,20 +1422,20 @@ def test_format_missing(self, vals, nulls_fixture): def test_format_with_name_time_info(self): # bug I fixed 12/20/2011 inc = timedelta(hours=4) - dates = Index([dt + inc for dt in self.dateIndex], name='something') + dates = Index([dt + inc for dt in self.dateIndex], name="something") formatted = dates.format(name=True) - assert formatted[0] == 'something' + assert formatted[0] == "something" def test_format_datetime_with_time(self): t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) result = t.format() - expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00'] + expected = ["2012-02-07 00:00:00", "2012-02-07 23:00:00"] assert len(result) == 2 assert result == expected - @pytest.mark.parametrize("op", ['any', 'all']) + @pytest.mark.parametrize("op", ["any", "all"]) def test_logical_compat(self, op): index = self.create_index() assert getattr(index, op)() == getattr(index.values, op)() @@ -1328,11 +1459,15 @@ def test_get_indexer(self): assert_almost_equal(r1, e1) @pytest.mark.parametrize("reverse", [True, False]) - @pytest.mark.parametrize("expected,method", [ - (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'pad'), - (np.array([-1, 0, 0, 1, 1], dtype=np.intp), 'ffill'), - (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'backfill'), - (np.array([0, 0, 1, 1, 2], dtype=np.intp), 'bfill')]) + @pytest.mark.parametrize( + "expected,method", + [ + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "pad"), + (np.array([-1, 0, 0, 1, 1], dtype=np.intp), "ffill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "backfill"), + (np.array([0, 0, 1, 1, 2], dtype=np.intp), "bfill"), + ], + ) def test_get_indexer_methods(self, reverse, expected, method): index1 = Index([1, 2, 3, 4, 5]) index2 = Index([2, 4, 6]) @@ -1348,66 +1483,70 @@ def test_get_indexer_invalid(self): # GH10411 index = Index(np.arange(10)) - with pytest.raises(ValueError, match='tolerance argument'): + with pytest.raises(ValueError, match="tolerance argument"): index.get_indexer([1, 0], tolerance=1) - with pytest.raises(ValueError, match='limit argument'): + with pytest.raises(ValueError, match="limit argument"): index.get_indexer([1, 0], limit=1) @pytest.mark.parametrize( - 'method, tolerance, indexer, expected', + "method, tolerance, indexer, expected", [ - ('pad', None, [0, 5, 9], [0, 5, 9]), - ('backfill', None, [0, 5, 9], [0, 5, 9]), - ('nearest', None, [0, 5, 9], [0, 5, 9]), - ('pad', 0, [0, 5, 9], [0, 5, 9]), - ('backfill', 0, [0, 5, 9], [0, 5, 9]), - ('nearest', 0, [0, 5, 9], [0, 5, 9]), - - ('pad', None, [0.2, 1.8, 8.5], [0, 1, 8]), - 
('backfill', None, [0.2, 1.8, 8.5], [1, 2, 9]), - ('nearest', None, [0.2, 1.8, 8.5], [0, 2, 9]), - ('pad', 1, [0.2, 1.8, 8.5], [0, 1, 8]), - ('backfill', 1, [0.2, 1.8, 8.5], [1, 2, 9]), - ('nearest', 1, [0.2, 1.8, 8.5], [0, 2, 9]), - - ('pad', 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), - ('backfill', 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), - ('nearest', 0.2, [0.2, 1.8, 8.5], [0, 2, -1])]) + ("pad", None, [0, 5, 9], [0, 5, 9]), + ("backfill", None, [0, 5, 9], [0, 5, 9]), + ("nearest", None, [0, 5, 9], [0, 5, 9]), + ("pad", 0, [0, 5, 9], [0, 5, 9]), + ("backfill", 0, [0, 5, 9], [0, 5, 9]), + ("nearest", 0, [0, 5, 9], [0, 5, 9]), + ("pad", None, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", None, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", None, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 1, [0.2, 1.8, 8.5], [0, 1, 8]), + ("backfill", 1, [0.2, 1.8, 8.5], [1, 2, 9]), + ("nearest", 1, [0.2, 1.8, 8.5], [0, 2, 9]), + ("pad", 0.2, [0.2, 1.8, 8.5], [0, -1, -1]), + ("backfill", 0.2, [0.2, 1.8, 8.5], [-1, 2, -1]), + ("nearest", 0.2, [0.2, 1.8, 8.5], [0, 2, -1]), + ], + ) def test_get_indexer_nearest(self, method, tolerance, indexer, expected): index = Index(np.arange(10)) actual = index.get_indexer(indexer, method=method, tolerance=tolerance) - tm.assert_numpy_array_equal(actual, np.array(expected, - dtype=np.intp)) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) - @pytest.mark.parametrize('listtype', [list, tuple, Series, np.array]) + @pytest.mark.parametrize("listtype", [list, tuple, Series, np.array]) @pytest.mark.parametrize( - 'tolerance, expected', - list(zip([[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], - [0.1, 0.5, 0.5]], - [[0, 2, -1], [0, -1, -1], - [-1, 2, 9]]))) - def test_get_indexer_nearest_listlike_tolerance(self, tolerance, - expected, listtype): + "tolerance, expected", + list( + zip( + [[0.3, 0.3, 0.1], [0.2, 0.1, 0.1], [0.1, 0.5, 0.5]], + [[0, 2, -1], [0, -1, -1], [-1, 2, 9]], + ) + ), + ) + def test_get_indexer_nearest_listlike_tolerance( + self, tolerance, expected, listtype + ): index = Index(np.arange(10)) - actual = index.get_indexer([0.2, 1.8, 8.5], method='nearest', - tolerance=listtype(tolerance)) - tm.assert_numpy_array_equal(actual, np.array(expected, - dtype=np.intp)) + actual = index.get_indexer( + [0.2, 1.8, 8.5], method="nearest", tolerance=listtype(tolerance) + ) + tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) def test_get_indexer_nearest_error(self): index = Index(np.arange(10)) - with pytest.raises(ValueError, match='limit argument'): - index.get_indexer([1, 0], method='nearest', limit=1) + with pytest.raises(ValueError, match="limit argument"): + index.get_indexer([1, 0], method="nearest", limit=1) - with pytest.raises(ValueError, match='tolerance size must match'): - index.get_indexer([1, 0], method='nearest', - tolerance=[1, 2, 3]) + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_indexer([1, 0], method="nearest", tolerance=[1, 2, 3]) - @pytest.mark.parametrize("method,expected", [ - ('pad', [8, 7, 0]), ('backfill', [9, 8, 1]), ('nearest', [9, 7, 0])]) + @pytest.mark.parametrize( + "method,expected", + [("pad", [8, 7, 0]), ("backfill", [9, 8, 1]), ("nearest", [9, 7, 0])], + ) def test_get_indexer_nearest_decreasing(self, method, expected): index = Index(np.arange(10))[::-1] @@ -1417,31 +1556,35 @@ def test_get_indexer_nearest_decreasing(self, method, expected): actual = index.get_indexer([0.2, 1.8, 8.5], method=method) tm.assert_numpy_array_equal(actual, np.array(expected, dtype=np.intp)) - 
@pytest.mark.parametrize("method,expected", [ - ('pad', np.array([-1, 0, 1, 1], dtype=np.intp)), - ('backfill', np.array([0, 0, 1, -1], dtype=np.intp))]) + @pytest.mark.parametrize( + "method,expected", + [ + ("pad", np.array([-1, 0, 1, 1], dtype=np.intp)), + ("backfill", np.array([0, 0, 1, -1], dtype=np.intp)), + ], + ) def test_get_indexer_strings(self, method, expected): - index = pd.Index(['b', 'c']) - actual = index.get_indexer(['a', 'b', 'c', 'd'], method=method) + index = pd.Index(["b", "c"]) + actual = index.get_indexer(["a", "b", "c", "d"], method=method) tm.assert_numpy_array_equal(actual, expected) def test_get_indexer_strings_raises(self): - index = pd.Index(['b', 'c']) + index = pd.Index(["b", "c"]) msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='nearest') + index.get_indexer(["a", "b", "c", "d"], method="nearest") with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) + index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2) with pytest.raises(TypeError, match=msg): - index.get_indexer(['a', 'b', 'c', 'd'], method='pad', - tolerance=[2, 2, 2, 2]) + index.get_indexer( + ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] + ) - @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, - Float64Index]) + @pytest.mark.parametrize("idx_class", [Int64Index, RangeIndex, Float64Index]) def test_get_indexer_numeric_index_boolean_target(self, idx_class): # GH 16877 @@ -1450,22 +1593,23 @@ def test_get_indexer_numeric_index_boolean_target(self, idx_class): expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - def test_get_indexer_with_NA_values(self, unique_nulls_fixture, - unique_nulls_fixture2): + def test_get_indexer_with_NA_values( + self, unique_nulls_fixture, unique_nulls_fixture2 + ): # GH 22332 # check pairwise, that no pair of na values # is mangled if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values are not unique - arr = np.array([unique_nulls_fixture, - unique_nulls_fixture2], dtype=np.object) + arr = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) index = pd.Index(arr, dtype=np.object) - result = index.get_indexer([unique_nulls_fixture, - unique_nulls_fixture2, 'Unknown']) + result = index.get_indexer( + [unique_nulls_fixture, unique_nulls_fixture2, "Unknown"] + ) expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc(self, method): index = pd.Index([0, 1, 2]) assert index.get_loc(1, method=method) == 1 @@ -1473,58 +1617,59 @@ def test_get_loc(self, method): if method: assert index.get_loc(1, method=method, tolerance=0) == 1 - @pytest.mark.parametrize("method", [None, 'pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", [None, "pad", "backfill", "nearest"]) def test_get_loc_raises_bad_label(self, method): index = pd.Index([0, 1, 2]) if method: # Messages vary across versions if PY36: - msg = 'not supported between' + msg = "not supported between" else: - msg = 'unorderable types' + msg = "unorderable types" else: - msg = 'invalid key' + msg = "invalid key" with pytest.raises(TypeError, match=msg): index.get_loc([1, 2], method=method) - @pytest.mark.parametrize("method,loc", [ - ('pad', 
1), ('backfill', 2), ('nearest', 1)]) + @pytest.mark.parametrize( + "method,loc", [("pad", 1), ("backfill", 2), ("nearest", 1)] + ) def test_get_loc_tolerance(self, method, loc): index = pd.Index([0, 1, 2]) assert index.get_loc(1.1, method) == loc assert index.get_loc(1.1, method, tolerance=1) == loc - @pytest.mark.parametrize("method", ['pad', 'backfill', 'nearest']) + @pytest.mark.parametrize("method", ["pad", "backfill", "nearest"]) def test_get_loc_outside_tolerance_raises(self, method): index = pd.Index([0, 1, 2]) - with pytest.raises(KeyError, match='1.1'): + with pytest.raises(KeyError, match="1.1"): index.get_loc(1.1, method, tolerance=0.05) def test_get_loc_bad_tolerance_raises(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='must be numeric'): - index.get_loc(1.1, 'nearest', tolerance='invalid') + with pytest.raises(ValueError, match="must be numeric"): + index.get_loc(1.1, "nearest", tolerance="invalid") def test_get_loc_tolerance_no_method_raises(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='tolerance .* valid if'): + with pytest.raises(ValueError, match="tolerance .* valid if"): index.get_loc(1.1, tolerance=1) def test_get_loc_raises_missized_tolerance(self): index = pd.Index([0, 1, 2]) - with pytest.raises(ValueError, match='tolerance size must match'): - index.get_loc(1.1, 'nearest', tolerance=[1, 1]) + with pytest.raises(ValueError, match="tolerance size must match"): + index.get_loc(1.1, "nearest", tolerance=[1, 1]) def test_get_loc_raises_object_nearest(self): - index = pd.Index(['a', 'c']) - with pytest.raises(TypeError, match='unsupported operand type'): - index.get_loc('a', method='nearest') + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="nearest") def test_get_loc_raises_object_tolerance(self): - index = pd.Index(['a', 'c']) - with pytest.raises(TypeError, match='unsupported operand type'): - index.get_loc('a', method='pad', tolerance='invalid') + index = pd.Index(["a", "c"]) + with pytest.raises(TypeError, match="unsupported operand type"): + index.get_loc("a", method="pad", tolerance="invalid") @pytest.mark.parametrize("dtype", [int, float]) def test_slice_locs(self, dtype): @@ -1555,17 +1700,17 @@ def test_slice_float_locs(self, dtype): assert index2.slice_locs(10.5, -1) == (0, n) def test_slice_locs_dup(self): - index = Index(['a', 'a', 'b', 'c', 'd', 'd']) - assert index.slice_locs('a', 'd') == (0, 6) - assert index.slice_locs(end='d') == (0, 6) - assert index.slice_locs('a', 'c') == (0, 4) - assert index.slice_locs('b', 'd') == (2, 6) + index = Index(["a", "a", "b", "c", "d", "d"]) + assert index.slice_locs("a", "d") == (0, 6) + assert index.slice_locs(end="d") == (0, 6) + assert index.slice_locs("a", "c") == (0, 4) + assert index.slice_locs("b", "d") == (2, 6) index2 = index[::-1] - assert index2.slice_locs('d', 'a') == (0, 6) - assert index2.slice_locs(end='a') == (0, 6) - assert index2.slice_locs('d', 'b') == (0, 4) - assert index2.slice_locs('c', 'a') == (2, 6) + assert index2.slice_locs("d", "a") == (0, 6) + assert index2.slice_locs(end="a") == (0, 6) + assert index2.slice_locs("d", "b") == (0, 4) + assert index2.slice_locs("c", "a") == (2, 6) @pytest.mark.parametrize("dtype", [int, float]) def test_slice_locs_dup_numeric(self, dtype): @@ -1587,30 +1732,39 @@ def test_slice_locs_na(self): def test_slice_locs_na_raises(self): index = Index([np.nan, 1, 2]) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, 
match=""): index.slice_locs(start=1.5) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): index.slice_locs(end=1.5) - @pytest.mark.parametrize("in_slice,expected", [ - (pd.IndexSlice[::-1], 'yxdcb'), (pd.IndexSlice['b':'y':-1], ''), - (pd.IndexSlice['b'::-1], 'b'), (pd.IndexSlice[:'b':-1], 'yxdcb'), - (pd.IndexSlice[:'y':-1], 'y'), (pd.IndexSlice['y'::-1], 'yxdcb'), - (pd.IndexSlice['y'::-4], 'yb'), - # absent labels - (pd.IndexSlice[:'a':-1], 'yxdcb'), (pd.IndexSlice[:'a':-2], 'ydb'), - (pd.IndexSlice['z'::-1], 'yxdcb'), (pd.IndexSlice['z'::-3], 'yc'), - (pd.IndexSlice['m'::-1], 'dcb'), (pd.IndexSlice[:'m':-1], 'yx'), - (pd.IndexSlice['a':'a':-1], ''), (pd.IndexSlice['z':'z':-1], ''), - (pd.IndexSlice['m':'m':-1], '') - ]) + @pytest.mark.parametrize( + "in_slice,expected", + [ + (pd.IndexSlice[::-1], "yxdcb"), + (pd.IndexSlice["b":"y":-1], ""), + (pd.IndexSlice["b"::-1], "b"), + (pd.IndexSlice[:"b":-1], "yxdcb"), + (pd.IndexSlice[:"y":-1], "y"), + (pd.IndexSlice["y"::-1], "yxdcb"), + (pd.IndexSlice["y"::-4], "yb"), + # absent labels + (pd.IndexSlice[:"a":-1], "yxdcb"), + (pd.IndexSlice[:"a":-2], "ydb"), + (pd.IndexSlice["z"::-1], "yxdcb"), + (pd.IndexSlice["z"::-3], "yc"), + (pd.IndexSlice["m"::-1], "dcb"), + (pd.IndexSlice[:"m":-1], "yx"), + (pd.IndexSlice["a":"a":-1], ""), + (pd.IndexSlice["z":"z":-1], ""), + (pd.IndexSlice["m":"m":-1], ""), + ], + ) def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list('bcdxy')) + index = Index(list("bcdxy")) - s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, - in_slice.step) - result = index[s_start:s_stop:in_slice.step] + s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) + result = index[s_start : s_stop : in_slice.step] expected = pd.Index(list(expected)) tm.assert_index_equal(result, expected) @@ -1627,9 +1781,9 @@ def test_drop_by_str_label(self): expected = self.strIndex[1:] tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("keys", [['foo', 'bar'], ['1', 'bar']]) + @pytest.mark.parametrize("keys", [["foo", "bar"], ["1", "bar"]]) def test_drop_by_str_label_raises_missing_keys(self, keys): - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): self.strIndex.drop(keys) def test_drop_by_str_label_errors_ignore(self): @@ -1638,13 +1792,13 @@ def test_drop_by_str_label_errors_ignore(self): # errors='ignore' n = len(self.strIndex) drop = self.strIndex[list(range(5, 10))] - mixed = drop.tolist() + ['foo'] - dropped = self.strIndex.drop(mixed, errors='ignore') + mixed = drop.tolist() + ["foo"] + dropped = self.strIndex.drop(mixed, errors="ignore") expected = self.strIndex[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = self.strIndex.drop(['foo', 'bar'], errors='ignore') + dropped = self.strIndex.drop(["foo", "bar"], errors="ignore") expected = self.strIndex[list(range(n))] tm.assert_index_equal(dropped, expected) @@ -1658,25 +1812,27 @@ def test_drop_by_numeric_label_loc(self): def test_drop_by_numeric_label_raises_missing_keys(self): index = Index([1, 2, 3]) - with pytest.raises(KeyError, match=''): + with pytest.raises(KeyError, match=""): index.drop([3, 4]) - @pytest.mark.parametrize("key,expected", [ - (4, Index([1, 2, 3])), ([3, 4, 5], Index([1, 2]))]) + @pytest.mark.parametrize( + "key,expected", [(4, Index([1, 2, 3])), ([3, 4, 5], Index([1, 2]))] + ) def test_drop_by_numeric_label_errors_ignore(self, key, expected): index = Index([1, 2, 3]) - dropped = 
index.drop(key, errors='ignore') + dropped = index.drop(key, errors="ignore") tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("values", [['a', 'b', ('c', 'd')], - ['a', ('c', 'd'), 'b'], - [('c', 'd'), 'a', 'b']]) - @pytest.mark.parametrize("to_drop", [[('c', 'd'), 'a'], ['a', ('c', 'd')]]) + @pytest.mark.parametrize( + "values", + [["a", "b", ("c", "d")], ["a", ("c", "d"), "b"], [("c", "d"), "a", "b"]], + ) + @pytest.mark.parametrize("to_drop", [[("c", "d"), "a"], ["a", ("c", "d")]]) def test_drop_tuple(self, values, to_drop): # GH 18304 index = pd.Index(values) - expected = pd.Index(['b']) + expected = pd.Index(["b"]) result = index.drop(to_drop) tm.assert_index_equal(result, expected) @@ -1687,31 +1843,53 @@ def test_drop_tuple(self, values, to_drop): tm.assert_index_equal(result, expected) removed = index.drop(to_drop[1]) - msg = r"\"\[{}\] not found in axis\"".format( - re.escape(to_drop[1].__repr__())) + msg = r"\"\[{}\] not found in axis\"".format(re.escape(to_drop[1].__repr__())) for drop_me in to_drop[1], [to_drop[1]]: with pytest.raises(KeyError, match=msg): removed.drop(drop_me) - @pytest.mark.parametrize("method,expected,sort", [ - ('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')]), - False), - - ('intersection', np.array([(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')]), - None), - - ('union', np.array([(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'), - (2, 'C')], dtype=[('num', int), ('let', 'a1')]), - None) - ]) + @pytest.mark.parametrize( + "method,expected,sort", + [ + ( + "intersection", + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ), + False, + ), + ( + "intersection", + np.array( + [(1, "A"), (1, "B"), (2, "A"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ), + None, + ), + ( + "union", + np.array( + [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")], + dtype=[("num", int), ("let", "a1")], + ), + None, + ), + ], + ) def test_tuple_union_bug(self, method, expected, sort): - index1 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')])) - index2 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), - (2, 'B'), (1, 'C'), (2, 'C')], - dtype=[('num', int), ('let', 'a1')])) + index1 = Index( + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B")], + dtype=[("num", int), ("let", "a1")], + ) + ) + index2 = Index( + np.array( + [(1, "A"), (2, "A"), (1, "B"), (2, "B"), (1, "C"), (2, "C")], + dtype=[("num", int), ("let", "a1")], + ) + ) result = getattr(index1, method)(index2, sort=sort) assert result.ndim == 1 @@ -1719,10 +1897,15 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("attr", [ - 'is_monotonic_increasing', 'is_monotonic_decreasing', - '_is_strictly_monotonic_increasing', - '_is_strictly_monotonic_decreasing']) + @pytest.mark.parametrize( + "attr", + [ + "is_monotonic_increasing", + "is_monotonic_decreasing", + "_is_strictly_monotonic_increasing", + "_is_strictly_monotonic_decreasing", + ], + ) def test_is_monotonic_incomparable(self, attr): index = Index([5, datetime.now(), 7]) assert not getattr(index, attr) @@ -1737,13 +1920,14 @@ def test_get_set_value(self): self.dateIndex.set_value(values, date, 10) assert values[67] == 10 - @pytest.mark.parametrize("values", [ - ['foo', 'bar', 'quux'], {'foo', 'bar', 'quux'}]) - 
@pytest.mark.parametrize("index,expected", [ - (Index(['qux', 'baz', 'foo', 'bar']), - np.array([False, False, True, True])), - (Index([]), np.array([], dtype=bool)) # empty - ]) + @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) + @pytest.mark.parametrize( + "index,expected", + [ + (Index(["qux", "baz", "foo", "bar"]), np.array([False, False, True, True])), + (Index([]), np.array([], dtype=bool)), # empty + ], + ) def test_isin(self, values, index, expected): result = index.isin(values) tm.assert_numpy_array_equal(result, expected) @@ -1753,62 +1937,75 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): # mangle the various types (save a corner case with PyPy) # all nans are the same - if (isinstance(nulls_fixture, float) and - isinstance(nulls_fixture2, float) and - math.isnan(nulls_fixture) and - math.isnan(nulls_fixture2)): - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, True])) + if ( + isinstance(nulls_fixture, float) + and isinstance(nulls_fixture2, float) + and math.isnan(nulls_fixture) + and math.isnan(nulls_fixture2) + ): + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, True]), + ) elif nulls_fixture is nulls_fixture2: # should preserve NA type - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, True])) + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, True]), + ) else: - tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [nulls_fixture2]), np.array([False, False])) + tm.assert_numpy_array_equal( + Index(["a", nulls_fixture]).isin([nulls_fixture2]), + np.array([False, False]), + ) def test_isin_nan_common_float64(self, nulls_fixture): if nulls_fixture is pd.NaT: pytest.skip("pd.NaT not compatible with Float64Index") # Float64Index overrides isin, so must be checked separately - tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin( - [np.nan]), np.array([False, True])) + tm.assert_numpy_array_equal( + Float64Index([1.0, nulls_fixture]).isin([np.nan]), np.array([False, True]) + ) # we cannot compare NaT with NaN - tm.assert_numpy_array_equal(Float64Index([1.0, nulls_fixture]).isin( - [pd.NaT]), np.array([False, False])) + tm.assert_numpy_array_equal( + Float64Index([1.0, nulls_fixture]).isin([pd.NaT]), np.array([False, False]) + ) @pytest.mark.parametrize("level", [0, -1]) - @pytest.mark.parametrize("index", [ - Index(['qux', 'baz', 'foo', 'bar']), - # Float64Index overrides isin, so must be checked separately - Float64Index([1.0, 2.0, 3.0, 4.0])]) + @pytest.mark.parametrize( + "index", + [ + Index(["qux", "baz", "foo", "bar"]), + # Float64Index overrides isin, so must be checked separately + Float64Index([1.0, 2.0, 3.0, 4.0]), + ], + ) def test_isin_level_kwarg(self, level, index): - values = index.tolist()[-2:] + ['nonexisting'] + values = index.tolist()[-2:] + ["nonexisting"] expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(expected, index.isin(values, level=level)) - index.name = 'foobar' - tm.assert_numpy_array_equal(expected, - index.isin(values, level='foobar')) + index.name = "foobar" + tm.assert_numpy_array_equal(expected, index.isin(values, level="foobar")) @pytest.mark.parametrize("level", [2, 10, -3]) def test_isin_level_kwarg_bad_level_raises(self, level, indices): index = indices - with pytest.raises(IndexError, match='Too many levels'): + with 
pytest.raises(IndexError, match="Too many levels"): index.isin([], level=level) - @pytest.mark.parametrize("label", [1.0, 'foobar', 'xyzzy', np.nan]) + @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan]) def test_isin_level_kwarg_bad_label_raises(self, label, indices): index = indices if isinstance(index, MultiIndex): - index = index.rename(['foo', 'bar']) + index = index.rename(["foo", "bar"]) msg = "'Level {} not found'" else: - index = index.rename('foo') + index = index.rename("foo") msg = r"'Level {} must be same as name \(foo\)'" with pytest.raises(KeyError, match=msg.format(label)): index.isin([], level=label) @@ -1822,21 +2019,24 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @pytest.mark.parametrize("values", [ - [1, 2, 3, 4], - [1., 2., 3., 4.], - [True, True, True, True], - ["foo", "bar", "baz", "qux"], - pd.date_range('2018-01-01', freq='D', periods=4)]) + @pytest.mark.parametrize( + "values", + [ + [1, 2, 3, 4], + [1.0, 2.0, 3.0, 4.0], + [True, True, True, True], + ["foo", "bar", "baz", "qux"], + pd.date_range("2018-01-01", freq="D", periods=4), + ], + ) def test_boolean_cmp(self, values): index = Index(values) - result = (index == values) + result = index == values expected = np.array([True, True, True, True], dtype=bool) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("name,level", [ - (None, 0), ('a', 'a')]) + @pytest.mark.parametrize("name,level", [(None, 0), ("a", "a")]) def test_get_level_values(self, name, level): expected = self.strIndex.copy() if name: @@ -1846,43 +2046,55 @@ def test_get_level_values(self, name, level): tm.assert_index_equal(result, expected) def test_slice_keep_name(self): - index = Index(['a', 'b'], name='asdf') + index = Index(["a", "b"], name="asdf") assert index.name == index[1:].name # instance attributes of the form self.Index - @pytest.mark.parametrize('index_kind', - ['unicode', 'str', 'date', 'int', 'float']) + @pytest.mark.parametrize("index_kind", ["unicode", "str", "date", "int", "float"]) def test_join_self(self, join_type, index_kind): - res = getattr(self, '{0}Index'.format(index_kind)) + res = getattr(self, "{0}Index".format(index_kind)) joined = res.join(res, how=join_type) assert res is joined - @pytest.mark.parametrize("method", ['strip', 'rstrip', 'lstrip']) + @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): # GH9068 - index = Index([' jack', 'jill ', ' jesse ', 'frank']) + index = Index([" jack", "jill ", " jesse ", "frank"]) expected = Index([getattr(str, method)(x) for x in index.values]) result = getattr(index.str, method)() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index", [ - Index(range(5)), tm.makeDateIndex(10), - MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]), - period_range(start='2000', end='2010', freq='A')]) + @pytest.mark.parametrize( + "index", + [ + Index(range(5)), + tm.makeDateIndex(10), + MultiIndex.from_tuples([("foo", "1"), ("bar", "3")]), + period_range(start="2000", end="2010", freq="A"), + ], + ) def test_str_attribute_raises(self, index): - with pytest.raises(AttributeError, match='only use .str accessor'): + with pytest.raises(AttributeError, match="only use .str accessor"): index.str.repeat(2) - @pytest.mark.parametrize("expand,expected", [ - (None, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])), - (False, Index([['a', 'b', 'c'], ['d', 'e'], ['f']])), - (True, MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', 
np.nan), - ('f', np.nan, np.nan)]))]) + @pytest.mark.parametrize( + "expand,expected", + [ + (None, Index([["a", "b", "c"], ["d", "e"], ["f"]])), + (False, Index([["a", "b", "c"], ["d", "e"], ["f"]])), + ( + True, + MultiIndex.from_tuples( + [("a", "b", "c"), ("d", "e", np.nan), ("f", np.nan, np.nan)] + ), + ), + ], + ) def test_str_split(self, expand, expected): - index = Index(['a b c', 'd e', 'f']) + index = Index(["a b c", "d e", "f"]) if expand is not None: result = index.str.split(expand=expand) else: @@ -1892,30 +2104,31 @@ def test_str_split(self, expand, expected): def test_str_bool_return(self): # test boolean case, should return np.array instead of boolean Index - index = Index(['a1', 'a2', 'b1', 'b2']) - result = index.str.startswith('a') + index = Index(["a1", "a2", "b1", "b2"]) + result = index.str.startswith("a") expected = np.array([True, True, False, False]) tm.assert_numpy_array_equal(result, expected) assert isinstance(result, np.ndarray) def test_str_bool_series_indexing(self): - index = Index(['a1', 'a2', 'b1', 'b2']) + index = Index(["a1", "a2", "b1", "b2"]) s = Series(range(4), index=index) - result = s[s.index.str.startswith('a')] - expected = Series(range(2), index=['a1', 'a2']) + result = s[s.index.str.startswith("a")] + expected = Series(range(2), index=["a1", "a2"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("index,expected", [ - (Index(list('abcd')), True), (Index(range(4)), False)]) + @pytest.mark.parametrize( + "index,expected", [(Index(list("abcd")), True), (Index(range(4)), False)] + ) def test_tab_completion(self, index, expected): # GH 9910 - result = 'str' in dir(index) + result = "str" in dir(index) assert result == expected def test_indexing_doesnt_change_class(self): - index = Index([1, 2, 3, 'a', 'b', 'c']) + index = Index([1, 2, 3, "a", "b", "c"]) assert index[1:3].identical(pd.Index([2, 3], dtype=np.object_)) assert index[[0, 1]].identical(pd.Index([1, 2], dtype=np.object_)) @@ -1925,44 +2138,44 @@ def test_outer_join_sort(self): right_index = tm.makeDateIndex(10) with tm.assert_produces_warning(RuntimeWarning): - result = left_index.join(right_index, how='outer') + result = left_index.join(right_index, how="outer") # right_index in this case because DatetimeIndex has join precedence # over Int64Index with tm.assert_produces_warning(RuntimeWarning): - expected = right_index.astype(object).union( - left_index.astype(object)) + expected = right_index.astype(object).union(left_index.astype(object)) tm.assert_index_equal(result, expected) def test_nan_first_take_datetime(self): - index = Index([pd.NaT, Timestamp('20130101'), Timestamp('20130102')]) + index = Index([pd.NaT, Timestamp("20130101"), Timestamp("20130102")]) result = index.take([-1, 0, 1]) expected = Index([index[-1], index[0], index[1]]) tm.assert_index_equal(result, expected) def test_take_fill_value(self): # GH 12631 - index = pd.Index(list('ABC'), name='xxx') + index = pd.Index(list("ABC"), name="xxx") result = index.take(np.array([1, 0, -1])) - expected = pd.Index(list('BAC'), name='xxx') + expected = pd.Index(list("BAC"), name="xxx") tm.assert_index_equal(result, expected) # fill_value result = index.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Index(['B', 'A', np.nan], name='xxx') + expected = pd.Index(["B", "A", np.nan], name="xxx") tm.assert_index_equal(result, expected) # allow_fill=False - result = index.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Index(['B', 'A', 'C'], name='xxx') + result = 
index.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Index(["B", "A", "C"], name="xxx") tm.assert_index_equal(result, expected) def test_take_fill_value_none_raises(self): - index = pd.Index(list('ABC'), name='xxx') - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + index = pd.Index(list("ABC"), name="xxx") + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): index.take(np.array([1, 0, -2]), fill_value=True) @@ -1970,47 +2183,55 @@ def test_take_fill_value_none_raises(self): index.take(np.array([1, 0, -5]), fill_value=True) def test_take_bad_bounds_raises(self): - index = pd.Index(list('ABC'), name='xxx') - with pytest.raises(IndexError, match='out of bounds'): + index = pd.Index(list("ABC"), name="xxx") + with pytest.raises(IndexError, match="out of bounds"): index.take(np.array([1, -5])) - @pytest.mark.parametrize("name", [None, 'foobar']) - @pytest.mark.parametrize("labels", [ - [], np.array([]), ['A', 'B', 'C'], ['C', 'B', 'A'], - np.array(['A', 'B', 'C']), np.array(['C', 'B', 'A']), - # Must preserve name even if dtype changes - pd.date_range('20130101', periods=3).values, - pd.date_range('20130101', periods=3).tolist()]) - def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, - labels): + @pytest.mark.parametrize("name", [None, "foobar"]) + @pytest.mark.parametrize( + "labels", + [ + [], + np.array([]), + ["A", "B", "C"], + ["C", "B", "A"], + np.array(["A", "B", "C"]), + np.array(["C", "B", "A"]), + # Must preserve name even if dtype changes + pd.date_range("20130101", periods=3).values, + pd.date_range("20130101", periods=3).tolist(), + ], + ) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self, name, labels): # GH6552 index = pd.Index([0, 1, 2]) index.name = name assert index.reindex(labels)[0].name == name - @pytest.mark.parametrize("labels", [ - [], np.array([]), np.array([], dtype=np.int64)]) - def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, - labels): + @pytest.mark.parametrize("labels", [[], np.array([]), np.array([], dtype=np.int64)]) + def test_reindex_preserves_type_if_target_is_empty_list_or_array(self, labels): # GH7774 - index = pd.Index(list('abc')) + index = pd.Index(list("abc")) assert index.reindex(labels)[0].dtype.type == np.object_ - @pytest.mark.parametrize("labels,dtype", [ - (pd.Int64Index([]), np.int64), - (pd.Float64Index([]), np.float64), - (pd.DatetimeIndex([]), np.datetime64)]) - def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, - labels, - dtype): + @pytest.mark.parametrize( + "labels,dtype", + [ + (pd.Int64Index([]), np.int64), + (pd.Float64Index([]), np.float64), + (pd.DatetimeIndex([]), np.datetime64), + ], + ) + def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self, labels, dtype): # GH7774 - index = pd.Index(list('abc')) + index = pd.Index(list("abc")) assert index.reindex(labels)[0].dtype.type == dtype def test_reindex_no_type_preserve_target_empty_mi(self): - index = pd.Index(list('abc')) - result = index.reindex(pd.MultiIndex( - [pd.Int64Index([]), pd.Float64Index([])], [[], []]))[0] + index = pd.Index(list("abc")) + result = index.reindex( + pd.MultiIndex([pd.Int64Index([]), pd.Float64Index([])], [[], []]) + )[0] assert result.levels[0].dtype.type == np.int64 assert result.levels[1].dtype.type == np.float64 @@ -2021,121 +2242,161 @@ def test_groupby(self): tm.assert_dict_equal(result, expected) - 
@pytest.mark.parametrize("mi,expected", [ - (MultiIndex.from_tuples([(1, 2), (4, 5)]), np.array([True, True])), - (MultiIndex.from_tuples([(1, 2), (4, 6)]), np.array([True, False]))]) + @pytest.mark.parametrize( + "mi,expected", + [ + (MultiIndex.from_tuples([(1, 2), (4, 5)]), np.array([True, True])), + (MultiIndex.from_tuples([(1, 2), (4, 6)]), np.array([True, False])), + ], + ) def test_equals_op_multiindex(self, mi, expected): # GH9785 # test comparisons of multiindex - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) result = df.index == mi tm.assert_numpy_array_equal(result, expected) def test_equals_op_multiindex_identify(self): - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) result = df.index == df.index expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("index", [ - MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]), - Index(['foo', 'bar', 'baz'])]) + @pytest.mark.parametrize( + "index", + [ + MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]), + Index(["foo", "bar", "baz"]), + ], + ) def test_equals_op_mismatched_multiindex_raises(self, index): - df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + df = pd.read_csv(StringIO("a,b,c\n1,2,3\n4,5,6"), index_col=[0, 1]) with pytest.raises(ValueError, match="Lengths must match"): df.index == index def test_equals_op_index_vs_mi_same_length(self): mi = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]) - index = Index(['foo', 'bar', 'baz']) + index = Index(["foo", "bar", "baz"]) result = mi == index expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dt_conv", [ - pd.to_datetime, pd.to_timedelta]) + @pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) def test_dt_conversion_preserves_name(self, dt_conv): # GH 10875 - index = pd.Index(['01:02:03', '01:02:04'], name='label') + index = pd.Index(["01:02:03", "01:02:04"], name="label") assert index.name == dt_conv(index).name - @pytest.mark.parametrize("index,expected", [ - # ASCII - # short - (pd.Index(['a', 'bb', 'ccc']), - """Index(['a', 'bb', 'ccc'], dtype='object')"""), - # multiple lines - (pd.Index(['a', 'bb', 'ccc'] * 10), - """\ + @pytest.mark.parametrize( + "index,expected", + [ + # ASCII + # short + ( + pd.Index(["a", "bb", "ccc"]), + """Index(['a', 'bb', 'ccc'], dtype='object')""", + ), + # multiple lines + ( + pd.Index(["a", "bb", "ccc"] * 10), + """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - dtype='object')"""), - # truncated - (pd.Index(['a', 'bb', 'ccc'] * 100), - """\ + dtype='object')""", + ), + # truncated + ( + pd.Index(["a", "bb", "ccc"] * 100), + """\ Index(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 
'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - dtype='object', length=300)"""), - - # Non-ASCII - # short - (pd.Index(['あ', 'いい', 'ううう']), - """Index(['あ', 'いい', 'ううう'], dtype='object')"""), - # multiple lines - (pd.Index(['あ', 'いい', 'ううう'] * 10), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう'],\n" - " dtype='object')")), - # truncated - (pd.Index(['あ', 'いい', 'ううう'] * 100), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " - "'あ', 'いい', 'ううう', 'あ',\n" - " ...\n" - " 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう'],\n" - " dtype='object', length=300)"))]) + dtype='object', length=300)""", + ), + # Non-ASCII + # short + ( + pd.Index(["あ", "いい", "ううう"]), + """Index(['あ', 'いい', 'ううう'], dtype='object')""", + ), + # multiple lines + ( + pd.Index(["あ", "いい", "ううう"] * 10), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう'],\n" + " dtype='object')" + ), + ), + # truncated + ( + pd.Index(["あ", "いい", "ううう"] * 100), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + "'あ', 'いい', 'ううう', 'あ',\n" + " ...\n" + " 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう'],\n" + " dtype='object', length=300)" + ), + ), + ], + ) def test_string_index_repr(self, index, expected): result = repr(index) assert result == expected - @pytest.mark.parametrize("index,expected", [ - # short - (pd.Index(['あ', 'いい', 'ううう']), - ("Index(['あ', 'いい', 'ううう'], " - "dtype='object')")), - # multiple lines - (pd.Index(['あ', 'いい', 'ううう'] * 10), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ', 'いい', 'ううう'],\n" - " dtype='object')""")), - # truncated - (pd.Index(['あ', 'いい', 'ううう'] * 100), - ("Index(['あ', 'いい', 'ううう', 'あ', 'いい', " - "'ううう', 'あ', 'いい', 'ううう',\n" - " 'あ',\n" - " ...\n" - " 'ううう', 'あ', 'いい', 'ううう', 'あ', " - "'いい', 'ううう', 'あ', 'いい',\n" - " 'ううう'],\n" - " dtype='object', length=300)"))]) + @pytest.mark.parametrize( + "index,expected", + [ + # short + ( + pd.Index(["あ", "いい", "ううう"]), + ("Index(['あ', 'いい', 'ううう'], " "dtype='object')"), + ), + # multiple lines + ( + pd.Index(["あ", "いい", "ううう"] * 10), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ', 'いい', 'ううう'],\n" + " dtype='object')" + "" + ), + ), + # truncated + ( + pd.Index(["あ", "いい", "ううう"] * 100), + ( + "Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + "'ううう', 'あ', 'いい', 'ううう',\n" + " 'あ',\n" + " ...\n" + " 'ううう', 'あ', 'いい', 'ううう', 'あ', " + "'いい', 'ううう', 'あ', 'いい',\n" + " 'ううう'],\n" + " dtype='object', length=300)" + ), + ), + ], + ) def test_string_index_repr_with_unicode_option(self, index, expected): # Enable Unicode option ----------------------------------------- - with cf.option_context('display.unicode.east_asian_width', True): + with cf.option_context("display.unicode.east_asian_width", True): result = repr(index) assert result == expected @@ -2151,14 +2412,14 @@ def test_get_duplicates_deprecated(self): def test_tab_complete_warning(self, ip): # 
https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('idx.', 4)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("idx.", 4)) def test_deprecated_contains(self): for index in self.indices.values(): @@ -2174,7 +2435,7 @@ class TestMixedIntIndex(Base): _holder = Index def setup_method(self, method): - self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.indices = dict(mixedIndex=Index([0, "a", 1, "b", 2, "c"])) self.setup_indices() def create_index(self): @@ -2203,42 +2464,42 @@ def test_copy_name(self): # GH12309 index = self.create_index() - first = index.__class__(index, copy=True, name='mario') + first = index.__class__(index, copy=True, name="mario") second = first.__class__(first, copy=False) # Even though "copy=False", we want a new object. assert first is not second tm.assert_index_equal(first, second) - assert first.name == 'mario' - assert second.name == 'mario' + assert first.name == "mario" + assert second.name == "mario" s1 = Series(2, index=first) s2 = Series(3, index=second[:-1]) s3 = s1 * s2 - assert s3.index.name == 'mario' + assert s3.index.name == "mario" def test_copy_name2(self): # Check that adding a "name" parameter to the copy is honored # GH14302 - index = pd.Index([1, 2], name='MyName') + index = pd.Index([1, 2], name="MyName") index1 = index.copy() tm.assert_index_equal(index, index1) - index2 = index.copy(name='NewName') + index2 = index.copy(name="NewName") tm.assert_index_equal(index, index2, check_names=False) - assert index.name == 'MyName' - assert index2.name == 'NewName' + assert index.name == "MyName" + assert index2.name == "NewName" - index3 = index.copy(names=['NewName']) + index3 = index.copy(names=["NewName"]) tm.assert_index_equal(index, index3, check_names=False) - assert index.name == 'MyName' - assert index.names == ['MyName'] - assert index3.name == 'NewName' - assert index3.names == ['NewName'] + assert index.name == "MyName" + assert index.names == ["MyName"] + assert index3.name == "NewName" + assert index3.names == ["NewName"] def test_union_base(self): index = self.create_index() @@ -2247,11 +2508,10 @@ def test_union_base(self): result = first.union(second) - expected = Index([0, 1, 2, 'a', 'b', 'c']) + expected = Index([0, 1, 2, "a", "b", "c"]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) def test_union_different_type_base(self, klass): # GH 10149 index = self.create_index() @@ -2263,8 +2523,8 @@ def test_union_different_type_base(self, klass): assert tm.equalContents(result, index) def test_unique_na(self): - idx = pd.Index([2, np.nan, 2, 1], name='my_index') - expected = pd.Index([2, np.nan, 1], name='my_index') + idx = pd.Index([2, np.nan, 2, 1], name="my_index") + expected = pd.Index([2, np.nan, 1], name="my_index") result = idx.unique() tm.assert_index_equal(result, expected) @@ -2275,12 +2535,11 @@ def test_intersection_base(self, sort): first = index[:5] second = index[:3] - expected = Index([0, 1, 'a']) if sort is None else Index([0, 'a', 1]) + expected = Index([0, 1, "a"]) if sort is None else Index([0, "a", 1]) result = 
first.intersection(second, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("klass", [ - np.array, Series, list]) + @pytest.mark.parametrize("klass", [np.array, Series, list]) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_different_type_base(self, klass, sort): # GH 10149 @@ -2299,7 +2558,7 @@ def test_difference_base(self, sort): second = index[3:] result = first.difference(second, sort) - expected = Index([0, 'a', 1]) + expected = Index([0, "a", 1]) if sort is None: expected = Index(safe_sort(expected)) tm.assert_index_equal(result, expected) @@ -2311,7 +2570,7 @@ def test_symmetric_difference(self): second = index[3:] result = first.symmetric_difference(second) - expected = Index([0, 1, 2, 'a', 'c']) + expected = Index([0, 1, 2, "a", "c"]) tm.assert_index_equal(result, expected) def test_logical_compat(self): @@ -2319,14 +2578,18 @@ def test_logical_compat(self): assert index.all() == index.values.all() assert index.any() == index.values.any() - @pytest.mark.parametrize("how", ['any', 'all']) - @pytest.mark.parametrize("dtype", [ - None, object, 'category']) - @pytest.mark.parametrize("vals,expected", [ - ([1, 2, 3], [1, 2, 3]), ([1., 2., 3.], [1., 2., 3.]), - ([1., 2., np.nan, 3.], [1., 2., 3.]), - (['A', 'B', 'C'], ['A', 'B', 'C']), - (['A', np.nan, 'B', 'C'], ['A', 'B', 'C'])]) + @pytest.mark.parametrize("how", ["any", "all"]) + @pytest.mark.parametrize("dtype", [None, object, "category"]) + @pytest.mark.parametrize( + "vals,expected", + [ + ([1, 2, 3], [1, 2, 3]), + ([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]), + ([1.0, 2.0, np.nan, 3.0], [1.0, 2.0, 3.0]), + (["A", "B", "C"], ["A", "B", "C"]), + (["A", np.nan, "B", "C"], ["A", "B", "C"]), + ], + ) def test_dropna(self, how, dtype, vals, expected): # GH 6194 index = pd.Index(vals, dtype=dtype) @@ -2334,20 +2597,36 @@ def test_dropna(self, how, dtype, vals, expected): expected = pd.Index(expected, dtype=dtype) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("how", ['any', 'all']) - @pytest.mark.parametrize("index,expected", [ - (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']), - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])), - (pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', pd.NaT]), - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'])), - (pd.TimedeltaIndex(['1 days', '2 days', '3 days']), - pd.TimedeltaIndex(['1 days', '2 days', '3 days'])), - (pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', '3 days', pd.NaT]), - pd.TimedeltaIndex(['1 days', '2 days', '3 days'])), - (pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'), - pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M')), - (pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], freq='M'), - pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M'))]) + @pytest.mark.parametrize("how", ["any", "all"]) + @pytest.mark.parametrize( + "index,expected", + [ + ( + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + ), + ( + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", pd.NaT]), + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"]), + ), + ( + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + ), + ( + pd.TimedeltaIndex([pd.NaT, "1 days", "2 days", "3 days", pd.NaT]), + pd.TimedeltaIndex(["1 days", "2 days", "3 days"]), + ), + ( + pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + 
pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + ), + ( + pd.PeriodIndex(["2012-02", "2012-04", "NaT", "2012-05"], freq="M"), + pd.PeriodIndex(["2012-02", "2012-04", "2012-05"], freq="M"), + ), + ], + ) def test_dropna_dt_like(self, how, index, expected): result = index.dropna(how=how) tm.assert_index_equal(result, expected) @@ -2355,7 +2634,7 @@ def test_dropna_dt_like(self, how, index, expected): def test_dropna_invalid_how_raises(self): msg = "invalid how option: xxx" with pytest.raises(ValueError, match=msg): - pd.Index([1, 2, 3]).dropna(how='xxx') + pd.Index([1, 2, 3]).dropna(how="xxx") def test_get_combined_index(self): result = _get_combined_index([]) @@ -2370,12 +2649,19 @@ def test_repeat(self): result = index.repeat(repeats) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("index", [ - pd.Index([np.nan]), pd.Index([np.nan, 1]), - pd.Index([1, 2, np.nan]), pd.Index(['a', 'b', np.nan]), - pd.to_datetime(['NaT']), pd.to_datetime(['NaT', '2000-01-01']), - pd.to_datetime(['2000-01-01', 'NaT', '2000-01-02']), - pd.to_timedelta(['1 day', 'NaT'])]) + @pytest.mark.parametrize( + "index", + [ + pd.Index([np.nan]), + pd.Index([np.nan, 1]), + pd.Index([1, 2, np.nan]), + pd.Index(["a", "b", np.nan]), + pd.to_datetime(["NaT"]), + pd.to_datetime(["NaT", "2000-01-01"]), + pd.to_datetime(["2000-01-01", "NaT", "2000-01-02"]), + pd.to_timedelta(["1 day", "NaT"]), + ], + ) def test_is_monotonic_na(self, index): assert index.is_monotonic_increasing is False assert index.is_monotonic_decreasing is False @@ -2383,21 +2669,19 @@ def test_is_monotonic_na(self, index): assert index._is_strictly_monotonic_decreasing is False def test_repr_summary(self): - with cf.option_context('display.max_seq_items', 10): + with cf.option_context("display.max_seq_items", 10): result = repr(pd.Index(np.arange(1000))) assert len(result) < 200 assert "..." 
in result @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_int_name_format(self, klass): - index = Index(['a', 'b', 'c'], name=0) + index = Index(["a", "b", "c"], name=0) result = klass(list(range(3)), index=index) - assert '0' in repr(result) + assert "0" in repr(result) def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], - "\u05d1": [4, 5, 6], - "c": [7, 8, 9]}) + df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) repr(df.columns) # should not raise UnicodeDecodeError def test_str_to_bytes_raises(self): @@ -2411,7 +2695,7 @@ def test_intersect_str_dates(self): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] index1 = Index(dt_dates, dtype=object) - index2 = Index(['aa'], dtype=object) + index2 = Index(["aa"], dtype=object) result = index2.intersection(index1) expected = Index([], dtype=object) @@ -2419,78 +2703,107 @@ def test_intersect_str_dates(self): class TestIndexUtils: - - @pytest.mark.parametrize('data, names, expected', [ - ([[1, 2, 3]], None, Index([1, 2, 3])), - ([[1, 2, 3]], ['name'], Index([1, 2, 3], name='name')), - ([['a', 'a'], ['c', 'd']], None, - MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]])), - ([['a', 'a'], ['c', 'd']], ['L1', 'L2'], - MultiIndex([['a'], ['c', 'd']], [[0, 0], [0, 1]], - names=['L1', 'L2'])), - ]) + @pytest.mark.parametrize( + "data, names, expected", + [ + ([[1, 2, 3]], None, Index([1, 2, 3])), + ([[1, 2, 3]], ["name"], Index([1, 2, 3], name="name")), + ( + [["a", "a"], ["c", "d"]], + None, + MultiIndex([["a"], ["c", "d"]], [[0, 0], [0, 1]]), + ), + ( + [["a", "a"], ["c", "d"]], + ["L1", "L2"], + MultiIndex([["a"], ["c", "d"]], [[0, 0], [0, 1]], names=["L1", "L2"]), + ), + ], + ) def test_ensure_index_from_sequences(self, data, names, expected): result = ensure_index_from_sequences(data, names) tm.assert_index_equal(result, expected) def test_ensure_index_mixed_closed_intervals(self): # GH27172 - intervals = [pd.Interval(0, 1, closed='left'), - pd.Interval(1, 2, closed='right'), - pd.Interval(2, 3, closed='neither'), - pd.Interval(3, 4, closed='both')] + intervals = [ + pd.Interval(0, 1, closed="left"), + pd.Interval(1, 2, closed="right"), + pd.Interval(2, 3, closed="neither"), + pd.Interval(3, 4, closed="both"), + ] result = ensure_index(intervals) expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('opname', ['eq', 'ne', 'le', 'lt', 'ge', 'gt', - 'add', 'radd', 'sub', 'rsub', - 'mul', 'rmul', 'truediv', 'rtruediv', - 'floordiv', 'rfloordiv', - 'pow', 'rpow', 'mod', 'divmod']) +@pytest.mark.parametrize( + "opname", + [ + "eq", + "ne", + "le", + "lt", + "ge", + "gt", + "add", + "radd", + "sub", + "rsub", + "mul", + "rmul", + "truediv", + "rtruediv", + "floordiv", + "rfloordiv", + "pow", + "rpow", + "mod", + "divmod", + ], +) def test_generated_op_names(opname, indices): index = indices - if isinstance(index, ABCIndex) and opname == 'rsub': + if isinstance(index, ABCIndex) and opname == "rsub": # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return - opname = '__{name}__'.format(name=opname) + opname = "__{name}__".format(name=opname) method = getattr(index, opname) assert method.__name__ == opname -@pytest.mark.parametrize('index_maker', tm.index_subclass_makers_generator()) +@pytest.mark.parametrize("index_maker", tm.index_subclass_makers_generator()) def test_index_subclass_constructor_wrong_kwargs(index_maker): # GH #19348 - with pytest.raises(TypeError, match='unexpected keyword argument'): - index_maker(foo='bar') + with pytest.raises(TypeError, match="unexpected keyword argument"): + index_maker(foo="bar") def test_deprecated_fastpath(): with tm.assert_produces_warning(FutureWarning): - idx = pd.Index( - np.array(['a', 'b'], dtype=object), name='test', fastpath=True) + idx = pd.Index(np.array(["a", "b"], dtype=object), name="test", fastpath=True) - expected = pd.Index(['a', 'b'], name='test') + expected = pd.Index(["a", "b"], name="test") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): idx = pd.Int64Index( - np.array([1, 2, 3], dtype='int64'), name='test', fastpath=True) + np.array([1, 2, 3], dtype="int64"), name="test", fastpath=True + ) - expected = pd.Index([1, 2, 3], name='test', dtype='int64') + expected = pd.Index([1, 2, 3], name="test", dtype="int64") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): - idx = pd.RangeIndex(0, 5, 2, name='test', fastpath=True) + idx = pd.RangeIndex(0, 5, 2, name="test", fastpath=True) - expected = pd.RangeIndex(0, 5, 2, name='test') + expected = pd.RangeIndex(0, 5, 2, name="test") tm.assert_index_equal(idx, expected) with tm.assert_produces_warning(FutureWarning): - idx = pd.CategoricalIndex(['a', 'b', 'c'], name='test', fastpath=True) + idx = pd.CategoricalIndex(["a", "b", "c"], name="test", fastpath=True) - expected = pd.CategoricalIndex(['a', 'b', 'c'], name='test') + expected = pd.CategoricalIndex(["a", "b", "c"], name="test") tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 3b5092c901061..d52bc818c95aa 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -25,18 +25,17 @@ def setup_method(self, method): def create_index(self, categories=None, ordered=False): if categories is None: - categories = list('cab') - return CategoricalIndex( - list('aabbca'), categories=categories, ordered=ordered) + categories = list("cab") + return CategoricalIndex(list("aabbca"), categories=categories, ordered=ordered) def test_can_hold_identifiers(self): - idx = self.create_index(categories=list('abcd')) + idx = self.create_index(categories=list("abcd")) key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True def test_construction(self): - ci = self.create_index(categories=list('abcd')) + ci = self.create_index(categories=list("abcd")) categories = ci.categories result = Index(ci) @@ -50,56 +49,57 @@ def test_construction(self): # empty result = CategoricalIndex(categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, np.array([], dtype='int8')) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) assert not result.ordered # passing categories - result = CategoricalIndex(list('aabbca'), categories=categories) + result = CategoricalIndex(list("aabbca"), categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - 
np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) - c = pd.Categorical(list('aabbca')) + c = pd.Categorical(list("aabbca")) result = CategoricalIndex(c) - tm.assert_index_equal(result.categories, Index(list('abc'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered result = CategoricalIndex(c, categories=categories) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered - ci = CategoricalIndex(c, categories=list('abcd')) + ci = CategoricalIndex(c, categories=list("abcd")) result = CategoricalIndex(ci) tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, 2, 0], dtype='int8')) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) assert not result.ordered - result = CategoricalIndex(ci, categories=list('ab')) - tm.assert_index_equal(result.categories, Index(list('ab'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, -1, 0], dtype='int8')) + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) assert not result.ordered - result = CategoricalIndex(ci, categories=list('ab'), ordered=True) - tm.assert_index_equal(result.categories, Index(list('ab'))) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1, - 1, -1, 0], dtype='int8')) + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) assert result.ordered - result = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True) - expected = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True, - dtype='category') + result = pd.CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = pd.CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) tm.assert_index_equal(result, expected, exact=True) # turn me to an Index @@ -110,24 +110,22 @@ def test_construction(self): def test_construction_with_dtype(self): # specify dtype - ci = self.create_index(categories=list('abc')) + ci = self.create_index(categories=list("abc")) - result = Index(np.array(ci), dtype='category') + result = Index(np.array(ci), dtype="category") tm.assert_index_equal(result, ci, exact=True) - result = Index(np.array(ci).tolist(), dtype='category') + result = Index(np.array(ci).tolist(), dtype="category") tm.assert_index_equal(result, ci, exact=True) # these are generally only equal when the categories are reordered ci = self.create_index() - result = Index( - np.array(ci), dtype='category').reorder_categories(ci.categories) + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) tm.assert_index_equal(result, ci, exact=True) # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], 
- ordered=True) + expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) idx = Index(range(3)) result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) @@ -141,7 +139,7 @@ def test_construction_empty_with_bool_categories(self): def test_construction_with_categorical_dtype(self): # construction with CategoricalDtype # GH18109 - data, cats, ordered = 'a a b b'.split(), 'c b a'.split(), True + data, cats, ordered = "a a b b".split(), "c b a".split(), True dtype = CategoricalDtype(categories=cats, ordered=ordered) result = CategoricalIndex(data, dtype=dtype) @@ -170,93 +168,108 @@ def test_create_categorical(self): # https://github.com/pandas-dev/pandas/pull/17513 # The public CI constructor doesn't hit this code path with # instances of CategoricalIndex, but we still want to test the code - ci = CategoricalIndex(['a', 'b', 'c']) + ci = CategoricalIndex(["a", "b", "c"]) # First ci is self, second ci is data. result = CategoricalIndex._create_categorical(ci, ci) - expected = Categorical(['a', 'b', 'c']) + expected = Categorical(["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('func,op_name', [ - (lambda idx: idx - idx, '__sub__'), - (lambda idx: idx + idx, '__add__'), - (lambda idx: idx - ['a', 'b'], '__sub__'), - (lambda idx: idx + ['a', 'b'], '__add__'), - (lambda idx: ['a', 'b'] - idx, '__rsub__'), - (lambda idx: ['a', 'b'] + idx, '__radd__'), - ]) + @pytest.mark.parametrize( + "func,op_name", + [ + (lambda idx: idx - idx, "__sub__"), + (lambda idx: idx + idx, "__add__"), + (lambda idx: idx - ["a", "b"], "__sub__"), + (lambda idx: idx + ["a", "b"], "__add__"), + (lambda idx: ["a", "b"] - idx, "__rsub__"), + (lambda idx: ["a", "b"] + idx, "__radd__"), + ], + ) def test_disallow_set_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError - idx = pd.Index(pd.Categorical(['a', 'b'])) + idx = pd.Index(pd.Categorical(["a", "b"])) msg = "cannot perform {} with this index type: CategoricalIndex" with pytest.raises(TypeError, match=msg.format(op_name)): func(idx) def test_method_delegation(self): - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) - result = ci.set_categories(list('cab')) - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cab'))) + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) + result = ci.set_categories(list("cab")) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cab")) + ) - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.rename_categories(list('efg')) - tm.assert_index_equal(result, CategoricalIndex( - list('ffggef'), categories=list('efg'))) + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.rename_categories(list("efg")) + tm.assert_index_equal( + result, CategoricalIndex(list("ffggef"), categories=list("efg")) + ) # GH18862 (let rename_categories take callables) result = ci.rename_categories(lambda x: x.upper()) - tm.assert_index_equal(result, CategoricalIndex( - list('AABBCA'), categories=list('CAB'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.add_categories(['d']) - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cabd'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cab')) - result = ci.remove_categories(['c']) - tm.assert_index_equal(result, CategoricalIndex( - list('aabb') + [np.nan] + ['a'], 
categories=list('ab'))) - - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + tm.assert_index_equal( + result, CategoricalIndex(list("AABBCA"), categories=list("CAB")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.add_categories(["d"]) + tm.assert_index_equal( + result, CategoricalIndex(list("aabbca"), categories=list("cabd")) + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cab")) + result = ci.remove_categories(["c"]) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabb") + [np.nan] + ["a"], categories=list("ab")), + ) + + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_unordered() tm.assert_index_equal(result, ci) - ci = CategoricalIndex(list('aabbca'), categories=list('cabdef')) + ci = CategoricalIndex(list("aabbca"), categories=list("cabdef")) result = ci.as_ordered() - tm.assert_index_equal(result, CategoricalIndex( - list('aabbca'), categories=list('cabdef'), ordered=True)) + tm.assert_index_equal( + result, + CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=True), + ) # invalid msg = "cannot use inplace with CategoricalIndex" with pytest.raises(ValueError, match=msg): - ci.set_categories(list('cab'), inplace=True) + ci.set_categories(list("cab"), inplace=True) def test_contains(self): - ci = self.create_index(categories=list('cabdef')) + ci = self.create_index(categories=list("cabdef")) - assert 'a' in ci - assert 'z' not in ci - assert 'e' not in ci + assert "a" in ci + assert "z" not in ci + assert "e" not in ci assert np.nan not in ci # assert codes NOT in index assert 0 not in ci assert 1 not in ci - ci = CategoricalIndex( - list('aabbca') + [np.nan], categories=list('cabdef')) + ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) assert np.nan in ci - @pytest.mark.parametrize('item, expected', [ - (pd.Interval(0, 1), True), - (1.5, True), - (pd.Interval(0.5, 1.5), False), - ('a', False), - (pd.Timestamp(1), False), - (pd.Timedelta(1), False)], ids=str) + @pytest.mark.parametrize( + "item, expected", + [ + (pd.Interval(0, 1), True), + (1.5, True), + (pd.Interval(0.5, 1.5), False), + ("a", False), + (pd.Timestamp(1), False), + (pd.Timedelta(1), False), + ], + ids=str, + ) def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) @@ -264,49 +277,47 @@ def test_contains_interval(self, item, expected): assert result is expected def test_map(self): - ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), - ordered=True) + ci = pd.CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'), - ordered=True) + exp = pd.CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) tm.assert_index_equal(result, exp) - ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), - ordered=False, name='XXX') + ci = pd.CategoricalIndex( + list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" + ) result = ci.map(lambda x: x.lower()) - exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'), - ordered=False, name='XXX') + exp = pd.CategoricalIndex( + list("ababc"), categories=list("bac"), ordered=False, name="XXX" + ) tm.assert_index_equal(result, exp) # GH 12766: Return an index not an array - tm.assert_index_equal(ci.map(lambda x: 1), - Index(np.array([1] * 5, dtype=np.int64), - name='XXX')) + tm.assert_index_equal( + 
ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX") + ) # change categories dtype - ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), - ordered=False) + ci = pd.CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) def f(x): - return {'A': 10, 'B': 20, 'C': 30}.get(x) + return {"A": 10, "B": 20, "C": 30}.get(x) result = ci.map(f) - exp = pd.CategoricalIndex([10, 20, 10, 20, 30], - categories=[20, 10, 30], - ordered=False) + exp = pd.CategoricalIndex( + [10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False + ) tm.assert_index_equal(result, exp) - result = ci.map(pd.Series([10, 20, 30], index=['A', 'B', 'C'])) + result = ci.map(pd.Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) - result = ci.map({'A': 10, 'B': 20, 'C': 30}) + result = ci.map({"A": 10, "B": 20, "C": 30}) tm.assert_index_equal(result, exp) def test_map_with_categorical_series(self): # GH 12756 a = pd.Index([1, 2, 3, 4]) - b = pd.Series(["even", "odd", "even", "odd"], - dtype="category") + b = pd.Series(["even", "odd", "even", "odd"], dtype="category") c = pd.Series(["even", "odd", "even", "odd"]) exp = CategoricalIndex(["odd", "even", "odd", np.nan]) @@ -315,18 +326,16 @@ def test_map_with_categorical_series(self): tm.assert_index_equal(a.map(c), exp) @pytest.mark.parametrize( - ( - 'data', - 'f' - ), + ("data", "f"), ( ([1, 1, np.nan], pd.isna), ([1, 2, np.nan], pd.isna), ([1, 1, np.nan], {1: False}), ([1, 2, np.nan], {1: False, 2: False}), ([1, 1, np.nan], pd.Series([False, False])), - ([1, 2, np.nan], pd.Series([False, False, False])) - )) + ([1, 2, np.nan], pd.Series([False, False, False])), + ), + ) def test_map_with_nan(self, data, f): # GH 24241 values = pd.Categorical(data) result = values.map(f) @@ -337,7 +346,7 @@ def test_map_with_nan(self, data, f): # GH 24241 expected = pd.Index([False, False, np.nan]) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) def test_where(self, klass): i = self.create_index() cond = [True] * len(i) @@ -346,8 +355,7 @@ def test_where(self, klass): tm.assert_index_equal(result, expected) cond = [False] + [True] * (len(i) - 1) - expected = CategoricalIndex([np.nan] + i[1:].tolist(), - categories=i.categories) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories) result = i.where(klass(cond)) tm.assert_index_equal(result, expected) @@ -371,31 +379,31 @@ def test_append(self): # appending with different categories or reordered is not ok msg = "all inputs must be Index" with pytest.raises(TypeError, match=msg): - ci.append(ci.values.set_categories(list('abcd'))) + ci.append(ci.values.set_categories(list("abcd"))) with pytest.raises(TypeError, match=msg): - ci.append(ci.values.reorder_categories(list('abc'))) + ci.append(ci.values.reorder_categories(list("abc"))) # with objects - result = ci.append(Index(['c', 'a'])) - expected = CategoricalIndex(list('aabbcaca'), categories=categories) + result = ci.append(Index(["c", "a"])) + expected = CategoricalIndex(list("aabbcaca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid objects msg = "cannot append a non-category item to a CategoricalIndex" with pytest.raises(TypeError, match=msg): - ci.append(Index(['a', 'd'])) + ci.append(Index(["a", "d"])) # GH14298 - if base object is not categorical -> coerce to object - result = Index(['c', 'a']).append(ci) - expected 
= Index(list('caaabbca')) + result = Index(["c", "a"]).append(ci) + expected = Index(list("caaabbca")) tm.assert_index_equal(result, expected, exact=True) def test_append_to_another(self): # hits _concat_index_asobject - fst = Index(['a', 'b']) - snd = CategoricalIndex(['d', 'e']) + fst = Index(["a", "b"]) + snd = CategoricalIndex(["d", "e"]) result = fst.append(snd) - expected = Index(['a', 'b', 'd', 'e']) + expected = Index(["a", "b", "d", "e"]) tm.assert_index_equal(result, expected) def test_insert(self): @@ -404,30 +412,32 @@ def test_insert(self): categories = ci.categories # test 0th element - result = ci.insert(0, 'a') - expected = CategoricalIndex(list('aaabbca'), categories=categories) + result = ci.insert(0, "a") + expected = CategoricalIndex(list("aaabbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test Nth element that follows Python list behavior - result = ci.insert(-1, 'a') - expected = CategoricalIndex(list('aabbcaa'), categories=categories) + result = ci.insert(-1, "a") + expected = CategoricalIndex(list("aabbcaa"), categories=categories) tm.assert_index_equal(result, expected, exact=True) # test empty - result = CategoricalIndex(categories=categories).insert(0, 'a') - expected = CategoricalIndex(['a'], categories=categories) + result = CategoricalIndex(categories=categories).insert(0, "a") + expected = CategoricalIndex(["a"], categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid - msg = ("cannot insert an item into a CategoricalIndex that is not" - " already an existing category") + msg = ( + "cannot insert an item into a CategoricalIndex that is not" + " already an existing category" + ) with pytest.raises(TypeError, match=msg): - ci.insert(0, 'd') + ci.insert(0, "d") # GH 18295 (test missing) - expected = CategoricalIndex(['a', np.nan, 'a', 'b', 'c', 'b']) + expected = CategoricalIndex(["a", np.nan, "a", "b", "c", "b"]) for na in (np.nan, pd.NaT, None): - result = CategoricalIndex(list('aabcb')).insert(1, na) + result = CategoricalIndex(list("aabcb")).insert(1, na) tm.assert_index_equal(result, expected) def test_delete(self): @@ -436,11 +446,11 @@ def test_delete(self): categories = ci.categories result = ci.delete(0) - expected = CategoricalIndex(list('abbca'), categories=categories) + expected = CategoricalIndex(list("abbca"), categories=categories) tm.assert_index_equal(result, expected, exact=True) result = ci.delete(-1) - expected = CategoricalIndex(list('aabbc'), categories=categories) + expected = CategoricalIndex(list("aabbc"), categories=categories) tm.assert_index_equal(result, expected, exact=True) with pytest.raises((IndexError, ValueError)): @@ -459,23 +469,22 @@ def test_astype(self): assert not isinstance(result, CategoricalIndex) # interval - ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], - right=[2, 4], - closed='right') + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") - ci = CategoricalIndex(Categorical.from_codes( - [0, 1, -1], categories=ii, ordered=True)) + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) - result = ci.astype('interval') + result = ci.astype("interval") expected = ii.take([0, 1, -1]) tm.assert_index_equal(result, expected) result = IntervalIndex(result.values) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('index_ordered', [True, False]) + 
@pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) def test_astype_category(self, name, dtype_ordered, index_ordered): # GH 18630 index = self.create_index(ordered=index_ordered) @@ -485,10 +494,12 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): # standard categories dtype = CategoricalDtype(ordered=dtype_ordered) result = index.astype(dtype) - expected = CategoricalIndex(index.tolist(), - name=name, - categories=index.categories, - ordered=dtype_ordered) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) tm.assert_index_equal(result, expected) # non-standard categories @@ -499,17 +510,18 @@ def test_astype_category(self, name, dtype_ordered, index_ordered): if dtype_ordered is False: # dtype='category' can't specify ordered, so only test once - result = index.astype('category') + result = index.astype("category") expected = index tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_astype_category_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) - idx = CategoricalIndex(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) + idx = CategoricalIndex(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning): idx.astype(cdt2) @@ -550,59 +562,52 @@ def test_reindexing(self): tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): - c = CategoricalIndex(['a', 'b', 'c', 'a']) - res, indexer = c.reindex(['a', 'c']) - tm.assert_index_equal(res, Index(['a', 'a', 'c']), exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(["a", "c"]) + tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a']) - res, indexer = c.reindex(Categorical(['a', 'c'])) + c = CategoricalIndex(["a", "b", "c", "a"]) + res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - res, indexer = c.reindex(['a', 'c']) - exp = Index(['a', 'a', 'c'], dtype='object') + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(["a", "c"]) + exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - 
res, indexer = c.reindex(Categorical(['a', 'c'])) - exp = CategoricalIndex(['a', 'a', 'c'], categories=['a', 'c']) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(Categorical(["a", "c"])) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): # See GH23963 - c = CategoricalIndex(['a', 'b', 'c', 'a'], - categories=['a', 'b', 'c', 'd']) - with pytest.raises(ValueError, match='non-unique indexer'): - c.reindex(['a', 'a', 'c']) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match="non-unique indexer"): + c.reindex(["a", "a", "c"]) - with pytest.raises(ValueError, match='non-unique indexer'): - c.reindex(CategoricalIndex(['a', 'a', 'c'], - categories=['a', 'b', 'c', 'd'])) + with pytest.raises(ValueError, match="non-unique indexer"): + c.reindex( + CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) + ) def test_reindex_empty_index(self): # See GH16770 c = CategoricalIndex([]) - res, indexer = c.reindex(['a', 'b']) - tm.assert_index_equal(res, Index(['a', 'b']), exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([-1, -1], dtype=np.intp)) - - @pytest.mark.parametrize('data, non_lexsorted_data', [ - [[1, 2, 3], [9, 0, 1, 2, 3]], - [list('abc'), list('fabcd')], - ]) + res, indexer = c.reindex(["a", "b"]) + tm.assert_index_equal(res, Index(["a", "b"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) + + @pytest.mark.parametrize( + "data, non_lexsorted_data", + [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], + ) def test_is_monotonic(self, data, non_lexsorted_data): c = CategoricalIndex(data) assert c.is_monotonic_increasing is True @@ -639,86 +644,89 @@ def test_is_monotonic(self, data, non_lexsorted_data): def test_has_duplicates(self): - idx = CategoricalIndex([0, 0, 0], name='foo') + idx = CategoricalIndex([0, 0, 0], name="foo") assert idx.is_unique is False assert idx.has_duplicates is True def test_drop_duplicates(self): - idx = CategoricalIndex([0, 0, 0], name='foo') - expected = CategoricalIndex([0], name='foo') + idx = CategoricalIndex([0, 0, 0], name="foo") + expected = CategoricalIndex([0], name="foo") tm.assert_index_equal(idx.drop_duplicates(), expected) tm.assert_index_equal(idx.unique(), expected) def test_get_indexer(self): - idx1 = CategoricalIndex(list('aabcde'), categories=list('edabc')) - idx2 = CategoricalIndex(list('abf')) + idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) + idx2 = CategoricalIndex(list("abf")) - for indexer in [idx2, list('abf'), Index(list('abf'))]: + for indexer in [idx2, list("abf"), Index(list("abf"))]: r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) - msg = ("method='pad' and method='backfill' not implemented yet for" - " CategoricalIndex") + msg = ( + "method='pad' and method='backfill' not implemented yet for" + " CategoricalIndex" + ) with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='pad') + idx2.get_indexer(idx1, method="pad") with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='backfill') + idx2.get_indexer(idx1, method="backfill") msg = "method='nearest' not implemented yet for 
CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): - idx2.get_indexer(idx1, method='nearest') + idx2.get_indexer(idx1, method="nearest") def test_get_loc(self): # GH 12531 - cidx1 = CategoricalIndex(list('abcde'), categories=list('edabc')) - idx1 = Index(list('abcde')) - assert cidx1.get_loc('a') == idx1.get_loc('a') - assert cidx1.get_loc('e') == idx1.get_loc('e') + cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc")) + idx1 = Index(list("abcde")) + assert cidx1.get_loc("a") == idx1.get_loc("a") + assert cidx1.get_loc("e") == idx1.get_loc("e") for i in [cidx1, idx1]: with pytest.raises(KeyError): - i.get_loc('NOT-EXIST') + i.get_loc("NOT-EXIST") # non-unique - cidx2 = CategoricalIndex(list('aacded'), categories=list('edabc')) - idx2 = Index(list('aacded')) + cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc")) + idx2 = Index(list("aacded")) # results in bool array - res = cidx2.get_loc('d') - tm.assert_numpy_array_equal(res, idx2.get_loc('d')) - tm.assert_numpy_array_equal(res, np.array([False, False, False, - True, False, True])) + res = cidx2.get_loc("d") + tm.assert_numpy_array_equal(res, idx2.get_loc("d")) + tm.assert_numpy_array_equal( + res, np.array([False, False, False, True, False, True]) + ) # unique element results in scalar - res = cidx2.get_loc('e') - assert res == idx2.get_loc('e') + res = cidx2.get_loc("e") + assert res == idx2.get_loc("e") assert res == 4 for i in [cidx2, idx2]: with pytest.raises(KeyError): - i.get_loc('NOT-EXIST') + i.get_loc("NOT-EXIST") # non-unique, sliceable - cidx3 = CategoricalIndex(list('aabbb'), categories=list('abc')) - idx3 = Index(list('aabbb')) + cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc")) + idx3 = Index(list("aabbb")) # results in slice - res = cidx3.get_loc('a') - assert res == idx3.get_loc('a') + res = cidx3.get_loc("a") + assert res == idx3.get_loc("a") assert res == slice(0, 2, None) - res = cidx3.get_loc('b') - assert res == idx3.get_loc('b') + res = cidx3.get_loc("b") + assert res == idx3.get_loc("b") assert res == slice(2, 5, None) for i in [cidx3, idx3]: with pytest.raises(KeyError): - i.get_loc('c') + i.get_loc("c") def test_repr_roundtrip(self): - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) str(ci) tm.assert_index_equal(eval(repr(ci)), ci, exact=True) @@ -732,30 +740,30 @@ def test_repr_roundtrip(self): def test_isin(self): - ci = CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) tm.assert_numpy_array_equal( - ci.isin(['c']), - np.array([False, False, False, True, False, False])) + ci.isin(["c"]), np.array([False, False, False, True, False, False]) + ) tm.assert_numpy_array_equal( - ci.isin(['c', 'a', 'b']), np.array([True] * 5 + [False])) + ci.isin(["c", "a", "b"]), np.array([True] * 5 + [False]) + ) tm.assert_numpy_array_equal( - ci.isin(['c', 'a', 'b', np.nan]), np.array([True] * 6)) + ci.isin(["c", "a", "b", np.nan]), np.array([True] * 6) + ) # mismatched categorical -> coerced to ndarray so doesn't matter - result = ci.isin(ci.set_categories(list('abcdefghi'))) + result = ci.isin(ci.set_categories(list("abcdefghi"))) expected = np.array([True] * 6) tm.assert_numpy_array_equal(result, expected) - result = ci.isin(ci.set_categories(list('defghi'))) + result = ci.isin(ci.set_categories(list("defghi"))) expected = np.array([False] * 5 + [True]) 
tm.assert_numpy_array_equal(result, expected) def test_identical(self): - ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) - ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], - ordered=True) + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.identical(ci1) assert ci1.identical(ci1.copy()) assert not ci1.identical(ci2) @@ -777,9 +785,8 @@ def test_ensure_copied_data(self): assert _base(index.values) is _base(result.values) def test_equals_categorical(self): - ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) - ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], - ordered=True) + ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) + ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.equals(ci1) assert not ci1.equals(ci2) @@ -794,71 +801,69 @@ def test_equals_categorical(self): assert (ci1 >= ci1).all() assert not (ci1 == 1).all() - assert (ci1 == Index(['a', 'b'])).all() + assert (ci1 == Index(["a", "b"])).all() assert (ci1 == ci1.values).all() # invalid comparisons with pytest.raises(ValueError, match="Lengths must match"): - ci1 == Index(['a', 'b', 'c']) + ci1 == Index(["a", "b", "c"]) - msg = ("categorical index comparisons must have the same categories" - " and ordered attributes") + msg = ( + "categorical index comparisons must have the same categories" + " and ordered attributes" + ) with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, ordered=False) with pytest.raises(TypeError, match=msg): - ci1 == Categorical(ci1.values, categories=list('abc')) + ci1 == Categorical(ci1.values, categories=list("abc")) # tests # make sure that we are testing for category inclusion properly - ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca')) + ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) # Same categories, but different order # Unordered - assert ci.equals(CategoricalIndex(list('aabca'))) + assert ci.equals(CategoricalIndex(list("aabca"))) # Ordered - assert not ci.equals(CategoricalIndex(list('aabca'), ordered=True)) + assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) assert ci.equals(ci.copy()) - ci = CategoricalIndex(list('aabca') + [np.nan], - categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca')) - assert not ci.equals(CategoricalIndex(list('aabca'))) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca")) + assert not ci.equals(CategoricalIndex(list("aabca"))) assert ci.equals(ci.copy()) - ci = CategoricalIndex(list('aabca') + [np.nan], - categories=['c', 'a', 'b']) - assert not ci.equals(list('aabca') + [np.nan]) - assert ci.equals(CategoricalIndex(list('aabca') + [np.nan])) - assert not ci.equals(CategoricalIndex(list('aabca') + [np.nan], - ordered=True)) + ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) + assert not ci.equals(list("aabca") + [np.nan]) + assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) + assert not ci.equals(CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy()) def test_equals_categoridcal_unordered(self): # https://github.com/pandas-dev/pandas/issues/16603 - a = pd.CategoricalIndex(['A'], 
categories=['A', 'B']) - b = pd.CategoricalIndex(['A'], categories=['B', 'A']) - c = pd.CategoricalIndex(['C'], categories=['B', 'A']) + a = pd.CategoricalIndex(["A"], categories=["A", "B"]) + b = pd.CategoricalIndex(["A"], categories=["B", "A"]) + c = pd.CategoricalIndex(["C"], categories=["B", "A"]) assert a.equals(b) assert not a.equals(c) assert not b.equals(c) def test_frame_repr(self): - df = pd.DataFrame({"A": [1, 2, 3]}, - index=pd.CategoricalIndex(['a', 'b', 'c'])) + df = pd.DataFrame({"A": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"])) result = repr(df) - expected = ' A\na 1\nb 2\nc 3' + expected = " A\na 1\nb 2\nc 3" assert result == expected def test_string_categorical_index_repr(self): # short - idx = pd.CategoricalIndex(['a', 'bb', 'ccc']) + idx = pd.CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 10) + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], @@ -867,7 +872,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['a', 'bb', 'ccc'] * 100) + idx = pd.CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], @@ -876,7 +881,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('abcdefghijklmmo')) + idx = pd.CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa @@ -884,12 +889,12 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # short - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう']) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 10) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], @@ -898,7 +903,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 100) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], @@ -907,7 +912,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('あいうえおかきくけこさしすせそ')) + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa @@ -915,15 +920,15 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # Emable Unicode option ----------------------------------------- - with cf.option_context('display.unicode.east_asian_width', True): + with cf.option_context("display.unicode.east_asian_width", True): # short - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう']) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 10) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', @@ -933,7 +938,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # truncated - idx = pd.CategoricalIndex(['あ', 'いい', 'ううう'] * 100) + idx = pd.CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... @@ -944,7 +949,7 @@ def test_string_categorical_index_repr(self): assert repr(idx) == expected # larger categories - idx = pd.CategoricalIndex(list('あいうえおかきくけこさしすせそ')) + idx = pd.CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa @@ -953,13 +958,13 @@ def test_string_categorical_index_repr(self): def test_fillna_categorical(self): # GH 11343 - idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name='x') + idx = CategoricalIndex([1.0, np.nan, 3.0, 1.0], name="x") # fill by value in categories - exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name='x') + exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x") tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = 'fill value must be in categories' + msg = "fill value must be in categories" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) @@ -967,53 +972,55 @@ def test_take_fill_value(self): # GH 12631 # numeric category - idx = pd.CategoricalIndex([1, 2, 3], name='xxx') + idx = pd.CategoricalIndex([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex([2, 1, 3], name='xxx') + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], - name='xxx') + expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, 
- fill_value=True) - expected = pd.CategoricalIndex([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category - idx = pd.CategoricalIndex(list('CBA'), categories=list('ABC'), - ordered=True, name='xxx') + idx = pd.CategoricalIndex( + list("CBA"), categories=list("ABC"), ordered=True, name="xxx" + ) result = idx.take(np.array([1, 0, -1])) - expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), - ordered=True, name='xxx') + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.CategoricalIndex(['B', 'C', np.nan], - categories=list('ABC'), ordered=True, - name='xxx') + expected = pd.CategoricalIndex( + ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), - ordered=True, name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.CategoricalIndex( + list("BCA"), categories=list("ABC"), ordered=True, name="xxx" + ) tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -1025,33 +1032,34 @@ def test_take_fill_value(self): def test_take_fill_value_datetime(self): # datetime category - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], - name='xxx') + idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx") idx = pd.CategoricalIndex(idx) result = idx.take(np.array([1, 0, -1])) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) expected = pd.CategoricalIndex(expected) tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', 'NaT'], - name='xxx') - exp_cats = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx") + exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"]) expected = pd.CategoricalIndex(expected, categories=exp_cats) tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.DatetimeIndex(['2011-02-01', '2011-01-01', '2011-03-01'], - name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.DatetimeIndex( + ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx" + ) expected = pd.CategoricalIndex(expected) 
tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -1061,7 +1069,7 @@ def test_take_fill_value_datetime(self): idx.take(np.array([1, -5])) def test_take_invalid_kwargs(self): - idx = pd.CategoricalIndex([1, 2, 3], name='foo') + idx = pd.CategoricalIndex([1, 2, 3], name="foo") indices = [1, 0, -1] msg = r"take\(\) got an unexpected keyword argument 'foo'" @@ -1074,14 +1082,17 @@ def test_take_invalid_kwargs(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - idx.take(indices, mode='clip') - - @pytest.mark.parametrize('dtype, engine_type', [ - (np.int8, libindex.Int8Engine), - (np.int16, libindex.Int16Engine), - (np.int32, libindex.Int32Engine), - (np.int64, libindex.Int64Engine), - ]) + idx.take(indices, mode="clip") + + @pytest.mark.parametrize( + "dtype, engine_type", + [ + (np.int8, libindex.Int8Engine), + (np.int16, libindex.Int16Engine), + (np.int32, libindex.Int32Engine), + (np.int64, libindex.Int64Engine), + ], + ) def test_engine_type(self, dtype, engine_type): if dtype != np.int64: # num. of uniques required to push CategoricalIndex.codes to a @@ -1092,6 +1103,6 @@ def test_engine_type(self, dtype, engine_type): # having 2**32 - 2**31 categories would be very memory-intensive, # so we cheat a bit with the dtype ci = pd.CategoricalIndex(range(32768)) # == 2**16 - 2**(16 - 1) - ci.values._codes = ci.values._codes.astype('int64') + ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 3cb907c6f5844..465b7f5e23bb8 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -18,7 +18,6 @@ class TestCommon: - def test_droplevel(self, indices): # GH 21115 if isinstance(indices, MultiIndex): @@ -34,7 +33,7 @@ def test_droplevel(self, indices): with pytest.raises(ValueError): indices.droplevel(level) - for level in 'wrong', ['wrong']: + for level in "wrong", ["wrong"]: with pytest.raises(KeyError): indices.droplevel(level) @@ -45,7 +44,7 @@ def test_constructor_non_hashable_name(self, indices): pytest.skip("multiindex handled in test_multi.py") message = "Index.name must be a hashable type" - renamed = [['1']] + renamed = [["1"]] # With .rename() with pytest.raises(TypeError, match=message): @@ -62,7 +61,7 @@ def test_constructor_unwraps_index(self, indices): b = type(a)(a) tm.assert_equal(a._data, b._data) - @pytest.mark.parametrize("itm", [101, 'no_int']) + @pytest.mark.parametrize("itm", [101, "no_int"]) # FutureWarning from non-tuple sequence of nd indexing @pytest.mark.filterwarnings("ignore::FutureWarning") def test_getitem_error(self, indices, itm): @@ -70,14 +69,15 @@ def test_getitem_error(self, indices, itm): indices[itm] @pytest.mark.parametrize( - 'fname, sname, expected_name', + "fname, sname, expected_name", [ - ('A', 'A', 'A'), - ('A', 'B', None), - ('A', None, None), - (None, 'B', None), + ("A", "A", "A"), + ("A", "B", None), + ("A", None, None), + (None, "B", None), (None, None, None), - ]) + ], + ) def test_corner_union(self, indices, fname, sname, expected_name): # GH 9943 9862 # Test unions with various name 
combinations @@ -131,7 +131,7 @@ def test_set_name_methods(self, indices): # don't tests a MultiIndex here (as its tested separated) if isinstance(indices, MultiIndex): - pytest.skip('Skip check for MultiIndex') + pytest.skip("Skip check for MultiIndex") original_name = indices.name new_ind = indices.set_names([new_name]) assert new_ind.name == new_name @@ -149,7 +149,7 @@ def test_set_name_methods(self, indices): indices.set_names("a", level=0) # rename in place just leaves tuples and other containers alone - name = ('A', 'B') + name = ("A", "B") indices.rename(name, inplace=True) assert indices.name == name assert indices.names == [name] @@ -162,15 +162,16 @@ def test_dtype_str(self, indices): def test_hash_error(self, indices): index = indices - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(indices) def test_copy_and_deepcopy(self, indices): from copy import copy, deepcopy if isinstance(indices, MultiIndex): - pytest.skip('Skip check for MultiIndex') + pytest.skip("Skip check for MultiIndex") for func in (copy, deepcopy): idx_copy = func(indices) @@ -184,7 +185,7 @@ def test_unique(self, indices): # don't test a MultiIndex here (as its tested separated) # don't test a CategoricalIndex because categories change (GH 18291) if isinstance(indices, (MultiIndex, CategoricalIndex)): - pytest.skip('Skip check for MultiIndex/CategoricalIndex') + pytest.skip("Skip check for MultiIndex/CategoricalIndex") # GH 17896 expected = indices.drop_duplicates() @@ -197,14 +198,15 @@ def test_unique(self, indices): indices.unique(level=3) msg = r"Level wrong must be same as name \({}\)".format( - re.escape(indices.name.__repr__())) + re.escape(indices.name.__repr__()) + ) with pytest.raises(KeyError, match=msg): - indices.unique(level='wrong') + indices.unique(level="wrong") def test_get_unique_index(self, indices): # MultiIndex tested separately if not len(indices) or isinstance(indices, MultiIndex): - pytest.skip('Skip check for empty Index and MultiIndex') + pytest.skip("Skip check for empty Index and MultiIndex") idx = indices[[0] * 5] idx_unique = indices[[0]] @@ -223,7 +225,7 @@ def test_get_unique_index(self, indices): # nans: if not indices._can_hold_na: - pytest.skip('Skip na-check if index cannot hold na') + pytest.skip("Skip na-check if index cannot hold na") if needs_i8_conversion(indices): vals = indices.asi8[[0] * 5] @@ -240,9 +242,7 @@ def test_get_unique_index(self, indices): assert idx_nan.dtype == indices.dtype assert idx_unique_nan.dtype == indices.dtype - for dropna, expected in zip([False, True], - [idx_unique_nan, - idx_unique]): + for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: result = i._get_unique_index(dropna=dropna) tm.assert_index_equal(result, expected) @@ -254,7 +254,7 @@ def test_sort(self, indices): def test_mutability(self, indices): if not len(indices): - pytest.skip('Skip check for empty Index') + pytest.skip("Skip check for empty Index") msg = "Index does not support mutable operations" with pytest.raises(TypeError, match=msg): indices[0] = indices[0] @@ -270,11 +270,11 @@ def test_searchsorted_monotonic(self, indices): # not implemented for tuple searches in MultiIndex # or Intervals searches in IntervalIndex if isinstance(indices, (MultiIndex, pd.IntervalIndex)): - pytest.skip('Skip check for MultiIndex/IntervalIndex') + pytest.skip("Skip check for 
MultiIndex/IntervalIndex") # nothing to test if the index is empty if indices.empty: - pytest.skip('Skip check for empty Index') + pytest.skip("Skip check for empty Index") value = indices[0] # determine the expected results (handle dupes for 'right') @@ -286,41 +286,41 @@ def test_searchsorted_monotonic(self, indices): # test _searchsorted_monotonic in all cases # test searchsorted only for increasing if indices.is_monotonic_increasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right - ss_left = indices.searchsorted(value, side='left') + ss_left = indices.searchsorted(value, side="left") assert expected_left == ss_left - ss_right = indices.searchsorted(value, side='right') + ss_right = indices.searchsorted(value, side="right") assert expected_right == ss_right elif indices.is_monotonic_decreasing: - ssm_left = indices._searchsorted_monotonic(value, side='left') + ssm_left = indices._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side='right') + ssm_right = indices._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right else: # non-monotonic should raise. with pytest.raises(ValueError): - indices._searchsorted_monotonic(value, side='left') + indices._searchsorted_monotonic(value, side="left") def test_pickle(self, indices): - original_name, indices.name = indices.name, 'foo' + original_name, indices.name = indices.name, "foo" unpickled = tm.round_trip_pickle(indices) assert indices.equals(unpickled) indices.name = original_name - @pytest.mark.parametrize('keep', ['first', 'last', False]) + @pytest.mark.parametrize("keep", ["first", "last", False]) def test_duplicated(self, indices, keep): if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates - pytest.skip('Skip check for empty Index, MultiIndex, RangeIndex') + pytest.skip("Skip check for empty Index, MultiIndex, RangeIndex") holder = type(indices) @@ -348,8 +348,7 @@ def test_has_duplicates(self, indices): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates. # RangeIndex is unique by definition. 
- pytest.skip('Skip check for empty Index, MultiIndex, ' - 'and RangeIndex') + pytest.skip("Skip check for empty Index, MultiIndex, " "and RangeIndex") idx = holder([indices[0]] * 5) assert idx.is_unique is False diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index 56efd4bbfd62a..57acc8ee72b70 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -9,7 +9,7 @@ class TestFrozenList(CheckImmutable, CheckStringMixin): - mutable_methods = ('extend', 'pop', 'remove', 'insert') + mutable_methods = ("extend", "pop", "remove", "insert") unicode_container = FrozenList(["\u05d0", "\u05d1", "c"]) def setup_method(self, _): @@ -58,7 +58,7 @@ def test_tricky_container_to_bytes_raises(self): class TestFrozenNDArray(CheckImmutable, CheckStringMixin): - mutable_methods = ('put', 'itemset', 'fill') + mutable_methods = ("put", "itemset", "fill") def setup_method(self, _): self.lst = [3, 5, 7, -2] diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 3437f501aa910..1feb82a923b19 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -14,7 +14,6 @@ class Numeric(Base): - def test_can_hold_identifiers(self): idx = self.create_index() key = idx[0] @@ -27,10 +26,10 @@ def test_explicit_conversions(self): # GH 8608 # add/sub are overridden explicitly for Float/Int Index - idx = self._holder(np.arange(5, dtype='int64')) + idx = self._holder(np.arange(5, dtype="int64")) # float conversions - arr = np.arange(5, dtype='int64') * 3.2 + arr = np.arange(5, dtype="int64") * 3.2 expected = Float64Index(arr) fidx = idx * 3.2 tm.assert_index_equal(fidx, expected) @@ -39,40 +38,44 @@ def test_explicit_conversions(self): # interops with numpy arrays expected = Float64Index(arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = fidx - a tm.assert_index_equal(result, expected) expected = Float64Index(-arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = a - fidx tm.assert_index_equal(result, expected) def test_index_groupby(self): int_idx = Index(range(6)) float_idx = Index(np.arange(0, 0.6, 0.1)) - obj_idx = Index('A B C D E F'.split()) - dt_idx = pd.date_range('2013-01-01', freq='M', periods=6) + obj_idx = Index("A B C D E F".split()) + dt_idx = pd.date_range("2013-01-01", freq="M", periods=6) for idx in [int_idx, float_idx, obj_idx, dt_idx]: to_groupby = np.array([1, 2, np.nan, np.nan, 2, 1]) - tm.assert_dict_equal(idx.groupby(to_groupby), - {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]}) - - to_groupby = Index([datetime(2011, 11, 1), - datetime(2011, 12, 1), - pd.NaT, - pd.NaT, - datetime(2011, 12, 1), - datetime(2011, 11, 1)], - tz='UTC').values - - ex_keys = [Timestamp('2011-11-01'), Timestamp('2011-12-01')] - expected = {ex_keys[0]: idx[[0, 5]], - ex_keys[1]: idx[[1, 4]]} + tm.assert_dict_equal( + idx.groupby(to_groupby), {1.0: idx[[0, 5]], 2.0: idx[[1, 4]]} + ) + + to_groupby = Index( + [ + datetime(2011, 11, 1), + datetime(2011, 12, 1), + pd.NaT, + pd.NaT, + datetime(2011, 12, 1), + datetime(2011, 11, 1), + ], + tz="UTC", + ).values + + ex_keys = [Timestamp("2011-11-01"), Timestamp("2011-12-01")] + expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]} tm.assert_dict_equal(idx.groupby(to_groupby), expected) - @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + @pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where(self, klass): i = self.create_index() cond = 
[True] * len(i) @@ -96,14 +99,16 @@ class TestFloat64Index(Numeric): _holder = Float64Index def setup_method(self, method): - self.indices = dict(mixed=Float64Index([1.5, 2, 3, 4, 5]), - float=Float64Index(np.arange(5) * 2.5), - mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), - float_dec=Float64Index(np.arange(4, -1, -1) * 2.5)) + self.indices = dict( + mixed=Float64Index([1.5, 2, 3, 4, 5]), + float=Float64Index(np.arange(5) * 2.5), + mixed_dec=Float64Index([5, 4, 3, 2, 1.5]), + float_dec=Float64Index(np.arange(4, -1, -1) * 2.5), + ) self.setup_indices() def create_index(self): - return Float64Index(np.arange(5, dtype='float64')) + return Float64Index(np.arange(5, dtype="float64")) def test_repr_roundtrip(self): for ind in (self.mixed, self.float): @@ -126,17 +131,17 @@ def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) assert isinstance(index, Float64Index) - expected = np.array([1, 2, 3, 4, 5], dtype='float64') + expected = np.array([1, 2, 3, 4, 5], dtype="float64") tm.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) assert isinstance(index, Float64Index) - index = Float64Index([1., 2, 3, 4, 5]) + index = Float64Index([1.0, 2, 3, 4, 5]) assert isinstance(index, Float64Index) - index = Float64Index(np.array([1., 2, 3, 4, 5])) + index = Float64Index(np.array([1.0, 2, 3, 4, 5])) assert isinstance(index, Float64Index) assert index.dtype == float - index = Float64Index(np.array([1., 2, 3, 4, 5]), dtype=np.float32) + index = Float64Index(np.array([1.0, 2, 3, 4, 5]), dtype=np.float32) assert isinstance(index, Float64Index) assert index.dtype == np.float64 @@ -155,34 +160,37 @@ def test_constructor(self): def test_constructor_invalid(self): # invalid - msg = (r"Float64Index\(\.\.\.\) must be called with a collection of" - r" some kind, 0\.0 was passed") + msg = ( + r"Float64Index\(\.\.\.\) must be called with a collection of" + r" some kind, 0\.0 was passed" + ) with pytest.raises(TypeError, match=msg): - Float64Index(0.) 
- msg = ("String dtype not supported, you may need to explicitly cast to" - " a numeric type") + Float64Index(0.0) + msg = ( + "String dtype not supported, you may need to explicitly cast to" + " a numeric type" + ) with pytest.raises(TypeError, match=msg): - Float64Index(['a', 'b', 0.]) - msg = (r"float\(\) argument must be a string or a number, not" - " 'Timestamp'") + Float64Index(["a", "b", 0.0]) + msg = r"float\(\) argument must be a string or a number, not" " 'Timestamp'" with pytest.raises(TypeError, match=msg): - Float64Index([Timestamp('20130101')]) + Float64Index([Timestamp("20130101")]) def test_constructor_coerce(self): self.check_coerce(self.mixed, Index([1.5, 2, 3, 4, 5])) self.check_coerce(self.float, Index(np.arange(5) * 2.5)) - self.check_coerce(self.float, Index(np.array( - np.arange(5) * 2.5, dtype=object))) + self.check_coerce(self.float, Index(np.array(np.arange(5) * 2.5, dtype=object))) def test_constructor_explicit(self): # these don't auto convert - self.check_coerce(self.float, - Index((np.arange(5) * 2.5), dtype=object), - is_float_index=False) - self.check_coerce(self.mixed, Index( - [1.5, 2, 3, 4, 5], dtype=object), is_float_index=False) + self.check_coerce( + self.float, Index((np.arange(5) * 2.5), dtype=object), is_float_index=False + ) + self.check_coerce( + self.mixed, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False + ) def test_astype(self): @@ -192,7 +200,7 @@ def test_astype(self): self.check_is_index(result) i = self.mixed.copy() - i.name = 'foo' + i.name = "foo" result = i.astype(object) assert result.equals(i) assert i.equals(result) @@ -200,7 +208,7 @@ def test_astype(self): # GH 12881 # a float astype int - for dtype in ['int16', 'int32', 'int64']: + for dtype in ["int16", "int32", "int64"]: i = Float64Index([0, 1, 2]) result = i.astype(dtype) expected = Int64Index([0, 1, 2]) @@ -211,7 +219,7 @@ def test_astype(self): expected = Int64Index([0, 1, 2]) tm.assert_index_equal(result, expected) - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: i = Float64Index([0, 1, 2]) result = i.astype(dtype) expected = i @@ -223,14 +231,16 @@ def test_astype(self): tm.assert_index_equal(result, expected) # invalid - for dtype in ['M8[ns]', 'm8[ns]']: - msg = ("Cannot convert Float64Index to dtype {}; integer values" - " are required for conversion").format(pandas_dtype(dtype)) + for dtype in ["M8[ns]", "m8[ns]"]: + msg = ( + "Cannot convert Float64Index to dtype {}; integer values" + " are required for conversion" + ).format(pandas_dtype(dtype)) with pytest.raises(TypeError, match=re.escape(msg)): i.astype(dtype) # GH 13149 - for dtype in ['int16', 'int32', 'int64']: + for dtype in ["int16", "int32", "int64"]: i = Float64Index([0, 1.1, np.NAN]) msg = "Cannot convert NA to integer" with pytest.raises(ValueError, match=msg): @@ -266,49 +276,53 @@ def test_equals_numeric(self): def test_get_indexer(self): idx = Float64Index([0.0, 1.0, 2.0]) - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) target = [-0.1, 0.5, 1.1] - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 
1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) def test_get_loc(self): idx = Float64Index([0.0, 1.0, 2.0]) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(1, method) == 1 if method is not None: assert idx.get_loc(1, method, tolerance=0) == 1 - for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: assert idx.get_loc(1.1, method) == loc assert idx.get_loc(1.1, method, tolerance=0.9) == loc with pytest.raises(KeyError, match="^'foo'$"): - idx.get_loc('foo') + idx.get_loc("foo") with pytest.raises(KeyError, match=r"^1\.5$"): idx.get_loc(1.5) with pytest.raises(KeyError, match=r"^1\.5$"): - idx.get_loc(1.5, method='pad', tolerance=0.1) + idx.get_loc(1.5, method="pad", tolerance=0.1) with pytest.raises(KeyError, match="^True$"): idx.get_loc(True) with pytest.raises(KeyError, match="^False$"): idx.get_loc(False) - with pytest.raises(ValueError, match='must be numeric'): - idx.get_loc(1.4, method='nearest', tolerance='foo') + with pytest.raises(ValueError, match="must be numeric"): + idx.get_loc(1.4, method="nearest", tolerance="foo") - with pytest.raises(ValueError, match='must contain numeric elements'): - idx.get_loc(1.4, method='nearest', tolerance=np.array(['foo'])) + with pytest.raises(ValueError, match="must contain numeric elements"): + idx.get_loc(1.4, method="nearest", tolerance=np.array(["foo"])) with pytest.raises( - ValueError, - match='tolerance size must match target index size'): - idx.get_loc(1.4, method='nearest', tolerance=np.array([1, 2])) + ValueError, match="tolerance size must match target index size" + ): + idx.get_loc(1.4, method="nearest", tolerance=np.array([1, 2])) def test_get_loc_na(self): idx = Float64Index([np.nan, 1, 2]) @@ -359,16 +373,14 @@ def test_doesnt_contain_all_the_things(self): def test_nan_multiple_containment(self): i = Float64Index([1.0, np.nan]) tm.assert_numpy_array_equal(i.isin([1.0]), np.array([True, False])) - tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), - np.array([False, False])) + tm.assert_numpy_array_equal(i.isin([2.0, np.pi]), np.array([False, False])) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, True])) - tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), - np.array([True, True])) + tm.assert_numpy_array_equal(i.isin([1.0, np.nan]), np.array([True, True])) i = Float64Index([1.0, 2.0]) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) def test_astype_from_object(self): - index = Index([1.0, np.nan, 0.2], dtype='object') + index = Index([1.0, np.nan, 0.2], dtype="object") result = index.astype(float) expected = Float64Index([1.0, np.nan, 0.2]) assert result.dtype == expected.dtype @@ -376,39 +388,40 @@ def test_astype_from_object(self): def test_fillna_float64(self): # GH 11343 - idx = Index([1.0, np.nan, 3.0], dtype=float, name='x') + idx = Index([1.0, np.nan, 3.0], dtype=float, name="x") # can't downcast - exp = Index([1.0, 0.1, 3.0], name='x') + exp = Index([1.0, 0.1, 3.0], name="x") tm.assert_index_equal(idx.fillna(0.1), exp) # downcast - exp = Float64Index([1.0, 2.0, 3.0], name='x') + exp = Float64Index([1.0, 2.0, 3.0], name="x") tm.assert_index_equal(idx.fillna(2), exp) # object - exp = Index([1.0, 'obj', 3.0], name='x') - 
tm.assert_index_equal(idx.fillna('obj'), exp) + exp = Index([1.0, "obj", 3.0], name="x") + tm.assert_index_equal(idx.fillna("obj"), exp) def test_take_fill_value(self): # GH 12631 - idx = pd.Float64Index([1., 2., 3.], name='xxx') + idx = pd.Float64Index([1.0, 2.0, 3.0], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.Float64Index([2., 1., 3.], name='xxx') + expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") tm.assert_index_equal(result, expected) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) - expected = pd.Float64Index([2., 1., np.nan], name='xxx') + expected = pd.Float64Index([2.0, 1.0, np.nan], name="xxx") tm.assert_index_equal(result, expected) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Float64Index([2., 1., 3.], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Float64Index([2.0, 1.0, 3.0], name="xxx") tm.assert_index_equal(result, expected) - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -419,17 +432,16 @@ def test_take_fill_value(self): class NumericInt(Numeric): - def test_view(self): - i = self._holder([], name='Foo') + i = self._holder([], name="Foo") i_view = i.view() - assert i_view.name == 'Foo' + assert i_view.name == "Foo" i_view = i.view(self._dtype) - tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + tm.assert_index_equal(i, self._holder(i_view, name="Foo")) i_view = i.view(self._holder) - tm.assert_index_equal(i, self._holder(i_view, name='Foo')) + tm.assert_index_equal(i, self._holder(i_view, name="Foo")) def test_is_monotonic(self): assert self.index.is_monotonic is True @@ -478,15 +490,16 @@ def test_identical(self): assert not i.identical(same_values_different_type) i = self.index.copy(dtype=object) - i = i.rename('foo') + i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) assert not i.identical(self.index) - assert Index(same_values, name='foo', dtype=object).identical(i) + assert Index(same_values, name="foo", dtype=object).identical(i) assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype=self._dtype)) + self.index.copy(dtype=self._dtype) + ) def test_join_non_unique(self): left = Index([4, 4, 3, 3]) @@ -502,13 +515,14 @@ def test_join_non_unique(self): exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) - @pytest.mark.parametrize('kind', ['outer', 'inner', 'left', 'right']) + @pytest.mark.parametrize("kind", ["outer", "inner", "left", "right"]) def test_join_self(self, kind): joined = self.index.join(self.index, how=kind) assert self.index is joined def test_union_noncomparable(self): from datetime import datetime, timedelta + # corner case, non-Int64Index now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) @@ -521,15 +535,17 @@ def test_union_noncomparable(self): tm.assert_index_equal(result, expected) def test_cant_or_shouldnt_cast(self): - msg = ("String dtype not supported, you may need to explicitly cast to" - " a numeric type") + msg = ( + "String dtype not supported, you may need to explicitly cast to" + " a numeric type" + ) # can't - data = ['foo', 
'bar', 'baz'] + data = ["foo", "bar", "baz"] with pytest.raises(TypeError, match=msg): self._holder(data) # shouldn't - data = ['0', '1', '2'] + data = ["0", "1", "2"] with pytest.raises(TypeError, match=msg): self._holder(data) @@ -537,33 +553,33 @@ def test_view_index(self): self.index.view(Index) def test_prevent_casting(self): - result = self.index.astype('O') + result = self.index.astype("O") assert result.dtype == np.object_ def test_take_preserve_name(self): - index = self._holder([1, 2, 3, 4], name='foo') + index = self._holder([1, 2, 3, 4], name="foo") taken = index.take([3, 0, 1]) assert index.name == taken.name def test_take_fill_value(self): # see gh-12631 - idx = self._holder([1, 2, 3], name='xxx') + idx = self._holder([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = self._holder([2, 1, 3], name='xxx') + expected = self._holder([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) name = self._holder.__name__ - msg = ("Unable to fill values because " - "{name} cannot contain NA").format(name=name) + msg = ("Unable to fill values because " "{name} cannot contain NA").format( + name=name + ) # fill_value=True with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -1]), fill_value=True) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = self._holder([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = self._holder([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): @@ -575,21 +591,23 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_slice_keep_name(self): - idx = self._holder([1, 2], name='asdf') + idx = self._holder([1, 2], name="asdf") assert idx.name == idx[1:].name class TestInt64Index(NumericInt): - _dtype = 'int64' + _dtype = "int64" _holder = Int64Index def setup_method(self, method): - self.indices = dict(index=Int64Index(np.arange(0, 20, 2)), - index_dec=Int64Index(np.arange(19, -1, -1))) + self.indices = dict( + index=Int64Index(np.arange(0, 20, 2)), + index_dec=Int64Index(np.arange(19, -1, -1)), + ) self.setup_indices() def create_index(self): - return Int64Index(np.arange(5, dtype='int64')) + return Int64Index(np.arange(5, dtype="int64")) def test_constructor(self): # pass list, coerce fine @@ -602,8 +620,10 @@ def test_constructor(self): tm.assert_index_equal(index, expected) # scalar raise Exception - msg = (r"Int64Index\(\.\.\.\) must be called with a collection of some" - " kind, 5 was passed") + msg = ( + r"Int64Index\(\.\.\.\) must be called with a collection of some" + " kind, 5 was passed" + ) with pytest.raises(TypeError, match=msg): Int64Index(5) @@ -620,9 +640,11 @@ def test_constructor(self): # interpret list-like expected = Int64Index([5, 0]) for cls in [Index, Int64Index]: - for idx in [cls([5, 0], dtype='int64'), - cls(np.array([5, 0]), dtype='int64'), - cls(Series([5, 0]), dtype='int64')]: + for idx in [ + cls([5, 0], dtype="int64"), + cls(np.array([5, 0]), dtype="int64"), + cls(Series([5, 0]), dtype="int64"), + ]: tm.assert_index_equal(idx, expected) def test_constructor_corner(self): @@ -632,12 +654,12 @@ def test_constructor_corner(self): tm.assert_index_equal(index, Index(arr)) # preventing casting - arr = np.array([1, '2', 3, '4'], dtype=object) - with pytest.raises(TypeError, match='casting'): + arr = np.array([1, "2", 3, "4"], dtype=object) + with pytest.raises(TypeError, match="casting"): Int64Index(arr) 
arr_with_floats = [0, 2, 3, 4, 5, 1.25, 3, -1] - with pytest.raises(TypeError, match='casting'): + with pytest.raises(TypeError, match="casting"): Int64Index(arr_with_floats) def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): @@ -651,7 +673,7 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): def test_constructor_unwraps_index(self): idx = pd.Index([1, 2]) result = pd.Int64Index(idx) - expected = np.array([1, 2], dtype='int64') + expected = np.array([1, 2], dtype="int64") tm.assert_numpy_array_equal(result._data, expected) def test_coerce_list(self): @@ -670,25 +692,25 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(indexer, expected) target = Int64Index(np.arange(10)) - indexer = self.index.get_indexer(target, method='pad') + indexer = self.index.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) target = Int64Index(np.arange(10)) - indexer = self.index.get_indexer(target, method='backfill') + indexer = self.index.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_intersection(self): other = Index([1, 2, 3, 4, 5]) result = self.index.intersection(other) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) def test_join_inner(self): @@ -696,8 +718,7 @@ def test_join_inner(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -715,8 +736,7 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) res2 = self.index.intersection(other_mono) tm.assert_index_equal(res, res2) @@ -733,11 +753,9 @@ def test_join_left(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index - eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], - dtype=np.intp) + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) @@ -745,10 +763,8 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='left', - return_indexers=True) - eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], - dtype=np.intp) + res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) assert lidx is None @@ -757,7 +773,7 @@ def 
test_join_left(self): # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -770,8 +786,7 @@ def test_join_right(self): other_mono = Int64Index([1, 2, 5, 7, 12, 25]) # not monotonic - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) @@ -781,8 +796,7 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) eres = other_mono elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) assert isinstance(other, Int64Index) @@ -793,7 +807,7 @@ def test_join_right(self): # non-unique idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -804,28 +818,28 @@ def test_join_right(self): def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') + inner = self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_outer(self): @@ -834,16 +848,15 @@ def test_join_outer(self): # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], - dtype=np.intp) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) 
+ eridx = np.array( + [-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], dtype=np.intp + ) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) @@ -851,15 +864,14 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='outer', - return_indexers=True) - noidx_res = self.index.join(other_mono, how='outer') + res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) + noidx_res = self.index.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) - elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.intp) - eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], - dtype=np.intp) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp) + eridx = np.array( + [-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], dtype=np.intp + ) assert isinstance(res, Int64Index) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -868,75 +880,72 @@ def test_join_outer(self): class TestUInt64Index(NumericInt): - _dtype = 'uint64' + _dtype = "uint64" _holder = UInt64Index def setup_method(self, method): - vals = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25] - self.indices = dict(index=UInt64Index(vals), - index_dec=UInt64Index(reversed(vals))) + vals = [2 ** 63, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20, 2 ** 63 + 25] + self.indices = dict( + index=UInt64Index(vals), index_dec=UInt64Index(reversed(vals)) + ) self.setup_indices() def create_index(self): - return UInt64Index(np.arange(5, dtype='uint64')) + return UInt64Index(np.arange(5, dtype="uint64")) def test_constructor(self): idx = UInt64Index([1, 2, 3]) res = Index([1, 2, 3], dtype=np.uint64) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2**63]) - res = Index([1, 2**63], dtype=np.uint64) + idx = UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63], dtype=np.uint64) tm.assert_index_equal(res, idx) - idx = UInt64Index([1, 2**63]) - res = Index([1, 2**63]) + idx = UInt64Index([1, 2 ** 63]) + res = Index([1, 2 ** 63]) tm.assert_index_equal(res, idx) - idx = Index([-1, 2**63], dtype=object) - res = Index(np.array([-1, 2**63], dtype=object)) + idx = Index([-1, 2 ** 63], dtype=object) + res = Index(np.array([-1, 2 ** 63], dtype=object)) tm.assert_index_equal(res, idx) def test_get_indexer(self): - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, 2, 3, 4, - -1, -1, -1, -1], dtype=np.intp) + expected = np.array([0, -1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) - indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 2, 3, 4, - 4, 4, 4, 4], dtype=np.intp) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = self.index.get_indexer(target, method="pad") + expected = np.array([0, 0, 1, 2, 3, 4, 4, 4, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) - target = UInt64Index(np.arange(10).astype('uint64') * 5 + 2**63) - indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 3, 4, - -1, -1, -1, -1], dtype=np.intp) + target = UInt64Index(np.arange(10).astype("uint64") * 5 + 2 ** 63) + indexer = self.index.get_indexer(target, method="backfill") + expected = 
np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_intersection(self): - other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20]) + other = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10, 2 ** 63 + 15, 2 ** 63 + 20]) result = self.index.intersection(other) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) def test_join_inner(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -944,7 +953,7 @@ def test_join_inner(self): lidx = lidx.take(ind) ridx = ridx.take(ind) - eres = UInt64Index(2**63 + np.array([10, 25], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([10, 25], dtype="uint64")) elidx = np.array([1, 4], dtype=np.intp) eridx = np.array([5, 2], dtype=np.intp) @@ -954,8 +963,7 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="inner", return_indexers=True) res2 = self.index.intersection(other_mono) tm.assert_index_equal(res, res2) @@ -969,14 +977,13 @@ def test_join_inner(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_left(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp) @@ -986,8 +993,7 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="left", return_indexers=True) eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp) assert isinstance(res, UInt64Index) @@ -996,13 +1002,12 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) # non-unique - idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) - idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) - res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 
9], dtype="uint64")) + res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True) # 1 is in idx2, so it should be x2 - eres = UInt64Index(2**63 + np.array( - [1, 1, 2, 5, 7, 9], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -1011,14 +1016,13 @@ def test_join_left(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_right(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp) @@ -1028,8 +1032,7 @@ def test_join_right(self): assert ridx is None # monotonic - res, lidx, ridx = self.index.join(other_mono, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other_mono, how="right", return_indexers=True) eres = other_mono elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp) @@ -1039,13 +1042,12 @@ def test_join_right(self): assert ridx is None # non-unique - idx = UInt64Index(2**63 + np.array([1, 1, 2, 5], dtype='uint64')) - idx2 = UInt64Index(2**63 + np.array([1, 2, 5, 7, 9], dtype='uint64')) - res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + idx = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5], dtype="uint64")) + idx2 = UInt64Index(2 ** 63 + np.array([1, 2, 5, 7, 9], dtype="uint64")) + res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True) # 1 is in idx2, so it should be x2 - eres = UInt64Index(2**63 + np.array( - [1, 1, 2, 5, 7, 9], dtype='uint64')) + eres = UInt64Index(2 ** 63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64")) elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) @@ -1054,49 +1056,51 @@ def test_join_right(self): tm.assert_numpy_array_equal(ridx, eridx) def test_join_non_int_index(self): - other = Index(2**63 + np.array( - [1, 5, 7, 10, 20], dtype='uint64'), dtype=object) - - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') - expected = Index(2**63 + np.array( - [0, 1, 5, 7, 10, 15, 20, 25], dtype='uint64')) + other = Index( + 2 ** 63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object + ) + + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") + expected = Index( + 2 ** 63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64") + ) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') - expected = Index(2**63 + np.array([10, 20], dtype='uint64')) + inner = self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") + expected = Index(2 ** 63 + np.array([10, 20], dtype="uint64")) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = 
other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_outer(self): - other = UInt64Index(2**63 + np.array( - [7, 12, 25, 1, 2, 10], dtype='uint64')) - other_mono = UInt64Index(2**63 + np.array( - [1, 2, 7, 10, 12, 25], dtype='uint64')) + other = UInt64Index(2 ** 63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64")) + other_mono = UInt64Index( + 2 ** 63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64") + ) # not monotonic # guarantee of sortedness - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) - eres = UInt64Index(2**63 + np.array( - [0, 1, 2, 7, 10, 12, 15, 20, 25], dtype='uint64')) + eres = UInt64Index( + 2 ** 63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64") + ) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp) @@ -1106,9 +1110,8 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) # monotonic - res, lidx, ridx = self.index.join(other_mono, how='outer', - return_indexers=True) - noidx_res = self.index.join(other_mono, how='outer') + res, lidx, ridx = self.index.join(other_mono, how="outer", return_indexers=True) + noidx_res = self.index.join(other_mono, how="outer") tm.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp) @@ -1120,7 +1123,7 @@ def test_join_outer(self): tm.assert_numpy_array_equal(ridx, eridx) -@pytest.mark.parametrize("dtype", ['int64', 'uint64']) +@pytest.mark.parametrize("dtype", ["int64", "uint64"]) def test_int_float_union_dtype(dtype): # https://github.com/pandas-dev/pandas/issues/26778 # [u]int | float -> float diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 349d10f5079e8..f9ca1bca04165 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -2,19 +2,46 @@ import pytest from pandas import ( - DatetimeIndex, Float64Index, Index, Int64Index, TimedeltaIndex, - UInt64Index, _np_version_under1p17) + DatetimeIndex, + Float64Index, + Index, + Int64Index, + TimedeltaIndex, + UInt64Index, + _np_version_under1p17, +) from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.util import testing as tm @pytest.mark.parametrize( - 'func', [np.exp, np.exp2, np.expm1, np.log, np.log2, np.log10, - np.log1p, np.sqrt, np.sin, np.cos, np.tan, np.arcsin, - np.arccos, np.arctan, np.sinh, np.cosh, np.tanh, - np.arcsinh, np.arccosh, np.arctanh, np.deg2rad, - np.rad2deg], - ids=lambda x: x.__name__) + "func", + [ + np.exp, + np.exp2, + np.expm1, + np.log, + np.log2, + np.log10, + np.log1p, + np.sqrt, + np.sin, + np.cos, + np.tan, + np.arcsin, + np.arccos, + np.arctan, + np.sinh, + np.cosh, + np.tanh, + np.arcsinh, + np.arccosh, + np.arctanh, + np.deg2rad, + np.rad2deg, + ], + ids=lambda x: x.__name__, +) def test_numpy_ufuncs_basic(indices, func): # test ufuncs of numpy, see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -23,11 +50,11 @@ def 
test_numpy_ufuncs_basic(indices, func): if isinstance(idx, DatetimeIndexOpsMixin): # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): func(idx) elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): # coerces to float (e.g. np.sin) - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): result = func(idx) exp = Index(func(idx.values), name=idx.name) @@ -39,13 +66,13 @@ def test_numpy_ufuncs_basic(indices, func): pass else: with pytest.raises(Exception): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): func(idx) @pytest.mark.parametrize( - 'func', [np.isfinite, np.isinf, np.isnan, np.signbit], - ids=lambda x: x.__name__) + "func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__ +) def test_numpy_ufuncs_other(indices, func): # test ufuncs of numpy, see: # http://docs.scipy.org/doc/numpy/reference/ufuncs.html @@ -90,6 +117,5 @@ def test_elementwise_comparison_warning(): # When NumPy dev actually enforces this change, we'll need to skip # this test. idx = Index([1, 2]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - idx == 'a' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + idx == "a" diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7cdf5db64b3a9..213d9c6505229 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -20,11 +20,13 @@ class TestRangeIndex(Numeric): _holder = RangeIndex - _compat_props = ['shape', 'ndim', 'size'] + _compat_props = ["shape", "ndim", "size"] def setup_method(self, method): - self.indices = dict(index=RangeIndex(0, 20, 2, name='foo'), - index_dec=RangeIndex(18, -1, -2, name='bar')) + self.indices = dict( + index=RangeIndex(0, 20, 2, name="foo"), + index_dec=RangeIndex(18, -1, -2, name="bar"), + ) self.setup_indices() def create_index(self): @@ -39,19 +41,22 @@ def test_too_many_names(self): with pytest.raises(ValueError, match="^Length"): self.index.names = ["roger", "harold"] - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('args, kwargs, start, stop, step', [ - ((5,), dict(), 0, 5, 1), - ((1, 5), dict(), 1, 5, 1), - ((1, 5, 2), dict(), 1, 5, 2), - ((0,), dict(), 0, 0, 1), - ((0, 0), dict(), 0, 0, 1), - (tuple(), dict(start=0), 0, 0, 1), - (tuple(), dict(stop=0), 0, 0, 1)]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize( + "args, kwargs, start, stop, step", + [ + ((5,), dict(), 0, 5, 1), + ((1, 5), dict(), 1, 5, 1), + ((1, 5, 2), dict(), 1, 5, 2), + ((0,), dict(), 0, 0, 1), + ((0, 0), dict(), 0, 0, 1), + (tuple(), dict(start=0), 0, 0, 1), + (tuple(), dict(stop=0), 0, 0, 1), + ], + ) def test_constructor(self, args, kwargs, start, stop, step, name): result = RangeIndex(*args, name=name, **kwargs) - expected = Index(np.arange(start, stop, step, dtype=np.int64), - name=name) + expected = Index(np.arange(start, stop, step, dtype=np.int64), name=name) assert isinstance(result, RangeIndex) assert result.name is name assert result._range == range(start, stop, step) @@ -63,18 +68,28 @@ def test_constructor_invalid_args(self): RangeIndex() with pytest.raises(TypeError, match=msg): - RangeIndex(name='Foo') + RangeIndex(name="Foo") # invalid args - for i in [Index(['a', 'b']), Series(['a', 'b']), np.array(['a', 'b']), - [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10), - np.array([1]), [1]]: + for i in [ + Index(["a", "b"]), 
+ Series(["a", "b"]), + np.array(["a", "b"]), + [], + "foo", + datetime(2000, 1, 1, 0, 0), + np.arange(0, 10), + np.array([1]), + [1], + ]: with pytest.raises(TypeError): RangeIndex(i) # we don't allow on a bare Index - msg = (r'Index\(\.\.\.\) must be called with a collection of some ' - r'kind, 0 was passed') + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, 0 was passed" + ) with pytest.raises(TypeError, match=msg): Index(0, 1000) @@ -92,7 +107,7 @@ def test_constructor_same(self): tm.assert_index_equal(result, index, exact=True) with pytest.raises(TypeError): - RangeIndex(index, dtype='float64') + RangeIndex(index, dtype="float64") def test_constructor_range(self): @@ -122,29 +137,29 @@ def test_constructor_range(self): tm.assert_index_equal(result, expected, exact=True) with pytest.raises(TypeError): - Index(range(1, 5, 2), dtype='float64') - msg = r'^from_range\(\) got an unexpected keyword argument' + Index(range(1, 5, 2), dtype="float64") + msg = r"^from_range\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): pd.RangeIndex.from_range(range(10), copy=True) def test_constructor_name(self): # GH12288 orig = RangeIndex(10) - orig.name = 'original' + orig.name = "original" copy = RangeIndex(orig) - copy.name = 'copy' + copy.name = "copy" - assert orig.name == 'original' - assert copy.name == 'copy' + assert orig.name == "original" + assert copy.name == "copy" new = Index(copy) - assert new.name == 'copy' + assert new.name == "copy" - new.name = 'new' - assert orig.name == 'original' - assert copy.name == 'copy' - assert new.name == 'new' + new.name = "new" + assert orig.name == "original" + assert copy.name == "copy" + assert new.name == "new" def test_constructor_corner(self): arr = np.array([1, 2, 3, 4], dtype=object) @@ -154,26 +169,30 @@ def test_constructor_corner(self): # non-int raise Exception with pytest.raises(TypeError): - RangeIndex('1', '10', '1') + RangeIndex("1", "10", "1") with pytest.raises(TypeError): RangeIndex(1.1, 10.2, 1.3) # invalid passed type with pytest.raises(TypeError): - RangeIndex(1, 5, dtype='float64') - - @pytest.mark.parametrize('index, start, stop, step', [ - (RangeIndex(5), 0, 5, 1), - (RangeIndex(0, 5), 0, 5, 1), - (RangeIndex(5, step=2), 0, 5, 2), - (RangeIndex(1, 5, 2), 1, 5, 2)]) + RangeIndex(1, 5, dtype="float64") + + @pytest.mark.parametrize( + "index, start, stop, step", + [ + (RangeIndex(5), 0, 5, 1), + (RangeIndex(0, 5), 0, 5, 1), + (RangeIndex(5, step=2), 0, 5, 2), + (RangeIndex(1, 5, 2), 1, 5, 2), + ], + ) def test_start_stop_step_attrs(self, index, start, stop, step): # GH 25710 assert index.start == start assert index.stop == stop assert index.step == step - @pytest.mark.parametrize('attr_name', ['_start', '_stop', '_step']) + @pytest.mark.parametrize("attr_name", ["_start", "_stop", "_step"]) def test_deprecated_start_stop_step_attrs(self, attr_name): # GH 26581 idx = self.create_index() @@ -181,15 +200,15 @@ def test_deprecated_start_stop_step_attrs(self, attr_name): getattr(idx, attr_name) def test_copy(self): - i = RangeIndex(5, name='Foo') + i = RangeIndex(5, name="Foo") i_copy = i.copy() assert i_copy is not i assert i_copy.identical(i) assert i_copy._range == range(0, 5, 1) - assert i_copy.name == 'Foo' + assert i_copy.name == "Foo" def test_repr(self): - i = RangeIndex(5, name='Foo') + i = RangeIndex(5, name="Foo") result = repr(i) expected = "RangeIndex(start=0, stop=5, step=1, name='Foo')" assert result == expected @@ -207,7 +226,7 @@ def test_repr(self): def 
test_insert(self): - idx = RangeIndex(5, name='Foo') + idx = RangeIndex(5, name="Foo") result = idx[1:4] # test 0th element @@ -221,7 +240,7 @@ def test_insert(self): def test_delete(self): - idx = RangeIndex(5, name='Foo') + idx = RangeIndex(5, name="Foo") expected = idx[1:].astype(int) result = idx.delete(0) tm.assert_index_equal(result, expected) @@ -237,11 +256,11 @@ def test_delete(self): result = idx.delete(len(idx)) def test_view(self): - i = RangeIndex(0, name='Foo') + i = RangeIndex(0, name="Foo") i_view = i.view() - assert i_view.name == 'Foo' + assert i_view.name == "Foo" - i_view = i.view('i8') + i_view = i.view("i8") tm.assert_numpy_array_equal(i.values, i_view) i_view = i.view(RangeIndex) @@ -287,7 +306,7 @@ def test_cached_data(self): idx.any() assert idx._cached_data is None - df = pd.DataFrame({'a': range(10)}, index=idx) + df = pd.DataFrame({"a": range(10)}, index=idx) df.loc[50] assert idx._cached_data is None @@ -341,10 +360,12 @@ def test_is_monotonic(self): assert index._is_strictly_monotonic_decreasing is True def test_equals_range(self): - equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), - (RangeIndex(0), RangeIndex(1, -1, 3)), - (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), - (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2))] + equiv_pairs = [ + (RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), + (RangeIndex(0), RangeIndex(1, -1, 3)), + (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), + (RangeIndex(0, -9, -2), RangeIndex(0, -10, -2)), + ] for left, right in equiv_pairs: assert left.equals(right) assert right.equals(left) @@ -366,15 +387,16 @@ def test_identical(self): assert not i.identical(same_values_different_type) i = self.index.copy(dtype=object) - i = i.rename('foo') + i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(self.index.copy(dtype=object)) assert not i.identical(self.index) - assert Index(same_values, name='foo', dtype=object).identical(i) + assert Index(same_values, name="foo", dtype=object).identical(i) assert not self.index.copy(dtype=object).identical( - self.index.copy(dtype='int64')) + self.index.copy(dtype="int64") + ) def test_get_indexer(self): target = RangeIndex(10) @@ -384,13 +406,13 @@ def test_get_indexer(self): def test_get_indexer_pad(self): target = RangeIndex(10) - indexer = self.index.get_indexer(target, method='pad') + indexer = self.index.get_indexer(target, method="pad") expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = RangeIndex(10) - indexer = self.index.get_indexer(target, method='backfill') + indexer = self.index.get_indexer(target, method="backfill") expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) @@ -398,17 +420,21 @@ def test_join_outer(self): # join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) - eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25]) - elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, - -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) - eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, - 5, 4, 3, 2, 1, 0], dtype=np.intp) + eres = Int64Index( 
+ [0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25] + ) + elidx = np.array( + [0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, -1, -1, -1, -1, -1, -1, -1], + dtype=np.intp, + ) + eridx = np.array( + [-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + dtype=np.intp, + ) assert isinstance(res, Int64Index) assert not isinstance(res, RangeIndex) @@ -419,9 +445,8 @@ def test_join_outer(self): # join with RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='outer', - return_indexers=True) - noidx_res = self.index.join(other, how='outer') + res, lidx, ridx = self.index.join(other, how="outer", return_indexers=True) + noidx_res = self.index.join(other, how="outer") tm.assert_index_equal(res, noidx_res) assert isinstance(res, Int64Index) @@ -434,8 +459,7 @@ def test_join_inner(self): # Join with non-RangeIndex other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) # no guarantee of sortedness, so sort for comparison purposes ind = res.argsort() @@ -455,8 +479,7 @@ def test_join_inner(self): # Join two RangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='inner', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="inner", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -467,8 +490,7 @@ def test_join_left(self): # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) eres = self.index eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp) @@ -480,8 +502,7 @@ def test_join_left(self): # Join withRangeIndex other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='left', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="left", return_indexers=True) assert isinstance(res, RangeIndex) tm.assert_index_equal(res, eres) @@ -492,11 +513,9 @@ def test_join_right(self): # Join with Int64Index other = Int64Index(np.arange(25, 14, -1)) - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other - elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], - dtype=np.intp) + elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], dtype=np.intp) assert isinstance(other, Int64Index) tm.assert_index_equal(res, eres) @@ -506,8 +525,7 @@ def test_join_right(self): # Join withRangeIndex other = RangeIndex(25, 14, -1) - res, lidx, ridx = self.index.join(other, how='right', - return_indexers=True) + res, lidx, ridx = self.index.join(other, how="right", return_indexers=True) eres = other assert isinstance(other, RangeIndex) @@ -518,28 +536,28 @@ def test_join_right(self): def test_join_non_int_index(self): other = Index([3, 6, 7, 8, 10], dtype=object) - outer = self.index.join(other, how='outer') - outer2 = other.join(self.index, how='outer') + outer = self.index.join(other, how="outer") + outer2 = other.join(self.index, how="outer") expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18]) tm.assert_index_equal(outer, outer2) tm.assert_index_equal(outer, expected) - inner = self.index.join(other, how='inner') - inner2 = other.join(self.index, how='inner') + inner = 
self.index.join(other, how="inner") + inner2 = other.join(self.index, how="inner") expected = Index([6, 8, 10]) tm.assert_index_equal(inner, inner2) tm.assert_index_equal(inner, expected) - left = self.index.join(other, how='left') + left = self.index.join(other, how="left") tm.assert_index_equal(left, self.index.astype(object)) - left2 = other.join(self.index, how='left') + left2 = other.join(self.index, how="left") tm.assert_index_equal(left2, other) - right = self.index.join(other, how='right') + right = self.index.join(other, how="right") tm.assert_index_equal(right, other) - right2 = other.join(self.index, how='right') + right2 = other.join(self.index, how="right") tm.assert_index_equal(right2, self.index.astype(object)) def test_join_non_unique(self): @@ -549,15 +567,14 @@ def test_join_non_unique(self): eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 14, 16, 18]) elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp) - eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], - dtype=np.intp) + eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) tm.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) def test_join_self(self): - kinds = 'outer', 'inner', 'left', 'right' + kinds = "outer", "inner", "left", "right" for kind in kinds: joined = self.index.join(self.index, how=kind) assert self.index is joined @@ -567,27 +584,25 @@ def test_intersection(self, sort): # intersect with Int64Index other = Index(np.arange(1, 6)) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) result = other.intersection(self.index, sort=sort) - expected = Index(np.sort(np.asarray(np.intersect1d(self.index.values, - other.values)))) + expected = Index( + np.sort(np.asarray(np.intersect1d(self.index.values, other.values))) + ) tm.assert_index_equal(result, expected) # intersect with increasing RangeIndex other = RangeIndex(1, 6) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) # intersect with decreasing RangeIndex other = RangeIndex(5, 0, -1) result = self.index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(self.index.values, - other.values))) + expected = Index(np.sort(np.intersect1d(self.index.values, other.values))) tm.assert_index_equal(result, expected) # reversed (GH 17296) @@ -634,9 +649,10 @@ def test_intersection(self, sort): expected = RangeIndex(0, 0, 1) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('sort', [False, None]) + @pytest.mark.parametrize("sort", [False, None]) def test_union_noncomparable(self, sort): from datetime import datetime, timedelta + # corner case, non-Int64Index now = datetime.now() other = Index([now + timedelta(i) for i in range(4)], dtype=object) @@ -648,40 +664,72 @@ def test_union_noncomparable(self, sort): expected = Index(np.concatenate((other, self.index))) tm.assert_index_equal(result, expected) - @pytest.fixture(params=[ - (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, -10, -1), RI(0, -10, -1), 
RI(0, -10, -1), RI(0, -10, -1)), - (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), - I64(range(0, -20, -1))), - (RI(0, 10, 2), RI(1, 10, 2), RI(0, 10, 1), - I64(list(range(0, 10, 2)) + list(range(1, 10, 2)))), - (RI(0, 11, 2), RI(1, 12, 2), RI(0, 12, 1), - I64(list(range(0, 11, 2)) + list(range(1, 12, 2)))), - (RI(0, 21, 4), RI(-2, 24, 4), RI(-2, 24, 2), - I64(list(range(0, 21, 4)) + list(range(-2, 24, 4)))), - (RI(0, -20, -2), RI(-1, -21, -2), RI(-19, 1, 1), - I64(list(range(0, -20, -2)) + list(range(-1, -21, -2)))), - (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), - (RI(0, -100, -5), RI(5, -100, -20), RI(-95, 10, 5), - I64(list(range(0, -100, -5)) + [5])), - (RI(0, -11, -1), RI(1, -12, -4), RI(-11, 2, 1), - I64(list(range(0, -11, -1)) + [1, -11])), - (RI(0), RI(0), RI(0), RI(0)), - (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), - (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), - I64(range(0, 102, 2))), - (RI(0, -100, -2), RI(-100, 50, 102), RI(-100, 4, 2), - I64(list(range(0, -100, -2)) + [-100, 2])), - (RI(0, -100, -1), RI(0, -50, -3), RI(-99, 1, 1), - I64(list(range(0, -100, -1)))), - (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), - (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), - (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), - (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])) - ]) + @pytest.fixture( + params=[ + (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), + (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), + (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), + (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)), + (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), I64(range(0, -20, -1))), + ( + RI(0, 10, 2), + RI(1, 10, 2), + RI(0, 10, 1), + I64(list(range(0, 10, 2)) + list(range(1, 10, 2))), + ), + ( + RI(0, 11, 2), + RI(1, 12, 2), + RI(0, 12, 1), + I64(list(range(0, 11, 2)) + list(range(1, 12, 2))), + ), + ( + RI(0, 21, 4), + RI(-2, 24, 4), + RI(-2, 24, 2), + I64(list(range(0, 21, 4)) + list(range(-2, 24, 4))), + ), + ( + RI(0, -20, -2), + RI(-1, -21, -2), + RI(-19, 1, 1), + I64(list(range(0, -20, -2)) + list(range(-1, -21, -2))), + ), + (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), + ( + RI(0, -100, -5), + RI(5, -100, -20), + RI(-95, 10, 5), + I64(list(range(0, -100, -5)) + [5]), + ), + ( + RI(0, -11, -1), + RI(1, -12, -4), + RI(-11, 2, 1), + I64(list(range(0, -11, -1)) + [1, -11]), + ), + (RI(0), RI(0), RI(0), RI(0)), + (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), + (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), I64(range(0, 102, 2))), + ( + RI(0, -100, -2), + RI(-100, 50, 102), + RI(-100, 4, 2), + I64(list(range(0, -100, -2)) + [-100, 2]), + ), + ( + RI(0, -100, -1), + RI(0, -50, -3), + RI(-99, 1, 1), + I64(list(range(0, -100, -1))), + ), + (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), + (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), + (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), + (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), + (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])), + ] + ) def unions(self, request): """Inputs and expected outputs for RangeIndex.union tests""" @@ -715,29 +763,29 @@ def test_nbytes(self): def test_cant_or_shouldnt_cast(self): # can't with pytest.raises(TypeError): - RangeIndex('foo', 'bar', 'baz') + RangeIndex("foo", "bar", "baz") # shouldn't with 
pytest.raises(TypeError): - RangeIndex('0', '1', '2') + RangeIndex("0", "1", "2") def test_view_Index(self): self.index.view(Index) def test_prevent_casting(self): - result = self.index.astype('O') + result = self.index.astype("O") assert result.dtype == np.object_ def test_take_preserve_name(self): - index = RangeIndex(1, 5, name='foo') + index = RangeIndex(1, 5, name="foo") taken = index.take([3, 0, 1]) assert index.name == taken.name def test_take_fill_value(self): # GH 12631 - idx = pd.RangeIndex(1, 4, name='xxx') + idx = pd.RangeIndex(1, 4, name="xxx") result = idx.take(np.array([1, 0, -1])) - expected = pd.Int64Index([2, 1, 3], name='xxx') + expected = pd.Int64Index([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) # fill_value @@ -746,9 +794,8 @@ def test_take_fill_value(self): idx.take(np.array([1, 0, -1]), fill_value=True) # allow_fill=False - result = idx.take(np.array([1, 0, -1]), allow_fill=False, - fill_value=True) - expected = pd.Int64Index([2, 1, 3], name='xxx') + result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) + expected = pd.Int64Index([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) msg = "Unable to fill values because RangeIndex cannot contain NA" @@ -761,16 +808,14 @@ def test_take_fill_value(self): idx.take(np.array([1, -5])) def test_print_unicode_columns(self): - df = pd.DataFrame({"\u05d0": [1, 2, 3], - "\u05d1": [4, 5, 6], - "c": [7, 8, 9]}) + df = pd.DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]}) repr(df.columns) # should not raise UnicodeDecodeError def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(self.index)), self.index) def test_slice_keep_name(self): - idx = RangeIndex(1, 2, name='asdf') + idx = RangeIndex(1, 2, name="asdf") assert idx.name == idx[1:].name def test_explicit_conversions(self): @@ -780,7 +825,7 @@ def test_explicit_conversions(self): idx = RangeIndex(5) # float conversions - arr = np.arange(5, dtype='int64') * 3.2 + arr = np.arange(5, dtype="int64") * 3.2 expected = Float64Index(arr) fidx = idx * 3.2 tm.assert_index_equal(fidx, expected) @@ -789,12 +834,12 @@ def test_explicit_conversions(self): # interops with numpy arrays expected = Float64Index(arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = fidx - a tm.assert_index_equal(result, expected) expected = Float64Index(-arr) - a = np.zeros(5, dtype='float64') + a = np.zeros(5, dtype="float64") result = a - fidx tm.assert_index_equal(result, expected) @@ -874,38 +919,38 @@ def test_slice_specialised(self): # positive slice values index = self.index[7:10:2] - expected = Index(np.array([14, 18]), name='foo') + expected = Index(np.array([14, 18]), name="foo") tm.assert_index_equal(index, expected) # negative slice values index = self.index[-1:-5:-2] - expected = Index(np.array([18, 14]), name='foo') + expected = Index(np.array([18, 14]), name="foo") tm.assert_index_equal(index, expected) # stop overshoot index = self.index[2:100:4] - expected = Index(np.array([4, 12]), name='foo') + expected = Index(np.array([4, 12]), name="foo") tm.assert_index_equal(index, expected) # reverse index = self.index[::-1] - expected = Index(self.index.values[::-1], name='foo') + expected = Index(self.index.values[::-1], name="foo") tm.assert_index_equal(index, expected) index = self.index[-8::-1] - expected = Index(np.array([4, 2, 0]), name='foo') + expected = Index(np.array([4, 2, 0]), name="foo") tm.assert_index_equal(index, expected) index = self.index[-40::-1] - expected = Index(np.array([], 
dtype=np.int64), name='foo') + expected = Index(np.array([], dtype=np.int64), name="foo") tm.assert_index_equal(index, expected) index = self.index[40::-1] - expected = Index(self.index.values[40::-1], name='foo') + expected = Index(self.index.values[40::-1], name="foo") tm.assert_index_equal(index, expected) index = self.index[10::-1] - expected = Index(self.index.values[::-1], name='foo') + expected = Index(self.index.values[::-1], name="foo") tm.assert_index_equal(index, expected) def test_len_specialised(self): @@ -931,27 +976,29 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) assert len(i) == 0 - @pytest.fixture(params=[ - ([RI(1, 12, 5)], RI(1, 12, 5)), - ([RI(0, 6, 4)], RI(0, 6, 4)), - ([RI(1, 3), RI(3, 7)], RI(1, 7)), - ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), - ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), - ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), - ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), - ([RI(-4, -8), RI(3, -4)], RI(0, 0)), - ([RI(-4, -8), RI(3, 5)], RI(3, 5)), - ([RI(-4, -2), RI(3, 5)], I64([-4, -3, 3, 4])), - ([RI(-2,), RI(3, 5)], RI(3, 5)), - ([RI(2,), RI(2)], I64([0, 1, 0, 1])), - ([RI(2,), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), - ([RI(2,), RI(3, 5), RI(5, 8, 4)], I64([0, 1, 3, 4, 5])), - ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), - ([RI(3,), I64([-1, 3, 15])], I64([0, 1, 2, -1, 3, 15])), - ([RI(3,), F64([-1, 3.1, 15.])], F64([0, 1, 2, -1, 3.1, 15.])), - ([RI(3,), OI(['a', None, 14])], OI([0, 1, 2, 'a', None, 14])), - ([RI(3, 1), OI(['a', None, 14])], OI(['a', None, 14])) - ]) + @pytest.fixture( + params=[ + ([RI(1, 12, 5)], RI(1, 12, 5)), + ([RI(0, 6, 4)], RI(0, 6, 4)), + ([RI(1, 3), RI(3, 7)], RI(1, 7)), + ([RI(1, 5, 2), RI(5, 6)], RI(1, 6, 2)), + ([RI(1, 3, 2), RI(4, 7, 3)], RI(1, 7, 3)), + ([RI(-4, 3, 2), RI(4, 7, 2)], RI(-4, 7, 2)), + ([RI(-4, -8), RI(-8, -12)], RI(0, 0)), + ([RI(-4, -8), RI(3, -4)], RI(0, 0)), + ([RI(-4, -8), RI(3, 5)], RI(3, 5)), + ([RI(-4, -2), RI(3, 5)], I64([-4, -3, 3, 4])), + ([RI(-2), RI(3, 5)], RI(3, 5)), + ([RI(2), RI(2)], I64([0, 1, 0, 1])), + ([RI(2), RI(2, 5), RI(5, 8, 4)], RI(0, 6)), + ([RI(2), RI(3, 5), RI(5, 8, 4)], I64([0, 1, 3, 4, 5])), + ([RI(-2, 2), RI(2, 5), RI(5, 8, 4)], RI(-2, 6)), + ([RI(3), I64([-1, 3, 15])], I64([0, 1, 2, -1, 3, 15])), + ([RI(3), F64([-1, 3.1, 15.0])], F64([0, 1, 2, -1, 3.1, 15.0])), + ([RI(3), OI(["a", None, 14])], OI([0, 1, 2, "a", None, 14])), + ([RI(3, 1), OI(["a", None, 14])], OI(["a", None, 14])), + ] + ) def appends(self, request): """Inputs and expected outputs for RangeIndex.append test""" @@ -977,15 +1024,16 @@ def test_engineless_lookup(self): idx = RangeIndex(2, 10, 3) assert idx.get_loc(5) == 1 - tm.assert_numpy_array_equal(idx.get_indexer([2, 8]), - ensure_platform_int(np.array([0, 2]))) + tm.assert_numpy_array_equal( + idx.get_indexer([2, 8]), ensure_platform_int(np.array([0, 2])) + ) with pytest.raises(KeyError): idx.get_loc(3) - assert '_engine' not in idx._cache + assert "_engine" not in idx._cache # The engine is still required for lookup of a different dtype scalar: with pytest.raises(KeyError): - assert idx.get_loc('a') == -1 + assert idx.get_loc("a") == -1 - assert '_engine' in idx._cache + assert "_engine" in idx._cache diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 8c0762c7e7e5a..b3850f7a4e09e 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -1,7 +1,7 @@ -''' +""" The tests in this package are to ensure the proper resultant dtypes of set operations. 
-''' +""" from collections import OrderedDict import itertools as it @@ -16,16 +16,20 @@ from pandas.tests.indexes.conftest import indices_list import pandas.util.testing as tm -COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict([ - ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), - ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), - ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), -]) +COMPATIBLE_INCONSISTENT_PAIRS = OrderedDict( + [ + ((Int64Index, RangeIndex), (tm.makeIntIndex, tm.makeRangeIndex)), + ((Float64Index, Int64Index), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, RangeIndex), (tm.makeFloatIndex, tm.makeIntIndex)), + ((Float64Index, UInt64Index), (tm.makeFloatIndex, tm.makeUIntIndex)), + ] +) -@pytest.fixture(params=list(it.combinations(indices_list, 2)), - ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__) +@pytest.fixture( + params=list(it.combinations(indices_list, 2)), + ids=lambda x: type(x[0]).__name__ + type(x[1]).__name__, +) def index_pair(request): """ Create all combinations of 2 index types. @@ -46,13 +50,13 @@ def test_union_different_types(index_pair): idx1, idx2 = index_pair type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: - pytest.xfail('This test only considers non compatible indexes.') + pytest.xfail("This test only considers non compatible indexes.") if any(isinstance(idx, pd.MultiIndex) for idx in index_pair): - pytest.xfail('This test doesn\'t consider multiindixes.') + pytest.xfail("This test doesn't consider multiindixes.") if is_dtype_equal(idx1.dtype, idx2.dtype): - pytest.xfail('This test only considers non matching dtypes.') + pytest.xfail("This test only considers non matching dtypes.") # A union with a CategoricalIndex (even as dtype('O')) and a # non-CategoricalIndex can only be made if both indices are monotonic. 
@@ -63,12 +67,11 @@ def test_union_different_types(index_pair): idx1 = idx1.sort_values() idx2 = idx2.sort_values() - assert idx1.union(idx2).dtype == np.dtype('O') - assert idx2.union(idx1).dtype == np.dtype('O') + assert idx1.union(idx2).dtype == np.dtype("O") + assert idx2.union(idx1).dtype == np.dtype("O") -@pytest.mark.parametrize('idx_fact1,idx_fact2', - COMPATIBLE_INCONSISTENT_PAIRS.values()) +@pytest.mark.parametrize("idx_fact1,idx_fact2", COMPATIBLE_INCONSISTENT_PAIRS.values()) def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): # GH 23525 idx1 = idx_fact1(10) @@ -81,23 +84,26 @@ def test_compatible_inconsistent_pairs(idx_fact1, idx_fact2): assert res2.dtype in (idx1.dtype, idx2.dtype) -@pytest.mark.parametrize('left, right, expected', [ - ('int64', 'int64', 'int64'), - ('int64', 'uint64', 'object'), - ('int64', 'float64', 'float64'), - ('uint64', 'float64', 'float64'), - ('uint64', 'uint64', 'uint64'), - ('float64', 'float64', 'float64'), - ('datetime64[ns]', 'int64', 'object'), - ('datetime64[ns]', 'uint64', 'object'), - ('datetime64[ns]', 'float64', 'object'), - ('datetime64[ns, CET]', 'int64', 'object'), - ('datetime64[ns, CET]', 'uint64', 'object'), - ('datetime64[ns, CET]', 'float64', 'object'), - ('Period[D]', 'int64', 'object'), - ('Period[D]', 'uint64', 'object'), - ('Period[D]', 'float64', 'object'), -]) +@pytest.mark.parametrize( + "left, right, expected", + [ + ("int64", "int64", "int64"), + ("int64", "uint64", "object"), + ("int64", "float64", "float64"), + ("uint64", "float64", "float64"), + ("uint64", "uint64", "uint64"), + ("float64", "float64", "float64"), + ("datetime64[ns]", "int64", "object"), + ("datetime64[ns]", "uint64", "object"), + ("datetime64[ns]", "float64", "object"), + ("datetime64[ns, CET]", "int64", "object"), + ("datetime64[ns, CET]", "uint64", "object"), + ("datetime64[ns, CET]", "float64", "object"), + ("Period[D]", "int64", "object"), + ("Period[D]", "uint64", "object"), + ("Period[D]", "float64", "object"), + ], +) def test_union_dtypes(left, right, expected): left = pandas_dtype(left) right = pandas_dtype(right) diff --git a/pandas/tests/indexes/timedeltas/test_arithmetic.py b/pandas/tests/indexes/timedeltas/test_arithmetic.py index 5ede6a289d42f..0f51a6333ab2d 100644 --- a/pandas/tests/indexes/timedeltas/test_arithmetic.py +++ b/pandas/tests/indexes/timedeltas/test_arithmetic.py @@ -10,15 +10,21 @@ import pandas.util.testing as tm -@pytest.fixture(params=[pd.offsets.Hour(2), timedelta(hours=2), - np.timedelta64(2, 'h'), Timedelta(hours=2)], - ids=str) +@pytest.fixture( + params=[ + pd.offsets.Hour(2), + timedelta(hours=2), + np.timedelta64(2, "h"), + Timedelta(hours=2), + ], + ids=str, +) def delta(request): # Several ways of representing two hours return request.param -@pytest.fixture(params=['B', 'D']) +@pytest.fixture(params=["B", "D"]) def freq(request): return request.param @@ -31,52 +37,63 @@ class TestTimedeltaIndexArithmetic: def test_tdi_shift_empty(self): # GH#9903 - idx = pd.TimedeltaIndex([], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - tm.assert_index_equal(idx.shift(3, freq='H'), idx) + idx = pd.TimedeltaIndex([], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + tm.assert_index_equal(idx.shift(3, freq="H"), idx) def test_tdi_shift_hours(self): # GH#9903 - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='H'), idx) - exp = pd.TimedeltaIndex(['8 hours', '9 hours', '12 hours'], name='xxx') - 
tm.assert_index_equal(idx.shift(3, freq='H'), exp) - exp = pd.TimedeltaIndex(['2 hours', '3 hours', '6 hours'], name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="H"), idx) + exp = pd.TimedeltaIndex(["8 hours", "9 hours", "12 hours"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="H"), exp) + exp = pd.TimedeltaIndex(["2 hours", "3 hours", "6 hours"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="H"), exp) def test_tdi_shift_minutes(self): # GH#9903 - idx = pd.TimedeltaIndex(['5 hours', '6 hours', '9 hours'], name='xxx') - tm.assert_index_equal(idx.shift(0, freq='T'), idx) - exp = pd.TimedeltaIndex(['05:03:00', '06:03:00', '9:03:00'], - name='xxx') - tm.assert_index_equal(idx.shift(3, freq='T'), exp) - exp = pd.TimedeltaIndex(['04:57:00', '05:57:00', '8:57:00'], - name='xxx') - tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + idx = pd.TimedeltaIndex(["5 hours", "6 hours", "9 hours"], name="xxx") + tm.assert_index_equal(idx.shift(0, freq="T"), idx) + exp = pd.TimedeltaIndex(["05:03:00", "06:03:00", "9:03:00"], name="xxx") + tm.assert_index_equal(idx.shift(3, freq="T"), exp) + exp = pd.TimedeltaIndex(["04:57:00", "05:57:00", "8:57:00"], name="xxx") + tm.assert_index_equal(idx.shift(-3, freq="T"), exp) def test_tdi_shift_int(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) result = trange.shift(1) - expected = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00', - '3 days 01:00:00', - '4 days 01:00:00', '5 days 01:00:00'], - freq='D') + expected = TimedeltaIndex( + [ + "1 days 01:00:00", + "2 days 01:00:00", + "3 days 01:00:00", + "4 days 01:00:00", + "5 days 01:00:00", + ], + freq="D", + ) tm.assert_index_equal(result, expected) def test_tdi_shift_nonstandard_freq(self): # GH#8083 - trange = pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - result = trange.shift(3, freq='2D 1s') - expected = TimedeltaIndex(['6 days 01:00:03', '7 days 01:00:03', - '8 days 01:00:03', '9 days 01:00:03', - '10 days 01:00:03'], freq='D') + trange = pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + result = trange.shift(3, freq="2D 1s") + expected = TimedeltaIndex( + [ + "6 days 01:00:03", + "7 days 01:00:03", + "8 days 01:00:03", + "9 days 01:00:03", + "10 days 01:00:03", + ], + freq="D", + ) tm.assert_index_equal(result, expected) def test_shift_no_freq(self): # GH#19147 - tdi = TimedeltaIndex(['1 days 01:00:00', '2 days 01:00:00'], freq=None) + tdi = TimedeltaIndex(["1 days 01:00:00", "2 days 01:00:00"], freq=None) with pytest.raises(NullFrequencyError): tdi.shift(2) @@ -85,32 +102,32 @@ def test_shift_no_freq(self): def test_tdi_add_int(self, one): # Variants of `one` for #19012 - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng + one - expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 10:00:00", freq="H", periods=10) tm.assert_index_equal(result, expected) def test_tdi_iadd_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - expected = timedelta_range('1 days 10:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + expected = 
timedelta_range("1 days 10:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 rng += one tm.assert_index_equal(rng, expected) def test_tdi_sub_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng - one - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) tm.assert_index_equal(result, expected) def test_tdi_isub_int(self, one): - rng = timedelta_range('1 days 09:00:00', freq='H', periods=10) - expected = timedelta_range('1 days 08:00:00', freq='H', periods=10) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=10) + expected = timedelta_range("1 days 08:00:00", freq="H", periods=10) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 rng -= one @@ -119,12 +136,12 @@ def test_tdi_isub_int(self, one): # ------------------------------------------------------------- # __add__/__sub__ with integer arrays - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_add_integer_array(self, box): # GH#19959 - rng = timedelta_range('1 days 09:00:00', freq='H', periods=3) + rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) other = box([4, 3, 2]) - expected = TimedeltaIndex(['1 day 13:00:00'] * 3) + expected = TimedeltaIndex(["1 day 13:00:00"] * 3) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng + other @@ -135,12 +152,12 @@ def test_tdi_add_integer_array(self, box): result = other + rng tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_sub_integer_array(self, box): # GH#19959 - rng = timedelta_range('9H', freq='H', periods=3) + rng = timedelta_range("9H", freq="H", periods=3) other = box([4, 3, 2]) - expected = TimedeltaIndex(['5H', '7H', '9H']) + expected = TimedeltaIndex(["5H", "7H", "9H"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # GH#22535 result = rng - other @@ -151,10 +168,10 @@ def test_tdi_sub_integer_array(self, box): result = other - rng tm.assert_index_equal(result, -expected) - @pytest.mark.parametrize('box', [np.array, pd.Index]) + @pytest.mark.parametrize("box", [np.array, pd.Index]) def test_tdi_addsub_integer_array_no_freq(self, box): # GH#19959 - tdi = TimedeltaIndex(['1 Day', 'NaT', '3 Hours']) + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) other = box([14, -1, 16]) with pytest.raises(NullFrequencyError): tdi + other @@ -172,16 +189,15 @@ def test_tdi_addsub_integer_array_no_freq(self, box): def test_tdi_iadd_timedeltalike(self, delta): # only test adding/sub offsets as + is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('1 days 02:00:00', '10 days 02:00:00', - freq='D') + rng = timedelta_range("1 days", "10 days") + expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") rng += delta tm.assert_index_equal(rng, expected) def test_tdi_isub_timedeltalike(self, delta): # only test adding/sub offsets as - is now numeric - rng = timedelta_range('1 days', '10 days') - expected = timedelta_range('0 days 22:00:00', '9 days 22:00:00') + rng = timedelta_range("1 days", "10 days") + expected = 
timedelta_range("0 days 22:00:00", "9 days 22:00:00") rng -= delta tm.assert_index_equal(rng, expected) @@ -189,89 +205,90 @@ def test_tdi_isub_timedeltalike(self, delta): # TODO: after #24365 this probably belongs in scalar tests def test_ops_ndarray(self): - td = Timedelta('1 day') + td = Timedelta("1 day") # timedelta, timedelta - other = pd.to_timedelta(['1 day']).values - expected = pd.to_timedelta(['2 days']).values + other = pd.to_timedelta(["1 day"]).values + expected = pd.to_timedelta(["2 days"]).values tm.assert_numpy_array_equal(td + other, expected) tm.assert_numpy_array_equal(other + td, expected) msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td + np.array([1]) - msg = (r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" - " 'Timedelta'") + msg = ( + r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and" " 'Timedelta'" + ) with pytest.raises(TypeError, match=msg): np.array([1]) + td - expected = pd.to_timedelta(['0 days']).values + expected = pd.to_timedelta(["0 days"]).values tm.assert_numpy_array_equal(td - other, expected) tm.assert_numpy_array_equal(-other + td, expected) msg = r"unsupported operand type\(s\) for -: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td - np.array([1]) - msg = (r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" - " 'Timedelta'") + msg = r"unsupported operand type\(s\) for -: 'numpy.ndarray' and" " 'Timedelta'" with pytest.raises(TypeError, match=msg): np.array([1]) - td - expected = pd.to_timedelta(['2 days']).values + expected = pd.to_timedelta(["2 days"]).values tm.assert_numpy_array_equal(td * np.array([2]), expected) tm.assert_numpy_array_equal(np.array([2]) * td, expected) - msg = ("ufunc '?multiply'? 
cannot use operands with types" - r" dtype\('= -1') + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): @@ -169,46 +161,57 @@ def test_take_fill_value(self): class TestTimedeltaIndex: - def test_insert(self): - idx = TimedeltaIndex(['4day', '1day', '2day'], name='idx') + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") result = idx.insert(2, timedelta(days=5)) - exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') + exp = TimedeltaIndex(["4day", "1day", "5day", "2day"], name="idx") tm.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index - result = idx.insert(1, 'inserted') - expected = Index([Timedelta('4day'), 'inserted', Timedelta('1day'), - Timedelta('2day')], name='idx') + result = idx.insert(1, "inserted") + expected = Index( + [Timedelta("4day"), "inserted", Timedelta("1day"), Timedelta("2day")], + name="idx", + ) assert not isinstance(result, TimedeltaIndex) tm.assert_index_equal(result, expected) assert result.name == expected.name - idx = timedelta_range('1day 00:00:01', periods=3, freq='s', name='idx') + idx = timedelta_range("1day 00:00:01", periods=3, freq="s", name="idx") # preserve freq - expected_0 = TimedeltaIndex(['1day', '1day 00:00:01', '1day 00:00:02', - '1day 00:00:03'], - name='idx', freq='s') - expected_3 = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:04'], - name='idx', freq='s') + expected_0 = TimedeltaIndex( + ["1day", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq="s", + ) + expected_3 = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:04"], + name="idx", + freq="s", + ) # reset freq to None - expected_1_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:01', - '1day 00:00:02', '1day 00:00:03'], - name='idx', freq=None) - expected_3_nofreq = TimedeltaIndex(['1day 00:00:01', '1day 00:00:02', - '1day 00:00:03', '1day 00:00:05'], - name='idx', freq=None) - - cases = [(0, Timedelta('1day'), expected_0), - (-3, Timedelta('1day'), expected_0), - (3, Timedelta('1day 00:00:04'), expected_3), - (1, Timedelta('1day 00:00:01'), expected_1_nofreq), - (3, Timedelta('1day 00:00:05'), expected_3_nofreq)] + expected_1_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:01", "1day 00:00:02", "1day 00:00:03"], + name="idx", + freq=None, + ) + expected_3_nofreq = TimedeltaIndex( + ["1day 00:00:01", "1day 00:00:02", "1day 00:00:03", "1day 00:00:05"], + name="idx", + freq=None, + ) + + cases = [ + (0, Timedelta("1day"), expected_0), + (-3, Timedelta("1day"), expected_0), + (3, Timedelta("1day 00:00:04"), expected_3), + (1, Timedelta("1day 00:00:01"), expected_1_nofreq), + (3, Timedelta("1day 00:00:05"), expected_3_nofreq), + ] for n, d, expected in cases: result = idx.insert(n, d) @@ -217,29 +220,30 @@ def test_insert(self): assert result.freq == expected.freq # GH 18295 (test missing) - expected = TimedeltaIndex(['1day', pd.NaT, '2day', '3day']) + expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) for na in (np.nan, pd.NaT, None): - result = timedelta_range('1day', '3day').insert(1, na) + result = timedelta_range("1day", "3day").insert(1, na) tm.assert_index_equal(result, expected) def test_delete(self): - idx = timedelta_range(start='1 Days', periods=5, freq='D', name='idx') + idx = timedelta_range(start="1 Days", periods=5, freq="D", 
name="idx") # prserve freq - expected_0 = timedelta_range(start='2 Days', periods=4, freq='D', - name='idx') - expected_4 = timedelta_range(start='1 Days', periods=4, freq='D', - name='idx') + expected_0 = timedelta_range(start="2 Days", periods=4, freq="D", name="idx") + expected_4 = timedelta_range(start="1 Days", periods=4, freq="D", name="idx") # reset freq to None expected_1 = TimedeltaIndex( - ['1 day', '3 day', '4 day', '5 day'], freq=None, name='idx') - - cases = {0: expected_0, - -5: expected_0, - -1: expected_4, - 4: expected_4, - 1: expected_1} + ["1 day", "3 day", "4 day", "5 day"], freq=None, name="idx" + ) + + cases = { + 0: expected_0, + -5: expected_0, + -1: expected_4, + 4: expected_4, + 1: expected_1, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -251,22 +255,22 @@ def test_delete(self): idx.delete(5) def test_delete_slice(self): - idx = timedelta_range(start='1 days', periods=10, freq='D', name='idx') + idx = timedelta_range(start="1 days", periods=10, freq="D", name="idx") # prserve freq - expected_0_2 = timedelta_range(start='4 days', periods=7, freq='D', - name='idx') - expected_7_9 = timedelta_range(start='1 days', periods=7, freq='D', - name='idx') + expected_0_2 = timedelta_range(start="4 days", periods=7, freq="D", name="idx") + expected_7_9 = timedelta_range(start="1 days", periods=7, freq="D", name="idx") # reset freq to None - expected_3_5 = TimedeltaIndex(['1 d', '2 d', '3 d', - '7 d', '8 d', '9 d', '10d'], - freq=None, name='idx') - - cases = {(0, 1, 2): expected_0_2, - (7, 8, 9): expected_7_9, - (3, 4, 5): expected_3_5} + expected_3_5 = TimedeltaIndex( + ["1 d", "2 d", "3 d", "7 d", "8 d", "9 d", "10d"], freq=None, name="idx" + ) + + cases = { + (0, 1, 2): expected_0_2, + (7, 8, 9): expected_7_9, + (3, 4, 5): expected_3_5, + } for n, expected in cases.items(): result = idx.delete(n) tm.assert_index_equal(result, expected) @@ -279,60 +283,63 @@ def test_delete_slice(self): assert result.freq == expected.freq def test_get_loc(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) - for method in [None, 'pad', 'backfill', 'nearest']: + for method in [None, "pad", "backfill", "nearest"]: assert idx.get_loc(idx[1], method) == 1 assert idx.get_loc(idx[1].to_pytimedelta(), method) == 1 assert idx.get_loc(str(idx[1]), method) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=Timedelta(0)) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=np.timedelta64(0, 's')) == 1 - assert idx.get_loc(idx[1], 'pad', - tolerance=timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=Timedelta(0)) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=np.timedelta64(0, "s")) == 1 + assert idx.get_loc(idx[1], "pad", tolerance=timedelta(0)) == 1 - with pytest.raises(ValueError, match='unit abbreviation w/o a number'): - idx.get_loc(idx[1], method='nearest', tolerance='foo') + with pytest.raises(ValueError, match="unit abbreviation w/o a number"): + idx.get_loc(idx[1], method="nearest", tolerance="foo") - with pytest.raises( - ValueError, - match='tolerance size must match'): - idx.get_loc(idx[1], method='nearest', - tolerance=[Timedelta(0).to_timedelta64(), - Timedelta(0).to_timedelta64()]) + with pytest.raises(ValueError, match="tolerance size must match"): + idx.get_loc( + idx[1], + method="nearest", + tolerance=[ + Timedelta(0).to_timedelta64(), + Timedelta(0).to_timedelta64(), + ], + ) - for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 
1)]: - assert idx.get_loc('1 day 1 hour', method) == loc + for method, loc in [("pad", 1), ("backfill", 2), ("nearest", 1)]: + assert idx.get_loc("1 day 1 hour", method) == loc # GH 16909 assert idx.get_loc(idx[1].to_timedelta64()) == 1 # GH 16896 - assert idx.get_loc('0 days') == 0 + assert idx.get_loc("0 days") == 0 def test_get_loc_nat(self): - tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + tidx = TimedeltaIndex(["1 days 01:00:00", "NaT", "2 days 01:00:00"]) assert tidx.get_loc(pd.NaT) == 1 assert tidx.get_loc(None) == 1 - assert tidx.get_loc(float('nan')) == 1 + assert tidx.get_loc(float("nan")) == 1 assert tidx.get_loc(np.nan) == 1 def test_get_indexer(self): - idx = pd.to_timedelta(['0 days', '1 days', '2 days']) - tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.intp)) - - target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.intp)) - tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.intp)) - - res = idx.get_indexer(target, 'nearest', - tolerance=Timedelta('1 hour')) + idx = pd.to_timedelta(["0 days", "1 days", "2 days"]) + tm.assert_numpy_array_equal( + idx.get_indexer(idx), np.array([0, 1, 2], dtype=np.intp) + ) + + target = pd.to_timedelta(["-1 hour", "12 hours", "1 day 1 hour"]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "pad"), np.array([-1, 0, 1], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "backfill"), np.array([0, 1, 2], dtype=np.intp) + ) + tm.assert_numpy_array_equal( + idx.get_indexer(target, "nearest"), np.array([0, 1, 1], dtype=np.intp) + ) + + res = idx.get_indexer(target, "nearest", tolerance=Timedelta("1 hour")) tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 9e96b7d99e35d..d7d8b10347861 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -27,31 +26,37 @@ def test_ops_properties(self): def test_value_counts_unique(self): # GH 7735 - idx = timedelta_range('1 days 09:00:00', freq='H', periods=10) + idx = timedelta_range("1 days 09:00:00", freq="H", periods=10) # create repeated values, 'n'th element is repeated by n+1 times idx = TimedeltaIndex(np.repeat(idx.values, range(1, len(idx) + 1))) - exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) - expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + exp_idx = timedelta_range("1 days 18:00:00", freq="-1H", periods=10) + expected = Series(range(10, 0, -1), index=exp_idx, dtype="int64") for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) + expected = timedelta_range("1 days 09:00:00", freq="H", periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', - '1 days 09:00:00', '1 days 08:00:00', - '1 days 08:00:00', pd.NaT]) - - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) + idx = TimedeltaIndex( + [ + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 09:00:00", + "1 days 08:00:00", + "1 days 08:00:00", + pd.NaT, + ] + ) + + 
exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00"]) expected = Series([3, 2], index=exp_idx) for obj in [idx, Series(idx)]: tm.assert_series_equal(obj.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', - pd.NaT]) + exp_idx = TimedeltaIndex(["1 days 09:00:00", "1 days 08:00:00", pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) for obj in [idx, Series(idx)]: @@ -61,26 +66,31 @@ def test_value_counts_unique(self): def test_nonunique_contains(self): # GH 9512 - for idx in map(TimedeltaIndex, ([0, 1, 0], [0, 0, -1], [0, -1, -1], - ['00:01:00', '00:01:00', '00:02:00'], - ['00:01:00', '00:01:00', '00:00:01'])): + for idx in map( + TimedeltaIndex, + ( + [0, 1, 0], + [0, 0, -1], + [0, -1, -1], + ["00:01:00", "00:01:00", "00:02:00"], + ["00:01:00", "00:01:00", "00:00:01"], + ), + ): assert idx[0] in idx def test_unknown_attribute(self): # see gh-9680 - tdi = pd.timedelta_range(start=0, periods=10, freq='1s') + tdi = pd.timedelta_range(start=0, periods=10, freq="1s") ts = pd.Series(np.random.normal(size=10), index=tdi) - assert 'foo' not in ts.__dict__.keys() + assert "foo" not in ts.__dict__.keys() msg = "'Series' object has no attribute 'foo'" with pytest.raises(AttributeError, match=msg): ts.foo def test_order(self): # GH 10295 - idx1 = TimedeltaIndex(['1 day', '2 day', '3 day'], freq='D', - name='idx') - idx2 = TimedeltaIndex( - ['1 hour', '2 hour', '3 hour'], freq='H', name='idx') + idx1 = TimedeltaIndex(["1 day", "2 day", "3 day"], freq="D", name="idx") + idx2 = TimedeltaIndex(["1 hour", "2 hour", "3 hour"], freq="H", name="idx") for idx in [idx1, idx2]: ordered = idx.sort_values() @@ -95,23 +105,24 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) tm.assert_index_equal(ordered, idx) - tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), - check_dtype=False) + tm.assert_numpy_array_equal(indexer, np.array([0, 1, 2]), check_dtype=False) assert ordered.freq == idx.freq - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, idx[::-1]) assert ordered.freq == expected.freq assert ordered.freq.n == -1 - idx1 = TimedeltaIndex(['1 hour', '3 hour', '5 hour', - '2 hour ', '1 hour'], name='idx1') - exp1 = TimedeltaIndex(['1 hour', '1 hour', '2 hour', - '3 hour', '5 hour'], name='idx1') + idx1 = TimedeltaIndex( + ["1 hour", "3 hour", "5 hour", "2 hour ", "1 hour"], name="idx1" + ) + exp1 = TimedeltaIndex( + ["1 hour", "1 hour", "2 hour", "3 hour", "5 hour"], name="idx1" + ) - idx2 = TimedeltaIndex(['1 day', '3 day', '5 day', - '2 day', '1 day'], name='idx2') + idx2 = TimedeltaIndex( + ["1 day", "3 day", "5 day", "2 day", "1 day"], name="idx2" + ) # TODO(wesm): unused? 
# exp2 = TimedeltaIndex(['1 day', '1 day', '2 day', @@ -138,8 +149,7 @@ def test_order(self): tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) assert ordered.freq is None - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) + ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) exp = np.array([2, 1, 3, 4, 0]) @@ -148,7 +158,7 @@ def test_order(self): def test_drop_duplicates_metadata(self): # GH 10115 - idx = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") result = idx.drop_duplicates() tm.assert_index_equal(idx, result) assert idx.freq == result.freq @@ -161,7 +171,7 @@ def test_drop_duplicates_metadata(self): def test_drop_duplicates(self): # to check Index/Series compat - base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + base = pd.timedelta_range("1 day", "31 day", freq="D", name="idx") idx = base.append(base[:5]) res = idx.drop_duplicates() @@ -169,10 +179,10 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates() tm.assert_series_equal(res, Series(base)) - res = idx.drop_duplicates(keep='last') + res = idx.drop_duplicates(keep="last") exp = base[5:].append(base[:5]) tm.assert_index_equal(res, exp) - res = Series(idx).drop_duplicates(keep='last') + res = Series(idx).drop_duplicates(keep="last") tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) res = idx.drop_duplicates(keep=False) @@ -180,13 +190,13 @@ def test_drop_duplicates(self): res = Series(idx).drop_duplicates(keep=False) tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) - @pytest.mark.parametrize('freq', ['D', '3D', '-3D', - 'H', '2H', '-2H', - 'T', '2T', 'S', '-3S']) + @pytest.mark.parametrize( + "freq", ["D", "3D", "-3D", "H", "2H", "-2H", "T", "2T", "S", "-3S"] + ) def test_infer_freq(self, freq): # GH#11018 - idx = pd.timedelta_range('1', freq=freq, periods=10) - result = pd.TimedeltaIndex(idx.asi8, freq='infer') + idx = pd.timedelta_range("1", freq=freq, periods=10) + result = pd.TimedeltaIndex(idx.asi8, freq="infer") tm.assert_index_equal(idx, result) assert result.freq == freq @@ -194,16 +204,26 @@ def test_shift(self): pass # handled in test_arithmetic.py def test_repeat(self): - index = pd.timedelta_range('1 days', periods=2, freq='D') - exp = pd.TimedeltaIndex(['1 days', '1 days', '2 days', '2 days']) + index = pd.timedelta_range("1 days", periods=2, freq="D") + exp = pd.TimedeltaIndex(["1 days", "1 days", "2 days", "2 days"]) for res in [index.repeat(2), np.repeat(index, 2)]: tm.assert_index_equal(res, exp) assert res.freq is None - index = TimedeltaIndex(['1 days', 'NaT', '3 days']) - exp = TimedeltaIndex(['1 days', '1 days', '1 days', - 'NaT', 'NaT', 'NaT', - '3 days', '3 days', '3 days']) + index = TimedeltaIndex(["1 days", "NaT", "3 days"]) + exp = TimedeltaIndex( + [ + "1 days", + "1 days", + "1 days", + "NaT", + "NaT", + "NaT", + "3 days", + "3 days", + "3 days", + ] + ) for res in [index.repeat(3), np.repeat(index, 3)]: tm.assert_index_equal(res, exp) assert res.freq is None @@ -212,25 +232,23 @@ def test_nat(self): assert pd.TimedeltaIndex._na_value is pd.NaT assert pd.TimedeltaIndex([])._na_value is pd.NaT - idx = pd.TimedeltaIndex(['1 days', '2 days']) + idx = pd.TimedeltaIndex(["1 days", "2 days"]) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) assert idx.hasnans is False - tm.assert_numpy_array_equal(idx._nan_idxs, 
- np.array([], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([], dtype=np.intp)) - idx = pd.TimedeltaIndex(['1 days', 'NaT']) + idx = pd.TimedeltaIndex(["1 days", "NaT"]) assert idx._can_hold_na tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) assert idx.hasnans is True - tm.assert_numpy_array_equal(idx._nan_idxs, - np.array([1], dtype=np.intp)) + tm.assert_numpy_array_equal(idx._nan_idxs, np.array([1], dtype=np.intp)) def test_equals(self): # GH 13107 - idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) + idx = pd.TimedeltaIndex(["1 days", "2 days", "NaT"]) assert idx.equals(idx) assert idx.equals(idx.copy()) assert idx.equals(idx.astype(object)) @@ -239,7 +257,7 @@ def test_equals(self): assert not idx.equals(list(idx)) assert not idx.equals(pd.Series(idx)) - idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) + idx2 = pd.TimedeltaIndex(["2 days", "1 days", "NaT"]) assert not idx.equals(idx2) assert not idx.equals(idx2.copy()) assert not idx.equals(idx2.astype(object)) @@ -248,8 +266,8 @@ def test_equals(self): assert not idx.equals(list(idx2)) assert not idx.equals(pd.Series(idx2)) - @pytest.mark.parametrize('values', [['0 days', '2 days', '4 days'], []]) - @pytest.mark.parametrize('freq', ['2D', Day(2), '48H', Hour(48)]) + @pytest.mark.parametrize("values", [["0 days", "2 days", "4 days"], []]) + @pytest.mark.parametrize("freq", ["2D", Day(2), "48H", Hour(48)]) def test_freq_setter(self, values, freq): # GH 20678 idx = TimedeltaIndex(values) @@ -265,19 +283,21 @@ def test_freq_setter(self, values, freq): def test_freq_setter_errors(self): # GH 20678 - idx = TimedeltaIndex(['0 days', '2 days', '4 days']) + idx = TimedeltaIndex(["0 days", "2 days", "4 days"]) # setting with an incompatible freq - msg = ('Inferred frequency 2D from passed values does not conform to ' - 'passed frequency 5D') + msg = ( + "Inferred frequency 2D from passed values does not conform to " + "passed frequency 5D" + ) with pytest.raises(ValueError, match=msg): - idx.freq = '5D' + idx.freq = "5D" # setting with a non-fixed frequency - msg = r'<2 \* BusinessDays> is a non-fixed frequency' + msg = r"<2 \* BusinessDays> is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - idx.freq = '2B' + idx.freq = "2B" # setting with non-freq string - with pytest.raises(ValueError, match='Invalid frequency'): - idx.freq = 'foo' + with pytest.raises(ValueError, match="Invalid frequency"): + idx.freq = "foo" diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 0c1ecffec2bf7..446b67d5f501d 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -9,51 +9,51 @@ class TestSlicing: def test_slice_keeps_name(self): # GH4226 - dr = pd.timedelta_range('1d', '5d', freq='H', name='timebucket') + dr = pd.timedelta_range("1d", "5d", freq="H", name="timebucket") assert dr[1:].name == dr.name def test_partial_slice(self): - rng = timedelta_range('1 day 10:11:12', freq='h', periods=500) + rng = timedelta_range("1 day 10:11:12", freq="h", periods=500) s = Series(np.arange(len(rng)), index=rng) - result = s['5 day':'6 day'] + result = s["5 day":"6 day"] expected = s.iloc[86:134] assert_series_equal(result, expected) - result = s['5 day':] + result = s["5 day":] expected = s.iloc[86:] assert_series_equal(result, expected) - result = s[:'6 day'] + result = s[:"6 day"] expected = s.iloc[:134] assert_series_equal(result, expected) - 
result = s['6 days, 23:11:12'] + result = s["6 days, 23:11:12"] assert result == s.iloc[133] msg = r"^Timedelta\('50 days 00:00:00'\)$" with pytest.raises(KeyError, match=msg): - s['50 days'] + s["50 days"] def test_partial_slice_high_reso(self): # higher reso - rng = timedelta_range('1 day 10:11:12', freq='us', periods=2000) + rng = timedelta_range("1 day 10:11:12", freq="us", periods=2000) s = Series(np.arange(len(rng)), index=rng) - result = s['1 day 10:11:12':] + result = s["1 day 10:11:12":] expected = s.iloc[0:] assert_series_equal(result, expected) - result = s['1 day 10:11:12.001':] + result = s["1 day 10:11:12.001":] expected = s.iloc[1000:] assert_series_equal(result, expected) - result = s['1 days, 10:11:12.001001'] + result = s["1 days, 10:11:12.001001"] assert result == s.iloc[1001] def test_slice_with_negative_step(self): - ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -61,27 +61,30 @@ def assert_slices_equivalent(l_slc, i_slc): assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_slices_equivalent(SLC[Timedelta(hours=7)::-1], SLC[7::-1]) - assert_slices_equivalent(SLC['7 hours'::-1], SLC[7::-1]) + assert_slices_equivalent(SLC[Timedelta(hours=7) :: -1], SLC[7::-1]) + assert_slices_equivalent(SLC["7 hours"::-1], SLC[7::-1]) - assert_slices_equivalent(SLC[:Timedelta(hours=7):-1], SLC[:6:-1]) - assert_slices_equivalent(SLC[:'7 hours':-1], SLC[:6:-1]) + assert_slices_equivalent(SLC[: Timedelta(hours=7) : -1], SLC[:6:-1]) + assert_slices_equivalent(SLC[:"7 hours":-1], SLC[:6:-1]) - assert_slices_equivalent(SLC['15 hours':'7 hours':-1], SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):Timedelta(hours=7):- - 1], SLC[15:6:-1]) - assert_slices_equivalent(SLC['15 hours':Timedelta(hours=7):-1], - SLC[15:6:-1]) - assert_slices_equivalent(SLC[Timedelta(hours=15):'7 hours':-1], - SLC[15:6:-1]) + assert_slices_equivalent(SLC["15 hours":"7 hours":-1], SLC[15:6:-1]) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC["15 hours" : Timedelta(hours=7) : -1], SLC[15:6:-1] + ) + assert_slices_equivalent( + SLC[Timedelta(hours=15) : "7 hours" : -1], SLC[15:6:-1] + ) - assert_slices_equivalent(SLC['7 hours':'15 hours':-1], SLC[:0]) + assert_slices_equivalent(SLC["7 hours":"15 hours":-1], SLC[:0]) def test_slice_with_zero_step_raises(self): - ts = Series(np.arange(20), timedelta_range('0', periods=20, freq='H')) - with pytest.raises(ValueError, match='slice step cannot be zero'): + ts = Series(np.arange(20), timedelta_range("0", periods=20, freq="H")) + with pytest.raises(ValueError, match="slice step cannot be zero"): ts[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): ts.loc[::0] diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index fd3c0f7eabe0c..38f1d2c7d4a1b 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -14,10 +14,11 @@ class TestVectorizedTimedelta: def test_tdi_total_seconds(self): # 
GH#10939 # test index - rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, - freq='s') - expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, - 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. / 1e9] + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + expt = [ + 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, + 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456.0 / 1e9, + ] tm.assert_almost_equal(rng.total_seconds(), Index(expt)) # test Series @@ -27,37 +28,44 @@ def test_tdi_total_seconds(self): # with nat ser[1] = np.nan - s_expt = Series([1 * 86400 + 10 * 3600 + 11 * 60 + - 12 + 100123456. / 1e9, np.nan], index=[0, 1]) + s_expt = Series( + [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9, np.nan], + index=[0, 1], + ) tm.assert_series_equal(ser.dt.total_seconds(), s_expt) # with both nat - ser = Series([np.nan, np.nan], dtype='timedelta64[ns]') - tm.assert_series_equal(ser.dt.total_seconds(), - Series([np.nan, np.nan], index=[0, 1])) + ser = Series([np.nan, np.nan], dtype="timedelta64[ns]") + tm.assert_series_equal( + ser.dt.total_seconds(), Series([np.nan, np.nan], index=[0, 1]) + ) def test_tdi_round(self): - td = pd.timedelta_range(start='16801 days', periods=5, freq='30Min') + td = pd.timedelta_range(start="16801 days", periods=5, freq="30Min") elt = td[1] - expected_rng = TimedeltaIndex([Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 00:00:00'), - Timedelta('16801 days 01:00:00'), - Timedelta('16801 days 02:00:00'), - Timedelta('16801 days 02:00:00')]) + expected_rng = TimedeltaIndex( + [ + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 00:00:00"), + Timedelta("16801 days 01:00:00"), + Timedelta("16801 days 02:00:00"), + Timedelta("16801 days 02:00:00"), + ] + ) expected_elt = expected_rng[1] - tm.assert_index_equal(td.round(freq='H'), expected_rng) - assert elt.round(freq='H') == expected_elt + tm.assert_index_equal(td.round(freq="H"), expected_rng) + assert elt.round(freq="H") == expected_elt msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - td.round(freq='foo') + td.round(freq="foo") with pytest.raises(ValueError, match=msg): - elt.round(freq='foo') + elt.round(freq="foo") msg = " is a non-fixed frequency" with pytest.raises(ValueError, match=msg): - td.round(freq='M') + td.round(freq="M") with pytest.raises(ValueError, match=msg): - elt.round(freq='M') + elt.round(freq="M") diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index a0dc74408a4af..861067480b5fa 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -9,24 +9,23 @@ class TestTimedeltaIndex: - def test_union(self): - i1 = timedelta_range('1day', periods=5) - i2 = timedelta_range('3day', periods=5) + i1 = timedelta_range("1day", periods=5) + i2 = timedelta_range("3day", periods=5) result = i1.union(i2) - expected = timedelta_range('1day', periods=7) + expected = timedelta_range("1day", periods=7) tm.assert_index_equal(result, expected) i1 = Int64Index(np.arange(0, 20, 2)) - i2 = timedelta_range(start='1 day', periods=10, freq='D') + i2 = timedelta_range(start="1 day", periods=10, freq="D") i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" def test_union_coverage(self): - idx = TimedeltaIndex(['3d', '1d', '2d']) - ordered = TimedeltaIndex(idx.sort_values(), freq='infer') + idx = TimedeltaIndex(["3d", "1d", "2d"]) + ordered = 
TimedeltaIndex(idx.sort_values(), freq="infer") result = ordered.union(idx) tm.assert_index_equal(result, ordered) @@ -36,8 +35,8 @@ def test_union_coverage(self): def test_union_bug_1730(self): - rng_a = timedelta_range('1 day', periods=4, freq='3H') - rng_b = timedelta_range('1 day', periods=4, freq='4H') + rng_a = timedelta_range("1 day", periods=4, freq="3H") + rng_b = timedelta_range("1 day", periods=4, freq="4H") result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) @@ -45,10 +44,10 @@ def test_union_bug_1730(self): def test_union_bug_1745(self): - left = TimedeltaIndex(['1 day 15:19:49.695000']) - right = TimedeltaIndex(['2 day 13:04:21.322000', - '1 day 15:27:24.873000', - '1 day 15:31:05.350000']) + left = TimedeltaIndex(["1 day 15:19:49.695000"]) + right = TimedeltaIndex( + ["2 day 13:04:21.322000", "1 day 15:27:24.873000", "1 day 15:31:05.350000"] + ) result = left.union(right) exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) @@ -64,25 +63,25 @@ def test_union_bug_4564(self): tm.assert_index_equal(result, exp) def test_intersection_bug_1708(self): - index_1 = timedelta_range('1 day', periods=4, freq='h') + index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(5) result = index_1 & index_2 assert len(result) == 0 - index_1 = timedelta_range('1 day', periods=4, freq='h') + index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(1) result = index_1 & index_2 - expected = timedelta_range('1 day 01:00:00', periods=3, freq='h') + expected = timedelta_range("1 day 01:00:00", periods=3, freq="h") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_equal(self, sort): # GH 24471 Test intersection outcome given the sort keyword # for equal indicies intersection should return the original index - first = timedelta_range('1 day', periods=4, freq='h') - second = timedelta_range('1 day', periods=4, freq='h') + first = timedelta_range("1 day", periods=4, freq="h") + second = timedelta_range("1 day", periods=4, freq="h") intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) @@ -96,17 +95,17 @@ def test_intersection_equal(self, sort): @pytest.mark.parametrize("sort", [None, False]) def test_intersection_zero_length(self, period_1, period_2, sort): # GH 24471 test for non overlap the intersection should be zero length - index_1 = timedelta_range('1 day', periods=period_1, freq='h') - index_2 = timedelta_range('1 day', periods=period_2, freq='h') - expected = timedelta_range('1 day', periods=0, freq='h') + index_1 = timedelta_range("1 day", periods=period_1, freq="h") + index_2 = timedelta_range("1 day", periods=period_2, freq="h") + expected = timedelta_range("1 day", periods=0, freq="h") result = index_1.intersection(index_2, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('sort', [None, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_zero_length_input_index(self, sort): # GH 24966 test for 0-len intersections are copied - index_1 = timedelta_range('1 day', periods=0, freq='h') - index_2 = timedelta_range('1 day', periods=3, freq='h') + index_1 = timedelta_range("1 day", periods=0, freq="h") + index_2 = timedelta_range("1 day", periods=3, freq="h") result = index_1.intersection(index_2, sort=sort) assert index_1 is not result assert index_2 is not result @@ -116,18 +115,26 @@ def 
test_zero_length_input_index(self, sort): "rng, expected", # if target has the same name, it is preserved [ - (timedelta_range('1 day', periods=5, freq='h', name='idx'), - timedelta_range('1 day', periods=4, freq='h', name='idx')), + ( + timedelta_range("1 day", periods=5, freq="h", name="idx"), + timedelta_range("1 day", periods=4, freq="h", name="idx"), + ), # if target name is different, it will be reset - (timedelta_range('1 day', periods=5, freq='h', name='other'), - timedelta_range('1 day', periods=4, freq='h', name=None)), + ( + timedelta_range("1 day", periods=5, freq="h", name="other"), + timedelta_range("1 day", periods=4, freq="h", name=None), + ), # if no overlap exists return empty index - (timedelta_range('1 day', periods=10, freq='h', name='idx')[5:], - TimedeltaIndex([], name='idx'))]) + ( + timedelta_range("1 day", periods=10, freq="h", name="idx")[5:], + TimedeltaIndex([], name="idx"), + ), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, rng, expected, sort): # GH 4690 (with tz) - base = timedelta_range('1 day', periods=4, freq='h', name='idx') + base = timedelta_range("1 day", periods=4, freq="h", name="idx") result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() @@ -139,23 +146,28 @@ def test_intersection(self, rng, expected, sort): "rng, expected", # part intersection works [ - (TimedeltaIndex(['5 hour', '2 hour', '4 hour', '9 hour'], - name='idx'), - TimedeltaIndex(['2 hour', '4 hour'], name='idx')), + ( + TimedeltaIndex(["5 hour", "2 hour", "4 hour", "9 hour"], name="idx"), + TimedeltaIndex(["2 hour", "4 hour"], name="idx"), + ), # reordered part intersection - (TimedeltaIndex(['2 hour', '5 hour', '5 hour', '1 hour'], - name='other'), - TimedeltaIndex(['1 hour', '2 hour'], name=None)), + ( + TimedeltaIndex(["2 hour", "5 hour", "5 hour", "1 hour"], name="other"), + TimedeltaIndex(["1 hour", "2 hour"], name=None), + ), # reveresed index - (TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx')[::-1], - TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx'))]) + ( + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx")[ + ::-1 + ], + TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx"), + ), + ], + ) @pytest.mark.parametrize("sort", [None, False]) def test_intersection_non_monotonic(self, rng, expected, sort): # 24471 non-monotonic - base = TimedeltaIndex(['1 hour', '2 hour', '4 hour', '3 hour'], - name='idx') + base = TimedeltaIndex(["1 hour", "2 hour", "4 hour", "3 hour"], name="idx") result = base.intersection(rng, sort=sort) if sort is None: expected = expected.sort_values() diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 79d064c57fa40..018ccfb2439dc 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -6,11 +6,21 @@ import pandas as pd from pandas import ( - DataFrame, Index, Int64Index, Series, Timedelta, TimedeltaIndex, - date_range, timedelta_range) + DataFrame, + Index, + Int64Index, + Series, + Timedelta, + TimedeltaIndex, + date_range, + timedelta_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_index_equal, assert_series_equal) + assert_almost_equal, + assert_index_equal, + assert_series_equal, +) from ..datetimelike import DatetimeLike @@ -25,7 +35,7 @@ def setup_method(self, method): self.setup_indices() def 
create_index(self): - return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + return pd.to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) def test_numeric_compat(self): # Dummy method to override super's version; this test is now done @@ -40,17 +50,18 @@ def test_pickle_compat_construction(self): def test_fillna_timedelta(self): # GH 11343 - idx = pd.TimedeltaIndex(['1 day', pd.NaT, '3 day']) + idx = pd.TimedeltaIndex(["1 day", pd.NaT, "3 day"]) - exp = pd.TimedeltaIndex(['1 day', '2 day', '3 day']) - tm.assert_index_equal(idx.fillna(pd.Timedelta('2 day')), exp) + exp = pd.TimedeltaIndex(["1 day", "2 day", "3 day"]) + tm.assert_index_equal(idx.fillna(pd.Timedelta("2 day")), exp) - exp = pd.TimedeltaIndex(['1 day', '3 hour', '3 day']) - idx.fillna(pd.Timedelta('3 hour')) + exp = pd.TimedeltaIndex(["1 day", "3 hour", "3 day"]) + idx.fillna(pd.Timedelta("3 hour")) exp = pd.Index( - [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) - tm.assert_index_equal(idx.fillna('x'), exp) + [pd.Timedelta("1 day"), "x", pd.Timedelta("3 day")], dtype=object + ) + tm.assert_index_equal(idx.fillna("x"), exp) @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): @@ -62,19 +73,20 @@ def test_difference_freq(self, sort): expected = TimedeltaIndex(["0 days", "5 days"], freq=None) idx_diff = index.difference(other, sort) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) expected = TimedeltaIndex(["0 days", "1 days"], freq=None) tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) @pytest.mark.parametrize("sort", [None, False]) def test_difference_sort(self, sort): - index = pd.TimedeltaIndex(["5 days", "3 days", "2 days", "4 days", - "1 days", "0 days"]) + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) other = timedelta_range("1 days", "4 days", freq="D") idx_diff = index.difference(other, sort) @@ -85,7 +97,7 @@ def test_difference_sort(self, sort): expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) other = timedelta_range("2 days", "5 days", freq="D") idx_diff = index.difference(other, sort) @@ -95,7 +107,7 @@ def test_difference_sort(self, sort): expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal('freq', idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) def test_isin(self): @@ -106,15 +118,15 @@ def test_isin(self): result = index.isin(list(index)) assert result.all() - assert_almost_equal(index.isin([index[2], 5]), - np.array([False, False, True, False])) + assert_almost_equal( + index.isin([index[2], 5]), np.array([False, False, True, False]) + ) def test_factorize(self): - idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', - '3 day']) + idx1 = TimedeltaIndex(["1 day", "1 day", "2 day", "2 day", "3 day", "3 day"]) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) - exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) + exp_idx = TimedeltaIndex(["1 day", "2 day", "3 day"]) arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) @@ -125,32 +137,36 @@ def test_factorize(self): 
tm.assert_index_equal(idx, exp_idx) # freq must be preserved - idx3 = timedelta_range('1 day', periods=4, freq='s') + idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) def test_join_self(self, join_type): - index = timedelta_range('1 day', periods=10) + index = timedelta_range("1 day", periods=10) joined = index.join(index, how=join_type) tm.assert_index_equal(index, joined) def test_does_not_convert_mixed_integer(self): - df = tm.makeCustomDataframe(10, 10, - data_gen_f=lambda *args, **kwargs: randn(), - r_idx_type='i', c_idx_type='td') + df = tm.makeCustomDataframe( + 10, + 10, + data_gen_f=lambda *args, **kwargs: randn(), + r_idx_type="i", + c_idx_type="td", + ) str(df) - cols = df.columns.join(df.index, how='outer') + cols = df.columns.join(df.index, how="outer") joined = cols.join(df.columns) - assert cols.dtype == np.dtype('O') + assert cols.dtype == np.dtype("O") assert cols.dtype == joined.dtype tm.assert_index_equal(cols, joined) def test_sort_values(self): - idx = TimedeltaIndex(['4d', '1d', '2d']) + idx = TimedeltaIndex(["4d", "1d", "2d"]) ordered = idx.sort_values() assert ordered.is_monotonic @@ -161,48 +177,44 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) assert ordered.is_monotonic - tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), - check_dtype=False) + tm.assert_numpy_array_equal(dexer, np.array([1, 2, 0]), check_dtype=False) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) assert ordered[::-1].is_monotonic - tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), - check_dtype=False) + tm.assert_numpy_array_equal(dexer, np.array([0, 2, 1]), check_dtype=False) def test_get_duplicates(self): - idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day', - '4day']) + idx = TimedeltaIndex(["1 day", "2 day", "2 day", "3 day", "3day", "4day"]) with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 result = idx.get_duplicates() - ex = TimedeltaIndex(['2 day', '3day']) + ex = TimedeltaIndex(["2 day", "3day"]) tm.assert_index_equal(result, ex) def test_argmin_argmax(self): - idx = TimedeltaIndex(['1 day 00:00:05', '1 day 00:00:01', - '1 day 00:00:02']) + idx = TimedeltaIndex(["1 day 00:00:05", "1 day 00:00:01", "1 day 00:00:02"]) assert idx.argmin() == 1 assert idx.argmax() == 0 def test_misc_coverage(self): - rng = timedelta_range('1 day', periods=5) + rng = timedelta_range("1 day", periods=5) result = rng.groupby(rng.days) assert isinstance(list(result.values())[0][0], Timedelta) - idx = TimedeltaIndex(['3d', '1d', '2d']) + idx = TimedeltaIndex(["3d", "1d", "2d"]) assert not idx.equals(list(idx)) - non_td = Index(list('abc')) + non_td = Index(list("abc")) assert not idx.equals(list(non_td)) def test_map(self): # test_map_dictlike generally tests - rng = timedelta_range('1 day', periods=10) + rng = timedelta_range("1 day", periods=10) f = lambda x: x.days result = rng.map(f) @@ -211,7 +223,7 @@ def test_map(self): def test_pass_TimedeltaIndex_to_index(self): - rng = timedelta_range('1 days', '10 days') + rng = timedelta_range("1 days", "10 days") idx = Index(rng, dtype=object) expected = Index(rng.to_pytimedelta(), dtype=object) @@ -220,56 +232,55 @@ def test_pass_TimedeltaIndex_to_index(self): def test_pickle(self): - rng = timedelta_range('1 days', periods=10) + rng = timedelta_range("1 days", periods=10) rng_p = 
tm.round_trip_pickle(rng) tm.assert_index_equal(rng, rng_p) def test_hash_error(self): - index = timedelta_range('1 days', periods=10) - with pytest.raises(TypeError, match=("unhashable type: %r" % - type(index).__name__)): + index = timedelta_range("1 days", periods=10) + with pytest.raises( + TypeError, match=("unhashable type: %r" % type(index).__name__) + ): hash(index) def test_append_join_nondatetimeindex(self): - rng = timedelta_range('1 days', periods=10) - idx = Index(['a', 'b', 'c', 'd']) + rng = timedelta_range("1 days", periods=10) + idx = Index(["a", "b", "c", "d"]) result = rng.append(idx) assert isinstance(result[0], Timedelta) # it works - rng.join(idx, how='outer') + rng.join(idx, how="outer") def test_append_numpy_bug_1681(self): - td = timedelta_range('1 days', '10 days', freq='2D') + td = timedelta_range("1 days", "10 days", freq="2D") a = DataFrame() - c = DataFrame({'A': 'foo', 'B': td}, index=td) + c = DataFrame({"A": "foo", "B": td}, index=td) str(c) result = a.append(c) - assert (result['B'] == td).all() + assert (result["B"] == td).all() def test_fields(self): - rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, - freq='s') - tm.assert_index_equal(rng.days, Index([1, 1], dtype='int64')) + rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") + tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) tm.assert_index_equal( rng.seconds, - Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], - dtype='int64')) + Index([10 * 3600 + 11 * 60 + 12, 10 * 3600 + 11 * 60 + 13], dtype="int64"), + ) tm.assert_index_equal( - rng.microseconds, - Index([100 * 1000 + 123, 100 * 1000 + 123], dtype='int64')) - tm.assert_index_equal(rng.nanoseconds, - Index([456, 456], dtype='int64')) + rng.microseconds, Index([100 * 1000 + 123, 100 * 1000 + 123], dtype="int64") + ) + tm.assert_index_equal(rng.nanoseconds, Index([456, 456], dtype="int64")) msg = "'TimedeltaIndex' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # with nat @@ -277,71 +288,69 @@ def test_fields(self): s[1] = np.nan tm.assert_series_equal(s.dt.days, Series([1, np.nan], index=[0, 1])) - tm.assert_series_equal(s.dt.seconds, Series( - [10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1])) + tm.assert_series_equal( + s.dt.seconds, Series([10 * 3600 + 11 * 60 + 12, np.nan], index=[0, 1]) + ) # preserve name (GH15589) - rng.name = 'name' - assert rng.days.name == 'name' + rng.name = "name" + assert rng.days.name == "name" def test_freq_conversion(self): # doc example # series - td = Series(date_range('20130101', periods=4)) - \ - Series(date_range('20121201', periods=4)) + td = Series(date_range("20130101", periods=4)) - Series( + date_range("20121201", periods=4) + ) td[2] += timedelta(minutes=5, seconds=3) td[3] = np.nan - result = td / np.timedelta64(1, 'D') - expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan - ]) + result = td / np.timedelta64(1, "D") + expected = Series([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) assert_series_equal(result, expected) - result = td.astype('timedelta64[D]') + result = td.astype("timedelta64[D]") 
expected = Series([31, 31, 31, np.nan]) assert_series_equal(result, expected) - result = td / np.timedelta64(1, 's') - expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, - np.nan]) + result = td / np.timedelta64(1, "s") + expected = Series([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) assert_series_equal(result, expected) - result = td.astype('timedelta64[s]') + result = td.astype("timedelta64[s]") assert_series_equal(result, expected) # tdi td = TimedeltaIndex(td) - result = td / np.timedelta64(1, 'D') + result = td / np.timedelta64(1, "D") expected = Index([31, 31, (31 * 86400 + 5 * 60 + 3) / 86400.0, np.nan]) assert_index_equal(result, expected) - result = td.astype('timedelta64[D]') + result = td.astype("timedelta64[D]") expected = Index([31, 31, 31, np.nan]) assert_index_equal(result, expected) - result = td / np.timedelta64(1, 's') - expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, - np.nan]) + result = td / np.timedelta64(1, "s") + expected = Index([31 * 86400, 31 * 86400, 31 * 86400 + 5 * 60 + 3, np.nan]) assert_index_equal(result, expected) - result = td.astype('timedelta64[s]') + result = td.astype("timedelta64[s]") assert_index_equal(result, expected) - @pytest.mark.parametrize('unit', ['Y', 'y', 'M']) + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_deprecated(self, unit): with tm.assert_produces_warning(FutureWarning) as w: TimedeltaIndex([1, 3, 7], unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w[0].message)) class TestTimeSeries: - def test_series_box_timedelta(self): - rng = timedelta_range('1 day 1 s', periods=5, freq='h') + rng = timedelta_range("1 day 1 s", periods=5, freq="h") s = Series(rng) assert isinstance(s[1], Timedelta) assert isinstance(s.iat[2], Timedelta) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 971cbe65b5da1..1c1d0f1a735cf 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -9,28 +9,26 @@ class TestTimedeltas: - def test_timedelta_range(self): - expected = to_timedelta(np.arange(5), unit='D') - result = timedelta_range('0 days', periods=5, freq='D') + expected = to_timedelta(np.arange(5), unit="D") + result = timedelta_range("0 days", periods=5, freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(11), unit='D') - result = timedelta_range('0 days', '10 days', freq='D') + expected = to_timedelta(np.arange(11), unit="D") + result = timedelta_range("0 days", "10 days", freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(5), unit='D') + Second(2) + Day() - result = timedelta_range('1 days, 00:00:02', '5 days, 00:00:02', - freq='D') + expected = to_timedelta(np.arange(5), unit="D") + Second(2) + Day() + result = timedelta_range("1 days, 00:00:02", "5 days, 00:00:02", freq="D") tm.assert_index_equal(result, expected) - expected = to_timedelta([1, 3, 5, 7, 9], unit='D') + Second(2) - result = timedelta_range('1 days, 00:00:02', periods=5, freq='2D') + expected = to_timedelta([1, 3, 5, 7, 9], unit="D") + Second(2) + result = timedelta_range("1 days, 00:00:02", periods=5, freq="2D") tm.assert_index_equal(result, expected) - expected = to_timedelta(np.arange(50), unit='T') * 30 - result = timedelta_range('0 days', freq='30T', periods=50) + expected = to_timedelta(np.arange(50), unit="T") * 30 + result = 
timedelta_range("0 days", freq="30T", periods=50) tm.assert_index_equal(result, expected) # GH 11776 @@ -39,34 +37,37 @@ def test_timedelta_range(self): for arg in (arr, df): with pytest.raises(TypeError, match="1-d array"): to_timedelta(arg) - for errors in ['ignore', 'raise', 'coerce']: + for errors in ["ignore", "raise", "coerce"]: with pytest.raises(TypeError, match="1-d array"): to_timedelta(arg, errors=errors) # issue10583 df = pd.DataFrame(np.random.normal(size=(10, 4))) - df.index = pd.timedelta_range(start='0s', periods=10, freq='s') - expected = df.loc[pd.Timedelta('0s'):, :] - result = df.loc['0s':, :] + df.index = pd.timedelta_range(start="0s", periods=10, freq="s") + expected = df.loc[pd.Timedelta("0s") :, :] + result = df.loc["0s":, :] tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('periods, freq', [ - (3, '2D'), (5, 'D'), (6, '19H12T'), (7, '16H'), (9, '12H')]) + @pytest.mark.parametrize( + "periods, freq", [(3, "2D"), (5, "D"), (6, "19H12T"), (7, "16H"), (9, "12H")] + ) def test_linspace_behavior(self, periods, freq): # GH 20976 - result = timedelta_range(start='0 days', end='4 days', periods=periods) - expected = timedelta_range(start='0 days', end='4 days', freq=freq) + result = timedelta_range(start="0 days", end="4 days", periods=periods) + expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) def test_errors(self): # not enough params - msg = ('Of the four parameters: start, end, periods, and freq, ' - 'exactly three must be specified') + msg = ( + "Of the four parameters: start, end, periods, and freq, " + "exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): - timedelta_range(start='0 days') + timedelta_range(start="0 days") with pytest.raises(ValueError, match=msg): - timedelta_range(end='5 days') + timedelta_range(end="5 days") with pytest.raises(ValueError, match=msg): timedelta_range(periods=2) @@ -76,4 +77,4 @@ def test_errors(self): # too many params with pytest.raises(ValueError, match=msg): - timedelta_range(start='0 days', end='5 days', periods=10, freq='H') + timedelta_range(start="0 days", end="5 days", periods=10, freq="H") diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 81e51fed788e4..4aed0b1af81a6 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -12,49 +12,52 @@ class TestTimedeltas: - def test_to_timedelta(self): def conv(v): - return v.astype('m8[ns]') + return v.astype("m8[ns]") - d1 = np.timedelta64(1, 'D') + d1 = np.timedelta64(1, "D") with tm.assert_produces_warning(FutureWarning): - assert (to_timedelta('1 days 06:05:01.00003', box=False) == - conv(d1 + np.timedelta64(6 * 3600 + 5 * 60 + 1, 's') + - np.timedelta64(30, 'us'))) + assert to_timedelta("1 days 06:05:01.00003", box=False) == conv( + d1 + + np.timedelta64(6 * 3600 + 5 * 60 + 1, "s") + + np.timedelta64(30, "us") + ) with tm.assert_produces_warning(FutureWarning): - assert (to_timedelta('15.5us', box=False) == - conv(np.timedelta64(15500, 'ns'))) + assert to_timedelta("15.5us", box=False) == conv( + np.timedelta64(15500, "ns") + ) # empty string - result = to_timedelta('', box=False) - assert result.astype('int64') == iNaT + result = to_timedelta("", box=False) + assert result.astype("int64") == iNaT - result = to_timedelta(['', '']) + result = to_timedelta(["", ""]) assert isna(result).all() # pass thru - result = to_timedelta(np.array([np.timedelta64(1, 
's')])) - expected = pd.Index(np.array([np.timedelta64(1, 's')])) + result = to_timedelta(np.array([np.timedelta64(1, "s")])) + expected = pd.Index(np.array([np.timedelta64(1, "s")])) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # ints - result = np.timedelta64(0, 'ns') + result = np.timedelta64(0, "ns") expected = to_timedelta(0, box=False) assert result == expected # Series expected = Series([timedelta(days=1), timedelta(days=1, seconds=1)]) - result = to_timedelta(Series(['1d', '1days 00:00:01'])) + result = to_timedelta(Series(["1d", "1days 00:00:01"])) tm.assert_series_equal(result, expected) # with units - result = TimedeltaIndex([np.timedelta64(0, 'ns'), np.timedelta64( - 10, 's').astype('m8[ns]')]) - expected = to_timedelta([0, 10], unit='s') + result = TimedeltaIndex( + [np.timedelta64(0, "ns"), np.timedelta64(10, "s").astype("m8[ns]")] + ) + expected = to_timedelta([0, 10], unit="s") tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): @@ -71,51 +74,51 @@ def conv(v): assert result == expected # arrays of various dtypes - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='s') - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="s") + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='m') - expected = TimedeltaIndex([np.timedelta64(1, 'm')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="m") + expected = TimedeltaIndex([np.timedelta64(1, "m")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='int64') - result = to_timedelta(arr, unit='h') - expected = TimedeltaIndex([np.timedelta64(1, 'h')] * 5) + arr = np.array([1] * 5, dtype="int64") + result = to_timedelta(arr, unit="h") + expected = TimedeltaIndex([np.timedelta64(1, "h")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='timedelta64[s]') + arr = np.array([1] * 5, dtype="timedelta64[s]") result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 's')] * 5) + expected = TimedeltaIndex([np.timedelta64(1, "s")] * 5) tm.assert_index_equal(result, expected) - arr = np.array([1] * 5, dtype='timedelta64[D]') + arr = np.array([1] * 5, dtype="timedelta64[D]") result = to_timedelta(arr) - expected = TimedeltaIndex([np.timedelta64(1, 'D')] * 5) + expected = TimedeltaIndex([np.timedelta64(1, "D")] * 5) tm.assert_index_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # Test with lists as input when box=false - expected = np.array(np.arange(3) * 1000000000, - dtype='timedelta64[ns]') - result = to_timedelta(range(3), unit='s', box=False) + expected = np.array(np.arange(3) * 1000000000, dtype="timedelta64[ns]") + result = to_timedelta(range(3), unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): - result = to_timedelta(np.arange(3), unit='s', box=False) + result = to_timedelta(np.arange(3), unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): - result = to_timedelta([0, 1, 2], unit='s', box=False) + result = to_timedelta([0, 1, 2], unit="s", box=False) tm.assert_numpy_array_equal(expected, result) with tm.assert_produces_warning(FutureWarning): # Tests with fractional seconds as input: expected = 
np.array( - [0, 500000000, 800000000, 1200000000], dtype='timedelta64[ns]') - result = to_timedelta([0., 0.5, 0.8, 1.2], unit='s', box=False) + [0, 500000000, 800000000, 1200000000], dtype="timedelta64[ns]" + ) + result = to_timedelta([0.0, 0.5, 0.8, 1.2], unit="s", box=False) tm.assert_numpy_array_equal(expected, result) def test_to_timedelta_invalid(self): @@ -123,85 +126,92 @@ def test_to_timedelta_invalid(self): # bad value for errors parameter msg = "errors must be one of" with pytest.raises(ValueError, match=msg): - to_timedelta(['foo'], errors='never') + to_timedelta(["foo"], errors="never") # these will error msg = "invalid unit abbreviation: foo" with pytest.raises(ValueError, match=msg): - to_timedelta([1, 2], unit='foo') + to_timedelta([1, 2], unit="foo") with pytest.raises(ValueError, match=msg): - to_timedelta(1, unit='foo') + to_timedelta(1, unit="foo") # time not supported ATM - msg = ("Value must be Timedelta, string, integer, float, timedelta or" - " convertible") + msg = ( + "Value must be Timedelta, string, integer, float, timedelta or" + " convertible" + ) with pytest.raises(ValueError, match=msg): to_timedelta(time(second=1)) - assert to_timedelta(time(second=1), errors='coerce') is pd.NaT + assert to_timedelta(time(second=1), errors="coerce") is pd.NaT msg = "unit abbreviation w/o a number" with pytest.raises(ValueError, match=msg): - to_timedelta(['foo', 'bar']) - tm.assert_index_equal(TimedeltaIndex([pd.NaT, pd.NaT]), - to_timedelta(['foo', 'bar'], errors='coerce')) + to_timedelta(["foo", "bar"]) + tm.assert_index_equal( + TimedeltaIndex([pd.NaT, pd.NaT]), + to_timedelta(["foo", "bar"], errors="coerce"), + ) - tm.assert_index_equal(TimedeltaIndex(['1 day', pd.NaT, '1 min']), - to_timedelta(['1 day', 'bar', '1 min'], - errors='coerce')) + tm.assert_index_equal( + TimedeltaIndex(["1 day", pd.NaT, "1 min"]), + to_timedelta(["1 day", "bar", "1 min"], errors="coerce"), + ) # gh-13613: these should not error because errors='ignore' - invalid_data = 'apple' - assert invalid_data == to_timedelta(invalid_data, errors='ignore') + invalid_data = "apple" + assert invalid_data == to_timedelta(invalid_data, errors="ignore") - invalid_data = ['apple', '1 days'] + invalid_data = ["apple", "1 days"] tm.assert_numpy_array_equal( np.array(invalid_data, dtype=object), - to_timedelta(invalid_data, errors='ignore')) + to_timedelta(invalid_data, errors="ignore"), + ) - invalid_data = pd.Index(['apple', '1 days']) - tm.assert_index_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) + invalid_data = pd.Index(["apple", "1 days"]) + tm.assert_index_equal(invalid_data, to_timedelta(invalid_data, errors="ignore")) - invalid_data = Series(['apple', '1 days']) - tm.assert_series_equal(invalid_data, to_timedelta( - invalid_data, errors='ignore')) + invalid_data = Series(["apple", "1 days"]) + tm.assert_series_equal( + invalid_data, to_timedelta(invalid_data, errors="ignore") + ) def test_to_timedelta_via_apply(self): # GH 5458 - expected = Series([np.timedelta64(1, 's')]) - result = Series(['00:00:01']).apply(to_timedelta) + expected = Series([np.timedelta64(1, "s")]) + result = Series(["00:00:01"]).apply(to_timedelta) tm.assert_series_equal(result, expected) - result = Series([to_timedelta('00:00:01')]) + result = Series([to_timedelta("00:00:01")]) tm.assert_series_equal(result, expected) def test_to_timedelta_on_missing_values(self): # GH5438 - timedelta_NaT = np.timedelta64('NaT') + timedelta_NaT = np.timedelta64("NaT") - actual = pd.to_timedelta(Series(['00:00:01', np.nan])) - 
expected = Series([np.timedelta64(1000000000, 'ns'), - timedelta_NaT], dtype='%-8.8s,obj->%-8.8s," - "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" % - (name, result, t, o, method1, method2, a, error or '')) + v = ( + "%-16.16s [%-16.16s]: [typ->%-8.8s,obj->%-8.8s," + "key1->(%-4.4s),key2->(%-4.4s),axis->%s] %s" + % (name, result, t, o, method1, method2, a, error or "") + ) if _verbose: pprint_thing(v) @@ -187,7 +213,7 @@ def _print(result, error=None): try: xp = self.get_result(obj, method2, k2, a) except Exception: - result = 'no comp' + result = "no comp" _print(result) return @@ -200,18 +226,18 @@ def _print(result, error=None): tm.assert_series_equal(rs, xp) elif xp.ndim == 2: tm.assert_frame_equal(rs, xp) - result = 'ok' + result = "ok" except AssertionError as e: detail = str(e) - result = 'fail' + result = "fail" # reverse the checks if fails is True: - if result == 'fail': - result = 'ok (fail)' + if result == "fail": + result = "ok (fail)" _print(result) - if not result.startswith('ok'): + if not result.startswith("ok"): raise AssertionError(detail) except AssertionError: @@ -221,7 +247,7 @@ def _print(result, error=None): # if we are in fails, the ok, otherwise raise it if fails is not None: if isinstance(detail, fails): - result = 'ok (%s)' % type(detail).__name__ + result = "ok (%s)" % type(detail).__name__ _print(result) return diff --git a/pandas/tests/indexing/conftest.py b/pandas/tests/indexing/conftest.py index be1cf4800a2ef..142bedaa943a6 100644 --- a/pandas/tests/indexing/conftest.py +++ b/pandas/tests/indexing/conftest.py @@ -4,17 +4,20 @@ from pandas._libs import index as libindex -@pytest.fixture(params=[ - (libindex.Int64Engine, np.int64), - (libindex.Int32Engine, np.int32), - (libindex.Int16Engine, np.int16), - (libindex.Int8Engine, np.int8), - (libindex.UInt64Engine, np.uint64), - (libindex.UInt32Engine, np.uint32), - (libindex.UInt16Engine, np.uint16), - (libindex.UInt8Engine, np.uint8), - (libindex.Float64Engine, np.float64), - (libindex.Float32Engine, np.float32), -], ids=lambda x: x[0].__name__) +@pytest.fixture( + params=[ + (libindex.Int64Engine, np.int64), + (libindex.Int32Engine, np.int32), + (libindex.Int16Engine, np.int16), + (libindex.Int8Engine, np.int8), + (libindex.UInt64Engine, np.uint64), + (libindex.UInt32Engine, np.uint32), + (libindex.UInt16Engine, np.uint16), + (libindex.UInt8Engine, np.uint8), + (libindex.Float64Engine, np.float64), + (libindex.Float32Engine, np.float32), + ], + ids=lambda x: x[0].__name__, +) def numeric_indexing_engine_type_and_dtype(request): return request.param diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 76f0b94ea3904..1bdb665101d41 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -7,7 +7,6 @@ class TestIntervalIndex: - def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) @@ -28,14 +27,14 @@ def test_getitem_with_scalar(self): expected = s.iloc[2:5] tm.assert_series_equal(expected, s[s >= 2]) - @pytest.mark.parametrize('direction', ['increasing', 'decreasing']) + @pytest.mark.parametrize("direction", ["increasing", "decreasing"]) def test_nonoverlapping_monotonic(self, direction, closed): tpls = [(0, 1), (2, 3), (4, 5)] - if direction == 'decreasing': + if direction == "decreasing": tpls = tpls[::-1] idx = IntervalIndex.from_tuples(tpls, closed=closed) - s = Series(list('abc'), idx) + s = Series(list("abc"), idx) for key, expected in 
zip(idx.left, s): if idx.closed_left: @@ -73,8 +72,9 @@ def test_non_matching(self): s.loc[[-1, 3]] def test_large_series(self): - s = Series(np.arange(1000000), - index=IntervalIndex.from_breaks(np.arange(1000001))) + s = Series( + np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) + ) result1 = s.loc[:80000] result2 = s.loc[0:80000] @@ -84,10 +84,10 @@ def test_large_series(self): def test_loc_getitem_frame(self): - df = DataFrame({'A': range(10)}) + df = DataFrame({"A": range(10)}) s = pd.cut(df.A, 5) - df['B'] = s - df = df.set_index('B') + df["B"] = s + df = df.set_index("B") result = df.loc[4] expected = df.iloc[4:6] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index aa016ac5dd1a7..92c71bbc6eb32 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -6,7 +6,6 @@ class TestIntervalIndex: - def setup_method(self, method): self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) @@ -32,10 +31,10 @@ def test_loc_with_interval(self): # missing or not exact with pytest.raises(KeyError): - s.loc[Interval(3, 5, closed='left')] + s.loc[Interval(3, 5, closed="left")] with pytest.raises(KeyError): - s[Interval(3, 5, closed='left')] + s[Interval(3, 5, closed="left")] with pytest.raises(KeyError): s[Interval(3, 5)] @@ -99,29 +98,29 @@ def test_loc_with_slices(self): # slice of interval expected = s.iloc[:3] - result = s.loc[Interval(0, 1):Interval(2, 3)] + result = s.loc[Interval(0, 1) : Interval(2, 3)] tm.assert_series_equal(expected, result) - result = s[Interval(0, 1):Interval(2, 3)] + result = s[Interval(0, 1) : Interval(2, 3)] tm.assert_series_equal(expected, result) expected = s.iloc[3:] - result = s.loc[Interval(3, 4):] + result = s.loc[Interval(3, 4) :] tm.assert_series_equal(expected, result) - result = s[Interval(3, 4):] + result = s[Interval(3, 4) :] tm.assert_series_equal(expected, result) - msg = 'Interval objects are not currently supported' + msg = "Interval objects are not currently supported" with pytest.raises(NotImplementedError, match=msg): - s.loc[Interval(3, 6):] + s.loc[Interval(3, 6) :] with pytest.raises(NotImplementedError, match=msg): - s[Interval(3, 6):] + s[Interval(3, 6) :] with pytest.raises(NotImplementedError, match=msg): - s.loc[Interval(3, 4, closed='left'):] + s.loc[Interval(3, 4, closed="left") :] with pytest.raises(NotImplementedError, match=msg): - s[Interval(3, 4, closed='left'):] + s[Interval(3, 4, closed="left") :] # TODO with non-existing intervals ? # s.loc[Interval(-1, 0):Interval(2, 3)] @@ -190,17 +189,17 @@ def test_loc_with_overlap(self): # slices with interval (only exact matches) expected = s - result = s.loc[Interval(1, 5):Interval(3, 7)] + result = s.loc[Interval(1, 5) : Interval(3, 7)] tm.assert_series_equal(expected, result) - result = s[Interval(1, 5):Interval(3, 7)] + result = s[Interval(1, 5) : Interval(3, 7)] tm.assert_series_equal(expected, result) with pytest.raises(KeyError): - s.loc[Interval(1, 6):Interval(3, 8)] + s.loc[Interval(1, 6) : Interval(3, 8)] with pytest.raises(KeyError): - s[Interval(1, 6):Interval(3, 8)] + s[Interval(1, 6) : Interval(3, 8)] # slices with scalar raise for overlapping intervals # TODO KeyError is the appropriate error? 
@@ -229,11 +228,11 @@ def test_non_unique_moar(self): tm.assert_series_equal(expected, result) expected = s - result = s.loc[Interval(1, 3):] + result = s.loc[Interval(1, 3) :] tm.assert_series_equal(expected, result) expected = s - result = s[Interval(1, 3):] + result = s[Interval(1, 3) :] tm.assert_series_equal(expected, result) expected = s.iloc[[0, 1]] diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index 545e092d9ce65..23149944f3c38 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -8,13 +8,14 @@ @pytest.fixture def multiindex_dataframe_random_data(): """DataFrame with 2 level MultiIndex with random data""" - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) @pytest.fixture @@ -22,10 +23,8 @@ def multiindex_year_month_day_dataframe_random_data(): """DataFrame with 3 level MultiIndex (year, month, day) covering first 100 business days from 2000-01-01 with random data""" tdf = tm.makeTimeDataFrame(100) - ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]).sum() + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels], - inplace=True) - ymd.index.set_names(['year', 'month', 'day'], inplace=True) + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index ff59e446a7b2e..3183721eeb54f 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -13,37 +13,38 @@ def test_detect_chained_assignment(): b = [123, None] c = [1234, 2345] d = [12345, 23456] - tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), - ('ears', 'right')] - events = {('eyes', 'left'): a, - ('eyes', 'right'): b, - ('ears', 'left'): c, - ('ears', 'right'): d} - multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) - zed = DataFrame(events, index=['a', 'b'], columns=multiind) + tuples = [("eyes", "left"), ("eyes", "right"), ("ears", "left"), ("ears", "right")] + events = { + ("eyes", "left"): a, + ("eyes", "right"): b, + ("ears", "left"): c, + ("ears", "right"): d, + } + multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) + zed = DataFrame(events, index=["a", "b"], columns=multiind) with pytest.raises(com.SettingWithCopyError): - zed['eyes']['right'].fillna(value=555, inplace=True) + zed["eyes"]["right"].fillna(value=555, inplace=True) def test_cache_updating(): # 5216 # make sure that we don't try to set a dead cache a = np.random.rand(10, 3) - df = DataFrame(a, columns=['x', 'y', 'z']) + df = DataFrame(a, columns=["x", "y", "z"]) tuples = [(i, j) for i in 
range(5) for j in range(2)] index = MultiIndex.from_tuples(tuples) df.index = index # setting via chained assignment # but actually works, since everything is a view - df.loc[0]['z'].iloc[0] = 1. - result = df.loc[(0, 0), 'z'] + df.loc[0]["z"].iloc[0] = 1.0 + result = df.loc[(0, 0), "z"] assert result == 1 # correct setting - df.loc[(0, 0), 'z'] = 2 - result = df.loc[(0, 0), 'z'] + df.loc[(0, 0), "z"] = 2 + result = df.loc[(0, 0), "z"] assert result == 2 diff --git a/pandas/tests/indexing/multiindex/test_datetime.py b/pandas/tests/indexing/multiindex/test_datetime.py index a270ab32e9b04..907d20cd5bd53 100644 --- a/pandas/tests/indexing/multiindex/test_datetime.py +++ b/pandas/tests/indexing/multiindex/test_datetime.py @@ -8,15 +8,15 @@ def test_multiindex_period_datetime(): # GH4861, using datetime in period of multiindex raises exception - idx1 = Index(['a', 'a', 'a', 'b', 'b']) - idx2 = period_range('2012-01', periods=len(idx1), freq='M') + idx1 = Index(["a", "a", "a", "b", "b"]) + idx2 = period_range("2012-01", periods=len(idx1), freq="M") s = Series(np.random.randn(len(idx1)), [idx1, idx2]) # try Period as index expected = s.iloc[0] - result = s.loc['a', Period('2012-01')] + result = s.loc["a", Period("2012-01")] assert result == expected # try datetime as index - result = s.loc['a', datetime(2012, 1, 1)] + result = s.loc["a", datetime(2012, 1, 1)] assert result == expected diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 2fbbdef33b633..0c61644eb46ae 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -10,13 +10,14 @@ # ---------------------------------------------------------------------------- -@pytest.mark.parametrize('access_method', [lambda s, x: s[:, x], - lambda s, x: s.loc[:, x], - lambda s, x: s.xs(x, level=1)]) -@pytest.mark.parametrize('level1_value, expected', [ - (0, Series([1], index=[0])), - (1, Series([2, 3], index=[1, 2])) -]) +@pytest.mark.parametrize( + "access_method", + [lambda s, x: s[:, x], lambda s, x: s.loc[:, x], lambda s, x: s.xs(x, level=1)], +) +@pytest.mark.parametrize( + "level1_value, expected", + [(0, Series([1], index=[0])), (1, Series([2, 3], index=[1, 2]))], +) def test_series_getitem_multiindex(access_method, level1_value, expected): # GH 6018 @@ -28,40 +29,37 @@ def test_series_getitem_multiindex(access_method, level1_value, expected): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('level0_value', ['D', 'A']) +@pytest.mark.parametrize("level0_value", ["D", "A"]) def test_series_getitem_duplicates_multiindex(level0_value): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! 
- index = MultiIndex(levels=[[level0_value, 'B', 'C'], - [0, 26, 27, 37, 57, 67, 75, 82]], - codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], - [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], - names=['tag', 'day']) + index = MultiIndex( + levels=[[level0_value, "B", "C"], [0, 26, 27, 37, 57, 67, 75, 82]], + codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], + names=["tag", "day"], + ) arr = np.random.randn(len(index), 1) - df = DataFrame(arr, index=index, columns=['val']) + df = DataFrame(arr, index=index, columns=["val"]) # confirm indexing on missing value raises KeyError - if level0_value != 'A': + if level0_value != "A": with pytest.raises(KeyError, match=r"^'A'$"): - df.val['A'] + df.val["A"] with pytest.raises(KeyError, match=r"^'X'$"): - df.val['X'] + df.val["X"] result = df.val[level0_value] - expected = Series(arr.ravel()[0:3], name='val', index=Index( - [26, 37, 57], name='day')) + expected = Series( + arr.ravel()[0:3], name="val", index=Index([26, 37, 57], name="day") + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda s: s[2000, 3], - lambda s: s.loc[2000, 3] -]) -def test_series_getitem( - multiindex_year_month_day_dataframe_random_data, indexer): - s = multiindex_year_month_day_dataframe_random_data['A'] +@pytest.mark.parametrize("indexer", [lambda s: s[2000, 3], lambda s: s.loc[2000, 3]]) +def test_series_getitem(multiindex_year_month_day_dataframe_random_data, indexer): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[42:65]) expected.index = expected.index.droplevel(0).droplevel(0) @@ -69,40 +67,50 @@ def test_series_getitem( tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda s: s[2000, 3, 10], - lambda s: s.loc[2000, 3, 10] -]) +@pytest.mark.parametrize( + "indexer", [lambda s: s[2000, 3, 10], lambda s: s.loc[2000, 3, 10]] +) def test_series_getitem_returns_scalar( - multiindex_year_month_day_dataframe_random_data, indexer): - s = multiindex_year_month_day_dataframe_random_data['A'] + multiindex_year_month_day_dataframe_random_data, indexer +): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.iloc[49] result = indexer(s) assert result == expected -@pytest.mark.parametrize('indexer,expected_error,expected_error_msg', [ - (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356$"), - (lambda s: s[(2000, 3, 4)], KeyError, r"^356$"), - (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^356$"), - (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, 'Too many indexers'), - (lambda s: s.__getitem__(len(s)), IndexError, 'index out of bounds'), - (lambda s: s[len(s)], IndexError, 'index out of bounds'), - (lambda s: s.iloc[len(s)], IndexError, - 'single positional indexer is out-of-bounds') -]) +@pytest.mark.parametrize( + "indexer,expected_error,expected_error_msg", + [ + (lambda s: s.__getitem__((2000, 3, 4)), KeyError, r"^356$"), + (lambda s: s[(2000, 3, 4)], KeyError, r"^356$"), + (lambda s: s.loc[(2000, 3, 4)], KeyError, r"^356$"), + (lambda s: s.loc[(2000, 3, 4, 5)], IndexingError, "Too many indexers"), + (lambda s: s.__getitem__(len(s)), IndexError, "index out of bounds"), + (lambda s: s[len(s)], IndexError, "index out of bounds"), + ( + lambda s: s.iloc[len(s)], + IndexError, + "single positional indexer is out-of-bounds", + ), + ], +) def test_series_getitem_indexing_errors( - multiindex_year_month_day_dataframe_random_data, indexer, - expected_error, expected_error_msg): - s = multiindex_year_month_day_dataframe_random_data['A'] + 
multiindex_year_month_day_dataframe_random_data, + indexer, + expected_error, + expected_error_msg, +): + s = multiindex_year_month_day_dataframe_random_data["A"] with pytest.raises(expected_error, match=expected_error_msg): indexer(s) def test_series_getitem_corner_generator( - multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] + multiindex_year_month_day_dataframe_random_data +): + s = multiindex_year_month_day_dataframe_random_data["A"] result = s[(x > 0 for x in s)] expected = s[s > 0] tm.assert_series_equal(result, expected) @@ -112,42 +120,55 @@ def test_series_getitem_corner_generator( # test indexing of DataFrame with multi-level Index # ---------------------------------------------------------------------------- + def test_getitem_simple(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data.T expected = df.values[:, 0] - result = df['foo', 'one'].values + result = df["foo", "one"].values tm.assert_almost_equal(result, expected) -@pytest.mark.parametrize('indexer,expected_error_msg', [ - (lambda df: df[('foo', 'four')], r"^\('foo', 'four'\)$"), - (lambda df: df['foobar'], r"^'foobar'$") -]) +@pytest.mark.parametrize( + "indexer,expected_error_msg", + [ + (lambda df: df[("foo", "four")], r"^\('foo', 'four'\)$"), + (lambda df: df["foobar"], r"^'foobar'$"), + ], +) def test_frame_getitem_simple_key_error( - multiindex_dataframe_random_data, indexer, expected_error_msg): + multiindex_dataframe_random_data, indexer, expected_error_msg +): df = multiindex_dataframe_random_data.T with pytest.raises(KeyError, match=expected_error_msg): indexer(df) def test_frame_getitem_multicolumn_empty_level(): - df = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) - df.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'], - ['level3 item1', 'level3 item2']] - - result = df['level1 item1'] - expected = DataFrame([['1'], ['2'], ['3']], index=df.index, - columns=['level3 item1']) + df = DataFrame({"a": ["1", "2", "3"], "b": ["2", "3", "4"]}) + df.columns = [ + ["level1 item1", "level1 item2"], + ["", "level2 item2"], + ["level3 item1", "level3 item2"], + ] + + result = df["level1 item1"] + expected = DataFrame( + [["1"], ["2"], ["3"]], index=df.index, columns=["level3 item1"] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer,expected_slice', [ - (lambda df: df['foo'], slice(3)), - (lambda df: df['bar'], slice(3, 5)), - (lambda df: df.loc[:, 'bar'], slice(3, 5)) -]) +@pytest.mark.parametrize( + "indexer,expected_slice", + [ + (lambda df: df["foo"], slice(3)), + (lambda df: df["bar"], slice(3, 5)), + (lambda df: df.loc[:, "bar"], slice(3, 5)), + ], +) def test_frame_getitem_toplevel( - multiindex_dataframe_random_data, indexer, expected_slice): + multiindex_dataframe_random_data, indexer, expected_slice +): df = multiindex_dataframe_random_data.T expected = df.reindex(columns=df.columns[expected_slice]) expected.columns = expected.columns.droplevel(0) @@ -156,21 +177,23 @@ def test_frame_getitem_toplevel( def test_frame_mixed_depth_get(): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(4, 6), columns=index) - result = 
df['a'] - expected = df['a', '', ''].rename('a') + result = df["a"] + expected = df["a", "", ""].rename("a") tm.assert_series_equal(result, expected) - result = df['routine1', 'result1'] - expected = df['routine1', 'result1', ''] - expected = expected.rename(('routine1', 'result1')) + result = df["routine1", "result1"] + expected = df["routine1", "result1", ""] + expected = expected.rename(("routine1", "result1")) tm.assert_series_equal(result, expected) @@ -178,30 +201,29 @@ def test_frame_mixed_depth_get(): # test indexing of DataFrame with multi-level Index with duplicates # ---------------------------------------------------------------------------- + @pytest.fixture def dataframe_with_duplicate_index(): """Fixture for DataFrame used in tests for gh-4145 and gh-4146""" - data = [['a', 'd', 'e', 'c', 'f', 'b'], - [1, 4, 5, 3, 6, 2], - [1, 4, 5, 3, 6, 2]] - index = ['h1', 'h3', 'h5'] + data = [["a", "d", "e", "c", "f", "b"], [1, 4, 5, 3, 6, 2], [1, 4, 5, 3, 6, 2]] + index = ["h1", "h3", "h5"] columns = MultiIndex( - levels=[['A', 'B'], ['A1', 'A2', 'B1', 'B2']], + levels=[["A", "B"], ["A1", "A2", "B1", "B2"]], codes=[[0, 0, 0, 1, 1, 1], [0, 3, 3, 0, 1, 2]], - names=['main', 'sub']) + names=["main", "sub"], + ) return DataFrame(data, index=index, columns=columns) -@pytest.mark.parametrize('indexer', [ - lambda df: df[('A', 'A1')], - lambda df: df.loc[:, ('A', 'A1')] -]) +@pytest.mark.parametrize( + "indexer", [lambda df: df[("A", "A1")], lambda df: df.loc[:, ("A", "A1")]] +) def test_frame_mi_access(dataframe_with_duplicate_index, indexer): # GH 4145 df = dataframe_with_duplicate_index - index = Index(['h1', 'h3', 'h5']) - columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub']) - expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T + index = Index(["h1", "h3", "h5"]) + columns = MultiIndex.from_tuples([("A", "A1")], names=["main", "sub"]) + expected = DataFrame([["a", 1, 1]], index=columns, columns=index).T result = indexer(df) tm.assert_frame_equal(result, expected) @@ -213,16 +235,18 @@ def test_frame_mi_access_returns_series(dataframe_with_duplicate_index): # as of 4879, this returns a Series (which is similar to what happens # with a non-unique) df = dataframe_with_duplicate_index - expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1') - result = df['A']['A1'] + expected = Series(["a", 1, 1], index=["h1", "h3", "h5"], name="A1") + result = df["A"]["A1"] tm.assert_series_equal(result, expected) def test_frame_mi_access_returns_frame(dataframe_with_duplicate_index): # selecting a non_unique from the 2nd level df = dataframe_with_duplicate_index - expected = DataFrame([['d', 4, 4], ['e', 5, 5]], - index=Index(['B2', 'B2'], name='sub'), - columns=['h1', 'h3', 'h5'], ).T - result = df['A']['B2'] + expected = DataFrame( + [["d", 4, 4], ["e", 5, 5]], + index=Index(["B2", "B2"], name="sub"), + columns=["h1", "h3", "h5"], + ).T + result = df["A"]["B2"] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_iloc.py b/pandas/tests/indexing/multiindex/test_iloc.py index bdd505804c82b..1335ee9dc2bdf 100644 --- a/pandas/tests/indexing/multiindex/test_iloc.py +++ b/pandas/tests/indexing/multiindex/test_iloc.py @@ -12,23 +12,34 @@ def simple_multiindex_dataframe(): both columns and row MultiIndex using supplied data or random data by default. 
""" + def _simple_multiindex_dataframe(data=None): if data is None: data = np.random.randn(3, 3) - return DataFrame(data, columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + return DataFrame( + data, columns=[[2, 2, 4], [6, 8, 10]], index=[[4, 4, 8], [8, 10, 12]] + ) + return _simple_multiindex_dataframe -@pytest.mark.parametrize('indexer, expected', [ - (lambda df: df.iloc[0], - lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8))), - (lambda df: df.iloc[2], - lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12))), - (lambda df: df.iloc[:, 2], - lambda arr: Series( - arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10))) -]) +@pytest.mark.parametrize( + "indexer, expected", + [ + ( + lambda df: df.iloc[0], + lambda arr: Series(arr[0], index=[[2, 2, 4], [6, 8, 10]], name=(4, 8)), + ), + ( + lambda df: df.iloc[2], + lambda arr: Series(arr[2], index=[[2, 2, 4], [6, 8, 10]], name=(8, 12)), + ), + ( + lambda df: df.iloc[:, 2], + lambda arr: Series(arr[:, 2], index=[[4, 4, 8], [8, 10, 12]], name=(4, 10)), + ), + ], +) def test_iloc_returns_series(indexer, expected, simple_multiindex_dataframe): arr = np.random.randn(3, 3) df = simple_multiindex_dataframe(arr) @@ -54,20 +65,22 @@ def test_iloc_returns_scalar(simple_multiindex_dataframe): def test_iloc_getitem_multiple_items(): # GH 5528 - tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + tup = zip(*[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) index = MultiIndex.from_tuples(tup) df = DataFrame(np.random.randn(4, 4), index=index) result = df.iloc[[2, 3]] - expected = df.xs('b', drop_level=False) + expected = df.xs("b", drop_level=False) tm.assert_frame_equal(result, expected) def test_iloc_getitem_labels(): # this is basically regular indexing arr = np.random.randn(4, 3) - df = DataFrame(arr, - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y', 'Y']]) + df = DataFrame( + arr, + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j", "k"], ["X", "X", "Y", "Y"]], + ) result = df.iloc[2, 2] expected = arr[2, 2] assert result == expected @@ -91,23 +104,30 @@ def test_frame_setitem_slice(multiindex_dataframe_random_data): def test_indexing_ambiguity_bug_1678(): # GH 1678 columns = MultiIndex.from_tuples( - [('Ohio', 'Green'), ('Ohio', 'Red'), ('Colorado', 'Green')]) - index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]) + [("Ohio", "Green"), ("Ohio", "Red"), ("Colorado", "Green")] + ) + index = MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) df = DataFrame(np.arange(12).reshape((4, 3)), index=index, columns=columns) result = df.iloc[:, 1] - expected = df.loc[:, ('Ohio', 'Red')] + expected = df.loc[:, ("Ohio", "Red")] tm.assert_series_equal(result, expected) def test_iloc_integer_locations(): # GH 13797 - data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], - ['str30', 'str31'], ['str40', 'str41']] + data = [ + ["str00", "str01"], + ["str10", "str11"], + ["str20", "srt21"], + ["str30", "str31"], + ["str40", "str41"], + ] index = MultiIndex.from_tuples( - [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) + [("CC", "A"), ("CC", "B"), ("CC", "B"), ("BB", "a"), ("BB", "b")] + ) expected = DataFrame(data) df = DataFrame(data, index=index) @@ -118,28 +138,28 @@ def test_iloc_integer_locations(): @pytest.mark.parametrize( - 'data, indexes, values, expected_k', [ + "data, indexes, values, expected_k", + [ # test without indexer value in first level of MultiIndex ([[2, 22, 5], 
[2, 33, 6]], [0, -1, 1], [2, 3, 1], [7, 10]), # test like code sample 1 in the issue - ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], - [755, 1066]), + ([[1, 22, 555], [1, 33, 666]], [0, -1, 1], [200, 300, 100], [755, 1066]), # test like code sample 2 in the issue ([[1, 3, 7], [2, 4, 8]], [0, -1, 1], [10, 10, 1000], [17, 1018]), # test like code sample 3 in the issue - ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], - [8, 15, 13]) - ]) + ([[1, 11, 4], [2, 22, 5], [3, 33, 6]], [0, -1, 1], [4, 7, 10], [8, 15, 13]), + ], +) def test_iloc_setitem_int_multiindex_series(data, indexes, values, expected_k): # GH17148 - df = DataFrame(data=data, columns=['i', 'j', 'k']) - df = df.set_index(['i', 'j']) + df = DataFrame(data=data, columns=["i", "j", "k"]) + df = df.set_index(["i", "j"]) series = df.k.copy() for i, v in zip(indexes, values): series.iloc[i] += v - df['k'] = expected_k + df["k"] = expected_k expected = df.k tm.assert_series_equal(series, expected) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index 1eb137eb65fa1..aab44daf8d17f 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -16,36 +16,37 @@ def test_multiindex_get_loc(): # GH7724, GH2646 # test indexing into a multi-index before & past the lexsort depth from numpy.random import randint, choice, randn - cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] + + cols = ["jim", "joe", "jolie", "joline", "jolia"] def validate(mi, df, key): - mask = np.ones(len(df)).astype('bool') + mask = np.ones(len(df)).astype("bool") # test for all partials of this key for i, k in enumerate(key): mask &= df.iloc[:, i] == k if not mask.any(): - assert key[:i + 1] not in mi.index + assert key[: i + 1] not in mi.index continue - assert key[:i + 1] in mi.index + assert key[: i + 1] in mi.index right = df[mask].copy() if i + 1 != len(key): # partial key - right.drop(cols[:i + 1], axis=1, inplace=True) - right.set_index(cols[i + 1:-1], inplace=True) - tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + right.drop(cols[: i + 1], axis=1, inplace=True) + right.set_index(cols[i + 1 : -1], inplace=True) + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) else: # full key right.set_index(cols[:-1], inplace=True) if len(right) == 1: # single hit - right = Series(right['jolia'].values, - name=right.index[0], - index=['jolia']) - tm.assert_series_equal(mi.loc[key[:i + 1]], right) + right = Series( + right["jolia"].values, name=right.index[0], index=["jolia"] + ) + tm.assert_series_equal(mi.loc[key[: i + 1]], right) else: # multi hit - tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + tm.assert_frame_equal(mi.loc[key[: i + 1]], right) def loop(mi, df, keys): for key in keys: @@ -53,19 +54,24 @@ def loop(mi, df, keys): n, m = 1000, 50 - vals = [randint(0, 10, n), choice( - list('abcdefghij'), n), choice( - pd.date_range('20141009', periods=10).tolist(), n), choice( - list('ZYXWVUTSRQ'), n), randn(n)] + vals = [ + randint(0, 10, n), + choice(list("abcdefghij"), n), + choice(pd.date_range("20141009", periods=10).tolist(), n), + choice(list("ZYXWVUTSRQ"), n), + randn(n), + ] vals = list(map(tuple, zip(*vals))) # bunch of keys for testing - keys = [randint(0, 11, m), choice( - list('abcdefghijk'), m), choice( - pd.date_range('20141009', periods=11).tolist(), m), choice( - list('ZYXWVUTSRQP'), m)] + keys = [ + randint(0, 11, m), + choice(list("abcdefghijk"), m), + 
choice(pd.date_range("20141009", periods=11).tolist(), m), + choice(list("ZYXWVUTSRQP"), m), + ] keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[::n // m])) + keys += list(map(lambda t: t[:-1], vals[:: n // m])) # covers both unique index and non-unique index df = DataFrame(vals, columns=cols) @@ -73,8 +79,7 @@ def loop(mi, df, keys): for frame in a, b: for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values( - by=cols[:i]) + df = frame.copy() if i == 0 else frame.sort_values(by=cols[:i]) mi = df.set_index(cols[:-1]) assert not mi.index.lexsort_depth < i loop(mi, df, keys) @@ -84,4 +89,4 @@ def loop(mi, df, keys): def test_large_mi_dataframe_indexing(): # GH10645 result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) - assert (not (10 ** 6, 0) in result) + assert not (10 ** 6, 0) in result diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 5ea172f14f6f6..d43115d60c029 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -11,34 +11,35 @@ @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexIx: - def test_frame_setitem_ix(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data - frame.loc[('bar', 'two'), 'B'] = 5 - assert frame.loc[('bar', 'two'), 'B'] == 5 + frame.loc[("bar", "two"), "B"] = 5 + assert frame.loc[("bar", "two"), "B"] == 5 # with integer labels df = frame.copy() df.columns = list(range(3)) - df.loc[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 + df.loc[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 with catch_warnings(record=True): simplefilter("ignore", FutureWarning) df = frame.copy() df.columns = list(range(3)) - df.ix[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 + df.ix[("bar", "two"), 1] = 7 + assert df.loc[("bar", "two"), 1] == 7 def test_ix_general(self): # ix general issues # GH 2817 - data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, - 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, - 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} - df = DataFrame(data).set_index(keys=['col', 'year']) + data = { + "amount": {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + "col": {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + "year": {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}, + } + df = DataFrame(data).set_index(keys=["col", "year"]) key = 4.0, 2012 # emits a PerformanceWarning, ok @@ -50,23 +51,25 @@ def test_ix_general(self): res = df.loc[key] # col has float dtype, result should be Float64Index - index = MultiIndex.from_arrays([[4.] 
* 3, [2012] * 3], - names=['col', 'year']) - expected = DataFrame({'amount': [222, 333, 444]}, index=index) + index = MultiIndex.from_arrays([[4.0] * 3, [2012] * 3], names=["col", "year"]) + expected = DataFrame({"amount": [222, 333, 444]}, index=index) tm.assert_frame_equal(res, expected) def test_ix_multiindex_missing_label_raises(self): # GH 21593 - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) with pytest.raises(KeyError, match=r"^2$"): df.ix[2] def test_series_ix_getitem_fancy( - self, multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] + self, multiindex_year_month_day_dataframe_random_data + ): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[49:51]) result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 962976b8ded55..9188adc7d6e93 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -12,8 +12,9 @@ @pytest.fixture def single_level_multiindex(): """single level MultiIndex""" - return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + return MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) @pytest.fixture @@ -25,17 +26,17 @@ def frame_random_data_integer_multi_index(): class TestMultiIndexLoc: - def test_loc_getitem_series(self): # GH14730 # passing a series as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) x = Series(index=index, data=range(9), dtype=np.float64) y = Series([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) result = x.loc[y] tm.assert_series_equal(result, expected) @@ -48,99 +49,107 @@ def test_loc_getitem_series(self): tm.assert_series_equal(result, expected) empty = Series(data=[], dtype=np.float64) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) + expected = Series( + [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + ) result = x.loc[empty] tm.assert_series_equal(result, expected) def test_loc_getitem_array(self): # GH15434 # passing an array as a key with a MultiIndex - index = MultiIndex.from_product([[1, 2, 3], ['A', 'B', 'C']]) + index = MultiIndex.from_product([[1, 2, 3], ["A", "B", "C"]]) x = Series(index=index, data=range(9), dtype=np.float64) y = np.array([1, 3]) expected = Series( data=[0, 1, 2, 6, 7, 8], - index=MultiIndex.from_product([[1, 3], ['A', 'B', 'C']]), - dtype=np.float64) + index=MultiIndex.from_product([[1, 3], ["A", "B", "C"]]), + dtype=np.float64, + ) result = x.loc[y] tm.assert_series_equal(result, expected) # empty array: empty = np.array([]) - expected = Series([], index=MultiIndex( - levels=index.levels, codes=[[], []], dtype=np.float64)) + expected = Series( + [], index=MultiIndex(levels=index.levels, codes=[[], []], dtype=np.float64) + ) result = x.loc[empty] tm.assert_series_equal(result, expected) # 0-dim array (scalar): scalar = 
np.int64(1) - expected = Series( - data=[0, 1, 2], - index=['A', 'B', 'C'], - dtype=np.float64) + expected = Series(data=[0, 1, 2], index=["A", "B", "C"], dtype=np.float64) result = x.loc[scalar] tm.assert_series_equal(result, expected) def test_loc_multiindex_labels(self): - df = DataFrame(np.random.randn(3, 3), - columns=[['i', 'i', 'j'], ['A', 'A', 'B']], - index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) + df = DataFrame( + np.random.randn(3, 3), + columns=[["i", "i", "j"], ["A", "A", "B"]], + index=[["i", "i", "j"], ["X", "X", "Y"]], + ) # the first 2 rows expected = df.iloc[[0, 1]].droplevel(0) - result = df.loc['i'] + result = df.loc["i"] tm.assert_frame_equal(result, expected) # 2nd (last) column expected = df.iloc[:, [2]].droplevel(0, axis=1) - result = df.loc[:, 'j'] + result = df.loc[:, "j"] tm.assert_frame_equal(result, expected) # bottom right corner expected = df.iloc[[2], [2]].droplevel(0).droplevel(0, axis=1) - result = df.loc['j'].loc[:, 'j'] + result = df.loc["j"].loc[:, "j"] tm.assert_frame_equal(result, expected) # with a tuple expected = df.iloc[[0, 1]] - result = df.loc[('i', 'X')] + result = df.loc[("i", "X")] tm.assert_frame_equal(result, expected) def test_loc_multiindex_ints(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) expected = df.iloc[[0, 1]].droplevel(0) result = df.loc[4] tm.assert_frame_equal(result, expected) def test_loc_multiindex_missing_label_raises(self): - df = DataFrame(np.random.randn(3, 3), - columns=[[2, 2, 4], [6, 8, 10]], - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randn(3, 3), + columns=[[2, 2, 4], [6, 8, 10]], + index=[[4, 4, 8], [8, 10, 12]], + ) with pytest.raises(KeyError, match=r"^2$"): df.loc[2] def test_loc_multiindex_too_many_dims_raises(self): # GH 14885 - s = Series(range(8), index=MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']])) + s = Series( + range(8), + index=MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + ) with pytest.raises(KeyError, match=r"^\('a', 'b'\)$"): - s.loc['a', 'b'] + s.loc["a", "b"] with pytest.raises(KeyError, match=r"^\('a', 'd', 'g'\)$"): - s.loc['a', 'd', 'g'] - with pytest.raises(IndexingError, match='Too many indexers'): - s.loc['a', 'd', 'g', 'j'] + s.loc["a", "d", "g"] + with pytest.raises(IndexingError, match="Too many indexers"): + s.loc["a", "d", "g", "j"] def test_loc_multiindex_indexer_none(self): # GH6788 # multi-index indexer is None (meaning take all) - attributes = ['Attribute' + str(i) for i in range(1)] - attribute_values = ['Value' + str(i) for i in range(5)] + attributes = ["Attribute" + str(i) for i in range(1)] + attribute_values = ["Value" + str(i) for i in range(5)] index = MultiIndex.from_product([attributes, attribute_values]) df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 @@ -150,11 +159,12 @@ def test_loc_multiindex_indexer_none(self): # GH 7349 # loc with a multi-index seems to be doing fallback - df = DataFrame(np.arange(12).reshape(-1, 1), - index=MultiIndex.from_product([[1, 2, 3, 4], - [1, 2, 3]])) + df = DataFrame( + np.arange(12).reshape(-1, 1), + index=MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]]), + ) - expected = df.loc[([1, 2], ), :] + expected = df.loc[([1, 2],), :] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) @@ -162,34 +172,37 @@ def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers - s = 
Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) - expected = s.loc[:, 'a':'c'] + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) + expected = s.loc[:, "a":"c"] - result = s.loc[0:4, 'a':'c'] + result = s.loc[0:4, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) - result = s.loc[:4, 'a':'c'] + result = s.loc[:4, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) - result = s.loc[0:, 'a':'c'] + result = s.loc[0:, "a":"c"] tm.assert_series_equal(result, expected) tm.assert_series_equal(result, expected) # GH 7400 # multiindexer gettitem with list of indexers skips wrong element - s = Series(np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = Series( + np.arange(15, dtype="int64"), + MultiIndex.from_product([range(5), ["a", "b", "c"]]), + ) expected = s.iloc[[6, 7, 8, 12, 13, 14]] - result = s.loc[2:4:2, 'a':'c'] + result = s.loc[2:4:2, "a":"c"] tm.assert_series_equal(result, expected) def test_get_loc_single_level(self, single_level_multiindex): single_level = single_level_multiindex - s = Series(np.random.randn(len(single_level)), - index=single_level) + s = Series(np.random.randn(len(single_level)), index=single_level) for k in single_level.values: s[k] @@ -197,16 +210,17 @@ def test_loc_getitem_int_slice(self): # GH 3053 # loc should treat integer slices like label slices - index = MultiIndex.from_tuples([t for t in itertools.product( - [6, 7, 8], ['a', 'b'])]) + index = MultiIndex.from_tuples( + [t for t in itertools.product([6, 7, 8], ["a", "b"])] + ) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df tm.assert_frame_equal(result, expected) - index = MultiIndex.from_tuples([t - for t in itertools.product( - [10, 20, 30], ['a', 'b'])]) + index = MultiIndex.from_tuples( + [t for t in itertools.product([10, 20, 30], ["a", "b"])] + ) df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.iloc[2:] @@ -215,7 +229,7 @@ def test_loc_getitem_int_slice(self): # doc examples result = df.loc[10, :] expected = df.iloc[0:2] - expected.index = ['a', 'b'] + expected.index = ["a", "b"] tm.assert_frame_equal(result, expected) result = df.loc[:, 10] @@ -223,11 +237,11 @@ def test_loc_getitem_int_slice(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - 'indexer_type_1', - (list, tuple, set, slice, np.ndarray, Series, Index)) + "indexer_type_1", (list, tuple, set, slice, np.ndarray, Series, Index) + ) @pytest.mark.parametrize( - 'indexer_type_2', - (list, tuple, set, slice, np.ndarray, Series, Index)) + "indexer_type_2", (list, tuple, set, slice, np.ndarray, Series, Index) + ) def test_loc_getitem_nested_indexer(self, indexer_type_1, indexer_type_2): # GH #19686 # .loc should work with nested indexers which can be @@ -244,8 +258,8 @@ def convert_nested_indexer(indexer_type, keys): b = [1, 2, 3] index = MultiIndex.from_product([a, b]) df = DataFrame( - np.arange(len(index), dtype='int64'), - index=index, columns=['Data']) + np.arange(len(index), dtype="int64"), index=index, columns=["Data"] + ) keys = ([10, 20], [2, 3]) types = (indexer_type_1, indexer_type_2) @@ -254,43 +268,52 @@ def convert_nested_indexer(indexer_type, keys): # of all the valid types indexer = tuple( convert_nested_indexer(indexer_type, k) - for indexer_type, k in zip(types, keys)) + for indexer_type, k in zip(types, keys) 
+ ) - result = df.loc[indexer, 'Data'] + result = df.loc[indexer, "Data"] expected = Series( - [1, 2, 4, 5], name='Data', - index=MultiIndex.from_product(keys)) + [1, 2, 4, 5], name="Data", index=MultiIndex.from_product(keys) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('indexer, is_level1, expected_error', [ - ([], False, None), # empty ok - (['A'], False, None), - (['A', 'D'], False, None), - (['D'], False, r"\['D'\] not in index"), # not any values found - (pd.IndexSlice[:, ['foo']], True, None), - (pd.IndexSlice[:, ['foo', 'bah']], True, None) -]) -def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1, - expected_error): +@pytest.mark.parametrize( + "indexer, is_level1, expected_error", + [ + ([], False, None), # empty ok + (["A"], False, None), + (["A", "D"], False, None), + (["D"], False, r"\['D'\] not in index"), # not any values found + (pd.IndexSlice[:, ["foo"]], True, None), + (pd.IndexSlice[:, ["foo", "bah"]], True, None), + ], +) +def test_loc_getitem_duplicates_multiindex_missing_indexers( + indexer, is_level1, expected_error +): # GH 7866 # multi-index slicing with missing indexers - idx = MultiIndex.from_product([['A', 'B', 'C'], - ['foo', 'bar', 'baz']], - names=['one', 'two']) - s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() + idx = MultiIndex.from_product( + [["A", "B", "C"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + s = Series(np.arange(9, dtype="int64"), index=idx).sort_index() if indexer == []: expected = s.iloc[[]] elif is_level1: - expected = Series([0, 3, 6], index=MultiIndex.from_product( - [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() + expected = Series( + [0, 3, 6], + index=MultiIndex.from_product( + [["A", "B", "C"], ["foo"]], names=["one", "two"] + ), + ).sort_index() else: - exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], - names=['one', 'two']) - expected = Series(np.arange(3, dtype='int64'), - index=exp_idx).sort_index() + exp_idx = MultiIndex.from_product( + [["A"], ["foo", "bar", "baz"]], names=["one", "two"] + ) + expected = Series(np.arange(3, dtype="int64"), index=exp_idx).sort_index() if expected_error is not None: with pytest.raises(KeyError, match=expected_error): @@ -300,23 +323,18 @@ def test_loc_getitem_duplicates_multiindex_missing_indexers(indexer, is_level1, tm.assert_series_equal(result, expected) -def test_series_loc_getitem_fancy( - multiindex_year_month_day_dataframe_random_data): - s = multiindex_year_month_day_dataframe_random_data['A'] +def test_series_loc_getitem_fancy(multiindex_year_month_day_dataframe_random_data): + s = multiindex_year_month_day_dataframe_random_data["A"] expected = s.reindex(s.index[49:51]) result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('columns_indexer', [ - ([], slice(None)), - (['foo'], []) -]) +@pytest.mark.parametrize("columns_indexer", [([], slice(None)), (["foo"], [])]) def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): # GH 8737 # empty indexer - multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], - ['alpha', 'beta'])) + multi_index = MultiIndex.from_product((["foo", "bar", "baz"], ["alpha", "beta"])) df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index) df = df.sort_index(level=0, axis=1) @@ -328,22 +346,26 @@ def test_loc_getitem_duplicates_multiindex_empty_indexer(columns_indexer): def test_loc_getitem_duplicates_multiindex_non_scalar_type_object(): # regression from < 
0.14.0 # GH 7914 - df = DataFrame([[np.mean, np.median], ['mean', 'median']], - columns=MultiIndex.from_tuples([('functs', 'mean'), - ('functs', 'median')]), - index=['function', 'name']) - result = df.loc['function', ('functs', 'mean')] + df = DataFrame( + [[np.mean, np.median], ["mean", "median"]], + columns=MultiIndex.from_tuples([("functs", "mean"), ("functs", "median")]), + index=["function", "name"], + ) + result = df.loc["function", ("functs", "mean")] expected = np.mean assert result == expected def test_loc_getitem_tuple_plus_slice(): # GH 671 - df = DataFrame({'a': np.arange(10), - 'b': np.arange(10), - 'c': np.random.randn(10), - 'd': np.random.randn(10)} - ).set_index(['a', 'b']) + df = DataFrame( + { + "a": np.arange(10), + "b": np.arange(10), + "c": np.random.randn(10), + "d": np.random.randn(10), + } + ).set_index(["a", "b"]) expected = df.loc[0, 0] result = df.loc[(0, 0), :] tm.assert_series_equal(result, expected) @@ -357,8 +379,7 @@ def test_loc_getitem_int(frame_random_data_integer_multi_index): tm.assert_frame_equal(result, expected) -def test_loc_getitem_int_raises_exception( - frame_random_data_integer_multi_index): +def test_loc_getitem_int_raises_exception(frame_random_data_integer_multi_index): df = frame_random_data_integer_multi_index with pytest.raises(KeyError, match=r"^3$"): df.loc[3] @@ -369,10 +390,10 @@ def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): # test setup - check key not in dataframe with pytest.raises(KeyError, match=r"^11$"): - df.loc[('bar', 'three'), 'B'] + df.loc[("bar", "three"), "B"] # in theory should be inserting in a sorted space???? - df.loc[('bar', 'three'), 'B'] = 0 + df.loc[("bar", "three"), "B"] = 0 expected = 0 - result = df.sort_index().loc[('bar', 'three'), 'B'] + result = df.sort_index().loc[("bar", "three"), "B"] assert result == expected diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 6dc8d67a971d3..ccaaa2ae02499 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,4 +1,3 @@ - import numpy as np import pytest @@ -11,20 +10,22 @@ class TestMultiIndexBasic: - def test_multiindex_perf_warn(self): - df = DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}).set_index(['jim', 'joe']) + df = DataFrame( + { + "jim": [0, 0, 1, 1], + "joe": ["x", "x", "z", "y"], + "jolie": np.random.rand(4), + } + ).set_index(["jim", "joe"]) - with tm.assert_produces_warning(PerformanceWarning, - clear=[pd.core.index]): - df.loc[(1, 'z')] + with tm.assert_produces_warning(PerformanceWarning, clear=[pd.core.index]): + df.loc[(1, "z")] df = df.iloc[[2, 1, 3, 0]] with tm.assert_produces_warning(PerformanceWarning): - df.loc[(0, )] + df.loc[(0,)] def test_multiindex_contains_dropped(self): # GH 19027 @@ -40,17 +41,20 @@ def test_multiindex_contains_dropped(self): assert 2 not in idx # also applies to strings - idx = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - assert 'a' in idx - idx = idx.drop('a') - assert 'a' in idx.levels[0] - assert 'a' not in idx - - @pytest.mark.parametrize("data, expected", [ - (MultiIndex.from_product([(), ()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), - ]) + idx = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + assert "a" in idx + idx = idx.drop("a") + assert "a" in idx.levels[0] + assert "a" not in idx + + @pytest.mark.parametrize( 
+ "data, expected", + [ + (MultiIndex.from_product([(), ()]), True), + (MultiIndex.from_product([(1, 2), (3, 4)]), True), + (MultiIndex.from_product([("a", "b"), (1, 2)]), False), + ], + ) def test_multiindex_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected @@ -60,8 +64,7 @@ def test_indexing_over_hashtable_size_cutoff(self): old_cutoff = _index._SIZE_CUTOFF _index._SIZE_CUTOFF = 20000 - s = Series(np.arange(n), - MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + s = Series(np.arange(n), MultiIndex.from_arrays((["a"] * n, np.arange(n)))) # hai it works! assert s[("a", 5)] == 5 @@ -73,22 +76,31 @@ def test_indexing_over_hashtable_size_cutoff(self): def test_multi_nan_indexing(self): # GH 3588 - df = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'], - 'b': ["C1", "C2", "C3", "C4"], - "c": [10, 15, np.nan, 20]}) - result = df.set_index(['a', 'b'], drop=False) - expected = DataFrame({"a": ['R1', 'R2', np.nan, 'R4'], - 'b': ["C1", "C2", "C3", "C4"], - "c": [10, 15, np.nan, 20]}, - index=[Index(['R1', 'R2', np.nan, 'R4'], - name='a'), - Index(['C1', 'C2', 'C3', 'C4'], name='b')]) + df = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + } + ) + result = df.set_index(["a", "b"], drop=False) + expected = DataFrame( + { + "a": ["R1", "R2", np.nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, np.nan, 20], + }, + index=[ + Index(["R1", "R2", np.nan, "R4"], name="a"), + Index(["C1", "C2", "C3", "C4"], name="b"), + ], + ) tm.assert_frame_equal(result, expected) def test_contains(self): # GH 24570 - tx = pd.timedelta_range('09:30:00', '16:00:00', freq='30 min') + tx = pd.timedelta_range("09:30:00", "16:00:00", freq="30 min") idx = MultiIndex.from_arrays([tx, np.arange(len(tx))]) assert tx[0] in idx - assert 'element_not_exit' not in idx - assert '0 day 09:30:00' in idx + assert "element_not_exit" not in idx + assert "0 day 09:30:00" in idx diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 20830bbe4680b..3c65f1b8abddb 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -8,27 +8,25 @@ class TestMultiIndexPartial: - def test_getitem_partial_int(self): # GH 12416 # with single item l1 = [10, 20] - l2 = ['a', 'b'] - df = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1, l2])) - expected = DataFrame(index=range(2), - columns=l2) + l2 = ["a", "b"] + df = DataFrame(index=range(2), columns=MultiIndex.from_product([l1, l2])) + expected = DataFrame(index=range(2), columns=l2) result = df[20] tm.assert_frame_equal(result, expected) # with list - expected = DataFrame(index=range(2), - columns=MultiIndex.from_product([l1[1:], l2])) + expected = DataFrame( + index=range(2), columns=MultiIndex.from_product([l1[1:], l2]) + ) result = df[[20]] tm.assert_frame_equal(result, expected) # missing item: - with pytest.raises(KeyError, match='1'): + with pytest.raises(KeyError, match="1"): df[1] with pytest.raises(KeyError, match=r"'\[1\] not in index'"): df[[1]] @@ -36,13 +34,16 @@ def test_getitem_partial_int(self): def test_series_slice_partial(self): pass - def test_xs_partial(self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): + def test_xs_partial( + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): frame = multiindex_dataframe_random_data ymd = 
multiindex_year_month_day_dataframe_random_data - result = frame.xs('foo') - result2 = frame.loc['foo'] - expected = frame.T['foo'].T + result = frame.xs("foo") + result2 = frame.loc["foo"] + expected = frame.T["foo"].T tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result, result2) @@ -51,19 +52,21 @@ def test_xs_partial(self, multiindex_dataframe_random_data, tm.assert_frame_equal(result, expected) # ex from #1796 - index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - codes=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) - df = DataFrame(np.random.randn(8, 4), index=index, - columns=list('abcd')) - - result = df.xs(['foo', 'one']) - expected = df.loc['foo', 'one'] + index = MultiIndex( + levels=[["foo", "bar"], ["one", "two"], [-1, 1]], + codes=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + ) + df = DataFrame(np.random.randn(8, 4), index=index, columns=list("abcd")) + + result = df.xs(["foo", "one"]) + expected = df.loc["foo", "one"] tm.assert_frame_equal(result, expected) - def test_getitem_partial( - self, multiindex_year_month_day_dataframe_random_data): + def test_getitem_partial(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data ymd = ymd.T result = ymd[2000, 2] @@ -73,10 +76,12 @@ def test_getitem_partial( tm.assert_frame_equal(result, expected) def test_fancy_slice_partial( - self, multiindex_dataframe_random_data, - multiindex_year_month_day_dataframe_random_data): + self, + multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data, + ): frame = multiindex_dataframe_random_data - result = frame.loc['bar':'baz'] + result = frame.loc["bar":"baz"] expected = frame[3:7] tm.assert_frame_equal(result, expected) @@ -87,28 +92,29 @@ def test_fancy_slice_partial( tm.assert_frame_equal(result, expected) def test_getitem_partial_column_select(self): - idx = MultiIndex(codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], - levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) + idx = MultiIndex( + codes=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], + levels=[["a", "b"], ["x", "y"], ["p", "q"]], + ) df = DataFrame(np.random.rand(3, 2), index=idx) - result = df.loc[('a', 'y'), :] - expected = df.loc[('a', 'y')] + result = df.loc[("a", "y"), :] + expected = df.loc[("a", "y")] tm.assert_frame_equal(result, expected) - result = df.loc[('a', 'y'), [1, 0]] - expected = df.loc[('a', 'y')][[1, 0]] + result = df.loc[("a", "y"), [1, 0]] + expected = df.loc[("a", "y")][[1, 0]] tm.assert_frame_equal(result, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = df.ix[('a', 'y'), [1, 0]] + result = df.ix[("a", "y"), [1, 0]] tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): - df.loc[('a', 'foo'), :] + df.loc[("a", "foo"), :] - def test_partial_set( - self, multiindex_year_month_day_dataframe_random_data): + def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data df = ymd.copy() @@ -117,8 +123,8 @@ def test_partial_set( exp.loc[2000, 4].values[:] = 0 tm.assert_frame_equal(df, exp) - df['A'].loc[2000, 4] = 1 - exp['A'].loc[2000, 4].values[:] = 1 + df["A"].loc[2000, 4] = 1 + exp["A"].loc[2000, 4].values[:] = 1 tm.assert_frame_equal(df, exp) df.loc[2000] = 5 @@ -126,19 +132,18 @@ def test_partial_set( tm.assert_frame_equal(df, exp) # this works...for now - df['A'].iloc[14] = 5 - 
assert df['A'][14] == 5 + df["A"].iloc[14] = 5 + assert df["A"][14] == 5 # --------------------------------------------------------------------- # AMBIGUOUS CASES! - def test_partial_loc_missing( - self, multiindex_year_month_day_dataframe_random_data): + def test_partial_loc_missing(self, multiindex_year_month_day_dataframe_random_data): pytest.skip("skipping for now") ymd = multiindex_year_month_day_dataframe_random_data result = ymd.loc[2000, 0] - expected = ymd.loc[2000]['A'] + expected = ymd.loc[2000]["A"] tm.assert_series_equal(result, expected) # need to put in some work here @@ -158,28 +163,28 @@ def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data expected = frame.copy() result = frame.copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_frame_equal(result, expected) expected = frame.copy() result = frame.copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_frame_equal(result, expected) - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc[["foo", "bar"]] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_series_equal(result, expected) - expected = frame['A'].copy() - result = frame['A'].copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 + expected = frame["A"].copy() + result = frame["A"].copy() + result.loc["foo":"bar"] = 0 + expected.loc["foo"] = 0 + expected.loc["bar"] = 0 tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_set_ops.py index 6c7d209333d62..5d0bc61e9957c 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_set_ops.py @@ -5,22 +5,22 @@ class TestMultiIndexSetOps: - def test_multiindex_symmetric_difference(self): # GH 13490 - idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], - names=['a', 'b']) + idx = MultiIndex.from_product([["a", "b"], ["A", "B"]], names=["a", "b"]) result = idx ^ idx assert result.names == idx.names - idx2 = idx.copy().rename(['A', 'B']) + idx2 = idx.copy().rename(["A", "B"]) result = idx ^ idx2 assert result.names == [None, None] def test_mixed_depth_insert(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) @@ -28,15 +28,14 @@ def test_mixed_depth_insert(self): result = df.copy() expected = df.copy() - result['b'] = [1, 2, 3, 4] - expected['b', '', ''] = [1, 2, 3, 4] + result["b"] = [1, 2, 3, 4] + expected["b", "", ""] = [1, 2, 3, 4] tm.assert_frame_equal(result, expected) def test_dataframe_insert_column_all_na(self): # GH #1534 - mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') - ]) + mix = MultiIndex.from_tuples([("1a", "2a"), ("1a", "2b"), ("1a", "2c")]) df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) 
s = Series({(1, 1): 1, (1, 2): 2}) - df['new'] = s - assert df['new'].isna().all() + df["new"] = s + assert df["new"].isna().all() diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 44aae4cd55e35..261d2e9c04e77 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -5,19 +5,17 @@ import pytest import pandas as pd -from pandas import ( - DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna) +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna import pandas.core.common as com from pandas.util import testing as tm @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexSetItem: - def test_setitem_multiindex(self): with catch_warnings(record=True): - for index_fn in ('ix', 'loc'): + for index_fn in ("ix", "loc"): def assert_equal(a, b): assert a == b @@ -29,136 +27,168 @@ def check(target, indexers, value, compare_fn, expected=None): if expected is None: expected = value compare_fn(result, expected) + # GH7190 - index = MultiIndex.from_product([np.arange(0, 100), - np.arange(0, 80)], - names=['time', 'firm']) + index = MultiIndex.from_product( + [np.arange(0, 100), np.arange(0, 80)], names=["time", "firm"] + ) t, n = 0, 2 - df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=0, - compare_fn=assert_equal) - - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=1, - compare_fn=assert_equal) - - df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, indexers=((t, n), 'X'), value=2, - compare_fn=assert_equal) + df = DataFrame( + np.nan, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, indexers=((t, n), "X"), value=0, compare_fn=assert_equal + ) + + df = DataFrame( + -999, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, indexers=((t, n), "X"), value=1, compare_fn=assert_equal + ) + + df = DataFrame( + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], index=index + ) + check( + target=df, indexers=((t, n), "X"), value=2, compare_fn=assert_equal + ) # gh-7218: assigning with 0-dim arrays - df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', - 'X', 'd', 'profit'], - index=index) - check(target=df, - indexers=((t, n), 'X'), - value=np.array(3), - compare_fn=assert_equal, - expected=3, ) + df = DataFrame( + -999, + columns=["A", "w", "l", "a", "x", "X", "d", "profit"], + index=index, + ) + check( + target=df, + indexers=((t, n), "X"), + value=np.array(3), + compare_fn=assert_equal, + expected=3, + ) # GH5206 - df = DataFrame(np.arange(25).reshape(5, 5), - columns='A,B,C,D,E'.split(','), dtype=float) - df['F'] = 99 - row_selection = df['A'] % 2 == 0 - col_selection = ['B', 'C'] + df = DataFrame( + np.arange(25).reshape(5, 5), + columns="A,B,C,D,E".split(","), + dtype=float, + ) + df["F"] = 99 + row_selection = df["A"] % 2 == 0 + col_selection = ["B", "C"] with catch_warnings(record=True): - df.ix[row_selection, col_selection] = df['F'] - output = DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) + df.ix[row_selection, col_selection] = df["F"] + output = DataFrame(99.0, index=[0, 2, 4], columns=["B", "C"]) with catch_warnings(record=True): - 
tm.assert_frame_equal(df.ix[row_selection, col_selection], - output) - check(target=df, - indexers=(row_selection, col_selection), - value=df['F'], - compare_fn=tm.assert_frame_equal, - expected=output, ) + tm.assert_frame_equal(df.ix[row_selection, col_selection], output) + check( + target=df, + indexers=(row_selection, col_selection), + value=df["F"], + compare_fn=tm.assert_frame_equal, + expected=output, + ) # GH11372 - idx = MultiIndex.from_product([ - ['A', 'B', 'C'], - date_range('2015-01-01', '2015-04-01', freq='MS')]) - cols = MultiIndex.from_product([ - ['foo', 'bar'], - date_range('2016-01-01', '2016-02-01', freq='MS')]) + idx = MultiIndex.from_product( + [["A", "B", "C"], date_range("2015-01-01", "2015-04-01", freq="MS")] + ) + cols = MultiIndex.from_product( + [["foo", "bar"], date_range("2016-01-01", "2016-02-01", freq="MS")] + ) - df = DataFrame(np.random.random((12, 4)), - index=idx, columns=cols) + df = DataFrame(np.random.random((12, 4)), index=idx, columns=cols) subidx = MultiIndex.from_tuples( - [('A', Timestamp('2015-01-01')), - ('A', Timestamp('2015-02-01'))]) + [("A", Timestamp("2015-01-01")), ("A", Timestamp("2015-02-01"))] + ) subcols = MultiIndex.from_tuples( - [('foo', Timestamp('2016-01-01')), - ('foo', Timestamp('2016-02-01'))]) - - vals = DataFrame(np.random.random((2, 2)), - index=subidx, columns=subcols) - check(target=df, - indexers=(subidx, subcols), - value=vals, - compare_fn=tm.assert_frame_equal, ) - # set all columns + [("foo", Timestamp("2016-01-01")), ("foo", Timestamp("2016-02-01"))] + ) + vals = DataFrame( - np.random.random((2, 4)), index=subidx, columns=cols) - check(target=df, - indexers=(subidx, slice(None, None, None)), - value=vals, - compare_fn=tm.assert_frame_equal, ) + np.random.random((2, 2)), index=subidx, columns=subcols + ) + check( + target=df, + indexers=(subidx, subcols), + value=vals, + compare_fn=tm.assert_frame_equal, + ) + # set all columns + vals = DataFrame(np.random.random((2, 4)), index=subidx, columns=cols) + check( + target=df, + indexers=(subidx, slice(None, None, None)), + value=vals, + compare_fn=tm.assert_frame_equal, + ) # identity copy = df.copy() - check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=tm.assert_frame_equal, expected=copy) + check( + target=df, + indexers=(df.index, df.columns), + value=df, + compare_fn=tm.assert_frame_equal, + expected=copy, + ) def test_multiindex_setitem(self): # GH 3738 # setting with a multi-index right hand side - arrays = [np.array(['bar', 'bar', 'baz', 'qux', 'qux', 'bar']), - np.array(['one', 'two', 'one', 'one', 'two', 'one']), - np.arange(0, 6, 1)] + arrays = [ + np.array(["bar", "bar", "baz", "qux", "qux", "bar"]), + np.array(["one", "two", "one", "one", "two", "one"]), + np.arange(0, 6, 1), + ] - df_orig = DataFrame(np.random.randn(6, 3), index=arrays, - columns=['A', 'B', 'C']).sort_index() + df_orig = DataFrame( + np.random.randn(6, 3), index=arrays, columns=["A", "B", "C"] + ).sort_index() - expected = df_orig.loc[['bar']] * 2 + expected = df_orig.loc[["bar"]] * 2 df = df_orig.copy() - df.loc[['bar']] *= 2 - tm.assert_frame_equal(df.loc[['bar']], expected) + df.loc[["bar"]] *= 2 + tm.assert_frame_equal(df.loc[["bar"]], expected) # raise because these have differing levels with pytest.raises(TypeError): - df.loc['bar'] *= 2 + df.loc["bar"] *= 2 # from SO # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation - df_orig = DataFrame.from_dict({'price': { - ('DE', 'Coal', 'Stock'): 2, - ('DE', 'Gas', 
'Stock'): 4, - ('DE', 'Elec', 'Demand'): 1, - ('FR', 'Gas', 'Stock'): 5, - ('FR', 'Solar', 'SupIm'): 0, - ('FR', 'Wind', 'SupIm'): 0 - }}) - df_orig.index = MultiIndex.from_tuples(df_orig.index, - names=['Sit', 'Com', 'Type']) + df_orig = DataFrame.from_dict( + { + "price": { + ("DE", "Coal", "Stock"): 2, + ("DE", "Gas", "Stock"): 4, + ("DE", "Elec", "Demand"): 1, + ("FR", "Gas", "Stock"): 5, + ("FR", "Solar", "SupIm"): 0, + ("FR", "Wind", "SupIm"): 0, + } + } + ) + df_orig.index = MultiIndex.from_tuples( + df_orig.index, names=["Sit", "Com", "Type"] + ) expected = df_orig.copy() expected.iloc[[0, 2, 3]] *= 2 idx = pd.IndexSlice df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], :] *= 2 + df.loc[idx[:, :, "Stock"], :] *= 2 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[idx[:, :, 'Stock'], 'price'] *= 2 + df.loc[idx[:, :, "Stock"], "price"] *= 2 tm.assert_frame_equal(df, expected) def test_multiindex_assignment(self): @@ -166,60 +196,66 @@ def test_multiindex_assignment(self): # GH3777 part 2 # mixed dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) - df['d'] = np.nan - arr = np.array([0., 1.]) + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) + df["d"] = np.nan + arr = np.array([0.0, 1.0]) with catch_warnings(record=True): - df.ix[4, 'd'] = arr - tm.assert_series_equal(df.ix[4, 'd'], - Series(arr, index=[8, 10], name='d')) + df.ix[4, "d"] = arr + tm.assert_series_equal(df.ix[4, "d"], Series(arr, index=[8, 10], name="d")) # single dtype - df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), - columns=list('abc'), - index=[[4, 4, 8], [8, 10, 12]]) + df = DataFrame( + np.random.randint(5, 10, size=9).reshape(3, 3), + columns=list("abc"), + index=[[4, 4, 8], [8, 10, 12]], + ) with catch_warnings(record=True): - df.ix[4, 'c'] = arr - exp = Series(arr, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + df.ix[4, "c"] = arr + exp = Series(arr, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.ix[4, "c"], exp) # scalar ok with catch_warnings(record=True): - df.ix[4, 'c'] = 10 - exp = Series(10, index=[8, 10], name='c', dtype='float64') - tm.assert_series_equal(df.ix[4, 'c'], exp) + df.ix[4, "c"] = 10 + exp = Series(10, index=[8, 10], name="c", dtype="float64") + tm.assert_series_equal(df.ix[4, "c"], exp) # invalid assignments with pytest.raises(ValueError): with catch_warnings(record=True): - df.ix[4, 'c'] = [0, 1, 2, 3] + df.ix[4, "c"] = [0, 1, 2, 3] with pytest.raises(ValueError): with catch_warnings(record=True): - df.ix[4, 'c'] = [0] + df.ix[4, "c"] = [0] # groupby example NUM_ROWS = 100 NUM_COLS = 10 - col_names = ['A' + num for num in - map(str, np.arange(NUM_COLS).tolist())] + col_names = ["A" + num for num in map(str, np.arange(NUM_COLS).tolist())] index_cols = col_names[:5] - df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), - dtype=np.int64, columns=col_names) + df = DataFrame( + np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, + columns=col_names, + ) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) - df['new_col'] = np.nan + df["new_col"] = np.nan f_index = np.arange(5) def f(name, df2): - return Series(np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) + return Series(np.arange(df2.shape[0]), name=df2.index.values[0]).reindex( + f_index + ) # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -229,12 +265,11 @@ def f(name, df2): for name, df2 in grp: new_vals = np.arange(df2.shape[0]) with catch_warnings(record=True): - df.ix[name, 'new_col'] = new_vals + df.ix[name, "new_col"] = new_vals - def test_series_setitem( - self, multiindex_year_month_day_dataframe_random_data): + def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] + s = ymd["A"] s[2000, 3] = np.nan assert isna(s.values[42:65]).all() @@ -244,8 +279,7 @@ def test_series_setitem( s[2000, 3, 10] = np.nan assert isna(s[49]) - def test_frame_getitem_setitem_boolean( - self, multiindex_dataframe_random_data): + def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T.copy() values = df.values @@ -267,88 +301,90 @@ def test_frame_getitem_setitem_boolean( np.putmask(values[:-1], values[:-1] < 0, 2) tm.assert_almost_equal(df.values, values) - with pytest.raises(TypeError, match='boolean values only'): + with pytest.raises(TypeError, match="boolean values only"): df[df * 0] = 2 def test_frame_getitem_setitem_multislice(self): - levels = [['t1', 't2'], ['a', 'b', 'c']] + levels = [["t1", "t2"], ["a", "b", "c"]] codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(codes=codes, levels=levels, names=[None, 'id']) - df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) + midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"]) + df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx) - result = df.loc[:, 'value'] - tm.assert_series_equal(df['value'], result) + result = df.loc[:, "value"] + tm.assert_series_equal(df["value"], result) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - result = df.ix[:, 'value'] - tm.assert_series_equal(df['value'], result) + result = df.ix[:, "value"] + tm.assert_series_equal(df["value"], result) - result = df.loc[df.index[1:3], 'value'] - tm.assert_series_equal(df['value'][1:3], result) + result = df.loc[df.index[1:3], "value"] + tm.assert_series_equal(df["value"][1:3], result) result = df.loc[:, :] tm.assert_frame_equal(df, result) result = df - df.loc[:, 'value'] = 10 - result['value'] = 10 + df.loc[:, "value"] = 10 + result["value"] = 10 tm.assert_frame_equal(df, result) df.loc[:, :] = 10 tm.assert_frame_equal(df, result) def test_frame_setitem_multi_column(self): - df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], - [0, 1, 0, 1]]) + df = DataFrame(randn(10, 4), columns=[["a", "a", "b", "b"], [0, 1, 0, 1]]) cp = df.copy() - cp['a'] = cp['b'] - tm.assert_frame_equal(cp['a'], cp['b']) + cp["a"] = cp["b"] + tm.assert_frame_equal(cp["a"], cp["b"]) # set with ndarray cp = df.copy() - cp['a'] = cp['b'].values - tm.assert_frame_equal(cp['a'], cp['b']) + cp["a"] = cp["b"].values + tm.assert_frame_equal(cp["a"], cp["b"]) # --------------------------------------- # #1803 - columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) + columns = MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]) df = DataFrame(index=[1, 3, 5], columns=columns) # Works, but adds a column instead of updating the two existing ones - df['A'] = 0.0 # Doesn't work - assert (df['A'].values == 0).all() + df["A"] = 0.0 # Doesn't work + assert (df["A"].values == 0).all() # it broadcasts - df['B', '1'] = [1, 2, 3] - df['A'] = df['B', '1'] + df["B", "1"] = [1, 2, 3] + df["A"] = df["B", "1"] - sliced_a1 = df['A', '1'] - 
sliced_a2 = df['A', '2'] - sliced_b1 = df['B', '1'] + sliced_a1 = df["A", "1"] + sliced_a2 = df["A", "2"] + sliced_b1 = df["B", "1"] tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) - assert sliced_a1.name == ('A', '1') - assert sliced_a2.name == ('A', '2') - assert sliced_b1.name == ('B', '1') + assert sliced_a1.name == ("A", "1") + assert sliced_a2.name == ("A", "2") + assert sliced_b1.name == ("B", "1") def test_getitem_setitem_tuple_plus_columns( - self, multiindex_year_month_day_dataframe_random_data): + self, multiindex_year_month_day_dataframe_random_data + ): # GH #1013 ymd = multiindex_year_month_day_dataframe_random_data df = ymd[:5] - result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] - expected = df.loc[2000, 1, 6][['A', 'B', 'C']] + result = df.loc[(2000, 1, 6), ["A", "B", "C"]] + expected = df.loc[2000, 1, 6][["A", "B", "C"]] tm.assert_series_equal(result, expected) def test_getitem_setitem_slice_integers(self): - index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[0, 1, 2], [0, 2]], codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]] + ) - frame = DataFrame(np.random.randn(len(index), 4), index=index, - columns=['a', 'b', 'c', 'd']) + frame = DataFrame( + np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"] + ) res = frame.loc[1:2] exp = frame.reindex(frame.index[2:]) tm.assert_frame_equal(res, exp) @@ -368,13 +404,13 @@ def test_getitem_setitem_slice_integers(self): def test_setitem_change_dtype(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data dft = frame.T - s = dft['foo', 'two'] - dft['foo', 'two'] = s > s.median() - tm.assert_series_equal(dft['foo', 'two'], s > s.median()) + s = dft["foo", "two"] + dft["foo", "two"] = s > s.median() + tm.assert_series_equal(dft["foo", "two"], s > s.median()) # assert isinstance(dft._data.blocks[1].items, MultiIndex) - reindexed = dft.reindex(columns=[('foo', 'two')]) - tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) + reindexed = dft.reindex(columns=[("foo", "two")]) + tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data @@ -383,31 +419,31 @@ def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() - col = frame['B'] + col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, 'B'] == 97).all() + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): - df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], - columns=list("ABCD")) + df = DataFrame( + [[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], columns=list("ABCD") + ) - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) ix = MultiIndex.from_tuples([(1, 1)]) - df.loc[ix, "C"] = '_' + df.loc[ix, "C"] = "_" - assert (df.xs((1, 1))['C'] == '_').all() + assert (df.xs((1, 1))["C"] == "_").all() def test_astype_assignment_with_dups(self): # GH 4686 # assignment with dups that has a dtype change - cols = MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')]) - df = DataFrame(np.arange(3).reshape((1, 3)), - columns=cols, dtype=object) + cols = MultiIndex.from_tuples([("A", "1"), ("B", "1"), ("A", "2")]) + df = DataFrame(np.arange(3).reshape((1, 3)), columns=cols, dtype=object) index = 
df.index.copy() - df['A'] = df['A'].astype(np.float64) + df["A"] = df["A"].astype(np.float64) tm.assert_index_equal(df.index, index) @@ -415,8 +451,8 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): # this works because we are modifying the underlying array # really a no-no df = multiindex_dataframe_random_data.T - df['foo'].values[:] = 0 - assert (df['foo'].values == 0).all() + df["foo"].values[:] = 0 + assert (df["foo"].values == 0).all() def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): @@ -424,7 +460,7 @@ def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data.T msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - df['foo']['one'] = 2 + df["foo"]["one"] = 2 def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): @@ -433,7 +469,7 @@ def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): df = frame.copy() msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - df['foo']['one'] = 2 + df["foo"]["one"] = 2 result = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 2431f27bff78a..421ca71428bcc 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -14,40 +14,48 @@ @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestMultiIndexSlicers: - def test_per_axis_per_level_getitem(self): # GH6134 # example test case - ix = MultiIndex.from_product([_mklbl('A', 5), _mklbl('B', 7), _mklbl( - 'C', 4), _mklbl('D', 2)]) + ix = MultiIndex.from_product( + [_mklbl("A", 5), _mklbl("B", 7), _mklbl("C", 4), _mklbl("D", 2)] + ) df = DataFrame(np.arange(len(ix.to_numpy())), index=ix) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] tm.assert_frame_equal(result, expected) - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C2' or c == 'C3')]] - result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") + and (c == "C1" or c == "C2" or c == "C3") + ] + ] + result = df.loc[(slice("A1", "A3"), slice(None), slice("C1", "C3")), :] tm.assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) df = DataFrame( - np.arange(16, dtype='int64').reshape( 
- 4, 4), index=index, columns=columns) + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) df = df.sort_index(axis=0).sort_index(axis=1) # identity @@ -68,19 +76,21 @@ def test_per_axis_per_level_getitem(self): tm.assert_frame_equal(result, expected) # columns - result = df.loc[:, (slice(None), ['foo'])] + result = df.loc[:, (slice(None), ["foo"])] expected = df.iloc[:, [1, 3]] tm.assert_frame_equal(result, expected) # both - result = df.loc[(slice(None), 1), (slice(None), ['foo'])] + result = df.loc[(slice(None), 1), (slice(None), ["foo"])] expected = df.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(result, expected) - result = df.loc['A', 'a'] - expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), - index=Index([1, 2, 3], name='two'), - columns=Index(['bar', 'foo'], name='lvl1')) + result = df.loc["A", "a"] + expected = DataFrame( + dict(bar=[1, 5, 9], foo=[0, 4, 8]), + index=Index([1, 2, 3], name="two"), + columns=Index(["bar", "foo"], name="lvl1"), + ) tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1, 2]), :] @@ -89,15 +99,18 @@ def test_per_axis_per_level_getitem(self): # multi-level series s = Series(np.arange(len(ix.to_numpy())), index=ix) - result = s.loc['A1':'A3', :, ['C1', 'C3']] - expected = s.loc[[tuple([a, b, c, d]) - for a, b, c, d in s.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] + result = s.loc["A1":"A3", :, ["C1", "C3"]] + expected = s.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in s.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] tm.assert_series_equal(result, expected) # boolean indexers - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] expected = df.iloc[[2, 3]] tm.assert_frame_equal(result, expected) @@ -119,28 +132,39 @@ def test_per_axis_per_level_getitem(self): df = df.sort_index(level=1, axis=0) assert df.index.lexsort_depth == 0 - msg = ('MultiIndex slicing requires the index to be ' - r'lexsorted: slicing on levels \[1\], lexsort depth 0') + msg = ( + "MultiIndex slicing requires the index to be " + r"lexsorted: slicing on levels \[1\], lexsort depth 0" + ) with pytest.raises(UnsortedIndexError, match=msg): - df.loc[(slice(None), slice('bar')), :] + df.loc[(slice(None), slice("bar")), :] # GH 16734: not sorted, but no real slicing - result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] + result = df.loc[(slice(None), df.loc[:, ("a", "bar")] > 5), :] tm.assert_frame_equal(result, df.iloc[[1, 3], :]) def test_multiindex_slicers_non_unique(self): # GH 7106 # non-unique mi index support - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 3], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 3], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) result = df.loc[(slice(None), slice(None), 1), :] tm.assert_frame_equal(result, expected) @@ -148,25 +172,56 @@ def test_multiindex_slicers_non_unique(self): result = df.xs(1, level=2, drop_level=False) 
tm.assert_frame_equal(result, expected) - df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], - B=['a', 'a', 'a', 'a'], - C=[1, 2, 1, 2], - D=[1, 2, 3, 4])) - .set_index(['A', 'B', 'C']).sort_index()) + df = ( + DataFrame( + dict( + A=["foo", "foo", "foo", "foo"], + B=["a", "a", "a", "a"], + C=[1, 2, 1, 2], + D=[1, 2, 3, 4], + ) + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) assert not df.index.is_unique - expected = (DataFrame(dict(A=['foo', 'foo'], B=['a', 'a'], - C=[1, 1], D=[1, 3])) - .set_index(['A', 'B', 'C']).sort_index()) + expected = ( + DataFrame(dict(A=["foo", "foo"], B=["a", "a"], C=[1, 1], D=[1, 3])) + .set_index(["A", "B", "C"]) + .sort_index() + ) result = df.loc[(slice(None), slice(None), 1), :] assert not result.index.is_unique tm.assert_frame_equal(result, expected) # GH12896 # numpy-implementation dependent bug - ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, - 17, 18, 19, 200000, 200000] + ints = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 12, + 13, + 14, + 14, + 16, + 17, + 18, + 19, + 200000, + 200000, + ] n = len(ints) - idx = MultiIndex.from_arrays([['a'] * n, ints]) + idx = MultiIndex.from_arrays([["a"] * n, ints]) result = Series([1] * n, index=idx) result = result.sort_index() result = result.loc[(slice(None), slice(100000))] @@ -178,100 +233,139 @@ def test_multiindex_slicers_datetimelike(self): # GH 7429 # buggy/inconsistent behavior when slicing with datetime-like import datetime - dates = [datetime.datetime(2012, 1, 1, 12, 12, 12) + - datetime.timedelta(days=i) for i in range(6)] + + dates = [ + datetime.datetime(2012, 1, 1, 12, 12, 12) + datetime.timedelta(days=i) + for i in range(6) + ] freq = [1, 2] - index = MultiIndex.from_product( - [dates, freq], names=['date', 'frequency']) + index = MultiIndex.from_product([dates, freq], names=["date", "frequency"]) df = DataFrame( - np.arange(6 * 2 * 4, dtype='int64').reshape( - -1, 4), index=index, columns=list('ABCD')) + np.arange(6 * 2 * 4, dtype="int64").reshape(-1, 4), + index=index, + columns=list("ABCD"), + ) # multi-axis slicing idx = pd.IndexSlice expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), - slice(1, 1)), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( - '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), - Timestamp('2012-01-03 12:12:12')), 1), - slice('A', 'B')] + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + slice(1, 1), + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + idx[ + Timestamp("2012-01-01 12:12:12") : Timestamp("2012-01-03 12:12:12") + ], + idx[1:1], + ), + slice("A", "B"), + ] + tm.assert_frame_equal(result, expected) + + result = df.loc[ + ( + slice( + Timestamp("2012-01-01 12:12:12"), Timestamp("2012-01-03 12:12:12") + ), + 1, + ), + slice("A", "B"), + ] tm.assert_frame_equal(result, expected) # with strings - result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), - slice(1, 1)), slice('A', 'B')] + result = df.loc[ + (slice("2012-01-01 12:12:12", "2012-01-03 12:12:12"), slice(1, 1)), + slice("A", "B"), + ] tm.assert_frame_equal(result, expected) - result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), - idx['A', 'B']] + result = df.loc[ 
+ (idx["2012-01-01 12:12:12":"2012-01-03 12:12:12"], 1), idx["A", "B"] + ] tm.assert_frame_equal(result, expected) def test_multiindex_slicers_edges(self): # GH 8132 # various edge cases df = DataFrame( - {'A': ['A0'] * 5 + ['A1'] * 5 + ['A2'] * 5, - 'B': ['B0', 'B0', 'B1', 'B1', 'B2'] * 3, - 'DATE': ["2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", - "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", - "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", - "2013-07-09", "2013-08-06", "2013-09-03"], - 'VALUES': [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2]}) - - df['DATE'] = pd.to_datetime(df['DATE']) - df1 = df.set_index(['A', 'B', 'DATE']) + { + "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5, + "B": ["B0", "B0", "B1", "B1", "B2"] * 3, + "DATE": [ + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-06-11", + "2013-07-02", + "2013-07-09", + "2013-07-30", + "2013-08-06", + "2013-09-03", + "2013-10-01", + "2013-07-09", + "2013-08-06", + "2013-09-03", + ], + "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2], + } + ) + + df["DATE"] = pd.to_datetime(df["DATE"]) + df1 = df.set_index(["A", "B", "DATE"]) df1 = df1.sort_index() # A1 - Get all values under "A0" and "A1" - result = df1.loc[(slice('A1')), :] + result = df1.loc[(slice("A1")), :] expected = df1.iloc[0:10] tm.assert_frame_equal(result, expected) # A2 - Get all values from the start to "A2" - result = df1.loc[(slice('A2')), :] + result = df1.loc[(slice("A2")), :] expected = df1 tm.assert_frame_equal(result, expected) # A3 - Get all values under "B1" or "B2" - result = df1.loc[(slice(None), slice('B1', 'B2')), :] + result = df1.loc[(slice(None), slice("B1", "B2")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] tm.assert_frame_equal(result, expected) # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), - slice('20130702', '20130709')), :] + result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected) # B1 - Get all values in B0 that are also under A0, A1 and A2 - result = df1.loc[(slice('A2'), slice('B0')), :] + result = df1.loc[(slice("A2"), slice("B0")), :] expected = df1.iloc[[0, 1, 5, 6, 10, 11]] tm.assert_frame_equal(result, expected) # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for # the As) - result = df1.loc[(slice(None), slice('B2')), :] + result = df1.loc[(slice(None), slice("B2")), :] expected = df1 tm.assert_frame_equal(result, expected) # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), - slice('2013-08-06')), :] + result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] tm.assert_frame_equal(result, expected) # B4 - Same as A4 but the start of the date slice is not a key. 
# shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), - slice('20130701', '20130709')), :] + result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected) @@ -281,92 +375,122 @@ def test_per_axis_per_level_doc_examples(self): idx = pd.IndexSlice # from indexing.rst / advanced - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - index=index, columns=columns) - result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - tm.assert_frame_equal(result, expected) - - result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - result = df.loc[idx[:, :, ['C1', 'C3']], :] + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + result = df.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx["A1":"A3", :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + result = df.loc[(slice(None), slice(None), ["C1", "C3"]), :] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + result = df.loc[idx[:, :, ["C1", "C3"]], :] tm.assert_frame_equal(result, expected) # not sorted with pytest.raises(UnsortedIndexError): - df.loc['A1', ('a', slice('foo'))] + df.loc["A1", ("a", slice("foo"))] # GH 16734: not sorted, but no real slicing - tm.assert_frame_equal(df.loc['A1', (slice(None), 'foo')], - df.loc['A1'].iloc[:, [0, 2]]) + tm.assert_frame_equal( + df.loc["A1", (slice(None), "foo")], df.loc["A1"].iloc[:, [0, 2]] + ) df = df.sort_index(axis=1) # slicing - df.loc['A1', (slice(None), 'foo')] - df.loc[(slice(None), slice(None), ['C1', 'C3']), (slice(None), 'foo')] + df.loc["A1", (slice(None), "foo")] + df.loc[(slice(None), slice(None), ["C1", "C3"]), (slice(None), "foo")] # setitem - df.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + df.loc(axis=0)[:, :, ["C1", "C3"]] = -10 def test_loc_axis_arguments(self): - index = MultiIndex.from_product([_mklbl('A', 4), _mklbl('B', 2), - _mklbl('C', 4), _mklbl('D', 2)]) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') - .reshape((len(index), len(columns))), - 
index=index, - columns=columns).sort_index().sort_index(axis=1) + index = MultiIndex.from_product( + [_mklbl("A", 4), _mklbl("B", 2), _mklbl("C", 4), _mklbl("D", 2)] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) + df = ( + DataFrame( + np.arange(len(index) * len(columns), dtype="int64").reshape( + (len(index), len(columns)) + ), + index=index, + columns=columns, + ) + .sort_index() + .sort_index(axis=1) + ) # axis 0 - result = df.loc(axis=0)['A1':'A3', :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (a == 'A1' or a == 'A2' or a == 'A3') and ( - c == 'C1' or c == 'C3')]] - tm.assert_frame_equal(result, expected) - - result = df.loc(axis='index')[:, :, ['C1', 'C3']] - expected = df.loc[[tuple([a, b, c, d]) - for a, b, c, d in df.index.values - if (c == 'C1' or c == 'C3')]] + result = df.loc(axis=0)["A1":"A3", :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (a == "A1" or a == "A2" or a == "A3") and (c == "C1" or c == "C3") + ] + ] + tm.assert_frame_equal(result, expected) + + result = df.loc(axis="index")[:, :, ["C1", "C3"]] + expected = df.loc[ + [ + tuple([a, b, c, d]) + for a, b, c, d in df.index.values + if (c == "C1" or c == "C3") + ] + ] tm.assert_frame_equal(result, expected) # axis 1 - result = df.loc(axis=1)[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] + result = df.loc(axis=1)[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] tm.assert_frame_equal(result, expected) - result = df.loc(axis='columns')[:, 'foo'] - expected = df.loc[:, (slice(None), 'foo')] + result = df.loc(axis="columns")[:, "foo"] + expected = df.loc[:, (slice(None), "foo")] tm.assert_frame_equal(result, expected) # invalid axis with pytest.raises(ValueError): - df.loc(axis=-1)[:, :, ['C1', 'C3']] + df.loc(axis=-1)[:, :, ["C1", "C3"]] with pytest.raises(ValueError): - df.loc(axis=2)[:, :, ['C1', 'C3']] + df.loc(axis=2)[:, :, ["C1", "C3"]] with pytest.raises(ValueError): - df.loc(axis='foo')[:, :, ['C1', 'C3']] + df.loc(axis="foo")[:, :, ["C1", "C3"]] def test_per_axis_per_level_setitem(self): @@ -374,16 +498,17 @@ def test_per_axis_per_level_setitem(self): idx = pd.IndexSlice # test multi-index slicing with per axis and per index controls - index = MultiIndex.from_tuples([('A', 1), ('A', 2), - ('A', 3), ('B', 1)], - names=['one', 'two']) - columns = MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) + index = MultiIndex.from_tuples( + [("A", 1), ("A", 2), ("A", 3), ("B", 1)], names=["one", "two"] + ) + columns = MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], + names=["lvl0", "lvl1"], + ) df_orig = DataFrame( - np.arange(16, dtype='int64').reshape( - 4, 4), index=index, columns=columns) + np.arange(16, dtype="int64").reshape(4, 4), index=index, columns=columns + ) df_orig = df_orig.sort_index(axis=0).sort_index(axis=1) # identity @@ -432,34 +557,35 @@ def test_per_axis_per_level_setitem(self): # columns df = df_orig.copy() - df.loc[:, (slice(None), ['foo'])] = 100 + df.loc[:, (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[:, [1, 3]] = 100 tm.assert_frame_equal(df, expected) # both df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 + df.loc[(slice(None), 1), (slice(None), ["foo"])] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, 
expected) df = df_orig.copy() - df.loc[idx[:, 1], idx[:, ['foo']]] = 100 + df.loc[idx[:, 1], idx[:, ["foo"]]] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc['A', 'a'] = 100 + df.loc["A", "a"] = 100 expected = df_orig.copy() expected.iloc[0:3, 0:2] = 100 tm.assert_frame_equal(df, expected) # setting with a list-like df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100, 100], [100, 100]], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100, 100], [100, 100]], dtype="int64" + ) expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 tm.assert_frame_equal(df, expected) @@ -468,39 +594,44 @@ def test_per_axis_per_level_setitem(self): df = df_orig.copy() with pytest.raises(ValueError): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [[100], [100, 100]], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [[100], [100, 100]], dtype="int64" + ) with pytest.raises(ValueError): - df.loc[(slice(None), 1), (slice(None), ['foo'])] = np.array( - [100, 100, 100, 100], dtype='int64') + df.loc[(slice(None), 1), (slice(None), ["foo"])] = np.array( + [100, 100, 100, 100], dtype="int64" + ) # with an alignable rhs df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] = df.loc[(slice( - None), 1), (slice(None), ['foo'])] * 5 + df.loc[(slice(None), 1), (slice(None), ["foo"])] = ( + df.loc[(slice(None), 1), (slice(None), ["foo"])] * 5 + ) expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( - None), 1), (slice(None), ['foo'])] + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= df.loc[ + (slice(None), 1), (slice(None), ["foo"]) + ] expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(df, expected) - rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() - rhs.loc[:, ('c', 'bah')] = 10 + rhs = df_orig.loc[(slice(None), 1), (slice(None), ["foo"])].copy() + rhs.loc[:, ("c", "bah")] = 10 df = df_orig.copy() - df.loc[(slice(None), 1), (slice(None), ['foo'])] *= rhs + df.loc[(slice(None), 1), (slice(None), ["foo"])] *= rhs expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] tm.assert_frame_equal(df, expected) def test_multiindex_label_slicing_with_negative_step(self): - s = Series(np.arange(20), - MultiIndex.from_product([list('abcde'), np.arange(4)])) + s = Series( + np.arange(20), MultiIndex.from_product([list("abcde"), np.arange(4)]) + ) SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): @@ -511,46 +642,45 @@ def assert_slices_equivalent(l_slc, i_slc): assert_slices_equivalent(SLC[::-1], SLC[::-1]) - assert_slices_equivalent(SLC['d'::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[('d', )::-1], SLC[15::-1]) + assert_slices_equivalent(SLC["d"::-1], SLC[15::-1]) + assert_slices_equivalent(SLC[("d",)::-1], SLC[15::-1]) - assert_slices_equivalent(SLC[:'d':-1], SLC[:11:-1]) - assert_slices_equivalent(SLC[:('d', ):-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:"d":-1], SLC[:11:-1]) + assert_slices_equivalent(SLC[:("d",):-1], SLC[:11:-1]) - assert_slices_equivalent(SLC['d':'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):'b':-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['d':('b', ):-1], 
SLC[15:3:-1]) - assert_slices_equivalent(SLC[('d', ):('b', ):-1], SLC[15:3:-1]) - assert_slices_equivalent(SLC['b':'d':-1], SLC[:0]) + assert_slices_equivalent(SLC["d":"b":-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):"b":-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC["d":("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC[("d",):("b",):-1], SLC[15:3:-1]) + assert_slices_equivalent(SLC["b":"d":-1], SLC[:0]) - assert_slices_equivalent(SLC[('c', 2)::-1], SLC[10::-1]) - assert_slices_equivalent(SLC[:('c', 2):-1], SLC[:9:-1]) - assert_slices_equivalent(SLC[('e', 0):('c', 2):-1], SLC[16:9:-1]) + assert_slices_equivalent(SLC[("c", 2)::-1], SLC[10::-1]) + assert_slices_equivalent(SLC[:("c", 2):-1], SLC[:9:-1]) + assert_slices_equivalent(SLC[("e", 0):("c", 2):-1], SLC[16:9:-1]) def test_multiindex_slice_first_level(self): # GH 12697 - freq = ['a', 'b', 'c', 'd'] + freq = ["a", "b", "c", "d"] idx = MultiIndex.from_product([freq, np.arange(500)]) - df = DataFrame(list(range(2000)), index=idx, columns=['Test']) + df = DataFrame(list(range(2000)), index=idx, columns=["Test"]) df_slice = df.loc[pd.IndexSlice[:, 30:70], :] - result = df_slice.loc['a'] - expected = DataFrame(list(range(30, 71)), - columns=['Test'], index=range(30, 71)) + result = df_slice.loc["a"] + expected = DataFrame(list(range(30, 71)), columns=["Test"], index=range(30, 71)) tm.assert_frame_equal(result, expected) - result = df_slice.loc['d'] - expected = DataFrame(list(range(1530, 1571)), - columns=['Test'], index=range(30, 71)) + result = df_slice.loc["d"] + expected = DataFrame( + list(range(1530, 1571)), columns=["Test"], index=range(30, 71) + ) tm.assert_frame_equal(result, expected) - def test_int_series_slicing( - self, multiindex_year_month_day_dataframe_random_data): + def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_data): ymd = multiindex_year_month_day_dataframe_random_data - s = ymd['A'] + s = ymd["A"] result = s[5:] expected = s.reindex(s.index[5:]) tm.assert_series_equal(result, expected) - exp = ymd['A'].copy() + exp = ymd["A"].copy() s[5:] = 0 exp.values[5:] = 0 tm.assert_numpy_array_equal(s.values, exp.values) @@ -562,16 +692,16 @@ def test_int_series_slicing( def test_non_reducing_slice_on_multiindex(self): # GH 19861 dic = { - ('a', 'd'): [1, 4], - ('a', 'c'): [2, 3], - ('b', 'c'): [3, 2], - ('b', 'd'): [4, 1] + ("a", "d"): [1, 4], + ("a", "c"): [2, 3], + ("b", "c"): [3, 2], + ("b", "d"): [4, 1], } df = pd.DataFrame(dic, index=[0, 1]) idx = pd.IndexSlice - slice_ = idx[:, idx['b', 'd']] + slice_ = idx[:, idx["b", "d"]] tslice_ = _non_reducing_slice(slice_) result = df.loc[tslice_] - expected = pd.DataFrame({('b', 'd'): [4, 1]}) + expected = pd.DataFrame({("b", "d"): [4, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 7fb6c806ae793..43ad66b7d1116 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -8,14 +8,15 @@ class TestMultiIndexSorted: def test_getitem_multilevel_index_tuple_not_sorted(self): index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) + df = DataFrame( + [[0, 1, 0, "x"], [0, 0, 1, "y"]], columns=index_columns + ["data"] + ) df = df.set_index(index_columns) query_index = df.index[:1] rs = df.loc[query_index, "data"] - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], index=xp_idx, 
name='data') + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=["a", "b", "c"]) + xp = Series(["x"], index=xp_idx, name="data") tm.assert_series_equal(rs, xp) def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): @@ -23,21 +24,25 @@ def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): df = frame.sort_index(level=1).T # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] + result = df.iloc[:, : np.int32(3)] expected = df.reindex(columns=df.columns[:3]) tm.assert_frame_equal(result, expected) def test_frame_getitem_not_sorted2(self): # 13431 - df = DataFrame({'col1': ['b', 'd', 'b', 'a'], - 'col2': [3, 1, 1, 2], - 'data': ['one', 'two', 'three', 'four']}) - - df2 = df.set_index(['col1', 'col2']) + df = DataFrame( + { + "col1": ["b", "d", "b", "a"], + "col2": [3, 1, 1, 2], + "data": ["one", "two", "three", "four"], + } + ) + + df2 = df.set_index(["col1", "col2"]) df2_original = df2.copy() - df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_codes([0, 1, 0, 2], level='col1', inplace=True) + df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic @@ -54,37 +59,39 @@ def test_frame_getitem_not_sorted2(self): def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T - df['foo', 'four'] = 'foo' + df["foo", "four"] = "foo" arrays = [np.array(x) for x in zip(*df.columns.values)] - result = df['foo'] - result2 = df.loc[:, 'foo'] - expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + result = df["foo"] + result2 = df.loc[:, "foo"] + expected = df.reindex(columns=df.columns[arrays[0] == "foo"]) expected.columns = expected.columns.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) df = df.T - result = df.xs('foo') - result2 = df.loc['foo'] - expected = df.reindex(df.index[arrays[0] == 'foo']) + result = df.xs("foo") + result2 = df.loc["foo"] + expected = df.reindex(df.index[arrays[0] == "foo"]) expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) def test_series_getitem_not_sorted(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) arrays = [np.array(x) for x in zip(*index.values)] - result = s['qux'] - result2 = s.loc['qux'] - expected = s[arrays[0] == 'qux'] + result = s["qux"] + result2 = s.loc["qux"] + expected = s[arrays[0] == "qux"] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index bbc55c75c5b77..5e58b3ec155ed 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -10,26 +10,36 @@ @pytest.fixture def four_level_index_dataframe(): - arr = np.array([[-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.358, 0.89145, 2.5838]]) + arr = np.array( + [ + [-0.5109, -2.3358, 
-0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.358, 0.89145, 2.5838], + ] + ) index = MultiIndex( - levels=[['a', 'x'], ['b', 'q'], [10.0032, 20.0, 30.0], [3, 4, 5]], + levels=[["a", "x"], ["b", "q"], [10.0032, 20.0, 30.0], [3, 4, 5]], codes=[[0, 0, 1], [0, 1, 1], [0, 1, 2], [2, 1, 0]], - names=['one', 'two', 'three', 'four']) - return DataFrame(arr, index=index, columns=list('ABCDE')) - - -@pytest.mark.parametrize('key, level, exp_arr, exp_index', [ - ('a', 'lvl0', lambda x: x[:, 0:2], Index(['bar', 'foo'], name='lvl1')), - ('foo', 'lvl1', lambda x: x[:, 1:2], Index(['a'], name='lvl0')) -]) + names=["one", "two", "three", "four"], + ) + return DataFrame(arr, index=index, columns=list("ABCDE")) + + +@pytest.mark.parametrize( + "key, level, exp_arr, exp_index", + [ + ("a", "lvl0", lambda x: x[:, 0:2], Index(["bar", "foo"], name="lvl1")), + ("foo", "lvl1", lambda x: x[:, 1:2], Index(["a"], name="lvl0")), + ], +) def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): # see gh-2903 arr = np.random.randn(4, 4) - index = MultiIndex(levels=[['a', 'b'], ['bar', 'foo', 'hello', 'world']], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]], - names=['lvl0', 'lvl1']) + index = MultiIndex( + levels=[["a", "b"], ["bar", "foo", "hello", "world"]], + codes=[[0, 0, 1, 1], [0, 1, 2, 3]], + names=["lvl0", "lvl1"], + ) df = DataFrame(arr, columns=index) result = df.xs(key, level=level, axis=1) expected = DataFrame(exp_arr(arr), columns=exp_index) @@ -38,15 +48,15 @@ def test_xs_named_levels_axis_eq_1(key, level, exp_arr, exp_index): def test_xs_values(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs(('bar', 'two')).values + result = df.xs(("bar", "two")).values expected = df.values[4] tm.assert_almost_equal(result, expected) def test_xs_loc_equality(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs(('bar', 'two')) - expected = df.loc[('bar', 'two')] + result = df.xs(("bar", "two")) + expected = df.loc[("bar", "two")] tm.assert_series_equal(result, expected) @@ -54,33 +64,31 @@ def test_xs_missing_values_in_index(): # see gh-6574 # missing values in returned index should be preserved acc = [ - ('a', 'abcde', 1), - ('b', 'bbcde', 2), - ('y', 'yzcde', 25), - ('z', 'xbcde', 24), - ('z', None, 26), - ('z', 'zbcde', 25), - ('z', 'ybcde', 26), + ("a", "abcde", 1), + ("b", "bbcde", 2), + ("y", "yzcde", 25), + ("z", "xbcde", 24), + ("z", None, 26), + ("z", "zbcde", 25), + ("z", "ybcde", 26), ] - df = DataFrame(acc, - columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) - expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( - ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) + df = DataFrame(acc, columns=["a1", "a2", "cnt"]).set_index(["a1", "a2"]) + expected = DataFrame( + {"cnt": [24, 26, 25, 26]}, + index=Index(["xbcde", np.nan, "zbcde", "ybcde"], name="a2"), + ) - result = df.xs('z', level='a1') + result = df.xs("z", level="a1") tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('key, level', [ - ('one', 'second'), - (['one'], ['second']) -]) +@pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): # see gh-13719 frame = multiindex_dataframe_random_data df = concat([frame] * 2) assert df.index.is_unique is False - expected = concat([frame.xs('one', level='second')] * 2) + expected = concat([frame.xs("one", level="second")] * 2) result = df.xs(key, level=level) 
tm.assert_frame_equal(result, expected) @@ -88,36 +96,38 @@ def test_xs_with_duplicates(key, level, multiindex_dataframe_random_data): def test_xs_level(multiindex_dataframe_random_data): df = multiindex_dataframe_random_data - result = df.xs('two', level='second') - expected = df[df.index.get_level_values(1) == 'two'] - expected.index = Index(['foo', 'bar', 'baz', 'qux'], name='first') + result = df.xs("two", level="second") + expected = df[df.index.get_level_values(1) == "two"] + expected.index = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_frame_equal(result, expected) def test_xs_level_eq_2(): arr = np.random.randn(3, 5) index = MultiIndex( - levels=[['a', 'p', 'x'], ['b', 'q', 'y'], ['c', 'r', 'z']], - codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]]) + levels=[["a", "p", "x"], ["b", "q", "y"], ["c", "r", "z"]], + codes=[[2, 0, 1], [2, 0, 1], [2, 0, 1]], + ) df = DataFrame(arr, index=index) - expected = DataFrame(arr[1:2], index=[['a'], ['b']]) - result = df.xs('c', level=2) + expected = DataFrame(arr[1:2], index=[["a"], ["b"]]) + result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda df: df.xs(('a', 4), level=['one', 'four']), - lambda df: df.xs('a').xs(4, level='four') -]) +@pytest.mark.parametrize( + "indexer", + [ + lambda df: df.xs(("a", 4), level=["one", "four"]), + lambda df: df.xs("a").xs(4, level="four"), + ], +) def test_xs_level_multiple(indexer, four_level_index_dataframe): df = four_level_index_dataframe expected_values = [[0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] expected_index = MultiIndex( - levels=[['q'], [20.0]], - codes=[[0], [0]], - names=['two', 'three']) - expected = DataFrame( - expected_values, index=expected_index, columns=list('ABCDE')) + levels=[["q"], [20.0]], codes=[[0], [0]], names=["two", "three"] + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) result = indexer(df) tm.assert_frame_equal(result, expected) @@ -125,11 +135,11 @@ def test_xs_level_multiple(indexer, four_level_index_dataframe): def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): # this is a copy in 0.14 df = multiindex_dataframe_random_data - result = df.xs('two', level='second') + result = df.xs("two", level="second") # setting this will give a SettingWithCopyError # as we are trying to write a view - msg = 'A value is trying to be set on a copy of a slice from a DataFrame' + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): result[:] = 10 @@ -137,11 +147,11 @@ def test_xs_setting_with_copy_error(multiindex_dataframe_random_data): def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): # this is a copy in 0.14 df = four_level_index_dataframe - result = df.xs(('a', 4), level=['one', 'four']) + result = df.xs(("a", 4), level=["one", "four"]) # setting this will give a SettingWithCopyError # as we are trying to write a view - msg = 'A value is trying to be set on a copy of a slice from a DataFrame' + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): result[:] = 10 @@ -149,32 +159,32 @@ def test_xs_setting_with_copy_error_multiple(four_level_index_dataframe): def test_xs_integer_key(): # see gh-2107 dates = range(20111201, 20111205) - ids = 'abcde' + ids = "abcde" index = MultiIndex.from_tuples( - [x for x in product(dates, ids)], - names=['date', 'secid']) - df = DataFrame( - 
np.random.randn(len(index), 3), index, ['X', 'Y', 'Z']) + [x for x in product(dates, ids)], names=["date", "secid"] + ) + df = DataFrame(np.random.randn(len(index), 3), index, ["X", "Y", "Z"]) - result = df.xs(20111201, level='date') + result = df.xs(20111201, level="date") expected = df.loc[20111201, :] tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize('indexer', [ - lambda df: df.xs('a', level=0), - lambda df: df.xs('a') -]) +@pytest.mark.parametrize( + "indexer", [lambda df: df.xs("a", level=0), lambda df: df.xs("a")] +) def test_xs_level0(indexer, four_level_index_dataframe): df = four_level_index_dataframe - expected_values = [[-0.5109, -2.3358, -0.4645, 0.05076, 0.364], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744]] + expected_values = [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.364], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + ] expected_index = MultiIndex( - levels=[['b', 'q'], [10.0032, 20.0], [4, 5]], + levels=[["b", "q"], [10.0032, 20.0], [4, 5]], codes=[[0, 1], [0, 1], [1, 0]], - names=['two', 'three', 'four']) - expected = DataFrame( - expected_values, index=expected_index, columns=list('ABCDE')) + names=["two", "three", "four"], + ) + expected = DataFrame(expected_values, index=expected_index, columns=list("ABCDE")) result = indexer(df) tm.assert_frame_equal(result, expected) @@ -184,9 +194,9 @@ def test_xs_level_series(multiindex_dataframe_random_data): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor df = multiindex_dataframe_random_data - s = df['A'] - result = s[:, 'two'] - expected = df.xs('two', level=1)['A'] + s = df["A"] + result = s[:, "two"] + expected = df.xs("two", level=1)["A"] tm.assert_series_equal(result, expected) @@ -194,44 +204,46 @@ def test_xs_level_series_ymd(multiindex_year_month_day_dataframe_random_data): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor df = multiindex_year_month_day_dataframe_random_data - s = df['A'] + s = df["A"] result = s[2000, 5] - expected = df.loc[2000, 5]['A'] + expected = df.loc[2000, 5]["A"] tm.assert_series_equal(result, expected) def test_xs_level_series_slice_not_implemented( - multiindex_year_month_day_dataframe_random_data): + multiindex_year_month_day_dataframe_random_data +): # this test is not explicitly testing .xs functionality # TODO: move to another module or refactor # not implementing this for now df = multiindex_year_month_day_dataframe_random_data - s = df['A'] + s = df["A"] - msg = r'\(2000, slice\(3, 4, None\)\)' + msg = r"\(2000, slice\(3, 4, None\)\)" with pytest.raises(TypeError, match=msg): s[2000, 3:4] def test_series_getitem_multiindex_xs(): # GH6258 - dt = list(date_range('20130903', periods=3)) - idx = MultiIndex.from_product([list('AB'), dt]) + dt = list(date_range("20130903", periods=3)) + idx = MultiIndex.from_product([list("AB"), dt]) s = Series([1, 3, 4, 1, 3, 4], index=idx) - expected = Series([1, 1], index=list('AB')) + expected = Series([1, 1], index=list("AB")) - result = s.xs('20130903', level=1) + result = s.xs("20130903", level=1) tm.assert_series_equal(result, expected) def test_series_getitem_multiindex_xs_by_label(): # GH5684 - idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')]) + idx = MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] + ) s = Series([1, 2, 3, 4], index=idx) - s.index.set_names(['L1', 'L2'], inplace=True) - expected = Series([1, 3], index=['a', 'b']) - expected.index.set_names(['L1'], 
inplace=True) + s.index.set_names(["L1", "L2"], inplace=True) + expected = Series([1, 3], index=["a", "b"]) + expected.index.set_names(["L1"], inplace=True) - result = s.xs('one', level='L2') + result = s.xs("one", level="L2") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 657309170cac3..78aaf80b532fb 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -5,11 +5,9 @@ class TestIndexingCallable: - def test_frame_loc_callable(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'), - 'C': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) # iloc cannot use boolean Series (see GH3635) # return bool indexer @@ -19,168 +17,161 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2, ] - tm.assert_frame_equal(res, df.loc[df.A > 2, ]) + res = df.loc[lambda x: x.A > 2,] + tm.assert_frame_equal(res, df.loc[df.A > 2,]) - res = df.loc[lambda x: x.A > 2, ] - tm.assert_frame_equal(res, df.loc[df.A > 2, ]) + res = df.loc[lambda x: x.A > 2,] + tm.assert_frame_equal(res, df.loc[df.A > 2,]) - res = df.loc[lambda x: x.B == 'b', :] - tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - res = df.loc[lambda x: x.B == 'b', :] - tm.assert_frame_equal(res, df.loc[df.B == 'b', :]) + res = df.loc[lambda x: x.B == "b", :] + tm.assert_frame_equal(res, df.loc[df.B == "b", :]) - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - res = df.loc[lambda x: x.A > 2, lambda x: x.columns == 'B'] + res = df.loc[lambda x: x.A > 2, lambda x: x.columns == "B"] tm.assert_frame_equal(res, df.loc[df.A > 2, [False, True, False]]) - res = df.loc[lambda x: x.A > 2, lambda x: 'B'] - tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - res = df.loc[lambda x: x.A > 2, lambda x: 'B'] - tm.assert_series_equal(res, df.loc[df.A > 2, 'B']) + res = df.loc[lambda x: x.A > 2, lambda x: "B"] + tm.assert_series_equal(res, df.loc[df.A > 2, "B"]) - res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A > 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) - res = df.loc[lambda x: x.A == 2, lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A == 2, ['A', 'B']]) + res = df.loc[lambda x: x.A == 2, lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A == 2, ["A", "B"]]) # scalar - res = df.loc[lambda x: 1, lambda x: 'A'] - assert res == df.loc[1, 'A'] + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] - res = df.loc[lambda 
x: 1, lambda x: 'A'] - assert res == df.loc[1, 'A'] + res = df.loc[lambda x: 1, lambda x: "A"] + assert res == df.loc[1, "A"] def test_frame_loc_callable_mixture(self): # GH 11485 - df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb'), - 'C': [1, 2, 3, 4]}) + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": list("aabb"), "C": [1, 2, 3, 4]}) - res = df.loc[lambda x: x.A > 2, ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[lambda x: x.A > 2, ['A', 'B']] - tm.assert_frame_equal(res, df.loc[df.A > 2, ['A', 'B']]) + res = df.loc[lambda x: x.A > 2, ["A", "B"]] + tm.assert_frame_equal(res, df.loc[df.A > 2, ["A", "B"]]) - res = df.loc[[2, 3], lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - res = df.loc[[2, 3], lambda x: ['A', 'B']] - tm.assert_frame_equal(res, df.loc[[2, 3], ['A', 'B']]) + res = df.loc[[2, 3], lambda x: ["A", "B"]] + tm.assert_frame_equal(res, df.loc[[2, 3], ["A", "B"]]) - res = df.loc[3, lambda x: ['A', 'B']] - tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) - res = df.loc[3, lambda x: ['A', 'B']] - tm.assert_series_equal(res, df.loc[3, ['A', 'B']]) + res = df.loc[3, lambda x: ["A", "B"]] + tm.assert_series_equal(res, df.loc[3, ["A", "B"]]) def test_frame_loc_callable_labels(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return label - res = df.loc[lambda x: ['A', 'C']] - tm.assert_frame_equal(res, df.loc[['A', 'C']]) + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ['A', 'C'], ] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ]) + res = df.loc[lambda x: ["A", "C"],] + tm.assert_frame_equal(res, df.loc[["A", "C"],]) - res = df.loc[lambda x: ['A', 'C'], :] - tm.assert_frame_equal(res, df.loc[['A', 'C'], :]) + res = df.loc[lambda x: ["A", "C"], :] + tm.assert_frame_equal(res, df.loc[["A", "C"], :]) - res = df.loc[lambda x: ['A', 'C'], lambda x: 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[lambda x: ["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[lambda x: ['A', 'C'], lambda x: ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[lambda x: ["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) # mixture - res = df.loc[['A', 'C'], lambda x: 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[["A", "C"], lambda x: "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[['A', 'C'], lambda x: ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[["A", "C"], lambda x: ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) - res = df.loc[lambda x: ['A', 'C'], 'X'] - tm.assert_series_equal(res, df.loc[['A', 'C'], 'X']) + res = df.loc[lambda x: ["A", "C"], "X"] + tm.assert_series_equal(res, df.loc[["A", "C"], "X"]) - res = df.loc[lambda x: ['A', 'C'], ['X']] - tm.assert_frame_equal(res, df.loc[['A', 'C'], ['X']]) + res = df.loc[lambda x: ["A", "C"], ["X"]] + tm.assert_frame_equal(res, df.loc[["A", "C"], ["X"]]) def 
test_frame_loc_callable_setitem(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return label res = df.copy() - res.loc[lambda x: ['A', 'C']] = -20 + res.loc[lambda x: ["A", "C"]] = -20 exp = df.copy() - exp.loc[['A', 'C']] = -20 + exp.loc[["A", "C"]] = -20 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], :] = 20 + res.loc[lambda x: ["A", "C"], :] = 20 exp = df.copy() - exp.loc[['A', 'C'], :] = 20 + exp.loc[["A", "C"], :] = 20 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], lambda x: 'X'] = -1 + res.loc[lambda x: ["A", "C"], lambda x: "X"] = -1 exp = df.copy() - exp.loc[['A', 'C'], 'X'] = -1 + exp.loc[["A", "C"], "X"] = -1 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], lambda x: ['X']] = [5, 10] + res.loc[lambda x: ["A", "C"], lambda x: ["X"]] = [5, 10] exp = df.copy() - exp.loc[['A', 'C'], ['X']] = [5, 10] + exp.loc[["A", "C"], ["X"]] = [5, 10] tm.assert_frame_equal(res, exp) # mixture res = df.copy() - res.loc[['A', 'C'], lambda x: 'X'] = np.array([-1, -2]) + res.loc[["A", "C"], lambda x: "X"] = np.array([-1, -2]) exp = df.copy() - exp.loc[['A', 'C'], 'X'] = np.array([-1, -2]) + exp.loc[["A", "C"], "X"] = np.array([-1, -2]) tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[['A', 'C'], lambda x: ['X']] = 10 + res.loc[["A", "C"], lambda x: ["X"]] = 10 exp = df.copy() - exp.loc[['A', 'C'], ['X']] = 10 + exp.loc[["A", "C"], ["X"]] = 10 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], 'X'] = -2 + res.loc[lambda x: ["A", "C"], "X"] = -2 exp = df.copy() - exp.loc[['A', 'C'], 'X'] = -2 + exp.loc[["A", "C"], "X"] = -2 tm.assert_frame_equal(res, exp) res = df.copy() - res.loc[lambda x: ['A', 'C'], ['X']] = -4 + res.loc[lambda x: ["A", "C"], ["X"]] = -4 exp = df.copy() - exp.loc[['A', 'C'], ['X']] = -4 + exp.loc[["A", "C"], ["X"]] = -4 tm.assert_frame_equal(res, exp) def test_frame_iloc_callable(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return location res = df.iloc[lambda x: [1, 3]] @@ -210,9 +201,7 @@ def test_frame_iloc_callable(self): def test_frame_iloc_callable_setitem(self): # GH 11485 - df = pd.DataFrame({'X': [1, 2, 3, 4], - 'Y': list('aabb')}, - index=list('ABCD')) + df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD")) # return location res = df.copy() diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 1ec89af42a1e1..3549d81623e10 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -6,67 +6,84 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Index, Interval, Series, - Timestamp) + Categorical, + CategoricalIndex, + DataFrame, + Index, + Interval, + Series, + Timestamp, +) from pandas.api.types import CategoricalDtype as CDT from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestCategoricalIndex: - def setup_method(self, method): - self.df = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')).astype( - CDT(list('cab')))}).set_index('B') - self.df2 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': Series(list('aabbca')).astype( 
- CDT(list('cabe')))}).set_index('B') - self.df3 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': (Series([1, 1, 2, 1, 3, 2]) - .astype(CDT([3, 2, 1], ordered=True))) - }).set_index('B') - self.df4 = DataFrame({'A': np.arange(6, dtype='int64'), - 'B': (Series([1, 1, 2, 1, 3, 2]) - .astype(CDT([3, 2, 1], ordered=False))) - }).set_index('B') + self.df = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cab"))), + } + ).set_index("B") + self.df2 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype(CDT(list("cabe"))), + } + ).set_index("B") + self.df3 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=True))), + } + ).set_index("B") + self.df4 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), + } + ).set_index("B") def test_loc_scalar(self): - result = self.df.loc['a'] - expected = (DataFrame({'A': [0, 1, 5], - 'B': (Series(list('aaa')) - .astype(CDT(list('cab'))))}) - .set_index('B')) + result = self.df.loc["a"] + expected = DataFrame( + {"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))} + ).set_index("B") assert_frame_equal(result, expected) df = self.df.copy() - df.loc['a'] = 20 - expected = (DataFrame({'A': [20, 20, 2, 3, 4, 20], - 'B': (Series(list('aabbca')) - .astype(CDT(list('cab'))))}) - .set_index('B')) + df.loc["a"] = 20 + expected = DataFrame( + { + "A": [20, 20, 2, 3, 4, 20], + "B": (Series(list("aabbca")).astype(CDT(list("cab")))), + } + ).set_index("B") assert_frame_equal(df, expected) # value not in the categories with pytest.raises(KeyError, match=r"^'d'$"): - df.loc['d'] + df.loc["d"] msg = "cannot append a non-category item to a CategoricalIndex" with pytest.raises(TypeError, match=msg): - df.loc['d'] = 10 + df.loc["d"] = 10 - msg = ("cannot insert an item into a CategoricalIndex that is not" - " already an existing category") + msg = ( + "cannot insert an item into a CategoricalIndex that is not" + " already an existing category" + ) with pytest.raises(TypeError, match=msg): - df.loc['d', 'A'] = 10 + df.loc["d", "A"] = 10 with pytest.raises(TypeError, match=msg): - df.loc['d', 'C'] = 10 + df.loc["d", "C"] = 10 def test_getitem_scalar(self): - cats = Categorical([Timestamp('12-31-1999'), - Timestamp('12-31-2000')]) + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) s = Series([1, 2], index=cats) @@ -79,7 +96,7 @@ def test_slicing_directly(self): sliced = cat[3] assert sliced == "d" sliced = cat[3:5] - expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd']) + expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"]) tm.assert_numpy_array_equal(sliced._codes, expected._codes) tm.assert_index_equal(sliced.categories, expected.categories) @@ -89,20 +106,22 @@ def test_slicing(self): exp = np.array([4, 3, 2, 1], dtype=np.int64) tm.assert_numpy_array_equal(reversed.__array__(), exp) - df = DataFrame({'value': (np.arange(100) + 1).astype('int64')}) - df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) + df = DataFrame({"value": (np.arange(100) + 1).astype("int64")}) + df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100]) - expected = Series([11, Interval(0, 25)], index=['value', 'D'], name=10) + expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10) result = df.iloc[10] tm.assert_series_equal(result, expected) - expected = DataFrame({'value': np.arange(11, 
21).astype('int64')}, - index=np.arange(10, 20).astype('int64')) - expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100]) + expected = DataFrame( + {"value": np.arange(11, 21).astype("int64")}, + index=np.arange(10, 20).astype("int64"), + ) + expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100]) result = df.iloc[10:20] tm.assert_frame_equal(result, expected) - expected = Series([9, Interval(0, 25)], index=['value', 'D'], name=8) + expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8) result = df.loc[8] tm.assert_series_equal(result, expected) @@ -116,7 +135,8 @@ def test_slicing_and_getting_ops(self): # - returning a single value cats = Categorical( - ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]) + ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"] + ) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 2, 3, 4, 5, 6, 7] df = DataFrame({"cats": cats, "values": values}, index=idx) @@ -130,11 +150,10 @@ def test_slicing_and_getting_ops(self): exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2) # :,"cats" | :,0 - exp_col = Series(cats, index=idx, name='cats') + exp_col = Series(cats, index=idx, name="cats") # "j",: | 2,: - exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", - name="j") + exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j") # "j","cats | 2,0 exp_val = "b" @@ -248,39 +267,47 @@ def test_slicing_and_getting_ops(self): def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) + cats = Categorical( + ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"] + ) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), - "values": [2, 2]}, index=['j', 'k']) + { + "cats": Categorical(["b", "b"], categories=["a", "b", "c"]), + "values": [2, 2], + }, + index=["j", "k"], + ) tm.assert_frame_equal(result, expected) result = df.iloc[2:4, :].dtypes - expected = Series(['category', 'int64'], ['cats', 'values']) + expected = Series(["category", "int64"], ["cats", "values"]) tm.assert_series_equal(result, expected) result = df.loc["h":"j", "cats"] - expected = Series(Categorical(['a', 'b', 'b'], - categories=['a', 'b', 'c']), - index=['h', 'i', 'j'], name='cats') + expected = Series( + Categorical(["a", "b", "b"], categories=["a", "b", "c"]), + index=["h", "i", "j"], + name="cats", + ) tm.assert_series_equal(result, expected) result = df.loc["h":"j", df.columns[0:1]] - expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], - categories=['a', 'b', 'c'])}, - index=['h', 'i', 'j']) + expected = DataFrame( + {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])}, + index=["h", "i", "j"], + ) tm.assert_frame_equal(result, expected) def test_getitem_category_type(self): # GH 14580 # test iloc() on Series with Categorical data - s = Series([1, 2, 3]).astype('category') + s = Series([1, 2, 3]).astype("category") # get slice result = s.iloc[0:2] @@ -300,156 +327,147 @@ def test_getitem_category_type(self): def test_loc_listlike(self): # list of labels - result = self.df.loc[['c', 'a']] + result = self.df.loc[["c", "a"]] expected = self.df.iloc[[4, 0, 1, 5]] assert_frame_equal(result, expected, check_index_type=True) - 
result = self.df2.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + result = self.df2.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) # element in the categories but not in the values with pytest.raises(KeyError, match=r"^'e'$"): - self.df2.loc['e'] + self.df2.loc["e"] # assign is ok df = self.df2.copy() - df.loc['e'] = 20 - result = df.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, 20]}, index=exp_index) + df.loc["e"] = 20 + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index) assert_frame_equal(result, expected) df = self.df2.copy() - result = df.loc[['a', 'b', 'e']] - exp_index = CategoricalIndex( - list('aaabbe'), categories=list('cabe'), name='B') - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan]}, index=exp_index) + result = df.loc[["a", "b", "e"]] + exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B") + expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index) assert_frame_equal(result, expected, check_index_type=True) # not all labels in the categories with pytest.raises(KeyError): - self.df2.loc[['a', 'd']] + self.df2.loc[["a", "d"]] def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes - index = CategoricalIndex(['a', 'b', 'c']) - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + index = CategoricalIndex(["a", "b", "c"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice - res = df.loc[['a', 'b']] - exp_index = CategoricalIndex(['a', 'b'], - categories=index.categories) - exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) + res = df.loc[["a", "b"]] + exp_index = CategoricalIndex(["a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] + res = df.loc[["a", "a", "b"]] - exp_index = CategoricalIndex(['a', 'a', 'b'], - categories=index.categories) - exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) + exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories) + exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include ' - 'values that are in the categories') + msg = "a list-indexer must only include " "values that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] # duplicated categories and codes - index = CategoricalIndex(['a', 'b', 'a']) - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) + index = CategoricalIndex(["a", "b", "a"]) + df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice - res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 3, 2], - 'B': [4, 6, 5]}, - index=CategoricalIndex(['a', 'a', 'b'])) + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"]) + ) 
tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] + res = df.loc[["a", "a", "b"]] exp = DataFrame( - {'A': [1, 3, 1, 3, 2], - 'B': [4, 6, 4, 6, 5 - ]}, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) + {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"]), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include values ' - 'that are in the categories') + msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] # contains unused category - index = CategoricalIndex( - ['a', 'b', 'a', 'c'], categories=list('abcde')) - df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) - - res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 3, 2], 'B': [5, 7, 6]}, - index=CategoricalIndex(['a', 'a', 'b'], - categories=list('abcde'))) + index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) + df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) + + res = df.loc[["a", "b"]] + exp = DataFrame( + {"A": [1, 3, 2], "B": [5, 7, 6]}, + index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - res = df.loc[['a', 'e']] - exp = DataFrame({'A': [1, 3, np.nan], 'B': [5, 7, np.nan]}, - index=CategoricalIndex(['a', 'a', 'e'], - categories=list('abcde'))) + res = df.loc[["a", "e"]] + exp = DataFrame( + {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, + index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice - res = df.loc[['a', 'a', 'b']] - exp = DataFrame({'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6]}, - index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'], - categories=list('abcde'))) + res = df.loc[["a", "a", "b"]] + exp = DataFrame( + {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]}, + index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")), + ) tm.assert_frame_equal(res, exp, check_index_type=True) - msg = ('a list-indexer must only include values ' - 'that are in the categories') + msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): - df.loc[['a', 'x']] + df.loc[["a", "x"]] def test_get_indexer_array(self): - arr = np.array([Timestamp('1999-12-31 00:00:00'), - Timestamp('2000-12-31 00:00:00')], dtype=object) - cats = [Timestamp('1999-12-31 00:00:00'), - Timestamp('2000-12-31 00:00:00')] - ci = CategoricalIndex(cats, - categories=cats, - ordered=False, dtype='category') + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") result = ci.get_indexer(arr) - expected = np.array([0, 1], dtype='intp') + expected = np.array([0, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_same_order(self): - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - result = ci.get_indexer(CategoricalIndex(['b', 'b'], - categories=['a', 'b'])) - expected = np.array([1, 1], dtype='intp') + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = np.array([1, 1], 
dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_get_indexer_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19551 - ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - result = ci.get_indexer(CategoricalIndex(['b', 'b'], - categories=['b', 'a'])) - expected = np.array([1, 1], dtype='intp') + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected) def test_getitem_with_listlike(self): # GH 16115 - cats = Categorical([Timestamp('12-31-1999'), - Timestamp('12-31-2000')]) + cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) - expected = DataFrame([[1, 0], [0, 1]], dtype='uint8', - index=[0, 1], columns=cats) + expected = DataFrame( + [[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats + ) dummies = pd.get_dummies(cats) result = dummies[[c for c in dummies.columns]] assert_frame_equal(result, expected) @@ -459,65 +477,65 @@ def test_setitem_listlike(self): # GH 9469 # properly coerce the input indexers np.random.seed(1) - c = Categorical(np.random.randint(0, 5, size=150000).astype( - np.int8)).add_categories([-1000]) + c = Categorical( + np.random.randint(0, 5, size=150000).astype(np.int8) + ).add_categories([-1000]) indexer = np.array([100000]).astype(np.int64) c[indexer] = -1000 # we are asserting the code result here # which maps to the -1000 category result = c.codes[np.array([100000]).astype(np.int64)] - tm.assert_numpy_array_equal(result, np.array([5], dtype='int8')) + tm.assert_numpy_array_equal(result, np.array([5], dtype="int8")) def test_ix_categorical_index(self): # GH 12531 - df = DataFrame(np.random.randn(3, 3), - index=list('ABC'), columns=list('XYZ')) + df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ")) cdf = df.copy() cdf.index = CategoricalIndex(df.index) cdf.columns = CategoricalIndex(df.columns) - expect = Series(df.loc['A', :], index=cdf.columns, name='A') - assert_series_equal(cdf.loc['A', :], expect) + expect = Series(df.loc["A", :], index=cdf.columns, name="A") + assert_series_equal(cdf.loc["A", :], expect) - expect = Series(df.loc[:, 'X'], index=cdf.index, name='X') - assert_series_equal(cdf.loc[:, 'X'], expect) + expect = Series(df.loc[:, "X"], index=cdf.index, name="X") + assert_series_equal(cdf.loc[:, "X"], expect) - exp_index = CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) - expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=exp_index) - assert_frame_equal(cdf.loc[['A', 'B'], :], expect) + exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"]) + expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index) + assert_frame_equal(cdf.loc[["A", "B"], :], expect) - exp_columns = CategoricalIndex(list('XY'), - categories=['X', 'Y', 'Z']) - expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=exp_columns) - assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) + exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"]) + expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns) + assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) # non-unique - df = DataFrame(np.random.randn(3, 3), - index=list('ABA'), columns=list('XYX')) + df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX")) cdf = df.copy() cdf.index = CategoricalIndex(df.index) cdf.columns = 
CategoricalIndex(df.columns) - exp_index = CategoricalIndex(list('AA'), categories=['A', 'B']) - expect = DataFrame(df.loc['A', :], columns=cdf.columns, - index=exp_index) - assert_frame_equal(cdf.loc['A', :], expect) + exp_index = CategoricalIndex(list("AA"), categories=["A", "B"]) + expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index) + assert_frame_equal(cdf.loc["A", :], expect) - exp_columns = CategoricalIndex(list('XX'), categories=['X', 'Y']) - expect = DataFrame(df.loc[:, 'X'], index=cdf.index, - columns=exp_columns) - assert_frame_equal(cdf.loc[:, 'X'], expect) + exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"]) + expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns) + assert_frame_equal(cdf.loc[:, "X"], expect) - expect = DataFrame(df.loc[['A', 'B'], :], columns=cdf.columns, - index=CategoricalIndex(list('AAB'))) - assert_frame_equal(cdf.loc[['A', 'B'], :], expect) + expect = DataFrame( + df.loc[["A", "B"], :], + columns=cdf.columns, + index=CategoricalIndex(list("AAB")), + ) + assert_frame_equal(cdf.loc[["A", "B"], :], expect) - expect = DataFrame(df.loc[:, ['X', 'Y']], index=cdf.index, - columns=CategoricalIndex(list('XXY'))) - assert_frame_equal(cdf.loc[:, ['X', 'Y']], expect) + expect = DataFrame( + df.loc[:, ["X", "Y"]], + index=cdf.index, + columns=CategoricalIndex(list("XXY")), + ) + assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect) def test_read_only_source(self): # GH 10043 @@ -542,95 +560,101 @@ def test_reindexing(self): # reindexing # convert to a regular index - result = self.df2.reindex(['a', 'b', 'e']) - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan], - 'B': Series(list('aaabbe'))}).set_index('B') + result = self.df2.reindex(["a", "b", "e"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b']) - expected = DataFrame({'A': [0, 1, 5, 2, 3], - 'B': Series(list('aaabb'))}).set_index('B') + result = self.df2.reindex(["a", "b"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['e']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['e'])}).set_index('B') + result = self.df2.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['d']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['d'])}).set_index('B') + result = self.df2.reindex(["d"]) + expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # since we are actually reindexing with a Categorical # then return a Categorical - cats = list('cabe') + cats = list("cabe") - result = self.df2.reindex(Categorical(['a', 'd'], categories=cats)) - expected = DataFrame({'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(cats))}).set_index('B') + result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + expected = DataFrame( + {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(['a'], categories=cats)) - expected = DataFrame({'A': [0, 1, 5], - 'B': Series(list('aaa')).astype( - CDT(cats))}).set_index('B') + result = 
self.df2.reindex(Categorical(["a"], categories=cats)) + expected = DataFrame( + {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b', 'e']) - expected = DataFrame({'A': [0, 1, 5, 2, 3, np.nan], - 'B': Series(list('aaabbe'))}).set_index('B') + result = self.df2.reindex(["a", "b", "e"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['a', 'b']) - expected = DataFrame({'A': [0, 1, 5, 2, 3], - 'B': Series(list('aaabb'))}).set_index('B') + result = self.df2.reindex(["a", "b"]) + expected = DataFrame( + {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(['e']) - expected = DataFrame({'A': [np.nan], - 'B': Series(['e'])}).set_index('B') + result = self.df2.reindex(["e"]) + expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex(Categorical( - ['a', 'd'], categories=cats, ordered=True)) + result = self.df2.reindex( + Categorical(["a", "d"], categories=cats, ordered=True) + ) expected = DataFrame( - {'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(cats, ordered=True))}).set_index('B') + { + "A": [0, 1, 5, np.nan], + "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), + } + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical( - ['a', 'd'], categories=['a', 'd'])) - expected = DataFrame({'A': [0, 1, 5, np.nan], - 'B': Series(list('aaad')).astype( - CDT(['a', 'd']))}).set_index('B') + result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + expected = DataFrame( + {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed msg = "cannot reindex with a non-unique indexer" with pytest.raises(ValueError, match=msg): - self.df2.reindex(['a', 'a']) + self.df2.reindex(["a", "a"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" - with pytest.raises(NotImplementedError, match=msg.format('method')): - self.df2.reindex(['a'], method='ffill') - with pytest.raises(NotImplementedError, match=msg.format('level')): - self.df2.reindex(['a'], level=1) - with pytest.raises(NotImplementedError, match=msg.format('limit')): - self.df2.reindex(['a'], limit=2) + with pytest.raises(NotImplementedError, match=msg.format("method")): + self.df2.reindex(["a"], method="ffill") + with pytest.raises(NotImplementedError, match=msg.format("level")): + self.df2.reindex(["a"], level=1) + with pytest.raises(NotImplementedError, match=msg.format("limit")): + self.df2.reindex(["a"], limit=2) def test_loc_slice(self): # slicing # not implemented ATM # GH9748 - msg = ("cannot do slice indexing on {klass} with these " - r"indexers \[1\] of {kind}".format( - klass=str(CategoricalIndex), kind=str(int))) + msg = ( + "cannot do slice indexing on {klass} with these " + r"indexers \[1\] of {kind}".format( + klass=str(CategoricalIndex), kind=str(int) + ) + ) with pytest.raises(TypeError, match=msg): self.df.loc[1:5] @@ -641,23 
+665,24 @@ def test_loc_slice(self): def test_loc_and_at_with_categorical_index(self): # GH 20629 s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"])) - assert s.loc['A'] == 1 - assert s.at['A'] == 1 - df = DataFrame([[1, 2], [3, 4], [5, 6]], - index=pd.CategoricalIndex(["A", "B", "C"])) - assert df.loc['B', 1] == 4 - assert df.at['B', 1] == 4 + assert s.loc["A"] == 1 + assert s.at["A"] == 1 + df = DataFrame( + [[1, 2], [3, 4], [5, 6]], index=pd.CategoricalIndex(["A", "B", "C"]) + ) + assert df.loc["B", 1] == 4 + assert df.at["B", 1] == 4 def test_boolean_selection(self): df3 = self.df3 df4 = self.df4 - result = df3[df3.index == 'a'] + result = df3[df3.index == "a"] expected = df3.iloc[[]] assert_frame_equal(result, expected) - result = df4[df4.index == 'a'] + result = df4[df4.index == "a"] expected = df4.iloc[[]] assert_frame_equal(result, expected) @@ -701,31 +726,31 @@ def test_indexing_with_category(self): # https://github.com/pandas-dev/pandas/issues/12564 # consistent result if comparing as Dataframe - cat = DataFrame({'A': ['foo', 'bar', 'baz']}) - exp = DataFrame({'A': [True, False, False]}) + cat = DataFrame({"A": ["foo", "bar", "baz"]}) + exp = DataFrame({"A": [True, False, False]}) - res = (cat[['A']] == 'foo') + res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) - cat['A'] = cat['A'].astype('category') + cat["A"] = cat["A"].astype("category") - res = (cat[['A']] == 'foo') + res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) def test_map_with_dict_or_series(self): - orig_values = ['a', 'B', 1, 'a'] - new_values = ['one', 2, 3.0, 'one'] - cur_index = pd.CategoricalIndex(orig_values, name='XXX') - expected = pd.CategoricalIndex(new_values, - name='XXX', categories=[3.0, 2, 'one']) + orig_values = ["a", "B", 1, "a"] + new_values = ["one", 2, 3.0, "one"] + cur_index = pd.CategoricalIndex(orig_values, name="XXX") + expected = pd.CategoricalIndex( + new_values, name="XXX", categories=[3.0, 2, "one"] + ) mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) - mapper = {o: n for o, n in - zip(orig_values[:-1], new_values[:-1])} + mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index b94d3000a5841..7d47063623d87 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -8,11 +8,10 @@ class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): # this is chained assignment, but will 'work' - with option_context('chained_assignment', None): + with option_context("chained_assignment", None): # #3970 df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5}) @@ -27,164 +26,168 @@ def test_slice_consolidate_invalidate_item_cache(self): repr(df) # Assignment to wrong series - df['bb'].iloc[0] = 0.17 + df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df['bb'][0], 0.17) + tm.assert_almost_equal(df["bb"][0], 0.17) def test_setitem_cache_updating(self): # GH 5424 - cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven'] + cont = ["one", "two", "three", "four", "five", "six", "seven"] for do_ref in [False, False]: - df = DataFrame({'a': cont, - "b": cont[3:] + cont[:3], - 'c': np.arange(7)}) 
+ df = DataFrame({"a": cont, "b": cont[3:] + cont[:3], "c": np.arange(7)}) # ref the cache if do_ref: df.loc[0, "c"] # set it - df.loc[7, 'c'] = 1 + df.loc[7, "c"] = 1 - assert df.loc[0, 'c'] == 0.0 - assert df.loc[7, 'c'] == 1.0 + assert df.loc[0, "c"] == 0.0 + assert df.loc[7, "c"] == 1.0 # GH 7084 # not updating cache on series setting with slices - expected = DataFrame({'A': [600, 600, 600]}, - index=date_range('5/7/2014', '5/9/2014')) - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) - df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]}) + expected = DataFrame( + {"A": [600, 600, 600]}, index=date_range("5/7/2014", "5/9/2014") + ) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + df = DataFrame({"C": ["A", "A", "A"], "D": [100, 200, 300]}) # loop through df to update out - six = Timestamp('5/7/2014') - eix = Timestamp('5/9/2014') + six = Timestamp("5/7/2014") + eix = Timestamp("5/9/2014") for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] + out.loc[six:eix, row["C"]] = out.loc[six:eix, row["C"]] + row["D"] tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) # try via a chain indexing # this actually works - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): - v = out[row['C']][six:eix] + row['D'] - out[row['C']][six:eix] = v + v = out[row["C"]][six:eix] + row["D"] + out[row["C"]][six:eix] = v tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) - out = DataFrame({'A': [0, 0, 0]}, - index=date_range('5/7/2014', '5/9/2014')) + out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): - out.loc[six:eix, row['C']] += row['D'] + out.loc[six:eix, row["C"]] += row["D"] tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out['A'], expected['A']) + tm.assert_series_equal(out["A"], expected["A"]) class TestChaining: - def test_setitem_chained_setfault(self): # GH6026 - data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout'] - mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none'] + data = ["right", "left", "left", "left", "right", "left", "timeout"] + mdata = ["right", "left", "left", "left", "right", "left", "none"] - df = DataFrame({'response': np.array(data)}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) + df = DataFrame({"response": np.array(data)}) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) - recarray = np.rec.fromarrays([data], names=['response']) + recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata})) + mask = df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata})) - df = DataFrame({'response': data, 'response1': data}) - mask = df.response == 'timeout' - df.response[mask] = 'none' - tm.assert_frame_equal(df, DataFrame({'response': mdata, - 'response1': data})) + df = DataFrame({"response": data, "response1": data}) + mask = 
df.response == "timeout" + df.response[mask] = "none" + tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) # GH 6056 - expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) - df['A'].iloc[0] = np.nan + expected = DataFrame(dict(A=[np.nan, "bar", "bah", "foo", "bar"])) + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) + df["A"].iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) - df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) + df = DataFrame(dict(A=np.array(["foo", "bar", "bah", "foo", "bar"]))) df.A.iloc[0] = np.nan result = df.head() tm.assert_frame_equal(result, expected) def test_detect_chained_assignment(self): - pd.set_option('chained_assignment', 'raise') + pd.set_option("chained_assignment", "raise") # work with the chain - expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) - df = DataFrame(np.arange(4).reshape(2, 2), - columns=list('AB'), dtype='int64') + expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) + df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64") assert df._is_copy is None - df['A'][0] = -5 - df['A'][1] = -6 + df["A"][0] = -5 + df["A"][1] = -6 tm.assert_frame_equal(df, expected) # test with the chaining - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) assert df._is_copy is None with pytest.raises(com.SettingWithCopyError): - df['A'][0] = -5 + df["A"][0] = -5 with pytest.raises(com.SettingWithCopyError): - df['A'][1] = np.nan + df["A"][1] = np.nan - assert df['A']._is_copy is None + assert df["A"]._is_copy is None # Using a copy (the chain), fails - df = DataFrame({'A': Series(range(2), dtype='int64'), - 'B': np.array(np.arange(2, 4), dtype=np.float64)}) + df = DataFrame( + { + "A": Series(range(2), dtype="int64"), + "B": np.array(np.arange(2, 4), dtype=np.float64), + } + ) with pytest.raises(com.SettingWithCopyError): - df.loc[0]['A'] = -5 + df.loc[0]["A"] = -5 # Doc example - df = DataFrame({'a': ['one', 'one', 'two', 'three', - 'two', 'one', 'six'], - 'c': Series(range(7), dtype='int64')}) + df = DataFrame( + { + "a": ["one", "one", "two", "three", "two", "one", "six"], + "c": Series(range(7), dtype="int64"), + } + ) assert df._is_copy is None with pytest.raises(com.SettingWithCopyError): - indexer = df.a.str.startswith('o') - df[indexer]['c'] = 42 + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 - expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]}) - df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]}) + expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) + df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) with pytest.raises(com.SettingWithCopyError): - df['A'][0] = 111 + df["A"][0] = 111 with pytest.raises(com.SettingWithCopyError): - df.loc[0]['A'] = 111 + df.loc[0]["A"] = 111 - df.loc[0, 'A'] = 111 + df.loc[0, "A"] = 111 tm.assert_frame_equal(df, expected) # gh-5475: Make sure that is_copy is picked up reconstruction df = DataFrame({"A": [1, 2]}) assert df._is_copy is None - with tm.ensure_clean('__tmp__pickle') as path: + with tm.ensure_clean("__tmp__pickle") as path: df.to_pickle(path) df2 = pd.read_pickle(path) df2["B"] = df2["A"] @@ -199,9 +202,9 @@ def random_text(nobs=100): idx = 
np.random.randint(len(letters), size=2) idx.sort() - df.append([letters[idx[0]:idx[1]]]) + df.append([letters[idx[0] : idx[1]]]) - return DataFrame(df, columns=['letters']) + return DataFrame(df, columns=["letters"]) df = random_text(100000) @@ -217,7 +220,7 @@ def random_text(nobs=100): df = df.loc[indexer].copy() assert df._is_copy is None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) # Implicitly take df = random_text(100000) @@ -225,7 +228,7 @@ def random_text(nobs=100): df = df.loc[indexer] assert df._is_copy is not None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) # Implicitly take 2 df = random_text(100000) @@ -233,23 +236,22 @@ def random_text(nobs=100): df = df.loc[indexer] assert df._is_copy is not None - df.loc[:, 'letters'] = df['letters'].apply(str.lower) + df.loc[:, "letters"] = df["letters"].apply(str.lower) # Should be ok even though it's a copy! assert df._is_copy is None - df['letters'] = df['letters'].apply(str.lower) + df["letters"] = df["letters"].apply(str.lower) assert df._is_copy is None df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) - df.loc[indexer, 'letters'] = ( - df.loc[indexer, 'letters'].apply(str.lower)) + df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower) # an identical take, so no copy - df = DataFrame({'a': [1]}).dropna() + df = DataFrame({"a": [1]}).dropna() assert df._is_copy is None - df['a'] += 1 + df["a"] += 1 df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0].sort_values() @@ -258,62 +260,64 @@ def random_text(nobs=100): tm.assert_series_equal(s, df[0].sort_values()) # see gh-6025: false positives - df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) + df = DataFrame({"column1": ["a", "a", "a"], "column2": [4, 8, 9]}) str(df) - df['column1'] = df['column1'] + 'b' + df["column1"] = df["column1"] + "b" str(df) - df = df[df['column2'] != 8] + df = df[df["column2"] != 8] str(df) - df['column1'] = df['column1'] + 'c' + df["column1"] = df["column1"] + "c" str(df) # from SO: # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc - df = DataFrame(np.arange(0, 9), columns=['count']) - df['group'] = 'b' + df = DataFrame(np.arange(0, 9), columns=["count"]) + df["group"] = "b" with pytest.raises(com.SettingWithCopyError): - df.iloc[0:5]['group'] = 'a' + df.iloc[0:5]["group"] = "a" # Mixed type setting but same dtype & changing dtype - df = DataFrame(dict(A=date_range('20130101', periods=5), - B=np.random.randn(5), - C=np.arange(5, dtype='int64'), - D=list('abcde'))) + df = DataFrame( + dict( + A=date_range("20130101", periods=5), + B=np.random.randn(5), + C=np.arange(5, dtype="int64"), + D=list("abcde"), + ) + ) with pytest.raises(com.SettingWithCopyError): - df.loc[2]['D'] = 'foo' + df.loc[2]["D"] = "foo" with pytest.raises(com.SettingWithCopyError): - df.loc[2]['C'] = 'foo' + df.loc[2]["C"] = "foo" with pytest.raises(com.SettingWithCopyError): - df['C'][2] = 'foo' + df["C"][2] = "foo" def test_setting_with_copy_bug(self): # operating on a copy - df = DataFrame({'a': list(range(4)), - 'b': list('ab..'), - 'c': ['a', 'b', np.nan, 'd']}) + df = DataFrame( + {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} + ) mask = pd.isna(df.c) - msg = ("A value is trying to be set on a copy of a slice from a" - " DataFrame") + msg = "A value is trying to be set on a copy of a slice from a" " DataFrame" with 
pytest.raises(com.SettingWithCopyError, match=msg): - df[['c']][mask] = df[['b']][mask] + df[["c"]][mask] = df[["b"]][mask] # invalid warning as we are returning a new object # GH 8730 - df1 = DataFrame({'x': Series(['a', 'b', 'c']), - 'y': Series(['d', 'e', 'f'])}) - df2 = df1[['x']] + df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])}) + df2 = df1[["x"]] # this should not raise - df2['y'] = ['g', 'h', 'i'] + df2["y"] = ["g", "h", "i"] def test_detect_chained_assignment_warnings(self): with option_context("chained_assignment", "warn"): @@ -325,14 +329,16 @@ def test_detect_chained_assignment_warnings(self): def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): # xref gh-13017. with option_context("chained_assignment", "warn"): - df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], - columns=["a", "a", "c"]) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) with tm.assert_produces_warning(com.SettingWithCopyWarning): df.c.loc[df.c > 0] = None - expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], - columns=["a", "a", "c"]) + expected = pd.DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] + ) tm.assert_frame_equal(df, expected) def test_chained_getitem_with_lists(self): @@ -344,15 +350,15 @@ def check(result, expected): tm.assert_numpy_array_equal(result, expected) assert isinstance(result, np.ndarray) - df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]}) - expected = df['A'].iloc[2] - result = df.loc[2, 'A'] + df = DataFrame({"A": 5 * [np.zeros(3)], "B": 5 * [np.ones(3)]}) + expected = df["A"].iloc[2] + result = df.loc[2, "A"] check(result, expected) - result2 = df.iloc[2]['A'] + result2 = df.iloc[2]["A"] check(result2, expected) - result3 = df['A'].loc[2] + result3 = df["A"].loc[2] check(result3, expected) - result4 = df['A'].iloc[2] + result4 = df["A"].iloc[2] check(result4, expected) @pytest.mark.filterwarnings("ignore::FutureWarning") @@ -360,26 +366,32 @@ def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem df = tm.makeDataFrame() - df['A'] # cache series + df["A"] # cache series df.ix["Hello Friend"] = df.ix[0] - assert "Hello Friend" in df['A'].index - assert "Hello Friend" in df['B'].index + assert "Hello Friend" in df["A"].index + assert "Hello Friend" in df["B"].index # 10264 - df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e'], index=range(5)) - df['f'] = 0 + df = DataFrame( + np.zeros((5, 5), dtype="int64"), + columns=["a", "b", "c", "d", "e"], + index=range(5), + ) + df["f"] = 0 df.f.values[3] = 1 # TODO(wesm): unused? 
# y = df.iloc[np.arange(2, len(df))] df.f.values[3] = 2 - expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ - 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) - expected.at[3, 'f'] = 2 + expected = DataFrame( + np.zeros((5, 6), dtype="int64"), + columns=["a", "b", "c", "d", "e", "f"], + index=range(5), + ) + expected.at[3, "f"] = 2 tm.assert_frame_equal(df, expected) - expected = Series([0, 0, 0, 2, 0], name='f') + expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) def test_deprecate_is_copy(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index e9c1b85e7d40c..f46fbcdb504e9 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -13,7 +13,7 @@ ############################################################### -@pytest.fixture(autouse=True, scope='class') +@pytest.fixture(autouse=True, scope="class") def check_comprehensiveness(request): # Iterate over combination of dtype, method and klass # and ensure that each are contained within a collected test @@ -23,12 +23,13 @@ def check_comprehensiveness(request): def has_test(combo): klass, dtype, method = combo cls_funcs = request.node.session.items - return any(klass in x.name and dtype in x.name and - method in x.name for x in cls_funcs) + return any( + klass in x.name and dtype in x.name and method in x.name for x in cls_funcs + ) for combo in combos: if not has_test(combo): - msg = 'test method is not defined: {0}, {1}' + msg = "test method is not defined: {0}, {1}" raise AssertionError(msg.format(cls.__name__, combo)) yield @@ -36,9 +37,18 @@ def has_test(combo): class CoercionBase: - klasses = ['index', 'series'] - dtypes = ['object', 'int64', 'float64', 'complex128', 'bool', - 'datetime64', 'datetime64tz', 'timedelta64', 'period'] + klasses = ["index", "series"] + dtypes = [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "timedelta64", + "period", + ] @property def method(self): @@ -58,10 +68,11 @@ def _assert(self, left, right, dtype): class TestSetitemCoercion(CoercionBase): - method = 'setitem' + method = "setitem" - def _assert_setitem_series_conversion(self, original_series, loc_value, - expected_series, expected_dtype): + def _assert_setitem_series_conversion( + self, original_series, loc_value, expected_series, expected_dtype + ): """ test series value's coercion triggered by assignment """ temp = original_series.copy() temp[1] = loc_value @@ -74,23 +85,21 @@ def _assert_setitem_series_conversion(self, original_series, loc_value, # temp.loc[1] = loc_value # tm.assert_series_equal(temp, expected_series) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_setitem_series_object(self, val, exp_dtype): - obj = pd.Series(list('abcd')) + obj = pd.Series(list("abcd")) assert obj.dtype == np.object - exp = pd.Series(['a', val, 'c', 'd']) + exp = pd.Series(["a", val, "c", "d"]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) def test_setitem_series_int64(self, val, 
exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.dtype == np.int64 @@ -103,9 +112,9 @@ def test_setitem_series_int64(self, val, exp_dtype): exp = pd.Series([1, val, 3, 4]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (np.int32(1), np.int8), - (np.int16(2**9), np.int16)]) + @pytest.mark.parametrize( + "val,exp_dtype", [(np.int32(1), np.int8), (np.int16(2 ** 9), np.int16)] + ) def test_setitem_series_int8(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], dtype=np.int8) assert obj.dtype == np.int8 @@ -118,11 +127,15 @@ def test_setitem_series_int8(self, val, exp_dtype): exp = pd.Series([1, val, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_setitem_series_float64(self, val, exp_dtype): obj = pd.Series([1.1, 2.2, 3.3, 4.4]) assert obj.dtype == np.float64 @@ -130,11 +143,15 @@ def test_setitem_series_float64(self, val, exp_dtype): exp = pd.Series([1.1, val, 3.3, 4.4]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_setitem_series_complex128(self, val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -142,12 +159,16 @@ def test_setitem_series_complex128(self, val, exp_dtype): exp = pd.Series([1 + 1j, val, 3 + 3j, 4 + 4j]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (1, np.int64), - (3, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.bool)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.int64), + (3, np.int64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.bool), + ], + ) def test_setitem_series_bool(self, val, exp_dtype): obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool @@ -168,62 +189,89 @@ def test_setitem_series_bool(self, val, exp_dtype): exp = pd.Series([True, val, True, False]) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (1, np.object), + ("x", np.object), + ], + ) def test_setitem_series_datetime64(self, val, exp_dtype): - obj = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' - - exp = pd.Series([pd.Timestamp('2011-01-01'), - val, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) 
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]'), - (pd.Timestamp('2012-01-01', tz='US/Pacific'), np.object), - (pd.Timestamp('2012-01-01'), np.object), - (1, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Pacific"), np.object), + (pd.Timestamp("2012-01-01"), np.object), + (1, np.object), + ], + ) def test_setitem_series_datetime64tz(self, val, exp_dtype): - tz = 'US/Eastern' - obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - val, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) + tz = "US/Eastern" + obj = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = pd.Series( + [ + pd.Timestamp("2011-01-01", tz=tz), + val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (pd.Timedelta('12 day'), 'timedelta64[ns]'), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", + [(pd.Timedelta("12 day"), "timedelta64[ns]"), (1, np.object), ("x", np.object)], + ) def test_setitem_series_timedelta64(self, val, exp_dtype): - obj = pd.Series([pd.Timedelta('1 day'), - pd.Timedelta('2 day'), - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) - assert obj.dtype == 'timedelta64[ns]' - - exp = pd.Series([pd.Timedelta('1 day'), - val, - pd.Timedelta('3 day'), - pd.Timedelta('4 day')]) + obj = pd.Series( + [ + pd.Timedelta("1 day"), + pd.Timedelta("2 day"), + pd.Timedelta("3 day"), + pd.Timedelta("4 day"), + ] + ) + assert obj.dtype == "timedelta64[ns]" + + exp = pd.Series( + [pd.Timedelta("1 day"), val, pd.Timedelta("3 day"), pd.Timedelta("4 day")] + ) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) - def _assert_setitem_index_conversion(self, original_series, loc_key, - expected_index, expected_dtype): + def _assert_setitem_index_conversion( + self, original_series, loc_key, expected_index, expected_dtype + ): """ test index's coercion triggered by assign key """ temp = original_series.copy() temp[loc_key] = 5 @@ -239,12 +287,11 @@ def _assert_setitem_index_conversion(self, original_series, loc_key, # check dtype explicitly for sure assert temp.index.dtype == expected_dtype - @pytest.mark.parametrize("val,exp_dtype", [ - ('x', np.object), - (5, IndexError), - (1.1, np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", [("x", np.object), (5, IndexError), (1.1, np.object)] + ) def test_setitem_index_object(self, val, exp_dtype): - obj = pd.Series([1, 2, 3, 4], index=list('abcd')) + obj = pd.Series([1, 2, 3, 4], index=list("abcd")) assert obj.index.dtype == np.object if exp_dtype is IndexError: @@ -252,14 +299,12 @@ def test_setitem_index_object(self, val, exp_dtype): with pytest.raises(exp_dtype): temp[5] = 5 else: - exp_index = pd.Index(list('abcd') + [val]) - self._assert_setitem_index_conversion(obj, val, exp_index, - 
exp_dtype) - - @pytest.mark.parametrize("val,exp_dtype", [ - (5, np.int64), - (1.1, np.float64), - ('x', np.object)]) + exp_index = pd.Index(list("abcd") + [val]) + self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) + + @pytest.mark.parametrize( + "val,exp_dtype", [(5, np.int64), (1.1, np.float64), ("x", np.object)] + ) def test_setitem_index_int64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4]) assert obj.index.dtype == np.int64 @@ -267,10 +312,9 @@ def test_setitem_index_int64(self, val, exp_dtype): exp_index = pd.Index([0, 1, 2, 3, val]) self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype) - @pytest.mark.parametrize("val,exp_dtype", [ - (5, IndexError), - (5.1, np.float64), - ('x', np.object)]) + @pytest.mark.parametrize( + "val,exp_dtype", [(5, IndexError), (5.1, np.float64), ("x", np.object)] + ) def test_setitem_index_float64(self, val, exp_dtype): obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) assert obj.index.dtype == np.float64 @@ -309,34 +353,41 @@ def test_setitem_index_period(self): class TestInsertIndexCoercion(CoercionBase): - klasses = ['index'] - method = 'insert' + klasses = ["index"] + method = "insert" - def _assert_insert_conversion(self, original, value, - expected, expected_dtype): + def _assert_insert_conversion(self, original, value, expected, expected_dtype): """ test coercion triggered by insert """ target = original.copy() res = target.insert(1, value) tm.assert_index_equal(res, expected) assert res.dtype == expected_dtype - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1, np.object), - (1.1, 1.1, np.object), - (False, False, np.object), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.object), + (1.1, 1.1, np.object), + (False, False, np.object), + ("x", "x", np.object), + ], + ) def test_insert_index_object(self, insert, coerced_val, coerced_dtype): - obj = pd.Index(list('abcd')) + obj = pd.Index(list("abcd")) assert obj.dtype == np.object - exp = pd.Index(['a', coerced_val, 'b', 'c', 'd']) + exp = pd.Index(["a", coerced_val, "b", "c", "d"]) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1, np.int64), - (1.1, 1.1, np.float64), - (False, 0, np.int64), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1, np.int64), + (1.1, 1.1, np.float64), + (False, 0, np.int64), + ("x", "x", np.object), + ], + ) def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): obj = pd.Int64Index([1, 2, 3, 4]) assert obj.dtype == np.int64 @@ -344,39 +395,49 @@ def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): exp = pd.Index([1, coerced_val, 2, 3, 4]) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (1, 1., np.float64), - (1.1, 1.1, np.float64), - (False, 0., np.float64), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (1, 1.0, np.float64), + (1.1, 1.1, np.float64), + (False, 0.0, np.float64), + ("x", "x", np.object), + ], + ) def test_insert_index_float64(self, insert, coerced_val, coerced_dtype): - obj = pd.Float64Index([1., 2., 3., 4.]) + obj = pd.Float64Index([1.0, 2.0, 3.0, 4.0]) assert obj.dtype == np.float64 - exp = pd.Index([1., coerced_val, 2., 3., 4.]) + exp = pd.Index([1.0, coerced_val, 2.0, 3.0, 4.0]) 
self._assert_insert_conversion(obj, insert, exp, coerced_dtype) - @pytest.mark.parametrize('fill_val,exp_dtype', [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]')], - ids=['datetime64', 'datetime64tz']) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + ], + ids=["datetime64", "datetime64tz"], + ) def test_insert_index_datetimes(self, fill_val, exp_dtype): - obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04'], tz=fill_val.tz) + obj = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], tz=fill_val.tz + ) assert obj.dtype == exp_dtype - exp = pd.DatetimeIndex(['2011-01-01', fill_val.date(), '2011-01-02', - '2011-01-03', '2011-01-04'], tz=fill_val.tz) + exp = pd.DatetimeIndex( + ["2011-01-01", fill_val.date(), "2011-01-02", "2011-01-03", "2011-01-04"], + tz=fill_val.tz, + ) self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) msg = "Passed item and index have different timezone" if fill_val.tz: with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01')) + obj.insert(1, pd.Timestamp("2012-01-01")) with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo')) + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) msg = "cannot insert DatetimeIndex with incompatible label" with pytest.raises(TypeError, match=msg): @@ -385,44 +446,53 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): pytest.xfail("ToDo: must coerce to object") def test_insert_index_timedelta64(self): - obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day']) - assert obj.dtype == 'timedelta64[ns]' + obj = pd.TimedeltaIndex(["1 day", "2 day", "3 day", "4 day"]) + assert obj.dtype == "timedelta64[ns]" # timedelta64 + timedelta64 => timedelta64 - exp = pd.TimedeltaIndex(['1 day', '10 day', '2 day', '3 day', '4 day']) - self._assert_insert_conversion(obj, pd.Timedelta('10 day'), - exp, 'timedelta64[ns]') + exp = pd.TimedeltaIndex(["1 day", "10 day", "2 day", "3 day", "4 day"]) + self._assert_insert_conversion( + obj, pd.Timedelta("10 day"), exp, "timedelta64[ns]" + ) # ToDo: must coerce to object msg = "cannot insert TimedeltaIndex with incompatible label" with pytest.raises(TypeError, match=msg): - obj.insert(1, pd.Timestamp('2012-01-01')) + obj.insert(1, pd.Timestamp("2012-01-01")) # ToDo: must coerce to object msg = "cannot insert TimedeltaIndex with incompatible label" with pytest.raises(TypeError, match=msg): obj.insert(1, 1) - @pytest.mark.parametrize("insert, coerced_val, coerced_dtype", [ - (pd.Period('2012-01', freq='M'), '2012-01', 'period[M]'), - (pd.Timestamp('2012-01-01'), pd.Timestamp('2012-01-01'), np.object), - (1, 1, np.object), - ('x', 'x', np.object)]) + @pytest.mark.parametrize( + "insert, coerced_val, coerced_dtype", + [ + (pd.Period("2012-01", freq="M"), "2012-01", "period[M]"), + (pd.Timestamp("2012-01-01"), pd.Timestamp("2012-01-01"), np.object), + (1, 1, np.object), + ("x", "x", np.object), + ], + ) def test_insert_index_period(self, insert, coerced_val, coerced_dtype): - obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], - freq='M') - assert obj.dtype == 'period[M]' + obj = pd.PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq="M") + assert obj.dtype == "period[M]" if isinstance(insert, pd.Period): index_type = 
pd.PeriodIndex else: index_type = pd.Index - exp = index_type([pd.Period('2011-01', freq='M'), - coerced_val, - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-04', freq='M')], freq='M') + exp = index_type( + [ + pd.Period("2011-01", freq="M"), + coerced_val, + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-04", freq="M"), + ], + freq="M", + ) self._assert_insert_conversion(obj, insert, exp, coerced_dtype) def test_insert_index_complex128(self): @@ -434,24 +504,23 @@ def test_insert_index_bool(self): class TestWhereCoercion(CoercionBase): - method = 'where' + method = "where" - def _assert_where_conversion(self, original, cond, values, - expected, expected_dtype): + def _assert_where_conversion( + self, original, cond, values, expected, expected_dtype + ): """ test coercion triggered by where """ target = original.copy() res = target.where(cond, values) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_where_object(self, klass, fill_val, exp_dtype): - obj = klass(list('abcd')) + obj = klass(list("abcd")) assert obj.dtype == np.object cond = klass([True, False, True, False]) @@ -460,7 +529,7 @@ def test_where_object(self, klass, fill_val, exp_dtype): else: ret_val = fill_val - exp = klass(['a', ret_val, 'c', ret_val]) + exp = klass(["a", ret_val, "c", ret_val]) self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) if fill_val is True: @@ -468,16 +537,14 @@ def test_where_object(self, klass, fill_val, exp_dtype): else: values = klass(fill_val * x for x in [5, 6, 7, 8]) - exp = klass(['a', values[1], 'c', values[3]]) + exp = klass(["a", values[1], "c", values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.int64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.int64), (1.1, np.float64), (1 + 1j, np.complex128), (True, np.object)], + ) def test_where_int64(self, klass, fill_val, exp_dtype): if klass is pd.Index and exp_dtype is np.complex128: pytest.skip("Complex Index not supported") @@ -495,13 +562,16 @@ def test_where_int64(self, klass, fill_val, exp_dtype): exp = klass([1, values[1], 3, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val, exp_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val, exp_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_where_float64(self, klass, fill_val, exp_dtype): if klass is pd.Index and exp_dtype is np.complex128: 
pytest.skip("Complex Index not supported") @@ -519,11 +589,15 @@ def test_where_float64(self, klass, fill_val, exp_dtype): exp = klass([1.1, values[1], 3.3, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_where_series_complex128(self, fill_val, exp_dtype): obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -539,11 +613,10 @@ def test_where_series_complex128(self, fill_val, exp_dtype): exp = pd.Series([1 + 1j, values[1], 3 + 3j, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.bool)]) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.bool)], + ) def test_where_series_bool(self, fill_val, exp_dtype): obj = pd.Series([True, False, True, False]) @@ -560,81 +633,112 @@ def test_where_series_bool(self, fill_val, exp_dtype): exp = pd.Series([True, values[1], True, values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], - ids=['datetime64', 'datetime64tz']) + @pytest.mark.parametrize( + "fill_val,exp_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + ], + ids=["datetime64", "datetime64tz"], + ) def test_where_series_datetime64(self, fill_val, exp_dtype): - obj = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + obj = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Series([True, False, True, False]) - exp = pd.Series([pd.Timestamp('2011-01-01'), fill_val, - pd.Timestamp('2011-01-03'), fill_val]) + exp = pd.Series( + [pd.Timestamp("2011-01-01"), fill_val, pd.Timestamp("2011-01-03"), fill_val] + ) self._assert_where_conversion(obj, cond, fill_val, exp, exp_dtype) values = pd.Series(pd.date_range(fill_val, periods=4)) if fill_val.tz: - exp = pd.Series([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02 00:00', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04 00:00', - tz='US/Eastern')]) + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02 00:00", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04 00:00", tz="US/Eastern"), + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - exp = pd.Series([pd.Timestamp('2011-01-01'), values[1], - pd.Timestamp('2011-01-03'), values[3]]) + exp = pd.Series( + [ + pd.Timestamp("2011-01-01"), + values[1], + pd.Timestamp("2011-01-03"), + values[3], + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) def test_where_index_datetime(self): - fill_val = pd.Timestamp('2012-01-01') - exp_dtype = 'datetime64[ns]' - obj = pd.Index([pd.Timestamp('2011-01-01'), 
- pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + fill_val = pd.Timestamp("2012-01-01") + exp_dtype = "datetime64[ns]" + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, False]) - msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " - "of some kind") + msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) values = pd.Index(pd.date_range(fill_val, periods=4)) - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04')]) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04"), + ] + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.xfail( - reason="GH 22839: do not ignore timezone, must be object") + @pytest.mark.xfail(reason="GH 22839: do not ignore timezone, must be object") def test_where_index_datetimetz(self): - fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern') + fill_val = pd.Timestamp("2012-01-01", tz="US/Eastern") exp_dtype = np.object - obj = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' + obj = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" cond = pd.Index([True, False, True, False]) - msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " - "of some kind") + msg = "Index\\(\\.\\.\\.\\) must be called with a collection " "of some kind" with pytest.raises(TypeError, match=msg): obj.where(cond, fill_val) values = pd.Index(pd.date_range(fill_val, periods=4)) - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2012-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03'), - pd.Timestamp('2012-01-04', tz='US/Eastern')], - dtype=exp_dtype) + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2012-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03"), + pd.Timestamp("2012-01-04", tz="US/Eastern"), + ], + dtype=exp_dtype, + ) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) @@ -667,39 +771,39 @@ class TestFillnaSeriesCoercion(CoercionBase): # not indexing, but place here for consistency - method = 'fillna' + method = "fillna" def test_has_comprehensive_tests(self): pass - def _assert_fillna_conversion(self, original, value, - expected, expected_dtype): + def _assert_fillna_conversion(self, original, value, expected, expected_dtype): """ test coercion triggered by fillna """ target = original.copy() res = target.fillna(value) self._assert(res, expected, expected_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val, fill_dtype", [ - (1, np.object), - (1.1, np.object), - (1 + 1j, np.object), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val, fill_dtype", + [(1, np.object), (1.1, np.object), (1 + 1j, np.object), (True, np.object)], + ) def test_fillna_object(self, klass, 
fill_val, fill_dtype): - obj = klass(['a', np.nan, 'c', 'd']) + obj = klass(["a", np.nan, "c", "d"]) assert obj.dtype == np.object - exp = klass(['a', fill_val, 'c', 'd']) + exp = klass(["a", fill_val, "c", "d"]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (1, np.float64), - (1.1, np.float64), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, np.float64), + (1.1, np.float64), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_fillna_float64(self, klass, fill_val, fill_dtype): obj = klass([1.1, np.nan, 3.3, 4.4]) assert obj.dtype == np.float64 @@ -712,11 +816,15 @@ def test_fillna_float64(self, klass, fill_val, fill_dtype): fill_dtype = np.object self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (1, np.complex128), - (1.1, np.complex128), - (1 + 1j, np.complex128), - (True, np.object)]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (1, np.complex128), + (1.1, np.complex128), + (1 + 1j, np.complex128), + (True, np.object), + ], + ) def test_fillna_series_complex128(self, fill_val, fill_dtype): obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) assert obj.dtype == np.complex128 @@ -724,47 +832,70 @@ def test_fillna_series_complex128(self, fill_val, fill_dtype): exp = pd.Series([1 + 1j, fill_val, 3 + 3j, 4 + 4j]) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) - @pytest.mark.parametrize("klass", [pd.Series, pd.Index], - ids=['series', 'index']) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object), - (1, np.object), ('x', np.object)], - ids=['datetime64', 'datetime64tz', 'object', 'object']) + @pytest.mark.parametrize("klass", [pd.Series, pd.Index], ids=["series", "index"]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), np.object), + (1, np.object), + ("x", np.object), + ], + ids=["datetime64", "datetime64tz", "object", "object"], + ) def test_fillna_datetime(self, klass, fill_val, fill_dtype): - obj = klass([pd.Timestamp('2011-01-01'), - pd.NaT, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) - assert obj.dtype == 'datetime64[ns]' - - exp = klass([pd.Timestamp('2011-01-01'), - fill_val, - pd.Timestamp('2011-01-03'), - pd.Timestamp('2011-01-04')]) + obj = klass( + [ + pd.Timestamp("2011-01-01"), + pd.NaT, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) + assert obj.dtype == "datetime64[ns]" + + exp = klass( + [ + pd.Timestamp("2011-01-01"), + fill_val, + pd.Timestamp("2011-01-03"), + pd.Timestamp("2011-01-04"), + ] + ) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.parametrize("klass", [pd.Series, pd.Index]) - @pytest.mark.parametrize("fill_val,fill_dtype", [ - (pd.Timestamp('2012-01-01', tz='US/Eastern'), - 'datetime64[ns, US/Eastern]'), - (pd.Timestamp('2012-01-01'), np.object), - (pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), np.object), - (1, np.object), - ('x', np.object)]) + @pytest.mark.parametrize( + "fill_val,fill_dtype", + [ + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + 
(pd.Timestamp("2012-01-01"), np.object), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), np.object), + (1, np.object), + ("x", np.object), + ], + ) def test_fillna_datetime64tz(self, klass, fill_val, fill_dtype): - tz = 'US/Eastern' - - obj = klass([pd.Timestamp('2011-01-01', tz=tz), - pd.NaT, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) - assert obj.dtype == 'datetime64[ns, US/Eastern]' - - exp = klass([pd.Timestamp('2011-01-01', tz=tz), - fill_val, - pd.Timestamp('2011-01-03', tz=tz), - pd.Timestamp('2011-01-04', tz=tz)]) + tz = "US/Eastern" + + obj = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.NaT, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) + assert obj.dtype == "datetime64[ns, US/Eastern]" + + exp = klass( + [ + pd.Timestamp("2011-01-01", tz=tz), + fill_val, + pd.Timestamp("2011-01-03", tz=tz), + pd.Timestamp("2011-01-04", tz=tz), + ] + ) self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) def test_fillna_series_int64(self): @@ -794,83 +925,114 @@ def test_fillna_index_period(self): class TestReplaceSeriesCoercion(CoercionBase): - klasses = ['series'] - method = 'replace' + klasses = ["series"] + method = "replace" rep = {} - rep['object'] = ['a', 'b'] - rep['int64'] = [4, 5] - rep['float64'] = [1.1, 2.2] - rep['complex128'] = [1 + 1j, 2 + 2j] - rep['bool'] = [True, False] - rep['datetime64[ns]'] = [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-03')] - - for tz in ['UTC', 'US/Eastern']: + rep["object"] = ["a", "b"] + rep["int64"] = [4, 5] + rep["float64"] = [1.1, 2.2] + rep["complex128"] = [1 + 1j, 2 + 2j] + rep["bool"] = [True, False] + rep["datetime64[ns]"] = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-03")] + + for tz in ["UTC", "US/Eastern"]: # to test tz => different tz replacement - key = 'datetime64[ns, {0}]'.format(tz) - rep[key] = [pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-03', tz=tz)] - - rep['timedelta64[ns]'] = [pd.Timedelta('1 day'), - pd.Timedelta('2 day')] - - @pytest.mark.parametrize('how', ['dict', 'series']) - @pytest.mark.parametrize('to_key', [ - 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', - 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]' - ], ids=['object', 'int64', 'float64', 'complex128', 'bool', - 'datetime64', 'datetime64tz', 'datetime64tz', 'timedelta64']) - @pytest.mark.parametrize('from_key', [ - 'object', 'int64', 'float64', 'complex128', 'bool', 'datetime64[ns]', - 'datetime64[ns, UTC]', 'datetime64[ns, US/Eastern]', 'timedelta64[ns]'] + key = "datetime64[ns, {0}]".format(tz) + rep[key] = [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-03", tz=tz), + ] + + rep["timedelta64[ns]"] = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] + + @pytest.mark.parametrize("how", ["dict", "series"]) + @pytest.mark.parametrize( + "to_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + "datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], + ids=[ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64", + "datetime64tz", + "datetime64tz", + "timedelta64", + ], + ) + @pytest.mark.parametrize( + "from_key", + [ + "object", + "int64", + "float64", + "complex128", + "bool", + "datetime64[ns]", + "datetime64[ns, UTC]", + "datetime64[ns, US/Eastern]", + "timedelta64[ns]", + ], ) def test_replace_series(self, how, to_key, from_key): - if from_key == 'bool' and how == 'series': + if from_key == "bool" and how == 
"series": # doesn't work in PY3, though ...dict_from_bool works fine pytest.skip("doesn't work as in PY3") - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if (from_key.startswith('datetime') and to_key.startswith('datetime')): + if from_key.startswith("datetime") and to_key.startswith("datetime"): # tested below return - elif from_key in ['datetime64[ns, US/Eastern]', 'datetime64[ns, UTC]']: + elif from_key in ["datetime64[ns, US/Eastern]", "datetime64[ns, UTC]"]: # tested below return - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - if ((from_key == 'float64' and to_key in ('int64')) or - (from_key == 'complex128' and - to_key in ('int64', 'float64'))): + if (from_key == "float64" and to_key in ("int64")) or ( + from_key == "complex128" and to_key in ("int64", "float64") + ): if compat.is_platform_32bit() or compat.is_platform_windows(): - pytest.skip("32-bit platform buggy: {0} -> {1}".format - (from_key, to_key)) + pytest.skip( + "32-bit platform buggy: {0} -> {1}".format(from_key, to_key) + ) # Expected: do not downcast by replacement - exp = pd.Series(self.rep[to_key], index=index, - name='yyy', dtype=from_key) + exp = pd.Series(self.rep[to_key], index=index, name="yyy", dtype=from_key) else: - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) # TODO(jbrockmendel) commented out to only have a single xfail printed - @pytest.mark.xfail(reason='GH #18376, tzawareness-compat bug ' - 'in BlockManager.replace_list') + @pytest.mark.xfail( + reason="GH #18376, tzawareness-compat bug " "in BlockManager.replace_list" + ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', ['timedelta64[ns]', 'bool', 'object', # 'complex128', 'float64', 'int64']) @@ -878,31 +1040,32 @@ def test_replace_series(self, how, to_key, from_key): # 'datetime64[ns, US/Eastern]']) # def test_replace_series_datetime_tz(self, how, to_key, from_key): def test_replace_series_datetime_tz(self): - how = 'series' - from_key = 'datetime64[ns, US/Eastern]' - to_key = 'timedelta64[ns]' + how = "series" + from_key = "datetime64[ns, US/Eastern]" + to_key = "timedelta64[ns]" - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) # TODO(jreback) commented out to only have a single xfail printed - @pytest.mark.xfail(reason="different tz, " - "currently mask_missing raises SystemError", - strict=False) + @pytest.mark.xfail( + reason="different tz, " 
"currently mask_missing raises SystemError", + strict=False, + ) # @pytest.mark.parametrize('how', ['dict', 'series']) # @pytest.mark.parametrize('to_key', [ # 'datetime64[ns]', 'datetime64[ns, UTC]', @@ -912,23 +1075,23 @@ def test_replace_series_datetime_tz(self): # 'datetime64[ns, US/Eastern]']) # def test_replace_series_datetime_datetime(self, how, to_key, from_key): def test_replace_series_datetime_datetime(self): - how = 'dict' - to_key = 'datetime64[ns]' - from_key = 'datetime64[ns]' + how = "dict" + to_key = "datetime64[ns]" + from_key = "datetime64[ns]" - index = pd.Index([3, 4], name='xxx') - obj = pd.Series(self.rep[from_key], index=index, name='yyy') + index = pd.Index([3, 4], name="xxx") + obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - if how == 'dict': + if how == "dict": replacer = dict(zip(self.rep[from_key], self.rep[to_key])) - elif how == 'series': + elif how == "series": replacer = pd.Series(self.rep[to_key], index=self.rep[from_key]) else: raise ValueError result = obj.replace(replacer) - exp = pd.Series(self.rep[to_key], index=index, name='yyy') + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key tm.assert_series_equal(result, exp) diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index 278fa6bd44f99..31e9cff68445e 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -9,14 +9,13 @@ class TestDatetimeIndex: - def test_setitem_with_datetime_tz(self): # 16889 # support .loc with alignment and tz-aware DatetimeIndex mask = np.array([True, False, True, False]) - idx = date_range('20010101', periods=4, tz='UTC') - df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64') + idx = date_range("20010101", periods=4, tz="UTC") + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") result = df.copy() result.loc[mask, :] = df.loc[mask, :] @@ -26,8 +25,8 @@ def test_setitem_with_datetime_tz(self): result.loc[mask] = df.loc[mask] tm.assert_frame_equal(result, df) - idx = date_range('20010101', periods=4) - df = DataFrame({'a': np.arange(4)}, index=idx).astype('float64') + idx = date_range("20010101", periods=4) + df = DataFrame({"a": np.arange(4)}, index=idx).astype("float64") result = df.copy() result.loc[mask, :] = df.loc[mask, :] @@ -42,31 +41,37 @@ def test_indexing_with_datetime_tz(self): # GH#8260 # support datetime64 with tz - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') - dr = date_range('20130110', periods=3) - df = DataFrame({'A': idx, 'B': dr}) - df['C'] = idx + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + dr = date_range("20130110", periods=3) + df = DataFrame({"A": idx, "B": dr}) + df["C"] = idx df.iloc[1, 1] = pd.NaT df.iloc[1, 2] = pd.NaT # indexing result = df.iloc[1] - expected = Series([Timestamp('2013-01-02 00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), np.nan, np.nan], + index=list("ABC"), + dtype="object", + name=1, + ) tm.assert_series_equal(result, expected) result = df.loc[1] - expected = Series([Timestamp('2013-01-02 00:00:00-0500', - tz='US/Eastern'), np.nan, np.nan], - index=list('ABC'), dtype='object', name=1) + expected = Series( + [Timestamp("2013-01-02 00:00:00-0500", tz="US/Eastern"), np.nan, np.nan], + index=list("ABC"), + dtype="object", + name=1, + ) 
tm.assert_series_equal(result, expected) # indexing - fast_xs - df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) + df = DataFrame({"a": date_range("2014-01-01", periods=10, tz="UTC")}) result = df.iloc[5] - expected = Series([Timestamp('2014-01-06 00:00:00+0000', tz='UTC')], - index=['a'], name=5) + expected = Series( + [Timestamp("2014-01-06 00:00:00+0000", tz="UTC")], index=["a"], name=5 + ) tm.assert_series_equal(result, expected) result = df.loc[5] @@ -78,34 +83,33 @@ def test_indexing_with_datetime_tz(self): tm.assert_frame_equal(result, expected) # indexing - setting an element - df = DataFrame(data=pd.to_datetime( - ['2015-03-30 20:12:32', '2015-03-12 00:11:11']), columns=['time']) - df['new_col'] = ['new', 'old'] - df.time = df.set_index('time').index.tz_localize('UTC') - v = df[df.new_col == 'new'].set_index('time').index.tz_convert( - 'US/Pacific') + df = DataFrame( + data=pd.to_datetime(["2015-03-30 20:12:32", "2015-03-12 00:11:11"]), + columns=["time"], + ) + df["new_col"] = ["new", "old"] + df.time = df.set_index("time").index.tz_localize("UTC") + v = df[df.new_col == "new"].set_index("time").index.tz_convert("US/Pacific") # trying to set a single element on a part of a different timezone # this converts to object df2 = df.copy() - df2.loc[df2.new_col == 'new', 'time'] = v + df2.loc[df2.new_col == "new", "time"] = v - expected = Series([v[0], df.loc[1, 'time']], name='time') + expected = Series([v[0], df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') - df.loc[df.new_col == 'new', 'time'] = v - tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) + v = df.loc[df.new_col == "new", "time"] + pd.Timedelta("1s") + df.loc[df.new_col == "new", "time"] = v + tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) def test_consistency_with_tz_aware_scalar(self): # xef gh-12938 # various ways of indexing the same tz-aware scalar - df = Series([Timestamp('2016-03-30 14:35:25', - tz='Europe/Brussels')]).to_frame() + df = Series([Timestamp("2016-03-30 14:35:25", tz="Europe/Brussels")]).to_frame() df = pd.concat([df, df]).reset_index(drop=True) - expected = Timestamp('2016-03-30 14:35:25+0200', - tz='Europe/Brussels') + expected = Timestamp("2016-03-30 14:35:25+0200", tz="Europe/Brussels") result = df[0][0] assert result == expected @@ -132,9 +136,9 @@ def test_indexing_with_datetimeindex_tz(self): # GH 12050 # indexing on a series with a datetimeindex with tz - index = date_range('2015-01-01', periods=2, tz='utc') + index = date_range("2015-01-01", periods=2, tz="utc") - ser = Series(range(2), index=index, dtype='int64') + ser = Series(range(2), index=index, dtype="int64") # list-like indexing @@ -181,108 +185,128 @@ def test_partial_setting_with_datetimelike_dtype(self): # GH9478 # a datetimeindex alignment issue with partial setting - df = DataFrame(np.arange(6.).reshape(3, 2), columns=list('AB'), - index=date_range('1/1/2000', periods=3, freq='1H')) + df = DataFrame( + np.arange(6.0).reshape(3, 2), + columns=list("AB"), + index=date_range("1/1/2000", periods=3, freq="1H"), + ) expected = df.copy() - expected['C'] = [expected.index[0]] + [pd.NaT, pd.NaT] + expected["C"] = [expected.index[0]] + [pd.NaT, pd.NaT] mask = df.A < 1 - df.loc[mask, 'C'] = df.loc[mask].index + df.loc[mask, "C"] = df.loc[mask].index tm.assert_frame_equal(df, expected) def test_loc_setitem_datetime(self): # GH 9516 - dt1 = Timestamp('20130101 09:00:00') - dt2 = Timestamp('20130101 
10:00:00') + dt1 = Timestamp("20130101 09:00:00") + dt2 = Timestamp("20130101 10:00:00") - for conv in [lambda x: x, lambda x: x.to_datetime64(), - lambda x: x.to_pydatetime(), lambda x: np.datetime64(x)]: + for conv in [ + lambda x: x, + lambda x: x.to_datetime64(), + lambda x: x.to_pydatetime(), + lambda x: np.datetime64(x), + ]: df = DataFrame() - df.loc[conv(dt1), 'one'] = 100 - df.loc[conv(dt2), 'one'] = 200 + df.loc[conv(dt1), "one"] = 100 + df.loc[conv(dt2), "one"] = 200 - expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) + expected = DataFrame({"one": [100.0, 200.0]}, index=[dt1, dt2]) tm.assert_frame_equal(df, expected) def test_series_partial_set_datetime(self): # GH 11497 - idx = date_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = date_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") - result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] - exp = Series([0.1, 0.2], index=idx, name='s') + result = ser.loc[[Timestamp("2011-01-01"), Timestamp("2011-01-02")]] + exp = Series([0.1, 0.2], index=idx, name="s") tm.assert_series_equal(result, exp, check_index_type=True) - keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), - Timestamp('2011-01-01')] - exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), - name='s') + keys = [ + Timestamp("2011-01-02"), + Timestamp("2011-01-02"), + Timestamp("2011-01-01"), + ] + exp = Series( + [0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name="idx"), name="s" + ) tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), - Timestamp('2011-01-03')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.DatetimeIndex(keys, name='idx'), name='s') - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + keys = [ + Timestamp("2011-01-03"), + Timestamp("2011-01-02"), + Timestamp("2011-01-03"), + ] + exp = Series( + [np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name="idx"), name="s" + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) def test_series_partial_set_period(self): # GH 11497 - idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = pd.period_range("2011-01-01", "2011-01-02", freq="D", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") - result = ser.loc[[pd.Period('2011-01-01', freq='D'), - pd.Period('2011-01-02', freq='D')]] - exp = Series([0.1, 0.2], index=idx, name='s') + result = ser.loc[ + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-01-02", freq="D")] + ] + exp = Series([0.1, 0.2], index=idx, name="s") tm.assert_series_equal(result, exp, check_index_type=True) - keys = [pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-01', freq='D')] - exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), - name='s') + keys = [ + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-01", freq="D"), + ] + exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name="idx"), name="s") tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) - keys = [pd.Period('2011-01-03', freq='D'), - pd.Period('2011-01-02', freq='D'), - pd.Period('2011-01-03', freq='D')] - exp = Series([np.nan, 0.2, np.nan], - index=pd.PeriodIndex(keys, 
name='idx'), name='s') - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + keys = [ + pd.Period("2011-01-03", freq="D"), + pd.Period("2011-01-02", freq="D"), + pd.Period("2011-01-03", freq="D"), + ] + exp = Series( + [np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name="idx"), name="s" + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[keys] tm.assert_series_equal(result, exp) def test_nanosecond_getitem_setitem_with_tz(self): # GH 11679 - data = ['2016-06-28 08:30:00.123456789'] - index = pd.DatetimeIndex(data, dtype='datetime64[ns, America/Chicago]') - df = DataFrame({'a': [10]}, index=index) + data = ["2016-06-28 08:30:00.123456789"] + index = pd.DatetimeIndex(data, dtype="datetime64[ns, America/Chicago]") + df = DataFrame({"a": [10]}, index=index) result = df.loc[df.index[0]] - expected = Series(10, index=['a'], name=df.index[0]) + expected = Series(10, index=["a"], name=df.index[0]) tm.assert_series_equal(result, expected) result = df.copy() - result.loc[df.index[0], 'a'] = -1 - expected = DataFrame(-1, index=index, columns=['a']) + result.loc[df.index[0], "a"] = -1 + expected = DataFrame(-1, index=index, columns=["a"]) tm.assert_frame_equal(result, expected) def test_loc_getitem_across_dst(self): # GH 21846 - idx = pd.date_range('2017-10-29 01:30:00', - tz='Europe/Berlin', periods=5, freq='30 min') - series2 = pd.Series([0, 1, 2, 3, 4], - index=idx) - - t_1 = pd.Timestamp('2017-10-29 02:30:00+02:00', tz='Europe/Berlin', - freq='30min') - t_2 = pd.Timestamp('2017-10-29 02:00:00+01:00', tz='Europe/Berlin', - freq='30min') + idx = pd.date_range( + "2017-10-29 01:30:00", tz="Europe/Berlin", periods=5, freq="30 min" + ) + series2 = pd.Series([0, 1, 2, 3, 4], index=idx) + + t_1 = pd.Timestamp( + "2017-10-29 02:30:00+02:00", tz="Europe/Berlin", freq="30min" + ) + t_2 = pd.Timestamp( + "2017-10-29 02:00:00+01:00", tz="Europe/Berlin", freq="30min" + ) result = series2.loc[t_1:t_2] expected = pd.Series([2, 3], index=idx[2:4]) tm.assert_series_equal(result, expected) @@ -303,20 +327,22 @@ def test_loc_incremental_setitem_with_dst(self): def test_loc_setitem_with_existing_dst(self): # GH 18308 - start = pd.Timestamp('2017-10-29 00:00:00+0200', tz='Europe/Madrid') - end = pd.Timestamp('2017-10-29 03:00:00+0100', tz='Europe/Madrid') - ts = pd.Timestamp('2016-10-10 03:00:00', tz='Europe/Madrid') - idx = pd.date_range(start, end, closed='left', freq="H") - result = pd.DataFrame(index=idx, columns=['value']) - result.loc[ts, 'value'] = 12 - expected = pd.DataFrame([np.nan] * len(idx) + [12], - index=idx.append(pd.DatetimeIndex([ts])), - columns=['value'], - dtype=object) + start = pd.Timestamp("2017-10-29 00:00:00+0200", tz="Europe/Madrid") + end = pd.Timestamp("2017-10-29 03:00:00+0100", tz="Europe/Madrid") + ts = pd.Timestamp("2016-10-10 03:00:00", tz="Europe/Madrid") + idx = pd.date_range(start, end, closed="left", freq="H") + result = pd.DataFrame(index=idx, columns=["value"]) + result.loc[ts, "value"] = 12 + expected = pd.DataFrame( + [np.nan] * len(idx) + [12], + index=idx.append(pd.DatetimeIndex([ts])), + columns=["value"], + dtype=object, + ) tm.assert_frame_equal(result, expected) def test_loc_str_slicing(self): - ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") ser = ix.to_series() result = ser.loc[:"2017-12"] expected = ser.iloc[:-1] @@ -324,9 +350,9 @@ def test_loc_str_slicing(self): tm.assert_series_equal(result, expected) def 
test_loc_label_slicing(self): - ix = pd.period_range(start='2017-01-01', end='2018-01-01', freq='M') + ix = pd.period_range(start="2017-01-01", end="2018-01-01", freq="M") ser = ix.to_series() - result = ser.loc[:ix[-2]] + result = ser.loc[: ix[-2]] expected = ser.iloc[:-1] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index ada613110d9bf..78ff6580bb1e1 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -3,8 +3,7 @@ import numpy as np import pytest -from pandas import ( - DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series) +from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal, assert_series_equal @@ -12,7 +11,6 @@ class TestFloatIndexers: - def check(self, result, original, indexer, getitem): """ comparator for results @@ -38,23 +36,29 @@ def test_scalar_error(self): # but is specifically testing for the error # message - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex, tm.makeIntIndex, - tm.makeRangeIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + tm.makeIntIndex, + tm.makeRangeIndex, + ]: i = index(5) s = Series(np.arange(len(i)), index=i) - msg = 'Cannot index by location index' + msg = "Cannot index by location index" with pytest.raises(TypeError, match=msg): s.iloc[3.0] - msg = ("cannot do positional indexing on {klass} with these " - r"indexers \[3\.0\] of {kind}".format( - klass=type(i), kind=str(float))) + msg = ( + "cannot do positional indexing on {klass} with these " + r"indexers \[3\.0\] of {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 @@ -65,22 +69,28 @@ def test_scalar_non_numeric(self): # float_indexers should raise exceptions # on appropriate Index types & accessors - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeCategoricalIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeCategoricalIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: i = index(5) - for s in [Series( - np.arange(len(i)), index=i), DataFrame( - np.random.randn( - len(i), len(i)), index=i, columns=i)]: + for s in [ + Series(np.arange(len(i)), index=i), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: # getting - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.iloc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.iloc, False), + (lambda x: x, True), + ]: # gettitem on a DataFrame is a KeyError as it is indexing # via labels on the columns @@ -89,26 +99,28 @@ def test_scalar_non_numeric(self): msg = r"^3(\.0)?$" else: error = TypeError - msg = (r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}|" - "Cannot index by location index with a" - " non-integer key" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index|positional) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}|" + "Cannot index by location index with a" + " non-integer key".format(klass=type(i), 
kind=str(float)) + ) with catch_warnings(record=True): with pytest.raises(error, match=msg): idxr(s)[3.0] # label based can be a TypeError or KeyError - if s.index.inferred_type in ['string', 'unicode', 'mixed']: + if s.index.inferred_type in ["string", "unicode", "mixed"]: error = KeyError msg = r"^3$" else: error = TypeError - msg = (r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(error, match=msg): s.loc[3.0] @@ -116,19 +128,19 @@ def test_scalar_non_numeric(self): assert 3.0 not in s # setting with a float fails with iloc - msg = (r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index|positional) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 # setting with an indexer - if s.index.inferred_type in ['categorical']: + if s.index.inferred_type in ["categorical"]: # Value or Type Error pass - elif s.index.inferred_type in ['datetime64', 'timedelta64', - 'period']: + elif s.index.inferred_type in ["datetime64", "timedelta64", "period"]: # these should prob work # and are inconsisten between series/dataframe ATM @@ -146,8 +158,7 @@ def test_scalar_non_numeric(self): s2.loc[3.0] = 10 assert s2.index.is_object() - for idxr in [lambda x: x.ix, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x]: s2 = s.copy() with catch_warnings(record=True): idxr(s2)[3.0] = 0 @@ -156,30 +167,32 @@ def test_scalar_non_numeric(self): # fallsback to position selection, series only s = Series(np.arange(len(i)), index=i) s[3] - msg = (r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=type(i), kind=str(float))) + msg = ( + r"cannot do (label|index) indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=type(i), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[3.0] @ignore_ix def test_scalar_with_mixed(self): - s2 = Series([1, 2, 3], index=['a', 'b', 'c']) - s3 = Series([1, 2, 3], index=['a', 'b', 1.5]) + s2 = Series([1, 2, 3], index=["a", "b", "c"]) + s3 = Series([1, 2, 3], index=["a", "b", 1.5]) # lookup in a pure stringstr # with an invalid indexer - for idxr in [lambda x: x.ix, - lambda x: x, - lambda x: x.iloc]: - - msg = (r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}|" - "Cannot index by location index with a non-integer key" - .format(klass=str(Index), kind=str(float))) + for idxr in [lambda x: x.ix, lambda x: x, lambda x: x.iloc]: + + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}|" + "Cannot index by location index with a non-integer key".format( + klass=str(Index), kind=str(float) + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s2)[1.0] @@ -187,7 +200,7 @@ def test_scalar_with_mixed(self): with pytest.raises(KeyError, match=r"^1$"): s2.loc[1.0] - result = s2.loc['b'] + result = s2.loc["b"] expected = 2 assert result == expected @@ -195,10 +208,11 @@ def test_scalar_with_mixed(self): # indexing for idxr in [lambda x: x]: - msg = (r"cannot do 
label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}" - .format(klass=str(Index), kind=str(float))) + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}".format(klass=str(Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -211,10 +225,11 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x.ix]: with catch_warnings(record=True): - msg = (r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}" - .format(klass=str(Index), kind=str(float))) + msg = ( + r"cannot do label indexing" + r" on {klass} with these indexers \[1\.0\] of" + r" {kind}".format(klass=str(Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -240,36 +255,41 @@ def test_scalar_integer(self): # integer index for i in [Int64Index(range(5)), RangeIndex(5)]: - for s in [Series(np.arange(len(i))), - DataFrame(np.random.randn(len(i), len(i)), - index=i, columns=i)]: + for s in [ + Series(np.arange(len(i))), + DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ]: # coerce to equal int - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: with catch_warnings(record=True): result = idxr(s)[3.0] self.check(result, s, 3, getitem) # coerce to equal int - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: if isinstance(s, Series): + def compare(x, y): assert x == y + expected = 100 else: compare = tm.assert_series_equal if getitem: - expected = Series(100, - index=range(len(s)), name=3) + expected = Series(100, index=range(len(s)), name=3) else: - expected = Series(100., - index=range(len(s)), name=3) + expected = Series(100.0, index=range(len(s)), name=3) s2 = s.copy() with catch_warnings(record=True): @@ -289,16 +309,21 @@ def compare(x, y): def test_scalar_float(self): # scalar float indexers work on a float index - index = Index(np.arange(5.)) - for s in [Series(np.arange(len(index)), index=index), - DataFrame(np.random.randn(len(index), len(index)), - index=index, columns=index)]: + index = Index(np.arange(5.0)) + for s in [ + Series(np.arange(len(index)), index=index), + DataFrame( + np.random.randn(len(index), len(index)), index=index, columns=index + ), + ]: # assert all operations except for iloc are ok indexer = index[3] - for idxr, getitem in [(lambda x: x.ix, False), - (lambda x: x.loc, False), - (lambda x: x, True)]: + for idxr, getitem in [ + (lambda x: x.ix, False), + (lambda x: x.loc, False), + (lambda x: x, True), + ]: # getting result = idxr(s)[indexer] @@ -332,10 +357,11 @@ def test_scalar_float(self): with pytest.raises(TypeError, match=msg): s.iloc[3.0] - msg = (r"cannot do positional indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}" - .format(klass=str(Float64Index), kind=str(float))) + msg = ( + r"cannot do positional indexing" + r" on {klass} with these indexers \[3\.0\] of" + r" {kind}".format(klass=str(Float64Index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s2.iloc[3.0] = 0 @@ -346,65 +372,79 @@ def test_slice_non_numeric(self): # float_indexers should raise exceptions # on appropriate Index types & accessors - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - 
tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ]: index = index(5) - for s in [Series(range(5), index=index), - DataFrame(np.random.randn(5, 2), index=index)]: + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: # getitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[l] - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x.iloc, - lambda x: x]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})" - .format(klass=type(index), - kind_float=str(float), - kind_int=str(int))) + for idxr in [ + lambda x: x.ix, + lambda x: x.loc, + lambda x: x.iloc, + lambda x: x, + ]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers" + r" \[(3|4)(\.0)?\]" + r" of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s)[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: - - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: + + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s.iloc[l] = 0 - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x.iloc, - lambda x: x]: - msg = ("cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})" - .format(klass=type(index), - kind_float=str(float), - kind_int=str(int))) + for idxr in [ + lambda x: x.ix, + lambda x: x.loc, + lambda x: x.iloc, + lambda x: x, + ]: + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers" + r" \[(3|4)(\.0)?\]" + r" of ({kind_float}|{kind_int})".format( + klass=type(index), + kind_float=str(float), + kind_int=str(int), + ) + ) with catch_warnings(record=True): with pytest.raises(TypeError, match=msg): idxr(s)[l] = 0 @@ -416,20 +456,19 @@ def test_slice_integer(self): # these coerce to a like integer # oob indicates if we are out of bounds # of positional indexing - for index, oob in [(Int64Index(range(5)), False), - (RangeIndex(5), False), - (Int64Index(range(5)) + 10, True)]: + for index, oob in [ + (Int64Index(range(5)), False), + (RangeIndex(5), False), + (Int64Index(range(5)) + 10, True), + ]: # s is an in-range index s = Series(range(5), index=index) # getitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -444,19 +483,18 @@ def 
test_slice_integer(self): self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # getitem out-of-bounds - for l in [slice(-6, 6), - slice(-6.0, 6.0)]: + for l in [slice(-6, 6), slice(-6.0, 6.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -470,20 +508,22 @@ def test_slice_integer(self): self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[-6\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[-6\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[slice(-6.0, 6.0)] # getitem odd floats - for l, res1 in [(slice(2.5, 4), slice(3, 5)), - (slice(2, 3.5), slice(2, 4)), - (slice(2.5, 3.5), slice(3, 4))]: + for l, res1 in [ + (slice(2.5, 4), slice(3, 5)), + (slice(2, 3.5), slice(2, 4)), + (slice(2.5, 3.5), slice(3, 4)), + ]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: with catch_warnings(record=True): result = idxr(s)[l] @@ -495,20 +535,18 @@ def test_slice_integer(self): self.check(result, s, res, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(2|3)\.5\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(2|3)\.5\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: - for idxr in [lambda x: x.loc, - lambda x: x.ix]: + for idxr in [lambda x: x.loc, lambda x: x.ix]: sc = s.copy() with catch_warnings(record=True): idxr(sc)[l] = 0 @@ -516,10 +554,11 @@ def test_slice_integer(self): assert (result == 0).all() # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -533,18 +572,16 @@ def test_integer_positional_indexing(self): expected = s.iloc[2:4] assert_series_equal(result, expected) - for idxr in [lambda x: x, - lambda x: x.iloc]: + for idxr in [lambda x: x, lambda x: x.iloc]: - for l in [slice(2, 4.0), - slice(2.0, 4), - slice(2.0, 4.0)]: + for l in [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)]: klass = RangeIndex - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(2|4)\.0\] of" - " {kind}" - .format(klass=str(klass), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(2|4)\.0\] of" + " {kind}".format(klass=str(klass), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): idxr(s)[l] @@ -559,57 +596,57 @@ def 
test_slice_integer_frame_getitem(self): def f(idxr): # getitem - for l in [slice(0.0, 1), - slice(0, 1.0), - slice(0.0, 1.0)]: + for l in [slice(0.0, 1), slice(0, 1.0), slice(0.0, 1.0)]: result = idxr(s)[l] indexer = slice(0, 2) self.check(result, s, indexer, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(0|1)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # getitem out-of-bounds - for l in [slice(-10, 10), - slice(-10.0, 10.0)]: + for l in [slice(-10, 10), slice(-10.0, 10.0)]: result = idxr(s)[l] self.check(result, s, slice(-10, 10), True) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[-10\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[-10\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[slice(-10.0, 10.0)] # getitem odd floats - for l, res in [(slice(0.5, 1), slice(1, 2)), - (slice(0, 0.5), slice(0, 1)), - (slice(0.5, 1.5), slice(1, 2))]: + for l, res in [ + (slice(0.5, 1), slice(1, 2)), + (slice(0, 0.5), slice(0, 1)), + (slice(0.5, 1.5), slice(1, 2)), + ]: result = idxr(s)[l] self.check(result, s, res, False) # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[0\.5\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[0\.5\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] # setitem - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: sc = s.copy() idxr(sc)[l] = 0 @@ -617,10 +654,11 @@ def f(idxr): assert (result == 0).all() # positional indexing - msg = ("cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}" - .format(klass=type(index), kind=str(float))) + msg = ( + "cannot do slice indexing" + r" on {klass} with these indexers \[(3|4)\.0\] of" + " {kind}".format(klass=type(index), kind=str(float)) + ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -632,18 +670,16 @@ def f(idxr): def test_slice_float(self): # same as above, but for floats - index = Index(np.arange(5.)) + 0.1 - for s in [Series(range(5), index=index), - DataFrame(np.random.randn(5, 2), index=index)]: + index = Index(np.arange(5.0)) + 0.1 + for s in [ + Series(range(5), index=index), + DataFrame(np.random.randn(5, 2), index=index), + ]: - for l in [slice(3.0, 4), - slice(3, 4.0), - slice(3.0, 4.0)]: + for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: expected = s.iloc[3:4] - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: # getitem with catch_warnings(record=True): @@ -714,7 +750,7 @@ def test_floating_misc(self): assert_series_equal(s.loc[fancy_idx], expected) assert_series_equal(s.loc[fancy_idx], expected) - expected = Series([2, 0], index=Index([5, 0], dtype='int64')) + expected = Series([2, 0], index=Index([5, 0], dtype="int64")) for fancy_idx in [[5, 0], np.array([5, 0])]: # int assert_series_equal(s[fancy_idx], expected) 
assert_series_equal(s.loc[fancy_idx], expected) @@ -771,8 +807,7 @@ def test_floating_misc(self): result3 = s.loc[[1.6, 5, 10]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) - assert_series_equal(result1, Series( - [np.nan, 2, 4], index=[1.6, 5, 10])) + assert_series_equal(result1, Series([np.nan, 2, 4], index=[1.6, 5, 10])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result1 = s[[0, 1, 2]] @@ -782,8 +817,7 @@ def test_floating_misc(self): result3 = s.loc[[0, 1, 2]] assert_series_equal(result1, result2) assert_series_equal(result1, result3) - assert_series_equal(result1, Series( - [0.0, np.nan, np.nan], index=[0, 1, 2])) + assert_series_equal(result1, Series([0.0, np.nan, np.nan], index=[0, 1, 2])) result1 = s.loc[[2.5, 5]] result2 = s.loc[[2.5, 5]] @@ -799,168 +833,170 @@ def test_floating_misc(self): def test_floating_tuples(self): # see gh-13509 - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name='foo') + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name="foo") result = s[0.0] assert result == (1, 1) - expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo') - s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name='foo') + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name="foo") + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name="foo") result = s[0.0] tm.assert_series_equal(result, expected) def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index - ser = {256: 2321.0, - 1: 78.0, - 2: 2716.0, - 3: 0.0, - 4: 369.0, - 5: 0.0, - 6: 269.0, - 7: 0.0, - 8: 0.0, - 9: 0.0, - 10: 3536.0, - 11: 0.0, - 12: 24.0, - 13: 0.0, - 14: 931.0, - 15: 0.0, - 16: 101.0, - 17: 78.0, - 18: 9643.0, - 19: 0.0, - 20: 0.0, - 21: 0.0, - 22: 63761.0, - 23: 0.0, - 24: 446.0, - 25: 0.0, - 26: 34773.0, - 27: 0.0, - 28: 729.0, - 29: 78.0, - 30: 0.0, - 31: 0.0, - 32: 3374.0, - 33: 0.0, - 34: 1391.0, - 35: 0.0, - 36: 361.0, - 37: 0.0, - 38: 61808.0, - 39: 0.0, - 40: 0.0, - 41: 0.0, - 42: 6677.0, - 43: 0.0, - 44: 802.0, - 45: 0.0, - 46: 2691.0, - 47: 0.0, - 48: 3582.0, - 49: 0.0, - 50: 734.0, - 51: 0.0, - 52: 627.0, - 53: 70.0, - 54: 2584.0, - 55: 0.0, - 56: 324.0, - 57: 0.0, - 58: 605.0, - 59: 0.0, - 60: 0.0, - 61: 0.0, - 62: 3989.0, - 63: 10.0, - 64: 42.0, - 65: 0.0, - 66: 904.0, - 67: 0.0, - 68: 88.0, - 69: 70.0, - 70: 8172.0, - 71: 0.0, - 72: 0.0, - 73: 0.0, - 74: 64902.0, - 75: 0.0, - 76: 347.0, - 77: 0.0, - 78: 36605.0, - 79: 0.0, - 80: 379.0, - 81: 70.0, - 82: 0.0, - 83: 0.0, - 84: 3001.0, - 85: 0.0, - 86: 1630.0, - 87: 7.0, - 88: 364.0, - 89: 0.0, - 90: 67404.0, - 91: 9.0, - 92: 0.0, - 93: 0.0, - 94: 7685.0, - 95: 0.0, - 96: 1017.0, - 97: 0.0, - 98: 2831.0, - 99: 0.0, - 100: 2963.0, - 101: 0.0, - 102: 854.0, - 103: 0.0, - 104: 0.0, - 105: 0.0, - 106: 0.0, - 107: 0.0, - 108: 0.0, - 109: 0.0, - 110: 0.0, - 111: 0.0, - 112: 0.0, - 113: 0.0, - 114: 0.0, - 115: 0.0, - 116: 0.0, - 117: 0.0, - 118: 0.0, - 119: 0.0, - 120: 0.0, - 121: 0.0, - 122: 0.0, - 123: 0.0, - 124: 0.0, - 125: 0.0, - 126: 67744.0, - 127: 22.0, - 128: 264.0, - 129: 0.0, - 260: 197.0, - 268: 0.0, - 265: 0.0, - 269: 0.0, - 261: 0.0, - 266: 1198.0, - 267: 0.0, - 262: 2629.0, - 258: 775.0, - 257: 0.0, - 263: 0.0, - 259: 0.0, - 264: 163.0, - 250: 10326.0, - 251: 0.0, - 252: 1228.0, - 253: 0.0, - 254: 2769.0, - 255: 0.0} + ser = { + 256: 2321.0, + 1: 78.0, + 2: 2716.0, + 3: 0.0, + 4: 369.0, + 5: 0.0, + 6: 269.0, + 7: 0.0, + 8: 0.0, + 9: 0.0, + 10: 3536.0, + 11: 0.0, + 12: 24.0, + 13: 0.0, + 14: 931.0, 
+ 15: 0.0, + 16: 101.0, + 17: 78.0, + 18: 9643.0, + 19: 0.0, + 20: 0.0, + 21: 0.0, + 22: 63761.0, + 23: 0.0, + 24: 446.0, + 25: 0.0, + 26: 34773.0, + 27: 0.0, + 28: 729.0, + 29: 78.0, + 30: 0.0, + 31: 0.0, + 32: 3374.0, + 33: 0.0, + 34: 1391.0, + 35: 0.0, + 36: 361.0, + 37: 0.0, + 38: 61808.0, + 39: 0.0, + 40: 0.0, + 41: 0.0, + 42: 6677.0, + 43: 0.0, + 44: 802.0, + 45: 0.0, + 46: 2691.0, + 47: 0.0, + 48: 3582.0, + 49: 0.0, + 50: 734.0, + 51: 0.0, + 52: 627.0, + 53: 70.0, + 54: 2584.0, + 55: 0.0, + 56: 324.0, + 57: 0.0, + 58: 605.0, + 59: 0.0, + 60: 0.0, + 61: 0.0, + 62: 3989.0, + 63: 10.0, + 64: 42.0, + 65: 0.0, + 66: 904.0, + 67: 0.0, + 68: 88.0, + 69: 70.0, + 70: 8172.0, + 71: 0.0, + 72: 0.0, + 73: 0.0, + 74: 64902.0, + 75: 0.0, + 76: 347.0, + 77: 0.0, + 78: 36605.0, + 79: 0.0, + 80: 379.0, + 81: 70.0, + 82: 0.0, + 83: 0.0, + 84: 3001.0, + 85: 0.0, + 86: 1630.0, + 87: 7.0, + 88: 364.0, + 89: 0.0, + 90: 67404.0, + 91: 9.0, + 92: 0.0, + 93: 0.0, + 94: 7685.0, + 95: 0.0, + 96: 1017.0, + 97: 0.0, + 98: 2831.0, + 99: 0.0, + 100: 2963.0, + 101: 0.0, + 102: 854.0, + 103: 0.0, + 104: 0.0, + 105: 0.0, + 106: 0.0, + 107: 0.0, + 108: 0.0, + 109: 0.0, + 110: 0.0, + 111: 0.0, + 112: 0.0, + 113: 0.0, + 114: 0.0, + 115: 0.0, + 116: 0.0, + 117: 0.0, + 118: 0.0, + 119: 0.0, + 120: 0.0, + 121: 0.0, + 122: 0.0, + 123: 0.0, + 124: 0.0, + 125: 0.0, + 126: 67744.0, + 127: 22.0, + 128: 264.0, + 129: 0.0, + 260: 197.0, + 268: 0.0, + 265: 0.0, + 269: 0.0, + 261: 0.0, + 266: 1198.0, + 267: 0.0, + 262: 2629.0, + 258: 775.0, + 257: 0.0, + 263: 0.0, + 259: 0.0, + 264: 163.0, + 250: 10326.0, + 251: 0.0, + 252: 1228.0, + 253: 0.0, + 254: 2769.0, + 255: 0.0, + } # smoke test for the repr s = Series(ser) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 8b54907131b8c..760d8c70b9434 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -13,15 +13,14 @@ class TestiLoc(Base): - def test_iloc_exceeds_bounds(self): # GH6296 # iloc should allow indexers that exceed the bounds - df = DataFrame(np.random.random_sample((20, 5)), columns=list('ABCDE')) + df = DataFrame(np.random.random_sample((20, 5)), columns=list("ABCDE")) # lists of positions should raise IndexError! 
- msg = 'positional indexers are out-of-bounds' + msg = "positional indexers are out-of-bounds" with pytest.raises(IndexError, match=msg): df.iloc[:, [0, 1, 2, 3, 4, 5]] with pytest.raises(IndexError, match=msg): @@ -31,14 +30,14 @@ def test_iloc_exceeds_bounds(self): with pytest.raises(IndexError, match=msg): df.iloc[[100]] - s = df['A'] + s = df["A"] with pytest.raises(IndexError, match=msg): s.iloc[[100]] with pytest.raises(IndexError, match=msg): s.iloc[[-100]] # still raise on a single indexer - msg = 'single positional indexer is out-of-bounds' + msg = "single positional indexer is out-of-bounds" with pytest.raises(IndexError, match=msg): df.iloc[30] with pytest.raises(IndexError, match=msg): @@ -104,7 +103,7 @@ def check(result, expected): result.dtypes tm.assert_frame_equal(result, expected) - dfl = DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfl = DataFrame(np.random.randn(5, 2), columns=list("AB")) check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) check(dfl.iloc[:, 1:3], dfl.iloc[:, [1]]) check(dfl.iloc[4:6], dfl.iloc[[4]]) @@ -116,40 +115,57 @@ def check(result, expected): with pytest.raises(IndexError, match=msg): dfl.iloc[:, 4] - @pytest.mark.parametrize("index,columns", [(np.arange(20), list('ABCDE'))]) - @pytest.mark.parametrize("index_vals,column_vals", [ - ([slice(None), ['A', 'D']]), - (['1', '2'], slice(None)), - ([pd.datetime(2019, 1, 1)], slice(None))]) - def test_iloc_non_integer_raises(self, index, columns, - index_vals, column_vals): + @pytest.mark.parametrize("index,columns", [(np.arange(20), list("ABCDE"))]) + @pytest.mark.parametrize( + "index_vals,column_vals", + [ + ([slice(None), ["A", "D"]]), + (["1", "2"], slice(None)), + ([pd.datetime(2019, 1, 1)], slice(None)), + ], + ) + def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): # GH 25753 - df = DataFrame(np.random.randn(len(index), len(columns)), - index=index, - columns=columns) - msg = '.iloc requires numeric indexers, got' + df = DataFrame( + np.random.randn(len(index), len(columns)), index=index, columns=columns + ) + msg = ".iloc requires numeric indexers, got" with pytest.raises(IndexError, match=msg): df.iloc[index_vals, column_vals] def test_iloc_getitem_int(self): # integer - self.check_result('integer', 'iloc', 2, 'ix', - {0: 4, 1: 6, 2: 8}, typs=['ints', 'uints']) - self.check_result('integer', 'iloc', 2, 'indexer', 2, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "integer", "iloc", 2, "ix", {0: 4, 1: 6, 2: 8}, typs=["ints", "uints"] + ) + self.check_result( + "integer", + "iloc", + 2, + "indexer", + 2, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_neg_int(self): # neg integer - self.check_result('neg int', 'iloc', -1, 'ix', - {0: 6, 1: 9, 2: 12}, typs=['ints', 'uints']) - self.check_result('neg int', 'iloc', -1, 'indexer', -1, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - @pytest.mark.parametrize('dims', [1, 2]) + self.check_result( + "neg int", "iloc", -1, "ix", {0: 6, 1: 9, 2: 12}, typs=["ints", "uints"] + ) + self.check_result( + "neg int", + "iloc", + -1, + "indexer", + -1, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + @pytest.mark.parametrize("dims", [1, 2]) def test_iloc_getitem_invalid_scalar(self, dims): # GH 21982 @@ -158,19 +174,18 @@ def test_iloc_getitem_invalid_scalar(self, dims): else: s = DataFrame(np.arange(100).reshape(10, 10)) - with pytest.raises(TypeError, match='Cannot 
index by location index'): - s.iloc['a'] + with pytest.raises(TypeError, match="Cannot index by location index"): + s.iloc["a"] def test_iloc_array_not_mutating_negative_indices(self): # GH 21867 array_with_neg_numbers = np.array([1, 2, -1]) array_copy = array_with_neg_numbers.copy() - df = pd.DataFrame({ - 'A': [100, 101, 102], - 'B': [103, 104, 105], - 'C': [106, 107, 108]}, - index=[1, 2, 3]) + df = pd.DataFrame( + {"A": [100, 101, 102], "B": [103, 104, 105], "C": [106, 107, 108]}, + index=[1, 2, 3], + ) df.iloc[array_with_neg_numbers] tm.assert_numpy_array_equal(array_with_neg_numbers, array_copy) df.iloc[:, array_with_neg_numbers] @@ -179,33 +194,65 @@ def test_iloc_array_not_mutating_negative_indices(self): def test_iloc_getitem_list_int(self): # list of ints - self.check_result('list int', 'iloc', [0, 1, 2], 'ix', - {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, - typs=['ints', 'uints']) - self.check_result('list int', 'iloc', [2], 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "list int", + "iloc", + [0, 1, 2], + "ix", + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "list int", + "iloc", + [2], + "ix", + {0: [4], 1: [6], 2: [8]}, + typs=["ints", "uints"], + ) + self.check_result( + "list int", + "iloc", + [0, 1, 2], + "indexer", + [0, 1, 2], + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) # array of ints (GH5006), make sure that a single indexer is returning # the correct type - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'ix', - {0: [0, 2, 4], - 1: [0, 3, 6], - 2: [0, 4, 8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([2]), 'ix', - {0: [4], 1: [6], 2: [8]}, typs=['ints', 'uints']) - self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', - [0, 1, 2], - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "array int", + "iloc", + np.array([0, 1, 2]), + "ix", + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "array int", + "iloc", + np.array([2]), + "ix", + {0: [4], 1: [6], 2: [8]}, + typs=["ints", "uints"], + ) + self.check_result( + "array int", + "iloc", + np.array([0, 1, 2]), + "indexer", + [0, 1, 2], + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_neg_int_can_reach_first_index(self): # GH10547 and GH10779 # negative integers should be able to reach index 0 - df = DataFrame({'A': [2, 3, 5], 'B': [7, 11, 13]}) - s = df['A'] + df = DataFrame({"A": [2, 3, 5], "B": [7, 11, 13]}) + s = df["A"] expected = df.iloc[0] result = df.iloc[-3] @@ -224,19 +271,25 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): tm.assert_series_equal(result, expected) # check the length 1 Series case highlighted in GH10547 - expected = Series(['a'], index=['A']) + expected = Series(["a"], index=["A"]) result = expected.iloc[[-1]] tm.assert_series_equal(result, expected) def test_iloc_getitem_dups(self): - self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', - {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, - objs=['series', 'frame'], typs=['ints', 'uints']) + self.check_result( + "list int (dups)", + "iloc", + [0, 1, 1, 3], + "ix", + {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, + objs=["series", "frame"], + typs=["ints", "uints"], + ) # GH 6766 - df1 = 
DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) df = concat([df1, df2], axis=1) # cross-sectional indexing @@ -244,53 +297,74 @@ def test_iloc_getitem_dups(self): assert isna(result) result = df.iloc[0, :] - expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], - name=0) + expected = Series([np.nan, 1, 3, 3], index=["A", "B", "A", "B"], name=0) tm.assert_series_equal(result, expected) def test_iloc_getitem_array(self): # array like s = Series(index=range(1, 4)) - self.check_result('array like', 'iloc', s.index, 'ix', - {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, - typs=['ints', 'uints']) + self.check_result( + "array like", + "iloc", + s.index, + "ix", + {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, + typs=["ints", "uints"], + ) def test_iloc_getitem_bool(self): # boolean indexers - b = [True, False, True, False, ] - self.check_result('bool', 'iloc', b, 'ix', b, typs=['ints', 'uints']) - self.check_result('bool', 'iloc', b, 'ix', b, - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) - - @pytest.mark.parametrize('index', [[True, False], - [True, False, True, False]]) + b = [True, False, True, False] + self.check_result("bool", "iloc", b, "ix", b, typs=["ints", "uints"]) + self.check_result( + "bool", + "iloc", + b, + "ix", + b, + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_iloc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises(IndexError, - match=('Item wrong length {} instead of {}.'.format( - len(index), len(s)))): + with pytest.raises( + IndexError, + match=("Item wrong length {} instead of {}.".format(len(index), len(s))), + ): _ = s.iloc[index] def test_iloc_getitem_slice(self): # slices - self.check_result('slice', 'iloc', slice(1, 3), 'ix', - {0: [2, 4], 1: [3, 6], 2: [4, 8]}, - typs=['ints', 'uints']) - self.check_result('slice', 'iloc', slice(1, 3), 'indexer', - slice(1, 3), - typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result( + "slice", + "iloc", + slice(1, 3), + "ix", + {0: [2, 4], 1: [3, 6], 2: [4, 8]}, + typs=["ints", "uints"], + ) + self.check_result( + "slice", + "iloc", + slice(1, 3), + "indexer", + slice(1, 3), + typs=["labels", "mixed", "ts", "floats", "empty"], + fails=IndexError, + ) def test_iloc_getitem_slice_dups(self): - df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), - columns=['A', 'C']) + df1 = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df2 = DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ) # axis=1 df = concat([df1, df2], axis=1) @@ -332,14 +406,17 @@ def test_iloc_setitem(self): def test_iloc_setitem_list(self): # setitem with an iloc list - df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], - columns=["A", "B", "C"]) + df = DataFrame( + np.arange(9).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"] + ) df.iloc[[0, 1], [1, 2]] df.iloc[[0, 1], [1, 2]] += 100 expected = DataFrame( np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), - index=["A", "B", "C"], columns=["A", "B", "C"]) + index=["A", "B", "C"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(df, expected) 
def test_iloc_setitem_pandas_object(self): @@ -359,12 +436,12 @@ def test_iloc_setitem_dups(self): # GH 6766 # iloc with a mask aligning from another iloc - df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) - df2 = DataFrame([{'A': 3, 'B': 3}, {'A': 4, 'B': 4}]) + df1 = DataFrame([{"A": None, "B": 1}, {"A": 2, "B": 2}]) + df2 = DataFrame([{"A": 3, "B": 3}, {"A": 4, "B": 4}]) df = concat([df1, df2], axis=1) expected = df.fillna(3) - expected['A'] = expected['A'].astype('float64') + expected["A"] = expected["A"].astype("float64") inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] @@ -372,8 +449,8 @@ def test_iloc_setitem_dups(self): # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) - expected.columns = ['B', 'B'] - del df['A'] + expected.columns = ["B", "B"] + del df["A"] tm.assert_frame_equal(df, expected) # assign back to self @@ -381,15 +458,14 @@ def test_iloc_setitem_dups(self): tm.assert_frame_equal(df, expected) # reversed x 2 - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) - df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( - drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) + df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) def test_iloc_getitem_frame(self): - df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2), - columns=range(0, 8, 2)) + df = DataFrame( + np.random.randn(10, 4), index=range(0, 20, 2), columns=range(0, 8, 2) + ) result = df.iloc[2] with catch_warnings(record=True): @@ -453,20 +529,21 @@ def test_iloc_getitem_frame(self): def test_iloc_getitem_labelled_frame(self): # try with labelled frame - df = DataFrame(np.random.randn(10, 4), - index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) result = df.iloc[1, 1] - exp = df.loc['b', 'B'] + exp = df.loc["b", "B"] assert result == exp result = df.iloc[:, 2:3] - expected = df.loc[:, ['C']] + expected = df.loc[:, ["C"]] tm.assert_frame_equal(result, expected) # negative indexing result = df.iloc[-1, -1] - exp = df.loc['j', 'D'] + exp = df.loc["j", "D"] assert result == exp # out-of-bounds exception @@ -475,11 +552,13 @@ def test_iloc_getitem_labelled_frame(self): df.iloc[10, 5] # trying to use a label - msg = (r"Location based indexing can only have \[integer, integer" - r" slice \(START point is INCLUDED, END point is EXCLUDED\)," - r" listlike of integers, boolean array\] types") + msg = ( + r"Location based indexing can only have \[integer, integer" + r" slice \(START point is INCLUDED, END point is EXCLUDED\)," + r" listlike of integers, boolean array\] types" + ) with pytest.raises(ValueError, match=msg): - df.iloc['j', 'D'] + df.iloc["j", "D"] def test_iloc_getitem_doc_issue(self): @@ -487,8 +566,8 @@ def test_iloc_getitem_doc_issue(self): # surfaced in GH 6059 arr = np.random.randn(6, 4) - index = date_range('20130101', periods=6) - columns = list('ABCD') + index = date_range("20130101", periods=6) + columns = list("ABCD") df = DataFrame(arr, index=index, columns=columns) # defines ref_locs @@ -498,18 +577,16 @@ def test_iloc_getitem_doc_issue(self): str(result) result.dtypes - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=columns[0:2]) + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) tm.assert_frame_equal(result, expected) # for dups - df.columns = list('aaaa') + 
df.columns = list("aaaa") result = df.iloc[3:5, 0:2] str(result) result.dtypes - expected = DataFrame(arr[3:5, 0:2], index=index[3:5], - columns=list('aa')) + expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=list("aa")) tm.assert_frame_equal(result, expected) # related @@ -522,13 +599,13 @@ def test_iloc_getitem_doc_issue(self): result = df.iloc[1:5, 2:4] str(result) result.dtypes - expected = DataFrame(arr[1:5, 2:4], index=index[1:5], - columns=columns[2:4]) + expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) tm.assert_frame_equal(result, expected) def test_iloc_setitem_series(self): - df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), - columns=list('ABCD')) + df = DataFrame( + np.random.randn(10, 4), index=list("abcdefghij"), columns=list("ABCD") + ) df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -561,46 +638,41 @@ def test_iloc_setitem_list_of_lists(self): # GH 7551 # list-of-list is set incorrectly in mixed vs. single dtyped frames - df = DataFrame(dict(A=np.arange(5, dtype='int64'), - B=np.arange(5, 10, dtype='int64'))) + df = DataFrame( + dict(A=np.arange(5, dtype="int64"), B=np.arange(5, 10, dtype="int64")) + ) df.iloc[2:4] = [[10, 11], [12, 13]] expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - df = DataFrame( - dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) - df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], - B=[5, 6, 11, 13, 9])) + df = DataFrame(dict(A=list("abcde"), B=np.arange(5, 10, dtype="int64"))) + df.iloc[2:4] = [["x", 11], ["y", 13]] + expected = DataFrame(dict(A=["a", "b", "x", "y", "e"], B=[5, 6, 11, 13, 9])) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - 'indexer', [[0], slice(None, 1, None), np.array([0])]) - @pytest.mark.parametrize( - 'value', [['Z'], np.array(['Z'])]) + @pytest.mark.parametrize("indexer", [[0], slice(None, 1, None), np.array([0])]) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) def test_iloc_setitem_with_scalar_index(self, indexer, value): # GH #19474 # assigning like "df.iloc[0, [0]] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". 
- df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.iloc[0, indexer] = value result = df.iloc[0, 0] - assert is_scalar(result) and result == 'Z' + assert is_scalar(result) and result == "Z" def test_iloc_mask(self): # GH 3631, iloc with a mask (of a series) should raise - df = DataFrame(list(range(5)), index=list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) - msg = ("iLocation based boolean indexing cannot use an indexable as" - " a mask") + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "iLocation based boolean indexing cannot use an indexable as" " a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] mask.index = range(len(mask)) - msg = ("iLocation based boolean indexing on an integer type is not" - " available") + msg = "iLocation based boolean indexing on an integer type is not" " available" with pytest.raises(NotImplementedError, match=msg): df.iloc[mask] @@ -612,41 +684,44 @@ def test_iloc_mask(self): locs = np.arange(4) nums = 2 ** locs reps = [bin(num) for num in nums] - df = DataFrame({'locs': locs, 'nums': nums}, reps) + df = DataFrame({"locs": locs, "nums": nums}, reps) expected = { - (None, ''): '0b1100', - (None, '.loc'): '0b1100', - (None, '.iloc'): '0b1100', - ('index', ''): '0b11', - ('index', '.loc'): '0b11', - ('index', '.iloc'): ('iLocation based boolean indexing ' - 'cannot use an indexable as a mask'), - ('locs', ''): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the indexed ' - 'object do not match).', - ('locs', '.loc'): 'Unalignable boolean Series provided as indexer ' - '(index of the boolean Series and of the ' - 'indexed object do not match).', - ('locs', '.iloc'): ('iLocation based boolean indexing on an ' - 'integer type is not available'), + (None, ""): "0b1100", + (None, ".loc"): "0b1100", + (None, ".iloc"): "0b1100", + ("index", ""): "0b11", + ("index", ".loc"): "0b11", + ("index", ".iloc"): ( + "iLocation based boolean indexing " "cannot use an indexable as a mask" + ), + ("locs", ""): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the indexed " + "object do not match).", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer " + "(index of the boolean Series and of the " + "indexed object do not match).", + ("locs", ".iloc"): ( + "iLocation based boolean indexing on an " + "integer type is not available" + ), } # UserWarnings from reindex of a boolean mask with catch_warnings(record=True): simplefilter("ignore", UserWarning) result = dict() - for idx in [None, 'index', 'locs']: + for idx in [None, "index", "locs"]: mask = (df.nums > 2).values if idx: mask = Series(mask, list(reversed(getattr(df, idx)))) - for method in ['', '.loc', '.iloc']: + for method in ["", ".loc", ".iloc"]: try: if method: accessor = getattr(df, method[1:]) else: accessor = df - ans = str(bin(accessor[mask]['nums'].sum())) + ans = str(bin(accessor[mask]["nums"].sum())) except Exception as e: ans = str(e) @@ -654,13 +729,13 @@ def test_iloc_mask(self): r = expected.get(key) if r != ans: raise AssertionError( - "[%s] does not match [%s], received [%s]" - % (key, ans, r)) + "[%s] does not match [%s], received [%s]" % (key, ans, r) + ) def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) - df = DataFrame({'A': [0.1] * 3000, 'B': [1] * 3000}) + df = DataFrame({"A": [0.1] * 3000, "B": [1] * 3000}) idx = np.arange(30) * 99 
expected = df.iloc[idx] @@ -669,7 +744,7 @@ def test_iloc_non_unique_indexing(self): tm.assert_frame_equal(result, expected) - df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) + df2 = DataFrame({"A": [0.1] * 1000, "B": [1] * 1000}) df2 = concat([df2, 2 * df2, 3 * df2]) sidx = df2.index.to_series() @@ -682,35 +757,43 @@ def test_iloc_non_unique_indexing(self): new_list.append(s * 3) expected = DataFrame(new_list) - expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], - sort=True) + expected = concat([expected, DataFrame(index=idx[idx > sidx.max()])], sort=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df2.loc[idx] tm.assert_frame_equal(result, expected, check_index_type=False) def test_iloc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.iloc[:, []], + df.iloc[:, :0], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.iloc[[], :], + df.iloc[:0, :], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_identity_slice_returns_new_object(self): # GH13873 - original_df = DataFrame({'a': [1, 2, 3]}) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] assert sliced_df is not original_df # should be a shallow copy - original_df['a'] = [4, 4, 4] - assert (sliced_df['a'] == 4).all() + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 92966e721aedc..e06047b52ac15 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -13,7 +13,10 @@ from pandas import DataFrame, Index, NaT, Series from pandas.core.generic import NDFrame from pandas.core.indexing import ( - _maybe_numeric_slice, _non_reducing_slice, validate_indices) + _maybe_numeric_slice, + _non_reducing_slice, + validate_indices, +) from pandas.tests.indexing.common import Base, _mklbl import pandas.util.testing as tm @@ -31,70 +34,89 @@ def test_setitem_ndarray_1d(self): # len of indexer vs length of the 1d ndarray df = DataFrame(index=Index(np.arange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) # invalid with pytest.raises(ValueError): - df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) + df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) # valid - df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, - 2.2, 1.0]) + df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) - result = df.loc[df.index[2:6], 'bar'] - expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], - name='bar') + result = df.loc[df.index[2:6], "bar"] + expected = Series( + [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], 
name="bar" + ) tm.assert_series_equal(result, expected) # dtype getting changed? df = DataFrame(index=Index(np.arange(1, 11))) - df['foo'] = np.zeros(10, dtype=np.float64) - df['bar'] = np.zeros(10, dtype=np.complex) + df["foo"] = np.zeros(10, dtype=np.float64) + df["bar"] = np.zeros(10, dtype=np.complex) with pytest.raises(ValueError): df[2:5] = np.arange(1, 4) * 1j - @pytest.mark.parametrize('index', tm.all_index_generator(5), - ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('obj', [ - lambda i: Series(np.arange(len(i)), index=i), - lambda i: DataFrame( - np.random.randn(len(i), len(i)), index=i, columns=i) - ], ids=['Series', 'DataFrame']) - @pytest.mark.parametrize('idxr, idxr_id', [ - (lambda x: x, 'getitem'), - (lambda x: x.loc, 'loc'), - (lambda x: x.iloc, 'iloc'), - pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) - ]) + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "getitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + pytest.param(lambda x: x.ix, "ix", marks=ignore_ix), + ], + ) def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - msg = (r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Cannot index with multidimensional key|" - r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" - "No matching signature found|" # TypeError - "unhashable type: 'numpy.ndarray'" # TypeError - ) - - if (isinstance(obj, Series) and idxr_id == 'getitem' and - index.inferred_type in [ - 'string', 'datetime64', 'period', 'timedelta64', - 'boolean', 'categorical']): + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Cannot index with multidimensional key|" + r"Wrong number of dimensions. 
values.ndim != ndim \[3 != 1\]|" + "No matching signature found|" # TypeError + "unhashable type: 'numpy.ndarray'" # TypeError + ) + + if ( + isinstance(obj, Series) + and idxr_id == "getitem" + and index.inferred_type + in [ + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ): idxr[nd3] else: - if (isinstance(obj, DataFrame) and idxr_id == 'getitem' and - index.inferred_type == 'boolean'): + if ( + isinstance(obj, DataFrame) + and idxr_id == "getitem" + and index.inferred_type == "boolean" + ): error = TypeError - elif idxr_id == 'getitem' and index.inferred_type == 'interval': + elif idxr_id == "getitem" and index.inferred_type == "interval": error = TypeError else: error = ValueError @@ -102,49 +124,74 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): with pytest.raises(error, match=msg): idxr[nd3] - @pytest.mark.parametrize('index', tm.all_index_generator(5), - ids=lambda x: type(x).__name__) - @pytest.mark.parametrize('obj', [ - lambda i: Series(np.arange(len(i)), index=i), - lambda i: DataFrame( - np.random.randn(len(i), len(i)), index=i, columns=i) - ], ids=['Series', 'DataFrame']) - @pytest.mark.parametrize('idxr, idxr_id', [ - (lambda x: x, 'setitem'), - (lambda x: x.loc, 'loc'), - (lambda x: x.iloc, 'iloc'), - pytest.param(lambda x: x.ix, 'ix', marks=ignore_ix) - ]) + @pytest.mark.parametrize( + "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ + ) + @pytest.mark.parametrize( + "obj", + [ + lambda i: Series(np.arange(len(i)), index=i), + lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i), + ], + ids=["Series", "DataFrame"], + ) + @pytest.mark.parametrize( + "idxr, idxr_id", + [ + (lambda x: x, "setitem"), + (lambda x: x.loc, "loc"), + (lambda x: x.iloc, "iloc"), + pytest.param(lambda x: x.ix, "ix", marks=ignore_ix), + ], + ) def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - msg = (r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" - "'pandas._libs.interval.IntervalTree' object has no attribute" - " 'set_value'|" # AttributeError - "unhashable type: 'numpy.ndarray'|" # TypeError - "No matching signature found|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError - ) - - if ((idxr_id == 'iloc') - or ((isinstance(obj, Series) and idxr_id == 'setitem' - and index.inferred_type in [ - 'floating', 'string', 'datetime64', 'period', 'timedelta64', - 'boolean', 'categorical'])) - or (idxr_id == 'ix' and index.inferred_type in [ - 'string', 'datetime64', 'period', 'boolean'])): + msg = ( + r"Buffer has wrong number of dimensions \(expected 1," + r" got 3\)|" + "The truth value of an array with more than one element is" + " ambiguous|" + "Only 1-dimensional input arrays are supported|" + "'pandas._libs.interval.IntervalTree' object has no attribute" + " 'set_value'|" # AttributeError + "unhashable type: 'numpy.ndarray'|" # TypeError + "No matching signature found|" # TypeError + r"^\[\[\[" # pandas.core.indexing.IndexingError + ) + + if ( + (idxr_id == "iloc") + or ( + ( + isinstance(obj, Series) + and idxr_id == "setitem" + and index.inferred_type + in [ + "floating", + "string", + "datetime64", + "period", + "timedelta64", + "boolean", + "categorical", + ] + ) + ) + or ( + idxr_id == "ix" + and index.inferred_type in ["string", 
"datetime64", "period", "boolean"] + ) + ): idxr[nd3] = 0 else: with pytest.raises( - (ValueError, AttributeError, TypeError, - pd.core.indexing.IndexingError), match=msg): + (ValueError, AttributeError, TypeError, pd.core.indexing.IndexingError), + match=msg, + ): idxr[nd3] = 0 def test_inf_upcast(self): @@ -179,58 +226,67 @@ def test_setitem_dtype_upcast(self): # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) - df['c'] = np.nan - assert df['c'].dtype == np.float64 + df["c"] = np.nan + assert df["c"].dtype == np.float64 - df.loc[0, 'c'] = 'foo' - expected = DataFrame([{"a": 1, "c": 'foo'}, - {"a": 3, "b": 2, "c": np.nan}]) + df.loc[0, "c"] = "foo" + expected = DataFrame([{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]) tm.assert_frame_equal(df, expected) # GH10280 - df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), - index=list('ab'), - columns=['foo', 'bar', 'baz']) + df = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3), + index=list("ab"), + columns=["foo", "bar", "baz"], + ) - for val in [3.14, 'wxyz']: + for val in [3.14, "wxyz"]: left = df.copy() - left.loc['a', 'bar'] = val - right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), - columns=['foo', 'bar', 'baz']) + left.loc["a", "bar"] = val + right = DataFrame( + [[0, val, 2], [3, 4, 5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) tm.assert_frame_equal(left, right) - assert is_integer_dtype(left['foo']) - assert is_integer_dtype(left['baz']) - - left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, - index=list('ab'), - columns=['foo', 'bar', 'baz']) - left.loc['a', 'bar'] = 'wxyz' - - right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), - columns=['foo', 'bar', 'baz']) + assert is_integer_dtype(left["foo"]) + assert is_integer_dtype(left["baz"]) + + left = DataFrame( + np.arange(6, dtype="int64").reshape(2, 3) / 10.0, + index=list("ab"), + columns=["foo", "bar", "baz"], + ) + left.loc["a", "bar"] = "wxyz" + + right = DataFrame( + [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]], + index=list("ab"), + columns=["foo", "bar", "baz"], + ) tm.assert_frame_equal(left, right) - assert is_float_dtype(left['foo']) - assert is_float_dtype(left['baz']) + assert is_float_dtype(left["foo"]) + assert is_float_dtype(left["baz"]) def test_dups_fancy_indexing(self): # GH 3455 from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(10, 3) - df.columns = ['a', 'a', 'b'] - result = df[['b', 'a']].columns - expected = Index(['b', 'a', 'a']) + df.columns = ["a", "a", "b"] + result = df[["b", "a"]].columns + expected = Index(["b", "a", "a"]) tm.assert_index_equal(result, expected) # across dtypes - df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']], - columns=list('aaaaaaa')) + df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")) df.head() str(df) - result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']]) - result.columns = list('aaaaaaa') + result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]]) + result.columns = list("aaaaaaa") # TODO(wesm): unused? 
df_v = df.iloc[:, 4] # noqa @@ -240,45 +296,51 @@ def test_dups_fancy_indexing(self): # GH 3561, dups not in selected order df = DataFrame( - {'test': [5, 7, 9, 11], - 'test1': [4., 5, 6, 7], - 'other': list('abcd')}, index=['A', 'A', 'B', 'C']) - rows = ['C', 'B'] + {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")}, + index=["A", "A", "B", "C"], + ) + rows = ["C", "B"] expected = DataFrame( - {'test': [11, 9], - 'test1': [7., 6], - 'other': ['d', 'c']}, index=rows) + {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows + ) result = df.loc[rows] tm.assert_frame_equal(result, expected) result = df.loc[Index(rows)] tm.assert_frame_equal(result, expected) - rows = ['C', 'B', 'E'] + rows = ["C", "B", "E"] expected = DataFrame( - {'test': [11, 9, np.nan], - 'test1': [7., 6, np.nan], - 'other': ['d', 'c', np.nan]}, index=rows) + { + "test": [11, 9, np.nan], + "test1": [7.0, 6, np.nan], + "other": ["d", "c", np.nan], + }, + index=rows, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer - rows = ['F', 'G', 'H', 'C', 'B', 'E'] - expected = DataFrame({'test': [np.nan, np.nan, np.nan, 11, 9, np.nan], - 'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan], - 'other': [np.nan, np.nan, np.nan, - 'd', 'c', np.nan]}, - index=rows) + rows = ["F", "G", "H", "C", "B", "E"] + expected = DataFrame( + { + "test": [np.nan, np.nan, np.nan, 11, 9, np.nan], + "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan], + "other": [np.nan, np.nan, np.nan, "d", "c", np.nan], + }, + index=rows, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[rows] tm.assert_frame_equal(result, expected) # List containing only missing label - dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD')) + dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD")) with pytest.raises(KeyError): - dfnu.loc[['E']] + dfnu.loc[["E"]] # ToDo: check_index_type can be True after GH 11497 @@ -289,50 +351,52 @@ def test_dups_fancy_indexing(self): expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) - df = DataFrame({"A": list('abc')}) + df = DataFrame({"A": list("abc")}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.loc[[0, 8, 0]] - expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) + expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0]) tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector - df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) + df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"]) expected = DataFrame( - {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) + {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"] + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[['A', 'A', 'E']] + result = df.loc[["A", "A", "E"]] tm.assert_frame_equal(result, expected) def test_dups_fancy_indexing2(self): # GH 5835 # dups on index and missing values - df = DataFrame( - np.random.randn(5, 5), columns=['A', 'B', 'B', 'B', 'A']) + df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"]) expected = pd.concat( - [df.loc[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], - index=df.index)], axis=1) + [df.loc[:, ["A", "B"]], DataFrame(np.nan, columns=["C"], 
index=df.index)], + axis=1, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.loc[:, ['A', 'B', 'C']] + result = df.loc[:, ["A", "B", "C"]] tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing - df = DataFrame(np.random.randn(9, 2), - index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) + df = DataFrame( + np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"] + ) expected = df.iloc[0:6] result = df.loc[[1, 2]] tm.assert_frame_equal(result, expected) expected = df - result = df.loc[:, ['a', 'b']] + result = df.loc[:, ["a", "b"]] tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] - result = df.loc[[1, 2], ['a', 'b']] + result = df.loc[[1, 2], ["a", "b"]] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('case', [lambda s: s, lambda s: s.loc]) + @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc]) def test_duplicate_int_indexing(self, case): # GH 17347 s = pd.Series(range(3), index=[1, 1, 3]) @@ -343,17 +407,18 @@ def test_duplicate_int_indexing(self, case): def test_indexing_mixed_frame_bug(self): # GH3492 - df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'}, - 'b': {1: 111, 2: 222, 3: 333}}) + df = DataFrame( + {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}} + ) # this works, new column is created correctly - df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x) + df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x) # this does not work, ie column test is not changed - idx = df['test'] == '_' - temp = df.loc[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x) - df.loc[idx, 'test'] = temp - assert df.iloc[0, 2] == '-----' + idx = df["test"] == "_" + temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x) + df.loc[idx, "test"] = temp + assert df.iloc[0, 2] == "-----" # if I look at df, then element [0,2] equals '_'. 
If instead I type # df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I @@ -361,8 +426,7 @@ def test_indexing_mixed_frame_bug(self): def test_multitype_list_index_access(self): # GH 10610 - df = DataFrame(np.random.random((10, 5)), - columns=["a"] + [20, 21, 22, 23]) + df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23]) with pytest.raises(KeyError): df[[22, 26, -8]] @@ -371,88 +435,110 @@ def test_multitype_list_index_access(self): def test_set_index_nan(self): # GH 3586 - df = DataFrame({'PRuid': {17: 'nonQC', - 18: 'nonQC', - 19: 'nonQC', - 20: '10', - 21: '11', - 22: '12', - 23: '13', - 24: '24', - 25: '35', - 26: '46', - 27: '47', - 28: '48', - 29: '59', - 30: '10'}, - 'QC': {17: 0.0, - 18: 0.0, - 19: 0.0, - 20: np.nan, - 21: np.nan, - 22: np.nan, - 23: np.nan, - 24: 1.0, - 25: np.nan, - 26: np.nan, - 27: np.nan, - 28: np.nan, - 29: np.nan, - 30: np.nan}, - 'data': {17: 7.9544899999999998, - 18: 8.0142609999999994, - 19: 7.8591520000000008, - 20: 0.86140349999999999, - 21: 0.87853110000000001, - 22: 0.8427041999999999, - 23: 0.78587700000000005, - 24: 0.73062459999999996, - 25: 0.81668560000000001, - 26: 0.81927080000000008, - 27: 0.80705009999999999, - 28: 0.81440240000000008, - 29: 0.80140849999999997, - 30: 0.81307740000000006}, - 'year': {17: 2006, - 18: 2007, - 19: 2008, - 20: 1985, - 21: 1985, - 22: 1985, - 23: 1985, - 24: 1985, - 25: 1985, - 26: 1985, - 27: 1985, - 28: 1985, - 29: 1985, - 30: 1986}}).reset_index() - - result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex( - columns=df.columns) + df = DataFrame( + { + "PRuid": { + 17: "nonQC", + 18: "nonQC", + 19: "nonQC", + 20: "10", + 21: "11", + 22: "12", + 23: "13", + 24: "24", + 25: "35", + 26: "46", + 27: "47", + 28: "48", + 29: "59", + 30: "10", + }, + "QC": { + 17: 0.0, + 18: 0.0, + 19: 0.0, + 20: np.nan, + 21: np.nan, + 22: np.nan, + 23: np.nan, + 24: 1.0, + 25: np.nan, + 26: np.nan, + 27: np.nan, + 28: np.nan, + 29: np.nan, + 30: np.nan, + }, + "data": { + 17: 7.9544899999999998, + 18: 8.0142609999999994, + 19: 7.8591520000000008, + 20: 0.86140349999999999, + 21: 0.87853110000000001, + 22: 0.8427041999999999, + 23: 0.78587700000000005, + 24: 0.73062459999999996, + 25: 0.81668560000000001, + 26: 0.81927080000000008, + 27: 0.80705009999999999, + 28: 0.81440240000000008, + 29: 0.80140849999999997, + 30: 0.81307740000000006, + }, + "year": { + 17: 2006, + 18: 2007, + 19: 2008, + 20: 1985, + 21: 1985, + 22: 1985, + 23: 1985, + 24: 1985, + 25: 1985, + 26: 1985, + 27: 1985, + 28: 1985, + 29: 1985, + 30: 1986, + }, + } + ).reset_index() + + result = ( + df.set_index(["year", "PRuid", "QC"]) + .reset_index() + .reindex(columns=df.columns) + ) tm.assert_frame_equal(result, df) def test_multi_assign(self): # GH 3626, an assignment of a sub-df to a df - df = DataFrame({'FC': ['a', 'b', 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': list(range(6)), - 'col2': list(range(6, 12)), - }) + df = DataFrame( + { + "FC": ["a", "b", "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": list(range(6)), + "col2": list(range(6, 12)), + } + ) df.iloc[1, 0] = np.nan df2 = df.copy() mask = ~df2.FC.isna() - cols = ['col1', 'col2'] + cols = ["col1", "col2"] dft = df2 * 2 dft.iloc[3, 3] = np.nan - expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': Series([0, 1, 4, 6, 8, 10]), - 'col2': [12, 7, 16, np.nan, 20, 22]}) + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": Series([0, 
1, 4, 6, 8, 10]), + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) # frame on rhs df2.loc[mask, cols] = dft.loc[mask, cols] @@ -464,10 +550,14 @@ def test_multi_assign(self): # with an ndarray on rhs # coerces to float64 because values has float64 dtype # GH 14001 - expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'], - 'PF': [0, 0, 0, 0, 1, 1], - 'col1': [0., 1., 4., 6., 8., 10.], - 'col2': [12, 7, 16, np.nan, 20, 22]}) + expected = DataFrame( + { + "FC": ["a", np.nan, "a", "b", "a", "b"], + "PF": [0, 0, 0, 0, 1, 1], + "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], + "col2": [12, 7, 16, np.nan, 20, 22], + } + ) df2 = df.copy() df2.loc[mask, cols] = dft.loc[mask, cols].values tm.assert_frame_equal(df2, expected) @@ -475,15 +565,21 @@ def test_multi_assign(self): tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required - df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[ - 0, 0, 0, 10, 11], D=[3, 4, 5, 6, 7])) + df = DataFrame( + dict( + A=[1, 2, 0, 0, 0], + B=[0, 0, 0, 10, 11], + C=[0, 0, 0, 10, 11], + D=[3, 4, 5, 6, 7], + ) + ) expected = df.copy() - mask = expected['A'] == 0 - for col in ['A', 'B']: - expected.loc[mask, col] = df['D'] + mask = expected["A"] == 0 + for col in ["A", "B"]: + expected.loc[mask, col] = df["D"] - df.loc[df['A'] == 0, ['A', 'B']] = df['D'] + df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected) def test_setitem_list(self): @@ -505,7 +601,6 @@ def test_setitem_list(self): # ix with an object class TO: - def __init__(self, value): self.value = value @@ -547,104 +642,113 @@ def test_string_slice(self): # GH 14424 # string indexing against datetimelike with object # dtype should properly raises KeyError - df = DataFrame([1], Index([pd.Timestamp('2011-01-01')], dtype=object)) + df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) assert df.index.is_all_dates with pytest.raises(KeyError): - df['2011'] + df["2011"] with pytest.raises(KeyError): - df.loc['2011', 0] + df.loc["2011", 0] df = DataFrame() assert not df.index.is_all_dates with pytest.raises(KeyError): - df['2011'] + df["2011"] with pytest.raises(KeyError): - df.loc['2011', 0] + df.loc["2011", 0] def test_astype_assignment(self): # GH4312 (iloc) - df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df_orig = DataFrame( + [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) - expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + expected = DataFrame( + [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() - df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64) - expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64) + expected = DataFrame( + [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64) - expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']], - columns=list('ABCDEFG')) + df.loc[:, ["B", "C"]] = 
df.loc[:, ["B", "C"]].astype(np.int64) + expected = DataFrame( + [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") + ) tm.assert_frame_equal(df, expected) # full replacements / no nans - df = DataFrame({'A': [1., 2., 3., 4.]}) - df.iloc[:, 0] = df['A'].astype(np.int64) - expected = DataFrame({'A': [1, 2, 3, 4]}) + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.iloc[:, 0] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - df = DataFrame({'A': [1., 2., 3., 4.]}) - df.loc[:, 'A'] = df['A'].astype(np.int64) - expected = DataFrame({'A': [1, 2, 3, 4]}) + df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]}) + df.loc[:, "A"] = df["A"].astype(np.int64) + expected = DataFrame({"A": [1, 2, 3, 4]}) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, 2]), 2), - (Index([0, 1, '2']), '2'), - (Index([0, 1, 2, np.inf, 4]), 4), - (Index([0, 1, 2, np.nan, 4]), 4), - (Index([0, 1, 2, np.inf]), np.inf), - (Index([0, 1, 2, np.nan]), np.nan), - ]) + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), 2), + (Index([0, 1, "2"]), "2"), + (Index([0, 1, 2, np.inf, 4]), 4), + (Index([0, 1, 2, np.nan, 4]), 4), + (Index([0, 1, 2, np.inf]), np.inf), + (Index([0, 1, 2, np.nan]), np.nan), + ], + ) def test_index_contains(self, index, val): assert val in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, 2]), '2'), - (Index([0, 1, '2']), 2), - (Index([0, 1, 2, np.inf]), 4), - (Index([0, 1, 2, np.nan]), 4), - (Index([0, 1, 2, np.inf]), np.nan), - (Index([0, 1, 2, np.nan]), np.inf), - # Checking if np.inf in Int64Index should not cause an OverflowError - # Related to GH 16957 - (pd.Int64Index([0, 1, 2]), np.inf), - (pd.Int64Index([0, 1, 2]), np.nan), - (pd.UInt64Index([0, 1, 2]), np.inf), - (pd.UInt64Index([0, 1, 2]), np.nan), - ]) + @pytest.mark.parametrize( + "index,val", + [ + (Index([0, 1, 2]), "2"), + (Index([0, 1, "2"]), 2), + (Index([0, 1, 2, np.inf]), 4), + (Index([0, 1, 2, np.nan]), 4), + (Index([0, 1, 2, np.inf]), np.nan), + (Index([0, 1, 2, np.nan]), np.inf), + # Checking if np.inf in Int64Index should not cause an OverflowError + # Related to GH 16957 + (pd.Int64Index([0, 1, 2]), np.inf), + (pd.Int64Index([0, 1, 2]), np.nan), + (pd.UInt64Index([0, 1, 2]), np.inf), + (pd.UInt64Index([0, 1, 2]), np.nan), + ], + ) def test_index_not_contains(self, index, val): assert val not in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, '2']), 0), - (Index([0, 1, '2']), '2'), - ]) + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")] + ) def test_mixed_index_contains(self, index, val): # GH 19860 assert val in index - @pytest.mark.parametrize("index,val", [ - (Index([0, 1, '2']), '1'), - (Index([0, 1, '2']), 2), - ]) + @pytest.mark.parametrize( + "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)] + ) def test_mixed_index_not_contains(self, index, val): # GH 19860 assert val not in index @@ -676,14 +780,11 @@ def test_index_type_coercion(self): # then we need to coerce to object # integer indexes - for s in [Series(range(5)), - Series(range(5), index=range(1, 6))]: + for s in [Series(range(5)), Series(range(5), index=range(1, 6))]: assert s.index.is_integer() - for indexer in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() indexer(s2)[0.1] = 0 assert s2.index.is_floating() @@ -697,16 +798,14 @@ def test_index_type_coercion(self): 
tm.assert_index_equal(s2.index, exp) s2 = s.copy() - indexer(s2)['0'] = 0 + indexer(s2)["0"] = 0 assert s2.index.is_object() - for s in [Series(range(5), index=np.arange(5.))]: + for s in [Series(range(5), index=np.arange(5.0))]: assert s.index.is_floating() - for idxr in [lambda x: x.ix, - lambda x: x.loc, - lambda x: x]: + for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() idxr(s2)[0.1] = 0 @@ -718,27 +817,25 @@ def test_index_type_coercion(self): tm.assert_index_equal(s2.index, s.index) s2 = s.copy() - idxr(s2)['0'] = 0 + idxr(s2)["0"] = 0 assert s2.index.is_object() class TestMisc(Base): - def test_float_index_to_mixed(self): df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)}) - df['a'] = 10 - tm.assert_frame_equal(DataFrame({0.0: df[0.0], - 1.0: df[1.0], - 'a': [10] * 10}), - df) + df["a"] = 10 + tm.assert_frame_equal( + DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10}), df + ) def test_float_index_non_scalar_assignment(self): - df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.]) + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df.loc[df.index[:2]] = 1 - expected = DataFrame({'a': [1, 1, 3], 'b': [1, 1, 5]}, index=df.index) + expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index) tm.assert_frame_equal(expected, df) - df = DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}, index=[1., 2., 3.]) + df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0]) df2 = df.copy() df.loc[df.index] = df.loc[df.index] tm.assert_frame_equal(df, df2) @@ -752,15 +849,15 @@ def test_float_index_at_iat(self): def test_mixed_index_assignment(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) - s.at['a'] = 11 + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) + s.at["a"] = 11 assert s.iat[0] == 11 s.at[1] = 22 assert s.iat[3] == 22 def test_mixed_index_no_fallback(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) with pytest.raises(KeyError): s.at[0] with pytest.raises(KeyError): @@ -771,8 +868,8 @@ def test_rhs_alignment(self): # assigned to. 
covers both uniform data-type & multi-type cases def run_tests(df, rhs, right): # label, index, slice - lbl_one, idx_one, slice_one = list('bcd'), [1, 2, 3], slice(1, 4) - lbl_two, idx_two, slice_two = ['joe', 'jolie'], [1, 2], slice(1, 3) + lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4) + lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3) left = df.copy() left.loc[lbl_one, lbl_two] = rhs @@ -802,8 +899,8 @@ def run_tests(df, rhs, right): tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) - cols = ['jim', 'joe', 'jolie', 'joline'] - df = DataFrame(xs, columns=cols, index=list('abcde')) + cols = ["jim", "joe", "jolie", "joline"] + df = DataFrame(xs, columns=cols, index=list("abcde")) # right hand side; permute the indices and multiplpy by -2 rhs = -2 * df.iloc[3:0:-1, 2:0:-1] @@ -817,8 +914,8 @@ def run_tests(df, rhs, right): # make frames multi-type & re-run tests for frame in [df, rhs, right]: - frame['joe'] = frame['joe'].astype('float64') - frame['jolie'] = frame['jolie'].map('@{0}'.format) + frame["joe"] = frame["joe"].astype("float64") + frame["jolie"] = frame["jolie"].map("@{0}".format) run_tests(df, rhs, right) @@ -833,30 +930,27 @@ def assert_slices_equivalent(l_slc, i_slc): tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - for idx in [_mklbl('A', 20), np.arange(20) + 100, - np.linspace(100, 150, 20)]: + for idx in [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: idx = Index(idx) s = Series(np.arange(20), index=idx) - assert_slices_equivalent(SLC[idx[9]::-1], SLC[9::-1]) - assert_slices_equivalent(SLC[:idx[9]:-1], SLC[:8:-1]) - assert_slices_equivalent(SLC[idx[13]:idx[9]:-1], SLC[13:8:-1]) - assert_slices_equivalent(SLC[idx[9]:idx[13]:-1], SLC[:0]) + assert_slices_equivalent(SLC[idx[9] :: -1], SLC[9::-1]) + assert_slices_equivalent(SLC[: idx[9] : -1], SLC[:8:-1]) + assert_slices_equivalent(SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]) + assert_slices_equivalent(SLC[idx[9] : idx[13] : -1], SLC[:0]) def test_slice_with_zero_step_raises(self): - s = Series(np.arange(20), index=_mklbl('A', 20)) - with pytest.raises(ValueError, match='slice step cannot be zero'): + s = Series(np.arange(20), index=_mklbl("A", 20)) + with pytest.raises(ValueError, match="slice step cannot be zero"): s[::0] - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): s.loc[::0] with catch_warnings(record=True): simplefilter("ignore") - with pytest.raises(ValueError, match='slice step cannot be zero'): + with pytest.raises(ValueError, match="slice step cannot be zero"): s.ix[::0] def test_indexing_assignment_dict_already_exists(self): - df = DataFrame({'x': [1, 2, 6], - 'y': [2, 2, 8], - 'z': [-5, 0, 5]}).set_index('z') + df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8], "z": [-5, 0, 5]}).set_index("z") expected = df.copy() rhs = dict(x=9, y=99) df.loc[5] = rhs @@ -865,16 +959,16 @@ def test_indexing_assignment_dict_already_exists(self): def test_indexing_dtypes_on_empty(self): # Check that .iloc and .ix return correct dtypes GH9983 - df = DataFrame({'a': [1, 2, 3], 'b': ['b', 'b2', 'b3']}) + df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]}) with catch_warnings(record=True): simplefilter("ignore") df2 = df.ix[[], :] - assert df2.loc[:, 'a'].dtype == np.int64 - tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) + assert df2.loc[:, "a"].dtype == np.int64 + tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 
0]) with catch_warnings(record=True): simplefilter("ignore") - tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) + tm.assert_series_equal(df2.loc[:, "a"], df2.ix[:, 0]) def test_range_in_series_indexing(self): # range can cause an indexing error @@ -902,7 +996,7 @@ def test_non_reducing_slice(self): slice(None, None, None), [0, 1], np.array([0, 1]), - Series([0, 1]) + Series([0, 1]), ] for slice_ in slices: tslice_ = _non_reducing_slice(slice_) @@ -910,45 +1004,46 @@ def test_non_reducing_slice(self): def test_list_slice(self): # like dataframe getitem - slices = [['A'], Series(['A']), np.array(['A'])] - df = DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['A', 'B']) - expected = pd.IndexSlice[:, ['A']] + slices = [["A"], Series(["A"]), np.array(["A"])] + df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) + expected = pd.IndexSlice[:, ["A"]] for subset in slices: result = _non_reducing_slice(subset) tm.assert_frame_equal(df.loc[result], df.loc[expected]) def test_maybe_numeric_slice(self): - df = DataFrame({'A': [1, 2], 'B': ['c', 'd'], 'C': [True, False]}) + df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) result = _maybe_numeric_slice(df, slice_=None) - expected = pd.IndexSlice[:, ['A']] + expected = pd.IndexSlice[:, ["A"]] assert result == expected result = _maybe_numeric_slice(df, None, include_bool=True) - expected = pd.IndexSlice[:, ['A', 'C']] + expected = pd.IndexSlice[:, ["A", "C"]] result = _maybe_numeric_slice(df, [1]) expected = [1] assert result == expected def test_partial_boolean_frame_indexing(self): # GH 17170 - df = DataFrame(np.arange(9.).reshape(3, 3), - index=list('abc'), columns=list('ABC')) - index_df = DataFrame(1, index=list('ab'), columns=list('AB')) + df = DataFrame( + np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC") + ) + index_df = DataFrame(1, index=list("ab"), columns=list("AB")) result = df[index_df.notnull()] - expected = DataFrame(np.array([[0., 1., np.nan], - [3., 4., np.nan], - [np.nan] * 3]), - index=list('abc'), - columns=list('ABC')) + expected = DataFrame( + np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]), + index=list("abc"), + columns=list("ABC"), + ) tm.assert_frame_equal(result, expected) def test_no_reference_cycle(self): - df = DataFrame({'a': [0, 1], 'b': [2, 3]}) - for name in ('loc', 'iloc', 'at', 'iat'): + df = DataFrame({"a": [0, 1], "b": [2, 3]}) + for name in ("loc", "iloc", "at", "iat"): getattr(df, name) with catch_warnings(record=True): simplefilter("ignore") - getattr(df, 'ix') + getattr(df, "ix") wr = weakref.ref(df) del df assert wr() is None @@ -959,11 +1054,11 @@ class TestSeriesNoneCoercion: # For numeric series, we should coerce to NaN. ([1, 2, 3], [np.nan, 2, 3]), ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), - # For datetime series, we should coerce to NaT. - ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), # For objects, we should preserve the None value. (["foo", "bar", "baz"], [None, "bar", "baz"]), ] @@ -1006,56 +1101,60 @@ class TestDataframeNoneCoercion: # For numeric series, we should coerce to NaN. ([1, 2, 3], [np.nan, 2, 3]), ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]), - # For datetime series, we should coerce to NaT. 
- ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]), - + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), # For objects, we should preserve the None value. (["foo", "bar", "baz"], [None, "bar", "baz"]), ] def test_coercion_with_loc(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe.loc[0, ['foo']] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[0, ["foo"]] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_coercion_with_setitem_and_dataframe(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][ - 0]] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_loc_and_dataframe(self): - for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: - start_dataframe = DataFrame({'foo': start_data}) - start_dataframe.loc[start_dataframe['foo'] == start_dataframe[ - 'foo'][0]] = None + for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS: + start_dataframe = DataFrame({"foo": start_data}) + start_dataframe.loc[ + start_dataframe["foo"] == start_dataframe["foo"][0] + ] = None - expected_dataframe = DataFrame({'foo': expected_result}) + expected_dataframe = DataFrame({"foo": expected_result}) tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_mixed_dtypes(self): - start_dataframe = DataFrame({ - 'a': [1, 2, 3], - 'b': [1.0, 2.0, 3.0], - 'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, - 3)], - 'd': ['a', 'b', 'c'] - }) + start_dataframe = DataFrame( + { + "a": [1, 2, 3], + "b": [1.0, 2.0, 3.0], + "c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": ["a", "b", "c"], + } + ) start_dataframe.iloc[0] = None - exp = DataFrame({'a': [np.nan, 2, 3], - 'b': [np.nan, 2.0, 3.0], - 'c': [NaT, datetime(2000, 1, 2), - datetime(2000, 1, 3)], - 'd': [None, 'b', 'c']}) + exp = DataFrame( + { + "a": [np.nan, 2, 3], + "b": [np.nan, 2.0, 3.0], + "c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], + "d": [None, "b", "c"], + } + ) tm.assert_frame_equal(start_dataframe, exp) @@ -1085,13 +1184,17 @@ def test_validate_indices_empty(): def test_extension_array_cross_section(): # A cross-section of a homogeneous EA should be an EA - df = pd.DataFrame({ - "A": pd.core.arrays.integer_array([1, 2]), - "B": pd.core.arrays.integer_array([3, 4]) - }, index=['a', 'b']) - expected = pd.Series(pd.core.arrays.integer_array([1, 3]), - index=['A', 'B'], name='a') - result = df.loc['a'] + df = pd.DataFrame( + { + "A": pd.core.arrays.integer_array([1, 2]), + "B": pd.core.arrays.integer_array([3, 4]), + }, + index=["a", "b"], + ) + expected = pd.Series( + 
pd.core.arrays.integer_array([1, 3]), index=["A", "B"], name="a" + ) + result = df.loc["a"] tm.assert_series_equal(result, expected) result = df.iloc[0] @@ -1099,34 +1202,40 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): - df = pd.DataFrame({ - "A": pd.core.arrays.integer_array([1, 2]), - "B": np.array([1, 2]), - }, index=['a', 'b']) - result = df.loc['a'] - expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a') + df = pd.DataFrame( + {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])}, + index=["a", "b"], + ) + result = df.loc["a"] + expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('idxr, error, error_message', [ - (lambda x: x, - AttributeError, - "'numpy.ndarray' object has no attribute 'get'"), - (lambda x: x.loc, - AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), - (lambda x: x.iloc, - AttributeError, - "type object 'NDFrame' has no attribute '_AXIS_ALIASES'"), - pytest.param( - lambda x: x.ix, - ValueError, - "NDFrameIndexer does not support NDFrame objects with ndim > 2", - marks=ignore_ix) -]) +@pytest.mark.parametrize( + "idxr, error, error_message", + [ + (lambda x: x, AttributeError, "'numpy.ndarray' object has no attribute 'get'"), + ( + lambda x: x.loc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + ), + ( + lambda x: x.iloc, + AttributeError, + "type object 'NDFrame' has no attribute '_AXIS_ALIASES'", + ), + pytest.param( + lambda x: x.ix, + ValueError, + "NDFrameIndexer does not support NDFrame objects with ndim > 2", + marks=ignore_ix, + ), + ], +) def test_ndframe_indexing_raises(idxr, error, error_message): # GH 25567 frame = NDFrame(np.random.randint(5, size=(2, 2, 2))) diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py index 71a797741bbdb..7303c1ff3d111 100644 --- a/pandas/tests/indexing/test_indexing_engines.py +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -61,8 +61,7 @@ def test_get_loc(self, numeric_indexing_engine_type_and_dtype): result = engine.get_loc(2) assert (result == expected).all() - def test_get_backfill_indexer( - self, numeric_indexing_engine_type_and_dtype): + def test_get_backfill_indexer(self, numeric_indexing_engine_type_and_dtype): engine_type, dtype = numeric_indexing_engine_type_and_dtype arr = np.array([1, 5, 10], dtype=dtype) @@ -74,8 +73,7 @@ def test_get_backfill_indexer( expected = libalgos.backfill(arr, new) tm.assert_numpy_array_equal(result, expected) - def test_get_pad_indexer( - self, numeric_indexing_engine_type_and_dtype): + def test_get_pad_indexer(self, numeric_indexing_engine_type_and_dtype): engine_type, dtype = numeric_indexing_engine_type_and_dtype arr = np.array([1, 5, 10], dtype=dtype) @@ -91,13 +89,12 @@ def test_get_pad_indexer( class TestObjectEngine: engine_type = libindex.ObjectEngine dtype = np.object_ - values = list('abc') + values = list("abc") def test_is_monotonic(self): num = 1000 - arr = np.array(['a'] * num + ['a'] * num + ['c'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["a"] * num + ["c"] * num, dtype=self.dtype) # monotonic increasing engine = self.engine_type(lambda: arr, len(arr)) @@ -110,8 +107,7 @@ def test_is_monotonic(self): assert engine.is_monotonic_decreasing is True # neither monotonic increasing or decreasing - arr = np.array(['a'] * 
num + ['b'] * num + ['a'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["b"] * num + ["a"] * num, dtype=self.dtype) engine = self.engine_type(lambda: arr[::-1], len(arr)) assert engine.is_monotonic_increasing is False assert engine.is_monotonic_decreasing is False @@ -123,7 +119,7 @@ def test_is_unique(self): assert engine.is_unique is True # not unique - arr = np.array(['a', 'b', 'a'], dtype=self.dtype) + arr = np.array(["a", "b", "a"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) assert engine.is_unique is False @@ -131,37 +127,36 @@ def test_get_loc(self): # unique arr = np.array(self.values, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - assert engine.get_loc('b') == 1 + assert engine.get_loc("b") == 1 # monotonic num = 1000 - arr = np.array(['a'] * num + ['b'] * num + ['c'] * num, - dtype=self.dtype) + arr = np.array(["a"] * num + ["b"] * num + ["c"] * num, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - assert engine.get_loc('b') == slice(1000, 2000) + assert engine.get_loc("b") == slice(1000, 2000) # not monotonic arr = np.array(self.values * num, dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) expected = np.array([False, True, False] * num, dtype=bool) - result = engine.get_loc('b') + result = engine.get_loc("b") assert (result == expected).all() def test_get_backfill_indexer(self): - arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + arr = np.array(["a", "e", "j"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - new = np.array(list('abcdefghij'), dtype=self.dtype) + new = np.array(list("abcdefghij"), dtype=self.dtype) result = engine.get_backfill_indexer(new) expected = libalgos.backfill["object"](arr, new) tm.assert_numpy_array_equal(result, expected) def test_get_pad_indexer(self): - arr = np.array(['a', 'e', 'j'], dtype=self.dtype) + arr = np.array(["a", "e", "j"], dtype=self.dtype) engine = self.engine_type(lambda: arr, len(arr)) - new = np.array(list('abcdefghij'), dtype=self.dtype) + new = np.array(list("abcdefghij"), dtype=self.dtype) result = engine.get_pad_indexer(new) expected = libalgos.pad["object"](arr, new) diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index 67467a5f34c44..bf8c6afd00561 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -5,11 +5,10 @@ class TestIndexingSlow: - @pytest.mark.slow def test_large_dataframe_indexing(self): # GH10692 - result = DataFrame({'x': range(10 ** 6)}, dtype='int64') + result = DataFrame({"x": range(10 ** 6)}, dtype="int64") result.loc[len(result)] = len(result) + 1 - expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') + expected = DataFrame({"x": range(10 ** 6 + 1)}, dtype="int64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index 270fa8c5502a6..ee62c91ad9698 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -15,15 +15,13 @@ def test_ix_deprecation(): # GH 15114 - df = DataFrame({'A': [1, 2, 3]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=True): - df.ix[1, 'A'] + df = DataFrame({"A": [1, 2, 3]}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=True): + df.ix[1, "A"] @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") class TestIX: - def test_ix_loc_setitem_consistency(self): # GH 5771 @@ -35,45 +33,51 @@ def 
test_ix_loc_setitem_consistency(self): # GH 5928 # chained indexing assignment - df = DataFrame({'a': [0, 1, 2]}) + df = DataFrame({"a": [0, 1, 2]}) expected = df.copy() with catch_warnings(record=True): - expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] + expected.ix[[0, 1, 2], "a"] = -expected.ix[[0, 1, 2], "a"] with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] + df["a"].ix[[0, 1, 2]] = -df["a"].ix[[0, 1, 2]] tm.assert_frame_equal(df, expected) - df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) + df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2]}) with catch_warnings(record=True): - df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype( - 'float64') + 0.5 - expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) + df["a"].ix[[0, 1, 2]] = -df["a"].ix[[0, 1, 2]].astype("float64") + 0.5 + expected = DataFrame({"a": [0.5, -0.5, -1.5], "b": [0, 1, 2]}) tm.assert_frame_equal(df, expected) # GH 8607 # ix setitem consistency - df = DataFrame({'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470], - 'timestamp': [1413840976, 1413842580, 1413760580]}) - expected = DataFrame({'delta': [1174, 904, 161], - 'elapsed': [7673, 9277, 1470], - 'timestamp': pd.to_datetime( - [1413840976, 1413842580, 1413760580], - unit='s') - }) + df = DataFrame( + { + "delta": [1174, 904, 161], + "elapsed": [7673, 9277, 1470], + "timestamp": [1413840976, 1413842580, 1413760580], + } + ) + expected = DataFrame( + { + "delta": [1174, 904, 161], + "elapsed": [7673, 9277, 1470], + "timestamp": pd.to_datetime( + [1413840976, 1413842580, 1413760580], unit="s" + ), + } + ) df2 = df.copy() - df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + df2["timestamp"] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) df2 = df.copy() - df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') + df2.loc[:, "timestamp"] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) df2 = df.copy() with catch_warnings(record=True): - df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') + df2.ix[:, 2] = pd.to_datetime(df["timestamp"], unit="s") tm.assert_frame_equal(df2, expected) def test_ix_loc_consistency(self): @@ -89,34 +93,47 @@ def compare(result, expected): assert expected.equals(result) # failure cases for .loc, but these work for .ix - df = DataFrame(np.random.randn(5, 4), columns=list('ABCD')) - for key in [slice(1, 3), tuple([slice(0, 2), slice(0, 2)]), - tuple([slice(0, 2), df.columns[0:2]])]: - - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, - tm.makeDateIndex, tm.makePeriodIndex, - tm.makeTimedeltaIndex]: + df = DataFrame(np.random.randn(5, 4), columns=list("ABCD")) + for key in [ + slice(1, 3), + tuple([slice(0, 2), slice(0, 2)]), + tuple([slice(0, 2), df.columns[0:2]]), + ]: + + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + tm.makeTimedeltaIndex, + ]: df.index = index(len(df.index)) with catch_warnings(record=True): df.ix[key] - msg = (r"cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\] of" - r" {kind}" - .format(klass=type(df.index), kind=str(int))) + msg = ( + r"cannot do slice indexing" + r" on {klass} with these indexers \[(0|1)\] of" + r" {kind}".format(klass=type(df.index), kind=str(int)) + ) with pytest.raises(TypeError, match=msg): df.loc[key] - df = DataFrame(np.random.randn(5, 4), columns=list('ABCD'), - index=pd.date_range('2012-01-01', periods=5)) - - for key in ['2012-01-03', - '2012-01-31', - 
slice('2012-01-03', '2012-01-03'), - slice('2012-01-03', '2012-01-04'), - slice('2012-01-03', '2012-01-06', 2), - slice('2012-01-03', '2012-01-31'), - tuple([[True, True, True, False, True]]), ]: + df = DataFrame( + np.random.randn(5, 4), + columns=list("ABCD"), + index=pd.date_range("2012-01-01", periods=5), + ) + + for key in [ + "2012-01-03", + "2012-01-31", + slice("2012-01-03", "2012-01-03"), + slice("2012-01-03", "2012-01-04"), + slice("2012-01-03", "2012-01-06", 2), + slice("2012-01-03", "2012-01-31"), + tuple([[True, True, True, False, True]]), + ]: # getitem @@ -142,12 +159,12 @@ def compare(result, expected): compare(df2, df1) # edge cases - s = Series([1, 2, 3, 4], index=list('abde')) + s = Series([1, 2, 3, 4], index=list("abde")) - result1 = s['a':'c'] + result1 = s["a":"c"] with catch_warnings(record=True): - result2 = s.ix['a':'c'] - result3 = s.loc['a':'c'] + result2 = s.ix["a":"c"] + result3 = s.loc["a":"c"] tm.assert_series_equal(result1, result2) tm.assert_series_equal(result1, result3) @@ -166,65 +183,57 @@ def compare(result, expected): def test_ix_weird_slicing(self): # http://stackoverflow.com/q/17056560/1240268 - df = DataFrame({'one': [1, 2, 3, np.nan, np.nan], - 'two': [1, 2, 3, 4, 5]}) - df.loc[df['one'] > 1, 'two'] = -df['two'] - - expected = DataFrame({'one': {0: 1.0, - 1: 2.0, - 2: 3.0, - 3: np.nan, - 4: np.nan}, - 'two': {0: 1, - 1: -2, - 2: -3, - 3: 4, - 4: 5}}) + df = DataFrame({"one": [1, 2, 3, np.nan, np.nan], "two": [1, 2, 3, 4, 5]}) + df.loc[df["one"] > 1, "two"] = -df["two"] + + expected = DataFrame( + { + "one": {0: 1.0, 1: 2.0, 2: 3.0, 3: np.nan, 4: np.nan}, + "two": {0: 1, 1: -2, 2: -3, 3: 4, 4: 5}, + } + ) tm.assert_frame_equal(df, expected) def test_ix_assign_column_mixed(self, float_frame): # GH #1142 df = float_frame - df['foo'] = 'bar' + df["foo"] = "bar" - orig = df.loc[:, 'B'].copy() - df.loc[:, 'B'] = df.loc[:, 'B'] + 1 + orig = df.loc[:, "B"].copy() + df.loc[:, "B"] = df.loc[:, "B"] + 1 tm.assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value - df = DataFrame({'x': np.arange(10), - 'y': np.arange(10, 20), - 'z': 'bar'}) + df = DataFrame({"x": np.arange(10), "y": np.arange(10, 20), "z": "bar"}) expected = df.copy() for i in range(5): indexer = i * 2 v = 1000 + i * 200 - expected.loc[indexer, 'y'] = v - assert expected.loc[indexer, 'y'] == v + expected.loc[indexer, "y"] = v + assert expected.loc[indexer, "y"] == v - df.loc[df.x % 2 == 0, 'y'] = df.loc[df.x % 2 == 0, 'y'] * 100 + df.loc[df.x % 2 == 0, "y"] = df.loc[df.x % 2 == 0, "y"] * 100 tm.assert_frame_equal(df, expected) # GH 4508, making sure consistency of assignments - df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]}) - df.loc[[0, 2, ], 'b'] = [100, -100] - expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]}) + df = DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]}) + df.loc[[0, 2], "b"] = [100, -100] + expected = DataFrame({"a": [1, 2, 3], "b": [100, 1, -100]}) tm.assert_frame_equal(df, expected) - df = DataFrame({'a': list(range(4))}) - df['b'] = np.nan - df.loc[[1, 3], 'b'] = [100, -100] - expected = DataFrame({'a': [0, 1, 2, 3], - 'b': [np.nan, 100, np.nan, -100]}) + df = DataFrame({"a": list(range(4))}) + df["b"] = np.nan + df.loc[[1, 3], "b"] = [100, -100] + expected = DataFrame({"a": [0, 1, 2, 3], "b": [np.nan, 100, np.nan, -100]}) tm.assert_frame_equal(df, expected) # ok, but chained assignments are dangerous # if we turn off chained assignment it will work - with option_context('chained_assignment', None): - df = DataFrame({'a': list(range(4))}) - df['b'] 
= np.nan - df['b'].loc[[1, 3]] = [100, -100] + with option_context("chained_assignment", None): + df = DataFrame({"a": list(range(4))}) + df["b"] = np.nan + df["b"].loc[[1, 3]] = [100, -100] tm.assert_frame_equal(df, expected) def test_ix_get_set_consistency(self): @@ -232,58 +241,60 @@ def test_ix_get_set_consistency(self): # GH 4544 # ix/loc get/set not consistent when # a mixed int/string index - df = DataFrame(np.arange(16).reshape((4, 4)), - columns=['a', 'b', 8, 'c'], - index=['e', 7, 'f', 'g']) + df = DataFrame( + np.arange(16).reshape((4, 4)), + columns=["a", "b", 8, "c"], + index=["e", 7, "f", "g"], + ) with catch_warnings(record=True): - assert df.ix['e', 8] == 2 - assert df.loc['e', 8] == 2 + assert df.ix["e", 8] == 2 + assert df.loc["e", 8] == 2 with catch_warnings(record=True): - df.ix['e', 8] = 42 - assert df.ix['e', 8] == 42 - assert df.loc['e', 8] == 42 + df.ix["e", 8] = 42 + assert df.ix["e", 8] == 42 + assert df.loc["e", 8] == 42 - df.loc['e', 8] = 45 + df.loc["e", 8] = 45 with catch_warnings(record=True): - assert df.ix['e', 8] == 45 - assert df.loc['e', 8] == 45 + assert df.ix["e", 8] == 45 + assert df.loc["e", 8] == 45 def test_ix_slicing_strings(self): # see gh-3836 - data = {'Classification': - ['SA EQUITY CFD', 'bbb', 'SA EQUITY', 'SA SSF', 'aaa'], - 'Random': [1, 2, 3, 4, 5], - 'X': ['correct', 'wrong', 'correct', 'correct', 'wrong']} + data = { + "Classification": ["SA EQUITY CFD", "bbb", "SA EQUITY", "SA SSF", "aaa"], + "Random": [1, 2, 3, 4, 5], + "X": ["correct", "wrong", "correct", "correct", "wrong"], + } df = DataFrame(data) - x = df[~df.Classification.isin(['SA EQUITY CFD', 'SA EQUITY', 'SA SSF' - ])] + x = df[~df.Classification.isin(["SA EQUITY CFD", "SA EQUITY", "SA SSF"])] with catch_warnings(record=True): - df.ix[x.index, 'X'] = df['Classification'] - - expected = DataFrame({'Classification': {0: 'SA EQUITY CFD', - 1: 'bbb', - 2: 'SA EQUITY', - 3: 'SA SSF', - 4: 'aaa'}, - 'Random': {0: 1, - 1: 2, - 2: 3, - 3: 4, - 4: 5}, - 'X': {0: 'correct', - 1: 'bbb', - 2: 'correct', - 3: 'correct', - 4: 'aaa'}}) # bug was 4: 'bbb' + df.ix[x.index, "X"] = df["Classification"] + + expected = DataFrame( + { + "Classification": { + 0: "SA EQUITY CFD", + 1: "bbb", + 2: "SA EQUITY", + 3: "SA SSF", + 4: "aaa", + }, + "Random": {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, + "X": {0: "correct", 1: "bbb", 2: "correct", 3: "correct", 4: "aaa"}, + } + ) # bug was 4: 'bbb' tm.assert_frame_equal(df, expected) def test_ix_setitem_out_of_bounds_axis_0(self): df = DataFrame( - np.random.randn(2, 5), index=["row%s" % i for i in range(2)], - columns=["col%s" % i for i in range(5)]) + np.random.randn(2, 5), + index=["row%s" % i for i in range(2)], + columns=["col%s" % i for i in range(5)], + ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" with pytest.raises(ValueError, match=msg): @@ -291,8 +302,10 @@ def test_ix_setitem_out_of_bounds_axis_0(self): def test_ix_setitem_out_of_bounds_axis_1(self): df = DataFrame( - np.random.randn(5, 2), index=["row%s" % i for i in range(5)], - columns=["col%s" % i for i in range(2)]) + np.random.randn(5, 2), + index=["row%s" % i for i in range(5)], + columns=["col%s" % i for i in range(2)], + ) with catch_warnings(record=True): msg = "cannot set by positional indexing with enlargement" with pytest.raises(ValueError, match=msg): @@ -301,24 +314,32 @@ def test_ix_setitem_out_of_bounds_axis_1(self): def test_ix_empty_list_indexer_is_ok(self): with catch_warnings(record=True): from pandas.util.testing import 
makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[:, []], + df.iloc[:, :0], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[[], :], + df.iloc[:0, :], + check_index_type=True, + check_column_type=True, + ) # horizontal empty - tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.ix[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_ix_duplicate_returns_series(self): - df = DataFrame(np.random.randn(3, 3), index=[0.1, 0.2, 0.2], - columns=list('abc')) + df = DataFrame( + np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") + ) with catch_warnings(record=True): - r = df.ix[0.2, 'a'] - e = df.loc[0.2, 'a'] + r = df.ix[0.2, "a"] + e = df.loc[0.2, "a"] tm.assert_series_equal(r, e) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2f6e908717071..d749e697c8282 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -13,25 +13,30 @@ class TestLoc(Base): - def test_loc_getitem_dups(self): # GH 5678 # repeated getitems on a dup index returning a ndarray df = DataFrame( - np.random.random_sample((20, 5)), - index=['ABCDE' [x % 5] for x in range(20)]) - expected = df.loc['A', 0] - result = df.loc[:, 0].loc['A'] + np.random.random_sample((20, 5)), index=["ABCDE"[x % 5] for x in range(20)] + ) + expected = df.loc["A", 0] + result = df.loc[:, 0].loc["A"] tm.assert_series_equal(result, expected) def test_loc_getitem_dups2(self): # GH4726 # dup indexing with iloc/loc - df = DataFrame([[1, 2, 'foo', 'bar', Timestamp('20130101')]], - columns=['a', 'a', 'a', 'a', 'a'], index=[1]) - expected = Series([1, 2, 'foo', 'bar', Timestamp('20130101')], - index=['a', 'a', 'a', 'a', 'a'], name=1) + df = DataFrame( + [[1, 2, "foo", "bar", Timestamp("20130101")]], + columns=["a", "a", "a", "a", "a"], + index=[1], + ) + expected = Series( + [1, 2, "foo", "bar", Timestamp("20130101")], + index=["a", "a", "a", "a", "a"], + name=1, + ) result = df.iloc[0] tm.assert_series_equal(result, expected) @@ -43,22 +48,25 @@ def test_loc_setitem_dups(self): # GH 6541 df_orig = DataFrame( - {'me': list('rttti'), - 'foo': list('aaade'), - 'bar': np.arange(5, dtype='float64') * 1.34 + 2, - 'bar2': np.arange(5, dtype='float64') * -.34 + 2}).set_index('me') - - indexer = tuple(['r', ['bar', 'bar2']]) + { + "me": list("rttti"), + "foo": list("aaade"), + "bar": np.arange(5, dtype="float64") * 1.34 + 2, + "bar2": np.arange(5, dtype="float64") * -0.34 + 2, + } + ).set_index("me") + + indexer = tuple(["r", ["bar", "bar2"]]) df = df_orig.copy() df.loc[indexer] *= 2.0 tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) - indexer = tuple(['r', 'bar']) + indexer = tuple(["r", "bar"]) df = df_orig.copy() df.loc[indexer] *= 2.0 assert df.loc[indexer] == 2.0 * df_orig.loc[indexer] - indexer = tuple(['t', ['bar', 'bar2']]) + indexer = tuple(["t", ["bar", "bar2"]]) df = df_orig.copy() df.loc[indexer] *= 2.0 tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) @@ -67,109 +75,193 @@ def test_loc_setitem_slice(self): # GH10503 # assigning the same type should not change the type - df1 = DataFrame({'a': [0, 1, 1], - 'b': 
Series([100, 200, 300], dtype='uint32')}) - ix = df1['a'] == 1 - newb1 = df1.loc[ix, 'b'] + 1 - df1.loc[ix, 'b'] = newb1 - expected = DataFrame({'a': [0, 1, 1], - 'b': Series([100, 201, 301], dtype='uint32')}) + df1 = DataFrame({"a": [0, 1, 1], "b": Series([100, 200, 300], dtype="uint32")}) + ix = df1["a"] == 1 + newb1 = df1.loc[ix, "b"] + 1 + df1.loc[ix, "b"] = newb1 + expected = DataFrame( + {"a": [0, 1, 1], "b": Series([100, 201, 301], dtype="uint32")} + ) tm.assert_frame_equal(df1, expected) # assigning a new type should get the inferred type - df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') - ix = df1['a'] == 1 - newb2 = df2.loc[ix, 'b'] - df1.loc[ix, 'b'] = newb2 - expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, - dtype='uint64') + df2 = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") + ix = df1["a"] == 1 + newb2 = df2.loc[ix, "b"] + df1.loc[ix, "b"] = newb2 + expected = DataFrame({"a": [0, 1, 1], "b": [100, 200, 300]}, dtype="uint64") tm.assert_frame_equal(df2, expected) def test_loc_getitem_int(self): # int label - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['ints', 'uints'], axes=0) - self.check_result('int label', 'loc', 3, 'ix', 3, - typs=['ints', 'uints'], axes=1) - self.check_result('int label', 'loc', 2, 'ix', 2, - typs=['label'], fails=KeyError) + self.check_result( + "int label", "loc", 2, "ix", 2, typs=["ints", "uints"], axes=0 + ) + self.check_result( + "int label", "loc", 3, "ix", 3, typs=["ints", "uints"], axes=1 + ) + self.check_result( + "int label", "loc", 2, "ix", 2, typs=["label"], fails=KeyError + ) def test_loc_getitem_label(self): # label - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['labels'], - axes=0) - self.check_result('label', 'loc', 'null', 'ix', 'null', typs=['mixed'], - axes=0) - self.check_result('label', 'loc', 8, 'ix', 8, typs=['mixed'], axes=0) - self.check_result('label', 'loc', Timestamp('20130102'), 'ix', 1, - typs=['ts'], axes=0) - self.check_result('label', 'loc', 'c', 'ix', 'c', typs=['empty'], - fails=KeyError) + self.check_result("label", "loc", "c", "ix", "c", typs=["labels"], axes=0) + self.check_result("label", "loc", "null", "ix", "null", typs=["mixed"], axes=0) + self.check_result("label", "loc", 8, "ix", 8, typs=["mixed"], axes=0) + self.check_result( + "label", "loc", Timestamp("20130102"), "ix", 1, typs=["ts"], axes=0 + ) + self.check_result( + "label", "loc", "c", "ix", "c", typs=["empty"], fails=KeyError + ) def test_loc_getitem_label_out_of_range(self): # out of range label - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['ints', 'uints', 'labels', 'mixed', 'ts'], - fails=KeyError) - self.check_result('label range', 'loc', 'f', 'ix', 'f', - typs=['floats'], fails=KeyError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['ints', 'uints', 'mixed'], fails=KeyError) - self.check_result('label range', 'loc', 20, 'ix', 20, - typs=['labels'], fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['ts'], - axes=0, fails=TypeError) - self.check_result('label range', 'loc', 20, 'ix', 20, typs=['floats'], - axes=0, fails=KeyError) + self.check_result( + "label range", + "loc", + "f", + "ix", + "f", + typs=["ints", "uints", "labels", "mixed", "ts"], + fails=KeyError, + ) + self.check_result( + "label range", "loc", "f", "ix", "f", typs=["floats"], fails=KeyError + ) + self.check_result( + "label range", + "loc", + 20, + "ix", + 20, + typs=["ints", "uints", "mixed"], + fails=KeyError, + ) + 
self.check_result( + "label range", "loc", 20, "ix", 20, typs=["labels"], fails=TypeError + ) + self.check_result( + "label range", "loc", 20, "ix", 20, typs=["ts"], axes=0, fails=TypeError + ) + self.check_result( + "label range", "loc", 20, "ix", 20, typs=["floats"], axes=0, fails=KeyError + ) def test_loc_getitem_label_list(self): # list of labels - self.check_result('list lbl', 'loc', [0, 2, 4], 'ix', [0, 2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('list lbl', 'loc', [3, 6, 9], 'ix', [3, 6, 9], - typs=['ints', 'uints'], axes=1) - self.check_result('list lbl', 'loc', ['a', 'b', 'd'], 'ix', - ['a', 'b', 'd'], typs=['labels'], axes=0) - self.check_result('list lbl', 'loc', ['A', 'B', 'C'], 'ix', - ['A', 'B', 'C'], typs=['labels'], axes=1) - self.check_result('list lbl', 'loc', [2, 8, 'null'], 'ix', - [2, 8, 'null'], typs=['mixed'], axes=0) - self.check_result('list lbl', 'loc', - [Timestamp('20130102'), Timestamp('20130103')], 'ix', - [Timestamp('20130102'), Timestamp('20130103')], - typs=['ts'], axes=0) + self.check_result( + "list lbl", + "loc", + [0, 2, 4], + "ix", + [0, 2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + [3, 6, 9], + "ix", + [3, 6, 9], + typs=["ints", "uints"], + axes=1, + ) + self.check_result( + "list lbl", + "loc", + ["a", "b", "d"], + "ix", + ["a", "b", "d"], + typs=["labels"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + ["A", "B", "C"], + "ix", + ["A", "B", "C"], + typs=["labels"], + axes=1, + ) + self.check_result( + "list lbl", + "loc", + [2, 8, "null"], + "ix", + [2, 8, "null"], + typs=["mixed"], + axes=0, + ) + self.check_result( + "list lbl", + "loc", + [Timestamp("20130102"), Timestamp("20130103")], + "ix", + [Timestamp("20130102"), Timestamp("20130103")], + typs=["ts"], + axes=0, + ) def test_loc_getitem_label_list_with_missing(self): - self.check_result('list lbl', 'loc', [0, 1, 2], 'indexer', [0, 1, 2], - typs=['empty'], fails=KeyError) + self.check_result( + "list lbl", + "loc", + [0, 1, 2], + "indexer", + [0, 1, 2], + typs=["empty"], + fails=KeyError, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [0, 2, 10], 'ix', [0, 2, 10], - typs=['ints', 'uints', 'floats'], - axes=0, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [0, 2, 10], + "ix", + [0, 2, 10], + typs=["ints", "uints", "floats"], + axes=0, + fails=KeyError, + ) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [3, 6, 7], 'ix', [3, 6, 7], - typs=['ints', 'uints', 'floats'], - axes=1, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [3, 6, 7], + "ix", + [3, 6, 7], + typs=["ints", "uints", "floats"], + axes=1, + fails=KeyError, + ) # GH 17758 - MultiIndex and missing keys with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.check_result('list lbl', 'loc', [(1, 3), (1, 4), (2, 5)], - 'ix', [(1, 3), (1, 4), (2, 5)], - typs=['multi'], - axes=0) + self.check_result( + "list lbl", + "loc", + [(1, 3), (1, 4), (2, 5)], + "ix", + [(1, 3), (1, 4), (2, 5)], + typs=["multi"], + axes=0, + ) def test_getitem_label_list_with_missing(self): - s = Series(range(3), index=['a', 'b', 'c']) + s = Series(range(3), index=["a", "b", "c"]) # consistency with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s[['a', 'd']] + s[["a", "d"]] s = Series(range(3)) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -177,53 +269,95 @@ 
def test_getitem_label_list_with_missing(self): def test_loc_getitem_label_list_fails(self): # fails - self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], - typs=['ints', 'uints'], axes=1, fails=KeyError) + self.check_result( + "list lbl", + "loc", + [20, 30, 40], + "ix", + [20, 30, 40], + typs=["ints", "uints"], + axes=1, + fails=KeyError, + ) def test_loc_getitem_label_array_like(self): # array like - self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, - 'ix', [0, 2, 4], typs=['ints', 'uints'], axes=0) - self.check_result('array like', 'loc', Series(index=[3, 6, 9]).index, - 'ix', [3, 6, 9], typs=['ints', 'uints'], axes=1) + self.check_result( + "array like", + "loc", + Series(index=[0, 2, 4]).index, + "ix", + [0, 2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "array like", + "loc", + Series(index=[3, 6, 9]).index, + "ix", + [3, 6, 9], + typs=["ints", "uints"], + axes=1, + ) def test_loc_getitem_bool(self): # boolean indexers b = [True, False, True, False] - self.check_result('bool', 'loc', b, 'ix', b, - typs=['ints', 'uints', 'labels', - 'mixed', 'ts', 'floats']) - self.check_result('bool', 'loc', b, 'ix', b, typs=['empty'], - fails=IndexError) - - @pytest.mark.parametrize('index', [[True, False], - [True, False, True, False]]) + self.check_result( + "bool", + "loc", + b, + "ix", + b, + typs=["ints", "uints", "labels", "mixed", "ts", "floats"], + ) + self.check_result("bool", "loc", b, "ix", b, typs=["empty"], fails=IndexError) + + @pytest.mark.parametrize("index", [[True, False], [True, False, True, False]]) def test_loc_getitem_bool_diff_len(self, index): # GH26658 s = Series([1, 2, 3]) - with pytest.raises(IndexError, - match=('Item wrong length {} instead of {}.'.format( - len(index), len(s)))): + with pytest.raises( + IndexError, + match=("Item wrong length {} instead of {}.".format(len(index), len(s))), + ): _ = s.loc[index] def test_loc_getitem_int_slice(self): # ok - self.check_result('int slice2', 'loc', slice(2, 4), 'ix', [2, 4], - typs=['ints', 'uints'], axes=0) - self.check_result('int slice2', 'loc', slice(3, 6), 'ix', [3, 6], - typs=['ints', 'uints'], axes=1) + self.check_result( + "int slice2", + "loc", + slice(2, 4), + "ix", + [2, 4], + typs=["ints", "uints"], + axes=0, + ) + self.check_result( + "int slice2", + "loc", + slice(3, 6), + "ix", + [3, 6], + typs=["ints", "uints"], + axes=1, + ) def test_loc_to_fail(self): # GH3449 - df = DataFrame(np.random.random((3, 3)), - index=['a', 'b', 'c'], - columns=['e', 'f', 'g']) + df = DataFrame( + np.random.random((3, 3)), index=["a", "b", "c"], columns=["e", "f", "g"] + ) # raise a KeyError? 
- msg = (r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): df.loc[[1, 2], [1, 2]] @@ -232,45 +366,51 @@ def test_loc_to_fail(self): s = Series() s.loc[1] = 1 - s.loc['a'] = 2 + s.loc["a"] = 2 with pytest.raises(KeyError, match=r"^-1$"): s.loc[-1] - msg = (r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] - msg = (r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): - s.loc[['4']] + s.loc[["4"]] s.loc[-1] = 3 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[-1, -2]] expected = Series([3, np.nan], index=[-1, -2]) tm.assert_series_equal(result, expected) - s['a'] = 2 - msg = (r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" - r" in the \[index\]\"") + s["a"] = 2 + msg = ( + r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): s.loc[[-2]] - del s['a'] + del s["a"] with pytest.raises(KeyError, match=msg): s.loc[[-2]] = 0 # inconsistency between .loc[values] and .loc[values,:] # GH 7999 - df = DataFrame([['a'], ['b']], index=[1, 2], columns=['value']) + df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) - msg = (r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): df.loc[[3], :] @@ -297,35 +437,98 @@ def test_loc_getitem_list_with_fail(self): def test_loc_getitem_label_slice(self): # label slices (with ints) - self.check_result('lab slice', 'loc', slice(1, 3), - 'ix', slice(1, 3), - typs=['labels', 'mixed', 'empty', 'ts', 'floats'], - fails=TypeError) + self.check_result( + "lab slice", + "loc", + slice(1, 3), + "ix", + slice(1, 3), + typs=["labels", "mixed", "empty", "ts", "floats"], + fails=TypeError, + ) # real label slices - self.check_result('lab slice', 'loc', slice('a', 'c'), - 'ix', slice('a', 'c'), typs=['labels'], axes=0) - self.check_result('lab slice', 'loc', slice('A', 'C'), - 'ix', slice('A', 'C'), typs=['labels'], axes=1) - - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=0) - self.check_result('ts slice', 'loc', slice('20130102', '20130104'), - 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=1, fails=TypeError) + self.check_result( + "lab slice", + "loc", + slice("a", "c"), + "ix", + slice("a", "c"), + typs=["labels"], + axes=0, + ) + self.check_result( + "lab slice", + "loc", + slice("A", "C"), + "ix", + slice("A", "C"), + typs=["labels"], + axes=1, + ) + + self.check_result( + "ts slice", + "loc", + slice("20130102", "20130104"), + "ix", + slice("20130102", "20130104"), + typs=["ts"], + axes=0, + ) + self.check_result( + "ts slice", + "loc", + slice("20130102", "20130104"), + "ix", + slice("20130102", "20130104"), + typs=["ts"], + axes=1, + fails=TypeError, + ) # GH 14316 - self.check_result('ts 
slice rev', 'loc', slice('20130104', '20130102'), - 'indexer', [0, 1, 2], typs=['ts_rev'], axes=0) - - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=0, fails=TypeError) - self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), - typs=['mixed'], axes=1, fails=KeyError) - - self.check_result('mixed slice', 'loc', slice(2, 4, 2), 'ix', slice( - 2, 4, 2), typs=['mixed'], axes=0, fails=TypeError) + self.check_result( + "ts slice rev", + "loc", + slice("20130104", "20130102"), + "indexer", + [0, 1, 2], + typs=["ts_rev"], + axes=0, + ) + + self.check_result( + "mixed slice", + "loc", + slice(2, 8), + "ix", + slice(2, 8), + typs=["mixed"], + axes=0, + fails=TypeError, + ) + self.check_result( + "mixed slice", + "loc", + slice(2, 8), + "ix", + slice(2, 8), + typs=["mixed"], + axes=1, + fails=KeyError, + ) + + self.check_result( + "mixed slice", + "loc", + slice(2, 4, 2), + "ix", + slice(2, 4, 2), + typs=["mixed"], + axes=0, + fails=TypeError, + ) def test_loc_index(self): # gh-17131 @@ -333,7 +536,8 @@ def test_loc_index(self): df = DataFrame( np.random.random(size=(5, 10)), - index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"]) + index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"], + ) mask = df.index.map(lambda x: "alpha" in x) expected = df.loc[np.array(mask)] @@ -347,74 +551,106 @@ def test_loc_index(self): def test_loc_general(self): df = DataFrame( - np.random.rand(4, 4), columns=['A', 'B', 'C', 'D'], - index=['A', 'B', 'C', 'D']) + np.random.rand(4, 4), + columns=["A", "B", "C", "D"], + index=["A", "B", "C", "D"], + ) # want this to work result = df.loc[:, "A":"B"].iloc[0:2, :] - assert (result.columns == ['A', 'B']).all() - assert (result.index == ['A', 'B']).all() + assert (result.columns == ["A", "B"]).all() + assert (result.index == ["A", "B"]).all() # mixed type - result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] - expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) + result = DataFrame({"a": [Timestamp("20130101")], "b": [1]}).iloc[0] + expected = Series([Timestamp("20130101"), 1], index=["a", "b"], name=0) tm.assert_series_equal(result, expected) assert result.dtype == object def test_loc_setitem_consistency(self): # GH 6149 # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame({'date': Series(0, index=range(5), - dtype=np.int64), - 'val': Series(range(5), dtype=np.int64)}) - - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 0 + expected = DataFrame( + { + "date": Series(0, index=range(5), dtype=np.int64), + "val": Series(range(5), dtype=np.int64), + } + ) + + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 0 tm.assert_frame_equal(df, expected) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array(0, dtype=np.int64) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array(0, dtype=np.int64) tm.assert_frame_equal(df, expected) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": 
Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = np.array([0, 0, 0, 0, 0], dtype=np.int64) tm.assert_frame_equal(df, expected) - expected = DataFrame({'date': Series('foo', index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 'foo' + expected = DataFrame( + { + "date": Series("foo", index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) - expected = DataFrame({'date': Series(1.0, index=range(5)), - 'val': Series(range(5), dtype=np.int64)}) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series(range(5), dtype=np.int64)}) - df.loc[:, 'date'] = 1.0 + expected = DataFrame( + { + "date": Series(1.0, index=range(5)), + "val": Series(range(5), dtype=np.int64), + } + ) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) # GH 15494 # setting on frame with single row - df = DataFrame({'date': Series([Timestamp('20180101')])}) - df.loc[:, 'date'] = 'string' - expected = DataFrame({'date': Series(['string'])}) + df = DataFrame({"date": Series([Timestamp("20180101")])}) + df.loc[:, "date"] = "string" + expected = DataFrame({"date": Series(["string"])}) tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_empty(self): # empty (essentially noops) - expected = DataFrame(columns=['x', 'y']) - expected['x'] = expected['x'].astype(np.int64) - df = DataFrame(columns=['x', 'y']) - df.loc[:, 'x'] = 1 + expected = DataFrame(columns=["x", "y"]) + expected["x"] = expected["x"].astype(np.int64) + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = 1 tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['x', 'y']) - df['x'] = 1 + df = DataFrame(columns=["x", "y"]) + df["x"] = 1 tm.assert_frame_equal(df, expected) def test_loc_setitem_consistency_slice_column_len(self): @@ -429,130 +665,147 @@ def test_loc_setitem_consistency_slice_column_len(self): Region_1,Site_2,3977723089,A,5/20/2015 8:33,5/20/2015 9:09,Yes,No""" df = pd.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1, 2]) - df.loc[:, ('Respondent', 'StartDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'StartDate')]) - df.loc[:, ('Respondent', 'EndDate')] = pd.to_datetime(df.loc[:, ( - 'Respondent', 'EndDate')]) - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'EndDate')] - df.loc[:, ('Respondent', 'StartDate')] - - df.loc[:, ('Respondent', 'Duration')] = df.loc[:, ( - 'Respondent', 'Duration')].astype('timedelta64[s]') - expected = Series([1380, 720, 840, 2160.], index=df.index, - name=('Respondent', 'Duration')) - tm.assert_series_equal(df[('Respondent', 'Duration')], expected) + df.loc[:, ("Respondent", "StartDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + df.loc[:, ("Respondent", "EndDate")] = pd.to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + df.loc[:, ("Respondent", "Duration")] = ( + df.loc[:, ("Respondent", "EndDate")] + - df.loc[:, ("Respondent", "StartDate")] + ) + + df.loc[:, ("Respondent", "Duration")] = df.loc[ + :, ("Respondent", "Duration") + ].astype("timedelta64[s]") + expected = Series( + [1380, 720, 840, 2160.0], index=df.index, name=("Respondent", 
"Duration") + ) + tm.assert_series_equal(df[("Respondent", "Duration")], expected) def test_loc_setitem_frame(self): df = self.frame_labels result = df.iloc[0, 0] - df.loc['a', 'A'] = 1 - result = df.loc['a', 'A'] + df.loc["a", "A"] = 1 + result = df.loc["a", "A"] assert result == 1 result = df.iloc[0, 0] assert result == 1 - df.loc[:, 'B':'D'] = 0 - expected = df.loc[:, 'B':'D'] + df.loc[:, "B":"D"] = 0 + expected = df.loc[:, "B":"D"] result = df.iloc[:, 1:] tm.assert_frame_equal(result, expected) # GH 6254 # setting issue - df = DataFrame(index=[3, 5, 4], columns=['A']) - df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') - expected = DataFrame(dict(A=Series( - [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) + df = DataFrame(index=[3, 5, 4], columns=["A"]) + df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") + expected = DataFrame(dict(A=Series([1, 2, 3], index=[4, 3, 5]))).reindex( + index=[3, 5, 4] + ) tm.assert_frame_equal(df, expected) # GH 6252 # setting with an empty frame - keys1 = ['@' + str(i) for i in range(5)] - val1 = np.arange(5, dtype='int64') + keys1 = ["@" + str(i) for i in range(5)] + val1 = np.arange(5, dtype="int64") - keys2 = ['@' + str(i) for i in range(4)] - val2 = np.arange(4, dtype='int64') + keys2 = ["@" + str(i) for i in range(4)] + val2 = np.arange(4, dtype="int64") index = list(set(keys1).union(keys2)) df = DataFrame(index=index) - df['A'] = np.nan - df.loc[keys1, 'A'] = val1 + df["A"] = np.nan + df.loc[keys1, "A"] = val1 - df['B'] = np.nan - df.loc[keys2, 'B'] = val2 + df["B"] = np.nan + df.loc[keys2, "B"] = val2 - expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( - val2, index=keys2))).reindex(index=index) + expected = DataFrame( + dict(A=Series(val1, index=keys1), B=Series(val2, index=keys2)) + ).reindex(index=index) tm.assert_frame_equal(df, expected) # GH 8669 # invalid coercion of nan -> int - df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - df.loc[df.B > df.A, 'B'] = df.A - expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) + df = DataFrame({"A": [1, 2, 3], "B": np.nan}) + df.loc[df.B > df.A, "B"] = df.A + expected = DataFrame({"A": [1, 2, 3], "B": np.nan}) tm.assert_frame_equal(df, expected) # GH 6546 # setting with mixed labels - df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']}) + df = DataFrame({1: [1, 2], 2: [3, 4], "a": ["a", "b"]}) result = df.loc[0, [1, 2]] expected = Series([1, 3], index=[1, 2], dtype=object, name=0) tm.assert_series_equal(result, expected) - expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) + expected = DataFrame({1: [5, 2], 2: [6, 4], "a": ["a", "b"]}) df.loc[0, [1, 2]] = [5, 6] tm.assert_frame_equal(df, expected) def test_loc_setitem_frame_multiples(self): # multiple setting - df = DataFrame({'A': ['foo', 'bar', 'baz'], - 'B': Series( - range(3), dtype=np.int64)}) + df = DataFrame( + {"A": ["foo", "bar", "baz"], "B": Series(range(3), dtype=np.int64)} + ) rhs = df.loc[1:2] rhs.index = df.index[0:2] df.loc[0:1] = rhs - expected = DataFrame({'A': ['bar', 'baz', 'baz'], - 'B': Series( - [1, 2, 2], dtype=np.int64)}) + expected = DataFrame( + {"A": ["bar", "baz", "baz"], "B": Series([1, 2, 2], dtype=np.int64)} + ) tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) - df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) - expected = DataFrame({'date': [Timestamp('20000101'), Timestamp( - '20000102'), Timestamp('20000101'), Timestamp('20000102'), - Timestamp('20000103')], - 'val': Series( - 
[0, 1, 0, 1, 2], dtype=np.int64)}) + df = DataFrame( + { + "date": date_range("2000-01-01", "2000-01-5"), + "val": Series(range(5), dtype=np.int64), + } + ) + expected = DataFrame( + { + "date": [ + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000101"), + Timestamp("20000102"), + Timestamp("20000103"), + ], + "val": Series([0, 1, 0, 1, 2], dtype=np.int64), + } + ) rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( - 'indexer', [['A'], slice(None, 'A', None), np.array(['A'])]) - @pytest.mark.parametrize( - 'value', [['Z'], np.array(['Z'])]) + "indexer", [["A"], slice(None, "A", None), np.array(["A"])] + ) + @pytest.mark.parametrize("value", [["Z"], np.array(["Z"])]) def test_loc_setitem_with_scalar_index(self, indexer, value): # GH #19474 # assigning like "df.loc[0, ['A']] = ['Z']" should be evaluated # elementwisely, not using "setter('A', ['Z'])". - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) df.loc[0, indexer] = value - result = df.loc[0, 'A'] + result = df.loc[0, "A"] - assert is_scalar(result) and result == 'Z' + assert is_scalar(result) and result == "Z" def test_loc_coerceion(self): # 12411 - df = DataFrame({'date': [Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) + df = DataFrame({"date": [Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) expected = df.dtypes result = df.iloc[[0]] @@ -563,8 +816,10 @@ def test_loc_coerceion(self): # 12045 import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) expected = df.dtypes result = df.iloc[[0]] @@ -574,7 +829,7 @@ def test_loc_coerceion(self): tm.assert_series_equal(result.dtypes, expected) # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) expected = df.dtypes result = df.iloc[0:2] @@ -589,8 +844,9 @@ def test_loc_non_unique(self): # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs # these are going to raise because the we are non monotonic - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ) msg = "'Cannot get left slice bound for non-unique label: 1'" with pytest.raises(KeyError, match=msg): df.loc[1:] @@ -602,20 +858,18 @@ def test_loc_non_unique(self): df.loc[1:2] # monotonic are ok - df = DataFrame({'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, - index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0) + df = DataFrame( + {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3] + ).sort_index(axis=0) result = df.loc[1:] - expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]}, - index=[1, 1, 2, 3]) + expected = DataFrame({"A": [2, 4, 5, 6], "B": [4, 6, 7, 8]}, index=[1, 1, 2, 3]) tm.assert_frame_equal(result, expected) result = df.loc[0:] tm.assert_frame_equal(result, df) result = df.loc[1:2] - expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, - index=[1, 1, 2]) + expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2]) tm.assert_frame_equal(result, expected) def test_loc_non_unique_memory_error(self): @@ -623,22 +877,35 @@ def test_loc_non_unique_memory_error(self): # GH 4280 # non_unique index with a large selection triggers a memory error - columns 
= list('ABCDEFG') + columns = list("ABCDEFG") def gen_test(l, l2): - return pd.concat([ - DataFrame(np.random.randn(l, len(columns)), - index=np.arange(l), columns=columns), - DataFrame(np.ones((l2, len(columns))), - index=[0] * l2, columns=columns)]) + return pd.concat( + [ + DataFrame( + np.random.randn(l, len(columns)), + index=np.arange(l), + columns=columns, + ), + DataFrame( + np.ones((l2, len(columns))), index=[0] * l2, columns=columns + ), + ] + ) def gen_expected(df, mask): len_mask = len(mask) - return pd.concat([df.take([0]), - DataFrame(np.ones((len_mask, len(columns))), - index=[0] * len_mask, - columns=columns), - df.take(mask[1:])]) + return pd.concat( + [ + df.take([0]), + DataFrame( + np.ones((len_mask, len(columns))), + index=[0] * len_mask, + columns=columns, + ), + df.take(mask[1:]), + ] + ) df = gen_test(900, 100) assert df.index.is_unique is False @@ -659,42 +926,45 @@ def gen_expected(df, mask): def test_loc_name(self): # GH 3880 df = DataFrame([[1, 1], [1, 1]]) - df.index.name = 'index_name' + df.index.name = "index_name" result = df.iloc[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" with catch_warnings(record=True): filterwarnings("ignore", "\\n.ix", FutureWarning) result = df.ix[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" result = df.loc[[0, 1]].index.name - assert result == 'index_name' + assert result == "index_name" def test_loc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) # vertical empty - tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.loc[:, []], df.iloc[:, :0], check_index_type=True, check_column_type=True + ) # horizontal empty - tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal( + df.loc[[], :], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) # horizontal empty - tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], - check_index_type=True, - check_column_type=True) + tm.assert_frame_equal( + df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True + ) def test_identity_slice_returns_new_object(self): # GH13873 - original_df = DataFrame({'a': [1, 2, 3]}) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.loc[:] assert sliced_df is not original_df assert original_df[:] is not original_df # should be a shallow copy - original_df['a'] = [4, 4, 4] - assert (sliced_df['a'] == 4).all() + original_df["a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] @@ -713,53 +983,53 @@ def test_identity_slice_returns_new_object(self): def test_loc_uint64(self): # GH20722 # Test whether loc accept uint64 max value as index. 
- s = pd.Series([1, 2], - index=[np.iinfo('uint64').max - 1, - np.iinfo('uint64').max]) + s = pd.Series( + [1, 2], index=[np.iinfo("uint64").max - 1, np.iinfo("uint64").max] + ) - result = s.loc[np.iinfo('uint64').max - 1] + result = s.loc[np.iinfo("uint64").max - 1] expected = s.iloc[0] assert result == expected - result = s.loc[[np.iinfo('uint64').max - 1]] + result = s.loc[[np.iinfo("uint64").max - 1]] expected = s.iloc[[0]] tm.assert_series_equal(result, expected) - result = s.loc[[np.iinfo('uint64').max - 1, - np.iinfo('uint64').max]] + result = s.loc[[np.iinfo("uint64").max - 1, np.iinfo("uint64").max]] tm.assert_series_equal(result, s) def test_loc_setitem_empty_append(self): # GH6173, various appends to an empty dataframe data = [1, 2, 3] - expected = DataFrame({'x': data, 'y': [None] * len(data)}) + expected = DataFrame({"x": data, "y": [None] * len(data)}) # appends to fit length of data - df = DataFrame(columns=['x', 'y']) - df.loc[:, 'x'] = data + df = DataFrame(columns=["x", "y"]) + df.loc[:, "x"] = data tm.assert_frame_equal(df, expected) # only appends one value - expected = DataFrame({'x': [1.0], 'y': [np.nan]}) - df = DataFrame(columns=['x', 'y'], - dtype=np.float) - df.loc[0, 'x'] = expected.loc[0, 'x'] + expected = DataFrame({"x": [1.0], "y": [np.nan]}) + df = DataFrame(columns=["x", "y"], dtype=np.float) + df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe data = [1, 2] - df = DataFrame(columns=['x', 'y']) - msg = (r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " - r"are in the \[index\]") + df = DataFrame(columns=["x", "y"]) + msg = ( + r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " + r"are in the \[index\]" + ) with pytest.raises(KeyError, match=msg): - df.loc[[0, 1], 'x'] = data + df.loc[[0, 1], "x"] = data msg = "cannot copy sequence with size 2 to array axis with dimension 0" with pytest.raises(ValueError, match=msg): - df.loc[0:2, 'x'] = data + df.loc[0:2, "x"] = data def test_indexing_zerodim_np_array(self): # GH24924 diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index f2696f282c2c4..68e93f06e43dc 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -15,7 +15,6 @@ class TestPartialSetting: - @pytest.mark.filterwarnings("ignore:\\n.ix:FutureWarning") def test_partial_setting(self): @@ -35,100 +34,101 @@ def test_partial_setting(self): tm.assert_series_equal(s, expected) s = s_orig.copy() - s[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) + s[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) tm.assert_series_equal(s, expected) s = s_orig.copy() - s.loc[5] = 5. - expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) + s.loc[5] = 5.0 + expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5]) tm.assert_series_equal(s, expected) # iloc/iat raise s = s_orig.copy() with pytest.raises(IndexError): - s.iloc[3] = 5. + s.iloc[3] = 5.0 with pytest.raises(IndexError): - s.iat[3] = 5. + s.iat[3] = 5.0 # ## frame ## df_orig = DataFrame( - np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64') + np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64" + ) # iloc/iat raise df = df_orig.copy() with pytest.raises(IndexError): - df.iloc[4, 2] = 5. + df.iloc[4, 2] = 5.0 with pytest.raises(IndexError): - df.iat[4, 2] = 5. 
+ df.iat[4, 2] = 5.0 # row setting where it exists - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) df = df_orig.copy() df.iloc[1] = df.iloc[2] tm.assert_frame_equal(df, expected) - expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) + expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]})) df = df_orig.copy() df.loc[1] = df.loc[2] tm.assert_frame_equal(df, expected) # like 2578, partial setting with dtype preservation - expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) + expected = DataFrame(dict({"A": [0, 2, 4, 4], "B": [1, 3, 5, 5]})) df = df_orig.copy() df.loc[3] = df.loc[2] tm.assert_frame_equal(df, expected) # single dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) + expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]})) df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'B'] = df.ix[:, 'A'] + df.ix[:, "B"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite - expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) + expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])})) df = df_orig.copy() - df['B'] = df['B'].astype(np.float64) + df["B"] = df["B"].astype(np.float64) with catch_warnings(record=True): - df.ix[:, 'B'] = df.ix[:, 'A'] + df.ix[:, "B"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # single dtype frame, partial setting expected = df_orig.copy() - expected['C'] = df['A'] + expected["C"] = df["A"] df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'C'] = df.ix[:, 'A'] + df.ix[:, "C"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # mixed frame, partial setting expected = df_orig.copy() - expected['C'] = df['A'] + expected["C"] = df["A"] df = df_orig.copy() with catch_warnings(record=True): - df.ix[:, 'C'] = df.ix[:, 'A'] + df.ix[:, "C"] = df.ix[:, "A"] tm.assert_frame_equal(df, expected) # GH 8473 - dates = date_range('1/1/2000', periods=8) - df_orig = DataFrame(np.random.randn(8, 4), index=dates, - columns=['A', 'B', 'C', 'D']) - - expected = pd.concat([df_orig, - DataFrame({'A': 7}, - index=[dates[-1] + dates.freq])], - sort=True) + dates = date_range("1/1/2000", periods=8) + df_orig = DataFrame( + np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"] + ) + + expected = pd.concat( + [df_orig, DataFrame({"A": 7}, index=[dates[-1] + dates.freq])], sort=True + ) df = df_orig.copy() - df.loc[dates[-1] + dates.freq, 'A'] = 7 + df.loc[dates[-1] + dates.freq, "A"] = 7 tm.assert_frame_equal(df, expected) df = df_orig.copy() - df.at[dates[-1] + dates.freq, 'A'] = 7 + df.at[dates[-1] + dates.freq, "A"] = 7 tm.assert_frame_equal(df, expected) exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq]) @@ -155,30 +155,28 @@ def test_partial_setting_mixed_dtype(self): tm.assert_frame_equal(df, expected) # columns will align - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[0] = Series(1, index=range(4)) - tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) + tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) # columns will align - df = DataFrame(columns=['A', 'B']) - df.loc[0] = Series(1, index=['B']) + df = DataFrame(columns=["A", "B"]) + df.loc[0] = Series(1, index=["B"]) - exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], - index=[0], dtype='float64') + exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") tm.assert_frame_equal(df, exp) # 
list-like must conform - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) with pytest.raises(ValueError): df.loc[0] = [1, 2, 3] # TODO: #15657, these are left as object and not coerced - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[3] = [6, 7] - exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], - dtype='object') + exp = DataFrame([[6, 7]], index=[3], columns=["A", "B"], dtype="object") tm.assert_frame_equal(df, exp) def test_series_partial_set(self): @@ -195,29 +193,31 @@ def test_series_partial_set(self): result = ser.reindex([3, 2, 3]) tm.assert_series_equal(result, expected, check_index_type=True) - expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) + expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, "x"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3, 'x']] + result = ser.loc[[3, 2, 3, "x"]] tm.assert_series_equal(result, expected, check_index_type=True) - result = ser.reindex([3, 2, 3, 'x']) + result = ser.reindex([3, 2, 3, "x"]) tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) result = ser.loc[[2, 2, 1]] tm.assert_series_equal(result, expected, check_index_type=True) - expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) + expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, "x", 1]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 'x', 1]] + result = ser.loc[[2, 2, "x", 1]] tm.assert_series_equal(result, expected, check_index_type=True) - result = ser.reindex([2, 2, 'x', 1]) + result = ser.reindex([2, 2, "x", 1]) tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index - msg = (r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" - r" in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" + r" in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] @@ -238,8 +238,7 @@ def test_series_partial_set(self): result = s.reindex([3, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[5, 3, 3]] @@ -248,8 +247,7 @@ def test_series_partial_set(self): result = s.reindex([5, 3, 3]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[5, 4, 4]] @@ -258,8 +256,7 @@ def test_series_partial_set(self): result = s.reindex([5, 4, 4]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[4, 5, 6, 7]) + s = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[7, 2, 2]] @@ -268,8 +265,7 @@ def test_series_partial_set(self): result = s.reindex([7, 2, 2]) tm.assert_series_equal(result, expected, check_index_type=True) - s = Series([0.1, 0.2, 0.3, 0.4], - index=[1, 2, 3, 4]) + s = 
Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s.loc[[4, 5, 5]] @@ -286,90 +282,85 @@ def test_series_partial_set(self): def test_series_partial_set_with_name(self): # GH 11497 - idx = Index([1, 2], dtype='int64', name='idx') - ser = Series([0.1, 0.2], index=idx, name='s') + idx = Index([1, 2], dtype="int64", name="idx") + ser = Series([0.1, 0.2], index=idx, name="s") # loc - exp_idx = Index([3, 2, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') + exp_idx = Index([3, 2, 3], dtype="int64", name="idx") + expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[[3, 2, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') - expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, - name='s') + exp_idx = Index([3, 2, 3, "x"], dtype="object", name="idx") + expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[3, 2, 3, 'x']] + result = ser.loc[[3, 2, 3, "x"]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([2, 2, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1], index=exp_idx, name="s") result = ser.loc[[2, 2, 1]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') - expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, "x", 1], dtype="object", name="idx") + expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = ser.loc[[2, 2, 'x', 1]] + result = ser.loc[[2, 2, "x", 1]] tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index - msg = (r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," - r" name='idx'\)\] are in the \[index\]\"") + msg = ( + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," + r" name='idx'\)\] are in the \[index\]\"" + ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] - exp_idx = Index([2, 2, 3], dtype='int64', name='idx') - expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') + exp_idx = Index([2, 2, 3], dtype="int64", name="idx") + expected = Series([0.2, 0.2, np.nan], index=exp_idx, name="s") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.loc[[2, 2, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([3, 4, 4], dtype='int64', name='idx') - expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3], dtype='int64', name='idx') + exp_idx = Index([3, 4, 4], dtype="int64", name="idx") + expected = Series([0.3, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([1, 2, 3], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3], - index=idx, - name='s').loc[[3, 4, 4]] + result = Series([0.1, 0.2, 0.3], index=idx, name="s").loc[[3, 4, 4]] tm.assert_series_equal(result, expected, 
check_index_type=True) - exp_idx = Index([5, 3, 3], dtype='int64', name='idx') - expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([5, 3, 3], dtype="int64", name="idx") + expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 3, 3]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 3, 3]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([5, 4, 4], dtype='int64', name='idx') - expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([5, 4, 4], dtype="int64", name="idx") + expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[5, 4, 4]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[5, 4, 4]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([7, 2, 2], dtype='int64', name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([4, 5, 6, 7], dtype='int64', name='idx') + exp_idx = Index([7, 2, 2], dtype="int64", name="idx") + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([4, 5, 6, 7], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[7, 2, 2]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[7, 2, 2]] tm.assert_series_equal(result, expected, check_index_type=True) - exp_idx = Index([4, 5, 5], dtype='int64', name='idx') - expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') - idx = Index([1, 2, 3, 4], dtype='int64', name='idx') + exp_idx = Index([4, 5, 5], dtype="int64", name="idx") + expected = Series([0.4, np.nan, np.nan], index=exp_idx, name="s") + idx = Index([1, 2, 3, 4], dtype="int64", name="idx") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = Series([0.1, 0.2, 0.3, 0.4], index=idx, - name='s').loc[[4, 5, 5]] + result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name="s").loc[[4, 5, 5]] tm.assert_series_equal(result, expected, check_index_type=True) # iloc - exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') - expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') + exp_idx = Index([2, 2, 1, 1], dtype="int64", name="idx") + expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name="s") result = ser.iloc[[1, 1, 0, 0]] tm.assert_series_equal(result, expected, check_index_type=True) @@ -402,11 +393,11 @@ def test_partial_set_invalid(self): # allow object conversion here df = orig.copy() with catch_warnings(record=True): - df.loc['a', :] = df.ix[0] - exp = orig.append(Series(df.ix[0], name='a')) + df.loc["a", :] = df.ix[0] + exp = orig.append(Series(df.ix[0], name="a")) tm.assert_frame_equal(df, exp) - tm.assert_index_equal(df.index, Index(orig.index.tolist() + ['a'])) - assert df.index.dtype == 'object' + tm.assert_index_equal(df.index, Index(orig.index.tolist() + ["a"])) + assert df.index.dtype == "object" def test_partial_set_empty_series(self): @@ -420,18 +411,18 
@@ def test_partial_set_empty_series(self): tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) s = Series() - s.loc[1] = 1. - tm.assert_series_equal(s, Series([1.], index=[1])) - s.loc[3] = 3. - tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) + s.loc[1] = 1.0 + tm.assert_series_equal(s, Series([1.0], index=[1])) + s.loc[3] = 3.0 + tm.assert_series_equal(s, Series([1.0, 3.0], index=[1, 3])) s = Series() - s.loc['foo'] = 1 - tm.assert_series_equal(s, Series([1], index=['foo'])) - s.loc['bar'] = 3 - tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) + s.loc["foo"] = 1 + tm.assert_series_equal(s, Series([1], index=["foo"])) + s.loc["bar"] = 3 + tm.assert_series_equal(s, Series([1, 3], index=["foo", "bar"])) s.loc[3] = 4 - tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + tm.assert_series_equal(s, Series([1, 3, 4], index=["foo", "bar", 3])) def test_partial_set_empty_frame(self): @@ -443,7 +434,7 @@ def test_partial_set_empty_frame(self): df.loc[1] = 1 with pytest.raises(ValueError): - df.loc[1] = Series([1], index=['foo']) + df.loc[1] = Series([1], index=["foo"]) with pytest.raises(ValueError): df.loc[:, 1] = 1 @@ -451,75 +442,74 @@ def test_partial_set_empty_frame(self): # these work as they don't really change # anything but the index # GH5632 - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) def f(): df = DataFrame() - df['foo'] = Series([], dtype='object') + df["foo"] = Series([], dtype="object") return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = Series(df.index) + df["foo"] = Series(df.index) return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = df.index + df["foo"] = df.index return df tm.assert_frame_equal(f(), expected) - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") def f(): df = DataFrame() - df['foo'] = [] + df["foo"] = [] return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - df['foo'] = Series(np.arange(len(df)), dtype='float64') + df["foo"] = Series(np.arange(len(df)), dtype="float64") return df tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() - tm.assert_index_equal(df.index, Index([], dtype='object')) - df['foo'] = range(len(df)) + tm.assert_index_equal(df.index, Index([], dtype="object")) + df["foo"] = range(len(df)) return df - expected = DataFrame(columns=['foo'], index=Index([], dtype='int64')) - expected['foo'] = expected['foo'].astype('float64') + expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected["foo"] = expected["foo"].astype("float64") tm.assert_frame_equal(f(), expected) df = DataFrame() tm.assert_index_equal(df.columns, Index([], dtype=object)) df2 = DataFrame() - df2[1] = Series([1], index=['foo']) - df.loc[:, 1] = Series([1], index=['foo']) - tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) + df2[1] = Series([1], index=["foo"]) + df.loc[:, 1] = Series([1], index=["foo"]) + tm.assert_frame_equal(df, DataFrame([[1]], index=["foo"], columns=[1])) tm.assert_frame_equal(df, df2) # no index to start - expected = DataFrame({0: Series(1, index=range(4))}, - columns=['A', 'B', 0]) + expected = DataFrame({0: Series(1, index=range(4))}, columns=["A", "B", 0]) - df = 
DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df[0] = Series(1, index=range(4)) df.dtypes str(df) tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['A', 'B']) + df = DataFrame(columns=["A", "B"]) df.loc[:, 0] = Series(1, index=range(4)) df.dtypes str(df) @@ -528,34 +518,32 @@ def f(): def test_partial_set_empty_frame_row(self): # GH5720, GH5744 # don't create rows when empty - expected = DataFrame(columns=['A', 'B', 'New'], - index=Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['New'] = expected['New'].astype('float64') + expected = DataFrame(columns=["A", "B", "New"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["New"] = expected["New"].astype("float64") df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] - y['New'] = np.nan + y["New"] = np.nan tm.assert_frame_equal(y, expected) # tm.assert_frame_equal(y,expected) - expected = DataFrame(columns=['a', 'b', 'c c', 'd']) - expected['d'] = expected['d'].astype('int64') - df = DataFrame(columns=['a', 'b', 'c c']) - df['d'] = 3 + expected = DataFrame(columns=["a", "b", "c c", "d"]) + expected["d"] = expected["d"].astype("int64") + df = DataFrame(columns=["a", "b", "c c"]) + df["d"] = 3 tm.assert_frame_equal(df, expected) - tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) + tm.assert_series_equal(df["c c"], Series(name="c c", dtype=object)) # reindex columns is ok df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] - result = y.reindex(columns=['A', 'B', 'C']) - expected = DataFrame(columns=['A', 'B', 'C'], - index=Index([], dtype='int64')) - expected['A'] = expected['A'].astype('int64') - expected['B'] = expected['B'].astype('float64') - expected['C'] = expected['C'].astype('float64') + result = y.reindex(columns=["A", "B", "C"]) + expected = DataFrame(columns=["A", "B", "C"], index=Index([], dtype="int64")) + expected["A"] = expected["A"].astype("int64") + expected["B"] = expected["B"].astype("float64") + expected["C"] = expected["C"].astype("float64") tm.assert_frame_equal(result, expected) def test_partial_set_empty_frame_set_series(self): @@ -564,33 +552,32 @@ def test_partial_set_empty_frame_set_series(self): df = DataFrame(Series()) tm.assert_frame_equal(df, DataFrame({0: Series()})) - df = DataFrame(Series(name='foo')) - tm.assert_frame_equal(df, DataFrame({'foo': Series()})) + df = DataFrame(Series(name="foo")) + tm.assert_frame_equal(df, DataFrame({"foo": Series()})) def test_partial_set_empty_frame_empty_copy_assignment(self): # GH 5932 # copy on empty with assignment fails df = DataFrame(index=[0]) df = df.copy() - df['a'] = 0 - expected = DataFrame(0, index=[0], columns=['a']) + df["a"] = 0 + expected = DataFrame(0, index=[0], columns=["a"]) tm.assert_frame_equal(df, expected) def test_partial_set_empty_frame_empty_consistencies(self): # GH 6171 # consistency on empty frames - df = DataFrame(columns=['x', 'y']) - df['x'] = [1, 2] + df = DataFrame(columns=["x", "y"]) + df["x"] = [1, 2] expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) tm.assert_frame_equal(df, expected, check_dtype=False) - df = DataFrame(columns=['x', 'y']) - df['x'] = ['1', '2'] - expected = DataFrame( - dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) + df = DataFrame(columns=["x", "y"]) + df["x"] = ["1", "2"] + expected = DataFrame(dict(x=["1", "2"], y=[np.nan, np.nan]), 
dtype=object) tm.assert_frame_equal(df, expected) - df = DataFrame(columns=['x', 'y']) - df.loc[0, 'x'] = 1 + df = DataFrame(columns=["x", "y"]) + df.loc[0, "x"] = 1 expected = DataFrame(dict(x=[1], y=[np.nan])) tm.assert_frame_equal(df, expected, check_dtype=False) diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 20053264ac4f1..a6e1273a229dc 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -9,7 +9,6 @@ class TestScalar(Base): - def test_at_and_iat_get(self): def _check(f, func, values=False): @@ -25,19 +24,18 @@ def _check(f, func, values=False): d = getattr(self, o) # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) - for f in [d['labels'], d['ts'], d['floats']]: + for f in [d["labels"], d["ts"], d["floats"]]: if f is not None: msg = "iAt based indexing can only have integer indexers" with pytest.raises(ValueError, match=msg): - self.check_values(f, 'iat') + self.check_values(f, "iat") # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") def test_at_and_iat_set(self): def _check(f, func, values=False): @@ -54,28 +52,25 @@ def _check(f, func, values=False): d = getattr(self, t) # iat - for f in [d['ints'], d['uints']]: - _check(f, 'iat', values=True) + for f in [d["ints"], d["uints"]]: + _check(f, "iat", values=True) - for f in [d['labels'], d['ts'], d['floats']]: + for f in [d["labels"], d["ts"], d["floats"]]: if f is not None: msg = "iAt based indexing can only have integer indexers" with pytest.raises(ValueError, match=msg): - _check(f, 'iat') + _check(f, "iat") # at - for f in [d['ints'], d['uints'], d['labels'], - d['ts'], d['floats']]: - _check(f, 'at') + for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: + _check(f, "at") def test_at_iat_coercion(self): # as timestamp is not a tuple! 
- dates = date_range('1/1/2000', periods=8) - df = DataFrame(np.random.randn(8, 4), - index=dates, - columns=['A', 'B', 'C', 'D']) - s = df['A'] + dates = date_range("1/1/2000", periods=8) + df = DataFrame(np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"]) + s = df["A"] result = s.at[dates[5]] xp = s.values[5] @@ -83,15 +78,15 @@ def test_at_iat_coercion(self): # GH 7729 # make sure we are boxing the returns - s = Series(['2014-01-01', '2014-02-02'], dtype='datetime64[ns]') - expected = Timestamp('2014-02-02') + s = Series(["2014-01-01", "2014-02-02"], dtype="datetime64[ns]") + expected = Timestamp("2014-02-02") for r in [lambda: s.iat[1], lambda: s.iloc[1]]: result = r() assert result == expected - s = Series(['1 days', '2 days'], dtype='timedelta64[ns]') - expected = Timedelta('2 days') + s = Series(["1 days", "2 days"], dtype="timedelta64[ns]") + expected = Timedelta("2 days") for r in [lambda: s.iat[1], lambda: s.iloc[1]]: result = r() @@ -105,7 +100,7 @@ def test_imethods_with_dups(self): # GH6493 # iat/iloc with dups - s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64') + s = Series(range(5), index=[1, 1, 2, 2, 3], dtype="int64") result = s.iloc[2] assert result == 2 result = s.iat[2] @@ -119,7 +114,7 @@ def test_imethods_with_dups(self): s.iat[-10] result = s.iloc[[2, 3]] - expected = Series([2, 3], [2, 2], dtype='int64') + expected = Series([2, 3], [2, 2], dtype="int64") tm.assert_series_equal(result, expected) df = s.to_frame() @@ -133,61 +128,68 @@ def test_imethods_with_dups(self): def test_at_to_fail(self): # at should not fallback # GH 7814 - s = Series([1, 2, 3], index=list('abc')) - result = s.at['a'] + s = Series([1, 2, 3], index=list("abc")) + result = s.at["a"] assert result == 1 - msg = ("At based indexing on an non-integer index can only have" - " non-integer indexers") + msg = ( + "At based indexing on an non-integer index can only have" + " non-integer indexers" + ) with pytest.raises(ValueError, match=msg): s.at[0] - df = DataFrame({'A': [1, 2, 3]}, index=list('abc')) - result = df.at['a', 'A'] + df = DataFrame({"A": [1, 2, 3]}, index=list("abc")) + result = df.at["a", "A"] assert result == 1 with pytest.raises(ValueError, match=msg): - df.at['a', 0] + df.at["a", 0] s = Series([1, 2, 3], index=[3, 2, 1]) result = s.at[1] assert result == 3 - msg = ("At based indexing on an integer index can only have integer" - " indexers") + msg = "At based indexing on an integer index can only have integer" " indexers" with pytest.raises(ValueError, match=msg): - s.at['a'] + s.at["a"] df = DataFrame({0: [1, 2, 3]}, index=[3, 2, 1]) result = df.at[1, 0] assert result == 3 with pytest.raises(ValueError, match=msg): - df.at['a', 0] + df.at["a", 0] # GH 13822, incorrect error string with non-unique columns when missing # column is accessed - df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) - df.columns = ['x', 'x', 'z'] + df = DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}) + df.columns = ["x", "x", "z"] # Check that we get the correct value in the KeyError with pytest.raises(KeyError, match=r"\['y'\] not in index"): - df[['x', 'y', 'z']] + df[["x", "y", "z"]] def test_at_with_tz(self): # gh-15822 - df = DataFrame({'name': ['John', 'Anderson'], - 'date': [Timestamp(2017, 3, 13, 13, 32, 56), - Timestamp(2017, 2, 16, 12, 10, 3)]}) - df['date'] = df['date'].dt.tz_localize('Asia/Shanghai') - - expected = Timestamp('2017-03-13 13:32:56+0800', tz='Asia/Shanghai') - - result = df.loc[0, 'date'] + df = DataFrame( + { + "name": ["John", "Anderson"], + "date": [ + 
Timestamp(2017, 3, 13, 13, 32, 56), + Timestamp(2017, 2, 16, 12, 10, 3), + ], + } + ) + df["date"] = df["date"].dt.tz_localize("Asia/Shanghai") + + expected = Timestamp("2017-03-13 13:32:56+0800", tz="Asia/Shanghai") + + result = df.loc[0, "date"] assert result == expected - result = df.at[0, 'date'] + result = df.at[0, "date"] assert result == expected def test_series_set_tz_timestamp(self, tz_naive_fixture): # GH 25506 - ts = Timestamp('2017-08-05 00:00:00+0100', tz=tz_naive_fixture) + ts = Timestamp("2017-08-05 00:00:00+0100", tz=tz_naive_fixture) result = Series(ts) result.at[1] = ts expected = Series([ts, ts]) @@ -195,7 +197,7 @@ def test_series_set_tz_timestamp(self, tz_naive_fixture): def test_mixed_index_at_iat_loc_iloc_series(self): # GH 19860 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 1, 2]) + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", 1, 2]) for el, item in s.iteritems(): assert s.at[el] == s.loc[el] == item for i in range(len(s)): @@ -208,8 +210,9 @@ def test_mixed_index_at_iat_loc_iloc_series(self): def test_mixed_index_at_iat_loc_iloc_dataframe(self): # GH 19860 - df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], - columns=['a', 'b', 'c', 1, 2]) + df = DataFrame( + [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]], columns=["a", "b", "c", 1, 2] + ) for rowIdx, row in df.iterrows(): for el, item in row.iteritems(): assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item @@ -225,7 +228,7 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): def test_iat_setter_incompatible_assignment(self): # GH 23236 - result = DataFrame({'a': [0, 1], 'b': [4, 5]}) + result = DataFrame({"a": [0, 1], "b": [4, 5]}) result.iat[0, 0] = None expected = DataFrame({"a": [None, 1], "b": [4, 5]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index e3f5bcff4a22e..7628aa53ef3cb 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -8,75 +8,82 @@ class TestTimedeltaIndexing: def test_boolean_indexing(self): # GH 14946 - df = pd.DataFrame({'x': range(10)}) - df.index = pd.to_timedelta(range(10), unit='s') - conditions = [df['x'] > 3, df['x'] == 3, df['x'] < 3] - expected_data = [[0, 1, 2, 3, 10, 10, 10, 10, 10, 10], - [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], - [10, 10, 10, 3, 4, 5, 6, 7, 8, 9]] + df = pd.DataFrame({"x": range(10)}) + df.index = pd.to_timedelta(range(10), unit="s") + conditions = [df["x"] > 3, df["x"] == 3, df["x"] < 3] + expected_data = [ + [0, 1, 2, 3, 10, 10, 10, 10, 10, 10], + [0, 1, 2, 10, 4, 5, 6, 7, 8, 9], + [10, 10, 10, 3, 4, 5, 6, 7, 8, 9], + ] for cond, data in zip(conditions, expected_data): - result = df.assign(x=df.mask(cond, 10).astype('int64')) - expected = pd.DataFrame(data, - index=pd.to_timedelta(range(10), unit='s'), - columns=['x'], - dtype='int64') + result = df.assign(x=df.mask(cond, 10).astype("int64")) + expected = pd.DataFrame( + data, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize( "indexer, expected", - [(0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]), - ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9])]) + [ + (0, [20, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (slice(4, 8), [0, 1, 2, 3, 20, 20, 20, 20, 8, 9]), + ([3, 5], [0, 1, 2, 20, 4, 20, 6, 7, 8, 9]), + ], + ) def test_list_like_indexing(self, indexer, expected): # GH 16637 - df = pd.DataFrame({'x': range(10)}, dtype="int64") - df.index = 
pd.to_timedelta(range(10), unit='s') + df = pd.DataFrame({"x": range(10)}, dtype="int64") + df.index = pd.to_timedelta(range(10), unit="s") - df.loc[df.index[indexer], 'x'] = 20 + df.loc[df.index[indexer], "x"] = 20 - expected = pd.DataFrame(expected, - index=pd.to_timedelta(range(10), unit='s'), - columns=['x'], - dtype="int64") + expected = pd.DataFrame( + expected, + index=pd.to_timedelta(range(10), unit="s"), + columns=["x"], + dtype="int64", + ) tm.assert_frame_equal(expected, df) def test_string_indexing(self): # GH 16896 - df = pd.DataFrame({'x': range(3)}, - index=pd.to_timedelta(range(3), unit='days')) + df = pd.DataFrame({"x": range(3)}, index=pd.to_timedelta(range(3), unit="days")) expected = df.iloc[0] - sliced = df.loc['0 days'] + sliced = df.loc["0 days"] tm.assert_series_equal(sliced, expected) - @pytest.mark.parametrize( - "value", - [None, pd.NaT, np.nan]) + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) def test_masked_setitem(self, value): # issue (#18586) - series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") series[series == series[0]] = value - expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) - @pytest.mark.parametrize( - "value", - [None, pd.NaT, np.nan]) + @pytest.mark.parametrize("value", [None, pd.NaT, np.nan]) def test_listlike_setitem(self, value): # issue (#18586) - series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series = pd.Series([0, 1, 2], dtype="timedelta64[ns]") series.iloc[0] = value - expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + expected = pd.Series([pd.NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) - @pytest.mark.parametrize('start,stop, expected_slice', [ - [np.timedelta64(0, 'ns'), None, slice(0, 11)], - [np.timedelta64(1, 'D'), np.timedelta64(6, 'D'), slice(1, 7)], - [None, np.timedelta64(4, 'D'), slice(0, 5)]]) - def test_numpy_timedelta_scalar_indexing(self, start, stop, - expected_slice): + @pytest.mark.parametrize( + "start,stop, expected_slice", + [ + [np.timedelta64(0, "ns"), None, slice(0, 11)], + [np.timedelta64(1, "D"), np.timedelta64(6, "D"), slice(1, 7)], + [None, np.timedelta64(4, "D"), slice(0, 5)], + ], + ) + def test_numpy_timedelta_scalar_indexing(self, start, stop, expected_slice): # GH 20393 - s = pd.Series(range(11), pd.timedelta_range('0 days', '10 days')) + s = pd.Series(range(11), pd.timedelta_range("0 days", "10 days")) result = s.loc[slice(start, stop)] expected = s.iloc[expected_slice] tm.assert_series_equal(result, expected) @@ -85,19 +92,19 @@ def test_roundtrip_thru_setitem(self): # PR 23462 dt1 = pd.Timedelta(0) dt2 = pd.Timedelta(28767471428571405) - df = pd.DataFrame({'dt': pd.Series([dt1, dt2])}) + df = pd.DataFrame({"dt": pd.Series([dt1, dt2])}) df_copy = df.copy() s = pd.Series([dt1]) - expected = df['dt'].iloc[1].value + expected = df["dt"].iloc[1].value df.loc[[True, False]] = s - result = df['dt'].iloc[1].value + result = df["dt"].iloc[1].value assert expected == result tm.assert_frame_equal(df, df_copy) def test_loc_str_slicing(self): - ix = pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") ser = ix.to_series() result = ser.loc[:"1 days"] expected = ser.iloc[:-1] @@ -105,9 +112,9 @@ def test_loc_str_slicing(self): tm.assert_series_equal(result, expected) def test_loc_slicing(self): - ix = 
pd.timedelta_range(start='1 day', end='2 days', freq='1H') + ix = pd.timedelta_range(start="1 day", end="2 days", freq="1H") ser = ix.to_series() - result = ser.loc[:ix[-2]] + result = ser.loc[: ix[-2]] expected = ser.iloc[:-1] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 411146843d60f..9ce1062a6ec26 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -13,25 +13,36 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - SparseArray) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + SparseArray, +) import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, TimedeltaArray from pandas.core.internals import BlockManager, SingleBlockManager, make_block import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, randn) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + randn, +) # in 3.6.1 a c-api slicing function changed, see src/compat_helper.h -PY361 = LooseVersion(sys.version) >= LooseVersion('3.6.1') +PY361 = LooseVersion(sys.version) >= LooseVersion("3.6.1") @pytest.fixture def mgr(): return create_mgr( - 'a: f8; b: object; c: f8; d: object; e: f8;' - 'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;' - 'k: M8[ns, US/Eastern]; l: M8[ns, CET];') + "a: f8; b: object; c: f8; d: object; e: f8;" + "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;" + "k: M8[ns, US/Eastern]; l: M8[ns, CET];" + ) def assert_block_equal(left, right): @@ -39,14 +50,14 @@ def assert_block_equal(left, right): assert left.dtype == right.dtype assert isinstance(left.mgr_locs, BlockPlacement) assert isinstance(right.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(left.mgr_locs.as_array, - right.mgr_locs.as_array) + tm.assert_numpy_array_equal(left.mgr_locs.as_array, right.mgr_locs.as_array) def get_numeric_mat(shape): arr = np.arange(shape[0]) - return np.lib.stride_tricks.as_strided(x=arr, shape=shape, strides=( - arr.itemsize, ) + (0, ) * (len(shape) - 1)).copy() + return np.lib.stride_tricks.as_strided( + x=arr, shape=shape, strides=(arr.itemsize,) + (0,) * (len(shape) - 1) + ).copy() N = 10 @@ -73,50 +84,64 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): num_items = len(placement) if item_shape is None: - item_shape = (N, ) + item_shape = (N,) - shape = (num_items, ) + item_shape + shape = (num_items,) + item_shape mat = get_numeric_mat(shape) - if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1', - 'uint', 'u8', 'u4', 'u2', 'u1'): + if typestr in ( + "float", + "f8", + "f4", + "f2", + "int", + "i8", + "i4", + "i2", + "i1", + "uint", + "u8", + "u4", + "u2", + "u1", + ): values = mat.astype(typestr) + num_offset - elif typestr in ('complex', 'c16', 'c8'): - values = 1.j * (mat.astype(typestr) + num_offset) - elif typestr in ('object', 'string', 'O'): - values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], - shape) - elif typestr in ('b', 'bool', ): + elif typestr in ("complex", "c16", "c8"): + values = 1.0j * (mat.astype(typestr) + num_offset) + elif typestr in ("object", "string", "O"): + values = np.reshape(["A%d" % i for i in mat.ravel() + num_offset], shape) + elif typestr in ("b", "bool"): values = np.ones(shape, dtype=np.bool_) - elif typestr in ('datetime', 'dt', 'M8[ns]'): - 
values = (mat * 1e9).astype('M8[ns]') - elif typestr.startswith('M8[ns'): + elif typestr in ("datetime", "dt", "M8[ns]"): + values = (mat * 1e9).astype("M8[ns]") + elif typestr.startswith("M8[ns"): # datetime with tz - m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr) + m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr) assert m is not None, "incompatible typestr -> {0}".format(typestr) tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) - elif typestr in ('timedelta', 'td', 'm8[ns]'): - values = (mat * 1).astype('m8[ns]') - elif typestr in ('category', ): + elif typestr in ("timedelta", "td", "m8[ns]"): + values = (mat * 1).astype("m8[ns]") + elif typestr in ("category",): values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4]) - elif typestr in ('category2', ): - values = Categorical(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd' - ]) - elif typestr in ('sparse', 'sparse_na'): + elif typestr in ("category2",): + values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"]) + elif typestr in ("sparse", "sparse_na"): # FIXME: doesn't support num_rows != 10 assert shape[-1] == 10 assert all(s == 1 for s in shape[:-1]) - if typestr.endswith('_na'): + if typestr.endswith("_na"): fill_value = np.nan else: fill_value = 0.0 - values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value, - 4, 5, fill_value, 6], fill_value=fill_value) + values = SparseArray( + [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6], + fill_value=fill_value, + ) arr = values.sp_values.view() - arr += (num_offset - 1) + arr += num_offset - 1 else: raise ValueError('Unsupported typestr: "%s"' % typestr) @@ -129,7 +154,8 @@ def create_single_mgr(typestr, num_rows=None): return SingleBlockManager( create_block(typestr, placement=slice(0, num_rows), item_shape=()), - np.arange(num_rows)) + np.arange(num_rows), + ) def create_mgr(descr, item_shape=None): @@ -154,18 +180,18 @@ def create_mgr(descr, item_shape=None): """ if item_shape is None: - item_shape = (N, ) + item_shape = (N,) offset = 0 mgr_items = [] block_placements = OrderedDict() - for d in descr.split(';'): + for d in descr.split(";"): d = d.strip() if not len(d): continue - names, blockstr = d.partition(':')[::2] + names, blockstr = d.partition(":")[::2] blockstr = blockstr.strip() - names = names.strip().split(',') + names = names.strip().split(",") mgr_items.extend(names) placement = list(np.arange(len(names)) + offset) @@ -180,19 +206,21 @@ def create_mgr(descr, item_shape=None): blocks = [] num_offset = 0 for blockstr, placement in block_placements.items(): - typestr = blockstr.split('-')[0] - blocks.append(create_block(typestr, - placement, - item_shape=item_shape, - num_offset=num_offset, )) + typestr = blockstr.split("-")[0] + blocks.append( + create_block( + typestr, placement, item_shape=item_shape, num_offset=num_offset + ) + ) num_offset += len(placement) - return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]), - [mgr_items] + [np.arange(n) for n in item_shape]) + return BlockManager( + sorted(blocks, key=lambda b: b.mgr_locs[0]), + [mgr_items] + [np.arange(n) for n in item_shape], + ) class TestBlock: - def setup_method(self, method): # self.fblock = get_float_ex() # a,c,e # self.cblock = get_complex_ex() # @@ -200,14 +228,14 @@ def setup_method(self, method): # self.bool_block = get_bool_ex() # self.int_block = get_int_ex() - self.fblock = create_block('float', [0, 2, 4]) - self.cblock = create_block('complex', [7]) - 
self.oblock = create_block('object', [1, 3]) - self.bool_block = create_block('bool', [5]) - self.int_block = create_block('int', [6]) + self.fblock = create_block("float", [0, 2, 4]) + self.cblock = create_block("complex", [7]) + self.oblock = create_block("object", [1, 3]) + self.bool_block = create_block("bool", [5]) + self.int_block = create_block("int", [6]) def test_constructor(self): - int32block = create_block('i4', [0]) + int32block = create_block("i4", [0]) assert int32block.dtype == np.int32 def test_pickle(self): @@ -221,8 +249,9 @@ def _check(blk): def test_mgr_locs(self): assert isinstance(self.fblock.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array, - np.array([0, 2, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + self.fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.int64) + ) def test_attrs(self): assert self.fblock.shape == self.fblock.values.shape @@ -233,13 +262,14 @@ def test_merge(self): avals = randn(2, 10) bvals = randn(2, 10) - ref_cols = Index(['e', 'a', 'b', 'd', 'f']) + ref_cols = Index(["e", "a", "b", "d", "f"]) - ablock = make_block(avals, ref_cols.get_indexer(['e', 'b'])) - bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd'])) + ablock = make_block(avals, ref_cols.get_indexer(["e", "b"])) + bblock = make_block(bvals, ref_cols.get_indexer(["a", "d"])) merged = ablock.merge(bblock) - tm.assert_numpy_array_equal(merged.mgr_locs.as_array, - np.array([0, 1, 2, 3], dtype=np.int64)) + tm.assert_numpy_array_equal( + merged.mgr_locs.as_array, np.array([0, 1, 2, 3], dtype=np.int64) + ) tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals)) tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals)) @@ -263,21 +293,24 @@ def test_delete(self): newb = self.fblock.copy() newb.delete(0) assert isinstance(newb.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([2, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([2, 4], dtype=np.int64) + ) assert (newb.values[0] == 1).all() newb = self.fblock.copy() newb.delete(1) assert isinstance(newb.mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([0, 4], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 4], dtype=np.int64) + ) assert (newb.values[1] == 2).all() newb = self.fblock.copy() newb.delete(2) - tm.assert_numpy_array_equal(newb.mgr_locs.as_array, - np.array([0, 2], dtype=np.int64)) + tm.assert_numpy_array_equal( + newb.mgr_locs.as_array, np.array([0, 2], dtype=np.int64) + ) assert (newb.values[1] == 1).all() newb = self.fblock.copy() @@ -286,50 +319,45 @@ def test_delete(self): def test_make_block_same_class(self): # issue 19431 - block = create_block('M8[ns, US/Eastern]', [3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - block.make_block_same_class(block.values, - dtype=block.values.dtype) + block = create_block("M8[ns, US/Eastern]", [3]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + block.make_block_same_class(block.values, dtype=block.values.dtype) class TestDatetimeBlock: - def test_try_coerce_arg(self): - block = create_block('datetime', [0]) + block = create_block("datetime", [0]) # coerce None none_coerced = block._try_coerce_args(None) assert pd.Timestamp(none_coerced) is pd.NaT # coerce different types of date bojects - vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10), - date(2010, 10, 10)) + vals = 
(np.datetime64("2010-10-10"), datetime(2010, 10, 10), date(2010, 10, 10)) for val in vals: coerced = block._try_coerce_args(val) assert np.int64 == type(coerced) - assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced) + assert pd.Timestamp("2010-10-10") == pd.Timestamp(coerced) class TestBlockManager: - def test_constructor_corner(self): pass def test_attrs(self): - mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2') + mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2") assert mgr.nblocks == 2 assert len(mgr) == 6 def test_is_mixed_dtype(self): - assert not create_mgr('a,b:f8').is_mixed_type - assert not create_mgr('a:f8-1; b:f8-2').is_mixed_type + assert not create_mgr("a,b:f8").is_mixed_type + assert not create_mgr("a:f8-1; b:f8-2").is_mixed_type - assert create_mgr('a,b:f8; c,d: f4').is_mixed_type - assert create_mgr('a,b:f8; c,d: object').is_mixed_type + assert create_mgr("a,b:f8; c,d: f4").is_mixed_type + assert create_mgr("a,b:f8; c,d: object").is_mixed_type def test_duplicate_ref_loc_failure(self): - tmp_mgr = create_mgr('a:bool; a: f8') + tmp_mgr = create_mgr("a:bool; a: f8") axes, blocks = tmp_mgr.axes, tmp_mgr.blocks @@ -346,8 +374,8 @@ def test_duplicate_ref_loc_failure(self): mgr.iget(1) def test_contains(self, mgr): - assert 'a' in mgr - assert 'baz' not in mgr + assert "a" in mgr + assert "baz" not in mgr def test_pickle(self, mgr): @@ -367,78 +395,77 @@ def test_pickle(self, mgr): def test_non_unique_pickle(self): - mgr = create_mgr('a,a,a:f8') + mgr = create_mgr("a,a,a:f8") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - mgr = create_mgr('a: f8; a: i8') + mgr = create_mgr("a: f8; a: i8") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) def test_categorical_block_pickle(self): - mgr = create_mgr('a: category') + mgr = create_mgr("a: category") mgr2 = tm.round_trip_pickle(mgr) assert_frame_equal(DataFrame(mgr), DataFrame(mgr2)) - smgr = create_single_mgr('category') + smgr = create_single_mgr("category") smgr2 = tm.round_trip_pickle(smgr) assert_series_equal(Series(smgr), Series(smgr2)) def test_get(self): - cols = Index(list('abc')) + cols = Index(list("abc")) values = np.random.rand(3, 3) block = make_block(values=values.copy(), placement=np.arange(3)) mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) - assert_almost_equal(mgr.get('a', fastpath=False), values[0]) - assert_almost_equal(mgr.get('b', fastpath=False), values[1]) - assert_almost_equal(mgr.get('c', fastpath=False), values[2]) - assert_almost_equal(mgr.get('a').internal_values(), values[0]) - assert_almost_equal(mgr.get('b').internal_values(), values[1]) - assert_almost_equal(mgr.get('c').internal_values(), values[2]) + assert_almost_equal(mgr.get("a", fastpath=False), values[0]) + assert_almost_equal(mgr.get("b", fastpath=False), values[1]) + assert_almost_equal(mgr.get("c", fastpath=False), values[2]) + assert_almost_equal(mgr.get("a").internal_values(), values[0]) + assert_almost_equal(mgr.get("b").internal_values(), values[1]) + assert_almost_equal(mgr.get("c").internal_values(), values[2]) def test_set(self): - mgr = create_mgr('a,b,c: int', item_shape=(3, )) - - mgr.set('d', np.array(['foo'] * 3)) - mgr.set('b', np.array(['bar'] * 3)) - tm.assert_numpy_array_equal(mgr.get('a').internal_values(), - np.array([0] * 3)) - tm.assert_numpy_array_equal(mgr.get('b').internal_values(), - np.array(['bar'] * 3, dtype=np.object_)) - tm.assert_numpy_array_equal(mgr.get('c').internal_values(), - np.array([2] * 3)) - 
tm.assert_numpy_array_equal(mgr.get('d').internal_values(), - np.array(['foo'] * 3, dtype=np.object_)) + mgr = create_mgr("a,b,c: int", item_shape=(3,)) + + mgr.set("d", np.array(["foo"] * 3)) + mgr.set("b", np.array(["bar"] * 3)) + tm.assert_numpy_array_equal(mgr.get("a").internal_values(), np.array([0] * 3)) + tm.assert_numpy_array_equal( + mgr.get("b").internal_values(), np.array(["bar"] * 3, dtype=np.object_) + ) + tm.assert_numpy_array_equal(mgr.get("c").internal_values(), np.array([2] * 3)) + tm.assert_numpy_array_equal( + mgr.get("d").internal_values(), np.array(["foo"] * 3, dtype=np.object_) + ) def test_set_change_dtype(self, mgr): - mgr.set('baz', np.zeros(N, dtype=bool)) + mgr.set("baz", np.zeros(N, dtype=bool)) - mgr.set('baz', np.repeat('foo', N)) - assert mgr.get('baz').dtype == np.object_ + mgr.set("baz", np.repeat("foo", N)) + assert mgr.get("baz").dtype == np.object_ mgr2 = mgr.consolidate() - mgr2.set('baz', np.repeat('foo', N)) - assert mgr2.get('baz').dtype == np.object_ + mgr2.set("baz", np.repeat("foo", N)) + assert mgr2.get("baz").dtype == np.object_ - mgr2.set('quux', randn(N).astype(int)) - assert mgr2.get('quux').dtype == np.int_ + mgr2.set("quux", randn(N).astype(int)) + assert mgr2.get("quux").dtype == np.int_ - mgr2.set('quux', randn(N)) - assert mgr2.get('quux').dtype == np.float_ + mgr2.set("quux", randn(N)) + assert mgr2.get("quux").dtype == np.float_ def test_set_change_dtype_slice(self): # GH8850 - cols = MultiIndex.from_tuples([('1st', 'a'), ('2nd', 'b'), ('3rd', 'c') - ]) + cols = MultiIndex.from_tuples([("1st", "a"), ("2nd", "b"), ("3rd", "c")]) df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols) - df['2nd'] = df['2nd'] * 2.0 + df["2nd"] = df["2nd"] * 2.0 blocks = df._to_dict_of_blocks() - assert sorted(blocks.keys()) == ['float64', 'int64'] - assert_frame_equal(blocks['float64'], DataFrame( - [[1.0, 4.0], [4.0, 10.0]], columns=cols[:2])) - assert_frame_equal(blocks['int64'], DataFrame( - [[3], [6]], columns=cols[2:])) + assert sorted(blocks.keys()) == ["float64", "int64"] + assert_frame_equal( + blocks["float64"], DataFrame([[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]) + ) + assert_frame_equal(blocks["int64"], DataFrame([[3], [6]], columns=cols[2:])) def test_copy(self, mgr): cp = mgr.copy(deep=False) @@ -464,71 +491,70 @@ def test_copy(self, mgr): assert cp_blk.values.base is None and blk.values.base is None def test_sparse(self): - mgr = create_mgr('a: sparse-1; b: sparse-2') + mgr = create_mgr("a: sparse-1; b: sparse-2") # what to test here? assert mgr.as_array().dtype == np.float64 def test_sparse_mixed(self): - mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8') + mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8") assert len(mgr.blocks) == 3 assert isinstance(mgr, BlockManager) # what to test here? 
def test_as_array_float(self): - mgr = create_mgr('c: f4; d: f2; e: f8') + mgr = create_mgr("c: f4; d: f2; e: f8") assert mgr.as_array().dtype == np.float64 - mgr = create_mgr('c: f4; d: f2') + mgr = create_mgr("c: f4; d: f2") assert mgr.as_array().dtype == np.float32 def test_as_array_int_bool(self): - mgr = create_mgr('a: bool-1; b: bool-2') + mgr = create_mgr("a: bool-1; b: bool-2") assert mgr.as_array().dtype == np.bool_ - mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1') + mgr = create_mgr("a: i8-1; b: i8-2; c: i4; d: i2; e: u1") assert mgr.as_array().dtype == np.int64 - mgr = create_mgr('c: i4; d: i2; e: u1') + mgr = create_mgr("c: i4; d: i2; e: u1") assert mgr.as_array().dtype == np.int32 def test_as_array_datetime(self): - mgr = create_mgr('h: datetime-1; g: datetime-2') - assert mgr.as_array().dtype == 'M8[ns]' + mgr = create_mgr("h: datetime-1; g: datetime-2") + assert mgr.as_array().dtype == "M8[ns]" def test_as_array_datetime_tz(self): - mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]') - assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]' - assert mgr.get('g').dtype == 'datetime64[ns, CET]' - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]") + assert mgr.get("h").dtype == "datetime64[ns, US/Eastern]" + assert mgr.get("g").dtype == "datetime64[ns, CET]" + assert mgr.as_array().dtype == "object" def test_astype(self): # coerce all - mgr = create_mgr('c: f4; d: f2; e: f8') - for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + mgr = create_mgr("c: f4; d: f2; e: f8") + for t in ["float16", "float32", "float64", "int32", "int64"]: t = np.dtype(t) tmgr = mgr.astype(t) - assert tmgr.get('c').dtype.type == t - assert tmgr.get('d').dtype.type == t - assert tmgr.get('e').dtype.type == t + assert tmgr.get("c").dtype.type == t + assert tmgr.get("d").dtype.type == t + assert tmgr.get("e").dtype.type == t # mixed - mgr = create_mgr('a,b: object; c: bool; d: datetime;' - 'e: f4; f: f2; g: f8') - for t in ['float16', 'float32', 'float64', 'int32', 'int64']: + mgr = create_mgr("a,b: object; c: bool; d: datetime;" "e: f4; f: f2; g: f8") + for t in ["float16", "float32", "float64", "int32", "int64"]: t = np.dtype(t) - tmgr = mgr.astype(t, errors='ignore') - assert tmgr.get('c').dtype.type == t - assert tmgr.get('e').dtype.type == t - assert tmgr.get('f').dtype.type == t - assert tmgr.get('g').dtype.type == t - - assert tmgr.get('a').dtype.type == np.object_ - assert tmgr.get('b').dtype.type == np.object_ + tmgr = mgr.astype(t, errors="ignore") + assert tmgr.get("c").dtype.type == t + assert tmgr.get("e").dtype.type == t + assert tmgr.get("f").dtype.type == t + assert tmgr.get("g").dtype.type == t + + assert tmgr.get("a").dtype.type == np.object_ + assert tmgr.get("b").dtype.type == np.object_ if t != np.int64: - assert tmgr.get('d').dtype.type == np.datetime64 + assert tmgr.get("d").dtype.type == np.datetime64 else: - assert tmgr.get('d').dtype.type == t + assert tmgr.get("d").dtype.type == t def test_convert(self): def _compare(old_mgr, new_mgr): @@ -555,101 +581,101 @@ def _compare(old_mgr, new_mgr): assert found # noops - mgr = create_mgr('f: i8; g: f8') + mgr = create_mgr("f: i8; g: f8") new_mgr = mgr.convert() _compare(mgr, new_mgr) - mgr = create_mgr('a, b: object; f: i8; g: f8') + mgr = create_mgr("a, b: object; f: i8; g: f8") new_mgr = mgr.convert() _compare(mgr, new_mgr) # convert - mgr = create_mgr('a,b,foo: object; f: i8; g: f8') - mgr.set('a', np.array(['1'] * N, dtype=np.object_)) - mgr.set('b', 
np.array(['2.'] * N, dtype=np.object_)) - mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + mgr = create_mgr("a,b,foo: object; f: i8; g: f8") + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get('a').dtype == np.int64 - assert new_mgr.get('b').dtype == np.float64 - assert new_mgr.get('foo').dtype == np.object_ - assert new_mgr.get('f').dtype == np.int64 - assert new_mgr.get('g').dtype == np.float64 - - mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;' - 'i: i8; g: f8; h: f2') - mgr.set('a', np.array(['1'] * N, dtype=np.object_)) - mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) - mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int64 + assert new_mgr.get("g").dtype == np.float64 + + mgr = create_mgr( + "a,b,foo: object; f: i4; bool: bool; dt: datetime;" "i: i8; g: f8; h: f2" + ) + mgr.set("a", np.array(["1"] * N, dtype=np.object_)) + mgr.set("b", np.array(["2."] * N, dtype=np.object_)) + mgr.set("foo", np.array(["foo."] * N, dtype=np.object_)) new_mgr = mgr.convert(numeric=True) - assert new_mgr.get('a').dtype == np.int64 - assert new_mgr.get('b').dtype == np.float64 - assert new_mgr.get('foo').dtype == np.object_ - assert new_mgr.get('f').dtype == np.int32 - assert new_mgr.get('bool').dtype == np.bool_ - assert new_mgr.get('dt').dtype.type, np.datetime64 - assert new_mgr.get('i').dtype == np.int64 - assert new_mgr.get('g').dtype == np.float64 - assert new_mgr.get('h').dtype == np.float16 + assert new_mgr.get("a").dtype == np.int64 + assert new_mgr.get("b").dtype == np.float64 + assert new_mgr.get("foo").dtype == np.object_ + assert new_mgr.get("f").dtype == np.int32 + assert new_mgr.get("bool").dtype == np.bool_ + assert new_mgr.get("dt").dtype.type, np.datetime64 + assert new_mgr.get("i").dtype == np.int64 + assert new_mgr.get("g").dtype == np.float64 + assert new_mgr.get("h").dtype == np.float16 def test_interleave(self): # self - for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]', - 'm8[ns]']: - mgr = create_mgr('a: {0}'.format(dtype)) + for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]: + mgr = create_mgr("a: {0}".format(dtype)) assert mgr.as_array().dtype == dtype - mgr = create_mgr('a: {0}; b: {0}'.format(dtype)) + mgr = create_mgr("a: {0}; b: {0}".format(dtype)) assert mgr.as_array().dtype == dtype # will be converted according the actual dtype of the underlying - mgr = create_mgr('a: category') - assert mgr.as_array().dtype == 'i8' - mgr = create_mgr('a: category; b: category') - assert mgr.as_array().dtype == 'i8' - mgr = create_mgr('a: category; b: category2') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: category2') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: category2; b: category2') - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("a: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category") + assert mgr.as_array().dtype == "i8" + mgr = create_mgr("a: category; b: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: category2; b: category2") + assert 
mgr.as_array().dtype == "object" # combinations - mgr = create_mgr('a: f8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f8; b: i8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f4; b: i8') - assert mgr.as_array().dtype == 'f8' - mgr = create_mgr('a: f4; b: i8; d: object') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: bool; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: complex') - assert mgr.as_array().dtype == 'complex' - mgr = create_mgr('a: f8; b: category') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: category') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: bool') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: m8[ns]; b: bool') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: m8[ns]; b: i8') - assert mgr.as_array().dtype == 'object' - mgr = create_mgr('a: M8[ns]; b: m8[ns]') - assert mgr.as_array().dtype == 'object' + mgr = create_mgr("a: f8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f8; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8") + assert mgr.as_array().dtype == "f8" + mgr = create_mgr("a: f4; b: i8; d: object") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: bool; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: complex") + assert mgr.as_array().dtype == "complex" + mgr = create_mgr("a: f8; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: category") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: bool") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: m8[ns]; b: i8") + assert mgr.as_array().dtype == "object" + mgr = create_mgr("a: M8[ns]; b: m8[ns]") + assert mgr.as_array().dtype == "object" def test_interleave_non_unique_cols(self): - df = DataFrame([ - [pd.Timestamp('20130101'), 3.5], - [pd.Timestamp('20130102'), 4.5]], - columns=['x', 'x'], - index=[1, 2]) + df = DataFrame( + [[pd.Timestamp("20130101"), 3.5], [pd.Timestamp("20130102"), 4.5]], + columns=["x", "x"], + index=[1, 2], + ) df_unique = df.copy() - df_unique.columns = ['x', 'y'] + df_unique.columns = ["x", "y"] assert df_unique.values.shape == df.values.shape tm.assert_numpy_array_equal(df_unique.values[0], df.values[0]) tm.assert_numpy_array_equal(df_unique.values[1], df.values[1]) @@ -658,11 +684,11 @@ def test_consolidate(self): pass def test_consolidate_ordering_issues(self, mgr): - mgr.set('f', randn(N)) - mgr.set('d', randn(N)) - mgr.set('b', randn(N)) - mgr.set('g', randn(N)) - mgr.set('h', randn(N)) + mgr.set("f", randn(N)) + mgr.set("d", randn(N)) + mgr.set("b", randn(N)) + mgr.set("g", randn(N)) + mgr.set("h", randn(N)) # we have datetime/tz blocks in mgr cons = mgr.consolidate() @@ -670,103 +696,122 @@ def test_consolidate_ordering_issues(self, mgr): cons = mgr.consolidate().get_numeric_data() assert cons.nblocks == 1 assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement) - tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array, - np.arange(len(cons.items), dtype=np.int64)) + tm.assert_numpy_array_equal( + cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.int64) + ) def 
test_reindex_index(self): pass def test_reindex_items(self): # mgr is not consolidated, f8 & f8-2 blocks - mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;' - 'f: bool; g: f8-2') + mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8;" "f: bool; g: f8-2") - reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) + reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0) assert reindexed.nblocks == 2 - tm.assert_index_equal(reindexed.items, pd.Index(['g', 'c', 'a', 'd'])) + tm.assert_index_equal(reindexed.items, pd.Index(["g", "c", "a", "d"])) assert_almost_equal( - mgr.get('g', fastpath=False), reindexed.get('g', fastpath=False)) + mgr.get("g", fastpath=False), reindexed.get("g", fastpath=False) + ) assert_almost_equal( - mgr.get('c', fastpath=False), reindexed.get('c', fastpath=False)) + mgr.get("c", fastpath=False), reindexed.get("c", fastpath=False) + ) assert_almost_equal( - mgr.get('a', fastpath=False), reindexed.get('a', fastpath=False)) + mgr.get("a", fastpath=False), reindexed.get("a", fastpath=False) + ) assert_almost_equal( - mgr.get('d', fastpath=False), reindexed.get('d', fastpath=False)) + mgr.get("d", fastpath=False), reindexed.get("d", fastpath=False) + ) assert_almost_equal( - mgr.get('g').internal_values(), - reindexed.get('g').internal_values()) + mgr.get("g").internal_values(), reindexed.get("g").internal_values() + ) assert_almost_equal( - mgr.get('c').internal_values(), - reindexed.get('c').internal_values()) + mgr.get("c").internal_values(), reindexed.get("c").internal_values() + ) assert_almost_equal( - mgr.get('a').internal_values(), - reindexed.get('a').internal_values()) + mgr.get("a").internal_values(), reindexed.get("a").internal_values() + ) assert_almost_equal( - mgr.get('d').internal_values(), - reindexed.get('d').internal_values()) + mgr.get("d").internal_values(), reindexed.get("d").internal_values() + ) def test_get_numeric_data(self): - mgr = create_mgr('int: int; float: float; complex: complex;' - 'str: object; bool: bool; obj: object; dt: datetime', - item_shape=(3, )) - mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([1, 2, 3], dtype=np.object_)) numeric = mgr.get_numeric_data() - tm.assert_index_equal(numeric.items, - pd.Index(['int', 'float', 'complex', 'bool'])) + tm.assert_index_equal( + numeric.items, pd.Index(["int", "float", "complex", "bool"]) + ) assert_almost_equal( - mgr.get('float', fastpath=False), numeric.get('float', - fastpath=False)) + mgr.get("float", fastpath=False), numeric.get("float", fastpath=False) + ) assert_almost_equal( - mgr.get('float').internal_values(), - numeric.get('float').internal_values()) + mgr.get("float").internal_values(), numeric.get("float").internal_values() + ) # Check sharing - numeric.set('float', np.array([100., 200., 300.])) + numeric.set("float", np.array([100.0, 200.0, 300.0])) assert_almost_equal( - mgr.get('float', fastpath=False), np.array([100., 200., 300.])) + mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) + ) assert_almost_equal( - mgr.get('float').internal_values(), np.array([100., 200., 300.])) + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) numeric2 = mgr.get_numeric_data(copy=True) - tm.assert_index_equal(numeric.items, - pd.Index(['int', 'float', 'complex', 'bool'])) - numeric2.set('float', np.array([1000., 2000., 3000.])) + tm.assert_index_equal( + numeric.items, 
pd.Index(["int", "float", "complex", "bool"]) + ) + numeric2.set("float", np.array([1000.0, 2000.0, 3000.0])) assert_almost_equal( - mgr.get('float', fastpath=False), np.array([100., 200., 300.])) + mgr.get("float", fastpath=False), np.array([100.0, 200.0, 300.0]) + ) assert_almost_equal( - mgr.get('float').internal_values(), np.array([100., 200., 300.])) + mgr.get("float").internal_values(), np.array([100.0, 200.0, 300.0]) + ) def test_get_bool_data(self): - mgr = create_mgr('int: int; float: float; complex: complex;' - 'str: object; bool: bool; obj: object; dt: datetime', - item_shape=(3, )) - mgr.set('obj', np.array([True, False, True], dtype=np.object_)) + mgr = create_mgr( + "int: int; float: float; complex: complex;" + "str: object; bool: bool; obj: object; dt: datetime", + item_shape=(3,), + ) + mgr.set("obj", np.array([True, False, True], dtype=np.object_)) bools = mgr.get_bool_data() - tm.assert_index_equal(bools.items, pd.Index(['bool'])) - assert_almost_equal(mgr.get('bool', fastpath=False), - bools.get('bool', fastpath=False)) + tm.assert_index_equal(bools.items, pd.Index(["bool"])) + assert_almost_equal( + mgr.get("bool", fastpath=False), bools.get("bool", fastpath=False) + ) assert_almost_equal( - mgr.get('bool').internal_values(), - bools.get('bool').internal_values()) + mgr.get("bool").internal_values(), bools.get("bool").internal_values() + ) - bools.set('bool', np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), - np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), - np.array([True, False, True])) + bools.set("bool", np.array([True, False, True])) + tm.assert_numpy_array_equal( + mgr.get("bool", fastpath=False), np.array([True, False, True]) + ) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) # Check sharing bools2 = mgr.get_bool_data(copy=True) - bools2.set('bool', np.array([False, True, False])) - tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), - np.array([True, False, True])) - tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), - np.array([True, False, True])) + bools2.set("bool", np.array([False, True, False])) + tm.assert_numpy_array_equal( + mgr.get("bool", fastpath=False), np.array([True, False, True]) + ) + tm.assert_numpy_array_equal( + mgr.get("bool").internal_values(), np.array([True, False, True]) + ) def test_unicode_repr_doesnt_raise(self): - repr(create_mgr('b,\u05d0: object')) + repr(create_mgr("b,\u05d0: object")) def test_missing_unicode_key(self): df = DataFrame({"a": [1]}) @@ -777,11 +822,11 @@ def test_missing_unicode_key(self): def test_equals(self): # unique items - bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) - bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') + bm1 = create_mgr("a,a,a: i8-1; b,b,b: i8-2") bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) assert bm1.equals(bm2) @@ -805,12 +850,12 @@ def test_equals_block_order_different_dtypes(self): assert bm_this.equals(bm) def test_single_mgr_ctor(self): - mgr = create_single_mgr('f8', num_rows=5) - assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.] 
+ mgr = create_single_mgr("f8", num_rows=5) + assert mgr.as_array().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0] def test_validate_bool_args(self): invalid_values = [1, "True", [1, 2, 3], 5.0] - bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") for value in invalid_values: with pytest.raises(ValueError): @@ -827,20 +872,18 @@ class TestIndexing: # and are disabled. MANAGERS = [ - create_single_mgr('f8', N), - create_single_mgr('i8', N), - + create_single_mgr("f8", N), + create_single_mgr("i8", N), # 2-dim - create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)), - create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)), - create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)), - create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)), - + create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)), # 3-dim - create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)), - create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)), - create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)), - create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)), + create_mgr("a,b,c,d,e,f: f8", item_shape=(N, N)), + create_mgr("a,b,c,d,e,f: i8", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N, N)), + create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N, N)), ] # MANAGERS = [MANAGERS[6]] @@ -854,12 +897,14 @@ def assert_slice_ok(mgr, axis, slobj): if isinstance(slobj, np.ndarray): ax = mgr.axes[axis] if len(ax) and len(slobj) and len(slobj) != len(ax): - slobj = np.concatenate([slobj, np.zeros( - len(ax) - len(slobj), dtype=bool)]) + slobj = np.concatenate( + [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)] + ) sliced = mgr.get_slice(slobj, axis=axis) - mat_slobj = (slice(None), ) * axis + (slobj, ) - tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(), - check_dtype=False) + mat_slobj = (slice(None),) * axis + (slobj,) + tm.assert_numpy_array_equal( + mat[mat_slobj], sliced.as_array(), check_dtype=False + ) tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) for mgr in self.MANAGERS: @@ -872,22 +917,15 @@ def assert_slice_ok(mgr, axis, slobj): assert_slice_ok(mgr, ax, slice(3, 0, -2)) # boolean mask - assert_slice_ok( - mgr, ax, np.array([], dtype=np.bool_)) - assert_slice_ok( - mgr, ax, - np.ones(mgr.shape[ax], dtype=np.bool_)) - assert_slice_ok( - mgr, ax, - np.zeros(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.array([], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_)) + assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_)) if mgr.shape[ax] >= 3: + assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0) assert_slice_ok( - mgr, ax, - np.arange(mgr.shape[ax]) % 3 == 0) - assert_slice_ok( - mgr, ax, np.array( - [True, True, False], dtype=np.bool_)) + mgr, ax, np.array([True, True, False], dtype=np.bool_) + ) # fancy indexer assert_slice_ok(mgr, ax, []) @@ -901,10 +939,10 @@ def test_take(self): def assert_take_ok(mgr, axis, indexer): mat = mgr.as_array() taken = mgr.take(indexer, axis) - tm.assert_numpy_array_equal(np.take(mat, indexer, axis), - taken.as_array(), check_dtype=False) - tm.assert_index_equal(mgr.axes[axis].take(indexer), - taken.axes[axis]) + tm.assert_numpy_array_equal( + np.take(mat, indexer, axis), taken.as_array(), check_dtype=False + ) + tm.assert_index_equal(mgr.axes[axis].take(indexer), 
taken.axes[axis]) for mgr in self.MANAGERS: for ax in range(mgr.ndim): @@ -922,97 +960,106 @@ def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value): mat = mgr.as_array() indexer = mgr.axes[axis].get_indexer_for(new_labels) - reindexed = mgr.reindex_axis(new_labels, axis, - fill_value=fill_value) - tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis, - fill_value=fill_value), - reindexed.as_array(), - check_dtype=False) + reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value) + tm.assert_numpy_array_equal( + algos.take_nd(mat, indexer, axis, fill_value=fill_value), + reindexed.as_array(), + check_dtype=False, + ) tm.assert_index_equal(reindexed.axes[axis], new_labels) for mgr in self.MANAGERS: for ax in range(mgr.ndim): - for fill_value in (None, np.nan, 100.): - assert_reindex_axis_is_ok( - mgr, ax, - pd.Index([]), fill_value) - assert_reindex_axis_is_ok( - mgr, ax, mgr.axes[ax], - fill_value) + for fill_value in (None, np.nan, 100.0): + assert_reindex_axis_is_ok(mgr, ax, pd.Index([]), fill_value) + assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][[0, 0, 0]], fill_value) + mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), fill_value) + mgr, ax, pd.Index(["foo", "bar", "baz"]), fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - fill_value) + mgr, ax, pd.Index(["foo", mgr.axes[ax][0], "baz"]), fill_value + ) if mgr.shape[ax] >= 3: assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][:-3], fill_value) + mgr, ax, mgr.axes[ax][:-3], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][-3::-1], fill_value) + mgr, ax, mgr.axes[ax][-3::-1], fill_value + ) assert_reindex_axis_is_ok( - mgr, ax, - mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value) + mgr, ax, mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value + ) def test_reindex_indexer(self): - - def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, - fill_value): + def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): mat = mgr.as_array() - reindexed_mat = algos.take_nd(mat, indexer, axis, - fill_value=fill_value) - reindexed = mgr.reindex_indexer(new_labels, indexer, axis, - fill_value=fill_value) - tm.assert_numpy_array_equal(reindexed_mat, - reindexed.as_array(), - check_dtype=False) + reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value) + reindexed = mgr.reindex_indexer( + new_labels, indexer, axis, fill_value=fill_value + ) + tm.assert_numpy_array_equal( + reindexed_mat, reindexed.as_array(), check_dtype=False + ) tm.assert_index_equal(reindexed.axes[axis], new_labels) for mgr in self.MANAGERS: for ax in range(mgr.ndim): - for fill_value in (None, np.nan, 100.): + for fill_value in (None, np.nan, 100.0): + assert_reindex_indexer_is_ok(mgr, ax, pd.Index([]), [], fill_value) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index([]), [], fill_value) + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value + ) assert_reindex_indexer_is_ok( - mgr, ax, - mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value) + mgr, + ax, + pd.Index(["foo"] * mgr.shape[ax]), + np.arange(mgr.shape[ax]), + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo'] * mgr.shape[ax]), - np.arange(mgr.shape[ax]), fill_value) + mgr, + ax, + mgr.axes[ax][::-1], + np.arange(mgr.shape[ax]), + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - mgr.axes[ax][::-1], 
np.arange(mgr.shape[ax]), - fill_value) + mgr, + ax, + mgr.axes[ax], + np.arange(mgr.shape[ax])[::-1], + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax], - np.arange(mgr.shape[ax])[::-1], fill_value) + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 0, 0], fill_value) + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [-1, 0, -1], + fill_value, + ) assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [-1, 0, -1], fill_value) - assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', mgr.axes[ax][0], 'baz']), - [-1, -1, -1], fill_value) + mgr, + ax, + pd.Index(["foo", mgr.axes[ax][0], "baz"]), + [-1, -1, -1], + fill_value, + ) if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, - pd.Index(['foo', 'bar', 'baz']), - [0, 1, 2], fill_value) + mgr, + ax, + pd.Index(["foo", "bar", "baz"]), + [0, 1, 2], + fill_value, + ) # test_get_slice(slice_like, axis) # take(indexer, axis) @@ -1021,7 +1068,6 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, class TestBlockPlacement: - def test_slice_len(self): assert len(BlockPlacement(slice(0, 4))) == 4 assert len(BlockPlacement(slice(0, 4, 2))) == 2 @@ -1111,8 +1157,8 @@ def test_slice_iter(self): def test_slice_to_array_conversion(self): def assert_as_array_equals(slc, asarray): tm.assert_numpy_array_equal( - BlockPlacement(slc).as_array, - np.asarray(asarray, dtype=np.int64)) + BlockPlacement(slc).as_array, np.asarray(asarray, dtype=np.int64) + ) assert_as_array_equals(slice(0, 3), [0, 1, 2]) assert_as_array_equals(slice(0, 0), []) @@ -1188,45 +1234,56 @@ def any(self, axis=None): class TestCanHoldElement: - @pytest.mark.parametrize('value, dtype', [ - (1, 'i8'), - (1.0, 'f8'), - (2**63, 'f8'), - (1j, 'complex128'), - (2**63, 'complex128'), - (True, 'bool'), - (np.timedelta64(20, 'ns'), '= 1.11 otherwise, @@ -49,36 +49,34 @@ def s3_resource(tips_file, jsonl_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip('moto') + moto = pytest.importorskip("moto") test_s3_files = [ - ('tips#1.csv', tips_file), - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ('items.jsonl', jsonl_file), + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), ] def add_tips_files(bucket_name): for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) + with open(file_name, "rb") as f: + conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: s3 = moto.mock_s3() s3.start() # see gh-16135 - bucket = 'pandas-test' + bucket = "pandas-test" conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket=bucket) add_tips_files(bucket) - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') + conn.create_bucket(Bucket="cant_get_it", ACL="private") + add_tips_files("cant_get_it") yield conn finally: s3.stop() diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index dd96fb2366152..54acd2128369d 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -25,12 +25,11 @@ def df_ref(): """ Obtain the reference data from read_csv with the Python engine. 
""" - df_ref = read_csv('test1.csv', index_col=0, - parse_dates=True, engine='python') + df_ref = read_csv("test1.csv", index_col=0, parse_dates=True, engine="python") return df_ref -@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods']) +@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"]) def read_ext(request): """ Valid extensions for reading Excel files. diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 76b3fe19a0771..76871eddf1cee 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -12,15 +12,14 @@ @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): func = functools.partial(pd.read_excel, engine="odf") - monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.setattr(pd, "read_excel", func) monkeypatch.chdir(datapath("io", "data")) def test_read_invalid_types_raises(): # the invalid_value_type.ods required manually editing # of the included content.xml file - with pytest.raises(ValueError, - match="Unrecognized type awesome_new_type"): + with pytest.raises(ValueError, match="Unrecognized type awesome_new_type"): pd.read_excel("invalid_value_type.ods") @@ -28,12 +27,12 @@ def test_read_writer_table(): # Also test reading tables from an text OpenDocument file # (.odt) index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") - expected = pd.DataFrame([ - [1, np.nan, 7], - [2, np.nan, 8], - [3, np.nan, 9], - ], index=index, columns=["Column 1", "Unnamed: 2", "Column 3"]) + expected = pd.DataFrame( + [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]], + index=index, + columns=["Column 1", "Unnamed: 2", "Column 3"], + ) - result = pd.read_excel("writertable.odt", 'Table1', index_col=0) + result = pd.read_excel("writertable.odt", "Table1", index_col=0) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 6815d2aa079f8..79fc87a62ad08 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -7,83 +7,62 @@ openpyxl = pytest.importorskip("openpyxl") -pytestmark = pytest.mark.parametrize("ext", ['.xlsx']) +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) def test_to_excel_styleconverter(ext): from openpyxl import styles hstyle = { - "font": { - "color": '00FF0000', - "bold": True, - }, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": { - "horizontal": "center", - "vertical": "top", - }, - "fill": { - "patternType": 'solid', - 'fgColor': { - 'rgb': '006666FF', - 'tint': 0.3, - }, - }, - "number_format": { - "format_code": "0.00" - }, - "protection": { - "locked": True, - "hidden": False, - }, + "font": {"color": "00FF0000", "bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}}, + "number_format": {"format_code": "0.00"}, + "protection": {"locked": True, "hidden": False}, } - font_color = styles.Color('00FF0000') + font_color = styles.Color("00FF0000") font = styles.Font(bold=True, color=font_color) side = styles.Side(style=styles.borders.BORDER_THIN) border = styles.Border(top=side, right=side, bottom=side, left=side) - alignment = styles.Alignment(horizontal='center', vertical='top') - fill_color = styles.Color(rgb='006666FF', tint=0.3) - fill = styles.PatternFill(patternType='solid', fgColor=fill_color) + 
alignment = styles.Alignment(horizontal="center", vertical="top") + fill_color = styles.Color(rgb="006666FF", tint=0.3) + fill = styles.PatternFill(patternType="solid", fgColor=fill_color) - number_format = '0.00' + number_format = "0.00" protection = styles.Protection(locked=True, hidden=False) kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) - assert kw['font'] == font - assert kw['border'] == border - assert kw['alignment'] == alignment - assert kw['fill'] == fill - assert kw['number_format'] == number_format - assert kw['protection'] == protection + assert kw["font"] == font + assert kw["border"] == border + assert kw["alignment"] == alignment + assert kw["fill"] == fill + assert kw["number_format"] == number_format + assert kw["protection"] == protection def test_write_cells_merge_styled(ext): from pandas.io.formats.excel import ExcelCell - sheet_name = 'merge_styled' + sheet_name = "merge_styled" - sty_b1 = {'font': {'color': '00FF0000'}} - sty_a2 = {'font': {'color': '0000FF00'}} + sty_b1 = {"font": {"color": "00FF0000"}} + sty_a2 = {"font": {"color": "0000FF00"}} initial_cells = [ ExcelCell(col=1, row=0, val=42, style=sty_b1), ExcelCell(col=0, row=1, val=99, style=sty_a2), ] - sty_merged = {'font': {'color': '000000FF', 'bold': True}} + sty_merged = {"font": {"color": "000000FF", "bold": True}} sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) - openpyxl_sty_merged = sty_kwargs['font'] + openpyxl_sty_merged = sty_kwargs["font"] merge_cells = [ - ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), + ExcelCell( + col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=sty_merged + ) ] with ensure_clean(ext) as path: @@ -92,27 +71,28 @@ def test_write_cells_merge_styled(ext): writer.write_cells(merge_cells, sheet_name=sheet_name) wks = writer.sheets[sheet_name] - xcell_b1 = wks['B1'] - xcell_a2 = wks['A2'] + xcell_b1 = wks["B1"] + xcell_a2 = wks["A2"] assert xcell_b1.font == openpyxl_sty_merged assert xcell_a2.font == openpyxl_sty_merged -@pytest.mark.parametrize("mode,expected", [ - ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) +@pytest.mark.parametrize( + "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])] +) def test_write_append_mode(ext, mode, expected): - df = DataFrame([1], columns=['baz']) + df = DataFrame([1], columns=["baz"]) with ensure_clean(ext) as f: wb = openpyxl.Workbook() - wb.worksheets[0].title = 'foo' - wb.worksheets[0]['A1'].value = 'foo' - wb.create_sheet('bar') - wb.worksheets[1]['A1'].value = 'bar' + wb.worksheets[0].title = "foo" + wb.worksheets[0]["A1"].value = "foo" + wb.create_sheet("bar") + wb.worksheets[1]["A1"].value = "bar" wb.save(f) - writer = ExcelWriter(f, engine='openpyxl', mode=mode) - df.to_excel(writer, sheet_name='baz', index=False) + writer = ExcelWriter(f, engine="openpyxl", mode=mode) + df.to_excel(writer, sheet_name="baz", index=False) writer.save() wb2 = openpyxl.load_workbook(f) @@ -120,4 +100,4 @@ def test_write_append_mode(ext, mode, expected): assert result == expected for index, cell_value in enumerate(expected): - assert wb2.worksheets[index]['A1'].value == cell_value + assert wb2.worksheets[index]["A1"].value == cell_value diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index ae69c2302e60a..cd8848828f6c4 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -25,30 +25,42 @@ def ignore_xlrd_time_clock_warning(): """ with warnings.catch_warnings(): warnings.filterwarnings( - 
action='ignore', - message='time.clock has been deprecated', - category=DeprecationWarning) + action="ignore", + message="time.clock has been deprecated", + category=DeprecationWarning, + ) yield -@pytest.fixture(params=[ - # Add any engines to test here - # When defusedxml is installed it triggers deprecation warnings for - # xlrd and openpyxl, so catch those here - pytest.param('xlrd', marks=[ - td.skip_if_no('xlrd'), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ]), - pytest.param('openpyxl', marks=[ - td.skip_if_no('openpyxl'), - pytest.mark.filterwarnings("ignore:.*html argument"), - ]), - pytest.param(None, marks=[ - td.skip_if_no('xlrd'), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ]), - pytest.param("odf", marks=td.skip_if_no("odf")), -]) +@pytest.fixture( + params=[ + # Add any engines to test here + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param( + "xlrd", + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param( + "openpyxl", + marks=[ + td.skip_if_no("openpyxl"), + pytest.mark.filterwarnings("ignore:.*html argument"), + ], + ), + pytest.param( + None, + marks=[ + td.skip_if_no("xlrd"), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ], + ), + pytest.param("odf", marks=td.skip_if_no("odf")), + ] +) def engine(request): """ A fixture for Excel reader engines. @@ -57,41 +69,42 @@ def engine(request): class TestReaders: - @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for read_excel calls. """ - if engine == 'openpyxl' and read_ext == '.xls': + if engine == "openpyxl" and read_ext == ".xls": pytest.skip() - if engine == 'odf' and read_ext != '.ods': + if engine == "odf" and read_ext != ".ods": pytest.skip() if read_ext == ".ods" and engine != "odf": pytest.skip() func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data")) - monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.setattr(pd, "read_excel", func) def test_usecols_int(self, read_ext, df_ref): df_ref = df_ref.reindex(columns=["A", "B", "C"]) # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): with ignore_xlrd_time_clock_warning(): - df1 = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, usecols=3) + df1 = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=3 + ) # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], - index_col=0, usecols=3) + df2 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3 + ) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) @@ -99,11 +112,13 @@ def test_usecols_int(self, read_ext, df_ref): def test_usecols_list(self, read_ext, df_ref): - df_ref = df_ref.reindex(columns=['B', 'C']) - df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = pd.read_excel('test1' + 
read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) + df_ref = df_ref.reindex(columns=["B", "C"]) + df1 = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=[0, 2, 3] + ) + df2 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=[0, 2, 3] + ) # TODO add index to xls file) tm.assert_frame_equal(df1, df_ref, check_names=False) @@ -111,50 +126,45 @@ def test_usecols_list(self, read_ext, df_ref): def test_usecols_str(self, read_ext, df_ref): - df1 = df_ref.reindex(columns=['A', 'B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A:D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') + df1 = df_ref.reindex(columns=["A", "B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A:D" + ) # TODO add index to xls, read xls ignores index name ? tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C,D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C,D" + ) # TODO add index to xls file tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - df1 = df_ref.reindex(columns=['B', 'C']) - df2 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') + df1 = df_ref.reindex(columns=["B", "C"]) + df2 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols="A,C:D") + df3 = pd.read_excel( + "test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols="A,C:D" + ) tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - @pytest.mark.parametrize("usecols", [ - [0, 1, 3], [0, 3, 1], - [1, 0, 3], [1, 3, 0], - [3, 0, 1], [3, 1, 0], - ]) - def test_usecols_diff_positional_int_columns_order( - self, read_ext, usecols, df_ref): + @pytest.mark.parametrize( + "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]] + ) + def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols, df_ref): expected = df_ref[["A", "C"]] - result = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, usecols=usecols) + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols=usecols + ) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.parametrize("usecols", [ - ["B", "D"], ["D", "B"] - ]) - def test_usecols_diff_positional_str_columns_order( - self, read_ext, usecols, df_ref): + @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]]) + def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref): expected = df_ref[["B", "D"]] expected.index = range(len(expected)) @@ -168,8 +178,9 @@ def test_read_excel_without_slicing(self, read_ext, df_ref): def test_usecols_excel_range_str(self, read_ext, df_ref): expected = df_ref[["C", "D"]] - result = pd.read_excel("test1" + read_ext, "Sheet1", - index_col=0, 
usecols="A,D:E") + result = pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=0, usecols="A,D:E" + ) tm.assert_frame_equal(result, expected, check_names=False) def test_usecols_excel_range_str_invalid(self, read_ext): @@ -182,188 +193,215 @@ def test_index_col_label_error(self, read_ext): msg = "list indices must be integers.*, not str" with pytest.raises(TypeError, match=msg): - pd.read_excel("test1" + read_ext, "Sheet1", index_col=["A"], - usecols=["A", "C"]) + pd.read_excel( + "test1" + read_ext, "Sheet1", index_col=["A"], usecols=["A", "C"] + ) def test_index_col_empty(self, read_ext): # see gh-9208 - result = pd.read_excel("test1" + read_ext, "Sheet3", - index_col=["A", "B", "C"]) - expected = DataFrame(columns=["D", "E", "F"], - index=MultiIndex(levels=[[]] * 3, - codes=[[]] * 3, - names=["A", "B", "C"])) + result = pd.read_excel("test1" + read_ext, "Sheet3", index_col=["A", "B", "C"]) + expected = DataFrame( + columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]), + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("index_col", [None, 2]) def test_index_col_with_unnamed(self, read_ext, index_col): # see gh-18792 - result = pd.read_excel( - "test1" + read_ext, "Sheet4", index_col=index_col) - expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], - columns=["Unnamed: 0", "col1", "col2"]) + result = pd.read_excel("test1" + read_ext, "Sheet4", index_col=index_col) + expected = DataFrame( + [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"] + ) if index_col: expected = expected.set_index(expected.columns[index_col]) tm.assert_frame_equal(result, expected) def test_usecols_pass_non_existent_column(self, read_ext): - msg = ("Usecols do not match columns, " - "columns expected but not found: " + r"\['E'\]") + msg = ( + "Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]" + ) with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, usecols=["E"]) def test_usecols_wrong_type(self, read_ext): - msg = ("'usecols' must either be list-like of " - "all strings, all unicode, all integers or a callable.") + msg = ( + "'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable." 
+ ) with pytest.raises(ValueError, match=msg): pd.read_excel("test1" + read_ext, usecols=["E1", 0]) def test_excel_stop_iterator(self, read_ext): - parsed = pd.read_excel('test2' + read_ext, 'Sheet1') - expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + parsed = pd.read_excel("test2" + read_ext, "Sheet1") + expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"]) tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self, read_ext): - parsed = pd.read_excel('test3' + read_ext, 'Sheet1') - expected = DataFrame([[np.nan]], columns=['Test']) + parsed = pd.read_excel("test3" + read_ext, "Sheet1") + expected = DataFrame([[np.nan]], columns=["Test"]) tm.assert_frame_equal(parsed, expected) def test_excel_table(self, read_ext, df_ref): - df1 = pd.read_excel('test1' + read_ext, 'Sheet1', index_col=0) - df2 = pd.read_excel('test1' + read_ext, 'Sheet2', skiprows=[1], - index_col=0) + df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0) + df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0) # TODO add index to file tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - df3 = pd.read_excel( - 'test1' + read_ext, 'Sheet1', index_col=0, skipfooter=1) + df3 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) def test_reader_special_dtypes(self, read_ext): - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, 4, 0]), - ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), - ("BoolCol", [True, False, True, True, False]), - ("StrCol", [1, 2, 3, 4, 5]), - # GH5394 - this is why convert_float isn't vectorized - ("Str2Col", ["a", 3, "c", "d", "e"]), - ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), - datetime(1905, 1, 1), datetime(2013, 12, 14), - datetime(2015, 3, 14)]) - ])) - basename = 'test_types' + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ( + "DateCol", + [ + datetime(2013, 10, 30), + datetime(2013, 10, 31), + datetime(1905, 1, 1), + datetime(2013, 12, 14), + datetime(2015, 3, 14), + ], + ), + ] + ) + ) + basename = "test_types" # should read in correctly and infer types - actual = pd.read_excel(basename + read_ext, 'Sheet1') + actual = pd.read_excel(basename + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = pd.read_excel( - basename + read_ext, 'Sheet1', convert_float=False) + actual = pd.read_excel(basename + read_ext, "Sheet1", convert_float=False) tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = pd.read_excel( - basename + read_ext, 'Sheet1', index_col=icol) + actual = pd.read_excel(basename + read_ext, "Sheet1", index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = pd.read_excel(basename + read_ext, 
'Sheet1', - converters={"StrCol": str}) + actual = pd.read_excel( + basename + read_ext, "Sheet1", converters={"StrCol": str} + ) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) actual = pd.read_excel( - basename + read_ext, 'Sheet1', - convert_float=False, converters={"StrCol": str}) + basename + read_ext, + "Sheet1", + convert_float=False, + converters={"StrCol": str}, + ) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values def test_reader_converters(self, read_ext): - basename = 'test_converters' - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, -1000, 0]), - ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), - ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), - ("StrCol", ['1', np.nan, '3', '4', '5']), - ])) - - converters = {'IntCol': lambda x: int(x) if x != '' else -1000, - 'FloatCol': lambda x: 10 * x if x else np.nan, - 2: lambda x: 'Found' if x != '' else 'Not found', - 3: lambda x: str(x) if x else '', - } + basename = "test_converters" + + expected = DataFrame.from_dict( + OrderedDict( + [ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ["Found", "Found", "Found", "Not found", "Found"]), + ("StrCol", ["1", np.nan, "3", "4", "5"]), + ] + ) + ) + + converters = { + "IntCol": lambda x: int(x) if x != "" else -1000, + "FloatCol": lambda x: 10 * x if x else np.nan, + 2: lambda x: "Found" if x != "" else "Not found", + 3: lambda x: str(x) if x else "", + } # should read in correctly and set types of single cells (not array # dtypes) - actual = pd.read_excel( - basename + read_ext, 'Sheet1', converters=converters) + actual = pd.read_excel(basename + read_ext, "Sheet1", converters=converters) tm.assert_frame_equal(actual, expected) def test_reader_dtype(self, read_ext): # GH 8212 - basename = 'testdtype' + basename = "testdtype" actual = pd.read_excel(basename + read_ext) - expected = DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [2.5, 3.5, 4.5, 5.5], - 'c': [1, 2, 3, 4], - 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( - columns=['a', 'b', 'c', 'd']) + expected = DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ).reindex(columns=["a", "b", "c", "d"]) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(basename + read_ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str}) + actual = pd.read_excel( + basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str} + ) - expected['a'] = expected['a'].astype('float64') - expected['b'] = expected['b'].astype('float32') - expected['c'] = ['001', '002', '003', '004'] + expected["a"] = expected["a"].astype("float64") + expected["b"] = expected["b"].astype("float32") + expected["c"] = ["001", "002", "003", "004"] tm.assert_frame_equal(actual, expected) with pytest.raises(ValueError): - pd.read_excel(basename + read_ext, dtype={'d': 'int64'}) - - @pytest.mark.parametrize("dtype,expected", [ - (None, - DataFrame({ - "a": [1, 2, 3, 4], - "b": [2.5, 3.5, 4.5, 5.5], - "c": [1, 2, 3, 4], - "d": [1.0, 2.0, np.nan, 4.0] - })), - ({"a": "float64", - "b": "float32", - "c": str, - "d": str - }, - DataFrame({ - "a": Series([1, 2, 3, 4], dtype="float64"), - "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": ["001", "002", "003", "004"], - "d": ["1", "2", np.nan, "4"] - })), - ]) + pd.read_excel(basename + read_ext, 
dtype={"d": "int64"}) + + @pytest.mark.parametrize( + "dtype,expected", + [ + ( + None, + DataFrame( + { + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0], + } + ), + ), + ( + {"a": "float64", "b": "float32", "c": str, "d": str}, + DataFrame( + { + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"], + } + ), + ), + ], + ) def test_reader_dtype_str(self, read_ext, dtype, expected): # see gh-20377 basename = "testdtype" @@ -375,10 +413,10 @@ def test_reading_all_sheets(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. # See PR #9450 - basename = 'test_multisheet' + basename = "test_multisheet" dfs = pd.read_excel(basename + read_ext, sheet_name=None) # ensure this is not alphabetical to test order preservation - expected_keys = ['Charlie', 'Alpha', 'Beta'] + expected_keys = ["Charlie", "Alpha", "Beta"] tm.assert_contains_all(expected_keys, dfs.keys()) # Issue 9930 # Ensure sheet order is preserved @@ -390,9 +428,9 @@ def test_reading_multiple_specific_sheets(self, read_ext): # references (positions/names) are removed properly. # Ensure a dict is returned # See PR #9450 - basename = 'test_multisheet' + basename = "test_multisheet" # Explicitly request duplicates. Only the set should be returned. - expected_keys = [2, 'Charlie', 'Charlie'] + expected_keys = [2, "Charlie", "Charlie"] dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys) expected_keys = list(set(expected_keys)) tm.assert_contains_all(expected_keys, dfs.keys()) @@ -402,81 +440,86 @@ def test_reading_all_sheets_with_blank(self, read_ext): # Test reading all sheetnames by setting sheetname to None, # In the case where some sheets are blank. 
# Issue #11711 - basename = 'blank_with_header' + basename = "blank_with_header" dfs = pd.read_excel(basename + read_ext, sheet_name=None) - expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] + expected_keys = ["Sheet1", "Sheet2", "Sheet3"] tm.assert_contains_all(expected_keys, dfs.keys()) # GH6403 def test_read_excel_blank(self, read_ext): - actual = pd.read_excel('blank' + read_ext, 'Sheet1') + actual = pd.read_excel("blank" + read_ext, "Sheet1") tm.assert_frame_equal(actual, DataFrame()) def test_read_excel_blank_with_header(self, read_ext): - expected = DataFrame(columns=['col_1', 'col_2']) - actual = pd.read_excel('blank_with_header' + read_ext, 'Sheet1') + expected = DataFrame(columns=["col_1", "col_2"]) + actual = pd.read_excel("blank_with_header" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) def test_date_conversion_overflow(self, read_ext): # GH 10001 : pandas.ExcelFile ignore parse_dates=False - expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], - [pd.Timestamp('2016-03-16'), 'Jack Black'], - [1e+20, 'Timothy Brown']], - columns=['DateColWithBigInt', 'StringCol']) - - if pd.read_excel.keywords['engine'] == 'openpyxl': + expected = pd.DataFrame( + [ + [pd.Timestamp("2016-03-12"), "Marc Johnson"], + [pd.Timestamp("2016-03-16"), "Jack Black"], + [1e20, "Timothy Brown"], + ], + columns=["DateColWithBigInt", "StringCol"], + ) + + if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") - result = pd.read_excel('testdateoverflow' + read_ext) + result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) def test_sheet_name(self, read_ext, df_ref): filename = "test1" sheet_name = "Sheet1" - df1 = pd.read_excel(filename + read_ext, - sheet_name=sheet_name, index_col=0) # doc + df1 = pd.read_excel( + filename + read_ext, sheet_name=sheet_name, index_col=0 + ) # doc with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel(filename + read_ext, index_col=0, - sheet_name=sheet_name) + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) def test_excel_read_buffer(self, read_ext): - pth = 'test1' + read_ext - expected = pd.read_excel(pth, 'Sheet1', index_col=0) - with open(pth, 'rb') as f: - actual = pd.read_excel(f, 'Sheet1', index_col=0) + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0) + with open(pth, "rb") as f: + actual = pd.read_excel(f, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_bad_engine_raises(self, read_ext): - bad_engine = 'foo' + bad_engine = "foo" with pytest.raises(ValueError, match="Unknown engine: foo"): - pd.read_excel('', engine=bad_engine) + pd.read_excel("", engine=bad_engine) @tm.network def test_read_from_http_url(self, read_ext): - if read_ext == '.ods': # TODO: remove once on master + if read_ext == ".ods": # TODO: remove once on master pytest.skip() - url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + read_ext) + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/test1" + read_ext + ) url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + read_ext) + local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py - 
with open('test1' + read_ext, "rb") as f: - s3_resource.Bucket("pandas-test").put_object( - Key="test1" + read_ext, Body=f) + with open("test1" + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) - url = ('s3://pandas-test/test1' + read_ext) + url = "s3://pandas-test/test1" + read_ext url_table = pd.read_excel(url) - local_table = pd.read_excel('test1' + read_ext) + local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) @pytest.mark.slow @@ -485,16 +528,16 @@ def test_read_from_s3_url(self, read_ext, s3_resource): def test_read_from_file_url(self, read_ext, datapath): # FILE - localtable = os.path.join(datapath("io", "data"), 'test1' + read_ext) + localtable = os.path.join(datapath("io", "data"), "test1" + read_ext) local_table = pd.read_excel(localtable) try: - url_table = pd.read_excel('file://localhost/' + localtable) + url_table = pd.read_excel("file://localhost/" + localtable) except URLError: # fails on some systems import platform - pytest.skip("failing on %s" % - ' '.join(platform.uname()).strip()) + + pytest.skip("failing on %s" % " ".join(platform.uname()).strip()) tm.assert_frame_equal(url_table, local_table) @@ -503,47 +546,53 @@ def test_read_from_pathlib_path(self, read_ext): # GH12655 from pathlib import Path - str_path = 'test1' + read_ext - expected = pd.read_excel(str_path, 'Sheet1', index_col=0) + str_path = "test1" + read_ext + expected = pd.read_excel(str_path, "Sheet1", index_col=0) - path_obj = Path('test1' + read_ext) - actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = Path("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_read_from_py_localpath(self, read_ext): # GH12655 from py.path import local as LocalPath - str_path = os.path.join('test1' + read_ext) - expected = pd.read_excel(str_path, 'Sheet1', index_col=0) + str_path = os.path.join("test1" + read_ext) + expected = pd.read_excel(str_path, "Sheet1", index_col=0) - path_obj = LocalPath().join('test1' + read_ext) - actual = pd.read_excel(path_obj, 'Sheet1', index_col=0) + path_obj = LocalPath().join("test1" + read_ext) + actual = pd.read_excel(path_obj, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_reader_seconds(self, read_ext): # Test reading times with and without milliseconds. GH5945. 
- expected = DataFrame.from_dict({"Time": [time(1, 2, 3), - time(2, 45, 56, 100000), - time(4, 29, 49, 200000), - time(6, 13, 42, 300000), - time(7, 57, 35, 400000), - time(9, 41, 28, 500000), - time(11, 25, 21, 600000), - time(13, 9, 14, 700000), - time(14, 53, 7, 800000), - time(16, 37, 0, 900000), - time(18, 20, 54)]}) - - actual = pd.read_excel('times_1900' + read_ext, 'Sheet1') + expected = DataFrame.from_dict( + { + "Time": [ + time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54), + ] + } + ) + + actual = pd.read_excel("times_1900" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('times_1904' + read_ext, 'Sheet1') + actual = pd.read_excel("times_1904" + read_ext, "Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, read_ext): @@ -552,14 +601,17 @@ def test_read_excel_multiindex(self, read_ext): mi_file = "testmultiindex" + read_ext # "mi_column" sheet - expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], - [2, 3.5, pd.Timestamp("2015-01-02"), False], - [3, 4.5, pd.Timestamp("2015-01-03"), False], - [4, 5.5, pd.Timestamp("2015-01-04"), True]], - columns=mi) - - actual = pd.read_excel( - mi_file, "mi_column", header=[0, 1], index_col=0) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=mi, + ) + + actual = pd.read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # "mi_index" sheet @@ -572,45 +624,40 @@ def test_read_excel_multiindex(self, read_ext): # "both" sheet expected.columns = mi - actual = pd.read_excel( - mi_file, "both", index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) # "mi_index_name" sheet expected.columns = ["a", "b", "c", "d"] expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = pd.read_excel( - mi_file, "mi_index_name", index_col=[0, 1]) + actual = pd.read_excel(mi_file, "mi_index_name", index_col=[0, 1]) tm.assert_frame_equal(actual, expected) # "mi_column_name" sheet expected.index = list(range(4)) expected.columns = mi.set_names(["c1", "c2"]) - actual = pd.read_excel(mi_file, "mi_column_name", - header=[0, 1], index_col=0) + actual = pd.read_excel(mi_file, "mi_column_name", header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # see gh-11317 # "name_with_int" sheet - expected.columns = mi.set_levels( - [1, 2], level=1).set_names(["c1", "c2"]) + expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"]) - actual = pd.read_excel(mi_file, "name_with_int", - index_col=0, header=[0, 1]) + actual = pd.read_excel(mi_file, "name_with_int", index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_name" sheet expected.columns = mi.set_names(["c1", "c2"]) expected.index = mi.set_names(["ilvl1", "ilvl2"]) - actual = pd.read_excel(mi_file, "both_name", - index_col=[0, 1], header=[0, 1]) + actual = pd.read_excel(mi_file, "both_name", index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) # "both_skiprows" sheet - actual = pd.read_excel(mi_file, "both_name_skiprows", 
index_col=[0, 1], - header=[0, 1], skiprows=2) + actual = pd.read_excel( + mi_file, "both_name_skiprows", index_col=[0, 1], header=[0, 1], skiprows=2 + ) tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex_header_only(self, read_ext): @@ -631,21 +678,28 @@ def test_excel_old_index_format(self, read_ext): # We detect headers to determine if index names exist, so # that "index" name in the "names" version of the data will # now be interpreted as rows that include null data. - data = np.array([[None, None, None, None, None], - ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + data = np.array( + [ + [None, None, None, None, None], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", - "R_l0_g2", "R_l0_g3", "R_l0_g4"], - ["R1", "R_l1_g0", "R_l1_g1", - "R_l1_g2", "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], - names=[None, None]) - si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) + mi = MultiIndex( + levels=[ + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None], + ) + si = Index( + ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None + ) expected = pd.DataFrame(data, index=si, columns=columns) @@ -659,20 +713,25 @@ def test_excel_old_index_format(self, read_ext): # The analogous versions of the "names" version data # where there are explicitly no names for the indices. 
- data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + data = np.array( + [ + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"], + ] + ) columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], - ["R_l1_g0", "R_l1_g1", "R_l1_g2", - "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=[None, None]) - si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) + mi = MultiIndex( + levels=[ + ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"], + ], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None], + ) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None) expected = pd.DataFrame(data, index=si, columns=columns) @@ -688,154 +747,167 @@ def test_read_excel_bool_header_arg(self, read_ext): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - pd.read_excel('test1' + read_ext, header=arg) + pd.read_excel("test1" + read_ext, header=arg) def test_read_excel_chunksize(self, read_ext): # GH 8011 with pytest.raises(NotImplementedError): - pd.read_excel('test1' + read_ext, chunksize=100) + pd.read_excel("test1" + read_ext, chunksize=100) def test_read_excel_skiprows_list(self, read_ext): # GH 4903 - actual = pd.read_excel('testskiprows' + read_ext, - 'skiprows_list', skiprows=[0, 2]) - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns=['a', 'b', 'c', 'd']) + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=[0, 2] + ) + expected = DataFrame( + [ + [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel('testskiprows' + read_ext, - 'skiprows_list', skiprows=np.array([0, 2])) + actual = pd.read_excel( + "testskiprows" + read_ext, "skiprows_list", skiprows=np.array([0, 2]) + ) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 - actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) - expected = pd.read_excel('test1' + read_ext) + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) + expected = pd.read_excel("test1" + read_ext) expected = expected[:num_rows_to_pull] tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext): # GH 16645 - expected = pd.read_excel('test1' + read_ext) + expected = pd.read_excel("test1" + read_ext) num_records_in_file = len(expected) num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel('test1' + read_ext, nrows=num_rows_to_pull) + actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull) tm.assert_frame_equal(actual, expected) def test_read_excel_nrows_non_integer_parameter(self, read_ext): # 
GH 16645 msg = "'nrows' must be an integer >=0" with pytest.raises(ValueError, match=msg): - pd.read_excel('test1' + read_ext, nrows='5') + pd.read_excel("test1" + read_ext, nrows="5") def test_read_excel_squeeze(self, read_ext): # GH 12157 - f = 'test_squeeze' + read_ext + f = "test_squeeze" + read_ext - actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) - expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') - expected.index.name = 'a' + actual = pd.read_excel(f, "two_columns", index_col=0, squeeze=True) + expected = pd.Series([2, 3, 4], [4, 5, 6], name="b") + expected.index.name = "a" tm.assert_series_equal(actual, expected) - actual = pd.read_excel(f, 'two_columns', squeeze=True) - expected = pd.DataFrame({'a': [4, 5, 6], - 'b': [2, 3, 4]}) + actual = pd.read_excel(f, "two_columns", squeeze=True) + expected = pd.DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]}) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(f, 'one_column', squeeze=True) - expected = pd.Series([1, 2, 3], name='a') + actual = pd.read_excel(f, "one_column", squeeze=True) + expected = pd.Series([1, 2, 3], name="a") tm.assert_series_equal(actual, expected) class TestExcelFileRead: - @pytest.fixture(autouse=True) def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. """ - if engine == 'odf' and read_ext != '.ods': + if engine == "odf" and read_ext != ".ods": pytest.skip() if read_ext == ".ods" and engine != "odf": pytest.skip() - if engine == 'openpyxl' and read_ext == '.xls': + if engine == "openpyxl" and read_ext == ".xls": pytest.skip() func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data")) - monkeypatch.setattr(pd, 'ExcelFile', func) + monkeypatch.setattr(pd, "ExcelFile", func) def test_excel_passes_na(self, read_ext): - with pd.ExcelFile('test4' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - with pd.ExcelFile('test4' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test4" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) # 13967 - with pd.ExcelFile('test5' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=False, na_values=["apple"] + ) + expected = DataFrame( + [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - with pd.ExcelFile('test5' + read_ext) as excel: - parsed = pd.read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], 
[np.nan], [np.nan], ['rabbit']], - columns=['Test']) + with pd.ExcelFile("test5" + read_ext) as excel: + parsed = pd.read_excel( + excel, "Sheet1", keep_default_na=True, na_values=["apple"] + ) + expected = DataFrame( + [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"] + ) tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols']) + @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"]) def test_unexpected_kwargs_raises(self, read_ext, arg): # gh-17964 - kwarg = {arg: 'Sheet1'} + kwarg = {arg: "Sheet1"} msg = "unexpected keyword argument `{}`".format(arg) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: with pytest.raises(TypeError, match=msg): pd.read_excel(excel, **kwarg) def test_excel_table_sheet_by_index(self, read_ext, df_ref): - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df1 = pd.read_excel(excel, 0, index_col=0) df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df1 = excel.parse(0, index_col=0) df2 = excel.parse(1, skiprows=[1], index_col=0) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - raise_on_extra_warnings=False): - with pd.ExcelFile('test1' + read_ext) as excel: + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): + with pd.ExcelFile("test1" + read_ext) as excel: df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) tm.assert_frame_equal(df3, df4) - with pd.ExcelFile('test1' + read_ext) as excel: + with pd.ExcelFile("test1" + read_ext) as excel: df3 = excel.parse(0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) @@ -848,27 +920,26 @@ def test_sheet_name(self, read_ext, df_ref): df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc with pd.ExcelFile(filename + read_ext) as excel: - df2_parse = excel.parse(index_col=0, - sheet_name=sheet_name) + df2_parse = excel.parse(index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) def test_excel_read_buffer(self, engine, read_ext): - pth = 'test1' + read_ext - expected = pd.read_excel(pth, 'Sheet1', index_col=0, engine=engine) + pth = "test1" + read_ext + expected = pd.read_excel(pth, "Sheet1", index_col=0, engine=engine) - with open(pth, 'rb') as f: + with open(pth, "rb") as f: with pd.ExcelFile(f) as xls: - actual = pd.read_excel(xls, 'Sheet1', index_col=0) + actual = pd.read_excel(xls, "Sheet1", index_col=0) tm.assert_frame_equal(expected, actual) def test_reader_closes_file(self, engine, read_ext): - f = open('test1' + read_ext, 'rb') + f = open("test1" + read_ext, "rb") with pd.ExcelFile(f) as xlsx: # parses okay - pd.read_excel(xlsx, 'Sheet1', index_col=0, engine=engine) + pd.read_excel(xlsx, "Sheet1", index_col=0, engine=engine) assert f.closed @@ -878,4 +949,4 @@ def test_conflicting_excel_engines(self, 
read_ext): with pd.ExcelFile("test1" + read_ext) as xl: with pytest.raises(ValueError, match=msg): - pd.read_excel(xl, engine='foo') + pd.read_excel(xl, engine="foo") diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index d8971777f6eb4..76b27bce11b08 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -8,34 +8,48 @@ from pandas.io.formats.excel import ExcelFormatter -@pytest.mark.parametrize('engine', [ - pytest.param('xlwt', - marks=pytest.mark.xfail(reason='xlwt does not support ' - 'openpyxl-compatible ' - 'style dicts')), - 'xlsxwriter', - 'openpyxl', -]) +@pytest.mark.parametrize( + "engine", + [ + pytest.param( + "xlwt", + marks=pytest.mark.xfail( + reason="xlwt does not support " "openpyxl-compatible " "style dicts" + ), + ), + "xlsxwriter", + "openpyxl", + ], +) def test_styler_to_excel(engine): def style(df): # XXX: RGB colors not supported in xlwt - return DataFrame([['font-weight: bold', '', ''], - ['', 'color: blue', ''], - ['', '', 'text-decoration: underline'], - ['border-style: solid', '', ''], - ['', 'font-style: italic', ''], - ['', '', 'text-align: right'], - ['background-color: red', '', ''], - ['number-format: 0%', '', ''], - ['', '', ''], - ['', '', ''], - ['', '', '']], - index=df.index, columns=df.columns) + return DataFrame( + [ + ["font-weight: bold", "", ""], + ["", "color: blue", ""], + ["", "", "text-decoration: underline"], + ["border-style: solid", "", ""], + ["", "font-style: italic", ""], + ["", "", "text-align: right"], + ["background-color: red", "", ""], + ["number-format: 0%", "", ""], + ["", "", ""], + ["", "", ""], + ["", "", ""], + ], + index=df.index, + columns=df.columns, + ) def assert_equal_style(cell1, cell2, engine): - if engine in ['xlsxwriter', 'openpyxl']: - pytest.xfail(reason=("GH25351: failing on some attribute " - "comparisons in {}".format(engine))) + if engine in ["xlsxwriter", "openpyxl"]: + pytest.xfail( + reason=( + "GH25351: failing on some attribute " + "comparisons in {}".format(engine) + ) + ) # XXX: should find a better way to check equality assert cell1.alignment.__dict__ == cell2.alignment.__dict__ assert cell1.border.__dict__ == cell2.border.__dict__ @@ -46,36 +60,36 @@ def assert_equal_style(cell1, cell2, engine): def custom_converter(css): # use bold iff there is custom style attached to the cell - if css.strip(' \n;'): - return {'font': {'bold': True}} + if css.strip(" \n;"): + return {"font": {"bold": True}} return {} - pytest.importorskip('jinja2') + pytest.importorskip("jinja2") pytest.importorskip(engine) # Prepare spreadsheets df = DataFrame(np.random.randn(11, 3)) - with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path: + with ensure_clean(".xlsx" if engine != "xlwt" else ".xls") as path: writer = ExcelWriter(path, engine=engine) - df.to_excel(writer, sheet_name='frame') - df.style.to_excel(writer, sheet_name='unstyled') + df.to_excel(writer, sheet_name="frame") + df.style.to_excel(writer, sheet_name="unstyled") styled = df.style.apply(style, axis=None) - styled.to_excel(writer, sheet_name='styled') + styled.to_excel(writer, sheet_name="styled") ExcelFormatter(styled, style_converter=custom_converter).write( - writer, sheet_name='custom') + writer, sheet_name="custom" + ) writer.save() - if engine not in ('openpyxl', 'xlsxwriter'): + if engine not in ("openpyxl", "xlsxwriter"): # For other engines, we only smoke test return - openpyxl = pytest.importorskip('openpyxl') + openpyxl = pytest.importorskip("openpyxl") wb = 
openpyxl.load_workbook(path) # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['unstyled'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["unstyled"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): assert cell1.value == cell2.value @@ -88,47 +102,47 @@ def custom_converter(css): # (2) check styling with default converter # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF - alpha = '00' if engine == 'openpyxl' else 'FF' + alpha = "00" if engine == "openpyxl" else "FF" n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['styled'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["styled"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) + ref = "%s%d" % (cell2.column, cell2.row) # XXX: this isn't as strong a test as ideal; we should # confirm that differences are exclusive - if ref == 'B2': + if ref == "B2": assert not cell1.font.bold assert cell2.font.bold - elif ref == 'C3': + elif ref == "C3": assert cell1.font.color.rgb != cell2.font.color.rgb - assert cell2.font.color.rgb == alpha + '0000FF' - elif ref == 'D4': + assert cell2.font.color.rgb == alpha + "0000FF" + elif ref == "D4": assert cell1.font.underline != cell2.font.underline - assert cell2.font.underline == 'single' - elif ref == 'B5': + assert cell2.font.underline == "single" + elif ref == "B5": assert not cell1.border.left.style - assert (cell2.border.top.style == - cell2.border.right.style == - cell2.border.bottom.style == - cell2.border.left.style == - 'medium') - elif ref == 'C6': + assert ( + cell2.border.top.style + == cell2.border.right.style + == cell2.border.bottom.style + == cell2.border.left.style + == "medium" + ) + elif ref == "C6": assert not cell1.font.italic assert cell2.font.italic - elif ref == 'D7': - assert (cell1.alignment.horizontal != - cell2.alignment.horizontal) - assert cell2.alignment.horizontal == 'right' - elif ref == 'B8': + elif ref == "D7": + assert cell1.alignment.horizontal != cell2.alignment.horizontal + assert cell2.alignment.horizontal == "right" + elif ref == "B8": assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb assert cell1.fill.patternType != cell2.fill.patternType - assert cell2.fill.fgColor.rgb == alpha + 'FF0000' - assert cell2.fill.patternType == 'solid' - elif ref == 'B9': - assert cell1.number_format == 'General' - assert cell2.number_format == '0%' + assert cell2.fill.fgColor.rgb == alpha + "FF0000" + assert cell2.fill.patternType == "solid" + elif ref == "B9": + assert cell1.number_format == "General" + assert cell2.number_format == "0%" else: assert_equal_style(cell1, cell2, engine) @@ -139,12 +153,11 @@ def custom_converter(css): # (3) check styling with custom converter n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['custom'].columns): + for col1, col2 in zip(wb["frame"].columns, wb["custom"].columns): assert len(col1) == len(col2) for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) - if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'): + ref = "%s%d" % (cell2.column, cell2.row) + if ref in ("B2", "C3", "D4", "B5", "C6", "D7", "B8", "B9"): assert not cell1.font.bold assert cell2.font.bold else: diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index d65bebe16804c..cf26b20e5d004 100644 --- a/pandas/tests/io/excel/test_writers.py +++ 
b/pandas/tests/io/excel/test_writers.py @@ -16,29 +16,28 @@ from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf from pandas.io.excel import ( - ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, - register_writer) + ExcelFile, + ExcelWriter, + _OpenpyxlWriter, + _XlsxWriter, + _XlwtWriter, + register_writer, +) -@td.skip_if_no('xlrd') -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +@td.skip_if_no("xlrd") +@pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) class TestRoundTrip: - @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], + ) def test_read_one_empty_col_no_header(self, ext, header, expected): # xref gh-12292 filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with ensure_clean(ext) as path: df.to_excel(path, filename, index=False, header=False) @@ -48,46 +47,41 @@ def test_read_one_empty_col_no_header(self, ext, header, expected): @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) + @pytest.mark.parametrize( + "header,expected", + [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], + ) def test_read_one_empty_col_with_header(self, ext, header, expected): filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) + df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) + df.to_excel(path, "with_header", index=False, header=True) result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') + @td.skip_if_no("openpyxl") + @td.skip_if_no("xlwt") def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) + refdf = pd.DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) with ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) + refdf.to_excel(writer, "Data_no_head", header=False, index=False) + refdf.to_excel(writer, "Data_with_head", index=False) - refdf.columns = ['A', 'B'] + refdf.columns = ["A", "B"] with ExcelFile(pth) as reader: - xlsdf_no_head = pd.read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) + xlsdf_no_head = pd.read_excel( + reader, "Data_no_head", header=None, names=["A", "B"] + ) xlsdf_with_head = pd.read_excel( - reader, 'Data_with_head', index_col=None, names=['A', 'B']) + reader, "Data_with_head", index_col=None, names=["A", "B"] + ) tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) @@ -122,37 +116,45 @@ def tdf(col_sheet_name): def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with ensure_clean(ext) as path: - df = DataFrame({ - ("One", 
"x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) + df = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0}, + } + ) + + expected = DataFrame( + { + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0}, + } + ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) + df = pd.DataFrame( + { + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) + + expected = pd.DataFrame( + { + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7}, + } + ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) @@ -163,37 +165,47 @@ def test_read_excel_multiindex_empty_level(self, ext): @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels): + def test_excel_multindex_roundtrip( + self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels + ): # see gh-4679 with ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") + pytest.skip( + "Column index name cannot be " "serialized unless it's a MultiIndex" + ) # Empty name case current read in as # unnamed levels, not Nones. 
check_names = r_idx_names or r_idx_levels <= 1 - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) + df = mkdf(5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels) df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) + act = pd.read_excel( + pth, + index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels)), + ) tm.assert_frame_equal(df, act, check_names=check_names) @td.skip_if_no("xlwt") @@ -201,8 +213,8 @@ def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) + {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} + ) df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") @@ -216,13 +228,13 @@ def test_read_excel_parse_dates(self, ext): tm.assert_frame_equal(df, res) date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = pd.read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) + res = pd.read_excel( + pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 + ) tm.assert_frame_equal(df, res) class _WriterBase: - @pytest.fixture(autouse=True) def set_engine_and_path(self, engine, ext): """Fixture to set engine and open file for use in each test case @@ -241,7 +253,7 @@ def set_engine_and_path(self, engine, ext): class and any subclasses, on account of the `autouse=True` argument """ - option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) + option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) prev_engine = get_option(option_name) set_option(option_name, engine) with ensure_clean(ext) as path: @@ -250,21 +262,24 @@ class and any subclasses, on account of the `autouse=True` set_option(option_name, prev_engine) # Roll back option change -@td.skip_if_no('xlrd') -@pytest.mark.parametrize("engine,ext", [ - pytest.param('openpyxl', '.xlsx', marks=td.skip_if_no('openpyxl')), - pytest.param('openpyxl', '.xlsm', marks=td.skip_if_no('openpyxl')), - pytest.param('xlwt', '.xls', marks=td.skip_if_no('xlwt')), - pytest.param('xlsxwriter', '.xlsx', marks=td.skip_if_no('xlsxwriter')) -]) +@td.skip_if_no("xlrd") +@pytest.mark.parametrize( + "engine,ext", + [ + pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), + pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), + pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), + ], +) class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. 
def test_excel_sheet_size(self, engine, ext): # GH 26080 - breaking_row_count = 2**20 + 1 - breaking_col_count = 2**14 + 1 + breaking_row_count = 2 ** 20 + 1 + breaking_col_count = 2 ** 14 + 1 # purposely using two arrays to prevent memory issues while testing row_arr = np.zeros(shape=(breaking_row_count, 1)) col_arr = np.zeros(shape=(1, breaking_col_count)) @@ -308,45 +323,42 @@ def test_excel_writer_context_manager(self, frame, engine, ext): def test_roundtrip(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # test roundtrip - frame.to_excel(self.path, 'test1') - recons = pd.read_excel(self.path, 'test1', index_col=0) + frame.to_excel(self.path, "test1") + recons = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', index=False) - recons = pd.read_excel(self.path, 'test1', index_col=None) + frame.to_excel(self.path, "test1", index=False) + recons = pd.read_excel(self.path, "test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', na_rep='NA') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=['NA']) + frame.to_excel(self.path, "test1", na_rep="NA") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 - frame.to_excel(self.path, 'test1', na_rep='88') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=['88']) + frame.to_excel(self.path, "test1", na_rep="88") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, 'test1', na_rep='88') - recons = pd.read_excel( - self.path, 'test1', index_col=0, na_values=[88, 88.0]) + frame.to_excel(self.path, "test1", na_rep="88") + recons = pd.read_excel(self.path, "test1", index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(frame, recons) # GH 6573 - frame.to_excel(self.path, 'Sheet1') + frame.to_excel(self.path, "Sheet1") recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(frame, recons) - frame.to_excel(self.path, '0') + frame.to_excel(self.path, "0") recons = pd.read_excel(self.path, index_col=0) tm.assert_frame_equal(frame, recons) @@ -358,11 +370,11 @@ def test_roundtrip(self, engine, ext, frame): def test_mixed(self, engine, ext, frame): mixed_frame = frame.copy() - mixed_frame['foo'] = 'bar' + mixed_frame["foo"] = "bar" - mixed_frame.to_excel(self.path, 'test1') + mixed_frame.to_excel(self.path, "test1") reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) def test_ts_frame(self, tsframe, engine, ext): @@ -376,19 +388,17 @@ def test_ts_frame(self, tsframe, engine, ext): def test_basics_with_nan(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', 
index=False) - - @pytest.mark.parametrize("np_type", [ - np.int8, np.int16, np.int32, np.int64]) + frame["A"][:5] = nan + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) + + @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, engine, ext, np_type): # Test np.int values read come back as int # (rather than float which is Excel's format). - df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) + df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -402,14 +412,12 @@ def test_int_types(self, engine, ext, np_type): # Test with convert_float=False comes back as float. float_frame = df.astype(float) - recons = pd.read_excel(self.path, "test1", - convert_float=False, index_col=0) - tm.assert_frame_equal(recons, float_frame, - check_index_type=False, - check_column_type=False) - - @pytest.mark.parametrize("np_type", [ - np.float16, np.float32, np.float64]) + recons = pd.read_excel(self.path, "test1", convert_float=False, index_col=0) + tm.assert_frame_equal( + recons, float_frame, check_index_type=False, check_column_type=False + ) + + @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, engine, ext, np_type): # Test np.float values read come back as float. df = DataFrame(np.random.random_sample(10), dtype=np_type) @@ -423,7 +431,7 @@ def test_float_types(self, engine, ext, np_type): @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, engine, ext, np_type): # Test np.bool values read come back as float. 
- df = (DataFrame([1, 0, True, False], dtype=np_type)) + df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -442,102 +450,99 @@ def test_inf_roundtrip(self, engine, ext): def test_sheets(self, engine, ext, frame, tsframe): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # Test writing to separate sheets writer = ExcelWriter(self.path) - frame.to_excel(writer, 'test1') - tsframe.to_excel(writer, 'test2') + frame.to_excel(writer, "test1") + tsframe.to_excel(writer, "test2") writer.save() reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=0) + recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(frame, recons) - recons = pd.read_excel(reader, 'test2', index_col=0) + recons = pd.read_excel(reader, "test2", index_col=0) tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) - assert 'test1' == reader.sheet_names[0] - assert 'test2' == reader.sheet_names[1] + assert "test1" == reader.sheet_names[0] + assert "test2" == reader.sheet_names[1] def test_colaliases(self, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) - frame.to_excel(self.path, 'test1', header=col_aliases) + col_aliases = Index(["AA", "X", "Y", "Z"]) + frame.to_excel(self.path, "test1", header=col_aliases) reader = ExcelFile(self.path) - rs = pd.read_excel(reader, 'test1', index_col=0) + rs = pd.read_excel(reader, "test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) def test_roundtrip_indexlabels(self, merge_cells, engine, ext, frame): frame = frame.copy() - frame['A'][:5] = nan + frame["A"][:5] = nan - frame.to_excel(self.path, 'test1') - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', index=False) + frame.to_excel(self.path, "test1") + frame.to_excel(self.path, "test1", columns=["A", "B"]) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", index=False) # test index_label - df = (DataFrame(np.random.randn(10, 2)) >= 0) - df.to_excel(self.path, 'test1', - index_label=['test'], - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(self.path, "test1", index_label=["test"], merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] assert df.index.names == recons.index.names - df = (DataFrame(np.random.randn(10, 2)) >= 
0) - df.to_excel(self.path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel( + self.path, + "test1", + index_label=["test", "dummy", "dummy2"], + merge_cells=merge_cells, + ) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] assert df.index.names == recons.index.names - df = (DataFrame(np.random.randn(10, 2)) >= 0) - df.to_excel(self.path, - 'test1', - index_label='test', - merge_cells=merge_cells) + df = DataFrame(np.random.randn(10, 2)) >= 0 + df.to_excel(self.path, "test1", index_label="test", merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel( - reader, 'test1', index_col=0).astype(np.int64) - df.index.names = ['test'] + recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) + df.index.names = ["test"] tm.assert_frame_equal(df, recons.astype(bool)) - frame.to_excel(self.path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=merge_cells) + frame.to_excel( + self.path, + "test1", + columns=["A", "B", "C", "D"], + index=False, + merge_cells=merge_cells, + ) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') df = frame.copy() - df = df.set_index(['A', 'B']) + df = df.set_index(["A", "B"]) reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons, check_less_precise=True) def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): df = DataFrame(np.random.randn(10, 4)) - df.index.name = 'foo' + df.index.name = "foo" df.to_excel(self.path, merge_cells=merge_cells) @@ -545,7 +550,7 @@ def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) - assert result.index.name == 'foo' + assert result.index.name == "foo" def test_excel_roundtrip_datetime(self, merge_cells, tsframe, engine, ext): # datetime.date, not sure what to test here exactly @@ -563,22 +568,30 @@ def test_excel_date_datetime_format(self, engine, ext): # see gh-4133 # # Excel output format strings - df = DataFrame([[date(2014, 1, 31), - date(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) - df_expected = DataFrame([[datetime(2014, 1, 31), - datetime(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) + df = DataFrame( + [ + [date(2014, 1, 31), date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) + df_expected = DataFrame( + [ + [datetime(2014, 1, 31), datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], + ], + index=["DATE", "DATETIME"], + columns=["X", "Y"], + ) with ensure_clean(ext) as filename2: writer1 = ExcelWriter(self.path) - writer2 = ExcelWriter(filename2, - date_format="DD.MM.YYYY", - datetime_format="DD.MM.YYYY HH-MM-SS") + writer2 = ExcelWriter( + filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS", + ) df.to_excel(writer1, "test1") df.to_excel(writer2, "test1") @@ -602,8 +615,7 @@ def 
test_to_excel_interval_no_labels(self, engine, ext): # see gh-19242 # # Test writing Interval without labels. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() df["new"] = pd.cut(df[0], 10) @@ -619,11 +631,11 @@ def test_to_excel_interval_labels(self, engine, ext): # see gh-19242 # # Test writing Interval with labels. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) + df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() - intervals = pd.cut(df[0], 10, labels=["A", "B", "C", "D", "E", - "F", "G", "H", "I", "J"]) + intervals = pd.cut( + df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] + ) df["new"] = intervals expected["new"] = pd.Series(list(intervals)) @@ -637,13 +649,15 @@ def test_to_excel_timedelta(self, engine, ext): # see gh-19242, gh-9155 # # Test writing timedelta to xls. - df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=["A"], dtype=np.int64) + df = DataFrame( + np.random.randint(-10, 10, size=(20, 1)), columns=["A"], dtype=np.int64 + ) expected = df.copy() df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( - lambda x: timedelta(seconds=x).total_seconds() / float(86400)) + lambda x: timedelta(seconds=x).total_seconds() / float(86400) + ) df.to_excel(self.path, "test1") reader = ExcelFile(self.path) @@ -652,35 +666,34 @@ def test_to_excel_timedelta(self, engine, ext): tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, engine, ext, tsframe): - xp = tsframe.resample('M', kind='period').mean() + xp = tsframe.resample("M", kind="period").mean() - xp.to_excel(self.path, 'sht1') + xp.to_excel(self.path, "sht1") reader = ExcelFile(self.path) - rs = pd.read_excel(reader, 'sht1', index_col=0) - tm.assert_frame_equal(xp, rs.to_period('M')) + rs = pd.read_excel(reader, "sht1", index_col=0) + tm.assert_frame_equal(xp, rs.to_period("M")) def test_to_excel_multiindex(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', columns=['A', 'B']) + frame.to_excel(self.path, "test1", header=False) + frame.to_excel(self.path, "test1", columns=["A", "B"]) # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + frame.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - df = pd.read_excel(reader, 'test1', index_col=[0, 1]) + df = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): - df = pd.DataFrame({'A': [None, 2, 3], - 'B': [10, 20, 30], - 'C': np.random.sample(3)}) - df = df.set_index(['A', 'B']) + df = pd.DataFrame( + {"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)} + ) + df = df.set_index(["A", "B"]) df.to_excel(self.path, merge_cells=merge_cells) df1 = pd.read_excel(self.path, index_col=[0, 1]) @@ -691,46 +704,42 @@ def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): # merge_cells def test_to_excel_multiindex_cols(self, merge_cells, engine, ext, frame): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - 
new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) + new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), - (50, 1), (50, 2)]) + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) frame.columns = new_cols_index header = [0, 1] if not merge_cells: header = 0 # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + frame.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - df = pd.read_excel(reader, 'test1', header=header, index_col=[0, 1]) + df = pd.read_excel(reader, "test1", header=header, index_col=[0, 1]) if not merge_cells: - fm = frame.columns.format(sparsify=False, - adjoin=False, names=False) + fm = frame.columns.format(sparsify=False, adjoin=False, names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) - def test_to_excel_multiindex_dates( - self, merge_cells, engine, ext, tsframe): + def test_to_excel_multiindex_dates(self, merge_cells, engine, ext, tsframe): # try multiindex with dates new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) - tsframe.index.names = ['time', 'foo'] - tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) + tsframe.index.names = ["time", "foo"] + tsframe.to_excel(self.path, "test1", merge_cells=merge_cells) reader = ExcelFile(self.path) - recons = pd.read_excel(reader, 'test1', index_col=[0, 1]) + recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) - assert recons.index.names == ('time', 'foo') + assert recons.index.names == ("time", "foo") def test_to_excel_multiindex_no_write_index(self, engine, ext): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. - frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) + frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) # Add a MI. frame2 = frame1.copy() @@ -738,40 +747,44 @@ def test_to_excel_multiindex_no_write_index(self, engine, ext): frame2.index = multi_index # Write out to Excel without the index. - frame2.to_excel(self.path, 'test1', index=False) + frame2.to_excel(self.path, "test1", index=False) # Read it back in. reader = ExcelFile(self.path) - frame3 = pd.read_excel(reader, 'test1') + frame3 = pd.read_excel(reader, "test1") # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) def test_to_excel_float_format(self, engine, ext): - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) df.to_excel(self.path, "test1", float_format="%.2f") reader = ExcelFile(self.path) result = pd.read_excel(reader, "test1", index_col=0) - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, engine, ext): # Avoid mixed inferred_type. 
- df = DataFrame([["\u0192", "\u0193", "\u0194"], - ["\u0195", "\u0196", "\u0197"]], - index=["A\u0192", "B"], - columns=["X\u0193", "Y", "Z"]) + df = DataFrame( + [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], + index=["A\u0192", "B"], + columns=["X\u0193", "Y", "Z"], + ) with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") - result = pd.read_excel(filename, "TestSheet", - encoding="utf8", index_col=0) + result = pd.read_excel(filename, "TestSheet", encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, engine, ext): @@ -783,17 +796,21 @@ def test_to_excel_unicode_filename(self, engine, ext): else: f.close() - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) + df = DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) result = pd.read_excel(reader, "test1", index_col=0) - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) tm.assert_frame_equal(result, expected) # def test_to_excel_header_styling_xls(self, engine, ext): @@ -899,12 +916,13 @@ def test_to_excel_unicode_filename(self, engine, ext): @pytest.mark.parametrize("use_headers", [True, False]) @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) - def test_excel_010_hemstring(self, merge_cells, engine, ext, - c_idx_nlevels, r_idx_nlevels, use_headers): - + def test_excel_010_hemstring( + self, merge_cells, engine, ext, c_idx_nlevels, r_idx_nlevels, use_headers + ): def roundtrip(data, header=True, parser_hdr=0, index=True): - data.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) + data.to_excel( + self.path, header=header, merge_cells=merge_cells, index=index + ) xf = ExcelFile(self.path) return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) @@ -921,11 +939,13 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): ncols = 3 from pandas.util.testing import makeCustomDataframe as mkdf + # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 - df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, - c_idx_nlevels=c_idx_nlevels) + df = mkdf( + nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels + ) # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. @@ -948,29 +968,28 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): def test_duplicated_columns(self, engine, ext): # see gh-5235 - df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B"]) + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) df.to_excel(self.path, "test1") - expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B.1"]) + expected = DataFrame( + [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] + ) # By default, we mangle. result = pd.read_excel(self.path, "test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. 
- result = pd.read_excel(self.path, "test1", index_col=0, - mangle_dupe_cols=True) + result = pd.read_excel(self.path, "test1", index_col=0, mangle_dupe_cols=True) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A", "B"]) + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(self.path, "test1") result = pd.read_excel(self.path, "test1", index_col=0) - expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A.1", "B.1"]) + expected = DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] + ) tm.assert_frame_equal(result, expected) # see gh-10982 @@ -982,27 +1001,23 @@ def test_duplicated_columns(self, engine, ext): msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): - pd.read_excel( - self.path, "test1", header=None, mangle_dupe_cols=False) + pd.read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, engine, ext): # Test for issue #5427. - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) - write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) + write_frame.to_excel(self.path, "test1", columns=["B", "A"]) - read_frame = pd.read_excel(self.path, 'test1', header=0) + read_frame = pd.read_excel(self.path, "test1", header=0) - tm.assert_series_equal(write_frame['A'], read_frame['A']) - tm.assert_series_equal(write_frame['B'], read_frame['B']) + tm.assert_series_equal(write_frame["A"], read_frame["A"]) + tm.assert_series_equal(write_frame["B"], read_frame["B"]) def test_invalid_columns(self, engine, ext): # see gh-10982 - write_frame = DataFrame({"A": [1, 1, 1], - "B": [2, 2, 2]}) + write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): write_frame.to_excel(self.path, "test1", columns=["B", "C"]) expected = write_frame.reindex(columns=["B", "C"]) @@ -1018,8 +1033,7 @@ def test_comment_arg(self, engine, ext): # Test the comment argument functionality to pd.read_excel. # Create file to read in. - df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(self.path, "test_c") # Read file without comment arg. @@ -1037,13 +1051,12 @@ def test_comment_default(self, engine, ext): # Test the comment argument default to pd.read_excel # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") # Read file with default and explicit comment=None - result1 = pd.read_excel(self.path, 'test_c') - result2 = pd.read_excel(self.path, 'test_c', comment=None) + result1 = pd.read_excel(self.path, "test_c") + result2 = pd.read_excel(self.path, "test_c", comment=None) tm.assert_frame_equal(result1, result2) def test_comment_used(self, engine, ext): @@ -1052,13 +1065,11 @@ def test_comment_used(self, engine, ext): # Test the comment argument is working as expected when used. # Create file to read in. 
- df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) + df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(self.path, "test_c") # Test read_frame_comment against manually produced expected output. - expected = DataFrame({"A": ["one", None, "one"], - "B": ["two", None, None]}) + expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) result = pd.read_excel(self.path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) @@ -1066,34 +1077,36 @@ def test_comment_empty_line(self, engine, ext): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file - df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) df.to_excel(self.path, index=False) # Test that all-comment lines at EoF are ignored - expected = DataFrame({'a': [1], 'b': [2]}) - result = pd.read_excel(self.path, comment='#') + expected = DataFrame({"a": [1], "b": [2]}) + result = pd.read_excel(self.path, comment="#") tm.assert_frame_equal(result, expected) def test_datetimes(self, engine, ext): # Test writing and reading datetimes. For issue #9139. (xref #9185) - datetimes = [datetime(2013, 1, 13, 1, 2, 3), - datetime(2013, 1, 13, 2, 45, 56), - datetime(2013, 1, 13, 4, 29, 49), - datetime(2013, 1, 13, 6, 13, 42), - datetime(2013, 1, 13, 7, 57, 35), - datetime(2013, 1, 13, 9, 41, 28), - datetime(2013, 1, 13, 11, 25, 21), - datetime(2013, 1, 13, 13, 9, 14), - datetime(2013, 1, 13, 14, 53, 7), - datetime(2013, 1, 13, 16, 37, 0), - datetime(2013, 1, 13, 18, 20, 52)] - - write_frame = DataFrame({'A': datetimes}) - write_frame.to_excel(self.path, 'Sheet1') - read_frame = pd.read_excel(self.path, 'Sheet1', header=0) - - tm.assert_series_equal(write_frame['A'], read_frame['A']) + datetimes = [ + datetime(2013, 1, 13, 1, 2, 3), + datetime(2013, 1, 13, 2, 45, 56), + datetime(2013, 1, 13, 4, 29, 49), + datetime(2013, 1, 13, 6, 13, 42), + datetime(2013, 1, 13, 7, 57, 35), + datetime(2013, 1, 13, 9, 41, 28), + datetime(2013, 1, 13, 11, 25, 21), + datetime(2013, 1, 13, 13, 9, 14), + datetime(2013, 1, 13, 14, 53, 7), + datetime(2013, 1, 13, 16, 37, 0), + datetime(2013, 1, 13, 18, 20, 52), + ] + + write_frame = DataFrame({"A": datetimes}) + write_frame.to_excel(self.path, "Sheet1") + read_frame = pd.read_excel(self.path, "Sheet1", header=0) + + tm.assert_series_equal(write_frame["A"], read_frame["A"]) def test_bytes_io(self, engine, ext): # see gh-7074 @@ -1111,9 +1124,13 @@ def test_bytes_io(self, engine, ext): def test_write_lists_dict(self, engine, ext): # see gh-8188. 
- df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], - "numeric": [1, 2, 3.0], - "str": ["apple", "banana", "cherry"]}) + df = DataFrame( + { + "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"], + } + ) df.to_excel(self.path, "Sheet1") read = pd.read_excel(self.path, "Sheet1", header=0, index_col=0) @@ -1129,8 +1146,9 @@ def test_true_and_false_value_options(self, engine, ext): expected = df.replace({"foo": True, "bar": False}) df.to_excel(self.path) - read_frame = pd.read_excel(self.path, true_values=["foo"], - false_values=["bar"], index_col=0) + read_frame = pd.read_excel( + self.path, true_values=["foo"], false_values=["bar"], index_col=0 + ) tm.assert_frame_equal(read_frame, expected) def test_freeze_panes(self, engine, ext): @@ -1146,8 +1164,7 @@ def test_path_path_lib(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): @@ -1155,32 +1172,36 @@ def test_path_local_path(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) + result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, engine, merge_cells, ext): # see GH-27006 - mi = MultiIndex.from_tuples([(pd.Period('2018'), pd.Period('2018Q1')), - (pd.Period('2018'), pd.Period('2018Q2'))]) + mi = MultiIndex.from_tuples( + [ + (pd.Period("2018"), pd.Period("2018Q1")), + (pd.Period("2018"), pd.Period("2018Q2")), + ] + ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(self.path) - result = pd.read_excel(self.path, header=[0, 1], - index_col=0, convert_float=False) + result = pd.read_excel( + self.path, header=[0, 1], index_col=0, convert_float=False + ) # need to convert PeriodIndexes to standard Indexes for assert equal - expected.columns.set_levels([[str(i) for i in mi.levels[0]], - [str(i) for i in mi.levels[1]]], - level=[0, 1], - inplace=True) + expected.columns.set_levels( + [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], + level=[0, 1], + inplace=True, + ) expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('dtype', [None, object]) - def test_raise_when_saving_timezones(self, engine, ext, dtype, - tz_aware_fixture): + @pytest.mark.parametrize("dtype", [None, object]) + def test_raise_when_saving_timezones(self, engine, ext, dtype, tz_aware_fixture): # GH 27008, GH 7056 tz = tz_aware_fixture - data = pd.Timestamp('2019', tz=tz) + data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(self.path) @@ -1192,25 +1213,26 @@ def test_raise_when_saving_timezones(self, engine, ext, dtype, class TestExcelWriterEngineTests: - - @pytest.mark.parametrize('klass,ext', [ - pytest.param(_XlsxWriter, '.xlsx', marks=td.skip_if_no('xlsxwriter')), - pytest.param( - _OpenpyxlWriter, '.xlsx', marks=td.skip_if_no('openpyxl')), - pytest.param(_XlwtWriter, '.xls', marks=td.skip_if_no('xlwt')) - ]) + @pytest.mark.parametrize( + "klass,ext", + [ + 
pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), + pytest.param(_XlwtWriter, ".xls", marks=td.skip_if_no("xlwt")), + ], + ) def test_ExcelWriter_dispatch(self, klass, ext): with ensure_clean(ext) as path: writer = ExcelWriter(path) - if ext == '.xlsx' and td.safe_import('xlsxwriter'): + if ext == ".xlsx" and td.safe_import("xlsxwriter"): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): - with pytest.raises(ValueError, match='No engine'): - ExcelWriter('nothing') + with pytest.raises(ValueError, match="No engine"): + ExcelWriter("nothing") def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works @@ -1220,8 +1242,8 @@ def test_register_writer(self): class DummyClass(ExcelWriter): called_save = False called_write_cells = False - supported_extensions = ['xlsx', 'xls'] - engine = 'dummy' + supported_extensions = ["xlsx", "xls"] + engine = "dummy" def save(self): called_save.append(True) @@ -1236,24 +1258,21 @@ def check_called(func): del called_save[:] del called_write_cells[:] - with pd.option_context('io.excel.xlsx.writer', 'dummy'): + with pd.option_context("io.excel.xlsx.writer", "dummy"): register_writer(DummyClass) - writer = ExcelWriter('something.xlsx') + writer = ExcelWriter("something.xlsx") assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) - check_called(lambda: df.to_excel('something.xlsx')) - check_called( - lambda: df.to_excel( - 'something.xls', engine='dummy')) + check_called(lambda: df.to_excel("something.xlsx")) + check_called(lambda: df.to_excel("something.xls", engine="dummy")) -@td.skip_if_no('xlrd') -@td.skip_if_no('openpyxl') -@pytest.mark.skipif(not PY36, reason='requires fspath') +@td.skip_if_no("xlrd") +@td.skip_if_no("openpyxl") +@pytest.mark.skipif(not PY36, reason="requires fspath") class TestFSPath: - def test_excelfile_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: + with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) xl = ExcelFile(path) @@ -1261,6 +1280,6 @@ def test_excelfile_fspath(self): assert result == path def test_excelwriter_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: + with tm.ensure_clean("foo.xlsx") as path: writer = ExcelWriter(path) assert os.fspath(writer) == str(path) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index d749f0ec3e252..c4d99c827318d 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -30,14 +30,13 @@ def test_read_xlrd_book(read_ext, frame): result = pd.read_excel(xl, sheet_name, index_col=0) tm.assert_frame_equal(df, result) - result = pd.read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) + result = pd.read_excel(book, sheet_name=sheet_name, engine=engine, index_col=0) tm.assert_frame_equal(df, result) # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): - path = datapath("io", "data", 'test1{}'.format(read_ext)) + path = datapath("io", "data", "test1{}".format(read_ext)) with pd.ExcelFile(path) as excel: with pytest.raises(xlrd.XLRDError): - pd.read_excel(excel, 'asdf') + pd.read_excel(excel, "asdf") diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 391a1085161f0..4dae3db2e7abd 100644 --- 
a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -9,7 +9,7 @@ xlsxwriter = pytest.importorskip("xlsxwriter") -pytestmark = pytest.mark.parametrize("ext", ['.xlsx']) +pytestmark = pytest.mark.parametrize("ext", [".xlsx"]) def test_column_format(ext): @@ -21,33 +21,32 @@ def test_column_format(ext): openpyxl = pytest.importorskip("openpyxl") with ensure_clean(ext) as path: - frame = DataFrame({'A': [123456, 123456], - 'B': [123456, 123456]}) + frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]}) writer = ExcelWriter(path) frame.to_excel(writer) # Add a number format to col B and ensure it is applied to cells. - num_format = '#,##0' + num_format = "#,##0" write_workbook = writer.book write_worksheet = write_workbook.worksheets()[0] - col_format = write_workbook.add_format({'num_format': num_format}) - write_worksheet.set_column('B:B', None, col_format) + col_format = write_workbook.add_format({"num_format": num_format}) + write_worksheet.set_column("B:B", None, col_format) writer.save() read_workbook = openpyxl.load_workbook(path) try: - read_worksheet = read_workbook['Sheet1'] + read_worksheet = read_workbook["Sheet1"] except TypeError: # compat - read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') + read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1") # Get the number format from the cell. try: - cell = read_worksheet['B2'] + cell = read_worksheet["B2"] except TypeError: # compat - cell = read_worksheet.cell('B2') + cell = read_worksheet.cell("B2") try: read_num_format = cell.number_format @@ -62,4 +61,4 @@ def test_write_append_mode_raises(ext): with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine='xlsxwriter', mode='a') + ExcelWriter(f, engine="xlsxwriter", mode="a") diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index 9c687f1f514f9..51f94a2f01b84 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -9,14 +9,14 @@ xlwt = pytest.importorskip("xlwt") -pytestmark = pytest.mark.parametrize("ext,", ['.xls']) +pytestmark = pytest.mark.parametrize("ext,", [".xls"]) def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): # MultiIndex as columns is not yet implemented 9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = DataFrame(np.random.randn(10, 3), columns=cols) with pytest.raises(NotImplementedError): with ensure_clean(ext) as path: @@ -24,9 +24,9 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(ext): def test_excel_multiindex_columns_and_index_true(ext): - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = pd.DataFrame(np.random.randn(10, 3), columns=cols) with ensure_clean(ext) as path: df.to_excel(path, index=True) @@ -34,21 +34,20 @@ def test_excel_multiindex_columns_and_index_true(ext): def test_excel_multiindex_index(ext): # MultiIndex as index works so assert no error #9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + cols = MultiIndex.from_tuples( + [("site", ""), ("2014", "height"), ("2014", "weight")] + ) df = DataFrame(np.random.randn(3, 10), index=cols) with ensure_clean(ext) as path: df.to_excel(path, index=False) def 
test_to_excel_styleconverter(ext): - hstyle = {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", "vertical": "top"}} + hstyle = { + "font": {"bold": True}, + "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}, + } xls_style = _XlwtWriter._convert_to_style(hstyle) assert xls_style.font.bold @@ -65,4 +64,4 @@ def test_write_append_mode_raises(ext): with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine='xlwt', mode='a') + ExcelWriter(f, engine="xlwt", mode="a") diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py index 2f012c4d01912..f4bee99296a83 100644 --- a/pandas/tests/io/formats/test_console.py +++ b/pandas/tests/io/formats/test_console.py @@ -9,6 +9,7 @@ class MockEncoding: # TODO(py27): replace with mock side effect is a str in nature, the value will be returned. Otherwise, the side effect should be an exception that will be raised. """ + def __init__(self, encoding): super().__init__() self.val = encoding @@ -25,50 +26,45 @@ def raise_or_return(val): raise val -@pytest.mark.parametrize('empty,filled', [ - ['stdin', 'stdout'], - ['stdout', 'stdin'] -]) +@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]]) def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled): # Ensures that when sys.stdout.encoding or sys.stdin.encoding is used when # they have values filled. # GH 21552 with monkeypatch.context() as context: - context.setattr('sys.{}'.format(empty), MockEncoding('')) - context.setattr('sys.{}'.format(filled), MockEncoding(filled)) + context.setattr("sys.{}".format(empty), MockEncoding("")) + context.setattr("sys.{}".format(filled), MockEncoding(filled)) assert detect_console_encoding() == filled -@pytest.mark.parametrize('encoding', [ - AttributeError, - IOError, - 'ascii' -]) +@pytest.mark.parametrize("encoding", [AttributeError, IOError, "ascii"]) def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): # GH 21552 with monkeypatch.context() as context: - context.setattr('locale.getpreferredencoding', lambda: 'foo') - context.setattr('sys.stdout', MockEncoding(encoding)) - assert detect_console_encoding() == 'foo' + context.setattr("locale.getpreferredencoding", lambda: "foo") + context.setattr("sys.stdout", MockEncoding(encoding)) + assert detect_console_encoding() == "foo" -@pytest.mark.parametrize('std,locale', [ - ['ascii', 'ascii'], - ['ascii', Exception], - [AttributeError, 'ascii'], - [AttributeError, Exception], - [IOError, 'ascii'], - [IOError, Exception] -]) +@pytest.mark.parametrize( + "std,locale", + [ + ["ascii", "ascii"], + ["ascii", Exception], + [AttributeError, "ascii"], + [AttributeError, Exception], + [IOError, "ascii"], + [IOError, Exception], + ], +) def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): # When both the stdout/stdin encoding and locale preferred encoding checks # fail (or return 'ascii', we should default to the sys default encoding. 
# GH 21552 with monkeypatch.context() as context: context.setattr( - 'locale.getpreferredencoding', - lambda: MockEncoding.raise_or_return(locale) + "locale.getpreferredencoding", lambda: MockEncoding.raise_or_return(locale) ) - context.setattr('sys.stdout', MockEncoding(std)) - context.setattr('sys.getdefaultencoding', lambda: 'sysDefaultEncoding') - assert detect_console_encoding() == 'sysDefaultEncoding' + context.setattr("sys.stdout", MockEncoding(std)) + context.setattr("sys.getdefaultencoding", lambda: "sysDefaultEncoding") + assert detect_console_encoding() == "sysDefaultEncoding" diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index f251bd983509e..74e78b033bac9 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -18,35 +18,44 @@ def assert_same_resolution(css1, css2, inherited=None): assert resolved1 == resolved2 -@pytest.mark.parametrize('name,norm,abnorm', [ - ('whitespace', 'hello: world; foo: bar', - ' \t hello \t :\n world \n ; \n foo: \tbar\n\n'), - ('case', 'hello: world; foo: bar', 'Hello: WORLD; foO: bar'), - ('empty-decl', 'hello: world; foo: bar', - '; hello: world;; foo: bar;\n; ;'), - ('empty-list', '', ';'), -]) +@pytest.mark.parametrize( + "name,norm,abnorm", + [ + ( + "whitespace", + "hello: world; foo: bar", + " \t hello \t :\n world \n ; \n foo: \tbar\n\n", + ), + ("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"), + ("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"), + ("empty-list", "", ";"), + ], +) def test_css_parse_normalisation(name, norm, abnorm): assert_same_resolution(norm, abnorm) @pytest.mark.parametrize( - 'invalid_css,remainder', [ + "invalid_css,remainder", + [ # No colon - ('hello-world', ''), - ('border-style: solid; hello-world', 'border-style: solid'), - ('border-style: solid; hello-world; font-weight: bold', - 'border-style: solid; font-weight: bold'), + ("hello-world", ""), + ("border-style: solid; hello-world", "border-style: solid"), + ( + "border-style: solid; hello-world; font-weight: bold", + "border-style: solid; font-weight: bold", + ), # Unclosed string fail # Invalid size - ('font-size: blah', 'font-size: 1em'), - ('font-size: 1a2b', 'font-size: 1em'), - ('font-size: 1e5pt', 'font-size: 1em'), - ('font-size: 1+6pt', 'font-size: 1em'), - ('font-size: 1unknownunit', 'font-size: 1em'), - ('font-size: 10', 'font-size: 1em'), - ('font-size: 10 pt', 'font-size: 1em'), - ]) + ("font-size: blah", "font-size: 1em"), + ("font-size: 1a2b", "font-size: 1em"), + ("font-size: 1e5pt", "font-size: 1em"), + ("font-size: 1+6pt", "font-size: 1em"), + ("font-size: 1unknownunit", "font-size: 1em"), + ("font-size: 10", "font-size: 1em"), + ("font-size: 10 pt", "font-size: 1em"), + ], +) def test_css_parse_invalid(invalid_css, remainder): with tm.assert_produces_warning(CSSWarning): assert_same_resolution(invalid_css, remainder) @@ -55,59 +64,89 @@ def test_css_parse_invalid(invalid_css, remainder): @pytest.mark.parametrize( - 'shorthand,expansions', - [('margin', ['margin-top', 'margin-right', - 'margin-bottom', 'margin-left']), - ('padding', ['padding-top', 'padding-right', - 'padding-bottom', 'padding-left']), - ('border-width', ['border-top-width', 'border-right-width', - 'border-bottom-width', 'border-left-width']), - ('border-color', ['border-top-color', 'border-right-color', - 'border-bottom-color', 'border-left-color']), - ('border-style', ['border-top-style', 'border-right-style', - 'border-bottom-style', 'border-left-style']), - ]) + 
"shorthand,expansions", + [ + ("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]), + ("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]), + ( + "border-width", + [ + "border-top-width", + "border-right-width", + "border-bottom-width", + "border-left-width", + ], + ), + ( + "border-color", + [ + "border-top-color", + "border-right-color", + "border-bottom-color", + "border-left-color", + ], + ), + ( + "border-style", + [ + "border-top-style", + "border-right-style", + "border-bottom-style", + "border-left-style", + ], + ), + ], +) def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions - assert_resolves('{shorthand}: 1pt'.format(shorthand=shorthand), - {top: '1pt', right: '1pt', - bottom: '1pt', left: '1pt'}) + assert_resolves( + "{shorthand}: 1pt".format(shorthand=shorthand), + {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '1pt', left: '4pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt 2pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '2pt', left: '4pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt 2pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"}, + ) - assert_resolves('{shorthand}: 1pt 4pt 2pt 0pt'.format(shorthand=shorthand), - {top: '1pt', right: '4pt', - bottom: '2pt', left: '0pt'}) + assert_resolves( + "{shorthand}: 1pt 4pt 2pt 0pt".format(shorthand=shorthand), + {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, + ) with tm.assert_produces_warning(CSSWarning): assert_resolves( - '{shorthand}: 1pt 1pt 1pt 1pt 1pt'.format(shorthand=shorthand), {}) - - -@pytest.mark.parametrize('style,inherited,equiv', [ - ('margin: 1px; margin: 2px', '', - 'margin: 2px'), - ('margin: 1px', 'margin: 2px', - 'margin: 1px'), - ('margin: 1px; margin: inherit', 'margin: 2px', - 'margin: 2px'), - ('margin: 1px; margin-top: 2px', '', - 'margin-left: 1px; margin-right: 1px; ' + - 'margin-bottom: 1px; margin-top: 2px'), - ('margin-top: 2px', 'margin: 1px', - 'margin: 1px; margin-top: 2px'), - ('margin: 1px', 'margin-top: 2px', - 'margin: 1px'), - ('margin: 1px; margin-top: inherit', 'margin: 2px', - 'margin: 1px; margin-top: 2px'), -]) + "{shorthand}: 1pt 1pt 1pt 1pt 1pt".format(shorthand=shorthand), {} + ) + + +@pytest.mark.parametrize( + "style,inherited,equiv", + [ + ("margin: 1px; margin: 2px", "", "margin: 2px"), + ("margin: 1px", "margin: 2px", "margin: 1px"), + ("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"), + ( + "margin: 1px; margin-top: 2px", + "", + "margin-left: 1px; margin-right: 1px; " + + "margin-bottom: 1px; margin-top: 2px", + ), + ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"), + ("margin: 1px", "margin-top: 2px", "margin: 1px"), + ( + "margin: 1px; margin-top: inherit", + "margin: 2px", + "margin: 1px; margin-top: 2px", + ), + ], +) def test_css_precedence(style, inherited, equiv): resolve = CSSResolver() inherited_props = resolve(inherited) @@ -116,72 +155,86 @@ def test_css_precedence(style, inherited, equiv): assert style_props == equiv_props -@pytest.mark.parametrize('style,equiv', [ - ('margin: 1px; margin-top: inherit', - 'margin-bottom: 1px; margin-right: 1px; margin-left: 1px'), - ('margin-top: inherit', ''), - ('margin-top: initial', ''), -]) 
+@pytest.mark.parametrize( + "style,equiv", + [ + ( + "margin: 1px; margin-top: inherit", + "margin-bottom: 1px; margin-right: 1px; margin-left: 1px", + ), + ("margin-top: inherit", ""), + ("margin-top: initial", ""), + ], +) def test_css_none_absent(style, equiv): assert_same_resolution(style, equiv) -@pytest.mark.parametrize('size,resolved', [ - ('xx-small', '6pt'), - ('x-small', '{pt:f}pt'.format(pt=7.5)), - ('small', '{pt:f}pt'.format(pt=9.6)), - ('medium', '12pt'), - ('large', '{pt:f}pt'.format(pt=13.5)), - ('x-large', '18pt'), - ('xx-large', '24pt'), - - ('8px', '6pt'), - ('1.25pc', '15pt'), - ('.25in', '18pt'), - ('02.54cm', '72pt'), - ('25.4mm', '72pt'), - ('101.6q', '72pt'), - ('101.6q', '72pt'), -]) -@pytest.mark.parametrize('relative_to', # invariant to inherited size - [None, '16pt']) +@pytest.mark.parametrize( + "size,resolved", + [ + ("xx-small", "6pt"), + ("x-small", "{pt:f}pt".format(pt=7.5)), + ("small", "{pt:f}pt".format(pt=9.6)), + ("medium", "12pt"), + ("large", "{pt:f}pt".format(pt=13.5)), + ("x-large", "18pt"), + ("xx-large", "24pt"), + ("8px", "6pt"), + ("1.25pc", "15pt"), + (".25in", "18pt"), + ("02.54cm", "72pt"), + ("25.4mm", "72pt"), + ("101.6q", "72pt"), + ("101.6q", "72pt"), + ], +) +@pytest.mark.parametrize("relative_to", [None, "16pt"]) # invariant to inherited size def test_css_absolute_font_size(size, relative_to, resolved): if relative_to is None: inherited = None else: - inherited = {'font-size': relative_to} - assert_resolves('font-size: {size}'.format(size=size), - {'font-size': resolved}, inherited=inherited) - - -@pytest.mark.parametrize('size,relative_to,resolved', [ - ('1em', None, '12pt'), - ('1.0em', None, '12pt'), - ('1.25em', None, '15pt'), - ('1em', '16pt', '16pt'), - ('1.0em', '16pt', '16pt'), - ('1.25em', '16pt', '20pt'), - ('1rem', '16pt', '12pt'), - ('1.0rem', '16pt', '12pt'), - ('1.25rem', '16pt', '15pt'), - ('100%', None, '12pt'), - ('125%', None, '15pt'), - ('100%', '16pt', '16pt'), - ('125%', '16pt', '20pt'), - ('2ex', None, '12pt'), - ('2.0ex', None, '12pt'), - ('2.50ex', None, '15pt'), - ('inherit', '16pt', '16pt'), - - ('smaller', None, '10pt'), - ('smaller', '18pt', '15pt'), - ('larger', None, '{pt:f}pt'.format(pt=14.4)), - ('larger', '15pt', '18pt'), -]) + inherited = {"font-size": relative_to} + assert_resolves( + "font-size: {size}".format(size=size), + {"font-size": resolved}, + inherited=inherited, + ) + + +@pytest.mark.parametrize( + "size,relative_to,resolved", + [ + ("1em", None, "12pt"), + ("1.0em", None, "12pt"), + ("1.25em", None, "15pt"), + ("1em", "16pt", "16pt"), + ("1.0em", "16pt", "16pt"), + ("1.25em", "16pt", "20pt"), + ("1rem", "16pt", "12pt"), + ("1.0rem", "16pt", "12pt"), + ("1.25rem", "16pt", "15pt"), + ("100%", None, "12pt"), + ("125%", None, "15pt"), + ("100%", "16pt", "16pt"), + ("125%", "16pt", "20pt"), + ("2ex", None, "12pt"), + ("2.0ex", None, "12pt"), + ("2.50ex", None, "15pt"), + ("inherit", "16pt", "16pt"), + ("smaller", None, "10pt"), + ("smaller", "18pt", "15pt"), + ("larger", None, "{pt:f}pt".format(pt=14.4)), + ("larger", "15pt", "18pt"), + ], +) def test_css_relative_font_size(size, relative_to, resolved): if relative_to is None: inherited = None else: - inherited = {'font-size': relative_to} - assert_resolves('font-size: {size}'.format(size=size), - {'font-size': resolved}, inherited=inherited) + inherited = {"font-size": relative_to} + assert_resolves( + "font-size: {size}".format(size=size), + {"font-size": resolved}, + inherited=inherited, + ) diff --git 
a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index fc9886bec766f..b122e4f6c3f33 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -8,35 +8,32 @@ class TestEngFormatter: - def test_eng_float_formatter(self): - df = DataFrame({'A': [1.41, 141., 14100, 1410000.]}) + df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]}) fmt.set_eng_float_format() result = df.to_string() - expected = (' A\n' - '0 1.410E+00\n' - '1 141.000E+00\n' - '2 14.100E+03\n' - '3 1.410E+06') + expected = ( + " A\n" + "0 1.410E+00\n" + "1 141.000E+00\n" + "2 14.100E+03\n" + "3 1.410E+06" + ) assert result == expected fmt.set_eng_float_format(use_eng_prefix=True) result = df.to_string() - expected = (' A\n' - '0 1.410\n' - '1 141.000\n' - '2 14.100k\n' - '3 1.410M') + expected = ( + " A\n" "0 1.410\n" "1 141.000\n" "2 14.100k\n" "3 1.410M" + ) assert result == expected fmt.set_eng_float_format(accuracy=0) result = df.to_string() - expected = (' A\n' - '0 1E+00\n' - '1 141E+00\n' - '2 14E+03\n' - '3 1E+06') + expected = ( + " A\n" "0 1E+00\n" "1 141E+00\n" "2 14E+03\n" "3 1E+06" + ) assert result == expected tm.reset_display_options() @@ -64,32 +61,58 @@ def test_exponents_with_eng_prefix(self): formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) f = np.sqrt(2) in_out = [ - (f * 10 ** -24, " 1.414y"), (f * 10 ** -23, " 14.142y"), - (f * 10 ** -22, " 141.421y"), (f * 10 ** -21, " 1.414z"), - (f * 10 ** -20, " 14.142z"), (f * 10 ** -19, " 141.421z"), - (f * 10 ** -18, " 1.414a"), (f * 10 ** -17, " 14.142a"), - (f * 10 ** -16, " 141.421a"), (f * 10 ** -15, " 1.414f"), - (f * 10 ** -14, " 14.142f"), (f * 10 ** -13, " 141.421f"), - (f * 10 ** -12, " 1.414p"), (f * 10 ** -11, " 14.142p"), - (f * 10 ** -10, " 141.421p"), (f * 10 ** -9, " 1.414n"), - (f * 10 ** -8, " 14.142n"), (f * 10 ** -7, " 141.421n"), - (f * 10 ** -6, " 1.414u"), (f * 10 ** -5, " 14.142u"), - (f * 10 ** -4, " 141.421u"), (f * 10 ** -3, " 1.414m"), - (f * 10 ** -2, " 14.142m"), (f * 10 ** -1, " 141.421m"), - (f * 10 ** 0, " 1.414"), (f * 10 ** 1, " 14.142"), - (f * 10 ** 2, " 141.421"), (f * 10 ** 3, " 1.414k"), - (f * 10 ** 4, " 14.142k"), (f * 10 ** 5, " 141.421k"), - (f * 10 ** 6, " 1.414M"), (f * 10 ** 7, " 14.142M"), - (f * 10 ** 8, " 141.421M"), (f * 10 ** 9, " 1.414G"), - (f * 10 ** 10, " 14.142G"), (f * 10 ** 11, " 141.421G"), - (f * 10 ** 12, " 1.414T"), (f * 10 ** 13, " 14.142T"), - (f * 10 ** 14, " 141.421T"), (f * 10 ** 15, " 1.414P"), - (f * 10 ** 16, " 14.142P"), (f * 10 ** 17, " 141.421P"), - (f * 10 ** 18, " 1.414E"), (f * 10 ** 19, " 14.142E"), - (f * 10 ** 20, " 141.421E"), (f * 10 ** 21, " 1.414Z"), - (f * 10 ** 22, " 14.142Z"), (f * 10 ** 23, " 141.421Z"), - (f * 10 ** 24, " 1.414Y"), (f * 10 ** 25, " 14.142Y"), - (f * 10 ** 26, " 141.421Y")] + (f * 10 ** -24, " 1.414y"), + (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), + (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), + (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), + (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), + (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), + (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), + (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), + (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), + (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), + (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), + (f * 10 ** -3, " 1.414m"), + (f * 10 
** -2, " 14.142m"), + (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), + (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), + (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), + (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), + (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), + (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), + (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), + (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), + (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), + (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), + (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), + (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), + (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), + (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y"), + ] self.compare_all(formatter, in_out) def test_exponents_without_eng_prefix(self): @@ -146,44 +169,66 @@ def test_exponents_without_eng_prefix(self): (f * 10 ** 23, " 314.1593E+21"), (f * 10 ** 24, " 3.1416E+24"), (f * 10 ** 25, " 31.4159E+24"), - (f * 10 ** 26, " 314.1593E+24")] + (f * 10 ** 26, " 314.1593E+24"), + ] self.compare_all(formatter, in_out) def test_rounding(self): formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) - in_out = [(5.55555, ' 5.556'), (55.5555, ' 55.556'), - (555.555, ' 555.555'), (5555.55, ' 5.556k'), - (55555.5, ' 55.556k'), (555555, ' 555.555k')] + in_out = [ + (5.55555, " 5.556"), + (55.5555, " 55.556"), + (555.555, " 555.555"), + (5555.55, " 5.556k"), + (55555.5, " 55.556k"), + (555555, " 555.555k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) - in_out = [(5.55555, ' 5.6'), (55.5555, ' 55.6'), (555.555, ' 555.6'), - (5555.55, ' 5.6k'), (55555.5, ' 55.6k'), (555555, ' 555.6k')] + in_out = [ + (5.55555, " 5.6"), + (55.5555, " 55.6"), + (555.555, " 555.6"), + (5555.55, " 5.6k"), + (55555.5, " 55.6k"), + (555555, " 555.6k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) - in_out = [(5.55555, ' 6'), (55.5555, ' 56'), (555.555, ' 556'), - (5555.55, ' 6k'), (55555.5, ' 56k'), (555555, ' 556k')] + in_out = [ + (5.55555, " 6"), + (55.5555, " 56"), + (555.555, " 556"), + (5555.55, " 6k"), + (55555.5, " 56k"), + (555555, " 556k"), + ] self.compare_all(formatter, in_out) formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) result = formatter(0) - assert result == ' 0.000' + assert result == " 0.000" def test_nan(self): # Issue #11981 formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.nan) - assert result == 'NaN' - - df = pd.DataFrame({'a': [1.5, 10.3, 20.5], - 'b': [50.3, 60.67, 70.12], - 'c': [100.2, 101.33, 120.33]}) - pt = df.pivot_table(values='a', index='b', columns='c') + assert result == "NaN" + + df = pd.DataFrame( + { + "a": [1.5, 10.3, 20.5], + "b": [50.3, 60.67, 70.12], + "c": [100.2, 101.33, 120.33], + } + ) + pt = df.pivot_table(values="a", index="b", columns="c") fmt.set_eng_float_format(accuracy=1) result = pt.to_string() - assert 'NaN' in result + assert "NaN" in result tm.reset_display_options() def test_inf(self): @@ -191,4 +236,4 @@ def test_inf(self): formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) result = formatter(np.inf) - assert result == 'inf' + assert result == "inf" diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 7098a382cad45..af862b11c756c 100644 --- 
a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -21,8 +21,19 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, NaT, Series, Timestamp, date_range, - get_option, option_context, read_csv, reset_option, set_option) + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + get_option, + option_context, + read_csv, + reset_option, + set_option, +) import pandas.util.testing as tm import pandas.io.formats.format as fmt @@ -38,8 +49,8 @@ def curpath(): def has_info_repr(df): r = repr(df) - c1 = r.split('\n')[0].startswith(" max_len: - assert '...' in line + assert "..." in line else: - assert '...' not in line + assert "..." not in line with option_context("display.max_colwidth", 999999): - assert '...' not in repr(df) + assert "..." not in repr(df) with option_context("display.max_colwidth", max_len + 2): - assert '...' not in repr(df) + assert "..." not in repr(df) def test_repr_chop_threshold(self): df = DataFrame([[0.1, 0.5], [0.5, -0.1]]) pd.reset_option("display.chop_threshold") # default None - assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1' + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" with option_context("display.chop_threshold", 0.2): - assert repr(df) == ' 0 1\n0 0.0 0.5\n1 0.5 0.0' + assert repr(df) == " 0 1\n0 0.0 0.5\n1 0.5 0.0" with option_context("display.chop_threshold", 0.6): - assert repr(df) == ' 0 1\n0 0.0 0.0\n1 0.0 0.0' + assert repr(df) == " 0 1\n0 0.0 0.0\n1 0.0 0.0" with option_context("display.chop_threshold", None): - assert repr(df) == ' 0 1\n0 0.1 0.5\n1 0.5 -0.1' + assert repr(df) == " 0 1\n0 0.1 0.5\n1 0.5 -0.1" def test_repr_chop_threshold_column_below(self): # GH 6839: validation case - df = pd.DataFrame([[10, 20, 30, 40], - [8e-10, -1e-11, 2e-9, -2e-11]]).T + df = pd.DataFrame([[10, 20, 30, 40], [8e-10, -1e-11, 2e-9, -2e-11]]).T with option_context("display.chop_threshold", 0): - assert repr(df) == (' 0 1\n' - '0 10.0 8.000000e-10\n' - '1 20.0 -1.000000e-11\n' - '2 30.0 2.000000e-09\n' - '3 40.0 -2.000000e-11') + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 -1.000000e-11\n" + "2 30.0 2.000000e-09\n" + "3 40.0 -2.000000e-11" + ) with option_context("display.chop_threshold", 1e-8): - assert repr(df) == (' 0 1\n' - '0 10.0 0.000000e+00\n' - '1 20.0 0.000000e+00\n' - '2 30.0 0.000000e+00\n' - '3 40.0 0.000000e+00') + assert repr(df) == ( + " 0 1\n" + "0 10.0 0.000000e+00\n" + "1 20.0 0.000000e+00\n" + "2 30.0 0.000000e+00\n" + "3 40.0 0.000000e+00" + ) with option_context("display.chop_threshold", 5e-11): - assert repr(df) == (' 0 1\n' - '0 10.0 8.000000e-10\n' - '1 20.0 0.000000e+00\n' - '2 30.0 2.000000e-09\n' - '3 40.0 0.000000e+00') + assert repr(df) == ( + " 0 1\n" + "0 10.0 8.000000e-10\n" + "1 20.0 0.000000e+00\n" + "2 30.0 2.000000e-09\n" + "3 40.0 0.000000e+00" + ) def test_repr_obeys_max_seq_limit(self): with option_context("display.max_seq_items", 2000): @@ -227,12 +245,12 @@ def test_repr_obeys_max_seq_limit(self): assert len(printing.pprint_thing(list(range(1000)))) < 100 def test_repr_set(self): - assert printing.pprint_thing({1}) == '{1}' + assert printing.pprint_thing({1}) == "{1}" def test_repr_is_valid_construction_code(self): # for the case of Index, where the repr is traditional rather then # stylized - idx = Index(['a', 'b']) + idx = Index(["a", "b"]) res = eval("pd." 
+ repr(idx)) tm.assert_series_equal(Series(res), Series(idx)) @@ -249,20 +267,27 @@ def test_repr_should_return_str(self): assert type(df.__repr__()) == str # both py2 / 3 def test_repr_no_backslash(self): - with option_context('mode.sim_interactive', True): + with option_context("mode.sim_interactive", True): df = DataFrame(np.random.randn(10, 4)) - assert '\\' not in repr(df) + assert "\\" not in repr(df) def test_expand_frame_repr(self): - df_small = DataFrame('hello', index=[0], columns=[0]) - df_wide = DataFrame('hello', index=[0], columns=range(10)) - df_tall = DataFrame('hello', index=range(30), columns=range(5)) - - with option_context('mode.sim_interactive', True): - with option_context('display.max_columns', 10, 'display.width', 20, - 'display.max_rows', 20, - 'display.show_dimensions', True): - with option_context('display.expand_frame_repr', True): + df_small = DataFrame("hello", index=[0], columns=[0]) + df_wide = DataFrame("hello", index=[0], columns=range(10)) + df_tall = DataFrame("hello", index=range(30), columns=range(5)) + + with option_context("mode.sim_interactive", True): + with option_context( + "display.max_columns", + 10, + "display.width", + 20, + "display.max_rows", + 20, + "display.show_dimensions", + True, + ): + with option_context("display.expand_frame_repr", True): assert not has_truncated_repr(df_small) assert not has_expanded_repr(df_small) assert not has_truncated_repr(df_wide) @@ -270,7 +295,7 @@ def test_expand_frame_repr(self): assert has_vertically_truncated_repr(df_tall) assert has_expanded_repr(df_tall) - with option_context('display.expand_frame_repr', False): + with option_context("display.expand_frame_repr", False): assert not has_truncated_repr(df_small) assert not has_expanded_repr(df_small) assert not has_horizontally_truncated_repr(df_wide) @@ -281,10 +306,11 @@ def test_expand_frame_repr(self): def test_repr_non_interactive(self): # in non interactive mode, there can be no dependency on the # result of terminal auto size detection - df = DataFrame('hello', index=range(1000), columns=range(5)) + df = DataFrame("hello", index=range(1000), columns=range(5)) - with option_context('mode.sim_interactive', False, 'display.width', 0, - 'display.max_rows', 5000): + with option_context( + "mode.sim_interactive", False, "display.width", 0, "display.max_rows", 5000 + ): assert not has_truncated_repr(df) assert not has_expanded_repr(df) @@ -292,85 +318,99 @@ def test_repr_truncates_terminal_size(self, monkeypatch): # see gh-21180 terminal_size = (118, 96) - monkeypatch.setattr('pandas.io.formats.format.get_terminal_size', - lambda: terminal_size) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) index = range(5) - columns = pd.MultiIndex.from_tuples([ - ('This is a long title with > 37 chars.', 'cat'), - ('This is a loooooonger title with > 43 chars.', 'dog'), - ]) + columns = pd.MultiIndex.from_tuples( + [ + ("This is a long title with > 37 chars.", "cat"), + ("This is a loooooonger title with > 43 chars.", "dog"), + ] + ) df = pd.DataFrame(1, index=index, columns=columns) result = repr(df) - h1, h2 = result.split('\n')[:2] - assert 'long' in h1 - assert 'loooooonger' in h1 - assert 'cat' in h2 - assert 'dog' in h2 + h1, h2 = result.split("\n")[:2] + assert "long" in h1 + assert "loooooonger" in h1 + assert "cat" in h2 + assert "dog" in h2 # regular columns - df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]}) + df2 = pd.DataFrame({"A" * 41: [1, 2], "B" * 41: [1, 2]}) result = repr(df2) - assert 
df2.columns[0] in result.split('\n')[0] + assert df2.columns[0] in result.split("\n")[0] def test_repr_truncates_terminal_size_full(self, monkeypatch): # GH 22984 ensure entire window is filled terminal_size = (80, 24) df = pd.DataFrame(np.random.rand(1, 7)) - monkeypatch.setattr('pandas.io.formats.format.get_terminal_size', - lambda: terminal_size) + monkeypatch.setattr( + "pandas.io.formats.format.get_terminal_size", lambda: terminal_size + ) assert "..." not in str(df) def test_repr_truncation_column_size(self): # dataframe with last column very wide -> check it is not used to # determine size of truncation (...) column - df = pd.DataFrame({'a': [108480, 30830], 'b': [12345, 12345], - 'c': [12345, 12345], 'd': [12345, 12345], - 'e': ['a' * 50] * 2}) + df = pd.DataFrame( + { + "a": [108480, 30830], + "b": [12345, 12345], + "c": [12345, 12345], + "d": [12345, 12345], + "e": ["a" * 50] * 2, + } + ) assert "..." in str(df) assert " ... " not in str(df) def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: - pytest.skip("terminal size too small, " - "{0} x {1}".format(term_width, term_height)) + pytest.skip( + "terminal size too small, " "{0} x {1}".format(term_width, term_height) + ) def mkframe(n): - index = ['{i:05d}'.format(i=i) for i in range(n)] + index = ["{i:05d}".format(i=i) for i in range(n)] return DataFrame(0, index, index) df6 = mkframe(6) df10 = mkframe(10) - with option_context('mode.sim_interactive', True): - with option_context('display.width', term_width * 2): - with option_context('display.max_rows', 5, - 'display.max_columns', 5): + with option_context("mode.sim_interactive", True): + with option_context("display.width", term_width * 2): + with option_context("display.max_rows", 5, "display.max_columns", 5): assert not has_expanded_repr(mkframe(4)) assert not has_expanded_repr(mkframe(5)) assert not has_expanded_repr(df6) assert has_doubly_truncated_repr(df6) - with option_context('display.max_rows', 20, - 'display.max_columns', 10): + with option_context("display.max_rows", 20, "display.max_columns", 10): # Out off max_columns boundary, but no extending # since not exceeding width assert not has_expanded_repr(df6) assert not has_truncated_repr(df6) - with option_context('display.max_rows', 9, - 'display.max_columns', 10): + with option_context("display.max_rows", 9, "display.max_columns", 10): # out vertical bounds can not result in exanded repr assert not has_expanded_repr(df10) assert has_vertically_truncated_repr(df10) # width=None in terminal, auto detection - with option_context('display.max_columns', 100, 'display.max_rows', - term_width * 20, 'display.width', None): + with option_context( + "display.max_columns", + 100, + "display.max_rows", + term_width * 20, + "display.width", + None, + ): df = mkframe((term_width // 7) - 2) assert not has_expanded_repr(df) df = mkframe((term_width // 7) + 2) @@ -378,51 +418,58 @@ def mkframe(n): assert has_expanded_repr(df) def test_repr_min_rows(self): - df = pd.DataFrame({'a': range(20)}) + df = pd.DataFrame({"a": range(20)}) # default setting no truncation even if above min_rows - assert '..' not in repr(df) + assert ".." not in repr(df) - df = pd.DataFrame({'a': range(61)}) + df = pd.DataFrame({"a": range(61)}) # default of max_rows 60 triggers truncation if above - assert '..' in repr(df) + assert ".." 
in repr(df) - with option_context('display.max_rows', 10, 'display.min_rows', 4): + with option_context("display.max_rows", 10, "display.min_rows", 4): # truncated after first two rows - assert '..' in repr(df) - assert '2 ' not in repr(df) + assert ".." in repr(df) + assert "2 " not in repr(df) - with option_context('display.max_rows', 12, 'display.min_rows', None): + with option_context("display.max_rows", 12, "display.min_rows", None): # when set to None, follow value of max_rows - assert '5 5' in repr(df) + assert "5 5" in repr(df) - with option_context('display.max_rows', 10, 'display.min_rows', 12): + with option_context("display.max_rows", 10, "display.min_rows", 12): # when set value higher as max_rows, use the minimum - assert '5 5' not in repr(df) + assert "5 5" not in repr(df) - with option_context('display.max_rows', None, 'display.min_rows', 12): + with option_context("display.max_rows", None, "display.min_rows", 12): # max_rows of None -> never truncate - assert '..' not in repr(df) + assert ".." not in repr(df) def test_str_max_colwidth(self): # GH 7856 - df = pd.DataFrame([{'a': 'foo', - 'b': 'bar', - 'c': 'uncomfortably long line with lots of stuff', - 'd': 1}, {'a': 'foo', - 'b': 'bar', - 'c': 'stuff', - 'd': 1}]) - df.set_index(['a', 'b', 'c']) + df = pd.DataFrame( + [ + { + "a": "foo", + "b": "bar", + "c": "uncomfortably long line with lots of stuff", + "d": 1, + }, + {"a": "foo", "b": "bar", "c": "stuff", "d": 1}, + ] + ) + df.set_index(["a", "b", "c"]) assert str(df) == ( - ' a b c d\n' - '0 foo bar uncomfortably long line with lots of stuff 1\n' - '1 foo bar stuff 1') - with option_context('max_colwidth', 20): - assert str(df) == (' a b c d\n' - '0 foo bar uncomfortably lo... 1\n' - '1 foo bar stuff 1') + " a b c d\n" + "0 foo bar uncomfortably long line with lots of stuff 1\n" + "1 foo bar stuff 1" + ) + with option_context("max_colwidth", 20): + assert str(df) == ( + " a b c d\n" + "0 foo bar uncomfortably lo... 1\n" + "1 foo bar stuff 1" + ) def test_auto_detect(self): term_width, term_height = get_terminal_size() @@ -430,50 +477,50 @@ def test_auto_detect(self): cols = range(int(term_width * fac)) index = range(10) df = DataFrame(index=index, columns=cols) - with option_context('mode.sim_interactive', True): - with option_context('max_rows', None): - with option_context('max_columns', None): + with option_context("mode.sim_interactive", True): + with option_context("max_rows", None): + with option_context("max_columns", None): # Wrap around with None assert has_expanded_repr(df) - with option_context('max_rows', 0): - with option_context('max_columns', 0): + with option_context("max_rows", 0): + with option_context("max_columns", 0): # Truncate with auto detection. 
assert has_horizontally_truncated_repr(df) index = range(int(term_height * fac)) df = DataFrame(index=index, columns=cols) - with option_context('max_rows', 0): - with option_context('max_columns', None): + with option_context("max_rows", 0): + with option_context("max_columns", None): # Wrap around with None assert has_expanded_repr(df) # Truncate vertically assert has_vertically_truncated_repr(df) - with option_context('max_rows', None): - with option_context('max_columns', 0): + with option_context("max_rows", None): + with option_context("max_columns", 0): assert has_horizontally_truncated_repr(df) def test_to_string_repr_unicode(self): buf = StringIO() - unicode_values = ['\u03c3'] * 10 + unicode_values = ["\u03c3"] * 10 unicode_values = np.array(unicode_values, dtype=object) - df = DataFrame({'unicode': unicode_values}) + df = DataFrame({"unicode": unicode_values}) df.to_string(col_space=10, buf=buf) # it works! repr(df) - idx = Index(['abc', '\u03c3a', 'aegdvg']) + idx = Index(["abc", "\u03c3a", "aegdvg"]) ser = Series(np.random.randn(len(idx)), idx) - rs = repr(ser).split('\n') + rs = repr(ser).split("\n") line_len = len(rs[0]) for line in rs[1:]: try: line = line.decode(get_option("display.encoding")) except AttributeError: pass - if not line.startswith('dtype:'): + if not line.startswith("dtype:"): assert len(line) == line_len # it works even if sys.stdin in None @@ -485,7 +532,7 @@ def test_to_string_repr_unicode(self): sys.stdin = _stdin def test_to_string_unicode_columns(self, float_frame): - df = DataFrame({'\u03c3': np.arange(10.)}) + df = DataFrame({"\u03c3": np.arange(10.0)}) buf = StringIO() df.to_string(buf=buf) @@ -499,308 +546,385 @@ def test_to_string_unicode_columns(self, float_frame): assert isinstance(result, str) def test_to_string_utf8_columns(self): - n = "\u05d0".encode('utf-8') + n = "\u05d0".encode("utf-8") - with option_context('display.max_rows', 1): + with option_context("display.max_rows", 1): df = DataFrame([1, 2], columns=[n]) repr(df) def test_to_string_unicode_two(self): - dm = DataFrame({'c/\u03c3': []}) + dm = DataFrame({"c/\u03c3": []}) buf = StringIO() dm.to_string(buf) def test_to_string_unicode_three(self): - dm = DataFrame(['\xc2']) + dm = DataFrame(["\xc2"]) buf = StringIO() dm.to_string(buf) def test_to_string_with_formatters(self): - df = DataFrame({'int': [1, 2, 3], - 'float': [1.0, 2.0, 3.0], - 'object': [(1, 2), True, False]}, - columns=['int', 'float', 'object']) - - formatters = [('int', lambda x: '0x{x:x}'.format(x=x)), - ('float', lambda x: '[{x: 4.1f}]'.format(x=x)), - ('object', lambda x: '-{x!s}-'.format(x=x))] + df = DataFrame( + { + "int": [1, 2, 3], + "float": [1.0, 2.0, 3.0], + "object": [(1, 2), True, False], + }, + columns=["int", "float", "object"], + ) + + formatters = [ + ("int", lambda x: "0x{x:x}".format(x=x)), + ("float", lambda x: "[{x: 4.1f}]".format(x=x)), + ("object", lambda x: "-{x!s}-".format(x=x)), + ] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == ( + " int float object\n" + "0 0x1 [ 1.0] -(1, 2)-\n" + "1 0x2 [ 2.0] -True-\n" + "2 0x3 [ 3.0] -False-" + ) assert result == result2 def test_to_string_with_datetime64_monthformatter(self): months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] - x = DataFrame({'months': months}) + x = DataFrame({"months": months}) def format_func(x): - return x.strftime('%Y-%m') - 
result = x.to_string(formatters={'months': format_func}) - expected = 'months\n0 2016-01\n1 2016-02' + return x.strftime("%Y-%m") + + result = x.to_string(formatters={"months": format_func}) + expected = "months\n0 2016-01\n1 2016-02" assert result.strip() == expected def test_to_string_with_datetime64_hourformatter(self): - x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')}) + x = DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ) def format_func(x): - return x.strftime('%H:%M') + return x.strftime("%H:%M") - result = x.to_string(formatters={'hod': format_func}) - expected = 'hod\n0 10:10\n1 12:12' + result = x.to_string(formatters={"hod": format_func}) + expected = "hod\n0 10:10\n1 12:12" assert result.strip() == expected def test_to_string_with_formatters_unicode(self): - df = DataFrame({'c/\u03c3': [1, 2, 3]}) - result = df.to_string( - formatters={'c/\u03c3': lambda x: '{x}'.format(x=x)}) - assert result == ' c/\u03c3\n' + '0 1\n1 2\n2 3' + df = DataFrame({"c/\u03c3": [1, 2, 3]}) + result = df.to_string(formatters={"c/\u03c3": lambda x: "{x}".format(x=x)}) + assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" def test_east_asian_unicode_false(self): # not aligned properly because of east asian width # mid col - df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], - 'b': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # last col - df = DataFrame({'a': [1, 222, 33333, 4], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na 1 あ\n" - "bb 222 いいい\nc 33333 う\n" - "ddd 4 ええええええ") + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) assert repr(df) == expected # all col - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あああああ あ\n" - "bb い いいい\nc う う\n" - "ddd えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あああああ あ\n" + "bb い いいい\nc う う\n" + "ddd えええ ええええええ" + ) assert repr(df) == expected # column name - df = DataFrame({'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" b あああああ\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # index - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['あああ', 'いいいいいい', 'うう', 'え']) - expected = (" a b\nあああ あああああ あ\n" - "いいいいいい い いいい\nうう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\nあああ あああああ あ\n" + "いいいいいい い いいい\nうう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # index name - df = 
DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=pd.Index(['あ', 'い', 'うう', 'え'], - name='おおおお')) - expected = (" a b\n" - "おおおお \n" - "あ あああああ あ\n" - "い い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # all - df = DataFrame({'あああ': ['あああ', 'い', 'う', 'えええええ'], - 'いいいいい': ['あ', 'いいい', 'う', 'ええ']}, - index=pd.Index(['あ', 'いいい', 'うう', 'え'], - name='お')) - expected = (" あああ いいいいい\n" - "お \n" - "あ あああ あ\n" - "いいい い いいい\n" - "うう う う\n" - "え えええええ ええ") + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples([('あ', 'いい'), ('う', 'え'), ( - 'おおお', 'かかかか'), ('き', 'くく')]) - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=idx) - expected = (" a b\n" - "あ いい あああああ あ\n" - "う え い いいい\n" - "おおお かかかか う う\n" - "き くく えええ ええええええ") + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) assert repr(df) == expected # truncate - with option_context('display.max_rows', 3, 'display.max_columns', 3): - df = pd.DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'c': ['お', 'か', 'ききき', 'くくくくくく'], - 'ああああ': ['さ', 'し', 'す', 'せ']}, - columns=['a', 'b', 'c', 'ああああ']) - - expected = (" a ... ああああ\n0 あああああ ... さ\n" - ".. ... ... ...\n3 えええ ... せ\n" - "\n[4 rows x 4 columns]") + with option_context("display.max_rows", 3, "display.max_columns", 3): + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n0 あああああ ... さ\n" + ".. ... ... ...\n3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected - df.index = ['あああ', 'いいいい', 'う', 'aaa'] - expected = (" a ... ああああ\nあああ あああああ ... さ\n" - ".. ... ... ...\naaa えええ ... せ\n" - "\n[4 rows x 4 columns]") + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\nあああ あああああ ... さ\n" + ".. ... ... ...\naaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected def test_east_asian_unicode_true(self): # Enable Unicode option ----------------------------------------- - with option_context('display.unicode.east_asian_width', True): + with option_context("display.unicode.east_asian_width", True): # mid col - df = DataFrame({'a': ['あ', 'いいい', 'う', 'ええええええ'], - 'b': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na あ 1\n" - "bb いいい 222\nc う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"a": ["あ", "いいい", "う", "ええええええ"], "b": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na あ 1\n" + "bb いいい 222\nc う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # last col - df = DataFrame({'a': [1, 222, 33333, 4], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\na 1 あ\n" - "bb 222 いいい\nc 33333 う\n" - "ddd 4 ええええええ") + df = DataFrame( + {"a": [1, 222, 33333, 4], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\na 1 あ\n" + "bb 222 いいい\nc 33333 う\n" + "ddd 4 ええええええ" + ) assert repr(df) == expected # all col - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" a b\n" - "a あああああ あ\n" - "bb い いいい\n" - "c う う\n" - "ddd えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " a b\n" + "a あああああ あ\n" + "bb い いいい\n" + "c う う\n" + "ddd えええ ええええええ" + ) assert repr(df) == expected # column name - df = DataFrame({'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', 'ddd']) - expected = (" b あああああ\n" - "a あ 1\n" - "bb いいい 222\n" - "c う 33333\n" - "ddd ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "う", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "ddd"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c う 33333\n" + "ddd ええええええ 4" + ) assert repr(df) == expected # index - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=['あああ', 'いいいいいい', 'うう', 'え']) - expected = (" a b\n" - "あああ あああああ あ\n" - "いいいいいい い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=["あああ", "いいいいいい", "うう", "え"], + ) + expected = ( + " a b\n" + "あああ あああああ あ\n" + "いいいいいい い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # index name - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=pd.Index(['あ', 'い', 'うう', 'え'], - name='おおおお')) - expected = (" a b\n" - "おおおお \n" - "あ あああああ あ\n" - "い い いいい\n" - "うう う う\n" - "え えええ ええええええ") + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=pd.Index(["あ", "い", "うう", "え"], name="おおおお"), + ) + expected = ( + " a b\n" + "おおおお \n" + "あ あああああ あ\n" + "い い いいい\n" + "うう う う\n" + "え えええ ええええええ" + ) assert repr(df) == expected # all - df = DataFrame({'あああ': ['あああ', 'い', 'う', 'えええええ'], - 'いいいいい': ['あ', 'いいい', 'う', 'ええ']}, - index=pd.Index(['あ', 'いいい', 'うう', 'え'], - name='お')) - expected = (" あああ いいいいい\n" - "お \n" - "あ あああ あ\n" - "いいい い いいい\n" - "うう う う\n" - "え えええええ ええ") + df = DataFrame( + {"あああ": ["あああ", "い", "う", "えええええ"], "いいいいい": ["あ", "いいい", "う", "ええ"]}, + index=pd.Index(["あ", "いいい", "うう", "え"], name="お"), + ) + expected = ( + " あああ いいいいい\n" + "お \n" + "あ あああ あ\n" + "いいい 
い いいい\n" + "うう う う\n" + "え えええええ ええ" + ) assert repr(df) == expected # MultiIndex - idx = pd.MultiIndex.from_tuples([('あ', 'いい'), ('う', 'え'), ( - 'おおお', 'かかかか'), ('き', 'くく')]) - df = DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ']}, - index=idx) - expected = (" a b\n" - "あ いい あああああ あ\n" - "う え い いいい\n" - "おおお かかかか う う\n" - "き くく えええ ええええええ") + idx = pd.MultiIndex.from_tuples( + [("あ", "いい"), ("う", "え"), ("おおお", "かかかか"), ("き", "くく")] + ) + df = DataFrame( + {"a": ["あああああ", "い", "う", "えええ"], "b": ["あ", "いいい", "う", "ええええええ"]}, + index=idx, + ) + expected = ( + " a b\n" + "あ いい あああああ あ\n" + "う え い いいい\n" + "おおお かかかか う う\n" + "き くく えええ ええええええ" + ) assert repr(df) == expected # truncate - with option_context('display.max_rows', 3, 'display.max_columns', - 3): - - df = pd.DataFrame({'a': ['あああああ', 'い', 'う', 'えええ'], - 'b': ['あ', 'いいい', 'う', 'ええええええ'], - 'c': ['お', 'か', 'ききき', 'くくくくくく'], - 'ああああ': ['さ', 'し', 'す', 'せ']}, - columns=['a', 'b', 'c', 'ああああ']) - - expected = (" a ... ああああ\n" - "0 あああああ ... さ\n" - ".. ... ... ...\n" - "3 えええ ... せ\n" - "\n[4 rows x 4 columns]") + with option_context("display.max_rows", 3, "display.max_columns", 3): + + df = pd.DataFrame( + { + "a": ["あああああ", "い", "う", "えええ"], + "b": ["あ", "いいい", "う", "ええええええ"], + "c": ["お", "か", "ききき", "くくくくくく"], + "ああああ": ["さ", "し", "す", "せ"], + }, + columns=["a", "b", "c", "ああああ"], + ) + + expected = ( + " a ... ああああ\n" + "0 あああああ ... さ\n" + ".. ... ... ...\n" + "3 えええ ... せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected - df.index = ['あああ', 'いいいい', 'う', 'aaa'] - expected = (" a ... ああああ\n" - "あああ あああああ ... さ\n" - "... ... ... ...\n" - "aaa えええ ... せ\n" - "\n[4 rows x 4 columns]") + df.index = ["あああ", "いいいい", "う", "aaa"] + expected = ( + " a ... ああああ\n" + "あああ あああああ ... さ\n" + "... ... ... ...\n" + "aaa えええ ... 
せ\n" + "\n[4 rows x 4 columns]" + ) assert repr(df) == expected # ambiguous unicode - df = DataFrame({'b': ['あ', 'いいい', '¡¡', 'ええええええ'], - 'あああああ': [1, 222, 33333, 4]}, - index=['a', 'bb', 'c', '¡¡¡']) - expected = (" b あああああ\n" - "a あ 1\n" - "bb いいい 222\n" - "c ¡¡ 33333\n" - "¡¡¡ ええええええ 4") + df = DataFrame( + {"b": ["あ", "いいい", "¡¡", "ええええええ"], "あああああ": [1, 222, 33333, 4]}, + index=["a", "bb", "c", "¡¡¡"], + ) + expected = ( + " b あああああ\n" + "a あ 1\n" + "bb いいい 222\n" + "c ¡¡ 33333\n" + "¡¡¡ ええええええ 4" + ) assert repr(df) == expected def test_to_string_buffer_all_unicode(self): buf = StringIO() - empty = DataFrame({'c/\u03c3': Series()}) - nonempty = DataFrame({'c/\u03c3': Series([1, 2, 3])}) + empty = DataFrame({"c/\u03c3": Series()}) + nonempty = DataFrame({"c/\u03c3": Series([1, 2, 3])}) print(empty, file=buf) print(nonempty, file=buf) @@ -823,37 +947,41 @@ def test_to_string_with_col_space(self): assert len(with_header_row1) == len(no_header) def test_to_string_truncate_indices(self): - for index in [tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex]: + for index in [ + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + ]: for column in [tm.makeStringIndex]: for h in [10, 20]: for w in [10, 20]: - with option_context("display.expand_frame_repr", - False): + with option_context("display.expand_frame_repr", False): df = DataFrame(index=index(h), columns=column(w)) with option_context("display.max_rows", 15): if h == 20: assert has_vertically_truncated_repr(df) else: - assert not has_vertically_truncated_repr( - df) + assert not has_vertically_truncated_repr(df) with option_context("display.max_columns", 15): if w == 20: assert has_horizontally_truncated_repr(df) else: - assert not ( - has_horizontally_truncated_repr(df)) - with option_context("display.max_rows", 15, - "display.max_columns", 15): + assert not (has_horizontally_truncated_repr(df)) + with option_context( + "display.max_rows", 15, "display.max_columns", 15 + ): if h == 20 and w == 20: assert has_doubly_truncated_repr(df) else: - assert not has_doubly_truncated_repr( - df) + assert not has_doubly_truncated_repr(df) def test_to_string_truncate_multilevel(self): - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] df = DataFrame(index=arrays, columns=arrays) with option_context("display.max_rows", 7, "display.max_columns", 7): assert has_doubly_truncated_repr(df) @@ -865,337 +993,358 @@ def test_truncate_with_different_dtypes(self): # 11594 import datetime - s = Series([datetime.datetime(2012, 1, 1)] * 10 + - [datetime.datetime(1012, 1, 2)] + [ - datetime.datetime(2012, 1, 3)] * 10) - with pd.option_context('display.max_rows', 8): + s = Series( + [datetime.datetime(2012, 1, 1)] * 10 + + [datetime.datetime(1012, 1, 2)] + + [datetime.datetime(2012, 1, 3)] * 10 + ) + + with pd.option_context("display.max_rows", 8): result = str(s) - assert 'object' in result + assert "object" in result # 12045 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) - with pd.option_context('display.max_rows', 8, - 'display.max_columns', 3): + with pd.option_context("display.max_rows", 8, "display.max_columns", 3): result = str(df) - assert 'None' in result - assert 'NaN' not in 
result + assert "None" in result + assert "NaN" not in result def test_datetimelike_frame(self): # GH 12211 df = DataFrame( - {'date': [pd.Timestamp('20130101').tz_localize('UTC')] + - [pd.NaT] * 5}) + {"date": [pd.Timestamp("20130101").tz_localize("UTC")] + [pd.NaT] * 5} + ) with option_context("display.max_rows", 5): result = str(df) - assert '2013-01-01 00:00:00+00:00' in result - assert 'NaT' in result - assert '...' in result - assert '[6 rows x 1 columns]' in result - - dts = [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5 + [pd.NaT] * 5 - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 2011-01-01 00:00:00-05:00 1\n' - '1 2011-01-01 00:00:00-05:00 2\n' - '.. ... ..\n' - '8 NaT 9\n' - '9 NaT 10\n\n' - '[10 rows x 2 columns]') + assert "2013-01-01 00:00:00+00:00" in result + assert "NaT" in result + assert "..." in result + assert "[6 rows x 1 columns]" in result + + dts = [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + [pd.NaT] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00-05:00 1\n" + "1 2011-01-01 00:00:00-05:00 2\n" + ".. ... ..\n" + "8 NaT 9\n" + "9 NaT 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - dts = [pd.NaT] * 5 + [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5 - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 NaT 1\n' - '1 NaT 2\n' - '.. ... ..\n' - '8 2011-01-01 00:00:00-05:00 9\n' - '9 2011-01-01 00:00:00-05:00 10\n\n' - '[10 rows x 2 columns]') + dts = [pd.NaT] * 5 + [pd.Timestamp("2011-01-01", tz="US/Eastern")] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 NaT 1\n" + "1 NaT 2\n" + ".. ... ..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - dts = ([pd.Timestamp('2011-01-01', tz='Asia/Tokyo')] * 5 + - [pd.Timestamp('2011-01-01', tz='US/Eastern')] * 5) - df = pd.DataFrame({"dt": dts, - "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) - with option_context('display.max_rows', 5): - expected = (' dt x\n' - '0 2011-01-01 00:00:00+09:00 1\n' - '1 2011-01-01 00:00:00+09:00 2\n' - '.. ... ..\n' - '8 2011-01-01 00:00:00-05:00 9\n' - '9 2011-01-01 00:00:00-05:00 10\n\n' - '[10 rows x 2 columns]') + dts = [pd.Timestamp("2011-01-01", tz="Asia/Tokyo")] * 5 + [ + pd.Timestamp("2011-01-01", tz="US/Eastern") + ] * 5 + df = pd.DataFrame({"dt": dts, "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}) + with option_context("display.max_rows", 5): + expected = ( + " dt x\n" + "0 2011-01-01 00:00:00+09:00 1\n" + "1 2011-01-01 00:00:00+09:00 2\n" + ".. ... 
..\n" + "8 2011-01-01 00:00:00-05:00 9\n" + "9 2011-01-01 00:00:00-05:00 10\n\n" + "[10 rows x 2 columns]" + ) assert repr(df) == expected - @pytest.mark.parametrize('start_date', [ - '2017-01-01 23:59:59.999999999', - '2017-01-01 23:59:59.99999999', - '2017-01-01 23:59:59.9999999', - '2017-01-01 23:59:59.999999', - '2017-01-01 23:59:59.99999', - '2017-01-01 23:59:59.9999', - ]) + @pytest.mark.parametrize( + "start_date", + [ + "2017-01-01 23:59:59.999999999", + "2017-01-01 23:59:59.99999999", + "2017-01-01 23:59:59.9999999", + "2017-01-01 23:59:59.999999", + "2017-01-01 23:59:59.99999", + "2017-01-01 23:59:59.9999", + ], + ) def test_datetimeindex_highprecision(self, start_date): # GH19030 # Check that high-precision time values for the end of day are # included in repr for DatetimeIndex - df = DataFrame({'A': date_range(start=start_date, - freq='D', periods=5)}) + df = DataFrame({"A": date_range(start=start_date, freq="D", periods=5)}) result = str(df) assert start_date in result - dti = date_range(start=start_date, - freq='D', periods=5) - df = DataFrame({'A': range(5)}, index=dti) + dti = date_range(start=start_date, freq="D", periods=5) + df = DataFrame({"A": range(5)}, index=dti) result = str(df.index) assert start_date in result def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() - lines = rep_str.split('\n') + lines = rep_str.split("\n") assert len(lines[1]) == len(lines[2]) def test_unicode_problem_decoding_as_ascii(self): - dm = DataFrame({'c/\u03c3': Series({'test': np.nan})}) + dm = DataFrame({"c/\u03c3": Series({"test": np.nan})}) str(dm.to_string()) def test_string_repr_encoding(self, datapath): - filepath = datapath('io', 'parser', 'data', 'unicode_series.csv') - df = pd.read_csv(filepath, header=None, encoding='latin1') + filepath = datapath("io", "parser", "data", "unicode_series.csv") + df = pd.read_csv(filepath, header=None, encoding="latin1") repr(df) repr(df[1]) def test_repr_corner(self): # representing infs poses no problems - df = DataFrame({'foo': [-np.inf, np.inf]}) + df = DataFrame({"foo": [-np.inf, np.inf]}) repr(df) def test_frame_info_encoding(self): - index = ['\'Til There Was You (1997)', - 'ldum klaka (Cold Fever) (1994)'] - fmt.set_option('display.max_rows', 1) - df = DataFrame(columns=['a', 'b', 'c'], index=index) + index = ["'Til There Was You (1997)", "ldum klaka (Cold Fever) (1994)"] + fmt.set_option("display.max_rows", 1) + df = DataFrame(columns=["a", "b", "c"], index=index) repr(df) repr(df.T) - fmt.set_option('display.max_rows', 200) + fmt.set_option("display.max_rows", 200) def test_wide_repr(self): - with option_context('mode.sim_interactive', True, - 'display.show_dimensions', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context( + "mode.sim_interactive", + True, + "display.show_dimensions", + True, + "display.max_columns", + 20, + ): + max_cols = get_option("display.max_columns") df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - set_option('display.expand_frame_repr', False) + set_option("display.expand_frame_repr", False) rep_str = repr(df) assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 120): + with option_context("display.width", 120): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - 
reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_wide_columns(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - df = DataFrame(np.random.randn(5, 3), - columns=['a' * 90, 'b' * 90, 'c' * 90]) + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + df = DataFrame( + np.random.randn(5, 3), columns=["a" * 90, "b" * 90, "c" * 90] + ) rep_str = repr(df) assert len(rep_str.splitlines()) == 20 def test_wide_repr_named(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - df.index.name = 'DataFrame Index' - set_option('display.expand_frame_repr', False) + df.index.name = "DataFrame Index" + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) for line in wide_repr.splitlines()[1::13]: - assert 'DataFrame Index' in line + assert "DataFrame Index" in line - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_multiindex(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - max_cols = get_option('display.max_columns') - df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), - index=midx) - df.index.names = ['Level 0', 'Level 1'] - set_option('display.expand_frame_repr', False) + max_cols = get_option("display.max_columns") + df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx) + df.index.names = ["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) for line in wide_repr.splitlines()[1::13]: - assert 'Level 0 Level 1' in line + assert "Level 0 Level 1" in line - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_multiindex_cols(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): - max_cols = get_option('display.max_columns') + with option_context("mode.sim_interactive", True, "display.max_columns", 20): + max_cols = get_option("display.max_columns") midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10))) - mcols = MultiIndex.from_arrays( - tm.rands_array(3, size=(2, max_cols - 1))) - df = DataFrame(tm.rands_array(25, (10, max_cols - 1)), - index=midx, columns=mcols) - df.index.names = ['Level 0', 'Level 1'] - set_option('display.expand_frame_repr', False) + mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - 1))) + df = DataFrame( + tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols + ) + df.index.names = 
["Level 0", "Level 1"] + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150, 'display.max_columns', 20): + with option_context("display.width", 150, "display.max_columns", 20): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_unicode(self): - with option_context('mode.sim_interactive', True, - 'display.max_columns', 20): + with option_context("mode.sim_interactive", True, "display.max_columns", 20): max_cols = 20 df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - set_option('display.expand_frame_repr', False) + set_option("display.expand_frame_repr", False) rep_str = repr(df) - set_option('display.expand_frame_repr', True) + set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr - with option_context('display.width', 150): + with option_context("display.width", 150): wider_repr = repr(df) assert len(wider_repr) < len(wide_repr) - reset_option('display.expand_frame_repr') + reset_option("display.expand_frame_repr") def test_wide_repr_wide_long_columns(self): - with option_context('mode.sim_interactive', True): - df = DataFrame({'a': ['a' * 30, 'b' * 30], - 'b': ['c' * 70, 'd' * 80]}) + with option_context("mode.sim_interactive", True): + df = DataFrame({"a": ["a" * 30, "b" * 30], "b": ["c" * 70, "d" * 80]}) result = repr(df) - assert 'ccccc' in result - assert 'ddddd' in result + assert "ccccc" in result + assert "ddddd" in result def test_long_series(self): n = 1000 s = Series( np.random.randint(-50, 50, n), - index=['s{x:04d}'.format(x=x) for x in range(n)], dtype='int64') + index=["s{x:04d}".format(x=x) for x in range(n)], + dtype="int64", + ) import re + str_rep = str(s) - nmatches = len(re.findall('dtype', str_rep)) + nmatches = len(re.findall("dtype", str_rep)) assert nmatches == 1 def test_index_with_nan(self): # GH 2850 - df = DataFrame({'id1': {0: '1a3', - 1: '9h4'}, - 'id2': {0: np.nan, - 1: 'd67'}, - 'id3': {0: '78d', - 1: '79d'}, - 'value': {0: 123, - 1: 64}}) + df = DataFrame( + { + "id1": {0: "1a3", 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: "78d", 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) # multi-index - y = df.set_index(['id1', 'id2', 'id3']) + y = df.set_index(["id1", "id2", "id3"]) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - '1a3 NaN 78d 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) assert result == expected # index - y = df.set_index('id2') + y = df.set_index("id2") result = y.to_string() - expected = (' id1 id3 value\nid2 \n' - 'NaN 1a3 78d 123\nd67 9h4 79d 64') + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nd67 9h4 79d 64" + ) assert result == expected # with append (this failed in 0.12) - y = df.set_index(['id1', 'id2']).set_index('id3', append=True) + y = df.set_index(["id1", "id2"]).set_index("id3", append=True) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - '1a3 NaN 78d 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "1a3 NaN 78d 123\n9h4 d67 79d 64" + ) assert result == expected # all-nan in mi df2 = df.copy() - df2.loc[:, 'id2'] = np.nan - y = df2.set_index('id2') + df2.loc[:, "id2"] = np.nan + y = df2.set_index("id2") result = y.to_string() - expected = 
(' id1 id3 value\nid2 \n' - 'NaN 1a3 78d 123\nNaN 9h4 79d 64') + expected = ( + " id1 id3 value\nid2 \n" + "NaN 1a3 78d 123\nNaN 9h4 79d 64" + ) assert result == expected # partial nan in mi df2 = df.copy() - df2.loc[:, 'id2'] = np.nan - y = df2.set_index(['id2', 'id3']) + df2.loc[:, "id2"] = np.nan + y = df2.set_index(["id2", "id3"]) result = y.to_string() - expected = (' id1 value\nid2 id3 \n' - 'NaN 78d 1a3 123\n 79d 9h4 64') + expected = ( + " id1 value\nid2 id3 \n" + "NaN 78d 1a3 123\n 79d 9h4 64" + ) assert result == expected - df = DataFrame({'id1': {0: np.nan, - 1: '9h4'}, - 'id2': {0: np.nan, - 1: 'd67'}, - 'id3': {0: np.nan, - 1: '79d'}, - 'value': {0: 123, - 1: 64}}) + df = DataFrame( + { + "id1": {0: np.nan, 1: "9h4"}, + "id2": {0: np.nan, 1: "d67"}, + "id3": {0: np.nan, 1: "79d"}, + "value": {0: 123, 1: 64}, + } + ) - y = df.set_index(['id1', 'id2', 'id3']) + y = df.set_index(["id1", "id2", "id3"]) result = y.to_string() - expected = (' value\nid1 id2 id3 \n' - 'NaN NaN NaN 123\n9h4 d67 79d 64') + expected = ( + " value\nid1 id2 id3 \n" + "NaN NaN NaN 123\n9h4 d67 79d 64" + ) assert result == expected def test_to_string(self): # big mixed - biggie = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=np.arange(200)) + biggie = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) - biggie.loc[:20, 'A'] = np.nan - biggie.loc[:20, 'B'] = np.nan + biggie.loc[:20, "A"] = np.nan + biggie.loc[:20, "B"] = np.nan s = biggie.to_string() buf = StringIO() @@ -1206,37 +1355,37 @@ def test_to_string(self): assert isinstance(s, str) # print in right order - result = biggie.to_string(columns=['B', 'A'], col_space=17, - float_format='%.5f'.__mod__) - lines = result.split('\n') + result = biggie.to_string( + columns=["B", "A"], col_space=17, float_format="%.5f".__mod__ + ) + lines = result.split("\n") header = lines[0].strip().split() - joined = '\n'.join(re.sub(r'\s+', ' ', x).strip() for x in lines[1:]) - recons = read_csv(StringIO(joined), names=header, - header=None, sep=' ') - tm.assert_series_equal(recons['B'], biggie['B']) - assert recons['A'].count() == biggie['A'].count() - assert (np.abs(recons['A'].dropna() - - biggie['A'].dropna()) < 0.1).all() + joined = "\n".join(re.sub(r"\s+", " ", x).strip() for x in lines[1:]) + recons = read_csv(StringIO(joined), names=header, header=None, sep=" ") + tm.assert_series_equal(recons["B"], biggie["B"]) + assert recons["A"].count() == biggie["A"].count() + assert (np.abs(recons["A"].dropna() - biggie["A"].dropna()) < 0.1).all() # expected = ['B', 'A'] # assert header == expected - result = biggie.to_string(columns=['A'], col_space=17) - header = result.split('\n')[0].strip().split() - expected = ['A'] + result = biggie.to_string(columns=["A"], col_space=17) + header = result.split("\n")[0].strip().split() + expected = ["A"] assert header == expected - biggie.to_string(columns=['B', 'A'], - formatters={'A': lambda x: '{x:.1f}'.format(x=x)}) + biggie.to_string( + columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)} + ) - biggie.to_string(columns=['B', 'A'], float_format=str) - biggie.to_string(columns=['B', 'A'], col_space=12, float_format=str) + biggie.to_string(columns=["B", "A"], float_format=str) + biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) frame = DataFrame(index=np.arange(200)) frame.to_string() def test_to_string_no_header(self): - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 
6]}) df_s = df.to_string(header=False) expected = "0 1 4\n1 2 5\n2 3 6" @@ -1244,50 +1393,46 @@ def test_to_string_no_header(self): assert df_s == expected def test_to_string_specified_header(self): - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - df_s = df.to_string(header=['X', 'Y']) - expected = ' X Y\n0 1 4\n1 2 5\n2 3 6' + df_s = df.to_string(header=["X", "Y"]) + expected = " X Y\n0 1 4\n1 2 5\n2 3 6" assert df_s == expected with pytest.raises(ValueError): - df.to_string(header=['X']) + df.to_string(header=["X"]) def test_to_string_no_index(self): # GH 16839, GH 13032 - df = DataFrame({'x': [11, 22], 'y': [33, -44], 'z': ['AAA', ' ']}) + df = DataFrame({"x": [11, 22], "y": [33, -44], "z": ["AAA", " "]}) df_s = df.to_string(index=False) # Leading space is expected for positive numbers. - expected = (" x y z\n" - " 11 33 AAA\n" - " 22 -44 ") + expected = " x y z\n" " 11 33 AAA\n" " 22 -44 " assert df_s == expected - df_s = df[['y', 'x', 'z']].to_string(index=False) - expected = (" y x z\n" - " 33 11 AAA\n" - "-44 22 ") + df_s = df[["y", "x", "z"]].to_string(index=False) + expected = " y x z\n" " 33 11 AAA\n" "-44 22 " assert df_s == expected def test_to_string_line_width_no_index(self): # GH 13998, GH 22505 - df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected - df = DataFrame({'x': [11, 22, 33], 'y': [4, 5, 6]}) + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected - df = DataFrame({'x': [11, 22, -33], 'y': [4, 5, -6]}) + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) df_s = df.to_string(line_width=1, index=False) expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " @@ -1296,129 +1441,147 @@ def test_to_string_line_width_no_index(self): def test_to_string_float_formatting(self): tm.reset_display_options() - fmt.set_option('display.precision', 5, 'display.column_space', 12, - 'display.notebook_repr_html', False) + fmt.set_option( + "display.precision", + 5, + "display.column_space", + 12, + "display.notebook_repr_html", + False, + ) - df = DataFrame({'x': [0, 0.25, 3456.000, 12e+45, 1.64e+6, 1.7e+8, - 1.253456, np.pi, -1e6]}) + df = DataFrame( + {"x": [0, 0.25, 3456.000, 12e45, 1.64e6, 1.7e8, 1.253456, np.pi, -1e6]} + ) df_s = df.to_string() if _three_digit_exp(): - expected = (' x\n0 0.00000e+000\n1 2.50000e-001\n' - '2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n' - '5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n' - '8 -1.00000e+006') + expected = ( + " x\n0 0.00000e+000\n1 2.50000e-001\n" + "2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n" + "5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n" + "8 -1.00000e+006" + ) else: - expected = (' x\n0 0.00000e+00\n1 2.50000e-01\n' - '2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n' - '5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n' - '8 -1.00000e+06') + expected = ( + " x\n0 0.00000e+00\n1 2.50000e-01\n" + "2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n" + "5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n" + "8 -1.00000e+06" + ) assert df_s == expected - df = DataFrame({'x': [3234, 0.253]}) + df = DataFrame({"x": [3234, 0.253]}) df_s = df.to_string() - expected = (' x\n' '0 3234.000\n' '1 0.253') + expected = " x\n" "0 3234.000\n" "1 0.253" assert df_s == expected 
tm.reset_display_options() assert get_option("display.precision") == 6 - df = DataFrame({'x': [1e9, 0.2512]}) + df = DataFrame({"x": [1e9, 0.2512]}) df_s = df.to_string() if _three_digit_exp(): - expected = (' x\n' - '0 1.000000e+009\n' - '1 2.512000e-001') + expected = " x\n" "0 1.000000e+009\n" "1 2.512000e-001" else: - expected = (' x\n' - '0 1.000000e+09\n' - '1 2.512000e-01') + expected = " x\n" "0 1.000000e+09\n" "1 2.512000e-01" assert df_s == expected def test_to_string_float_format_no_fixed_width(self): # GH 21625 - df = DataFrame({'x': [0.19999]}) - expected = ' x\n0 0.200' - assert df.to_string(float_format='%.3f') == expected + df = DataFrame({"x": [0.19999]}) + expected = " x\n0 0.200" + assert df.to_string(float_format="%.3f") == expected # GH 22270 - df = DataFrame({'x': [100.0]}) - expected = ' x\n0 100' - assert df.to_string(float_format='%.0f') == expected + df = DataFrame({"x": [100.0]}) + expected = " x\n0 100" + assert df.to_string(float_format="%.0f") == expected def test_to_string_small_float_values(self): - df = DataFrame({'a': [1.5, 1e-17, -5.5e-7]}) + df = DataFrame({"a": [1.5, 1e-17, -5.5e-7]}) result = df.to_string() # sadness per above - if '{x:.4g}'.format(x=1.7e8) == '1.7e+008': - expected = (' a\n' - '0 1.500000e+000\n' - '1 1.000000e-017\n' - '2 -5.500000e-007') + if "{x:.4g}".format(x=1.7e8) == "1.7e+008": + expected = ( + " a\n" + "0 1.500000e+000\n" + "1 1.000000e-017\n" + "2 -5.500000e-007" + ) else: - expected = (' a\n' - '0 1.500000e+00\n' - '1 1.000000e-17\n' - '2 -5.500000e-07') + expected = ( + " a\n" + "0 1.500000e+00\n" + "1 1.000000e-17\n" + "2 -5.500000e-07" + ) assert result == expected # but not all exactly zero df = df * 0 result = df.to_string() - expected = (' 0\n' '0 0\n' '1 0\n' '2 -0') + expected = " 0\n" "0 0\n" "1 0\n" "2 -0" def test_to_string_float_index(self): index = Index([1.5, 2, 3, 4, 5]) df = DataFrame(np.arange(5), index=index) result = df.to_string() - expected = (' 0\n' - '1.5 0\n' - '2.0 1\n' - '3.0 2\n' - '4.0 3\n' - '5.0 4') + expected = " 0\n" "1.5 0\n" "2.0 1\n" "3.0 2\n" "4.0 3\n" "5.0 4" assert result == expected def test_to_string_complex_float_formatting(self): # GH #25514 - with pd.option_context('display.precision', 5): - df = DataFrame({'x': [ - (0.4467846931321966 + 0.0715185102060818j), - (0.2739442392974528 + 0.23515228785438969j), - (0.26974928742135185 + 0.3250604054898979j)]}) + with pd.option_context("display.precision", 5): + df = DataFrame( + { + "x": [ + (0.4467846931321966 + 0.0715185102060818j), + (0.2739442392974528 + 0.23515228785438969j), + (0.26974928742135185 + 0.3250604054898979j), + ] + } + ) result = df.to_string() - expected = (' x\n0 0.44678+0.07152j\n' - '1 0.27394+0.23515j\n' - '2 0.26975+0.32506j') + expected = ( + " x\n0 0.44678+0.07152j\n" + "1 0.27394+0.23515j\n" + "2 0.26975+0.32506j" + ) assert result == expected def test_to_string_ascii_error(self): - data = [('0 ', ' .gitignore ', ' 5 ', - ' \xe2\x80\xa2\xe2\x80\xa2\xe2\x80' - '\xa2\xe2\x80\xa2\xe2\x80\xa2')] + data = [ + ( + "0 ", + " .gitignore ", + " 5 ", + " \xe2\x80\xa2\xe2\x80\xa2\xe2\x80" "\xa2\xe2\x80\xa2\xe2\x80\xa2", + ) + ] df = DataFrame(data) # it works! 
repr(df) def test_to_string_int_formatting(self): - df = DataFrame({'x': [-15, 20, 25, -35]}) - assert issubclass(df['x'].dtype.type, np.integer) + df = DataFrame({"x": [-15, 20, 25, -35]}) + assert issubclass(df["x"].dtype.type, np.integer) output = df.to_string() - expected = (' x\n' '0 -15\n' '1 20\n' '2 25\n' '3 -35') + expected = " x\n" "0 -15\n" "1 20\n" "2 25\n" "3 -35" assert output == expected def test_to_string_index_formatter(self): df = DataFrame([range(5), range(5, 10), range(10, 15)]) - rs = df.to_string(formatters={'__index__': lambda x: 'abc' [x]}) + rs = df.to_string(formatters={"__index__": lambda x: "abc"[x]}) xp = """\ 0 1 2 3 4 @@ -1431,220 +1594,280 @@ def test_to_string_index_formatter(self): def test_to_string_left_justify_cols(self): tm.reset_display_options() - df = DataFrame({'x': [3234, 0.253]}) - df_s = df.to_string(justify='left') - expected = (' x \n' '0 3234.000\n' '1 0.253') + df = DataFrame({"x": [3234, 0.253]}) + df_s = df.to_string(justify="left") + expected = " x \n" "0 3234.000\n" "1 0.253" assert df_s == expected def test_to_string_format_na(self): tm.reset_display_options() - df = DataFrame({'A': [np.nan, -1, -2.1234, 3, 4], - 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + df = DataFrame( + { + "A": [np.nan, -1, -2.1234, 3, 4], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 NaN NaN\n' - '1 -1.0000 foo\n' - '2 -2.1234 foooo\n' - '3 3.0000 fooooo\n' - '4 4.0000 bar') + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0000 foo\n" + "2 -2.1234 foooo\n" + "3 3.0000 fooooo\n" + "4 4.0000 bar" + ) assert result == expected - df = DataFrame({'A': [np.nan, -1., -2., 3., 4.], - 'B': [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + df = DataFrame( + { + "A": [np.nan, -1.0, -2.0, 3.0, 4.0], + "B": [np.nan, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 NaN NaN\n' - '1 -1.0 foo\n' - '2 -2.0 foooo\n' - '3 3.0 fooooo\n' - '4 4.0 bar') + expected = ( + " A B\n" + "0 NaN NaN\n" + "1 -1.0 foo\n" + "2 -2.0 foooo\n" + "3 3.0 fooooo\n" + "4 4.0 bar" + ) assert result == expected def test_to_string_format_inf(self): # Issue #24861 tm.reset_display_options() - df = DataFrame({ - 'A': [-np.inf, np.inf, -1, -2.1234, 3, 4], - 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar'] - }) + df = DataFrame( + { + "A": [-np.inf, np.inf, -1, -2.1234, 3, 4], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 -inf -inf\n' - '1 inf inf\n' - '2 -1.0000 foo\n' - '3 -2.1234 foooo\n' - '4 3.0000 fooooo\n' - '5 4.0000 bar') + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0000 foo\n" + "3 -2.1234 foooo\n" + "4 3.0000 fooooo\n" + "5 4.0000 bar" + ) assert result == expected - df = DataFrame({ - 'A': [-np.inf, np.inf, -1., -2., 3., 4.], - 'B': [-np.inf, np.inf, 'foo', 'foooo', 'fooooo', 'bar'] - }) + df = DataFrame( + { + "A": [-np.inf, np.inf, -1.0, -2.0, 3.0, 4.0], + "B": [-np.inf, np.inf, "foo", "foooo", "fooooo", "bar"], + } + ) result = df.to_string() - expected = (' A B\n' - '0 -inf -inf\n' - '1 inf inf\n' - '2 -1.0 foo\n' - '3 -2.0 foooo\n' - '4 3.0 fooooo\n' - '5 4.0 bar') + expected = ( + " A B\n" + "0 -inf -inf\n" + "1 inf inf\n" + "2 -1.0 foo\n" + "3 -2.0 foooo\n" + "4 3.0 fooooo\n" + "5 4.0 bar" + ) assert result == expected def test_to_string_decimal(self): # Issue #23614 - df = DataFrame({'A': [6.0, 3.1, 2.2]}) - expected = ' A\n0 6,0\n1 3,1\n2 2,2' - assert df.to_string(decimal=',') 
== expected + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + expected = " A\n0 6,0\n1 3,1\n2 2,2" + assert df.to_string(decimal=",") == expected def test_to_string_line_width(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) s = df.to_string(line_width=80) - assert max(len(l) for l in s.split('\n')) == 80 + assert max(len(l) for l in s.split("\n")) == 80 def test_show_dimensions(self): df = DataFrame(123, index=range(10, 15), columns=range(30)) - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', True): - assert '5 rows' in str(df) - assert '5 rows' in df._repr_html_() - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', False): - assert '5 rows' not in str(df) - assert '5 rows' not in df._repr_html_() - with option_context('display.max_rows', 2, 'display.max_columns', 2, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', 'truncate'): - assert '5 rows' in str(df) - assert '5 rows' in df._repr_html_() - with option_context('display.max_rows', 10, 'display.max_columns', 40, - 'display.width', 500, 'display.expand_frame_repr', - 'info', 'display.show_dimensions', 'truncate'): - assert '5 rows' not in str(df) - assert '5 rows' not in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + True, + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + False, + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() + with option_context( + "display.max_rows", + 2, + "display.max_columns", + 2, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" in str(df) + assert "5 rows" in df._repr_html_() + with option_context( + "display.max_rows", + 10, + "display.max_columns", + 40, + "display.width", + 500, + "display.expand_frame_repr", + "info", + "display.show_dimensions", + "truncate", + ): + assert "5 rows" not in str(df) + assert "5 rows" not in df._repr_html_() def test_repr_html(self, float_frame): df = float_frame df._repr_html_() - fmt.set_option('display.max_rows', 1, 'display.max_columns', 1) + fmt.set_option("display.max_rows", 1, "display.max_columns", 1) df._repr_html_() - fmt.set_option('display.notebook_repr_html', False) + fmt.set_option("display.notebook_repr_html", False) df._repr_html_() tm.reset_display_options() df = DataFrame([[1, 2], [3, 4]]) - fmt.set_option('display.show_dimensions', True) - assert '2 rows' in df._repr_html_() - fmt.set_option('display.show_dimensions', False) - assert '2 rows' not in df._repr_html_() + fmt.set_option("display.show_dimensions", True) + assert "2 rows" in df._repr_html_() + fmt.set_option("display.show_dimensions", False) + assert "2 rows" not in df._repr_html_() tm.reset_display_options() def test_repr_html_mathjax(self): df = DataFrame([[1, 2], [3, 4]]) - assert 'tex2jax_ignore' not in df._repr_html_() + assert "tex2jax_ignore" not in df._repr_html_() - with pd.option_context('display.html.use_mathjax', False): - assert 'tex2jax_ignore' in 
df._repr_html_() + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in df._repr_html_() def test_repr_html_wide(self): max_cols = 20 df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1))) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." not in df._repr_html_() wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1))) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + with option_context("display.max_rows", 60, "display.max_columns", 20): assert "..." in wide_df._repr_html_() def test_repr_html_wide_multiindex_cols(self): max_cols = 20 - mcols = MultiIndex.from_product([np.arange(max_cols // 2), - ['foo', 'bar']], - names=['first', 'second']) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), - columns=mcols) + mcols = MultiIndex.from_product( + [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) reg_repr = df._repr_html_() - assert '...' not in reg_repr + assert "..." not in reg_repr - mcols = MultiIndex.from_product((np.arange(1 + (max_cols // 2)), - ['foo', 'bar']), - names=['first', 'second']) - df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), - columns=mcols) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' in df._repr_html_() + mcols = MultiIndex.from_product( + (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"] + ) + df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols) + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_repr_html_long(self): - with option_context('display.max_rows', 60): - max_rows = get_option('display.max_rows') + with option_context("display.max_rows", 60): + max_rows = get_option("display.max_rows") h = max_rows - 1 - df = DataFrame({'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}) + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) reg_repr = df._repr_html_() - assert '..' not in reg_repr + assert ".." not in reg_repr assert str(41 + max_rows // 2) in reg_repr h = max_rows + 1 - df = DataFrame({'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}) + df = DataFrame({"A": np.arange(1, 1 + h), "B": np.arange(41, 41 + h)}) long_repr = df._repr_html_() - assert '..' in long_repr + assert ".." in long_repr assert str(41 + max_rows // 2) not in long_repr - assert '{h} rows '.format(h=h) in long_repr - assert '2 columns' in long_repr + assert "{h} rows ".format(h=h) in long_repr + assert "2 columns" in long_repr def test_repr_html_float(self): - with option_context('display.max_rows', 60): + with option_context("display.max_rows", 60): - max_rows = get_option('display.max_rows') + max_rows = get_option("display.max_rows") h = max_rows - 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") reg_repr = df._repr_html_() - assert '..' not in reg_repr - assert '{val}'.format(val=str(40 + h)) in reg_repr + assert ".." 
not in reg_repr + assert "{val}".format(val=str(40 + h)) in reg_repr h = max_rows + 1 - df = DataFrame({'idx': np.linspace(-10, 10, h), - 'A': np.arange(1, 1 + h), - 'B': np.arange(41, 41 + h)}).set_index('idx') + df = DataFrame( + { + "idx": np.linspace(-10, 10, h), + "A": np.arange(1, 1 + h), + "B": np.arange(41, 41 + h), + } + ).set_index("idx") long_repr = df._repr_html_() - assert '..' in long_repr - assert '{val}'.format(val='31') not in long_repr - assert '{h} rows '.format(h=h) in long_repr - assert '2 columns' in long_repr + assert ".." in long_repr + assert "{val}".format(val="31") not in long_repr + assert "{h} rows ".format(h=h) in long_repr + assert "2 columns" in long_repr def test_repr_html_long_multiindex(self): max_rows = 60 max_L1 = max_rows // 2 - tuples = list(itertools.product(np.arange(max_L1), ['foo', 'bar'])) - idx = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx, - columns=['A', 'B']) - with option_context('display.max_rows', 60, 'display.max_columns', 20): + tuples = list(itertools.product(np.arange(max_L1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame(np.random.randn(max_L1 * 2, 2), index=idx, columns=["A", "B"]) + with option_context("display.max_rows", 60, "display.max_columns", 20): reg_repr = df._repr_html_() - assert '...' not in reg_repr + assert "..." not in reg_repr - tuples = list(itertools.product(np.arange(max_L1 + 1), ['foo', 'bar'])) - idx = MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = DataFrame(np.random.randn((max_L1 + 1) * 2, 2), index=idx, - columns=['A', 'B']) + tuples = list(itertools.product(np.arange(max_L1 + 1), ["foo", "bar"])) + idx = MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = DataFrame( + np.random.randn((max_L1 + 1) * 2, 2), index=idx, columns=["A", "B"] + ) long_repr = df._repr_html_() - assert '...' in long_repr + assert "..." in long_repr def test_repr_html_long_and_wide(self): max_cols = 20 @@ -1652,13 +1875,13 @@ def test_repr_html_long_and_wide(self): h, w = max_rows - 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' not in df._repr_html_() + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." not in df._repr_html_() h, w = max_rows + 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - with option_context('display.max_rows', 60, 'display.max_columns', 20): - assert '...' in df._repr_html_() + with option_context("display.max_rows", 60, "display.max_columns", 20): + assert "..." in df._repr_html_() def test_info_repr(self): # GH#21746 For tests inside a terminal (i.e. 
not CI) we need to detect @@ -1671,28 +1894,39 @@ def test_info_repr(self): h, w = max_rows + 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_vertically_truncated_repr(df) - with option_context('display.large_repr', 'info'): + with option_context("display.large_repr", "info"): assert has_info_repr(df) # Wide h, w = max_rows - 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) assert has_horizontally_truncated_repr(df) - with option_context('display.large_repr', 'info', - 'display.max_columns', max_cols): + with option_context( + "display.large_repr", "info", "display.max_columns", max_cols + ): assert has_info_repr(df) def test_info_repr_max_cols(self): # GH #6939 df = DataFrame(np.random.randn(10, 5)) - with option_context('display.large_repr', 'info', - 'display.max_columns', 1, - 'display.max_info_columns', 4): + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 4, + ): assert has_non_verbose_info_repr(df) - with option_context('display.large_repr', 'info', - 'display.max_columns', 1, - 'display.max_info_columns', 5): + with option_context( + "display.large_repr", + "info", + "display.max_columns", + 1, + "display.max_info_columns", + 5, + ): assert not has_non_verbose_info_repr(df) # test verbose overrides @@ -1704,32 +1938,32 @@ def test_info_repr_html(self): # Long h, w = max_rows + 1, max_cols - 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert r'<class' not in df._repr_html_() - with option_context('display.large_repr', 'info'): - assert r'<class' in df._repr_html_() + assert r"<class" not in df._repr_html_() + with option_context("display.large_repr", "info"): + assert r"<class" in df._repr_html_() # Wide h, w = max_rows - 1, max_cols + 1 df = DataFrame({k: np.arange(1, 1 + h) for k in np.arange(w)}) - assert ' never truncate - assert '..' not in repr(s) + assert ".." 
not in repr(s) def test_to_string_name(self): - s = Series(range(100), dtype='int64') - s.name = 'myser' + s = Series(range(100), dtype="int64") + s.name = "myser" res = s.to_string(max_rows=2, name=True) - exp = '0 0\n ..\n99 99\nName: myser' + exp = "0 0\n ..\n99 99\nName: myser" assert res == exp res = s.to_string(max_rows=2, name=False) - exp = '0 0\n ..\n99 99' + exp = "0 0\n ..\n99 99" assert res == exp def test_to_string_dtype(self): - s = Series(range(100), dtype='int64') + s = Series(range(100), dtype="int64") res = s.to_string(max_rows=2, dtype=True) - exp = '0 0\n ..\n99 99\ndtype: int64' + exp = "0 0\n ..\n99 99\ndtype: int64" assert res == exp res = s.to_string(max_rows=2, dtype=False) - exp = '0 0\n ..\n99 99' + exp = "0 0\n ..\n99 99" assert res == exp def test_to_string_length(self): - s = Series(range(100), dtype='int64') + s = Series(range(100), dtype="int64") res = s.to_string(max_rows=2, length=True) - exp = '0 0\n ..\n99 99\nLength: 100' + exp = "0 0\n ..\n99 99\nLength: 100" assert res == exp def test_to_string_na_rep(self): s = pd.Series(index=range(100)) - res = s.to_string(na_rep='foo', max_rows=2) - exp = '0 foo\n ..\n99 foo' + res = s.to_string(na_rep="foo", max_rows=2) + exp = "0 foo\n ..\n99 foo" assert res == exp def test_to_string_float_format(self): - s = pd.Series(range(10), dtype='float64') - res = s.to_string(float_format=lambda x: '{0:2.1f}'.format(x), - max_rows=2) - exp = '0 0.0\n ..\n9 9.0' + s = pd.Series(range(10), dtype="float64") + res = s.to_string(float_format=lambda x: "{0:2.1f}".format(x), max_rows=2) + exp = "0 0.0\n ..\n9 9.0" assert res == exp def test_to_string_header(self): - s = pd.Series(range(10), dtype='int64') - s.index.name = 'foo' + s = pd.Series(range(10), dtype="int64") + s.index.name = "foo" res = s.to_string(header=True, max_rows=2) - exp = 'foo\n0 0\n ..\n9 9' + exp = "foo\n0 0\n ..\n9 9" assert res == exp res = s.to_string(header=False, max_rows=2) - exp = '0 0\n ..\n9 9' + exp = "0 0\n ..\n9 9" assert res == exp def test_to_string_multindex_header(self): # GH 16718 - df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) - .set_index(['a', 'b'])) - res = df.to_string(header=['r1', 'r2']) - exp = ' r1 r2\na b \n0 1 2 3' + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + res = df.to_string(header=["r1", "r2"]) + exp = " r1 r2\na b \n0 1 2 3" assert res == exp def _three_digit_exp(): - return '{x:.4g}'.format(x=1.7e8) == '1.7e+008' + return "{x:.4g}".format(x=1.7e8) == "1.7e+008" class TestFloatArrayFormatter: - def test_misc(self): obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) result = obj.get_result() @@ -2418,57 +2724,70 @@ def test_output_significant_digits(self): # Issue #9764 # In case default display precision changes: - with pd.option_context('display.precision', 6): + with pd.option_context("display.precision", 6): # DataFrame example from issue #9764 d = pd.DataFrame( - {'col1': [9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, - 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, - 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) + { + "col1": [ + 9.999e-8, + 1e-7, + 1.0001e-7, + 2e-7, + 4.999e-7, + 5e-7, + 5.0001e-7, + 6e-7, + 9.999e-7, + 1e-6, + 1.0001e-6, + 2e-6, + 4.999e-6, + 5e-6, + 5.0001e-6, + 6e-6, + ] + } + ) expected_output = { - (0, 6): - ' col1\n' - '0 9.999000e-08\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 4.999000e-07\n' - '5 5.000000e-07', - (1, 6): - ' col1\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 
4.999000e-07\n' - '5 5.000000e-07', - (1, 8): - ' col1\n' - '1 1.000000e-07\n' - '2 1.000100e-07\n' - '3 2.000000e-07\n' - '4 4.999000e-07\n' - '5 5.000000e-07\n' - '6 5.000100e-07\n' - '7 6.000000e-07', - (8, 16): - ' col1\n' - '8 9.999000e-07\n' - '9 1.000000e-06\n' - '10 1.000100e-06\n' - '11 2.000000e-06\n' - '12 4.999000e-06\n' - '13 5.000000e-06\n' - '14 5.000100e-06\n' - '15 6.000000e-06', - (9, 16): - ' col1\n' - '9 0.000001\n' - '10 0.000001\n' - '11 0.000002\n' - '12 0.000005\n' - '13 0.000005\n' - '14 0.000005\n' - '15 0.000006' + (0, 6): " col1\n" + "0 9.999000e-08\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 6): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07", + (1, 8): " col1\n" + "1 1.000000e-07\n" + "2 1.000100e-07\n" + "3 2.000000e-07\n" + "4 4.999000e-07\n" + "5 5.000000e-07\n" + "6 5.000100e-07\n" + "7 6.000000e-07", + (8, 16): " col1\n" + "8 9.999000e-07\n" + "9 1.000000e-06\n" + "10 1.000100e-06\n" + "11 2.000000e-06\n" + "12 4.999000e-06\n" + "13 5.000000e-06\n" + "14 5.000100e-06\n" + "15 6.000000e-06", + (9, 16): " col1\n" + "9 0.000001\n" + "10 0.000001\n" + "11 0.000002\n" + "12 0.000005\n" + "13 0.000005\n" + "14 0.000005\n" + "15 0.000006", } for (start, stop), v in expected_output.items(): @@ -2476,24 +2795,23 @@ def test_output_significant_digits(self): def test_too_long(self): # GH 10451 - with pd.option_context('display.precision', 4): + with pd.option_context("display.precision", 4): # need both a number > 1e6 and something that normally formats to # having length > display.precision + 6 df = pd.DataFrame(dict(x=[12345.6789])) - assert str(df) == ' x\n0 12345.6789' + assert str(df) == " x\n0 12345.6789" df = pd.DataFrame(dict(x=[2e6])) - assert str(df) == ' x\n0 2000000.0' + assert str(df) == " x\n0 2000000.0" df = pd.DataFrame(dict(x=[12345.6789, 2e6])) - assert str(df) == ' x\n0 1.2346e+04\n1 2.0000e+06' + assert str(df) == " x\n0 1.2346e+04\n1 2.0000e+06" class TestRepr_timedelta64: - def test_none(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") drepr = lambda x: x._repr_base() assert drepr(delta_1d) == "1 days" @@ -2507,12 +2825,12 @@ def test_none(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_sub_day(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") - drepr = lambda x: x._repr_base(format='sub_day') + drepr = lambda x: x._repr_base(format="sub_day") assert drepr(delta_1d) == "1 days" assert drepr(-delta_1d) == "-1 days" assert drepr(delta_0d) == "00:00:00" @@ -2524,12 +2842,12 @@ def test_sub_day(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_long(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1s = pd.to_timedelta(1, unit='s') - delta_500ms = pd.to_timedelta(500, unit='ms') + delta_1d = 
pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1s = pd.to_timedelta(1, unit="s") + delta_500ms = pd.to_timedelta(500, unit="ms") - drepr = lambda x: x._repr_base(format='long') + drepr = lambda x: x._repr_base(format="long") assert drepr(delta_1d) == "1 days 00:00:00" assert drepr(-delta_1d) == "-1 days +00:00:00" assert drepr(delta_0d) == "0 days 00:00:00" @@ -2541,11 +2859,11 @@ def test_long(self): assert drepr(-delta_1d + delta_500ms) == "-1 days +00:00:00.500000" def test_all(self): - delta_1d = pd.to_timedelta(1, unit='D') - delta_0d = pd.to_timedelta(0, unit='D') - delta_1ns = pd.to_timedelta(1, unit='ns') + delta_1d = pd.to_timedelta(1, unit="D") + delta_0d = pd.to_timedelta(0, unit="D") + delta_1ns = pd.to_timedelta(1, unit="ns") - drepr = lambda x: x._repr_base(format='all') + drepr = lambda x: x._repr_base(format="all") assert drepr(delta_1d) == "1 days 00:00:00.000000000" assert drepr(-delta_1d) == "-1 days +00:00:00.000000000" assert drepr(delta_0d) == "0 days 00:00:00.000000000" @@ -2554,9 +2872,8 @@ def test_all(self): class TestTimedelta64Formatter: - def test_days(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'1 days'" @@ -2572,35 +2889,34 @@ def test_days(self): assert result[0].strip() == "1 days" def test_days_neg(self): - x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(-x, box=True).get_result() assert result[0].strip() == "'0 days'" assert result[1].strip() == "'-1 days'" def test_subdays(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(y, box=True).get_result() assert result[0].strip() == "'00:00:00'" assert result[1].strip() == "'00:00:01'" def test_subdays_neg(self): - y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit='s') + y = pd.to_timedelta(list(range(5)) + [pd.NaT], unit="s") result = fmt.Timedelta64Formatter(-y, box=True).get_result() assert result[0].strip() == "'00:00:00'" assert result[1].strip() == "'-1 days +23:59:59'" def test_zero(self): - x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit='D') + x = pd.to_timedelta(list(range(1)) + [pd.NaT], unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" - x = pd.to_timedelta(list(range(1)), unit='D') + x = pd.to_timedelta(list(range(1)), unit="D") result = fmt.Timedelta64Formatter(x, box=True).get_result() assert result[0].strip() == "'0 days'" class TestDatetime64Formatter: - def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) result = fmt.Datetime64Formatter(x).get_result() @@ -2622,35 +2938,35 @@ def test_dates_display(self): # 10170 # make sure that we are consistently display date formatting - x = Series(date_range('20130101 09:00:00', periods=5, freq='D')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="D")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" - x = Series(date_range('20130101 09:00:00', periods=5, freq='s')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="s")) x.iloc[1] = 
np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" - x = Series(date_range('20130101 09:00:00', periods=5, freq='ms')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="ms")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" - x = Series(date_range('20130101 09:00:00', periods=5, freq='us')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="us")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" - x = Series(date_range('20130101 09:00:00', periods=5, freq='N')) + x = Series(date_range("20130101 09:00:00", periods=5, freq="N")) x.iloc[1] = np.nan result = fmt.Datetime64Formatter(x).get_result() assert result[0].strip() == "2013-01-01 09:00:00.000000000" @@ -2661,27 +2977,27 @@ def test_datetime64formatter_yearmonth(self): x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) def format_func(x): - return x.strftime('%Y-%m') + return x.strftime("%Y-%m") formatter = fmt.Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() - assert result == ['2016-01', '2016-02'] + assert result == ["2016-01", "2016-02"] def test_datetime64formatter_hoursecond(self): - x = Series(pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')) + x = Series( + pd.to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f") + ) def format_func(x): - return x.strftime('%H:%M') + return x.strftime("%H:%M") formatter = fmt.Datetime64Formatter(x, formatter=format_func) result = formatter.get_result() - assert result == ['10:10', '12:12'] + assert result == ["10:10", "12:12"] class TestNaTFormatting: - def test_repr(self): assert repr(pd.NaT) == "NaT" @@ -2690,7 +3006,6 @@ def test_str(self): class TestDatetimeIndexFormat: - def test_datetime(self): formatted = pd.to_datetime([datetime(2003, 1, 1, 12), pd.NaT]).format() assert formatted[0] == "2003-01-01 12:00:00" @@ -2705,34 +3020,34 @@ def test_date_tz(self): formatted = pd.to_datetime([datetime(2013, 1, 1)], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" - formatted = pd.to_datetime( - [datetime(2013, 1, 1), pd.NaT], utc=True).format() + formatted = pd.to_datetime([datetime(2013, 1, 1), pd.NaT], utc=True).format() assert formatted[0] == "2013-01-01 00:00:00+00:00" def test_date_explicit_date_format(self): formatted = pd.to_datetime([datetime(2003, 2, 1), pd.NaT]).format( - date_format="%m-%d-%Y", na_rep="UT") + date_format="%m-%d-%Y", na_rep="UT" + ) assert formatted[0] == "02-01-2003" assert formatted[1] == "UT" class TestDatetimeIndexUnicode: - def test_dates(self): - text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1) - ])) + text = str(pd.to_datetime([datetime(2013, 1, 1), datetime(2014, 1, 1)])) assert "['2013-01-01'," in text assert ", '2014-01-01']" in text def test_mixed(self): - text = str(pd.to_datetime([datetime(2013, 1, 1), datetime( - 2014, 1, 1, 12), datetime(2014, 1, 1)])) + text = str( + pd.to_datetime( + [datetime(2013, 1, 1), datetime(2014, 1, 1, 12), datetime(2014, 1, 1)] + ) + ) assert "'2013-01-01 00:00:00'," in text assert "'2014-01-01 00:00:00']" in text class 
TestStringRepTimestamp: - def test_no_tz(self): dt_date = datetime(2013, 1, 2) assert str(dt_date) == str(Timestamp(dt_date)) @@ -2772,17 +3087,17 @@ def test_tz_dateutil(self): assert str(dt_datetime_us) == str(Timestamp(dt_datetime_us)) def test_nat_representations(self): - for f in (str, repr, methodcaller('isoformat')): - assert f(pd.NaT) == 'NaT' + for f in (str, repr, methodcaller("isoformat")): + assert f(pd.NaT) == "NaT" def test_format_percentiles(): result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) - expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + expected = ["1.999%", "2.001%", "50%", "66.667%", "99.99%"] assert result == expected result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) - expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + expected = ["0%", "50%", "2.0%", "50%", "66.67%", "99.99%"] assert result == expected msg = r"percentiles should all be in the interval \[0,1\]" @@ -2793,19 +3108,31 @@ def test_format_percentiles(): with pytest.raises(ValueError, match=msg): fmt.format_percentiles([2, 0.1, 0.5]) with pytest.raises(ValueError, match=msg): - fmt.format_percentiles([0.1, 0.5, 'a']) + fmt.format_percentiles([0.1, 0.5, "a"]) def test_format_percentiles_integer_idx(): # Issue #26660 result = fmt.format_percentiles(np.linspace(0, 1, 10 + 1)) - expected = ['0%', '10%', '20%', '30%', '40%', '50%', - '60%', '70%', '80%', '90%', '100%'] + expected = [ + "0%", + "10%", + "20%", + "30%", + "40%", + "50%", + "60%", + "70%", + "80%", + "90%", + "100%", + ] assert result == expected def test_repr_html_ipython_config(ip): - code = textwrap.dedent("""\ + code = textwrap.dedent( + """\ import pandas as pd df = pd.DataFrame({"A": [1, 2]}) df._repr_html_() @@ -2813,6 +3140,7 @@ def test_repr_html_ipython_config(ip): cfg = get_ipython().config cfg['IPKernelApp']['parent_appname'] df._repr_html_() - """) + """ + ) result = ip.run_cell(code) assert not result.error_in_exec diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index 1ea7c4d94bbbe..f0d5ef19c4468 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -10,22 +10,23 @@ def test_adjoin(): - data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] - expected = 'a dd ggg\nb ee hhh\nc ff iii' + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" adjoined = printing.adjoin(2, *data) - assert (adjoined == expected) + assert adjoined == expected def test_repr_binary_type(): import string + letters = string.ascii_letters try: - raw = bytes(letters, encoding=cf.get_option('display.encoding')) + raw = bytes(letters, encoding=cf.get_option("display.encoding")) except TypeError: raw = bytes(letters) - b = str(raw.decode('utf-8')) + b = str(raw.decode("utf-8")) res = printing.pprint_thing(b, quote_strings=True) assert res == repr(b) res = printing.pprint_thing(b, quote_strings=False) @@ -33,18 +34,17 @@ def test_repr_binary_type(): class TestFormattBase: - def test_adjoin(self): - data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] - expected = 'a dd ggg\nb ee hhh\nc ff iii' + data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]] + expected = "a dd ggg\nb ee hhh\nc ff iii" adjoined = printing.adjoin(2, *data) assert adjoined == expected def test_adjoin_unicode(self): - data = [['あ', 'b', 'c'], ['dd', 'ええ', 'ff'], ['ggg', 'hhh', 'いいい']] - expected = 'あ dd ggg\nb ええ hhh\nc ff いいい' 
+ data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]] + expected = "あ dd ggg\nb ええ hhh\nc ff いいい" adjoined = printing.adjoin(2, *data) assert adjoined == expected @@ -56,7 +56,7 @@ def test_adjoin_unicode(self): adjoined = adj.adjoin(2, *data) assert adjoined == expected - cols = adjoined.split('\n') + cols = adjoined.split("\n") assert adj.len(cols[0]) == 13 assert adj.len(cols[1]) == 13 assert adj.len(cols[2]) == 16 @@ -67,7 +67,7 @@ def test_adjoin_unicode(self): adjoined = adj.adjoin(7, *data) assert adjoined == expected - cols = adjoined.split('\n') + cols = adjoined.split("\n") assert adj.len(cols[0]) == 23 assert adj.len(cols[1]) == 23 assert adj.len(cols[2]) == 26 @@ -79,92 +79,95 @@ def just(x, *args, **kwargs): # wrapper to test single str return adj.justify([x], *args, **kwargs)[0] - assert just('abc', 5, mode='left') == 'abc ' - assert just('abc', 5, mode='center') == ' abc ' - assert just('abc', 5, mode='right') == ' abc' - assert just('abc', 5, mode='left') == 'abc ' - assert just('abc', 5, mode='center') == ' abc ' - assert just('abc', 5, mode='right') == ' abc' + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" + assert just("abc", 5, mode="left") == "abc " + assert just("abc", 5, mode="center") == " abc " + assert just("abc", 5, mode="right") == " abc" - assert just('パンダ', 5, mode='left') == 'パンダ' - assert just('パンダ', 5, mode='center') == 'パンダ' - assert just('パンダ', 5, mode='right') == 'パンダ' + assert just("パンダ", 5, mode="left") == "パンダ" + assert just("パンダ", 5, mode="center") == "パンダ" + assert just("パンダ", 5, mode="right") == "パンダ" - assert just('パンダ', 10, mode='left') == 'パンダ ' - assert just('パンダ', 10, mode='center') == ' パンダ ' - assert just('パンダ', 10, mode='right') == ' パンダ' + assert just("パンダ", 10, mode="left") == "パンダ " + assert just("パンダ", 10, mode="center") == " パンダ " + assert just("パンダ", 10, mode="right") == " パンダ" def test_east_asian_len(self): adj = fmt.EastAsianTextAdjustment() - assert adj.len('abc') == 3 - assert adj.len('abc') == 3 + assert adj.len("abc") == 3 + assert adj.len("abc") == 3 - assert adj.len('パンダ') == 6 - assert adj.len('パンダ') == 5 - assert adj.len('パンダpanda') == 11 - assert adj.len('パンダpanda') == 10 + assert adj.len("パンダ") == 6 + assert adj.len("パンダ") == 5 + assert adj.len("パンダpanda") == 11 + assert adj.len("パンダpanda") == 10 def test_ambiguous_width(self): adj = fmt.EastAsianTextAdjustment() - assert adj.len('¡¡ab') == 4 + assert adj.len("¡¡ab") == 4 - with cf.option_context('display.unicode.ambiguous_as_wide', True): + with cf.option_context("display.unicode.ambiguous_as_wide", True): adj = fmt.EastAsianTextAdjustment() - assert adj.len('¡¡ab') == 6 + assert adj.len("¡¡ab") == 6 - data = [['あ', 'b', 'c'], ['dd', 'ええ', 'ff'], - ['ggg', '¡¡ab', 'いいい']] - expected = 'あ dd ggg \nb ええ ¡¡ab\nc ff いいい' + data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]] + expected = "あ dd ggg \nb ええ ¡¡ab\nc ff いいい" adjoined = adj.adjoin(2, *data) assert adjoined == expected class TestTableSchemaRepr: - @classmethod def setup_class(cls): - pytest.importorskip('IPython') + pytest.importorskip("IPython") from IPython.core.interactiveshell import InteractiveShell + cls.display_formatter = InteractiveShell.instance().display_formatter def test_publishes(self): df = pd.DataFrame({"A": [1, 2]}) - objects = [df['A'], df, df] # dataframe / series + objects = [df["A"], df, df] # dataframe / series expected_keys = [ - {'text/plain', 
'application/vnd.dataresource+json'}, - {'text/plain', 'text/html', 'application/vnd.dataresource+json'}, + {"text/plain", "application/vnd.dataresource+json"}, + {"text/plain", "text/html", "application/vnd.dataresource+json"}, ] - opt = pd.option_context('display.html.table_schema', True) + opt = pd.option_context("display.html.table_schema", True) for obj, expected in zip(objects, expected_keys): with opt: formatted = self.display_formatter.format(obj) assert set(formatted[0].keys()) == expected - with_latex = pd.option_context('display.latex.repr', True) + with_latex = pd.option_context("display.latex.repr", True) with opt, with_latex: formatted = self.display_formatter.format(obj) - expected = {'text/plain', 'text/html', 'text/latex', - 'application/vnd.dataresource+json'} + expected = { + "text/plain", + "text/html", + "text/latex", + "application/vnd.dataresource+json", + } assert set(formatted[0].keys()) == expected def test_publishes_not_implemented(self): # column MultiIndex # GH 15996 - midx = pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']]) + midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) df = pd.DataFrame(np.random.randn(5, len(midx)), columns=midx) - opt = pd.option_context('display.html.table_schema', True) + opt = pd.option_context("display.html.table_schema", True) with opt: formatted = self.display_formatter.format(df) - expected = {'text/plain', 'text/html'} + expected = {"text/plain", "text/html"} assert set(formatted[0].keys()) == expected def test_config_on(self): @@ -184,19 +187,19 @@ def test_config_default_off(self): def test_enable_data_resource_formatter(self): # GH 10491 formatters = self.display_formatter.formatters - mimetype = 'application/vnd.dataresource+json' + mimetype = "application/vnd.dataresource+json" - with pd.option_context('display.html.table_schema', True): - assert 'application/vnd.dataresource+json' in formatters + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters assert formatters[mimetype].enabled # still there, just disabled - assert 'application/vnd.dataresource+json' in formatters + assert "application/vnd.dataresource+json" in formatters assert not formatters[mimetype].enabled # able to re-set - with pd.option_context('display.html.table_schema', True): - assert 'application/vnd.dataresource+json' in formatters + with pd.option_context("display.html.table_schema", True): + assert "application/vnd.dataresource+json" in formatters assert formatters[mimetype].enabled # smoke test that it works self.display_formatter.format(cf) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index dce3bb3b420d4..7bd27b2ad9be3 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -11,30 +11,29 @@ from pandas import DataFrame import pandas.util.testing as tm -jinja2 = pytest.importorskip('jinja2') +jinja2 = pytest.importorskip("jinja2") from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip class TestStyler: - def setup_method(self, method): np.random.seed(24) - self.s = DataFrame({'A': np.random.permutation(range(6))}) - self.df = DataFrame({'A': [0, 1], 'B': np.random.randn(2)}) + self.s = DataFrame({"A": np.random.permutation(range(6))}) + self.df = DataFrame({"A": [0, 1], "B": np.random.randn(2)}) self.f = lambda x: x self.g = lambda x: x - def h(x, foo='bar'): - return pd.Series( - 'color: {foo}'.format(foo=foo), index=x.index, name=x.name) + def 
h(x, foo="bar"): + return pd.Series("color: {foo}".format(foo=foo), index=x.index, name=x.name) self.h = h self.styler = Styler(self.df) - self.attrs = pd.DataFrame({'A': ['color: red', 'color: blue']}) + self.attrs = pd.DataFrame({"A": ["color: red", "color: blue"]}) self.dataframes = [ self.df, - pd.DataFrame({'f': [1., 2.], 'o': ['a', 'b'], - 'c': pd.Categorical(['a', 'b'])}) + pd.DataFrame( + {"f": [1.0, 2.0], "o": ["a", "b"], "c": pd.Categorical(["a", "b"])} + ), ] def test_init_non_pandas(self): @@ -50,31 +49,32 @@ def test_repr_html_ok(self): def test_repr_html_mathjax(self): # gh-19824 - assert 'tex2jax_ignore' not in self.styler._repr_html_() + assert "tex2jax_ignore" not in self.styler._repr_html_() - with pd.option_context('display.html.use_mathjax', False): - assert 'tex2jax_ignore' in self.styler._repr_html_() + with pd.option_context("display.html.use_mathjax", False): + assert "tex2jax_ignore" in self.styler._repr_html_() def test_update_ctx(self): self.styler._update_ctx(self.attrs) - expected = {(0, 0): ['color: red'], - (1, 0): ['color: blue']} + expected = {(0, 0): ["color: red"], (1, 0): ["color: blue"]} assert self.styler.ctx == expected def test_update_ctx_flatten_multi(self): - attrs = DataFrame({"A": ['color: red; foo: bar', - 'color: blue; foo: baz']}) + attrs = DataFrame({"A": ["color: red; foo: bar", "color: blue; foo: baz"]}) self.styler._update_ctx(attrs) - expected = {(0, 0): ['color: red', ' foo: bar'], - (1, 0): ['color: blue', ' foo: baz']} + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } assert self.styler.ctx == expected def test_update_ctx_flatten_multi_traliing_semi(self): - attrs = DataFrame({"A": ['color: red; foo: bar;', - 'color: blue; foo: baz;']}) + attrs = DataFrame({"A": ["color: red; foo: bar;", "color: blue; foo: baz;"]}) self.styler._update_ctx(attrs) - expected = {(0, 0): ['color: red', ' foo: bar'], - (1, 0): ['color: blue', ' foo: baz']} + expected = { + (0, 0): ["color: red", " foo: bar"], + (1, 0): ["color: blue", " foo: baz"], + } assert self.styler.ctx == expected def test_copy(self): @@ -111,7 +111,7 @@ def test_clear(self): def test_render(self): df = pd.DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) - s = Styler(df, uuid='AB').apply(style) + s = Styler(df, uuid="AB").apply(style) s.render() # it worked? @@ -120,23 +120,23 @@ def test_render_empty_dfs(self): es = Styler(empty_df) es.render() # An index but no columns - DataFrame(columns=['a']).style.render() + DataFrame(columns=["a"]).style.render() # A column but no index - DataFrame(index=['a']).style.render() + DataFrame(index=["a"]).style.render() # No IndexError raised? def test_render_double(self): df = pd.DataFrame({"A": [0, 1]}) - style = lambda x: pd.Series(["color: red; border: 1px", - "color: blue; border: 2px"], name=x.name) - s = Styler(df, uuid='AB').apply(style) + style = lambda x: pd.Series( + ["color: red; border: 1px", "color: blue; border: 2px"], name=x.name + ) + s = Styler(df, uuid="AB").apply(style) s.render() # it worked? 
def test_set_properties(self): df = pd.DataFrame({"A": [0, 1]}) - result = df.style.set_properties(color='white', - size='10px')._compute().ctx + result = df.style.set_properties(color="white", size="10px")._compute().ctx # order is deterministic v = ["color: white", "size: 10px"] expected = {(0, 0): v, (1, 0): v} @@ -145,77 +145,130 @@ def test_set_properties(self): assert sorted(v1) == sorted(v2) def test_set_properties_subset(self): - df = pd.DataFrame({'A': [0, 1]}) - result = df.style.set_properties(subset=pd.IndexSlice[0, 'A'], - color='white')._compute().ctx - expected = {(0, 0): ['color: white']} + df = pd.DataFrame({"A": [0, 1]}) + result = ( + df.style.set_properties(subset=pd.IndexSlice[0, "A"], color="white") + ._compute() + .ctx + ) + expected = {(0, 0): ["color: white"]} assert result == expected def test_empty_index_name_doesnt_display(self): # https://github.com/pandas-dev/pandas/pull/12090#issuecomment-180695902 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) result = df.style._translate() - expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', - 'is_visible': True, 'display_value': ''}, - {'class': 'col_heading level0 col0', - 'display_value': 'A', - 'type': 'th', - 'value': 'A', - 'is_visible': True, - }, - {'class': 'col_heading level0 col1', - 'display_value': 'B', - 'type': 'th', - 'value': 'B', - 'is_visible': True, - }, - {'class': 'col_heading level0 col2', - 'display_value': 'C', - 'type': 'th', - 'value': 'C', - 'is_visible': True, - }]] - - assert result['head'] == expected + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "class": "col_heading level0 col0", + "display_value": "A", + "type": "th", + "value": "A", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "display_value": "B", + "type": "th", + "value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col2", + "display_value": "C", + "type": "th", + "value": "C", + "is_visible": True, + }, + ] + ] + + assert result["head"] == expected def test_index_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) - result = df.set_index('A').style._translate() - - expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'B', 'display_value': 'B', 'is_visible': True}, - {'class': 'col_heading level0 col1', 'type': 'th', - 'value': 'C', 'display_value': 'C', 'is_visible': True}], - [{'class': 'index_name level0', 'type': 'th', - 'value': 'A'}, - {'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'blank', 'type': 'th', 'value': ''}]] - - assert result['head'] == expected + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index("A").style._translate() + + expected = [ + [ + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "B", + "display_value": "B", + "is_visible": True, + }, + { + "class": "col_heading level0 col1", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "blank", "type": "th", "value": ""}, + {"class": "blank", "type": "th", "value": ""}, 
+ ], + ] + + assert result["head"] == expected def test_multiindex_name(self): # https://github.com/pandas-dev/pandas/issues/11655 - df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) - result = df.set_index(['A', 'B']).style._translate() - - expected = [[ - {'class': 'blank', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'blank level0', 'type': 'th', 'value': '', - 'display_value': '', 'is_visible': True}, - {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'C', 'display_value': 'C', 'is_visible': True}], - [{'class': 'index_name level0', 'type': 'th', - 'value': 'A'}, - {'class': 'index_name level1', 'type': 'th', - 'value': 'B'}, - {'class': 'blank', 'type': 'th', 'value': ''}]] - - assert result['head'] == expected + df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + result = df.set_index(["A", "B"]).style._translate() + + expected = [ + [ + { + "class": "blank", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "blank level0", + "type": "th", + "value": "", + "display_value": "", + "is_visible": True, + }, + { + "class": "col_heading level0 col0", + "type": "th", + "value": "C", + "display_value": "C", + "is_visible": True, + }, + ], + [ + {"class": "index_name level0", "type": "th", "value": "A"}, + {"class": "index_name level1", "type": "th", "value": "B"}, + {"class": "blank", "type": "th", "value": ""}, + ], + ] + + assert result["head"] == expected def test_numeric_columns(self): # https://github.com/pandas-dev/pandas/issues/12125 @@ -224,19 +277,27 @@ def test_numeric_columns(self): df.style._translate() def test_apply_axis(self): - df = pd.DataFrame({'A': [0, 0], 'B': [1, 1]}) - f = lambda x: ['val: {max}'.format(max=x.max()) for v in x] + df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) + f = lambda x: ["val: {max}".format(max=x.max()) for v in x] result = df.style.apply(f, axis=1) assert len(result._todo) == 1 assert len(result.ctx) == 0 result._compute() - expected = {(0, 0): ['val: 1'], (0, 1): ['val: 1'], - (1, 0): ['val: 1'], (1, 1): ['val: 1']} + expected = { + (0, 0): ["val: 1"], + (0, 1): ["val: 1"], + (1, 0): ["val: 1"], + (1, 1): ["val: 1"], + } assert result.ctx == expected result = df.style.apply(f, axis=0) - expected = {(0, 0): ['val: 0'], (0, 1): ['val: 1'], - (1, 0): ['val: 0'], (1, 1): ['val: 1']} + expected = { + (0, 0): ["val: 0"], + (0, 1): ["val: 1"], + (1, 0): ["val: 0"], + (1, 1): ["val: 1"], + } result._compute() assert result.ctx == expected result = df.style.apply(f) # default @@ -245,35 +306,50 @@ def test_apply_axis(self): def test_apply_subset(self): axes = [0, 1] - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for ax in axes: for slice_ in slices: - result = self.df.style.apply(self.h, axis=ax, subset=slice_, - foo='baz')._compute().ctx - expected = {(r, c): ['color: baz'] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + result = ( + self.df.style.apply(self.h, axis=ax, subset=slice_, foo="baz") + ._compute() + .ctx + ) + expected = { + (r, c): ["color: baz"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and 
col in self.df.loc[slice_].columns + } assert result == expected def test_applymap_subset(self): def f(x): - return 'foo: bar' - - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + return "foo: bar" + + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: result = self.df.style.applymap(f, subset=slice_)._compute().ctx - expected = {(r, c): ['foo: bar'] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + expected = { + (r, c): ["foo: bar"] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } assert result == expected def test_applymap_subset_multiindex(self): @@ -285,34 +361,34 @@ def color_negative_red(val): the css property `'color: red'` for negative strings, black otherwise. """ - color = 'red' if val < 0 else 'black' - return 'color: %s' % color + color = "red" if val < 0 else "black" + return "color: %s" % color dic = { - ('a', 'd'): [-1.12, 2.11], - ('a', 'c'): [2.78, -2.88], - ('b', 'c'): [-3.99, 3.77], - ('b', 'd'): [4.21, -1.22], + ("a", "d"): [-1.12, 2.11], + ("a", "c"): [2.78, -2.88], + ("b", "c"): [-3.99, 3.77], + ("b", "d"): [4.21, -1.22], } idx = pd.IndexSlice df = pd.DataFrame(dic, index=[0, 1]) - (df.style - .applymap(color_negative_red, subset=idx[:, idx['b', 'd']]) - .render()) + (df.style.applymap(color_negative_red, subset=idx[:, idx["b", "d"]]).render()) def test_where_with_one_style(self): # GH 17474 def f(x): return x > 0.5 - style1 = 'foo: bar' + style1 = "foo: bar" result = self.df.style.where(f, style1)._compute().ctx - expected = {(r, c): [style1 if f(self.df.loc[row, col]) else ''] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns)} + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else ""] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + } assert result == expected def test_where_subset(self): @@ -320,22 +396,28 @@ def test_where_subset(self): def f(x): return x > 0.5 - style1 = 'foo: bar' - style2 = 'baz: foo' + style1 = "foo: bar" + style2 = "baz: foo" - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: - result = self.df.style.where(f, style1, style2, - subset=slice_)._compute().ctx - expected = {(r, c): - [style1 if f(self.df.loc[row, col]) else style2] - for r, row in enumerate(self.df.index) - for c, col in enumerate(self.df.columns) - if row in self.df.loc[slice_].index and - col in self.df.loc[slice_].columns} + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) + expected = { + (r, c): [style1 if f(self.df.loc[row, col]) else style2] + for r, row in enumerate(self.df.index) + for c, col in enumerate(self.df.columns) + if row in self.df.loc[slice_].index + and col in self.df.loc[slice_].columns + } assert result == expected def test_where_subset_compare_with_applymap(self): @@ -343,395 +425,579 @@ def test_where_subset_compare_with_applymap(self): def 
f(x): return x > 0.5 - style1 = 'foo: bar' - style2 = 'baz: foo' + style1 = "foo: bar" + style2 = "baz: foo" def g(x): return style1 if f(x) else style2 - slices = [pd.IndexSlice[:], pd.IndexSlice[:, ['A']], - pd.IndexSlice[[1], :], pd.IndexSlice[[1], ['A']], - pd.IndexSlice[:2, ['A', 'B']]] + slices = [ + pd.IndexSlice[:], + pd.IndexSlice[:, ["A"]], + pd.IndexSlice[[1], :], + pd.IndexSlice[[1], ["A"]], + pd.IndexSlice[:2, ["A", "B"]], + ] for slice_ in slices: - result = self.df.style.where(f, style1, style2, - subset=slice_)._compute().ctx + result = ( + self.df.style.where(f, style1, style2, subset=slice_)._compute().ctx + ) expected = self.df.style.applymap(g, subset=slice_)._compute().ctx assert result == expected def test_empty(self): - df = pd.DataFrame({'A': [1, 0]}) + df = pd.DataFrame({"A": [1, 0]}) s = df.style - s.ctx = {(0, 0): ['color: red'], - (1, 0): ['']} + s.ctx = {(0, 0): ["color: red"], (1, 0): [""]} - result = s._translate()['cellstyle'] - expected = [{'props': [['color', ' red']], 'selector': 'row0_col0'}, - {'props': [['', '']], 'selector': 'row1_col0'}] + result = s._translate()["cellstyle"] + expected = [ + {"props": [["color", " red"]], "selector": "row0_col0"}, + {"props": [["", ""]], "selector": "row1_col0"}, + ] assert result == expected def test_bar_align_left(self): - df = pd.DataFrame({'A': [0, 1, 2]}) + df = pd.DataFrame({"A": [0, 1, 2]}) result = df.style.bar()._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,#d65f5f 50.0%, transparent 50.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 50.0%, transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" + "90deg,#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected - result = df.style.bar(color='red', width=50)._compute().ctx + result = df.style.bar(color="red", width=50)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,red 25.0%, transparent 25.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(' - '90deg,red 50.0%, transparent 50.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" "90deg,red 25.0%, transparent 25.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(" "90deg,red 50.0%, transparent 50.0%)", + ], } assert result == expected - df['C'] = ['a'] * len(df) - result = df.style.bar(color='red', width=50)._compute().ctx + df["C"] = ["a"] * len(df) + result = df.style.bar(color="red", width=50)._compute().ctx assert result == expected - df['C'] = df['C'].astype('category') - result = df.style.bar(color='red', width=50)._compute().ctx + df["C"] = df["C"].astype("category") + result = df.style.bar(color="red", width=50)._compute().ctx assert result == expected def test_bar_align_left_0points(self): df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) result = df.style.bar()._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%'], - (0, 1): ['width: 10em', ' height: 80%'], - (0, 2): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 
'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (1, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)']} + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): ["width: 10em", " height: 80%"], + (0, 2): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } assert result == expected result = df.style.bar(axis=1)._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%,' - ' transparent 50.0%)'], - (0, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (1, 0): ['width: 10em', ' height: 80%'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%' - ', transparent 50.0%)'], - (1, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)'], - (2, 0): ['width: 10em', ' height: 80%'], - (2, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 50.0%' - ', transparent 50.0%)'], - (2, 2): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,#d65f5f 100.0%' - ', transparent 100.0%)']} + expected = { + (0, 0): ["width: 10em", " height: 80%"], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%," + " transparent 50.0%)", + ], + (0, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (1, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + (2, 0): ["width: 10em", " height: 80%"], + (2, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 50.0%" + ", transparent 50.0%)", + ], + (2, 2): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg,#d65f5f 100.0%" + ", transparent 100.0%)", + ], + } assert result == expected def 
test_bar_align_mid_pos_and_neg(self): - df = pd.DataFrame({'A': [-10, 0, 20, 90]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 10.0%, transparent 10.0%)'], - (1, 0): ['width: 10em', ' height: 80%', ], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, #5fba7d 10.0%' - ', #5fba7d 30.0%, transparent 30.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, ' - '#5fba7d 10.0%, #5fba7d 100.0%, ' - 'transparent 100.0%)']} + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 10.0%, transparent 10.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #5fba7d 10.0%" + ", #5fba7d 30.0%, transparent 30.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, " + "#5fba7d 10.0%, #5fba7d 100.0%, " + "transparent 100.0%)", + ], + } assert result == expected def test_bar_align_mid_all_pos(self): - df = pd.DataFrame({'A': [10, 20, 50, 100]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 10.0%, transparent 10.0%)'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 20.0%, transparent 20.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 50.0%, transparent 50.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#5fba7d 100.0%, transparent 100.0%)']} + df = pd.DataFrame({"A": [10, 20, 50, 100]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 10.0%, transparent 10.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 20.0%, transparent 20.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 50.0%, transparent 50.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#5fba7d 100.0%, transparent 100.0%)", + ], + } assert result == expected def test_bar_align_mid_all_neg(self): - df = pd.DataFrame({'A': [-100, -60, -30, -20]}) - - result = df.style.bar(align='mid', color=[ - '#d65f5f', '#5fba7d'])._compute().ctx - - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 100.0%, transparent 100.0%)'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, ' - '#d65f5f 40.0%, #d65f5f 100.0%, ' - 'transparent 100.0%)'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 70.0%, ' - '#d65f5f 70.0%, #d65f5f 100.0%, ' - 'transparent 100.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 80.0%, ' - '#d65f5f 80.0%, #d65f5f 100.0%, ' - 'transparent 
100.0%)']} + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) + + result = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"])._compute().ctx + + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, " + "#d65f5f 40.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 70.0%, " + "#d65f5f 70.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 80.0%, " + "#d65f5f 80.0%, #d65f5f 100.0%, " + "transparent 100.0%)", + ], + } assert result == expected def test_bar_align_zero_pos_and_neg(self): # See https://github.com/pandas-dev/pandas/pull/14757 - df = pd.DataFrame({'A': [-10, 0, 20, 90]}) - - result = df.style.bar(align='zero', color=[ - '#d65f5f', '#5fba7d'], width=90)._compute().ctx - expected = {(0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, #d65f5f 40.0%, ' - '#d65f5f 45.0%, transparent 45.0%)'], - (1, 0): ['width: 10em', ' height: 80%'], - (2, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 45.0%, #5fba7d 45.0%, ' - '#5fba7d 55.0%, transparent 55.0%)'], - (3, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 45.0%, #5fba7d 45.0%, ' - '#5fba7d 90.0%, transparent 90.0%)']} + df = pd.DataFrame({"A": [-10, 0, 20, 90]}) + + result = ( + df.style.bar(align="zero", color=["#d65f5f", "#5fba7d"], width=90) + ._compute() + .ctx + ) + expected = { + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 45.0%, transparent 45.0%)", + ], + (1, 0): ["width: 10em", " height: 80%"], + (2, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 55.0%, transparent 55.0%)", + ], + (3, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 45.0%, #5fba7d 45.0%, " + "#5fba7d 90.0%, transparent 90.0%)", + ], + } assert result == expected def test_bar_align_left_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [2, 4]}) + df = pd.DataFrame({"A": [0, 1], "B": [2, 4]}) result = df.style.bar(axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_zero_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = 
df.style.bar(align='zero', axis=None)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 62.5%, transparent 62.5%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 62.5%, transparent 62.5%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_axis_none(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 33.3%, #d65f5f 33.3%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 33.3%, transparent 33.3%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 33.3%, #d65f5f 33.3%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 33.3%, transparent 33.3%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 33.3%, #d65f5f 33.3%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_vmin(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, vmin=-6)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-6)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 60.0%, #d65f5f 60.0%, ' - '#d65f5f 70.0%, transparent 70.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 40.0%, #d65f5f 40.0%, ' - '#d65f5f 60.0%, transparent 60.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 60.0%, #d65f5f 60.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + 
"transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 40.0%, #d65f5f 40.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 60.0%, #d65f5f 60.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_vmax(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, vmax=8)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmax=8)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 20.0%, #d65f5f 20.0%, ' - '#d65f5f 30.0%, transparent 30.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 20.0%, transparent 20.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 20.0%, #d65f5f 20.0%, ' - '#d65f5f 60.0%, transparent 60.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 20.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 20.0%, transparent 20.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 20.0%, #d65f5f 20.0%, " + "#d65f5f 60.0%, transparent 60.0%)", + ], } assert result == expected def test_bar_align_mid_vmin_vmax_wide(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, - vmin=-3, vmax=7)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-3, vmax=7)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], - (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 30.0%, #d65f5f 30.0%, ' - '#d65f5f 40.0%, transparent 40.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 10.0%, #d65f5f 10.0%, ' - '#d65f5f 30.0%, transparent 30.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 30.0%, #d65f5f 30.0%, ' - '#d65f5f 70.0%, transparent 70.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 40.0%, transparent 40.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 10.0%, #d65f5f 10.0%, " + "#d65f5f 30.0%, transparent 30.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 30.0%, #d65f5f 30.0%, " + "#d65f5f 70.0%, transparent 70.0%)", + ], } assert result == expected def test_bar_align_mid_vmin_vmax_clipping(self): - df = pd.DataFrame({'A': [0, 1], 'B': [-2, 4]}) - result = df.style.bar(align='mid', axis=None, - vmin=-1, vmax=3)._compute().ctx + df = pd.DataFrame({"A": [0, 1], "B": [-2, 4]}) + result = df.style.bar(align="mid", axis=None, vmin=-1, vmax=3)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%'], 
- (1, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): ["width: 10em", " height: 80%"], + (1, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_mid_nans(self): - df = pd.DataFrame({'A': [1, None], 'B': [-1, 3]}) - result = df.style.bar(align='mid', axis=None)._compute().ctx + df = pd.DataFrame({"A": [1, None], "B": [-1, 3]}) + result = df.style.bar(align="mid", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 0): [''], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg,' - '#d65f5f 25.0%, transparent 25.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg," + "#d65f5f 25.0%, transparent 25.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } assert result == expected def test_bar_align_zero_nans(self): - df = pd.DataFrame({'A': [1, None], 'B': [-1, 2]}) - result = df.style.bar(align='zero', axis=None)._compute().ctx + df = pd.DataFrame({"A": [1, None], "B": [-1, 2]}) + result = df.style.bar(align="zero", axis=None)._compute().ctx expected = { - (0, 0): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 75.0%, transparent 75.0%)'], - (1, 0): [''], - (0, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 25.0%, #d65f5f 25.0%, ' - '#d65f5f 50.0%, transparent 50.0%)'], - (1, 1): ['width: 10em', ' height: 80%', - 'background: linear-gradient(90deg, ' - 'transparent 50.0%, #d65f5f 50.0%, ' - '#d65f5f 100.0%, transparent 100.0%)'] + (0, 0): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 75.0%, transparent 75.0%)", + ], + (1, 0): [""], + (0, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 25.0%, #d65f5f 25.0%, " + "#d65f5f 50.0%, transparent 50.0%)", + ], + (1, 1): [ + "width: 10em", + " height: 80%", + "background: linear-gradient(90deg, " + "transparent 50.0%, #d65f5f 50.0%, " + "#d65f5f 100.0%, transparent 100.0%)", + ], } 
assert result == expected def test_bar_bad_align_raises(self): - df = pd.DataFrame({'A': [-100, -60, -30, -20]}) + df = pd.DataFrame({"A": [-100, -60, -30, -20]}) with pytest.raises(ValueError): - df.style.bar(align='poorly', color=['#d65f5f', '#5fba7d']) + df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"]) - def test_highlight_null(self, null_color='red'): - df = pd.DataFrame({'A': [0, np.nan]}) + def test_highlight_null(self, null_color="red"): + df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx - expected = {(0, 0): [''], - (1, 0): ['background-color: red']} + expected = {(0, 0): [""], (1, 0): ["background-color: red"]} assert result == expected def test_nonunique_raises(self): - df = pd.DataFrame([[1, 2]], columns=['A', 'A']) + df = pd.DataFrame([[1, 2]], columns=["A", "A"]) with pytest.raises(ValueError): df.style @@ -739,38 +1005,38 @@ def test_nonunique_raises(self): Styler(df) def test_caption(self): - styler = Styler(self.df, caption='foo') + styler = Styler(self.df, caption="foo") result = styler.render() - assert all(['caption' in result, 'foo' in result]) + assert all(["caption" in result, "foo" in result]) styler = self.df.style - result = styler.set_caption('baz') + result = styler.set_caption("baz") assert styler is result - assert styler.caption == 'baz' + assert styler.caption == "baz" def test_uuid(self): - styler = Styler(self.df, uuid='abc123') + styler = Styler(self.df, uuid="abc123") result = styler.render() - assert 'abc123' in result + assert "abc123" in result styler = self.df.style - result = styler.set_uuid('aaa') + result = styler.set_uuid("aaa") assert result is styler - assert result.uuid == 'aaa' + assert result.uuid == "aaa" def test_unique_id(self): # See https://github.com/pandas-dev/pandas/issues/16780 - df = pd.DataFrame({'a': [1, 3, 5, 6], 'b': [2, 4, 12, 21]}) - result = df.style.render(uuid='test') - assert 'test' in result + df = pd.DataFrame({"a": [1, 3, 5, 6], "b": [2, 4, 12, 21]}) + result = df.style.render(uuid="test") + assert "test" in result ids = re.findall('id="(.*?)"', result) assert np.unique(ids).size == len(ids) def test_table_styles(self): - style = [{'selector': 'th', 'props': [('foo', 'bar')]}] + style = [{"selector": "th", "props": [("foo", "bar")]}] styler = Styler(self.df, table_styles=style) - result = ' '.join(styler.render().split()) - assert 'th { foo: bar; }' in result + result = " ".join(styler.render().split()) + assert "th { foo: bar; }" in result styler = self.df.style result = styler.set_table_styles(style) @@ -787,7 +1053,7 @@ def test_table_attributes(self): assert 'class="foo" data-bar' in result def test_precision(self): - with pd.option_context('display.precision', 10): + with pd.option_context("display.precision", 10): s = Styler(self.df) assert s.precision == 10 s = Styler(self.df, precision=2) @@ -799,65 +1065,75 @@ def test_precision(self): def test_apply_none(self): def f(x): - return pd.DataFrame(np.where(x == x.max(), 'color: red', ''), - index=x.index, columns=x.columns) - result = (pd.DataFrame([[1, 2], [3, 4]]) - .style.apply(f, axis=None)._compute().ctx) - assert result[(1, 1)] == ['color: red'] + return pd.DataFrame( + np.where(x == x.max(), "color: red", ""), + index=x.index, + columns=x.columns, + ) + + result = pd.DataFrame([[1, 2], [3, 4]]).style.apply(f, axis=None)._compute().ctx + assert result[(1, 1)] == ["color: red"] def test_trim(self): result = self.df.style.render() # trim=True - assert result.count('#') == 0 + assert result.count("#") == 0 result 
= self.df.style.highlight_max().render() - assert result.count('#') == len(self.df.columns) + assert result.count("#") == len(self.df.columns) def test_highlight_max(self): - df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) # max(df) = min(-df) for max_ in [True, False]: if max_: - attr = 'highlight_max' + attr = "highlight_max" else: df = -df - attr = 'highlight_min' + attr = "highlight_min" result = getattr(df.style, attr)()._compute().ctx - assert result[(1, 1)] == ['background-color: yellow'] + assert result[(1, 1)] == ["background-color: yellow"] - result = getattr(df.style, attr)(color='green')._compute().ctx - assert result[(1, 1)] == ['background-color: green'] + result = getattr(df.style, attr)(color="green")._compute().ctx + assert result[(1, 1)] == ["background-color: green"] - result = getattr(df.style, attr)(subset='A')._compute().ctx - assert result[(1, 0)] == ['background-color: yellow'] + result = getattr(df.style, attr)(subset="A")._compute().ctx + assert result[(1, 0)] == ["background-color: yellow"] result = getattr(df.style, attr)(axis=0)._compute().ctx - expected = {(1, 0): ['background-color: yellow'], - (1, 1): ['background-color: yellow'], - (0, 1): [''], (0, 0): ['']} + expected = { + (1, 0): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 1): [""], + (0, 0): [""], + } assert result == expected result = getattr(df.style, attr)(axis=1)._compute().ctx - expected = {(0, 1): ['background-color: yellow'], - (1, 1): ['background-color: yellow'], - (0, 0): [''], (1, 0): ['']} + expected = { + (0, 1): ["background-color: yellow"], + (1, 1): ["background-color: yellow"], + (0, 0): [""], + (1, 0): [""], + } assert result == expected # separate since we can't negate the strs - df['C'] = ['a', 'b'] + df["C"] = ["a", "b"] result = df.style.highlight_max()._compute().ctx - expected = {(1, 1): ['background-color: yellow']} + expected = {(1, 1): ["background-color: yellow"]} result = df.style.highlight_min()._compute().ctx - expected = {(0, 0): ['background-color: yellow']} + expected = {(0, 0): ["background-color: yellow"]} def test_export(self): - f = lambda x: 'color: red' if x > 0 else 'color: blue' - g = lambda x, y, z: 'color: {z}'.format(z=z) \ - if x > 0 else 'color: {z}'.format(z=z) + f = lambda x: "color: red" if x > 0 else "color: blue" + g = ( + lambda x, y, z: "color: {z}".format(z=z) + if x > 0 + else "color: {z}".format(z=z) + ) style1 = self.styler - style1.applymap(f)\ - .applymap(g, y='a', z='b')\ - .highlight_max() + style1.applymap(f).applymap(g, y="a", z="b").highlight_max() result = style1.export() style2 = self.df.style style2.use(result) @@ -868,11 +1144,11 @@ def test_display_format(self): df = pd.DataFrame(np.random.random(size=(2, 2))) ctx = df.style.format("{:0.1f}")._translate() - assert all(['display_value' in c for c in row] - for row in ctx['body']) - assert all([len(c['display_value']) <= 3 for c in row[1:]] - for row in ctx['body']) - assert len(ctx['body'][0][1]['display_value'].lstrip('-')) <= 3 + assert all(["display_value" in c for c in row] for row in ctx["body"]) + assert all( + [len(c["display_value"]) <= 3 for c in row[1:]] for row in ctx["body"] + ) + assert len(ctx["body"][0][1]["display_value"].lstrip("-")) <= 3 def test_display_format_raises(self): df = pd.DataFrame(np.random.randn(2, 2)) @@ -882,165 +1158,202 @@ def test_display_format_raises(self): df.style.format(True) def test_display_subset(self): - df = pd.DataFrame([[.1234, .1234], 
[1.1234, 1.1234]], - columns=['a', 'b']) - ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"}, - subset=pd.IndexSlice[0, :])._translate() - expected = '0.1' - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == '1.1234' - assert ctx['body'][0][2]['display_value'] == '12.34%' - - raw_11 = '1.1234' - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, :])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, :])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice['a'])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][0][2]['display_value'] == '0.1234' - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[0, 'a'])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == raw_11 - - ctx = df.style.format("{:0.1f}", - subset=pd.IndexSlice[[0, 1], ['a']])._translate() - assert ctx['body'][0][1]['display_value'] == expected - assert ctx['body'][1][1]['display_value'] == '1.1' - assert ctx['body'][0][2]['display_value'] == '0.1234' - assert ctx['body'][1][2]['display_value'] == '1.1234' + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) + ctx = df.style.format( + {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=pd.IndexSlice[0, :] + )._translate() + expected = "0.1" + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1234" + assert ctx["body"][0][2]["display_value"] == "12.34%" + + raw_11 = "1.1234" + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice["a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][0][2]["display_value"] == "0.1234" + + ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, "a"])._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == raw_11 + + ctx = df.style.format( + "{:0.1f}", subset=pd.IndexSlice[[0, 1], ["a"]] + )._translate() + assert ctx["body"][0][1]["display_value"] == expected + assert ctx["body"][1][1]["display_value"] == "1.1" + assert ctx["body"][0][2]["display_value"] == "0.1234" + assert ctx["body"][1][2]["display_value"] == "1.1234" def test_display_dict(self): - df = pd.DataFrame([[.1234, .1234], [1.1234, 1.1234]], - columns=['a', 'b']) + df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) ctx = df.style.format({"a": "{:0.1f}", "b": "{0:.2%}"})._translate() - assert ctx['body'][0][1]['display_value'] == '0.1' - assert ctx['body'][0][2]['display_value'] == '12.34%' - df['c'] = ['aaa', 'bbb'] + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][2]["display_value"] == "12.34%" + df["c"] = ["aaa", "bbb"] ctx = df.style.format({"a": "{:0.1f}", "c": str.upper})._translate() - assert ctx['body'][0][1]['display_value'] == '0.1' - assert 
ctx['body'][0][3]['display_value'] == 'AAA' + assert ctx["body"][0][1]["display_value"] == "0.1" + assert ctx["body"][0][3]["display_value"] == "AAA" def test_bad_apply_shape(self): df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(ValueError): - df.style._apply(lambda x: 'x', subset=pd.IndexSlice[[0, 1], :]) + df.style._apply(lambda x: "x", subset=pd.IndexSlice[[0, 1], :]) with pytest.raises(ValueError): - df.style._apply(lambda x: [''], subset=pd.IndexSlice[[0, 1], :]) + df.style._apply(lambda x: [""], subset=pd.IndexSlice[[0, 1], :]) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', '', '']) + df.style._apply(lambda x: ["", "", "", ""]) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', ''], subset=1) + df.style._apply(lambda x: ["", "", ""], subset=1) with pytest.raises(ValueError): - df.style._apply(lambda x: ['', '', ''], axis=1) + df.style._apply(lambda x: ["", "", ""], axis=1) def test_apply_bad_return(self): def f(x): - return '' + return "" + df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(TypeError): df.style._apply(f, axis=None) def test_apply_bad_labels(self): def f(x): - return pd.DataFrame(index=[1, 2], columns=['a', 'b']) + return pd.DataFrame(index=[1, 2], columns=["a", "b"]) + df = pd.DataFrame([[1, 2], [3, 4]]) with pytest.raises(ValueError): df.style._apply(f, axis=None) def test_get_level_lengths(self): - index = pd.MultiIndex.from_product([['a', 'b'], [0, 1, 2]]) - expected = {(0, 0): 3, (0, 3): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, - (1, 3): 1, (1, 4): 1, (1, 5): 1} + index = pd.MultiIndex.from_product([["a", "b"], [0, 1, 2]]) + expected = { + (0, 0): 3, + (0, 3): 3, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + (1, 4): 1, + (1, 5): 1, + } result = _get_level_lengths(index) tm.assert_dict_equal(result, expected) def test_get_level_lengths_un_sorted(self): - index = pd.MultiIndex.from_arrays([ - [1, 1, 2, 1], - ['a', 'b', 'b', 'd'] - ]) - expected = {(0, 0): 2, (0, 2): 1, (0, 3): 1, - (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1} + index = pd.MultiIndex.from_arrays([[1, 1, 2, 1], ["a", "b", "b", "d"]]) + expected = { + (0, 0): 2, + (0, 2): 1, + (0, 3): 1, + (1, 0): 1, + (1, 1): 1, + (1, 2): 1, + (1, 3): 1, + } result = _get_level_lengths(index) tm.assert_dict_equal(result, expected) def test_mi_sparse(self): - df = pd.DataFrame({'A': [1, 2]}, - index=pd.MultiIndex.from_arrays([['a', 'a'], - [0, 1]])) + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) result = df.style._translate() - body_0 = result['body'][0][0] + body_0 = result["body"][0][0] expected_0 = { - "value": "a", "display_value": "a", "is_visible": True, - "type": "th", "attributes": ["rowspan=2"], - "class": "row_heading level0 row0", "id": "level0_row0" + "value": "a", + "display_value": "a", + "is_visible": True, + "type": "th", + "attributes": ["rowspan=2"], + "class": "row_heading level0 row0", + "id": "level0_row0", } tm.assert_dict_equal(body_0, expected_0) - body_1 = result['body'][0][1] + body_1 = result["body"][0][1] expected_1 = { - "value": 0, "display_value": 0, "is_visible": True, - "type": "th", "class": "row_heading level1 row0", - "id": "level1_row0" + "value": 0, + "display_value": 0, + "is_visible": True, + "type": "th", + "class": "row_heading level1 row0", + "id": "level1_row0", } tm.assert_dict_equal(body_1, expected_1) - body_10 = result['body'][1][0] + body_10 = result["body"][1][0] expected_10 = { - "value": 'a', "display_value": 'a', "is_visible": False, - "type": "th", "class": 
"row_heading level0 row1", - "id": "level0_row1" + "value": "a", + "display_value": "a", + "is_visible": False, + "type": "th", + "class": "row_heading level0 row1", + "id": "level0_row1", } tm.assert_dict_equal(body_10, expected_10) - head = result['head'][0] + head = result["head"][0] expected = [ - {'type': 'th', 'class': 'blank', 'value': '', - 'is_visible': True, "display_value": ''}, - {'type': 'th', 'class': 'blank level0', 'value': '', - 'is_visible': True, 'display_value': ''}, - {'type': 'th', 'class': 'col_heading level0 col0', 'value': 'A', - 'is_visible': True, 'display_value': 'A'}] + { + "type": "th", + "class": "blank", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "blank level0", + "value": "", + "is_visible": True, + "display_value": "", + }, + { + "type": "th", + "class": "col_heading level0 col0", + "value": "A", + "is_visible": True, + "display_value": "A", + }, + ] assert head == expected def test_mi_sparse_disabled(self): - with pd.option_context('display.multi_sparse', False): - df = pd.DataFrame({'A': [1, 2]}, - index=pd.MultiIndex.from_arrays([['a', 'a'], - [0, 1]])) + with pd.option_context("display.multi_sparse", False): + df = pd.DataFrame( + {"A": [1, 2]}, index=pd.MultiIndex.from_arrays([["a", "a"], [0, 1]]) + ) result = df.style._translate() - body = result['body'] + body = result["body"] for row in body: - assert 'attributes' not in row[0] + assert "attributes" not in row[0] def test_mi_sparse_index_names(self): - df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( - [['a', 'a'], [0, 1]], - names=['idx_level_0', 'idx_level_1']) + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), ) result = df.style._translate() - head = result['head'][1] - expected = [{ - 'class': 'index_name level0', 'value': 'idx_level_0', - 'type': 'th'}, - {'class': 'index_name level1', 'value': 'idx_level_1', - 'type': 'th'}, - {'class': 'blank', 'value': '', 'type': 'th'}] + head = result["head"][1] + expected = [ + {"class": "index_name level0", "value": "idx_level_0", "type": "th"}, + {"class": "index_name level1", "value": "idx_level_1", "type": "th"}, + {"class": "blank", "value": "", "type": "th"}, + ] assert head == expected @@ -1048,42 +1361,58 @@ def test_mi_sparse_column_names(self): df = pd.DataFrame( np.arange(16).reshape(4, 4), index=pd.MultiIndex.from_arrays( - [['a', 'a', 'b', 'a'], [0, 1, 1, 2]], - names=['idx_level_0', 'idx_level_1']), + [["a", "a", "b", "a"], [0, 1, 1, 2]], + names=["idx_level_0", "idx_level_1"], + ), columns=pd.MultiIndex.from_arrays( - [['C1', 'C1', 'C2', 'C2'], [1, 0, 1, 0]], - names=['col_0', 'col_1'] - ) + [["C1", "C1", "C2", "C2"], [1, 0, 1, 0]], names=["col_0", "col_1"] + ), ) result = df.style._translate() - head = result['head'][1] + head = result["head"][1] expected = [ - {'class': 'blank', 'value': '', 'display_value': '', - 'type': 'th', 'is_visible': True}, - {'class': 'index_name level1', 'value': 'col_1', - 'display_value': 'col_1', 'is_visible': True, 'type': 'th'}, - {'class': 'col_heading level1 col0', - 'display_value': 1, - 'is_visible': True, - 'type': 'th', - 'value': 1}, - {'class': 'col_heading level1 col1', - 'display_value': 0, - 'is_visible': True, - 'type': 'th', - 'value': 0}, - - {'class': 'col_heading level1 col2', - 'display_value': 1, - 'is_visible': True, - 'type': 'th', - 'value': 1}, - - {'class': 'col_heading level1 col3', - 'display_value': 0, - 'is_visible': True, 
- 'type': 'th', - 'value': 0}, + { + "class": "blank", + "value": "", + "display_value": "", + "type": "th", + "is_visible": True, + }, + { + "class": "index_name level1", + "value": "col_1", + "display_value": "col_1", + "is_visible": True, + "type": "th", + }, + { + "class": "col_heading level1 col0", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col1", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, + { + "class": "col_heading level1 col2", + "display_value": 1, + "is_visible": True, + "type": "th", + "value": 1, + }, + { + "class": "col_heading level1 col3", + "display_value": 0, + "is_visible": True, + "type": "th", + "value": 0, + }, ] assert head == expected @@ -1091,195 +1420,206 @@ def test_hide_single_index(self): # GH 14194 # single unnamed index ctx = self.df.style._translate() - assert ctx['body'][0][0]['is_visible'] - assert ctx['head'][0][0]['is_visible'] + assert ctx["body"][0][0]["is_visible"] + assert ctx["head"][0][0]["is_visible"] ctx2 = self.df.style.hide_index()._translate() - assert not ctx2['body'][0][0]['is_visible'] - assert not ctx2['head'][0][0]['is_visible'] + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["head"][0][0]["is_visible"] # single named index - ctx3 = self.df.set_index('A').style._translate() - assert ctx3['body'][0][0]['is_visible'] - assert len(ctx3['head']) == 2 # 2 header levels - assert ctx3['head'][0][0]['is_visible'] + ctx3 = self.df.set_index("A").style._translate() + assert ctx3["body"][0][0]["is_visible"] + assert len(ctx3["head"]) == 2 # 2 header levels + assert ctx3["head"][0][0]["is_visible"] - ctx4 = self.df.set_index('A').style.hide_index()._translate() - assert not ctx4['body'][0][0]['is_visible'] - assert len(ctx4['head']) == 1 # only 1 header levels - assert not ctx4['head'][0][0]['is_visible'] + ctx4 = self.df.set_index("A").style.hide_index()._translate() + assert not ctx4["body"][0][0]["is_visible"] + assert len(ctx4["head"]) == 1 # only 1 header levels + assert not ctx4["head"][0][0]["is_visible"] def test_hide_multiindex(self): # GH 14194 - df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( - [['a', 'a'], [0, 1]], - names=['idx_level_0', 'idx_level_1']) + df = pd.DataFrame( + {"A": [1, 2]}, + index=pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ), ) ctx1 = df.style._translate() # tests for 'a' and '0' - assert ctx1['body'][0][0]['is_visible'] - assert ctx1['body'][0][1]['is_visible'] + assert ctx1["body"][0][0]["is_visible"] + assert ctx1["body"][0][1]["is_visible"] # check for blank header rows - assert ctx1['head'][0][0]['is_visible'] - assert ctx1['head'][0][1]['is_visible'] + assert ctx1["head"][0][0]["is_visible"] + assert ctx1["head"][0][1]["is_visible"] ctx2 = df.style.hide_index()._translate() # tests for 'a' and '0' - assert not ctx2['body'][0][0]['is_visible'] - assert not ctx2['body'][0][1]['is_visible'] + assert not ctx2["body"][0][0]["is_visible"] + assert not ctx2["body"][0][1]["is_visible"] # check for blank header rows - assert not ctx2['head'][0][0]['is_visible'] - assert not ctx2['head'][0][1]['is_visible'] + assert not ctx2["head"][0][0]["is_visible"] + assert not ctx2["head"][0][1]["is_visible"] def test_hide_columns_single_level(self): # GH 14194 # test hiding single column ctx = self.df.style._translate() - assert ctx['head'][0][1]['is_visible'] - assert ctx['head'][0][1]['display_value'] == 'A' - assert ctx['head'][0][2]['is_visible'] 
- assert ctx['head'][0][2]['display_value'] == 'B' - assert ctx['body'][0][1]['is_visible'] # col A, row 1 - assert ctx['body'][1][2]['is_visible'] # col B, row 1 - - ctx = self.df.style.hide_columns('A')._translate() - assert not ctx['head'][0][1]['is_visible'] - assert not ctx['body'][0][1]['is_visible'] # col A, row 1 - assert ctx['body'][1][2]['is_visible'] # col B, row 1 + assert ctx["head"][0][1]["is_visible"] + assert ctx["head"][0][1]["display_value"] == "A" + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][0][2]["display_value"] == "B" + assert ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 + + ctx = self.df.style.hide_columns("A")._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert ctx["body"][1][2]["is_visible"] # col B, row 1 # test hiding mulitiple columns - ctx = self.df.style.hide_columns(['A', 'B'])._translate() - assert not ctx['head'][0][1]['is_visible'] - assert not ctx['head'][0][2]['is_visible'] - assert not ctx['body'][0][1]['is_visible'] # col A, row 1 - assert not ctx['body'][1][2]['is_visible'] # col B, row 1 + ctx = self.df.style.hide_columns(["A", "B"])._translate() + assert not ctx["head"][0][1]["is_visible"] + assert not ctx["head"][0][2]["is_visible"] + assert not ctx["body"][0][1]["is_visible"] # col A, row 1 + assert not ctx["body"][1][2]["is_visible"] # col B, row 1 def test_hide_columns_mult_levels(self): # GH 14194 # setup dataframe with multiple column levels and indices - i1 = pd.MultiIndex.from_arrays([['a', 'a'], [0, 1]], - names=['idx_level_0', - 'idx_level_1']) - i2 = pd.MultiIndex.from_arrays([['b', 'b'], [0, 1]], - names=['col_level_0', - 'col_level_1']) + i1 = pd.MultiIndex.from_arrays( + [["a", "a"], [0, 1]], names=["idx_level_0", "idx_level_1"] + ) + i2 = pd.MultiIndex.from_arrays( + [["b", "b"], [0, 1]], names=["col_level_0", "col_level_1"] + ) df = pd.DataFrame([[1, 2], [3, 4]], index=i1, columns=i2) ctx = df.style._translate() # column headers - assert ctx['head'][0][2]['is_visible'] - assert ctx['head'][1][2]['is_visible'] - assert ctx['head'][1][3]['display_value'] == 1 + assert ctx["head"][0][2]["is_visible"] + assert ctx["head"][1][2]["is_visible"] + assert ctx["head"][1][3]["display_value"] == 1 # indices - assert ctx['body'][0][0]['is_visible'] + assert ctx["body"][0][0]["is_visible"] # data - assert ctx['body'][1][2]['is_visible'] - assert ctx['body'][1][2]['display_value'] == 3 - assert ctx['body'][1][3]['is_visible'] - assert ctx['body'][1][3]['display_value'] == 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 # hide top column level, which hides both columns - ctx = df.style.hide_columns('b')._translate() - assert not ctx['head'][0][2]['is_visible'] # b - assert not ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['body'][1][2]['is_visible'] # 3 - assert ctx['body'][0][0]['is_visible'] # index + ctx = df.style.hide_columns("b")._translate() + assert not ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][0][0]["is_visible"] # index # hide first column only - ctx = df.style.hide_columns([('b', 0)])._translate() - assert ctx['head'][0][2]['is_visible'] # b - assert not ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['body'][1][2]['is_visible'] # 3 - 
assert ctx['body'][1][3]['is_visible'] - assert ctx['body'][1][3]['display_value'] == 4 + ctx = df.style.hide_columns([("b", 0)])._translate() + assert ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["body"][1][2]["is_visible"] # 3 + assert ctx["body"][1][3]["is_visible"] + assert ctx["body"][1][3]["display_value"] == 4 # hide second column and index - ctx = df.style.hide_columns([('b', 1)]).hide_index()._translate() - assert not ctx['body'][0][0]['is_visible'] # index - assert ctx['head'][0][2]['is_visible'] # b - assert ctx['head'][1][2]['is_visible'] # 0 - assert not ctx['head'][1][3]['is_visible'] # 1 - assert not ctx['body'][1][3]['is_visible'] # 4 - assert ctx['body'][1][2]['is_visible'] - assert ctx['body'][1][2]['display_value'] == 3 + ctx = df.style.hide_columns([("b", 1)]).hide_index()._translate() + assert not ctx["body"][0][0]["is_visible"] # index + assert ctx["head"][0][2]["is_visible"] # b + assert ctx["head"][1][2]["is_visible"] # 0 + assert not ctx["head"][1][3]["is_visible"] # 1 + assert not ctx["body"][1][3]["is_visible"] # 4 + assert ctx["body"][1][2]["is_visible"] + assert ctx["body"][1][2]["display_value"] == 3 def test_pipe(self): def set_caption_from_template(styler, a, b): return styler.set_caption( - 'Dataframe with a = {a} and b = {b}'.format(a=a, b=b)) + "Dataframe with a = {a} and b = {b}".format(a=a, b=b) + ) - styler = self.df.style.pipe(set_caption_from_template, 'A', b='B') - assert 'Dataframe with a = A and b = B' in styler.render() + styler = self.df.style.pipe(set_caption_from_template, "A", b="B") + assert "Dataframe with a = A and b = B" in styler.render() # Test with an argument that is a (callable, keyword_name) pair. def f(a, b, styler): return (a, b, styler) styler = self.df.style - result = styler.pipe((f, 'styler'), a=1, b=2) + result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) @td.skip_if_no_mpl class TestStylerMatplotlibDep: - def test_background_gradient(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) - for c_map in [None, 'YlOrRd']: + for c_map in [None, "YlOrRd"]: result = df.style.background_gradient(cmap=c_map)._compute().ctx assert all("#" in x[0] for x in result.values()) assert result[(0, 0)] == result[(0, 1)] assert result[(1, 0)] == result[(1, 1)] - result = df.style.background_gradient( - subset=pd.IndexSlice[1, 'A'])._compute().ctx + result = ( + df.style.background_gradient(subset=pd.IndexSlice[1, "A"])._compute().ctx + ) - assert result[(1, 0)] == ['background-color: #fff7fb', - 'color: #000000'] + assert result[(1, 0)] == ["background-color: #fff7fb", "color: #000000"] @pytest.mark.parametrize( - 'c_map,expected', [ - (None, { - (0, 0): ['background-color: #440154', 'color: #f1f1f1'], - (1, 0): ['background-color: #fde725', 'color: #000000']}), - ('YlOrRd', { - (0, 0): ['background-color: #ffffcc', 'color: #000000'], - (1, 0): ['background-color: #800026', 'color: #f1f1f1']})]) + "c_map,expected", + [ + ( + None, + { + (0, 0): ["background-color: #440154", "color: #f1f1f1"], + (1, 0): ["background-color: #fde725", "color: #000000"], + }, + ), + ( + "YlOrRd", + { + (0, 0): ["background-color: #ffffcc", "color: #000000"], + (1, 0): ["background-color: #800026", "color: #f1f1f1"], + }, + ), + ], + ) def test_text_color_threshold(self, c_map, expected): - df = pd.DataFrame([1, 2], columns=['A']) + df = pd.DataFrame([1, 2], columns=["A"]) result = 
df.style.background_gradient(cmap=c_map)._compute().ctx assert result == expected - @pytest.mark.parametrize("text_color_threshold", [1.1, '1', -1, [2, 2]]) + @pytest.mark.parametrize("text_color_threshold", [1.1, "1", -1, [2, 2]]) def test_text_color_threshold_raises(self, text_color_threshold): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) msg = "`text_color_threshold` must be a value from 0 to 1." with pytest.raises(ValueError, match=msg): df.style.background_gradient( - text_color_threshold=text_color_threshold)._compute() + text_color_threshold=text_color_threshold + )._compute() @td.skip_if_no_mpl def test_background_gradient_axis(self): - df = pd.DataFrame([[1, 2], [2, 4]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [2, 4]], columns=["A", "B"]) - low = ['background-color: #f7fbff', 'color: #000000'] - high = ['background-color: #08306b', 'color: #f1f1f1'] - mid = ['background-color: #abd0e6', 'color: #000000'] - result = df.style.background_gradient(cmap='Blues', - axis=0)._compute().ctx + low = ["background-color: #f7fbff", "color: #000000"] + high = ["background-color: #08306b", "color: #f1f1f1"] + mid = ["background-color: #abd0e6", "color: #000000"] + result = df.style.background_gradient(cmap="Blues", axis=0)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == low assert result[(1, 0)] == high assert result[(1, 1)] == high - result = df.style.background_gradient(cmap='Blues', - axis=1)._compute().ctx + result = df.style.background_gradient(cmap="Blues", axis=1)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == high assert result[(1, 0)] == low assert result[(1, 1)] == high - result = df.style.background_gradient(cmap='Blues', - axis=None)._compute().ctx + result = df.style.background_gradient(cmap="Blues", axis=None)._compute().ctx assert result[(0, 0)] == low assert result[(0, 1)] == mid assert result[(1, 0)] == mid @@ -1289,10 +1629,23 @@ def test_background_gradient_axis(self): def test_block_names(): # catch accidental removal of a block expected = { - 'before_style', 'style', 'table_styles', 'before_cellstyle', - 'cellstyle', 'before_table', 'table', 'caption', 'thead', 'tbody', - 'after_table', 'before_head_rows', 'head_tr', 'after_head_rows', - 'before_rows', 'tr', 'after_rows', + "before_style", + "style", + "table_styles", + "before_cellstyle", + "cellstyle", + "before_table", + "table", + "caption", + "thead", + "tbody", + "after_table", + "before_head_rows", + "head_tr", + "after_head_rows", + "before_rows", + "tr", + "after_rows", } result = set(Styler.template.blocks) assert result == expected @@ -1300,14 +1653,17 @@ def test_block_names(): def test_from_custom_template(tmpdir): p = tmpdir.mkdir("templates").join("myhtml.tpl") - p.write(textwrap.dedent("""\ + p.write( + textwrap.dedent( + """\ {% extends "html.tpl" %} {% block table %}

<h1>{{ table_title|default("My Table") }}</h1>

{{ super() }} - {% endblock table %}""")) - result = Styler.from_custom_template(str(tmpdir.join('templates')), - 'myhtml.tpl') + {% endblock table %}""" + ) + ) + result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl") assert issubclass(result, Styler) assert result.env is not Styler.env assert result.template is not Styler.template diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 0acdaaa7f82cd..7b493266144b0 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -10,10 +10,10 @@ class TestToCSV: - - @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5), - reason=("Python csv library bug " - "(see https://bugs.python.org/issue32255)")) + @pytest.mark.xfail( + (3, 6, 5) > sys.version_info >= (3, 5), + reason=("Python csv library bug " "(see https://bugs.python.org/issue32255)"), + ) def test_to_csv_with_single_column(self): # see gh-18676, https://bugs.python.org/issue32255 # @@ -27,9 +27,9 @@ def test_to_csv_with_single_column(self): "" 1.0 """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df1.to_csv(path, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected1 df2 = DataFrame([1, None]) @@ -37,31 +37,31 @@ def test_to_csv_with_single_column(self): 1.0 "" """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df2.to_csv(path, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected2 def test_to_csv_defualt_encoding(self): # GH17097 - df = DataFrame({'col': ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) + df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: # the default to_csv encoding is uft-8. 
df.to_csv(path) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) def test_to_csv_quotechar(self): - df = DataFrame({'col': [1, 2]}) + df = DataFrame({"col": [1, 2]}) expected = """\ "","col" "0","1" "1","2" """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected expected = """\ @@ -70,396 +70,377 @@ def test_to_csv_quotechar(self): $1$,$2$ """ - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, quotechar="$") - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected - with tm.ensure_clean('test.csv') as path: - with pytest.raises(TypeError, match='quotechar'): + with tm.ensure_clean("test.csv") as path: + with pytest.raises(TypeError, match="quotechar"): df.to_csv(path, quoting=1, quotechar=None) def test_to_csv_doublequote(self): - df = DataFrame({'col': ['a"a', '"bb"']}) + df = DataFrame({"col": ['a"a', '"bb"']}) expected = '''\ "","col" "0","a""a" "1","""bb""" ''' - with tm.ensure_clean('test.csv') as path: + with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected from _csv import Error - with tm.ensure_clean('test.csv') as path: - with pytest.raises(Error, match='escapechar'): + + with tm.ensure_clean("test.csv") as path: + with pytest.raises(Error, match="escapechar"): df.to_csv(path, doublequote=False) # no escapechar set def test_to_csv_escapechar(self): - df = DataFrame({'col': ['a"a', '"bb"']}) - expected = '''\ + df = DataFrame({"col": ['a"a', '"bb"']}) + expected = """\ "","col" "0","a\\"a" "1","\\"bb\\"" -''' +""" - with tm.ensure_clean('test.csv') as path: # QUOTE_ALL - df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') - with open(path, 'r') as f: + with tm.ensure_clean("test.csv") as path: # QUOTE_ALL + df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") + with open(path, "r") as f: assert f.read() == expected - df = DataFrame({'col': ['a,a', ',bb,']}) + df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ ,col 0,a\\,a 1,\\,bb\\, """ - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE - with open(path, 'r') as f: + with tm.ensure_clean("test.csv") as path: + df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE + with open(path, "r") as f: assert f.read() == expected def test_csv_to_string(self): - df = DataFrame({'col': [1, 2]}) - expected_rows = [',col', - '0,1', - '1,2'] + df = DataFrame({"col": [1, 2]}) + expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected def test_to_csv_decimal(self): # see gh-781 - df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) + df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) - expected_rows = [',col1,col2,col3', - '0,1,a,10.1'] + expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected_default - expected_rows = [';col1;col2;col3', - '0;1;a;10,1'] - expected_european_excel = tm.convert_rows_list_to_csv_str( - expected_rows) - assert df.to_csv(decimal=',', sep=';') == expected_european_excel + expected_rows = [";col1;col2;col3", "0;1;a;10,1"] + expected_european_excel = 
tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(decimal=",", sep=";") == expected_european_excel - expected_rows = [',col1,col2,col3', - '0,1,a,10.10'] - expected_float_format_default = tm.convert_rows_list_to_csv_str( - expected_rows) - assert df.to_csv(float_format='%.2f') == expected_float_format_default + expected_rows = [",col1,col2,col3", "0,1,a,10.10"] + expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) + assert df.to_csv(float_format="%.2f") == expected_float_format_default - expected_rows = [';col1;col2;col3', - '0;1;a;10,10'] + expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(decimal=',', sep=';', - float_format='%.2f') == expected_float_format + assert ( + df.to_csv(decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) # see gh-11553: testing if decimal is taken into account for '0.0' - df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1}) + df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) - expected_rows = ['a,b,c', - '0^0,2^2,1', - '1^1,3^3,1'] + expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False, decimal='^') == expected + assert df.to_csv(index=False, decimal="^") == expected # same but for an index - assert df.set_index('a').to_csv(decimal='^') == expected + assert df.set_index("a").to_csv(decimal="^") == expected # same for a multi-index - assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected + assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected def test_to_csv_float_format(self): # testing if float_format is taken into account for the index # GH 11553 - df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1}) + df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) - expected_rows = ['a,b,c', - '0,2.20,1', - '1,3.30,1'] + expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(float_format='%.2f') == expected + assert df.set_index("a").to_csv(float_format="%.2f") == expected # same for a multi-index - assert df.set_index(['a', 'b']).to_csv( - float_format='%.2f') == expected + assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected def test_to_csv_na_rep(self): # see gh-11553 # # Testing if NaN values are correctly represented in the index. 
- df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '0.0,0,2', - '_,1,3'] + df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs - df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '_,0,2', - '_,1,3'] + df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # check if na_rep parameter does not break anything when no NaN - df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]}) - expected_rows = ['a,b,c', - '0,0,2', - '0,1,3'] + df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) + expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index('a').to_csv(na_rep='_') == expected - assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected + assert df.set_index("a").to_csv(na_rep="_") == expected + assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected def test_to_csv_date_format(self): # GH 10209 - df_sec = DataFrame({'A': pd.date_range('20130101', periods=5, freq='s') - }) - df_day = DataFrame({'A': pd.date_range('20130101', periods=5, freq='d') - }) - - expected_rows = [',A', - '0,2013-01-01 00:00:00', - '1,2013-01-01 00:00:01', - '2,2013-01-01 00:00:02', - '3,2013-01-01 00:00:03', - '4,2013-01-01 00:00:04'] + df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) + df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")}) + + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-01 00:00:01", + "2,2013-01-01 00:00:02", + "3,2013-01-01 00:00:03", + "4,2013-01-01 00:00:04", + ] expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) assert df_sec.to_csv() == expected_default_sec - expected_rows = [',A', - '0,2013-01-01 00:00:00', - '1,2013-01-02 00:00:00', - '2,2013-01-03 00:00:00', - '3,2013-01-04 00:00:00', - '4,2013-01-05 00:00:00'] + expected_rows = [ + ",A", + "0,2013-01-01 00:00:00", + "1,2013-01-02 00:00:00", + "2,2013-01-03 00:00:00", + "3,2013-01-04 00:00:00", + "4,2013-01-05 00:00:00", + ] expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') == - expected_ymdhms_day) - - expected_rows = [',A', - '0,2013-01-01', - '1,2013-01-01', - '2,2013-01-01', - '3,2013-01-01', - '4,2013-01-01'] + assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-01", + "2,2013-01-01", + "3,2013-01-01", + "4,2013-01-01", + ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec - - expected_rows = [',A', - '0,2013-01-01', - '1,2013-01-02', - '2,2013-01-03', - '3,2013-01-04', - '4,2013-01-05'] + assert 
df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + + expected_rows = [ + ",A", + "0,2013-01-01", + "1,2013-01-02", + "2,2013-01-03", + "3,2013-01-04", + "4,2013-01-05", + ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) assert df_day.to_csv() == expected_default_day - assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day + assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day # see gh-7791 # # Testing if date_format parameter is taken into account # for multi-indexed DataFrames. - df_sec['B'] = 0 - df_sec['C'] = 1 + df_sec["B"] = 0 + df_sec["C"] = 1 - expected_rows = ['A,B,C', - '2013-01-01,0,1'] + expected_rows = ["A,B,C", "2013-01-01,0,1"] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B']) - assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') == - expected_ymd_sec) + df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) + assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) - exp_rows = [',1', - ',2', - '0,1'] + exp_rows = [",1", ",2", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['1', '2', '1'] + exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]), - index=pd.MultiIndex.from_arrays([[1], [2]])) + df = DataFrame( + [1], + columns=pd.MultiIndex.from_arrays([[1], [2]]), + index=pd.MultiIndex.from_arrays([[1], [2]]), + ) - exp_rows = [',,1', ',,2', '1,2,1'] + exp_rows = [",,1", ",,2", "1,2,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['1', '2', '1'] + exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - df = DataFrame( - [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']])) + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) - exp_rows = [',foo', ',bar', '0,1'] + exp_rows = [",foo", ",bar", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp - exp_rows = ['foo', 'bar', '1'] + exp_rows = ["foo", "bar", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp - @pytest.mark.parametrize("ind,expected", [ - (pd.MultiIndex(levels=[[1.0]], - codes=[[0]], - names=["x"]), - "x,data\n1.0,1\n"), - (pd.MultiIndex(levels=[[1.], [2.]], - codes=[[0], [0]], - names=["x", "y"]), - "x,y,data\n1.0,2.0,1\n") - ]) - @pytest.mark.parametrize("klass", [ - pd.DataFrame, pd.Series - ]) + @pytest.mark.parametrize( + "ind,expected", + [ + ( + pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]), + "x,data\n1.0,1\n", + ), + ( + pd.MultiIndex( + levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"] + ), + "x,y,data\n1.0,2.0,1\n", + ), + ], + ) + @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series]) def test_to_csv_single_level_multi_index(self, ind, expected, klass): # see gh-19589 result = klass(pd.Series([1], ind, name="data")).to_csv( - line_terminator="\n", header=True) + line_terminator="\n", header=True + ) assert result == expected def test_to_csv_string_array_ascii(self): # GH 10813 - str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}] + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", 
"qux"]}] df = pd.DataFrame(str_array) - expected_ascii = '''\ + expected_ascii = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" -''' - with tm.ensure_clean('str_test.csv') as path: - df.to_csv(path, encoding='ascii') - with open(path, 'r') as f: +""" + with tm.ensure_clean("str_test.csv") as path: + df.to_csv(path, encoding="ascii") + with open(path, "r") as f: assert f.read() == expected_ascii @pytest.mark.xfail(strict=False) def test_to_csv_string_array_utf8(self): # GH 10813 - str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}] + str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = pd.DataFrame(str_array) - expected_utf8 = '''\ + expected_utf8 = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" -''' - with tm.ensure_clean('unicode_test.csv') as path: - df.to_csv(path, encoding='utf-8') - with open(path, 'r') as f: +""" + with tm.ensure_clean("unicode_test.csv") as path: + df.to_csv(path, encoding="utf-8") + with open(path, "r") as f: assert f.read() == expected_utf8 def test_to_csv_string_with_lf(self): # GH 20353 - data = { - 'int': [1, 2, 3], - 'str_lf': ['abc', 'd\nef', 'g\nh\n\ni'] - } + data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = pd.DataFrame(data) - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode('utf-8') + os_linesep = os.linesep.encode("utf-8") expected_noarg = ( - b'int,str_lf' + os_linesep + - b'1,abc' + os_linesep + - b'2,"d\nef"' + os_linesep + - b'3,"g\nh\n\ni"' + os_linesep + b"int,str_lf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\nef"' + + os_linesep + + b'3,"g\nh\n\ni"' + + os_linesep ) df.to_csv(path, index=False) - with open(path, 'rb') as f: + with open(path, "rb") as f: assert f.read() == expected_noarg - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator - expected_lf = ( - b'int,str_lf\n' - b'1,abc\n' - b'2,"d\nef"\n' - b'3,"g\nh\n\ni"\n' - ) - df.to_csv(path, line_terminator='\n', index=False) - with open(path, 'rb') as f: + expected_lf = b"int,str_lf\n" b"1,abc\n" b'2,"d\nef"\n' b'3,"g\nh\n\ni"\n' + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_lf - with tm.ensure_clean('lf_test.csv') as path: + with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = ( - b'int,str_lf\r\n' - b'1,abc\r\n' - b'2,"d\nef"\r\n' - b'3,"g\nh\n\ni"\r\n' + b"int,str_lf\r\n" b"1,abc\r\n" b'2,"d\nef"\r\n' b'3,"g\nh\n\ni"\r\n' ) - df.to_csv(path, line_terminator='\r\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_string_with_crlf(self): # GH 20353 - data = { - 'int': [1, 2, 3], - 'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni'] - } + data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = pd.DataFrame(data) - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) - os_linesep = os.linesep.encode('utf-8') + os_linesep = os.linesep.encode("utf-8") expected_noarg = ( - b'int,str_crlf' + os_linesep + - b'1,abc' + os_linesep + - b'2,"d\r\nef"' + os_linesep + - b'3,"g\r\nh\r\n\r\ni"' + os_linesep + 
b"int,str_crlf" + + os_linesep + + b"1,abc" + + os_linesep + + b'2,"d\r\nef"' + + os_linesep + + b'3,"g\r\nh\r\n\r\ni"' + + os_linesep ) df.to_csv(path, index=False) - with open(path, 'rb') as f: + with open(path, "rb") as f: assert f.read() == expected_noarg - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator expected_lf = ( - b'int,str_crlf\n' - b'1,abc\n' - b'2,"d\r\nef"\n' - b'3,"g\r\nh\r\n\r\ni"\n' + b"int,str_crlf\n" b"1,abc\n" b'2,"d\r\nef"\n' b'3,"g\r\nh\r\n\r\ni"\n' ) - df.to_csv(path, line_terminator='\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_lf - with tm.ensure_clean('crlf_test.csv') as path: + with tm.ensure_clean("crlf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = ( - b'int,str_crlf\r\n' - b'1,abc\r\n' + b"int,str_crlf\r\n" + b"1,abc\r\n" b'2,"d\r\nef"\r\n' b'3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(path, line_terminator='\r\n', index=False) - with open(path, 'rb') as f: + df.to_csv(path, line_terminator="\r\n", index=False) + with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_stdout_file(self, capsys): # GH 21561 - df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], - columns=['name_1', 'name_2']) - expected_rows = [',name_1,name_2', - '0,foo,bar', - '1,baz,qux'] + df = pd.DataFrame( + [["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"] + ) + expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) - df.to_csv(sys.stdout, encoding='ascii') + df.to_csv(sys.stdout, encoding="ascii") captured = capsys.readouterr() assert captured.out == expected_ascii @@ -467,52 +448,53 @@ def test_to_csv_stdout_file(self, capsys): @pytest.mark.xfail( compat.is_platform_windows(), - reason=("Especially in Windows, file stream should not be passed" - "to csv writer without newline='' option." - "(https://docs.python.org/3.6/library/csv.html#csv.writer)")) + reason=( + "Especially in Windows, file stream should not be passed" + "to csv writer without newline='' option." 
+ "(https://docs.python.org/3.6/library/csv.html#csv.writer)" + ), + ) def test_to_csv_write_to_open_file(self): # GH 21696 - df = pd.DataFrame({'a': ['x', 'y', 'z']}) - expected = '''\ + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected = """\ manual header x y z -''' - with tm.ensure_clean('test.txt') as path: - with open(path, 'w') as f: - f.write('manual header\n') +""" + with tm.ensure_clean("test.txt") as path: + with open(path, "w") as f: + f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, 'r') as f: + with open(path, "r") as f: assert f.read() == expected def test_to_csv_write_to_open_file_with_newline_py3(self): # see gh-21696 # see gh-20353 - df = pd.DataFrame({'a': ['x', 'y', 'z']}) - expected_rows = ["x", - "y", - "z"] - expected = ("manual header\n" + - tm.convert_rows_list_to_csv_str(expected_rows)) - with tm.ensure_clean('test.txt') as path: - with open(path, 'w', newline='') as f: - f.write('manual header\n') + df = pd.DataFrame({"a": ["x", "y", "z"]}) + expected_rows = ["x", "y", "z"] + expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) + with tm.ensure_clean("test.txt") as path: + with open(path, "w", newline="") as f: + f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, 'rb') as f: - assert f.read() == bytes(expected, 'utf-8') + with open(path, "rb") as f: + assert f.read() == bytes(expected, "utf-8") @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) - def test_to_csv_compression(self, compression_only, - read_infer, to_infer): + def test_to_csv_compression(self, compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only if compression == "zip": - pytest.skip("{compression} is not supported " - "for to_csv".format(compression=compression)) + pytest.skip( + "{compression} is not supported " + "for to_csv".format(compression=compression) + ) # We'll complete file extension subsequently. filename = "test." 
@@ -531,6 +513,5 @@ def test_to_csv_compression(self, compression_only, with tm.ensure_clean(filename) as path: df.to_csv(path, compression=to_compression) - result = pd.read_csv(path, index_col=0, - compression=read_compression) + result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 13eb517fcab6a..1440b0a6f06f1 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -11,172 +11,200 @@ from pandas.io.formats.excel import CSSToExcelConverter -@pytest.mark.parametrize('css,expected', [ - # FONT - # - name - ('font-family: foo,bar', {'font': {'name': 'foo'}}), - ('font-family: "foo bar",baz', {'font': {'name': 'foo bar'}}), - ('font-family: foo,\nbar', {'font': {'name': 'foo'}}), - ('font-family: foo, bar, baz', {'font': {'name': 'foo'}}), - ('font-family: bar, foo', {'font': {'name': 'bar'}}), - ('font-family: \'foo bar\', baz', {'font': {'name': 'foo bar'}}), - ('font-family: \'foo \\\'bar\', baz', {'font': {'name': 'foo \'bar'}}), - ('font-family: "foo \\"bar", baz', {'font': {'name': 'foo "bar'}}), - ('font-family: "foo ,bar", baz', {'font': {'name': 'foo ,bar'}}), - # - family - ('font-family: serif', {'font': {'name': 'serif', 'family': 1}}), - ('font-family: Serif', {'font': {'name': 'serif', 'family': 1}}), - ('font-family: roman, serif', {'font': {'name': 'roman', 'family': 1}}), - ('font-family: roman, sans-serif', {'font': {'name': 'roman', - 'family': 2}}), - ('font-family: roman, sans serif', {'font': {'name': 'roman'}}), - ('font-family: roman, sansserif', {'font': {'name': 'roman'}}), - ('font-family: roman, cursive', {'font': {'name': 'roman', 'family': 4}}), - ('font-family: roman, fantasy', {'font': {'name': 'roman', 'family': 5}}), - # - size - ('font-size: 1em', {'font': {'size': 12}}), - ('font-size: xx-small', {'font': {'size': 6}}), - ('font-size: x-small', {'font': {'size': 7.5}}), - ('font-size: small', {'font': {'size': 9.6}}), - ('font-size: medium', {'font': {'size': 12}}), - ('font-size: large', {'font': {'size': 13.5}}), - ('font-size: x-large', {'font': {'size': 18}}), - ('font-size: xx-large', {'font': {'size': 24}}), - ('font-size: 50%', {'font': {'size': 6}}), - # - bold - ('font-weight: 100', {'font': {'bold': False}}), - ('font-weight: 200', {'font': {'bold': False}}), - ('font-weight: 300', {'font': {'bold': False}}), - ('font-weight: 400', {'font': {'bold': False}}), - ('font-weight: normal', {'font': {'bold': False}}), - ('font-weight: lighter', {'font': {'bold': False}}), - ('font-weight: bold', {'font': {'bold': True}}), - ('font-weight: bolder', {'font': {'bold': True}}), - ('font-weight: 700', {'font': {'bold': True}}), - ('font-weight: 800', {'font': {'bold': True}}), - ('font-weight: 900', {'font': {'bold': True}}), - # - italic - ('font-style: italic', {'font': {'italic': True}}), - ('font-style: oblique', {'font': {'italic': True}}), - # - underline - ('text-decoration: underline', - {'font': {'underline': 'single'}}), - ('text-decoration: overline', - {}), - ('text-decoration: none', - {}), - # - strike - ('text-decoration: line-through', - {'font': {'strike': True}}), - ('text-decoration: underline line-through', - {'font': {'strike': True, 'underline': 'single'}}), - ('text-decoration: underline; text-decoration: line-through', - {'font': {'strike': True}}), - # - color - ('color: red', {'font': {'color': 'FF0000'}}), - ('color: #ff0000', 
{'font': {'color': 'FF0000'}}), - ('color: #f0a', {'font': {'color': 'FF00AA'}}), - # - shadow - ('text-shadow: none', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px #CCC', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px #999', {'font': {'shadow': False}}), - ('text-shadow: 0px -0em 0px', {'font': {'shadow': False}}), - ('text-shadow: 2px -0em 0px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -2em 0px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}), - ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}), - ('text-shadow: 0px -2em', {'font': {'shadow': True}}), - - # FILL - # - color, fillType - ('background-color: red', {'fill': {'fgColor': 'FF0000', - 'patternType': 'solid'}}), - ('background-color: #ff0000', {'fill': {'fgColor': 'FF0000', - 'patternType': 'solid'}}), - ('background-color: #f0a', {'fill': {'fgColor': 'FF00AA', - 'patternType': 'solid'}}), - # BORDER - # - style - ('border-style: solid', - {'border': {'top': {'style': 'medium'}, - 'bottom': {'style': 'medium'}, - 'left': {'style': 'medium'}, - 'right': {'style': 'medium'}}}), - ('border-style: solid; border-width: thin', - {'border': {'top': {'style': 'thin'}, - 'bottom': {'style': 'thin'}, - 'left': {'style': 'thin'}, - 'right': {'style': 'thin'}}}), - - ('border-top-style: solid; border-top-width: thin', - {'border': {'top': {'style': 'thin'}}}), - ('border-top-style: solid; border-top-width: 1pt', - {'border': {'top': {'style': 'thin'}}}), - ('border-top-style: solid', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: medium', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: 2pt', - {'border': {'top': {'style': 'medium'}}}), - ('border-top-style: solid; border-top-width: thick', - {'border': {'top': {'style': 'thick'}}}), - ('border-top-style: solid; border-top-width: 4pt', - {'border': {'top': {'style': 'thick'}}}), - - ('border-top-style: dotted', - {'border': {'top': {'style': 'mediumDashDotDot'}}}), - ('border-top-style: dotted; border-top-width: thin', - {'border': {'top': {'style': 'dotted'}}}), - ('border-top-style: dashed', - {'border': {'top': {'style': 'mediumDashed'}}}), - ('border-top-style: dashed; border-top-width: thin', - {'border': {'top': {'style': 'dashed'}}}), - ('border-top-style: double', - {'border': {'top': {'style': 'double'}}}), - # - color - ('border-style: solid; border-color: #0000ff', - {'border': {'top': {'style': 'medium', 'color': '0000FF'}, - 'right': {'style': 'medium', 'color': '0000FF'}, - 'bottom': {'style': 'medium', 'color': '0000FF'}, - 'left': {'style': 'medium', 'color': '0000FF'}}}), - ('border-top-style: double; border-top-color: blue', - {'border': {'top': {'style': 'double', 'color': '0000FF'}}}), - ('border-top-style: solid; border-top-color: #06c', - {'border': {'top': {'style': 'medium', 'color': '0066CC'}}}), - # ALIGNMENT - # - horizontal - ('text-align: center', - {'alignment': {'horizontal': 'center'}}), - ('text-align: left', - {'alignment': {'horizontal': 'left'}}), - ('text-align: right', - {'alignment': {'horizontal': 'right'}}), - ('text-align: justify', - {'alignment': {'horizontal': 'justify'}}), - # - vertical - ('vertical-align: top', - {'alignment': {'vertical': 'top'}}), - ('vertical-align: text-top', - {'alignment': {'vertical': 'top'}}), - ('vertical-align: middle', - {'alignment': {'vertical': 'center'}}), - ('vertical-align: bottom', - {'alignment': {'vertical': 'bottom'}}), - 
('vertical-align: text-bottom', - {'alignment': {'vertical': 'bottom'}}), - # - wrap_text - ('white-space: nowrap', - {'alignment': {'wrap_text': False}}), - ('white-space: pre', - {'alignment': {'wrap_text': False}}), - ('white-space: pre-line', - {'alignment': {'wrap_text': False}}), - ('white-space: normal', - {'alignment': {'wrap_text': True}}), - # NUMBER FORMAT - ('number-format: 0%', - {'number_format': {'format_code': '0%'}}), -]) +@pytest.mark.parametrize( + "css,expected", + [ + # FONT + # - name + ("font-family: foo,bar", {"font": {"name": "foo"}}), + ('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}), + ("font-family: foo,\nbar", {"font": {"name": "foo"}}), + ("font-family: foo, bar, baz", {"font": {"name": "foo"}}), + ("font-family: bar, foo", {"font": {"name": "bar"}}), + ("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}), + ("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}), + ('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}), + ('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}), + # - family + ("font-family: serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: Serif", {"font": {"name": "serif", "family": 1}}), + ("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}), + ("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}), + ("font-family: roman, sans serif", {"font": {"name": "roman"}}), + ("font-family: roman, sansserif", {"font": {"name": "roman"}}), + ("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}), + ("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}), + # - size + ("font-size: 1em", {"font": {"size": 12}}), + ("font-size: xx-small", {"font": {"size": 6}}), + ("font-size: x-small", {"font": {"size": 7.5}}), + ("font-size: small", {"font": {"size": 9.6}}), + ("font-size: medium", {"font": {"size": 12}}), + ("font-size: large", {"font": {"size": 13.5}}), + ("font-size: x-large", {"font": {"size": 18}}), + ("font-size: xx-large", {"font": {"size": 24}}), + ("font-size: 50%", {"font": {"size": 6}}), + # - bold + ("font-weight: 100", {"font": {"bold": False}}), + ("font-weight: 200", {"font": {"bold": False}}), + ("font-weight: 300", {"font": {"bold": False}}), + ("font-weight: 400", {"font": {"bold": False}}), + ("font-weight: normal", {"font": {"bold": False}}), + ("font-weight: lighter", {"font": {"bold": False}}), + ("font-weight: bold", {"font": {"bold": True}}), + ("font-weight: bolder", {"font": {"bold": True}}), + ("font-weight: 700", {"font": {"bold": True}}), + ("font-weight: 800", {"font": {"bold": True}}), + ("font-weight: 900", {"font": {"bold": True}}), + # - italic + ("font-style: italic", {"font": {"italic": True}}), + ("font-style: oblique", {"font": {"italic": True}}), + # - underline + ("text-decoration: underline", {"font": {"underline": "single"}}), + ("text-decoration: overline", {}), + ("text-decoration: none", {}), + # - strike + ("text-decoration: line-through", {"font": {"strike": True}}), + ( + "text-decoration: underline line-through", + {"font": {"strike": True, "underline": "single"}}, + ), + ( + "text-decoration: underline; text-decoration: line-through", + {"font": {"strike": True}}, + ), + # - color + ("color: red", {"font": {"color": "FF0000"}}), + ("color: #ff0000", {"font": {"color": "FF0000"}}), + ("color: #f0a", {"font": {"color": "FF00AA"}}), + # - shadow + ("text-shadow: none", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": 
False}}), + ("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}), + ("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}), + ("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}), + ("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}), + ("text-shadow: 0px -2em", {"font": {"shadow": True}}), + # FILL + # - color, fillType + ( + "background-color: red", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #ff0000", + {"fill": {"fgColor": "FF0000", "patternType": "solid"}}, + ), + ( + "background-color: #f0a", + {"fill": {"fgColor": "FF00AA", "patternType": "solid"}}, + ), + # BORDER + # - style + ( + "border-style: solid", + { + "border": { + "top": {"style": "medium"}, + "bottom": {"style": "medium"}, + "left": {"style": "medium"}, + "right": {"style": "medium"}, + } + }, + ), + ( + "border-style: solid; border-width: thin", + { + "border": { + "top": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + "right": {"style": "thin"}, + } + }, + ), + ( + "border-top-style: solid; border-top-width: thin", + {"border": {"top": {"style": "thin"}}}, + ), + ( + "border-top-style: solid; border-top-width: 1pt", + {"border": {"top": {"style": "thin"}}}, + ), + ("border-top-style: solid", {"border": {"top": {"style": "medium"}}}), + ( + "border-top-style: solid; border-top-width: medium", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: 2pt", + {"border": {"top": {"style": "medium"}}}, + ), + ( + "border-top-style: solid; border-top-width: thick", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: solid; border-top-width: 4pt", + {"border": {"top": {"style": "thick"}}}, + ), + ( + "border-top-style: dotted", + {"border": {"top": {"style": "mediumDashDotDot"}}}, + ), + ( + "border-top-style: dotted; border-top-width: thin", + {"border": {"top": {"style": "dotted"}}}, + ), + ("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}), + ( + "border-top-style: dashed; border-top-width: thin", + {"border": {"top": {"style": "dashed"}}}, + ), + ("border-top-style: double", {"border": {"top": {"style": "double"}}}), + # - color + ( + "border-style: solid; border-color: #0000ff", + { + "border": { + "top": {"style": "medium", "color": "0000FF"}, + "right": {"style": "medium", "color": "0000FF"}, + "bottom": {"style": "medium", "color": "0000FF"}, + "left": {"style": "medium", "color": "0000FF"}, + } + }, + ), + ( + "border-top-style: double; border-top-color: blue", + {"border": {"top": {"style": "double", "color": "0000FF"}}}, + ), + ( + "border-top-style: solid; border-top-color: #06c", + {"border": {"top": {"style": "medium", "color": "0066CC"}}}, + ), + # ALIGNMENT + # - horizontal + ("text-align: center", {"alignment": {"horizontal": "center"}}), + ("text-align: left", {"alignment": {"horizontal": "left"}}), + ("text-align: right", {"alignment": {"horizontal": "right"}}), + ("text-align: justify", {"alignment": {"horizontal": "justify"}}), + # - vertical + ("vertical-align: top", {"alignment": {"vertical": "top"}}), + ("vertical-align: text-top", {"alignment": {"vertical": "top"}}), + ("vertical-align: middle", {"alignment": {"vertical": "center"}}), + ("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}), + ("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}), + # 
- wrap_text + ("white-space: nowrap", {"alignment": {"wrap_text": False}}), + ("white-space: pre", {"alignment": {"wrap_text": False}}), + ("white-space: pre-line", {"alignment": {"wrap_text": False}}), + ("white-space: normal", {"alignment": {"wrap_text": True}}), + # NUMBER FORMAT + ("number-format: 0%", {"number_format": {"format_code": "0%"}}), + ], +) def test_css_to_excel(css, expected): convert = CSSToExcelConverter() assert expected == convert(css) @@ -184,7 +212,8 @@ def test_css_to_excel(css, expected): def test_css_to_excel_multiple(): convert = CSSToExcelConverter() - actual = convert(''' + actual = convert( + """ font-weight: bold; text-decoration: underline; color: red; @@ -192,63 +221,71 @@ def test_css_to_excel_multiple(): text-align: center; vertical-align: top; unused: something; - ''') - assert {"font": {"bold": True, "underline": "single", "color": "FF0000"}, - "border": {"top": {"style": "thin"}, - "right": {"style": "thin"}, - "bottom": {"style": "thin"}, - "left": {"style": "thin"}}, - "alignment": {"horizontal": "center", - "vertical": "top"}} == actual + """ + ) + assert { + "font": {"bold": True, "underline": "single", "color": "FF0000"}, + "border": { + "top": {"style": "thin"}, + "right": {"style": "thin"}, + "bottom": {"style": "thin"}, + "left": {"style": "thin"}, + }, + "alignment": {"horizontal": "center", "vertical": "top"}, + } == actual -@pytest.mark.parametrize('css,inherited,expected', [ - ('font-weight: bold', '', - {'font': {'bold': True}}), - ('', 'font-weight: bold', - {'font': {'bold': True}}), - ('font-weight: bold', 'font-style: italic', - {'font': {'bold': True, 'italic': True}}), - ('font-style: normal', 'font-style: italic', - {'font': {'italic': False}}), - ('font-style: inherit', '', {}), - ('font-style: normal; font-style: inherit', 'font-style: italic', - {'font': {'italic': True}}), -]) +@pytest.mark.parametrize( + "css,inherited,expected", + [ + ("font-weight: bold", "", {"font": {"bold": True}}), + ("", "font-weight: bold", {"font": {"bold": True}}), + ( + "font-weight: bold", + "font-style: italic", + {"font": {"bold": True, "italic": True}}, + ), + ("font-style: normal", "font-style: italic", {"font": {"italic": False}}), + ("font-style: inherit", "", {}), + ( + "font-style: normal; font-style: inherit", + "font-style: italic", + {"font": {"italic": True}}, + ), + ], +) def test_css_to_excel_inherited(css, inherited, expected): convert = CSSToExcelConverter(inherited) assert expected == convert(css) -@pytest.mark.parametrize("input_color,output_color", ( - [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + - [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + - [("#F0F", "FF00FF"), ("#ABC", "AABBCC")]) +@pytest.mark.parametrize( + "input_color,output_color", + ( + [(name, rgb) for name, rgb in CSSToExcelConverter.NAMED_COLORS.items()] + + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()] + + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")] + ), ) def test_css_to_excel_good_colors(input_color, output_color): # see gh-18392 - css = ("border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}").format(color=input_color) + css = ( + "border-top-color: {color}; " + "border-right-color: {color}; " + "border-bottom-color: {color}; " + "border-left-color: {color}; " + "background-color: {color}; " + "color: {color}" + ).format(color=input_color) expected = dict() - 
expected["fill"] = { - "patternType": "solid", - "fgColor": output_color - } + expected["fill"] = {"patternType": "solid", "fgColor": output_color} - expected["font"] = { - "color": output_color - } + expected["font"] = {"color": output_color} expected["border"] = { - k: { - "color": output_color, - } for k in ("top", "right", "bottom", "left") + k: {"color": output_color} for k in ("top", "right", "bottom", "left") } with tm.assert_produces_warning(None): @@ -259,19 +296,19 @@ def test_css_to_excel_good_colors(input_color, output_color): @pytest.mark.parametrize("input_color", [None, "not-a-color"]) def test_css_to_excel_bad_colors(input_color): # see gh-18392 - css = ("border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}").format(color=input_color) + css = ( + "border-top-color: {color}; " + "border-right-color: {color}; " + "border-bottom-color: {color}; " + "border-left-color: {color}; " + "background-color: {color}; " + "color: {color}" + ).format(color=input_color) expected = dict() if input_color is not None: - expected["fill"] = { - "patternType": "solid" - } + expected["fill"] = {"patternType": "solid"} with tm.assert_produces_warning(CSSWarning): convert = CSSToExcelConverter() diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 97d51f079fb2d..448e869df950d 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -18,7 +18,8 @@ " ea commodo consequat. Duis aute irure dolor in reprehenderit in" " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" " sint occaecat cupidatat non proident, sunt in culpa qui officia" - " deserunt mollit anim id est laborum.") + " deserunt mollit anim id est laborum." +) def expected_html(datapath, name): @@ -36,24 +37,25 @@ def expected_html(datapath, name): ------- str : contents of HTML file. 
""" - filename = '.'.join([name, 'html']) - filepath = datapath('io', 'formats', 'data', 'html', filename) - with open(filepath, encoding='utf-8') as f: + filename = ".".join([name, "html"]) + filepath = datapath("io", "formats", "data", "html", filename) + with open(filepath, encoding="utf-8") as f: html = f.read() return html.rstrip() -@pytest.fixture(params=['mixed', 'empty']) +@pytest.fixture(params=["mixed", "empty"]) def biggie_df_fixture(request): """Fixture for a big mixed Dataframe and an empty Dataframe""" - if request.param == 'mixed': - df = DataFrame({'A': np.random.randn(200), - 'B': tm.makeStringIndex(200)}, - index=np.arange(200)) - df.loc[:20, 'A'] = np.nan - df.loc[:20, 'B'] = np.nan + if request.param == "mixed": + df = DataFrame( + {"A": np.random.randn(200), "B": tm.makeStringIndex(200)}, + index=np.arange(200), + ) + df.loc[:20, "A"] = np.nan + df.loc[:20, "B"] = np.nan return df - elif request.param == 'empty': + elif request.param == "empty": df = DataFrame(index=np.arange(200)) return df @@ -63,7 +65,7 @@ def justify(request): return request.param -@pytest.mark.parametrize('col_space', [30, 50]) +@pytest.mark.parametrize("col_space", [30, 50]) def test_to_html_with_col_space(col_space): df = DataFrame(np.random.random(size=(1, 3))) # check that col_space affects HTML generation @@ -78,16 +80,19 @@ def test_to_html_with_col_space(col_space): def test_to_html_with_empty_string_label(): # GH 3547, to_html regards empty string labels as repeated labels - data = {'c1': ['a', 'b'], 'c2': ['a', ''], 'data': [1, 2]} - df = DataFrame(data).set_index(['c1', 'c2']) + data = {"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]} + df = DataFrame(data).set_index(["c1", "c2"]) result = df.to_html() assert "rowspan" not in result -@pytest.mark.parametrize('df,expected', [ - (DataFrame({'\u03c3': np.arange(10.)}), 'unicode_1'), - (DataFrame({'A': ['\u03c3']}), 'unicode_2') -]) +@pytest.mark.parametrize( + "df,expected", + [ + (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"), + (DataFrame({"A": ["\u03c3"]}), "unicode_2"), + ], +) def test_to_html_unicode(df, expected, datapath): expected = expected_html(datapath, expected) result = df.to_html() @@ -96,99 +101,112 @@ def test_to_html_unicode(df, expected, datapath): def test_to_html_decimal(datapath): # GH 12031 - df = DataFrame({'A': [6.0, 3.1, 2.2]}) - result = df.to_html(decimal=',') - expected = expected_html(datapath, 'gh12031_expected_output') + df = DataFrame({"A": [6.0, 3.1, 2.2]}) + result = df.to_html(decimal=",") + expected = expected_html(datapath, "gh12031_expected_output") assert result == expected -@pytest.mark.parametrize('kwargs,string,expected', [ - (dict(), "", 'escaped'), - (dict(escape=False), "bold", 'escape_disabled') -]) +@pytest.mark.parametrize( + "kwargs,string,expected", + [ + (dict(), "", "escaped"), + (dict(escape=False), "bold", "escape_disabled"), + ], +) def test_to_html_escaped(kwargs, string, expected, datapath): - a = 'strl2': {a: string, - b: string}} + test_dict = {"col2": {a: string, b: string}} result = DataFrame(test_dict).to_html(**kwargs) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('index_is_named', [True, False]) +@pytest.mark.parametrize("index_is_named", [True, False]) def test_to_html_multiindex_index_false(index_is_named, datapath): # GH 8452 - df = DataFrame({ - 'a': range(2), - 'b': range(3, 5), - 'c': range(5, 7), - 'd': range(3, 5) - }) - df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) + df = DataFrame( + {"a": 
range(2), "b": range(3, 5), "c": range(5, 7), "d": range(3, 5)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) if index_is_named: - df.index = Index(df.index.values, name='idx') + df.index = Index(df.index.values, name="idx") result = df.to_html(index=False) - expected = expected_html(datapath, 'gh8452_expected_output') + expected = expected_html(datapath, "gh8452_expected_output") assert result == expected -@pytest.mark.parametrize('multi_sparse,expected', [ - (False, 'multiindex_sparsify_false_multi_sparse_1'), - (False, 'multiindex_sparsify_false_multi_sparse_2'), - (True, 'multiindex_sparsify_1'), - (True, 'multiindex_sparsify_2') -]) +@pytest.mark.parametrize( + "multi_sparse,expected", + [ + (False, "multiindex_sparsify_false_multi_sparse_1"), + (False, "multiindex_sparsify_false_multi_sparse_2"), + (True, "multiindex_sparsify_1"), + (True, "multiindex_sparsify_2"), + ], +) def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath): - index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], - names=['foo', None]) + index = MultiIndex.from_arrays([[0, 0, 1, 1], [0, 1, 0, 1]], names=["foo", None]) df = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=index) - if expected.endswith('2'): + if expected.endswith("2"): df.columns = index[::2] - with option_context('display.multi_sparse', multi_sparse): + with option_context("display.multi_sparse", multi_sparse): result = df.to_html() expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('max_rows,expected', [ - (60, 'gh14882_expected_output_1'), - - # Test that ... appears in a middle level - (56, 'gh14882_expected_output_2') -]) +@pytest.mark.parametrize( + "max_rows,expected", + [ + (60, "gh14882_expected_output_1"), + # Test that ... 
appears in a middle level + (56, "gh14882_expected_output_2"), + ], +) def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath): # GH 14882 - Issue on truncation with odd length DataFrame - index = MultiIndex.from_product([[100, 200, 300], - [10, 20, 30], - [1, 2, 3, 4, 5, 6, 7]], - names=['a', 'b', 'c']) - df = DataFrame({'n': range(len(index))}, index=index) + index = MultiIndex.from_product( + [[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]], names=["a", "b", "c"] + ) + df = DataFrame({"n": range(len(index))}, index=index) result = df.to_html(max_rows=max_rows) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('df,formatters,expected', [ - (DataFrame( - [[0, 1], [2, 3], [4, 5], [6, 7]], - columns=['foo', None], index=np.arange(4)), - {'__index__': lambda x: 'abcd' [x]}, - 'index_formatter'), - - (DataFrame( - {'months': [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), - {'months': lambda x: x.strftime('%Y-%m')}, - 'datetime64_monthformatter'), - - (DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], - format='%H:%M:%S.%f')}), - {'hod': lambda x: x.strftime('%H:%M')}, - 'datetime64_hourformatter') -]) +@pytest.mark.parametrize( + "df,formatters,expected", + [ + ( + DataFrame( + [[0, 1], [2, 3], [4, 5], [6, 7]], + columns=["foo", None], + index=np.arange(4), + ), + {"__index__": lambda x: "abcd"[x]}, + "index_formatter", + ), + ( + DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]}), + {"months": lambda x: x.strftime("%Y-%m")}, + "datetime64_monthformatter", + ), + ( + DataFrame( + { + "hod": pd.to_datetime( + ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f" + ) + } + ), + {"hod": lambda x: x.strftime("%H:%M")}, + "datetime64_hourformatter", + ), + ], +) def test_to_html_formatters(df, formatters, expected, datapath): expected = expected_html(datapath, expected) result = df.to_html(formatters=formatters) @@ -196,55 +214,63 @@ def test_to_html_formatters(df, formatters, expected, datapath): def test_to_html_regression_GH6098(): - df = DataFrame({ - 'clé1': ['a', 'a', 'b', 'b', 'a'], - 'clé2': ['1er', '2ème', '1er', '2ème', '1er'], - 'données1': np.random.randn(5), - 'données2': np.random.randn(5)}) + df = DataFrame( + { + "clé1": ["a", "a", "b", "b", "a"], + "clé2": ["1er", "2ème", "1er", "2ème", "1er"], + "données1": np.random.randn(5), + "données2": np.random.randn(5), + } + ) # it works - df.pivot_table(index=['clé1'], columns=['clé2'])._repr_html_() + df.pivot_table(index=["clé1"], columns=["clé2"])._repr_html_() def test_to_html_truncate(datapath): - index = pd.date_range(start='20010101', freq='D', periods=20) + index = pd.date_range(start="20010101", freq="D", periods=20) df = DataFrame(index=index, columns=range(20)) result = df.to_html(max_rows=8, max_cols=4) - expected = expected_html(datapath, 'truncate') + expected = expected_html(datapath, "truncate") assert result == expected -@pytest.mark.parametrize('sparsify,expected', [ - (True, 'truncate_multi_index'), - (False, 'truncate_multi_index_sparse_off') -]) +@pytest.mark.parametrize( + "sparsify,expected", + [(True, "truncate_multi_index"), (False, "truncate_multi_index_sparse_off")], +) def test_to_html_truncate_multi_index(sparsify, expected, datapath): - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] df = 
DataFrame(index=arrays, columns=arrays) result = df.to_html(max_rows=7, max_cols=7, sparsify=sparsify) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('option,result,expected', [ - (None, lambda df: df.to_html(), '1'), - (None, lambda df: df.to_html(border=0), '0'), - (0, lambda df: df.to_html(), '0'), - (0, lambda df: df._repr_html_(), '0'), -]) +@pytest.mark.parametrize( + "option,result,expected", + [ + (None, lambda df: df.to_html(), "1"), + (None, lambda df: df.to_html(border=0), "0"), + (0, lambda df: df.to_html(), "0"), + (0, lambda df: df._repr_html_(), "0"), + ], +) def test_to_html_border(option, result, expected): - df = DataFrame({'A': [1, 2]}) + df = DataFrame({"A": [1, 2]}) if option is None: result = result(df) else: - with option_context('display.html.border', option): + with option_context("display.html.border", option): result = result(df) expected = 'border="{}"'.format(expected) assert expected in result -@pytest.mark.parametrize('biggie_df_fixture', ['mixed'], indirect=True) +@pytest.mark.parametrize("biggie_df_fixture", ["mixed"], indirect=True) def test_to_html(biggie_df_fixture): # TODO: split this test df = biggie_df_fixture @@ -257,15 +283,14 @@ def test_to_html(biggie_df_fixture): assert isinstance(s, str) - df.to_html(columns=['B', 'A'], col_space=17) - df.to_html(columns=['B', 'A'], - formatters={'A': lambda x: '{x:.1f}'.format(x=x)}) + df.to_html(columns=["B", "A"], col_space=17) + df.to_html(columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)}) - df.to_html(columns=['B', 'A'], float_format=str) - df.to_html(columns=['B', 'A'], col_space=12, float_format=str) + df.to_html(columns=["B", "A"], float_format=str) + df.to_html(columns=["B", "A"], col_space=12, float_format=str) -@pytest.mark.parametrize('biggie_df_fixture', ['empty'], indirect=True) +@pytest.mark.parametrize("biggie_df_fixture", ["empty"], indirect=True) def test_to_html_empty_dataframe(biggie_df_fixture): df = biggie_df_fixture df.to_html() @@ -274,55 +299,62 @@ def test_to_html_empty_dataframe(biggie_df_fixture): def test_to_html_filename(biggie_df_fixture, tmpdir): df = biggie_df_fixture expected = df.to_html() - path = tmpdir.join('test.html') + path = tmpdir.join("test.html") df.to_html(path) result = path.read() assert result == expected def test_to_html_with_no_bold(): - df = DataFrame({'x': np.random.randn(5)}) + df = DataFrame({"x": np.random.randn(5)}) html = df.to_html(bold_rows=False) result = html[html.find("")] - assert 'B' not in result - - -@pytest.mark.parametrize('columns,justify,expected', [ - (MultiIndex.from_tuples( - list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), - names=['CL0', 'CL1']), - 'left', - 'multiindex_1'), - - (MultiIndex.from_tuples( - list(zip(range(4), np.mod(range(4), 2)))), - 'right', - 'multiindex_2') -]) + result = float_frame.to_html(columns=["A"]) + assert "B" not in result + + +@pytest.mark.parametrize( + "columns,justify,expected", + [ + ( + MultiIndex.from_tuples( + list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))), + names=["CL0", "CL1"], + ), + "left", + "multiindex_1", + ), + ( + MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))), + "right", + "multiindex_2", + ), + ], +) def test_to_html_multiindex(columns, justify, expected, datapath): - df = DataFrame([list('abcd'), list('efgh')], columns=columns) + df = DataFrame([list("abcd"), list("efgh")], columns=columns) result = df.to_html(justify=justify) expected = expected_html(datapath, expected) assert result == 
expected def test_to_html_justify(justify, datapath): - df = DataFrame({'A': [6, 30000, 2], - 'B': [1, 2, 70000], - 'C': [223442, 0, 1]}, - columns=['A', 'B', 'C']) + df = DataFrame( + {"A": [6, 30000, 2], "B": [1, 2, 70000], "C": [223442, 0, 1]}, + columns=["A", "B", "C"], + ) result = df.to_html(justify=justify) - expected = expected_html(datapath, 'justify').format(justify=justify) + expected = expected_html(datapath, "justify").format(justify=justify) assert result == expected -@pytest.mark.parametrize("justify", ["super-right", "small-left", - "noinherit", "tiny", "pandas"]) +@pytest.mark.parametrize( + "justify", ["super-right", "small-left", "noinherit", "tiny", "pandas"] +) def test_to_html_invalid_justify(justify): # GH 17527 df = DataFrame() @@ -334,50 +366,47 @@ def test_to_html_invalid_justify(justify): def test_to_html_index(datapath): # TODO: split this test - index = ['foo', 'bar', 'baz'] - df = DataFrame({'A': [1, 2, 3], - 'B': [1.2, 3.4, 5.6], - 'C': ['one', 'two', np.nan]}, - columns=['A', 'B', 'C'], - index=index) - expected_with_index = expected_html(datapath, 'index_1') + index = ["foo", "bar", "baz"] + df = DataFrame( + {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]}, + columns=["A", "B", "C"], + index=index, + ) + expected_with_index = expected_html(datapath, "index_1") assert df.to_html() == expected_with_index - expected_without_index = expected_html(datapath, 'index_2') + expected_without_index = expected_html(datapath, "index_2") result = df.to_html(index=False) for i in index: assert i not in result assert result == expected_without_index - df.index = Index(['foo', 'bar', 'baz'], name='idx') - expected_with_index = expected_html(datapath, 'index_3') + df.index = Index(["foo", "bar", "baz"], name="idx") + expected_with_index = expected_html(datapath, "index_3") assert df.to_html() == expected_with_index assert df.to_html(index=False) == expected_without_index - tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] + tuples = [("foo", "car"), ("foo", "bike"), ("bar", "car")] df.index = MultiIndex.from_tuples(tuples) - expected_with_index = expected_html(datapath, 'index_4') + expected_with_index = expected_html(datapath, "index_4") assert df.to_html() == expected_with_index result = df.to_html(index=False) - for i in ['foo', 'bar', 'car', 'bike']: + for i in ["foo", "bar", "car", "bike"]: assert i not in result # must be the same result as normal index assert result == expected_without_index - df.index = MultiIndex.from_tuples(tuples, names=['idx1', 'idx2']) - expected_with_index = expected_html(datapath, 'index_5') + df.index = MultiIndex.from_tuples(tuples, names=["idx1", "idx2"]) + expected_with_index = expected_html(datapath, "index_5") assert df.to_html() == expected_with_index assert df.to_html(index=False) == expected_without_index -@pytest.mark.parametrize('classes', [ - "sortable draggable", - ["sortable", "draggable"] -]) +@pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]]) def test_to_html_with_classes(classes, datapath): df = DataFrame() - expected = expected_html(datapath, 'with_classes') + expected = expected_html(datapath, "with_classes") result = df.to_html(classes=classes) assert result == expected @@ -386,160 +415,195 @@ def test_to_html_no_index_max_rows(datapath): # GH 14998 df = DataFrame({"A": [1, 2, 3, 4]}) result = df.to_html(index=False, max_rows=1) - expected = expected_html(datapath, 'gh14998_expected_output') + expected = expected_html(datapath, "gh14998_expected_output") 
assert result == expected def test_to_html_multiindex_max_cols(datapath): # GH 6131 - index = MultiIndex(levels=[['ba', 'bb', 'bc'], ['ca', 'cb', 'cc']], - codes=[[0, 1, 2], [0, 1, 2]], - names=['b', 'c']) - columns = MultiIndex(levels=[['d'], ['aa', 'ab', 'ac']], - codes=[[0, 0, 0], [0, 1, 2]], - names=[None, 'a']) + index = MultiIndex( + levels=[["ba", "bb", "bc"], ["ca", "cb", "cc"]], + codes=[[0, 1, 2], [0, 1, 2]], + names=["b", "c"], + ) + columns = MultiIndex( + levels=[["d"], ["aa", "ab", "ac"]], + codes=[[0, 0, 0], [0, 1, 2]], + names=[None, "a"], + ) data = np.array( - [[1., np.nan, np.nan], [np.nan, 2., np.nan], [np.nan, np.nan, 3.]]) + [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]] + ) df = DataFrame(data, index, columns) result = df.to_html(max_cols=2) - expected = expected_html(datapath, 'gh6131_expected_output') + expected = expected_html(datapath, "gh6131_expected_output") assert result == expected def test_to_html_multi_indexes_index_false(datapath): # GH 22579 - df = DataFrame({'a': range(10), 'b': range(10, 20), 'c': range(10, 20), - 'd': range(10, 20)}) - df.columns = MultiIndex.from_product([['a', 'b'], ['c', 'd']]) - df.index = MultiIndex.from_product([['a', 'b'], - ['c', 'd', 'e', 'f', 'g']]) + df = DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(10, 20), "d": range(10, 20)} + ) + df.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]]) + df.index = MultiIndex.from_product([["a", "b"], ["c", "d", "e", "f", "g"]]) result = df.to_html(index=False) - expected = expected_html(datapath, 'gh22579_expected_output') + expected = expected_html(datapath, "gh22579_expected_output") assert result == expected -@pytest.mark.parametrize('index_names', [True, False]) -@pytest.mark.parametrize('header', [True, False]) -@pytest.mark.parametrize('index', [True, False]) -@pytest.mark.parametrize('column_index, column_type', [ - (Index([0, 1]), 'unnamed_standard'), - (Index([0, 1], name='columns.name'), 'named_standard'), - (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a'], ['b', 'c']], names=['columns.name.0', - 'columns.name.1']), 'named_multi') -]) -@pytest.mark.parametrize('row_index, row_type', [ - (Index([0, 1]), 'unnamed_standard'), - (Index([0, 1], name='index.name'), 'named_standard'), - (MultiIndex.from_product([['a'], ['b', 'c']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a'], ['b', 'c']], names=['index.name.0', - 'index.name.1']), 'named_multi') -]) +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="columns.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["columns.name.0", "columns.name.1"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index([0, 1]), "unnamed_standard"), + (Index([0, 1], name="index.name"), "named_standard"), + (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"), + ( + MultiIndex.from_product( + [["a"], ["b", "c"]], names=["index.name.0", "index.name.1"] + ), + "named_multi", + ), + ], +) def test_to_html_basic_alignment( - datapath, row_index, row_type, column_index, column_type, - index, header, index_names): + datapath, row_index, row_type, column_index, column_type, 
index, header, index_names +): # GH 22747, GH 22579 - df = DataFrame(np.zeros((2, 2), dtype=int), - index=row_index, columns=column_index) - result = df.to_html( - index=index, header=header, index_names=index_names) + df = DataFrame(np.zeros((2, 2), dtype=int), index=row_index, columns=column_index) + result = df.to_html(index=index, header=header, index_names=index_names) if not index: - row_type = 'none' - elif not index_names and row_type.startswith('named'): - row_type = 'un' + row_type + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type if not header: - column_type = 'none' - elif not index_names and column_type.startswith('named'): - column_type = 'un' + column_type + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type - filename = 'index_' + row_type + '_columns_' + column_type + filename = "index_" + row_type + "_columns_" + column_type expected = expected_html(datapath, filename) assert result == expected -@pytest.mark.parametrize('index_names', [True, False]) -@pytest.mark.parametrize('header', [True, False]) -@pytest.mark.parametrize('index', [True, False]) -@pytest.mark.parametrize('column_index, column_type', [ - (Index(np.arange(8)), 'unnamed_standard'), - (Index(np.arange(8), name='columns.name'), 'named_standard'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']), - 'named_multi') -]) -@pytest.mark.parametrize('row_index, row_type', [ - (Index(np.arange(8)), 'unnamed_standard'), - (Index(np.arange(8), name='index.name'), 'named_standard'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']]), 'unnamed_multi'), - (MultiIndex.from_product( - [['a', 'b'], ['c', 'd'], ['e', 'f']], names=['foo', None, 'baz']), - 'named_multi') -]) +@pytest.mark.parametrize("index_names", [True, False]) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize( + "column_index, column_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="columns.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) +@pytest.mark.parametrize( + "row_index, row_type", + [ + (Index(np.arange(8)), "unnamed_standard"), + (Index(np.arange(8), name="index.name"), "named_standard"), + ( + MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]), + "unnamed_multi", + ), + ( + MultiIndex.from_product( + [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"] + ), + "named_multi", + ), + ], +) def test_to_html_alignment_with_truncation( - datapath, row_index, row_type, column_index, column_type, - index, header, index_names): + datapath, row_index, row_type, column_index, column_type, index, header, index_names +): # GH 22747, GH 22579 - df = DataFrame(np.arange(64).reshape(8, 8), - index=row_index, columns=column_index) + df = DataFrame(np.arange(64).reshape(8, 8), index=row_index, columns=column_index) result = df.to_html( - max_rows=4, max_cols=4, - index=index, header=header, index_names=index_names) + max_rows=4, max_cols=4, index=index, header=header, index_names=index_names + ) if not index: - row_type = 'none' - elif not index_names and 
row_type.startswith('named'): - row_type = 'un' + row_type + row_type = "none" + elif not index_names and row_type.startswith("named"): + row_type = "un" + row_type if not header: - column_type = 'none' - elif not index_names and column_type.startswith('named'): - column_type = 'un' + column_type + column_type = "none" + elif not index_names and column_type.startswith("named"): + column_type = "un" + column_type - filename = 'trunc_df_index_' + row_type + '_columns_' + column_type + filename = "trunc_df_index_" + row_type + "_columns_" + column_type expected = expected_html(datapath, filename) assert result == expected -@pytest.mark.parametrize('index', [False, 0]) +@pytest.mark.parametrize("index", [False, 0]) def test_to_html_truncation_index_false_max_rows(datapath, index): # GH 15019 - data = [[1.764052, 0.400157], - [0.978738, 2.240893], - [1.867558, -0.977278], - [0.950088, -0.151357], - [-0.103219, 0.410599]] + data = [ + [1.764052, 0.400157], + [0.978738, 2.240893], + [1.867558, -0.977278], + [0.950088, -0.151357], + [-0.103219, 0.410599], + ] df = DataFrame(data) result = df.to_html(max_rows=4, index=index) - expected = expected_html(datapath, 'gh15019_expected_output') + expected = expected_html(datapath, "gh15019_expected_output") assert result == expected -@pytest.mark.parametrize('index', [False, 0]) -@pytest.mark.parametrize('col_index_named, expected_output', [ - (False, 'gh22783_expected_output'), - (True, 'gh22783_named_columns_index') -]) +@pytest.mark.parametrize("index", [False, 0]) +@pytest.mark.parametrize( + "col_index_named, expected_output", + [(False, "gh22783_expected_output"), (True, "gh22783_named_columns_index")], +) def test_to_html_truncation_index_false_max_cols( - datapath, index, col_index_named, expected_output): + datapath, index, col_index_named, expected_output +): # GH 22783 - data = [[1.764052, 0.400157, 0.978738, 2.240893, 1.867558], - [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599]] + data = [ + [1.764052, 0.400157, 0.978738, 2.240893, 1.867558], + [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599], + ] df = DataFrame(data) if col_index_named: - df.columns.rename('columns.name', inplace=True) + df.columns.rename("columns.name", inplace=True) result = df.to_html(max_cols=4, index=index) expected = expected_html(datapath, expected_output) assert result == expected -@pytest.mark.parametrize('notebook', [True, False]) +@pytest.mark.parametrize("notebook", [True, False]) def test_to_html_notebook_has_style(notebook): df = DataFrame({"A": [1, 2, 3]}) result = df.to_html(notebook=notebook) @@ -556,59 +620,62 @@ def test_to_html_notebook_has_style(notebook): def test_to_html_with_index_names_false(): # GH 16493 - df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'], - name='myindexname')) + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) result = df.to_html(index_names=False) - assert 'myindexname' not in result + assert "myindexname" not in result def test_to_html_with_id(): # GH 8496 - df = DataFrame({"A": [1, 2]}, index=Index(['a', 'b'], - name='myindexname')) + df = DataFrame({"A": [1, 2]}, index=Index(["a", "b"], name="myindexname")) result = df.to_html(index_names=False, table_id="TEST_ID") assert ' id="TEST_ID"' in result -@pytest.mark.parametrize('value,float_format,expected', [ - (0.19999, '%.3f', 'gh21625_expected_output'), - (100.0, '%.0f', 'gh22270_expected_output'), -]) -def test_to_html_float_format_no_fixed_width( - value, float_format, expected, datapath): +@pytest.mark.parametrize( + 
"value,float_format,expected", + [ + (0.19999, "%.3f", "gh21625_expected_output"), + (100.0, "%.0f", "gh22270_expected_output"), + ], +) +def test_to_html_float_format_no_fixed_width(value, float_format, expected, datapath): # GH 21625, GH 22270 - df = DataFrame({'x': [value]}) + df = DataFrame({"x": [value]}) expected = expected_html(datapath, expected) result = df.to_html(float_format=float_format) assert result == expected -@pytest.mark.parametrize("render_links,expected", [ - (True, 'render_links_true'), - (False, 'render_links_false'), -]) +@pytest.mark.parametrize( + "render_links,expected", + [(True, "render_links_true"), (False, "render_links_false")], +) def test_to_html_render_links(render_links, expected, datapath): # GH 2679 data = [ - [0, 'http://pandas.pydata.org/?q1=a&q2=b', 'pydata.org'], - [0, 'www.pydata.org', 'pydata.org'] + [0, "http://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "www.pydata.org", "pydata.org"], ] - df = DataFrame(data, columns=['foo', 'bar', None]) + df = DataFrame(data, columns=["foo", "bar", None]) result = df.to_html(render_links=render_links) expected = expected_html(datapath, expected) assert result == expected -@pytest.mark.parametrize('method,expected', [ - ('to_html', lambda x:lorem_ipsum), - ('_repr_html_', lambda x:lorem_ipsum[:x - 4] + '...') # regression case -]) -@pytest.mark.parametrize('max_colwidth', [10, 20, 50, 100]) +@pytest.mark.parametrize( + "method,expected", + [ + ("to_html", lambda x: lorem_ipsum), + ("_repr_html_", lambda x: lorem_ipsum[: x - 4] + "..."), # regression case + ], +) +@pytest.mark.parametrize("max_colwidth", [10, 20, 50, 100]) def test_ignore_display_max_colwidth(method, expected, max_colwidth): # see gh-17004 df = DataFrame([lorem_ipsum]) - with pd.option_context('display.max_colwidth', max_colwidth): + with pd.option_context("display.max_colwidth", max_colwidth): result = getattr(df, method)() expected = expected(max_colwidth) assert expected in result @@ -627,22 +694,22 @@ def test_to_html_invalid_classes_type(classes): def test_to_html_round_column_headers(): # GH 17280 df = DataFrame([1], columns=[0.55555]) - with pd.option_context('display.precision', 3): + with pd.option_context("display.precision", 3): html = df.to_html(notebook=False) notebook = df.to_html(notebook=True) assert "0.55555" in html assert "0.556" in notebook -@pytest.mark.parametrize("unit", ['100px', '10%', '5em', 150]) +@pytest.mark.parametrize("unit", ["100px", "10%", "5em", 150]) def test_to_html_with_col_space_units(unit): # GH 25941 df = DataFrame(np.random.random(size=(1, 3))) result = df.to_html(col_space=unit) - result = result.split('tbody')[0] + result = result.split("tbody")[0] hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] if isinstance(unit, int): - unit = str(unit) + 'px' + unit = str(unit) + "px" for h in hdrs: expected = ''.format(unit=unit) assert expected in h diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index b9f28ec36d021..a8a6a96f60d60 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -9,32 +9,31 @@ class TestToLatex: - def test_to_latex_filename(self, float_frame): - with tm.ensure_clean('test.tex') as path: + with tm.ensure_clean("test.tex") as path: float_frame.to_latex(path) - with open(path, 'r') as f: + with open(path, "r") as f: assert float_frame.to_latex() == f.read() # test with utf-8 and encoding option (GH 7061) - df = DataFrame([['au\xdfgangen']]) - with 
tm.ensure_clean('test.tex') as path: - df.to_latex(path, encoding='utf-8') - with codecs.open(path, 'r', encoding='utf-8') as f: + df = DataFrame([["au\xdfgangen"]]) + with tm.ensure_clean("test.tex") as path: + df.to_latex(path, encoding="utf-8") + with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() # test with utf-8 without encoding option - with tm.ensure_clean('test.tex') as path: + with tm.ensure_clean("test.tex") as path: df.to_latex(path) - with codecs.open(path, 'r', encoding='utf-8') as f: + with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() def test_to_latex(self, float_frame): # it works! float_frame.to_latex() - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex() withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -63,10 +62,10 @@ def test_to_latex(self, float_frame): def test_to_latex_format(self, float_frame): # GH Bug #9402 - float_frame.to_latex(column_format='ccc') + float_frame.to_latex(column_format="ccc") - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(column_format='ccc') + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(column_format="ccc") withindex_expected = r"""\begin{tabular}{ccc} \toprule {} & a & b \\ @@ -103,19 +102,26 @@ def test_to_latex_empty(self): assert result == expected def test_to_latex_with_formatters(self): - df = DataFrame({'datetime64': [datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3)], - 'float': [1.0, 2.0, 3.0], - 'int': [1, 2, 3], - 'object': [(1, 2), True, False], - }) - - formatters = {'datetime64': lambda x: x.strftime('%Y-%m'), - 'float': lambda x: '[{x: 4.1f}]'.format(x=x), - 'int': lambda x: '0x{x:x}'.format(x=x), - 'object': lambda x: '-{x!s}-'.format(x=x), - '__index__': lambda x: 'index: {x}'.format(x=x)} + df = DataFrame( + { + "datetime64": [ + datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3), + ], + "float": [1.0, 2.0, 3.0], + "int": [1, 2, 3], + "object": [(1, 2), True, False], + } + ) + + formatters = { + "datetime64": lambda x: x.strftime("%Y-%m"), + "float": lambda x: "[{x: 4.1f}]".format(x=x), + "int": lambda x: "0x{x:x}".format(x=x), + "object": lambda x: "-{x!s}-".format(x=x), + "__index__": lambda x: "index: {x}".format(x=x), + } result = df.to_latex(formatters=dict(formatters)) expected = r"""\begin{tabular}{llrrl} @@ -131,7 +137,7 @@ def test_to_latex_with_formatters(self): assert result == expected def test_to_latex_multiindex(self): - df = DataFrame({('x', 'y'): ['a']}) + df = DataFrame({("x", "y"): ["a"]}) result = df.to_latex() expected = r"""\begin{tabular}{ll} \toprule @@ -157,13 +163,15 @@ def test_to_latex_multiindex(self): assert result == expected - df = DataFrame.from_dict({ - ('c1', 0): pd.Series({x: x for x in range(4)}), - ('c1', 1): pd.Series({x: x + 4 for x in range(4)}), - ('c2', 0): pd.Series({x: x for x in range(4)}), - ('c2', 1): pd.Series({x: x + 4 for x in range(4)}), - ('c3', 0): pd.Series({x: x for x in range(4)}), - }).T + df = DataFrame.from_dict( + { + ("c1", 0): pd.Series({x: x for x in range(4)}), + ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c2", 0): pd.Series({x: x for x in range(4)}), + ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c3", 0): pd.Series({x: x for x in range(4)}), + } + ).T result = df.to_latex() expected = r"""\begin{tabular}{llrrrr} \toprule @@ -182,7 +190,7 @@ def 
test_to_latex_multiindex(self): # GH 14184 df = df.T - df.columns.names = ['a', 'b'] + df.columns.names = ["a", "b"] result = df.to_latex() expected = r"""\begin{tabular}{lrrrrr} \toprule @@ -199,10 +207,8 @@ def test_to_latex_multiindex(self): assert result == expected # GH 10660 - df = pd.DataFrame({'a': [0, 0, 1, 1], - 'b': list('abab'), - 'c': [1, 2, 3, 4]}) - result = df.set_index(['a', 'b']).to_latex() + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + result = df.set_index(["a", "b"]).to_latex() expected = r"""\begin{tabular}{llr} \toprule & & c \\ @@ -218,7 +224,7 @@ def test_to_latex_multiindex(self): assert result == expected - result = df.groupby('a').describe().to_latex() + result = df.groupby("a").describe().to_latex() expected = r"""\begin{tabular}{lrrrrrrrr} \toprule {} & \multicolumn{8}{l}{c} \\ @@ -241,8 +247,9 @@ def test_to_latex_multiindex_dupe_level(self): # ONLY happen if all higher order indices (to the left) are # equal too. In this test, 'c' has to be printed both times # because the higher order index 'A' != 'B'. - df = pd.DataFrame(index=pd.MultiIndex.from_tuples( - [('A', 'c'), ('B', 'c')]), columns=['col']) + df = pd.DataFrame( + index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] + ) result = df.to_latex() expected = r"""\begin{tabular}{lll} \toprule @@ -256,13 +263,15 @@ def test_to_latex_multiindex_dupe_level(self): assert result == expected def test_to_latex_multicolumnrow(self): - df = pd.DataFrame({ - ('c1', 0): {x: x for x in range(5)}, - ('c1', 1): {x: x + 5 for x in range(5)}, - ('c2', 0): {x: x for x in range(5)}, - ('c2', 1): {x: x + 5 for x in range(5)}, - ('c3', 0): {x: x for x in range(5)} - }) + df = pd.DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) result = df.to_latex() expected = r"""\begin{tabular}{lrrrrr} \toprule @@ -313,8 +322,7 @@ def test_to_latex_multicolumnrow(self): assert result == expected df.index = df.T.index - result = df.T.to_latex(multirow=True, multicolumn=True, - multicolumn_format='c') + result = df.T.to_latex(multirow=True, multicolumn=True, multicolumn_format="c") expected = r"""\begin{tabular}{llrrrrr} \toprule & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ @@ -333,19 +341,15 @@ def test_to_latex_multicolumnrow(self): assert result == expected def test_to_latex_escape(self): - a = 'a' - b = 'b' + a = "a" + b = "b" - test_dict = {'co$e^x$': {a: "a", - b: "b"}, - 'co^l1': {a: "a", - b: "b"}} + test_dict = {"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}} unescaped_result = DataFrame(test_dict).to_latex(escape=False) - escaped_result = DataFrame(test_dict).to_latex( - ) # default: escape=True + escaped_result = DataFrame(test_dict).to_latex() # default: escape=True - unescaped_expected = r'''\begin{tabular}{lll} + unescaped_expected = r"""\begin{tabular}{lll} \toprule {} & co$e^x$ & co^l1 \\ \midrule @@ -353,9 +357,9 @@ def test_to_latex_escape(self): b & b & b \\ \bottomrule \end{tabular} -''' +""" - escaped_expected = r'''\begin{tabular}{lll} + escaped_expected = r"""\begin{tabular}{lll} \toprule {} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ \midrule @@ -363,7 +367,7 @@ def test_to_latex_escape(self): b & b & b \\ \bottomrule \end{tabular} -''' +""" assert unescaped_result == unescaped_expected assert escaped_result == escaped_expected @@ -387,7 +391,7 @@ def 
test_to_latex_special_escape(self): def test_to_latex_longtable(self, float_frame): float_frame.to_latex(longtable=True) - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(longtable=True) withindex_expected = r"""\begin{longtable}{lrl} \toprule @@ -427,17 +431,16 @@ def test_to_latex_longtable(self, float_frame): assert withoutindex_result == withoutindex_expected - df = DataFrame({'a': [1, 2]}) + df = DataFrame({"a": [1, 2]}) with1column_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{1}" in with1column_result - df = DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) with3columns_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{3}" in with3columns_result def test_to_latex_escape_special_chars(self): - special_characters = ['&', '%', '$', '#', '_', '{', '}', '~', '^', - '\\'] + special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) observed = df.to_latex() expected = r"""\begin{tabular}{ll} @@ -462,7 +465,7 @@ def test_to_latex_escape_special_chars(self): def test_to_latex_no_header(self): # GH 7124 - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(header=False) withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -487,8 +490,8 @@ def test_to_latex_no_header(self): def test_to_latex_specified_header(self): # GH 7124 - df = DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(header=['AA', 'BB']) + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(header=["AA", "BB"]) withindex_expected = r"""\begin{tabular}{lrl} \toprule {} & AA & BB \\ @@ -501,7 +504,7 @@ def test_to_latex_specified_header(self): assert withindex_result == withindex_expected - withoutindex_result = df.to_latex(header=['AA', 'BB'], index=False) + withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule AA & BB \\ @@ -514,7 +517,7 @@ def test_to_latex_specified_header(self): assert withoutindex_result == withoutindex_expected - withoutescape_result = df.to_latex(header=['$A$', '$B$'], escape=False) + withoutescape_result = df.to_latex(header=["$A$", "$B$"], escape=False) withoutescape_expected = r"""\begin{tabular}{lrl} \toprule {} & $A$ & $B$ \\ @@ -528,14 +531,14 @@ def test_to_latex_specified_header(self): assert withoutescape_result == withoutescape_expected with pytest.raises(ValueError): - df.to_latex(header=['A']) + df.to_latex(header=["A"]) def test_to_latex_decimal(self, float_frame): # GH 12031 float_frame.to_latex() - df = DataFrame({'a': [1.0, 2.1], 'b': ['b1', 'b2']}) - withindex_result = df.to_latex(decimal=',') + df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) + withindex_result = df.to_latex(decimal=",") withindex_expected = r"""\begin{tabular}{lrl} \toprule @@ -550,7 +553,7 @@ def test_to_latex_decimal(self, float_frame): assert withindex_result == withindex_expected def test_to_latex_series(self): - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) withindex_result = s.to_latex() withindex_expected = r"""\begin{tabular}{ll} \toprule @@ -566,7 +569,7 @@ def test_to_latex_series(self): def test_to_latex_bold_rows(self): # GH 16707 - df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) observed = 
df.to_latex(bold_rows=True) expected = r"""\begin{tabular}{lrl} \toprule @@ -581,7 +584,7 @@ def test_to_latex_bold_rows(self): def test_to_latex_no_bold_rows(self): # GH 16707 - df = pd.DataFrame({'a': [1, 2], 'b': ['b1', 'b2']}) + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) observed = df.to_latex(bold_rows=False) expected = r"""\begin{tabular}{lrl} \toprule @@ -594,9 +597,9 @@ def test_to_latex_no_bold_rows(self): """ assert observed == expected - @pytest.mark.parametrize('name0', [None, 'named0']) - @pytest.mark.parametrize('name1', [None, 'named1']) - @pytest.mark.parametrize('axes', [[0], [1], [0, 1]]) + @pytest.mark.parametrize("name0", [None, "named0"]) + @pytest.mark.parametrize("name1", [None, "named1"]) + @pytest.mark.parametrize("axes", [[0], [1], [0, 1]]) def test_to_latex_multiindex_names(self, name0, name1, axes): # GH 18667 names = [name0, name1] @@ -605,12 +608,14 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): for idx in axes: df.axes[idx].names = names - idx_names = tuple(n or '{}' for n in names) - idx_names_row = ('%s & %s & & & & \\\\\n' % idx_names - if (0 in axes and any(names)) else '') - placeholder = '{}' if any(names) and 1 in axes else ' ' - col_names = [n if (bool(n) and 1 in axes) else placeholder - for n in names] + idx_names = tuple(n or "{}" for n in names) + idx_names_row = ( + "%s & %s & & & & \\\\\n" % idx_names + if (0 in axes and any(names)) + else "" + ) + placeholder = "{}" if any(names) and 1 in axes else " " + col_names = [n if (bool(n) and 1 in axes) else placeholder for n in names] observed = df.to_latex() expected = r"""\begin{tabular}{llrrrr} \toprule @@ -623,16 +628,18 @@ def test_to_latex_multiindex_names(self, name0, name1, axes): & 4 & -1 & -1 & -1 & -1 \\ \bottomrule \end{tabular} -""" % tuple(list(col_names) + [idx_names_row]) +""" % tuple( + list(col_names) + [idx_names_row] + ) assert observed == expected - @pytest.mark.parametrize('one_row', [True, False]) + @pytest.mark.parametrize("one_row", [True, False]) def test_to_latex_multiindex_nans(self, one_row): # GH 14249 - df = pd.DataFrame({'a': [None, 1], 'b': [2, 3], 'c': [4, 5]}) + df = pd.DataFrame({"a": [None, 1], "b": [2, 3], "c": [4, 5]}) if one_row: df = df.iloc[[0]] - observed = df.set_index(['a', 'b']).to_latex() + observed = df.set_index(["a", "b"]).to_latex() expected = r"""\begin{tabular}{llr} \toprule & & c \\ @@ -665,8 +672,8 @@ def test_to_latex_non_string_index(self): def test_to_latex_midrule_location(self): # GH 18326 - df = pd.DataFrame({'a': [1, 2]}) - df.index.name = 'foo' + df = pd.DataFrame({"a": [1, 2]}) + df.index.name = "foo" observed = df.to_latex(index_names=False) expected = r"""\begin{tabular}{lr} \toprule @@ -682,7 +689,7 @@ def test_to_latex_midrule_location(self): def test_to_latex_multiindex_empty_name(self): # GH 18669 - mi = pd.MultiIndex.from_product([[1, 2]], names=['']) + mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) df = pd.DataFrame(-1, index=mi, columns=range(4)) observed = df.to_latex() expected = r"""\begin{tabular}{lrrrr} @@ -700,7 +707,7 @@ def test_to_latex_multiindex_empty_name(self): def test_to_latex_float_format_no_fixed_width(self): # GH 21625 - df = DataFrame({'x': [0.19999]}) + df = DataFrame({"x": [0.19999]}) expected = r"""\begin{tabular}{lr} \toprule {} & x \\ @@ -709,10 +716,10 @@ def test_to_latex_float_format_no_fixed_width(self): \bottomrule \end{tabular} """ - assert df.to_latex(float_format='%.3f') == expected + assert df.to_latex(float_format="%.3f") == expected # GH 22270 - df = 
DataFrame({'x': [100.0]}) + df = DataFrame({"x": [100.0]}) expected = r"""\begin{tabular}{lr} \toprule {} & x \\ @@ -721,13 +728,14 @@ def test_to_latex_float_format_no_fixed_width(self): \bottomrule \end{tabular} """ - assert df.to_latex(float_format='%.0f') == expected + assert df.to_latex(float_format="%.0f") == expected def test_to_latex_multindex_header(self): # GH 16718 - df = (pd.DataFrame({'a': [0], 'b': [1], 'c': [2], 'd': [3]}) - .set_index(['a', 'b'])) - observed = df.to_latex(header=['r1', 'r2']) + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( + ["a", "b"] + ) + observed = df.to_latex(header=["r1", "r2"]) expected = r"""\begin{tabular}{llrr} \toprule & & r1 & r2 \\ diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 5bb7fe8e6367d..3ccb29f07dc83 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -45,15 +45,46 @@ import pandas from pandas import ( - Categorical, DataFrame, Index, MultiIndex, NaT, Period, RangeIndex, Series, - SparseDataFrame, SparseSeries, Timestamp, bdate_range, date_range, - period_range, timedelta_range, to_msgpack) + Categorical, + DataFrame, + Index, + MultiIndex, + NaT, + Period, + RangeIndex, + Series, + SparseDataFrame, + SparseSeries, + Timestamp, + bdate_range, + date_range, + period_range, + timedelta_range, + to_msgpack, +) from pandas.tseries.offsets import ( - FY5253, BusinessDay, BusinessHour, CustomBusinessDay, DateOffset, Day, - Easter, Hour, LastWeekOfMonth, Minute, MonthBegin, MonthEnd, QuarterBegin, - QuarterEnd, SemiMonthBegin, SemiMonthEnd, Week, WeekOfMonth, YearBegin, - YearEnd) + FY5253, + BusinessDay, + BusinessHour, + CustomBusinessDay, + DateOffset, + Day, + Easter, + Hour, + LastWeekOfMonth, + Minute, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) _loose_version = LooseVersion(pandas.__version__) @@ -66,8 +97,8 @@ def _create_sp_series(): arr[7:12] = nan arr[-1:] = nan - bseries = SparseSeries(arr, kind='block') - bseries.name = 'bseries' + bseries = SparseSeries(arr, kind="block") + bseries.name = "bseries" return bseries @@ -79,21 +110,23 @@ def _create_sp_tsseries(): arr[7:12] = nan arr[-1:] = nan - date_index = bdate_range('1/1/2011', periods=len(arr)) - bseries = SparseSeries(arr, index=date_index, kind='block') - bseries.name = 'btsseries' + date_index = bdate_range("1/1/2011", periods=len(arr)) + bseries = SparseSeries(arr, index=date_index, kind="block") + bseries.name = "btsseries" return bseries def _create_sp_frame(): nan = np.nan - data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10).astype(np.int64), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + data = { + "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C": np.arange(10).astype(np.int64), + "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } - dates = bdate_range('1/1/2011', periods=10) + dates = bdate_range("1/1/2011", periods=10) return SparseDataFrame(data, index=dates) @@ -101,132 +134,165 @@ def create_data(): """ create the pickle/msgpack data """ data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.] 
+ "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], } - scalars = dict(timestamp=Timestamp('20130101'), - period=Period('2012', 'M')) + scalars = dict(timestamp=Timestamp("20130101"), period=Period("2012", "M")) - index = dict(int=Index(np.arange(10)), - date=date_range('20130101', periods=10), - period=period_range('2013-01-01', freq='M', periods=10), - float=Index(np.arange(10, dtype=np.float64)), - uint=Index(np.arange(10, dtype=np.uint64)), - timedelta=timedelta_range('00:00:00', freq='30T', periods=10)) + index = dict( + int=Index(np.arange(10)), + date=date_range("20130101", periods=10), + period=period_range("2013-01-01", freq="M", periods=10), + float=Index(np.arange(10, dtype=np.float64)), + uint=Index(np.arange(10, dtype=np.uint64)), + timedelta=timedelta_range("00:00:00", freq="30T", periods=10), + ) - index['range'] = RangeIndex(10) + index["range"] = RangeIndex(10) - if _loose_version >= LooseVersion('0.21'): + if _loose_version >= LooseVersion("0.21"): from pandas import interval_range - index['interval'] = interval_range(0, periods=10) - - mi = dict(reg2=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', - 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', - 'two', 'one', 'two']])), - names=['first', 'second'])) - - series = dict(float=Series(data['A']), - int=Series(data['B']), - mixed=Series(data['E']), - ts=Series(np.arange(10).astype(np.int64), - index=date_range('20130101', periods=10)), - mi=Series(np.arange(5).astype(np.float64), - index=MultiIndex.from_tuples( - tuple(zip(*[[1, 1, 2, 2, 2], - [3, 4, 3, 4, 5]])), - names=['one', 'two'])), - dup=Series(np.arange(5).astype(np.float64), - index=['A', 'B', 'C', 'D', 'A']), - cat=Series(Categorical(['foo', 'bar', 'baz'])), - dt=Series(date_range('20130101', periods=5)), - dt_tz=Series(date_range('20130101', periods=5, - tz='US/Eastern')), - period=Series([Period('2000Q1')] * 5)) + + index["interval"] = interval_range(0, periods=10) + + mi = dict( + reg2=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ), + names=["first", "second"], + ) + ) + + series = dict( + float=Series(data["A"]), + int=Series(data["B"]), + mixed=Series(data["E"]), + ts=Series( + np.arange(10).astype(np.int64), index=date_range("20130101", periods=10) + ), + mi=Series( + np.arange(5).astype(np.float64), + index=MultiIndex.from_tuples( + tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"] + ), + ), + dup=Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]), + cat=Series(Categorical(["foo", "bar", "baz"])), + dt=Series(date_range("20130101", periods=5)), + dt_tz=Series(date_range("20130101", periods=5, tz="US/Eastern")), + period=Series([Period("2000Q1")] * 5), + ) mixed_dup_df = DataFrame(data) mixed_dup_df.columns = list("ABCDA") - frame = dict(float=DataFrame({'A': series['float'], - 'B': series['float'] + 1}), - int=DataFrame({'A': series['int'], - 'B': series['int'] + 1}), - mixed=DataFrame({k: data[k] - for k in ['A', 'B', 'C', 'D']}), - mi=DataFrame({'A': np.arange(5).astype(np.float64), - 'B': np.arange(5).astype(np.int64)}, - index=MultiIndex.from_tuples( - tuple(zip(*[['bar', 'bar', 'baz', - 'baz', 'baz'], - ['one', 'two', 'one', - 'two', 'three']])), - names=['first', 'second'])), - 
dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), - columns=['A', 'B', 'A']), - cat_onecol=DataFrame({'A': Categorical(['foo', 'bar'])}), - cat_and_float=DataFrame({ - 'A': Categorical(['foo', 'bar', 'baz']), - 'B': np.arange(3).astype(np.int64)}), - mixed_dup=mixed_dup_df, - dt_mixed_tzs=DataFrame({ - 'A': Timestamp('20130102', tz='US/Eastern'), - 'B': Timestamp('20130603', tz='CET')}, index=range(5)), - dt_mixed2_tzs=DataFrame({ - 'A': Timestamp('20130102', tz='US/Eastern'), - 'B': Timestamp('20130603', tz='CET'), - 'C': Timestamp('20130603', tz='UTC')}, index=range(5)) - ) - - cat = dict(int8=Categorical(list('abcdefg')), - int16=Categorical(np.arange(1000)), - int32=Categorical(np.arange(10000))) - - timestamp = dict(normal=Timestamp('2011-01-01'), - nat=NaT, - tz=Timestamp('2011-01-01', tz='US/Eastern')) - - timestamp['freq'] = Timestamp('2011-01-01', freq='D') - timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', - freq='M') - - off = {'DateOffset': DateOffset(years=1), - 'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824), - 'BusinessDay': BusinessDay(offset=timedelta(seconds=9)), - 'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'), - 'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'), - 'SemiMonthBegin': SemiMonthBegin(day_of_month=9), - 'SemiMonthEnd': SemiMonthEnd(day_of_month=24), - 'MonthBegin': MonthBegin(1), - 'MonthEnd': MonthEnd(1), - 'QuarterBegin': QuarterBegin(1), - 'QuarterEnd': QuarterEnd(1), - 'Day': Day(1), - 'YearBegin': YearBegin(1), - 'YearEnd': YearEnd(1), - 'Week': Week(1), - 'Week_Tues': Week(2, normalize=False, weekday=1), - 'WeekOfMonth': WeekOfMonth(week=3, weekday=4), - 'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3), - 'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"), - 'Easter': Easter(), - 'Hour': Hour(1), - 'Minute': Minute(1)} - - return dict(series=series, - frame=frame, - index=index, - scalars=scalars, - mi=mi, - sp_series=dict(float=_create_sp_series(), - ts=_create_sp_tsseries()), - sp_frame=dict(float=_create_sp_frame()), - cat=cat, - timestamp=timestamp, - offsets=off) + frame = dict( + float=DataFrame({"A": series["float"], "B": series["float"] + 1}), + int=DataFrame({"A": series["int"], "B": series["int"] + 1}), + mixed=DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}), + mi=DataFrame( + {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)}, + index=MultiIndex.from_tuples( + tuple( + zip( + *[ + ["bar", "bar", "baz", "baz", "baz"], + ["one", "two", "one", "two", "three"], + ] + ) + ), + names=["first", "second"], + ), + ), + dup=DataFrame( + np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"] + ), + cat_onecol=DataFrame({"A": Categorical(["foo", "bar"])}), + cat_and_float=DataFrame( + { + "A": Categorical(["foo", "bar", "baz"]), + "B": np.arange(3).astype(np.int64), + } + ), + mixed_dup=mixed_dup_df, + dt_mixed_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + }, + index=range(5), + ), + dt_mixed2_tzs=DataFrame( + { + "A": Timestamp("20130102", tz="US/Eastern"), + "B": Timestamp("20130603", tz="CET"), + "C": Timestamp("20130603", tz="UTC"), + }, + index=range(5), + ), + ) + + cat = dict( + int8=Categorical(list("abcdefg")), + int16=Categorical(np.arange(1000)), + int32=Categorical(np.arange(10000)), + ) + + timestamp = dict( + normal=Timestamp("2011-01-01"), + nat=NaT, + tz=Timestamp("2011-01-01", tz="US/Eastern"), + ) + + timestamp["freq"] = Timestamp("2011-01-01", freq="D") 
+ timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo", freq="M") + + off = { + "DateOffset": DateOffset(years=1), + "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824), + "BusinessDay": BusinessDay(offset=timedelta(seconds=9)), + "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"), + "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"), + "SemiMonthBegin": SemiMonthBegin(day_of_month=9), + "SemiMonthEnd": SemiMonthEnd(day_of_month=24), + "MonthBegin": MonthBegin(1), + "MonthEnd": MonthEnd(1), + "QuarterBegin": QuarterBegin(1), + "QuarterEnd": QuarterEnd(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "YearEnd": YearEnd(1), + "Week": Week(1), + "Week_Tues": Week(2, normalize=False, weekday=1), + "WeekOfMonth": WeekOfMonth(week=3, weekday=4), + "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3), + "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"), + "Easter": Easter(), + "Hour": Hour(1), + "Minute": Minute(1), + } + + return dict( + series=series, + frame=frame, + index=index, + scalars=scalars, + mi=mi, + sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), + sp_frame=dict(float=_create_sp_frame()), + cat=cat, + timestamp=timestamp, + offsets=off, + ) def create_pickle_data(): @@ -242,38 +308,47 @@ def _u(x): def create_msgpack_data(): data = create_data() # Not supported - del data['sp_series'] - del data['sp_frame'] - del data['series']['cat'] - del data['series']['period'] - del data['frame']['cat_onecol'] - del data['frame']['cat_and_float'] - del data['scalars']['period'] - if _loose_version >= LooseVersion('0.21') and ( - _loose_version < LooseVersion('0.23.0')): - del data['index']['interval'] - del data['offsets'] + del data["sp_series"] + del data["sp_frame"] + del data["series"]["cat"] + del data["series"]["period"] + del data["frame"]["cat_onecol"] + del data["frame"]["cat_and_float"] + del data["scalars"]["period"] + if _loose_version >= LooseVersion("0.21") and ( + _loose_version < LooseVersion("0.23.0") + ): + del data["index"]["interval"] + del data["offsets"] return _u(data) def platform_name(): - return '_'.join([str(pandas.__version__), str(pl.machine()), - str(pl.system().lower()), str(pl.python_version())]) + return "_".join( + [ + str(pandas.__version__), + str(pl.machine()), + str(pl.system().lower()), + str(pl.python_version()), + ] + ) def write_legacy_pickles(output_dir): version = pandas.__version__ - print("This script generates a storage file for the current arch, system, " - "and python version") + print( + "This script generates a storage file for the current arch, system, " + "and python version" + ) print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: pickle") - pth = '{0}.pickle'.format(platform_name()) + pth = "{0}.pickle".format(platform_name()) - fh = open(os.path.join(output_dir, pth), 'wb') + fh = open(os.path.join(output_dir, pth), "wb") pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL) fh.close() @@ -284,26 +359,29 @@ def write_legacy_msgpack(output_dir, compress): version = pandas.__version__ - print("This script generates a storage file for the current arch, " - "system, and python version") + print( + "This script generates a storage file for the current arch, " + "system, and python version" + ) print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: msgpack") - pth = '{0}.msgpack'.format(platform_name()) - to_msgpack(os.path.join(output_dir, pth), 
create_msgpack_data(), - compress=compress) + pth = "{0}.msgpack".format(platform_name()) + to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(), compress=compress) print("created msgpack file: %s" % pth) def write_legacy_file(): # force our cwd to be the first searched - sys.path.insert(0, '.') + sys.path.insert(0, ".") if not (3 <= len(sys.argv) <= 4): - exit("Specify output directory and storage type: generate_legacy_" - "storage_files.py " - "") + exit( + "Specify output directory and storage type: generate_legacy_" + "storage_files.py " + "" + ) output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) @@ -312,13 +390,13 @@ def write_legacy_file(): except IndexError: compress_type = None - if storage_type == 'pickle': + if storage_type == "pickle": write_legacy_pickles(output_dir=output_dir) - elif storage_type == 'msgpack': + elif storage_type == "msgpack": write_legacy_msgpack(output_dir=output_dir, compress=compress_type) else: exit("storage_type must be one of {'pickle', 'msgpack'}") -if __name__ == '__main__': +if __name__ == "__main__": write_legacy_file() diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 430acbdac804a..56be84bccc51a 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -8,18 +8,19 @@ def test_compression_roundtrip(compression): - df = pd.DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + df = pd.DataFrame( + [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) with tm.ensure_clean() as path: df.to_json(path, compression=compression) - assert_frame_equal(df, pd.read_json(path, - compression=compression)) + assert_frame_equal(df, pd.read_json(path, compression=compression)) # explicitly ensure file was compressed. 
with tm.decompress_file(path, compression) as fh: - result = fh.read().decode('utf8') + result = fh.read().decode("utf8") assert_frame_equal(df, pd.read_json(result)) @@ -28,7 +29,7 @@ def test_read_zipped_json(datapath): uncompressed_df = pd.read_json(uncompressed_path) compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") - compressed_df = pd.read_json(compressed_path, compression='zip') + compressed_df = pd.read_json(compressed_path, compression="zip") assert_frame_equal(uncompressed_df, compressed_df) @@ -41,11 +42,10 @@ def test_with_s3_url(compression, s3_resource): with tm.ensure_clean() as path: df.to_json(path, compression=compression) - with open(path, 'rb') as f: - s3_resource.Bucket("pandas-test").put_object(Key='test-1', Body=f) + with open(path, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json('s3://pandas-test/test-1', - compression=compression) + roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) assert_frame_equal(df, roundtripped_df) @@ -53,10 +53,8 @@ def test_lines_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') - df.to_json(path, orient='records', lines=True, - compression=compression) - roundtripped_df = pd.read_json(path, lines=True, - compression=compression) + df.to_json(path, orient="records", lines=True, compression=compression) + roundtripped_df = pd.read_json(path, lines=True, compression=compression) assert_frame_equal(df, roundtripped_df) @@ -64,11 +62,9 @@ def test_chunksize_with_compression(compression): with tm.ensure_clean() as path: df = pd.read_json('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}') - df.to_json(path, orient='records', lines=True, - compression=compression) + df.to_json(path, orient="records", lines=True, compression=compression) - res = pd.read_json(path, lines=True, chunksize=1, - compression=compression) + res = pd.read_json(path, lines=True, chunksize=1, compression=compression) roundtripped_df = pd.concat(res) assert_frame_equal(df, roundtripped_df) @@ -90,14 +86,15 @@ def test_read_unsupported_compression_type(): @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) -def test_to_json_compression(compression_only, - read_infer, to_infer): +def test_to_json_compression(compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only if compression == "zip": - pytest.skip("{compression} is not supported " - "for to_csv".format(compression=compression)) + pytest.skip( + "{compression} is not supported " + "for to_csv".format(compression=compression) + ) # We'll complete file extension subsequently. filename = "test." 
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 4cc62d3db124f..28c8837731ec1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -5,559 +5,703 @@ import numpy as np import pytest -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, DatetimeTZDtype, PeriodDtype) +from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype import pandas as pd from pandas import DataFrame import pandas.util.testing as tm from pandas.io.json.table_schema import ( - as_json_table_type, build_table_schema, convert_json_field_to_pandas_type, - convert_pandas_type_to_json_field, set_default_names) + as_json_table_type, + build_table_schema, + convert_json_field_to_pandas_type, + convert_pandas_type_to_json_field, + set_default_names, +) class TestBuildSchema: - def setup_method(self, method): self.df = DataFrame( - {'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=4), - 'D': pd.timedelta_range('1H', periods=4, freq='T'), - }, - index=pd.Index(range(4), name='idx')) + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=4), + "D": pd.timedelta_range("1H", periods=4, freq="T"), + }, + index=pd.Index(range(4), name="idx"), + ) def test_build_table_schema(self): result = build_table_schema(self.df, version=False) expected = { - 'fields': [{'name': 'idx', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}, - {'name': 'D', 'type': 'duration'}, - ], - 'primaryKey': ['idx'] + "fields": [ + {"name": "idx", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["idx"], } assert result == expected result = build_table_schema(self.df) assert "pandas_version" in result def test_series(self): - s = pd.Series([1, 2, 3], name='foo') + s = pd.Series([1, 2, 3], name="foo") result = build_table_schema(s, version=False) - expected = {'fields': [{'name': 'index', 'type': 'integer'}, - {'name': 'foo', 'type': 'integer'}], - 'primaryKey': ['index']} + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "foo", "type": "integer"}, + ], + "primaryKey": ["index"], + } assert result == expected result = build_table_schema(s) - assert 'pandas_version' in result + assert "pandas_version" in result def test_series_unnamed(self): result = build_table_schema(pd.Series([1, 2, 3]), version=False) - expected = {'fields': [{'name': 'index', 'type': 'integer'}, - {'name': 'values', 'type': 'integer'}], - 'primaryKey': ['index']} + expected = { + "fields": [ + {"name": "index", "type": "integer"}, + {"name": "values", "type": "integer"}, + ], + "primaryKey": ["index"], + } assert result == expected def test_multiindex(self): df = self.df.copy() - idx = pd.MultiIndex.from_product([('a', 'b'), (1, 2)]) + idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)]) df.index = idx result = build_table_schema(df, version=False) expected = { - 'fields': [{'name': 'level_0', 'type': 'string'}, - {'name': 'level_1', 'type': 'integer'}, - {'name': 'A', 'type': 'integer'}, - {'name': 'B', 'type': 'string'}, - {'name': 'C', 'type': 'datetime'}, - {'name': 'D', 'type': 'duration'}, - ], - 'primaryKey': ['level_0', 'level_1'] + "fields": [ + {"name": "level_0", "type": 
"string"}, + {"name": "level_1", "type": "integer"}, + {"name": "A", "type": "integer"}, + {"name": "B", "type": "string"}, + {"name": "C", "type": "datetime"}, + {"name": "D", "type": "duration"}, + ], + "primaryKey": ["level_0", "level_1"], } assert result == expected - df.index.names = ['idx0', None] - expected['fields'][0]['name'] = 'idx0' - expected['primaryKey'] = ['idx0', 'level_1'] + df.index.names = ["idx0", None] + expected["fields"][0]["name"] = "idx0" + expected["primaryKey"] = ["idx0", "level_1"] result = build_table_schema(df, version=False) assert result == expected class TestTableSchemaType: - - @pytest.mark.parametrize('int_type', [ - np.int, np.int16, np.int32, np.int64]) + @pytest.mark.parametrize("int_type", [np.int, np.int16, np.int32, np.int64]) def test_as_json_table_type_int_data(self, int_type): int_data = [1, 2, 3] - assert as_json_table_type(np.array( - int_data, dtype=int_type)) == 'integer' + assert as_json_table_type(np.array(int_data, dtype=int_type)) == "integer" - @pytest.mark.parametrize('float_type', [ - np.float, np.float16, np.float32, np.float64]) + @pytest.mark.parametrize( + "float_type", [np.float, np.float16, np.float32, np.float64] + ) def test_as_json_table_type_float_data(self, float_type): - float_data = [1., 2., 3.] - assert as_json_table_type(np.array( - float_data, dtype=float_type)) == 'number' + float_data = [1.0, 2.0, 3.0] + assert as_json_table_type(np.array(float_data, dtype=float_type)) == "number" - @pytest.mark.parametrize('bool_type', [bool, np.bool]) + @pytest.mark.parametrize("bool_type", [bool, np.bool]) def test_as_json_table_type_bool_data(self, bool_type): bool_data = [True, False] - assert as_json_table_type(np.array( - bool_data, dtype=bool_type)) == 'boolean' - - @pytest.mark.parametrize('date_data', [ - pd.to_datetime(['2016']), - pd.to_datetime(['2016'], utc=True), - pd.Series(pd.to_datetime(['2016'])), - pd.Series(pd.to_datetime(['2016'], utc=True)), - pd.period_range('2016', freq='A', periods=3) - ]) + assert as_json_table_type(np.array(bool_data, dtype=bool_type)) == "boolean" + + @pytest.mark.parametrize( + "date_data", + [ + pd.to_datetime(["2016"]), + pd.to_datetime(["2016"], utc=True), + pd.Series(pd.to_datetime(["2016"])), + pd.Series(pd.to_datetime(["2016"], utc=True)), + pd.period_range("2016", freq="A", periods=3), + ], + ) def test_as_json_table_type_date_data(self, date_data): - assert as_json_table_type(date_data) == 'datetime' + assert as_json_table_type(date_data) == "datetime" - @pytest.mark.parametrize('str_data', [ - pd.Series(['a', 'b']), pd.Index(['a', 'b'])]) + @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])]) def test_as_json_table_type_string_data(self, str_data): - assert as_json_table_type(str_data) == 'string' - - @pytest.mark.parametrize('cat_data', [ - pd.Categorical(['a']), - pd.Categorical([1]), - pd.Series(pd.Categorical([1])), - pd.CategoricalIndex([1]), - pd.Categorical([1])]) + assert as_json_table_type(str_data) == "string" + + @pytest.mark.parametrize( + "cat_data", + [ + pd.Categorical(["a"]), + pd.Categorical([1]), + pd.Series(pd.Categorical([1])), + pd.CategoricalIndex([1]), + pd.Categorical([1]), + ], + ) def test_as_json_table_type_categorical_data(self, cat_data): - assert as_json_table_type(cat_data) == 'any' + assert as_json_table_type(cat_data) == "any" # ------ # dtypes # ------ - @pytest.mark.parametrize('int_dtype', [ - np.int, np.int16, np.int32, np.int64]) + @pytest.mark.parametrize("int_dtype", [np.int, np.int16, np.int32, np.int64]) def 
test_as_json_table_type_int_dtypes(self, int_dtype): - assert as_json_table_type(int_dtype) == 'integer' + assert as_json_table_type(int_dtype) == "integer" - @pytest.mark.parametrize('float_dtype', [ - np.float, np.float16, np.float32, np.float64]) + @pytest.mark.parametrize( + "float_dtype", [np.float, np.float16, np.float32, np.float64] + ) def test_as_json_table_type_float_dtypes(self, float_dtype): - assert as_json_table_type(float_dtype) == 'number' + assert as_json_table_type(float_dtype) == "number" - @pytest.mark.parametrize('bool_dtype', [bool, np.bool]) + @pytest.mark.parametrize("bool_dtype", [bool, np.bool]) def test_as_json_table_type_bool_dtypes(self, bool_dtype): - assert as_json_table_type(bool_dtype) == 'boolean' - - @pytest.mark.parametrize('date_dtype', [ - np.datetime64, np.dtype("=1" with pytest.raises(ValueError, match=msg): - pd.read_json(StringIO(lines_json_df), lines=True, - chunksize=chunksize) + pd.read_json(StringIO(lines_json_df), lines=True, chunksize=chunksize) @pytest.mark.parametrize("chunksize", [None, 1, 2]) @@ -163,9 +167,10 @@ def test_readjson_chunks_multiple_empty_lines(chunksize): {"A":3,"B":6} """ - orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + orig = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) test = pd.read_json(j, lines=True, chunksize=chunksize) if chunksize is not None: test = pd.concat(test) tm.assert_frame_equal( - orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize)) + orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize) + ) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 0d84221b8d4e3..69a246487ddf1 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -40,12 +40,9 @@ def _clean_dict(d): return {str(k): v for k, v in d.items()} -@pytest.fixture(params=[ - None, # Column indexed by default. - "split", - "records", - "values", - "index"]) +@pytest.fixture( + params=[None, "split", "records", "values", "index"] # Column indexed by default. +) def orient(request): return request.param @@ -56,9 +53,9 @@ def numpy(request): class TestUltraJSONTests: - - @pytest.mark.skipif(compat.is_platform_32bit(), - reason="not compliant on 32-bit, xref #15865") + @pytest.mark.skipif( + compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" + ) def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.encode(sut, double_precision=15) @@ -117,15 +114,15 @@ def test_encode_decimal(self): @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): string_input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = ('"A string \\\\ \\/ \\b \\f \\n ' - '\\r \\t <\\/script> &"') - html_encoded = ('"A string \\\\ \\/ \\b \\f \\n \\r \\t ' - '\\u003c\\/script\\u003e \\u0026"') + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n ' '\\r \\t <\\/script> &"' + html_encoded = ( + '"A string \\\\ \\/ \\b \\f \\n \\r \\t ' '\\u003c\\/script\\u003e \\u0026"' + ) def helper(expected_output, **encode_kwargs): - output = ujson.encode(string_input, - ensure_ascii=ensure_ascii, - **encode_kwargs) + output = ujson.encode( + string_input, ensure_ascii=ensure_ascii, **encode_kwargs + ) assert output == expected_output assert string_input == json.loads(output) @@ -140,9 +137,9 @@ def helper(expected_output, **encode_kwargs): # Make sure explicit encode_html_chars=True does the encoding. 
helper(html_encoded, encode_html_chars=True) - @pytest.mark.parametrize("long_number", [ - -4342969734183514, -12345678901234.56789012, -528656961.4399388 - ]) + @pytest.mark.parametrize( + "long_number", [-4342969734183514, -12345678901234.56789012, -528656961.4399388] + ) def test_double_long_numbers(self, long_number): sut = {"a": long_number} encoded = ujson.encode(sut, double_precision=15) @@ -177,17 +174,14 @@ def test_encode_double_tiny_exponential(self): num = -1e-145 assert np.allclose(num, ujson.decode(ujson.encode(num))) - @pytest.mark.parametrize("unicode_key", [ - "key1", "بن" - ]) + @pytest.mark.parametrize("unicode_key", ["key1", "بن"]) def test_encode_dict_with_unicode_keys(self, unicode_key): unicode_dict = {unicode_key: "value1"} assert unicode_dict == ujson.decode(ujson.encode(unicode_dict)) - @pytest.mark.parametrize("double_input", [ - math.pi, - -math.pi # Should work with negatives too. - ]) + @pytest.mark.parametrize( + "double_input", [math.pi, -math.pi] # Should work with negatives too. + ) def test_encode_double_conversion(self, double_input): output = ujson.encode(double_input) assert round(double_input, 5) == round(json.loads(output), 5) @@ -207,19 +201,20 @@ def test_encode_array_of_nested_arrays(self): assert nested_input == ujson.decode(output) nested_input = np.array(nested_input) - tm.assert_numpy_array_equal(nested_input, ujson.decode( - output, numpy=True, dtype=nested_input.dtype)) + tm.assert_numpy_array_equal( + nested_input, ujson.decode(output, numpy=True, dtype=nested_input.dtype) + ) def test_encode_array_of_doubles(self): - doubles_input = [31337.31337, 31337.31337, - 31337.31337, 31337.31337] * 10 + doubles_input = [31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 output = ujson.encode(doubles_input) assert doubles_input == json.loads(output) assert doubles_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(doubles_input), - ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(doubles_input), ujson.decode(output, numpy=True) + ) def test_double_precision(self): double_input = 30.012345678901234 @@ -229,20 +224,16 @@ def test_double_precision(self): assert double_input == ujson.decode(output) for double_precision in (3, 9): - output = ujson.encode(double_input, - double_precision=double_precision) + output = ujson.encode(double_input, double_precision=double_precision) rounded_input = round(double_input, double_precision) assert rounded_input == json.loads(output) assert rounded_input == ujson.decode(output) - @pytest.mark.parametrize("invalid_val", [ - 20, -1, "9", None - ]) + @pytest.mark.parametrize("invalid_val", [20, -1, "9", None]) def test_invalid_double_precision(self, invalid_val): double_input = 30.12345678901234567890 - expected_exception = (ValueError if isinstance(invalid_val, int) - else TypeError) + expected_exception = ValueError if isinstance(invalid_val, int) else TypeError with pytest.raises(expected_exception): ujson.encode(double_input, double_precision=invalid_val) @@ -255,10 +246,10 @@ def test_encode_string_conversion2(self): assert string_input == ujson.decode(output) assert output == '"A string \\\\ \\/ \\b \\f \\n \\r \\t"' - @pytest.mark.parametrize("unicode_input", [ - "Räksmörgås اسامة بن محمد بن عوض بن لادن", - "\xe6\x97\xa5\xd1\x88" - ]) + @pytest.mark.parametrize( + "unicode_input", + ["Räksmörgås اسامة بن محمد بن عوض بن لادن", "\xe6\x97\xa5\xd1\x88"], + ) def test_encode_unicode_conversion(self, unicode_input): enc = ujson.encode(unicode_input) dec = 
ujson.decode(enc) @@ -307,14 +298,18 @@ def test_encode_array_in_array(self): assert output == json.dumps(arr_in_arr_input) assert arr_in_arr_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(arr_in_arr_input), - ujson.decode(output, numpy=True)) - - @pytest.mark.parametrize("num_input", [ - 31337, - -31337, # Negative number. - -9223372036854775808 # Large negative number. - ]) + tm.assert_numpy_array_equal( + np.array(arr_in_arr_input), ujson.decode(output, numpy=True) + ) + + @pytest.mark.parametrize( + "num_input", + [ + 31337, + -31337, # Negative number. + -9223372036854775808, # Large negative number. + ], + ) def test_encode_num_conversion(self, num_input): output = ujson.encode(num_input) assert num_input == json.loads(output) @@ -328,8 +323,9 @@ def test_encode_list_conversion(self): assert list_input == json.loads(output) assert list_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(list_input), - ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(list_input), ujson.decode(output, numpy=True) + ) def test_encode_dict_conversion(self): dict_input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4} @@ -363,11 +359,10 @@ def test_encode_date_conversion(self): assert int(expected) == json.loads(output) assert int(expected) == ujson.decode(output) - @pytest.mark.parametrize("test", [ - datetime.time(), - datetime.time(1, 2, 3), - datetime.time(10, 12, 15, 343243), - ]) + @pytest.mark.parametrize( + "test", + [datetime.time(), datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243)], + ) def test_encode_time_conversion_basic(self, test): output = ujson.encode(test) expected = '"{iso}"'.format(iso=test.isoformat()) @@ -387,13 +382,9 @@ def test_encode_time_conversion_dateutil(self): expected = '"{iso}"'.format(iso=test.isoformat()) assert expected == output - @pytest.mark.parametrize("decoded_input", [ - NaT, - np.datetime64("NaT"), - np.nan, - np.inf, - -np.inf - ]) + @pytest.mark.parametrize( + "decoded_input", [NaT, np.datetime64("NaT"), np.nan, np.inf, -np.inf] + ) def test_encode_as_null(self, decoded_input): assert ujson.encode(decoded_input) == "null", "Expected null" @@ -401,21 +392,21 @@ def test_datetime_units(self): val = datetime.datetime(2013, 8, 17, 21, 17, 12, 215504) stamp = Timestamp(val) - roundtrip = ujson.decode(ujson.encode(val, date_unit='s')) - assert roundtrip == stamp.value // 10**9 + roundtrip = ujson.decode(ujson.encode(val, date_unit="s")) + assert roundtrip == stamp.value // 10 ** 9 - roundtrip = ujson.decode(ujson.encode(val, date_unit='ms')) - assert roundtrip == stamp.value // 10**6 + roundtrip = ujson.decode(ujson.encode(val, date_unit="ms")) + assert roundtrip == stamp.value // 10 ** 6 - roundtrip = ujson.decode(ujson.encode(val, date_unit='us')) - assert roundtrip == stamp.value // 10**3 + roundtrip = ujson.decode(ujson.encode(val, date_unit="us")) + assert roundtrip == stamp.value // 10 ** 3 - roundtrip = ujson.decode(ujson.encode(val, date_unit='ns')) + roundtrip = ujson.decode(ujson.encode(val, date_unit="ns")) assert roundtrip == stamp.value msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): - ujson.encode(val, date_unit='foo') + ujson.encode(val, date_unit="foo") def test_encode_to_utf8(self): unencoded = "\xe6\x97\xa5\xd1\x88" @@ -427,7 +418,7 @@ def test_encode_to_utf8(self): assert dec == json.loads(enc) def test_decode_from_unicode(self): - unicode_input = "{\"obj\": 31337}" + unicode_input = '{"obj": 31337}' dec1 = ujson.decode(unicode_input) dec2 
= ujson.decode(str(unicode_input)) @@ -458,57 +449,59 @@ def test_decode_jibberish(self): with pytest.raises(ValueError): ujson.decode(jibberish) - @pytest.mark.parametrize("broken_json", [ - "[", # Broken array start. - "{", # Broken object start. - "]", # Broken array end. - "}", # Broken object end. - ]) + @pytest.mark.parametrize( + "broken_json", + [ + "[", # Broken array start. + "{", # Broken object start. + "]", # Broken array end. + "}", # Broken object end. + ], + ) def test_decode_broken_json(self, broken_json): with pytest.raises(ValueError): ujson.decode(broken_json) - @pytest.mark.parametrize("too_big_char", [ - "[", - "{", - ]) + @pytest.mark.parametrize("too_big_char", ["[", "{"]) def test_decode_depth_too_big(self, too_big_char): with pytest.raises(ValueError): ujson.decode(too_big_char * (1024 * 1024)) - @pytest.mark.parametrize("bad_string", [ - "\"TESTING", # Unterminated. - "\"TESTING\\\"", # Unterminated escape. - "tru", # Broken True. - "fa", # Broken False. - "n", # Broken None. - ]) + @pytest.mark.parametrize( + "bad_string", + [ + '"TESTING', # Unterminated. + '"TESTING\\"', # Unterminated escape. + "tru", # Broken True. + "fa", # Broken False. + "n", # Broken None. + ], + ) def test_decode_bad_string(self, bad_string): with pytest.raises(ValueError): ujson.decode(bad_string) - @pytest.mark.parametrize("broken_json", [ - '{{1337:""}}', - '{{"key":"}', - '[[[true', - ]) + @pytest.mark.parametrize("broken_json", ['{{1337:""}}', '{{"key":"}', "[[[true"]) def test_decode_broken_json_leak(self, broken_json): for _ in range(1000): with pytest.raises(ValueError): ujson.decode(broken_json) - @pytest.mark.parametrize("invalid_dict", [ - "{{{{31337}}}}", # No key. - "{{{{\"key\":}}}}", # No value. - "{{{{\"key\"}}}}", # No colon or value. - ]) + @pytest.mark.parametrize( + "invalid_dict", + [ + "{{{{31337}}}}", # No key. + '{{{{"key":}}}}', # No value. + '{{{{"key"}}}}', # No colon or value. + ], + ) def test_decode_invalid_dict(self, invalid_dict): with pytest.raises(ValueError): ujson.decode(invalid_dict) - @pytest.mark.parametrize("numeric_int_as_str", [ - "31337", "-31337" # Should work with negatives. - ]) + @pytest.mark.parametrize( + "numeric_int_as_str", ["31337", "-31337"] # Should work with negatives. 
+ ) def test_decode_numeric_int(self, numeric_int_as_str): assert int(numeric_int_as_str) == ujson.decode(numeric_int_as_str) @@ -529,21 +522,26 @@ def test_encode_null_character(self): assert '" \\u0000\\r\\n "' == ujson.dumps(" \u0000\r\n ") def test_decode_null_character(self): - wrapped_input = "\"31337 \\u0000 31337\"" + wrapped_input = '"31337 \\u0000 31337"' assert ujson.decode(wrapped_input) == json.loads(wrapped_input) def test_encode_list_long_conversion(self): - long_input = [9223372036854775807, 9223372036854775807, - 9223372036854775807, 9223372036854775807, - 9223372036854775807, 9223372036854775807] + long_input = [ + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + 9223372036854775807, + ] output = ujson.encode(long_input) assert long_input == json.loads(output) assert long_input == ujson.decode(output) - tm.assert_numpy_array_equal(np.array(long_input), - ujson.decode(output, numpy=True, - dtype=np.int64)) + tm.assert_numpy_array_equal( + np.array(long_input), ujson.decode(output, numpy=True, dtype=np.int64) + ) def test_encode_long_conversion(self): long_input = 9223372036854775807 @@ -553,9 +551,9 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("int_exp", [ - "1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4" - ]) + @pytest.mark.parametrize( + "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] + ) def test_decode_numeric_int_exp(self, int_exp): assert ujson.decode(int_exp) == json.loads(int_exp) @@ -565,8 +563,9 @@ def test_loads_non_str_bytes_raises(self): ujson.loads(None) def test_version(self): - assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ - "ujson.__version__ must be a string like '1.4.0'" + assert re.match( + r"^\d+\.\d+(\.\d+)?$", ujson.__version__ + ), "ujson.__version__ must be a string like '1.4.0'" def test_encode_numeric_overflow(self): with pytest.raises(OverflowError): @@ -580,9 +579,7 @@ class Nested: with pytest.raises(OverflowError): ujson.encode(Nested()) - @pytest.mark.parametrize("val", [ - 3590016419, 2**31, 2**32, (2**32) - 1 - ]) + @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) def test_decode_number_with_32bit_sign_bit(self, val): # Test that numbers that fit within 32 bits but would have the # sign bit set (2**31 <= x < 2**32) are decoded properly. @@ -592,14 +589,14 @@ def test_decode_number_with_32bit_sign_bit(self, val): def test_encode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): - base = '\u00e5'.encode("utf-8") + base = "\u00e5".encode("utf-8") escape_input = base * 1024 * 1024 * 2 ujson.encode(escape_input) def test_decode_big_escape(self): # Make sure no Exception is raised. 
for _ in range(10): - base = '\u00e5'.encode("utf-8") + base = "\u00e5".encode("utf-8") quote = b'"' escape_input = quote + (base * 1024 * 1024 * 2) + quote @@ -619,9 +616,7 @@ def toDict(self): assert dec == d def test_default_handler(self): - class _TestObject: - def __init__(self, val): self.val = val @@ -635,14 +630,14 @@ def __str__(self): msg = "Maximum recursion level reached" with pytest.raises(OverflowError, match=msg): ujson.encode(_TestObject("foo")) - assert '"foo"' == ujson.encode(_TestObject("foo"), - default_handler=str) + assert '"foo"' == ujson.encode(_TestObject("foo"), default_handler=str) def my_handler(_): return "foobar" - assert '"foobar"' == ujson.encode(_TestObject("foo"), - default_handler=my_handler) + assert '"foobar"' == ujson.encode( + _TestObject("foo"), default_handler=my_handler + ) def my_handler_raises(_): raise TypeError("I raise for anything") @@ -653,34 +648,39 @@ def my_handler_raises(_): def my_int_handler(_): return 42 - assert ujson.decode(ujson.encode(_TestObject("foo"), - default_handler=my_int_handler)) == 42 + assert ( + ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_int_handler) + ) + == 42 + ) def my_obj_handler(_): return datetime.datetime(2013, 2, 3) - assert (ujson.decode(ujson.encode(datetime.datetime(2013, 2, 3))) == - ujson.decode(ujson.encode(_TestObject("foo"), - default_handler=my_obj_handler))) + assert ujson.decode( + ujson.encode(datetime.datetime(2013, 2, 3)) + ) == ujson.decode( + ujson.encode(_TestObject("foo"), default_handler=my_obj_handler) + ) obj_list = [_TestObject("foo"), _TestObject("bar")] - assert (json.loads(json.dumps(obj_list, default=str)) == - ujson.decode(ujson.encode(obj_list, default_handler=str))) + assert json.loads(json.dumps(obj_list, default=str)) == ujson.decode( + ujson.encode(obj_list, default_handler=str) + ) class TestNumpyJSONTests: - @pytest.mark.parametrize("bool_input", [True, False]) def test_bool(self, bool_input): b = np.bool(bool_input) assert ujson.decode(ujson.encode(b)) == b def test_bool_array(self): - bool_array = np.array([ - True, False, True, True, - False, True, False, False], dtype=np.bool) - output = np.array(ujson.decode( - ujson.encode(bool_array)), dtype=np.bool) + bool_array = np.array( + [True, False, True, True, False, True, False, False], dtype=np.bool + ) + output = np.array(ujson.decode(ujson.encode(bool_array)), dtype=np.bool) tm.assert_numpy_array_equal(bool_array, output) def test_int(self, any_int_dtype): @@ -693,8 +693,9 @@ def test_int_array(self, any_int_dtype): arr = np.arange(100, dtype=np.int) arr_input = arr.astype(any_int_dtype) - arr_output = np.array(ujson.decode(ujson.encode(arr_input)), - dtype=any_int_dtype) + arr_output = np.array( + ujson.decode(ujson.encode(arr_input)), dtype=any_int_dtype + ) tm.assert_numpy_array_equal(arr_input, arr_output) def test_int_max(self, any_int_dtype): @@ -722,47 +723,49 @@ def test_float_array(self, float_dtype): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) float_input = arr.astype(float_dtype) - float_output = np.array(ujson.decode( - ujson.encode(float_input, double_precision=15)), - dtype=float_dtype) + float_output = np.array( + ujson.decode(ujson.encode(float_input, double_precision=15)), + dtype=float_dtype, + ) tm.assert_almost_equal(float_input, float_output) def test_float_max(self, float_dtype): klass = np.dtype(float_dtype).type num = klass(np.finfo(float_dtype).max / 10) - tm.assert_almost_equal(klass(ujson.decode( - ujson.encode(num, double_precision=15))), num) + 
tm.assert_almost_equal( + klass(ujson.decode(ujson.encode(num, double_precision=15))), num + ) def test_array_basic(self): arr = np.arange(96) arr = arr.reshape((2, 2, 2, 2, 3, 2)) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode( - ujson.encode(arr), numpy=True), arr) - - @pytest.mark.parametrize("shape", [ - (10, 10), - (5, 5, 4), - (100, 1), - ]) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + + @pytest.mark.parametrize("shape", [(10, 10), (5, 5, 4), (100, 1)]) def test_array_reshaped(self, shape): arr = np.arange(100) arr = arr.reshape(shape) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode( - ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) def test_array_list(self): - arr_list = ["a", list(), dict(), dict(), list(), - 42, 97.8, ["a", "b"], {"key": "val"}] + arr_list = [ + "a", + list(), + dict(), + dict(), + list(), + 42, + 97.8, + ["a", "b"], + {"key": "val"}, + ] arr = np.array(arr_list) - tm.assert_numpy_array_equal( - np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) def test_array_float(self): dtype = np.float32 @@ -780,70 +783,69 @@ def test_0d_array(self): with pytest.raises(TypeError): ujson.encode(np.array(1)) - @pytest.mark.parametrize("bad_input,exc_type,kwargs", [ - ([{}, []], ValueError, {}), - ([42, None], TypeError, {}), - ([["a"], 42], ValueError, {}), - ([42, {}, "a"], TypeError, {}), - ([42, ["a"], 42], ValueError, {}), - (["a", "b", [], "c"], ValueError, {}), - ([{"a": "b"}], ValueError, dict(labelled=True)), - ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), - ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)) - ]) + @pytest.mark.parametrize( + "bad_input,exc_type,kwargs", + [ + ([{}, []], ValueError, {}), + ([42, None], TypeError, {}), + ([["a"], 42], ValueError, {}), + ([42, {}, "a"], TypeError, {}), + ([42, ["a"], 42], ValueError, {}), + (["a", "b", [], "c"], ValueError, {}), + ([{"a": "b"}], ValueError, dict(labelled=True)), + ({"a": {"b": {"c": 42}}}, ValueError, dict(labelled=True)), + ([{"a": 42, "b": 23}, {"c": 17}], ValueError, dict(labelled=True)), + ], + ) def test_array_numpy_except(self, bad_input, exc_type, kwargs): with pytest.raises(exc_type): ujson.decode(ujson.dumps(bad_input), numpy=True, **kwargs) def test_array_numpy_labelled(self): labelled_input = {"a": []} - output = ujson.loads(ujson.dumps(labelled_input), - numpy=True, labelled=True) + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() assert (np.array(["a"]) == output[1]).all() assert output[2] is None labelled_input = [{"a": 42}] - output = ujson.loads(ujson.dumps(labelled_input), - numpy=True, labelled=True) + output = ujson.loads(ujson.dumps(labelled_input), numpy=True, labelled=True) assert (np.array(["a"]) == output[2]).all() assert (np.array([42]) == output[0]).all() assert output[1] is None # see gh-10837: write out the dump explicitly # so there is no dependency on iteration order - input_dumps = ('[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' - '{"a": 2.4, "b": 78}]') + input_dumps = '[{"a": 42, "b":31}, {"a": 
24, "c": 99}, ' '{"a": 2.4, "b": 78}]' output = ujson.loads(input_dumps, numpy=True, labelled=True) - expected_vals = np.array( - [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() assert output[1] is None assert (np.array(["a", "b"]) == output[2]).all() - input_dumps = ('{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' - '"3": {"a": 2.4, "b": 78}}') + input_dumps = ( + '{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' + '"3": {"a": 2.4, "b": 78}}' + ) output = ujson.loads(input_dumps, numpy=True, labelled=True) - expected_vals = np.array( - [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() assert (np.array(["1", "2", "3"]) == output[1]).all() assert (np.array(["a", "b"]) == output[2]).all() class TestPandasJSONTests: - def test_dataframe(self, orient, numpy): if orient == "records" and numpy: pytest.skip("Not idiomatic pandas") - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"]) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) - output = ujson.decode(ujson.encode(df, **encode_kwargs), - **decode_kwargs) + output = ujson.decode(ujson.encode(df, **encode_kwargs), **decode_kwargs) # Ensure proper DataFrame initialization. if orient == "split": @@ -864,26 +866,34 @@ def test_dataframe(self, orient, numpy): tm.assert_frame_equal(output, df, check_dtype=False) def test_dataframe_nested(self, orient): - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"]) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], index=["a", "b"], columns=["x", "y", "z"] + ) nested = {"df1": df, "df2": df.copy()} kwargs = {} if orient is None else dict(orient=orient) - exp = {"df1": ujson.decode(ujson.encode(df, **kwargs)), - "df2": ujson.decode(ujson.encode(df, **kwargs))} + exp = { + "df1": ujson.decode(ujson.encode(df, **kwargs)), + "df2": ujson.decode(ujson.encode(df, **kwargs)), + } assert ujson.decode(ujson.encode(nested, **kwargs)) == exp def test_dataframe_numpy_labelled(self, orient): if orient in ("split", "values"): pytest.skip("Incompatible with labelled=True") - df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ - "a", "b"], columns=["x", "y", "z"], dtype=np.int) + df = DataFrame( + [[1, 2, 3], [4, 5, 6]], + index=["a", "b"], + columns=["x", "y", "z"], + dtype=np.int, + ) kwargs = {} if orient is None else dict(orient=orient) - output = DataFrame(*ujson.decode(ujson.encode(df, **kwargs), - numpy=True, labelled=True)) + output = DataFrame( + *ujson.decode(ujson.encode(df, **kwargs), numpy=True, labelled=True) + ) if orient is None: df = df.T @@ -893,14 +903,14 @@ def test_dataframe_numpy_labelled(self, orient): tm.assert_frame_equal(output, df) def test_series(self, orient, numpy): - s = Series([10, 20, 30, 40, 50, 60], name="series", - index=[6, 7, 8, 9, 10, 15]).sort_values() + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() encode_kwargs = {} if orient is None else dict(orient=orient) decode_kwargs = {} if numpy is None else dict(numpy=numpy) - output = ujson.decode(ujson.encode(s, **encode_kwargs), - **decode_kwargs) + output = ujson.decode(ujson.encode(s, **encode_kwargs), 
**decode_kwargs) if orient == "split": dec = _clean_dict(output) @@ -919,13 +929,16 @@ def test_series(self, orient, numpy): tm.assert_series_equal(output, s, check_dtype=False) def test_series_nested(self, orient): - s = Series([10, 20, 30, 40, 50, 60], name="series", - index=[6, 7, 8, 9, 10, 15]).sort_values() + s = Series( + [10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15] + ).sort_values() nested = {"s1": s, "s2": s.copy()} kwargs = {} if orient is None else dict(orient=orient) - exp = {"s1": ujson.decode(ujson.encode(s, **kwargs)), - "s2": ujson.decode(ujson.encode(s, **kwargs))} + exp = { + "s1": ujson.decode(ujson.encode(s, **kwargs)), + "s2": ujson.decode(ujson.encode(s, **kwargs)), + } assert ujson.decode(ujson.encode(nested, **kwargs)) == exp def test_index(self): @@ -944,35 +957,34 @@ def test_index(self): tm.assert_index_equal(i, output) assert i.name == output.name - dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), - numpy=True)) + dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), numpy=True)) output = Index(**dec) tm.assert_index_equal(i, output) assert i.name == output.name - output = Index(ujson.decode(ujson.encode(i, orient="values")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="values")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="values"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="values"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="records")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="records")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="records"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="records"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="index")), - name="index") + output = Index(ujson.decode(ujson.encode(i, orient="index")), name="index") tm.assert_index_equal(i, output) - output = Index(ujson.decode(ujson.encode(i, orient="index"), - numpy=True), name="index") + output = Index( + ujson.decode(ujson.encode(i, orient="index"), numpy=True), name="index" + ) tm.assert_index_equal(i, output) def test_datetime_index(self): @@ -991,31 +1003,30 @@ def test_datetime_index(self): decoded.index = DatetimeIndex(idx_values) tm.assert_series_equal(ts, decoded) - @pytest.mark.parametrize("invalid_arr", [ - "[31337,]", # Trailing comma. - "[,31337]", # Leading comma. - "[]]", # Unmatched bracket. - "[,]", # Only comma. - ]) + @pytest.mark.parametrize( + "invalid_arr", + [ + "[31337,]", # Trailing comma. + "[,31337]", # Leading comma. + "[]]", # Unmatched bracket. + "[,]", # Only comma. 
+ ], + ) def test_decode_invalid_array(self, invalid_arr): with pytest.raises(ValueError): ujson.decode(invalid_arr) - @pytest.mark.parametrize("arr", [ - [], [31337] - ]) + @pytest.mark.parametrize("arr", [[], [31337]]) def test_decode_array(self, arr): assert arr == ujson.decode(str(arr)) - @pytest.mark.parametrize("extreme_num", [ - 9223372036854775807, -9223372036854775808 - ]) + @pytest.mark.parametrize("extreme_num", [9223372036854775807, -9223372036854775808]) def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize("too_extreme_num", [ - "9223372036854775808", "-90223372036854775809" - ]) + @pytest.mark.parametrize( + "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] + ) def test_decode_too_extreme_numbers(self, too_extreme_num): with pytest.raises(ValueError): ujson.decode(too_extreme_num) @@ -1031,17 +1042,26 @@ def test_decode_array_with_big_int(self): with pytest.raises(ValueError): ujson.loads("[18446098363113800555]") - @pytest.mark.parametrize("float_number", [ - 1.1234567893, 1.234567893, 1.34567893, - 1.4567893, 1.567893, 1.67893, - 1.7893, 1.893, 1.3, - ]) + @pytest.mark.parametrize( + "float_number", + [ + 1.1234567893, + 1.234567893, + 1.34567893, + 1.4567893, + 1.567893, + 1.67893, + 1.7893, + 1.893, + 1.3, + ], + ) @pytest.mark.parametrize("sign", [-1, 1]) def test_decode_floating_point(self, sign, float_number): float_number *= sign - tm.assert_almost_equal(float_number, - ujson.loads(str(float_number)), - check_less_precise=15) + tm.assert_almost_equal( + float_number, ujson.loads(str(float_number)), check_less_precise=15 + ) def test_encode_big_set(self): s = set() diff --git a/pandas/tests/io/msgpack/test_buffer.py b/pandas/tests/io/msgpack/test_buffer.py index e36dc5bbdb4ba..fe1f4e73eba24 100644 --- a/pandas/tests/io/msgpack/test_buffer.py +++ b/pandas/tests/io/msgpack/test_buffer.py @@ -7,15 +7,16 @@ def test_unpack_buffer(): from array import array - buf = array('b') - frombytes(buf, packb((b'foo', b'bar'))) + + buf = array("b") + frombytes(buf, packb((b"foo", b"bar"))) obj = unpackb(buf, use_list=1) - assert [b'foo', b'bar'] == obj + assert [b"foo", b"bar"] == obj def test_unpack_bytearray(): - buf = bytearray(packb(('foo', 'bar'))) + buf = bytearray(packb(("foo", "bar"))) obj = unpackb(buf, use_list=1) - assert [b'foo', b'bar'] == obj + assert [b"foo", b"bar"] == obj expected_type = bytes assert all(type(s) == expected_type for s in obj) diff --git a/pandas/tests/io/msgpack/test_case.py b/pandas/tests/io/msgpack/test_case.py index c0e76b37ee46d..15b7090c11bad 100644 --- a/pandas/tests/io/msgpack/test_case.py +++ b/pandas/tests/io/msgpack/test_case.py @@ -5,14 +5,23 @@ def check(length, obj): v = packb(obj) - assert len(v) == length, \ - "%r length should be %r but get %r" % (obj, length, len(v)) + assert len(v) == length, "%r length should be %r but get %r" % (obj, length, len(v)) assert unpackb(v, use_list=0) == obj def test_1(): - for o in [None, True, False, 0, 1, (1 << 6), (1 << 7) - 1, -1, - -((1 << 5) - 1), -(1 << 5)]: + for o in [ + None, + True, + False, + 0, + 1, + (1 << 6), + (1 << 7) - 1, + -1, + -((1 << 5) - 1), + -(1 << 5), + ]: check(1, o) @@ -32,8 +41,16 @@ def test_5(): def test_9(): - for o in [1 << 32, (1 << 64) - 1, -((1 << 31) + 1), -(1 << 63), 1.0, 0.1, - -0.1, -1.0]: + for o in [ + 1 << 32, + (1 << 64) - 1, + -((1 << 31) + 1), + -(1 << 63), + 1.0, + 0.1, + -0.1, + -1.0, + ]: check(9, o) @@ -56,7 +73,7 @@ def test_raw32(): def 
check_array(overhead, num): - check(num + overhead, (None, ) * num) + check(num + overhead, (None,) * num) def test_fixarray(): @@ -80,31 +97,46 @@ def match(obj, buf): def test_match(): cases = [ - (None, b'\xc0'), - (False, b'\xc2'), - (True, b'\xc3'), - (0, b'\x00'), - (127, b'\x7f'), - (128, b'\xcc\x80'), - (256, b'\xcd\x01\x00'), - (-1, b'\xff'), - (-33, b'\xd0\xdf'), - (-129, b'\xd1\xff\x7f'), - ({1: 1}, b'\x81\x01\x01'), + (None, b"\xc0"), + (False, b"\xc2"), + (True, b"\xc3"), + (0, b"\x00"), + (127, b"\x7f"), + (128, b"\xcc\x80"), + (256, b"\xcd\x01\x00"), + (-1, b"\xff"), + (-33, b"\xd0\xdf"), + (-129, b"\xd1\xff\x7f"), + ({1: 1}, b"\x81\x01\x01"), (1.0, b"\xcb\x3f\xf0\x00\x00\x00\x00\x00\x00"), - ((), b'\x90'), - (tuple(range(15)), (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" - b"\x0a\x0b\x0c\x0d\x0e")), - (tuple(range(16)), (b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07" - b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f")), - ({}, b'\x80'), - ({x: x for x in range(15)}, - (b'\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07' - b'\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e')), - ({x: x for x in range(16)}, - (b'\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06' - b'\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e' - b'\x0f\x0f')), + ((), b"\x90"), + ( + tuple(range(15)), + (b"\x9f\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" b"\x0a\x0b\x0c\x0d\x0e"), + ), + ( + tuple(range(16)), + ( + b"\xdc\x00\x10\x00\x01\x02\x03\x04\x05\x06\x07" + b"\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + ), + ), + ({}, b"\x80"), + ( + {x: x for x in range(15)}, + ( + b"\x8f\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06\x06\x07" + b"\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" + ), + ), + ( + {x: x for x in range(16)}, + ( + b"\xde\x00\x10\x00\x00\x01\x01\x02\x02\x03\x03\x04\x04\x05\x05\x06" + b"\x06\x07\x07\x08\x08\t\t\n\n\x0b\x0b\x0c\x0c\r\r\x0e\x0e" + b"\x0f\x0f" + ), + ), ] for v, p in cases: @@ -112,4 +144,4 @@ def test_match(): def test_unicode(): - assert unpackb(packb('foobar'), use_list=1) == b'foobar' + assert unpackb(packb("foobar"), use_list=1) == b"foobar" diff --git a/pandas/tests/io/msgpack/test_except.py b/pandas/tests/io/msgpack/test_except.py index 0eeda3389a935..60c1dcca162a9 100644 --- a/pandas/tests/io/msgpack/test_except.py +++ b/pandas/tests/io/msgpack/test_except.py @@ -12,9 +12,8 @@ class DummyException(Exception): class TestExceptions: - def test_raise_on_find_unsupported_value(self): - msg = "can\'t serialize datetime" + msg = "can't serialize datetime" with pytest.raises(TypeError, match=msg): packb(datetime.now()) @@ -25,13 +24,13 @@ def hook(_): with pytest.raises(DummyException): unpackb(packb({}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': 'buzz'}), object_hook=hook) + unpackb(packb({"fizz": "buzz"}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': 'buzz'}), object_pairs_hook=hook) + unpackb(packb({"fizz": "buzz"}), object_pairs_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': {'buzz': 'spam'}}), object_hook=hook) + unpackb(packb({"fizz": {"buzz": "spam"}}), object_hook=hook) with pytest.raises(DummyException): - unpackb(packb({'fizz': {'buzz': 'spam'}}), object_pairs_hook=hook) + unpackb(packb({"fizz": {"buzz": "spam"}}), object_pairs_hook=hook) def test_invalid_value(self): msg = "Unpack failed: error" diff --git a/pandas/tests/io/msgpack/test_extension.py b/pandas/tests/io/msgpack/test_extension.py index 1bbfde5b9cd58..12f27459f5afe 100644 --- 
a/pandas/tests/io/msgpack/test_extension.py +++ b/pandas/tests/io/msgpack/test_extension.py @@ -12,50 +12,52 @@ def p(s): packer.pack_ext_type(0x42, s) return packer.bytes() - assert p(b'A') == b'\xd4\x42A' # fixext 1 - assert p(b'AB') == b'\xd5\x42AB' # fixext 2 - assert p(b'ABCD') == b'\xd6\x42ABCD' # fixext 4 - assert p(b'ABCDEFGH') == b'\xd7\x42ABCDEFGH' # fixext 8 - assert p(b'A' * 16) == b'\xd8\x42' + b'A' * 16 # fixext 16 - assert p(b'ABC') == b'\xc7\x03\x42ABC' # ext 8 - assert p(b'A' * 0x0123) == b'\xc8\x01\x23\x42' + b'A' * 0x0123 # ext 16 - assert (p(b'A' * 0x00012345) == - b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345) # ext 32 + assert p(b"A") == b"\xd4\x42A" # fixext 1 + assert p(b"AB") == b"\xd5\x42AB" # fixext 2 + assert p(b"ABCD") == b"\xd6\x42ABCD" # fixext 4 + assert p(b"ABCDEFGH") == b"\xd7\x42ABCDEFGH" # fixext 8 + assert p(b"A" * 16) == b"\xd8\x42" + b"A" * 16 # fixext 16 + assert p(b"ABC") == b"\xc7\x03\x42ABC" # ext 8 + assert p(b"A" * 0x0123) == b"\xc8\x01\x23\x42" + b"A" * 0x0123 # ext 16 + assert ( + p(b"A" * 0x00012345) == b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345 + ) # ext 32 def test_unpack_ext_type(): def check(b, expected): assert msgpack.unpackb(b) == expected - check(b'\xd4\x42A', ExtType(0x42, b'A')) # fixext 1 - check(b'\xd5\x42AB', ExtType(0x42, b'AB')) # fixext 2 - check(b'\xd6\x42ABCD', ExtType(0x42, b'ABCD')) # fixext 4 - check(b'\xd7\x42ABCDEFGH', ExtType(0x42, b'ABCDEFGH')) # fixext 8 - check(b'\xd8\x42' + b'A' * 16, ExtType(0x42, b'A' * 16)) # fixext 16 - check(b'\xc7\x03\x42ABC', ExtType(0x42, b'ABC')) # ext 8 - check(b'\xc8\x01\x23\x42' + b'A' * 0x0123, - ExtType(0x42, b'A' * 0x0123)) # ext 16 - check(b'\xc9\x00\x01\x23\x45\x42' + b'A' * 0x00012345, - ExtType(0x42, b'A' * 0x00012345)) # ext 32 + check(b"\xd4\x42A", ExtType(0x42, b"A")) # fixext 1 + check(b"\xd5\x42AB", ExtType(0x42, b"AB")) # fixext 2 + check(b"\xd6\x42ABCD", ExtType(0x42, b"ABCD")) # fixext 4 + check(b"\xd7\x42ABCDEFGH", ExtType(0x42, b"ABCDEFGH")) # fixext 8 + check(b"\xd8\x42" + b"A" * 16, ExtType(0x42, b"A" * 16)) # fixext 16 + check(b"\xc7\x03\x42ABC", ExtType(0x42, b"ABC")) # ext 8 + check(b"\xc8\x01\x23\x42" + b"A" * 0x0123, ExtType(0x42, b"A" * 0x0123)) # ext 16 + check( + b"\xc9\x00\x01\x23\x45\x42" + b"A" * 0x00012345, + ExtType(0x42, b"A" * 0x00012345), + ) # ext 32 def test_extension_type(): def default(obj): - print('default called', obj) + print("default called", obj) if isinstance(obj, array.array): typecode = 123 # application specific typecode data = tobytes(obj) return ExtType(typecode, data) - raise TypeError("Unknown type object %r" % (obj, )) + raise TypeError("Unknown type object %r" % (obj,)) def ext_hook(code, data): - print('ext_hook called', code, data) + print("ext_hook called", code, data) assert code == 123 - obj = array.array('d') + obj = array.array("d") frombytes(obj, data) return obj - obj = [42, b'hello', array.array('d', [1.1, 2.2, 3.3])] + obj = [42, b"hello", array.array("d", [1.1, 2.2, 3.3])] s = msgpack.packb(obj, default=default) obj2 = msgpack.unpackb(s, ext_hook=ext_hook) assert obj == obj2 diff --git a/pandas/tests/io/msgpack/test_format.py b/pandas/tests/io/msgpack/test_format.py index 3659602e1381f..46d0116bc3926 100644 --- a/pandas/tests/io/msgpack/test_format.py +++ b/pandas/tests/io/msgpack/test_format.py @@ -8,84 +8,77 @@ def check(src, should, use_list=0): def testSimpleValue(): - check(b"\x93\xc0\xc2\xc3", (None, False, True, )) + check(b"\x93\xc0\xc2\xc3", (None, False, True)) def testFixnum(): - 
check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0, - 64, - 127, ), - (-32, - -16, - -1, ), )) + check(b"\x92\x93\x00\x40\x7f\x93\xe0\xf0\xff", ((0, 64, 127), (-32, -16, -1))) def testFixArray(): - check(b"\x92\x90\x91\x91\xc0", ((), ((None, ), ), ), ) + check(b"\x92\x90\x91\x91\xc0", ((), ((None,),))) def testFixRaw(): - check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def", ), ) + check(b"\x94\xa0\xa1a\xa2bc\xa3def", (b"", b"a", b"bc", b"def")) def testFixMap(): - check(b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", - {False: {None: None}, - True: {None: {}}}, ) + check( + b"\x82\xc2\x81\xc0\xc0\xc3\x81\xc0\x80", {False: {None: None}, True: {None: {}}} + ) def testUnsignedInt(): - check(b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" - b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" - b"\xce\xff\xff\xff\xff", - (0, - 128, - 255, - 0, - 32768, - 65535, - 0, - 2147483648, - 4294967295, ), ) + check( + b"\x99\xcc\x00\xcc\x80\xcc\xff\xcd\x00\x00\xcd\x80\x00" + b"\xcd\xff\xff\xce\x00\x00\x00\x00\xce\x80\x00\x00\x00" + b"\xce\xff\xff\xff\xff", + (0, 128, 255, 0, 32768, 65535, 0, 2147483648, 4294967295), + ) def testSignedInt(): - check(b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" - b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" - b"\xd2\xff\xff\xff\xff", (0, - -128, - -1, - 0, - -32768, - -1, - 0, - -2147483648, - -1, )) + check( + b"\x99\xd0\x00\xd0\x80\xd0\xff\xd1\x00\x00\xd1\x80\x00" + b"\xd1\xff\xff\xd2\x00\x00\x00\x00\xd2\x80\x00\x00\x00" + b"\xd2\xff\xff\xff\xff", + (0, -128, -1, 0, -32768, -1, 0, -2147483648, -1), + ) def testRaw(): - check(b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" - b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", - (b"", b"a", b"ab", b"", b"a", b"ab")) + check( + b"\x96\xda\x00\x00\xda\x00\x01a\xda\x00\x02ab\xdb\x00\x00" + b"\x00\x00\xdb\x00\x00\x00\x01a\xdb\x00\x00\x00\x02ab", + (b"", b"a", b"ab", b"", b"a", b"ab"), + ) def testArray(): - check(b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" - b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" - b"\xc2\xc3", ((), (None, ), (False, True), (), (None, ), - (False, True))) + check( + b"\x96\xdc\x00\x00\xdc\x00\x01\xc0\xdc\x00\x02\xc2\xc3\xdd\x00" + b"\x00\x00\x00\xdd\x00\x00\x00\x01\xc0\xdd\x00\x00\x00\x02" + b"\xc2\xc3", + ((), (None,), (False, True), (), (None,), (False, True)), + ) def testMap(): - check(b"\x96" - b"\xde\x00\x00" - b"\xde\x00\x01\xc0\xc2" - b"\xde\x00\x02\xc0\xc2\xc3\xc2" - b"\xdf\x00\x00\x00\x00" - b"\xdf\x00\x00\x00\x01\xc0\xc2" - b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", ({}, {None: False}, - {True: False, - None: False}, {}, - {None: False}, - {True: False, - None: False})) + check( + b"\x96" + b"\xde\x00\x00" + b"\xde\x00\x01\xc0\xc2" + b"\xde\x00\x02\xc0\xc2\xc3\xc2" + b"\xdf\x00\x00\x00\x00" + b"\xdf\x00\x00\x00\x01\xc0\xc2" + b"\xdf\x00\x00\x00\x02\xc0\xc2\xc3\xc2", + ( + {}, + {None: False}, + {True: False, None: False}, + {}, + {None: False}, + {True: False, None: False}, + ), + ) diff --git a/pandas/tests/io/msgpack/test_limits.py b/pandas/tests/io/msgpack/test_limits.py index d90a9adfa5c87..4c0697f8faf64 100644 --- a/pandas/tests/io/msgpack/test_limits.py +++ b/pandas/tests/io/msgpack/test_limits.py @@ -5,12 +5,13 @@ class TestLimits: - def test_integer(self): x = -(2 ** 63) assert unpackb(packb(x)) == x - msg = (r"((long |Python )?(int )?too (big|large) to convert" - r"( to C (unsigned )?long))?") + msg = ( + r"((long |Python )?(int )?too (big|large) to convert" + r"( to C (unsigned )?long))?" 
+ ) with pytest.raises((OverflowError, ValueError), match=msg): packb(x - 1) x = 2 ** 64 - 1 @@ -31,14 +32,14 @@ def test_map_header(self): packer.pack_array_header(2 ** 32) def test_max_str_len(self): - d = 'x' * 3 + d = "x" * 3 packed = packb(d) - unpacker = Unpacker(max_str_len=3, encoding='utf-8') + unpacker = Unpacker(max_str_len=3, encoding="utf-8") unpacker.feed(packed) assert unpacker.unpack() == d - unpacker = Unpacker(max_str_len=2, encoding='utf-8') + unpacker = Unpacker(max_str_len=2, encoding="utf-8") unpacker.feed(packed) msg = "3 exceeds max_str_len" @@ -46,7 +47,7 @@ def test_max_str_len(self): unpacker.unpack() def test_max_bin_len(self): - d = b'x' * 3 + d = b"x" * 3 packed = packb(d, use_bin_type=True) unpacker = Unpacker(max_bin_len=3) diff --git a/pandas/tests/io/msgpack/test_newspec.py b/pandas/tests/io/msgpack/test_newspec.py index d92c649c5e1ca..a1cf966b9d253 100644 --- a/pandas/tests/io/msgpack/test_newspec.py +++ b/pandas/tests/io/msgpack/test_newspec.py @@ -4,65 +4,65 @@ def test_str8(): - header = b'\xd9' - data = b'x' * 32 + header = b"\xd9" + data = b"x" * 32 b = packb(data.decode(), use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\x20' + assert b[0:2] == header + b"\x20" assert b[2:] == data assert unpackb(b) == data - data = b'x' * 255 + data = b"x" * 255 b = packb(data.decode(), use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\xff' + assert b[0:2] == header + b"\xff" assert b[2:] == data assert unpackb(b) == data def test_bin8(): - header = b'\xc4' - data = b'' + header = b"\xc4" + data = b"" b = packb(data, use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\x00' + assert b[0:2] == header + b"\x00" assert b[2:] == data assert unpackb(b) == data - data = b'x' * 255 + data = b"x" * 255 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 2 - assert b[0:2] == header + b'\xff' + assert b[0:2] == header + b"\xff" assert b[2:] == data assert unpackb(b) == data def test_bin16(): - header = b'\xc5' - data = b'x' * 256 + header = b"\xc5" + data = b"x" * 256 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 3 assert b[0:1] == header - assert b[1:3] == b'\x01\x00' + assert b[1:3] == b"\x01\x00" assert b[3:] == data assert unpackb(b) == data - data = b'x' * 65535 + data = b"x" * 65535 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 3 assert b[0:1] == header - assert b[1:3] == b'\xff\xff' + assert b[1:3] == b"\xff\xff" assert b[3:] == data assert unpackb(b) == data def test_bin32(): - header = b'\xc6' - data = b'x' * 65536 + header = b"\xc6" + data = b"x" * 65536 b = packb(data, use_bin_type=True) assert len(b) == len(data) + 5 assert b[0:1] == header - assert b[1:5] == b'\x00\x01\x00\x00' + assert b[1:5] == b"\x00\x01\x00\x00" assert b[5:] == data assert unpackb(b) == data @@ -72,21 +72,19 @@ def check(ext, packed): assert packb(ext) == packed assert unpackb(packed) == ext - check(ExtType(0x42, b'Z'), b'\xd4\x42Z') # fixext 1 - check(ExtType(0x42, b'ZZ'), b'\xd5\x42ZZ') # fixext 2 - check(ExtType(0x42, b'Z' * 4), b'\xd6\x42' + b'Z' * 4) # fixext 4 - check(ExtType(0x42, b'Z' * 8), b'\xd7\x42' + b'Z' * 8) # fixext 8 - check(ExtType(0x42, b'Z' * 16), b'\xd8\x42' + b'Z' * 16) # fixext 16 + check(ExtType(0x42, b"Z"), b"\xd4\x42Z") # fixext 1 + check(ExtType(0x42, b"ZZ"), b"\xd5\x42ZZ") # fixext 2 + check(ExtType(0x42, b"Z" * 4), b"\xd6\x42" + b"Z" * 4) # fixext 4 + check(ExtType(0x42, b"Z" * 8), b"\xd7\x42" + b"Z" * 8) # fixext 8 + 
check(ExtType(0x42, b"Z" * 16), b"\xd8\x42" + b"Z" * 16) # fixext 16 # ext 8 - check(ExtType(0x42, b''), b'\xc7\x00\x42') - check(ExtType(0x42, b'Z' * 255), b'\xc7\xff\x42' + b'Z' * 255) + check(ExtType(0x42, b""), b"\xc7\x00\x42") + check(ExtType(0x42, b"Z" * 255), b"\xc7\xff\x42" + b"Z" * 255) # ext 16 - check(ExtType(0x42, b'Z' * 256), b'\xc8\x01\x00\x42' + b'Z' * 256) - check(ExtType(0x42, b'Z' * 0xffff), b'\xc8\xff\xff\x42' + b'Z' * 0xffff) + check(ExtType(0x42, b"Z" * 256), b"\xc8\x01\x00\x42" + b"Z" * 256) + check(ExtType(0x42, b"Z" * 0xFFFF), b"\xc8\xff\xff\x42" + b"Z" * 0xFFFF) # ext 32 - check( - ExtType(0x42, b'Z' * - 0x10000), b'\xc9\x00\x01\x00\x00\x42' + b'Z' * 0x10000) + check(ExtType(0x42, b"Z" * 0x10000), b"\xc9\x00\x01\x00\x00\x42" + b"Z" * 0x10000) # needs large memory # check(ExtType(0x42, b'Z'*0xffffffff), # b'\xc9\xff\xff\xff\xff\x42' + b'Z'*0xffffffff) diff --git a/pandas/tests/io/msgpack/test_obj.py b/pandas/tests/io/msgpack/test_obj.py index 342c00f49ebff..03d8807c0922c 100644 --- a/pandas/tests/io/msgpack/test_obj.py +++ b/pandas/tests/io/msgpack/test_obj.py @@ -10,47 +10,44 @@ class DecodeError(Exception): class TestObj: - def _arr_to_str(self, arr): - return ''.join(str(c) for c in arr) + return "".join(str(c) for c in arr) def bad_complex_decoder(self, o): raise DecodeError("Ooops!") def _decode_complex(self, obj): - if b'__complex__' in obj: - return complex(obj[b'real'], obj[b'imag']) + if b"__complex__" in obj: + return complex(obj[b"real"], obj[b"imag"]) return obj def _encode_complex(self, obj): if isinstance(obj, complex): - return {b'__complex__': True, b'real': 1, b'imag': 2} + return {b"__complex__": True, b"real": 1, b"imag": 2} return obj def test_encode_hook(self): packed = packb([3, 1 + 2j], default=self._encode_complex) unpacked = unpackb(packed, use_list=1) - assert unpacked[1] == {b'__complex__': True, b'real': 1, b'imag': 2} + assert unpacked[1] == {b"__complex__": True, b"real": 1, b"imag": 2} def test_decode_hook(self): - packed = packb([3, {b'__complex__': True, b'real': 1, b'imag': 2}]) - unpacked = unpackb(packed, object_hook=self._decode_complex, - use_list=1) + packed = packb([3, {b"__complex__": True, b"real": 1, b"imag": 2}]) + unpacked = unpackb(packed, object_hook=self._decode_complex, use_list=1) assert unpacked[1] == 1 + 2j def test_decode_pairs_hook(self): packed = packb([3, {1: 2, 3: 4}]) prod_sum = 1 * 2 + 3 * 4 unpacked = unpackb( - packed, object_pairs_hook=lambda l: sum(k * v for k, v in l), - use_list=1) + packed, object_pairs_hook=lambda l: sum(k * v for k, v in l), use_list=1 + ) assert unpacked[1] == prod_sum def test_only_one_obj_hook(self): msg = "object_pairs_hook and object_hook are mutually exclusive" with pytest.raises(TypeError, match=msg): - unpackb(b'', object_hook=lambda x: x, - object_pairs_hook=lambda x: x) + unpackb(b"", object_hook=lambda x: x, object_pairs_hook=lambda x: x) def test_bad_hook(self): msg = r"can't serialize \(1\+2j\)" @@ -61,14 +58,14 @@ def test_bad_hook(self): def test_array_hook(self): packed = packb([1, 2, 3]) unpacked = unpackb(packed, list_hook=self._arr_to_str, use_list=1) - assert unpacked == '123' + assert unpacked == "123" def test_an_exception_in_objecthook1(self): - with pytest.raises(DecodeError, match='Ooops!'): - packed = packb({1: {'__complex__': True, 'real': 1, 'imag': 2}}) + with pytest.raises(DecodeError, match="Ooops!"): + packed = packb({1: {"__complex__": True, "real": 1, "imag": 2}}) unpackb(packed, object_hook=self.bad_complex_decoder) def 
test_an_exception_in_objecthook2(self): - with pytest.raises(DecodeError, match='Ooops!'): - packed = packb({1: [{'__complex__': True, 'real': 1, 'imag': 2}]}) + with pytest.raises(DecodeError, match="Ooops!"): + packed = packb({1: [{"__complex__": True, "real": 1, "imag": 2}]}) unpackb(packed, list_hook=self.bad_complex_decoder, use_list=1) diff --git a/pandas/tests/io/msgpack/test_pack.py b/pandas/tests/io/msgpack/test_pack.py index ba9f1ae57741d..5fc24027589cb 100644 --- a/pandas/tests/io/msgpack/test_pack.py +++ b/pandas/tests/io/msgpack/test_pack.py @@ -9,19 +9,38 @@ class TestPack: - def check(self, data, use_list=False): re = unpackb(packb(data), use_list=use_list) assert re == data def testPack(self): test_data = [ - 0, 1, 127, 128, 255, 256, 65535, 65536, - -1, -32, -33, -128, -129, -32768, -32769, + 0, + 1, + 127, + 128, + 255, + 256, + 65535, + 65536, + -1, + -32, + -33, + -128, + -129, + -32768, + -32769, 1.0, - b"", b"a", b"a" * 31, b"a" * 32, - None, True, False, - (), ((),), ((), None,), + b"", + b"a", + b"a" * 31, + b"a" * 32, + None, + True, + False, + (), + ((),), + ((), None), {None: 0}, (1 << 23), ] @@ -29,50 +48,54 @@ def testPack(self): self.check(td) def testPackUnicode(self): - test_data = ["", "abcd", ["defgh"], "Русский текст", ] + test_data = ["", "abcd", ["defgh"], "Русский текст"] for td in test_data: - re = unpackb( - packb(td, encoding='utf-8'), use_list=1, encoding='utf-8') + re = unpackb(packb(td, encoding="utf-8"), use_list=1, encoding="utf-8") assert re == td - packer = Packer(encoding='utf-8') + packer = Packer(encoding="utf-8") data = packer.pack(td) - re = Unpacker(BytesIO(data), encoding='utf-8', use_list=1).unpack() + re = Unpacker(BytesIO(data), encoding="utf-8", use_list=1).unpack() assert re == td def testPackUTF32(self): test_data = ["", "abcd", ["defgh"], "Русский текст"] for td in test_data: - re = unpackb( - packb(td, encoding='utf-32'), use_list=1, encoding='utf-32') + re = unpackb(packb(td, encoding="utf-32"), use_list=1, encoding="utf-32") assert re == td def testPackBytes(self): - test_data = [b"", b"abcd", (b"defgh", ), ] + test_data = [b"", b"abcd", (b"defgh",)] for td in test_data: self.check(td) def testIgnoreUnicodeErrors(self): re = unpackb( - packb(b'abc\xeddef'), encoding='utf-8', unicode_errors='ignore', - use_list=1) + packb(b"abc\xeddef"), encoding="utf-8", unicode_errors="ignore", use_list=1 + ) assert re == "abcdef" def testStrictUnicodeUnpack(self): - msg = (r"'utf-*8' codec can't decode byte 0xed in position 3:" - " invalid continuation byte") + msg = ( + r"'utf-*8' codec can't decode byte 0xed in position 3:" + " invalid continuation byte" + ) with pytest.raises(UnicodeDecodeError, match=msg): - unpackb(packb(b'abc\xeddef'), encoding='utf-8', use_list=1) + unpackb(packb(b"abc\xeddef"), encoding="utf-8", use_list=1) def testStrictUnicodePack(self): - msg = (r"'ascii' codec can't encode character '\\xed' in position 3:" - r" ordinal not in range\(128\)") + msg = ( + r"'ascii' codec can't encode character '\\xed' in position 3:" + r" ordinal not in range\(128\)" + ) with pytest.raises(UnicodeEncodeError, match=msg): - packb("abc\xeddef", encoding='ascii', unicode_errors='strict') + packb("abc\xeddef", encoding="ascii", unicode_errors="strict") def testIgnoreErrorsPack(self): re = unpackb( - packb("abcФФФdef", encoding='ascii', unicode_errors='ignore'), - encoding='utf-8', use_list=1) + packb("abcФФФdef", encoding="ascii", unicode_errors="ignore"), + encoding="utf-8", + use_list=1, + ) assert re == "abcdef" def 
testNoEncoding(self): @@ -85,10 +108,8 @@ def testDecodeBinary(self): assert re == b"abc" def testPackFloat(self): - assert packb(1.0, - use_single_float=True) == b'\xca' + struct.pack('>f', 1.0) - assert packb( - 1.0, use_single_float=False) == b'\xcb' + struct.pack('>d', 1.0) + assert packb(1.0, use_single_float=True) == b"\xca" + struct.pack(">f", 1.0) + assert packb(1.0, use_single_float=False) == b"\xcb" + struct.pack(">d", 1.0) def testArraySize(self, sizes=[0, 5, 50, 1000]): bio = BytesIO() @@ -116,7 +137,7 @@ def test_manualreset(self, sizes=[0, 5, 50, 1000]): assert unpacker.unpack() == list(range(size)) packer.reset() - assert packer.bytes() == b'' + assert packer.bytes() == b"" def testMapSize(self, sizes=[0, 5, 50, 1000]): bio = BytesIO() @@ -133,18 +154,17 @@ def testMapSize(self, sizes=[0, 5, 50, 1000]): assert unpacker.unpack() == {i: i * 2 for i in range(size)} def test_odict(self): - seq = [(b'one', 1), (b'two', 2), (b'three', 3), (b'four', 4)] + seq = [(b"one", 1), (b"two", 2), (b"three", 3), (b"four", 4)] od = OrderedDict(seq) assert unpackb(packb(od), use_list=1) == dict(seq) def pair_hook(seq): return list(seq) - assert unpackb( - packb(od), object_pairs_hook=pair_hook, use_list=1) == seq + assert unpackb(packb(od), object_pairs_hook=pair_hook, use_list=1) == seq def test_pairlist(self): - pairlist = [(b'a', 1), (2, b'b'), (b'foo', b'bar')] + pairlist = [(b"a", 1), (2, b"b"), (b"foo", b"bar")] packer = Packer() packed = packer.pack_map_pairs(pairlist) unpacked = unpackb(packed, object_pairs_hook=list) diff --git a/pandas/tests/io/msgpack/test_read_size.py b/pandas/tests/io/msgpack/test_read_size.py index 42791b571e8e7..7d2b539f12085 100644 --- a/pandas/tests/io/msgpack/test_read_size.py +++ b/pandas/tests/io/msgpack/test_read_size.py @@ -6,29 +6,29 @@ def test_read_array_header(): unpacker = Unpacker() - unpacker.feed(packb(['a', 'b', 'c'])) + unpacker.feed(packb(["a", "b", "c"])) assert unpacker.read_array_header() == 3 - assert unpacker.unpack() == b'a' - assert unpacker.unpack() == b'b' - assert unpacker.unpack() == b'c' + assert unpacker.unpack() == b"a" + assert unpacker.unpack() == b"b" + assert unpacker.unpack() == b"c" try: unpacker.unpack() - assert 0, 'should raise exception' + assert 0, "should raise exception" except OutOfData: - assert 1, 'okay' + assert 1, "okay" def test_read_map_header(): unpacker = Unpacker() - unpacker.feed(packb({'a': 'A'})) + unpacker.feed(packb({"a": "A"})) assert unpacker.read_map_header() == 1 - assert unpacker.unpack() == B'a' - assert unpacker.unpack() == B'A' + assert unpacker.unpack() == b"a" + assert unpacker.unpack() == b"A" try: unpacker.unpack() - assert 0, 'should raise exception' + assert 0, "should raise exception" except OutOfData: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_array(): @@ -36,9 +36,9 @@ def test_incorrect_type_array(): unpacker.feed(packb(1)) try: unpacker.read_array_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_map(): @@ -46,26 +46,26 @@ def test_incorrect_type_map(): unpacker.feed(packb(1)) try: unpacker.read_map_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_correct_type_nested_array(): unpacker = Unpacker() - unpacker.feed(packb({'a': ['b', 'c', 'd']})) + unpacker.feed(packb({"a": ["b", "c", "d"]})) try: unpacker.read_array_header() - 
assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" def test_incorrect_type_nested_map(): unpacker = Unpacker() - unpacker.feed(packb([{'a': 'b'}])) + unpacker.feed(packb([{"a": "b"}])) try: unpacker.read_map_header() - assert 0, 'should raise exception' + assert 0, "should raise exception" except UnexpectedTypeException: - assert 1, 'okay' + assert 1, "okay" diff --git a/pandas/tests/io/msgpack/test_seq.py b/pandas/tests/io/msgpack/test_seq.py index 68be8c2d975aa..c4ac13980bc67 100644 --- a/pandas/tests/io/msgpack/test_seq.py +++ b/pandas/tests/io/msgpack/test_seq.py @@ -8,7 +8,7 @@ def gen_binary_data(idx): - return binarydata[:idx % 300] + return binarydata[: idx % 300] def test_exceeding_unpacker_read_size(): diff --git a/pandas/tests/io/msgpack/test_sequnpack.py b/pandas/tests/io/msgpack/test_sequnpack.py index ea1e5035c7834..79feb78b3b013 100644 --- a/pandas/tests/io/msgpack/test_sequnpack.py +++ b/pandas/tests/io/msgpack/test_sequnpack.py @@ -7,7 +7,6 @@ class TestPack: - def test_partial_data(self): unpacker = Unpacker() msg = "No more data to unpack" @@ -22,34 +21,34 @@ def test_partial_data(self): def test_foobar(self): unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'o') - assert unpacker.unpack() == ord(b'b') - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") + assert unpacker.unpack() == ord(b"o") + assert unpacker.unpack() == ord(b"o") + assert unpacker.unpack() == ord(b"b") + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") msg = "No more data to unpack" with pytest.raises(OutOfData, match=msg): unpacker.unpack() - unpacker.feed(b'foo') - unpacker.feed(b'bar') + unpacker.feed(b"foo") + unpacker.feed(b"bar") k = 0 - for o, e in zip(unpacker, 'foobarbaz'): + for o, e in zip(unpacker, "foobarbaz"): assert o == ord(e) k += 1 - assert k == len(b'foobar') + assert k == len(b"foobar") def test_foobar_skip(self): unpacker = Unpacker(read_size=3, use_list=1) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") unpacker.skip() - assert unpacker.unpack() == ord(b'o') + assert unpacker.unpack() == ord(b"o") unpacker.skip() - assert unpacker.unpack() == ord(b'a') + assert unpacker.unpack() == ord(b"a") unpacker.skip() msg = "No more data to unpack" with pytest.raises(OutOfData, match=msg): @@ -62,42 +61,42 @@ def test_maxbuffersize_read_size_exceeds_max_buffer_size(self): def test_maxbuffersize_bufferfull(self): unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b'foo') - with pytest.raises(BufferFull, match=r'^$'): - unpacker.feed(b'b') + unpacker.feed(b"foo") + with pytest.raises(BufferFull, match=r"^$"): + unpacker.feed(b"b") def test_maxbuffersize(self): unpacker = Unpacker(read_size=3, max_buffer_size=3, use_list=1) - unpacker.feed(b'foo') - assert ord('f') == next(unpacker) - unpacker.feed(b'b') - assert ord('o') == next(unpacker) - assert ord('o') == next(unpacker) - assert ord('b') == next(unpacker) + unpacker.feed(b"foo") + assert ord("f") == next(unpacker) + unpacker.feed(b"b") + assert ord("o") == next(unpacker) + assert ord("o") == next(unpacker) + assert ord("b") == next(unpacker) def test_readbytes(self): unpacker = 
Unpacker(read_size=3) - unpacker.feed(b'foobar') - assert unpacker.unpack() == ord(b'f') - assert unpacker.read_bytes(3) == b'oob' - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker.feed(b"foobar") + assert unpacker.unpack() == ord(b"f") + assert unpacker.read_bytes(3) == b"oob" + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") # Test buffer refill - unpacker = Unpacker(BytesIO(b'foobar'), read_size=3) - assert unpacker.unpack() == ord(b'f') - assert unpacker.read_bytes(3) == b'oob' - assert unpacker.unpack() == ord(b'a') - assert unpacker.unpack() == ord(b'r') + unpacker = Unpacker(BytesIO(b"foobar"), read_size=3) + assert unpacker.unpack() == ord(b"f") + assert unpacker.read_bytes(3) == b"oob" + assert unpacker.unpack() == ord(b"a") + assert unpacker.unpack() == ord(b"r") def test_issue124(self): unpacker = Unpacker() - unpacker.feed(b'\xa1?\xa1!') - assert tuple(unpacker) == (b'?', b'!') + unpacker.feed(b"\xa1?\xa1!") + assert tuple(unpacker) == (b"?", b"!") assert tuple(unpacker) == () unpacker.feed(b"\xa1?\xa1") - assert tuple(unpacker) == (b'?', ) + assert tuple(unpacker) == (b"?",) assert tuple(unpacker) == () unpacker.feed(b"!") - assert tuple(unpacker) == (b'!', ) + assert tuple(unpacker) == (b"!",) assert tuple(unpacker) == () diff --git a/pandas/tests/io/msgpack/test_subtype.py b/pandas/tests/io/msgpack/test_subtype.py index 8af7e0b91d9b7..c82f6f6d3bf4e 100644 --- a/pandas/tests/io/msgpack/test_subtype.py +++ b/pandas/tests/io/msgpack/test_subtype.py @@ -17,7 +17,7 @@ class MyTuple(tuple): pass -MyNamedTuple = namedtuple('MyNamedTuple', 'x y') +MyNamedTuple = namedtuple("MyNamedTuple", "x y") def test_types(): diff --git a/pandas/tests/io/msgpack/test_unpack.py b/pandas/tests/io/msgpack/test_unpack.py index f33e0865a1145..483e09efe6bb8 100644 --- a/pandas/tests/io/msgpack/test_unpack.py +++ b/pandas/tests/io/msgpack/test_unpack.py @@ -7,7 +7,6 @@ class TestUnpack: - def test_unpack_array_header_from_file(self): f = BytesIO(packb([1, 2, 3, 4])) unpacker = Unpacker(f) @@ -21,8 +20,8 @@ def test_unpack_array_header_from_file(self): unpacker.unpack() def test_unpacker_hook_refcnt(self): - if not hasattr(sys, 'getrefcount'): - pytest.skip('no sys.getrefcount()') + if not hasattr(sys, "getrefcount"): + pytest.skip("no sys.getrefcount()") result = [] def hook(x): @@ -47,9 +46,8 @@ def hook(x): def test_unpacker_ext_hook(self): class MyUnpacker(Unpacker): - def __init__(self): - super().__init__(ext_hook=self._hook, encoding='utf-8') + super().__init__(ext_hook=self._hook, encoding="utf-8") def _hook(self, code, data): if code == 1: @@ -58,9 +56,9 @@ def _hook(self, code, data): return ExtType(code, data) unpacker = MyUnpacker() - unpacker.feed(packb({'a': 1}, encoding='utf-8')) - assert unpacker.unpack() == {'a': 1} - unpacker.feed(packb({'a': ExtType(1, b'123')}, encoding='utf-8')) - assert unpacker.unpack() == {'a': 123} - unpacker.feed(packb({'a': ExtType(2, b'321')}, encoding='utf-8')) - assert unpacker.unpack() == {'a': ExtType(2, b'321')} + unpacker.feed(packb({"a": 1}, encoding="utf-8")) + assert unpacker.unpack() == {"a": 1} + unpacker.feed(packb({"a": ExtType(1, b"123")}, encoding="utf-8")) + assert unpacker.unpack() == {"a": 123} + unpacker.feed(packb({"a": ExtType(2, b"321")}, encoding="utf-8")) + assert unpacker.unpack() == {"a": ExtType(2, b"321")} diff --git a/pandas/tests/io/msgpack/test_unpack_raw.py b/pandas/tests/io/msgpack/test_unpack_raw.py index 09ebb681d8709..f844553bfc34a 100644 --- 
a/pandas/tests/io/msgpack/test_unpack_raw.py +++ b/pandas/tests/io/msgpack/test_unpack_raw.py @@ -7,16 +7,16 @@ def test_write_bytes(): unpacker = Unpacker() - unpacker.feed(b'abc') + unpacker.feed(b"abc") f = io.BytesIO() - assert unpacker.unpack(f.write) == ord('a') - assert f.getvalue() == b'a' + assert unpacker.unpack(f.write) == ord("a") + assert f.getvalue() == b"a" f = io.BytesIO() assert unpacker.skip(f.write) is None - assert f.getvalue() == b'b' + assert f.getvalue() == b"b" f = io.BytesIO() assert unpacker.skip() is None - assert f.getvalue() == b'' + assert f.getvalue() == b"" def test_write_bytes_multi_buffer(): diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 8e35b58b90c48..2c347a096006a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -12,8 +12,7 @@ class BaseParser: def update_kwargs(self, kwargs): kwargs = kwargs.copy() - kwargs.update(dict(engine=self.engine, - low_memory=self.low_memory)) + kwargs.update(dict(engine=self.engine, low_memory=self.low_memory)) return kwargs @@ -67,19 +66,16 @@ def csv1(csv_dir_path): _all_parser_ids = _c_parser_ids + _py_parser_ids -@pytest.fixture(params=_all_parsers, - ids=_all_parser_ids) +@pytest.fixture(params=_all_parsers, ids=_all_parser_ids) def all_parsers(request): return request.param -@pytest.fixture(params=_c_parsers_only, - ids=_c_parser_ids) +@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) def c_parser_only(request): return request.param -@pytest.fixture(params=_py_parsers_only, - ids=_py_parser_ids) +@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): return request.param diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 13f547bde692f..77b52eb90d61f 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -22,10 +22,9 @@ @pytest.mark.parametrize( "malformed", - ["1\r1\r1\r 1\r 1\r", - "1\r1\r1\r 1\r 1\r11\r", - "1\r1\r1\r 1\r 1\r11\r1\r"], - ids=["words pointer", "stream pointer", "lines pointer"]) + ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"], + ids=["words pointer", "stream pointer", "lines pointer"], +) def test_buffer_overflow(c_parser_only, malformed): # see gh-9205: test certain malformed input files that cause # buffer overflows in tokenizer.c @@ -41,17 +40,17 @@ def test_buffer_rd_bytes(c_parser_only): # to a segfault if a corrupt gzip file is read with 'read_csv', and the # buffer is filled more than once before gzip raises an Exception. 
- data = "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" \ - "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" \ - "\xA6\x4D" + "\x55" * 267 + \ - "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" \ - "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" + data = ( + "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09" + "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0" + "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00" + "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO" + ) parser = c_parser_only for _ in range(100): try: - parser.read_csv(StringIO(data), compression="gzip", - delim_whitespace=True) + parser.read_csv(StringIO(data), compression="gzip", delim_whitespace=True) except Exception: pass @@ -61,10 +60,8 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - df = parser.read_csv(StringIO(data), lineterminator="~", - delim_whitespace=True) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["a", "b", "c"]) + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) @@ -82,18 +79,15 @@ def test_dtype_and_names_error(c_parser_only): expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) tm.assert_frame_equal(result, expected) - result = parser.read_csv(StringIO(data), sep=r"\s+", - header=None, names=["a", "b"]) - expected = DataFrame( - [[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"]) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) # fallback casting - result = parser.read_csv(StringIO( - data), sep=r"\s+", header=None, - names=["a", "b"], dtype={"a": np.int32}) - expected = DataFrame([[1, 1], [2, 2], [3, 3]], - columns=["a", "b"]) + result = parser.read_csv( + StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32} + ) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"]) expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(result, expected) @@ -104,31 +98,46 @@ def test_dtype_and_names_error(c_parser_only): """ # fallback casting, but not castable with pytest.raises(ValueError, match="cannot safely convert"): - parser.read_csv(StringIO(data), sep=r"\s+", header=None, - names=["a", "b"], dtype={"a": np.int32}) - - -@pytest.mark.parametrize("match,kwargs", [ - # For each of these cases, all of the dtypes are valid, just unsupported. 
- (("the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead"), - dict(dtype={"A": "datetime64", "B": "float64"})), - - (("the dtype datetime64 is not supported for parsing, " - "pass this column using parse_dates instead"), - dict(dtype={"A": "datetime64", "B": "float64"}, - parse_dates=["B"])), + parser.read_csv( + StringIO(data), + sep=r"\s+", + header=None, + names=["a", "b"], + dtype={"a": np.int32}, + ) - ("the dtype timedelta64 is not supported for parsing", - dict(dtype={"A": "timedelta64", "B": "float64"})), - ("the dtype 262144b) parser = c_parser_only - header_narrow = "\t".join(["COL_HEADER_" + str(i) - for i in range(10)]) + "\n" - data_narrow = "\t".join(["somedatasomedatasomedata1" - for _ in range(10)]) + "\n" - header_wide = "\t".join(["COL_HEADER_" + str(i) - for i in range(15)]) + "\n" - data_wide = "\t".join(["somedatasomedatasomedata2" - for _ in range(15)]) + "\n" - test_input = (header_narrow + data_narrow * 1050 + - header_wide + data_wide * 2) + header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n" + data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n" + header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n" + data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n" + test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2 df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010) @@ -414,8 +434,7 @@ def test_read_nrows_large(c_parser_only): def test_float_precision_round_trip_with_text(c_parser_only): # see gh-15140 parser = c_parser_only - df = parser.read_csv(StringIO("a"), header=None, - float_precision="round_trip") + df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip") tm.assert_frame_equal(df, DataFrame({0: ["a"]})) @@ -439,7 +458,7 @@ def test_data_after_quote(c_parser_only): # see gh-15910 parser = c_parser_only - data = "a\n1\n\"b\"a" + data = 'a\n1\n"b"a' result = parser.read_csv(StringIO(data)) expected = DataFrame({"a": ["1", "ba"]}) @@ -459,18 +478,19 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = parser.read_csv(StringIO(test_input), comment="#", header=None, - delimiter="\\s+", skiprows=0, - error_bad_lines=False) + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + error_bad_lines=False, + ) captured = capsys.readouterr() # skipped lines 2, 3, 4, 9 for line_num in (2, 3, 4, 9): assert "Skipping line {}".format(line_num) in captured.err - expected = DataFrame([[1, 2], - [5, 2], - [6, 2], - [7, np.nan], - [8, np.nan]]) + expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) @@ -531,8 +551,7 @@ def test_bytes_exceed_2gb(c_parser_only): if parser.low_memory: pytest.skip("not a high_memory test") - csv = StringIO("strings\n" + "\n".join( - ["x" * (1 << 20) for _ in range(2100)])) + csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)])) df = parser.read_csv(csv) assert not df.empty diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 89c9f300b2cf4..e1d422142ab0b 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -18,18 +18,16 @@ def test_comment(all_parsers, na_values): 1,2.,4.#hello world 5.,NaN,10.0 """ - expected = DataFrame([[1., 2., 4.], [5., np.nan, 
10.]], - columns=["A", "B", "C"]) - result = parser.read_csv(StringIO(data), comment="#", - na_values=na_values) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) + result = parser.read_csv(StringIO(data), comment="#", na_values=na_values) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("read_kwargs", [ - dict(), - dict(lineterminator="*"), - dict(delim_whitespace=True), -]) +@pytest.mark.parametrize( + "read_kwargs", [dict(), dict(lineterminator="*"), dict(delim_whitespace=True)] +) def test_line_comment(all_parsers, read_kwargs): parser = all_parsers data = """# empty @@ -49,8 +47,9 @@ def test_line_comment(all_parsers, read_kwargs): read_kwargs["comment"] = "#" result = parser.read_csv(StringIO(data), **read_kwargs) - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) tm.assert_frame_equal(result, expected) @@ -65,8 +64,9 @@ def test_comment_skiprows(all_parsers): 5.,NaN,10.0 """ # This should ignore the first four lines (including comments). - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", skiprows=4) tm.assert_frame_equal(result, expected) @@ -81,8 +81,9 @@ def test_comment_header(all_parsers): 5.,NaN,10.0 """ # Header should begin at the second non-comment line. - expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", header=1) tm.assert_frame_equal(result, expected) @@ -101,8 +102,9 @@ def test_comment_skiprows_header(all_parsers): # Skiprows should skip the first 4 lines (including comments), # while header should start from the second non-commented line, # starting with line 5. 
- expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) tm.assert_frame_equal(result, expected) @@ -111,8 +113,9 @@ def test_comment_skiprows_header(all_parsers): def test_custom_comment_char(all_parsers, comment_char): parser = all_parsers data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" - result = parser.read_csv(StringIO(data.replace("#", comment_char)), - comment=comment_char) + result = parser.read_csv( + StringIO(data.replace("#", comment_char)), comment=comment_char + ) expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c74e57627d679..7d5bf9ec850bc 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -55,17 +55,16 @@ def _set_noconvert_columns(self): parse_dates = [[1, 2]] cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) parser = MyTextFileReader() - parser.options = {"usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ","} + parser.options = { + "usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ",", + } parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() @@ -104,14 +103,14 @@ def test_bad_stream_exception(all_parsers, csv_dir_path): # and swallowing the exception that caused read to fail. path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") codec = codecs.lookup("utf-8") - utf8 = codecs.lookup('utf-8') + utf8 = codecs.lookup("utf-8") parser = all_parsers msg = "'utf-8' codec can't decode byte" # Stream must be binary UTF8. 
with open(path, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) as stream: + handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter + ) as stream: with pytest.raises(UnicodeDecodeError, match=msg): parser.read_csv(stream) @@ -124,21 +123,30 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007]], - columns=["A", "B", "C", "D"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11)], name="index")) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) tm.assert_frame_equal(result, expected) @@ -148,11 +156,7 @@ def test_1000_sep(all_parsers): 1|2,334|5 10|13|10. """ - expected = DataFrame({ - "A": [1, 10], - "B": [2334, 13], - "C": [5, 10.] 
- }) + expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected) @@ -168,8 +172,7 @@ def test_squeeze(all_parsers): index = Index(["a", "b", "c"], name=0) expected = Series([1, 2, 3], name=1, index=index) - result = parser.read_csv(StringIO(data), index_col=0, - header=None, squeeze=True) + result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True) tm.assert_series_equal(result, expected) # see gh-8217 @@ -203,9 +206,10 @@ def test_malformed_chunks(all_parsers, nrows): 2,3,4 """ parser = all_parsers - msg = 'Expected 3 fields in line 6, saw 5' - reader = parser.read_csv(StringIO(data), header=1, comment="#", - iterator=True, chunksize=1, skiprows=[2]) + msg = "Expected 3 fields in line 6, saw 5" + reader = parser.read_csv( + StringIO(data), header=1, comment="#", iterator=True, chunksize=1, skiprows=[2] + ) with pytest.raises(ParserError, match=msg): reader.read(nrows) @@ -218,12 +222,11 @@ def test_unnamed_columns(all_parsers): 11,12,13,14,15 """ parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], - dtype=np.int64, columns=["A", "B", "C", - "Unnamed: 3", - "Unnamed: 4"]) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + dtype=np.int64, + columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"], + ) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -235,9 +238,7 @@ def test_csv_mixed_type(all_parsers): c,4,5 """ parser = all_parsers - expected = DataFrame({"A": ["a", "b", "c"], - "B": [1, 3, 4], - "C": [2, 4, 5]}) + expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -254,8 +255,7 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): 2,2,3,4 3,3,4,5 """ - result = parser.read_csv(StringIO(data), low_memory=True, - index_col=0, nrows=0) + result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) expected = DataFrame(columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -264,21 +264,30 @@ def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], - [1.047916, -0.041232, -0.16181208307, 0.212549], - [0.498581, 0.731168, -0.537677223318, 1.346270], - [1.120202, 1.567621, 0.00364077397681, 0.675253], - [-0.487094, 0.571455, -1.6116394093, 0.103469], - [0.836649, 0.246462, 0.588542635376, 1.062782], - [-0.157161, 1.340307, 1.1957779562, -1.097007]], - columns=["A", "B", "C", "D"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7), - datetime(2000, 1, 10), - datetime(2000, 1, 11)], name="index")) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007], + ], + columns=["A", "B", "C", "D"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 
10), + datetime(2000, 1, 11), + ], + name="index", + ), + ) tm.assert_frame_equal(result, expected) @@ -287,22 +296,25 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): csv2 = os.path.join(csv_dir_path, "test2.csv") result = parser.read_csv(csv2, index_col=0, parse_dates=True) - expected = DataFrame([[0.980269, 3.685731, -0.364216805298, - -1.159738, "foo"], - [1.047916, -0.041232, -0.16181208307, - 0.212549, "bar"], - [0.498581, 0.731168, -0.537677223318, - 1.346270, "baz"], - [1.120202, 1.567621, 0.00364077397681, - 0.675253, "qux"], - [-0.487094, 0.571455, -1.6116394093, - 0.103469, "foo2"]], - columns=["A", "B", "C", "D", "E"], - index=Index([datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5), - datetime(2000, 1, 6), - datetime(2000, 1, 7)])) + expected = DataFrame( + [ + [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"], + ], + columns=["A", "B", "C", "D", "E"], + index=Index( + [ + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + ] + ), + ) tm.assert_frame_equal(result, expected) @@ -341,12 +353,18 @@ def test_read_duplicate_index_explicit(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=0) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], - [12, 13, 14, 15], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", - "qux", "foo", "bar"], name="index")) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"], name="index"), + ) tm.assert_frame_equal(result, expected) @@ -362,29 +380,49 @@ def test_read_duplicate_index_implicit(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], - [12, 13, 14, 15], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=Index(["foo", "bar", "baz", - "qux", "foo", "bar"])) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", "qux", "foo", "bar"]), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("A,B\nTrue,1\nFalse,2\nTrue,3", dict(), - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), - ("A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", - dict(true_values=["yes", "Yes", "YES"], - false_values=["no", "NO", "No"]), - DataFrame([[True, 1], [False, 2], [True, 3], - [False, 3], [True, 3]], columns=["A", "B"])), - ("A,B\nTRUE,1\nFALSE,2\nTRUE,3", dict(), - DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), - ("A,B\nfoo,bar\nbar,foo", dict(true_values=["foo"], - false_values=["bar"]), - DataFrame([[True, False], [False, True]], columns=["A", "B"])) -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "A,B\nTrue,1\nFalse,2\nTrue,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + 
dict(true_values=["yes", "Yes", "YES"], false_values=["no", "NO", "No"]), + DataFrame( + [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]], + columns=["A", "B"], + ), + ), + ( + "A,B\nTRUE,1\nFALSE,2\nTRUE,3", + dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]), + ), + ( + "A,B\nfoo,bar\nbar,foo", + dict(true_values=["foo"], false_values=["bar"]), + DataFrame([[True, False], [False, True]], columns=["A", "B"]), + ), + ], +) def test_parse_bool(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) @@ -415,10 +453,10 @@ def test_read_nrows(all_parsers, nrows): foo2,12,13,14,15 bar2,12,13,14,15 """ - expected = DataFrame([["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"]) + expected = DataFrame( + [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"], + ) parser = all_parsers result = parser.read_csv(StringIO(data), nrows=nrows) @@ -455,13 +493,17 @@ def test_read_chunksize_with_index(all_parsers, index_col): """ reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) - expected = DataFrame([["foo", 2, 3, 4, 5], - ["bar", 7, 8, 9, 10], - ["baz", 12, 13, 14, 15], - ["qux", 12, 13, 14, 15], - ["foo2", 12, 13, 14, 15], - ["bar2", 12, 13, 14, 15]], - columns=["index", "A", "B", "C", "D"]) + expected = DataFrame( + [ + ["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15], + ], + columns=["index", "A", "B", "C", "D"], + ) expected = expected.set_index("index") chunks = list(reader) @@ -578,8 +620,7 @@ def test_read_data_list(all_parsers): kwargs = dict(index_col=0) data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" - data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], - ["bar", "4", "5", "6"]] + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]] expected = parser.read_csv(StringIO(data), **kwargs) parser = TextParser(data_list, chunksize=2, **kwargs) @@ -622,9 +663,11 @@ def test_iterator2(all_parsers): reader = parser.read_csv(StringIO(data), iterator=True) result = list(reader) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result[0], expected) @@ -685,18 +728,17 @@ def test_iterator_stop_on_chunksize(all_parsers): result = list(reader) assert len(result) == 3 - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(concat(result), expected) -@pytest.mark.parametrize("kwargs", [ - dict(iterator=True, - chunksize=1), - dict(iterator=True), - dict(chunksize=1) -]) +@pytest.mark.parametrize( + "kwargs", [dict(iterator=True, chunksize=1), dict(iterator=True), dict(chunksize=1)] +) def test_iterator_skipfooter_errors(all_parsers, kwargs): msg = "'skipfooter' not supported for 'iteration'" parser = all_parsers @@ -715,33 +757,62 @@ def test_nrows_skipfooter_errors(all_parsers): parser.read_csv(StringIO(data), skipfooter=1, nrows=5) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""foo,2,3,4,5 +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + 
"""foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 -""", dict(index_col=0, names=["index", "A", "B", "C", "D"]), - DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]], - index=Index(["foo", "bar", "baz", "qux", - "foo2", "bar2"], name="index"), - columns=["A", "B", "C", "D"])), - ("""foo,one,2,3,4,5 +""", + dict(index_col=0, names=["index", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"], + ), + ), + ( + """foo,one,2,3,4,5 foo,two,7,8,9,10 foo,three,12,13,14,15 bar,one,12,13,14,15 bar,two,12,13,14,15 -""", dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), - DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - index=MultiIndex.from_tuples([ - ("foo", "one"), ("foo", "two"), ("foo", "three"), - ("bar", "one"), ("bar", "two")], - names=["index1", "index2"]), - columns=["A", "B", "C", "D"])), -]) +""", + dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), + DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ], + names=["index1", "index2"], + ), + columns=["A", "B", "C", "D"], + ), + ), + ], +) def test_pass_names_with_index(all_parsers, data, kwargs, expected): parser = all_parsers result = parser.read_csv(StringIO(data), **kwargs) @@ -757,14 +828,14 @@ def test_multi_index_no_level_names(all_parsers, index_col): bar,one,12,13,14,15 bar,two,12,13,14,15 """ - headless_data = '\n'.join(data.split("\n")[1:]) + headless_data = "\n".join(data.split("\n")[1:]) names = ["A", "B", "C", "D"] parser = all_parsers - result = parser.read_csv(StringIO(headless_data), - index_col=index_col, - header=None, names=names) + result = parser.read_csv( + StringIO(headless_data), index_col=index_col, header=None, names=names + ) expected = parser.read_csv(StringIO(data), index_col=index_col) # No index names in headless data. 
@@ -783,20 +854,39 @@ def test_multi_index_no_level_names_implicit(all_parsers): """ result = parser.read_csv(StringIO(data)) - expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], - [12, 13, 14, 15], [12, 13, 14, 15]], - columns=["A", "B", "C", "D"], - index=MultiIndex.from_tuples([ - ("foo", "one"), ("foo", "two"), ("foo", "three"), - ("bar", "one"), ("bar", "two")])) + expected = DataFrame( + [ + [2, 3, 4, 5], + [7, 8, 9, 10], + [12, 13, 14, 15], + [12, 13, 14, 15], + [12, 13, 14, 15], + ], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples( + [ + ("foo", "one"), + ("foo", "two"), + ("foo", "three"), + ("bar", "one"), + ("bar", "two"), + ] + ), + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected,header", [ - ("a,b", DataFrame(columns=["a", "b"]), [0]), - ("a,b\nc,d", DataFrame(columns=MultiIndex.from_tuples( - [("a", "c"), ("b", "d")])), [0, 1]), -]) +@pytest.mark.parametrize( + "data,expected,header", + [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ( + "a,b\nc,d", + DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])), + [0, 1], + ), + ], +) @pytest.mark.parametrize("round_trip", [True, False]) def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): # see gh-14545 @@ -815,9 +905,10 @@ def test_no_unnamed_index(all_parsers): 2 2 2 e f """ result = parser.read_csv(StringIO(data), sep=" ") - expected = DataFrame([[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], - [2, 2, 2, "e", "f"]], columns=["Unnamed: 0", "id", - "c0", "c1", "c2"]) + expected = DataFrame( + [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]], + columns=["Unnamed: 0", "id", "c0", "c1", "c2"], + ) tm.assert_frame_equal(result, expected) @@ -840,8 +931,10 @@ def test_url(all_parsers, csv_dir_path): parser = all_parsers kwargs = dict(sep="\t") - url = ("https://raw.github.com/pandas-dev/pandas/master/" - "pandas/tests/io/parser/data/salaries.csv") + url = ( + "https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) url_result = parser.read_csv(url, **kwargs) local_path = os.path.join(csv_dir_path, "salaries.csv") @@ -869,8 +962,7 @@ def test_local_file(all_parsers, csv_dir_path): def test_path_path_lib(all_parsers): parser = all_parsers df = tm.makeDataFrame() - result = tm.round_trip_pathlib( - df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) @@ -878,7 +970,8 @@ def test_path_local_path(all_parsers): parser = all_parsers df = tm.makeDataFrame() result = tm.round_trip_localpath( - df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + df.to_csv, lambda p: parser.read_csv(p, index_col=0) + ) tm.assert_frame_equal(df, result) @@ -888,14 +981,12 @@ def test_nonexistent_path(all_parsers): parser = all_parsers path = "%s.csv" % tm.rands(10) - msg = ("does not exist" if parser.engine == "c" - else r"\[Errno 2\]") + msg = "does not exist" if parser.engine == "c" else r"\[Errno 2\]" with pytest.raises(FileNotFoundError, match=msg) as e: parser.read_csv(path) filename = e.value.filename - filename = filename.decode() if isinstance( - filename, bytes) else filename + filename = filename.decode() if isinstance(filename, bytes) else filename assert path == filename @@ -908,30 +999,70 @@ def test_missing_trailing_delimiters(all_parsers): 1,4,5""" result = parser.read_csv(StringIO(data)) - expected = DataFrame([[1, 2, 3, 4], [1, 3, 3, np.nan], - [1, 4, 5, 
np.nan]], columns=["A", "B", "C", "D"]) + expected = DataFrame( + [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]], + columns=["A", "B", "C", "D"], + ) tm.assert_frame_equal(result, expected) def test_skip_initial_space(all_parsers): - data = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' - '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' - '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' - '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' - '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') - parser = all_parsers - - result = parser.read_csv(StringIO(data), names=list(range(33)), - header=None, na_values=["-9999.0"], - skipinitialspace=True) - expected = DataFrame([["09-Apr-2012", "01:10:18.300", 2456026.548822908, - 12849, 1.00361, 1.12551, 330.65659, - 355626618.16711, 73.48821, 314.11625, 1917.09447, - 179.71425, 80.0, 240.0, -350, 70.06056, 344.9837, - 1, 1, -0.689265, -0.692787, 0.212036, 14.7674, - 41.605, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, 0, 12, 128]]) + data = ( + '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, " + "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, " + "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, " + "0.212036, 14.7674, 41.605, -9999.0, -9999.0, " + "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128" + ) + parser = all_parsers + + result = parser.read_csv( + StringIO(data), + names=list(range(33)), + header=None, + na_values=["-9999.0"], + skipinitialspace=True, + ) + expected = DataFrame( + [ + [ + "09-Apr-2012", + "01:10:18.300", + 2456026.548822908, + 12849, + 1.00361, + 1.12551, + 330.65659, + 355626618.16711, + 73.48821, + 314.11625, + 1917.09447, + 179.71425, + 80.0, + 240.0, + -350, + 70.06056, + 344.9837, + 1, + 1, + -0.689265, + -0.692787, + 0.212036, + 14.7674, + 41.605, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 0, + 12, + 128, + ] + ] + ) tm.assert_frame_equal(result, expected) @@ -944,13 +1075,16 @@ def test_utf16_bom_skiprows(all_parsers, sep, encoding): skip this too A,B,C 1,2,3 -4,5,6""".replace(",", sep) +4,5,6""".replace( + ",", sep + ) path = "__%s__.csv" % tm.rands(10) kwargs = dict(sep=sep, skiprows=2) utf8 = "utf-8" with tm.ensure_clean(path) as path: from io import TextIOWrapper + bytes_data = data.encode(encoding) with open(path, "wb") as f: @@ -981,7 +1115,7 @@ def test_unicode_encoding(all_parsers, csv_dir_path): result = result.set_index(0) got = result[1][1632] - expected = '\xc1 k\xf6ldum klaka (Cold Fever) (1994)' + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" assert got == expected @@ -1007,13 +1141,14 @@ def test_escapechar(all_parsers): "SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa parser = all_parsers - result = parser.read_csv(StringIO(data), escapechar='\\', - quotechar='"', encoding='utf-8') + result = parser.read_csv( + StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" + ) - assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", ' - 'IKEA:s 1700-tals serie') - tm.assert_index_equal(result.columns, - Index(['SEARCH_TERM', 'ACTUAL_URL'])) + assert result["SEARCH_TERM"][2] == ( + 'SLAGBORD, "Bergslagen", ' "IKEA:s 1700-tals serie" + ) + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) def 
test_int64_min_issues(all_parsers): @@ -1040,16 +1175,22 @@ def test_parse_integers_above_fp_precision(all_parsers): 17007000002000194""" parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame({"Numbers": [17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194]}) + expected = DataFrame( + { + "Numbers": [ + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194, + ] + } + ) tm.assert_frame_equal(result, expected) @@ -1101,10 +1242,12 @@ def test_catch_too_many_names(all_parsers): 7,8,9 10,11,12\n""" parser = all_parsers - msg = ("Too many columns specified: " - "expected 4 and found 3" if parser.engine == "c" - else "Number of passed names did not match " - "number of header fields in the file") + msg = ( + "Too many columns specified: " "expected 4 and found 3" + if parser.engine == "c" + else "Number of passed names did not match " + "number of header fields in the file" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) @@ -1146,9 +1289,9 @@ def test_empty_with_multi_index(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=["x", "y"]) - expected = DataFrame(columns=["z"], - index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"])) + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ) tm.assert_frame_equal(result, expected) @@ -1157,9 +1300,9 @@ def test_empty_with_reversed_multi_index(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col=[1, 0]) - expected = DataFrame(columns=["z"], - index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"])) + expected = DataFrame( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ) tm.assert_frame_equal(result, expected) @@ -1175,15 +1318,14 @@ def test_float_parser(all_parsers): def test_scientific_no_exponent(all_parsers): # see gh-12215 - df = DataFrame.from_dict(OrderedDict([("w", ["2e"]), ("x", ["3E"]), - ("y", ["42e"]), - ("z", ["632E"])])) + df = DataFrame.from_dict( + OrderedDict([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])]) + ) data = df.to_csv(index=False) parser = all_parsers for precision in parser.float_precision_choices: - df_roundtrip = parser.read_csv(StringIO(data), - float_precision=precision) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) tm.assert_frame_equal(df_roundtrip, df) @@ -1203,31 +1345,36 @@ def test_int64_overflow(all_parsers, conv): # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. 
result = parser.read_csv(StringIO(data)) - expected = DataFrame(["00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166"], columns=["ID"]) + expected = DataFrame( + [ + "00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166", + ], + columns=["ID"], + ) tm.assert_frame_equal(result, expected) else: # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. - msg = ("(Python int too large to convert to C long)|" - "(long too big to convert)|" - "(int too big to convert)") + msg = ( + "(Python int too large to convert to C long)|" + "(long too big to convert)|" + "(int too big to convert)" + ) with pytest.raises(OverflowError, match=msg): parser.read_csv(StringIO(data), converters={"ID": conv}) -@pytest.mark.parametrize("val", [ - np.iinfo(np.uint64).max, - np.iinfo(np.int64).max, - np.iinfo(np.int64).min -]) +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min] +) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 # range, so they should be parsed as string. @@ -1238,10 +1385,9 @@ def test_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("val", [ - np.iinfo(np.uint64).max + 1, - np.iinfo(np.int64).min - 1 -]) +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) def test_outside_int64_uint64_range(all_parsers, val): # These numbers fall just outside the int64-uint64 # range, so they should be parsed as string. @@ -1252,8 +1398,7 @@ def test_outside_int64_uint64_range(all_parsers, val): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], - [str(2**63), str(-1)]]) +@pytest.mark.parametrize("exp_data", [[str(-1), str(2 ** 63)], [str(2 ** 63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): # No numerical dtype can hold both negative and uint64 # values, so they should be cast as string. 
@@ -1282,55 +1427,101 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected,msg", [ - # gh-10728: WHITESPACE_LINE - ("a,b,c\n4,5,6\n ", dict(), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # gh-10548: EAT_LINE_COMMENT - ("a,b,c\n4,5,6\n#comment", dict(comment="#"), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_CRNL_NOP - ("a,b,c\n4,5,6\n\r", dict(), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_COMMENT - ("a,b,c\n4,5,6#comment", dict(comment="#"), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # SKIP_LINE - ("a,b,c\n4,5,6\nskipme", dict(skiprows=[2]), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # EAT_LINE_COMMENT - ("a,b,c\n4,5,6\n#comment", dict(comment="#", skip_blank_lines=False), - DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), - - # IN_FIELD - ("a,b,c\n4,5,6\n ", dict(skip_blank_lines=False), - DataFrame([["4", 5, 6], [" ", None, None]], - columns=["a", "b", "c"]), None), - - # EAT_CRNL - ("a,b,c\n4,5,6\n\r", dict(skip_blank_lines=False), - DataFrame([[4, 5, 6], [None, None, None]], - columns=["a", "b", "c"]), None), - - # ESCAPED_CHAR - ("a,b,c\n4,5,6\n\\", dict(escapechar="\\"), - None, "(EOF following escape character)|(unexpected end of data)"), - - # ESCAPE_IN_QUOTED_FIELD - ('a,b,c\n4,5,6\n"\\', dict(escapechar="\\"), - None, "(EOF inside string starting at row 2)|(unexpected end of data)"), - - # IN_QUOTED_FIELD - ('a,b,c\n4,5,6\n"', dict(escapechar="\\"), - None, "(EOF inside string starting at row 2)|(unexpected end of data)"), -], ids=["whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment", - "skip-line", "eat-line-comment", "in-field", "eat-crnl", - "escaped-char", "escape-in-quoted-field", "in-quoted-field"]) +@pytest.mark.parametrize( + "data,kwargs,expected,msg", + [ + # gh-10728: WHITESPACE_LINE + ( + "a,b,c\n4,5,6\n ", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # gh-10548: EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL_NOP + ( + "a,b,c\n4,5,6\n\r", + dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_COMMENT + ( + "a,b,c\n4,5,6#comment", + dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # SKIP_LINE + ( + "a,b,c\n4,5,6\nskipme", + dict(skiprows=[2]), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # EAT_LINE_COMMENT + ( + "a,b,c\n4,5,6\n#comment", + dict(comment="#", skip_blank_lines=False), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), + None, + ), + # IN_FIELD + ( + "a,b,c\n4,5,6\n ", + dict(skip_blank_lines=False), + DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]), + None, + ), + # EAT_CRNL + ( + "a,b,c\n4,5,6\n\r", + dict(skip_blank_lines=False), + DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]), + None, + ), + # ESCAPED_CHAR + ( + "a,b,c\n4,5,6\n\\", + dict(escapechar="\\"), + None, + "(EOF following escape character)|(unexpected end of data)", + ), + # ESCAPE_IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"\\', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + # IN_QUOTED_FIELD + ( + 'a,b,c\n4,5,6\n"', + dict(escapechar="\\"), + None, + "(EOF inside string starting at row 2)|(unexpected end of data)", + ), + ], + ids=[ + "whitespace-line", + 
"eat-line-comment", + "eat-crnl-nop", + "eat-comment", + "skip-line", + "eat-line-comment", + "in-field", + "eat-crnl", + "escaped-char", + "escape-in-quoted-field", + "in-quoted-field", + ], +) def test_eof_states(all_parsers, data, kwargs, expected, msg): # see gh-10728, gh-10548 parser = all_parsers @@ -1359,25 +1550,31 @@ def test_uneven_lines_with_usecols(all_parsers, usecols): with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) else: - expected = DataFrame({ - "a": [0, 3, 8], - "b": [1, 4, 9] - }) + expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]}) result = parser.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - # First, check to see that the response of parser when faced with no - # provided columns raises the correct error, with or without usecols. - ("", dict(), None), - ("", dict(usecols=["X"]), None), - (",,", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), - DataFrame(columns=["X"], index=[0], dtype=np.float64)), - ("", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), - DataFrame(columns=["X"])), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. + ("", dict(), None), + ("", dict(usecols=["X"]), None), + ( + ",,", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"], index=[0], dtype=np.float64), + ), + ( + "", + dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"]), + ), + ], +) def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): # see gh-12493 parser = all_parsers @@ -1391,19 +1588,29 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - # gh-8661, gh-8679: this should ignore six lines, including - # lines with trailing whitespace and blank lines. - (dict(header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6], - skip_blank_lines=True), DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]])), - - # gh-8983: test skipping set of rows after a row with trailing spaces. - (dict(delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], - skip_blank_lines=True), DataFrame({"A": [1., 5.1], - "B": [2., np.nan], - "C": [4., 10]})), -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + ( + dict( + header=None, + delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], + skip_blank_lines=True, + ), + DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]), + ), + # gh-8983: test skipping set of rows after a row with trailing spaces. 
+ ( + dict( + delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True + ), + DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}), + ), + ], +) def test_trailing_spaces(all_parsers, kwargs, expected): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa parser = all_parsers @@ -1433,18 +1640,31 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): b\n""" expected = DataFrame({"MyColumn": list("abab")}) - result = parser.read_csv(StringIO(data), skipinitialspace=True, - delim_whitespace=delim_whitespace) + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep,skip_blank_lines,exp_data", [ - (",", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), - (r"\s+", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), - (",", False, [[1., 2., 4.], [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], [5., np.nan, 10.], - [np.nan, np.nan, np.nan], [-70., .4, 1.]]), -]) +@pytest.mark.parametrize( + "sep,skip_blank_lines,exp_data", + [ + (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]), + ( + ",", + False, + [ + [1.0, 2.0, 4.0], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5.0, np.nan, 10.0], + [np.nan, np.nan, np.nan], + [-70.0, 0.4, 1.0], + ], + ), + ], +) def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): parser = all_parsers data = """\ @@ -1460,8 +1680,7 @@ def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): if sep == r"\s+": data = data.replace(",", " ") - result = parser.read_csv(StringIO(data), sep=sep, - skip_blank_lines=skip_blank_lines) + result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines) expected = DataFrame(exp_data, columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -1476,22 +1695,32 @@ def test_whitespace_lines(all_parsers): \t 1,2.,4. 5.,NaN,10.0 """ - expected = DataFrame([[1, 2., 4.], [5., np.nan, 10.]], - columns=["A", "B", "C"]) + expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - (""" A B C D +@pytest.mark.parametrize( + "data,expected", + [ + ( + """ A B C D a 1 2 3 4 b 1 2 3 4 c 1 2 3 4 -""", DataFrame([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], - columns=["A", "B", "C", "D"], index=["a", "b", "c"])), - (" a b c\n1 2 3 \n4 5 6\n 7 8 9", - DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])), -]) +""", + DataFrame( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], + index=["a", "b", "c"], + ), + ), + ( + " a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]), + ), + ], +) def test_whitespace_regex_separator(all_parsers, data, expected): # see gh-6607 parser = all_parsers @@ -1563,27 +1792,34 @@ def test_iteration_open_handle(all_parsers): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data,thousands,decimal", [ - ("""A|B|C +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C 1|2,334.01|5 10|13|10. 
-""", ",", "."), - ("""A|B|C +""", + ",", + ".", + ), + ( + """A|B|C 1|2.334,01|5 10|13|10, -""", ".", ","), -]) +""", + ".", + ",", + ), + ], +) def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): parser = all_parsers - expected = DataFrame({ - "A": [1, 10], - "B": [2334.01, 13], - "C": [5, 10.] - }) + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) - result = parser.read_csv(StringIO(data), sep="|", - thousands=thousands, - decimal=decimal) + result = parser.read_csv( + StringIO(data), sep="|", thousands=thousands, decimal=decimal + ) tm.assert_frame_equal(result, expected) @@ -1595,11 +1831,14 @@ def test_euro_decimal_format(all_parsers): 3;878,158;108013,434;GHI;rez;2,735694704""" result = parser.read_csv(StringIO(data), sep=";", decimal=",") - expected = DataFrame([ - [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], - [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], - [3, 878.158, 108013.434, "GHI", "rez", 2.735694704] - ], columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) tm.assert_frame_equal(result, expected) @@ -1618,9 +1857,10 @@ def test_inf_parsing(all_parsers, na_filter): h,-INf i,inF j,-inF""" - expected = DataFrame({"A": [float("inf"), float("-inf")] * 5}, - index=["a", "b", "c", "d", "e", - "f", "g", "h", "i", "j"]) + expected = DataFrame( + {"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ) result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -1639,11 +1879,9 @@ def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers - expected = DataFrame({ - "a": [1, 2, 3], - "b": ["one", "two", "three"], - "c": ["I", "II", "III"] - }) + expected = DataFrame( + {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]} + ) result = parser.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(result, expected) @@ -1665,23 +1903,25 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@pytest.mark.parametrize("data,kwargs,expected", [ - # Basic test - ("a\n1", dict(), DataFrame({"a": [1]})), - - # "Regular" quoting - ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), - - # Test in a data row instead of header - ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), - - # Test in empty data row with skipping - ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), - - # Test in empty data row without skipping - ("\n1", dict(names=["a"], skip_blank_lines=False), - DataFrame({"a": [np.nan, 1]})), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) def test_utf8_bom(all_parsers, data, kwargs, 
expected): # see gh-4793 parser = all_parsers @@ -1692,8 +1932,7 @@ def _encode_data_with_bom(_data): bom_data = (bom + _data).encode(utf8) return BytesIO(bom_data) - result = parser.read_csv(_encode_data_with_bom(data), - encoding=utf8, **kwargs) + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) tm.assert_frame_equal(result, expected) @@ -1715,8 +1954,7 @@ def test_temporary_file(all_parsers): @pytest.mark.parametrize("byte", [8, 16]) -@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", - "UTF-{0}", "UTF_{0}"]) +@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) def test_read_csv_utf_aliases(all_parsers, byte, fmt): # see gh-13549 expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) @@ -1743,8 +1981,7 @@ def test_internal_eof_byte_to_file(all_parsers): # see gh-16559 parser = all_parsers data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = DataFrame([["test \x1a test", " test"]], - columns=["c1", "c2"]) + expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) path = "__%s__.csv" % tm.rands(10) with tm.ensure_clean(path) as path: @@ -1834,14 +2071,13 @@ def seek(self, pos, whence=0): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(), # Default is True. - dict(error_bad_lines=True), # Explicitly pass in. -]) -@pytest.mark.parametrize("warn_kwargs", [ - dict(), dict(warn_bad_lines=True), - dict(warn_bad_lines=False) -]) +@pytest.mark.parametrize( + "kwargs", + [dict(), dict(error_bad_lines=True)], # Default is True. # Explicitly pass in. +) +@pytest.mark.parametrize( + "warn_kwargs", [dict(), dict(warn_bad_lines=True), dict(warn_bad_lines=False)] +) def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): # see gh-15925 parser = all_parsers @@ -1859,9 +2095,7 @@ def test_warn_bad_lines(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=True) + result = parser.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() @@ -1875,17 +2109,19 @@ def test_suppress_error_output(all_parsers, capsys): data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=False) + result = parser.read_csv( + StringIO(data), error_bad_lines=False, warn_bad_lines=False + ) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert captured.err == "" -@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36, - reason="On Python < 3.6 won't pass on Windows") +@pytest.mark.skipif( + compat.is_platform_windows() and not compat.PY36, + reason="On Python < 3.6 won't pass on Windows", +) @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"]) def test_filename_with_special_chars(all_parsers, filename): # see gh-15086. 
@@ -1932,6 +2168,6 @@ def test_first_row_bom(all_parsers): parser = all_parsers data = '''\ufeff"Head1" "Head2" "Head3"''' - result = parser.read_csv(StringIO(data), delimiter='\t') + result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index a04a5c4e4ad6b..06ae2c0fef1b9 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -76,8 +76,7 @@ def test_zip_error_invalid_zip(parser_and_data): with tm.ensure_clean() as path: with open(path, "wb") as f: - with pytest.raises(zipfile.BadZipfile, - match="File is not a zip file"): + with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"): parser.read_csv(f, compression="zip") @@ -90,8 +89,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: - pytest.skip("Cannot deduce compression from " - "buffer of compressed data.") + pytest.skip("Cannot deduce compression from " "buffer of compressed data.") with tm.ensure_clean(filename=filename) as path: tm.write_to_compressed(compress_type, path, data) @@ -130,12 +128,13 @@ def test_compression_utf16_encoding(all_parsers, csv_dir_path): parser = all_parsers path = os.path.join(csv_dir_path, "utf16_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", - compression="zip", sep="\t") - expected = pd.DataFrame({ - "Country": ["Venezuela", "Venezuela"], - "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."] - }) + result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + expected = pd.DataFrame( + { + "Country": ["Venezuela", "Venezuela"], + "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."], + } + ) tm.assert_frame_equal(result, expected) @@ -145,8 +144,7 @@ def test_invalid_compression(all_parsers, invalid_compression): parser = all_parsers compress_kwargs = dict(compression=invalid_compression) - msg = ("Unrecognized compression " - "type: {compression}".format(**compress_kwargs)) + msg = "Unrecognized compression " "type: {compression}".format(**compress_kwargs) with pytest.raises(ValueError, match=msg): parser.read_csv("test_file.zip", **compress_kwargs) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 4df99d396b7ac..2a3b1dc82fc59 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -24,10 +24,9 @@ def test_converters_type_must_be_dict(all_parsers): @pytest.mark.parametrize("column", [3, "D"]) -@pytest.mark.parametrize("converter", [ - parse, - lambda x: int(x.split("/")[2]) # Produce integer. -]) +@pytest.mark.parametrize( + "converter", [parse, lambda x: int(x.split("/")[2])] # Produce integer. +) def test_converters(all_parsers, column, converter): parser = all_parsers data = """A,B,C,D @@ -49,8 +48,7 @@ def test_converters_no_implicit_conv(all_parsers): data = """000102,1.2,A\n001245,2,B""" converters = {0: lambda x: x.strip()} - result = parser.read_csv(StringIO(data), header=None, - converters=converters) + result = parser.read_csv(StringIO(data), header=None, converters=converters) # Column 0 should not be casted to numeric and should remain as object. 
expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]]) @@ -66,15 +64,19 @@ def test_converters_euro_decimal_format(all_parsers): 1;1521,1541;187101,9543;ABC;poi;4,7387 2;121,12;14897,76;DEF;uyt;0,3773 3;878,158;108013,434;GHI;rez;2,7356""" - converters["Number1"] = converters["Number2"] =\ - converters["Number3"] = lambda x: float(x.replace(",", ".")) + converters["Number1"] = converters["Number2"] = converters[ + "Number3" + ] = lambda x: float(x.replace(",", ".")) result = parser.read_csv(StringIO(data), sep=";", converters=converters) - expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], - [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], - [3, 878.158, 108013.434, "GHI", "rez", 2.7356]], - columns=["Id", "Number1", "Number2", - "Text1", "Text2", "Number3"]) + expected = DataFrame( + [ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], + [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], + [3, 878.158, 108013.434, "GHI", "rez", 2.7356], + ], + columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"], + ) tm.assert_frame_equal(result, expected) @@ -134,10 +136,11 @@ def convert_score(x): results = [] for day_converter in [convert_days, convert_days_sentinel]: - result = parser.read_csv(StringIO(data), - converters={"score": convert_score, - "days": day_converter}, - na_values=["", None]) + result = parser.read_csv( + StringIO(data), + converters={"score": convert_score, "days": day_converter}, + na_values=["", None], + ) assert pd.isna(result["days"][1]) results.append(result) @@ -149,8 +152,9 @@ def test_converter_index_col_bug(all_parsers): parser = all_parsers data = "A;B\n1;2\n3;4" - rs = parser.read_csv(StringIO(data), sep=";", index_col="A", - converters={"A": lambda x: x}) + rs = parser.read_csv( + StringIO(data), sep=";", index_col="A", converters={"A": lambda x: x} + ) xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index 6f2878fd2363a..dc10352bc6460 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -17,8 +17,14 @@ @pytest.fixture def custom_dialect(): dialect_name = "weird" - dialect_kwargs = dict(doublequote=False, escapechar="~", delimiter=":", - skipinitialspace=False, quotechar="~", quoting=3) + dialect_kwargs = dict( + doublequote=False, + escapechar="~", + delimiter=":", + skipinitialspace=False, + quotechar="~", + quoting=3, + ) return dialect_name, dialect_kwargs @@ -40,7 +46,7 @@ def test_dialect(all_parsers): index2,b,d,f """ exp = parser.read_csv(StringIO(data)) - exp.replace("a", "\"a", inplace=True) + exp.replace("a", '"a', inplace=True) tm.assert_frame_equal(df, exp) @@ -52,10 +58,7 @@ def test_dialect_str(all_parsers): apple:broccoli pear:tomato """ - exp = DataFrame({ - "fruit": ["apple", "pear"], - "vegetable": ["broccoli", "tomato"] - }) + exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}) with tm.with_csv_dialect(dialect_name, delimiter=":"): df = parser.read_csv(StringIO(data), dialect=dialect_name) @@ -74,11 +77,12 @@ class InvalidDialect: parser.read_csv(StringIO(data), dialect=InvalidDialect) -@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar", - "skipinitialspace", "quotechar", "quoting"]) +@pytest.mark.parametrize( + "arg", + [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"], +) @pytest.mark.parametrize("value", ["dialect", "default", "other"]) -def 
test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, - arg, value): +def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value): # see gh-23761. dialect_name, dialect_kwargs = custom_dialect parser = all_parsers @@ -95,6 +99,7 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, kwds[arg] = dialect_kwargs[arg] elif "value" == "default": # Default --> no warning. from pandas.io.parsers import _parser_defaults + kwds[arg] = _parser_defaults[arg] else: # Non-default + conflict with dialect --> warning. warning_klass = ParserWarning @@ -102,23 +107,30 @@ def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, with tm.with_csv_dialect(dialect_name, **dialect_kwargs): with tm.assert_produces_warning(warning_klass): - result = parser.read_csv(StringIO(data), - dialect=dialect_name, **kwds) + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwds) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,warning_klass", [ - (dict(sep=","), None), # sep is default --> sep_override=True - (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False - (dict(delimiter=":"), None), # No conflict - (dict(delimiter=None), None), # Default arguments --> sep_override=True - (dict(delimiter=","), ParserWarning), # Conflict - (dict(delimiter="."), ParserWarning), # Conflict -], ids=["sep-override-true", "sep-override-false", - "delimiter-no-conflict", "delimiter-default-arg", - "delimiter-conflict", "delimiter-conflict2"]) -def test_dialect_conflict_delimiter(all_parsers, custom_dialect, - kwargs, warning_klass): +@pytest.mark.parametrize( + "kwargs,warning_klass", + [ + (dict(sep=","), None), # sep is default --> sep_override=True + (dict(sep="."), ParserWarning), # sep isn't default --> sep_override=False + (dict(delimiter=":"), None), # No conflict + (dict(delimiter=None), None), # Default arguments --> sep_override=True + (dict(delimiter=","), ParserWarning), # Conflict + (dict(delimiter="."), ParserWarning), # Conflict + ], + ids=[ + "sep-override-true", + "sep-override-false", + "delimiter-no-conflict", + "delimiter-default-arg", + "delimiter-conflict", + "delimiter-conflict2", + ], +) +def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass): # see gh-23761. 
dialect_name, dialect_kwargs = custom_dialect parser = all_parsers @@ -128,6 +140,5 @@ def test_dialect_conflict_delimiter(all_parsers, custom_dialect, with tm.with_csv_dialect(dialect_name, **dialect_kwargs): with tm.assert_produces_warning(warning_klass): - result = parser.read_csv(StringIO(data), - dialect=dialect_name, **kwargs) + result = parser.read_csv(StringIO(data), dialect=dialect_name, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 738b9d9693750..92c91565e1c23 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -13,8 +13,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat) +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat import pandas.util.testing as tm @@ -24,8 +23,11 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): # see gh-3795, gh-6607 parser = all_parsers - df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"), - index=["1A", "1B", "1C", "1D", "1E"]) + df = DataFrame( + np.random.rand(5, 2).round(4), + columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"], + ) with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: df.to_csv(path) @@ -58,13 +60,13 @@ def test_dtype_per_column(all_parsers): 2,3.5 3,4.5 4,5.5""" - expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], - columns=["one", "two"]) + expected = DataFrame( + [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] + ) expected["one"] = expected["one"].astype(np.float64) expected["two"] = expected["two"].astype(object) - result = parser.read_csv(StringIO(data), dtype={"one": np.float64, - 1: str}) + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -81,13 +83,14 @@ def test_invalid_dtype_per_column(all_parsers): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) -@pytest.mark.parametrize("dtype", [ - "category", - CategoricalDtype(), - {"a": "category", - "b": "category", - "c": CategoricalDtype()} -]) +@pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + {"a": "category", "b": "category", "c": CategoricalDtype()}, + ], +) def test_categorical_dtype(all_parsers, dtype): # see gh-10153 parser = all_parsers @@ -95,17 +98,18 @@ def test_categorical_dtype(all_parsers, dtype): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["a", "a", "b"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) -@pytest.mark.parametrize("dtype", [ - {"b": "category"}, - {1: "category"} -]) +@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}]) def test_categorical_dtype_single(all_parsers, dtype): # see gh-10153 parser = all_parsers @@ -113,9 +117,9 @@ def test_categorical_dtype_single(all_parsers, dtype): 1,a,3.4 1,a,3.4 2,b,4.5""" - expected = DataFrame({"a": [1, 1, 2], - "b": Categorical(["a", "a", "b"]), - "c": [3.4, 3.4, 4.5]}) + expected = DataFrame( + {"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]} + ) actual = 
parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(actual, expected) @@ -127,9 +131,13 @@ def test_categorical_dtype_unsorted(all_parsers): 1,b,3.4 1,b,3.4 2,a,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", "b", "a"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @@ -141,9 +149,13 @@ def test_categorical_dtype_missing(all_parsers): 1,b,3.4 1,nan,3.4 2,a,4.5""" - expected = DataFrame({"a": Categorical(["1", "1", "2"]), - "b": Categorical(["b", np.nan, "a"]), - "c": Categorical(["3.4", "3.4", "4.5"])}) + expected = DataFrame( + { + "a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"]), + } + ) actual = parser.read_csv(StringIO(data), dtype="category") tm.assert_frame_equal(actual, expected) @@ -155,10 +167,10 @@ def test_categorical_dtype_high_cardinality_numeric(all_parsers): data = np.sort([str(i) for i in range(524289)]) expected = DataFrame({"a": Categorical(data, ordered=True)}) - actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), - dtype="category") + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category") actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True) + np.sort(actual.a.cat.categories), ordered=True + ) tm.assert_frame_equal(actual, expected) @@ -171,8 +183,7 @@ def test_categorical_dtype_latin1(all_parsers, csv_dir_path): expected = parser.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) - actual = parser.read_csv(pth, header=None, encoding=encoding, - dtype={1: "category"}) + actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"}) tm.assert_frame_equal(actual, expected) @@ -198,13 +209,11 @@ def test_categorical_dtype_chunksize_infer_categories(all_parsers): 1,b 1,b 2,c""" - expecteds = [DataFrame({"a": [1, 1], - "b": Categorical(["a", "b"])}), - DataFrame({"a": [1, 2], - "b": Categorical(["b", "c"])}, - index=[2, 3])] - actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, - chunksize=2) + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]), + ] + actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, chunksize=2) for actual, expected in zip(actuals, expecteds): tm.assert_frame_equal(actual, expected) @@ -219,13 +228,12 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): 1,b 2,c""" cats = ["a", "b", "c"] - expecteds = [DataFrame({"a": [1, 1], - "b": Categorical(["a", "b"], - categories=cats)}), - DataFrame({"a": [1, 2], - "b": Categorical(["b", "c"], - categories=cats)}, - index=[2, 3])] + expecteds = [ + DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}), + DataFrame( + {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)}, index=[2, 3] + ), + ] dtype = CategoricalDtype(cats) actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) @@ -234,12 +242,10 @@ def test_categorical_dtype_chunksize_explicit_categories(all_parsers): @pytest.mark.parametrize("ordered", [False, True]) -@pytest.mark.parametrize("categories", [ - ["a", "b", "c"], - ["a", "c", "b"], - ["a", "b", "c", "d"], - ["c", "b", 
"a"], -]) +@pytest.mark.parametrize( + "categories", + [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]], +) def test_categorical_category_dtype(all_parsers, categories, ordered): parser = all_parsers data = """a,b @@ -247,15 +253,16 @@ def test_categorical_category_dtype(all_parsers, categories, ordered): 1,b 1,b 2,c""" - expected = DataFrame({ - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], - categories=categories, - ordered=ordered) - }) - - dtype = {"b": CategoricalDtype(categories=categories, - ordered=ordered)} + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical( + ["a", "b", "b", "c"], categories=categories, ordered=ordered + ), + } + ) + + dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)} result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @@ -268,10 +275,12 @@ def test_categorical_category_dtype_unsorted(all_parsers): 1,b 2,c""" dtype = CategoricalDtype(["c", "b", "a"]) - expected = DataFrame({ - "a": [1, 1, 1, 2], - "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]) - }) + expected = DataFrame( + { + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]), + } + ) result = parser.read_csv(StringIO(data), dtype={"b": dtype}) tm.assert_frame_equal(result, expected) @@ -321,12 +330,15 @@ def test_categorical_coerces_timedelta(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data", [ - "b\nTrue\nFalse\nNA\nFalse", - "b\ntrue\nfalse\nNA\nfalse", - "b\nTRUE\nFALSE\nNA\nFALSE", - "b\nTrue\nFalse\nNA\nFALSE", -]) +@pytest.mark.parametrize( + "data", + [ + "b\nTrue\nFalse\nNA\nFalse", + "b\ntrue\nfalse\nNA\nfalse", + "b\nTRUE\nFALSE\nNA\nFALSE", + "b\nTrue\nFalse\nNA\nFALSE", + ], +) def test_categorical_dtype_coerces_boolean(all_parsers, data): # see gh-20498 parser = all_parsers @@ -342,8 +354,7 @@ def test_categorical_unexpected_categories(all_parsers): dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} data = "b\nd\na\nc\nd" # Unexpected c - expected = DataFrame({"b": Categorical(list("dacd"), - dtype=dtype["b"])}) + expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) @@ -355,9 +366,10 @@ def test_empty_pass_dtype(all_parsers): data = "one,two" result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "two": np.empty(0, dtype=np.object)}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "two": np.empty(0, dtype=np.object)}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) @@ -365,11 +377,13 @@ def test_empty_with_index_pass_dtype(all_parsers): parser = all_parsers data = "one,two" - result = parser.read_csv(StringIO(data), index_col=["one"], - dtype={"one": "u1", 1: "f"}) + result = parser.read_csv( + StringIO(data), index_col=["one"], dtype={"one": "u1", 1: "f"} + ) - expected = DataFrame({"two": np.empty(0, dtype="f")}, - index=Index([], dtype="u1", name="one")) + expected = DataFrame( + {"two": np.empty(0, dtype="f")}, index=Index([], dtype="u1", name="one") + ) tm.assert_frame_equal(result, expected) @@ -377,14 +391,14 @@ def test_empty_with_multi_index_pass_dtype(all_parsers): parser = all_parsers data = "one,two,three" - result = parser.read_csv(StringIO(data), index_col=["one", "two"], - dtype={"one": "u1", 1: "f8"}) - - exp_idx = 
MultiIndex.from_arrays([np.empty(0, dtype="u1"), - np.empty(0, dtype=np.float64)], - names=["one", "two"]) - expected = DataFrame({"three": np.empty(0, dtype=np.object)}, - index=exp_idx) + result = parser.read_csv( + StringIO(data), index_col=["one", "two"], dtype={"one": "u1", 1: "f8"} + ) + + exp_idx = MultiIndex.from_arrays( + [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)], names=["one", "two"] + ) + expected = DataFrame({"three": np.empty(0, dtype=np.object)}, index=exp_idx) tm.assert_frame_equal(result, expected) @@ -394,9 +408,10 @@ def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): data = "one,one" result = parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) @@ -406,17 +421,20 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): data = "one,one" result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) - expected = DataFrame({"one": np.empty(0, dtype="u1"), - "one.1": np.empty(0, dtype="f")}, - index=Index([], dtype=object)) + expected = DataFrame( + {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object), + ) tm.assert_frame_equal(result, expected) def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): # see gh-9424 parser = all_parsers - expected = concat([Series([], name="one", dtype="u1"), - Series([], name="one.1", dtype="f")], axis=1) + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) expected.index = expected.index.astype(object) data = "one,one" @@ -427,14 +445,15 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers): # see gh-9424 parser = all_parsers - expected = concat([Series([], name="one", dtype="u1"), - Series([], name="one.1", dtype="f")], axis=1) + expected = concat( + [Series([], name="one", dtype="u1"), Series([], name="one.1", dtype="f")], + axis=1, + ) expected.index = expected.index.astype(object) - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): data = "" - parser.read_csv(StringIO(data), names=["one", "one"], - dtype={0: "u1", 1: "f"}) + parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"}) def test_raise_on_passed_int_dtype_with_nas(all_parsers): @@ -445,11 +464,13 @@ def test_raise_on_passed_int_dtype_with_nas(all_parsers): 2001,,11 2001,106380451,67""" - msg = ("Integer column has NA values" if parser.engine == "c" else - "Unable to convert column DOY") + msg = ( + "Integer column has NA values" + if parser.engine == "c" + else "Unable to convert column DOY" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, - skipinitialspace=True) + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, skipinitialspace=True) def test_dtype_with_converters(all_parsers): @@ -460,36 +481,56 @@ def test_dtype_with_converters(all_parsers): # Dtype spec ignored if converted specified. 
with tm.assert_produces_warning(ParserWarning): - result = parser.read_csv(StringIO(data), dtype={"a": "i8"}, - converters={"a": lambda x: str(x)}) + result = parser.read_csv( + StringIO(data), dtype={"a": "i8"}, converters={"a": lambda x: str(x)} + ) expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype,expected", [ - (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), - ("category", DataFrame({"a": Categorical([]), - "b": Categorical([])}, - index=[])), - (dict(a="category", b="category"), - DataFrame({"a": Categorical([]), - "b": Categorical([])}, - index=[])), - ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), - ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"), - "b": Series([], dtype="timedelta64[ns]")}, - index=[])), - (dict(a=np.int64, - b=np.int32), DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), - ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), - ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), - "b": Series([], dtype=np.int32)}, - index=[])), -]) +@pytest.mark.parametrize( + "dtype,expected", + [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ("category", DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[])), + ( + dict(a="category", b="category"), + DataFrame({"a": Categorical([]), "b": Categorical([])}, index=[]), + ), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ( + "timedelta64[ns]", + DataFrame( + { + "a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]"), + }, + index=[], + ), + ), + ( + dict(a=np.int64, b=np.int32), + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {0: np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ( + {"a": np.int64, 1: np.int32}, + DataFrame( + {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)}, + index=[], + ), + ), + ], +) def test_empty_dtype(all_parsers, dtype, expected): # see gh-14712 parser = all_parsers @@ -499,8 +540,9 @@ def test_empty_dtype(all_parsers, dtype, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", list(np.typecodes["AllInteger"] + - np.typecodes["Float"])) +@pytest.mark.parametrize( + "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) +) def test_numeric_dtype(all_parsers, dtype): data = "0\n1" parser = all_parsers diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index faf62f98be2e0..ff1dd10bdd0d9 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -46,10 +46,10 @@ def test_no_header_prefix(all_parsers): 11,12,13,14,15 """ result = parser.read_csv(StringIO(data), prefix="Field", header=None) - expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], - columns=["Field0", "Field1", "Field2", - "Field3", "Field4"]) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], + columns=["Field0", "Field1", "Field2", "Field3", "Field4"], + ) tm.assert_frame_equal(result, expected) @@ -62,9 +62,11 @@ def test_header_with_index_col(all_parsers): names = ["A", "B", "C"] result = parser.read_csv(StringIO(data), 
names=names) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) @@ -90,8 +92,7 @@ def test_header_not_first_line(all_parsers): def test_header_multi_index(all_parsers): parser = all_parsers - expected = tm.makeCustomDataframe( - 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -106,23 +107,31 @@ def test_header_multi_index(all_parsers): R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) + result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(index_col=["foo", "bar"]), ("index_col must only contain " - "row numbers when specifying " - "a multi-index header")), - (dict(index_col=[0, 1], names=["foo", "bar"]), ("cannot specify names " - "when specifying a " - "multi-index header")), - (dict(index_col=[0, 1], usecols=["foo", "bar"]), ("cannot specify " - "usecols when " - "specifying a " - "multi-index header")), -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + ( + dict(index_col=["foo", "bar"]), + ( + "index_col must only contain " + "row numbers when specifying " + "a multi-index header" + ), + ), + ( + dict(index_col=[0, 1], names=["foo", "bar"]), + ("cannot specify names " "when specifying a " "multi-index header"), + ), + ( + dict(index_col=[0, 1], usecols=["foo", "bar"]), + ("cannot specify " "usecols when " "specifying a " "multi-index header"), + ), + ], +) def test_header_multi_index_invalid(all_parsers, kwargs, msg): data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -146,23 +155,43 @@ def test_header_multi_index_invalid(all_parsers, kwargs, msg): _TestTuple = namedtuple("names", ["first", "second"]) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=3, - names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=3, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=3, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=3, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format1(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) data = """,a,a,a,b,c,c ,q,r,s,t,u,v ,,,,,, @@ -173,23 +202,43 @@ def test_header_multi_index_common_format1(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=2, 
- names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=2, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format2(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) data = """,a,a,a,b,c,c ,q,r,s,t,u,v one,1,2,3,4,5,6 @@ -199,23 +248,43 @@ def test_header_multi_index_common_format2(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(header=[0, 1]), - dict(skiprows=2, - names=[("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")]), - dict(skiprows=2, - names=[_TestTuple("a", "q"), _TestTuple("a", "r"), - _TestTuple("a", "s"), _TestTuple("b", "t"), - _TestTuple("c", "u"), _TestTuple("c", "v")]) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(header=[0, 1]), + dict( + skiprows=2, + names=[ + ("a", "q"), + ("a", "r"), + ("a", "s"), + ("b", "t"), + ("c", "u"), + ("c", "v"), + ], + ), + dict( + skiprows=2, + names=[ + _TestTuple("a", "q"), + _TestTuple("a", "r"), + _TestTuple("a", "s"), + _TestTuple("b", "t"), + _TestTuple("c", "u"), + _TestTuple("c", "v"), + ], + ), + ], +) def test_header_multi_index_common_format3(all_parsers, kwargs): parser = all_parsers - expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=["one", "two"], - columns=MultiIndex.from_tuples( - [("a", "q"), ("a", "r"), ("a", "s"), - ("b", "t"), ("c", "u"), ("c", "v")])) + expected = DataFrame( + [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")] + ), + ) expected = expected.reset_index(drop=True) data = """a,a,a,b,c,c q,r,s,t,u,v @@ -228,14 +297,15 @@ def test_header_multi_index_common_format3(all_parsers, kwargs): def test_header_multi_index_common_format_malformed1(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), index=Index([1, 7]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["r", "s", "t", - "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=["a", "q"])) + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=["a", "q"], + ), + ) data = """a,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 @@ -247,14 +317,15 @@ def test_header_multi_index_common_format_malformed1(all_parsers): def test_header_multi_index_common_format_malformed2(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - 
[[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + expected = DataFrame( + np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), index=Index([1, 7]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["r", "s", "t", - "u", "v"]], - codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[None, "q"])) + columns=MultiIndex( + levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]], + codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, "q"], + ), + ) data = """,a,a,b,c,c q,r,s,t,u,v @@ -267,14 +338,15 @@ def test_header_multi_index_common_format_malformed2(all_parsers): def test_header_multi_index_common_format_malformed3(all_parsers): parser = all_parsers - expected = DataFrame(np.array( - [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), - index=MultiIndex(levels=[[1, 7], [2, 8]], - codes=[[0, 1], [0, 1]]), - columns=MultiIndex(levels=[["a", "b", "c"], - ["s", "t", "u", "v"]], - codes=[[0, 1, 2, 2], [0, 1, 2, 3]], - names=[None, "q"])) + expected = DataFrame( + np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), + index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]), + columns=MultiIndex( + levels=[["a", "b", "c"], ["s", "t", "u", "v"]], + codes=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, "q"], + ), + ) data = """,a,a,b,c,c q,r,s,t,u,v 1,2,3,4,5,6 @@ -284,24 +356,19 @@ def test_header_multi_index_common_format_malformed3(all_parsers): tm.assert_frame_equal(expected, result) -@pytest.mark.parametrize("data,header", [ - ("1,2,3\n4,5,6", None), - ("foo,bar,baz\n1,2,3\n4,5,6", 0), -]) +@pytest.mark.parametrize( + "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)] +) def test_header_names_backward_compat(all_parsers, data, header): # see gh-2539 parser = all_parsers - expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), - names=["a", "b", "c"]) + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"]) - result = parser.read_csv(StringIO(data), names=["a", "b", "c"], - header=header) + result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(), dict(index_col=False) -]) +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=False)]) def test_read_only_header_no_rows(all_parsers, kwargs): # See gh-7773 parser = all_parsers @@ -311,29 +378,31 @@ def test_read_only_header_no_rows(all_parsers, kwargs): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,names", [ - (dict(), [0, 1, 2, 3, 4]), - (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), - (dict(names=["foo", "bar", "baz", "quux", "panda"]), - ["foo", "bar", "baz", "quux", "panda"]) -]) +@pytest.mark.parametrize( + "kwargs,names", + [ + (dict(), [0, 1, 2, 3, 4]), + (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), + ( + dict(names=["foo", "bar", "baz", "quux", "panda"]), + ["foo", "bar", "baz", "quux", "panda"], + ), + ], +) def test_no_header(all_parsers, kwargs, names): parser = all_parsers data = """1,2,3,4,5 6,7,8,9,10 11,12,13,14,15 """ - expected = DataFrame([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], columns=names) + expected = DataFrame( + [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names + ) result = parser.read_csv(StringIO(data), header=None, **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("header", [ - ["a", "b"], - "string_header" -]) +@pytest.mark.parametrize("header", [["a", "b"], "string_header"]) def test_non_int_header(all_parsers, header): # see 
gh-16338 msg = "header must be integer or list of integers" @@ -354,24 +423,44 @@ def test_singleton_header(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - ("A,A,A,B\none,one,one,two\n0,40,34,0.1", - DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.2"), ("B", "two")]))), - ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1", - DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.1.1"), ("B", "two")]))), - ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", - DataFrame([[0, 40, 34, 0.1, 0.1]], - columns=MultiIndex.from_tuples( - [("A", "one"), ("A", "one.1"), - ("A", "one.1.1"), ("B", "two"), - ("B", "two.1")]))) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ( + "A,A,A,B\none,one,one,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B\none,one,one.1,two\n0,40,34,0.1", + DataFrame( + [[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")] + ), + ), + ), + ( + "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", + DataFrame( + [[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [ + ("A", "one"), + ("A", "one.1"), + ("A", "one.1.1"), + ("B", "two"), + ("B", "two.1"), + ] + ), + ), + ), + ], +) def test_mangles_multi_index(all_parsers, data, expected): # see gh-18062 parser = all_parsers @@ -381,10 +470,9 @@ def test_mangles_multi_index(all_parsers, data, expected): @pytest.mark.parametrize("index_col", [None, [0]]) -@pytest.mark.parametrize("columns", [None, - (["", "Unnamed"]), - (["Unnamed", ""]), - (["Unnamed", "NotUnnamed"])]) +@pytest.mark.parametrize( + "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])] +) def test_multi_index_unnamed(all_parsers, index_col, columns): # see gh-23687 # @@ -400,18 +488,17 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): if index_col is None: data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" else: - data = (",".join([""] + (columns or ["", ""])) + - "\n,0,1\n0,2,3\n1,4,5\n") + data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n" if columns is None: - msg = (r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns") + msg = ( + r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=header, - index_col=index_col) + parser.read_csv(StringIO(data), header=header, index_col=index_col) else: - result = parser.read_csv(StringIO(data), header=header, - index_col=index_col) + result = parser.read_csv(StringIO(data), header=header, index_col=index_col) template = "Unnamed: {i}_level_0" exp_columns = [] diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index de212880d1576..8199d632223c1 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -21,7 +21,9 @@ def test_index_col_named(all_parsers, with_header): KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - header = 
"ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa + header = ( + "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + ) # noqa if with_header: data = header + no_header @@ -45,14 +47,13 @@ def test_index_col_named2(all_parsers): 9,10,11,12,foo """ - expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10], - "c": [3, 7, 11], "d": [4, 8, 12]}, - index=Index(["hello", "world", "foo"], - name="message")) + expected = DataFrame( + {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]}, + index=Index(["hello", "world", "foo"], name="message"), + ) names = ["a", "b", "c", "d", "message"] - result = parser.read_csv(StringIO(data), names=names, - index_col=["message"]) + result = parser.read_csv(StringIO(data), names=names, index_col=["message"]) tm.assert_frame_equal(result, expected) @@ -61,8 +62,9 @@ def test_index_col_is_true(all_parsers): data = "a,b\n1,2" parser = all_parsers - with pytest.raises(ValueError, match="The value of index_col " - "couldn't be 'True'"): + with pytest.raises( + ValueError, match="The value of index_col " "couldn't be 'True'" + ): parser.read_csv(StringIO(data), index_col=True) @@ -75,28 +77,49 @@ def test_infer_index_col(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - index=["foo", "bar", "baz"], - columns=["A", "B", "C"]) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("index_col,kwargs", [ - (None, dict(columns=["x", "y", "z"])), - (False, dict(columns=["x", "y", "z"])), - (0, dict(columns=["y", "z"], index=Index([], name="x"))), - (1, dict(columns=["x", "z"], index=Index([], name="y"))), - ("x", dict(columns=["y", "z"], index=Index([], name="x"))), - ("y", dict(columns=["x", "z"], index=Index([], name="y"))), - ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"]))), - (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["x", "y"]))), - ([1, 0], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"]))), - (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays( - [[]] * 2, names=["y", "x"]))), -]) +@pytest.mark.parametrize( + "index_col,kwargs", + [ + (None, dict(columns=["x", "y", "z"])), + (False, dict(columns=["x", "y", "z"])), + (0, dict(columns=["y", "z"], index=Index([], name="x"))), + (1, dict(columns=["x", "z"], index=Index([], name="y"))), + ("x", dict(columns=["y", "z"], index=Index([], name="x"))), + ("y", dict(columns=["x", "z"], index=Index([], name="y"))), + ( + [0, 1], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + ["x", "y"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["x", "y"]) + ), + ), + ( + [1, 0], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ( + ["y", "x"], + dict( + columns=["z"], index=MultiIndex.from_arrays([[]] * 2, names=["y", "x"]) + ), + ), + ], +) def test_index_col_empty_data(all_parsers, index_col, kwargs): data = "x,y,z" parser = all_parsers @@ -116,13 +139,16 @@ def test_empty_with_index_col_false(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("index_names", [ - ["", ""], - ["foo", ""], - ["", "bar"], - ["foo", "bar"], - ["NotReallyUnnamed", "Unnamed: 0"], -]) +@pytest.mark.parametrize( + 
"index_names", + [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ], +) def test_multi_index_naming(all_parsers, index_names): parser = all_parsers @@ -130,9 +156,9 @@ def test_multi_index_naming(all_parsers, index_names): data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) result = parser.read_csv(StringIO(data), index_col=[0, 1]) - expected = DataFrame({"col": [1, 2, 3, 4]}, - index=MultiIndex.from_product([["a", "b"], - ["c", "d"]])) + expected = DataFrame( + {"col": [1, 2, 3, 4]}, index=MultiIndex.from_product([["a", "b"], ["c", "d"]]) + ) expected.index.names = [name if name else None for name in index_names] tm.assert_frame_equal(result, expected) @@ -142,8 +168,10 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" result = parser.read_csv(StringIO(data), index_col=[0, 2]) - expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, - index=MultiIndex( - levels=[['a', 'b'], [1, 2, 3, 4]], - codes=[[0, 0, 1, 1], [0, 1, 2, 3]])) + expected = DataFrame( + {"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]] + ), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 6ab761398631b..d144421090274 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -20,8 +20,7 @@ def test_basic(all_parsers, kwargs): data = "a,a,b,b,b\n1,2,3,4,5" result = parser.read_csv(StringIO(data), sep=",", **kwargs) - expected = DataFrame([[1, 2, 3, 4, 5]], - columns=["a", "a.1", "b", "b.1", "b.2"]) + expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"]) tm.assert_frame_equal(result, expected) @@ -30,8 +29,7 @@ def test_basic_names(all_parsers): parser = all_parsers data = "a,b,a\n0,1,2\n3,4,5" - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=["a", "b", "a.1"]) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"]) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) @@ -42,20 +40,30 @@ def test_basic_names_raise(all_parsers): parser = all_parsers data = "0,1,2\n3,4,5" - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=["a", "b", "a"]) -@pytest.mark.parametrize("data,expected", [ - ("a,a,a.1\n1,2,3", - DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), - ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", - DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"])), - ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", - DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"])) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), + ( + "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", + DataFrame( + [[1, 2, 3, 4, 5, 6]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", + DataFrame( + [[1, 2, 3, 4, 5, 6, 7]], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) def test_thorough_mangle_columns(all_parsers, data, expected): # see gh-17060 parser = all_parsers @@ -64,29 +72,42 @@ def 
test_thorough_mangle_columns(all_parsers, data, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,names,expected", [ - ("a,b,b\n1,2,3", - ["a.1", "a.1", "a.1.1"], - DataFrame([["a", "b", "b"], ["1", "2", "3"]], - columns=["a.1", "a.1.1", "a.1.1.1"])), - ("a,b,c,d,e,f\n1,2,3,4,5,6", - ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], - DataFrame([["a", "b", "c", "d", "e", "f"], - ["1", "2", "3", "4", "5", "6"]], - columns=["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"])), - ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7", - ["a", "a", "a.3", "a.1", "a.2", "a", "a"], - DataFrame([["a", "b", "c", "d", "e", "f", "g"], - ["1", "2", "3", "4", "5", "6", "7"]], - columns=["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"])), -]) +@pytest.mark.parametrize( + "data,names,expected", + [ + ( + "a,b,b\n1,2,3", + ["a.1", "a.1", "a.1.1"], + DataFrame( + [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"] + ), + ), + ( + "a,b,c,d,e,f\n1,2,3,4,5,6", + ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + DataFrame( + [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"], + ), + ), + ( + "a,b,c,d,e,f,g\n1,2,3,4,5,6,7", + ["a", "a", "a.3", "a.1", "a.2", "a", "a"], + DataFrame( + [ + ["a", "b", "c", "d", "e", "f", "g"], + ["1", "2", "3", "4", "5", "6", "7"], + ], + columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"], + ), + ), + ], +) def test_thorough_mangle_names(all_parsers, data, names, expected): # see gh-17095 parser = all_parsers - with pytest.raises(ValueError, match='Duplicate names'): + with pytest.raises(ValueError, match="Duplicate names"): parser.read_csv(StringIO(data), names=names) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index b749a8984284d..392628ee74ba2 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -29,9 +29,7 @@ def _construct_dataframe(num_rows): df["foo"] = "foo" df["bar"] = "bar" df["baz"] = "baz" - df["date"] = pd.date_range("20000101 09:00:00", - periods=num_rows, - freq="s") + df["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s") df["int"] = np.arange(num_rows, dtype="int64") return df @@ -43,9 +41,9 @@ def test_multi_thread_string_io_read_csv(all_parsers): num_files = 100 bytes_to_df = [ - "\n".join( - ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)] - ).encode() for _ in range(num_files)] + "\n".join(["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]).encode() + for _ in range(num_files) + ] files = [BytesIO(b) for b in bytes_to_df] # Read all files in many threads. @@ -77,6 +75,7 @@ def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks): ------- df : DataFrame """ + def reader(arg): """ Create a reader for part of the CSV. 
@@ -98,16 +97,21 @@ def reader(arg): start, nrows = arg if not start: - return parser.read_csv(path, index_col=0, header=0, - nrows=nrows, parse_dates=["date"]) - - return parser.read_csv(path, index_col=0, header=None, - skiprows=int(start) + 1, - nrows=nrows, parse_dates=[9]) + return parser.read_csv( + path, index_col=0, header=0, nrows=nrows, parse_dates=["date"] + ) + + return parser.read_csv( + path, + index_col=0, + header=None, + skiprows=int(start) + 1, + nrows=nrows, + parse_dates=[9], + ) tasks = [ - (num_rows * i // num_tasks, - num_rows // num_tasks) for i in range(num_tasks) + (num_rows * i // num_tasks, num_rows // num_tasks) for i in range(num_tasks) ] pool = ThreadPool(processes=num_tasks) @@ -134,6 +138,7 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe(parser, path, - num_rows, num_tasks) + final_dataframe = _generate_multi_thread_dataframe( + parser, path, num_rows, num_tasks + ) tm.assert_frame_equal(df, final_dataframe) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 2367f71a2557e..f154d09358dc1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -21,10 +21,10 @@ def test_string_nas(all_parsers): ,g,h """ result = parser.read_csv(StringIO(data)) - expected = DataFrame([["a", "b", "c"], - ["d", np.nan, "f"], - [np.nan, "g", "h"]], - columns=["A", "B", "C"]) + expected = DataFrame( + [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]], + columns=["A", "B", "C"], + ) tm.assert_frame_equal(result, expected) @@ -35,46 +35,70 @@ def test_detect_string_na(all_parsers): NA,baz NaN,nan """ - expected = DataFrame([["foo", "bar"], [np.nan, "baz"], - [np.nan, np.nan]], columns=["A", "B"]) + expected = DataFrame( + [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]], columns=["A", "B"] + ) result = parser.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_values", [ - ["-999.0", "-999"], - [-999, -999.0], - [-999.0, -999], - ["-999.0"], ["-999"], - [-999.0], [-999] -]) -@pytest.mark.parametrize("data", [ - """A,B +@pytest.mark.parametrize( + "na_values", + [ + ["-999.0", "-999"], + [-999, -999.0], + [-999.0, -999], + ["-999.0"], + ["-999"], + [-999.0], + [-999], + ], +) +@pytest.mark.parametrize( + "data", + [ + """A,B -999,1.2 2,-999 3,4.5 """, - """A,B + """A,B -999,1.200 2,-999.000 3,4.500 -""" -]) +""", + ], +) def test_non_string_na_values(all_parsers, data, na_values): # see gh-3611: with an odd float format, we can't match # the string "999.0" exactly but still need float matching parser = all_parsers - expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], - [3.0, 4.5]], columns=["A", "B"]) + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]], columns=["A", "B"]) result = parser.read_csv(StringIO(data), na_values=na_values) tm.assert_frame_equal(result, expected) def test_default_na_values(all_parsers): - _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A", - "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan", - "-NaN", "-nan", "#N/A N/A", ""} + _NA_VALUES = { + "-1.#IND", + "1.#QNAN", + "1.#IND", + "-1.#QNAN", + "#N/A", + "N/A", + "n/a", + "NA", + "#NA", + "NULL", + "null", + "NaN", + "nan", + "-NaN", + "-nan", + "#N/A N/A", + "", + } assert _NA_VALUES == com._NA_VALUES parser = all_parsers @@ -109,8 +133,9 @@ def test_custom_na_values(all_parsers, na_values): 
-1.#IND,5,baz 7,8,NaN """ - expected = DataFrame([[1., np.nan, 3], [np.nan, 5, np.nan], - [7, 8, np.nan]], columns=["A", "B", "C"]) + expected = DataFrame( + [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]], columns=["A", "B", "C"] + ) result = parser.read_csv(StringIO(data), na_values=na_values, skiprows=[1]) tm.assert_frame_equal(result, expected) @@ -122,9 +147,13 @@ def test_bool_na_values(all_parsers): False,NA,True""" parser = all_parsers result = parser.read_csv(StringIO(data)) - expected = DataFrame({"A": np.array([True, np.nan, False], dtype=object), - "B": np.array([False, True, np.nan], dtype=object), - "C": [True, False, True]}) + expected = DataFrame( + { + "A": np.array([True, np.nan, False], dtype=object), + "B": np.array([False, True, np.nan], dtype=object), + "C": [True, False, True], + } + ) tm.assert_frame_equal(result, expected) @@ -135,54 +164,95 @@ def test_na_value_dict(all_parsers): foo,bar,NA bar,foo,foo""" parser = all_parsers - df = parser.read_csv(StringIO(data), - na_values={"A": ["foo"], "B": ["bar"]}) - expected = DataFrame({"A": [np.nan, "bar", np.nan, "bar"], - "B": [np.nan, "foo", np.nan, "foo"], - "C": [np.nan, "foo", np.nan, "foo"]}) + df = parser.read_csv(StringIO(data), na_values={"A": ["foo"], "B": ["bar"]}) + expected = DataFrame( + { + "A": [np.nan, "bar", np.nan, "bar"], + "B": [np.nan, "foo", np.nan, "foo"], + "C": [np.nan, "foo", np.nan, "foo"], + } + ) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("index_col,expected", [ - ([0], DataFrame({"b": [np.nan], "c": [1], "d": [5]}, - index=Index([0], name="a"))), - ([0, 2], DataFrame({"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples( - [(0, 1)], names=["a", "c"]))), - (["a", "c"], DataFrame({"b": [np.nan], "d": [5]}, - index=MultiIndex.from_tuples( - [(0, 1)], names=["a", "c"]))), -]) +@pytest.mark.parametrize( + "index_col,expected", + [ + ( + [0], + DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")), + ), + ( + [0, 2], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ( + ["a", "c"], + DataFrame( + {"b": [np.nan], "d": [5]}, + index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]), + ), + ), + ], +) def test_na_value_dict_multi_index(all_parsers, index_col, expected): data = """\ a,b,c,d 0,NA,1,5 """ parser = all_parsers - result = parser.read_csv(StringIO(data), na_values=set(), - index_col=index_col) + result = parser.read_csv(StringIO(data), na_values=set(), index_col=index_col) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - (dict(), DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, "five", - np.nan, "seven"]})), - (dict(na_values={"A": [], "C": []}, keep_default_na=False), - DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"]})), - (dict(na_values=["a"], keep_default_na=False), - DataFrame({"A": [np.nan, "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", "nan", "five", "", "seven"]})), - (dict(na_values={"A": [], "C": []}), - DataFrame({"A": ["a", "b", np.nan, "d", "e", np.nan, "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["one", "two", "three", np.nan, - "five", np.nan, "seven"]})), -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + ( + dict(), + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 
3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ( + dict(na_values={"A": [], "C": []}, keep_default_na=False), + DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + dict(na_values=["a"], keep_default_na=False), + DataFrame( + { + "A": [np.nan, "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", "nan", "five", "", "seven"], + } + ), + ), + ( + dict(na_values={"A": [], "C": []}), + DataFrame( + { + "A": ["a", "b", np.nan, "d", "e", np.nan, "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"], + } + ), + ), + ], +) def test_na_values_keep_default(all_parsers, kwargs, expected): data = """\ A,B,C @@ -215,10 +285,13 @@ def test_no_na_values_no_keep_default(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), keep_default_na=False) - expected = DataFrame({"A": ["a", "b", "", "d", "e", "nan", "g"], - "B": [1, 2, 3, 4, 5, 6, 7], - "C": ["None", "two", "None", "nan", - "five", "", "seven"]}) + expected = DataFrame( + { + "A": ["a", "b", "", "d", "e", "nan", "g"], + "B": [1, 2, 3, 4, 5, 6, 7], + "C": ["None", "two", "None", "nan", "five", "", "seven"], + } + ) tm.assert_frame_equal(result, expected) @@ -226,8 +299,9 @@ def test_no_keep_default_na_dict_na_values(all_parsers): # see gh-19227 data = "a,b\n,2" parser = all_parsers - result = parser.read_csv(StringIO(data), na_values={"b": ["2"]}, - keep_default_na=False) + result = parser.read_csv( + StringIO(data), na_values={"b": ["2"]}, keep_default_na=False + ) expected = DataFrame({"a": [""], "b": [np.nan]}) tm.assert_frame_equal(result, expected) @@ -238,42 +312,47 @@ def test_no_keep_default_na_dict_na_scalar_values(all_parsers): # Scalar values shouldn't cause the parsing to crash or fail. 
data = "a,b\n1,2" parser = all_parsers - df = parser.read_csv(StringIO(data), na_values={"b": 2}, - keep_default_na=False) + df = parser.read_csv(StringIO(data), na_values={"b": 2}, keep_default_na=False) expected = DataFrame({"a": [1], "b": [np.nan]}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("col_zero_na_values", [ - 113125, "113125" -]) -def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, - col_zero_na_values): +@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"]) +def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values): # see gh-19227 data = """\ 113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 729639,"qwer","",asdfkj,466.681,,252.373 """ parser = all_parsers - expected = DataFrame({0: [np.nan, 729639.0], - 1: [np.nan, "qwer"], - 2: ["/blaha", np.nan], - 3: ["kjsdkj", "asdfkj"], - 4: [412.166, 466.681], - 5: ["225.874", ""], - 6: [np.nan, 252.373]}) - - result = parser.read_csv(StringIO(data), header=None, - keep_default_na=False, - na_values={2: "", 6: "214.008", - 1: "blah", 0: col_zero_na_values}) + expected = DataFrame( + { + 0: [np.nan, 729639.0], + 1: [np.nan, "qwer"], + 2: ["/blaha", np.nan], + 3: ["kjsdkj", "asdfkj"], + 4: [412.166, 466.681], + 5: ["225.874", ""], + 6: [np.nan, 252.373], + } + ) + + result = parser.read_csv( + StringIO(data), + header=None, + keep_default_na=False, + na_values={2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}, + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_filter,row_data", [ - (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), - (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), -]) +@pytest.mark.parametrize( + "na_filter,row_data", + [ + (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]), + (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), + ], +) def test_na_values_na_filter_override(all_parsers, na_filter, row_data): data = """\ A,B @@ -282,8 +361,7 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): 3,C """ parser = all_parsers - result = parser.read_csv(StringIO(data), na_values=["B"], - na_filter=na_filter) + result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -297,18 +375,32 @@ def test_na_trailing_columns(all_parsers): # Trailing columns should be all NaN. 
result = parser.read_csv(StringIO(data)) - expected = DataFrame([ - ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], - ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], - ], columns=["Date", "Currency", "Symbol", "Type", - "Units", "UnitPrice", "Cost", "Tax"]) + expected = DataFrame( + [ + ["2012-03-14", "USD", "AAPL", "BUY", 1000, np.nan, np.nan, np.nan], + ["2012-05-12", "USD", "SBUX", "SELL", 500, np.nan, np.nan, np.nan], + ], + columns=[ + "Date", + "Currency", + "Symbol", + "Type", + "Units", + "UnitPrice", + "Cost", + "Tax", + ], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_values,row_data", [ - (1, [[np.nan, 2.0], [2.0, np.nan]]), - ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), -]) +@pytest.mark.parametrize( + "na_values,row_data", + [ + (1, [[np.nan, 2.0], [2.0, np.nan]]), + ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]), + ], +) def test_na_values_scalar(all_parsers, na_values, row_data): # see gh-12224 parser = all_parsers @@ -346,14 +438,18 @@ def test_na_values_dict_col_index(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - (str(2**63) + "\n" + str(2**63 + 1), - dict(na_values=[2**63]), DataFrame([str(2**63), str(2**63 + 1)])), - (str(2**63) + ",1" + "\n,2", - dict(), DataFrame([[str(2**63), 1], ['', 2]])), - (str(2**63) + "\n1", - dict(na_values=[2**63]), DataFrame([np.nan, 1])), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + str(2 ** 63) + "\n" + str(2 ** 63 + 1), + dict(na_values=[2 ** 63]), + DataFrame([str(2 ** 63), str(2 ** 63 + 1)]), + ), + (str(2 ** 63) + ",1" + "\n,2", dict(), DataFrame([[str(2 ** 63), 1], ["", 2]])), + (str(2 ** 63) + "\n1", dict(na_values=[2 ** 63]), DataFrame([np.nan, 1])), + ], +) def test_na_values_uint64(all_parsers, data, kwargs, expected): # see gh-14983 parser = all_parsers @@ -367,15 +463,13 @@ def test_empty_na_values_no_default_with_index(all_parsers): parser = all_parsers expected = DataFrame({"1": [2]}, index=Index(["b"], name="a")) - result = parser.read_csv(StringIO(data), index_col=0, - keep_default_na=False) + result = parser.read_csv(StringIO(data), index_col=0, keep_default_na=False) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("na_filter,index_data", [ - (False, ["", "5"]), - (True, [np.nan, 5.0]), -]) +@pytest.mark.parametrize( + "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])] +) def test_no_na_filter_on_index(all_parsers, na_filter, index_data): # see gh-5239 # @@ -383,10 +477,8 @@ def test_no_na_filter_on_index(all_parsers, na_filter, index_data): parser = all_parsers data = "a,b,c\n1,,3\n4,5,6" - expected = DataFrame({"a": [1, 4], "c": [3, 6]}, - index=Index(index_data, name="b")) - result = parser.read_csv(StringIO(data), index_col=[1], - na_filter=na_filter) + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index(index_data, name="b")) + result = parser.read_csv(StringIO(data), index_col=[1], na_filter=na_filter) tm.assert_frame_equal(result, expected) @@ -396,10 +488,10 @@ def test_inf_na_values_with_int_index(all_parsers): data = "idx,col1,col2\n1,3,4\n2,inf,-inf" # Don't fail with OverflowError with inf's and integer index column. 
- out = parser.read_csv(StringIO(data), index_col=[0], - na_values=["inf", "-inf"]) - expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, - index=Index([1, 2], name="idx")) + out = parser.read_csv(StringIO(data), index_col=[0], na_values=["inf", "-inf"]) + expected = DataFrame( + {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx") + ) tm.assert_frame_equal(out, expected) @@ -412,27 +504,35 @@ def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): # na_filter=True --> missing value becomes NaN. # na_filter=False --> missing value remains empty string. empty = np.nan if na_filter else "" - expected = DataFrame({"a": ["1", "4"], - "b": [empty, "5"], - "c": ["3", "6"]}) + expected = DataFrame({"a": ["1", "4"], "b": [empty, "5"], "c": ["3", "6"]}) result = parser.read_csv(StringIO(data), na_filter=na_filter, dtype=str) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data, na_values", [ - ("false,1\n,1\ntrue", None), - ("false,1\nnull,1\ntrue", None), - ("false,1\nnan,1\ntrue", None), - ("false,1\nfoo,1\ntrue", 'foo'), - ("false,1\nfoo,1\ntrue", ['foo']), - ("false,1\nfoo,1\ntrue", {'a': 'foo'}), -]) +@pytest.mark.parametrize( + "data, na_values", + [ + ("false,1\n,1\ntrue", None), + ("false,1\nnull,1\ntrue", None), + ("false,1\nnan,1\ntrue", None), + ("false,1\nfoo,1\ntrue", "foo"), + ("false,1\nfoo,1\ntrue", ["foo"]), + ("false,1\nfoo,1\ntrue", {"a": "foo"}), + ], +) def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): parser = all_parsers - msg = ("(Bool column has NA values in column [0a])|" - "(cannot safely convert passed user dtype of " - "bool for object dtyped data in column 0)") + msg = ( + "(Bool column has NA values in column [0a])|" + "(cannot safely convert passed user dtype of " + "bool for object dtyped data in column 0)" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), header=None, names=['a', 'b'], - dtype={'a': 'bool'}, na_values=na_values) + parser.read_csv( + StringIO(data), + header=None, + names=["a", "b"], + dtype={"a": "bool"}, + na_values=na_values, + ) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 20bd5b74f1784..dbe721b10a3ce 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -17,57 +17,56 @@ @pytest.mark.network -@pytest.mark.parametrize("compress_type, extension", [ - ('gzip', '.gz'), ('bz2', '.bz2'), ('zip', '.zip'), ('xz', '.xz')]) -@pytest.mark.parametrize('mode', ['explicit', 'infer']) -@pytest.mark.parametrize('engine', ['python', 'c']) -def test_compressed_urls(salaries_table, compress_type, extension, mode, - engine): - check_compressed_urls(salaries_table, compress_type, extension, mode, - engine) +@pytest.mark.parametrize( + "compress_type, extension", + [("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")], +) +@pytest.mark.parametrize("mode", ["explicit", "infer"]) +@pytest.mark.parametrize("engine", ["python", "c"]) +def test_compressed_urls(salaries_table, compress_type, extension, mode, engine): + check_compressed_urls(salaries_table, compress_type, extension, mode, engine) @tm.network -def check_compressed_urls(salaries_table, compression, extension, mode, - engine): +def check_compressed_urls(salaries_table, compression, extension, mode, engine): # test reading compressed urls with various engines and # extension inference - base_url = ('https://github.com/pandas-dev/pandas/raw/master/' - 
'pandas/tests/io/parser/data/salaries.csv') + base_url = ( + "https://github.com/pandas-dev/pandas/raw/master/" + "pandas/tests/io/parser/data/salaries.csv" + ) url = base_url + extension - if mode != 'explicit': + if mode != "explicit": compression = mode - url_table = read_csv(url, sep='\t', compression=compression, engine=engine) + url_table = read_csv(url, sep="\t", compression=compression, engine=engine) tm.assert_frame_equal(url_table, salaries_table) @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" - return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + return read_csv(datapath("io", "parser", "data", "tips.csv")) @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: - def test_parse_public_s3_bucket(self, tips_df): - pytest.importorskip('s3fs') + pytest.importorskip("s3fs") # more of an integration test due to the not-public contents portion # can probably mock this though. - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv('s3://cant_get_it/tips.csv') + df = read_csv("s3://cant_get_it/tips.csv") assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) @@ -75,22 +74,21 @@ def test_parse_public_s3_bucket(self, tips_df): def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL - df = read_csv('s3n://pandas-test/tips.csv', nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL - df = read_csv('s3a://pandas-test/tips.csv', nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3_bucket_nrows(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) @@ -98,9 +96,10 @@ def test_parse_public_s3_bucket_nrows(self, tips_df): def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df_reader = read_csv( + "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: # Read a couple of chunks and make sure we see them @@ -108,70 +107,75 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] + true_df = tips_df.iloc[chunksize * i_chunk 
: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp, - engine='python') + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df_reader = read_csv( + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + engine="python", + ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: # Read a couple of chunks and make sure we see them properly. df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = tips_df.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] + true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) def test_parse_public_s3_bucket_python(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) def test_infer_s3_compression(self, tips_df): - for ext in ['', '.gz', '.bz2']: - df = read_csv('s3://pandas-test/tips.csv' + ext, - engine='python', compression='infer') + for ext in ["", ".gz", ".bz2"]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) def test_parse_public_s3_bucket_nrows_python(self, tips_df): - for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', - nrows=10, compression=comp) + for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + engine="python", + nrows=10, + compression=comp, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): - read_csv('s3://nyqpug/asdf.csv') + read_csv("s3://nyqpug/asdf.csv") # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. 
         with pytest.raises(IOError):
-            read_csv('s3://cant_get_it/')
+            read_csv("s3://cant_get_it/")

-    def test_read_csv_handles_boto_s3_object(self,
-                                             s3_resource,
-                                             tips_file):
+    def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
         # see gh-16135
         s3_object = s3_resource.meta.client.get_object(
-            Bucket='pandas-test',
-            Key='tips.csv')
+            Bucket="pandas-test", Key="tips.csv"
+        )

-        result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
+        result = read_csv(BytesIO(s3_object["Body"].read()), encoding="utf8")
         assert isinstance(result, DataFrame)
         assert not result.empty

@@ -180,24 +184,22 @@ def test_read_csv_handles_boto_s3_object(self,

     def test_read_csv_chunked_download(self, s3_resource, caplog):
         # 8 MB, S3FS uses 5MB chunks
-        df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
+        df = DataFrame(np.random.randn(100000, 4), columns=list("abcd"))
         buf = BytesIO()
         str_buf = StringIO()

         df.to_csv(str_buf)

-        buf = BytesIO(str_buf.getvalue().encode('utf-8'))
+        buf = BytesIO(str_buf.getvalue().encode("utf-8"))

-        s3_resource.Bucket("pandas-test").put_object(
-            Key="large-file.csv",
-            Body=buf)
+        s3_resource.Bucket("pandas-test").put_object(Key="large-file.csv", Body=buf)

-        with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
+        with caplog.at_level(logging.DEBUG, logger="s3fs.core"):
             read_csv("s3://pandas-test/large-file.csv", nrows=5)

             # log of fetch_range (start, stop)
-            assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
+            assert (0, 5505024) in {x.args[-2:] for x in caplog.records}

     def test_read_s3_with_hash_in_key(self, tips_df):
         # GH 25945
-        result = read_csv('s3://pandas-test/tips#1.csv')
+        result = read_csv("s3://pandas-test/tips#1.csv")
         tm.assert_frame_equal(tips_df, result)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 25589a1682f7a..99e4e5c022ecb 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -42,11 +42,17 @@ def test_separator_date_conflict(all_parsers):
     # date parsing do not conflict.
parser = all_parsers data = "06-02-2013;13:00;1-000.215" - expected = DataFrame([[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], - columns=["Date", 2]) + expected = DataFrame( + [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] + ) - df = parser.read_csv(StringIO(data), sep=";", thousands="-", - parse_dates={"Date": [0, 1]}, header=None) + df = parser.read_csv( + StringIO(data), + sep=";", + thousands="-", + parse_dates={"Date": [0, 1]}, + header=None, + ) tm.assert_frame_equal(df, expected) @@ -77,32 +83,109 @@ def date_parser(*date_cols): """ return parsing.try_parse_dates(parsing._concat_date_cols(date_cols)) - result = parser.read_csv(StringIO(data), header=None, - date_parser=date_parser, prefix="X", - parse_dates={"actual": [1, 2], - "nominal": [1, 3]}, - keep_date_col=keep_date_col) - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", "19990127", " 19:00:00", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", "19990127", " 20:00:00", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", "19990127", " 21:00:00", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", "19990127", " 21:00:00", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", "19990127", " 22:00:00", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", "19990127", " 23:00:00", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["actual", "nominal", "X0", "X1", "X2", - "X3", "X4", "X5", "X6", "X7", "X8"]) + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=date_parser, + prefix="X", + parse_dates={"actual": [1, 2], "nominal": [1, 3]}, + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "actual", + "nominal", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) @@ -139,30 +222,108 @@ def test_multiple_date_col(all_parsers, keep_date_col): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, - prefix="X", parse_dates=[[1, 2], [1, 3]], - keep_date_col=keep_date_col) - expected = 
DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", "19990127", " 19:00:00", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", "19990127", " 20:00:00", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", "19990127", " 21:00:00", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", "19990127", " 21:00:00", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", "19990127", " 22:00:00", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", "19990127", " 23:00:00", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["X1_X2", "X1_X3", "X0", "X1", "X2", - "X3", "X4", "X5", "X6", "X7", "X8"]) + result = parser.read_csv( + StringIO(data), + header=None, + prefix="X", + parse_dates=[[1, 2], [1, 3]], + keep_date_col=keep_date_col, + ) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + "19990127", + " 19:00:00", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + "19990127", + " 20:00:00", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + "19990127", + " 21:00:00", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + "19990127", + " 21:00:00", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + "19990127", + " 22:00:00", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + "19990127", + " 23:00:00", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "X1_X2", + "X1_X3", + "X0", + "X1", + "X2", + "X3", + "X4", + "X5", + "X6", + "X7", + "X8", + ], + ) if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) @@ -181,49 +342,84 @@ def test_date_col_as_index_col(all_parsers): KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 """ parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, prefix="X", - parse_dates=[1], index_col=1) - - index = Index([datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 22, 0)], name="X1") - expected = DataFrame([ - ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], - ["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], - ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], - ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], - ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], - ], columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], index=index) + result = parser.read_csv( + StringIO(data), header=None, prefix="X", parse_dates=[1], index_col=1 + ) + + index = Index( + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 22, 0), + ], + name="X1", + ) + expected = DataFrame( + [ + ["KORD", " 18:56:00", 0.81, 2.81, 7.2, 0.0, 280.0], + 
["KORD", " 19:56:00", 0.01, 2.21, 7.2, 0.0, 260.0], + ["KORD", " 20:56:00", -0.59, 2.21, 5.7, 0.0, 280.0], + ["KORD", " 21:18:00", -0.99, 2.01, 3.6, 0.0, 270.0], + ["KORD", " 21:56:00", -0.59, 1.71, 5.1, 0.0, 290.0], + ], + columns=["X0", "X2", "X3", "X4", "X5", "X6", "X7"], + index=index, + ) tm.assert_frame_equal(result, expected) def test_multiple_date_cols_int_cast(all_parsers): - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) parse_dates = {"actual": [1, 2], "nominal": [1, 3]} parser = all_parsers - result = parser.read_csv(StringIO(data), header=None, - date_parser=conv.parse_date_time, - parse_dates=parse_dates, prefix="X") - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", 0.01], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", -0.99], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", -0.59], - ], columns=["actual", "nominal", "X0", "X4"]) + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=conv.parse_date_time, + parse_dates=parse_dates, + prefix="X", + ) + expected = DataFrame( + [ + [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], + [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", "X0", "X4"], + ) # Python can sometimes be flaky about how # the aggregated columns are entered, so @@ -237,14 +433,32 @@ def test_multiple_date_col_timestamp_parse(all_parsers): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = parser.read_csv(StringIO(data), parse_dates=[[0, 1]], - header=None, date_parser=Timestamp) - expected = DataFrame([ - [Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, 1, "E", 0, np.nan, 1306.25], - [Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, 8, "E", 0, np.nan, 1306.25] - ], columns=["0_1", 2, 3, 4, 5, 6, 7]) + result = parser.read_csv( + StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp + ) + expected = DataFrame( + [ + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 1, + "E", + 0, + np.nan, + 1306.25, + ], + [ + Timestamp("05/31/2012, 15:30:00.029"), + 1306.25, + 8, + "E", + 0, + np.nan, + 1306.25, + ], + ], + columns=["0_1", 2, 3, 4, 5, 6, 7], + ) tm.assert_frame_equal(result, expected) @@ -260,35 +474,104 @@ 
def test_multiple_date_cols_with_header(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "ActualTime", "TDew", - "TAir", "Windspeed", "Precip", "WindDir"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,parse_dates,msg", [ - ("""\ +@pytest.mark.parametrize( + "data,parse_dates,msg", + [ + ( + """\ date_NominalTime,date,NominalTime KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", [[1, 2]], ("New date column already " - "in dict date_NominalTime")), - ("""\ +KORD2,19990127, 20:00:00""", + [[1, 2]], + ("New date column already " "in dict date_NominalTime"), + ), + ( + """\ ID,date,nominalTime KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", dict(ID=[1, 2]), "Date column ID already in dict") -]) +KORD,19990127, 20:00:00""", + dict(ID=[1, 2]), + "Date column ID already in dict", + ), + ], +) def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser = all_parsers @@ -299,30 +582,61 @@ def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers - data = ("posix_timestamp,elapsed,sys,user,queries,query_time,rows," - "accountid,userid,contactid,level,silo,method\n" - "1343103150,0.062353,0,4,6,0.01690,3," - "12345,1,-1,3,invoice_InvoiceResource,search\n") + data = ( + "posix_timestamp,elapsed,sys,user,queries,query_time,rows," + "accountid,userid,contactid,level,silo,method\n" + "1343103150,0.062353,0,4,6,0.01690,3," + "12345,1,-1,3,invoice_InvoiceResource,search\n" + ) result = parser.read_csv( - StringIO(data), index_col=0, parse_dates=[0], - date_parser=lambda x: datetime.utcfromtimestamp(int(x))) - expected = DataFrame([[0.062353, 0, 4, 6, 0.01690, 3, 12345, 1, -1, - 3, "invoice_InvoiceResource", "search"]], - columns=["elapsed", "sys", "user", "queries", - "query_time", "rows", "accountid", - "userid", "contactid", "level", - "silo", "method"], - index=Index([Timestamp("2012-07-24 04:12:30")], - name="posix_timestamp")) + StringIO(data), + index_col=0, + parse_dates=[0], + 
date_parser=lambda x: datetime.utcfromtimestamp(int(x)), + ) + expected = DataFrame( + [ + [ + 0.062353, + 0, + 4, + 6, + 0.01690, + 3, + 12345, + 1, + -1, + 3, + "invoice_InvoiceResource", + "search", + ] + ], + columns=[ + "elapsed", + "sys", + "user", + "queries", + "query_time", + "rows", + "accountid", + "userid", + "contactid", + "level", + "silo", + "method", + ], + index=Index([Timestamp("2012-07-24 04:12:30")], name="posix_timestamp"), + ) tm.assert_frame_equal(result, expected) def test_nat_parse(all_parsers): # see gh-3062 parser = all_parsers - df = DataFrame(dict({"A": np.arange(10, dtype="float64"), - "B": pd.Timestamp("20010101")})) + df = DataFrame( + dict({"A": np.arange(10, dtype="float64"), "B": pd.Timestamp("20010101")}) + ) df.iloc[3:6, :] = np.nan with tm.ensure_clean("__nat_parse_.csv") as path: @@ -340,8 +654,8 @@ def test_csv_custom_parser(all_parsers): """ parser = all_parsers result = parser.read_csv( - StringIO(data), - date_parser=lambda x: datetime.strptime(x, "%Y%m%d")) + StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d") + ) expected = parser.read_csv(StringIO(data), parse_dates=True) tm.assert_frame_equal(result, expected) @@ -355,8 +669,7 @@ def test_parse_dates_implicit_first_col(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), parse_dates=True) - expected = parser.read_csv(StringIO(data), index_col=0, - parse_dates=True) + expected = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) tm.assert_frame_equal(result, expected) @@ -367,23 +680,20 @@ def test_parse_dates_string(all_parsers): 20090103,c,4,5 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col="date", - parse_dates=["date"]) + result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) index = date_range("1/1/2009", periods=3) index.name = "date" - expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], - "C": [2, 4, 5]}, index=index) + expected = DataFrame( + {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index + ) tm.assert_frame_equal(result, expected) # Bug in https://github.com/dateutil/dateutil/issues/217 # has been addressed, but we just don't pass in the `yearfirst` @pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [ - [["date", "time"]], - [[0, 1]] -]) +@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) def test_yy_format_with_year_first(all_parsers, parse_dates): data = """date,time,B,C 090131,0010,1,2 @@ -391,12 +701,16 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): 090331,0830,5,6 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0, - parse_dates=parse_dates) - index = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0)], - dtype=object, name="date_time") + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + index = DatetimeIndex( + [ + datetime(2009, 1, 31, 0, 10, 0), + datetime(2009, 2, 28, 10, 20, 0), + datetime(2009, 3, 31, 8, 30, 0), + ], + dtype=object, + name="date_time", + ) expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) tm.assert_frame_equal(result, expected) @@ -406,12 +720,14 @@ def test_parse_dates_column_list(all_parsers, parse_dates): data = "a,b,c\n01/01/2010,1,15/02/2010" parser = all_parsers - expected = DataFrame({"a": [datetime(2010, 1, 1)], "b": [1], - "c": [datetime(2010, 2, 15)]}) + expected = DataFrame( + 
{"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} + ) expected = expected.set_index(["a", "b"]) - result = parser.read_csv(StringIO(data), index_col=[0, 1], - parse_dates=parse_dates, dayfirst=True) + result = parser.read_csv( + StringIO(data), index_col=[0, 1], parse_dates=parse_dates, dayfirst=True + ) tm.assert_frame_equal(result, expected) @@ -429,27 +745,38 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers - index = MultiIndex.from_product([ - (datetime(2009, 1, 1), datetime(2009, 1, 2), - datetime(2009, 1, 3)), ("one", "two", "three")], - names=["index1", "index2"]) + index = MultiIndex.from_product( + [ + (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + ("one", "two", "three"), + ], + names=["index1", "index2"], + ) # Out of order. if index_col == [1, 0]: index = index.swaplevel(0, 1) - expected = DataFrame([["a", 1, 2], ["b", 3, 4], ["c", 4, 5], - ["a", 1, 2], ["b", 3, 4], ["c", 4, 5], - ["a", 1, 2], ["b", 3, 4], ["c", 4, 5]], - columns=["A", "B", "C"], index=index) - result = parser.read_csv(StringIO(data), index_col=index_col, - parse_dates=True) + expected = DataFrame( + [ + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ["a", 1, 2], + ["b", 3, 4], + ["c", 4, 5], + ], + columns=["A", "B", "C"], + index=index, + ) + result = parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(dayfirst=True), dict(day_first=True) -]) +@pytest.mark.parametrize("kwargs", [dict(dayfirst=True), dict(day_first=True)]) def test_parse_dates_custom_euro_format(all_parsers, kwargs): parser = all_parsers data = """foo,bar,baz @@ -458,22 +785,37 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): 02/02/2010,1,2 """ if "dayfirst" in kwargs: - df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - header=0, index_col=0, parse_dates=True, - na_values=["NA"]) - exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), - datetime(2010, 2, 2)], name="time") - expected = DataFrame({"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, - index=exp_index, columns=["Q", "NTU"]) + df = parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + header=0, + index_col=0, + parse_dates=True, + na_values=["NA"], + ) + exp_index = Index( + [datetime(2010, 1, 31), datetime(2010, 2, 1), datetime(2010, 2, 2)], + name="time", + ) + expected = DataFrame( + {"Q": [1, 1, 1], "NTU": [2, np.nan, 2]}, + index=exp_index, + columns=["Q", "NTU"], + ) tm.assert_frame_equal(df, expected) else: msg = "got an unexpected keyword argument 'day_first'" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), names=["time", "Q", "NTU"], - date_parser=lambda d: du_parse(d, **kwargs), - skiprows=[0], index_col=0, parse_dates=True, - na_values=["NA"]) + parser.read_csv( + StringIO(data), + names=["time", "Q", "NTU"], + date_parser=lambda d: du_parse(d, **kwargs), + skiprows=[0], + index_col=0, + parse_dates=True, + na_values=["NA"], + ) def test_parse_tz_aware(all_parsers): @@ -481,19 +823,18 @@ def test_parse_tz_aware(all_parsers): parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" - result = parser.read_csv(StringIO(data), index_col=0, - parse_dates=True) - expected = DataFrame({"x": [0.5]}, index=Index([Timestamp( - "2012-06-13 01:39:00+00:00")], 
name="Date")) + result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + expected = DataFrame( + {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") + ) tm.assert_frame_equal(result, expected) assert result.index.tz is pytz.utc -@pytest.mark.parametrize("parse_dates,index_col", [ - ({"nominal": [1, 2]}, "nominal"), - ({"nominal": [1, 2]}, 0), - ([[1, 2]], 0), -]) +@pytest.mark.parametrize( + "parse_dates,index_col", + [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], +) def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): parser = all_parsers data = """ @@ -505,28 +846,88 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD1", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD2", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD3", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD4", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD5", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD6", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "ActualTime", "TDew", - "TAir", "Windspeed", "Precip", "WindDir"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD1", + " 18:56:00", + 0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD2", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD3", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD4", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD5", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD6", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=[ + "nominal", + "ID", + "ActualTime", + "TDew", + "TAir", + "Windspeed", + "Precip", + "WindDir", + ], + ) expected = expected.set_index("nominal") if not isinstance(parse_dates, dict): expected.index.name = "date_NominalTime" - result = parser.read_csv(StringIO(data), parse_dates=parse_dates, - index_col=index_col) + result = parser.read_csv( + StringIO(data), parse_dates=parse_dates, index_col=index_col + ) tm.assert_frame_equal(result, expected) @@ -542,24 +943,79 @@ def test_multiple_date_cols_chunked(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame([ - [datetime(1999, 1, 27, 19, 0), "KORD", " 18:56:00", - 0.81, 2.81, 7.2, 0.0, 280.0], - [datetime(1999, 1, 27, 20, 0), "KORD", " 19:56:00", - 0.01, 2.21, 7.2, 0.0, 260.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 20:56:00", - -0.59, 2.21, 5.7, 0.0, 280.0], - [datetime(1999, 1, 27, 21, 0), "KORD", " 21:18:00", - -0.99, 2.01, 3.6, 0.0, 270.0], - [datetime(1999, 1, 27, 22, 0), "KORD", " 21:56:00", - -0.59, 1.71, 5.1, 0.0, 290.0], - [datetime(1999, 1, 27, 23, 0), "KORD", " 22:56:00", - -0.59, 1.71, 4.6, 0.0, 280.0], - ], columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + "KORD", + " 18:56:00", + 
0.81, + 2.81, + 7.2, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 20, 0), + "KORD", + " 19:56:00", + 0.01, + 2.21, + 7.2, + 0.0, + 260.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 20:56:00", + -0.59, + 2.21, + 5.7, + 0.0, + 280.0, + ], + [ + datetime(1999, 1, 27, 21, 0), + "KORD", + " 21:18:00", + -0.99, + 2.01, + 3.6, + 0.0, + 270.0, + ], + [ + datetime(1999, 1, 27, 22, 0), + "KORD", + " 21:56:00", + -0.59, + 1.71, + 5.1, + 0.0, + 290.0, + ], + [ + datetime(1999, 1, 27, 23, 0), + "KORD", + " 22:56:00", + -0.59, + 1.71, + 4.6, + 0.0, + 280.0, + ], + ], + columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], + ) expected = expected.set_index("nominal") - reader = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}, - index_col="nominal", chunksize=2) + reader = parser.read_csv( + StringIO(data), + parse_dates={"nominal": [1, 2]}, + index_col="nominal", + chunksize=2, + ) chunks = list(reader) tm.assert_frame_equal(chunks[0], expected[:2]) @@ -579,12 +1035,14 @@ def test_multiple_date_col_named_index_compat(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - with_indices = parser.read_csv(StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal") - with_names = parser.read_csv(StringIO(data), index_col="nominal", - parse_dates={"nominal": [ - "date", "nominalTime"]}) + with_indices = parser.read_csv( + StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" + ) + with_names = parser.read_csv( + StringIO(data), + index_col="nominal", + parse_dates={"nominal": ["date", "nominalTime"]}, + ) tm.assert_frame_equal(with_indices, with_names) @@ -599,10 +1057,10 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - result = parser.read_csv(StringIO(data), index_col=["nominal", "ID"], - parse_dates={"nominal": [1, 2]}) - expected = parser.read_csv(StringIO(data), - parse_dates={"nominal": [1, 2]}) + result = parser.read_csv( + StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} + ) + expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) expected = expected.set_index(["nominal", "ID"]) tm.assert_frame_equal(result, expected) @@ -612,8 +1070,10 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ("Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) data = """A,B,C 1,2,2003-11-1""" @@ -621,13 +1081,13 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): parser.read_csv(StringIO(data), parse_dates="C", **kwargs) -@pytest.mark.parametrize("parse_dates", [ - (1,), np.array([4, 5]), {1, 3, 3} -]) +@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ("Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter") + msg = ( + "Only booleans, lists, and dictionaries " + "are accepted for the 'parse_dates' parameter" + ) data = """A,B,C 1,2,2003-11-1""" @@ -636,44 +1096,66 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): 
@pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", [ - 'nan', '0', '']) +@pytest.mark.parametrize("value", ["nan", "0", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers - s = StringIO(('%s,\n' % value) * 50000) + s = StringIO(("%s,\n" % value) * 50000) - parser.read_csv(s, - header=None, names=['foo', 'bar'], parse_dates=['foo'], - infer_datetime_format=False, - cache_dates=cache_dates) + parser.read_csv( + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + infer_datetime_format=False, + cache_dates=cache_dates, + ) def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers data = "Date,test\n2012-01-01,1\n,2" - result = parser.read_csv(StringIO(data), parse_dates=["Date"], - na_filter=False) + result = parser.read_csv(StringIO(data), parse_dates=["Date"], na_filter=False) - expected = DataFrame([[datetime(2012, 1, 1), 1], [pd.NaT, 2]], - columns=["Date", "test"]) + expected = DataFrame( + [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] + ) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("a\n04.15.2016", dict(parse_dates=["a"]), - DataFrame([datetime(2016, 4, 15)], columns=["a"])), - ("a\n04.15.2016", dict(parse_dates=True, index_col=0), - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"))), - ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=["a", "b"]), - DataFrame([[datetime(2016, 4, 15), datetime(2013, 9, 16)]], - columns=["a", "b"])), - ("a,b\n04.15.2016,09.16.2013", dict(parse_dates=True, index_col=[0, 1]), - DataFrame(index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"]))), -]) +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + "a\n04.15.2016", + dict(parse_dates=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"]), + ), + ( + "a\n04.15.2016", + dict(parse_dates=True, index_col=0), + DataFrame(index=DatetimeIndex(["2016-04-15"], name="a")), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=["a", "b"]), + DataFrame( + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + ), + ), + ( + "a,b\n04.15.2016,09.16.2013", + dict(parse_dates=True, index_col=[0, 1]), + DataFrame( + index=MultiIndex.from_tuples( + [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + ) + ), + ), + ], +) def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): # see gh-14066 parser = all_parsers @@ -690,51 +1172,96 @@ def test_parse_date_time_multi_level_column_name(all_parsers): 2001-01-06, 00:00:00, 1.0, 11. 
""" parser = all_parsers - result = parser.read_csv(StringIO(data), header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=conv.parse_date_time) - - expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.], - [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]] - expected = DataFrame(expected_data, - columns=["date_time", ("A", "a"), ("B", "b")]) + result = parser.read_csv( + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=conv.parse_date_time, + ) + + expected_data = [ + [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], + ] + expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""\ +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """\ date,time,a,b 2001-01-05, 10:00:00, 0.0, 10. 2001-01-05, 00:00:00, 1., 11. -""", dict(header=0, parse_dates={"date_time": [0, 1]}), - DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0]], - columns=["date_time", "a", "b"])), - (("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900"), - dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), - DataFrame([ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), - "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), - "KORD", 0.01], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 20, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 21, 18), - "KORD", -0.99], - [datetime(1999, 1, 27, 22, 0), datetime(1999, 1, 27, 21, 56), - "KORD", -0.59], - [datetime(1999, 1, 27, 23, 0), datetime(1999, 1, 27, 22, 56), - "KORD", -0.59]], columns=["actual", "nominal", 0, 4])), -]) +""", + dict(header=0, parse_dates={"date_time": [0, 1]}), + DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], + [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], + ], + columns=["date_time", "a", "b"], + ), + ), + ( + ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ), + dict(header=None, parse_dates={"actual": [1, 2], "nominal": [1, 3]}), + DataFrame( + [ + [ + datetime(1999, 1, 27, 19, 0), + datetime(1999, 1, 27, 18, 56), + "KORD", + 0.81, + ], + [ + datetime(1999, 1, 27, 20, 0), + datetime(1999, 1, 27, 19, 56), + "KORD", + 0.01, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 20, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 21, 0), + datetime(1999, 1, 27, 21, 18), + "KORD", + -0.99, + ], + [ + datetime(1999, 1, 27, 22, 0), + datetime(1999, 1, 27, 21, 56), + "KORD", + -0.59, + ], + [ + datetime(1999, 1, 27, 23, 0), + datetime(1999, 1, 27, 22, 56), + "KORD", + -0.59, + ], + ], + columns=["actual", "nominal", 0, 4], + ), + ), + ], +) def test_parse_date_time(all_parsers, data, kwargs, expected): parser = all_parsers - result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, - **kwargs) + result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, **kwargs) # Python can 
sometimes be flaky about how # the aggregated columns are entered, so @@ -745,14 +1272,18 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): def test_parse_date_fields(all_parsers): parser = all_parsers - data = ("year,month,day,a\n2001,01,10,10.\n" - "2001,02,1,11.") - result = parser.read_csv(StringIO(data), header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=conv.parse_date_fields) - - expected = DataFrame([[datetime(2001, 1, 10), 10.], - [datetime(2001, 2, 1), 11.]], columns=["ymd", "a"]) + data = "year,month,day,a\n2001,01,10,10.\n" "2001,02,1,11." + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=conv.parse_date_fields, + ) + + expected = DataFrame( + [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], + columns=["ymd", "a"], + ) tm.assert_frame_equal(result, expected) @@ -763,12 +1294,19 @@ def test_parse_date_all_fields(all_parsers): 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. """ - result = parser.read_csv(StringIO(data), header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) - expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0]], - columns=["ymdHMS", "a", "b"]) + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) tm.assert_frame_equal(result, expected) @@ -779,14 +1317,19 @@ def test_datetime_fractional_seconds(all_parsers): 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. """ - result = parser.read_csv(StringIO(data), header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}) - expected = DataFrame([[datetime(2001, 1, 5, 10, 0, 0, - microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, - microsecond=500000), 1.0, 11.0]], - columns=["ymdHMS", "a", "b"]) + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=conv.parse_all_fields, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) + expected = DataFrame( + [ + [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], + [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], + ], + columns=["ymdHMS", "a", "b"], + ) tm.assert_frame_equal(result, expected) @@ -794,14 +1337,16 @@ def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
- result = parser.read_csv(StringIO(data), header=0, - parse_dates={"ym": [0, 1]}, - date_parser=lambda y, m: date(year=int(y), - month=int(m), - day=1)) - expected = DataFrame([[date(2001, 1, 1), 10, 10.], - [date(2001, 2, 1), 1, 11.]], - columns=["ym", "day", "a"]) + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=lambda y, m: date(year=int(y), month=int(m), day=1), + ) + expected = DataFrame( + [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], + columns=["ym", "day", "a"], + ) tm.assert_frame_equal(result, expected) @@ -816,19 +1361,25 @@ def test_date_parser_resolution_if_not_ns(all_parsers): """ def date_parser(dt, time): - return np_array_datetime64_compat(dt + "T" + time + "Z", - dtype="datetime64[s]") - - result = parser.read_csv(StringIO(data), date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"]) - - datetimes = np_array_datetime64_compat(["2013-11-03T19:00:00Z"] * 3, - dtype="datetime64[s]") - expected = DataFrame(data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_tuples( - [(datetimes[0], 126), (datetimes[1], 23), - (datetimes[2], 13)], names=["datetime", "prn"])) + return np_array_datetime64_compat(dt + "T" + time + "Z", dtype="datetime64[s]") + + result = parser.read_csv( + StringIO(data), + date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"], + ) + + datetimes = np_array_datetime64_compat( + ["2013-11-03T19:00:00Z"] * 3, dtype="datetime64[s]" + ) + expected = DataFrame( + data={"rxstatus": ["00E80000"] * 3}, + index=MultiIndex.from_tuples( + [(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)], + names=["datetime", "prn"], + ), + ) tm.assert_frame_equal(result, expected) @@ -838,22 +1389,24 @@ def test_parse_date_column_with_empty_string(all_parsers): data = "case,opdate\n7,10/18/2006\n7,10/18/2008\n621, " result = parser.read_csv(StringIO(data), parse_dates=["opdate"]) - expected_data = [[7, "10/18/2006"], - [7, "10/18/2008"], - [621, " "]] + expected_data = [[7, "10/18/2006"], [7, "10/18/2008"], [621, " "]] expected = DataFrame(expected_data, columns=["case", "opdate"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,expected", [ - ("a\n135217135789158401\n1352171357E+5", - DataFrame({"a": [135217135789158401, - 135217135700000]}, dtype="float64")), - ("a\n99999999999\n123456789012345\n1234E+0", - DataFrame({"a": [99999999999, - 123456789012345, - 1234]}, dtype="float64")) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ( + "a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, 135217135700000]}, dtype="float64"), + ), + ( + "a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, 123456789012345, 1234]}, dtype="float64"), + ), + ], +) @pytest.mark.parametrize("parse_dates", [True, False]) def test_parse_date_float(all_parsers, data, expected, parse_dates): # see gh-2697 @@ -877,47 +1430,48 @@ def test_parse_timezone(all_parsers): 2018-01-04 09:05:00+09:00,23400""" result = parser.read_csv(StringIO(data), parse_dates=["dt"]) - dti = pd.date_range(start="2018-01-04 09:01:00", - end="2018-01-04 09:05:00", freq="1min", - tz=pytz.FixedOffset(540)) + dti = pd.date_range( + start="2018-01-04 09:01:00", + end="2018-01-04 09:05:00", + freq="1min", + tz=pytz.FixedOffset(540), + ) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) 
-@pytest.mark.parametrize("date_string", [ - "32/32/2019", - "02/30/2019", - "13/13/2019", - "13/2019", - "a3/11/2018", - "10/11/2o17" -]) +@pytest.mark.parametrize( + "date_string", + ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], +) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv(StringIO(date_string), - header=None, parse_dates=[0]) + result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("date_string,dayfirst,expected", [ - # %d/%m/%Y; month > 12 thus replacement - ("13/02/2019", False, datetime(2019, 2, 13)), - ("13/02/2019", True, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement - ("02/13/2019", False, datetime(2019, 2, 13)), - ("02/13/2019", True, datetime(2019, 2, 13)), - # %d/%m/%Y; dayfirst==True thus replacement - ("04/02/2019", True, datetime(2019, 2, 4)) -]) -def test_parse_delimited_date_swap(all_parsers, date_string, - dayfirst, expected): +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 thus replacement + ("13/02/2019", False, datetime(2019, 2, 13)), + ("13/02/2019", True, datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 thus there will be no replacement + ("02/13/2019", False, datetime(2019, 2, 13)), + ("02/13/2019", True, datetime(2019, 2, 13)), + # %d/%m/%Y; dayfirst==True thus replacement + ("04/02/2019", True, datetime(2019, 2, 4)), + ], +) +def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - result = parser.read_csv(StringIO(date_string), header=None, - dayfirst=dayfirst, parse_dates=[0]) + result = parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) tm.assert_frame_equal(result, expected) @@ -935,31 +1489,30 @@ def _helper_hypothesis_delimited_date(call, date_string, **kwargs): @settings(deadline=None) @pytest.mark.parametrize("delimiter", list(" -./")) @pytest.mark.parametrize("dayfirst", [True, False]) -@pytest.mark.parametrize("date_format", [ - "%d %m %Y", - "%m %d %Y", - "%m %Y", - "%Y %m %d", - "%y %m %d", - "%Y%m%d", - "%y%m%d", -]) -def test_hypothesis_delimited_date(date_format, dayfirst, - delimiter, test_datetime): +@pytest.mark.parametrize( + "date_format", + ["%d %m %Y", "%m %d %Y", "%m %Y", "%Y %m %d", "%y %m %d", "%Y%m%d", "%y%m%d"], +) +def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_datetime): if date_format == "%m %Y" and delimiter == ".": - pytest.skip("parse_datetime_string cannot reliably tell whether \ - e.g. %m.%Y is a float or a date, thus we skip it") + pytest.skip( + "parse_datetime_string cannot reliably tell whether \ + e.g. 
%m.%Y is a float or a date, thus we skip it" + ) result, expected = None, None except_in_dateutil, except_out_dateutil = None, None - date_string = test_datetime.strftime(date_format.replace(' ', delimiter)) + date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) except_out_dateutil, result = _helper_hypothesis_delimited_date( - parse_datetime_string, date_string, - dayfirst=dayfirst) + parse_datetime_string, date_string, dayfirst=dayfirst + ) except_in_dateutil, expected = _helper_hypothesis_delimited_date( - du_parse, date_string, + du_parse, + date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=False) + dayfirst=dayfirst, + yearfirst=False, + ) assert except_out_dateutil == except_in_dateutil assert result == expected diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 51c44c08cb80c..5b381e43e3e19 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -49,10 +49,7 @@ def test_invalid_skipfooter_negative(python_parser_only): parser.read_csv(StringIO(data), skipfooter=-1) -@pytest.mark.parametrize("kwargs", [ - dict(sep=None), - dict(delimiter="|") -]) +@pytest.mark.parametrize("kwargs", [dict(sep=None), dict(delimiter="|")]) def test_sniff_delimiter(python_parser_only, kwargs): data = """index|A|B|C foo|1|2|3 @@ -61,9 +58,11 @@ def test_sniff_delimiter(python_parser_only, kwargs): """ parser = python_parser_only result = parser.read_csv(StringIO(data), index_col=0, **kwargs) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index")) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) tm.assert_frame_equal(result, expected) @@ -80,25 +79,26 @@ def test_sniff_delimiter_encoding(python_parser_only, encoding): if encoding is not None: from io import TextIOWrapper + data = data.encode(encoding) data = BytesIO(data) data = TextIOWrapper(data, encoding=encoding) else: data = StringIO(data) - result = parser.read_csv(data, index_col=0, sep=None, - skiprows=2, encoding=encoding) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"], - index=Index(["foo", "bar", "baz"], name="index")) + result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding) + expected = DataFrame( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index"), + ) tm.assert_frame_equal(result, expected) def test_single_line(python_parser_only): # see gh-6607: sniff separator parser = python_parser_only - result = parser.read_csv(StringIO("1,2"), names=["a", "b"], - header=None, sep=None) + result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None) expected = DataFrame({"a": [1], "b": [2]}) tm.assert_frame_equal(result, expected) @@ -117,15 +117,13 @@ def test_skipfooter(python_parser_only, kwargs): parser = python_parser_only result = parser.read_csv(StringIO(data), **kwargs) - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=["A", "B", "C"]) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("compression,klass", [ - ("gzip", "GzipFile"), - ("bz2", "BZ2File"), -]) +@pytest.mark.parametrize( + "compression,klass", [("gzip", "GzipFile"), ("bz2", 
"BZ2File")] +) def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): # see gh-6607 parser = python_parser_only @@ -144,8 +142,7 @@ def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): tmp.write(data) tmp.close() - result = parser.read_csv(path, sep="::", - compression=compression) + result = parser.read_csv(path, sep="::", compression=compression) tm.assert_frame_equal(result, expected) @@ -158,15 +155,18 @@ def test_read_csv_buglet_4x_multi_index(python_parser_only): x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" parser = python_parser_only - expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], - [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], - [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]], - columns=["A", "B", "C", "D", "E"], - index=MultiIndex.from_tuples([ - ("a", "b", 10.0032, 5), - ("a", "q", 20, 4), - ("x", "q", 30, 3), - ], names=["one", "two", "three", "four"])) + expected = DataFrame( + [ + [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838], + ], + columns=["A", "B", "C", "D", "E"], + index=MultiIndex.from_tuples( + [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)], + names=["one", "two", "three", "four"], + ), + ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -178,7 +178,9 @@ def test_read_csv_buglet_4x_multi_index2(python_parser_only): expected = DataFrame.from_records( [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], - columns=list("abcABC"), index=list("abc")) + columns=list("abcABC"), + index=list("abc"), + ) result = parser.read_csv(StringIO(data), sep=r"\s+") tm.assert_frame_equal(result, expected) @@ -198,15 +200,16 @@ def test_skipfooter_with_decimal(python_parser_only, add_footer): else: kwargs = dict() - result = parser.read_csv(StringIO(data), names=["a"], - decimal="#", **kwargs) + result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5", - "%!c!d", "@@#4:2", "_!pd#_"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le", - "utf-32", "cp037"]) +@pytest.mark.parametrize( + "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"] +) +@pytest.mark.parametrize( + "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"] +) def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): # see gh-3404 expected = DataFrame({"a": [1], "b": [2]}) @@ -215,8 +218,9 @@ def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): data = "1" + sep + "2" encoded_data = data.encode(encoding) - result = parser.read_csv(BytesIO(encoded_data), sep=sep, - names=["a", "b"], encoding=encoding) + result = parser.read_csv( + BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding + ) tm.assert_frame_equal(result, expected) @@ -251,17 +255,16 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, - sep=None, warn_bad_lines=True, - error_bad_lines=False) + result = parser.read_csv( + StringIO(data), header=0, sep=None, warn_bad_lines=True, error_bad_lines=False + ) tm.assert_frame_equal(result, expected) captured = capsys.readouterr() assert "Skipping line 3" in captured.err -@pytest.mark.parametrize("data", [ - 'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) +@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) def test_skipfooter_bad_row(python_parser_only, data, skipfooter): # see gh-13879 and gh-15910 @@ -292,5 +295,4 @@ def test_malformed_skipfooter(python_parser_only): """ msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), header=1, - comment="#", skipfooter=1) + parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 71d2307773791..94858226d0b44 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -14,12 +14,17 @@ import pandas.util.testing as tm -@pytest.mark.parametrize("kwargs,msg", [ - (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'), - (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), - "quotechar must be set if quoting enabled"), - (dict(quotechar=2), '"quotechar" must be string, not int') -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(quotechar="foo"), '"quotechar" must be a(n)? 1-character string'), + ( + dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), + "quotechar must be set if quoting enabled", + ), + (dict(quotechar=2), '"quotechar" must be string, not int'), + ], +) def test_bad_quote_char(all_parsers, kwargs, msg): data = "1,2,3" parser = all_parsers @@ -28,10 +33,13 @@ def test_bad_quote_char(all_parsers, kwargs, msg): parser.read_csv(StringIO(data), **kwargs) -@pytest.mark.parametrize("quoting,msg", [ - ("foo", '"quoting" must be an integer'), - (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] -]) +@pytest.mark.parametrize( + "quoting,msg", + [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] + ], +) def test_bad_quoting(all_parsers, quoting, msg): data = "1,2,3" parser = all_parsers @@ -43,8 +51,7 @@ def test_bad_quoting(all_parsers, quoting, msg): def test_quote_char_basic(all_parsers): parser = all_parsers data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, "cat"]], - columns=["a", "b", "c"]) + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) result = parser.read_csv(StringIO(data), quotechar='"') tm.assert_frame_equal(result, expected) @@ -53,8 +60,7 @@ def test_quote_char_basic(all_parsers): @pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) def test_quote_char_various(all_parsers, quote_char): parser = all_parsers - expected = DataFrame([[1, 2, "cat"]], - columns=["a", "b", "c"]) + expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]) data = 'a,b,c\n1,2,"cat"' new_data = data.replace('"', quote_char) @@ -82,23 +88,22 @@ def test_null_quote_char(all_parsers, quoting, quote_char): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,exp_data", [ - (dict(), [[1, 2, "foo"]]), # Test default. - - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. 
- (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), - - # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. - (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), - - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone. - (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), - - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]) -]) +@pytest.mark.parametrize( + "kwargs,exp_data", + [ + (dict(), [[1, 2, "foo"]]), # Test default. + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. + (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]), + ], +) def test_quoting_various(all_parsers, kwargs, exp_data): data = '1,2,"foo"' parser = all_parsers @@ -109,21 +114,19 @@ def test_quoting_various(all_parsers, kwargs, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("doublequote,exp_data", [ - (True, [[3, '4 " 5']]), - (False, [[3, '4 " 5"']]), -]) +@pytest.mark.parametrize( + "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])] +) def test_double_quote(all_parsers, doublequote, exp_data): parser = all_parsers data = 'a,b\n3,"4 "" 5"' - result = parser.read_csv(StringIO(data), quotechar='"', - doublequote=doublequote) + result = parser.read_csv(StringIO(data), quotechar='"', doublequote=doublequote) expected = DataFrame(exp_data, columns=["a", "b"]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("quotechar", ['"', '\u0001']) +@pytest.mark.parametrize("quotechar", ['"', "\u0001"]) def test_quotechar_unicode(all_parsers, quotechar): # see gh-14477 data = "a\n1" @@ -138,7 +141,7 @@ def test_quotechar_unicode(all_parsers, quotechar): def test_unbalanced_quoting(all_parsers, balanced): # see gh-22789. parser = all_parsers - data = "a,b,c\n1,2,\"3" + data = 'a,b,c\n1,2,"3' if balanced: # Re-balance the quoting and read in without errors. 
@@ -146,8 +149,11 @@ def test_unbalanced_quoting(all_parsers, balanced): result = parser.read_csv(StringIO(data + '"')) tm.assert_frame_equal(result, expected) else: - msg = ("EOF inside string starting at row 1" if parser.engine == "c" - else "unexpected end of data") + msg = ( + "EOF inside string starting at row 1" + if parser.engine == "c" + else "unexpected end of data" + ) with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data)) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 10859bc41d508..72885315e06bc 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -27,12 +27,16 @@ def test_basic(): 201162 502.953953 173.237159 12468.3 """ result = read_fwf(StringIO(data)) - expected = DataFrame([[201158, 360.242940, 149.910199, 11950.7], - [201159, 444.953632, 166.985655, 11788.4], - [201160, 364.136849, 183.628767, 11806.2], - [201161, 413.836124, 184.375703, 11916.8], - [201162, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D"]) + expected = DataFrame( + [ + [201158, 360.242940, 149.910199, 11950.7], + [201159, 444.953632, 166.985655, 11788.4], + [201160, 364.136849, 183.628767, 11806.2], + [201161, 413.836124, 184.375703, 11916.8], + [201162, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D"], + ) tm.assert_frame_equal(result, expected) @@ -48,12 +52,16 @@ def test_colspecs(): colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs) - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -68,12 +76,16 @@ def test_widths(): """ result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7]) - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + [2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -95,12 +107,16 @@ def test_non_space_filler(): colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~") - expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7], - [2011, 59, 444.953632, 166.985655, 11788.4], - [2011, 60, 364.136849, 183.628767, 11806.2], - [2011, 61, 413.836124, 184.375703, 11916.8], - [2011, 62, 502.953953, 173.237159, 12468.3]], - columns=["A", "B", "C", "D", "E"]) + expected = DataFrame( + [ + [2011, 58, 360.242940, 149.910199, 11950.7], + [2011, 59, 444.953632, 166.985655, 11788.4], + 
[2011, 60, 364.136849, 183.628767, 11806.2], + [2011, 61, 413.836124, 184.375703, 11916.8], + [2011, 62, 502.953953, 173.237159, 12468.3], + ], + columns=["A", "B", "C", "D", "E"], + ) tm.assert_frame_equal(result, expected) @@ -157,8 +173,9 @@ def test_read_csv_compat(): def test_bytes_io_input(): - result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), - widths=[2, 2], encoding="utf8") + result = read_fwf( + BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8" + ) expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) tm.assert_frame_equal(result, expected) @@ -195,12 +212,15 @@ def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(): read_fwf(StringIO(data), [("a", 1)]) -@pytest.mark.parametrize("colspecs,exp_data", [ - ([(0, 3), (3, None)], [[123, 456], [456, 789]]), - ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), - ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), - ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), -]) +@pytest.mark.parametrize( + "colspecs,exp_data", + [ + ([(0, 3), (3, None)], [[123, 456], [456, 789]]), + ([(None, 3), (3, 6)], [[123, 456], [456, 789]]), + ([(0, None), (3, None)], [[123456, 456], [456789, 789]]), + ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]), + ], +) def test_fwf_colspecs_none(colspecs, exp_data): # see gh-7079 data = """\ @@ -213,13 +233,15 @@ def test_fwf_colspecs_none(colspecs, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("infer_nrows,exp_data", [ - # infer_nrows --> colspec == [(2, 3), (5, 6)] - (1, [[1, 2], [3, 8]]), - - # infer_nrows > number of rows - (10, [[1, 2], [123, 98]]), -]) +@pytest.mark.parametrize( + "infer_nrows,exp_data", + [ + # infer_nrows --> colspec == [(2, 3), (5, 6)] + (1, [[1, 2], [3, 8]]), + # infer_nrows > number of rows + (10, [[1, 2], [123, 98]]), + ], +) def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data): # see gh-15138 data = """\ @@ -247,39 +269,59 @@ def test_fwf_regression(): 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - result = read_fwf(StringIO(data), index_col=0, header=None, names=names, - widths=widths, parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S")) - expected = DataFrame([ - [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], - [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], - [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], - [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], - [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], - ], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00", - "2009-06-13 20:40:00", "2009-06-13 20:50:00", - "2009-06-13 21:00:00"]), - columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"]) + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), + ) + expected = DataFrame( + [ + [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], + [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869], + [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657], + [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379], + [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039], + ], + index=DatetimeIndex( + [ + "2009-06-13 20:20:00", + "2009-06-13 20:30:00", + "2009-06-13 20:40:00", + "2009-06-13 20:50:00", + "2009-06-13 21:00:00", + ] + ), + columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], + ) tm.assert_frame_equal(result, expected) def 
test_fwf_for_uint8(): data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa - df = read_fwf(StringIO(data), - colspecs=[(0, 17), (25, 26), (33, 37), - (49, 51), (58, 62), (63, 1000)], - names=["time", "pri", "pgn", "dst", "src", "data"], - converters={ - "pgn": lambda x: int(x, 16), - "src": lambda x: int(x, 16), - "dst": lambda x: int(x, 16), - "data": lambda x: len(x.split(" "))}) - - expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8], - [1421302964.226776, 6, 61442, None, 71, 8]], - columns=["time", "pri", "pgn", - "dst", "src", "data"]) + df = read_fwf( + StringIO(data), + colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)], + names=["time", "pri", "pgn", "dst", "src", "data"], + converters={ + "pgn": lambda x: int(x, 16), + "src": lambda x: int(x, 16), + "dst": lambda x: int(x, 16), + "data": lambda x: len(x.split(" ")), + }, + ) + + expected = DataFrame( + [ + [1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8], + ], + columns=["time", "pri", "pgn", "dst", "src", "data"], + ) expected["dst"] = expected["dst"].astype(object) tm.assert_frame_equal(df, expected) @@ -293,10 +335,9 @@ def test_fwf_comment(comment): data = data.replace("#", comment) colspecs = [(0, 3), (4, 9), (9, 25)] - expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]]) + expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]]) - result = read_fwf(StringIO(data), colspecs=colspecs, - header=None, comment=comment) + result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment) tm.assert_almost_equal(result, expected) @@ -309,10 +350,11 @@ def test_fwf_thousands(thousands): data = data.replace(",", thousands) colspecs = [(0, 3), (3, 11), (12, 16)] - expected = DataFrame([[1, 2334., 5], [10, 13, 10.]]) + expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]]) - result = read_fwf(StringIO(data), header=None, - colspecs=colspecs, thousands=thousands) + result = read_fwf( + StringIO(data), header=None, colspecs=colspecs, thousands=thousands + ) tm.assert_almost_equal(result, expected) @@ -374,7 +416,9 @@ def test_full_file_with_spaces(): 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -391,7 +435,9 @@ def test_full_file_with_spaces_and_missing(): 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -408,7 +454,9 @@ def test_messed_up_data(): 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -""".strip("\r\n") +""".strip( + "\r\n" + ) colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) @@ -424,7 +472,9 @@ def test_multiple_delimiters(): ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray -""".strip("\r\n") +""".strip( + "\r\n" + ) delimiter = " +~.\\" colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter) @@ -438,19 +488,20 @@ def test_variable_width_unicode(): שלום שלום ום שלל של 
ום -""".strip("\r\n") +""".strip( + "\r\n" + ) encoding = "utf8" kwargs = dict(header=None, encoding=encoding) - expected = read_fwf(BytesIO(data.encode(encoding)), - colspecs=[(0, 4), (5, 9)], **kwargs) + expected = read_fwf( + BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs + ) result = read_fwf(BytesIO(data.encode(encoding)), **kwargs) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("dtype", [ - dict(), {"a": "float64", "b": str, "c": "int32"} -]) +@pytest.mark.parametrize("dtype", [dict(), {"a": "float64", "b": str, "c": "int32"}]) def test_dtype(dtype): data = """ a b c 1 2 3.2 @@ -459,9 +510,9 @@ def test_dtype(dtype): colspecs = [(0, 5), (5, 10), (10, None)] result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype) - expected = pd.DataFrame({ - "a": [1, 3], "b": [2, 4], - "c": [3.2, 5.2]}, columns=["a", "b", "c"]) + expected = pd.DataFrame( + {"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"] + ) for col, dt in dtype.items(): expected[col] = expected[col].astype(dt) @@ -479,8 +530,7 @@ def test_skiprows_inference(): 101.6 956.1 """.strip() skiprows = 2 - expected = read_csv(StringIO(data), skiprows=skiprows, - delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -495,8 +545,7 @@ def test_skiprows_by_index_inference(): 456 78 9 456 """.strip() skiprows = [0, 2] - expected = read_csv(StringIO(data), skiprows=skiprows, - delim_whitespace=True) + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -524,8 +573,9 @@ def test_whitespace_preservation(): fwf_data = """ a bbb ccdd """ - result = read_fwf(StringIO(fwf_data), widths=[3, 3], - header=header, skiprows=[0], delimiter="\n\t") + result = read_fwf( + StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + ) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) @@ -539,8 +589,7 @@ def test_default_delimiter(): fwf_data = """ a \tbbb cc\tdd """ - result = read_fwf(StringIO(fwf_data), widths=[3, 3], - header=header, skiprows=[0]) + result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0]) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index e63aac9d6ff6f..d4f219d13ac53 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -29,24 +29,28 @@ def test_skip_rows_bug(all_parsers, skiprows): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - result = parser.read_csv(StringIO(text), skiprows=skiprows, header=None, - index_col=0, parse_dates=True) - index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)], name=0) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], index=index) + result = parser.read_csv( + StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) tm.assert_frame_equal(result, expected) def test_deep_skip_rows(all_parsers): # see gh-4382 parser = all_parsers - data = 
"a,b,c\n" + "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) - for i in range(10)]) - condensed_data = "a,b,c\n" + "\n".join([ - ",".join([str(i), str(i + 1), str(i + 2)]) - for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)] + ) + condensed_data = "a,b,c\n" + "\n".join( + [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]] + ) result = parser.read_csv(StringIO(data), skiprows=[6, 8]) condensed_result = parser.read_csv(StringIO(condensed_data)) @@ -66,38 +70,52 @@ def test_skip_rows_blank(all_parsers): 1/2/2000,4,5,6 1/3/2000,7,8,9 """ - data = parser.read_csv(StringIO(text), skiprows=6, header=None, - index_col=0, parse_dates=True) - index = Index([datetime(2000, 1, 1), datetime(2000, 1, 2), - datetime(2000, 1, 3)], name=0) - - expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), - columns=[1, 2, 3], - index=index) + data = parser.read_csv( + StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True + ) + index = Index( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + ) + + expected = DataFrame( + np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index + ) tm.assert_frame_equal(data, expected) -@pytest.mark.parametrize("data,kwargs,expected", [ - ("""id,text,num_lines +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + ( + """id,text,num_lines 1,"line 11 line 12",2 2,"line 21 line 22",2 3,"line 31",1""", - dict(skiprows=[1]), - DataFrame([[2, "line 21\nline 22", 2], - [3, "line 31", 1]], columns=["id", "text", "num_lines"])), - ("a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", - dict(quotechar="~", skiprows=[2]), - DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"])), - (("Text,url\n~example\n " - "sentence\n one~,url1\n~" - "example\n sentence\n two~,url2\n~" - "example\n sentence\n three~,url3"), - dict(quotechar="~", skiprows=[1, 3]), - DataFrame([['example\n sentence\n two', 'url2']], - columns=["Text", "url"])) -]) + dict(skiprows=[1]), + DataFrame( + [[2, "line 21\nline 22", 2], [3, "line 31", 1]], + columns=["id", "text", "num_lines"], + ), + ), + ( + "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~", + dict(quotechar="~", skiprows=[2]), + DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]), + ), + ( + ( + "Text,url\n~example\n " + "sentence\n one~,url1\n~" + "example\n sentence\n two~,url2\n~" + "example\n sentence\n three~,url3" + ), + dict(quotechar="~", skiprows=[1, 3]), + DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]), + ), + ], +) def test_skip_row_with_newline(all_parsers, data, kwargs, expected): # see gh-12775 and gh-10911 parser = all_parsers @@ -113,35 +131,39 @@ def test_skip_row_with_quote(all_parsers): 2,"line '21' line 22",2 3,"line '31' line 32",1""" - exp_data = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(exp_data, columns=[ - "id", "text", "num_lines"]) + exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]] + expected = DataFrame(exp_data, columns=["id", "text", "num_lines"]) result = parser.read_csv(StringIO(data), skiprows=[1]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,exp_data", [ - ("""id,text,num_lines +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + """id,text,num_lines 1,"line \n'11' line 12",2 2,"line \n'21' line 22",2 3,"line \n'31' line 32",1""", - [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]]), - 
("""id,text,num_lines + [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], + ), + ( + """id,text,num_lines 1,"line '11\n' line 12",2 2,"line '21\n' line 22",2 3,"line '31\n' line 32",1""", - [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]]), - ("""id,text,num_lines + [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], + ), + ( + """id,text,num_lines 1,"line '11\n' \r\tline 12",2 2,"line '21\n' \r\tline 22",2 3,"line '31\n' \r\tline 32",1""", - [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]]), -]) + [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], + ), + ], +) def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): # see gh-12775 and gh-10911 parser = all_parsers @@ -151,54 +173,64 @@ def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("line_terminator", [ - "\n", # "LF" - "\r\n", # "CRLF" - "\r" # "CR" -]) +@pytest.mark.parametrize( + "line_terminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR" +) def test_skiprows_lineterminator(all_parsers, line_terminator): # see gh-9079 parser = all_parsers - data = "\n".join(["SMOSMANIA ThetaProbe-ML2X ", - "2007/01/01 01:00 0.2140 U M ", - "2007/01/01 02:00 0.2141 M O ", - "2007/01/01 04:00 0.2142 D M "]) - expected = DataFrame([["2007/01/01", "01:00", 0.2140, "U", "M"], - ["2007/01/01", "02:00", 0.2141, "M", "O"], - ["2007/01/01", "04:00", 0.2142, "D", "M"]], - columns=["date", "time", "var", "flag", - "oflag"]) + data = "\n".join( + [ + "SMOSMANIA ThetaProbe-ML2X ", + "2007/01/01 01:00 0.2140 U M ", + "2007/01/01 02:00 0.2141 M O ", + "2007/01/01 04:00 0.2142 D M ", + ] + ) + expected = DataFrame( + [ + ["2007/01/01", "01:00", 0.2140, "U", "M"], + ["2007/01/01", "02:00", 0.2141, "M", "O"], + ["2007/01/01", "04:00", 0.2142, "D", "M"], + ], + columns=["date", "time", "var", "flag", "oflag"], + ) if parser.engine == "python" and line_terminator == "\r": pytest.skip("'CR' not respect with the Python parser yet") data = data.replace("\n", line_terminator) - result = parser.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"]) + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) def test_skiprows_infield_quote(all_parsers): # see gh-14459 parser = all_parsers - data = "a\"\nb\"\na\n1" + data = 'a"\nb"\na\n1' expected = DataFrame({"a": [1]}) result = parser.read_csv(StringIO(data), skiprows=2) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("kwargs,expected", [ - (dict(), DataFrame({"1": [3, 5]})), - (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})) -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(), DataFrame({"1": [3, 5]})), + (dict(header=0, names=["foo"]), DataFrame({"foo": [3, 5]})), + ], +) def test_skip_rows_callable(all_parsers, kwargs, expected): parser = all_parsers data = "a\n1\n2\n3\n4\n5" - result = parser.read_csv(StringIO(data), - skiprows=lambda x: x % 2 == 0, - **kwargs) + result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 2177d6bb93108..57096a2652b88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ 
-20,16 +20,15 @@ class TestTextReader: - @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.dirpath = datapath('io', 'parser', 'data') - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.dirpath = datapath("io", "parser", "data") + self.csv1 = os.path.join(self.dirpath, "test1.csv") + self.csv2 = os.path.join(self.dirpath, "test2.csv") + self.xls1 = os.path.join(self.dirpath, "test.xls") def test_file_handle(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: reader = TextReader(f) reader.read() @@ -38,12 +37,12 @@ def test_string_filename(self): reader.read() def test_file_handle_mmap(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: reader = TextReader(f, memory_map=True, header=None) reader.read() def test_StringIO(self): - with open(self.csv1, 'rb') as f: + with open(self.csv1, "rb") as f: text = f.read() src = BytesIO(text) reader = TextReader(src, header=None) @@ -51,28 +50,26 @@ def test_StringIO(self): def test_string_factorize(self): # should this be optional? - data = 'a\nb\na\nb\na' + data = "a\nb\na\nb\na" reader = TextReader(StringIO(data), header=None) result = reader.read() assert len(set(map(id, result[0]))) == 2 def test_skipinitialspace(self): - data = ('a, b\n' - 'a, b\n' - 'a, b\n' - 'a, b') + data = "a, b\n" "a, b\n" "a, b\n" "a, b" - reader = TextReader(StringIO(data), skipinitialspace=True, - header=None) + reader = TextReader(StringIO(data), skipinitialspace=True, header=None) result = reader.read() - tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a', 'a'], - dtype=np.object_)) - tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b', 'b'], - dtype=np.object_)) + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b", "b"], dtype=np.object_) + ) def test_parse_booleans(self): - data = 'True\nFalse\nTrue\nTrue' + data = "True\nFalse\nTrue\nTrue" reader = TextReader(StringIO(data), header=None) result = reader.read() @@ -82,14 +79,15 @@ def test_parse_booleans(self): def test_delimit_whitespace(self): data = 'a b\na\t\t "b"\n"a"\t \t b' - reader = TextReader(StringIO(data), delim_whitespace=True, - header=None) + reader = TextReader(StringIO(data), delim_whitespace=True, header=None) result = reader.read() - tm.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'], - dtype=np.object_)) - tm.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'], - dtype=np.object_)) + tm.assert_numpy_array_equal( + result[0], np.array(["a", "a", "a"], dtype=np.object_) + ) + tm.assert_numpy_array_equal( + result[1], np.array(["b", "b", "b"], dtype=np.object_) + ) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -97,34 +95,33 @@ def test_embedded_newline(self): reader = TextReader(StringIO(data), header=None) result = reader.read() - expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_) + expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_) tm.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): - data = '12345,67\n345,678' + data = "12345,67\n345,678" - reader = TextReader(StringIO(data), delimiter=':', - decimal=',', header=None) + reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None) result = reader.read() expected = np.array([12345.67, 345.678]) 
tm.assert_almost_equal(result[0], expected) def test_integer_thousands(self): - data = '123,456\n12,500' + data = "123,456\n12,500" - reader = TextReader(StringIO(data), delimiter=':', - thousands=',', header=None) + reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None) result = reader.read() expected = np.array([123456, 12500], dtype=np.int64) tm.assert_almost_equal(result[0], expected) def test_integer_thousands_alt(self): - data = '123.456\n12.500' + data = "123.456\n12.500" - reader = TextFileReader(StringIO(data), delimiter=':', - thousands='.', header=None) + reader = TextFileReader( + StringIO(data), delimiter=":", thousands=".", header=None + ) result = reader.read() expected = DataFrame([123456, 12500]) @@ -132,65 +129,61 @@ def test_integer_thousands_alt(self): def test_skip_bad_lines(self, capsys): # too many lines, see #2430 for why - data = ('a:b:c\n' - 'd:e:f\n' - 'g:h:i\n' - 'j:k:l:m\n' - 'l:m:n\n' - 'o:p:q:r') - - reader = TextReader(StringIO(data), delimiter=':', - header=None) - msg = (r"Error tokenizing data\. C error: Expected 3 fields in" - " line 4, saw 4") + data = "a:b:c\n" "d:e:f\n" "g:h:i\n" "j:k:l:m\n" "l:m:n\n" "o:p:q:r" + + reader = TextReader(StringIO(data), delimiter=":", header=None) + msg = r"Error tokenizing data\. C error: Expected 3 fields in" " line 4, saw 4" with pytest.raises(parser.ParserError, match=msg): reader.read() - reader = TextReader(StringIO(data), delimiter=':', - header=None, - error_bad_lines=False, - warn_bad_lines=False) + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=False, + ) result = reader.read() - expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object), - 1: np.array(['b', 'e', 'h', 'm'], dtype=object), - 2: np.array(['c', 'f', 'i', 'n'], dtype=object)} + expected = { + 0: np.array(["a", "d", "g", "l"], dtype=object), + 1: np.array(["b", "e", "h", "m"], dtype=object), + 2: np.array(["c", "f", "i", "n"], dtype=object), + } assert_array_dicts_equal(result, expected) - reader = TextReader(StringIO(data), delimiter=':', - header=None, - error_bad_lines=False, - warn_bad_lines=True) + reader = TextReader( + StringIO(data), + delimiter=":", + header=None, + error_bad_lines=False, + warn_bad_lines=True, + ) reader.read() captured = capsys.readouterr() - assert 'Skipping line 4' in captured.err - assert 'Skipping line 6' in captured.err + assert "Skipping line 4" in captured.err + assert "Skipping line 6" in captured.err def test_header_not_enough_lines(self): - data = ('skip this\n' - 'skip this\n' - 'a,b,c\n' - '1,2,3\n' - '4,5,6') + data = "skip this\n" "skip this\n" "a,b,c\n" "1,2,3\n" "4,5,6" - reader = TextReader(StringIO(data), delimiter=',', header=2) + reader = TextReader(StringIO(data), delimiter=",", header=2) header = reader.header - expected = [['a', 'b', 'c']] + expected = [["a", "b", "c"]] assert header == expected recs = reader.read() - expected = {0: np.array([1, 4], dtype=np.int64), - 1: np.array([2, 5], dtype=np.int64), - 2: np.array([3, 6], dtype=np.int64)} + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64), + } assert_array_dicts_equal(recs, expected) def test_escapechar(self): - data = ('\\"hello world\"\n' - '\\"hello world\"\n' - '\\"hello world\"') + data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"' - reader = TextReader(StringIO(data), delimiter=',', header=None, - escapechar='\\') + reader = TextReader(StringIO(data), 
delimiter=",", header=None, escapechar="\\") result = reader.read() expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) @@ -211,24 +204,23 @@ def test_numpy_string_dtype(self): aaaaa,5""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', header=None, - **kwds) + return TextReader(StringIO(data), delimiter=",", header=None, **kwds) - reader = _make_reader(dtype='S5,i4') + reader = _make_reader(dtype="S5,i4") result = reader.read() - assert result[0].dtype == 'S5' + assert result[0].dtype == "S5" - ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], dtype='S5') + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5") assert (result[0] == ex_values).all() - assert result[1].dtype == 'i4' + assert result[1].dtype == "i4" - reader = _make_reader(dtype='S4') + reader = _make_reader(dtype="S4") result = reader.read() - assert result[0].dtype == 'S4' - ex_values = np.array(['a', 'aa', 'aaa', 'aaaa', 'aaaa'], dtype='S4') + assert result[0].dtype == "S4" + ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4") assert (result[0] == ex_values).all() - assert result[1].dtype == 'S4' + assert result[1].dtype == "S4" def test_pass_dtype(self): data = """\ @@ -239,23 +231,22 @@ def test_pass_dtype(self): 4,d""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds) - reader = _make_reader(dtype={'one': 'u1', 1: 'S1'}) + reader = _make_reader(dtype={"one": "u1", 1: "S1"}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'S1' + assert result[0].dtype == "u1" + assert result[1].dtype == "S1" - reader = _make_reader(dtype={'one': np.uint8, 1: object}) + reader = _make_reader(dtype={"one": np.uint8, 1: object}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'O' + assert result[0].dtype == "u1" + assert result[1].dtype == "O" - reader = _make_reader(dtype={'one': np.dtype('u1'), - 1: np.dtype('O')}) + reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}) result = reader.read() - assert result[0].dtype == 'u1' - assert result[1].dtype == 'O' + assert result[0].dtype == "u1" + assert result[1].dtype == "O" def test_usecols(self): data = """\ @@ -266,7 +257,7 @@ def test_usecols(self): 10,11,12""" def _make_reader(**kwds): - return TextReader(StringIO(data), delimiter=',', **kwds) + return TextReader(StringIO(data), delimiter=",", **kwds) reader = _make_reader(usecols=(1, 2)) result = reader.read() @@ -278,72 +269,76 @@ def _make_reader(**kwds): def test_cr_delimited(self): def _test(text, **kwargs): - nice_text = text.replace('\r', '\r\n') + nice_text = text.replace("\r", "\r\n") result = TextReader(StringIO(text), **kwargs).read() expected = TextReader(StringIO(nice_text), **kwargs).read() assert_array_dicts_equal(result, expected) - data = 'a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12' - _test(data, delimiter=',') + data = "a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12" + _test(data, delimiter=",") - data = 'a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12' + data = "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12" _test(data, delim_whitespace=True) - data = 'a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12' - _test(data, delimiter=',') + data = "a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12" + _test(data, delimiter=",") - sample = ('A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r' - 'AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r' - ',BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0') - _test(sample, delimiter=',') + 
sample = ( + "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r" + "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r" + ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0" + ) + _test(sample, delimiter=",") - data = 'A B C\r 2 3\r4 5 6' + data = "A B C\r 2 3\r4 5 6" _test(data, delim_whitespace=True) - data = 'A B C\r2 3\r4 5 6' + data = "A B C\r2 3\r4 5 6" _test(data, delim_whitespace=True) def test_empty_field_eof(self): - data = 'a,b,c\n1,2,3\n4,,' + data = "a,b,c\n1,2,3\n4,," - result = TextReader(StringIO(data), delimiter=',').read() + result = TextReader(StringIO(data), delimiter=",").read() - expected = {0: np.array([1, 4], dtype=np.int64), - 1: np.array(['2', ''], dtype=object), - 2: np.array(['3', ''], dtype=object)} + expected = { + 0: np.array([1, 4], dtype=np.int64), + 1: np.array(["2", ""], dtype=object), + 2: np.array(["3", ""], dtype=object), + } assert_array_dicts_equal(result, expected) # GH5664 - a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c']) - b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], - columns=list('abcd'), - index=[1, 1]) - c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], - [8, 9, 10, 11], [13, 14, nan, nan]], - columns=list('abcd'), - index=[0, 5, 7, 12]) + a = DataFrame([["b"], [nan]], columns=["a"], index=["a", "c"]) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]) + c = DataFrame( + [[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]], + columns=list("abcd"), + index=[0, 5, 7, 12], + ) for _ in range(100): - df = read_csv(StringIO('a,b\nc\n'), skiprows=0, - names=['a'], engine='c') + df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c") assert_frame_equal(df, a) - df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2), - names=list("abcd"), engine='c') + df = read_csv( + StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c" + ) assert_frame_equal(df, b) - df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'), - names=list('abcd'), engine='c') + df = read_csv( + StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"), + names=list("abcd"), + engine="c", + ) assert_frame_equal(df, c) def test_empty_csv_input(self): # GH14867 - df = read_csv(StringIO(), chunksize=20, header=None, - names=['a', 'b', 'c']) + df = read_csv(StringIO(), chunksize=20, header=None, names=["a", "b", "c"]) assert isinstance(df, TextFileReader) def assert_array_dicts_equal(left, right): for k, v in left.items(): - tm.assert_numpy_array_equal(np.asarray(v), - np.asarray(right[k])) + tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k])) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index a8748c88e0e55..f135fac65f56a 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -24,40 +24,37 @@ def python_engine(request): class TestUnsupportedFeatures: - def test_mangle_dupe_cols_false(self): # see gh-12935 - data = 'a b c\n1 2 3' - msg = 'is not supported' + data = "a b c\n1 2 3" + msg = "is not supported" - for engine in ('c', 'python'): + for engine in ("c", "python"): with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine=engine, - mangle_dupe_cols=False) + read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) def test_c_engine(self): # see gh-6607 - data = 'a b c\n1 2 3' - msg = 'does not support' + data = "a b c\n1 2 3" + msg = "does not support" # specify C engine with unsupported options (raise) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', - 
sep=None, delim_whitespace=False) + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', sep=r'\s') + read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', sep='\t', quotechar=chr(128)) + read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128)) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine='c', skipfooter=1) + read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep=r'\s') + read_csv(StringIO(data), sep=r"\s") with tm.assert_produces_warning(parsers.ParserWarning): - read_csv(StringIO(data), sep='\t', quotechar=chr(128)) + read_csv(StringIO(data), sep="\t", quotechar=chr(128)) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), skipfooter=1) @@ -66,12 +63,12 @@ def test_c_engine(self): a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - msg = 'Error tokenizing data' + msg = "Error tokenizing data" with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), sep='\\s+') + read_csv(StringIO(text), sep="\\s+") with pytest.raises(ParserError, match=msg): - read_csv(StringIO(text), engine='c', sep='\\s+') + read_csv(StringIO(text), engine="c", sep="\\s+") msg = "Only length-1 thousands markers supported" data = """A|B|C @@ -79,14 +76,14 @@ def test_c_engine(self): 10|13|10. 
""" with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands=',,') + read_csv(StringIO(data), thousands=",,") with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), thousands='') + read_csv(StringIO(data), thousands="") msg = "Only length-1 line terminators supported" - data = 'a,b,c~~1,2,3~~4,5,6' + data = "a,b,c~~1,2,3~~4,5,6" with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), lineterminator='~~') + read_csv(StringIO(data), lineterminator="~~") def test_python_engine(self, python_engine): from pandas.io.parsers import _python_unsupported as py_unsupported @@ -98,8 +95,10 @@ def test_python_engine(self, python_engine): 1,2,3,4,""" for default in py_unsupported: - msg = ('The %r option is not supported ' - 'with the %r engine' % (default, python_engine)) + msg = "The %r option is not supported " "with the %r engine" % ( + default, + python_engine, + ) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 92cd0e873c02f..b449e848a0b5a 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -12,19 +12,25 @@ from pandas import DataFrame, Index import pandas.util.testing as tm -_msg_validate_usecols_arg = ("'usecols' must either be list-like " - "of all strings, all unicode, all " - "integers or a callable.") -_msg_validate_usecols_names = ("Usecols do not match columns, columns " - "expected but not found: {0}") - - -@pytest.mark.parametrize("names,usecols,missing", [ - (None, [0, 3], r"\[3\]"), - (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), - (None, [3], r"\[3\]"), - (["a"], [3], r"\[3\]") -]) +_msg_validate_usecols_arg = ( + "'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable." 
+) +_msg_validate_usecols_names = ( + "Usecols do not match columns, columns " "expected but not found: {0}" +) + + +@pytest.mark.parametrize( + "names,usecols,missing", + [ + (None, [0, 3], r"\[3\]"), + (["a", "b", "c"], [0, -1, 2], r"\[-1\]"), + (None, [3], r"\[3\]"), + (["a"], [3], r"\[3\]"), + ], +) def test_usecols_out_of_bounds(all_parsers, names, usecols, missing): # See gh-25623 data = "a,b,c\n1,2,3\n4,5,6" @@ -59,8 +65,7 @@ def test_usecols(all_parsers, usecols): parser = all_parsers result = parser.read_csv(StringIO(data), usecols=usecols) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=["b", "c"]) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) @@ -73,18 +78,15 @@ def test_usecols_with_names(all_parsers): 10,11,12""" parser = all_parsers names = ["foo", "bar"] - result = parser.read_csv(StringIO(data), names=names, - usecols=[1, 2], header=0) + result = parser.read_csv(StringIO(data), names=names, usecols=[1, 2], header=0) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=names) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=names) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("names,usecols", [ - (["b", "c"], [1, 2]), - (["a", "b", "c"], ["b", "c"]) -]) +@pytest.mark.parametrize( + "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])] +) def test_usecols_relative_to_names(all_parsers, names, usecols): data = """\ 1,2,3 @@ -92,11 +94,9 @@ def test_usecols_relative_to_names(all_parsers, names, usecols): 7,8,9 10,11,12""" parser = all_parsers - result = parser.read_csv(StringIO(data), names=names, - header=None, usecols=usecols) + result = parser.read_csv(StringIO(data), names=names, header=None, usecols=usecols) - expected = DataFrame([[2, 3], [5, 6], [8, 9], - [11, 12]], columns=["b", "c"]) + expected = DataFrame([[2, 3], [5, 6], [8, 9], [11, 12]], columns=["b", "c"]) tm.assert_frame_equal(result, expected) @@ -108,11 +108,11 @@ def test_usecols_relative_to_names2(all_parsers): 7,8,9 10,11,12""" parser = all_parsers - result = parser.read_csv(StringIO(data), names=["a", "b"], - header=None, usecols=[0, 1]) + result = parser.read_csv( + StringIO(data), names=["a", "b"], header=None, usecols=[0, 1] + ) - expected = DataFrame([[1, 2], [4, 5], [7, 8], - [10, 11]], columns=["a", "b"]) + expected = DataFrame([[1, 2], [4, 5], [7, 8], [10, 11]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -123,14 +123,14 @@ def test_usecols_name_length_conflict(all_parsers): 7,8,9 10,11,12""" parser = all_parsers - msg = ("Number of passed names did not " - "match number of header fields in the file" - if parser.engine == "python" else - "Passed header names mismatches usecols") + msg = ( + "Number of passed names did not " "match number of header fields in the file" + if parser.engine == "python" + else "Passed header names mismatches usecols" + ) with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=["a", "b"], - header=None, usecols=[1]) + parser.read_csv(StringIO(data), names=["a", "b"], header=None, usecols=[1]) def test_usecols_single_string(all_parsers): @@ -144,8 +144,9 @@ def test_usecols_single_string(all_parsers): parser.read_csv(StringIO(data), usecols="foo") -@pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", - "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]) +@pytest.mark.parametrize( + "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"] +) def 
test_usecols_index_col_false(all_parsers, data): # see gh-9082 parser = all_parsers @@ -164,8 +165,7 @@ def test_usecols_index_col_conflict(all_parsers, usecols, index_col): data = "a,b,c,d\nA,a,1,one\nB,b,2,two" expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) - result = parser.read_csv(StringIO(data), usecols=usecols, - index_col=index_col) + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=index_col) tm.assert_frame_equal(result, expected) @@ -177,8 +177,9 @@ def test_usecols_index_col_conflict2(all_parsers): expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) expected = expected.set_index(["b", "c"]) - result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], - index_col=["b", "c"]) + result = parser.read_csv( + StringIO(data), usecols=["b", "c", "d"], index_col=["b", "c"] + ) tm.assert_frame_equal(result, expected) @@ -188,8 +189,7 @@ def test_usecols_implicit_index_col(all_parsers): data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" result = parser.read_csv(StringIO(data), usecols=["a", "b"]) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) @@ -199,8 +199,7 @@ def test_usecols_regex_sep(all_parsers): data = "a b c\n4 apple bat 5.7\n8 orange cow 10" result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) @@ -208,22 +207,20 @@ def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" - result = parser.read_csv(StringIO(data), delim_whitespace=True, - usecols=("a", "b")) - expected = DataFrame({"a": ["apple", "orange"], - "b": ["bat", "cow"]}, index=[4, 8]) + result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,expected", [ - # Column selection by index. - ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], - columns=["2", "0"])), - - # Column selection by name. - (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], - columns=["0", "1"])), -]) +@pytest.mark.parametrize( + "usecols,expected", + [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), + # Column selection by name. 
+ (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), + ], +) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): parser = all_parsers data = """2,0,1 @@ -245,14 +242,10 @@ def test_usecols_with_parse_dates(all_parsers, usecols): cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @@ -267,16 +260,25 @@ def test_usecols_with_parse_dates2(all_parsers): usecols = names[:] parse_dates = [0] - index = Index([Timestamp("2008-02-07 09:40"), - Timestamp("2008-02-07 09:50"), - Timestamp("2008-02-07 10:00")], - name="date") + index = Index( + [ + Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00"), + ], + name="date", + ) cols = {"values": [1032.43, 1042.54, 1051.65]} expected = DataFrame(cols, index=index) - result = parser.read_csv(StringIO(data), parse_dates=parse_dates, - index_col=0, usecols=usecols, - header=None, names=names) + result = parser.read_csv( + StringIO(data), + parse_dates=parse_dates, + index_col=0, + usecols=usecols, + header=None, + names=names, + ) tm.assert_frame_equal(result, expected) @@ -289,14 +291,21 @@ def test_usecols_with_parse_dates3(all_parsers): usecols = list("abcdefghij") parse_dates = [0] - cols = {"a": Timestamp("2016-09-21"), - "b": [1], "c": [1], "d": [2], - "e": [3], "f": [4], "g": [5], - "h": [6], "i": [7], "j": [8]} + cols = { + "a": Timestamp("2016-09-21"), + "b": [1], + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } expected = DataFrame(cols, columns=usecols) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @@ -306,21 +315,31 @@ def test_usecols_with_parse_dates4(all_parsers): parse_dates = [[0, 1]] parser = all_parsers - cols = {"a_b": "2016/09/21 1", - "c": [1], "d": [2], "e": [3], "f": [4], - "g": [5], "h": [6], "i": [7], "j": [8]} + cols = { + "a_b": "2016/09/21 1", + "c": [1], + "d": [2], + "e": [3], + "f": [4], + "g": [5], + "h": [6], + "i": [7], + "j": [8], + } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv(StringIO(data), usecols=usecols, - parse_dates=parse_dates) + result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize("names", [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. -]) +@pytest.mark.parametrize( + "names", + [ + list("abcde"), # Names span all columns in original data. + list("acd"), # Names span only the selected columns. 
+ ], +) def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): # see gh-9755 s = """0,1,20140101,0900,4 @@ -330,16 +349,13 @@ def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): cols = { "a": [0, 0], - "c_d": [ - Timestamp("2014-01-01 09:00:00"), - Timestamp("2014-01-02 10:00:00") - ] + "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], } expected = DataFrame(cols, columns=["c_d", "a"]) - result = parser.read_csv(StringIO(s), names=names, - parse_dates=parse_dates, - usecols=usecols) + result = parser.read_csv( + StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols + ) tm.assert_frame_equal(result, expected) @@ -352,12 +368,8 @@ def test_usecols_with_unicode_strings(all_parsers): parser = all_parsers exp_data = { - "AAA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "BBB": {0: 8, 1: 2, 2: 7} + "AAA": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "BBB": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -374,12 +386,8 @@ def test_usecols_with_single_byte_unicode_strings(all_parsers): parser = all_parsers exp_data = { - "A": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "B": {0: 8, 1: 2, 2: 7} + "A": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "B": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -399,10 +407,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): parser.read_csv(StringIO(data), usecols=usecols) -@pytest.mark.parametrize("usecols", [ - ["あああ", "いい"], - ["あああ", "いい"] -]) +@pytest.mark.parametrize("usecols", [["あああ", "いい"], ["あああ", "いい"]]) def test_usecols_with_multi_byte_characters(all_parsers, usecols): data = """あああ,いい,ううう,ええええ 0.056674973,8,True,a @@ -411,12 +416,8 @@ def test_usecols_with_multi_byte_characters(all_parsers, usecols): parser = all_parsers exp_data = { - "あああ": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "いい": {0: 8, 1: 2, 2: 7} + "あああ": {0: 0.056674972999999997, 1: 2.6132309819999997, 2: 3.5689350380000002}, + "いい": {0: 8, 1: 2, 2: 7}, } expected = DataFrame(exp_data) @@ -444,19 +445,26 @@ def test_np_array_usecols(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,expected", [ - (lambda x: x.upper() in ["AAA", "BBB", "DDD"], - DataFrame({ - "AaA": { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - "bBb": {0: 8, 1: 2, 2: 7}, - "ddd": {0: "a", 1: "b", 2: "a"} - })), - (lambda x: False, DataFrame()), -]) +@pytest.mark.parametrize( + "usecols,expected", + [ + ( + lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame( + { + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002, + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"}, + } + ), + ), + (lambda x: False, DataFrame()), + ], +) def test_callable_usecols(all_parsers, usecols, expected): # see gh-14154 data = """AaA,bBb,CCC,ddd @@ -481,18 +489,31 @@ def test_incomplete_first_row(all_parsers, usecols): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("data,usecols,kwargs,expected", [ - # see gh-8985 - ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2], - dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])), - - # see gh-9549 - (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" - "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"], - dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5], - "B": [2, 4, 
2, 2, 2, 6], - "C": [3, 5, 4, 3, 3, 7]})), -]) +@pytest.mark.parametrize( + "data,usecols,kwargs,expected", + [ + # see gh-8985 + ( + "19,29,39\n" * 2 + "10,20,30,40", + [0, 1, 2], + dict(header=None), + DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]), + ), + # see gh-9549 + ( + ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" "1,2,3,,,1,\n1,2,3\n5,6,7"), + ["A", "B", "C"], + dict(), + DataFrame( + { + "A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7], + } + ), + ), + ], +) def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): # see gh-8985 parser = all_parsers @@ -500,27 +521,50 @@ def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("usecols,kwargs,expected,msg", [ - (["a", "b", "c", "d"], dict(), - DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None), - (["a", "b", "c", "f"], dict(), None, - _msg_validate_usecols_names.format(r"\['f'\]")), - (["a", "b", "f"], dict(), None, - _msg_validate_usecols_names.format(r"\['f'\]")), - (["a", "b", "f", "g"], dict(), None, - _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")), - - # see gh-14671 - (None, dict(header=0, names=["A", "B", "C", "D"]), - DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], - "D": [4, 8]}), None), - (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]), - None, _msg_validate_usecols_names.format(r"\['f'\]")), - (["A", "B", "f"], dict(names=["A", "B", "C", "D"]), - None, _msg_validate_usecols_names.format(r"\['f'\]")), -]) -def test_raises_on_usecols_names_mismatch(all_parsers, usecols, - kwargs, expected, msg): +@pytest.mark.parametrize( + "usecols,kwargs,expected,msg", + [ + ( + ["a", "b", "c", "d"], + dict(), + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), + None, + ), + ( + ["a", "b", "c", "f"], + dict(), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + (["a", "b", "f"], dict(), None, _msg_validate_usecols_names.format(r"\['f'\]")), + ( + ["a", "b", "f", "g"], + dict(), + None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"), + ), + # see gh-14671 + ( + None, + dict(header=0, names=["A", "B", "C", "D"]), + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}), + None, + ), + ( + ["A", "B", "C", "f"], + dict(header=0, names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ( + ["A", "B", "f"], + dict(names=["A", "B", "C", "D"]), + None, + _msg_validate_usecols_names.format(r"\['f'\]"), + ), + ], +) +def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" kwargs.update(usecols=usecols) parser = all_parsers @@ -534,15 +578,14 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, @pytest.mark.xfail( - reason="see gh-16469: works on the C engine but not the Python engine", - strict=False) + reason="see gh-16469: works on the C engine but not the Python engine", strict=False +) @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" names = ["A", "B", "C", "D"] parser = all_parsers - result = parser.read_csv(StringIO(data), header=0, - names=names, usecols=usecols) + result = parser.read_csv(StringIO(data), header=0, names=names, usecols=usecols) expected = DataFrame({"A": [1, 5], "C": [3, 7]}) tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index 34ed066dd3748..f5f73beab6d60 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -4,7 +4,7 @@ from pandas.tests.io.pytables.test_pytables import ensure_clean_path from pandas.util.testing import assert_frame_equal -tables = pytest.importorskip('tables') +tables = pytest.importorskip("tables") @pytest.fixture @@ -12,26 +12,26 @@ def pytables_hdf5_file(): """Use PyTables to create a simple HDF5 file.""" table_schema = { - 'c0': tables.Time64Col(pos=0), - 'c1': tables.StringCol(5, pos=1), - 'c2': tables.Int64Col(pos=2), + "c0": tables.Time64Col(pos=0), + "c1": tables.StringCol(5, pos=1), + "c2": tables.Int64Col(pos=2), } t0 = 1561105000.0 testsamples = [ - {'c0': t0, 'c1': 'aaaaa', 'c2': 1}, - {'c0': t0 + 1, 'c1': 'bbbbb', 'c2': 2}, - {'c0': t0 + 2, 'c1': 'ccccc', 'c2': 10**5}, - {'c0': t0 + 3, 'c1': 'ddddd', 'c2': 4294967295}, + {"c0": t0, "c1": "aaaaa", "c2": 1}, + {"c0": t0 + 1, "c1": "bbbbb", "c2": 2}, + {"c0": t0 + 2, "c1": "ccccc", "c2": 10 ** 5}, + {"c0": t0 + 3, "c1": "ddddd", "c2": 4294967295}, ] - objname = 'pandas_test_timeseries' + objname = "pandas_test_timeseries" - with ensure_clean_path('written_with_pytables.h5') as path: + with ensure_clean_path("written_with_pytables.h5") as path: # The `ensure_clean_path` context mgr removes the temp file upon exit. - with tables.open_file(path, mode='w') as f: - t = f.create_table('/', name=objname, description=table_schema) + with tables.open_file(path, mode="w") as f: + t = f.create_table("/", name=objname, description=table_schema) for sample in testsamples: for key, value in sample.items(): t.row[key] = value diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index 00062b04d07d8..fee7e1cb2ba5f 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -17,33 +17,53 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Index, Int64Index, - MultiIndex, RangeIndex, Series, Timestamp, bdate_range, concat, date_range, - isna, timedelta_range) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + Int64Index, + MultiIndex, + RangeIndex, + Series, + Timestamp, + bdate_range, + concat, + date_range, + isna, + timedelta_range, +) import pandas.util.testing as tm -from pandas.util.testing import ( - assert_frame_equal, assert_series_equal, set_timezone) +from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( - ClosedFileError, HDFStore, PossibleDataLossError, Term, read_hdf) + ClosedFileError, + HDFStore, + PossibleDataLossError, + Term, + read_hdf, +) from pandas.io.pytables import TableIterator # noqa:E402 -tables = pytest.importorskip('tables') +tables = pytest.importorskip("tables") # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 # and pytables 3.4.4 xfail_non_writeable = pytest.mark.xfail( - LooseVersion(np.__version__) >= LooseVersion('1.16') and - LooseVersion(tables.__version__) < LooseVersion('3.5.1'), - reason=('gh-25511, gh-24839. pytables needs a ' - 'release beyong 3.4.4 to support numpy 1.16x')) + LooseVersion(np.__version__) >= LooseVersion("1.16") + and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), + reason=( + "gh-25511, gh-24839. 
pytables needs a " + "release beyong 3.4.4 to support numpy 1.16x" + ), +) -_default_compressor = 'blosc' +_default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( @@ -82,8 +102,7 @@ def create_tempfile(path): @contextmanager -def ensure_clean_store(path, mode='a', complevel=None, complib=None, - fletcher32=False): +def ensure_clean_store(path, mode="a", complevel=None, complib=None, fletcher32=False): try: @@ -91,12 +110,13 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, if not len(os.path.dirname(path)): path = create_tempfile(path) - store = HDFStore(path, mode=mode, complevel=complevel, - complib=complib, fletcher32=False) + store = HDFStore( + path, mode=mode, complevel=complevel, complib=complib, fletcher32=False + ) yield store finally: safe_close(store) - if mode == 'w' or mode == 'a': + if mode == "w" or mode == "a": safe_remove(path) @@ -135,7 +155,6 @@ def _maybe_remove(store, key): class Base: - @classmethod def setup_class(cls): @@ -149,7 +168,7 @@ def teardown_class(cls): tm.set_testing_mode() def setup_method(self, method): - self.path = 'tmp.__%s__.h5' % tm.rands(10) + self.path = "tmp.__%s__.h5" % tm.rands(10) def teardown_method(self, method): pass @@ -157,7 +176,6 @@ def teardown_method(self, method): @pytest.mark.single class TestHDFStore(Base): - def test_format_kwarg_in_constructor(self): # GH 13291 with ensure_clean_path(self.path) as path: @@ -168,7 +186,7 @@ def test_context(self): path = create_tempfile(self.path) try: with HDFStore(path) as tbl: - raise ValueError('blah') + raise ValueError("blah") except ValueError: pass finally: @@ -176,34 +194,35 @@ def test_context(self): try: with HDFStore(path) as tbl: - tbl['a'] = tm.makeDataFrame() + tbl["a"] = tm.makeDataFrame() with HDFStore(path) as tbl: assert len(tbl) == 1 - assert type(tbl['a']) == DataFrame + assert type(tbl["a"]) == DataFrame finally: safe_remove(path) def test_conv_read_write(self): path = create_tempfile(self.path) try: + def roundtrip(key, obj, **kwargs): obj.to_hdf(path, key, **kwargs) return read_hdf(path, key) o = tm.makeTimeSeries() - assert_series_equal(o, roundtrip('series', o)) + assert_series_equal(o, roundtrip("series", o)) o = tm.makeStringSeries() - assert_series_equal(o, roundtrip('string_series', o)) + assert_series_equal(o, roundtrip("string_series", o)) o = tm.makeDataFrame() - assert_frame_equal(o, roundtrip('frame', o)) + assert_frame_equal(o, roundtrip("frame", o)) # table df = DataFrame(dict(A=range(5), B=range(5))) - df.to_hdf(path, 'table', append=True) - result = read_hdf(path, 'table', where=['index>2']) + df.to_hdf(path, "table", append=True) + result = read_hdf(path, "table", where=["index>2"]) assert_frame_equal(df[df.index > 2], result) finally: @@ -212,13 +231,14 @@ def roundtrip(key, obj, **kwargs): def test_long_strings(self): # GH6166 - df = DataFrame({'a': tm.rands_array(100, size=10)}, - index=tm.rands_array(100, size=10)) + df = DataFrame( + {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['a']) + store.append("df", df, data_columns=["a"]) - result = store.select('df') + result = store.select("df") assert_frame_equal(df, result) def test_api(self): @@ -228,68 +248,68 @@ def test_api(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, 'df', append=True, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - 
assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=True, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, 'df', append=False, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path, 'df', append=True) - df.iloc[10:].to_hdf(path, 'df', append=True, format='table') - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=True) + df.iloc[10:].to_hdf(path, "df", append=True, format="table") + assert_frame_equal(read_hdf(path, "df"), df) # append to False - df.iloc[:10].to_hdf(path, 'df', append=False, format='table') - df.iloc[10:].to_hdf(path, 'df', append=True) - assert_frame_equal(read_hdf(path, 'df'), df) + df.iloc[:10].to_hdf(path, "df", append=False, format="table") + df.iloc[10:].to_hdf(path, "df", append=True) + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', append=False, format='fixed') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False, format="fixed") + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df', append=False, format='f') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False, format="f") + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df', append=False) - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df", append=False) + assert_frame_equal(read_hdf(path, "df"), df) - df.to_hdf(path, 'df') - assert_frame_equal(read_hdf(path, 'df'), df) + df.to_hdf(path, "df") + assert_frame_equal(read_hdf(path, "df"), df) with ensure_clean_store(self.path) as store: path = store._path df = tm.makeDataFrame() - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=True, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=True, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) # append to False - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) # formats - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, format='table') - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format="table") + assert_frame_equal(store.select("df"), df) - _maybe_remove(store, 'df') - store.append('df', df.iloc[:10], append=False, format='table') - store.append('df', df.iloc[10:], append=True, 
format=None) - assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + store.append("df", df.iloc[:10], append=False, format="table") + store.append("df", df.iloc[10:], append=True, format=None) + assert_frame_equal(store.select("df"), df) with ensure_clean_path(self.path) as path: # Invalid. @@ -318,53 +338,53 @@ def test_api_default_format(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - pd.set_option('io.hdf.default_format', 'fixed') - _maybe_remove(store, 'df') - store.put('df', df) - assert not store.get_storer('df').is_table + pd.set_option("io.hdf.default_format", "fixed") + _maybe_remove(store, "df") + store.put("df", df) + assert not store.get_storer("df").is_table with pytest.raises(ValueError): store.append("df2", df) - pd.set_option('io.hdf.default_format', 'table') - _maybe_remove(store, 'df') - store.put('df', df) - assert store.get_storer('df').is_table - _maybe_remove(store, 'df2') - store.append('df2', df) - assert store.get_storer('df').is_table + pd.set_option("io.hdf.default_format", "table") + _maybe_remove(store, "df") + store.put("df", df) + assert store.get_storer("df").is_table + _maybe_remove(store, "df2") + store.append("df2", df) + assert store.get_storer("df").is_table - pd.set_option('io.hdf.default_format', None) + pd.set_option("io.hdf.default_format", None) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - pd.set_option('io.hdf.default_format', 'fixed') - df.to_hdf(path, 'df') + pd.set_option("io.hdf.default_format", "fixed") + df.to_hdf(path, "df") with HDFStore(path) as store: - assert not store.get_storer('df').is_table + assert not store.get_storer("df").is_table with pytest.raises(ValueError): df.to_hdf(path, "df2", append=True) - pd.set_option('io.hdf.default_format', 'table') - df.to_hdf(path, 'df3') + pd.set_option("io.hdf.default_format", "table") + df.to_hdf(path, "df3") with HDFStore(path) as store: - assert store.get_storer('df3').is_table - df.to_hdf(path, 'df4', append=True) + assert store.get_storer("df3").is_table + df.to_hdf(path, "df4", append=True) with HDFStore(path) as store: - assert store.get_storer('df4').is_table + assert store.get_storer("df4").is_table - pd.set_option('io.hdf.default_format', None) + pd.set_option("io.hdf.default_format", None) def test_keys(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeStringSeries() - store['c'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() assert len(store) == 3 - expected = {'/a', '/b', '/c'} + expected = {"/a", "/b", "/c"} assert set(store.keys()) == expected assert set(store) == expected @@ -396,31 +416,31 @@ def test_repr(self): with ensure_clean_store(self.path) as store: repr(store) store.info() - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeStringSeries() - store['c'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeStringSeries() + store["c"] = tm.makeDataFrame() df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) - df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ['obj1']] = np.nan + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + 
df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) - store['df'] = df + store["df"] = df # make a random group in hdf space - store._handle.create_group(store._handle.root, 'bah') + store._handle.create_group(store._handle.root, "bah") assert store.filename in repr(store) assert store.filename in str(store) @@ -430,9 +450,9 @@ def test_repr(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - store.append('df', df) + store.append("df", df) - s = store.get_storer('df') + s = store.get_storer("df") repr(s) str(s) @@ -440,42 +460,42 @@ def test_repr(self): def test_contains(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeDataFrame() - store['foo/bar'] = tm.makeDataFrame() - assert 'a' in store - assert 'b' in store - assert 'c' not in store - assert 'foo/bar' in store - assert '/foo/bar' in store - assert '/foo/b' not in store - assert 'bar' not in store + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() + store["foo/bar"] = tm.makeDataFrame() + assert "a" in store + assert "b" in store + assert "c" not in store + assert "foo/bar" in store + assert "/foo/bar" in store + assert "/foo/b" not in store + assert "bar" not in store # gh-2694: tables.NaturalNameWarning with catch_warnings(record=True): - store['node())'] = tm.makeDataFrame() - assert 'node())' in store + store["node())"] = tm.makeDataFrame() + assert "node())" in store def test_versioning(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - store['b'] = tm.makeDataFrame() + store["a"] = tm.makeTimeSeries() + store["b"] = tm.makeDataFrame() df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df1') - store.append('df1', df[:10]) - store.append('df1', df[10:]) - assert store.root.a._v_attrs.pandas_version == '0.15.2' - assert store.root.b._v_attrs.pandas_version == '0.15.2' - assert store.root.df1._v_attrs.pandas_version == '0.15.2' + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + assert store.root.a._v_attrs.pandas_version == "0.15.2" + assert store.root.b._v_attrs.pandas_version == "0.15.2" + assert store.root.df1._v_attrs.pandas_version == "0.15.2" # write a file and wipe its versioning - _maybe_remove(store, 'df2') - store.append('df2', df) + _maybe_remove(store, "df2") + store.append("df2", df) # this is an error because its table_type is appendable, but no # version info - store.get_node('df2')._v_attrs.pandas_version = None + store.get_node("df2")._v_attrs.pandas_version = None with pytest.raises(Exception): store.select("df2") @@ -488,7 +508,7 @@ def check(mode): with ensure_clean_path(self.path) as path: # constructor - if mode in ['r', 'r+']: + if mode in ["r", "r+"]: with pytest.raises(IOError): HDFStore(path, mode=mode) @@ -500,7 +520,7 @@ def check(mode): with ensure_clean_path(self.path) as path: # context - if mode in ['r', 'r+']: + if mode in ["r", "r+"]: with pytest.raises(IOError): with HDFStore(path, mode=mode) as store: # noqa pass @@ -511,41 +531,41 @@ def check(mode): with ensure_clean_path(self.path) as path: # conv write - if mode in ['r', 
'r+']: + if mode in ["r", "r+"]: with pytest.raises(IOError): df.to_hdf(path, "df", mode=mode) - df.to_hdf(path, 'df', mode='w') + df.to_hdf(path, "df", mode="w") else: - df.to_hdf(path, 'df', mode=mode) + df.to_hdf(path, "df", mode=mode) # conv read - if mode in ['w']: + if mode in ["w"]: with pytest.raises(ValueError): read_hdf(path, "df", mode=mode) else: - result = read_hdf(path, 'df', mode=mode) + result = read_hdf(path, "df", mode=mode) assert_frame_equal(result, df) def check_default_mode(): # read_hdf uses default mode with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w') - result = read_hdf(path, 'df') + df.to_hdf(path, "df", mode="w") + result = read_hdf(path, "df") assert_frame_equal(result, df) - check('r') - check('r+') - check('a') - check('w') + check("r") + check("r+") + check("a") + check("w") check_default_mode() def test_reopen_handle(self): with ensure_clean_path(self.path) as path: - store = HDFStore(path, mode='a') - store['a'] = tm.makeTimeSeries() + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() # invalid mode change with pytest.raises(PossibleDataLossError): @@ -555,36 +575,36 @@ def test_reopen_handle(self): assert not store.is_open # truncation ok here - store.open('w') + store.open("w") assert store.is_open assert len(store) == 0 store.close() assert not store.is_open - store = HDFStore(path, mode='a') - store['a'] = tm.makeTimeSeries() + store = HDFStore(path, mode="a") + store["a"] = tm.makeTimeSeries() # reopen as read - store.open('r') + store.open("r") assert store.is_open assert len(store) == 1 - assert store._mode == 'r' + assert store._mode == "r" store.close() assert not store.is_open # reopen as append - store.open('a') + store.open("a") assert store.is_open assert len(store) == 1 - assert store._mode == 'a' + assert store._mode == "a" store.close() assert not store.is_open # reopen as append (again) - store.open('a') + store.open("a") assert store.is_open assert len(store) == 1 - assert store._mode == 'a' + assert store._mode == "a" store.close() assert not store.is_open @@ -595,13 +615,14 @@ def test_open_args(self): df = tm.makeDataFrame() # create an in memory store - store = HDFStore(path, mode='a', driver='H5FD_CORE', - driver_core_backing_store=0) - store['df'] = df - store.append('df2', df) + store = HDFStore( + path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0 + ) + store["df"] = df + store.append("df2", df) - tm.assert_frame_equal(store['df'], df) - tm.assert_frame_equal(store['df2'], df) + tm.assert_frame_equal(store["df"], df) + tm.assert_frame_equal(store["df2"], df) store.close() @@ -611,61 +632,70 @@ def test_open_args(self): def test_flush(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() + store["a"] = tm.makeTimeSeries() store.flush() store.flush(fsync=True) def test_get(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeSeries() - left = store.get('a') - right = store['a'] + store["a"] = tm.makeTimeSeries() + left = store.get("a") + right = store["a"] tm.assert_series_equal(left, right) - left = store.get('/a') - right = store['/a'] + left = store.get("/a") + right = store["/a"] tm.assert_series_equal(left, right) with pytest.raises(KeyError): store.get("b") - @pytest.mark.parametrize('where, expected', [ - ('/', { - '': ({'first_group', 'second_group'}, set()), - '/first_group': (set(), {'df1', 'df2'}), - '/second_group': ({'third_group'}, {'df3', 's1'}), - '/second_group/third_group': (set(), {'df4'}), - }), - 
('/second_group', { - '/second_group': ({'third_group'}, {'df3', 's1'}), - '/second_group/third_group': (set(), {'df4'}), - }) - ]) + @pytest.mark.parametrize( + "where, expected", + [ + ( + "/", + { + "": ({"first_group", "second_group"}, set()), + "/first_group": (set(), {"df1", "df2"}), + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ( + "/second_group", + { + "/second_group": ({"third_group"}, {"df3", "s1"}), + "/second_group/third_group": (set(), {"df4"}), + }, + ), + ], + ) def test_walk(self, where, expected): # GH10143 objs = { - 'df1': pd.DataFrame([1, 2, 3]), - 'df2': pd.DataFrame([4, 5, 6]), - 'df3': pd.DataFrame([6, 7, 8]), - 'df4': pd.DataFrame([9, 10, 11]), - 's1': pd.Series([10, 9, 8]), + "df1": pd.DataFrame([1, 2, 3]), + "df2": pd.DataFrame([4, 5, 6]), + "df3": pd.DataFrame([6, 7, 8]), + "df4": pd.DataFrame([9, 10, 11]), + "s1": pd.Series([10, 9, 8]), # Next 3 items aren't pandas objects and should be ignored - 'a1': np.array([[1, 2, 3], [4, 5, 6]]), - 'tb1': np.array([(1, 2, 3), (4, 5, 6)], dtype='i,i,i'), - 'tb2': np.array([(7, 8, 9), (10, 11, 12)], dtype='i,i,i') + "a1": np.array([[1, 2, 3], [4, 5, 6]]), + "tb1": np.array([(1, 2, 3), (4, 5, 6)], dtype="i,i,i"), + "tb2": np.array([(7, 8, 9), (10, 11, 12)], dtype="i,i,i"), } - with ensure_clean_store('walk_groups.hdf', mode='w') as store: - store.put('/first_group/df1', objs['df1']) - store.put('/first_group/df2', objs['df2']) - store.put('/second_group/df3', objs['df3']) - store.put('/second_group/s1', objs['s1']) - store.put('/second_group/third_group/df4', objs['df4']) + with ensure_clean_store("walk_groups.hdf", mode="w") as store: + store.put("/first_group/df1", objs["df1"]) + store.put("/first_group/df2", objs["df2"]) + store.put("/second_group/df3", objs["df3"]) + store.put("/second_group/s1", objs["s1"]) + store.put("/second_group/third_group/df4", objs["df4"]) # Create non-pandas objects - store._handle.create_array('/first_group', 'a1', objs['a1']) - store._handle.create_table('/first_group', 'tb1', obj=objs['tb1']) - store._handle.create_table('/second_group', 'tb2', obj=objs['tb2']) + store._handle.create_array("/first_group", "a1", objs["a1"]) + store._handle.create_table("/first_group", "tb1", obj=objs["tb1"]) + store._handle.create_table("/second_group", "tb2", obj=objs["tb2"]) assert len(list(store.walk(where=where))) == len(expected) for path, groups, leaves in store.walk(where=where): @@ -674,9 +704,9 @@ def test_walk(self, where, expected): assert expected_groups == set(groups) assert expected_frames == set(leaves) for leaf in leaves: - frame_path = '/'.join([path, leaf]) + frame_path = "/".join([path, leaf]) obj = store.get(frame_path) - if 'df' in leaf: + if "df" in leaf: tm.assert_frame_equal(obj, objs[leaf]) else: tm.assert_series_equal(obj, objs[leaf]) @@ -686,16 +716,16 @@ def test_getattr(self): with ensure_clean_store(self.path) as store: s = tm.makeTimeSeries() - store['a'] = s + store["a"] = s # test attribute access result = store.a tm.assert_series_equal(result, s) - result = getattr(store, 'a') + result = getattr(store, "a") tm.assert_series_equal(result, s) df = tm.makeTimeDataFrame() - store['df'] = df + store["df"] = df result = store.df tm.assert_frame_equal(result, df) @@ -705,7 +735,7 @@ def test_getattr(self): getattr(store, x) # not stores - for x in ['mode', 'path', 'handle', 'complib']: + for x in ["mode", "path", "handle", "complib"]: getattr(store, "_%s" % x) def test_put(self): @@ -714,12 +744,12 @@ def 
test_put(self): ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() - store['a'] = ts - store['b'] = df[:10] - store['foo/bar/bah'] = df[:10] - store['foo'] = df[:10] - store['/foo'] = df[:10] - store.put('c', df[:10], format='table') + store["a"] = ts + store["b"] = df[:10] + store["foo/bar/bah"] = df[:10] + store["foo"] = df[:10] + store["/foo"] = df[:10] + store.put("c", df[:10], format="table") # not OK, not a table with pytest.raises(ValueError): @@ -727,7 +757,7 @@ def test_put(self): # node does not currently exist, test _is_table_type returns False # in this case - _maybe_remove(store, 'f') + _maybe_remove(store, "f") with pytest.raises(ValueError): store.put("f", df[10:], append=True) @@ -736,43 +766,43 @@ def test_put(self): store.put("c", df[10:], append=True) # overwrite table - store.put('c', df[:10], format='table', append=False) - tm.assert_frame_equal(df[:10], store['c']) + store.put("c", df[:10], format="table", append=False) + tm.assert_frame_equal(df[:10], store["c"]) def test_put_string_index(self): with ensure_clean_store(self.path) as store: - index = Index( - ["I am a very long string index: %s" % i for i in range(20)]) + index = Index(["I am a very long string index: %s" % i for i in range(20)]) s = Series(np.arange(20), index=index) - df = DataFrame({'A': s, 'B': s}) + df = DataFrame({"A": s, "B": s}) - store['a'] = s - tm.assert_series_equal(store['a'], s) + store["a"] = s + tm.assert_series_equal(store["a"], s) - store['b'] = df - tm.assert_frame_equal(store['b'], df) + store["b"] = df + tm.assert_frame_equal(store["b"], df) # mixed length - index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + - ["I am a very long string index: %s" % i - for i in range(20)]) + index = Index( + ["abcdefghijklmnopqrstuvwxyz1234567890"] + + ["I am a very long string index: %s" % i for i in range(20)] + ) s = Series(np.arange(21), index=index) - df = DataFrame({'A': s, 'B': s}) - store['a'] = s - tm.assert_series_equal(store['a'], s) + df = DataFrame({"A": s, "B": s}) + store["a"] = s + tm.assert_series_equal(store["a"], s) - store['b'] = df - tm.assert_frame_equal(store['b'], df) + store["b"] = df + tm.assert_frame_equal(store["b"], df) def test_put_compression(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame() - store.put('c', df, format='table', complib='zlib') - tm.assert_frame_equal(store['c'], df) + store.put("c", df, format="table", complib="zlib") + tm.assert_frame_equal(store["c"], df) # can't compress if format='fixed' with pytest.raises(ValueError): @@ -786,10 +816,10 @@ def test_put_compression_blosc(self): # can't compress if format='fixed' with pytest.raises(ValueError): - store.put('b', df, format='fixed', complib='blosc') + store.put("b", df, format="fixed", complib="blosc") - store.put('c', df, format='table', complib='blosc') - tm.assert_frame_equal(store['c'], df) + store.put("c", df, format="table", complib="blosc") + tm.assert_frame_equal(store["c"], df) def test_complibs_default_settings(self): # GH15943 @@ -798,51 +828,51 @@ def test_complibs_default_settings(self): # Set complevel and check if complib is automatically set to # default value with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df', complevel=9) - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df", complevel=9) + result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, 
mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 9 - assert node.filters.complib == 'zlib' + assert node.filters.complib == "zlib" # Set complib and check to see if compression is disabled with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df', complib='zlib') - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df", complib="zlib") + result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if not setting complib or complevel results in no compression with ensure_clean_path(self.path) as tmpfile: - df.to_hdf(tmpfile, 'df') - result = pd.read_hdf(tmpfile, 'df') + df.to_hdf(tmpfile, "df") + result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None # Check if file-defaults can be overridden on a per table basis with ensure_clean_path(self.path) as tmpfile: store = pd.HDFStore(tmpfile) - store.append('dfc', df, complevel=9, complib='blosc') - store.append('df', df) + store.append("dfc", df, complevel=9, complib="blosc") + store.append("df", df) store.close() - with tables.open_file(tmpfile, mode='r') as h5file: - for node in h5file.walk_nodes(where='/df', classname='Leaf'): + with tables.open_file(tmpfile, mode="r") as h5file: + for node in h5file.walk_nodes(where="/df", classname="Leaf"): assert node.filters.complevel == 0 assert node.filters.complib is None - for node in h5file.walk_nodes(where='/dfc', classname='Leaf'): + for node in h5file.walk_nodes(where="/dfc", classname="Leaf"): assert node.filters.complevel == 9 - assert node.filters.complib == 'blosc' + assert node.filters.complib == "blosc" def test_complibs(self): # GH14478 @@ -851,8 +881,8 @@ def test_complibs(self): # Building list of all complibs and complevels tuples all_complibs = tables.filters.all_complibs # Remove lzo if its not available on this platform - if not tables.which_lib_version('lzo'): - all_complibs.remove('lzo') + if not tables.which_lib_version("lzo"): + all_complibs.remove("lzo") # Remove bzip2 if its not available on this platform if not tables.which_lib_version("bzip2"): all_complibs.remove("bzip2") @@ -862,7 +892,7 @@ def test_complibs(self): for (lib, lvl) in all_tests: with ensure_clean_path(self.path) as tmpfile: - gname = 'foo' + gname = "foo" # Write and read file to see if data is consistent df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) @@ -871,9 +901,8 @@ def test_complibs(self): # Open file and check metadata # for correct amount of compression - h5table = tables.open_file(tmpfile, mode='r') - for node in h5table.walk_nodes(where='/' + gname, - classname='Leaf'): + h5table = tables.open_file(tmpfile, mode="r") + for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"): assert node.filters.complevel == lvl if lvl == 0: assert node.filters.complib is None @@ -889,29 +918,29 @@ def test_put_integer(self): @xfail_non_writeable def test_put_mixed_type(self): 
df = tm.makeTimeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) - df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ['obj1']] = np.nan + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') + _maybe_remove(store, "df") # PerformanceWarning with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) - store.put('df', df) + store.put("df", df) - expected = store.get('df') + expected = store.get("df") tm.assert_frame_equal(expected, df) @pytest.mark.filterwarnings( @@ -926,56 +955,64 @@ def test_append(self): with catch_warnings(record=True): df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df1') - store.append('df1', df[:10]) - store.append('df1', df[10:]) - tm.assert_frame_equal(store['df1'], df) + _maybe_remove(store, "df1") + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) - _maybe_remove(store, 'df2') - store.put('df2', df[:10], format='table') - store.append('df2', df[10:]) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.put("df2", df[:10], format="table") + store.append("df2", df[10:]) + tm.assert_frame_equal(store["df2"], df) - _maybe_remove(store, 'df3') - store.append('/df3', df[:10]) - store.append('/df3', df[10:]) - tm.assert_frame_equal(store['df3'], df) + _maybe_remove(store, "df3") + store.append("/df3", df[:10]) + store.append("/df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) # this is allowed by almost always don't want to do it # tables.NaturalNameWarning - _maybe_remove(store, '/df3 foo') - store.append('/df3 foo', df[:10]) - store.append('/df3 foo', df[10:]) - tm.assert_frame_equal(store['df3 foo'], df) + _maybe_remove(store, "/df3 foo") + store.append("/df3 foo", df[:10]) + store.append("/df3 foo", df[10:]) + tm.assert_frame_equal(store["df3 foo"], df) # dtype issues - mizxed type in a single object column df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) - df['mixed_column'] = 'testing' - df.loc[2, 'mixed_column'] = np.nan - _maybe_remove(store, 'df') - store.append('df', df) - tm.assert_frame_equal(store['df'], df) + df["mixed_column"] = "testing" + df.loc[2, "mixed_column"] = np.nan + _maybe_remove(store, "df") + store.append("df", df) + tm.assert_frame_equal(store["df"], df) # uints - test storage of uints - uint_data = DataFrame({ - 'u08': Series(np.random.randint(0, high=255, size=5), - dtype=np.uint8), - 'u16': Series(np.random.randint(0, high=65535, size=5), - dtype=np.uint16), - 'u32': Series(np.random.randint(0, high=2**30, size=5), - dtype=np.uint32), - 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62], - dtype=np.uint64)}, index=np.arange(5)) - _maybe_remove(store, 'uints') - store.append('uints', uint_data) - tm.assert_frame_equal(store['uints'], uint_data) + uint_data = DataFrame( + { + "u08": Series( + 
np.random.randint(0, high=255, size=5), dtype=np.uint8 + ), + "u16": Series( + np.random.randint(0, high=65535, size=5), dtype=np.uint16 + ), + "u32": Series( + np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 + ), + "u64": Series( + [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], + dtype=np.uint64, + ), + }, + index=np.arange(5), + ) + _maybe_remove(store, "uints") + store.append("uints", uint_data) + tm.assert_frame_equal(store["uints"], uint_data) # uints - test storage of uints in indexable columns - _maybe_remove(store, 'uints') + _maybe_remove(store, "uints") # 64-bit indices not yet supported - store.append('uints', uint_data, data_columns=[ - 'u08', 'u16', 'u32']) - tm.assert_frame_equal(store['uints'], uint_data) + store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) + tm.assert_frame_equal(store["uints"], uint_data) def test_append_series(self): @@ -986,42 +1023,42 @@ def test_append_series(self): ts = tm.makeTimeSeries() ns = Series(np.arange(100)) - store.append('ss', ss) - result = store['ss'] + store.append("ss", ss) + result = store["ss"] tm.assert_series_equal(result, ss) assert result.name is None - store.append('ts', ts) - result = store['ts'] + store.append("ts", ts) + result = store["ts"] tm.assert_series_equal(result, ts) assert result.name is None - ns.name = 'foo' - store.append('ns', ns) - result = store['ns'] + ns.name = "foo" + store.append("ns", ns) + result = store["ns"] tm.assert_series_equal(result, ns) assert result.name == ns.name # select on the values expected = ns[ns > 60] - result = store.select('ns', 'foo>60') + result = store.select("ns", "foo>60") tm.assert_series_equal(result, expected) # select on the index and values expected = ns[(ns > 70) & (ns.index < 90)] - result = store.select('ns', 'foo>70 and index<90') + result = store.select("ns", "foo>70 and index<90") tm.assert_series_equal(result, expected) # multi-index - mi = DataFrame(np.random.randn(5, 1), columns=['A']) - mi['B'] = np.arange(len(mi)) - mi['C'] = 'foo' - mi.loc[3:5, 'C'] = 'bar' - mi.set_index(['C', 'B'], inplace=True) + mi = DataFrame(np.random.randn(5, 1), columns=["A"]) + mi["B"] = np.arange(len(mi)) + mi["C"] = "foo" + mi.loc[3:5, "C"] = "bar" + mi.set_index(["C", "B"], inplace=True) s = mi.stack() s.index = s.index.droplevel(2) - store.append('mi', s) - tm.assert_series_equal(store['mi'], s) + store.append("mi", s) + tm.assert_series_equal(store["mi"], s) def test_store_index_types(self): # GH5386 @@ -1030,76 +1067,82 @@ def test_store_index_types(self): with ensure_clean_store(self.path) as store: def check(format, index): - df = DataFrame(np.random.randn(10, 2), columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.index = index(len(df)) - _maybe_remove(store, 'df') - store.put('df', df, format=format) - assert_frame_equal(df, store['df']) + _maybe_remove(store, "df") + store.put("df", df, format=format) + assert_frame_equal(df, store["df"]) - for index in [tm.makeFloatIndex, tm.makeStringIndex, - tm.makeIntIndex, tm.makeDateIndex]: + for index in [ + tm.makeFloatIndex, + tm.makeStringIndex, + tm.makeIntIndex, + tm.makeDateIndex, + ]: - check('table', index) - check('fixed', index) + check("table", index) + check("fixed", index) # period index currently broken for table # seee GH7796 FIXME - check('fixed', tm.makePeriodIndex) + check("fixed", tm.makePeriodIndex) # check('table',tm.makePeriodIndex) # unicode index = tm.makeUnicodeIndex - check('table', index) - check('fixed', index) + check("table", index) + 
check("fixed", index) - @pytest.mark.skipif(not is_platform_little_endian(), - reason="reason platform is not little endian") + @pytest.mark.skipif( + not is_platform_little_endian(), reason="reason platform is not little endian" + ) def test_encoding(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(A='foo', B='bar'), index=range(5)) - df.loc[2, 'A'] = np.nan - df.loc[3, 'B'] = np.nan - _maybe_remove(store, 'df') - store.append('df', df, encoding='ascii') - tm.assert_frame_equal(store['df'], df) - - expected = df.reindex(columns=['A']) - result = store.select('df', Term('columns=A', encoding='ascii')) + df = DataFrame(dict(A="foo", B="bar"), index=range(5)) + df.loc[2, "A"] = np.nan + df.loc[3, "B"] = np.nan + _maybe_remove(store, "df") + store.append("df", df, encoding="ascii") + tm.assert_frame_equal(store["df"], df) + + expected = df.reindex(columns=["A"]) + result = store.select("df", Term("columns=A", encoding="ascii")) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('val', [ - [b'E\xc9, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'a', b'b', b'c'], - [b'EE, 17', b'', b'a', b'b', b'c'], - [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], - [b'', b'a', b'b', b'c'], - [b'\xf8\xfc', b'a', b'b', b'c'], - [b'A\xf8\xfc', b'', b'a', b'b', b'c'], - [np.nan, b'', b'b', b'c'], - [b'A\xf8\xfc', np.nan, b'', b'b', b'c'] - ]) - @pytest.mark.parametrize('dtype', ['category', object]) + @pytest.mark.parametrize( + "val", + [ + [b"E\xc9, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"a", b"b", b"c"], + [b"EE, 17", b"", b"a", b"b", b"c"], + [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"], + [b"", b"a", b"b", b"c"], + [b"\xf8\xfc", b"a", b"b", b"c"], + [b"A\xf8\xfc", b"", b"a", b"b", b"c"], + [np.nan, b"", b"b", b"c"], + [b"A\xf8\xfc", np.nan, b"", b"b", b"c"], + ], + ) + @pytest.mark.parametrize("dtype", ["category", object]) def test_latin_encoding(self, dtype, val): - enc = 'latin-1' - nan_rep = '' - key = 'data' + enc = "latin-1" + nan_rep = "" + key = "data" val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] ser = pd.Series(val, dtype=dtype) with ensure_clean_path(self.path) as store: - ser.to_hdf(store, key, format='table', encoding=enc, - nan_rep=nan_rep) + ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = ser.replace(nan_rep, np.nan) if is_categorical_dtype(s_nan): assert is_categorical_dtype(retr) - assert_series_equal(s_nan, retr, check_dtype=False, - check_categorical=False) + assert_series_equal(s_nan, retr, check_dtype=False, check_categorical=False) else: assert_series_equal(s_nan, retr) @@ -1111,139 +1154,156 @@ def test_latin_encoding(self, dtype, val): def test_append_some_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'), - 'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar', - 'D': Timestamp("20010101"), - 'E': datetime.datetime(2001, 1, 2, 0, 0)}, - index=np.arange(20)) + df = DataFrame( + { + "A": Series(np.random.randn(20)).astype("int32"), + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) # some nans - _maybe_remove(store, 'df1') - df.loc[0:15, ['A1', 'B', 'D', 'E']] = np.nan - store.append('df1', df[:10]) - store.append('df1', df[10:]) - tm.assert_frame_equal(store['df1'], df) + _maybe_remove(store, "df1") + df.loc[0:15, ["A1", 
"B", "D", "E"]] = np.nan + store.append("df1", df[:10]) + store.append("df1", df[10:]) + tm.assert_frame_equal(store["df1"], df) # first column df1 = df.copy() - df1.loc[:, 'A1'] = np.nan - _maybe_remove(store, 'df1') - store.append('df1', df1[:10]) - store.append('df1', df1[10:]) - tm.assert_frame_equal(store['df1'], df1) + df1.loc[:, "A1"] = np.nan + _maybe_remove(store, "df1") + store.append("df1", df1[:10]) + store.append("df1", df1[10:]) + tm.assert_frame_equal(store["df1"], df1) # 2nd column df2 = df.copy() - df2.loc[:, 'A2'] = np.nan - _maybe_remove(store, 'df2') - store.append('df2', df2[:10]) - store.append('df2', df2[10:]) - tm.assert_frame_equal(store['df2'], df2) + df2.loc[:, "A2"] = np.nan + _maybe_remove(store, "df2") + store.append("df2", df2[:10]) + store.append("df2", df2[10:]) + tm.assert_frame_equal(store["df2"], df2) # datetimes df3 = df.copy() - df3.loc[:, 'E'] = np.nan - _maybe_remove(store, 'df3') - store.append('df3', df3[:10]) - store.append('df3', df3[10:]) - tm.assert_frame_equal(store['df3'], df3) + df3.loc[:, "E"] = np.nan + _maybe_remove(store, "df3") + store.append("df3", df3[:10]) + store.append("df3", df3[10:]) + tm.assert_frame_equal(store["df3"], df3) def test_append_all_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20)}, - index=np.arange(20)) + df = DataFrame( + {"A1": np.random.randn(20), "A2": np.random.randn(20)}, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan # nan some entire rows (dropna=True) - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df[-4:]) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df[-4:]) # nan some entire rows (dropna=False) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # tests the option io.hdf.dropna_table - pd.set_option('io.hdf.dropna_table', False) - _maybe_remove(store, 'df3') - store.append('df3', df[:10]) - store.append('df3', df[10:]) - tm.assert_frame_equal(store['df3'], df) - - pd.set_option('io.hdf.dropna_table', True) - _maybe_remove(store, 'df4') - store.append('df4', df[:10]) - store.append('df4', df[10:]) - tm.assert_frame_equal(store['df4'], df[-4:]) + pd.set_option("io.hdf.dropna_table", False) + _maybe_remove(store, "df3") + store.append("df3", df[:10]) + store.append("df3", df[10:]) + tm.assert_frame_equal(store["df3"], df) + + pd.set_option("io.hdf.dropna_table", True) + _maybe_remove(store, "df4") + store.append("df4", df[:10]) + store.append("df4", df[10:]) + tm.assert_frame_equal(store["df4"], df[-4:]) # nan some entire rows (string are still written!) 
- df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar'}, - index=np.arange(20)) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + }, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # nan some entire rows (but since we have dates they are still # written!) - df = DataFrame({'A1': np.random.randn(20), - 'A2': np.random.randn(20), - 'B': 'foo', 'C': 'bar', - 'D': Timestamp("20010101"), - 'E': datetime.datetime(2001, 1, 2, 0, 0)}, - index=np.arange(20)) + df = DataFrame( + { + "A1": np.random.randn(20), + "A2": np.random.randn(20), + "B": "foo", + "C": "bar", + "D": Timestamp("20010101"), + "E": datetime.datetime(2001, 1, 2, 0, 0), + }, + index=np.arange(20), + ) df.loc[0:15, :] = np.nan - _maybe_remove(store, 'df') - store.append('df', df[:10], dropna=True) - store.append('df', df[10:], dropna=True) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:10], dropna=True) + store.append("df", df[10:], dropna=True) + tm.assert_frame_equal(store["df"], df) - _maybe_remove(store, 'df2') - store.append('df2', df[:10], dropna=False) - store.append('df2', df[10:], dropna=False) - tm.assert_frame_equal(store['df2'], df) + _maybe_remove(store, "df2") + store.append("df2", df[:10], dropna=False) + store.append("df2", df[10:], dropna=False) + tm.assert_frame_equal(store["df2"], df) # Test to make sure defaults are to not drop. 
# Corresponding to Issue 9382 df_with_missing = DataFrame( - {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]}) + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) with ensure_clean_path(self.path) as path: - df_with_missing.to_hdf(path, 'df_with_missing', format='table') - reloaded = read_hdf(path, 'df_with_missing') + df_with_missing.to_hdf(path, "df_with_missing", format="table") + reloaded = read_hdf(path, "df_with_missing") tm.assert_frame_equal(df_with_missing, reloaded) def test_read_missing_key_close_store(self): # GH 25766 with ensure_clean_path(self.path) as path: - df = pd.DataFrame({'a': range(2), 'b': range(2)}) - df.to_hdf(path, 'k1') + df = pd.DataFrame({"a": range(2), "b": range(2)}) + df.to_hdf(path, "k1") with pytest.raises(KeyError): - pd.read_hdf(path, 'k2') + pd.read_hdf(path, "k2") # smoke test to test that file is properly closed after # read with KeyError before another write - df.to_hdf(path, 'k2') + df.to_hdf(path, "k2") def test_append_frame_column_oriented(self): @@ -1251,25 +1311,23 @@ def test_append_frame_column_oriented(self): # column oriented df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df1') - store.append('df1', df.iloc[:, :2], axes=['columns']) - store.append('df1', df.iloc[:, 2:]) - tm.assert_frame_equal(store['df1'], df) + _maybe_remove(store, "df1") + store.append("df1", df.iloc[:, :2], axes=["columns"]) + store.append("df1", df.iloc[:, 2:]) + tm.assert_frame_equal(store["df1"], df) - result = store.select('df1', 'columns=A') - expected = df.reindex(columns=['A']) + result = store.select("df1", "columns=A") + expected = df.reindex(columns=["A"]) tm.assert_frame_equal(expected, result) # selection on the non-indexable - result = store.select( - 'df1', ('columns=A', 'index=df.index[0:4]')) - expected = df.reindex(columns=['A'], index=df.index[0:4]) + result = store.select("df1", ("columns=A", "index=df.index[0:4]")) + expected = df.reindex(columns=["A"], index=df.index[0:4]) tm.assert_frame_equal(expected, result) # this isn't supported with pytest.raises(TypeError): - store.select('df1', - 'columns=A and index>df.index[4]') + store.select("df1", "columns=A and index>df.index[4]") def test_append_with_different_block_ordering(self): @@ -1278,42 +1336,41 @@ def test_append_with_different_block_ordering(self): for i in range(10): - df = DataFrame(np.random.randn(10, 2), columns=list('AB')) - df['index'] = range(10) - df['index'] += i * 10 - df['int64'] = Series([1] * len(df), dtype='int64') - df['int16'] = Series([1] * len(df), dtype='int16') + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) + df["index"] = range(10) + df["index"] += i * 10 + df["int64"] = Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") if i % 2 == 0: - del df['int64'] - df['int64'] = Series([1] * len(df), dtype='int64') + del df["int64"] + df["int64"] = Series([1] * len(df), dtype="int64") if i % 3 == 0: - a = df.pop('A') - df['A'] = a + a = df.pop("A") + df["A"] = a - df.set_index('index', inplace=True) + df.set_index("index", inplace=True) - store.append('df', df) + store.append("df", df) # test a different ordering but with more fields (like invalid # combinate) with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(10, 2), - columns=list('AB'), dtype='float64') - df['int64'] = Series([1] * len(df), dtype='int64') - df['int16'] = Series([1] * len(df), dtype='int16') - store.append('df', df) + df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") + df["int64"] = 
Series([1] * len(df), dtype="int64") + df["int16"] = Series([1] * len(df), dtype="int16") + store.append("df", df) # store additional fields in different blocks - df['int16_2'] = Series([1] * len(df), dtype='int16') + df["int16_2"] = Series([1] * len(df), dtype="int16") with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) # store multile additional fields in different blocks - df['float_3'] = Series([1.] * len(df), dtype='float64') + df["float_3"] = Series([1.0] * len(df), dtype="float64") with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) def test_append_with_strings(self): @@ -1321,309 +1378,311 @@ def test_append_with_strings(self): with catch_warnings(record=True): def check_col(key, name, size): - assert getattr(store.get_storer(key) - .table.description, name).itemsize == size + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - store.append('df_big', df) - tm.assert_frame_equal(store.select('df_big'), df) - check_col('df_big', 'values_block_1', 15) + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big", df) + tm.assert_frame_equal(store.select("df_big"), df) + check_col("df_big", "values_block_1", 15) # appending smaller string ok - df2 = DataFrame([[124, 'asdqy'], [346, 'dggnhefbdfb']]) - store.append('df_big', df2) + df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) + store.append("df_big", df2) expected = concat([df, df2]) - tm.assert_frame_equal(store.select('df_big'), expected) - check_col('df_big', 'values_block_1', 15) + tm.assert_frame_equal(store.select("df_big"), expected) + check_col("df_big", "values_block_1", 15) # avoid truncation on elements - df = DataFrame([[123, 'asdqwerty'], [345, 'dggnhebbsdfbdfb']]) - store.append('df_big2', df, min_itemsize={'values': 50}) - tm.assert_frame_equal(store.select('df_big2'), df) - check_col('df_big2', 'values_block_1', 50) + df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) + store.append("df_big2", df, min_itemsize={"values": 50}) + tm.assert_frame_equal(store.select("df_big2"), df) + check_col("df_big2", "values_block_1", 50) # bigger string on next append - store.append('df_new', df) + store.append("df_new", df) df_new = DataFrame( - [[124, 'abcdefqhij'], [346, 'abcdefghijklmnopqrtsuvwxyz']]) + [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] + ) with pytest.raises(ValueError): - store.append('df_new', df_new) + store.append("df_new", df_new) # min_itemsize on Series index (GH 11412) - df = tm.makeMixedDataFrame().set_index('C') - store.append('ss', df['B'], min_itemsize={'index': 4}) - tm.assert_series_equal(store.select('ss'), df['B']) + df = tm.makeMixedDataFrame().set_index("C") + store.append("ss", df["B"], min_itemsize={"index": 4}) + tm.assert_series_equal(store.select("ss"), df["B"]) # same as above, with data_columns=True - store.append('ss2', df['B'], data_columns=True, - min_itemsize={'index': 4}) - tm.assert_series_equal(store.select('ss2'), df['B']) + store.append( + "ss2", df["B"], data_columns=True, min_itemsize={"index": 4} + ) + tm.assert_series_equal(store.select("ss2"), df["B"]) # min_itemsize in index without appending (GH 10381) - store.put('ss3', df, format='table', - min_itemsize={'index': 6}) + store.put("ss3", df, format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: - df2 = 
df.copy().reset_index().assign(C='longer').set_index('C') - store.append('ss3', df2) - tm.assert_frame_equal(store.select('ss3'), - pd.concat([df, df2])) + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + store.append("ss3", df2) + tm.assert_frame_equal(store.select("ss3"), pd.concat([df, df2])) # same as above, with a Series - store.put('ss4', df['B'], format='table', - min_itemsize={'index': 6}) - store.append('ss4', df2['B']) - tm.assert_series_equal(store.select('ss4'), - pd.concat([df['B'], df2['B']])) + store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) + store.append("ss4", df2["B"]) + tm.assert_series_equal( + store.select("ss4"), pd.concat([df["B"], df2["B"]]) + ) # with nans - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.loc[1:4, 'string'] = np.nan - df['string2'] = 'bar' - df.loc[4:8, 'string2'] = np.nan - df['string3'] = 'bah' - df.loc[1:, 'string3'] = np.nan - store.append('df', df) - result = store.select('df') + df["string"] = "foo" + df.loc[1:4, "string"] = np.nan + df["string2"] = "bar" + df.loc[4:8, "string2"] = np.nan + df["string3"] = "bah" + df.loc[1:, "string3"] = np.nan + store.append("df", df) + result = store.select("df") tm.assert_frame_equal(result, df) with ensure_clean_store(self.path) as store: def check_col(key, name, size): - assert getattr(store.get_storer(key) - .table.description, name).itemsize, size + assert getattr( + store.get_storer(key).table.description, name + ).itemsize, size - df = DataFrame(dict(A='foo', B='bar'), index=range(10)) + df = DataFrame(dict(A="foo", B="bar"), index=range(10)) # a min_itemsize that creates a data_column - _maybe_remove(store, 'df') - store.append('df', df, min_itemsize={'A': 200}) - check_col('df', 'A', 200) - assert store.get_storer('df').data_columns == ['A'] + _maybe_remove(store, "df") + store.append("df", df, min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["A"] # a min_itemsize that creates a data_column2 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['B'], min_itemsize={'A': 200}) - check_col('df', 'A', 200) - assert store.get_storer('df').data_columns == ['B', 'A'] + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) + check_col("df", "A", 200) + assert store.get_storer("df").data_columns == ["B", "A"] # a min_itemsize that creates a data_column2 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=[ - 'B'], min_itemsize={'values': 200}) - check_col('df', 'B', 200) - check_col('df', 'values_block_0', 200) - assert store.get_storer('df').data_columns == ['B'] + _maybe_remove(store, "df") + store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) + check_col("df", "B", 200) + check_col("df", "values_block_0", 200) + assert store.get_storer("df").data_columns == ["B"] # infer the .typ on subsequent appends - _maybe_remove(store, 'df') - store.append('df', df[:5], min_itemsize=200) - store.append('df', df[5:], min_itemsize=200) - tm.assert_frame_equal(store['df'], df) + _maybe_remove(store, "df") + store.append("df", df[:5], min_itemsize=200) + store.append("df", df[5:], min_itemsize=200) + tm.assert_frame_equal(store["df"], df) # invalid min_itemsize keys - df = DataFrame(['foo', 'foo', 'foo', 'barh', - 'barh', 'barh'], columns=['A']) - _maybe_remove(store, 'df') + df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) + _maybe_remove(store, 
"df") with pytest.raises(ValueError): - store.append('df', df, min_itemsize={'foo': 20, 'foobar': 20}) + store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) def test_append_with_empty_string(self): with ensure_clean_store(self.path) as store: # with all empty strings (GH 12242) - df = DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', '']}) - store.append('df', df[:-1], min_itemsize={'x': 1}) - store.append('df', df[-1:], min_itemsize={'x': 1}) - tm.assert_frame_equal(store.select('df'), df) + df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) + store.append("df", df[:-1], min_itemsize={"x": 1}) + store.append("df", df[-1:], min_itemsize={"x": 1}) + tm.assert_frame_equal(store.select("df"), df) def test_to_hdf_with_min_itemsize(self): with ensure_clean_path(self.path) as path: # min_itemsize in index with to_hdf (GH 10381) - df = tm.makeMixedDataFrame().set_index('C') - df.to_hdf(path, 'ss3', format='table', min_itemsize={'index': 6}) + df = tm.makeMixedDataFrame().set_index("C") + df.to_hdf(path, "ss3", format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: - df2 = df.copy().reset_index().assign(C='longer').set_index('C') - df2.to_hdf(path, 'ss3', append=True, format='table') - tm.assert_frame_equal(pd.read_hdf(path, 'ss3'), - pd.concat([df, df2])) + df2 = df.copy().reset_index().assign(C="longer").set_index("C") + df2.to_hdf(path, "ss3", append=True, format="table") + tm.assert_frame_equal(pd.read_hdf(path, "ss3"), pd.concat([df, df2])) # same as above, with a Series - df['B'].to_hdf(path, 'ss4', format='table', - min_itemsize={'index': 6}) - df2['B'].to_hdf(path, 'ss4', append=True, format='table') - tm.assert_series_equal(pd.read_hdf(path, 'ss4'), - pd.concat([df['B'], df2['B']])) + df["B"].to_hdf(path, "ss4", format="table", min_itemsize={"index": 6}) + df2["B"].to_hdf(path, "ss4", append=True, format="table") + tm.assert_series_equal( + pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) + ) @pytest.mark.parametrize( - "format", - [pytest.param('fixed', marks=xfail_non_writeable), - 'table']) + "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"] + ) def test_to_hdf_errors(self, format): - data = ['\ud800foo'] + data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) with ensure_clean_path(self.path) as path: # GH 20835 - ser.to_hdf(path, 'table', format=format, errors='surrogatepass') + ser.to_hdf(path, "table", format=format, errors="surrogatepass") - result = pd.read_hdf(path, 'table', errors='surrogatepass') + result = pd.read_hdf(path, "table", errors="surrogatepass") tm.assert_series_equal(result, ser) def test_append_with_data_columns(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame() - df.iloc[0, df.columns.get_loc('B')] = 1. 
- _maybe_remove(store, 'df') - store.append('df', df[:2], data_columns=['B']) - store.append('df', df[2:]) - tm.assert_frame_equal(store['df'], df) + df.iloc[0, df.columns.get_loc("B")] = 1.0 + _maybe_remove(store, "df") + store.append("df", df[:2], data_columns=["B"]) + store.append("df", df[2:]) + tm.assert_frame_equal(store["df"], df) # check that we have indices created - assert(store._handle.root.df.table.cols.index.is_indexed is True) - assert(store._handle.root.df.table.cols.B.is_indexed is True) + assert store._handle.root.df.table.cols.index.is_indexed is True + assert store._handle.root.df.table.cols.B.is_indexed is True # data column searching - result = store.select('df', 'B>0') + result = store.select("df", "B>0") expected = df[df.B > 0] tm.assert_frame_equal(result, expected) # data column searching (with an indexable and a data_columns) - result = store.select( - 'df', 'B>0 and index>df.index[3]') + result = store.select("df", "B>0 and index>df.index[3]") df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) # data column selection with a string data_column df_new = df.copy() - df_new['string'] = 'foo' - df_new.loc[1:4, 'string'] = np.nan - df_new.loc[5:6, 'string'] = 'bar' - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string']) - result = store.select('df', "string='foo'") - expected = df_new[df_new.string == 'foo'] + df_new["string"] = "foo" + df_new.loc[1:4, "string"] = np.nan + df_new.loc[5:6, "string"] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"]) + result = store.select("df", "string='foo'") + expected = df_new[df_new.string == "foo"] tm.assert_frame_equal(result, expected) # using min_itemsize and a data column def check_col(key, name, size): - assert getattr(store.get_storer(key) - .table.description, name).itemsize == size + assert ( + getattr(store.get_storer(key).table.description, name).itemsize + == size + ) with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string'], - min_itemsize={'string': 30}) - check_col('df', 'string', 30) - _maybe_remove(store, 'df') + _maybe_remove(store, "df") + store.append( + "df", df_new, data_columns=["string"], min_itemsize={"string": 30} + ) + check_col("df", "string", 30) + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["string"], min_itemsize=30) + check_col("df", "string", 30) + _maybe_remove(store, "df") store.append( - 'df', df_new, data_columns=['string'], min_itemsize=30) - check_col('df', 'string', 30) - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string'], - min_itemsize={'values': 30}) - check_col('df', 'string', 30) + "df", df_new, data_columns=["string"], min_itemsize={"values": 30} + ) + check_col("df", "string", 30) with ensure_clean_store(self.path) as store: - df_new['string2'] = 'foobarbah' - df_new['string_block1'] = 'foobarbah1' - df_new['string_block2'] = 'foobarbah2' - _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string', 'string2'], - min_itemsize={'string': 30, 'string2': 40, - 'values': 50}) - check_col('df', 'string', 30) - check_col('df', 'string2', 40) - check_col('df', 'values_block_1', 50) + df_new["string2"] = "foobarbah" + df_new["string_block1"] = "foobarbah1" + df_new["string_block2"] = "foobarbah2" + _maybe_remove(store, "df") + store.append( + "df", + df_new, + data_columns=["string", "string2"], + min_itemsize={"string": 30, 
"string2": 40, "values": 50}, + ) + check_col("df", "string", 30) + check_col("df", "string2", 40) + check_col("df", "values_block_1", 50) with ensure_clean_store(self.path) as store: # multiple data columns df_new = df.copy() - df_new.iloc[0, df_new.columns.get_loc('A')] = 1. - df_new.iloc[0, df_new.columns.get_loc('B')] = -1. - df_new['string'] = 'foo' + df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 + df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 + df_new["string"] = "foo" - sl = df_new.columns.get_loc('string') + sl = df_new.columns.get_loc("string") df_new.iloc[1:4, sl] = np.nan - df_new.iloc[5:6, sl] = 'bar' + df_new.iloc[5:6, sl] = "bar" - df_new['string2'] = 'foo' - sl = df_new.columns.get_loc('string2') + df_new["string2"] = "foo" + sl = df_new.columns.get_loc("string2") df_new.iloc[2:5, sl] = np.nan - df_new.iloc[7:8, sl] = 'bar' - _maybe_remove(store, 'df') - store.append( - 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', - "string='foo' and string2='foo'" - " and A>0 and B<0") - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + df_new.iloc[7:8, sl] = "bar" + _maybe_remove(store, "df") + store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) + result = store.select( + "df", "string='foo' and string2='foo'" " and A>0 and B<0" + ) + expected = df_new[ + (df_new.string == "foo") + & (df_new.string2 == "foo") + & (df_new.A > 0) + & (df_new.B < 0) + ] tm.assert_frame_equal(result, expected, check_index_type=False) # yield an empty frame - result = store.select('df', "string='foo' and string2='cool'") - expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'cool')] + result = store.select("df", "string='foo' and string2='cool'") + expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: # doc example df_dc = df.copy() - df_dc['string'] = 'foo' - df_dc.loc[4:6, 'string'] = np.nan - df_dc.loc[7:9, 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc['datetime'] = Timestamp('20010102') + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc["string2"] = "cool" + df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) - df_dc.loc[3:5, ['A', 'B', 'datetime']] = np.nan + df_dc.loc[3:5, ["A", "B", "datetime"]] = np.nan - _maybe_remove(store, 'df_dc') - store.append('df_dc', df_dc, - data_columns=['B', 'C', 'string', - 'string2', 'datetime']) - result = store.select('df_dc', 'B>0') + _maybe_remove(store, "df_dc") + store.append( + "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] + ) + result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected, check_index_type=False) - result = store.select( - 'df_dc', ['B > 0', 'C > 0', 'string == foo']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( - df_dc.string == 'foo')] + result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: # doc example part 2 np.random.seed(1234) - index = date_range('1/1/2000', periods=8) - df_dc = DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) - df_dc['string'] = 'foo' - df_dc.loc[4:6, 'string'] = 
np.nan - df_dc.loc[7:9, 'string'] = 'bar' - df_dc.loc[:, ['B', 'C']] = df_dc.loc[:, ['B', 'C']].abs() - df_dc['string2'] = 'cool' + index = date_range("1/1/2000", periods=8) + df_dc = DataFrame( + np.random.randn(8, 3), index=index, columns=["A", "B", "C"] + ) + df_dc["string"] = "foo" + df_dc.loc[4:6, "string"] = np.nan + df_dc.loc[7:9, "string"] = "bar" + df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() + df_dc["string2"] = "cool" # on-disk operations - store.append('df_dc', df_dc, data_columns=[ - 'B', 'C', 'string', 'string2']) + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) - result = store.select('df_dc', 'B>0') + result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) - result = store.select( - 'df_dc', ['B > 0', 'C > 0', 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & - (df_dc.string == 'foo')] + result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected) def test_create_table_index(self): @@ -1631,64 +1690,64 @@ def test_create_table_index(self): with ensure_clean_store(self.path) as store: with catch_warnings(record=True): + def col(t, column): return getattr(store.get_storer(t).table.cols, column) # data columns df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df['string2'] = 'bar' - store.append('f', df, data_columns=['string', 'string2']) - assert(col('f', 'index').is_indexed is True) - assert(col('f', 'string').is_indexed is True) - assert(col('f', 'string2').is_indexed is True) + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string", "string2"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + assert col("f", "string2").is_indexed is True # specify index=columns store.append( - 'f2', df, index=['string'], - data_columns=['string', 'string2']) - assert(col('f2', 'index').is_indexed is False) - assert(col('f2', 'string').is_indexed is True) - assert(col('f2', 'string2').is_indexed is False) + "f2", df, index=["string"], data_columns=["string", "string2"] + ) + assert col("f2", "index").is_indexed is False + assert col("f2", "string").is_indexed is True + assert col("f2", "string2").is_indexed is False # try to index a non-table - _maybe_remove(store, 'f2') - store.put('f2', df) + _maybe_remove(store, "f2") + store.put("f2", df) with pytest.raises(TypeError): - store.create_table_index('f2') + store.create_table_index("f2") def test_append_hierarchical(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(self.path) as store: - store.append('mi', df) - result = store.select('mi') + store.append("mi", df) + result = store.select("mi") tm.assert_frame_equal(result, df) # GH 3748 - result = store.select('mi', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + result = store.select("mi", columns=["A", "B"]) + expected = 
df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) - with ensure_clean_path('test.hdf') as path: - df.to_hdf(path, 'df', format='table') - result = read_hdf(path, 'df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + with ensure_clean_path("test.hdf") as path: + df.to_hdf(path, "df", format="table") + result = read_hdf(path, "df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_column_multiindex(self): # GH 4710 # recreate multi-indexes properly - index = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), - ('B', 'a'), ('B', 'b')], - names=['first', 'second']) + index = MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"] + ) df = DataFrame(np.arange(12).reshape(3, 4), columns=index) expected = df.copy() if isinstance(expected.index, RangeIndex): @@ -1696,41 +1755,42 @@ def test_column_multiindex(self): with ensure_clean_store(self.path) as store: - store.put('df', df) - tm.assert_frame_equal(store['df'], expected, - check_index_type=True, - check_column_type=True) + store.put("df", df) + tm.assert_frame_equal( + store["df"], expected, check_index_type=True, check_column_type=True + ) - store.put('df1', df, format='table') - tm.assert_frame_equal(store['df1'], expected, - check_index_type=True, - check_column_type=True) + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) with pytest.raises(ValueError): - store.put('df2', df, format='table', data_columns=['A']) + store.put("df2", df, format="table", data_columns=["A"]) with pytest.raises(ValueError): - store.put('df3', df, format='table', data_columns=True) + store.put("df3", df, format="table", data_columns=True) # appending multi-column on existing table (see GH 6167) with ensure_clean_store(self.path) as store: - store.append('df2', df) - store.append('df2', df) + store.append("df2", df) + store.append("df2", df) - tm.assert_frame_equal(store['df2'], concat((df, df))) + tm.assert_frame_equal(store["df2"], concat((df, df))) # non_index_axes name - df = DataFrame(np.arange(12).reshape(3, 4), - columns=Index(list('ABCD'), name='foo')) + df = DataFrame( + np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo") + ) expected = df.copy() if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: - store.put('df1', df, format='table') - tm.assert_frame_equal(store['df1'], expected, - check_index_type=True, - check_column_type=True) + store.put("df1", df, format="table") + tm.assert_frame_equal( + store["df1"], expected, check_index_type=True, check_column_type=True + ) def test_store_multiindex(self): @@ -1739,97 +1799,108 @@ def test_store_multiindex(self): with ensure_clean_store(self.path) as store: def make_index(names=None): - return MultiIndex.from_tuples([(datetime.datetime(2013, 12, d), - s, t) - for d in range(1, 3) - for s in range(2) - for t in range(3)], - names=names) + return MultiIndex.from_tuples( + [ + (datetime.datetime(2013, 12, d), s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3) + ], + names=names, + ) # no names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index()) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], 
index=make_index()) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) # partial names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', None, None])) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", None, None]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) # series - _maybe_remove(store, 's') - s = Series(np.zeros(12), index=make_index(['date', None, None])) - store.append('s', s) - xp = Series(np.zeros(12), index=make_index( - ['date', 'level_1', 'level_2'])) - tm.assert_series_equal(store.select('s'), xp) + _maybe_remove(store, "s") + s = Series(np.zeros(12), index=make_index(["date", None, None])) + store.append("s", s) + xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"])) + tm.assert_series_equal(store.select("s"), xp) # dup with column - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', 'a', 't'])) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "a", "t"]), + ) with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) # dup within level - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], - index=make_index(['date', 'date', 'date'])) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "date", "date"]), + ) with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) # fully names - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=[ - 'a', 'b'], index=make_index(['date', 's', 't'])) - store.append('df', df) - tm.assert_frame_equal(store.select('df'), df) + _maybe_remove(store, "df") + df = DataFrame( + np.zeros((12, 2)), + columns=["a", "b"], + index=make_index(["date", "s", "t"]), + ) + store.append("df", df) + tm.assert_frame_equal(store.select("df"), df) def test_select_columns_in_where(self): # GH 6169 # recreate multi-indexes when columns is passed # in the `where` argument - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo_name', 'bar_name']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo_name", "bar_name"], + ) # With a DataFrame - df = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table') - expected = df[['A']] + store.put("df", df, format="table") + expected = df[["A"]] - tm.assert_frame_equal(store.select('df', columns=['A']), expected) + tm.assert_frame_equal(store.select("df", columns=["A"]), expected) - tm.assert_frame_equal(store.select( - 'df', where="columns=['A']"), expected) + tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected) # With a Series - s = Series(np.random.randn(10), index=index, - name='A') + s = Series(np.random.randn(10), index=index, name="A") with ensure_clean_store(self.path) as store: - store.put('s', s, format='table') - 
tm.assert_series_equal(store.select('s', where="columns=['A']"), s) + store.put("s", s, format="table") + tm.assert_series_equal(store.select("s", where="columns=['A']"), s) def test_mi_data_columns(self): # GH 14435 - idx = pd.MultiIndex.from_arrays([date_range('2000-01-01', periods=5), - range(5)], names=['date', 'id']) - df = pd.DataFrame({'a': [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) + idx = pd.MultiIndex.from_arrays( + [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] + ) + df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=True) + store.append("df", df, data_columns=True) - actual = store.select('df', where='id == 1') + actual = store.select("df", where="id == 1") expected = df.iloc[[1], :] tm.assert_frame_equal(actual, expected) @@ -1838,62 +1909,62 @@ def test_pass_spec_to_storer(self): df = tm.makeDataFrame() with ensure_clean_store(self.path) as store: - store.put('df', df) + store.put("df", df) with pytest.raises(TypeError): - store.select('df', columns=['A']) + store.select("df", columns=["A"]) with pytest.raises(TypeError): - store.select('df', where=[('columns=A')]) + store.select("df", where=[("columns=A")]) @xfail_non_writeable def test_append_misc(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - store.append('df', df, chunksize=1) - result = store.select('df') + store.append("df", df, chunksize=1) + result = store.select("df") tm.assert_frame_equal(result, df) - store.append('df1', df, expectedrows=10) - result = store.select('df1') + store.append("df1", df, expectedrows=10) + result = store.select("df1") tm.assert_frame_equal(result, df) # more chunksize in append tests def check(obj, comparator): for c in [10, 200, 1000]: - with ensure_clean_store(self.path, mode='w') as store: - store.append('obj', obj, chunksize=c) - result = store.select('obj') + with ensure_clean_store(self.path, mode="w") as store: + store.append("obj", obj, chunksize=c) + result = store.select("obj") comparator(result, obj) df = tm.makeDataFrame() - df['string'] = 'foo' - df['float322'] = 1. 
- df['float322'] = df['float322'].astype('float32') - df['bool'] = df['float322'] > 0 - df['time1'] = Timestamp('20130101') - df['time2'] = Timestamp('20130102') + df["string"] = "foo" + df["float322"] = 1.0 + df["float322"] = df["float322"].astype("float32") + df["bool"] = df["float322"] > 0 + df["time1"] = Timestamp("20130101") + df["time2"] = Timestamp("20130102") check(df, tm.assert_frame_equal) # empty frame, GH4273 with ensure_clean_store(self.path) as store: # 0 len - df_empty = DataFrame(columns=list('ABC')) - store.append('df', df_empty) + df_empty = DataFrame(columns=list("ABC")) + store.append("df", df_empty) with pytest.raises(KeyError): - store.select('df') + store.select("df") # repeated append of 0/non-zero frames - df = DataFrame(np.random.rand(10, 3), columns=list('ABC')) - store.append('df', df) - assert_frame_equal(store.select('df'), df) - store.append('df', df_empty) - assert_frame_equal(store.select('df'), df) + df = DataFrame(np.random.rand(10, 3), columns=list("ABC")) + store.append("df", df) + assert_frame_equal(store.select("df"), df) + store.append("df", df_empty) + assert_frame_equal(store.select("df"), df) # store - df = DataFrame(columns=list('ABC')) - store.put('df2', df) - assert_frame_equal(store.select('df2'), df) + df = DataFrame(columns=list("ABC")) + store.put("df2", df) + assert_frame_equal(store.select("df2"), df) def test_append_raise(self): @@ -1903,93 +1974,104 @@ def test_append_raise(self): # list in column df = tm.makeDataFrame() - df['invalid'] = [['a']] * len(df) - assert df.dtypes['invalid'] == np.object_ + df["invalid"] = [["a"]] * len(df) + assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # multiple invalid columns - df['invalid2'] = [['a']] * len(df) - df['invalid3'] = [['a']] * len(df) + df["invalid2"] = [["a"]] * len(df) + df["invalid3"] = [["a"]] * len(df) with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # datetime with embedded nans as object df = tm.makeDataFrame() s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) s[0:5] = np.nan - df['invalid'] = s - assert df.dtypes['invalid'] == np.object_ + df["invalid"] = s + assert df.dtypes["invalid"] == np.object_ with pytest.raises(TypeError): - store.append('df', df) + store.append("df", df) # directly ndarray with pytest.raises(TypeError): - store.append('df', np.arange(10)) + store.append("df", np.arange(10)) # series directly with pytest.raises(TypeError): - store.append('df', Series(np.arange(10))) + store.append("df", Series(np.arange(10))) # appending an incompatible table df = tm.makeDataFrame() - store.append('df', df) + store.append("df", df) - df['foo'] = 'foo' + df["foo"] = "foo" with pytest.raises(ValueError): - store.append('df', df) + store.append("df", df) def test_table_index_incompatible_dtypes(self): - df1 = DataFrame({'a': [1, 2, 3]}) - df2 = DataFrame({'a': [4, 5, 6]}, - index=date_range('1/1/2000', periods=3)) + df1 = DataFrame({"a": [1, 2, 3]}) + df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) with ensure_clean_store(self.path) as store: - store.put('frame', df1, format='table') + store.put("frame", df1, format="table") with pytest.raises(TypeError): - store.put('frame', df2, format='table', append=True) + store.put("frame", df2, format="table", append=True) def test_table_values_dtypes_roundtrip(self): with ensure_clean_store(self.path) as store: - df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') - 
store.append('df_f8', df1) - assert_series_equal(df1.dtypes, store['df_f8'].dtypes) + df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") + store.append("df_f8", df1) + assert_series_equal(df1.dtypes, store["df_f8"].dtypes) - df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') - store.append('df_i8', df2) - assert_series_equal(df2.dtypes, store['df_i8'].dtypes) + df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8") + store.append("df_i8", df2) + assert_series_equal(df2.dtypes, store["df_i8"].dtypes) # incompatible dtype with pytest.raises(ValueError): - store.append('df_i8', df1) + store.append("df_i8", df1) # check creation/storage/retrieval of float32 (a bit hacky to # actually create them thought) - df1 = DataFrame( - np.array([[1], [2], [3]], dtype='f4'), columns=['A']) - store.append('df_f4', df1) - assert_series_equal(df1.dtypes, store['df_f4'].dtypes) - assert df1.dtypes[0] == 'float32' + df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"]) + store.append("df_f4", df1) + assert_series_equal(df1.dtypes, store["df_f4"].dtypes) + assert df1.dtypes[0] == "float32" # check with mixed dtypes - df1 = DataFrame({c: Series(np.random.randint(5), dtype=c) - for c in ['float32', 'float64', 'int32', - 'int64', 'int16', 'int8']}) - df1['string'] = 'foo' - df1['float322'] = 1. - df1['float322'] = df1['float322'].astype('float32') - df1['bool'] = df1['float32'] > 0 - df1['time1'] = Timestamp('20130101') - df1['time2'] = Timestamp('20130102') - - store.append('df_mixed_dtypes1', df1) - result = store.select('df_mixed_dtypes1').dtypes.value_counts() + df1 = DataFrame( + { + c: Series(np.random.randint(5), dtype=c) + for c in ["float32", "float64", "int32", "int64", "int16", "int8"] + } + ) + df1["string"] = "foo" + df1["float322"] = 1.0 + df1["float322"] = df1["float322"].astype("float32") + df1["bool"] = df1["float32"] > 0 + df1["time1"] = Timestamp("20130101") + df1["time2"] = Timestamp("20130102") + + store.append("df_mixed_dtypes1", df1) + result = store.select("df_mixed_dtypes1").dtypes.value_counts() result.index = [str(i) for i in result.index] - expected = Series({'float32': 2, 'float64': 1, 'int32': 1, - 'bool': 1, 'int16': 1, 'int8': 1, - 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) + expected = Series( + { + "float32": 2, + "float64": 1, + "int32": 1, + "bool": 1, + "int16": 1, + "int8": 1, + "int64": 1, + "object": 1, + "datetime64[ns]": 2, + } + ) result = result.sort_index() expected = expected.sort_index() tm.assert_series_equal(result, expected) @@ -1998,87 +2080,93 @@ def test_table_mixed_dtypes(self): # frame df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['bool3'] = True - df['int1'] = 1 - df['int2'] = 2 - df['timestamp1'] = Timestamp('20010102') - df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) - df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) - df.loc[3:6, ['obj1']] = np.nan + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["bool3"] = True + df["int1"] = 1 + df["int2"] = 2 + df["timestamp1"] = Timestamp("20010102") + df["timestamp2"] = Timestamp("20010103") + df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) + df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) + df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: - store.append('df1_mixed', df) - tm.assert_frame_equal(store.select('df1_mixed'), df) + 
store.append("df1_mixed", df) + tm.assert_frame_equal(store.select("df1_mixed"), df) def test_unimplemented_dtypes_table_columns(self): with ensure_clean_store(self.path) as store: - dtypes = [('date', datetime.date(2001, 1, 2))] + dtypes = [("date", datetime.date(2001, 1, 2))] # currently not supported dtypes #### for n, f in dtypes: df = tm.makeDataFrame() df[n] = f with pytest.raises(TypeError): - store.append('df1_%s' % n, df) + store.append("df1_%s" % n, df) # frame df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['datetime1'] = datetime.date(2001, 1, 2) + df["obj1"] = "foo" + df["obj2"] = "bar" + df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate()._convert(datetime=True) with ensure_clean_store(self.path) as store: # this fails because we have a date in the object block...... with pytest.raises(TypeError): - store.append('df_unimplemented', df) + store.append("df_unimplemented", df) @xfail_non_writeable @pytest.mark.skipif( - LooseVersion(np.__version__) == LooseVersion('1.15.0'), - reason=("Skipping pytables test when numpy version is " - "exactly equal to 1.15.0: gh-22098")) + LooseVersion(np.__version__) == LooseVersion("1.15.0"), + reason=( + "Skipping pytables test when numpy version is " + "exactly equal to 1.15.0: gh-22098" + ), + ) def test_calendar_roundtrip_issue(self): # 8591 # doc example from tseries holiday section - weekmask_egypt = 'Sun Mon Tue Wed Thu' - holidays = ['2012-05-01', - datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')] + weekmask_egypt = "Sun Mon Tue Wed Thu" + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] bday_egypt = pd.offsets.CustomBusinessDay( - holidays=holidays, weekmask=weekmask_egypt) + holidays=holidays, weekmask=weekmask_egypt + ) dt = datetime.datetime(2013, 4, 30) dts = date_range(dt, periods=5, freq=bday_egypt) - s = (Series(dts.weekday, dts).map( - Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) with ensure_clean_store(self.path) as store: - store.put('fixed', s) - result = store.select('fixed') + store.put("fixed", s) + result = store.select("fixed") assert_series_equal(result, s) - store.append('table', s) - result = store.select('table') + store.append("table", s) + result = store.select("table") assert_series_equal(result, s) def test_roundtrip_tz_aware_index(self): # GH 17618 - time = pd.Timestamp('2000-01-01 01:00:00', tz='US/Eastern') + time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern") df = pd.DataFrame(data=[0], index=[time]) with ensure_clean_store(self.path) as store: - store.put('frame', df, format='fixed') - recons = store['frame'] + store.put("frame", df, format="fixed") + recons = store["frame"] tm.assert_frame_equal(recons, df) assert recons.index[0].value == 946706400000000000 @@ -2086,42 +2174,49 @@ def test_append_with_timedelta(self): # GH 3577 # append timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp( - '20130101') + timedelta(days=i, seconds=10) for i in range(10)])) - df['C'] = df['A'] - df['B'] - df.loc[3:5, 'C'] = np.nan + df = DataFrame( + dict( + A=Timestamp("20130101"), + B=[ + Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) + ], + ) + ) + df["C"] = df["A"] - df["B"] + df.loc[3:5, "C"] = np.nan with ensure_clean_store(self.path) as store: # table - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - result = store.select('df') + _maybe_remove(store, "df") + 
store.append("df", df, data_columns=True) + result = store.select("df") assert_frame_equal(result, df) - result = store.select('df', where="C<100000") + result = store.select("df", where="C<100000") assert_frame_equal(result, df) - result = store.select('df', where="C') + store.select("df", "index>") # from the docs with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10, 4), columns=list( - 'ABCD'), index=date_range('20130101', periods=10)) - dfq.to_hdf(path, 'dfq', format='table', data_columns=True) + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table", data_columns=True) # check ok - read_hdf(path, 'dfq', - where="index>Timestamp('20130104') & columns=['A', 'B']") - read_hdf(path, 'dfq', where="A>0 or C>0") + read_hdf( + path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']" + ) + read_hdf(path, "dfq", where="A>0 or C>0") # catch the invalid reference with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10, 4), columns=list( - 'ABCD'), index=date_range('20130101', periods=10)) - dfq.to_hdf(path, 'dfq', format='table') + dfq = DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=date_range("20130101", periods=10), + ) + dfq.to_hdf(path, "dfq", format="table") with pytest.raises(ValueError): - read_hdf(path, 'dfq', where="A>0 or C>0") + read_hdf(path, "dfq", where="A>0 or C>0") def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: import pandas as pd - df = DataFrame(np.random.randn(20, 2), - index=pd.date_range('20130101', periods=20)) - store.put('df', df, format='table') - expected = df[df.index > pd.Timestamp('20130105')] + + df = DataFrame( + np.random.randn(20, 2), index=pd.date_range("20130101", periods=20) + ) + store.put("df", df, format="table") + expected = df[df.index > pd.Timestamp("20130105")] import datetime # noqa - result = store.select('df', 'index>datetime.datetime(2013,1,5)') + + result = store.select("df", "index>datetime.datetime(2013,1,5)") assert_frame_equal(result, expected) from datetime import datetime # noqa # technically an error, but allow it - result = store.select('df', 'index>datetime.datetime(2013,1,5)') + result = store.select("df", "index>datetime.datetime(2013,1,5)") assert_frame_equal(result, expected) - result = store.select('df', 'index>datetime(2013,1,5)') + result = store.select("df", "index>datetime(2013,1,5)") assert_frame_equal(result, expected) def test_series(self): @@ -2239,10 +2344,8 @@ def test_series(self): ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), - dtype=object)) - self._check_roundtrip(ts3, tm.assert_series_equal, - check_index_type=False) + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) @ignore_sparse @ignore_series_tosparse @@ -2251,16 +2354,13 @@ def test_sparse_series(self): s = tm.makeStringSeries() s.iloc[3:5] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_series_equal, - check_series_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_series_equal, 
check_series_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) @ignore_sparse @ignore_dataframe_tosparse @@ -2271,16 +2371,13 @@ def test_sparse_frame(self): s.iloc[8:10, -2] = np.nan ss = s.to_sparse() - self._check_double_roundtrip(ss, tm.assert_frame_equal, - check_frame_type=True) + self._check_double_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_double_roundtrip(ss2, tm.assert_frame_equal, - check_frame_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_double_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_double_roundtrip(ss3, tm.assert_frame_equal, - check_frame_type=True) + self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) def test_float_index(self): @@ -2294,7 +2391,7 @@ def test_tuple_index(self): # GH #492 col = np.arange(10) - idx = [(0., 1.), (2., 3.), (4., 5.)] + idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)] data = np.random.randn(30).reshape((3, 10)) DF = DataFrame(data, index=idx, columns=col) @@ -2309,13 +2406,12 @@ def test_index_types(self): with catch_warnings(record=True): values = np.random.randn(2) - func = lambda l, r: tm.assert_series_equal(l, r, - check_dtype=True, - check_index_type=True, - check_series_type=True) + func = lambda l, r: tm.assert_series_equal( + l, r, check_dtype=True, check_index_type=True, check_series_type=True + ) with catch_warnings(record=True): - ser = Series(values, [0, 'y']) + ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func) with catch_warnings(record=True): @@ -2323,28 +2419,28 @@ def test_index_types(self): self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, ['y', 0]) + ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, [datetime.date.today(), 'a']) + ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func) with catch_warnings(record=True): - ser = Series(values, [0, 'y']) + ser = Series(values, [0, "y"]) self._check_roundtrip(ser, func) ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) - ser = Series(values, ['y', 0]) + ser = Series(values, ["y", 0]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.date.today(), 'a']) + ser = Series(values, [datetime.date.today(), "a"]) self._check_roundtrip(ser, func) - ser = Series(values, [1.23, 'b']) + ser = Series(values, [1.23, "b"]) self._check_roundtrip(ser, func) ser = Series(values, [1, 1.53]) @@ -2353,23 +2449,24 @@ def test_index_types(self): ser = Series(values, [1, 5]) self._check_roundtrip(ser, func) - ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + ser = Series( + values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] + ) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): - dr = bdate_range('1/1/1940', '1/1/1960') + dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) try: self._check_roundtrip(ts, tm.assert_series_equal) except OverflowError: - pytest.skip('known failer on some windows platforms') + pytest.skip("known failer on some windows platforms") @xfail_non_writeable - @pytest.mark.parametrize("compression", [ - False, pytest.param(True, 
marks=td.skip_if_windows_python_3) - ]) + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) def test_frame(self, compression): df = tm.makeDataFrame() @@ -2378,20 +2475,17 @@ def test_frame(self, compression): df.values[0, 0] = np.nan df.values[5, 3] = np.nan - self._check_roundtrip_table(df, tm.assert_frame_equal, - compression=compression) - self._check_roundtrip(df, tm.assert_frame_equal, - compression=compression) + self._check_roundtrip_table(df, tm.assert_frame_equal, compression=compression) + self._check_roundtrip(df, tm.assert_frame_equal, compression=compression) tdf = tm.makeTimeDataFrame() - self._check_roundtrip(tdf, tm.assert_frame_equal, - compression=compression) + self._check_roundtrip(tdf, tm.assert_frame_equal, compression=compression) with ensure_clean_store(self.path) as store: # not consolidated - df['foo'] = np.random.randn(len(df)) - store['df'] = df - recons = store['df'] + df["foo"] = np.random.randn(len(df)) + store["df"] = df + recons = store["df"] assert recons._data.is_consolidated() # empty @@ -2400,10 +2494,10 @@ def test_frame(self, compression): @xfail_non_writeable def test_empty_series_frame(self): s0 = Series() - s1 = Series(name='myseries') + s1 = Series(name="myseries") df0 = DataFrame() - df1 = DataFrame(index=['a', 'b', 'c']) - df2 = DataFrame(columns=['d', 'e', 'f']) + df1 = DataFrame(index=["a", "b", "c"]) + df2 = DataFrame(columns=["d", "e", "f"]) self._check_roundtrip(s0, tm.assert_series_equal) self._check_roundtrip(s1, tm.assert_series_equal) @@ -2413,73 +2507,75 @@ def test_empty_series_frame(self): @xfail_non_writeable @pytest.mark.parametrize( - 'dtype', [np.int64, np.float64, np.object, 'm8[ns]', 'M8[ns]']) + "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] + ) def test_empty_series(self, dtype): s = Series(dtype=dtype) self._check_roundtrip(s, tm.assert_series_equal) def test_can_serialize_dates(self): - rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] + rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) self._check_roundtrip(frame, tm.assert_frame_equal) def test_store_hierarchical(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) self._check_roundtrip(frame, tm.assert_frame_equal) self._check_roundtrip(frame.T, tm.assert_frame_equal) - self._check_roundtrip(frame['A'], tm.assert_series_equal) + self._check_roundtrip(frame["A"], tm.assert_series_equal) # check that the names are stored with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] + store["frame"] = frame + recons = store["frame"] tm.assert_frame_equal(recons, frame) def test_store_index_name(self): df = tm.makeDataFrame() - df.index.name = 'foo' + df.index.name = "foo" with ensure_clean_store(self.path) as store: - store['frame'] = df - recons = store['frame'] + store["frame"] = df + recons = store["frame"] tm.assert_frame_equal(recons, df) def test_store_index_name_with_tz(self): # GH 13884 
- df = pd.DataFrame({'A': [1, 2]}) + df = pd.DataFrame({"A": [1, 2]}) df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) - df.index = df.index.tz_localize('UTC') - df.index.name = 'foo' + df.index = df.index.tz_localize("UTC") + df.index.name = "foo" with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') - recons = store['frame'] + store.put("frame", df, format="table") + recons = store["frame"] tm.assert_frame_equal(recons, df) - @pytest.mark.parametrize('table_format', ['table', 'fixed']) + @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(self, table_format): # GH #13492 - idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1), - datetime.date(2000, 1, 2)]), - name='cols\u05d2') - idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1), - datetime.date(2010, 1, 2)]), - name='rows\u05d0') + idx = pd.Index( + pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), + name="cols\u05d2", + ) + idx1 = pd.Index( + pd.to_datetime([datetime.date(2010, 1, 1), datetime.date(2010, 1, 2)]), + name="rows\u05d0", + ) df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format=table_format) - df2 = read_hdf(path, 'df') + df.to_hdf(path, "df", format=table_format) + df2 = read_hdf(path, "df") assert_frame_equal(df, df2, check_names=True) @@ -2488,27 +2584,26 @@ def test_store_index_name_numpy_str(self, table_format): def test_store_series_name(self): df = tm.makeDataFrame() - series = df['A'] + series = df["A"] with ensure_clean_store(self.path) as store: - store['series'] = series - recons = store['series'] + store["series"] = series + recons = store["series"] tm.assert_series_equal(recons, series) @xfail_non_writeable - @pytest.mark.parametrize("compression", [ - False, pytest.param(True, marks=td.skip_if_windows_python_3) - ]) + @pytest.mark.parametrize( + "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] + ) def test_store_mixed(self, compression): - def _make_one(): df = tm.makeDataFrame() - df['obj1'] = 'foo' - df['obj2'] = 'bar' - df['bool1'] = df['A'] > 0 - df['bool2'] = df['B'] > 0 - df['int1'] = 1 - df['int2'] = 2 + df["obj1"] = "foo" + df["obj2"] = "bar" + df["bool1"] = df["A"] > 0 + df["bool2"] = df["B"] > 0 + df["int1"] = 1 + df["int2"] = 2 return df._consolidate() df1 = _make_one() @@ -2518,18 +2613,21 @@ def _make_one(): self._check_roundtrip(df2, tm.assert_frame_equal) with ensure_clean_store(self.path) as store: - store['obj'] = df1 - tm.assert_frame_equal(store['obj'], df1) - store['obj'] = df2 - tm.assert_frame_equal(store['obj'], df2) + store["obj"] = df1 + tm.assert_frame_equal(store["obj"], df1) + store["obj"] = df2 + tm.assert_frame_equal(store["obj"], df2) # check that can store Series of all of these types - self._check_roundtrip(df1['obj1'], tm.assert_series_equal, - compression=compression) - self._check_roundtrip(df1['bool1'], tm.assert_series_equal, - compression=compression) - self._check_roundtrip(df1['int1'], tm.assert_series_equal, - compression=compression) + self._check_roundtrip( + df1["obj1"], tm.assert_series_equal, compression=compression + ) + self._check_roundtrip( + df1["bool1"], tm.assert_series_equal, compression=compression + ) + self._check_roundtrip( + df1["int1"], tm.assert_series_equal, compression=compression + ) @pytest.mark.filterwarnings( 
"ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" @@ -2537,70 +2635,73 @@ def _make_one(): def test_select_with_dups(self): # single dtypes - df = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df.index = date_range('20130101 9:30', periods=10, freq='T') + df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) + df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(self.path) as store: - store.append('df', df) + store.append("df", df) - result = store.select('df') + result = store.select("df") expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=df.columns) + result = store.select("df", columns=df.columns) expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=['A']) - expected = df.loc[:, ['A']] + result = store.select("df", columns=["A"]) + expected = df.loc[:, ["A"]] assert_frame_equal(result, expected) # dups across dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - df.index = date_range('20130101 9:30', periods=10, freq='T') + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) + df.index = date_range("20130101 9:30", periods=10, freq="T") with ensure_clean_store(self.path) as store: - store.append('df', df) + store.append("df", df) - result = store.select('df') + result = store.select("df") expected = df assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df', columns=df.columns) + result = store.select("df", columns=df.columns) expected = df assert_frame_equal(result, expected, by_blocks=True) - expected = df.loc[:, ['A']] - result = store.select('df', columns=['A']) + expected = df.loc[:, ["A"]] + result = store.select("df", columns=["A"]) assert_frame_equal(result, expected, by_blocks=True) - expected = df.loc[:, ['B', 'A']] - result = store.select('df', columns=['B', 'A']) + expected = df.loc[:, ["B", "A"]] + result = store.select("df", columns=["B", "A"]) assert_frame_equal(result, expected, by_blocks=True) # duplicates on both index and columns with ensure_clean_store(self.path) as store: - store.append('df', df) - store.append('df', df) + store.append("df", df) + store.append("df", df) - expected = df.loc[:, ['B', 'A']] + expected = df.loc[:, ["B", "A"]] expected = concat([expected, expected]) - result = store.select('df', columns=['B', 'A']) + result = store.select("df", columns=["B", "A"]) assert_frame_equal(result, expected, by_blocks=True) def test_overwrite_node(self): with ensure_clean_store(self.path) as store: - store['a'] = tm.makeTimeDataFrame() + store["a"] = tm.makeTimeDataFrame() ts = tm.makeTimeSeries() - store['a'] = ts + store["a"] = ts - tm.assert_series_equal(store['a'], ts) + tm.assert_series_equal(store["a"], ts) @ignore_sparse @ignore_dataframe_tosparse @@ -2609,32 +2710,32 @@ def test_sparse_with_compression(self): # GH 2931 # make sparse dataframe - arr = np.random.binomial(n=1, p=.01, size=(1000, 10)) + arr = np.random.binomial(n=1, p=0.01, size=(1000, 10)) df = DataFrame(arr).to_sparse(fill_value=0) # case 1: store uncompressed - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression=False, - check_frame_type=True) + self._check_double_roundtrip( + df, 
tm.assert_frame_equal, compression=False, check_frame_type=True + ) # case 2: store compressed (works) - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression='zlib', - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression="zlib", check_frame_type=True + ) # set one series to be completely sparse df[0] = np.zeros(1000) # case 3: store df with completely sparse series uncompressed - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression=False, - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression=False, check_frame_type=True + ) # case 4: try storing df with completely sparse series compressed # (fails) - self._check_double_roundtrip(df, tm.assert_frame_equal, - compression='zlib', - check_frame_type=True) + self._check_double_roundtrip( + df, tm.assert_frame_equal, compression="zlib", check_frame_type=True + ) def test_select(self): @@ -2644,113 +2745,113 @@ def test_select(self): # select with columns= df = tm.makeTimeDataFrame() - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df', columns=['A', 'B']) - expected = df.reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df", columns=["A", "B"]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # equivalently - result = store.select('df', [("columns=['A', 'B']")]) - expected = df.reindex(columns=['A', 'B']) + result = store.select("df", [("columns=['A', 'B']")]) + expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # all a data columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - result = store.select('df', ['A > 0'], columns=['A', 'B']) - expected = df[df.A > 0].reindex(columns=['A', 'B']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + result = store.select("df", ["A > 0"], columns=["A", "B"]) + expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column, but different columns - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['A']) - result = store.select('df', ['A > 0'], columns=['C', 'D']) - expected = df[df.A > 0].reindex(columns=['C', 'D']) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["A"]) + result = store.select("df", ["A > 0"], columns=["C", "D"]) + expected = df[df.A > 0].reindex(columns=["C", "D"]) tm.assert_frame_equal(expected, result) def test_select_dtypes(self): with ensure_clean_store(self.path) as store: # with a Timestamp data column (GH #2637) - df = DataFrame(dict( - ts=bdate_range('2012-01-01', periods=300), - A=np.random.randn(300))) - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['ts', 'A']) - - result = store.select('df', "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp('2012-02-01')] + df = DataFrame( + dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) + ) + _maybe_remove(store, "df") + store.append("df", df, 
data_columns=["ts", "A"]) + + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # bool columns (GH #2849) - df = DataFrame(np.random.randn(5, 2), columns=['A', 'B']) - df['object'] = 'foo' - df.loc[4:5, 'object'] = 'bar' - df['boolv'] = df['A'] > 0 - _maybe_remove(store, 'df') - store.append('df', df, data_columns=True) - - expected = (df[df.boolv == True] # noqa - .reindex(columns=['A', 'boolv'])) - for v in [True, 'true', 1]: - result = store.select('df', 'boolv == %s' % str(v), - columns=['A', 'boolv']) + df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) + df["object"] = "foo" + df.loc[4:5, "object"] = "bar" + df["boolv"] = df["A"] > 0 + _maybe_remove(store, "df") + store.append("df", df, data_columns=True) + + expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa + for v in [True, "true", 1]: + result = store.select( + "df", "boolv == %s" % str(v), columns=["A", "boolv"] + ) tm.assert_frame_equal(expected, result) - expected = (df[df.boolv == False] # noqa - .reindex(columns=['A', 'boolv'])) - for v in [False, 'false', 0]: + expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa + for v in [False, "false", 0]: result = store.select( - 'df', 'boolv == %s' % str(v), columns=['A', 'boolv']) + "df", "boolv == %s" % str(v), columns=["A", "boolv"] + ) tm.assert_frame_equal(expected, result) # integer index df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - _maybe_remove(store, 'df_int') - store.append('df_int', df) - result = store.select( - 'df_int', "index<10 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=['A']) + _maybe_remove(store, "df_int") + store.append("df_int", df) + result = store.select("df_int", "index<10 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) # float index - df = DataFrame(dict(A=np.random.rand( - 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) - _maybe_remove(store, 'df_float') - store.append('df_float', df) - result = store.select( - 'df_float', "index<10.0 and columns=['A']") - expected = df.reindex(index=list(df.index)[0:10], columns=['A']) + df = DataFrame( + dict( + A=np.random.rand(20), + B=np.random.rand(20), + index=np.arange(20, dtype="f8"), + ) + ) + _maybe_remove(store, "df_float") + store.append("df_float", df) + result = store.select("df_float", "index<10.0 and columns=['A']") + expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) with ensure_clean_store(self.path) as store: # floats w/o NaN - df = DataFrame( - dict(cols=range(11), values=range(11)), dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) - store.append('df1', df, data_columns=True) - result = store.select( - 'df1', where='values>2.0') - expected = df[df['values'] > 2.0] + store.append("df1", df, data_columns=True) + result = store.select("df1", where="values>2.0") + expected = df[df["values"] > 2.0] tm.assert_frame_equal(expected, result) # floats with NaN df.iloc[0] = np.nan - expected = df[df['values'] > 2.0] + expected = df[df["values"] > 2.0] - store.append('df2', df, data_columns=True, index=False) - result = store.select( - 'df2', where='values>2.0') + store.append("df2", df, data_columns=True, index=False) + result = 
store.select("df2", where="values>2.0") tm.assert_frame_equal(expected, result) # https://github.com/PyTables/PyTables/issues/282 @@ -2761,16 +2862,14 @@ def test_select_dtypes(self): # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too - df = DataFrame( - dict(cols=range(11), values=range(11)), dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) df.iloc[1] = np.nan - expected = df[df['values'] > 2.0] + expected = df[df["values"] > 2.0] - store.append('df4', df, data_columns=True) - result = store.select( - 'df4', where='values>2.0') + store.append("df4", df, data_columns=True) + result = store.select("df4", where="values>2.0") tm.assert_frame_equal(expected, result) # test selection with comparison against numpy scalar @@ -2778,56 +2877,62 @@ def test_select_dtypes(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - expected = df[df['A'] > 0] + expected = df[df["A"] > 0] - store.append('df', df, data_columns=True) + store.append("df", df, data_columns=True) np_zero = np.float64(0) # noqa - result = store.select('df', where=["A>np_zero"]) + result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result) def test_select_with_many_inputs(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), - A=np.random.randn(300), - B=range(300), - users=['a'] * 50 + ['b'] * 50 + ['c'] * 100 + - ['a%03d' % i for i in range(100)])) - _maybe_remove(store, 'df') - store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) + df = DataFrame( + dict( + ts=bdate_range("2012-01-01", periods=300), + A=np.random.randn(300), + B=range(300), + users=["a"] * 50 + + ["b"] * 50 + + ["c"] * 100 + + ["a%03d" % i for i in range(100)], + ) + ) + _maybe_remove(store, "df") + store.append("df", df, data_columns=["ts", "A", "B", "users"]) # regular select - result = store.select('df', "ts>=Timestamp('2012-02-01')") - expected = df[df.ts >= Timestamp('2012-02-01')] + result = store.select("df", "ts>=Timestamp('2012-02-01')") + expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # small selector result = store.select( - 'df', - "ts>=Timestamp('2012-02-01') & users=['a','b','c']") - expected = df[(df.ts >= Timestamp('2012-02-01')) & - df.users.isin(['a', 'b', 'c'])] + "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']" + ) + expected = df[ + (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"]) + ] tm.assert_frame_equal(expected, result) # big selector along the columns - selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)] + selector = ["a", "b", "c"] + ["a%03d" % i for i in range(60)] result = store.select( - 'df', - "ts>=Timestamp('2012-02-01') and users=selector") - expected = df[(df.ts >= Timestamp('2012-02-01')) & - df.users.isin(selector)] + "df", "ts>=Timestamp('2012-02-01') and users=selector" + ) + expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] tm.assert_frame_equal(expected, result) selector = range(100, 200) - result = store.select('df', 'B=selector') + result = store.select("df", "B=selector") expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) assert len(result) == 100 # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', 'ts=selector') + result = store.select("df", "ts=selector") 
expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) assert len(result) == 100 @@ -2838,65 +2943,67 @@ def test_select_iterator(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame(500) - _maybe_remove(store, 'df') - store.append('df', df) + _maybe_remove(store, "df") + store.append("df", df) - expected = store.select('df') + expected = store.select("df") - results = [s for s in store.select('df', iterator=True)] + results = [s for s in store.select("df", iterator=True)] result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select('df', chunksize=100)] + results = [s for s in store.select("df", chunksize=100)] assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) - results = [s for s in store.select('df', chunksize=150)] + results = [s for s in store.select("df", chunksize=150)] result = concat(results) tm.assert_frame_equal(result, expected) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path, 'df_non_table') + df.to_hdf(path, "df_non_table") with pytest.raises(TypeError): - read_hdf(path, 'df_non_table', chunksize=100) + read_hdf(path, "df_non_table", chunksize=100) with pytest.raises(TypeError): - read_hdf(path, 'df_non_table', iterator=True) + read_hdf(path, "df_non_table", iterator=True) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path, 'df', format='table') + df.to_hdf(path, "df", format="table") - results = [s for s in read_hdf(path, 'df', chunksize=100)] + results = [s for s in read_hdf(path, "df", chunksize=100)] result = concat(results) assert len(results) == 5 tm.assert_frame_equal(result, df) - tm.assert_frame_equal(result, read_hdf(path, 'df')) + tm.assert_frame_equal(result, read_hdf(path, "df")) # multiple with ensure_clean_store(self.path) as store: df1 = tm.makeTimeDataFrame(500) - store.append('df1', df1, data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename( - columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' - store.append('df2', df2) + store.append("df1", df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x) + df2["foo"] = "bar" + store.append("df2", df2) df = concat([df1, df2], axis=1) # full selection - expected = store.select_as_multiple( - ['df1', 'df2'], selector='df1') - results = [s for s in store.select_as_multiple( - ['df1', 'df2'], selector='df1', chunksize=150)] + expected = store.select_as_multiple(["df1", "df2"], selector="df1") + results = [ + s + for s in store.select_as_multiple( + ["df1", "df2"], selector="df1", chunksize=150 + ) + ] result = concat(results) tm.assert_frame_equal(expected, result) @@ -2909,68 +3016,65 @@ def test_select_iterator_complete_8014(self): # no iterator with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/o iteration and no where clause works - result = store.select('df') + result = store.select("df") tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, begin # of range, works where = "index >= '%s'" % beg_dt - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where 
clause, single term, end # of range, works where = "index <= '%s'" % end_dt - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # with iterator, full range with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [s for s in store.select('df', chunksize=chunksize)] + results = [s for s in store.select("df", chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) @@ -2983,51 +3087,48 @@ def test_select_iterator_non_complete_8014(self): # with iterator, non complete range with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[1] end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & - (expected.index <= 
end_dt)] + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] tm.assert_frame_equal(rexpected, result) # with iterator, empty where with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100064, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100064, "S") + _maybe_remove(store, "df") + store.append("df", expected) end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range where = "index > '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert 0 == len(results) def test_select_iterator_many_empty_frames(self): @@ -3040,25 +3141,23 @@ def test_select_iterator_many_empty_frames(self): # with iterator, range limited to the first chunk with ensure_clean_store(self.path) as store: - expected = tm.makeTimeDataFrame(100000, 'S') - _maybe_remove(store, 'df') - store.append('df', expected) + expected = tm.makeTimeDataFrame(100000, "S") + _maybe_remove(store, "df") + store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert len(results) == 1 result = concat(results) @@ -3067,14 +3166,14 @@ def test_select_iterator_many_empty_frames(self): # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be 1, is 10 assert len(results) == 1 result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & - (expected.index <= end_dt)] + rexpected = expected[ + (expected.index >= beg_dt) & (expected.index <= end_dt) + ] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause which selects @@ -3085,8 +3184,7 @@ def test_select_iterator_many_empty_frames(self): # True. 
where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt) - results = [s for s in store.select( - 'df', where=where, chunksize=chunksize)] + results = [s for s in store.select("df", where=where, chunksize=chunksize)] # should be [] assert len(results) == 0 @@ -3097,44 +3195,59 @@ def test_select_iterator_many_empty_frames(self): def test_retain_index_attributes(self): # GH 3499, losing frequency info on index recreation - df = DataFrame(dict( - A=Series(range(3), - index=date_range('2000-1-1', periods=3, freq='H')))) + df = DataFrame( + dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) + ) with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'data') - store.put('data', df, format='table') + _maybe_remove(store, "data") + store.put("data", df, format="table") - result = store.get('data') + result = store.get("data") tm.assert_frame_equal(df, result) - for attr in ['freq', 'tz', 'name']: - for idx in ['index', 'columns']: - assert (getattr(getattr(df, idx), attr, None) == - getattr(getattr(result, idx), attr, None)) + for attr in ["freq", "tz", "name"]: + for idx in ["index", "columns"]: + assert getattr(getattr(df, idx), attr, None) == getattr( + getattr(result, idx), attr, None + ) # try to append a table with a different frequency with catch_warnings(record=True): - df2 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', - periods=3, freq='D')))) - store.append('data', df2) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("data", df2) - assert store.get_storer('data').info['index']['freq'] is None + assert store.get_storer("data").info["index"]["freq"] is None # this is ok - _maybe_remove(store, 'df2') - df2 = DataFrame(dict( - A=Series(range(3), - index=[Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20020101')]))) - store.append('df2', df2) - df3 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', periods=3, - freq='D')))) - store.append('df2', df3) + _maybe_remove(store, "df2") + df2 = DataFrame( + dict( + A=Series( + range(3), + index=[ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20020101"), + ], + ) + ) + ) + store.append("df2", df2) + df3 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + store.append("df2", df3) @pytest.mark.filterwarnings( "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" @@ -3144,60 +3257,66 @@ def test_retain_index_attributes2(self): with catch_warnings(record=True): - df = DataFrame(dict( - A=Series(range(3), - index=date_range('2000-1-1', - periods=3, freq='H')))) - df.to_hdf(path, 'data', mode='w', append=True) - df2 = DataFrame(dict( - A=Series(range(3), - index=date_range('2002-1-1', periods=3, - freq='D')))) - df2.to_hdf(path, 'data', append=True) - - idx = date_range('2000-1-1', periods=3, freq='H') - idx.name = 'foo' + df = DataFrame( + dict( + A=Series( + range(3), index=date_range("2000-1-1", periods=3, freq="H") + ) + ) + ) + df.to_hdf(path, "data", mode="w", append=True) + df2 = DataFrame( + dict( + A=Series( + range(3), index=date_range("2002-1-1", periods=3, freq="D") + ) + ) + ) + df2.to_hdf(path, "data", append=True) + + idx = date_range("2000-1-1", periods=3, freq="H") + idx.name = "foo" df = DataFrame(dict(A=Series(range(3), index=idx))) - df.to_hdf(path, 'data', mode='w', append=True) + df.to_hdf(path, "data", mode="w", append=True) - assert read_hdf(path, 'data').index.name == 'foo' + 
assert read_hdf(path, "data").index.name == "foo" with catch_warnings(record=True): - idx2 = date_range('2001-1-1', periods=3, freq='H') - idx2.name = 'bar' + idx2 = date_range("2001-1-1", periods=3, freq="H") + idx2.name = "bar" df2 = DataFrame(dict(A=Series(range(3), index=idx2))) - df2.to_hdf(path, 'data', append=True) + df2.to_hdf(path, "data", append=True) - assert read_hdf(path, 'data').index.name is None + assert read_hdf(path, "data").index.name is None def test_frame_select(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') + store.put("frame", df, format="table") date = df.index[len(df) // 2] - crit1 = Term('index>=date') - assert crit1.env.scope['date'] == date + crit1 = Term("index>=date") + assert crit1.env.scope["date"] == date - crit2 = ("columns=['A', 'D']") - crit3 = ('columns=A') + crit2 = "columns=['A', 'D']" + crit3 = "columns=A" - result = store.select('frame', [crit1, crit2]) - expected = df.loc[date:, ['A', 'D']] + result = store.select("frame", [crit1, crit2]) + expected = df.loc[date:, ["A", "D"]] tm.assert_frame_equal(result, expected) - result = store.select('frame', [crit3]) - expected = df.loc[:, ['A']] + result = store.select("frame", [crit3]) + expected = df.loc[:, ["A"]] tm.assert_frame_equal(result, expected) # invalid terms df = tm.makeTimeDataFrame() - store.append('df_time', df) + store.append("df_time", df) with pytest.raises(ValueError): - store.select('df_time', "index>0") + store.select("df_time", "index>0") # can't select if not written as table # store['frame'] = df @@ -3208,113 +3327,114 @@ def test_frame_select_complex(self): # select via complex criteria df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.loc[df.index[0:4], 'string'] = 'bar' + df["string"] = "foo" + df.loc[df.index[0:4], "string"] = "bar" with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', data_columns=['string']) + store.put("df", df, format="table", data_columns=["string"]) # empty - result = store.select('df', 'index>df.index[3] & string="bar"') - expected = df.loc[(df.index > df.index[3]) & (df.string == 'bar')] + result = store.select("df", 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")] tm.assert_frame_equal(result, expected) - result = store.select('df', 'index>df.index[3] & string="foo"') - expected = df.loc[(df.index > df.index[3]) & (df.string == 'foo')] + result = store.select("df", 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")] tm.assert_frame_equal(result, expected) # or - result = store.select('df', 'index>df.index[3] | string="bar"') - expected = df.loc[(df.index > df.index[3]) | (df.string == 'bar')] + result = store.select("df", 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")] tm.assert_frame_equal(result, expected) - result = store.select('df', '(index>df.index[3] & ' - 'index<=df.index[6]) | string="bar"') - expected = df.loc[((df.index > df.index[3]) & ( - df.index <= df.index[6])) | (df.string == 'bar')] + result = store.select( + "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + ) + expected = df.loc[ + ((df.index > df.index[3]) & (df.index <= df.index[6])) + | (df.string == "bar") + ] tm.assert_frame_equal(result, expected) # invert - result = store.select('df', 'string!="bar"') - expected = df.loc[df.string != 'bar'] + result = store.select("df", 'string!="bar"') + 
expected = df.loc[df.string != "bar"] tm.assert_frame_equal(result, expected) # invert not implemented in numexpr :( with pytest.raises(NotImplementedError): - store.select('df', '~(string="bar")') + store.select("df", '~(string="bar")') # invert ok for filters - result = store.select('df', "~(columns=['A','B'])") - expected = df.loc[:, df.columns.difference(['A', 'B'])] + result = store.select("df", "~(columns=['A','B'])") + expected = df.loc[:, df.columns.difference(["A", "B"])] tm.assert_frame_equal(result, expected) # in - result = store.select( - 'df', "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index > df.index[3]].reindex(columns=[ - 'A', 'B']) + result = store.select("df", "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_frame_select_complex2(self): - with ensure_clean_path(['parms.hdf', 'hist.hdf']) as paths: + with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths: pp, hh = paths # use non-trivial selection criteria - parms = DataFrame({'A': [1, 1, 2, 2, 3]}) - parms.to_hdf(pp, 'df', mode='w', - format='table', data_columns=['A']) + parms = DataFrame({"A": [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"]) - selection = read_hdf(pp, 'df', where='A=[2,3]') - hist = DataFrame(np.random.randn(25, 1), - columns=['data'], - index=MultiIndex.from_tuples( - [(i, j) for i in range(5) - for j in range(5)], - names=['l1', 'l2'])) + selection = read_hdf(pp, "df", where="A=[2,3]") + hist = DataFrame( + np.random.randn(25, 1), + columns=["data"], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"] + ), + ) - hist.to_hdf(hh, 'df', mode='w', format='table') + hist.to_hdf(hh, "df", mode="w", format="table") - expected = read_hdf(hh, 'df', where='l1=[2, 3, 4]') + expected = read_hdf(hh, "df", where="l1=[2, 3, 4]") # scope with list like l = selection.index.tolist() # noqa store = HDFStore(hh) - result = store.select('df', where='l1=l') + result = store.select("df", where="l1=l") assert_frame_equal(result, expected) store.close() - result = read_hdf(hh, 'df', where='l1=l') + result = read_hdf(hh, "df", where="l1=l") assert_frame_equal(result, expected) # index index = selection.index # noqa - result = read_hdf(hh, 'df', where='l1=index') + result = read_hdf(hh, "df", where="l1=index") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=selection.index') + result = read_hdf(hh, "df", where="l1=selection.index") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=selection.index.tolist()') + result = read_hdf(hh, "df", where="l1=selection.index.tolist()") assert_frame_equal(result, expected) - result = read_hdf(hh, 'df', where='l1=list(selection.index)') + result = read_hdf(hh, "df", where="l1=list(selection.index)") assert_frame_equal(result, expected) # scope with index store = HDFStore(hh) - result = store.select('df', where='l1=index') + result = store.select("df", where="l1=index") assert_frame_equal(result, expected) - result = store.select('df', where='l1=selection.index') + result = store.select("df", where="l1=selection.index") assert_frame_equal(result, expected) - result = store.select('df', where='l1=selection.index.tolist()') + result = store.select("df", where="l1=selection.index.tolist()") assert_frame_equal(result, expected) - result = store.select('df', where='l1=list(selection.index)') + result = 
store.select("df", where="l1=list(selection.index)") assert_frame_equal(result, expected) store.close() @@ -3326,15 +3446,15 @@ def test_invalid_filtering(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - store.put('df', df, format='table') + store.put("df", df, format="table") # not implemented with pytest.raises(NotImplementedError): - store.select('df', "columns=['A'] | columns=['B']") + store.select("df", "columns=['A'] | columns=['B']") # in theory we could deal with this with pytest.raises(NotImplementedError): - store.select('df', "columns=['A','B'] & columns=['C']") + store.select("df", "columns=['A','B'] & columns=['C']") def test_string_select(self): # GH 2973 @@ -3343,18 +3463,18 @@ def test_string_select(self): df = tm.makeTimeDataFrame() # test string ==/!= - df['x'] = 'none' - df.loc[2:7, 'x'] = '' + df["x"] = "none" + df.loc[2:7, "x"] = "" - store.append('df', df, data_columns=['x']) + store.append("df", df, data_columns=["x"]) - result = store.select('df', 'x=none') - expected = df[df.x == 'none'] + result = store.select("df", "x=none") + expected = df[df.x == "none"] assert_frame_equal(result, expected) try: - result = store.select('df', 'x!=none') - expected = df[df.x != 'none'] + result = store.select("df", "x!=none") + expected = df[df.x != "none"] assert_frame_equal(result, expected) except Exception as detail: pprint_thing("[{0}]".format(detail)) @@ -3362,24 +3482,24 @@ def test_string_select(self): pprint_thing(expected) df2 = df.copy() - df2.loc[df2.x == '', 'x'] = np.nan + df2.loc[df2.x == "", "x"] = np.nan - store.append('df2', df2, data_columns=['x']) - result = store.select('df2', 'x!=none') + store.append("df2", df2, data_columns=["x"]) + result = store.select("df2", "x!=none") expected = df2[isna(df2.x)] assert_frame_equal(result, expected) # int ==/!= - df['int'] = 1 - df.loc[2:7, 'int'] = 2 + df["int"] = 1 + df.loc[2:7, "int"] = 2 - store.append('df3', df, data_columns=['int']) + store.append("df3", df, data_columns=["int"]) - result = store.select('df3', 'int=2') + result = store.select("df3", "int=2") expected = df[df.int == 2] assert_frame_equal(result, expected) - result = store.select('df3', 'int!=2') + result = store.select("df3", "int!=2") expected = df[df.int != 2] assert_frame_equal(result, expected) @@ -3388,71 +3508,70 @@ def test_read_column(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') + _maybe_remove(store, "df") # GH 17912 # HDFStore.select_column should raise a KeyError # exception if the key is not a valid store - with pytest.raises(KeyError, - match='No object named df in the file'): - store.select_column('df', 'index') + with pytest.raises(KeyError, match="No object named df in the file"): + store.select_column("df", "index") - store.append('df', df) + store.append("df", df) # error with pytest.raises(KeyError): - store.select_column('df', 'foo') + store.select_column("df", "foo") with pytest.raises(Exception): - store.select_column('df', 'index', where=['index>5']) + store.select_column("df", "index", where=["index>5"]) # valid - result = store.select_column('df', 'index') + result = store.select_column("df", "index") tm.assert_almost_equal(result.values, Series(df.index).values) assert isinstance(result, Series) # not a data indexable column with pytest.raises(ValueError): - store.select_column('df', 'values_block_0') + store.select_column("df", "values_block_0") # a data column df2 = df.copy() - df2['string'] = 'foo' - store.append('df2', df2, 
data_columns=['string']) - result = store.select_column('df2', 'string') - tm.assert_almost_equal(result.values, df2['string'].values) + df2["string"] = "foo" + store.append("df2", df2, data_columns=["string"]) + result = store.select_column("df2", "string") + tm.assert_almost_equal(result.values, df2["string"].values) # a data column with NaNs, result excludes the NaNs df3 = df.copy() - df3['string'] = 'foo' - df3.loc[4:6, 'string'] = np.nan - store.append('df3', df3, data_columns=['string']) - result = store.select_column('df3', 'string') - tm.assert_almost_equal(result.values, df3['string'].values) + df3["string"] = "foo" + df3.loc[4:6, "string"] = np.nan + store.append("df3", df3, data_columns=["string"]) + result = store.select_column("df3", "string") + tm.assert_almost_equal(result.values, df3["string"].values) # start/stop - result = store.select_column('df3', 'string', start=2) - tm.assert_almost_equal(result.values, df3['string'].values[2:]) + result = store.select_column("df3", "string", start=2) + tm.assert_almost_equal(result.values, df3["string"].values[2:]) - result = store.select_column('df3', 'string', start=-2) - tm.assert_almost_equal(result.values, df3['string'].values[-2:]) + result = store.select_column("df3", "string", start=-2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:]) - result = store.select_column('df3', 'string', stop=2) - tm.assert_almost_equal(result.values, df3['string'].values[:2]) + result = store.select_column("df3", "string", stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[:2]) - result = store.select_column('df3', 'string', stop=-2) - tm.assert_almost_equal(result.values, df3['string'].values[:-2]) + result = store.select_column("df3", "string", stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[:-2]) - result = store.select_column('df3', 'string', start=2, stop=-2) - tm.assert_almost_equal(result.values, df3['string'].values[2:-2]) + result = store.select_column("df3", "string", start=2, stop=-2) + tm.assert_almost_equal(result.values, df3["string"].values[2:-2]) - result = store.select_column('df3', 'string', start=-2, stop=2) - tm.assert_almost_equal(result.values, df3['string'].values[-2:2]) + result = store.select_column("df3", "string", start=-2, stop=2) + tm.assert_almost_equal(result.values, df3["string"].values[-2:2]) # GH 10392 - make sure column name is preserved - df4 = DataFrame({'A': np.random.randn(10), 'B': 'foo'}) - store.append('df4', df4, data_columns=True) - expected = df4['B'] - result = store.select_column('df4', 'B') + df4 = DataFrame({"A": np.random.randn(10), "B": "foo"}) + store.append("df4", df4, data_columns=True) + expected = df4["B"] + result = store.select_column("df4", "B") tm.assert_series_equal(result, expected) def test_coordinates(self): @@ -3460,42 +3579,42 @@ def test_coordinates(self): with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df') - store.append('df', df) + _maybe_remove(store, "df") + store.append("df", df) # all - c = store.select_as_coordinates('df') - assert((c.values == np.arange(len(df.index))).all()) + c = store.select_as_coordinates("df") + assert (c.values == np.arange(len(df.index))).all() # get coordinates back & test vs frame - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = DataFrame(dict(A=range(5), B=range(5))) - store.append('df', df) - c = store.select_as_coordinates('df', ['index<3']) - assert((c.values == np.arange(3)).all()) - result = store.select('df', where=c) + store.append("df", df) + c = 
store.select_as_coordinates("df", ["index<3"]) + assert (c.values == np.arange(3)).all() + result = store.select("df", where=c) expected = df.loc[0:2, :] tm.assert_frame_equal(result, expected) - c = store.select_as_coordinates('df', ['index>=3', 'index<=4']) - assert((c.values == np.arange(2) + 3).all()) - result = store.select('df', where=c) + c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) + assert (c.values == np.arange(2) + 3).all() + result = store.select("df", where=c) expected = df.loc[3:4, :] tm.assert_frame_equal(result, expected) assert isinstance(c, Index) # multiple tables - _maybe_remove(store, 'df1') - _maybe_remove(store, 'df2') + _maybe_remove(store, "df1") + _maybe_remove(store, "df2") df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - store.append('df1', df1, data_columns=['A', 'B']) - store.append('df2', df2) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) - c = store.select_as_coordinates('df1', ['A>0', 'B>0']) - df1_result = store.select('df1', c) - df2_result = store.select('df2', c) + c = store.select_as_coordinates("df1", ["A>0", "B>0"]) + df1_result = store.select("df1", c) + df2_result = store.select("df2", c) result = concat([df1_result, df2_result], axis=1) expected = concat([df1, df2], axis=1) @@ -3505,63 +3624,64 @@ def test_coordinates(self): # pass array/mask as the coordinates with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(1000, 2), - index=date_range('20000101', periods=1000)) - store.append('df', df) - c = store.select_column('df', 'index') + df = DataFrame( + np.random.randn(1000, 2), index=date_range("20000101", periods=1000) + ) + store.append("df", df) + c = store.select_column("df", "index") where = c[DatetimeIndex(c).month == 5].index expected = df.iloc[where] # locations - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # boolean - result = store.select('df', where=where) + result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # invalid with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df), dtype='float64')) + store.select("df", where=np.arange(len(df), dtype="float64")) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df) + 1)) + store.select("df", where=np.arange(len(df) + 1)) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df)), start=5) + store.select("df", where=np.arange(len(df)), start=5) with pytest.raises(ValueError): - store.select('df', where=np.arange(len(df)), start=5, stop=10) + store.select("df", where=np.arange(len(df)), start=5, stop=10) # selection with filter - selection = date_range('20000101', periods=500) - result = store.select('df', where='index in selection') + selection = date_range("20000101", periods=500) + result = store.select("df", where="index in selection") expected = df[df.index.isin(selection)] tm.assert_frame_equal(result, expected) # list df = DataFrame(np.random.randn(10, 2)) - store.append('df2', df) - result = store.select('df2', where=[0, 3, 5]) + store.append("df2", df) + result = store.select("df2", where=[0, 3, 5]) expected = df.iloc[[0, 3, 5]] tm.assert_frame_equal(result, expected) # boolean where = [True] * 10 where[-2] = False - result = store.select('df2', where=where) + result = store.select("df2", where=where) expected = df.loc[where] tm.assert_frame_equal(result, expected) # start/stop - result = 
store.select('df2', start=5, stop=10) + result = store.select("df2", start=5, stop=10) expected = df[5:10] tm.assert_frame_equal(result, expected) def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' + df2["foo"] = "bar" df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: @@ -3569,148 +3689,154 @@ def test_append_to_multiple(self): # exceptions with pytest.raises(ValueError): store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df3') + {"df1": ["A", "B"], "df2": None}, df, selector="df3" + ) with pytest.raises(ValueError): - store.append_to_multiple( - {'df1': None, 'df2': None}, df, selector='df3') + store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3") with pytest.raises(ValueError): - store.append_to_multiple('df1', df, 'df1') + store.append_to_multiple("df1", df, "df1") # regular operation store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1') + {"df1": ["A", "B"], "df2": None}, df, selector="df1" + ) result = store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) expected = df[(df.A > 0) & (df.B > 0)] tm.assert_frame_equal(result, expected) def test_append_to_multiple_dropna(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: # dropna=True should guarantee rows are synchronized store.append_to_multiple( - {'df1': ['A', 'B'], 'df2': None}, df, selector='df1', - dropna=True) - result = store.select_as_multiple(['df1', 'df2']) + {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True + ) + result = store.select_as_multiple(["df1", "df2"]) expected = df.dropna() tm.assert_frame_equal(result, expected) - tm.assert_index_equal(store.select('df1').index, - store.select('df2').index) + tm.assert_index_equal(store.select("df1").index, store.select("df2").index) - @pytest.mark.xfail(run=False, - reason="append_to_multiple_dropna_false " - "is not raising as failed") + @pytest.mark.xfail( + run=False, reason="append_to_multiple_dropna_false " "is not raising as failed" + ) def test_append_to_multiple_dropna_false(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df1.iloc[1, df1.columns.get_indexer(['A', 'B'])] = np.nan + df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan df = concat([df1, df2], axis=1) with ensure_clean_store(self.path) as store: # dropna=False shouldn't synchronize row indexes store.append_to_multiple( - {'df1a': ['A', 'B'], 'df2a': None}, df, selector='df1a', - dropna=False) + {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False + ) with pytest.raises(ValueError): - store.select_as_multiple(['df1a', 'df2a']) + store.select_as_multiple(["df1a", "df2a"]) - assert not store.select('df1a').index.equals( - store.select('df2a').index) + assert not store.select("df1a").index.equals(store.select("df2a").index) def test_select_as_multiple(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns=lambda x: "%s_2" % x) - df2['foo'] = 'bar' + df2["foo"] = "bar" with ensure_clean_store(self.path) as store: # no tables stored with pytest.raises(Exception): - 
store.select_as_multiple( - None, where=['A>0', 'B>0'], selector='df1') + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") - store.append('df1', df1, data_columns=['A', 'B']) - store.append('df2', df2) + store.append("df1", df1, data_columns=["A", "B"]) + store.append("df2", df2) # exceptions with pytest.raises(Exception): - store.select_as_multiple(None, where=['A>0', 'B>0'], - selector='df1') + store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1") with pytest.raises(Exception): - store.select_as_multiple([None], where=['A>0', 'B>0'], - selector='df1') + store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1") with pytest.raises(KeyError): store.select_as_multiple( - ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) with pytest.raises(KeyError): - store.select_as_multiple( - ['df3'], where=['A>0', 'B>0'], selector='df1') + store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1") with pytest.raises(KeyError): store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df4') + ["df1", "df2"], where=["A>0", "B>0"], selector="df4" + ) # default select - result = store.select('df1', ['A>0', 'B>0']) + result = store.select("df1", ["A>0", "B>0"]) expected = store.select_as_multiple( - ['df1'], where=['A>0', 'B>0'], selector='df1') + ["df1"], where=["A>0", "B>0"], selector="df1" + ) tm.assert_frame_equal(result, expected) expected = store.select_as_multiple( - 'df1', where=['A>0', 'B>0'], selector='df1') + "df1", where=["A>0", "B>0"], selector="df1" + ) tm.assert_frame_equal(result, expected) # multiple result = store.select_as_multiple( - ['df1', 'df2'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df2"], where=["A>0", "B>0"], selector="df1" + ) expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected) # multiple (diff selector) result = store.select_as_multiple( - ['df1', 'df2'], where='index>df2.index[4]', selector='df2') + ["df1", "df2"], where="index>df2.index[4]", selector="df2" + ) expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) # test exception for diff rows - store.append('df3', tm.makeTimeDataFrame(nper=50)) + store.append("df3", tm.makeTimeDataFrame(nper=50)) with pytest.raises(ValueError): store.select_as_multiple( - ['df1', 'df3'], where=['A>0', 'B>0'], selector='df1') + ["df1", "df3"], where=["A>0", "B>0"], selector="df1" + ) @pytest.mark.skipif( - LooseVersion(tables.__version__) < LooseVersion('3.1.0'), - reason=("tables version does not support fix for nan selection " - "bug: GH 4858")) + LooseVersion(tables.__version__) < LooseVersion("3.1.0"), + reason=( + "tables version does not support fix for nan selection " "bug: GH 4858" + ), + ) def test_nan_selection_bug_4858(self): with ensure_clean_store(self.path) as store: - df = DataFrame(dict(cols=range(6), values=range(6)), - dtype='float64') - df['cols'] = (df['cols'] + 10).apply(str) + df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64") + df["cols"] = (df["cols"] + 10).apply(str) df.iloc[0] = np.nan - expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'], values=[ - 3., 4., 5.]), index=[3, 4, 5]) + expected = DataFrame( + dict(cols=["13.0", "14.0", "15.0"], values=[3.0, 4.0, 5.0]), + index=[3, 4, 5], + ) # write w/o the index on that particular column - store.append('df', df, data_columns=True, index=['cols']) - result = 
store.select('df', where='values>2.0') + store.append("df", df, data_columns=True, index=["cols"]) + result = store.select("df", where="values>2.0") assert_frame_equal(result, expected) def test_start_stop_table(self): @@ -3719,18 +3845,16 @@ def test_start_stop_table(self): # table df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) - store.append('df', df) + store.append("df", df) - result = store.select( - 'df', "columns=['A']", start=0, stop=5) - expected = df.loc[0:4, ['A']] + result = store.select("df", "columns=['A']", start=0, stop=5) + expected = df.loc[0:4, ["A"]] tm.assert_frame_equal(result, expected) # out of range - result = store.select( - 'df', "columns=['A']", start=30, stop=40) + result = store.select("df", "columns=['A']", start=30, stop=40) assert len(result) == 0 - expected = df.loc[30:40, ['A']] + expected = df.loc[30:40, ["A"]] tm.assert_frame_equal(result, expected) def test_start_stop_multiple(self): @@ -3740,12 +3864,13 @@ def test_start_stop_multiple(self): df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) - store.append_to_multiple({'selector': ['foo'], 'data': None}, df, - selector='selector') - result = store.select_as_multiple(['selector', 'data'], - selector='selector', start=0, - stop=1) - expected = df.loc[[0], ['foo', 'bar']] + store.append_to_multiple( + {"selector": ["foo"], "data": None}, df, selector="selector" + ) + result = store.select_as_multiple( + ["selector", "data"], selector="selector", start=0, stop=1 + ) + expected = df.loc[[0], ["foo", "bar"]] tm.assert_frame_equal(result, expected) @ignore_sparse @@ -3755,35 +3880,33 @@ def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: # fixed, GH 8287 - df = DataFrame(dict(A=np.random.rand(20), - B=np.random.rand(20)), - index=pd.date_range('20130101', periods=20)) - store.put('df', df) + df = DataFrame( + dict(A=np.random.rand(20), B=np.random.rand(20)), + index=pd.date_range("20130101", periods=20), + ) + store.put("df", df) - result = store.select( - 'df', start=0, stop=5) + result = store.select("df", start=0, stop=5) expected = df.iloc[0:5, :] tm.assert_frame_equal(result, expected) - result = store.select( - 'df', start=5, stop=10) + result = store.select("df", start=5, stop=10) expected = df.iloc[5:10, :] tm.assert_frame_equal(result, expected) # out of range - result = store.select( - 'df', start=30, stop=40) + result = store.select("df", start=30, stop=40) expected = df.iloc[30:40, :] tm.assert_frame_equal(result, expected) # series s = df.A - store.put('s', s) - result = store.select('s', start=0, stop=5) + store.put("s", s) + result = store.select("s", start=0, stop=5) expected = s.iloc[0:5] tm.assert_series_equal(result, expected) - result = store.select('s', start=5, stop=10) + result = store.select("s", start=5, stop=10) expected = s.iloc[5:10] tm.assert_series_equal(result, expected) @@ -3792,46 +3915,50 @@ def test_start_stop_fixed(self): df.iloc[3:5, 1:3] = np.nan df.iloc[8:10, -2] = np.nan dfs = df.to_sparse() - store.put('dfs', dfs) + store.put("dfs", dfs) with pytest.raises(NotImplementedError): - store.select('dfs', start=0, stop=5) + store.select("dfs", start=0, stop=5) def test_select_filter_corner(self): df = DataFrame(np.random.randn(50, 100)) - df.index = ['%.3d' % c for c in df.index] - df.columns = ['%.3d' % c for c in df.columns] + df.index = ["%.3d" % c for c in df.index] + df.columns = ["%.3d" % c for c in df.columns] with ensure_clean_store(self.path) as store: - store.put('frame', df, format='table') + store.put("frame", df, 
format="table") - crit = 'columns=df.columns[:75]' - result = store.select('frame', [crit]) + crit = "columns=df.columns[:75]" + result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75]]) - crit = 'columns=df.columns[:75:2]' - result = store.select('frame', [crit]) + crit = "columns=df.columns[:75:2]" + result = store.select("frame", [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) def test_path_pathlib(self): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, 'df'), - lambda p: pd.read_hdf(p, 'df')) + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) tm.assert_frame_equal(df, result) - @pytest.mark.parametrize('start, stop', [(0, 2), (1, 2), (None, None)]) + @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)]) def test_contiguous_mixed_data_table(self, start, stop): # GH 17021 # ValueError when reading a contiguous mixed-data table ft. VLArray - df = DataFrame({'a': Series([20111010, 20111011, 20111012]), - 'b': Series(['ab', 'cd', 'ab'])}) + df = DataFrame( + { + "a": Series([20111010, 20111011, 20111012]), + "b": Series(["ab", "cd", "ab"]), + } + ) with ensure_clean_store(self.path) as store: - store.append('test_dataset', df) + store.append("test_dataset", df) - result = store.select('test_dataset', start=start, stop=stop) + result = store.select("test_dataset", start=start, stop=stop) assert_frame_equal(df[start:stop], result) def test_path_pathlib_hdfstore(self): @@ -3839,11 +3966,11 @@ def test_path_pathlib_hdfstore(self): def writer(path): with pd.HDFStore(path) as store: - df.to_hdf(store, 'df') + df.to_hdf(store, "df") def reader(path): with pd.HDFStore(path) as store: - return pd.read_hdf(store, 'df') + return pd.read_hdf(store, "df") result = tm.round_trip_pathlib(writer, reader) tm.assert_frame_equal(df, result) @@ -3851,8 +3978,8 @@ def reader(path): def test_pickle_path_localpath(self): df = tm.makeDataFrame() result = tm.round_trip_pathlib( - lambda p: df.to_hdf(p, 'df'), - lambda p: pd.read_hdf(p, 'df')) + lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df") + ) tm.assert_frame_equal(df, result) def test_path_localpath_hdfstore(self): @@ -3860,11 +3987,11 @@ def test_path_localpath_hdfstore(self): def writer(path): with pd.HDFStore(path) as store: - df.to_hdf(store, 'df') + df.to_hdf(store, "df") def reader(path): with pd.HDFStore(path) as store: - return pd.read_hdf(store, 'df') + return pd.read_hdf(store, "df") result = tm.round_trip_localpath(writer, reader) tm.assert_frame_equal(df, result) @@ -3873,35 +4000,34 @@ def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: - options['complib'] = _default_compressor + options["complib"] = _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store['obj'] = obj - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + store["obj"] = obj + retrieved = store["obj"] comparator(retrieved, obj, **kwargs) - def _check_double_roundtrip(self, obj, comparator, compression=False, - **kwargs): + def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} if compression: - options['complib'] = compression or _default_compressor + options["complib"] = compression or _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store['obj'] = obj - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + 
store["obj"] = obj + retrieved = store["obj"] comparator(retrieved, obj, **kwargs) - store['obj'] = retrieved - again = store['obj'] + store["obj"] = retrieved + again = store["obj"] comparator(again, obj, **kwargs) def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: - options['complib'] = _default_compressor + options["complib"] = _default_compressor - with ensure_clean_store(self.path, 'w', **options) as store: - store.put('obj', obj, format='table') - retrieved = store['obj'] + with ensure_clean_store(self.path, "w", **options) as store: + store.put("obj", obj, format="table") + retrieved = store["obj"] comparator(retrieved, obj) @@ -3911,15 +4037,15 @@ def test_multiple_open_close(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', mode='w', format='table') + df.to_hdf(path, "df", mode="w", format="table") # single store = HDFStore(path) - assert 'CLOSED' not in store.info() + assert "CLOSED" not in store.info() assert store.is_open store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open with ensure_clean_path(self.path) as path: @@ -3939,55 +4065,55 @@ def test_multiple_open_close(self): store1 = HDFStore(path) store2 = HDFStore(path) - assert 'CLOSED' not in store1.info() - assert 'CLOSED' not in store2.info() + assert "CLOSED" not in store1.info() + assert "CLOSED" not in store2.info() assert store1.is_open assert store2.is_open store1.close() - assert 'CLOSED' in store1.info() + assert "CLOSED" in store1.info() assert not store1.is_open - assert 'CLOSED' not in store2.info() + assert "CLOSED" not in store2.info() assert store2.is_open store2.close() - assert 'CLOSED' in store1.info() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store1.info() + assert "CLOSED" in store2.info() assert not store1.is_open assert not store2.is_open # nested close - store = HDFStore(path, mode='w') - store.append('df', df) + store = HDFStore(path, mode="w") + store.append("df", df) store2 = HDFStore(path) - store2.append('df2', df) + store2.append("df2", df) store2.close() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store2.info() assert not store2.is_open store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open # double closing - store = HDFStore(path, mode='w') - store.append('df', df) + store = HDFStore(path, mode="w") + store.append("df", df) store2 = HDFStore(path) store.close() - assert 'CLOSED' in store.info() + assert "CLOSED" in store.info() assert not store.is_open store2.close() - assert 'CLOSED' in store2.info() + assert "CLOSED" in store2.info() assert not store2.is_open # ops on a closed store with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path, 'df', mode='w', format='table') + df.to_hdf(path, "df", mode="w", format="table") store = HDFStore(path) store.close() @@ -3996,53 +4122,54 @@ def test_multiple_open_close(self): store.keys() with pytest.raises(ClosedFileError): - 'df' in store + "df" in store with pytest.raises(ClosedFileError): len(store) with pytest.raises(ClosedFileError): - store['df'] + store["df"] with pytest.raises(AttributeError): store.df with pytest.raises(ClosedFileError): - store.select('df') + store.select("df") with pytest.raises(ClosedFileError): - store.get('df') + store.get("df") with pytest.raises(ClosedFileError): - store.append('df2', df) + store.append("df2", df) with pytest.raises(ClosedFileError): - store.put('df3', 
df) + store.put("df3", df) with pytest.raises(ClosedFileError): - store.get_storer('df2') + store.get_storer("df2") with pytest.raises(ClosedFileError): - store.remove('df2') + store.remove("df2") - with pytest.raises(ClosedFileError, match='file is not open'): - store.select('df') + with pytest.raises(ClosedFileError, match="file is not open"): + store.select("df") def test_pytables_native_read(self, datapath): with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), - mode='r') as store: - d2 = store['detector/readout'] + datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r" + ) as store: + d2 = store["detector/readout"] assert isinstance(d2, DataFrame) - @pytest.mark.skipif(is_platform_windows(), - reason="native2 read fails oddly on windows") + @pytest.mark.skipif( + is_platform_windows(), reason="native2 read fails oddly on windows" + ) def test_pytables_native2_read(self, datapath): with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), - mode='r') as store: + datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r" + ) as store: str(store) - d1 = store['detector'] + d1 = store["detector"] assert isinstance(d1, DataFrame) @xfail_non_writeable @@ -4050,46 +4177,43 @@ def test_legacy_table_fixed_format_read_py2(self, datapath): # GH 24510 # legacy table with fixed format written in Python 2 with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'legacy_table_fixed_py2.h5'), - mode='r') as store: - result = store.select('df') - expected = pd.DataFrame([[1, 2, 3, 'D']], - columns=['A', 'B', 'C', 'D'], - index=pd.Index(['ABC'], - name='INDEX_NAME')) + datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r" + ) as store: + result = store.select("df") + expected = pd.DataFrame( + [[1, 2, 3, "D"]], + columns=["A", "B", "C", "D"], + index=pd.Index(["ABC"], name="INDEX_NAME"), + ) assert_frame_equal(expected, result) def test_legacy_table_read_py2(self, datapath): # issue: 24925 # legacy table written in Python 2 with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'legacy_table_py2.h5'), - mode='r') as store: - result = store.select('table') - - expected = pd.DataFrame({ - "a": ["a", "b"], - "b": [2, 3] - }) + datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r" + ) as store: + result = store.select("table") + + expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]}) assert_frame_equal(expected, result) def test_copy(self): with catch_warnings(record=True): - def do_copy(f, new_f=None, keys=None, - propindexes=True, **kwargs): + def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): try: - store = HDFStore(f, 'r') + store = HDFStore(f, "r") if new_f is None: import tempfile + fd, new_f = tempfile.mkstemp() tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs) + new_f, keys=keys, propindexes=propindexes, **kwargs + ) # check keys if keys is None: @@ -4125,7 +4249,7 @@ def do_copy(f, new_f=None, keys=None, try: path = create_tempfile(self.path) st = HDFStore(path) - st.append('df', df, data_columns=['A']) + st.append("df", df, data_columns=["A"]) st.close() do_copy(f=path) do_copy(f=path, propindexes=False) @@ -4137,16 +4261,16 @@ def test_store_datetime_fractional_secs(self): with ensure_clean_store(self.path) as store: dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456) series = Series([0], [dt]) - store['a'] = series - assert store['a'].index[0] == dt + store["a"] = series + assert store["a"].index[0] == dt def 
test_tseries_indices_series(self): with ensure_clean_store(self.path) as store: idx = tm.makeDateIndex(10) ser = Series(np.random.randn(len(idx)), idx) - store['a'] = ser - result = store['a'] + store["a"] = ser + result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq @@ -4154,8 +4278,8 @@ def test_tseries_indices_series(self): idx = tm.makePeriodIndex(10) ser = Series(np.random.randn(len(idx)), idx) - store['a'] = ser - result = store['a'] + store["a"] = ser + result = store["a"] tm.assert_series_equal(result, ser) assert result.index.freq == ser.index.freq @@ -4166,27 +4290,25 @@ def test_tseries_indices_frame(self): with ensure_clean_store(self.path) as store: idx = tm.makeDateIndex(10) df = DataFrame(np.random.randn(len(idx), 3), index=idx) - store['a'] = df - result = store['a'] + store["a"] = df + result = store["a"] assert_frame_equal(result, df) assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, - obj="dataframe index") + tm.assert_class_equal(result.index, df.index, obj="dataframe index") idx = tm.makePeriodIndex(10) df = DataFrame(np.random.randn(len(idx), 3), idx) - store['a'] = df - result = store['a'] + store["a"] = df + result = store["a"] assert_frame_equal(result, df) assert result.index.freq == df.index.freq - tm.assert_class_equal(result.index, df.index, - obj="dataframe index") + tm.assert_class_equal(result.index, df.index, obj="dataframe index") def test_unicode_index(self): - unicode_values = ['\u03c3', '\u03c3\u03c3'] + unicode_values = ["\u03c3", "\u03c3\u03c3"] # PerformanceWarning with catch_warnings(record=True): @@ -4196,26 +4318,25 @@ def test_unicode_index(self): def test_unicode_longer_encoded(self): # GH 11234 - char = '\u0394' - df = pd.DataFrame({'A': [char]}) + char = "\u0394" + df = pd.DataFrame({"A": [char]}) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', encoding='utf-8') - result = store.get('df') + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") tm.assert_frame_equal(result, df) - df = pd.DataFrame({'A': ['a', char], 'B': ['b', 'b']}) + df = pd.DataFrame({"A": ["a", char], "B": ["b", "b"]}) with ensure_clean_store(self.path) as store: - store.put('df', df, format='table', encoding='utf-8') - result = store.get('df') + store.put("df", df, format="table", encoding="utf-8") + result = store.get("df") tm.assert_frame_equal(result, df) @xfail_non_writeable def test_store_datetime_mixed(self): - df = DataFrame( - {'a': [1, 2, 3], 'b': [1., 2., 3.], 'c': ['a', 'b', 'c']}) + df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) ts = tm.makeTimeSeries() - df['d'] = ts.index[:3] + df["d"] = ts.index[:3] self._check_roundtrip(df, tm.assert_frame_equal) # def test_cant_write_multiindex_table(self): @@ -4229,13 +4350,13 @@ def test_store_datetime_mixed(self): def test_append_with_diff_col_name_types_raises_value_error(self): df = DataFrame(np.random.randn(10, 1)) - df2 = DataFrame({'a': np.random.randn(10)}) + df2 = DataFrame({"a": np.random.randn(10)}) df3 = DataFrame({(1, 2): np.random.randn(10)}) - df4 = DataFrame({('1', 2): np.random.randn(10)}) - df5 = DataFrame({('1', 2, object): np.random.randn(10)}) + df4 = DataFrame({("1", 2): np.random.randn(10)}) + df5 = DataFrame({("1", 2, object): np.random.randn(10)}) with ensure_clean_store(self.path) as store: - name = 'df_%s' % tm.rands(10) + name = "df_%s" % tm.rands(10) store.append(name, df) for d in (df2, df3, df4, df5): @@ -4243,13 
+4364,16 @@ def test_append_with_diff_col_name_types_raises_value_error(self): store.append(name, d) def test_query_with_nested_special_character(self): - df = DataFrame({'a': ['a', 'a', 'c', 'b', - 'test & test', 'c', 'b', 'e'], - 'b': [1, 2, 3, 4, 5, 6, 7, 8]}) - expected = df[df.a == 'test & test'] + df = DataFrame( + { + "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"], + "b": [1, 2, 3, 4, 5, 6, 7, 8], + } + ) + expected = df[df.a == "test & test"] with ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) - result = store.select('test', 'a = "test & test"') + store.append("test", df, format="table", data_columns=True) + result = store.select("test", 'a = "test & test"') tm.assert_frame_equal(expected, result) def test_categorical(self): @@ -4257,111 +4381,126 @@ def test_categorical(self): with ensure_clean_store(self.path) as store: # Basic - _maybe_remove(store, 's') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], ordered=False)) - store.append('s', s, format='table') - result = store.select('s') + _maybe_remove(store, "s") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s", s, format="table") + result = store.select("s") tm.assert_series_equal(s, result) - _maybe_remove(store, 's_ordered') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], ordered=True)) - store.append('s_ordered', s, format='table') - result = store.select('s_ordered') + _maybe_remove(store, "s_ordered") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=True, + ) + ) + store.append("s_ordered", s, format="table") + result = store.select("s_ordered") tm.assert_series_equal(s, result) - _maybe_remove(store, 'df') + _maybe_remove(store, "df") df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) - store.append('df', df, format='table') - result = store.select('df') + store.append("df", df, format="table") + result = store.select("df") tm.assert_frame_equal(result, df) # Dtypes - _maybe_remove(store, 'si') - s = Series([1, 1, 2, 2, 3, 4, 5]).astype('category') - store.append('si', s) - result = store.select('si') + _maybe_remove(store, "si") + s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category") + store.append("si", s) + result = store.select("si") tm.assert_series_equal(result, s) - _maybe_remove(store, 'si2') - s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype('category') - store.append('si2', s) - result = store.select('si2') + _maybe_remove(store, "si2") + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category") + store.append("si2", s) + result = store.select("si2") tm.assert_series_equal(result, s) # Multiple - _maybe_remove(store, 'df2') + _maybe_remove(store, "df2") df2 = df.copy() - df2['s2'] = Series(list('abcdefg')).astype('category') - store.append('df2', df2) - result = store.select('df2') + df2["s2"] = Series(list("abcdefg")).astype("category") + store.append("df2", df2) + result = store.select("df2") tm.assert_frame_equal(result, df2) # Make sure the metadata is OK info = store.info() - assert '/df2 ' in info + assert "/df2 " in info # assert '/df2/meta/values_block_0/meta' in info - assert '/df2/meta/values_block_1/meta' in info + assert "/df2/meta/values_block_1/meta" in info # unordered - _maybe_remove(store, 's2') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ - 'a', 'b', 'c', 'd'], 
ordered=False)) - store.append('s2', s, format='table') - result = store.select('s2') + _maybe_remove(store, "s2") + s = Series( + Categorical( + ["a", "b", "b", "a", "a", "c"], + categories=["a", "b", "c", "d"], + ordered=False, + ) + ) + store.append("s2", s, format="table") + result = store.select("s2") tm.assert_series_equal(result, s) # Query - _maybe_remove(store, 'df3') - store.append('df3', df, data_columns=['s']) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s in ["b","c"]']) + _maybe_remove(store, "df3") + store.append("df3", df, data_columns=["s"]) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s = ["b","c"]']) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s = ["b","c"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['d'])] - result = store.select('df3', where=['s in ["d"]']) + expected = df[df.s.isin(["d"])] + result = store.select("df3", where=['s in ["d"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['f'])] - result = store.select('df3', where=['s in ["f"]']) + expected = df[df.s.isin(["f"])] + result = store.select("df3", where=['s in ["f"]']) tm.assert_frame_equal(result, expected) # Appending with same categories is ok - store.append('df3', df) + store.append("df3", df) df = concat([df, df]) - expected = df[df.s.isin(['b', 'c'])] - result = store.select('df3', where=['s in ["b","c"]']) + expected = df[df.s.isin(["b", "c"])] + result = store.select("df3", where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) # Appending must have the same categories df3 = df.copy() - df3['s'].cat.remove_unused_categories(inplace=True) + df3["s"].cat.remove_unused_categories(inplace=True) with pytest.raises(ValueError): - store.append('df3', df3) + store.append("df3", df3) # Remove, and make sure meta data is removed (its a recursive # removal so should be). - result = store.select('df3/meta/s/meta') + result = store.select("df3/meta/s/meta") assert result is not None - store.remove('df3') + store.remove("df3") with pytest.raises(KeyError): - store.select('df3/meta/s/meta') + store.select("df3/meta/s/meta") def test_categorical_conversion(self): # GH13322 # Check that read_hdf with categorical columns doesn't return rows if # where criteria isn't met. 
- obsids = ['ESP_012345_6789', 'ESP_987654_3210'] - imgids = ['APF00006np', 'APF0001imm'] + obsids = ["ESP_012345_6789", "ESP_987654_3210"] + imgids = ["APF00006np", "APF0001imm"] data = [4.3, 9.8] # Test without categories @@ -4370,38 +4509,40 @@ def test_categorical_conversion(self): # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df', where='obsids=B') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) # Test with categories - df.obsids = df.obsids.astype('category') - df.imgids = df.imgids.astype('category') + df.obsids = df.obsids.astype("category") + df.imgids = df.imgids.astype("category") # We are expecting an empty DataFrame matching types of df expected = df.iloc[[], :] with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df', where='obsids=B') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df", where="obsids=B") tm.assert_frame_equal(result, expected) def test_categorical_nan_only_columns(self): # GH18413 # Check that read_hdf with categorical columns with NaN-only values can # be read back. - df = pd.DataFrame({ - 'a': ['a', 'b', 'c', np.nan], - 'b': [np.nan, np.nan, np.nan, np.nan], - 'c': [1, 2, 3, 4], - 'd': pd.Series([None] * 4, dtype=object) - }) - df['a'] = df.a.astype('category') - df['b'] = df.b.astype('category') - df['d'] = df.b.astype('category') + df = pd.DataFrame( + { + "a": ["a", "b", "c", np.nan], + "b": [np.nan, np.nan, np.nan, np.nan], + "c": [1, 2, 3, 4], + "d": pd.Series([None] * 4, dtype=object), + } + ) + df["a"] = df.a.astype("category") + df["b"] = df.b.astype("category") + df["d"] = df.b.astype("category") expected = df with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', data_columns=True) - result = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table", data_columns=True) + result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) def test_duplicate_column_name(self): @@ -4409,10 +4550,10 @@ def test_duplicate_column_name(self): with ensure_clean_path(self.path) as path: with pytest.raises(ValueError): - df.to_hdf(path, 'df', format='fixed') + df.to_hdf(path, "df", format="fixed") - df.to_hdf(path, 'df', format='table') - other = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) @@ -4423,8 +4564,8 @@ def test_round_trip_equals(self): df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - other = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + other = read_hdf(path, "df") tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) @@ -4434,35 +4575,35 @@ def test_preserve_timedeltaindex_type(self): # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. 
df = DataFrame(np.random.normal(size=(10, 5))) - df.index = timedelta_range( - start='0s', periods=10, freq='1s', name='example') + df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example") with ensure_clean_store(self.path) as store: - store['df'] = df - assert_frame_equal(store['df'], df) + store["df"] = df + assert_frame_equal(store["df"], df) def test_columns_multiindex_modified(self): # BUG: 7212 # read_hdf store.select modified the passed columns parameters # when multi-indexed. - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) data_columns = df.index.names + df.columns.tolist() with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', - mode='a', - append=True, - data_columns=data_columns, - index=False) - cols2load = list('BCD') + df.to_hdf( + path, + "df", + mode="a", + append=True, + data_columns=data_columns, + index=False, + ) + cols2load = list("BCD") cols2load_original = list(cols2load) - df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa + df_loaded = read_hdf(path, "df", columns=cols2load) # noqa assert cols2load_original == cols2load @ignore_natural_naming_warning @@ -4471,11 +4612,18 @@ def test_to_hdf_with_object_column_names(self): # Writing HDF5 table format should only work for string-like # column types - types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex] - types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex, - tm.makeUnicodeIndex] + types_should_fail = [ + tm.makeIntIndex, + tm.makeFloatIndex, + tm.makeDateIndex, + tm.makeTimedeltaIndex, + tm.makePeriodIndex, + ] + types_should_run = [ + tm.makeStringIndex, + tm.makeCategoricalIndex, + tm.makeUnicodeIndex, + ] for index in types_should_fail: df = DataFrame(np.random.randn(10, 2), columns=index(2)) @@ -4483,107 +4631,97 @@ def test_to_hdf_with_object_column_names(self): with catch_warnings(record=True): msg = "cannot have non-object label DataIndexableCol" with pytest.raises(ValueError, match=msg): - df.to_hdf(path, 'df', format='table', - data_columns=True) + df.to_hdf(path, "df", format="table", data_columns=True) for index in types_should_run: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: with catch_warnings(record=True): - df.to_hdf(path, 'df', format='table', data_columns=True) + df.to_hdf(path, "df", format="table", data_columns=True) result = pd.read_hdf( - path, 'df', where="index = [{0}]".format(df.index[0])) - assert(len(result)) + path, "df", where="index = [{0}]".format(df.index[0]) + ) + assert len(result) def test_read_hdf_open_store(self): # GH10330 # No check for non-string path_or-buf, and no test of open store - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w') - direct = read_hdf(path, 'df') - store = HDFStore(path, mode='r') - indirect = read_hdf(store, 'df') + df.to_hdf(path, "df", mode="w") + direct = read_hdf(path, "df") + store = 
HDFStore(path, mode="r") + indirect = read_hdf(store, "df") tm.assert_frame_equal(direct, indirect) assert store.is_open store.close() def test_read_hdf_iterator(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) - df.index.name = 'letters' - df = df.set_index(keys='E', append=True) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) + df.index.name = "letters" + df = df.set_index(keys="E", append=True) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='w', format='t') - direct = read_hdf(path, 'df') - iterator = read_hdf(path, 'df', iterator=True) + df.to_hdf(path, "df", mode="w", format="t") + direct = read_hdf(path, "df") + iterator = read_hdf(path, "df", iterator=True) assert isinstance(iterator, TableIterator) indirect = next(iterator.__iter__()) tm.assert_frame_equal(direct, indirect) iterator.store.close() def test_read_hdf_errors(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with ensure_clean_path(self.path) as path: with pytest.raises(IOError): - read_hdf(path, 'key') + read_hdf(path, "key") - df.to_hdf(path, 'df') - store = HDFStore(path, mode='r') + df.to_hdf(path, "df") + store = HDFStore(path, mode="r") store.close() with pytest.raises(IOError): - read_hdf(store, 'df') + read_hdf(store, "df") def test_read_hdf_generic_buffer_errors(self): with pytest.raises(NotImplementedError): - read_hdf(BytesIO(b''), 'df') + read_hdf(BytesIO(b""), "df") def test_invalid_complib(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with ensure_clean_path(self.path) as path: with pytest.raises(ValueError): - df.to_hdf(path, 'df', complib='foolib') + df.to_hdf(path, "df", complib="foolib") + # GH10443 def test_read_nokey(self): - df = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) # Categorical dtype not supported for "fixed" format. So no need # to test with that dtype in the dataframe here. 
with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='a') + df.to_hdf(path, "df", mode="a") reread = read_hdf(path) assert_frame_equal(df, reread) - df.to_hdf(path, 'df2', mode='a') + df.to_hdf(path, "df2", mode="a") with pytest.raises(ValueError): read_hdf(path) def test_read_nokey_table(self): # GH13231 - df = DataFrame({'i': range(5), - 'c': Series(list('abacd'), dtype='category')}) + df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")}) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', mode='a', format='table') + df.to_hdf(path, "df", mode="a", format="table") reread = read_hdf(path) assert_frame_equal(df, reread) - df.to_hdf(path, 'df2', mode='a', format='table') + df.to_hdf(path, "df2", mode="a", format="table") with pytest.raises(ValueError): read_hdf(path) @@ -4596,121 +4734,123 @@ def test_read_nokey_empty(self): with pytest.raises(ValueError): read_hdf(path) - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_read_from_pathlib_path(self): # GH11773 from pathlib import Path - expected = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) with ensure_clean_path(self.path) as filename: path_obj = Path(filename) - expected.to_hdf(path_obj, 'df', mode='a') - actual = read_hdf(path_obj, 'df') + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_read_from_py_localpath(self): # GH11773 from py.path import local as LocalPath - expected = DataFrame(np.random.rand(4, 5), - index=list('abcd'), - columns=list('ABCDE')) + expected = DataFrame( + np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE") + ) with ensure_clean_path(self.path) as filename: path_obj = LocalPath(filename) - expected.to_hdf(path_obj, 'df', mode='a') - actual = read_hdf(path_obj, 'df') + expected.to_hdf(path_obj, "df", mode="a") + actual = read_hdf(path_obj, "df") tm.assert_frame_equal(expected, actual) def test_query_long_float_literal(self): # GH 14241 - df = pd.DataFrame({'A': [1000000000.0009, - 1000000000.0011, - 1000000000.0015]}) + df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]}) with ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) + store.append("test", df, format="table", data_columns=True) cutoff = 1000000000.0006 - result = store.select('test', "A < %.4f" % cutoff) + result = store.select("test", "A < %.4f" % cutoff) assert result.empty cutoff = 1000000000.0010 - result = store.select('test', "A > %.4f" % cutoff) + result = store.select("test", "A > %.4f" % cutoff) expected = df.loc[[1, 2], :] tm.assert_frame_equal(expected, result) exact = 1000000000.0011 - result = store.select('test', 'A == %.4f' % exact) + result = store.select("test", "A == %.4f" % exact) expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) def test_query_compare_column_type(self): # GH 15492 - df = pd.DataFrame({'date': ['2014-01-01', '2014-01-02'], - 'real_date': date_range('2014-01-01', periods=2), - 'float': [1.1, 1.2], - 'int': [1, 2]}, - columns=['date', 'real_date', 'float', 'int']) + df = pd.DataFrame( + { + "date": ["2014-01-01", "2014-01-02"], + "real_date": date_range("2014-01-01", periods=2), + "float": [1.1, 1.2], + "int": [1, 2], + }, + columns=["date", "real_date", "float", "int"], + ) with 
ensure_clean_store(self.path) as store: - store.append('test', df, format='table', data_columns=True) + store.append("test", df, format="table", data_columns=True) - ts = pd.Timestamp('2014-01-01') # noqa - result = store.select('test', where='real_date > ts') + ts = pd.Timestamp("2014-01-01") # noqa + result = store.select("test", where="real_date > ts") expected = df.loc[[1], :] tm.assert_frame_equal(expected, result) - for op in ['<', '>', '==']: + for op in ["<", ">", "=="]: # non strings to string column always fail - for v in [2.1, True, pd.Timestamp('2014-01-01'), - pd.Timedelta(1, 's')]: - query = 'date {op} v'.format(op=op) + for v in [2.1, True, pd.Timestamp("2014-01-01"), pd.Timedelta(1, "s")]: + query = "date {op} v".format(op=op) with pytest.raises(TypeError): - store.select('test', where=query) + store.select("test", where=query) # strings to other columns must be convertible to type - v = 'a' - for col in ['int', 'float', 'real_date']: - query = '{col} {op} v'.format(op=op, col=col) + v = "a" + for col in ["int", "float", "real_date"]: + query = "{col} {op} v".format(op=op, col=col) with pytest.raises(ValueError): - store.select('test', where=query) + store.select("test", where=query) - for v, col in zip(['1', '1.1', '2014-01-01'], - ['int', 'float', 'real_date']): - query = '{col} {op} v'.format(op=op, col=col) - result = store.select('test', where=query) + for v, col in zip( + ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"] + ): + query = "{col} {op} v".format(op=op, col=col) + result = store.select("test", where=query) - if op == '==': + if op == "==": expected = df.loc[[0], :] - elif op == '>': + elif op == ">": expected = df.loc[[1], :] else: expected = df.loc[[], :] tm.assert_frame_equal(expected, result) - @pytest.mark.parametrize('format', ['fixed', 'table']) + @pytest.mark.parametrize("format", ["fixed", "table"]) def test_read_hdf_series_mode_r(self, format): # GH 16583 # Tests that reading a Series saved to an HDF file # still works if a mode='r' argument is supplied series = tm.makeFloatSeries() with ensure_clean_path(self.path) as path: - series.to_hdf(path, key='data', format=format) - result = pd.read_hdf(path, key='data', mode='r') + series.to_hdf(path, key="data", format=format) + result = pd.read_hdf(path, key="data", mode="r") tm.assert_series_equal(result, series) @pytest.mark.skipif(not PY36, reason="Need python 3.6") def test_fspath(self): - with tm.ensure_clean('foo.h5') as path: + with tm.ensure_clean("foo.h5") as path: with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) @@ -4725,17 +4865,21 @@ def test_read_py2_hdf_file_in_py3(self, datapath): # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p') - expected = pd.DataFrame([1., 2, 3], index=pd.PeriodIndex( - ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) + expected = pd.DataFrame( + [1.0, 2, 3], + index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"), + ) with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', - 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), - mode='r') as store: - result = store['p'] + datapath( + "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5" + ), + mode="r", + ) as store: + result = store["p"] assert_frame_equal(result, expected) - @pytest.mark.parametrize("where", ["", (), (None, ), [], [None]]) + @pytest.mark.parametrize("where", ["", (), (None,), [], [None]]) def test_select_empty_where(self, where): # GH26610 @@ -4750,118 
+4894,139 @@ def test_select_empty_where(self, where): result = pd.read_hdf(store, "df", where=where) assert_frame_equal(result, df) - @pytest.mark.parametrize('idx', [ - date_range('2019', freq='D', periods=3, tz='UTC'), - CategoricalIndex(list('abc')) - ]) + @pytest.mark.parametrize( + "idx", + [ + date_range("2019", freq="D", periods=3, tz="UTC"), + CategoricalIndex(list("abc")), + ], + ) def test_to_hdf_multiindex_extension_dtype(self, idx): # GH 7775 mi = MultiIndex.from_arrays([idx, idx]) - df = pd.DataFrame(0, index=mi, columns=['a']) + df = pd.DataFrame(0, index=mi, columns=["a"]) with ensure_clean_path(self.path) as path: - with pytest.raises(NotImplementedError, - match="Saving a MultiIndex"): - df.to_hdf(path, 'df') + with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): + df.to_hdf(path, "df") class TestHDFComplexValues(Base): # GH10447 def test_complex_fixed(self): - df = DataFrame(np.random.rand(4, 5).astype(np.complex64), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) - df = DataFrame(np.random.rand(4, 5).astype(np.complex128), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_table(self): - df = DataFrame(np.random.rand(4, 5).astype(np.complex64), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex64), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) - df = DataFrame(np.random.rand(4, 5).astype(np.complex128), - index=list('abcd'), - columns=list('ABCDE')) + df = DataFrame( + np.random.rand(4, 5).astype(np.complex128), + index=list("abcd"), + columns=list("ABCDE"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table', mode='w') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table", mode="w") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) @xfail_non_writeable def test_complex_mixed_fixed(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, - 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex64, - 'D': complex128, - 'E': [1.0, 2.0, 3.0, 4.0]}, - index=list('abcd')) + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df") + 
reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_mixed_table(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, - 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex64, - 'D': complex128, - 'E': [1.0, 2.0, 3.0, 4.0]}, - index=list('abcd')) + complex64 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 + ) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + { + "A": [1, 2, 3, 4], + "B": ["a", "b", "c", "d"], + "C": complex64, + "D": complex128, + "E": [1.0, 2.0, 3.0, 4.0], + }, + index=list("abcd"), + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['A', 'B']) - result = store.select('df', where='A>2') + store.append("df", df, data_columns=["A", "B"]) + result = store.select("df", where="A>2") assert_frame_equal(df.loc[df.A > 2], result) with ensure_clean_path(self.path) as path: - df.to_hdf(path, 'df', format='table') - reread = read_hdf(path, 'df') + df.to_hdf(path, "df", format="table") + reread = read_hdf(path, "df") assert_frame_equal(df, reread) def test_complex_across_dimensions_fixed(self): with catch_warnings(record=True): - complex128 = np.array( - [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) - df = DataFrame({'A': s, 'B': s}) + complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) objs = [s, df] comps = [tm.assert_series_equal, tm.assert_frame_equal] for obj, comp in zip(objs, comps): with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='fixed') - reread = read_hdf(path, 'obj') + obj.to_hdf(path, "obj", format="fixed") + reread = read_hdf(path, "obj") comp(obj, reread) def test_complex_across_dimensions(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) - df = DataFrame({'A': s, 'B': s}) + s = Series(complex128, index=list("abcd")) + df = DataFrame({"A": s, "B": s}) with catch_warnings(record=True): @@ -4869,47 +5034,48 @@ def test_complex_across_dimensions(self): comps = [tm.assert_frame_equal] for obj, comp in zip(objs, comps): with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='table') - reread = read_hdf(path, 'obj') + obj.to_hdf(path, "obj", format="table") + reread = read_hdf(path, "obj") comp(obj, reread) def test_complex_indexing_error(self): - complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], - dtype=np.complex128) - df = DataFrame({'A': [1, 2, 3, 4], - 'B': ['a', 'b', 'c', 'd'], - 'C': complex128}, - index=list('abcd')) + complex128 = np.array( + [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128 + ) + df = DataFrame( + {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128}, + index=list("abcd"), + ) with ensure_clean_store(self.path) as store: with pytest.raises(TypeError): - store.append('df', df, data_columns=['C']) + store.append("df", df, data_columns=["C"]) def test_complex_series_error(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) - s = Series(complex128, index=list('abcd')) + s = Series(complex128, index=list("abcd")) with ensure_clean_path(self.path) as path: with 
pytest.raises(TypeError): - s.to_hdf(path, 'obj', format='t') + s.to_hdf(path, "obj", format="t") with ensure_clean_path(self.path) as path: - s.to_hdf(path, 'obj', format='t', index=False) - reread = read_hdf(path, 'obj') + s.to_hdf(path, "obj", format="t", index=False) + reread = read_hdf(path, "obj") tm.assert_series_equal(s, reread) def test_complex_append(self): - df = DataFrame({'a': np.random.randn(100).astype(np.complex128), - 'b': np.random.randn(100)}) + df = DataFrame( + {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)} + ) with ensure_clean_store(self.path) as store: - store.append('df', df, data_columns=['b']) - store.append('df', df) - result = store.select('df') + store.append("df", df, data_columns=["b"]) + store.append("df", df) + result = store.select("df") assert_frame_equal(pd.concat([df, df], 0), result) class TestTimezones(Base): - def _compare_with_tz(self, a, b): tm.assert_frame_equal(a, b) @@ -4919,8 +5085,7 @@ def _compare_with_tz(self, a, b): a_e = a.loc[i, c] b_e = b.loc[i, c] if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError( - "invalid tz comparison [%s] [%s]" % (a_e, b_e)) + raise AssertionError("invalid tz comparison [%s] [%s]" % (a_e, b_e)) def test_append_with_timezones_dateutil(self): @@ -4929,74 +5094,98 @@ def test_append_with_timezones_dateutil(self): # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows # filename issues. from pandas._libs.tslibs.timezones import maybe_get_tz - gettz = lambda x: maybe_get_tz('dateutil/' + x) + + gettz = lambda x: maybe_get_tz("dateutil/" + x) # as columns with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', tz=gettz( - 'US/Eastern')) + timedelta(hours=1) * i for i in range(5)])) - - store.append('df_tz', df, data_columns=['A']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # select with tz aware expected = df[df.A >= df.A[3]] - result = store.select('df_tz', where='A>=df.A[3]') + result = store.select("df_tz", where="A>=df.A[3]") self._compare_with_tz(result, expected) # ensure we include dates in DST and STD time here. 
- _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130603', - tz=gettz('US/Eastern'))), - index=range(5)) - store.append('df_tz', df) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130603", tz=gettz("US/Eastern")), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130102', tz=gettz('EET'))), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("EET")), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz', df, data_columns=['A', 'B']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', - tz=gettz('US/Eastern')), - B=Timestamp('20130102', tz=gettz('CET'))), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz=gettz("US/Eastern")), + B=Timestamp("20130102", tz=gettz("CET")), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=date_range( - '2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) - - _maybe_remove(store, 'df') - store.put('df', df) - result = store.select('df') + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern") + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") assert_frame_equal(result, df) - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df') + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_append_with_timezones_pytz(self): @@ -5006,65 +5195,91 @@ def test_append_with_timezones_pytz(self): # as columns with ensure_clean_store(self.path) as store: - _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', - tz='US/Eastern') + - timedelta(hours=1) * i - for i in range(5)])) - store.append('df_tz', df, data_columns=['A']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + df = DataFrame( + dict( + A=[ + Timestamp("20130102 2:00:00", tz="US/Eastern") + + timedelta(hours=1) * i + for i in range(5) + ] + ) + ) + store.append("df_tz", df, data_columns=["A"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # select with tz aware - self._compare_with_tz(store.select( - 'df_tz', where='A>=df.A[3]'), df[df.A >= df.A[3]]) + self._compare_with_tz( + store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]] + ) - _maybe_remove(store, 'df_tz') + _maybe_remove(store, "df_tz") # ensure we include dates in DST and STD time here. 
- df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='US/Eastern')), - index=range(5)) - store.append('df_tz', df) - result = store['df_tz'] + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="US/Eastern"), + ), + index=range(5), + ) + store.append("df_tz", df) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) - df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130102', tz='EET')), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="EET"), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # this is ok - _maybe_remove(store, 'df_tz') - store.append('df_tz', df, data_columns=['A', 'B']) - result = store['df_tz'] + _maybe_remove(store, "df_tz") + store.append("df_tz", df, data_columns=["A", "B"]) + result = store["df_tz"] self._compare_with_tz(result, df) assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130102', tz='CET')), - index=range(5)) + df = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130102", tz="CET"), + ), + index=range(5), + ) with pytest.raises(ValueError): - store.append('df_tz', df) + store.append("df_tz", df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=date_range( - '2000-1-1', periods=3, freq='H', tz='US/Eastern')))) - - _maybe_remove(store, 'df') - store.put('df', df) - result = store.select('df') + df = DataFrame( + dict( + A=Series( + range(3), + index=date_range( + "2000-1-1", periods=3, freq="H", tz="US/Eastern" + ), + ) + ) + ) + + _maybe_remove(store, "df") + store.put("df", df) + result = store.select("df") assert_frame_equal(result, df) - _maybe_remove(store, 'df') - store.append('df', df) - result = store.select('df') + _maybe_remove(store, "df") + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_tseries_select_index_column(self): @@ -5073,60 +5288,65 @@ def test_tseries_select_index_column(self): # not preserve UTC tzinfo set before storing # check that no tz still works - rng = date_range('1/1/2000', '1/30/2000') + rng = date_range("1/1/2000", "1/30/2000") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = store.select_column("frame", "index") assert rng.tz == DatetimeIndex(result.values).tz # check utc - rng = date_range('1/1/2000', '1/30/2000', tz='UTC') + rng = date_range("1/1/2000", "1/30/2000", tz="UTC") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = store.select_column("frame", "index") assert rng.tz == result.dt.tz # double check non-utc - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store.append('frame', frame) - result = store.select_column('frame', 'index') + store.append("frame", frame) + result = 
store.select_column("frame", "index") assert rng.tz == result.dt.tz def test_timezones_fixed(self): with ensure_clean_store(self.path) as store: # index - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") df = DataFrame(np.random.randn(len(rng), 4), index=rng) - store['df'] = df - result = store['df'] + store["df"] = df + result = store["df"] assert_frame_equal(result, df) # as data # GH11411 - _maybe_remove(store, 'df') - df = DataFrame({'A': rng, - 'B': rng.tz_convert('UTC').tz_localize(None), - 'C': rng.tz_convert('CET'), - 'D': range(len(rng))}, index=rng) - store['df'] = df - result = store['df'] + _maybe_remove(store, "df") + df = DataFrame( + { + "A": rng, + "B": rng.tz_convert("UTC").tz_localize(None), + "C": rng.tz_convert("CET"), + "D": range(len(rng)), + }, + index=rng, + ) + store["df"] = df + result = store["df"] assert_frame_equal(result, df) def test_fixed_offset_tz(self): - rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00') + rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00") frame = DataFrame(np.random.randn(len(rng), 4), index=rng) with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] + store["frame"] = frame + recons = store["frame"] tm.assert_index_equal(recons.index, rng) assert rng.tz == recons.index.tz @@ -5141,63 +5361,67 @@ def test_store_timezone(self): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) - store['obj1'] = df - result = store['obj1'] + store["obj1"] = df + result = store["obj1"] assert_frame_equal(result, df) # with tz setting with ensure_clean_store(self.path) as store: - with set_timezone('EST5EDT'): + with set_timezone("EST5EDT"): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) - store['obj1'] = df + store["obj1"] = df - with set_timezone('CST6CDT'): - result = store['obj1'] + with set_timezone("CST6CDT"): + result = store["obj1"] assert_frame_equal(result, df) def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 - expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), - index=range(5)) + expected = DataFrame( + dict( + A=Timestamp("20130102", tz="US/Eastern"), + B=Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) with ensure_clean_store( - datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), - mode='r') as store: - result = store['df'] + datapath("io", "data", "legacy_hdf", "datetimetz_object.h5"), mode="r" + ) as store: + result = store["df"] assert_frame_equal(result, expected) def test_dst_transitions(self): # make sure we are not failing on transitions with ensure_clean_store(self.path) as store: - times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", - tz="Europe/London", - freq="H", - ambiguous='infer') - - for i in [times, times + pd.Timedelta('10min')]: - _maybe_remove(store, 'df') - df = DataFrame({'A': range(len(i)), 'B': i}, index=i) - store.append('df', df) - result = store.select('df') + times = pd.date_range( + "2013-10-26 23:00", + "2013-10-27 01:00", + tz="Europe/London", + freq="H", + ambiguous="infer", + ) + + for i in [times, times + pd.Timedelta("10min")]: + _maybe_remove(store, "df") + df = DataFrame({"A": range(len(i)), "B": i}, index=i) + store.append("df", df) + result = store.select("df") assert_frame_equal(result, df) def test_read_with_where_tz_aware_index(self): # GH 11926 periods = 
10 - dts = pd.date_range('20151201', periods=periods, - freq='D', tz='UTC') - mi = pd.MultiIndex.from_arrays([dts, range(periods)], - names=['DATE', 'NO']) - expected = pd.DataFrame({'MYCOL': 0}, index=mi) + dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC") + mi = pd.MultiIndex.from_arrays([dts, range(periods)], names=["DATE", "NO"]) + expected = pd.DataFrame({"MYCOL": 0}, index=mi) - key = 'mykey' + key = "mykey" with ensure_clean_path(self.path) as path: with pd.HDFStore(path) as store: - store.append(key, expected, format='table', append=True) - result = pd.read_hdf(path, key, - where="DATE > 20151130") + store.append(key, expected, format="table", append=True) + result = pd.read_hdf(path, key, where="DATE > 20151130") assert_frame_equal(result, expected) diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index 134aa810db5be..fcd2e0e35ad9e 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -7,19 +7,20 @@ class TestSas: - def test_sas_buffer_format(self): # see gh-14947 b = StringIO("") - msg = ("If this is a buffer object rather than a string " - "name, you must specify a format string") + msg = ( + "If this is a buffer object rather than a string " + "name, you must specify a format string" + ) with pytest.raises(ValueError, match=msg): read_sas(b) def test_sas_read_no_format_or_extension(self): # see gh-24548 - msg = ("unable to infer format of SAS file") - with tm.ensure_clean('test_file_no_extension') as path: + msg = "unable to infer format of SAS file" + with tm.ensure_clean("test_file_no_extension") as path: with pytest.raises(ValueError, match=msg): read_sas(path) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 2c8d1281f2c34..e37561c865c7a 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -14,20 +14,18 @@ # https://github.com/cython/cython/issues/1720 @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestSAS7BDAT: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: - fname = os.path.join( - self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) + fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) df = pd.read_csv(fname) epoch = pd.datetime(1960, 1, 1) - t1 = pd.to_timedelta(df["Column4"], unit='d') + t1 = pd.to_timedelta(df["Column4"], unit="d") df["Column4"] = epoch + t1 - t2 = pd.to_timedelta(df["Column12"], unit='d') + t2 = pd.to_timedelta(df["Column12"], unit="d") df["Column12"] = epoch + t2 for k in range(df.shape[1]): col = df.iloc[:, k] @@ -39,22 +37,21 @@ def test_from_file(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - df = pd.read_sas(fname, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - with open(fname, 'rb') as f: + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) - rdr = pd.read_sas(buf, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas( + buf, 
format="sas7bdat", iterator=True, encoding="utf-8" + ) df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) rdr.close() @@ -63,35 +60,36 @@ def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, iterator=True, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_path_pathlib(self): from pathlib import Path + for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = Path(os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k))) - df = pd.read_sas(fname, encoding='utf-8') + fname = Path(os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_path_localpath(self): from py.path import local as LocalPath + for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = LocalPath(os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k))) - df = pd.read_sas(fname, encoding='utf-8') + fname = LocalPath( + os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + ) + df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) def test_iterator_loop(self): @@ -99,9 +97,8 @@ def test_iterator_loop(self): for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: - fname = os.path.join( - self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8') + fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") y = 0 for x in rdr: y += x.shape[0] @@ -112,8 +109,7 @@ def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) - rdr = pd.read_sas(fname, format="sas7bdat", - iterator=True, encoding='utf-8') + rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") d1 = rdr.read(rdr.row_count + 20) rdr.close() @@ -126,27 +122,28 @@ def test_iterator_read_too_much(self): def test_encoding_options(datapath): fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) - df2 = pd.read_sas(fname, encoding='utf-8') + df2 = pd.read_sas(fname, encoding="utf-8") for col in df1.columns: try: - df1[col] = df1[col].str.decode('utf-8') + df1[col] = df1[col].str.decode("utf-8") except AttributeError: pass tm.assert_frame_equal(df1, df2) from pandas.io.sas.sas7bdat import SAS7BDATReader + rdr = SAS7BDATReader(fname, convert_header_text=False) df3 = rdr.read() rdr.close() for x, y in zip(df1.columns, df3.columns): - assert(x == y.decode()) + assert x == y.decode() def test_productsales(datapath): fname = datapath("io", "sas", "data", "productsales.sas7bdat") - df = pd.read_sas(fname, encoding='utf-8') + df = pd.read_sas(fname, encoding="utf-8") fname = datapath("io", "sas", "data", "productsales.csv") - df0 = pd.read_csv(fname, parse_dates=['MONTH']) + df0 = pd.read_csv(fname, parse_dates=["MONTH"]) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) @@ -175,42 +172,43 @@ def test_date_time(datapath): 
fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) fname = datapath("io", "sas", "data", "datetime.csv") - df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', - 'DateTimeHi', 'Taiw']) + df0 = pd.read_csv( + fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"] + ) # GH 19732: Timestamps imported from sas will incur floating point errors - df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') + df.iloc[:, 3] = df.iloc[:, 3].dt.round("us") tm.assert_frame_equal(df, df0) def test_compact_numerical_values(datapath): # Regression test for #21616 fname = datapath("io", "sas", "data", "cars.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") # The two columns CYL and WGT in cars.sas7bdat have column # width < 8 and only contain integral values. # Test that pandas doesn't corrupt the numbers by adding # decimals. - result = df['WGT'] - expected = df['WGT'].round() + result = df["WGT"] + expected = df["WGT"].round() tm.assert_series_equal(result, expected, check_exact=True) - result = df['CYL'] - expected = df['CYL'].round() + result = df["CYL"] + expected = df["CYL"].round() tm.assert_series_equal(result, expected, check_exact=True) def test_many_columns(datapath): # Test for looking for column information in more places (PR #22628) fname = datapath("io", "sas", "data", "many_columns.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") fname = datapath("io", "sas", "data", "many_columns.csv") - df0 = pd.read_csv(fname, encoding='latin-1') + df0 = pd.read_csv(fname, encoding="latin-1") tm.assert_frame_equal(df, df0) def test_inconsistent_number_of_rows(datapath): # Regression test for issue #16615. (PR #22628) fname = datapath("io", "sas", "data", "load_log.sas7bdat") - df = pd.read_sas(fname, encoding='latin-1') + df = pd.read_sas(fname, encoding="latin-1") assert len(df) == 2097 diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 9024216fb60fa..7893877be2033 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -16,12 +16,11 @@ def numeric_as_float(data): for v in data.columns: - if data[v].dtype is np.dtype('int64'): + if data[v].dtype is np.dtype("int64"): data[v] = data[v].astype(np.float64) class TestXport: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") @@ -85,20 +84,16 @@ def test1_index(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - reader = read_sas(self.file01, index="SEQN", format="xport", - iterator=True) + reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) data = reader.read(10) reader.close() - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], - check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. 
- reader = read_sas(self.file01, index="SEQN", format="xport", - chunksize=10) + reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) data = reader.get_chunk() reader.close() - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], - check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) def test1_incremental(self): # Test with DEMO_G.xpt, reading full file incrementally @@ -143,4 +138,4 @@ def test_truncated_float_support(self): data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) data = read_sas(self.file04, format="xport") - tm.assert_frame_equal(data.astype('int64'), data_csv) + tm.assert_frame_equal(data.astype("int64"), data_csv) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 04223efd6eacb..fccd52f9916b8 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -13,7 +13,7 @@ from pandas.io.clipboard.exceptions import PyperclipException try: - DataFrame({'A': [1, 2]}).to_clipboard() + DataFrame({"A": [1, 2]}).to_clipboard() _DEPS_INSTALLED = 1 except (PyperclipException, RuntimeError): _DEPS_INSTALLED = 0 @@ -21,57 +21,90 @@ def build_kwargs(sep, excel): kwargs = {} - if excel != 'default': - kwargs['excel'] = excel - if sep != 'default': - kwargs['sep'] = sep + if excel != "default": + kwargs["excel"] = excel + if sep != "default": + kwargs["sep"] = sep return kwargs -@pytest.fixture(params=['delims', 'utf8', 'utf16', 'string', 'long', - 'nonascii', 'colwidth', 'mixed', 'float', 'int']) +@pytest.fixture( + params=[ + "delims", + "utf8", + "utf16", + "string", + "long", + "nonascii", + "colwidth", + "mixed", + "float", + "int", + ] +) def df(request): data_type = request.param - if data_type == 'delims': - return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], - 'b': ['hi\'j', 'k\'\'lm']}) - elif data_type == 'utf8': - return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) - elif data_type == 'utf16': - return pd.DataFrame({'a': ['\U0001f44d\U0001f44d', - '\U0001f44d\U0001f44d'], - 'b': ['abc', 'def']}) - elif data_type == 'string': - return mkdf(5, 3, c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'long': - max_rows = get_option('display.max_rows') - return mkdf(max_rows + 1, 3, - data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'nonascii': - return pd.DataFrame({'en': 'in English'.split(), - 'es': 'en español'.split()}) - elif data_type == 'colwidth': - _cw = get_option('display.max_colwidth') + 1 - return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'mixed': - return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, - 'b': np.arange(1, 6), - 'c': list('abcde')}) - elif data_type == 'float': - return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - elif data_type == 'int': - return mkdf(5, 3, data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) + if data_type == "delims": + return pd.DataFrame({"a": ['"a,\t"b|c', "d\tef´"], "b": ["hi'j", "k''lm"]}) + elif data_type == "utf8": + return pd.DataFrame({"a": ["µasd", "Ωœ∑´"], "b": ["øπ∆˚¬", "œ∑´®"]}) + elif data_type == "utf16": + return pd.DataFrame( + {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", 
"def"]} + ) + elif data_type == "string": + return mkdf( + 5, 3, c_idx_type="s", r_idx_type="i", c_idx_names=[None], r_idx_names=[None] + ) + elif data_type == "long": + max_rows = get_option("display.max_rows") + return mkdf( + max_rows + 1, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "nonascii": + return pd.DataFrame({"en": "in English".split(), "es": "en español".split()}) + elif data_type == "colwidth": + _cw = get_option("display.max_colwidth") + 1 + return mkdf( + 5, + 3, + data_gen_f=lambda *args: "x" * _cw, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "mixed": + return DataFrame( + {"a": np.arange(1.0, 6.0) + 0.01, "b": np.arange(1, 6), "c": list("abcde")} + ) + elif data_type == "float": + return mkdf( + 5, + 3, + data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) + elif data_type == "int": + return mkdf( + 5, + 3, + data_gen_f=lambda *args: randint(2), + c_idx_type="s", + r_idx_type="i", + c_idx_names=[None], + r_idx_names=[None], + ) else: raise ValueError @@ -108,6 +141,7 @@ def _mock_get(): @pytest.mark.clipboard def test_mock_clipboard(mock_clipboard): import pandas.io.clipboard + pandas.io.clipboard.clipboard_set("abc") assert "abc" in set(mock_clipboard.values()) result = pandas.io.clipboard.clipboard_get() @@ -116,16 +150,12 @@ def test_mock_clipboard(mock_clipboard): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, - reason="clipboard primitives not installed") +@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.usefixtures("mock_clipboard") class TestClipboard: - - def check_round_trip_frame(self, data, excel=None, sep=None, - encoding=None): + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - result = read_clipboard(sep=sep or '\t', index_col=0, - encoding=encoding) + result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) # Test that default arguments copy as tab delimited @@ -133,7 +163,7 @@ def test_round_trip_frame(self, df): self.check_round_trip_frame(df) # Test that explicit delimiters are respected - @pytest.mark.parametrize('sep', ['\t', ',', '|']) + @pytest.mark.parametrize("sep", ["\t", ",", "|"]) def test_round_trip_frame_sep(self, df, sep): self.check_round_trip_frame(df, sep=sep) @@ -148,63 +178,67 @@ def test_round_trip_frame_string(self, df): # Test that multi-character separators are not silently passed def test_excel_sep_warning(self, df): with tm.assert_produces_warning(): - df.to_clipboard(excel=True, sep=r'\t') + df.to_clipboard(excel=True, sep=r"\t") # Separator is ignored when excel=False and should produce a warning def test_copy_delim_warning(self, df): with tm.assert_produces_warning(): - df.to_clipboard(excel=False, sep='\t') + df.to_clipboard(excel=False, sep="\t") # Tests that the default behavior of to_clipboard is tab # delimited and excel="True" - @pytest.mark.parametrize('sep', ['\t', None, 'default']) - @pytest.mark.parametrize('excel', [True, None, 'default']) - def test_clipboard_copy_tabs_default(self, sep, excel, df, request, - mock_clipboard): + @pytest.mark.parametrize("sep", ["\t", None, "default"]) + @pytest.mark.parametrize("excel", [True, None, 
"default"]) + def test_clipboard_copy_tabs_default(self, sep, excel, df, request, mock_clipboard): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - assert mock_clipboard[request.node.name] == df.to_csv(sep='\t') + assert mock_clipboard[request.node.name] == df.to_csv(sep="\t") # Tests reading of white space separated tables - @pytest.mark.parametrize('sep', [None, 'default']) - @pytest.mark.parametrize('excel', [False]) + @pytest.mark.parametrize("sep", [None, "default"]) + @pytest.mark.parametrize("excel", [False]) def test_clipboard_copy_strings(self, sep, excel, df): kwargs = build_kwargs(sep, excel) df.to_clipboard(**kwargs) - result = read_clipboard(sep=r'\s+') + result = read_clipboard(sep=r"\s+") assert result.to_string() == df.to_string() assert df.shape == result.shape - def test_read_clipboard_infer_excel(self, request, - mock_clipboard): + def test_read_clipboard_infer_excel(self, request, mock_clipboard): # gh-19010: avoid warnings clip_kwargs = dict(engine="python") - text = dedent(""" + text = dedent( + """ John James Charlie Mingus 1 2 4 Harry Carney - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text df = pd.read_clipboard(**clip_kwargs) # excel data is parsed correctly - assert df.iloc[1][1] == 'Harry Carney' + assert df.iloc[1][1] == "Harry Carney" # having diff tab counts doesn't trigger it - text = dedent(""" + text = dedent( + """ a\t b 1 2 3 4 - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text res = pd.read_clipboard(**clip_kwargs) - text = dedent(""" + text = dedent( + """ a b 1 2 3 4 - """.strip()) + """.strip() + ) mock_clipboard[request.node.name] = text exp = pd.read_clipboard(**clip_kwargs) @@ -213,20 +247,19 @@ def test_read_clipboard_infer_excel(self, request, def test_invalid_encoding(self, df): # test case for testing invalid encoding with pytest.raises(ValueError): - df.to_clipboard(encoding='ascii') + df.to_clipboard(encoding="ascii") with pytest.raises(NotImplementedError): - pd.read_clipboard(encoding='ascii') + pd.read_clipboard(encoding="ascii") - @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + @pytest.mark.parametrize("enc", ["UTF-8", "utf-8", "utf8"]) def test_round_trip_valid_encodings(self, enc, df): self.check_round_trip_frame(df, encoding=enc) @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, - reason="clipboard primitives not installed") -@pytest.mark.parametrize('data', ['\U0001f44d...', 'Ωœ∑´...', 'abcd...']) +@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") +@pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 04faf5aee4b6d..426698bfa1e94 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -18,6 +18,7 @@ class CustomFSPath: """For testing fspath on unknown objects""" + def __init__(self, path): self.path = path @@ -30,12 +31,14 @@ def __fspath__(self): try: from pathlib import Path + path_types.append(Path) except ImportError: pass try: from py.path import local as LocalPath + path_types.append(LocalPath) except ImportError: pass @@ -57,7 +60,7 @@ class TestCommonIOCapabilities: """ def test_expand_user(self): - filename = '~/sometest' + filename = "~/sometest" expanded_name = icom._expand_user(filename) assert expanded_name != filename @@ -65,48 +68,44 
@@ def test_expand_user(self): assert os.path.expanduser(filename) == expanded_name def test_expand_user_normal_path(self): - filename = '/somefolder/sometest' + filename = "/somefolder/sometest" expanded_name = icom._expand_user(filename) assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name - @td.skip_if_no('pathlib') + @td.skip_if_no("pathlib") def test_stringify_path_pathlib(self): - rel_path = icom._stringify_path(Path('.')) - assert rel_path == '.' - redundant_path = icom._stringify_path(Path('foo//bar')) - assert redundant_path == os.path.join('foo', 'bar') + rel_path = icom._stringify_path(Path(".")) + assert rel_path == "." + redundant_path = icom._stringify_path(Path("foo//bar")) + assert redundant_path == os.path.join("foo", "bar") - @td.skip_if_no('py.path') + @td.skip_if_no("py.path") def test_stringify_path_localpath(self): - path = os.path.join('foo', 'bar') + path = os.path.join("foo", "bar") abs_path = os.path.abspath(path) lpath = LocalPath(path) assert icom._stringify_path(lpath) == abs_path def test_stringify_path_fspath(self): - p = CustomFSPath('foo/bar.csv') + p = CustomFSPath("foo/bar.csv") result = icom._stringify_path(p) - assert result == 'foo/bar.csv' - - @pytest.mark.parametrize('extension,expected', [ - ('', None), - ('.gz', 'gzip'), - ('.bz2', 'bz2'), - ('.zip', 'zip'), - ('.xz', 'xz'), - ]) - @pytest.mark.parametrize('path_type', path_types) + assert result == "foo/bar.csv" + + @pytest.mark.parametrize( + "extension,expected", + [("", None), (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz")], + ) + @pytest.mark.parametrize("path_type", path_types) def test_infer_compression_from_path(self, extension, expected, path_type): - path = path_type('foo/bar.csv' + extension) - compression = icom._infer_compression(path, compression='infer') + path = path_type("foo/bar.csv" + extension) + compression = icom._infer_compression(path, compression="infer") assert compression == expected def test_get_filepath_or_buffer_with_path(self): - filename = '~/sometest' - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( - filename) + filename = "~/sometest" + filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename) assert filepath_or_buffer != filename assert os.path.isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer @@ -115,7 +114,8 @@ def test_get_filepath_or_buffer_with_path(self): def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( - input_buffer) + input_buffer + ) assert filepath_or_buffer == input_buffer assert not should_close @@ -131,82 +131,100 @@ def test_iterator(self): tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(pd.concat(it), expected.iloc[1:]) - @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ - (pd.read_csv, 'os', FileNotFoundError, 'csv'), - (pd.read_fwf, 'os', FileNotFoundError, 'txt'), - (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), - (pd.read_feather, 'feather', Exception, 'feather'), - (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), - (pd.read_stata, 'os', FileNotFoundError, 'dta'), - (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), - (pd.read_json, 'os', ValueError, 'json'), - (pd.read_msgpack, 'os', ValueError, 'mp'), - (pd.read_pickle, 'os', FileNotFoundError, 'pickle'), - ]) + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", FileNotFoundError, 
"csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) def test_read_non_existant(self, reader, module, error_class, fn_ext): pytest.importorskip(module) - path = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext) - msg1 = (r"File (b')?.+does_not_exist\.{}'? does not exist" - .format(fn_ext)) - msg2 = (r"\[Errno 2\] No such file or directory: '.+does_not_exist" - r"\.{}'").format(fn_ext) + path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) + msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg2 = ( + r"\[Errno 2\] No such file or directory: '.+does_not_exist" r"\.{}'" + ).format(fn_ext) msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" - msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext) - with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format( - msg1, msg2, msg3, msg4, msg5)): + msg5 = ( + r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" + r" '.+does_not_exist\.{}'" + ).format(fn_ext, fn_ext) + with pytest.raises( + error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + ): reader(path) - @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ - (pd.read_csv, 'os', FileNotFoundError, 'csv'), - (pd.read_table, 'os', FileNotFoundError, 'csv'), - (pd.read_fwf, 'os', FileNotFoundError, 'txt'), - (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), - (pd.read_feather, 'feather', Exception, 'feather'), - (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), - (pd.read_stata, 'os', FileNotFoundError, 'dta'), - (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), - (pd.read_json, 'os', ValueError, 'json'), - (pd.read_msgpack, 'os', ValueError, 'mp'), - (pd.read_pickle, 'os', FileNotFoundError, 'pickle'), - ]) - def test_read_expands_user_home_dir(self, reader, module, - error_class, fn_ext, monkeypatch): + @pytest.mark.parametrize( + "reader, module, error_class, fn_ext", + [ + (pd.read_csv, "os", FileNotFoundError, "csv"), + (pd.read_table, "os", FileNotFoundError, "csv"), + (pd.read_fwf, "os", FileNotFoundError, "txt"), + (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"), + (pd.read_feather, "feather", Exception, "feather"), + (pd.read_hdf, "tables", FileNotFoundError, "h5"), + (pd.read_stata, "os", FileNotFoundError, "dta"), + (pd.read_sas, "os", FileNotFoundError, "sas7bdat"), + (pd.read_json, "os", ValueError, "json"), + (pd.read_msgpack, "os", ValueError, "mp"), + (pd.read_pickle, "os", FileNotFoundError, "pickle"), + ], + ) + def test_read_expands_user_home_dir( + self, reader, module, error_class, fn_ext, monkeypatch + ): pytest.importorskip(module) - path = os.path.join('~', 'does_not_exist.' + fn_ext) - monkeypatch.setattr(icom, '_expand_user', - lambda x: os.path.join('foo', x)) + path = os.path.join("~", "does_not_exist." + fn_ext) + monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = (r"File (b')?.+does_not_exist\.{}'? 
does not exist" - .format(fn_ext)) - msg2 = (r"\[Errno 2\] No such file or directory:" - r" '.+does_not_exist\.{}'").format(fn_ext) + msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg2 = ( + r"\[Errno 2\] No such file or directory:" r" '.+does_not_exist\.{}'" + ).format(fn_ext) msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" - msg5 = (r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'").format(fn_ext, fn_ext) - - with pytest.raises(error_class, match=r"({}|{}|{}|{}|{})".format( - msg1, msg2, msg3, msg4, msg5)): + msg5 = ( + r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" + r" '.+does_not_exist\.{}'" + ).format(fn_ext, fn_ext) + + with pytest.raises( + error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + ): reader(path) - @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), - (pd.read_table, 'os', ('io', 'data', 'iris.csv')), - (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), - (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), - (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), - (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), - ]) + @pytest.mark.parametrize( + "reader, module, path", + [ + (pd.read_csv, "os", ("io", "data", "iris.csv")), + (pd.read_table, "os", ("io", "data", "iris.csv")), + (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")), + (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")), + (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")), + ( + pd.read_hdf, + "tables", + ("io", "data", "legacy_hdf", "datetimetz_object.h5"), + ), + (pd.read_stata, "os", ("io", "data", "stata10_115.dta")), + (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")), + (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")), + (pd.read_msgpack, "os", ("io", "msgpack", "data", "frame.mp")), + (pd.read_pickle, "os", ("io", "data", "categorical_0_14_1.pickle")), + ], + ) def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) path = datapath(*path) @@ -215,26 +233,29 @@ def test_read_fspath_all(self, reader, module, path, datapath): result = reader(mypath) expected = reader(path) - if path.endswith('.pickle'): + if path.endswith(".pickle"): # categorical tm.assert_categorical_equal(result, expected) else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ - ('to_csv', {}, 'os'), - ('to_excel', {'engine': 'xlwt'}, 'xlwt'), - ('to_feather', {}, 'feather'), - ('to_html', {}, 'os'), - ('to_json', {}, 'os'), - ('to_latex', {}, 'os'), - ('to_msgpack', {}, 'os'), - ('to_pickle', {}, 'os'), - ('to_stata', {'time_stamp': pd.to_datetime('2019-01-01 00:00')}, 'os'), - ]) + @pytest.mark.parametrize( + "writer_name, writer_kwargs, module", + [ + ("to_csv", {}, "os"), + ("to_excel", {"engine": "xlwt"}, "xlwt"), + ("to_feather", {}, "feather"), + ("to_html", {}, "os"), + ("to_json", {}, "os"), + ("to_latex", {}, "os"), + ("to_msgpack", {}, "os"), + ("to_pickle", {}, 
"os"), + ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), + ], + ) def test_write_fspath_all(self, writer_name, writer_kwargs, module): - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") df = pd.DataFrame({"A": [1, 2]}) with p1 as string, p2 as fspath: @@ -243,11 +264,11 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): writer = getattr(df, writer_name) writer(string, **writer_kwargs) - with open(string, 'rb') as f: + with open(string, "rb") as f: expected = f.read() writer(mypath, **writer_kwargs) - with open(fspath, 'rb') as f: + with open(fspath, "rb") as f: result = f.read() assert result == expected @@ -256,32 +277,31 @@ def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll # have to read and compare equality - pytest.importorskip('tables') + pytest.importorskip("tables") df = pd.DataFrame({"A": [1, 2]}) - p1 = tm.ensure_clean('string') - p2 = tm.ensure_clean('fspath') + p1 = tm.ensure_clean("string") + p2 = tm.ensure_clean("fspath") with p1 as string, p2 as fspath: mypath = CustomFSPath(fspath) - df.to_hdf(mypath, key='bar') - df.to_hdf(string, key='bar') + df.to_hdf(mypath, key="bar") + df.to_hdf(string, key="bar") - result = pd.read_hdf(fspath, key='bar') - expected = pd.read_hdf(string, key='bar') + result = pd.read_hdf(fspath, key="bar") + expected = pd.read_hdf(string, key="bar") tm.assert_frame_equal(result, expected) @pytest.fixture def mmap_file(datapath): - return datapath('io', 'data', 'test_mmap.csv') + return datapath("io", "data", "test_mmap.csv") class TestMMapWrapper: - def test_constructor_bad_file(self, mmap_file): - non_file = StringIO('I am not a file') + non_file = StringIO("I am not a file") non_file.fileno = lambda: -1 # the error raised is different on Windows @@ -295,7 +315,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(err, match=msg): icom.MMapWrapper(non_file) - target = open(mmap_file, 'r') + target = open(mmap_file, "r") target.close() msg = "I/O operation on closed file" @@ -303,21 +323,20 @@ def test_constructor_bad_file(self, mmap_file): icom.MMapWrapper(target) def test_get_attr(self, mmap_file): - with open(mmap_file, 'r') as target: + with open(mmap_file, "r") as target: wrapper = icom.MMapWrapper(target) attrs = dir(wrapper.mmap) - attrs = [attr for attr in attrs - if not attr.startswith('__')] - attrs.append('__next__') + attrs = [attr for attr in attrs if not attr.startswith("__")] + attrs.append("__next__") for attr in attrs: assert hasattr(wrapper, attr) - assert not hasattr(wrapper, 'foo') + assert not hasattr(wrapper, "foo") def test_next(self, mmap_file): - with open(mmap_file, 'r') as target: + with open(mmap_file, "r") as target: wrapper = icom.MMapWrapper(target) lines = target.readlines() @@ -325,12 +344,12 @@ def test_next(self, mmap_file): next_line = next(wrapper) assert next_line.strip() == line.strip() - with pytest.raises(StopIteration, match=r'^$'): + with pytest.raises(StopIteration, match=r"^$"): next(wrapper) def test_unknown_engine(self): with tm.ensure_clean() as path: df = tm.makeDataFrame() df.to_csv(path) - with pytest.raises(ValueError, match='Unknown engine'): - pd.read_csv(path, engine='pyt') + with pytest.raises(ValueError, match="Unknown engine"): + pd.read_csv(path, engine="pyt") diff --git a/pandas/tests/io/test_compression.py 
b/pandas/tests/io/test_compression.py index c0e19f07c148d..ce459ab24afe0 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -21,12 +21,17 @@ def catch_to_csv_depr(): yield -@pytest.mark.parametrize('obj', [ - pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_compression_size(obj, method, compression_only): with tm.ensure_clean() as path: with catch_to_csv_depr(): @@ -37,15 +42,20 @@ def test_compression_size(obj, method, compression_only): assert uncompressed_size > compressed_size -@pytest.mark.parametrize('obj', [ - pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']), - pd.Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) -@pytest.mark.parametrize('method', ['to_csv', 'to_json']) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=compression_only) + f, handles = icom._get_handle(path, "w", compression=compression_only) with catch_to_csv_depr(): with f: getattr(obj, method)(f) @@ -53,7 +63,7 @@ def test_compression_size_fh(obj, method, compression_only): assert f.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=None) + f, handles = icom._get_handle(path, "w", compression=None) with catch_to_csv_depr(): with f: getattr(obj, method)(f) @@ -63,35 +73,41 @@ def test_compression_size_fh(obj, method, compression_only): assert uncompressed_size > compressed_size -@pytest.mark.parametrize('write_method, write_kwargs, read_method', [ - ('to_csv', {'index': False}, pd.read_csv), - ('to_json', {}, pd.read_json), - ('to_pickle', {}, pd.read_pickle), -]) +@pytest.mark.parametrize( + "write_method, write_kwargs, read_method", + [ + ("to_csv", {"index": False}, pd.read_csv), + ("to_json", {}, pd.read_json), + ("to_pickle", {}, pd.read_pickle), + ], +) def test_dataframe_compression_defaults_to_infer( - write_method, write_kwargs, read_method, compression_only): + write_method, write_kwargs, read_method, compression_only +): # GH22004 - input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=['X', 'Y', 'Z']) + input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"]) extension = icom._compression_to_extension[compression_only] - with tm.ensure_clean('compressed' + extension) as path: + with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only) tm.assert_frame_equal(output, input) -@pytest.mark.parametrize('write_method,write_kwargs,read_method,read_kwargs', [ - ('to_csv', {'index': False, 'header': True}, - 
pd.read_csv, {'squeeze': True}), - ('to_json', {}, pd.read_json, {'typ': 'series'}), - ('to_pickle', {}, pd.read_pickle, {}), -]) +@pytest.mark.parametrize( + "write_method,write_kwargs,read_method,read_kwargs", + [ + ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}), + ("to_json", {}, pd.read_json, {"typ": "series"}), + ("to_pickle", {}, pd.read_pickle, {}), + ], +) def test_series_compression_defaults_to_infer( - write_method, write_kwargs, read_method, read_kwargs, - compression_only): + write_method, write_kwargs, read_method, read_kwargs, compression_only +): # GH22004 - input = pd.Series([0, 5, -2, 10], name='X') + input = pd.Series([0, 5, -2, 10], name="X") extension = icom._compression_to_extension[compression_only] - with tm.ensure_clean('compressed' + extension) as path: + with tm.ensure_clean("compressed" + extension) as path: getattr(input, write_method)(path, **write_kwargs) output = read_method(path, compression=compression_only, **read_kwargs) tm.assert_series_equal(output, input, check_names=False) @@ -100,12 +116,12 @@ def test_series_compression_defaults_to_infer( def test_compression_warning(compression_only): # Assert that passing a file object to to_csv while explicitly specifying a # compression protocol triggers a RuntimeWarning, as per GH21227. - df = pd.DataFrame(100 * [[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - columns=['X', 'Y', 'Z']) + df = pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ) with tm.ensure_clean() as path: - f, handles = icom._get_handle(path, 'w', compression=compression_only) - with tm.assert_produces_warning(RuntimeWarning, - check_stacklevel=False): + f, handles = icom._get_handle(path, "w", compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index c5a94883aa609..2fa5e3b30d6af 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -8,10 +8,9 @@ def test_parse_date_time(): - dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) - times = np.array(['05:07:09', '06:08:00'], dtype=object) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) + dates = np.array(["2007/1/3", "2008/2/4"], dtype=object) + times = np.array(["05:07:09", "06:08:00"], dtype=object) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) result = conv.parse_date_time(dates, times) tm.assert_numpy_array_equal(result, expected) @@ -36,8 +35,6 @@ def test_parse_all_fields(): years = np.array([2007, 2008]) months = np.array([1, 2]) - result = conv.parse_all_fields(years, months, days, - hours, minutes, seconds) - expected = np.array([datetime(2007, 1, 3, 5, 7, 9), - datetime(2008, 2, 4, 6, 8, 0)]) + result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) + expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 805ce67e76e28..fa63f102580ff 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -10,7 +10,7 @@ from pandas.io.feather_format import read_feather, to_feather # noqa:E402 -pyarrow = pytest.importorskip('pyarrow') +pyarrow = 
pytest.importorskip("pyarrow") pyarrow_version = LooseVersion(pyarrow.__version__) @@ -18,7 +18,6 @@ @pytest.mark.single class TestFeather: - def check_error_on_write(self, df, exc): # check that we are raising the exception # on writing @@ -40,37 +39,46 @@ def check_round_trip(self, df, expected=None, **kwargs): def test_error(self): - for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'), - np.array([1, 2, 3])]: + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: self.check_error_on_write(obj, ValueError) def test_basic(self): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_null': [1., np.nan, 3], - 'bool': [True, False, True], - 'bool_with_null': [True, np.nan, False], - 'cat': pd.Categorical(list('abc')), - 'dt': pd.date_range('20130101', periods=3), - 'dttz': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'dt_with_null': [pd.Timestamp('20130101'), pd.NaT, - pd.Timestamp('20130103')], - 'dtns': pd.date_range('20130101', periods=3, - freq='ns')}) - - assert df.dttz.dtype.tz.zone == 'US/Eastern' + df = pd.DataFrame( + { + "string": list("abc"), + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_null": [1.0, np.nan, 3], + "bool": [True, False, True], + "bool_with_null": [True, np.nan, False], + "cat": pd.Categorical(list("abc")), + "dt": pd.date_range("20130101", periods=3), + "dttz": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "dt_with_null": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + "dtns": pd.date_range("20130101", periods=3, freq="ns"), + } + ) + + assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) def test_duplicate_columns(self): # https://github.com/wesm/feather/issues/53 # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, ValueError) def test_stringify_columns(self): @@ -80,71 +88,71 @@ def test_stringify_columns(self): def test_read_columns(self): # GH 24025 - df = pd.DataFrame({'col1': list('abc'), - 'col2': list(range(1, 4)), - 'col3': list('xyz'), - 'col4': list(range(4, 7))}) - columns = ['col1', 'col3'] - self.check_round_trip(df, expected=df[columns], - columns=columns) + df = pd.DataFrame( + { + "col1": list("abc"), + "col2": list(range(1, 4)), + "col3": list("xyz"), + "col4": list(range(4, 7)), + } + ) + columns = ["col1", "col3"] + self.check_round_trip(df, expected=df[columns], columns=columns) def test_unsupported_other(self): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # Some versions raise ValueError, others raise ArrowInvalid. 
self.check_error_on_write(df, Exception) def test_rw_nthreads(self): - df = pd.DataFrame({'A': np.arange(100000)}) + df = pd.DataFrame({"A": np.arange(100000)}) expected_warning = ( - "the 'nthreads' keyword is deprecated, " - "use 'use_threads' instead" + "the 'nthreads' keyword is deprecated, " "use 'use_threads' instead" ) # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=2) # we have an extra FutureWarning because of #GH23752 assert any(expected_warning in str(x) for x in w) # TODO: make the warning work with check_stacklevel=True - with tm.assert_produces_warning( - FutureWarning, check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: self.check_round_trip(df, nthreads=1) # we have an extra FutureWarnings because of #GH23752 assert any(expected_warning in str(x) for x in w) def test_rw_use_threads(self): - df = pd.DataFrame({'A': np.arange(100000)}) + df = pd.DataFrame({"A": np.arange(100000)}) self.check_round_trip(df, use_threads=True) self.check_round_trip(df, use_threads=False) def test_write_with_index(self): - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) self.check_round_trip(df) # non-default index - for index in [[2, 3, 4], - pd.date_range('20130101', periods=3), - list('abc'), - [1, 3, 4], - pd.MultiIndex.from_tuples([('a', 1), ('a', 2), - ('b', 1)]), - ]: + for index in [ + [2, 3, 4], + pd.date_range("20130101", periods=3), + list("abc"), + [1, 3, 4], + pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), + ]: df.index = index self.check_error_on_write(df, ValueError) # index with meta-data df.index = [0, 1, 2] - df.index.name = 'foo' + df.index.name = "foo" self.check_error_on_write(df, ValueError) # column multi-index df.index = [0, 1, 2] - df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]), + df.columns = (pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),) self.check_error_on_write(df, ValueError) def test_path_pathlib(self): diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 21e0a63bf4ce7..6ca6da01a6d6f 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -18,9 +18,9 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = 'pydata_pandas_bq_testing_py3' +DATASET_ID = "pydata_pandas_bq_testing_py3" -TABLE_ID = 'new_test' +TABLE_ID = "new_test" DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) VERSION = platform.python_version() @@ -28,43 +28,42 @@ def _skip_if_no_project_id(): if not _get_project_id(): - pytest.skip( - "Cannot run integration tests without a project id") + pytest.skip("Cannot run integration tests without a project id") def _skip_if_no_private_key_path(): if not _get_private_key_path(): - pytest.skip("Cannot run integration tests without a " - "private key json file path") + pytest.skip( + "Cannot run integration tests without a " "private key json file path" + ) def _in_travis_environment(): - return 'TRAVIS_BUILD_DIR' in os.environ and \ - 'GBQ_PROJECT_ID' in os.environ + return "TRAVIS_BUILD_DIR" in os.environ and "GBQ_PROJECT_ID" in os.environ def _get_project_id(): if _in_travis_environment(): - return os.environ.get('GBQ_PROJECT_ID') - return PROJECT_ID or os.environ.get('GBQ_PROJECT_ID') + return os.environ.get("GBQ_PROJECT_ID") + return 
PROJECT_ID or os.environ.get("GBQ_PROJECT_ID") def _get_private_key_path(): if _in_travis_environment(): - return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', - 'travis_gbq.json']) + return os.path.join( + *[os.environ.get("TRAVIS_BUILD_DIR"), "ci", "travis_gbq.json"] + ) private_key_path = PRIVATE_KEY_JSON_PATH if not private_key_path: - private_key_path = os.environ.get('GBQ_GOOGLE_APPLICATION_CREDENTIALS') + private_key_path = os.environ.get("GBQ_GOOGLE_APPLICATION_CREDENTIALS") return private_key_path def _get_credentials(): private_key_path = _get_private_key_path() if private_key_path: - return service_account.Credentials.from_service_account_file( - private_key_path) + return service_account.Credentials.from_service_account_file(private_key_path) def _get_client(): @@ -79,14 +78,17 @@ def make_mixed_dataframe_v2(test_size): flts = np.random.randn(1, test_size) ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) - times = [datetime.now(pytz.timezone('US/Arizona')) - for t in range(test_size)] - return DataFrame({'bools': bools[0], - 'flts': flts[0], - 'ints': ints[0], - 'strs': strs[0], - 'times': times[0]}, - index=range(test_size)) + times = [datetime.now(pytz.timezone("US/Arizona")) for t in range(test_size)] + return DataFrame( + { + "bools": bools[0], + "flts": flts[0], + "ints": ints[0], + "strs": strs[0], + "times": times[0], + }, + index=range(test_size), + ) def test_read_gbq_with_deprecated_kwargs(monkeypatch): @@ -146,7 +148,6 @@ def mock_read_gbq(sql, **kwargs): @pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath: - @classmethod def setup_class(cls): # - GLOBAL CLASS FIXTURES - @@ -179,12 +180,17 @@ def test_roundtrip(self): test_size = 20001 df = make_mixed_dataframe_v2(test_size) - df.to_gbq(destination_table, _get_project_id(), chunksize=None, - credentials=_get_credentials()) - - result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" - .format(destination_table), - project_id=_get_project_id(), - credentials=_get_credentials(), - dialect="standard") - assert result['num_rows'][0] == test_size + df.to_gbq( + destination_table, + _get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == test_size diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4fa0f3246910a..2ca56230b5b8c 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -16,57 +16,76 @@ def test_is_gcs_url(): assert not is_gcs_url("s3://pandas/somethingelse.com") -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) class MockGCSFileSystem: def open(*args): return StringIO(df1.to_csv(index=False)) - monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem) - df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) assert_frame_equal(df1, df2) -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def 
test_to_csv_gcs(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) s = StringIO() class MockGCSFileSystem: def open(*args): return s - monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem) - df1.to_csv('gs://test/test.csv', index=True) - df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0) + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_csv("gs://test/test.csv", index=True) + df2 = read_csv(StringIO(s.getvalue()), parse_dates=["dt"], index_col=0) assert_frame_equal(df1, df2) -@td.skip_if_no('gcsfs') +@td.skip_if_no("gcsfs") def test_gcs_get_filepath_or_buffer(monkeypatch): - df1 = DataFrame({'int': [1, 3], 'float': [2.0, np.nan], 'str': ['t', 's'], - 'dt': date_range('2018-06-18', periods=2)}) + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) def mock_get_filepath_or_buffer(*args, **kwargs): - return (StringIO(df1.to_csv(index=False)), - None, None, False) + return (StringIO(df1.to_csv(index=False)), None, None, False) - monkeypatch.setattr('pandas.io.gcs.get_filepath_or_buffer', - mock_get_filepath_or_buffer) - df2 = read_csv('gs://test/test.csv', parse_dates=['dt']) + monkeypatch.setattr( + "pandas.io.gcs.get_filepath_or_buffer", mock_get_filepath_or_buffer + ) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) assert_frame_equal(df1, df2) -@pytest.mark.skipif(td.safe_import('gcsfs'), - reason='Only check when gcsfs not installed') +@pytest.mark.skipif( + td.safe_import("gcsfs"), reason="Only check when gcsfs not installed" +) def test_gcs_not_present_exception(): with pytest.raises(ImportError) as e: - read_csv('gs://test/test.csv') - assert 'gcsfs library is required' in str(e.value) + read_csv("gs://test/test.csv") + assert "gcsfs library is required" in str(e.value) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 63184dd1a8f83..d3d05b6281d5b 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -13,8 +13,7 @@ from pandas.errors import ParserError import pandas.util._test_decorators as td -from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv) +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -25,37 +24,46 @@ HERE = os.path.dirname(__file__) -@pytest.fixture(params=[ - 'chinese_utf-16.html', - 'chinese_utf-32.html', - 'chinese_utf-8.html', - 'letz_latin1.html', -]) +@pytest.fixture( + params=[ + "chinese_utf-16.html", + "chinese_utf-32.html", + "chinese_utf-8.html", + "letz_latin1.html", + ] +) def html_encoding_file(request, datapath): """Parametrized fixture for HTML encoding test filenames.""" - return datapath('io', 'data', 'html_encoding', request.param) + return datapath("io", "data", "html_encoding", request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): - assert len(list1) == len(list2), ('lists are not of equal size ' - 'len(list1) == {0}, ' - 'len(list2) == {1}'.format(len(list1), - len(list2))) - msg = 'not all list elements are DataFrames' - both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and - isinstance(y, DataFrame), list1, list2)) + 
assert len(list1) == len(list2), ( + "lists are not of equal size " + "len(list1) == {0}, " + "len(list2) == {1}".format(len(list1), len(list2)) + ) + msg = "not all list elements are DataFrames" + both_frames = all( + map( + lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame), + list1, + list2, + ) + ) assert both_frames, msg for frame_i, frame_j in zip(list1, list2): tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) - assert not frame_i.empty, 'frames are both empty' + assert not frame_i.empty, "frames are both empty" -@td.skip_if_no('bs4') +@td.skip_if_no("bs4") def test_bs4_version_fails(monkeypatch, datapath): import bs4 - monkeypatch.setattr(bs4, '__version__', '4.2') + + monkeypatch.setattr(bs4, "__version__", "4.2") with pytest.raises(ImportError, match="Pandas requires version"): - read_html(datapath("io", "data", "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor="bs4") def test_invalid_flavor(): @@ -67,25 +75,29 @@ def test_invalid_flavor(): read_html(url, "google", flavor=flavor) -@td.skip_if_no('bs4') -@td.skip_if_no('lxml') +@td.skip_if_no("bs4") +@td.skip_if_no("lxml") def test_same_ordering(datapath): - filename = datapath('io', 'data', 'valid_markup.html') - dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) - dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + filename = datapath("io", "data", "valid_markup.html") + dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"]) + dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"]) assert_framelist_equal(dfs_lxml, dfs_bs4) -@pytest.mark.parametrize("flavor", [ - pytest.param('bs4', marks=td.skip_if_no('lxml')), - pytest.param('lxml', marks=td.skip_if_no('lxml'))], scope="class") +@pytest.mark.parametrize( + "flavor", + [ + pytest.param("bs4", marks=td.skip_if_no("lxml")), + pytest.param("lxml", marks=td.skip_if_no("lxml")), + ], + scope="class", +) class TestReadHtml: - @pytest.fixture(autouse=True) def set_files(self, datapath): - self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data = datapath("io", "data", "spam.html") self.spam_data_kwargs = {} - self.spam_data_kwargs['encoding'] = 'UTF-8' + self.spam_data_kwargs["encoding"] = "UTF-8" self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") @@ -94,46 +106,56 @@ def set_defaults(self, flavor, request): yield def test_to_html_compat(self): - df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, - r_idx_names=False).applymap('{0:.3f}'.format).astype(float) + df = ( + mkdf( + 4, + 3, + data_gen_f=lambda *args: rand(), + c_idx_names=False, + r_idx_names=False, + ) + .applymap("{0:.3f}".format) + .astype(float) + ) out = df.to_html() - res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] + res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) @network def test_banklist_url(self): - url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - df1 = self.read_html(url, 'First Federal Bank of Florida', - attrs={"id": 'table'}) - df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) + url = "http://www.fdic.gov/bank/individual/failed/banklist.html" + df1 = self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "table"} + ) + df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) @network def test_spam_url(self): - url = ('http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&' - 
'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') - df1 = self.read_html(url, '.*Water.*') - df2 = self.read_html(url, 'Unit') + url = ( + "http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&" + "lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam" + ) + df1 = self.read_html(url, ".*Water.*") + df2 = self.read_html(url, "Unit") assert_framelist_equal(df1, df2) @pytest.mark.slow def test_banklist(self): - df1 = self.read_html(self.banklist_data, '.*Florida.*', - attrs={'id': 'table'}) - df2 = self.read_html(self.banklist_data, 'Metcalf Bank', - attrs={'id': 'table'}) + df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"}) + df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) def test_spam(self): - df1 = self.read_html(self.spam_data, '.*Water.*') - df2 = self.read_html(self.spam_data, 'Unit') + df1 = self.read_html(self.spam_data, ".*Water.*") + df2 = self.read_html(self.spam_data, "Unit") assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' + assert df1[0].iloc[0, 0] == "Proximates" + assert df1[0].columns[0] == "Nutrient" def test_spam_no_match(self): dfs = self.read_html(self.spam_data) @@ -141,90 +163,88 @@ def test_spam_no_match(self): assert isinstance(df, DataFrame) def test_banklist_no_match(self): - dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) + dfs = self.read_html(self.banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) def test_spam_header(self): - df = self.read_html(self.spam_data, '.*Water.*', header=2)[0] - assert df.columns[0] == 'Proximates' + df = self.read_html(self.spam_data, ".*Water.*", header=2)[0] + assert df.columns[0] == "Proximates" assert not df.empty def test_skiprows_int(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] - df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0] + df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0] tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows={1, 2}) - df2 = self.read_html(self.spam_data, 'Unit', skiprows={2, 1}) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2}) + df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1) + df2 = self.read_html(self.spam_data, "Unit", skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): - df1 = self.read_html(self.spam_data, '.*Water.*', 
skiprows=slice(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): - df1 = self.read_html(self.spam_data, '.*Water.*', - skiprows=np.arange(2)) - df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) + df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - with pytest.raises(TypeError, match=('is not a valid type ' - 'for skipping rows')): - self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') + with pytest.raises( + TypeError, match=("is not a valid type " "for skipping rows") + ): + self.read_html(self.spam_data, ".*Water.*", skiprows="asdf") def test_index(self): - df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): - df1 = self.read_html(self.spam_data, '.*Water.*', header=1, - index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): - df1 = self.read_html(self.spam_data, '.*Water.*', header=1, - index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0) + df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self): # 10892 infer_types removed - df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) - df2 = self.read_html(self.spam_data, 'Unit', index_col=0) + df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0) + df2 = self.read_html(self.spam_data, "Unit", index_col=0) assert_framelist_equal(df1, df2) def test_string_io(self): @@ -234,49 +254,48 @@ def test_string_io(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) - df1 = self.read_html(data1, '.*Water.*') - df2 = self.read_html(data2, 'Unit') + df1 = self.read_html(data1, ".*Water.*") + df2 = self.read_html(data2, "Unit") assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() - df1 = self.read_html(data, '.*Water.*') - df2 = self.read_html(data, 'Unit') + df1 = self.read_html(data, ".*Water.*") + df2 = self.read_html(data, "Unit") assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data, **self.spam_data_kwargs) as f: - df1 = self.read_html(f, '.*Water.*') + df1 = self.read_html(f, ".*Water.*") with open(self.spam_data, **self.spam_data_kwargs) as f: - 
df2 = self.read_html(f, 'Unit') + df2 = self.read_html(f, "Unit") assert_framelist_equal(df1, df2) @network def test_bad_url_protocol(self): with pytest.raises(URLError): - self.read_html('git://github.com', match='.*Water.*') + self.read_html("git://github.com", match=".*Water.*") @network @pytest.mark.slow def test_invalid_url(self): try: with pytest.raises(URLError): - self.read_html('http://www.a23950sdfa908sd.com', - match='.*Water.*') + self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") except ValueError as e: - assert 'No tables found' in str(e) + assert "No tables found" in str(e) @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(os.path.abspath(url)), - 'First', - attrs={'id': 'table'}) + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"} + ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) @@ -284,13 +303,15 @@ def test_file_url(self): @pytest.mark.slow def test_invalid_table_attrs(self): url = self.banklist_data - with pytest.raises(ValueError, match='No tables found'): - self.read_html(url, 'First Federal Bank of Florida', - attrs={'id': 'tasdfable'}) + with pytest.raises(ValueError, match="No tables found"): + self.read_html( + url, "First Federal Bank of Florida", attrs={"id": "tasdfable"} + ) def _bank_data(self, *args, **kwargs): - return self.read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'}, *args, **kwargs) + return self.read_html( + self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs + ) @pytest.mark.slow def test_multiindex_header(self): @@ -327,37 +348,38 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(os.path.abspath(url)), - match=re.compile(re.compile('Florida')), - attrs={'id': 'table'}) + dfs = self.read_html( + file_path_to_url(os.path.abspath(url)), + match=re.compile(re.compile("Florida")), + attrs={"id": "table"}, + ) assert isinstance(dfs, list) for df in dfs: assert isinstance(df, DataFrame) def test_negative_skiprows(self): - msg = r'\(you passed a negative value\)' + msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(self.spam_data, 'Water', skiprows=-1) + self.read_html(self.spam_data, "Water", skiprows=-1) @network def test_multiple_matches(self): - url = 'https://docs.python.org/2/' - dfs = self.read_html(url, match='Python') + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") assert len(dfs) > 1 @network def test_python_docs_table(self): - url = 'https://docs.python.org/2/' - dfs = self.read_html(url, match='Python') + url = "https://docs.python.org/2/" + dfs = self.read_html(url, match="Python") zz = [df.iloc[0, 0][0:4] for df in dfs] - assert sorted(zz) == sorted(['Repo', 'What']) + assert sorted(zz) == sorted(["Repo", "What"]) @pytest.mark.slow def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 macau_data = datapath("io", "data", "macau.html") - dfs = self.read_html(macau_data, index_col=0, - attrs={'class': 'style1'}) + dfs = self.read_html(macau_data, index_col=0, attrs={"class": "style1"}) df = dfs[all_non_nan_table_index] assert not any(s.isna().any() for _, s in df.iteritems()) @@ -365,7 +387,7 @@ def test_thousands_macau_stats(self, datapath): @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - 
macau_data = datapath('io', 'data', 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -375,7 +397,8 @@ def test_empty_tables(self): """ Make sure that read_html ignores empty tables. """ - result = self.read_html(''' + result = self.read_html( + """ @@ -394,14 +417,16 @@ def test_empty_tables(self):
- ''') + """ + ) assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. - result = self.read_html(''' + result = self.read_html( + """
@@ -420,9 +445,10 @@ def test_multiple_tbody(self): -
A4
''')[0] + """ + )[0] - expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B']) + expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -431,7 +457,8 @@ def test_header_and_one_column(self): Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - result = self.read_html(''' + result = self.read_html( + """
@@ -442,9 +469,10 @@ def test_header_and_one_column(self): -
Headerfirst
''')[0] + """ + )[0] - expected = DataFrame(data={'Header': 'first'}, index=[0]) + expected = DataFrame(data={"Header": "first"}, index=[0]) tm.assert_frame_equal(result, expected) @@ -452,7 +480,8 @@ def test_thead_without_tr(self): """ Ensure parser adds within on malformed HTML. """ - result = self.read_html(''' + result = self.read_html( + """
@@ -467,10 +496,13 @@ def test_thead_without_tr(self): -
Country1944
''')[0] + """ + )[0] - expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]], - columns=['Country', 'Municipality', 'Year']) + expected = DataFrame( + data=[["Ukraine", "Odessa", 1944]], + columns=["Country", "Municipality", "Year"], + ) tm.assert_frame_equal(result, expected) @@ -479,7 +511,7 @@ def test_tfoot_read(self): Make sure that read_html reads tfoot, containing td or th. Ignores empty tfoot """ - data_template = ''' + data_template = """
@@ -495,16 +527,16 @@ def test_tfoot_read(self): {footer} -
A
''' + """ - expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B']) + expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"]) - expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']], - columns=['A', 'B']) + expected2 = DataFrame( + data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"] + ) data1 = data_template.format(footer="") - data2 = data_template.format( - footer="footAfootB") + data2 = data_template.format(footer="footAfootB") result1 = self.read_html(data1)[0] result2 = self.read_html(data2)[0] @@ -515,7 +547,8 @@ def test_tfoot_read(self): def test_parse_header_of_non_string_column(self): # GH5048: if header is specified explicitly, an int column should be # parsed as int while its header is parsed as str - result = self.read_html(''' + result = self.read_html( + """ @@ -526,19 +559,27 @@ def test_parse_header_of_non_string_column(self):
S1944
- ''', header=0)[0] + """, + header=0, + )[0] - expected = DataFrame([['text', 1944]], columns=('S', 'I')) + expected = DataFrame([["text", 1944]], columns=("S", "I")) tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): - data = datapath('io', 'data', 'nyse_wsj.html') - df = self.read_html(data, index_col=0, header=0, - attrs={'class': 'mdcTable'})[0] - - expected = Index(['Issue(Roll over for charts and headlines)', - 'Volume', 'Price', 'Chg', '% Chg']) + data = datapath("io", "data", "nyse_wsj.html") + df = self.read_html(data, index_col=0, header=0, attrs={"class": "mdcTable"})[0] + + expected = Index( + [ + "Issue(Roll over for charts and headlines)", + "Volume", + "Price", + "Chg", + "% Chg", + ] + ) nrows = 100 assert df.shape[0] == nrows tm.assert_index_equal(df.columns, expected) @@ -553,48 +594,57 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'})[0] - ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), - converters={'Updated Date': Timestamp, - 'Closing Date': Timestamp}) + df = self.read_html(self.banklist_data, "Metcalf", attrs={"id": "table"})[0] + ground_truth = read_csv( + datapath("io", "data", "banklist.csv"), + converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, + ) assert df.shape == ground_truth.shape - old = ['First Vietnamese American BankIn Vietnamese', - 'Westernbank Puerto RicoEn Espanol', - 'R-G Premier Bank of Puerto RicoEn Espanol', - 'EurobankEn Espanol', 'Sanderson State BankEn Espanol', - 'Washington Mutual Bank(Including its subsidiary Washington ' - 'Mutual Bank FSB)', - 'Silver State BankEn Espanol', - 'AmTrade International BankEn Espanol', - 'Hamilton Bank, NAEn Espanol', - 'The Citizens Savings BankPioneer Community Bank, Inc.'] - new = ['First Vietnamese American Bank', 'Westernbank Puerto Rico', - 'R-G Premier Bank of Puerto Rico', 'Eurobank', - 'Sanderson State Bank', 'Washington Mutual Bank', - 'Silver State Bank', 'AmTrade International Bank', - 'Hamilton Bank, NA', 'The Citizens Savings Bank'] + old = [ + "First Vietnamese American BankIn Vietnamese", + "Westernbank Puerto RicoEn Espanol", + "R-G Premier Bank of Puerto RicoEn Espanol", + "EurobankEn Espanol", + "Sanderson State BankEn Espanol", + "Washington Mutual Bank(Including its subsidiary Washington " + "Mutual Bank FSB)", + "Silver State BankEn Espanol", + "AmTrade International BankEn Espanol", + "Hamilton Bank, NAEn Espanol", + "The Citizens Savings BankPioneer Community Bank, Inc.", + ] + new = [ + "First Vietnamese American Bank", + "Westernbank Puerto Rico", + "R-G Premier Bank of Puerto Rico", + "Eurobank", + "Sanderson State Bank", + "Washington Mutual Bank", + "Silver State Bank", + "AmTrade International Bank", + "Hamilton Bank, NA", + "The Citizens Savings Bank", + ] dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew._convert(datetime=True, numeric=True) - date_cols = ['Closing Date', 'Updated Date'] - converted[date_cols] = converted[date_cols]._convert(datetime=True, - coerce=True) + date_cols = ["Closing Date", "Updated Date"] + converted[date_cols] = converted[date_cols]._convert(datetime=True, coerce=True) tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow def test_gold_canyon(self): - gc = 'Gold Canyon' - with open(self.banklist_data, 'r') as f: + gc = "Gold Canyon" + with open(self.banklist_data, "r") as f: raw_text = f.read() assert gc in raw_text - df = 
self.read_html(self.banklist_data, 'Gold Canyon', - attrs={'id': 'table'})[0] + df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0] assert gc in df.to_string() def test_different_number_of_cols(self): - expected = self.read_html(""" + expected = self.read_html( + """
@@ -623,9 +673,12 @@ def test_different_number_of_cols(self): -
0.222
""", index_col=0)[0] + """, + index_col=0, + )[0] - result = self.read_html(""" + result = self.read_html( + """
@@ -651,13 +704,16 @@ def test_different_number_of_cols(self): -
0.222
""", index_col=0)[0] + """, + index_col=0, + )[0] tm.assert_frame_equal(result, expected) def test_colspan_rowspan_1(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """ @@ -670,9 +726,10 @@ def test_colspan_rowspan_1(self):
Ac
- """)[0] + """ + )[0] - expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C']) + expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"]) tm.assert_frame_equal(result, expected) @@ -684,7 +741,8 @@ def test_colspan_rowspan_copy_values(self): # X x Y Z W # A B b z C - result = self.read_html(""" + result = self.read_html( + """ @@ -698,10 +756,13 @@ def test_colspan_rowspan_copy_values(self):
XC
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']], - columns=['X', 'X.1', 'Y', 'Z', 'W']) + expected = DataFrame( + data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"] + ) tm.assert_frame_equal(result, expected) @@ -713,7 +774,8 @@ def test_colspan_rowspan_both_not_1(self): # A B b b C # a b b b D - result = self.read_html(""" + result = self.read_html( + """ @@ -724,10 +786,13 @@ def test_colspan_rowspan_both_not_1(self):
AD
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']], - columns=['A', 'B', 'B.1', 'B.2', 'C']) + expected = DataFrame( + data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"] + ) tm.assert_frame_equal(result, expected) @@ -739,7 +804,8 @@ def test_rowspan_at_end_of_row(self): # A B # C b - result = self.read_html(""" + result = self.read_html( + """ @@ -749,32 +815,37 @@ def test_rowspan_at_end_of_row(self):
AC
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['C', 'B']], columns=['A', 'B']) + expected = DataFrame(data=[["C", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_rowspan_only_rows(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """
A B
- """, header=0)[0] + """, + header=0, + )[0] - expected = DataFrame(data=[['A', 'B'], ['A', 'B']], - columns=['A', 'B']) + expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_header_inferred_from_rows_with_only_th(self): # GH17054 - result = self.read_html(""" + result = self.read_html( + """ @@ -789,53 +860,63 @@ def test_header_inferred_from_rows_with_only_th(self):
A2
- """)[0] + """ + )[0] - columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - codes=[[0, 1], [0, 1]]) + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_parse_dates_list(self): - df = DataFrame({'date': date_range('1/1/2001', periods=10)}) + df = DataFrame({"date": date_range("1/1/2001", periods=10)}) expected = df.to_html() res = self.read_html(expected, parse_dates=[1], index_col=0) tm.assert_frame_equal(df, res[0]) - res = self.read_html(expected, parse_dates=['date'], index_col=0) + res = self.read_html(expected, parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) def test_parse_dates_combine(self): - raw_dates = Series(date_range('1/1/2001', periods=10)) - df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())), - 'time': raw_dates.map(lambda x: str(x.time()))}) - res = self.read_html(df.to_html(), parse_dates={'datetime': [1, 2]}, - index_col=1) - newdf = DataFrame({'datetime': raw_dates}) + raw_dates = Series(date_range("1/1/2001", periods=10)) + df = DataFrame( + { + "date": raw_dates.map(lambda x: str(x.date())), + "time": raw_dates.map(lambda x: str(x.time())), + } + ) + res = self.read_html( + df.to_html(), parse_dates={"datetime": [1, 2]}, index_col=1 + ) + newdf = DataFrame({"datetime": raw_dates}) tm.assert_frame_equal(newdf, res[0]) def test_computer_sales_page(self, datapath): - data = datapath('io', 'data', 'computer_sales_page.html') - msg = (r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns") + data = datapath("io", "data", "computer_sales_page.html") + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): self.read_html(data, header=[0, 1]) - data = datapath('io', 'data', 'computer_sales_page.html') + data = datapath("io", "data", "computer_sales_page.html") assert self.read_html(data, header=[1, 2]) def test_wikipedia_states_table(self, datapath): - data = datapath('io', 'data', 'wikipedia_states.html') - assert os.path.isfile(data), '%r is not a file' % data - assert os.path.getsize(data), '%r is an empty file' % data - result = self.read_html(data, 'Arizona', header=1)[0] - assert result['sq mi'].dtype == np.dtype('float64') + data = datapath("io", "data", "wikipedia_states.html") + assert os.path.isfile(data), "%r is not a file" % data + assert os.path.getsize(data), "%r is an empty file" % data + result = self.read_html(data, "Arizona", header=1)[0] + assert result["sq mi"].dtype == np.dtype("float64") def test_parser_error_on_empty_header_row(self): - msg = (r"Passed header=\[0,1\] are too many " - r"rows for this multi_index of columns") + msg = ( + r"Passed header=\[0,1\] are too many " + r"rows for this multi_index of columns" + ) with pytest.raises(ParserError, match=msg): - self.read_html(""" + self.read_html( + """ @@ -845,11 +926,14 @@ def test_parser_error_on_empty_header_row(self):
ab
- """, header=[0, 1]) + """, + header=[0, 1], + ) def test_decimal_rows(self): # GH 12907 - result = self.read_html(''' + result = self.read_html( + """ @@ -864,11 +948,13 @@ def test_decimal_rows(self):
- ''', decimal='#')[0] + """, + decimal="#", + )[0] - expected = DataFrame(data={'Header': 1100.101}, index=[0]) + expected = DataFrame(data={"Header": 1100.101}, index=[0]) - assert result['Header'].dtype == np.dtype('float64') + assert result["Header"].dtype == np.dtype("float64") tm.assert_frame_equal(result, expected) def test_bool_header_arg(self): @@ -895,10 +981,10 @@ def test_converters(self): """, - converters={'a': str} + converters={"a": str}, )[0] - expected = DataFrame({'a': ['0.763', '0.244']}) + expected = DataFrame({"a": ["0.763", "0.244"]}) tm.assert_frame_equal(result, expected) @@ -920,9 +1006,10 @@ def test_na_values(self): """, - na_values=[0.244])[0] + na_values=[0.244], + )[0] - expected = DataFrame({'a': [0.763, np.nan]}) + expected = DataFrame({"a": [0.763, np.nan]}) tm.assert_frame_equal(result, expected) @@ -943,16 +1030,17 @@ def test_keep_default_na(self): """ - expected_df = DataFrame({'a': ['N/A', 'NA']}) + expected_df = DataFrame({"a": ["N/A", "NA"]}) html_df = self.read_html(html_data, keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) - expected_df = DataFrame({'a': [np.nan, np.nan]}) + expected_df = DataFrame({"a": [np.nan, np.nan]}) html_df = self.read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_preserve_empty_rows(self): - result = self.read_html(""" + result = self.read_html( + """ @@ -967,15 +1055,16 @@ def test_preserve_empty_rows(self):
A
- """)[0] + """ + )[0] - expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]], - columns=['A', 'B']) + expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"]) tm.assert_frame_equal(result, expected) def test_ignore_empty_rows_when_inferring_header(self): - result = self.read_html(""" + result = self.read_html( + """ @@ -986,50 +1075,56 @@ def test_ignore_empty_rows_when_inferring_header(self):
12
- """)[0] + """ + )[0] - columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], - codes=[[0, 1], [0, 1]]) + columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]]) expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) def test_multiple_header_rows(self): # Issue #13434 - expected_df = DataFrame(data=[("Hillary", 68, "D"), - ("Bernie", 74, "D"), - ("Donald", 69, "R")]) - expected_df.columns = [["Unnamed: 0_level_0", "Age", "Party"], - ["Name", "Unnamed: 1_level_1", - "Unnamed: 2_level_1"]] + expected_df = DataFrame( + data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")] + ) + expected_df.columns = [ + ["Unnamed: 0_level_0", "Age", "Party"], + ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"], + ] html = expected_df.to_html(index=False) - html_df = self.read_html(html, )[0] + html_df = self.read_html(html)[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): - filename = datapath('io', 'data', 'valid_markup.html') + filename = datapath("io", "data", "valid_markup.html") dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow def test_fallback_success(self, datapath): - banklist_data = datapath('io', 'data', 'banklist.html') - self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) + banklist_data = datapath("io", "data", "banklist.html") + self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"]) def test_to_html_timestamp(self): - rng = date_range('2000-01-01', periods=10) + rng = date_range("2000-01-01", periods=10) df = DataFrame(np.random.randn(10, 4), index=rng) result = df.to_html() - assert '2000-01-01' in result - - @pytest.mark.parametrize("displayed_only,exp0,exp1", [ - (True, DataFrame(["foo"]), None), - (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + assert "2000-01-01" in result + + @pytest.mark.parametrize( + "displayed_only,exp0,exp1", + [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"])), + ], + ) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 - data = StringIO(""" + data = StringIO( + """ @@ -1047,7 +1142,8 @@ def test_displayed_only(self, displayed_only, exp0, exp1):
- """) + """ + ) dfs = self.read_html(data, displayed_only=displayed_only) tm.assert_frame_equal(dfs[0], exp0) @@ -1058,49 +1154,51 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table def test_encode(self, html_encoding_file): - _, encoding = os.path.splitext( - os.path.basename(html_encoding_file) - )[0].split('_') + _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split( + "_" + ) try: - with open(html_encoding_file, 'rb') as fobj: - from_string = self.read_html(fobj.read(), encoding=encoding, - index_col=0).pop() - - with open(html_encoding_file, 'rb') as fobj: - from_file_like = self.read_html(BytesIO(fobj.read()), - encoding=encoding, - index_col=0).pop() - - from_filename = self.read_html(html_encoding_file, - encoding=encoding, - index_col=0).pop() + with open(html_encoding_file, "rb") as fobj: + from_string = self.read_html( + fobj.read(), encoding=encoding, index_col=0 + ).pop() + + with open(html_encoding_file, "rb") as fobj: + from_file_like = self.read_html( + BytesIO(fobj.read()), encoding=encoding, index_col=0 + ).pop() + + from_filename = self.read_html( + html_encoding_file, encoding=encoding, index_col=0 + ).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) except Exception: # seems utf-16/32 fail on windows if is_platform_windows(): - if '16' in encoding or '32' in encoding: + if "16" in encoding or "32" in encoding: pytest.skip() raise def test_parse_failure_unseekable(self): # Issue #17975 - if self.read_html.keywords.get('flavor') == 'lxml': + if self.read_html.keywords.get("flavor") == "lxml": pytest.skip("Not applicable for lxml") class UnseekableStringIO(StringIO): def seekable(self): return False - bad = UnseekableStringIO(''' -
<table><tr><td>spam<foobr />eggs</td></tr></table>
''') + bad = UnseekableStringIO( + """ +
<table><tr><td>spam<foobr />eggs</td></tr></table>
""" + ) assert self.read_html(bad) - with pytest.raises(ValueError, - match='passed a non-rewindable file object'): + with pytest.raises(ValueError, match="passed a non-rewindable file object"): self.read_html(bad) def test_parse_failure_rewinds(self): @@ -1112,7 +1210,7 @@ def __init__(self, data): self.at_end = False def read(self, size=None): - data = '' if self.at_end else self.data + data = "" if self.at_end else self.data self.at_end = True return data @@ -1122,8 +1220,8 @@ def seek(self, offset): def seekable(self): return True - good = MockFile('
<table><tr><td>spam<br />eggs</td></tr></table>
') - bad = MockFile('
<table><tr><td>spam<foobr />eggs</td></tr></table>
') + good = MockFile("
<table><tr><td>spam<br />eggs</td></tr></table>
") + bad = MockFile("
<table><tr><td>spam<foobr />eggs</td></tr></table>
") assert self.read_html(good) assert self.read_html(bad) @@ -1144,7 +1242,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = datapath('io', 'data', 'valid_markup.html') + filename = datapath("io", "data", "valid_markup.html") helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 203b550b8936a..83c11cd9ab996 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -12,12 +12,27 @@ import pandas from pandas import ( - Categorical, DataFrame, Index, Interval, MultiIndex, NaT, Period, Series, - Timestamp, bdate_range, date_range, period_range) + Categorical, + DataFrame, + Index, + Interval, + MultiIndex, + NaT, + Period, + Series, + Timestamp, + bdate_range, + date_range, + period_range, +) import pandas.util.testing as tm from pandas.util.testing import ( - assert_categorical_equal, assert_frame_equal, assert_index_equal, - assert_series_equal, ensure_clean) + assert_categorical_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, + ensure_clean, +) from pandas.io.packers import read_msgpack, to_msgpack @@ -38,26 +53,26 @@ _ZLIB_INSTALLED = True -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def current_packers_data(): # our current version packers data - from pandas.tests.io.generate_legacy_storage_files import ( - create_msgpack_data) + from pandas.tests.io.generate_legacy_storage_files import create_msgpack_data + return create_msgpack_data() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def all_packers_data(): # our all of our current version packers data - from pandas.tests.io.generate_legacy_storage_files import ( - create_data) + from pandas.tests.io.generate_legacy_storage_files import create_data + return create_data() def check_arbitrary(a, b): if isinstance(a, (list, tuple)) and isinstance(b, (list, tuple)): - assert(len(a) == len(b)) + assert len(a) == len(b) for a_, b_ in zip(a, b): check_arbitrary(a_, b_) elif isinstance(a, DataFrame): @@ -70,7 +85,7 @@ def check_arbitrary(a, b): # Temp, # Categorical.categories is changed from str to bytes in PY3 # maybe the same as GH 13591 - if b.categories.inferred_type == 'string': + if b.categories.inferred_type == "string": pass else: tm.assert_categorical_equal(a, b) @@ -80,14 +95,13 @@ def check_arbitrary(a, b): assert a == b assert a.freq == b.freq else: - assert(a == b) + assert a == b @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestPackers: - def setup_method(self, method): - self.path = '__%s__.msg' % tm.rands(10) + self.path = "__%s__.msg" % tm.rands(10) def teardown_method(self, method): pass @@ -100,7 +114,6 @@ def encode_decode(self, x, compress=None, **kwargs): @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestAPI(TestPackers): - def test_string_io(self): df = DataFrame(np.random.randn(10, 2)) @@ -123,7 +136,7 @@ def test_string_io(self): with ensure_clean(self.path) as p: s = df.to_msgpack() - with open(p, 'wb') as fh: + with open(p, "wb") as fh: fh.write(s) result = read_msgpack(p) tm.assert_frame_equal(result, df) @@ -148,22 +161,20 @@ def test_iterator_with_string_io(self): def test_invalid_arg(self): # GH10369 class A: - def __init__(self): self.read = 0 msg = "Invalid file path or buffer object type: " - with pytest.raises(ValueError, 
match=msg.format('NoneType')): + with pytest.raises(ValueError, match=msg.format("NoneType")): read_msgpack(path_or_buf=None) - with pytest.raises(ValueError, match=msg.format('dict')): + with pytest.raises(ValueError, match=msg.format("dict")): read_msgpack(path_or_buf={}) - with pytest.raises(ValueError, match=msg.format(r'.*\.A')): + with pytest.raises(ValueError, match=msg.format(r".*\.A")): read_msgpack(path_or_buf=A()) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestNumpy(TestPackers): - def test_numpy_scalar_float(self): x = np.float32(np.random.rand()) x_rec = self.encode_decode(x) @@ -203,12 +214,12 @@ def test_list_numpy_float(self): tm.assert_almost_equal(tuple(x), x_rec) def test_list_numpy_float_complex(self): - if not hasattr(np, 'complex128'): - pytest.skip('numpy can not handle complex128') + if not hasattr(np, "complex128"): + pytest.skip("numpy can not handle complex128") - x = [np.float32(np.random.rand()) for i in range(5)] + \ - [np.complex128(np.random.rand() + 1j * np.random.rand()) - for i in range(5)] + x = [np.float32(np.random.rand()) for i in range(5)] + [ + np.complex128(np.random.rand() + 1j * np.random.rand()) for i in range(5) + ] x_rec = self.encode_decode(x) assert np.allclose(x, x_rec) @@ -222,18 +233,19 @@ def test_list_float(self): tm.assert_almost_equal(tuple(x), x_rec) def test_list_float_complex(self): - x = [np.random.rand() for i in range(5)] + \ - [(np.random.rand() + 1j * np.random.rand()) for i in range(5)] + x = [np.random.rand() for i in range(5)] + [ + (np.random.rand() + 1j * np.random.rand()) for i in range(5) + ] x_rec = self.encode_decode(x) assert np.allclose(x, x_rec) def test_dict_float(self): - x = {'foo': 1.0, 'bar': 2.0} + x = {"foo": 1.0, "bar": 2.0} x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) def test_dict_complex(self): - x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j} + x = {"foo": 1.0 + 1.0j, "bar": 2.0 + 2.0j} x_rec = self.encode_decode(x) tm.assert_dict_equal(x, x_rec) @@ -241,13 +253,12 @@ def test_dict_complex(self): tm.assert_class_equal(x[key], x_rec[key], obj="complex value") def test_dict_numpy_float(self): - x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)} + x = {"foo": np.float32(1.0), "bar": np.float32(2.0)} x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) def test_dict_numpy_complex(self): - x = {'foo': np.complex128(1.0 + 1.0j), - 'bar': np.complex128(2.0 + 2.0j)} + x = {"foo": np.complex128(1.0 + 1.0j), "bar": np.complex128(2.0 + 2.0j)} x_rec = self.encode_decode(x) tm.assert_dict_equal(x, x_rec) @@ -259,7 +270,7 @@ def test_numpy_array_float(self): # run multiple times for n in range(10): x = np.random.rand(10) - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: x = x.astype(dtype) x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) @@ -267,11 +278,10 @@ def test_numpy_array_float(self): def test_numpy_array_complex(self): x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128) x_rec = self.encode_decode(x) - assert (all(map(lambda x, y: x == y, x, x_rec)) and - x.dtype == x_rec.dtype) + assert all(map(lambda x, y: x == y, x, x_rec)) and x.dtype == x_rec.dtype def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), 'foo', np.bool_(1)] + x = [1.0, np.float32(3.5), np.complex128(4.25), "foo", np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -282,12 +292,13 @@ def test_list_mixed(self): 
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestBasic(TestPackers): - def test_timestamp(self): - for i in [Timestamp( - '20130101'), Timestamp('20130101', tz='US/Eastern'), - Timestamp('201301010501')]: + for i in [ + Timestamp("20130101"), + Timestamp("20130101", tz="US/Eastern"), + Timestamp("201301010501"), + ]: i_rec = self.encode_decode(i) assert i == i_rec @@ -297,62 +308,70 @@ def test_nat(self): def test_datetimes(self): - for i in [datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 1, 5, 1), - datetime.date(2013, 1, 1), - np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]: + for i in [ + datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 1, 5, 1), + datetime.date(2013, 1, 1), + np.datetime64(datetime.datetime(2013, 1, 5, 2, 15)), + ]: i_rec = self.encode_decode(i) assert i == i_rec def test_timedeltas(self): - for i in [datetime.timedelta(days=1), - datetime.timedelta(days=1, seconds=10), - np.timedelta64(1000000)]: + for i in [ + datetime.timedelta(days=1), + datetime.timedelta(days=1, seconds=10), + np.timedelta64(1000000), + ]: i_rec = self.encode_decode(i) assert i == i_rec def test_periods(self): # 13463 - for i in [Period('2010-09', 'M'), Period('2014-Q1', 'Q')]: + for i in [Period("2010-09", "M"), Period("2014-Q1", "Q")]: i_rec = self.encode_decode(i) assert i == i_rec def test_intervals(self): # 19967 - for i in [Interval(0, 1), Interval(0, 1, 'left'), - Interval(10, 25., 'right')]: + for i in [Interval(0, 1), Interval(0, 1, "left"), Interval(10, 25.0, "right")]: i_rec = self.encode_decode(i) assert i == i_rec @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestIndex(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = { - 'string': tm.makeStringIndex(100), - 'date': tm.makeDateIndex(100), - 'int': tm.makeIntIndex(100), - 'rng': tm.makeRangeIndex(100), - 'float': tm.makeFloatIndex(100), - 'empty': Index([]), - 'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])), - 'period': Index(period_range('2012-1-1', freq='M', periods=3)), - 'date2': Index(date_range('2013-01-1', periods=10)), - 'bdate': Index(bdate_range('2013-01-02', periods=10)), - 'cat': tm.makeCategoricalIndex(100), - 'interval': tm.makeIntervalIndex(100), - 'timedelta': tm.makeTimedeltaIndex(100, 'H') + "string": tm.makeStringIndex(100), + "date": tm.makeDateIndex(100), + "int": tm.makeIntIndex(100), + "rng": tm.makeRangeIndex(100), + "float": tm.makeFloatIndex(100), + "empty": Index([]), + "tuple": Index(zip(["foo", "bar", "baz"], [1, 2, 3])), + "period": Index(period_range("2012-1-1", freq="M", periods=3)), + "date2": Index(date_range("2013-01-1", periods=10)), + "bdate": Index(bdate_range("2013-01-02", periods=10)), + "cat": tm.makeCategoricalIndex(100), + "interval": tm.makeIntervalIndex(100), + "timedelta": tm.makeTimedeltaIndex(100, "H"), } self.mi = { - 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), - ('foo', 'two'), - ('qux', 'one'), ('qux', 'two')], - names=['first', 'second']), + "reg": MultiIndex.from_tuples( + [ + ("bar", "one"), + ("baz", "two"), + ("foo", "two"), + ("qux", "one"), + ("qux", "two"), + ], + names=["first", "second"], + ) } def test_basic_index(self): @@ -362,13 +381,14 @@ def test_basic_index(self): tm.assert_index_equal(i, i_rec) # datetime with no freq (GH5506) - i = Index([Timestamp('20130101'), Timestamp('20130103')]) + i = Index([Timestamp("20130101"), Timestamp("20130103")]) i_rec = self.encode_decode(i) tm.assert_index_equal(i, i_rec) # datetime with timezone - i 
= Index([Timestamp('20130101 9:00:00'), Timestamp( - '20130103 11:00:00')]).tz_localize('US/Eastern') + i = Index( + [Timestamp("20130101 9:00:00"), Timestamp("20130103 11:00:00")] + ).tz_localize("US/Eastern") i_rec = self.encode_decode(i) tm.assert_index_equal(i, i_rec) @@ -387,52 +407,51 @@ def test_unicode(self): def categorical_index(self): # GH15487 df = DataFrame(np.random.randn(10, 2)) - df = df.astype({0: 'category'}).set_index(0) + df = df.astype({0: "category"}).set_index(0) result = self.encode_decode(df) tm.assert_frame_equal(result, df) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSeries(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = {} s = tm.makeStringSeries() - s.name = 'string' - self.d['string'] = s + s.name = "string" + self.d["string"] = s s = tm.makeObjectSeries() - s.name = 'object' - self.d['object'] = s + s.name = "object" + self.d["object"] = s - s = Series(iNaT, dtype='M8[ns]', index=range(5)) - self.d['date'] = s + s = Series(iNaT, dtype="M8[ns]", index=range(5)) + self.d["date"] = s data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], - 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + - [Timestamp('20130603', tz='CET')] * 3, - 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, - 'H': Categorical([1, 2, 3, 4, 5]), - 'I': Categorical([1, 2, 3, 4, 5], ordered=True), - 'J': (np.bool_(1), 2, 3, 4, 5), + "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], + "F": [Timestamp("20130102", tz="US/Eastern")] * 2 + + [Timestamp("20130603", tz="CET")] * 3, + "G": [Timestamp("20130102", tz="US/Eastern")] * 5, + "H": Categorical([1, 2, 3, 4, 5]), + "I": Categorical([1, 2, 3, 4, 5], ordered=True), + "J": (np.bool_(1), 2, 3, 4, 5), } - self.d['float'] = Series(data['A']) - self.d['int'] = Series(data['B']) - self.d['mixed'] = Series(data['E']) - self.d['dt_tz_mixed'] = Series(data['F']) - self.d['dt_tz'] = Series(data['G']) - self.d['cat_ordered'] = Series(data['H']) - self.d['cat_unordered'] = Series(data['I']) - self.d['numpy_bool_mixed'] = Series(data['J']) + self.d["float"] = Series(data["A"]) + self.d["int"] = Series(data["B"]) + self.d["mixed"] = Series(data["E"]) + self.d["dt_tz_mixed"] = Series(data["F"]) + self.d["dt_tz"] = Series(data["G"]) + self.d["cat_ordered"] = Series(data["H"]) + self.d["cat_unordered"] = Series(data["I"]) + self.d["numpy_bool_mixed"] = Series(data["J"]) def test_basic(self): @@ -445,18 +464,18 @@ def test_basic(self): @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestCategorical(TestPackers): - def setup_method(self, method): super().setup_method(method) self.d = {} - self.d['plain_str'] = Categorical(['a', 'b', 'c', 'd', 'e']) - self.d['plain_str_ordered'] = Categorical(['a', 'b', 'c', 'd', 'e'], - ordered=True) + self.d["plain_str"] = Categorical(["a", "b", "c", "d", "e"]) + self.d["plain_str_ordered"] = Categorical( + ["a", "b", "c", "d", "e"], ordered=True + ) - self.d['plain_int'] = Categorical([5, 6, 7, 8]) - self.d['plain_int_ordered'] = Categorical([5, 6, 7, 8], ordered=True) + self.d["plain_int"] = Categorical([5, 6, 7, 8]) + self.d["plain_int_ordered"] = Categorical([5, 6, 7, 8], ordered=True) def test_basic(self): @@ -469,26 +488,26 @@ def 
test_basic(self): @pytest.mark.filterwarnings("ignore:msgpack:FutureWarning") class TestNDFrame(TestPackers): - def setup_method(self, method): super().setup_method(method) data = { - 'A': [0., 1., 2., 3., np.nan], - 'B': [0, 1, 0, 1, 0], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': date_range('1/1/2009', periods=5), - 'E': [0., 1, Timestamp('20100101'), 'foo', 2.], - 'F': [Timestamp('20130102', tz='US/Eastern')] * 5, - 'G': [Timestamp('20130603', tz='CET')] * 5, - 'H': Categorical(['a', 'b', 'c', 'd', 'e']), - 'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True), + "A": [0.0, 1.0, 2.0, 3.0, np.nan], + "B": [0, 1, 0, 1, 0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": date_range("1/1/2009", periods=5), + "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0], + "F": [Timestamp("20130102", tz="US/Eastern")] * 5, + "G": [Timestamp("20130603", tz="CET")] * 5, + "H": Categorical(["a", "b", "c", "d", "e"]), + "I": Categorical(["a", "b", "c", "d", "e"], ordered=True), } self.frame = { - 'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)), - 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)), - 'mixed': DataFrame(data)} + "float": DataFrame(dict(A=data["A"], B=Series(data["A"]) + 1)), + "int": DataFrame(dict(A=data["B"], B=Series(data["B"]) + 1)), + "mixed": DataFrame(data), + } def test_basic_frame(self): @@ -502,22 +521,31 @@ def test_multi(self): for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) - packed_items = tuple([self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None]) + packed_items = tuple( + [self.frame["float"], self.frame["float"].A, self.frame["float"].B, None] + ) l_rec = self.encode_decode(packed_items) check_arbitrary(packed_items, l_rec) # this is an oddity in that packed lists will be returned as tuples - packed_items = [self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None] + packed_items = [ + self.frame["float"], + self.frame["float"].A, + self.frame["float"].B, + None, + ] l_rec = self.encode_decode(packed_items) assert isinstance(l_rec, tuple) check_arbitrary(packed_items, l_rec) def test_iterator(self): - packed_items = [self.frame['float'], self.frame['float'].A, - self.frame['float'].B, None] + packed_items = [ + self.frame["float"], + self.frame["float"].A, + self.frame["float"].B, + None, + ] with ensure_clean(self.path) as path: to_msgpack(path, *packed_items) @@ -528,22 +556,22 @@ def tests_datetimeindex_freq_issue(self): # GH 5947 # inferring freq on the datetimeindex - df = DataFrame([1, 2, 3], index=date_range('1/1/2013', '1/3/2013')) + df = DataFrame([1, 2, 3], index=date_range("1/1/2013", "1/3/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) - df = DataFrame([1, 2], index=date_range('1/1/2013', '1/2/2013')) + df = DataFrame([1, 2], index=date_range("1/1/2013", "1/2/2013")) result = self.encode_decode(df) assert_frame_equal(result, df) def test_dataframe_duplicate_column_names(self): # GH 9618 - expected_1 = DataFrame(columns=['a', 'a']) + expected_1 = DataFrame(columns=["a", "a"]) expected_2 = DataFrame(columns=[1] * 100) expected_2.loc[0] = np.random.randn(100) expected_3 = DataFrame(columns=[1, 1]) - expected_3.loc[0] = ['abc', np.nan] + expected_3.loc[0] = ["abc", np.nan] result_1 = self.encode_decode(expected_1) result_2 = self.encode_decode(expected_2) @@ -559,7 +587,6 @@ def test_dataframe_duplicate_column_names(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") 
@pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestSparse(TestPackers): - def _check_roundtrip(self, obj, comparator, **kwargs): # currently these are not implemetned @@ -574,16 +601,13 @@ def test_sparse_series(self): s = tm.makeStringSeries() s[3:5] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss, tm.assert_series_equal, check_series_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_series_equal, - check_series_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_series_equal, check_series_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_series_equal, - check_series_type=True) + self._check_roundtrip(ss3, tm.assert_series_equal, check_series_type=True) def test_sparse_frame(self): @@ -592,16 +616,13 @@ def test_sparse_frame(self): s.loc[8:10, -2] = np.nan ss = s.to_sparse() - self._check_roundtrip(ss, tm.assert_frame_equal, - check_frame_type=True) + self._check_roundtrip(ss, tm.assert_frame_equal, check_frame_type=True) - ss2 = s.to_sparse(kind='integer') - self._check_roundtrip(ss2, tm.assert_frame_equal, - check_frame_type=True) + ss2 = s.to_sparse(kind="integer") + self._check_roundtrip(ss2, tm.assert_frame_equal, check_frame_type=True) ss3 = s.to_sparse(fill_value=0) - self._check_roundtrip(ss3, tm.assert_frame_equal, - check_frame_type=True) + self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") @@ -612,6 +633,7 @@ class TestCompression(TestPackers): def setup_method(self, method): try: from sqlalchemy import create_engine + self._create_sql_engine = create_engine except ImportError: self._SQLALCHEMY_INSTALLED = False @@ -620,16 +642,16 @@ def setup_method(self, method): super().setup_method(method) data = { - 'A': np.arange(1000, dtype=np.float64), - 'B': np.arange(1000, dtype=np.int32), - 'C': list(100 * 'abcdefghij'), - 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), - 'E': [datetime.timedelta(days=x) for x in range(1000)], + "A": np.arange(1000, dtype=np.float64), + "B": np.arange(1000, dtype=np.int32), + "C": list(100 * "abcdefghij"), + "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), + "E": [datetime.timedelta(days=x) for x in range(1000)], } self.frame = { - 'float': DataFrame({k: data[k] for k in ['A', 'A']}), - 'int': DataFrame({k: data[k] for k in ['B', 'B']}), - 'mixed': DataFrame(data), + "float": DataFrame({k: data[k] for k in ["A", "A"]}), + "int": DataFrame({k: data[k] for k in ["B", "B"]}), + "mixed": DataFrame(data), } def test_plain(self): @@ -649,16 +671,15 @@ def _test_compression(self, compress): def test_compression_zlib(self): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_compression('zlib') + pytest.skip("no zlib") + self._test_compression("zlib") def test_compression_blosc(self): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_compression('blosc') + pytest.skip("no blosc") + self._test_compression("blosc") - def _test_compression_warns_when_decompress_caches( - self, monkeypatch, compress): + def _test_compression_warns_when_decompress_caches(self, monkeypatch, compress): not_garbage = [] control = [] # copied data @@ -676,19 +697,20 @@ def decompress(ob): # types mapped to values to add in place. 
rhs = { - np.dtype('float64'): 1.0, - np.dtype('int32'): 1, - np.dtype('object'): 'a', - np.dtype('datetime64[ns]'): np.timedelta64(1, 'ns'), - np.dtype('timedelta64[ns]'): np.timedelta64(1, 'ns'), + np.dtype("float64"): 1.0, + np.dtype("int32"): 1, + np.dtype("object"): "a", + np.dtype("datetime64[ns]"): np.timedelta64(1, "ns"), + np.dtype("timedelta64[ns]"): np.timedelta64(1, "ns"), } - with monkeypatch.context() as m, \ - tm.assert_produces_warning(PerformanceWarning) as ws: - m.setattr(compress_module, 'decompress', decompress) + with monkeypatch.context() as m, tm.assert_produces_warning( + PerformanceWarning + ) as ws: + m.setattr(compress_module, "decompress", decompress) with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) i_rec = self.encode_decode(self.frame, compress=compress) for k in self.frame.keys(): @@ -704,9 +726,11 @@ def decompress(ob): for w in ws: # check the messages from our warnings - assert str(w.message) == ('copying data after decompressing; ' - 'this may mean that decompress is ' - 'caching its result') + assert str(w.message) == ( + "copying data after decompressing; " + "this may mean that decompress is " + "caching its result" + ) for buf, control_buf in zip(not_garbage, control): # make sure none of our mutations above affected the @@ -715,121 +739,115 @@ def decompress(ob): def test_compression_warns_when_decompress_caches_zlib(self, monkeypatch): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_compression_warns_when_decompress_caches( - monkeypatch, 'zlib') + pytest.skip("no zlib") + self._test_compression_warns_when_decompress_caches(monkeypatch, "zlib") def test_compression_warns_when_decompress_caches_blosc(self, monkeypatch): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_compression_warns_when_decompress_caches( - monkeypatch, 'blosc') + pytest.skip("no blosc") + self._test_compression_warns_when_decompress_caches(monkeypatch, "blosc") def _test_small_strings_no_warn(self, compress): - empty = np.array([], dtype='uint8') + empty = np.array([], dtype="uint8") with tm.assert_produces_warning(None): with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) empty_unpacked = self.encode_decode(empty, compress=compress) tm.assert_numpy_array_equal(empty_unpacked, empty) assert empty_unpacked.flags.writeable - char = np.array([ord(b'a')], dtype='uint8') + char = np.array([ord(b"a")], dtype="uint8") with tm.assert_produces_warning(None): with catch_warnings(): - filterwarnings('ignore', category=FutureWarning) + filterwarnings("ignore", category=FutureWarning) char_unpacked = self.encode_decode(char, compress=compress) tm.assert_numpy_array_equal(char_unpacked, char) assert char_unpacked.flags.writeable # if this test fails I am sorry because the interpreter is now in a # bad state where b'a' points to 98 == ord(b'b'). - char_unpacked[0] = ord(b'b') + char_unpacked[0] = ord(b"b") # we compare the ord of bytes b'a' with unicode 'a' because the should # always be the same (unless we were able to mutate the shared # character singleton in which case ord(b'a') == ord(b'b'). 
- assert ord(b'a') == ord('a') - tm.assert_numpy_array_equal( - char_unpacked, - np.array([ord(b'b')], dtype='uint8'), - ) + assert ord(b"a") == ord("a") + tm.assert_numpy_array_equal(char_unpacked, np.array([ord(b"b")], dtype="uint8")) def test_small_strings_no_warn_zlib(self): if not _ZLIB_INSTALLED: - pytest.skip('no zlib') - self._test_small_strings_no_warn('zlib') + pytest.skip("no zlib") + self._test_small_strings_no_warn("zlib") def test_small_strings_no_warn_blosc(self): if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - self._test_small_strings_no_warn('blosc') + pytest.skip("no blosc") + self._test_small_strings_no_warn("blosc") def test_readonly_axis_blosc(self): # GH11880 if not _BLOSC_INSTALLED: - pytest.skip('no blosc') - df1 = DataFrame({'A': list('abcd')}) - df2 = DataFrame(df1, index=[1., 2., 3., 4.]) - assert 1 in self.encode_decode(df1['A'], compress='blosc') - assert 1. in self.encode_decode(df2['A'], compress='blosc') + pytest.skip("no blosc") + df1 = DataFrame({"A": list("abcd")}) + df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) + assert 1 in self.encode_decode(df1["A"], compress="blosc") + assert 1.0 in self.encode_decode(df2["A"], compress="blosc") def test_readonly_axis_zlib(self): # GH11880 - df1 = DataFrame({'A': list('abcd')}) - df2 = DataFrame(df1, index=[1., 2., 3., 4.]) - assert 1 in self.encode_decode(df1['A'], compress='zlib') - assert 1. in self.encode_decode(df2['A'], compress='zlib') + df1 = DataFrame({"A": list("abcd")}) + df2 = DataFrame(df1, index=[1.0, 2.0, 3.0, 4.0]) + assert 1 in self.encode_decode(df1["A"], compress="zlib") + assert 1.0 in self.encode_decode(df2["A"], compress="zlib") def test_readonly_axis_blosc_to_sql(self): # GH11880 if not _BLOSC_INSTALLED: - pytest.skip('no blosc') + pytest.skip("no blosc") if not self._SQLALCHEMY_INSTALLED: - pytest.skip('no sqlalchemy') - expected = DataFrame({'A': list('abcd')}) - df = self.encode_decode(expected, compress='blosc') + pytest.skip("no sqlalchemy") + expected = DataFrame({"A": list("abcd")}) + df = self.encode_decode(expected, compress="blosc") eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql('test', eng, if_exists='append') - result = pandas.read_sql_table('test', eng, index_col='index') + df.to_sql("test", eng, if_exists="append") + result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) def test_readonly_axis_zlib_to_sql(self): # GH11880 if not _ZLIB_INSTALLED: - pytest.skip('no zlib') + pytest.skip("no zlib") if not self._SQLALCHEMY_INSTALLED: - pytest.skip('no sqlalchemy') - expected = DataFrame({'A': list('abcd')}) - df = self.encode_decode(expected, compress='zlib') + pytest.skip("no sqlalchemy") + expected = DataFrame({"A": list("abcd")}) + df = self.encode_decode(expected, compress="zlib") eng = self._create_sql_engine("sqlite:///:memory:") - df.to_sql('test', eng, if_exists='append') - result = pandas.read_sql_table('test', eng, index_col='index') + df.to_sql("test", eng, if_exists="append") + result = pandas.read_sql_table("test", eng, index_col="index") result.index.names = [None] assert_frame_equal(expected, result) @pytest.mark.filterwarnings("ignore:.*msgpack:FutureWarning") class TestEncoding(TestPackers): - def setup_method(self, method): super().setup_method(method) data = { - 'A': ['\u2019'] * 1000, - 'B': np.arange(1000, dtype=np.int32), - 'C': list(100 * 'abcdefghij'), - 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), - 'E': [datetime.timedelta(days=x) for x in 
range(1000)], - 'G': [400] * 1000 + "A": ["\u2019"] * 1000, + "B": np.arange(1000, dtype=np.int32), + "C": list(100 * "abcdefghij"), + "D": date_range(datetime.datetime(2015, 4, 1), periods=1000), + "E": [datetime.timedelta(days=x) for x in range(1000)], + "G": [400] * 1000, } self.frame = { - 'float': DataFrame({k: data[k] for k in ['A', 'A']}), - 'int': DataFrame({k: data[k] for k in ['B', 'B']}), - 'mixed': DataFrame(data), + "float": DataFrame({k: data[k] for k in ["A", "A"]}), + "int": DataFrame({k: data[k] for k in ["B", "B"]}), + "mixed": DataFrame(data), } - self.utf_encodings = ['utf8', 'utf16', 'utf32'] + self.utf_encodings = ["utf8", "utf16", "utf32"] def test_utf(self): # GH10581 @@ -841,14 +859,15 @@ def test_utf(self): def test_default_encoding(self): for frame in self.frame.values(): result = frame.to_msgpack() - expected = frame.to_msgpack(encoding='utf8') + expected = frame.to_msgpack(encoding="utf8") assert result == expected result = self.encode_decode(frame) assert_frame_equal(result, frame) -files = glob.glob(os.path.join(os.path.dirname(__file__), "data", - "legacy_msgpack", "*", "*.msgpack")) +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_msgpack", "*", "*.msgpack") +) @pytest.fixture(params=files) @@ -869,11 +888,12 @@ class TestMsgpack: 3. Move the created pickle to "data/legacy_msgpack/" directory. """ - minimum_structure = {'series': ['float', 'int', 'mixed', - 'ts', 'mi', 'dup'], - 'frame': ['float', 'int', 'mixed', 'mi'], - 'index': ['int', 'date', 'period'], - 'mi': ['reg2']} + minimum_structure = { + "series": ["float", "int", "mixed", "ts", "mi", "dup"], + "frame": ["float", "int", "mixed", "mi"], + "index": ["int", "date", "period"], + "mi": ["reg2"], + } def check_min_structure(self, data, version): for typ, v in self.minimum_structure.items(): @@ -888,12 +908,13 @@ def compare(self, current_data, all_data, vf, version): self.check_min_structure(data, version) for typ, dv in data.items(): - assert typ in all_data, ('unpacked data contains ' - 'extra key "{0}"' - .format(typ)) + assert typ in all_data, "unpacked data contains " 'extra key "{0}"'.format( + typ + ) for dt, result in dv.items(): - assert dt in current_data[typ], ('data["{0}"] contains extra ' - 'key "{1}"'.format(typ, dt)) + assert ( + dt in current_data[typ] + ), 'data["{0}"] contains extra ' 'key "{1}"'.format(typ, dt) try: expected = current_data[typ][dt] except KeyError: @@ -916,21 +937,23 @@ def compare_series_dt_tz(self, result, expected, typ, version): def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): tm.assert_frame_equal(result, expected) - def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - legacy_packer, datapath): + def test_msgpacks_legacy( + self, current_packers_data, all_packers_data, legacy_packer, datapath + ): version = os.path.basename(os.path.dirname(legacy_packer)) try: with catch_warnings(record=True): - self.compare(current_packers_data, all_packers_data, - legacy_packer, version) + self.compare( + current_packers_data, all_packers_data, legacy_packer, version + ) except ImportError: # blosc not installed pass def test_msgpack_period_freq(self): # https://github.com/pandas-dev/pandas/issues/24135 - s = Series(np.random.rand(5), index=date_range('20130101', periods=5)) + s = Series(np.random.rand(5), index=date_range("20130101", periods=5)) r = read_msgpack(s.to_msgpack()) repr(r) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f5f8dac71d095..f3e045be2e790 100644 
--- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -12,31 +12,45 @@ from pandas.util import testing as tm from pandas.io.parquet import ( - FastParquetImpl, PyArrowImpl, get_engine, read_parquet, to_parquet) + FastParquetImpl, + PyArrowImpl, + get_engine, + read_parquet, + to_parquet, +) try: import pyarrow # noqa + _HAVE_PYARROW = True except ImportError: _HAVE_PYARROW = False try: import fastparquet # noqa + _HAVE_FASTPARQUET = True except ImportError: _HAVE_FASTPARQUET = False # setup engines & skips -@pytest.fixture(params=[ - pytest.param('fastparquet', - marks=pytest.mark.skipif(not _HAVE_FASTPARQUET, - reason='fastparquet is ' - 'not installed')), - pytest.param('pyarrow', - marks=pytest.mark.skipif(not _HAVE_PYARROW, - reason='pyarrow is ' - 'not installed'))]) +@pytest.fixture( + params=[ + pytest.param( + "fastparquet", + marks=pytest.mark.skipif( + not _HAVE_FASTPARQUET, reason="fastparquet is " "not installed" + ), + ), + pytest.param( + "pyarrow", + marks=pytest.mark.skipif( + not _HAVE_PYARROW, reason="pyarrow is " "not installed" + ), + ), + ] +) def engine(request): return request.param @@ -45,59 +59,73 @@ def engine(request): def pa(): if not _HAVE_PYARROW: pytest.skip("pyarrow is not installed") - return 'pyarrow' + return "pyarrow" @pytest.fixture def fp(): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") - return 'fastparquet' + return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'}) + return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) @pytest.fixture def df_cross_compat(): - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - # 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3), - # 'g': pd.date_range('20130101', periods=3, - # tz='US/Eastern'), - # 'h': pd.date_range('20130101', periods=3, freq='ns') - }) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + # 'c': np.arange(3, 6).astype('u1'), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + # 'g': pd.date_range('20130101', periods=3, + # tz='US/Eastern'), + # 'h': pd.date_range('20130101', periods=3, freq='ns') + } + ) return df @pytest.fixture def df_full(): return pd.DataFrame( - {'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': ['foo', 'bar', 'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'datetime': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')]}) - - -def check_round_trip(df, engine=None, path=None, - write_kwargs=None, read_kwargs=None, - expected=None, check_names=True, - repeat=2): + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "bytes": [b"foo", b"bar", b"baz"], + "unicode": ["foo", "bar", "baz"], + "int": list(range(1, 4)), + "uint": np.arange(3, 6).astype("u1"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "datetime": pd.date_range("20130101", periods=3), + "datetime_with_nat": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), 
+ ], + } + ) + + +def check_round_trip( + df, + engine=None, + path=None, + write_kwargs=None, + read_kwargs=None, + expected=None, + check_names=True, + repeat=2, +): """Verify parquet serializer and deserializer produce the same results. Performs a pandas to disk and disk to pandas round trip, @@ -119,23 +147,22 @@ def check_round_trip(df, engine=None, path=None, How many times to repeat the test """ - write_kwargs = write_kwargs or {'compression': None} + write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} if expected is None: expected = df if engine: - write_kwargs['engine'] = engine - read_kwargs['engine'] = engine + write_kwargs["engine"] = engine + read_kwargs["engine"] = engine def compare(repeat): for _ in range(repeat): df.to_parquet(path, **write_kwargs) with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) - tm.assert_frame_equal(expected, actual, - check_names=check_names) + tm.assert_frame_equal(expected, actual, check_names=check_names) if path is None: with tm.ensure_clean() as path: @@ -146,48 +173,48 @@ def compare(repeat): def test_invalid_engine(df_compat): with pytest.raises(ValueError): - check_round_trip(df_compat, 'foo', 'bar') + check_round_trip(df_compat, "foo", "bar") def test_options_py(df_compat, pa): # use the set option - with pd.option_context('io.parquet.engine', 'pyarrow'): + with pd.option_context("io.parquet.engine", "pyarrow"): check_round_trip(df_compat) def test_options_fp(df_compat, fp): # use the set option - with pd.option_context('io.parquet.engine', 'fastparquet'): + with pd.option_context("io.parquet.engine", "fastparquet"): check_round_trip(df_compat) def test_options_auto(df_compat, fp, pa): # use the set option - with pd.option_context('io.parquet.engine', 'auto'): + with pd.option_context("io.parquet.engine", "auto"): check_round_trip(df_compat) def test_options_get_engine(fp, pa): - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'pyarrow'): - assert isinstance(get_engine('auto'), PyArrowImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "pyarrow"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'fastparquet'): - assert isinstance(get_engine('auto'), FastParquetImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "fastparquet"): + assert isinstance(get_engine("auto"), FastParquetImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) - with pd.option_context('io.parquet.engine', 'auto'): - assert isinstance(get_engine('auto'), PyArrowImpl) - assert isinstance(get_engine('pyarrow'), PyArrowImpl) - assert isinstance(get_engine('fastparquet'), FastParquetImpl) + with pd.option_context("io.parquet.engine", "auto"): + assert isinstance(get_engine("auto"), PyArrowImpl) + assert isinstance(get_engine("pyarrow"), PyArrowImpl) + assert isinstance(get_engine("fastparquet"), FastParquetImpl) 
def test_cross_engine_pa_fp(df_cross_compat, pa, fp): @@ -200,8 +227,8 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=fp, columns=['a', 'd']) - tm.assert_frame_equal(result, df[['a', 'd']]) + result = read_parquet(path, engine=fp, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) def test_cross_engine_fp_pa(df_cross_compat, pa, fp): @@ -215,12 +242,11 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) - result = read_parquet(path, engine=pa, columns=['a', 'd']) - tm.assert_frame_equal(result, df[['a', 'd']]) + result = read_parquet(path, engine=pa, columns=["a", "d"]) + tm.assert_frame_equal(result, df[["a", "d"]]) class Base: - def check_error_on_write(self, df, engine, exc): # check that we are raising the exception on writing with tm.ensure_clean() as path: @@ -229,68 +255,72 @@ def check_error_on_write(self, df, engine, exc): class TestBasic(Base): - def test_error(self, engine): - for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'), - np.array([1, 2, 3])]: + for obj in [ + pd.Series([1, 2, 3]), + 1, + "foo", + pd.Timestamp("20130101"), + np.array([1, 2, 3]), + ]: self.check_error_on_write(obj, engine, ValueError) def test_columns_dtypes(self, engine): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) # unicode - df.columns = ['foo', 'bar'] + df.columns = ["foo", "bar"] check_round_trip(df, engine) def test_columns_dtypes_invalid(self, engine): - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) # numeric df.columns = [0, 1] self.check_error_on_write(df, engine, ValueError) # bytes - df.columns = [b'foo', b'bar'] + df.columns = [b"foo", b"bar"] self.check_error_on_write(df, engine, ValueError) # python object - df.columns = [datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1)] + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] self.check_error_on_write(df, engine, ValueError) - @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli']) + @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): - if compression == 'snappy': - pytest.importorskip('snappy') + if compression == "snappy": + pytest.importorskip("snappy") - elif compression == 'brotli': - pytest.importorskip('brotli') + elif compression == "brotli": + pytest.importorskip("brotli") - df = pd.DataFrame({'A': [1, 2, 3]}) - check_round_trip(df, engine, write_kwargs={'compression': compression}) + df = pd.DataFrame({"A": [1, 2, 3]}) + check_round_trip(df, engine, write_kwargs={"compression": compression}) def test_read_columns(self, engine): # GH18154 - df = pd.DataFrame({'string': list('abc'), - 'int': list(range(1, 4))}) + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - expected = pd.DataFrame({'string': list('abc')}) - check_round_trip(df, engine, expected=expected, - read_kwargs={'columns': ['string']}) + expected = pd.DataFrame({"string": list("abc")}) + check_round_trip( + df, engine, expected=expected, read_kwargs={"columns": ["string"]} + ) def test_write_index(self, engine): - check_names = engine != 'fastparquet' + check_names = 
engine != "fastparquet" - df = pd.DataFrame({'A': [1, 2, 3]}) + df = pd.DataFrame({"A": [1, 2, 3]}) check_round_trip(df, engine) indexes = [ [2, 3, 4], - pd.date_range('20130101', periods=3), - list('abc'), + pd.date_range("20130101", periods=3), + list("abc"), [1, 3, 4], ] # non-default index @@ -300,122 +330,118 @@ def test_write_index(self, engine): # index with meta-data df.index = [0, 1, 2] - df.index.name = 'foo' + df.index.name = "foo" check_round_trip(df, engine) def test_write_multiindex(self, pa): # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa - df = pd.DataFrame({'A': [1, 2, 3]}) - index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]) + df = pd.DataFrame({"A": [1, 2, 3]}) + index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df.index = index check_round_trip(df, engine) def test_write_column_multiindex(self, engine): # column multi-index - mi_columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]) + mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) self.check_error_on_write(df, engine, ValueError) def test_multiindex_with_columns(self, pa): engine = pa - dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS') - df = pd.DataFrame(np.random.randn(2 * len(dates), 3), - columns=list('ABC')) + dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") + df = pd.DataFrame(np.random.randn(2 * len(dates), 3), columns=list("ABC")) index1 = pd.MultiIndex.from_product( - [['Level1', 'Level2'], dates], - names=['level', 'date']) + [["Level1", "Level2"], dates], names=["level", "date"] + ) index2 = index1.copy(names=None) for index in [index1, index2]: df.index = index check_round_trip(df, engine) - check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']}, - expected=df[['A', 'B']]) + check_round_trip( + df, engine, read_kwargs={"columns": ["A", "B"]}, expected=df[["A", "B"]] + ) def test_write_ignoring_index(self, engine): # ENH 20768 # Ensure index=False omits the index from the written Parquet file. - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]}) - write_kwargs = { - 'compression': None, - 'index': False, - } + write_kwargs = {"compression": None, "index": False} # Because we're dropping the index, we expect the loaded dataframe to # have the default integer index. expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) # Ignore custom index - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']}, - index=['zyx', 'wvu', 'tsr']) + df = pd.DataFrame( + {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"] + ) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) # Ignore multi-indexes as well. 
- arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - df = pd.DataFrame({'one': [i for i in range(8)], - 'two': [-i for i in range(8)]}, index=arrays) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame( + {"one": [i for i in range(8)], "two": [-i for i in range(8)]}, index=arrays + ) expected = df.reset_index(drop=True) - check_round_trip(df, engine, write_kwargs=write_kwargs, - expected=expected) + check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) class TestParquetPyArrow(Base): - def test_basic(self, pa, df_full): df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') - df['bool_with_none'] = [True, None, True] + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") + df["bool_with_none"] = [True, None, True] check_round_trip(df, pa) # TODO: This doesn't fail on all systems; track down which - @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)", - strict=False) + @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)", strict=False) def test_basic_subset_columns(self, pa, df_full): # GH18628 df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels") - check_round_trip(df, pa, expected=df[['string', 'int']], - read_kwargs={'columns': ['string', 'int']}) + check_round_trip( + df, + pa, + expected=df[["string", "int"]], + read_kwargs={"columns": ["string", "int"]}, + ) def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid self.check_error_on_write(df, pa, Exception) # timedelta - df = pd.DataFrame({'a': pd.timedelta_range('1 day', - periods=3)}) + df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) self.check_error_on_write(df, pa, NotImplementedError) # mixed python objects - df = pd.DataFrame({'a': ['a', 1, 2.0]}) + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # pyarrow 0.11 raises ArrowTypeError # older pyarrows raise ArrowInvalid self.check_error_on_write(df, pa, Exception) @@ -423,7 +449,7 @@ def test_unsupported(self, pa): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) + df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) # de-serialized as object expected = df.assign(a=df.a.astype(object)) @@ -431,104 +457,112 @@ def test_categorical(self, pa): def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 - check_round_trip(df_compat, pa, - path='s3://pandas-test/pyarrow.parquet') + check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") def test_partition_cols_supported(self, pa, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as 
path: - df.to_parquet(path, partition_cols=partition_cols, - compression=None) + df.to_parquet(path, partition_cols=partition_cols, compression=None) import pyarrow.parquet as pq + dataset = pq.ParquetDataset(path, validate_schema=False) assert len(dataset.partitions.partition_names) == 2 assert dataset.partitions.partition_names == set(partition_cols) class TestParquetFastParquet(Base): - - @td.skip_if_no('fastparquet', min_version="0.2.1") + @td.skip_if_no("fastparquet", min_version="0.2.1") def test_basic(self, fp, df_full): df = df_full - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='US/Eastern') - df['timedelta'] = pd.timedelta_range('1 day', periods=3) + df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="US/Eastern") + df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) @pytest.mark.skip(reason="not supported") def test_duplicate_columns(self, fp): # not currently able to handle duplicate columns - df = pd.DataFrame(np.arange(12).reshape(4, 3), - columns=list('aaa')).copy() + df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() self.check_error_on_write(df, fp, ValueError) def test_bool_with_none(self, fp): - df = pd.DataFrame({'a': [True, None, False]}) - expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16') + df = pd.DataFrame({"a": [True, None, False]}) + expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") check_round_trip(df, fp, expected=expected) def test_unsupported(self, fp): # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) self.check_error_on_write(df, fp, ValueError) # mixed - df = pd.DataFrame({'a': ['a', 1, 2.0]}) + df = pd.DataFrame({"a": ["a", 1, 2.0]}) self.check_error_on_write(df, fp, ValueError) def test_categorical(self, fp): - df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) + df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) check_round_trip(df, fp) def test_filter_row_groups(self, fp): - d = {'a': list(range(0, 3))} + d = {"a": list(range(0, 3))} df = pd.DataFrame(d) with tm.ensure_clean() as path: - df.to_parquet(path, fp, compression=None, - row_group_offsets=1) - result = read_parquet(path, fp, filters=[('a', '==', 0)]) + df.to_parquet(path, fp, compression=None, row_group_offsets=1) + result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 def test_s3_roundtrip(self, df_compat, s3_resource, fp): # GH #19134 - check_round_trip(df_compat, fp, - path='s3://pandas-test/fastparquet.parquet') + check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet") def test_partition_cols_supported(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", - partition_cols=partition_cols, compression=None) + df.to_parquet( + path, + engine="fastparquet", + partition_cols=partition_cols, + compression=None, + ) assert os.path.exists(path) import fastparquet # noqa: F811 + actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 def test_partition_on_supported(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", compression=None, - partition_on=partition_cols) + df.to_parquet( + path, + 
engine="fastparquet", + compression=None, + partition_on=partition_cols, + ) assert os.path.exists(path) import fastparquet # noqa: F811 + actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 def test_error_on_using_partition_cols_and_partition_on(self, fp, df_full): # GH #23283 - partition_cols = ['bool', 'int'] + partition_cols = ["bool", "int"] df = df_full with pytest.raises(ValueError): with tm.ensure_clean_dir() as path: - df.to_parquet(path, engine="fastparquet", compression=None, - partition_on=partition_cols, - partition_cols=partition_cols) + df.to_parquet( + path, + engine="fastparquet", + compression=None, + partition_on=partition_cols, + partition_cols=partition_cols, + ) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index eb912908d28f4..7aba2a3677f84 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -31,11 +31,11 @@ from pandas.tseries.offsets import Day, MonthEnd -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def current_pickle_data(): # our current version pickle data - from pandas.tests.io.generate_legacy_storage_files import ( - create_pickle_data) + from pandas.tests.io.generate_legacy_storage_files import create_pickle_data + return create_pickle_data() @@ -47,18 +47,17 @@ def compare_element(result, expected, typ, version=None): tm.assert_index_equal(expected, result) return - if typ.startswith('sp_'): + if typ.startswith("sp_"): comparator = getattr(tm, "assert_%s_equal" % typ) comparator(result, expected, exact_indices=False) - elif typ == 'timestamp': + elif typ == "timestamp": if expected is pd.NaT: assert result is pd.NaT else: assert result == expected assert result.freq == expected.freq else: - comparator = getattr(tm, "assert_%s_equal" % - typ, tm.assert_almost_equal) + comparator = getattr(tm, "assert_%s_equal" % typ, tm.assert_almost_equal) comparator(result, expected) @@ -75,7 +74,7 @@ def compare(data, vf, version): # if available comparator = "compare_{typ}_{dt}".format(typ=typ, dt=dt) - comparator = m.get(comparator, m['compare_element']) + comparator = m.get(comparator, m["compare_element"]) comparator(result, expected, typ, version) return data @@ -128,7 +127,7 @@ def compare_index_period(result, expected, typ, version): tm.assert_index_equal(result, expected) assert isinstance(result.freq, MonthEnd) assert result.freq == MonthEnd() - assert result.freqstr == 'M' + assert result.freqstr == "M" tm.assert_index_equal(result.shift(2), expected.shift(2)) @@ -136,8 +135,9 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) -files = glob.glob(os.path.join(os.path.dirname(__file__), "data", - "legacy_pickle", "*", "*.pickle")) +files = glob.glob( + os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") +) @pytest.fixture(params=files) @@ -161,13 +161,12 @@ def test_pickles(current_pickle_data, legacy_pickle): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_round_trip_current(current_pickle_data): - def python_pickler(obj, path): - with open(path, 'wb') as fh: + with open(path, "wb") as fh: pickle.dump(obj, fh, protocol=-1) def python_unpickler(path): - with open(path, 'rb') as fh: + with open(path, "rb") as fh: fh.seek(0) return pickle.load(fh) @@ -194,9 +193,10 @@ def python_unpickler(path): def test_pickle_v0_14_1(datapath): - cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, - categories=['a', 'b', 'c', 'd']) - 
pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') + cat = pd.Categorical( + values=["a", "b", "c"], ordered=False, categories=["a", "b", "c", "d"] + ) + pickle_path = datapath("io", "data", "categorical_0_14_1.pickle") # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -210,9 +210,10 @@ def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 - cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, - categories=['a', 'b', 'c', 'd']) - pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') + cat = pd.Categorical( + values=["a", "b", "c"], ordered=False, categories=["a", "b", "c", "d"] + ) + pickle_path = datapath("io", "data", "categorical_0_15_2.pickle") # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -238,19 +239,20 @@ def test_pickle_path_localpath(): # test pickle compression # --------------------- + @pytest.fixture def get_random_path(): - return '__%s__.pickle' % tm.rands(10) + return "__%s__.pickle" % tm.rands(10) class TestCompression: _compression_to_extension = { None: ".none", - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', + "gzip": ".gz", + "bz2": ".bz2", + "zip": ".zip", + "xz": ".xz", } def compress_file(self, src_path, dest_path, compression): @@ -258,18 +260,17 @@ def compress_file(self, src_path, dest_path, compression): shutil.copyfile(src_path, dest_path) return - if compression == 'gzip': + if compression == "gzip": f = gzip.open(dest_path, "w") - elif compression == 'bz2': + elif compression == "bz2": f = bz2.BZ2File(dest_path, "w") - elif compression == 'zip': - with zipfile.ZipFile(dest_path, "w", - compression=zipfile.ZIP_DEFLATED) as f: + elif compression == "zip": + with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) - elif compression == 'xz': + elif compression == "xz": f = lzma.LZMAFile(dest_path, "w") else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) if compression != "zip": @@ -297,14 +298,14 @@ def test_write_explicit(self, compression, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) + @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"]) def test_write_explicit_bad(self, compression, get_random_path): with pytest.raises(ValueError, match="Unrecognized compression type"): with tm.ensure_clean(get_random_path) as path: df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.no_compress', '.xz']) + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".no_compress", ".xz"]) def test_write_infer(self, ext, get_random_path): base = get_random_path path1 = base + ext @@ -350,8 +351,7 @@ def test_read_explicit(self, compression, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize('ext', [ - '', '.gz', '.bz2', '.zip', '.no_compress', '.xz']) + @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".zip", ".no_compress", ".xz"]) def test_read_infer(self, ext, get_random_path): base = get_random_path path1 = base + ".raw" @@ -381,9 +381,9 @@ def test_read_infer(self, ext, get_random_path): # test pickle compression # --------------------- -class TestProtocol: - 
@pytest.mark.parametrize('protocol', [-1, 0, 1, 2]) +class TestProtocol: + @pytest.mark.parametrize("protocol", [-1, 0, 1, 2]) def test_read(self, protocol, get_random_path): with tm.ensure_clean(get_random_path) as path: df = tm.makeDataFrame() diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 23075db2b38ce..04c6979596eca 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -8,7 +8,6 @@ class TestS3URL: - def test_is_s3_url(self): assert is_s3_url("s3://pandas/somethingelse.com") assert not is_s3_url("s4://pandas/somethingelse.com") @@ -17,13 +16,10 @@ def test_is_s3_url(self): def test_streaming_s3_objects(): # GH17135 # botocore gained iteration support in 1.10.47, can now be used in read_* - pytest.importorskip('botocore', minversion='1.10.47') + pytest.importorskip("botocore", minversion="1.10.47") from botocore.response import StreamingBody - data = [ - b'foo,bar,baz\n1,2,3\n4,5,6\n', - b'just,the,header\n', - ] + data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] for el in data: body = StreamingBody(BytesIO(el), content_length=len(el)) read_csv(body) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index b9f58f9bf6cf6..b7a62b7ba431b 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -54,10 +54,9 @@ def test_spss_umlauts(datapath): fname = datapath("io", "data", "umlauts.sav") df = pd.read_spss(fname, convert_categoricals=True) - expected = pd.DataFrame({"var1": ["the ä umlaut", - "the ü umlaut", - "the ä umlaut", - "the ö umlaut"]}) + expected = pd.DataFrame( + {"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", "the ö umlaut"]} + ) expected["var1"] = pd.Categorical(expected["var1"]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b053afa4dd7d5..4fc90ea41718d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -28,13 +28,21 @@ from pandas.compat import PY36 -from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.common import is_datetime64_dtype, is_datetime64tz_dtype import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Series, Timestamp, concat, date_range, isna, - to_datetime, to_timedelta) + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + to_datetime, + to_timedelta, +) import pandas.util.testing as tm import pandas.io.sql as sql @@ -46,41 +54,42 @@ import sqlalchemy.sql.sqltypes as sqltypes from sqlalchemy.ext import declarative from sqlalchemy.orm import session as sa_session + SQLALCHEMY_INSTALLED = True except ImportError: SQLALCHEMY_INSTALLED = False SQL_STRINGS = { - 'create_iris': { - 'sqlite': """CREATE TABLE iris ( + "create_iris": { + "sqlite": """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, "PetalLength" REAL, "PetalWidth" REAL, "Name" TEXT )""", - 'mysql': """CREATE TABLE iris ( + "mysql": """CREATE TABLE iris ( `SepalLength` DOUBLE, `SepalWidth` DOUBLE, `PetalLength` DOUBLE, `PetalWidth` DOUBLE, `Name` VARCHAR(200) )""", - 'postgresql': """CREATE TABLE iris ( + "postgresql": """CREATE TABLE iris ( "SepalLength" DOUBLE PRECISION, "SepalWidth" DOUBLE PRECISION, "PetalLength" DOUBLE PRECISION, "PetalWidth" DOUBLE PRECISION, "Name" VARCHAR(200) - )""" + )""", }, - 'insert_iris': { - 'sqlite': """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", - 'mysql': """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", - 'postgresql': """INSERT INTO 
iris VALUES(%s, %s, %s, %s, %s);""" + "insert_iris": { + "sqlite": """INSERT INTO iris VALUES(?, ?, ?, ?, ?)""", + "mysql": """INSERT INTO iris VALUES(%s, %s, %s, %s, "%s");""", + "postgresql": """INSERT INTO iris VALUES(%s, %s, %s, %s, %s);""", }, - 'create_test_types': { - 'sqlite': """CREATE TABLE types_test_data ( + "create_test_types": { + "sqlite": """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TEXT, "IntDateCol" INTEGER, @@ -91,7 +100,7 @@ "IntColWithNull" INTEGER, "BoolColWithNull" INTEGER )""", - 'mysql': """CREATE TABLE types_test_data ( + "mysql": """CREATE TABLE types_test_data ( `TextCol` TEXT, `DateCol` DATETIME, `IntDateCol` INTEGER, @@ -102,7 +111,7 @@ `IntColWithNull` INTEGER, `BoolColWithNull` BOOLEAN )""", - 'postgresql': """CREATE TABLE types_test_data ( + "postgresql": """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TIMESTAMP, "DateColWithTz" TIMESTAMP WITH TIME ZONE, @@ -113,95 +122,112 @@ "BoolCol" BOOLEAN, "IntColWithNull" INTEGER, "BoolColWithNull" BOOLEAN - )""" + )""", }, - 'insert_test_types': { - 'sqlite': { - 'query': """ + "insert_test_types": { + "sqlite": { + "query": """ INSERT INTO types_test_data VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) """, - 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', - 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', - 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, - 'mysql': { - 'query': """ + "mysql": { + "query": """ INSERT INTO types_test_data VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s) """, - 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', - 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', - 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, - 'postgresql': { - 'query': """ + "postgresql": { + "query": """ INSERT INTO types_test_data VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, - 'fields': ( - 'TextCol', 'DateCol', 'DateColWithTz', - 'IntDateCol', 'IntDateOnlyCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' - ) + "fields": ( + "TextCol", + "DateCol", + "DateColWithTz", + "IntDateCol", + "IntDateOnlyCol", + "FloatCol", + "IntCol", + "BoolCol", + "IntColWithNull", + "BoolColWithNull", + ), }, }, - 'read_parameters': { - 'sqlite': "SELECT * FROM iris WHERE Name=? AND SepalLength=?", - 'mysql': 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', - 'postgresql': 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s' + "read_parameters": { + "sqlite": "SELECT * FROM iris WHERE Name=? 
AND SepalLength=?", + "mysql": 'SELECT * FROM iris WHERE `Name`="%s" AND `SepalLength`=%s', + "postgresql": 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s', }, - 'read_named_parameters': { - 'sqlite': """ + "read_named_parameters": { + "sqlite": """ SELECT * FROM iris WHERE Name=:name AND SepalLength=:length """, - 'mysql': """ + "mysql": """ SELECT * FROM iris WHERE `Name`="%(name)s" AND `SepalLength`=%(length)s """, - 'postgresql': """ + "postgresql": """ SELECT * FROM iris WHERE "Name"=%(name)s AND "SepalLength"=%(length)s - """ + """, }, - 'create_view': { - 'sqlite': """ + "create_view": { + "sqlite": """ CREATE VIEW iris_view AS SELECT * FROM iris """ - } + }, } class MixInBase: - def teardown_method(self, method): # if setup fails, there may not be a connection to close. - if hasattr(self, 'conn'): + if hasattr(self, "conn"): for tbl in self._get_all_tables(): self.drop_table(tbl) self._close_conn() class MySQLMixIn(MixInBase): - def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute("DROP TABLE IF EXISTS %s" % - sql._get_valid_mysql_name(table_name)) + cur.execute("DROP TABLE IF EXISTS %s" % sql._get_valid_mysql_name(table_name)) self.conn.commit() def _get_all_tables(self): cur = self.conn.cursor() - cur.execute('SHOW TABLES') + cur.execute("SHOW TABLES") return [table[0] for table in cur.fetchall()] def _close_conn(self): from pymysql.err import Error + try: self.conn.close() except Error: @@ -209,15 +235,14 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): - def drop_table(self, table_name): - self.conn.execute("DROP TABLE IF EXISTS %s" % - sql._get_valid_sqlite_name(table_name)) + self.conn.execute( + "DROP TABLE IF EXISTS %s" % sql._get_valid_sqlite_name(table_name) + ) self.conn.commit() def _get_all_tables(self): - c = self.conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'") + c = self.conn.execute("SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] def _close_conn(self): @@ -225,7 +250,6 @@ def _close_conn(self): class SQLAlchemyMixIn(MixInBase): - def drop_table(self, table_name): sql.SQLDatabase(self.conn).drop_table(table_name) @@ -246,116 +270,144 @@ class PandasSQLTest: """ def _get_exec(self): - if hasattr(self.conn, 'execute'): + if hasattr(self.conn, "execute"): return self.conn else: return self.conn.cursor() - @pytest.fixture(params=[('io', 'data', 'iris.csv')]) + @pytest.fixture(params=[("io", "data", "iris.csv")]) def load_iris_data(self, datapath, request): import io + iris_csv_file = datapath(*request.param) - if not hasattr(self, 'conn'): + if not hasattr(self, "conn"): self.setup_connect() - self.drop_table('iris') - self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) + self.drop_table("iris") + self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor]) - with io.open(iris_csv_file, mode='r', newline=None) as iris_csv: + with io.open(iris_csv_file, mode="r", newline=None) as iris_csv: r = csv.reader(iris_csv) next(r) # skip header row - ins = SQL_STRINGS['insert_iris'][self.flavor] + ins = SQL_STRINGS["insert_iris"][self.flavor] for row in r: self._get_exec().execute(ins, row) def _load_iris_view(self): - self.drop_table('iris_view') - self._get_exec().execute(SQL_STRINGS['create_view'][self.flavor]) + self.drop_table("iris_view") + self._get_exec().execute(SQL_STRINGS["create_view"][self.flavor]) def _check_iris_loaded_frame(self, iris_frame): pytype = iris_frame.dtypes[0].type row = iris_frame.iloc[0] assert issubclass(pytype, np.floating) - 
tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row.values, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def _load_test1_data(self): - columns = ['index', 'A', 'B', 'C', 'D'] - data = [( - '2000-01-03 00:00:00', 0.980268513777, 3.68573087906, - -0.364216805298, -1.15973806169), - ('2000-01-04 00:00:00', 1.04791624281, - - 0.0412318367011, -0.16181208307, 0.212549316967), - ('2000-01-05 00:00:00', 0.498580885705, - 0.731167677815, -0.537677223318, 1.34627041952), - ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543, - 0.00364077397681, 0.67525259227)] + columns = ["index", "A", "B", "C", "D"] + data = [ + ( + "2000-01-03 00:00:00", + 0.980268513777, + 3.68573087906, + -0.364216805298, + -1.15973806169, + ), + ( + "2000-01-04 00:00:00", + 1.04791624281, + -0.0412318367011, + -0.16181208307, + 0.212549316967, + ), + ( + "2000-01-05 00:00:00", + 0.498580885705, + 0.731167677815, + -0.537677223318, + 1.34627041952, + ), + ( + "2000-01-06 00:00:00", + 1.12020151869, + 1.56762092543, + 0.00364077397681, + 0.67525259227, + ), + ] self.test_frame1 = DataFrame(data, columns=columns) def _load_test2_data(self): - df = DataFrame(dict(A=[4, 1, 3, 6], - B=['asd', 'gsq', 'ylt', 'jkl'], - C=[1.1, 3.1, 6.9, 5.3], - D=[False, True, True, False], - E=['1990-11-22', '1991-10-26', - '1993-11-26', '1995-12-12'])) - df['E'] = to_datetime(df['E']) + df = DataFrame( + dict( + A=[4, 1, 3, 6], + B=["asd", "gsq", "ylt", "jkl"], + C=[1.1, 3.1, 6.9, 5.3], + D=[False, True, True, False], + E=["1990-11-22", "1991-10-26", "1993-11-26", "1995-12-12"], + ) + ) + df["E"] = to_datetime(df["E"]) self.test_frame2 = df def _load_test3_data(self): - columns = ['index', 'A', 'B'] - data = [( - '2000-01-03 00:00:00', 2 ** 31 - 1, -1.987670), - ('2000-01-04 00:00:00', -29, -0.0412318367011), - ('2000-01-05 00:00:00', 20000, 0.731167677815), - ('2000-01-06 00:00:00', -290867, 1.56762092543)] + columns = ["index", "A", "B"] + data = [ + ("2000-01-03 00:00:00", 2 ** 31 - 1, -1.987670), + ("2000-01-04 00:00:00", -29, -0.0412318367011), + ("2000-01-05 00:00:00", 20000, 0.731167677815), + ("2000-01-06 00:00:00", -290867, 1.56762092543), + ] self.test_frame3 = DataFrame(data, columns=columns) def _load_raw_sql(self): - self.drop_table('types_test_data') - self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) - ins = SQL_STRINGS['insert_test_types'][self.flavor] + self.drop_table("types_test_data") + self._get_exec().execute(SQL_STRINGS["create_test_types"][self.flavor]) + ins = SQL_STRINGS["insert_test_types"][self.flavor] data = [ { - 'TextCol': 'first', - 'DateCol': '2000-01-03 00:00:00', - 'DateColWithTz': '2000-01-01 00:00:00-08:00', - 'IntDateCol': 535852800, - 'IntDateOnlyCol': 20101010, - 'FloatCol': 10.10, - 'IntCol': 1, - 'BoolCol': False, - 'IntColWithNull': 1, - 'BoolColWithNull': False, + "TextCol": "first", + "DateCol": "2000-01-03 00:00:00", + "DateColWithTz": "2000-01-01 00:00:00-08:00", + "IntDateCol": 535852800, + "IntDateOnlyCol": 20101010, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": 1, + "BoolColWithNull": False, }, { - 'TextCol': 'first', - 'DateCol': '2000-01-04 00:00:00', - 'DateColWithTz': '2000-06-01 00:00:00-07:00', - 'IntDateCol': 1356998400, - 'IntDateOnlyCol': 20101212, - 'FloatCol': 10.10, - 'IntCol': 1, - 'BoolCol': False, - 'IntColWithNull': None, - 'BoolColWithNull': None, + "TextCol": "first", + "DateCol": "2000-01-04 00:00:00", + "DateColWithTz": "2000-06-01 00:00:00-07:00", + "IntDateCol": 1356998400, + "IntDateOnlyCol": 
20101212, + "FloatCol": 10.10, + "IntCol": 1, + "BoolCol": False, + "IntColWithNull": None, + "BoolColWithNull": None, }, ] for d in data: self._get_exec().execute( - ins['query'], - [d[field] for field in ins['fields']] + ins["query"], [d[field] for field in ins["fields"]] ) def _count_rows(self, table_name): - result = self._get_exec().execute( - "SELECT count(*) AS count_1 FROM %s" % table_name).fetchone() + result = ( + self._get_exec() + .execute("SELECT count(*) AS count_1 FROM %s" % table_name) + .fetchone() + ) return result[0] def _read_sql_iris(self): @@ -363,81 +415,75 @@ def _read_sql_iris(self): self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_parameter(self): - query = SQL_STRINGS['read_parameters'][self.flavor] - params = ['Iris-setosa', 5.1] + query = SQL_STRINGS["read_parameters"][self.flavor] + params = ["Iris-setosa", 5.1] iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _read_sql_iris_named_parameter(self): - query = SQL_STRINGS['read_named_parameters'][self.flavor] - params = {'name': 'Iris-setosa', 'length': 5.1} + query = SQL_STRINGS["read_named_parameters"][self.flavor] + params = {"name": "Iris-setosa", "length": 5.1} iris_frame = self.pandasSQL.read_query(query, params=params) self._check_iris_loaded_frame(iris_frame) def _to_sql(self, method=None): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=method) - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=method) + assert self.pandasSQL.has_table("test_frame1") num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_empty(self): - self.drop_table('test_frame1') - self.pandasSQL.to_sql(self.test_frame1.iloc[:0], 'test_frame1') + self.drop_table("test_frame1") + self.pandasSQL.to_sql(self.test_frame1.iloc[:0], "test_frame1") def _to_sql_fail(self): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") + assert self.pandasSQL.has_table("test_frame1") msg = "Table 'test_frame1' already exists" with pytest.raises(ValueError, match=msg): - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_replace(self): - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='replace') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="replace") + assert self.pandasSQL.has_table("test_frame1") num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_append(self): # 
Nuke table just in case - self.drop_table('test_frame1') + self.drop_table("test_frame1") - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='fail') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="fail") # Add to table again - self.pandasSQL.to_sql( - self.test_frame1, 'test_frame1', if_exists='append') - assert self.pandasSQL.has_table('test_frame1') + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", if_exists="append") + assert self.pandasSQL.has_table("test_frame1") num_entries = 2 * len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _to_sql_method_callable(self): check = [] # used to double check function below is really being used @@ -446,25 +492,25 @@ def sample(pd_table, conn, keys, data_iter): check.append(1) data = [dict(zip(keys, row)) for row in data_iter] conn.execute(pd_table.table.insert(), data) - self.drop_table('test_frame1') - self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=sample) - assert self.pandasSQL.has_table('test_frame1') + self.drop_table("test_frame1") + + self.pandasSQL.to_sql(self.test_frame1, "test_frame1", method=sample) + assert self.pandasSQL.has_table("test_frame1") assert check == [1] num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame1') + num_rows = self._count_rows("test_frame1") assert num_rows == num_entries # Nuke table - self.drop_table('test_frame1') + self.drop_table("test_frame1") def _roundtrip(self): - self.drop_table('test_frame_roundtrip') - self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip') - result = self.pandasSQL.read_query( - 'SELECT * FROM test_frame_roundtrip') + self.drop_table("test_frame_roundtrip") + self.pandasSQL.to_sql(self.test_frame1, "test_frame_roundtrip") + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") - result.set_index('level_0', inplace=True) + result.set_index("level_0", inplace=True) # result.index.astype(int) result.index.name = None @@ -475,14 +521,15 @@ def _execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done iris_results = self.pandasSQL.execute("SELECT * FROM iris") row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def _to_sql_save_index(self): - df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], - columns=['A', 'B', 'C'], index=['A']) - self.pandasSQL.to_sql(df, 'test_to_sql_saves_index') - ix_cols = self._get_index_columns('test_to_sql_saves_index') - assert ix_cols == [['A', ], ] + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"] + ) + self.pandasSQL.to_sql(df, "test_to_sql_saves_index") + ix_cols = self._get_index_columns("test_to_sql_saves_index") + assert ix_cols == [["A"]] def _transaction_test(self): self.pandasSQL.execute("CREATE TABLE test_trans (A INT, B TEXT)") @@ -493,23 +540,24 @@ def _transaction_test(self): try: with self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - raise Exception('error') + raise Exception("error") except Exception: # ignore raised exception pass - res = self.pandasSQL.read_query('SELECT * FROM test_trans') + res = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res) == 0 # Make sure when transaction is committed, rows do get inserted with 
self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - res2 = self.pandasSQL.read_query('SELECT * FROM test_trans') + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res2) == 1 # ----------------------------------------------------------------------------- # -- Testing the public API + class _TestSQLApi(PandasSQLTest): """ @@ -527,7 +575,8 @@ class _TestSQLApi(PandasSQLTest): we don't use drop_table because that isn't part of the public api """ - flavor = 'sqlite' + + flavor = "sqlite" mode = None def setup_connect(self): @@ -545,95 +594,87 @@ def load_test_data_and_sql(self): self._load_raw_sql() def test_read_sql_iris(self): - iris_frame = sql.read_sql_query( - "SELECT * FROM iris", self.conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris", self.conn) self._check_iris_loaded_frame(iris_frame) def test_read_sql_view(self): - iris_frame = sql.read_sql_query( - "SELECT * FROM iris_view", self.conn) + iris_frame = sql.read_sql_query("SELECT * FROM iris_view", self.conn) self._check_iris_loaded_frame(iris_frame) def test_to_sql(self): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn) - assert sql.has_table('test_frame1', self.conn) + sql.to_sql(self.test_frame1, "test_frame1", self.conn) + assert sql.has_table("test_frame1", self.conn) def test_to_sql_fail(self): - sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, if_exists='fail') - assert sql.has_table('test_frame2', self.conn) + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") + assert sql.has_table("test_frame2", self.conn) msg = "Table 'test_frame2' already exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame2", self.conn, if_exists="fail") def test_to_sql_replace(self): - sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, if_exists='replace') - assert sql.has_table('test_frame3', self.conn) + sql.to_sql(self.test_frame1, "test_frame3", self.conn, if_exists="replace") + assert sql.has_table("test_frame3", self.conn) num_entries = len(self.test_frame1) - num_rows = self._count_rows('test_frame3') + num_rows = self._count_rows("test_frame3") assert num_rows == num_entries def test_to_sql_append(self): - sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, if_exists='fail') + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="fail") # Add to table again - sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, if_exists='append') - assert sql.has_table('test_frame4', self.conn) + sql.to_sql(self.test_frame1, "test_frame4", self.conn, if_exists="append") + assert sql.has_table("test_frame4", self.conn) num_entries = 2 * len(self.test_frame1) - num_rows = self._count_rows('test_frame4') + num_rows = self._count_rows("test_frame4") assert num_rows == num_entries def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False) + sql.to_sql(self.test_frame3, "test_frame5", self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) tm.assert_frame_equal(self.test_frame3, result) def test_to_sql_series(self): - s = Series(np.arange(5, dtype='int64'), name='series') + s = Series(np.arange(5, dtype="int64"), name="series") sql.to_sql(s, "test_series", self.conn, index=False) s2 = 
sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) def test_roundtrip(self): - sql.to_sql(self.test_frame1, 'test_frame_roundtrip', - con=self.conn) - result = sql.read_sql_query( - 'SELECT * FROM test_frame_roundtrip', - con=self.conn) + sql.to_sql(self.test_frame1, "test_frame_roundtrip", con=self.conn) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) # HACK! result.index = self.test_frame1.index - result.set_index('level_0', inplace=True) + result.set_index("level_0", inplace=True) result.index.astype(int) result.index.name = None tm.assert_frame_equal(result, self.test_frame1) def test_roundtrip_chunksize(self): - sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, chunksize=2) - result = sql.read_sql_query( - 'SELECT * FROM test_frame_roundtrip', - con=self.conn) + sql.to_sql( + self.test_frame1, + "test_frame_roundtrip", + con=self.conn, + index=False, + chunksize=2, + ) + result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=self.conn) tm.assert_frame_equal(result, self.test_frame1) def test_execute_sql(self): # drop_sql = "DROP TABLE IF EXISTS test" # should already be done iris_results = sql.execute("SELECT * FROM iris", con=self.conn) row = iris_results.fetchone() - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) + tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) def test_date_parsing(self): # Test date parsing in read_sql @@ -641,52 +682,64 @@ def test_date_parsing(self): df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn) assert not issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates=['DateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["DateCol"] + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) assert df.DateCol.tolist() == [ pd.Timestamp(2000, 1, 3, 0, 0, 0), - pd.Timestamp(2000, 1, 4, 0, 0, 0) + pd.Timestamp(2000, 1, 4, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"}, + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) assert df.DateCol.tolist() == [ pd.Timestamp(2000, 1, 3, 0, 0, 0), - pd.Timestamp(2000, 1, 4, 0, 0, 0) + pd.Timestamp(2000, 1, 4, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates=['IntDateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates=["IntDateCol"] + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) assert df.IntDateCol.tolist() == [ pd.Timestamp(1986, 12, 25, 0, 0, 0), - pd.Timestamp(2013, 1, 1, 0, 0, 0) + pd.Timestamp(2013, 1, 1, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'IntDateCol': 's'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) assert df.IntDateCol.tolist() == [ pd.Timestamp(1986, 12, 25, 0, 0, 0), - pd.Timestamp(2013, 1, 1, 0, 0, 0) + pd.Timestamp(2013, 1, 1, 0, 0, 0), ] - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - parse_dates={'IntDateOnlyCol': '%Y%m%d'}) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + 
parse_dates={"IntDateOnlyCol": "%Y%m%d"}, + ) assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) assert df.IntDateOnlyCol.tolist() == [ - pd.Timestamp('2010-10-10'), - pd.Timestamp('2010-12-12') + pd.Timestamp("2010-10-10"), + pd.Timestamp("2010-12-12"), ] def test_date_and_index(self): # Test case where same column appears in parse_date and index_col - df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, - index_col='DateCol', - parse_dates=['DateCol', 'IntDateCol']) + df = sql.read_sql_query( + "SELECT * FROM types_test_data", + self.conn, + index_col="DateCol", + parse_dates=["DateCol", "IntDateCol"], + ) assert issubclass(df.index.dtype.type, np.datetime64) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) @@ -694,120 +747,139 @@ def test_date_and_index(self): def test_timedelta(self): # see #6921 - df = to_timedelta( - Series(['00:00:01', '00:00:03'], name='foo')).to_frame() + df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() with tm.assert_produces_warning(UserWarning): - df.to_sql('test_timedelta', self.conn) - result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn) - tm.assert_series_equal(result['foo'], df['foo'].astype('int64')) + df.to_sql("test_timedelta", self.conn) + result = sql.read_sql_query("SELECT * FROM test_timedelta", self.conn) + tm.assert_series_equal(result["foo"], df["foo"].astype("int64")) def test_complex_raises(self): - df = DataFrame({'a': [1 + 1j, 2j]}) + df = DataFrame({"a": [1 + 1j, 2j]}) msg = "Complex datatypes not supported" with pytest.raises(ValueError, match=msg): - df.to_sql('test_complex', self.conn) - - @pytest.mark.parametrize("index_name,index_label,expected", [ - # no index name, defaults to 'index' - (None, None, "index"), - # specifying index_label - (None, "other_label", "other_label"), - # using the index name - ("index_name", None, "index_name"), - # has index name, but specifying index_label - ("index_name", "other_label", "other_label"), - # index name is integer - (0, None, "0"), - # index name is None but index label is integer - (None, 0, "0"), - ]) - def test_to_sql_index_label(self, index_name, - index_label, expected): - temp_frame = DataFrame({'col1': range(4)}) + df.to_sql("test_complex", self.conn) + + @pytest.mark.parametrize( + "index_name,index_label,expected", + [ + # no index name, defaults to 'index' + (None, None, "index"), + # specifying index_label + (None, "other_label", "other_label"), + # using the index name + ("index_name", None, "index_name"), + # has index name, but specifying index_label + ("index_name", "other_label", "other_label"), + # index name is integer + (0, None, "0"), + # index name is None but index label is integer + (None, 0, "0"), + ], + ) + def test_to_sql_index_label(self, index_name, index_label, expected): + temp_frame = DataFrame({"col1": range(4)}) temp_frame.index.name = index_name - query = 'SELECT * FROM test_index_label' - sql.to_sql(temp_frame, 'test_index_label', self.conn, - index_label=index_label) + query = "SELECT * FROM test_index_label" + sql.to_sql(temp_frame, "test_index_label", self.conn, index_label=index_label) frame = sql.read_sql_query(query, self.conn) assert frame.columns[0] == expected def test_to_sql_index_label_multiindex(self): - temp_frame = DataFrame({'col1': range(4)}, - index=MultiIndex.from_product( - [('A0', 'A1'), ('B0', 'B1')])) + temp_frame = DataFrame( + {"col1": range(4)}, + index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]), + ) # no index name, defaults to 'level_0' and 
'level_1' - sql.to_sql(temp_frame, 'test_index_label', self.conn) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[0] == 'level_0' - assert frame.columns[1] == 'level_1' + sql.to_sql(temp_frame, "test_index_label", self.conn) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[0] == "level_0" + assert frame.columns[1] == "level_1" # specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label=['A', 'B']) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['A', 'B'] + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["A", "B"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] # using the index name - temp_frame.index.names = ['A', 'B'] - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace') - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['A', 'B'] + temp_frame.index.names = ["A", "B"] + sql.to_sql(temp_frame, "test_index_label", self.conn, if_exists="replace") + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["A", "B"] # has index name, but specifying index_label - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label=['C', 'D']) - frame = sql.read_sql_query('SELECT * FROM test_index_label', self.conn) - assert frame.columns[:2].tolist() == ['C', 'D'] + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label=["C", "D"], + ) + frame = sql.read_sql_query("SELECT * FROM test_index_label", self.conn) + assert frame.columns[:2].tolist() == ["C", "D"] - msg = ("Length of 'index_label' should match number of levels, which" - " is 2") + msg = "Length of 'index_label' should match number of levels, which" " is 2" with pytest.raises(ValueError, match=msg): - sql.to_sql(temp_frame, 'test_index_label', self.conn, - if_exists='replace', index_label='C') + sql.to_sql( + temp_frame, + "test_index_label", + self.conn, + if_exists="replace", + index_label="C", + ) def test_multiindex_roundtrip(self): - df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], - columns=['A', 'B', 'C'], index=['A', 'B']) + df = DataFrame.from_records( + [(1, 2.1, "line1"), (2, 1.5, "line2")], + columns=["A", "B", "C"], + index=["A", "B"], + ) - df.to_sql('test_multiindex_roundtrip', self.conn) - result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip', - self.conn, index_col=['A', 'B']) + df.to_sql("test_multiindex_roundtrip", self.conn) + result = sql.read_sql_query( + "SELECT * FROM test_multiindex_roundtrip", self.conn, index_col=["A", "B"] + ) tm.assert_frame_equal(df, result, check_index_type=True) def test_integer_col_names(self): df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) - sql.to_sql(df, "test_frame_integer_col_names", self.conn, - if_exists='replace') + sql.to_sql(df, "test_frame_integer_col_names", self.conn, if_exists="replace") def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn) - assert 'CREATE' in create_sql + create_sql = sql.get_schema(self.test_frame1, "test", con=self.conn) + assert "CREATE" in create_sql def test_get_schema_dtypes(self): - float_frame = DataFrame({'a': [1.1, 
1.2], 'b': [2.1, 2.2]}) - dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' - create_sql = sql.get_schema(float_frame, 'test', - con=self.conn, dtype={'b': dtype}) - assert 'CREATE' in create_sql - assert 'INTEGER' in create_sql + float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) + dtype = sqlalchemy.Integer if self.mode == "sqlalchemy" else "INTEGER" + create_sql = sql.get_schema( + float_frame, "test", con=self.conn, dtype={"b": dtype} + ) + assert "CREATE" in create_sql + assert "INTEGER" in create_sql def test_get_schema_keys(self): - frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) - create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1') + frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) + create_sql = sql.get_schema(frame, "test", con=self.conn, keys="Col1") constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' assert constraint_sentence in create_sql # multiple columns as key (GH10385) - create_sql = sql.get_schema(self.test_frame1, 'test', - con=self.conn, keys=['A', 'B']) + create_sql = sql.get_schema( + self.test_frame1, "test", con=self.conn, keys=["A", "B"] + ) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' assert constraint_sentence in create_sql def test_chunksize_read(self): - df = DataFrame(np.random.randn(22, 5), columns=list('abcde')) - df.to_sql('test_chunksize', self.conn, index=False) + df = DataFrame(np.random.randn(22, 5), columns=list("abcde")) + df.to_sql("test_chunksize", self.conn, index=False) # reading the query in one time res1 = sql.read_sql_query("select * from test_chunksize", self.conn) @@ -817,8 +889,9 @@ def test_chunksize_read(self): i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_query("select * from test_chunksize", - self.conn, chunksize=5): + for chunk in sql.read_sql_query( + "select * from test_chunksize", self.conn, chunksize=5 + ): res2 = concat([res2, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 @@ -826,13 +899,12 @@ def test_chunksize_read(self): tm.assert_frame_equal(res1, res2) # reading the query in chunks with read_sql_query - if self.mode == 'sqlalchemy': + if self.mode == "sqlalchemy": res3 = DataFrame() i = 0 sizes = [5, 5, 5, 5, 2] - for chunk in sql.read_sql_table("test_chunksize", self.conn, - chunksize=5): + for chunk in sql.read_sql_table("test_chunksize", self.conn, chunksize=5): res3 = concat([res3, chunk], ignore_index=True) assert len(chunk) == sizes[i] i += 1 @@ -843,35 +915,36 @@ def test_categorical(self): # GH8624 # test that categorical gets written correctly as dense column df = DataFrame( - {'person_id': [1, 2, 3], - 'person_name': ['John P. Doe', 'Jane Dove', 'John P. Doe']}) + { + "person_id": [1, 2, 3], + "person_name": ["John P. Doe", "Jane Dove", "John P. 
Doe"], + } + ) df2 = df.copy() - df2['person_name'] = df2['person_name'].astype('category') + df2["person_name"] = df2["person_name"].astype("category") - df2.to_sql('test_categorical', self.conn, index=False) - res = sql.read_sql_query('SELECT * FROM test_categorical', self.conn) + df2.to_sql("test_categorical", self.conn, index=False) + res = sql.read_sql_query("SELECT * FROM test_categorical", self.conn) tm.assert_frame_equal(res, df) def test_unicode_column_name(self): # GH 11431 - df = DataFrame([[1, 2], [3, 4]], columns=['\xe9', 'b']) - df.to_sql('test_unicode', self.conn, index=False) + df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"]) + df.to_sql("test_unicode", self.conn, index=False) def test_escaped_table_name(self): # GH 13206 - df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) - df.to_sql('d1187b08-4943-4c8d-a7f6', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("d1187b08-4943-4c8d-a7f6", self.conn, index=False) - res = sql.read_sql_query('SELECT * FROM `d1187b08-4943-4c8d-a7f6`', - self.conn) + res = sql.read_sql_query("SELECT * FROM `d1187b08-4943-4c8d-a7f6`", self.conn) tm.assert_frame_equal(res, df) @pytest.mark.single -@pytest.mark.skipif( - not SQLALCHEMY_INSTALLED, reason='SQLAlchemy not installed') +@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): """ Test the public API as it would be used directly @@ -880,46 +953,44 @@ class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): sqlalchemy mode. """ - flavor = 'sqlite' - mode = 'sqlalchemy' + + flavor = "sqlite" + mode = "sqlalchemy" def connect(self): - return sqlalchemy.create_engine('sqlite:///:memory:') + return sqlalchemy.create_engine("sqlite:///:memory:") def test_read_table_columns(self): # test columns argument in read_table - sql.to_sql(self.test_frame1, 'test_frame', self.conn) + sql.to_sql(self.test_frame1, "test_frame", self.conn) - cols = ['A', 'B'] - result = sql.read_sql_table('test_frame', self.conn, columns=cols) + cols = ["A", "B"] + result = sql.read_sql_table("test_frame", self.conn, columns=cols) assert result.columns.tolist() == cols def test_read_table_index_col(self): # test columns argument in read_table - sql.to_sql(self.test_frame1, 'test_frame', self.conn) + sql.to_sql(self.test_frame1, "test_frame", self.conn) - result = sql.read_sql_table('test_frame', self.conn, index_col="index") + result = sql.read_sql_table("test_frame", self.conn, index_col="index") assert result.index.names == ["index"] - result = sql.read_sql_table( - 'test_frame', self.conn, index_col=["A", "B"]) + result = sql.read_sql_table("test_frame", self.conn, index_col=["A", "B"]) assert result.index.names == ["A", "B"] - result = sql.read_sql_table('test_frame', self.conn, - index_col=["A", "B"], - columns=["C", "D"]) + result = sql.read_sql_table( + "test_frame", self.conn, index_col=["A", "B"], columns=["C", "D"] + ) assert result.index.names == ["A", "B"] assert result.columns.tolist() == ["C", "D"] def test_read_sql_delegate(self): - iris_frame1 = sql.read_sql_query( - "SELECT * FROM iris", self.conn) - iris_frame2 = sql.read_sql( - "SELECT * FROM iris", self.conn) + iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) + iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) tm.assert_frame_equal(iris_frame1, iris_frame2) - iris_frame1 = sql.read_sql_table('iris', self.conn) - iris_frame2 = sql.read_sql('iris', self.conn) + iris_frame1 = sql.read_sql_table("iris", 
self.conn) + iris_frame2 = sql.read_sql("iris", self.conn) tm.assert_frame_equal(iris_frame1, iris_frame2) def test_not_reflect_all_tables(self): @@ -933,8 +1004,8 @@ def test_not_reflect_all_tables(self): # Cause all warnings to always be triggered. warnings.simplefilter("always") # Trigger a warning. - sql.read_sql_table('other_table', self.conn) - sql.read_sql_query('SELECT * FROM other_table', self.conn) + sql.read_sql_table("other_table", self.conn) + sql.read_sql_query("SELECT * FROM other_table", self.conn) # Verify some things assert len(w) == 0 @@ -948,26 +1019,28 @@ def test_warning_case_insensitive_table_name(self): # Cause all warnings to always be triggered. warnings.simplefilter("always") # This should not trigger a Warning - self.test_frame1.to_sql('CaseSensitive', self.conn) + self.test_frame1.to_sql("CaseSensitive", self.conn) # Verify some things assert len(w) == 0 def _get_index_columns(self, tbl_name): from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) - ixs = insp.get_indexes('test_index_saved') - ixs = [i['column_names'] for i in ixs] + ixs = insp.get_indexes("test_index_saved") + ixs = [i["column_names"] for i in ixs] return ixs def test_sqlalchemy_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], - utc=True)}) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones - assert isinstance(table.table.c['time'].type, sqltypes.TIMESTAMP) + assert isinstance(table.table.c["time"].type, sqltypes.TIMESTAMP) def test_database_uri_string(self): @@ -977,12 +1050,12 @@ def test_database_uri_string(self): # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near # "iris": syntax error [SQL: 'iris'] with tm.ensure_clean() as name: - db_uri = 'sqlite:///' + name - table = 'iris' - test_frame1.to_sql(table, db_uri, if_exists='replace', index=False) + db_uri = "sqlite:///" + name + table = "iris" + test_frame1.to_sql(table, db_uri, if_exists="replace", index=False) test_frame2 = sql.read_sql(table, db_uri) test_frame3 = sql.read_sql_table(table, db_uri) - query = 'SELECT * FROM iris' + query = "SELECT * FROM iris" test_frame4 = sql.read_sql_query(query, db_uri) tm.assert_frame_equal(test_frame1, test_frame2) tm.assert_frame_equal(test_frame1, test_frame3) @@ -993,6 +1066,7 @@ def test_database_uri_string(self): try: # the rest of this test depends on pg8000's being absent import pg8000 # noqa + pytest.skip("pg8000 is installed") except ImportError: pass @@ -1004,34 +1078,35 @@ def test_database_uri_string(self): def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() - iris = sa.Table('iris', metadata, - sa.Column('SepalLength', sa.REAL), - sa.Column('SepalWidth', sa.REAL), - sa.Column('PetalLength', sa.REAL), - sa.Column('PetalWidth', sa.REAL), - sa.Column('Name', sa.TEXT) - ) + iris = sa.Table( + "iris", + metadata, + sa.Column("SepalLength", sa.REAL), + sa.Column("SepalWidth", sa.REAL), + sa.Column("PetalLength", sa.REAL), + sa.Column("PetalWidth", sa.REAL), + sa.Column("Name", sa.TEXT), + ) return iris def test_query_by_text_obj(self): # WIP : GH10846 - name_text = sqlalchemy.text('select * from iris where name=:name') - iris_df = sql.read_sql(name_text, self.conn, params={ - 'name': 'Iris-versicolor'}) - all_names 
= set(iris_df['Name']) - assert all_names == {'Iris-versicolor'} + name_text = sqlalchemy.text("select * from iris where name=:name") + iris_df = sql.read_sql(name_text, self.conn, params={"name": "Iris-versicolor"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-versicolor"} def test_query_by_select_obj(self): # WIP : GH10846 iris = self._make_iris_table_metadata() name_select = sqlalchemy.select([iris]).where( - iris.c.Name == sqlalchemy.bindparam('name')) - iris_df = sql.read_sql(name_select, self.conn, - params={'name': 'Iris-setosa'}) - all_names = set(iris_df['Name']) - assert all_names == {'Iris-setosa'} + iris.c.Name == sqlalchemy.bindparam("name") + ) + iris_df = sql.read_sql(name_select, self.conn, params={"name": "Iris-setosa"}) + all_names = set(iris_df["Name"]) + assert all_names == {"Iris-setosa"} class _EngineToConnMixin: @@ -1070,8 +1145,9 @@ class TestSQLiteFallbackApi(SQLiteMixIn, _TestSQLApi): Test the public sqlite connection fallback API """ - flavor = 'sqlite' - mode = 'fallback' + + flavor = "sqlite" + mode = "fallback" def connect(self, database=":memory:"): return sqlite3.connect(database) @@ -1083,20 +1159,18 @@ def test_sql_open_close(self): with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", - conn, index=False) + sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, index=False) conn.close() conn = self.connect(name) - result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", - conn) + result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn) conn.close() tm.assert_frame_equal(self.test_frame3, result) - @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason='SQLAlchemy is installed') + @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") def test_con_string_import_error(self): - conn = 'mysql://root@localhost/pandas_nosetest' + conn = "mysql://root@localhost/pandas_nosetest" msg = "Using URI string without sqlalchemy installed" with pytest.raises(ImportError, match=msg): sql.read_sql("SELECT * FROM iris", conn) @@ -1108,36 +1182,37 @@ def test_read_sql_delegate(self): msg = "Execution failed on sql 'iris': near \"iris\": syntax error" with pytest.raises(sql.DatabaseError, match=msg): - sql.read_sql('iris', self.conn) + sql.read_sql("iris", self.conn) def test_safe_names_warning(self): # GH 6798 - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b "]) # has a space # warns on create table with spaces in names with tm.assert_produces_warning(): sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) def test_get_schema2(self): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, 'test') - assert 'CREATE' in create_sql + create_sql = sql.get_schema(self.test_frame1, "test") + assert "CREATE" in create_sql def _get_sqlite_column_type(self, schema, column): - for col in schema.split('\n'): + for col in schema.split("\n"): if col.split()[0].strip('""') == column: return col.split()[1] - raise ValueError('Column %s not found' % (column)) + raise ValueError("Column %s not found" % (column)) def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) - df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], - utc=True)}) + df = DataFrame( + {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + ) db = sql.SQLiteDatabase(self.conn) table = 
sql.SQLiteTable("test_type", db, frame=df) schema = table.sql_schema() - assert self._get_sqlite_column_type(schema, 'time') == "TIMESTAMP" + assert self._get_sqlite_column_type(schema, "time") == "TIMESTAMP" # ----------------------------------------------------------------------------- @@ -1152,9 +1227,10 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): deviate for each flavor are overwritten there. """ + flavor = None - @pytest.fixture(autouse=True, scope='class') + @pytest.fixture(autouse=True, scope="class") def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1173,7 +1249,7 @@ def setup_method(self, load_iris_data): def setup_import(cls): # Skip this test if SQLAlchemy not available if not SQLALCHEMY_INSTALLED: - pytest.skip('SQLAlchemy not installed') + pytest.skip("SQLAlchemy not installed") @classmethod def setup_driver(cls): @@ -1190,8 +1266,7 @@ def setup_connect(self): # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: - pytest.skip( - "Can't connect to {0} server".format(self.flavor)) + pytest.skip("Can't connect to {0} server".format(self.flavor)) def test_read_sql(self): self._read_sql_iris() @@ -1218,7 +1293,7 @@ def test_to_sql_append(self): self._to_sql_append() def test_to_sql_method_multi(self): - self._to_sql(method='multi') + self._to_sql(method="multi") def test_to_sql_method_callable(self): self._to_sql_method_callable() @@ -1226,27 +1301,29 @@ def test_to_sql_method_callable(self): def test_create_table(self): temp_conn = self.connect() temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) pandasSQL = sql.SQLDatabase(temp_conn) - pandasSQL.to_sql(temp_frame, 'temp_frame') + pandasSQL.to_sql(temp_frame, "temp_frame") - assert temp_conn.has_table('temp_frame') + assert temp_conn.has_table("temp_frame") def test_drop_table(self): temp_conn = self.connect() temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) pandasSQL = sql.SQLDatabase(temp_conn) - pandasSQL.to_sql(temp_frame, 'temp_frame') + pandasSQL.to_sql(temp_frame, "temp_frame") - assert temp_conn.has_table('temp_frame') + assert temp_conn.has_table("temp_frame") - pandasSQL.drop_table('temp_frame') + pandasSQL.drop_table("temp_frame") - assert not temp_conn.has_table('temp_frame') + assert not temp_conn.has_table("temp_frame") def test_roundtrip(self): self._roundtrip() @@ -1260,9 +1337,9 @@ def test_read_table(self): def test_read_table_columns(self): iris_frame = sql.read_sql_table( - "iris", con=self.conn, columns=['SepalLength', 'SepalLength']) - tm.equalContents( - iris_frame.columns.values, ['SepalLength', 'SepalLength']) + "iris", con=self.conn, columns=["SepalLength", "SepalLength"] + ) + tm.equalContents(iris_frame.columns.values, ["SepalLength", "SepalLength"]) def test_read_table_absent_raises(self): msg = "Table this_doesnt_exist not found" @@ -1283,9 +1360,9 @@ def test_default_type_conversion(self): def test_bigint(self): # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={'i64': [2**62]}) - df.to_sql('test_bigint', self.conn, index=False) - result = sql.read_sql_table('test_bigint', self.conn) + df = DataFrame(data={"i64": [2 ** 62]}) + df.to_sql("test_bigint", self.conn, index=False) + result = sql.read_sql_table("test_bigint", self.conn) tm.assert_frame_equal(df, result) @@ -1308,32 +1385,35 @@ def check(col): # "2000-01-01 
00:00:00-08:00" should convert to # "2000-01-01 08:00:00" - assert col[0] == Timestamp('2000-01-01 08:00:00') + assert col[0] == Timestamp("2000-01-01 08:00:00") # "2000-06-01 00:00:00-07:00" should convert to # "2000-06-01 07:00:00" - assert col[1] == Timestamp('2000-06-01 07:00:00') + assert col[1] == Timestamp("2000-06-01 07:00:00") elif is_datetime64tz_dtype(col.dtype): - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" # "2000-06-01 00:00:00-07:00" should convert to # "2000-06-01 07:00:00" # GH 6415 - expected_data = [Timestamp('2000-01-01 08:00:00', tz='UTC'), - Timestamp('2000-06-01 07:00:00', tz='UTC')] + expected_data = [ + Timestamp("2000-01-01 08:00:00", tz="UTC"), + Timestamp("2000-06-01 07:00:00", tz="UTC"), + ] expected = Series(expected_data, name=col.name) tm.assert_series_equal(col, expected) else: - raise AssertionError("DateCol loaded with incorrect type " - "-> {0}".format(col.dtype)) + raise AssertionError( + "DateCol loaded with incorrect type " "-> {0}".format(col.dtype) + ) # GH11216 df = pd.read_sql_query("select * from types_test_data", self.conn) - if not hasattr(df, 'DateColWithTz'): + if not hasattr(df, "DateColWithTz"): pytest.skip("no column with datetime with time zone") # this is parsed on Travis (linux), but not on macosx for some reason @@ -1342,21 +1422,27 @@ def check(col): col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - df = pd.read_sql_query("select * from types_test_data", - self.conn, parse_dates=['DateColWithTz']) - if not hasattr(df, 'DateColWithTz'): + df = pd.read_sql_query( + "select * from types_test_data", self.conn, parse_dates=["DateColWithTz"] + ) + if not hasattr(df, "DateColWithTz"): pytest.skip("no column with datetime with time zone") col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" check(df.DateColWithTz) - df = pd.concat(list(pd.read_sql_query("select * from types_test_data", - self.conn, chunksize=1)), - ignore_index=True) + df = pd.concat( + list( + pd.read_sql_query( + "select * from types_test_data", self.conn, chunksize=1 + ) + ), + ignore_index=True, + ) col = df.DateColWithTz assert is_datetime64tz_dtype(col.dtype) - assert str(col.dt.tz) == 'UTC' + assert str(col.dt.tz) == "UTC" expected = sql.read_sql_table("types_test_data", self.conn) col = expected.DateColWithTz assert is_datetime64tz_dtype(col.dtype) @@ -1372,106 +1458,111 @@ def test_datetime_with_timezone_roundtrip(self): # Write datetimetz data to a db and read it back # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned - expected = DataFrame({'A': date_range( - '2013-01-01 09:00:00', periods=3, tz='US/Pacific' - )}) - expected.to_sql('test_datetime_tz', self.conn, index=False) + expected = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + ) + expected.to_sql("test_datetime_tz", self.conn, index=False) - if self.flavor == 'postgresql': + if self.flavor == "postgresql": # SQLAlchemy "timezones" (i.e. 
offsets) are coerced to UTC - expected['A'] = expected['A'].dt.tz_convert('UTC') + expected["A"] = expected["A"].dt.tz_convert("UTC") else: # Otherwise, timestamps are returned as local, naive - expected['A'] = expected['A'].dt.tz_localize(None) + expected["A"] = expected["A"].dt.tz_localize(None) - result = sql.read_sql_table('test_datetime_tz', self.conn) + result = sql.read_sql_table("test_datetime_tz", self.conn) tm.assert_frame_equal(result, expected) - result = sql.read_sql_query( - 'SELECT * FROM test_datetime_tz', self.conn - ) - if self.flavor == 'sqlite': + result = sql.read_sql_query("SELECT * FROM test_datetime_tz", self.conn) + if self.flavor == "sqlite": # read_sql_query does not return datetime type like read_sql_table - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A']) + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) tm.assert_frame_equal(result, expected) def test_naive_datetimeindex_roundtrip(self): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC - dates = date_range('2018-01-01', periods=5, freq='6H') - expected = DataFrame({'nums': range(5)}, index=dates) - expected.to_sql('foo_table', self.conn, index_label='info_date') - result = sql.read_sql_table('foo_table', self.conn, - index_col='info_date') + dates = date_range("2018-01-01", periods=5, freq="6H") + expected = DataFrame({"nums": range(5)}, index=dates) + expected.to_sql("foo_table", self.conn, index_label="info_date") + result = sql.read_sql_table("foo_table", self.conn, index_col="info_date") # result index with gain a name from a set_index operation; expected tm.assert_frame_equal(result, expected, check_names=False) def test_date_parsing(self): # No Parsing df = sql.read_sql_table("types_test_data", self.conn) - expected_type = object if self.flavor == 'sqlite' else np.datetime64 + expected_type = object if self.flavor == "sqlite" else np.datetime64 assert issubclass(df.DateCol.dtype.type, expected_type) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates=['DateCol']) + df = sql.read_sql_table("types_test_data", self.conn, parse_dates=["DateCol"]) assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"} + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, parse_dates={ - 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}}) + df = sql.read_sql_table( + "types_test_data", + self.conn, + parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}}, + ) assert issubclass(df.DateCol.dtype.type, np.datetime64) df = sql.read_sql_table( - "types_test_data", self.conn, parse_dates=['IntDateCol']) + "types_test_data", self.conn, parse_dates=["IntDateCol"] + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) df = sql.read_sql_table( - "types_test_data", self.conn, parse_dates={'IntDateCol': 's'}) + "types_test_data", self.conn, parse_dates={"IntDateCol": "s"} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) - df = sql.read_sql_table("types_test_data", self.conn, - parse_dates={'IntDateCol': {'unit': 's'}}) + df = sql.read_sql_table( + "types_test_data", self.conn, parse_dates={"IntDateCol": {"unit": "s"}} + ) assert issubclass(df.IntDateCol.dtype.type, np.datetime64) def test_datetime(self): - df = DataFrame({'A': 
date_range('2013-01-01 09:00:00', periods=3), - 'B': np.arange(3.0)}) - df.to_sql('test_datetime', self.conn) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.to_sql("test_datetime", self.conn) # with read_table -> type information from schema used - result = sql.read_sql_table('test_datetime', self.conn) - result = result.drop('index', axis=1) + result = sql.read_sql_table("test_datetime", self.conn) + result = result.drop("index", axis=1) tm.assert_frame_equal(result, df) # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) - result = result.drop('index', axis=1) - if self.flavor == 'sqlite': - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A']) + result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) + result = result.drop("index", axis=1) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"]) tm.assert_frame_equal(result, df) else: tm.assert_frame_equal(result, df) def test_datetime_NaT(self): - df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), - 'B': np.arange(3.0)}) - df.loc[1, 'A'] = np.nan - df.to_sql('test_datetime', self.conn, index=False) + df = DataFrame( + {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)} + ) + df.loc[1, "A"] = np.nan + df.to_sql("test_datetime", self.conn, index=False) # with read_table -> type information from schema used - result = sql.read_sql_table('test_datetime', self.conn) + result = sql.read_sql_table("test_datetime", self.conn) tm.assert_frame_equal(result, df) # with read_sql -> no type information -> sqlite has no native - result = sql.read_sql_query('SELECT * FROM test_datetime', self.conn) - if self.flavor == 'sqlite': - assert isinstance(result.loc[0, 'A'], str) - result['A'] = to_datetime(result['A'], errors='coerce') + result = sql.read_sql_query("SELECT * FROM test_datetime", self.conn) + if self.flavor == "sqlite": + assert isinstance(result.loc[0, "A"], str) + result["A"] = to_datetime(result["A"], errors="coerce") tm.assert_frame_equal(result, df) else: tm.assert_frame_equal(result, df) @@ -1479,18 +1570,18 @@ def test_datetime_NaT(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False) - res = read_sql_table('test_date', self.conn) - result = res['a'] - expected = to_datetime(df['a']) + df.to_sql("test_date", self.conn, index=False) + res = read_sql_table("test_date", self.conn) + result = res["a"] + expected = to_datetime(df["a"]) # comes back as datetime64 tm.assert_series_equal(result, expected) def test_datetime_time(self): # test support for datetime.time df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False) - res = read_sql_table('test_time', self.conn) + df.to_sql("test_time", self.conn, index=False) + res = read_sql_table("test_time", self.conn) tm.assert_frame_equal(res, df) # GH8341 @@ -1502,7 +1593,7 @@ def test_datetime_time(self): tm.assert_frame_equal(ref, res) # check if adapter is in place # then test if sqlalchemy is unaffected by the sqlite adapter sql.to_sql(df, "test_time3", self.conn, index=False) - if self.flavor == 'sqlite': + if self.flavor == "sqlite": res = sql.read_sql_query("SELECT * FROM test_time3", self.conn) ref = df.applymap(lambda _: 
_.strftime("%H:%M:%S.%f")) tm.assert_frame_equal(ref, res) @@ -1511,9 +1602,9 @@ def test_datetime_time(self): def test_mixed_dtype_insert(self): # see GH6509 - s1 = Series(2**25 + 1, dtype=np.int32) + s1 = Series(2 ** 25 + 1, dtype=np.int32) s2 = Series(0.0, dtype=np.float32) - df = DataFrame({'s1': s1, 's2': s2}) + df = DataFrame({"s1": s1, "s2": s2}) # write and read again df.to_sql("test_read_write", self.conn, index=False) @@ -1523,53 +1614,54 @@ def test_mixed_dtype_insert(self): def test_nan_numeric(self): # NaNs in numeric float column - df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]}) + df.to_sql("test_nan", self.conn, index=False) # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def test_nan_fullcolumn(self): # full NaN column (numeric float column) - df = DataFrame({'A': [0, 1, 2], 'B': [np.nan, np.nan, np.nan]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]}) + df.to_sql("test_nan", self.conn, index=False) # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql -> not type info from table -> stays None - df['B'] = df['B'].astype('object') - df['B'] = None - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + df["B"] = df["B"].astype("object") + df["B"] = None + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def test_nan_string(self): # NaNs in string column - df = DataFrame({'A': [0, 1, 2], 'B': ['a', 'b', np.nan]}) - df.to_sql('test_nan', self.conn, index=False) + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]}) + df.to_sql("test_nan", self.conn, index=False) # NaNs are coming back as None - df.loc[2, 'B'] = None + df.loc[2, "B"] = None # with read_table - result = sql.read_sql_table('test_nan', self.conn) + result = sql.read_sql_table("test_nan", self.conn) tm.assert_frame_equal(result, df) # with read_sql - result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) + result = sql.read_sql_query("SELECT * FROM test_nan", self.conn) tm.assert_frame_equal(result, df) def _get_index_columns(self, tbl_name): from sqlalchemy.engine import reflection + insp = reflection.Inspector.from_engine(self.conn) ixs = insp.get_indexes(tbl_name) - ixs = [i['column_names'] for i in ixs] + ixs = [i["column_names"] for i in ixs] return ixs def test_to_sql_save_index(self): @@ -1584,112 +1676,117 @@ def test_get_schema_create_table(self): # mismatch) self._load_test3_data() - tbl = 'test_get_schema_create_table' + tbl = "test_get_schema_create_table" create_sql = sql.get_schema(self.test_frame3, tbl, con=self.conn) blank_test_df = self.test_frame3.iloc[:0] self.drop_table(tbl) self.conn.execute(create_sql) returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, - check_index_type=False) + tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) self.drop_table(tbl) def test_dtype(self): - cols = ['A', 'B'] - data = [(0.8, True), - (0.9, None)] + cols = ["A", "B"] + data = 
[(0.8, True), (0.9, None)] df = DataFrame(data, columns=cols) - df.to_sql('dtype_test', self.conn) - df.to_sql('dtype_test2', self.conn, dtype={'B': sqlalchemy.TEXT}) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", self.conn, dtype={"B": sqlalchemy.TEXT}) meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - sqltype = meta.tables['dtype_test2'].columns['B'].type + sqltype = meta.tables["dtype_test2"].columns["B"].type assert isinstance(sqltype, sqlalchemy.TEXT) msg = "The type of B is not a SQLAlchemy type" with pytest.raises(ValueError, match=msg): - df.to_sql('error', self.conn, dtype={'B': str}) + df.to_sql("error", self.conn, dtype={"B": str}) # GH9083 - df.to_sql('dtype_test3', self.conn, dtype={'B': sqlalchemy.String(10)}) + df.to_sql("dtype_test3", self.conn, dtype={"B": sqlalchemy.String(10)}) meta.reflect() - sqltype = meta.tables['dtype_test3'].columns['B'].type + sqltype = meta.tables["dtype_test3"].columns["B"].type assert isinstance(sqltype, sqlalchemy.String) assert sqltype.length == 10 # single dtype - df.to_sql('single_dtype_test', self.conn, dtype=sqlalchemy.TEXT) + df.to_sql("single_dtype_test", self.conn, dtype=sqlalchemy.TEXT) meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - sqltypea = meta.tables['single_dtype_test'].columns['A'].type - sqltypeb = meta.tables['single_dtype_test'].columns['B'].type + sqltypea = meta.tables["single_dtype_test"].columns["A"].type + sqltypeb = meta.tables["single_dtype_test"].columns["B"].type assert isinstance(sqltypea, sqlalchemy.TEXT) assert isinstance(sqltypeb, sqlalchemy.TEXT) def test_notna_dtype(self): - cols = {'Bool': Series([True, None]), - 'Date': Series([datetime(2012, 5, 1), None]), - 'Int': Series([1, None], dtype='object'), - 'Float': Series([1.1, None]) - } + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } df = DataFrame(cols) - tbl = 'notna_dtype_test' + tbl = "notna_dtype_test" df.to_sql(tbl, self.conn) returned_df = sql.read_sql_table(tbl, self.conn) # noqa meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - if self.flavor == 'mysql': + if self.flavor == "mysql": my_type = sqltypes.Integer else: my_type = sqltypes.Boolean col_dict = meta.tables[tbl].columns - assert isinstance(col_dict['Bool'].type, my_type) - assert isinstance(col_dict['Date'].type, sqltypes.DateTime) - assert isinstance(col_dict['Int'].type, sqltypes.Integer) - assert isinstance(col_dict['Float'].type, sqltypes.Float) + assert isinstance(col_dict["Bool"].type, my_type) + assert isinstance(col_dict["Date"].type, sqltypes.DateTime) + assert isinstance(col_dict["Int"].type, sqltypes.Integer) + assert isinstance(col_dict["Float"].type, sqltypes.Float) def test_double_precision(self): V = 1.23456789101112131415 - df = DataFrame({'f32': Series([V, ], dtype='float32'), - 'f64': Series([V, ], dtype='float64'), - 'f64_as_f32': Series([V, ], dtype='float64'), - 'i32': Series([5, ], dtype='int32'), - 'i64': Series([5, ], dtype='int64'), - }) + df = DataFrame( + { + "f32": Series([V], dtype="float32"), + "f64": Series([V], dtype="float64"), + "f64_as_f32": Series([V], dtype="float64"), + "i32": Series([5], dtype="int32"), + "i64": Series([5], dtype="int64"), + } + ) - df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace', - dtype={'f64_as_f32': sqlalchemy.Float(precision=23)}) - res = sql.read_sql_table('test_dtypes', self.conn) + df.to_sql( + "test_dtypes", + 
self.conn, + index=False, + if_exists="replace", + dtype={"f64_as_f32": sqlalchemy.Float(precision=23)}, + ) + res = sql.read_sql_table("test_dtypes", self.conn) # check precision of float64 - assert (np.round(df['f64'].iloc[0], 14) == - np.round(res['f64'].iloc[0], 14)) + assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14) # check sql types meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() - col_dict = meta.tables['test_dtypes'].columns - assert str(col_dict['f32'].type) == str(col_dict['f64_as_f32'].type) - assert isinstance(col_dict['f32'].type, sqltypes.Float) - assert isinstance(col_dict['f64'].type, sqltypes.Float) - assert isinstance(col_dict['i32'].type, sqltypes.Integer) - assert isinstance(col_dict['i64'].type, sqltypes.BigInteger) + col_dict = meta.tables["test_dtypes"].columns + assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type) + assert isinstance(col_dict["f32"].type, sqltypes.Float) + assert isinstance(col_dict["f64"].type, sqltypes.Float) + assert isinstance(col_dict["i32"].type, sqltypes.Integer) + assert isinstance(col_dict["i64"].type, sqltypes.BigInteger) def test_connectable_issue_example(self): # This tests the example raised in issue # https://github.com/pandas-dev/pandas/issues/10104 def foo(connection): - query = 'SELECT test_foo_data FROM test_foo_data' + query = "SELECT test_foo_data FROM test_foo_data" return sql.read_sql_query(query, con=connection) def bar(connection, data): - data.to_sql(name='test_foo_data', - con=connection, if_exists='append') + data.to_sql(name="test_foo_data", con=connection, if_exists="append") def main(connectable): with connectable.connect() as conn: @@ -1697,18 +1794,17 @@ def main(connectable): foo_data = conn.run_callable(foo) conn.run_callable(bar, foo_data) - DataFrame({'test_foo_data': [0, 1, 2]}).to_sql( - 'test_foo_data', self.conn) + DataFrame({"test_foo_data": [0, 1, 2]}).to_sql("test_foo_data", self.conn) main(self.conn) def test_temporary_table(self): - test_data = 'Hello, World!' - expected = DataFrame({'spam': [test_data]}) + test_data = "Hello, World!" + expected = DataFrame({"spam": [test_data]}) Base = declarative.declarative_base() class Temporary(Base): - __tablename__ = 'temp_test' - __table_args__ = {'prefixes': ['TEMPORARY']} + __tablename__ = "temp_test" + __table_args__ = {"prefixes": ["TEMPORARY"]} id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) spam = sqlalchemy.Column(sqlalchemy.Unicode(30), nullable=False) @@ -1719,19 +1815,14 @@ class Temporary(Base): Temporary.__table__.create(conn) session.add(Temporary(spam=test_data)) session.flush() - df = sql.read_sql_query( - sql=sqlalchemy.select([Temporary.spam]), - con=conn, - ) + df = sql.read_sql_query(sql=sqlalchemy.select([Temporary.spam]), con=conn) tm.assert_frame_equal(df, expected) class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): - def test_transactions(self): - pytest.skip( - "Nested transactions rollbacks don't work with Pandas") + pytest.skip("Nested transactions rollbacks don't work with Pandas") class _TestSQLiteAlchemy: @@ -1739,11 +1830,12 @@ class _TestSQLiteAlchemy: Test the sqlalchemy backend against an in-memory sqlite database. 
""" - flavor = 'sqlite' + + flavor = "sqlite" @classmethod def connect(cls): - return sqlalchemy.create_engine('sqlite:///:memory:') + return sqlalchemy.create_engine("sqlite:///:memory:") @classmethod def setup_driver(cls): @@ -1773,12 +1865,12 @@ def test_default_date_load(self): def test_bigint_warning(self): # test no warning for BIGINT (to support int64) is raised (GH7433) - df = DataFrame({'a': [1, 2]}, dtype='int64') - df.to_sql('test_bigintwarning', self.conn, index=False) + df = DataFrame({"a": [1, 2]}, dtype="int64") + df.to_sql("test_bigintwarning", self.conn, index=False) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - sql.read_sql_table('test_bigintwarning', self.conn) + sql.read_sql_table("test_bigintwarning", self.conn) assert len(w) == 0 @@ -1787,20 +1879,21 @@ class _TestMySQLAlchemy: Test the sqlalchemy backend against an MySQL database. """ - flavor = 'mysql' + + flavor = "mysql" @classmethod def connect(cls): - url = 'mysql+{driver}://root@localhost/pandas_nosetest' - return sqlalchemy.create_engine(url.format(driver=cls.driver), - connect_args=cls.connect_args) + url = "mysql+{driver}://root@localhost/pandas_nosetest" + return sqlalchemy.create_engine( + url.format(driver=cls.driver), connect_args=cls.connect_args + ) @classmethod def setup_driver(cls): - pymysql = pytest.importorskip('pymysql') - cls.driver = 'pymysql' - cls.connect_args = { - 'client_flag': pymysql.constants.CLIENT.MULTI_STATEMENTS} + pymysql = pytest.importorskip("pymysql") + cls.driver = "pymysql" + cls.connect_args = {"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS} def test_default_type_conversion(self): df = sql.read_sql_table("types_test_data", self.conn) @@ -1819,10 +1912,11 @@ def test_default_type_conversion(self): def test_read_procedure(self): import pymysql + # see GH7324. Although it is more an api test, it is added to the # mysql tests as sqlite does not have stored procedures - df = DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}) - df.to_sql('test_procedure', self.conn, index=False) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df.to_sql("test_procedure", self.conn, index=False) proc = """DROP PROCEDURE IF EXISTS get_testdb; @@ -1854,48 +1948,49 @@ class _TestPostgreSQLAlchemy: Test the sqlalchemy backend against an PostgreSQL database. 
""" - flavor = 'postgresql' + + flavor = "postgresql" @classmethod def connect(cls): - url = 'postgresql+{driver}://postgres@localhost/pandas_nosetest' + url = "postgresql+{driver}://postgres@localhost/pandas_nosetest" return sqlalchemy.create_engine(url.format(driver=cls.driver)) @classmethod def setup_driver(cls): - pytest.importorskip('psycopg2') - cls.driver = 'psycopg2' + pytest.importorskip("psycopg2") + cls.driver = "psycopg2" def test_schema_support(self): # only test this for postgresql (schema's not supported in # mysql/sqlite) - df = DataFrame({'col1': [1, 2], 'col2': [ - 0.1, 0.2], 'col3': ['a', 'n']}) + df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) # create a schema self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") self.conn.execute("CREATE SCHEMA other;") # write dataframe to different schema's - df.to_sql('test_schema_public', self.conn, index=False) - df.to_sql('test_schema_public_explicit', self.conn, index=False, - schema='public') - df.to_sql('test_schema_other', self.conn, index=False, schema='other') + df.to_sql("test_schema_public", self.conn, index=False) + df.to_sql( + "test_schema_public_explicit", self.conn, index=False, schema="public" + ) + df.to_sql("test_schema_other", self.conn, index=False, schema="other") # read dataframes back in - res1 = sql.read_sql_table('test_schema_public', self.conn) + res1 = sql.read_sql_table("test_schema_public", self.conn) tm.assert_frame_equal(df, res1) - res2 = sql.read_sql_table('test_schema_public_explicit', self.conn) + res2 = sql.read_sql_table("test_schema_public_explicit", self.conn) tm.assert_frame_equal(df, res2) - res3 = sql.read_sql_table('test_schema_public_explicit', self.conn, - schema='public') + res3 = sql.read_sql_table( + "test_schema_public_explicit", self.conn, schema="public" + ) tm.assert_frame_equal(df, res3) - res4 = sql.read_sql_table('test_schema_other', self.conn, - schema='other') + res4 = sql.read_sql_table("test_schema_other", self.conn, schema="other") tm.assert_frame_equal(df, res4) msg = "Table test_schema_other not found" with pytest.raises(ValueError, match=msg): - sql.read_sql_table('test_schema_other', self.conn, schema='public') + sql.read_sql_table("test_schema_other", self.conn, schema="public") # different if_exists options @@ -1904,13 +1999,22 @@ def test_schema_support(self): self.conn.execute("CREATE SCHEMA other;") # write dataframe with different if_exists options - df.to_sql('test_schema_other', self.conn, schema='other', index=False) - df.to_sql('test_schema_other', self.conn, schema='other', index=False, - if_exists='replace') - df.to_sql('test_schema_other', self.conn, schema='other', index=False, - if_exists='append') - res = sql.read_sql_table( - 'test_schema_other', self.conn, schema='other') + df.to_sql("test_schema_other", self.conn, schema="other", index=False) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="replace", + ) + df.to_sql( + "test_schema_other", + self.conn, + schema="other", + index=False, + if_exists="append", + ) + res = sql.read_sql_table("test_schema_other", self.conn, schema="other") tm.assert_frame_equal(concat([df, df], ignore_index=True), res) # specifying schema in user-provided meta @@ -1919,16 +2023,13 @@ def test_schema_support(self): # because of transactional schemas if isinstance(self.conn, sqlalchemy.engine.Engine): engine2 = self.connect() - meta = sqlalchemy.MetaData(engine2, schema='other') + meta = sqlalchemy.MetaData(engine2, schema="other") pdsql = 
sql.SQLDatabase(engine2, meta=meta) - pdsql.to_sql(df, 'test_schema_other2', index=False) - pdsql.to_sql(df, 'test_schema_other2', - index=False, if_exists='replace') - pdsql.to_sql(df, 'test_schema_other2', - index=False, if_exists='append') - res1 = sql.read_sql_table( - 'test_schema_other2', self.conn, schema='other') - res2 = pdsql.read_table('test_schema_other2') + pdsql.to_sql(df, "test_schema_other2", index=False) + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="replace") + pdsql.to_sql(df, "test_schema_other2", index=False, if_exists="append") + res1 = sql.read_sql_table("test_schema_other2", self.conn, schema="other") + res2 = pdsql.read_table("test_schema_other2") tm.assert_frame_equal(res1, res2) def test_copy_from_callable_insertion_method(self): @@ -1944,21 +2045,22 @@ def psql_insert_copy(table, conn, keys, data_iter): writer.writerows(data_iter) s_buf.seek(0) - columns = ', '.join('"{}"'.format(k) for k in keys) + columns = ", ".join('"{}"'.format(k) for k in keys) if table.schema: - table_name = '{}.{}'.format(table.schema, table.name) + table_name = "{}.{}".format(table.schema, table.name) else: table_name = table.name - sql_query = 'COPY {} ({}) FROM STDIN WITH CSV'.format( - table_name, columns) + sql_query = "COPY {} ({}) FROM STDIN WITH CSV".format( + table_name, columns + ) cur.copy_expert(sql=sql_query, file=s_buf) - expected = DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2], - 'col3': ['a', 'n']}) - expected.to_sql('test_copy_insert', self.conn, index=False, - method=psql_insert_copy) - result = sql.read_sql_table('test_copy_insert', self.conn) + expected = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) + expected.to_sql( + "test_copy_insert", self.conn, index=False, method=psql_insert_copy + ) + result = sql.read_sql_table("test_copy_insert", self.conn) tm.assert_frame_equal(result, expected) @@ -1999,17 +2101,19 @@ class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn): # ----------------------------------------------------------------------------- # -- Test Sqlite / MySQL fallback + @pytest.mark.single class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): """ Test the fallback mode against an in-memory sqlite database. 
""" - flavor = 'sqlite' + + flavor = "sqlite" @classmethod def connect(cls): - return sqlite3.connect(':memory:') + return sqlite3.connect(":memory:") def setup_connect(self): self.conn = self.connect() @@ -2048,15 +2152,16 @@ def test_to_sql_append(self): def test_create_and_drop_table(self): temp_frame = DataFrame( - {'one': [1., 2., 3., 4.], 'two': [4., 3., 2., 1.]}) + {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} + ) - self.pandasSQL.to_sql(temp_frame, 'drop_test_frame') + self.pandasSQL.to_sql(temp_frame, "drop_test_frame") - assert self.pandasSQL.has_table('drop_test_frame') + assert self.pandasSQL.has_table("drop_test_frame") - self.pandasSQL.drop_table('drop_test_frame') + self.pandasSQL.drop_table("drop_test_frame") - assert not self.pandasSQL.has_table('drop_test_frame') + assert not self.pandasSQL.has_table("drop_test_frame") def test_roundtrip(self): self._roundtrip() @@ -2067,32 +2172,33 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False) - res = read_sql_query('SELECT * FROM test_date', self.conn) - if self.flavor == 'sqlite': + df.to_sql("test_date", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_date", self.conn) + if self.flavor == "sqlite": # comes back as strings tm.assert_frame_equal(res, df.astype(str)) - elif self.flavor == 'mysql': + elif self.flavor == "mysql": tm.assert_frame_equal(res, df) def test_datetime_time(self): # test support for datetime.time, GH #8341 df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False) - res = read_sql_query('SELECT * FROM test_time', self.conn) - if self.flavor == 'sqlite': + df.to_sql("test_time", self.conn, index=False) + res = read_sql_query("SELECT * FROM test_time", self.conn) + if self.flavor == "sqlite": # comes back as strings expected = df.applymap(lambda _: _.strftime("%H:%M:%S.%f")) tm.assert_frame_equal(res, expected) def _get_index_columns(self, tbl_name): ixs = sql.read_sql_query( - "SELECT * FROM sqlite_master WHERE type = 'index' " + - "AND tbl_name = '%s'" % tbl_name, self.conn) + "SELECT * FROM sqlite_master WHERE type = 'index' " + + "AND tbl_name = '%s'" % tbl_name, + self.conn, + ) ix_cols = [] for ix_name in ixs.name: - ix_info = sql.read_sql_query( - "PRAGMA index_info(%s)" % ix_name, self.conn) + ix_info = sql.read_sql_query("PRAGMA index_info(%s)" % ix_name, self.conn) ix_cols.append(ix_info.name.tolist()) return ix_cols @@ -2105,76 +2211,81 @@ def test_transactions(self): self._transaction_test() def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute('PRAGMA table_info(%s)' % table) + recs = self.conn.execute("PRAGMA table_info(%s)" % table) for cid, name, ctype, not_null, default, pk in recs: if name == column: return ctype - raise ValueError('Table %s, column %s not found' % (table, column)) + raise ValueError("Table %s, column %s not found" % (table, column)) def test_dtype(self): - if self.flavor == 'mysql': - pytest.skip('Not applicable to MySQL legacy') - cols = ['A', 'B'] - data = [(0.8, True), - (0.9, None)] + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + cols = ["A", "B"] + data = [(0.8, True), (0.9, None)] df = DataFrame(data, columns=cols) - df.to_sql('dtype_test', self.conn) - df.to_sql('dtype_test2', self.conn, dtype={'B': 'STRING'}) + df.to_sql("dtype_test", self.conn) + df.to_sql("dtype_test2", 
self.conn, dtype={"B": "STRING"}) # sqlite stores Boolean values as INTEGER - assert self._get_sqlite_column_type( - 'dtype_test', 'B') == 'INTEGER' + assert self._get_sqlite_column_type("dtype_test", "B") == "INTEGER" - assert self._get_sqlite_column_type( - 'dtype_test2', 'B') == 'STRING' + assert self._get_sqlite_column_type("dtype_test2", "B") == "STRING" msg = r"B \(\) not a string" with pytest.raises(ValueError, match=msg): - df.to_sql('error', self.conn, dtype={'B': bool}) + df.to_sql("error", self.conn, dtype={"B": bool}) # single dtype - df.to_sql('single_dtype_test', self.conn, dtype='STRING') - assert self._get_sqlite_column_type( - 'single_dtype_test', 'A') == 'STRING' - assert self._get_sqlite_column_type( - 'single_dtype_test', 'B') == 'STRING' + df.to_sql("single_dtype_test", self.conn, dtype="STRING") + assert self._get_sqlite_column_type("single_dtype_test", "A") == "STRING" + assert self._get_sqlite_column_type("single_dtype_test", "B") == "STRING" def test_notna_dtype(self): - if self.flavor == 'mysql': - pytest.skip('Not applicable to MySQL legacy') - - cols = {'Bool': Series([True, None]), - 'Date': Series([datetime(2012, 5, 1), None]), - 'Int': Series([1, None], dtype='object'), - 'Float': Series([1.1, None]) - } + if self.flavor == "mysql": + pytest.skip("Not applicable to MySQL legacy") + + cols = { + "Bool": Series([True, None]), + "Date": Series([datetime(2012, 5, 1), None]), + "Int": Series([1, None], dtype="object"), + "Float": Series([1.1, None]), + } df = DataFrame(cols) - tbl = 'notna_dtype_test' + tbl = "notna_dtype_test" df.to_sql(tbl, self.conn) - assert self._get_sqlite_column_type(tbl, 'Bool') == 'INTEGER' - assert self._get_sqlite_column_type(tbl, 'Date') == 'TIMESTAMP' - assert self._get_sqlite_column_type(tbl, 'Int') == 'INTEGER' - assert self._get_sqlite_column_type(tbl, 'Float') == 'REAL' + assert self._get_sqlite_column_type(tbl, "Bool") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Date") == "TIMESTAMP" + assert self._get_sqlite_column_type(tbl, "Int") == "INTEGER" + assert self._get_sqlite_column_type(tbl, "Float") == "REAL" def test_illegal_names(self): # For sqlite, these should work fine - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) msg = "Empty table or column name specified" with pytest.raises(ValueError, match=msg): df.to_sql("", self.conn) for ndx, weird_name in enumerate( - ['test_weird_name]', 'test_weird_name[', - 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', - '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', - '99beginswithnumber', '12345', '\xe9']): + [ + "test_weird_name]", + "test_weird_name[", + "test_weird_name`", + 'test_weird_name"', + "test_weird_name'", + "_b.test_weird_name_01-30", + '"_b.test_weird_name_01-30"', + "99beginswithnumber", + "12345", + "\xe9", + ] + ): df.to_sql(weird_name, self.conn) sql.table_exists(weird_name, self.conn) - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) - c_tbl = 'test_weird_col_name%d' % ndx + df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) + c_tbl = "test_weird_col_name%d" % ndx df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) @@ -2185,7 +2296,7 @@ def test_illegal_names(self): def date_format(dt): """Returns date in YYYYMMDD format.""" - return dt.strftime('%Y%m%d') + return dt.strftime("%Y%m%d") _formatters = { @@ -2227,17 +2338,16 @@ def tquery(query, con=None, cur=None): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - 
@pytest.fixture(autouse=True) def setup_method(self, request, datapath): self.method = request.function - self.conn = sqlite3.connect(':memory:') + self.conn = sqlite3.connect(":memory:") # In some test cases we may close db connection # Re-open conn here so we can perform cleanup in teardown yield self.method = request.function - self.conn = sqlite3.connect(':memory:') + self.conn = sqlite3.connect(":memory:") def test_basic(self): frame = tm.makeTimeDataFrame() @@ -2247,7 +2357,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.iloc[0, 0] = np.nan - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(create_sql) @@ -2266,7 +2376,7 @@ def test_write_row_by_row(self): def test_execute(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(create_sql) ins = "INSERT INTO test VALUES (?, ?, ?, ?)" @@ -2281,15 +2391,15 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") lines = create_sql.splitlines() for l in lines: - tokens = l.split(' ') - if len(tokens) == 2 and tokens[0] == 'A': - assert tokens[1] == 'DATETIME' + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) lines = create_sql.splitlines() assert 'PRIMARY KEY ("A", "B")' in create_sql cur = self.conn.cursor() @@ -2337,7 +2447,7 @@ def test_na_roundtrip(self): pass def _check_roundtrip(self, frame): - sql.to_sql(frame, name='test_table', con=self.conn, index=False) + sql.to_sql(frame, name="test_table", con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. 
@@ -2346,32 +2456,30 @@ def _check_roundtrip(self, frame): expected = frame tm.assert_frame_equal(result, expected) - frame['txt'] = ['a'] * len(frame) + frame["txt"] = ["a"] * len(frame) frame2 = frame.copy() new_idx = Index(np.arange(len(frame2))) + 10 - frame2['Idx'] = new_idx.copy() - sql.to_sql(frame2, name='test_table2', con=self.conn, index=False) - result = sql.read_sql("select * from test_table2", self.conn, - index_col='Idx') + frame2["Idx"] = new_idx.copy() + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") expected = frame.copy() expected.index = new_idx - expected.index.name = 'Idx' + expected.index.name = "Idx" tm.assert_frame_equal(expected, result) def test_keyword_as_column_names(self): - df = DataFrame({'From': np.ones(5)}) - sql.to_sql(df, con=self.conn, name='testkeywords', index=False) + df = DataFrame({"From": np.ones(5)}) + sql.to_sql(df, con=self.conn, name="testkeywords", index=False) def test_onecolumn_of_integer(self): # GH 3628 # a column_of_integers dataframe should transfer well to sql - mono_df = DataFrame([1, 2], columns=['c0']) - sql.to_sql(mono_df, con=self.conn, name='mono_df', index=False) + mono_df = DataFrame([1, 2], columns=["c0"]) + sql.to_sql(mono_df, con=self.conn, name="mono_df", index=False) # computing the sum via sql con_x = self.conn - the_sum = sum(my_c0[0] - for my_c0 in con_x.execute("select * from mono_df")) + the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df")) # it should not fail, and gives 3 ( Issue #3628 ) assert the_sum == 3 @@ -2379,10 +2487,9 @@ def test_onecolumn_of_integer(self): tm.assert_frame_equal(result, mono_df) def test_if_exists(self): - df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame( - {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) - table_name = 'table_if_exists' + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" sql_select = "SELECT * FROM %s" % table_name def clean_up(test_table_to_drop): @@ -2394,81 +2501,113 @@ def clean_up(test_table_to_drop): msg = "'notvalidvalue' is not valid for if_exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='notvalidvalue') + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, - name=table_name, if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) msg = "Table 'table_if_exists' already exists" with pytest.raises(ValueError, match=msg): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) # test if_exists='replace' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert (tquery(sql_select, con=self.conn) == - [(3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert 
tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] clean_up(table_name) # test if_exists='append' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='append', index=False) - assert (tquery(sql_select, con=self.conn) == - [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] clean_up(table_name) @pytest.mark.single @pytest.mark.db -@pytest.mark.skip(reason="gh-13611: there is no support for MySQL " - "if SQLAlchemy is not installed") +@pytest.mark.skip( + reason="gh-13611: there is no support for MySQL " "if SQLAlchemy is not installed" +) class TestXMySQL(MySQLMixIn): - - @pytest.fixture(autouse=True, scope='class') + @pytest.fixture(autouse=True, scope="class") def setup_class(cls): - pymysql = pytest.importorskip('pymysql') - pymysql.connect(host='localhost', user='root', passwd='', - db='pandas_nosetest') + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") try: - pymysql.connect(read_default_group='pandas') + pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError: raise RuntimeError( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) except pymysql.Error: raise RuntimeError( "Cannot connect to database. " "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) @pytest.fixture(autouse=True) def setup_method(self, request, datapath): - pymysql = pytest.importorskip('pymysql') - pymysql.connect(host='localhost', user='root', passwd='', - db='pandas_nosetest') + pymysql = pytest.importorskip("pymysql") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") try: - pymysql.connect(read_default_group='pandas') + pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError: raise RuntimeError( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." + ) except pymysql.Error: raise RuntimeError( "Cannot connect to database. " "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " - "typically located at ~/.my.cnf or /etc/.my.cnf.") + "typically located at ~/.my.cnf or /etc/.my.cnf." 
+ ) self.method = request.function @@ -2480,7 +2619,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.iloc[0, 0] = np.nan drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2501,18 +2640,19 @@ def test_chunksize_read_type(self): drop_sql = "DROP TABLE IF EXISTS test" cur = self.conn.cursor() cur.execute(drop_sql) - sql.to_sql(frame, name='test', con=self.conn) + sql.to_sql(frame, name="test", con=self.conn) query = "select * from test" chunksize = 5 - chunk_gen = pd.read_sql_query(sql=query, con=self.conn, - chunksize=chunksize, index_col="index") + chunk_gen = pd.read_sql_query( + sql=query, con=self.conn, chunksize=chunksize, index_col="index" + ) chunk_df = next(chunk_gen) tm.assert_frame_equal(frame[:chunksize], chunk_df) def test_execute(self): frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") @@ -2530,18 +2670,18 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test') + create_sql = sql.get_schema(frame, "test") lines = create_sql.splitlines() for l in lines: - tokens = l.split(' ') - if len(tokens) == 2 and tokens[0] == 'A': - assert tokens[1] == 'DATETIME' + tokens = l.split(" ") + if len(tokens) == 2 and tokens[0] == "A": + assert tokens[1] == "DATETIME" frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) + create_sql = sql.get_schema(frame, "test", keys=["A", "B"]) lines = create_sql.splitlines() - assert 'PRIMARY KEY (`A`, `B`)' in create_sql + assert "PRIMARY KEY (`A`, `B`)" in create_sql cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) @@ -2600,7 +2740,7 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', con=self.conn, index=False) + sql.to_sql(frame, name="test_table", con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. @@ -2610,19 +2750,17 @@ def _check_roundtrip(self, frame): expected = frame tm.assert_frame_equal(result, expected) - frame['txt'] = ['a'] * len(frame) + frame["txt"] = ["a"] * len(frame) frame2 = frame.copy() index = Index(np.arange(len(frame2))) + 10 - frame2['Idx'] = index + frame2["Idx"] = index drop_sql = "DROP TABLE IF EXISTS test_table2" cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame2, name='test_table2', - con=self.conn, index=False) - result = sql.read_sql("select * from test_table2", self.conn, - index_col='Idx') + sql.to_sql(frame2, name="test_table2", con=self.conn, index=False) + result = sql.read_sql("select * from test_table2", self.conn, index_col="Idx") expected = frame.copy() # HACK! Change this once indexes are handled properly. 
@@ -2631,15 +2769,15 @@ def _check_roundtrip(self, frame): tm.assert_frame_equal(expected, result) def test_keyword_as_column_names(self): - df = DataFrame({'From': np.ones(5)}) - sql.to_sql(df, con=self.conn, name='testkeywords', - if_exists='replace', index=False) + df = DataFrame({"From": np.ones(5)}) + sql.to_sql( + df, con=self.conn, name="testkeywords", if_exists="replace", index=False + ) def test_if_exists(self): - df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame( - {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) - table_name = 'table_if_exists' + df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) + df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) + table_name = "table_if_exists" sql_select = "SELECT * FROM %s" % table_name def clean_up(test_table_to_drop): @@ -2651,33 +2789,67 @@ def clean_up(test_table_to_drop): # test if invalid value for if_exists raises appropriate error with pytest.raises(ValueError, match=""): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='notvalidvalue') + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="notvalidvalue", + ) clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) with pytest.raises(ValueError, match=""): - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail') + sql.to_sql( + frame=df_if_exists_1, con=self.conn, name=table_name, if_exists="fail" + ) # test if_exists='replace' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='replace', index=False) - assert (tquery(sql_select, con=self.conn) == - [(3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="replace", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(3, "C"), (4, "D"), (5, "E")] clean_up(table_name) # test if_exists='append' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - if_exists='fail', index=False) - assert tquery(sql_select, con=self.conn) == [(1, 'A'), (2, 'B')] - sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - if_exists='append', index=False) - assert (tquery(sql_select, con=self.conn) == - [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) + sql.to_sql( + frame=df_if_exists_1, + con=self.conn, + name=table_name, + if_exists="fail", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [(1, "A"), (2, "B")] + sql.to_sql( + frame=df_if_exists_2, + con=self.conn, + name=table_name, + if_exists="append", + index=False, + ) + assert tquery(sql_select, con=self.conn) == [ + (1, "A"), + (2, "B"), + (3, "C"), + (4, "D"), + (5, "E"), + ] clean_up(table_name) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e6fe6e3b7888f..715c7e370210f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -18,8 +18,12 @@ from pandas.io.parsers import read_csv from 
pandas.io.stata import ( - InvalidColumnName, PossiblePrecisionLoss, StataMissingValue, StataReader, - read_stata) + InvalidColumnName, + PossiblePrecisionLoss, + StataMissingValue, + StataReader, + read_stata, +) @pytest.fixture @@ -29,77 +33,75 @@ def dirpath(datapath): @pytest.fixture def parsed_114(dirpath): - dta14_114 = os.path.join(dirpath, 'stata5_114.dta') + dta14_114 = os.path.join(dirpath, "stata5_114.dta") parsed_114 = read_stata(dta14_114, convert_dates=True) - parsed_114.index.name = 'index' + parsed_114.index.name = "index" return parsed_114 class TestStata: - @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "data") - self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') - self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + self.dta1_114 = os.path.join(self.dirpath, "stata1_114.dta") + self.dta1_117 = os.path.join(self.dirpath, "stata1_117.dta") - self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') - self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') - self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') - self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + self.dta2_113 = os.path.join(self.dirpath, "stata2_113.dta") + self.dta2_114 = os.path.join(self.dirpath, "stata2_114.dta") + self.dta2_115 = os.path.join(self.dirpath, "stata2_115.dta") + self.dta2_117 = os.path.join(self.dirpath, "stata2_117.dta") - self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') - self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') - self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') - self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') - self.csv3 = os.path.join(self.dirpath, 'stata3.csv') + self.dta3_113 = os.path.join(self.dirpath, "stata3_113.dta") + self.dta3_114 = os.path.join(self.dirpath, "stata3_114.dta") + self.dta3_115 = os.path.join(self.dirpath, "stata3_115.dta") + self.dta3_117 = os.path.join(self.dirpath, "stata3_117.dta") + self.csv3 = os.path.join(self.dirpath, "stata3.csv") - self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') - self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') - self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') - self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta4_113 = os.path.join(self.dirpath, "stata4_113.dta") + self.dta4_114 = os.path.join(self.dirpath, "stata4_114.dta") + self.dta4_115 = os.path.join(self.dirpath, "stata4_115.dta") + self.dta4_117 = os.path.join(self.dirpath, "stata4_117.dta") - self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta_encoding_118 = os.path.join(self.dirpath, - 'stata1_encoding_118.dta') + self.dta_encoding = os.path.join(self.dirpath, "stata1_encoding.dta") + self.dta_encoding_118 = os.path.join(self.dirpath, "stata1_encoding_118.dta") - self.csv14 = os.path.join(self.dirpath, 'stata5.csv') - self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') - self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') - self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') - self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta') + self.csv14 = os.path.join(self.dirpath, "stata5.csv") + self.dta14_113 = os.path.join(self.dirpath, "stata5_113.dta") + self.dta14_114 = os.path.join(self.dirpath, "stata5_114.dta") + self.dta14_115 = os.path.join(self.dirpath, "stata5_115.dta") + self.dta14_117 = os.path.join(self.dirpath, "stata5_117.dta") - self.csv15 = os.path.join(self.dirpath, 'stata6.csv') - 
self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') - self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') - self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') - self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta') + self.csv15 = os.path.join(self.dirpath, "stata6.csv") + self.dta15_113 = os.path.join(self.dirpath, "stata6_113.dta") + self.dta15_114 = os.path.join(self.dirpath, "stata6_114.dta") + self.dta15_115 = os.path.join(self.dirpath, "stata6_115.dta") + self.dta15_117 = os.path.join(self.dirpath, "stata6_117.dta") - self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta') - self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta') + self.dta16_115 = os.path.join(self.dirpath, "stata7_115.dta") + self.dta16_117 = os.path.join(self.dirpath, "stata7_117.dta") - self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta') - self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta') - self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta') + self.dta17_113 = os.path.join(self.dirpath, "stata8_113.dta") + self.dta17_115 = os.path.join(self.dirpath, "stata8_115.dta") + self.dta17_117 = os.path.join(self.dirpath, "stata8_117.dta") - self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta') - self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta') + self.dta18_115 = os.path.join(self.dirpath, "stata9_115.dta") + self.dta18_117 = os.path.join(self.dirpath, "stata9_117.dta") - self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta') - self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta') + self.dta19_115 = os.path.join(self.dirpath, "stata10_115.dta") + self.dta19_117 = os.path.join(self.dirpath, "stata10_117.dta") - self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta') - self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta') + self.dta20_115 = os.path.join(self.dirpath, "stata11_115.dta") + self.dta20_117 = os.path.join(self.dirpath, "stata11_117.dta") - self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta') + self.dta21_117 = os.path.join(self.dirpath, "stata12_117.dta") - self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta') - self.dta23 = os.path.join(self.dirpath, 'stata15.dta') + self.dta22_118 = os.path.join(self.dirpath, "stata14_118.dta") + self.dta23 = os.path.join(self.dirpath, "stata15.dta") - self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta') - self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta') + self.dta24_111 = os.path.join(self.dirpath, "stata7_111.dta") + self.dta25_118 = os.path.join(self.dirpath, "stata16_118.dta") - self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') + self.stata_dates = os.path.join(self.dirpath, "stata13_dates.dta") def read_dta(self, file): # Legacy default reader configuration @@ -108,9 +110,9 @@ def read_dta(self, file): def read_csv(self, file): return read_csv(file, parse_dates=True) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_empty_dta(self, version): - empty_ds = DataFrame(columns=['unit']) + empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file with tm.ensure_clean() as path: empty_ds.to_stata(path, write_index=False, version=version) @@ -127,8 +129,7 @@ def test_data_method(self): parsed_114_read = rdr.read() tm.assert_frame_equal(parsed_114_data, parsed_114_read) - @pytest.mark.parametrize( - 'file', ['dta1_114', 'dta1_117']) + @pytest.mark.parametrize("file", ["dta1_114", "dta1_117"]) def 
test_read_dta1(self, file): file = getattr(self, file) @@ -136,13 +137,14 @@ def test_read_dta1(self, file): # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. - expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], - columns=['float_miss', 'double_miss', 'byte_miss', - 'int_miss', 'long_miss']) + expected = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) # this is an oddity as really the nan should be float64, but # the casting doesn't fail so need to match stata here - expected['float_miss'] = expected['float_miss'].astype(np.float32) + expected["float_miss"] = expected["float_miss"].astype(np.float32) tm.assert_frame_equal(parsed, expected) @@ -158,7 +160,7 @@ def test_read_dta2(self): datetime(2010, 1, 1), datetime(1974, 7, 1), datetime(2010, 1, 1), - datetime(2010, 1, 1) + datetime(2010, 1, 1), ), ( datetime(1959, 12, 31, 20, 3, 20), @@ -168,24 +170,22 @@ def test_read_dta2(self): datetime(1955, 1, 1), datetime(1955, 7, 1), datetime(1955, 1, 1), - datetime(2, 1, 1) + datetime(2, 1, 1), ), - ( - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - pd.NaT, - ) + (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT), + ], + columns=[ + "datetime_c", + "datetime_big_c", + "date", + "weekly_date", + "monthly_date", + "quarterly_date", + "half_yearly_date", + "yearly_date", ], - columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date', - 'monthly_date', 'quarterly_date', 'half_yearly_date', - 'yearly_date'] ) - expected['yearly_date'] = expected['yearly_date'].astype('O') + expected["yearly_date"] = expected["yearly_date"].astype("O") with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -204,15 +204,11 @@ def test_read_dta2(self): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, - check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, - check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, - check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) - @pytest.mark.parametrize( - 'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117']) + @pytest.mark.parametrize("file", ["dta3_113", "dta3_114", "dta3_115", "dta3_117"]) def test_read_dta3(self, file): file = getattr(self, file) @@ -221,13 +217,12 @@ def test_read_dta3(self, file): # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int16) - expected['quarter'] = expected['quarter'].astype(np.int8) + expected["year"] = expected["year"].astype(np.int16) + expected["quarter"] = expected["quarter"].astype(np.int8) tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize( - 'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117']) + @pytest.mark.parametrize("file", ["dta4_113", "dta4_114", "dta4_115", "dta4_117"]) def test_read_dta4(self, file): file = getattr(self, file) @@ -244,14 +239,21 @@ def test_read_dta4(self, file): ["seven", "four", 7, np.nan, "seven"], ["eight", "three", 8, np.nan, "eight"], 
["nine", "two", 9, np.nan, "nine"], - ["ten", "one", "ten", np.nan, "ten"] + ["ten", "one", "ten", np.nan, "ten"], ], - columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', - 'labeled_with_missings', 'float_labelled']) + columns=[ + "fully_labeled", + "fully_labeled2", + "incompletely_labeled", + "labeled_with_missings", + "float_labelled", + ], + ) # these are all categoricals - expected = pd.concat([expected[col].astype('category') - for col in expected], axis=1) + expected = pd.concat( + [expected[col].astype("category") for col in expected], axis=1 + ) # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected, check_categorical=False) @@ -265,105 +267,122 @@ def test_read_dta12(self): [3, "cba", "qwertywertyqwerty"], [93, "", "strl"], ], - columns=['x', 'y', 'z']) + columns=["x", "y", "z"], + ) tm.assert_frame_equal(parsed_117, expected, check_dtype=False) def test_read_dta18(self): parsed_118 = self.read_dta(self.dta22_118) - parsed_118["Bytes"] = parsed_118["Bytes"].astype('O') + parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( - [['Cat', 'Bogota', 'Bogotá', 1, 1.0, 'option b Ünicode', 1.0], - ['Dog', 'Boston', 'Uzunköprü', np.nan, np.nan, np.nan, np.nan], - ['Plane', 'Rome', 'Tromsø', 0, 0.0, 'option a', 0.0], - ['Potato', 'Tokyo', 'Elâzığ', -4, 4.0, 4, 4], - ['', '', '', 0, 0.3332999, 'option a', 1 / 3.] - ], - columns=['Things', 'Cities', 'Unicode_Cities_Strl', - 'Ints', 'Floats', 'Bytes', 'Longs']) + [ + ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0], + ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan], + ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0], + ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4], + ["", "", "", 0, 0.3332999, "option a", 1 / 3.0], + ], + columns=[ + "Things", + "Cities", + "Unicode_Cities_Strl", + "Ints", + "Floats", + "Bytes", + "Longs", + ], + ) expected["Floats"] = expected["Floats"].astype(np.float32) for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) with StataReader(self.dta22_118) as rdr: vl = rdr.variable_labels() - vl_expected = {'Unicode_Cities_Strl': - 'Here are some strls with Ünicode chars', - 'Longs': 'long data', - 'Things': 'Here are some things', - 'Bytes': 'byte data', - 'Ints': 'int data', - 'Cities': 'Here are some cities', - 'Floats': 'float data'} + vl_expected = { + "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", + "Longs": "long data", + "Things": "Here are some things", + "Bytes": "byte data", + "Ints": "int data", + "Cities": "Here are some cities", + "Floats": "float data", + } tm.assert_dict_equal(vl, vl_expected) - assert rdr.data_label == 'This is a Ünicode data label' + assert rdr.data_label == "This is a Ünicode data label" def test_read_write_dta5(self): - original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], - columns=['float_miss', 'double_miss', 'byte_miss', - 'int_miss', 'long_miss']) - original.index.name = 'index' + original = DataFrame( + [(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"], + ) + original.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) def test_write_dta6(self): original = self.read_csv(self.csv3) - original.index.name = 'index' + 
original.index.name = "index" original.index = original.index.astype(np.int32) - original['year'] = original['year'].astype(np.int32) - original['quarter'] = original['quarter'].astype(np.int32) + original["year"] = original["year"].astype(np.int32) + original["quarter"] = original["quarter"].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original, check_index_type=False) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_write_dta10(self, version): - original = DataFrame(data=[["string", "object", 1, 1.1, - np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', - 'floating', 'datetime']) + original = DataFrame( + data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], + columns=["string", "object", "integer", "floating", "datetime"], + ) original["object"] = Series(original["object"], dtype=object) - original.index.name = 'index' + original.index.name = "index" original.index = original.index.astype(np.int32) - original['integer'] = original['integer'].astype(np.int32) + original["integer"] = original["integer"].astype(np.int32) with tm.ensure_clean() as path: - original.to_stata(path, {'datetime': 'tc'}, version=version) + original.to_stata(path, {"datetime": "tc"}, version=version) written_and_read_again = self.read_dta(path) # original.index is np.int32, read index is np.int64 - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original, check_index_type=False) + tm.assert_frame_equal( + written_and_read_again.set_index("index"), + original, + check_index_type=False, + ) def test_stata_doc_examples(self): with tm.ensure_clean() as path: - df = DataFrame(np.random.randn(10, 2), columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.to_stata(path) def test_write_preserves_original(self): # 9795 np.random.seed(423) - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) - df.loc[2, 'a':'c'] = np.nan + df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) + df.loc[2, "a":"c"] = np.nan df_copy = df.copy() with tm.ensure_clean() as path: df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_encoding(self, version): # GH 4626, proper encoding handling raw = read_stata(self.dta_encoding) with tm.assert_produces_warning(FutureWarning): - encoded = read_stata(self.dta_encoding, encoding='latin-1') + encoded = read_stata(self.dta_encoding, encoding="latin-1") result = encoded.kreis1849[0] expected = raw.kreis1849[0] @@ -372,19 +391,27 @@ def test_encoding(self, version): with tm.ensure_clean() as path: with tm.assert_produces_warning(FutureWarning): - encoded.to_stata(path, write_index=False, version=version, - encoding='latin-1') + encoded.to_stata( + path, write_index=False, version=version, encoding="latin-1" + ) reread_encoded = read_stata(path) tm.assert_frame_equal(encoded, reread_encoded) def test_read_write_dta11(self): - original = DataFrame([(1, 2, 3, 4)], - columns=['good', 'b\u00E4d', '8number', - 'astringwithmorethan32characters______']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['good', 'b_d', '_8number', - 'astringwithmorethan32characters_']) - 
formatted.index.name = 'index' + original = DataFrame( + [(1, 2, 3, 4)], + columns=[ + "good", + "b\u00E4d", + "8number", + "astringwithmorethan32characters______", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4)], + columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"], + ) + formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: @@ -392,216 +419,217 @@ def test_read_write_dta11(self): original.to_stata(path, None) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_read_write_dta12(self, version): - original = DataFrame([(1, 2, 3, 4, 5, 6)], - columns=['astringwithmorethan32characters_1', - 'astringwithmorethan32characters_2', - '+', - '-', - 'short', - 'delete']) - formatted = DataFrame([(1, 2, 3, 4, 5, 6)], - columns=['astringwithmorethan32characters_', - '_0astringwithmorethan32character', - '_', - '_1_', - '_short', - '_delete']) - formatted.index.name = 'index' + original = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_1", + "astringwithmorethan32characters_2", + "+", + "-", + "short", + "delete", + ], + ) + formatted = DataFrame( + [(1, 2, 3, 4, 5, 6)], + columns=[ + "astringwithmorethan32characters_", + "_0astringwithmorethan32character", + "_", + "_1_", + "_short", + "_delete", + ], + ) + formatted.index.name = "index" formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always', InvalidColumnName) + warnings.simplefilter("always", InvalidColumnName) original.to_stata(path, None, version=version) # should get a warning for that format. 
assert len(w) == 1 written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) def test_read_write_dta13(self): s1 = Series(2 ** 9, dtype=np.int16) s2 = Series(2 ** 17, dtype=np.int32) s3 = Series(2 ** 33, dtype=np.int64) - original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3}) - original.index.name = 'index' + original = DataFrame({"int16": s1, "int32": s2, "int64": s3}) + original.index.name = "index" formatted = original - formatted['int64'] = formatted['int64'].astype(np.float64) + formatted["int64"] = formatted["int64"].astype(np.float64) with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - formatted) + tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) @pytest.mark.parametrize( - 'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117']) + "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] + ) def test_read_write_reread_dta14(self, file, parsed_114, version): file = getattr(self, file) parsed = self.read_dta(file) - parsed.index.name = 'index' + parsed.index.name = "index" expected = self.read_csv(self.csv14) - cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + cols = ["byte_", "int_", "long_", "float_", "double_"] for col in cols: expected[col] = expected[col]._convert(datetime=True, numeric=True) - expected['float_'] = expected['float_'].astype(np.float32) - expected['date_td'] = pd.to_datetime( - expected['date_td'], errors='coerce') + expected["float_"] = expected["float_"].astype(np.float32) + expected["date_td"] = pd.to_datetime(expected["date_td"], errors="coerce") tm.assert_frame_equal(parsed_114, parsed) with tm.ensure_clean() as path: - parsed_114.to_stata(path, {'date_td': 'td'}, version=version) + parsed_114.to_stata(path, {"date_td": "td"}, version=version) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), parsed_114) + tm.assert_frame_equal(written_and_read_again.set_index("index"), parsed_114) @pytest.mark.parametrize( - 'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117']) + "file", ["dta15_113", "dta15_114", "dta15_115", "dta15_117"] + ) def test_read_write_reread_dta15(self, file): expected = self.read_csv(self.csv15) - expected['byte_'] = expected['byte_'].astype(np.int8) - expected['int_'] = expected['int_'].astype(np.int16) - expected['long_'] = expected['long_'].astype(np.int32) - expected['float_'] = expected['float_'].astype(np.float32) - expected['double_'] = expected['double_'].astype(np.float64) - expected['date_td'] = expected['date_td'].apply( - datetime.strptime, args=('%Y-%m-%d',)) + expected["byte_"] = expected["byte_"].astype(np.int8) + expected["int_"] = expected["int_"].astype(np.int16) + expected["long_"] = expected["long_"].astype(np.int32) + expected["float_"] = expected["float_"].astype(np.float32) + expected["double_"] = expected["double_"].astype(np.float64) + expected["date_td"] = expected["date_td"].apply( + datetime.strptime, args=("%Y-%m-%d",) + ) file = getattr(self, file) parsed = self.read_dta(file) tm.assert_frame_equal(expected, parsed) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def 
test_timestamp_and_label(self, version): - original = DataFrame([(1,)], columns=['variable']) + original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) - data_label = 'This is a data file.' + data_label = "This is a data file." with tm.ensure_clean() as path: - original.to_stata(path, time_stamp=time_stamp, - data_label=data_label, - version=version) + original.to_stata( + path, time_stamp=time_stamp, data_label=data_label, version=version + ) with StataReader(path) as reader: - assert reader.time_stamp == '29 Feb 2000 14:21' + assert reader.time_stamp == "29 Feb 2000 14:21" assert reader.data_label == data_label - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_invalid_timestamp(self, version): - original = DataFrame([(1,)], columns=['variable']) - time_stamp = '01 Jan 2000, 00:00:00' + original = DataFrame([(1,)], columns=["variable"]) + time_stamp = "01 Jan 2000, 00:00:00" with tm.ensure_clean() as path: msg = "time_stamp should be datetime type" with pytest.raises(ValueError, match=msg): - original.to_stata(path, time_stamp=time_stamp, - version=version) + original.to_stata(path, time_stamp=time_stamp, version=version) def test_numeric_column_names(self): original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) - original.index.name = 'index' + original.index.name = "index" with tm.ensure_clean() as path: # should get a warning for that format. with tm.assert_produces_warning(InvalidColumnName): original.to_stata(path) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") columns = list(written_and_read_again.columns) convert_col_name = lambda x: int(x[1]) written_and_read_again.columns = map(convert_col_name, columns) tm.assert_frame_equal(original, written_and_read_again) - @pytest.mark.parametrize('version', [114, 117]) + @pytest.mark.parametrize("version", [114, 117]) def test_nan_to_missing_value(self, version): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), dtype=np.float64) s1[::2] = np.nan s2[1::2] = np.nan - original = DataFrame({'s1': s1, 's2': s2}) - original.index.name = 'index' + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, version=version) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") tm.assert_frame_equal(written_and_read_again, original) def test_no_index(self): - columns = ['x', 'y'] - original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), - columns=columns) - original.index.name = 'index_not_written' + columns = ["x", "y"] + original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns) + original.index.name = "index_not_written" with tm.ensure_clean() as path: original.to_stata(path, write_index=False) written_and_read_again = self.read_dta(path) with pytest.raises(KeyError, match=original.index.name): - written_and_read_again['index_not_written'] + written_and_read_again["index_not_written"] def test_string_no_dates(self): - s1 = Series(['a', 'A longer string']) + s1 = Series(["a", "A longer string"]) s2 = Series([1.0, 2.0], dtype=np.float64) - original = DataFrame({'s1': s1, 's2': s2}) - original.index.name = 'index' + original = DataFrame({"s1": s1, "s2": s2}) + original.index.name 
= "index" with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - original) + tm.assert_frame_equal(written_and_read_again.set_index("index"), original) def test_large_value_conversion(self): s0 = Series([1, 99], dtype=np.int8) s1 = Series([1, 127], dtype=np.int8) s2 = Series([1, 2 ** 15 - 1], dtype=np.int16) s3 = Series([1, 2 ** 63 - 1], dtype=np.int64) - original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3}) - original.index.name = 'index' + original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3}) + original.index.name = "index" with tm.ensure_clean() as path: with tm.assert_produces_warning(PossiblePrecisionLoss): original.to_stata(path) written_and_read_again = self.read_dta(path) modified = original.copy() - modified['s1'] = Series(modified['s1'], dtype=np.int16) - modified['s2'] = Series(modified['s2'], dtype=np.int32) - modified['s3'] = Series(modified['s3'], dtype=np.float64) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - modified) + modified["s1"] = Series(modified["s1"], dtype=np.int16) + modified["s2"] = Series(modified["s2"], dtype=np.int32) + modified["s3"] = Series(modified["s3"], dtype=np.float64) + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) def test_dates_invalid_column(self): original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)]) - original.index.name = 'index' + original.index.name = "index" with tm.ensure_clean() as path: with tm.assert_produces_warning(InvalidColumnName): - original.to_stata(path, {0: 'tc'}) + original.to_stata(path, {0: "tc"}) written_and_read_again = self.read_dta(path) modified = original.copy() - modified.columns = ['_0'] - tm.assert_frame_equal(written_and_read_again.set_index('index'), - modified) + modified.columns = ["_0"] + tm.assert_frame_equal(written_and_read_again.set_index("index"), modified) def test_105(self): # Data obtained from: # http://go.worldbank.org/ZXY29PVJ21 - dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") df = pd.read_stata(dpath) df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]] df0 = pd.DataFrame(df0) df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"] - df0['clustnum'] = df0["clustnum"].astype(np.int16) - df0['pri_schl'] = df0["pri_schl"].astype(np.int8) - df0['psch_num'] = df0["psch_num"].astype(np.int8) - df0['psch_dis'] = df0["psch_dis"].astype(np.float32) + df0["clustnum"] = df0["clustnum"].astype(np.int16) + df0["pri_schl"] = df0["pri_schl"].astype(np.int8) + df0["psch_num"] = df0["psch_num"].astype(np.int8) + df0["psch_dis"] = df0["psch_dis"].astype(np.float32) tm.assert_frame_equal(df.head(3), df0) def test_value_labels_old_format(self): @@ -609,45 +637,45 @@ def test_value_labels_old_format(self): # # Test that value_labels() returns an empty dict if the file format # predates supporting value labels. 
- dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta') + dpath = os.path.join(self.dirpath, "S4_EDUC1.dta") reader = StataReader(dpath) assert reader.value_labels() == {} reader.close() def test_date_export_formats(self): - columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty'] + columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"] conversions = {c: c for c in columns} data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns) original = DataFrame([data], columns=columns) - original.index.name = 'index' - expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time - datetime(2006, 11, 20), # Day - datetime(2006, 11, 19), # Week - datetime(2006, 11, 1), # Month - datetime(2006, 10, 1), # Quarter year - datetime(2006, 7, 1), # Half year - datetime(2006, 1, 1)] # Year + original.index.name = "index" + expected_values = [ + datetime(2006, 11, 20, 23, 13, 20), # Time + datetime(2006, 11, 20), # Day + datetime(2006, 11, 19), # Week + datetime(2006, 11, 1), # Month + datetime(2006, 10, 1), # Quarter year + datetime(2006, 7, 1), # Half year + datetime(2006, 1, 1), + ] # Year expected = DataFrame([expected_values], columns=columns) - expected.index.name = 'index' + expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path, conversions) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - expected) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) def test_write_missing_strings(self): original = DataFrame([["1"], [None]], columns=["foo"]) expected = DataFrame([["1"], [""]], columns=["foo"]) - expected.index.name = 'index' + expected.index.name = "index" with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), - expected) + tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - @pytest.mark.parametrize('version', [114, 117]) - @pytest.mark.parametrize('byteorder', ['>', '<']) + @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) s1 = Series([0, 1, 100], dtype=np.uint8) @@ -657,19 +685,27 @@ def test_bool_uint(self, byteorder, version): s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32) s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32) - original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3, - 's4': s4, 's5': s5, 's6': s6}) - original.index.name = 'index' + original = DataFrame( + {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6} + ) + original.index.name = "index" expected = original.copy() - expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32, - np.int32, np.float64) + expected_types = ( + np.int8, + np.int8, + np.int16, + np.int16, + np.int32, + np.int32, + np.float64, + ) for c, t in zip(expected.columns, expected_types): expected[c] = expected[c].astype(t) with tm.ensure_clean() as path: original.to_stata(path, byteorder=byteorder, version=version) written_and_read_again = self.read_dta(path) - written_and_read_again = written_and_read_again.set_index('index') + written_and_read_again = written_and_read_again.set_index("index") tm.assert_frame_equal(written_and_read_again, expected) def test_variable_labels(self): @@ -677,8 +713,8 @@ def test_variable_labels(self): sr_115 = rdr.variable_labels() with StataReader(self.dta16_117) as rdr: sr_117 = rdr.variable_labels() - keys = 
('var1', 'var2', 'var3') - labels = ('label1', 'label2', 'label3') + keys = ("var1", "var2", "var3") + labels = ("label1", "label2", "label3") for k, v in sr_115.items(): assert k in sr_117 assert v == sr_117[k] @@ -689,8 +725,9 @@ def test_minimal_size_col(self): str_lens = (1, 100, 244) s = {} for str_len in str_lens: - s['s' + str(str_len)] = Series(['a' * str_len, - 'b' * str_len, 'c' * str_len]) + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) original = DataFrame(s) with tm.ensure_clean() as path: original.to_stata(path, write_index=False) @@ -707,26 +744,29 @@ def test_excessively_long_string(self): str_lens = (1, 244, 500) s = {} for str_len in str_lens: - s['s' + str(str_len)] = Series(['a' * str_len, - 'b' * str_len, 'c' * str_len]) + s["s" + str(str_len)] = Series( + ["a" * str_len, "b" * str_len, "c" * str_len] + ) original = DataFrame(s) - msg = (r"Fixed width strings in Stata \.dta files are limited to 244" - r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" - r" this restriction\. Use the\n'version=117' parameter to write" - r" the newer \(Stata 13 and later\) format\.") + msg = ( + r"Fixed width strings in Stata \.dta files are limited to 244" + r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" + r" this restriction\. Use the\n'version=117' parameter to write" + r" the newer \(Stata 13 and later\) format\." + ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: original.to_stata(path) def test_missing_value_generator(self): - types = ('b', 'h', 'l') - df = DataFrame([[0.0]], columns=['float_']) + types = ("b", "h", "l") + df = DataFrame([[0.0]], columns=["float_"]) with tm.ensure_clean() as path: df.to_stata(path) with StataReader(path) as rdr: valid_range = rdr.VALID_RANGE - expected_values = ['.' + chr(97 + i) for i in range(26)] - expected_values.insert(0, '.') + expected_values = ["." + chr(97 + i) for i in range(26)] + expected_values.insert(0, ".") for t in types: offset = valid_range[t][1] for i in range(0, 27): @@ -734,23 +774,24 @@ def test_missing_value_generator(self): assert val.string == expected_values[i] # Test extremes for floats - val = StataMissingValue(struct.unpack('= 1.5 """ - return [v[field] for v in rcParams['axes.prop_cycle']] + return [v[field] for v in rcParams["axes.prop_cycle"]] -def _check_plot_works(f, filterwarnings='always', **kwargs): +def _check_plot_works(f, filterwarnings="always", **kwargs): import matplotlib.pyplot as plt + ret = None with warnings.catch_warnings(): warnings.simplefilter(filterwarnings) try: try: - fig = kwargs['figure'] + fig = kwargs["figure"] except KeyError: fig = plt.gcf() plt.clf() - ax = kwargs.get('ax', fig.add_subplot(211)) # noqa + ax = kwargs.get("ax", fig.add_subplot(211)) # noqa ret = f(**kwargs) assert_is_valid_plot_return_object(ret) try: - kwargs['ax'] = fig.add_subplot(212) + kwargs["ax"] = fig.add_subplot(212) ret = f(**kwargs) except Exception: pass diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index 65e1d690d5f8f..51f2abb6cc2f4 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -4,30 +4,35 @@ def test_matplotlib_backend_error(): - msg = ('matplotlib is required for plotting when the default backend ' - '"matplotlib" is selected.') + msg = ( + "matplotlib is required for plotting when the default backend " + '"matplotlib" is selected.' 
+ ) try: import matplotlib # noqa except ImportError: with pytest.raises(ImportError, match=msg): - pandas.set_option('plotting.backend', 'matplotlib') + pandas.set_option("plotting.backend", "matplotlib") def test_backend_is_not_module(): - msg = ('"not_an_existing_module" does not seem to be an installed module. ' - 'A pandas plotting backend must be a module that can be imported') + msg = ( + '"not_an_existing_module" does not seem to be an installed module. ' + "A pandas plotting backend must be a module that can be imported" + ) with pytest.raises(ValueError, match=msg): - pandas.set_option('plotting.backend', 'not_an_existing_module') + pandas.set_option("plotting.backend", "not_an_existing_module") def test_backend_is_correct(monkeypatch): - monkeypatch.setattr('pandas.core.config_init.importlib.import_module', - lambda name: None) - pandas.set_option('plotting.backend', 'correct_backend') - assert pandas.get_option('plotting.backend') == 'correct_backend' + monkeypatch.setattr( + "pandas.core.config_init.importlib.import_module", lambda name: None + ) + pandas.set_option("plotting.backend", "correct_backend") + assert pandas.get_option("plotting.backend") == "correct_backend" # Restore backend for other tests (matplotlib can be not installed) try: - pandas.set_option('plotting.backend', 'matplotlib') + pandas.set_option("plotting.backend", "matplotlib") except ImportError: pass diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index de1ac0c293189..cab0efe53f1fc 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -20,64 +20,63 @@ @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @pytest.mark.slow def test_boxplot_legacy1(self): - df = DataFrame(np.random.randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) - df['indic'] = ['foo', 'bar'] * 3 - df['indic2'] = ['foo', 'bar', 'foo'] * 2 - - _check_plot_works(df.boxplot, return_type='dict') - _check_plot_works(df.boxplot, column=[ - 'one', 'two'], return_type='dict') + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) + df["indic"] = ["foo", "bar"] * 3 + df["indic2"] = ["foo", "bar", "foo"] * 2 + + _check_plot_works(df.boxplot, return_type="dict") + _check_plot_works(df.boxplot, column=["one", "two"], return_type="dict") # _check_plot_works adds an ax so catch warning. 
see GH #13188 with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, column=['one', 'two'], - by='indic') - _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + _check_plot_works(df.boxplot, column=["one", "two"], by="indic") + _check_plot_works(df.boxplot, column="one", by=["indic", "indic2"]) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic') + _check_plot_works(df.boxplot, by="indic") with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by=['indic', 'indic2']) - _check_plot_works(plotting._core.boxplot, data=df['one'], - return_type='dict') - _check_plot_works(df.boxplot, notch=1, return_type='dict') + _check_plot_works(df.boxplot, by=["indic", "indic2"]) + _check_plot_works(plotting._core.boxplot, data=df["one"], return_type="dict") + _check_plot_works(df.boxplot, notch=1, return_type="dict") with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='indic', notch=1) + _check_plot_works(df.boxplot, by="indic", notch=1) @pytest.mark.slow def test_boxplot_legacy2(self): - df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = Series(['A'] * 10) + df = DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = Series(["A"] * 10) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.boxplot, by='X') + _check_plot_works(df.boxplot, by="X") # When ax is supplied and required number of axes is 1, # passed ax should be used: fig, ax = self.plt.subplots() - axes = df.boxplot('Col1', by='X', ax=ax) + axes = df.boxplot("Col1", by="X", ax=ax) ax_axes = ax.axes assert ax_axes is axes fig, ax = self.plt.subplots() - axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') + axes = df.groupby("Y").boxplot(ax=ax, return_type="axes") ax_axes = ax.axes - assert ax_axes is axes['A'] + assert ax_axes is axes["A"] # Multiple columns with an ax argument should use same figure fig, ax = self.plt.subplots() with tm.assert_produces_warning(UserWarning): - axes = df.boxplot(column=['Col1', 'Col2'], - by='X', ax=ax, return_type='axes') - assert axes['Col1'].get_figure() is fig + axes = df.boxplot( + column=["Col1", "Col2"], by="X", ax=ax, return_type="axes" + ) + assert axes["Col1"].get_figure() is fig # When by is None, check that all relevant lines are present in the # dict fig, ax = self.plt.subplots() - d = df.boxplot(ax=ax, return_type='dict') + d = df.boxplot(ax=ax, return_type="dict") lines = list(itertools.chain.from_iterable(d.values())) assert len(ax.get_lines()) == len(lines) @@ -92,51 +91,52 @@ def test_boxplot_return_type_legacy(self): # API change in https://github.com/pandas-dev/pandas/pull/7096 import matplotlib as mpl # noqa - df = DataFrame(np.random.randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) with pytest.raises(ValueError): - df.boxplot(return_type='NOTATYPE') + df.boxplot(return_type="NOTATYPE") result = df.boxplot() - self._check_box_return_type(result, 'axes') + self._check_box_return_type(result, "axes") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='dict') - self._check_box_return_type(result, 'dict') + result = df.boxplot(return_type="dict") + 
self._check_box_return_type(result, "dict") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='axes') - self._check_box_return_type(result, 'axes') + result = df.boxplot(return_type="axes") + self._check_box_return_type(result, "axes") with tm.assert_produces_warning(False): - result = df.boxplot(return_type='both') - self._check_box_return_type(result, 'both') + result = df.boxplot(return_type="both") + self._check_box_return_type(result, "both") @pytest.mark.slow def test_boxplot_axis_limits(self): - def _check_ax_limits(col, ax): y_min, y_max = ax.get_ylim() assert y_min <= col.min() assert y_max >= col.max() df = self.hist_df.copy() - df['age'] = np.random.randint(1, 20, df.shape[0]) + df["age"] = np.random.randint(1, 20, df.shape[0]) # One full row - height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) + height_ax, weight_ax = df.boxplot(["height", "weight"], by="category") + _check_ax_limits(df["height"], height_ax) + _check_ax_limits(df["weight"], weight_ax) assert weight_ax._sharey == height_ax # Two rows, one partial - p = df.boxplot(['height', 'weight', 'age'], by='category') + p = df.boxplot(["height", "weight", "age"], by="category") height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] dummy_ax = p[1, 1] - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - _check_ax_limits(df['age'], age_ax) + _check_ax_limits(df["height"], height_ax) + _check_ax_limits(df["weight"], weight_ax) + _check_ax_limits(df["age"], age_ax) assert weight_ax._sharey == height_ax assert age_ax._sharey == height_ax assert dummy_ax._sharey is None @@ -145,60 +145,54 @@ def _check_ax_limits(col, ax): def test_boxplot_empty_column(self): df = DataFrame(np.random.randn(20, 4)) df.loc[:, 0] = np.nan - _check_plot_works(df.boxplot, return_type='axes') + _check_plot_works(df.boxplot, return_type="axes") @pytest.mark.slow def test_figsize(self): - df = DataFrame(np.random.rand(10, 5), - columns=['A', 'B', 'C', 'D', 'E']) - result = df.boxplot(return_type='axes', figsize=(12, 8)) + df = DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) + result = df.boxplot(return_type="axes", figsize=(12, 8)) assert result.figure.bbox_inches.width == 12 assert result.figure.bbox_inches.height == 8 def test_fontsize(self): df = DataFrame({"a": [1, 2, 3, 4, 5, 6]}) - self._check_ticks_props(df.boxplot("a", fontsize=16), - xlabelsize=16, ylabelsize=16) + self._check_ticks_props( + df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16 + ) @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - @pytest.mark.slow def test_boxplot_legacy1(self): - grouped = self.hist_df.groupby(by='gender') + grouped = self.hist_df.groupby(by="gender") with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow def test_boxplot_legacy2(self): tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), 
index=MultiIndex.from_tuples(tuples)) grouped = df.groupby(level=1) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow def test_boxplot_legacy3(self): tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) grouped = df.unstack(level=1).groupby(level=0, axis=1) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(grouped.boxplot, return_type='axes') + axes = _check_plot_works(grouped.boxplot, return_type="axes") self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2)) - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') + axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes") self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow @@ -207,22 +201,22 @@ def test_grouped_plot_fignums(self): weight = Series(np.random.normal(166, 20, size=n)) height = Series(np.random.normal(60, 10, size=n)) with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) - gb = df.groupby('gender') + gender = np.random.choice(["male", "female"], size=n) + df = DataFrame({"height": height, "weight": weight, "gender": gender}) + gb = df.groupby("gender") res = gb.plot() assert len(self.plt.get_fignums()) == 2 assert len(res) == 2 tm.close() - res = gb.boxplot(return_type='axes') + res = gb.boxplot(return_type="axes") assert len(self.plt.get_fignums()) == 1 assert len(res) == 2 tm.close() # now works with GH 5610 as gender is excluded - res = df.groupby('gender').hist() + res = df.groupby("gender").hist() tm.close() @pytest.mark.slow @@ -230,36 +224,34 @@ def test_grouped_box_return_type(self): df = self.hist_df # old style: return_type=None - result = df.boxplot(by='gender') + result = df.boxplot(by="gender") assert isinstance(result, np.ndarray) self._check_box_return_type( - result, None, - expected_keys=['height', 'weight', 'category']) + result, None, expected_keys=["height", "weight", "category"] + ) # now for groupby - result = df.groupby('gender').boxplot(return_type='dict') - self._check_box_return_type( - result, 'dict', expected_keys=['Male', 'Female']) + result = df.groupby("gender").boxplot(return_type="dict") + self._check_box_return_type(result, "dict", expected_keys=["Male", "Female"]) - columns2 = 'X B C D A G Y N Q O'.split() + columns2 = "X B C D A G Y N Q O".split() df2 = DataFrame(random.randn(50, 10), columns=columns2) - categories2 = 'A B C D E F G H I J'.split() - df2['category'] = categories2 * 5 + categories2 = "A B C D E F G H I J".split() + df2["category"] = categories2 * 5 - for t in ['dict', 'axes', 'both']: - returned = df.groupby('classroom').boxplot(return_type=t) - self._check_box_return_type( - returned, t, expected_keys=['A', 'B', 'C']) + for t in ["dict", "axes", "both"]: + returned = df.groupby("classroom").boxplot(return_type=t) + self._check_box_return_type(returned, t, expected_keys=["A", "B", "C"]) - returned = 
df.boxplot(by='classroom', return_type=t) + returned = df.boxplot(by="classroom", return_type=t) self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category']) + returned, t, expected_keys=["height", "weight", "category"] + ) - returned = df2.groupby('category').boxplot(return_type=t) + returned = df2.groupby("category").boxplot(return_type=t) self._check_box_return_type(returned, t, expected_keys=categories2) - returned = df2.boxplot(by='category', return_type=t) + returned = df2.boxplot(by="category", return_type=t) self._check_box_return_type(returned, t, expected_keys=columns2) @pytest.mark.slow @@ -268,79 +260,92 @@ def test_grouped_box_layout(self): msg = "Layout of 1x1 must be larger than required size 2" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['weight', 'height'], by=df.gender, - layout=(1, 1)) + df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1)) msg = "The 'layout' keyword is not supported when 'by' is None" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['height', 'weight', 'category'], - layout=(2, 1), return_type='dict') + df.boxplot( + column=["height", "weight", "category"], + layout=(2, 1), + return_type="dict", + ) msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - df.boxplot(column=['weight', 'height'], by=df.gender, - layout=(-1, -1)) + df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1)) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('gender').boxplot, - column='height', return_type='dict') + box = _check_plot_works( + df.groupby("gender").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) # GH 6769 with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('classroom').boxplot, - column='height', return_type='dict') + box = _check_plot_works( + df.groupby("classroom").boxplot, column="height", return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) # GH 5897 - axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', - return_type='axes') + axes = df.boxplot( + column=["height", "weight", "category"], by="gender", return_type="axes" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - for ax in [axes['height']]: + for ax in [axes["height"]]: self._check_visible(ax.get_xticklabels(), visible=False) self._check_visible([ax.xaxis.get_label()], visible=False) - for ax in [axes['weight'], axes['category']]: + for ax in [axes["weight"], axes["category"]]: self._check_visible(ax.get_xticklabels()) self._check_visible([ax.xaxis.get_label()]) - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], return_type='dict') + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - 
layout=(3, 2), return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, 2), + return_type="dict", + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, -1), return_type='dict') + box = _check_plot_works( + df.groupby("category").boxplot, + column="height", + layout=(3, -1), + return_type="dict", + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(4, 1)) + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(4, 1) + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(-1, 1)) + box = df.boxplot( + column=["height", "weight", "category"], by="gender", layout=(-1, 1) + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], layout=(1, 4), - return_type='dict') + box = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], layout=(1, 4), return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) - box = df.groupby('classroom').boxplot( # noqa - column=['height', 'weight', 'category'], layout=(1, -1), - return_type='dict') + box = df.groupby("classroom").boxplot( # noqa + column=["height", "weight", "category"], layout=(1, -1), return_type="dict" + ) self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) @pytest.mark.slow @@ -355,15 +360,17 @@ def test_grouped_box_multiple_axes(self): # which has earlier alphabetical order with tm.assert_produces_warning(UserWarning): fig, axes = self.plt.subplots(2, 2) - df.groupby('category').boxplot( - column='height', return_type='axes', ax=axes) - self._check_axes_shape(self.plt.gcf().axes, - axes_num=4, layout=(2, 2)) + df.groupby("category").boxplot(column="height", return_type="axes", ax=axes) + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) fig, axes = self.plt.subplots(2, 3) with tm.assert_produces_warning(UserWarning): - returned = df.boxplot(column=['height', 'weight', 'category'], - by='gender', return_type='axes', ax=axes[0]) + returned = df.boxplot( + column=["height", "weight", "category"], + by="gender", + return_type="axes", + ax=axes[0], + ) returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[0]) @@ -371,9 +378,9 @@ def test_grouped_box_multiple_axes(self): # draw on second row with tm.assert_produces_warning(UserWarning): - returned = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], - return_type='axes', ax=axes[1]) + returned = df.groupby("classroom").boxplot( + column=["height", "weight", "category"], return_type="axes", ax=axes[1] + ) returned = np.array(list(returned.values)) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[1]) @@ -383,9 +390,10 @@ def test_grouped_box_multiple_axes(self): fig, axes = self.plt.subplots(2, 3) # pass different number of axes from required with tm.assert_produces_warning(UserWarning): - axes = df.groupby('classroom').boxplot(ax=axes) + axes = df.groupby("classroom").boxplot(ax=axes) def 
test_fontsize(self): df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}) - self._check_ticks_props(df.boxplot("a", by="b", fontsize=16), - xlabelsize=16, ylabelsize=16) + self._check_ticks_props( + df.boxplot("a", by="b", fontsize=16), xlabelsize=16, ylabelsize=16 + ) diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 92d207e46b7ab..35d12706f0590 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -13,7 +13,9 @@ import pandas.util.testing as tm from pandas.plotting import ( - deregister_matplotlib_converters, register_matplotlib_converters) + deregister_matplotlib_converters, + register_matplotlib_converters, +) from pandas.tseries.offsets import Day, Micro, Milli, Second try: @@ -23,7 +25,7 @@ # causing an improprer skip pass -pytest.importorskip('matplotlib.pyplot') +pytest.importorskip("matplotlib.pyplot") def test_initial_warning(): @@ -33,35 +35,35 @@ def test_initial_warning(): "fig, ax = plt.subplots(); " "ax.plot(s.index, s.values)" ) - call = [sys.executable, '-c', code] + call = [sys.executable, "-c", code] out = subprocess.check_output(call, stderr=subprocess.STDOUT).decode() - assert 'Using an implicitly' in out + assert "Using an implicitly" in out def test_timtetonum_accepts_unicode(): - assert (converter.time2num("00:01") == converter.time2num("00:01")) + assert converter.time2num("00:01") == converter.time2num("00:01") class TestRegistration: - def test_register_by_default(self): # Run in subprocess to ensure a clean state - code = ("'import matplotlib.units; " - "import pandas as pd; " - "units = dict(matplotlib.units.registry); " - "assert pd.Timestamp in units)'") - call = [sys.executable, '-c', code] + code = ( + "'import matplotlib.units; " + "import pandas as pd; " + "units = dict(matplotlib.units.registry); " + "assert pd.Timestamp in units)'" + ) + call = [sys.executable, "-c", code] assert subprocess.check_call(call) == 0 def test_warns(self): plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warning" state, in case this isn't the first test run converter._WARN = True - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: ax.plot(s.index, s.values) plt.close() @@ -70,7 +72,7 @@ def test_warns(self): def test_registering_no_warning(self): plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run @@ -83,7 +85,7 @@ def test_registering_no_warning(self): def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run converter._WARN = True with tm.assert_produces_warning(None) as w: @@ -95,8 +97,7 @@ def test_matplotlib_formatters(self): units = pytest.importorskip("matplotlib.units") assert Timestamp in units.registry - ctx = cf.option_context("plotting.matplotlib.register_converters", - False) + ctx = cf.option_context("plotting.matplotlib.register_converters", False) with ctx: assert Timestamp not 
in units.registry @@ -104,10 +105,9 @@ def test_matplotlib_formatters(self): def test_option_no_warning(self): pytest.importorskip("matplotlib.pyplot") - ctx = cf.option_context("plotting.matplotlib.register_converters", - False) + ctx = cf.option_context("plotting.matplotlib.register_converters", False) plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range('2017', periods=12)) + s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() converter._WARN = True @@ -155,15 +155,14 @@ def test_registry_resets(self): def test_old_import_warns(self): with tm.assert_produces_warning(FutureWarning) as w: from pandas.tseries import converter + converter.register() assert len(w) - assert ('pandas.plotting.register_matplotlib_converters' in - str(w[0].message)) + assert "pandas.plotting.register_matplotlib_converters" in str(w[0].message) class TestDateTimeConverter: - def setup_method(self, method): self.dtc = converter.DatetimeConverter() self.tc = converter.TimeFormatter(None) @@ -171,14 +170,14 @@ def setup_method(self, method): def test_convert_accepts_unicode(self): r1 = self.dtc.convert("12:22", None, None) r2 = self.dtc.convert("12:22", None, None) - assert (r1 == r2), "DatetimeConverter.convert should accept unicode" + assert r1 == r2, "DatetimeConverter.convert should accept unicode" def test_conversion(self): - rs = self.dtc.convert(['2012-1-1'], None, None)[0] + rs = self.dtc.convert(["2012-1-1"], None, None)[0] xp = datetime(2012, 1, 1).toordinal() assert rs == xp - rs = self.dtc.convert('2012-1-1', None, None) + rs = self.dtc.convert("2012-1-1", None, None) assert rs == xp rs = self.dtc.convert(date(2012, 1, 1), None, None) @@ -187,31 +186,36 @@ def test_conversion(self): rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None) assert rs == xp - rs = self.dtc.convert('2012-1-1', None, None) + rs = self.dtc.convert("2012-1-1", None, None) assert rs == xp - rs = self.dtc.convert(Timestamp('2012-1-1'), None, None) + rs = self.dtc.convert(Timestamp("2012-1-1"), None, None) assert rs == xp # also testing datetime64 dtype (GH8614) - rs = self.dtc.convert(np_datetime64_compat('2012-01-01'), None, None) + rs = self.dtc.convert(np_datetime64_compat("2012-01-01"), None, None) assert rs == xp - rs = self.dtc.convert(np_datetime64_compat( - '2012-01-01 00:00:00+0000'), None, None) + rs = self.dtc.convert( + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, None + ) assert rs == xp - rs = self.dtc.convert(np.array([ - np_datetime64_compat('2012-01-01 00:00:00+0000'), - np_datetime64_compat('2012-01-02 00:00:00+0000')]), None, None) + rs = self.dtc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + None, + ) assert rs[0] == xp # we have a tz-aware date (constructed to that when we turn to utc it # is the same as our sample) - ts = (Timestamp('2012-01-01') - .tz_localize('UTC') - .tz_convert('US/Eastern') - ) + ts = Timestamp("2012-01-01").tz_localize("UTC").tz_convert("US/Eastern") rs = self.dtc.convert(ts, None, None) assert rs == xp @@ -221,20 +225,19 @@ def test_conversion(self): rs = self.dtc.convert(Index([ts - Day(1), ts]), None, None) assert rs[1] == xp - rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), - None, None) + rs = self.dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), None, None) assert rs[1] == xp def test_conversion_float(self): decimals = 9 - rs = self.dtc.convert( - Timestamp('2012-1-1 
01:02:03', tz='UTC'), None, None) - xp = converter.dates.date2num(Timestamp('2012-1-1 01:02:03', tz='UTC')) + rs = self.dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) + xp = converter.dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert( - Timestamp('2012-1-1 09:02:03', tz='Asia/Hong_Kong'), None, None) + Timestamp("2012-1-1 09:02:03", tz="Asia/Hong_Kong"), None, None + ) tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) @@ -258,13 +261,16 @@ def test_conversion_outofbounds_datetime(self): xp = converter.dates.date2num(values[0]) assert rs == xp - @pytest.mark.parametrize('time,format_expected', [ - (0, '00:00'), # time2num(datetime.time.min) - (86399.999999, '23:59:59.999999'), # time2num(datetime.time.max) - (90000, '01:00'), - (3723, '01:02:03'), - (39723.2, '11:02:03.200') - ]) + @pytest.mark.parametrize( + "time,format_expected", + [ + (0, "00:00"), # time2num(datetime.time.min) + (86399.999999, "23:59:59.999999"), # time2num(datetime.time.max) + (90000, "01:00"), + (3723, "01:02:03"), + (39723.2, "11:02:03.200"), + ], + ) def test_time_formatter(self, time, format_expected): # issue 18478 result = self.tc(time) @@ -273,7 +279,7 @@ def test_time_formatter(self, time, format_expected): def test_dateindex_conversion(self): decimals = 9 - for freq in ('B', 'L', 'S'): + for freq in ("B", "L", "S"): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) xp = converter.dates.date2num(dateindex._mpl_repr()) @@ -284,18 +290,17 @@ def _assert_less(ts1, ts2): val1 = self.dtc.convert(ts1, None, None) val2 = self.dtc.convert(ts2, None, None) if not val1 < val2: - raise AssertionError('{0} is not less than {1}.'.format(val1, - val2)) + raise AssertionError("{0} is not less than {1}.".format(val1, val2)) # Matplotlib's time representation using floats cannot distinguish # intervals smaller than ~10 microsecond in the common range of years. 
- ts = Timestamp('2012-1-1') + ts = Timestamp("2012-1-1") _assert_less(ts, ts + Second()) _assert_less(ts, ts + Milli()) _assert_less(ts, ts + Micro(50)) def test_convert_nested(self): - inner = [Timestamp('2017-01-01'), Timestamp('2017-01-02')] + inner = [Timestamp("2017-01-01"), Timestamp("2017-01-02")] data = [inner, inner] result = self.dtc.convert(data, None, None) expected = [self.dtc.convert(x, None, None) for x in data] @@ -303,7 +308,6 @@ def test_convert_nested(self): class TestPeriodConverter: - def setup_method(self, method): self.pc = converter.PeriodConverter() @@ -311,7 +315,7 @@ class Axis: pass self.axis = Axis() - self.axis.freq = 'D' + self.axis.freq = "D" def test_convert_accepts_unicode(self): r1 = self.pc.convert("2012-1-1", None, self.axis) @@ -319,11 +323,11 @@ def test_convert_accepts_unicode(self): assert r1 == r2 def test_conversion(self): - rs = self.pc.convert(['2012-1-1'], None, self.axis)[0] - xp = Period('2012-1-1').ordinal + rs = self.pc.convert(["2012-1-1"], None, self.axis)[0] + xp = Period("2012-1-1").ordinal assert rs == xp - rs = self.pc.convert('2012-1-1', None, self.axis) + rs = self.pc.convert("2012-1-1", None, self.axis) assert rs == xp rs = self.pc.convert([date(2012, 1, 1)], None, self.axis)[0] @@ -332,24 +336,30 @@ def test_conversion(self): rs = self.pc.convert(date(2012, 1, 1), None, self.axis) assert rs == xp - rs = self.pc.convert([Timestamp('2012-1-1')], None, self.axis)[0] + rs = self.pc.convert([Timestamp("2012-1-1")], None, self.axis)[0] assert rs == xp - rs = self.pc.convert(Timestamp('2012-1-1'), None, self.axis) + rs = self.pc.convert(Timestamp("2012-1-1"), None, self.axis) assert rs == xp - rs = self.pc.convert( - np_datetime64_compat('2012-01-01'), None, self.axis) + rs = self.pc.convert(np_datetime64_compat("2012-01-01"), None, self.axis) assert rs == xp rs = self.pc.convert( - np_datetime64_compat('2012-01-01 00:00:00+0000'), None, self.axis) + np_datetime64_compat("2012-01-01 00:00:00+0000"), None, self.axis + ) assert rs == xp - rs = self.pc.convert(np.array([ - np_datetime64_compat('2012-01-01 00:00:00+0000'), - np_datetime64_compat('2012-01-02 00:00:00+0000')]), - None, self.axis) + rs = self.pc.convert( + np.array( + [ + np_datetime64_compat("2012-01-01 00:00:00+0000"), + np_datetime64_compat("2012-01-02 00:00:00+0000"), + ] + ), + None, + self.axis, + ) assert rs[0] == xp def test_integer_passthrough(self): @@ -359,7 +369,7 @@ def test_integer_passthrough(self): assert rs == xp def test_convert_nested(self): - data = ['2012-1-1', '2012-1-2'] + data = ["2012-1-1", "2012-1-2"] r1 = self.pc.convert([data, data], None, self.axis) r2 = [self.pc.convert(data, None, self.axis) for _ in range(2)] assert r1 == r2 diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index c3d824389aa4d..ecd575020eca6 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -22,24 +22,24 @@ @td.skip_if_no_mpl class TestTSPlot(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) - self.freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A'] - idx = [ - period_range('12/31/1999', freq=x, periods=100) for x in self.freq] + self.freq = ["S", "T", "H", "D", "W", "M", "Q", "A"] + idx = [period_range("12/31/1999", freq=x, periods=100) for x in self.freq] self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] - self.period_df = [DataFrame(np.random.randn(len(x), 3), index=x, - columns=['A', 'B', 'C']) - for x in idx] + 
self.period_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] - freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] - idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] + freq = ["S", "T", "H", "D", "W", "M", "Q-DEC", "A", "1B30Min"] + idx = [date_range("12/31/1999", freq=x, periods=100) for x in freq] self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] - self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x, - columns=['A', 'B', 'C']) - for x in idx] + self.datetime_df = [ + DataFrame(np.random.randn(len(x), 3), index=x, columns=["A", "B", "C"]) + for x in idx + ] def teardown_method(self, method): tm.close() @@ -47,8 +47,7 @@ def teardown_method(self, method): @pytest.mark.slow def test_ts_plot_with_tz(self): # GH2877 - index = date_range('1/1/2011', periods=2, freq='H', - tz='Europe/Brussels') + index = date_range("1/1/2011", periods=2, freq="H", tz="Europe/Brussels") ts = Series([188.5, 328.25], index=index) _check_plot_works(ts.plot) @@ -57,13 +56,13 @@ def test_fontsize_set_correctly(self): df = DataFrame(np.random.randn(10, 9), index=range(10)) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) - for label in (ax.get_xticklabels() + ax.get_yticklabels()): + for label in ax.get_xticklabels() + ax.get_yticklabels(): assert label.get_fontsize() == 2 @pytest.mark.slow def test_frame_inferred(self): # inferred freq - idx = date_range('1/1/1987', freq='MS', periods=100) + idx = date_range("1/1/1987", freq="MS", periods=100) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame(np.random.randn(len(idx), 3), index=idx) @@ -75,7 +74,7 @@ def test_frame_inferred(self): _check_plot_works(df2.plot) # N > 1 - idx = date_range('2008-1-1 00:15:00', freq='15T', periods=10) + idx = date_range("2008-1-1 00:15:00", freq="15T", periods=10) idx = DatetimeIndex(idx.values, freq=None) df = DataFrame(np.random.randn(len(idx), 3), index=idx) _check_plot_works(df.plot) @@ -88,8 +87,8 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - idx = date_range('1/1/1987', freq='A', periods=3) - df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}, idx) + idx = date_range("1/1/1987", freq="A", periods=3) + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx) fig, ax = self.plt.subplots() df.plot(ax=ax) # it works @@ -98,7 +97,7 @@ def test_nonnumeric_exclude(self): msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): - df['A'].plot() + df["A"].plot() def test_tsplot_deprecated(self): from pandas.tseries.plotting import tsplot @@ -134,40 +133,41 @@ def f(*args, **kwds): _check_plot_works(s.plot, ax=ax) _, ax = self.plt.subplots() - ts.plot(style='k', ax=ax) - color = (0., 0., 0., 1) + ts.plot(style="k", ax=ax) + color = (0.0, 0.0, 0.0, 1) assert color == ax.get_lines()[0].get_color() def test_both_style_and_color(self): ts = tm.makeTimeSeries() - msg = ("Cannot pass 'style' string with a color symbol and 'color' " - "keyword argument. Please use one or the other or pass 'style'" - " without a color symbol") + msg = ( + "Cannot pass 'style' string with a color symbol and 'color' " + "keyword argument. 
Please use one or the other or pass 'style'" + " without a color symbol" + ) with pytest.raises(ValueError, match=msg): - ts.plot(style='b-', color='#000099') + ts.plot(style="b-", color="#000099") s = ts.reset_index(drop=True) with pytest.raises(ValueError, match=msg): - s.plot(style='b-', color='#000099') + s.plot(style="b-", color="#000099") @pytest.mark.slow def test_high_freq(self): - freaks = ['ms', 'us'] + freaks = ["ms", "us"] for freq in freaks: _, ax = self.plt.subplots() - rng = date_range('1/1/2012', periods=100, freq=freq) + rng = date_range("1/1/2012", periods=100, freq=freq) ser = Series(np.random.randn(len(rng)), rng) _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): from pandas.plotting._matplotlib.converter import get_datevalue - assert get_datevalue(None, 'D') is None - assert get_datevalue(1987, 'A') == 1987 - assert (get_datevalue(Period(1987, 'A'), 'M') == - Period('1987-12', 'M').ordinal) - assert (get_datevalue('1/1/1987', 'D') == - Period('1987-1-1', 'D').ordinal) + + assert get_datevalue(None, "D") is None + assert get_datevalue(1987, "A") == 1987 + assert get_datevalue(Period(1987, "A"), "M") == Period("1987-12", "M").ordinal + assert get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal @pytest.mark.slow def test_ts_plot_format_coord(self): @@ -178,32 +178,32 @@ def check_format_of_first_point(ax, expected_string): try: assert expected_string == ax.format_coord(first_x, first_y) except (ValueError): - pytest.skip("skipping test because issue forming " - "test comparison GH7664") + pytest.skip( + "skipping test because issue forming " "test comparison GH7664" + ) - annual = Series(1, index=date_range('2014-01-01', periods=3, - freq='A-DEC')) + annual = Series(1, index=date_range("2014-01-01", periods=3, freq="A-DEC")) _, ax = self.plt.subplots() annual.plot(ax=ax) - check_format_of_first_point(ax, 't = 2014 y = 1.000000') + check_format_of_first_point(ax, "t = 2014 y = 1.000000") # note this is added to the annual plot already in existence, and # changes its freq field - daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D')) + daily = Series(1, index=date_range("2014-01-01", periods=3, freq="D")) daily.plot(ax=ax) - check_format_of_first_point(ax, - 't = 2014-01-01 y = 1.000000') + check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") tm.close() # tsplot from pandas.tseries.plotting import tsplot + _, ax = self.plt.subplots() with tm.assert_produces_warning(FutureWarning): tsplot(annual, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, 't = 2014 y = 1.000000') + check_format_of_first_point(ax, "t = 2014 y = 1.000000") with tm.assert_produces_warning(FutureWarning): tsplot(daily, self.plt.Axes.plot, ax=ax) - check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') + check_format_of_first_point(ax, "t = 2014-01-01 y = 1.000000") @pytest.mark.slow def test_line_plot_period_series(self): @@ -212,11 +212,12 @@ def test_line_plot_period_series(self): @pytest.mark.slow @pytest.mark.parametrize( - 'frqncy', ['1S', '3S', '5T', '7H', '4D', '8W', '11M', '3A']) + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) def test_line_plot_period_mlt_series(self, frqncy): # test period index line plot for series with multiples (`mlt`) of the # frequency (`frqncy`) rule code. 
tests resolution of issue #14763 - idx = period_range('12/31/1999', freq=frqncy, periods=100) + idx = period_range("12/31/1999", freq=frqncy, periods=100) s = Series(np.random.randn(len(idx)), idx) _check_plot_works(s.plot, s.index.freq.rule_code) @@ -232,14 +233,14 @@ def test_line_plot_period_frame(self): @pytest.mark.slow @pytest.mark.parametrize( - 'frqncy', ['1S', '3S', '5T', '7H', '4D', '8W', '11M', '3A']) + "frqncy", ["1S", "3S", "5T", "7H", "4D", "8W", "11M", "3A"] + ) def test_line_plot_period_mlt_frame(self, frqncy): # test period index line plot for DataFrames with multiples (`mlt`) # of the frequency (`frqncy`) rule code. tests resolution of issue # #14763 - idx = period_range('12/31/1999', freq=frqncy, periods=100) - df = DataFrame(np.random.randn(len(idx), 3), index=idx, - columns=['A', 'B', 'C']) + idx = period_range("12/31/1999", freq=frqncy, periods=100) + df = DataFrame(np.random.randn(len(idx), 3), index=idx, columns=["A", "B", "C"]) freq = df.index.asfreq(df.index.freq.rule_code).freq _check_plot_works(df.plot, freq) @@ -260,33 +261,32 @@ def test_line_plot_inferred_freq(self): def test_fake_inferred_business(self): _, ax = self.plt.subplots() - rng = date_range('2001-1-1', '2001-1-10') + rng = date_range("2001-1-1", "2001-1-10") ts = Series(range(len(rng)), index=rng) ts = ts[:3].append(ts[5:]) ts.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") @pytest.mark.slow def test_plot_offset_freq(self): ser = tm.makeTimeSeries() _check_plot_works(ser.plot) - dr = date_range(ser.index[0], freq='BQS', periods=10) + dr = date_range(ser.index[0], freq="BQS", periods=10) ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) @pytest.mark.slow def test_plot_multiple_inferred_freq(self): - dr = Index([datetime(2000, 1, 1), - datetime(2000, 1, 6), - datetime(2000, 1, 11)]) + dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)]) ser = Series(np.random.randn(len(dr)), index=dr) _check_plot_works(ser.plot) @pytest.mark.slow def test_uhf(self): import pandas.plotting._matplotlib.converter as conv - idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) + + idx = date_range("2012-6-22 21:59:51.960928", freq="L", periods=500) df = DataFrame(np.random.randn(len(idx), 2), index=idx) _, ax = self.plt.subplots() @@ -296,14 +296,14 @@ def test_uhf(self): tlocs = axis.get_ticklocs() tlabels = axis.get_ticklabels() for loc, label in zip(tlocs, tlabels): - xp = conv._from_ordinal(loc).strftime('%H:%M:%S.%f') + xp = conv._from_ordinal(loc).strftime("%H:%M:%S.%f") rs = str(label.get_text()) if len(rs): assert xp == rs @pytest.mark.slow def test_irreg_hf(self): - idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) + idx = date_range("2012-6-22 21:59:51", freq="S", periods=100) df = DataFrame(np.random.randn(len(idx), 2), index=idx) irreg = df.iloc[[0, 1, 3, 4]] @@ -311,7 +311,7 @@ def test_irreg_hf(self): irreg.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() - sec = 1. 
/ 24 / 60 / 60 + sec = 1.0 / 24 / 60 / 60 assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() _, ax = self.plt.subplots() @@ -339,22 +339,21 @@ def test_business_freq(self): bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() - assert PeriodIndex(data=idx).freqstr == 'B' + assert PeriodIndex(data=idx).freqstr == "B" @pytest.mark.slow def test_business_freq_convert(self): - bts = tm.makeTimeSeries(300).asfreq('BM') - ts = bts.to_period('M') + bts = tm.makeTimeSeries(300).asfreq("BM") + ts = bts.to_period("M") _, ax = self.plt.subplots() bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() - assert PeriodIndex(data=idx).freqstr == 'M' + assert PeriodIndex(data=idx).freqstr == "M" def test_nonzero_base(self): # GH2571 - idx = (date_range('2012-12-20', periods=24, freq='H') + timedelta( - minutes=30)) + idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) df = DataFrame(np.arange(24), index=idx) _, ax = self.plt.subplots() df.plot(ax=ax) @@ -362,7 +361,7 @@ def test_nonzero_base(self): assert not Index(rs).is_normalized def test_dataframe(self): - bts = DataFrame({'a': tm.makeTimeSeries()}) + bts = DataFrame({"a": tm.makeTimeSeries()}) _, ax = self.plt.subplots() bts.plot(ax=ax) idx = ax.get_lines()[0].get_xdata() @@ -370,7 +369,6 @@ def test_dataframe(self): @pytest.mark.slow def test_axis_limits(self): - def _test(ax): xlim = ax.get_xlim() ax.set_xlim(xlim[0] - 5, xlim[1] + 10) @@ -379,16 +377,14 @@ def _test(ax): assert result[1] == xlim[1] + 10 # string - expected = (Period('1/1/2000', ax.freq), - Period('4/1/2000', ax.freq)) - ax.set_xlim('1/1/2000', '4/1/2000') + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) + ax.set_xlim("1/1/2000", "4/1/2000") result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal # datetime - expected = (Period('1/1/2000', ax.freq), - Period('4/1/2000', ax.freq)) + expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq)) ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) result = ax.get_xlim() assert int(result[0]) == expected[0].ordinal @@ -402,11 +398,11 @@ def _test(ax): _test(ax) _, ax = self.plt.subplots() - df = DataFrame({'a': ser, 'b': ser + 1}) + df = DataFrame({"a": ser, "b": ser + 1}) df.plot(ax=ax) _test(ax) - df = DataFrame({'a': ser, 'b': ser + 1}) + df = DataFrame({"a": ser, "b": ser + 1}) axes = df.plot(subplots=True) for ax in axes: @@ -415,22 +411,22 @@ def _test(ax): def test_get_finder(self): import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder('B') == conv._daily_finder - assert conv.get_finder('D') == conv._daily_finder - assert conv.get_finder('M') == conv._monthly_finder - assert conv.get_finder('Q') == conv._quarterly_finder - assert conv.get_finder('A') == conv._annual_finder - assert conv.get_finder('W') == conv._daily_finder + assert conv.get_finder("B") == conv._daily_finder + assert conv.get_finder("D") == conv._daily_finder + assert conv.get_finder("M") == conv._monthly_finder + assert conv.get_finder("Q") == conv._quarterly_finder + assert conv.get_finder("A") == conv._annual_finder + assert conv.get_finder("W") == conv._daily_finder @pytest.mark.slow def test_finder_daily(self): day_lst = [10, 40, 252, 400, 950, 2750, 10000] - xpl1 = xpl2 = [Period('1999-1-1', freq='B').ordinal] * len(day_lst) + xpl1 = xpl2 = [Period("1999-1-1", freq="B").ordinal] 
* len(day_lst) rs1 = [] rs2 = [] for i, n in enumerate(day_lst): - rng = bdate_range('1999-1-1', periods=n) + rng = bdate_range("1999-1-1", periods=n) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -449,11 +445,11 @@ def test_finder_daily(self): def test_finder_quarterly(self): yrs = [3.5, 11] - xpl1 = xpl2 = [Period('1988Q1').ordinal] * len(yrs) + xpl1 = xpl2 = [Period("1988Q1").ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): - rng = period_range('1987Q2', periods=int(n * 4), freq='Q') + rng = period_range("1987Q2", periods=int(n * 4), freq="Q") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -472,11 +468,11 @@ def test_finder_quarterly(self): def test_finder_monthly(self): yrs = [1.15, 2.5, 4, 11] - xpl1 = xpl2 = [Period('Jan 1988').ordinal] * len(yrs) + xpl1 = xpl2 = [Period("Jan 1988").ordinal] * len(yrs) rs1 = [] rs2 = [] for i, n in enumerate(yrs): - rng = period_range('1987Q2', periods=int(n * 12), freq='M') + rng = period_range("1987Q2", periods=int(n * 12), freq="M") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -492,22 +488,22 @@ def test_finder_monthly(self): assert rs2 == xpl2 def test_finder_monthly_long(self): - rng = period_range('1988Q1', periods=24 * 12, freq='M') + rng = period_range("1988Q1", periods=24 * 12, freq="M") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1989Q1', 'M').ordinal + xp = Period("1989Q1", "M").ordinal assert rs == xp @pytest.mark.slow def test_finder_annual(self): xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] - xp = [Period(x, freq='A').ordinal for x in xp] + xp = [Period(x, freq="A").ordinal for x in xp] rs = [] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): - rng = period_range('1987', periods=nyears, freq='A') + rng = period_range("1987", periods=nyears, freq="A") ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) @@ -520,25 +516,25 @@ def test_finder_annual(self): @pytest.mark.slow def test_finder_minutely(self): nminutes = 50 * 24 * 60 - rng = date_range('1/1/1999', freq='Min', periods=nminutes) + rng = date_range("1/1/1999", freq="Min", periods=nminutes) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1/1/1999', freq='Min').ordinal + xp = Period("1/1/1999", freq="Min").ordinal assert rs == xp def test_finder_hourly(self): nhours = 23 - rng = date_range('1/1/1999', freq='H', periods=nhours) + rng = date_range("1/1/1999", freq="H", periods=nhours) ser = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] - xp = Period('1/1/1999', freq='H').ordinal + xp = Period("1/1/1999", freq="H").ordinal assert rs == xp @@ -604,7 +600,7 @@ def test_gap_upsample(self): _, ax = self.plt.subplots() low.plot(ax=ax) - idxh = date_range(low.index[0], low.index[-1], freq='12h') + idxh = date_range(low.index[0], low.index[-1], freq="12h") s = Series(np.random.randn(len(idxh)), idxh) s.plot(secondary_y=True) lines = ax.get_lines() @@ -626,51 +622,49 @@ def test_secondary_y(self): ser2 = Series(np.random.randn(10)) fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + 
assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() line = ax.get_lines()[0] xp = Series(line.get_ydata(), line.get_xdata()) assert_series_equal(ser, xp) - assert ax.get_yaxis().get_ticks_position() == 'right' + assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() self.plt.close(fig) _, ax2 = self.plt.subplots() ser2.plot(ax=ax2) - assert (ax2.get_yaxis().get_ticks_position() == - self.default_tick_position) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) assert ax.get_yaxis().get_visible() - assert not hasattr(ax, 'left_ax') - assert hasattr(ax, 'right_ax') - assert hasattr(ax2, 'left_ax') - assert not hasattr(ax2, 'right_ax') + assert not hasattr(ax, "left_ax") + assert hasattr(ax, "right_ax") + assert hasattr(ax2, "left_ax") + assert not hasattr(ax2, "right_ax") @pytest.mark.slow def test_secondary_y_ts(self): - idx = date_range('1/1/2000', periods=10) + idx = date_range("1/1/2000", periods=10) ser = Series(np.random.randn(10), idx) ser2 = Series(np.random.randn(10), idx) fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() line = ax.get_lines()[0] xp = Series(line.get_ydata(), line.get_xdata()).to_timestamp() assert_series_equal(ser, xp) - assert ax.get_yaxis().get_ticks_position() == 'right' + assert ax.get_yaxis().get_ticks_position() == "right" assert not axes[0].get_yaxis().get_visible() self.plt.close(fig) _, ax2 = self.plt.subplots() ser2.plot(ax=ax2) - assert (ax2.get_yaxis().get_ticks_position() == - self.default_tick_position) + assert ax2.get_yaxis().get_ticks_position() == self.default_tick_position self.plt.close(ax2.get_figure()) ax = ser2.plot() @@ -683,37 +677,35 @@ def test_secondary_kde(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() - ax = ser.plot(secondary_y=True, kind='density', ax=ax) - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + ax = ser.plot(secondary_y=True, kind="density", ax=ax) + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") axes = fig.get_axes() - assert axes[1].get_yaxis().get_ticks_position() == 'right' + assert axes[1].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) fig, ax = self.plt.subplots() - ser.plot(secondary_y=True, kind='bar', ax=ax) + ser.plot(secondary_y=True, kind="bar", ax=ax) axes = fig.get_axes() - assert axes[1].get_yaxis().get_ticks_position() == 'right' + assert axes[1].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def test_secondary_frame(self): - df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) - axes = df.plot(secondary_y=['a', 'c'], subplots=True) - assert axes[0].get_yaxis().get_ticks_position() == 'right' - assert (axes[1].get_yaxis().get_ticks_position() == - self.default_tick_position) - assert axes[2].get_yaxis().get_ticks_position() == 'right' + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" @pytest.mark.slow def 
test_secondary_bar_frame(self): - df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) - axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) - assert axes[0].get_yaxis().get_ticks_position() == 'right' - assert (axes[1].get_yaxis().get_ticks_position() == - self.default_tick_position) - assert axes[2].get_yaxis().get_ticks_position() == 'right' + df = DataFrame(np.random.randn(5, 3), columns=["a", "b", "c"]) + axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True) + assert axes[0].get_yaxis().get_ticks_position() == "right" + assert axes[1].get_yaxis().get_ticks_position() == self.default_tick_position + assert axes[2].get_yaxis().get_ticks_position() == "right" def test_mixed_freq_regular_first(self): # TODO @@ -724,13 +716,13 @@ def test_mixed_freq_regular_first(self): _, ax = self.plt.subplots() s1.plot(ax=ax) - ax2 = s2.plot(style='g', ax=ax) + ax2 = s2.plot(style="g", ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) idx2 = PeriodIndex(lines[1].get_xdata()) - tm.assert_index_equal(idx1, s1.index.to_period('B')) - tm.assert_index_equal(idx2, s2.index.to_period('B')) + tm.assert_index_equal(idx1, s1.index.to_period("B")) + tm.assert_index_equal(idx2, s2.index.to_period("B")) left, right = ax2.get_xlim() pidx = s1.index.to_period() @@ -742,9 +734,9 @@ def test_mixed_freq_irregular_first(self): s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] _, ax = self.plt.subplots() - s2.plot(style='g', ax=ax) + s2.plot(style="g", ax=ax) s1.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") lines = ax.get_lines() x1 = lines[0].get_xdata() tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) @@ -757,12 +749,12 @@ def test_mixed_freq_regular_first_df(self): s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = self.plt.subplots() s1.plot(ax=ax) - ax2 = s2.plot(style='g', ax=ax) + ax2 = s2.plot(style="g", ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) idx2 = PeriodIndex(lines[1].get_xdata()) - assert idx1.equals(s1.index.to_period('B')) - assert idx2.equals(s2.index.to_period('B')) + assert idx1.equals(s1.index.to_period("B")) + assert idx2.equals(s2.index.to_period("B")) left, right = ax2.get_xlim() pidx = s1.index.to_period() assert left <= pidx[0].ordinal @@ -774,9 +766,9 @@ def test_mixed_freq_irregular_first_df(self): s1 = tm.makeTimeSeries().to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] _, ax = self.plt.subplots() - s2.plot(style='g', ax=ax) + s2.plot(style="g", ax=ax) s1.plot(ax=ax) - assert not hasattr(ax, 'freq') + assert not hasattr(ax, "freq") lines = ax.get_lines() x1 = lines[0].get_xdata() tm.assert_numpy_array_equal(x1, s2.index.astype(object).values) @@ -784,60 +776,60 @@ def test_mixed_freq_irregular_first_df(self): tm.assert_numpy_array_equal(x2, s1.index.astype(object).values) def test_mixed_freq_hf_first(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() high.plot(ax=ax) low.plot(ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'D' + assert PeriodIndex(data=l.get_xdata()).freq == "D" @pytest.mark.slow def test_mixed_freq_alignment(self): - ts_ind = date_range('2012-01-01 13:00', '2012-01-02', freq='H') + ts_ind = 
date_range("2012-01-01 13:00", "2012-01-02", freq="H") ts_data = np.random.randn(12) ts = Series(ts_data, index=ts_ind) - ts2 = ts.asfreq('T').interpolate() + ts2 = ts.asfreq("T").interpolate() _, ax = self.plt.subplots() ax = ts.plot(ax=ax) - ts2.plot(style='r', ax=ax) + ts2.plot(style="r", ax=ax) assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] @pytest.mark.slow def test_mixed_freq_lf_first(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(legend=True, ax=ax) high.plot(legend=True, ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'D' + assert PeriodIndex(data=l.get_xdata()).freq == "D" leg = ax.get_legend() assert len(leg.texts) == 2 self.plt.close(ax.get_figure()) - idxh = date_range('1/1/1999', periods=240, freq='T') - idxl = date_range('1/1/1999', periods=4, freq='H') + idxh = date_range("1/1/1999", periods=240, freq="T") + idxl = date_range("1/1/1999", periods=4, freq="H") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(ax=ax) high.plot(ax=ax) for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'T' + assert PeriodIndex(data=l.get_xdata()).freq == "T" def test_mixed_freq_irreg_period(self): ts = tm.makeTimeSeries() irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] - rng = period_range('1/3/2000', periods=30, freq='B') + rng = period_range("1/3/2000", periods=30, freq="B") ps = Series(np.random.randn(len(rng)), rng) _, ax = self.plt.subplots() irreg.plot(ax=ax) @@ -846,7 +838,7 @@ def test_mixed_freq_irreg_period(self): def test_mixed_freq_shared_ax(self): # GH13341, using sharex=True - idx1 = date_range('2015-01-01', periods=3, freq='M') + idx1 = date_range("2015-01-01", periods=3, freq="M") idx2 = idx1[:1].union(idx1[2:]) s1 = Series(range(len(idx1)), idx1) s2 = Series(range(len(idx2)), idx2) @@ -855,10 +847,9 @@ def test_mixed_freq_shared_ax(self): s1.plot(ax=ax1) s2.plot(ax=ax2) - assert ax1.freq == 'M' - assert ax2.freq == 'M' - assert (ax1.lines[0].get_xydata()[0, 0] == - ax2.lines[0].get_xydata()[0, 0]) + assert ax1.freq == "M" + assert ax2.freq == "M" + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] # using twinx fig, ax1 = self.plt.subplots() @@ -866,8 +857,7 @@ def test_mixed_freq_shared_ax(self): s1.plot(ax=ax1) s2.plot(ax=ax2) - assert (ax1.lines[0].get_xydata()[0, 0] == - ax2.lines[0].get_xydata()[0, 0]) + assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0] # TODO (GH14330, GH14322) # plotting the irregular first does not yet work @@ -882,7 +872,7 @@ def test_nat_handling(self): _, ax = self.plt.subplots() - dti = DatetimeIndex(['2015-01-01', NaT, '2015-01-03']) + dti = DatetimeIndex(["2015-01-01", NaT, "2015-01-03"]) s = Series(range(len(dti)), dti) s.plot(ax=ax) xdata = ax.get_lines()[0].get_xdata() @@ -892,8 +882,8 @@ def test_nat_handling(self): @pytest.mark.slow def test_to_weekly_resampling(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), 
idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() @@ -904,6 +894,7 @@ def test_to_weekly_resampling(self): _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): tsplot(high, self.plt.Axes.plot, ax=ax) with tm.assert_produces_warning(FutureWarning): @@ -913,8 +904,8 @@ def test_to_weekly_resampling(self): @pytest.mark.slow def test_from_weekly_resampling(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() @@ -922,8 +913,10 @@ def test_from_weekly_resampling(self): high.plot(ax=ax) expected_h = idxh.to_period().asi8.astype(np.float64) - expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, - 1549, 1553, 1558, 1562], dtype=np.float64) + expected_l = np.array( + [1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, 1549, 1553, 1558, 1562], + dtype=np.float64, + ) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq xdata = l.get_xdata(orig=False) @@ -935,6 +928,7 @@ def test_from_weekly_resampling(self): _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot + with tm.assert_produces_warning(FutureWarning): tsplot(low, self.plt.Axes.plot, ax=ax) with tm.assert_produces_warning(FutureWarning): @@ -949,33 +943,43 @@ def test_from_weekly_resampling(self): @pytest.mark.slow def test_from_resampling_area_line_mixed(self): - idxh = date_range('1/1/1999', periods=52, freq='W') - idxl = date_range('1/1/1999', periods=12, freq='M') - high = DataFrame(np.random.rand(len(idxh), 3), - index=idxh, columns=[0, 1, 2]) - low = DataFrame(np.random.rand(len(idxl), 3), - index=idxl, columns=[0, 1, 2]) + idxh = date_range("1/1/1999", periods=52, freq="W") + idxl = date_range("1/1/1999", periods=12, freq="M") + high = DataFrame(np.random.rand(len(idxh), 3), index=idxh, columns=[0, 1, 2]) + low = DataFrame(np.random.rand(len(idxl), 3), index=idxl, columns=[0, 1, 2]) # low to high - for kind1, kind2 in [('line', 'area'), ('area', 'line')]: + for kind1, kind2 in [("line", "area"), ("area", "line")]: _, ax = self.plt.subplots() low.plot(kind=kind1, stacked=True, ax=ax) high.plot(kind=kind2, stacked=True, ax=ax) # check low dataframe result - expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, - 1544, 1549, 1553, 1558, 1562], - dtype=np.float64) + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 1553, + 1558, + 1562, + ], + dtype=np.float64, + ) expected_y = np.zeros(len(expected_x), dtype=np.float64) for i in range(3): line = ax.lines[i] assert PeriodIndex(line.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) # check stacked values are correct expected_y += low[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # check high dataframe result expected_x = idxh.to_period().asi8.astype(np.float64) @@ -983,14 +987,12 @@ def test_from_resampling_area_line_mixed(self): for i in range(3): line = ax.lines[3 + i] assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq - 
tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) expected_y += high[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # high to low - for kind1, kind2 in [('line', 'area'), ('area', 'line')]: + for kind1, kind2 in [("line", "area"), ("area", "line")]: _, ax = self.plt.subplots() high.plot(kind=kind1, stacked=True, ax=ax) low.plot(kind=kind2, stacked=True, ax=ax) @@ -1001,31 +1003,41 @@ def test_from_resampling_area_line_mixed(self): for i in range(3): line = ax.lines[i] assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(line.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x) expected_y += high[i].values - tm.assert_numpy_array_equal(line.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y) # check low dataframe result - expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, - 1544, 1549, 1553, 1558, 1562], - dtype=np.float64) + expected_x = np.array( + [ + 1514, + 1519, + 1523, + 1527, + 1531, + 1536, + 1540, + 1544, + 1549, + 1553, + 1558, + 1562, + ], + dtype=np.float64, + ) expected_y = np.zeros(len(expected_x), dtype=np.float64) for i in range(3): lines = ax.lines[3 + i] assert PeriodIndex(data=lines.get_xdata()).freq == idxh.freq - tm.assert_numpy_array_equal(lines.get_xdata(orig=False), - expected_x) + tm.assert_numpy_array_equal(lines.get_xdata(orig=False), expected_x) expected_y += low[i].values - tm.assert_numpy_array_equal(lines.get_ydata(orig=False), - expected_y) + tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y) @pytest.mark.slow def test_mixed_freq_second_millisecond(self): # GH 7772, GH 7760 - idxh = date_range('2014-07-01 09:00', freq='S', periods=50) - idxl = date_range('2014-07-01 09:00', freq='100L', periods=500) + idxh = date_range("2014-07-01 09:00", freq="S", periods=50) + idxl = date_range("2014-07-01 09:00", freq="100L", periods=500) high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) # high to low @@ -1034,7 +1046,7 @@ def test_mixed_freq_second_millisecond(self): low.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'L' + assert PeriodIndex(data=l.get_xdata()).freq == "L" tm.close() # low to high @@ -1043,7 +1055,7 @@ def test_mixed_freq_second_millisecond(self): high.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): - assert PeriodIndex(data=l.get_xdata()).freq == 'L' + assert PeriodIndex(data=l.get_xdata()).freq == "L" @pytest.mark.slow def test_irreg_dtypes(self): @@ -1053,7 +1065,7 @@ def test_irreg_dtypes(self): _check_plot_works(df.plot) # np.datetime64 - idx = date_range('1/1/2000', periods=10) + idx = date_range("1/1/2000", periods=10) idx = idx[[0, 2, 5, 9]].astype(object) df = DataFrame(np.random.randn(len(idx), 3), idx) _, ax = self.plt.subplots() @@ -1064,9 +1076,9 @@ def test_time(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() 
df.plot(ax=ax) @@ -1079,9 +1091,9 @@ def test_time(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs @pytest.mark.slow @@ -1090,9 +1102,9 @@ def test_time_change_xlim(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() df.plot(ax=ax) @@ -1105,13 +1117,13 @@ def test_time_change_xlim(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs # change xlim - ax.set_xlim('1:30', '5:00') + ax.set_xlim("1:30", "5:00") # check tick labels again ticks = ax.get_xticks() @@ -1122,20 +1134,19 @@ def test_time_change_xlim(self): rs = l.get_text() if len(rs) > 0: if s != 0: - xp = time(h, m, s).strftime('%H:%M:%S') + xp = time(h, m, s).strftime("%H:%M:%S") else: - xp = time(h, m, s).strftime('%H:%M') + xp = time(h, m, s).strftime("%H:%M") assert xp == rs @pytest.mark.slow def test_time_musec(self): t = datetime(1, 1, 1, 3, 30, 0) deltas = np.random.randint(1, 20, 3).cumsum() - ts = np.array([(t + timedelta(microseconds=int(x))).time() - for x in deltas]) - df = DataFrame({'a': np.random.randn(len(ts)), - 'b': np.random.randn(len(ts))}, - index=ts) + ts = np.array([(t + timedelta(microseconds=int(x))).time() for x in deltas]) + df = DataFrame( + {"a": np.random.randn(len(ts)), "b": np.random.randn(len(ts))}, index=ts + ) fig, ax = self.plt.subplots() ax = df.plot(ax=ax) @@ -1151,30 +1162,30 @@ def test_time_musec(self): rs = l.get_text() if len(rs) > 0: if (us % 1000) != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S.%f') + xp = time(h, m, s, us).strftime("%H:%M:%S.%f") elif (us // 1000) != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S.%f')[:-3] + xp = time(h, m, s, us).strftime("%H:%M:%S.%f")[:-3] elif s != 0: - xp = time(h, m, s, us).strftime('%H:%M:%S') + xp = time(h, m, s, us).strftime("%H:%M:%S") else: - xp = time(h, m, s, us).strftime('%H:%M') + xp = time(h, m, s, us).strftime("%H:%M") assert xp == rs @pytest.mark.slow def test_secondary_upsample(self): - idxh = date_range('1/1/1999', periods=365, freq='D') - idxl = date_range('1/1/1999', periods=12, freq='M') + idxh = date_range("1/1/1999", periods=365, freq="D") + idxl = date_range("1/1/1999", periods=12, freq="M") high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) _, ax = self.plt.subplots() low.plot(ax=ax) ax = high.plot(secondary_y=True, ax=ax) for l in ax.get_lines(): - assert PeriodIndex(l.get_xdata()).freq == 'D' - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert PeriodIndex(l.get_xdata()).freq == "D" + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") for l in ax.left_ax.get_lines(): - assert PeriodIndex(l.get_xdata()).freq == 'D' + assert PeriodIndex(l.get_xdata()).freq == "D" @pytest.mark.slow def test_secondary_legend(self): @@ -1183,13 +1194,13 @@ def test_secondary_legend(self): # ts df = tm.makeTimeDataFrame() - df.plot(secondary_y=['A', 'B'], ax=ax) + 
df.plot(secondary_y=["A", "B"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 - assert leg.get_texts()[0].get_text() == 'A (right)' - assert leg.get_texts()[1].get_text() == 'B (right)' - assert leg.get_texts()[2].get_text() == 'C' - assert leg.get_texts()[3].get_text() == 'D' + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B (right)" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" assert ax.right_ax.get_legend() is None colors = set() for line in leg.get_lines(): @@ -1201,33 +1212,33 @@ def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) - df.plot(secondary_y=['A', 'C'], mark_right=False, ax=ax) + df.plot(secondary_y=["A", "C"], mark_right=False, ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 - assert leg.get_texts()[0].get_text() == 'A' - assert leg.get_texts()[1].get_text() == 'B' - assert leg.get_texts()[2].get_text() == 'C' - assert leg.get_texts()[3].get_text() == 'D' + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" + assert leg.get_texts()[2].get_text() == "C" + assert leg.get_texts()[3].get_text() == "D" self.plt.close(fig) fig, ax = self.plt.subplots() - df.plot(kind='bar', secondary_y=['A'], ax=ax) + df.plot(kind="bar", secondary_y=["A"], ax=ax) leg = ax.get_legend() - assert leg.get_texts()[0].get_text() == 'A (right)' - assert leg.get_texts()[1].get_text() == 'B' + assert leg.get_texts()[0].get_text() == "A (right)" + assert leg.get_texts()[1].get_text() == "B" self.plt.close(fig) fig, ax = self.plt.subplots() - df.plot(kind='bar', secondary_y=['A'], mark_right=False, ax=ax) + df.plot(kind="bar", secondary_y=["A"], mark_right=False, ax=ax) leg = ax.get_legend() - assert leg.get_texts()[0].get_text() == 'A' - assert leg.get_texts()[1].get_text() == 'B' + assert leg.get_texts()[0].get_text() == "A" + assert leg.get_texts()[1].get_text() == "B" self.plt.close(fig) fig = self.plt.figure() ax = fig.add_subplot(211) df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['C', 'D'], ax=ax) + ax = df.plot(secondary_y=["C", "D"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1243,7 +1254,7 @@ def test_secondary_legend(self): df = tm.makeDataFrame() fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'B'], ax=ax) + ax = df.plot(secondary_y=["A", "B"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1257,7 +1268,7 @@ def test_secondary_legend(self): fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['C', 'D'], ax=ax) + ax = df.plot(secondary_y=["C", "D"], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1269,7 +1280,7 @@ def test_secondary_legend(self): assert len(colors) == 4 def test_format_date_axis(self): - rng = date_range('1/1/2012', periods=12, freq='M') + rng = date_range("1/1/2012", periods=12, freq="M") df = DataFrame(np.random.randn(len(rng), 3), rng) _, ax = self.plt.subplots() ax = df.plot(ax=ax) @@ -1280,10 +1291,10 @@ def test_format_date_axis(self): @pytest.mark.slow def test_ax_plot(self): - x = date_range(start='2012-01-02', periods=10, freq='D') + x = date_range(start="2012-01-02", periods=10, freq="D") y = list(range(len(x))) _, ax = self.plt.subplots() - lines = ax.plot(x, y, label='Y') + lines = ax.plot(x, y, label="Y") 
tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) @pytest.mark.slow @@ -1292,7 +1303,7 @@ def test_mpl_nopandas(self): values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) - kw = dict(fmt='-', lw=4) + kw = dict(fmt="-", lw=4) _, ax = self.plt.subplots() ax.plot_date([x.toordinal() for x in dates], values1, **kw) @@ -1341,8 +1352,8 @@ def test_secondary_y_non_ts_xlim(self): @pytest.mark.slow def test_secondary_y_regular_ts_xlim(self): # GH 3490 - regular-timeseries with secondary y - index_1 = date_range(start='2000-01-01', periods=4, freq='D') - index_2 = date_range(start='2000-01-05', periods=4, freq='D') + index_1 = date_range(start="2000-01-01", periods=4, freq="D") + index_2 = date_range(start="2000-01-05", periods=4, freq="D") s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) @@ -1358,13 +1369,13 @@ def test_secondary_y_regular_ts_xlim(self): @pytest.mark.slow def test_secondary_y_mixed_freq_ts_xlim(self): # GH 3490 - mixed frequency timeseries with secondary y - rng = date_range('2000-01-01', periods=10000, freq='min') + rng = date_range("2000-01-01", periods=10000, freq="min") ts = Series(1, index=rng) _, ax = self.plt.subplots() ts.plot(ax=ax) left_before, right_before = ax.get_xlim() - ts.resample('D').mean().plot(secondary_y=True, ax=ax) + ts.resample("D").mean().plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() # a downsample should not have changed either limit @@ -1399,10 +1410,9 @@ def test_plot_outofbounds_datetime(self): def test_format_timedelta_ticks_narrow(self): - expected_labels = (['00:00:00.0000000{:0>2d}'.format(i) - for i in range(10)]) + expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in range(10)] - rng = timedelta_range('0', periods=10, freq='ns') + rng = timedelta_range("0", periods=10, freq="ns") df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() df.plot(fontsize=2, ax=ax) @@ -1415,18 +1425,18 @@ def test_format_timedelta_ticks_narrow(self): def test_format_timedelta_ticks_wide(self): expected_labels = [ - '00:00:00', - '1 days 03:46:40', - '2 days 07:33:20', - '3 days 11:20:00', - '4 days 15:06:40', - '5 days 18:53:20', - '6 days 22:40:00', - '8 days 02:26:40', - '9 days 06:13:20', + "00:00:00", + "1 days 03:46:40", + "2 days 07:33:20", + "3 days 11:20:00", + "4 days 15:06:40", + "5 days 18:53:20", + "6 days 22:40:00", + "8 days 02:26:40", + "9 days 06:13:20", ] - rng = timedelta_range('0', periods=10, freq='1 d') + rng = timedelta_range("0", periods=10, freq="1 d") df = DataFrame(np.random.randn(len(rng), 3), rng) fig, ax = self.plt.subplots() ax = df.plot(fontsize=2, ax=ax) @@ -1439,42 +1449,50 @@ def test_format_timedelta_ticks_wide(self): def test_timedelta_plot(self): # test issue #8711 - s = Series(range(5), timedelta_range('1day', periods=5)) + s = Series(range(5), timedelta_range("1day", periods=5)) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) # test long period - index = timedelta_range('1 day 2 hr 30 min 10 s', - periods=10, freq='1 d') + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 d") s = Series(np.random.randn(len(index)), index) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) # test short period - index = timedelta_range('1 day 2 hr 30 min 10 s', - periods=10, freq='1 ns') + index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 ns") s = Series(np.random.randn(len(index)), index) _, ax = self.plt.subplots() _check_plot_works(s.plot, ax=ax) def test_hist(self): # 
https://github.com/matplotlib/matplotlib/issues/8459 - rng = date_range('1/1/2011', periods=10, freq='H') + rng = date_range("1/1/2011", periods=10, freq="H") x = rng - w1 = np.arange(0, 1, .1) - w2 = np.arange(0, 1, .1)[::-1] + w1 = np.arange(0, 1, 0.1) + w2 = np.arange(0, 1, 0.1)[::-1] _, ax = self.plt.subplots() ax.hist([x, x], weights=[w1, w2]) @pytest.mark.slow def test_overlapping_datetime(self): # GB 6608 - s1 = Series([1, 2, 3], index=[datetime(1995, 12, 31), - datetime(2000, 12, 31), - datetime(2005, 12, 31)]) - s2 = Series([1, 2, 3], index=[datetime(1997, 12, 31), - datetime(2003, 12, 31), - datetime(2008, 12, 31)]) + s1 = Series( + [1, 2, 3], + index=[ + datetime(1995, 12, 31), + datetime(2000, 12, 31), + datetime(2005, 12, 31), + ], + ) + s2 = Series( + [1, 2, 3], + index=[ + datetime(1997, 12, 31), + datetime(2003, 12, 31), + datetime(2008, 12, 31), + ], + ) # plot first series, then add the second series to those axes, # then try adding the first series again @@ -1483,23 +1501,20 @@ def test_overlapping_datetime(self): s2.plot(ax=ax) s1.plot(ax=ax) - @pytest.mark.xfail(reason="GH9053 matplotlib does not use" - " ax.xaxis.converter") + @pytest.mark.xfail(reason="GH9053 matplotlib does not use" " ax.xaxis.converter") def test_add_matplotlib_datetime64(self): # GH9053 - ensure that a plot with PeriodConverter still understands # datetime64 data. This still fails because matplotlib overrides the # ax.xaxis.converter with a DatetimeConverter - s = Series(np.random.randn(10), - index=date_range('1970-01-02', periods=10)) + s = Series(np.random.randn(10), index=date_range("1970-01-02", periods=10)) ax = s.plot() - ax.plot(s.index, s.values, color='g') + ax.plot(s.index, s.values, color="g") l1, l2 = ax.lines tm.assert_numpy_array_equal(l1.get_xydata(), l2.get_xydata()) def test_matplotlib_scatter_datetime64(self): # https://github.com/matplotlib/matplotlib/issues/11391 - df = DataFrame(np.random.RandomState(0).rand(10, 2), - columns=["x", "y"]) + df = DataFrame(np.random.RandomState(0).rand(10, 2), columns=["x", "y"]) df["time"] = date_range("2018-01-01", periods=10, freq="D") fig, ax = self.plt.subplots() ax.scatter(x="time", y="y", data=df) @@ -1520,13 +1535,13 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): try: plt.clf() ax = fig.add_subplot(211) - orig_ax = kwargs.pop('ax', plt.gca()) - orig_axfreq = getattr(orig_ax, 'freq', None) + orig_ax = kwargs.pop("ax", plt.gca()) + orig_axfreq = getattr(orig_ax, "freq", None) ret = f(*args, **kwargs) assert ret is not None # do something more intelligent - ax = kwargs.pop('ax', plt.gca()) + ax = kwargs.pop("ax", plt.gca()) if series is not None: dfreq = series.index.freq if isinstance(dfreq, DateOffset): @@ -1539,7 +1554,7 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): ax = fig.add_subplot(212) try: - kwargs['ax'] = ax + kwargs["ax"] = ax ret = f(*args, **kwargs) assert ret is not None # do something more intelligent except Exception: @@ -1554,7 +1569,7 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): # TODO(statsmodels 0.10.0): Remove the statsmodels check # https://github.com/pandas-dev/pandas/issues/24088 # https://github.com/statsmodels/statsmodels/issues/4772 - if 'statsmodels' not in sys.modules: + if "statsmodels" not in sys.modules: with ensure_clean(return_filelike=True) as path: pickle.dump(fig, path) finally: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 272f01a12156b..0215b79cb993d 100644 --- 
a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -15,8 +15,7 @@ from pandas.core.dtypes.api import is_list_like import pandas as pd -from pandas import ( - DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range) +from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range from pandas.core.arrays import integer_array from pandas.tests.plotting.common import TestPlotBase, _check_plot_works import pandas.util.testing as tm @@ -27,17 +26,20 @@ @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.tdf = tm.makeTimeDataFrame() - self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), - "B": np.random.uniform(size=20), - "C": np.arange(20) + np.random.uniform( - size=20)}) + self.hexbin_df = DataFrame( + { + "A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform(size=20), + } + ) def _assert_ytickslabels_visibility(self, axes, expected): for ax, exp in zip(axes, expected): @@ -55,21 +57,18 @@ def test_plot(self): _check_plot_works(df.plot, grid=False) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True) + axes = _check_plot_works(df.plot, subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True, layout=(-1, 2)) + axes = _check_plot_works(df.plot, subplots=True, layout=(-1, 2)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, - subplots=True, use_index=False) + axes = _check_plot_works(df.plot, subplots=True, use_index=False) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) if _mpl_ge_3_1_0(): msg = "'Line2D' object has no property 'blarg'" else: @@ -77,8 +76,7 @@ def test_plot(self): with pytest.raises(AttributeError, match=msg): df.plot.line(blarg=True) - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) _check_plot_works(df.plot, use_index=True) _check_plot_works(df.plot, sort_columns=False) @@ -87,14 +85,14 @@ def test_plot(self): _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) with tm.assert_produces_warning(UserWarning): - _check_plot_works(df.plot, subplots=True, title='blah') + _check_plot_works(df.plot, subplots=True, title="blah") # We have to redo it here because _check_plot_works does two plots, # once without an ax kwarg and once with an ax kwarg and the new sharex # behaviour does not remove the visibility of the latter axis (as ax is # present). 
see: https://github.com/pandas-dev/pandas/issues/9737 - axes = df.plot(subplots=True, title='blah') + axes = df.plot(subplots=True, title="blah") self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) # axes[0].figure.savefig("test.png") for ax in axes[:2]: @@ -108,32 +106,35 @@ def test_plot(self): self._check_visible([ax.xaxis.get_label()]) self._check_ticks_props(ax, xrot=0) - _check_plot_works(df.plot, title='blah') + _check_plot_works(df.plot, title="blah") tuples = zip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) + df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) _check_plot_works(df.plot, use_index=True) # unicode - index = MultiIndex.from_tuples([('\u03b1', 0), - ('\u03b1', 1), - ('\u03b2', 2), - ('\u03b2', 3), - ('\u03b3', 4), - ('\u03b3', 5), - ('\u03b4', 6), - ('\u03b4', 7)], names=['i0', 'i1']) + index = MultiIndex.from_tuples( + [ + ("\u03b1", 0), + ("\u03b1", 1), + ("\u03b2", 2), + ("\u03b2", 3), + ("\u03b3", 4), + ("\u03b3", 5), + ("\u03b4", 6), + ("\u03b4", 7), + ], + names=["i0", "i1"], + ) columns = MultiIndex.from_tuples( - [('bar', '\u0394'), ('bar', '\u0395')], names=['c0', 'c1']) - df = DataFrame(np.random.randint(0, 10, (8, 2)), - columns=columns, - index=index) - _check_plot_works(df.plot, title='\u03A3') + [("bar", "\u0394"), ("bar", "\u0395")], names=["c0", "c1"] + ) + df = DataFrame(np.random.randint(0, 10, (8, 2)), columns=columns, index=index) + _check_plot_works(df.plot, title="\u03A3") # GH 6951 # Test with single column - df = DataFrame({'x': np.random.rand(10)}) + df = DataFrame({"x": np.random.rand(10)}) axes = _check_plot_works(df.plot.bar, subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @@ -157,82 +158,82 @@ def test_integer_array_plot(self): _check_plot_works(s.plot.hist) _check_plot_works(s.plot.pie) - df = DataFrame({'x': arr, 'y': arr}) + df = DataFrame({"x": arr, "y": arr}) _check_plot_works(df.plot.line) _check_plot_works(df.plot.bar) _check_plot_works(df.plot.hist) - _check_plot_works(df.plot.pie, y='y') - _check_plot_works(df.plot.scatter, x='x', y='y') - _check_plot_works(df.plot.hexbin, x='x', y='y') + _check_plot_works(df.plot.pie, y="y") + _check_plot_works(df.plot.scatter, x="x", y="y") + _check_plot_works(df.plot.hexbin, x="x", y="y") def test_mpl2_color_cycle_str(self): # GH 15516 - colors = ['C' + str(x) for x in range(10)] - df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) + colors = ["C" + str(x) for x in range(10)] + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) for c in colors: _check_plot_works(df.plot, color=c) def test_color_single_series_list(self): # GH 3486 df = DataFrame({"A": [1, 2, 3]}) - _check_plot_works(df.plot, color=['red']) + _check_plot_works(df.plot, color=["red"]) def test_rgb_tuple_color(self): # GH 16695 - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0)) - _check_plot_works(df.plot, x='x', y='y', color=(1, 0, 0, 0.5)) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0)) + _check_plot_works(df.plot, x="x", y="y", color=(1, 0, 0, 0.5)) def test_color_empty_string(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): - df.plot(color='') + df.plot(color="") def test_color_and_style_arguments(self): - df = DataFrame({'x': [1, 2], 'y': [3, 4]}) + df = DataFrame({"x": [1, 2], "y": [3, 4]}) # passing both 'color' and 'style' arguments should be allowed # if there is no 
color symbol in the style strings: - ax = df.plot(color=['red', 'black'], style=['-', '--']) + ax = df.plot(color=["red", "black"], style=["-", "--"]) # check that the linestyles are correctly set: linestyle = [line.get_linestyle() for line in ax.lines] - assert linestyle == ['-', '--'] + assert linestyle == ["-", "--"] # check that the colors are correctly set: color = [line.get_color() for line in ax.lines] - assert color == ['red', 'black'] + assert color == ["red", "black"] # passing both 'color' and 'style' arguments should not be allowed # if there is a color symbol in the style strings: with pytest.raises(ValueError): - df.plot(color=['red', 'black'], style=['k-', 'r--']) + df.plot(color=["red", "black"], style=["k-", "r--"]) def test_nonnumeric_exclude(self): - df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}) + df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) ax = df.plot() assert len(ax.get_lines()) == 1 # B was plotted @pytest.mark.slow def test_implicit_label(self): - df = DataFrame(randn(10, 3), columns=['a', 'b', 'c']) - ax = df.plot(x='a', y='b') - self._check_text_labels(ax.xaxis.get_label(), 'a') + df = DataFrame(randn(10, 3), columns=["a", "b", "c"]) + ax = df.plot(x="a", y="b") + self._check_text_labels(ax.xaxis.get_label(), "a") @pytest.mark.slow def test_donot_overwrite_index_name(self): # GH 8494 - df = DataFrame(randn(2, 2), columns=['a', 'b']) - df.index.name = 'NAME' - df.plot(y='b', label='LABEL') - assert df.index.name == 'NAME' + df = DataFrame(randn(2, 2), columns=["a", "b"]) + df.index.name = "NAME" + df.plot(y="b", label="LABEL") + assert df.index.name == "NAME" @pytest.mark.slow def test_plot_xy(self): # columns.inferred_type == 'string' df = self.tdf - self._check_data(df.plot(x=0, y=1), df.set_index('A')['B'].plot()) - self._check_data(df.plot(x=0), df.set_index('A').plot()) + self._check_data(df.plot(x=0, y=1), df.set_index("A")["B"].plot()) + self._check_data(df.plot(x=0), df.set_index("A").plot()) self._check_data(df.plot(y=0), df.B.plot()) - self._check_data(df.plot(x='A', y='B'), df.set_index('A').B.plot()) - self._check_data(df.plot(x='A'), df.set_index('A').plot()) - self._check_data(df.plot(y='B'), df.B.plot()) + self._check_data(df.plot(x="A", y="B"), df.set_index("A").B.plot()) + self._check_data(df.plot(x="A"), df.set_index("A").plot()) + self._check_data(df.plot(y="B"), df.B.plot()) # columns.inferred_type == 'integer' df.columns = np.arange(1, len(df.columns) + 1) @@ -241,21 +242,19 @@ def test_plot_xy(self): self._check_data(df.plot(y=1), df[1].plot()) # figsize and title - ax = df.plot(x=1, y=2, title='Test', figsize=(16, 8)) - self._check_text_labels(ax.title, 'Test') - self._check_axes_shape(ax, axes_num=1, layout=(1, 1), - figsize=(16., 8.)) + ax = df.plot(x=1, y=2, title="Test", figsize=(16, 8)) + self._check_text_labels(ax.title, "Test") + self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16.0, 8.0)) # columns.inferred_type == 'mixed' # TODO add MultiIndex test @pytest.mark.slow - @pytest.mark.parametrize("input_log, expected_log", [ - (True, 'log'), - ('sym', 'symlog') - ]) + @pytest.mark.parametrize( + "input_log, expected_log", [(True, "log"), ("sym", "symlog")] + ) def test_logscales(self, input_log, expected_log): - df = DataFrame({'a': np.arange(100)}, index=np.arange(100)) + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) ax = df.plot(logy=input_log) self._check_ax_scales(ax, yaxis=expected_log) @@ -273,7 +272,7 @@ def test_logscales(self, input_log, expected_log): 
@pytest.mark.parametrize("input_param", ["logx", "logy", "loglog"]) def test_invalid_logscale(self, input_param): # GH: 24867 - df = DataFrame({'a': np.arange(100)}, index=np.arange(100)) + df = DataFrame({"a": np.arange(100)}, index=np.arange(100)) msg = "Boolean, None and 'sym' are valid options, 'sm' is given." with pytest.raises(ValueError, match=msg): @@ -289,13 +288,13 @@ def test_xcompat(self): assert not isinstance(lines[0].get_xdata(), PeriodIndex) tm.close() - pd.plotting.plot_params['xaxis.compat'] = True + pd.plotting.plot_params["xaxis.compat"] = True ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) tm.close() - pd.plotting.plot_params['x_compat'] = False + pd.plotting.plot_params["x_compat"] = False ax = df.plot() lines = ax.get_lines() @@ -304,7 +303,7 @@ def test_xcompat(self): tm.close() # useful if you're plotting a bunch together - with pd.plotting.plot_params.use('x_compat', True): + with pd.plotting.plot_params.use("x_compat", True): ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) @@ -321,19 +320,21 @@ def test_period_compat(self): df = DataFrame( np.random.rand(21, 2), index=bdate_range(datetime(2000, 1, 1), datetime(2000, 1, 31)), - columns=['a', 'b']) + columns=["a", "b"], + ) df.plot() self.plt.axhline(y=0) tm.close() def test_unsorted_index(self): - df = DataFrame({'y': np.arange(100)}, index=np.arange(99, -1, -1), - dtype=np.int64) + df = DataFrame( + {"y": np.arange(100)}, index=np.arange(99, -1, -1), dtype=np.int64 + ) ax = df.plot() lines = ax.get_lines()[0] rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y') + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") tm.assert_series_equal(rs, df.y, check_index_type=False) tm.close() @@ -341,27 +342,29 @@ def test_unsorted_index(self): ax = df.plot() lines = ax.get_lines()[0] rs = lines.get_xydata() - rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name='y') + rs = Series(rs[:, 1], rs[:, 0], dtype=np.int64, name="y") tm.assert_series_equal(rs, df.y) def test_unsorted_index_lims(self): - df = DataFrame({'y': [0., 1., 2., 3.]}, index=[1., 0., 3., 2.]) + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0]}, index=[1.0, 0.0, 3.0, 2.0]) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) assert xmax >= np.nanmax(lines[0].get_data()[0]) - df = DataFrame({'y': [0., 1., np.nan, 3., 4., 5., 6.]}, - index=[1., 0., 3., 2., np.nan, 3., 2.]) + df = DataFrame( + {"y": [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0]}, + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) assert xmax >= np.nanmax(lines[0].get_data()[0]) - df = DataFrame({'y': [0., 1., 2., 3.], 'z': [91., 90., 93., 92.]}) - ax = df.plot(x='z', y='y') + df = DataFrame({"y": [0.0, 1.0, 2.0, 3.0], "z": [91.0, 90.0, 93.0, 92.0]}) + ax = df.plot(x="z", y="y") xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= np.nanmin(lines[0].get_data()[0]) @@ -369,25 +372,22 @@ def test_unsorted_index_lims(self): @pytest.mark.slow def test_subplots(self): - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - for kind in ['bar', 'barh', 'line', 'area']: + for kind in ["bar", "barh", "line", "area"]: axes = df.plot(kind=kind, subplots=True, sharex=True, legend=True) 
self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) - assert axes.shape == (3, ) + assert axes.shape == (3,) for ax, column in zip(axes, df.columns): - self._check_legend_labels(ax, - labels=[pprint_thing(column)]) + self._check_legend_labels(ax, labels=[pprint_thing(column)]) for ax in axes[:-2]: self._check_visible(ax.xaxis) # xaxis must be visible for grid self._check_visible(ax.get_xticklabels(), visible=False) - if not (kind == 'bar' and self.mpl_ge_3_1_0): + if not (kind == "bar" and self.mpl_ge_3_1_0): # change https://github.com/pandas-dev/pandas/issues/26714 - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) self._check_visible(ax.xaxis.get_label(), visible=False) self._check_visible(ax.get_yticklabels()) @@ -414,23 +414,27 @@ def test_groupby_boxplot_sharey(self): # sharey can now be switched check whether the right # pair of axes is turned on or off - df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], - 'b': [0.56, 0.84, 0.29, 0.56, 0.85], - 'c': [0, 1, 2, 3, 1]}, - index=[0, 1, 2, 3, 4]) + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) # behavior without keyword - axes = df.groupby('c').boxplot() + axes = df.groupby("c").boxplot() expected = [True, False, True, False] self._assert_ytickslabels_visibility(axes, expected) # set sharey=True should be identical - axes = df.groupby('c').boxplot(sharey=True) + axes = df.groupby("c").boxplot(sharey=True) expected = [True, False, True, False] self._assert_ytickslabels_visibility(axes, expected) # sharey=False, all yticklabels should be visible - axes = df.groupby('c').boxplot(sharey=False) + axes = df.groupby("c").boxplot(sharey=False) expected = [True, True, True, True] self._assert_ytickslabels_visibility(axes, expected) @@ -439,33 +443,37 @@ def test_groupby_boxplot_sharex(self): # sharex can now be switched check whether the right # pair of axes is turned on or off - df = DataFrame({'a': [-1.43, -0.15, -3.70, -1.43, -0.14], - 'b': [0.56, 0.84, 0.29, 0.56, 0.85], - 'c': [0, 1, 2, 3, 1]}, - index=[0, 1, 2, 3, 4]) + df = DataFrame( + { + "a": [-1.43, -0.15, -3.70, -1.43, -0.14], + "b": [0.56, 0.84, 0.29, 0.56, 0.85], + "c": [0, 1, 2, 3, 1], + }, + index=[0, 1, 2, 3, 4], + ) # behavior without keyword - axes = df.groupby('c').boxplot() + axes = df.groupby("c").boxplot() expected = [True, True, True, True] self._assert_xtickslabels_visibility(axes, expected) # set sharex=False should be identical - axes = df.groupby('c').boxplot(sharex=False) + axes = df.groupby("c").boxplot(sharex=False) expected = [True, True, True, True] self._assert_xtickslabels_visibility(axes, expected) # sharex=True, yticklabels should be visible # only for bottom plots - axes = df.groupby('c').boxplot(sharex=True) + axes = df.groupby("c").boxplot(sharex=True) expected = [False, False, True, True] self._assert_xtickslabels_visibility(axes, expected) @pytest.mark.slow def test_subplots_timeseries(self): - idx = date_range(start='2014-07-01', freq='M', periods=10) + idx = date_range(start="2014-07-01", freq="M", periods=10) df = DataFrame(np.random.rand(10, 3), index=idx) - for kind in ['line', 'area']: + for kind in ["line", "area"]: axes = df.plot(kind=kind, subplots=True, sharex=True) self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) @@ -473,8 +481,7 @@ def test_subplots_timeseries(self): # GH 7801 self._check_visible(ax.xaxis) # xaxis must be 
visible for grid self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) self._check_visible(ax.xaxis.get_label(), visible=False) self._check_visible(ax.get_yticklabels()) @@ -485,54 +492,64 @@ def test_subplots_timeseries(self): self._check_visible(axes[-1].get_yticklabels()) self._check_ticks_props(axes, xrot=0) - axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, - fontsize=7) + axes = df.plot(kind=kind, subplots=True, sharex=False, rot=45, fontsize=7) for ax in axes: self._check_visible(ax.xaxis) self._check_visible(ax.get_xticklabels()) self._check_visible(ax.get_xticklabels(minor=True)) self._check_visible(ax.xaxis.get_label()) self._check_visible(ax.get_yticklabels()) - self._check_ticks_props(ax, xlabelsize=7, xrot=45, - ylabelsize=7) + self._check_ticks_props(ax, xlabelsize=7, xrot=45, ylabelsize=7) def test_subplots_timeseries_y_axis(self): # GH16953 - data = {"numeric": np.array([1, 2, 5]), - "timedelta": [pd.Timedelta(-10, unit="s"), - pd.Timedelta(10, unit="m"), - pd.Timedelta(10, unit="h")], - "datetime_no_tz": [pd.to_datetime("2017-08-01 00:00:00"), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00")], - "datetime_all_tz": [pd.to_datetime("2017-08-01 00:00:00", - utc=True), - pd.to_datetime("2017-08-01 02:00:00", - utc=True), - pd.to_datetime("2017-08-02 00:00:00", - utc=True)], - "text": ["This", "should", "fail"]} + data = { + "numeric": np.array([1, 2, 5]), + "timedelta": [ + pd.Timedelta(-10, unit="s"), + pd.Timedelta(10, unit="m"), + pd.Timedelta(10, unit="h"), + ], + "datetime_no_tz": [ + pd.to_datetime("2017-08-01 00:00:00"), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + "datetime_all_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00", utc=True), + pd.to_datetime("2017-08-02 00:00:00", utc=True), + ], + "text": ["This", "should", "fail"], + } testdata = DataFrame(data) ax_numeric = testdata.plot(y="numeric") - assert (ax_numeric.get_lines()[0].get_data()[1] == - testdata["numeric"].values).all() + assert ( + ax_numeric.get_lines()[0].get_data()[1] == testdata["numeric"].values + ).all() ax_timedelta = testdata.plot(y="timedelta") - assert (ax_timedelta.get_lines()[0].get_data()[1] == - testdata["timedelta"].values).all() + assert ( + ax_timedelta.get_lines()[0].get_data()[1] == testdata["timedelta"].values + ).all() ax_datetime_no_tz = testdata.plot(y="datetime_no_tz") - assert (ax_datetime_no_tz.get_lines()[0].get_data()[1] == - testdata["datetime_no_tz"].values).all() + assert ( + ax_datetime_no_tz.get_lines()[0].get_data()[1] + == testdata["datetime_no_tz"].values + ).all() ax_datetime_all_tz = testdata.plot(y="datetime_all_tz") - assert (ax_datetime_all_tz.get_lines()[0].get_data()[1] == - testdata["datetime_all_tz"].values).all() + assert ( + ax_datetime_all_tz.get_lines()[0].get_data()[1] + == testdata["datetime_all_tz"].values + ).all() msg = "no numeric data to plot" with pytest.raises(TypeError, match=msg): testdata.plot(y="text") - @pytest.mark.xfail(reason='not support for period, categorical, ' - 'datetime_mixed_tz') + @pytest.mark.xfail( + reason="not support for period, categorical, " "datetime_mixed_tz" + ) def test_subplots_timeseries_y_axis_not_supported(self): """ This test will fail for: @@ -551,34 +568,42 @@ def test_subplots_timeseries_y_axis_not_supported(self): generally 
converting ``datetime`` objects in a tz-aware form could help with this problem """ - data = {"numeric": np.array([1, 2, 5]), - "period": [pd.Period('2017-08-01 00:00:00', freq='H'), - pd.Period('2017-08-01 02:00', freq='H'), - pd.Period('2017-08-02 00:00:00', freq='H')], - "categorical": pd.Categorical(["c", "b", "a"], - categories=["a", "b", "c"], - ordered=False), - "datetime_mixed_tz": [pd.to_datetime("2017-08-01 00:00:00", - utc=True), - pd.to_datetime("2017-08-01 02:00:00"), - pd.to_datetime("2017-08-02 00:00:00")]} + data = { + "numeric": np.array([1, 2, 5]), + "period": [ + pd.Period("2017-08-01 00:00:00", freq="H"), + pd.Period("2017-08-01 02:00", freq="H"), + pd.Period("2017-08-02 00:00:00", freq="H"), + ], + "categorical": pd.Categorical( + ["c", "b", "a"], categories=["a", "b", "c"], ordered=False + ), + "datetime_mixed_tz": [ + pd.to_datetime("2017-08-01 00:00:00", utc=True), + pd.to_datetime("2017-08-01 02:00:00"), + pd.to_datetime("2017-08-02 00:00:00"), + ], + } testdata = pd.DataFrame(data) ax_period = testdata.plot(x="numeric", y="period") - assert (ax_period.get_lines()[0].get_data()[1] == - testdata["period"].values).all() + assert ( + ax_period.get_lines()[0].get_data()[1] == testdata["period"].values + ).all() ax_categorical = testdata.plot(x="numeric", y="categorical") - assert (ax_categorical.get_lines()[0].get_data()[1] == - testdata["categorical"].values).all() - ax_datetime_mixed_tz = testdata.plot(x="numeric", - y="datetime_mixed_tz") - assert (ax_datetime_mixed_tz.get_lines()[0].get_data()[1] == - testdata["datetime_mixed_tz"].values).all() + assert ( + ax_categorical.get_lines()[0].get_data()[1] + == testdata["categorical"].values + ).all() + ax_datetime_mixed_tz = testdata.plot(x="numeric", y="datetime_mixed_tz") + assert ( + ax_datetime_mixed_tz.get_lines()[0].get_data()[1] + == testdata["datetime_mixed_tz"].values + ).all() @pytest.mark.slow def test_subplots_layout(self): # GH 6667 - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) @@ -610,11 +635,10 @@ def test_subplots_layout(self): df.plot(subplots=True, layout=(-1, -1)) # single column - df = DataFrame(np.random.rand(10, 1), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1, ) + assert axes.shape == (1,) axes = df.plot(subplots=True, layout=(3, 3)) self._check_axes_shape(axes, axes_num=1, layout=(3, 3)) @@ -627,27 +651,25 @@ def test_subplots_warnings(self): df = DataFrame(np.random.randn(100, 4)) df.plot(subplots=True, layout=(3, 2)) - df = DataFrame(np.random.randn(100, 4), - index=date_range('1/1/2000', periods=100)) + df = DataFrame( + np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) + ) df.plot(subplots=True, layout=(3, 2)) @pytest.mark.slow def test_subplots_multiple_axes(self): # GH 5353, 6970, GH 7069 fig, axes = self.plt.subplots(2, 3) - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - returned = df.plot(subplots=True, ax=axes[0], sharex=False, - sharey=False) + returned = df.plot(subplots=True, ax=axes[0], sharex=False, sharey=False) self._check_axes_shape(returned, axes_num=3, 
layout=(1, 3)) - assert returned.shape == (3, ) + assert returned.shape == (3,) assert returned[0].figure is fig # draw on second row - returned = df.plot(subplots=True, ax=axes[1], sharex=False, - sharey=False) + returned = df.plot(subplots=True, ax=axes[1], sharex=False, sharey=False) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - assert returned.shape == (3, ) + assert returned.shape == (3,) assert returned[0].figure is fig self._check_axes_shape(axes, axes_num=6, layout=(2, 3)) tm.close() @@ -664,32 +686,33 @@ def test_subplots_multiple_axes(self): fig, axes = self.plt.subplots(2, 2) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) - df = DataFrame(np.random.rand(10, 4), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 4), index=list(string.ascii_letters[:10])) - returned = df.plot(subplots=True, ax=axes, layout=(2, 1), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(2, 1), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) - returned = df.plot(subplots=True, ax=axes, layout=(2, -1), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(2, -1), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) - returned = df.plot(subplots=True, ax=axes, layout=(-1, 2), - sharex=False, sharey=False) + returned = df.plot( + subplots=True, ax=axes, layout=(-1, 2), sharex=False, sharey=False + ) self._check_axes_shape(returned, axes_num=4, layout=(2, 2)) - assert returned.shape == (4, ) + assert returned.shape == (4,) # single column fig, axes = self.plt.subplots(1, 1) - df = DataFrame(np.random.rand(10, 1), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 1), index=list(string.ascii_letters[:10])) axes = df.plot(subplots=True, ax=[axes], sharex=False, sharey=False) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - assert axes.shape == (1, ) + assert axes.shape == (1,) def test_subplots_ts_share_axes(self): # GH 3964 @@ -697,7 +720,8 @@ def test_subplots_ts_share_axes(self): self.plt.subplots_adjust(left=0.05, right=0.95, hspace=0.3, wspace=0.3) df = DataFrame( np.random.randn(10, 9), - index=date_range(start='2014-07-01', freq='M', periods=10)) + index=date_range(start="2014-07-01", freq="M", periods=10), + ) for i, ax in enumerate(axes.ravel()): df[i].plot(ax=ax, fontsize=5) @@ -721,11 +745,11 @@ def test_subplots_ts_share_axes(self): def test_subplots_sharex_axes_existing_axes(self): # GH 9158 - d = {'A': [1., 2., 3., 4.], 'B': [4., 3., 2., 1.], 'C': [5, 1, 3, 4]} - df = DataFrame(d, index=date_range('2014 10 11', '2014 10 14')) + d = {"A": [1.0, 2.0, 3.0, 4.0], "B": [4.0, 3.0, 2.0, 1.0], "C": [5, 1, 3, 4]} + df = DataFrame(d, index=date_range("2014 10 11", "2014 10 14")) - axes = df[['A', 'B']].plot(subplots=True) - df['C'].plot(ax=axes[0], secondary_y=True) + axes = df[["A", "B"]].plot(subplots=True) + df["C"].plot(ax=axes[0], secondary_y=True) self._check_visible(axes[0].get_xticklabels(), visible=False) self._check_visible(axes[1].get_xticklabels(), visible=True) @@ -735,29 +759,31 @@ def test_subplots_sharex_axes_existing_axes(self): @pytest.mark.slow def test_subplots_dup_columns(self): # GH 10962 - df = DataFrame(np.random.rand(5, 5), columns=list('aaaaa')) + df = DataFrame(np.random.rand(5, 5), 
columns=list("aaaaa")) axes = df.plot(subplots=True) for ax in axes: - self._check_legend_labels(ax, labels=['a']) + self._check_legend_labels(ax, labels=["a"]) assert len(ax.lines) == 1 tm.close() - axes = df.plot(subplots=True, secondary_y='a') + axes = df.plot(subplots=True, secondary_y="a") for ax in axes: # (right) is only attached when subplots=False - self._check_legend_labels(ax, labels=['a']) + self._check_legend_labels(ax, labels=["a"]) assert len(ax.lines) == 1 tm.close() - ax = df.plot(secondary_y='a') - self._check_legend_labels(ax, labels=['a (right)'] * 5) + ax = df.plot(secondary_y="a") + self._check_legend_labels(ax, labels=["a (right)"] * 5) assert len(ax.lines) == 0 assert len(ax.right_ax.lines) == 5 def test_negative_log(self): - df = - DataFrame(rand(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = -DataFrame( + rand(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) with pytest.raises(ValueError): df.plot.area(logy=True) @@ -773,19 +799,20 @@ def _compare_stacked_y_cood(self, normal_lines, stacked_lines): def test_line_area_stacked(self): with tm.RNGContext(42): - df = DataFrame(rand(6, 4), columns=['w', 'x', 'y', 'z']) + df = DataFrame(rand(6, 4), columns=["w", "x", "y", "z"]) neg_df = -df # each column has either positive or negative value - sep_df = DataFrame({'w': rand(6), - 'x': rand(6), - 'y': -rand(6), - 'z': -rand(6)}) + sep_df = DataFrame( + {"w": rand(6), "x": rand(6), "y": -rand(6), "z": -rand(6)} + ) # each column has positive-negative mixed value - mixed_df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['w', 'x', 'y', 'z']) + mixed_df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["w", "x", "y", "z"], + ) - for kind in ['line', 'area']: + for kind in ["line", "area"]: ax1 = _check_plot_works(df.plot, kind=kind, stacked=False) ax2 = _check_plot_works(df.plot, kind=kind, stacked=True) self._compare_stacked_y_cood(ax1.lines, ax2.lines) @@ -811,9 +838,8 @@ def test_line_area_stacked(self): def test_line_area_nan_df(self): values1 = [1, 2, np.nan, 3] values2 = [3, np.nan, 2, 1] - df = DataFrame({'a': values1, 'b': values2}) - tdf = DataFrame({'a': values1, - 'b': values2}, index=tm.makeDateIndex(k=4)) + df = DataFrame({"a": values1, "b": values2}) + tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4)) for d in [df, tdf]: ax = _check_plot_works(d.plot) @@ -827,29 +853,29 @@ def test_line_area_nan_df(self): exp = np.array([3, 2, 1], dtype=np.float64) tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) tm.assert_numpy_array_equal( - masked1.mask, np.array([False, False, True, False])) + masked1.mask, np.array([False, False, True, False]) + ) tm.assert_numpy_array_equal( - masked2.mask, np.array([False, True, False, False])) + masked2.mask, np.array([False, True, False, False]) + ) expected1 = np.array([1, 2, 0, 3], dtype=np.float64) expected2 = np.array([3, 0, 2, 1], dtype=np.float64) ax = _check_plot_works(d.plot, stacked=True) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), - expected1 + expected2) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) ax = _check_plot_works(d.plot.area) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), - expected1 + expected2) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) 
ax = _check_plot_works(d.plot.area, stacked=False) tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) def test_line_lim(self): - df = DataFrame(rand(6, 3), columns=['x', 'y', 'z']) + df = DataFrame(rand(6, 3), columns=["x", "y", "z"]) ax = df.plot() xmin, xmax = ax.get_xlim() lines = ax.get_lines() @@ -865,15 +891,15 @@ def test_line_lim(self): axes = df.plot(secondary_y=True, subplots=True) self._check_axes_shape(axes, axes_num=3, layout=(3, 1)) for ax in axes: - assert hasattr(ax, 'left_ax') - assert not hasattr(ax, 'right_ax') + assert hasattr(ax, "left_ax") + assert not hasattr(ax, "right_ax") xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin <= lines[0].get_data()[0][0] assert xmax >= lines[0].get_data()[0][-1] def test_area_lim(self): - df = DataFrame(rand(6, 4), columns=['x', 'y', 'z', 'four']) + df = DataFrame(rand(6, 4), columns=["x", "y", "z", "four"]) neg_df = -df for stacked in [True, False]: @@ -892,6 +918,7 @@ def test_area_lim(self): @pytest.mark.slow def test_bar_colors(self): import matplotlib.pyplot as plt + default_colors = self._unpack_cycler(plt.rcParams) df = DataFrame(randn(5, 5)) @@ -899,14 +926,15 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=default_colors[:5]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" ax = df.plot.bar(color=custom_colors) self._check_colors(ax.patches[::5], facecolors=custom_colors) tm.close() from matplotlib import cm + # Test str -> colormap functionality - ax = df.plot.bar(colormap='jet') + ax = df.plot.bar(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] self._check_colors(ax.patches[::5], facecolors=rgba_colors) tm.close() @@ -917,26 +945,28 @@ def test_bar_colors(self): self._check_colors(ax.patches[::5], facecolors=rgba_colors) tm.close() - ax = df.loc[:, [0]].plot.bar(color='DodgerBlue') - self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot.bar(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) tm.close() - ax = df.plot(kind='bar', color='green') - self._check_colors(ax.patches[::5], facecolors=['green'] * 5) + ax = df.plot(kind="bar", color="green") + self._check_colors(ax.patches[::5], facecolors=["green"] * 5) tm.close() def test_bar_user_colors(self): - df = pd.DataFrame({"A": range(4), - "B": range(1, 5), - "color": ['red', 'blue', 'blue', 'red']}) + df = pd.DataFrame( + {"A": range(4), "B": range(1, 5), "color": ["red", "blue", "blue", "red"]} + ) # This should *only* work when `y` is specified, else # we use one color per column - ax = df.plot.bar(y='A', color=df['color']) + ax = df.plot.bar(y="A", color=df["color"]) result = [p.get_facecolor() for p in ax.patches] - expected = [(1., 0., 0., 1.), - (0., 0., 1., 1.), - (0., 0., 1., 1.), - (1., 0., 0., 1.)] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] assert result == expected @pytest.mark.slow @@ -1001,25 +1031,29 @@ def test_bar_barwidth(self): @pytest.mark.slow def test_bar_barwidth_position(self): df = DataFrame(randn(5, 5)) - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9, - position=0.2) - 
self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9, - position=0.2) - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, - position=0.2) + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, position=0.2) + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, position=0.2 + ) + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, position=0.2 + ) @pytest.mark.slow def test_bar_barwidth_position_int(self): # GH 12979 df = DataFrame(randn(5, 5)) - for w in [1, 1.]: + for w in [1, 1.0]: ax = df.plot.bar(stacked=True, width=w) ticks = ax.xaxis.get_ticklocs() tm.assert_numpy_array_equal(ticks, np.array([0, 1, 2, 3, 4])) @@ -1028,11 +1062,11 @@ def test_bar_barwidth_position_int(self): assert ax.patches[0].get_x() == -0.5 assert ax.patches[-1].get_x() == 3.5 - self._check_bar_alignment(df, kind='bar', stacked=True, width=1) - self._check_bar_alignment(df, kind='barh', stacked=False, width=1) - self._check_bar_alignment(df, kind='barh', stacked=True, width=1) - self._check_bar_alignment(df, kind='bar', subplots=True, width=1) - self._check_bar_alignment(df, kind='barh', subplots=True, width=1) + self._check_bar_alignment(df, kind="bar", stacked=True, width=1) + self._check_bar_alignment(df, kind="barh", stacked=False, width=1) + self._check_bar_alignment(df, kind="barh", stacked=True, width=1) + self._check_bar_alignment(df, kind="bar", subplots=True, width=1) + self._check_bar_alignment(df, kind="barh", subplots=True, width=1) @pytest.mark.slow def test_bar_bottom_left(self): @@ -1065,9 +1099,7 @@ def test_bar_bottom_left(self): @pytest.mark.slow def test_bar_nan(self): - df = DataFrame({'A': [10, np.nan, 20], - 'B': [5, 10, 20], - 'C': [1, 2, 3]}) + df = DataFrame({"A": [10, np.nan, 20], "B": [5, 10, 20], "C": [1, 2, 3]}) ax = df.plot.bar() expected = [10, 0, 20, 5, 10, 20, 1, 2, 3] result = [p.get_height() for p in ax.patches] @@ -1084,13 +1116,17 @@ def test_bar_nan(self): @pytest.mark.slow def test_bar_categorical(self): # GH 13019 - df1 = pd.DataFrame(np.random.randn(6, 5), - index=pd.Index(list('ABCDEF')), - columns=pd.Index(list('abcde'))) + df1 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.Index(list("ABCDEF")), + columns=pd.Index(list("abcde")), + ) # categorical index must behave the same - df2 = pd.DataFrame(np.random.randn(6, 5), - index=pd.CategoricalIndex(list('ABCDEF')), - columns=pd.CategoricalIndex(list('abcde'))) + df2 = pd.DataFrame( + np.random.randn(6, 5), + index=pd.CategoricalIndex(list("ABCDEF")), + columns=pd.CategoricalIndex(list("abcde")), + ) for df in [df1, df2]: ax = df.plot.bar() @@ -1109,20 +1145,22 @@ def test_bar_categorical(self): @pytest.mark.slow def test_plot_scatter(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) - _check_plot_works(df.plot.scatter, x='x', y='y') + _check_plot_works(df.plot.scatter, x="x", y="y") _check_plot_works(df.plot.scatter, x=1, y=2) with pytest.raises(TypeError): - df.plot.scatter(x='x') + df.plot.scatter(x="x") with pytest.raises(TypeError): - df.plot.scatter(y='y') + df.plot.scatter(y="y") # GH 6951 - 
axes = df.plot(x='x', y='y', kind='scatter', subplots=True) + axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) @pytest.mark.slow @@ -1131,26 +1169,22 @@ def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # interfere with x-axis label and ticklabels with # ipython inline backend. random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) - ax1 = df.plot.scatter(x='A label', y='B label') - ax2 = df.plot.scatter(x='A label', y='B label', c='C label') + ax1 = df.plot.scatter(x="A label", y="B label") + ax2 = df.plot.scatter(x="A label", y="B label", c="C label") - vis1 = [vis.get_visible() for vis in - ax1.xaxis.get_minorticklabels()] - vis2 = [vis.get_visible() for vis in - ax2.xaxis.get_minorticklabels()] + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_minorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_minorticklabels()] assert vis1 == vis2 - vis1 = [vis.get_visible() for vis in - ax1.xaxis.get_majorticklabels()] - vis2 = [vis.get_visible() for vis in - ax2.xaxis.get_majorticklabels()] + vis1 = [vis.get_visible() for vis in ax1.xaxis.get_majorticklabels()] + vis2 = [vis.get_visible() for vis in ax2.xaxis.get_majorticklabels()] assert vis1 == vis2 - assert (ax1.xaxis.get_label().get_visible() == - ax2.xaxis.get_label().get_visible()) + assert ( + ax1.xaxis.get_label().get_visible() == ax2.xaxis.get_label().get_visible() + ) @pytest.mark.slow def test_if_hexbin_xaxis_label_is_visible(self): @@ -1158,82 +1192,79 @@ def test_if_hexbin_xaxis_label_is_visible(self): # interfere with x-axis label and ticklabels with # ipython inline backend. 
random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) - - ax = df.plot.hexbin('A label', 'B label', gridsize=12) - assert all(vis.get_visible() for vis in - ax.xaxis.get_minorticklabels()) - assert all(vis.get_visible() for vis in - ax.xaxis.get_majorticklabels()) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) + + ax = df.plot.hexbin("A label", "B label", gridsize=12) + assert all(vis.get_visible() for vis in ax.xaxis.get_minorticklabels()) + assert all(vis.get_visible() for vis in ax.xaxis.get_majorticklabels()) assert ax.xaxis.get_label().get_visible() @pytest.mark.slow def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): import matplotlib.pyplot as plt + random_array = np.random.random((1000, 3)) - df = pd.DataFrame(random_array, - columns=['A label', 'B label', 'C label']) + df = pd.DataFrame(random_array, columns=["A label", "B label", "C label"]) fig, axes = plt.subplots(1, 2) - df.plot.scatter('A label', 'B label', c='C label', ax=axes[0]) - df.plot.scatter('A label', 'B label', c='C label', ax=axes[1]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[0]) + df.plot.scatter("A label", "B label", c="C label", ax=axes[1]) plt.tight_layout() - points = np.array([ax.get_position().get_points() - for ax in fig.axes]) + points = np.array([ax.get_position().get_points() for ax in fig.axes]) axes_x_coords = points[:, :, 0] parent_distance = axes_x_coords[1, :] - axes_x_coords[0, :] colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] - assert np.isclose(parent_distance, - colorbar_distance, atol=1e-7).all() + assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() @pytest.mark.slow def test_plot_scatter_with_categorical_data(self): # GH 16199 - df = pd.DataFrame({'x': [1, 2, 3, 4], - 'y': pd.Categorical(['a', 'b', 'a', 'c'])}) + df = pd.DataFrame( + {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} + ) with pytest.raises(ValueError) as ve: - df.plot(x='x', y='y', kind='scatter') - ve.match('requires y column to be numeric') + df.plot(x="x", y="y", kind="scatter") + ve.match("requires y column to be numeric") with pytest.raises(ValueError) as ve: - df.plot(x='y', y='x', kind='scatter') - ve.match('requires x column to be numeric') + df.plot(x="y", y="x", kind="scatter") + ve.match("requires x column to be numeric") with pytest.raises(ValueError) as ve: - df.plot(x='y', y='y', kind='scatter') - ve.match('requires x column to be numeric') + df.plot(x="y", y="y", kind="scatter") + ve.match("requires x column to be numeric") @pytest.mark.slow def test_plot_scatter_with_c(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['x', 'y', 'z', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["x", "y", "z", "four"], + ) - axes = [df.plot.scatter(x='x', y='y', c='z'), - df.plot.scatter(x=0, y=1, c=2)] + axes = [df.plot.scatter(x="x", y="y", c="z"), df.plot.scatter(x=0, y=1, c=2)] for ax in axes: # default to Greys - assert ax.collections[0].cmap.name == 'Greys' + assert ax.collections[0].cmap.name == "Greys" # n.b. 
there appears to be no public method # to get the colorbar label - assert ax.collections[0].colorbar._label == 'z' + assert ax.collections[0].colorbar._label == "z" - cm = 'cubehelix' - ax = df.plot.scatter(x='x', y='y', c='z', colormap=cm) + cm = "cubehelix" + ax = df.plot.scatter(x="x", y="y", c="z", colormap=cm) assert ax.collections[0].cmap.name == cm # verify turning off colorbar works - ax = df.plot.scatter(x='x', y='y', c='z', colorbar=False) + ax = df.plot.scatter(x="x", y="y", c="z", colorbar=False) assert ax.collections[0].colorbar is None # verify that we can still plot a solid color - ax = df.plot.scatter(x=0, y=1, c='red') + ax = df.plot.scatter(x=0, y=1, c="red") assert ax.collections[0].colorbar is None - self._check_colors(ax.collections, facecolors=['r']) + self._check_colors(ax.collections, facecolors=["r"]) # Ensure that we can pass an np.array straight through to matplotlib, # this functionality was accidentally removed previously. @@ -1241,44 +1272,48 @@ def test_plot_scatter_with_c(self): # # Exercise colormap path and non-colormap path as they are independent # - df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) red_rgba = [1.0, 0.0, 0.0, 1.0] green_rgba = [0.0, 1.0, 0.0, 1.0] rgba_array = np.array([red_rgba, green_rgba]) - ax = df.plot.scatter(x='A', y='B', c=rgba_array) + ax = df.plot.scatter(x="A", y="B", c=rgba_array) # expect the face colors of the points in the non-colormap path to be # identical to the values we supplied, normally we'd be on shaky ground # comparing floats for equality but here we expect them to be # identical. - tm.assert_numpy_array_equal(ax.collections[0] - .get_facecolor(), rgba_array) + tm.assert_numpy_array_equal(ax.collections[0].get_facecolor(), rgba_array) # we don't test the colors of the faces in this next plot because they # are dependent on the spring colormap, which may change its colors # later. 
float_array = np.array([0.0, 1.0]) - df.plot.scatter(x='A', y='B', c=float_array, cmap='spring') + df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") def test_scatter_colors(self): - df = DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) with pytest.raises(TypeError): - df.plot.scatter(x='a', y='b', c='c', color='green') + df.plot.scatter(x="a", y="b", c="c", color="green") default_colors = self._unpack_cycler(self.plt.rcParams) - ax = df.plot.scatter(x='a', y='b', c='c') + ax = df.plot.scatter(x="a", y="b", c="c") tm.assert_numpy_array_equal( ax.collections[0].get_facecolor()[0], - np.array(self.colorconverter.to_rgba(default_colors[0]))) + np.array(self.colorconverter.to_rgba(default_colors[0])), + ) - ax = df.plot.scatter(x='a', y='b', color='white') - tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - np.array([1, 1, 1, 1], dtype=np.float64)) + ax = df.plot.scatter(x="a", y="b", color="white") + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array([1, 1, 1, 1], dtype=np.float64), + ) @pytest.mark.slow def test_plot_bar(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) _check_plot_works(df.plot.bar) _check_plot_works(df.plot.bar, legend=False) @@ -1287,12 +1322,12 @@ def test_plot_bar(self): _check_plot_works(df.plot.bar, subplots=True) _check_plot_works(df.plot.bar, stacked=True) - df = DataFrame(randn(10, 15), - index=list(string.ascii_letters[:10]), - columns=range(15)) + df = DataFrame( + randn(10, 15), index=list(string.ascii_letters[:10]), columns=range(15) + ) _check_plot_works(df.plot.bar) - df = DataFrame({'a': [0, 1], 'b': [1, 0]}) + df = DataFrame({"a": [0, 1], "b": [1, 0]}) ax = _check_plot_works(df.plot.bar) self._check_ticks_props(ax, xrot=90) @@ -1305,22 +1340,36 @@ def test_plot_bar(self): ax = df.plot.barh(rot=55, fontsize=11) self._check_ticks_props(ax, yrot=55, ylabelsize=11, xlabelsize=11) - def _check_bar_alignment(self, df, kind='bar', stacked=False, - subplots=False, align='center', width=0.5, - position=0.5): - - axes = df.plot(kind=kind, stacked=stacked, subplots=subplots, - align=align, width=width, position=position, grid=True) + def _check_bar_alignment( + self, + df, + kind="bar", + stacked=False, + subplots=False, + align="center", + width=0.5, + position=0.5, + ): + + axes = df.plot( + kind=kind, + stacked=stacked, + subplots=subplots, + align=align, + width=width, + position=position, + grid=True, + ) axes = self._flatten_visible(axes) for ax in axes: - if kind == 'bar': + if kind == "bar": axis = ax.xaxis ax_min, ax_max = ax.get_xlim() min_edge = min(p.get_x() for p in ax.patches) max_edge = max(p.get_x() + p.get_width() for p in ax.patches) - elif kind == 'barh': + elif kind == "barh": axis = ax.yaxis ax_min, ax_max = ax.get_ylim() min_edge = min(p.get_y() for p in ax.patches) @@ -1334,18 +1383,17 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, tm.assert_almost_equal(ax_max, max_edge + 0.25) p = ax.patches[0] - if kind == 'bar' and (stacked is True or subplots is True): + if kind == "bar" and (stacked is True or subplots is True): edge = p.get_x() center = edge + p.get_width() * position - elif kind == 'bar' and stacked is False: + elif kind == "bar" and stacked is False: center = p.get_x() + p.get_width() * 
len(df.columns) * position edge = p.get_x() - elif kind == 'barh' and (stacked is True or subplots is True): + elif kind == "barh" and (stacked is True or subplots is True): center = p.get_y() + p.get_height() * position edge = p.get_y() - elif kind == 'barh' and stacked is False: - center = p.get_y() + p.get_height() * len( - df.columns) * position + elif kind == "barh" and stacked is False: + center = p.get_y() + p.get_height() * len(df.columns) * position edge = p.get_y() else: raise ValueError @@ -1353,10 +1401,10 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, # Check the ticks locates on integer assert (axis.get_ticklocs() == np.arange(len(df))).all() - if align == 'center': + if align == "center": # Check whether the bar locates on center tm.assert_almost_equal(axis.get_ticklocs()[0], center) - elif align == 'edge': + elif align == "edge": # Check whether the bar's edge starts from the tick tm.assert_almost_equal(axis.get_ticklocs()[0], edge) else: @@ -1367,80 +1415,85 @@ def _check_bar_alignment(self, df, kind='bar', stacked=False, @pytest.mark.slow def test_bar_stacked_center(self): # GH2157 - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', stacked=True) - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9) - self._check_bar_alignment(df, kind='barh', stacked=True) - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=True, width=0.9) @pytest.mark.slow def test_bar_center(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', stacked=False) - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9) - self._check_bar_alignment(df, kind='barh', stacked=False) - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=False, width=0.9) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=False, width=0.9) @pytest.mark.slow def test_bar_subplots_center(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - self._check_bar_alignment(df, kind='bar', subplots=True) - self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9) - self._check_bar_alignment(df, kind='barh', subplots=True) - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9) + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="bar", subplots=True, width=0.9) + self._check_bar_alignment(df, kind="barh", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True, width=0.9) @pytest.mark.slow def test_bar_align_single_column(self): df = DataFrame(randn(5)) - self._check_bar_alignment(df, kind='bar', stacked=False) - self._check_bar_alignment(df, kind='bar', stacked=True) - self._check_bar_alignment(df, kind='barh', stacked=False) - self._check_bar_alignment(df, kind='barh', stacked=True) - 
self._check_bar_alignment(df, kind='bar', subplots=True) - self._check_bar_alignment(df, kind='barh', subplots=True) + self._check_bar_alignment(df, kind="bar", stacked=False) + self._check_bar_alignment(df, kind="bar", stacked=True) + self._check_bar_alignment(df, kind="barh", stacked=False) + self._check_bar_alignment(df, kind="barh", stacked=True) + self._check_bar_alignment(df, kind="bar", subplots=True) + self._check_bar_alignment(df, kind="barh", subplots=True) @pytest.mark.slow def test_bar_edge(self): - df = DataFrame({'A': [3] * 5, 'B': list(range(5))}, index=range(5)) - - self._check_bar_alignment(df, kind='bar', stacked=True, align='edge') - self._check_bar_alignment(df, kind='bar', stacked=True, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', stacked=True, align='edge') - self._check_bar_alignment(df, kind='barh', stacked=True, width=0.9, - align='edge') - - self._check_bar_alignment(df, kind='bar', stacked=False, align='edge') - self._check_bar_alignment(df, kind='bar', stacked=False, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', stacked=False, align='edge') - self._check_bar_alignment(df, kind='barh', stacked=False, width=0.9, - align='edge') - - self._check_bar_alignment(df, kind='bar', subplots=True, align='edge') - self._check_bar_alignment(df, kind='bar', subplots=True, width=0.9, - align='edge') - self._check_bar_alignment(df, kind='barh', subplots=True, align='edge') - self._check_bar_alignment(df, kind='barh', subplots=True, width=0.9, - align='edge') + df = DataFrame({"A": [3] * 5, "B": list(range(5))}, index=range(5)) + + self._check_bar_alignment(df, kind="bar", stacked=True, align="edge") + self._check_bar_alignment(df, kind="bar", stacked=True, width=0.9, align="edge") + self._check_bar_alignment(df, kind="barh", stacked=True, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=True, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="bar", stacked=False, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", stacked=False, align="edge") + self._check_bar_alignment( + df, kind="barh", stacked=False, width=0.9, align="edge" + ) + + self._check_bar_alignment(df, kind="bar", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="bar", subplots=True, width=0.9, align="edge" + ) + self._check_bar_alignment(df, kind="barh", subplots=True, align="edge") + self._check_bar_alignment( + df, kind="barh", subplots=True, width=0.9, align="edge" + ) @pytest.mark.slow def test_bar_log_no_subplots(self): # GH3254, GH3298 matplotlib/matplotlib#1882, #1892 # regressions in 1.2.1 - expected = np.array([.1, 1., 10., 100]) + expected = np.array([0.1, 1.0, 10.0, 100]) # no subplots - df = DataFrame({'A': [3] * 5, 'B': list(range(1, 6))}, index=range(5)) + df = DataFrame({"A": [3] * 5, "B": list(range(1, 6))}, index=range(5)) ax = df.plot.bar(grid=True, log=True) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) @pytest.mark.slow def test_bar_log_subplots(self): - expected = np.array([.1, 1., 10., 100., 1000., 1e4]) + expected = np.array([0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]) ax = DataFrame([Series([200, 300]), Series([300, 500])]).plot.bar( - log=True, subplots=True) + log=True, subplots=True + ) tm.assert_numpy_array_equal(ax[0].yaxis.get_ticklocs(), expected) tm.assert_numpy_array_equal(ax[1].yaxis.get_ticklocs(), expected) @@ -1448,14 +1501,15 @@ def test_bar_log_subplots(self): 
@pytest.mark.slow def test_boxplot(self): df = self.hist_df - series = df['height'] + series = df["height"] numeric_cols = df._get_numeric_data().columns labels = [pprint_thing(c) for c in numeric_cols] ax = _check_plot_works(df.plot.box) self._check_text_labels(ax.get_xticklabels(), labels) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), - np.arange(1, len(numeric_cols) + 1)) + tm.assert_numpy_array_equal( + ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1) + ) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) axes = series.plot.box(rot=40) @@ -1486,10 +1540,9 @@ def test_boxplot_vertical(self): # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.box, - subplots=True, vert=False, logx=True) + axes = _check_plot_works(df.plot.box, subplots=True, vert=False, logx=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) - self._check_ax_scales(axes, xaxis='log') + self._check_ax_scales(axes, xaxis="log") for ax, label in zip(axes, labels): self._check_text_labels(ax.get_yticklabels(), [label]) assert len(ax.lines) == self.bp_n_objects @@ -1502,23 +1555,25 @@ def test_boxplot_vertical(self): @pytest.mark.slow def test_boxplot_return_type(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) + df = DataFrame( + randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=["one", "two", "three", "four"], + ) with pytest.raises(ValueError): - df.plot.box(return_type='NOTATYPE') + df.plot.box(return_type="NOTATYPE") - result = df.plot.box(return_type='dict') - self._check_box_return_type(result, 'dict') + result = df.plot.box(return_type="dict") + self._check_box_return_type(result, "dict") - result = df.plot.box(return_type='axes') - self._check_box_return_type(result, 'axes') + result = df.plot.box(return_type="axes") + self._check_box_return_type(result, "axes") result = df.plot.box() # default axes - self._check_box_return_type(result, 'axes') + self._check_box_return_type(result, "axes") - result = df.plot.box(return_type='both') - self._check_box_return_type(result, 'both') + result = df.plot.box(return_type="both") + self._check_box_return_type(result, "both") @pytest.mark.slow def test_boxplot_subplots_return_type(self): @@ -1527,42 +1582,44 @@ def test_boxplot_subplots_return_type(self): # normal style: return_type=None result = df.plot.box(subplots=True) assert isinstance(result, Series) - self._check_box_return_type(result, None, expected_keys=[ - 'height', 'weight', 'category']) + self._check_box_return_type( + result, None, expected_keys=["height", "weight", "category"] + ) - for t in ['dict', 'axes', 'both']: + for t in ["dict", "axes", "both"]: returned = df.plot.box(return_type=t, subplots=True) self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category'], - check_ax_title=False) + returned, + t, + expected_keys=["height", "weight", "category"], + check_ax_title=False, + ) @pytest.mark.slow @td.skip_if_no_scipy def test_kde_df(self): df = DataFrame(randn(100, 4)) - ax = _check_plot_works(df.plot, kind='kde') + ax = _check_plot_works(df.plot, kind="kde") expected = [pprint_thing(c) for c in df.columns] self._check_legend_labels(ax, labels=expected) self._check_ticks_props(ax, xrot=0) - ax = df.plot(kind='kde', rot=20, fontsize=5) + ax = df.plot(kind="kde", rot=20, fontsize=5) self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) with 
tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot, kind='kde', - subplots=True) + axes = _check_plot_works(df.plot, kind="kde", subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.plot(kind='kde', logy=True, subplots=True) - self._check_ax_scales(axes, yaxis='log') + axes = df.plot(kind="kde", logy=True, subplots=True) + self._check_ax_scales(axes, yaxis="log") @pytest.mark.slow @td.skip_if_no_scipy def test_kde_missing_vals(self): df = DataFrame(np.random.uniform(size=(100, 4))) df.loc[0, 0] = np.nan - _check_plot_works(df.plot, kind='kde') + _check_plot_works(df.plot, kind="kde") @pytest.mark.slow def test_hist_df(self): @@ -1576,10 +1633,9 @@ def test_hist_df(self): self._check_legend_labels(ax, labels=expected) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.hist, - subplots=True, logy=True) + axes = _check_plot_works(df.plot.hist, subplots=True, logy=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - self._check_ax_scales(axes, yaxis='log') + self._check_ax_scales(axes, yaxis="log") axes = series.plot.hist(rot=40) self._check_ticks_props(axes, xrot=40, yrot=0) @@ -1598,11 +1654,17 @@ def test_hist_df(self): tm.close() # if horizontal, yticklabels are rotated - axes = df.plot.hist(rot=50, fontsize=8, orientation='horizontal') + axes = df.plot.hist(rot=50, fontsize=8, orientation="horizontal") self._check_ticks_props(axes, xrot=0, yrot=50, ylabelsize=8) - def _check_box_coord(self, patches, expected_y=None, expected_h=None, - expected_x=None, expected_w=None): + def _check_box_coord( + self, + patches, + expected_y=None, + expected_h=None, + expected_x=None, + expected_w=None, + ): result_y = np.array([p.get_y() for p in patches]) result_height = np.array([p.get_height() for p in patches]) result_x = np.array([p.get_x() for p in patches]) @@ -1610,106 +1672,145 @@ def _check_box_coord(self, patches, expected_y=None, expected_h=None, # dtype is depending on above values, no need to check if expected_y is not None: - tm.assert_numpy_array_equal(result_y, expected_y, - check_dtype=False) + tm.assert_numpy_array_equal(result_y, expected_y, check_dtype=False) if expected_h is not None: - tm.assert_numpy_array_equal(result_height, expected_h, - check_dtype=False) + tm.assert_numpy_array_equal(result_height, expected_h, check_dtype=False) if expected_x is not None: - tm.assert_numpy_array_equal(result_x, expected_x, - check_dtype=False) + tm.assert_numpy_array_equal(result_x, expected_x, check_dtype=False) if expected_w is not None: - tm.assert_numpy_array_equal(result_width, expected_w, - check_dtype=False) + tm.assert_numpy_array_equal(result_width, expected_w, check_dtype=False) @pytest.mark.slow def test_hist_df_coord(self): - normal_df = DataFrame({'A': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([10, 9, 8, 7, 6])), - 'B': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([8, 8, 8, 8, 8])), - 'C': np.repeat(np.array([1, 2, 3, 4, 5]), - np.array([6, 7, 8, 9, 10]))}, - columns=['A', 'B', 'C']) - - nan_df = DataFrame({'A': np.repeat(np.array([np.nan, 1, 2, 3, 4, 5]), - np.array([3, 10, 9, 8, 7, 6])), - 'B': np.repeat(np.array([1, np.nan, 2, 3, 4, 5]), - np.array([8, 3, 8, 8, 8, 8])), - 'C': np.repeat(np.array([1, 2, 3, np.nan, 4, 5]), - np.array([6, 7, 8, 3, 9, 10]))}, - columns=['A', 'B', 'C']) + normal_df = DataFrame( + { + "A": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([10, 9, 8, 7, 6])), + "B": np.repeat(np.array([1, 2, 3, 4, 5]), np.array([8, 8, 8, 8, 8])), + "C": 
np.repeat(np.array([1, 2, 3, 4, 5]), np.array([6, 7, 8, 9, 10])), + }, + columns=["A", "B", "C"], + ) + + nan_df = DataFrame( + { + "A": np.repeat( + np.array([np.nan, 1, 2, 3, 4, 5]), np.array([3, 10, 9, 8, 7, 6]) + ), + "B": np.repeat( + np.array([1, np.nan, 2, 3, 4, 5]), np.array([8, 3, 8, 8, 8, 8]) + ), + "C": np.repeat( + np.array([1, 2, 3, np.nan, 4, 5]), np.array([6, 7, 8, 3, 9, 10]) + ), + }, + columns=["A", "B", "C"], + ) for df in [normal_df, nan_df]: ax = df.plot.hist(bins=5) - self._check_box_coord(ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) ax = df.plot.hist(bins=5, stacked=True) - self._check_box_coord(ax.patches[:5], - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_y=np.array([10, 9, 8, 7, 6]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_y=np.array([18, 17, 16, 15, 14]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + ax.patches[:5], + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_y=np.array([10, 9, 8, 7, 6]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_y=np.array([18, 17, 16, 15, 14]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) axes = df.plot.hist(bins=5, stacked=True, subplots=True) - self._check_box_coord(axes[0].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(axes[1].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(axes[2].patches, - expected_y=np.array([0, 0, 0, 0, 0]), - expected_h=np.array([6, 7, 8, 9, 10])) + self._check_box_coord( + axes[0].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_y=np.array([0, 0, 0, 0, 0]), + expected_h=np.array([6, 7, 8, 9, 10]), + ) # horizontal - ax = df.plot.hist(bins=5, orientation='horizontal') - self._check_box_coord(ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(ax.patches[10:], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10])) - - ax = df.plot.hist(bins=5, stacked=True, - orientation='horizontal') - self._check_box_coord(ax.patches[:5], - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(ax.patches[5:10], - expected_x=np.array([10, 9, 8, 
7, 6]), - expected_w=np.array([8, 8, 8, 8, 8])) + ax = df.plot.hist(bins=5, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + ax.patches[10:], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) + + ax = df.plot.hist(bins=5, stacked=True, orientation="horizontal") + self._check_box_coord( + ax.patches[:5], + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + ax.patches[5:10], + expected_x=np.array([10, 9, 8, 7, 6]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) self._check_box_coord( ax.patches[10:], expected_x=np.array([18, 17, 16, 15, 14]), - expected_w=np.array([6, 7, 8, 9, 10])) + expected_w=np.array([6, 7, 8, 9, 10]), + ) - axes = df.plot.hist(bins=5, stacked=True, subplots=True, - orientation='horizontal') - self._check_box_coord(axes[0].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([10, 9, 8, 7, 6])) - self._check_box_coord(axes[1].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([8, 8, 8, 8, 8])) - self._check_box_coord(axes[2].patches, - expected_x=np.array([0, 0, 0, 0, 0]), - expected_w=np.array([6, 7, 8, 9, 10])) + axes = df.plot.hist( + bins=5, stacked=True, subplots=True, orientation="horizontal" + ) + self._check_box_coord( + axes[0].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([10, 9, 8, 7, 6]), + ) + self._check_box_coord( + axes[1].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([8, 8, 8, 8, 8]), + ) + self._check_box_coord( + axes[2].patches, + expected_x=np.array([0, 0, 0, 0, 0]), + expected_w=np.array([6, 7, 8, 9, 10]), + ) @pytest.mark.slow def test_plot_int_columns(self): @@ -1718,11 +1819,11 @@ def test_plot_int_columns(self): @pytest.mark.slow def test_df_legend_labels(self): - kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] - df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) - df2 = DataFrame(rand(3, 3), columns=['d', 'e', 'f']) - df3 = DataFrame(rand(3, 3), columns=['g', 'h', 'i']) - df4 = DataFrame(rand(3, 3), columns=['j', 'k', 'l']) + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) + df2 = DataFrame(rand(3, 3), columns=["d", "e", "f"]) + df3 = DataFrame(rand(3, 3), columns=["g", "h", "i"]) + df4 = DataFrame(rand(3, 3), columns=["j", "k", "l"]) for kind in kinds: @@ -1735,82 +1836,82 @@ def test_df_legend_labels(self): ax = df3.plot(kind=kind, legend=True, ax=ax) self._check_legend_labels(ax, labels=df.columns.union(df3.columns)) - ax = df4.plot(kind=kind, legend='reverse', ax=ax) - expected = list(df.columns.union(df3.columns)) + list(reversed( - df4.columns)) + ax = df4.plot(kind=kind, legend="reverse", ax=ax) + expected = list(df.columns.union(df3.columns)) + list(reversed(df4.columns)) self._check_legend_labels(ax, labels=expected) # Secondary Y - ax = df.plot(legend=True, secondary_y='b') - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) - ax = df3.plot(kind='bar', legend=True, secondary_y='h', ax=ax) + self._check_legend_labels(ax, 
labels=["a", "b (right)", "c"]) + ax = df3.plot(kind="bar", legend=True, secondary_y="h", ax=ax) self._check_legend_labels( - ax, labels=['a', 'b (right)', 'c', 'g', 'h (right)', 'i']) + ax, labels=["a", "b (right)", "c", "g", "h (right)", "i"] + ) # Time Series - ind = date_range('1/1/2014', periods=3) - df = DataFrame(randn(3, 3), columns=['a', 'b', 'c'], index=ind) - df2 = DataFrame(randn(3, 3), columns=['d', 'e', 'f'], index=ind) - df3 = DataFrame(randn(3, 3), columns=['g', 'h', 'i'], index=ind) - ax = df.plot(legend=True, secondary_y='b') - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + ind = date_range("1/1/2014", periods=3) + df = DataFrame(randn(3, 3), columns=["a", "b", "c"], index=ind) + df2 = DataFrame(randn(3, 3), columns=["d", "e", "f"], index=ind) + df3 = DataFrame(randn(3, 3), columns=["g", "h", "i"], index=ind) + ax = df.plot(legend=True, secondary_y="b") + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df2.plot(legend=False, ax=ax) - self._check_legend_labels(ax, labels=['a', 'b (right)', 'c']) + self._check_legend_labels(ax, labels=["a", "b (right)", "c"]) ax = df3.plot(legend=True, ax=ax) - self._check_legend_labels( - ax, labels=['a', 'b (right)', 'c', 'g', 'h', 'i']) + self._check_legend_labels(ax, labels=["a", "b (right)", "c", "g", "h", "i"]) # scatter - ax = df.plot.scatter(x='a', y='b', label='data1') - self._check_legend_labels(ax, labels=['data1']) - ax = df2.plot.scatter(x='d', y='e', legend=False, label='data2', ax=ax) - self._check_legend_labels(ax, labels=['data1']) - ax = df3.plot.scatter(x='g', y='h', label='data3', ax=ax) - self._check_legend_labels(ax, labels=['data1', 'data3']) + ax = df.plot.scatter(x="a", y="b", label="data1") + self._check_legend_labels(ax, labels=["data1"]) + ax = df2.plot.scatter(x="d", y="e", legend=False, label="data2", ax=ax) + self._check_legend_labels(ax, labels=["data1"]) + ax = df3.plot.scatter(x="g", y="h", label="data3", ax=ax) + self._check_legend_labels(ax, labels=["data1", "data3"]) # ensure label args pass through and # index name does not mutate # column names don't mutate - df5 = df.set_index('a') - ax = df5.plot(y='b') - self._check_legend_labels(ax, labels=['b']) - ax = df5.plot(y='b', label='LABEL_b') - self._check_legend_labels(ax, labels=['LABEL_b']) - self._check_text_labels(ax.xaxis.get_label(), 'a') - ax = df5.plot(y='c', label='LABEL_c', ax=ax) - self._check_legend_labels(ax, labels=['LABEL_b', 'LABEL_c']) - assert df5.columns.tolist() == ['b', 'c'] + df5 = df.set_index("a") + ax = df5.plot(y="b") + self._check_legend_labels(ax, labels=["b"]) + ax = df5.plot(y="b", label="LABEL_b") + self._check_legend_labels(ax, labels=["LABEL_b"]) + self._check_text_labels(ax.xaxis.get_label(), "a") + ax = df5.plot(y="c", label="LABEL_c", ax=ax) + self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) + assert df5.columns.tolist() == ["b", "c"] def test_legend_name(self): - multi = DataFrame(randn(4, 4), - columns=[np.array(['a', 'a', 'b', 'b']), - np.array(['x', 'y', 'x', 'y'])]) - multi.columns.names = ['group', 'individual'] + multi = DataFrame( + randn(4, 4), + columns=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], + ) + multi.columns.names = ["group", "individual"] ax = multi.plot() leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") df = DataFrame(randn(5, 5)) ax = df.plot(legend=True, ax=ax) leg_title = ax.legend_.get_title() - 
self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") - df.columns.name = 'new' + df.columns.name = "new" ax = df.plot(legend=False, ax=ax) leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'group,individual') + self._check_text_labels(leg_title, "group,individual") ax = df.plot(legend=True, ax=ax) leg_title = ax.legend_.get_title() - self._check_text_labels(leg_title, 'new') + self._check_text_labels(leg_title, "new") @pytest.mark.slow def test_no_legend(self): - kinds = ['line', 'bar', 'barh', 'kde', 'area', 'hist'] - df = DataFrame(rand(3, 3), columns=['a', 'b', 'c']) + kinds = ["line", "bar", "barh", "kde", "area", "hist"] + df = DataFrame(rand(3, 3), columns=["a", "b", "c"]) for kind in kinds: @@ -1820,17 +1921,20 @@ def test_no_legend(self): @pytest.mark.slow def test_style_by_column(self): import matplotlib.pyplot as plt + fig = plt.gcf() df = DataFrame(randn(100, 3)) - for markers in [{0: '^', - 1: '+', - 2: 'o'}, {0: '^', - 1: '+'}, ['^', '+', 'o'], ['^', '+']]: + for markers in [ + {0: "^", 1: "+", 2: "o"}, + {0: "^", 1: "+"}, + ["^", "+", "o"], + ["^", "+"], + ]: fig.clf() fig.add_subplot(111) ax = df.plot(style=markers) - for i, l in enumerate(ax.get_lines()[:len(markers)]): + for i, l in enumerate(ax.get_lines()[: len(markers)]): assert l.get_marker() == markers[i] @pytest.mark.slow @@ -1840,13 +1944,13 @@ def test_line_label_none(self): assert ax.get_legend() is None ax = s.plot(legend=True) - assert ax.get_legend().get_texts()[0].get_text() == 'None' + assert ax.get_legend().get_texts()[0].get_text() == "None" @pytest.mark.slow def test_line_colors(self): from matplotlib import cm - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(randn(5, 5)) ax = df.plot(color=custom_colors) @@ -1862,7 +1966,7 @@ def test_line_colors(self): tm.close() - ax = df.plot(colormap='jet') + ax = df.plot(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=rgba_colors) tm.close() @@ -1874,28 +1978,28 @@ def test_line_colors(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - ax = df.loc[:, [0]].plot(color='DodgerBlue') - self._check_colors(ax.lines, linecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot(color="DodgerBlue") + self._check_colors(ax.lines, linecolors=["DodgerBlue"]) - ax = df.plot(color='red') - self._check_colors(ax.get_lines(), linecolors=['red'] * 5) + ax = df.plot(color="red") + self._check_colors(ax.get_lines(), linecolors=["red"] * 5) tm.close() # GH 10299 - custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] ax = df.plot(color=custom_colors) self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() with pytest.raises(ValueError): # Color contains shorthand hex value results in ValueError - custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] # Forced show plot _check_plot_works(df.plot, color=custom_colors) @pytest.mark.slow def test_dont_modify_colors(self): - colors = ['r', 'g', 'b'] + colors = ["r", "g", "b"] pd.DataFrame(np.random.rand(10, 2)).plot(color=colors) assert len(colors) == 3 @@ -1903,6 +2007,7 @@ def test_dont_modify_colors(self): def test_line_colors_and_styles_subplots(self): # GH 9894 from matplotlib import cm + default_colors = self._unpack_cycler(self.plt.rcParams) df = 
DataFrame(randn(5, 5)) @@ -1914,18 +2019,18 @@ def test_line_colors_and_styles_subplots(self): tm.close() # single color char - axes = df.plot(subplots=True, color='k') + axes = df.plot(subplots=True, color="k") for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['k']) + self._check_colors(ax.get_lines(), linecolors=["k"]) tm.close() # single color str - axes = df.plot(subplots=True, color='green') + axes = df.plot(subplots=True, color="green") for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['green']) + self._check_colors(ax.get_lines(), linecolors=["green"]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" axes = df.plot(color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1937,7 +2042,7 @@ def test_line_colors_and_styles_subplots(self): tm.close() # GH 10299 - custom_colors = ['#FF0000', '#0000FF', '#FFFF00', '#000000', '#FFFFFF'] + custom_colors = ["#FF0000", "#0000FF", "#FFFF00", "#000000", "#FFFFFF"] axes = df.plot(color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1945,14 +2050,14 @@ def test_line_colors_and_styles_subplots(self): with pytest.raises(ValueError): # Color contains shorthand hex value results in ValueError - custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] + custom_colors = ["#F00", "#00F", "#FF0", "#000", "#FFF"] # Forced show plot # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): _check_plot_works(df.plot, color=custom_colors, subplots=True) rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ['jet', cm.jet]: + for cmap in ["jet", cm.jet]: axes = df.plot(colormap=cmap, subplots=True) for ax, c in zip(axes, rgba_colors): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1960,17 +2065,17 @@ def test_line_colors_and_styles_subplots(self): # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = df.loc[:, [0]].plot(color='DodgerBlue', subplots=True) - self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) + axes = df.loc[:, [0]].plot(color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) # single character style - axes = df.plot(style='r', subplots=True) + axes = df.plot(style="r", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['r']) + self._check_colors(ax.get_lines(), linecolors=["r"]) tm.close() # list of styles - styles = list('rgcby') + styles = list("rgcby") axes = df.plot(style=styles, subplots=True) for ax, c in zip(axes, styles): self._check_colors(ax.get_lines(), linecolors=[c]) @@ -1981,7 +2086,7 @@ def test_area_colors(self): from matplotlib import cm from matplotlib.collections import PolyCollection - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(rand(5, 5)) ax = df.plot.area(color=custom_colors) @@ -1996,7 +2101,7 @@ def test_area_colors(self): assert h.get_alpha() is None tm.close() - ax = df.plot.area(colormap='jet') + ax = df.plot.area(colormap="jet") jet_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=jet_colors) poly = [o for o in ax.get_children() if isinstance(o, PolyCollection)] @@ -2017,7 +2122,7 @@ def test_area_colors(self): handles, labels = ax.get_legend_handles_labels() linecolors = jet_with_alpha - 
self._check_colors(handles[:len(jet_colors)], linecolors=linecolors) + self._check_colors(handles[: len(jet_colors)], linecolors=linecolors) for h in handles: assert h.get_alpha() == 0.5 @@ -2030,14 +2135,15 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=default_colors[:5]) tm.close() - custom_colors = 'rgcby' + custom_colors = "rgcby" ax = df.plot.hist(color=custom_colors) self._check_colors(ax.patches[::10], facecolors=custom_colors) tm.close() from matplotlib import cm + # Test str -> colormap functionality - ax = df.plot.hist(colormap='jet') + ax = df.plot.hist(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, 5)] self._check_colors(ax.patches[::10], facecolors=rgba_colors) tm.close() @@ -2048,11 +2154,11 @@ def test_hist_colors(self): self._check_colors(ax.patches[::10], facecolors=rgba_colors) tm.close() - ax = df.loc[:, [0]].plot.hist(color='DodgerBlue') - self._check_colors([ax.patches[0]], facecolors=['DodgerBlue']) + ax = df.loc[:, [0]].plot.hist(color="DodgerBlue") + self._check_colors([ax.patches[0]], facecolors=["DodgerBlue"]) - ax = df.plot(kind='hist', color='green') - self._check_colors(ax.patches[::10], facecolors=['green'] * 5) + ax = df.plot(kind="hist", color="green") + self._check_colors(ax.patches[::10], facecolors=["green"] * 5) tm.close() @pytest.mark.slow @@ -2060,14 +2166,14 @@ def test_hist_colors(self): def test_kde_colors(self): from matplotlib import cm - custom_colors = 'rgcby' + custom_colors = "rgcby" df = DataFrame(rand(5, 5)) ax = df.plot.kde(color=custom_colors) self._check_colors(ax.get_lines(), linecolors=custom_colors) tm.close() - ax = df.plot.kde(colormap='jet') + ax = df.plot.kde(colormap="jet") rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] self._check_colors(ax.get_lines(), linecolors=rgba_colors) tm.close() @@ -2080,129 +2186,133 @@ def test_kde_colors(self): @td.skip_if_no_scipy def test_kde_colors_and_styles_subplots(self): from matplotlib import cm + default_colors = self._unpack_cycler(self.plt.rcParams) df = DataFrame(randn(5, 5)) - axes = df.plot(kind='kde', subplots=True) + axes = df.plot(kind="kde", subplots=True) for ax, c in zip(axes, list(default_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() # single color char - axes = df.plot(kind='kde', color='k', subplots=True) + axes = df.plot(kind="kde", color="k", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['k']) + self._check_colors(ax.get_lines(), linecolors=["k"]) tm.close() # single color str - axes = df.plot(kind='kde', color='red', subplots=True) + axes = df.plot(kind="kde", color="red", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['red']) + self._check_colors(ax.get_lines(), linecolors=["red"]) tm.close() - custom_colors = 'rgcby' - axes = df.plot(kind='kde', color=custom_colors, subplots=True) + custom_colors = "rgcby" + axes = df.plot(kind="kde", color=custom_colors, subplots=True) for ax, c in zip(axes, list(custom_colors)): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() rgba_colors = [cm.jet(n) for n in np.linspace(0, 1, len(df))] - for cmap in ['jet', cm.jet]: - axes = df.plot(kind='kde', colormap=cmap, subplots=True) + for cmap in ["jet", cm.jet]: + axes = df.plot(kind="kde", colormap=cmap, subplots=True) for ax, c in zip(axes, rgba_colors): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() # make color a list if plotting one column frame # handles cases like df.plot(color='DodgerBlue') - axes = 
df.loc[:, [0]].plot(kind='kde', color='DodgerBlue', - subplots=True) - self._check_colors(axes[0].lines, linecolors=['DodgerBlue']) + axes = df.loc[:, [0]].plot(kind="kde", color="DodgerBlue", subplots=True) + self._check_colors(axes[0].lines, linecolors=["DodgerBlue"]) # single character style - axes = df.plot(kind='kde', style='r', subplots=True) + axes = df.plot(kind="kde", style="r", subplots=True) for ax in axes: - self._check_colors(ax.get_lines(), linecolors=['r']) + self._check_colors(ax.get_lines(), linecolors=["r"]) tm.close() # list of styles - styles = list('rgcby') - axes = df.plot(kind='kde', style=styles, subplots=True) + styles = list("rgcby") + axes = df.plot(kind="kde", style=styles, subplots=True) for ax, c in zip(axes, styles): self._check_colors(ax.get_lines(), linecolors=[c]) tm.close() @pytest.mark.slow def test_boxplot_colors(self): - def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', - fliers_c=None): + def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c="k", fliers_c=None): # TODO: outside this func? if fliers_c is None: - fliers_c = 'k' - self._check_colors(bp['boxes'], - linecolors=[box_c] * len(bp['boxes'])) - self._check_colors(bp['whiskers'], - linecolors=[whiskers_c] * len(bp['whiskers'])) - self._check_colors(bp['medians'], - linecolors=[medians_c] * len(bp['medians'])) - self._check_colors(bp['fliers'], - linecolors=[fliers_c] * len(bp['fliers'])) - self._check_colors(bp['caps'], - linecolors=[caps_c] * len(bp['caps'])) + fliers_c = "k" + self._check_colors(bp["boxes"], linecolors=[box_c] * len(bp["boxes"])) + self._check_colors( + bp["whiskers"], linecolors=[whiskers_c] * len(bp["whiskers"]) + ) + self._check_colors( + bp["medians"], linecolors=[medians_c] * len(bp["medians"]) + ) + self._check_colors(bp["fliers"], linecolors=[fliers_c] * len(bp["fliers"])) + self._check_colors(bp["caps"], linecolors=[caps_c] * len(bp["caps"])) default_colors = self._unpack_cycler(self.plt.rcParams) df = DataFrame(randn(5, 5)) - bp = df.plot.box(return_type='dict') - _check_colors(bp, default_colors[0], default_colors[0], - default_colors[2]) + bp = df.plot.box(return_type="dict") + _check_colors(bp, default_colors[0], default_colors[0], default_colors[2]) tm.close() - dict_colors = dict(boxes='#572923', whiskers='#982042', - medians='#804823', caps='#123456') - bp = df.plot.box(color=dict_colors, sym='r+', return_type='dict') - _check_colors(bp, dict_colors['boxes'], dict_colors['whiskers'], - dict_colors['medians'], dict_colors['caps'], 'r') + dict_colors = dict( + boxes="#572923", whiskers="#982042", medians="#804823", caps="#123456" + ) + bp = df.plot.box(color=dict_colors, sym="r+", return_type="dict") + _check_colors( + bp, + dict_colors["boxes"], + dict_colors["whiskers"], + dict_colors["medians"], + dict_colors["caps"], + "r", + ) tm.close() # partial colors - dict_colors = dict(whiskers='c', medians='m') - bp = df.plot.box(color=dict_colors, return_type='dict') - _check_colors(bp, default_colors[0], 'c', 'm') + dict_colors = dict(whiskers="c", medians="m") + bp = df.plot.box(color=dict_colors, return_type="dict") + _check_colors(bp, default_colors[0], "c", "m") tm.close() from matplotlib import cm + # Test str -> colormap functionality - bp = df.plot.box(colormap='jet', return_type='dict') + bp = df.plot.box(colormap="jet", return_type="dict") jet_colors = [cm.jet(n) for n in np.linspace(0, 1, 3)] _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) tm.close() # Test colormap functionality - bp = df.plot.box(colormap=cm.jet, 
return_type='dict') + bp = df.plot.box(colormap=cm.jet, return_type="dict") _check_colors(bp, jet_colors[0], jet_colors[0], jet_colors[2]) tm.close() # string color is applied to all artists except fliers - bp = df.plot.box(color='DodgerBlue', return_type='dict') - _check_colors(bp, 'DodgerBlue', 'DodgerBlue', 'DodgerBlue', - 'DodgerBlue') + bp = df.plot.box(color="DodgerBlue", return_type="dict") + _check_colors(bp, "DodgerBlue", "DodgerBlue", "DodgerBlue", "DodgerBlue") # tuple is also applied to all artists except fliers - bp = df.plot.box(color=(0, 1, 0), sym='#123456', return_type='dict') - _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), - (0, 1, 0), '#123456') + bp = df.plot.box(color=(0, 1, 0), sym="#123456", return_type="dict") + _check_colors(bp, (0, 1, 0), (0, 1, 0), (0, 1, 0), (0, 1, 0), "#123456") with pytest.raises(ValueError): # Color contains invalid key results in ValueError - df.plot.box(color=dict(boxes='red', xxxx='blue')) + df.plot.box(color=dict(boxes="red", xxxx="blue")) def test_default_color_cycle(self): import matplotlib.pyplot as plt import cycler - colors = list('rgbk') - plt.rcParams['axes.prop_cycle'] = cycler.cycler('color', colors) + + colors = list("rgbk") + plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) df = DataFrame(randn(5, 3)) ax = df.plot() @@ -2211,11 +2321,11 @@ def test_default_color_cycle(self): self._check_colors(ax.get_lines(), linecolors=expected) def test_unordered_ts(self): - df = DataFrame(np.array([3.0, 2.0, 1.0]), - index=[date(2012, 10, 1), - date(2012, 9, 1), - date(2012, 8, 1)], - columns=['test']) + df = DataFrame( + np.array([3.0, 2.0, 1.0]), + index=[date(2012, 10, 1), date(2012, 9, 1), date(2012, 8, 1)], + columns=["test"], + ) ax = df.plot() xticks = ax.lines[0].get_xdata() assert xticks[0] < xticks[1] @@ -2224,17 +2334,17 @@ def test_unordered_ts(self): @td.skip_if_no_scipy def test_kind_both_ways(self): - df = DataFrame({'x': [1, 2, 3]}) + df = DataFrame({"x": [1, 2, 3]}) for kind in plotting.PlotAccessor._common_kinds: df.plot(kind=kind) getattr(df.plot, kind)() - for kind in ['scatter', 'hexbin']: - df.plot('x', 'x', kind=kind) - getattr(df.plot, kind)('x', 'x') + for kind in ["scatter", "hexbin"]: + df.plot("x", "x", kind=kind) + getattr(df.plot, kind)("x", "x") def test_all_invalid_plot_data(self): - df = DataFrame(list('abcd')) + df = DataFrame(list("abcd")) for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" @@ -2245,7 +2355,7 @@ def test_all_invalid_plot_data(self): def test_partially_invalid_plot_data(self): with tm.RNGContext(42): df = DataFrame(randn(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = 'a' + df[np.random.rand(df.shape[0]) > 0.5] = "a" for kind in plotting.PlotAccessor._common_kinds: msg = "no numeric data to plot" @@ -2254,9 +2364,9 @@ def test_partially_invalid_plot_data(self): with tm.RNGContext(42): # area plot doesn't support positive/negative mixed data - kinds = ['area'] + kinds = ["area"] df = DataFrame(rand(10, 2), dtype=object) - df[np.random.rand(df.shape[0]) > 0.5] = 'a' + df[np.random.rand(df.shape[0]) > 0.5] = "a" for kind in kinds: with pytest.raises(TypeError): df.plot(kind=kind) @@ -2264,50 +2374,50 @@ def test_partially_invalid_plot_data(self): def test_invalid_kind(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): - df.plot(kind='aasdf') - - @pytest.mark.parametrize("x,y,lbl", [ - (['B', 'C'], 'A', 'a'), - (['A'], ['B', 'C'], ['b', 'c']), - ('A', ['B', 'C'], 'badlabel') - ]) + df.plot(kind="aasdf") + + 
@pytest.mark.parametrize( + "x,y,lbl", + [ + (["B", "C"], "A", "a"), + (["A"], ["B", "C"], ["b", "c"]), + ("A", ["B", "C"], "badlabel"), + ], + ) def test_invalid_xy_args(self, x, y, lbl): # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]}) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) with pytest.raises(ValueError): df.plot(x=x, y=y, label=lbl) - @pytest.mark.parametrize("x,y", [ - ('A', 'B'), - (['A'], 'B') - ]) + @pytest.mark.parametrize("x,y", [("A", "B"), (["A"], "B")]) def test_invalid_xy_args_dup_cols(self, x, y): # GH 18671, 19699 allows y to be list-like but not x - df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list('AAB')) + df = DataFrame([[1, 3, 5], [2, 4, 6]], columns=list("AAB")) with pytest.raises(ValueError): df.plot(x=x, y=y) - @pytest.mark.parametrize("x,y,lbl,colors", [ - ('A', ['B'], ['b'], ['red']), - ('A', ['B', 'C'], ['b', 'c'], ['red', 'blue']), - (0, [1, 2], ['bokeh', 'cython'], ['green', 'yellow']) - ]) + @pytest.mark.parametrize( + "x,y,lbl,colors", + [ + ("A", ["B"], ["b"], ["red"]), + ("A", ["B", "C"], ["b", "c"], ["red", "blue"]), + (0, [1, 2], ["bokeh", "cython"], ["green", "yellow"]), + ], + ) def test_y_listlike(self, x, y, lbl, colors): # GH 19699: tests list-like y and verifies lbls & colors - df = DataFrame({"A": [1, 2], 'B': [3, 4], 'C': [5, 6]}) - _check_plot_works(df.plot, x='A', y=y, label=lbl) + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + _check_plot_works(df.plot, x="A", y=y, label=lbl) ax = df.plot(x=x, y=y, label=lbl, color=colors) assert len(ax.lines) == len(y) self._check_colors(ax.get_lines(), linecolors=colors) - @pytest.mark.parametrize("x,y,colnames", [ - (0, 1, ['A', 'B']), - (1, 0, [0, 1]) - ]) + @pytest.mark.parametrize("x,y,colnames", [(0, 1, ["A", "B"]), (1, 0, [0, 1])]) def test_xy_args_integer(self, x, y, colnames): # GH 20056: tests integer args for xy and checks col names - df = DataFrame({"A": [1, 2], 'B': [3, 4]}) + df = DataFrame({"A": [1, 2], "B": [3, 4]}) df.columns = colnames _check_plot_works(df.plot, x=x, y=y) @@ -2315,12 +2425,12 @@ def test_xy_args_integer(self, x, y, colnames): def test_hexbin_basic(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', gridsize=10) + ax = df.plot.hexbin(x="A", y="B", gridsize=10) # TODO: need better way to test. This just does existence. 
assert len(ax.collections) == 1 # GH 6951 - axes = df.plot.hexbin(x='A', y='B', subplots=True) + axes = df.plot.hexbin(x="A", y="B", subplots=True) # hexbin should have 2 axes in the figure, 1 for plotting and another # is colorbar assert len(axes[0].figure.axes) == 2 @@ -2331,10 +2441,10 @@ def test_hexbin_basic(self): def test_hexbin_with_c(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', C='C') + ax = df.plot.hexbin(x="A", y="B", C="C") assert len(ax.collections) == 1 - ax = df.plot.hexbin(x='A', y='B', C='C', reduce_C_function=np.std) + ax = df.plot.hexbin(x="A", y="B", C="C", reduce_C_function=np.std) assert len(ax.collections) == 1 @pytest.mark.slow @@ -2342,38 +2452,41 @@ def test_hexbin_cmap(self): df = self.hexbin_df # Default to BuGn - ax = df.plot.hexbin(x='A', y='B') - assert ax.collections[0].cmap.name == 'BuGn' + ax = df.plot.hexbin(x="A", y="B") + assert ax.collections[0].cmap.name == "BuGn" - cm = 'cubehelix' - ax = df.plot.hexbin(x='A', y='B', colormap=cm) + cm = "cubehelix" + ax = df.plot.hexbin(x="A", y="B", colormap=cm) assert ax.collections[0].cmap.name == cm @pytest.mark.slow def test_no_color_bar(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', colorbar=None) + ax = df.plot.hexbin(x="A", y="B", colorbar=None) assert ax.collections[0].colorbar is None @pytest.mark.slow def test_allow_cmap(self): df = self.hexbin_df - ax = df.plot.hexbin(x='A', y='B', cmap='YlGn') - assert ax.collections[0].cmap.name == 'YlGn' + ax = df.plot.hexbin(x="A", y="B", cmap="YlGn") + assert ax.collections[0].cmap.name == "YlGn" with pytest.raises(TypeError): - df.plot.hexbin(x='A', y='B', cmap='YlGn', colormap='BuGn') + df.plot.hexbin(x="A", y="B", cmap="YlGn", colormap="BuGn") @pytest.mark.slow def test_pie_df(self): - df = DataFrame(np.random.rand(5, 3), columns=['X', 'Y', 'Z'], - index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame( + np.random.rand(5, 3), + columns=["X", "Y", "Z"], + index=["a", "b", "c", "d", "e"], + ) with pytest.raises(ValueError): df.plot.pie() - ax = _check_plot_works(df.plot.pie, y='Y') + ax = _check_plot_works(df.plot.pie, y="Y") self._check_text_labels(ax.texts, df.index) ax = _check_plot_works(df.plot.pie, y=2) @@ -2381,20 +2494,19 @@ def test_pie_df(self): # _check_plot_works adds an ax so catch warning. 
see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.pie, - subplots=True) + axes = _check_plot_works(df.plot.pie, subplots=True) assert len(axes) == len(df.columns) for ax in axes: self._check_text_labels(ax.texts, df.index) for ax, ylabel in zip(axes, df.columns): assert ax.get_ylabel() == ylabel - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.plot.pie, - subplots=True, labels=labels, - colors=color_args) + axes = _check_plot_works( + df.plot.pie, subplots=True, labels=labels, colors=color_args + ) assert len(axes) == len(df.columns) for ax in axes: @@ -2408,24 +2520,25 @@ def test_pie_df_nan(self): fig, axes = self.plt.subplots(ncols=4) df.plot.pie(subplots=True, ax=axes, legend=True) - base_expected = ['0', '1', '2', '3'] + base_expected = ["0", "1", "2", "3"] for i, ax in enumerate(axes): expected = list(base_expected) # force copy - expected[i] = '' + expected[i] = "" result = [x.get_text() for x in ax.texts] assert result == expected # legend labels # NaN's not included in legend with subplots # see https://github.com/pandas-dev/pandas/issues/8390 - assert ([x.get_text() for x in ax.get_legend().get_texts()] == - base_expected[:i] + base_expected[i + 1:]) + assert [x.get_text() for x in ax.get_legend().get_texts()] == base_expected[ + :i + ] + base_expected[i + 1 :] @pytest.mark.slow def test_errorbar_plot(self): with warnings.catch_warnings(): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} df_err = DataFrame(d_err) # check line plots @@ -2436,58 +2549,57 @@ def test_errorbar_plot(self): ax = _check_plot_works(df.plot, yerr=df_err, loglog=True) self._check_has_errorbars(ax, xerr=0, yerr=2) - kinds = ['line', 'bar', 'barh'] + kinds = ["line", "bar", "barh"] for kind in kinds: - ax = _check_plot_works(df.plot, yerr=df_err['x'], kind=kind) + ax = _check_plot_works(df.plot, yerr=df_err["x"], kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) ax = _check_plot_works(df.plot, yerr=d_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, - kind=kind) + ax = _check_plot_works(df.plot, yerr=df_err, xerr=df_err, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) - ax = _check_plot_works(df.plot, yerr=df_err['x'], - xerr=df_err['x'], - kind=kind) + ax = _check_plot_works( + df.plot, yerr=df_err["x"], xerr=df_err["x"], kind=kind + ) self._check_has_errorbars(ax, xerr=2, yerr=2) ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) # _check_plot_works adds an ax so catch warning. 
see GH #13188 - axes = _check_plot_works(df.plot, - yerr=df_err, xerr=df_err, - subplots=True, - kind=kind) + axes = _check_plot_works( + df.plot, yerr=df_err, xerr=df_err, subplots=True, kind=kind + ) self._check_has_errorbars(axes, xerr=1, yerr=1) - ax = _check_plot_works((df + 1).plot, yerr=df_err, - xerr=df_err, kind='bar', log=True) + ax = _check_plot_works( + (df + 1).plot, yerr=df_err, xerr=df_err, kind="bar", log=True + ) self._check_has_errorbars(ax, xerr=2, yerr=2) # yerr is raw error values - ax = _check_plot_works(df['y'].plot, yerr=np.ones(12) * 0.4) + ax = _check_plot_works(df["y"].plot, yerr=np.ones(12) * 0.4) self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) self._check_has_errorbars(ax, xerr=0, yerr=2) # yerr is iterator import itertools - ax = _check_plot_works(df.plot, - yerr=itertools.repeat(0.1, len(df))) + + ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) self._check_has_errorbars(ax, xerr=0, yerr=2) # yerr is column name - for yerr in ['yerr', '誤差']: + for yerr in ["yerr", "誤差"]: s_df = df.copy() s_df[yerr] = np.ones(12) * 0.2 ax = _check_plot_works(s_df.plot, yerr=yerr) self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(s_df.plot, y='y', x='x', yerr=yerr) + ax = _check_plot_works(s_df.plot, y="y", x="x", yerr=yerr) self._check_has_errorbars(ax, xerr=0, yerr=1) with pytest.raises(ValueError): df.plot(yerr=np.random.randn(11)) - df_err = DataFrame({'x': ['zzz'] * 12, 'y': ['zzz'] * 12}) + df_err = DataFrame({"x": ["zzz"] * 12, "y": ["zzz"] * 12}) with pytest.raises((ValueError, TypeError)): df.plot(yerr=df_err) @@ -2505,20 +2617,20 @@ def test_errorbar_with_integer_column_names(self): def test_errorbar_with_partial_columns(self): df = DataFrame(np.random.randn(10, 3)) df_err = DataFrame(np.random.randn(10, 2), columns=[0, 2]) - kinds = ['line', 'bar'] + kinds = ["line", "bar"] for kind in kinds: ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - ix = date_range('1/1/2000', periods=10, freq='M') + ix = date_range("1/1/2000", periods=10, freq="M") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) - ax = _check_plot_works(df.plot, yerr=df_err, kind='line') + ax = _check_plot_works(df.plot, yerr=df_err, kind="line") self._check_has_errorbars(ax, xerr=0, yerr=2) - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) - d_err = {'x': np.ones(12) * 0.2, 'z': np.ones(12) * 0.4} + d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4} df_err = DataFrame(d_err) for err in [d_err, df_err]: ax = _check_plot_works(df.plot, yerr=err) @@ -2528,32 +2640,31 @@ def test_errorbar_with_partial_columns(self): def test_errorbar_timeseries(self): with warnings.catch_warnings(): - d = {'x': np.arange(12), 'y': np.arange(12, 0, -1)} - d_err = {'x': np.ones(12) * 0.2, 'y': np.ones(12) * 0.4} + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + d_err = {"x": np.ones(12) * 0.2, "y": np.ones(12) * 0.4} # check time-series plots - ix = date_range('1/1/2000', '1/1/2001', freq='M') + ix = date_range("1/1/2000", "1/1/2001", freq="M") tdf = DataFrame(d, index=ix) tdf_err = DataFrame(d_err, index=ix) - kinds = ['line', 'bar', 'barh'] + kinds = ["line", "bar", "barh"] for kind in kinds: ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) ax = _check_plot_works(tdf.plot, yerr=d_err, kind=kind) 
self._check_has_errorbars(ax, xerr=0, yerr=2) - ax = _check_plot_works(tdf.plot, y='y', yerr=tdf_err['x'], - kind=kind) + ax = _check_plot_works(tdf.plot, y="y", yerr=tdf_err["x"], kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(tdf.plot, y='y', yerr='x', kind=kind) + ax = _check_plot_works(tdf.plot, y="y", yerr="x", kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) # _check_plot_works adds an ax so catch warning. see GH #13188 - axes = _check_plot_works(tdf.plot, - kind=kind, yerr=tdf_err, - subplots=True) + axes = _check_plot_works( + tdf.plot, kind=kind, yerr=tdf_err, subplots=True + ) self._check_has_errorbars(axes, xerr=0, yerr=1) def test_errorbar_asymmetrical(self): @@ -2576,8 +2687,7 @@ def test_errorbar_asymmetrical(self): tm.close() def test_table(self): - df = DataFrame(np.random.rand(10, 3), - index=list(string.ascii_letters[:10])) + df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) _check_plot_works(df.plot, table=True) _check_plot_works(df.plot, table=df) @@ -2587,26 +2697,24 @@ def test_table(self): assert len(ax.tables) == 1 def test_errorbar_scatter(self): - df = DataFrame( - np.random.randn(5, 2), index=range(5), columns=['x', 'y']) - df_err = DataFrame(np.random.randn(5, 2) / 5, - index=range(5), columns=['x', 'y']) + df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) + df_err = DataFrame( + np.random.randn(5, 2) / 5, index=range(5), columns=["x", "y"] + ) - ax = _check_plot_works(df.plot.scatter, x='x', y='y') + ax = _check_plot_works(df.plot.scatter, x="x", y="y") self._check_has_errorbars(ax, xerr=0, yerr=0) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=0) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', yerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", yerr=df_err) self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(df.plot.scatter, x='x', y='y', xerr=df_err, - yerr=df_err) + ax = _check_plot_works(df.plot.scatter, x="x", y="y", xerr=df_err, yerr=df_err) self._check_has_errorbars(ax, xerr=1, yerr=1) - def _check_errorbar_color(containers, expected, has_err='has_xerr'): + def _check_errorbar_color(containers, expected, has_err="has_xerr"): lines = [] - errs = [c.lines - for c in ax.containers if getattr(c, has_err, False)][0] + errs = [c.lines for c in ax.containers if getattr(c, has_err, False)][0] for el in errs: if is_list_like(el): lines.extend(el) @@ -2614,19 +2722,19 @@ def _check_errorbar_color(containers, expected, has_err='has_xerr'): lines.append(el) err_lines = [x for x in lines if x in ax.collections] self._check_colors( - err_lines, linecolors=np.array([expected] * len(err_lines))) + err_lines, linecolors=np.array([expected] * len(err_lines)) + ) # GH 8081 - df = DataFrame( - np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e']) - ax = df.plot.scatter(x='a', y='b', xerr='d', yerr='e', c='red') + df = DataFrame(np.random.randn(10, 5), columns=["a", "b", "c", "d", "e"]) + ax = df.plot.scatter(x="a", y="b", xerr="d", yerr="e", c="red") self._check_has_errorbars(ax, xerr=1, yerr=1) - _check_errorbar_color(ax.containers, 'red', has_err='has_xerr') - _check_errorbar_color(ax.containers, 'red', has_err='has_yerr') + _check_errorbar_color(ax.containers, "red", has_err="has_xerr") + 
_check_errorbar_color(ax.containers, "red", has_err="has_yerr") - ax = df.plot.scatter(x='a', y='b', yerr='e', color='green') + ax = df.plot.scatter(x="a", y="b", yerr="e", color="green") self._check_has_errorbars(ax, xerr=0, yerr=1) - _check_errorbar_color(ax.containers, 'green', has_err='has_yerr') + _check_errorbar_color(ax.containers, "green", has_err="has_yerr") @pytest.mark.slow def test_sharex_and_ax(self): @@ -2634,13 +2742,18 @@ def test_sharex_and_ax(self): # the axis in fig.get_axis() are sorted differently than pandas # expected them, so make sure that only the right ones are removed import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") gs, axes = _generate_4_axes_via_gridspec() - df = DataFrame({"a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) def _check(axes): for ax in axes: @@ -2648,12 +2761,10 @@ def _check(axes): self._check_visible(ax.get_yticklabels(), visible=True) for ax in [axes[0], axes[2]]: self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible( - ax.get_xticklabels(minor=True), visible=False) + self._check_visible(ax.get_xticklabels(minor=True), visible=False) for ax in [axes[1], axes[3]]: self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) for ax in axes: df.plot(x="a", y="b", title="title", ax=ax, sharex=True) @@ -2689,17 +2800,20 @@ def test_sharey_and_ax(self): gs, axes = _generate_4_axes_via_gridspec() - df = DataFrame({"a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 3, 4, 5, 6], - "c": [1, 2, 3, 4, 5, 6], - "d": [1, 2, 3, 4, 5, 6]}) + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [1, 2, 3, 4, 5, 6], + "c": [1, 2, 3, 4, 5, 6], + "d": [1, 2, 3, 4, 5, 6], + } + ) def _check(axes): for ax in axes: assert len(ax.lines) == 1 self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) for ax in [axes[0], axes[1]]: self._check_visible(ax.get_yticklabels(), visible=True) for ax in [axes[2], axes[3]]: @@ -2741,10 +2855,10 @@ def test_memory_leak(self): for kind in plotting.PlotAccessor._all_kinds: args = {} - if kind in ['hexbin', 'scatter', 'pie']: + if kind in ["hexbin", "scatter", "pie"]: df = self.hexbin_df - args = {'x': 'A', 'y': 'B'} - elif kind == 'area': + args = {"x": "A", "y": "B"} + elif kind == "area": df = self.tdf.abs() else: df = self.tdf @@ -2768,9 +2882,11 @@ def test_df_subplots_patterns_minorticks(self): # GH 10657 import matplotlib.pyplot as plt - df = DataFrame(np.random.randn(10, 2), - index=date_range('1/1/2000', periods=10), - columns=list('AB')) + df = DataFrame( + np.random.randn(10, 2), + index=date_range("1/1/2000", periods=10), + columns=list("AB"), + ) # shared subplots fig, axes = plt.subplots(2, 1, sharex=True) @@ -2814,11 +2930,9 @@ def test_df_gridspec_patterns(self): import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec - ts = Series(np.random.randn(10), - index=date_range('1/1/2000', periods=10)) + ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) - df = DataFrame(np.random.randn(10, 2), index=ts.index, - columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), index=ts.index, 
columns=list("AB")) def _get_vertical_grid(): gs = gridspec.GridSpec(3, 1) @@ -2842,8 +2956,7 @@ def _get_horizontal_grid(): for ax in [ax1, ax2]: self._check_visible(ax.get_yticklabels(), visible=True) self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() # subplots=True @@ -2854,15 +2967,13 @@ def _get_horizontal_grid(): for ax in axes: self._check_visible(ax.get_yticklabels(), visible=True) self._check_visible(ax.get_xticklabels(), visible=True) - self._check_visible( - ax.get_xticklabels(minor=True), visible=True) + self._check_visible(ax.get_xticklabels(minor=True), visible=True) tm.close() # vertical / subplots / sharex=True / sharey=True ax1, ax2 = _get_vertical_grid() with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, - sharey=True) + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 for ax in [ax1, ax2]: @@ -2878,8 +2989,7 @@ def _get_horizontal_grid(): # horizontal / subplots / sharex=True / sharey=True ax1, ax2 = _get_horizontal_grid() with tm.assert_produces_warning(UserWarning): - axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, - sharey=True) + axes = df.plot(subplots=True, ax=[ax1, ax2], sharex=True, sharey=True) assert len(axes[0].lines) == 1 assert len(axes[1].lines) == 1 self._check_visible(axes[0].get_yticklabels(), visible=True) @@ -2902,8 +3012,7 @@ def _get_boxed_grid(): return ax1, ax2, ax3, ax4 axes = _get_boxed_grid() - df = DataFrame(np.random.randn(10, 4), - index=ts.index, columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), index=ts.index, columns=list("ABCD")) axes = df.plot(subplots=True, ax=axes) for ax in axes: assert len(ax.lines) == 1 @@ -2935,14 +3044,16 @@ def _get_boxed_grid(): def test_df_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 self._check_grid_settings( - DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4]}), - plotting.PlotAccessor._dataframe_kinds, kws={'x': 'a', 'y': 'b'}) + DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}), + plotting.PlotAccessor._dataframe_kinds, + kws={"x": "a", "y": "b"}, + ) def test_invalid_colormap(self): - df = DataFrame(randn(3, 2), columns=['A', 'B']) + df = DataFrame(randn(3, 2), columns=["A", "B"]) with pytest.raises(ValueError): - df.plot(colormap='invalid_colormap') + df.plot(colormap="invalid_colormap") def test_plain_axes(self): @@ -2954,14 +3065,15 @@ def test_plain_axes(self): # supplied ax itself is a plain Axes, but because the cmap keyword # a new ax is created for the colorbar -> also multiples axes (GH11520) - df = DataFrame({'a': randn(8), 'b': randn(8)}) + df = DataFrame({"a": randn(8), "b": randn(8)}) fig = self.plt.figure() ax = fig.add_axes((0, 0, 1, 1)) - df.plot(kind='scatter', ax=ax, x='a', y='b', c='a', cmap='hsv') + df.plot(kind="scatter", ax=ax, x="a", y="b", c="a", cmap="hsv") # other examples fig, ax = self.plt.subplots() from mpl_toolkits.axes_grid1 import make_axes_locatable + divider = make_axes_locatable(ax) cax = divider.append_axes("right", size="5%", pad=0.05) Series(rand(10)).plot(ax=ax) @@ -2969,12 +3081,14 @@ def test_plain_axes(self): fig, ax = self.plt.subplots() from mpl_toolkits.axes_grid1.inset_locator import inset_axes - iax = inset_axes(ax, width="30%", height=1., loc=3) + + iax = inset_axes(ax, width="30%", height=1.0, loc=3) 
Series(rand(10)).plot(ax=ax) Series(rand(10)).plot(ax=iax) def test_passed_bar_colors(self): import matplotlib as mpl + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] colormap = mpl.colors.ListedColormap(color_tuples) barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar", cmap=colormap) @@ -2982,62 +3096,63 @@ def test_passed_bar_colors(self): def test_rcParams_bar_colors(self): import matplotlib as mpl + color_tuples = [(0.9, 0, 0, 1), (0, 0.9, 0, 1), (0, 0, 0.9, 1)] - with mpl.rc_context( - rc={'axes.prop_cycle': mpl.cycler("color", color_tuples)}): + with mpl.rc_context(rc={"axes.prop_cycle": mpl.cycler("color", color_tuples)}): barplot = pd.DataFrame([[1, 2, 3]]).plot(kind="bar") assert color_tuples == [c.get_facecolor() for c in barplot.patches] - @pytest.mark.parametrize('method', ['line', 'barh', 'bar']) + @pytest.mark.parametrize("method", ["line", "barh", "bar"]) def test_secondary_axis_font_size(self, method): # GH: 12565 - df = (pd.DataFrame(np.random.randn(15, 2), - columns=list('AB')) - .assign(C=lambda df: df.B.cumsum()) - .assign(D=lambda df: df.C * 1.1)) + df = ( + pd.DataFrame(np.random.randn(15, 2), columns=list("AB")) + .assign(C=lambda df: df.B.cumsum()) + .assign(D=lambda df: df.C * 1.1) + ) fontsize = 20 - sy = ['C', 'D'] + sy = ["C", "D"] - kwargs = dict(secondary_y=sy, fontsize=fontsize, - mark_right=True) + kwargs = dict(secondary_y=sy, fontsize=fontsize, mark_right=True) ax = getattr(df.plot, method)(**kwargs) - self._check_ticks_props(axes=ax.right_ax, - ylabelsize=fontsize) + self._check_ticks_props(axes=ax.right_ax, ylabelsize=fontsize) @pytest.mark.slow def test_x_string_values_ticks(self): # Test if string plot index have a fixed xtick position # GH: 7612, GH: 22334 - df = pd.DataFrame({'sales': [3, 2, 3], - 'visits': [20, 42, 28], - 'day': ['Monday', 'Tuesday', 'Wednesday']}) - ax = df.plot.area(x='day') + df = pd.DataFrame( + { + "sales": [3, 2, 3], + "visits": [20, 42, 28], + "day": ["Monday", "Tuesday", "Wednesday"], + } + ) + ax = df.plot.area(x="day") ax.set_xlim(-1, 3) xticklabels = [t.get_text() for t in ax.get_xticklabels()] labels_position = dict(zip(xticklabels, ax.get_xticks())) # Testing if the label stayed at the right position - assert labels_position['Monday'] == 0.0 - assert labels_position['Tuesday'] == 1.0 - assert labels_position['Wednesday'] == 2.0 + assert labels_position["Monday"] == 0.0 + assert labels_position["Tuesday"] == 1.0 + assert labels_position["Wednesday"] == 2.0 @pytest.mark.slow def test_x_multiindex_values_ticks(self): # Test if multiindex plot index have a fixed xtick position # GH: 15912 index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]]) - df = pd.DataFrame(np.random.randn(4, 2), - columns=['A', 'B'], - index=index) + df = pd.DataFrame(np.random.randn(4, 2), columns=["A", "B"], index=index) ax = df.plot() ax.set_xlim(-1, 4) xticklabels = [t.get_text() for t in ax.get_xticklabels()] labels_position = dict(zip(xticklabels, ax.get_xticks())) # Testing if the label stayed at the right position - assert labels_position['(2012, 1)'] == 0.0 - assert labels_position['(2012, 2)'] == 1.0 - assert labels_position['(2013, 1)'] == 2.0 - assert labels_position['(2013, 2)'] == 3.0 + assert labels_position["(2012, 1)"] == 0.0 + assert labels_position["(2012, 2)"] == 1.0 + assert labels_position["(2013, 1)"] == 2.0 + assert labels_position["(2013, 2)"] == 3.0 def _generate_4_axes_via_gridspec(): diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 
5a5ee75928c97..bb1747710fe18 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -14,13 +14,12 @@ @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - def test_series_groupby_plotting_nominally_works(self): n = 10 weight = Series(np.random.normal(166, 20, size=n)) height = Series(np.random.normal(60, 10, size=n)) with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) + gender = np.random.choice(["male", "female"], size=n) weight.groupby(gender).plot() tm.close() @@ -32,44 +31,39 @@ def test_series_groupby_plotting_nominally_works(self): def test_plotting_with_float_index_works(self): # GH 7025 - df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'val': np.random.randn(9)}, - index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) + df = DataFrame( + {"def": [1, 1, 1, 2, 2, 2, 3, 3, 3], "val": np.random.randn(9)}, + index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0], + ) - df.groupby('def')['val'].plot() + df.groupby("def")["val"].plot() tm.close() - df.groupby('def')['val'].apply(lambda x: x.plot()) + df.groupby("def")["val"].apply(lambda x: x.plot()) tm.close() def test_hist_single_row(self): # GH10214 bins = np.arange(80, 100 + 2, 1) - df = DataFrame({"Name": ["AAA", "BBB"], - "ByCol": [1, 2], - "Mark": [85, 89]}) + df = DataFrame({"Name": ["AAA", "BBB"], "ByCol": [1, 2], "Mark": [85, 89]}) df["Mark"].hist(by=df["ByCol"], bins=bins) df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) df["Mark"].hist(by=df["ByCol"], bins=bins) def test_plot_submethod_works(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - df.groupby('z').plot.scatter('x', 'y') + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) + df.groupby("z").plot.scatter("x", "y") tm.close() - df.groupby('z')['x'].plot.line() + df.groupby("z")["x"].plot.line() tm.close() def test_plot_kwargs(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) + df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")}) - res = df.groupby('z').plot(kind='scatter', x='x', y='y') + res = df.groupby("z").plot(kind="scatter", x="x", y="y") # check that a scatter plot is effectively plotted: the axes should # contain a PathCollection from the scatter plot (GH11805) - assert len(res['a'].collections) == 1 + assert len(res["a"].collections) == 1 - res = df.groupby('z').plot.scatter(x='x', y='y') - assert len(res['a'].collections) == 1 + res = df.groupby("z").plot.scatter(x="x", y="y") + assert len(res["a"].collections) == 1 diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 79ce418768044..14cb2bc9d7b62 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -15,14 +15,14 @@ @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' + self.ts.name = "ts" @pytest.mark.slow def test_hist_legacy(self): @@ -71,47 +71,40 @@ def test_hist_layout_with_by(self): # so we get a warning about an axis being cleared, even # though we don't explicing pass one, see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(2, 1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) 
self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.gender, - layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, by=df.category, - layout=(4, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(2, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(-1, 4)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works( - df.height.hist, by=df.classroom, layout=(2, 2)) + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) y = Series(randn(2)) subplot(121) @@ -131,6 +124,7 @@ def test_hist_by_no_extra_plots(self): @pytest.mark.slow def test_plot_fails_when_ax_differs_from_figure(self): from pylab import figure + fig1 = figure() fig2 = figure() ax1 = fig1.add_subplot(111) @@ -140,10 +134,10 @@ def test_plot_fails_when_ax_differs_from_figure(self): @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @pytest.mark.slow def test_hist_df_legacy(self): from matplotlib.patches import Rectangle + with tm.assert_produces_warning(UserWarning): _check_plot_works(self.hist_df.hist) @@ -180,14 +174,16 @@ def test_hist_df_legacy(self): xf, yf = 20, 18 xrot, yrot = 30, 40 axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) xf, yf = 20, 18 xrot, yrot = 30, 40 axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) tm.close() @@ -199,13 +195,13 @@ def test_hist_df_legacy(self): tm.close() ax = ser.hist(log=True) # scale of y must be 'log' - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") tm.close() # propagate attr exception from matplotlib.Axes.hist with pytest.raises(AttributeError): - ser.hist(foo='bar') + ser.hist(foo="bar") @pytest.mark.slow def test_hist_non_numerical_raises(self): @@ -222,20 +218,20 @@ def 
test_hist_layout(self): df = DataFrame(randn(100, 3)) layout_to_expected_size = ( - {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 - {'layout': (2, 2), 'expected_size': (2, 2)}, - {'layout': (4, 1), 'expected_size': (4, 1)}, - {'layout': (1, 4), 'expected_size': (1, 4)}, - {'layout': (3, 3), 'expected_size': (3, 3)}, - {'layout': (-1, 4), 'expected_size': (1, 4)}, - {'layout': (4, -1), 'expected_size': (4, 1)}, - {'layout': (-1, 2), 'expected_size': (2, 2)}, - {'layout': (2, -1), 'expected_size': (2, 2)} + {"layout": None, "expected_size": (2, 2)}, # default is 2x2 + {"layout": (2, 2), "expected_size": (2, 2)}, + {"layout": (4, 1), "expected_size": (4, 1)}, + {"layout": (1, 4), "expected_size": (1, 4)}, + {"layout": (3, 3), "expected_size": (3, 3)}, + {"layout": (-1, 4), "expected_size": (1, 4)}, + {"layout": (4, -1), "expected_size": (4, 1)}, + {"layout": (-1, 2), "expected_size": (2, 2)}, + {"layout": (2, -1), "expected_size": (2, 2)}, ) for layout_test in layout_to_expected_size: - axes = df.hist(layout=layout_test['layout']) - expected = layout_test['expected_size'] + axes = df.hist(layout=layout_test["layout"]) + expected = layout_test["expected_size"] self._check_axes_shape(axes, axes_num=3, layout=expected) # layout too small for all 4 plots @@ -260,15 +256,14 @@ def test_tight_layout(self): @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): - @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle from pandas.plotting._matplotlib.hist import _grouped_hist - df = DataFrame(randn(500, 2), columns=['A', 'B']) - df['C'] = np.random.randint(0, 4, 500) - df['D'] = ['X'] * 500 + df = DataFrame(randn(500, 2), columns=["A", "B"]) + df["C"] = np.random.randint(0, 4, 500) + df["D"] = ["X"] * 500 axes = _grouped_hist(df.A, by=df.C) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) @@ -279,7 +274,7 @@ def test_grouped_hist_legacy(self): tm.close() # group by a key with single value - axes = df.hist(by='D', rot=30) + axes = df.hist(by="D", rot=30) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) self._check_ticks_props(axes, xrot=30) @@ -288,29 +283,38 @@ def test_grouped_hist_legacy(self): xf, yf = 20, 18 xrot, yrot = 30, 40 - axes = _grouped_hist(df.A, by=df.C, cumulative=True, - bins=4, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot, density=True) + axes = _grouped_hist( + df.A, + by=df.C, + cumulative=True, + bins=4, + xlabelsize=xf, + xrot=xrot, + ylabelsize=yf, + yrot=yrot, + density=True, + ) # height of last bin (index 5) must be 1.0 for ax in axes.ravel(): rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] height = rects[-1].get_height() tm.assert_almost_equal(height, 1.0) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) + self._check_ticks_props( + axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot + ) tm.close() axes = _grouped_hist(df.A, by=df.C, log=True) # scale of y must be 'log' - self._check_ax_scales(axes, yaxis='log') + self._check_ax_scales(axes, yaxis="log") tm.close() # propagate attr exception from matplotlib.Axes.hist with pytest.raises(AttributeError): - _grouped_hist(df.A, by=df.C, foo='bar') + _grouped_hist(df.A, by=df.C, foo="bar") with tm.assert_produces_warning(FutureWarning): - df.hist(by='C', figsize='default') + df.hist(by="C", figsize="default") @pytest.mark.slow def test_grouped_hist_legacy2(self): @@ -319,9 +323,8 @@ def test_grouped_hist_legacy2(self): height = Series(np.random.normal(60, 10, size=n)) with 
tm.RNGContext(42): gender_int = np.random.choice([0, 1], size=n) - df_int = DataFrame({'height': height, 'weight': weight, - 'gender': gender_int}) - gb = df_int.groupby('gender') + df_int = DataFrame({"height": height, "weight": weight, "gender": gender_int}) + gb = df_int.groupby("gender") axes = gb.hist() assert len(axes) == 2 assert len(self.plt.get_fignums()) == 2 @@ -332,53 +335,54 @@ def test_grouped_hist_layout(self): df = self.hist_df msg = "Layout of 1x1 must be larger than required size 2" with pytest.raises(ValueError, match=msg): - df.hist(column='weight', by=df.gender, layout=(1, 1)) + df.hist(column="weight", by=df.gender, layout=(1, 1)) msg = "Layout of 1x3 must be larger than required size 4" with pytest.raises(ValueError, match=msg): - df.hist(column='height', by=df.category, layout=(1, 3)) + df.hist(column="height", by=df.category, layout=(1, 3)) msg = "At least one dimension of layout must be positive" with pytest.raises(ValueError, match=msg): - df.hist(column='height', by=df.category, layout=(-1, -1)) + df.hist(column="height", by=df.category, layout=(-1, -1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, 1)) + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, 1) + ) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, -1)) + axes = _check_plot_works( + df.hist, column="height", by=df.gender, layout=(2, -1) + ) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - axes = df.hist(column='height', by=df.category, layout=(4, 1)) + axes = df.hist(column="height", by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.hist(column='height', by=df.category, layout=(-1, 1)) + axes = df.hist(column="height", by=df.category, layout=(-1, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = df.hist(column='height', by=df.category, - layout=(4, 2), figsize=(12, 8)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) + axes = df.hist(column="height", by=df.category, layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) tm.close() # GH 6769 with tm.assert_produces_warning(UserWarning): axes = _check_plot_works( - df.hist, column='height', by='classroom', layout=(2, 2)) + df.hist, column="height", by="classroom", layout=(2, 2) + ) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) # without column with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.hist, by='classroom') + axes = _check_plot_works(df.hist, by="classroom") self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - axes = df.hist(by='gender', layout=(3, 5)) + axes = df.hist(by="gender", layout=(3, 5)) self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) - axes = df.hist(column=['height', 'weight', 'category']) + axes = df.hist(column=["height", "weight", "category"]) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) @pytest.mark.slow @@ -387,11 +391,11 @@ def test_grouped_hist_multiple_axes(self): df = self.hist_df fig, axes = self.plt.subplots(2, 3) - returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) + returned = df.hist(column=["height", "weight", "category"], ax=axes[0]) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[0]) assert 
returned[0].figure is fig - returned = df.hist(by='classroom', ax=axes[1]) + returned = df.hist(by="classroom", ax=axes[1]) self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) tm.assert_numpy_array_equal(returned, axes[1]) assert returned[0].figure is fig @@ -399,13 +403,13 @@ def test_grouped_hist_multiple_axes(self): with pytest.raises(ValueError): fig, axes = self.plt.subplots(2, 3) # pass different number of axes from required - axes = df.hist(column='height', ax=axes) + axes = df.hist(column="height", ax=axes) @pytest.mark.slow def test_axis_share_x(self): df = self.hist_df # GH4089 - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True) # share x assert ax1._shared_x_axes.joined(ax1, ax2) @@ -418,7 +422,7 @@ def test_axis_share_x(self): @pytest.mark.slow def test_axis_share_y(self): df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True) # share y assert ax1._shared_y_axes.joined(ax1, ax2) @@ -431,8 +435,7 @@ def test_axis_share_y(self): @pytest.mark.slow def test_axis_share_xy(self): df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, - sharey=True) + ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True) # share both x and y assert ax1._shared_x_axes.joined(ax1, ax2) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index b27df946aeacf..6cb6f818d40fd 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -28,71 +28,76 @@ def test_import_error_message(): def test_get_accessor_args(): func = plotting._core.PlotAccessor._get_call_args - msg = 'Called plot accessor for type list, expected Series or DataFrame' + msg = "Called plot accessor for type list, expected Series or DataFrame" with pytest.raises(TypeError, match=msg): - func(backend_name='', data=[], args=[], kwargs={}) + func(backend_name="", data=[], args=[], kwargs={}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - x, y, kind, kwargs = func(backend_name='', data=Series(), - args=['line', None], kwargs={}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + x, y, kind, kwargs = func( + backend_name="", data=Series(), args=["line", None], kwargs={} + ) assert x is None assert y is None - assert kind == 'line' - assert kwargs == {'ax': None} - - x, y, kind, kwargs = func(backend_name='', data=DataFrame(), - args=['x'], kwargs={'y': 'y', - 'kind': 'bar', - 'grid': False}) - assert x == 'x' - assert y == 'y' - assert kind == 'bar' - assert kwargs == {'grid': False} - - x, y, kind, kwargs = func(backend_name='pandas.plotting._matplotlib', - data=Series(), args=[], kwargs={}) + assert kind == "line" + assert kwargs == {"ax": None} + + x, y, kind, kwargs = func( + backend_name="", + data=DataFrame(), + args=["x"], + kwargs={"y": "y", "kind": "bar", "grid": False}, + ) + assert x == "x" + assert y == "y" + assert kind == "bar" + assert kwargs == {"grid": False} + + x, y, kind, kwargs = func( + backend_name="pandas.plotting._matplotlib", data=Series(), args=[], kwargs={} + ) assert x is None assert y is None - assert kind == 'line' + assert kind == "line" assert len(kwargs) == 22 @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - 
self.ts.name = 'ts' + self.ts.name = "ts" @pytest.mark.slow def test_autocorrelation_plot(self): from pandas.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, series=self.ts) _check_plot_works(autocorrelation_plot, series=self.ts.values) - ax = autocorrelation_plot(self.ts, label='Test') - self._check_legend_labels(ax, labels=['Test']) + ax = autocorrelation_plot(self.ts, label="Test") + self._check_legend_labels(ax, labels=["Test"]) @pytest.mark.slow def test_lag_plot(self): from pandas.plotting import lag_plot + _check_plot_works(lag_plot, series=self.ts) _check_plot_works(lag_plot, series=self.ts, lag=5) @pytest.mark.slow def test_bootstrap_plot(self): from pandas.plotting import bootstrap_plot + _check_plot_works(bootstrap_plot, series=self.ts, size=10) @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): - @td.skip_if_no_scipy def test_scatter_matrix_axis(self): scatter_matrix = plotting.scatter_matrix @@ -102,27 +107,27 @@ def test_scatter_matrix_axis(self): # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() # GH 5662 - expected = ['-2', '0', '2'] + expected = ["-2", "0", "2"] self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - df[0] = ((df[0] - 2) / 3) + df[0] = (df[0] - 2) / 3 # we are plotting multiples on a sub-plot with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) + axes = _check_plot_works( + scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 + ) axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ['-1.0', '-0.5', '0.0'] + expected = ["-1.0", "-0.5", "0.0"] self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + self._check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow def test_andrews_curves(self, iris): @@ -131,63 +136,76 @@ def test_andrews_curves(self, iris): df = iris - _check_plot_works(andrews_curves, frame=df, class_column='Name') + _check_plot_works(andrews_curves, frame=df, class_column="Name") - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in 
np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) length = 10 - df = DataFrame({"A": random.rand(length), - "B": random.rand(length), - "C": random.rand(length), - "Name": ["A"] * length}) - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) + df = DataFrame( + { + "A": random.rand(length), + "B": random.rand(length), + "C": random.rand(length), + "Name": ["A"] * length, + } + ) + + _check_plot_works(andrews_curves, frame=df, class_column="Name") + + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + andrews_curves, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = andrews_curves(df, 'Name', color=colors) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) + + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = andrews_curves(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) with tm.assert_produces_warning(FutureWarning): - andrews_curves(data=df, class_column='Name') + andrews_curves(data=df, class_column="Name") @pytest.mark.slow def test_parallel_coordinates(self, iris): @@ -196,46 +214,49 @@ def test_parallel_coordinates(self, iris): df = iris - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name') + ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") nlines = len(ax.get_lines()) nxticks = len(ax.xaxis.get_ticklabels()) - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=rgba + ) self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10] + ) - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = 
_check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=cnames) + cnames = ["dodgerblue", "aquamarine", "seagreen"] + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", color=cnames + ) self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10] + ) - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet + ) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10] + ) - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', axvlines=False) + ax = _check_plot_works( + parallel_coordinates, frame=df, class_column="Name", axvlines=False + ) assert len(ax.get_lines()) == (nlines - nxticks) - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = parallel_coordinates(df, 'Name', color=colors) + colors = ["b", "g", "r"] + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) + ax = parallel_coordinates(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, linecolors=colors) with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(data=df, class_column='Name') + parallel_coordinates(data=df, class_column="Name") with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(df, 'Name', colors=colors) + parallel_coordinates(df, "Name", colors=colors) # not sure if this is indicative of a problem @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning") @@ -243,18 +264,24 @@ def test_parallel_coordinates_with_sorted_labels(self): """ For #15908 """ from pandas.plotting import parallel_coordinates - df = DataFrame({"feat": [i for i in range(30)], - "class": [2 for _ in range(10)] + - [3 for _ in range(10)] + - [1 for _ in range(10)]}) - ax = parallel_coordinates(df, 'class', sort_labels=True) + df = DataFrame( + { + "feat": [i for i in range(30)], + "class": [2 for _ in range(10)] + + [3 for _ in range(10)] + + [1 for _ in range(10)], + } + ) + ax = parallel_coordinates(df, "class", sort_labels=True) polylines, labels = ax.get_legend_handles_labels() - color_label_tuples = \ - zip([polyline.get_color() for polyline in polylines], labels) - ordered_color_label_tuples = sorted(color_label_tuples, - key=lambda x: x[1]) - prev_next_tupels = zip([i for i in ordered_color_label_tuples[0:-1]], - [i for i in ordered_color_label_tuples[1:]]) + color_label_tuples = zip( + [polyline.get_color() for polyline in polylines], labels + ) + ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1]) + prev_next_tupels = zip( + [i for i in ordered_color_label_tuples[0:-1]], + [i for i in ordered_color_label_tuples[1:]], + ) for prev, nxt in prev_next_tupels: # labels and colors are ordered strictly increasing assert prev[1] < nxt[1] and prev[0] < nxt[0] @@ -265,41 +292,35 @@ def test_radviz(self, iris): from matplotlib import cm df = iris - _check_plot_works(radviz, frame=df, class_column='Name') + _check_plot_works(radviz, frame=df, class_column="Name") - rgba = ('#556270', '#4ECDC4', '#C7F464') - 
ax = _check_plot_works( - radviz, frame=df, class_column='Name', color=rgba) + rgba = ("#556270", "#4ECDC4", "#C7F464") + ax = _check_plot_works(radviz, frame=df, class_column="Name", color=rgba) # skip Circle drawn as ticks - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors( - patches[:10], facecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) - - _check_plot_works(radviz, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = [cm.jet(n) for n in np.linspace(0, 1, df['Name'].nunique())] - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) - - colors = [[0., 0., 1., 1.], - [0., 0.5, 1., 1.], - [1., 0., 0., 1.]] - df = DataFrame({"A": [1, 2, 3], - "B": [2, 1, 3], - "C": [3, 2, 1], - "Name": ['b', 'g', 'r']}) - ax = radviz(df, 'Name', color=colors) + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches[:10], facecolors=rgba, mapping=df["Name"][:10]) + + cnames = ["dodgerblue", "aquamarine", "seagreen"] + _check_plot_works(radviz, frame=df, class_column="Name", color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cnames, mapping=df["Name"][:10]) + + _check_plot_works(radviz, frame=df, class_column="Name", colormap=cm.jet) + cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] + patches = [p for p in ax.patches[:20] if p.get_label() != ""] + self._check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10]) + + colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]] + df = DataFrame( + {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]} + ) + ax = radviz(df, "Name", color=colors) handles, labels = ax.get_legend_handles_labels() self._check_colors(handles, facecolors=colors) @pytest.mark.slow def test_subplot_titles(self, iris): - df = iris.drop('Name', axis=1).head() + df = iris.drop("Name", axis=1).head() # Use the column names as the subplot titles title = list(df.columns) @@ -308,8 +329,10 @@ def test_subplot_titles(self, iris): assert [p.get_title() for p in plot] == title # Case len(title) > len(df) - msg = ("The length of `title` must equal the number of columns if" - " using `title` of type `list` and `subplots=True`") + msg = ( + "The length of `title` must equal the number of columns if" + " using `title` of type `list` and `subplots=True`" + ) with pytest.raises(ValueError, match=msg): df.plot(subplots=True, title=title + ["kittens > puppies"]) @@ -318,16 +341,19 @@ def test_subplot_titles(self, iris): df.plot(subplots=True, title=title[:2]) # Case subplots=False and title is of type list - msg = ("Using `title` of type `list` is not supported unless" - " `subplots=True` is passed") + msg = ( + "Using `title` of type `list` is not supported unless" + " `subplots=True` is passed" + ) with pytest.raises(ValueError, match=msg): df.plot(subplots=False, title=title) # Case df with 3 numeric columns but layout of (2,2) - plot = df.drop('SepalWidth', axis=1).plot(subplots=True, layout=(2, 2), - title=title[:-1]) + plot = df.drop("SepalWidth", axis=1).plot( + subplots=True, layout=(2, 2), title=title[:-1] + ) title_list = [ax.get_title() for sublist in plot 
for ax in sublist] - assert title_list == title[:3] + [''] + assert title_list == title[:3] + [""] def test_get_standard_colors_random_seed(self): # GH17525 @@ -342,32 +368,34 @@ def test_get_standard_colors_random_seed(self): # Make sure it produces the same colors every time it's called from pandas.plotting._matplotlib.style import _get_standard_colors - color1 = _get_standard_colors(1, color_type='random') - color2 = _get_standard_colors(1, color_type='random') + + color1 = _get_standard_colors(1, color_type="random") + color2 = _get_standard_colors(1, color_type="random") assert color1 == color2 def test_get_standard_colors_default_num_colors(self): from pandas.plotting._matplotlib.style import _get_standard_colors # Make sure the default color_types returns the specified amount - color1 = _get_standard_colors(1, color_type='default') - color2 = _get_standard_colors(9, color_type='default') - color3 = _get_standard_colors(20, color_type='default') + color1 = _get_standard_colors(1, color_type="default") + color2 = _get_standard_colors(9, color_type="default") + color3 = _get_standard_colors(20, color_type="default") assert len(color1) == 1 assert len(color2) == 9 assert len(color3) == 20 def test_plot_single_color(self): # Example from #20585. All 3 bars should have the same color - df = DataFrame({'account-start': ['2017-02-03', '2017-03-03', - '2017-01-01'], - 'client': ['Alice Anders', 'Bob Baker', - 'Charlie Chaplin'], - 'balance': [-1432.32, 10.43, 30000.00], - 'db-id': [1234, 2424, 251], - 'proxy-id': [525, 1525, 2542], - 'rank': [52, 525, 32], - }) + df = DataFrame( + { + "account-start": ["2017-02-03", "2017-03-03", "2017-01-01"], + "client": ["Alice Anders", "Bob Baker", "Charlie Chaplin"], + "balance": [-1432.32, 10.43, 30000.00], + "db-id": [1234, 2424, 251], + "proxy-id": [525, 1525, 2542], + "rank": [52, 525, 32], + } + ) ax = df.client.value_counts().plot.bar() colors = [rect.get_facecolor() for rect in ax.get_children()[0:3]] assert all(color == colors[0] for color in colors) @@ -379,6 +407,7 @@ def test_get_standard_colors_no_appending(self): # correctly. 
from matplotlib import cm from pandas.plotting._matplotlib.style import _get_standard_colors + color_before = cm.gnuplot(range(5)) color_after = _get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) @@ -387,5 +416,4 @@ def test_get_standard_colors_no_appending(self): color_list = cm.gnuplot(np.linspace(0, 1, 16)) p = df.A.plot.bar(figsize=(16, 7), color=color_list) - assert (p.patches[1].get_facecolor() - == p.patches[17].get_facecolor()) + assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor() diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index d10620b4e7547..4c5b1e66d0075 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -22,47 +22,47 @@ @td.skip_if_no_mpl class TestSeriesPlots(TestPlotBase): - def setup_method(self, method): TestPlotBase.setup_method(self, method) import matplotlib as mpl + mpl.rcdefaults() self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' + self.ts.name = "ts" self.series = tm.makeStringSeries() - self.series.name = 'series' + self.series.name = "series" self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' + self.iseries.name = "iseries" @pytest.mark.slow def test_plot(self): - _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, label="foo") _check_plot_works(self.ts.plot, use_index=False) axes = _check_plot_works(self.ts.plot, rot=0) self._check_ticks_props(axes, xrot=0) - ax = _check_plot_works(self.ts.plot, style='.', logy=True) - self._check_ax_scales(ax, yaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", logy=True) + self._check_ax_scales(ax, yaxis="log") - ax = _check_plot_works(self.ts.plot, style='.', logx=True) - self._check_ax_scales(ax, xaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", logx=True) + self._check_ax_scales(ax, xaxis="log") - ax = _check_plot_works(self.ts.plot, style='.', loglog=True) - self._check_ax_scales(ax, xaxis='log', yaxis='log') + ax = _check_plot_works(self.ts.plot, style=".", loglog=True) + self._check_ax_scales(ax, xaxis="log", yaxis="log") _check_plot_works(self.ts[:10].plot.bar) _check_plot_works(self.ts.plot.area, stacked=False) _check_plot_works(self.iseries.plot) - for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: + for kind in ["line", "bar", "barh", "kde", "hist", "box"]: _check_plot_works(self.series[:5].plot, kind=kind) _check_plot_works(self.series[:10].plot.barh) - ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') - self._check_colors([ax.patches[0]], facecolors=['black']) + ax = _check_plot_works(Series(randn(10)).plot.bar, color="black") + self._check_colors([ax.patches[0]], facecolors=["black"]) # GH 6951 ax = _check_plot_works(self.ts.plot, subplots=True) @@ -77,13 +77,13 @@ def test_plot(self): def test_plot_figsize_and_title(self): # figsize and title _, ax = self.plt.subplots() - ax = self.series.plot(title='Test', figsize=(16, 8), ax=ax) - self._check_text_labels(ax.title, 'Test') + ax = self.series.plot(title="Test", figsize=(16, 8), ax=ax) + self._check_text_labels(ax.title, "Test") self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) def test_dont_modify_rcParams(self): # GH 8242 - key = 'axes.prop_cycle' + key = "axes.prop_cycle" colors = self.plt.rcParams[key] _, ax = self.plt.subplots() Series([1, 2, 3]).plot(ax=ax) @@ -123,7 +123,7 @@ def test_ts_area_lim(self): tm.close() tz_ts = self.ts.copy() - tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') + 
tz_ts.index = tz_ts.tz_localize("GMT").tz_convert("CET") _, ax = self.plt.subplots() ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() @@ -142,30 +142,30 @@ def test_ts_area_lim(self): def test_label(self): s = Series([1, 2]) _, ax = self.plt.subplots() - ax = s.plot(label='LABEL', legend=True, ax=ax) - self._check_legend_labels(ax, labels=['LABEL']) + ax = s.plot(label="LABEL", legend=True, ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) self.plt.close() _, ax = self.plt.subplots() ax = s.plot(legend=True, ax=ax) - self._check_legend_labels(ax, labels=['None']) + self._check_legend_labels(ax, labels=["None"]) self.plt.close() # get name from index - s.name = 'NAME' + s.name = "NAME" _, ax = self.plt.subplots() ax = s.plot(legend=True, ax=ax) - self._check_legend_labels(ax, labels=['NAME']) + self._check_legend_labels(ax, labels=["NAME"]) self.plt.close() # override the default _, ax = self.plt.subplots() - ax = s.plot(legend=True, label='LABEL', ax=ax) - self._check_legend_labels(ax, labels=['LABEL']) + ax = s.plot(legend=True, label="LABEL", ax=ax) + self._check_legend_labels(ax, labels=["LABEL"]) self.plt.close() # Add lebel info, but don't draw _, ax = self.plt.subplots() - ax = s.plot(legend=False, label='LABEL', ax=ax) + ax = s.plot(legend=False, label="LABEL", ax=ax) assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it - self._check_legend_labels(ax, labels=['LABEL']) + self._check_legend_labels(ax, labels=["LABEL"]) def test_line_area_nan_series(self): values = [1, 2, np.nan, 3] @@ -179,7 +179,8 @@ def test_line_area_nan_series(self): exp = np.array([1, 2, 3], dtype=np.float64) tm.assert_numpy_array_equal(np.delete(masked.data, 2), exp) tm.assert_numpy_array_equal( - masked.mask, np.array([False, False, True, False])) + masked.mask, np.array([False, False, True, False]) + ) expected = np.array([1, 2, 0, 3], dtype=np.float64) ax = _check_plot_works(d.plot, stacked=True) @@ -190,16 +191,16 @@ def test_line_area_nan_series(self): tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) def test_line_use_index_false(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - s.index.name = 'The Index' + s = Series([1, 2, 3], index=["a", "b", "c"]) + s.index.name = "The Index" _, ax = self.plt.subplots() ax = s.plot(use_index=False, ax=ax) label = ax.get_xlabel() - assert label == '' + assert label == "" _, ax = self.plt.subplots() ax2 = s.plot.bar(use_index=False, ax=ax) label2 = ax2.get_xlabel() - assert label2 == '' + assert label2 == "" @pytest.mark.slow def test_bar_log(self): @@ -219,7 +220,7 @@ def test_bar_log(self): expected = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]) _, ax = self.plt.subplots() - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar', ax=ax) + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="bar", ax=ax) ymin = 0.0007943282347242822 ymax = 0.12589254117941673 res = ax.get_ylim() @@ -229,7 +230,7 @@ def test_bar_log(self): tm.close() _, ax = self.plt.subplots() - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh', ax=ax) + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind="barh", ax=ax) res = ax.get_xlim() tm.assert_almost_equal(res[0], ymin) tm.assert_almost_equal(res[1], ymax) @@ -237,19 +238,21 @@ def test_bar_log(self): @pytest.mark.slow def test_bar_ignore_index(self): - df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"]) _, ax = self.plt.subplots() ax = df.plot.bar(use_index=False, ax=ax) - 
self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) + self._check_text_labels(ax.get_xticklabels(), ["0", "1", "2", "3"]) def test_bar_user_colors(self): s = Series([1, 2, 3, 4]) - ax = s.plot.bar(color=['red', 'blue', 'blue', 'red']) + ax = s.plot.bar(color=["red", "blue", "blue", "red"]) result = [p.get_facecolor() for p in ax.patches] - expected = [(1., 0., 0., 1.), - (0., 0., 1., 1.), - (0., 0., 1., 1.), - (1., 0., 0., 1.)] + expected = [ + (1.0, 0.0, 0.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (0.0, 0.0, 1.0, 1.0), + (1.0, 0.0, 0.0, 1.0), + ] assert result == expected def test_rotation(self): @@ -264,18 +267,20 @@ def test_rotation(self): self._check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): - rng = date_range('1/1/2000', '3/1/2000') + rng = date_range("1/1/2000", "3/1/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(randn(len(rng)), rng) _, ax = self.plt.subplots() ax = ser.plot(ax=ax) xp = datetime(1999, 1, 1).toordinal() - ax.set_xlim('1/1/1999', '1/1/2001') + ax.set_xlim("1/1/1999", "1/1/2001") assert xp == ax.get_xlim()[0] def test_unsorted_index_xlim(self): - ser = Series([0., 1., np.nan, 3., 4., 5., 6.], - index=[1., 0., 3., 2., np.nan, 3., 2.]) + ser = Series( + [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0], + index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0], + ) _, ax = self.plt.subplots() ax = ser.plot(ax=ax) xmin, xmax = ax.get_xlim() @@ -287,36 +292,36 @@ def test_unsorted_index_xlim(self): def test_pie_series(self): # if sum of values is less than 1.0, pie handle them as rate and draw # semicircle. - series = Series(np.random.randint(1, 5), - index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') + series = Series( + np.random.randint(1, 5), index=["a", "b", "c", "d", "e"], name="YLABEL" + ) ax = _check_plot_works(series.plot.pie) self._check_text_labels(ax.texts, series.index) - assert ax.get_ylabel() == 'YLABEL' + assert ax.get_ylabel() == "YLABEL" # without wedge labels ax = _check_plot_works(series.plot.pie, labels=None) - self._check_text_labels(ax.texts, [''] * 5) + self._check_text_labels(ax.texts, [""] * 5) # with less colors than elements - color_args = ['r', 'g', 'b'] + color_args = ["r", "g", "b"] ax = _check_plot_works(series.plot.pie, colors=color_args) - color_expected = ['r', 'g', 'b', 'r', 'g'] + color_expected = ["r", "g", "b", "r", "g"] self._check_colors(ax.patches, facecolors=color_expected) # with labels and colors - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] - ax = _check_plot_works(series.plot.pie, labels=labels, - colors=color_args) + labels = ["A", "B", "C", "D", "E"] + color_args = ["r", "g", "b", "c", "m"] + ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args) self._check_text_labels(ax.texts, labels) self._check_colors(ax.patches, facecolors=color_args) # with autopct and fontsize - ax = _check_plot_works(series.plot.pie, colors=color_args, - autopct='%.2f', fontsize=7) - pcts = ['{0:.2f}'.format(s * 100) - for s in series.values / float(series.sum())] + ax = _check_plot_works( + series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7 + ) + pcts = ["{0:.2f}".format(s * 100) for s in series.values / float(series.sum())] expected_texts = list(chain.from_iterable(zip(series.index, pcts))) self._check_text_labels(ax.texts, expected_texts) for t in ax.texts: @@ -324,20 +329,19 @@ def test_pie_series(self): # includes negative value with pytest.raises(ValueError): - series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) + series = Series([1, 2, 0, 4, -1], 
index=["a", "b", "c", "d", "e"]) series.plot.pie() # includes nan - series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], - name='YLABEL') + series = Series([1, 2, np.nan, 4], index=["a", "b", "c", "d"], name="YLABEL") ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) + self._check_text_labels(ax.texts, ["a", "b", "", "d"]) def test_pie_nan(self): s = Series([1, np.nan, 1, 1]) _, ax = self.plt.subplots() ax = s.plot.pie(legend=True, ax=ax) - expected = ['0', '', '2', '3'] + expected = ["0", "", "2", "3"] result = [x.get_text() for x in ax.texts] assert result == expected @@ -352,9 +356,8 @@ def test_hist_df_kwargs(self): def test_hist_df_with_nonnumerics(self): # GH 9853 with tm.RNGContext(1): - df = DataFrame( - np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df['E'] = ['x', 'y'] * 5 + df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df["E"] = ["x", "y"] * 5 _, ax = self.plt.subplots() ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 20 @@ -370,11 +373,9 @@ def test_hist_legacy(self): _check_plot_works(self.ts.hist, figsize=(8, 10)) # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month) + _check_plot_works(self.ts.hist, by=self.ts.index.month) with tm.assert_produces_warning(UserWarning): - _check_plot_works(self.ts.hist, - by=self.ts.index.month, bins=5) + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) fig, ax = self.plt.subplots(1, 1) _check_plot_works(self.ts.hist, ax=ax) @@ -410,47 +411,40 @@ def test_hist_layout_with_by(self): # _check_plot_works adds an ax so catch warning. see GH #13188 with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(2, 1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.gender, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(4, 1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(2, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(2, -1)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(3, -1)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(3, -1)) self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.category, layout=(-1, 4)) + axes = _check_plot_works(df.height.hist, by=df.category, layout=(-1, 4)) self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(df.height.hist, - by=df.classroom, layout=(2, 2)) + axes = _check_plot_works(df.height.hist, by=df.classroom, layout=(2, 2)) self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) axes = df.height.hist(by=df.category, 
layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 2), - figsize=(12, 7)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) @pytest.mark.slow def test_hist_no_overlap(self): from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) y = Series(randn(2)) subplot(121) @@ -464,39 +458,38 @@ def test_hist_no_overlap(self): @pytest.mark.slow def test_hist_secondary_legend(self): # GH 9610 - df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) + df = DataFrame(np.random.randn(30, 4), columns=list("abcd")) # primary -> secondary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, ax=ax) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + ax = df["a"].plot.hist(legend=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b (right)']) + self._check_legend_labels(ax, labels=["a", "b (right)"]) assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() tm.close() # secondary -> secondary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) + df["b"].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are draw on left ax # left axis must be invisible, right axis must be visible - self._check_legend_labels(ax.left_ax, - labels=['a (right)', 'b (right)']) + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b (right)"]) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() # secondary -> primary _, ax = self.plt.subplots() - ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) + ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax) # right axes is returned - df['b'].plot.hist(ax=ax, legend=True) + df["b"].plot.hist(ax=ax, legend=True) # both legends are draw on left ax # left and right axis must be visible - self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b']) + self._check_legend_labels(ax.left_ax, labels=["a (right)", "b"]) assert ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() @@ -504,8 +497,8 @@ def test_hist_secondary_legend(self): @pytest.mark.slow def test_df_series_secondary_legend(self): # GH 9779 - df = DataFrame(np.random.randn(30, 3), columns=list('abc')) - s = Series(np.random.randn(30), name='x') + df = DataFrame(np.random.randn(30, 3), columns=list("abc")) + s = Series(np.random.randn(30), name="x") # primary -> secondary (without passing ax) _, ax = self.plt.subplots() @@ -513,7 +506,7 @@ def test_df_series_secondary_legend(self): s.plot(legend=True, secondary_y=True, ax=ax) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) assert ax.get_yaxis().get_visible() assert ax.right_ax.get_yaxis().get_visible() tm.close() @@ -524,7 +517,7 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) + self._check_legend_labels(ax, labels=["a", "b", "c", "x (right)"]) assert ax.get_yaxis().get_visible() assert 
ax.right_ax.get_yaxis().get_visible() tm.close() @@ -535,7 +528,7 @@ def test_df_series_secondary_legend(self): s.plot(legend=True, secondary_y=True, ax=ax) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] self._check_legend_labels(ax.left_ax, labels=expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() @@ -547,7 +540,7 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] + expected = ["a (right)", "b (right)", "c (right)", "x (right)"] self._check_legend_labels(ax.left_ax, expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() @@ -559,17 +552,16 @@ def test_df_series_secondary_legend(self): s.plot(ax=ax, legend=True, secondary_y=True) # both legends are dran on left ax # left axis must be invisible and right axis must be visible - expected = ['a', 'b', 'c', 'x (right)'] + expected = ["a", "b", "c", "x (right)"] self._check_legend_labels(ax.left_ax, expected) assert not ax.left_ax.get_yaxis().get_visible() assert ax.get_yaxis().get_visible() tm.close() @pytest.mark.slow - @pytest.mark.parametrize("input_logy, expected_scale", [ - (True, 'log'), - ('sym', 'symlog') - ]) + @pytest.mark.parametrize( + "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")] + ) def test_secondary_logy(self, input_logy, expected_scale): # GH 25545 s1 = Series(np.random.randn(30)) @@ -587,7 +579,7 @@ def test_plot_fails_with_dupe_color_and_style(self): x = Series(randn(2)) with pytest.raises(ValueError): _, ax = self.plt.subplots() - x.plot(style='k--', color='k', ax=ax) + x.plot(style="k--", color="k", ax=ax) @pytest.mark.slow @td.skip_if_no_scipy @@ -595,38 +587,36 @@ def test_hist_kde(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() # ticks are values, thus ticklabels are blank - self._check_text_labels(xlabels, [''] * len(xlabels)) + self._check_text_labels(xlabels, [""] * len(xlabels)) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) _check_plot_works(self.ts.plot.kde) _check_plot_works(self.ts.plot.density) _, ax = self.plt.subplots() ax = self.ts.plot.kde(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [''] * len(xlabels)) + self._check_text_labels(xlabels, [""] * len(xlabels)) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) @pytest.mark.slow @td.skip_if_no_scipy def test_kde_kwargs(self): sample_points = np.linspace(-100, 100, 20) - _check_plot_works(self.ts.plot.kde, bw_method='scott', ind=20) + _check_plot_works(self.ts.plot.kde, bw_method="scott", ind=20) _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20) _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20)) - _check_plot_works(self.ts.plot.kde, bw_method=.5, ind=sample_points) - _check_plot_works(self.ts.plot.density, bw_method=.5, 
- ind=sample_points) + _check_plot_works(self.ts.plot.kde, bw_method=0.5, ind=sample_points) + _check_plot_works(self.ts.plot.density, bw_method=0.5, ind=sample_points) _, ax = self.plt.subplots() - ax = self.ts.plot.kde(logy=True, bw_method=.5, ind=sample_points, - ax=ax) - self._check_ax_scales(ax, yaxis='log') - self._check_text_labels(ax.yaxis.get_label(), 'Density') + ax = self.ts.plot.kde(logy=True, bw_method=0.5, ind=sample_points, ax=ax) + self._check_ax_scales(ax, yaxis="log") + self._check_text_labels(ax.yaxis.get_label(), "Density") @pytest.mark.slow @td.skip_if_no_scipy @@ -643,49 +633,50 @@ def test_hist_kwargs(self): _, ax = self.plt.subplots() ax = self.ts.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 5 - self._check_text_labels(ax.yaxis.get_label(), 'Frequency') + self._check_text_labels(ax.yaxis.get_label(), "Frequency") tm.close() _, ax = self.plt.subplots() - ax = self.ts.plot.hist(orientation='horizontal', ax=ax) - self._check_text_labels(ax.xaxis.get_label(), 'Frequency') + ax = self.ts.plot.hist(orientation="horizontal", ax=ax) + self._check_text_labels(ax.xaxis.get_label(), "Frequency") tm.close() _, ax = self.plt.subplots() - ax = self.ts.plot.hist(align='left', stacked=True, ax=ax) + ax = self.ts.plot.hist(align="left", stacked=True, ax=ax) tm.close() @pytest.mark.slow @td.skip_if_no_scipy def test_hist_kde_color(self): _, ax = self.plt.subplots() - ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax) - self._check_ax_scales(ax, yaxis='log') + ax = self.ts.plot.hist(logy=True, bins=10, color="b", ax=ax) + self._check_ax_scales(ax, yaxis="log") assert len(ax.patches) == 10 - self._check_colors(ax.patches, facecolors=['b'] * 10) + self._check_colors(ax.patches, facecolors=["b"] * 10) _, ax = self.plt.subplots() - ax = self.ts.plot.kde(logy=True, color='r', ax=ax) - self._check_ax_scales(ax, yaxis='log') + ax = self.ts.plot.kde(logy=True, color="r", ax=ax) + self._check_ax_scales(ax, yaxis="log") lines = ax.get_lines() assert len(lines) == 1 - self._check_colors(lines, ['r']) + self._check_colors(lines, ["r"]) @pytest.mark.slow def test_boxplot_series(self): _, ax = self.plt.subplots() ax = self.ts.plot.box(logy=True, ax=ax) - self._check_ax_scales(ax, yaxis='log') + self._check_ax_scales(ax, yaxis="log") xlabels = ax.get_xticklabels() self._check_text_labels(xlabels, [self.ts.name]) ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) + self._check_text_labels(ylabels, [""] * len(ylabels)) @pytest.mark.slow def test_kind_both_ways(self): s = Series(range(3)) - kinds = (plotting.PlotAccessor._common_kinds + - plotting.PlotAccessor._series_kinds) + kinds = ( + plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds + ) _, ax = self.plt.subplots() for kind in kinds: @@ -694,7 +685,7 @@ def test_kind_both_ways(self): @pytest.mark.slow def test_invalid_plot_data(self): - s = Series(list('abcd')) + s = Series(list("abcd")) _, ax = self.plt.subplots() for kind in plotting.PlotAccessor._common_kinds: @@ -709,7 +700,7 @@ def test_valid_object_plot(self): _check_plot_works(s.plot, kind=kind) def test_partially_invalid_plot_data(self): - s = Series(['a', 'b', 1.0, 2]) + s = Series(["a", "b", 1.0, 2]) _, ax = self.plt.subplots() for kind in plotting.PlotAccessor._common_kinds: @@ -720,12 +711,12 @@ def test_partially_invalid_plot_data(self): def test_invalid_kind(self): s = Series([1, 2]) with pytest.raises(ValueError): - s.plot(kind='aasdf') + s.plot(kind="aasdf") @pytest.mark.slow def 
test_dup_datetime_index_plot(self): - dr1 = date_range('1/1/2009', periods=4) - dr2 = date_range('1/2/2009', periods=4) + dr1 = date_range("1/1/2009", periods=4) + dr2 = date_range("1/2/2009", periods=4) index = dr1.append(dr2) values = randn(index.size) s = Series(values, index=index) @@ -734,11 +725,11 @@ def test_dup_datetime_index_plot(self): @pytest.mark.slow def test_errorbar_plot(self): - s = Series(np.arange(10), name='x') + s = Series(np.arange(10), name="x") s_err = np.random.randn(10) - d_err = DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) + d_err = DataFrame(randn(10, 2), index=s.index, columns=["x", "y"]) # test line and bar plots - kinds = ['line', 'bar'] + kinds = ["line", "bar"] for kind in kinds: ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=1) @@ -755,10 +746,10 @@ def test_errorbar_plot(self): self._check_has_errorbars(ax, xerr=1, yerr=0) # test time series plotting - ix = date_range('1/1/2000', '1/1/2001', freq='M') - ts = Series(np.arange(12), index=ix, name='x') + ix = date_range("1/1/2000", "1/1/2001", freq="M") + ts = Series(np.arange(12), index=ix, name="x") ts_err = Series(np.random.randn(12), index=ix) - td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) + td_err = DataFrame(randn(12, 2), index=ix, columns=["x", "y"]) ax = _check_plot_works(ts.plot, yerr=ts_err) self._check_has_errorbars(ax, xerr=0, yerr=1) @@ -769,7 +760,7 @@ def test_errorbar_plot(self): with pytest.raises(ValueError): s.plot(yerr=np.arange(11)) - s_err = ['zzz'] * 10 + s_err = ["zzz"] * 10 with pytest.raises(TypeError): s.plot(yerr=s_err) @@ -780,15 +771,16 @@ def test_table(self): @pytest.mark.slow def test_series_grid_settings(self): # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - self._check_grid_settings(Series([1, 2, 3]), - plotting.PlotAccessor._series_kinds + - plotting.PlotAccessor._common_kinds) + self._check_grid_settings( + Series([1, 2, 3]), + plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds, + ) @pytest.mark.slow def test_standard_colors(self): from pandas.plotting._matplotlib.style import _get_standard_colors - for c in ['r', 'red', 'green', '#FF0000']: + for c in ["r", "red", "green", "#FF0000"]: result = _get_standard_colors(1, color=c) assert result == [c] @@ -837,21 +829,22 @@ def test_standard_colors_all(self): def test_series_plot_color_kwargs(self): # GH1890 _, ax = self.plt.subplots() - ax = Series(np.arange(12) + 1).plot(color='green', ax=ax) - self._check_colors(ax.get_lines(), linecolors=['green']) + ax = Series(np.arange(12) + 1).plot(color="green", ax=ax) + self._check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_kwargs(self): # #1890 _, ax = self.plt.subplots() - ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot(color='green', ax=ax) - self._check_colors(ax.get_lines(), linecolors=['green']) + ax = Series(np.arange(12) + 1, index=date_range("1/1/2000", periods=12)).plot( + color="green", ax=ax + ) + self._check_colors(ax.get_lines(), linecolors=["green"]) def test_time_series_plot_color_with_empty_kwargs(self): import matplotlib as mpl def_colors = self._unpack_cycler(mpl.rcParams) - index = date_range('1/1/2000', periods=12) + index = date_range("1/1/2000", periods=12) s = Series(np.arange(1, 13), index=index) ncolors = 3 @@ -863,18 +856,24 @@ def test_time_series_plot_color_with_empty_kwargs(self): def test_xticklabels(self): # GH11529 - s = Series(np.arange(10), 
index=['P%02d' % i for i in range(10)]) + s = Series(np.arange(10), index=["P%02d" % i for i in range(10)]) _, ax = self.plt.subplots() ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) - exp = ['P%02d' % i for i in [0, 3, 5, 9]] + exp = ["P%02d" % i for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) def test_custom_business_day_freq(self): # GH7222 from pandas.tseries.offsets import CustomBusinessDay - s = Series(range(100, 121), index=pd.bdate_range( - start='2014-05-01', end='2014-06-01', - freq=CustomBusinessDay(holidays=['2014-05-26']))) + + s = Series( + range(100, 121), + index=pd.bdate_range( + start="2014-05-01", + end="2014-06-01", + freq=CustomBusinessDay(holidays=["2014-05-26"]), + ), + ) _check_plot_works(s.plot) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 61ab759aa8d9b..1e7a40b9040b7 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -5,27 +5,40 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, NaT, Period, PeriodIndex, - RangeIndex, Series, Timedelta, TimedeltaIndex, Timestamp, isna, - timedelta_range, to_timedelta) + Categorical, + DataFrame, + DatetimeIndex, + Index, + NaT, + Period, + PeriodIndex, + RangeIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, + timedelta_range, + to_timedelta, +) from pandas.core import nanops import pandas.util.testing as tm def get_objs(): indexes = [ - tm.makeBoolIndex(10, name='a'), - tm.makeIntIndex(10, name='a'), - tm.makeFloatIndex(10, name='a'), - tm.makeDateIndex(10, name='a'), - tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern'), - tm.makePeriodIndex(10, name='a'), - tm.makeStringIndex(10, name='a'), - tm.makeUnicodeIndex(10, name='a') + tm.makeBoolIndex(10, name="a"), + tm.makeIntIndex(10, name="a"), + tm.makeFloatIndex(10, name="a"), + tm.makeDateIndex(10, name="a"), + tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern"), + tm.makePeriodIndex(10, name="a"), + tm.makeStringIndex(10, name="a"), + tm.makeUnicodeIndex(10, name="a"), ] arr = np.random.randn(10) - series = [Series(arr, index=idx, name='a') for idx in indexes] + series = [Series(arr, index=idx, name="a") for idx in indexes] objs = indexes + series return objs @@ -35,30 +48,29 @@ def get_objs(): class TestReductions: - - @pytest.mark.parametrize('opname', ['max', 'min']) - @pytest.mark.parametrize('obj', objs) + @pytest.mark.parametrize("opname", ["max", "min"]) + @pytest.mark.parametrize("obj", objs) def test_ops(self, opname, obj): result = getattr(obj, opname)() if not isinstance(obj, PeriodIndex): expected = getattr(obj.values, opname)() else: expected = pd.Period( - ordinal=getattr(obj._ndarray_values, opname)(), - freq=obj.freq) + ordinal=getattr(obj._ndarray_values, opname)(), freq=obj.freq + ) try: assert result == expected except TypeError: # comparing tz-aware series with np.array results in # TypeError - expected = expected.astype('M8[ns]').astype('int64') + expected = expected.astype("M8[ns]").astype("int64") assert result.value == expected def test_nanops(self): # GH#7261 - for opname in ['max', 'min']: + for opname in ["max", "min"]: for klass in [Index, Series]: - arg_op = 'arg' + opname if klass is Index else 'idx' + opname + arg_op = "arg" + opname if klass is Index else "idx" + opname obj = klass([np.nan, 2.0]) assert getattr(obj, opname)() == 2.0 @@ -108,7 +120,7 @@ def test_nanops(self): getattr(obj, arg_op)(skipna=False) # argmin/max - obj = 
Index(np.arange(5, dtype='int64')) + obj = Index(np.arange(5, dtype="int64")) assert obj.argmin() == 0 assert obj.argmax() == 4 @@ -124,8 +136,7 @@ def test_nanops(self): assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 - obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), - pd.NaT]) + obj = Index([pd.NaT, datetime(2011, 11, 1), datetime(2011, 11, 2), pd.NaT]) assert obj.argmin() == 1 assert obj.argmax() == 2 assert obj.argmin(skipna=False) == -1 @@ -137,15 +148,13 @@ def test_nanops(self): assert obj.argmin(skipna=False) == -1 assert obj.argmax(skipna=False) == -1 - @pytest.mark.parametrize('op, expected_col', [ - ['max', 'a'], ['min', 'b'] - ]) + @pytest.mark.parametrize("op, expected_col", [["max", "a"], ["min", "b"]]) def test_same_tz_min_max_axis_1(self, op, expected_col): # GH 10390 - df = DataFrame(pd.date_range('2016-01-01 00:00:00', periods=3, - tz='UTC'), - columns=['a']) - df['b'] = df.a.subtract(pd.Timedelta(seconds=3600)) + df = DataFrame( + pd.date_range("2016-01-01 00:00:00", periods=3, tz="UTC"), columns=["a"] + ) + df["b"] = df.a.subtract(pd.Timedelta(seconds=3600)) result = getattr(df, op)(axis=1) expected = df[expected_col].rename(None) tm.assert_series_equal(result, expected) @@ -156,9 +165,16 @@ class TestIndexReductions: # were moved from a Index-specific test file, _not_ that these tests are # intended long-term to be Index-specific - @pytest.mark.parametrize('start,stop,step', - [(0, 400, 3), (500, 0, -6), (-10**6, 10**6, 4), - (10**6, -10**6, -4), (0, 10, 20)]) + @pytest.mark.parametrize( + "start,stop,step", + [ + (0, 400, 3), + (500, 0, -6), + (-10 ** 6, 10 ** 6, 4), + (10 ** 6, -10 ** 6, -4), + (0, 10, 20), + ], + ) def test_max_min_range(self, start, stop, step): # GH#17607 idx = RangeIndex(start, stop, step) @@ -186,20 +202,20 @@ def test_max_min_range(self, start, stop, step): def test_minmax_timedelta64(self): # monotonic - idx1 = TimedeltaIndex(['1 days', '2 days', '3 days']) + idx1 = TimedeltaIndex(["1 days", "2 days", "3 days"]) assert idx1.is_monotonic # non-monotonic - idx2 = TimedeltaIndex(['1 days', np.nan, '3 days', 'NaT']) + idx2 = TimedeltaIndex(["1 days", np.nan, "3 days", "NaT"]) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert idx.min() == Timedelta('1 days') - assert idx.max() == Timedelta('3 days') + assert idx.min() == Timedelta("1 days") + assert idx.max() == Timedelta("3 days") assert idx.argmin() == 0 assert idx.argmax() == 2 - for op in ['min', 'max']: + for op in ["min", "max"]: # Return NaT obj = TimedeltaIndex([]) assert pd.isna(getattr(obj, op)()) @@ -211,10 +227,10 @@ def test_minmax_timedelta64(self): assert pd.isna(getattr(obj, op)()) def test_numpy_minmax_timedelta64(self): - td = timedelta_range('16815 days', '16820 days', freq='D') + td = timedelta_range("16815 days", "16820 days", freq="D") - assert np.min(td) == Timedelta('16815 days') - assert np.max(td) == Timedelta('16820 days') + assert np.min(td) == Timedelta("16815 days") + assert np.max(td) == Timedelta("16820 days") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -234,8 +250,9 @@ def test_numpy_minmax_timedelta64(self): def test_timedelta_ops(self): # GH#4984 # make sure ops return Timedelta - s = Series([Timestamp('20130101') + timedelta(seconds=i * i) - for i in range(10)]) + s = Series( + [Timestamp("20130101") + timedelta(seconds=i * i) for i in range(10)] + ) td = s.diff() result = td.mean() @@ -245,12 +262,12 @@ def test_timedelta_ops(self): result = 
td.to_frame().mean() assert result[0] == expected - result = td.quantile(.1) - expected = Timedelta(np.timedelta64(2600, 'ms')) + result = td.quantile(0.1) + expected = Timedelta(np.timedelta64(2600, "ms")) assert result == expected result = td.median() - expected = to_timedelta('00:00:09') + expected = to_timedelta("00:00:09") assert result == expected result = td.to_frame().median() @@ -259,7 +276,7 @@ def test_timedelta_ops(self): # GH#6462 # consistency in returned values for sum result = td.sum() - expected = to_timedelta('00:01:21') + expected = to_timedelta("00:01:21") assert result == expected result = td.to_frame().sum() @@ -274,39 +291,40 @@ def test_timedelta_ops(self): assert result[0] == expected # invalid ops - for op in ['skew', 'kurt', 'sem', 'prod']: + for op in ["skew", "kurt", "sem", "prod"]: msg = "reduction operation '{}' not allowed for this dtype" with pytest.raises(TypeError, match=msg.format(op)): getattr(td, op)() # GH#10040 # make sure NaT is properly handled by median() - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07')]) + s = Series([Timestamp("2015-02-03"), Timestamp("2015-02-07")]) assert s.diff().median() == timedelta(days=4) - s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), - Timestamp('2015-02-15')]) + s = Series( + [Timestamp("2015-02-03"), Timestamp("2015-02-07"), Timestamp("2015-02-15")] + ) assert s.diff().median() == timedelta(days=6) def test_minmax_tz(self, tz_naive_fixture): tz = tz_naive_fixture # monotonic - idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2011-01-03'], tz=tz) + idx1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz=tz) assert idx1.is_monotonic # non-monotonic - idx2 = pd.DatetimeIndex(['2011-01-01', pd.NaT, '2011-01-03', - '2011-01-02', pd.NaT], tz=tz) + idx2 = pd.DatetimeIndex( + ["2011-01-01", pd.NaT, "2011-01-03", "2011-01-02", pd.NaT], tz=tz + ) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert idx.min() == Timestamp('2011-01-01', tz=tz) - assert idx.max() == Timestamp('2011-01-03', tz=tz) + assert idx.min() == Timestamp("2011-01-01", tz=tz) + assert idx.max() == Timestamp("2011-01-03", tz=tz) assert idx.argmin() == 0 assert idx.argmax() == 2 - @pytest.mark.parametrize('op', ['min', 'max']) + @pytest.mark.parametrize("op", ["min", "max"]) def test_minmax_nat_datetime64(self, op): # Return NaT obj = DatetimeIndex([]) @@ -372,10 +390,10 @@ def test_numpy_minmax_range(self): # is the same as basic integer index def test_numpy_minmax_datetime64(self): - dr = pd.date_range(start='2016-01-15', end='2016-01-20') + dr = pd.date_range(start="2016-01-15", end="2016-01-20") - assert np.min(dr) == Timestamp('2016-01-15 00:00:00', freq='D') - assert np.max(dr) == Timestamp('2016-01-20 00:00:00', freq='D') + assert np.min(dr) == Timestamp("2016-01-15 00:00:00", freq="D") + assert np.max(dr) == Timestamp("2016-01-20 00:00:00", freq="D") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -397,42 +415,42 @@ def test_numpy_minmax_datetime64(self): def test_minmax_period(self): # monotonic - idx1 = pd.PeriodIndex([NaT, '2011-01-01', '2011-01-02', - '2011-01-03'], freq='D') + idx1 = pd.PeriodIndex([NaT, "2011-01-01", "2011-01-02", "2011-01-03"], freq="D") assert idx1.is_monotonic # non-monotonic - idx2 = pd.PeriodIndex(['2011-01-01', NaT, '2011-01-03', - '2011-01-02', NaT], freq='D') + idx2 = pd.PeriodIndex( + ["2011-01-01", NaT, "2011-01-03", "2011-01-02", NaT], freq="D" + ) assert not idx2.is_monotonic for idx in [idx1, idx2]: - assert 
idx.min() == pd.Period('2011-01-01', freq='D') - assert idx.max() == pd.Period('2011-01-03', freq='D') + assert idx.min() == pd.Period("2011-01-01", freq="D") + assert idx.max() == pd.Period("2011-01-03", freq="D") assert idx1.argmin() == 1 assert idx2.argmin() == 0 assert idx1.argmax() == 3 assert idx2.argmax() == 2 - for op in ['min', 'max']: + for op in ["min", "max"]: # Return NaT - obj = PeriodIndex([], freq='M') + obj = PeriodIndex([], freq="M") result = getattr(obj, op)() assert result is NaT - obj = PeriodIndex([NaT], freq='M') + obj = PeriodIndex([NaT], freq="M") result = getattr(obj, op)() assert result is NaT - obj = PeriodIndex([NaT, NaT, NaT], freq='M') + obj = PeriodIndex([NaT, NaT, NaT], freq="M") result = getattr(obj, op)() assert result is NaT def test_numpy_minmax_period(self): - pr = pd.period_range(start='2016-01-15', end='2016-01-20') + pr = pd.period_range(start="2016-01-15", end="2016-01-20") - assert np.min(pr) == Period('2016-01-15', freq='D') - assert np.max(pr) == Period('2016-01-20', freq='D') + assert np.min(pr) == Period("2016-01-15", freq="D") + assert np.max(pr) == Period("2016-01-20", freq="D") errmsg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=errmsg): @@ -451,19 +469,15 @@ def test_numpy_minmax_period(self): def test_min_max_categorical(self): - ci = pd.CategoricalIndex(list('aabbca'), - categories=list('cab'), - ordered=False) + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) with pytest.raises(TypeError): ci.min() with pytest.raises(TypeError): ci.max() - ci = pd.CategoricalIndex(list('aabbca'), - categories=list('cab'), - ordered=True) - assert ci.min() == 'c' - assert ci.max() == 'b' + ci = pd.CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=True) + assert ci.min() == "c" + assert ci.max() == "b" class TestSeriesReductions: @@ -480,7 +494,7 @@ def test_sum_inf(self): assert np.isinf(s.sum()) - arr = np.random.randn(100, 100).astype('f4') + arr = np.random.randn(100, 100).astype("f4") arr[:, 2] = np.inf with pd.option_context("mode.use_inf_as_na", True): @@ -490,10 +504,7 @@ def test_sum_inf(self): assert np.isinf(res).all() @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method, unit", [ - ("sum", 0.0), - ("prod", 1.0) - ]) + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 @@ -584,30 +595,28 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(min_count=2) assert pd.isna(result) - @pytest.mark.parametrize('method, unit', [ - ('sum', 0.0), - ('prod', 1.0), - ]) + @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty_multi(self, method, unit): - s = pd.Series([1, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + s = pd.Series( + [1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([("a", "b"), (0, 1)]), + ) # 1 / 0 by default result = getattr(s, method)(level=0) - expected = pd.Series([1, unit], index=['a', 'b']) + expected = pd.Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=0 result = getattr(s, method)(level=0, min_count=0) - expected = pd.Series([1, unit], index=['a', 'b']) + expected = pd.Series([1, unit], index=["a", "b"]) tm.assert_series_equal(result, expected) # min_count=1 result = getattr(s, method)(level=0, min_count=1) - 
expected = pd.Series([1, np.nan], index=['a', 'b']) + expected = pd.Series([1, np.nan], index=["a", "b"]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "method", ['mean', 'median', 'std', 'var']) + @pytest.mark.parametrize("method", ["mean", "median", "std", "var"]) def test_ops_consistency_on_empty(self, method): # GH#7869 @@ -618,7 +627,7 @@ def test_ops_consistency_on_empty(self, method): assert pd.isna(result) # timedelta64[ns] - result = getattr(Series(dtype='m8[ns]'), method)() + result = getattr(Series(dtype="m8[ns]"), method)() assert result is pd.NaT def test_nansum_buglet(self): @@ -629,21 +638,21 @@ def test_nansum_buglet(self): @pytest.mark.parametrize("use_bottleneck", [True, False]) def test_sum_overflow(self, use_bottleneck): - with pd.option_context('use_bottleneck', use_bottleneck): + with pd.option_context("use_bottleneck", use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes - for dtype in ['int32', 'int64']: + for dtype in ["int32", "int64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) - assert int(result) == v.sum(dtype='int64') + assert int(result) == v.sum(dtype="int64") result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] - for dtype in ['float32', 'float64']: + for dtype in ["float32", "float64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) @@ -656,7 +665,7 @@ def test_sum_overflow(self, use_bottleneck): def test_empty_timeseries_reductions_return_nat(self): # covers GH#11245 - for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): + for dtype in ("m8[ns]", "m8[ns]", "M8[ns]", "M8[ns, UTC]"): assert Series([], dtype=dtype).min() is pd.NaT assert Series([], dtype=dtype).max() is pd.NaT assert Series([], dtype=dtype).min(skipna=False) is pd.NaT @@ -681,8 +690,7 @@ def test_numpy_argmin_deprecated(self): assert result == 1 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.argmin(s, out=data) @@ -705,8 +713,7 @@ def test_numpy_argmax_deprecated(self): assert result == 10 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): np.argmax(s, out=data) @@ -714,7 +721,7 @@ def test_numpy_argmax_deprecated(self): def test_idxmin(self): # test idxmin # _check_stat_op approach can not be used here because of isna check. - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") # add some NaNs string_series[5:15] = np.NaN @@ -726,15 +733,14 @@ def test_idxmin(self): # no NaNs nona = string_series.dropna() assert nona[nona.idxmin()] == nona.min() - assert (nona.index.values.tolist().index(nona.idxmin()) == - nona.values.argmin()) + assert nona.index.values.tolist().index(nona.idxmin()) == nona.values.argmin() # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmin()) # datetime64[ns] - s = Series(pd.date_range('20130102', periods=6)) + s = Series(pd.date_range("20130102", periods=6)) result = s.idxmin() assert result == 0 @@ -745,7 +751,7 @@ def test_idxmin(self): def test_idxmax(self): # test idxmax # _check_stat_op approach can not be used here because of isna check. 
- string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") # add some NaNs string_series[5:15] = np.NaN @@ -757,15 +763,15 @@ def test_idxmax(self): # no NaNs nona = string_series.dropna() assert nona[nona.idxmax()] == nona.max() - assert (nona.index.values.tolist().index(nona.idxmax()) == - nona.values.argmax()) + assert nona.index.values.tolist().index(nona.idxmax()) == nona.values.argmax() # all NaNs allna = string_series * np.nan assert pd.isna(allna.idxmax()) from pandas import date_range - s = Series(date_range('20130102', periods=6)) + + s = Series(date_range("20130102", periods=6)) result = s.idxmax() assert result == 5 @@ -794,8 +800,8 @@ def test_all_any(self): assert bool_series.any() # Alternative types, with implicit 'object' dtype. - s = Series(['abc', True]) - assert 'abc' == s.any() # 'abc' || True => 'abc' + s = Series(["abc", True]) + assert "abc" == s.any() # 'abc' || True => 'abc' def test_all_any_params(self): # Check skipna, with implicit 'object' dtype. @@ -807,8 +813,7 @@ def test_all_any_params(self): assert not s2.any(skipna=True) # Check level. - s = pd.Series([False, False, True, True, False, True], - index=[0, 0, 1, 1, 2, 2]) + s = pd.Series([False, False, True, True, False, True], index=[0, 0, 1, 1, 2, 2]) tm.assert_series_equal(s.all(level=0), Series([False, True, False])) tm.assert_series_equal(s.any(level=0), Series([False, True, True])) @@ -820,15 +825,15 @@ def test_all_any_params(self): # bool_only is not implemented alone. with pytest.raises(NotImplementedError): - s.any(bool_only=True,) + s.any(bool_only=True) with pytest.raises(NotImplementedError): s.all(bool_only=True) def test_timedelta64_analytics(self): # index min/max - dti = pd.date_range('2012-1-1', periods=3, freq='D') - td = Series(dti) - pd.Timestamp('20120101') + dti = pd.date_range("2012-1-1", periods=3, freq="D") + td = Series(dti) - pd.Timestamp("20120101") result = td.idxmin() assert result == 0 @@ -847,8 +852,8 @@ def test_timedelta64_analytics(self): assert result == 2 # abs - s1 = Series(pd.date_range('20120101', periods=3)) - s2 = Series(pd.date_range('20120102', periods=3)) + s1 = Series(pd.date_range("20120101", periods=3)) + s2 = Series(pd.date_range("20120102", periods=3)) expected = Series(s2 - s1) # FIXME: don't leave commented-out code @@ -861,28 +866,23 @@ def test_timedelta64_analytics(self): # max/min result = td.max() - expected = pd.Timedelta('2 days') + expected = pd.Timedelta("2 days") assert result == expected result = td.min() - expected = pd.Timedelta('1 days') + expected = pd.Timedelta("1 days") assert result == expected @pytest.mark.parametrize( "test_input,error_type", [ (pd.Series([]), ValueError), - # For strings, or any Series with dtype 'O' - (pd.Series(['foo', 'bar', 'baz']), TypeError), + (pd.Series(["foo", "bar", "baz"]), TypeError), (pd.Series([(1,), (2,)]), TypeError), - # For mixed data types - ( - pd.Series(['foo', 'foo', 'bar', 'bar', None, np.nan, 'baz']), - TypeError - ), - ] + (pd.Series(["foo", "foo", "bar", "bar", None, np.nan, "baz"]), TypeError), + ], ) def test_assert_idxminmax_raises(self, test_input, error_type): """ @@ -909,7 +909,7 @@ def test_idxminmax_with_inf(self): # Using old-style behavior that treats floating point nan, -inf, and # +inf as missing - with pd.option_context('mode.use_inf_as_na', True): + with pd.option_context("mode.use_inf_as_na", True): assert s.idxmin() == 0 assert np.isnan(s.idxmin(skipna=False)) assert s.idxmax() == 0 @@ -921,10 +921,14 @@ class 
TestDatetime64SeriesReductions: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - @pytest.mark.parametrize('nat_ser', [ - Series([pd.NaT, pd.NaT]), - Series([pd.NaT, pd.Timedelta('nat')]), - Series([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + @pytest.mark.parametrize( + "nat_ser", + [ + Series([pd.NaT, pd.NaT]), + Series([pd.NaT, pd.Timedelta("nat")]), + Series([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) def test_minmax_nat_series(self, nat_ser): # GH#23282 assert nat_ser.min() is pd.NaT @@ -932,10 +936,14 @@ def test_minmax_nat_series(self, nat_ser): assert nat_ser.min(skipna=False) is pd.NaT assert nat_ser.max(skipna=False) is pd.NaT - @pytest.mark.parametrize('nat_df', [ - pd.DataFrame([pd.NaT, pd.NaT]), - pd.DataFrame([pd.NaT, pd.Timedelta('nat')]), - pd.DataFrame([pd.Timedelta('nat'), pd.Timedelta('nat')])]) + @pytest.mark.parametrize( + "nat_df", + [ + pd.DataFrame([pd.NaT, pd.NaT]), + pd.DataFrame([pd.NaT, pd.Timedelta("nat")]), + pd.DataFrame([pd.Timedelta("nat"), pd.Timedelta("nat")]), + ], + ) def test_minmax_nat_dataframe(self, nat_df): # GH#23282 assert nat_df.min()[0] is pd.NaT @@ -944,7 +952,7 @@ def test_minmax_nat_dataframe(self, nat_df): assert nat_df.max(skipna=False)[0] is pd.NaT def test_min_max(self): - rng = pd.date_range('1/1/2000', '12/31/2000') + rng = pd.date_range("1/1/2000", "12/31/2000") rng2 = rng.take(np.random.permutation(len(rng))) the_min = rng2.min() @@ -958,9 +966,9 @@ def test_min_max(self): assert rng.max() == rng[-1] def test_min_max_series(self): - rng = pd.date_range('1/1/2000', periods=10, freq='4h') - lvls = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'C'] - df = DataFrame({'TS': rng, 'V': np.random.randn(len(rng)), 'L': lvls}) + rng = pd.date_range("1/1/2000", periods=10, freq="4h") + lvls = ["A", "A", "A", "B", "B", "B", "C", "C", "C", "C"] + df = DataFrame({"TS": rng, "V": np.random.randn(len(rng)), "L": lvls}) result = df.TS.max() exp = pd.Timestamp(df.TS.iat[-1]) @@ -992,23 +1000,33 @@ def test_min_max(self): assert _min == "a" assert _max == "d" - cat = Series(Categorical(["a", "b", "c", "d"], categories=[ - 'd', 'c', 'b', 'a'], ordered=True)) + cat = Series( + Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) + ) _min = cat.min() _max = cat.max() assert _min == "d" assert _max == "a" - cat = Series(Categorical( - [np.nan, "b", "c", np.nan], categories=['d', 'c', 'b', 'a' - ], ordered=True)) + cat = Series( + Categorical( + [np.nan, "b", "c", np.nan], + categories=["d", "c", "b", "a"], + ordered=True, + ) + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) assert _max == "b" - cat = Series(Categorical( - [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True)) + cat = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) _min = cat.min() _max = cat.max() assert np.isnan(_min) @@ -1017,8 +1035,9 @@ def test_min_max(self): def test_min_max_numeric_only(self): # TODO deprecate numeric_only argument for Categorical and use # skipna as well, see GH25303 - cat = Series(Categorical( - ["a", "b", np.nan, "a"], categories=['b', 'a'], ordered=True)) + cat = Series( + Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) + ) _min = cat.min() _max = cat.max() @@ -1041,24 +1060,26 @@ class TestSeriesMode: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - @pytest.mark.parametrize('dropna, 
expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) + @pytest.mark.parametrize( + "dropna, expected", + [(True, Series([], dtype=np.float64)), (False, Series([], dtype=np.float64))], + ) def test_mode_empty(self, dropna, expected): s = Series([], dtype=np.float64) result = s.mode(dropna) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) + "dropna, data, expected", + [ + (True, [1, 1, 1, 2], [1]), + (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + (False, [1, 1, 1, 2], [1]), + (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), + ], + ) + @pytest.mark.parametrize( + "dt", list(np.typecodes["AllInteger"] + np.typecodes["Float"]) ) def test_mode_numerical(self, dropna, data, expected, dt): s = Series(data, dtype=dt) @@ -1066,134 +1087,173 @@ def test_mode_numerical(self, dropna, data, expected, dt): expected = Series(expected, dtype=dt) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) + @pytest.mark.parametrize("dropna, expected", [(True, [1.0]), (False, [1, np.nan])]) def test_mode_numerical_nan(self, dropna, expected): s = Series([1, 1, 2, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + ) def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. 
- data = ['a'] * 2 + ['b'] * 3 + data = ["a"] * 2 + ["b"] * 3 - s = Series(data, dtype='c') + s = Series(data, dtype="c") result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') + expected1 = Series(expected1, dtype="c") tm.assert_series_equal(result, expected1) - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object) result = s.mode(dropna) expected2 = Series(expected2, dtype=object) tm.assert_series_equal(result, expected2) - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] + data = ["foo", "bar", "bar", np.nan, np.nan, np.nan] s = Series(data, dtype=object).astype(str) result = s.mode(dropna) expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, ["foo"], ["foo"]), (False, ["foo"], [np.nan])], + ) def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) + s = Series([1, "foo", "foo"]) result = s.mode(dropna) expected = Series(expected1) tm.assert_series_equal(result, expected) - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) + s = Series([1, "foo", "foo", np.nan, np.nan, np.nan]) result = s.mode(dropna) expected = Series(expected2, dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + ( + True, + ["1900-05-03", "2011-01-03", "2013-01-02"], + ["2011-01-03", "2013-01-02"], + ), + (False, [np.nan], [np.nan, "2011-01-03", "2013-01-02"]), + ], + ) def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "nan", "nan"], dtype="M8[ns]" + ) result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') + expected1 = Series(expected1, dtype="M8[ns]") tm.assert_series_equal(result, expected1) - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') + s = Series( + [ + "2011-01-03", + "2013-01-02", + "1900-05-03", + "2011-01-03", + "2013-01-02", + "nan", + "nan", + ], + dtype="M8[ns]", + ) result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') + expected2 = Series(expected2, dtype="M8[ns]") tm.assert_series_equal(result, expected2) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [ + (True, ["-1 days", "0 days", "1 days"], ["2 min", "1 day"]), + (False, [np.nan], [np.nan, "2 min", "1 day"]), + ], + ) def test_mode_timedelta(self, dropna, expected1, expected2): # gh-5986: Test timedelta types. 
- s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') + s = Series( + ["1 days", "-1 days", "0 days", "nan", "nan"], dtype="timedelta64[ns]" + ) result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') + expected1 = Series(expected1, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected1) - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') + s = Series( + [ + "1 day", + "1 day", + "-1 day", + "-1 day 2 min", + "2 min", + "2 min", + "nan", + "nan", + ], + dtype="timedelta64[ns]", + ) result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') + expected2 = Series(expected2, dtype="timedelta64[ns]") tm.assert_series_equal(result, expected2) - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2, expected3", + [ + ( + True, + Categorical([1, 2], categories=[1, 2]), + Categorical(["a"], categories=[1, "a"]), + Categorical([3, 1], categories=[3, 2, 1], ordered=True), + ), + ( + False, + Categorical([np.nan], categories=[1, 2]), + Categorical([np.nan, "a"], categories=[1, "a"]), + Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True), + ), + ], + ) def test_mode_category(self, dropna, expected1, expected2, expected3): s = Series(Categorical([1, 2, np.nan, np.nan])) result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') + expected1 = Series(expected1, dtype="category") tm.assert_series_equal(result, expected1) - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) + s = Series(Categorical([1, "a", "a", np.nan, np.nan])) result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') + expected2 = Series(expected2, dtype="category") tm.assert_series_equal(result, expected2) - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) + s = Series( + Categorical( + [1, 1, 2, 3, 3, np.nan, np.nan], categories=[3, 2, 1], ordered=True + ) + ) result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') + expected3 = Series(expected3, dtype="category") tm.assert_series_equal(result, expected3) - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) + @pytest.mark.parametrize( + "dropna, expected1, expected2", + [(True, [2 ** 63], [1, 2 ** 63]), (False, [2 ** 63], [1, 2 ** 63])], + ) def test_mode_intoverflow(self, dropna, expected1, expected2): # Test for uint64 overflow. 
- s = Series([1, 2**63, 2**63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) result = s.mode(dropna) expected1 = Series(expected1, dtype=np.uint64) tm.assert_series_equal(result, expected1) - s = Series([1, 2**63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) result = s.mode(dropna) expected2 = Series(expected2, dtype=np.uint64) tm.assert_series_equal(result, expected2) @@ -1202,8 +1262,8 @@ def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) + expected = Series(["foo", np.nan]) + s = Series([1, "foo", "foo", np.nan, np.nan]) with tm.assert_produces_warning(UserWarning, check_stacklevel=False): result = s.mode(dropna=False) diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index b0fd2f290031e..432811b5a8264 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -15,37 +15,36 @@ class TestDatetimeLikeStatReductions: - - @pytest.mark.parametrize('box', [Series, pd.Index, DatetimeArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray]) def test_dt64_mean(self, tz_naive_fixture, box): tz = tz_naive_fixture - dti = pd.date_range('2001-01-01', periods=11, tz=tz) + dti = pd.date_range("2001-01-01", periods=11, tz=tz) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) dtarr = dti._data obj = box(dtarr) - assert obj.mean() == pd.Timestamp('2001-01-06', tz=tz) - assert obj.mean(skipna=False) == pd.Timestamp('2001-01-06', tz=tz) + assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz) + assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz) # dtarr[-2] will be the first date 2001-01-1 dtarr[-2] = pd.NaT obj = box(dtarr) - assert obj.mean() == pd.Timestamp('2001-01-06 07:12:00', tz=tz) + assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz) assert obj.mean(skipna=False) is pd.NaT - @pytest.mark.parametrize('box', [Series, pd.Index, PeriodArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray]) def test_period_mean(self, box): # GH#24757 - dti = pd.date_range('2001-01-01', periods=11) + dti = pd.date_range("2001-01-01", periods=11) # shuffle so that we are not just working with monotone-increasing dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) # use hourly frequency to avoid rounding errors in expected results # TODO: flesh this out with different frequencies - parr = dti._data.to_period('H') + parr = dti._data.to_period("H") obj = box(parr) with pytest.raises(TypeError, match="ambiguous"): obj.mean() @@ -60,10 +59,9 @@ def test_period_mean(self, box): with pytest.raises(TypeError, match="ambiguous"): obj.mean(skipna=True) - @pytest.mark.parametrize('box', [Series, pd.Index, TimedeltaArray]) + @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray]) def test_td64_mean(self, box): - tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], - unit='D') + tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") tdarr = tdi._data obj = box(tdarr) @@ -79,7 +77,7 @@ def test_td64_mean(self, box): assert result2 == tdi[1:].mean() # exact equality fails by 1 nanosecond - assert result2.round('us') == (result * 11. 
/ 10).round('us') + assert result2.round("us") == (result * 11.0 / 10).round("us") class TestSeriesStatReductions: @@ -87,18 +85,19 @@ class TestSeriesStatReductions: # were moved from a series-specific test file, _not_ that these tests are # intended long-term to be series-specific - def _check_stat_op(self, name, alternate, string_series_, - check_objects=False, check_allna=False): + def _check_stat_op( + self, name, alternate, string_series_, check_objects=False, check_allna=False + ): - with pd.option_context('use_bottleneck', False): + with pd.option_context("use_bottleneck", False): f = getattr(Series, name) # add some NaNs string_series_[5:15] = np.NaN # mean, idxmax, idxmin, min, and max are valid for dates - if name not in ['max', 'min', 'mean']: - ds = Series(pd.date_range('1/1/2001', periods=10)) + if name not in ["max", "min", "mean"]: + ds = Series(pd.date_range("1/1/2001", periods=10)) with pytest.raises(TypeError): f(ds) @@ -123,67 +122,67 @@ def _check_stat_op(self, name, alternate, string_series_, # GH#2888 items = [0] items.extend(range(2 ** 40, 2 ** 40 + 1000)) - s = Series(items, dtype='int64') + s = Series(items, dtype="int64") tm.assert_almost_equal(float(f(s)), float(alternate(s.values))) # check date range if check_objects: - s = Series(pd.bdate_range('1/1/2000', periods=10)) + s = Series(pd.bdate_range("1/1/2000", periods=10)) res = f(s) exp = alternate(s) assert res == exp # check on string data - if name not in ['sum', 'min', 'max']: + if name not in ["sum", "min", "max"]: with pytest.raises(TypeError): - f(Series(list('abc'))) + f(Series(list("abc"))) # Invalid axis. with pytest.raises(ValueError): f(string_series_, axis=1) # Unimplemented numeric_only parameter. - if 'numeric_only' in inspect.getfullargspec(f).args: + if "numeric_only" in inspect.getfullargspec(f).args: with pytest.raises(NotImplementedError, match=name): f(string_series_, numeric_only=True) def test_sum(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('sum', np.sum, string_series, check_allna=False) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("sum", np.sum, string_series, check_allna=False) def test_mean(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('mean', np.mean, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("mean", np.mean, string_series) def test_median(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('median', np.median, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("median", np.median, string_series) # test with integers, test failure int_ts = Series(np.ones(10, dtype=int), index=range(10)) tm.assert_almost_equal(np.median(int_ts), int_ts.median()) def test_prod(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('prod', np.prod, string_series) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("prod", np.prod, string_series) def test_min(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('min', np.min, string_series, check_objects=True) + string_series = tm.makeStringSeries().rename("series") + self._check_stat_op("min", np.min, string_series, check_objects=True) def test_max(self): - string_series = tm.makeStringSeries().rename('series') - self._check_stat_op('max', np.max, string_series, check_objects=True) + string_series = 
tm.makeStringSeries().rename("series") + self._check_stat_op("max", np.max, string_series, check_objects=True) def test_var_std(self): - string_series = tm.makeStringSeries().rename('series') - datetime_series = tm.makeTimeSeries().rename('ts') + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") alt = lambda x: np.std(x, ddof=1) - self._check_stat_op('std', alt, string_series) + self._check_stat_op("std", alt, string_series) alt = lambda x: np.var(x, ddof=1) - self._check_stat_op('var', alt, string_series) + self._check_stat_op("var", alt, string_series) result = datetime_series.std(ddof=4) expected = np.std(datetime_series.values, ddof=4) @@ -202,15 +201,16 @@ def test_var_std(self): assert pd.isna(result) def test_sem(self): - string_series = tm.makeStringSeries().rename('series') - datetime_series = tm.makeTimeSeries().rename('ts') + string_series = tm.makeStringSeries().rename("series") + datetime_series = tm.makeTimeSeries().rename("ts") alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) - self._check_stat_op('sem', alt, string_series) + self._check_stat_op("sem", alt, string_series) result = datetime_series.sem(ddof=4) - expected = np.std(datetime_series.values, - ddof=4) / np.sqrt(len(datetime_series.values)) + expected = np.std(datetime_series.values, ddof=4) / np.sqrt( + len(datetime_series.values) + ) tm.assert_almost_equal(result, expected) # 1 - element series with ddof=1 @@ -222,10 +222,10 @@ def test_sem(self): def test_skew(self): from scipy.stats import skew - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") alt = lambda x: skew(x, bias=False) - self._check_stat_op('skew', alt, string_series) + self._check_stat_op("skew", alt, string_series) # test corner cases, skew() returns NaN unless there's at least 3 # values @@ -244,17 +244,17 @@ def test_skew(self): def test_kurt(self): from scipy.stats import kurtosis - string_series = tm.makeStringSeries().rename('series') + string_series = tm.makeStringSeries().rename("series") alt = lambda x: kurtosis(x, bias=False) - self._check_stat_op('kurt', alt, string_series) + self._check_stat_op("kurt", alt, string_series) index = pd.MultiIndex( - levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]] + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) s = Series(np.random.randn(6), index=index) - tm.assert_almost_equal(s.kurt(), s.kurt(level=0)['bar']) + tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"]) # test corner cases, kurt() returns NaN unless there's at least 4 # values diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index d0f78f6d5b439..bb4f7ced3350f 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -8,10 +8,23 @@ from pandas.core.indexes.period import period_range # The various methods we support -downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', - 'median', 'prod', 'var', 'std', 'ohlc', 'quantile'] -upsample_methods = ['count', 'size'] -series_methods = ['nunique'] +downsample_methods = [ + "min", + "max", + "first", + "last", + "sum", + "mean", + "sem", + "median", + "prod", + "var", + "std", + "ohlc", + "quantile", +] +upsample_methods = ["count", "size"] +series_methods = ["nunique"] resample_methods = downsample_methods + upsample_methods + series_methods @@ 
-38,9 +51,11 @@ def simple_date_range_series(): """ Series with date range index and random data for test purposes. """ - def _simple_date_range_series(start, end, freq='D'): + + def _simple_date_range_series(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) + return _simple_date_range_series @@ -49,9 +64,11 @@ def simple_period_range_series(): """ Series with period range index and random data for test purposes. """ - def _simple_period_range_series(start, end, freq='D'): + + def _simple_period_range_series(start, end, freq="D"): rng = period_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) + return _simple_period_range_series @@ -70,7 +87,7 @@ def _index_end(): @pytest.fixture def _index_freq(): """Fixture for parametrization of index, series and frame.""" - return 'D' + return "D" @pytest.fixture @@ -83,8 +100,7 @@ def _index_name(): def index(_index_factory, _index_start, _index_end, _index_freq, _index_name): """Fixture for parametrization of date_range, period_range and timedelta_range indexes""" - return _index_factory( - _index_start, _index_end, freq=_index_freq, name=_index_name) + return _index_factory(_index_start, _index_end, freq=_index_freq, name=_index_name) @pytest.fixture @@ -121,7 +137,7 @@ def frame(index, _series_name, _static_values): """Fixture for parametrization of DataFrame with date_range, period_range and timedelta_range indexes""" # _series_name is intentionally unused - return DataFrame({'value': _static_values}, index=index) + return DataFrame({"value": _static_values}, index=index) @pytest.fixture diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 63fa2007e401d..51e309130e45d 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -12,8 +12,11 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) # a fixture value can be overridden by the test parameter value. 
Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -21,14 +24,13 @@ # see https://docs.pytest.org/en/latest/fixture.html#override-a-fixture-with-direct-test-parametrization # noqa # in this module we override the fixture values defined in conftest.py # tuples of '_index_factory,_series_name,_index_start,_index_end' -DATE_RANGE = (date_range, 'dti', datetime(2005, 1, 1), datetime(2005, 1, 10)) -PERIOD_RANGE = ( - period_range, 'pi', datetime(2005, 1, 1), datetime(2005, 1, 10)) -TIMEDELTA_RANGE = (timedelta_range, 'tdi', '1 day', '10 day') +DATE_RANGE = (date_range, "dti", datetime(2005, 1, 1), datetime(2005, 1, 10)) +PERIOD_RANGE = (period_range, "pi", datetime(2005, 1, 1), datetime(2005, 1, 10)) +TIMEDELTA_RANGE = (timedelta_range, "tdi", "1 day", "10 day") all_ts = pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", + [DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE], ) @@ -37,13 +39,13 @@ def create_index(_index_factory): def _create_index(*args, **kwargs): """ return the _index_factory created using the args, kwargs """ return _index_factory(*args, **kwargs) + return _create_index -@pytest.mark.parametrize('freq', ['2D', '1H']) +@pytest.mark.parametrize("freq", ["2D", "1H"]) @pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) def test_asfreq(series_and_frame, freq, create_index): obj = series_and_frame @@ -55,24 +57,22 @@ def test_asfreq(series_and_frame, freq, create_index): @pytest.mark.parametrize( - '_index_factory,_series_name,_index_start,_index_end', - [DATE_RANGE, TIMEDELTA_RANGE] + "_index_factory,_series_name,_index_start,_index_end", [DATE_RANGE, TIMEDELTA_RANGE] ) def test_asfreq_fill_value(series, create_index): # test for fill value during resampling, issue 3715 s = series - result = s.resample('1H').asfreq() - new_index = create_index(s.index[0], s.index[-1], freq='1H') + result = s.resample("1H").asfreq() + new_index = create_index(s.index[0], s.index[-1], freq="1H") expected = s.reindex(new_index) assert_series_equal(result, expected) - frame = s.to_frame('value') + frame = s.to_frame("value") frame.iloc[1] = None - result = frame.resample('1H').asfreq(fill_value=4.0) - new_index = create_index(frame.index[0], - frame.index[-1], freq='1H') + result = frame.resample("1H").asfreq(fill_value=4.0) + new_index = create_index(frame.index[0], frame.index[-1], freq="1H") expected = frame.reindex(new_index, fill_value=4.0) assert_frame_equal(result, expected) @@ -82,26 +82,28 @@ def test_resample_interpolate(frame): # # 12925 df = frame assert_frame_equal( - df.resample('1T').asfreq().interpolate(), - df.resample('1T').interpolate()) + df.resample("1T").asfreq().interpolate(), df.resample("1T").interpolate() + ) def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() - msg = ("Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Index'") + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," + " but got an instance of 'Index'" + ) with pytest.raises(TypeError, match=msg): - xp.resample('A').mean() + xp.resample("A").mean() @all_ts -@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +@pytest.mark.parametrize("freq", ["M", "D", "H"]) def 
test_resample_empty_series(freq, empty_series, resample_method): # GH12771 & GH12868 - if resample_method == 'ohlc': - pytest.skip('need to test for ohlc from GH13083') + if resample_method == "ohlc": + pytest.skip("need to test for ohlc from GH13083") s = empty_series result = getattr(s.resample(freq), resample_method)() @@ -117,13 +119,13 @@ def test_resample_empty_series(freq, empty_series, resample_method): @all_ts -@pytest.mark.parametrize('freq', ['M', 'D', 'H']) +@pytest.mark.parametrize("freq", ["M", "D", "H"]) def test_resample_empty_dataframe(empty_frame, freq, resample_method): # GH13212 df = empty_frame # count retains dimensions too result = getattr(df.resample(freq), resample_method)() - if resample_method != 'size': + if resample_method != "size": expected = df.copy() else: # GH14962 @@ -141,9 +143,7 @@ def test_resample_empty_dataframe(empty_frame, freq, resample_method): @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) -@pytest.mark.parametrize( - "dtype", - [np.float, np.int, np.object, 'datetime64[ns]']) +@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions @@ -151,7 +151,7 @@ def test_resample_empty_dtypes(index, dtype, resample_method): # them to ensure they no longer do. (GH #10228) empty_series = Series([], index, dtype) try: - getattr(empty_series.resample('d'), resample_method)() + getattr(empty_series.resample("d"), resample_method)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -162,30 +162,25 @@ def test_resample_empty_dtypes(index, dtype, resample_method): def test_resample_loffset_arg_type(frame, create_index): # GH 13218, 15002 df = frame - expected_means = [df.values[i:i + 2].mean() - for i in range(0, len(df.values), 2)] - expected_index = create_index(df.index[0], - periods=len(df.index) / 2, - freq='2D') + expected_means = [df.values[i : i + 2].mean() for i in range(0, len(df.values), 2)] + expected_index = create_index(df.index[0], periods=len(df.index) / 2, freq="2D") # loffset coerces PeriodIndex to DateTimeIndex if isinstance(expected_index, PeriodIndex): expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) - expected = DataFrame({'value': expected_means}, index=expected_index) + expected = DataFrame({"value": expected_means}, index=expected_index) - for arg in ['mean', {'value': 'mean'}, ['mean']]: + for arg in ["mean", {"value": "mean"}, ["mean"]]: - result_agg = df.resample('2D', loffset='2H').agg(arg) + result_agg = df.resample("2D", loffset="2H").agg(arg) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result_how = df.resample('2D', how=arg, loffset='2H') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result_how = df.resample("2D", how=arg, loffset="2H") if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([('value', - 'mean')]) + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) # GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex if isinstance(expected.index, TimedeltaIndex): @@ -203,7 +198,7 @@ def test_resample_loffset_arg_type(frame, create_index): def test_apply_to_empty_series(empty_series): # GH 14313 s = empty_series - for freq in ['M', 'D', 'H']: + for freq in ["M", "D", "H"]: result = s.resample(freq).apply(lambda x: 1) expected = 
s.resample(freq).apply(np.sum) @@ -213,8 +208,8 @@ def test_apply_to_empty_series(empty_series): @all_ts def test_resampler_is_iterable(series): # GH 15314 - freq = 'H' - tg = Grouper(freq=freq, convention='start') + freq = "H" + tg = Grouper(freq=freq, convention="start") grouped = series.groupby(tg) resampled = series.resample(freq) for (rk, rv), (gk, gv) in zip(resampled, grouped): @@ -227,7 +222,7 @@ def test_resample_quantile(series): # GH 15023 s = series q = 0.75 - freq = 'H' + freq = "H" result = s.resample(freq).quantile(q) expected = s.resample(freq).agg(lambda x: x.quantile(q)).rename(s.name) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 830ba6062cc72..929bd1725b30a 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -16,7 +16,10 @@ from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets from pandas.tseries.offsets import BDay, Minute @@ -29,7 +32,7 @@ def _index_factory(): @pytest.fixture def _index_freq(): - return 'Min' + return "Min" @pytest.fixture @@ -40,20 +43,20 @@ def _static_values(index): def test_custom_grouper(index): dti = index - s = Series(np.array([1] * len(dti)), index=dti, dtype='int64') + s = Series(np.array([1] * len(dti)), index=dti, dtype="int64") b = Grouper(freq=Minute(5)) g = s.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) - b = Grouper(freq=Minute(5), closed='right', label='right') + b = Grouper(freq=Minute(5), closed="right", label="right") g = s.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "ohlc", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) @@ -70,8 +73,7 @@ def test_custom_grouper(index): result = g.agg(np.sum) assert_series_equal(result, expect) - df = DataFrame(np.random.rand(len(dti), 10), - index=dti, dtype='float64') + df = DataFrame(np.random.rand(len(dti), 10), index=dti, dtype="float64") r = df.groupby(b).agg(np.sum) assert len(r.columns) == 10 @@ -79,77 +81,90 @@ def test_custom_grouper(index): @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) -@pytest.mark.parametrize('closed, expected', [ - ('right', - lambda s: Series( - [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=date_range( - '1/1/2000', periods=4, freq='5min', name='index'))), - ('left', - lambda s: Series( - [s[:5].mean(), s[5:10].mean(), s[10:].mean()], - index=date_range( - '1/1/2000 00:05', periods=3, freq='5min', name='index')) - ) -]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "closed, expected", + [ + ( + "right", + lambda s: Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + ), + ), + ( + "left", + lambda s: Series( + [s[:5].mean(), s[5:10].mean(), s[10:].mean()], + index=date_range( + "1/1/2000 00:05", periods=3, 
freq="5min", name="index" + ), + ), + ), + ], +) def test_resample_basic(series, closed, expected): s = series expected = expected(s) - result = s.resample('5min', closed=closed, label='right').mean() + result = s.resample("5min", closed=closed, label="right").mean() assert_series_equal(result, expected) def test_resample_integerarray(): # GH 25580, resample on IntegerArray - ts = pd.Series(range(9), - index=pd.date_range('1/1/2000', periods=9, freq='T'), - dtype='Int64') - result = ts.resample('3T').sum() - expected = Series([3, 12, 21], - index=pd.date_range('1/1/2000', periods=3, freq='3T'), - dtype="Int64") + ts = pd.Series( + range(9), index=pd.date_range("1/1/2000", periods=9, freq="T"), dtype="Int64" + ) + result = ts.resample("3T").sum() + expected = Series( + [3, 12, 21], + index=pd.date_range("1/1/2000", periods=3, freq="3T"), + dtype="Int64", + ) assert_series_equal(result, expected) - result = ts.resample('3T').mean() - expected = Series([1, 4, 7], - index=pd.date_range('1/1/2000', periods=3, freq='3T'), - dtype='Int64') + result = ts.resample("3T").mean() + expected = Series( + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" + ) assert_series_equal(result, expected) def test_resample_basic_grouper(series): s = series - result = s.resample('5Min').last() - grouper = Grouper(freq=Minute(5), closed='left', label='left') + result = s.resample("5Min").last() + grouper = Grouper(freq=Minute(5), closed="left", label="left") expected = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expected) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) -@pytest.mark.parametrize('keyword,value', [ - ('label', 'righttt'), - ('closed', 'righttt'), - ('convention', 'starttt') -]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) +@pytest.mark.parametrize( + "keyword,value", + [("label", "righttt"), ("closed", "righttt"), ("convention", "starttt")], +) def test_resample_string_kwargs(series, keyword, value): # see gh-19303 # Check that wrong keyword argument strings raise an error msg = "Unsupported value {value} for `{keyword}`".format( - value=value, keyword=keyword) + value=value, keyword=keyword + ) with pytest.raises(ValueError, match=msg): - series.resample('5min', **({keyword: value})) + series.resample("5min", **({keyword: value})) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', - [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) def test_resample_how(series, downsample_method): - if downsample_method == 'ohlc': - pytest.skip('covered by test_resample_how_ohlc') + if downsample_method == "ohlc": + pytest.skip("covered by test_resample_how_ohlc") s = series grouplist = np.ones_like(s) @@ -158,17 +173,18 @@ def test_resample_how(series, downsample_method): grouplist[6:11] = 2 grouplist[11:] = 3 expected = s.groupby(grouplist).agg(downsample_method) - expected.index = date_range( - '1/1/2000', periods=4, freq='5min', name='index') + expected.index = date_range("1/1/2000", periods=4, freq="5min", name="index") - result = getattr(s.resample( - '5min', closed='right', label='right'), downsample_method)() + result = getattr( + s.resample("5min", closed="right", label="right"), downsample_method + )() assert_series_equal(result, expected) @pytest.mark.parametrize( - '_index_start,_index_end,_index_name', 
- [('1/1/2000 00:00:00', '1/1/2000 00:13:00', 'index')]) + "_index_start,_index_end,_index_name", + [("1/1/2000 00:00:00", "1/1/2000 00:13:00", "index")], +) def test_resample_how_ohlc(series): s = series grouplist = np.ones_like(s) @@ -184,20 +200,19 @@ def _ohlc(group): expected = DataFrame( s.groupby(grouplist).agg(_ohlc).values.tolist(), - index=date_range('1/1/2000', periods=4, freq='5min', name='index'), - columns=['open', 'high', 'low', 'close']) + index=date_range("1/1/2000", periods=4, freq="5min", name="index"), + columns=["open", "high", "low", "close"], + ) - result = s.resample('5min', closed='right', label='right').ohlc() + result = s.resample("5min", closed="right", label="right").ohlc() assert_frame_equal(result, expected) -@pytest.mark.parametrize( - 'func', ['min', 'max', 'sum', 'prod', 'mean', 'var', 'std']) +@pytest.mark.parametrize("func", ["min", "max", "sum", "prod", "mean", "var", "std"]) def test_numpy_compat(func): # see gh-12811 - s = Series([1, 2, 3, 4, 5], index=date_range( - '20130101', periods=5, freq='s')) - r = s.resample('2s') + s = Series([1, 2, 3, 4, 5], index=date_range("20130101", periods=5, freq="s")) + r = s.resample("2s") msg = "numpy operations are not valid with resample" @@ -210,14 +225,13 @@ def test_numpy_compat(func): def test_resample_how_callables(): # GH#7929 data = np.arange(5, dtype=np.int64) - ind = date_range(start='2014-01-01', periods=len(data), freq='d') + ind = date_range(start="2014-01-01", periods=len(data), freq="d") df = DataFrame({"A": data, "B": data}, index=ind) def fn(x, a=1): return str(type(x)) class FnClass: - def __call__(self, x): return str(type(x)) @@ -258,120 +272,126 @@ def test_resample_rounding(): 11-08-2014,00:00:20.674,1 11-08-2014,00:00:21.191,1""" - df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [ - 'date', 'time']}, index_col='timestamp') + df = pd.read_csv( + StringIO(data), + parse_dates={"timestamp": ["date", "time"]}, + index_col="timestamp", + ) df.index.name = None - result = df.resample('6s').sum() - expected = DataFrame({'value': [ - 4, 9, 4, 2 - ]}, index=date_range('2014-11-08', freq='6s', periods=4)) + result = df.resample("6s").sum() + expected = DataFrame( + {"value": [4, 9, 4, 2]}, index=date_range("2014-11-08", freq="6s", periods=4) + ) assert_frame_equal(result, expected) - result = df.resample('7s').sum() - expected = DataFrame({'value': [ - 4, 10, 4, 1 - ]}, index=date_range('2014-11-08', freq='7s', periods=4)) + result = df.resample("7s").sum() + expected = DataFrame( + {"value": [4, 10, 4, 1]}, index=date_range("2014-11-08", freq="7s", periods=4) + ) assert_frame_equal(result, expected) - result = df.resample('11s').sum() - expected = DataFrame({'value': [ - 11, 8 - ]}, index=date_range('2014-11-08', freq='11s', periods=2)) + result = df.resample("11s").sum() + expected = DataFrame( + {"value": [11, 8]}, index=date_range("2014-11-08", freq="11s", periods=2) + ) assert_frame_equal(result, expected) - result = df.resample('13s').sum() - expected = DataFrame({'value': [ - 13, 6 - ]}, index=date_range('2014-11-08', freq='13s', periods=2)) + result = df.resample("13s").sum() + expected = DataFrame( + {"value": [13, 6]}, index=date_range("2014-11-08", freq="13s", periods=2) + ) assert_frame_equal(result, expected) - result = df.resample('17s').sum() - expected = DataFrame({'value': [ - 16, 3 - ]}, index=date_range('2014-11-08', freq='17s', periods=2)) + result = df.resample("17s").sum() + expected = DataFrame( + {"value": [16, 3]}, index=date_range("2014-11-08", freq="17s", 
periods=2) + ) assert_frame_equal(result, expected) def test_resample_basic_from_daily(): # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D', name='index') + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) s = Series(np.random.rand(len(dti)), dti) # to weekly - result = s.resample('w-sun').last() + result = s.resample("w-sun").last() assert len(result) == 3 assert (result.index.dayofweek == [6, 6, 6]).all() - assert result.iloc[0] == s['1/2/2005'] - assert result.iloc[1] == s['1/9/2005'] + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/9/2005"] assert result.iloc[2] == s.iloc[-1] - result = s.resample('W-MON').last() + result = s.resample("W-MON").last() assert len(result) == 2 assert (result.index.dayofweek == [0, 0]).all() - assert result.iloc[0] == s['1/3/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/3/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-TUE').last() + result = s.resample("W-TUE").last() assert len(result) == 2 assert (result.index.dayofweek == [1, 1]).all() - assert result.iloc[0] == s['1/4/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/4/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-WED').last() + result = s.resample("W-WED").last() assert len(result) == 2 assert (result.index.dayofweek == [2, 2]).all() - assert result.iloc[0] == s['1/5/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/5/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-THU').last() + result = s.resample("W-THU").last() assert len(result) == 2 assert (result.index.dayofweek == [3, 3]).all() - assert result.iloc[0] == s['1/6/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/6/2005"] + assert result.iloc[1] == s["1/10/2005"] - result = s.resample('W-FRI').last() + result = s.resample("W-FRI").last() assert len(result) == 2 assert (result.index.dayofweek == [4, 4]).all() - assert result.iloc[0] == s['1/7/2005'] - assert result.iloc[1] == s['1/10/2005'] + assert result.iloc[0] == s["1/7/2005"] + assert result.iloc[1] == s["1/10/2005"] # to biz day - result = s.resample('B').last() + result = s.resample("B").last() assert len(result) == 7 assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all() - assert result.iloc[0] == s['1/2/2005'] - assert result.iloc[1] == s['1/3/2005'] - assert result.iloc[5] == s['1/9/2005'] - assert result.index.name == 'index' + assert result.iloc[0] == s["1/2/2005"] + assert result.iloc[1] == s["1/3/2005"] + assert result.iloc[5] == s["1/9/2005"] + assert result.index.name == "index" def test_resample_upsampling_picked_but_not_correct(): # Test for issue #3020 - dates = date_range('01-Jan-2014', '05-Jan-2014', freq='D') + dates = date_range("01-Jan-2014", "05-Jan-2014", freq="D") series = Series(1, index=dates) - result = series.resample('D').mean() + result = series.resample("D").mean() assert result.index[0] == dates[0] # GH 5955 # incorrect deciding to upsample when the axis frequency matches the # resample frequency - s = Series(np.arange(1., 6), index=[datetime( - 1975, 1, i, 12, 0) for i in range(1, 6)]) - expected = Series(np.arange(1., 6), index=date_range( - '19750101', periods=5, freq='D')) + s = Series( + np.arange(1.0, 6), index=[datetime(1975, 1, i, 12, 0) for i in range(1, 6)] + ) + expected = Series( + np.arange(1.0, 6), 
index=date_range("19750101", periods=5, freq="D") + ) - result = s.resample('D').count() + result = s.resample("D").count() assert_series_equal(result, Series(1, index=expected.index)) - result1 = s.resample('D').sum() - result2 = s.resample('D').mean() + result1 = s.resample("D").sum() + result2 = s.resample("D").mean() assert_series_equal(result1, expected) assert_series_equal(result2, expected) @@ -379,77 +399,76 @@ def test_resample_upsampling_picked_but_not_correct(): def test_resample_frame_basic(): df = tm.makeTimeDataFrame() - b = Grouper(freq='M') + b = Grouper(freq="M") g = df.groupby(b) # check all cython functions work - funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: g._cython_agg_general(f) - result = df.resample('A').mean() - assert_series_equal(result['A'], df['A'].resample('A').mean()) + result = df.resample("A").mean() + assert_series_equal(result["A"], df["A"].resample("A").mean()) - result = df.resample('M').mean() - assert_series_equal(result['A'], df['A'].resample('M').mean()) + result = df.resample("M").mean() + assert_series_equal(result["A"], df["A"].resample("M").mean()) - df.resample('M', kind='period').mean() - df.resample('W-WED', kind='period').mean() + df.resample("M", kind="period").mean() + df.resample("W-WED", kind="period").mean() -@pytest.mark.parametrize('loffset', [timedelta(minutes=1), - '1min', Minute(1), - np.timedelta64(1, 'm')]) +@pytest.mark.parametrize( + "loffset", [timedelta(minutes=1), "1min", Minute(1), np.timedelta64(1, "m")] +) def test_resample_loffset(loffset): # GH 7687 - rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', closed='right', label='right', - loffset=loffset).mean() - idx = date_range('1/1/2000', periods=4, freq='5min') - expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], - index=idx + timedelta(minutes=1)) + result = s.resample("5min", closed="right", label="right", loffset=loffset).mean() + idx = date_range("1/1/2000", periods=4, freq="5min") + expected = Series( + [s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1), + ) assert_series_equal(result, expected) assert result.index.freq == Minute(5) # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D') + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") ser = Series(np.random.rand(len(dti)), dti) # to weekly - result = ser.resample('w-sun').last() + result = ser.resample("w-sun").last() business_day_offset = BDay() - expected = ser.resample('w-sun', loffset=-business_day_offset).last() + expected = ser.resample("w-sun", loffset=-business_day_offset).last() assert result.index[0] - business_day_offset == expected.index[0] def test_resample_loffset_upsample(): # GH 20744 - rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="min") s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', closed='right', label='right', - loffset=timedelta(minutes=1)).ffill() - idx = date_range('1/1/2000', periods=4, freq='5min') - expected = Series([s[0], s[5], s[10], s[-1]], - index=idx + timedelta(minutes=1)) + result = s.resample( + "5min", closed="right", label="right", loffset=timedelta(minutes=1) + ).ffill() + idx = 
date_range("1/1/2000", periods=4, freq="5min") + expected = Series([s[0], s[5], s[10], s[-1]], index=idx + timedelta(minutes=1)) assert_series_equal(result, expected) def test_resample_loffset_count(): # GH 12725 - start_time = '1/1/2000 00:00:00' - rng = date_range(start_time, periods=100, freq='S') + start_time = "1/1/2000 00:00:00" + rng = date_range(start_time, periods=100, freq="S") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('10S', loffset='1s').count() + result = ts.resample("10S", loffset="1s").count() - expected_index = ( - date_range(start_time, periods=10, freq='10S') + - timedelta(seconds=1) + expected_index = date_range(start_time, periods=10, freq="10S") + timedelta( + seconds=1 ) expected = Series(10, index=expected_index) @@ -457,70 +476,78 @@ def test_resample_loffset_count(): # Same issue should apply to .size() since it goes through # same code path - result = ts.resample('10S', loffset='1s').size() + result = ts.resample("10S", loffset="1s").size() assert_series_equal(result, expected) def test_resample_upsample(): # from daily - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D', name='index') + dti = date_range( + start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index" + ) s = Series(np.random.rand(len(dti)), dti) # to minutely, by padding - result = s.resample('Min').pad() + result = s.resample("Min").pad() assert len(result) == 12961 assert result[0] == s[0] assert result[-1] == s[-1] - assert result.index.name == 'index' + assert result.index.name == "index" def test_resample_how_method(): # GH9915 - s = Series([11, 22], - index=[Timestamp('2015-03-31 21:48:52.672000'), - Timestamp('2015-03-31 21:49:52.739000')]) - expected = Series([11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], - index=[Timestamp('2015-03-31 21:48:50'), - Timestamp('2015-03-31 21:49:00'), - Timestamp('2015-03-31 21:49:10'), - Timestamp('2015-03-31 21:49:20'), - Timestamp('2015-03-31 21:49:30'), - Timestamp('2015-03-31 21:49:40'), - Timestamp('2015-03-31 21:49:50')]) + s = Series( + [11, 22], + index=[ + Timestamp("2015-03-31 21:48:52.672000"), + Timestamp("2015-03-31 21:49:52.739000"), + ], + ) + expected = Series( + [11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22], + index=[ + Timestamp("2015-03-31 21:48:50"), + Timestamp("2015-03-31 21:49:00"), + Timestamp("2015-03-31 21:49:10"), + Timestamp("2015-03-31 21:49:20"), + Timestamp("2015-03-31 21:49:30"), + Timestamp("2015-03-31 21:49:40"), + Timestamp("2015-03-31 21:49:50"), + ], + ) assert_series_equal(s.resample("10S").mean(), expected) def test_resample_extra_index_point(): # GH#9756 - index = date_range(start='20150101', end='20150331', freq='BM') - expected = DataFrame({'A': Series([21, 41, 63], index=index)}) + index = date_range(start="20150101", end="20150331", freq="BM") + expected = DataFrame({"A": Series([21, 41, 63], index=index)}) - index = date_range(start='20150101', end='20150331', freq='B') - df = DataFrame( - {'A': Series(range(len(index)), index=index)}, dtype='int64') - result = df.resample('BM').last() + index = date_range(start="20150101", end="20150331", freq="B") + df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64") + result = df.resample("BM").last() assert_frame_equal(result, expected) def test_upsample_with_limit(): - rng = date_range('1/1/2000', periods=3, freq='5t') + rng = date_range("1/1/2000", periods=3, freq="5t") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('t').ffill(limit=2) - 
expected = ts.reindex(result.index, method='ffill', limit=2) + result = ts.resample("t").ffill(limit=2) + expected = ts.reindex(result.index, method="ffill", limit=2) assert_series_equal(result, expected) def test_nearest_upsample_with_limit(): - rng = date_range('1/1/2000', periods=3, freq='5t') + rng = date_range("1/1/2000", periods=3, freq="5t") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('t').nearest(limit=2) - expected = ts.reindex(result.index, method='nearest', limit=2) + result = ts.resample("t").nearest(limit=2) + expected = ts.reindex(result.index, method="nearest", limit=2) assert_series_equal(result, expected) @@ -529,75 +556,93 @@ def test_resample_ohlc(series): grouper = Grouper(freq=Minute(5)) expect = s.groupby(grouper).agg(lambda x: x[-1]) - result = s.resample('5Min').ohlc() + result = s.resample("5Min").ohlc() assert len(result) == len(expect) assert len(result.columns) == 4 xs = result.iloc[-2] - assert xs['open'] == s[-6] - assert xs['high'] == s[-6:-1].max() - assert xs['low'] == s[-6:-1].min() - assert xs['close'] == s[-2] + assert xs["open"] == s[-6] + assert xs["high"] == s[-6:-1].max() + assert xs["low"] == s[-6:-1].min() + assert xs["close"] == s[-2] xs = result.iloc[0] - assert xs['open'] == s[0] - assert xs['high'] == s[:5].max() - assert xs['low'] == s[:5].min() - assert xs['close'] == s[4] + assert xs["open"] == s[0] + assert xs["high"] == s[:5].max() + assert xs["low"] == s[:5].min() + assert xs["close"] == s[4] def test_resample_ohlc_result(): # GH 12332 - index = pd.date_range('1-1-2000', '2-15-2000', freq='h') - index = index.union(pd.date_range('4-15-2000', '5-15-2000', freq='h')) + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index = index.union(pd.date_range("4-15-2000", "5-15-2000", freq="h")) s = Series(range(len(index)), index=index) - a = s.loc[:'4-15-2000'].resample('30T').ohlc() + a = s.loc[:"4-15-2000"].resample("30T").ohlc() assert isinstance(a, DataFrame) - b = s.loc[:'4-14-2000'].resample('30T').ohlc() + b = s.loc[:"4-14-2000"].resample("30T").ohlc() assert isinstance(b, DataFrame) # GH12348 # raising on odd period - rng = date_range('2013-12-30', '2014-01-07') - index = rng.drop([Timestamp('2014-01-01'), - Timestamp('2013-12-31'), - Timestamp('2014-01-04'), - Timestamp('2014-01-05')]) + rng = date_range("2013-12-30", "2014-01-07") + index = rng.drop( + [ + Timestamp("2014-01-01"), + Timestamp("2013-12-31"), + Timestamp("2014-01-04"), + Timestamp("2014-01-05"), + ] + ) df = DataFrame(data=np.arange(len(index)), index=index) - result = df.resample('B').mean() - expected = df.reindex(index=date_range(rng[0], rng[-1], freq='B')) + result = df.resample("B").mean() + expected = df.reindex(index=date_range(rng[0], rng[-1], freq="B")) assert_frame_equal(result, expected) def test_resample_ohlc_dataframe(): df = ( - DataFrame({ - 'PRICE': { - Timestamp('2011-01-06 10:59:05', tz=None): 24990, - Timestamp('2011-01-06 12:43:33', tz=None): 25499, - Timestamp('2011-01-06 12:54:09', tz=None): 25499}, - 'VOLUME': { - Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - Timestamp('2011-01-06 12:54:09', tz=None): 100000000}}) - ).reindex(['VOLUME', 'PRICE'], axis=1) - res = df.resample('H').ohlc() - exp = pd.concat([df['VOLUME'].resample('H').ohlc(), - df['PRICE'].resample('H').ohlc()], - axis=1, - keys=['VOLUME', 'PRICE']) + DataFrame( + { + "PRICE": { + Timestamp("2011-01-06 10:59:05", tz=None): 24990, + Timestamp("2011-01-06 12:43:33", tz=None): 25499, + 
Timestamp("2011-01-06 12:54:09", tz=None): 25499, + }, + "VOLUME": { + Timestamp("2011-01-06 10:59:05", tz=None): 1500000000, + Timestamp("2011-01-06 12:43:33", tz=None): 5000000000, + Timestamp("2011-01-06 12:54:09", tz=None): 100000000, + }, + } + ) + ).reindex(["VOLUME", "PRICE"], axis=1) + res = df.resample("H").ohlc() + exp = pd.concat( + [df["VOLUME"].resample("H").ohlc(), df["PRICE"].resample("H").ohlc()], + axis=1, + keys=["VOLUME", "PRICE"], + ) assert_frame_equal(exp, res) - df.columns = [['a', 'b'], ['c', 'd']] - res = df.resample('H').ohlc() - exp.columns = pd.MultiIndex.from_tuples([ - ('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'), - ('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'), - ('b', 'd', 'low'), ('b', 'd', 'close')]) + df.columns = [["a", "b"], ["c", "d"]] + res = df.resample("H").ohlc() + exp.columns = pd.MultiIndex.from_tuples( + [ + ("a", "c", "open"), + ("a", "c", "high"), + ("a", "c", "low"), + ("a", "c", "close"), + ("b", "d", "open"), + ("b", "d", "high"), + ("b", "d", "low"), + ("b", "d", "close"), + ] + ) assert_frame_equal(exp, res) # dupe columns fail atm @@ -608,49 +653,49 @@ def test_resample_dup_index(): # GH 4812 # dup columns with resample raising - df = DataFrame(np.random.randn(4, 12), index=[2000, 2000, 2000, 2000], - columns=[Period(year=2000, month=i + 1, freq='M') - for i in range(12)]) + df = DataFrame( + np.random.randn(4, 12), + index=[2000, 2000, 2000, 2000], + columns=[Period(year=2000, month=i + 1, freq="M") for i in range(12)], + ) df.iloc[3, :] = np.nan - result = df.resample('Q', axis=1).mean() + result = df.resample("Q", axis=1).mean() expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean() - expected.columns = [ - Period(year=2000, quarter=i + 1, freq='Q') for i in range(4)] + expected.columns = [Period(year=2000, quarter=i + 1, freq="Q") for i in range(4)] assert_frame_equal(result, expected) def test_resample_reresample(): - dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='D') + dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D") s = Series(np.random.rand(len(dti)), dti) - bs = s.resample('B', closed='right', label='right').mean() - result = bs.resample('8H').mean() + bs = s.resample("B", closed="right", label="right").mean() + result = bs.resample("8H").mean() assert len(result) == 22 assert isinstance(result.index.freq, offsets.DateOffset) assert result.index.freq == offsets.Hour(8) def test_resample_timestamp_to_period(simple_date_range_series): - ts = simple_date_range_series('1/1/1990', '1/1/2000') + ts = simple_date_range_series("1/1/1990", "1/1/2000") - result = ts.resample('A-DEC', kind='period').mean() - expected = ts.resample('A-DEC').mean() - expected.index = period_range('1990', '2000', freq='a-dec') + result = ts.resample("A-DEC", kind="period").mean() + expected = ts.resample("A-DEC").mean() + expected.index = period_range("1990", "2000", freq="a-dec") assert_series_equal(result, expected) - result = ts.resample('A-JUN', kind='period').mean() - expected = ts.resample('A-JUN').mean() - expected.index = period_range('1990', '2000', freq='a-jun') + result = ts.resample("A-JUN", kind="period").mean() + expected = ts.resample("A-JUN").mean() + expected.index = period_range("1990", "2000", freq="a-jun") assert_series_equal(result, expected) - result = ts.resample('M', kind='period').mean() - expected = ts.resample('M').mean() - expected.index = period_range('1990-01', '2000-01', freq='M') + result = ts.resample("M", 
kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") assert_series_equal(result, expected) - result = ts.resample('M', kind='period').mean() - expected = ts.resample('M').mean() - expected.index = period_range('1990-01', '2000-01', freq='M') + result = ts.resample("M", kind="period").mean() + expected = ts.resample("M").mean() + expected.index = period_range("1990-01", "2000-01", freq="M") assert_series_equal(result, expected) @@ -660,27 +705,26 @@ def _ohlc(group): return np.repeat(np.nan, 4) return [group[0], group.max(), group.min(), group[-1]] - rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 5:59:50", freq="10s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', closed='right', - label='right').ohlc() + resampled = ts.resample("5min", closed="right", label="right").ohlc() - assert (resampled.loc['1/1/2000 00:00'] == ts[0]).all() + assert (resampled.loc["1/1/2000 00:00"] == ts[0]).all() exp = _ohlc(ts[1:31]) - assert (resampled.loc['1/1/2000 00:05'] == exp).all() + assert (resampled.loc["1/1/2000 00:05"] == exp).all() - exp = _ohlc(ts['1/1/2000 5:55:01':]) - assert (resampled.loc['1/1/2000 6:00:00'] == exp).all() + exp = _ohlc(ts["1/1/2000 5:55:01":]) + assert (resampled.loc["1/1/2000 6:00:00"] == exp).all() def test_downsample_non_unique(): - rng = date_range('1/1/2000', '2/29/2000') + rng = date_range("1/1/2000", "2/29/2000") rng2 = rng.repeat(5).values ts = Series(np.random.randn(len(rng2)), index=rng2) - result = ts.resample('M').mean() + result = ts.resample("M").mean() expected = ts.groupby(lambda x: x.month).mean() assert len(result) == 2 @@ -690,22 +734,21 @@ def test_downsample_non_unique(): def test_asfreq_non_unique(): # GH #1077 - rng = date_range('1/1/2000', '2/29/2000') + rng = date_range("1/1/2000", "2/29/2000") rng2 = rng.repeat(2).values ts = Series(np.random.randn(len(rng2)), index=rng2) - msg = 'cannot reindex from a duplicate axis' + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - ts.asfreq('B') + ts.asfreq("B") def test_resample_axis1(): - rng = date_range('1/1/2000', '2/29/2000') - df = DataFrame(np.random.randn(3, len(rng)), columns=rng, - index=['a', 'b', 'c']) + rng = date_range("1/1/2000", "2/29/2000") + df = DataFrame(np.random.randn(3, len(rng)), columns=rng, index=["a", "b", "c"]) - result = df.resample('M', axis=1).mean() - expected = df.T.resample('M').mean().T + result = df.resample("M", axis=1).mean() + expected = df.T.resample("M").mean().T tm.assert_frame_equal(result, expected) @@ -715,53 +758,52 @@ def test_resample_anchored_ticks(): # than starting from the first timestamp which might start in the # middle of a desired interval - rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s') + rng = date_range("1/1/2000 04:00:00", periods=86400, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) ts[:2] = np.nan # so results are the same - freqs = ['t', '5t', '15t', '30t', '4h', '12h'] + freqs = ["t", "5t", "15t", "30t", "4h", "12h"] for freq in freqs: - result = ts[2:].resample(freq, closed='left', label='left').mean() - expected = ts.resample(freq, closed='left', label='left').mean() + result = ts[2:].resample(freq, closed="left", label="left").mean() + expected = ts.resample(freq, closed="left", label="left").mean() assert_series_equal(result, expected) def test_resample_single_group(): mysum = lambda x: x.sum() - rng = 
date_range('2000-1-1', '2000-2-10', freq='D') + rng = date_range("2000-1-1", "2000-2-10", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M').sum(), - ts.resample('M').apply(mysum)) + assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) - rng = date_range('2000-1-1', '2000-1-10', freq='D') + rng = date_range("2000-1-1", "2000-1-10", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M').sum(), - ts.resample('M').apply(mysum)) + assert_series_equal(ts.resample("M").sum(), ts.resample("M").apply(mysum)) # GH 3849 - s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'), - Timestamp('20070915 15:40:00')]) - expected = Series([0.75], index=[Timestamp('20070915')]) - result = s.resample('D').apply(lambda x: np.std(x)) + s = Series( + [30.1, 31.6], + index=[Timestamp("20070915 15:30:00"), Timestamp("20070915 15:40:00")], + ) + expected = Series([0.75], index=[Timestamp("20070915")]) + result = s.resample("D").apply(lambda x: np.std(x)) assert_series_equal(result, expected) def test_resample_base(): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', base=2).mean() - exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57', - freq='5min') + resampled = ts.resample("5min", base=2).mean() + exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min") tm.assert_index_equal(resampled.index, exp_rng) def test_resample_float_base(): # GH25161 - dt = pd.to_datetime(["2018-11-26 16:17:43.51", - "2018-11-26 16:17:44.51", - "2018-11-26 16:17:45.51"]) + dt = pd.to_datetime( + ["2018-11-26 16:17:43.51", "2018-11-26 16:17:44.51", "2018-11-26 16:17:45.51"] + ) s = Series(np.arange(3), index=dt) base = 17 + 43.51 / 60 @@ -771,35 +813,37 @@ def test_resample_float_base(): def test_resample_daily_anchored(): - rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T') + rng = date_range("1/1/2000 0:00:00", periods=10000, freq="T") ts = Series(np.random.randn(len(rng)), index=rng) ts[:2] = np.nan # so results are the same - result = ts[2:].resample('D', closed='left', label='left').mean() - expected = ts.resample('D', closed='left', label='left').mean() + result = ts[2:].resample("D", closed="left", label="left").mean() + expected = ts.resample("D", closed="left", label="left").mean() assert_series_equal(result, expected) def test_resample_to_period_monthly_buglet(): # GH #1259 - rng = date_range('1/1/2000', '12/31/2000') + rng = date_range("1/1/2000", "12/31/2000") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('M', kind='period').mean() - exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') + result = ts.resample("M", kind="period").mean() + exp_index = period_range("Jan-2000", "Dec-2000", freq="M") tm.assert_index_equal(result.index, exp_index) def test_period_with_agg(): # aggregate a period resampler with a lambda - s2 = Series(np.random.randint(0, 5, 50), - index=pd.period_range('2012-01-01', freq='H', periods=50), - dtype='float64') + s2 = Series( + np.random.randint(0, 5, 50), + index=pd.period_range("2012-01-01", freq="H", periods=50), + dtype="float64", + ) - expected = s2.to_timestamp().resample('D').mean().to_period() - result = s2.resample('D').agg(lambda x: x.mean()) + expected = s2.to_timestamp().resample("D").mean().to_period() + result = 
s2.resample("D").agg(lambda x: x.mean()) assert_series_equal(result, expected) @@ -810,11 +854,12 @@ def test_resample_segfault(): (1, datetime(2013, 10, 1, 16, 20), 1, 0), (2, datetime(2013, 10, 1, 16, 10), 1, 0), (2, datetime(2013, 10, 1, 18, 15), 1, 0), - (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0)] + (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0), + ] - df = DataFrame.from_records(all_wins_and_wagers, - columns=("ID", "timestamp", "A", "B") - ).set_index("timestamp") + df = DataFrame.from_records( + all_wins_and_wagers, columns=("ID", "timestamp", "A", "B") + ).set_index("timestamp") result = df.groupby("ID").resample("5min").sum() expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum()) assert_frame_equal(result, expected) @@ -825,34 +870,30 @@ def test_resample_dtype_preservation(): # GH 12202 # validation tests for dtype preservation - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, freq='W'), - 'group': [1, 1, 2, 2], - 'val': Series([5, 6, 7, 8], - dtype='int32')} - ).set_index('date') - - result = df.resample('1D').ffill() + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": Series([5, 6, 7, 8], dtype="int32"), + } + ).set_index("date") + + result = df.resample("1D").ffill() assert result.val.dtype == np.int32 - result = df.groupby('group').resample('1D').ffill() + result = df.groupby("group").resample("1D").ffill() assert result.val.dtype == np.int32 def test_resample_dtype_coerceion(): - pytest.importorskip('scipy.interpolate') + pytest.importorskip("scipy.interpolate") # GH 16361 df = {"a": [1, 3, 1, 4]} df = DataFrame(df, index=pd.date_range("2017-01-01", "2017-01-04")) - expected = (df.astype("float64") - .resample("H") - .mean() - ["a"] - .interpolate("cubic") - ) + expected = df.astype("float64").resample("H").mean()["a"].interpolate("cubic") result = df.resample("H")["a"].mean().interpolate("cubic") tm.assert_series_equal(result, expected) @@ -863,20 +904,20 @@ def test_resample_dtype_coerceion(): def test_weekly_resample_buglet(): # #1327 - rng = date_range('1/1/2000', freq='B', periods=20) + rng = date_range("1/1/2000", freq="B", periods=20) ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('W').mean() - expected = ts.resample('W-SUN').mean() + resampled = ts.resample("W").mean() + expected = ts.resample("W-SUN").mean() assert_series_equal(resampled, expected) def test_monthly_resample_error(): # #1451 - dates = date_range('4/16/2012 20:00', periods=5000, freq='h') + dates = date_range("4/16/2012 20:00", periods=5000, freq="h") ts = Series(np.random.randn(len(dates)), index=dates) # it works! 
- ts.resample('M') + ts.resample("M") def test_nanosecond_resample_error(): @@ -884,20 +925,12 @@ def test_nanosecond_resample_error(): # Resampling using pd.tseries.offsets.Nano as period start = 1443707890427 exp_start = 1443707890400 - indx = pd.date_range( - start=pd.to_datetime(start), - periods=10, - freq='100n' - ) + indx = pd.date_range(start=pd.to_datetime(start), periods=10, freq="100n") ts = Series(range(len(indx)), index=indx) r = ts.resample(pd.tseries.offsets.Nano(100)) - result = r.agg('mean') + result = r.agg("mean") - exp_indx = pd.date_range( - start=pd.to_datetime(exp_start), - periods=10, - freq='100n' - ) + exp_indx = pd.date_range(start=pd.to_datetime(exp_start), periods=10, freq="100n") exp = Series(range(len(exp_indx)), index=exp_indx) assert_series_equal(result, exp) @@ -906,48 +939,44 @@ def test_nanosecond_resample_error(): def test_resample_anchored_intraday(simple_date_range_series): # #1471, #1458 - rng = date_range('1/1/2012', '4/1/2012', freq='100min') + rng = date_range("1/1/2012", "4/1/2012", freq="100min") df = DataFrame(rng.month, index=rng) - result = df.resample('M').mean() - expected = df.resample( - 'M', kind='period').mean().to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("M").mean() + expected = df.resample("M", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - result = df.resample('M', closed='left').mean() - exp = df.tshift(1, freq='D').resample('M', kind='period').mean() - exp = exp.to_timestamp(how='end') + result = df.resample("M", closed="left").mean() + exp = df.tshift(1, freq="D").resample("M", kind="period").mean() + exp = exp.to_timestamp(how="end") - exp.index = exp.index + Timedelta(1, 'ns') - Timedelta(1, 'D') + exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, exp) - rng = date_range('1/1/2012', '4/1/2012', freq='100min') + rng = date_range("1/1/2012", "4/1/2012", freq="100min") df = DataFrame(rng.month, index=rng) - result = df.resample('Q').mean() - expected = df.resample( - 'Q', kind='period').mean().to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("Q").mean() + expected = df.resample("Q", kind="period").mean().to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - result = df.resample('Q', closed='left').mean() - expected = df.tshift(1, freq='D').resample('Q', kind='period', - closed='left').mean() - expected = expected.to_timestamp(how='end') - expected.index += Timedelta(1, 'ns') - Timedelta(1, 'D') + result = df.resample("Q", closed="left").mean() + expected = df.tshift(1, freq="D").resample("Q", kind="period", closed="left").mean() + expected = expected.to_timestamp(how="end") + expected.index += Timedelta(1, "ns") - Timedelta(1, "D") tm.assert_frame_equal(result, expected) - ts = simple_date_range_series('2012-04-29 23:00', '2012-04-30 5:00', - freq='h') - resampled = ts.resample('M').mean() + ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h") + resampled = ts.resample("M").mean() assert len(resampled) == 1 def test_resample_anchored_monthstart(simple_date_range_series): - ts = simple_date_range_series('1/1/2000', '12/31/2002') + ts = simple_date_range_series("1/1/2000", "12/31/2002") - freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN'] + freqs = ["MS", "BMS", "QS-MAR", 
"AS-DEC", "AS-JUN"] for freq in freqs: ts.resample(freq).mean() @@ -961,115 +990,113 @@ def test_resample_anchored_multiday(): # See: https://github.com/pandas-dev/pandas/issues/8683 index = pd.date_range( - '2014-10-14 23:06:23.206', periods=3, freq='400L' - ) | pd.date_range( - '2014-10-15 23:00:00', periods=2, freq='2200L') + "2014-10-14 23:06:23.206", periods=3, freq="400L" + ) | pd.date_range("2014-10-15 23:00:00", periods=2, freq="2200L") s = Series(np.random.randn(5), index=index) # Ensure left closing works - result = s.resample('2200L').mean() - assert result.index[-1] == Timestamp('2014-10-15 23:00:02.000') + result = s.resample("2200L").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:02.000") # Ensure right closing works - result = s.resample('2200L', label='right').mean() - assert result.index[-1] == Timestamp('2014-10-15 23:00:04.200') + result = s.resample("2200L", label="right").mean() + assert result.index[-1] == Timestamp("2014-10-15 23:00:04.200") -def test_corner_cases(simple_period_range_series, - simple_date_range_series): +def test_corner_cases(simple_period_range_series, simple_date_range_series): # miscellaneous test coverage - rng = date_range('1/1/2000', periods=12, freq='t') + rng = date_range("1/1/2000", periods=12, freq="t") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('5t', closed='right', label='left').mean() - ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') + result = ts.resample("5t", closed="right", label="left").mean() + ex_index = date_range("1999-12-31 23:55", periods=4, freq="5t") tm.assert_index_equal(result.index, ex_index) - len0pts = simple_period_range_series( - '2007-01', '2010-05', freq='M')[:0] + len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0] # it works - result = len0pts.resample('A-DEC').mean() + result = len0pts.resample("A-DEC").mean() assert len(result) == 0 # resample to periods - ts = simple_date_range_series( - '2000-04-28', '2000-04-30 11:00', freq='h') - result = ts.resample('M', kind='period').mean() + ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h") + result = ts.resample("M", kind="period").mean() assert len(result) == 1 - assert result.index[0] == Period('2000-04', freq='M') + assert result.index[0] == Period("2000-04", freq="M") def test_anchored_lowercase_buglet(): - dates = date_range('4/16/2012 20:00', periods=50000, freq='s') + dates = date_range("4/16/2012 20:00", periods=50000, freq="s") ts = Series(np.random.randn(len(dates)), index=dates) # it works! 
- ts.resample('d').mean() + ts.resample("d").mean() def test_upsample_apply_functions(): # #1596 - rng = pd.date_range('2012-06-12', periods=4, freq='h') + rng = pd.date_range("2012-06-12", periods=4, freq="h") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('20min').aggregate(['mean', 'sum']) + result = ts.resample("20min").aggregate(["mean", "sum"]) assert isinstance(result, DataFrame) def test_resample_not_monotonic(): - rng = pd.date_range('2012-06-12', periods=200, freq='h') + rng = pd.date_range("2012-06-12", periods=200, freq="h") ts = Series(np.random.randn(len(rng)), index=rng) ts = ts.take(np.random.permutation(len(ts))) - result = ts.resample('D').sum() - exp = ts.sort_index().resample('D').sum() + result = ts.resample("D").sum() + exp = ts.sort_index().resample("D").sum() assert_series_equal(result, exp) def test_resample_median_bug_1688(): - for dtype in ['int64', 'int32', 'float64', 'float32']: - df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0), - datetime(2012, 1, 1, 0, 5, 0)], - dtype=dtype) + for dtype in ["int64", "int32", "float64", "float32"]: + df = DataFrame( + [1, 2], + index=[datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)], + dtype=dtype, + ) result = df.resample("T").apply(lambda x: x.mean()) - exp = df.asfreq('T') + exp = df.asfreq("T") tm.assert_frame_equal(result, exp) result = df.resample("T").median() - exp = df.asfreq('T') + exp = df.asfreq("T") tm.assert_frame_equal(result, exp) def test_how_lambda_functions(simple_date_range_series): - ts = simple_date_range_series('1/1/2000', '4/1/2000') + ts = simple_date_range_series("1/1/2000", "4/1/2000") - result = ts.resample('M').apply(lambda x: x.mean()) - exp = ts.resample('M').mean() + result = ts.resample("M").apply(lambda x: x.mean()) + exp = ts.resample("M").mean() tm.assert_series_equal(result, exp) - foo_exp = ts.resample('M').mean() - foo_exp.name = 'foo' - bar_exp = ts.resample('M').std() - bar_exp.name = 'bar' + foo_exp = ts.resample("M").mean() + foo_exp.name = "foo" + bar_exp = ts.resample("M").std() + bar_exp.name = "bar" - result = ts.resample('M').apply( - [lambda x: x.mean(), lambda x: x.std(ddof=1)]) - result.columns = ['foo', 'bar'] - tm.assert_series_equal(result['foo'], foo_exp) - tm.assert_series_equal(result['bar'], bar_exp) + result = ts.resample("M").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)]) + result.columns = ["foo", "bar"] + tm.assert_series_equal(result["foo"], foo_exp) + tm.assert_series_equal(result["bar"], bar_exp) # this is a MI Series, so comparing the names of the results # doesn't make sense - result = ts.resample('M').aggregate({'foo': lambda x: x.mean(), - 'bar': lambda x: x.std(ddof=1)}) - tm.assert_series_equal(result['foo'], foo_exp, check_names=False) - tm.assert_series_equal(result['bar'], bar_exp, check_names=False) + result = ts.resample("M").aggregate( + {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)} + ) + tm.assert_series_equal(result["foo"], foo_exp, check_names=False) + tm.assert_series_equal(result["bar"], bar_exp, check_names=False) def test_resample_unequal_times(): @@ -1078,10 +1105,10 @@ def test_resample_unequal_times(): # end hour is less than start end = datetime(2012, 7, 31, 4) bad_ind = date_range(start, end, freq="30min") - df = DataFrame({'close': 1}, index=bad_ind) + df = DataFrame({"close": 1}, index=bad_ind) # it works! 
- df.resample('AS').sum() + df.resample("AS").sum() def test_resample_consistency(): @@ -1089,18 +1116,18 @@ def test_resample_consistency(): # GH 6418 # resample with bfill / limit / reindex consistency - i30 = pd.date_range('2002-02-02', periods=4, freq='30T') - s = Series(np.arange(4.), index=i30) + i30 = pd.date_range("2002-02-02", periods=4, freq="30T") + s = Series(np.arange(4.0), index=i30) s[2] = np.NaN # Upsample by factor 3 with reindex() and resample() methods: - i10 = pd.date_range(i30[0], i30[-1], freq='10T') + i10 = pd.date_range(i30[0], i30[-1], freq="10T") - s10 = s.reindex(index=i10, method='bfill') - s10_2 = s.reindex(index=i10, method='bfill', limit=2) - rl = s.reindex_like(s10, method='bfill', limit=2) - r10_2 = s.resample('10Min').bfill(limit=2) - r10 = s.resample('10Min').bfill() + s10 = s.reindex(index=i10, method="bfill") + s10_2 = s.reindex(index=i10, method="bfill", limit=2) + rl = s.reindex_like(s10, method="bfill", limit=2) + r10_2 = s.resample("10Min").bfill(limit=2) + r10 = s.resample("10Min").bfill() # s10_2, r10, r10_2, rl should all be equal assert_series_equal(s10_2, r10) @@ -1110,110 +1137,118 @@ def test_resample_consistency(): def test_resample_timegrouper(): # GH 7227 - dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3), - datetime(2014, 11, 5), datetime(2014, 9, 5), - datetime(2014, 10, 8), datetime(2014, 7, 15)] + dates1 = [ + datetime(2014, 10, 1), + datetime(2014, 9, 3), + datetime(2014, 11, 5), + datetime(2014, 9, 5), + datetime(2014, 10, 8), + datetime(2014, 7, 15), + ] dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:] dates3 = [pd.NaT] + dates1 + [pd.NaT] for dates in [dates1, dates2, dates3]: df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) - result = df.set_index('A').resample('M').count() - exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', - '2014-09-30', - '2014-10-31', '2014-11-30'], - freq='M', name='A') - expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx) + result = df.set_index("A").resample("M").count() + exp_idx = pd.DatetimeIndex( + ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"], + freq="M", + name="A", + ) + expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx) assert_frame_equal(result, expected) - result = df.groupby(pd.Grouper(freq='M', key='A')).count() + result = df.groupby(pd.Grouper(freq="M", key="A")).count() assert_frame_equal(result, expected) - df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange( - len(dates)))) - result = df.set_index('A').resample('M').count() - expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]}, - index=exp_idx, columns=['B', 'C']) + df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(len(dates)))) + result = df.set_index("A").resample("M").count() + expected = DataFrame( + {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]}, + index=exp_idx, + columns=["B", "C"], + ) assert_frame_equal(result, expected) - result = df.groupby(pd.Grouper(freq='M', key='A')).count() + result = df.groupby(pd.Grouper(freq="M", key="A")).count() assert_frame_equal(result, expected) def test_resample_nunique(): # GH 12352 - df = DataFrame({ - 'ID': {Timestamp('2015-06-05 00:00:00'): '0010100903', - Timestamp('2015-06-08 00:00:00'): '0010150847'}, - 'DATE': {Timestamp('2015-06-05 00:00:00'): '2015-06-05', - Timestamp('2015-06-08 00:00:00'): '2015-06-08'}}) - r = df.resample('D') - g = df.groupby(pd.Grouper(freq='D')) - expected = df.groupby(pd.Grouper(freq='D')).ID.apply(lambda x: - x.nunique()) - assert expected.name 
== 'ID' + df = DataFrame( + { + "ID": { + Timestamp("2015-06-05 00:00:00"): "0010100903", + Timestamp("2015-06-08 00:00:00"): "0010150847", + }, + "DATE": { + Timestamp("2015-06-05 00:00:00"): "2015-06-05", + Timestamp("2015-06-08 00:00:00"): "2015-06-08", + }, + } + ) + r = df.resample("D") + g = df.groupby(pd.Grouper(freq="D")) + expected = df.groupby(pd.Grouper(freq="D")).ID.apply(lambda x: x.nunique()) + assert expected.name == "ID" for t in [r, g]: result = r.ID.nunique() assert_series_equal(result, expected) - result = df.ID.resample('D').nunique() + result = df.ID.resample("D").nunique() assert_series_equal(result, expected) - result = df.ID.groupby(pd.Grouper(freq='D')).nunique() + result = df.ID.groupby(pd.Grouper(freq="D")).nunique() assert_series_equal(result, expected) def test_resample_nunique_preserves_column_level_names(): # see gh-23222 df = tm.makeTimeDataFrame(freq="1D").abs() - df.columns = pd.MultiIndex.from_arrays([df.columns.tolist()] * 2, - names=["lev0", "lev1"]) + df.columns = pd.MultiIndex.from_arrays( + [df.columns.tolist()] * 2, names=["lev0", "lev1"] + ) result = df.resample("1h").nunique() tm.assert_index_equal(df.columns, result.columns) def test_resample_nunique_with_date_gap(): # GH 13453 - index = pd.date_range('1-1-2000', '2-15-2000', freq='h') - index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h') + index = pd.date_range("1-1-2000", "2-15-2000", freq="h") + index2 = pd.date_range("4-15-2000", "5-15-2000", freq="h") index3 = index.append(index2) - s = Series(range(len(index3)), index=index3, dtype='int64') - r = s.resample('M') + s = Series(range(len(index3)), index=index3, dtype="int64") + r = s.resample("M") # Since all elements are unique, these should all be the same - results = [ - r.count(), - r.nunique(), - r.agg(Series.nunique), - r.agg('nunique') - ] + results = [r.count(), r.nunique(), r.agg(Series.nunique), r.agg("nunique")] assert_series_equal(results[0], results[1]) assert_series_equal(results[0], results[2]) assert_series_equal(results[0], results[3]) -@pytest.mark.parametrize('n', [10000, 100000]) -@pytest.mark.parametrize('k', [10, 100, 1000]) +@pytest.mark.parametrize("n", [10000, 100000]) +@pytest.mark.parametrize("k", [10, 100, 1000]) def test_resample_group_info(n, k): # GH10914 # use a fixed seed to always have the same uniques prng = np.random.RandomState(1234) - dr = date_range(start='2015-08-27', periods=n // 10, freq='T') - ts = Series(prng.randint(0, n // k, n).astype('int64'), - index=prng.choice(dr, n)) + dr = date_range(start="2015-08-27", periods=n // 10, freq="T") + ts = Series(prng.randint(0, n // k, n).astype("int64"), index=prng.choice(dr, n)) - left = ts.resample('30T').nunique() - ix = date_range(start=ts.index.min(), end=ts.index.max(), - freq='30T') + left = ts.resample("30T").nunique() + ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30T") vals = ts.values - bins = np.searchsorted(ix.values, ts.index, side='right') + bins = np.searchsorted(ix.values, ts.index, side="right") sorter = np.lexsort((vals, bins)) vals, bins = vals[sorter], bins[sorter] @@ -1221,8 +1256,7 @@ def test_resample_group_info(n, k): mask = np.r_[True, vals[1:] != vals[:-1]] mask |= np.r_[True, bins[1:] != bins[:-1]] - arr = np.bincount(bins[mask] - 1, - minlength=len(ix)).astype('int64', copy=False) + arr = np.bincount(bins[mask] - 1, minlength=len(ix)).astype("int64", copy=False) right = Series(arr, index=ix) assert_series_equal(left, right) @@ -1230,15 +1264,14 @@ def test_resample_group_info(n, k): def 
test_resample_size(): n = 10000 - dr = date_range('2015-09-19', periods=n, freq='T') + dr = date_range("2015-09-19", periods=n, freq="T") ts = Series(np.random.randn(n), index=np.random.choice(dr, n)) - left = ts.resample('7T').size() - ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T') + left = ts.resample("7T").size() + ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7T") - bins = np.searchsorted(ix.values, ts.index.values, side='right') - val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype('int64', - copy=False) + bins = np.searchsorted(ix.values, ts.index.values, side="right") + val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False) right = Series(val, index=ix) assert_series_equal(left, right) @@ -1252,20 +1285,24 @@ def test_resample_across_dst(): # The DatetimeIndex we will start with # (note that DST happens at 03:00+02:00 -> 02:00+01:00) # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00 - df1 = DataFrame([1477786980, 1477790580], columns=['ts']) - dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s') - .dt.tz_localize('UTC') - .dt.tz_convert('Europe/Madrid')) + df1 = DataFrame([1477786980, 1477790580], columns=["ts"]) + dti1 = DatetimeIndex( + pd.to_datetime(df1.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) # The expected DatetimeIndex after resampling. # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00 - df2 = DataFrame([1477785600, 1477789200], columns=['ts']) - dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s') - .dt.tz_localize('UTC') - .dt.tz_convert('Europe/Madrid')) + df2 = DataFrame([1477785600, 1477789200], columns=["ts"]) + dti2 = DatetimeIndex( + pd.to_datetime(df2.ts, unit="s") + .dt.tz_localize("UTC") + .dt.tz_convert("Europe/Madrid") + ) df = DataFrame([5, 5], index=dti1) - result = df.resample(rule='H').sum() + result = df.resample(rule="H").sum() expected = DataFrame([5, 5], index=dti2) assert_frame_equal(result, expected) @@ -1273,13 +1310,15 @@ def test_resample_across_dst(): def test_groupby_with_dst_time_change(): # GH 24972 - index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], - tz='UTC').tz_convert('America/Chicago') + index = pd.DatetimeIndex( + [1478064900001000000, 1480037118776792000], tz="UTC" + ).tz_convert("America/Chicago") df = pd.DataFrame([1, 2], index=index) - result = df.groupby(pd.Grouper(freq='1d')).last() - expected_index_values = pd.date_range('2016-11-02', '2016-11-24', - freq='d', tz='America/Chicago') + result = df.groupby(pd.Grouper(freq="1d")).last() + expected_index_values = pd.date_range( + "2016-11-02", "2016-11-24", freq="d", tz="America/Chicago" + ) index = pd.DatetimeIndex(expected_index_values) expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) @@ -1288,132 +1327,151 @@ def test_groupby_with_dst_time_change(): def test_resample_dst_anchor(): # 5172 - dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') + dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz="US/Eastern") df = DataFrame([5], index=dti) - assert_frame_equal(df.resample(rule='D').sum(), - DataFrame([5], index=df.index.normalize())) - df.resample(rule='MS').sum() assert_frame_equal( - df.resample(rule='MS').sum(), - DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], - tz='US/Eastern'))) + df.resample(rule="D").sum(), DataFrame([5], index=df.index.normalize()) + ) + df.resample(rule="MS").sum() + assert_frame_equal( + df.resample(rule="MS").sum(), + DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 
1)], tz="US/Eastern")), + ) - dti = date_range('2013-09-30', '2013-11-02', freq='30Min', - tz='Europe/Paris') + dti = date_range("2013-09-30", "2013-11-02", freq="30Min", tz="Europe/Paris") values = range(dti.size) - df = DataFrame({"a": values, - "b": values, - "c": values}, index=dti, dtype='int64') + df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype="int64") how = {"a": "min", "b": "max", "c": "count"} assert_frame_equal( df.resample("W-MON").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 384, 720, 1056, 1394], - "b": [47, 383, 719, 1055, 1393, 1586], - "c": [48, 336, 336, 336, 338, 193]}, - index=date_range('9/30/2013', '11/4/2013', - freq='W-MON', tz='Europe/Paris')), - 'W-MON Frequency') + DataFrame( + { + "a": [0, 48, 384, 720, 1056, 1394], + "b": [47, 383, 719, 1055, 1393, 1586], + "c": [48, 336, 336, 336, 338, 193], + }, + index=date_range("9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris"), + ), + "W-MON Frequency", + ) assert_frame_equal( df.resample("2W-MON").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 720, 1394], - "b": [47, 719, 1393, 1586], - "c": [48, 672, 674, 193]}, - index=date_range('9/30/2013', '11/11/2013', - freq='2W-MON', tz='Europe/Paris')), - '2W-MON Frequency') + DataFrame( + { + "a": [0, 48, 720, 1394], + "b": [47, 719, 1393, 1586], + "c": [48, 672, 674, 193], + }, + index=date_range( + "9/30/2013", "11/11/2013", freq="2W-MON", tz="Europe/Paris" + ), + ), + "2W-MON Frequency", + ) assert_frame_equal( df.resample("MS").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 48, 1538], - "b": [47, 1537, 1586], - "c": [48, 1490, 49]}, - index=date_range('9/1/2013', '11/1/2013', - freq='MS', tz='Europe/Paris')), - 'MS Frequency') + DataFrame( + {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris"), + ), + "MS Frequency", + ) assert_frame_equal( df.resample("2MS").agg(how)[["a", "b", "c"]], - DataFrame({"a": [0, 1538], - "b": [1537, 1586], - "c": [1538, 49]}, - index=date_range('9/1/2013', '11/1/2013', - freq='2MS', tz='Europe/Paris')), - '2MS Frequency') - - df_daily = df['10/26/2013':'10/29/2013'] + DataFrame( + {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, + index=date_range("9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris"), + ), + "2MS Frequency", + ) + + df_daily = df["10/26/2013":"10/29/2013"] assert_frame_equal( - df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"}) - [["a", "b", "c"]], - DataFrame({"a": [1248, 1296, 1346, 1394], - "b": [1295, 1345, 1393, 1441], - "c": [48, 50, 48, 48]}, - index=date_range('10/26/2013', '10/29/2013', - freq='D', tz='Europe/Paris')), - 'D Frequency') + df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})[ + ["a", "b", "c"] + ], + DataFrame( + { + "a": [1248, 1296, 1346, 1394], + "b": [1295, 1345, 1393, 1441], + "c": [48, 50, 48, 48], + }, + index=date_range("10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris"), + ), + "D Frequency", + ) def test_downsample_across_dst(): # GH 8531 - tz = pytz.timezone('Europe/Berlin') + tz = pytz.timezone("Europe/Berlin") dt = datetime(2014, 10, 26) - dates = date_range(tz.localize(dt), periods=4, freq='2H') - result = Series(5, index=dates).resample('H').mean() - expected = Series([5., np.nan] * 3 + [5.], - index=date_range(tz.localize(dt), periods=7, - freq='H')) + dates = date_range(tz.localize(dt), periods=4, freq="2H") + result = Series(5, index=dates).resample("H").mean() + expected = Series( + [5.0, np.nan] * 3 + [5.0], + 
index=date_range(tz.localize(dt), periods=7, freq="H"), + ) tm.assert_series_equal(result, expected) def test_downsample_across_dst_weekly(): # GH 9119, GH 21459 - df = DataFrame(index=DatetimeIndex([ - '2017-03-25', '2017-03-26', '2017-03-27', - '2017-03-28', '2017-03-29' - ], tz='Europe/Amsterdam'), - data=[11, 12, 13, 14, 15]) - result = df.resample('1W').sum() - expected = DataFrame([23, 42], index=pd.DatetimeIndex([ - '2017-03-26', '2017-04-02' - ], tz='Europe/Amsterdam')) + df = DataFrame( + index=DatetimeIndex( + ["2017-03-25", "2017-03-26", "2017-03-27", "2017-03-28", "2017-03-29"], + tz="Europe/Amsterdam", + ), + data=[11, 12, 13, 14, 15], + ) + result = df.resample("1W").sum() + expected = DataFrame( + [23, 42], + index=pd.DatetimeIndex(["2017-03-26", "2017-04-02"], tz="Europe/Amsterdam"), + ) tm.assert_frame_equal(result, expected) - idx = pd.date_range("2013-04-01", "2013-05-01", tz='Europe/London', - freq='H') + idx = pd.date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="H") s = Series(index=idx) - result = s.resample('W').mean() - expected = Series(index=pd.date_range( - '2013-04-07', freq='W', periods=5, tz='Europe/London' - )) + result = s.resample("W").mean() + expected = Series( + index=pd.date_range("2013-04-07", freq="W", periods=5, tz="Europe/London") + ) tm.assert_series_equal(result, expected) def test_resample_with_nat(): # GH 13020 - index = DatetimeIndex([pd.NaT, - '1970-01-01 00:00:00', - pd.NaT, - '1970-01-01 00:00:01', - '1970-01-01 00:00:02']) + index = DatetimeIndex( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:01", + "1970-01-01 00:00:02", + ] + ) frame = DataFrame([2, 3, 5, 7, 11], index=index) - index_1s = DatetimeIndex(['1970-01-01 00:00:00', - '1970-01-01 00:00:01', - '1970-01-01 00:00:02']) + index_1s = DatetimeIndex( + ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"] + ) frame_1s = DataFrame([3, 7, 11], index=index_1s) - assert_frame_equal(frame.resample('1s').mean(), frame_1s) + assert_frame_equal(frame.resample("1s").mean(), frame_1s) - index_2s = DatetimeIndex(['1970-01-01 00:00:00', - '1970-01-01 00:00:02']) + index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]) frame_2s = DataFrame([5, 11], index=index_2s) - assert_frame_equal(frame.resample('2s').mean(), frame_2s) + assert_frame_equal(frame.resample("2s").mean(), frame_2s) - index_3s = DatetimeIndex(['1970-01-01 00:00:00']) + index_3s = DatetimeIndex(["1970-01-01 00:00:00"]) frame_3s = DataFrame([7], index=index_3s) - assert_frame_equal(frame.resample('3s').mean(), frame_3s) + assert_frame_equal(frame.resample("3s").mean(), frame_3s) - assert_frame_equal(frame.resample('60s').mean(), frame_3s) + assert_frame_equal(frame.resample("60s").mean(), frame_3s) def test_resample_datetime_values(): @@ -1422,15 +1480,17 @@ def test_resample_datetime_values(): # introduced by the resampling dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)] - df = DataFrame({'timestamp': dates}, index=dates) + df = DataFrame({"timestamp": dates}, index=dates) - exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], - index=date_range('2016-01-15', periods=3, freq='2D'), - name='timestamp') + exp = Series( + [datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], + index=date_range("2016-01-15", periods=3, freq="2D"), + name="timestamp", + ) - res = df.resample('2D').first()['timestamp'] + res = df.resample("2D").first()["timestamp"] tm.assert_series_equal(res, exp) - res = df['timestamp'].resample('2D').first() + res = 
df["timestamp"].resample("2D").first() tm.assert_series_equal(res, exp) @@ -1440,43 +1500,45 @@ def f(data, add_arg): return np.mean(data) * add_arg multiplier = 10 - result = series.resample('D').apply(f, multiplier) - expected = series.resample('D').mean().multiply(multiplier) + result = series.resample("D").apply(f, multiplier) + expected = series.resample("D").mean().multiply(multiplier) tm.assert_series_equal(result, expected) # Testing as kwarg - result = series.resample('D').apply(f, add_arg=multiplier) - expected = series.resample('D').mean().multiply(multiplier) + result = series.resample("D").apply(f, add_arg=multiplier) + expected = series.resample("D").mean().multiply(multiplier) tm.assert_series_equal(result, expected) # Testing dataframe - df = pd.DataFrame({"A": 1, "B": 2}, - index=pd.date_range('2017', periods=10)) + df = pd.DataFrame({"A": 1, "B": 2}, index=pd.date_range("2017", periods=10)) result = df.groupby("A").resample("D").agg(f, multiplier) - expected = df.groupby("A").resample('D').mean().multiply(multiplier) + expected = df.groupby("A").resample("D").mean().multiply(multiplier) assert_frame_equal(result, expected) -@pytest.mark.parametrize('k', [1, 2, 3]) -@pytest.mark.parametrize('n1, freq1, n2, freq2', [ - (30, 'S', 0.5, 'Min'), - (60, 'S', 1, 'Min'), - (3600, 'S', 1, 'H'), - (60, 'Min', 1, 'H'), - (21600, 'S', 0.25, 'D'), - (86400, 'S', 1, 'D'), - (43200, 'S', 0.5, 'D'), - (1440, 'Min', 1, 'D'), - (12, 'H', 0.5, 'D'), - (24, 'H', 1, 'D'), -]) +@pytest.mark.parametrize("k", [1, 2, 3]) +@pytest.mark.parametrize( + "n1, freq1, n2, freq2", + [ + (30, "S", 0.5, "Min"), + (60, "S", 1, "Min"), + (3600, "S", 1, "H"), + (60, "Min", 1, "H"), + (21600, "S", 0.25, "D"), + (86400, "S", 1, "D"), + (43200, "S", 0.5, "D"), + (1440, "Min", 1, "D"), + (12, "H", 0.5, "D"), + (24, "H", 1, "D"), + ], +) def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): # GH 24127 n1_ = n1 * k n2_ = n2 * k - s = pd.Series(0, index=pd.date_range('19910905 13:00', - '19911005 07:00', - freq=freq1)) + s = pd.Series( + 0, index=pd.date_range("19910905 13:00", "19911005 07:00", freq=freq1) + ) s = s + range(len(s)) result1 = s.resample(str(n1_) + freq1).mean() @@ -1484,17 +1546,18 @@ def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k): assert_series_equal(result1, result2) -@pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ - ('19910905', '19920406', 'D', '19910905', '19920407'), - ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920407'), - ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', - '19920406 07:00'), - ('19910906', '19920406', 'M', '19910831', '19920430'), - ('19910831', '19920430', 'M', '19910831', '19920531'), - ('1991-08', '1992-04', 'M', '19910831', '19920531'), -]) -def test_get_timestamp_range_edges(first, last, offset, - exp_first, exp_last): +@pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920407"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"), + ("19910905 06:00", "19920406 06:00", "H", "19910905 06:00", "19920406 07:00"), + ("19910906", "19920406", "M", "19910831", "19920430"), + ("19910831", "19920430", "M", "19910831", "19920531"), + ("1991-08", "1992-04", "M", "19910831", "19920531"), + ], +) +def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): first = pd.Period(first) first = first.to_timestamp(first.freq) last = pd.Period(last) diff --git a/pandas/tests/resample/test_period_index.py 
b/pandas/tests/resample/test_period_index.py index 228de8a14c506..2ced955652c21 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -16,7 +16,10 @@ from pandas.core.resample import _get_period_range_edges import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) import pandas.tseries.offsets as offsets @@ -28,25 +31,23 @@ def _index_factory(): @pytest.fixture def _series_name(): - return 'pi' + return "pi" class TestPeriodIndex: - - @pytest.mark.parametrize('freq', ['2D', '1H', '2H']) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize("freq", ["2D", "1H", "2H"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 # make sure .asfreq() returns PeriodIndex (except kind='timestamp') obj = series_and_frame - if kind == 'timestamp': + if kind == "timestamp": expected = obj.to_timestamp().resample(freq).asfreq() else: - start = obj.index[0].to_timestamp(how='start') - end = (obj.index[-1] + obj.index.freq).to_timestamp(how='start') - new_index = date_range(start=start, end=end, freq=freq, - closed='left') + start = obj.index[0].to_timestamp(how="start") + end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start") + new_index = date_range(start=start, end=end, freq=freq, closed="left") expected = obj.to_timestamp().reindex(new_index).to_period(freq) result = obj.resample(freq, kind=kind).asfreq() assert_almost_equal(result, expected) @@ -55,45 +56,52 @@ def test_asfreq_fill_value(self, series): # test for fill value during resampling, issue 3715 s = series - new_index = date_range(s.index[0].to_timestamp(how='start'), - (s.index[-1]).to_timestamp(how='start'), - freq='1H') + new_index = date_range( + s.index[0].to_timestamp(how="start"), + (s.index[-1]).to_timestamp(how="start"), + freq="1H", + ) expected = s.to_timestamp().reindex(new_index, fill_value=4.0) - result = s.resample('1H', kind='timestamp').asfreq(fill_value=4.0) + result = s.resample("1H", kind="timestamp").asfreq(fill_value=4.0) assert_series_equal(result, expected) - frame = s.to_frame('value') - new_index = date_range(frame.index[0].to_timestamp(how='start'), - (frame.index[-1]).to_timestamp(how='start'), - freq='1H') + frame = s.to_frame("value") + new_index = date_range( + frame.index[0].to_timestamp(how="start"), + (frame.index[-1]).to_timestamp(how="start"), + freq="1H", + ) expected = frame.to_timestamp().reindex(new_index, fill_value=3.0) - result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0) + result = frame.resample("1H", kind="timestamp").asfreq(fill_value=3.0) assert_frame_equal(result, expected) - @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W']) - @pytest.mark.parametrize('kind', [None, 'period', 'timestamp']) - @pytest.mark.parametrize('kwargs', [dict(on='date'), dict(level='d')]) + @pytest.mark.parametrize("freq", ["H", "12H", "2D", "W"]) + @pytest.mark.parametrize("kind", [None, "period", "timestamp"]) + @pytest.mark.parametrize("kwargs", [dict(on="date"), dict(level="d")]) def test_selection(self, index, freq, kind, kwargs): # This is a bug, these should be implemented # GH 14008 rng = np.arange(len(index), dtype=np.int64) - df = DataFrame({'date': index, 'a': rng}, - index=pd.MultiIndex.from_arrays([rng, index], - names=['v', 'd'])) - msg = ("Resampling from 
level= or on= selection with a PeriodIndex is" - r" not currently supported, use \.set_index\(\.\.\.\) to" - " explicitly set index") + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + msg = ( + "Resampling from level= or on= selection with a PeriodIndex is" + r" not currently supported, use \.set_index\(\.\.\.\) to" + " explicitly set index" + ) with pytest.raises(NotImplementedError, match=msg): df.resample(freq, kind=kind, **kwargs) - @pytest.mark.parametrize('month', MONTHS) - @pytest.mark.parametrize('meth', ['ffill', 'bfill']) - @pytest.mark.parametrize('conv', ['start', 'end']) - @pytest.mark.parametrize('targ', ['D', 'B', 'M']) - def test_annual_upsample_cases(self, targ, conv, meth, month, - simple_period_range_series): - ts = simple_period_range_series( - '1/1/1990', '12/31/1991', freq='A-%s' % month) + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("meth", ["ffill", "bfill"]) + @pytest.mark.parametrize("conv", ["start", "end"]) + @pytest.mark.parametrize("targ", ["D", "B", "M"]) + def test_annual_upsample_cases( + self, targ, conv, meth, month, simple_period_range_series + ): + ts = simple_period_range_series("1/1/1990", "12/31/1991", freq="A-%s" % month) result = getattr(ts.resample(targ, convention=conv), meth)() expected = result.to_timestamp(targ, how=conv) @@ -101,269 +109,289 @@ def test_annual_upsample_cases(self, targ, conv, meth, month, assert_series_equal(result, expected) def test_basic_downsample(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec').mean() + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result = ts.resample("a-dec").mean() expected = ts.groupby(ts.index.year).mean() - expected.index = period_range('1/1/1990', '6/30/1995', freq='a-dec') + expected.index = period_range("1/1/1990", "6/30/1995", freq="a-dec") assert_series_equal(result, expected) # this is ok - assert_series_equal(ts.resample('a-dec').mean(), result) - assert_series_equal(ts.resample('a').mean(), result) - - @pytest.mark.parametrize('rule,expected_error_msg', [ - ('a-dec', ''), - ('q-mar', ''), - ('M', ''), - ('w-thu', '') - ]) - def test_not_subperiod( - self, simple_period_range_series, rule, expected_error_msg): + assert_series_equal(ts.resample("a-dec").mean(), result) + assert_series_equal(ts.resample("a").mean(), result) + + @pytest.mark.parametrize( + "rule,expected_error_msg", + [ + ("a-dec", ""), + ("q-mar", ""), + ("M", ""), + ("w-thu", ""), + ], + ) + def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg): # These are incompatible period rules for resampling - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='w-wed') - msg = ("Frequency cannot be resampled to {}, as they" - " are not sub or super periods").format(expected_error_msg) + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") + msg = ( + "Frequency cannot be resampled to {}, as they" + " are not sub or super periods" + ).format(expected_error_msg) with pytest.raises(IncompatibleFrequency, match=msg): ts.resample(rule).mean() - @pytest.mark.parametrize('freq', ['D', '2D']) + @pytest.mark.parametrize("freq", ["D", "2D"]) def test_basic_upsample(self, freq, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec').mean() + ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M") + result 
= ts.resample("a-dec").mean() - resampled = result.resample(freq, convention='end').ffill() - expected = result.to_timestamp(freq, how='end') - expected = expected.asfreq(freq, 'ffill').to_period(freq) + resampled = result.resample(freq, convention="end").ffill() + expected = result.to_timestamp(freq, how="end") + expected = expected.asfreq(freq, "ffill").to_period(freq) assert_series_equal(resampled, expected) def test_upsample_with_limit(self): - rng = period_range('1/1/2000', periods=5, freq='A') + rng = period_range("1/1/2000", periods=5, freq="A") ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('M', convention='end').ffill(limit=2) - expected = ts.asfreq('M').reindex(result.index, method='ffill', - limit=2) + result = ts.resample("M", convention="end").ffill(limit=2) + expected = ts.asfreq("M").reindex(result.index, method="ffill", limit=2) assert_series_equal(result, expected) def test_annual_upsample(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='A-DEC') - df = DataFrame({'a': ts}) - rdf = df.resample('D').ffill() - exp = df['a'].resample('D').ffill() - assert_series_equal(rdf['a'], exp) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="A-DEC") + df = DataFrame({"a": ts}) + rdf = df.resample("D").ffill() + exp = df["a"].resample("D").ffill() + assert_series_equal(rdf["a"], exp) - rng = period_range('2000', '2003', freq='A-DEC') + rng = period_range("2000", "2003", freq="A-DEC") ts = Series([1, 2, 3, 4], index=rng) - result = ts.resample('M').ffill() - ex_index = period_range('2000-01', '2003-12', freq='M') + result = ts.resample("M").ffill() + ex_index = period_range("2000-01", "2003-12", freq="M") - expected = ts.asfreq('M', how='start').reindex(ex_index, - method='ffill') + expected = ts.asfreq("M", how="start").reindex(ex_index, method="ffill") assert_series_equal(result, expected) - @pytest.mark.parametrize('month', MONTHS) - @pytest.mark.parametrize('target', ['D', 'B', 'M']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_quarterly_upsample(self, month, target, convention, - simple_period_range_series): - freq = 'Q-{month}'.format(month=month) - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq) + @pytest.mark.parametrize("month", MONTHS) + @pytest.mark.parametrize("target", ["D", "B", "M"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_quarterly_upsample( + self, month, target, convention, simple_period_range_series + ): + freq = "Q-{month}".format(month=month) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) - @pytest.mark.parametrize('target', ['D', 'B']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_monthly_upsample(self, target, convention, - simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M') + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_monthly_upsample(self, target, convention, simple_period_range_series): + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - 
expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) def test_resample_basic(self): # GH3609 - s = Series(range(100), index=date_range( - '20130101', freq='s', periods=100, name='idx'), dtype='float') + s = Series( + range(100), + index=date_range("20130101", freq="s", periods=100, name="idx"), + dtype="float", + ) s[10:30] = np.nan - index = PeriodIndex([ - Period('2013-01-01 00:00', 'T'), - Period('2013-01-01 00:01', 'T')], name='idx') + index = PeriodIndex( + [Period("2013-01-01 00:00", "T"), Period("2013-01-01 00:01", "T")], + name="idx", + ) expected = Series([34.5, 79.5], index=index) - result = s.to_period().resample('T', kind='period').mean() + result = s.to_period().resample("T", kind="period").mean() assert_series_equal(result, expected) - result2 = s.resample('T', kind='period').mean() + result2 = s.resample("T", kind="period").mean() assert_series_equal(result2, expected) - @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]), - ('2M', [31 + 29, 31 + 9])]) + @pytest.mark.parametrize( + "freq,expected_vals", [("M", [31, 29, 31, 9]), ("2M", [31 + 29, 31 + 9])] + ) def test_resample_count(self, freq, expected_vals): # GH12774 - series = Series(1, index=pd.period_range(start='2000', periods=100)) + series = Series(1, index=pd.period_range(start="2000", periods=100)) result = series.resample(freq).count() - expected_index = pd.period_range(start='2000', freq=freq, - periods=len(expected_vals)) + expected_index = pd.period_range( + start="2000", freq=freq, periods=len(expected_vals) + ) expected = Series(expected_vals, index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self, resample_method): # GH12770 - series = Series(range(3), index=pd.period_range( - start='2000', periods=3, freq='M')) + series = Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ) expected = series - result = getattr(series.resample('M'), resample_method)() + result = getattr(series.resample("M"), resample_method)() assert_series_equal(result, expected) def test_resample_incompat_freq(self): - msg = ("Frequency cannot be resampled to ," - " as they are not sub or super periods") + msg = ( + "Frequency cannot be resampled to ," + " as they are not sub or super periods" + ) with pytest.raises(IncompatibleFrequency, match=msg): - Series(range(3), index=pd.period_range( - start='2000', periods=3, freq='M')).resample('W').mean() + Series( + range(3), index=pd.period_range(start="2000", periods=3, freq="M") + ).resample("W").mean() def test_with_local_timezone_pytz(self): # see gh-5430 - local_timezone = pytz.timezone('America/Los_Angeles') + local_timezone = pytz.timezone("America/Los_Angeles") - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, - tzinfo=pytz.utc) + start = datetime(year=2013, month=11, day=1, hour=0, minute=0, tzinfo=pytz.utc) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, - tzinfo=pytz.utc) + end = datetime(year=2013, month=11, day=2, hour=0, minute=0, tzinfo=pytz.utc) - index = pd.date_range(start, end, freq='H') + index = pd.date_range(start, end, freq="H") series = Series(1, index=index) series = series.tz_convert(local_timezone) - result = series.resample('D', kind='period').mean() + result = series.resample("D", kind="period").mean() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to # Pacific - expected_index = 
(pd.period_range(start=start, end=end, freq='D') - - offsets.Day()) + expected_index = pd.period_range(start=start, end=end, freq="D") - offsets.Day() expected = Series(1, index=expected_index) assert_series_equal(result, expected) def test_resample_with_pytz(self): # GH 13238 - s = Series(2, index=pd.date_range('2017-01-01', periods=48, freq="H", - tz="US/Eastern")) + s = Series( + 2, index=pd.date_range("2017-01-01", periods=48, freq="H", tz="US/Eastern") + ) result = s.resample("D").mean() - expected = Series(2, index=pd.DatetimeIndex(['2017-01-01', - '2017-01-02'], - tz="US/Eastern")) + expected = Series( + 2, index=pd.DatetimeIndex(["2017-01-01", "2017-01-02"], tz="US/Eastern") + ) assert_series_equal(result, expected) # Especially assert that the timezone is LMT for pytz - assert result.index.tz == pytz.timezone('US/Eastern') + assert result.index.tz == pytz.timezone("US/Eastern") def test_with_local_timezone_dateutil(self): # see gh-5430 - local_timezone = 'dateutil/America/Los_Angeles' + local_timezone = "dateutil/America/Los_Angeles" - start = datetime(year=2013, month=11, day=1, hour=0, minute=0, - tzinfo=dateutil.tz.tzutc()) + start = datetime( + year=2013, month=11, day=1, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) # 1 day later - end = datetime(year=2013, month=11, day=2, hour=0, minute=0, - tzinfo=dateutil.tz.tzutc()) + end = datetime( + year=2013, month=11, day=2, hour=0, minute=0, tzinfo=dateutil.tz.tzutc() + ) - index = pd.date_range(start, end, freq='H', name='idx') + index = pd.date_range(start, end, freq="H", name="idx") series = Series(1, index=index) series = series.tz_convert(local_timezone) - result = series.resample('D', kind='period').mean() + result = series.resample("D", kind="period").mean() # Create the expected series # Index is moved back a day with the timezone conversion from UTC to # Pacific - expected_index = (pd.period_range(start=start, end=end, freq='D', - name='idx') - offsets.Day()) + expected_index = ( + pd.period_range(start=start, end=end, freq="D", name="idx") - offsets.Day() + ) expected = Series(1, index=expected_index) assert_series_equal(result, expected) def test_resample_nonexistent_time_bin_edge(self): # GH 19375 - index = date_range('2017-03-12', '2017-03-12 1:45:00', freq='15T') + index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15T") s = Series(np.zeros(len(index)), index=index) - expected = s.tz_localize('US/Pacific') - result = expected.resample('900S').mean() + expected = s.tz_localize("US/Pacific") + result = expected.resample("900S").mean() tm.assert_series_equal(result, expected) # GH 23742 - index = date_range(start='2017-10-10', end='2017-10-20', freq='1H') - index = index.tz_localize('UTC').tz_convert('America/Sao_Paulo') + index = date_range(start="2017-10-10", end="2017-10-20", freq="1H") + index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo") df = DataFrame(data=list(range(len(index))), index=index) - result = df.groupby(pd.Grouper(freq='1D')).count() - expected = date_range(start='2017-10-09', end='2017-10-20', freq='D', - tz="America/Sao_Paulo", - nonexistent='shift_forward', closed='left') + result = df.groupby(pd.Grouper(freq="1D")).count() + expected = date_range( + start="2017-10-09", + end="2017-10-20", + freq="D", + tz="America/Sao_Paulo", + nonexistent="shift_forward", + closed="left", + ) tm.assert_index_equal(result.index, expected) def test_resample_ambiguous_time_bin_edge(self): # GH 10117 - idx = pd.date_range("2014-10-25 22:00:00", "2014-10-26 00:30:00", - freq="30T", 
tz="Europe/London") + idx = pd.date_range( + "2014-10-25 22:00:00", "2014-10-26 00:30:00", freq="30T", tz="Europe/London" + ) expected = Series(np.zeros(len(idx)), index=idx) - result = expected.resample('30T').mean() + result = expected.resample("30T").mean() tm.assert_series_equal(result, expected) def test_fill_method_and_how_upsample(self): # GH2073 - s = Series(np.arange(9, dtype='int64'), - index=date_range('2010-01-01', periods=9, freq='Q')) - last = s.resample('M').ffill() - both = s.resample('M').ffill().resample('M').last().astype('int64') + s = Series( + np.arange(9, dtype="int64"), + index=date_range("2010-01-01", periods=9, freq="Q"), + ) + last = s.resample("M").ffill() + both = s.resample("M").ffill().resample("M").last().astype("int64") assert_series_equal(last, both) - @pytest.mark.parametrize('day', DAYS) - @pytest.mark.parametrize('target', ['D', 'B']) - @pytest.mark.parametrize('convention', ['start', 'end']) - def test_weekly_upsample(self, day, target, convention, - simple_period_range_series): - freq = 'W-{day}'.format(day=day) - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq=freq) + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("target", ["D", "B"]) + @pytest.mark.parametrize("convention", ["start", "end"]) + def test_weekly_upsample(self, day, target, convention, simple_period_range_series): + freq = "W-{day}".format(day=day) + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq=freq) result = ts.resample(target, convention=convention).ffill() expected = result.to_timestamp(target, how=convention) - expected = expected.asfreq(target, 'ffill').to_period() + expected = expected.asfreq(target, "ffill").to_period() assert_series_equal(result, expected) def test_resample_to_timestamps(self, simple_period_range_series): - ts = simple_period_range_series('1/1/1990', '12/31/1995', freq='M') + ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M") - result = ts.resample('A-DEC', kind='timestamp').mean() - expected = ts.to_timestamp(how='start').resample('A-DEC').mean() + result = ts.resample("A-DEC", kind="timestamp").mean() + expected = ts.to_timestamp(how="start").resample("A-DEC").mean() assert_series_equal(result, expected) def test_resample_to_quarterly(self, simple_period_range_series): for month in MONTHS: - ts = simple_period_range_series( - '1990', '1992', freq='A-%s' % month) - quar_ts = ts.resample('Q-%s' % month).ffill() + ts = simple_period_range_series("1990", "1992", freq="A-%s" % month) + quar_ts = ts.resample("Q-%s" % month).ffill() - stamps = ts.to_timestamp('D', how='start') - qdates = period_range(ts.index[0].asfreq('D', 'start'), - ts.index[-1].asfreq('D', 'end'), - freq='Q-%s' % month) + stamps = ts.to_timestamp("D", how="start") + qdates = period_range( + ts.index[0].asfreq("D", "start"), + ts.index[-1].asfreq("D", "end"), + freq="Q-%s" % month, + ) - expected = stamps.reindex(qdates.to_timestamp('D', 's'), - method='ffill') + expected = stamps.reindex(qdates.to_timestamp("D", "s"), method="ffill") expected.index = qdates assert_series_equal(quar_ts, expected) # conforms, but different month - ts = simple_period_range_series('1990', '1992', freq='A-JUN') + ts = simple_period_range_series("1990", "1992", freq="A-JUN") - for how in ['start', 'end']: - result = ts.resample('Q-MAR', convention=how).ffill() - expected = ts.asfreq('Q-MAR', how=how) - expected = expected.reindex(result.index, method='ffill') + for how in ["start", "end"]: + result = ts.resample("Q-MAR", convention=how).ffill() + 
expected = ts.asfreq("Q-MAR", how=how) + expected = expected.reindex(result.index, method="ffill") # .to_timestamp('D') # expected = expected.resample('Q-MAR').ffill() @@ -371,149 +399,154 @@ def test_resample_to_quarterly(self, simple_period_range_series): assert_series_equal(result, expected) def test_resample_fill_missing(self): - rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A') + rng = PeriodIndex([2000, 2005, 2007, 2009], freq="A") s = Series(np.random.randn(4), index=rng) stamps = s.to_timestamp() - filled = s.resample('A').ffill() - expected = stamps.resample('A').ffill().to_period('A') + filled = s.resample("A").ffill() + expected = stamps.resample("A").ffill().to_period("A") assert_series_equal(filled, expected) def test_cant_fill_missing_dups(self): - rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A') + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="A") s = Series(np.random.randn(5), index=rng) msg = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=msg): - s.resample('A').ffill() + s.resample("A").ffill() - @pytest.mark.parametrize('freq', ['5min']) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize("freq", ["5min"]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) def test_resample_5minute(self, freq, kind): - rng = period_range('1/1/2000', '1/5/2000', freq='T') + rng = period_range("1/1/2000", "1/5/2000", freq="T") ts = Series(np.random.randn(len(rng)), index=rng) expected = ts.to_timestamp().resample(freq).mean() - if kind != 'timestamp': + if kind != "timestamp": expected = expected.to_period(freq) result = ts.resample(freq, kind=kind).mean() assert_series_equal(result, expected) def test_upsample_daily_business_daily(self, simple_period_range_series): - ts = simple_period_range_series('1/1/2000', '2/1/2000', freq='B') + ts = simple_period_range_series("1/1/2000", "2/1/2000", freq="B") - result = ts.resample('D').asfreq() - expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000')) + result = ts.resample("D").asfreq() + expected = ts.asfreq("D").reindex(period_range("1/3/2000", "2/1/2000")) assert_series_equal(result, expected) - ts = simple_period_range_series('1/1/2000', '2/1/2000') - result = ts.resample('H', convention='s').asfreq() - exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H') - expected = ts.asfreq('H', how='s').reindex(exp_rng) + ts = simple_period_range_series("1/1/2000", "2/1/2000") + result = ts.resample("H", convention="s").asfreq() + exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="H") + expected = ts.asfreq("H", how="s").reindex(exp_rng) assert_series_equal(result, expected) def test_resample_irregular_sparse(self): - dr = date_range(start='1/1/2012', freq='5min', periods=1000) + dr = date_range(start="1/1/2012", freq="5min", periods=1000) s = Series(np.array(100), index=dr) # subset the data. 
- subset = s[:'2012-01-04 06:55'] + subset = s[:"2012-01-04 06:55"] - result = subset.resample('10min').apply(len) - expected = s.resample('10min').apply(len).loc[result.index] + result = subset.resample("10min").apply(len) + expected = s.resample("10min").apply(len).loc[result.index] assert_series_equal(result, expected) def test_resample_weekly_all_na(self): - rng = date_range('1/1/2000', periods=10, freq='W-WED') + rng = date_range("1/1/2000", periods=10, freq="W-WED") ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('W-THU').asfreq() + result = ts.resample("W-THU").asfreq() assert result.isna().all() - result = ts.resample('W-THU').asfreq().ffill()[:-1] - expected = ts.asfreq('W-THU').ffill() + result = ts.resample("W-THU").asfreq().ffill()[:-1] + expected = ts.asfreq("W-THU").ffill() assert_series_equal(result, expected) def test_resample_tz_localized(self): - dr = date_range(start='2012-4-13', end='2012-5-1') + dr = date_range(start="2012-4-13", end="2012-5-1") ts = Series(range(len(dr)), index=dr) - ts_utc = ts.tz_localize('UTC') - ts_local = ts_utc.tz_convert('America/Los_Angeles') + ts_utc = ts.tz_localize("UTC") + ts_local = ts_utc.tz_convert("America/Los_Angeles") - result = ts_local.resample('W').mean() + result = ts_local.resample("W").mean() ts_local_naive = ts_local.copy() - ts_local_naive.index = [x.replace(tzinfo=None) - for x in ts_local_naive.index.to_pydatetime()] + ts_local_naive.index = [ + x.replace(tzinfo=None) for x in ts_local_naive.index.to_pydatetime() + ] - exp = ts_local_naive.resample( - 'W').mean().tz_localize('America/Los_Angeles') + exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles") assert_series_equal(result, exp) # it works - result = ts_local.resample('D').mean() + result = ts_local.resample("D").mean() # #2245 - idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T', - tz='Australia/Sydney') + idx = date_range( + "2001-09-20 15:59", "2001-09-20 16:00", freq="T", tz="Australia/Sydney" + ) s = Series([1, 2], index=idx) - result = s.resample('D', closed='right', label='right').mean() - ex_index = date_range('2001-09-21', periods=1, freq='D', - tz='Australia/Sydney') + result = s.resample("D", closed="right", label="right").mean() + ex_index = date_range("2001-09-21", periods=1, freq="D", tz="Australia/Sydney") expected = Series([1.5], index=ex_index) assert_series_equal(result, expected) # for good measure - result = s.resample('D', kind='period').mean() - ex_index = period_range('2001-09-20', periods=1, freq='D') + result = s.resample("D", kind="period").mean() + ex_index = period_range("2001-09-20", periods=1, freq="D") expected = Series([1.5], index=ex_index) assert_series_equal(result, expected) # GH 6397 # comparing an offset that doesn't propagate tz's - rng = date_range('1/1/2011', periods=20000, freq='H') - rng = rng.tz_localize('EST') + rng = date_range("1/1/2011", periods=20000, freq="H") + rng = rng.tz_localize("EST") ts = DataFrame(index=rng) - ts['first'] = np.random.randn(len(rng)) - ts['second'] = np.cumsum(np.random.randn(len(rng))) + ts["first"] = np.random.randn(len(rng)) + ts["second"] = np.cumsum(np.random.randn(len(rng))) expected = DataFrame( { - 'first': ts.resample('A').sum()['first'], - 'second': ts.resample('A').mean()['second']}, - columns=['first', 'second']) - result = ts.resample( - 'A').agg({'first': np.sum, - 'second': np.mean}).reindex(columns=['first', 'second']) + "first": ts.resample("A").sum()["first"], + "second": ts.resample("A").mean()["second"], + }, + 
columns=["first", "second"], + ) + result = ( + ts.resample("A") + .agg({"first": np.sum, "second": np.mean}) + .reindex(columns=["first", "second"]) + ) assert_frame_equal(result, expected) def test_closed_left_corner(self): # #1465 - s = Series(np.random.randn(21), - index=date_range(start='1/1/2012 9:30', - freq='1min', periods=21)) + s = Series( + np.random.randn(21), + index=date_range(start="1/1/2012 9:30", freq="1min", periods=21), + ) s[0] = np.nan - result = s.resample('10min', closed='left', label='right').mean() - exp = s[1:].resample('10min', closed='left', label='right').mean() + result = s.resample("10min", closed="left", label="right").mean() + exp = s[1:].resample("10min", closed="left", label="right").mean() assert_series_equal(result, exp) - result = s.resample('10min', closed='left', label='left').mean() - exp = s[1:].resample('10min', closed='left', label='left').mean() + result = s.resample("10min", closed="left", label="left").mean() + exp = s[1:].resample("10min", closed="left", label="left").mean() - ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3) + ex_index = date_range(start="1/1/2012 9:30", freq="10min", periods=3) tm.assert_index_equal(result.index, ex_index) assert_series_equal(result, exp) def test_quarterly_resampling(self): - rng = period_range('2000Q1', periods=10, freq='Q-DEC') + rng = period_range("2000Q1", periods=10, freq="Q-DEC") ts = Series(np.arange(10), index=rng) - result = ts.resample('A').mean() - exp = ts.to_timestamp().resample('A').mean().to_period() + result = ts.resample("A").mean() + exp = ts.to_timestamp().resample("A").mean().to_period() assert_series_equal(result, exp) def test_resample_weekly_bug_1726(self): @@ -521,48 +554,51 @@ def test_resample_weekly_bug_1726(self): ind = date_range(start="8/6/2012", end="8/26/2012", freq="D") n = len(ind) data = [[x] * 5 for x in range(n)] - df = DataFrame(data, columns=['open', 'high', 'low', 'close', 'vol'], - index=ind) + df = DataFrame(data, columns=["open", "high", "low", "close", "vol"], index=ind) # it works! 
- df.resample('W-MON', closed='left', label='left').first() + df.resample("W-MON", closed="left", label="left").first() def test_resample_with_dst_time_change(self): # GH 15549 index = ( pd.DatetimeIndex([1457537600000000000, 1458059600000000000]) - .tz_localize("UTC").tz_convert('America/Chicago') + .tz_localize("UTC") + .tz_convert("America/Chicago") ) df = pd.DataFrame([1, 2], index=index) - result = df.resample('12h', closed='right', - label='right').last().ffill() - - expected_index_values = ['2016-03-09 12:00:00-06:00', - '2016-03-10 00:00:00-06:00', - '2016-03-10 12:00:00-06:00', - '2016-03-11 00:00:00-06:00', - '2016-03-11 12:00:00-06:00', - '2016-03-12 00:00:00-06:00', - '2016-03-12 12:00:00-06:00', - '2016-03-13 00:00:00-06:00', - '2016-03-13 13:00:00-05:00', - '2016-03-14 01:00:00-05:00', - '2016-03-14 13:00:00-05:00', - '2016-03-15 01:00:00-05:00', - '2016-03-15 13:00:00-05:00'] + result = df.resample("12h", closed="right", label="right").last().ffill() + + expected_index_values = [ + "2016-03-09 12:00:00-06:00", + "2016-03-10 00:00:00-06:00", + "2016-03-10 12:00:00-06:00", + "2016-03-11 00:00:00-06:00", + "2016-03-11 12:00:00-06:00", + "2016-03-12 00:00:00-06:00", + "2016-03-12 12:00:00-06:00", + "2016-03-13 00:00:00-06:00", + "2016-03-13 13:00:00-05:00", + "2016-03-14 01:00:00-05:00", + "2016-03-14 13:00:00-05:00", + "2016-03-15 01:00:00-05:00", + "2016-03-15 13:00:00-05:00", + ] index = pd.to_datetime(expected_index_values, utc=True).tz_convert( - 'America/Chicago') - expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 2.0], index=index) + "America/Chicago" + ) + expected = pd.DataFrame( + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + index=index, + ) assert_frame_equal(result, expected) def test_resample_bms_2752(self): # GH2753 - foo = Series(index=pd.bdate_range('20000101', '20000201')) + foo = Series(index=pd.bdate_range("20000101", "20000201")) res1 = foo.resample("BMS").mean() res2 = foo.resample("BMS").mean().resample("B").mean() - assert res1.index[0] == Timestamp('20000103') + assert res1.index[0] == Timestamp("20000103") assert res1.index[0] == res2.index[0] # def test_monthly_convention_span(self): @@ -579,28 +615,30 @@ def test_resample_bms_2752(self): # assert_series_equal(result, expected) def test_default_right_closed_label(self): - end_freq = ['D', 'Q', 'M', 'D'] - end_types = ['M', 'A', 'Q', 'W'] + end_freq = ["D", "Q", "M", "D"] + end_types = ["M", "A", "Q", "W"] for from_freq, to_freq in zip(end_freq, end_types): - idx = date_range(start='8/15/2012', periods=100, freq=from_freq) + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() - assert_frame_equal(resampled, df.resample(to_freq, closed='right', - label='right').mean()) + assert_frame_equal( + resampled, df.resample(to_freq, closed="right", label="right").mean() + ) def test_default_left_closed_label(self): - others = ['MS', 'AS', 'QS', 'D', 'H'] - others_freq = ['D', 'Q', 'M', 'H', 'T'] + others = ["MS", "AS", "QS", "D", "H"] + others_freq = ["D", "Q", "M", "H", "T"] for from_freq, to_freq in zip(others_freq, others): - idx = date_range(start='8/15/2012', periods=100, freq=from_freq) + idx = date_range(start="8/15/2012", periods=100, freq=from_freq) df = DataFrame(np.random.randn(len(idx), 2), idx) resampled = df.resample(to_freq).mean() - assert_frame_equal(resampled, df.resample(to_freq, closed='left', - label='left').mean()) + 
assert_frame_equal( + resampled, df.resample(to_freq, closed="left", label="left").mean() + ) def test_all_values_single_bin(self): # 2070 @@ -614,151 +652,218 @@ def test_evenly_divisible_with_no_extra_bins(self): # 4076 # when the frequency is evenly divisible, sometimes extra bins - df = DataFrame(np.random.randn(9, 3), - index=date_range('2000-1-1', periods=9)) - result = df.resample('5D').mean() - expected = pd.concat( - [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T - expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')] + df = DataFrame(np.random.randn(9, 3), index=date_range("2000-1-1", periods=9)) + result = df.resample("5D").mean() + expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T + expected.index = [Timestamp("2000-1-1"), Timestamp("2000-1-6")] assert_frame_equal(result, expected) - index = date_range(start='2001-5-4', periods=28) + index = date_range(start="2001-5-4", periods=28) df = DataFrame( - [{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90, - 'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 + - [{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10, - 'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28, - index=index.append(index)).sort_index() - - index = date_range('2001-5-4', periods=4, freq='7D') + [ + { + "REST_KEY": 1, + "DLY_TRN_QT": 80, + "DLY_SLS_AMT": 90, + "COOP_DLY_TRN_QT": 30, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28 + + [ + { + "REST_KEY": 2, + "DLY_TRN_QT": 70, + "DLY_SLS_AMT": 10, + "COOP_DLY_TRN_QT": 50, + "COOP_DLY_SLS_AMT": 20, + } + ] + * 28, + index=index.append(index), + ).sort_index() + + index = date_range("2001-5-4", periods=4, freq="7D") expected = DataFrame( - [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14, - 'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4, - index=index) - result = df.resample('7D').count() + [ + { + "REST_KEY": 14, + "DLY_TRN_QT": 14, + "DLY_SLS_AMT": 14, + "COOP_DLY_TRN_QT": 14, + "COOP_DLY_SLS_AMT": 14, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").count() assert_frame_equal(result, expected) expected = DataFrame( - [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700, - 'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4, - index=index) - result = df.resample('7D').sum() + [ + { + "REST_KEY": 21, + "DLY_TRN_QT": 1050, + "DLY_SLS_AMT": 700, + "COOP_DLY_TRN_QT": 560, + "COOP_DLY_SLS_AMT": 280, + } + ] + * 4, + index=index, + ) + result = df.resample("7D").sum() assert_frame_equal(result, expected) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) - @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) + @pytest.mark.parametrize("kind", ["period", None, "timestamp"]) + @pytest.mark.parametrize("agg_arg", ["mean", {"value": "mean"}, ["mean"]]) def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): # make sure passing loffset returns DatetimeIndex in all cases # basic method taken from Base.test_resample_loffset_arg_type() df = frame - expected_means = [df.values[i:i + 2].mean() - for i in range(0, len(df.values), 2)] - expected_index = period_range( - df.index[0], periods=len(df.index) / 2, freq='2D') + expected_means = [ + df.values[i : i + 2].mean() for i in range(0, len(df.values), 2) + ] + expected_index = period_range(df.index[0], periods=len(df.index) / 2, freq="2D") # loffset coerces PeriodIndex to DateTimeIndex expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) - expected = DataFrame({'value': expected_means}, index=expected_index) + expected = 
DataFrame({"value": expected_means}, index=expected_index) - result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg) + result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result_how = df.resample('2D', how=agg_arg, loffset='2H', - kind=kind) + result_how = df.resample("2D", how=agg_arg, loffset="2H", kind=kind) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')]) + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) assert_frame_equal(result_agg, expected) assert_frame_equal(result_how, expected) - @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)]) - @pytest.mark.parametrize('kind', [None, 'period']) + @pytest.mark.parametrize("freq, period_mult", [("H", 24), ("12H", 2)]) + @pytest.mark.parametrize("kind", [None, "period"]) def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 - pi = period_range(start='2000', freq='D', periods=10) + pi = period_range(start="2000", freq="D", periods=10) s = Series(range(len(pi)), index=pi) expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) # timestamp-based resampling doesn't include all sub-periods # of the last original period, so extend accordingly: - new_index = period_range(start='2000', freq=freq, - periods=period_mult * len(pi)) + new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi)) expected = expected.reindex(new_index) result = s.resample(freq, kind=kind).ohlc() assert_frame_equal(result, expected) - @pytest.mark.parametrize('periods, values', - [([pd.NaT, '1970-01-01 00:00:00', pd.NaT, - '1970-01-01 00:00:02', '1970-01-01 00:00:03'], - [2, 3, 5, 7, 11]), - ([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT, - pd.NaT, pd.NaT, '1970-01-01 00:00:02', - '1970-01-01 00:00:03', pd.NaT, pd.NaT], - [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])]) - @pytest.mark.parametrize('freq, expected_values', - [('1s', [3, np.NaN, 7, 11]), - ('2s', [3, int((7 + 11) / 2)]), - ('3s', [int((3 + 7) / 2), 11])]) + @pytest.mark.parametrize( + "periods, values", + [ + ( + [ + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + ], + [2, 3, 5, 7, 11], + ), + ( + [ + pd.NaT, + pd.NaT, + "1970-01-01 00:00:00", + pd.NaT, + pd.NaT, + pd.NaT, + "1970-01-01 00:00:02", + "1970-01-01 00:00:03", + pd.NaT, + pd.NaT, + ], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13], + ), + ], + ) + @pytest.mark.parametrize( + "freq, expected_values", + [ + ("1s", [3, np.NaN, 7, 11]), + ("2s", [3, int((7 + 11) / 2)]), + ("3s", [int((3 + 7) / 2), 11]), + ], + ) def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 - index = PeriodIndex(periods, freq='S') + index = PeriodIndex(periods, freq="S") frame = DataFrame(values, index=index) - expected_index = period_range('1970-01-01 00:00:00', - periods=len(expected_values), freq=freq) + expected_index = period_range( + "1970-01-01 00:00:00", periods=len(expected_values), freq=freq + ) expected = DataFrame(expected_values, index=expected_index) result = frame.resample(freq).mean() assert_frame_equal(result, expected) def test_resample_with_only_nat(self): # GH 13224 - pi = PeriodIndex([pd.NaT] * 3, freq='S') + pi = PeriodIndex([pd.NaT] * 3, freq="S") frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) expected = DataFrame(index=expected_index) - result = frame.resample('1s').mean() + result = frame.resample("1s").mean() 
assert_frame_equal(result, expected) - @pytest.mark.parametrize('start,end,start_freq,end_freq,base', [ - ('19910905', '19910909 03:00', 'H', '24H', 10), - ('19910905', '19910909 12:00', 'H', '24H', 10), - ('19910905', '19910909 23:00', 'H', '24H', 10), - ('19910905 10:00', '19910909', 'H', '24H', 10), - ('19910905 10:00', '19910909 10:00', 'H', '24H', 10), - ('19910905', '19910909 10:00', 'H', '24H', 10), - ('19910905 12:00', '19910909', 'H', '24H', 10), - ('19910905 12:00', '19910909 03:00', 'H', '24H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '24H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '24H', 34), - ('19910905 12:00', '19910909 12:00', 'H', '17H', 10), - ('19910905 12:00', '19910909 12:00', 'H', '17H', 3), - ('19910905 12:00', '19910909 1:00', 'H', 'M', 3), - ('19910905', '19910913 06:00', '2H', '24H', 10), - ('19910905', '19910905 01:39', 'Min', '5Min', 3), - ('19910905', '19910905 03:18', '2Min', '5Min', 3), - ]) - def test_resample_with_non_zero_base(self, start, end, start_freq, - end_freq, base): + @pytest.mark.parametrize( + "start,end,start_freq,end_freq,base", + [ + ("19910905", "19910909 03:00", "H", "24H", 10), + ("19910905", "19910909 12:00", "H", "24H", 10), + ("19910905", "19910909 23:00", "H", "24H", 10), + ("19910905 10:00", "19910909", "H", "24H", 10), + ("19910905 10:00", "19910909 10:00", "H", "24H", 10), + ("19910905", "19910909 10:00", "H", "24H", 10), + ("19910905 12:00", "19910909", "H", "24H", 10), + ("19910905 12:00", "19910909 03:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 10), + ("19910905 12:00", "19910909 12:00", "H", "24H", 34), + ("19910905 12:00", "19910909 12:00", "H", "17H", 10), + ("19910905 12:00", "19910909 12:00", "H", "17H", 3), + ("19910905 12:00", "19910909 1:00", "H", "M", 3), + ("19910905", "19910913 06:00", "2H", "24H", 10), + ("19910905", "19910905 01:39", "Min", "5Min", 3), + ("19910905", "19910905 03:18", "2Min", "5Min", 3), + ], + ) + def test_resample_with_non_zero_base(self, start, end, start_freq, end_freq, base): # GH 23882 s = pd.Series(0, index=pd.period_range(start, end, freq=start_freq)) s = s + np.arange(len(s)) result = s.resample(end_freq, base=base).mean() result = result.to_timestamp(end_freq) # to_timestamp casts 24H -> D - result = result.asfreq(end_freq) if end_freq == '24H' else result + result = result.asfreq(end_freq) if end_freq == "24H" else result expected = s.to_timestamp().resample(end_freq, base=base).mean() assert_series_equal(result, expected) - @pytest.mark.parametrize('first,last,offset,exp_first,exp_last', [ - ('19910905', '19920406', 'D', '19910905', '19920406'), - ('19910905 00:00', '19920406 06:00', 'D', '19910905', '19920406'), - ('19910905 06:00', '19920406 06:00', 'H', '19910905 06:00', - '19920406 06:00'), - ('19910906', '19920406', 'M', '1991-09', '1992-04'), - ('19910831', '19920430', 'M', '1991-08', '1992-04'), - ('1991-08', '1992-04', 'M', '1991-08', '1992-04'), - ]) - def test_get_period_range_edges(self, first, last, offset, - exp_first, exp_last): + @pytest.mark.parametrize( + "first,last,offset,exp_first,exp_last", + [ + ("19910905", "19920406", "D", "19910905", "19920406"), + ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920406"), + ( + "19910905 06:00", + "19920406 06:00", + "H", + "19910905 06:00", + "19920406 06:00", + ), + ("19910906", "19920406", "M", "1991-09", "1992-04"), + ("19910831", "19920430", "M", "1991-08", "1992-04"), + ("1991-08", "1992-04", "M", "1991-08", "1992-04"), + ], + ) + def test_get_period_range_edges(self, 
first, last, offset, exp_first, exp_last): first = pd.Period(first) last = pd.Period(last) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 6943d30276a21..94bc884d66835 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -10,12 +10,10 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -dti = date_range(start=datetime(2005, 1, 1), - end=datetime(2005, 1, 10), freq='Min') +dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") test_series = Series(np.random.rand(len(dti)), dti) -_test_frame = DataFrame( - {'A': test_series, 'B': test_series, 'C': np.arange(len(dti))}) +_test_frame = DataFrame({"A": test_series, "B": test_series, "C": np.arange(len(dti))}) @pytest.fixture @@ -25,19 +23,21 @@ def test_frame(): def test_str(): - r = test_series.resample('H') - assert ('DatetimeIndexResampler [freq=, axis=0, closed=left, ' - 'label=left, convention=start, base=0]' in str(r)) + r = test_series.resample("H") + assert ( + "DatetimeIndexResampler [freq=, axis=0, closed=left, " + "label=left, convention=start, base=0]" in str(r) + ) def test_api(): - r = test_series.resample('H') + r = test_series.resample("H") result = r.mean() assert isinstance(result, Series) assert len(result) == 217 - r = test_series.to_frame().resample('H') + r = test_series.to_frame().resample("H") result = r.mean() assert isinstance(result, DataFrame) assert len(result) == 217 @@ -48,21 +48,22 @@ def test_groupby_resample_api(): # GH 12448 # .groupby(...).resample(...) hitting warnings # when appropriate - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") # replication step - i = pd.date_range('2016-01-03', periods=8).tolist() + \ - pd.date_range('2016-01-17', periods=8).tolist() - index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], - names=['group', 'date']) - expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]}, - index=index) - result = df.groupby('group').apply( - lambda x: x.resample('1D').ffill())[['val']] + i = ( + pd.date_range("2016-01-03", periods=8).tolist() + + pd.date_range("2016-01-17", periods=8).tolist() + ) + index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"]) + expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index) + result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]] assert_frame_equal(result, expected) @@ -71,13 +72,17 @@ def test_groupby_resample_on_api(): # GH 15021 # .groupby(...).resample(on=...) results in an unexpected # keyword warning. 
- df = DataFrame({'key': ['A', 'B'] * 5, - 'dates': pd.date_range('2016-01-01', periods=10), - 'values': np.random.randn(10)}) + df = DataFrame( + { + "key": ["A", "B"] * 5, + "dates": pd.date_range("2016-01-01", periods=10), + "values": np.random.randn(10), + } + ) - expected = df.set_index('dates').groupby('key').resample('D').mean() + expected = df.set_index("dates").groupby("key").resample("D").mean() - result = df.groupby('key').resample('D', on='dates').mean() + result = df.groupby("key").resample("D", on="dates").mean() assert_frame_equal(result, expected) @@ -85,13 +90,13 @@ def test_pipe(test_frame): # GH17905 # series - r = test_series.resample('H') + r = test_series.resample("H") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_series_equal(result, expected) # dataframe - r = test_frame.resample('H') + r = test_frame.resample("H") expected = r.max() - r.mean() result = r.pipe(lambda x: x.max() - x.mean()) tm.assert_frame_equal(result, expected) @@ -99,25 +104,23 @@ def test_pipe(test_frame): def test_getitem(test_frame): - r = test_frame.resample('H') + r = test_frame.resample("H") tm.assert_index_equal(r._selected_obj.columns, test_frame.columns) - r = test_frame.resample('H')['B'] + r = test_frame.resample("H")["B"] assert r._selected_obj.name == test_frame.columns[1] # technically this is allowed - r = test_frame.resample('H')['A', 'B'] - tm.assert_index_equal(r._selected_obj.columns, - test_frame.columns[[0, 1]]) + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) - r = test_frame.resample('H')['A', 'B'] - tm.assert_index_equal(r._selected_obj.columns, - test_frame.columns[[0, 1]]) + r = test_frame.resample("H")["A", "B"] + tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]]) -@pytest.mark.parametrize('key', [['D'], ['A', 'D']]) +@pytest.mark.parametrize("key", [["D"], ["A", "D"]]) def test_select_bad_cols(key, test_frame): - g = test_frame.resample('H') + g = test_frame.resample("H") # 'A' should not be referenced as a bad column... # will have to rethink regex if you change message! 
msg = r"^\"Columns not found: 'D'\"$" @@ -127,18 +130,18 @@ def test_select_bad_cols(key, test_frame): def test_attribute_access(test_frame): - r = test_frame.resample('H') - tm.assert_series_equal(r.A.sum(), r['A'].sum()) + r = test_frame.resample("H") + tm.assert_series_equal(r.A.sum(), r["A"].sum()) def test_api_compat_before_use(): # make sure that we are setting the binner # on these attributes - for attr in ['groups', 'ngroups', 'indices']: - rng = pd.date_range('1/1/2012', periods=100, freq='S') + for attr in ["groups", "ngroups", "indices"]: + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = Series(np.arange(len(rng)), index=rng) - rs = ts.resample('30s') + rs = ts.resample("30s") # before use getattr(rs, attr) @@ -151,13 +154,13 @@ def test_api_compat_before_use(): def tests_skip_nuisance(test_frame): df = test_frame - df['D'] = 'foo' - r = df.resample('H') - result = r[['A', 'B']].sum() + df["D"] = "foo" + r = df.resample("H") + result = r[["A", "B"]].sum() expected = pd.concat([r.A.sum(), r.B.sum()], axis=1) assert_frame_equal(result, expected) - expected = r[['A', 'B', 'C']].sum() + expected = r[["A", "B", "C"]].sum() result = r.sum() assert_frame_equal(result, expected) @@ -165,13 +168,13 @@ def tests_skip_nuisance(test_frame): def test_downsample_but_actually_upsampling(): # this is reindex / asfreq - rng = pd.date_range('1/1/2012', periods=100, freq='S') - ts = Series(np.arange(len(rng), dtype='int64'), index=rng) - result = ts.resample('20s').asfreq() - expected = Series([0, 20, 40, 60, 80], - index=pd.date_range('2012-01-01 00:00:00', - freq='20s', - periods=5)) + rng = pd.date_range("1/1/2012", periods=100, freq="S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + result = ts.resample("20s").asfreq() + expected = Series( + [0, 20, 40, 60, 80], + index=pd.date_range("2012-01-01 00:00:00", freq="20s", periods=5), + ) assert_series_equal(result, expected) @@ -181,43 +184,43 @@ def test_combined_up_downsampling_of_irregular(): # ts2.resample('2s').mean().ffill() # preserve these semantics - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = Series(np.arange(len(rng)), index=rng) ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = ts2.resample('2s', how='mean', fill_method='ffill') - expected = ts2.resample('2s').mean().ffill() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = ts2.resample("2s", how="mean", fill_method="ffill") + expected = ts2.resample("2s").mean().ffill() assert_series_equal(result, expected) def test_transform(): - r = test_series.resample('20min') - expected = test_series.groupby( - pd.Grouper(freq='20min')).transform('mean') - result = r.transform('mean') + r = test_series.resample("20min") + expected = test_series.groupby(pd.Grouper(freq="20min")).transform("mean") + result = r.transform("mean") assert_series_equal(result, expected) def test_fillna(): # need to upsample here - rng = pd.date_range('1/1/2012', periods=10, freq='2S') - ts = Series(np.arange(len(rng), dtype='int64'), index=rng) - r = ts.resample('s') + rng = pd.date_range("1/1/2012", periods=10, freq="2S") + ts = Series(np.arange(len(rng), dtype="int64"), index=rng) + r = ts.resample("s") expected = r.ffill() - result = r.fillna(method='ffill') + result = r.fillna(method="ffill") assert_series_equal(result, expected) expected = r.bfill() - result = r.fillna(method='bfill') + 
result = r.fillna(method="bfill") assert_series_equal(result, expected) - msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got 0") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. Got 0" + ) with pytest.raises(ValueError, match=msg): r.fillna(0) @@ -225,8 +228,8 @@ def test_fillna(): def test_apply_without_aggregation(): # both resample and groupby should work w/o aggregation - r = test_series.resample('20min') - g = test_series.groupby(pd.Grouper(freq='20min')) + r = test_series.resample("20min") + g = test_series.groupby(pd.Grouper(freq="20min")) for t in [g, r]: result = t.apply(lambda x: x) @@ -237,18 +240,20 @@ def test_agg_consistency(): # make sure that we are consistent across # similar aggregations with and w/o selection list - df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) + df = DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) - r = df.resample('3T') + r = df.resample("3T") - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = r[["A", "B", "C"]].agg({"r1": "mean", "r2": "sum"}) + result = r.agg({"r1": "mean", "r2": "sum"}) assert_frame_equal(result, expected, check_like=True) + # TODO: once GH 14008 is fixed, move these tests into # `Base` test class @@ -257,184 +262,167 @@ def test_agg(): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) - r = df.resample('2D') + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] - a_mean = r['A'].mean() - a_std = r['A'].std() - a_sum = r['A'].sum() - b_mean = r['B'].mean() - b_std = r['B'].std() - b_sum = r['B'].sum() + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([['A', 'B'], - ['mean', 'std']]) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) for t in cases: - result = t.aggregate({'A': np.mean, - 'B': np.std}) + result = t.aggregate({"A": np.mean, "B": np.std}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std], axis=1) - expected.columns = 
pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'std')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) for t in cases: - result = t.aggregate({'A': ['mean', 'std']}) + result = t.aggregate({"A": ["mean", "std"]}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = ['mean', 'sum'] + expected.columns = ["mean", "sum"] for t in cases: - result = t['A'].aggregate(['mean', 'sum']) + result = t["A"].aggregate(["mean", "sum"]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.aggregate({"A": {"mean": "mean", "sum": "sum"}}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum'), - ('B', 'mean2'), - ('B', 'sum2')]) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] + ) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, - 'B': {'mean2': 'mean', 'sum2': 'sum'}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) for t in cases: - result = t.aggregate({'A': ['mean', 'std'], - 'B': ['mean', 'std']}) + result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'), - ('r1', 'A', 'sum'), - ('r2', 'B', 'mean'), - ('r2', 'B', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("r1", "A", "mean"), + ("r1", "A", "sum"), + ("r2", "B", "mean"), + ("r2", "B", "sum"), + ] + ) def test_agg_misc(): # test with all three Resampler apis and TimeGrouper np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) - r = df.resample('2D') + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", 
on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] # passed lambda for t in cases: - result = t.agg({'A': np.sum, - 'B': lambda x: np.std(x, ddof=1)}) - rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) - expected = pd.concat([r['A'].sum(), rcustom], axis=1) + result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = t["B"].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([r["A"].sum(), rcustom], axis=1) assert_frame_equal(result, expected, check_like=True) # agg with renamers - expected = pd.concat([t['A'].sum(), - t['B'].sum(), - t['A'].mean(), - t['B'].mean()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'), - ('result1', 'B'), - ('result2', 'A'), - ('result2', 'B')]) + expected = pd.concat( + [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")] + ) for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), - ('result2', np.mean)])) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t[["A", "B"]].agg( + OrderedDict([("result1", np.sum), ("result2", np.mean)]) + ) assert_frame_equal(result, expected, check_like=True) # agg with different hows - expected = pd.concat([t['A'].sum(), - t['A'].std(), - t['B'].mean(), - t['B'].std()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) + expected = pd.concat( + [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) for t in cases: - result = t.agg(OrderedDict([('A', ['sum', 'std']), - ('B', ['mean', 'std'])])) + result = t.agg(OrderedDict([("A", ["sum", "std"]), ("B", ["mean", "std"])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not for t in cases: - result = t[['A', 'B']].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) # series like aggs for t in cases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t['A'].agg({'A': ['sum', 'std']}) - expected = pd.concat([t['A'].sum(), - t['A'].std()], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std')]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t["A"].agg({"A": ["sum", "std"]}) + expected = pd.concat([t["A"].sum(), t["A"].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std")]) assert_frame_equal(result, expected, check_like=True) - expected = pd.concat([t['A'].agg(['sum', 'std']), - t['A'].agg(['mean', 'std'])], - axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), - ('A', 'std'), - ('B', 'mean'), - ('B', 'std')]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t['A'].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + expected = pd.concat( + [t["A"].agg(["sum", "std"]), t["A"].agg(["mean", "std"])], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] + ) + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) assert_frame_equal(result, expected, check_like=True) # errors @@ -442,133 +430,138 @@ def test_agg_misc(): msg = "\"Column 'B' does not exist!\"" for t in cases: with pytest.raises(KeyError, match=msg): - t[['A']].agg({'A': ['sum', 'std'], - 'B': ['mean', 'std']}) + t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]}) def test_agg_nested_dicts(): np.random.seed(1234) - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') - index.name = 'date' - df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index) + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") + index.name = "date" + df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index) df_col = df.reset_index() df_mult = df_col.copy() - df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], - names=['index', 'date']) - r = df.resample('2D') + df_mult.index = pd.MultiIndex.from_arrays( + [range(10), df.index], names=["index", "date"] + ) + r = df.resample("2D") cases = [ r, - df_col.resample('2D', on='date'), - df_mult.resample('2D', level='date'), - df.groupby(pd.Grouper(freq='2D')) + df_col.resample("2D", on="date"), + df_mult.resample("2D", level="date"), + df.groupby(pd.Grouper(freq="2D")), ] msg = r"cannot perform renaming for r(1|2) with a nested dictionary" for t in cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): - t.aggregate({'r1': {'A': ['mean', 'sum']}, - 'r2': {'B': ['mean', 'sum']}}) + t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) for t in cases: - expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), - t['B'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + expected = pd.concat( + [t["A"].mean(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t[["A", "B"]].agg( + {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}} + ) assert_frame_equal(result, expected, check_like=True) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = t.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) assert_frame_equal(result, expected, check_like=True) def test_try_aggregate_non_existing_column(): # GH 16766 data = [ - {'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0}, - {'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0}, - {'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5} + {"dt": datetime(2017, 6, 1, 0), "x": 1.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 1), "x": 2.0, "y": 2.0}, + {"dt": datetime(2017, 6, 1, 2), "x": 3.0, "y": 1.5}, ] - df = DataFrame(data).set_index('dt') + df = DataFrame(data).set_index("dt") # Error as we don't have 'z' column msg = "\"Column 'z' does not exist!\"" with pytest.raises(KeyError, match=msg): - df.resample('30T').agg({'x': ['mean'], - 'y': ['median'], - 'z': ['sum']}) 
+ df.resample("30T").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]}) def test_selection_api_validation(): # GH 13500 - index = date_range(datetime(2005, 1, 1), - datetime(2005, 1, 10), freq='D') + index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D") rng = np.arange(len(index), dtype=np.int64) - df = DataFrame({'date': index, 'a': rng}, - index=pd.MultiIndex.from_arrays([rng, index], - names=['v', 'd'])) - df_exp = DataFrame({'a': rng}, index=index) + df = DataFrame( + {"date": index, "a": rng}, + index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), + ) + df_exp = DataFrame({"a": rng}, index=index) # non DatetimeIndex - msg = ("Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Int64Index'") + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," + " but got an instance of 'Int64Index'" + ) with pytest.raises(TypeError, match=msg): - df.resample('2D', level='v') + df.resample("2D", level="v") msg = "The Grouper cannot specify both a key and a level!" with pytest.raises(ValueError, match=msg): - df.resample('2D', on='date', level='d') + df.resample("2D", on="date", level="d") msg = "unhashable type: 'list'" with pytest.raises(TypeError, match=msg): - df.resample('2D', on=['a', 'date']) + df.resample("2D", on=["a", "date"]) msg = r"\"Level \['a', 'date'\] not found\"" with pytest.raises(KeyError, match=msg): - df.resample('2D', level=['a', 'date']) + df.resample("2D", level=["a", "date"]) # upsampling not allowed - msg = ("Upsampling from level= or on= selection is not supported, use" - r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like") + msg = ( + "Upsampling from level= or on= selection is not supported, use" + r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like" + ) with pytest.raises(ValueError, match=msg): - df.resample('2D', level='d').asfreq() + df.resample("2D", level="d").asfreq() with pytest.raises(ValueError, match=msg): - df.resample('2D', on='date').asfreq() + df.resample("2D", on="date").asfreq() - exp = df_exp.resample('2D').sum() - exp.index.name = 'date' - assert_frame_equal(exp, df.resample('2D', on='date').sum()) + exp = df_exp.resample("2D").sum() + exp.index.name = "date" + assert_frame_equal(exp, df.resample("2D", on="date").sum()) - exp.index.name = 'd' - assert_frame_equal(exp, df.resample('2D', level='d').sum()) + exp.index.name = "d" + assert_frame_equal(exp, df.resample("2D", level="d").sum()) -@pytest.mark.parametrize('col_name', ['t2', 't2x', 't2q', 'T_2M', - 't2p', 't2m', 't2m1', 'T2M']) +@pytest.mark.parametrize( + "col_name", ["t2", "t2x", "t2q", "T_2M", "t2p", "t2m", "t2m1", "T2M"] +) def test_agg_with_datetime_index_list_agg_func(col_name): # GH 22660 # The parametrized column names would get converted to dates by our # date parser. Some would result in OutOfBoundsError (ValueError) while # others would result in OverflowError when passed into Timestamp. # We catch these errors and move on to the correct branch. 
- df = pd.DataFrame(list(range(200)), - index=pd.date_range(start='2017-01-01', freq='15min', - periods=200, tz='Europe/Berlin'), - columns=[col_name]) - result = df.resample('1d').aggregate(['mean']) - expected = pd.DataFrame([47.5, 143.5, 195.5], - index=pd.date_range(start='2017-01-01', freq='D', - periods=3, tz='Europe/Berlin'), - columns=pd.MultiIndex(levels=[[col_name], - ['mean']], - codes=[[0], [0]])) + df = pd.DataFrame( + list(range(200)), + index=pd.date_range( + start="2017-01-01", freq="15min", periods=200, tz="Europe/Berlin" + ), + columns=[col_name], + ) + result = df.resample("1d").aggregate(["mean"]) + expected = pd.DataFrame( + [47.5, 143.5, 195.5], + index=pd.date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 959b6febcf1c9..9053a7ebfea2b 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -8,71 +8,84 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -test_frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}, - index=date_range('1/1/2000', - freq='s', - periods=40)) +test_frame = DataFrame( + {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, + index=date_range("1/1/2000", freq="s", periods=40), +) def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter - code = dedent("""\ + + code = dedent( + """\ import pandas.util.testing as tm s = tm.makeTimeSeries() rs = s.resample("D") - """) + """ + ) ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('rs.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("rs.", 1)) def test_deferred_with_groupby(): # GH 12486 # support deferred resample ops with groupby - data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3], - ['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7], - ['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5], - ['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1], - ['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]] - - df = DataFrame(data, columns=['date', 'id', 'score']) + data = [ + ["2010-01-01", "A", 2], + ["2010-01-02", "A", 3], + ["2010-01-05", "A", 8], + ["2010-01-10", "A", 7], + ["2010-01-13", "A", 3], + ["2010-01-01", "B", 5], + ["2010-01-03", "B", 2], + ["2010-01-04", "B", 1], + ["2010-01-11", "B", 7], + ["2010-01-14", "B", 3], + ] + + df = DataFrame(data, columns=["date", "id", "score"]) df.date = pd.to_datetime(df.date) def f(x): - return x.set_index('date').resample('D').asfreq() - expected = df.groupby('id').apply(f) - result = df.set_index('date').groupby('id').resample('D').asfreq() + return x.set_index("date").resample("D").asfreq() + + expected = df.groupby("id").apply(f) + result = df.set_index("date").groupby("id").resample("D").asfreq() assert_frame_equal(result, expected) - df = DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") def f(x): - return x.resample('1D').ffill() - expected = df.groupby('group').apply(f) 
- result = df.groupby('group').resample('1D').ffill() + return x.resample("1D").ffill() + + expected = df.groupby("group").apply(f) + result = df.groupby("group").resample("1D").ffill() assert_frame_equal(result, expected) def test_getitem(): - g = test_frame.groupby('A') + g = test_frame.groupby("A") - expected = g.B.apply(lambda x: x.resample('2s').mean()) + expected = g.B.apply(lambda x: x.resample("2s").mean()) - result = g.resample('2s').B.mean() + result = g.resample("2s").B.mean() assert_series_equal(result, expected) - result = g.B.resample('2s').mean() + result = g.B.resample("2s").mean() assert_series_equal(result, expected) - result = g.resample('2s').mean().B + result = g.resample("2s").mean().B assert_series_equal(result, expected) @@ -80,29 +93,31 @@ def test_getitem_multiple(): # GH 13174 # multiple calls after selection causing an issue with aliasing - data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}] - df = DataFrame(data, index=pd.date_range('2016-01-01', periods=2)) - r = df.groupby('id').resample('1D') - result = r['buyer'].count() - expected = Series([1, 1], - index=pd.MultiIndex.from_tuples( - [(1, Timestamp('2016-01-01')), - (2, Timestamp('2016-01-02'))], - names=['id', None]), - name='buyer') + data = [{"id": 1, "buyer": "A"}, {"id": 2, "buyer": "B"}] + df = DataFrame(data, index=pd.date_range("2016-01-01", periods=2)) + r = df.groupby("id").resample("1D") + result = r["buyer"].count() + expected = Series( + [1, 1], + index=pd.MultiIndex.from_tuples( + [(1, Timestamp("2016-01-01")), (2, Timestamp("2016-01-02"))], + names=["id", None], + ), + name="buyer", + ) assert_series_equal(result, expected) - result = r['buyer'].count() + result = r["buyer"].count() assert_series_equal(result, expected) def test_groupby_resample_on_api_with_getitem(): # GH 17813 - df = pd.DataFrame({'id': list('aabbb'), - 'date': pd.date_range('1-1-2016', periods=5), - 'data': 1}) - exp = df.set_index('date').groupby('id').resample('2D')['data'].sum() - result = df.groupby('id').resample('2D', on='date')['data'].sum() + df = pd.DataFrame( + {"id": list("aabbb"), "date": pd.date_range("1-1-2016", periods=5), "data": 1} + ) + exp = df.set_index("date").groupby("id").resample("2D")["data"].sum() + result = df.groupby("id").resample("2D", on="date")["data"].sum() assert_series_equal(result, exp) @@ -110,78 +125,84 @@ def test_nearest(): # GH 17496 # Resample nearest - index = pd.date_range('1/1/2000', periods=3, freq='T') - result = Series(range(3), index=index).resample('20s').nearest() + index = pd.date_range("1/1/2000", periods=3, freq="T") + result = Series(range(3), index=index).resample("20s").nearest() expected = Series( [0, 0, 1, 1, 1, 2, 2], index=pd.DatetimeIndex( - ['2000-01-01 00:00:00', '2000-01-01 00:00:20', - '2000-01-01 00:00:40', '2000-01-01 00:01:00', - '2000-01-01 00:01:20', '2000-01-01 00:01:40', - '2000-01-01 00:02:00'], - dtype='datetime64[ns]', - freq='20S')) + [ + "2000-01-01 00:00:00", + "2000-01-01 00:00:20", + "2000-01-01 00:00:40", + "2000-01-01 00:01:00", + "2000-01-01 00:01:20", + "2000-01-01 00:01:40", + "2000-01-01 00:02:00", + ], + dtype="datetime64[ns]", + freq="20S", + ), + ) assert_series_equal(result, expected) def test_methods(): - g = test_frame.groupby('A') - r = g.resample('2s') + g = test_frame.groupby("A") + r = g.resample("2s") - for f in ['first', 'last', 'median', 'sem', 'sum', 'mean', - 'min', 'max']: + for f in ["first", "last", "median", "sem", "sum", "mean", "min", "max"]: result = getattr(r, f)() - expected = g.apply(lambda x: 
getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) - for f in ['size']: + for f in ["size"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_series_equal(result, expected) - for f in ['count']: + for f in ["count"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) # series only - for f in ['nunique']: + for f in ["nunique"]: result = getattr(r.B, f)() - expected = g.B.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.B.apply(lambda x: getattr(x.resample("2s"), f)()) assert_series_equal(result, expected) - for f in ['nearest', 'backfill', 'ffill', 'asfreq']: + for f in ["nearest", "backfill", "ffill", "asfreq"]: result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.resample('2s'), f)()) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)()) assert_frame_equal(result, expected) result = r.ohlc() - expected = g.apply(lambda x: x.resample('2s').ohlc()) + expected = g.apply(lambda x: x.resample("2s").ohlc()) assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.resample('2s'), f)(ddof=1)) + expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1)) assert_frame_equal(result, expected) def test_apply(): - g = test_frame.groupby('A') - r = g.resample('2s') + g = test_frame.groupby("A") + r = g.resample("2s") # reduction - expected = g.resample('2s').sum() + expected = g.resample("2s").sum() def f(x): - return x.resample('2s').sum() + return x.resample("2s").sum() result = r.apply(f) assert_frame_equal(result, expected) def f(x): - return x.resample('2s').apply(lambda y: y.sum()) + return x.resample("2s").apply(lambda y: y.sum()) result = g.apply(f) assert_frame_equal(result, expected) @@ -189,41 +210,40 @@ def f(x): def test_apply_with_mutated_index(): # GH 15169 - index = pd.date_range('1-1-2015', '12-31-15', freq='D') - df = DataFrame(data={'col1': np.random.rand(len(index))}, index=index) + index = pd.date_range("1-1-2015", "12-31-15", freq="D") + df = DataFrame(data={"col1": np.random.rand(len(index))}, index=index) def f(x): - s = Series([1, 2], index=['a', 'b']) + s = Series([1, 2], index=["a", "b"]) return s - expected = df.groupby(pd.Grouper(freq='M')).apply(f) + expected = df.groupby(pd.Grouper(freq="M")).apply(f) - result = df.resample('M').apply(f) + result = df.resample("M").apply(f) assert_frame_equal(result, expected) # A case for series - expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f) - result = df['col1'].resample('M').apply(f) + expected = df["col1"].groupby(pd.Grouper(freq="M")).apply(f) + result = df["col1"].resample("M").apply(f) assert_series_equal(result, expected) def test_resample_groupby_with_label(): # GH 13235 - index = date_range('2000-01-01', freq='2D', periods=5) - df = DataFrame(index=index, - data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]} - ) - result = df.groupby('col0').resample('1W', label='left').sum() - - mi = [np.array([0, 0, 1, 2]), - pd.to_datetime(np.array(['1999-12-26', '2000-01-02', - '2000-01-02', '2000-01-02']) - ) - ] - mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None]) - expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': [1, 1, 2, 1]}, - 
index=mindex - ) + index = date_range("2000-01-01", freq="2D", periods=5) + df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]}) + result = df.groupby("col0").resample("1W", label="left").sum() + + mi = [ + np.array([0, 0, 1, 2]), + pd.to_datetime( + np.array(["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"]) + ), + ] + mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None]) + expected = DataFrame( + data={"col0": [0, 0, 2, 2], "col1": [1, 1, 2, 1]}, index=mindex + ) assert_frame_equal(result, expected) @@ -232,12 +252,12 @@ def test_consistency_with_window(): # consistent return values with window df = test_frame - expected = pd.Int64Index([1, 2, 3], name='A') - result = df.groupby('A').resample('2s').mean() + expected = pd.Int64Index([1, 2, 3], name="A") + result = df.groupby("A").resample("2s").mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) - result = df.groupby('A').rolling(20).mean() + result = df.groupby("A").rolling(20).mean() assert result.index.nlevels == 2 tm.assert_index_equal(result.index.levels[0], expected) @@ -245,12 +265,14 @@ def test_consistency_with_window(): def test_median_duplicate_columns(): # GH 14233 - df = DataFrame(np.random.randn(20, 3), - columns=list('aaa'), - index=pd.date_range('2012-01-01', periods=20, freq='s')) + df = DataFrame( + np.random.randn(20, 3), + columns=list("aaa"), + index=pd.date_range("2012-01-01", periods=20, freq="s"), + ) df2 = df.copy() - df2.columns = ['a', 'b', 'c'] - expected = df2.resample('5s').median() - result = df.resample('5s').median() + df2.columns = ["a", "b", "c"] + expected = df2.resample("5s").median() + result = df.resample("5s").median() expected.columns = result.columns assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 3f767f8e7100f..648d78d92e7d4 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -11,12 +11,11 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -test_series = Series(np.random.randn(1000), - index=date_range('1/1/2000', periods=1000)) +test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) def test_apply(): - grouper = Grouper(freq='A', label='right', closed='right') + grouper = Grouper(freq="A", label="right", closed="right") grouped = test_series.groupby(grouper) @@ -36,18 +35,18 @@ def test_count(): expected = test_series.groupby(lambda x: x.year).count() - grouper = Grouper(freq='A', label='right', closed='right') + grouper = Grouper(freq="A", label="right", closed="right") result = test_series.groupby(grouper).count() expected.index = result.index assert_series_equal(result, expected) - result = test_series.resample('A').count() + result = test_series.resample("A").count() expected.index = result.index assert_series_equal(result, expected) def test_numpy_reduction(): - result = test_series.resample('A', closed='right').prod() + result = test_series.resample("A", closed="right").prod() expected = test_series.groupby(lambda x: x.year).agg(np.prod) expected.index = result.index @@ -59,8 +58,8 @@ def test_apply_iteration(): # #2300 N = 1000 ind = pd.date_range(start="2000-01-01", freq="D", periods=N) - df = DataFrame({'open': 1, 'close': 2}, index=ind) - tg = Grouper(freq='M') + df = DataFrame({"open": 1, "close": 2}, index=ind) + tg = Grouper(freq="M") _, grouper, _ = 
tg._get_grouper(df) @@ -68,28 +67,33 @@ def test_apply_iteration(): grouped = df.groupby(grouper, group_keys=False) def f(df): - return df['close'] / df['open'] + return df["close"] / df["open"] # it works! result = grouped.apply(f) tm.assert_index_equal(result.index, df.index) -@pytest.mark.parametrize('name, func', [ - ('Int64Index', tm.makeIntIndex), - ('Index', tm.makeUnicodeIndex), - ('Float64Index', tm.makeFloatIndex), - ('MultiIndex', lambda m: tm.makeCustomIndex(m, 2)) -]) +@pytest.mark.parametrize( + "name, func", + [ + ("Int64Index", tm.makeIntIndex), + ("Index", tm.makeUnicodeIndex), + ("Float64Index", tm.makeFloatIndex), + ("MultiIndex", lambda m: tm.makeCustomIndex(m, 2)), + ], +) def test_fails_on_no_datetime_index(name, func): n = 2 index = func(n) - df = DataFrame({'a': np.random.randn(n)}, index=index) + df = DataFrame({"a": np.random.randn(n)}, index=index) - msg = ("Only valid with DatetimeIndex, TimedeltaIndex " - "or PeriodIndex, but got an instance of '{}'".format(name)) + msg = ( + "Only valid with DatetimeIndex, TimedeltaIndex " + "or PeriodIndex, but got an instance of '{}'".format(name) + ) with pytest.raises(TypeError, match=msg): - df.groupby(Grouper(freq='D')) + df.groupby(Grouper(freq="D")) def test_aaa_group_order(): @@ -97,46 +101,48 @@ def test_aaa_group_order(): # check TimeGrouper perform stable sorts n = 20 data = np.random.randn(n, 4) - df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), - datetime(2013, 1, 3), datetime(2013, 1, 4), - datetime(2013, 1, 5)] * 4 - grouped = df.groupby(Grouper(key='key', freq='D')) - - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), - df[::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), - df[1::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), - df[2::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), - df[3::5]) - tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), - df[4::5]) + df = DataFrame(data, columns=["A", "B", "C", "D"]) + df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + grouped = df.groupby(Grouper(key="key", freq="D")) + + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), df[1::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), df[2::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), df[3::5]) + tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5]) def test_aggregate_normal(resample_method): """Check TimeGrouper's aggregation is identical as normal groupby.""" - if resample_method == 'ohlc': - pytest.xfail(reason='DataError: No numeric types to aggregate') + if resample_method == "ohlc": + pytest.xfail(reason="DataError: No numeric types to aggregate") data = np.random.randn(20, 4) - normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - normal_df['key'] = [1, 2, 3, 4, 5] * 4 + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, 3, 4, 5] * 4 - dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), - datetime(2013, 1, 3), datetime(2013, 1, 4), - datetime(2013, 1, 5)] * 4 + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + datetime(2013, 1, 3), + datetime(2013, 1, 4), + 
datetime(2013, 1, 5), + ] * 4 - normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) expected = getattr(normal_grouped, resample_method)() dt_result = getattr(dt_grouped, resample_method)() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") tm.assert_equal(expected, dt_result) # if TimeGrouper is used included, 'nth' doesn't work yet @@ -151,74 +157,81 @@ def test_aggregate_normal(resample_method): """ -@pytest.mark.parametrize('method, method_args, unit', [ - ('sum', dict(), 0), - ('sum', dict(min_count=0), 0), - ('sum', dict(min_count=1), np.nan), - ('prod', dict(), 1), - ('prod', dict(min_count=0), 1), - ('prod', dict(min_count=1), np.nan) -]) +@pytest.mark.parametrize( + "method, method_args, unit", + [ + ("sum", dict(), 0), + ("sum", dict(min_count=0), 0), + ("sum", dict(min_count=1), np.nan), + ("prod", dict(), 1), + ("prod", dict(min_count=0), 1), + ("prod", dict(min_count=1), np.nan), + ], +) def test_resample_entirly_nat_window(method, method_args, unit): - s = pd.Series([0] * 2 + [np.nan] * 2, - index=pd.date_range('2017', periods=4)) + s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range("2017", periods=4)) result = methodcaller(method, **method_args)(s.resample("2d")) - expected = pd.Series([0.0, unit], - index=pd.to_datetime(['2017-01-01', - '2017-01-03'])) + expected = pd.Series( + [0.0, unit], index=pd.to_datetime(["2017-01-01", "2017-01-03"]) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('func, fill_value', [ - ('min', np.nan), - ('max', np.nan), - ('sum', 0), - ('prod', 1), - ('count', 0), -]) +@pytest.mark.parametrize( + "func, fill_value", + [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)], +) def test_aggregate_with_nat(func, fill_value): # check TimeGrouper's aggregation is identical as normal groupby # if NaT is included, 'var', 'std', 'mean', 'first','last' # and 'nth' doesn't work yet n = 20 - data = np.random.randn(n, 4).astype('int64') - normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - normal_df['key'] = [1, 2, np.nan, 4, 5] * 4 - - dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, - datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - - normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) + data = np.random.randn(n, 4).astype("int64") + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() - pad = DataFrame([[fill_value] * 4], index=[3], - columns=['A', 'B', 'C', 'D']) + pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"]) expected = normal_result.append(pad) expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") 
assert_frame_equal(expected, dt_result) - assert dt_result.index.name == 'key' + assert dt_result.index.name == "key" def test_aggregate_with_nat_size(): # GH 9925 n = 20 - data = np.random.randn(n, 4).astype('int64') - normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - normal_df['key'] = [1, 2, np.nan, 4, 5] * 4 - - dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) - dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, - datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - - normal_grouped = normal_df.groupby('key') - dt_grouped = dt_df.groupby(Grouper(key='key', freq='D')) + data = np.random.randn(n, 4).astype("int64") + normal_df = DataFrame(data, columns=["A", "B", "C", "D"]) + normal_df["key"] = [1, 2, np.nan, 4, 5] * 4 + + dt_df = DataFrame(data, columns=["A", "B", "C", "D"]) + dt_df["key"] = [ + datetime(2013, 1, 1), + datetime(2013, 1, 2), + pd.NaT, + datetime(2013, 1, 4), + datetime(2013, 1, 5), + ] * 4 + + normal_grouped = normal_df.groupby("key") + dt_grouped = dt_df.groupby(Grouper(key="key", freq="D")) normal_result = normal_grouped.size() dt_result = dt_grouped.size() @@ -226,37 +239,41 @@ def test_aggregate_with_nat_size(): pad = Series([0], index=[3]) expected = normal_result.append(pad) expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') + expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key") assert_series_equal(expected, dt_result) - assert dt_result.index.name == 'key' + assert dt_result.index.name == "key" def test_repr(): # GH18203 - result = repr(Grouper(key='A', freq='H')) - expected = ("TimeGrouper(key='A', freq=, axis=0, sort=True, " - "closed='left', label='left', how='mean', " - "convention='e', base=0)") + result = repr(Grouper(key="A", freq="H")) + expected = ( + "TimeGrouper(key='A', freq=, axis=0, sort=True, " + "closed='left', label='left', how='mean', " + "convention='e', base=0)" + ) assert result == expected -@pytest.mark.parametrize('method, method_args, expected_values', [ - ('sum', dict(), [1, 0, 1]), - ('sum', dict(min_count=0), [1, 0, 1]), - ('sum', dict(min_count=1), [1, np.nan, 1]), - ('sum', dict(min_count=2), [np.nan, np.nan, np.nan]), - ('prod', dict(), [1, 1, 1]), - ('prod', dict(min_count=0), [1, 1, 1]), - ('prod', dict(min_count=1), [1, np.nan, 1]), - ('prod', dict(min_count=2), [np.nan, np.nan, np.nan]), -]) +@pytest.mark.parametrize( + "method, method_args, expected_values", + [ + ("sum", dict(), [1, 0, 1]), + ("sum", dict(min_count=0), [1, 0, 1]), + ("sum", dict(min_count=1), [1, np.nan, 1]), + ("sum", dict(min_count=2), [np.nan, np.nan, np.nan]), + ("prod", dict(), [1, 1, 1]), + ("prod", dict(min_count=0), [1, 1, 1]), + ("prod", dict(min_count=1), [1, np.nan, 1]), + ("prod", dict(min_count=2), [np.nan, np.nan, np.nan]), + ], +) def test_upsample_sum(method, method_args, expected_values): s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H")) resampled = s.resample("30T") - index = pd.to_datetime(['2017-01-01T00:00:00', - '2017-01-01T00:30:00', - '2017-01-01T01:00:00']) + index = pd.to_datetime( + ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"] + ) result = methodcaller(method, **method_args)(resampled) expected = pd.Series(expected_values, index=index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 3498d30d11689..9ce419d5cd20a 100644 --- a/pandas/tests/resample/test_timedelta.py +++ 
b/pandas/tests/resample/test_timedelta.py @@ -10,72 +10,72 @@ def test_asfreq_bug(): - df = DataFrame(data=[1, 3], - index=[timedelta(), timedelta(minutes=3)]) - result = df.resample('1T').asfreq() - expected = DataFrame(data=[1, np.nan, np.nan, 3], - index=timedelta_range('0 day', - periods=4, - freq='1T')) + df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)]) + result = df.resample("1T").asfreq() + expected = DataFrame( + data=[1, np.nan, np.nan, 3], + index=timedelta_range("0 day", periods=4, freq="1T"), + ) assert_frame_equal(result, expected) def test_resample_with_nat(): # GH 13223 - index = pd.to_timedelta(['0s', pd.NaT, '2s']) - result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean() - expected = DataFrame({'value': [2.5, np.nan, 5.0]}, - index=timedelta_range('0 day', - periods=3, - freq='1S')) + index = pd.to_timedelta(["0s", pd.NaT, "2s"]) + result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean() + expected = DataFrame( + {"value": [2.5, np.nan, 5.0]}, + index=timedelta_range("0 day", periods=3, freq="1S"), + ) assert_frame_equal(result, expected) def test_resample_as_freq_with_subperiod(): # GH 13022 - index = timedelta_range('00:00:00', '00:10:00', freq='5T') - df = DataFrame(data={'value': [1, 5, 10]}, index=index) - result = df.resample('2T').asfreq() - expected_data = {'value': [1, np.nan, np.nan, np.nan, np.nan, 10]} - expected = DataFrame(data=expected_data, - index=timedelta_range('00:00:00', - '00:10:00', freq='2T')) + index = timedelta_range("00:00:00", "00:10:00", freq="5T") + df = DataFrame(data={"value": [1, 5, 10]}, index=index) + result = df.resample("2T").asfreq() + expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]} + expected = DataFrame( + data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2T") + ) tm.assert_frame_equal(result, expected) def test_resample_with_timedeltas(): - expected = DataFrame({'A': np.arange(1480)}) + expected = DataFrame({"A": np.arange(1480)}) expected = expected.groupby(expected.index // 30).sum() - expected.index = pd.timedelta_range('0 days', freq='30T', periods=50) + expected.index = pd.timedelta_range("0 days", freq="30T", periods=50) - df = DataFrame({'A': np.arange(1480)}, index=pd.to_timedelta( - np.arange(1480), unit='T')) - result = df.resample('30T').sum() + df = DataFrame( + {"A": np.arange(1480)}, index=pd.to_timedelta(np.arange(1480), unit="T") + ) + result = df.resample("30T").sum() assert_frame_equal(result, expected) - s = df['A'] - result = s.resample('30T').sum() - assert_series_equal(result, expected['A']) + s = df["A"] + result = s.resample("30T").sum() + assert_series_equal(result, expected["A"]) def test_resample_single_period_timedelta(): - s = Series(list(range(5)), index=pd.timedelta_range( - '1 day', freq='s', periods=5)) - result = s.resample('2s').sum() - expected = Series([1, 5, 4], index=pd.timedelta_range( - '1 day', freq='2s', periods=3)) + s = Series(list(range(5)), index=pd.timedelta_range("1 day", freq="s", periods=5)) + result = s.resample("2s").sum() + expected = Series( + [1, 5, 4], index=pd.timedelta_range("1 day", freq="2s", periods=3) + ) assert_series_equal(result, expected) def test_resample_timedelta_idempotency(): # GH 12072 - index = pd.timedelta_range('0', periods=9, freq='10L') + index = pd.timedelta_range("0", periods=9, freq="10L") series = Series(range(9), index=index) - result = series.resample('10L').mean() + result = series.resample("10L").mean() expected = series assert_series_equal(result, expected) 
@@ -83,14 +83,14 @@ def test_resample_timedelta_idempotency(): def test_resample_base_with_timedeltaindex(): # GH 10530 - rng = timedelta_range(start='0s', periods=25, freq='s') + rng = timedelta_range(start="0s", periods=25, freq="s") ts = Series(np.random.randn(len(rng)), index=rng) - with_base = ts.resample('2s', base=5).mean() - without_base = ts.resample('2s').mean() + with_base = ts.resample("2s", base=5).mean() + without_base = ts.resample("2s").mean() - exp_without_base = timedelta_range(start='0s', end='25s', freq='2s') - exp_with_base = timedelta_range(start='5s', end='29s', freq='2s') + exp_without_base = timedelta_range(start="0s", end="25s", freq="2s") + exp_with_base = timedelta_range(start="5s", end="29s", freq="2s") tm.assert_index_equal(without_base.index, exp_without_base) tm.assert_index_equal(with_base.index, exp_with_base) @@ -98,15 +98,15 @@ def test_resample_base_with_timedeltaindex(): def test_resample_categorical_data_with_timedeltaindex(): # GH #12169 - df = DataFrame({'Group_obj': 'A'}, - index=pd.to_timedelta(list(range(20)), unit='s')) - df['Group'] = df['Group_obj'].astype('category') - result = df.resample('10s').agg(lambda x: (x.value_counts().index[0])) - expected = DataFrame({'Group_obj': ['A', 'A'], - 'Group': ['A', 'A']}, - index=pd.to_timedelta([0, 10], unit='s')) - expected = expected.reindex(['Group_obj', 'Group'], axis=1) - expected['Group'] = expected['Group_obj'].astype('category') + df = DataFrame({"Group_obj": "A"}, index=pd.to_timedelta(list(range(20)), unit="s")) + df["Group"] = df["Group_obj"].astype("category") + result = df.resample("10s").agg(lambda x: (x.value_counts().index[0])) + expected = DataFrame( + {"Group_obj": ["A", "A"], "Group": ["A", "A"]}, + index=pd.to_timedelta([0, 10], unit="s"), + ) + expected = expected.reindex(["Group_obj", "Group"], axis=1) + expected["Group"] = expected["Group_obj"].astype("category") tm.assert_frame_equal(result, expected) @@ -115,14 +115,14 @@ def test_resample_timedelta_values(): # check that timedelta dtype is preserved when NaT values are # introduced by the resampling - times = timedelta_range('1 day', '4 day', freq='4D') - df = DataFrame({'time': times}, index=times) + times = timedelta_range("1 day", "4 day", freq="4D") + df = DataFrame({"time": times}, index=times) - times2 = timedelta_range('1 day', '4 day', freq='2D') - exp = Series(times2, index=times2, name='time') + times2 = timedelta_range("1 day", "4 day", freq="2D") + exp = Series(times2, index=times2, name="time") exp.iloc[1] = pd.NaT - res = df.resample('2D').first()['time'] + res = df.resample("2D").first()["time"] tm.assert_series_equal(res, exp) - res = df['time'].resample('2D').first() + res = df["time"].resample("2D").first() tm.assert_series_equal(res, exp) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 72d04e26234a3..16cfe3a469b34 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -14,28 +14,35 @@ class TestJoin: - def setup_method(self, method): # aggregate multiple columns - self.df = DataFrame({'key1': get_test_data(), - 'key2': get_test_data(), - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) # exclude a couple keys for fun - self.df = self.df[self.df['key2'] > 1] + self.df = self.df[self.df["key2"] > 1] - self.df2 = DataFrame({'key1': get_test_data(n=N // 
5), - 'key2': get_test_data(ngroups=NGROUPS // 2, - n=N // 5), - 'value': np.random.randn(N // 5)}) + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), + "value": np.random.randn(N // 5), + } + ) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value - self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, - index=data['C']) + self.source = DataFrame( + {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"] + ) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -44,13 +51,11 @@ def test_cython_left_outer_join(self): ls, rs = libjoin.left_outer_join(left, right, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5, -1, -1]) + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -68,15 +73,35 @@ def test_cython_right_outer_join(self): rs, ls = libjoin.left_outer_join(right, left, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") # 0 1 1 1 - exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, - # 2 2 4 - 6, 7, 8, 6, 7, 8, -1]) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, - 4, 4, 4, 5, 5, 5, 6]) + exp_li = a_( + [ + 0, + 1, + 2, + 3, + 4, + 5, + 3, + 4, + 5, + 3, + 4, + 5, + # 2 2 4 + 6, + 7, + 8, + 6, + 7, + 8, + -1, + ] + ) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -94,13 +119,11 @@ def test_cython_inner_join(self): ls, rs = libjoin.inner_join(left, right, max_group) - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') + exp_ls = left.argsort(kind="mergesort") + exp_rs = right.argsort(kind="mergesort") - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5]) + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 @@ -112,207 +135,209 @@ def test_cython_inner_join(self): tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + joined_key2 = merge(self.df, self.df2, on="key2") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="left") joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='left') + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="left") def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='right') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + joined_key2 = merge(self.df, self.df2, on="key2", how="right") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="right") - joined_both = 
merge(self.df, self.df2, how='right') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='right') + joined_both = merge(self.df, self.df2, how="right") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="right") def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='outer') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + joined_key2 = merge(self.df, self.df2, on="key2", how="outer") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer") - joined_both = merge(self.df, self.df2, how='outer') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='outer') + joined_both = merge(self.df, self.df2, how="outer") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="outer") def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='inner') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + joined_key2 = merge(self.df, self.df2, on="key2", how="inner") + _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner") - joined_both = merge(self.df, self.df2, how='inner') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='inner') + joined_both = merge(self.df, self.df2, how="inner") + _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): - joined = merge(self.df, self.df2, on='key2', - suffixes=['.foo', '.bar']) + joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) - assert 'key1.foo' in joined - assert 'key1.bar' in joined + assert "key1.foo" in joined + assert "key1.bar" in joined def test_handle_overlap_arbitrary_key(self): - joined = merge(self.df, self.df2, - left_on='key2', right_on='key1', - suffixes=['.foo', '.bar']) - assert 'key1.foo' in joined - assert 'key2.bar' in joined + joined = merge( + self.df, + self.df2, + left_on="key2", + right_on="key1", + suffixes=[".foo", ".bar"], + ) + assert "key1.foo" in joined + assert "key2.bar" in joined def test_join_on(self): target = self.target source = self.source - merged = target.join(source, on='C') - tm.assert_series_equal(merged['MergedA'], target['A'], - check_names=False) - tm.assert_series_equal(merged['MergedD'], target['D'], - check_names=False) + merged = target.join(source, on="C") + tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False) + tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], - 'value': [0, 0, 1, 1, 2]}) + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) + joined = df.join(df2, on="key") + expected = DataFrame( + {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]} + ) assert_frame_equal(joined, expected) # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - assert np.isnan(joined['two']['c']) - assert np.isnan(joined['three']['c']) + df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], 
columns=["one"]) + df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"]) + df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"]) + joined = df_a.join(df_b, on="one") + joined = joined.join(df_c, on="one") + assert np.isnan(joined["two"]["c"]) + assert np.isnan(joined["three"]["c"]) # merge column not p resent with pytest.raises(KeyError, match="^'E'$"): - target.join(source, on='E') + target.join(source, on="E") # overlap source_copy = source.copy() - source_copy['A'] = 0 - msg = ("You are trying to merge on float64 and object columns. If" - " you wish to proceed you should use pd.concat") + source_copy["A"] = 0 + msg = ( + "You are trying to merge on float64 and object columns. If" + " you wish to proceed you should use pd.concat" + ) with pytest.raises(ValueError, match=msg): - target.join(source_copy, on='A') + target.join(source_copy, on="A") def test_join_on_fails_with_different_right_index(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - msg = (r'len\(left_on\) must equal the number of levels in the index' - ' of "right"') + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) + msg = ( + r"len\(left_on\) must equal the number of levels in the index" ' of "right"' + ) with pytest.raises(ValueError, match=msg): - merge(df, df2, left_on='a', right_index=True) + merge(df, df2, left_on="a", right_index=True) def test_join_on_fails_with_different_left_index(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}, - index=tm.makeCustomIndex(3, 2)) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}) - msg = (r'len\(right_on\) must equal the number of levels in the index' - ' of "left"') + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}, + index=tm.makeCustomIndex(3, 2), + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} + ) + msg = ( + r"len\(right_on\) must equal the number of levels in the index" ' of "left"' + ) with pytest.raises(ValueError, match=msg): - merge(df, df2, right_on='b', left_index=True) + merge(df, df2, right_on="b", left_index=True) def test_join_on_fails_with_different_column_counts(self): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) + df = DataFrame( + {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)} + ) + df2 = DataFrame( + {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2), + ) msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): - merge(df, df2, right_on='a', left_on=['a', 'b']) + merge(df, df2, right_on="a", left_on=["a", "b"]) - @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])]) + @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])]) def test_join_on_fails_with_wrong_object_type(self, wrong_type): # GH12081 - original issue # GH21220 - merging of Series and DataFrame is now allowed # Edited test to remove the Series 
object from test parameters - df = DataFrame({'a': [1, 1]}) - msg = ("Can only merge Series or DataFrame objects, a {} was passed" - .format(str(type(wrong_type)))) + df = DataFrame({"a": [1, 1]}) + msg = "Can only merge Series or DataFrame objects, a {} was passed".format( + str(type(wrong_type)) + ) with pytest.raises(TypeError, match=msg): - merge(wrong_type, df, left_on='a', right_on='a') + merge(wrong_type, df, left_on="a", right_on="a") with pytest.raises(TypeError, match=msg): - merge(df, wrong_type, left_on='a', right_on='a') + merge(df, wrong_type, left_on="a", right_on="a") def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on='C') - del expected['C'] + expected = self.target.join(self.source, on="C") + del expected["C"] - join_col = self.target.pop('C') + join_col = self.target.pop("C") result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge - merged = self.target.join(self.source.reindex([]), on='C') + merged = self.target.join(self.source.reindex([]), on="C") for col in self.source: assert col in merged assert merged[col].isna().all() - merged2 = self.target.join(self.source.reindex([]), on='C', - how='inner') + merged2 = self.target.join(self.source.reindex([]), on="C", how="inner") tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): - df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1]}, index=["a", "b"]) - joined = df.join(df2, on='key', how='inner') + joined = df.join(df2, on="key", how="inner") - expected = df.join(df2, on='key') - expected = expected[expected['value'].notna()] - tm.assert_series_equal(joined['key'], expected['key'], - check_dtype=False) - tm.assert_series_equal(joined['value'], expected['value'], - check_dtype=False) + expected = df.join(df2, on="key") + expected = expected[expected["value"].notna()] + tm.assert_series_equal(joined["key"], expected["key"], check_dtype=False) + tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + df = DataFrame({"key": ["a", "a", "b", "b", "c"]}) + df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"]) # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') + joined = df.join(df2, on=["key"]) + expected = df.join(df2, on="key") assert_frame_equal(joined, expected) def test_join_on_series(self): - result = self.target.join(self.source['MergedA'], on='C') - expected = self.target.join(self.source[['MergedA']], on='C') + result = self.target.join(self.source["MergedA"], on="C") + expected = self.target.join(self.source[["MergedA"]], on="C") assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 - df = DataFrame({'a': [1, 1]}) - ds = Series([2], index=[1], name='b') - result = df.join(ds, on='a') - expected = DataFrame({'a': [1, 1], - 'b': [2, 2]}, index=df.index) + df = DataFrame({"a": [1, 1]}) + ds = Series([2], index=[1], name="b") + result = df.join(ds, on="a") + expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # 
no overlapping blocks df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' + df1["bool"] = True + df1["string"] = "foo" df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. + df2["int"] = 1 + df2["float"] = 1.0 joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) @@ -323,20 +348,32 @@ def test_join_index_mixed(self, join_type): assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): - df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - assert df1['B'].dtype == np.int64 - assert df1['D'].dtype == np.bool_ + df1 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(10), + columns=["A", "B", "C", "D"], + ) + assert df1["B"].dtype == np.int64 + assert df1["D"].dtype == np.bool_ - df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) + df2 = DataFrame( + {"A": 1.0, "B": 2, "C": "foo", "D": True}, + index=np.arange(0, 10, 2), + columns=["A", "B", "C", "D"], + ) # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] + joined = df1.join(df2, lsuffix="_one", rsuffix="_two") + expected_columns = [ + "A_one", + "B_one", + "C_one", + "D_one", + "A_two", + "B_two", + "C_two", + "D_two", + ] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) @@ -345,37 +382,37 @@ def test_join_index_mixed_overlap(self): def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + x.join(DataFrame([3], index=[0], columns=["A"]), how="outer") def test_join_unconsolidated(self): # GH #331 - a = DataFrame(randn(30, 2), columns=['a', 'b']) + a = DataFrame(randn(30, 2), columns=["a", "b"]) c = Series(randn(30)) - a['c'] = c - d = DataFrame(randn(30, 1), columns=['q']) + a["c"] = c + d = DataFrame(randn(30, 1), columns=["q"]) # it works! 
a.join(d) d.join(a) def test_join_multiindex(self): - index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) + index1 = MultiIndex.from_arrays( + [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) - index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) + index2 = MultiIndex.from_arrays( + [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]], + names=["first", "second"], + ) - df1 = DataFrame(data=np.random.randn(6), index=index1, - columns=['var X']) - df2 = DataFrame(data=np.random.randn(6), index=index2, - columns=['var Y']) + df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"]) + df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"]) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) - joined = df1.join(df2, how='outer') + joined = df1.join(df2, how="outer") ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names @@ -385,7 +422,7 @@ def test_join_multiindex(self): df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) - joined = df1.join(df2, how='outer').sort_index(level=0) + joined = df1.join(df2, how="outer").sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names @@ -394,38 +431,62 @@ def test_join_multiindex(self): assert joined.index.names == index1.names def test_join_inner_multiindex(self): - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] data = np.random.randn(len(key1)) - data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - joined = data.join(to_join, on=['key1', 'key2'], how='inner') - expected = merge(data, to_join.reset_index(), - left_on=['key1', 'key2'], - right_on=['first', 'second'], how='inner', - sort=False) - - expected2 = merge(to_join, data, - right_on=['key1', 'key2'], left_index=True, - how='inner', sort=False) + data = DataFrame({"key1": key1, "key2": key2, "data": data}) + + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + to_join = DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) + + joined = data.join(to_join, on=["key1", "key2"], how="inner") + expected = merge( + data, + to_join.reset_index(), + left_on=["key1", "key2"], + right_on=["first", "second"], + how="inner", + sort=False, + ) + + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) assert_frame_equal(joined, expected2.reindex_like(joined)) - expected2 = merge(to_join, data, right_on=['key1', 'key2'], - 
left_index=True, how='inner', sort=False) + expected2 = merge( + to_join, + data, + right_on=["key1", "key2"], + left_index=True, + how="inner", + sort=False, + ) - expected = expected.drop(['first', 'second'], axis=1) + expected = expected.drop(["first", "second"], axis=1) expected.index = joined.index assert joined.index.is_monotonic @@ -435,39 +496,38 @@ def test_join_inner_multiindex(self): def test_join_hierarchical_mixed(self): # GH 2024 - df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) - new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) - other_df = DataFrame( - [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) - other_df.set_index('a', inplace=True) + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"]) + new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]}) + other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"]) + other_df.set_index("a", inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) - assert ('b', 'mean') in result - assert 'b' in result + assert ("b", "mean") in result + assert "b" in result def test_join_float64_float32(self): - a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) - b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) + a = DataFrame(randn(10, 2), columns=["a", "b"], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=["c"], dtype=np.float32) joined = a.join(b) - assert joined.dtypes['a'] == 'float64' - assert joined.dtypes['b'] == 'float64' - assert joined.dtypes['c'] == 'float32' - - a = np.random.randint(0, 5, 100).astype('int64') - b = np.random.random(100).astype('float64') - c = np.random.random(100).astype('float32') - df = DataFrame({'a': a, 'b': b, 'c': c}) - xpdf = DataFrame({'a': a, 'b': b, 'c': c}) - s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) - rs = df.merge(s, left_on='a', right_index=True) - assert rs.dtypes['a'] == 'int64' - assert rs.dtypes['b'] == 'float64' - assert rs.dtypes['c'] == 'float32' - assert rs.dtypes['md'] == 'float32' - - xp = xpdf.merge(s, left_on='a', right_index=True) + assert joined.dtypes["a"] == "float64" + assert joined.dtypes["b"] == "float64" + assert joined.dtypes["c"] == "float32" + + a = np.random.randint(0, 5, 100).astype("int64") + b = np.random.random(100).astype("float64") + c = np.random.random(100).astype("float32") + df = DataFrame({"a": a, "b": b, "c": c}) + xpdf = DataFrame({"a": a, "b": b, "c": c}) + s = DataFrame(np.random.random(5).astype("float32"), columns=["md"]) + rs = df.merge(s, left_on="a", right_index=True) + assert rs.dtypes["a"] == "int64" + assert rs.dtypes["b"] == "float64" + assert rs.dtypes["c"] == "float32" + assert rs.dtypes["md"] == "float32" + + xp = xpdf.merge(s, left_on="a", right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): @@ -478,119 +538,124 @@ def test_join_many_non_unique_index(self): idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='outer') + result = idf1.join([idf2, idf3], how="outer") - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer") result = result.reset_index() expected = expected[result.columns] - expected['a'] = expected.a.astype('int64') - expected['b'] = 
expected.b.astype('int64') + expected["a"] = expected.a.astype("int64") + expected["b"] = expected.b.astype("int64") assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) - df3 = DataFrame( - {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='inner') + result = idf1.join([idf2, idf3], how="inner") - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner") + expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner") result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - s = Series(np.repeat(np.arange(8), 2), - index=np.repeat(np.arange(8), 2), name='TEST') - inner = df.join(s, how='inner') - outer = df.join(s, how='outer') - left = df.join(s, how='left') - right = df.join(s, how='right') + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) + s = Series( + np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST" + ) + inner = df.join(s, how="inner") + outer = df.join(s, how="outer") + left = df.join(s, how="left") + right = df.join(s, how="right") assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): - left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'value2': ['a', 'b', 'c']}, - index=['bar', 'baz', 'foo']) - - joined = left.join(right, on='key', sort=True) - expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], - 'value': [2, 3, 1, 4], - 'value2': ['a', 'b', 'c', 'c']}, - index=[1, 2, 0, 3]) + left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) assert_frame_equal(joined, expected) # smoke test - joined = left.join(right, on='key', sort=False) + joined = left.join(right, on="key", sort=False) tm.assert_index_equal(joined.index, pd.Index(list(range(4)))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index - df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) - df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"]) + df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) - expected = DataFrame({'a': [1, 2, 3, 3, 4], - 'b': [5, np.nan, 6, 7, np.nan]}, - index=[1, 2, 3, 3, 'a']) + expected = DataFrame( + {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, "a"], 
+ ) tm.assert_frame_equal(result, expected) - df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) - df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"]) + df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) - expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, - index=[1, 2, 2, 'a']) + expected = DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"] + ) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 - index = pd.period_range('2016-01-01', periods=16, freq='M') - df = DataFrame([i for i in range(len(index))], - index=index, columns=['pnum']) + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) df2 = concat([df, df]) - result = df.join(df2, how='inner', rsuffix='_df2') + result = df.join(df2, how="inner", rsuffix="_df2") expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), - columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) + columns=["pnum", "pnum_df2"], + index=df2.sort_index().index, + ) tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 - df = DataFrame(np.random.randn(20, 6), - columns=['a', 'b', 'c', 'd', 'e', 'f']) - df.insert(0, 'id', 0) - df.insert(5, 'dt', 'foo') + df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"]) + df.insert(0, "id", 0) + df.insert(5, "dt", "foo") - grouped = df.groupby('id') + grouped = df.groupby("id") mn = grouped.mean() cn = grouped.count() # it works! - mn.join(cn, rsuffix='_right') + mn.join(cn, rsuffix="_right") def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + df = DataFrame(np.random.randn(10, 6), columns=list("abcdef")) + df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] + df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] @@ -598,25 +663,25 @@ def _check_diff_index(df_list, result, exp_index): tm.assert_frame_equal(result, expected) # different join types - joined = df_list[0].join(df_list[1:], how='outer') + joined = df_list[0].join(df_list[1:], how="outer") _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) - joined = df_list[0].join(df_list[1:], how='inner') + joined = df_list[0].join(df_list[1:], how="inner") _check_diff_index(df_list, joined, df.index[2:8]) msg = "Joining multiple DataFrames only supported for joining on index" with pytest.raises(ValueError, match=msg): - df_list[0].join(df_list[1:], on='a') + df_list[0].join(df_list[1:], on="a") def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.loc[:, ['A', 'B']] - df2 = df.loc[:, ['C', 'D']] - df3 = df.loc[:, ['key']] + df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"]) + df["key"] = ["foo", "bar"] * 4 + df1 = df.loc[:, ["A", "B"]] + df2 = df.loc[:, ["C", "D"]] + df3 = df.loc[:, ["key"]] result = df1.join([df2, df3]) 
assert_frame_equal(result, df) @@ -624,15 +689,18 @@ def test_join_many_mixed(self): def test_join_dups(self): # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') + result = df.join(df, rsuffix="_2") result.columns = expected.columns assert_frame_equal(result, expected) @@ -643,70 +711,72 @@ def test_join_dups(self): z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") + z, left_index=True, right_index=True, how="outer" + ) dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + expected.columns = ["x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y"] assert_frame_equal(dta, expected) def test_join_multi_to_multi(self, join_type): # GH 20475 - leftindex = MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], - names=['abc', 'xy', 'num']) - left = DataFrame({'v1': range(12)}, index=leftindex) - - rightindex = MultiIndex.from_product([list('abc'), list('xy')], - names=['abc', 'xy']) - right = DataFrame({'v2': [100 * i for i in range(1, 7)]}, - index=rightindex) - - result = left.join(right, on=['abc', 'xy'], how=join_type) - expected = (left.reset_index() - .merge(right.reset_index(), - on=['abc', 'xy'], how=join_type) - .set_index(['abc', 'xy', 'num']) - ) + leftindex = MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = DataFrame({"v1": range(12)}, index=leftindex) + + rightindex = MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) + right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) + + result = left.join(right, on=["abc", "xy"], how=join_type) + expected = ( + left.reset_index() + .merge(right.reset_index(), on=["abc", "xy"], how=join_type) + .set_index(["abc", "xy", "num"]) + ) assert_frame_equal(expected, result) - msg = (r'len\(left_on\) must equal the number of levels in the index' - ' of "right"') + msg = ( + r"len\(left_on\) must equal the number of levels in the index" ' of "right"' + ) with pytest.raises(ValueError, match=msg): - left.join(right, on='xy', how=join_type) + left.join(right, on="xy", how=join_type) with pytest.raises(ValueError, match=msg): - right.join(left, on=['abc', 'xy'], how=join_type) + right.join(left, on=["abc", "xy"], how=join_type) def test_join_on_tz_aware_datetimeindex(self): # GH 23931, 26335 df1 = pd.DataFrame( { - 'date': pd.date_range(start='2018-01-01', periods=5, - tz='America/Chicago'), - 'vals': list('abcde') + "date": pd.date_range( + start="2018-01-01", periods=5, tz="America/Chicago" + ), + "vals": list("abcde"), } ) df2 = pd.DataFrame( { - 'date': pd.date_range(start='2018-01-03', periods=5, - tz='America/Chicago'), - 'vals_2': list('tuvwx') + "date": pd.date_range( + start="2018-01-03", periods=5, tz="America/Chicago" + ), + "vals_2": list("tuvwx"), } ) - result = df1.join(df2.set_index('date'), on='date') + result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - 
expected['vals_2'] = pd.Series([np.nan] * 2 + list('tuv'), - dtype=object) + expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) assert_frame_equal(result, expected) -def _check_join(left, right, result, join_col, how='left', - lsuffix='_x', rsuffix='_y'): +def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): # some smoke tests for c in join_col: - assert(result[c].notna().all()) + assert result[c].notna().all() left_grouped = left.groupby(join_col) right_grouped = right.groupby(join_col) @@ -718,9 +788,10 @@ def _check_join(left, right, result, join_col, how='left', try: lgroup = left_grouped.get_group(group_key) except KeyError: - if how in ('left', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) + if how in ("left", "inner"): + raise AssertionError( + "key %s should not have been in the join" % str(group_key) + ) _assert_all_na(l_joined, left.columns, join_col) else: @@ -729,9 +800,10 @@ def _check_join(left, right, result, join_col, how='left', try: rgroup = right_grouped.get_group(group_key) except KeyError: - if how in ('right', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) + if how in ("right", "inner"): + raise AssertionError( + "key %s should not have been in the join" % str(group_key) + ) _assert_all_na(r_joined, right.columns, join_col) else: @@ -739,14 +811,15 @@ def _check_join(left, right, result, join_col, how='left', def _restrict_to_columns(group, columns, suffix): - found = [c for c in group.columns - if c in columns or c.replace(suffix, '') in columns] + found = [ + c for c in group.columns if c in columns or c.replace(suffix, "") in columns + ] # filter group = group.loc[:, found] # get rid of suffixes, if any - group = group.rename(columns=lambda x: x.replace(suffix, '')) + group = group.rename(columns=lambda x: x.replace(suffix, "")) # put in the right order... 
group = group.loc[:, columns] @@ -761,18 +834,18 @@ def _assert_same_contents(join_chunk, source): svalues = source.fillna(NA_SENTINEL).drop_duplicates().values rows = {tuple(row) for row in jvalues} - assert(len(rows) == len(source)) - assert(all(tuple(row) in rows for row in svalues)) + assert len(rows) == len(source) + assert all(tuple(row) in rows for row in svalues) def _assert_all_na(join_chunk, source_columns, join_col): for c in source_columns: if c in join_col: continue - assert(join_chunk[c].isna().all()) + assert join_chunk[c].isna().all() -def _join_by_hand(a, b, how='left'): +def _join_by_hand(a, b, how="left"): join_index = a.index.join(b.index, how=how) a_re = a.reindex(join_index) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8eb4141555260..80365e34fa87a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -12,9 +12,20 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, - Int64Index, IntervalIndex, MultiIndex, PeriodIndex, RangeIndex, Series, - TimedeltaIndex, UInt64Index) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge @@ -30,7 +41,7 @@ def get_test_data(ngroups=NGROUPS, n=N): arr = np.asarray(np.tile(unique_groups, n // ngroups)) if len(arr) < n: - arr = np.asarray(list(arr) + unique_groups[:n - len(arr)]) + arr = np.asarray(list(arr) + unique_groups[: n - len(arr)]) random.shuffle(arr) return arr @@ -38,21 +49,21 @@ def get_test_data(ngroups=NGROUPS, n=N): def get_series(): return [ - pd.Series([1], dtype='int64'), - pd.Series([1], dtype='Int64'), + pd.Series([1], dtype="int64"), + pd.Series([1], dtype="Int64"), pd.Series([1.23]), - pd.Series(['foo']), + pd.Series(["foo"]), pd.Series([True]), - pd.Series([pd.Timestamp('2018-01-01')]), - pd.Series([pd.Timestamp('2018-01-01', tz='US/Eastern')]), + pd.Series([pd.Timestamp("2018-01-01")]), + pd.Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]), ] def get_series_na(): return [ - pd.Series([np.nan], dtype='Int64'), - pd.Series([np.nan], dtype='float'), - pd.Series([np.nan], dtype='object'), + pd.Series([np.nan], dtype="Int64"), + pd.Series([np.nan], dtype="float"), + pd.Series([np.nan], dtype="object"), pd.Series([pd.NaT]), ] @@ -85,82 +96,90 @@ def series_of_dtype_all_na(request): class TestMerge: - def setup_method(self, method): # aggregate multiple columns - self.df = DataFrame({'key1': get_test_data(), - 'key2': get_test_data(), - 'data1': np.random.randn(N), - 'data2': np.random.randn(N)}) + self.df = DataFrame( + { + "key1": get_test_data(), + "key2": get_test_data(), + "data1": np.random.randn(N), + "data2": np.random.randn(N), + } + ) # exclude a couple keys for fun - self.df = self.df[self.df['key2'] > 1] - - self.df2 = DataFrame({'key1': get_test_data(n=N // 5), - 'key2': get_test_data(ngroups=NGROUPS // 2, - n=N // 5), - 'value': np.random.randn(N // 5)}) + self.df = self.df[self.df["key2"] > 1] + + self.df2 = DataFrame( + { + "key1": get_test_data(n=N // 5), + "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5), + "value": np.random.randn(N // 5), + } + ) - self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': 
np.random.randn(7)}) - self.right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) + self.left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + self.right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) def test_merge_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() - df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") result = pd.merge(df_empty, df_a, left_index=True, right_index=True) - expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + expected = pd.DataFrame({"a": []}, index=[], dtype="int64") assert_frame_equal(result, expected) def test_merge_common(self): joined = merge(self.df, self.df2) - exp = merge(self.df, self.df2, on=['key1', 'key2']) + exp = merge(self.df, self.df2, on=["key1", "key2"]) tm.assert_frame_equal(joined, exp) def test_merge_index_as_on_arg(self): # GH14355 - left = self.df.set_index('key1') - right = self.df2.set_index('key1') - result = merge(left, right, on='key1') - expected = merge(self.df, self.df2, on='key1').set_index('key1') + left = self.df.set_index("key1") + right = self.df2.set_index("key1") + result = merge(left, right, on="key1") + expected = merge(self.df, self.df2, on="key1").set_index("key1") assert_frame_equal(result, expected) def test_merge_index_singlekey_right_vs_left(self): - left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': np.random.randn(7)}) - right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) - - merged1 = merge(left, right, left_on='key', - right_index=True, how='left', sort=False) - merged2 = merge(right, left, right_on='key', - left_index=True, how='right', sort=False) + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) + + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=False + ) assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) - merged1 = merge(left, right, left_on='key', - right_index=True, how='left', sort=True) - merged2 = merge(right, left, right_on='key', - left_index=True, how='right', sort=True) + merged1 = merge( + left, right, left_on="key", right_index=True, how="left", sort=True + ) + merged2 = merge( + right, left, right_on="key", left_index=True, how="right", sort=True + ) assert_frame_equal(merged1, merged2.loc[:, merged1.columns]) def test_merge_index_singlekey_inner(self): - left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], - 'v1': np.random.randn(7)}) - right = DataFrame({'v2': np.random.randn(4)}, - index=['d', 'b', 'c', 'a']) + left = DataFrame( + {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)} + ) + right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"]) # inner join - result = merge(left, right, left_on='key', right_index=True, - how='inner') - expected = left.join(right, on='key').loc[result.index] + result = merge(left, right, left_on="key", right_index=True, how="inner") + expected = left.join(right, on="key").loc[result.index] assert_frame_equal(result, expected) - result = merge(right, left, right_on='key', left_index=True, - how='inner') - expected = left.join(right, on='key').loc[result.index] + result = merge(right, left, right_on="key", left_index=True, how="inner") 
+ expected = left.join(right, on="key").loc[result.index] assert_frame_equal(result, expected.loc[:, result.columns]) def test_merge_misspecified(self): @@ -171,166 +190,182 @@ def test_merge_misspecified(self): with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.right, right_index=True) - msg = ('Can only pass argument "on" OR "left_on" and "right_on", not' - ' a combination of both') + msg = ( + 'Can only pass argument "on" OR "left_on" and "right_on", not' + " a combination of both" + ) with pytest.raises(pd.errors.MergeError, match=msg): - merge(self.left, self.left, left_on='key', on='key') + merge(self.left, self.left, left_on="key", on="key") msg = r"len\(right_on\) must equal len\(left_on\)" with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, left_on=['key1'], - right_on=['key1', 'key2']) + merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"]) def test_index_and_on_parameters_confusion(self): msg = "right_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=False, right_index=['key1', 'key2']) + merge( + self.df, + self.df2, + how="left", + left_index=False, + right_index=["key1", "key2"], + ) msg = "left_index parameter must be of type bool, not " with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=['key1', 'key2'], right_index=False) + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=False, + ) with pytest.raises(ValueError, match=msg): - merge(self.df, self.df2, how='left', - left_index=['key1', 'key2'], right_index=['key1', 'key2']) + merge( + self.df, + self.df2, + how="left", + left_index=["key1", "key2"], + right_index=["key1", "key2"], + ) def test_merge_overlap(self): - merged = merge(self.left, self.left, on='key') - exp_len = (self.left['key'].value_counts() ** 2).sum() + merged = merge(self.left, self.left, on="key") + exp_len = (self.left["key"].value_counts() ** 2).sum() assert len(merged) == exp_len - assert 'v1_x' in merged - assert 'v1_y' in merged + assert "v1_x" in merged + assert "v1_y" in merged def test_merge_different_column_key_names(self): - left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'], - 'value': [5, 6, 7, 8]}) + left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) + right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]}) - merged = left.merge(right, left_on='lkey', right_on='rkey', - how='outer', sort=True) + merged = left.merge( + right, left_on="lkey", right_on="rkey", how="outer", sort=True + ) - exp = pd.Series(['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan], - name='lkey') - tm.assert_series_equal(merged['lkey'], exp) + exp = pd.Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey") + tm.assert_series_equal(merged["lkey"], exp) - exp = pd.Series(['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux'], - name='rkey') - tm.assert_series_equal(merged['rkey'], exp) + exp = pd.Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey") + tm.assert_series_equal(merged["rkey"], exp) - exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name='value_x') - tm.assert_series_equal(merged['value_x'], exp) + exp = pd.Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x") + tm.assert_series_equal(merged["value_x"], exp) - exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name='value_y') - 
tm.assert_series_equal(merged['value_y'], exp) + exp = pd.Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y") + tm.assert_series_equal(merged["value_y"], exp) def test_merge_copy(self): - left = DataFrame({'a': 0, 'b': 1}, index=range(10)) - right = DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10)) + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, - right_index=True, copy=True) + merged = merge(left, right, left_index=True, right_index=True, copy=True) - merged['a'] = 6 - assert (left['a'] == 0).all() + merged["a"] = 6 + assert (left["a"] == 0).all() - merged['d'] = 'peekaboo' - assert (right['d'] == 'bar').all() + merged["d"] = "peekaboo" + assert (right["d"] == "bar").all() def test_merge_nocopy(self): - left = DataFrame({'a': 0, 'b': 1}, index=range(10)) - right = DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10)) + left = DataFrame({"a": 0, "b": 1}, index=range(10)) + right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) - merged = merge(left, right, left_index=True, - right_index=True, copy=False) + merged = merge(left, right, left_index=True, right_index=True, copy=False) - merged['a'] = 6 - assert (left['a'] == 6).all() + merged["a"] = 6 + assert (left["a"] == 6).all() - merged['d'] = 'peekaboo' - assert (right['d'] == 'peekaboo').all() + merged["d"] = "peekaboo" + assert (right["d"] == "peekaboo").all() def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame - left = DataFrame({'key': [1, 1, 2, 2, 3], - 'value': list(range(5))}, - columns=['value', 'key']) - right = DataFrame({'key': [1, 1, 2, 3, 4, 5], - 'rvalue': list(range(6))}) - - joined = merge(left, right, on='key', how='outer') - expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], - 'value': np.array([0, 0, 1, 1, 2, 3, 4, - np.nan, np.nan]), - 'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]}, - columns=['value', 'key', 'rvalue']) + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"] + ) + right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))}) + + joined = merge(left, right, on="key", how="outer") + expected = DataFrame( + { + "key": [1, 1, 1, 1, 2, 2, 3, 4, 5], + "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), + "rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5], + }, + columns=["value", "key", "rvalue"], + ) assert_frame_equal(joined, expected) def test_merge_join_key_dtype_cast(self): # #8596 - df1 = DataFrame({'key': [1], 'v1': [10]}) - df2 = DataFrame({'key': [2], 'v1': [20]}) - df = merge(df1, df2, how='outer') - assert df['key'].dtype == 'int64' + df1 = DataFrame({"key": [1], "v1": [10]}) + df2 = DataFrame({"key": [2], "v1": [20]}) + df = merge(df1, df2, how="outer") + assert df["key"].dtype == "int64" - df1 = DataFrame({'key': [True], 'v1': [1]}) - df2 = DataFrame({'key': [False], 'v1': [0]}) - df = merge(df1, df2, how='outer') + df1 = DataFrame({"key": [True], "v1": [1]}) + df2 = DataFrame({"key": [False], "v1": [0]}) + df = merge(df1, df2, how="outer") # GH13169 # this really should be bool - assert df['key'].dtype == 'object' + assert df["key"].dtype == "object" - df1 = DataFrame({'val': [1]}) - df2 = DataFrame({'val': [2]}) + df1 = DataFrame({"val": [1]}) + df2 = DataFrame({"val": [2]}) lkey = np.array([1]) rkey = np.array([2]) - df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') - assert df['key_0'].dtype == 'int64' + df = merge(df1, df2, left_on=lkey, 
right_on=rkey, how="outer") + assert df["key_0"].dtype == "int64" def test_handle_join_key_pass_array(self): - left = DataFrame({'key': [1, 1, 2, 2, 3], - 'value': np.arange(5)}, - columns=['value', 'key']) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame( + {"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"] + ) + right = DataFrame({"rvalue": np.arange(6)}) key = np.array([1, 1, 2, 3, 4, 5]) - merged = merge(left, right, left_on='key', right_on=key, how='outer') - merged2 = merge(right, left, left_on=key, right_on='key', how='outer') + merged = merge(left, right, left_on="key", right_on=key, how="outer") + merged2 = merge(right, left, left_on=key, right_on="key", how="outer") - assert_series_equal(merged['key'], merged2['key']) - assert merged['key'].notna().all() - assert merged2['key'].notna().all() + assert_series_equal(merged["key"], merged2["key"]) + assert merged["key"].notna().all() + assert merged2["key"].notna().all() - left = DataFrame({'value': np.arange(5)}, columns=['value']) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame({"value": np.arange(5)}, columns=["value"]) + right = DataFrame({"rvalue": np.arange(6)}) lkey = np.array([1, 1, 2, 2, 3]) rkey = np.array([1, 1, 2, 3, 4, 5]) - merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - tm.assert_series_equal(merged['key_0'], Series([1, 1, 1, 1, 2, - 2, 3, 4, 5], - name='key_0')) + merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer") + tm.assert_series_equal( + merged["key_0"], Series([1, 1, 1, 1, 2, 2, 3, 4, 5], name="key_0") + ) - left = DataFrame({'value': np.arange(3)}) - right = DataFrame({'rvalue': np.arange(6)}) + left = DataFrame({"value": np.arange(3)}) + right = DataFrame({"rvalue": np.arange(6)}) key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) - merged = merge(left, right, left_index=True, right_on=key, how='outer') - tm.assert_series_equal(merged['key_0'], Series(key, name='key_0')) + merged = merge(left, right, left_index=True, right_on=key, how="outer") + tm.assert_series_equal(merged["key_0"], Series(key, name="key_0")) def test_no_overlap_more_informative_error(self): dt = datetime.now() - df1 = DataFrame({'x': ['a']}, index=[dt]) + df1 = DataFrame({"x": ["a"]}, index=[dt]) - df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) - msg = ('No common columns to perform merge on. ' - 'Merge options: left_on={lon}, right_on={ron}, ' - 'left_index={lidx}, right_index={ridx}' - .format(lon=None, ron=None, lidx=False, ridx=False)) + msg = ( + "No common columns to perform merge on. 
" + "Merge options: left_on={lon}, right_on={ron}, " + "left_index={lidx}, right_index={ridx}".format( + lon=None, ron=None, lidx=False, ridx=False + ) + ) with pytest.raises(MergeError, match=msg): merge(df1, df2) @@ -342,165 +377,191 @@ def test_merge_non_unique_indexes(self): dt3 = datetime(2012, 5, 3) dt4 = datetime(2012, 5, 4) - df1 = DataFrame({'x': ['a']}, index=[dt]) - df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + df1 = DataFrame({"x": ["a"]}, index=[dt]) + df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt]) _check_merge(df1, df2) # Not monotonic - df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4]) - df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']}, - index=[dt3, dt3, dt2, dt2, dt, dt]) + df1 = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4]) + df2 = DataFrame( + {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt] + ) _check_merge(df1, df2) - df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt]) - df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt]) + df1 = DataFrame({"x": ["a", "b"]}, index=[dt, dt]) + df2 = DataFrame({"y": ["c", "d"]}, index=[dt, dt]) _check_merge(df1, df2) def test_merge_non_unique_index_many_to_many(self): dt = datetime(2012, 5, 1) dt2 = datetime(2012, 5, 2) dt3 = datetime(2012, 5, 3) - df1 = DataFrame({'x': ['a', 'b', 'c', 'd']}, - index=[dt2, dt2, dt, dt]) - df2 = DataFrame({'y': ['e', 'f', 'g', ' h', 'i']}, - index=[dt2, dt2, dt3, dt, dt]) + df1 = DataFrame({"x": ["a", "b", "c", "d"]}, index=[dt2, dt2, dt, dt]) + df2 = DataFrame( + {"y": ["e", "f", "g", " h", "i"]}, index=[dt2, dt2, dt3, dt, dt] + ) _check_merge(df1, df2) def test_left_merge_empty_dataframe(self): - left = DataFrame({'key': [1], 'value': [2]}) - right = DataFrame({'key': []}) + left = DataFrame({"key": [1], "value": [2]}) + right = DataFrame({"key": []}) - result = merge(left, right, on='key', how='left') + result = merge(left, right, on="key", how="left") assert_frame_equal(result, left) - result = merge(right, left, on='key', how='right') + result = merge(right, left, on="key", how="right") assert_frame_equal(result, left) - @pytest.mark.parametrize('kwarg', - [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]) + @pytest.mark.parametrize( + "kwarg", + [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ], + ) def test_merge_left_empty_right_empty(self, join_type, kwarg): # GH 10824 - left = pd.DataFrame(columns=['a', 'b', 'c']) - right = pd.DataFrame(columns=['x', 'y', 'z']) + left = pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) - exp_in = pd.DataFrame(columns=['a', 'b', 'c', 'x', 'y', 'z'], - index=pd.Index([], dtype=object), - dtype=object) + exp_in = pd.DataFrame( + columns=["a", "b", "c", "x", "y", "z"], + index=pd.Index([], dtype=object), + dtype=object, + ) result = pd.merge(left, right, how=join_type, **kwarg) tm.assert_frame_equal(result, exp_in) def test_merge_left_empty_right_notempty(self): # GH 10824 - left = pd.DataFrame(columns=['a', 'b', 'c']) - right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['x', 'y', 'z']) - - exp_out = pd.DataFrame({'a': np.array([np.nan] * 3, dtype=object), - 'b': np.array([np.nan] * 3, dtype=object), - 'c': np.array([np.nan] * 3, dtype=object), - 'x': [1, 4, 7], - 'y': [2, 5, 8], - 'z': [3, 6, 9]}, - columns=['a', 'b', 'c', 'x', 'y', 'z']) + 
left = pd.DataFrame(columns=["a", "b", "c"]) + right = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": np.array([np.nan] * 3, dtype=object), + "b": np.array([np.nan] * 3, dtype=object), + "c": np.array([np.nan] * 3, dtype=object), + "x": [1, 4, 7], + "y": [2, 5, 8], + "z": [3, 6, 9], + }, + columns=["a", "b", "c", "x", "y", "z"], + ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype # result will have object dtype exp_in.index = exp_in.index.astype(object) def check1(exp, kwarg): - result = pd.merge(left, right, how='inner', **kwarg) + result = pd.merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='left', **kwarg) + result = pd.merge(left, right, how="left", **kwarg) tm.assert_frame_equal(result, exp) def check2(exp, kwarg): - result = pd.merge(left, right, how='right', **kwarg) + result = pd.merge(left, right, how="right", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='outer', **kwarg) + result = pd.merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x')]: + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + ]: check1(exp_in, kwarg) check2(exp_out, kwarg) - kwarg = dict(left_on='a', right_index=True) + kwarg = dict(left_on="a", right_index=True) check1(exp_in, kwarg) - exp_out['a'] = [0, 1, 2] + exp_out["a"] = [0, 1, 2] check2(exp_out, kwarg) - kwarg = dict(left_on='a', right_on='x') + kwarg = dict(left_on="a", right_on="x") check1(exp_in, kwarg) - exp_out['a'] = np.array([np.nan] * 3, dtype=object) + exp_out["a"] = np.array([np.nan] * 3, dtype=object) check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): # GH 10824 - left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['a', 'b', 'c']) - right = pd.DataFrame(columns=['x', 'y', 'z']) - - exp_out = pd.DataFrame({'a': [1, 4, 7], - 'b': [2, 5, 8], - 'c': [3, 6, 9], - 'x': np.array([np.nan] * 3, dtype=object), - 'y': np.array([np.nan] * 3, dtype=object), - 'z': np.array([np.nan] * 3, dtype=object)}, - columns=['a', 'b', 'c', 'x', 'y', 'z']) + left = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) + right = pd.DataFrame(columns=["x", "y", "z"]) + + exp_out = pd.DataFrame( + { + "a": [1, 4, 7], + "b": [2, 5, 8], + "c": [3, 6, 9], + "x": np.array([np.nan] * 3, dtype=object), + "y": np.array([np.nan] * 3, dtype=object), + "z": np.array([np.nan] * 3, dtype=object), + }, + columns=["a", "b", "c", "x", "y", "z"], + ) exp_in = exp_out[0:0] # make empty DataFrame keeping dtype # result will have object dtype exp_in.index = exp_in.index.astype(object) def check1(exp, kwarg): - result = pd.merge(left, right, how='inner', **kwarg) + result = pd.merge(left, right, how="inner", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='right', **kwarg) + result = pd.merge(left, right, how="right", **kwarg) tm.assert_frame_equal(result, exp) def check2(exp, kwarg): - result = pd.merge(left, right, how='left', **kwarg) + result = pd.merge(left, right, how="left", **kwarg) tm.assert_frame_equal(result, exp) - result = pd.merge(left, right, how='outer', **kwarg) + result = pd.merge(left, right, how="outer", **kwarg) tm.assert_frame_equal(result, exp) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - 
dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: + for kwarg in [ + dict(left_index=True, right_index=True), + dict(left_index=True, right_on="x"), + dict(left_on="a", right_index=True), + dict(left_on="a", right_on="x"), + ]: check1(exp_in, kwarg) check2(exp_out, kwarg) def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2): # GH 25183 - df = pd.DataFrame({'key': series_of_dtype, 'value': series_of_dtype2}, - columns=['key', 'value']) + df = pd.DataFrame( + {"key": series_of_dtype, "value": series_of_dtype2}, + columns=["key", "value"], + ) df_empty = df[:0] - expected = pd.DataFrame({ - 'value_x': pd.Series(dtype=df.dtypes['value']), - 'key': pd.Series(dtype=df.dtypes['key']), - 'value_y': pd.Series(dtype=df.dtypes['value']), - }, columns=['value_x', 'key', 'value_y']) - actual = df_empty.merge(df, on='key') + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=df.dtypes["value"]), + "key": pd.Series(dtype=df.dtypes["key"]), + "value_y": pd.Series(dtype=df.dtypes["value"]), + }, + columns=["value_x", "key", "value_y"], + ) + actual = df_empty.merge(df, on="key") assert_frame_equal(actual, expected) - def test_merge_all_na_column(self, series_of_dtype, - series_of_dtype_all_na): + def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na): # GH 25183 df_left = pd.DataFrame( - {'key': series_of_dtype, 'value': series_of_dtype_all_na}, - columns=['key', 'value']) + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) df_right = pd.DataFrame( - {'key': series_of_dtype, 'value': series_of_dtype_all_na}, - columns=['key', 'value']) - expected = pd.DataFrame({ - 'key': series_of_dtype, - 'value_x': series_of_dtype_all_na, - 'value_y': series_of_dtype_all_na, - }, columns=['key', 'value_x', 'value_y']) - actual = df_left.merge(df_right, on='key') + {"key": series_of_dtype, "value": series_of_dtype_all_na}, + columns=["key", "value"], + ) + expected = pd.DataFrame( + { + "key": series_of_dtype, + "value_x": series_of_dtype_all_na, + "value_y": series_of_dtype_all_na, + }, + columns=["key", "value_x", "value_y"], + ) + actual = df_left.merge(df_right, on="key") assert_frame_equal(actual, expected) def test_merge_nosort(self): @@ -508,26 +569,29 @@ def test_merge_nosort(self): from datetime import datetime - d = {"var1": np.random.randint(0, 10, size=10), - "var2": np.random.randint(0, 10, size=10), - "var3": [datetime(2012, 1, 12), - datetime(2011, 2, 4), - datetime(2010, 2, 3), - datetime(2012, 1, 12), - datetime(2011, 2, 4), - datetime(2012, 4, 3), - datetime(2012, 3, 4), - datetime(2008, 5, 1), - datetime(2010, 2, 3), - datetime(2012, 2, 3)]} + d = { + "var1": np.random.randint(0, 10, size=10), + "var2": np.random.randint(0, 10, size=10), + "var3": [ + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2010, 2, 3), + datetime(2012, 1, 12), + datetime(2011, 2, 4), + datetime(2012, 4, 3), + datetime(2012, 3, 4), + datetime(2008, 5, 1), + datetime(2010, 2, 3), + datetime(2012, 2, 3), + ], + } df = DataFrame.from_dict(d) var3 = df.var3.unique() var3.sort() - new = DataFrame.from_dict({"var3": var3, - "var8": np.random.random(7)}) + new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)}) result = df.merge(new, on="var3", sort=False) - exp = merge(df, new, on='var3', sort=False) + exp = merge(df, new, on="var3", sort=False) assert_frame_equal(result, exp) assert (df.var3.unique() == result.var3.unique()).all() @@ -536,33 +600,42 @@ def test_merge_nan_right(self): df1 = 
DataFrame({"i1": [0, 1], "i2": [0, 1]}) df2 = DataFrame({"i1": [0], "i3": [0]}) result = df1.join(df2, on="i1", rsuffix="_") - expected = (DataFrame({'i1': {0: 0.0, 1: 1}, 'i2': {0: 0, 1: 1}, - 'i1_': {0: 0, 1: np.nan}, - 'i3': {0: 0.0, 1: np.nan}, - None: {0: 0, 1: 0}}) - .set_index(None) - .reset_index()[['i1', 'i2', 'i1_', 'i3']]) + expected = ( + DataFrame( + { + "i1": {0: 0.0, 1: 1}, + "i2": {0: 0, 1: 1}, + "i1_": {0: 0, 1: np.nan}, + "i3": {0: 0.0, 1: np.nan}, + None: {0: 0, 1: 0}, + } + ) + .set_index(None) + .reset_index()[["i1", "i2", "i1_", "i3"]] + ) assert_frame_equal(result, expected, check_dtype=False) df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]}) df2 = DataFrame({"i1": [0], "i3": [0.7]}) - result = df1.join(df2, rsuffix="_", on='i1') - expected = (DataFrame({'i1': {0: 0, 1: 1}, 'i1_': {0: 0.0, 1: nan}, - 'i2': {0: 0.5, 1: 1.5}, - 'i3': {0: 0.69999999999999996, - 1: nan}}) - [['i1', 'i2', 'i1_', 'i3']]) + result = df1.join(df2, rsuffix="_", on="i1") + expected = DataFrame( + { + "i1": {0: 0, 1: 1}, + "i1_": {0: 0.0, 1: nan}, + "i2": {0: 0.5, 1: 1.5}, + "i3": {0: 0.69999999999999996, 1: nan}, + } + )[["i1", "i2", "i1_", "i3"]] assert_frame_equal(result, expected) def test_merge_type(self): class NotADataFrame(DataFrame): - @property def _constructor(self): return NotADataFrame nad = NotADataFrame(self.df) - result = nad.merge(self.df2, on='key1') + result = nad.merge(self.df2, on="key1") assert isinstance(result, NotADataFrame) @@ -574,237 +647,318 @@ def test_join_append_timedeltas(self): # timedelta64 issues with join/merge # GH 5695 - d = {'d': dt.datetime(2013, 11, 5, 5, 56), 't': dt.timedelta(0, 22500)} - df = DataFrame(columns=list('dt')) + d = {"d": dt.datetime(2013, 11, 5, 5, 56), "t": dt.timedelta(0, 22500)} + df = DataFrame(columns=list("dt")) df = df.append(d, ignore_index=True) result = df.append(d, ignore_index=True) - expected = DataFrame({'d': [dt.datetime(2013, 11, 5, 5, 56), - dt.datetime(2013, 11, 5, 5, 56)], - 't': [dt.timedelta(0, 22500), - dt.timedelta(0, 22500)]}) + expected = DataFrame( + { + "d": [dt.datetime(2013, 11, 5, 5, 56), dt.datetime(2013, 11, 5, 5, 56)], + "t": [dt.timedelta(0, 22500), dt.timedelta(0, 22500)], + } + ) assert_frame_equal(result, expected) td = np.timedelta64(300000000) lhs = DataFrame(Series([td, td], index=["A", "B"])) rhs = DataFrame(Series([td], index=["A"])) - result = lhs.join(rhs, rsuffix='r', how="left") - expected = DataFrame({'0': Series([td, td], index=list('AB')), - '0r': Series([td, NaT], index=list('AB'))}) + result = lhs.join(rhs, rsuffix="r", how="left") + expected = DataFrame( + { + "0": Series([td, td], index=list("AB")), + "0r": Series([td, NaT], index=list("AB")), + } + ) assert_frame_equal(result, expected) def test_other_datetime_unit(self): # GH 13389 - df1 = pd.DataFrame({'entity_id': [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name='days') - - for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', - 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', - 'datetime64[ns]']: - - df2 = s.astype(dtype).to_frame('days') + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") + + for dtype in [ + "datetime64[D]", + "datetime64[h]", + "datetime64[m]", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ]: + + df2 = s.astype(dtype).to_frame("days") # coerces to datetime64[ns], thus should not be affected - assert df2['days'].dtype == 'datetime64[ns]' + assert df2["days"].dtype == "datetime64[ns]" - 
result = df1.merge(df2, left_on='entity_id', right_index=True) + result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame({'entity_id': [101, 102], - 'days': np.array(['nat', 'nat'], - dtype='datetime64[ns]')}, - columns=['entity_id', 'days']) + exp = pd.DataFrame( + { + "entity_id": [101, 102], + "days": np.array(["nat", "nat"], dtype="datetime64[ns]"), + }, + columns=["entity_id", "days"], + ) tm.assert_frame_equal(result, exp) - @pytest.mark.parametrize("unit", ['D', 'h', 'm', 's', 'ms', 'us', 'ns']) + @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_other_timedelta_unit(self, unit): # GH 13389 - df1 = pd.DataFrame({'entity_id': [101, 102]}) - s = pd.Series([None, None], index=[101, 102], name='days') + df1 = pd.DataFrame({"entity_id": [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name="days") dtype = "m8[{}]".format(unit) - df2 = s.astype(dtype).to_frame('days') - assert df2['days'].dtype == 'm8[ns]' + df2 = s.astype(dtype).to_frame("days") + assert df2["days"].dtype == "m8[ns]" - result = df1.merge(df2, left_on='entity_id', right_index=True) + result = df1.merge(df2, left_on="entity_id", right_index=True) - exp = pd.DataFrame({'entity_id': [101, 102], - 'days': np.array(['nat', 'nat'], - dtype=dtype)}, - columns=['entity_id', 'days']) + exp = pd.DataFrame( + {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)}, + columns=["entity_id", "days"], + ) tm.assert_frame_equal(result, exp) def test_overlapping_columns_error_message(self): - df = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9]}) - df2 = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9]}) - - df.columns = ['key', 'foo', 'foo'] - df2.columns = ['key', 'bar', 'bar'] - expected = DataFrame({'key': [1, 2, 3], - 'v1': [4, 5, 6], - 'v2': [7, 8, 9], - 'v3': [4, 5, 6], - 'v4': [7, 8, 9]}) - expected.columns = ['key', 'foo', 'foo', 'bar', 'bar'] + df = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + df2 = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]}) + + df.columns = ["key", "foo", "foo"] + df2.columns = ["key", "bar", "bar"] + expected = DataFrame( + { + "key": [1, 2, 3], + "v1": [4, 5, 6], + "v2": [7, 8, 9], + "v3": [4, 5, 6], + "v4": [7, 8, 9], + } + ) + expected.columns = ["key", "foo", "foo", "bar", "bar"] assert_frame_equal(merge(df, df2), expected) # #2649, #10639 - df2.columns = ['key1', 'foo', 'foo'] - msg = (r"Data columns not unique: Index\(\['foo', 'foo'\]," - r" dtype='object'\)") + df2.columns = ["key1", "foo", "foo"] + msg = r"Data columns not unique: Index\(\['foo', 'foo'\]," r" dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) def test_merge_on_datetime64tz(self): # GH11405 - left = pd.DataFrame({'key': pd.date_range('20151010', periods=2, - tz='US/Eastern'), - 'value': [1, 2]}) - right = pd.DataFrame({'key': pd.date_range('20151011', periods=3, - tz='US/Eastern'), - 'value': [1, 2, 3]}) - - expected = DataFrame({'key': pd.date_range('20151010', periods=4, - tz='US/Eastern'), - 'value_x': [1, 2, np.nan, np.nan], - 'value_y': [np.nan, 1, 2, 3]}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + { + "key": pd.date_range("20151010", periods=2, tz="US/Eastern"), + "value": [1, 2], + } + ) + right = pd.DataFrame( + { + "key": pd.date_range("20151011", periods=3, tz="US/Eastern"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.date_range("20151010", periods=4, tz="US/Eastern"), + 
"value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - left = pd.DataFrame({'key': [1, 2], - 'value': pd.date_range('20151010', periods=2, - tz='US/Eastern')}) - right = pd.DataFrame({'key': [2, 3], - 'value': pd.date_range('20151011', periods=2, - tz='US/Eastern')}) - expected = DataFrame({ - 'key': [1, 2, 3], - 'value_x': list(pd.date_range('20151010', periods=2, - tz='US/Eastern')) + [pd.NaT], - 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2, - tz='US/Eastern'))}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + { + "key": [1, 2], + "value": pd.date_range("20151010", periods=2, tz="US/Eastern"), + } + ) + right = pd.DataFrame( + { + "key": [2, 3], + "value": pd.date_range("20151011", periods=2, tz="US/Eastern"), + } + ) + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern")) + + [pd.NaT], + "value_y": [pd.NaT] + + list(pd.date_range("20151011", periods=2, tz="US/Eastern")), + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' - assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + assert result["value_x"].dtype == "datetime64[ns, US/Eastern]" + assert result["value_y"].dtype == "datetime64[ns, US/Eastern]" def test_merge_on_datetime64tz_empty(self): # https://github.com/pandas-dev/pandas/issues/25014 - dtz = pd.DatetimeTZDtype(tz='UTC') - right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)], - 'value': [4.0], - 'date2': [pd.Timestamp('2019', tz=dtz.tz)]}, - columns=['date', 'value', 'date2']) + dtz = pd.DatetimeTZDtype(tz="UTC") + right = pd.DataFrame( + { + "date": [pd.Timestamp("2018", tz=dtz.tz)], + "value": [4.0], + "date2": [pd.Timestamp("2019", tz=dtz.tz)], + }, + columns=["date", "value", "date2"], + ) left = right[:0] - result = left.merge(right, on='date') - expected = pd.DataFrame({ - 'value_x': pd.Series(dtype=float), - 'date2_x': pd.Series(dtype=dtz), - 'date': pd.Series(dtype=dtz), - 'value_y': pd.Series(dtype=float), - 'date2_y': pd.Series(dtype=dtz), - }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y']) + result = left.merge(right, on="date") + expected = pd.DataFrame( + { + "value_x": pd.Series(dtype=float), + "date2_x": pd.Series(dtype=dtz), + "date": pd.Series(dtype=dtz), + "value_y": pd.Series(dtype=float), + "date2_y": pd.Series(dtype=dtz), + }, + columns=["value_x", "date2_x", "date", "value_y", "date2_y"], + ) tm.assert_frame_equal(result, expected) def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 - df1 = pd.DataFrame(pd.date_range( - '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), - columns=['date']) - df1['value'] = 1 - df2 = pd.DataFrame({ - 'date': pd.to_datetime([ - '2017-10-29 03:00:00', '2017-10-29 04:00:00', - '2017-10-29 05:00:00' - ]), - 'value': 2 - }) - df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( - 'Europe/Madrid') - result = pd.merge(df1, df2, how='outer', on='date') - expected = pd.DataFrame({ - 'date': pd.date_range( - '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'), - 'value_x': [1] * 4 + [np.nan] * 3, - 'value_y': [np.nan] * 4 + [2] * 3 - }) + df1 = pd.DataFrame( + pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"), + columns=["date"], + ) + df1["value"] = 1 + df2 = pd.DataFrame( + { + "date": 
pd.to_datetime( + [ + "2017-10-29 03:00:00", + "2017-10-29 04:00:00", + "2017-10-29 05:00:00", + ] + ), + "value": 2, + } + ) + df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid") + result = pd.merge(df1, df2, how="outer", on="date") + expected = pd.DataFrame( + { + "date": pd.date_range( + "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid" + ), + "value_x": [1] * 4 + [np.nan] * 3, + "value_y": [np.nan] * 4 + [2] * 3, + } + ) assert_frame_equal(result, expected) def test_merge_non_unique_period_index(self): # GH #16871 - index = pd.period_range('2016-01-01', periods=16, freq='M') - df = DataFrame([i for i in range(len(index))], - index=index, columns=['pnum']) + index = pd.period_range("2016-01-01", periods=16, freq="M") + df = DataFrame([i for i in range(len(index))], index=index, columns=["pnum"]) df2 = concat([df, df]) - result = df.merge(df2, left_index=True, right_index=True, how='inner') + result = df.merge(df2, left_index=True, right_index=True, how="inner") expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), - columns=['pnum_x', 'pnum_y'], index=df2.sort_index().index) + columns=["pnum_x", "pnum_y"], + index=df2.sort_index().index, + ) tm.assert_frame_equal(result, expected) def test_merge_on_periods(self): - left = pd.DataFrame({'key': pd.period_range('20151010', periods=2, - freq='D'), - 'value': [1, 2]}) - right = pd.DataFrame({'key': pd.period_range('20151011', periods=3, - freq='D'), - 'value': [1, 2, 3]}) - - expected = DataFrame({'key': pd.period_range('20151010', periods=4, - freq='D'), - 'value_x': [1, 2, np.nan, np.nan], - 'value_y': [np.nan, 1, 2, 3]}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]} + ) + right = pd.DataFrame( + { + "key": pd.period_range("20151011", periods=3, freq="D"), + "value": [1, 2, 3], + } + ) + + expected = DataFrame( + { + "key": pd.period_range("20151010", periods=4, freq="D"), + "value_x": [1, 2, np.nan, np.nan], + "value_y": [np.nan, 1, 2, 3], + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - left = pd.DataFrame({'key': [1, 2], - 'value': pd.period_range('20151010', periods=2, - freq='D')}) - right = pd.DataFrame({'key': [2, 3], - 'value': pd.period_range('20151011', periods=2, - freq='D')}) - - exp_x = pd.period_range('20151010', periods=2, freq='D') - exp_y = pd.period_range('20151011', periods=2, freq='D') - expected = DataFrame({'key': [1, 2, 3], - 'value_x': list(exp_x) + [pd.NaT], - 'value_y': [pd.NaT] + list(exp_y)}) - result = pd.merge(left, right, on='key', how='outer') + left = pd.DataFrame( + {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")} + ) + right = pd.DataFrame( + {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")} + ) + + exp_x = pd.period_range("20151010", periods=2, freq="D") + exp_y = pd.period_range("20151011", periods=2, freq="D") + expected = DataFrame( + { + "key": [1, 2, 3], + "value_x": list(exp_x) + [pd.NaT], + "value_y": [pd.NaT] + list(exp_y), + } + ) + result = pd.merge(left, right, on="key", how="outer") assert_frame_equal(result, expected) - assert result['value_x'].dtype == 'Period[D]' - assert result['value_y'].dtype == 'Period[D]' + assert result["value_x"].dtype == "Period[D]" + assert result["value_y"].dtype == "Period[D]" def test_indicator(self): # PR #10054. xref #7412 and closes #8790. 
- df1 = DataFrame({'col1': [0, 1], 'col_conflict': [1, 2], - 'col_left': ['a', 'b']}) + df1 = DataFrame( + {"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]} + ) df1_copy = df1.copy() - df2 = DataFrame({'col1': [1, 2, 3, 4, 5], - 'col_conflict': [1, 2, 3, 4, 5], - 'col_right': [2, 2, 2, 2, 2]}) + df2 = DataFrame( + { + "col1": [1, 2, 3, 4, 5], + "col_conflict": [1, 2, 3, 4, 5], + "col_right": [2, 2, 2, 2, 2], + } + ) df2_copy = df2.copy() - df_result = DataFrame({ - 'col1': [0, 1, 2, 3, 4, 5], - 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan], - 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan], - 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5], - 'col_right': [np.nan, 2, 2, 2, 2, 2]}) - df_result['_merge'] = Categorical( - ['left_only', 'both', 'right_only', - 'right_only', 'right_only', 'right_only'], - categories=['left_only', 'right_only', 'both']) - - df_result = df_result[['col1', 'col_conflict_x', 'col_left', - 'col_conflict_y', 'col_right', '_merge']] - - test = merge(df1, df2, on='col1', how='outer', indicator=True) + df_result = DataFrame( + { + "col1": [0, 1, 2, 3, 4, 5], + "col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan], + "col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan], + "col_conflict_y": [np.nan, 1, 2, 3, 4, 5], + "col_right": [np.nan, 2, 2, 2, 2, 2], + } + ) + df_result["_merge"] = Categorical( + [ + "left_only", + "both", + "right_only", + "right_only", + "right_only", + "right_only", + ], + categories=["left_only", "right_only", "both"], + ) + + df_result = df_result[ + [ + "col1", + "col_conflict_x", + "col_left", + "col_conflict_y", + "col_right", + "_merge", + ] + ] + + test = merge(df1, df2, on="col1", how="outer", indicator=True) assert_frame_equal(test, df_result) - test = df1.merge(df2, on='col1', how='outer', indicator=True) + test = df1.merge(df2, on="col1", how="outer", indicator=True) assert_frame_equal(test, df_result) # No side effects @@ -814,258 +968,337 @@ def test_indicator(self): # Check with custom name df_result_custom_name = df_result df_result_custom_name = df_result_custom_name.rename( - columns={'_merge': 'custom_name'}) + columns={"_merge": "custom_name"} + ) test_custom_name = merge( - df1, df2, on='col1', how='outer', indicator='custom_name') + df1, df2, on="col1", how="outer", indicator="custom_name" + ) assert_frame_equal(test_custom_name, df_result_custom_name) test_custom_name = df1.merge( - df2, on='col1', how='outer', indicator='custom_name') + df2, on="col1", how="outer", indicator="custom_name" + ) assert_frame_equal(test_custom_name, df_result_custom_name) # Check only accepts strings and booleans msg = "indicator option can only accept boolean or string arguments" with pytest.raises(ValueError, match=msg): - merge(df1, df2, on='col1', how='outer', indicator=5) + merge(df1, df2, on="col1", how="outer", indicator=5) with pytest.raises(ValueError, match=msg): - df1.merge(df2, on='col1', how='outer', indicator=5) + df1.merge(df2, on="col1", how="outer", indicator=5) # Check result integrity - test2 = merge(df1, df2, on='col1', how='left', indicator=True) - assert (test2._merge != 'right_only').all() - test2 = df1.merge(df2, on='col1', how='left', indicator=True) - assert (test2._merge != 'right_only').all() + test2 = merge(df1, df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() + test2 = df1.merge(df2, on="col1", how="left", indicator=True) + assert (test2._merge != "right_only").all() - test3 = merge(df1, df2, on='col1', how='right', indicator=True) - assert 
(test3._merge != 'left_only').all() - test3 = df1.merge(df2, on='col1', how='right', indicator=True) - assert (test3._merge != 'left_only').all() + test3 = merge(df1, df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() + test3 = df1.merge(df2, on="col1", how="right", indicator=True) + assert (test3._merge != "left_only").all() - test4 = merge(df1, df2, on='col1', how='inner', indicator=True) - assert (test4._merge == 'both').all() - test4 = df1.merge(df2, on='col1', how='inner', indicator=True) - assert (test4._merge == 'both').all() + test4 = merge(df1, df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() + test4 = df1.merge(df2, on="col1", how="inner", indicator=True) + assert (test4._merge == "both").all() # Check if working name in df - for i in ['_right_indicator', '_left_indicator', '_merge']: - df_badcolumn = DataFrame({'col1': [1, 2], i: [2, 2]}) - - msg = ("Cannot use `indicator=True` option when data contains a" - " column named {}|" - "Cannot use name of an existing column for indicator" - " column").format(i) + for i in ["_right_indicator", "_left_indicator", "_merge"]: + df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) + + msg = ( + "Cannot use `indicator=True` option when data contains a" + " column named {}|" + "Cannot use name of an existing column for indicator" + " column" + ).format(i) with pytest.raises(ValueError, match=msg): - merge(df1, df_badcolumn, on='col1', - how='outer', indicator=True) + merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) with pytest.raises(ValueError, match=msg): - df1.merge(df_badcolumn, on='col1', how='outer', indicator=True) + df1.merge(df_badcolumn, on="col1", how="outer", indicator=True) # Check for name conflict with custom name - df_badcolumn = DataFrame( - {'col1': [1, 2], 'custom_column_name': [2, 2]}) + df_badcolumn = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]}) msg = "Cannot use name of an existing column for indicator column" with pytest.raises(ValueError, match=msg): - merge(df1, df_badcolumn, on='col1', how='outer', - indicator='custom_column_name') + merge( + df1, + df_badcolumn, + on="col1", + how="outer", + indicator="custom_column_name", + ) with pytest.raises(ValueError, match=msg): - df1.merge(df_badcolumn, on='col1', how='outer', - indicator='custom_column_name') + df1.merge( + df_badcolumn, on="col1", how="outer", indicator="custom_column_name" + ) # Merge on multiple columns - df3 = DataFrame({'col1': [0, 1], 'col2': ['a', 'b']}) + df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]}) - df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']}) + df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]}) - hand_coded_result = DataFrame({'col1': [0, 1, 1, 3], - 'col2': ['a', 'b', 'x', 'y']}) - hand_coded_result['_merge'] = Categorical( - ['left_only', 'both', 'right_only', 'right_only'], - categories=['left_only', 'right_only', 'both']) + hand_coded_result = DataFrame( + {"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]} + ) + hand_coded_result["_merge"] = Categorical( + ["left_only", "both", "right_only", "right_only"], + categories=["left_only", "right_only", "both"], + ) - test5 = merge(df3, df4, on=['col1', 'col2'], - how='outer', indicator=True) + test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True) assert_frame_equal(test5, hand_coded_result) - test5 = df3.merge(df4, on=['col1', 'col2'], - how='outer', indicator=True) + test5 = df3.merge(df4, on=["col1", "col2"], how="outer", 
indicator=True) assert_frame_equal(test5, hand_coded_result) def test_validation(self): - left = DataFrame({'a': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse']}, - index=range(4)) + left = DataFrame( + {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]}, + index=range(4), + ) - right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay', 'chirp']}, - index=range(5)) + right = DataFrame( + { + "a": ["a", "b", "c", "d", "e"], + "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"], + }, + index=range(5), + ) # Make sure no side effects. left_copy = left.copy() right_copy = right.copy() - result = merge(left, right, left_index=True, right_index=True, - validate='1:1') + result = merge(left, right, left_index=True, right_index=True, validate="1:1") assert_frame_equal(left, left_copy) assert_frame_equal(right, right_copy) # make sure merge still correct - expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse'], - 'a_y': ['a', 'b', 'c', 'd'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - index=range(4), - columns=['a_x', 'b', 'a_y', 'c']) - - result = merge(left, right, left_index=True, right_index=True, - validate='one_to_one') + expected = DataFrame( + { + "a_x": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "a_y": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + columns=["a_x", "b", "a_y", "c"], + ) + + result = merge( + left, right, left_index=True, right_index=True, validate="one_to_one" + ) assert_frame_equal(result, expected) - expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'], - 'b': ['cat', 'dog', 'weasel', 'horse'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - index=range(4)) + expected_2 = DataFrame( + { + "a": ["a", "b", "c", "d"], + "b": ["cat", "dog", "weasel", "horse"], + "c": ["meow", "bark", "um... weasel noise?", "nay"], + }, + index=range(4), + ) - result = merge(left, right, on='a', validate='1:1') + result = merge(left, right, on="a", validate="1:1") assert_frame_equal(left, left_copy) assert_frame_equal(right, right_copy) assert_frame_equal(result, expected_2) - result = merge(left, right, on='a', validate='one_to_one') + result = merge(left, right, on="a", validate="one_to_one") assert_frame_equal(result, expected_2) # One index, one column - expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'], - 'a': ['a', 'b', 'c', 'd'], - 'c': ['meow', 'bark', 'um... weasel noise?', - 'nay']}, - columns=['b', 'a', 'c'], - index=range(4)) - - left_index_reset = left.set_index('a') - result = merge(left_index_reset, right, left_index=True, - right_on='a', validate='one_to_one') + expected_3 = DataFrame( + { + "b": ["cat", "dog", "weasel", "horse"], + "a": ["a", "b", "c", "d"], + "c": ["meow", "bark", "um... 
weasel noise?", "nay"], + }, + columns=["b", "a", "c"], + index=range(4), + ) + + left_index_reset = left.set_index("a") + result = merge( + left_index_reset, + right, + left_index=True, + right_on="a", + validate="one_to_one", + ) assert_frame_equal(result, expected_3) # Dups on right - right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']}, - index=[4])) - merge(left, right_w_dups, left_index=True, right_index=True, - validate='one_to_many') + right_w_dups = right.append(pd.DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])) + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_many", + ) - msg = ("Merge keys are not unique in right dataset; not a one-to-one" - " merge") + msg = "Merge keys are not unique in right dataset; not a one-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left, right_w_dups, left_index=True, right_index=True, - validate='one_to_one') + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="one_to_one", + ) with pytest.raises(MergeError, match=msg): - merge(left, right_w_dups, on='a', validate='one_to_one') + merge(left, right_w_dups, on="a", validate="one_to_one") # Dups on left - left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, - index=[3]), sort=True) - merge(left_w_dups, right, left_index=True, right_index=True, - validate='many_to_one') + left_w_dups = left.append( + pd.DataFrame({"a": ["a"], "c": ["cow"]}, index=[3]), sort=True + ) + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="many_to_one", + ) - msg = ("Merge keys are not unique in left dataset; not a one-to-one" - " merge") + msg = "Merge keys are not unique in left dataset; not a one-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right, left_index=True, right_index=True, - validate='one_to_one') + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="one_to_one", + ) with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right, on='a', validate='one_to_one') + merge(left_w_dups, right, on="a", validate="one_to_one") # Dups on both - merge(left_w_dups, right_w_dups, on='a', validate='many_to_many') + merge(left_w_dups, right_w_dups, on="a", validate="many_to_many") - msg = ("Merge keys are not unique in right dataset; not a many-to-one" - " merge") + msg = "Merge keys are not unique in right dataset; not a many-to-one" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right_w_dups, left_index=True, - right_index=True, validate='many_to_one') - - msg = ("Merge keys are not unique in left dataset; not a one-to-many" - " merge") + merge( + left_w_dups, + right_w_dups, + left_index=True, + right_index=True, + validate="many_to_one", + ) + + msg = "Merge keys are not unique in left dataset; not a one-to-many" " merge" with pytest.raises(MergeError, match=msg): - merge(left_w_dups, right_w_dups, on='a', - validate='one_to_many') + merge(left_w_dups, right_w_dups, on="a", validate="one_to_many") # Check invalid arguments msg = "Not a valid argument for validate" with pytest.raises(ValueError, match=msg): - merge(left, right, on='a', validate='jibberish') + merge(left, right, on="a", validate="jibberish") # Two column merge, dups in both, but jointly no dups. 
- left = DataFrame({'a': ['a', 'a', 'b', 'b'], - 'b': [0, 1, 0, 1], - 'c': ['cat', 'dog', 'weasel', 'horse']}, - index=range(4)) - - right = DataFrame({'a': ['a', 'a', 'b'], - 'b': [0, 1, 0], - 'd': ['meow', 'bark', 'um... weasel noise?']}, - index=range(3)) - - expected_multi = DataFrame({'a': ['a', 'a', 'b'], - 'b': [0, 1, 0], - 'c': ['cat', 'dog', 'weasel'], - 'd': ['meow', 'bark', - 'um... weasel noise?']}, - index=range(3)) - - msg = ("Merge keys are not unique in either left or right dataset;" - " not a one-to-one merge") + left = DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": [0, 1, 0, 1], + "c": ["cat", "dog", "weasel", "horse"], + }, + index=range(4), + ) + + right = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ) + + expected_multi = DataFrame( + { + "a": ["a", "a", "b"], + "b": [0, 1, 0], + "c": ["cat", "dog", "weasel"], + "d": ["meow", "bark", "um... weasel noise?"], + }, + index=range(3), + ) + + msg = ( + "Merge keys are not unique in either left or right dataset;" + " not a one-to-one merge" + ) with pytest.raises(MergeError, match=msg): - merge(left, right, on='a', validate='1:1') + merge(left, right, on="a", validate="1:1") - result = merge(left, right, on=['a', 'b'], validate='1:1') + result = merge(left, right, on=["a", "b"], validate="1:1") assert_frame_equal(result, expected_multi) def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 - a = pd.DataFrame({'a': [], 'b': [], 'c': []}) - with np.errstate(divide='raise'): - merge(a, a, on=('a', 'b')) + a = pd.DataFrame({"a": [], "b": [], "c": []}) + with np.errstate(divide="raise"): + merge(a, a, on=("a", "b")) - @pytest.mark.parametrize('how', ['right', 'outer']) + @pytest.mark.parametrize("how", ["right", "outer"]) @pytest.mark.parametrize( - 'index,expected_index', - [(CategoricalIndex([1, 2, 4]), - CategoricalIndex([1, 2, 4, None, None, None])), - (DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03']), - DatetimeIndex(['2001-01-01', '2002-02-02', '2003-03-03', - pd.NaT, pd.NaT, pd.NaT])), - (Float64Index([1, 2, 3]), - Float64Index([1, 2, 3, None, None, None])), - (Int64Index([1, 2, 3]), - Float64Index([1, 2, 3, None, None, None])), - (IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), - IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4), - np.nan, np.nan, np.nan])), - (PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03'], freq='D'), - PeriodIndex(['2001-01-01', '2001-01-02', '2001-01-03', - pd.NaT, pd.NaT, pd.NaT], freq='D')), - (TimedeltaIndex(['1d', '2d', '3d']), - TimedeltaIndex(['1d', '2d', '3d', pd.NaT, pd.NaT, pd.NaT]))]) + "index,expected_index", + [ + ( + CategoricalIndex([1, 2, 4]), + CategoricalIndex([1, 2, 4, None, None, None]), + ), + ( + DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]), + DatetimeIndex( + ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT] + ), + ), + (Float64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + (Int64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])), + ( + IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]), + IntervalIndex.from_tuples( + [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan] + ), + ), + ( + PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"), + PeriodIndex( + ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT], + freq="D", + ), + ), + ( + TimedeltaIndex(["1d", "2d", "3d"]), + TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]), + ), + ], + ) def 
test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}, index=index) - df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) - result = df1.merge(df2, left_on='key', right_index=True, how=how) - expected = pd.DataFrame([[1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5]], - columns=['a', 'key', 'b']) + df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index) + df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on="key", right_index=True, how=how) + expected = pd.DataFrame( + [ + [1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5], + ], + columns=["a", "key", "b"], + ) expected.set_index(expected_index, inplace=True) assert_frame_equal(result, expected) @@ -1073,294 +1306,313 @@ def test_merge_right_index_right(self): # Note: the expected output here is probably incorrect. # See https://github.com/pandas-dev/pandas/issues/17257 for more. # We include this as a regression test for GH-24897. - left = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 1, 1]}) - right = pd.DataFrame({'b': [1, 2, 3]}) - - expected = pd.DataFrame({'a': [1, 2, 3, None], - 'key': [0, 1, 1, 2], - 'b': [1, 2, 2, 3]}, - columns=['a', 'key', 'b'], - index=[0, 1, 2, np.nan]) - result = left.merge(right, left_on='key', right_index=True, - how='right') + left = pd.DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]}) + right = pd.DataFrame({"b": [1, 2, 3]}) + + expected = pd.DataFrame( + {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]}, + columns=["a", "key", "b"], + index=[0, 1, 2, np.nan], + ) + result = left.merge(right, left_on="key", right_index=True, how="right") tm.assert_frame_equal(result, expected) def test_merge_take_missing_values_from_index_of_other_dtype(self): # GH 24212 - left = pd.DataFrame({'a': [1, 2, 3], - 'key': pd.Categorical(['a', 'a', 'b'], - categories=list('abc'))}) - right = pd.DataFrame({'b': [1, 2, 3]}, - index=pd.CategoricalIndex(['a', 'b', 'c'])) - result = left.merge(right, left_on='key', - right_index=True, how='right') - expected = pd.DataFrame({'a': [1, 2, 3, None], - 'key': pd.Categorical(['a', 'a', 'b', 'c']), - 'b': [1, 1, 2, 3]}, - index=[0, 1, 2, np.nan]) - expected = expected.reindex(columns=['a', 'key', 'b']) + left = pd.DataFrame( + { + "a": [1, 2, 3], + "key": pd.Categorical(["a", "a", "b"], categories=list("abc")), + } + ) + right = pd.DataFrame( + {"b": [1, 2, 3]}, index=pd.CategoricalIndex(["a", "b", "c"]) + ) + result = left.merge(right, left_on="key", right_index=True, how="right") + expected = pd.DataFrame( + { + "a": [1, 2, 3, None], + "key": pd.Categorical(["a", "a", "b", "c"]), + "b": [1, 1, 2, 3], + }, + index=[0, 1, 2, np.nan], + ) + expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) def _check_merge(x, y): - for how in ['inner', 'left', 'outer']: + for how in ["inner", "left", "outer"]: result = x.join(y, how=how) - expected = merge(x.reset_index(), y.reset_index(), how=how, - sort=True) - expected = expected.set_index('index') + expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True) + expected = expected.set_index("index") # TODO check_names on merge? 
assert_frame_equal(result, expected, check_names=False) class TestMergeDtypes: - - @pytest.mark.parametrize('right_vals', [ - ['foo', 'bar'], - Series(['foo', 'bar']).astype('category'), - ]) + @pytest.mark.parametrize( + "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")] + ) def test_different(self, right_vals): - left = DataFrame({'A': ['foo', 'bar'], - 'B': Series(['foo', 'bar']).astype('category'), - 'C': [1, 2], - 'D': [1.0, 2.0], - 'E': Series([1, 2], dtype='uint64'), - 'F': Series([1, 2], dtype='int32')}) - right = DataFrame({'A': right_vals}) + left = DataFrame( + { + "A": ["foo", "bar"], + "B": Series(["foo", "bar"]).astype("category"), + "C": [1, 2], + "D": [1.0, 2.0], + "E": Series([1, 2], dtype="uint64"), + "F": Series([1, 2], dtype="int32"), + } + ) + right = DataFrame({"A": right_vals}) # GH 9780 # We allow merging on object and categorical cols and cast # categorical cols to object - result = pd.merge(left, right, on='A') + result = pd.merge(left, right, on="A") assert is_object_dtype(result.A.dtype) - @pytest.mark.parametrize('d1', [np.int64, np.int32, - np.int16, np.int8, np.uint8]) - @pytest.mark.parametrize('d2', [np.int64, np.float64, - np.float32, np.float16]) + @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8]) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes(self, d1, d2): dtype1 = np.dtype(d1) dtype2 = np.dtype(d2) - left = DataFrame({'k1': np.array([0, 1, 2] * 8, dtype=dtype1), - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) + left = DataFrame( + { + "k1": np.array([0, 1, 2] * 8, dtype=dtype1), + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': np.array([5, 7], dtype=dtype2)}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index) - result = left.join(right, on=['k1', 'k2']) + result = left.join(right, on=["k1", "k2"]) expected = left.copy() - if dtype2.kind == 'i': - dtype2 = np.dtype('float64') - expected['v2'] = np.array(np.nan, dtype=dtype2) - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + if dtype2.kind == "i": + dtype2 = np.dtype("float64") + expected["v2"] = np.array(np.nan, dtype=dtype2) + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result = left.join(right, on=['k1', 'k2'], sort=True) - expected.sort_values(['k1', 'k2'], kind='mergesort', inplace=True) + result = left.join(right, on=["k1", "k2"], sort=True) + expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('int_vals, float_vals, exp_vals', [ - ([1, 2, 3], [1.0, 2.0, 3.0], {'X': [1, 2, 3], 'Y': [1.0, 2.0, 3.0]}), - ([1, 2, 3], [1.0, 3.0], {'X': [1, 3], 'Y': [1.0, 3.0]}), - ([1, 2], [1.0, 2.0, 3.0], {'X': [1, 2], 'Y': [1.0, 2.0]}), - ]) + @pytest.mark.parametrize( + "int_vals, float_vals, exp_vals", + [ + ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}), + ([1, 2, 3], [1.0, 3.0], {"X": [1, 3], "Y": [1.0, 3.0]}), + ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}), + ], + ) def test_merge_on_ints_floats(self, int_vals, 
float_vals, exp_vals): # GH 16572 # Check that float column is not cast to object if # merging on float and int columns - A = DataFrame({'X': int_vals}) - B = DataFrame({'Y': float_vals}) + A = DataFrame({"X": int_vals}) + B = DataFrame({"Y": float_vals}) expected = DataFrame(exp_vals) - result = A.merge(B, left_on='X', right_on='Y') + result = A.merge(B, left_on="X", right_on="Y") assert_frame_equal(result, expected) - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) def test_merge_on_ints_floats_warning(self): # GH 16572 # merge will produce a warning when merging on int and # float columns where the float values are not exactly # equal to their int representation - A = DataFrame({'X': [1, 2, 3]}) - B = DataFrame({'Y': [1.1, 2.5, 3.0]}) - expected = DataFrame({'X': [3], 'Y': [3.0]}) + A = DataFrame({"X": [1, 2, 3]}) + B = DataFrame({"Y": [1.1, 2.5, 3.0]}) + expected = DataFrame({"X": [3], "Y": [3.0]}) with tm.assert_produces_warning(UserWarning): - result = A.merge(B, left_on='X', right_on='Y') + result = A.merge(B, left_on="X", right_on="Y") assert_frame_equal(result, expected) with tm.assert_produces_warning(UserWarning): - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) # test no warning if float has NaNs - B = DataFrame({'Y': [np.nan, np.nan, 3.0]}) + B = DataFrame({"Y": [np.nan, np.nan, 3.0]}) with tm.assert_produces_warning(None): - result = B.merge(A, left_on='Y', right_on='X') - assert_frame_equal(result, expected[['Y', 'X']]) + result = B.merge(A, left_on="Y", right_on="X") + assert_frame_equal(result, expected[["Y", "X"]]) def test_merge_incompat_infer_boolean_object(self): # GH21119: bool + object bool merge OK - df1 = DataFrame({'key': Series([True, False], dtype=object)}) - df2 = DataFrame({'key': [True, False]}) + df1 = DataFrame({"key": Series([True, False], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) - expected = DataFrame({'key': [True, False]}, dtype=object) - result = pd.merge(df1, df2, on='key') + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") assert_frame_equal(result, expected) - result = pd.merge(df2, df1, on='key') + result = pd.merge(df2, df1, on="key") assert_frame_equal(result, expected) # with missing value - df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) - df2 = DataFrame({'key': [True, False]}) + df1 = DataFrame({"key": Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({"key": [True, False]}) - expected = DataFrame({'key': [True, False]}, dtype=object) - result = pd.merge(df1, df2, on='key') + expected = DataFrame({"key": [True, False]}, dtype=object) + result = pd.merge(df1, df2, on="key") assert_frame_equal(result, expected) - result = pd.merge(df2, df1, on='key') + result = pd.merge(df2, df1, on="key") assert_frame_equal(result, expected) - @pytest.mark.parametrize('df1_vals, df2_vals', [ - - # merge on category coerces to object - ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), - ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), - - # no not infer - ([0, 1], pd.Series([False, True], dtype=object)), - ([0, 1], pd.Series([False, True], dtype=bool)), - ]) + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # merge on category coerces 
to object + ([0, 1, 2], Series(["a", "b", "a"]).astype("category")), + ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")), + # no not infer + ([0, 1], pd.Series([False, True], dtype=object)), + ([0, 1], pd.Series([False, True], dtype=bool)), + ], + ) def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): # these are explicitly allowed incompat merges, that pass thru # the result type is dependent on if the values on the rhs are # inferred, otherwise these will be coerced to object - df1 = DataFrame({'A': df1_vals}) - df2 = DataFrame({'A': df2_vals}) + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) - result = pd.merge(df1, df2, on=['A']) + result = pd.merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) - result = pd.merge(df2, df1, on=['A']) + result = pd.merge(df2, df1, on=["A"]) assert is_object_dtype(result.A.dtype) - @pytest.mark.parametrize('df1_vals, df2_vals', [ - # do not infer to numeric - - (Series([1, 2], dtype='uint64'), ["a", "b", "c"]), - (Series([1, 2], dtype='int32'), ["a", "b", "c"]), - ([0, 1, 2], ["0", "1", "2"]), - ([0.0, 1.0, 2.0], ["0", "1", "2"]), - ([0, 1, 2], ["0", "1", "2"]), - (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01', - '2011-01-02']), - (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]), - (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]), - (pd.date_range('20130101', periods=3), - pd.date_range('20130101', periods=3, tz='US/Eastern')), - ]) + @pytest.mark.parametrize( + "df1_vals, df2_vals", + [ + # do not infer to numeric + (Series([1, 2], dtype="uint64"), ["a", "b", "c"]), + (Series([1, 2], dtype="int32"), ["a", "b", "c"]), + ([0, 1, 2], ["0", "1", "2"]), + ([0.0, 1.0, 2.0], ["0", "1", "2"]), + ([0, 1, 2], ["0", "1", "2"]), + ( + pd.date_range("1/1/2011", periods=2, freq="D"), + ["2011-01-01", "2011-01-02"], + ), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]), + (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]), + ( + pd.date_range("20130101", periods=3), + pd.date_range("20130101", periods=3, tz="US/Eastern"), + ), + ], + ) def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): # GH 9780, GH 15800 # Raise a ValueError when a user tries to merge on # dtypes that are incompatible (e.g., obj and int/float) - df1 = DataFrame({'A': df1_vals}) - df2 = DataFrame({'A': df2_vals}) + df1 = DataFrame({"A": df1_vals}) + df2 = DataFrame({"A": df2_vals}) - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=df1['A'].dtype, - rk_dtype=df2['A'].dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df1["A"].dtype, rk_dtype=df2["A"].dtype + ) + ) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): - pd.merge(df1, df2, on=['A']) + pd.merge(df1, df2, on=["A"]) # Check that error still raised when swapping order of dataframes - msg = ("You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=df2['A'].dtype, - rk_dtype=df1['A'].dtype)) + msg = ( + "You are trying to merge on {lk_dtype} and " + "{rk_dtype} columns. 
If you wish to proceed " + "you should use pd.concat".format( + lk_dtype=df2["A"].dtype, rk_dtype=df1["A"].dtype + ) + ) msg = re.escape(msg) with pytest.raises(ValueError, match=msg): - pd.merge(df2, df1, on=['A']) + pd.merge(df2, df1, on=["A"]) @pytest.fixture def left(): np.random.seed(1234) return DataFrame( - {'X': Series(np.random.choice( - ['foo', 'bar'], - size=(10,))).astype(CDT(['foo', 'bar'])), - 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) + { + "X": Series(np.random.choice(["foo", "bar"], size=(10,))).astype( + CDT(["foo", "bar"]) + ), + "Y": np.random.choice(["one", "two", "three"], size=(10,)), + } + ) @pytest.fixture def right(): np.random.seed(1234) return DataFrame( - {'X': Series(['foo', 'bar']).astype(CDT(['foo', 'bar'])), - 'Z': [1, 2]}) + {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]} + ) class TestMergeCategorical: - def test_identical(self, left): # merging on the same, should preserve dtypes - merged = pd.merge(left, left, on='X') + merged = pd.merge(left, left, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - np.dtype('O')], - index=['X', 'Y_x', 'Y_y']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("O")], + index=["X", "Y_x", "Y_y"], + ) assert_series_equal(result, expected) def test_basic(self, left, right): # we have matching Categorical dtypes in X # so should preserve the merged column - merged = pd.merge(left, right, on='X') + merged = pd.merge(left, right, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - np.dtype('int64')], - index=['X', 'Y', 'Z']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), np.dtype("int64")], + index=["X", "Y", "Z"], + ) assert_series_equal(result, expected) def test_merge_categorical(self): # GH 9426 - right = DataFrame({'c': {0: 'a', - 1: 'b', - 2: 'c', - 3: 'd', - 4: 'e'}, - 'd': {0: 'null', - 1: 'null', - 2: 'null', - 3: 'null', - 4: 'null'}}) - left = DataFrame({'a': {0: 'f', - 1: 'f', - 2: 'f', - 3: 'f', - 4: 'f'}, - 'b': {0: 'g', - 1: 'g', - 2: 'g', - 3: 'g', - 4: 'g'}}) - df = pd.merge(left, right, how='left', left_on='b', right_on='c') + right = DataFrame( + { + "c": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}, + "d": {0: "null", 1: "null", 2: "null", 3: "null", 4: "null"}, + } + ) + left = DataFrame( + { + "a": {0: "f", 1: "f", 2: "f", 3: "f", 4: "f"}, + "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"}, + } + ) + df = pd.merge(left, right, how="left", left_on="b", right_on="c") # object-object expected = df.copy() @@ -1369,54 +1621,60 @@ def test_merge_categorical(self): # note that we propagate the category # because we don't have any matching rows cright = right.copy() - cright['d'] = cright['d'].astype('category') - result = pd.merge(left, cright, how='left', left_on='b', right_on='c') - expected['d'] = expected['d'].astype(CategoricalDtype(['null'])) + cright["d"] = cright["d"].astype("category") + result = pd.merge(left, cright, how="left", left_on="b", right_on="c") + expected["d"] = expected["d"].astype(CategoricalDtype(["null"])) tm.assert_frame_equal(result, expected) # cat-object cleft = left.copy() - cleft['b'] = cleft['b'].astype('category') - result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") tm.assert_frame_equal(result, expected) # cat-cat cright = right.copy() - cright['d'] = 
cright['d'].astype('category') + cright["d"] = cright["d"].astype("category") cleft = left.copy() - cleft['b'] = cleft['b'].astype('category') - result = pd.merge(cleft, cright, how='left', left_on='b', right_on='c') + cleft["b"] = cleft["b"].astype("category") + result = pd.merge(cleft, cright, how="left", left_on="b", right_on="c") tm.assert_frame_equal(result, expected) def tests_merge_categorical_unordered_equal(self): # GH-19551 - df1 = DataFrame({ - 'Foo': Categorical(['A', 'B', 'C'], categories=['A', 'B', 'C']), - 'Left': ['A0', 'B0', 'C0'], - }) - - df2 = DataFrame({ - 'Foo': Categorical(['C', 'B', 'A'], categories=['C', 'B', 'A']), - 'Right': ['C1', 'B1', 'A1'], - }) - result = pd.merge(df1, df2, on=['Foo']) - expected = DataFrame({ - 'Foo': pd.Categorical(['A', 'B', 'C']), - 'Left': ['A0', 'B0', 'C0'], - 'Right': ['A1', 'B1', 'C1'], - }) + df1 = DataFrame( + { + "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + } + ) + + df2 = DataFrame( + { + "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]), + "Right": ["C1", "B1", "A1"], + } + ) + result = pd.merge(df1, df2, on=["Foo"]) + expected = DataFrame( + { + "Foo": pd.Categorical(["A", "B", "C"]), + "Left": ["A0", "B0", "C0"], + "Right": ["A1", "B1", "C1"], + } + ) assert_frame_equal(result, expected) def test_other_columns(self, left, right): # non-merge columns should preserve if possible - right = right.assign(Z=right.Z.astype('category')) + right = right.assign(Z=right.Z.astype("category")) - merged = pd.merge(left, right, on='X') + merged = pd.merge(left, right, on="X") result = merged.dtypes.sort_index() - expected = Series([CategoricalDtype(), - np.dtype('O'), - CategoricalDtype()], - index=['X', 'Y', 'Z']) + expected = Series( + [CategoricalDtype(), np.dtype("O"), CategoricalDtype()], + index=["X", "Y", "Z"], + ) assert_series_equal(result, expected) # categories are preserved @@ -1424,44 +1682,64 @@ def test_other_columns(self, left, right): assert right.Z.values.is_dtype_equal(merged.Z.values) @pytest.mark.parametrize( - 'change', [lambda x: x, - lambda x: x.astype(CDT(['foo', 'bar', 'bah'])), - lambda x: x.astype(CDT(ordered=True))]) + "change", + [ + lambda x: x, + lambda x: x.astype(CDT(["foo", "bar", "bah"])), + lambda x: x.astype(CDT(ordered=True)), + ], + ) def test_dtype_on_merged_different(self, change, join_type, left, right): # our merging columns, X now has 2 different dtypes # so we must be object as a result - X = change(right.X.astype('object')) + X = change(right.X.astype("object")) right = right.assign(X=X) assert is_categorical_dtype(left.X.values) # assert not left.X.values.is_dtype_equal(right.X.values) - merged = pd.merge(left, right, on='X', how=join_type) + merged = pd.merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series([np.dtype('O'), - np.dtype('O'), - np.dtype('int64')], - index=['X', 'Y', 'Z']) + expected = Series( + [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] + ) assert_series_equal(result, expected) def test_self_join_multiple_categories(self): # GH 16767 # non-duplicates should work with multiple categories m = 5 - df = pd.DataFrame({ - 'a': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] * m, - 'b': ['t', 'w', 'x', 'y', 'z'] * 2 * m, - 'c': [letter - for each in ['m', 'n', 'u', 'p', 'o'] - for letter in [each] * 2 * m], - 'd': [letter - for each in ['aa', 'bb', 'cc', 'dd', 'ee', - 'ff', 'gg', 'hh', 'ii', 'jj'] - for letter in [each] * m]}) + df = pd.DataFrame( 
+ { + "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m, + "b": ["t", "w", "x", "y", "z"] * 2 * m, + "c": [ + letter + for each in ["m", "n", "u", "p", "o"] + for letter in [each] * 2 * m + ], + "d": [ + letter + for each in [ + "aa", + "bb", + "cc", + "dd", + "ee", + "ff", + "gg", + "hh", + "ii", + "jj", + ] + for letter in [each] * m + ], + } + ) # change them all to categorical variables - df = df.apply(lambda x: x.astype('category')) + df = df.apply(lambda x: x.astype("category")) # self-join should equal ourselves result = pd.merge(df, df, on=list(df.columns)) @@ -1473,192 +1751,237 @@ def test_dtype_on_categorical_dates(self): # dates should not be coerced to ints df = pd.DataFrame( - [[date(2001, 1, 1), 1.1], - [date(2001, 1, 2), 1.3]], - columns=['date', 'num2'] + [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"] ) - df['date'] = df['date'].astype('category') + df["date"] = df["date"].astype("category") df2 = pd.DataFrame( - [[date(2001, 1, 1), 1.3], - [date(2001, 1, 3), 1.4]], - columns=['date', 'num4'] + [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"] ) - df2['date'] = df2['date'].astype('category') - - expected_outer = pd.DataFrame([ - [pd.Timestamp('2001-01-01'), 1.1, 1.3], - [pd.Timestamp('2001-01-02'), 1.3, np.nan], - [pd.Timestamp('2001-01-03'), np.nan, 1.4]], - columns=['date', 'num2', 'num4'] + df2["date"] = df2["date"].astype("category") + + expected_outer = pd.DataFrame( + [ + [pd.Timestamp("2001-01-01"), 1.1, 1.3], + [pd.Timestamp("2001-01-02"), 1.3, np.nan], + [pd.Timestamp("2001-01-03"), np.nan, 1.4], + ], + columns=["date", "num2", "num4"], ) - result_outer = pd.merge(df, df2, how='outer', on=['date']) + result_outer = pd.merge(df, df2, how="outer", on=["date"]) assert_frame_equal(result_outer, expected_outer) expected_inner = pd.DataFrame( - [[pd.Timestamp('2001-01-01'), 1.1, 1.3]], - columns=['date', 'num2', 'num4'] + [[pd.Timestamp("2001-01-01"), 1.1, 1.3]], columns=["date", "num2", "num4"] ) - result_inner = pd.merge(df, df2, how='inner', on=['date']) + result_inner = pd.merge(df, df2, how="inner", on=["date"]) assert_frame_equal(result_inner, expected_inner) - @pytest.mark.parametrize('ordered', [True, False]) - @pytest.mark.parametrize('category_column,categories,expected_categories', - [([False, True, True, False], [True, False], - [True, False]), - ([2, 1, 1, 2], [1, 2], [1, 2]), - (['False', 'True', 'True', 'False'], - ['True', 'False'], ['True', 'False'])]) - def test_merging_with_bool_or_int_cateorical_column(self, category_column, - categories, - expected_categories, - ordered): + @pytest.mark.parametrize("ordered", [True, False]) + @pytest.mark.parametrize( + "category_column,categories,expected_categories", + [ + ([False, True, True, False], [True, False], [True, False]), + ([2, 1, 1, 2], [1, 2], [1, 2]), + (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]), + ], + ) + def test_merging_with_bool_or_int_cateorical_column( + self, category_column, categories, expected_categories, ordered + ): # GH 17187 # merging with a boolean/int categorical column - df1 = pd.DataFrame({'id': [1, 2, 3, 4], - 'cat': category_column}) - df1['cat'] = df1['cat'].astype(CDT(categories, ordered=ordered)) - df2 = pd.DataFrame({'id': [2, 4], 'num': [1, 9]}) + df1 = pd.DataFrame({"id": [1, 2, 3, 4], "cat": category_column}) + df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered)) + df2 = pd.DataFrame({"id": [2, 4], "num": [1, 9]}) result = df1.merge(df2) - expected = 
pd.DataFrame({'id': [2, 4], 'cat': expected_categories, - 'num': [1, 9]}) - expected['cat'] = expected['cat'].astype( - CDT(categories, ordered=ordered)) + expected = pd.DataFrame( + {"id": [2, 4], "cat": expected_categories, "num": [1, 9]} + ) + expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered)) assert_frame_equal(expected, result) def test_merge_on_int_array(self): # GH 23020 - df = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B': 1}) - result = pd.merge(df, df, on='A') - expected = pd.DataFrame({'A': pd.Series([1, 2, np.nan], dtype='Int64'), - 'B_x': 1, - 'B_y': 1}) + df = pd.DataFrame({"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B": 1}) + result = pd.merge(df, df, on="A") + expected = pd.DataFrame( + {"A": pd.Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1} + ) assert_frame_equal(result, expected) @pytest.fixture def left_df(): - return DataFrame({'a': [20, 10, 0]}, index=[2, 1, 0]) + return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0]) @pytest.fixture def right_df(): - return DataFrame({'b': [300, 100, 200]}, index=[3, 1, 2]) + return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2]) class TestMergeOnIndexes: - @pytest.mark.parametrize( "how, sort, expected", - [('inner', False, DataFrame({'a': [20, 10], - 'b': [200, 100]}, - index=[2, 1])), - ('inner', True, DataFrame({'a': [10, 20], - 'b': [100, 200]}, - index=[1, 2])), - ('left', False, DataFrame({'a': [20, 10, 0], - 'b': [200, 100, np.nan]}, - index=[2, 1, 0])), - ('left', True, DataFrame({'a': [0, 10, 20], - 'b': [np.nan, 100, 200]}, - index=[0, 1, 2])), - ('right', False, DataFrame({'a': [np.nan, 10, 20], - 'b': [300, 100, 200]}, - index=[3, 1, 2])), - ('right', True, DataFrame({'a': [10, 20, np.nan], - 'b': [100, 200, 300]}, - index=[1, 2, 3])), - ('outer', False, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3])), - ('outer', True, DataFrame({'a': [0, 10, 20, np.nan], - 'b': [np.nan, 100, 200, 300]}, - index=[0, 1, 2, 3]))]) + [ + ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])), + ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])), + ( + "left", + False, + DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]), + ), + ( + "left", + True, + DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]), + ), + ( + "right", + False, + DataFrame( + {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2] + ), + ), + ( + "right", + True, + DataFrame( + {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3] + ), + ), + ( + "outer", + False, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ( + "outer", + True, + DataFrame( + {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]}, + index=[0, 1, 2, 3], + ), + ), + ], + ) def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): - result = pd.merge(left_df, right_df, - left_index=True, - right_index=True, - how=how, - sort=sort) + result = pd.merge( + left_df, right_df, left_index=True, right_index=True, how=how, sort=sort + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - 'index', [ - CategoricalIndex(['A', 'B'], categories=['A', 'B'], name='index_col'), - Float64Index([1.0, 2.0], name='index_col'), - Int64Index([1, 2], name='index_col'), - UInt64Index([1, 2], name='index_col'), - RangeIndex(start=0, stop=2, name='index_col'), - DatetimeIndex(["2018-01-01", "2018-01-02"], name='index_col'), - ], 
ids=lambda x: type(x).__name__) + "index", + [ + CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"), + Float64Index([1.0, 2.0], name="index_col"), + Int64Index([1, 2], name="index_col"), + UInt64Index([1, 2], name="index_col"), + RangeIndex(start=0, stop=2, name="index_col"), + DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"), + ], + ids=lambda x: type(x).__name__, +) def test_merge_index_types(index): # gh-20777 # assert key access is consistent across index types left = DataFrame({"left_data": [1, 2]}, index=index) right = DataFrame({"right_data": [1.0, 2.0]}, index=index) - result = left.merge(right, on=['index_col']) + result = left.merge(right, on=["index_col"]) expected = DataFrame( - OrderedDict([('left_data', [1, 2]), ('right_data', [1.0, 2.0])]), - index=index) + OrderedDict([("left_data", [1, 2]), ("right_data", [1.0, 2.0])]), index=index + ) assert_frame_equal(result, expected) -@pytest.mark.parametrize("on,left_on,right_on,left_index,right_index,nm", [ - (['outer', 'inner'], None, None, False, False, 'B'), - (None, None, None, True, True, 'B'), - (None, ['outer', 'inner'], None, False, True, 'B'), - (None, None, ['outer', 'inner'], True, False, 'B'), - (['outer', 'inner'], None, None, False, False, None), - (None, None, None, True, True, None), - (None, ['outer', 'inner'], None, False, True, None), - (None, None, ['outer', 'inner'], True, False, None)]) +@pytest.mark.parametrize( + "on,left_on,right_on,left_index,right_index,nm", + [ + (["outer", "inner"], None, None, False, False, "B"), + (None, None, None, True, True, "B"), + (None, ["outer", "inner"], None, False, True, "B"), + (None, None, ["outer", "inner"], True, False, "B"), + (["outer", "inner"], None, None, False, False, None), + (None, None, None, True, True, None), + (None, ["outer", "inner"], None, False, True, None), + (None, None, ["outer", "inner"], True, False, None), + ], +) def test_merge_series(on, left_on, right_on, left_index, right_index, nm): # GH 21220 - a = pd.DataFrame({"A": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_product([['a', 'b'], [0, 1]], - names=['outer', 'inner'])) - b = pd.Series([1, 2, 3, 4], - index=pd.MultiIndex.from_product([['a', 'b'], [1, 2]], - names=['outer', 'inner']), name=nm) - expected = pd.DataFrame({"A": [2, 4], "B": [1, 3]}, - index=pd.MultiIndex.from_product([['a', 'b'], [1]], - names=['outer', 'inner'])) + a = pd.DataFrame( + {"A": [1, 2, 3, 4]}, + index=pd.MultiIndex.from_product( + [["a", "b"], [0, 1]], names=["outer", "inner"] + ), + ) + b = pd.Series( + [1, 2, 3, 4], + index=pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=["outer", "inner"] + ), + name=nm, + ) + expected = pd.DataFrame( + {"A": [2, 4], "B": [1, 3]}, + index=pd.MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]), + ) if nm is not None: - result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index) + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) tm.assert_frame_equal(result, expected) else: msg = "Cannot merge a Series without a name" with pytest.raises(ValueError, match=msg): - result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index) - - -@pytest.mark.parametrize("col1, col2, kwargs, expected_cols", [ - (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), - (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), - (0, 0, dict(suffixes=("_x", "_y")), 
["0_x", "0_y"]), - ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), - (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), - ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), - ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), - ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), - ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), - (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), - ("a", "a", dict(), ["a_x", "a_y"]), - (0, 0, dict(), ["0_x", "0_y"]) -]) + result = pd.merge( + a, + b, + on=on, + left_on=left_on, + right_on=right_on, + left_index=left_index, + right_index=right_index, + ) + + +@pytest.mark.parametrize( + "col1, col2, kwargs, expected_cols", + [ + (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), + (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), + (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), + (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), + ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), + ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), + ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), + ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), + (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(), ["a_x", "a_y"]), + (0, 0, dict(), ["0_x", "0_y"]), + ], +) def test_merge_suffix(col1, col2, kwargs, expected_cols): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [4, 5, 6]}) - expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], - columns=expected_cols) + expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols) result = a.merge(b, left_index=True, right_index=True, **kwargs) tm.assert_frame_equal(result, expected) @@ -1667,13 +1990,16 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("col1, col2, suffixes", [ - ("a", "a", [None, None]), - ("a", "a", (None, None)), - ("a", "a", ("", None)), - (0, 0, [None, None]), - (0, 0, (None, "")) -]) +@pytest.mark.parametrize( + "col1, col2, suffixes", + [ + ("a", "a", [None, None]), + ("a", "a", (None, None)), + ("a", "a", ("", None)), + (0, 0, [None, None]), + (0, 0, (None, "")), + ], +) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) @@ -1685,10 +2011,7 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize("col1, col2, suffixes", [ - ("a", "a", None), - (0, 0, None) -]) +@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) def test_merge_suffix_none_error(col1, col2, suffixes): # issue: 24782 a = pd.DataFrame({col1: [1, 2, 3]}) @@ -1709,10 +2032,9 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False), } - df1 = DataFrame({ - "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), - "left": [1, 2, 3], - }).set_index("foo") + df1 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), "left": [1, 2, 3]} + ).set_index("foo") data_foo = ["a", "b", "c"] data_right = [1, 2, 3] @@ -1721,18 +2043,19 @@ def test_merge_equal_cat_dtypes(cat_dtype, reverse): data_foo.reverse() data_right.reverse() - df2 = DataFrame({ - "foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), - "right": data_right - }).set_index("foo") + df2 = DataFrame( + {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right} + ).set_index("foo") result = 
df1.merge(df2, left_index=True, right_index=True) - expected = DataFrame({ - "left": [1, 2, 3], - "right": [1, 2, 3], - "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), - }).set_index("foo") + expected = DataFrame( + { + "left": [1, 2, 3], + "right": [1, 2, 3], + "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), + } + ).set_index("foo") # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) @@ -1743,23 +2066,19 @@ def test_merge_equal_cat_dtypes2(): cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False) # Test Data - df1 = DataFrame({ - "foo": Series(["a", "b"]).astype(cat_dtype), - "left": [1, 2], - }).set_index("foo") + df1 = DataFrame( + {"foo": Series(["a", "b"]).astype(cat_dtype), "left": [1, 2]} + ).set_index("foo") - df2 = DataFrame({ - "foo": Series(["a", "b", "c"]).astype(cat_dtype), - "right": [3, 2, 1], - }).set_index("foo") + df2 = DataFrame( + {"foo": Series(["a", "b", "c"]).astype(cat_dtype), "right": [3, 2, 1]} + ).set_index("foo") result = df1.merge(df2, left_index=True, right_index=True) - expected = DataFrame({ - "left": [1, 2], - "right": [3, 2], - "foo": Series(["a", "b"]).astype(cat_dtype), - }).set_index("foo") + expected = DataFrame( + {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)} + ).set_index("foo") # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 684fba5867c00..e2e17397464fe 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -9,143 +9,154 @@ class TestAsOfMerge: - def read_data(self, datapath, name, dedupe=False): - path = datapath('reshape', 'merge', 'data', name) + path = datapath("reshape", "merge", "data", name) x = read_csv(path) if dedupe: - x = (x.drop_duplicates(['time', 'ticker'], keep='last') - .reset_index(drop=True) - ) + x = x.drop_duplicates(["time", "ticker"], keep="last").reset_index( + drop=True + ) x.time = to_datetime(x.time) return x @pytest.fixture(autouse=True) def setup_method(self, datapath): - self.trades = self.read_data(datapath, 'trades.csv') - self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) - self.asof = self.read_data(datapath, 'asof.csv') - self.tolerance = self.read_data(datapath, 'tolerance.csv') - self.allow_exact_matches = self.read_data(datapath, - 'allow_exact_matches.csv') + self.trades = self.read_data(datapath, "trades.csv") + self.quotes = self.read_data(datapath, "quotes.csv", dedupe=True) + self.asof = self.read_data(datapath, "asof.csv") + self.tolerance = self.read_data(datapath, "tolerance.csv") + self.allow_exact_matches = self.read_data(datapath, "allow_exact_matches.csv") self.allow_exact_matches_and_tolerance = self.read_data( - datapath, 'allow_exact_matches_and_tolerance.csv') + datapath, "allow_exact_matches_and_tolerance.csv" + ) def test_examples1(self): """ doc-string examples """ - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 3, 7]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], 
"left_val": ["a", "b", "c"], "right_val": [1, 3, 7]} + ) - result = pd.merge_asof(left, right, on='a') + result = pd.merge_asof(left, right, on="a") assert_frame_equal(result, expected) def test_examples2(self): """ doc-string examples """ - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) - - pd.merge_asof(trades, quotes, - on='time', - by='ticker') - - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('2ms')) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.97, np.nan, - np.nan, np.nan], - 'ask': [np.nan, 51.98, np.nan, - np.nan, np.nan]}, - columns=['time', 'ticker', 'price', 'quantity', - 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": [ + "GOOG", + "MSFT", + "MSFT", + "MSFT", + "GOOG", + "AAPL", + "GOOG", + "MSFT", + ], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) + + pd.merge_asof(trades, quotes, on="time", by="ticker") + + pd.merge_asof( + trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.97, np.nan, np.nan, np.nan], + "ask": [np.nan, 51.98, np.nan, np.nan, 
np.nan], + }, + columns=["time", "ticker", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) assert_frame_equal(result, expected) def test_examples3(self): """ doc-string examples """ # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 6, np.nan]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, np.nan]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward') + result = pd.merge_asof(left, right, on="a", direction="forward") assert_frame_equal(result, expected) def test_examples4(self): """ doc-string examples """ # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 6, 7]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, 6, 7]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest') + result = pd.merge_asof(left, right, on="a", direction="nearest") assert_frame_equal(result, expected) def test_basic(self): @@ -154,36 +165,31 @@ def test_basic(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_categorical(self): expected = self.asof trades = self.trades.copy() - trades.ticker = trades.ticker.astype('category') + trades.ticker = trades.ticker.astype("category") quotes = self.quotes.copy() - quotes.ticker = quotes.ticker.astype('category') - expected.ticker = expected.ticker.astype('category') + quotes.ticker = quotes.ticker.astype("category") + expected.ticker = expected.ticker.astype("category") - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_left_index(self): # GH14253 expected = self.asof - trades = self.trades.set_index('time') + trades = self.trades.set_index("time") quotes = self.quotes - result = merge_asof(trades, quotes, - left_index=True, - right_on='time', - by='ticker') + result = merge_asof( + trades, quotes, left_index=True, right_on="time", by="ticker" + ) # left-only index uses right's index, oddly expected.index = result.index # time column appears after left's columns @@ -194,61 +200,53 @@ def test_basic_right_index(self): expected = self.asof trades = self.trades - quotes = self.quotes.set_index('time') + quotes = self.quotes.set_index("time") - result = merge_asof(trades, quotes, - left_on='time', - right_index=True, - by='ticker') + result = merge_asof( + trades, quotes, left_on="time", right_index=True, by="ticker" + ) assert_frame_equal(result, expected) def test_basic_left_index_right_index(self): - expected = 
self.asof.set_index('time') - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + expected = self.asof.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") - result = merge_asof(trades, quotes, - left_index=True, - right_index=True, - by='ticker') + result = merge_asof( + trades, quotes, left_index=True, right_index=True, by="ticker" + ) assert_frame_equal(result, expected) def test_multi_index(self): # MultiIndex is prohibited - trades = self.trades.set_index(['time', 'price']) - quotes = self.quotes.set_index('time') + trades = self.trades.set_index(["time", "price"]) + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_index=True, - right_index=True) + merge_asof(trades, quotes, left_index=True, right_index=True) - trades = self.trades.set_index('time') - quotes = self.quotes.set_index(['time', 'bid']) + trades = self.trades.set_index("time") + quotes = self.quotes.set_index(["time", "bid"]) with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_index=True, - right_index=True) + merge_asof(trades, quotes, left_index=True, right_index=True) def test_on_and_index(self): # 'on' parameter and index together is prohibited - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_on='price', - left_index=True, - right_index=True) + merge_asof( + trades, quotes, left_on="price", left_index=True, right_index=True + ) - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") with pytest.raises(MergeError): - merge_asof(trades, quotes, - right_on='bid', - left_index=True, - right_index=True) + merge_asof( + trades, quotes, right_on="bid", left_index=True, right_index=True + ) def test_basic_left_by_right_by(self): @@ -257,10 +255,9 @@ def test_basic_left_by_right_by(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - left_by='ticker', - right_by='ticker') + result = merge_asof( + trades, quotes, on="time", left_by="ticker", right_by="ticker" + ) assert_frame_equal(result, expected) def test_missing_right_by(self): @@ -269,181 +266,211 @@ def test_missing_right_by(self): trades = self.trades quotes = self.quotes - q = quotes[quotes.ticker != 'MSFT'] - result = merge_asof(trades, q, - on='time', - by='ticker') - expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan + q = quotes[quotes.ticker != "MSFT"] + result = merge_asof(trades, q, on="time", by="ticker") + expected.loc[expected.ticker == "MSFT", ["bid", "ask"]] = np.nan assert_frame_equal(result, expected) def test_multiby(self): # GH13936 - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.045', - 
'20160525 13:30:00.049']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL'], - 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', - 'NSDQ', 'ARCA'], - 'bid': [720.51, 51.95, 51.97, 51.99, - 720.50, 97.99], - 'ask': [720.92, 51.96, 51.98, 52.00, - 720.93, 98.01]}, - columns=['time', 'ticker', 'exch', 'bid', 'ask']) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], - 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity', 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, on='time', - by=['ticker', 'exch']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL"], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) assert_frame_equal(result, expected) def test_multiby_heterogeneous_types(self): # GH13936 - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': [0, 0, 1, 1, 2], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.045', - '20160525 13:30:00.049']), - 'ticker': [1, 0, 0, 0, 1, 2], - 'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA', - 'NSDQ', 'ARCA'], - 'bid': [720.51, 51.95, 51.97, 51.99, - 720.50, 97.99], - 'ask': [720.92, 51.96, 51.98, 52.00, - 720.93, 
98.01]}, - columns=['time', 'ticker', 'exch', 'bid', 'ask']) - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.046', - '20160525 13:30:00.048', - '20160525 13:30:00.050']), - 'ticker': [0, 0, 1, 1, 2], - 'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100], - 'bid': [np.nan, 51.95, 720.50, 720.51, np.nan], - 'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]}, - columns=['time', 'ticker', 'exch', - 'price', 'quantity', 'bid', 'ask']) - - result = pd.merge_asof(trades, quotes, on='time', - by=['ticker', 'exch']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "exch", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.045", + "20160525 13:30:00.049", + ] + ), + "ticker": [1, 0, 0, 0, 1, 2], + "exch": ["BATS", "NSDQ", "ARCA", "ARCA", "NSDQ", "ARCA"], + "bid": [720.51, 51.95, 51.97, 51.99, 720.50, 97.99], + "ask": [720.92, 51.96, 51.98, 52.00, 720.93, 98.01], + }, + columns=["time", "ticker", "exch", "bid", "ask"], + ) + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.046", + "20160525 13:30:00.048", + "20160525 13:30:00.050", + ] + ), + "ticker": [0, 0, 1, 1, 2], + "exch": ["ARCA", "NSDQ", "NSDQ", "BATS", "NSDQ"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + "bid": [np.nan, 51.95, 720.50, 720.51, np.nan], + "ask": [np.nan, 51.96, 720.93, 720.92, np.nan], + }, + columns=["time", "ticker", "exch", "price", "quantity", "bid", "ask"], + ) + + result = pd.merge_asof(trades, quotes, on="time", by=["ticker", "exch"]) assert_frame_equal(result, expected) def test_multiby_indexed(self): # GH15676 - left = pd.DataFrame([ - [pd.to_datetime('20160602'), 1, 'a'], - [pd.to_datetime('20160602'), 2, 'a'], - [pd.to_datetime('20160603'), 1, 'b'], - [pd.to_datetime('20160603'), 2, 'b']], - columns=['time', 'k1', 'k2']).set_index('time') - - right = pd.DataFrame([ - [pd.to_datetime('20160502'), 1, 'a', 1.0], - [pd.to_datetime('20160502'), 2, 'a', 2.0], - [pd.to_datetime('20160503'), 1, 'b', 3.0], - [pd.to_datetime('20160503'), 2, 'b', 4.0]], - columns=['time', 'k1', 'k2', 'value']).set_index('time') - - expected = pd.DataFrame([ - [pd.to_datetime('20160602'), 1, 'a', 1.0], - [pd.to_datetime('20160602'), 2, 'a', 2.0], - [pd.to_datetime('20160603'), 1, 'b', 3.0], - [pd.to_datetime('20160603'), 2, 'b', 4.0]], - columns=['time', 'k1', 'k2', 'value']).set_index('time') - - result = pd.merge_asof(left, - right, - left_index=True, - right_index=True, - by=['k1', 'k2']) + left = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a"], + [pd.to_datetime("20160602"), 2, "a"], + [pd.to_datetime("20160603"), 1, "b"], + [pd.to_datetime("20160603"), 2, "b"], + ], + columns=["time", "k1", "k2"], + ).set_index("time") + + right = pd.DataFrame( + [ + [pd.to_datetime("20160502"), 1, "a", 1.0], + 
[pd.to_datetime("20160502"), 2, "a", 2.0], + [pd.to_datetime("20160503"), 1, "b", 3.0], + [pd.to_datetime("20160503"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + expected = pd.DataFrame( + [ + [pd.to_datetime("20160602"), 1, "a", 1.0], + [pd.to_datetime("20160602"), 2, "a", 2.0], + [pd.to_datetime("20160603"), 1, "b", 3.0], + [pd.to_datetime("20160603"), 2, "b", 4.0], + ], + columns=["time", "k1", "k2", "value"], + ).set_index("time") + + result = pd.merge_asof( + left, right, left_index=True, right_index=True, by=["k1", "k2"] + ) assert_frame_equal(expected, result) with pytest.raises(MergeError): - pd.merge_asof(left, right, left_index=True, right_index=True, - left_by=['k1', 'k2'], right_by=['k1']) + pd.merge_asof( + left, + right, + left_index=True, + right_index=True, + left_by=["k1", "k2"], + right_by=["k1"], + ) def test_basic2(self, datapath): - expected = self.read_data(datapath, 'asof2.csv') - trades = self.read_data(datapath, 'trades2.csv') - quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True) + expected = self.read_data(datapath, "asof2.csv") + trades = self.read_data(datapath, "trades2.csv") + quotes = self.read_data(datapath, "quotes2.csv", dedupe=True) - result = merge_asof(trades, quotes, - on='time', - by='ticker') + result = merge_asof(trades, quotes, on="time", by="ticker") assert_frame_equal(result, expected) def test_basic_no_by(self): - f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \ + f = ( + lambda x: x[x.ticker == "MSFT"] + .drop("ticker", axis=1) .reset_index(drop=True) + ) # just use a single ticker expected = f(self.asof) trades = f(self.trades) quotes = f(self.quotes) - result = merge_asof(trades, quotes, - on='time') + result = merge_asof(trades, quotes, on="time") assert_frame_equal(result, expected) def test_valid_join_keys(self): @@ -452,40 +479,33 @@ def test_valid_join_keys(self): quotes = self.quotes with pytest.raises(MergeError): - merge_asof(trades, quotes, - left_on='time', - right_on='bid', - by='ticker') + merge_asof(trades, quotes, left_on="time", right_on="bid", by="ticker") with pytest.raises(MergeError): - merge_asof(trades, quotes, - on=['time', 'ticker'], - by='ticker') + merge_asof(trades, quotes, on=["time", "ticker"], by="ticker") with pytest.raises(MergeError): - merge_asof(trades, quotes, - by='ticker') + merge_asof(trades, quotes, by="ticker") def test_with_duplicates(self, datapath): - q = pd.concat([self.quotes, self.quotes]).sort_values( - ['time', 'ticker']).reset_index(drop=True) - result = merge_asof(self.trades, q, - on='time', - by='ticker') - expected = self.read_data(datapath, 'asof.csv') + q = ( + pd.concat([self.quotes, self.quotes]) + .sort_values(["time", "ticker"]) + .reset_index(drop=True) + ) + result = merge_asof(self.trades, q, on="time", by="ticker") + expected = self.read_data(datapath, "asof.csv") assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): - df1 = pd.DataFrame({'key': [1, 1, 3], - 'left_val': [1, 2, 3]}) - df2 = pd.DataFrame({'key': [1, 2, 2], - 'right_val': [1, 2, 3]}) - result = merge_asof(df1, df2, on='key') - expected = pd.DataFrame({'key': [1, 1, 3], - 'left_val': [1, 2, 3], - 'right_val': [1, 1, 3]}) + df1 = pd.DataFrame({"key": [1, 1, 3], "left_val": [1, 2, 3]}) + df2 = pd.DataFrame({"key": [1, 2, 2], "right_val": [1, 2, 3]}) + result = merge_asof(df1, df2, on="key") + expected = pd.DataFrame( + {"key": [1, 1, 3], "left_val": [1, 2, 3], "right_val": [1, 1, 3]} + ) assert_frame_equal(result, expected) def 
test_valid_allow_exact_matches(self): @@ -494,10 +514,9 @@ def test_valid_allow_exact_matches(self): quotes = self.quotes with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - allow_exact_matches='foo') + merge_asof( + trades, quotes, on="time", by="ticker", allow_exact_matches="foo" + ) def test_valid_tolerance(self): @@ -505,403 +524,507 @@ def test_valid_tolerance(self): quotes = self.quotes # dti - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=Timedelta('1s')) + merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("1s")) # integer - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=1) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1, + ) # incompat with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=1) + merge_asof(trades, quotes, on="time", by="ticker", tolerance=1) # invalid with pytest.raises(MergeError): - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=1.0) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=1.0, + ) # invalid negative with pytest.raises(MergeError): - merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=-Timedelta('1s')) + merge_asof( + trades, quotes, on="time", by="ticker", tolerance=-Timedelta("1s") + ) with pytest.raises(MergeError): - merge_asof(trades.reset_index(), quotes.reset_index(), - on='index', - by='ticker', - tolerance=-1) + merge_asof( + trades.reset_index(), + quotes.reset_index(), + on="index", + by="ticker", + tolerance=-1, + ) def test_non_sorted(self): - trades = self.trades.sort_values('time', ascending=False) - quotes = self.quotes.sort_values('time', ascending=False) + trades = self.trades.sort_values("time", ascending=False) + quotes = self.quotes.sort_values("time", ascending=False) # we require that we are already sorted on time & quotes assert not trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): - merge_asof(trades, quotes, - on='time', - by='ticker') + merge_asof(trades, quotes, on="time", by="ticker") - trades = self.trades.sort_values('time') + trades = self.trades.sort_values("time") assert trades.time.is_monotonic assert not quotes.time.is_monotonic with pytest.raises(ValueError): - merge_asof(trades, quotes, - on='time', - by='ticker') + merge_asof(trades, quotes, on="time", by="ticker") - quotes = self.quotes.sort_values('time') + quotes = self.quotes.sort_values("time") assert trades.time.is_monotonic assert quotes.time.is_monotonic # ok, though has dupes - merge_asof(trades, self.quotes, - on='time', - by='ticker') + merge_asof(trades, self.quotes, on="time", by="ticker") def test_tolerance(self): trades = self.trades quotes = self.quotes - result = merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=Timedelta('1day')) + result = merge_asof( + trades, quotes, on="time", by="ticker", tolerance=Timedelta("1day") + ) expected = self.tolerance assert_frame_equal(result, expected) def test_tolerance_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = 
pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, np.nan, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward', - tolerance=1) + result = pd.merge_asof(left, right, on="a", direction="forward", tolerance=1) assert_frame_equal(result, expected) def test_tolerance_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, np.nan, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [1, np.nan, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - tolerance=1) + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=1) assert_frame_equal(result, expected) def test_tolerance_tz(self): # GH 14844 left = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value1': np.arange(5)}) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + } + ) right = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-01'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value2': list("ABCDE")}) - result = pd.merge_asof(left, right, on='date', - tolerance=pd.Timedelta('1 day')) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-01"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value2": list("ABCDE"), + } + ) + result = pd.merge_asof(left, right, on="date", tolerance=pd.Timedelta("1 day")) expected = pd.DataFrame( - {'date': pd.date_range(start=pd.to_datetime('2016-01-02'), - freq='D', periods=5, - tz=pytz.timezone('UTC')), - 'value1': np.arange(5), - 'value2': list("BCDEE")}) + { + "date": pd.date_range( + start=pd.to_datetime("2016-01-02"), + freq="D", + periods=5, + tz=pytz.timezone("UTC"), + ), + "value1": np.arange(5), + "value2": list("BCDEE"), + } + ) assert_frame_equal(result, expected) def test_tolerance_float(self): # GH22981 - left = pd.DataFrame({'a': [1.1, 3.5, 10.9], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1.0, 2.5, 3.3, 7.5, 11.5], - 'right_val': [1.0, 2.5, 3.3, 7.5, 11.5]}) - - expected = pd.DataFrame({'a': [1.1, 3.5, 10.9], - 'left_val': ['a', 'b', 'c'], - 'right_val': [1, 3.3, np.nan]}) + left = pd.DataFrame({"a": [1.1, 3.5, 10.9], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame( + {"a": [1.0, 2.5, 3.3, 7.5, 11.5], "right_val": [1.0, 2.5, 3.3, 7.5, 11.5]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - tolerance=0.5) + expected = pd.DataFrame( + { + "a": [1.1, 3.5, 10.9], + "left_val": ["a", "b", "c"], + "right_val": [1, 3.3, np.nan], + } + ) + + result = pd.merge_asof(left, right, on="a", direction="nearest", tolerance=0.5) assert_frame_equal(result, expected) def test_index_tolerance(self): # GH 15135 - expected = self.tolerance.set_index('time') - trades = self.trades.set_index('time') - quotes = self.quotes.set_index('time') - - result = pd.merge_asof(trades, quotes, - left_index=True, - right_index=True, - by='ticker', - 
tolerance=pd.Timedelta('1day')) + expected = self.tolerance.set_index("time") + trades = self.trades.set_index("time") + quotes = self.quotes.set_index("time") + + result = pd.merge_asof( + trades, + quotes, + left_index=True, + right_index=True, + by="ticker", + tolerance=pd.Timedelta("1day"), + ) assert_frame_equal(result, expected) def test_allow_exact_matches(self): - result = merge_asof(self.trades, self.quotes, - on='time', - by='ticker', - allow_exact_matches=False) + result = merge_asof( + self.trades, self.quotes, on="time", by="ticker", allow_exact_matches=False + ) expected = self.allow_exact_matches assert_frame_equal(result, expected) def test_allow_exact_matches_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [2, 7, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 7, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='forward', - allow_exact_matches=False) + result = pd.merge_asof( + left, right, on="a", direction="forward", allow_exact_matches=False + ) assert_frame_equal(result, expected) def test_allow_exact_matches_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 7, 11], - 'right_val': [1, 2, 3, 7, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 7, 11], "right_val": [1, 2, 3, 7, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [2, 3, 11]}) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [2, 3, 11]} + ) - result = pd.merge_asof(left, right, on='a', direction='nearest', - allow_exact_matches=False) + result = pd.merge_asof( + left, right, on="a", direction="nearest", allow_exact_matches=False + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance(self): - result = merge_asof(self.trades, self.quotes, - on='time', - by='ticker', - tolerance=Timedelta('100ms'), - allow_exact_matches=False) + result = merge_asof( + self.trades, + self.quotes, + on="time", + by="ticker", + tolerance=Timedelta("100ms"), + allow_exact_matches=False, + ) expected = self.allow_exact_matches_and_tolerance assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance2(self): # GH 13695 - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob']}) - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.000', - '2016-07-15 13:30:00.030']), - 'version': [1, 2]}) - - result = pd.merge_asof(df1, df2, on='time') - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [2]}) + df1 = pd.DataFrame( + {"time": pd.to_datetime(["2016-07-15 13:30:00.030"]), "username": ["bob"]} + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof(df1, df2, on="time") + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": 
[2], + } + ) assert_frame_equal(result, expected) - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [1]}) + result = pd.merge_asof(df1, df2, on="time", allow_exact_matches=False) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [1], + } + ) assert_frame_equal(result, expected) - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, - tolerance=pd.Timedelta('10ms')) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), - 'username': ['bob'], - 'version': [np.nan]}) + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime(["2016-07-15 13:30:00.030"]), + "username": ["bob"], + "version": [np.nan], + } + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance3(self): # GH 13709 - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030', - '2016-07-15 13:30:00.030']), - 'username': ['bob', 'charlie']}) - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.000', - '2016-07-15 13:30:00.030']), - 'version': [1, 2]}) - - result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, - tolerance=pd.Timedelta('10ms')) - expected = pd.DataFrame({ - 'time': pd.to_datetime(['2016-07-15 13:30:00.030', - '2016-07-15 13:30:00.030']), - 'username': ['bob', 'charlie'], - 'version': [np.nan, np.nan]}) + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + } + ) + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.000", "2016-07-15 13:30:00.030"] + ), + "version": [1, 2], + } + ) + + result = pd.merge_asof( + df1, + df2, + on="time", + allow_exact_matches=False, + tolerance=pd.Timedelta("10ms"), + ) + expected = pd.DataFrame( + { + "time": pd.to_datetime( + ["2016-07-15 13:30:00.030", "2016-07-15 13:30:00.030"] + ), + "username": ["bob", "charlie"], + "version": [np.nan, np.nan], + } + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_forward(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 3, 4, 6, 11], - 'right_val': [1, 3, 4, 6, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 6, 11]}) - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [np.nan, 6, 11]}) - - result = pd.merge_asof(left, right, on='a', direction='forward', - allow_exact_matches=False, tolerance=1) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 6, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="forward", + allow_exact_matches=False, + tolerance=1, + ) assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance_nearest(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 3, 4, 6, 11], - 'right_val': [1, 3, 4, 7, 11]}) - - expected = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c'], - 'right_val': [np.nan, 4, 11]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", 
"c"]}) + right = pd.DataFrame({"a": [1, 3, 4, 6, 11], "right_val": [1, 3, 4, 7, 11]}) - result = pd.merge_asof(left, right, on='a', direction='nearest', - allow_exact_matches=False, tolerance=1) + expected = pd.DataFrame( + {"a": [1, 5, 10], "left_val": ["a", "b", "c"], "right_val": [np.nan, 4, 11]} + ) + + result = pd.merge_asof( + left, + right, + on="a", + direction="nearest", + allow_exact_matches=False, + tolerance=1, + ) assert_frame_equal(result, expected) def test_forward_by(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Y', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e']}) - right = pd.DataFrame({'a': [1, 6, 11, 15, 16], - 'b': ['X', 'Z', 'Y', 'Z', 'Y'], - 'right_val': [1, 6, 11, 15, 16]}) - - expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Y', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e'], - 'right_val': [1, np.nan, 11, 15, 16]}) + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Y", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) - result = pd.merge_asof(left, right, on='a', by='b', - direction='forward') + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Y", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, np.nan, 11, 15, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="forward") assert_frame_equal(result, expected) def test_nearest_by(self): # GH14887 - left = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Z', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e']}) - right = pd.DataFrame({'a': [1, 6, 11, 15, 16], - 'b': ['X', 'Z', 'Z', 'Z', 'Y'], - 'right_val': [1, 6, 11, 15, 16]}) - - expected = pd.DataFrame({'a': [1, 5, 10, 12, 15], - 'b': ['X', 'X', 'Z', 'Z', 'Y'], - 'left_val': ['a', 'b', 'c', 'd', 'e'], - 'right_val': [1, 1, 11, 11, 16]}) + left = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + } + ) + right = pd.DataFrame( + { + "a": [1, 6, 11, 15, 16], + "b": ["X", "Z", "Z", "Z", "Y"], + "right_val": [1, 6, 11, 15, 16], + } + ) - result = pd.merge_asof(left, right, on='a', by='b', - direction='nearest') + expected = pd.DataFrame( + { + "a": [1, 5, 10, 12, 15], + "b": ["X", "X", "Z", "Z", "Y"], + "left_val": ["a", "b", "c", "d", "e"], + "right_val": [1, 1, 11, 11, 16], + } + ) + + result = pd.merge_asof(left, right, on="a", by="b", direction="nearest") assert_frame_equal(result, expected) def test_by_int(self): # we specialize by type, so test that this is correct - df1 = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.020', - '20160525 13:30:00.030', - '20160525 13:30:00.040', - '20160525 13:30:00.050', - '20160525 13:30:00.060']), - 'key': [1, 2, 1, 3, 2], - 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]}, - columns=['time', 'key', 'value1']) - - df2 = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.015', - '20160525 13:30:00.020', - '20160525 13:30:00.025', - '20160525 13:30:00.035', - '20160525 13:30:00.040', - '20160525 13:30:00.055', - '20160525 13:30:00.060', - '20160525 13:30:00.065']), - 'key': [2, 1, 1, 3, 2, 1, 2, 3], - 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]}, - columns=['time', 'key', 'value2']) - - result = pd.merge_asof(df1, df2, on='time', by='key') - - expected = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.020', - '20160525 
13:30:00.030', - '20160525 13:30:00.040', - '20160525 13:30:00.050', - '20160525 13:30:00.060']), - 'key': [1, 2, 1, 3, 2], - 'value1': [1.1, 1.2, 1.3, 1.4, 1.5], - 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]}, - columns=['time', 'key', 'value1', 'value2']) + df1 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + }, + columns=["time", "key", "value1"], + ) + + df2 = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.015", + "20160525 13:30:00.020", + "20160525 13:30:00.025", + "20160525 13:30:00.035", + "20160525 13:30:00.040", + "20160525 13:30:00.055", + "20160525 13:30:00.060", + "20160525 13:30:00.065", + ] + ), + "key": [2, 1, 1, 3, 2, 1, 2, 3], + "value2": [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8], + }, + columns=["time", "key", "value2"], + ) + + result = pd.merge_asof(df1, df2, on="time", by="key") + + expected = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.020", + "20160525 13:30:00.030", + "20160525 13:30:00.040", + "20160525 13:30:00.050", + "20160525 13:30:00.060", + ] + ), + "key": [1, 2, 1, 3, 2], + "value1": [1.1, 1.2, 1.3, 1.4, 1.5], + "value2": [2.2, 2.1, 2.3, 2.4, 2.7], + }, + columns=["time", "key", "value1", "value2"], + ) assert_frame_equal(result, expected) def test_on_float(self): # mimics how to determine the minimum-price variation - df1 = pd.DataFrame({ - 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], - 'symbol': list("ABCDEFG")}, - columns=['symbol', 'price']) + df1 = pd.DataFrame( + { + "price": [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "price"], + ) - df2 = pd.DataFrame({ - 'price': [0.0, 1.0, 100.0], - 'mpv': [0.0001, 0.01, 0.05]}, - columns=['price', 'mpv']) + df2 = pd.DataFrame( + {"price": [0.0, 1.0, 100.0], "mpv": [0.0001, 0.01, 0.05]}, + columns=["price", "mpv"], + ) - df1 = df1.sort_values('price').reset_index(drop=True) + df1 = df1.sort_values("price").reset_index(drop=True) - result = pd.merge_asof(df1, df2, on='price') + result = pd.merge_asof(df1, df2, on="price") - expected = pd.DataFrame({ - 'symbol': list("BGACEDF"), - 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], - 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]}, - columns=['symbol', 'price', 'mpv']) + expected = pd.DataFrame( + { + "symbol": list("BGACEDF"), + "price": [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], + "mpv": [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05], + }, + columns=["symbol", "price", "mpv"], + ) assert_frame_equal(result, expected) @@ -909,26 +1032,29 @@ def test_on_specialized_type(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type - df1 = pd.DataFrame({ - "value": [5, 2, 25, 100, 78, 120, 79], - "symbol": list("ABCDEFG")}, - columns=["symbol", "value"]) + df1 = pd.DataFrame( + {"value": [5, 2, 25, 100, 78, 120, 79], "symbol": list("ABCDEFG")}, + columns=["symbol", "value"], + ) df1.value = dtype(df1.value) - df2 = pd.DataFrame({ - "value": [0, 80, 120, 125], - "result": list("xyzw")}, - columns=["value", "result"]) + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "result": list("xyzw")}, + columns=["value", "result"], + ) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value") expected = pd.DataFrame( - {"symbol": 
list("BACEGDF"), - "value": [2, 5, 25, 78, 79, 100, 120], - "result": list("xxxxxyz") - }, columns=["symbol", "value", "result"]) + { + "symbol": list("BACEGDF"), + "value": [2, 5, 25, 78, 79, 100, 120], + "result": list("xxxxxyz"), + }, + columns=["symbol", "value", "result"], + ) expected.value = dtype(expected.value) assert_frame_equal(result, expected) @@ -937,113 +1063,160 @@ def test_on_specialized_type_by_int(self, any_real_dtype): # see gh-13936 dtype = np.dtype(any_real_dtype).type - df1 = pd.DataFrame({ - "value": [5, 2, 25, 100, 78, 120, 79], - "key": [1, 2, 3, 2, 3, 1, 2], - "symbol": list("ABCDEFG")}, - columns=["symbol", "key", "value"]) + df1 = pd.DataFrame( + { + "value": [5, 2, 25, 100, 78, 120, 79], + "key": [1, 2, 3, 2, 3, 1, 2], + "symbol": list("ABCDEFG"), + }, + columns=["symbol", "key", "value"], + ) df1.value = dtype(df1.value) - df2 = pd.DataFrame({ - "value": [0, 80, 120, 125], - "key": [1, 2, 2, 3], - "result": list("xyzw")}, - columns=["value", "key", "result"]) + df2 = pd.DataFrame( + {"value": [0, 80, 120, 125], "key": [1, 2, 2, 3], "result": list("xyzw")}, + columns=["value", "key", "result"], + ) df2.value = dtype(df2.value) df1 = df1.sort_values("value").reset_index(drop=True) result = pd.merge_asof(df1, df2, on="value", by="key") - expected = pd.DataFrame({ - "symbol": list("BACEGDF"), - "key": [2, 1, 3, 3, 2, 2, 1], - "value": [2, 5, 25, 78, 79, 100, 120], - "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"]}, - columns=["symbol", "key", "value", "result"]) + expected = pd.DataFrame( + { + "symbol": list("BACEGDF"), + "key": [2, 1, 3, 3, 2, 2, 1], + "value": [2, 5, 25, 78, 79, 100, 120], + "result": [np.nan, "x", np.nan, np.nan, np.nan, "y", "x"], + }, + columns=["symbol", "key", "value", "result"], + ) expected.value = dtype(expected.value) assert_frame_equal(result, expected) def test_on_float_by_int(self): # type specialize both "by" and "on" parameters - df1 = pd.DataFrame({ - 'symbol': list("AAABBBCCC"), - 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, - 12.5, 378.15, 378.2, 378.25]}, - columns=['symbol', 'exch', 'price']) - - df2 = pd.DataFrame({ - 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], - 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]}, - columns=['exch', 'price', 'mpv']) - - df1 = df1.sort_values('price').reset_index(drop=True) - df2 = df2.sort_values('price').reset_index(drop=True) - - result = pd.merge_asof(df1, df2, on='price', by='exch') - - expected = pd.DataFrame({ - 'symbol': list("AAABBBCCC"), - 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3], - 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, - 12.59, 378.15, 378.2, 378.25], - 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]}, - columns=['symbol', 'exch', 'price', 'mpv']) + df1 = pd.DataFrame( + { + "symbol": list("AAABBBCCC"), + "exch": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "price": [ + 3.26, + 3.2599, + 3.2598, + 12.58, + 12.59, + 12.5, + 378.15, + 378.2, + 378.25, + ], + }, + columns=["symbol", "exch", "price"], + ) + + df2 = pd.DataFrame( + { + "exch": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "price": [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], + "mpv": [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0], + }, + columns=["exch", "price", "mpv"], + ) + + df1 = df1.sort_values("price").reset_index(drop=True) + df2 = df2.sort_values("price").reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on="price", by="exch") + + expected = pd.DataFrame( + { + 
"symbol": list("AAABBBCCC"), + "exch": [3, 2, 1, 3, 1, 2, 1, 2, 3], + "price": [ + 3.2598, + 3.2599, + 3.26, + 12.5, + 12.58, + 12.59, + 378.15, + 378.2, + 378.25, + ], + "mpv": [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25], + }, + columns=["symbol", "exch", "price", "mpv"], + ) assert_frame_equal(result, expected) def test_merge_datatype_error_raises(self): - msg = r'incompatible merge keys \[0\] .*, must be the same type' + msg = r"incompatible merge keys \[0\] .*, must be the same type" - left = pd.DataFrame({'left_val': [1, 5, 10], - 'a': ['a', 'b', 'c']}) - right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) + right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) with pytest.raises(MergeError, match=msg): - merge_asof(left, right, on='a') + merge_asof(left, right, on="a") def test_merge_datatype_categorical_error_raises(self): - msg = (r'incompatible merge keys \[0\] .* both sides category, ' - 'but not equal ones') + msg = ( + r"incompatible merge keys \[0\] .* both sides category, " + "but not equal ones" + ) - left = pd.DataFrame({'left_val': [1, 5, 10], - 'a': pd.Categorical(['a', 'b', 'c'])}) - right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], - 'a': pd.Categorical(['a', 'X', 'c', 'X', 'b'])}) + left = pd.DataFrame( + {"left_val": [1, 5, 10], "a": pd.Categorical(["a", "b", "c"])} + ) + right = pd.DataFrame( + { + "right_val": [1, 2, 3, 6, 7], + "a": pd.Categorical(["a", "X", "c", "X", "b"]), + } + ) with pytest.raises(MergeError, match=msg): - merge_asof(left, right, on='a') + merge_asof(left, right, on="a") - @pytest.mark.parametrize('func', [lambda x: x, lambda x: to_datetime(x)], - ids=['numeric', 'datetime']) - @pytest.mark.parametrize('side', ['left', 'right']) + @pytest.mark.parametrize( + "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] + ) + @pytest.mark.parametrize("side", ["left", "right"]) def test_merge_on_nans(self, func, side): # GH 23189 msg = "Merge keys contain null values on {} side".format(side) nulls = func([1.0, 5.0, np.nan]) - non_nulls = func([1.0, 5.0, 10.]) - df_null = pd.DataFrame({'a': nulls, 'left_val': ['a', 'b', 'c']}) - df = pd.DataFrame({'a': non_nulls, 'right_val': [1, 6, 11]}) + non_nulls = func([1.0, 5.0, 10.0]) + df_null = pd.DataFrame({"a": nulls, "left_val": ["a", "b", "c"]}) + df = pd.DataFrame({"a": non_nulls, "right_val": [1, 6, 11]}) with pytest.raises(ValueError, match=msg): - if side == 'left': - merge_asof(df_null, df, on='a') + if side == "left": + merge_asof(df_null, df, on="a") else: - merge_asof(df, df_null, on='a') + merge_asof(df, df_null, on="a") def test_merge_by_col_tz_aware(self): # GH 21184 left = pd.DataFrame( - {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), - 'on_col': [2], 'values': ['a']}) + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [2], + "values": ["a"], + } + ) right = pd.DataFrame( - {'by_col': pd.DatetimeIndex(['2018-01-01']).tz_localize('UTC'), - 'on_col': [1], 'values': ['b']}) - result = pd.merge_asof(left, right, by='by_col', on='on_col') - expected = pd.DataFrame([ - [pd.Timestamp('2018-01-01', tz='UTC'), 2, 'a', 'b'] - ], columns=['by_col', 'on_col', 'values_x', 'values_y']) + { + "by_col": pd.DatetimeIndex(["2018-01-01"]).tz_localize("UTC"), + "on_col": [1], + "values": ["b"], + } + ) + result = pd.merge_asof(left, right, by="by_col", on="on_col") + expected = pd.DataFrame( + 
[[pd.Timestamp("2018-01-01", tz="UTC"), 2, "a", "b"]], + columns=["by_col", "on_col", "values_x", "values_y"], + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 12d9483af8761..5e3bf03a0a4ec 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -7,21 +7,27 @@ @pytest.fixture def df1(): - return DataFrame(dict( - outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], - inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], - v1=np.linspace(0, 1, 11))) + return DataFrame( + dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11), + ) + ) @pytest.fixture def df2(): - return DataFrame(dict( - outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], - inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], - v2=np.linspace(10, 11, 12))) + return DataFrame( + dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12), + ) + ) -@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def left_df(request, df1): """ Construct left test DataFrame with specified levels (any of 'outer', 'inner', and 'v1')""" @@ -32,7 +38,7 @@ def left_df(request, df1): return df1 -@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def right_df(request, df2): """ Construct right test DataFrame with specified levels (any of 'outer', 'inner', and 'v2')""" @@ -44,8 +50,7 @@ def right_df(request, df2): return df2 -def compute_expected(df_left, df_right, - on=None, left_on=None, right_on=None, how=None): +def compute_expected(df_left, df_right, on=None, left_on=None, right_on=None, how=None): """ Compute the expected merge result for the test case. 
@@ -85,8 +90,7 @@ def compute_expected(df_left, df_right, right_levels = [n for n in df_right.index.names if n is not None] # Compute output named index levels - output_levels = [i for i in left_on - if i in right_levels and i in left_levels] + output_levels = [i for i in left_on if i in right_levels and i in left_levels] # Drop index levels that aren't involved in the merge drop_left = [n for n in left_levels if n not in left_on] @@ -107,10 +111,7 @@ def compute_expected(df_left, df_right, df_right = df_right.reset_index(level=reset_right) # Perform merge - expected = df_left.merge(df_right, - left_on=left_on, - right_on=right_on, - how=how) + expected = df_left.merge(df_right, left_on=left_on, right_on=right_on, how=how) # Restore index levels if output_levels: @@ -119,11 +120,15 @@ def compute_expected(df_left, df_right, return expected -@pytest.mark.parametrize('on,how', - [(['outer'], 'inner'), - (['inner'], 'left'), - (['outer', 'inner'], 'right'), - (['inner', 'outer'], 'outer')]) +@pytest.mark.parametrize( + "on,how", + [ + (["outer"], "inner"), + (["inner"], "left"), + (["outer", "inner"], "right"), + (["inner", "outer"], "outer"), + ], +) def test_merge_indexes_and_columns_on(left_df, right_df, on, how): # Construct expected result @@ -134,44 +139,50 @@ def test_merge_indexes_and_columns_on(left_df, right_df, on, how): assert_frame_equal(result, expected, check_like=True) -@pytest.mark.parametrize('left_on,right_on,how', - [(['outer'], ['outer'], 'inner'), - (['inner'], ['inner'], 'right'), - (['outer', 'inner'], ['outer', 'inner'], 'left'), - (['inner', 'outer'], ['inner', 'outer'], 'outer')]) +@pytest.mark.parametrize( + "left_on,right_on,how", + [ + (["outer"], ["outer"], "inner"), + (["inner"], ["inner"], "right"), + (["outer", "inner"], ["outer", "inner"], "left"), + (["inner", "outer"], ["inner", "outer"], "outer"), + ], +) def test_merge_indexes_and_columns_lefton_righton( - left_df, right_df, left_on, right_on, how): + left_df, right_df, left_on, right_on, how +): # Construct expected result - expected = compute_expected(left_df, right_df, - left_on=left_on, - right_on=right_on, - how=how) + expected = compute_expected( + left_df, right_df, left_on=left_on, right_on=right_on, how=how + ) # Perform merge - result = left_df.merge(right_df, - left_on=left_on, right_on=right_on, how=how) + result = left_df.merge(right_df, left_on=left_on, right_on=right_on, how=how) assert_frame_equal(result, expected, check_like=True) -@pytest.mark.parametrize('left_index', - ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize("left_index", ["inner", ["inner", "outer"]]) def test_join_indexes_and_columns_on(df1, df2, left_index, join_type): # Construct left_df left_df = df1.set_index(left_index) # Construct right_df - right_df = df2.set_index(['outer', 'inner']) + right_df = df2.set_index(["outer", "inner"]) # Result - expected = (left_df.reset_index() - .join(right_df, on=['outer', 'inner'], how=join_type, - lsuffix='_x', rsuffix='_y') - .set_index(left_index)) + expected = ( + left_df.reset_index() + .join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) + .set_index(left_index) + ) # Perform join - result = left_df.join(right_df, on=['outer', 'inner'], how=join_type, - lsuffix='_x', rsuffix='_y') + result = left_df.join( + right_df, on=["outer", "inner"], how=join_type, lsuffix="_x", rsuffix="_y" + ) assert_frame_equal(result, expected, check_like=True) diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py 
b/pandas/tests/reshape/merge/test_merge_ordered.py index da8ac0b470f77..2b79548be7b59 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -7,60 +7,69 @@ class TestMergeOrdered: - def setup_method(self, method): - self.left = DataFrame({'key': ['a', 'c', 'e'], - 'lvalue': [1, 2., 3]}) + self.left = DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2.0, 3]}) - self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], - 'rvalue': [1, 2, 3., 4]}) + self.right = DataFrame({"key": ["b", "c", "d", "f"], "rvalue": [1, 2, 3.0, 4]}) def test_basic(self): - result = merge_ordered(self.left, self.right, on='key') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1, nan, 2, nan, 3, nan], - 'rvalue': [nan, 1, 2, 3, nan, 4]}) + result = merge_ordered(self.left, self.right, on="key") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1, nan, 2, nan, 3, nan], + "rvalue": [nan, 1, 2, 3, nan, 4], + } + ) assert_frame_equal(result, expected) def test_ffill(self): - result = merge_ordered( - self.left, self.right, on='key', fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1., 1, 2, 2, 3, 3.], - 'rvalue': [nan, 1, 2, 3, 3, 4]}) + result = merge_ordered(self.left, self.right, on="key", fill_method="ffill") + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"], + "lvalue": [1.0, 1, 2, 2, 3, 3.0], + "rvalue": [nan, 1, 2, 3, 3, 4], + } + ) assert_frame_equal(result, expected) def test_multigroup(self): left = pd.concat([self.left, self.left], ignore_index=True) - left['group'] = ['a'] * 3 + ['b'] * 3 + left["group"] = ["a"] * 3 + ["b"] * 3 - result = merge_ordered(left, self.right, on='key', left_by='group', - fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, - 'lvalue': [1., 1, 2, 2, 3, 3.] 
* 2, - 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) - expected['group'] = ['a'] * 6 + ['b'] * 6 + result = merge_ordered( + left, self.right, on="key", left_by="group", fill_method="ffill" + ) + expected = DataFrame( + { + "key": ["a", "b", "c", "d", "e", "f"] * 2, + "lvalue": [1.0, 1, 2, 2, 3, 3.0] * 2, + "rvalue": [nan, 1, 2, 3, 3, 4] * 2, + } + ) + expected["group"] = ["a"] * 6 + ["b"] * 6 assert_frame_equal(result, expected.loc[:, result.columns]) - result2 = merge_ordered(self.right, left, on='key', right_by='group', - fill_method='ffill') + result2 = merge_ordered( + self.right, left, on="key", right_by="group", fill_method="ffill" + ) assert_frame_equal(result, result2.loc[:, result.columns]) - result = merge_ordered(left, self.right, on='key', left_by='group') - assert result['group'].notna().all() + result = merge_ordered(left, self.right, on="key", left_by="group") + assert result["group"].notna().all() def test_merge_type(self): class NotADataFrame(DataFrame): - @property def _constructor(self): return NotADataFrame nad = NotADataFrame(self.left) - result = nad.merge(self.right, on='key') + result = nad.merge(self.right, on="key") assert isinstance(result, NotADataFrame) @@ -73,7 +82,7 @@ def test_empty_sequence_concat(self): ([], empty_pat), ({}, empty_pat), ([None], none_pat), - ([None, None], none_pat) + ([None, None], none_pat), ] for df_seq, pattern in test_cases: with pytest.raises(ValueError, match=pattern): @@ -84,20 +93,25 @@ def test_empty_sequence_concat(self): pd.concat([pd.DataFrame(), None]) def test_doc_example(self): - left = DataFrame({'group': list('aaabbb'), - 'key': ['a', 'c', 'e', 'a', 'c', 'e'], - 'lvalue': [1, 2, 3] * 2, - }) - - right = DataFrame({'key': ['b', 'c', 'd'], - 'rvalue': [1, 2, 3]}) - - result = merge_ordered(left, right, fill_method='ffill', - left_by='group') - - expected = DataFrame({'group': list('aaaaabbbbb'), - 'key': ['a', 'b', 'c', 'd', 'e'] * 2, - 'lvalue': [1, 1, 2, 2, 3] * 2, - 'rvalue': [nan, 1, 2, 3, 3] * 2}) + left = DataFrame( + { + "group": list("aaabbb"), + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3] * 2, + } + ) + + right = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + + result = merge_ordered(left, right, fill_method="ffill", left_by="group") + + expected = DataFrame( + { + "group": list("aaaaabbbbb"), + "key": ["a", "b", "c", "d", "e"] * 2, + "lvalue": [1, 1, 2, 2, 3] * 2, + "rvalue": [nan, 1, 2, 3, 3] * 2, + } + ) assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 0bfc8ebbd2871..7aea85153d908 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -16,161 +16,165 @@ def left(): """left dataframe (not multi-indexed) for multi-index join tests""" # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"] data = np.random.randn(len(key1)) - return DataFrame({'key1': key1, 'key2': key2, 'data': data}) + return DataFrame({"key1": key1, "key2": key2, "data": data}) @pytest.fixture def right(): """right dataframe (multi-indexed) for multi-index join tests""" - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 
2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['key1', 'key2']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["key1", "key2"], + ) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) + return DataFrame( + np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"] + ) @pytest.fixture def left_multi(): - return ( - DataFrame( - dict(Origin=['A', 'A', 'B', 'B', 'C'], - Destination=['A', 'B', 'A', 'C', 'A'], - Period=['AM', 'AM', 'IP', 'AM', 'OP'], - TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], - Trips=[1987, 3647, 2470, 4296, 4444]), - columns=['Origin', 'Destination', 'Period', - 'TripPurp', 'Trips']) - .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C"], + Destination=["A", "B", "A", "C", "A"], + Period=["AM", "AM", "IP", "AM", "OP"], + TripPurp=["hbw", "nhb", "hbo", "nhb", "hbw"], + Trips=[1987, 3647, 2470, 4296, 4444], + ), + columns=["Origin", "Destination", "Period", "TripPurp", "Trips"], + ).set_index(["Origin", "Destination", "Period", "TripPurp"]) @pytest.fixture def right_multi(): - return ( - DataFrame( - dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], - Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], - Period=['AM', 'AM', 'IP', 'AM', 'OP', 'IP', 'AM'], - LinkType=['a', 'b', 'c', 'b', 'a', 'b', 'a'], - Distance=[100, 80, 90, 80, 75, 35, 55]), - columns=['Origin', 'Destination', 'Period', - 'LinkType', 'Distance']) - .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + return DataFrame( + dict( + Origin=["A", "A", "B", "B", "C", "C", "E"], + Destination=["A", "B", "A", "B", "A", "B", "F"], + Period=["AM", "AM", "IP", "AM", "OP", "IP", "AM"], + LinkType=["a", "b", "c", "b", "a", "b", "a"], + Distance=[100, 80, 90, 80, 75, 35, 55], + ), + columns=["Origin", "Destination", "Period", "LinkType", "Distance"], + ).set_index(["Origin", "Destination", "Period", "LinkType"]) @pytest.fixture def on_cols_multi(): - return ['Origin', 'Destination', 'Period'] + return ["Origin", "Destination", "Period"] @pytest.fixture def idx_cols_multi(): - return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + return ["Origin", "Destination", "Period", "TripPurp", "LinkType"] class TestMergeMulti: - def setup_method(self): - self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, - columns=['j_one', 'j_two', 'j_three']) + self.index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + self.to_join = DataFrame( + np.random.randn(10, 3), + index=self.index, + columns=["j_one", "j_two", "j_three"], + ) # a little relevant example with NAs - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] + key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"] + key2 = [ + "two", + "one", + "three", + "one", + "two", + "one", + "two", + "two", + "three", + "one", + ] data = np.random.randn(len(key1)) - self.data = DataFrame({'key1': key1, 'key2': key2, - 
'data': data}) + self.data = DataFrame({"key1": key1, "key2": key2, "data": data}) def test_merge_on_multikey(self, left, right, join_type): - on_cols = ['key1', 'key2'] - result = (left.join(right, on=on_cols, how=join_type) - .reset_index(drop=True)) + on_cols = ["key1", "key2"] + result = left.join(right, on=on_cols, how=join_type).reset_index(drop=True) - expected = pd.merge(left, right.reset_index(), - on=on_cols, how=join_type) + expected = pd.merge(left, right.reset_index(), on=on_cols, how=join_type) tm.assert_frame_equal(result, expected) - result = (left.join(right, on=on_cols, how=join_type, sort=True) - .reset_index(drop=True)) + result = left.join(right, on=on_cols, how=join_type, sort=True).reset_index( + drop=True + ) - expected = pd.merge(left, right.reset_index(), - on=on_cols, how=join_type, sort=True) + expected = pd.merge( + left, right.reset_index(), on=on_cols, how=join_type, sort=True + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("sort", [False, True]) def test_left_join_multi_index(self, left, right, sort): - icols = ['1st', '2nd', '3rd'] + icols = ["1st", "2nd", "3rd"] def bind_cols(df): iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord('a') - return (f(df['1st']) + f(df['3rd']) * 1e2 + - df['2nd'].fillna(0) * 1e4) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 1e4 def run_asserts(left, right, sort): - res = left.join(right, on=icols, how='left', sort=sort) + res = left.join(right, on=icols, how="left", sort=sort) assert len(left) < len(res) + 1 - assert not res['4th'].isna().any() - assert not res['5th'].isna().any() + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal( - res['4th'], - res['5th'], check_names=False) + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res['4th'], result, check_names=False) + tm.assert_series_equal(res["4th"], result, check_names=False) assert result.name is None if sort: - tm.assert_frame_equal( - res, res.sort_values(icols, kind='mergesort')) + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, - sort=sort, how='left') + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") res.index = np.arange(len(res)) tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) - left = DataFrame(np.random.choice(lc, (5000, 2)), - columns=['1st', '3rd']) - left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), columns=["1st", "3rd"]) + left.insert(1, "2nd", np.random.randint(0, 1000, len(left))) i = np.random.permutation(len(left)) right = left.iloc[i].copy() - left['4th'] = bind_cols(left) - right['5th'] = - bind_cols(right) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) right.set_index(icols, inplace=True) run_asserts(left, right, sort) # inject some nulls - left.loc[1::23, '1st'] = np.nan - left.loc[2::37, '2nd'] = np.nan - left.loc[3::43, '3rd'] = np.nan - left['4th'] = bind_cols(left) + left.loc[1::23, "1st"] = np.nan + left.loc[2::37, "2nd"] = np.nan + left.loc[3::43, "3rd"] = np.nan + left["4th"] = bind_cols(left) i = np.random.permutation(len(left)) right = left.iloc[i, :-1] - right['5th'] = - bind_cols(right) + right["5th"] = -bind_cols(right) 
right.set_index(icols, inplace=True) run_asserts(left, right, sort) @@ -178,14 +182,14 @@ def run_asserts(left, right, sort): @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort): # compare left vs right merge with multikey - on_cols = ['key1', 'key2'] - merged_left_right = left.merge(right, - left_on=on_cols, right_index=True, - how='left', sort=sort) + on_cols = ["key1", "key2"] + merged_left_right = left.merge( + right, left_on=on_cols, right_index=True, how="left", sort=sort + ) - merge_right_left = right.merge(left, - right_on=on_cols, left_index=True, - how='right', sort=sort) + merge_right_left = right.merge( + left, right_on=on_cols, left_index=True, how="right", sort=sort + ) # Reorder columns merge_right_left = merge_right_left[merged_left_right.columns] @@ -199,210 +203,241 @@ def test_compress_group_combinations(self): key1 = np.tile(key1, 2) key2 = key1[::-1] - df = DataFrame({'key1': key1, 'key2': key2, - 'value1': np.random.randn(20000)}) + df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)}) - df2 = DataFrame({'key1': key1[::2], 'key2': key2[::2], - 'value2': np.random.randn(10000)}) + df2 = DataFrame( + {"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)} + ) # just to hit the label compression code path - merge(df, df2, how='outer') + merge(df, df2, how="outer") def test_left_join_index_preserve_order(self): - on_cols = ['k1', 'k2'] - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'v': np.array(np.arange(24), dtype=np.int64)}) + on_cols = ["k1", "k2"] + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result.sort_values(on_cols, kind='mergesort', inplace=True) + result.sort_values(on_cols, kind="mergesort", inplace=True) expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected) # test join with multi dtypes blocks - left = DataFrame({'k1': [0, 1, 2] * 8, - 'k2': ['foo', 'bar'] * 12, - 'k3': np.array([0, 1, 2] * 8, dtype=np.float32), - 'v': np.array(np.arange(24), dtype=np.int32)}) + left = DataFrame( + { + "k1": [0, 1, 2] * 8, + "k2": ["foo", "bar"] * 12, + "k3": np.array([0, 1, 2] * 8, dtype=np.float32), + "v": np.array(np.arange(24), dtype=np.int32), + } + ) - index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) - right = DataFrame({'v2': [5, 7]}, index=index) + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": [5, 7]}, index=index) result = left.join(right, on=on_cols) expected = left.copy() - expected['v2'] = np.nan - expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5 - expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7 + expected["v2"] = np.nan + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + 
expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 tm.assert_frame_equal(result, expected) - result = result.sort_values(on_cols, kind='mergesort') + result = result.sort_values(on_cols, kind="mergesort") expected = left.join(right, on=on_cols, sort=True) tm.assert_frame_equal(result, expected) def test_left_join_index_multi_match_multiindex(self): - left = DataFrame([ - ['X', 'Y', 'C', 'a'], - ['W', 'Y', 'C', 'e'], - ['V', 'Q', 'A', 'h'], - ['V', 'R', 'D', 'i'], - ['X', 'Y', 'D', 'b'], - ['X', 'Y', 'A', 'c'], - ['W', 'Q', 'B', 'f'], - ['W', 'R', 'C', 'g'], - ['V', 'Y', 'C', 'j'], - ['X', 'Y', 'B', 'd']], - columns=['cola', 'colb', 'colc', 'tag'], - index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8]) - - right = (DataFrame([ - ['W', 'R', 'C', 0], - ['W', 'Q', 'B', 3], - ['W', 'Q', 'B', 8], - ['X', 'Y', 'A', 1], - ['X', 'Y', 'A', 4], - ['X', 'Y', 'B', 5], - ['X', 'Y', 'C', 6], - ['X', 'Y', 'C', 9], - ['X', 'Q', 'C', -6], - ['X', 'R', 'C', -9], - ['V', 'Y', 'C', 7], - ['V', 'R', 'D', 2], - ['V', 'R', 'D', -1], - ['V', 'Q', 'A', -3]], - columns=['col1', 'col2', 'col3', 'val']) - .set_index(['col1', 'col2', 'col3'])) - - result = left.join(right, on=['cola', 'colb', 'colc'], how='left') - - expected = DataFrame([ - ['X', 'Y', 'C', 'a', 6], - ['X', 'Y', 'C', 'a', 9], - ['W', 'Y', 'C', 'e', nan], - ['V', 'Q', 'A', 'h', -3], - ['V', 'R', 'D', 'i', 2], - ['V', 'R', 'D', 'i', -1], - ['X', 'Y', 'D', 'b', nan], - ['X', 'Y', 'A', 'c', 1], - ['X', 'Y', 'A', 'c', 4], - ['W', 'Q', 'B', 'f', 3], - ['W', 'Q', 'B', 'f', 8], - ['W', 'R', 'C', 'g', 0], - ['V', 'Y', 'C', 'j', 7], - ['X', 'Y', 'B', 'd', 5]], - columns=['cola', 'colb', 'colc', 'tag', 'val'], - index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8]) + left = DataFrame( + [ + ["X", "Y", "C", "a"], + ["W", "Y", "C", "e"], + ["V", "Q", "A", "h"], + ["V", "R", "D", "i"], + ["X", "Y", "D", "b"], + ["X", "Y", "A", "c"], + ["W", "Q", "B", "f"], + ["W", "R", "C", "g"], + ["V", "Y", "C", "j"], + ["X", "Y", "B", "d"], + ], + columns=["cola", "colb", "colc", "tag"], + index=[3, 2, 0, 1, 7, 6, 4, 5, 9, 8], + ) + + right = DataFrame( + [ + ["W", "R", "C", 0], + ["W", "Q", "B", 3], + ["W", "Q", "B", 8], + ["X", "Y", "A", 1], + ["X", "Y", "A", 4], + ["X", "Y", "B", 5], + ["X", "Y", "C", 6], + ["X", "Y", "C", 9], + ["X", "Q", "C", -6], + ["X", "R", "C", -9], + ["V", "Y", "C", 7], + ["V", "R", "D", 2], + ["V", "R", "D", -1], + ["V", "Q", "A", -3], + ], + columns=["col1", "col2", "col3", "val"], + ).set_index(["col1", "col2", "col3"]) + + result = left.join(right, on=["cola", "colb", "colc"], how="left") + + expected = DataFrame( + [ + ["X", "Y", "C", "a", 6], + ["X", "Y", "C", "a", 9], + ["W", "Y", "C", "e", nan], + ["V", "Q", "A", "h", -3], + ["V", "R", "D", "i", 2], + ["V", "R", "D", "i", -1], + ["X", "Y", "D", "b", nan], + ["X", "Y", "A", "c", 1], + ["X", "Y", "A", "c", 4], + ["W", "Q", "B", "f", 3], + ["W", "Q", "B", "f", 8], + ["W", "R", "C", "g", 0], + ["V", "Y", "C", "j", 7], + ["X", "Y", "B", "d", 5], + ], + columns=["cola", "colb", "colc", "tag", "val"], + index=[3, 3, 2, 0, 1, 1, 7, 6, 6, 4, 4, 5, 9, 8], + ) tm.assert_frame_equal(result, expected) - result = left.join(right, on=['cola', 'colb', 'colc'], - how='left', sort=True) + result = left.join(right, on=["cola", "colb", "colc"], how="left", sort=True) - expected = expected.sort_values(['cola', 'colb', 'colc'], - kind='mergesort') + expected = expected.sort_values(["cola", "colb", "colc"], kind="mergesort") tm.assert_frame_equal(result, expected) def test_left_join_index_multi_match(self): - left = 
DataFrame([ - ['c', 0], - ['b', 1], - ['a', 2], - ['b', 3]], - columns=['tag', 'val'], - index=[2, 0, 1, 3]) - - right = (DataFrame([ - ['a', 'v'], - ['c', 'w'], - ['c', 'x'], - ['d', 'y'], - ['a', 'z'], - ['c', 'r'], - ['e', 'q'], - ['c', 's']], - columns=['tag', 'char']) - .set_index('tag')) - - result = left.join(right, on='tag', how='left') - - expected = DataFrame([ - ['c', 0, 'w'], - ['c', 0, 'x'], - ['c', 0, 'r'], - ['c', 0, 's'], - ['b', 1, nan], - ['a', 2, 'v'], - ['a', 2, 'z'], - ['b', 3, nan]], - columns=['tag', 'val', 'char'], - index=[2, 2, 2, 2, 0, 1, 1, 3]) + left = DataFrame( + [["c", 0], ["b", 1], ["a", 2], ["b", 3]], + columns=["tag", "val"], + index=[2, 0, 1, 3], + ) + + right = DataFrame( + [ + ["a", "v"], + ["c", "w"], + ["c", "x"], + ["d", "y"], + ["a", "z"], + ["c", "r"], + ["e", "q"], + ["c", "s"], + ], + columns=["tag", "char"], + ).set_index("tag") + + result = left.join(right, on="tag", how="left") + + expected = DataFrame( + [ + ["c", 0, "w"], + ["c", 0, "x"], + ["c", 0, "r"], + ["c", 0, "s"], + ["b", 1, nan], + ["a", 2, "v"], + ["a", 2, "z"], + ["b", 3, nan], + ], + columns=["tag", "val", "char"], + index=[2, 2, 2, 2, 0, 1, 1, 3], + ) tm.assert_frame_equal(result, expected) - result = left.join(right, on='tag', how='left', sort=True) - expected2 = expected.sort_values('tag', kind='mergesort') + result = left.join(right, on="tag", how="left", sort=True) + expected2 = expected.sort_values("tag", kind="mergesort") tm.assert_frame_equal(result, expected2) # GH7331 - maintain left frame order in left merge - result = merge(left, right.reset_index(), how='left', on='tag') + result = merge(left, right.reset_index(), how="left", on="tag") expected.index = np.arange(len(expected)) tm.assert_frame_equal(result, expected) def test_left_merge_na_buglet(self): - left = DataFrame({'id': list('abcde'), 'v1': randn(5), - 'v2': randn(5), 'dummy': list('abcde'), - 'v3': randn(5)}, - columns=['id', 'v1', 'v2', 'dummy', 'v3']) - right = DataFrame({'id': ['a', 'b', np.nan, np.nan, np.nan], - 'sv3': [1.234, 5.678, np.nan, np.nan, np.nan]}) + left = DataFrame( + { + "id": list("abcde"), + "v1": randn(5), + "v2": randn(5), + "dummy": list("abcde"), + "v3": randn(5), + }, + columns=["id", "v1", "v2", "dummy", "v3"], + ) + right = DataFrame( + { + "id": ["a", "b", np.nan, np.nan, np.nan], + "sv3": [1.234, 5.678, np.nan, np.nan, np.nan], + } + ) - result = merge(left, right, on='id', how='left') + result = merge(left, right, on="id", how="left") - rdf = right.drop(['id'], axis=1) + rdf = right.drop(["id"], axis=1) expected = left.join(rdf) tm.assert_frame_equal(result, expected) def test_merge_na_keys(self): - data = [[1950, "A", 1.5], - [1950, "B", 1.5], - [1955, "B", 1.5], - [1960, "B", np.nan], - [1970, "B", 4.], - [1950, "C", 4.], - [1960, "C", np.nan], - [1965, "C", 3.], - [1970, "C", 4.]] + data = [ + [1950, "A", 1.5], + [1950, "B", 1.5], + [1955, "B", 1.5], + [1960, "B", np.nan], + [1970, "B", 4.0], + [1950, "C", 4.0], + [1960, "C", np.nan], + [1965, "C", 3.0], + [1970, "C", 4.0], + ] frame = DataFrame(data, columns=["year", "panel", "data"]) - other_data = [[1960, 'A', np.nan], - [1970, 'A', np.nan], - [1955, 'A', np.nan], - [1965, 'A', np.nan], - [1965, 'B', np.nan], - [1955, 'C', np.nan]] - other = DataFrame(other_data, columns=['year', 'panel', 'data']) + other_data = [ + [1960, "A", np.nan], + [1970, "A", np.nan], + [1955, "A", np.nan], + [1965, "A", np.nan], + [1965, "B", np.nan], + [1955, "C", np.nan], + ] + other = DataFrame(other_data, columns=["year", "panel", "data"]) - 
result = frame.merge(other, how='outer') + result = frame.merge(other, how="outer") - expected = frame.fillna(-999).merge(other.fillna(-999), how='outer') + expected = frame.fillna(-999).merge(other.fillna(-999), how="outer") expected = expected.replace(-999, np.nan) tm.assert_frame_equal(result, expected) @@ -410,9 +445,9 @@ def test_merge_na_keys(self): @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) def test_merge_datetime_index(self, klass): # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) df.index = pd.to_datetime(df.index) on_vector = df.index.year @@ -420,21 +455,16 @@ def test_merge_datetime_index(self, klass): on_vector = klass(on_vector) expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) + OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) ) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) + OrderedDict( + [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] + ) ) result = df.merge(df, on=[df.index.year], how="inner") @@ -444,166 +474,284 @@ def test_join_multi_levels(self): # GH 3662 # merge multi-levels - household = ( - DataFrame( - dict(household_id=[1, 2, 3], - male=[0, 1, 0], - wealth=[196087.3, 316478.7, 294750]), - columns=['household_id', 'male', 'wealth']) - .set_index('household_id')) - portfolio = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - name=["ABN Amro", "Robeco", "Royal Dutch Shell", - "Royal Dutch Shell", - "AAB Eastern Europe Equity Fund", - "Postbank BioTech Fonds", np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'name', 'share']) - .set_index(['household_id', 'asset_id'])) - result = household.join(portfolio, how='inner') + household = DataFrame( + dict( + household_id=[1, 2, 3], + male=[0, 1, 0], + wealth=[196087.3, 316478.7, 294750], + ), + columns=["household_id", "male", "wealth"], + ).set_index("household_id") + portfolio = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "name", "share"], + ).set_index(["household_id", "asset_id"]) + result = household.join(portfolio, how="inner") expected = ( DataFrame( - dict(male=[0, 1, 1, 0, 0, 0], - wealth=[196087.3, 316478.7, 316478.7, - 294750.0, 294750.0, 294750.0], - name=['ABN Amro', 'Robeco', 'Royal Dutch Shell', - 'Royal Dutch Shell', - 'AAB Eastern Europe Equity Fund', - 'Postbank BioTech Fonds'], - share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], - household_id=[1, 2, 2, 3, 3, 3], - asset_id=['nl0000301109', 'nl0000289783', 'gb00b03mlx29', - 'gb00b03mlx29', 'lu0197800237', - 'nl0000289965'])) - .set_index(['household_id', 'asset_id']) - .reindex(columns=['male', 'wealth', 'name', 'share'])) + dict( + male=[0, 1, 1, 0, 0, 0], + 
wealth=[196087.3, 316478.7, 316478.7, 294750.0, 294750.0, 294750.0], + name=[ + "ABN Amro", + "Robeco", + "Royal Dutch Shell", + "Royal Dutch Shell", + "AAB Eastern Europe Equity Fund", + "Postbank BioTech Fonds", + ], + share=[1.00, 0.40, 0.60, 0.15, 0.60, 0.25], + household_id=[1, 2, 2, 3, 3, 3], + asset_id=[ + "nl0000301109", + "nl0000289783", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + ], + ) + ) + .set_index(["household_id", "asset_id"]) + .reindex(columns=["male", "wealth", "name", "share"]) + ) tm.assert_frame_equal(result, expected) # equivalency - result = (merge(household.reset_index(), portfolio.reset_index(), - on=['household_id'], how='inner') - .set_index(['household_id', 'asset_id'])) + result = merge( + household.reset_index(), + portfolio.reset_index(), + on=["household_id"], + how="inner", + ).set_index(["household_id", "asset_id"]) tm.assert_frame_equal(result, expected) - result = household.join(portfolio, how='outer') - expected = (concat([ - expected, - (DataFrame( - dict(share=[1.00]), - index=MultiIndex.from_tuples( - [(4, np.nan)], - names=['household_id', 'asset_id']))) - ], axis=0, sort=True).reindex(columns=expected.columns)) + result = household.join(portfolio, how="outer") + expected = concat( + [ + expected, + ( + DataFrame( + dict(share=[1.00]), + index=MultiIndex.from_tuples( + [(4, np.nan)], names=["household_id", "asset_id"] + ), + ) + ), + ], + axis=0, + sort=True, + ).reindex(columns=expected.columns) tm.assert_frame_equal(result, expected) # invalid cases - household.index.name = 'foo' + household.index.name = "foo" with pytest.raises(ValueError): - household.join(portfolio, how='inner') + household.join(portfolio, how="inner") portfolio2 = portfolio.copy() - portfolio2.index.set_names(['household_id', 'foo']) + portfolio2.index.set_names(["household_id", "foo"]) with pytest.raises(ValueError): - portfolio2.join(portfolio, how='inner') + portfolio2.join(portfolio, how="inner") def test_join_multi_levels2(self): # some more advanced merges # GH6360 - household = ( - DataFrame( - dict(household_id=[1, 2, 2, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "nl0000289965", - np.nan], - share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0]), - columns=['household_id', 'asset_id', 'share']) - .set_index(['household_id', 'asset_id'])) - - log_return = DataFrame(dict( - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 180, 181], - log_return=[.09604978, -.06524096, .03532373, .03025441, .036997] - )).set_index(["asset_id", "t"]) + household = DataFrame( + dict( + household_id=[1, 2, 2, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "nl0000289965", + np.nan, + ], + share=[1.0, 0.4, 0.6, 0.15, 0.6, 0.25, 1.0], + ), + columns=["household_id", "asset_id", "share"], + ).set_index(["household_id", "asset_id"]) + + log_return = DataFrame( + dict( + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 180, 181], + log_return=[0.09604978, -0.06524096, 0.03532373, 0.03025441, 0.036997], + ) + ).set_index(["asset_id", "t"]) expected = ( - DataFrame(dict( - household_id=[2, 2, 2, 3, 3, 3, 3, 3], - asset_id=["gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237"], - t=[233, 234, 235, 233, 234, 235, 180, 181], - 
share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], - log_return=[.09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997] - )) + DataFrame( + dict( + household_id=[2, 2, 2, 3, 3, 3, 3, 3], + asset_id=[ + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + ], + t=[233, 234, 235, 233, 234, 235, 180, 181], + share=[0.6, 0.6, 0.6, 0.15, 0.15, 0.15, 0.6, 0.6], + log_return=[ + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + ], + ) + ) .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) + .reindex(columns=["share", "log_return"]) + ) # this is the equivalency - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='inner') - .set_index(['household_id', 'asset_id', 't'])) + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="inner", + ).set_index(["household_id", "asset_id", "t"]) tm.assert_frame_equal(result, expected) expected = ( - DataFrame(dict( - household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", - "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", - "lu0197800237", "lu0197800237", - "nl0000289965", None], - t=[None, None, 233, 234, 235, 233, 234, - 235, 180, 181, None, None], - share=[1.0, 0.4, 0.6, 0.6, 0.6, 0.15, - 0.15, 0.15, 0.6, 0.6, 0.25, 1.0], - log_return=[None, None, .09604978, -.06524096, .03532373, - .09604978, -.06524096, .03532373, - .03025441, .036997, None, None] - )) + DataFrame( + dict( + household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], + asset_id=[ + "nl0000301109", + "nl0000301109", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "gb00b03mlx29", + "lu0197800237", + "lu0197800237", + "nl0000289965", + None, + ], + t=[None, None, 233, 234, 235, 233, 234, 235, 180, 181, None, None], + share=[ + 1.0, + 0.4, + 0.6, + 0.6, + 0.6, + 0.15, + 0.15, + 0.15, + 0.6, + 0.6, + 0.25, + 1.0, + ], + log_return=[ + None, + None, + 0.09604978, + -0.06524096, + 0.03532373, + 0.09604978, + -0.06524096, + 0.03532373, + 0.03025441, + 0.036997, + None, + None, + ], + ) + ) .set_index(["household_id", "asset_id", "t"]) - .reindex(columns=['share', 'log_return'])) + .reindex(columns=["share", "log_return"]) + ) - result = (merge(household.reset_index(), log_return.reset_index(), - on=['asset_id'], how='outer') - .set_index(['household_id', 'asset_id', 't'])) + result = merge( + household.reset_index(), + log_return.reset_index(), + on=["asset_id"], + how="outer", + ).set_index(["household_id", "asset_id", "t"]) tm.assert_frame_equal(result, expected) class TestJoinMultiMulti: - - def test_join_multi_multi(self, left_multi, right_multi, join_type, - on_cols_multi, idx_cols_multi): + def test_join_multi_multi( + self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + ): # Multi-index join tests - expected = (pd.merge(left_multi.reset_index(), - right_multi.reset_index(), - how=join_type, on=on_cols_multi). 
- set_index(idx_cols_multi).sort_index()) + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) result = left_multi.join(right_multi, how=join_type).sort_index() tm.assert_frame_equal(result, expected) - def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, - on_cols_multi, idx_cols_multi): + def test_join_multi_empty_frames( + self, left_multi, right_multi, join_type, on_cols_multi, idx_cols_multi + ): left_multi = left_multi.drop(columns=left_multi.columns) right_multi = right_multi.drop(columns=right_multi.columns) - expected = (pd.merge(left_multi.reset_index(), - right_multi.reset_index(), - how=join_type, on=on_cols_multi) - .set_index(idx_cols_multi).sort_index()) + expected = ( + pd.merge( + left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, + on=on_cols_multi, + ) + .set_index(idx_cols_multi) + .sort_index() + ) result = left_multi.join(right_multi, how=join_type).sort_index() tm.assert_frame_equal(result, expected) @@ -611,9 +759,9 @@ def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, @pytest.mark.parametrize("box", [None, np.asarray, Series, Index]) def test_merge_datetime_index(self, box): # see gh-19038 - df = DataFrame([1, 2, 3], - ["2016-01-01", "2017-01-01", "2018-01-01"], - columns=["a"]) + df = DataFrame( + [1, 2, 3], ["2016-01-01", "2017-01-01", "2018-01-01"], columns=["a"] + ) df.index = pd.to_datetime(df.index) on_vector = df.index.year @@ -621,46 +769,42 @@ def test_merge_datetime_index(self, box): on_vector = box(on_vector) expected = DataFrame( - OrderedDict([ - ("a", [1, 2, 3]), - ("key_1", [2016, 2017, 2018]), - ]) + OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) ) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict([ - ("key_0", [2016, 2017, 2018]), - ("a_x", [1, 2, 3]), - ("a_y", [1, 2, 3]), - ]) + OrderedDict( + [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] + ) ) result = df.merge(df, on=[df.index.year], how="inner") tm.assert_frame_equal(result, expected) def test_single_common_level(self): - index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) + index_left = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=index_left) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=index_left + ) - index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) + index_right = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index_right) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, + index=index_right, + ) result = left.join(right) - expected = (pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner') - .set_index(['key', 'X', 'Y'])) + expected = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], how="inner" + ).set_index(["key", "X", "Y"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py 
index 031f3abf31b16..6366bf0521fbc 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -15,8 +15,18 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series, - Timestamp, concat, date_range, isna, read_csv) + Categorical, + DataFrame, + DatetimeIndex, + Index, + MultiIndex, + Series, + Timestamp, + concat, + date_range, + isna, + read_csv, +) import pandas.core.common as com from pandas.tests.extension.decimal import to_decimal from pandas.util import testing as tm @@ -47,30 +57,40 @@ class TestConcatAppendCommon: def setup_method(self, method): - dt_data = [pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')] - tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')] - - td_data = [pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Timedelta('3 days')] - - period_data = [pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M')] - - self.data = {'bool': [True, False, True], - 'int64': [1, 2, 3], - 'float64': [1.1, np.nan, 3.3], - 'category': pd.Categorical(['X', 'Y', 'Z']), - 'object': ['a', 'b', 'c'], - 'datetime64[ns]': dt_data, - 'datetime64[ns, US/Eastern]': tz_data, - 'timedelta64[ns]': td_data, - 'period[M]': period_data} + dt_data = [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ] + tz_data = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ] + + td_data = [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), + ] + + period_data = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + ] + + self.data = { + "bool": [True, False, True], + "int64": [1, 2, 3], + "float64": [1.1, np.nan, 3.3], + "category": pd.Categorical(["X", "Y", "Z"]), + "object": ["a", "b", "c"], + "datetime64[ns]": dt_data, + "datetime64[ns, US/Eastern]": tz_data, + "timedelta64[ns]": td_data, + "period[M]": period_data, + } def _check_expected_dtype(self, obj, label): """ @@ -78,13 +98,13 @@ def _check_expected_dtype(self, obj, label): considering not-supported dtypes """ if isinstance(obj, pd.Index): - if label == 'bool': - assert obj.dtype == 'object' + if label == "bool": + assert obj.dtype == "object" else: assert obj.dtype == label elif isinstance(obj, pd.Series): - if label.startswith('period'): - assert obj.dtype == 'Period[M]' + if label.startswith("period"): + assert obj.dtype == "Period[M]" else: assert obj.dtype == label else: @@ -103,10 +123,9 @@ def test_concatlike_same_dtypes(self): vals2 = vals1 vals3 = vals1 - if typ1 == 'category': + if typ1 == "category": exp_data = pd.Categorical(list(vals1) + list(vals2)) - exp_data3 = pd.Categorical(list(vals1) + list(vals2) + - list(vals3)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + list(vals3)) else: exp_data = vals1 + vals2 exp_data3 = vals1 + vals2 + vals3 @@ -124,52 +143,53 @@ def test_concatlike_same_dtypes(self): tm.assert_index_equal(res, exp) # index.append name mismatch - i1 = pd.Index(vals1, name='x') - i2 = pd.Index(vals2, name='y') + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="y") res = i1.append(i2) exp = pd.Index(exp_data) tm.assert_index_equal(res, exp) # index.append name match - i1 = pd.Index(vals1, name='x') - i2 = pd.Index(vals2, 
name='x') + i1 = pd.Index(vals1, name="x") + i2 = pd.Index(vals2, name="x") res = i1.append(i2) - exp = pd.Index(exp_data, name='x') + exp = pd.Index(exp_data, name="x") tm.assert_index_equal(res, exp) # cannot append non-index - with pytest.raises(TypeError, match='all inputs must be Index'): + with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append(vals2) - with pytest.raises(TypeError, match='all inputs must be Index'): + with pytest.raises(TypeError, match="all inputs must be Index"): pd.Index(vals1).append([pd.Index(vals2), vals3]) # ----- Series ----- # # series.append - res = pd.Series(vals1).append(pd.Series(vals2), - ignore_index=True) + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], - ignore_index=True) + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements - res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)], - ignore_index=True) + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) exp = pd.Series(exp_data3) tm.assert_series_equal(res, exp) - res = pd.concat([pd.Series(vals1), pd.Series(vals2), - pd.Series(vals3)], ignore_index=True) + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) tm.assert_series_equal(res, exp) # name mismatch - s1 = pd.Series(vals1, name='x') - s2 = pd.Series(vals2, name='y') + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="y") res = s1.append(s2, ignore_index=True) exp = pd.Series(exp_data) tm.assert_series_equal(res, exp, check_index_type=True) @@ -178,18 +198,20 @@ def test_concatlike_same_dtypes(self): tm.assert_series_equal(res, exp, check_index_type=True) # name match - s1 = pd.Series(vals1, name='x') - s2 = pd.Series(vals2, name='x') + s1 = pd.Series(vals1, name="x") + s2 = pd.Series(vals2, name="x") res = s1.append(s2, ignore_index=True) - exp = pd.Series(exp_data, name='x') + exp = pd.Series(exp_data, name="x") tm.assert_series_equal(res, exp, check_index_type=True) res = pd.concat([s1, s2], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # cannot append non-index - msg = (r"cannot concatenate object of type '.+';" - " only Series and DataFrame objs are valid") + msg = ( + r"cannot concatenate object of type '.+';" + " only Series and DataFrame objs are valid" + ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -216,21 +238,23 @@ def test_concatlike_dtypes_coercion(self): if typ1 == typ2: # same dtype is tested in test_concatlike_same_dtypes continue - elif typ1 == 'category' or typ2 == 'category': + elif typ1 == "category" or typ2 == "category": # ToDo: suspicious continue # specify expected dtype - if typ1 == 'bool' and typ2 in ('int64', 'float64'): + if typ1 == "bool" and typ2 in ("int64", "float64"): # series coerces to numeric based on numpy rule # index doesn't because bool is object dtype exp_series_dtype = typ2 - elif typ2 == 'bool' and typ1 in ('int64', 'float64'): + elif typ2 == "bool" and typ1 in ("int64", "float64"): exp_series_dtype = typ1 - elif (typ1 == 'datetime64[ns, US/Eastern]' or - typ2 == 'datetime64[ns, US/Eastern]' or - typ1 == 'timedelta64[ns]' or - typ2 == 'timedelta64[ns]'): + elif ( + typ1 == "datetime64[ns, US/Eastern]" + or typ2 == "datetime64[ns, US/Eastern]" 
+ or typ1 == "timedelta64[ns]" + or typ2 == "timedelta64[ns]" + ): exp_index_dtype = object exp_series_dtype = object @@ -245,45 +269,48 @@ def test_concatlike_dtypes_coercion(self): tm.assert_index_equal(res, exp) # 3 elements - res = pd.Index(vals1).append([pd.Index(vals2), - pd.Index(vals3)]) + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) exp = pd.Index(exp_data3, dtype=exp_index_dtype) tm.assert_index_equal(res, exp) # ----- Series ----- # # series.append - res = pd.Series(vals1).append(pd.Series(vals2), - ignore_index=True) + res = pd.Series(vals1).append(pd.Series(vals2), ignore_index=True) exp = pd.Series(exp_data, dtype=exp_series_dtype) tm.assert_series_equal(res, exp, check_index_type=True) # concat - res = pd.concat([pd.Series(vals1), pd.Series(vals2)], - ignore_index=True) + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], ignore_index=True) tm.assert_series_equal(res, exp, check_index_type=True) # 3 elements - res = pd.Series(vals1).append([pd.Series(vals2), - pd.Series(vals3)], - ignore_index=True) + res = pd.Series(vals1).append( + [pd.Series(vals2), pd.Series(vals3)], ignore_index=True + ) exp = pd.Series(exp_data3, dtype=exp_series_dtype) tm.assert_series_equal(res, exp) - res = pd.concat([pd.Series(vals1), pd.Series(vals2), - pd.Series(vals3)], ignore_index=True) + res = pd.concat( + [pd.Series(vals1), pd.Series(vals2), pd.Series(vals3)], + ignore_index=True, + ) tm.assert_series_equal(res, exp) def test_concatlike_common_coerce_to_pandas_object(self): # GH 13626 # result must be Timestamp/Timedelta, not datetime.datetime/timedelta - dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02']) - tdi = pd.TimedeltaIndex(['1 days', '2 days']) - - exp = pd.Index([pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-01-02'), - pd.Timedelta('1 days'), - pd.Timedelta('2 days')]) + dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"]) + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ] + ) res = dti.append(tdi) tm.assert_index_equal(res, exp) @@ -305,11 +332,12 @@ def test_concatlike_common_coerce_to_pandas_object(self): def test_concatlike_datetimetz(self, tz_aware_fixture): tz = tz_aware_fixture # GH 7795 - dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz) + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz=tz) - exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02', - '2012-01-01', '2012-01-02'], tz=tz) + exp = pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2012-01-01", "2012-01-02"], tz=tz + ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) @@ -322,20 +350,19 @@ def test_concatlike_datetimetz(self, tz_aware_fixture): res = pd.concat([dts1, dts2]) tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) - @pytest.mark.parametrize('tz', - ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT']) + @pytest.mark.parametrize("tz", ["UTC", "US/Eastern", "Asia/Tokyo", "EST5EDT"]) def test_concatlike_datetimetz_short(self, tz): # GH#7795 - ix1 = pd.date_range(start='2014-07-15', end='2014-07-17', - freq='D', tz=tz) - ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz) - df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B']) - df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B']) - - exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16', - '2014-07-17', '2014-07-11', - '2014-07-21'], 
tz=tz) - exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B']) + ix1 = pd.date_range(start="2014-07-15", end="2014-07-17", freq="D", tz=tz) + ix2 = pd.DatetimeIndex(["2014-07-11", "2014-07-21"], tz=tz) + df1 = pd.DataFrame(0, index=ix1, columns=["A", "B"]) + df2 = pd.DataFrame(0, index=ix2, columns=["A", "B"]) + + exp_idx = pd.DatetimeIndex( + ["2014-07-15", "2014-07-16", "2014-07-17", "2014-07-11", "2014-07-21"], + tz=tz, + ) + exp = pd.DataFrame(0, index=exp_idx, columns=["A", "B"]) tm.assert_frame_equal(df1.append(df2), exp) tm.assert_frame_equal(pd.concat([df1, df2]), exp) @@ -345,13 +372,18 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): # GH 13660 # different tz coerces to object - dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) - dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02']) - - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2012-01-01'), - pd.Timestamp('2012-01-02')], dtype=object) + dti1 = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], tz=tz) + dti2 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"]) + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-02"), + ], + dtype=object, + ) res = dti1.append(dti2) tm.assert_index_equal(res, exp) @@ -365,14 +397,17 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # different tz - dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], - tz='US/Pacific') - - exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2011-01-02', tz=tz), - pd.Timestamp('2012-01-01', tz='US/Pacific'), - pd.Timestamp('2012-01-02', tz='US/Pacific')], - dtype=object) + dti3 = pd.DatetimeIndex(["2012-01-01", "2012-01-02"], tz="US/Pacific") + + exp = pd.Index( + [ + pd.Timestamp("2011-01-01", tz=tz), + pd.Timestamp("2011-01-02", tz=tz), + pd.Timestamp("2012-01-01", tz="US/Pacific"), + pd.Timestamp("2012-01-02", tz="US/Pacific"), + ], + dtype=object, + ) res = dti1.append(dti3) # tm.assert_index_equal(res, exp) @@ -387,11 +422,10 @@ def test_concatlike_datetimetz_to_object(self, tz_aware_fixture): def test_concatlike_common_period(self): # GH 13660 - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M') + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01", "2012-02"], freq="M") - exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01', - '2012-02'], freq='M') + exp = pd.PeriodIndex(["2011-01", "2011-02", "2012-01", "2012-02"], freq="M") res = pi1.append(pi2) tm.assert_index_equal(res, exp) @@ -406,13 +440,18 @@ def test_concatlike_common_period(self): def test_concatlike_common_period_diff_freq_to_object(self): # GH 13221 - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D') - - exp = pd.Index([pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2012-01-01', freq='D'), - pd.Period('2012-02-01', freq='D')], dtype=object) + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + pi2 = pd.PeriodIndex(["2012-01-01", "2012-02-01"], freq="D") + + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2012-01-01", freq="D"), + pd.Period("2012-02-01", freq="D"), + ], + dtype=object, + ) res = pi1.append(pi2) tm.assert_index_equal(res, exp) @@ -428,12 +467,17 @@ 
def test_concatlike_common_period_diff_freq_to_object(self): def test_concatlike_common_period_mixed_dt_to_object(self): # GH 13221 # different datetimelike - pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') - tdi = pd.TimedeltaIndex(['1 days', '2 days']) - exp = pd.Index([pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Timedelta('1 days'), - pd.Timedelta('2 days')], dtype=object) + pi1 = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") + tdi = pd.TimedeltaIndex(["1 days", "2 days"]) + exp = pd.Index( + [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + ], + dtype=object, + ) res = pi1.append(tdi) tm.assert_index_equal(res, exp) @@ -447,10 +491,15 @@ def test_concatlike_common_period_mixed_dt_to_object(self): tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) # inverse - exp = pd.Index([pd.Timedelta('1 days'), - pd.Timedelta('2 days'), - pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M')], dtype=object) + exp = pd.Index( + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + ], + dtype=object, + ) res = tdi.append(pi1) tm.assert_index_equal(res, exp) @@ -467,56 +516,57 @@ def test_concat_categorical(self): # GH 13524 # same categories -> category - s1 = pd.Series([1, 2, np.nan], dtype='category') - s2 = pd.Series([2, 1, 2], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category') + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # partially different categories => not-category - s1 = pd.Series([3, 2], dtype='category') - s2 = pd.Series([2, 1], dtype='category') + s1 = pd.Series([3, 2], dtype="category") + s2 = pd.Series([2, 1], dtype="category") exp = pd.Series([3, 2, 2, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # completely different categories (same dtype) => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') - s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series([np.nan, 1, 3, 2], dtype="category") - exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype='object') + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 - a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c'])) - b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c'])) + a = pd.Series(Categorical(["a", "b", "c"], categories=["a", "b", "c"])) + b = pd.Series(Categorical(["a", "b", "c"], categories=["b", "a", "c"])) result = pd.concat([a, b], ignore_index=True) - expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'])) + expected = pd.Series( + Categorical(["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]) + ) tm.assert_series_equal(result, expected) def test_concat_categorical_coercion(self): # GH 13524 # category + not-category => not-category - s1 = 
pd.Series([1, 2, np.nan], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") s2 = pd.Series([2, 1, 2]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='object') + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # result shouldn't be affected by 1st elem dtype - exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype='object') + exp = pd.Series([2, 1, 2, 1, 2, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all values are not in category => not-category - s1 = pd.Series([3, 2], dtype='category') + s1 = pd.Series([3, 2], dtype="category") s2 = pd.Series([2, 1]) exp = pd.Series([3, 2, 2, 1]) @@ -528,31 +578,31 @@ def test_concat_categorical_coercion(self): tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # completely different categories => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') + s1 = pd.Series([10, 11, np.nan], dtype="category") s2 = pd.Series([1, 3, 2]) - exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype='object') + exp = pd.Series([10, 11, np.nan, 1, 3, 2], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype='object') + exp = pd.Series([1, 3, 2, 10, 11, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # different dtype => not-category - s1 = pd.Series([10, 11, np.nan], dtype='category') - s2 = pd.Series(['a', 'b', 'c']) + s1 = pd.Series([10, 11, np.nan], dtype="category") + s2 = pd.Series(["a", "b", "c"]) - exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c']) + exp = pd.Series([10, 11, np.nan, "a", "b", "c"]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan]) + exp = pd.Series(["a", "b", "c", 10, 11, np.nan]) tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # if normal series only contains NaN-likes => not-category - s1 = pd.Series([10, 11], dtype='category') + s1 = pd.Series([10, 11], dtype="category") s2 = pd.Series([np.nan, np.nan, np.nan]) exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) @@ -567,23 +617,21 @@ def test_concat_categorical_3elem_coercion(self): # GH 13524 # mixed dtypes => not-category - s1 = pd.Series([1, 2, np.nan], dtype='category') - s2 = pd.Series([2, 1, 2], dtype='category') + s1 = pd.Series([1, 2, np.nan], dtype="category") + s2 = pd.Series([2, 1, 2], dtype="category") s3 = pd.Series([1, 2, 1, 2, np.nan]) - exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], - dtype='object') + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) - exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], - dtype='object') + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2], dtype="object") tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) # values are all in either 
category => not-category - s1 = pd.Series([4, 5, 6], dtype='category') - s2 = pd.Series([1, 2, 3], dtype='category') + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([1, 3, 4]) exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) @@ -595,8 +643,8 @@ def test_concat_categorical_3elem_coercion(self): tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) # values are all in either category => not-category - s1 = pd.Series([4, 5, 6], dtype='category') - s2 = pd.Series([1, 2, 3], dtype='category') + s1 = pd.Series([4, 5, 6], dtype="category") + s2 = pd.Series([1, 2, 3], dtype="category") s3 = pd.Series([10, 11, 12]) exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) @@ -610,12 +658,12 @@ def test_concat_categorical_3elem_coercion(self): def test_concat_categorical_multi_coercion(self): # GH 13524 - s1 = pd.Series([1, 3], dtype='category') - s2 = pd.Series([3, 4], dtype='category') + s1 = pd.Series([1, 3], dtype="category") + s2 = pd.Series([3, 4], dtype="category") s3 = pd.Series([2, 3]) - s4 = pd.Series([2, 2], dtype='category') + s4 = pd.Series([2, 2], dtype="category") s5 = pd.Series([1, np.nan]) - s6 = pd.Series([1, 3, 2], dtype='category') + s6 = pd.Series([1, 3, 2], dtype="category") # mixed dtype, values are all in categories => not-category exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) @@ -640,8 +688,9 @@ def test_concat_categorical_ordered(self): tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], - ordered=True)) + exp = pd.Series( + pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], ordered=True) + ) tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) @@ -650,23 +699,22 @@ def test_concat_categorical_coercion_nan(self): # some edge cases # category + not-category => not category - s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), - dtype='category') + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), dtype="category") s2 = pd.Series([np.nan, 1]) exp = pd.Series([np.nan, np.nan, np.nan, 1]) tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) - s1 = pd.Series([1, np.nan], dtype='category') + s1 = pd.Series([1, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) - exp = pd.Series([1, np.nan, np.nan, np.nan], dtype='object') + exp = pd.Series([1, np.nan, np.nan, np.nan], dtype="object") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) # mixed dtype, all nan-likes => not-category - s1 = pd.Series([np.nan, np.nan], dtype='category') + s1 = pd.Series([np.nan, np.nan], dtype="category") s2 = pd.Series([np.nan, np.nan]) exp = pd.Series([np.nan, np.nan, np.nan, np.nan]) @@ -676,10 +724,10 @@ def test_concat_categorical_coercion_nan(self): tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) # all category nan-likes => category - s1 = pd.Series([np.nan, np.nan], dtype='category') - s2 = pd.Series([np.nan, np.nan], dtype='category') + s1 = pd.Series([np.nan, np.nan], dtype="category") + s2 = pd.Series([np.nan, np.nan], dtype="category") - exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category') + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype="category") 
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) @@ -687,8 +735,8 @@ def test_concat_categorical_coercion_nan(self): def test_concat_categorical_empty(self): # GH 13524 - s1 = pd.Series([], dtype='category') - s2 = pd.Series([1, 2], dtype='category') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([1, 2], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) @@ -696,14 +744,14 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') - s2 = pd.Series([], dtype='category') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="category") tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') - s2 = pd.Series([], dtype='object') + s1 = pd.Series([], dtype="category") + s2 = pd.Series([], dtype="object") # different dtype => not-category tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) @@ -711,7 +759,7 @@ def test_concat_categorical_empty(self): tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) - s1 = pd.Series([], dtype='category') + s1 = pd.Series([], dtype="category") s2 = pd.Series([np.nan, np.nan]) # empty Series is ignored @@ -724,30 +772,28 @@ def test_concat_categorical_empty(self): def test_concat_join_axes_deprecated(self, axis): # GH21951 - one = pd.DataFrame([[0., 1.], [2., 3.]], columns=list('ab')) - two = pd.DataFrame([[10., 11.], [12., 13.]], index=[1, 2], - columns=list('bc')) + one = pd.DataFrame([[0.0, 1.0], [2.0, 3.0]], columns=list("ab")) + two = pd.DataFrame( + [[10.0, 11.0], [12.0, 13.0]], index=[1, 2], columns=list("bc") + ) - expected = pd.concat([one, two], - axis=1, sort=False).reindex(index=two.index) + expected = pd.concat([one, two], axis=1, sort=False).reindex(index=two.index) with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], - axis=1, sort=False, join_axes=[two.index]) + result = pd.concat([one, two], axis=1, sort=False, join_axes=[two.index]) tm.assert_frame_equal(result, expected) - expected = pd.concat([one, two], - axis=0, sort=False).reindex(columns=two.columns) + expected = pd.concat([one, two], axis=0, sort=False).reindex( + columns=two.columns + ) with tm.assert_produces_warning(expected_warning=FutureWarning): - result = pd.concat([one, two], - axis=0, sort=False, join_axes=[two.columns]) + result = pd.concat([one, two], axis=0, sort=False, join_axes=[two.columns]) tm.assert_frame_equal(result, expected) class TestAppend: - def test_append(self, sort, float_frame): mixed_frame = float_frame.copy() - mixed_frame['foo'] = 'bar' + mixed_frame["foo"] = "bar" begin_index = float_frame.index[:5] end_index = float_frame.index[5:] @@ -756,14 +802,14 @@ def test_append(self, sort, float_frame): end_frame = float_frame.reindex(end_index) appended = begin_frame.append(end_frame) - tm.assert_almost_equal(appended['A'], float_frame['A']) + tm.assert_almost_equal(appended["A"], float_frame["A"]) - del end_frame['A'] + del end_frame["A"] partial_appended = begin_frame.append(end_frame, sort=sort) - assert 'A' in partial_appended + assert "A" in partial_appended partial_appended 
= end_frame.append(begin_frame, sort=sort) - assert 'A' in partial_appended + assert "A" in partial_appended # mixed type handling appended = mixed_frame[:5].append(mixed_frame[5:]) @@ -775,8 +821,9 @@ def test_append(self, sort, float_frame): # all equal except 'foo' column tm.assert_frame_equal( - mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), - mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + mixed_appended.reindex(columns=["A", "B", "C", "D"]), + mixed_appended2.reindex(columns=["A", "B", "C", "D"]), + ) def test_append_empty(self, float_frame): empty = DataFrame() @@ -796,29 +843,32 @@ def test_append_overlap_raises(self, float_frame): def test_append_new_columns(self): # see gh-6129: new columns - df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) - row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') - expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { - 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) + df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}}) + row = Series([5, 6, 7], index=["a", "b", "c"], name="z") + expected = DataFrame( + { + "a": {"x": 1, "y": 2, "z": 5}, + "b": {"x": 3, "y": 4, "z": 6}, + "c": {"z": 7}, + } + ) result = df.append(row) tm.assert_frame_equal(result, expected) def test_append_length0_frame(self, sort): - df = DataFrame(columns=['A', 'B', 'C']) - df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df = DataFrame(columns=["A", "B", "C"]) + df3 = DataFrame(index=[0, 1], columns=["A", "B"]) df5 = df.append(df3, sort=sort) - expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + expected = DataFrame(index=[0, 1], columns=["A", "B", "C"]) assert_frame_equal(df5, expected) def test_append_records(self): - arr1 = np.zeros((2,), dtype=('i4,f4,a10')) - arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] + arr1 = np.zeros((2,), dtype=("i4,f4,a10")) + arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] - arr2 = np.zeros((3,), dtype=('i4,f4,a10')) - arr2[:] = [(3, 4., 'foo'), - (5, 6., "bar"), - (7., 8., 'baz')] + arr2 = np.zeros((3,), dtype=("i4,f4,a10")) + arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")] df1 = DataFrame(arr1) df2 = DataFrame(arr2) @@ -829,15 +879,14 @@ def test_append_records(self): # rewrite sort fixture, since we also want to test default of None def test_append_sorts(self, sort_with_none): - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) - df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3]) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) + df2 = pd.DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3]) if sort_with_none is None: # only warn if not explicitly specified # don't check stacklevel since its set for concat, and append # has an extra stack. 
- ctx = tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) + ctx = tm.assert_produces_warning(FutureWarning, check_stacklevel=False) else: ctx = tm.assert_produces_warning(None) @@ -845,74 +894,83 @@ def test_append_sorts(self, sort_with_none): result = df1.append(df2, sort=sort_with_none) # for None / True - expected = pd.DataFrame({"b": [1, 2, None, None], - "a": [1, 2, 1, 2], - "c": [None, None, 3, 4]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]}, + columns=["a", "b", "c"], + ) if sort_with_none is False: - expected = expected[['b', 'a', 'c']] + expected = expected[["b", "a", "c"]] tm.assert_frame_equal(result, expected) def test_append_different_columns(self, sort): - df = DataFrame({'bools': np.random.randn(10) > 0, - 'ints': np.random.randint(0, 10, 10), - 'floats': np.random.randn(10), - 'strings': ['foo', 'bar'] * 5}) + df = DataFrame( + { + "bools": np.random.randn(10) > 0, + "ints": np.random.randint(0, 10, 10), + "floats": np.random.randn(10), + "strings": ["foo", "bar"] * 5, + } + ) - a = df[:5].loc[:, ['bools', 'ints', 'floats']] - b = df[5:].loc[:, ['strings', 'ints', 'floats']] + a = df[:5].loc[:, ["bools", "ints", "floats"]] + b = df[5:].loc[:, ["strings", "ints", "floats"]] appended = a.append(b, sort=sort) - assert isna(appended['strings'][0:4]).all() - assert isna(appended['bools'][5:]).all() + assert isna(appended["strings"][0:4]).all() + assert isna(appended["bools"][5:]).all() def test_append_many(self, sort, float_frame): - chunks = [float_frame[:5], float_frame[5:10], - float_frame[10:15], float_frame[15:]] + chunks = [ + float_frame[:5], + float_frame[5:10], + float_frame[10:15], + float_frame[15:], + ] result = chunks[0].append(chunks[1:]) tm.assert_frame_equal(result, float_frame) chunks[-1] = chunks[-1].copy() - chunks[-1]['foo'] = 'bar' + chunks[-1]["foo"] = "bar" result = chunks[0].append(chunks[1:], sort=sort) tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame) - assert (result['foo'][15:] == 'bar').all() - assert result['foo'][:15].isna().all() + assert (result["foo"][15:] == "bar").all() + assert result["foo"][:15].isna().all() def test_append_preserve_index_name(self): # #980 - df1 = DataFrame(columns=['A', 'B', 'C']) - df1 = df1.set_index(['A']) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], - columns=['A', 'B', 'C']) - df2 = df2.set_index(['A']) + df1 = DataFrame(columns=["A", "B", "C"]) + df1 = df1.set_index(["A"]) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) + df2 = df2.set_index(["A"]) result = df1.append(df2) - assert result.index.name == 'A' + assert result.index.name == "A" indexes_can_append = [ pd.RangeIndex(3), pd.Index([4, 5, 6]), pd.Index([4.5, 5.5, 6.5]), - pd.Index(list('abc')), - pd.CategoricalIndex('A B C'.split()), - pd.CategoricalIndex('D E F'.split(), ordered=True), + pd.Index(list("abc")), + pd.CategoricalIndex("A B C".split()), + pd.CategoricalIndex("D E F".split(), ordered=True), pd.IntervalIndex.from_breaks([7, 8, 9, 10]), - pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 3, 7, 12)]), + pd.DatetimeIndex( + [ + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 3, 7, 12), + ] + ), ] indexes_cannot_append_with_other = [ - pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]), + pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]) ] all_indexes = 
indexes_can_append + indexes_cannot_append_with_other - @pytest.mark.parametrize("index", - all_indexes, - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize("index", all_indexes, ids=lambda x: x.__class__.__name__) def test_append_same_columns_type(self, index): # GH18359 @@ -921,9 +979,9 @@ def test_append_same_columns_type(self, index): ser_index = index[:2] ser = pd.Series([7, 8], index=ser_index, name=2) result = df.append(ser) - expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]], - index=[0, 1, 2], - columns=index) + expected = pd.DataFrame( + [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index + ) assert_frame_equal(result, expected) # ser wider than df @@ -932,14 +990,18 @@ def test_append_same_columns_type(self, index): df = pd.DataFrame([[1, 2], [4, 5]], columns=index) ser = pd.Series([7, 8, 9], index=ser_index, name=2) result = df.append(ser) - expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], - index=[0, 1, 2], - columns=ser_index) + expected = pd.DataFrame( + [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]], + index=[0, 1, 2], + columns=ser_index, + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize("df_columns, series_index", - combinations(indexes_can_append, r=2), - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize( + "df_columns, series_index", + combinations(indexes_can_append, r=2), + ids=lambda x: x.__class__.__name__, + ) def test_append_different_columns_types(self, df_columns, series_index): # GH18359 # See also test 'test_append_different_columns_types_raises' below @@ -951,20 +1013,28 @@ def test_append_different_columns_types(self, df_columns, series_index): result = df.append(ser) idx_diff = ser.index.difference(df_columns) combined_columns = Index(df_columns.tolist()).append(idx_diff) - expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan], - [4, 5, 6, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, 7, 8, 9]], - index=[0, 1, 2], - columns=combined_columns) + expected = pd.DataFrame( + [ + [1.0, 2.0, 3.0, np.nan, np.nan, np.nan], + [4, 5, 6, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, 7, 8, 9], + ], + index=[0, 1, 2], + columns=combined_columns, + ) assert_frame_equal(result, expected) - @pytest.mark.parametrize('index_can_append', indexes_can_append, - ids=lambda x: x.__class__.__name__) - @pytest.mark.parametrize('index_cannot_append_with_other', - indexes_cannot_append_with_other, - ids=lambda x: x.__class__.__name__) + @pytest.mark.parametrize( + "index_can_append", indexes_can_append, ids=lambda x: x.__class__.__name__ + ) + @pytest.mark.parametrize( + "index_cannot_append_with_other", + indexes_cannot_append_with_other, + ids=lambda x: x.__class__.__name__, + ) def test_append_different_columns_types_raises( - self, index_can_append, index_cannot_append_with_other): + self, index_can_append, index_cannot_append_with_other + ): # GH18359 # Dataframe.append will raise if MultiIndex appends # or is appended to a different index type @@ -973,17 +1043,19 @@ def test_append_different_columns_types_raises( # appending without raising. 
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append) - ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, - name=2) - msg = (r"Expected tuple, got (int|long|float|str|" - r"pandas._libs.interval.Interval)|" - r"object of type '(int|float|Timestamp|" - r"pandas._libs.interval.Interval)' has no len\(\)|") + ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other, name=2) + msg = ( + r"Expected tuple, got (int|long|float|str|" + r"pandas._libs.interval.Interval)|" + r"object of type '(int|float|Timestamp|" + r"pandas._libs.interval.Interval)' has no len\(\)|" + ) with pytest.raises(TypeError, match=msg): df.append(ser) - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], - columns=index_cannot_append_with_other) + df = pd.DataFrame( + [[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other + ) ser = pd.Series([7, 8, 9], index=index_can_append, name=2) with pytest.raises(TypeError, match=msg): @@ -994,69 +1066,87 @@ def test_append_dtype_coerce(self, sort): # GH 4993 # appending with datetime will incorrectly convert datetime64 - df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0)], - columns=['start_time']) - df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), - dt.datetime(2013, 1, 4, 7, 10)]], - columns=['start_time', 'end_time']) - - expected = concat([Series([pd.NaT, - pd.NaT, - dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10)], - name='end_time'), - Series([dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], - axis=1, sort=sort) + df1 = DataFrame( + index=[1, 2], + data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)], + columns=["start_time"], + ) + df2 = DataFrame( + index=[4, 5], + data=[ + [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)], + ], + columns=["start_time", "end_time"], + ) + + expected = concat( + [ + Series( + [ + pd.NaT, + pd.NaT, + dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10), + ], + name="end_time", + ), + Series( + [ + dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0), + ], + name="start_time", + ), + ], + axis=1, + sort=sort, + ) result = df1.append(df2, ignore_index=True, sort=sort) if sort: - expected = expected[['end_time', 'start_time']] + expected = expected[["end_time", "start_time"]] else: - expected = expected[['start_time', 'end_time']] + expected = expected[["start_time", "end_time"]] assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self, sort): - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) - df2 = DataFrame({'B': np.array([True, False, True, False], - dtype=bool)}) + df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")}) + df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)}) appended = df1.append(df2, ignore_index=True, sort=sort) - assert appended['A'].dtype == 'f8' - assert appended['B'].dtype == 'O' + assert appended["A"].dtype == "f8" + assert appended["B"].dtype == "O" def test_append_empty_frame_to_series_with_dateutil_tz(self): # GH 23682 - date = Timestamp('2018-10-24 07:30:00', tz=dateutil.tz.tzutc()) - s = Series({'date': date, 'a': 1.0, 'b': 2.0}) - df = DataFrame(columns=['c', 'd']) + date = 
Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) + s = Series({"date": date, "a": 1.0, "b": 2.0}) + df = DataFrame(columns=["c", "d"]) result = df.append(s, ignore_index=True) # n.b. it's not clear to me that expected is correct here. # It's possible that the `date` column should have # datetime64[ns, tz] dtype for both result and expected. # that would be more consistent with new columns having # their own dtype (float for a and b, datetime64ns, tz for date). - expected = DataFrame([[np.nan, np.nan, 1., 2., date]], - columns=['c', 'd', 'a', 'b', 'date'], - dtype=object) + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]], + columns=["c", "d", "a", "b", "date"], + dtype=object, + ) # These columns get cast to object after append - expected['a'] = expected['a'].astype(float) - expected['b'] = expected['b'].astype(float) + expected["a"] = expected["a"].astype(float) + expected["b"] = expected["b"].astype(float) assert_frame_equal(result, expected) class TestConcatenate: - def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) - df3 = DataFrame({5: 'foo'}, index=range(4)) + df3 = DataFrame({5: "foo"}, index=range(4)) # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) @@ -1095,17 +1185,15 @@ def test_concat_with_group_keys(self): df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1]) - exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], - [0, 1, 2, 0, 1, 2, 3]]) - expected = DataFrame(np.r_[df.values, df2.values], - index=exp_index) + exp_index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]] + ) + expected = DataFrame(np.r_[df.values, df2.values], index=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1]) - exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2]]) - expected = DataFrame(np.r_[df.values, df.values], - index=exp_index2) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], index=exp_index2) tm.assert_frame_equal(result, expected) # axis=1 @@ -1113,53 +1201,55 @@ def test_concat_with_group_keys(self): df2 = DataFrame(np.random.randn(4, 4)) result = concat([df, df2], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df2.values], - columns=exp_index) + expected = DataFrame(np.c_[df.values, df2.values], columns=exp_index) tm.assert_frame_equal(result, expected) result = concat([df, df], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df.values], - columns=exp_index2) + expected = DataFrame(np.c_[df.values, df.values], columns=exp_index2) tm.assert_frame_equal(result, expected) def test_concat_keys_specific_levels(self): df = DataFrame(np.random.randn(10, 4)) pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]] - level = ['three', 'two', 'one', 'zero'] - result = concat(pieces, axis=1, keys=['one', 'two', 'three'], - levels=[level], - names=['group_key']) + level = ["three", "two", "one", "zero"] + result = concat( + pieces, + axis=1, + keys=["one", "two", "three"], + levels=[level], + names=["group_key"], + ) - tm.assert_index_equal(result.columns.levels[0], - Index(level, name='group_key')) - assert result.columns.names[0] == 'group_key' + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) + assert result.columns.names[0] == "group_key" def test_concat_dataframe_keys_bug(self, sort): - t1 = DataFrame({ - 
'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], - name='id'))}) - t2 = DataFrame({ - 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) + t1 = DataFrame( + {"value": Series([1, 2, 3], index=Index(["a", "b", "c"], name="id"))} + ) + t2 = DataFrame({"value": Series([7, 8], index=Index(["a", "b"], name="id"))}) # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort) - assert list(result.columns) == [('t1', 'value'), ('t2', 'value')] + result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) + assert list(result.columns) == [("t1", "value"), ("t2", "value")] def test_concat_series_partial_columns_names(self): # GH10698 - foo = Series([1, 2], name='foo') + foo = Series([1, 2], name="foo") bar = Series([1, 2]) baz = Series([4, 5]) result = concat([foo, bar, baz], axis=1) - expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ - 4, 5]}, columns=['foo', 0, 1]) + expected = DataFrame( + {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1] + ) tm.assert_frame_equal(result, expected) - result = concat([foo, bar, baz], axis=1, keys=[ - 'red', 'blue', 'yellow']) - expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ - 4, 5]}, columns=['red', 'blue', 'yellow']) + result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"]) + expected = DataFrame( + {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]}, + columns=["red", "blue", "yellow"], + ) tm.assert_frame_equal(result, expected) result = concat([foo, bar, baz], axis=1, ignore_index=True) @@ -1167,10 +1257,12 @@ def test_concat_series_partial_columns_names(self): tm.assert_frame_equal(result, expected) def test_concat_dict(self): - frames = {'foo': DataFrame(np.random.randn(4, 3)), - 'bar': DataFrame(np.random.randn(4, 3)), - 'baz': DataFrame(np.random.randn(4, 3)), - 'qux': DataFrame(np.random.randn(4, 3))} + frames = { + "foo": DataFrame(np.random.randn(4, 3)), + "bar": DataFrame(np.random.randn(4, 3)), + "baz": DataFrame(np.random.randn(4, 3)), + "qux": DataFrame(np.random.randn(4, 3)), + } sorted_keys = com.dict_keys_to_ordered_list(frames) @@ -1179,198 +1271,226 @@ def test_concat_dict(self): tm.assert_frame_equal(result, expected) result = concat(frames, axis=1) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, - axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, axis=1) tm.assert_frame_equal(result, expected) - keys = ['baz', 'foo', 'bar'] + keys = ["baz", "foo", "bar"] result = concat(frames, keys=keys) expected = concat([frames[k] for k in keys], keys=keys) tm.assert_frame_equal(result, expected) def test_concat_ignore_index(self, sort): - frame1 = DataFrame({"test1": ["a", "b", "c"], - "test2": [1, 2, 3], - "test3": [4.5, 3.2, 1.2]}) + frame1 = DataFrame( + {"test1": ["a", "b", "c"], "test2": [1, 2, 3], "test3": [4.5, 3.2, 1.2]} + ) frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) frame1.index = Index(["x", "y", "z"]) frame2.index = Index(["x", "y", "q"]) - v1 = concat([frame1, frame2], axis=1, - ignore_index=True, sort=sort) + v1 = concat([frame1, frame2], axis=1, ignore_index=True, sort=sort) nan = np.nan - expected = DataFrame([[nan, nan, nan, 4.3], - ['a', 1, 4.5, 5.2], - ['b', 2, 3.2, 2.2], - ['c', 3, 1.2, nan]], - index=Index(["q", "x", "y", "z"])) + expected = DataFrame( + [ + [nan, nan, nan, 4.3], + ["a", 1, 4.5, 5.2], + ["b", 2, 3.2, 2.2], + ["c", 3, 1.2, nan], + ], + index=Index(["q", "x", "y", "z"]), + ) if not sort: - expected = expected.loc[['x', 'y', 'z', 'q']] + expected = expected.loc[["x", "y", "z", 
"q"]] tm.assert_frame_equal(v1, expected) def test_concat_multiindex_with_keys(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - result = concat([frame, frame], keys=[0, 1], names=['iteration']) - - assert result.index.names == ('iteration',) + index.names + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + result = concat([frame, frame], keys=[0, 1], names=["iteration"]) + + assert result.index.names == ("iteration",) + index.names tm.assert_frame_equal(result.loc[0], frame) tm.assert_frame_equal(result.loc[1], frame) assert result.index.nlevels == 3 def test_concat_multiindex_with_tz(self): # GH 6606 - df = DataFrame({'dt': [datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3)], - 'b': ['A', 'B', 'C'], - 'c': [1, 2, 3], 'd': [4, 5, 6]}) - df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) - df = df.set_index(['dt', 'b']) - - exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', - '2014-01-03'] * 2, - tz='US/Pacific', name='dt') - exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') + df = DataFrame( + { + "dt": [ + datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3), + ], + "b": ["A", "B", "C"], + "c": [1, 2, 3], + "d": [4, 5, 6], + } + ) + df["dt"] = df["dt"].apply(lambda d: Timestamp(d, tz="US/Pacific")) + df = df.set_index(["dt", "b"]) + + exp_idx1 = DatetimeIndex( + ["2014-01-01", "2014-01-02", "2014-01-03"] * 2, tz="US/Pacific", name="dt" + ) + exp_idx2 = Index(["A", "B", "C"] * 2, name="b") exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, - index=exp_idx, columns=['c', 'd']) + expected = DataFrame( + {"c": [1, 2, 3] * 2, "d": [4, 5, 6] * 2}, index=exp_idx, columns=["c", "d"] + ) result = concat([df, df]) tm.assert_frame_equal(result, expected) def test_concat_multiindex_with_none_in_index_names(self): # GH 15787 - index = pd.MultiIndex.from_product([[1], range(5)], - names=['level1', None]) - df = pd.DataFrame({'col': range(5)}, index=index, dtype=np.int32) - - result = concat([df, df], keys=[1, 2], names=['level2']) - index = pd.MultiIndex.from_product([[1, 2], [1], range(5)], - names=['level2', 'level1', None]) - expected = pd.DataFrame({'col': list(range(5)) * 2}, - index=index, dtype=np.int32) + index = pd.MultiIndex.from_product([[1], range(5)], names=["level1", None]) + df = pd.DataFrame({"col": range(5)}, index=index, dtype=np.int32) + + result = concat([df, df], keys=[1, 2], names=["level2"]) + index = pd.MultiIndex.from_product( + [[1, 2], [1], range(5)], names=["level2", "level1", None] + ) + expected = pd.DataFrame( + {"col": list(range(5)) * 2}, index=index, dtype=np.int32 + ) assert_frame_equal(result, expected) - result = concat([df, df[:2]], keys=[1, 2], names=['level2']) + result = concat([df, df[:2]], keys=[1, 2], names=["level2"]) level2 = [1] * 5 + [2] * 2 level1 = [1] * 7 no_name = list(range(5)) + list(range(2)) tuples = list(zip(level2, level1, no_name)) - index = pd.MultiIndex.from_tuples(tuples, - names=['level2', 'level1', None]) - expected = 
pd.DataFrame({'col': no_name}, index=index, - dtype=np.int32) + index = pd.MultiIndex.from_tuples(tuples, names=["level2", "level1", None]) + expected = pd.DataFrame({"col": no_name}, index=index, dtype=np.int32) assert_frame_equal(result, expected) def test_concat_keys_and_levels(self): df = DataFrame(np.random.randn(1, 3)) df2 = DataFrame(np.random.randn(1, 4)) - levels = [['foo', 'baz'], ['one', 'two']] - names = ['first', 'second'] - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels, - names=names) + levels = [["foo", "baz"], ["one", "two"]] + names = ["first", "second"] + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + names=names, + ) expected = concat([df, df2, df, df2]) - exp_index = MultiIndex(levels=levels + [[0]], - codes=[[0, 0, 1, 1], [0, 1, 0, 1], - [0, 0, 0, 0]], - names=names + [None]) + exp_index = MultiIndex( + levels=levels + [[0]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 0, 0, 0]], + names=names + [None], + ) expected.index = exp_index tm.assert_frame_equal(result, expected) # no names - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels) + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + levels=levels, + ) assert result.index.names == (None,) * 3 # no levels - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - names=['first', 'second']) - assert result.index.names == ('first', 'second') + (None,) - tm.assert_index_equal(result.index.levels[0], - Index(['baz', 'foo'], name='first')) + result = concat( + [df, df2, df, df2], + keys=[("foo", "one"), ("foo", "two"), ("baz", "one"), ("baz", "two")], + names=["first", "second"], + ) + assert result.index.names == ("first", "second") + (None,) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) def test_concat_keys_levels_no_overlap(self): # GH #1406 - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) msg = "Values not found in passed level" with pytest.raises(ValueError, match=msg): - concat([df, df], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + concat([df, df], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) msg = "Key one not in level" with pytest.raises(ValueError, match=msg): - concat([df, df2], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + concat([df, df2], keys=["one", "two"], levels=[["foo", "bar", "baz"]]) def test_concat_rename_index(self): - a = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_a')) - b = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_b')) + a = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_a"), + ) + b = DataFrame( + np.random.rand(3, 3), + columns=list("ABC"), + index=Index(list("abc"), name="index_b"), + ) - result = concat([a, b], keys=['key0', 'key1'], - names=['lvl0', 'lvl1']) + result = concat([a, b], keys=["key0", "key1"], names=["lvl0", "lvl1"]) - exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + exp = concat([a, b], keys=["key0", "key1"], names=["lvl0"]) 
names = list(exp.index.names) - names[1] = 'lvl1' + names[1] = "lvl1" exp.index.set_names(names, inplace=True) tm.assert_frame_equal(result, exp) assert result.index.names == exp.index.names def test_crossed_dtypes_weird_corner(self): - columns = ['A', 'B', 'C', 'D'] - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), - 'B': np.array([1, 2, 3, 4], dtype='i8'), - 'C': np.array([1, 2, 3, 4], dtype='f8'), - 'D': np.array([1, 2, 3, 4], dtype='i8')}, - columns=columns) - - df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), - 'B': np.array([1, 2, 3, 4], dtype='f8'), - 'C': np.array([1, 2, 3, 4], dtype='i8'), - 'D': np.array([1, 2, 3, 4], dtype='f8')}, - columns=columns) + columns = ["A", "B", "C", "D"] + df1 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="f8"), + "B": np.array([1, 2, 3, 4], dtype="i8"), + "C": np.array([1, 2, 3, 4], dtype="f8"), + "D": np.array([1, 2, 3, 4], dtype="i8"), + }, + columns=columns, + ) + + df2 = DataFrame( + { + "A": np.array([1, 2, 3, 4], dtype="i8"), + "B": np.array([1, 2, 3, 4], dtype="f8"), + "C": np.array([1, 2, 3, 4], dtype="i8"), + "D": np.array([1, 2, 3, 4], dtype="f8"), + }, + columns=columns, + ) appended = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), - columns=columns) + expected = DataFrame( + np.concatenate([df1.values, df2.values], axis=0), columns=columns + ) tm.assert_frame_equal(appended, expected) - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - result = concat( - [df, df2], keys=['one', 'two'], names=['first', 'second']) - assert result.index.names == ('first', 'second') + df = DataFrame(np.random.randn(1, 3), index=["a"]) + df2 = DataFrame(np.random.randn(1, 4), index=["b"]) + result = concat([df, df2], keys=["one", "two"], names=["first", "second"]) + assert result.index.names == ("first", "second") def test_dups_index(self): # GH 4771 # single dtypes - df = DataFrame(np.random.randint(0, 10, size=40).reshape( - 10, 4), columns=['A', 'A', 'C', 'C']) + df = DataFrame( + np.random.randint(0, 10, size=40).reshape(10, 4), + columns=["A", "A", "C", "C"], + ) result = concat([df, df], axis=1) assert_frame_equal(result.iloc[:, :4], df) @@ -1381,12 +1501,15 @@ def test_dups_index(self): assert_frame_equal(result.iloc[10:], df) # multi dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) + df = concat( + [ + DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]), + DataFrame( + np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"] + ), + ], + axis=1, + ) result = concat([df, df], axis=1) assert_frame_equal(result.iloc[:, :6], df) @@ -1410,32 +1533,33 @@ def test_dups_index(self): def test_with_mixed_tuples(self, sort): # 10697 # columns have mixed tuples, so handle properly - df1 = DataFrame({'A': 'foo', ('B', 1): 'bar'}, index=range(2)) - df2 = DataFrame({'B': 'foo', ('B', 1): 'bar'}, index=range(2)) + df1 = DataFrame({"A": "foo", ("B", 1): "bar"}, index=range(2)) + df2 = DataFrame({"B": "foo", ("B", 1): "bar"}, index=range(2)) # it works concat([df1, df2], sort=sort) def test_handle_empty_objects(self, sort): - df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + df = DataFrame(np.random.randn(10, 4), columns=list("abcd")) baz = df[:5].copy() - baz['foo'] = 'bar' + baz["foo"] = "bar" empty = df[5:5] frames = [baz, empty, empty, df[5:]] concatted 
= concat(frames, axis=0, sort=sort) - expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo']) - expected['foo'] = expected['foo'].astype('O') - expected.loc[0:4, 'foo'] = 'bar' + expected = df.reindex(columns=["a", "b", "c", "d", "foo"]) + expected["foo"] = expected["foo"].astype("O") + expected.loc[0:4, "foo"] = "bar" tm.assert_frame_equal(concatted, expected) # empty as first element with time series # GH3259 - df = DataFrame(dict(A=range(10000)), index=date_range( - '20130101', periods=10000, freq='s')) + df = DataFrame( + dict(A=range(10000)), index=date_range("20130101", periods=10000, freq="s") + ) empty = DataFrame() result = concat([df, empty], axis=1) assert_frame_equal(result, df) @@ -1453,54 +1577,62 @@ def test_concat_mixed_objs(self): # G2385 # axis 1 - index = date_range('01-Jan-2013', periods=10, freq='H') - arr = np.arange(10, dtype='int64') + index = date_range("01-Jan-2013", periods=10, freq="H") + arr = np.arange(10, dtype="int64") s1 = Series(arr, index=index) s2 = Series(arr, index=index) df = DataFrame(arr.reshape(-1, 1), index=index) - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 0]) + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 0] + ) result = concat([df, df], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 1]) + expected = DataFrame( + np.repeat(arr, 2).reshape(-1, 2), index=index, columns=[0, 1] + ) result = concat([s1, s2], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) result = concat([s1, s2, s1], axis=1) assert_frame_equal(result, expected) - expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), - index=index, columns=[0, 0, 1, 2, 3]) + expected = DataFrame( + np.repeat(arr, 5).reshape(-1, 5), index=index, columns=[0, 0, 1, 2, 3] + ) result = concat([s1, df, s2, s2, s1], axis=1) assert_frame_equal(result, expected) # with names - s1.name = 'foo' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 0]) + s1.name = "foo" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, 0] + ) result = concat([s1, df, s2], axis=1) assert_frame_equal(result, expected) - s2.name = 'bar' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 'bar']) + s2.name = "bar" + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=["foo", 0, "bar"] + ) result = concat([s1, df, s2], axis=1) assert_frame_equal(result, expected) # ignore index - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) + expected = DataFrame( + np.repeat(arr, 3).reshape(-1, 3), index=index, columns=[0, 1, 2] + ) result = concat([s1, df, s2], axis=1, ignore_index=True) assert_frame_equal(result, expected) # axis 0 - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), - index=index.tolist() * 3, columns=[0]) + expected = DataFrame( + np.tile(arr, 3).reshape(-1, 1), index=index.tolist() * 3, columns=[0] + ) result = concat([s1, df, s2]) assert_frame_equal(result, expected) @@ -1516,8 +1648,8 @@ def test_empty_dtype_coerce(self): # see below # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) - df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + df1 = 
DataFrame(data=[[1, None], [2, None]], columns=["a", "b"]) + df2 = DataFrame(data=[[3, None], [4, None]], columns=["a", "b"]) result = concat([df1, df2]) expected = df1.dtypes tm.assert_series_equal(result.dtypes, expected) @@ -1525,28 +1657,29 @@ def test_empty_dtype_coerce(self): def test_dtype_coerceion(self): # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) + df = DataFrame({"date": [pd.Timestamp("20130101").tz_localize("UTC"), pd.NaT]}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 12045 import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) + + df = DataFrame( + {"date": [datetime.datetime(2012, 1, 1), datetime.datetime(1012, 1, 2)]} + ) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) + df = DataFrame({"text": ["some words"] + [None] * 9}) result = concat([df.iloc[[0]], df.iloc[[1]]]) tm.assert_series_equal(result.dtypes, df.dtypes) def test_concat_series(self): ts = tm.makeTimeSeries() - ts.name = 'foo' + ts.name = "foo" pieces = [ts[:5], ts[5:15], ts[15:]] @@ -1557,12 +1690,10 @@ def test_concat_series(self): result = concat(pieces, keys=[0, 1, 2]) expected = ts.copy() - ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + ts.index = DatetimeIndex(np.array(ts.index.values, dtype="M8[ns]")) - exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), - np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], - codes=exp_codes) + exp_codes = [np.repeat([0, 1, 2], [len(x) for x in pieces]), np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], codes=exp_codes) expected.index = exp_index tm.assert_series_equal(result, expected) @@ -1575,52 +1706,51 @@ def test_concat_series_axis1(self, sort=sort): expected = DataFrame(pieces).T assert_frame_equal(result, expected) - result = concat(pieces, keys=['A', 'B', 'C'], axis=1) - expected = DataFrame(pieces, index=['A', 'B', 'C']).T + result = concat(pieces, keys=["A", "B", "C"], axis=1) + expected = DataFrame(pieces, index=["A", "B", "C"]).T assert_frame_equal(result, expected) # preserve series names, #2489 - s = Series(randn(5), name='A') - s2 = Series(randn(5), name='B') + s = Series(randn(5), name="A") + s2 = Series(randn(5), name="B") result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) + expected = DataFrame({"A": s, "B": s2}) assert_frame_equal(result, expected) s2.name = None result = concat([s, s2], axis=1) - tm.assert_index_equal(result.columns, - Index(['A', 0], dtype='object')) + tm.assert_index_equal(result.columns, Index(["A", 0], dtype="object")) # must reindex, #2603 - s = Series(randn(3), index=['c', 'a', 'b'], name='A') - s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') + s = Series(randn(3), index=["c", "a", "b"], name="A") + s2 = Series(randn(4), index=["d", "a", "b", "c"], name="B") result = concat([s, s2], axis=1, sort=sort) - expected = DataFrame({'A': s, 'B': s2}) + expected = DataFrame({"A": s, "B": s2}) assert_frame_equal(result, expected) def test_concat_series_axis1_names_applied(self): # ensure names argument is not ignored on axis=1, #23490 s = Series([1, 2, 3]) s2 = Series([4, 5, 6]) - result = concat([s, s2], axis=1, keys=['a', 'b'], names=['A']) - expected = DataFrame([[1, 4], [2, 5], [3, 6]], - columns=pd.Index(['a', 'b'], name='A')) + result = concat([s, s2], 
axis=1, keys=["a", "b"], names=["A"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], columns=pd.Index(["a", "b"], name="A") + ) assert_frame_equal(result, expected) - result = concat([s, s2], axis=1, keys=[('a', 1), ('b', 2)], - names=['A', 'B']) - expected = DataFrame([[1, 4], [2, 5], [3, 6]], - columns=MultiIndex.from_tuples([('a', 1), - ('b', 2)], - names=['A', 'B'])) + result = concat([s, s2], axis=1, keys=[("a", 1), ("b", 2)], names=["A", "B"]) + expected = DataFrame( + [[1, 4], [2, 5], [3, 6]], + columns=MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["A", "B"]), + ) assert_frame_equal(result, expected) def test_concat_single_with_key(self): df = DataFrame(np.random.randn(10, 4)) - result = concat([df], keys=['foo']) - expected = concat([df, df], keys=['foo', 'bar']) + result = concat([df], keys=["foo"]) + expected = concat([df, df], keys=["foo", "bar"]) tm.assert_frame_equal(result, expected[:10]) def test_concat_exclude_none(self): @@ -1635,24 +1765,24 @@ def test_concat_exclude_none(self): def test_concat_datetime64_block(self): from pandas.core.indexes.datetimes import date_range - rng = date_range('1/1/2000', periods=10) + rng = date_range("1/1/2000", periods=10) - df = DataFrame({'time': rng}) + df = DataFrame({"time": rng}) result = concat([df, df]) - assert (result.iloc[:10]['time'] == rng).all() - assert (result.iloc[10:]['time'] == rng).all() + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() def test_concat_timedelta64_block(self): from pandas import to_timedelta - rng = to_timedelta(np.arange(10), unit='s') + rng = to_timedelta(np.arange(10), unit="s") - df = DataFrame({'time': rng}) + df = DataFrame({"time": rng}) result = concat([df, df]) - assert (result.iloc[:10]['time'] == rng).all() - assert (result.iloc[10:]['time'] == rng).all() + assert (result.iloc[:10]["time"] == rng).all() + assert (result.iloc[10:]["time"] == rng).all() def test_concat_keys_with_none(self): # #1649 @@ -1662,10 +1792,10 @@ def test_concat_keys_with_none(self): expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) tm.assert_frame_equal(result, expected) - result = concat([None, df0, df0[:2], df0[:1], df0], - keys=['a', 'b', 'c', 'd', 'e']) - expected = concat([df0, df0[:2], df0[:1], df0], - keys=['b', 'c', 'd', 'e']) + result = concat( + [None, df0, df0[:2], df0[:1], df0], keys=["a", "b", "c", "d", "e"] + ) + expected = concat([df0, df0[:2], df0[:1], df0], keys=["b", "c", "d", "e"]) tm.assert_frame_equal(result, expected) def test_concat_bug_1719(self): @@ -1674,33 +1804,43 @@ def test_concat_bug_1719(self): # to join with union # these two are of different length! 
- left = concat([ts1, ts2], join='outer', axis=1) - right = concat([ts2, ts1], join='outer', axis=1) + left = concat([ts1, ts2], join="outer", axis=1) + right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) def test_concat_bug_2972(self): ts0 = Series(np.zeros(5)) ts1 = Series(np.ones(5)) - ts0.name = ts1.name = 'same name' + ts0.name = ts1.name = "same name" result = concat([ts0, ts1], axis=1) expected = DataFrame({0: ts0, 1: ts1}) - expected.columns = ['same name', 'same name'] + expected.columns = ["same name", "same name"] assert_frame_equal(result, expected) def test_concat_bug_3602(self): # GH 3602, duplicate columns - df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6], - 'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']}) - df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4], - 'prc': [6, 6, 6, 6]}) - expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], - [0, 6, 'rrr', 10, 2, 6], - [0, 6, 'rrr', 11, 3, 6], - [0, 6, 'rrr', 12, 4, 6]]) - expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc'] + df1 = DataFrame( + { + "firmNo": [0, 0, 0, 0], + "prc": [6, 6, 6, 6], + "stringvar": ["rrr", "rrr", "rrr", "rrr"], + } + ) + df2 = DataFrame( + {"C": [9, 10, 11, 12], "misc": [1, 2, 3, 4], "prc": [6, 6, 6, 6]} + ) + expected = DataFrame( + [ + [0, 6, "rrr", 9, 1, 6], + [0, 6, "rrr", 10, 2, 6], + [0, 6, "rrr", 11, 3, 6], + [0, 6, "rrr", 12, 4, 6], + ] + ) + expected.columns = ["firmNo", "prc", "stringvar", "C", "misc", "prc"] result = concat([df1, df2], axis=1) assert_frame_equal(result, expected) @@ -1708,17 +1848,17 @@ def test_concat_bug_3602(self): def test_concat_inner_join_empty(self): # GH 15328 df_empty = pd.DataFrame() - df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64') - df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64') + df_a = pd.DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") + df_expected = pd.DataFrame({"a": []}, index=[], dtype="int64") - for how, expected in [('inner', df_expected), ('outer', df_a)]: + for how, expected in [("inner", df_expected), ("outer", df_a)]: result = pd.concat([df_a, df_empty], axis=1, join=how) assert_frame_equal(result, expected) def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] - s1 = Series(randn(len(dates)), index=dates, name='value') - s2 = Series(randn(len(dates)), index=dates, name='value') + dates = date_range("01-Jan-2013", "01-Jan-2014", freq="MS")[0:-1] + s1 = Series(randn(len(dates)), index=dates, name="value") + s2 = Series(randn(len(dates)), index=dates, name="value") result = concat([s1, s2], axis=1, ignore_index=True) expected = Index([0, 1]) @@ -1733,13 +1873,12 @@ def test_concat_iterables(self): expected = DataFrame([1, 2, 3, 4, 5, 6]) assert_frame_equal(concat((df1, df2), ignore_index=True), expected) assert_frame_equal(concat([df1, df2], ignore_index=True), expected) - assert_frame_equal(concat((df for df in (df1, df2)), - ignore_index=True), expected) assert_frame_equal( - concat(deque((df1, df2)), ignore_index=True), expected) + concat((df for df in (df1, df2)), ignore_index=True), expected + ) + assert_frame_equal(concat(deque((df1, df2)), ignore_index=True), expected) class CustomIterator1: - def __len__(self): return 2 @@ -1748,23 +1887,24 @@ def __getitem__(self, index): return {0: df1, 1: df2}[index] except KeyError: raise IndexError - assert_frame_equal(pd.concat(CustomIterator1(), - ignore_index=True), expected) - class CustomIterator2(abc.Iterable): + 
assert_frame_equal(pd.concat(CustomIterator1(), ignore_index=True), expected) + class CustomIterator2(abc.Iterable): def __iter__(self): yield df1 yield df2 - assert_frame_equal(pd.concat(CustomIterator2(), - ignore_index=True), expected) + + assert_frame_equal(pd.concat(CustomIterator2(), ignore_index=True), expected) def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = mkdf(10, 2) - msg = ("cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid") + msg = ( + "cannot concatenate object of type '{}';" + " only Series and DataFrame objs are valid" + ) for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): concat([df1, obj]) @@ -1772,8 +1912,10 @@ def test_concat_invalid(self): def test_concat_invalid_first_argument(self): df1 = mkdf(10, 2) df2 = mkdf(10, 2) - msg = ('first argument must be an iterable of pandas ' - 'objects, you passed an object of type "DataFrame"') + msg = ( + "first argument must be an iterable of pandas " + 'objects, you passed an object of type "DataFrame"' + ) with pytest.raises(TypeError, match=msg): concat(df1, df2) @@ -1799,25 +1941,23 @@ def test_concat_invalid_first_argument(self): def test_concat_NaT_series(self): # GH 11693 # test for merging NaT series with datetime series. - x = Series(date_range('20151124 08:00', '20151124 09:00', - freq='1h', tz='US/Eastern')) - y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + x = Series( + date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="US/Eastern") + ) + y = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") expected = Series([x[0], x[1], pd.NaT, pd.NaT]) result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # all NaT with tz - expected = Series(pd.NaT, index=range(4), - dtype='datetime64[ns, US/Eastern]') + expected = Series(pd.NaT, index=range(4), dtype="datetime64[ns, US/Eastern]") result = pd.concat([y, y], ignore_index=True) tm.assert_series_equal(result, expected) # without tz - x = pd.Series(pd.date_range('20151124 08:00', - '20151124 09:00', freq='1h')) - y = pd.Series(pd.date_range('20151124 10:00', - '20151124 11:00', freq='1h')) + x = pd.Series(pd.date_range("20151124 08:00", "20151124 09:00", freq="1h")) + y = pd.Series(pd.date_range("20151124 10:00", "20151124 11:00", freq="1h")) y[:] = pd.NaT expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) result = pd.concat([x, y], ignore_index=True) @@ -1825,15 +1965,18 @@ def test_concat_NaT_series(self): # all NaT without tz x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), - dtype='datetime64[ns]') + expected = pd.Series(pd.NaT, index=range(4), dtype="datetime64[ns]") result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), - B=pd.Timestamp('20130603', tz='CET')), - index=range(5)) + df2 = DataFrame( + dict( + A=pd.Timestamp("20130102", tz="US/Eastern"), + B=pd.Timestamp("20130603", tz="CET"), + ), + index=range(5), + ) # concat df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) @@ -1841,250 +1984,270 @@ def test_concat_tz_frame(self): def test_concat_tz_series(self): # gh-11755: tz and no tz - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(date_range('2012-01-01', '2012-01-02')) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') + x = Series(date_range("20151124 08:00", 
"20151124 09:00", freq="1h", tz="UTC")) + y = Series(date_range("2012-01-01", "2012-01-02")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # gh-11887: concat tz and object - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(['a', 'b']) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') + x = Series(date_range("20151124 08:00", "20151124 09:00", freq="1h", tz="UTC")) + y = Series(["a", "b"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) # see gh-12217 and gh-12306 # Concatenating two UTC times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('UTC') + first[0] = first[0].dt.tz_localize("UTC") second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('UTC') + second[0] = second[0].dt.tz_localize("UTC") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, UTC]' + assert result[0].dtype == "datetime64[ns, UTC]" # Concatenating two London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" # Concatenating 2+1 London times first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" # Concat'ing 1+2 London times first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') + first[0] = first[0].dt.tz_localize("Europe/London") second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') + second[0] = second[0].dt.tz_localize("Europe/London") result = pd.concat([first, second]) - assert result[0].dtype == 'datetime64[ns, Europe/London]' + assert result[0].dtype == "datetime64[ns, Europe/London]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta - x = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-02-01', tz='US/Eastern')] - y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] + x = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-02-01", tz="US/Eastern"), + ] + y = [pd.Timedelta("1 day"), pd.Timedelta("2 day")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) # tz and period - y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] + y = [pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M")] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - 
tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + tm.assert_series_equal(result, pd.Series(x + y, dtype="object")) def test_concat_tz_series_tzlocal(self): # see gh-13583 - x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] - y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), - pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + x = [ + pd.Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2011-02-01", tz=dateutil.tz.tzlocal()), + ] + y = [ + pd.Timestamp("2012-01-01", tz=dateutil.tz.tzlocal()), + pd.Timestamp("2012-02-01", tz=dateutil.tz.tzlocal()), + ] result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) tm.assert_series_equal(result, pd.Series(x + y)) - assert result.dtype == 'datetime64[ns, tzlocal()]' + assert result.dtype == "datetime64[ns, tzlocal()]" - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) - @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')]) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) + @pytest.mark.parametrize("s", [pd.NaT, pd.Timestamp("20150101")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s): # GH 12396 # tz-naive first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply( - lambda x: x.dt.tz_localize(tz1)) + lambda x: x.dt.tz_localize(tz1) + ) second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2)) result = pd.concat([first, second], axis=0) - expected = pd.DataFrame(pd.Series( - [pd.NaT, pd.NaT, s], index=[0, 1, 0])) + expected = pd.DataFrame(pd.Series([pd.NaT, pd.NaT, s], index=[0, 1, 0])) expected = expected.apply(lambda x: x.dt.tz_localize(tz2)) if tz1 != tz2: expected = expected.astype(object) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2): # GH 12396 first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)) - second = pd.DataFrame(pd.Series( - [pd.NaT]).dt.tz_localize(tz2), columns=[1]) + second = pd.DataFrame(pd.Series([pd.NaT]).dt.tz_localize(tz2), columns=[1]) expected = pd.DataFrame( - {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), - 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)} + { + 0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1), + 1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2), + } ) result = pd.concat([first, second], axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz1', [None, 'UTC']) - @pytest.mark.parametrize('tz2', [None, 'UTC']) + @pytest.mark.parametrize("tz1", [None, "UTC"]) + @pytest.mark.parametrize("tz2", [None, "UTC"]) def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2): # GH 12396 # tz-naive first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1) - second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)], - [pd.Timestamp('2016/01/01', tz=tz2)]], - index=[2, 3]) + second = pd.DataFrame( + [ + [pd.Timestamp("2015/01/01", tz=tz2)], + [pd.Timestamp("2016/01/01", tz=tz2)], + ], + index=[2, 3], + ) - expected = pd.DataFrame([pd.NaT, pd.NaT, - pd.Timestamp('2015/01/01', tz=tz2), - pd.Timestamp('2016/01/01', tz=tz2)]) + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz2), + pd.Timestamp("2016/01/01", tz=tz2), + ] + ) if tz1 != tz2: expected = 
expected.astype(object) result = pd.concat([first, second]) assert_frame_equal(result, expected) - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_concat_NaT_dataframes(self, tz): # GH 12396 first = pd.DataFrame([[pd.NaT], [pd.NaT]]) first = first.apply(lambda x: x.dt.tz_localize(tz)) - second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)], - [pd.Timestamp('2016/01/01', tz=tz)]], - index=[2, 3]) - expected = pd.DataFrame([pd.NaT, pd.NaT, - pd.Timestamp('2015/01/01', tz=tz), - pd.Timestamp('2016/01/01', tz=tz)]) + second = pd.DataFrame( + [[pd.Timestamp("2015/01/01", tz=tz)], [pd.Timestamp("2016/01/01", tz=tz)]], + index=[2, 3], + ) + expected = pd.DataFrame( + [ + pd.NaT, + pd.NaT, + pd.Timestamp("2015/01/01", tz=tz), + pd.Timestamp("2016/01/01", tz=tz), + ] + ) result = pd.concat([first, second], axis=0) assert_frame_equal(result, expected) def test_concat_period_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='Period[D]') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="D")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="Period[D]") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) def test_concat_period_multiple_freq_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-10-01", "2016-01-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" def test_concat_period_other_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="M")) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" # non-period - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(pd.DatetimeIndex(["2015-11-01", "2015-12-01"])) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(['A', 'B']) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + x = Series(pd.PeriodIndex(["2015-11-01", "2015-12-01"], freq="D")) + y = Series(["A", "B"]) + expected = Series([x[0], x[1], y[0], y[1]], dtype="object") result = concat([x, y], ignore_index=True) 
tm.assert_series_equal(result, expected) - assert result.dtype == 'object' + assert result.dtype == "object" def test_concat_empty_series(self): # GH 11082 - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y") res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}, - index=pd.Index([0, 1, 2], dtype='O')) + exp = pd.DataFrame( + {"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan]}, + index=pd.Index([0, 1, 2], dtype="O"), + ) tm.assert_frame_equal(res, exp) - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series(name="y") res = pd.concat([s1, s2], axis=0) # name will be reset exp = pd.Series([1, 2, 3]) tm.assert_series_equal(res, exp) # empty Series with no name - s1 = pd.Series([1, 2, 3], name='x') + s1 = pd.Series([1, 2, 3], name="x") s2 = pd.Series(name=None) res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0], - index=pd.Index([0, 1, 2], dtype='O')) + exp = pd.DataFrame( + {"x": [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=["x", 0], + index=pd.Index([0, 1, 2], dtype="O"), + ) tm.assert_frame_equal(res, exp) - @pytest.mark.parametrize('tz', [None, 'UTC']) - @pytest.mark.parametrize('values', [[], [1, 2, 3]]) + @pytest.mark.parametrize("tz", [None, "UTC"]) + @pytest.mark.parametrize("values", [[], [1, 2, 3]]) def test_concat_empty_series_timelike(self, tz, values): # GH 18447 - first = Series([], dtype='M8[ns]').dt.tz_localize(tz) + first = Series([], dtype="M8[ns]").dt.tz_localize(tz) second = Series(values) expected = DataFrame( - {0: pd.Series([pd.NaT] * len(values), - dtype='M8[ns]' - ).dt.tz_localize(tz), - 1: values}) + { + 0: pd.Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz), + 1: values, + } + ) result = concat([first, second], axis=1) assert_frame_equal(result, expected) def test_default_index(self): # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series([4, 5, 6], name='y') + s1 = pd.Series([1, 2, 3], name="x") + s2 = pd.Series([4, 5, 6], name="y") res = pd.concat([s1, s2], axis=1, ignore_index=True) assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) # use check_index_type=True to check the result have # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_series and all inputs have no names s1 = pd.Series([1, 2, 3]) @@ -2093,23 +2256,19 @@ def test_default_index(self): assert isinstance(res.columns, pd.RangeIndex) exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) # is_dataframe and ignore_index - df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) - df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + df1 = pd.DataFrame({"A": [1, 2], "B": [5, 6]}) + df2 = pd.DataFrame({"A": [3, 4], "B": [7, 8]}) res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], - columns=['A', 'B']) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], columns=["A", "B"]) + tm.assert_frame_equal(res, exp, 
check_index_type=True, check_column_type=True) res = pd.concat([df1, df2], axis=1, ignore_index=True) exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(res, exp, check_index_type=True, check_column_type=True) def test_concat_multiindex_rangeindex(self): # GH13542 @@ -2117,9 +2276,10 @@ def test_concat_multiindex_rangeindex(self): # there is a bug in concat with objects of len 1 df = DataFrame(np.random.randn(9, 2)) - df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)], - codes=[np.repeat(np.arange(3), 3), - np.tile(np.arange(3), 3)]) + df.index = MultiIndex( + levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + codes=[np.repeat(np.arange(3), 3), np.tile(np.arange(3), 3)], + ) res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) exp = df.iloc[[2, 3, 4, 5], :] @@ -2128,22 +2288,23 @@ def test_concat_multiindex_rangeindex(self): def test_concat_multiindex_dfs_with_deepcopy(self): # GH 9967 from copy import deepcopy - example_multiindex1 = pd.MultiIndex.from_product([['a'], ['b']]) + + example_multiindex1 = pd.MultiIndex.from_product([["a"], ["b"]]) example_dataframe1 = pd.DataFrame([0], index=example_multiindex1) - example_multiindex2 = pd.MultiIndex.from_product([['a'], ['c']]) + example_multiindex2 = pd.MultiIndex.from_product([["a"], ["c"]]) example_dataframe2 = pd.DataFrame([1], index=example_multiindex2) - example_dict = {'s1': example_dataframe1, 's2': example_dataframe2} - expected_index = pd.MultiIndex(levels=[['s1', 's2'], - ['a'], - ['b', 'c']], - codes=[[0, 1], [0, 0], [0, 1]], - names=['testname', None, None]) + example_dict = {"s1": example_dataframe1, "s2": example_dataframe2} + expected_index = pd.MultiIndex( + levels=[["s1", "s2"], ["a"], ["b", "c"]], + codes=[[0, 1], [0, 0], [0, 1]], + names=["testname", None, None], + ) expected = pd.DataFrame([[0], [1]], index=expected_index) - result_copy = pd.concat(deepcopy(example_dict), names=['testname']) + result_copy = pd.concat(deepcopy(example_dict), names=["testname"]) tm.assert_frame_equal(result_copy, expected) - result_no_copy = pd.concat(example_dict, names=['testname']) + result_no_copy = pd.concat(example_dict, names=["testname"]) tm.assert_frame_equal(result_no_copy, expected) def test_categorical_concat_append(self): @@ -2152,8 +2313,7 @@ def test_categorical_concat_append(self): df = DataFrame({"cats": cat, "vals": vals}) cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"]) vals2 = [1, 2, 1, 2] - exp = DataFrame({"cats": cat2, "vals": vals2}, - index=Index([0, 1, 0, 1])) + exp = DataFrame({"cats": cat2, "vals": vals2}, index=Index([0, 1, 0, 1])) tm.assert_frame_equal(pd.concat([df, df]), exp) tm.assert_frame_equal(df.append(df), exp) @@ -2164,7 +2324,7 @@ def test_categorical_concat_append(self): df_different_categories = DataFrame({"cats": cat3, "vals": vals3}) res = pd.concat([df, df_different_categories], ignore_index=True) - exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + exp = DataFrame({"cats": list("abab"), "vals": [1, 2, 1, 2]}) tm.assert_frame_equal(res, exp) res = df.append(df_different_categories, ignore_index=True) @@ -2173,115 +2333,132 @@ def test_categorical_concat_append(self): def test_categorical_concat_dtypes(self): # GH8143 - index = ['cat', 'obj', 'num'] - cat = Categorical(['a', 'b', 'c']) - obj = Series(['a', 'b', 'c']) + index = ["cat", "obj", "num"] + cat = Categorical(["a", "b", "c"]) + obj = Series(["a", "b", "c"]) num = Series([1, 2, 3]) df = 
pd.concat([Series(cat), obj, num], axis=1, keys=index) - result = df.dtypes == 'object' + result = df.dtypes == "object" expected = Series([False, True, False], index=index) tm.assert_series_equal(result, expected) - result = df.dtypes == 'int64' + result = df.dtypes == "int64" expected = Series([False, False, True], index=index) tm.assert_series_equal(result, expected) - result = df.dtypes == 'category' + result = df.dtypes == "category" expected = Series([True, False, False], index=index) tm.assert_series_equal(result, expected) def test_categorical_concat(self, sort): # See GH 10177 - df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3), - columns=["a", "b", "c"]) + df1 = DataFrame( + np.arange(18, dtype="int64").reshape(6, 3), columns=["a", "b", "c"] + ) - df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2), - columns=["a", "c"]) + df2 = DataFrame(np.arange(14, dtype="int64").reshape(7, 2), columns=["a", "c"]) cat_values = ["one", "one", "two", "one", "two", "two", "one"] - df2['h'] = Series(Categorical(cat_values)) + df2["h"] = Series(Categorical(cat_values)) res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort) - exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], - 'h': [None] * 6 + cat_values}) + exp = DataFrame( + { + "a": [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + "b": [ + 1, + 4, + 7, + 10, + 13, + 16, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], + "c": [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + "h": [None] * 6 + cat_values, + } + ) tm.assert_frame_equal(res, exp) def test_categorical_concat_gh7864(self): # GH 7864 # make sure ordering is preserved - df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')}) + df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list("abbaae")}) df["grade"] = Categorical(df["raw_grade"]) - df['grade'].cat.set_categories(['e', 'a', 'b']) + df["grade"].cat.set_categories(["e", "a", "b"]) df1 = df[0:3] df2 = df[3:] - tm.assert_index_equal(df['grade'].cat.categories, - df1['grade'].cat.categories) - tm.assert_index_equal(df['grade'].cat.categories, - df2['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df1["grade"].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, df2["grade"].cat.categories) dfx = pd.concat([df1, df2]) - tm.assert_index_equal(df['grade'].cat.categories, - dfx['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, dfx["grade"].cat.categories) dfa = df1.append(df2) - tm.assert_index_equal(df['grade'].cat.categories, - dfa['grade'].cat.categories) + tm.assert_index_equal(df["grade"].cat.categories, dfa["grade"].cat.categories) def test_categorical_concat_preserve(self): # GH 8641 series concat not preserving category dtype # GH 13524 can concat different categories - s = Series(list('abc'), dtype='category') - s2 = Series(list('abd'), dtype='category') + s = Series(list("abc"), dtype="category") + s2 = Series(list("abd"), dtype="category") - exp = Series(list('abcabd')) + exp = Series(list("abcabd")) res = pd.concat([s, s2], ignore_index=True) tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), dtype='category') + exp = Series(list("abcabc"), dtype="category") res = pd.concat([s, s], ignore_index=True) tm.assert_series_equal(res, exp) - exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], - dtype='category') + exp = 
Series(list("abcabc"), index=[0, 1, 2, 0, 1, 2], dtype="category") res = pd.concat([s, s]) tm.assert_series_equal(res, exp) - a = Series(np.arange(6, dtype='int64')) - b = Series(list('aabbca')) + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) - df2 = DataFrame({'A': a, - 'B': b.astype(CategoricalDtype(list('cab')))}) + df2 = DataFrame({"A": a, "B": b.astype(CategoricalDtype(list("cab")))}) res = pd.concat([df2, df2]) exp = DataFrame( - {'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))}) + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ) tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): - a = Series(np.arange(6, dtype='int64')) - b = Series(list('aabbca')) + a = Series(np.arange(6, dtype="int64")) + b = Series(list("aabbca")) - df2 = DataFrame({'A': a, - 'B': b.astype(CategoricalDtype(list('cab'))) - }).set_index('B') + df2 = DataFrame( + {"A": a, "B": b.astype(CategoricalDtype(list("cab")))} + ).set_index("B") result = pd.concat([df2, df2]) expected = DataFrame( - {'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab'))) - }).set_index('B') + { + "A": pd.concat([a, a]), + "B": pd.concat([b, b]).astype(CategoricalDtype(list("cab"))), + } + ).set_index("B") tm.assert_frame_equal(result, expected) # wrong categories - df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe')) - }).set_index('B') + df3 = DataFrame( + {"A": a, "B": Categorical(b, categories=list("abe"))} + ).set_index("B") msg = "categories must match existing categories when appending" with pytest.raises(TypeError, match=msg): pd.concat([df2, df3]) @@ -2290,28 +2467,30 @@ def test_concat_categoricalindex(self): # GH 16111, categories that aren't lexsorted categories = [9, 0, 1, 2, 3] - a = pd.Series(1, index=pd.CategoricalIndex([9, 0], - categories=categories)) - b = pd.Series(2, index=pd.CategoricalIndex([0, 1], - categories=categories)) - c = pd.Series(3, index=pd.CategoricalIndex([1, 2], - categories=categories)) + a = pd.Series(1, index=pd.CategoricalIndex([9, 0], categories=categories)) + b = pd.Series(2, index=pd.CategoricalIndex([0, 1], categories=categories)) + c = pd.Series(3, index=pd.CategoricalIndex([1, 2], categories=categories)) result = pd.concat([a, b, c], axis=1) exp_idx = pd.CategoricalIndex([9, 0, 1, 2], categories=categories) - exp = pd.DataFrame({0: [1, 1, np.nan, np.nan], - 1: [np.nan, 2, 2, np.nan], - 2: [np.nan, np.nan, 3, 3]}, - columns=[0, 1, 2], - index=exp_idx) + exp = pd.DataFrame( + { + 0: [1, 1, np.nan, np.nan], + 1: [np.nan, 2, 2, np.nan], + 2: [np.nan, np.nan, 3, 3], + }, + columns=[0, 1, 2], + index=exp_idx, + ) tm.assert_frame_equal(result, exp) def test_concat_order(self): # GH 17344 - dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])] - dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a']) - for i in range(100)] + dfs = [pd.DataFrame(index=range(3), columns=["a", 1, None])] + dfs += [ + pd.DataFrame(index=range(3), columns=[None, 1, "a"]) for i in range(100) + ] result = pd.concat(dfs, sort=True).columns expected = dfs[0].columns @@ -2319,49 +2498,69 @@ def test_concat_order(self): def test_concat_datetime_timezone(self): # GH 18523 - idx1 = pd.date_range('2011-01-01', periods=3, freq='H', - tz='Europe/Paris') - idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H') - df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1) - df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2) + idx1 = 
pd.date_range("2011-01-01", periods=3, freq="H", tz="Europe/Paris") + idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq="H") + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1) + df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2) result = pd.concat([df1, df2], axis=1) - exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00', - '2011-01-01 01:00:00+01:00', - '2011-01-01 02:00:00+01:00'], - freq='H' - ).tz_convert('UTC').tz_convert('Europe/Paris') + exp_idx = ( + DatetimeIndex( + [ + "2011-01-01 00:00:00+01:00", + "2011-01-01 01:00:00+01:00", + "2011-01-01 02:00:00+01:00", + ], + freq="H", + ) + .tz_convert("UTC") + .tz_convert("Europe/Paris") + ) - expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame( + [[1, 1], [2, 2], [3, 3]], index=exp_idx, columns=["a", "b"] + ) tm.assert_frame_equal(result, expected) - idx3 = pd.date_range('2011-01-01', periods=3, - freq='H', tz='Asia/Tokyo') - df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3) + idx3 = pd.date_range("2011-01-01", periods=3, freq="H", tz="Asia/Tokyo") + df3 = pd.DataFrame({"b": [1, 2, 3]}, index=idx3) result = pd.concat([df1, df3], axis=1) - exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00', - '2010-12-31 16:00:00+00:00', - '2010-12-31 17:00:00+00:00', - '2010-12-31 23:00:00+00:00', - '2011-01-01 00:00:00+00:00', - '2011-01-01 01:00:00+00:00'] - ) + exp_idx = DatetimeIndex( + [ + "2010-12-31 15:00:00+00:00", + "2010-12-31 16:00:00+00:00", + "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", + ] + ) - expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3], - [1, np.nan], [2, np.nan], [3, np.nan]], - index=exp_idx, columns=['a', 'b']) + expected = pd.DataFrame( + [ + [np.nan, 1], + [np.nan, 2], + [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], + ], + index=exp_idx, + columns=["a", "b"], + ) tm.assert_frame_equal(result, expected) # GH 13783: Concat after resample - result = pd.concat([df1.resample('H').mean(), - df2.resample('H').mean()], sort=True) - expected = pd.DataFrame({'a': [1, 2, 3] + [np.nan] * 3, - 'b': [np.nan] * 3 + [1, 2, 3]}, - index=idx1.append(idx1)) + result = pd.concat( + [df1.resample("H").mean(), df2.resample("H").mean()], sort=True + ) + expected = pd.DataFrame( + {"a": [1, 2, 3] + [np.nan] * 3, "b": [np.nan] * 3 + [1, 2, 3]}, + index=idx1.append(idx1), + ) tm.assert_frame_equal(result, expected) def test_concat_different_extension_dtypes_upcasts(self): @@ -2369,50 +2568,55 @@ def test_concat_different_extension_dtypes_upcasts(self): b = pd.Series(to_decimal([1, 2])) result = pd.concat([a, b], ignore_index=True) - expected = pd.Series([ - 1, 2, - Decimal(1), Decimal(2) - ], dtype=object) + expected = pd.Series([1, 2, Decimal(1), Decimal(2)], dtype=object) tm.assert_series_equal(result, expected) def test_concat_odered_dict(self): # GH 21510 - expected = pd.concat([pd.Series(range(3)), pd.Series(range(4))], - keys=['First', 'Another']) - result = pd.concat(OrderedDict([('First', pd.Series(range(3))), - ('Another', pd.Series(range(4)))])) + expected = pd.concat( + [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] + ) + result = pd.concat( + OrderedDict( + [("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))] + ) + ) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize('dt', np.sctypes['float']) +@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) 
+@pytest.mark.parametrize("dt", np.sctypes["float"]) def test_concat_no_unnecessary_upcast(dt, pdt): # GH 13247 dims = pdt().ndim - dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], dtype=dt, ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims))] + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], dtype=dt, ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] x = pd.concat(dfs) assert x.values.dtype == dt -@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize('dt', np.sctypes['int']) +@pytest.mark.parametrize("pdt", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("dt", np.sctypes["int"]) def test_concat_will_upcast(dt, pdt): with catch_warnings(record=True): dims = pdt().ndim - dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)), - pdt(np.array([np.nan], ndmin=dims)), - pdt(np.array([5], dtype=dt, ndmin=dims))] + dfs = [ + pdt(np.array([1], dtype=dt, ndmin=dims)), + pdt(np.array([np.nan], ndmin=dims)), + pdt(np.array([5], dtype=dt, ndmin=dims)), + ] x = pd.concat(dfs) - assert x.values.dtype == 'float64' + assert x.values.dtype == "float64" def test_concat_empty_and_non_empty_frame_regression(): # GH 18178 regression test - df1 = pd.DataFrame({'foo': [1]}) - df2 = pd.DataFrame({'foo': []}) - expected = pd.DataFrame({'foo': [1.0]}) + df1 = pd.DataFrame({"foo": [1]}) + df2 = pd.DataFrame({"foo": []}) + expected = pd.DataFrame({"foo": [1.0]}) result = pd.concat([df1, df2]) assert_frame_equal(result, expected) @@ -2428,17 +2632,17 @@ def test_concat_empty_and_non_empty_series_regression(): def test_concat_sorts_columns(sort_with_none): # GH-4588 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a']) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"]) df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]}) # for sort=True/None - expected = pd.DataFrame({"a": [1, 2, 3, 4], - "b": [1, 2, None, None], - "c": [None, None, 5, 6]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [1, 2, None, None], "c": [None, None, 5, 6]}, + columns=["a", "b", "c"], + ) if sort_with_none is False: - expected = expected[['b', 'a', 'c']] + expected = expected[["b", "a", "c"]] if sort_with_none is None: # only warn if not explicitly specified @@ -2453,15 +2657,15 @@ def test_concat_sorts_columns(sort_with_none): def test_concat_sorts_index(sort_with_none): - df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b']) - df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b']) + df1 = pd.DataFrame({"a": [1, 2, 3]}, index=["c", "a", "b"]) + df2 = pd.DataFrame({"b": [1, 2]}, index=["a", "b"]) # For True/None - expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]}, - index=['a', 'b', 'c'], - columns=['a', 'b']) + expected = pd.DataFrame( + {"a": [2, 3, 1], "b": [1, 2, None]}, index=["a", "b", "c"], columns=["a", "b"] + ) if sort_with_none is False: - expected = expected.loc[['c', 'a', 'b']] + expected = expected.loc[["c", "a", "b"]] if sort_with_none is None: # only warn if not explicitly specified @@ -2477,78 +2681,76 @@ def test_concat_sorts_index(sort_with_none): def test_concat_inner_sort(sort_with_none): # https://github.com/pandas-dev/pandas/pull/20613 - df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, - columns=['b', 'a', 'c']) - df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4]) + df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]}, columns=["b", "a", "c"]) + df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[3, 4]) with 
tm.assert_produces_warning(None): # unset sort should *not* warn for inner join # since that never sorted - result = pd.concat([df1, df2], sort=sort_with_none, - join='inner', - ignore_index=True) + result = pd.concat( + [df1, df2], sort=sort_with_none, join="inner", ignore_index=True + ) - expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, - columns=['b', 'a']) + expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]}, columns=["b", "a"]) if sort_with_none is True: - expected = expected[['a', 'b']] + expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort(): # GH-4588 - df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]}, - columns=['c', 'b', 'a']) + df = pd.DataFrame({"c": [1, 2], "b": [3, 4], "a": [5, 6]}, columns=["c", "b", "a"]) result = pd.concat([df, df], sort=True, ignore_index=True) - expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4], - 'c': [1, 2, 1, 2]}, - columns=['a', 'b', 'c']) + expected = pd.DataFrame( + {"a": [5, 6, 5, 6], "b": [3, 4, 3, 4], "c": [1, 2, 1, 2]}, + columns=["a", "b", "c"], + ) tm.assert_frame_equal(result, expected) - result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True, - ignore_index=True) - expected = expected[['b', 'c']] + result = pd.concat([df, df[["c", "b"]]], join="inner", sort=True, ignore_index=True) + expected = expected[["b", "c"]] tm.assert_frame_equal(result, expected) def test_concat_aligned_sort_does_not_raise(): # GH-4588 # We catch TypeErrors from sorting internally and do not re-raise. - df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a']) - expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]}, - columns=[1, 'a']) + df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, "a"]) + expected = pd.DataFrame({1: [1, 2, 1, 2], "a": [3, 4, 3, 4]}, columns=[1, "a"]) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("s1name,s2name", [ - (np.int64(190), (43, 0)), (190, (43, 0))]) +@pytest.mark.parametrize("s1name,s2name", [(np.int64(190), (43, 0)), (190, (43, 0))]) def test_concat_series_name_npscalar_tuple(s1name, s2name): # GH21015 - s1 = pd.Series({'a': 1, 'b': 2}, name=s1name) - s2 = pd.Series({'c': 5, 'd': 6}, name=s2name) + s1 = pd.Series({"a": 1, "b": 2}, name=s1name) + s2 = pd.Series({"c": 5, "d": 6}, name=s2name) result = pd.concat([s1, s2]) - expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6}) + expected = pd.Series({"a": 1, "b": 2, "c": 5, "d": 6}) tm.assert_series_equal(result, expected) def test_concat_categorical_tz(): # GH-23816 - a = pd.Series(pd.date_range('2017-01-01', periods=2, tz='US/Pacific')) - b = pd.Series(['a', 'b'], dtype='category') + a = pd.Series(pd.date_range("2017-01-01", periods=2, tz="US/Pacific")) + b = pd.Series(["a", "b"], dtype="category") result = pd.concat([a, b], ignore_index=True) - expected = pd.Series([ - pd.Timestamp('2017-01-01', tz="US/Pacific"), - pd.Timestamp('2017-01-02', tz="US/Pacific"), - 'a', 'b' - ]) + expected = pd.Series( + [ + pd.Timestamp("2017-01-01", tz="US/Pacific"), + pd.Timestamp("2017-01-02", tz="US/Pacific"), + "a", + "b", + ] + ) tm.assert_series_equal(result, expected) def test_concat_datetimeindex_freq(): # GH 3232 # Monotonic index result - dr = pd.date_range('01-Jan-2013', periods=100, freq='50L', tz='UTC') + dr = pd.date_range("01-Jan-2013", periods=100, freq="50L", tz="UTC") data = list(range(100)) expected = pd.DataFrame(data, index=dr) result = pd.concat([expected[:50], expected[50:]]) @@ 
-2556,7 +2758,6 @@ def test_concat_datetimeindex_freq(): # Non-monotonic index result result = pd.concat([expected[50:], expected[:50]]) - expected = pd.DataFrame(data[50:] + data[:50], - index=dr[50:].append(dr[:50])) + expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) expected.index.freq = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index f71730fb4a313..a2ebf2359f55f 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -3,9 +3,22 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex, - Series, TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, - timedelta_range, to_datetime) + Categorical, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, + to_datetime, +) from pandas.api.types import CategoricalDtype as CDT import pandas.core.reshape.tile as tmod import pandas.util.testing as tm @@ -20,7 +33,7 @@ def test_simple(): def test_bins(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -28,12 +41,11 @@ def test_bins(): expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) - tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, - 6.53333333, 9.7])) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) def test_right(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -45,7 +57,7 @@ def test_right(): def test_no_right(): - data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed="left") @@ -57,7 +69,7 @@ def test_no_right(): def test_array_like(): - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) @@ -65,8 +77,7 @@ def test_array_like(): expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) - tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, - 6.53333333, 9.7])) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) def test_bins_from_interval_index(): @@ -75,9 +86,9 @@ def test_bins_from_interval_index(): result = cut(range(5), bins=expected.categories) tm.assert_categorical_equal(result, expected) - expected = Categorical.from_codes(np.append(c.codes, -1), - categories=c.categories, - ordered=True) + expected = Categorical.from_codes( + np.append(c.codes, -1), categories=c.categories, ordered=True + ) result = cut(range(6), bins=expected.categories) tm.assert_categorical_equal(result, expected) @@ -91,8 +102,7 @@ def test_bins_from_interval_index_doc_example(): result = cut([25, 20, 50], bins=c.categories) tm.assert_index_equal(result.categories, expected) - tm.assert_numpy_array_equal(result.codes, - np.array([1, 1, 2], dtype="int8")) + tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8")) def 
test_bins_not_overlapping_from_interval_index(): @@ -106,35 +116,52 @@ def test_bins_not_overlapping_from_interval_index(): def test_bins_not_monotonic(): msg = "bins must increase monotonically" - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] with pytest.raises(ValueError, match=msg): cut(data, [0.1, 1.5, 1, 10]) -@pytest.mark.parametrize("x, bins, expected", [ - (date_range("2017-12-31", periods=3), - [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max], - IntervalIndex.from_tuples([ - (Timestamp.min, Timestamp('2018-01-01')), - (Timestamp('2018-01-01'), Timestamp.max)])), - - ([-1, 0, 1], - np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], - dtype="int64"), - IntervalIndex.from_tuples([ - (np.iinfo(np.int64).min, 0), - (0, np.iinfo(np.int64).max)])), - - ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], - np.array([ - np.timedelta64(-np.iinfo(np.int64).max), - np.timedelta64(0), - np.timedelta64(np.iinfo(np.int64).max)]), - IntervalIndex.from_tuples([ - (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), - (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])), -]) +@pytest.mark.parametrize( + "x, bins, expected", + [ + ( + date_range("2017-12-31", periods=3), + [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max], + IntervalIndex.from_tuples( + [ + (Timestamp.min, Timestamp("2018-01-01")), + (Timestamp("2018-01-01"), Timestamp.max), + ] + ), + ), + ( + [-1, 0, 1], + np.array( + [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64" + ), + IntervalIndex.from_tuples( + [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)] + ), + ), + ( + [np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + np.array( + [ + np.timedelta64(-np.iinfo(np.int64).max), + np.timedelta64(0), + np.timedelta64(np.iinfo(np.int64).max), + ] + ), + IntervalIndex.from_tuples( + [ + (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), + (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)), + ] + ), + ), + ], +) def test_bins_monotonic_not_overflowing(x, bins, expected): # GH 26045 result = cut(x, bins) @@ -143,16 +170,19 @@ def test_bins_monotonic_not_overflowing(x, bins, expected): def test_wrong_num_labels(): msg = "Bin labels must be one fewer than the number of bin edges" - data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1] with pytest.raises(ValueError, match=msg): cut(data, [0, 1, 10], labels=["foo", "bar", "baz"]) -@pytest.mark.parametrize("x,bins,msg", [ - ([], 2, "Cannot cut empty array"), - ([1, 2, 3], 0.5, "`bins` should be a positive integer") -]) +@pytest.mark.parametrize( + "x,bins,msg", + [ + ([], 2, "Cannot cut empty array"), + ([1, 2, 3], 0.5, "`bins` should be a positive integer"), + ], +) def test_cut_corner(x, bins, msg): with pytest.raises(ValueError, match=msg): cut(x, bins) @@ -166,13 +196,17 @@ def test_cut_not_1d_arg(arg, cut_func): cut_func(arg, 2) -@pytest.mark.parametrize('data', [ - [0, 1, 2, 3, 4, np.inf], - [-np.inf, 0, 1, 2, 3, 4], - [-np.inf, 0, 1, 2, 3, 4, np.inf]]) +@pytest.mark.parametrize( + "data", + [ + [0, 1, 2, 3, 4, np.inf], + [-np.inf, 0, 1, 2, 3, 4], + [-np.inf, 0, 1, 2, 3, 4, np.inf], + ], +) def test_int_bins_with_inf(data): # GH 24314 - msg = 'cannot specify integer `bins` when input data contains infinity' + msg = "cannot specify integer `bins` when input data contains infinity" with pytest.raises(ValueError, match=msg): cut(data, bins=3) @@ -188,10 +222,13 @@ def test_cut_out_of_range_more(): tm.assert_series_equal(ind, exp) 
-@pytest.mark.parametrize("right,breaks,closed", [ - (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"), - (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left") -]) +@pytest.mark.parametrize( + "right,breaks,closed", + [ + (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"), + (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"), + ], +) def test_labels(right, breaks, closed): arr = np.tile(np.arange(0, 1.01, 0.1), 4) @@ -254,14 +291,23 @@ def test_cut_out_of_bounds(): tm.assert_numpy_array_equal(mask, ex_mask) -@pytest.mark.parametrize("get_labels,get_expected", [ - (lambda labels: labels, - lambda labels: Categorical(["Medium"] + 4 * ["Small"] + - ["Medium", "Large"], - categories=labels, ordered=True)), - (lambda labels: Categorical.from_codes([0, 1, 2], labels), - lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)) -]) +@pytest.mark.parametrize( + "get_labels,get_expected", + [ + ( + lambda labels: labels, + lambda labels: Categorical( + ["Medium"] + 4 * ["Small"] + ["Medium", "Large"], + categories=labels, + ordered=True, + ), + ), + ( + lambda labels: Categorical.from_codes([0, 1, 2], labels), + lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels), + ), + ], +) def test_cut_pass_labels(get_labels, get_expected): bins = [0, 25, 50, 100] arr = [50, 5, 10, 15, 20, 30, 70] @@ -277,23 +323,25 @@ def test_cut_pass_labels_compat(): labels = ["Good", "Medium", "Bad"] result = cut(arr, 3, labels=labels) - exp = cut(arr, 3, labels=Categorical(labels, categories=labels, - ordered=True)) + exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True)) tm.assert_categorical_equal(result, exp) -@pytest.mark.parametrize("x", [np.arange(11.), np.arange(11.) / 1e10]) +@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10]) def test_round_frac_just_works(x): # It works. 
cut(x, 2) -@pytest.mark.parametrize("val,precision,expected", [ - (-117.9998, 3, -118), - (117.9998, 3, 118), - (117.9998, 2, 118), - (0.000123456, 2, 0.00012) -]) +@pytest.mark.parametrize( + "val,precision,expected", + [ + (-117.9998, 3, -118), + (117.9998, 3, 118), + (117.9998, 2, 118), + (0.000123456, 2, 0.00012), + ], +) def test_round_frac(val, precision, expected): # see gh-1979 result = tmod._round_frac(val, precision=precision) @@ -307,8 +355,11 @@ def test_cut_return_intervals(): exp_bins = np.linspace(0, 8, num=4).round(3) exp_bins[0] -= 0.008 - expected = Series(IntervalIndex.from_breaks(exp_bins, closed="right").take( - [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex.from_breaks(exp_bins, closed="right").take( + [0, 0, 0, 1, 1, 1, 2, 2, 2] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) @@ -317,17 +368,21 @@ def test_series_ret_bins(): ser = Series(np.arange(4)) result, bins = cut(ser, 2, retbins=True) - expected = Series(IntervalIndex.from_breaks( - [-0.003, 1.5, 3], closed="right").repeat(2)).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(duplicates="drop"), None), - (dict(), "Bin edges must be unique"), - (dict(duplicates="raise"), "Bin edges must be unique"), - (dict(duplicates="foo"), "invalid value for 'duplicates' parameter") -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) def test_cut_duplicates_bin(kwargs, msg): # see gh-20947 bins = [0, 2, 4, 6, 10, 10] @@ -354,8 +409,8 @@ def test_single_bin(data, length): @pytest.mark.parametrize( - "array_1_writeable,array_2_writeable", - [(True, True), (True, False), (False, False)]) + "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)] +) def test_cut_read_only(array_1_writeable, array_2_writeable): # issue 18773 array_1 = np.arange(0, 100, 10) @@ -365,58 +420,93 @@ def test_cut_read_only(array_1_writeable, array_2_writeable): array_2.flags.writeable = array_2_writeable hundred_elements = np.arange(100) - tm.assert_categorical_equal(cut(hundred_elements, array_1), - cut(hundred_elements, array_2)) + tm.assert_categorical_equal( + cut(hundred_elements, array_1), cut(hundred_elements, array_2) + ) -@pytest.mark.parametrize("conv", [ - lambda v: Timestamp(v), - lambda v: to_datetime(v), - lambda v: np.datetime64(v), - lambda v: Timestamp(v).to_pydatetime(), -]) +@pytest.mark.parametrize( + "conv", + [ + lambda v: Timestamp(v), + lambda v: to_datetime(v), + lambda v: np.datetime64(v), + lambda v: Timestamp(v).to_pydatetime(), + ], +) def test_datetime_bin(conv): data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")] bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"] - expected = Series(IntervalIndex([ - Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), - Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), + ] + ) + ).astype(CDT(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, 
bins=bins)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data", [ - to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), - [np.datetime64("2013-01-01"), np.datetime64("2013-01-02"), - np.datetime64("2013-01-03")], - np.array([np.datetime64("2013-01-01"), np.datetime64("2013-01-02"), - np.datetime64("2013-01-03")]), - DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]) -]) +@pytest.mark.parametrize( + "data", + [ + to_datetime(Series(["2013-01-01", "2013-01-02", "2013-01-03"])), + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ], + np.array( + [ + np.datetime64("2013-01-01"), + np.datetime64("2013-01-02"), + np.datetime64("2013-01-03"), + ] + ), + DatetimeIndex(["2013-01-01", "2013-01-02", "2013-01-03"]), + ], +) def test_datetime_cut(data): # see gh-14714 # # Testing time data when it comes in various collection types. result, _ = cut(data, 3, retbins=True) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:57:07.200000"), - Timestamp("2013-01-01 16:00:00")), - Interval(Timestamp("2013-01-01 16:00:00"), - Timestamp("2013-01-02 08:00:00")), - Interval(Timestamp("2013-01-02 08:00:00"), - Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000"), + Timestamp("2013-01-01 16:00:00"), + ), + Interval( + Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00") + ), + Interval( + Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00") + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(Series(result), expected) -@pytest.mark.parametrize("bins", [ - 3, [Timestamp("2013-01-01 04:57:07.200000"), - Timestamp("2013-01-01 21:00:00"), - Timestamp("2013-01-02 13:00:00"), - Timestamp("2013-01-03 05:00:00")]]) +@pytest.mark.parametrize( + "bins", + [ + 3, + [ + Timestamp("2013-01-01 04:57:07.200000"), + Timestamp("2013-01-01 21:00:00"), + Timestamp("2013-01-02 13:00:00"), + Timestamp("2013-01-03 05:00:00"), + ], + ], +) @pytest.mark.parametrize("box", [list, np.array, Index, Series]) def test_datetime_tz_cut(bins, box): # see gh-19872 @@ -427,14 +517,24 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(s, bins) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz)), - Interval(Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz)), - Interval(Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) @@ -446,15 +546,15 @@ def test_datetime_nan_error(): def test_datetime_nan_mask(): - result = cut(date_range("20130102", periods=5), - bins=date_range("20130101", periods=2)) + result = cut( + date_range("20130102", periods=5), bins=date_range("20130101", periods=2) + ) mask = result.categories.isna() tm.assert_numpy_array_equal(mask, np.array([False])) mask = result.isna() - tm.assert_numpy_array_equal(mask, np.array([False, True, True, - True, True])) 
+ tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True])) @pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"]) @@ -466,9 +566,9 @@ def test_datetime_cut_roundtrip(tz): expected = cut(ser, result_bins) tm.assert_series_equal(result, expected) - expected_bins = DatetimeIndex(["2017-12-31 23:57:07.200000", - "2018-01-02 00:00:00", - "2018-01-03 00:00:00"]) + expected_bins = DatetimeIndex( + ["2017-12-31 23:57:07.200000", "2018-01-02 00:00:00", "2018-01-03 00:00:00"] + ) expected_bins = expected_bins.tz_localize(tz) tm.assert_index_equal(result_bins, expected_bins) @@ -481,7 +581,7 @@ def test_timedelta_cut_roundtrip(): expected = cut(ser, result_bins) tm.assert_series_equal(result, expected) - expected_bins = TimedeltaIndex(["0 days 23:57:07.200000", - "2 days 00:00:00", - "3 days 00:00:00"]) + expected_bins = TimedeltaIndex( + ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"] + ) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index fbec775bbf407..56e83ada9eb99 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -8,232 +8,269 @@ class TestMelt: - def setup_method(self, method): self.df = tm.makeTimeDataFrame()[:10] - self.df['id1'] = (self.df['A'] > 0).astype(np.int64) - self.df['id2'] = (self.df['B'] > 0).astype(np.int64) - - self.var_name = 'var' - self.value_name = 'val' - - self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867 - ], [-1.321405, 0.368915, -1.055342], - [-0.807333, 0.08298, -0.873361]]) - self.df1.columns = [list('ABC'), list('abc')] - self.df1.columns.names = ['CAP', 'low'] + self.df["id1"] = (self.df["A"] > 0).astype(np.int64) + self.df["id2"] = (self.df["B"] > 0).astype(np.int64) + + self.var_name = "var" + self.value_name = "val" + + self.df1 = pd.DataFrame( + [ + [1.067683, -1.110463, 0.20867], + [-1.321405, 0.368915, -1.055342], + [-0.807333, 0.08298, -0.873361], + ] + ) + self.df1.columns = [list("ABC"), list("abc")] + self.df1.columns.names = ["CAP", "low"] def test_top_level_method(self): result = melt(self.df) - assert result.columns.tolist() == ['variable', 'value'] + assert result.columns.tolist() == ["variable", "value"] def test_method_signatures(self): - tm.assert_frame_equal(self.df.melt(), - melt(self.df)) + tm.assert_frame_equal(self.df.melt(), melt(self.df)) - tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'], - value_vars=['A', 'B']), - melt(self.df, - id_vars=['id1', 'id2'], - value_vars=['A', 'B'])) + tm.assert_frame_equal( + self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]), + melt(self.df, id_vars=["id1", "id2"], value_vars=["A", "B"]), + ) - tm.assert_frame_equal(self.df.melt(var_name=self.var_name, - value_name=self.value_name), - melt(self.df, - var_name=self.var_name, - value_name=self.value_name)) + tm.assert_frame_equal( + self.df.melt(var_name=self.var_name, value_name=self.value_name), + melt(self.df, var_name=self.var_name, value_name=self.value_name), + ) - tm.assert_frame_equal(self.df1.melt(col_level=0), - melt(self.df1, col_level=0)) + tm.assert_frame_equal(self.df1.melt(col_level=0), melt(self.df1, col_level=0)) def test_default_col_names(self): result = self.df.melt() - assert result.columns.tolist() == ['variable', 'value'] + assert result.columns.tolist() == ["variable", "value"] - result1 = self.df.melt(id_vars=['id1']) - assert result1.columns.tolist() == ['id1', 'variable', 'value'] + result1 = self.df.melt(id_vars=["id1"]) + assert 
result1.columns.tolist() == ["id1", "variable", "value"] - result2 = self.df.melt(id_vars=['id1', 'id2']) - assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value'] + result2 = self.df.melt(id_vars=["id1", "id2"]) + assert result2.columns.tolist() == ["id1", "id2", "variable", "value"] def test_value_vars(self): - result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A') + result3 = self.df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 - result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B']) - expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) + result4 = self.df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]) + expected4 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) tm.assert_frame_equal(result4, expected4) def test_value_vars_types(self): # GH 15348 - expected = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', 'value']) + expected = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", "value"], + ) for type_ in (tuple, list, np.array): - result = self.df.melt(id_vars=['id1', 'id2'], - value_vars=type_(('A', 'B'))) + result = self.df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B"))) tm.assert_frame_equal(result, expected) def test_vars_work_with_multiindex(self): - expected = DataFrame({ - ('A', 'a'): self.df1[('A', 'a')], - 'CAP': ['B'] * len(self.df1), - 'low': ['b'] * len(self.df1), - 'value': self.df1[('B', 'b')], - }, columns=[('A', 'a'), 'CAP', 'low', 'value']) - - result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')]) + expected = DataFrame( + { + ("A", "a"): self.df1[("A", "a")], + "CAP": ["B"] * len(self.df1), + "low": ["b"] * len(self.df1), + "value": self.df1[("B", "b")], + }, + columns=[("A", "a"), "CAP", "low", "value"], + ) + + result = self.df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")]) tm.assert_frame_equal(result, expected) def test_single_vars_work_with_multiindex(self): - expected = DataFrame({ - 'A': {0: 1.067683, 1: -1.321405, 2: -0.807333}, - 'CAP': {0: 'B', 1: 'B', 2: 'B'}, - 'value': {0: -1.110463, 1: 0.368915, 2: 0.08298}}) - result = self.df1.melt(['A'], ['B'], col_level=0) + expected = DataFrame( + { + "A": {0: 1.067683, 1: -1.321405, 2: -0.807333}, + "CAP": {0: "B", 1: "B", 2: "B"}, + "value": {0: -1.110463, 1: 0.368915, 2: 0.08298}, + } + ) + result = self.df1.melt(["A"], ["B"], col_level=0) tm.assert_frame_equal(result, expected) def test_tuple_vars_fail_with_multiindex(self): # melt should fail with an informative error message if # the columns have a MultiIndex and a tuple is passed # for id_vars or value_vars. 
- tuple_a = ('A', 'a') + tuple_a = ("A", "a") list_a = [tuple_a] - tuple_b = ('B', 'b') + tuple_b = ("B", "b") list_b = [tuple_b] - msg = (r"(id|value)_vars must be a list of tuples when columns are" - " a MultiIndex") - for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b), - (tuple_a, tuple_b)): + msg = ( + r"(id|value)_vars must be a list of tuples when columns are" " a MultiIndex" + ) + for id_vars, value_vars in ( + (tuple_a, list_b), + (list_a, tuple_b), + (tuple_a, tuple_b), + ): with pytest.raises(ValueError, match=msg): self.df1.melt(id_vars=id_vars, value_vars=value_vars) def test_custom_var_name(self): result5 = self.df.melt(var_name=self.var_name) - assert result5.columns.tolist() == ['var', 'value'] - - result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name) - assert result6.columns.tolist() == ['id1', 'var', 'value'] - - result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name) - assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name) - assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value'] - - result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name) - expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - 'value': (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, 'value']) + assert result5.columns.tolist() == ["var", "value"] + + result6 = self.df.melt(id_vars=["id1"], var_name=self.var_name) + assert result6.columns.tolist() == ["id1", "var", "value"] + + result7 = self.df.melt(id_vars=["id1", "id2"], var_name=self.var_name) + assert result7.columns.tolist() == ["id1", "id2", "var", "value"] + + result8 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", var_name=self.var_name + ) + assert result8.columns.tolist() == ["id1", "id2", "var", "value"] + + result9 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=self.var_name + ) + expected9 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + "value": (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, "value"], + ) tm.assert_frame_equal(result9, expected9) def test_custom_value_name(self): result10 = self.df.melt(value_name=self.value_name) - assert result10.columns.tolist() == ['variable', 'val'] - - result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name) - assert result11.columns.tolist() == ['id1', 'variable', 'val'] - - result12 = self.df.melt(id_vars=['id1', 'id2'], - value_name=self.value_name) - assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - value_name=self.value_name) - assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val'] - - result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - value_name=self.value_name) - expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - 'variable': ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', 'variable', - self.value_name]) + assert result10.columns.tolist() == ["variable", "val"] + + result11 = self.df.melt(id_vars=["id1"], value_name=self.value_name) + assert 
result11.columns.tolist() == ["id1", "variable", "val"] + + result12 = self.df.melt(id_vars=["id1", "id2"], value_name=self.value_name) + assert result12.columns.tolist() == ["id1", "id2", "variable", "val"] + + result13 = self.df.melt( + id_vars=["id1", "id2"], value_vars="A", value_name=self.value_name + ) + assert result13.columns.tolist() == ["id1", "id2", "variable", "val"] + + result14 = self.df.melt( + id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=self.value_name + ) + expected14 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + "variable": ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", "variable", self.value_name], + ) tm.assert_frame_equal(result14, expected14) def test_custom_var_and_value_name(self): - result15 = self.df.melt(var_name=self.var_name, - value_name=self.value_name) - assert result15.columns.tolist() == ['var', 'val'] - - result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name, - value_name=self.value_name) - assert result16.columns.tolist() == ['id1', 'var', 'val'] - - result17 = self.df.melt(id_vars=['id1', 'id2'], - var_name=self.var_name, - value_name=self.value_name) - assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A', - var_name=self.var_name, - value_name=self.value_name) - assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val'] - - result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'], - var_name=self.var_name, - value_name=self.value_name) - expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2, - 'id2': self.df['id2'].tolist() * 2, - self.var_name: ['A'] * 10 + ['B'] * 10, - self.value_name: (self.df['A'].tolist() + - self.df['B'].tolist())}, - columns=['id1', 'id2', self.var_name, - self.value_name]) + result15 = self.df.melt(var_name=self.var_name, value_name=self.value_name) + assert result15.columns.tolist() == ["var", "val"] + + result16 = self.df.melt( + id_vars=["id1"], var_name=self.var_name, value_name=self.value_name + ) + assert result16.columns.tolist() == ["id1", "var", "val"] + + result17 = self.df.melt( + id_vars=["id1", "id2"], var_name=self.var_name, value_name=self.value_name + ) + assert result17.columns.tolist() == ["id1", "id2", "var", "val"] + + result18 = self.df.melt( + id_vars=["id1", "id2"], + value_vars="A", + var_name=self.var_name, + value_name=self.value_name, + ) + assert result18.columns.tolist() == ["id1", "id2", "var", "val"] + + result19 = self.df.melt( + id_vars=["id1", "id2"], + value_vars=["A", "B"], + var_name=self.var_name, + value_name=self.value_name, + ) + expected19 = DataFrame( + { + "id1": self.df["id1"].tolist() * 2, + "id2": self.df["id2"].tolist() * 2, + self.var_name: ["A"] * 10 + ["B"] * 10, + self.value_name: (self.df["A"].tolist() + self.df["B"].tolist()), + }, + columns=["id1", "id2", self.var_name, self.value_name], + ) tm.assert_frame_equal(result19, expected19) df20 = self.df.copy() - df20.columns.name = 'foo' + df20.columns.name = "foo" result20 = df20.melt() - assert result20.columns.tolist() == ['foo', 'value'] + assert result20.columns.tolist() == ["foo", "value"] def test_col_level(self): res1 = self.df1.melt(col_level=0) - res2 = self.df1.melt(col_level='CAP') - assert res1.columns.tolist() == ['CAP', 'value'] - assert res2.columns.tolist() == ['CAP', 'value'] + res2 = self.df1.melt(col_level="CAP") + assert res1.columns.tolist() == ["CAP", 
"value"] + assert res2.columns.tolist() == ["CAP", "value"] def test_multiindex(self): res = self.df1.melt() - assert res.columns.tolist() == ['CAP', 'low', 'value'] - - @pytest.mark.parametrize("col", [ - pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')), - pd.Series(["a", "b", "c", "a", "d"], dtype="category"), - pd.Series([0, 1, 0, 0, 0])]) + assert res.columns.tolist() == ["CAP", "low", "value"] + + @pytest.mark.parametrize( + "col", + [ + pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")), + pd.Series(["a", "b", "c", "a", "d"], dtype="category"), + pd.Series([0, 1, 0, 0, 0]), + ], + ) def test_pandas_dtypes(self, col): # GH 15785 - df = DataFrame({'klass': range(5), - 'col': col, - 'attr1': [1, 0, 0, 0, 0], - 'attr2': col}) - expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], - ignore_index=True) - result = melt(df, id_vars=['klass', 'col'], var_name='attribute', - value_name='value') - expected = DataFrame({0: list(range(5)) * 2, - 1: pd.concat([col] * 2, ignore_index=True), - 2: ['attr1'] * 5 + ['attr2'] * 5, - 3: expected_value}) - expected.columns = ['klass', 'col', 'attribute', 'value'] + df = DataFrame( + {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col} + ) + expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True) + result = melt( + df, id_vars=["klass", "col"], var_name="attribute", value_name="value" + ) + expected = DataFrame( + { + 0: list(range(5)) * 2, + 1: pd.concat([col] * 2, ignore_index=True), + 2: ["attr1"] * 5 + ["attr2"] * 5, + 3: expected_value, + } + ) + expected.columns = ["klass", "col", "attribute", "value"] tm.assert_frame_equal(result, expected) def test_melt_missing_columns_raises(self): @@ -242,472 +279,688 @@ def test_melt_missing_columns_raises(self): # attempted with column names absent from the dataframe # Generate data - df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) + df = pd.DataFrame(np.random.randn(5, 4), columns=list("abcd")) # Try to melt with missing `value_vars` column name msg = "The following '{Var}' are not present in the DataFrame: {Col}" with pytest.raises( - KeyError, - match=msg.format(Var='value_vars', Col="\\['C'\\]")): - df.melt(['a', 'b'], ['C', 'd']) + KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]") + ): + df.melt(["a", "b"], ["C", "d"]) # Try to melt with missing `id_vars` column name - with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', Col="\\['A'\\]")): - df.melt(['A', 'b'], ['c', 'd']) + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")): + df.melt(["A", "b"], ["c", "d"]) # Multiple missing with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', - Col="\\['not_here', 'or_there'\\]")): - df.melt(['a', 'b', 'not_here', 'or_there'], ['c', 'd']) + KeyError, + match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"), + ): + df.melt(["a", "b", "not_here", "or_there"], ["c", "d"]) # Multiindex melt fails if column is missing from multilevel melt multi = df.copy() - multi.columns = [list('ABCD'), list('abcd')] - with pytest.raises( - KeyError, - match=msg.format(Var='id_vars', - Col="\\['E'\\]")): - multi.melt([('E', 'a')], [('B', 'b')]) + multi.columns = [list("ABCD"), list("abcd")] + with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")): + multi.melt([("E", "a")], [("B", "b")]) # Multiindex fails if column is missing from single level melt with pytest.raises( - KeyError, - match=msg.format(Var='value_vars', - Col="\\['F'\\]")): - 
multi.melt(['A'], ['F'], col_level=0) + KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]") + ): + multi.melt(["A"], ["F"], col_level=0) class TestLreshape: - def test_pairs(self): - data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009'], - 'visitdt2': - ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], - 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], - 'wt1': [1823, 3338, 1549, 3298, 4306], - 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], - 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} + data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [1766, 3301, 1454, 3139, 4133], + "id": [101, 102, 103, 104, 105], + "sex": ["Male", "Female", "Female", "Female", "Female"], + "visitdt1": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + ], + "visitdt2": ["21jan2009", nan, "22jan2009", "31dec2008", "03feb2009"], + "visitdt3": ["05feb2009", nan, nan, "02jan2009", "15feb2009"], + "wt1": [1823, 3338, 1549, 3298, 4306], + "wt2": [2011.0, nan, 1892.0, 3338.0, 4575.0], + "wt3": [2293.0, nan, nan, 3377.0, 4805.0], + } df = DataFrame(data) - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], - 'wt': ['wt%d' % i for i in range(1, 4)]} + spec = { + "visitdt": ["visitdt%d" % i for i in range(1, 4)], + "wt": ["wt%d" % i for i in range(1, 4)], + } result = lreshape(df, spec) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, - 4133, 1766, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, - 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Male', - 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', '02jan2009', '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, - 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 1454, + 3139, + 4133, + 1766, + 3139, + 4133, + ], + "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + 3377.0, + 4805.0, + ], + } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) result = lreshape(df, spec, dropna=False) - exp_data = {'birthdt': - ['08jan2009', '20dec2008', '30dec2008', '21dec2008', - '11jan2009', '08jan2009', 
'20dec2008', '30dec2008', - '21dec2008', '11jan2009', '08jan2009', '20dec2008', - '30dec2008', '21dec2008', '11jan2009'], - 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, - 3139, 4133, 1766, 3301, 1454, 3139, 4133], - 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, - 101, 102, 103, 104, 105], - 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female', - 'Male', 'Female', 'Female', 'Female', 'Female'], - 'visitdt': ['11jan2009', '22dec2008', '04jan2009', - '29dec2008', '20jan2009', '21jan2009', nan, - '22jan2009', '31dec2008', '03feb2009', - '05feb2009', nan, nan, '02jan2009', - '15feb2009'], - 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, - 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, - 4805.0]} + exp_data = { + "birthdt": [ + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + "08jan2009", + "20dec2008", + "30dec2008", + "21dec2008", + "11jan2009", + ], + "birthwt": [ + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + 1766, + 3301, + 1454, + 3139, + 4133, + ], + "id": [ + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + 101, + 102, + 103, + 104, + 105, + ], + "sex": [ + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + "Male", + "Female", + "Female", + "Female", + "Female", + ], + "visitdt": [ + "11jan2009", + "22dec2008", + "04jan2009", + "29dec2008", + "20jan2009", + "21jan2009", + nan, + "22jan2009", + "31dec2008", + "03feb2009", + "05feb2009", + nan, + nan, + "02jan2009", + "15feb2009", + ], + "wt": [ + 1823.0, + 3338.0, + 1549.0, + 3298.0, + 4306.0, + 2011.0, + nan, + 1892.0, + 3338.0, + 4575.0, + 2293.0, + nan, + nan, + 3377.0, + 4805.0, + ], + } exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) - spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], - 'wt': ['wt%d' % i for i in range(1, 4)]} + spec = { + "visitdt": ["visitdt%d" % i for i in range(1, 3)], + "wt": ["wt%d" % i for i in range(1, 4)], + } msg = "All column lists must be same length" with pytest.raises(ValueError, match=msg): lreshape(df, spec) class TestWideToLong: - def test_simple(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A1970": {0: "a", - 1: "b", - 2: "c"}, - "A1980": {0: "d", - 1: "e", - 2: "f"}, - "B1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_stubs(self): # GH9204 df = pd.DataFrame([[0, 1, 2, 3, 
8], [4, 5, 6, 7, 9]]) - df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2'] - stubs = ['inc', 'edu'] + df.columns = ["id", "inc1", "inc2", "edu1", "edu2"] + stubs = ["inc", "edu"] # TODO: unused? - df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa + df_long = pd.wide_to_long(df, stubs, i="id", j="age") # noqa - assert stubs == ['inc', 'edu'] + assert stubs == ["inc", "edu"] def test_separating_character(self): # GH14779 np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A.1970": {0: "a", - 1: "b", - 2: "c"}, - "A.1980": {0: "d", - 1: "e", - 2: "f"}, - "B.1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B.1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A.1970": {0: "a", 1: "b", 2: "c"}, + "A.1980": {0: "d", 1: "e", 2: "f"}, + "B.1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B.1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A": ['a', 'b', 'c', 'd', 'e', 'f'], - "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A": ["a", "b", "c", "d", "e", "f"], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") tm.assert_frame_equal(result, expected) def test_escapable_characters(self): np.random.seed(123) x = np.random.randn(3) - df = pd.DataFrame({"A(quarterly)1970": {0: "a", - 1: "b", - 2: "c"}, - "A(quarterly)1980": {0: "d", - 1: "e", - 2: "f"}, - "B(quarterly)1970": {0: 2.5, - 1: 1.2, - 2: .7}, - "B(quarterly)1980": {0: 3.2, - 1: 1.3, - 2: .1}, - "X": dict(zip( - range(3), x))}) + df = pd.DataFrame( + { + "A(quarterly)1970": {0: "a", 1: "b", 2: "c"}, + "A(quarterly)1980": {0: "d", 1: "e", 2: "f"}, + "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), x)), + } + ) df["id"] = df.index - exp_data = {"X": x.tolist() + x.tolist(), - "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], - "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], - "id": [0, 1, 2, 0, 1, 2]} + exp_data = { + "X": x.tolist() + x.tolist(), + "A(quarterly)": ["a", "b", "c", "d", "e", "f"], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": [1970, 1970, 1970, 1980, 1980, 1980], + "id": [0, 1, 2, 0, 1, 2], + } expected = DataFrame(exp_data) - expected = expected.set_index( - ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] - result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], - i="id", j="year") + expected = expected.set_index(["id", "year"])[ + ["X", "A(quarterly)", "B(quarterly)"] + ] + result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_unbalanced(self): # test that we can have a varying amount of time variables - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], - 'A': [1.0, 3.0, 2.0, 4.0], - 'B': [5.0, np.nan, 6.0, np.nan], - 'id': [0, 0, 1, 1], - 'year': [2010, 2011, 2010, 2011]} + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 
6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": ["X1", "X1", "X2", "X2"], + "A": [1.0, 3.0, 2.0, 4.0], + "B": [5.0, np.nan, 6.0, np.nan], + "id": [0, 0, 1, 1], + "year": [2010, 2011, 2010, 2011], + } expected = pd.DataFrame(exp_data) - expected = expected.set_index(['id', 'year'])[["X", "A", "B"]] - result = wide_to_long(df, ['A', 'B'], i='id', j='year') + expected = expected.set_index(["id", "year"])[["X", "A", "B"]] + result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result, expected) def test_character_overlap(self): # Test we handle overlapping characters in both id_vars and value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'BBBX': [91, 92, 93], - 'BBBZ': [91, 92, 93] - }) - df['id'] = df.index - expected = pd.DataFrame({ - 'BBBX': [91, 92, 93, 91, 92, 93], - 'BBBZ': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': [11, 11, 11, 12, 12, 12]}) - expected = expected.set_index(['id', 'year'])[ - ['BBBX', 'BBBZ', 'A', 'B', 'BB']] - result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "BBBX": [91, 92, 93], + "BBBZ": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "BBBX": [91, 92, 93, 91, 92, 93], + "BBBZ": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_separator(self): # if an invalid separator is supplied a empty data frame is returned - sep = 'nope!' - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'A2010': [], - 'A2011': [], - 'B2010': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - expected = pd.DataFrame(exp_data).astype({'year': 'int'}) - expected = expected.set_index(['id', 'year'])[[ - 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + sep = "nope!" 
+ df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "A2010": [], + "A2011": [], + "B2010": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + expected = expected.set_index(["id", "year"])[ + ["X", "A2010", "A2011", "B2010", "A", "B"] + ] expected.index.set_levels([0, 1], level=0, inplace=True) - result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_num_string_disambiguation(self): # Test that we can disambiguate number value_vars from # string value_vars - df = pd.DataFrame({ - 'A11': ['a11', 'a22', 'a33'], - 'A12': ['a21', 'a22', 'a23'], - 'B11': ['b11', 'b12', 'b13'], - 'B12': ['b21', 'b22', 'b23'], - 'BB11': [1, 2, 3], - 'BB12': [4, 5, 6], - 'Arating': [91, 92, 93], - 'Arating_old': [91, 92, 93] - }) - df['id'] = df.index - expected = pd.DataFrame({ - 'Arating': [91, 92, 93, 91, 92, 93], - 'Arating_old': [91, 92, 93, 91, 92, 93], - 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], - 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], - 'BB': [1, 2, 3, 4, 5, 6], - 'id': [0, 1, 2, 0, 1, 2], - 'year': [11, 11, 11, 12, 12, 12]}) - expected = expected.set_index(['id', 'year'])[ - ['Arating', 'Arating_old', 'A', 'B', 'BB']] - result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') - tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + df = pd.DataFrame( + { + "A11": ["a11", "a22", "a33"], + "A12": ["a21", "a22", "a23"], + "B11": ["b11", "b12", "b13"], + "B12": ["b21", "b22", "b23"], + "BB11": [1, 2, 3], + "BB12": [4, 5, 6], + "Arating": [91, 92, 93], + "Arating_old": [91, 92, 93], + } + ) + df["id"] = df.index + expected = pd.DataFrame( + { + "Arating": [91, 92, 93, 91, 92, 93], + "Arating_old": [91, 92, 93, 91, 92, 93], + "A": ["a11", "a22", "a33", "a21", "a22", "a23"], + "B": ["b11", "b12", "b13", "b21", "b22", "b23"], + "BB": [1, 2, 3, 4, 5, 6], + "id": [0, 1, 2, 0, 1, 2], + "year": [11, 11, 11, 12, 12, 12], + } + ) + expected = expected.set_index(["id", "year"])[ + ["Arating", "Arating_old", "A", "B", "BB"] + ] + result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_invalid_suffixtype(self): # If all stubs names end with a string, but a numeric suffix is # assumed, an empty data frame is returned - df = pd.DataFrame({'Aone': [1.0, 2.0], - 'Atwo': [3.0, 4.0], - 'Bone': [5.0, 6.0], - 'X': ['X1', 'X2']}) - df['id'] = df.index - exp_data = {'X': '', - 'Aone': [], - 'Atwo': [], - 'Bone': [], - 'id': [], - 'year': [], - 'A': [], - 'B': []} - expected = pd.DataFrame(exp_data).astype({'year': 'int'}) - - expected = expected.set_index(['id', 'year']) + df = pd.DataFrame( + { + "Aone": [1.0, 2.0], + "Atwo": [3.0, 4.0], + "Bone": [5.0, 6.0], + "X": ["X1", "X2"], + } + ) + df["id"] = df.index + exp_data = { + "X": "", + "Aone": [], + "Atwo": [], + "Bone": [], + "id": [], + "year": [], + "A": [], + "B": [], + } + expected = pd.DataFrame(exp_data).astype({"year": "int"}) + + expected = expected.set_index(["id", "year"]) expected.index.set_levels([0, 1], level=0, inplace=True) - result = wide_to_long(df, ['A', 'B'], i='id', j='year') - 
tm.assert_frame_equal(result.sort_index(axis=1), - expected.sort_index(axis=1)) + result = wide_to_long(df, ["A", "B"], i="id", j="year") + tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) def test_multiple_id_columns(self): # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm - df = pd.DataFrame({ - 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], - 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] - }) - expected = pd.DataFrame({ - 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, - 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], - 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], - 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], - 'age': [1, 2, 1, 2, 1, 2, 1, 2, 1, - 2, 1, 2, 1, 2, 1, 2, 1, 2] - }) - expected = expected.set_index(['famid', 'birth', 'age'])[['ht']] - result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + df = pd.DataFrame( + { + "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9], + } + ) + expected = pd.DataFrame( + { + "ht": [ + 2.8, + 3.4, + 2.9, + 3.8, + 2.2, + 2.9, + 2.0, + 3.2, + 1.8, + 2.8, + 1.9, + 2.4, + 2.2, + 3.3, + 2.3, + 3.4, + 2.1, + 2.9, + ], + "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], + } + ) + expected = expected.set_index(["famid", "birth", "age"])[["ht"]] + result = wide_to_long(df, "ht", i=["famid", "birth"], j="age") tm.assert_frame_equal(result, expected) def test_non_unique_idvars(self): # GH16382 # Raise an error message if non unique id vars (i) are passed - df = pd.DataFrame({ - 'A_A1': [1, 2, 3, 4, 5], - 'B_B1': [1, 2, 3, 4, 5], - 'x': [1, 1, 1, 1, 1] - }) + df = pd.DataFrame( + {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]} + ) msg = "the id variables need to uniquely identify each row" with pytest.raises(ValueError, match=msg): - wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') + wide_to_long(df, ["A_A", "B_B"], i="x", j="colname") def test_cast_j_int(self): - df = pd.DataFrame({ - 'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'], - 'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'], - 'actor_fb_likes_1': [1000.0, 40000.0, 11000.0], - 'actor_fb_likes_2': [936.0, 5000.0, 393.0], - 'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']}) - - expected = pd.DataFrame({ - 'actor': ['CCH Pounder', - 'Johnny Depp', - 'Christoph Waltz', - 'Joel David Moore', - 'Orlando Bloom', - 'Rory Kinnear'], - 'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], - 'num': [1, 1, 1, 2, 2, 2], - 'title': ['Avatar', - 'Pirates of the Caribbean', - 'Spectre', - 'Avatar', - 'Pirates of the Caribbean', - 'Spectre']}).set_index(['title', 'num']) - result = wide_to_long(df, ['actor', 'actor_fb_likes'], - i='title', j='num', sep='_') + df = pd.DataFrame( + { + "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"], + "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"], + "actor_fb_likes_1": [1000.0, 40000.0, 11000.0], + "actor_fb_likes_2": [936.0, 5000.0, 393.0], + "title": ["Avatar", "Pirates of the Caribbean", "Spectre"], + } + ) + + expected = pd.DataFrame( + { + "actor": [ + "CCH Pounder", + "Johnny Depp", + "Christoph 
Waltz", + "Joel David Moore", + "Orlando Bloom", + "Rory Kinnear", + ], + "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0], + "num": [1, 1, 1, 2, 2, 2], + "title": [ + "Avatar", + "Pirates of the Caribbean", + "Spectre", + "Avatar", + "Pirates of the Caribbean", + "Spectre", + ], + } + ).set_index(["title", "num"]) + result = wide_to_long( + df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_" + ) tm.assert_frame_equal(result, expected) def test_identical_stubnames(self): - df = pd.DataFrame({'A2010': [1.0, 2.0], - 'A2011': [3.0, 4.0], - 'B2010': [5.0, 6.0], - 'A': ['X1', 'X2']}) + df = pd.DataFrame( + { + "A2010": [1.0, 2.0], + "A2011": [3.0, 4.0], + "B2010": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) msg = "stubname can't be identical to a column name" with pytest.raises(ValueError, match=msg): - wide_to_long(df, ['A', 'B'], i='A', j='colname') + wide_to_long(df, ["A", "B"], i="A", j="colname") def test_nonnumeric_suffix(self): - df = pd.DataFrame({'treatment_placebo': [1.0, 2.0], - 'treatment_test': [3.0, 4.0], - 'result_placebo': [5.0, 6.0], - 'A': ['X1', 'X2']}) - expected = pd.DataFrame({ - 'A': ['X1', 'X1', 'X2', 'X2'], - 'colname': ['placebo', 'test', 'placebo', 'test'], - 'result': [5.0, np.nan, 6.0, np.nan], - 'treatment': [1.0, 3.0, 2.0, 4.0]}) - expected = expected.set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 'treatment'], - i='A', j='colname', suffix='[a-z]+', sep='_') + df = pd.DataFrame( + { + "treatment_placebo": [1.0, 2.0], + "treatment_test": [3.0, 4.0], + "result_placebo": [5.0, 6.0], + "A": ["X1", "X2"], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X2", "X2"], + "colname": ["placebo", "test", "placebo", "test"], + "result": [5.0, np.nan, 6.0, np.nan], + "treatment": [1.0, 3.0, 2.0, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_mixed_type_suffix(self): - df = pd.DataFrame({ - 'A': ['X1', 'X2'], - 'result_1': [0, 9], - 'result_foo': [5.0, 6.0], - 'treatment_1': [1.0, 2.0], - 'treatment_foo': [3.0, 4.0]}) - expected = pd.DataFrame({ - 'A': ['X1', 'X2', 'X1', 'X2'], - 'colname': ['1', '1', 'foo', 'foo'], - 'result': [0.0, 9.0, 5.0, 6.0], - 'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 'treatment'], - i='A', j='colname', suffix='.+', sep='_') + df = pd.DataFrame( + { + "A": ["X1", "X2"], + "result_1": [0, 9], + "result_foo": [5.0, 6.0], + "treatment_1": [1.0, 2.0], + "treatment_foo": [3.0, 4.0], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X2", "X1", "X2"], + "colname": ["1", "1", "foo", "foo"], + "result": [0.0, 9.0, 5.0, 6.0], + "treatment": [1.0, 2.0, 3.0, 4.0], + } + ).set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_float_suffix(self): - df = pd.DataFrame({ - 'treatment_1.1': [1.0, 2.0], - 'treatment_2.1': [3.0, 4.0], - 'result_1.2': [5.0, 6.0], - 'result_1': [0, 9], - 'A': ['X1', 'X2']}) - expected = pd.DataFrame({ - 'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'], - 'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], - 'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], - 'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]}) - expected = expected.set_index(['A', 'colname']) - result = wide_to_long(df, ['result', 
'treatment'], - i='A', j='colname', suffix='[0-9.]+', sep='_') + df = pd.DataFrame( + { + "treatment_1.1": [1.0, 2.0], + "treatment_2.1": [3.0, 4.0], + "result_1.2": [5.0, 6.0], + "result_1": [0, 9], + "A": ["X1", "X2"], + } + ) + expected = pd.DataFrame( + { + "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], + "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], + "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], + "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0], + } + ) + expected = expected.set_index(["A", "colname"]) + result = wide_to_long( + df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_" + ) tm.assert_frame_equal(result, expected) def test_col_substring_of_stubname(self): # GH22468 # Don't raise ValueError when a column name is a substring # of a stubname that's been passed as a string - wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, - 'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, - 'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, - 'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, - 'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67} - } + wide_data = { + "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81}, + "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6}, + "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67}, + "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}, + } wide_df = pd.DataFrame.from_dict(wide_data) - expected = pd.wide_to_long(wide_df, - stubnames=['PA'], - i=['node_id', 'A'], - j='time') - result = pd.wide_to_long(wide_df, - stubnames='PA', - i=['node_id', 'A'], - j='time') + expected = pd.wide_to_long( + wide_df, stubnames=["PA"], i=["node_id", "A"], j="time" + ) + result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7795c356bf43e..b497f6c3aa9b4 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -7,8 +7,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Grouper, Index, MultiIndex, Series, concat, - date_range) + Categorical, + DataFrame, + Grouper, + Index, + MultiIndex, + Series, + concat, + date_range, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import crosstab, pivot_table import pandas.util.testing as tm @@ -26,33 +33,68 @@ def interval_values(request, closed): class TestPivotTable: - def setup_method(self, method): - self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + self.data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) def test_pivot_table(self, observed): - index = ['A', 'B'] - columns = 'C' - table = 
pivot_table(self.data, values='D', - index=index, columns=columns, observed=observed) + index = ["A", "B"] + columns = "C" + table = pivot_table( + self.data, values="D", index=index, columns=columns, observed=observed + ) table2 = self.data.pivot_table( - values='D', index=index, columns=columns, observed=observed) + values="D", index=index, columns=columns, observed=observed + ) tm.assert_frame_equal(table, table2) # this works - pivot_table(self.data, values='D', index=index, observed=observed) + pivot_table(self.data, values="D", index=index, observed=observed) if len(index) > 1: assert table.index.names == tuple(index) @@ -64,174 +106,199 @@ def test_pivot_table(self, observed): else: assert table.columns.name == columns[0] - expected = self.data.groupby( - index + [columns])['D'].agg(np.mean).unstack() + expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) def test_pivot_table_categorical_observed_equal(self, observed): # issue #24923 - df = pd.DataFrame({'col1': list('abcde'), - 'col2': list('fghij'), - 'col3': [1, 2, 3, 4, 5]}) + df = pd.DataFrame( + {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]} + ) - expected = df.pivot_table(index='col1', values='col3', - columns='col2', aggfunc=np.sum, - fill_value=0) + expected = df.pivot_table( + index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + ) - expected.index = expected.index.astype('category') - expected.columns = expected.columns.astype('category') + expected.index = expected.index.astype("category") + expected.columns = expected.columns.astype("category") - df.col1 = df.col1.astype('category') - df.col2 = df.col2.astype('category') + df.col1 = df.col1.astype("category") + df.col2 = df.col2.astype("category") - result = df.pivot_table(index='col1', values='col3', - columns='col2', aggfunc=np.sum, - fill_value=0, observed=observed) + result = df.pivot_table( + index="col1", + values="col3", + columns="col2", + aggfunc=np.sum, + fill_value=0, + observed=observed, + ) tm.assert_frame_equal(result, expected) def test_pivot_table_nocols(self): - df = DataFrame({'rows': ['a', 'b', 'c'], - 'cols': ['x', 'y', 'z'], - 'values': [1, 2, 3]}) - rs = df.pivot_table(columns='cols', aggfunc=np.sum) - xp = df.pivot_table(index='cols', aggfunc=np.sum).T + df = DataFrame( + {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]} + ) + rs = df.pivot_table(columns="cols", aggfunc=np.sum) + xp = df.pivot_table(index="cols", aggfunc=np.sum).T tm.assert_frame_equal(rs, xp) - rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'}) - xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T + rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"}) + xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T tm.assert_frame_equal(rs, xp) def test_pivot_table_dropna(self): - df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000}, - 'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'}, - 'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310}, - 'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'}, - 'quantity': {0: 2000000, 1: 500000, - 2: 1000000, 3: 1000000}}) - pv_col = df.pivot_table('quantity', 'month', [ - 'customer', 'product'], dropna=False) + df = DataFrame( + { + "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000}, + "customer": {0: "A", 1: "A", 2: "B", 3: "C"}, + "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310}, + "product": {0: "a", 1: "b", 2: "c", 3: "d"}, + "quantity": {0: 2000000, 1: 500000, 2: 
1000000, 3: 1000000}, + } + ) + pv_col = df.pivot_table( + "quantity", "month", ["customer", "product"], dropna=False + ) pv_ind = df.pivot_table( - 'quantity', ['customer', 'product'], 'month', dropna=False) + "quantity", ["customer", "product"], "month", dropna=False + ) - m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'), - ('A', 'd'), ('B', 'a'), ('B', 'b'), - ('B', 'c'), ('B', 'd'), ('C', 'a'), - ('C', 'b'), ('C', 'c'), ('C', 'd')], - names=['customer', 'product']) + m = MultiIndex.from_tuples( + [ + ("A", "a"), + ("A", "b"), + ("A", "c"), + ("A", "d"), + ("B", "a"), + ("B", "b"), + ("B", "c"), + ("B", "d"), + ("C", "a"), + ("C", "b"), + ("C", "c"), + ("C", "d"), + ], + names=["customer", "product"], + ) tm.assert_index_equal(pv_col.columns, m) tm.assert_index_equal(pv_ind.index, m) def test_pivot_table_categorical(self): - cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) + cat1 = Categorical( + ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True + ) + cat2 = Categorical( + ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True + ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values='values', index=['A', 'B'], - dropna=True) + result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) - exp_index = pd.MultiIndex.from_arrays( - [cat1, cat2], - names=['A', 'B']) - expected = DataFrame( - {'values': [1, 2, 3, 4]}, - index=exp_index) + exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) + expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) tm.assert_frame_equal(result, expected) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 - categories = ['a', 'b', 'c', 'd'] - - df = DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], - 'B': [1, 2, 3, 1, 2, 3, 1, 2, 3], - 'C': range(0, 9)}) - - df['A'] = df['A'].astype(CDT(categories, ordered=False)) - result = df.pivot_table(index='B', columns='A', values='C', - dropna=dropna) - expected_columns = Series(['a', 'b', 'c'], name='A') - expected_columns = expected_columns.astype( - CDT(categories, ordered=False)) - expected_index = Series([1, 2, 3], name='B') - expected = DataFrame([[0, 3, 6], - [1, 4, 7], - [2, 5, 8]], - index=expected_index, - columns=expected_columns,) + categories = ["a", "b", "c", "d"] + + df = DataFrame( + { + "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "B": [1, 2, 3, 1, 2, 3, 1, 2, 3], + "C": range(0, 9), + } + ) + + df["A"] = df["A"].astype(CDT(categories, ordered=False)) + result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + expected_columns = Series(["a", "b", "c"], name="A") + expected_columns = expected_columns.astype(CDT(categories, ordered=False)) + expected_index = Series([1, 2, 3], name="B") + expected = DataFrame( + [[0, 3, 6], [1, 4, 7], [2, 5, 8]], + index=expected_index, + columns=expected_columns, + ) if not dropna: # add back the non observed to compare - expected = expected.reindex( - columns=Categorical(categories)).astype('float') + expected = expected.reindex(columns=Categorical(categories)).astype("float") tm.assert_frame_equal(result, expected) def test_pivot_with_non_observable_dropna(self, dropna): # gh-21133 df = pd.DataFrame( - {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], - categories=['low', 'high'], - ordered=True), - 'B': range(5)}) + { + "A": pd.Categorical( + [np.nan, "low", 
"high", "low", "high"], + categories=["low", "high"], + ordered=True, + ), + "B": range(5), + } + ) - result = df.pivot_table(index='A', values='B', dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( - {'B': [2, 3]}, + {"B": [2, 3]}, index=pd.Index( - pd.Categorical.from_codes([0, 1], - categories=['low', 'high'], - ordered=True), - name='A')) + pd.Categorical.from_codes( + [0, 1], categories=["low", "high"], ordered=True + ), + name="A", + ), + ) tm.assert_frame_equal(result, expected) # gh-21378 df = pd.DataFrame( - {'A': pd.Categorical(['left', 'low', 'high', 'low', 'high'], - categories=['low', 'high', 'left'], - ordered=True), - 'B': range(5)}) + { + "A": pd.Categorical( + ["left", "low", "high", "low", "high"], + categories=["low", "high", "left"], + ordered=True, + ), + "B": range(5), + } + ) - result = df.pivot_table(index='A', values='B', dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna) expected = pd.DataFrame( - {'B': [2, 3, 0]}, + {"B": [2, 3, 0]}, index=pd.Index( - pd.Categorical.from_codes([0, 1, 2], - categories=['low', 'high', 'left'], - ordered=True), - name='A')) + pd.Categorical.from_codes( + [0, 1, 2], categories=["low", "high", "left"], ordered=True + ), + name="A", + ), + ) tm.assert_frame_equal(result, expected) def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 - df = DataFrame( - {'A': interval_values, - 'B': 1}) - result = df.pivot_table(index='A', values='B', dropna=dropna) - expected = DataFrame( - {'B': 1}, - index=Index(interval_values.unique(), - name='A')) + df = DataFrame({"A": interval_values, "B": 1}) + result = df.pivot_table(index="A", values="B", dropna=dropna) + expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) tm.assert_frame_equal(result, expected) def test_pass_array(self): - result = self.data.pivot_table( - 'D', index=self.data.A, columns=self.data.C) - expected = self.data.pivot_table('D', index='A', columns='C') + result = self.data.pivot_table("D", index=self.data.A, columns=self.data.C) + expected = self.data.pivot_table("D", index="A", columns="C") tm.assert_frame_equal(result, expected) def test_pass_function(self): - result = self.data.pivot_table('D', index=lambda x: x // 5, - columns=self.data.C) - expected = self.data.pivot_table('D', index=self.data.index // 5, - columns='C') + result = self.data.pivot_table("D", index=lambda x: x // 5, columns=self.data.C) + expected = self.data.pivot_table("D", index=self.data.index // 5, columns="C") tm.assert_frame_equal(result, expected) def test_pivot_table_multiple(self): - index = ['A', 'B'] - columns = 'C' + index = ["A", "B"] + columns = "C" table = pivot_table(self.data, index=index, columns=columns) expected = self.data.groupby(index + [columns]).agg(np.mean).unstack() tm.assert_frame_equal(table, expected) @@ -239,417 +306,533 @@ def test_pivot_table_multiple(self): def test_pivot_dtypes(self): # can convert dtypes - f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [ - 1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b']}) - assert f.dtypes['v'] == 'int64' + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1, 2, 3, 4], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "int64" - z = pivot_table(f, values='v', index=['a'], columns=[ - 'i'], fill_value=0, aggfunc=np.sum) + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.sum + ) result = z.dtypes - expected = Series([np.dtype('int64')] * 2, - 
index=Index(list('ab'), name='i')) + expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) # cannot convert dtypes - f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [ - 1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b']}) - assert f.dtypes['v'] == 'float64' + f = DataFrame( + { + "a": ["cat", "bat", "cat", "bat"], + "v": [1.5, 2.5, 3.5, 4.5], + "i": ["a", "b", "a", "b"], + } + ) + assert f.dtypes["v"] == "float64" - z = pivot_table(f, values='v', index=['a'], columns=[ - 'i'], fill_value=0, aggfunc=np.mean) + z = pivot_table( + f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc=np.mean + ) result = z.dtypes - expected = Series([np.dtype('float64')] * 2, - index=Index(list('ab'), name='i')) + expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i")) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('columns,values', - [('bool1', ['float1', 'float2']), - ('bool1', ['float1', 'float2', 'bool1']), - ('bool2', ['float1', 'float2', 'bool1'])]) + @pytest.mark.parametrize( + "columns,values", + [ + ("bool1", ["float1", "float2"]), + ("bool1", ["float1", "float2", "bool1"]), + ("bool2", ["float1", "float2", "bool1"]), + ], + ) def test_pivot_preserve_dtypes(self, columns, values): # GH 7142 regression test v = np.arange(5, dtype=np.float64) - df = DataFrame({'float1': v, 'float2': v + 2.0, - 'bool1': v <= 2, 'bool2': v <= 3}) + df = DataFrame( + {"float1": v, "float2": v + 2.0, "bool1": v <= 2, "bool2": v <= 3} + ) df_res = df.reset_index().pivot_table( - index='index', columns=columns, values=values) + index="index", columns=columns, values=values + ) result = dict(df_res.dtypes) - expected = {col: np.dtype('O') if col[0].startswith('b') - else np.dtype('float64') for col in df_res} + expected = { + col: np.dtype("O") if col[0].startswith("b") else np.dtype("float64") + for col in df_res + } assert result == expected def test_pivot_no_values(self): # GH 14380 - idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02', - '2011-01-01', '2011-01-02']) - df = pd.DataFrame({'A': [1, 2, 3, 4, 5]}, - index=idx) + idx = pd.DatetimeIndex( + ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"] + ) + df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx) res = df.pivot_table(index=df.index.month, columns=df.index.day) - exp_columns = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)]) - exp = pd.DataFrame([[2.5, 4.0], [2.0, np.nan]], - index=[1, 2], columns=exp_columns) + exp_columns = pd.MultiIndex.from_tuples([("A", 1), ("A", 2)]) + exp = pd.DataFrame( + [[2.5, 4.0], [2.0, np.nan]], index=[1, 2], columns=exp_columns + ) tm.assert_frame_equal(res, exp) - df = pd.DataFrame({'A': [1, 2, 3, 4, 5], - 'dt': pd.date_range('2011-01-01', freq='D', - periods=5)}, - index=idx) - res = df.pivot_table(index=df.index.month, - columns=pd.Grouper(key='dt', freq='M')) - exp_columns = pd.MultiIndex.from_tuples([('A', - pd.Timestamp('2011-01-31'))]) - exp_columns.names = [None, 'dt'] - exp = pd.DataFrame([3.25, 2.0], - index=[1, 2], columns=exp_columns) + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5], + "dt": pd.date_range("2011-01-01", freq="D", periods=5), + }, + index=idx, + ) + res = df.pivot_table( + index=df.index.month, columns=pd.Grouper(key="dt", freq="M") + ) + exp_columns = pd.MultiIndex.from_tuples([("A", pd.Timestamp("2011-01-31"))]) + exp_columns.names = [None, "dt"] + exp = pd.DataFrame([3.25, 2.0], index=[1, 2], columns=exp_columns) tm.assert_frame_equal(res, exp) - res 
= df.pivot_table(index=pd.Grouper(freq='A'), - columns=pd.Grouper(key='dt', freq='M')) - exp = pd.DataFrame([3], - index=pd.DatetimeIndex(['2011-12-31']), - columns=exp_columns) + res = df.pivot_table( + index=pd.Grouper(freq="A"), columns=pd.Grouper(key="dt", freq="M") + ) + exp = pd.DataFrame( + [3], index=pd.DatetimeIndex(["2011-12-31"]), columns=exp_columns + ) tm.assert_frame_equal(res, exp) def test_pivot_multi_values(self): - result = pivot_table(self.data, values=['D', 'E'], - index='A', columns=['B', 'C'], fill_value=0) - expected = pivot_table(self.data.drop(['F'], axis=1), - index='A', columns=['B', 'C'], fill_value=0) + result = pivot_table( + self.data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0 + ) + expected = pivot_table( + self.data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0 + ) tm.assert_frame_equal(result, expected) def test_pivot_multi_functions(self): - f = lambda func: pivot_table(self.data, values=['D', 'E'], - index=['A', 'B'], columns='C', - aggfunc=func) + f = lambda func: pivot_table( + self.data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func + ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) - expected = concat([means, stds], keys=['mean', 'std'], axis=1) + expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) # margins not supported?? - f = lambda func: pivot_table(self.data, values=['D', 'E'], - index=['A', 'B'], columns='C', - aggfunc=func, margins=True) + f = lambda func: pivot_table( + self.data, + values=["D", "E"], + index=["A", "B"], + columns="C", + aggfunc=func, + margins=True, + ) result = f([np.mean, np.std]) means = f(np.mean) stds = f(np.std) - expected = concat([means, stds], keys=['mean', 'std'], axis=1) + expected = concat([means, stds], keys=["mean", "std"], axis=1) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_index_with_nan(self, method): # GH 3588 nan = np.nan - df = DataFrame({'a': ['R1', 'R2', nan, 'R4'], - 'b': ['C1', 'C2', 'C3', 'C4'], - 'c': [10, 15, 17, 20]}) + df = DataFrame( + { + "a": ["R1", "R2", nan, "R4"], + "b": ["C1", "C2", "C3", "C4"], + "c": [10, 15, 17, 20], + } + ) if method: - result = df.pivot('a', 'b', 'c') + result = df.pivot("a", "b", "c") else: - result = pd.pivot(df, 'a', 'b', 'c') - expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan], - [nan, 15, nan, nan], [nan, nan, nan, 20]], - index=Index([nan, 'R1', 'R2', 'R4'], name='a'), - columns=Index(['C1', 'C2', 'C3', 'C4'], name='b')) + result = pd.pivot(df, "a", "b", "c") + expected = DataFrame( + [ + [nan, nan, 17, nan], + [10, nan, nan, nan], + [nan, 15, nan, nan], + [nan, nan, nan, 20], + ], + index=Index([nan, "R1", "R2", "R4"], name="a"), + columns=Index(["C1", "C2", "C3", "C4"], name="b"), + ) tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T) + tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) # GH9491 - df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'), - 'c': 100 + np.arange(6)}) - df['b'] = df['a'] - pd.Timestamp('2014-02-02') - df.loc[1, 'a'] = df.loc[3, 'a'] = nan - df.loc[1, 'b'] = df.loc[4, 'b'] = nan + df = DataFrame( + { + "a": pd.date_range("2014-02-01", periods=6, freq="D"), + "c": 100 + np.arange(6), + } + ) + df["b"] = df["a"] - pd.Timestamp("2014-02-02") + df.loc[1, "a"] = df.loc[3, "a"] = nan + df.loc[1, "b"] = df.loc[4, 
"b"] = nan if method: - pv = df.pivot('a', 'b', 'c') + pv = df.pivot("a", "b", "c") else: - pv = pd.pivot(df, 'a', 'b', 'c') + pv = pd.pivot(df, "a", "b", "c") assert pv.notna().values.sum() == len(df) for _, row in df.iterrows(): - assert pv.loc[row['a'], row['b']] == row['c'] + assert pv.loc[row["a"], row["b"]] == row["c"] if method: - result = df.pivot('b', 'a', 'c') + result = df.pivot("b", "a", "c") else: - result = pd.pivot(df, 'b', 'a', 'c') + result = pd.pivot(df, "b", "a", "c") tm.assert_frame_equal(result, pv.T) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tz(self, method): # GH 5878 - df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0), - datetime(2013, 1, 1, 9, 0), - datetime(2013, 1, 2, 9, 0)], - 'dt2': [datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 1, 9, 0), - datetime(2014, 1, 2, 9, 0), - datetime(2014, 1, 2, 9, 0)], - 'data1': np.arange(4, dtype='int64'), - 'data2': np.arange(4, dtype='int64')}) - - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) - - exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) - exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', - '2014/01/02 09:00'] * 2, - name='dt2', tz='Asia/Tokyo') + df = DataFrame( + { + "dt1": [ + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + datetime(2013, 1, 1, 9, 0), + datetime(2013, 1, 2, 9, 0), + ], + "dt2": [ + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 1, 9, 0), + datetime(2014, 1, 2, 9, 0), + datetime(2014, 1, 2, 9, 0), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"] * 2, name="dt2", tz="Asia/Tokyo" + ) exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', - '2013/01/02 09:00'], - name='dt1', - tz='US/Pacific'), - columns=exp_col) + expected = DataFrame( + [[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=exp_col, + ) if method: - pv = df.pivot(index='dt1', columns='dt2') + pv = df.pivot(index="dt1", columns="dt2") else: - pv = pd.pivot(df, index='dt1', columns='dt2') + pv = pd.pivot(df, index="dt1", columns="dt2") tm.assert_frame_equal(pv, expected) - expected = DataFrame([[0, 2], [1, 3]], - index=pd.DatetimeIndex(['2013/01/01 09:00', - '2013/01/02 09:00'], - name='dt1', - tz='US/Pacific'), - columns=pd.DatetimeIndex(['2014/01/01 09:00', - '2014/01/02 09:00'], - name='dt2', - tz='Asia/Tokyo')) + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ), + columns=pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ), + ) if method: - pv = df.pivot(index='dt1', columns='dt2', values='data1') + pv = df.pivot(index="dt1", columns="dt2", values="data1") else: - pv = pd.pivot(df, index='dt1', columns='dt2', values='data1') + pv = pd.pivot(df, index="dt1", columns="dt2", values="data1") tm.assert_frame_equal(pv, expected) def test_pivot_tz_in_values(self): # GH 
14948 - df = pd.DataFrame([{'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 13:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 08:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-12 14:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-25 11:00:00-0700', - tz='US/Pacific')}, - {'uid': u'aa', - 'ts': pd.Timestamp('2016-08-25 13:00:00-0700', - tz='US/Pacific')}]) - - df = df.set_index('ts').reset_index() - mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, - second=0, microsecond=0)) - - result = pd.pivot_table(df.set_index('ts').reset_index(), - values='ts', index=['uid'], columns=[mins], - aggfunc=np.min) + df = pd.DataFrame( + [ + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + }, + { + "uid": u"aa", + "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"), + }, + ] + ) + + df = df.set_index("ts").reset_index() + mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0, microsecond=0)) + + result = pd.pivot_table( + df.set_index("ts").reset_index(), + values="ts", + index=["uid"], + columns=[mins], + aggfunc=np.min, + ) expected = pd.DataFrame( [ - [pd.Timestamp('2016-08-12 08:00:00-0700', tz='US/Pacific'), - pd.Timestamp('2016-08-25 11:00:00-0700', tz='US/Pacific')] + [ + pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"), + pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"), + ] ], - index=pd.Index(['aa'], name='uid'), + index=pd.Index(["aa"], name="uid"), columns=pd.DatetimeIndex( [ - pd.Timestamp('2016-08-12 00:00:00', tz='US/Pacific'), - pd.Timestamp('2016-08-25 00:00:00', tz='US/Pacific') + pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"), + pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"), ], - name='ts') + name="ts", + ), ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_periods(self, method): - df = DataFrame({'p1': [pd.Period('2013-01-01', 'D'), - pd.Period('2013-01-02', 'D'), - pd.Period('2013-01-01', 'D'), - pd.Period('2013-01-02', 'D')], - 'p2': [pd.Period('2013-01', 'M'), - pd.Period('2013-01', 'M'), - pd.Period('2013-02', 'M'), - pd.Period('2013-02', 'M')], - 'data1': np.arange(4, dtype='int64'), - 'data2': np.arange(4, dtype='int64')}) - - exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) - exp_col2 = pd.PeriodIndex(['2013-01', '2013-02'] * 2, - name='p2', freq='M') + df = DataFrame( + { + "p1": [ + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + pd.Period("2013-01-01", "D"), + pd.Period("2013-01-02", "D"), + ], + "p2": [ + pd.Period("2013-01", "M"), + pd.Period("2013-01", "M"), + pd.Period("2013-02", "M"), + pd.Period("2013-02", "M"), + ], + "data1": np.arange(4, dtype="int64"), + "data2": np.arange(4, dtype="int64"), + } + ) + + exp_col1 = Index(["data1", "data1", "data2", "data2"]) + exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M") exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], - index=pd.PeriodIndex(['2013-01-01', '2013-01-02'], - name='p1', freq='D'), - columns=exp_col) + expected = DataFrame( + [[0, 2, 
0, 2], [1, 3, 1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=exp_col, + ) if method: - pv = df.pivot(index='p1', columns='p2') + pv = df.pivot(index="p1", columns="p2") else: - pv = pd.pivot(df, index='p1', columns='p2') + pv = pd.pivot(df, index="p1", columns="p2") tm.assert_frame_equal(pv, expected) - expected = DataFrame([[0, 2], [1, 3]], - index=pd.PeriodIndex(['2013-01-01', '2013-01-02'], - name='p1', freq='D'), - columns=pd.PeriodIndex(['2013-01', '2013-02'], - name='p2', freq='M')) + expected = DataFrame( + [[0, 2], [1, 3]], + index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"), + columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"), + ) if method: - pv = df.pivot(index='p1', columns='p2', values='data1') + pv = df.pivot(index="p1", columns="p2", values="data1") else: - pv = pd.pivot(df, index='p1', columns='p2', values='data1') + pv = pd.pivot(df, index="p1", columns="p2", values="data1") tm.assert_frame_equal(pv, expected) - @pytest.mark.parametrize('values', [ - ['baz', 'zoo'], np.array(['baz', 'zoo']), - pd.Series(['baz', 'zoo']), pd.Index(['baz', 'zoo']) - ]) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize( + "values", + [ + ["baz", "zoo"], + np.array(["baz", "zoo"]), + pd.Series(["baz", "zoo"]), + pd.Index(["baz", "zoo"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values(self, values, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) if method: - result = df.pivot(index='foo', columns='bar', values=values) + result = df.pivot(index="foo", columns="bar", values=values) else: - result = pd.pivot(df, index='foo', columns='bar', values=values) - - data = [[1, 2, 3, 'x', 'y', 'z'], - [4, 5, 6, 'q', 'w', 't']] - index = Index(data=['one', 'two'], name='foo') - columns = MultiIndex(levels=[['baz', 'zoo'], ['A', 'B', 'C']], - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], - names=[None, 'bar']) - expected = DataFrame(data=data, index=index, - columns=columns, dtype='object') + result = pd.pivot(df, index="foo", columns="bar", values=values) + + data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]] + index = Index(data=["one", "two"], name="foo") + columns = MultiIndex( + levels=[["baz", "zoo"], ["A", "B", "C"]], + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + names=[None, "bar"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('values', [ - ['bar', 'baz'], np.array(['bar', 'baz']), - pd.Series(['bar', 'baz']), pd.Index(['bar', 'baz']) - ]) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize( + "values", + [ + ["bar", "baz"], + np.array(["bar", "baz"]), + pd.Series(["bar", "baz"]), + pd.Index(["bar", "baz"]), + ], + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_list_like_values_nans(self, values, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = 
pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) if method: - result = df.pivot(index='zoo', columns='foo', values=values) + result = df.pivot(index="zoo", columns="foo", values=values) else: - result = pd.pivot(df, index='zoo', columns='foo', values=values) - - data = [[np.nan, 'A', np.nan, 4], - [np.nan, 'C', np.nan, 6], - [np.nan, 'B', np.nan, 5], - ['A', np.nan, 1, np.nan], - ['B', np.nan, 2, np.nan], - ['C', np.nan, 3, np.nan]] - index = Index(data=['q', 't', 'w', 'x', 'y', 'z'], name='zoo') - columns = MultiIndex(levels=[['bar', 'baz'], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - names=[None, 'foo']) - expected = DataFrame(data=data, index=index, - columns=columns, dtype='object') + result = pd.pivot(df, index="zoo", columns="foo", values=values) + + data = [ + [np.nan, "A", np.nan, 4], + [np.nan, "C", np.nan, 6], + [np.nan, "B", np.nan, 5], + ["A", np.nan, 1, np.nan], + ["B", np.nan, 2, np.nan], + ["C", np.nan, 3, np.nan], + ] + index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo") + columns = MultiIndex( + levels=[["bar", "baz"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=[None, "foo"], + ) + expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(reason='MultiIndexed unstack with tuple names fails' - 'with KeyError GH#19966') - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.xfail( + reason="MultiIndexed unstack with tuple names fails" "with KeyError GH#19966" + ) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_multiindex(self, method): # issue #17160 index = Index(data=[0, 1, 2, 3, 4, 5]) - data = [['one', 'A', 1, 'x'], - ['one', 'B', 2, 'y'], - ['one', 'C', 3, 'z'], - ['two', 'A', 4, 'q'], - ['two', 'B', 5, 'w'], - ['two', 'C', 6, 't']] - columns = MultiIndex(levels=[['bar', 'baz'], ['first', 'second']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - df = DataFrame(data=data, index=index, columns=columns, dtype='object') + data = [ + ["one", "A", 1, "x"], + ["one", "B", 2, "y"], + ["one", "C", 3, "z"], + ["two", "A", 4, "q"], + ["two", "B", 5, "w"], + ["two", "C", 6, "t"], + ] + columns = MultiIndex( + levels=[["bar", "baz"], ["first", "second"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + ) + df = DataFrame(data=data, index=index, columns=columns, dtype="object") if method: - result = df.pivot(index=('bar', 'first'), - columns=('bar', 'second'), - values=('baz', 'first')) + result = df.pivot( + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) else: - result = pd.pivot(df, - index=('bar', 'first'), - columns=('bar', 'second'), - values=('baz', 'first')) - - data = {'A': Series([1, 4], index=['one', 'two']), - 'B': Series([2, 5], index=['one', 'two']), - 'C': Series([3, 6], index=['one', 'two'])} + result = pd.pivot( + df, + index=("bar", "first"), + columns=("bar", "second"), + values=("baz", "first"), + ) + + data = { + "A": Series([1, 4], index=["one", "two"]), + "B": Series([2, 5], index=["one", "two"]), + "C": Series([3, 6], index=["one", "two"]), + } expected = DataFrame(data) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('method', [True, False]) + @pytest.mark.parametrize("method", [True, False]) def test_pivot_with_tuple_of_values(self, method): # issue #17160 - df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 
'two'], - 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - 'baz': [1, 2, 3, 4, 5, 6], - 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) + df = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"): # tuple is seen as a single column name if method: - df.pivot(index='zoo', columns='foo', values=('bar', 'baz')) + df.pivot(index="zoo", columns="foo", values=("bar", "baz")) else: - pd.pivot(df, index='zoo', columns='foo', values=('bar', 'baz')) + pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz")) def test_margins(self): - def _check_output(result, values_col, index=['A', 'B'], - columns=['C'], - margins_col='All'): + def _check_output( + result, values_col, index=["A", "B"], columns=["C"], margins_col="All" + ): col_margins = result.loc[result.index[:-1], margins_col] expected_col_margins = self.data.groupby(index)[values_col].mean() - tm.assert_series_equal(col_margins, expected_col_margins, - check_names=False) + tm.assert_series_equal(col_margins, expected_col_margins, check_names=False) assert col_margins.name == margins_col result = result.sort_index() - index_margins = result.loc[(margins_col, '')].iloc[:-1] + index_margins = result.loc[(margins_col, "")].iloc[:-1] expected_ix_margins = self.data.groupby(columns)[values_col].mean() - tm.assert_series_equal(index_margins, expected_ix_margins, - check_names=False) - assert index_margins.name == (margins_col, '') + tm.assert_series_equal( + index_margins, expected_ix_margins, check_names=False + ) + assert index_margins.name == (margins_col, "") - grand_total_margins = result.loc[(margins_col, ''), margins_col] + grand_total_margins = result.loc[(margins_col, ""), margins_col] expected_total_margins = self.data[values_col].mean() assert grand_total_margins == expected_total_margins # column specified - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', - margins=True, aggfunc=np.mean) - _check_output(result, 'D') + result = self.data.pivot_table( + values="D", index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) + _check_output(result, "D") # Set a different margins_name (not 'All') - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', - margins=True, aggfunc=np.mean, - margins_name='Totals') - _check_output(result, 'D', margins_col='Totals') + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.mean, + margins_name="Totals", + ) + _check_output(result, "D", margins_col="Totals") # no column specified - table = self.data.pivot_table(index=['A', 'B'], columns='C', - margins=True, aggfunc=np.mean) + table = self.data.pivot_table( + index=["A", "B"], columns="C", margins=True, aggfunc=np.mean + ) for value_col in table.columns.levels[0]: _check_output(table[value_col], value_col) @@ -657,55 +840,63 @@ def _check_output(result, values_col, index=['A', 'B'], # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=['AA', 'BB'], margins=True, - aggfunc=np.mean) + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) for value_col in table.columns: - totals = table.loc[('All', ''), value_col] + totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() # no rows - rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True, 
- aggfunc=np.mean) + rtable = self.data.pivot_table( + columns=["AA", "BB"], margins=True, aggfunc=np.mean + ) assert isinstance(rtable, Series) - table = self.data.pivot_table(index=['AA', 'BB'], margins=True, - aggfunc='mean') - for item in ['DD', 'EE', 'FF']: - totals = table.loc[('All', ''), item] + table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + for item in ["DD", "EE", "FF"]: + totals = table.loc[("All", ""), item] assert totals == self.data[item].mean() def test_margins_dtype(self): # GH 17013 df = self.data.copy() - df[['D', 'E', 'F']] = np.arange(len(df) * 3).reshape(len(df), 3) + df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3) - mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')] - mi = MultiIndex.from_tuples(mi_val, names=('A', 'B')) - expected = DataFrame({'dull': [12, 21, 3, 9, 45], - 'shiny': [33, 0, 36, 51, 120]}, - index=mi).rename_axis('C', axis=1) - expected['All'] = expected['dull'] + expected['shiny'] + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] - result = df.pivot_table(values='D', index=['A', 'B'], - columns='C', margins=True, - aggfunc=np.sum, fill_value=0) + result = df.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=np.sum, + fill_value=0, + ) tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(reason='GH#17035 (len of floats is casted back to ' - 'floats)') + @pytest.mark.xfail(reason="GH#17035 (len of floats is casted back to " "floats)") def test_margins_dtype_len(self): - mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')] - mi = MultiIndex.from_tuples(mi_val, names=('A', 'B')) - expected = DataFrame({'dull': [1, 1, 2, 1, 5], - 'shiny': [2, 0, 2, 2, 6]}, - index=mi).rename_axis('C', axis=1) - expected['All'] = expected['dull'] + expected['shiny'] + mi_val = list(product(["bar", "foo"], ["one", "two"])) + [("All", "")] + mi = MultiIndex.from_tuples(mi_val, names=("A", "B")) + expected = DataFrame( + {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi + ).rename_axis("C", axis=1) + expected["All"] = expected["dull"] + expected["shiny"] - result = self.data.pivot_table(values='D', index=['A', 'B'], - columns='C', margins=True, - aggfunc=len, fill_value=0) + result = self.data.pivot_table( + values="D", + index=["A", "B"], + columns="C", + margins=True, + aggfunc=len, + fill_value=0, + ) tm.assert_frame_equal(expected, result) @@ -713,407 +904,599 @@ def test_pivot_integer_columns(self): # caused by upstream bug in unstack d = date.min - data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], - [d + timedelta(i) - for i in range(20)], [1.0])) + data = list( + product( + ["foo", "bar"], + ["A", "B", "C"], + ["x1", "x2"], + [d + timedelta(i) for i in range(20)], + [1.0], + ) + ) df = DataFrame(data) table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) - table2 = df2.pivot_table( - values='4', index=['0', '1', '3'], columns=['2']) + table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"]) tm.assert_frame_equal(table, table2, check_names=False) def test_pivot_no_level_overlap(self): # GH #1181 - data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, - 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 
2, - 'c': (['foo'] * 4 + ['bar'] * 4) * 2, - 'value': np.random.randn(16)}) + data = DataFrame( + { + "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2, + "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2, + "c": (["foo"] * 4 + ["bar"] * 4) * 2, + "value": np.random.randn(16), + } + ) - table = data.pivot_table('value', index='a', columns=['b', 'c']) + table = data.pivot_table("value", index="a", columns=["b", "c"]) - grouped = data.groupby(['a', 'b', 'c'])['value'].mean() - expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') + grouped = data.groupby(["a", "b", "c"])["value"].mean() + expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all") tm.assert_frame_equal(table, expected) def test_pivot_columns_lexsorted(self): n = 10000 - dtype = np.dtype([ - ("Index", object), - ("Symbol", object), - ("Year", int), - ("Month", int), - ("Day", int), - ("Quantity", int), - ("Price", float), - ]) - - products = np.array([ - ('SP500', 'ADBE'), - ('SP500', 'NVDA'), - ('SP500', 'ORCL'), - ('NDQ100', 'AAPL'), - ('NDQ100', 'MSFT'), - ('NDQ100', 'GOOG'), - ('FTSE', 'DGE.L'), - ('FTSE', 'TSCO.L'), - ('FTSE', 'GSK.L'), - ], dtype=[('Index', object), ('Symbol', object)]) + dtype = np.dtype( + [ + ("Index", object), + ("Symbol", object), + ("Year", int), + ("Month", int), + ("Day", int), + ("Quantity", int), + ("Price", float), + ] + ) + + products = np.array( + [ + ("SP500", "ADBE"), + ("SP500", "NVDA"), + ("SP500", "ORCL"), + ("NDQ100", "AAPL"), + ("NDQ100", "MSFT"), + ("NDQ100", "GOOG"), + ("FTSE", "DGE.L"), + ("FTSE", "TSCO.L"), + ("FTSE", "GSK.L"), + ], + dtype=[("Index", object), ("Symbol", object)], + ) items = np.empty(n, dtype=dtype) iproduct = np.random.randint(0, len(products), n) - items['Index'] = products['Index'][iproduct] - items['Symbol'] = products['Symbol'][iproduct] - dr = pd.date_range(date(2000, 1, 1), - date(2010, 12, 31)) + items["Index"] = products["Index"][iproduct] + items["Symbol"] = products["Symbol"][iproduct] + dr = pd.date_range(date(2000, 1, 1), date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] - items['Year'] = dates.year - items['Month'] = dates.month - items['Day'] = dates.day - items['Price'] = np.random.lognormal(4.0, 2.0, n) + items["Year"] = dates.year + items["Month"] = dates.month + items["Day"] = dates.day + items["Price"] = np.random.lognormal(4.0, 2.0, n) df = DataFrame(items) - pivoted = df.pivot_table('Price', index=['Month', 'Day'], - columns=['Index', 'Symbol', 'Year'], - aggfunc='mean') + pivoted = df.pivot_table( + "Price", + index=["Month", "Day"], + columns=["Index", "Symbol", "Year"], + aggfunc="mean", + ) assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): - f = OrderedDict([('D', ['std']), ('E', ['sum'])]) - expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') - result = self.data.pivot_table(index='A', columns='B', aggfunc=f) + f = OrderedDict([("D", ["std"]), ("E", ["sum"])]) + expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") + result = self.data.pivot_table(index="A", columns="B", aggfunc=f) tm.assert_frame_equal(result, expected) def test_margins_no_values_no_cols(self): # Regression test on pivot table: no values or cols passed. 
- result = self.data[['A', 'B']].pivot_table( - index=['A', 'B'], aggfunc=len, margins=True) + result = self.data[["A", "B"]].pivot_table( + index=["A", "B"], aggfunc=len, margins=True + ) result_list = result.tolist() assert sum(result_list[:-1]) == result_list[-1] def test_margins_no_values_two_rows(self): # Regression test on pivot table: no values passed but rows are a # multi-index - result = self.data[['A', 'B', 'C']].pivot_table( - index=['A', 'B'], columns='C', aggfunc=len, margins=True) + result = self.data[["A", "B", "C"]].pivot_table( + index=["A", "B"], columns="C", aggfunc=len, margins=True + ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] def test_margins_no_values_one_row_one_col(self): # Regression test on pivot table: no values passed but row and col # defined - result = self.data[['A', 'B']].pivot_table( - index='A', columns='B', aggfunc=len, margins=True) + result = self.data[["A", "B"]].pivot_table( + index="A", columns="B", aggfunc=len, margins=True + ) assert result.All.tolist() == [4.0, 7.0, 11.0] def test_margins_no_values_two_row_two_cols(self): # Regression test on pivot table: no values passed but rows and cols # are multi-indexed - self.data['D'] = ['a', 'b', 'c', 'd', - 'e', 'f', 'g', 'h', 'i', 'j', 'k'] - result = self.data[['A', 'B', 'C', 'D']].pivot_table( - index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) + self.data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"] + result = self.data[["A", "B", "C", "D"]].pivot_table( + index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True + ) assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0] - @pytest.mark.parametrize( - 'margin_name', ['foo', 'one', 666, None, ['a', 'b']]) + @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]]) def test_pivot_table_with_margins_set_margin_name(self, margin_name): # see gh-3335 - msg = (r'Conflicting name "{}" in margins|' - "margins_name argument must be a string").format(margin_name) + msg = ( + r'Conflicting name "{}" in margins|' + "margins_name argument must be a string" + ).format(margin_name) with pytest.raises(ValueError, match=msg): # multi-index index - pivot_table(self.data, values='D', index=['A', 'B'], - columns=['C'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["A", "B"], + columns=["C"], + margins=True, + margins_name=margin_name, + ) with pytest.raises(ValueError, match=msg): # multi-index column - pivot_table(self.data, values='D', index=['C'], - columns=['A', 'B'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["C"], + columns=["A", "B"], + margins=True, + margins_name=margin_name, + ) with pytest.raises(ValueError, match=msg): # non-multi-index index/column - pivot_table(self.data, values='D', index=['A'], - columns=['B'], margins=True, - margins_name=margin_name) + pivot_table( + self.data, + values="D", + index=["A"], + columns=["B"], + margins=True, + margins_name=margin_name, + ) def test_pivot_timegrouper(self): - df = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [datetime(2013, 1, 1), - datetime(2013, 1, 1), - datetime(2013, 10, 1), - datetime(2013, 10, 2), - datetime(2013, 10, 1), - datetime(2013, 10, 2), - datetime(2013, 12, 2), - datetime(2013, 12, 2), ]}).set_index('Date') - - expected = DataFrame(np.array([10, 18, 3], dtype='int64') - .reshape(1, 3), - 
index=[datetime(2013, 12, 31)], - columns='Carl Joe Mark'.split()) - expected.index.name = 'Date' - expected.columns.name = 'Buyer' - - result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer', - values='Quantity', aggfunc=np.sum) + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 1, 1), + datetime(2013, 1, 1), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 10, 1), + datetime(2013, 10, 2), + datetime(2013, 12, 2), + datetime(2013, 12, 2), + ], + } + ).set_index("Date") + + expected = DataFrame( + np.array([10, 18, 3], dtype="int64").reshape(1, 3), + index=[datetime(2013, 12, 31)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" + + result = pivot_table( + df, + index=Grouper(freq="A"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="A"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) - expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]) - .reshape(2, 3), - index=[datetime(2013, 1, 1), - datetime(2013, 7, 1)], - columns='Carl Joe Mark'.split()) - expected.index.name = 'Date' - expected.columns.name = 'Buyer' + expected = DataFrame( + np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3), + index=[datetime(2013, 1, 1), datetime(2013, 7, 1)], + columns="Carl Joe Mark".split(), + ) + expected.index.name = "Date" + expected.columns.name = "Buyer" - result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer', - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="6MS"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) # passing the name df = df.reset_index() - result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), - columns='Buyer', - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="6MS", key="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', key='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) msg = "'The grouper name foo is not found'" with pytest.raises(KeyError, match=msg): - pivot_table(df, index=Grouper(freq='6MS', key='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index=Grouper(freq="6MS", key="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) with pytest.raises(KeyError, match=msg): - pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', key='foo'), - values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", key="foo"), + values="Quantity", + aggfunc=np.sum, + ) # passing the level - df 
= df.set_index('Date') - result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), - columns='Buyer', values='Quantity', - aggfunc=np.sum) + df = df.set_index("Date") + result = pivot_table( + df, + index=Grouper(freq="6MS", level="Date"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', level='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) msg = "The level foo is not valid" with pytest.raises(ValueError, match=msg): - pivot_table(df, index=Grouper(freq='6MS', level='foo'), - columns='Buyer', values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index=Grouper(freq="6MS", level="foo"), + columns="Buyer", + values="Quantity", + aggfunc=np.sum, + ) with pytest.raises(ValueError, match=msg): - pivot_table(df, index='Buyer', - columns=Grouper(freq='6MS', level='foo'), - values='Quantity', aggfunc=np.sum) + pivot_table( + df, + index="Buyer", + columns=Grouper(freq="6MS", level="foo"), + values="Quantity", + aggfunc=np.sum, + ) # double grouper - df = DataFrame({ - 'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5), - datetime(2013, 10, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 11, 1, 20, 0), - datetime(2013, 10, 2, 10, 0), - datetime(2013, 10, 2, 12, 0), - datetime(2013, 12, 5, 14, 0)], - 'PayDay': [datetime(2013, 10, 4, 0, 0), - datetime(2013, 10, 15, 13, 5), - datetime(2013, 9, 5, 20, 0), - datetime(2013, 11, 2, 10, 0), - datetime(2013, 10, 7, 20, 0), - datetime(2013, 9, 5, 10, 0), - datetime(2013, 12, 30, 12, 0), - datetime(2013, 11, 20, 14, 0), ]}) - - result = pivot_table(df, index=Grouper(freq='M', key='Date'), - columns=Grouper(freq='M', key='PayDay'), - values='Quantity', aggfunc=np.sum) - expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, - 6, np.nan, 1, 9, - np.nan, 9, np.nan, np.nan, np.nan, - np.nan, 3, np.nan]).reshape(4, 4), - index=[datetime(2013, 9, 30), - datetime(2013, 10, 31), - datetime(2013, 11, 30), - datetime(2013, 12, 31)], - columns=[datetime(2013, 9, 30), - datetime(2013, 10, 31), - datetime(2013, 11, 30), - datetime(2013, 12, 31)]) - expected.index.name = 'Date' - expected.columns.name = 'PayDay' + df = DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime(2013, 11, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 11, 1, 20, 0), + datetime(2013, 10, 2, 10, 0), + datetime(2013, 10, 2, 12, 0), + datetime(2013, 12, 5, 14, 0), + ], + "PayDay": [ + datetime(2013, 10, 4, 0, 0), + datetime(2013, 10, 15, 13, 5), + datetime(2013, 9, 5, 20, 0), + datetime(2013, 11, 2, 10, 0), + datetime(2013, 10, 7, 20, 0), + datetime(2013, 9, 5, 10, 0), + datetime(2013, 12, 30, 12, 0), + datetime(2013, 11, 20, 14, 0), + ], + } + ) + + result = pivot_table( + df, + index=Grouper(freq="M", key="Date"), + columns=Grouper(freq="M", key="PayDay"), + values="Quantity", + aggfunc=np.sum, + ) + expected = DataFrame( + np.array( + [ + np.nan, + 3, + np.nan, + np.nan, + 6, + np.nan, + 1, + 9, + np.nan, + 9, + np.nan, + np.nan, + 
np.nan, + np.nan, + 3, + np.nan, + ] + ).reshape(4, 4), + index=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + columns=[ + datetime(2013, 9, 30), + datetime(2013, 10, 31), + datetime(2013, 11, 30), + datetime(2013, 12, 31), + ], + ) + expected.index.name = "Date" + expected.columns.name = "PayDay" tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=Grouper(freq='M', key='PayDay'), - columns=Grouper(freq='M', key='Date'), - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=Grouper(freq="M", key="PayDay"), + columns=Grouper(freq="M", key="Date"), + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) - tuples = [(datetime(2013, 9, 30), datetime(2013, 10, 31)), - (datetime(2013, 10, 31), - datetime(2013, 9, 30)), - (datetime(2013, 10, 31), - datetime(2013, 11, 30)), - (datetime(2013, 10, 31), - datetime(2013, 12, 31)), - (datetime(2013, 11, 30), - datetime(2013, 10, 31)), - (datetime(2013, 12, 31), datetime(2013, 11, 30)), ] - idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay']) - expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan, - 9, np.nan, 9, np.nan, - np.nan, 3]).reshape(6, 2), - index=idx, columns=['A', 'B']) - expected.columns.name = 'Branch' + tuples = [ + (datetime(2013, 9, 30), datetime(2013, 10, 31)), + (datetime(2013, 10, 31), datetime(2013, 9, 30)), + (datetime(2013, 10, 31), datetime(2013, 11, 30)), + (datetime(2013, 10, 31), datetime(2013, 12, 31)), + (datetime(2013, 11, 30), datetime(2013, 10, 31)), + (datetime(2013, 12, 31), datetime(2013, 11, 30)), + ] + idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"]) + expected = DataFrame( + np.array( + [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3] + ).reshape(6, 2), + index=idx, + columns=["A", "B"], + ) + expected.columns.name = "Branch" result = pivot_table( - df, index=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], columns=['Branch'], - values='Quantity', aggfunc=np.sum) + df, + index=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + columns=["Branch"], + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=['Branch'], - columns=[Grouper(freq='M', key='Date'), - Grouper(freq='M', key='PayDay')], - values='Quantity', aggfunc=np.sum) + result = pivot_table( + df, + index=["Branch"], + columns=[Grouper(freq="M", key="Date"), Grouper(freq="M", key="PayDay")], + values="Quantity", + aggfunc=np.sum, + ) tm.assert_frame_equal(result, expected.T) def test_pivot_datetime_tz(self): - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', - '2013-02-01 15:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) - - exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], - tz='US/Pacific', name='dt1') - exp_col1 = Index(['value1', 'value1']) - exp_col2 = Index(['a', 'b'], name='label') + dates1 = [ + "2011-07-19 07:00:00", + 
"2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d, tz="US/Pacific")) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d, tz="Asia/Tokyo")) + + exp_idx = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Pacific", + name="dt1", + ) + exp_col1 = Index(["value1", "value1"]) + exp_col2 = Index(["a", "b"], name="label") exp_col = MultiIndex.from_arrays([exp_col1, exp_col2]) - expected = DataFrame([[0, 3], [1, 4], [2, 5]], - index=exp_idx, columns=exp_col) - result = pivot_table(df, index=['dt1'], columns=[ - 'label'], values=['value1']) + expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col) + result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"]) tm.assert_frame_equal(result, expected) - exp_col1 = Index(['sum', 'sum', 'sum', 'sum', - 'mean', 'mean', 'mean', 'mean']) - exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2) - exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', - '2013-02-01 15:00:00'] * 4, - tz='Asia/Tokyo', name='dt2') + exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"]) + exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2) + exp_col3 = pd.DatetimeIndex( + ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4, + tz="Asia/Tokyo", + name="dt2", + ) exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3]) - expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], - [1, 4, 2, 1, 1, 4, 2, 1], - [2, 5, 1, 2, 2, 5, 1, 2]], - dtype='int64'), - index=exp_idx, - columns=exp_col) - - result = pivot_table(df, index=['dt1'], columns=['dt2'], - values=['value1', 'value2'], - aggfunc=[np.sum, np.mean]) + expected = DataFrame( + np.array( + [ + [0, 3, 1, 2, 0, 3, 1, 2], + [1, 4, 2, 1, 1, 4, 2, 1], + [2, 5, 1, 2, 2, 5, 1, 2], + ], + dtype="int64", + ), + index=exp_idx, + columns=exp_col, + ) + + result = pivot_table( + df, + index=["dt1"], + columns=["dt2"], + values=["value1", "value2"], + aggfunc=[np.sum, np.mean], + ) tm.assert_frame_equal(result, expected) def test_pivot_dtaccessor(self): # GH 8103 - dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', - '2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00'] - dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', - '2013-01-01 15:00:00', - '2013-02-01 15:00:00', '2013-02-01 15:00:00', - '2013-02-01 15:00:00'] - df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'dt1': dates1, 'dt2': dates2, - 'value1': np.arange(6, dtype='int64'), - 'value2': [1, 2] * 3}) - df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d)) - df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d)) - - result = pivot_table(df, index='label', columns=df['dt1'].dt.hour, - values='value1') - - exp_idx = Index(['a', 'b'], name='label') - expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, - index=exp_idx, - columns=Index([7, 8, 9], name='dt1')) + dates1 = [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", 
+ ] + dates2 = [ + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-01-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + "2013-02-01 15:00:00", + ] + df = DataFrame( + { + "label": ["a", "a", "a", "b", "b", "b"], + "dt1": dates1, + "dt2": dates2, + "value1": np.arange(6, dtype="int64"), + "value2": [1, 2] * 3, + } + ) + df["dt1"] = df["dt1"].apply(lambda d: pd.Timestamp(d)) + df["dt2"] = df["dt2"].apply(lambda d: pd.Timestamp(d)) + + result = pivot_table( + df, index="label", columns=df["dt1"].dt.hour, values="value1" + ) + + exp_idx = Index(["a", "b"], name="label") + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=exp_idx, + columns=Index([7, 8, 9], name="dt1"), + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.month, - columns=df['dt1'].dt.hour, - values='value1') + result = pivot_table( + df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1" + ) - expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]}, - index=Index([1, 2], name='dt2'), - columns=Index([7, 8, 9], name='dt1')) + expected = DataFrame( + {7: [0, 3], 8: [1, 4], 9: [2, 5]}, + index=Index([1, 2], name="dt2"), + columns=Index([7, 8, 9], name="dt1"), + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=df['dt2'].dt.year.values, - columns=[df['dt1'].dt.hour, df['dt2'].dt.month], - values='value1') + result = pivot_table( + df, + index=df["dt2"].dt.year.values, + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) exp_col = MultiIndex.from_arrays( - [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2']) - expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'), - index=[2013], columns=exp_col) + [[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=["dt1", "dt2"] + ) + expected = DataFrame( + np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"), index=[2013], columns=exp_col + ) tm.assert_frame_equal(result, expected) - result = pivot_table(df, index=np.array(['X', 'X', 'X', - 'X', 'Y', 'Y']), - columns=[df['dt1'].dt.hour, df['dt2'].dt.month], - values='value1') - expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan], - [np.nan, np.nan, np.nan, - 4, np.nan, 5]]), - index=['X', 'Y'], columns=exp_col) + result = pivot_table( + df, + index=np.array(["X", "X", "X", "X", "Y", "Y"]), + columns=[df["dt1"].dt.hour, df["dt2"].dt.month], + values="value1", + ) + expected = DataFrame( + np.array( + [[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]] + ), + index=["X", "Y"], + columns=exp_col, + ) tm.assert_frame_equal(result, expected) def test_daily(self): - rng = date_range('1/1/2000', '12/31/2004', freq='D') + rng = date_range("1/1/2000", "12/31/2004", freq="D") ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_table(DataFrame(ts), index=ts.index.year, - columns=ts.index.dayofyear) + annual = pivot_table( + DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear + ) annual.columns = annual.columns.droplevel(0) doy = np.asarray(ts.index.dayofyear) @@ -1127,11 +1510,12 @@ def test_daily(self): assert result.name == i def test_monthly(self): - rng = date_range('1/1/2000', '12/31/2004', freq='M') + rng = date_range("1/1/2000", "12/31/2004", freq="M") ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_table(pd.DataFrame(ts), index=ts.index.year, - columns=ts.index.month) + annual = pivot_table( + pd.DataFrame(ts), index=ts.index.year, columns=ts.index.month + ) annual.columns = annual.columns.droplevel(0) month = ts.index.month @@ -1144,120 
+1528,140 @@ def test_monthly(self): def test_pivot_table_with_iterator_values(self): # GH 12017 - aggs = {'D': 'sum', 'E': 'mean'} + aggs = {"D": "sum", "E": "mean"} pivot_values_list = pd.pivot_table( - self.data, index=['A'], values=list(aggs.keys()), aggfunc=aggs, + self.data, index=["A"], values=list(aggs.keys()), aggfunc=aggs ) pivot_values_keys = pd.pivot_table( - self.data, index=['A'], values=aggs.keys(), aggfunc=aggs, + self.data, index=["A"], values=aggs.keys(), aggfunc=aggs ) tm.assert_frame_equal(pivot_values_keys, pivot_values_list) agg_values_gen = (value for value in aggs.keys()) pivot_values_gen = pd.pivot_table( - self.data, index=['A'], values=agg_values_gen, aggfunc=aggs, + self.data, index=["A"], values=agg_values_gen, aggfunc=aggs ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) def test_pivot_table_margins_name_with_aggfunc_list(self): # GH 13354 - margins_name = 'Weekly' + margins_name = "Weekly" costs = pd.DataFrame( - {'item': ['bacon', 'cheese', 'bacon', 'cheese'], - 'cost': [2.5, 4.5, 3.2, 3.3], - 'day': ['M', 'M', 'T', 'T']} + { + "item": ["bacon", "cheese", "bacon", "cheese"], + "cost": [2.5, 4.5, 3.2, 3.3], + "day": ["M", "M", "T", "T"], + } ) table = costs.pivot_table( - index="item", columns="day", margins=True, - margins_name=margins_name, aggfunc=[np.mean, max] - ) - ix = pd.Index( - ['bacon', 'cheese', margins_name], dtype='object', name='item' + index="item", + columns="day", + margins=True, + margins_name=margins_name, + aggfunc=[np.mean, max], ) - tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'), - ('mean', 'cost', margins_name), ('max', 'cost', 'M'), - ('max', 'cost', 'T'), ('max', 'cost', margins_name)] - cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day']) + ix = pd.Index(["bacon", "cheese", margins_name], dtype="object", name="item") + tups = [ + ("mean", "cost", "M"), + ("mean", "cost", "T"), + ("mean", "cost", margins_name), + ("max", "cost", "M"), + ("max", "cost", "T"), + ("max", "cost", margins_name), + ] + cols = pd.MultiIndex.from_tuples(tups, names=[None, None, "day"]) expected = pd.DataFrame(table.values, index=ix, columns=cols) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is casted back to ' - 'ints)') + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") def test_categorical_margins(self, observed): # GH 10989 - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") - table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - @pytest.mark.xfail(reason='GH#17035 (np.mean of ints is casted back to ' - 'ints)') + @pytest.mark.xfail(reason="GH#17035 (np.mean of ints is casted back to " "ints)") def test_categorical_margins_category(self, observed): - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) + df = pd.DataFrame( + {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2} + ) expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = 
Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') + expected.index = Index([0, 1, "All"], name="y") + expected.columns = Index([0, 1, "All"], name="z") - df.y = df.y.astype('category') - df.z = df.z.astype('category') - table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) + df.y = df.y.astype("category") + df.z = df.z.astype("category") + table = df.pivot_table("x", "y", "z", dropna=observed, margins=True) tm.assert_frame_equal(table, expected) def test_categorical_aggfunc(self, observed): # GH 9534 - df = pd.DataFrame({"C1": ["A", "B", "C", "C"], - "C2": ["a", "a", "b", "b"], - "V": [1, 2, 3, 4]}) + df = pd.DataFrame( + {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]} + ) df["C1"] = df["C1"].astype("category") - result = df.pivot_table("V", index="C1", columns="C2", - dropna=observed, aggfunc="count") - - expected_index = pd.CategoricalIndex(['A', 'B', 'C'], - categories=['A', 'B', 'C'], - ordered=False, - name='C1') - expected_columns = pd.Index(['a', 'b'], name='C2') - expected_data = np.array([[1., np.nan], - [1., np.nan], - [np.nan, 2.]]) - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_columns) + result = df.pivot_table( + "V", index="C1", columns="C2", dropna=observed, aggfunc="count" + ) + + expected_index = pd.CategoricalIndex( + ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" + ) + expected_columns = pd.Index(["a", "b"], name="C2") + expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) tm.assert_frame_equal(result, expected) def test_categorical_pivot_index_ordering(self, observed): # GH 8731 - df = pd.DataFrame({'Sales': [100, 120, 220], - 'Month': ['January', 'January', 'January'], - 'Year': [2013, 2014, 2013]}) - months = ['January', 'February', 'March', 'April', 'May', 'June', - 'July', 'August', 'September', 'October', 'November', - 'December'] - df['Month'] = df['Month'].astype('category').cat.set_categories(months) - result = df.pivot_table(values='Sales', - index='Month', - columns='Year', - dropna=observed, - aggfunc='sum') - expected_columns = pd.Int64Index([2013, 2014], name='Year') - expected_index = pd.CategoricalIndex(['January'], - categories=months, - ordered=False, - name='Month') - expected = pd.DataFrame([[320, 120]], - index=expected_index, - columns=expected_columns) + df = pd.DataFrame( + { + "Sales": [100, 120, 220], + "Month": ["January", "January", "January"], + "Year": [2013, 2014, 2013], + } + ) + months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + df["Month"] = df["Month"].astype("category").cat.set_categories(months) + result = df.pivot_table( + values="Sales", + index="Month", + columns="Year", + dropna=observed, + aggfunc="sum", + ) + expected_columns = pd.Int64Index([2013, 2014], name="Year") + expected_index = pd.CategoricalIndex( + ["January"], categories=months, ordered=False, name="Month" + ) + expected = pd.DataFrame( + [[320, 120]], index=expected_index, columns=expected_columns + ) if not observed: result = result.dropna().astype(np.int64) @@ -1268,111 +1672,142 @@ def test_pivot_table_not_series(self): # pivot_table always returns a DataFrame # when values is not list like and columns is None # and aggfunc is not instance of list - df = DataFrame({'col1': [3, 4, 5], - 'col2': ['C', 'D', 
'E'], - 'col3': [1, 3, 9]}) + df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]}) - result = df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum) - m = MultiIndex.from_arrays([[1, 3, 9], - ['C', 'D', 'E']], - names=['col3', 'col2']) - expected = DataFrame([3, 4, 5], - index=m, columns=['col1']) + result = df.pivot_table("col1", index=["col3", "col2"], aggfunc=np.sum) + m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"]) + expected = DataFrame([3, 4, 5], index=m, columns=["col1"]) tm.assert_frame_equal(result, expected) - result = df.pivot_table( - 'col1', index='col3', columns='col2', aggfunc=np.sum + result = df.pivot_table("col1", index="col3", columns="col2", aggfunc=np.sum) + expected = DataFrame( + [[3, np.NaN, np.NaN], [np.NaN, 4, np.NaN], [np.NaN, np.NaN, 5]], + index=Index([1, 3, 9], name="col3"), + columns=Index(["C", "D", "E"], name="col2"), ) - expected = DataFrame([[3, np.NaN, np.NaN], - [np.NaN, 4, np.NaN], - [np.NaN, np.NaN, 5]], - index=Index([1, 3, 9], name='col3'), - columns=Index(['C', 'D', 'E'], name='col2')) tm.assert_frame_equal(result, expected) - result = df.pivot_table('col1', index='col3', aggfunc=[np.sum]) - m = MultiIndex.from_arrays([['sum'], - ['col1']]) - expected = DataFrame([3, 4, 5], - index=Index([1, 3, 9], name='col3'), - columns=m) + result = df.pivot_table("col1", index="col3", aggfunc=[np.sum]) + m = MultiIndex.from_arrays([["sum"], ["col1"]]) + expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m) tm.assert_frame_equal(result, expected) def test_pivot_margins_name_unicode(self): # issue #13292 - greek = '\u0394\u03bf\u03ba\u03b9\u03bc\u03ae' - frame = pd.DataFrame({'foo': [1, 2, 3]}) - table = pd.pivot_table(frame, index=['foo'], aggfunc=len, margins=True, - margins_name=greek) - index = pd.Index([1, 2, 3, greek], dtype='object', name='foo') + greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae" + frame = pd.DataFrame({"foo": [1, 2, 3]}) + table = pd.pivot_table( + frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek + ) + index = pd.Index([1, 2, 3, greek], dtype="object", name="foo") expected = pd.DataFrame(index=index) tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 # for correctness purposes - data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', - 'bar', 'bar', 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', - 'one', 'two', 'two', 'two', 'one'], - 'C': range(11)}) - - result = pivot_table(data, index='A', columns='B', aggfunc='sum') - mi = MultiIndex(levels=[['C'], ['one', 'two']], - codes=[[0, 0], [0, 1]], names=[None, 'B']) - expected = DataFrame({('C', 'one'): {'bar': 15, 'foo': 13}, - ('C', 'two'): {'bar': 7, 'foo': 20}}, - columns=mi).rename_axis('A') + data = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": range(11), + } + ) + + result = pivot_table(data, index="A", columns="B", aggfunc="sum") + mi = MultiIndex( + levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"] + ) + expected = DataFrame( + {("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}}, + columns=mi, + ).rename_axis("A") tm.assert_frame_equal(result, expected) - result = pivot_table(data, index='A', columns='B', - aggfunc=['sum', 'mean']) - mi = 
MultiIndex(levels=[['sum', 'mean'], ['C'], ['one', 'two']], - codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], - names=[None, None, 'B']) - expected = DataFrame({('mean', 'C', 'one'): {'bar': 5.0, 'foo': 3.25}, - ('mean', 'C', 'two'): {'bar': 7.0, - 'foo': 6.666666666666667}, - ('sum', 'C', 'one'): {'bar': 15, 'foo': 13}, - ('sum', 'C', 'two'): {'bar': 7, 'foo': 20}}, - columns=mi).rename_axis('A') + result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"]) + mi = MultiIndex( + levels=[["sum", "mean"], ["C"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]], + names=[None, None, "B"], + ) + expected = DataFrame( + { + ("mean", "C", "one"): {"bar": 5.0, "foo": 3.25}, + ("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667}, + ("sum", "C", "one"): {"bar": 15, "foo": 13}, + ("sum", "C", "two"): {"bar": 7, "foo": 20}, + }, + columns=mi, + ).rename_axis("A") tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('f, f_numpy', - [('sum', np.sum), - ('mean', np.mean), - ('std', np.std), - (['sum', 'mean'], [np.sum, np.mean]), - (['sum', 'std'], [np.sum, np.std]), - (['std', 'mean'], [np.std, np.mean])]) + @pytest.mark.parametrize( + "f, f_numpy", + [ + ("sum", np.sum), + ("mean", np.mean), + ("std", np.std), + (["sum", "mean"], [np.sum, np.mean]), + (["sum", "std"], [np.sum, np.std]), + (["std", "mean"], [np.std, np.mean]), + ], + ) def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index='A', columns='B', aggfunc=f) - expected = pivot_table(self.data, index='A', columns='B', - aggfunc=f_numpy) + result = pivot_table(self.data, index="A", columns="B", aggfunc=f) + expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) tm.assert_frame_equal(result, expected) @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': 0}) + df = DataFrame( + {"ind1": np.arange(2 ** 16), "ind2": np.arange(2 ** 16), "count": 0} + ) msg = "Unstacked DataFrame is too big, causing int32 overflow" with pytest.raises(ValueError, match=msg): - df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + df.pivot_table( + index="ind1", columns="ind2", values="count", aggfunc="count" + ) def test_pivot_table_aggfunc_dropna(self, dropna): # GH 22159 - df = pd.DataFrame({'fruit': ['apple', 'peach', 'apple'], - 'size': [1, 1, 2], - 'taste': [7, 6, 6]}) + df = pd.DataFrame( + { + "fruit": ["apple", "peach", "apple"], + "size": [1, 1, 2], + "taste": [7, 6, 6], + } + ) def ret_one(x): return 1 @@ -1383,77 +1818,108 @@ def ret_sum(x): def ret_none(x): return np.nan - result = pd.pivot_table(df, columns='fruit', - aggfunc=[ret_sum, ret_none, ret_one], - dropna=dropna) + result = pd.pivot_table( + df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna + ) data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]] - col = pd.MultiIndex.from_product([['ret_sum', 'ret_none', 'ret_one'], - ['apple', 'peach']], - names=[None, 'fruit']) - expected = pd.DataFrame(data, index=['size', 'taste'], columns=col) + col = pd.MultiIndex.from_product( + [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]], + names=[None, "fruit"], + ) + expected = pd.DataFrame(data, index=["size", "taste"], columns=col) if dropna: - expected = expected.dropna(axis='columns') + expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, 
expected) def test_pivot_table_aggfunc_scalar_dropna(self, dropna): # GH 22159 - df = pd.DataFrame({'A': ['one', 'two', 'one'], - 'x': [3, np.nan, 2], - 'y': [1, np.nan, np.nan]}) + df = pd.DataFrame( + {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]} + ) - result = pd.pivot_table(df, columns='A', - aggfunc=np.mean, - dropna=dropna) + result = pd.pivot_table(df, columns="A", aggfunc=np.mean, dropna=dropna) data = [[2.5, np.nan], [1, np.nan]] - col = pd.Index(['one', 'two'], name='A') - expected = pd.DataFrame(data, index=['x', 'y'], columns=col) + col = pd.Index(["one", "two"], name="A") + expected = pd.DataFrame(data, index=["x", "y"], columns=col) if dropna: - expected = expected.dropna(axis='columns') + expected = expected.dropna(axis="columns") tm.assert_frame_equal(result, expected) class TestCrosstab: - def setup_method(self, method): - df = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', - 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', - 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', - 'dull', 'shiny', 'shiny', 'dull', - 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) + df = DataFrame( + { + "A": [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + "B": [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + "C": [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + "D": np.random.randn(11), + "E": np.random.randn(11), + "F": np.random.randn(11), + } + ) self.df = df.append(df, ignore_index=True) def test_crosstab_single(self): df = self.df - result = crosstab(df['A'], df['C']) - expected = df.groupby(['A', 'C']).size().unstack() + result = crosstab(df["A"], df["C"]) + expected = df.groupby(["A", "C"]).size().unstack() tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) def test_crosstab_multiple(self): df = self.df - result = crosstab(df['A'], [df['B'], df['C']]) - expected = df.groupby(['A', 'B', 'C']).size() - expected = expected.unstack( - 'B').unstack('C').fillna(0).astype(np.int64) + result = crosstab(df["A"], [df["B"], df["C"]]) + expected = df.groupby(["A", "B", "C"]).size() + expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64) tm.assert_frame_equal(result, expected) - result = crosstab([df['B'], df['C']], df['A']) - expected = df.groupby(['B', 'C', 'A']).size() - expected = expected.unstack('A').fillna(0).astype(np.int64) + result = crosstab([df["B"], df["C"]], df["A"]) + expected = df.groupby(["B", "C", "A"]).size() + expected = expected.unstack("A").fillna(0).astype(np.int64) tm.assert_frame_equal(result, expected) def test_crosstab_ndarray(self): @@ -1461,30 +1927,32 @@ def test_crosstab_ndarray(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 10, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) - expected = crosstab(df['a'], [df['b'], df['c']]) + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c")) + expected = crosstab(df["a"], [df["b"], df["c"]]) tm.assert_frame_equal(result, expected) - result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) - expected = crosstab([df['b'], df['c']], df['a']) + result = crosstab([b, c], a, 
colnames=["a"], rownames=("b", "c")) + expected = crosstab([df["b"], df["c"]], df["a"]) tm.assert_frame_equal(result, expected) # assign arbitrary names - result = crosstab(self.df['A'].values, self.df['C'].values) - assert result.index.name == 'row_0' - assert result.columns.name == 'col_0' + result = crosstab(self.df["A"].values, self.df["C"].values) + assert result.index.name == "row_0" + assert result.columns.name == "col_0" def test_crosstab_non_aligned(self): # GH 17005 - a = pd.Series([0, 1, 1], index=['a', 'b', 'c']) - b = pd.Series([3, 4, 3, 4, 3], index=['a', 'b', 'c', 'd', 'f']) + a = pd.Series([0, 1, 1], index=["a", "b", "c"]) + b = pd.Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"]) c = np.array([3, 4, 3]) - expected = pd.DataFrame([[1, 0], [1, 1]], - index=Index([0, 1], name='row_0'), - columns=Index([3, 4], name='col_0')) + expected = pd.DataFrame( + [[1, 0], [1, 1]], + index=Index([0, 1], name="row_0"), + columns=Index([3, 4], name="col_0"), + ) result = crosstab(a, b) tm.assert_frame_equal(result, expected) @@ -1497,27 +1965,26 @@ def test_crosstab_margins(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 5, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True) + result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True) - assert result.index.names == ('a',) - assert result.columns.names == ['b', 'c'] + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] - all_cols = result['All', ''] - exp_cols = df.groupby(['a']).size().astype('i8') + all_cols = result["All", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") # to keep index.name - exp_margin = Series([len(df)], index=Index(['All'], name='a')) + exp_margin = Series([len(df)], index=Index(["All"], name="a")) exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ('All', '') + exp_cols.name = ("All", "") tm.assert_series_equal(all_cols, exp_cols) - all_rows = result.loc['All'] - exp_rows = df.groupby(['b', 'c']).size().astype('i8') - exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) - exp_rows.name = 'All' + all_rows = result.loc["All"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("All", "")])) + exp_rows.name = "All" exp_rows = exp_rows.reindex(all_rows.index) exp_rows = exp_rows.fillna(0).astype(np.int64) @@ -1529,37 +1996,49 @@ def test_crosstab_margins_set_margin_name(self): b = np.random.randint(0, 3, size=100) c = np.random.randint(0, 5, size=100) - df = DataFrame({'a': a, 'b': b, 'c': c}) + df = DataFrame({"a": a, "b": b, "c": c}) - result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True, margins_name='TOTAL') + result = crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name="TOTAL", + ) - assert result.index.names == ('a',) - assert result.columns.names == ['b', 'c'] + assert result.index.names == ("a",) + assert result.columns.names == ["b", "c"] - all_cols = result['TOTAL', ''] - exp_cols = df.groupby(['a']).size().astype('i8') + all_cols = result["TOTAL", ""] + exp_cols = df.groupby(["a"]).size().astype("i8") # to keep index.name - exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a')) + exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a")) exp_cols = exp_cols.append(exp_margin) - exp_cols.name = ('TOTAL', '') + exp_cols.name = ("TOTAL", 
"") tm.assert_series_equal(all_cols, exp_cols) - all_rows = result.loc['TOTAL'] - exp_rows = df.groupby(['b', 'c']).size().astype('i8') - exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')])) - exp_rows.name = 'TOTAL' + all_rows = result.loc["TOTAL"] + exp_rows = df.groupby(["b", "c"]).size().astype("i8") + exp_rows = exp_rows.append(Series([len(df)], index=[("TOTAL", "")])) + exp_rows.name = "TOTAL" exp_rows = exp_rows.reindex(all_rows.index) exp_rows = exp_rows.fillna(0).astype(np.int64) tm.assert_series_equal(all_rows, exp_rows) msg = "margins_name argument must be a string" - for margins_name in [666, None, ['a', 'b']]: + for margins_name in [666, None, ["a", "b"]]: with pytest.raises(ValueError, match=msg): - crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), - margins=True, margins_name=margins_name) + crosstab( + a, + [b, c], + rownames=["a"], + colnames=("b", "c"), + margins=True, + margins_name=margins_name, + ) def test_crosstab_pass_values(self): a = np.random.randint(0, 7, size=100) @@ -1567,28 +2046,29 @@ def test_crosstab_pass_values(self): c = np.random.randint(0, 5, size=100) values = np.random.randn(100) - table = crosstab([a, b], c, values, aggfunc=np.sum, - rownames=['foo', 'bar'], colnames=['baz']) + table = crosstab( + [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"] + ) - df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values}) + df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values}) - expected = df.pivot_table('values', index=['foo', 'bar'], - columns='baz', aggfunc=np.sum) + expected = df.pivot_table( + "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum + ) tm.assert_frame_equal(table, expected) def test_crosstab_dropna(self): # GH 3820 - a = np.array(['foo', 'foo', 'foo', 'bar', - 'bar', 'foo', 'foo'], dtype=object) - b = np.array(['one', 'one', 'two', 'one', - 'two', 'two', 'two'], dtype=object) - c = np.array(['dull', 'dull', 'dull', 'dull', - 'dull', 'shiny', 'shiny'], dtype=object) - res = pd.crosstab(a, [b, c], rownames=['a'], - colnames=['b', 'c'], dropna=False) - m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), - ('two', 'dull'), ('two', 'shiny')], - names=['b', 'c']) + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + res = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False) + m = MultiIndex.from_tuples( + [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")], + names=["b", "c"], + ) tm.assert_index_equal(res.columns, m) def test_crosstab_no_overlap(self): @@ -1607,302 +2087,319 @@ def test_margin_dropna(self): # pivot_table counts null into margin ('All') # when margins=true and dropna=true - df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], - 'b': [3, 3, 4, 4, 4, 4]}) + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], - 'b': [3, np.nan, 4, 4, 4, 4]}) + df = DataFrame( + {"a": 
[1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3.0, 4.0, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2], - 'b': [3, 3, 4, 4, 4, 4]}) + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=True) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) # GH 12642 # _add_margins raises KeyError: Level None not found # when margins=True and dropna=False - df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan], - 'b': [3, 3, 4, 4, 4, 4]}) + df = pd.DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]}) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3, 4, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3, 4, "All"], name="b") tm.assert_frame_equal(actual, expected) - df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan], - 'b': [3, np.nan, 4, 4, 4, 4]}) + df = DataFrame( + {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]} + ) actual = pd.crosstab(df.a, df.b, margins=True, dropna=False) expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]]) - expected.index = Index([1.0, 2.0, 'All'], name='a') - expected.columns = Index([3.0, 4.0, 'All'], name='b') + expected.index = Index([1.0, 2.0, "All"], name="a") + expected.columns = Index([3.0, 4.0, "All"], name="b") tm.assert_frame_equal(actual, expected) - a = np.array(['foo', 'foo', 'foo', 'bar', - 'bar', 'foo', 'foo'], dtype=object) - b = np.array(['one', 'one', 'two', 'one', - 'two', np.nan, 'two'], dtype=object) - c = np.array(['dull', 'dull', 'dull', 'dull', - 'dull', 'shiny', 'shiny'], dtype=object) - - actual = pd.crosstab(a, [b, c], rownames=['a'], - colnames=['b', 'c'], margins=True, dropna=False) - m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'], - ['dull', 'shiny', 'dull', 'shiny', '']], - names=['b', 'c']) - expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], - [3, 0, 2, 1, 7]], columns=m) - expected.index = Index(['bar', 'foo', 'All'], name='a') + a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object) + b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object) + c = np.array( + ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object + ) + + actual = pd.crosstab( + a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [ + ["one", "one", "two", "two", "All"], + ["dull", "shiny", "dull", "shiny", ""], + ], + names=["b", "c"], + ) + expected = DataFrame( + [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m + ) + expected.index = Index(["bar", "foo", "All"], name="a") tm.assert_frame_equal(actual, expected) - 
actual = pd.crosstab([a, b], c, rownames=['a', 'b'], - colnames=['c'], margins=True, dropna=False) - m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'], - ['one', 'two', 'one', 'two', '']], - names=['a', 'b']) - expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], - [5, 2, 7]], index=m) - expected.columns = Index(['dull', 'shiny', 'All'], name='c') + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) - actual = pd.crosstab([a, b], c, rownames=['a', 'b'], - colnames=['c'], margins=True, dropna=True) - m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'], - ['one', 'two', 'one', 'two', '']], - names=['a', 'b']) - expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], - [5, 1, 6]], index=m) - expected.columns = Index(['dull', 'shiny', 'All'], name='c') + actual = pd.crosstab( + [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True + ) + m = MultiIndex.from_arrays( + [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]], + names=["a", "b"], + ) + expected = DataFrame( + [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m + ) + expected.columns = Index(["dull", "shiny", "All"], name="c") tm.assert_frame_equal(actual, expected) def test_crosstab_normalize(self): # Issue 12578 - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [1, 1, np.nan, 1, 1]}) - - rindex = pd.Index([1, 2], name='a') - cindex = pd.Index([3, 4], name='b') - full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], - index=rindex, columns=cindex) - row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]], - index=rindex, columns=cindex) - col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], - index=rindex, columns=cindex) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) + + rindex = pd.Index([1, 2], name="a") + cindex = pd.Index([3, 4], name="b") + full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex) + row_normal = pd.DataFrame( + [[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex + ) + col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex) # Check all normalize args - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'), - full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), - full_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), - row_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), - col_normal) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), - pd.crosstab(df.a, df.b, normalize='columns')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), - pd.crosstab(df.a, df.b, normalize='index')) - - row_normal_margins = pd.DataFrame([[1.0, 0], - [0.25, 0.75], - [0.4, 0.6]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4], name='b', - dtype='object')) - col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]], - index=pd.Index([1, 2], name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - - all_normal_margins = pd.DataFrame([[0.2, 0, 0.2], - [0.2, 0.6, 
0.8], - [0.4, 0.6, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', - margins=True), row_normal_margins) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', - margins=True), - col_normal_margins) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, - margins=True), all_normal_margins) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="all"), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="index"), row_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize="columns"), col_normal) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=1), + pd.crosstab(df.a, df.b, normalize="columns"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=0), + pd.crosstab(df.a, df.b, normalize="index"), + ) + + row_normal_margins = pd.DataFrame( + [[1.0, 0], [0.25, 0.75], [0.4, 0.6]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4], name="b", dtype="object"), + ) + col_normal_margins = pd.DataFrame( + [[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=pd.Index([1, 2], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + + all_normal_margins = pd.DataFrame( + [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize="columns", margins=True), + col_normal_margins, + ) + tm.assert_frame_equal( + pd.crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins + ) # Test arrays - pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], - np.array([1, 2, 1, 2])) + pd.crosstab( + [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2]) + ) # Test with aggfunc - norm_counts = pd.DataFrame([[0.25, 0, 0.25], - [0.25, 0.5, 0.75], - [0.5, 0.5, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b')) - test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count', - normalize='all', - margins=True) + norm_counts = pd.DataFrame( + [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_counts) - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [0, 4, np.nan, 3, 3]}) - - norm_sum = pd.DataFrame([[0, 0, 0.], - [0.4, 0.6, 1], - [0.4, 0.6, 1]], - index=pd.Index([1, 2, 'All'], - name='a', - dtype='object'), - columns=pd.Index([3, 4, 'All'], - name='b', - dtype='object')) - test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum, - normalize='all', - margins=True) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]} + ) + + norm_sum = pd.DataFrame( + [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]], + index=pd.Index([1, 2, "All"], name="a", dtype="object"), + columns=pd.Index([3, 4, "All"], name="b", dtype="object"), + ) + test_case = pd.crosstab( + df.a, df.b, df.c, aggfunc=np.sum, 
normalize="all", margins=True + ) tm.assert_frame_equal(test_case, norm_sum) def test_crosstab_with_empties(self): # Check handling of empties - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) - - empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], - index=pd.Index([1, 2], - name='a', - dtype='int64'), - columns=pd.Index([3, 4], name='b')) - - for i in [True, 'index', 'columns']: - calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', - normalize=i) + df = pd.DataFrame( + { + "a": [1, 2, 2, 2, 2], + "b": [3, 3, 4, 4, 4], + "c": [np.nan, np.nan, np.nan, np.nan, np.nan], + } + ) + + empty = pd.DataFrame( + [[0.0, 0.0], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) + + for i in [True, "index", "columns"]: + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=i + ) tm.assert_frame_equal(empty, calculated) - nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], - index=pd.Index([1, 2], - name='a', - dtype='int64'), - columns=pd.Index([3, 4], name='b')) + nans = pd.DataFrame( + [[0.0, np.nan], [0.0, 0.0]], + index=pd.Index([1, 2], name="a", dtype="int64"), + columns=pd.Index([3, 4], name="b"), + ) - calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', - normalize=False) + calculated = pd.crosstab( + df.a, df.b, values=df.c, aggfunc="count", normalize=False + ) tm.assert_frame_equal(nans, calculated) def test_crosstab_errors(self): # Issue 12578 - df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - 'c': [1, 1, np.nan, 1, 1]}) + df = pd.DataFrame( + {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]} + ) - error = 'values cannot be used without an aggfunc.' + error = "values cannot be used without an aggfunc." 
with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, values=df.c) - error = 'aggfunc cannot be used without values' + error = "aggfunc cannot be used without values" with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, aggfunc=np.mean) - error = 'Not a valid normalize argument' + error = "Not a valid normalize argument" with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize='42') + pd.crosstab(df.a, df.b, normalize="42") with pytest.raises(ValueError, match=error): pd.crosstab(df.a, df.b, normalize=42) - error = 'Not a valid margins argument' + error = "Not a valid margins argument" with pytest.raises(ValueError, match=error): - pd.crosstab(df.a, df.b, normalize='all', margins=42) + pd.crosstab(df.a, df.b, normalize="all", margins=42) def test_crosstab_with_categorial_columns(self): # GH 8860 - df = pd.DataFrame({'MAKE': ['Honda', 'Acura', 'Tesla', - 'Honda', 'Honda', 'Acura'], - 'MODEL': ['Sedan', 'Sedan', 'Electric', - 'Pickup', 'Sedan', 'Sedan']}) - categories = ['Sedan', 'Electric', 'Pickup'] - df['MODEL'] = (df['MODEL'].astype('category') - .cat.set_categories(categories)) - result = pd.crosstab(df['MAKE'], df['MODEL']) - - expected_index = pd.Index(['Acura', 'Honda', 'Tesla'], name='MAKE') - expected_columns = pd.CategoricalIndex(categories, - categories=categories, - ordered=False, - name='MODEL') + df = pd.DataFrame( + { + "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"], + "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"], + } + ) + categories = ["Sedan", "Electric", "Pickup"] + df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories) + result = pd.crosstab(df["MAKE"], df["MODEL"]) + + expected_index = pd.Index(["Acura", "Honda", "Tesla"], name="MAKE") + expected_columns = pd.CategoricalIndex( + categories, categories=categories, ordered=False, name="MODEL" + ) expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]] - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_columns) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_columns + ) tm.assert_frame_equal(result, expected) def test_crosstab_with_numpy_size(self): # GH 4003 - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, - 'B': ['A', 'B', 'C'] * 8, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D': np.random.randn(24), - 'E': np.random.randn(24)}) - result = pd.crosstab(index=[df['A'], df['B']], - columns=[df['C']], - margins=True, - aggfunc=np.size, - values=df['D']) - expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'], - ['', 'A', 'B', 'C']], - codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], - [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], - names=['A', 'B']) - expected_column = pd.Index(['bar', 'foo', 'All'], - dtype='object', - name='C') - expected_data = np.array([[2., 2., 4.], - [2., 2., 4.], - [2., 2., 4.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [2., np.nan, 2.], - [np.nan, 2., 2.], - [12., 12., 24.]]) - expected = pd.DataFrame(expected_data, - index=expected_index, - columns=expected_column) + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + } + ) + result = pd.crosstab( + index=[df["A"], df["B"]], + columns=[df["C"]], + margins=True, + aggfunc=np.size, + values=df["D"], + ) + expected_index = pd.MultiIndex( + levels=[["All", "one", 
"three", "two"], ["", "A", "B", "C"]], + codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]], + names=["A", "B"], + ) + expected_column = pd.Index(["bar", "foo", "All"], dtype="object", name="C") + expected_data = np.array( + [ + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, 2.0, 4.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [2.0, np.nan, 2.0], + [np.nan, 2.0, 2.0], + [12.0, 12.0, 24.0], + ] + ) + expected = pd.DataFrame( + expected_data, index=expected_index, columns=expected_column + ) tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): # GH 13279 - s = pd.Series(range(3), name='foo') + s = pd.Series(range(3), name="foo") result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name='foo') - expected = pd.DataFrame(np.eye(3, dtype=np.int64), - index=expected_index, - columns=expected_index) + expected_index = pd.Index(range(3), name="foo") + expected = pd.DataFrame( + np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("names", [['a', ('b', 'c')], - [('a', 'b'), 'c']]) + @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): s1 = pd.Series(range(3), name=names[0]) s2 = pd.Series(range(1, 4), name=names[1]) @@ -1914,13 +2411,13 @@ def test_crosstab_tuple_name(self, names): tm.assert_frame_equal(result, expected) def test_crosstab_unsorted_order(self): - df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]}, - index=['C', 'A', 'B']) + df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) result = pd.crosstab(df.index, [df.b, df.a]) - e_idx = pd.Index(['A', 'B', 'C'], name='row_0') - e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], - names=['b', 'a']) - expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]], - index=e_idx, - columns=e_columns) + e_idx = pd.Index(["A", "B", "C"], name="row_0") + e_columns = pd.MultiIndex.from_tuples( + [(1, 4), (2, 6), (3, 5)], names=["b", "a"] + ) + expected = pd.DataFrame( + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index e66484822c625..cb46918157e89 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -4,8 +4,20 @@ import pytest from pandas import ( - Categorical, DatetimeIndex, Interval, IntervalIndex, NaT, Series, - TimedeltaIndex, Timestamp, cut, date_range, isna, qcut, timedelta_range) + Categorical, + DatetimeIndex, + Interval, + IntervalIndex, + NaT, + Series, + TimedeltaIndex, + Timestamp, + cut, + date_range, + isna, + qcut, + timedelta_range, +) from pandas.api.types import CategoricalDtype as CDT from pandas.core.algorithms import quantile import pandas.util.testing as tm @@ -19,7 +31,7 @@ def test_qcut(): # We store the bins as Index that have been # rounded to comparisons are a bit tricky. 
labels, bins = qcut(arr, 4, retbins=True) - ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) + ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0]) result = labels.categories.left.values assert np.allclose(result, ex_bins[:-1], atol=1e-2) @@ -40,7 +52,7 @@ def test_qcut_bounds(): def test_qcut_specify_quantiles(): arr = np.random.randn(100) - factor = qcut(arr, [0, .25, .5, .75, 1.]) + factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0]) expected = qcut(arr, 4) tm.assert_categorical_equal(factor, expected) @@ -55,8 +67,14 @@ def test_qcut_include_lowest(): values = np.arange(10) ii = qcut(values, 4) - ex_levels = IntervalIndex([Interval(-0.001, 2.25), Interval(2.25, 4.5), - Interval(4.5, 6.75), Interval(6.75, 9)]) + ex_levels = IntervalIndex( + [ + Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9), + ] + ) tm.assert_index_equal(ii.categories, ex_levels) @@ -93,8 +111,9 @@ def test_qcut_binning_issues(datapath): starts.append(float(s)) ends.append(float(e)) - for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]), - zip(ends[:-1], ends[1:])): + for (sp, sn), (ep, en) in zip( + zip(starts[:-1], starts[1:]), zip(ends[:-1], ends[1:]) + ): assert sp < sn assert ep < en assert ep <= sn @@ -104,19 +123,22 @@ def test_qcut_return_intervals(): ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) res = qcut(ser, [0, 0.333, 0.666, 1]) - exp_levels = np.array([Interval(-0.001, 2.664), - Interval(2.664, 5.328), Interval(5.328, 8)]) - exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( - CDT(ordered=True)) + exp_levels = np.array( + [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)] + ) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True)) tm.assert_series_equal(res, exp) -@pytest.mark.parametrize("kwargs,msg", [ - (dict(duplicates="drop"), None), - (dict(), "Bin edges must be unique"), - (dict(duplicates="raise"), "Bin edges must be unique"), - (dict(duplicates="foo"), "invalid value for 'duplicates' parameter") -]) +@pytest.mark.parametrize( + "kwargs,msg", + [ + (dict(duplicates="drop"), None), + (dict(), "Bin edges must be unique"), + (dict(duplicates="raise"), "Bin edges must be unique"), + (dict(duplicates="foo"), "invalid value for 'duplicates' parameter"), + ], +) def test_qcut_duplicates_bin(kwargs, msg): # see gh-7751 values = [0, 0, 0, 0, 1, 2, 3] @@ -130,11 +152,9 @@ def test_qcut_duplicates_bin(kwargs, msg): tm.assert_index_equal(result.categories, expected) -@pytest.mark.parametrize("data,start,end", [ - (9.0, 8.999, 9.0), - (0.0, -0.001, 0.0), - (-9.0, -9.001, -9.0), -]) +@pytest.mark.parametrize( + "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)] +) @pytest.mark.parametrize("length", [1, 2]) @pytest.mark.parametrize("labels", [None, False]) def test_single_quantile(data, start, end, length, labels): @@ -143,8 +163,7 @@ def test_single_quantile(data, start, end, length, labels): result = qcut(ser, 1, labels=labels) if labels is None: - intervals = IntervalIndex([Interval(start, end)] * - length, closed="right") + intervals = IntervalIndex([Interval(start, end)] * length, closed="right") expected = Series(intervals).astype(CDT(ordered=True)) else: expected = Series([0] * length) @@ -152,15 +171,19 @@ def test_single_quantile(data, start, end, length, labels): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("ser", [ - Series(DatetimeIndex(["20180101", NaT, "20180103"])), - Series(TimedeltaIndex(["0 days", NaT, "2 days"]))], - ids=lambda x: str(x.dtype)) 
+@pytest.mark.parametrize( + "ser", + [ + Series(DatetimeIndex(["20180101", NaT, "20180103"])), + Series(TimedeltaIndex(["0 days", NaT, "2 days"])), + ], + ids=lambda x: str(x.dtype), +) def test_qcut_nat(ser): # see gh-19768 - intervals = IntervalIndex.from_tuples([ - (ser[0] - Nano(), ser[2] - Day()), - np.nan, (ser[2] - Day(), ser[2])]) + intervals = IntervalIndex.from_tuples( + [(ser[0] - Nano(), ser[2] - Day()), np.nan, (ser[2] - Day(), ser[2])] + ) expected = Series(Categorical(intervals, ordered=True)) result = qcut(ser, 2) @@ -174,22 +197,40 @@ def test_datetime_tz_qcut(bins): ser = Series(date_range("20130101", periods=3, tz=tz)) result = qcut(ser, bins) - expected = Series(IntervalIndex([ - Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz)), - Interval(Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz)), - Interval(Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz))])).astype( - CDT(ordered=True)) + expected = Series( + IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:59:59.999999999", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + ).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("arg,expected_bins", [ - [timedelta_range("1day", periods=3), - TimedeltaIndex(["1 days", "2 days", "3 days"])], - [date_range("20180101", periods=3), - DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"])]]) +@pytest.mark.parametrize( + "arg,expected_bins", + [ + [ + timedelta_range("1day", periods=3), + TimedeltaIndex(["1 days", "2 days", "3 days"]), + ], + [ + date_range("20180101", periods=3), + DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]), + ], + ], +) def test_date_like_qcut_bins(arg, expected_bins): # see gh-19891 ser = Series(arg) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index d0979fb86d36d..1c9e3e57bc310 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -15,22 +15,19 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestGetDummies: - @pytest.fixture def df(self): - return DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]}) - @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None]) + @pytest.fixture(params=["uint8", "i8", np.float64, bool, None]) def dtype(self, request): return np.dtype(request.param) - @pytest.fixture(params=['dense', 'sparse']) + @pytest.fixture(params=["dense", "sparse"]) def sparse(self, request): # params are strings to simplify reading test results, # e.g. 
TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] - return request.param == 'sparse' + return request.param == "sparse" def effective_dtype(self, dtype): if dtype is None: @@ -39,17 +36,17 @@ def effective_dtype(self, dtype): def test_raises_on_dtype_object(self, df): with pytest.raises(ValueError): - get_dummies(df, dtype='object') + get_dummies(df, dtype="object") def test_basic(self, sparse, dtype): - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype)) + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + ) if sparse: expected = expected.apply(pd.SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) @@ -58,23 +55,23 @@ def test_basic(self, sparse, dtype): result = get_dummies(s_series, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) - expected.index = list('ABC') + expected.index = list("ABC") result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) assert_frame_equal(result, expected) def test_basic_types(self, sparse, dtype): # GH 10531 - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_df = DataFrame({'a': [0, 1, 0, 1, 2], - 'b': ['A', 'A', 'B', 'C', 'C'], - 'c': [2, 3, 3, 3, 2]}) - - expected = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=self.effective_dtype(dtype), - columns=list('abc')) + s_df = DataFrame( + {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]} + ) + + expected = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype=self.effective_dtype(dtype), + columns=list("abc"), + ) if sparse: if is_integer_dtype(dtype): fill_value = 0 @@ -90,12 +87,10 @@ def test_basic_types(self, sparse, dtype): result = get_dummies(s_series, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) - result = get_dummies(s_df, columns=s_df.columns, - sparse=sparse, dtype=dtype) + result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype) if sparse: - dtype_name = 'Sparse[{}, {}]'.format( - self.effective_dtype(dtype).name, - fill_value + dtype_name = "Sparse[{}, {}]".format( + self.effective_dtype(dtype).name, fill_value ) else: dtype_name = self.effective_dtype(dtype).name @@ -105,9 +100,9 @@ def test_basic_types(self, sparse, dtype): result.index = [str(i) for i in result.index] tm.assert_series_equal(result, expected) - result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) + result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype) - expected_counts = {'int64': 1, 'object': 1} + expected_counts = {"int64": 1, "object": 1} expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) expected = Series(expected_counts).sort_index() @@ -119,7 +114,7 @@ def test_basic_types(self, sparse, dtype): def test_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) - just_na_series_index = Series(just_na_list, index=['A']) + just_na_series_index = Series(just_na_list, index=["A"]) res_list = get_dummies(just_na_list, sparse=sparse) res_series = get_dummies(just_na_series, sparse=sparse) @@ -131,66 +126,68 @@ def test_just_na(self, sparse): assert res_list.index.tolist() == [0] assert res_series.index.tolist() == [0] - assert res_series_index.index.tolist() == 
['A'] + assert res_series_index.index.tolist() == ["A"] def test_include_na(self, sparse, dtype): - s = ['a', 'b', np.nan] + s = ["a", "b", np.nan] res = get_dummies(s, sparse=sparse, dtype=dtype) - exp = DataFrame({'a': [1, 0, 0], - 'b': [0, 1, 0]}, - dtype=self.effective_dtype(dtype)) + exp = DataFrame( + {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) + ) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) - exp_na = DataFrame({nan: [0, 0, 1], - 'a': [1, 0, 0], - 'b': [0, 1, 0]}, - dtype=self.effective_dtype(dtype)) - exp_na = exp_na.reindex(['a', 'b', nan], axis=1) + exp_na = DataFrame( + {nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]}, + dtype=self.effective_dtype(dtype), + ) + exp_na = exp_na.reindex(["a", "b", nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, - sparse=sparse, dtype=dtype) - exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=self.effective_dtype(dtype)) + res_just_na = get_dummies([nan], dummy_na=True, sparse=sparse, dtype=dtype) + exp_just_na = DataFrame( + Series(1, index=[0]), columns=[nan], dtype=self.effective_dtype(dtype) + ) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self, sparse): # See GH 6885 - get_dummies chokes on unicode values import unicodedata - e = 'e' - eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') + + e = "e" + eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE") s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=sparse) - exp = DataFrame({'letter_e': [1, 0, 0], - 'letter_%s' % eacute: [0, 1, 1]}, - dtype=np.uint8) + res = get_dummies(s, prefix="letter", sparse=sparse) + exp = DataFrame( + {"letter_e": [1, 0, 0], "letter_%s" % eacute: [0, 1, 1]}, dtype=np.uint8 + ) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): - df = df[['A', 'B']] + df = df[["A", "B"]] result = get_dummies(df, sparse=sparse) - expected = DataFrame({'A_a': [1, 0, 1], - 'A_b': [0, 1, 0], - 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame( + {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]}, + dtype=np.uint8, + ) if sparse: - expected = pd.DataFrame({ - "A_a": pd.SparseArray([1, 0, 1], dtype='uint8'), - "A_b": pd.SparseArray([0, 1, 0], dtype='uint8'), - "B_b": pd.SparseArray([1, 1, 0], dtype='uint8'), - "B_c": pd.SparseArray([0, 0, 1], dtype='uint8'), - }) + expected = pd.DataFrame( + { + "A_a": pd.SparseArray([1, 0, 1], dtype="uint8"), + "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"), + "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"), + "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"), + } + ) assert_frame_equal(result, expected) @@ -202,26 +199,34 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): else: arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3], - 'A_a': arr([1, 0, 1], dtype=typ), - 'A_b': arr([0, 1, 0], dtype=typ), - 'B_b': arr([1, 1, 0], dtype=typ), - 'B_c': arr([0, 0, 1], dtype=typ)}) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], 
dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + } + ) + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_list(self, df, sparse): - prefixes = ['from_A', 'from_B'] + prefixes = ["from_A", "from_B"] result = get_dummies(df, prefix=prefixes, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0], - 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}, - dtype=np.uint8) - expected[['C']] = df[['C']] - cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected = expected[['C'] + cols] + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] + expected = expected[["C"] + cols] typ = pd.SparseArray if sparse else pd.Series expected[cols] = expected[cols].apply(lambda x: typ(x)) @@ -229,98 +234,109 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - result = get_dummies(df, prefix='bad', sparse=sparse) - bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] - expected = DataFrame([[1, 1, 0, 1, 0], - [2, 0, 1, 1, 0], - [3, 1, 0, 0, 1]], - columns=['C'] + bad_columns, - dtype=np.uint8) + result = get_dummies(df, prefix="bad", sparse=sparse) + bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"] + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["C"] + bad_columns, + dtype=np.uint8, + ) expected = expected.astype({"C": np.int64}) if sparse: # work around astyping & assigning with duplicate columns # https://github.com/pandas-dev/pandas/issues/14427 - expected = pd.concat([ - pd.Series([1, 2, 3], name='C'), - pd.Series([1, 0, 1], name='bad_a', dtype='Sparse[uint8]'), - pd.Series([0, 1, 0], name='bad_b', dtype='Sparse[uint8]'), - pd.Series([1, 1, 0], name='bad_b', dtype='Sparse[uint8]'), - pd.Series([0, 0, 1], name='bad_c', dtype='Sparse[uint8]'), - ], axis=1) + expected = pd.concat( + [ + pd.Series([1, 2, 3], name="C"), + pd.Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"), + pd.Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"), + pd.Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"), + ], + axis=1, + ) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self, df, sparse): - result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=sparse) - expected = DataFrame({'B': ['b', 'b', 'c'], - 'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0]}, dtype=np.uint8) - expected[['C']] = df[['C']] + result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse) + expected = DataFrame( + { + "B": ["b", "b", "c"], + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] if sparse: - cols = ['from_A_a', 'from_A_b'] + cols = ["from_A_a", "from_A_b"] expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): - result = get_dummies(df, prefix_sep='..', sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'A..a': [1, 0, 1], - 'A..b': [0, 1, 0], - 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}, - 
dtype=np.uint8) - expected[['C']] = df[['C']] - expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + result = get_dummies(df, prefix_sep="..", sparse=sparse) + expected = DataFrame( + { + "C": [1, 2, 3], + "A..a": [1, 0, 1], + "A..b": [0, 1, 0], + "B..b": [1, 1, 0], + "B..c": [0, 0, 1], + }, + dtype=np.uint8, + ) + expected[["C"]] = df[["C"]] + expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: - cols = ['A..a', 'A..b', 'B..b', 'B..c'] + cols = ["A..a", "A..b", "B..b", "B..c"] expected[cols] = expected[cols].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse) - expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) + result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse) + expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, - sparse=sparse) + result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(df, prefix=['too few'], sparse=sparse) + get_dummies(df, prefix=["too few"], sparse=sparse) def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(df, prefix_sep=['bad'], sparse=sparse) + get_dummies(df, prefix_sep=["bad"], sparse=sparse) def test_dataframe_dummies_prefix_dict(self, sparse): - prefixes = {'A': 'from_A', 'B': 'from_B'} - df = DataFrame({'C': [1, 2, 3], - 'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c']}) + prefixes = {"A": "from_A", "B": "from_B"} + df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]}) result = get_dummies(df, prefix=prefixes, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1, 0, 1], - 'from_A_b': [0, 1, 0], - 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}) + expected = DataFrame( + { + "C": [1, 2, 3], + "from_A_a": [1, 0, 1], + "from_A_b": [0, 1, 0], + "from_B_b": [1, 1, 0], + "from_B_c": [0, 0, 1], + } + ) - columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(np.uint8) if sparse: - expected[columns] = expected[columns].apply( - lambda x: pd.SparseSeries(x) - ) + expected[columns] = expected[columns].apply(lambda x: pd.SparseSeries(x)) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self, df, sparse, dtype): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, - sparse=sparse, dtype=dtype).sort_index(axis=1) + result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index( + axis=1 + ) if sparse: arr = SparseArray @@ -329,23 +345,26 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype): arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_a': arr([1, 0, 1, 0], dtype=typ), - 'A_b': arr([0, 1, 0, 0], dtype=typ), - 'A_nan': arr([0, 0, 0, 1], dtype=typ), - 'B_b': arr([1, 1, 0, 0], dtype=typ), - 'B_c': arr([0, 0, 1, 0], dtype=typ), - 'B_nan': arr([0, 0, 0, 1], dtype=typ) - }).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_a": arr([1, 0, 1, 0], dtype=typ), + "A_b": arr([0, 1, 0, 0], dtype=typ), + "A_nan": arr([0, 0, 0, 1], dtype=typ), + "B_b": arr([1, 1, 0, 0], dtype=typ), + "B_c": arr([0, 0, 1, 0], dtype=typ), + 
"B_nan": arr([0, 0, 0, 1], dtype=typ), + } + ).sort_index(axis=1) assert_frame_equal(result, expected) result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] + expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): - df['cat'] = pd.Categorical(['x', 'y', 'y']) + df["cat"] = pd.Categorical(["x", "y", "y"]) result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray @@ -354,29 +373,41 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): arr = np.array typ = dtype - expected = DataFrame({'C': [1, 2, 3], - 'A_a': arr([1, 0, 1], dtype=typ), - 'A_b': arr([0, 1, 0], dtype=typ), - 'B_b': arr([1, 1, 0], dtype=typ), - 'B_c': arr([0, 0, 1], dtype=typ), - 'cat_x': arr([1, 0, 0], dtype=typ), - 'cat_y': arr([0, 1, 1], dtype=typ) - }).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3], + "A_a": arr([1, 0, 1], dtype=typ), + "A_b": arr([0, 1, 0], dtype=typ), + "B_b": arr([1, 1, 0], dtype=typ), + "B_c": arr([0, 0, 1], dtype=typ), + "cat_x": arr([1, 0, 0], dtype=typ), + "cat_y": arr([0, 1, 1], dtype=typ), + } + ).sort_index(axis=1) assert_frame_equal(result, expected) - @pytest.mark.parametrize('get_dummies_kwargs,expected', [ - ({'data': pd.DataFrame(({'ä': ['a']}))}, - pd.DataFrame({'ä_a': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['ä']})}, - pd.DataFrame({'x_ä': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['a']}), 'prefix':'ä'}, - pd.DataFrame({'ä_a': [1]}, dtype=np.uint8)), - - ({'data': pd.DataFrame({'x': ['a']}), 'prefix_sep':'ä'}, - pd.DataFrame({'xäa': [1]}, dtype=np.uint8))]) + @pytest.mark.parametrize( + "get_dummies_kwargs,expected", + [ + ( + {"data": pd.DataFrame(({"ä": ["a"]}))}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["ä"]})}, + pd.DataFrame({"x_ä": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix": "ä"}, + pd.DataFrame({"ä_a": [1]}, dtype=np.uint8), + ), + ( + {"data": pd.DataFrame({"x": ["a"]}), "prefix_sep": "ä"}, + pd.DataFrame({"xäa": [1]}, dtype=np.uint8), + ), + ], + ) def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): # GH22084 pd.get_dummies incorrectly encodes unicode characters # in dataframe column names @@ -386,13 +417,11 @@ def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): def test_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case - s_list = list('abc') + s_list = list("abc") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) - expected = DataFrame({'b': [0, 1, 0], - 'c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8) result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: @@ -402,15 +431,15 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - expected.index = list('ABC') + expected.index = list("ABC") result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) def test_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. 
- s_list = list('aaa') + s_list = list("aaa") s_series = Series(s_list) - s_series_index = Series(s_list, list('ABC')) + s_series_index = Series(s_list, list("ABC")) expected = DataFrame(index=np.arange(3)) @@ -420,56 +449,49 @@ def test_basic_drop_first_one_level(self, sparse): result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - expected = DataFrame(index=list('ABC')) + expected = DataFrame(index=list("ABC")) result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) def test_basic_drop_first_NA(self, sparse): # Test NA handling together with drop_first - s_NA = ['a', 'b', np.nan] + s_NA = ["a", "b", np.nan] res = get_dummies(s_NA, drop_first=True, sparse=sparse) - exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8) + exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: exp = exp.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, - sparse=sparse) - exp_na = DataFrame( - {'b': [0, 1, 0], - nan: [0, 0, 1]}, - dtype=np.uint8).reindex(['b', nan], axis=1) + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse) + exp_na = DataFrame({"b": [0, 1, 0], nan: [0, 0, 1]}, dtype=np.uint8).reindex( + ["b", nan], axis=1 + ) if sparse: exp_na = exp_na.apply(pd.SparseArray, fill_value=0) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, - sparse=sparse) + res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, sparse=sparse) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) def test_dataframe_dummies_drop_first(self, df, sparse): - df = df[['A', 'B']] + df = df[["A", "B"]] result = get_dummies(df, drop_first=True, sparse=sparse) - expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, - dtype=np.uint8) + expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: expected = expected.apply(pd.SparseArray, fill_value=0) assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_categorical( - self, df, sparse, dtype): - df['cat'] = pd.Categorical(['x', 'y', 'y']) + def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): + df["cat"] = pd.Categorical(["x", "y", "y"]) result = get_dummies(df, drop_first=True, sparse=sparse) - expected = DataFrame({'C': [1, 2, 3], - 'A_b': [0, 1, 0], - 'B_c': [0, 0, 1], - 'cat_y': [0, 1, 1]}) - cols = ['A_b', 'B_c', 'cat_y'] + expected = DataFrame( + {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]} + ) + cols = ["A_b", "B_c", "cat_y"] expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'A_b', 'B_c', 'cat_y']] + expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: expected[col] = pd.SparseSeries(expected[col]) @@ -477,14 +499,19 @@ def test_dataframe_dummies_drop_first_with_categorical( def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, drop_first=True, - sparse=sparse).sort_index(axis=1) - expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_b': [0, 1, 0, 0], - 'A_nan': [0, 0, 0, 1], - 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}) - cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] + result = get_dummies( + df, dummy_na=True, drop_first=True, sparse=sparse + ).sort_index(axis=1) + expected = DataFrame( + { + "C": [1, 2, 3, np.nan], + "A_b": [0, 1, 0, 
0], + "A_nan": [0, 0, 0, 1], + "B_c": [0, 0, 1, 0], + "B_nan": [0, 0, 0, 1], + } + ) + cols = ["A_b", "A_nan", "B_c", "B_nan"] expected[cols] = expected[cols].astype(np.uint8) expected = expected.sort_index(axis=1) if sparse: @@ -493,85 +520,76 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, drop_first=True, - sparse=sparse) - expected = expected[['C', 'A_b', 'B_c']] + result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse) + expected = expected[["C", "A_b", "B_c"]] assert_frame_equal(result, expected) def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], - [0, 1], - [1, 0]], - columns=[1, 2], - dtype=np.uint8) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) - data = Series(pd.Categorical(['a', 'b', 'a'])) + data = Series(pd.Categorical(["a", "b", "a"])) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], - [0, 1], - [1, 0]], - columns=pd.Categorical(['a', 'b']), - dtype=np.uint8) + expected = DataFrame( + [[1, 0], [0, 1], [1, 0]], columns=pd.Categorical(["a", "b"]), dtype=np.uint8 + ) tm.assert_frame_equal(result, expected) def test_int_df(self, dtype): data = DataFrame( - {'A': [1, 2, 1], - 'B': pd.Categorical(['a', 'b', 'a']), - 'C': [1, 2, 1], - 'D': [1., 2., 1.] - } + { + "A": [1, 2, 1], + "B": pd.Categorical(["a", "b", "a"]), + "C": [1, 2, 1], + "D": [1.0, 2.0, 1.0], + } + ) + columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"] + expected = DataFrame( + [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]], + columns=columns, ) - columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b'] - expected = DataFrame([ - [1, 1., 1, 0, 1, 0], - [2, 2., 0, 1, 0, 1], - [1, 1., 1, 0, 1, 0] - ], columns=columns) expected[columns[2:]] = expected[columns[2:]].astype(dtype) - result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype) + result = pd.get_dummies(data, columns=["A", "B"], dtype=dtype) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): # GH13854 for ordered in [False, True]: - cat = pd.Categorical(list("xy"), categories=list("xyz"), - ordered=ordered) + cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], - dtype=self.effective_dtype(dtype)) - cols = pd.CategoricalIndex(cat.categories, - categories=cat.categories, - ordered=ordered) - expected = DataFrame(data, columns=cols, - dtype=self.effective_dtype(dtype)) + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype)) + cols = pd.CategoricalIndex( + cat.categories, categories=cat.categories, ordered=ordered + ) + expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('sparse', [True, False]) + @pytest.mark.parametrize("sparse", [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 - df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]), - ('Nation', ['AB', 'CD'])])) - df = get_dummies(df, columns=['Nation'], sparse=sparse) - df2 = df.reindex(columns=['GDP']) + df = DataFrame.from_dict( + OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])]) + ) + df = get_dummies(df, columns=["Nation"], sparse=sparse) + df2 = df.reindex(columns=["GDP"]) - tm.assert_frame_equal(df[['GDP']], df2) + 
tm.assert_frame_equal(df[["GDP"]], df2) def test_get_dummies_duplicate_columns(self, df): # GH20839 df.columns = ["A", "A", "A"] result = get_dummies(df).sort_index(axis=1) - expected = DataFrame([[1, 1, 0, 1, 0], - [2, 0, 1, 1, 0], - [3, 1, 0, 0, 1]], - columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'], - dtype=np.uint8).sort_index(axis=1) + expected = DataFrame( + [[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], + columns=["A", "A_a", "A_b", "A_b", "A_c"], + dtype=np.uint8, + ).sort_index(axis=1) expected = expected.astype({"A": np.int64}) @@ -579,54 +597,54 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = pd.DataFrame({"A": [1, 2]}) - result = pd.get_dummies(df, columns=['A'], sparse=True) - dtype = SparseDtype('uint8', 0) - expected = pd.DataFrame({ - 'A_1': SparseArray([1, 0], dtype=dtype), - 'A_2': SparseArray([0, 1], dtype=dtype), - }) + result = pd.get_dummies(df, columns=["A"], sparse=True) + dtype = SparseDtype("uint8", 0) + expected = pd.DataFrame( + { + "A_1": SparseArray([1, 0], dtype=dtype), + "A_2": SparseArray([0, 1], dtype=dtype), + } + ) tm.assert_frame_equal(result, expected) class TestCategoricalReshape: - def test_reshaping_multi_index_categorical(self): - cols = ['ItemA', 'ItemB', 'ItemC'] + cols = ["ItemA", "ItemB", "ItemC"] data = {c: tm.makeTimeDataFrame() for c in cols} - df = pd.concat({c: data[c].stack() for c in data}, axis='columns') - df.index.names = ['major', 'minor'] - df['str'] = 'foo' + df = pd.concat({c: data[c].stack() for c in data}, axis="columns") + df.index.names = ["major", "minor"] + df["str"] = "foo" dti = df.index.levels[0] - df['category'] = df['str'].astype('category') - result = df['category'].unstack() + df["category"] = df["str"].astype("category") + result = df["category"].unstack() - c = Categorical(['foo'] * len(dti)) - expected = DataFrame({'A': c.copy(), - 'B': c.copy(), - 'C': c.copy(), - 'D': c.copy()}, - columns=Index(list('ABCD'), name='minor'), - index=dti) + c = Categorical(["foo"] * len(dti)) + expected = DataFrame( + {"A": c.copy(), "B": c.copy(), "C": c.copy(), "D": c.copy()}, + columns=Index(list("ABCD"), name="minor"), + index=dti, + ) tm.assert_frame_equal(result, expected) class TestMakeAxisDummies: - def test_preserve_categorical_dtype(self): # GH13854 for ordered in [False, True]: cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) - midx = pd.MultiIndex(levels=[['a'], cidx], - codes=[[0, 0], [0, 1]]) + midx = pd.MultiIndex(levels=[["a"], cidx], codes=[[0, 0], [0, 1]]) df = DataFrame([[10, 11]], index=midx) - expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], - index=midx, columns=cidx) + expected = DataFrame( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], index=midx, columns=cidx + ) from pandas.core.reshape.reshape import make_axis_dummies + result = make_axis_dummies(df) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 75dc2ccc54a83..188f08777668e 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -9,44 +9,47 @@ class TestUnionCategoricals: - def test_union_categorical(self): # GH 13361 data = [ - (list('abc'), list('abd'), list('abcabd')), + (list("abc"), list("abd"), list("abcabd")), ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), - - (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], - ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), - - 
(pd.date_range('2014-01-01', '2014-01-05'), - pd.date_range('2014-01-06', '2014-01-07'), - pd.date_range('2014-01-01', '2014-01-07')), - - (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), - pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), - pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), - - (pd.period_range('2014-01-01', '2014-01-05'), - pd.period_range('2014-01-06', '2014-01-07'), - pd.period_range('2014-01-01', '2014-01-07')), + ( + ["b", "b", np.nan, "a"], + ["a", np.nan, "c"], + ["b", "b", np.nan, "a", "a", np.nan, "c"], + ), + ( + pd.date_range("2014-01-01", "2014-01-05"), + pd.date_range("2014-01-06", "2014-01-07"), + pd.date_range("2014-01-01", "2014-01-07"), + ), + ( + pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"), + pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"), + pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"), + ), + ( + pd.period_range("2014-01-01", "2014-01-05"), + pd.period_range("2014-01-06", "2014-01-07"), + pd.period_range("2014-01-01", "2014-01-07"), + ), ] for a, b, combined in data: for box in [Categorical, CategoricalIndex, Series]: - result = union_categoricals([box(Categorical(a)), - box(Categorical(b))]) + result = union_categoricals([box(Categorical(a)), box(Categorical(b))]) expected = Categorical(combined) - tm.assert_categorical_equal(result, expected, - check_category_order=True) + tm.assert_categorical_equal(result, expected, check_category_order=True) # new categories ordered by appearance - s = Categorical(['x', 'y', 'z']) - s2 = Categorical(['a', 'b', 'c']) + s = Categorical(["x", "y", "z"]) + s2 = Categorical(["a", "b", "c"]) result = union_categoricals([s, s2]) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) s = Categorical([0, 1.2, 2], ordered=True) @@ -58,60 +61,66 @@ def test_union_categorical(self): # must exactly match types s = Categorical([0, 1.2, 2]) s2 = Categorical([2, 3, 4]) - msg = 'dtype of categories must be the same' + msg = "dtype of categories must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([s, s2]) - msg = 'No Categoricals to union' + msg = "No Categoricals to union" with pytest.raises(ValueError, match=msg): union_categoricals([]) def test_union_categoricals_nan(self): # GH 13759 - res = union_categoricals([pd.Categorical([1, 2, np.nan]), - pd.Categorical([3, 2, np.nan])]) + res = union_categoricals( + [pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])] + ) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([pd.Categorical(['A', 'B']), - pd.Categorical(['B', 'B', np.nan])]) - exp = Categorical(['A', 'B', 'B', 'B', np.nan]) + res = union_categoricals( + [pd.Categorical(["A", "B"]), pd.Categorical(["B", "B", np.nan])] + ) + exp = Categorical(["A", "B", "B", "B", np.nan]) tm.assert_categorical_equal(res, exp) - val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), - pd.NaT] - val2 = [pd.NaT, pd.Timestamp('2011-01-01'), - pd.Timestamp('2011-02-01')] + val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT] + val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) - exp = Categorical(val1 + val2, - categories=[pd.Timestamp('2011-01-01'), - 
pd.Timestamp('2011-03-01'), - pd.Timestamp('2011-02-01')]) + exp = Categorical( + val1 + val2, + categories=[ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-03-01"), + pd.Timestamp("2011-02-01"), + ], + ) tm.assert_categorical_equal(res, exp) # all NaN - res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan], - dtype=object)), - pd.Categorical(['X'])]) - exp = Categorical([np.nan, np.nan, 'X']) + res = union_categoricals( + [ + pd.Categorical(np.array([np.nan, np.nan], dtype=object)), + pd.Categorical(["X"]), + ] + ) + exp = Categorical([np.nan, np.nan, "X"]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([pd.Categorical([np.nan, np.nan]), - pd.Categorical([np.nan, np.nan])]) + res = union_categoricals( + [pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])] + ) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp) def test_union_categoricals_empty(self): # GH 13759 - res = union_categoricals([pd.Categorical([]), - pd.Categorical([])]) + res = union_categoricals([pd.Categorical([]), pd.Categorical([])]) exp = Categorical([]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([Categorical([]), - Categorical(['1'])]) - exp = Categorical(['1']) + res = union_categoricals([Categorical([]), Categorical(["1"])]) + exp = Categorical(["1"]) tm.assert_categorical_equal(res, exp) def test_union_categorical_same_category(self): @@ -119,31 +128,30 @@ def test_union_categorical_same_category(self): c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) res = union_categoricals([c1, c2]) - exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], - categories=[1, 2, 3, 4]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4]) tm.assert_categorical_equal(res, exp) - c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) - c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"]) + c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"]) res = union_categoricals([c1, c2]) - exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], - categories=['x', 'y', 'z']) + exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"]) tm.assert_categorical_equal(res, exp) def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 - c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) - c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2]) - expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical( + ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) - msg = 'Categorical.ordered must be the same' + msg = "Categorical.ordered must be the same" with pytest.raises(TypeError, match=msg): union_categoricals([c1, c2]) @@ -174,7 +182,7 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - msg = 'Categorical.ordered must be the same' + msg = "Categorical.ordered must be the same" with pytest.raises(TypeError, match=msg): 
union_categoricals([c1, c2], ignore_order=False) @@ -183,8 +191,7 @@ def test_union_categoricals_ignore_order(self): tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=False) - exp = Categorical([1, 2, 3, 1, 2, 3], - categories=[1, 2, 3], ordered=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) @@ -201,8 +208,7 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([c2, c1], ignore_order=True, - sort_categories=True) + res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) @@ -221,41 +227,38 @@ def test_union_categoricals_ignore_order(self): def test_union_categoricals_sort(self): # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'x', 'y', 'z']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"] + ) tm.assert_categorical_equal(result, expected) # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) - c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) + c1 = Categorical(["a", "b"], categories=["c", "a", "b"]) + c2 = Categorical(["b", "c"], categories=["c", "a", "b"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=True) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['b', 'x']) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) @@ -270,41 +273,39 @@ def test_union_categoricals_sort(self): expected = Categorical([]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['b', 'a'], categories=['b', 
'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) with pytest.raises(TypeError): union_categoricals([c1, c2], sort_categories=True) def test_union_categoricals_sort_false(self): # GH 13846 - c1 = Categorical(['x', 'y', 'z']) - c2 = Categorical(['a', 'b', 'c']) + c1 = Categorical(["x", "y", "z"]) + c2 = Categorical(["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], - categories=['x', 'y', 'z', 'a', 'b', 'c']) + expected = Categorical( + ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"] + ) tm.assert_categorical_equal(result, expected) # fastpath - c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) - c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + c1 = Categorical(["a", "b"], categories=["b", "a", "c"]) + c2 = Categorical(["b", "c"], categories=["b", "a", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['b', 'a', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"]) tm.assert_categorical_equal(result, expected) # fastpath - skip resort - c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + c1 = Categorical(["a", "b"], categories=["a", "b", "c"]) + c2 = Categorical(["b", "c"], categories=["a", "b", "c"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['a', 'b', 'b', 'c'], - categories=['a', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['x', np.nan]) - c2 = Categorical([np.nan, 'b']) + c1 = Categorical(["x", np.nan]) + c2 = Categorical([np.nan, "b"]) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['x', np.nan, np.nan, 'b'], - categories=['x', 'b']) + expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"]) tm.assert_categorical_equal(result, expected) c1 = Categorical([np.nan]) @@ -319,19 +320,20 @@ def test_union_categoricals_sort_false(self): expected = Categorical([]) tm.assert_categorical_equal(result, expected) - c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) - c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True) + c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True) result = union_categoricals([c1, c2], sort_categories=False) - expected = Categorical(['b', 'a', 'a', 'c'], - categories=['b', 'a', 'c'], ordered=True) + expected = Categorical( + ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True + ) tm.assert_categorical_equal(result, expected) def test_union_categorical_unwrap(self): # GH 14173 - c1 = Categorical(['a', 'b']) - c2 = pd.Series(['b', 'c'], dtype='category') + c1 = Categorical(["a", "b"]) + c2 = pd.Series(["b", "c"], dtype="category") result = union_categoricals([c1, c2]) - expected = Categorical(['a', 'b', 'b', 'c']) + expected = Categorical(["a", "b", "b", "c"]) tm.assert_categorical_equal(result, expected) c2 = CategoricalIndex(c2) @@ -343,4 +345,4 @@ def test_union_categorical_unwrap(self): tm.assert_categorical_equal(result, expected) with 
pytest.raises(TypeError): - union_categoricals([c1, ['a', 'b', 'c']]) + union_categoricals([c1, ["a", "b", "c"]]) diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 92a3bb9e29219..60c6d7ec3017b 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -7,11 +7,10 @@ class TestCartesianProduct: - def test_simple(self): - x, y = list('ABC'), [1, 22] + x, y = list("ABC"), [1, 22] result1, result2 = cartesian_product([x, y]) - expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C']) + expected1 = np.array(["A", "A", "B", "B", "C", "C"]) expected2 = np.array([1, 22, 1, 22, 1, 22]) tm.assert_numpy_array_equal(result1, expected1) tm.assert_numpy_array_equal(result2, expected2) @@ -19,7 +18,7 @@ def test_simple(self): def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent - x = date_range('2000-01-01', periods=2) + x = date_range("2000-01-01", periods=2) result1, result2 = [Index(y).day for y in cartesian_product([x, x])] expected1 = Index([1, 1, 2, 2]) expected2 = Index([1, 2, 1, 2]) @@ -29,7 +28,7 @@ def test_datetimeindex(self): def test_empty(self): # product of empty factors X = [[], [0, 1], []] - Y = [[], [], ['a', 'b', 'c']] + Y = [[], [], ["a", "b", "c"]] for x, y in zip(X, Y): expected1 = np.array([], dtype=np.asarray(x).dtype) expected2 = np.array([], dtype=np.asarray(y).dtype) @@ -42,10 +41,9 @@ def test_empty(self): expected = [] assert result == expected - @pytest.mark.parametrize("X", [ - 1, [1], [1, 2], [[1], 2], - 'a', ['a'], ['a', 'b'], [['a'], 'b'] - ]) + @pytest.mark.parametrize( + "X", [1, [1], [1, 2], [[1], 2], "a", ["a"], ["a", "b"], [["a"], "b"]] + ) def test_invalid_input(self, X): msg = "Input must be a list-like of list-likes" diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index 6645244318776..e4987e4483fd9 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -11,9 +11,8 @@ def interval(): class TestInterval: - def test_properties(self, interval): - assert interval.closed == 'right' + assert interval.closed == "right" assert interval.left == 0 assert interval.right == 1 assert interval.mid == 0.5 @@ -22,7 +21,7 @@ def test_repr(self, interval): assert repr(interval) == "Interval(0, 1, closed='right')" assert str(interval) == "(0, 1]" - interval_left = Interval(0, 1, closed='left') + interval_left = Interval(0, 1, closed="left") assert repr(interval_left) == "Interval(0, 1, closed='left')" assert str(interval_left) == "[0, 1)" @@ -35,22 +34,22 @@ def test_contains(self, interval): with pytest.raises(TypeError, match=msg): interval in interval - interval_both = Interval(0, 1, closed='both') + interval_both = Interval(0, 1, closed="both") assert 0 in interval_both assert 1 in interval_both - interval_neither = Interval(0, 1, closed='neither') + interval_neither = Interval(0, 1, closed="neither") assert 0 not in interval_neither assert 0.5 in interval_neither assert 1 not in interval_neither def test_equal(self): - assert Interval(0, 1) == Interval(0, 1, closed='right') - assert Interval(0, 1) != Interval(0, 1, closed='left') + assert Interval(0, 1) == Interval(0, 1, closed="right") + assert Interval(0, 1) != Interval(0, 1, closed="left") assert Interval(0, 1) != 0 def test_comparison(self): - with pytest.raises(TypeError, match='unorderable types'): + with pytest.raises(TypeError, match="unorderable types"): 
Interval(0, 1) < 2 assert Interval(0, 1) < Interval(1, 2) @@ -64,29 +63,37 @@ def test_hash(self, interval): # should not raise hash(interval) - @pytest.mark.parametrize('left, right, expected', [ - (0, 5, 5), - (-2, 5.5, 7.5), - (10, 10, 0), - (10, np.inf, np.inf), - (-np.inf, -5, np.inf), - (-np.inf, np.inf, np.inf), - (Timedelta('0 days'), Timedelta('5 days'), Timedelta('5 days')), - (Timedelta('10 days'), Timedelta('10 days'), Timedelta('0 days')), - (Timedelta('1H10M'), Timedelta('5H5M'), Timedelta('3H55M')), - (Timedelta('5S'), Timedelta('1H'), Timedelta('59M55S'))]) + @pytest.mark.parametrize( + "left, right, expected", + [ + (0, 5, 5), + (-2, 5.5, 7.5), + (10, 10, 0), + (10, np.inf, np.inf), + (-np.inf, -5, np.inf), + (-np.inf, np.inf, np.inf), + (Timedelta("0 days"), Timedelta("5 days"), Timedelta("5 days")), + (Timedelta("10 days"), Timedelta("10 days"), Timedelta("0 days")), + (Timedelta("1H10M"), Timedelta("5H5M"), Timedelta("3H55M")), + (Timedelta("5S"), Timedelta("1H"), Timedelta("59M55S")), + ], + ) def test_length(self, left, right, expected): # GH 18789 iv = Interval(left, right) result = iv.length assert result == expected - @pytest.mark.parametrize('left, right, expected', [ - ('2017-01-01', '2017-01-06', '5 days'), - ('2017-01-01', '2017-01-01 12:00:00', '12 hours'), - ('2017-01-01 12:00', '2017-01-01 12:00:00', '0 days'), - ('2017-01-01 12:01', '2017-01-05 17:31:00', '4 days 5 hours 30 min')]) - @pytest.mark.parametrize('tz', (None, 'UTC', 'CET', 'US/Eastern')) + @pytest.mark.parametrize( + "left, right, expected", + [ + ("2017-01-01", "2017-01-06", "5 days"), + ("2017-01-01", "2017-01-01 12:00:00", "12 hours"), + ("2017-01-01 12:00", "2017-01-01 12:00:00", "0 days"), + ("2017-01-01 12:01", "2017-01-05 17:31:00", "4 days 5 hours 30 min"), + ], + ) + @pytest.mark.parametrize("tz", (None, "UTC", "CET", "US/Eastern")) def test_length_timestamp(self, tz, left, right, expected): # GH 18789 iv = Interval(Timestamp(left, tz=tz), Timestamp(right, tz=tz)) @@ -94,12 +101,18 @@ def test_length_timestamp(self, tz, left, right, expected): expected = Timedelta(expected) assert result == expected - @pytest.mark.parametrize('left, right', [ - (0, 1), - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timestamp('2018-01-02')), - (Timestamp('2018-01-01', tz='US/Eastern'), - Timestamp('2018-01-02', tz='US/Eastern'))]) + @pytest.mark.parametrize( + "left, right", + [ + (0, 1), + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timestamp("2018-01-02")), + ( + Timestamp("2018-01-01", tz="US/Eastern"), + Timestamp("2018-01-02", tz="US/Eastern"), + ), + ], + ) def test_is_empty(self, left, right, closed): # GH27219 # non-empty always return False @@ -109,16 +122,19 @@ def test_is_empty(self, left, right, closed): # same endpoint is empty except when closed='both' (contains one point) iv = Interval(left, left, closed) result = iv.is_empty - expected = closed != 'both' + expected = closed != "both" assert result is expected - @pytest.mark.parametrize('left, right', [ - ('a', 'z'), - (('a', 'b'), ('c', 'd')), - (list('AB'), list('ab')), - (Interval(0, 1), Interval(1, 2)), - (Period('2018Q1', freq='Q'), Period('2018Q1', freq='Q')) - ]) + @pytest.mark.parametrize( + "left, right", + [ + ("a", "z"), + (("a", "b"), ("c", "d")), + (list("AB"), list("ab")), + (Interval(0, 1), Interval(1, 2)), + (Period("2018Q1", freq="Q"), Period("2018Q1", freq="Q")), + ], + ) def test_construct_errors(self, left, right): # GH 23013 msg = "Only numeric, Timestamp and 
Timedelta endpoints are allowed" @@ -144,7 +160,7 @@ def test_math_add(self, closed): interval + interval with pytest.raises(TypeError, match=msg): - interval + 'foo' + interval + "foo" def test_math_sub(self, closed): interval = Interval(0, 1, closed=closed) @@ -162,7 +178,7 @@ def test_math_sub(self, closed): interval - interval with pytest.raises(TypeError, match=msg): - interval - 'foo' + interval - "foo" def test_math_mult(self, closed): interval = Interval(0, 1, closed=closed) @@ -184,7 +200,7 @@ def test_math_mult(self, closed): msg = r"can\'t multiply sequence by non-int" with pytest.raises(TypeError, match=msg): - interval * 'foo' + interval * "foo" def test_math_div(self, closed): interval = Interval(0, 1, closed=closed) @@ -202,7 +218,7 @@ def test_math_div(self, closed): interval / interval with pytest.raises(TypeError, match=msg): - interval / 'foo' + interval / "foo" def test_math_floordiv(self, closed): interval = Interval(1, 2, closed=closed) @@ -220,23 +236,24 @@ def test_math_floordiv(self, closed): interval // interval with pytest.raises(TypeError, match=msg): - interval // 'foo' + interval // "foo" def test_constructor_errors(self): msg = "invalid option for 'closed': foo" with pytest.raises(ValueError, match=msg): - Interval(0, 1, closed='foo') + Interval(0, 1, closed="foo") - msg = 'left side of interval must be <= right side' + msg = "left side of interval must be <= right side" with pytest.raises(ValueError, match=msg): Interval(1, 0) - @pytest.mark.parametrize('tz_left, tz_right', [ - (None, 'UTC'), ('UTC', None), ('UTC', 'US/Eastern')]) + @pytest.mark.parametrize( + "tz_left, tz_right", [(None, "UTC"), ("UTC", None), ("UTC", "US/Eastern")] + ) def test_constructor_errors_tz(self, tz_left, tz_right): # GH 18538 - left = Timestamp('2017-01-01', tz=tz_left) - right = Timestamp('2017-01-02', tz=tz_right) + left = Timestamp("2017-01-01", tz=tz_left) + right = Timestamp("2017-01-02", tz=tz_right) error = TypeError if com._any_none(tz_left, tz_right) else ValueError with pytest.raises(error): Interval(left, right) diff --git a/pandas/tests/scalar/interval/test_ops.py b/pandas/tests/scalar/interval/test_ops.py index 963fe14d46dcd..f560c42617260 100644 --- a/pandas/tests/scalar/interval/test_ops.py +++ b/pandas/tests/scalar/interval/test_ops.py @@ -4,10 +4,14 @@ from pandas import Interval, Timedelta, Timestamp -@pytest.fixture(params=[ - (Timedelta('0 days'), Timedelta('1 day')), - (Timestamp('2018-01-01'), Timedelta('1 day')), - (0, 1)], ids=lambda x: type(x[0]).__name__) +@pytest.fixture( + params=[ + (Timedelta("0 days"), Timedelta("1 day")), + (Timestamp("2018-01-01"), Timedelta("1 day")), + (0, 1), + ], + ids=lambda x: type(x[0]).__name__, +) def start_shift(request): """ Fixture for generating intervals of types from a start value and a shift @@ -17,7 +21,6 @@ def start_shift(request): class TestOverlaps: - def test_overlaps_self(self, start_shift, closed): start, shift = start_shift interval = Interval(start, start + shift, closed) @@ -49,12 +52,15 @@ def test_overlaps_endpoint(self, start_shift, closed, other_closed): expected = interval1.closed_right and interval2.closed_left assert result == expected - @pytest.mark.parametrize('other', [ - 10, True, 'foo', Timedelta('1 day'), Timestamp('2018-01-01')], - ids=lambda x: type(x).__name__) + @pytest.mark.parametrize( + "other", + [10, True, "foo", Timedelta("1 day"), Timestamp("2018-01-01")], + ids=lambda x: type(x).__name__, + ) def test_overlaps_invalid_type(self, other): interval = Interval(0, 1) - msg = '`other` 
must be an Interval, got {other}'.format( - other=type(other).__name__) + msg = "`other` must be an Interval, got {other}".format( + other=type(other).__name__ + ) with pytest.raises(TypeError, match=msg): interval.overlaps(other) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index c6f649aeba12f..ee0ff87e31aea 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -1,7 +1,6 @@ import pytest -from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, _period_code_map) +from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map from pandas.errors import OutOfBoundsDatetime from pandas import Period, offsets @@ -9,10 +8,11 @@ class TestFreqConversion: """Test frequency conversion of date objects""" - @pytest.mark.parametrize('freq', ['A', 'Q', 'M', 'W', 'B', 'D']) + + @pytest.mark.parametrize("freq", ["A", "Q", "M", "W", "B", "D"]) def test_asfreq_near_zero(self, freq): # GH#19643, GH#19650 - per = Period('0001-01-01', freq=freq) + per = Period("0001-01-01", freq=freq) tup1 = (per.year, per.hour, per.day) prev = per - 1 @@ -22,673 +22,709 @@ def test_asfreq_near_zero(self, freq): def test_asfreq_near_zero_weekly(self): # GH#19834 - per1 = Period('0001-01-01', 'D') + 6 - per2 = Period('0001-01-01', 'D') - 6 - week1 = per1.asfreq('W') - week2 = per2.asfreq('W') + per1 = Period("0001-01-01", "D") + 6 + per2 = Period("0001-01-01", "D") - 6 + week1 = per1.asfreq("W") + week2 = per2.asfreq("W") assert week1 != week2 - assert week1.asfreq('D', 'E') >= per1 - assert week2.asfreq('D', 'S') <= per2 + assert week1.asfreq("D", "E") >= per1 + assert week2.asfreq("D", "S") <= per2 - @pytest.mark.xfail(reason='GH#19643 period_helper asfreq functions fail ' - 'to check for overflows') + @pytest.mark.xfail( + reason="GH#19643 period_helper asfreq functions fail " "to check for overflows" + ) def test_to_timestamp_out_of_bounds(self): # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848') - per = Period('0001-01-01', freq='B') + per = Period("0001-01-01", freq="B") with pytest.raises(OutOfBoundsDatetime): per.to_timestamp() def test_asfreq_corner(self): - val = Period(freq='A', year=2007) - result1 = val.asfreq('5t') - result2 = val.asfreq('t') - expected = Period('2007-12-31 23:59', freq='t') + val = Period(freq="A", year=2007) + result1 = val.asfreq("5t") + result2 = val.asfreq("t") + expected = Period("2007-12-31 23:59", freq="t") assert result1.ordinal == expected.ordinal - assert result1.freqstr == '5T' + assert result1.freqstr == "5T" assert result2.ordinal == expected.ordinal - assert result2.freqstr == 'T' + assert result2.freqstr == "T" def test_conv_annual(self): # frequency conversion tests: from Annual Frequency - ival_A = Period(freq='A', year=2007) + ival_A = Period(freq="A", year=2007) ival_AJAN = Period(freq="A-JAN", year=2007) ival_AJUN = Period(freq="A-JUN", year=2007) ival_ANOV = Period(freq="A-NOV", year=2007) - ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) - ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) - ival_A_to_M_start = Period(freq='M', year=2007, month=1) - ival_A_to_M_end = Period(freq='M', year=2007, month=12) - ival_A_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_A_to_W_end = Period(freq='W', year=2007, month=12, day=31) - ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) - ival_A_to_D_start = 
Period(freq='D', year=2007, month=1, day=1) - ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) - ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - - ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) - ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) - ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - - assert ival_A.asfreq('Q', 'S') == ival_A_to_Q_start - assert ival_A.asfreq('Q', 'e') == ival_A_to_Q_end - assert ival_A.asfreq('M', 's') == ival_A_to_M_start - assert ival_A.asfreq('M', 'E') == ival_A_to_M_end - assert ival_A.asfreq('W', 'S') == ival_A_to_W_start - assert ival_A.asfreq('W', 'E') == ival_A_to_W_end - assert ival_A.asfreq('B', 'S') == ival_A_to_B_start - assert ival_A.asfreq('B', 'E') == ival_A_to_B_end - assert ival_A.asfreq('D', 'S') == ival_A_to_D_start - assert ival_A.asfreq('D', 'E') == ival_A_to_D_end - assert ival_A.asfreq('H', 'S') == ival_A_to_H_start - assert ival_A.asfreq('H', 'E') == ival_A_to_H_end - assert ival_A.asfreq('min', 'S') == ival_A_to_T_start - assert ival_A.asfreq('min', 'E') == ival_A_to_T_end - assert ival_A.asfreq('T', 'S') == ival_A_to_T_start - assert ival_A.asfreq('T', 'E') == ival_A_to_T_end - assert ival_A.asfreq('S', 'S') == ival_A_to_S_start - assert ival_A.asfreq('S', 'E') == ival_A_to_S_end - - assert ival_AJAN.asfreq('D', 'S') == ival_AJAN_to_D_start - assert ival_AJAN.asfreq('D', 'E') == ival_AJAN_to_D_end - - assert ival_AJUN.asfreq('D', 'S') == ival_AJUN_to_D_start - assert ival_AJUN.asfreq('D', 'E') == ival_AJUN_to_D_end - - assert ival_ANOV.asfreq('D', 'S') == ival_ANOV_to_D_start - assert ival_ANOV.asfreq('D', 'E') == ival_ANOV_to_D_end - - assert ival_A.asfreq('A') == ival_A + ival_A_to_Q_start = Period(freq="Q", year=2007, quarter=1) + ival_A_to_Q_end = Period(freq="Q", year=2007, quarter=4) + ival_A_to_M_start = Period(freq="M", year=2007, month=1) + ival_A_to_M_end = Period(freq="M", year=2007, month=12) + ival_A_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq="W", year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq="B", year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq="D", year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_A_to_H_end = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_A_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_A_to_T_end = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_A_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_A_to_S_end = Period( + freq="S", year=2007, month=12, day=31, hour=23, 
minute=59, second=59 + ) + + ival_AJAN_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq="D", year=2007, month=6, day=30) + ival_AJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq="D", year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq="D", year=2006, month=12, day=1) + + assert ival_A.asfreq("Q", "S") == ival_A_to_Q_start + assert ival_A.asfreq("Q", "e") == ival_A_to_Q_end + assert ival_A.asfreq("M", "s") == ival_A_to_M_start + assert ival_A.asfreq("M", "E") == ival_A_to_M_end + assert ival_A.asfreq("W", "S") == ival_A_to_W_start + assert ival_A.asfreq("W", "E") == ival_A_to_W_end + assert ival_A.asfreq("B", "S") == ival_A_to_B_start + assert ival_A.asfreq("B", "E") == ival_A_to_B_end + assert ival_A.asfreq("D", "S") == ival_A_to_D_start + assert ival_A.asfreq("D", "E") == ival_A_to_D_end + assert ival_A.asfreq("H", "S") == ival_A_to_H_start + assert ival_A.asfreq("H", "E") == ival_A_to_H_end + assert ival_A.asfreq("min", "S") == ival_A_to_T_start + assert ival_A.asfreq("min", "E") == ival_A_to_T_end + assert ival_A.asfreq("T", "S") == ival_A_to_T_start + assert ival_A.asfreq("T", "E") == ival_A_to_T_end + assert ival_A.asfreq("S", "S") == ival_A_to_S_start + assert ival_A.asfreq("S", "E") == ival_A_to_S_end + + assert ival_AJAN.asfreq("D", "S") == ival_AJAN_to_D_start + assert ival_AJAN.asfreq("D", "E") == ival_AJAN_to_D_end + + assert ival_AJUN.asfreq("D", "S") == ival_AJUN_to_D_start + assert ival_AJUN.asfreq("D", "E") == ival_AJUN_to_D_end + + assert ival_ANOV.asfreq("D", "S") == ival_ANOV_to_D_start + assert ival_ANOV.asfreq("D", "E") == ival_ANOV_to_D_end + + assert ival_A.asfreq("A") == ival_A def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency - ival_Q = Period(freq='Q', year=2007, quarter=1) - ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + ival_Q = Period(freq="Q", year=2007, quarter=1) + ival_Q_end_of_year = Period(freq="Q", year=2007, quarter=4) ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) - ival_Q_to_A = Period(freq='A', year=2007) - ival_Q_to_M_start = Period(freq='M', year=2007, month=1) - ival_Q_to_M_end = Period(freq='M', year=2007, month=3) - ival_Q_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_Q_to_W_end = Period(freq='W', year=2007, month=3, day=31) - ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) - ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) - ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, hour=23) - ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, hour=23, - minute=59, second=59) - - ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) - ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) - - ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) - ival_QEJUN_to_D_end = Period(freq='D', 
year=2006, month=9, day=30) - - assert ival_Q.asfreq('A') == ival_Q_to_A - assert ival_Q_end_of_year.asfreq('A') == ival_Q_to_A - - assert ival_Q.asfreq('M', 'S') == ival_Q_to_M_start - assert ival_Q.asfreq('M', 'E') == ival_Q_to_M_end - assert ival_Q.asfreq('W', 'S') == ival_Q_to_W_start - assert ival_Q.asfreq('W', 'E') == ival_Q_to_W_end - assert ival_Q.asfreq('B', 'S') == ival_Q_to_B_start - assert ival_Q.asfreq('B', 'E') == ival_Q_to_B_end - assert ival_Q.asfreq('D', 'S') == ival_Q_to_D_start - assert ival_Q.asfreq('D', 'E') == ival_Q_to_D_end - assert ival_Q.asfreq('H', 'S') == ival_Q_to_H_start - assert ival_Q.asfreq('H', 'E') == ival_Q_to_H_end - assert ival_Q.asfreq('Min', 'S') == ival_Q_to_T_start - assert ival_Q.asfreq('Min', 'E') == ival_Q_to_T_end - assert ival_Q.asfreq('S', 'S') == ival_Q_to_S_start - assert ival_Q.asfreq('S', 'E') == ival_Q_to_S_end - - assert ival_QEJAN.asfreq('D', 'S') == ival_QEJAN_to_D_start - assert ival_QEJAN.asfreq('D', 'E') == ival_QEJAN_to_D_end - assert ival_QEJUN.asfreq('D', 'S') == ival_QEJUN_to_D_start - assert ival_QEJUN.asfreq('D', 'E') == ival_QEJUN_to_D_end - - assert ival_Q.asfreq('Q') == ival_Q + ival_Q_to_A = Period(freq="A", year=2007) + ival_Q_to_M_start = Period(freq="M", year=2007, month=1) + ival_Q_to_M_end = Period(freq="M", year=2007, month=3) + ival_Q_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq="W", year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq="B", year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq="D", year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_Q_to_H_end = Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_Q_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_Q_to_T_end = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_Q_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_Q_to_S_end = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + + ival_QEJAN_to_D_start = Period(freq="D", year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq="D", year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq="D", year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq="D", year=2006, month=9, day=30) + + assert ival_Q.asfreq("A") == ival_Q_to_A + assert ival_Q_end_of_year.asfreq("A") == ival_Q_to_A + + assert ival_Q.asfreq("M", "S") == ival_Q_to_M_start + assert ival_Q.asfreq("M", "E") == ival_Q_to_M_end + assert ival_Q.asfreq("W", "S") == ival_Q_to_W_start + assert ival_Q.asfreq("W", "E") == ival_Q_to_W_end + assert ival_Q.asfreq("B", "S") == ival_Q_to_B_start + assert ival_Q.asfreq("B", "E") == ival_Q_to_B_end + assert ival_Q.asfreq("D", "S") == ival_Q_to_D_start + assert ival_Q.asfreq("D", "E") == ival_Q_to_D_end + assert ival_Q.asfreq("H", "S") == ival_Q_to_H_start + assert ival_Q.asfreq("H", "E") == ival_Q_to_H_end + assert ival_Q.asfreq("Min", "S") == ival_Q_to_T_start + assert ival_Q.asfreq("Min", "E") == ival_Q_to_T_end + assert ival_Q.asfreq("S", "S") == ival_Q_to_S_start + assert ival_Q.asfreq("S", "E") == ival_Q_to_S_end + + assert ival_QEJAN.asfreq("D", "S") == ival_QEJAN_to_D_start + assert ival_QEJAN.asfreq("D", "E") == ival_QEJAN_to_D_end + assert ival_QEJUN.asfreq("D", "S") == 
ival_QEJUN_to_D_start + assert ival_QEJUN.asfreq("D", "E") == ival_QEJUN_to_D_end + + assert ival_Q.asfreq("Q") == ival_Q def test_conv_monthly(self): # frequency conversion tests: from Monthly Frequency - ival_M = Period(freq='M', year=2007, month=1) - ival_M_end_of_year = Period(freq='M', year=2007, month=12) - ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) - ival_M_to_A = Period(freq='A', year=2007) - ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_M_to_W_start = Period(freq='W', year=2007, month=1, day=1) - ival_M_to_W_end = Period(freq='W', year=2007, month=1, day=31) - ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) - ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) - ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, hour=23) - ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, - minute=59, second=59) - - assert ival_M.asfreq('A') == ival_M_to_A - assert ival_M_end_of_year.asfreq('A') == ival_M_to_A - assert ival_M.asfreq('Q') == ival_M_to_Q - assert ival_M_end_of_quarter.asfreq('Q') == ival_M_to_Q - - assert ival_M.asfreq('W', 'S') == ival_M_to_W_start - assert ival_M.asfreq('W', 'E') == ival_M_to_W_end - assert ival_M.asfreq('B', 'S') == ival_M_to_B_start - assert ival_M.asfreq('B', 'E') == ival_M_to_B_end - assert ival_M.asfreq('D', 'S') == ival_M_to_D_start - assert ival_M.asfreq('D', 'E') == ival_M_to_D_end - assert ival_M.asfreq('H', 'S') == ival_M_to_H_start - assert ival_M.asfreq('H', 'E') == ival_M_to_H_end - assert ival_M.asfreq('Min', 'S') == ival_M_to_T_start - assert ival_M.asfreq('Min', 'E') == ival_M_to_T_end - assert ival_M.asfreq('S', 'S') == ival_M_to_S_start - assert ival_M.asfreq('S', 'E') == ival_M_to_S_end - - assert ival_M.asfreq('M') == ival_M + ival_M = Period(freq="M", year=2007, month=1) + ival_M_end_of_year = Period(freq="M", year=2007, month=12) + ival_M_end_of_quarter = Period(freq="M", year=2007, month=3) + ival_M_to_A = Period(freq="A", year=2007) + ival_M_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_M_to_W_start = Period(freq="W", year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq="W", year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq="B", year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq="D", year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_M_to_H_end = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_M_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_M_to_T_end = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_M_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_M_to_S_end = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + + assert ival_M.asfreq("A") == ival_M_to_A + assert ival_M_end_of_year.asfreq("A") 
== ival_M_to_A + assert ival_M.asfreq("Q") == ival_M_to_Q + assert ival_M_end_of_quarter.asfreq("Q") == ival_M_to_Q + + assert ival_M.asfreq("W", "S") == ival_M_to_W_start + assert ival_M.asfreq("W", "E") == ival_M_to_W_end + assert ival_M.asfreq("B", "S") == ival_M_to_B_start + assert ival_M.asfreq("B", "E") == ival_M_to_B_end + assert ival_M.asfreq("D", "S") == ival_M_to_D_start + assert ival_M.asfreq("D", "E") == ival_M_to_D_end + assert ival_M.asfreq("H", "S") == ival_M_to_H_start + assert ival_M.asfreq("H", "E") == ival_M_to_H_end + assert ival_M.asfreq("Min", "S") == ival_M_to_T_start + assert ival_M.asfreq("Min", "E") == ival_M_to_T_end + assert ival_M.asfreq("S", "S") == ival_M_to_S_start + assert ival_M.asfreq("S", "E") == ival_M_to_S_end + + assert ival_M.asfreq("M") == ival_M def test_conv_weekly(self): # frequency conversion tests: from Weekly Frequency - ival_W = Period(freq='W', year=2007, month=1, day=1) - - ival_WSUN = Period(freq='W', year=2007, month=1, day=7) - ival_WSAT = Period(freq='W-SAT', year=2007, month=1, day=6) - ival_WFRI = Period(freq='W-FRI', year=2007, month=1, day=5) - ival_WTHU = Period(freq='W-THU', year=2007, month=1, day=4) - ival_WWED = Period(freq='W-WED', year=2007, month=1, day=3) - ival_WTUE = Period(freq='W-TUE', year=2007, month=1, day=2) - ival_WMON = Period(freq='W-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - ival_W_end_of_year = Period(freq='W', year=2007, month=12, day=31) - ival_W_end_of_quarter = Period(freq='W', year=2007, month=3, day=31) - ival_W_end_of_month = Period(freq='W', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) + ival_W = Period(freq="W", year=2007, month=1, day=1) + + ival_WSUN = Period(freq="W", year=2007, month=1, day=7) + ival_WSAT = Period(freq="W-SAT", year=2007, month=1, day=6) + ival_WFRI = Period(freq="W-FRI", year=2007, month=1, day=5) + ival_WTHU = Period(freq="W-THU", year=2007, month=1, day=4) + ival_WWED = Period(freq="W-WED", year=2007, month=1, day=3) + ival_WTUE = Period(freq="W-TUE", year=2007, month=1, day=2) + ival_WMON = Period(freq="W-MON", year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq="D", year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq="D", 
year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq="D", year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq="D", year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq="D", year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq="D", year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq="D", year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq="D", year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq="D", year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq="D", year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq="D", year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq="D", year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq="W", year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq="W", year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq="W", year=2007, month=1, day=31) + ival_W_to_A = Period(freq="A", year=2007) + ival_W_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_W_to_M = Period(freq="M", year=2007, month=1) + + if Period(freq="D", year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq="A", year=2007) else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) + ival_W_to_A_end_of_year = Period(freq="A", year=2008) - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) + if Period(freq="D", year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=1) else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) + ival_W_to_Q_end_of_quarter = Period(freq="Q", year=2007, quarter=2) - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + if Period(freq="D", year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=1) else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - assert ival_W.asfreq('A') == ival_W_to_A - assert ival_W_end_of_year.asfreq('A') == ival_W_to_A_end_of_year - - assert ival_W.asfreq('Q') == ival_W_to_Q - assert ival_W_end_of_quarter.asfreq('Q') == ival_W_to_Q_end_of_quarter - - assert ival_W.asfreq('M') == ival_W_to_M - assert ival_W_end_of_month.asfreq('M') == ival_W_to_M_end_of_month - - assert ival_W.asfreq('B', 'S') == ival_W_to_B_start - assert ival_W.asfreq('B', 'E') == ival_W_to_B_end - - assert ival_W.asfreq('D', 'S') == ival_W_to_D_start - assert ival_W.asfreq('D', 'E') == ival_W_to_D_end - - assert ival_WSUN.asfreq('D', 'S') == ival_WSUN_to_D_start - assert ival_WSUN.asfreq('D', 'E') == ival_WSUN_to_D_end - assert 
ival_WSAT.asfreq('D', 'S') == ival_WSAT_to_D_start - assert ival_WSAT.asfreq('D', 'E') == ival_WSAT_to_D_end - assert ival_WFRI.asfreq('D', 'S') == ival_WFRI_to_D_start - assert ival_WFRI.asfreq('D', 'E') == ival_WFRI_to_D_end - assert ival_WTHU.asfreq('D', 'S') == ival_WTHU_to_D_start - assert ival_WTHU.asfreq('D', 'E') == ival_WTHU_to_D_end - assert ival_WWED.asfreq('D', 'S') == ival_WWED_to_D_start - assert ival_WWED.asfreq('D', 'E') == ival_WWED_to_D_end - assert ival_WTUE.asfreq('D', 'S') == ival_WTUE_to_D_start - assert ival_WTUE.asfreq('D', 'E') == ival_WTUE_to_D_end - assert ival_WMON.asfreq('D', 'S') == ival_WMON_to_D_start - assert ival_WMON.asfreq('D', 'E') == ival_WMON_to_D_end - - assert ival_W.asfreq('H', 'S') == ival_W_to_H_start - assert ival_W.asfreq('H', 'E') == ival_W_to_H_end - assert ival_W.asfreq('Min', 'S') == ival_W_to_T_start - assert ival_W.asfreq('Min', 'E') == ival_W_to_T_end - assert ival_W.asfreq('S', 'S') == ival_W_to_S_start - assert ival_W.asfreq('S', 'E') == ival_W_to_S_end - - assert ival_W.asfreq('W') == ival_W + ival_W_to_M_end_of_month = Period(freq="M", year=2007, month=2) + + ival_W_to_B_start = Period(freq="B", year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq="B", year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq="D", year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq="D", year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_W_to_H_end = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_W_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_W_to_T_end = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_W_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_W_to_S_end = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + + assert ival_W.asfreq("A") == ival_W_to_A + assert ival_W_end_of_year.asfreq("A") == ival_W_to_A_end_of_year + + assert ival_W.asfreq("Q") == ival_W_to_Q + assert ival_W_end_of_quarter.asfreq("Q") == ival_W_to_Q_end_of_quarter + + assert ival_W.asfreq("M") == ival_W_to_M + assert ival_W_end_of_month.asfreq("M") == ival_W_to_M_end_of_month + + assert ival_W.asfreq("B", "S") == ival_W_to_B_start + assert ival_W.asfreq("B", "E") == ival_W_to_B_end + + assert ival_W.asfreq("D", "S") == ival_W_to_D_start + assert ival_W.asfreq("D", "E") == ival_W_to_D_end + + assert ival_WSUN.asfreq("D", "S") == ival_WSUN_to_D_start + assert ival_WSUN.asfreq("D", "E") == ival_WSUN_to_D_end + assert ival_WSAT.asfreq("D", "S") == ival_WSAT_to_D_start + assert ival_WSAT.asfreq("D", "E") == ival_WSAT_to_D_end + assert ival_WFRI.asfreq("D", "S") == ival_WFRI_to_D_start + assert ival_WFRI.asfreq("D", "E") == ival_WFRI_to_D_end + assert ival_WTHU.asfreq("D", "S") == ival_WTHU_to_D_start + assert ival_WTHU.asfreq("D", "E") == ival_WTHU_to_D_end + assert ival_WWED.asfreq("D", "S") == ival_WWED_to_D_start + assert ival_WWED.asfreq("D", "E") == ival_WWED_to_D_end + assert ival_WTUE.asfreq("D", "S") == ival_WTUE_to_D_start + assert ival_WTUE.asfreq("D", "E") == ival_WTUE_to_D_end + assert ival_WMON.asfreq("D", "S") == ival_WMON_to_D_start + assert ival_WMON.asfreq("D", "E") == ival_WMON_to_D_end + + assert ival_W.asfreq("H", "S") == ival_W_to_H_start + assert ival_W.asfreq("H", "E") == ival_W_to_H_end + assert ival_W.asfreq("Min", "S") == ival_W_to_T_start + assert ival_W.asfreq("Min", "E") == ival_W_to_T_end + assert 
ival_W.asfreq("S", "S") == ival_W_to_S_start + assert ival_W.asfreq("S", "E") == ival_W_to_S_end + + assert ival_W.asfreq("W") == ival_W msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - ival_W.asfreq('WK') + ival_W.asfreq("WK") def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - Period(freq='WK', year=2007, month=1, day=1) + Period(freq="WK", year=2007, month=1, day=1) with pytest.raises(ValueError, match=msg): - Period(freq='WK-SAT', year=2007, month=1, day=6) + Period(freq="WK-SAT", year=2007, month=1, day=6) with pytest.raises(ValueError, match=msg): - Period(freq='WK-FRI', year=2007, month=1, day=5) + Period(freq="WK-FRI", year=2007, month=1, day=5) with pytest.raises(ValueError, match=msg): - Period(freq='WK-THU', year=2007, month=1, day=4) + Period(freq="WK-THU", year=2007, month=1, day=4) with pytest.raises(ValueError, match=msg): - Period(freq='WK-WED', year=2007, month=1, day=3) + Period(freq="WK-WED", year=2007, month=1, day=3) with pytest.raises(ValueError, match=msg): - Period(freq='WK-TUE', year=2007, month=1, day=2) + Period(freq="WK-TUE", year=2007, month=1, day=2) with pytest.raises(ValueError, match=msg): - Period(freq='WK-MON', year=2007, month=1, day=1) + Period(freq="WK-MON", year=2007, month=1, day=1) def test_conv_business(self): # frequency conversion tests: from Business Frequency" - ival_B = Period(freq='B', year=2007, month=1, day=1) - ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) - ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) - ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) - ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) - - ival_B_to_A = Period(freq='A', year=2007) - ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_B_to_M = Period(freq='M', year=2007, month=1) - ival_B_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - assert ival_B.asfreq('A') == ival_B_to_A - assert ival_B_end_of_year.asfreq('A') == ival_B_to_A - assert ival_B.asfreq('Q') == ival_B_to_Q - assert ival_B_end_of_quarter.asfreq('Q') == ival_B_to_Q - assert ival_B.asfreq('M') == ival_B_to_M - assert ival_B_end_of_month.asfreq('M') == ival_B_to_M - assert ival_B.asfreq('W') == ival_B_to_W - assert ival_B_end_of_week.asfreq('W') == ival_B_to_W - - assert ival_B.asfreq('D') == ival_B_to_D - - assert ival_B.asfreq('H', 'S') == ival_B_to_H_start - assert ival_B.asfreq('H', 'E') == ival_B_to_H_end - assert ival_B.asfreq('Min', 'S') == ival_B_to_T_start - assert ival_B.asfreq('Min', 'E') == ival_B_to_T_end - assert ival_B.asfreq('S', 'S') == ival_B_to_S_start - assert ival_B.asfreq('S', 'E') == ival_B_to_S_end - - assert ival_B.asfreq('B') == ival_B + ival_B = Period(freq="B", year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq="B", year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq="B", 
year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq="B", year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq="B", year=2007, month=1, day=5) + + ival_B_to_A = Period(freq="A", year=2007) + ival_B_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_B_to_M = Period(freq="M", year=2007, month=1) + ival_B_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_B_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_B_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_B_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_B_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_B_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_B_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_B.asfreq("A") == ival_B_to_A + assert ival_B_end_of_year.asfreq("A") == ival_B_to_A + assert ival_B.asfreq("Q") == ival_B_to_Q + assert ival_B_end_of_quarter.asfreq("Q") == ival_B_to_Q + assert ival_B.asfreq("M") == ival_B_to_M + assert ival_B_end_of_month.asfreq("M") == ival_B_to_M + assert ival_B.asfreq("W") == ival_B_to_W + assert ival_B_end_of_week.asfreq("W") == ival_B_to_W + + assert ival_B.asfreq("D") == ival_B_to_D + + assert ival_B.asfreq("H", "S") == ival_B_to_H_start + assert ival_B.asfreq("H", "E") == ival_B_to_H_end + assert ival_B.asfreq("Min", "S") == ival_B_to_T_start + assert ival_B.asfreq("Min", "E") == ival_B_to_T_end + assert ival_B.asfreq("S", "S") == ival_B_to_S_start + assert ival_B.asfreq("S", "E") == ival_B_to_S_end + + assert ival_B.asfreq("B") == ival_B def test_conv_daily(self): # frequency conversion tests: from Business Frequency" - ival_D = Period(freq='D', year=2007, month=1, day=1) - ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) - ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) - ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) - ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + ival_D = Period(freq="D", year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq="D", year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq="D", year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq="D", year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq="D", year=2007, month=1, day=7) - ival_D_friday = Period(freq='D', year=2007, month=1, day=5) - ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) - ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + ival_D_friday = Period(freq="D", year=2007, month=1, day=5) + ival_D_saturday = Period(freq="D", year=2007, month=1, day=6) + ival_D_sunday = Period(freq="D", year=2007, month=1, day=7) # TODO: unused? 
# ival_D_monday = Period(freq='D', year=2007, month=1, day=8) - ival_B_friday = Period(freq='B', year=2007, month=1, day=5) - ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + ival_B_friday = Period(freq="B", year=2007, month=1, day=5) + ival_B_monday = Period(freq="B", year=2007, month=1, day=8) - ival_D_to_A = Period(freq='A', year=2007) + ival_D_to_A = Period(freq="A", year=2007) - ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) - ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) - ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + ival_Deoq_to_AJAN = Period(freq="A-JAN", year=2008) + ival_Deoq_to_AJUN = Period(freq="A-JUN", year=2007) + ival_Deoq_to_ADEC = Period(freq="A-DEC", year=2007) ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) - ival_D_to_M = Period(freq='M', year=2007, month=1) - ival_D_to_W = Period(freq='W', year=2007, month=1, day=7) - - ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, hour=23) - ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, - minute=59, second=59) - - assert ival_D.asfreq('A') == ival_D_to_A - - assert ival_D_end_of_quarter.asfreq('A-JAN') == ival_Deoq_to_AJAN - assert ival_D_end_of_quarter.asfreq('A-JUN') == ival_Deoq_to_AJUN - assert ival_D_end_of_quarter.asfreq('A-DEC') == ival_Deoq_to_ADEC - - assert ival_D_end_of_year.asfreq('A') == ival_D_to_A - assert ival_D_end_of_quarter.asfreq('Q') == ival_D_to_QEDEC + ival_D_to_M = Period(freq="M", year=2007, month=1) + ival_D_to_W = Period(freq="W", year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_D_to_H_end = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_D_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_D_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_D_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_D_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + + assert ival_D.asfreq("A") == ival_D_to_A + + assert ival_D_end_of_quarter.asfreq("A-JAN") == ival_Deoq_to_AJAN + assert ival_D_end_of_quarter.asfreq("A-JUN") == ival_Deoq_to_AJUN + assert ival_D_end_of_quarter.asfreq("A-DEC") == ival_Deoq_to_ADEC + + assert ival_D_end_of_year.asfreq("A") == ival_D_to_A + assert ival_D_end_of_quarter.asfreq("Q") == ival_D_to_QEDEC assert ival_D.asfreq("Q-JAN") == ival_D_to_QEJAN assert ival_D.asfreq("Q-JUN") == ival_D_to_QEJUN assert ival_D.asfreq("Q-DEC") == ival_D_to_QEDEC - assert ival_D.asfreq('M') == ival_D_to_M - assert ival_D_end_of_month.asfreq('M') == ival_D_to_M - assert ival_D.asfreq('W') == ival_D_to_W - assert ival_D_end_of_week.asfreq('W') == ival_D_to_W - - assert ival_D_friday.asfreq('B') == ival_B_friday - assert ival_D_saturday.asfreq('B', 'S') == ival_B_friday - assert ival_D_saturday.asfreq('B', 'E') == ival_B_monday - assert ival_D_sunday.asfreq('B', 'S') == ival_B_friday - assert ival_D_sunday.asfreq('B', 'E') == ival_B_monday - 
- assert ival_D.asfreq('H', 'S') == ival_D_to_H_start - assert ival_D.asfreq('H', 'E') == ival_D_to_H_end - assert ival_D.asfreq('Min', 'S') == ival_D_to_T_start - assert ival_D.asfreq('Min', 'E') == ival_D_to_T_end - assert ival_D.asfreq('S', 'S') == ival_D_to_S_start - assert ival_D.asfreq('S', 'E') == ival_D_to_S_end - - assert ival_D.asfreq('D') == ival_D + assert ival_D.asfreq("M") == ival_D_to_M + assert ival_D_end_of_month.asfreq("M") == ival_D_to_M + assert ival_D.asfreq("W") == ival_D_to_W + assert ival_D_end_of_week.asfreq("W") == ival_D_to_W + + assert ival_D_friday.asfreq("B") == ival_B_friday + assert ival_D_saturday.asfreq("B", "S") == ival_B_friday + assert ival_D_saturday.asfreq("B", "E") == ival_B_monday + assert ival_D_sunday.asfreq("B", "S") == ival_B_friday + assert ival_D_sunday.asfreq("B", "E") == ival_B_monday + + assert ival_D.asfreq("H", "S") == ival_D_to_H_start + assert ival_D.asfreq("H", "E") == ival_D_to_H_end + assert ival_D.asfreq("Min", "S") == ival_D_to_T_start + assert ival_D.asfreq("Min", "E") == ival_D_to_T_end + assert ival_D.asfreq("S", "S") == ival_D_to_S_start + assert ival_D.asfreq("S", "E") == ival_D_to_S_end + + assert ival_D.asfreq("D") == ival_D def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" - ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, - hour=23) - ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, - hour=23) - ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, - hour=23) - ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, - hour=23) - ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, - hour=23) - ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, - hour=23) - - ival_H_to_A = Period(freq='A', year=2007) - ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_H_to_M = Period(freq='M', year=2007, month=1) - ival_H_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) - - ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=59) - ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=59, second=59) - - assert ival_H.asfreq('A') == ival_H_to_A - assert ival_H_end_of_year.asfreq('A') == ival_H_to_A - assert ival_H.asfreq('Q') == ival_H_to_Q - assert ival_H_end_of_quarter.asfreq('Q') == ival_H_to_Q - assert ival_H.asfreq('M') == ival_H_to_M - assert ival_H_end_of_month.asfreq('M') == ival_H_to_M - assert ival_H.asfreq('W') == ival_H_to_W - assert ival_H_end_of_week.asfreq('W') == ival_H_to_W - assert ival_H.asfreq('D') == ival_H_to_D - assert ival_H_end_of_day.asfreq('D') == ival_H_to_D - assert ival_H.asfreq('B') == ival_H_to_B - assert ival_H_end_of_bus.asfreq('B') == ival_H_to_B - - assert ival_H.asfreq('Min', 'S') == ival_H_to_T_start - assert ival_H.asfreq('Min', 'E') == ival_H_to_T_end - assert ival_H.asfreq('S', 'S') == ival_H_to_S_start - assert ival_H.asfreq('S', 'E') == ival_H_to_S_end - - assert ival_H.asfreq('H') == ival_H + ival_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_H_end_of_year = Period(freq="H", year=2007, month=12, day=31, hour=23) + ival_H_end_of_quarter = 
Period(freq="H", year=2007, month=3, day=31, hour=23) + ival_H_end_of_month = Period(freq="H", year=2007, month=1, day=31, hour=23) + ival_H_end_of_week = Period(freq="H", year=2007, month=1, day=7, hour=23) + ival_H_end_of_day = Period(freq="H", year=2007, month=1, day=1, hour=23) + ival_H_end_of_bus = Period(freq="H", year=2007, month=1, day=1, hour=23) + + ival_H_to_A = Period(freq="A", year=2007) + ival_H_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_H_to_M = Period(freq="M", year=2007, month=1) + ival_H_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_H_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_H_to_B = Period(freq="B", year=2007, month=1, day=1) + + ival_H_to_T_start = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0 + ) + ival_H_to_T_end = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + ival_H_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_H_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + + assert ival_H.asfreq("A") == ival_H_to_A + assert ival_H_end_of_year.asfreq("A") == ival_H_to_A + assert ival_H.asfreq("Q") == ival_H_to_Q + assert ival_H_end_of_quarter.asfreq("Q") == ival_H_to_Q + assert ival_H.asfreq("M") == ival_H_to_M + assert ival_H_end_of_month.asfreq("M") == ival_H_to_M + assert ival_H.asfreq("W") == ival_H_to_W + assert ival_H_end_of_week.asfreq("W") == ival_H_to_W + assert ival_H.asfreq("D") == ival_H_to_D + assert ival_H_end_of_day.asfreq("D") == ival_H_to_D + assert ival_H.asfreq("B") == ival_H_to_B + assert ival_H_end_of_bus.asfreq("B") == ival_H_to_B + + assert ival_H.asfreq("Min", "S") == ival_H_to_T_start + assert ival_H.asfreq("Min", "E") == ival_H_to_T_end + assert ival_H.asfreq("S", "S") == ival_H_to_S_start + assert ival_H.asfreq("S", "E") == ival_H_to_S_end + + assert ival_H.asfreq("H") == ival_H def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" - ival_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, - hour=23, minute=59) - ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, - hour=23, minute=59) - ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, - hour=23, minute=59) - ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, - hour=23, minute=59) - ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=59) - - ival_T_to_A = Period(freq='A', year=2007) - ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_T_to_M = Period(freq='M', year=2007, month=1) - ival_T_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - - ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=59) - - assert ival_T.asfreq('A') == ival_T_to_A - assert ival_T_end_of_year.asfreq('A') == ival_T_to_A - assert ival_T.asfreq('Q') == ival_T_to_Q - assert ival_T_end_of_quarter.asfreq('Q') == ival_T_to_Q - assert ival_T.asfreq('M') 
== ival_T_to_M - assert ival_T_end_of_month.asfreq('M') == ival_T_to_M - assert ival_T.asfreq('W') == ival_T_to_W - assert ival_T_end_of_week.asfreq('W') == ival_T_to_W - assert ival_T.asfreq('D') == ival_T_to_D - assert ival_T_end_of_day.asfreq('D') == ival_T_to_D - assert ival_T.asfreq('B') == ival_T_to_B - assert ival_T_end_of_bus.asfreq('B') == ival_T_to_B - assert ival_T.asfreq('H') == ival_T_to_H - assert ival_T_end_of_hour.asfreq('H') == ival_T_to_H - - assert ival_T.asfreq('S', 'S') == ival_T_to_S_start - assert ival_T.asfreq('S', 'E') == ival_T_to_S_end - - assert ival_T.asfreq('Min') == ival_T + ival_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + ival_T_end_of_year = Period( + freq="Min", year=2007, month=12, day=31, hour=23, minute=59 + ) + ival_T_end_of_quarter = Period( + freq="Min", year=2007, month=3, day=31, hour=23, minute=59 + ) + ival_T_end_of_month = Period( + freq="Min", year=2007, month=1, day=31, hour=23, minute=59 + ) + ival_T_end_of_week = Period( + freq="Min", year=2007, month=1, day=7, hour=23, minute=59 + ) + ival_T_end_of_day = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_bus = Period( + freq="Min", year=2007, month=1, day=1, hour=23, minute=59 + ) + ival_T_end_of_hour = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=59 + ) + + ival_T_to_A = Period(freq="A", year=2007) + ival_T_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_T_to_M = Period(freq="M", year=2007, month=1) + ival_T_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_T_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_T_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_T_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) + ival_T_to_S_end = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + assert ival_T.asfreq("A") == ival_T_to_A + assert ival_T_end_of_year.asfreq("A") == ival_T_to_A + assert ival_T.asfreq("Q") == ival_T_to_Q + assert ival_T_end_of_quarter.asfreq("Q") == ival_T_to_Q + assert ival_T.asfreq("M") == ival_T_to_M + assert ival_T_end_of_month.asfreq("M") == ival_T_to_M + assert ival_T.asfreq("W") == ival_T_to_W + assert ival_T_end_of_week.asfreq("W") == ival_T_to_W + assert ival_T.asfreq("D") == ival_T_to_D + assert ival_T_end_of_day.asfreq("D") == ival_T_to_D + assert ival_T.asfreq("B") == ival_T_to_B + assert ival_T_end_of_bus.asfreq("B") == ival_T_to_B + assert ival_T.asfreq("H") == ival_T_to_H + assert ival_T_end_of_hour.asfreq("H") == ival_T_to_H + + assert ival_T.asfreq("S", "S") == ival_T_to_S_start + assert ival_T.asfreq("S", "E") == ival_T_to_S_end + + assert ival_T.asfreq("Min") == ival_T def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" - ival_S = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, - second=0) - ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, - hour=23, minute=59, second=59) - ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, - hour=23, minute=59, second=59) - ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, - hour=23, minute=59, second=59) - ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, - hour=23, 
minute=59, second=59) - ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=59, second=59) - ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, - hour=0, minute=0, second=59) - - ival_S_to_A = Period(freq='A', year=2007) - ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_S_to_M = Period(freq='M', year=2007, month=1) - ival_S_to_W = Period(freq='W', year=2007, month=1, day=7) - ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) - ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) - ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) - - assert ival_S.asfreq('A') == ival_S_to_A - assert ival_S_end_of_year.asfreq('A') == ival_S_to_A - assert ival_S.asfreq('Q') == ival_S_to_Q - assert ival_S_end_of_quarter.asfreq('Q') == ival_S_to_Q - assert ival_S.asfreq('M') == ival_S_to_M - assert ival_S_end_of_month.asfreq('M') == ival_S_to_M - assert ival_S.asfreq('W') == ival_S_to_W - assert ival_S_end_of_week.asfreq('W') == ival_S_to_W - assert ival_S.asfreq('D') == ival_S_to_D - assert ival_S_end_of_day.asfreq('D') == ival_S_to_D - assert ival_S.asfreq('B') == ival_S_to_B - assert ival_S_end_of_bus.asfreq('B') == ival_S_to_B - assert ival_S.asfreq('H') == ival_S_to_H - assert ival_S_end_of_hour.asfreq('H') == ival_S_to_H - assert ival_S.asfreq('Min') == ival_S_to_T - assert ival_S_end_of_minute.asfreq('Min') == ival_S_to_T - - assert ival_S.asfreq('S') == ival_S + ival_S = Period(freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=0) + ival_S_end_of_year = Period( + freq="S", year=2007, month=12, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_quarter = Period( + freq="S", year=2007, month=3, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_month = Period( + freq="S", year=2007, month=1, day=31, hour=23, minute=59, second=59 + ) + ival_S_end_of_week = Period( + freq="S", year=2007, month=1, day=7, hour=23, minute=59, second=59 + ) + ival_S_end_of_day = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_bus = Period( + freq="S", year=2007, month=1, day=1, hour=23, minute=59, second=59 + ) + ival_S_end_of_hour = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=59, second=59 + ) + ival_S_end_of_minute = Period( + freq="S", year=2007, month=1, day=1, hour=0, minute=0, second=59 + ) + + ival_S_to_A = Period(freq="A", year=2007) + ival_S_to_Q = Period(freq="Q", year=2007, quarter=1) + ival_S_to_M = Period(freq="M", year=2007, month=1) + ival_S_to_W = Period(freq="W", year=2007, month=1, day=7) + ival_S_to_D = Period(freq="D", year=2007, month=1, day=1) + ival_S_to_B = Period(freq="B", year=2007, month=1, day=1) + ival_S_to_H = Period(freq="H", year=2007, month=1, day=1, hour=0) + ival_S_to_T = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) + + assert ival_S.asfreq("A") == ival_S_to_A + assert ival_S_end_of_year.asfreq("A") == ival_S_to_A + assert ival_S.asfreq("Q") == ival_S_to_Q + assert ival_S_end_of_quarter.asfreq("Q") == ival_S_to_Q + assert ival_S.asfreq("M") == ival_S_to_M + assert ival_S_end_of_month.asfreq("M") == ival_S_to_M + assert ival_S.asfreq("W") == ival_S_to_W + assert ival_S_end_of_week.asfreq("W") == ival_S_to_W + assert ival_S.asfreq("D") == ival_S_to_D + assert ival_S_end_of_day.asfreq("D") == ival_S_to_D + assert ival_S.asfreq("B") == ival_S_to_B + assert ival_S_end_of_bus.asfreq("B") == ival_S_to_B + assert 
ival_S.asfreq("H") == ival_S_to_H + assert ival_S_end_of_hour.asfreq("H") == ival_S_to_H + assert ival_S.asfreq("Min") == ival_S_to_T + assert ival_S_end_of_minute.asfreq("Min") == ival_S_to_T + + assert ival_S.asfreq("S") == ival_S def test_asfreq_mult(self): # normal freq to mult freq - p = Period(freq='A', year=2007) + p = Period(freq="A", year=2007) # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: + for freq in ["3A", offsets.YearEnd(3)]: result = p.asfreq(freq) - expected = Period('2007', freq='3A') + expected = Period("2007", freq="3A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='3A') + for freq in ["3A", offsets.YearEnd(3)]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="3A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # mult freq to normal freq - p = Period(freq='3A', year=2007) + p = Period(freq="3A", year=2007) # ordinal will change because how=E is the default - for freq in ['A', offsets.YearEnd()]: + for freq in ["A", offsets.YearEnd()]: result = p.asfreq(freq) - expected = Period('2009', freq='A') + expected = Period("2009", freq="A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # ordinal will not change - for freq in ['A', offsets.YearEnd()]: - result = p.asfreq(freq, how='S') - expected = Period('2007', freq='A') + for freq in ["A", offsets.YearEnd()]: + result = p.asfreq(freq, how="S") + expected = Period("2007", freq="A") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq='A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: + p = Period(freq="A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) - expected = Period('2007-12', freq='2M') + expected = Period("2007-12", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - p = Period(freq='3A', year=2007) - for freq in ['2M', offsets.MonthEnd(2)]: + p = Period(freq="3A", year=2007) + for freq in ["2M", offsets.MonthEnd(2)]: result = p.asfreq(freq) - expected = Period('2009-12', freq='2M') + expected = Period("2009-12", freq="2M") assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq - for freq in ['2M', offsets.MonthEnd(2)]: - result = p.asfreq(freq, how='S') - expected = Period('2007-01', freq='2M') + for freq in ["2M", offsets.MonthEnd(2)]: + result = p.asfreq(freq, how="S") + expected = Period("2007-01", freq="2M") assert result == expected assert result.ordinal == expected.ordinal @@ -696,24 +732,24 @@ def test_asfreq_mult(self): def test_asfreq_combined(self): # normal freq to combined freq - p = Period('2007', freq='H') + p = Period("2007", freq="H") # ordinal will not change - expected = Period('2007', freq='25H') - for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + expected = Period("2007", freq="25H") + for freq, 
how in zip(["1D1H", "1H1D"], ["E", "S"]): result = p.asfreq(freq, how=how) assert result == expected assert result.ordinal == expected.ordinal assert result.freq == expected.freq # combined freq to normal freq - p1 = Period(freq='1D1H', year=2007) - p2 = Period(freq='1H1D', year=2007) + p1 = Period(freq="1D1H", year=2007) + p2 = Period(freq="1H1D", year=2007) # ordinal will change because how=E is the default - result1 = p1.asfreq('H') - result2 = p2.asfreq('H') - expected = Period('2007-01-02', freq='H') + result1 = p1.asfreq("H") + result2 = p2.asfreq("H") + expected = Period("2007-01-02", freq="H") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -722,9 +758,9 @@ def test_asfreq_combined(self): assert result2.freq == expected.freq # ordinal will not change - result1 = p1.asfreq('H', how='S') - result2 = p2.asfreq('H', how='S') - expected = Period('2007-01-01', freq='H') + result1 = p1.asfreq("H", how="S") + result2 = p2.asfreq("H", how="S") + expected = Period("2007-01-01", freq="H") assert result1 == expected assert result1.ordinal == expected.ordinal assert result1.freq == expected.freq @@ -735,13 +771,13 @@ def test_asfreq_combined(self): def test_asfreq_MS(self): initial = Period("2013") - assert initial.asfreq(freq="M", how="S") == Period('2013-01', 'M') + assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") with pytest.raises(ValueError, match=msg): - Period('2013-01', 'MS') + Period("2013-01", "MS") assert _period_code_map.get("MS") is None diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 2a765086af403..34d2fa6a9194c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -20,60 +20,58 @@ class TestPeriodConstruction: def test_construction(self): - i1 = Period('1/1/2005', freq='M') - i2 = Period('Jan 2005') + i1 = Period("1/1/2005", freq="M") + i2 = Period("Jan 2005") assert i1 == i2 - i1 = Period('2005', freq='A') - i2 = Period('2005') - i3 = Period('2005', freq='a') + i1 = Period("2005", freq="A") + i2 = Period("2005") + i3 = Period("2005", freq="a") assert i1 == i2 assert i1 == i3 - i4 = Period('2005', freq='M') - i5 = Period('2005', freq='m') + i4 = Period("2005", freq="M") + i5 = Period("2005", freq="m") msg = r"Input has different freq=M from Period\(freq=A-DEC\)" with pytest.raises(IncompatibleFrequency, match=msg): i1 != i4 assert i4 == i5 - i1 = Period.now('Q') - i2 = Period(datetime.now(), freq='Q') - i3 = Period.now('q') + i1 = Period.now("Q") + i2 = Period(datetime.now(), freq="Q") + i3 = Period.now("q") assert i1 == i2 assert i1 == i3 - i1 = Period('1982', freq='min') - i2 = Period('1982', freq='MIN') + i1 = Period("1982", freq="min") + i2 = Period("1982", freq="MIN") assert i1 == i2 - i2 = Period('1982', freq=('Min', 1)) + i2 = Period("1982", freq=("Min", 1)) assert i1 == i2 - i1 = Period(year=2005, month=3, day=1, freq='D') - i2 = Period('3/1/2005', freq='D') + i1 = Period(year=2005, month=3, day=1, freq="D") + i2 = Period("3/1/2005", freq="D") assert i1 == i2 - i3 = Period(year=2005, month=3, day=1, freq='d') + i3 = Period(year=2005, month=3, day=1, freq="d") assert i1 == i3 - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), 
freq="L") assert i1 == expected - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") assert i1 == expected - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") assert i1 == expected msg = "Must supply freq for ordinal value" @@ -81,161 +79,158 @@ def test_construction(self): Period(ordinal=200701) with pytest.raises(ValueError, match="Invalid frequency: X"): - Period('2007-1-1', freq='X') + Period("2007-1-1", freq="X") def test_construction_bday(self): # Biz day construction, roll forward if non-weekday - i1 = Period('3/10/12', freq='B') - i2 = Period('3/10/12', freq='D') - assert i1 == i2.asfreq('B') - i2 = Period('3/11/12', freq='D') - assert i1 == i2.asfreq('B') - i2 = Period('3/12/12', freq='D') - assert i1 == i2.asfreq('B') - - i3 = Period('3/10/12', freq='b') + i1 = Period("3/10/12", freq="B") + i2 = Period("3/10/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/11/12", freq="D") + assert i1 == i2.asfreq("B") + i2 = Period("3/12/12", freq="D") + assert i1 == i2.asfreq("B") + + i3 = Period("3/10/12", freq="b") assert i1 == i3 - i1 = Period(year=2012, month=3, day=10, freq='B') - i2 = Period('3/12/12', freq='B') + i1 = Period(year=2012, month=3, day=10, freq="B") + i2 = Period("3/12/12", freq="B") assert i1 == i2 def test_construction_quarter(self): - i1 = Period(year=2005, quarter=1, freq='Q') - i2 = Period('1/1/2005', freq='Q') + i1 = Period(year=2005, quarter=1, freq="Q") + i2 = Period("1/1/2005", freq="Q") assert i1 == i2 - i1 = Period(year=2005, quarter=3, freq='Q') - i2 = Period('9/1/2005', freq='Q') + i1 = Period(year=2005, quarter=3, freq="Q") + i2 = Period("9/1/2005", freq="Q") assert i1 == i2 - i1 = Period('2005Q1') - i2 = Period(year=2005, quarter=1, freq='Q') - i3 = Period('2005q1') + i1 = Period("2005Q1") + i2 = Period(year=2005, quarter=1, freq="Q") + i3 = Period("2005q1") assert i1 == i2 assert i1 == i3 - i1 = Period('05Q1') + i1 = Period("05Q1") assert i1 == i2 - lower = Period('05q1') + lower = Period("05q1") assert i1 == lower - i1 = Period('1Q2005') + i1 = Period("1Q2005") assert i1 == i2 - lower = Period('1q2005') + lower = Period("1q2005") assert i1 == lower - i1 = Period('1Q05') + i1 = Period("1Q05") assert i1 == i2 - lower = Period('1q05') + lower = Period("1q05") assert i1 == lower - i1 = Period('4Q1984') + i1 = Period("4Q1984") assert i1.year == 1984 - lower = Period('4q1984') + lower = Period("4q1984") assert i1 == lower def test_construction_month(self): - expected = Period('2007-01', freq='M') - i1 = Period('200701', freq='M') + expected = Period("2007-01", freq="M") + i1 = Period("200701", freq="M") assert i1 == expected - i1 = Period('200701', freq='M') + i1 = Period("200701", freq="M") assert i1 == expected - i1 = Period(200701, freq='M') + i1 = Period(200701, freq="M") assert i1 == expected - i1 = Period(ordinal=200701, freq='M') + i1 = Period(ordinal=200701, freq="M") assert i1.year == 18695 - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', freq='M') + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") assert i1 == i2 - i1 = 
Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 assert i1 == i5 def test_period_constructor_offsets(self): - assert (Period('1/1/2005', freq=offsets.MonthEnd()) == - Period('1/1/2005', freq='M')) - assert (Period('2005', freq=offsets.YearEnd()) == - Period('2005', freq='A')) - assert (Period('2005', freq=offsets.MonthEnd()) == - Period('2005', freq='M')) - assert (Period('3/10/12', freq=offsets.BusinessDay()) == - Period('3/10/12', freq='B')) - assert (Period('3/10/12', freq=offsets.Day()) == - Period('3/10/12', freq='D')) - - assert (Period(year=2005, quarter=1, - freq=offsets.QuarterEnd(startingMonth=12)) == - Period(year=2005, quarter=1, freq='Q')) - assert (Period(year=2005, quarter=2, - freq=offsets.QuarterEnd(startingMonth=12)) == - Period(year=2005, quarter=2, freq='Q')) - - assert (Period(year=2005, month=3, day=1, freq=offsets.Day()) == - Period(year=2005, month=3, day=1, freq='D')) - assert (Period(year=2012, month=3, day=10, freq=offsets.BDay()) == - Period(year=2012, month=3, day=10, freq='B')) - - expected = Period('2005-03-01', freq='3D') - assert (Period(year=2005, month=3, day=1, - freq=offsets.Day(3)) == expected) - assert Period(year=2005, month=3, day=1, freq='3D') == expected - - assert (Period(year=2012, month=3, day=10, - freq=offsets.BDay(3)) == - Period(year=2012, month=3, day=10, freq='3B')) - - assert (Period(200701, freq=offsets.MonthEnd()) == - Period(200701, freq='M')) + assert Period("1/1/2005", freq=offsets.MonthEnd()) == Period( + "1/1/2005", freq="M" + ) + assert Period("2005", freq=offsets.YearEnd()) == Period("2005", freq="A") + assert Period("2005", freq=offsets.MonthEnd()) == Period("2005", freq="M") + assert Period("3/10/12", freq=offsets.BusinessDay()) == Period( + "3/10/12", freq="B" + ) + assert Period("3/10/12", freq=offsets.Day()) == Period("3/10/12", freq="D") + + assert Period( + year=2005, quarter=1, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=1, freq="Q") + assert Period( + year=2005, quarter=2, freq=offsets.QuarterEnd(startingMonth=12) + ) == Period(year=2005, quarter=2, freq="Q") + + assert Period(year=2005, month=3, day=1, freq=offsets.Day()) == Period( + year=2005, month=3, day=1, freq="D" + ) + assert Period(year=2012, month=3, day=10, freq=offsets.BDay()) == Period( + year=2012, month=3, day=10, freq="B" + ) + + expected = Period("2005-03-01", freq="3D") + assert Period(year=2005, month=3, day=1, freq=offsets.Day(3)) == expected + assert Period(year=2005, month=3, day=1, freq="3D") == expected + + assert Period(year=2012, month=3, day=10, freq=offsets.BDay(3)) == Period( + year=2012, month=3, day=10, freq="3B" + ) + + assert Period(200701, freq=offsets.MonthEnd()) == Period(200701, freq="M") i1 = Period(ordinal=200701, freq=offsets.MonthEnd()) - i2 = Period(ordinal=200701, freq='M') + i2 = Period(ordinal=200701, freq="M") assert i1 == i2 assert i1.year == 18695 assert i2.year == 18695 - i1 = Period(datetime(2007, 1, 1), freq='M') - i2 = Period('200701', 
freq='M') + i1 = Period(datetime(2007, 1, 1), freq="M") + i2 = Period("200701", freq="M") assert i1 == i2 - i1 = Period(date(2007, 1, 1), freq='M') - i2 = Period(datetime(2007, 1, 1), freq='M') - i3 = Period(np.datetime64('2007-01-01'), freq='M') - i4 = Period(np_datetime64_compat('2007-01-01 00:00:00Z'), freq='M') - i5 = Period(np_datetime64_compat('2007-01-01 00:00:00.000Z'), freq='M') + i1 = Period(date(2007, 1, 1), freq="M") + i2 = Period(datetime(2007, 1, 1), freq="M") + i3 = Period(np.datetime64("2007-01-01"), freq="M") + i4 = Period(np_datetime64_compat("2007-01-01 00:00:00Z"), freq="M") + i5 = Period(np_datetime64_compat("2007-01-01 00:00:00.000Z"), freq="M") assert i1 == i2 assert i1 == i3 assert i1 == i4 assert i1 == i5 - i1 = Period('2007-01-01 09:00:00.001') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq='L') + i1 = Period("2007-01-01 09:00:00.001") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1000), freq="L") assert i1 == expected - expected = Period(np_datetime64_compat( - '2007-01-01 09:00:00.001Z'), freq='L') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.001Z"), freq="L") assert i1 == expected - i1 = Period('2007-01-01 09:00:00.00101') - expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq='U') + i1 = Period("2007-01-01 09:00:00.00101") + expected = Period(datetime(2007, 1, 1, 9, 0, 0, 1010), freq="U") assert i1 == expected - expected = Period(np_datetime64_compat('2007-01-01 09:00:00.00101Z'), - freq='U') + expected = Period(np_datetime64_compat("2007-01-01 09:00:00.00101Z"), freq="U") assert i1 == expected def test_invalid_arguments(self): @@ -245,62 +240,62 @@ def test_invalid_arguments(self): Period(datetime.now().date()) with pytest.raises(ValueError): - Period(1.6, freq='D') + Period(1.6, freq="D") with pytest.raises(ValueError): - Period(ordinal=1.6, freq='D') + Period(ordinal=1.6, freq="D") with pytest.raises(ValueError): - Period(ordinal=2, value=1, freq='D') + Period(ordinal=2, value=1, freq="D") with pytest.raises(ValueError): Period(month=1) with pytest.raises(ValueError): - Period('-2000', 'A') + Period("-2000", "A") with pytest.raises(DateParseError): - Period('0', 'A') + Period("0", "A") with pytest.raises(DateParseError): - Period('1/1/-2000', 'A') + Period("1/1/-2000", "A") def test_constructor_corner(self): - expected = Period('2007-01', freq='2M') - assert Period(year=2007, month=1, freq='2M') == expected + expected = Period("2007-01", freq="2M") + assert Period(year=2007, month=1, freq="2M") == expected assert Period(None) is NaT - p = Period('2007-01-01', freq='D') + p = Period("2007-01-01", freq="D") - result = Period(p, freq='A') - exp = Period('2007', freq='A') + result = Period(p, freq="A") + exp = Period("2007", freq="A") assert result == exp def test_constructor_infer_freq(self): - p = Period('2007-01-01') - assert p.freq == 'D' + p = Period("2007-01-01") + assert p.freq == "D" - p = Period('2007-01-01 07') - assert p.freq == 'H' + p = Period("2007-01-01 07") + assert p.freq == "H" - p = Period('2007-01-01 07:10') - assert p.freq == 'T' + p = Period("2007-01-01 07:10") + assert p.freq == "T" - p = Period('2007-01-01 07:10:15') - assert p.freq == 'S' + p = Period("2007-01-01 07:10:15") + assert p.freq == "S" - p = Period('2007-01-01 07:10:15.123') - assert p.freq == 'L' + p = Period("2007-01-01 07:10:15.123") + assert p.freq == "L" - p = Period('2007-01-01 07:10:15.123000') - assert p.freq == 'L' + p = Period("2007-01-01 07:10:15.123000") + assert p.freq == "L" - p = Period('2007-01-01 07:10:15.123400') - 
assert p.freq == 'U' + p = Period("2007-01-01 07:10:15.123400") + assert p.freq == "U" def test_multiples(self): - result1 = Period('1989', freq='2A') - result2 = Period('1989', freq='A') + result1 = Period("1989", freq="2A") + result2 = Period("1989", freq="A") assert result1.ordinal == result2.ordinal - assert result1.freqstr == '2A-DEC' - assert result2.freqstr == 'A-DEC' + assert result1.freqstr == "2A-DEC" + assert result2.freqstr == "A-DEC" assert result1.freq == offsets.YearEnd(2) assert result2.freq == offsets.YearEnd() @@ -309,428 +304,428 @@ def test_multiples(self): assert (result1 - 1).ordinal == result2.ordinal - 2 assert (-1 + result1).ordinal == result2.ordinal - 2 - @pytest.mark.parametrize('month', MONTHS) + @pytest.mark.parametrize("month", MONTHS) def test_period_cons_quarterly(self, month): # bugs in scikits.timeseries - freq = 'Q-%s' % month - exp = Period('1989Q3', freq=freq) - assert '1989Q3' in str(exp) - stamp = exp.to_timestamp('D', how='end') + freq = "Q-%s" % month + exp = Period("1989Q3", freq=freq) + assert "1989Q3" in str(exp) + stamp = exp.to_timestamp("D", how="end") p = Period(stamp, freq=freq) assert p == exp - stamp = exp.to_timestamp('3D', how='end') + stamp = exp.to_timestamp("3D", how="end") p = Period(stamp, freq=freq) assert p == exp - @pytest.mark.parametrize('month', MONTHS) + @pytest.mark.parametrize("month", MONTHS) def test_period_cons_annual(self, month): # bugs in scikits.timeseries - freq = 'A-%s' % month - exp = Period('1989', freq=freq) - stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) + freq = "A-%s" % month + exp = Period("1989", freq=freq) + stamp = exp.to_timestamp("D", how="end") + timedelta(days=30) p = Period(stamp, freq=freq) assert p == exp + 1 assert isinstance(p, Period) - @pytest.mark.parametrize('day', DAYS) - @pytest.mark.parametrize('num', range(10, 17)) + @pytest.mark.parametrize("day", DAYS) + @pytest.mark.parametrize("num", range(10, 17)) def test_period_cons_weekly(self, num, day): - daystr = '2011-02-%d' % num - freq = 'W-%s' % day + daystr = "2011-02-%d" % num + freq = "W-%s" % day result = Period(daystr, freq=freq) - expected = Period(daystr, freq='D').asfreq(freq) + expected = Period(daystr, freq="D").asfreq(freq) assert result == expected assert isinstance(result, Period) def test_period_from_ordinal(self): - p = Period('2011-01', freq='M') - res = Period._from_ordinal(p.ordinal, freq='M') + p = Period("2011-01", freq="M") + res = Period._from_ordinal(p.ordinal, freq="M") assert p == res assert isinstance(res, Period) def test_period_cons_nat(self): - p = Period('NaT', freq='M') + p = Period("NaT", freq="M") assert p is NaT - p = Period('nat', freq='W-SUN') + p = Period("nat", freq="W-SUN") assert p is NaT - p = Period(iNaT, freq='D') + p = Period(iNaT, freq="D") assert p is NaT - p = Period(iNaT, freq='3D') + p = Period(iNaT, freq="3D") assert p is NaT - p = Period(iNaT, freq='1D1H') + p = Period(iNaT, freq="1D1H") assert p is NaT - p = Period('NaT') + p = Period("NaT") assert p is NaT p = Period(iNaT) assert p is NaT def test_period_cons_mult(self): - p1 = Period('2011-01', freq='3M') - p2 = Period('2011-01', freq='M') + p1 = Period("2011-01", freq="3M") + p2 = Period("2011-01", freq="M") assert p1.ordinal == p2.ordinal assert p1.freq == offsets.MonthEnd(3) - assert p1.freqstr == '3M' + assert p1.freqstr == "3M" assert p2.freq == offsets.MonthEnd() - assert p2.freqstr == 'M' + assert p2.freqstr == "M" result = p1 + 1 assert result.ordinal == (p2 + 3).ordinal assert result.freq == p1.freq - assert 
result.freqstr == '3M' + assert result.freqstr == "3M" result = p1 - 1 assert result.ordinal == (p2 - 3).ordinal assert result.freq == p1.freq - assert result.freqstr == '3M' + assert result.freqstr == "3M" - msg = ('Frequency must be positive, because it' - ' represents span: -3M') + msg = "Frequency must be positive, because it" " represents span: -3M" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-3M') + Period("2011-01", freq="-3M") - msg = ('Frequency must be positive, because it' ' represents span: 0M') + msg = "Frequency must be positive, because it" " represents span: 0M" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='0M') + Period("2011-01", freq="0M") def test_period_cons_combined(self): - p = [(Period('2011-01', freq='1D1H'), - Period('2011-01', freq='1H1D'), - Period('2011-01', freq='H')), - (Period(ordinal=1, freq='1D1H'), - Period(ordinal=1, freq='1H1D'), - Period(ordinal=1, freq='H'))] + p = [ + ( + Period("2011-01", freq="1D1H"), + Period("2011-01", freq="1H1D"), + Period("2011-01", freq="H"), + ), + ( + Period(ordinal=1, freq="1D1H"), + Period(ordinal=1, freq="1H1D"), + Period(ordinal=1, freq="H"), + ), + ] for p1, p2, p3 in p: assert p1.ordinal == p3.ordinal assert p2.ordinal == p3.ordinal assert p1.freq == offsets.Hour(25) - assert p1.freqstr == '25H' + assert p1.freqstr == "25H" assert p2.freq == offsets.Hour(25) - assert p2.freqstr == '25H' + assert p2.freqstr == "25H" assert p3.freq == offsets.Hour() - assert p3.freqstr == 'H' + assert p3.freqstr == "H" result = p1 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p1.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p2 + 1 assert result.ordinal == (p3 + 25).ordinal assert result.freq == p2.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p1 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p1.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" result = p2 - 1 assert result.ordinal == (p3 - 25).ordinal assert result.freq == p2.freq - assert result.freqstr == '25H' + assert result.freqstr == "25H" - msg = ('Frequency must be positive, because it' - ' represents span: -25H') + msg = "Frequency must be positive, because it" " represents span: -25H" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-1D1H') + Period("2011-01", freq="-1D1H") with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='-1H1D') + Period("2011-01", freq="-1H1D") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='-1D1H') + Period(ordinal=1, freq="-1D1H") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='-1H1D') + Period(ordinal=1, freq="-1H1D") - msg = ('Frequency must be positive, because it' - ' represents span: 0D') + msg = "Frequency must be positive, because it" " represents span: 0D" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='0D0H') + Period("2011-01", freq="0D0H") with pytest.raises(ValueError, match=msg): - Period(ordinal=1, freq='0D0H') + Period(ordinal=1, freq="0D0H") # You can only combine together day and intraday offsets - msg = ('Invalid frequency: 1W1D') + msg = "Invalid frequency: 1W1D" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='1W1D') - msg = ('Invalid frequency: 1D1W') + Period("2011-01", freq="1W1D") + msg = "Invalid frequency: 1D1W" with pytest.raises(ValueError, match=msg): - Period('2011-01', freq='1D1W') + Period("2011-01", freq="1D1W") class 
TestPeriodMethods: def test_round_trip(self): - p = Period('2000Q1') + p = Period("2000Q1") new_p = tm.round_trip_pickle(p) assert new_p == p def test_hash(self): - assert (hash(Period('2011-01', freq='M')) == - hash(Period('2011-01', freq='M'))) + assert hash(Period("2011-01", freq="M")) == hash(Period("2011-01", freq="M")) - assert (hash(Period('2011-01-01', freq='D')) != - hash(Period('2011-01', freq='M'))) + assert hash(Period("2011-01-01", freq="D")) != hash(Period("2011-01", freq="M")) - assert (hash(Period('2011-01', freq='3M')) != - hash(Period('2011-01', freq='2M'))) + assert hash(Period("2011-01", freq="3M")) != hash(Period("2011-01", freq="2M")) - assert (hash(Period('2011-01', freq='M')) != - hash(Period('2011-02', freq='M'))) + assert hash(Period("2011-01", freq="M")) != hash(Period("2011-02", freq="M")) # -------------------------------------------------------------- # to_timestamp - @pytest.mark.parametrize('tzstr', ['Europe/Brussels', - 'Asia/Tokyo', 'US/Pacific']) + @pytest.mark.parametrize("tzstr", ["Europe/Brussels", "Asia/Tokyo", "US/Pacific"]) def test_to_timestamp_tz_arg(self, tzstr): - p = Period('1/1/2005', freq='M').to_timestamp(tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='3H').to_timestamp(tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="3H").to_timestamp(tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='A').to_timestamp(freq='A', tz=tzstr) - exp = Timestamp('31/12/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="A").to_timestamp(freq="A", tz=tzstr) + exp = Timestamp("31/12/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - p = Period('1/1/2005', freq='A').to_timestamp(freq='3H', tz=tzstr) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="A").to_timestamp(freq="3H", tz=tzstr) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp assert p.tz == exp_zone.tzinfo assert p.tz == exp.tz - @pytest.mark.parametrize('tzstr', ['dateutil/Europe/Brussels', - 'dateutil/Asia/Tokyo', - 'dateutil/US/Pacific']) + @pytest.mark.parametrize( + "tzstr", + ["dateutil/Europe/Brussels", "dateutil/Asia/Tokyo", "dateutil/US/Pacific"], + ) def test_to_timestamp_tz_arg_dateutil(self, tzstr): tz = maybe_get_tz(tzstr) - p = Period('1/1/2005', freq='M').to_timestamp(tz=tz) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) assert p == exp - assert p.tz == dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) assert p.tz == exp.tz - p = Period('1/1/2005', freq='M').to_timestamp(freq='3H', tz=tz) - exp = Timestamp('1/1/2005', tz='UTC').tz_convert(tzstr) + p = Period("1/1/2005", freq="M").to_timestamp(freq="3H", tz=tz) + exp = Timestamp("1/1/2005", tz="UTC").tz_convert(tzstr) assert p == exp - assert p.tz == 
dateutil_gettz(tzstr.split('/', 1)[1]) + assert p.tz == dateutil_gettz(tzstr.split("/", 1)[1]) assert p.tz == exp.tz def test_to_timestamp_tz_arg_dateutil_from_string(self): - p = Period('1/1/2005', - freq='M').to_timestamp(tz='dateutil/Europe/Brussels') - assert p.tz == dateutil_gettz('Europe/Brussels') + p = Period("1/1/2005", freq="M").to_timestamp(tz="dateutil/Europe/Brussels") + assert p.tz == dateutil_gettz("Europe/Brussels") def test_to_timestamp_mult(self): - p = Period('2011-01', freq='M') - assert p.to_timestamp(how='S') == Timestamp('2011-01-01') - expected = Timestamp('2011-02-01') - Timedelta(1, 'ns') - assert p.to_timestamp(how='E') == expected + p = Period("2011-01", freq="M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-02-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected - p = Period('2011-01', freq='3M') - assert p.to_timestamp(how='S') == Timestamp('2011-01-01') - expected = Timestamp('2011-04-01') - Timedelta(1, 'ns') - assert p.to_timestamp(how='E') == expected + p = Period("2011-01", freq="3M") + assert p.to_timestamp(how="S") == Timestamp("2011-01-01") + expected = Timestamp("2011-04-01") - Timedelta(1, "ns") + assert p.to_timestamp(how="E") == expected def test_to_timestamp(self): - p = Period('1982', freq='A') - start_ts = p.to_timestamp(how='S') - aliases = ['s', 'StarT', 'BEGIn'] + p = Period("1982", freq="A") + start_ts = p.to_timestamp(how="S") + aliases = ["s", "StarT", "BEGIn"] for a in aliases: - assert start_ts == p.to_timestamp('D', how=a) + assert start_ts == p.to_timestamp("D", how=a) # freq with mult should not affect to the result - assert start_ts == p.to_timestamp('3D', how=a) + assert start_ts == p.to_timestamp("3D", how=a) - end_ts = p.to_timestamp(how='E') - aliases = ['e', 'end', 'FINIsH'] + end_ts = p.to_timestamp(how="E") + aliases = ["e", "end", "FINIsH"] for a in aliases: - assert end_ts == p.to_timestamp('D', how=a) - assert end_ts == p.to_timestamp('3D', how=a) + assert end_ts == p.to_timestamp("D", how=a) + assert end_ts == p.to_timestamp("3D", how=a) - from_lst = ['A', 'Q', 'M', 'W', 'B', 'D', 'H', 'Min', 'S'] + from_lst = ["A", "Q", "M", "W", "B", "D", "H", "Min", "S"] def _ex(p): return Timestamp((p + p.freq).start_time.value - 1) for i, fcode in enumerate(from_lst): - p = Period('1982', freq=fcode) + p = Period("1982", freq=fcode) result = p.to_timestamp().to_period(fcode) assert result == p - assert p.start_time == p.to_timestamp(how='S') + assert p.start_time == p.to_timestamp(how="S") assert p.end_time == _ex(p) # Frequency other than daily - p = Period('1985', freq='A') + p = Period("1985", freq="A") - result = p.to_timestamp('H', how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp("H", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp('3H', how='end') + result = p.to_timestamp("3H", how="end") assert result == expected - result = p.to_timestamp('T', how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp("T", how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected - result = p.to_timestamp('2T', how='end') + result = p.to_timestamp("2T", how="end") assert result == expected - result = p.to_timestamp(how='end') - expected = Timestamp(1986, 1, 1) - Timedelta(1, 'ns') + result = p.to_timestamp(how="end") + expected = Timestamp(1986, 1, 1) - Timedelta(1, "ns") assert result == expected 
expected = datetime(1985, 1, 1) - result = p.to_timestamp('H', how='start') + result = p.to_timestamp("H", how="start") assert result == expected - result = p.to_timestamp('T', how='start') + result = p.to_timestamp("T", how="start") assert result == expected - result = p.to_timestamp('S', how='start') + result = p.to_timestamp("S", how="start") assert result == expected - result = p.to_timestamp('3H', how='start') + result = p.to_timestamp("3H", how="start") assert result == expected - result = p.to_timestamp('5S', how='start') + result = p.to_timestamp("5S", how="start") assert result == expected # -------------------------------------------------------------- # Rendering: __repr__, strftime, etc def test_repr(self): - p = Period('Jan-2000') - assert '2000-01' in repr(p) + p = Period("Jan-2000") + assert "2000-01" in repr(p) - p = Period('2000-12-15') - assert '2000-12-15' in repr(p) + p = Period("2000-12-15") + assert "2000-12-15" in repr(p) def test_repr_nat(self): - p = Period('nat', freq='M') + p = Period("nat", freq="M") assert repr(NaT) in repr(p) def test_millisecond_repr(self): - p = Period('2000-01-01 12:15:02.123') + p = Period("2000-01-01 12:15:02.123") assert repr(p) == "Period('2000-01-01 12:15:02.123', 'L')" def test_microsecond_repr(self): - p = Period('2000-01-01 12:15:02.123567') + p = Period("2000-01-01 12:15:02.123567") assert repr(p) == "Period('2000-01-01 12:15:02.123567', 'U')" def test_strftime(self): # GH#3363 - p = Period('2000-1-1 12:34:12', freq='S') - res = p.strftime('%Y-%m-%d %H:%M:%S') - assert res == '2000-01-01 12:34:12' + p = Period("2000-1-1 12:34:12", freq="S") + res = p.strftime("%Y-%m-%d %H:%M:%S") + assert res == "2000-01-01 12:34:12" assert isinstance(res, str) class TestPeriodProperties: "Test properties such as year, month, weekday, etc...." 
- @pytest.mark.parametrize('freq', ['A', 'M', 'D', 'H']) + @pytest.mark.parametrize("freq", ["A", "M", "D", "H"]) def test_is_leap_year(self, freq): # GH 13727 - p = Period('2000-01-01 00:00:00', freq=freq) + p = Period("2000-01-01 00:00:00", freq=freq) assert p.is_leap_year assert isinstance(p.is_leap_year, bool) - p = Period('1999-01-01 00:00:00', freq=freq) + p = Period("1999-01-01 00:00:00", freq=freq) assert not p.is_leap_year - p = Period('2004-01-01 00:00:00', freq=freq) + p = Period("2004-01-01 00:00:00", freq=freq) assert p.is_leap_year - p = Period('2100-01-01 00:00:00', freq=freq) + p = Period("2100-01-01 00:00:00", freq=freq) assert not p.is_leap_year def test_quarterly_negative_ordinals(self): - p = Period(ordinal=-1, freq='Q-DEC') + p = Period(ordinal=-1, freq="Q-DEC") assert p.year == 1969 assert p.quarter == 4 assert isinstance(p, Period) - p = Period(ordinal=-2, freq='Q-DEC') + p = Period(ordinal=-2, freq="Q-DEC") assert p.year == 1969 assert p.quarter == 3 assert isinstance(p, Period) - p = Period(ordinal=-2, freq='M') + p = Period(ordinal=-2, freq="M") assert p.year == 1969 assert p.month == 11 assert isinstance(p, Period) def test_freq_str(self): - i1 = Period('1982', freq='Min') + i1 = Period("1982", freq="Min") assert i1.freq == offsets.Minute() - assert i1.freqstr == 'T' + assert i1.freqstr == "T" def test_period_deprecated_freq(self): - cases = {"M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], - "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], - "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], - "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], - "T": ["minute", "MINUTE", "MINUTELY", "minutely"], - "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], - "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], - "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], - "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + cases = { + "M": ["MTH", "MONTH", "MONTHLY", "Mth", "month", "monthly"], + "B": ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY", "bus"], + "D": ["DAY", "DLY", "DAILY", "Day", "Dly", "Daily"], + "H": ["HR", "HOUR", "HRLY", "HOURLY", "hr", "Hour", "HRly"], + "T": ["minute", "MINUTE", "MINUTELY", "minutely"], + "S": ["sec", "SEC", "SECOND", "SECONDLY", "second"], + "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], + "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], + "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"], + } msg = INVALID_FREQ_ERR_MSG for exp, freqs in cases.items(): for freq in freqs: with pytest.raises(ValueError, match=msg): - Period('2016-03-01 09:00', freq=freq) + Period("2016-03-01 09:00", freq=freq) with pytest.raises(ValueError, match=msg): Period(ordinal=1, freq=freq) # check supported freq-aliases still works - p1 = Period('2016-03-01 09:00', freq=exp) + p1 = Period("2016-03-01 09:00", freq=exp) p2 = Period(ordinal=1, freq=exp) assert isinstance(p1, Period) assert isinstance(p2, Period) def test_start_time(self): - freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] + freq_lst = ["A", "Q", "M", "D", "H", "T", "S"] xp = datetime(2012, 1, 1) for f in freq_lst: - p = Period('2012', freq=f) + p = Period("2012", freq=f) assert p.start_time == xp - assert Period('2012', freq='B').start_time == datetime(2012, 1, 2) - assert Period('2012', freq='W').start_time == datetime(2011, 12, 26) + assert Period("2012", freq="B").start_time == datetime(2012, 1, 2) + assert Period("2012", freq="W").start_time == datetime(2011, 12, 26) def test_end_time(self): - p = Period('2012', freq='A') + p = Period("2012", freq="A") 
def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) @@ -738,40 +733,40 @@ def _ex(*args): xp = _ex(2013, 1, 1) assert xp == p.end_time - p = Period('2012', freq='Q') + p = Period("2012", freq="Q") xp = _ex(2012, 4, 1) assert xp == p.end_time - p = Period('2012', freq='M') + p = Period("2012", freq="M") xp = _ex(2012, 2, 1) assert xp == p.end_time - p = Period('2012', freq='D') + p = Period("2012", freq="D") xp = _ex(2012, 1, 2) assert xp == p.end_time - p = Period('2012', freq='H') + p = Period("2012", freq="H") xp = _ex(2012, 1, 1, 1) assert xp == p.end_time - p = Period('2012', freq='B') + p = Period("2012", freq="B") xp = _ex(2012, 1, 3) assert xp == p.end_time - p = Period('2012', freq='W') + p = Period("2012", freq="W") xp = _ex(2012, 1, 2) assert xp == p.end_time # Test for GH 11738 - p = Period('2012', freq='15D') + p = Period("2012", freq="15D") xp = _ex(2012, 1, 16) assert xp == p.end_time - p = Period('2012', freq='1D1H') + p = Period("2012", freq="1D1H") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time - p = Period('2012', freq='1H1D') + p = Period("2012", freq="1H1D") xp = _ex(2012, 1, 2, 1) assert xp == p.end_time @@ -779,13 +774,13 @@ def test_anchor_week_end_time(self): def _ex(*args): return Timestamp(Timestamp(datetime(*args)).value - 1) - p = Period('2013-1-1', 'W-SAT') + p = Period("2013-1-1", "W-SAT") xp = _ex(2013, 1, 6) assert p.end_time == xp def test_properties_annually(self): # Test properties on Periods with annually frequency. - a_date = Period(freq='A', year=2007) + a_date = Period(freq="A", year=2007) assert a_date.year == 2007 def test_properties_quarterly(self): @@ -801,7 +796,7 @@ def test_properties_quarterly(self): def test_properties_monthly(self): # Test properties on Periods with daily frequency. - m_date = Period(freq='M', year=2007, month=1) + m_date = Period(freq="M", year=2007, month=1) for x in range(11): m_ival_x = m_date + x assert m_ival_x.year == 2007 @@ -817,7 +812,7 @@ def test_properties_monthly(self): def test_properties_weekly(self): # Test properties on Periods with daily frequency. - w_date = Period(freq='W', year=2007, month=1, day=7) + w_date = Period(freq="W", year=2007, month=1, day=7) # assert w_date.year == 2007 assert w_date.quarter == 1 @@ -825,12 +820,11 @@ def test_properties_weekly(self): assert w_date.week == 1 assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 - assert Period(freq='W', year=2012, - month=2, day=1).days_in_month == 29 + assert Period(freq="W", year=2012, month=2, day=1).days_in_month == 29 def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. - w_date = Period(freq='W', year=2007, month=1, day=7) + w_date = Period(freq="W", year=2007, month=1, day=7) assert w_date.year == 2007 assert w_date.quarter == 1 assert w_date.month == 1 @@ -838,16 +832,16 @@ def test_properties_weekly_legacy(self): assert (w_date - 1).week == 52 assert w_date.days_in_month == 31 - exp = Period(freq='W', year=2012, month=2, day=1) + exp = Period(freq="W", year=2012, month=2, day=1) assert exp.days_in_month == 29 msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): - Period(freq='WK', year=2007, month=1, day=7) + Period(freq="WK", year=2007, month=1, day=7) def test_properties_daily(self): # Test properties on Periods with daily frequency. 
- b_date = Period(freq='B', year=2007, month=1, day=1) + b_date = Period(freq="B", year=2007, month=1, day=1) # assert b_date.year == 2007 assert b_date.quarter == 1 @@ -856,10 +850,9 @@ def test_properties_daily(self): assert b_date.weekday == 0 assert b_date.dayofyear == 1 assert b_date.days_in_month == 31 - assert Period(freq='B', year=2012, - month=2, day=1).days_in_month == 29 + assert Period(freq="B", year=2012, month=2, day=1).days_in_month == 29 - d_date = Period(freq='D', year=2007, month=1, day=1) + d_date = Period(freq="D", year=2007, month=1, day=1) assert d_date.year == 2007 assert d_date.quarter == 1 @@ -868,13 +861,12 @@ def test_properties_daily(self): assert d_date.weekday == 0 assert d_date.dayofyear == 1 assert d_date.days_in_month == 31 - assert Period(freq='D', year=2012, month=2, - day=1).days_in_month == 29 + assert Period(freq="D", year=2012, month=2, day=1).days_in_month == 29 def test_properties_hourly(self): # Test properties on Periods with hourly frequency. - h_date1 = Period(freq='H', year=2007, month=1, day=1, hour=0) - h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0) + h_date1 = Period(freq="H", year=2007, month=1, day=1, hour=0) + h_date2 = Period(freq="2H", year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: assert h_date.year == 2007 @@ -885,13 +877,13 @@ def test_properties_hourly(self): assert h_date.dayofyear == 1 assert h_date.hour == 0 assert h_date.days_in_month == 31 - assert Period(freq='H', year=2012, month=2, day=1, - hour=0).days_in_month == 29 + assert ( + Period(freq="H", year=2012, month=2, day=1, hour=0).days_in_month == 29 + ) def test_properties_minutely(self): # Test properties on Periods with minutely frequency. - t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0) + t_date = Period(freq="Min", year=2007, month=1, day=1, hour=0, minute=0) # assert t_date.quarter == 1 assert t_date.month == 1 @@ -901,13 +893,16 @@ def test_properties_minutely(self): assert t_date.hour == 0 assert t_date.minute == 0 assert t_date.days_in_month == 31 - assert Period(freq='D', year=2012, month=2, day=1, hour=0, - minute=0).days_in_month == 29 + assert ( + Period(freq="D", year=2012, month=2, day=1, hour=0, minute=0).days_in_month + == 29 + ) def test_properties_secondly(self): # Test properties on Periods with secondly frequency. 
- s_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, - minute=0, second=0) + s_date = Period( + freq="Min", year=2007, month=1, day=1, hour=0, minute=0, second=0 + ) # assert s_date.year == 2007 assert s_date.quarter == 1 @@ -919,12 +914,15 @@ def test_properties_secondly(self): assert s_date.minute == 0 assert s_date.second == 0 assert s_date.days_in_month == 31 - assert Period(freq='Min', year=2012, month=2, day=1, hour=0, - minute=0, second=0).days_in_month == 29 + assert ( + Period( + freq="Min", year=2012, month=2, day=1, hour=0, minute=0, second=0 + ).days_in_month + == 29 + ) class TestPeriodField: - def test_get_period_field_array_raises_on_out_of_range(self): msg = "Buffer dtype mismatch, expected 'int64_t' but got 'double'" with pytest.raises(ValueError, match=msg): @@ -932,13 +930,12 @@ def test_get_period_field_array_raises_on_out_of_range(self): class TestComparisons: - def setup_method(self, method): - self.january1 = Period('2000-01', 'M') - self.january2 = Period('2000-01', 'M') - self.february = Period('2000-02', 'M') - self.march = Period('2000-03', 'M') - self.day = Period('2012-01-01', 'D') + self.january1 = Period("2000-01", "M") + self.january2 = Period("2000-01", "M") + self.february = Period("2000-02", "M") + self.march = Period("2000-03", "M") + self.day = Period("2012-01-01", "D") def test_equal(self): assert self.january1 == self.january2 @@ -1000,14 +997,20 @@ def test_sort(self): assert sorted(periods) == correctPeriods def test_period_nat_comp(self): - p_nat = Period('NaT', freq='D') - p = Period('2011-01-01', freq='D') + p_nat = Period("NaT", freq="D") + p = Period("2011-01-01", freq="D") - nat = Timestamp('NaT') - t = Timestamp('2011-01-01') + nat = Timestamp("NaT") + t = Timestamp("2011-01-01") # confirm Period('NaT') work identical with Timestamp('NaT') - for left, right in [(p_nat, p), (p, p_nat), (p_nat, p_nat), (nat, t), - (t, nat), (nat, nat)]: + for left, right in [ + (p_nat, p), + (p, p_nat), + (p_nat, p_nat), + (nat, t), + (t, nat), + (nat, nat), + ]: assert not left < right assert not left > right assert not left == right @@ -1017,30 +1020,29 @@ def test_period_nat_comp(self): class TestArithmetic: - def test_sub_delta(self): - left, right = Period('2011', freq='A'), Period('2007', freq='A') + left, right = Period("2011", freq="A"), Period("2007", freq="A") result = left - right assert result == 4 * right.freq with pytest.raises(period.IncompatibleFrequency): - left - Period('2007-01', freq='M') + left - Period("2007-01", freq="M") def test_add_integer(self): - per1 = Period(freq='D', year=2008, month=1, day=1) - per2 = Period(freq='D', year=2008, month=1, day=2) + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) assert per1 + 1 == per2 assert 1 + per1 == per2 def test_add_sub_nat(self): # GH#13071 - p = Period('2011-01', freq='M') + p = Period("2011-01", freq="M") assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT assert NaT - p is NaT - p = Period('NaT', freq='M') + p = Period("NaT", freq="M") assert p + NaT is NaT assert NaT + p is NaT assert p - NaT is NaT @@ -1048,8 +1050,8 @@ def test_add_sub_nat(self): def test_add_invalid(self): # GH#4731 - per1 = Period(freq='D', year=2008, month=1, day=1) - per2 = Period(freq='D', year=2008, month=1, day=2) + per1 = Period(freq="D", year=2008, month=1, day=1) + per2 = Period(freq="D", year=2008, month=1, day=2) msg = r"unsupported operand type\(s\)" with pytest.raises(TypeError, match=msg): @@ -1060,20 +1062,22 @@ def 
test_add_invalid(self): per1 + per2 boxes = [lambda x: x, lambda x: pd.Series([x]), lambda x: pd.Index([x])] - ids = ['identity', 'Series', 'Index'] + ids = ["identity", "Series", "Index"] - @pytest.mark.parametrize('lbox', boxes, ids=ids) - @pytest.mark.parametrize('rbox', boxes, ids=ids) + @pytest.mark.parametrize("lbox", boxes, ids=ids) + @pytest.mark.parametrize("rbox", boxes, ids=ids) def test_add_timestamp_raises(self, rbox, lbox): # GH#17983 - ts = Timestamp('2017') - per = Period('2017', freq='M') + ts = Timestamp("2017") + per = Period("2017", freq="M") # We may get a different message depending on which class raises # the error. - msg = (r"cannot add|unsupported operand|" - r"can only operate on a|incompatible type|" - r"ufunc add cannot use operands") + msg = ( + r"cannot add|unsupported operand|" + r"can only operate on a|incompatible type|" + r"ufunc add cannot use operands" + ) with pytest.raises(TypeError, match=msg): lbox(ts) + rbox(per) @@ -1084,8 +1088,8 @@ def test_add_timestamp_raises(self, rbox, lbox): lbox(per) + rbox(per) def test_sub(self): - per1 = Period('2011-01-01', freq='D') - per2 = Period('2011-01-15', freq='D') + per1 = Period("2011-01-01", freq="D") + per2 = Period("2011-01-15", freq="D") off = per1.freq assert per1 - per2 == -14 * off @@ -1093,51 +1097,60 @@ def test_sub(self): msg = r"Input has different freq=M from Period\(freq=D\)" with pytest.raises(period.IncompatibleFrequency, match=msg): - per1 - Period('2011-02', freq='M') + per1 - Period("2011-02", freq="M") - @pytest.mark.parametrize('n', [1, 2, 3, 4]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) def test_sub_n_gt_1_ticks(self, tick_classes, n): # GH 23878 - p1 = pd.Period('19910905', freq=tick_classes(n)) - p2 = pd.Period('19920406', freq=tick_classes(n)) + p1 = pd.Period("19910905", freq=tick_classes(n)) + p2 = pd.Period("19920406", freq=tick_classes(n)) - expected = (pd.Period(str(p2), freq=p2.freq.base) - - pd.Period(str(p1), freq=p1.freq.base)) + expected = pd.Period(str(p2), freq=p2.freq.base) - pd.Period( + str(p1), freq=p1.freq.base + ) assert (p2 - p1) == expected - @pytest.mark.parametrize('normalize', [True, False]) - @pytest.mark.parametrize('n', [1, 2, 3, 4]) - @pytest.mark.parametrize('offset, kwd_name', [ - (pd.offsets.YearEnd, 'month'), - (pd.offsets.QuarterEnd, 'startingMonth'), - (pd.offsets.MonthEnd, None), - (pd.offsets.Week, 'weekday') - ]) + @pytest.mark.parametrize("normalize", [True, False]) + @pytest.mark.parametrize("n", [1, 2, 3, 4]) + @pytest.mark.parametrize( + "offset, kwd_name", + [ + (pd.offsets.YearEnd, "month"), + (pd.offsets.QuarterEnd, "startingMonth"), + (pd.offsets.MonthEnd, None), + (pd.offsets.Week, "weekday"), + ], + ) def test_sub_n_gt_1_offsets(self, offset, kwd_name, n, normalize): # GH 23878 kwds = {kwd_name: 3} if kwd_name is not None else {} - p1_d = '19910905' - p2_d = '19920406' + p1_d = "19910905" + p2_d = "19920406" p1 = pd.Period(p1_d, freq=offset(n, normalize, **kwds)) p2 = pd.Period(p2_d, freq=offset(n, normalize, **kwds)) - expected = (pd.Period(p2_d, freq=p2.freq.base) - - pd.Period(p1_d, freq=p1.freq.base)) + expected = pd.Period(p2_d, freq=p2.freq.base) - pd.Period( + p1_d, freq=p1.freq.base + ) assert (p2 - p1) == expected def test_add_offset(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('2011', freq=freq) - exp = Period('2013', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + exp = Period("2013", freq=freq) assert p + offsets.YearEnd(2) == exp assert offsets.YearEnd(2) + p == exp 
- for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1148,19 +1161,23 @@ def test_add_offset(self): with pytest.raises(period.IncompatibleFrequency): o + p - for freq in ['M', '2M', '3M']: - p = Period('2011-03', freq=freq) - exp = Period('2011-05', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + exp = Period("2011-05", freq=freq) assert p + offsets.MonthEnd(2) == exp assert offsets.MonthEnd(2) + p == exp - exp = Period('2012-03', freq=freq) + exp = Period("2012-03", freq=freq) assert p + offsets.MonthEnd(12) == exp assert offsets.MonthEnd(12) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1172,38 +1189,42 @@ def test_add_offset(self): o + p # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('2011-04-01', freq=freq) + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) - exp = Period('2011-04-06', freq=freq) + exp = Period("2011-04-06", freq=freq) assert p + offsets.Day(5) == exp assert offsets.Day(5) + p == exp - exp = Period('2011-04-02', freq=freq) + exp = Period("2011-04-02", freq=freq) assert p + offsets.Hour(24) == exp assert offsets.Hour(24) + p == exp - exp = Period('2011-04-03', freq=freq) - assert p + np.timedelta64(2, 'D') == exp + exp = Period("2011-04-03", freq=freq) + assert p + np.timedelta64(2, "D") == exp with pytest.raises(TypeError): - np.timedelta64(2, 'D') + p + np.timedelta64(2, "D") + p - exp = Period('2011-04-02', freq=freq) - assert p + np.timedelta64(3600 * 24, 's') == exp + exp = Period("2011-04-02", freq=freq) + assert p + np.timedelta64(3600 * 24, "s") == exp with pytest.raises(TypeError): - np.timedelta64(3600 * 24, 's') + p + np.timedelta64(3600 * 24, "s") + p - exp = Period('2011-03-30', freq=freq) + exp = Period("2011-03-30", freq=freq) assert p + timedelta(-2) == exp assert timedelta(-2) + p == exp - exp = Period('2011-04-03', freq=freq) + exp = Period("2011-04-03", freq=freq) assert p + timedelta(hours=48) == exp assert timedelta(hours=48) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1214,38 +1235,42 @@ def test_add_offset(self): with pytest.raises(period.IncompatibleFrequency): o + p - for freq in ['H', '2H', '3H']: - p = Period('2011-04-01 09:00', freq=freq) + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) - exp = Period('2011-04-03 09:00', freq=freq) + exp = Period("2011-04-03 09:00", freq=freq) assert p + offsets.Day(2) == exp assert offsets.Day(2) + p == exp - exp = Period('2011-04-01 12:00', freq=freq) + exp = Period("2011-04-01 12:00", freq=freq) assert p + offsets.Hour(3) == exp assert offsets.Hour(3) + p == exp - exp = Period('2011-04-01 12:00', freq=freq) - assert p + np.timedelta64(3, 'h') == exp + exp = Period("2011-04-01 12:00", freq=freq) + 
assert p + np.timedelta64(3, "h") == exp with pytest.raises(TypeError): - np.timedelta64(3, 'h') + p + np.timedelta64(3, "h") + p - exp = Period('2011-04-01 10:00', freq=freq) - assert p + np.timedelta64(3600, 's') == exp + exp = Period("2011-04-01 10:00", freq=freq) + assert p + np.timedelta64(3600, "s") == exp with pytest.raises(TypeError): - np.timedelta64(3600, 's') + p + np.timedelta64(3600, "s") + p - exp = Period('2011-04-01 11:00', freq=freq) + exp = Period("2011-04-01 11:00", freq=freq) assert p + timedelta(minutes=120) == exp assert timedelta(minutes=120) + p == exp - exp = Period('2011-04-05 12:00', freq=freq) + exp = Period("2011-04-05 12:00", freq=freq) assert p + timedelta(days=4, minutes=180) == exp assert timedelta(days=4, minutes=180) + p == exp - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: with pytest.raises(period.IncompatibleFrequency): p + o @@ -1258,15 +1283,19 @@ def test_add_offset(self): def test_add_offset_nat(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('NaT', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) for o in [offsets.YearEnd(2)]: assert p + o is NaT assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1275,8 +1304,8 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for freq in ['M', '2M', '3M']: - p = Period('NaT', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT @@ -1286,9 +1315,13 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1298,11 +1331,16 @@ def test_add_offset_nat(self): assert o + p is NaT # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), - np.timedelta64(3600 * 24, 's'), timedelta(-2), - timedelta(hours=48)]: + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1311,9 +1349,13 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1322,19 +1364,28 @@ def test_add_offset_nat(self): else: assert o + p is NaT - for freq in ['H', '2H', '3H']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(2), 
offsets.Hour(3), np.timedelta64(3, 'h'), - np.timedelta64(3600, 's'), timedelta(minutes=120), - timedelta(days=4, minutes=180)]: + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: assert p + o is NaT if not isinstance(o, np.timedelta64): assert o + p is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: assert p + o is NaT if isinstance(o, np.timedelta64): @@ -1345,128 +1396,168 @@ def test_add_offset_nat(self): def test_sub_offset(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('2011', freq=freq) - assert p - offsets.YearEnd(2) == Period('2009', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for freq in ["A", "2A", "3A"]: + p = Period("2011", freq=freq) + assert p - offsets.YearEnd(2) == Period("2009", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p - o - for freq in ['M', '2M', '3M']: - p = Period('2011-03', freq=freq) - assert p - offsets.MonthEnd(2) == Period('2011-01', freq=freq) - assert p - offsets.MonthEnd(12) == Period('2010-03', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for freq in ["M", "2M", "3M"]: + p = Period("2011-03", freq=freq) + assert p - offsets.MonthEnd(2) == Period("2011-01", freq=freq) + assert p - offsets.MonthEnd(12) == Period("2010-03", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: with pytest.raises(period.IncompatibleFrequency): p - o # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('2011-04-01', freq=freq) - assert p - offsets.Day(5) == Period('2011-03-27', freq=freq) - assert p - offsets.Hour(24) == Period('2011-03-31', freq=freq) - assert p - np.timedelta64(2, 'D') == Period( - '2011-03-30', freq=freq) - assert p - np.timedelta64(3600 * 24, 's') == Period( - '2011-03-31', freq=freq) - assert p - timedelta(-2) == Period('2011-04-03', freq=freq) - assert p - timedelta(hours=48) == Period('2011-03-30', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for freq in ["D", "2D", "3D"]: + p = Period("2011-04-01", freq=freq) + assert p - offsets.Day(5) == Period("2011-03-27", freq=freq) + assert p - offsets.Hour(24) == Period("2011-03-31", freq=freq) + assert p - np.timedelta64(2, "D") == Period("2011-03-30", freq=freq) + assert p - np.timedelta64(3600 * 24, "s") == Period("2011-03-31", freq=freq) + assert p - timedelta(-2) == Period("2011-04-03", freq=freq) + assert p - timedelta(hours=48) == Period("2011-03-30", freq=freq) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: with pytest.raises(period.IncompatibleFrequency): p - o - for freq in ['H', '2H', '3H']: - p = Period('2011-04-01 09:00', freq=freq) - 
assert p - offsets.Day(2) == Period('2011-03-30 09:00', freq=freq) - assert p - offsets.Hour(3) == Period('2011-04-01 06:00', freq=freq) - assert p - np.timedelta64(3, 'h') == Period( - '2011-04-01 06:00', freq=freq) - assert p - np.timedelta64(3600, 's') == Period( - '2011-04-01 08:00', freq=freq) - assert p - timedelta(minutes=120) == Period( - '2011-04-01 07:00', freq=freq) + for freq in ["H", "2H", "3H"]: + p = Period("2011-04-01 09:00", freq=freq) + assert p - offsets.Day(2) == Period("2011-03-30 09:00", freq=freq) + assert p - offsets.Hour(3) == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3, "h") == Period("2011-04-01 06:00", freq=freq) + assert p - np.timedelta64(3600, "s") == Period( + "2011-04-01 08:00", freq=freq + ) + assert p - timedelta(minutes=120) == Period("2011-04-01 07:00", freq=freq) assert p - timedelta(days=4, minutes=180) == Period( - '2011-03-28 06:00', freq=freq) - - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + "2011-03-28 06:00", freq=freq + ) + + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: with pytest.raises(period.IncompatibleFrequency): p - o def test_sub_offset_nat(self): # freq is DateOffset - for freq in ['A', '2A', '3A']: - p = Period('NaT', freq=freq) + for freq in ["A", "2A", "3A"]: + p = Period("NaT", freq=freq) for o in [offsets.YearEnd(2)]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p - o is NaT - for freq in ['M', '2M', '3M']: - p = Period('NaT', freq=freq) + for freq in ["M", "2M", "3M"]: + p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(365, 'D'), - timedelta(365)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(365, "D"), + timedelta(365), + ]: assert p - o is NaT # freq is Tick - for freq in ['D', '2D', '3D']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), - np.timedelta64(3600 * 24, 's'), timedelta(-2), - timedelta(hours=48)]: + for freq in ["D", "2D", "3D"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(5), + offsets.Hour(24), + np.timedelta64(2, "D"), + np.timedelta64(3600 * 24, "s"), + timedelta(-2), + timedelta(hours=48), + ]: assert p - o is NaT - for o in [offsets.YearBegin(2), offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(4, 'h'), - timedelta(hours=23)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(4, "h"), + timedelta(hours=23), + ]: assert p - o is NaT - for freq in ['H', '2H', '3H']: - p = Period('NaT', freq=freq) - for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), - np.timedelta64(3600, 's'), timedelta(minutes=120), - timedelta(days=4, minutes=180)]: + for freq in ["H", "2H", "3H"]: + p = Period("NaT", freq=freq) + for o in [ + offsets.Day(2), + offsets.Hour(3), + np.timedelta64(3, "h"), + np.timedelta64(3600, "s"), + timedelta(minutes=120), + timedelta(days=4, minutes=180), + ]: assert p - o is NaT - for o in [offsets.YearBegin(2), 
offsets.MonthBegin(1), - offsets.Minute(), np.timedelta64(3200, 's'), - timedelta(hours=23, minutes=30)]: + for o in [ + offsets.YearBegin(2), + offsets.MonthBegin(1), + offsets.Minute(), + np.timedelta64(3200, "s"), + timedelta(hours=23, minutes=30), + ]: assert p - o is NaT - @pytest.mark.parametrize('freq', ['M', '2M', '3M']) + @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_nat_ops(self, freq): - p = Period('NaT', freq=freq) + p = Period("NaT", freq=freq) assert p + 1 is NaT assert 1 + p is NaT assert p - 1 is NaT - assert p - Period('2011-01', freq=freq) is NaT - assert Period('2011-01', freq=freq) - p is NaT + assert p - Period("2011-01", freq=freq) is NaT + assert Period("2011-01", freq=freq) - p is NaT def test_period_ops_offset(self): - p = Period('2011-04-01', freq='D') + p = Period("2011-04-01", freq="D") result = p + offsets.Day() - exp = Period('2011-04-02', freq='D') + exp = Period("2011-04-02", freq="D") assert result == exp result = p - offsets.Day(2) - exp = Period('2011-03-30', freq='D') + exp = Period("2011-03-30", freq="D") assert result == exp msg = r"Input cannot be converted to Period\(freq=D\)" @@ -1479,7 +1570,7 @@ def test_period_ops_offset(self): def test_period_immutable(): # see gh-17116 - per = Period('2014Q1') + per = Period("2014Q1") with pytest.raises(AttributeError): per.ordinal = 14 @@ -1489,9 +1580,8 @@ def test_period_immutable(): # TODO: This doesn't fail on all systems; track down which -@pytest.mark.xfail(reason="Parses as Jan 1, 0007 on some systems", - strict=False) +@pytest.mark.xfail(reason="Parses as Jan 1, 0007 on some systems", strict=False) def test_small_year_parsing(): - per1 = Period('0001-01-07', 'D') + per1 = Period("0001-01-07", "D") assert per1.year == 1 assert per1.day == 7 diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 19426c3bf3ffb..f935a7fa880c7 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -8,15 +8,28 @@ import pandas.compat as compat from pandas import ( - DatetimeIndex, Index, NaT, Period, Series, Timedelta, TimedeltaIndex, - Timestamp, isna) + DatetimeIndex, + Index, + NaT, + Period, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, + isna, +) from pandas.core.arrays import PeriodArray from pandas.util import testing as tm -@pytest.mark.parametrize("nat,idx", [(Timestamp("NaT"), DatetimeIndex), - (Timedelta("NaT"), TimedeltaIndex), - (Period("NaT", freq="M"), PeriodArray)]) +@pytest.mark.parametrize( + "nat,idx", + [ + (Timestamp("NaT"), DatetimeIndex), + (Timedelta("NaT"), TimedeltaIndex), + (Period("NaT", freq="M"), PeriodArray), + ], +) def test_nat_fields(nat, idx): for field in idx._field_ops: @@ -72,8 +85,7 @@ def test_nat_vector_field_access(): @pytest.mark.parametrize("klass", [Timestamp, Timedelta, Period]) -@pytest.mark.parametrize("value", [None, np.nan, iNaT, float("nan"), - NaT, "NaT", "nat"]) +@pytest.mark.parametrize("value", [None, np.nan, iNaT, float("nan"), NaT, "NaT", "nat"]) def test_identity(klass, value): assert klass(value) is NaT @@ -98,13 +110,31 @@ def test_round_nat(klass, method, freq): assert round_method(freq) is ts -@pytest.mark.parametrize("method", [ - "astimezone", "combine", "ctime", "dst", "fromordinal", - "fromtimestamp", "isocalendar", "strftime", "strptime", - "time", "timestamp", "timetuple", "timetz", "toordinal", - "tzname", "utcfromtimestamp", "utcnow", "utcoffset", - "utctimetuple", "timestamp" -]) +@pytest.mark.parametrize( + "method", + [ + "astimezone", + "combine", + "ctime", + "dst", + 
"fromordinal", + "fromtimestamp", + "isocalendar", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "toordinal", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "timestamp", + ], +) def test_nat_methods_raise(method): # see gh-9513, gh-17329 msg = "NaTType does not support {method}".format(method=method) @@ -113,38 +143,46 @@ def test_nat_methods_raise(method): getattr(NaT, method)() -@pytest.mark.parametrize("method", [ - "weekday", "isoweekday" -]) +@pytest.mark.parametrize("method", ["weekday", "isoweekday"]) def test_nat_methods_nan(method): # see gh-9513, gh-17329 assert np.isnan(getattr(NaT, method)()) -@pytest.mark.parametrize("method", [ - "date", "now", "replace", "today", - "tz_convert", "tz_localize" -]) +@pytest.mark.parametrize( + "method", ["date", "now", "replace", "today", "tz_convert", "tz_localize"] +) def test_nat_methods_nat(method): # see gh-8254, gh-9513, gh-17329 assert getattr(NaT, method)() is NaT -@pytest.mark.parametrize("get_nat", [ - lambda x: NaT, - lambda x: Timedelta(x), - lambda x: Timestamp(x) -]) +@pytest.mark.parametrize( + "get_nat", [lambda x: NaT, lambda x: Timedelta(x), lambda x: Timestamp(x)] +) def test_nat_iso_format(get_nat): # see gh-12300 assert get_nat("NaT").isoformat() == "NaT" -@pytest.mark.parametrize("klass,expected", [ - (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), - (Timedelta, ["components", "delta", "is_populated", "resolution_string", - "to_pytimedelta", "to_timedelta64", "view"]) -]) +@pytest.mark.parametrize( + "klass,expected", + [ + (Timestamp, ["freqstr", "normalize", "to_julian_date", "to_period", "tz"]), + ( + Timedelta, + [ + "components", + "delta", + "is_populated", + "resolution_string", + "to_pytimedelta", + "to_timedelta64", + "view", + ], + ), + ], +) def test_missing_public_nat_methods(klass, expected): # see gh-17327 # @@ -154,8 +192,7 @@ def test_missing_public_nat_methods(klass, expected): nat_names = dir(NaT) klass_names = dir(klass) - missing = [x for x in klass_names if x not in nat_names and - not x.startswith("_")] + missing = [x for x in klass_names if x not in nat_names and not x.startswith("_")] missing.sort() assert missing == expected @@ -179,9 +216,11 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): nat_names = dir(NaT) klass_names = dir(klass) - overlap = [x for x in nat_names if x in klass_names and - not x.startswith("_") and - callable(getattr(klass, x))] + overlap = [ + x + for x in nat_names + if x in klass_names and not x.startswith("_") and callable(getattr(klass, x)) + ] # Timestamp takes precedence over Timedelta in terms of overlap. 
if klass is Timedelta: @@ -195,18 +234,54 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): return overlap -@pytest.mark.parametrize("klass,expected", [ - (Timestamp, ["astimezone", "ceil", "combine", "ctime", "date", "day_name", - "dst", "floor", "fromisoformat", "fromordinal", - "fromtimestamp", "isocalendar", "isoformat", "isoweekday", - "month_name", "now", "replace", "round", "strftime", - "strptime", "time", "timestamp", "timetuple", "timetz", - "to_datetime64", "to_numpy", "to_pydatetime", "today", - "toordinal", "tz_convert", "tz_localize", "tzname", - "utcfromtimestamp", "utcnow", "utcoffset", "utctimetuple", - "weekday"]), - (Timedelta, ["total_seconds"]) -]) +@pytest.mark.parametrize( + "klass,expected", + [ + ( + Timestamp, + [ + "astimezone", + "ceil", + "combine", + "ctime", + "date", + "day_name", + "dst", + "floor", + "fromisoformat", + "fromordinal", + "fromtimestamp", + "isocalendar", + "isoformat", + "isoweekday", + "month_name", + "now", + "replace", + "round", + "strftime", + "strptime", + "time", + "timestamp", + "timetuple", + "timetz", + "to_datetime64", + "to_numpy", + "to_pydatetime", + "today", + "toordinal", + "tz_convert", + "tz_localize", + "tzname", + "utcfromtimestamp", + "utcnow", + "utcoffset", + "utctimetuple", + "weekday", + ], + ), + (Timedelta, ["total_seconds"]), + ], +) def test_overlap_public_nat_methods(klass, expected): # see gh-17327 # @@ -221,9 +296,12 @@ def test_overlap_public_nat_methods(klass, expected): assert _get_overlap_public_nat_methods(klass) == expected -@pytest.mark.parametrize("compare", ( - _get_overlap_public_nat_methods(Timestamp, True) + - _get_overlap_public_nat_methods(Timedelta, True)) +@pytest.mark.parametrize( + "compare", + ( + _get_overlap_public_nat_methods(Timestamp, True) + + _get_overlap_public_nat_methods(Timedelta, True) + ), ) def test_nat_doc_strings(compare): # see gh-17327 @@ -249,32 +327,42 @@ def test_nat_doc_strings(compare): @pytest.mark.parametrize("op_name", list(_ops.keys())) -@pytest.mark.parametrize("value,val_type", [ - (2, "scalar"), - (1.5, "scalar"), - (np.nan, "scalar"), - (timedelta(3600), "timedelta"), - (Timedelta("5s"), "timedelta"), - (datetime(2014, 1, 1), "timestamp"), - (Timestamp("2014-01-01"), "timestamp"), - (Timestamp("2014-01-01", tz="UTC"), "timestamp"), - (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), - (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), -]) +@pytest.mark.parametrize( + "value,val_type", + [ + (2, "scalar"), + (1.5, "scalar"), + (np.nan, "scalar"), + (timedelta(3600), "timedelta"), + (Timedelta("5s"), "timedelta"), + (datetime(2014, 1, 1), "timestamp"), + (Timestamp("2014-01-01"), "timestamp"), + (Timestamp("2014-01-01", tz="UTC"), "timestamp"), + (Timestamp("2014-01-01", tz="US/Eastern"), "timestamp"), + (pytz.timezone("Asia/Tokyo").localize(datetime(2014, 1, 1)), "timestamp"), + ], +) def test_nat_arithmetic_scalar(op_name, value, val_type): # see gh-6873 invalid_ops = { "scalar": {"right_div_left"}, "timedelta": {"left_times_right", "right_times_left"}, - "timestamp": {"left_times_right", "right_times_left", - "left_div_right", "right_div_left"} + "timestamp": { + "left_times_right", + "right_times_left", + "left_div_right", + "right_div_left", + }, } op = _ops[op_name] if op_name in invalid_ops.get(val_type, set()): - if (val_type == "timedelta" and "times" in op_name and - isinstance(value, Timedelta)): + if ( + val_type == "timedelta" + and "times" in op_name + and isinstance(value, Timedelta) + ): msg = "Cannot 
multiply" else: msg = "unsupported operand type" @@ -290,11 +378,9 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): assert op(NaT, value) is expected -@pytest.mark.parametrize("val,expected", [ - (np.nan, NaT), - (NaT, np.nan), - (np.timedelta64("NaT"), np.nan) -]) +@pytest.mark.parametrize( + "val,expected", [(np.nan, NaT), (NaT, np.nan), (np.timedelta64("NaT"), np.nan)] +) def test_nat_rfloordiv_timedelta(val, expected): # see gh-#18846 # @@ -303,15 +389,18 @@ def test_nat_rfloordiv_timedelta(val, expected): assert td // val is expected -@pytest.mark.parametrize("op_name", [ - "left_plus_right", "right_plus_left", - "left_minus_right", "right_minus_left" -]) -@pytest.mark.parametrize("value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - TimedeltaIndex(["1 day", "2 day"], name="x"), -]) +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", "left_minus_right", "right_minus_left"], +) +@pytest.mark.parametrize( + "value", + [ + DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), + TimedeltaIndex(["1 day", "2 day"], name="x"), + ], +) def test_nat_arithmetic_index(op_name, value): # see gh-11718 exp_name = "x" @@ -325,10 +414,10 @@ def test_nat_arithmetic_index(op_name, value): tm.assert_index_equal(_ops[op_name](NaT, value), expected) -@pytest.mark.parametrize("op_name", [ - "left_plus_right", "right_plus_left", - "left_minus_right", "right_minus_left" -]) +@pytest.mark.parametrize( + "op_name", + ["left_plus_right", "right_plus_left", "left_minus_right", "right_minus_left"], +) @pytest.mark.parametrize("box", [TimedeltaIndex, Series]) def test_nat_arithmetic_td64_vector(op_name, box): # see gh-19124 @@ -350,9 +439,7 @@ def test_to_numpy_alias(): assert isna(expected) and isna(result) -@pytest.mark.parametrize("other", [ - Timedelta(0), Timestamp(0) -]) +@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)]) def test_nat_comparisons(compare_operators_no_eq_ne, other): # GH 26039 assert getattr(NaT, compare_operators_no_eq_ne)(other) is False diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 0fdbcf6ff4647..52f32d41a02ff 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -20,17 +20,22 @@ class TestTimedeltaAdditionSubtraction: __add__, __radd__, __sub__, __rsub__ """ - @pytest.mark.parametrize('ten_seconds', [ - Timedelta(10, unit='s'), - timedelta(seconds=10), - np.timedelta64(10, 's'), - np.timedelta64(10000000000, 'ns'), - pd.offsets.Second(10)]) + + @pytest.mark.parametrize( + "ten_seconds", + [ + Timedelta(10, unit="s"), + timedelta(seconds=10), + np.timedelta64(10, "s"), + np.timedelta64(10000000000, "ns"), + pd.offsets.Second(10), + ], + ) def test_td_add_sub_ten_seconds(self, ten_seconds): # GH#6808 - base = Timestamp('20130101 09:01:12.123456') - expected_add = Timestamp('20130101 09:01:22.123456') - expected_sub = Timestamp('20130101 09:01:02.123456') + base = Timestamp("20130101 09:01:12.123456") + expected_add = Timestamp("20130101 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") result = base + ten_seconds assert result == expected_add @@ -38,17 +43,21 @@ def test_td_add_sub_ten_seconds(self, ten_seconds): result = base - ten_seconds assert result == expected_sub - @pytest.mark.parametrize('one_day_ten_secs', [ - Timedelta('1 day, 00:00:10'), 
- Timedelta('1 days, 00:00:10'), - timedelta(days=1, seconds=10), - np.timedelta64(1, 'D') + np.timedelta64(10, 's'), - pd.offsets.Day() + pd.offsets.Second(10)]) + @pytest.mark.parametrize( + "one_day_ten_secs", + [ + Timedelta("1 day, 00:00:10"), + Timedelta("1 days, 00:00:10"), + timedelta(days=1, seconds=10), + np.timedelta64(1, "D") + np.timedelta64(10, "s"), + pd.offsets.Day() + pd.offsets.Second(10), + ], + ) def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): # GH#6808 - base = Timestamp('20130102 09:01:12.123456') - expected_add = Timestamp('20130103 09:01:22.123456') - expected_sub = Timestamp('20130101 09:01:02.123456') + base = Timestamp("20130102 09:01:12.123456") + expected_add = Timestamp("20130103 09:01:22.123456") + expected_sub = Timestamp("20130101 09:01:02.123456") result = base + one_day_ten_secs assert result == expected_add @@ -56,10 +65,10 @@ def test_td_add_sub_one_day_ten_seconds(self, one_day_ten_secs): result = base - one_day_ten_secs assert result == expected_sub - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_datetimelike_scalar(self, op): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, datetime(2016, 1, 1)) if op is operator.add: @@ -68,57 +77,57 @@ def test_td_add_datetimelike_scalar(self, op): assert isinstance(result, Timestamp) assert result == Timestamp(2016, 1, 11) - result = op(td, Timestamp('2018-01-12 18:09')) + result = op(td, Timestamp("2018-01-12 18:09")) assert isinstance(result, Timestamp) - assert result == Timestamp('2018-01-22 18:09') + assert result == Timestamp("2018-01-22 18:09") - result = op(td, np.datetime64('2018-01-12')) + result = op(td, np.datetime64("2018-01-12")) assert isinstance(result, Timestamp) - assert result == Timestamp('2018-01-22') + assert result == Timestamp("2018-01-22") result = op(td, NaT) assert result is NaT - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_td(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, Timedelta(days=10)) assert isinstance(result, Timedelta) assert result == Timedelta(days=20) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_pytimedelta(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, timedelta(days=9)) assert isinstance(result, Timedelta) assert result == Timedelta(days=19) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_timedelta64(self, op): - td = Timedelta(10, unit='d') - result = op(td, np.timedelta64(-4, 'D')) + td = Timedelta(10, unit="d") + result = op(td, np.timedelta64(-4, "D")) assert isinstance(result, Timedelta) assert result == Timedelta(days=6) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_offset(self, op): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, pd.offsets.Hour(6)) assert isinstance(result, Timedelta) assert result == Timedelta(days=10, hours=6) def test_td_sub_td(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td assert isinstance(result, Timedelta) assert result == expected 
def test_td_sub_pytimedelta(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td.to_pytimedelta() assert isinstance(result, Timedelta) @@ -129,8 +138,8 @@ def test_td_sub_pytimedelta(self): assert result == expected def test_td_sub_timedelta64(self): - td = Timedelta(10, unit='d') - expected = Timedelta(0, unit='ns') + td = Timedelta(10, unit="d") + expected = Timedelta(0, unit="ns") result = td - td.to_timedelta64() assert isinstance(result, Timedelta) @@ -142,13 +151,13 @@ def test_td_sub_timedelta64(self): def test_td_sub_nat(self): # In this context pd.NaT is treated as timedelta-like - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td - NaT assert result is NaT def test_td_sub_td64_nat(self): - td = Timedelta(10, unit='d') - td_nat = np.timedelta64('NaT') + td = Timedelta(10, unit="d") + td_nat = np.timedelta64("NaT") result = td - td_nat assert result is NaT @@ -157,13 +166,13 @@ def test_td_sub_td64_nat(self): assert result is NaT def test_td_sub_offset(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td - pd.offsets.Hour(1) assert isinstance(result, Timedelta) - assert result == Timedelta(239, unit='h') + assert result == Timedelta(239, unit="h") def test_td_add_sub_numeric_raises(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") for other in [2, 2.0, np.int64(2), np.float64(2)]: with pytest.raises(TypeError): td + other @@ -175,67 +184,61 @@ def test_td_add_sub_numeric_raises(self): other - td def test_td_rsub_nat(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = NaT - td assert result is NaT - result = np.datetime64('NaT') - td + result = np.datetime64("NaT") - td assert result is NaT def test_td_rsub_offset(self): - result = pd.offsets.Hour(1) - Timedelta(10, unit='d') + result = pd.offsets.Hour(1) - Timedelta(10, unit="d") assert isinstance(result, Timedelta) - assert result == Timedelta(-239, unit='h') + assert result == Timedelta(-239, unit="h") def test_td_sub_timedeltalike_object_dtype_array(self): # GH#21980 - arr = np.array([Timestamp('20130101 9:01'), - Timestamp('20121230 9:02')]) - exp = np.array([Timestamp('20121231 9:01'), - Timestamp('20121229 9:02')]) - res = arr - Timedelta('1D') + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20121231 9:01"), Timestamp("20121229 9:02")]) + res = arr - Timedelta("1D") tm.assert_numpy_array_equal(res, exp) def test_td_sub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D'), - np.timedelta64(2, 'h')]) - exp = np.array([now - Timedelta('1D'), - Timedelta('0D'), - np.timedelta64(2, 'h') - Timedelta('1D')]) - res = arr - Timedelta('1D') + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) + exp = np.array( + [ + now - Timedelta("1D"), + Timedelta("0D"), + np.timedelta64(2, "h") - Timedelta("1D"), + ] + ) + res = arr - Timedelta("1D") tm.assert_numpy_array_equal(res, exp) def test_td_rsub_mixed_most_timedeltalike_object_dtype_array(self): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D'), - np.timedelta64(2, 'h')]) + arr = np.array([now, Timedelta("1D"), np.timedelta64(2, "h")]) with pytest.raises(TypeError): - Timedelta('1D') - arr + Timedelta("1D") - arr - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) 
def test_td_add_timedeltalike_object_dtype_array(self, op): # GH#21980 - arr = np.array([Timestamp('20130101 9:01'), - Timestamp('20121230 9:02')]) - exp = np.array([Timestamp('20130102 9:01'), - Timestamp('20121231 9:02')]) - res = op(arr, Timedelta('1D')) + arr = np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]) + exp = np.array([Timestamp("20130102 9:01"), Timestamp("20121231 9:02")]) + res = op(arr, Timedelta("1D")) tm.assert_numpy_array_equal(res, exp) - @pytest.mark.parametrize('op', [operator.add, ops.radd]) + @pytest.mark.parametrize("op", [operator.add, ops.radd]) def test_td_add_mixed_timedeltalike_object_dtype_array(self, op): # GH#21980 now = Timestamp.now() - arr = np.array([now, - Timedelta('1D')]) - exp = np.array([now + Timedelta('1D'), - Timedelta('2D')]) - res = op(arr, Timedelta('1D')) + arr = np.array([now, Timedelta("1D")]) + exp = np.array([now + Timedelta("1D"), Timedelta("2D")]) + res = op(arr, Timedelta("1D")) tm.assert_numpy_array_equal(res, exp) @@ -254,25 +257,25 @@ class TestTimedeltaMultiplicationDivision: # --------------------------------------------------------------- # Timedelta.__mul__, __rmul__ - @pytest.mark.parametrize('td_nat', [NaT, - np.timedelta64('NaT', 'ns'), - np.timedelta64('NaT')]) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize( + "td_nat", [NaT, np.timedelta64("NaT", "ns"), np.timedelta64("NaT")] + ) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nat(self, op, td_nat): # GH#19819 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") with pytest.raises(TypeError): op(td, td_nat) - @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')]) - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_nan(self, op, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = op(td, nan) assert result is NaT - @pytest.mark.parametrize('op', [operator.mul, ops.rmul]) + @pytest.mark.parametrize("op", [operator.mul, ops.rmul]) def test_td_mul_scalar(self, op): # GH#19738 td = Timedelta(minutes=3) @@ -301,19 +304,19 @@ def test_td_mul_scalar(self, op): def test_td_div_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / pd.offsets.Hour(1) assert result == 240 assert td / td == 1 - assert td / np.timedelta64(60, 'h') == 4 + assert td / np.timedelta64(60, "h") == 4 assert np.isnan(td / NaT) def test_td_div_numeric_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / 2 assert isinstance(result, Timedelta) @@ -323,10 +326,10 @@ def test_td_div_numeric_scalar(self): assert isinstance(result, Timedelta) assert result == Timedelta(days=2) - @pytest.mark.parametrize('nan', [np.nan, np.float64('NaN'), float('nan')]) + @pytest.mark.parametrize("nan", [np.nan, np.float64("NaN"), float("nan")]) def test_td_div_nan(self, nan): # np.float64('NaN') has a 'dtype' attr, avoid treating as array - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = td / nan assert result is NaT @@ -338,11 +341,11 @@ def test_td_div_nan(self, nan): def test_td_rdiv_timedeltalike_scalar(self): # GH#19738 - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") result = pd.offsets.Hour(1) / td assert result == 1 / 240.0 - assert 
np.timedelta64(60, 'h') / td == 0.25 + assert np.timedelta64(60, "h") / td == 0.25 # --------------------------------------------------------------- # Timedelta.__floordiv__ @@ -362,7 +365,7 @@ def test_td_floordiv_null_scalar(self): assert td // np.nan is NaT assert np.isnan(td // NaT) - assert np.isnan(td // np.timedelta64('NaT')) + assert np.isnan(td // np.timedelta64("NaT")) def test_td_floordiv_offsets(self): # GH#19738 @@ -375,7 +378,7 @@ def test_td_floordiv_invalid_scalar(self): td = Timedelta(hours=3, minutes=4) with pytest.raises(TypeError): - td // np.datetime64('2016-01-01', dtype='datetime64[us]') + td // np.datetime64("2016-01-01", dtype="datetime64[us]") def test_td_floordiv_numeric_scalar(self): # GH#18846 @@ -400,8 +403,7 @@ def test_td_floordiv_timedeltalike_array(self): expected = np.array([3], dtype=np.int64) tm.assert_numpy_array_equal(res, expected) - res = (10 * td) // np.array([scalar.to_timedelta64(), - np.timedelta64('NaT')]) + res = (10 * td) // np.array([scalar.to_timedelta64(), np.timedelta64("NaT")]) expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) @@ -410,7 +412,7 @@ def test_td_floordiv_numeric_series(self): td = Timedelta(hours=3, minutes=4) ser = pd.Series([1], dtype=np.int64) res = td // ser - assert res.dtype.kind == 'm' + assert res.dtype.kind == "m" # --------------------------------------------------------------- # Timedelta.__rfloordiv__ @@ -435,7 +437,7 @@ def test_td_rfloordiv_null_scalar(self): td = Timedelta(hours=3, minutes=3) assert np.isnan(td.__rfloordiv__(NaT)) - assert np.isnan(td.__rfloordiv__(np.timedelta64('NaT'))) + assert np.isnan(td.__rfloordiv__(np.timedelta64("NaT"))) def test_td_rfloordiv_offsets(self): # GH#19738 @@ -445,7 +447,7 @@ def test_td_rfloordiv_invalid_scalar(self): # GH#18846 td = Timedelta(hours=3, minutes=3) - dt64 = np.datetime64('2016-01-01', dtype='datetime64[us]') + dt64 = np.datetime64("2016-01-01", dtype="datetime64[us]") with pytest.raises(TypeError): td.__rfloordiv__(dt64) @@ -477,8 +479,7 @@ def test_td_rfloordiv_timedeltalike_array(self): expected = np.array([3], dtype=np.int64) tm.assert_numpy_array_equal(res, expected) - arr = np.array([(10 * scalar).to_timedelta64(), - np.timedelta64('NaT')]) + arr = np.array([(10 * scalar).to_timedelta64(), np.timedelta64("NaT")]) res = td.__rfloordiv__(arr) expected = np.array([10, np.nan]) tm.assert_numpy_array_equal(res, expected) @@ -516,14 +517,14 @@ def test_mod_timedelta64_nat(self): # GH#19365 td = Timedelta(hours=37) - result = td % np.timedelta64('NaT', 'ns') + result = td % np.timedelta64("NaT", "ns") assert result is NaT def test_mod_timedelta64(self): # GH#19365 td = Timedelta(hours=37) - result = td % np.timedelta64(2, 'h') + result = td % np.timedelta64(2, "h") assert isinstance(result, Timedelta) assert result == Timedelta(hours=1) @@ -557,7 +558,7 @@ def test_mod_invalid(self): td = Timedelta(hours=37) with pytest.raises(TypeError): - td % Timestamp('2018-01-22') + td % Timestamp("2018-01-22") with pytest.raises(TypeError): td % [] @@ -573,7 +574,7 @@ def test_rmod_pytimedelta(self): def test_rmod_timedelta64(self): # GH#19365 td = Timedelta(minutes=3) - result = np.timedelta64(5, 'm') % td + result = np.timedelta64(5, "m") % td assert isinstance(result, Timedelta) assert result == Timedelta(minutes=2) @@ -582,7 +583,7 @@ def test_rmod_invalid(self): td = Timedelta(minutes=3) with pytest.raises(TypeError): - Timestamp('2018-01-22') % td + Timestamp("2018-01-22") % td with pytest.raises(TypeError): 15 % td @@ -601,7 +602,7 @@ 
def test_divmod_numeric(self): td = Timedelta(days=2, hours=6) result = divmod(td, 53 * 3600 * 1e9) - assert result[0] == Timedelta(1, unit='ns') + assert result[0] == Timedelta(1, unit="ns") assert isinstance(result[1], Timedelta) assert result[1] == Timedelta(hours=1) @@ -642,7 +643,7 @@ def test_divmod_invalid(self): td = Timedelta(days=2, hours=6) with pytest.raises(TypeError): - divmod(td, Timestamp('2018-01-22')) + divmod(td, Timestamp("2018-01-22")) def test_rdivmod_pytimedelta(self): # GH#19365 @@ -662,7 +663,7 @@ def test_rdivmod_invalid(self): td = Timedelta(minutes=3) with pytest.raises(TypeError): - divmod(Timestamp('2018-01-22'), td) + divmod(Timestamp("2018-01-22"), td) with pytest.raises(TypeError): divmod(15, td) @@ -675,16 +676,16 @@ def test_rdivmod_invalid(self): # ---------------------------------------------------------------- - @pytest.mark.parametrize('op', [ - operator.mul, - ops.rmul, - operator.truediv, - ops.rdiv, - ops.rsub]) - @pytest.mark.parametrize('arr', [ - np.array([Timestamp('20130101 9:01'), Timestamp('20121230 9:02')]), - np.array([Timestamp.now(), Timedelta('1D')]) - ]) + @pytest.mark.parametrize( + "op", [operator.mul, ops.rmul, operator.truediv, ops.rdiv, ops.rsub] + ) + @pytest.mark.parametrize( + "arr", + [ + np.array([Timestamp("20130101 9:01"), Timestamp("20121230 9:02")]), + np.array([Timestamp.now(), Timedelta("1D")]), + ], + ) def test_td_op_timedelta_timedeltalike_array(self, op, arr): with pytest.raises(TypeError): - op(arr, Timedelta('1D')) + op(arr, Timedelta("1D")) diff --git a/pandas/tests/scalar/timedelta/test_construction.py b/pandas/tests/scalar/timedelta/test_construction.py index 4d24680ac5230..9917e8bc4c9ac 100644 --- a/pandas/tests/scalar/timedelta/test_construction.py +++ b/pandas/tests/scalar/timedelta/test_construction.py @@ -7,102 +7,108 @@ def test_construction(): - expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') - assert Timedelta(10, unit='d').value == expected - assert Timedelta(10.0, unit='d').value == expected - assert Timedelta('10 days').value == expected + expected = np.timedelta64(10, "D").astype("m8[ns]").view("i8") + assert Timedelta(10, unit="d").value == expected + assert Timedelta(10.0, unit="d").value == expected + assert Timedelta("10 days").value == expected assert Timedelta(days=10).value == expected assert Timedelta(days=10.0).value == expected - expected += np.timedelta64(10, 's').astype('m8[ns]').view('i8') - assert Timedelta('10 days 00:00:10').value == expected + expected += np.timedelta64(10, "s").astype("m8[ns]").view("i8") + assert Timedelta("10 days 00:00:10").value == expected assert Timedelta(days=10, seconds=10).value == expected assert Timedelta(days=10, milliseconds=10 * 1000).value == expected - assert Timedelta(days=10, - microseconds=10 * 1000 * 1000).value == expected + assert Timedelta(days=10, microseconds=10 * 1000 * 1000).value == expected # rounding cases assert Timedelta(82739999850000).value == 82739999850000 - assert ('0 days 22:58:59.999850' in str(Timedelta(82739999850000))) + assert "0 days 22:58:59.999850" in str(Timedelta(82739999850000)) assert Timedelta(123072001000000).value == 123072001000000 - assert ('1 days 10:11:12.001' in str(Timedelta(123072001000000))) + assert "1 days 10:11:12.001" in str(Timedelta(123072001000000)) # string conversion with/without leading zero # GH#9570 - assert Timedelta('0:00:00') == timedelta(hours=0) - assert Timedelta('00:00:00') == timedelta(hours=0) - assert Timedelta('-1:00:00') == -timedelta(hours=1) - assert 
Timedelta('-01:00:00') == -timedelta(hours=1) + assert Timedelta("0:00:00") == timedelta(hours=0) + assert Timedelta("00:00:00") == timedelta(hours=0) + assert Timedelta("-1:00:00") == -timedelta(hours=1) + assert Timedelta("-01:00:00") == -timedelta(hours=1) # more strings & abbrevs # GH#8190 - assert Timedelta('1 h') == timedelta(hours=1) - assert Timedelta('1 hour') == timedelta(hours=1) - assert Timedelta('1 hr') == timedelta(hours=1) - assert Timedelta('1 hours') == timedelta(hours=1) - assert Timedelta('-1 hours') == -timedelta(hours=1) - assert Timedelta('1 m') == timedelta(minutes=1) - assert Timedelta('1.5 m') == timedelta(seconds=90) - assert Timedelta('1 minute') == timedelta(minutes=1) - assert Timedelta('1 minutes') == timedelta(minutes=1) - assert Timedelta('1 s') == timedelta(seconds=1) - assert Timedelta('1 second') == timedelta(seconds=1) - assert Timedelta('1 seconds') == timedelta(seconds=1) - assert Timedelta('1 ms') == timedelta(milliseconds=1) - assert Timedelta('1 milli') == timedelta(milliseconds=1) - assert Timedelta('1 millisecond') == timedelta(milliseconds=1) - assert Timedelta('1 us') == timedelta(microseconds=1) - assert Timedelta('1 micros') == timedelta(microseconds=1) - assert Timedelta('1 microsecond') == timedelta(microseconds=1) - assert Timedelta('1.5 microsecond') == Timedelta('00:00:00.000001500') - assert Timedelta('1 ns') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nano') == Timedelta('00:00:00.000000001') - assert Timedelta('1 nanosecond') == Timedelta('00:00:00.000000001') + assert Timedelta("1 h") == timedelta(hours=1) + assert Timedelta("1 hour") == timedelta(hours=1) + assert Timedelta("1 hr") == timedelta(hours=1) + assert Timedelta("1 hours") == timedelta(hours=1) + assert Timedelta("-1 hours") == -timedelta(hours=1) + assert Timedelta("1 m") == timedelta(minutes=1) + assert Timedelta("1.5 m") == timedelta(seconds=90) + assert Timedelta("1 minute") == timedelta(minutes=1) + assert Timedelta("1 minutes") == timedelta(minutes=1) + assert Timedelta("1 s") == timedelta(seconds=1) + assert Timedelta("1 second") == timedelta(seconds=1) + assert Timedelta("1 seconds") == timedelta(seconds=1) + assert Timedelta("1 ms") == timedelta(milliseconds=1) + assert Timedelta("1 milli") == timedelta(milliseconds=1) + assert Timedelta("1 millisecond") == timedelta(milliseconds=1) + assert Timedelta("1 us") == timedelta(microseconds=1) + assert Timedelta("1 micros") == timedelta(microseconds=1) + assert Timedelta("1 microsecond") == timedelta(microseconds=1) + assert Timedelta("1.5 microsecond") == Timedelta("00:00:00.000001500") + assert Timedelta("1 ns") == Timedelta("00:00:00.000000001") + assert Timedelta("1 nano") == Timedelta("00:00:00.000000001") + assert Timedelta("1 nanosecond") == Timedelta("00:00:00.000000001") # combos - assert Timedelta('10 days 1 hour') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h') == timedelta(days=10, hours=1) - assert Timedelta('10 days 1 h 1m 1s') == timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s') == -timedelta( - days=10, hours=1, minutes=1, seconds=1) - assert Timedelta('-10 days 1 h 1m 1s 3us') == -timedelta( - days=10, hours=1, minutes=1, seconds=1, microseconds=3) - assert Timedelta('-10 days 1 h 1.5m 1s 3us') == -timedelta( - days=10, hours=1, minutes=1, seconds=31, microseconds=3) + assert Timedelta("10 days 1 hour") == timedelta(days=10, hours=1) 
+ assert Timedelta("10 days 1 h") == timedelta(days=10, hours=1) + assert Timedelta("10 days 1 h 1m 1s") == timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s") == -timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s") == -timedelta( + days=10, hours=1, minutes=1, seconds=1 + ) + assert Timedelta("-10 days 1 h 1m 1s 3us") == -timedelta( + days=10, hours=1, minutes=1, seconds=1, microseconds=3 + ) + assert Timedelta("-10 days 1 h 1.5m 1s 3us") == -timedelta( + days=10, hours=1, minutes=1, seconds=31, microseconds=3 + ) # Currently invalid as it has a - on the hh:mm:dd part # (only allowed on the days) with pytest.raises(ValueError): - Timedelta('-10 days -1 h 1.5m 1s 3us') + Timedelta("-10 days -1 h 1.5m 1s 3us") # only leading neg signs are allowed with pytest.raises(ValueError): - Timedelta('10 days -1 h 1.5m 1s 3us') + Timedelta("10 days -1 h 1.5m 1s 3us") # no units specified with pytest.raises(ValueError): - Timedelta('3.1415') + Timedelta("3.1415") # invalid construction with pytest.raises(ValueError, match="cannot construct a Timedelta"): Timedelta() with pytest.raises(ValueError, match="unit abbreviation w/o a number"): - Timedelta('foo') + Timedelta("foo") - msg = ("cannot construct a Timedelta from " - "the passed arguments, allowed keywords are ") + msg = ( + "cannot construct a Timedelta from " + "the passed arguments, allowed keywords are " + ) with pytest.raises(ValueError, match=msg): Timedelta(day=10) # floats - expected = np.timedelta64( - 10, 's').astype('m8[ns]').view('i8') + np.timedelta64( - 500, 'ms').astype('m8[ns]').view('i8') - assert Timedelta(10.5, unit='s').value == expected + expected = np.timedelta64(10, "s").astype("m8[ns]").view("i8") + np.timedelta64( + 500, "ms" + ).astype("m8[ns]").view("i8") + assert Timedelta(10.5, unit="s").value == expected # offset assert to_timedelta(offsets.Hour(2)) == Timedelta(hours=2) @@ -110,35 +116,55 @@ def test_construction(): assert Timedelta(offsets.Second(2)) == Timedelta(seconds=2) # GH#11995: unicode - expected = Timedelta('1H') - result = Timedelta('1H') + expected = Timedelta("1H") + result = Timedelta("1H") assert result == expected - assert to_timedelta(offsets.Hour(2)) == Timedelta('0 days, 02:00:00') + assert to_timedelta(offsets.Hour(2)) == Timedelta("0 days, 02:00:00") with pytest.raises(ValueError): - Timedelta('foo bar') - - -@pytest.mark.parametrize('item', list({'days': 'D', - 'seconds': 's', - 'microseconds': 'us', - 'milliseconds': 'ms', - 'minutes': 'm', - 'hours': 'h', - 'weeks': 'W'}.items())) -@pytest.mark.parametrize('npdtype', [np.int64, np.int32, np.int16, - np.float64, np.float32, np.float16]) + Timedelta("foo bar") + + +@pytest.mark.parametrize( + "item", + list( + { + "days": "D", + "seconds": "s", + "microseconds": "us", + "milliseconds": "ms", + "minutes": "m", + "hours": "h", + "weeks": "W", + }.items() + ), +) +@pytest.mark.parametrize( + "npdtype", [np.int64, np.int32, np.int16, np.float64, np.float32, np.float16] +) def test_td_construction_with_np_dtypes(npdtype, item): # GH#8757: test construction with np dtypes pykwarg, npkwarg = item - expected = np.timedelta64(1, npkwarg).astype('m8[ns]').view('i8') + expected = np.timedelta64(1, npkwarg).astype("m8[ns]").view("i8") assert Timedelta(**{pykwarg: npdtype(1)}).value == expected -@pytest.mark.parametrize('val', [ - '1s', '-1s', '1us', '-1us', '1 day', '-1 day', - '-23:59:59.999999', '-1 days +23:59:59.999999', '-1ns', - '1ns', '-23:59:59.999999999']) 
+@pytest.mark.parametrize( + "val", + [ + "1s", + "-1s", + "1us", + "-1us", + "1 day", + "-1 day", + "-23:59:59.999999", + "-1 days +23:59:59.999999", + "-1ns", + "1ns", + "-23:59:59.999999999", + ], +) def test_td_from_repr_roundtrip(val): # round-trip both for string and value td = Timedelta(val) @@ -147,58 +173,100 @@ def test_td_from_repr_roundtrip(val): # str does not normally display nanos if not td.nanoseconds: assert Timedelta(str(td)) == td - assert Timedelta(td._repr_base(format='all')) == td + assert Timedelta(td._repr_base(format="all")) == td def test_overflow_on_construction(): # GH#3374 - value = Timedelta('1day').value * 20169940 + value = Timedelta("1day").value * 20169940 with pytest.raises(OverflowError): Timedelta(value) # xref GH#17637 with pytest.raises(OverflowError): - Timedelta(7 * 19999, unit='D') + Timedelta(7 * 19999, unit="D") with pytest.raises(OverflowError): Timedelta(timedelta(days=13 * 19999)) -@pytest.mark.parametrize('fmt,exp', [ - ('P6DT0H50M3.010010012S', Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P-6DT0H50M3.010010012S', Timedelta(days=-6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12)), - ('P4DT12H30M5S', Timedelta(days=4, hours=12, minutes=30, seconds=5)), - ('P0DT0H0M0.000000123S', Timedelta(nanoseconds=123)), - ('P0DT0H0M0.00001S', Timedelta(microseconds=10)), - ('P0DT0H0M0.001S', Timedelta(milliseconds=1)), - ('P0DT0H1M0S', Timedelta(minutes=1)), - ('P1DT25H61M61S', Timedelta(days=1, hours=25, minutes=61, seconds=61)) -]) +@pytest.mark.parametrize( + "fmt,exp", + [ + ( + "P6DT0H50M3.010010012S", + Timedelta( + days=6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + ), + ( + "P-6DT0H50M3.010010012S", + Timedelta( + days=-6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + ), + ("P4DT12H30M5S", Timedelta(days=4, hours=12, minutes=30, seconds=5)), + ("P0DT0H0M0.000000123S", Timedelta(nanoseconds=123)), + ("P0DT0H0M0.00001S", Timedelta(microseconds=10)), + ("P0DT0H0M0.001S", Timedelta(milliseconds=1)), + ("P0DT0H1M0S", Timedelta(minutes=1)), + ("P1DT25H61M61S", Timedelta(days=1, hours=25, minutes=61, seconds=61)), + ], +) def test_iso_constructor(fmt, exp): assert Timedelta(fmt) == exp -@pytest.mark.parametrize('fmt', [ - 'PPPPPPPPPPPP', 'PDTHMS', 'P0DT999H999M999S', - 'P1DT0H0M0.0000000000000S', 'P1DT0H0M00000000000S', - 'P1DT0H0M0.S']) +@pytest.mark.parametrize( + "fmt", + [ + "PPPPPPPPPPPP", + "PDTHMS", + "P0DT999H999M999S", + "P1DT0H0M0.0000000000000S", + "P1DT0H0M00000000000S", + "P1DT0H0M0.S", + ], +) def test_iso_constructor_raises(fmt): - with pytest.raises(ValueError, match=('Invalid ISO 8601 Duration ' - 'format - {}'.format(fmt))): + with pytest.raises( + ValueError, match=("Invalid ISO 8601 Duration " "format - {}".format(fmt)) + ): Timedelta(fmt) -@pytest.mark.parametrize('constructed_td, conversion', [ - (Timedelta(nanoseconds=100), '100ns'), - (Timedelta(days=1, hours=1, minutes=1, weeks=1, seconds=1, milliseconds=1, - microseconds=1, nanoseconds=1), 694861001001001), - (Timedelta(microseconds=1) + Timedelta(nanoseconds=1), '1us1ns'), - (Timedelta(microseconds=1) - Timedelta(nanoseconds=1), '999ns'), - (Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2), '990ns')]) +@pytest.mark.parametrize( + "constructed_td, conversion", + [ + (Timedelta(nanoseconds=100), "100ns"), + ( + Timedelta( + days=1, + hours=1, + minutes=1, + weeks=1, + seconds=1, + milliseconds=1, + 
microseconds=1, + nanoseconds=1, + ), + 694861001001001, + ), + (Timedelta(microseconds=1) + Timedelta(nanoseconds=1), "1us1ns"), + (Timedelta(microseconds=1) - Timedelta(nanoseconds=1), "999ns"), + (Timedelta(microseconds=1) + 5 * Timedelta(nanoseconds=-2), "990ns"), + ], +) def test_td_constructor_on_nanoseconds(constructed_td, conversion): # GH#9273 assert constructed_td == Timedelta(conversion) @@ -206,4 +274,4 @@ def test_td_constructor_on_nanoseconds(constructed_td, conversion): def test_td_constructor_value_error(): with pytest.raises(TypeError): - Timedelta(nanoseconds='abc') + Timedelta(nanoseconds="abc") diff --git a/pandas/tests/scalar/timedelta/test_formats.py b/pandas/tests/scalar/timedelta/test_formats.py index 7db79f3f832f9..753186ee4b738 100644 --- a/pandas/tests/scalar/timedelta/test_formats.py +++ b/pandas/tests/scalar/timedelta/test_formats.py @@ -3,25 +3,42 @@ from pandas import Timedelta -@pytest.mark.parametrize('td, expected_repr', [ - (Timedelta(10, unit='d'), "Timedelta('10 days 00:00:00')"), - (Timedelta(10, unit='s'), "Timedelta('0 days 00:00:10')"), - (Timedelta(10, unit='ms'), "Timedelta('0 days 00:00:00.010000')"), - (Timedelta(-10, unit='ms'), "Timedelta('-1 days +23:59:59.990000')")]) +@pytest.mark.parametrize( + "td, expected_repr", + [ + (Timedelta(10, unit="d"), "Timedelta('10 days 00:00:00')"), + (Timedelta(10, unit="s"), "Timedelta('0 days 00:00:10')"), + (Timedelta(10, unit="ms"), "Timedelta('0 days 00:00:00.010000')"), + (Timedelta(-10, unit="ms"), "Timedelta('-1 days +23:59:59.990000')"), + ], +) def test_repr(td, expected_repr): assert repr(td) == expected_repr -@pytest.mark.parametrize('td, expected_iso', [ - (Timedelta(days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, - nanoseconds=12), 'P6DT0H50M3.010010012S'), - (Timedelta(days=4, hours=12, minutes=30, seconds=5), 'P4DT12H30M5S'), - (Timedelta(nanoseconds=123), 'P0DT0H0M0.000000123S'), - # trim nano - (Timedelta(microseconds=10), 'P0DT0H0M0.00001S'), - # trim micro - (Timedelta(milliseconds=1), 'P0DT0H0M0.001S'), - # don't strip every 0 - (Timedelta(minutes=1), 'P0DT0H1M0S')]) +@pytest.mark.parametrize( + "td, expected_iso", + [ + ( + Timedelta( + days=6, + minutes=50, + seconds=3, + milliseconds=10, + microseconds=10, + nanoseconds=12, + ), + "P6DT0H50M3.010010012S", + ), + (Timedelta(days=4, hours=12, minutes=30, seconds=5), "P4DT12H30M5S"), + (Timedelta(nanoseconds=123), "P0DT0H0M0.000000123S"), + # trim nano + (Timedelta(microseconds=10), "P0DT0H0M0.00001S"), + # trim micro + (Timedelta(milliseconds=1), "P0DT0H0M0.001S"), + # don't strip every 0 + (Timedelta(minutes=1), "P0DT0H1M0S"), + ], +) def test_isoformat(td, expected_iso): assert td.isoformat() == expected_iso diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 469072970133d..e4980be49d35f 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -8,37 +8,36 @@ from pandas._libs.tslibs import NaT, iNaT import pandas as pd -from pandas import ( - Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta) +from pandas import Series, Timedelta, TimedeltaIndex, timedelta_range, to_timedelta import pandas.util.testing as tm class TestTimedeltaArithmetic: - def test_arithmetic_overflow(self): with pytest.raises(OverflowError): - pd.Timestamp('1700-01-01') + pd.Timedelta(13 * 19999, unit='D') + pd.Timestamp("1700-01-01") + pd.Timedelta(13 * 19999, unit="D") with pytest.raises(OverflowError): - 
pd.Timestamp('1700-01-01') + timedelta(days=13 * 19999) + pd.Timestamp("1700-01-01") + timedelta(days=13 * 19999) def test_array_timedelta_floordiv(self): # https://github.com/pandas-dev/pandas/issues/19761 - ints = pd.date_range('2012-10-08', periods=4, freq='D').view('i8') + ints = pd.date_range("2012-10-08", periods=4, freq="D").view("i8") msg = r"Use 'array // timedelta.value'" with tm.assert_produces_warning(FutureWarning) as m: - result = ints // pd.Timedelta(1, unit='s') + result = ints // pd.Timedelta(1, unit="s") assert msg in str(m[0].message) - expected = np.array([1349654400, 1349740800, 1349827200, 1349913600], - dtype='i8') + expected = np.array( + [1349654400, 1349740800, 1349827200, 1349913600], dtype="i8" + ) tm.assert_numpy_array_equal(result, expected) def test_ops_error_str(self): # GH 13624 - td = Timedelta('1 day') + td = Timedelta("1 day") - for left, right in [(td, 'a'), ('a', td)]: + for left, right in [(td, "a"), ("a", td)]: with pytest.raises(TypeError): left + right @@ -55,7 +54,7 @@ class Other: other = Other() - td = Timedelta('1 day') + td = Timedelta("1 day") assert td.__add__(other) is NotImplemented assert td.__sub__(other) is NotImplemented assert td.__truediv__(other) is NotImplemented @@ -63,17 +62,17 @@ class Other: assert td.__floordiv__(other) is NotImplemented def test_unary_ops(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") # __neg__, __pos__ - assert -td == Timedelta(-10, unit='d') - assert -td == Timedelta('-10d') - assert +td == Timedelta(10, unit='d') + assert -td == Timedelta(-10, unit="d") + assert -td == Timedelta("-10d") + assert +td == Timedelta(10, unit="d") # __abs__, __abs__(__neg__) assert abs(td) == td assert abs(-td) == td - assert abs(-td) == Timedelta('10d') + assert abs(-td) == Timedelta("10d") class TestTimedeltaComparison: @@ -100,8 +99,8 @@ def test_compare_tick(self, tick_classes): def test_comparison_object_array(self): # analogous to GH#15183 - td = Timedelta('2 days') - other = Timedelta('3 hours') + td = Timedelta("2 days") + other = Timedelta("3 hours") arr = np.array([other, td], dtype=object) res = arr == td @@ -109,9 +108,7 @@ def test_comparison_object_array(self): assert (res == expected).all() # 2D case - arr = np.array([[other, td], - [td, other]], - dtype=object) + arr = np.array([[other, td], [td, other]], dtype=object) res = arr != td expected = np.array([[True, False], [False, True]], dtype=bool) assert res.shape == expected.shape @@ -119,7 +116,7 @@ def test_comparison_object_array(self): def test_compare_timedelta_ndarray(self): # GH11835 - periods = [Timedelta('0 days 01:00:00'), Timedelta('0 days 01:00:00')] + periods = [Timedelta("0 days 01:00:00"), Timedelta("0 days 01:00:00")] arr = np.array(periods) result = arr[0] > arr expected = np.array([False, False]) @@ -131,8 +128,8 @@ def test_compare_custom_object(self): Make sure non supported operations on Timedelta returns NonImplemented and yields to other operand (GH#20829). 
""" - class CustomClass: + class CustomClass: def __init__(self, cmp_result=None): self.cmp_result = cmp_result @@ -148,7 +145,7 @@ def __eq__(self, other): def __gt__(self, other): return self.generic_result() - t = Timedelta('1s') + t = Timedelta("1s") assert not (t == "string") assert not (t == 1) @@ -163,7 +160,7 @@ def __gt__(self, other): @pytest.mark.parametrize("val", ["string", 1]) def test_compare_unknown_type(self, val): # GH20829 - t = Timedelta('1s') + t = Timedelta("1s") with pytest.raises(TypeError): t >= val with pytest.raises(TypeError): @@ -175,10 +172,14 @@ def test_compare_unknown_type(self, val): class TestTimedeltas: - - @pytest.mark.parametrize("unit, value, expected", [ - ('us', 9.999, 9999), ('ms', 9.999999, 9999999), - ('s', 9.999999999, 9999999999)]) + @pytest.mark.parametrize( + "unit, value, expected", + [ + ("us", 9.999, 9999), + ("ms", 9.999999, 9999999), + ("s", 9.999999999, 9999999999), + ], + ) def test_rounding_on_int_unit_construction(self, unit, value, expected): # GH 12690 result = Timedelta(value, unit=unit) @@ -188,8 +189,8 @@ def test_rounding_on_int_unit_construction(self, unit, value, expected): def test_total_seconds_scalar(self): # see gh-10939 - rng = Timedelta('1 days, 10:11:12.100123456') - expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9 + rng = Timedelta("1 days, 10:11:12.100123456") + expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456.0 / 1e9 tm.assert_almost_equal(rng.total_seconds(), expt) rng = Timedelta(np.nan) @@ -197,44 +198,42 @@ def test_total_seconds_scalar(self): def test_conversion(self): - for td in [Timedelta(10, unit='d'), - Timedelta('1 days, 10:11:12.012345')]: + for td in [Timedelta(10, unit="d"), Timedelta("1 days, 10:11:12.012345")]: pydt = td.to_pytimedelta() assert td == Timedelta(pydt) assert td == pydt - assert (isinstance(pydt, timedelta) and not isinstance( - pydt, Timedelta)) + assert isinstance(pydt, timedelta) and not isinstance(pydt, Timedelta) - assert td == np.timedelta64(td.value, 'ns') + assert td == np.timedelta64(td.value, "ns") td64 = td.to_timedelta64() - assert td64 == np.timedelta64(td.value, 'ns') + assert td64 == np.timedelta64(td.value, "ns") assert td == td64 assert isinstance(td64, np.timedelta64) # this is NOT equal and cannot be roundtripped (because of the nanos) - td = Timedelta('1 days, 10:11:12.012345678') + td = Timedelta("1 days, 10:11:12.012345678") assert td != td.to_pytimedelta() def test_freq_conversion(self): # truediv - td = Timedelta('1 days 2 hours 3 ns') - result = td / np.timedelta64(1, 'D') + td = Timedelta("1 days 2 hours 3 ns") + result = td / np.timedelta64(1, "D") assert result == td.value / float(86400 * 1e9) - result = td / np.timedelta64(1, 's') + result = td / np.timedelta64(1, "s") assert result == td.value / float(1e9) - result = td / np.timedelta64(1, 'ns') + result = td / np.timedelta64(1, "ns") assert result == td.value # floordiv - td = Timedelta('1 days 2 hours 3 ns') - result = td // np.timedelta64(1, 'D') + td = Timedelta("1 days 2 hours 3 ns") + result = td // np.timedelta64(1, "D") assert result == 1 - result = td // np.timedelta64(1, 's') + result = td // np.timedelta64(1, "s") assert result == 93600 - result = td // np.timedelta64(1, 'ns') + result = td // np.timedelta64(1, "ns") assert result == td.value def test_fields(self): @@ -243,18 +242,18 @@ def check(value): assert isinstance(value, int) # compat to datetime.timedelta - rng = to_timedelta('1 days, 10:11:12') + rng = to_timedelta("1 days, 10:11:12") assert rng.days == 1 assert 
rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 0 assert rng.nanoseconds == 0 msg = "'Timedelta' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # GH 10050 @@ -263,28 +262,28 @@ def check(value): check(rng.microseconds) check(rng.nanoseconds) - td = Timedelta('-1 days, 10:11:12') - assert abs(td) == Timedelta('13:48:48') + td = Timedelta("-1 days, 10:11:12") + assert abs(td) == Timedelta("13:48:48") assert str(td) == "-1 days +10:11:12" - assert -td == Timedelta('0 days 13:48:48') - assert -Timedelta('-1 days, 10:11:12').value == 49728000000000 - assert Timedelta('-1 days, 10:11:12').value == -49728000000000 + assert -td == Timedelta("0 days 13:48:48") + assert -Timedelta("-1 days, 10:11:12").value == 49728000000000 + assert Timedelta("-1 days, 10:11:12").value == -49728000000000 - rng = to_timedelta('-1 days, 10:11:12.100123456') + rng = to_timedelta("-1 days, 10:11:12.100123456") assert rng.days == -1 assert rng.seconds == 10 * 3600 + 11 * 60 + 12 assert rng.microseconds == 100 * 1000 + 123 assert rng.nanoseconds == 456 msg = "'Timedelta' object has no attribute '{}'" - with pytest.raises(AttributeError, match=msg.format('hours')): + with pytest.raises(AttributeError, match=msg.format("hours")): rng.hours - with pytest.raises(AttributeError, match=msg.format('minutes')): + with pytest.raises(AttributeError, match=msg.format("minutes")): rng.minutes - with pytest.raises(AttributeError, match=msg.format('milliseconds')): + with pytest.raises(AttributeError, match=msg.format("milliseconds")): rng.milliseconds # components - tup = pd.to_timedelta(-1, 'us').components + tup = pd.to_timedelta(-1, "us").components assert tup.days == -1 assert tup.hours == 23 assert tup.minutes == 59 @@ -302,7 +301,7 @@ def check(value): check(tup.microseconds) check(tup.nanoseconds) - tup = Timedelta('-1 days 1 us').components + tup = Timedelta("-1 days 1 us").components assert tup.days == -2 assert tup.hours == 23 assert tup.minutes == 59 @@ -313,142 +312,189 @@ def check(value): def test_iso_conversion(self): # GH #21877 - expected = Timedelta(1, unit='s') - assert to_timedelta('P0DT0H0M1S') == expected + expected = Timedelta(1, unit="s") + assert to_timedelta("P0DT0H0M1S") == expected def test_nat_converters(self): - result = to_timedelta('nat').to_numpy() - assert result.dtype.kind == 'M' - assert result.astype('int64') == iNaT + result = to_timedelta("nat").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT - result = to_timedelta('nan').to_numpy() - assert result.dtype.kind == 'M' - assert result.astype('int64') == iNaT + result = to_timedelta("nan").to_numpy() + assert result.dtype.kind == "M" + assert result.astype("int64") == iNaT @pytest.mark.filterwarnings("ignore:M and Y units are deprecated") - @pytest.mark.parametrize('units, np_unit', - [(['Y', 'y'], 'Y'), - (['M'], 'M'), - (['W', 'w'], 'W'), - (['D', 'd', 'days', 'day', 'Days', 'Day'], 'D'), - (['m', 'minute', 'min', 'minutes', 't', - 'Minute', 'Min', 'Minutes', 'T'], 'm'), - (['s', 'seconds', 'sec', 'second', - 'S', 'Seconds', 'Sec', 'Second'], 's'), - (['ms', 
'milliseconds', 'millisecond', 'milli', - 'millis', 'l', 'MS', 'Milliseconds', - 'Millisecond', 'Milli', 'Millis', 'L'], 'ms'), - (['us', 'microseconds', 'microsecond', 'micro', - 'micros', 'u', 'US', 'Microseconds', - 'Microsecond', 'Micro', 'Micros', 'U'], 'us'), - (['ns', 'nanoseconds', 'nanosecond', 'nano', - 'nanos', 'n', 'NS', 'Nanoseconds', - 'Nanosecond', 'Nano', 'Nanos', 'N'], 'ns')]) - @pytest.mark.parametrize('wrapper', [np.array, list, pd.Index]) + @pytest.mark.parametrize( + "units, np_unit", + [ + (["Y", "y"], "Y"), + (["M"], "M"), + (["W", "w"], "W"), + (["D", "d", "days", "day", "Days", "Day"], "D"), + ( + ["m", "minute", "min", "minutes", "t", "Minute", "Min", "Minutes", "T"], + "m", + ), + (["s", "seconds", "sec", "second", "S", "Seconds", "Sec", "Second"], "s"), + ( + [ + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "l", + "MS", + "Milliseconds", + "Millisecond", + "Milli", + "Millis", + "L", + ], + "ms", + ), + ( + [ + "us", + "microseconds", + "microsecond", + "micro", + "micros", + "u", + "US", + "Microseconds", + "Microsecond", + "Micro", + "Micros", + "U", + ], + "us", + ), + ( + [ + "ns", + "nanoseconds", + "nanosecond", + "nano", + "nanos", + "n", + "NS", + "Nanoseconds", + "Nanosecond", + "Nano", + "Nanos", + "N", + ], + "ns", + ), + ], + ) + @pytest.mark.parametrize("wrapper", [np.array, list, pd.Index]) def test_unit_parser(self, units, np_unit, wrapper): # validate all units, GH 6855, GH 21762 for unit in units: # array-likes - expected = TimedeltaIndex([np.timedelta64(i, np_unit) - for i in np.arange(5).tolist()]) + expected = TimedeltaIndex( + [np.timedelta64(i, np_unit) for i in np.arange(5).tolist()] + ) result = to_timedelta(wrapper(range(5)), unit=unit) tm.assert_index_equal(result, expected) result = TimedeltaIndex(wrapper(range(5)), unit=unit) tm.assert_index_equal(result, expected) - if unit == 'M': + if unit == "M": # M is treated as minutes in string repr - expected = TimedeltaIndex([np.timedelta64(i, 'm') - for i in np.arange(5).tolist()]) + expected = TimedeltaIndex( + [np.timedelta64(i, "m") for i in np.arange(5).tolist()] + ) - str_repr = ['{}{}'.format(x, unit) for x in np.arange(5)] + str_repr = ["{}{}".format(x, unit) for x in np.arange(5)] result = to_timedelta(wrapper(str_repr)) tm.assert_index_equal(result, expected) result = TimedeltaIndex(wrapper(str_repr)) tm.assert_index_equal(result, expected) # scalar - expected = Timedelta(np.timedelta64(2, np_unit).astype( - 'timedelta64[ns]')) + expected = Timedelta(np.timedelta64(2, np_unit).astype("timedelta64[ns]")) result = to_timedelta(2, unit=unit) assert result == expected result = Timedelta(2, unit=unit) assert result == expected - if unit == 'M': - expected = Timedelta(np.timedelta64(2, 'm').astype( - 'timedelta64[ns]')) + if unit == "M": + expected = Timedelta(np.timedelta64(2, "m").astype("timedelta64[ns]")) - result = to_timedelta('2{}'.format(unit)) + result = to_timedelta("2{}".format(unit)) assert result == expected - result = Timedelta('2{}'.format(unit)) + result = Timedelta("2{}".format(unit)) assert result == expected - @pytest.mark.parametrize('unit', ['Y', 'y', 'M']) + @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_deprecated(self, unit): with tm.assert_produces_warning(FutureWarning) as w1: Timedelta(10, unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w1[0].message)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w2: + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w2: to_timedelta(10, unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w2[0].message)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False) as w3: + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w3: to_timedelta([1, 2], unit) - msg = r'.* units are deprecated .*' + msg = r".* units are deprecated .*" assert re.match(msg, str(w3[0].message)) def test_numeric_conversions(self): - assert Timedelta(0) == np.timedelta64(0, 'ns') - assert Timedelta(10) == np.timedelta64(10, 'ns') - assert Timedelta(10, unit='ns') == np.timedelta64(10, 'ns') + assert Timedelta(0) == np.timedelta64(0, "ns") + assert Timedelta(10) == np.timedelta64(10, "ns") + assert Timedelta(10, unit="ns") == np.timedelta64(10, "ns") - assert Timedelta(10, unit='us') == np.timedelta64(10, 'us') - assert Timedelta(10, unit='ms') == np.timedelta64(10, 'ms') - assert Timedelta(10, unit='s') == np.timedelta64(10, 's') - assert Timedelta(10, unit='d') == np.timedelta64(10, 'D') + assert Timedelta(10, unit="us") == np.timedelta64(10, "us") + assert Timedelta(10, unit="ms") == np.timedelta64(10, "ms") + assert Timedelta(10, unit="s") == np.timedelta64(10, "s") + assert Timedelta(10, unit="d") == np.timedelta64(10, "D") def test_timedelta_conversions(self): - assert (Timedelta(timedelta(seconds=1)) == - np.timedelta64(1, 's').astype('m8[ns]')) - assert (Timedelta(timedelta(microseconds=1)) == - np.timedelta64(1, 'us').astype('m8[ns]')) - assert (Timedelta(timedelta(days=1)) == - np.timedelta64(1, 'D').astype('m8[ns]')) + assert Timedelta(timedelta(seconds=1)) == np.timedelta64(1, "s").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(microseconds=1)) == np.timedelta64(1, "us").astype( + "m8[ns]" + ) + assert Timedelta(timedelta(days=1)) == np.timedelta64(1, "D").astype("m8[ns]") def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars - td = Timedelta('10m7s') + td = Timedelta("10m7s") assert td.to_timedelta64() == td.to_numpy() def test_round(self): - t1 = Timedelta('1 days 02:34:56.789123456') - t2 = Timedelta('-1 days 02:34:56.789123456') - - for (freq, s1, s2) in [('N', t1, t2), - ('U', Timedelta('1 days 02:34:56.789123000'), - Timedelta('-1 days 02:34:56.789123000')), - ('L', Timedelta('1 days 02:34:56.789000000'), - Timedelta('-1 days 02:34:56.789000000')), - ('S', Timedelta('1 days 02:34:57'), - Timedelta('-1 days 02:34:57')), - ('2S', Timedelta('1 days 02:34:56'), - Timedelta('-1 days 02:34:56')), - ('5S', Timedelta('1 days 02:34:55'), - Timedelta('-1 days 02:34:55')), - ('T', Timedelta('1 days 02:35:00'), - Timedelta('-1 days 02:35:00')), - ('12T', Timedelta('1 days 02:36:00'), - Timedelta('-1 days 02:36:00')), - ('H', Timedelta('1 days 03:00:00'), - Timedelta('-1 days 03:00:00')), - ('d', Timedelta('1 days'), - Timedelta('-1 days'))]: + t1 = Timedelta("1 days 02:34:56.789123456") + t2 = Timedelta("-1 days 02:34:56.789123456") + + for (freq, s1, s2) in [ + ("N", t1, t2), + ( + "U", + Timedelta("1 days 02:34:56.789123000"), + Timedelta("-1 days 02:34:56.789123000"), + ), + ( + "L", + Timedelta("1 days 02:34:56.789000000"), + Timedelta("-1 days 02:34:56.789000000"), + ), + ("S", Timedelta("1 days 02:34:57"), Timedelta("-1 days 02:34:57")), + ("2S", Timedelta("1 days 02:34:56"), Timedelta("-1 days 02:34:56")), + ("5S", Timedelta("1 days 02:34:55"), Timedelta("-1 days 02:34:55")), + ("T", Timedelta("1 days 02:35:00"), Timedelta("-1 days 
02:35:00")), + ("12T", Timedelta("1 days 02:36:00"), Timedelta("-1 days 02:36:00")), + ("H", Timedelta("1 days 03:00:00"), Timedelta("-1 days 03:00:00")), + ("d", Timedelta("1 days"), Timedelta("-1 days")), + ]: r1 = t1.round(freq) assert r1 == s1 r2 = t2.round(freq) @@ -456,51 +502,60 @@ def test_round(self): # invalid for freq, msg in [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]: + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: with pytest.raises(ValueError, match=msg): t1.round(freq) - t1 = timedelta_range('1 days', periods=3, freq='1 min 2 s 3 us') + t1 = timedelta_range("1 days", periods=3, freq="1 min 2 s 3 us") t2 = -1 * t1 - t1a = timedelta_range('1 days', periods=3, freq='1 min 2 s') - t1c = pd.TimedeltaIndex([1, 1, 1], unit='D') + t1a = timedelta_range("1 days", periods=3, freq="1 min 2 s") + t1c = pd.TimedeltaIndex([1, 1, 1], unit="D") # note that negative times round DOWN! so don't give whole numbers - for (freq, s1, s2) in [('N', t1, t2), - ('U', t1, t2), - ('L', t1a, - TimedeltaIndex(['-1 days +00:00:00', - '-2 days +23:58:58', - '-2 days +23:57:56'], - dtype='timedelta64[ns]', - freq=None) - ), - ('S', t1a, - TimedeltaIndex(['-1 days +00:00:00', - '-2 days +23:58:58', - '-2 days +23:57:56'], - dtype='timedelta64[ns]', - freq=None) - ), - ('12T', t1c, - TimedeltaIndex(['-1 days', - '-1 days', - '-1 days'], - dtype='timedelta64[ns]', - freq=None) - ), - ('H', t1c, - TimedeltaIndex(['-1 days', - '-1 days', - '-1 days'], - dtype='timedelta64[ns]', - freq=None) - ), - ('d', t1c, - pd.TimedeltaIndex([-1, -1, -1], unit='D') - )]: + for (freq, s1, s2) in [ + ("N", t1, t2), + ("U", t1, t2), + ( + "L", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "S", + t1a, + TimedeltaIndex( + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "12T", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ( + "H", + t1c, + TimedeltaIndex( + ["-1 days", "-1 days", "-1 days"], + dtype="timedelta64[ns]", + freq=None, + ), + ), + ("d", t1c, pd.TimedeltaIndex([-1, -1, -1], unit="D")), + ]: r1 = t1.round(freq) tm.assert_index_equal(r1, s1) @@ -509,114 +564,115 @@ def test_round(self): # invalid for freq, msg in [ - ('Y', ' is a non-fixed frequency'), - ('M', ' is a non-fixed frequency'), - ('foobar', 'Invalid frequency: foobar')]: + ("Y", " is a non-fixed frequency"), + ("M", " is a non-fixed frequency"), + ("foobar", "Invalid frequency: foobar"), + ]: with pytest.raises(ValueError, match=msg): t1.round(freq) def test_contains(self): # Checking for any NaT-like objects # GH 13603 - td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) - for v in [pd.NaT, None, float('nan'), np.nan]: + td = to_timedelta(range(5), unit="d") + pd.offsets.Hour(1) + for v in [pd.NaT, None, float("nan"), np.nan]: assert not (v in td) td = to_timedelta([pd.NaT]) - for v in [pd.NaT, None, float('nan'), np.nan]: - assert (v in td) + for v in [pd.NaT, None, float("nan"), np.nan]: + assert v in td def test_identity(self): - td = Timedelta(10, unit='d') + td = Timedelta(10, unit="d") assert isinstance(td, Timedelta) assert isinstance(td, timedelta) def test_short_format_converters(self): def conv(v): - return v.astype('m8[ns]') - - assert Timedelta('10') 
== np.timedelta64(10, 'ns') - assert Timedelta('10ns') == np.timedelta64(10, 'ns') - assert Timedelta('100') == np.timedelta64(100, 'ns') - assert Timedelta('100ns') == np.timedelta64(100, 'ns') - - assert Timedelta('1000') == np.timedelta64(1000, 'ns') - assert Timedelta('1000ns') == np.timedelta64(1000, 'ns') - assert Timedelta('1000NS') == np.timedelta64(1000, 'ns') - - assert Timedelta('10us') == np.timedelta64(10000, 'ns') - assert Timedelta('100us') == np.timedelta64(100000, 'ns') - assert Timedelta('1000us') == np.timedelta64(1000000, 'ns') - assert Timedelta('1000Us') == np.timedelta64(1000000, 'ns') - assert Timedelta('1000uS') == np.timedelta64(1000000, 'ns') - - assert Timedelta('1ms') == np.timedelta64(1000000, 'ns') - assert Timedelta('10ms') == np.timedelta64(10000000, 'ns') - assert Timedelta('100ms') == np.timedelta64(100000000, 'ns') - assert Timedelta('1000ms') == np.timedelta64(1000000000, 'ns') - - assert Timedelta('-1s') == -np.timedelta64(1000000000, 'ns') - assert Timedelta('1s') == np.timedelta64(1000000000, 'ns') - assert Timedelta('10s') == np.timedelta64(10000000000, 'ns') - assert Timedelta('100s') == np.timedelta64(100000000000, 'ns') - assert Timedelta('1000s') == np.timedelta64(1000000000000, 'ns') - - assert Timedelta('1d') == conv(np.timedelta64(1, 'D')) - assert Timedelta('-1d') == -conv(np.timedelta64(1, 'D')) - assert Timedelta('1D') == conv(np.timedelta64(1, 'D')) - assert Timedelta('10D') == conv(np.timedelta64(10, 'D')) - assert Timedelta('100D') == conv(np.timedelta64(100, 'D')) - assert Timedelta('1000D') == conv(np.timedelta64(1000, 'D')) - assert Timedelta('10000D') == conv(np.timedelta64(10000, 'D')) + return v.astype("m8[ns]") + + assert Timedelta("10") == np.timedelta64(10, "ns") + assert Timedelta("10ns") == np.timedelta64(10, "ns") + assert Timedelta("100") == np.timedelta64(100, "ns") + assert Timedelta("100ns") == np.timedelta64(100, "ns") + + assert Timedelta("1000") == np.timedelta64(1000, "ns") + assert Timedelta("1000ns") == np.timedelta64(1000, "ns") + assert Timedelta("1000NS") == np.timedelta64(1000, "ns") + + assert Timedelta("10us") == np.timedelta64(10000, "ns") + assert Timedelta("100us") == np.timedelta64(100000, "ns") + assert Timedelta("1000us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000Us") == np.timedelta64(1000000, "ns") + assert Timedelta("1000uS") == np.timedelta64(1000000, "ns") + + assert Timedelta("1ms") == np.timedelta64(1000000, "ns") + assert Timedelta("10ms") == np.timedelta64(10000000, "ns") + assert Timedelta("100ms") == np.timedelta64(100000000, "ns") + assert Timedelta("1000ms") == np.timedelta64(1000000000, "ns") + + assert Timedelta("-1s") == -np.timedelta64(1000000000, "ns") + assert Timedelta("1s") == np.timedelta64(1000000000, "ns") + assert Timedelta("10s") == np.timedelta64(10000000000, "ns") + assert Timedelta("100s") == np.timedelta64(100000000000, "ns") + assert Timedelta("1000s") == np.timedelta64(1000000000000, "ns") + + assert Timedelta("1d") == conv(np.timedelta64(1, "D")) + assert Timedelta("-1d") == -conv(np.timedelta64(1, "D")) + assert Timedelta("1D") == conv(np.timedelta64(1, "D")) + assert Timedelta("10D") == conv(np.timedelta64(10, "D")) + assert Timedelta("100D") == conv(np.timedelta64(100, "D")) + assert Timedelta("1000D") == conv(np.timedelta64(1000, "D")) + assert Timedelta("10000D") == conv(np.timedelta64(10000, "D")) # space - assert Timedelta(' 10000D ') == conv(np.timedelta64(10000, 'D')) - assert Timedelta(' - 10000D ') == -conv(np.timedelta64(10000, 'D')) + assert 
Timedelta(" 10000D ") == conv(np.timedelta64(10000, "D")) + assert Timedelta(" - 10000D ") == -conv(np.timedelta64(10000, "D")) # invalid with pytest.raises(ValueError): - Timedelta('1foo') + Timedelta("1foo") with pytest.raises(ValueError): - Timedelta('foo') + Timedelta("foo") def test_full_format_converters(self): def conv(v): - return v.astype('m8[ns]') + return v.astype("m8[ns]") - d1 = np.timedelta64(1, 'D') + d1 = np.timedelta64(1, "D") - assert Timedelta('1days') == conv(d1) - assert Timedelta('1days,') == conv(d1) - assert Timedelta('- 1days,') == -conv(d1) + assert Timedelta("1days") == conv(d1) + assert Timedelta("1days,") == conv(d1) + assert Timedelta("- 1days,") == -conv(d1) - assert Timedelta('00:00:01') == conv(np.timedelta64(1, 's')) - assert Timedelta('06:00:01') == conv(np.timedelta64(6 * 3600 + 1, 's')) - assert Timedelta('06:00:01.0') == conv( - np.timedelta64(6 * 3600 + 1, 's')) - assert Timedelta('06:00:01.01') == conv(np.timedelta64( - 1000 * (6 * 3600 + 1) + 10, 'ms')) + assert Timedelta("00:00:01") == conv(np.timedelta64(1, "s")) + assert Timedelta("06:00:01") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.0") == conv(np.timedelta64(6 * 3600 + 1, "s")) + assert Timedelta("06:00:01.01") == conv( + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) - assert (Timedelta('- 1days, 00:00:01') == - conv(-d1 + np.timedelta64(1, 's'))) - assert (Timedelta('1days, 06:00:01') == - conv(d1 + np.timedelta64(6 * 3600 + 1, 's'))) - assert (Timedelta('1days, 06:00:01.01') == - conv(d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, 'ms'))) + assert Timedelta("- 1days, 00:00:01") == conv(-d1 + np.timedelta64(1, "s")) + assert Timedelta("1days, 06:00:01") == conv( + d1 + np.timedelta64(6 * 3600 + 1, "s") + ) + assert Timedelta("1days, 06:00:01.01") == conv( + d1 + np.timedelta64(1000 * (6 * 3600 + 1) + 10, "ms") + ) # invalid with pytest.raises(ValueError): - Timedelta('- 1days, 00') + Timedelta("- 1days, 00") def test_overflow(self): # GH 9442 - s = Series(pd.date_range('20130101', periods=100000, freq='H')) - s[0] += pd.Timedelta('1s 1ms') + s = Series(pd.date_range("20130101", periods=100000, freq="H")) + s[0] += pd.Timedelta("1s 1ms") # mean result = (s - s.min()).mean() - expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s) - ).sum()) + expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) # the computation is converted to float so # might be some loss of precision @@ -634,24 +690,24 @@ def test_overflow(self): def test_pickle(self): - v = Timedelta('1 days 10:11:12.0123456') + v = Timedelta("1 days 10:11:12.0123456") v_p = tm.round_trip_pickle(v) assert v == v_p def test_timedelta_hash_equality(self): # GH 11129 - v = Timedelta(1, 'D') + v = Timedelta(1, "D") td = timedelta(days=1) assert hash(v) == hash(td) d = {td: 2} assert d[v] == 2 - tds = timedelta_range('1 second', periods=20) + tds = timedelta_range("1 second", periods=20) assert all(hash(td) == hash(td.to_pytimedelta()) for td in tds) # python timedeltas drop ns resolution - ns_td = Timedelta(1, 'ns') + ns_td = Timedelta(1, "ns") assert hash(ns_td) != hash(ns_td.to_pytimedelta()) def test_implementation_limits(self): @@ -664,65 +720,65 @@ def test_implementation_limits(self): assert max_td.value == np.iinfo(np.int64).max # Beyond lower limit, a NAT before the Overflow - assert (min_td - Timedelta(1, 'ns')) is NaT + assert (min_td - Timedelta(1, "ns")) is NaT with pytest.raises(OverflowError): - min_td - Timedelta(2, 'ns') + min_td - Timedelta(2, 
"ns") with pytest.raises(OverflowError): - max_td + Timedelta(1, 'ns') + max_td + Timedelta(1, "ns") # Same tests using the internal nanosecond values - td = Timedelta(min_td.value - 1, 'ns') + td = Timedelta(min_td.value - 1, "ns") assert td is NaT with pytest.raises(OverflowError): - Timedelta(min_td.value - 2, 'ns') + Timedelta(min_td.value - 2, "ns") with pytest.raises(OverflowError): - Timedelta(max_td.value + 1, 'ns') + Timedelta(max_td.value + 1, "ns") def test_total_seconds_precision(self): # GH 19458 - assert Timedelta('30S').total_seconds() == 30.0 - assert Timedelta('0').total_seconds() == 0.0 - assert Timedelta('-2S').total_seconds() == -2.0 - assert Timedelta('5.324S').total_seconds() == 5.324 - assert (Timedelta('30S').total_seconds() - 30.0) < 1e-20 - assert (30.0 - Timedelta('30S').total_seconds()) < 1e-20 + assert Timedelta("30S").total_seconds() == 30.0 + assert Timedelta("0").total_seconds() == 0.0 + assert Timedelta("-2S").total_seconds() == -2.0 + assert Timedelta("5.324S").total_seconds() == 5.324 + assert (Timedelta("30S").total_seconds() - 30.0) < 1e-20 + assert (30.0 - Timedelta("30S").total_seconds()) < 1e-20 def test_timedelta_arithmetic(self): - data = pd.Series(['nat', '32 days'], dtype='timedelta64[ns]') - deltas = [timedelta(days=1), Timedelta(1, unit='D')] + data = pd.Series(["nat", "32 days"], dtype="timedelta64[ns]") + deltas = [timedelta(days=1), Timedelta(1, unit="D")] for delta in deltas: result_method = data.add(delta) result_operator = data + delta - expected = pd.Series(['nat', '33 days'], dtype='timedelta64[ns]') + expected = pd.Series(["nat", "33 days"], dtype="timedelta64[ns]") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) result_method = data.sub(delta) result_operator = data - delta - expected = pd.Series(['nat', '31 days'], dtype='timedelta64[ns]') + expected = pd.Series(["nat", "31 days"], dtype="timedelta64[ns]") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) # GH 9396 result_method = data.div(delta) result_operator = data / delta - expected = pd.Series([np.nan, 32.], dtype='float64') + expected = pd.Series([np.nan, 32.0], dtype="float64") tm.assert_series_equal(result_operator, expected) tm.assert_series_equal(result_method, expected) def test_apply_to_timedelta(self): - timedelta_NaT = pd.to_timedelta('NaT') + timedelta_NaT = pd.to_timedelta("NaT") - list_of_valid_strings = ['00:00:01', '00:00:02'] + list_of_valid_strings = ["00:00:01", "00:00:02"] a = pd.to_timedelta(list_of_valid_strings) b = Series(list_of_valid_strings).apply(pd.to_timedelta) # Can't compare until apply on a Series gives the correct dtype # assert_series_equal(a, b) - list_of_strings = ['00:00:01', np.nan, pd.NaT, timedelta_NaT] + list_of_strings = ["00:00:01", np.nan, pd.NaT, timedelta_NaT] # TODO: unused? 
a = pd.to_timedelta(list_of_strings) # noqa @@ -731,7 +787,7 @@ def test_apply_to_timedelta(self): # assert_series_equal(a, b) def test_components(self): - rng = timedelta_range('1 days, 10:11:12', periods=2, freq='s') + rng = timedelta_range("1 days, 10:11:12", periods=2, freq="s") rng.components # with nat @@ -743,13 +799,13 @@ def test_components(self): assert result.iloc[1].isna().all() def test_resolution_string(self): - assert Timedelta(days=1).resolution_string == 'D' - assert Timedelta(days=1, hours=6).resolution_string == 'H' - assert Timedelta(days=1, minutes=6).resolution_string == 'T' - assert Timedelta(days=1, seconds=6).resolution_string == 'S' - assert Timedelta(days=1, milliseconds=6).resolution_string == 'L' - assert Timedelta(days=1, microseconds=6).resolution_string == 'U' - assert Timedelta(days=1, nanoseconds=6).resolution_string == 'N' + assert Timedelta(days=1).resolution_string == "D" + assert Timedelta(days=1, hours=6).resolution_string == "H" + assert Timedelta(days=1, minutes=6).resolution_string == "T" + assert Timedelta(days=1, seconds=6).resolution_string == "S" + assert Timedelta(days=1, milliseconds=6).resolution_string == "L" + assert Timedelta(days=1, microseconds=6).resolution_string == "U" + assert Timedelta(days=1, nanoseconds=6).resolution_string == "N" def test_resolution_deprecated(self): # GH#21344 @@ -759,15 +815,18 @@ def test_resolution_deprecated(self): assert "Use Timedelta.resolution_string instead" in str(w[0].message) -@pytest.mark.parametrize('value, expected', [ - (Timedelta('10S'), True), - (Timedelta('-10S'), True), - (Timedelta(10, unit='ns'), True), - (Timedelta(0, unit='ns'), False), - (Timedelta(-10, unit='ns'), True), - (Timedelta(None), True), - (pd.NaT, True), -]) +@pytest.mark.parametrize( + "value, expected", + [ + (Timedelta("10S"), True), + (Timedelta("-10S"), True), + (Timedelta(10, unit="ns"), True), + (Timedelta(0, unit="ns"), False), + (Timedelta(-10, unit="ns"), True), + (Timedelta(None), True), + (pd.NaT, True), + ], +) def test_truthiness(value, expected): # https://github.com/pandas-dev/pandas/issues/21484 assert bool(value) is expected diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 4f20bdbd65ba1..58bd03129f2df 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -29,11 +29,13 @@ def test_overflow_offset_raises(self): # xref https://github.com/statsmodels/statsmodels/issues/3374 # ends up multiplying really large numbers which overflow - stamp = Timestamp('2017-01-13 00:00:00', freq='D') + stamp = Timestamp("2017-01-13 00:00:00", freq="D") offset_overflow = 20169940 * offsets.Day(1) - msg = ("the add operation between " - r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " - "will overflow") + msg = ( + "the add operation between " + r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " + "will overflow" + ) with pytest.raises(OverflowError, match=msg): stamp + offset_overflow @@ -77,7 +79,7 @@ def test_addition_subtraction_types(self): td = timedelta(seconds=1) # build a timestamp with a frequency, since then it supports # addition/subtraction of integers - ts = Timestamp(dt, freq='D') + ts = Timestamp(dt, freq="D") with tm.assert_produces_warning(FutureWarning): # GH#22535 add/sub with integers is deprecated @@ -92,20 +94,23 @@ def test_addition_subtraction_types(self): # Timestamp +/- datetime64 not supported, so not tested (could possibly # assert error 
raised?) - td64 = np.timedelta64(1, 'D') + td64 = np.timedelta64(1, "D") assert type(ts + td64) == Timestamp assert type(ts - td64) == Timestamp - @pytest.mark.parametrize('freq, td, td64', [ - ('S', timedelta(seconds=1), np.timedelta64(1, 's')), - ('min', timedelta(minutes=1), np.timedelta64(1, 'm')), - ('H', timedelta(hours=1), np.timedelta64(1, 'h')), - ('D', timedelta(days=1), np.timedelta64(1, 'D')), - ('W', timedelta(weeks=1), np.timedelta64(1, 'W')), - ('M', None, np.timedelta64(1, 'M')) - ]) + @pytest.mark.parametrize( + "freq, td, td64", + [ + ("S", timedelta(seconds=1), np.timedelta64(1, "s")), + ("min", timedelta(minutes=1), np.timedelta64(1, "m")), + ("H", timedelta(hours=1), np.timedelta64(1, "h")), + ("D", timedelta(days=1), np.timedelta64(1, "D")), + ("W", timedelta(weeks=1), np.timedelta64(1, "W")), + ("M", None, np.timedelta64(1, "M")), + ], + ) def test_addition_subtraction_preserve_frequency(self, freq, td, td64): - ts = Timestamp('2014-03-05 00:00:00', freq=freq) + ts = Timestamp("2014-03-05 00:00:00", freq=freq) original_freq = ts.freq with tm.assert_produces_warning(FutureWarning): @@ -124,20 +129,23 @@ def test_addition_subtraction_preserve_frequency(self, freq, td, td64): assert (ts + td64).freq == original_freq assert (ts - td64).freq == original_freq - @pytest.mark.parametrize('td', [Timedelta(hours=3), - np.timedelta64(3, 'h'), - timedelta(hours=3)]) + @pytest.mark.parametrize( + "td", [Timedelta(hours=3), np.timedelta64(3, "h"), timedelta(hours=3)] + ) def test_radd_tdscalar(self, td): # GH#24775 timedelta64+Timestamp should not raise ts = Timestamp.now() assert td + ts == ts + td - @pytest.mark.parametrize('other,expected_difference', [ - (np.timedelta64(-123, 'ns'), -123), - (np.timedelta64(1234567898, 'ns'), 1234567898), - (np.timedelta64(-123, 'us'), -123000), - (np.timedelta64(-123, 'ms'), -123000000) - ]) + @pytest.mark.parametrize( + "other,expected_difference", + [ + (np.timedelta64(-123, "ns"), -123), + (np.timedelta64(1234567898, "ns"), 1234567898), + (np.timedelta64(-123, "us"), -123000), + (np.timedelta64(-123, "ms"), -123000000), + ], + ) def test_timestamp_add_timedelta64_unit(self, other, expected_difference): ts = Timestamp(datetime.utcnow()) result = ts + other diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index b572b4607108c..4ff0f84327854 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -10,9 +10,9 @@ class TestTimestampComparison: def test_comparison_object_array(self): # GH#15183 - ts = Timestamp('2011-01-03 00:00:00-0500', tz='US/Eastern') - other = Timestamp('2011-01-01 00:00:00-0500', tz='US/Eastern') - naive = Timestamp('2011-01-01 00:00:00') + ts = Timestamp("2011-01-03 00:00:00-0500", tz="US/Eastern") + other = Timestamp("2011-01-01 00:00:00-0500", tz="US/Eastern") + naive = Timestamp("2011-01-01 00:00:00") arr = np.array([other, ts], dtype=object) res = arr == ts @@ -20,9 +20,7 @@ def test_comparison_object_array(self): assert (res == expected).all() # 2D case - arr = np.array([[other, ts], - [ts, other]], - dtype=object) + arr = np.array([[other, ts], [ts, other]], dtype=object) res = arr != ts expected = np.array([[True, False], [False, True]], dtype=bool) assert res.shape == expected.shape @@ -65,27 +63,27 @@ def test_comparison(self): def test_compare_invalid(self): # GH#8058 - val = Timestamp('20130101 12:01:02') - assert not val == 'foo' + val = Timestamp("20130101 12:01:02") + assert not 
val == "foo" assert not val == 10.0 assert not val == 1 assert not val == [] - assert not val == {'foo': 1} + assert not val == {"foo": 1} assert not val == np.float64(1) assert not val == np.int64(1) - assert val != 'foo' + assert val != "foo" assert val != 10.0 assert val != 1 assert val != [] - assert val != {'foo': 1} + assert val != {"foo": 1} assert val != np.float64(1) assert val != np.int64(1) def test_cant_compare_tz_naive_w_aware(self, utc_fixture): # see GH#1404 - a = Timestamp('3/12/2012') - b = Timestamp('3/12/2012', tz=utc_fixture) + a = Timestamp("3/12/2012") + b = Timestamp("3/12/2012", tz=utc_fixture) with pytest.raises(TypeError): a == b @@ -119,15 +117,10 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): def test_timestamp_compare_scalars(self): # case where ndim == 0 lhs = np.datetime64(datetime(2013, 12, 6)) - rhs = Timestamp('now') - nat = Timestamp('nat') + rhs = Timestamp("now") + nat = Timestamp("nat") - ops = {'gt': 'lt', - 'lt': 'gt', - 'ge': 'le', - 'le': 'ge', - 'eq': 'eq', - 'ne': 'ne'} + ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"} for left, right in ops.items(): left_f = getattr(operator, left) @@ -143,7 +136,7 @@ def test_timestamp_compare_scalars(self): def test_timestamp_compare_with_early_datetime(self): # e.g. datetime.min - stamp = Timestamp('2012-01-01') + stamp = Timestamp("2012-01-01") assert not stamp == datetime.min assert not stamp == datetime(1600, 1, 1) @@ -159,7 +152,7 @@ def test_timestamp_compare_with_early_datetime(self): def test_compare_zerodim_array(self): # GH#26916 ts = Timestamp.now() - dt64 = np.datetime64('2016-01-01', 'ns') + dt64 = np.datetime64("2016-01-01", "ns") arr = np.array(dt64) assert arr.ndim == 0 @@ -190,7 +183,7 @@ def __eq__(self, o): return isinstance(o, Inf) inf = Inf() - timestamp = Timestamp('2018-11-30') + timestamp = Timestamp("2018-11-30") for left, right in [(inf, timestamp), (timestamp, inf)]: assert left > right or left < right diff --git a/pandas/tests/scalar/timestamp/test_rendering.py b/pandas/tests/scalar/timestamp/test_rendering.py index 69ea0a810c4ce..6b64b230a0bb9 100644 --- a/pandas/tests/scalar/timestamp/test_rendering.py +++ b/pandas/tests/scalar/timestamp/test_rendering.py @@ -8,18 +8,18 @@ class TestTimestampRendering: - timezones = ['UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/US/Pacific'] + timezones = ["UTC", "Asia/Tokyo", "US/Eastern", "dateutil/US/Pacific"] - @pytest.mark.parametrize('tz', timezones) - @pytest.mark.parametrize('freq', ['D', 'M', 'S', 'N']) - @pytest.mark.parametrize('date', ['2014-03-07', '2014-01-01 09:00', - '2014-01-01 00:00:00.000000001']) + @pytest.mark.parametrize("tz", timezones) + @pytest.mark.parametrize("freq", ["D", "M", "S", "N"]) + @pytest.mark.parametrize( + "date", ["2014-03-07", "2014-01-01 09:00", "2014-01-01 00:00:00.000000001"] + ) def test_repr(self, date, freq, tz): # avoid to match with timezone name freq_repr = "'{0}'".format(freq) - if tz.startswith('dateutil'): - tz_repr = tz.replace('dateutil', '') + if tz.startswith("dateutil"): + tz_repr = tz.replace("dateutil", "") else: tz_repr = tz @@ -50,28 +50,28 @@ def test_repr(self, date, freq, tz): def test_repr_utcoffset(self): # This can cause the tz field to be populated, but it's redundant to # include this information in the date-string. 
- date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) - assert '2014-03-13 00:00:00-0400' in repr(date_with_utc_offset) - assert 'tzoffset' not in repr(date_with_utc_offset) - assert 'pytz.FixedOffset(-240)' in repr(date_with_utc_offset) - expr = repr(date_with_utc_offset).replace("'pytz.FixedOffset(-240)'", - 'pytz.FixedOffset(-240)') + date_with_utc_offset = Timestamp("2014-03-13 00:00:00-0400", tz=None) + assert "2014-03-13 00:00:00-0400" in repr(date_with_utc_offset) + assert "tzoffset" not in repr(date_with_utc_offset) + assert "pytz.FixedOffset(-240)" in repr(date_with_utc_offset) + expr = repr(date_with_utc_offset).replace( + "'pytz.FixedOffset(-240)'", "pytz.FixedOffset(-240)" + ) assert date_with_utc_offset == eval(expr) def test_timestamp_repr_pre1900(self): # pre-1900 - stamp = Timestamp('1850-01-01', tz='US/Eastern') + stamp = Timestamp("1850-01-01", tz="US/Eastern") repr(stamp) - iso8601 = '1850-01-01 01:23:45.012345' - stamp = Timestamp(iso8601, tz='US/Eastern') + iso8601 = "1850-01-01 01:23:45.012345" + stamp = Timestamp(iso8601, tz="US/Eastern") result = repr(stamp) assert iso8601 in result def test_pprint(self): # GH#12622 - nested_obj = {'foo': 1, - 'bar': [{'w': {'a': Timestamp('2011-01-01')}}] * 10} + nested_obj = {"foo": 1, "bar": [{"w": {"a": Timestamp("2011-01-01")}}] * 10} result = pprint.pformat(nested_obj, width=50) expected = r"""{'bar': [{'w': {'a': Timestamp('2011-01-01 00:00:00')}}, {'w': {'a': Timestamp('2011-01-01 00:00:00')}}, diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index b9946796a4e1f..7b0ff83aee5d4 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -25,21 +25,20 @@ class TestTimestampProperties: - def test_properties_business(self): - ts = Timestamp('2017-10-01', freq='B') - control = Timestamp('2017-10-01') + ts = Timestamp("2017-10-01", freq="B") + control = Timestamp("2017-10-01") assert ts.dayofweek == 6 - assert not ts.is_month_start # not a weekday + assert not ts.is_month_start # not a weekday assert not ts.is_quarter_start # not a weekday # Control case: non-business is month/qtr start assert control.is_month_start assert control.is_quarter_start - ts = Timestamp('2017-09-30', freq='B') - control = Timestamp('2017-09-30') + ts = Timestamp("2017-09-30", freq="B") + control = Timestamp("2017-09-30") assert ts.dayofweek == 5 - assert not ts.is_month_end # not a weekday + assert not ts.is_month_end # not a weekday assert not ts.is_quarter_end # not a weekday # Control case: non-business is month/qtr start assert control.is_month_end @@ -52,7 +51,7 @@ def check(value, equal): assert value == equal # GH 10050 - ts = Timestamp('2015-05-10 09:06:03.000100001') + ts = Timestamp("2015-05-10 09:06:03.000100001") check(ts.year, 2015) check(ts.month, 5) check(ts.day, 10) @@ -72,7 +71,7 @@ def check(value, equal): check(ts.daysinmonth, 31) # GH 13303 - ts = Timestamp('2014-12-31 23:59:00-05:00', tz='US/Eastern') + ts = Timestamp("2014-12-31 23:59:00-05:00", tz="US/Eastern") check(ts.year, 2014) check(ts.month, 12) check(ts.day, 31) @@ -90,30 +89,31 @@ def check(value, equal): check(ts.week, 1) check(ts.daysinmonth, 31) - ts = Timestamp('2014-01-01 00:00:00+01:00') - starts = ['is_month_start', 'is_quarter_start', 'is_year_start'] + ts = Timestamp("2014-01-01 00:00:00+01:00") + starts = ["is_month_start", "is_quarter_start", "is_year_start"] for start in starts: assert getattr(ts, start) - ts = 
Timestamp('2014-12-31 23:59:59+01:00') - ends = ['is_month_end', 'is_year_end', 'is_quarter_end'] + ts = Timestamp("2014-12-31 23:59:59+01:00") + ends = ["is_month_end", "is_year_end", "is_quarter_end"] for end in ends: assert getattr(ts, end) # GH 12806 - @pytest.mark.parametrize('data', - [Timestamp('2017-08-28 23:00:00'), - Timestamp('2017-08-28 23:00:00', tz='EST')]) - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "data", + [Timestamp("2017-08-28 23:00:00"), Timestamp("2017-08-28 23:00:00", tz="EST")], + ) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_names(self, data, time_locale): # GH 17354 # Test .weekday_name, .day_name(), .month_name - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert data.weekday_name == 'Monday' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert data.weekday_name == "Monday" if time_locale is None: - expected_day = 'Monday' - expected_month = 'August' + expected_day = "Monday" + expected_month = "August" else: with tm.set_locale(time_locale, locale.LC_TIME): expected_day = calendar.day_name[0].capitalize() @@ -127,7 +127,7 @@ def test_names(self, data, time_locale): expected_day = unicodedata.normalize("NFD", expected_day) expected_month = unicodedata.normalize("NFD", expected_month) - result_day = unicodedata.normalize("NFD", result_day,) + result_day = unicodedata.normalize("NFD", result_day) result_month = unicodedata.normalize("NFD", result_month) assert result_day == expected_day @@ -141,17 +141,17 @@ def test_names(self, data, time_locale): def test_is_leap_year(self, tz_naive_fixture): tz = tz_naive_fixture # GH 13727 - dt = Timestamp('2000-01-01 00:00:00', tz=tz) + dt = Timestamp("2000-01-01 00:00:00", tz=tz) assert dt.is_leap_year assert isinstance(dt.is_leap_year, bool) - dt = Timestamp('1999-01-01 00:00:00', tz=tz) + dt = Timestamp("1999-01-01 00:00:00", tz=tz) assert not dt.is_leap_year - dt = Timestamp('2004-01-01 00:00:00', tz=tz) + dt = Timestamp("2004-01-01 00:00:00", tz=tz) assert dt.is_leap_year - dt = Timestamp('2100-01-01 00:00:00', tz=tz) + dt = Timestamp("2100-01-01 00:00:00", tz=tz) assert not dt.is_leap_year def test_woy_boundary(self): @@ -181,43 +181,60 @@ def test_woy_boundary(self): expected = 53 # ISO standard assert result == expected - result = np.array([Timestamp(datetime(*args)).week - for args in [(2000, 1, 1), (2000, 1, 2), ( - 2005, 1, 1), (2005, 1, 2)]]) + result = np.array( + [ + Timestamp(datetime(*args)).week + for args in [(2000, 1, 1), (2000, 1, 2), (2005, 1, 1), (2005, 1, 2)] + ] + ) assert (result == [52, 52, 53, 53]).all() def test_resolution(self): # GH#21336, GH#21365 - dt = Timestamp('2100-01-01 00:00:00') + dt = Timestamp("2100-01-01 00:00:00") assert dt.resolution == Timedelta(nanoseconds=1) class TestTimestampConstructors: - def test_constructor(self): - base_str = '2014-07-01 09:00' + base_str = "2014-07-01 09:00" base_dt = datetime(2014, 7, 1, 9) base_expected = 1404205200000000000 # confirm base representation is correct import calendar - assert (calendar.timegm(base_dt.timetuple()) * 1000000000 == - base_expected) - - tests = [(base_str, base_dt, base_expected), - ('2014-07-01 10:00', datetime(2014, 7, 1, 10), - base_expected + 3600 * 1000000000), - ('2014-07-01 09:00:00.000008000', - datetime(2014, 7, 1, 9, 0, 0, 8), - base_expected + 8000), - ('2014-07-01 
09:00:00.000000005', - Timestamp('2014-07-01 09:00:00.000000005'), - base_expected + 5)] - - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] + + assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + + tests = [ + (base_str, base_dt, base_expected), + ( + "2014-07-01 10:00", + datetime(2014, 7, 1, 10), + base_expected + 3600 * 1000000000, + ), + ( + "2014-07-01 09:00:00.000008000", + datetime(2014, 7, 1, 9, 0, 0, 8), + base_expected + 8000, + ), + ( + "2014-07-01 09:00:00.000000005", + Timestamp("2014-07-01 09:00:00.000000005"), + base_expected + 5, + ), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] for date_str, date, expected in tests: for result in [Timestamp(date_str), Timestamp(date)]: @@ -232,8 +249,7 @@ def test_constructor(self): # with timezone for tz, offset in timezones: - for result in [Timestamp(date_str, tz=tz), Timestamp(date, - tz=tz)]: + for result in [Timestamp(date_str, tz=tz), Timestamp(date, tz=tz)]: expected_tz = expected - offset * 3600 * 1000000000 assert result.value == expected_tz assert conversion.pydt_to_i8(result) == expected_tz @@ -245,34 +261,41 @@ def test_constructor(self): # should convert to UTC if tz is not None: - result = Timestamp(result).tz_convert('UTC') + result = Timestamp(result).tz_convert("UTC") else: - result = Timestamp(result, tz='UTC') + result = Timestamp(result, tz="UTC") expected_utc = expected - offset * 3600 * 1000000000 assert result.value == expected_utc assert conversion.pydt_to_i8(result) == expected_utc def test_constructor_with_stringoffset(self): # GH 7833 - base_str = '2014-07-01 11:00:00+02:00' + base_str = "2014-07-01 11:00:00+02:00" base_dt = datetime(2014, 7, 1, 9) base_expected = 1404205200000000000 # confirm base representation is correct import calendar - assert (calendar.timegm(base_dt.timetuple()) * 1000000000 == - base_expected) - tests = [(base_str, base_expected), - ('2014-07-01 12:00:00+02:00', - base_expected + 3600 * 1000000000), - ('2014-07-01 11:00:00.000008000+02:00', base_expected + 8000), - ('2014-07-01 11:00:00.000000005+02:00', base_expected + 5)] - - timezones = [(None, 0), ('UTC', 0), (pytz.utc, 0), ('Asia/Tokyo', 9), - ('US/Eastern', -4), ('dateutil/US/Pacific', -7), - (pytz.FixedOffset(-180), -3), - (dateutil.tz.tzoffset(None, 18000), 5)] + assert calendar.timegm(base_dt.timetuple()) * 1000000000 == base_expected + + tests = [ + (base_str, base_expected), + ("2014-07-01 12:00:00+02:00", base_expected + 3600 * 1000000000), + ("2014-07-01 11:00:00.000008000+02:00", base_expected + 8000), + ("2014-07-01 11:00:00.000000005+02:00", base_expected + 5), + ] + + timezones = [ + (None, 0), + ("UTC", 0), + (pytz.utc, 0), + ("Asia/Tokyo", 9), + ("US/Eastern", -4), + ("dateutil/US/Pacific", -7), + (pytz.FixedOffset(-180), -3), + (dateutil.tz.tzoffset(None, 18000), 5), + ] for date_str, expected in tests: for result in [Timestamp(date_str)]: @@ -298,23 +321,23 @@ def test_constructor_with_stringoffset(self): assert conversion.pydt_to_i8(result) == expected_tz # should convert to UTC - result = Timestamp(result).tz_convert('UTC') + result = Timestamp(result).tz_convert("UTC") expected_utc = expected assert result.value == expected_utc assert conversion.pydt_to_i8(result) == 
expected_utc # This should be 2013-11-01 05:00 in UTC # converted to Chicago tz - result = Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago') - assert result.value == Timestamp('2013-11-01 05:00').value + result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") + assert result.value == Timestamp("2013-11-01 05:00").value expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa assert repr(result) == expected assert result == eval(repr(result)) # This should be 2013-11-01 05:00 in UTC # converted to Tokyo tz (+09:00) - result = Timestamp('2013-11-01 00:00:00-0500', tz='Asia/Tokyo') - assert result.value == Timestamp('2013-11-01 05:00').value + result = Timestamp("2013-11-01 00:00:00-0500", tz="Asia/Tokyo") + assert result.value == Timestamp("2013-11-01 05:00").value expected = "Timestamp('2013-11-01 14:00:00+0900', tz='Asia/Tokyo')" assert repr(result) == expected assert result == eval(repr(result)) @@ -337,41 +360,43 @@ def test_constructor_with_stringoffset(self): assert result == eval(repr(result)) def test_constructor_invalid(self): - with pytest.raises(TypeError, match='Cannot convert input'): + with pytest.raises(TypeError, match="Cannot convert input"): Timestamp(slice(2)) - with pytest.raises(ValueError, match='Cannot convert Period'): - Timestamp(Period('1000-01-01')) + with pytest.raises(ValueError, match="Cannot convert Period"): + Timestamp(Period("1000-01-01")) def test_constructor_invalid_tz(self): # GH#17690 - with pytest.raises(TypeError, match='must be a datetime.tzinfo'): - Timestamp('2017-10-22', tzinfo='US/Eastern') + with pytest.raises(TypeError, match="must be a datetime.tzinfo"): + Timestamp("2017-10-22", tzinfo="US/Eastern") - with pytest.raises(ValueError, match='at most one of'): - Timestamp('2017-10-22', tzinfo=utc, tz='UTC') + with pytest.raises(ValueError, match="at most one of"): + Timestamp("2017-10-22", tzinfo=utc, tz="UTC") with pytest.raises(ValueError, match="Invalid frequency:"): # GH#5168 # case where user tries to pass tz as an arg, not kwarg, gets # interpreted as a `freq` - Timestamp('2012-01-01', 'US/Pacific') + Timestamp("2012-01-01", "US/Pacific") def test_constructor_strptime(self): # GH25016 # Test support for Timestamp.strptime - fmt = '%Y%m%d-%H%M%S-%f%z' - ts = '20190129-235348-000001+0000' + fmt = "%Y%m%d-%H%M%S-%f%z" + ts = "20190129-235348-000001+0000" with pytest.raises(NotImplementedError): Timestamp.strptime(ts, fmt) def test_constructor_tz_or_tzinfo(self): # GH#17943, GH#17690, GH#5168 - stamps = [Timestamp(year=2017, month=10, day=22, tz='UTC'), - Timestamp(year=2017, month=10, day=22, tzinfo=utc), - Timestamp(year=2017, month=10, day=22, tz=utc), - Timestamp(datetime(2017, 10, 22), tzinfo=utc), - Timestamp(datetime(2017, 10, 22), tz='UTC'), - Timestamp(datetime(2017, 10, 22), tz=utc)] + stamps = [ + Timestamp(year=2017, month=10, day=22, tz="UTC"), + Timestamp(year=2017, month=10, day=22, tzinfo=utc), + Timestamp(year=2017, month=10, day=22, tz=utc), + Timestamp(datetime(2017, 10, 22), tzinfo=utc), + Timestamp(datetime(2017, 10, 22), tz="UTC"), + Timestamp(datetime(2017, 10, 22), tz=utc), + ] assert all(ts == stamps[0] for ts in stamps) def test_constructor_positional(self): @@ -388,10 +413,10 @@ def test_constructor_positional(self): Timestamp(2000, 1, 32) # see gh-11630 - assert (repr(Timestamp(2015, 11, 12)) == - repr(Timestamp('20151112'))) - assert (repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == - repr(Timestamp('2015-11-12 01:02:03.999999'))) + assert repr(Timestamp(2015, 11, 12)) 
== repr(Timestamp("20151112")) + assert repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)) == repr( + Timestamp("2015-11-12 01:02:03.999999") + ) def test_constructor_keyword(self): # GH 10758 @@ -406,23 +431,32 @@ def test_constructor_keyword(self): with pytest.raises(ValueError): Timestamp(year=2000, month=1, day=32) - assert (repr(Timestamp(year=2015, month=11, day=12)) == - repr(Timestamp('20151112'))) - - assert (repr(Timestamp(year=2015, month=11, day=12, hour=1, minute=2, - second=3, microsecond=999999)) == - repr(Timestamp('2015-11-12 01:02:03.999999'))) + assert repr(Timestamp(year=2015, month=11, day=12)) == repr( + Timestamp("20151112") + ) + + assert repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) + ) == repr(Timestamp("2015-11-12 01:02:03.999999")) def test_constructor_fromordinal(self): base = datetime(2000, 1, 1) - ts = Timestamp.fromordinal(base.toordinal(), freq='D') + ts = Timestamp.fromordinal(base.toordinal(), freq="D") assert base == ts - assert ts.freq == 'D' + assert ts.freq == "D" assert base.toordinal() == ts.toordinal() - ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') - assert Timestamp('2000-01-01', tz='US/Eastern') == ts + ts = Timestamp.fromordinal(base.toordinal(), tz="US/Eastern") + assert Timestamp("2000-01-01", tz="US/Eastern") == ts assert base.toordinal() == ts.toordinal() # GH#3042 @@ -431,37 +465,69 @@ def test_constructor_fromordinal(self): assert ts.to_pydatetime() == dt # with a tzinfo - stamp = Timestamp('2011-4-16', tz='US/Eastern') + stamp = Timestamp("2011-4-16", tz="US/Eastern") dt_tz = stamp.to_pydatetime() - ts = Timestamp.fromordinal(dt_tz.toordinal(), tz='US/Eastern') + ts = Timestamp.fromordinal(dt_tz.toordinal(), tz="US/Eastern") assert ts.to_pydatetime() == dt_tz - @pytest.mark.parametrize('result', [ - Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), - Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, - microsecond=6, nanosecond=1), - Timestamp(year=2000, month=1, day=2, hour=3, minute=4, second=5, - microsecond=6, nanosecond=1, tz='UTC'), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC)]) + @pytest.mark.parametrize( + "result", + [ + Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), nanosecond=1), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + ), + Timestamp( + year=2000, + month=1, + day=2, + hour=3, + minute=4, + second=5, + microsecond=6, + nanosecond=1, + tz="UTC", + ), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), + Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + ], + ) def test_constructor_nanosecond(self, result): # GH 18898 expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) expected = expected + Timedelta(nanoseconds=1) assert result == expected - @pytest.mark.parametrize('z', ['Z0', 'Z00']) + @pytest.mark.parametrize("z", ["Z0", "Z00"]) def test_constructor_invalid_Z0_isostring(self, z): # GH 8910 with pytest.raises(ValueError): - Timestamp('2014-11-02 01:00{}'.format(z)) - - @pytest.mark.parametrize('arg', ['year', 'month', 'day', 'hour', 'minute', - 'second', 'microsecond', 'nanosecond']) + Timestamp("2014-11-02 01:00{}".format(z)) + + @pytest.mark.parametrize( + "arg", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + ], + ) def test_invalid_date_kwarg_with_string_input(self, arg): kwarg = {arg: 1} with pytest.raises(ValueError): - 
Timestamp('2010-10-10 12:59:59.999999999', **kwarg) + Timestamp("2010-10-10 12:59:59.999999999", **kwarg) def test_out_of_bounds_integer_value(self): # GH#26651 check that we raise OutOfBoundsDatetime, not OverflowError @@ -471,12 +537,12 @@ def test_out_of_bounds_integer_value(self): Timestamp(Timestamp.min.value * 2) def test_out_of_bounds_value(self): - one_us = np.timedelta64(1).astype('timedelta64[us]') + one_us = np.timedelta64(1).astype("timedelta64[us]") # By definition we can't go out of bounds in [ns], so we # convert the datetime64s to [us] so we can go out of bounds - min_ts_us = np.datetime64(Timestamp.min).astype('M8[us]') - max_ts_us = np.datetime64(Timestamp.max).astype('M8[us]') + min_ts_us = np.datetime64(Timestamp.min).astype("M8[us]") + max_ts_us = np.datetime64(Timestamp.max).astype("M8[us]") # No error for the min/max datetimes Timestamp(min_ts_us) @@ -492,33 +558,33 @@ def test_out_of_bounds_value(self): def test_out_of_bounds_string(self): with pytest.raises(ValueError): - Timestamp('1676-01-01') + Timestamp("1676-01-01") with pytest.raises(ValueError): - Timestamp('2263-01-01') + Timestamp("2263-01-01") def test_barely_out_of_bounds(self): # GH#19529 # GH#19382 close enough to bounds that dropping nanos would result # in an in-bounds datetime with pytest.raises(OutOfBoundsDatetime): - Timestamp('2262-04-11 23:47:16.854775808') + Timestamp("2262-04-11 23:47:16.854775808") def test_bounds_with_different_units(self): - out_of_bounds_dates = ('1677-09-21', '2262-04-12') + out_of_bounds_dates = ("1677-09-21", "2262-04-12") - time_units = ('D', 'h', 'm', 's', 'ms', 'us') + time_units = ("D", "h", "m", "s", "ms", "us") for date_string in out_of_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype='M8[%s]' % unit) + dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) with pytest.raises(ValueError): Timestamp(dt64) - in_bounds_dates = ('1677-09-23', '2262-04-11') + in_bounds_dates = ("1677-09-23", "2262-04-11") for date_string in in_bounds_dates: for unit in time_units: - dt64 = np.datetime64(date_string, dtype='M8[%s]' % unit) + dt64 = np.datetime64(date_string, dtype="M8[%s]" % unit) Timestamp(dt64) def test_min_valid(self): @@ -531,12 +597,12 @@ def test_max_valid(self): def test_now(self): # GH#9000 - ts_from_string = Timestamp('now') + ts_from_string = Timestamp("now") ts_from_method = Timestamp.now() ts_datetime = datetime.now() - ts_from_string_tz = Timestamp('now', tz='US/Eastern') - ts_from_method_tz = Timestamp.now(tz='US/Eastern') + ts_from_string_tz = Timestamp("now", tz="US/Eastern") + ts_from_method_tz = Timestamp.now(tz="US/Eastern") # Check that the delta between the times is less than 1s (arbitrarily # small) @@ -544,16 +610,21 @@ def test_now(self): assert abs(ts_from_method - ts_from_string) < delta assert abs(ts_datetime - ts_from_method) < delta assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert (abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) def test_today(self): - ts_from_string = Timestamp('today') + ts_from_string = Timestamp("today") ts_from_method = Timestamp.today() ts_datetime = datetime.today() - ts_from_string_tz = Timestamp('today', tz='US/Eastern') - ts_from_method_tz = Timestamp.today(tz='US/Eastern') + ts_from_string_tz = Timestamp("today", tz="US/Eastern") + ts_from_method_tz = Timestamp.today(tz="US/Eastern") # Check that the delta 
between the times is less than 1s (arbitrarily # small) @@ -561,26 +632,33 @@ def test_today(self): assert abs(ts_from_method - ts_from_string) < delta assert abs(ts_datetime - ts_from_method) < delta assert abs(ts_from_method_tz - ts_from_string_tz) < delta - assert (abs(ts_from_string_tz.tz_localize(None) - - ts_from_method_tz.tz_localize(None)) < delta) - - @pytest.mark.parametrize('tz', [None, pytz.timezone('US/Pacific')]) + assert ( + abs( + ts_from_string_tz.tz_localize(None) + - ts_from_method_tz.tz_localize(None) + ) + < delta + ) + + @pytest.mark.parametrize("tz", [None, pytz.timezone("US/Pacific")]) def test_disallow_setting_tz(self, tz): # GH 3746 - ts = Timestamp('2010') + ts = Timestamp("2010") with pytest.raises(AttributeError): ts.tz = tz - @pytest.mark.parametrize('offset', ['+0300', '+0200']) + @pytest.mark.parametrize("offset", ["+0300", "+0200"]) def test_construct_timestamp_near_dst(self, offset): # GH 20854 - expected = Timestamp('2016-10-30 03:00:00{}'.format(offset), - tz='Europe/Helsinki') - result = Timestamp(expected).tz_convert('Europe/Helsinki') + expected = Timestamp( + "2016-10-30 03:00:00{}".format(offset), tz="Europe/Helsinki" + ) + result = Timestamp(expected).tz_convert("Europe/Helsinki") assert result == expected - @pytest.mark.parametrize('arg', [ - '2013/01/01 00:00:00+09:00', '2013-01-01 00:00:00+09:00']) + @pytest.mark.parametrize( + "arg", ["2013/01/01 00:00:00+09:00", "2013-01-01 00:00:00+09:00"] + ) def test_construct_with_different_string_format(self, arg): # GH 12064 result = Timestamp(arg) @@ -589,21 +667,21 @@ def test_construct_with_different_string_format(self, arg): def test_construct_timestamp_preserve_original_frequency(self): # GH 22311 - result = Timestamp(Timestamp('2010-08-08', freq='D')).freq + result = Timestamp(Timestamp("2010-08-08", freq="D")).freq expected = offsets.Day() assert result == expected def test_constructor_invalid_frequency(self): # GH 22311 with pytest.raises(ValueError, match="Invalid frequency:"): - Timestamp('2012-01-01', freq=[]) + Timestamp("2012-01-01", freq=[]) - @pytest.mark.parametrize('box', [datetime, Timestamp]) + @pytest.mark.parametrize("box", [datetime, Timestamp]) def test_depreciate_tz_and_tzinfo_in_datetime_input(self, box): # GH 23579 - kwargs = {'year': 2018, 'month': 1, 'day': 1, 'tzinfo': utc} + kwargs = {"year": 2018, "month": 1, "day": 1, "tzinfo": utc} with tm.assert_produces_warning(FutureWarning): - Timestamp(box(**kwargs), tz='US/Pacific') + Timestamp(box(**kwargs), tz="US/Pacific") def test_dont_convert_dateutil_utc_to_pytz_utc(self): result = Timestamp(datetime(2018, 1, 1), tz=tzutc()) @@ -624,83 +702,92 @@ class SubDatetime(datetime): class TestTimestamp: - def test_tz(self): - tstr = '2014-02-01 09:00' + tstr = "2014-02-01 09:00" ts = Timestamp(tstr) - local = ts.tz_localize('Asia/Tokyo') + local = ts.tz_localize("Asia/Tokyo") assert local.hour == 9 - assert local == Timestamp(tstr, tz='Asia/Tokyo') - conv = local.tz_convert('US/Eastern') - assert conv == Timestamp('2014-01-31 19:00', tz='US/Eastern') + assert local == Timestamp(tstr, tz="Asia/Tokyo") + conv = local.tz_convert("US/Eastern") + assert conv == Timestamp("2014-01-31 19:00", tz="US/Eastern") assert conv.hour == 19 # preserves nanosecond ts = Timestamp(tstr) + offsets.Nano(5) - local = ts.tz_localize('Asia/Tokyo') + local = ts.tz_localize("Asia/Tokyo") assert local.hour == 9 assert local.nanosecond == 5 - conv = local.tz_convert('US/Eastern') + conv = local.tz_convert("US/Eastern") assert conv.nanosecond == 5 assert 
conv.hour == 19 def test_utc_z_designator(self): - assert get_timezone(Timestamp('2014-11-02 01:00Z').tzinfo) is utc + assert get_timezone(Timestamp("2014-11-02 01:00Z").tzinfo) is utc def test_asm8(self): np.random.seed(7960929) ns = [Timestamp.min.value, Timestamp.max.value, 1000] for n in ns: - assert (Timestamp(n).asm8.view('i8') == - np.datetime64(n, 'ns').view('i8') == n) + assert ( + Timestamp(n).asm8.view("i8") == np.datetime64(n, "ns").view("i8") == n + ) - assert (Timestamp('nat').asm8.view('i8') == - np.datetime64('nat', 'ns').view('i8')) + assert Timestamp("nat").asm8.view("i8") == np.datetime64("nat", "ns").view("i8") def test_class_ops_pytz(self): def compare(x, y): - assert (int(Timestamp(x).value / 1e9) == - int(Timestamp(y).value / 1e9)) + assert int(Timestamp(x).value / 1e9) == int(Timestamp(y).value / 1e9) compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(timezone('UTC'))) + compare(Timestamp.now("UTC"), datetime.now(timezone("UTC"))) compare(Timestamp.utcnow(), datetime.utcnow()) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) date_component = datetime.utcnow() time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) def test_class_ops_dateutil(self): def compare(x, y): - assert (int(np.round(Timestamp(x).value / 1e9)) == - int(np.round(Timestamp(y).value / 1e9))) + assert int(np.round(Timestamp(x).value / 1e9)) == int( + np.round(Timestamp(y).value / 1e9) + ) compare(Timestamp.now(), datetime.now()) - compare(Timestamp.now('UTC'), datetime.now(tzutc())) + compare(Timestamp.now("UTC"), datetime.now(tzutc())) compare(Timestamp.utcnow(), datetime.utcnow()) compare(Timestamp.today(), datetime.today()) current_time = calendar.timegm(datetime.now().utctimetuple()) - compare(Timestamp.utcfromtimestamp(current_time), - datetime.utcfromtimestamp(current_time)) - compare(Timestamp.fromtimestamp(current_time), - datetime.fromtimestamp(current_time)) + compare( + Timestamp.utcfromtimestamp(current_time), + datetime.utcfromtimestamp(current_time), + ) + compare( + Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time) + ) date_component = datetime.utcnow() time_component = (date_component + timedelta(minutes=10)).time() - compare(Timestamp.combine(date_component, time_component), - datetime.combine(date_component, time_component)) + compare( + Timestamp.combine(date_component, time_component), + datetime.combine(date_component, time_component), + ) def test_basics_nanos(self): - val = np.int64(946684800000000000).view('M8[ns]') - stamp = Timestamp(val.view('i8') + 500) + val = np.int64(946684800000000000).view("M8[ns]") + stamp = Timestamp(val.view("i8") + 500) assert stamp.year == 2000 assert stamp.month == 1 assert stamp.microsecond == 0 @@ -715,26 +802,30 @@ def test_basics_nanos(self): assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - 
@pytest.mark.parametrize('value, check_kwargs', [ - [946688461000000000, {}], - [946688461000000000 / 1000, dict(unit='us')], - [946688461000000000 / 1000000, dict(unit='ms')], - [946688461000000000 / 1000000000, dict(unit='s')], - [10957, dict(unit='D', h=0)], - [(946688461000000000 + 500000) / 1000000000, - dict(unit='s', us=499, ns=964)], - [(946688461000000000 + 500000000) / 1000000000, - dict(unit='s', us=500000)], - [(946688461000000000 + 500000) / 1000000, dict(unit='ms', us=500)], - [(946688461000000000 + 500000) / 1000, dict(unit='us', us=500)], - [(946688461000000000 + 500000000) / 1000000, - dict(unit='ms', us=500000)], - [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)], - [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)], - [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)], - [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)], - [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)], - [10957 + 0.5, dict(unit='D', h=12)]]) + @pytest.mark.parametrize( + "value, check_kwargs", + [ + [946688461000000000, {}], + [946688461000000000 / 1000, dict(unit="us")], + [946688461000000000 / 1000000, dict(unit="ms")], + [946688461000000000 / 1000000000, dict(unit="s")], + [10957, dict(unit="D", h=0)], + [ + (946688461000000000 + 500000) / 1000000000, + dict(unit="s", us=499, ns=964), + ], + [(946688461000000000 + 500000000) / 1000000000, dict(unit="s", us=500000)], + [(946688461000000000 + 500000) / 1000000, dict(unit="ms", us=500)], + [(946688461000000000 + 500000) / 1000, dict(unit="us", us=500)], + [(946688461000000000 + 500000000) / 1000000, dict(unit="ms", us=500000)], + [946688461000000000 / 1000.0 + 5, dict(unit="us", us=5)], + [946688461000000000 / 1000.0 + 5000, dict(unit="us", us=5000)], + [946688461000000000 / 1000000.0 + 0.5, dict(unit="ms", us=500)], + [946688461000000000 / 1000000.0 + 0.005, dict(unit="ms", us=5, ns=5)], + [946688461000000000 / 1000000000.0 + 0.5, dict(unit="s", us=500000)], + [10957 + 0.5, dict(unit="D", h=12)], + ], + ) def test_unit(self, value, check_kwargs): def check(value, unit=None, h=1, s=1, us=0, ns=0): stamp = Timestamp(value, unit=unit) @@ -742,7 +833,7 @@ def check(value, unit=None, h=1, s=1, us=0, ns=0): assert stamp.month == 1 assert stamp.day == 1 assert stamp.hour == h - if unit != 'D': + if unit != "D": assert stamp.minute == 1 assert stamp.second == s assert stamp.microsecond == us @@ -758,26 +849,26 @@ def test_roundtrip(self): # test value to string and back conversions # further test accessors - base = Timestamp('20140101 00:00:00') + base = Timestamp("20140101 00:00:00") - result = Timestamp(base.value + Timedelta('5ms').value) + result = Timestamp(base.value + Timedelta("5ms").value) assert result == Timestamp(str(base) + ".005000") assert result.microsecond == 5000 - result = Timestamp(base.value + Timedelta('5us').value) + result = Timestamp(base.value + Timedelta("5us").value) assert result == Timestamp(str(base) + ".000005") assert result.microsecond == 5 - result = Timestamp(base.value + Timedelta('5ns').value) + result = Timestamp(base.value + Timedelta("5ns").value) assert result == Timestamp(str(base) + ".000000005") assert result.nanosecond == 5 assert result.microsecond == 0 - result = Timestamp(base.value + Timedelta('6ms 5us').value) + result = Timestamp(base.value + Timedelta("6ms 5us").value) assert result == Timestamp(str(base) + ".006005") assert result.microsecond == 5 + 6 * 1000 - result = Timestamp(base.value + Timedelta('200ms 5us').value) + result = 
Timestamp(base.value + Timedelta("200ms 5us").value) assert result == Timestamp(str(base) + ".200005") assert result.microsecond == 5 + 200 * 1000 @@ -788,43 +879,42 @@ def test_hash_equivalent(self): def test_tz_conversion_freq(self, tz_naive_fixture): # GH25241 - t1 = Timestamp('2019-01-01 10:00', freq='H') + t1 = Timestamp("2019-01-01 10:00", freq="H") assert t1.tz_localize(tz=tz_naive_fixture).freq == t1.freq - t2 = Timestamp('2019-01-02 12:00', tz='UTC', freq='T') - assert t2.tz_convert(tz='UTC').freq == t2.freq + t2 = Timestamp("2019-01-02 12:00", tz="UTC", freq="T") + assert t2.tz_convert(tz="UTC").freq == t2.freq class TestTimestampNsOperations: - def test_nanosecond_string_parsing(self): - ts = Timestamp('2013-05-01 07:15:45.123456789') + ts = Timestamp("2013-05-01 07:15:45.123456789") # GH 7878 - expected_repr = '2013-05-01 07:15:45.123456789' + expected_repr = "2013-05-01 07:15:45.123456789" expected_value = 1367392545123456789 assert ts.value == expected_value assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789+09:00', tz='Asia/Tokyo') + ts = Timestamp("2013-05-01 07:15:45.123456789+09:00", tz="Asia/Tokyo") assert ts.value == expected_value - 9 * 3600 * 1000000000 assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='UTC') + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="UTC") assert ts.value == expected_value assert expected_repr in repr(ts) - ts = Timestamp('2013-05-01 07:15:45.123456789', tz='US/Eastern') + ts = Timestamp("2013-05-01 07:15:45.123456789", tz="US/Eastern") assert ts.value == expected_value + 4 * 3600 * 1000000000 assert expected_repr in repr(ts) # GH 10041 - ts = Timestamp('20130501T071545.123456789') + ts = Timestamp("20130501T071545.123456789") assert ts.value == expected_value assert expected_repr in repr(ts) def test_nanosecond_timestamp(self): # GH 7610 expected = 1293840000000000005 - t = Timestamp('2011-01-01') + offsets.Nano(5) + t = Timestamp("2011-01-01") + offsets.Nano(5) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected assert t.nanosecond == 5 @@ -834,7 +924,7 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 5 - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000005Z')) + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000005Z")) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000005')" assert t.value == expected assert t.nanosecond == 5 @@ -850,39 +940,38 @@ def test_nanosecond_timestamp(self): assert t.value == expected assert t.nanosecond == 10 - t = Timestamp(np_datetime64_compat('2011-01-01 00:00:00.000000010Z')) + t = Timestamp(np_datetime64_compat("2011-01-01 00:00:00.000000010Z")) assert repr(t) == "Timestamp('2011-01-01 00:00:00.000000010')" assert t.value == expected assert t.nanosecond == 10 class TestTimestampToJulianDate: - def test_compare_1700(self): - r = Timestamp('1700-06-23').to_julian_date() + r = Timestamp("1700-06-23").to_julian_date() assert r == 2342145.5 def test_compare_2000(self): - r = Timestamp('2000-04-12').to_julian_date() + r = Timestamp("2000-04-12").to_julian_date() assert r == 2451646.5 def test_compare_2100(self): - r = Timestamp('2100-08-12').to_julian_date() + r = Timestamp("2100-08-12").to_julian_date() assert r == 2488292.5 def test_compare_hour01(self): - r = Timestamp('2000-08-12T01:00:00').to_julian_date() + r = Timestamp("2000-08-12T01:00:00").to_julian_date() assert r == 2451768.5416666666666666 def test_compare_hour13(self): - r = 
Timestamp('2000-08-12T13:00:00').to_julian_date() + r = Timestamp("2000-08-12T13:00:00").to_julian_date() assert r == 2451769.0416666666666666 class TestTimestampConversion: def test_conversion(self): # GH#9255 - ts = Timestamp('2000-01-01') + ts = Timestamp("2000-01-01") result = ts.to_pydatetime() expected = datetime(2000, 1, 1) @@ -890,42 +979,41 @@ def test_conversion(self): assert type(result) == type(expected) result = ts.to_datetime64() - expected = np.datetime64(ts.value, 'ns') + expected = np.datetime64(ts.value, "ns") assert result == expected assert type(result) == type(expected) assert result.dtype == expected.dtype def test_to_pydatetime_nonzero_nano(self): - ts = Timestamp('2011-01-01 9:00:00.123456789') + ts = Timestamp("2011-01-01 9:00:00.123456789") # Warn the user of data loss (nanoseconds). - with tm.assert_produces_warning(UserWarning, - check_stacklevel=False): + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): expected = datetime(2011, 1, 1, 9, 0, 0, 123456) result = ts.to_pydatetime() assert result == expected def test_timestamp_to_datetime(self): - stamp = Timestamp('20090415', tz='US/Eastern', freq='D') + stamp = Timestamp("20090415", tz="US/Eastern", freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_datetime_dateutil(self): - stamp = Timestamp('20090415', tz='dateutil/US/Eastern', freq='D') + stamp = Timestamp("20090415", tz="dateutil/US/Eastern", freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo def test_timestamp_to_datetime_explicit_pytz(self): - stamp = Timestamp('20090415', tz=pytz.timezone('US/Eastern'), freq='D') + stamp = Timestamp("20090415", tz=pytz.timezone("US/Eastern"), freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo @td.skip_if_windows_python_3 def test_timestamp_to_datetime_explicit_dateutil(self): - stamp = Timestamp('20090415', tz=gettz('US/Eastern'), freq='D') + stamp = Timestamp("20090415", tz=gettz("US/Eastern"), freq="D") dtval = stamp.to_pydatetime() assert stamp == dtval assert stamp.tzinfo == dtval.tzinfo @@ -935,21 +1023,25 @@ def test_to_datetime_bijective(self): # by going from nanoseconds to microseconds. 
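
[Editorial aside, not part of the patch] The to_pydatetime hunk above only reflows the assert_produces_warning call; the behavior it covers, nanoseconds being discarded with a UserWarning when converting to datetime.datetime, can be sketched without the pandas test helpers. Illustrative only, assuming current pandas matches the test shown above:

import warnings
from datetime import datetime
from pandas import Timestamp

ts = Timestamp("2011-01-01 09:00:00.123456789")
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dt = ts.to_pydatetime()  # nanoseconds cannot be represented and are dropped
assert dt == datetime(2011, 1, 1, 9, 0, 0, 123456)
assert any(issubclass(w.category, UserWarning) for w in caught)
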
exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.max.to_pydatetime()).value / 1000 == - Timestamp.max.value / 1000) + assert ( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000 + == Timestamp.max.value / 1000 + ) exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning with tm.assert_produces_warning(exp_warning, check_stacklevel=False): - assert (Timestamp(Timestamp.min.to_pydatetime()).value / 1000 == - Timestamp.min.value / 1000) + assert ( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000 + == Timestamp.min.value / 1000 + ) def test_to_period_tz_warning(self): # GH#21333 make sure a warning is issued when timezone # info is lost - ts = Timestamp('2009-04-15 16:17:18', tz='US/Eastern') + ts = Timestamp("2009-04-15 16:17:18", tz="US/Eastern") with tm.assert_produces_warning(UserWarning): # warning that timezone info will be lost - ts.to_period('D') + ts.to_period("D") def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 914423fcf5ba7..f64cf97acf805 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -24,61 +24,63 @@ class TestTimestampTZOperations: def test_tz_localize_pushes_out_of_bounds(self): # GH#12677 # tz_localize that pushes away from the boundary is OK - pac = Timestamp.min.tz_localize('US/Pacific') + pac = Timestamp.min.tz_localize("US/Pacific") assert pac.value > Timestamp.min.value - pac.tz_convert('Asia/Tokyo') # tz_convert doesn't change value + pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime): - Timestamp.min.tz_localize('Asia/Tokyo') + Timestamp.min.tz_localize("Asia/Tokyo") # tz_localize that pushes away from the boundary is OK - tokyo = Timestamp.max.tz_localize('Asia/Tokyo') + tokyo = Timestamp.max.tz_localize("Asia/Tokyo") assert tokyo.value < Timestamp.max.value - tokyo.tz_convert('US/Pacific') # tz_convert doesn't change value + tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value with pytest.raises(OutOfBoundsDatetime): - Timestamp.max.tz_localize('US/Pacific') + Timestamp.max.tz_localize("US/Pacific") def test_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 - ts = Timestamp('2015-11-01 01:00:03') - expected0 = Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") with pytest.raises(pytz.AmbiguousTimeError): - ts.tz_localize('US/Central') + ts.tz_localize("US/Central") - result = ts.tz_localize('US/Central', ambiguous=True) + result = ts.tz_localize("US/Central", ambiguous=True) assert result == expected0 - result = ts.tz_localize('US/Central', ambiguous=False) + result = ts.tz_localize("US/Central", ambiguous=False) assert result == expected1 def test_tz_localize_ambiguous(self): - ts = Timestamp('2014-11-02 01:00') - ts_dst = ts.tz_localize('US/Eastern', ambiguous=True) - ts_no_dst = ts.tz_localize('US/Eastern', ambiguous=False) + ts = Timestamp("2014-11-02 01:00") + ts_dst = ts.tz_localize("US/Eastern", ambiguous=True) + ts_no_dst = 
ts.tz_localize("US/Eastern", ambiguous=False) assert (ts_no_dst.value - ts_dst.value) / 1e9 == 3600 with pytest.raises(ValueError): - ts.tz_localize('US/Eastern', ambiguous='infer') + ts.tz_localize("US/Eastern", ambiguous="infer") # GH#8025 - msg = ('Cannot localize tz-aware Timestamp, ' - 'use tz_convert for conversions') + msg = "Cannot localize tz-aware Timestamp, " "use tz_convert for conversions" with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01', tz='US/Eastern').tz_localize('Asia/Tokyo') + Timestamp("2011-01-01", tz="US/Eastern").tz_localize("Asia/Tokyo") - msg = ('Cannot convert tz-naive Timestamp, ' - 'use tz_localize to localize') + msg = "Cannot convert tz-naive Timestamp, " "use tz_localize to localize" with pytest.raises(TypeError, match=msg): - Timestamp('2011-01-01').tz_convert('Asia/Tokyo') - - @pytest.mark.parametrize('stamp, tz', [ - ('2015-03-08 02:00', 'US/Eastern'), - ('2015-03-08 02:30', 'US/Pacific'), - ('2015-03-29 02:00', 'Europe/Paris'), - ('2015-03-29 02:30', 'Europe/Belgrade')]) - @pytest.mark.filterwarnings('ignore::FutureWarning') + Timestamp("2011-01-01").tz_convert("Asia/Tokyo") + + @pytest.mark.parametrize( + "stamp, tz", + [ + ("2015-03-08 02:00", "US/Eastern"), + ("2015-03-08 02:30", "US/Pacific"), + ("2015-03-29 02:00", "Europe/Paris"), + ("2015-03-29 02:30", "Europe/Belgrade"), + ], + ) + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_nonexistent(self, stamp, tz): # GH#13057 ts = Timestamp(stamp) @@ -87,38 +89,45 @@ def test_tz_localize_nonexistent(self, stamp, tz): # GH 22644 with pytest.raises(NonExistentTimeError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors='raise') + ts.tz_localize(tz, errors="raise") with tm.assert_produces_warning(FutureWarning): - assert ts.tz_localize(tz, errors='coerce') is NaT + assert ts.tz_localize(tz, errors="coerce") is NaT def test_tz_localize_errors_ambiguous(self): # GH#13057 - ts = Timestamp('2015-11-1 01:00') + ts = Timestamp("2015-11-1 01:00") with pytest.raises(AmbiguousTimeError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize('US/Pacific', errors='coerce') + ts.tz_localize("US/Pacific", errors="coerce") - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_errors_invalid_arg(self): # GH 22644 - tz = 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:00:00') + tz = "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:00:00") with pytest.raises(ValueError): with tm.assert_produces_warning(FutureWarning): - ts.tz_localize(tz, errors='foo') + ts.tz_localize(tz, errors="foo") def test_tz_localize_errors_coerce(self): # GH 22644 # make sure errors='coerce' gets mapped correctly to nonexistent - tz = 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:00:00') + tz = "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:00:00") with tm.assert_produces_warning(FutureWarning): - result = ts.tz_localize(tz, errors='coerce') - expected = ts.tz_localize(tz, nonexistent='NaT') + result = ts.tz_localize(tz, errors="coerce") + expected = ts.tz_localize(tz, nonexistent="NaT") assert result is expected - @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']) + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): tz = tz_aware_fixture ts = Timestamp(stamp) @@ -135,10 +144,10 
@@ def test_tz_localize_roundtrip(self, stamp, tz_aware_fixture): def test_tz_localize_ambiguous_compat(self): # validate that pytz and dateutil are compat for dst # when the transition happens - naive = Timestamp('2013-10-27 01:00:00') + naive = Timestamp("2013-10-27 01:00:00") - pytz_zone = 'Europe/London' - dateutil_zone = 'dateutil/Europe/London' + pytz_zone = "Europe/London" + dateutil_zone = "dateutil/Europe/London" result_pytz = naive.tz_localize(pytz_zone, ambiguous=0) result_dateutil = naive.tz_localize(dateutil_zone, ambiguous=0) assert result_pytz.value == result_dateutil.value @@ -146,8 +155,8 @@ def test_tz_localize_ambiguous_compat(self): # fixed ambiguous behavior # see gh-14621 - assert result_pytz.to_pydatetime().tzname() == 'GMT' - assert result_dateutil.to_pydatetime().tzname() == 'BST' + assert result_pytz.to_pydatetime().tzname() == "GMT" + assert result_dateutil.to_pydatetime().tzname() == "BST" assert str(result_pytz) != str(result_dateutil) # 1 hour difference @@ -158,99 +167,138 @@ def test_tz_localize_ambiguous_compat(self): # see gh-14621 assert str(result_pytz) == str(result_dateutil) - assert (result_pytz.to_pydatetime().tzname() == - result_dateutil.to_pydatetime().tzname()) - - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + assert ( + result_pytz.to_pydatetime().tzname() + == result_dateutil.to_pydatetime().tzname() + ) + + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_tz_localize(self, tz): - stamp = Timestamp('3/11/2012 04:00') + stamp = Timestamp("3/11/2012 04:00") result = stamp.tz_localize(tz) - expected = Timestamp('3/11/2012 04:00', tz=tz) + expected = Timestamp("3/11/2012 04:00", tz=tz) assert result.hour == expected.hour assert result == expected - @pytest.mark.parametrize('start_ts, tz, end_ts, shift', [ - ['2015-03-29 02:20:00', 'Europe/Warsaw', '2015-03-29 03:00:00', - 'forward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:59:59.999999999', 'backward'], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 03:20:00', timedelta(hours=1)], - ['2015-03-29 02:20:00', 'Europe/Warsaw', - '2015-03-29 01:20:00', timedelta(hours=-1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:00:00', - 'forward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:59:59.999999999', - 'backward'], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 03:33:00', - timedelta(hours=1)], - ['2018-03-11 02:33:00', 'US/Pacific', '2018-03-11 01:33:00', - timedelta(hours=-1)] - ]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_timestamp_tz_localize_nonexistent_shift(self, start_ts, tz, - end_ts, shift, - tz_type): + @pytest.mark.parametrize( + "start_ts, tz, end_ts, shift", + [ + ["2015-03-29 02:20:00", "Europe/Warsaw", "2015-03-29 03:00:00", "forward"], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:59:59.999999999", + "backward", + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 03:20:00", + timedelta(hours=1), + ], + [ + "2015-03-29 02:20:00", + "Europe/Warsaw", + "2015-03-29 01:20:00", + timedelta(hours=-1), + ], + ["2018-03-11 02:33:00", "US/Pacific", "2018-03-11 03:00:00", "forward"], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 01:59:59.999999999", + "backward", + ], + [ + "2018-03-11 02:33:00", + "US/Pacific", + "2018-03-11 03:33:00", + timedelta(hours=1), + ], + [ + "2018-03-11 
02:33:00", + "US/Pacific", + "2018-03-11 01:33:00", + timedelta(hours=-1), + ], + ], + ) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift( + self, start_ts, tz, end_ts, shift, tz_type + ): # GH 8917, 24466 tz = tz_type + tz if isinstance(shift, str): - shift = 'shift_' + shift + shift = "shift_" + shift ts = Timestamp(start_ts) result = ts.tz_localize(tz, nonexistent=shift) expected = Timestamp(end_ts).tz_localize(tz) assert result == expected - @pytest.mark.parametrize('offset', [-1, 1]) - @pytest.mark.parametrize('tz_type', ['', 'dateutil/']) - def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, - tz_type): + @pytest.mark.parametrize("offset", [-1, 1]) + @pytest.mark.parametrize("tz_type", ["", "dateutil/"]) + def test_timestamp_tz_localize_nonexistent_shift_invalid(self, offset, tz_type): # GH 8917, 24466 - tz = tz_type + 'Europe/Warsaw' - ts = Timestamp('2015-03-29 02:20:00') + tz = tz_type + "Europe/Warsaw" + ts = Timestamp("2015-03-29 02:20:00") msg = "The provided timedelta will relocalize on a nonexistent time" with pytest.raises(ValueError, match=msg): ts.tz_localize(tz, nonexistent=timedelta(seconds=offset)) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) def test_timestamp_tz_localize_nonexistent_NaT(self, tz): # GH 8917 - ts = Timestamp('2015-03-29 02:20:00') - result = ts.tz_localize(tz, nonexistent='NaT') + ts = Timestamp("2015-03-29 02:20:00") + result = ts.tz_localize(tz, nonexistent="NaT") assert result is NaT - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) def test_timestamp_tz_localize_nonexistent_raise(self, tz): # GH 8917 - ts = Timestamp('2015-03-29 02:20:00') + ts = Timestamp("2015-03-29 02:20:00") with pytest.raises(pytz.NonExistentTimeError): - ts.tz_localize(tz, nonexistent='raise') + ts.tz_localize(tz, nonexistent="raise") with pytest.raises(ValueError): - ts.tz_localize(tz, nonexistent='foo') + ts.tz_localize(tz, nonexistent="foo") # ------------------------------------------------------------------ # Timestamp.tz_convert - @pytest.mark.parametrize('stamp', ['2014-02-01 09:00', '2014-07-08 09:00', - '2014-11-01 17:00', '2014-11-05 00:00']) + @pytest.mark.parametrize( + "stamp", + [ + "2014-02-01 09:00", + "2014-07-08 09:00", + "2014-11-01 17:00", + "2014-11-05 00:00", + ], + ) def test_tz_convert_roundtrip(self, stamp, tz_aware_fixture): tz = tz_aware_fixture - ts = Timestamp(stamp, tz='UTC') + ts = Timestamp(stamp, tz="UTC") converted = ts.tz_convert(tz) reset = converted.tz_convert(None) assert reset == Timestamp(stamp) assert reset.tzinfo is None - assert reset == converted.tz_convert('UTC').tz_localize(None) + assert reset == converted.tz_convert("UTC").tz_localize(None) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_astimezone(self, tzstr): # astimezone is an alias for tz_convert, so keep it with # the tz_convert tests - utcdate = Timestamp('3/11/2012 22:00', tz='UTC') + utcdate = Timestamp("3/11/2012 22:00", tz="UTC") expected = utcdate.tz_convert(tzstr) result = utcdate.astimezone(tzstr) assert expected == result @@ -261,12 +309,12 @@ def test_tz_convert_utc_with_system_utc(self): from pandas._libs.tslibs.timezones import maybe_get_tz # from system utc to 
real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) # from system utc to real utc - ts = Timestamp('2001-01-05 11:56', tz=maybe_get_tz('dateutil/UTC')) + ts = Timestamp("2001-01-05 11:56", tz=maybe_get_tz("dateutil/UTC")) # check that the time hasn't changed. assert ts == ts.tz_convert(dateutil.tz.tzutc()) @@ -274,16 +322,16 @@ def test_tz_convert_utc_with_system_utc(self): # Timestamp.__init__ with tz str or tzinfo def test_timestamp_constructor_tz_utc(self): - utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + utc_stamp = Timestamp("3/11/2012 05:00", tz="utc") assert utc_stamp.tzinfo is pytz.utc assert utc_stamp.hour == 5 - utc_stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + utc_stamp = Timestamp("3/11/2012 05:00").tz_localize("utc") assert utc_stamp.hour == 5 def test_timestamp_to_datetime_tzoffset(self): tzinfo = tzoffset(None, 7200) - expected = Timestamp('3/11/2012 04:00', tz=tzinfo) + expected = Timestamp("3/11/2012 04:00", tz=tzinfo) result = Timestamp(expected.to_pydatetime()) assert expected == result @@ -292,85 +340,95 @@ def test_timestamp_constructor_near_dst_boundary(self): # Naive string timestamps were being localized incorrectly # with tz_convert_single instead of tz_localize_to_utc - for tz in ['Europe/Brussels', 'Europe/Prague']: - result = Timestamp('2015-10-25 01:00', tz=tz) - expected = Timestamp('2015-10-25 01:00').tz_localize(tz) + for tz in ["Europe/Brussels", "Europe/Prague"]: + result = Timestamp("2015-10-25 01:00", tz=tz) + expected = Timestamp("2015-10-25 01:00").tz_localize(tz) assert result == expected with pytest.raises(pytz.AmbiguousTimeError): - Timestamp('2015-10-25 02:00', tz=tz) + Timestamp("2015-10-25 02:00", tz=tz) - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00').tz_localize('Europe/Paris') + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00").tz_localize("Europe/Paris") assert result == expected with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') + Timestamp("2017-03-26 02:00", tz="Europe/Paris") # GH#11708 - naive = Timestamp('2015-11-18 10:00:00') - result = naive.tz_localize('UTC').tz_convert('Asia/Kolkata') - expected = Timestamp('2015-11-18 15:30:00+0530', tz='Asia/Kolkata') + naive = Timestamp("2015-11-18 10:00:00") + result = naive.tz_localize("UTC").tz_convert("Asia/Kolkata") + expected = Timestamp("2015-11-18 15:30:00+0530", tz="Asia/Kolkata") assert result == expected # GH#15823 - result = Timestamp('2017-03-26 00:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 00:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 00:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 00:00:00+0100", tz="Europe/Paris") assert result == expected - result = Timestamp('2017-03-26 01:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 01:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 01:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 01:00:00+0100", tz="Europe/Paris") assert result == expected with pytest.raises(pytz.NonExistentTimeError): - Timestamp('2017-03-26 02:00', tz='Europe/Paris') + Timestamp("2017-03-26 02:00", tz="Europe/Paris") - result = Timestamp('2017-03-26 02:00:00+0100', tz='Europe/Paris') + result = Timestamp("2017-03-26 02:00:00+0100", 
tz="Europe/Paris") naive = Timestamp(result.value) - expected = naive.tz_localize('UTC').tz_convert('Europe/Paris') + expected = naive.tz_localize("UTC").tz_convert("Europe/Paris") assert result == expected - result = Timestamp('2017-03-26 03:00', tz='Europe/Paris') - expected = Timestamp('2017-03-26 03:00:00+0200', tz='Europe/Paris') + result = Timestamp("2017-03-26 03:00", tz="Europe/Paris") + expected = Timestamp("2017-03-26 03:00:00+0200", tz="Europe/Paris") assert result == expected - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_constructed_by_date_and_tz(self, tz): # GH#2993, Timestamp cannot be constructed by datetime.date # and tz correctly result = Timestamp(date(2012, 3, 11), tz=tz) - expected = Timestamp('3/11/2012', tz=tz) + expected = Timestamp("3/11/2012", tz=tz) assert result.hour == expected.hour assert result == expected - @pytest.mark.parametrize('tz', [pytz.timezone('US/Eastern'), - gettz('US/Eastern'), - 'US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize( + "tz", + [ + pytz.timezone("US/Eastern"), + gettz("US/Eastern"), + "US/Eastern", + "dateutil/US/Eastern", + ], + ) def test_timestamp_add_timedelta_push_over_dst_boundary(self, tz): # GH#1389 # 4 hours before DST transition - stamp = Timestamp('3/10/2012 22:00', tz=tz) + stamp = Timestamp("3/10/2012 22:00", tz=tz) result = stamp + timedelta(hours=6) # spring forward, + "7" hours - expected = Timestamp('3/11/2012 05:00', tz=tz) + expected = Timestamp("3/11/2012 05:00", tz=tz) assert result == expected - def test_timestamp_timetz_equivalent_with_datetime_tz(self, - tz_naive_fixture): + def test_timestamp_timetz_equivalent_with_datetime_tz(self, tz_naive_fixture): # GH21358 tz = timezones.maybe_get_tz(tz_naive_fixture) - stamp = Timestamp('2018-06-04 10:20:30', tz=tz) - _datetime = datetime(2018, 6, 4, hour=10, - minute=20, second=30, tzinfo=tz) + stamp = Timestamp("2018-06-04 10:20:30", tz=tz) + _datetime = datetime(2018, 6, 4, hour=10, minute=20, second=30, tzinfo=tz) result = stamp.timetz() expected = _datetime.timetz() diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 8b13458050ce8..dffb957b8f3b0 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -20,15 +20,18 @@ class TestTimestampUnaryOps: # -------------------------------------------------------------- # Timestamp.round - @pytest.mark.parametrize('timestamp, freq, expected', [ - ('20130101 09:10:11', 'D', '20130101'), - ('20130101 19:10:11', 'D', '20130102'), - ('20130201 12:00:00', 'D', '20130202'), - ('20130104 12:00:00', 'D', '20130105'), - ('2000-01-05 05:09:15.13', 'D', '2000-01-05 00:00:00'), - ('2000-01-05 05:09:15.13', 'H', '2000-01-05 05:00:00'), - ('2000-01-05 05:09:15.13', 'S', '2000-01-05 05:09:15') - ]) + @pytest.mark.parametrize( + "timestamp, freq, expected", + [ + ("20130101 09:10:11", "D", "20130101"), + ("20130101 19:10:11", "D", "20130102"), + ("20130201 12:00:00", "D", "20130202"), + ("20130104 12:00:00", "D", "20130105"), + ("2000-01-05 05:09:15.13", "D", "2000-01-05 00:00:00"), + ("2000-01-05 05:09:15.13", "H", "2000-01-05 05:00:00"), + ("2000-01-05 05:09:15.13", "S", "2000-01-05 05:09:15"), + ], + ) def test_round_frequencies(self, timestamp, freq, expected): 
dt = Timestamp(timestamp) result = dt.round(freq) @@ -36,61 +39,72 @@ def test_round_frequencies(self, timestamp, freq, expected): assert result == expected def test_round_tzaware(self): - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('D') - expected = Timestamp('20130101', tz='US/Eastern') + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("D") + expected = Timestamp("20130101", tz="US/Eastern") assert result == expected - dt = Timestamp('20130101 09:10:11', tz='US/Eastern') - result = dt.round('s') + dt = Timestamp("20130101 09:10:11", tz="US/Eastern") + result = dt.round("s") assert result == dt def test_round_30min(self): # round - dt = Timestamp('20130104 12:32:00') - result = dt.round('30Min') - expected = Timestamp('20130104 12:30:00') + dt = Timestamp("20130104 12:32:00") + result = dt.round("30Min") + expected = Timestamp("20130104 12:30:00") assert result == expected def test_round_subsecond(self): # GH#14440 & GH#15578 - result = Timestamp('2016-10-17 12:00:00.0015').round('ms') - expected = Timestamp('2016-10-17 12:00:00.002000') + result = Timestamp("2016-10-17 12:00:00.0015").round("ms") + expected = Timestamp("2016-10-17 12:00:00.002000") assert result == expected - result = Timestamp('2016-10-17 12:00:00.00149').round('ms') - expected = Timestamp('2016-10-17 12:00:00.001000') + result = Timestamp("2016-10-17 12:00:00.00149").round("ms") + expected = Timestamp("2016-10-17 12:00:00.001000") assert result == expected - ts = Timestamp('2016-10-17 12:00:00.0015') - for freq in ['us', 'ns']: + ts = Timestamp("2016-10-17 12:00:00.0015") + for freq in ["us", "ns"]: assert ts == ts.round(freq) - result = Timestamp('2016-10-17 12:00:00.001501031').round('10ns') - expected = Timestamp('2016-10-17 12:00:00.001501030') + result = Timestamp("2016-10-17 12:00:00.001501031").round("10ns") + expected = Timestamp("2016-10-17 12:00:00.001501030") assert result == expected def test_round_nonstandard_freq(self): with tm.assert_produces_warning(False): - Timestamp('2016-10-17 12:00:00.001501031').round('1010ns') + Timestamp("2016-10-17 12:00:00.001501031").round("1010ns") def test_round_invalid_arg(self): - stamp = Timestamp('2000-01-05 05:09:15.13') + stamp = Timestamp("2000-01-05 05:09:15.13") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - stamp.round('foo') - - @pytest.mark.parametrize('test_input, rounder, freq, expected', [ - ('2117-01-01 00:00:45', 'floor', '15s', '2117-01-01 00:00:45'), - ('2117-01-01 00:00:45', 'ceil', '15s', '2117-01-01 00:00:45'), - ('2117-01-01 00:00:45.000000012', 'floor', '10ns', - '2117-01-01 00:00:45.000000010'), - ('1823-01-01 00:00:01.000000012', 'ceil', '10ns', - '1823-01-01 00:00:01.000000020'), - ('1823-01-01 00:00:01', 'floor', '1s', '1823-01-01 00:00:01'), - ('1823-01-01 00:00:01', 'ceil', '1s', '1823-01-01 00:00:01'), - ('NaT', 'floor', '1s', 'NaT'), - ('NaT', 'ceil', '1s', 'NaT') - ]) + stamp.round("foo") + + @pytest.mark.parametrize( + "test_input, rounder, freq, expected", + [ + ("2117-01-01 00:00:45", "floor", "15s", "2117-01-01 00:00:45"), + ("2117-01-01 00:00:45", "ceil", "15s", "2117-01-01 00:00:45"), + ( + "2117-01-01 00:00:45.000000012", + "floor", + "10ns", + "2117-01-01 00:00:45.000000010", + ), + ( + "1823-01-01 00:00:01.000000012", + "ceil", + "10ns", + "1823-01-01 00:00:01.000000020", + ), + ("1823-01-01 00:00:01", "floor", "1s", "1823-01-01 00:00:01"), + ("1823-01-01 00:00:01", "ceil", "1s", "1823-01-01 00:00:01"), + ("NaT", "floor", "1s", "NaT"), + ("NaT", "ceil", "1s", 
"NaT"), + ], + ) def test_ceil_floor_edge(self, test_input, rounder, freq, expected): dt = Timestamp(test_input) func = getattr(dt, rounder) @@ -102,15 +116,18 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected = Timestamp(expected) assert result == expected - @pytest.mark.parametrize('test_input, freq, expected', [ - ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'), - ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'), - ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'), - ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'), - ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'), - ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'), - ]) - @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round']) + @pytest.mark.parametrize( + "test_input, freq, expected", + [ + ("2018-01-01 00:02:06", "2s", "2018-01-01 00:02:06"), + ("2018-01-01 00:02:00", "2T", "2018-01-01 00:02:00"), + ("2018-01-01 00:04:00", "4T", "2018-01-01 00:04:00"), + ("2018-01-01 00:15:00", "15T", "2018-01-01 00:15:00"), + ("2018-01-01 00:20:00", "20T", "2018-01-01 00:20:00"), + ("2018-01-01 03:00:00", "3H", "2018-01-01 03:00:00"), + ], + ) + @pytest.mark.parametrize("rounder", ["ceil", "floor", "round"]) def test_round_minute_freq(self, test_input, freq, expected, rounder): # Ensure timestamps that shouldn't round dont! # GH#21262 @@ -122,72 +139,92 @@ def test_round_minute_freq(self, test_input, freq, expected, rounder): assert result == expected def test_ceil(self): - dt = Timestamp('20130101 09:10:11') - result = dt.ceil('D') - expected = Timestamp('20130102') + dt = Timestamp("20130101 09:10:11") + result = dt.ceil("D") + expected = Timestamp("20130102") assert result == expected def test_floor(self): - dt = Timestamp('20130101 09:10:11') - result = dt.floor('D') - expected = Timestamp('20130101') + dt = Timestamp("20130101 09:10:11") + result = dt.floor("D") + expected = Timestamp("20130101") assert result == expected - @pytest.mark.parametrize('method', ['ceil', 'round', 'floor']) + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) def test_round_dst_border_ambiguous(self, method): # GH 18946 round near "fall back" DST - ts = Timestamp('2017-10-29 00:00:00', tz='UTC').tz_convert( - 'Europe/Madrid' - ) + ts = Timestamp("2017-10-29 00:00:00", tz="UTC").tz_convert("Europe/Madrid") # - result = getattr(ts, method)('H', ambiguous=True) + result = getattr(ts, method)("H", ambiguous=True) assert result == ts - result = getattr(ts, method)('H', ambiguous=False) - expected = Timestamp('2017-10-29 01:00:00', tz='UTC').tz_convert( - 'Europe/Madrid' + result = getattr(ts, method)("H", ambiguous=False) + expected = Timestamp("2017-10-29 01:00:00", tz="UTC").tz_convert( + "Europe/Madrid" ) assert result == expected - result = getattr(ts, method)('H', ambiguous='NaT') + result = getattr(ts, method)("H", ambiguous="NaT") assert result is NaT with pytest.raises(pytz.AmbiguousTimeError): - getattr(ts, method)('H', ambiguous='raise') - - @pytest.mark.parametrize('method, ts_str, freq', [ - ['ceil', '2018-03-11 01:59:00-0600', '5min'], - ['round', '2018-03-11 01:59:00-0600', '5min'], - ['floor', '2018-03-11 03:01:00-0500', '2H']]) + getattr(ts, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) def test_round_dst_border_nonexistent(self, method, ts_str, freq): # GH 23324 round near "spring 
forward" DST - ts = Timestamp(ts_str, tz='America/Chicago') - result = getattr(ts, method)(freq, nonexistent='shift_forward') - expected = Timestamp('2018-03-11 03:00:00', tz='America/Chicago') + ts = Timestamp(ts_str, tz="America/Chicago") + result = getattr(ts, method)(freq, nonexistent="shift_forward") + expected = Timestamp("2018-03-11 03:00:00", tz="America/Chicago") assert result == expected - result = getattr(ts, method)(freq, nonexistent='NaT') + result = getattr(ts, method)(freq, nonexistent="NaT") assert result is NaT - with pytest.raises(pytz.NonExistentTimeError, - match='2018-03-11 02:00:00'): - getattr(ts, method)(freq, nonexistent='raise') - - @pytest.mark.parametrize('timestamp', [ - '2018-01-01 0:0:0.124999360', - '2018-01-01 0:0:0.125000367', - '2018-01-01 0:0:0.125500', - '2018-01-01 0:0:0.126500', - '2018-01-01 12:00:00', - '2019-01-01 12:00:00', - ]) - @pytest.mark.parametrize('freq', [ - '2ns', '3ns', '4ns', '5ns', '6ns', '7ns', - '250ns', '500ns', '750ns', - '1us', '19us', '250us', '500us', '750us', - '1s', '2s', '3s', - '1D', - ]) + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(ts, method)(freq, nonexistent="raise") + + @pytest.mark.parametrize( + "timestamp", + [ + "2018-01-01 0:0:0.124999360", + "2018-01-01 0:0:0.125000367", + "2018-01-01 0:0:0.125500", + "2018-01-01 0:0:0.126500", + "2018-01-01 12:00:00", + "2019-01-01 12:00:00", + ], + ) + @pytest.mark.parametrize( + "freq", + [ + "2ns", + "3ns", + "4ns", + "5ns", + "6ns", + "7ns", + "250ns", + "500ns", + "750ns", + "1us", + "19us", + "250us", + "500us", + "750us", + "1s", + "2s", + "3s", + "1D", + ], + ) def test_round_int64(self, timestamp, freq): """check that all rounding modes are accurate to int64 precision see GH#22591 @@ -218,26 +255,26 @@ def test_round_int64(self, timestamp, freq): def test_replace_naive(self): # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00') + ts = Timestamp("2016-01-01 09:00:00") result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00') + expected = Timestamp("2016-01-01 00:00:00") assert result == expected def test_replace_aware(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 # replacing datetime components with and w/o presence of a timezone - ts = Timestamp('2016-01-01 09:00:00', tz=tz) + ts = Timestamp("2016-01-01 09:00:00", tz=tz) result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00', tz=tz) + expected = Timestamp("2016-01-01 00:00:00", tz=tz) assert result == expected def test_replace_preserves_nanos(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) result = ts.replace(hour=0) - expected = Timestamp('2016-01-01 00:00:00.000000123', tz=tz) + expected = Timestamp("2016-01-01 00:00:00.000000123", tz=tz) assert result == expected def test_replace_multiple(self, tz_aware_fixture): @@ -245,43 +282,51 @@ def test_replace_multiple(self, tz_aware_fixture): # GH#14621, GH#7825 # replacing datetime components with and w/o presence of a timezone # test all - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) - result = ts.replace(year=2015, month=2, day=2, hour=0, minute=5, - second=5, microsecond=5, nanosecond=5) - expected = Timestamp('2015-02-02 00:05:05.000005005', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) + result = ts.replace( + year=2015, + month=2, + day=2, + hour=0, + minute=5, + second=5, + microsecond=5, + nanosecond=5, + 
) + expected = Timestamp("2015-02-02 00:05:05.000005005", tz=tz) assert result == expected def test_replace_invalid_kwarg(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) with pytest.raises(TypeError): ts.replace(foo=5) def test_replace_integer_args(self, tz_aware_fixture): tz = tz_aware_fixture # GH#14621, GH#7825 - ts = Timestamp('2016-01-01 09:00:00.000000123', tz=tz) + ts = Timestamp("2016-01-01 09:00:00.000000123", tz=tz) with pytest.raises(ValueError): ts.replace(hour=0.1) def test_replace_tzinfo_equiv_tz_localize_none(self): # GH#14621, GH#7825 # assert conversion to naive is the same as replacing tzinfo with None - ts = Timestamp('2013-11-03 01:59:59.999999-0400', tz='US/Eastern') + ts = Timestamp("2013-11-03 01:59:59.999999-0400", tz="US/Eastern") assert ts.tz_localize(None) == ts.replace(tzinfo=None) @td.skip_if_windows def test_replace_tzinfo(self): # GH#15683 dt = datetime(2016, 3, 27, 1) - tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): assert result_dt.timestamp() == result_pd.timestamp() assert result_dt == result_pd @@ -291,19 +336,23 @@ def test_replace_tzinfo(self): result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): assert result_dt.timestamp() == result_pd.timestamp() assert result_dt == result_pd assert result_dt == result_pd.to_pydatetime() - @pytest.mark.parametrize('tz, normalize', [ - (pytz.timezone('US/Eastern'), lambda x: x.tzinfo.normalize(x)), - (gettz('US/Eastern'), lambda x: x)]) + @pytest.mark.parametrize( + "tz, normalize", + [ + (pytz.timezone("US/Eastern"), lambda x: x.tzinfo.normalize(x)), + (gettz("US/Eastern"), lambda x: x), + ], + ) def test_replace_across_dst(self, tz, normalize): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization - ts_naive = Timestamp('2017-12-03 16:03:30') + ts_naive = Timestamp("2017-12-03 16:03:30") ts_aware = conversion.localize_pydatetime(ts_naive, tz) # Preliminary sanity-check @@ -321,14 +370,14 @@ def test_replace_across_dst(self, tz, normalize): def test_replace_dst_border(self): # Gh 7825 - t = Timestamp('2013-11-3', tz='America/Chicago') + t = Timestamp("2013-11-3", tz="America/Chicago") result = t.replace(hour=3) - expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago') + expected = Timestamp("2013-11-3 03:00:00", tz="America/Chicago") assert result == expected - @pytest.mark.skipif(not PY36, reason='Fold not available until PY3.6') - @pytest.mark.parametrize('fold', [0, 1]) - @pytest.mark.parametrize('tz', ['dateutil/Europe/London', 'Europe/London']) + @pytest.mark.skipif(not PY36, reason="Fold not available until PY3.6") + @pytest.mark.parametrize("fold", [0, 1]) + @pytest.mark.parametrize("tz", ["dateutil/Europe/London", "Europe/London"]) def test_replace_dst_fold(self, fold, tz): # GH 25017 d = datetime(2019, 10, 27, 2, 30) @@ -342,12 +391,12 @@ def test_replace_dst_fold(self, fold, tz): # -------------------------------------------------------------- # Timestamp.normalize - 
@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00']) + @pytest.mark.parametrize("arg", ["2013-11-30", "2013-11-30 12:00:00"]) def test_normalize(self, tz_naive_fixture, arg): tz = tz_naive_fixture ts = Timestamp(arg, tz=tz) result = ts.normalize() - expected = Timestamp('2013-11-30', tz=tz) + expected = Timestamp("2013-11-30", tz=tz) assert result == expected # -------------------------------------------------------------- @@ -360,14 +409,14 @@ def test_timestamp(self): uts = ts.replace(tzinfo=utc) assert ts.timestamp() == uts.timestamp() - tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') - utsc = tsc.tz_convert('UTC') + tsc = Timestamp("2014-10-11 11:00:01.12345678", tz="US/Central") + utsc = tsc.tz_convert("UTC") # utsc is a different representation of the same time assert tsc.timestamp() == utsc.timestamp() # datetime.timestamp() converts in the local timezone - with tm.set_timezone('UTC'): + with tm.set_timezone("UTC"): # should agree with datetime.timestamp method dt = ts.to_pydatetime() assert dt.timestamp() == ts.timestamp() diff --git a/pandas/tests/series/common.py b/pandas/tests/series/common.py index 220bf20c81dc3..38c62e89f1873 100644 --- a/pandas/tests/series/common.py +++ b/pandas/tests/series/common.py @@ -7,23 +7,22 @@ class TestData: - @cache_readonly def ts(self): ts = _ts.copy() - ts.name = 'ts' + ts.name = "ts" return ts @cache_readonly def series(self): series = tm.makeStringSeries() - series.name = 'series' + series.name = "series" return series @cache_readonly def objSeries(self): objSeries = tm.makeObjectSeries() - objSeries.name = 'objects' + objSeries.name = "objects" return objSeries @cache_readonly diff --git a/pandas/tests/series/conftest.py b/pandas/tests/series/conftest.py index 367e7a1baa7f3..f5b401398d6d6 100644 --- a/pandas/tests/series/conftest.py +++ b/pandas/tests/series/conftest.py @@ -9,7 +9,7 @@ def datetime_series(): Fixture for Series of floats with DatetimeIndex """ s = tm.makeTimeSeries() - s.name = 'ts' + s.name = "ts" return s @@ -19,7 +19,7 @@ def string_series(): Fixture for Series of floats with Index of unique strings """ s = tm.makeStringSeries() - s.name = 'series' + s.name = "series" return s @@ -29,5 +29,5 @@ def object_series(): Fixture for Series of dtype datetime64[ns] with Index of unique strings """ s = tm.makeObjectSeries() - s.name = 'objects' + s.name = "objects" return s diff --git a/pandas/tests/series/indexing/conftest.py b/pandas/tests/series/indexing/conftest.py index 0e06f6b8e4640..9c7103c196d60 100644 --- a/pandas/tests/series/indexing/conftest.py +++ b/pandas/tests/series/indexing/conftest.py @@ -3,6 +3,6 @@ from pandas.tests.series.common import TestData -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def test_data(): return TestData() diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 2c1eb11d9b515..31a1f43470f2c 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -11,13 +11,15 @@ @pytest.mark.parametrize( - 'first_slice,second_slice', [ + "first_slice,second_slice", + [ [[2, None], [None, -5]], [[None, 0], [None, -5]], [[None, -5], [None, 0]], - [[None, 0], [None, 0]] - ]) -@pytest.mark.parametrize('fill', [None, -1]) + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("fill", [None, -1]) def test_align(test_data, first_slice, second_slice, join_type, fill): a = test_data.ts[slice(*first_slice)] b = 
test_data.ts[slice(*second_slice)] @@ -42,24 +44,26 @@ def test_align(test_data, first_slice, second_slice, join_type, fill): assert_series_equal(aa, ea) assert_series_equal(ab, eb) - assert aa.name == 'ts' - assert ea.name == 'ts' - assert ab.name == 'ts' - assert eb.name == 'ts' + assert aa.name == "ts" + assert ea.name == "ts" + assert ab.name == "ts" + assert eb.name == "ts" @pytest.mark.parametrize( - 'first_slice,second_slice', [ + "first_slice,second_slice", + [ [[2, None], [None, -5]], [[None, 0], [None, -5]], [[None, -5], [None, 0]], - [[None, 0], [None, 0]] - ]) -@pytest.mark.parametrize('method', ['pad', 'bfill']) -@pytest.mark.parametrize('limit', [None, 1]) -def test_align_fill_method(test_data, - first_slice, second_slice, - join_type, method, limit): + [[None, 0], [None, 0]], + ], +) +@pytest.mark.parametrize("method", ["pad", "bfill"]) +@pytest.mark.parametrize("limit", [None, 1]) +def test_align_fill_method( + test_data, first_slice, second_slice, join_type, method, limit +): a = test_data.ts[slice(*first_slice)] b = test_data.ts[slice(*second_slice)] @@ -81,27 +85,27 @@ def test_align_nocopy(test_data): # do copy a = test_data.ts.copy() - ra, _ = a.align(b, join='left') + ra, _ = a.align(b, join="left") ra[:5] = 5 assert not (a[:5] == 5).any() # do not copy a = test_data.ts.copy() - ra, _ = a.align(b, join='left', copy=False) + ra, _ = a.align(b, join="left", copy=False) ra[:5] = 5 assert (a[:5] == 5).all() # do copy a = test_data.ts.copy() b = test_data.ts[:5].copy() - _, rb = a.align(b, join='right') + _, rb = a.align(b, join="right") rb[:3] = 5 assert not (b[:3] == 5).any() # do not copy a = test_data.ts.copy() b = test_data.ts[:5].copy() - _, rb = a.align(b, join='right', copy=False) + _, rb = a.align(b, join="right", copy=False) rb[:2] = 5 assert (b[:2] == 5).all() @@ -119,15 +123,16 @@ def test_align_same_index(test_data): def test_align_multiindex(): # GH 10665 - midx = pd.MultiIndex.from_product([range(2), range(3), range(2)], - names=('a', 'b', 'c')) - idx = pd.Index(range(2), name='b') - s1 = pd.Series(np.arange(12, dtype='int64'), index=midx) - s2 = pd.Series(np.arange(2, dtype='int64'), index=idx) + midx = pd.MultiIndex.from_product( + [range(2), range(3), range(2)], names=("a", "b", "c") + ) + idx = pd.Index(range(2), name="b") + s1 = pd.Series(np.arange(12, dtype="int64"), index=midx) + s2 = pd.Series(np.arange(2, dtype="int64"), index=idx) # these must be the same results (but flipped) - res1l, res1r = s1.align(s2, join='left') - res2l, res2r = s2.align(s1, join='right') + res1l, res1r = s1.align(s2, join="left") + res2l, res2r = s2.align(s1, join="right") expl = s1 tm.assert_series_equal(expl, res1l) @@ -136,11 +141,12 @@ def test_align_multiindex(): tm.assert_series_equal(expr, res1r) tm.assert_series_equal(expr, res2l) - res1l, res1r = s1.align(s2, join='right') - res2l, res2r = s2.align(s1, join='left') + res1l, res1r = s1.align(s2, join="right") + res2l, res2r = s2.align(s1, join="left") - exp_idx = pd.MultiIndex.from_product([range(2), range(2), range(2)], - names=('a', 'b', 'c')) + exp_idx = pd.MultiIndex.from_product( + [range(2), range(2), range(2)], names=("a", "b", "c") + ) expl = pd.Series([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx) tm.assert_series_equal(expl, res1l) tm.assert_series_equal(expl, res2r) @@ -194,18 +200,18 @@ def test_reindex_nan(): i, j = [nan, 1, nan, 8, 4, nan], [2, 0, 2, 3, 1, 2] assert_series_equal(ts.reindex(i), ts.iloc[j]) - ts.index = ts.index.astype('object') + ts.index = ts.index.astype("object") # reindex coerces 
index.dtype to float, loc/iloc doesn't assert_series_equal(ts.reindex(i), ts.iloc[j], check_index_type=False) def test_reindex_series_add_nat(): - rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") series = Series(rng) result = series.reindex(range(15)) - assert np.issubdtype(result.dtype, np.dtype('M8[ns]')) + assert np.issubdtype(result.dtype, np.dtype("M8[ns]")) mask = result.isna() assert mask[-5:].all() @@ -213,7 +219,7 @@ def test_reindex_series_add_nat(): def test_reindex_with_datetimes(): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) result = ts.reindex(list(ts.index[5:10])) @@ -226,10 +232,10 @@ def test_reindex_with_datetimes(): def test_reindex_corner(test_data): # (don't forget to fix this) I think it's fixed - test_data.empty.reindex(test_data.ts.index, method='pad') # it works + test_data.empty.reindex(test_data.ts.index, method="pad") # it works # corner case: pad empty series - reindexed = test_data.empty.reindex(test_data.ts.index, method='pad') + reindexed = test_data.empty.reindex(test_data.ts.index, method="pad") # pass non-Index reindexed = test_data.ts.reindex(list(test_data.ts.index)) @@ -237,75 +243,75 @@ def test_reindex_corner(test_data): # bad fill method ts = test_data.ts[::2] - msg = (r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got foo") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\), backfill" + r" \(bfill\) or nearest\. Got foo" + ) with pytest.raises(ValueError, match=msg): - ts.reindex(test_data.ts.index, method='foo') + ts.reindex(test_data.ts.index, method="foo") def test_reindex_pad(): - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") s2 = s[::2] - reindexed = s2.reindex(s.index, method='pad') - reindexed2 = s2.reindex(s.index, method='ffill') + reindexed = s2.reindex(s.index, method="pad") + reindexed2 = s2.reindex(s.index, method="ffill") assert_series_equal(reindexed, reindexed2) expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], index=np.arange(10)) assert_series_equal(reindexed, expected) # GH4604 - s = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e']) - new_index = ['a', 'g', 'c', 'f'] + s = Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + new_index = ["a", "g", "c", "f"] expected = Series([1, 1, 3, 3], index=new_index) # this changes dtype because the ffill happens after result = s.reindex(new_index).ffill() - assert_series_equal(result, expected.astype('float64')) + assert_series_equal(result, expected.astype("float64")) - result = s.reindex(new_index).ffill(downcast='infer') + result = s.reindex(new_index).ffill(downcast="infer") assert_series_equal(result, expected) expected = Series([1, 5, 3, 5], index=new_index) - result = s.reindex(new_index, method='ffill') + result = s.reindex(new_index, method="ffill") assert_series_equal(result, expected) # inference of new dtype - s = Series([True, False, False, True], index=list('abcd')) - new_index = 'agc' + s = Series([True, False, False, True], index=list("abcd")) + new_index = "agc" result = s.reindex(list(new_index)).ffill() expected = Series([True, True, False], index=list(new_index)) assert_series_equal(result, expected) # GH4618 shifted series downcasting s = Series(False, index=range(0, 5)) - result = s.shift(1).fillna(method='bfill') + result = s.shift(1).fillna(method="bfill") expected = Series(False, index=range(0, 5)) 
assert_series_equal(result, expected) def test_reindex_nearest(): - s = Series(np.arange(10, dtype='int64')) + s = Series(np.arange(10, dtype="int64")) target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method='nearest') - expected = Series(np.around(target).astype('int64'), target) + actual = s.reindex(target, method="nearest") + expected = Series(np.around(target).astype("int64"), target) assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest') + actual = s.reindex_like(actual, method="nearest") assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest', tolerance=1) + actual = s.reindex_like(actual, method="nearest", tolerance=1) assert_series_equal(expected, actual) - actual = s.reindex_like(actual, method='nearest', - tolerance=[1, 2, 3, 4]) + actual = s.reindex_like(actual, method="nearest", tolerance=[1, 2, 3, 4]) assert_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', tolerance=0.2) + actual = s.reindex(target, method="nearest", tolerance=0.2) expected = Series([0, 1, np.nan, 2], target) assert_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', - tolerance=[0.3, 0.01, 0.4, 3]) + actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = Series([0, np.nan, np.nan, 2], target) assert_series_equal(expected, actual) @@ -349,30 +355,29 @@ def test_reindex_bool_pad(test_data): # fail ts = test_data.ts[5:] bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) - filled_bool = bool_ts.reindex(test_data.ts.index, method='pad') + filled_bool = bool_ts.reindex(test_data.ts.index, method="pad") assert isna(filled_bool[:5]).all() def test_reindex_categorical(): - index = date_range('20000101', periods=3) + index = date_range("20000101", periods=3) # reindexing to an invalid Categorical - s = Series(['a', 'b', 'c'], dtype='category') + s = Series(["a", "b", "c"], dtype="category") result = s.reindex(index) - expected = Series(Categorical(values=[np.nan, np.nan, np.nan], - categories=['a', 'b', 'c'])) + expected = Series( + Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"]) + ) expected.index = index tm.assert_series_equal(result, expected) # partial reindexing - expected = Series(Categorical(values=['b', 'c'], categories=['a', 'b', - 'c'])) + expected = Series(Categorical(values=["b", "c"], categories=["a", "b", "c"])) expected.index = [1, 2] result = s.reindex([1, 2]) tm.assert_series_equal(result, expected) - expected = Series(Categorical( - values=['c', np.nan], categories=['a', 'b', 'c'])) + expected = Series(Categorical(values=["c", np.nan], categories=["a", "b", "c"])) expected.index = [2, 3] result = s.reindex([2, 3]) tm.assert_series_equal(result, expected) @@ -380,8 +385,9 @@ def test_reindex_categorical(): def test_reindex_like(test_data): other = test_data.ts[::2] - assert_series_equal(test_data.ts.reindex(other.index), - test_data.ts.reindex_like(other)) + assert_series_equal( + test_data.ts.reindex(other.index), test_data.ts.reindex_like(other) + ) # GH 7179 day1 = datetime(2013, 3, 5) @@ -391,7 +397,7 @@ def test_reindex_like(test_data): series1 = Series([5, None, None], [day1, day2, day3]) series2 = Series([None, None], [day1, day3]) - result = series1.reindex_like(series2, method='pad') + result = series1.reindex_like(series2, method="pad") expected = Series([5, np.nan], index=[day1, day3]) assert_series_equal(result, expected) @@ -399,13 +405,13 @@ def test_reindex_like(test_data): def 
test_reindex_fill_value(): # ----------------------------------------------------------- # floats - floats = Series([1., 2., 3.]) + floats = Series([1.0, 2.0, 3.0]) result = floats.reindex([1, 2, 3]) - expected = Series([2., 3., np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) assert_series_equal(result, expected) result = floats.reindex([1, 2, 3], fill_value=0) - expected = Series([2., 3., 0], index=[1, 2, 3]) + expected = Series([2.0, 3.0, 0], index=[1, 2, 3]) assert_series_equal(result, expected) # ----------------------------------------------------------- @@ -413,7 +419,7 @@ def test_reindex_fill_value(): ints = Series([1, 2, 3]) result = ints.reindex([1, 2, 3]) - expected = Series([2., 3., np.nan], index=[1, 2, 3]) + expected = Series([2.0, 3.0, np.nan], index=[1, 2, 3]) assert_series_equal(result, expected) # don't upcast @@ -430,8 +436,8 @@ def test_reindex_fill_value(): expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) assert_series_equal(result, expected) - result = objects.reindex([1, 2, 3], fill_value='foo') - expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) + result = objects.reindex([1, 2, 3], fill_value="foo") + expected = Series([2, 3, "foo"], index=[1, 2, 3], dtype=object) assert_series_equal(result, expected) # ------------------------------------------------------------ @@ -449,23 +455,23 @@ def test_reindex_fill_value(): def test_reindex_datetimeindexes_tz_naive_and_aware(): # GH 8306 - idx = date_range('20131101', tz='America/Chicago', periods=7) - newidx = date_range('20131103', periods=10, freq='H') + idx = date_range("20131101", tz="America/Chicago", periods=7) + newidx = date_range("20131103", periods=10, freq="H") s = Series(range(7), index=idx) with pytest.raises(TypeError): - s.reindex(newidx, method='ffill') + s.reindex(newidx, method="ffill") def test_reindex_empty_series_tz_dtype(): # GH 20869 - result = Series(dtype='datetime64[ns, UTC]').reindex([0, 1]) - expected = Series([pd.NaT] * 2, dtype='datetime64[ns, UTC]') + result = Series(dtype="datetime64[ns, UTC]").reindex([0, 1]) + expected = Series([pd.NaT] * 2, dtype="datetime64[ns, UTC]") tm.assert_equal(result, expected) def test_rename(): # GH 17407 - s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) + s = Series(range(1, 6), index=pd.Index(range(2, 7), name="IntIndex")) result = s.rename(str) expected = s.rename(lambda i: str(i)) assert_series_equal(result, expected) @@ -474,26 +480,21 @@ def test_rename(): @pytest.mark.parametrize( - 'data, index, drop_labels,' - ' axis, expected_data, expected_index', + "data, index, drop_labels," " axis, expected_data, expected_index", [ # Unique Index - ([1, 2], ['one', 'two'], ['two'], - 0, [1], ['one']), - ([1, 2], ['one', 'two'], ['two'], - 'rows', [1], ['one']), - ([1, 1, 2], ['one', 'two', 'one'], ['two'], - 0, [1, 2], ['one', 'one']), - + ([1, 2], ["one", "two"], ["two"], 0, [1], ["one"]), + ([1, 2], ["one", "two"], ["two"], "rows", [1], ["one"]), + ([1, 1, 2], ["one", "two", "one"], ["two"], 0, [1, 2], ["one", "one"]), # GH 5248 Non-Unique Index - ([1, 1, 2], ['one', 'two', 'one'], 'two', - 0, [1, 2], ['one', 'one']), - ([1, 1, 2], ['one', 'two', 'one'], ['one'], - 0, [1], ['two']), - ([1, 1, 2], ['one', 'two', 'one'], 'one', - 0, [1], ['two'])]) -def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, - expected_data, expected_index): + ([1, 1, 2], ["one", "two", "one"], "two", 0, [1, 2], ["one", "one"]), + ([1, 1, 2], ["one", "two", "one"], ["one"], 0, [1], 
["two"]), + ([1, 1, 2], ["one", "two", "one"], "one", 0, [1], ["two"]), + ], +) +def test_drop_unique_and_non_unique_index( + data, index, axis, drop_labels, expected_data, expected_index +): s = Series(data=data, index=index) result = s.drop(drop_labels, axis=axis) @@ -502,20 +503,16 @@ def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, @pytest.mark.parametrize( - 'data, index, drop_labels,' - ' axis, error_type, error_desc', + "data, index, drop_labels," " axis, error_type, error_desc", [ # single string/tuple-like - (range(3), list('abc'), 'bc', - 0, KeyError, 'not found in axis'), - + (range(3), list("abc"), "bc", 0, KeyError, "not found in axis"), # bad axis - (range(3), list('abc'), ('a',), - 0, KeyError, 'not found in axis'), - (range(3), list('abc'), 'one', - 'columns', ValueError, 'No axis named columns')]) -def test_drop_exception_raised(data, index, drop_labels, - axis, error_type, error_desc): + (range(3), list("abc"), ("a",), 0, KeyError, "not found in axis"), + (range(3), list("abc"), "one", "columns", ValueError, "No axis named columns"), + ], +) +def test_drop_exception_raised(data, index, drop_labels, axis, error_type, error_desc): with pytest.raises(error_type, match=error_desc): Series(data, index=index).drop(drop_labels, axis=axis) @@ -523,10 +520,10 @@ def test_drop_exception_raised(data, index, drop_labels, def test_drop_with_ignore_errors(): # errors='ignore' - s = Series(range(3), index=list('abc')) - result = s.drop('bc', errors='ignore') + s = Series(range(3), index=list("abc")) + result = s.drop("bc", errors="ignore") tm.assert_series_equal(result, s) - result = s.drop(['a', 'd'], errors='ignore') + result = s.drop(["a", "d"], errors="ignore") expected = s.iloc[1:] tm.assert_series_equal(result, expected) @@ -538,8 +535,8 @@ def test_drop_with_ignore_errors(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]]) -@pytest.mark.parametrize('drop_labels', [[], [1], [3]]) +@pytest.mark.parametrize("index", [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize("drop_labels", [[], [1], [3]]) def test_drop_empty_list(index, drop_labels): # GH 21494 expected_index = [i for i in index if i not in drop_labels] @@ -547,12 +544,15 @@ def test_drop_empty_list(index, drop_labels): tm.assert_series_equal(series, pd.Series(index=expected_index)) -@pytest.mark.parametrize('data, index, drop_labels', [ - (None, [1, 2, 3], [1, 4]), - (None, [1, 2, 2], [1, 4]), - ([2, 3], [0, 1], [False, True]) -]) +@pytest.mark.parametrize( + "data, index, drop_labels", + [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]), + ], +) def test_drop_non_empty_list(data, index, drop_labels): # GH 21494 and GH 16877 - with pytest.raises(KeyError, match='not found in axis'): + with pytest.raises(KeyError, match="not found in axis"): pd.Series(data=data, index=index).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index ef7312616250d..9b76ed026e580 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -25,27 +25,29 @@ def test_getitem_boolean(test_data): def test_getitem_boolean_empty(): s = Series([], dtype=np.int64) - s.index.name = 'index_name' + s.index.name = "index_name" s = s[s.isna()] - assert s.index.name == 'index_name' + assert s.index.name == "index_name" assert s.dtype == np.int64 # GH5877 # indexing with empty series - s = Series(['A', 'B']) - expected = 
Series(np.nan, index=['C'], dtype=object) - result = s[Series(['C'], dtype=object)] + s = Series(["A", "B"]) + expected = Series(np.nan, index=["C"], dtype=object) + result = s[Series(["C"], dtype=object)] assert_series_equal(result, expected) - s = Series(['A', 'B']) - expected = Series(dtype=object, index=Index([], dtype='int64')) + s = Series(["A", "B"]) + expected = Series(dtype=object, index=Index([], dtype="int64")) result = s[Series([], dtype=object)] assert_series_equal(result, expected) # invalid because of the boolean indexer # that's empty or not-aligned - msg = (r"Unalignable boolean Series provided as indexer \(index of" - r" the boolean Series and of the indexed object do not match") + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) with pytest.raises(IndexingError, match=msg): s[Series([], dtype=bool)] @@ -87,8 +89,10 @@ def test_getitem_setitem_boolean_corner(test_data): # these used to raise...?? - msg = (r"Unalignable boolean Series provided as indexer \(index of" - r" the boolean Series and of the indexed object do not match") + msg = ( + r"Unalignable boolean Series provided as indexer \(index of" + r" the boolean Series and of the indexed object do not match" + ) with pytest.raises(IndexingError, match=msg): ts[mask_shifted] with pytest.raises(IndexingError, match=msg): @@ -156,14 +160,17 @@ def test_where_unsafe_float(float_dtype): assert_series_equal(s, expected) -@pytest.mark.parametrize("dtype,expected_dtype", [ - (np.int8, np.float64), - (np.int16, np.float64), - (np.int32, np.float64), - (np.int64, np.float64), - (np.float32, np.float32), - (np.float64, np.float64) -]) +@pytest.mark.parametrize( + "dtype,expected_dtype", + [ + (np.int8, np.float64), + (np.int16, np.float64), + (np.int32, np.float64), + (np.int64, np.float64), + (np.float32, np.float32), + (np.float64, np.float64), + ], +) def test_where_unsafe_upcast(dtype, expected_dtype): # see gh-9743 s = Series(np.arange(10), dtype=dtype) @@ -186,17 +193,17 @@ def test_where_unsafe(): assert_series_equal(s, expected) # see gh-3235 - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") mask = s < 5 s[mask] = range(2, 7) - expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype='int64') + expected = Series(list(range(2, 7)) + list(range(5, 10)), dtype="int64") assert_series_equal(s, expected) assert s.dtype == expected.dtype - s = Series(np.arange(10), dtype='int64') + s = Series(np.arange(10), dtype="int64") mask = s > 5 s[mask] = [0] * 4 - expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') + expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype="int64") assert_series_equal(s, expected) s = Series(np.arange(10)) @@ -241,8 +248,8 @@ def test_where(): assert_series_equal(rs, s.abs()) rs = s.where(cond) - assert (s.shape == rs.shape) - assert (rs is not s) + assert s.shape == rs.shape + assert rs is not s # test alignment cond = Series([True, False, False, True, False], index=s.index) @@ -278,13 +285,15 @@ def test_where_error(): msg = "cannot assign mismatch length to masked array" with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] - msg = ("NumPy boolean array indexing assignment cannot assign 0 input" - " values to the 1 output values where the mask is true") + msg = ( + "NumPy boolean array indexing assignment cannot assign 0 input" + " values to the 1 output values where the mask is true" + ) with pytest.raises(ValueError, match=msg): s[[True, 
False]] = [] -@pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) +@pytest.mark.parametrize("klass", [list, tuple, np.array, Series]) def test_where_array_like(klass): # see gh-15414 s = Series([1, 2, 3]) @@ -295,12 +304,15 @@ def test_where_array_like(klass): assert_series_equal(result, expected) -@pytest.mark.parametrize('cond', [ - [1, 0, 1], - Series([2, 5, 7]), - ["True", "False", "True"], - [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")] -]) +@pytest.mark.parametrize( + "cond", + [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")], + ], +) def test_where_invalid_input(cond): # see gh-15414: only boolean arrays accepted s = Series([1, 2, 3]) @@ -341,74 +353,69 @@ def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment - msg = ("cannot set using a {} indexer with a different length than" - " the value") + msg = "cannot set using a {} indexer with a different length than" " the value" # slice - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[0:3] = list(range(27)) s[0:3] = list(range(3)) expected = Series([0, 1, 2]) - assert_series_equal(s.astype(np.int64), expected, ) + assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list('abcdef')) + s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[0:4:2] = list(range(27)) - s = Series(list('abcdef')) + s = Series(list("abcdef")) s[0:4:2] = list(range(2)) - expected = Series([0, 'b', 1, 'd', 'e', 'f']) + expected = Series([0, "b", 1, "d", "e", "f"]) assert_series_equal(s, expected) # neg slices - s = Series(list('abcdef')) + s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format('slice')): + with pytest.raises(ValueError, match=msg.format("slice")): s[:-1] = list(range(27)) s[-3:-1] = list(range(2)) - expected = Series(['a', 'b', 'c', 0, 1, 'f']) + expected = Series(["a", "b", "c", 0, 1, "f"]) assert_series_equal(s, expected) # list - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('list-like')): + with pytest.raises(ValueError, match=msg.format("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list('abc')) + s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format('list-like')): + with pytest.raises(ValueError, match=msg.format("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list('abc')) + s = Series(list("abc")) s[0] = list(range(10)) - expected = Series([list(range(10)), 'b', 'c']) + expected = Series([list(range(10)), "b", "c"]) assert_series_equal(s, expected) -@pytest.mark.parametrize('size', range(2, 6)) -@pytest.mark.parametrize('mask', [ - [True, False, False, False, False], - [True, False], - [False] -]) -@pytest.mark.parametrize('item', [ - 2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min -]) +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] +) # Test numpy arrays, lists and tuples as the input to be # broadcast -@pytest.mark.parametrize('box', [ - lambda x: np.array([x]), - lambda x: [x], - lambda x: (x,) -]) 
+@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) def test_broadcast(size, mask, item, box): selection = np.resize(mask, size) @@ -416,8 +423,9 @@ def test_broadcast(size, mask, item, box): # Construct the expected series by taking the source # data or item based on the selection - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) + expected = Series( + [item if use_item else data[i] for i, use_item in enumerate(selection)] + ) s = Series(data) s[selection] = box(item) @@ -454,8 +462,7 @@ def test_where_dups(): s2 = Series(list(range(3))) comb = pd.concat([s1, s2]) result = comb.where(comb < 2) - expected = Series([0, 1, np.nan, 0, 1, np.nan], - index=[0, 1, 2, 0, 1, 2]) + expected = Series([0, 1, np.nan, 0, 1, np.nan], index=[0, 1, 2, 0, 1, 2]) assert_series_equal(result, expected) # GH 4548 @@ -472,31 +479,31 @@ def test_where_dups(): def test_where_numeric_with_string(): # GH 9280 s = pd.Series([1, 2, 3]) - w = s.where(s > 1, 'X') + w = s.where(s > 1, "X") assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" - w = s.where(s > 1, ['X', 'Y', 'Z']) + w = s.where(s > 1, ["X", "Y", "Z"]) assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" - w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) + w = s.where(s > 1, np.array(["X", "Y", "Z"])) assert not is_integer(w[0]) assert is_integer(w[1]) assert is_integer(w[2]) assert isinstance(w[0], str) - assert w.dtype == 'object' + assert w.dtype == "object" def test_where_timedelta_coerce(): - s = Series([1, 2], dtype='timedelta64[ns]') + s = Series([1, 2], dtype="timedelta64[ns]") expected = Series([10, 10]) mask = np.array([False, False]) @@ -513,12 +520,12 @@ def test_where_timedelta_coerce(): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') + expected = Series([10, None], dtype="object") assert_series_equal(rs, expected) def test_where_datetime_conversion(): - s = Series(date_range('20130102', periods=2)) + s = Series(date_range("20130102", periods=2)) expected = Series([10, 10]) mask = np.array([False, False]) @@ -535,12 +542,11 @@ def test_where_datetime_conversion(): assert_series_equal(rs, expected) rs = s.where(mask, [10.0, np.nan]) - expected = Series([10, None], dtype='object') + expected = Series([10, None], dtype="object") assert_series_equal(rs, expected) # GH 15701 - timestamps = ['2016-12-31 12:00:04+00:00', - '2016-12-31 12:00:04.010000+00:00'] + timestamps = ["2016-12-31 12:00:04+00:00", "2016-12-31 12:00:04.010000+00:00"] s = Series([pd.Timestamp(t) for t in timestamps]) rs = s.where(Series([False, True])) expected = Series([pd.NaT, s[1]]) @@ -548,14 +554,17 @@ def test_where_datetime_conversion(): def test_where_dt_tz_values(tz_naive_fixture): - ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture)) - ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'], - tz=tz_naive_fixture)) + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture) + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture) + ) mask = pd.Series([True, True, False]) result = ser1.where(mask, ser2) - exp = pd.Series(pd.DatetimeIndex(['20150101', '20150102', 
'20160516'], - tz=tz_naive_fixture)) + exp = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20160516"], tz=tz_naive_fixture) + ) assert_series_equal(exp, result) diff --git a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py index b656137545903..2d879eed967e5 100644 --- a/pandas/tests/series/indexing/test_callable.py +++ b/pandas/tests/series/indexing/test_callable.py @@ -4,12 +4,12 @@ def test_getitem_callable(): # GH 12533 - s = pd.Series(4, index=list('ABCD')) - result = s[lambda x: 'A'] - assert result == s.loc['A'] + s = pd.Series(4, index=list("ABCD")) + result = s[lambda x: "A"] + assert result == s.loc["A"] - result = s[lambda x: ['A', 'B']] - tm.assert_series_equal(result, s.loc[['A', 'B']]) + result = s[lambda x: ["A", "B"]] + tm.assert_series_equal(result, s.loc[["A", "B"]]) result = s[lambda x: [True, False, True, True]] tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) @@ -17,9 +17,9 @@ def test_getitem_callable(): def test_setitem_callable(): # GH 12533 - s = pd.Series([1, 2, 3, 4], index=list('ABCD')) - s[lambda x: 'A'] = -1 - tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + s = pd.Series([1, 2, 3, 4], index=list("ABCD")) + s[lambda x: "A"] = -1 + tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list("ABCD"))) def test_setitem_other_callable(): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index a8120ec9c5c58..721ea2b6e6632 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -10,7 +10,10 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) """ @@ -19,74 +22,86 @@ def test_fancy_getitem(): - dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) s = Series(np.arange(len(dti)), index=dti) assert s[48] == 48 - assert s['1/2/2009'] == 48 - assert s['2009-1-2'] == 48 + assert s["1/2/2009"] == 48 + assert s["2009-1-2"] == 48 assert s[datetime(2009, 1, 2)] == 48 assert s[Timestamp(datetime(2009, 1, 2))] == 48 with pytest.raises(KeyError, match=r"^'2009-1-3'$"): - s['2009-1-3'] - assert_series_equal(s['3/6/2009':'2009-06-05'], - s[datetime(2009, 3, 6):datetime(2009, 6, 5)]) + s["2009-1-3"] + assert_series_equal( + s["3/6/2009":"2009-06-05"], s[datetime(2009, 3, 6) : datetime(2009, 6, 5)] + ) def test_fancy_setitem(): - dti = date_range(freq='WOM-1FRI', start=datetime(2005, 1, 1), - end=datetime(2010, 1, 1)) + dti = date_range( + freq="WOM-1FRI", start=datetime(2005, 1, 1), end=datetime(2010, 1, 1) + ) s = Series(np.arange(len(dti)), index=dti) s[48] = -1 assert s[48] == -1 - s['1/2/2009'] = -2 + s["1/2/2009"] = -2 assert s[48] == -2 - s['1/2/2009':'2009-06-05'] = -3 + s["1/2/2009":"2009-06-05"] = -3 assert (s[48:54] == -3).all() @pytest.mark.filterwarnings("ignore::DeprecationWarning") -@pytest.mark.parametrize('tz', [None, 'Asia/Shanghai', 'Europe/Berlin']) -@pytest.mark.parametrize('name', [None, 'my_dti']) +@pytest.mark.parametrize("tz", [None, "Asia/Shanghai", "Europe/Berlin"]) +@pytest.mark.parametrize("name", [None, "my_dti"]) def test_dti_snap(name, tz): - dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', 
'1/4/2002', - '1/5/2002', '1/6/2002', '1/7/2002'], - name=name, tz=tz, freq='D') - - result = dti.snap(freq='W-MON') - expected = date_range('12/31/2001', '1/7/2002', - name=name, tz=tz, freq='w-mon') + dti = DatetimeIndex( + [ + "1/1/2002", + "1/2/2002", + "1/3/2002", + "1/4/2002", + "1/5/2002", + "1/6/2002", + "1/7/2002", + ], + name=name, + tz=tz, + freq="D", + ) + + result = dti.snap(freq="W-MON") + expected = date_range("12/31/2001", "1/7/2002", name=name, tz=tz, freq="w-mon") expected = expected.repeat([3, 4]) tm.assert_index_equal(result, expected) assert result.tz == expected.tz - result = dti.snap(freq='B') + result = dti.snap(freq="B") - expected = date_range('1/1/2002', '1/7/2002', - name=name, tz=tz, freq='b') + expected = date_range("1/1/2002", "1/7/2002", name=name, tz=tz, freq="b") expected = expected.repeat([1, 1, 1, 2, 2]) tm.assert_index_equal(result, expected) assert result.tz == expected.tz def test_dti_reset_index_round_trip(): - dti = date_range(start='1/1/2001', end='6/1/2001', freq='D') - d1 = DataFrame({'v': np.random.rand(len(dti))}, index=dti) + dti = date_range(start="1/1/2001", end="6/1/2001", freq="D") + d1 = DataFrame({"v": np.random.rand(len(dti))}, index=dti) d2 = d1.reset_index() - assert d2.dtypes[0] == np.dtype('M8[ns]') - d3 = d2.set_index('index') + assert d2.dtypes[0] == np.dtype("M8[ns]") + d3 = d2.set_index("index") assert_frame_equal(d1, d3, check_names=False) # #2329 stamp = datetime(2012, 11, 22) - df = DataFrame([[stamp, 12.1]], columns=['Date', 'Value']) - df = df.set_index('Date') + df = DataFrame([[stamp, 12.1]], columns=["Date", "Value"]) + df = df.set_index("Date") assert df.index[0] == stamp - assert df.reset_index()['Date'][0] == stamp + assert df.reset_index()["Date"][0] == stamp def test_series_set_value(): @@ -95,14 +110,12 @@ def test_series_set_value(): dates = [datetime(2001, 1, 1), datetime(2001, 1, 2)] index = DatetimeIndex(dates) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - s = Series().set_value(dates[0], 1.) 
- with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + s = Series().set_value(dates[0], 1.0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s2 = s.set_value(dates[1], np.nan) - exp = Series([1., np.nan], index=index) + exp = Series([1.0, np.nan], index=index) assert_series_equal(s2, exp) @@ -113,59 +126,60 @@ def test_series_set_value(): @pytest.mark.slow def test_slice_locs_indexerror(): - times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) - for i in range(100000)] + times = [datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000)] s = Series(range(100000), times) - s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)] + s.loc[datetime(1900, 1, 1) : datetime(2100, 1, 1)] def test_slicing_datetimes(): # GH 7523 # unique - df = DataFrame(np.arange(4., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 3, 4]]) - result = df.loc[datetime(2001, 1, 1, 10):] + df = DataFrame( + np.arange(4.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 3, 4]], + ) + result = df.loc[datetime(2001, 1, 1, 10) :] assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] + result = df.loc[: datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11) :] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.loc['20010101 11':] + result = df.loc["20010101 11":] assert_frame_equal(result, expected) # duplicates - df = pd.DataFrame(np.arange(5., dtype='float64'), - index=[datetime(2001, 1, i, 10, 00) - for i in [1, 2, 2, 3, 4]]) + df = pd.DataFrame( + np.arange(5.0, dtype="float64"), + index=[datetime(2001, 1, i, 10, 00) for i in [1, 2, 2, 3, 4]], + ) - result = df.loc[datetime(2001, 1, 1, 10):] + result = df.loc[datetime(2001, 1, 1, 10) :] assert_frame_equal(result, df) - result = df.loc[:datetime(2001, 1, 4, 10)] + result = df.loc[: datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 10):datetime(2001, 1, 4, 10)] + result = df.loc[datetime(2001, 1, 1, 10) : datetime(2001, 1, 4, 10)] assert_frame_equal(result, df) - result = df.loc[datetime(2001, 1, 1, 11):] + result = df.loc[datetime(2001, 1, 1, 11) :] expected = df.iloc[1:] assert_frame_equal(result, expected) - result = df.loc['20010101 11':] + result = df.loc["20010101 11":] assert_frame_equal(result, expected) def test_frame_datetime64_duplicated(): - dates = date_range('2010-07-01', end='2010-08-05') + dates = date_range("2010-07-01", end="2010-08-05") - tst = DataFrame({'symbol': 'AAA', 'date': dates}) - result = tst.duplicated(['date', 'symbol']) + tst = DataFrame({"symbol": "AAA", "date": dates}) + result = tst.duplicated(["date", "symbol"]) assert (-result).all() - tst = DataFrame({'date': dates}) + tst = DataFrame({"date": dates}) result = tst.duplicated() assert (-result).all() @@ -176,7 +190,7 @@ def test_getitem_setitem_datetime_tz_pytz(): N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -192,14 +206,14 @@ def 
test_getitem_setitem_datetime_tz_pytz(): # repeat with datetimes result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] assert_series_equal(result, ts) result = ts.copy() # comparison dates with datetime MUST be localized! - date = tz('US/Central').localize(datetime(1990, 1, 1, 3)) + date = tz("US/Central").localize(datetime(1990, 1, 1, 3)) result[date] = 0 result[date] = ts[4] assert_series_equal(result, ts) @@ -209,16 +223,16 @@ def test_getitem_setitem_datetime_tz_dateutil(): from dateutil.tz import tzutc from pandas._libs.tslibs.timezones import dateutil_gettz as gettz - tz = lambda x: tzutc() if x == 'UTC' else gettz( - x) # handle special case for utc in dateutil + tz = ( + lambda x: tzutc() if x == "UTC" else gettz(x) + ) # handle special case for utc in dateutil from pandas import date_range N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', - tz='America/New_York') + rng = date_range("1/1/1990", periods=N, freq="H", tz="America/New_York") ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -234,20 +248,20 @@ def test_getitem_setitem_datetime_tz_dateutil(): # repeat with datetimes result = ts.copy() - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = 0 - result[datetime(1990, 1, 1, 9, tzinfo=tz('UTC'))] = ts[4] + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = 0 + result[datetime(1990, 1, 1, 9, tzinfo=tz("UTC"))] = ts[4] assert_series_equal(result, ts) result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz("America/Chicago"))] = ts[4] assert_series_equal(result, ts) def test_getitem_setitem_datetimeindex(): N = 50 # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern") ts = Series(np.random.randn(N), index=rng) result = ts["1990-01-01 04:00:00"] @@ -291,13 +305,13 @@ def test_getitem_setitem_datetimeindex(): result[datetime(1990, 1, 1, 4)] = ts[4] assert_series_equal(result, ts) - result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] + result = ts[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] expected = ts[4:8] assert_series_equal(result, expected) result = ts.copy() - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0 - result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8] + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = 0 + result[datetime(1990, 1, 1, 4) : datetime(1990, 1, 1, 7)] = ts[4:8] assert_series_equal(result, ts) lb = datetime(1990, 1, 1, 4) @@ -342,7 +356,7 @@ def test_getitem_setitem_periodindex(): from pandas import period_range N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') + rng = period_range("1/1/1990", periods=N, freq="H") ts = Series(np.random.randn(N), index=rng) result = ts["1990-01-01 04"] @@ -387,7 +401,7 @@ def test_getitem_setitem_periodindex(): # FutureWarning from NumPy. 
@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") def test_getitem_median_slice_bug(): - index = date_range('20090415', '20090519', freq='2B') + index = date_range("20090415", "20090519", freq="2B") s = Series(np.random.randn(13), index=index) indexer = [slice(6, 7, None)] @@ -399,11 +413,11 @@ def test_getitem_median_slice_bug(): def test_datetime_indexing(): from pandas import date_range - index = date_range('1/1/2000', '1/7/2000') + index = date_range("1/1/2000", "1/7/2000") index = index.repeat(3) s = Series(len(index), index=index) - stamp = Timestamp('1/8/2000') + stamp = Timestamp("1/8/2000") with pytest.raises(KeyError, match=r"^947289600000000000$"): s[stamp] @@ -427,11 +441,18 @@ def test_datetime_indexing(): @pytest.fixture def dups(): - dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), - datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 3), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 4), - datetime(2000, 1, 4), datetime(2000, 1, 5)] + dates = [ + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] return Series(np.random.randn(len(dates)), index=dates) @@ -447,22 +468,28 @@ def test_is_unique_monotonic(dups): def test_index_unique(dups): uniques = dups.index.unique() - expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), - datetime(2000, 1, 4), datetime(2000, 1, 5)]) - assert uniques.dtype == 'M8[ns]' # sanity + expected = DatetimeIndex( + [ + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + ] + ) + assert uniques.dtype == "M8[ns]" # sanity tm.assert_index_equal(uniques, expected) assert dups.index.nunique() == 4 # #2563 assert isinstance(uniques, DatetimeIndex) - dups_local = dups.index.tz_localize('US/Eastern') - dups_local.name = 'foo' + dups_local = dups.index.tz_localize("US/Eastern") + dups_local.name = "foo" result = dups_local.unique() - expected = DatetimeIndex(expected, name='foo') - expected = expected.tz_localize('US/Eastern') + expected = DatetimeIndex(expected, name="foo") + expected = expected.tz_localize("US/Eastern") assert result.tz is not None - assert result.name == 'foo' + assert result.name == "foo" tm.assert_index_equal(result, expected) # NaT, note this is excluded @@ -472,8 +499,9 @@ def test_index_unique(dups): assert idx.nunique() == 20 assert idx.nunique(dropna=False) == 21 - arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) - for t in range(20)] + [NaT] + arr = [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20) + ] + [NaT] idx = DatetimeIndex(arr * 3) tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) assert idx.nunique() == 20 @@ -515,16 +543,15 @@ def test_duplicate_dates_indexing(dups): def test_range_slice(): - idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', - '1/4/2000']) + idx = DatetimeIndex(["1/1/2000", "1/2/2000", "1/2/2000", "1/3/2000", "1/4/2000"]) ts = Series(np.random.randn(len(idx)), index=idx) - result = ts['1/2/2000':] + result = ts["1/2/2000":] expected = ts[1:] assert_series_equal(result, expected) - result = ts['1/2/2000':'1/3/2000'] + result = ts["1/2/2000":"1/3/2000"] expected = ts[1:4] assert_series_equal(result, expected) @@ -537,6 +564,7 @@ def test_groupby_average_dup_values(dups): def test_indexing_over_size_cutoff(): import datetime + # #1821 
old_cutoff = _index._SIZE_CUTOFF @@ -561,9 +589,9 @@ def test_indexing_over_size_cutoff(): for p in duplicate_positions: dates[p + 1] = dates[p] - df = DataFrame(np.random.randn(len(dates), 4), - index=dates, - columns=list('ABCD')) + df = DataFrame( + np.random.randn(len(dates), 4), index=dates, columns=list("ABCD") + ) pos = n * 3 timestamp = df.index[pos] @@ -578,7 +606,7 @@ def test_indexing_over_size_cutoff(): def test_indexing_unordered(): # GH 2437 - rng = date_range(start='2011-01-01', end='2011-01-15') + rng = date_range(start="2011-01-01", end="2011-01-15") ts = Series(np.random.rand(len(rng)), index=rng) ts2 = pd.concat([ts[0:4], ts[-4:], ts[4:-4]]) @@ -597,83 +625,84 @@ def compare(slobj): expected = ts[slobj] assert_series_equal(result, expected) - compare(slice('2011-01-01', '2011-01-15')) - compare(slice('2010-12-30', '2011-01-15')) - compare(slice('2011-01-01', '2011-01-16')) + compare(slice("2011-01-01", "2011-01-15")) + compare(slice("2010-12-30", "2011-01-15")) + compare(slice("2011-01-01", "2011-01-16")) # partial ranges - compare(slice('2011-01-01', '2011-01-6')) - compare(slice('2011-01-06', '2011-01-8')) - compare(slice('2011-01-06', '2011-01-12')) + compare(slice("2011-01-01", "2011-01-6")) + compare(slice("2011-01-06", "2011-01-8")) + compare(slice("2011-01-06", "2011-01-12")) # single values - result = ts2['2011'].sort_index() - expected = ts['2011'] + result = ts2["2011"].sort_index() + expected = ts["2011"] assert_series_equal(result, expected) # diff freq - rng = date_range(datetime(2005, 1, 1), periods=20, freq='M') + rng = date_range(datetime(2005, 1, 1), periods=20, freq="M") ts = Series(np.arange(len(rng)), index=rng) ts = ts.take(np.random.permutation(20)) - result = ts['2005'] + result = ts["2005"] for t in result.index: assert t.year == 2005 def test_indexing(): - idx = date_range("2001-1-1", periods=20, freq='M') + idx = date_range("2001-1-1", periods=20, freq="M") ts = Series(np.random.rand(len(idx)), index=idx) # getting # GH 3070, make sure semantics work on Series/Frame - expected = ts['2001'] - expected.name = 'A' + expected = ts["2001"] + expected.name = "A" df = DataFrame(dict(A=ts)) - result = df['2001']['A'] + result = df["2001"]["A"] assert_series_equal(expected, result) # setting - ts['2001'] = 1 - expected = ts['2001'] - expected.name = 'A' + ts["2001"] = 1 + expected = ts["2001"] + expected.name = "A" - df.loc['2001', 'A'] = 1 + df.loc["2001", "A"] = 1 - result = df['2001']['A'] + result = df["2001"]["A"] assert_series_equal(expected, result) # GH3546 (not including times on the last day) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:00', - freq='H') + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:00", freq="H") ts = Series(range(len(idx)), index=idx) - expected = ts['2013-05'] + expected = ts["2013-05"] assert_series_equal(expected, ts) - idx = date_range(start='2013-05-31 00:00', end='2013-05-31 23:59', - freq='S') + idx = date_range(start="2013-05-31 00:00", end="2013-05-31 23:59", freq="S") ts = Series(range(len(idx)), index=idx) - expected = ts['2013-05'] + expected = ts["2013-05"] assert_series_equal(expected, ts) - idx = [Timestamp('2013-05-31 00:00'), - Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999))] + idx = [ + Timestamp("2013-05-31 00:00"), + Timestamp(datetime(2013, 5, 31, 23, 59, 59, 999999)), + ] ts = Series(range(len(idx)), index=idx) - expected = ts['2013'] + expected = ts["2013"] assert_series_equal(expected, ts) # GH14826, indexing with a seconds resolution string / datetime object - 
df = DataFrame(np.random.rand(5, 5), - columns=['open', 'high', 'low', 'close', 'volume'], - index=date_range('2012-01-02 18:01:00', - periods=5, tz='US/Central', freq='s')) + df = DataFrame( + np.random.rand(5, 5), + columns=["open", "high", "low", "close", "volume"], + index=date_range("2012-01-02 18:01:00", periods=5, tz="US/Central", freq="s"), + ) expected = df.loc[[df.index[2]]] # this is a single date, so will raise with pytest.raises(KeyError, match=r"^'2012-01-02 18:01:02'$"): - df['2012-01-02 18:01:02'] + df["2012-01-02 18:01:02"] msg = r"Timestamp\('2012-01-02 18:01:02-0600', tz='US/Central', freq='S'\)" with pytest.raises(KeyError, match=msg): df[df.index[2]] @@ -685,7 +714,7 @@ def test_indexing(): def test_set_none_nan(): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) series[3] = None assert series[3] is NaT @@ -701,15 +730,15 @@ def test_set_none_nan(): def test_nat_operations(): # GH 8617 - s = Series([0, pd.NaT], dtype='m8[ns]') + s = Series([0, pd.NaT], dtype="m8[ns]") exp = s[0] assert s.median() == exp assert s.min() == exp assert s.max() == exp -@pytest.mark.parametrize('method', ["round", "floor", "ceil"]) -@pytest.mark.parametrize('freq', ["s", "5s", "min", "5min", "h", "5h"]) +@pytest.mark.parametrize("method", ["round", "floor", "ceil"]) +@pytest.mark.parametrize("freq", ["s", "5s", "min", "5min", "h", "5h"]) def test_round_nat(method, freq): # GH14940 s = Series([pd.NaT]) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index d794b4aca82e6..c8342c54e9b5d 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -8,8 +8,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -from pandas import ( - Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp) +from pandas import Categorical, DataFrame, MultiIndex, Series, Timedelta, Timestamp import pandas.util.testing as tm from pandas.util.testing import assert_series_equal @@ -17,7 +16,7 @@ def test_basic_indexing(): - s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + s = Series(np.random.randn(5), index=["a", "b", "a", "a", "b"]) msg = "index out of bounds" with pytest.raises(IndexError, match=msg): @@ -27,7 +26,7 @@ def test_basic_indexing(): s[5] = 0 with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] s = s.sort_index() @@ -46,36 +45,35 @@ def test_basic_getitem_with_labels(test_data): expected = test_data.ts.reindex(indices) assert_series_equal(result, expected) - result = test_data.ts[indices[0]:indices[2]] - expected = test_data.ts.loc[indices[0]:indices[2]] + result = test_data.ts[indices[0] : indices[2]] + expected = test_data.ts.loc[indices[0] : indices[2]] assert_series_equal(result, expected) # integer indexes, be careful s = Series(np.random.randn(10), index=list(range(0, 20, 2))) inds = [0, 2, 5, 7, 8] arr_inds = np.array([0, 2, 5, 7, 8]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s[inds] expected = s.reindex(inds) assert_series_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = s[arr_inds] expected = s.reindex(arr_inds) assert_series_equal(result, expected) # GH12089 # with tz for values - s = Series(pd.date_range("2011-01-01", 
periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) - expected = Timestamp('2011-01-01', tz='US/Eastern') - result = s.loc['a'] + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) + expected = Timestamp("2011-01-01", tz="US/Eastern") + result = s.loc["a"] assert result == expected result = s.iloc[0] assert result == expected - result = s['a'] + result = s["a"] assert result == expected @@ -115,7 +113,7 @@ def test_getitem_get(test_data): # None # GH 5652 - for s in [Series(), Series(index=list('abc'))]: + for s in [Series(), Series(index=list("abc"))]: result = s.get(None) assert result is None @@ -144,25 +142,26 @@ def test_type_promotion(): s["a"] = pd.Timestamp("2016-01-01") s["b"] = 3.0 s["c"] = "foo" - expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], - index=["a", "b", "c"]) + expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) assert_series_equal(s, expected) @pytest.mark.parametrize( - 'result_1, duplicate_item, expected_1', + "result_1, duplicate_item, expected_1", [ [ - pd.Series({1: 12, 2: [1, 2, 2, 3]}), pd.Series({1: 313}), - pd.Series({1: 12, }, dtype=object), + pd.Series({1: 12, 2: [1, 2, 2, 3]}), + pd.Series({1: 313}), + pd.Series({1: 12}, dtype=object), ], [ pd.Series({1: [1, 2, 3], 2: [1, 2, 2, 3]}), - pd.Series({1: [1, 2, 3]}), pd.Series({1: [1, 2, 3], }), + pd.Series({1: [1, 2, 3]}), + pd.Series({1: [1, 2, 3]}), ], - ]) -def test_getitem_with_duplicates_indices( - result_1, duplicate_item, expected_1): + ], +) +def test_getitem_with_duplicates_indices(result_1, duplicate_item, expected_1): # GH 17610 result = result_1.append(duplicate_item) expected = expected_1.append(duplicate_item) @@ -184,11 +183,11 @@ def test_getitem_out_of_bounds(test_data): def test_getitem_setitem_integers(): # caused bug without test - s = Series([1, 2, 3], ['a', 'b', 'c']) + s = Series([1, 2, 3], ["a", "b", "c"]) - assert s.iloc[0] == s['a'] + assert s.iloc[0] == s["a"] s.iloc[0] = 5 - tm.assert_almost_equal(s['a'], 5) + tm.assert_almost_equal(s["a"], 5) def test_getitem_box_float64(test_data): @@ -197,12 +196,9 @@ def test_getitem_box_float64(test_data): @pytest.mark.parametrize( - 'arr', - [ - np.random.randn(10), - tm.makeDateIndex(10, name='a').tz_localize( - tz='US/Eastern'), - ]) + "arr", + [np.random.randn(10), tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern")], +) def test_get(arr): # GH 21260 s = Series(arr, index=[2 * i for i in range(len(arr))]) @@ -219,14 +215,14 @@ def test_get(arr): assert s.get(-1) is None assert s.get(s.index.max() + 1) is None - s = Series(arr[:6], index=list('abcdef')) - assert s.get('c') == s.iloc[2] + s = Series(arr[:6], index=list("abcdef")) + assert s.get("c") == s.iloc[2] - result = s.get(slice('b', 'd')) + result = s.get(slice("b", "d")) expected = s.iloc[[1, 2, 3]] tm.assert_series_equal(result, expected) - result = s.get('Z') + result = s.get("Z") assert result is None assert s.get(4) == s.iloc[4] @@ -240,12 +236,12 @@ def test_get(arr): def test_series_box_timestamp(): - rng = pd.date_range('20090415', '20090519', freq='B') + rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng) assert isinstance(ser[5], pd.Timestamp) - rng = pd.date_range('20090415', '20090519', freq='B') + rng = pd.date_range("20090415", "20090519", freq="B") ser = Series(rng, index=rng) assert isinstance(ser[5], pd.Timestamp) @@ -261,29 +257,27 @@ def test_getitem_ambiguous_keyerror(): def test_getitem_unordered_dup(): - obj = Series(range(5), index=['c', 'a', 
'a', 'b', 'b']) - assert is_scalar(obj['c']) - assert obj['c'] == 0 + obj = Series(range(5), index=["c", "a", "a", "b", "b"]) + assert is_scalar(obj["c"]) + assert obj["c"] == 0 def test_getitem_dups_with_missing(): # breaks reindex, so need to use .loc internally # GH 4246 - s = Series([1, 2, 3, 4], ['foo', 'bar', 'foo', 'bah']) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - expected = s.loc[['foo', 'bar', 'bah', 'bam']] - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[['foo', 'bar', 'bah', 'bam']] + s = Series([1, 2, 3, 4], ["foo", "bar", "foo", "bah"]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = s.loc[["foo", "bar", "bah", "bam"]] + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = s[["foo", "bar", "bah", "bam"]] assert_series_equal(result, expected) def test_getitem_dups(): - s = Series(range(5), index=['A', 'A', 'B', 'C', 'C'], dtype=np.int64) - expected = Series([3, 4], index=['C', 'C'], dtype=np.int64) - result = s['C'] + s = Series(range(5), index=["A", "A", "B", "C", "C"], dtype=np.int64) + expected = Series([3, 4], index=["C", "C"], dtype=np.int64) + result = s["C"] assert_series_equal(result, expected) @@ -306,8 +300,10 @@ def test_getitem_dataframe(): rng = list(range(10)) s = pd.Series(10, index=rng) df = pd.DataFrame(rng, index=rng) - msg = ("Indexing a Series with DataFrame is not supported," - " use the appropriate DataFrame column") + msg = ( + "Indexing a Series with DataFrame is not supported," + " use the appropriate DataFrame column" + ) with pytest.raises(TypeError, match=msg): s[df > 5] @@ -322,30 +318,29 @@ def test_setitem(test_data): assert not np.isnan(test_data.ts[2]) # caught this bug when writing tests - series = Series(tm.makeIntIndex(20).astype(float), - index=tm.makeIntIndex(20)) + series = Series(tm.makeIntIndex(20).astype(float), index=tm.makeIntIndex(20)) series[::2] = 0 assert (series[::2] == 0).all() # set item that's not contained s = test_data.series.copy() - s['foobar'] = 1 + s["foobar"] = 1 - app = Series([1], index=['foobar'], name='series') + app = Series([1], index=["foobar"], name="series") expected = test_data.series.append(app) assert_series_equal(s, expected) # Test for issue #10193 - key = pd.Timestamp('2012-01-01') + key = pd.Timestamp("2012-01-01") series = pd.Series() series[key] = 47 expected = pd.Series(47, [key]) assert_series_equal(series, expected) - series = pd.Series([], pd.DatetimeIndex([], freq='D')) + series = pd.Series([], pd.DatetimeIndex([], freq="D")) series[key] = 47 - expected = pd.Series(47, pd.DatetimeIndex([key], freq='D')) + expected = pd.Series(47, pd.DatetimeIndex([key], freq="D")) assert_series_equal(series, expected) @@ -377,25 +372,23 @@ def test_setitem_dtypes(): def test_set_value(test_data): idx = test_data.ts.index[10] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = test_data.ts.set_value(idx, 0) assert res is test_data.ts assert test_data.ts[idx] == 0 # equiv s = test_data.series.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = s.set_value('foobar', 0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = s.set_value("foobar", 0) assert res is s - assert res.index[-1] == 'foobar' - assert res['foobar'] == 0 + assert res.index[-1] == "foobar" + assert res["foobar"] == 0 s = 
test_data.series.copy() - s.loc['foobar'] = 0 - assert s.index[-1] == 'foobar' - assert s['foobar'] == 0 + s.loc["foobar"] = 0 + assert s.index[-1] == "foobar" + assert s["foobar"] == 0 def test_setslice(test_data): @@ -427,37 +420,46 @@ def test_basic_getitem_setitem_corner(test_data): test_data.ts[[5, slice(None, None)]] = 2 -@pytest.mark.parametrize('tz', ['US/Eastern', 'UTC', 'Asia/Tokyo']) +@pytest.mark.parametrize("tz", ["US/Eastern", "UTC", "Asia/Tokyo"]) def test_setitem_with_tz(tz): - orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + orig = pd.Series(pd.date_range("2016-01-01", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) # scalar s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2016-01-01 02:00', tz=tz)]) + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2016-01-01 02:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) + exp = pd.Series( + [ + pd.Timestamp("2016-01-01 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() @@ -471,36 +473,45 @@ def test_setitem_with_tz(tz): def test_setitem_with_tz_dst(): # GH XXX - tz = 'US/Eastern' - orig = pd.Series(pd.date_range('2016-11-06', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) + tz = "US/Eastern" + orig = pd.Series(pd.date_range("2016-11-06", freq="H", periods=3, tz=tz)) + assert orig.dtype == "datetime64[ns, {0}]".format(tz) # scalar s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-11-06 00:00-04:00', tz=tz), - pd.Timestamp('2011-01-01 00:00-05:00', tz=tz), - pd.Timestamp('2016-11-06 01:00-05:00', tz=tz)]) + s[1] = pd.Timestamp("2011-01-01", tz=tz) + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00-04:00", tz=tz), + pd.Timestamp("2011-01-01 00:00-05:00", tz=tz), + pd.Timestamp("2016-11-06 01:00-05:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.loc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + s.iloc[1] = pd.Timestamp("2011-01-01", tz=tz) tm.assert_series_equal(s, exp) # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + 
vals = pd.Series( + [pd.Timestamp("2011-01-01", tz=tz), pd.Timestamp("2012-01-01", tz=tz)], + index=[1, 2], + ) + assert vals.dtype == "datetime64[ns, {0}]".format(tz) s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-11-06 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) + exp = pd.Series( + [ + pd.Timestamp("2016-11-06 00:00", tz=tz), + pd.Timestamp("2011-01-01 00:00", tz=tz), + pd.Timestamp("2012-01-01 00:00", tz=tz), + ] + ) tm.assert_series_equal(s, exp) s = orig.copy() @@ -537,8 +548,7 @@ def test_categorial_assigning_ops(): s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series(Categorical(["b", "a"], categories=["a", "b"]), - index=["x", "y"]) + exp = Series(Categorical(["b", "a"], categories=["a", "b"]), index=["x", "y"]) tm.assert_series_equal(s, exp) # ensure that one can set something to np.nan @@ -570,7 +580,7 @@ def test_slice(test_data): def test_slice_can_reorder_not_uniquely_indexed(): - s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + s = Series(1, index=["a", "a", "b", "b", "c"]) s[::-1] # it works! @@ -614,8 +624,7 @@ def test_setitem_na(): s[::2] = np.nan assert_series_equal(s, expected) - expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, - 9]) + expected = Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]) s = Series(np.arange(10)) s[:5] = np.nan assert_series_equal(s, expected) @@ -624,65 +633,63 @@ def test_setitem_na(): def test_timedelta_assignment(): # GH 8209 s = Series([]) - s.loc['B'] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta('1 days'), index=['B'])) + s.loc["B"] = timedelta(1) + tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) - s = s.reindex(s.index.insert(0, 'A')) - tm.assert_series_equal(s, Series( - [np.nan, Timedelta('1 days')], index=['A', 'B'])) + s = s.reindex(s.index.insert(0, "A")) + tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) result = s.fillna(timedelta(1)) - expected = Series(Timedelta('1 days'), index=['A', 'B']) + expected = Series(Timedelta("1 days"), index=["A", "B"]) tm.assert_series_equal(result, expected) - s.loc['A'] = timedelta(1) + s.loc["A"] = timedelta(1) tm.assert_series_equal(s, expected) # GH 14155 - s = Series(10 * [np.timedelta64(10, 'm')]) - s.loc[[1, 2, 3]] = np.timedelta64(20, 'm') - expected = pd.Series(10 * [np.timedelta64(10, 'm')]) - expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) + s = Series(10 * [np.timedelta64(10, "m")]) + s.loc[[1, 2, 3]] = np.timedelta64(20, "m") + expected = pd.Series(10 * [np.timedelta64(10, "m")]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, "m")) tm.assert_series_equal(s, expected) def test_underlying_data_conversion(): # GH 4080 - df = DataFrame({c: [1, 2, 3] for c in ['a', 'b', 'c']}) - df.set_index(['a', 'b', 'c'], inplace=True) + df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) + df.set_index(["a", "b", "c"], inplace=True) s = Series([1], index=[(2, 2, 2)]) - df['val'] = 0 + df["val"] = 0 df - df['val'].update(s) + df["val"].update(s) - expected = DataFrame( - dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) - expected.set_index(['a', 'b', 'c'], inplace=True) + expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) + expected.set_index(["a", "b", "c"], inplace=True) tm.assert_frame_equal(df, expected) # GH 3970 # these are chained assignments as well - pd.set_option('chained_assignment', None) + pd.set_option("chained_assignment", None) df = 
DataFrame({"aa": range(5), "bb": [2.2] * 5}) df["cc"] = 0.0 ck = [True] * len(df) - df["bb"].iloc[0] = .13 + df["bb"].iloc[0] = 0.13 # TODO: unused df_tmp = df.iloc[ck] # noqa - df["bb"].iloc[0] = .15 - assert df['bb'].iloc[0] == 0.15 - pd.set_option('chained_assignment', 'raise') + df["bb"].iloc[0] = 0.15 + assert df["bb"].iloc[0] == 0.15 + pd.set_option("chained_assignment", "raise") # GH 3217 df = DataFrame(dict(a=[1, 3], b=[np.nan, 2])) - df['c'] = np.nan - df['c'].update(pd.Series(['foo'], index=[0])) + df["c"] = np.nan + df["c"].update(pd.Series(["foo"], index=[0])) - expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=['foo', np.nan])) + expected = DataFrame(dict(a=[1, 3], b=[np.nan, 2], c=["foo", np.nan])) tm.assert_frame_equal(df, expected) @@ -696,9 +703,9 @@ def test_cast_on_putmask(): # GH 2746 # need to upcast - s = Series([1, 2], index=[1, 2], dtype='int64') - s[[True, False]] = Series([0], index=[1], dtype='int64') - expected = Series([0, 2], index=[1, 2], dtype='int64') + s = Series([1, 2], index=[1, 2], dtype="int64") + s[[True, False]] = Series([0], index=[1], dtype="int64") + expected = Series([0, 2], index=[1, 2], dtype="int64") assert_series_equal(s, expected) @@ -715,25 +722,25 @@ def test_type_promote_putmask(): mask = s > 0 s2 = s[mask].map(str) s[mask] = s2 - assert_series_equal(s, Series([0, '1', '2', 0])) + assert_series_equal(s, Series([0, "1", "2", 0])) - s = Series([0, 'foo', 'bar', 0]) + s = Series([0, "foo", "bar", 0]) mask = Series([False, True, True, False]) s2 = s[mask] s[mask] = s2 - assert_series_equal(s, Series([0, 'foo', 'bar', 0])) + assert_series_equal(s, Series([0, "foo", "bar", 0])) def test_multilevel_preserve_name(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - s = Series(np.random.randn(len(index)), index=index, name='sth') - - result = s['foo'] - result2 = s.loc['foo'] + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(np.random.randn(len(index)), index=index, name="sth") + + result = s["foo"] + result2 = s.loc["foo"] assert result.name == s.name assert result2.name == s.name @@ -774,13 +781,13 @@ def test_setitem_slice_into_readonly_backing_data(): def test_pop(): # GH 6600 - df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) + df = DataFrame({"A": 0, "B": np.arange(5, dtype="int64"), "C": 0}) k = df.iloc[4] - result = k.pop('B') + result = k.pop("B") assert result == 4 - expected = Series([0, 0], index=['A', 'C'], name=4) + expected = Series([0, 0], index=["A", "C"], name=4) assert_series_equal(k, expected) @@ -804,11 +811,11 @@ def test_take(): def test_take_categorical(): # https://github.com/pandas-dev/pandas/issues/20664 - s = Series(pd.Categorical(['a', 'b', 'c'])) + s = Series(pd.Categorical(["a", "b", "c"])) result = s.take([-2, -2, 0]) - expected = Series(pd.Categorical(['b', 'b', 'a'], - categories=['a', 'b', 'c']), - index=[1, 1, 0]) + expected = Series( + pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), index=[1, 1, 0] + ) assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index 8fefc19b4843f..2f7807526a29d 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -6,21 
+6,16 @@ from pandas.util.testing import assert_series_equal -@pytest.mark.parametrize("val,expected", [ - (2**63 - 1, 3), - (2**63, 4), -]) +@pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) def test_loc_uint64(val, expected): # see gh-19399 - s = Series({2**63 - 1: 3, 2**63: 4}) + s = Series({2 ** 63 - 1: 3, 2 ** 63: 4}) assert s.loc[val] == expected def test_loc_getitem(test_data): inds = test_data.series.index[[3, 4, 7]] - assert_series_equal( - test_data.series.loc[inds], - test_data.series.reindex(inds)) + assert_series_equal(test_data.series.loc[inds], test_data.series.reindex(inds)) assert_series_equal(test_data.series.iloc[5::2], test_data.series[5::2]) # slice with indices @@ -99,7 +94,7 @@ def test_loc_setitem_corner(test_data): test_data.series.loc[inds] = 5 msg = r"\['foo'\] not in index" with pytest.raises(KeyError, match=msg): - test_data.series.loc[inds + ['foo']] = 5 + test_data.series.loc[inds + ["foo"]] = 5 def test_basic_setitem_with_labels(test_data): @@ -113,8 +108,8 @@ def test_basic_setitem_with_labels(test_data): cp = test_data.ts.copy() exp = test_data.ts.copy() - cp[indices[0]:indices[2]] = 0 - exp.loc[indices[0]:indices[2]] = 0 + cp[indices[0] : indices[2]] = 0 + exp.loc[indices[0] : indices[2]] = 0 assert_series_equal(cp, exp) # integer indexes, be careful @@ -144,12 +139,13 @@ def test_basic_setitem_with_labels(test_data): # GH12089 # with tz for values - s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) + s = Series( + pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), index=["a", "b", "c"] + ) s2 = s.copy() - expected = Timestamp('2011-01-03', tz='US/Eastern') - s2.loc['a'] = expected - result = s2.loc['a'] + expected = Timestamp("2011-01-03", tz="US/Eastern") + s2.loc["a"] = expected + result = s2.loc["a"] assert result == expected s2 = s.copy() @@ -158,6 +154,6 @@ def test_basic_setitem_with_labels(test_data): assert result == expected s2 = s.copy() - s2['a'] = expected - result = s2['a'] + s2["a"] = expected + result = s2["a"] assert result == expected diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index 480d185f18b8d..b4996575b0a05 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -9,21 +9,88 @@ def test_get(): # GH 6383 - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, 45, - 51, 39, 55, 43, 54, 52, 51, 54])) + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ) + ) result = s.get(25, 0) expected = 0 assert result == expected - s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, - 45, 51, 39, 55, 43, 54, 52, 51, 54]), - index=pd.Float64Index( - [25.0, 36.0, 49.0, 64.0, 81.0, 100.0, - 121.0, 144.0, 169.0, 196.0, 1225.0, - 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, - 1681.0, 1764.0, 1849.0, 1936.0], - dtype='object')) + s = Series( + np.array( + [ + 43, + 48, + 60, + 48, + 50, + 51, + 50, + 45, + 57, + 48, + 56, + 45, + 51, + 39, + 55, + 43, + 54, + 52, + 51, + 54, + ] + ), + index=pd.Float64Index( + [ + 25.0, + 36.0, + 49.0, + 64.0, + 81.0, + 100.0, + 121.0, + 144.0, + 169.0, + 196.0, + 1225.0, + 1296.0, + 1369.0, + 1444.0, + 1521.0, + 1600.0, + 1681.0, + 1764.0, + 1849.0, + 1936.0, + ], + dtype="object", + ), + ) result = s.get(25, 0) expected = 43 @@ -31,24 +98,24 @@ def test_get(): # GH 7407 # with a boolean accessor - df = 
pd.DataFrame({'i': [0] * 3, 'b': [False] * 3}) + df = pd.DataFrame({"i": [0] * 3, "b": [False] * 3}) vc = df.i.value_counts() - result = vc.get(99, default='Missing') - assert result == 'Missing' + result = vc.get(99, default="Missing") + assert result == "Missing" vc = df.b.value_counts() - result = vc.get(False, default='Missing') + result = vc.get(False, default="Missing") assert result == 3 - result = vc.get(True, default='Missing') - assert result == 'Missing' + result = vc.get(True, default="Missing") + assert result == "Missing" def test_get_nan(): # GH 8569 s = pd.Float64Index(range(10)).to_series() assert s.get(np.nan) is None - assert s.get(np.nan, default='Missing') == 'Missing' + assert s.get(np.nan, default="Missing") == "Missing" def test_get_nan_multiple(): @@ -59,20 +126,18 @@ def test_get_nan_multiple(): idx = [2, 30] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert_series_equal(s.get(idx), - Series([2, np.nan], index=idx)) + assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) idx = [2, np.nan] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert_series_equal(s.get(idx), - Series([2, np.nan], index=idx)) + assert_series_equal(s.get(idx), Series([2, np.nan], index=idx)) # GH 17295 - all missing keys idx = [20, 30] - assert(s.get(idx) is None) + assert s.get(idx) is None idx = [np.nan, np.nan] - assert(s.get(idx) is None) + assert s.get(idx) is None def test_delitem(): @@ -97,28 +162,24 @@ def test_delitem(): # only 1 left, del, add, del s = Series(1) del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) s[0] = 1 assert_series_equal(s, Series(1)) del s[0] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='int64'))) + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="int64"))) # Index(dtype=object) - s = Series(1, index=['a']) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) - s['a'] = 1 - assert_series_equal(s, Series(1, index=['a'])) - del s['a'] - assert_series_equal(s, Series(dtype='int64', index=Index( - [], dtype='object'))) + s = Series(1, index=["a"]) + del s["a"] + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) + s["a"] = 1 + assert_series_equal(s, Series(1, index=["a"])) + del s["a"] + assert_series_equal(s, Series(dtype="int64", index=Index([], dtype="object"))) def test_slice_float64(): - values = np.arange(10., 50., 2) + values = np.arange(10.0, 50.0, 2) index = Index(values) start, end = values[[5, 15]] @@ -150,7 +211,7 @@ def test_getitem_negative_out_of_bounds(): s[-11] msg = "index -11 is out of bounds for axis 0 with size 10" with pytest.raises(IndexError, match=msg): - s[-11] = 'foo' + s[-11] = "foo" def test_getitem_regression(): @@ -192,19 +253,21 @@ def test_getitem_setitem_slice_integers(): def test_setitem_float_labels(): # note labels are floats - s = Series(['a', 'b', 'c'], index=[0, 0.5, 1]) + s = Series(["a", "b", "c"], index=[0, 0.5, 1]) tmp = s.copy() - s.loc[1] = 'zoo' - tmp.iloc[2] = 'zoo' + s.loc[1] = "zoo" + tmp.iloc[2] = "zoo" assert_series_equal(s, tmp) def test_slice_float_get_set(test_data): - msg = (r"cannot do slice indexing on <class 'pandas\.core\.indexes" - r"\.datetimes\.DatetimeIndex'> with these indexers \[{key}\]" - r" of <class 'float'>") + msg = ( + r"cannot do slice indexing on <class 'pandas\.core\.indexes" + r"\.datetimes\.DatetimeIndex'> with these indexers \[{key}\]" + r" of <class 'float'>" + ) with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): 
test_data.ts[4.0:10.0] @@ -237,7 +300,7 @@ def test_int_indexing(): s[5] with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] # not monotonic s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) @@ -246,7 +309,7 @@ def test_int_indexing(): s[5] with pytest.raises(KeyError, match=r"^'c'$"): - s['c'] + s["c"] def test_getitem_int64(test_data): diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index f734fe7fd58f4..63baa6af7c02a 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -8,17 +8,20 @@ class TestSeriesAlterAxes: - def test_setindex(self, string_series): # wrong type - msg = (r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed") + msg = ( + r"Index\(\.\.\.\) must be called with a collection of some" + r" kind, None was passed" + ) with pytest.raises(TypeError, match=msg): string_series.index = None # wrong length - msg = ("Length mismatch: Expected axis has 30 elements, new" - " values have 29 elements") + msg = ( + "Length mismatch: Expected axis has 30 elements, new" + " values have 29 elements" + ) with pytest.raises(ValueError, match=msg): string_series.index = np.arange(len(string_series) - 1) @@ -30,7 +33,7 @@ def test_setindex(self, string_series): def test_rename(self, datetime_series): ts = datetime_series - renamer = lambda x: x.strftime('%Y%m%d') + renamer = lambda x: x.strftime("%Y%m%d") renamed = ts.rename(renamer) assert renamed.index[0] == renamer(ts.index[0]) @@ -40,53 +43,53 @@ def test_rename(self, datetime_series): tm.assert_series_equal(renamed, renamed2) # partial dict - s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') - renamed = s.rename({'b': 'foo', 'd': 'bar'}) - tm.assert_index_equal(renamed.index, Index(['a', 'foo', 'c', 'bar'])) + s = Series(np.arange(4), index=["a", "b", "c", "d"], dtype="int64") + renamed = s.rename({"b": "foo", "d": "bar"}) + tm.assert_index_equal(renamed.index, Index(["a", "foo", "c", "bar"])) # index with name - renamer = Series(np.arange(4), - index=Index(['a', 'b', 'c', 'd'], name='name'), - dtype='int64') + renamer = Series( + np.arange(4), index=Index(["a", "b", "c", "d"], name="name"), dtype="int64" + ) renamed = renamer.rename({}) assert renamed.index.name == renamer.index.name def test_rename_by_series(self): - s = Series(range(5), name='foo') + s = Series(range(5), name="foo") renamer = Series({1: 10, 2: 20}) result = s.rename(renamer) - expected = Series(range(5), index=[0, 10, 20, 3, 4], name='foo') + expected = Series(range(5), index=[0, 10, 20, 3, 4], name="foo") tm.assert_series_equal(result, expected) def test_rename_set_name(self): - s = Series(range(4), index=list('abcd')) - for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]: + s = Series(range(4), index=list("abcd")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: result = s.rename(name) assert result.name == name tm.assert_numpy_array_equal(result.index.values, s.index.values) assert s.name is None def test_rename_set_name_inplace(self): - s = Series(range(3), index=list('abc')) - for name in ['foo', 123, 123., datetime(2001, 11, 11), ('foo',)]: + s = Series(range(3), index=list("abc")) + for name in ["foo", 123, 123.0, datetime(2001, 11, 11), ("foo",)]: s.rename(name, inplace=True) assert s.name == name - exp = np.array(['a', 'b', 'c'], dtype=np.object_) + exp = np.array(["a", "b", "c"], dtype=np.object_) tm.assert_numpy_array_equal(s.index.values, exp) def 
test_rename_axis_supported(self): # Supporting axis for compatibility, detailed in GH-18589 s = Series(range(5)) s.rename({}, axis=0) - s.rename({}, axis='index') - with pytest.raises(ValueError, match='No axis named 5'): + s.rename({}, axis="index") + with pytest.raises(ValueError, match="No axis named 5"): s.rename({}, axis=5) def test_set_name_attribute(self): s = Series([1, 2, 3]) - s2 = Series([1, 2, 3], name='bar') - for name in [7, 7., 'name', datetime(2001, 1, 1), (1,), "\u05D0"]: + s2 = Series([1, 2, 3], name="bar") + for name in [7, 7.0, "name", datetime(2001, 1, 1), (1,), "\u05D0"]: s.name = name assert s.name == name s2.name = name @@ -94,13 +97,13 @@ def test_set_name_attribute(self): def test_set_name(self): s = Series([1, 2, 3]) - s2 = s._set_name('foo') - assert s2.name == 'foo' + s2 = s._set_name("foo") + assert s2.name == "foo" assert s.name is None assert s is not s2 def test_rename_inplace(self, datetime_series): - renamer = lambda x: x.strftime('%Y%m%d') + renamer = lambda x: x.strftime("%Y%m%d") expected = renamer(datetime_series.index[0]) datetime_series.rename(renamer, inplace=True) @@ -116,14 +119,14 @@ def test_set_index_makes_timeseries(self): def test_reset_index(self): df = tm.makeDataFrame()[:5] ser = df.stack() - ser.index.names = ['hash', 'category'] + ser.index.names = ["hash", "category"] - ser.name = 'value' + ser.name = "value" df = ser.reset_index() - assert 'value' in df + assert "value" in df - df = ser.reset_index(name='value2') - assert 'value2' in df + df = ser.reset_index(name="value2") + assert "value2" in df # check inplace s = ser.reset_index(drop=True) @@ -132,9 +135,10 @@ def test_reset_index(self): tm.assert_series_equal(s, s2) # level - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) s = Series(np.random.randn(6), index=index) rs = s.reset_index(level=1) assert len(rs.columns) == 2 @@ -144,47 +148,45 @@ def test_reset_index(self): assert isinstance(rs, Series) def test_reset_index_name(self): - s = Series([1, 2, 3], index=Index(range(3), name='x')) + s = Series([1, 2, 3], index=Index(range(3), name="x")) assert s.reset_index().index.name is None assert s.reset_index(drop=True).index.name is None def test_reset_index_level(self): - df = DataFrame([[1, 2, 3], [4, 5, 6]], - columns=['A', 'B', 'C']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - for levels in ['A', 'B'], [0, 1]: + for levels in ["A", "B"], [0, 1]: # With MultiIndex - s = df.set_index(['A', 'B'])['C'] + s = df.set_index(["A", "B"])["C"] result = s.reset_index(level=levels[0]) - tm.assert_frame_equal(result, df.set_index('B')) + tm.assert_frame_equal(result, df.set_index("B")) result = s.reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df.set_index('B')) + tm.assert_frame_equal(result, df.set_index("B")) result = s.reset_index(level=levels) tm.assert_frame_equal(result, df) - result = df.set_index(['A', 'B']).reset_index(level=levels, - drop=True) - tm.assert_frame_equal(result, df[['C']]) + result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) + tm.assert_frame_equal(result, df[["C"]]) - with pytest.raises(KeyError, match='Level E '): - s.reset_index(level=['A', 'E']) + with pytest.raises(KeyError, match="Level E "): + s.reset_index(level=["A", "E"]) # With single-level Index - s = 
df.set_index('A')['B'] + s = df.set_index("A")["B"] result = s.reset_index(level=levels[0]) - tm.assert_frame_equal(result, df[['A', 'B']]) + tm.assert_frame_equal(result, df[["A", "B"]]) result = s.reset_index(level=levels[:1]) - tm.assert_frame_equal(result, df[['A', 'B']]) + tm.assert_frame_equal(result, df[["A", "B"]]) result = s.reset_index(level=levels[0], drop=True) - tm.assert_series_equal(result, df['B']) + tm.assert_series_equal(result, df["B"]) - with pytest.raises(IndexError, match='Too many levels'): + with pytest.raises(IndexError, match="Too many levels"): s.reset_index(level=[0, 1, 2]) # Check that .reset_index([],drop=True) doesn't fail @@ -194,19 +196,20 @@ def test_reset_index_level(self): def test_reset_index_range(self): # GH 12071 - s = Series(range(2), name='A', dtype='int64') + s = Series(range(2), name="A", dtype="int64") series_result = s.reset_index() assert isinstance(series_result.index, RangeIndex) - series_expected = DataFrame([[0, 0], [1, 1]], - columns=['index', 'A'], - index=RangeIndex(stop=2)) + series_expected = DataFrame( + [[0, 0], [1, 1]], columns=["index", "A"], index=RangeIndex(stop=2) + ) tm.assert_frame_equal(series_result, series_expected) def test_reorder_levels(self): - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]], - names=['L0', 'L1', 'L2']) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + names=["L0", "L1", "L2"], + ) s = Series(np.arange(6), index=index) # no change, position @@ -214,49 +217,49 @@ def test_reorder_levels(self): tm.assert_series_equal(s, result) # no change, labels - result = s.reorder_levels(['L0', 'L1', 'L2']) + result = s.reorder_levels(["L0", "L1", "L2"]) tm.assert_series_equal(s, result) # rotate, position result = s.reorder_levels([1, 2, 0]) - e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], - [0, 0, 0, 0, 0, 0]], - names=['L1', 'L2', 'L0']) + e_idx = MultiIndex( + levels=[["one", "two", "three"], [0, 1], ["bar"]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]], + names=["L1", "L2", "L0"], + ) expected = Series(np.arange(6), index=e_idx) tm.assert_series_equal(result, expected) def test_rename_axis_mapper(self): # GH 19978 - mi = MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['ll', 'nn']) + mi = MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["ll", "nn"]) s = Series([i for i in range(len(mi))], index=mi) - result = s.rename_axis(index={'ll': 'foo'}) - assert result.index.names == ['foo', 'nn'] + result = s.rename_axis(index={"ll": "foo"}) + assert result.index.names == ["foo", "nn"] result = s.rename_axis(index=str.upper, axis=0) - assert result.index.names == ['LL', 'NN'] + assert result.index.names == ["LL", "NN"] - result = s.rename_axis(index=['foo', 'goo']) - assert result.index.names == ['foo', 'goo'] + result = s.rename_axis(index=["foo", "goo"]) + assert result.index.names == ["foo", "goo"] - with pytest.raises(TypeError, match='unexpected'): - s.rename_axis(columns='wrong') + with pytest.raises(TypeError, match="unexpected"): + s.rename_axis(columns="wrong") def test_rename_axis_inplace(self, datetime_series): # GH 15704 - expected = datetime_series.rename_axis('foo') + expected = datetime_series.rename_axis("foo") result = datetime_series - no_return = result.rename_axis('foo', inplace=True) + no_return = 
result.rename_axis("foo", inplace=True) assert no_return is None tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('kwargs', [{'mapper': None}, {'index': None}, {}]) + @pytest.mark.parametrize("kwargs", [{"mapper": None}, {"index": None}, {}]) def test_rename_axis_none(self, kwargs): # GH 25034 - index = Index(list('abc'), name='foo') + index = Index(list("abc"), name="foo") df = Series([1, 2, 3], index=index) result = df.rename_axis(**kwargs) @@ -266,52 +269,52 @@ def test_rename_axis_none(self, kwargs): def test_set_axis_inplace_axes(self, axis_series): # GH14636 - ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = ser.copy() - expected.index = list('abcd') + expected.index = list("abcd") # inplace=True # The FutureWarning comes from the fact that we would like to have # inplace default to False some day for inplace, warn in [(None, FutureWarning), (True, None)]: result = ser.copy() - kwargs = {'inplace': inplace} + kwargs = {"inplace": inplace} with tm.assert_produces_warning(warn): - result.set_axis(list('abcd'), axis=axis_series, **kwargs) + result.set_axis(list("abcd"), axis=axis_series, **kwargs) tm.assert_series_equal(result, expected) def test_set_axis_inplace(self): # GH14636 - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = s.copy() - expected.index = list('abcd') + expected.index = list("abcd") # inplace=False - result = s.set_axis(list('abcd'), axis=0, inplace=False) + result = s.set_axis(list("abcd"), axis=0, inplace=False) tm.assert_series_equal(expected, result) # omitting the "axis" parameter with tm.assert_produces_warning(None): - result = s.set_axis(list('abcd'), inplace=False) + result = s.set_axis(list("abcd"), inplace=False) tm.assert_series_equal(result, expected) # wrong values for the "axis" parameter - for axis in [2, 'foo']: - with pytest.raises(ValueError, match='No axis named'): - s.set_axis(list('abcd'), axis=axis, inplace=False) + for axis in [2, "foo"]: + with pytest.raises(ValueError, match="No axis named"): + s.set_axis(list("abcd"), axis=axis, inplace=False) def test_set_axis_prior_to_deprecation_signature(self): - s = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') + s = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") expected = s.copy() - expected.index = list('abcd') + expected.index = list("abcd") - for axis in [0, 'index']: + for axis in [0, "index"]: with tm.assert_produces_warning(FutureWarning): - result = s.set_axis(0, list('abcd'), inplace=False) + result = s.set_axis(0, list("abcd"), inplace=False) tm.assert_series_equal(result, expected) def test_reset_index_drop_errors(self): @@ -319,24 +322,25 @@ def test_reset_index_drop_errors(self): # KeyError raised for series index when passed level name is missing s = Series(range(4)) - with pytest.raises(KeyError, match='must be same as name'): - s.reset_index('wrong', drop=True) - with pytest.raises(KeyError, match='must be same as name'): - s.reset_index('wrong') + with pytest.raises(KeyError, match="must be same as name"): + s.reset_index("wrong", drop=True) + with pytest.raises(KeyError, match="must be same as name"): + s.reset_index("wrong") # KeyError raised for series when level to be dropped is missing s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2)) - with pytest.raises(KeyError, match='not found'): - s.reset_index('wrong', drop=True) + with pytest.raises(KeyError, 
match="not found"): + s.reset_index("wrong", drop=True) def test_droplevel(self): # GH20342 ser = Series([1, 2, 3, 4]) - ser.index = MultiIndex.from_arrays([(1, 2, 3, 4), (5, 6, 7, 8)], - names=['a', 'b']) - expected = ser.reset_index('b', drop=True) - result = ser.droplevel('b', axis='index') + ser.index = MultiIndex.from_arrays( + [(1, 2, 3, 4), (5, 6, 7, 8)], names=["a", "b"] + ) + expected = ser.reset_index("b", drop=True) + result = ser.droplevel("b", axis="index") tm.assert_series_equal(result, expected) # test that droplevel raises ValueError on axis != 0 with pytest.raises(ValueError): - ser.droplevel(1, axis='columns') + ser.droplevel(1, axis="columns") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e48fd9ce11a7d..89b411a284563 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -9,45 +9,60 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, Series, date_range, isna, notna) + Categorical, + CategoricalIndex, + DataFrame, + Series, + date_range, + isna, + notna, +) from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) class TestSeriesAnalytics: - def test_describe(self): - s = Series([0, 1, 2, 3, 4], name='int_data') + s = Series([0, 1, 2, 3, 4], name="int_data") result = s.describe() - expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4], - name='int_data', - index=['count', 'mean', 'std', 'min', '25%', - '50%', '75%', 'max']) + expected = Series( + [5, 2, s.std(), 0, 1, 2, 3, 4], + name="int_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) tm.assert_series_equal(result, expected) - s = Series([True, True, False, False, False], name='bool_data') + s = Series([True, True, False, False, False], name="bool_data") result = s.describe() - expected = Series([5, 2, False, 3], name='bool_data', - index=['count', 'unique', 'top', 'freq']) + expected = Series( + [5, 2, False, 3], name="bool_data", index=["count", "unique", "top", "freq"] + ) tm.assert_series_equal(result, expected) - s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data') + s = Series(["a", "a", "b", "c", "d"], name="str_data") result = s.describe() - expected = Series([5, 4, 'a', 2], name='str_data', - index=['count', 'unique', 'top', 'freq']) + expected = Series( + [5, 4, "a", 2], name="str_data", index=["count", "unique", "top", "freq"] + ) tm.assert_series_equal(result, expected) def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 s = pd.Series([None, None], dtype=object) result = s.describe() - expected = pd.Series([0, 0, np.nan, np.nan], dtype=object, - index=['count', 'unique', 'top', 'freq']) + expected = pd.Series( + [0, 0, np.nan, np.nan], + dtype=object, + index=["count", "unique", "top", "freq"], + ) tm.assert_series_equal(result, expected) result = s[:0].describe() @@ -65,61 +80,67 @@ def test_describe_with_tz(self, tz_naive_fixture): s = Series(date_range(start, end, tz=tz), name=name) result = s.describe() expected = Series( - [5, 5, s.value_counts().index[0], 1, start.tz_localize(tz), - end.tz_localize(tz) - ], + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + 
end.tz_localize(tz), + ], name=name, - index=['count', 'unique', 'top', 'freq', 'first', 'last'] + index=["count", "unique", "top", "freq", "first", "last"], ) tm.assert_series_equal(result, expected) def test_argsort(self, datetime_series): - self._check_accum_op('argsort', datetime_series, check_dtype=False) + self._check_accum_op("argsort", datetime_series, check_dtype=False) argsorted = datetime_series.argsort() assert issubclass(argsorted.dtype.type, np.integer) # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)]) - assert s.dtype == 'datetime64[ns]' + s = Series([Timestamp("201301%02d" % (i + 1)) for i in range(5)]) + assert s.dtype == "datetime64[ns]" shifted = s.shift(-1) - assert shifted.dtype == 'datetime64[ns]' + assert shifted.dtype == "datetime64[ns]" assert isna(shifted[4]) result = s.argsort() - expected = Series(range(5), dtype='int64') + expected = Series(range(5), dtype="int64") assert_series_equal(result, expected) result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype='int64') + expected = Series(list(range(4)) + [-1], dtype="int64") assert_series_equal(result, expected) def test_argsort_stable(self): s = Series(np.random.randint(0, 100, size=10000)) - mindexer = s.argsort(kind='mergesort') + mindexer = s.argsort(kind="mergesort") qindexer = s.argsort() - mexpected = np.argsort(s.values, kind='mergesort') - qexpected = np.argsort(s.values, kind='quicksort') + mexpected = np.argsort(s.values, kind="mergesort") + qexpected = np.argsort(s.values, kind="quicksort") - tm.assert_series_equal(mindexer, Series(mexpected), - check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), - check_dtype=False) - msg = (r"ndarray Expected type ," - r" found instead") + tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) + tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + msg = ( + r"ndarray Expected type ," + r" found instead" + ) with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(qindexer, mindexer) def test_cumsum(self, datetime_series): - self._check_accum_op('cumsum', datetime_series) + self._check_accum_op("cumsum", datetime_series) def test_cumprod(self, datetime_series): - self._check_accum_op('cumprod', datetime_series) + self._check_accum_op("cumprod", datetime_series) def test_cummin(self, datetime_series): - tm.assert_numpy_array_equal(datetime_series.cummin().values, - np.minimum - .accumulate(np.array(datetime_series))) + tm.assert_numpy_array_equal( + datetime_series.cummin().values, + np.minimum.accumulate(np.array(datetime_series)), + ) ts = datetime_series.copy() ts[::2] = np.NaN result = ts.cummin()[1::2] @@ -128,9 +149,10 @@ def test_cummin(self, datetime_series): tm.assert_series_equal(result, expected) def test_cummax(self, datetime_series): - tm.assert_numpy_array_equal(datetime_series.cummax().values, - np.maximum - .accumulate(np.array(datetime_series))) + tm.assert_numpy_array_equal( + datetime_series.cummax().values, + np.maximum.accumulate(np.array(datetime_series)), + ) ts = datetime_series.copy() ts[::2] = np.NaN result = ts.cummax()[1::2] @@ -139,90 +161,75 @@ def test_cummax(self, datetime_series): tm.assert_series_equal(result, expected) def test_cummin_datetime64(self): - s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1', - 'NaT', '2000-1-3'])) + s = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) + ) - expected = 
pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', - '2000-1-1', 'NaT', '2000-1-1'])) + expected = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"]) + ) result = s.cummin(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_datetime( - ['NaT', '2000-1-2', '2000-1-2', '2000-1-1', '2000-1-1', '2000-1-1' - ])) + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] + ) + ) result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) def test_cummax_datetime64(self): - s = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', '2000-1-1', - 'NaT', '2000-1-3'])) + s = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) + ) - expected = pd.Series(pd.to_datetime(['NaT', '2000-1-2', 'NaT', - '2000-1-2', 'NaT', '2000-1-3'])) + expected = pd.Series( + pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"]) + ) result = s.cummax(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_datetime( - ['NaT', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-2', '2000-1-3' - ])) + expected = pd.Series( + pd.to_datetime( + ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] + ) + ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) def test_cummin_timedelta64(self): - s = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '3 min', ])) - - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '1 min', ])) + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "1 min"]) + ) result = s.cummin(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - '2 min', - '1 min', - '1 min', - '1 min', ])) + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "1 min", "1 min", "1 min"]) + ) result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) def test_cummax_timedelta64(self): - s = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '1 min', - 'NaT', - '3 min', ])) - - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - 'NaT', - '2 min', - 'NaT', - '3 min', ])) + s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) + + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "NaT", "2 min", "NaT", "3 min"]) + ) result = s.cummax(skipna=True) tm.assert_series_equal(expected, result) - expected = pd.Series(pd.to_timedelta(['NaT', - '2 min', - '2 min', - '2 min', - '2 min', - '3 min', ])) + expected = pd.Series( + pd.to_timedelta(["NaT", "2 min", "2 min", "2 min", "2 min", "3 min"]) + ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) def test_npdiff(self): - pytest.skip("skipping due to Series no longer being an " - "ndarray") + pytest.skip("skipping due to Series no longer being an " "ndarray") # no longer works as the return type of np.diff is now nd.array s = Series(np.arange(5)) @@ -232,9 +239,11 @@ def test_npdiff(self): def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) - tm.assert_numpy_array_equal(func(datetime_series_).values, - func(np.array(datetime_series_)), - check_dtype=check_dtype) + tm.assert_numpy_array_equal( + func(datetime_series_).values, + func(np.array(datetime_series_)), + check_dtype=check_dtype, + 
) # with missing values ts = datetime_series_.copy() @@ -243,25 +252,20 @@ def _check_accum_op(self, name, datetime_series_, check_dtype=True): result = func(ts)[1::2] expected = func(np.array(ts.dropna())) - tm.assert_numpy_array_equal(result.values, expected, - check_dtype=False) + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) def test_compress(self): cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], - index=list('abcde'), name='foo') - expected = Series(s.values.compress(cond), - index=list('ac'), name='foo') + s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") + expected = Series(s.values.compress(cond), index=list("ac"), name="foo") with tm.assert_produces_warning(FutureWarning): result = s.compress(cond) tm.assert_series_equal(result, expected) def test_numpy_compress(self): cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], - index=list('abcde'), name='foo') - expected = Series(s.values.compress(cond), - index=list('ac'), name='foo') + s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") + expected = Series(s.values.compress(cond), index=list("ac"), name="foo") with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(np.compress(cond, s), expected) @@ -277,8 +281,9 @@ def test_numpy_compress(self): def test_round(self, datetime_series): datetime_series.index.name = "index_name" result = datetime_series.round(2) - expected = Series(np.round(datetime_series.values, 2), - index=datetime_series.index, name='ts') + expected = Series( + np.round(datetime_series.values, 2), index=datetime_series.index, name="ts" + ) assert_series_equal(result, expected) assert result.name == datetime_series.name @@ -286,7 +291,7 @@ def test_numpy_round(self): # See gh-12600 s = Series([1.53, 1.36, 0.06]) out = np.round(s, decimals=0) - expected = Series([2., 1., 0.]) + expected = Series([2.0, 1.0, 0.0]) assert_series_equal(out, expected) msg = "the 'out' parameter is not supported" @@ -298,13 +303,13 @@ def test_numpy_round_nan(self): s = Series([1.53, np.nan, 0.06]) with tm.assert_produces_warning(None): result = s.round() - expected = Series([2., np.nan, 0.]) + expected = Series([2.0, np.nan, 0.0]) assert_series_equal(result, expected) def test_built_in_round(self): s = Series([1.123, 2.123, 3.123], index=range(3)) result = round(s) - expected_rounded0 = Series([1., 2., 3.], index=range(3)) + expected_rounded0 = Series([1.0, 2.0, 3.0], index=range(3)) tm.assert_series_equal(result, expected_rounded0) decimals = 2 @@ -313,7 +318,7 @@ def test_built_in_round(self): tm.assert_series_equal(result, expected_rounded) def test_prod_numpy16_bug(self): - s = Series([1., 1., 1.], index=range(3)) + s = Series([1.0, 1.0, 1.0], index=range(3)) result = s.prod() assert not isinstance(result, Series) @@ -326,11 +331,9 @@ def test_corr(self, datetime_series): tm.assert_almost_equal(datetime_series.corr(datetime_series), 1) # partial overlap - tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), - 1) + tm.assert_almost_equal(datetime_series[:15].corr(datetime_series[5:]), 1) - assert isna(datetime_series[:15].corr(datetime_series[5:], - min_periods=12)) + assert isna(datetime_series[:15].corr(datetime_series[5:], min_periods=12)) ts1 = datetime_series[:15].reindex(datetime_series.index) ts2 = datetime_series[5:].reindex(datetime_series.index) @@ -358,75 +361,100 @@ def test_corr_rank(self): A = tm.makeTimeSeries() B = tm.makeTimeSeries() A[-5:] = A[:5] - result = A.corr(B, 
method='kendall') + result = A.corr(B, method="kendall") expected = stats.kendalltau(A, B)[0] tm.assert_almost_equal(result, expected) - result = A.corr(B, method='spearman') + result = A.corr(B, method="spearman") expected = stats.spearmanr(A, B)[0] tm.assert_almost_equal(result, expected) # results from R A = Series( - [-0.89926396, 0.94209606, -1.03289164, -0.95445587, 0.76910310, - - 0.06430576, -2.09704447, 0.40660407, -0.89926396, 0.94209606]) + [ + -0.89926396, + 0.94209606, + -1.03289164, + -0.95445587, + 0.76910310, + -0.06430576, + -2.09704447, + 0.40660407, + -0.89926396, + 0.94209606, + ] + ) B = Series( - [-1.01270225, -0.62210117, -1.56895827, 0.59592943, -0.01680292, - 1.17258718, -1.06009347, -0.10222060, -0.89076239, 0.89372375]) + [ + -1.01270225, + -0.62210117, + -1.56895827, + 0.59592943, + -0.01680292, + 1.17258718, + -1.06009347, + -0.10222060, + -0.89076239, + 0.89372375, + ] + ) kexp = 0.4319297 sexp = 0.5853767 - tm.assert_almost_equal(A.corr(B, method='kendall'), kexp) - tm.assert_almost_equal(A.corr(B, method='spearman'), sexp) + tm.assert_almost_equal(A.corr(B, method="kendall"), kexp) + tm.assert_almost_equal(A.corr(B, method="spearman"), sexp) def test_corr_invalid_method(self): # GH PR #22298 s1 = pd.Series(np.random.randn(10)) s2 = pd.Series(np.random.randn(10)) - msg = ("method must be either 'pearson', " - "'spearman', 'kendall', or a callable, ") + msg = ( + "method must be either 'pearson', " "'spearman', 'kendall', or a callable, " + ) with pytest.raises(ValueError, match=msg): s1.corr(s2, method="____") def test_corr_callable_method(self, datetime_series): # simple correlation example # returns 1 if exact equality, 0 otherwise - my_corr = lambda a, b: 1. if (a == b).all() else 0. + my_corr = lambda a, b: 1.0 if (a == b).all() else 0.0 # simple example s1 = Series([1, 2, 3, 4, 5]) s2 = Series([5, 4, 3, 2, 1]) expected = 0 - tm.assert_almost_equal( - s1.corr(s2, method=my_corr), - expected) + tm.assert_almost_equal(s1.corr(s2, method=my_corr), expected) # full overlap - tm.assert_almost_equal(datetime_series.corr( - datetime_series, method=my_corr), 1.) + tm.assert_almost_equal( + datetime_series.corr(datetime_series, method=my_corr), 1.0 + ) # partial overlap - tm.assert_almost_equal(datetime_series[:15].corr( - datetime_series[5:], method=my_corr), 1.) 
+ tm.assert_almost_equal( + datetime_series[:15].corr(datetime_series[5:], method=my_corr), 1.0 + ) # No overlap - assert np.isnan(datetime_series[::2].corr( - datetime_series[1::2], method=my_corr)) + assert np.isnan( + datetime_series[::2].corr(datetime_series[1::2], method=my_corr) + ) # dataframe example df = pd.DataFrame([s1, s2]) - expected = pd.DataFrame([ - {0: 1., 1: 0}, {0: 0, 1: 1.}]) - tm.assert_almost_equal( - df.transpose().corr(method=my_corr), expected) + expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) + tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) def test_cov(self, datetime_series): # full overlap - tm.assert_almost_equal(datetime_series.cov(datetime_series), - datetime_series.std() ** 2) + tm.assert_almost_equal( + datetime_series.cov(datetime_series), datetime_series.std() ** 2 + ) # partial overlap - tm.assert_almost_equal(datetime_series[:15].cov(datetime_series[5:]), - datetime_series[5:15].std() ** 2) + tm.assert_almost_equal( + datetime_series[:15].cov(datetime_series[5:]), + datetime_series[5:15].std() ** 2, + ) # No overlap assert np.isnan(datetime_series[::2].cov(datetime_series[1::2])) @@ -437,8 +465,7 @@ def test_cov(self, datetime_series): assert isna(cp.cov(cp)) # min_periods - assert isna(datetime_series[:15].cov(datetime_series[5:], - min_periods=12)) + assert isna(datetime_series[:15].cov(datetime_series[5:], min_periods=12)) ts1 = datetime_series[:15].reindex(datetime_series.index) ts2 = datetime_series[5:].reindex(datetime_series.index) @@ -451,7 +478,7 @@ def test_count(self, datetime_series): assert datetime_series.count() == np.isfinite(datetime_series).sum() - mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]]) + mi = MultiIndex.from_arrays([list("aabbcc"), [1, 2, 2, nan, 1, 2]]) ts = Series(np.arange(len(mi)), index=mi) left = ts.count(level=1) @@ -462,12 +489,13 @@ def test_count(self, datetime_series): assert_series_equal(ts.count(level=1), right - 1) def test_dot(self): - a = Series(np.random.randn(4), index=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], - columns=['p', 'q', 'r', 's']).T + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T result = a.dot(b) - expected = Series(np.dot(a.values, b.values), index=['1', '2', '3']) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # Check index alignment @@ -478,11 +506,11 @@ def test_dot(self): # Check ndarray argument result = a.dot(b.values) assert np.all(result == expected.values) - assert_almost_equal(a.dot(b['2'].values), expected['2']) + assert_almost_equal(a.dot(b["2"].values), expected["2"]) # Check series argument - assert_almost_equal(a.dot(b['1']), expected['1']) - assert_almost_equal(a.dot(b2['1']), expected['1']) + assert_almost_equal(a.dot(b["1"]), expected["1"]) + assert_almost_equal(a.dot(b2["1"]), expected["1"]) msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" # exception raised is of type Exception @@ -494,19 +522,19 @@ def test_dot(self): def test_matmul(self): # matmul test is for GH #10259 - a = Series(np.random.randn(4), index=['p', 'q', 'r', 's']) - b = DataFrame(np.random.randn(3, 4), index=['1', '2', '3'], - columns=['p', 'q', 'r', 's']).T + a = Series(np.random.randn(4), index=["p", "q", "r", "s"]) + b = DataFrame( + np.random.randn(3, 4), index=["1", "2", "3"], columns=["p", "q", "r", "s"] + ).T # Series @ DataFrame 
-> Series result = operator.matmul(a, b) - expected = Series(np.dot(a.values, b.values), index=['1', '2', '3']) + expected = Series(np.dot(a.values, b.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # DataFrame @ Series -> Series result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # Series @ Series -> scalar @@ -539,17 +567,15 @@ def test_matmul(self): assert_almost_equal(result, expected) # mixed dtype DataFrame @ Series - a['p'] = int(a.p) + a["p"] = int(a.p) result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) # different dtypes DataFrame @ Series a = a.astype(int) result = operator.matmul(b.T, a) - expected = Series(np.dot(b.T.values, a.T.values), - index=['1', '2', '3']) + expected = Series(np.dot(b.T.values, a.T.values), index=["1", "2", "3"]) assert_series_equal(result, expected) msg = r"Dot product shape mismatch, \(4,\) vs \(3,\)" @@ -578,9 +604,11 @@ def test_clip(self, datetime_series): def test_clip_types_and_nulls(self): - sers = [Series([np.nan, 1.0, 2.0, 3.0]), Series([None, 'a', 'b', 'c']), - Series(pd.to_datetime( - [np.nan, 1, 2, 3], unit='D'))] + sers = [ + Series([np.nan, 1.0, 2.0, 3.0]), + Series([None, "a", "b", "c"]), + Series(pd.to_datetime([np.nan, 1, 2, 3], unit="D")), + ] for s in sers: thresh = s[2] @@ -599,14 +627,11 @@ def test_clip_with_na_args(self): s = Series([1, 2, 3]) assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - assert_series_equal(s.clip(upper=np.nan, lower=np.nan), - Series([1, 2, 3])) + assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) # GH #19992 - assert_series_equal(s.clip(lower=[0, 4, np.nan]), - Series([1, 4, np.nan])) - assert_series_equal(s.clip(upper=[1, np.nan, 1]), - Series([1, np.nan, 1])) + assert_series_equal(s.clip(lower=[0, 4, np.nan]), Series([1, 4, np.nan])) + assert_series_equal(s.clip(upper=[1, np.nan, 1]), Series([1, np.nan, 1])) def test_clip_against_series(self): # GH #6966 @@ -615,11 +640,9 @@ def test_clip_against_series(self): threshold = Series([1.0, 2.0, 3.0]) with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.clip_lower(threshold), - Series([1.0, 2.0, 4.0])) + assert_series_equal(s.clip_lower(threshold), Series([1.0, 2.0, 4.0])) with tm.assert_produces_warning(FutureWarning): - assert_series_equal(s.clip_upper(threshold), - Series([1.0, 1.0, 3.0])) + assert_series_equal(s.clip_upper(threshold), Series([1.0, 1.0, 3.0])) lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) @@ -644,20 +667,28 @@ def test_clip_with_datetimes(self): # GH 11838 # naive and tz-aware datetimes - t = Timestamp('2015-12-01 09:30:30') - s = Series([Timestamp('2015-12-01 09:30:00'), - Timestamp('2015-12-01 09:31:00')]) + t = Timestamp("2015-12-01 09:30:30") + s = Series([Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:31:00")]) result = s.clip(upper=t) - expected = Series([Timestamp('2015-12-01 09:30:00'), - Timestamp('2015-12-01 09:30:30')]) + expected = Series( + [Timestamp("2015-12-01 09:30:00"), Timestamp("2015-12-01 09:30:30")] + ) assert_series_equal(result, expected) - t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern') - s = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'), - Timestamp('2015-12-01 
09:31:00', tz='US/Eastern')]) + t = Timestamp("2015-12-01 09:30:30", tz="US/Eastern") + s = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:31:00", tz="US/Eastern"), + ] + ) result = s.clip(upper=t) - expected = Series([Timestamp('2015-12-01 09:30:00', tz='US/Eastern'), - Timestamp('2015-12-01 09:30:30', tz='US/Eastern')]) + expected = Series( + [ + Timestamp("2015-12-01 09:30:00", tz="US/Eastern"), + Timestamp("2015-12-01 09:30:30", tz="US/Eastern"), + ] + ) assert_series_equal(result, expected) def test_cummethods_bool(self): @@ -667,10 +698,12 @@ def test_cummethods_bool(self): b = ~a c = pd.Series([False] * len(b)) d = ~c - methods = {'cumsum': np.cumsum, - 'cumprod': np.cumprod, - 'cummin': np.minimum.accumulate, - 'cummax': np.maximum.accumulate} + methods = { + "cumsum": np.cumsum, + "cumprod": np.cumprod, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + } args = product((a, b, c, d), methods) for s, method in args: expected = Series(methods[method](s.values)) @@ -682,19 +715,16 @@ def test_cummethods_bool(self): cpe = pd.Series([False, 0, nan, 0]) cmin = pd.Series([False, False, nan, False]) cmax = pd.Series([False, True, nan, True]) - expecteds = {'cumsum': cse, - 'cumprod': cpe, - 'cummin': cmin, - 'cummax': cmax} + expecteds = {"cumsum": cse, "cumprod": cpe, "cummin": cmin, "cummax": cmax} for method in methods: res = getattr(e, method)() assert_series_equal(res, expecteds[method]) def test_isin(self): - s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) - result = s.isin(['A', 'C']) + result = s.isin(["A", "C"]) expected = Series([True, False, True, False, False, False, True, True]) assert_series_equal(result, expected) @@ -702,25 +732,26 @@ def test_isin(self): # This specific issue has to have a series over 1e6 in len, but the # comparison array (in_list) must be large enough so that numpy doesn't # do a manual masking trick that will avoid this issue altogether - s = Series(list('abcdefghijk' * 10 ** 5)) + s = Series(list("abcdefghijk" * 10 ** 5)) # If numpy doesn't do the manual comparison/mask, these # unorderable mixed types are what cause the exception in numpy - in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', - 'K', 'E', 'S', 'I', 'R', 'R'] * 6 + in_list = [-1, "a", "b", "G", "Y", "Z", "E", "K", "E", "S", "I", "R", "R"] * 6 assert s.isin(in_list).sum() == 200000 def test_isin_with_string_scalar(self): # GH4763 - s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) - msg = (r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]") + s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[str\]" + ) with pytest.raises(TypeError, match=msg): - s.isin('a') + s.isin("a") - s = Series(['aaa', 'b', 'c']) + s = Series(["aaa", "b", "c"]) with pytest.raises(TypeError, match=msg): - s.isin('aaa') + s.isin("aaa") def test_isin_with_i8(self): # GH 5021 @@ -729,7 +760,7 @@ def test_isin_with_i8(self): expected2 = Series([False, True, False, False, False]) # datetime64[ns] - s = Series(date_range('jan-01-2013', 'jan-05-2013')) + s = Series(date_range("jan-01-2013", "jan-05-2013")) result = s.isin(s[0:2]) assert_series_equal(result, expected) @@ -738,7 +769,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # fails on dtype conversion in the first place - result = s.isin(s[0:2].values.astype('datetime64[D]')) + result = 
s.isin(s[0:2].values.astype("datetime64[D]")) assert_series_equal(result, expected) result = s.isin([s[1]]) @@ -751,7 +782,7 @@ def test_isin_with_i8(self): assert_series_equal(result, expected) # timedelta64[ns] - s = Series(pd.to_timedelta(range(5), unit='d')) + s = Series(pd.to_timedelta(range(5), unit="d")) result = s.isin(s[0:2]) assert_series_equal(result, expected) @@ -778,39 +809,35 @@ def test_ptp(self): assert s.ptp() == 13 assert pd.isna(s.ptp(skipna=False)) - mi = pd.MultiIndex.from_product([['a', 'b'], [1, 2, 3]]) + mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]]) s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi) - expected = pd.Series([6, 2], index=['a', 'b'], dtype=np.float64) + expected = pd.Series([6, 2], index=["a", "b"], dtype=np.float64) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0), expected) - expected = pd.Series([np.nan, np.nan], index=['a', 'b']) + expected = pd.Series([np.nan, np.nan], index=["a", "b"]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp(axis=1) - s = pd.Series(['a', 'b', 'c', 'd', 'e']) + s = pd.Series(["a", "b", "c", "d", "e"]) msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp() msg = r"Series\.ptp does not implement numeric_only\." 
with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s.ptp(numeric_only=True) def test_repeat(self): - s = Series(np.random.randn(3), index=['a', 'b', 'c']) + s = Series(np.random.randn(3), index=["a", "b", "c"]) reps = s.repeat(5) exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) @@ -818,14 +845,12 @@ def test_repeat(self): to_rep = [2, 3, 4] reps = s.repeat(to_rep) - exp = Series(s.values.repeat(to_rep), - index=s.index.values.repeat(to_rep)) + exp = Series(s.values.repeat(to_rep), index=s.index.values.repeat(to_rep)) assert_series_equal(reps, exp) def test_numpy_repeat(self): - s = Series(np.arange(3), name='x') - expected = Series(s.values.repeat(2), name='x', - index=s.index.values.repeat(2)) + s = Series(np.arange(3), name="x") + expected = Series(s.values.repeat(2), name="x", index=s.index.values.repeat(2)) assert_series_equal(np.repeat(s, 2), expected) msg = "the 'axis' parameter is not supported" @@ -835,11 +860,11 @@ def test_numpy_repeat(self): def test_searchsorted(self): s = Series([1, 2, 3]) - result = s.searchsorted(1, side='left') + result = s.searchsorted(1, side="left") assert is_scalar(result) assert result == 0 - result = s.searchsorted(1, side='right') + result = s.searchsorted(1, side="right") assert is_scalar(result) assert result == 1 @@ -860,15 +885,15 @@ def test_searchsorted_numeric_dtypes_vector(self): tm.assert_numpy_array_equal(r, e) def test_search_sorted_datetime64_scalar(self): - s = Series(pd.date_range('20120101', periods=10, freq='2D')) - v = pd.Timestamp('20120102') + s = Series(pd.date_range("20120101", periods=10, freq="2D")) + v = pd.Timestamp("20120102") r = s.searchsorted(v) assert is_scalar(r) assert r == 1 def test_search_sorted_datetime64_list(self): - s = Series(pd.date_range('20120101', periods=10, freq='2D')) - v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')] + s = Series(pd.date_range("20120101", periods=10, freq="2D")) + v = [pd.Timestamp("20120102"), pd.Timestamp("20120104")] r = s.searchsorted(v) e = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) @@ -890,7 +915,7 @@ def test_is_monotonic(self): s = Series(np.arange(1000, 0, -1)) assert s.is_monotonic_decreasing is True - s = Series(pd.date_range('20130101', periods=10)) + s = Series(pd.date_range("20130101", periods=10)) assert s.is_monotonic is True assert s.is_monotonic_increasing is True s = Series(list(reversed(s.tolist()))) @@ -898,38 +923,36 @@ def test_is_monotonic(self): assert s.is_monotonic_decreasing is True def test_sort_index_level(self): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] - res = s.sort_index(level='A') + res = s.sort_index(level="A") assert_series_equal(backwards, res) - res = s.sort_index(level=['A', 'B']) + res = s.sort_index(level=["A", "B"]) assert_series_equal(backwards, res) - res = s.sort_index(level='A', sort_remaining=False) + res = s.sort_index(level="A", sort_remaining=False) assert_series_equal(s, res) - res = s.sort_index(level=['A', 'B'], sort_remaining=False) + res = s.sort_index(level=["A", "B"], sort_remaining=False) assert_series_equal(s, res) def test_apply_categorical(self): - values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'), - ordered=True) - s = pd.Series(values, name='XX', 
index=list('abcdefg')) + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = pd.Series(values, name="XX", index=list("abcdefg")) result = s.apply(lambda x: x.lower()) # should be categorical dtype when the number of categories are # the same - values = pd.Categorical(list('abbabcd'), categories=list('dcba'), - ordered=True) - exp = pd.Series(values, name='XX', index=list('abcdefg')) + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = pd.Series(values, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) tm.assert_categorical_equal(result.values, exp.values) - result = s.apply(lambda x: 'A') - exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg')) + result = s.apply(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) assert result.dtype == np.object @@ -941,7 +964,7 @@ def test_shift_int(self, datetime_series): def test_shift_categorical(self): # GH 9416 - s = pd.Series(['a', 'b', 'c', 'd'], dtype='category') + s = pd.Series(["a", "b", "c", "d"], dtype="category") assert_series_equal(s.iloc[:-1], s.shift(1).shift(-1).dropna()) @@ -961,29 +984,35 @@ def test_shift_categorical(self): def test_unstack(self): from numpy import nan - index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], - codes=[[1, 1, 0, 0], [0, 1, 0, 2]]) + index = MultiIndex( + levels=[["bar", "foo"], ["one", "three", "two"]], + codes=[[1, 1, 0, 0], [0, 1, 0, 2]], + ) - s = Series(np.arange(4.), index=index) + s = Series(np.arange(4.0), index=index) unstacked = s.unstack() - expected = DataFrame([[2., nan, 3.], [0., 1., nan]], - index=['bar', 'foo'], - columns=['one', 'three', 'two']) + expected = DataFrame( + [[2.0, nan, 3.0], [0.0, 1.0, nan]], + index=["bar", "foo"], + columns=["one", "three", "two"], + ) assert_frame_equal(unstacked, expected) unstacked = s.unstack(level=0) assert_frame_equal(unstacked, expected.T) - index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], - codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], - [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[["bar"], ["one", "two", "three"], [0, 1]], + codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) s = Series(np.random.randn(6), index=index) - exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], - codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]]) - expected = DataFrame({'bar': s.values}, - index=exp_index).sort_index(level=0) + exp_index = MultiIndex( + levels=[["one", "two", "three"], [0, 1]], + codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], + ) + expected = DataFrame({"bar": s.values}, index=exp_index).sort_index(level=0) unstacked = s.unstack(0).sort_index() assert_frame_equal(unstacked, expected) @@ -991,87 +1020,97 @@ def test_unstack(self): idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1, 2], index=idx) left = ts.unstack() - right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], - columns=[nan, 3.5]) + right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5]) assert_frame_equal(left, right) - idx = pd.MultiIndex.from_arrays([['cat', 'cat', 'cat', 'dog', 'dog' - ], ['a', 'a', 'b', 'a', 'b'], - [1, 2, 1, 1, np.nan]]) + idx = pd.MultiIndex.from_arrays( + [ + ["cat", "cat", "cat", "dog", "dog"], + ["a", "a", "b", "a", "b"], + [1, 2, 1, 1, np.nan], + ] + ) ts = pd.Series([1.0, 1.1, 1.2, 1.3, 1.4], index=idx) - right = DataFrame([[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], - 
columns=['cat', 'dog']) - tpls = [('a', 1), ('a', 2), ('b', nan), ('b', 1)] + right = DataFrame( + [[1.0, 1.3], [1.1, nan], [nan, 1.4], [1.2, nan]], columns=["cat", "dog"] + ) + tpls = [("a", 1), ("a", 2), ("b", nan), ("b", 1)] right.index = pd.MultiIndex.from_tuples(tpls) assert_frame_equal(ts.unstack(level=0), right) def test_value_counts_datetime(self): # most dtypes are tested in test_base.py - values = [pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 10:00'), - pd.Timestamp('2011-01-01 11:00'), - pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 09:00'), - pd.Timestamp('2011-01-01 11:00')] - - exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00', - '2011-01-01 10:00']) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') - - s = pd.Series(values, name='xxx') + values = [ + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 10:00"), + pd.Timestamp("2011-01-01 11:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 09:00"), + pd.Timestamp("2011-01-01 11:00"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"] + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check DatetimeIndex outputs the same result - idx = pd.DatetimeIndex(values, name='xxx') + idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_datetime_tz(self): - values = [pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 10:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 11:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 09:00', tz='US/Eastern'), - pd.Timestamp('2011-01-01 11:00', tz='US/Eastern')] - - exp_idx = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 11:00', - '2011-01-01 10:00'], tz='US/Eastern') - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') - - s = pd.Series(values, name='xxx') + values = [ + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"), + pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"), + ] + + exp_idx = pd.DatetimeIndex( + ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"], + tz="US/Eastern", + ) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") + + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) - idx = pd.DatetimeIndex(values, name='xxx') + idx = pd.DatetimeIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_period(self): - values = [pd.Period('2011-01', freq='M'), - pd.Period('2011-02', freq='M'), - pd.Period('2011-03', freq='M'), - pd.Period('2011-01', freq='M'), - pd.Period('2011-01', freq='M'), - 
pd.Period('2011-03', freq='M')] + values = [ + pd.Period("2011-01", freq="M"), + pd.Period("2011-02", freq="M"), + pd.Period("2011-03", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-01", freq="M"), + pd.Period("2011-03", freq="M"), + ] - exp_idx = pd.PeriodIndex(['2011-01', '2011-03', '2011-02'], freq='M') - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.PeriodIndex(["2011-01", "2011-03", "2011-02"], freq="M") + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check DatetimeIndex outputs the same result - idx = pd.PeriodIndex(values, name='xxx') + idx = pd.PeriodIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @@ -1079,72 +1118,71 @@ def test_value_counts_categorical_ordered(self): # most dtypes are tested in test_base.py values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=True) - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], - ordered=True) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=True) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name='xxx') + idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) def test_value_counts_categorical_not_ordered(self): values = pd.Categorical([1, 2, 3, 1, 1, 3], ordered=False) - exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], - ordered=False) - exp = pd.Series([3, 2, 1], index=exp_idx, name='xxx') + exp_idx = pd.CategoricalIndex([1, 3, 2], categories=[1, 2, 3], ordered=False) + exp = pd.Series([3, 2, 1], index=exp_idx, name="xxx") - s = pd.Series(values, name='xxx') + s = pd.Series(values, name="xxx") tm.assert_series_equal(s.value_counts(), exp) # check CategoricalIndex outputs the same result - idx = pd.CategoricalIndex(values, name='xxx') + idx = pd.CategoricalIndex(values, name="xxx") tm.assert_series_equal(idx.value_counts(), exp) # normalize - exp = pd.Series(np.array([3., 2., 1]) / 6., - index=exp_idx, name='xxx') + exp = pd.Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="xxx") tm.assert_series_equal(s.value_counts(normalize=True), exp) tm.assert_series_equal(idx.value_counts(normalize=True), exp) @pytest.mark.parametrize("func", [np.any, np.all]) - @pytest.mark.parametrize("kwargs", [ - dict(keepdims=True), - dict(out=object()), - ]) + @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) @td.skip_if_np_lt("1.15") def test_validate_any_all_out_keepdims_raises(self, kwargs, func): s = pd.Series([1, 2]) param = list(kwargs)[0] name = func.__name__ - msg = (r"the '{arg}' parameter is not " - r"supported in 
the pandas " - r"implementation of {fname}\(\)").format(arg=param, fname=name) + msg = ( + r"the '{arg}' parameter is not " + r"supported in the pandas " + r"implementation of {fname}\(\)" + ).format(arg=param, fname=name) with pytest.raises(ValueError, match=msg): func(s, **kwargs) @td.skip_if_np_lt("1.15") def test_validate_sum_initial(self): s = pd.Series([1, 2]) - msg = (r"the 'initial' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)") + msg = ( + r"the 'initial' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) with pytest.raises(ValueError, match=msg): np.sum(s, initial=10) def test_validate_median_initial(self): s = pd.Series([1, 2]) - msg = (r"the 'overwrite_input' parameter is not " - r"supported in the pandas " - r"implementation of median\(\)") + msg = ( + r"the 'overwrite_input' parameter is not " + r"supported in the pandas " + r"implementation of median\(\)" + ) with pytest.raises(ValueError, match=msg): # It seems like np.median doesn't dispatch, so we use the # method instead of the ufunc. @@ -1153,36 +1191,38 @@ def test_validate_median_initial(self): @td.skip_if_np_lt("1.15") def test_validate_stat_keepdims(self): s = pd.Series([1, 2]) - msg = (r"the 'keepdims' parameter is not " - r"supported in the pandas " - r"implementation of sum\(\)") + msg = ( + r"the 'keepdims' parameter is not " + r"supported in the pandas " + r"implementation of sum\(\)" + ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) def test_compound_deprecated(self): - s = Series([.1, .2, .3, .4]) + s = Series([0.1, 0.2, 0.3, 0.4]) with tm.assert_produces_warning(FutureWarning): s.compound() - df = pd.DataFrame({'s': s}) + df = pd.DataFrame({"s": s}) with tm.assert_produces_warning(FutureWarning): df.compound() main_dtypes = [ - 'datetime', - 'datetimetz', - 'timedelta', - 'int8', - 'int16', - 'int32', - 'int64', - 'float32', - 'float64', - 'uint8', - 'uint16', - 'uint32', - 'uint64' + "datetime", + "datetimetz", + "timedelta", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", ] @@ -1199,19 +1239,27 @@ def s_main_dtypes(): The columns are the name of the dtype. 
""" df = pd.DataFrame( - {'datetime': pd.to_datetime(['2003', '2002', - '2001', '2002', - '2005']), - 'datetimetz': pd.to_datetime( - ['2003', '2002', - '2001', '2002', - '2005']).tz_localize('US/Eastern'), - 'timedelta': pd.to_timedelta(['3d', '2d', '1d', - '2d', '5d'])}) - - for dtype in ['int8', 'int16', 'int32', 'int64', - 'float32', 'float64', - 'uint8', 'uint16', 'uint32', 'uint64']: + { + "datetime": pd.to_datetime(["2003", "2002", "2001", "2002", "2005"]), + "datetimetz": pd.to_datetime( + ["2003", "2002", "2001", "2002", "2005"] + ).tz_localize("US/Eastern"), + "timedelta": pd.to_timedelta(["3d", "2d", "1d", "2d", "5d"]), + } + ) + + for dtype in [ + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "uint8", + "uint16", + "uint32", + "uint64", + ]: df[dtype] = Series([3, 2, 1, 2, 5], dtype=dtype) return df @@ -1227,25 +1275,27 @@ def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) result = getattr(s, method)(3) - expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1] + expected_idxr = [0, 1, 2] if method == "nsmallest" else [3, 2, 1] expected = s.loc[expected_idxr] tm.assert_series_equal(result, expected) class TestNLargestNSmallest: - @pytest.mark.parametrize( - "r", [Series([3., 2, 1, 2, '5'], dtype='object'), - Series([3., 2, 1, 2, 5], dtype='object'), - # not supported on some archs - # Series([3., 2, 1, 2, 5], dtype='complex256'), - Series([3., 2, 1, 2, 5], dtype='complex128'), - Series(list('abcde')), - Series(list('abcde'), dtype='category')]) + "r", + [ + Series([3.0, 2, 1, 2, "5"], dtype="object"), + Series([3.0, 2, 1, 2, 5], dtype="object"), + # not supported on some archs + # Series([3., 2, 1, 2, 5], dtype='complex256'), + Series([3.0, 2, 1, 2, 5], dtype="complex128"), + Series(list("abcde")), + Series(list("abcde"), dtype="category"), + ], + ) def test_error(self, r): dt = r.dtype - msg = ("Cannot use method 'n(larg|small)est' with " - "dtype {dt}".format(dt=dt)) + msg = "Cannot use method 'n(larg|small)est' with " "dtype {dt}".format(dt=dt) args = 2, len(r), 0, -1 methods = r.nlargest, r.nsmallest for method, arg in product(methods, args): @@ -1258,7 +1308,7 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split): s = s_main_dtypes_split assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) + assert_series_equal(s.nsmallest(2, keep="last"), s.iloc[[2, 3]]) empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) @@ -1269,20 +1319,19 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split): assert_series_equal(s.nsmallest(len(s)), s.sort_values()) assert_series_equal(s.nsmallest(len(s) + 1), s.sort_values()) assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) - assert_series_equal(s.nlargest(len(s) + 1), - s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) def test_misc(self): - s = Series([3., np.nan, 1, 2, 5]) + s = Series([3.0, np.nan, 1, 2, 5]) assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) msg = 'keep must be either "first", "last"' with pytest.raises(ValueError, match=msg): - s.nsmallest(keep='invalid') + s.nsmallest(keep="invalid") with pytest.raises(ValueError, match=msg): - s.nlargest(keep='invalid') + s.nlargest(keep="invalid") # GH 15297 s = Series([1] * 5, index=[1, 2, 3, 4, 5]) @@ -1292,16 +1341,16 @@ def test_misc(self): result = s.nsmallest(3) 
assert_series_equal(result, expected_first) - result = s.nsmallest(3, keep='last') + result = s.nsmallest(3, keep="last") assert_series_equal(result, expected_last) result = s.nlargest(3) assert_series_equal(result, expected_first) - result = s.nlargest(3, keep='last') + result = s.nlargest(3, keep="last") assert_series_equal(result, expected_last) - @pytest.mark.parametrize('n', range(1, 5)) + @pytest.mark.parametrize("n", range(1, 5)) def test_n(self, n): # GH 13412 @@ -1325,17 +1374,16 @@ def test_boundary_float(self, nselect_method, float_dtype): # GH 21426 dtype_info = np.finfo(float_dtype) min_val, max_val = dtype_info.min, dtype_info.max - min_2nd, max_2nd = np.nextafter( - [min_val, max_val], 0, dtype=float_dtype) + min_2nd, max_2nd = np.nextafter([min_val, max_val], 0, dtype=float_dtype) vals = [min_val, min_2nd, max_2nd, max_val] assert_check_nselect_boundary(vals, float_dtype, nselect_method) - @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]']) + @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_boundary_datetimelike(self, nselect_method, dtype): # GH 21426 # use int64 bounds and +1 to min_val since true minimum is NaT # (include min_val/NaT at end to maintain same expected_idxr) - dtype_info = np.iinfo('int64') + dtype_info = np.iinfo("int64") min_val, max_val = dtype_info.min, dtype_info.max vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val] assert_check_nselect_boundary(vals, dtype, nselect_method) @@ -1343,17 +1391,17 @@ def test_boundary_datetimelike(self, nselect_method, dtype): def test_duplicate_keep_all_ties(self): # see gh-16818 s = Series([10, 9, 8, 7, 7, 7, 7, 6]) - result = s.nlargest(4, keep='all') + result = s.nlargest(4, keep="all") expected = Series([10, 9, 8, 7, 7, 7, 7]) assert_series_equal(result, expected) - result = s.nsmallest(2, keep='all') + result = s.nsmallest(2, keep="all") expected = Series([6, 7, 7, 7, 7], index=[7, 3, 4, 5, 6]) assert_series_equal(result, expected) - @pytest.mark.parametrize('data,expected', - [([True, False], [True]), - ([True, False, True, True], [True])]) + @pytest.mark.parametrize( + "data,expected", [([True, False], [True]), ([True, False, True, True], [True])] + ) def test_boolean(self, data, expected): # GH 26154 : ensure True > False s = Series(data) @@ -1363,35 +1411,37 @@ def test_boolean(self, data, expected): class TestCategoricalSeriesAnalytics: - def test_count(self): - s = Series(Categorical([np.nan, 1, 2, np.nan], - categories=[5, 4, 3, 2, 1], ordered=True)) + s = Series( + Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + ) result = s.count() assert result == 2 def test_value_counts(self): # GH 12835 - cats = Categorical(list('abcccb'), categories=list('cabd')) - s = Series(cats, name='xxx') + cats = Categorical(list("abcccb"), categories=list("cabd")) + s = Series(cats, name="xxx") res = s.value_counts(sort=False) - exp_index = CategoricalIndex(list('cabd'), categories=cats.categories) - exp = Series([3, 1, 2, 0], name='xxx', index=exp_index) + exp_index = CategoricalIndex(list("cabd"), categories=cats.categories) + exp = Series([3, 1, 2, 0], name="xxx", index=exp_index) tm.assert_series_equal(res, exp) res = s.value_counts(sort=True) - exp_index = CategoricalIndex(list('cbad'), categories=cats.categories) - exp = Series([3, 2, 1, 0], name='xxx', index=exp_index) + exp_index = CategoricalIndex(list("cbad"), categories=cats.categories) + exp = Series([3, 2, 1, 0], name="xxx", index=exp_index) 
tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same # (tested in test_base.py) - s = Series(["a", "b", "c", "c", "c", "b"], name='xxx') + s = Series(["a", "b", "c", "c", "c", "b"], name="xxx") res = s.value_counts() - exp = Series([3, 2, 1], name='xxx', index=["c", "b", "a"]) + exp = Series([3, 2, 1], name="xxx", index=["c", "b", "a"]) tm.assert_series_equal(res, exp) def test_value_counts_with_nan(self): @@ -1410,8 +1460,9 @@ def test_value_counts_with_nan(self): # same Series via two different constructions --> same behaviour series = [ Series(["a", "b", None, "a", None, None], dtype="category"), - Series(Categorical(["a", "b", None, "a", None, None], - categories=["a", "b"])) + Series( + Categorical(["a", "b", None, "a", None, None], categories=["a", "b"]) + ), ] for s in series: @@ -1433,18 +1484,23 @@ def test_value_counts_with_nan(self): @pytest.mark.parametrize( "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", - pytest.param("datetime64[D]", - marks=pytest.mark.xfail(reason="GH#7996", strict=False))] + [ + "int_", + "uint", + "float_", + "unicode_", + "timedelta64[h]", + pytest.param( + "datetime64[D]", marks=pytest.mark.xfail(reason="GH#7996", strict=False) + ), + ], ) - def test_drop_duplicates_categorical_non_bool(self, dtype, - ordered_fixture): + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) # Test case 1 input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, - ordered=ordered_fixture)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) expected = Series([False, False, False, True]) tm.assert_series_equal(tc1.duplicated(), expected) @@ -1454,11 +1510,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep='last'), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep='last'), - tc1[~expected]) + tm.assert_series_equal(tc1.duplicated(keep="last"), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) sc = tc1.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, True]) @@ -1470,9 +1525,7 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical( - input2, categories=cat_array, ordered=ordered_fixture) - ) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) @@ -1482,11 +1535,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep='last'), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep='last'), - tc2[~expected]) + tm.assert_series_equal(tc2.duplicated(keep="last"), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) sc = tc2.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc2[~expected]) expected = 
Series([False, True, True, False, True, True, False]) @@ -1497,9 +1549,13 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, tm.assert_series_equal(sc, tc2[~expected]) def test_drop_duplicates_categorical_bool(self, ordered_fixture): - tc = Series(Categorical([True, False, True, False], - categories=[True, False], - ordered=ordered_fixture)) + tc = Series( + Categorical( + [True, False, True, False], + categories=[True, False], + ordered=ordered_fixture, + ) + ) expected = Series([False, False, True, True]) tm.assert_series_equal(tc.duplicated(), expected) @@ -1509,10 +1565,10 @@ def test_drop_duplicates_categorical_bool(self, ordered_fixture): tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, False, False]) - tm.assert_series_equal(tc.duplicated(keep='last'), expected) - tm.assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep='last', inplace=True) + sc.drop_duplicates(keep="last", inplace=True) tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, True, True]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 71b0a2d9d74eb..2097264ba5e78 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -7,8 +7,16 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, DatetimeIndex, Index, Series, TimedeltaIndex, - date_range, period_range, timedelta_range) + Categorical, + DataFrame, + DatetimeIndex, + Index, + Series, + TimedeltaIndex, + date_range, + period_range, + timedelta_range, +) from pandas.core.arrays import PeriodArray from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm @@ -26,6 +34,7 @@ class SharedWithSparse: In generic tests on this class, use ``self._assert_series_equal()`` which is implemented in sub-classes. 
""" + def _assert_series_equal(self, left, right): """Dispatch to series class dependent assertion""" raise NotImplementedError @@ -47,7 +56,7 @@ def test_copy_index_name_checking(self): assert self.ts is self.ts cp = self.ts.copy() - cp.index.name = 'foo' + cp.index.name = "foo" printing.pprint_thing(self.ts.index.name) assert self.ts.index.name is None @@ -67,14 +76,14 @@ def test_binop_maybe_preserve_name(self): # names don't match, don't preserve cp = self.ts.copy() - cp.name = 'something else' + cp.name = "something else" result = self.ts + cp assert result.name is None result = self.ts.add(cp) assert result.name is None - ops = ['add', 'sub', 'mul', 'div', 'truediv', 'floordiv', 'mod', 'pow'] - ops = ops + ['r' + op for op in ops] + ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] + ops = ops + ["r" + op for op in ops] for op in ops: # names match, preserve s = self.ts.copy() @@ -83,7 +92,7 @@ def test_binop_maybe_preserve_name(self): # names don't match, don't preserve cp = self.ts.copy() - cp.name = 'changed' + cp.name = "changed" result = getattr(s, op)(cp) assert result.name is None @@ -129,14 +138,13 @@ def test_to_sparse_pass_name(self): assert result.name == self.ts.name def test_constructor_dict(self): - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} result = self.series_klass(d) expected = self.series_klass(d, index=sorted(d.keys())) self._assert_series_equal(result, expected) - result = self.series_klass(d, index=['b', 'c', 'd', 'a']) - expected = self.series_klass([1, 2, np.nan, 0], - index=['b', 'c', 'd', 'a']) + result = self.series_klass(d, index=["b", "c", "d", "a"]) + expected = self.series_klass([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) self._assert_series_equal(result, expected) def test_constructor_subclass_dict(self): @@ -147,8 +155,7 @@ def test_constructor_subclass_dict(self): def test_constructor_ordereddict(self): # GH3283 - data = OrderedDict( - ('col%s' % i, np.random.random()) for i in range(12)) + data = OrderedDict(("col%s" % i, np.random.random()) for i in range(12)) series = self.series_klass(data) expected = self.series_klass(list(data.values()), list(data.keys())) @@ -162,20 +169,20 @@ class A(OrderedDict): self._assert_series_equal(series, expected) def test_constructor_dict_multiindex(self): - d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.} + d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0} _d = sorted(d.items()) result = self.series_klass(d) expected = self.series_klass( - [x[1] for x in _d], - index=pd.MultiIndex.from_tuples([x[0] for x in _d])) + [x[1] for x in _d], index=pd.MultiIndex.from_tuples([x[0] for x in _d]) + ) self._assert_series_equal(result, expected) - d['z'] = 111. 
- _d.insert(0, ('z', d['z'])) + d["z"] = 111.0 + _d.insert(0, ("z", d["z"])) result = self.series_klass(d) - expected = self.series_klass([x[1] for x in _d], - index=pd.Index([x[0] for x in _d], - tupleize_cols=False)) + expected = self.series_klass( + [x[1] for x in _d], index=pd.Index([x[0] for x in _d], tupleize_cols=False) + ) result = result.reindex(index=expected.index) self._assert_series_equal(result, expected) @@ -184,15 +191,16 @@ def test_constructor_dict_timedelta_index(self): # construct Series from dict as data and TimedeltaIndex as index # will result NaN in result Series data expected = self.series_klass( - data=['A', 'B', 'C'], - index=pd.to_timedelta([0, 10, 20], unit='s') + data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s") ) result = self.series_klass( - data={pd.to_timedelta(0, unit='s'): 'A', - pd.to_timedelta(10, unit='s'): 'B', - pd.to_timedelta(20, unit='s'): 'C'}, - index=pd.to_timedelta([0, 10, 20], unit='s') + data={ + pd.to_timedelta(0, unit="s"): "A", + pd.to_timedelta(10, unit="s"): "B", + pd.to_timedelta(20, unit="s"): "C", + }, + index=pd.to_timedelta([0, 10, 20], unit="s"), ) self._assert_series_equal(result, expected) @@ -200,8 +208,7 @@ def test_constructor_dict_timedelta_index(self): def test_from_array_deprecated(self): # multiple FutureWarnings, so can't assert stacklevel - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.series_klass.from_array([1, 2, 3]) def test_sparse_accessor_updates_on_inplace(self): @@ -218,71 +225,85 @@ class TestSeriesMisc(TestData, SharedWithSparse): def test_tab_completion(self): # GH 9910 - s = Series(list('abcd')) + s = Series(list("abcd")) # Series of str values should have .str but not .dt/.cat in __dir__ - assert 'str' in dir(s) - assert 'dt' not in dir(s) - assert 'cat' not in dir(s) + assert "str" in dir(s) + assert "dt" not in dir(s) + assert "cat" not in dir(s) # similarly for .dt - s = Series(date_range('1/1/2015', periods=5)) - assert 'dt' in dir(s) - assert 'str' not in dir(s) - assert 'cat' not in dir(s) + s = Series(date_range("1/1/2015", periods=5)) + assert "dt" in dir(s) + assert "str" not in dir(s) + assert "cat" not in dir(s) # Similarly for .cat, but with the twist that str and dt should be # there if the categories are of that type first cat and str. 
- s = Series(list('abbcd'), dtype="category") - assert 'cat' in dir(s) - assert 'str' in dir(s) # as it is a string categorical - assert 'dt' not in dir(s) + s = Series(list("abbcd"), dtype="category") + assert "cat" in dir(s) + assert "str" in dir(s) # as it is a string categorical + assert "dt" not in dir(s) # similar to cat and str - s = Series(date_range('1/1/2015', periods=5)).astype("category") - assert 'cat' in dir(s) - assert 'str' not in dir(s) - assert 'dt' in dir(s) # as it is a datetime categorical + s = Series(date_range("1/1/2015", periods=5)).astype("category") + assert "cat" in dir(s) + assert "str" not in dir(s) + assert "dt" in dir(s) # as it is a datetime categorical def test_tab_completion_with_categorical(self): # test the tab completion display - ok_for_cat = ['name', 'index', 'categorical', 'categories', 'codes', - 'ordered', 'set_categories', 'add_categories', - 'remove_categories', 'rename_categories', - 'reorder_categories', 'remove_unused_categories', - 'as_ordered', 'as_unordered'] + ok_for_cat = [ + "name", + "index", + "categorical", + "categories", + "codes", + "ordered", + "set_categories", + "add_categories", + "remove_categories", + "rename_categories", + "reorder_categories", + "remove_unused_categories", + "as_ordered", + "as_unordered", + ] def get_dir(s): - results = [r for r in s.cat.__dir__() if not r.startswith('_')] + results = [r for r in s.cat.__dir__() if not r.startswith("_")] return list(sorted(set(results))) - s = Series(list('aabbcde')).astype('category') + s = Series(list("aabbcde")).astype("category") results = get_dir(s) tm.assert_almost_equal(results, list(sorted(set(ok_for_cat)))) - @pytest.mark.parametrize("index", [ - tm.makeUnicodeIndex(10), - tm.makeStringIndex(10), - tm.makeCategoricalIndex(10), - Index(['foo', 'bar', 'baz'] * 2), - tm.makeDateIndex(10), - tm.makePeriodIndex(10), - tm.makeTimedeltaIndex(10), - tm.makeIntIndex(10), - tm.makeUIntIndex(10), - tm.makeIntIndex(10), - tm.makeFloatIndex(10), - Index([True, False]), - Index(['a{}'.format(i) for i in range(101)]), - pd.MultiIndex.from_tuples(zip('ABCD', 'EFGH')), - pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], 'EFGH')), ]) + @pytest.mark.parametrize( + "index", + [ + tm.makeUnicodeIndex(10), + tm.makeStringIndex(10), + tm.makeCategoricalIndex(10), + Index(["foo", "bar", "baz"] * 2), + tm.makeDateIndex(10), + tm.makePeriodIndex(10), + tm.makeTimedeltaIndex(10), + tm.makeIntIndex(10), + tm.makeUIntIndex(10), + tm.makeIntIndex(10), + tm.makeFloatIndex(10), + Index([True, False]), + Index(["a{}".format(i) for i in range(101)]), + pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")), + pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")), + ], + ) def test_index_tab_completion(self, index): # dir contains string-like values of the Index. 
s = pd.Series(index=index) dir_s = dir(s) for i, x in enumerate(s.index.unique(level=0)): if i < 100: - assert (not isinstance(x, str) or - not x.isidentifier() or x in dir_s) + assert not isinstance(x, str) or not x.isidentifier() or x in dir_s else: assert x not in dir_s @@ -322,7 +343,7 @@ def test_iteritems(self): assert val == self.ts[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.iteritems(), 'reverse') + assert not hasattr(self.series.iteritems(), "reverse") def test_items(self): for idx, val in self.series.items(): @@ -332,7 +353,7 @@ def test_items(self): assert val == self.ts[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.items(), 'reverse') + assert not hasattr(self.series.items(), "reverse") def test_raise_on_info(self): s = Series(np.random.randn(10)) @@ -343,7 +364,7 @@ def test_raise_on_info(self): def test_copy(self): for deep in [None, False, True]: - s = Series(np.arange(10), dtype='float64') + s = Series(np.arange(10), dtype="float64") # default deep is True if deep is None: @@ -365,19 +386,19 @@ def test_copy(self): def test_copy_tzaware(self): # GH#11794 # copy of tz-aware - expected = Series([Timestamp('2012/01/01', tz='UTC')]) - expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) + expected = Series([Timestamp("2012/01/01", tz="UTC")]) + expected2 = Series([Timestamp("1999/01/01", tz="UTC")]) for deep in [None, False, True]: - s = Series([Timestamp('2012/01/01', tz='UTC')]) + s = Series([Timestamp("2012/01/01", tz="UTC")]) if deep is None: s2 = s.copy() else: s2 = s.copy(deep=deep) - s2[0] = pd.Timestamp('1999/01/01', tz='UTC') + s2[0] = pd.Timestamp("1999/01/01", tz="UTC") # default deep is True if deep is None or deep is True: @@ -391,10 +412,10 @@ def test_copy_tzaware(self): def test_axis_alias(self): s = Series([1, 2, np.nan]) - assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index')) - assert s.dropna().sum('rows') == 3 - assert s._get_axis_number('rows') == 0 - assert s._get_axis_name('rows') == 'index' + assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index")) + assert s.dropna().sum("rows") == 3 + assert s._get_axis_number("rows") == 0 + assert s._get_axis_name("rows") == "index" def test_class_axis(self): # https://github.com/pandas-dev/pandas/issues/18147 @@ -408,8 +429,11 @@ def test_numpy_unique(self): def test_ndarray_compat(self): # test numpy compat with Series as sub-class of NDFrame - tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], - index=date_range('1/1/2000', periods=1000)) + tsdf = DataFrame( + np.random.randn(1000, 3), + columns=["A", "B", "C"], + index=date_range("1/1/2000", periods=1000), + ) def f(x): return x[x.idxmax()] @@ -428,53 +452,53 @@ def f(x): # using an ndarray like function s = Series(np.random.randn(10)) result = Series(np.ones_like(s)) - expected = Series(1, index=range(10), dtype='float64') + expected = Series(1, index=range(10), dtype="float64") tm.assert_series_equal(result, expected) # ravel s = Series(np.random.randn(10)) - tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F')) + tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) # compress # GH 6658 - s = Series([0, 1., -1], index=list('abc')) + s = Series([0, 1.0, -1], index=list("abc")) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.], index=['b'])) + tm.assert_series_equal(result, Series([1.0], 
index=["b"])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s < -1, s) # result empty Index(dtype=object) as the same as original - exp = Series([], dtype='float64', index=Index([], dtype='object')) + exp = Series([], dtype="float64", index=Index([], dtype="object")) tm.assert_series_equal(result, exp) - s = Series([0, 1., -1], index=[.1, .2, .3]) + s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.], index=[.2])) + tm.assert_series_equal(result, Series([1.0], index=[0.2])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = np.compress(s < -1, s) # result empty Float64Index as the same as original - exp = Series([], dtype='float64', index=Index([], dtype='float64')) + exp = Series([], dtype="float64", index=Index([], dtype="float64")) tm.assert_series_equal(result, exp) def test_str_accessor_updates_on_inplace(self): - s = pd.Series(list('abc')) + s = pd.Series(list("abc")) s.drop([0], inplace=True) assert len(s.str.lower()) == 2 def test_str_attribute(self): # GH9068 - methods = ['strip', 'rstrip', 'lstrip'] - s = Series([' jack', 'jill ', ' jesse ', 'frank']) + methods = ["strip", "rstrip", "lstrip"] + s = Series([" jack", "jill ", " jesse ", "frank"]) for method in methods: expected = Series([getattr(str, method)(x) for x in s.values]) assert_series_equal(getattr(Series.str, method)(s.str), expected) # str accessor only valid with string values s = Series(range(5)) - with pytest.raises(AttributeError, match='only use .str accessor'): + with pytest.raises(AttributeError, match="only use .str accessor"): s.str.repeat(2) def test_empty_method(self): @@ -486,14 +510,14 @@ def test_empty_method(self): def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 - pytest.importorskip('IPython', minversion="6.0.0") + pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; s = pd.Series()" ip.run_code(code) with tm.assert_produces_warning(None): - with provisionalcompleter('ignore'): - list(ip.Completer.completions('s.', 1)) + with provisionalcompleter("ignore"): + list(ip.Completer.completions("s.", 1)) def test_integer_series_size(self): # GH 25580 @@ -510,7 +534,6 @@ def test_get_values_deprecation(self): class TestCategoricalSeries: - @pytest.mark.parametrize( "method", [ @@ -522,11 +545,12 @@ class TestCategoricalSeries: lambda x: x.cat.add_categories([4]), lambda x: x.cat.as_ordered(), lambda x: x.cat.as_unordered(), - ]) + ], + ) def test_getname_categorical_accessor(self, method): # GH 17509 - s = Series([1, 2, 3], name='A').astype('category') - expected = 'A' + s = Series([1, 2, 3], name="A").astype("category") + expected = "A" result = method(s).name assert result == expected @@ -549,24 +573,24 @@ def test_cat_accessor(self): def test_cat_accessor_api(self): # GH 9322 from pandas.core.arrays.categorical import CategoricalAccessor + assert Series.cat is CategoricalAccessor - s = Series(list('aabbcde')).astype('category') + s = Series(list("aabbcde")).astype("category") assert isinstance(s.cat, CategoricalAccessor) invalid = Series([1]) with pytest.raises(AttributeError, match="only use .cat accessor"): invalid.cat - assert not hasattr(invalid, 'cat') + assert not hasattr(invalid, "cat") def test_cat_accessor_no_new_attributes(self): # 
https://github.com/pandas-dev/pandas/issues/10673 - c = Series(list('aabbcde')).astype('category') - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + c = Series(list("aabbcde")).astype("category") + with pytest.raises(AttributeError, match="You cannot add any new attribute"): c.cat.xlabel = "a" def test_cat_accessor_updates_on_inplace(self): - s = Series(list('abc')).astype('category') + s = Series(list("abc")).astype("category") s.drop(0, inplace=True) s.cat.remove_unused_categories(inplace=True) assert len(s.cat.categories) == 2 @@ -580,11 +604,11 @@ def test_categorical_delegations(self): with pytest.raises(AttributeError, match=msg): Series([1, 2, 3]).cat() with pytest.raises(AttributeError, match=msg): - Series(['a', 'b', 'c']).cat + Series(["a", "b", "c"]).cat with pytest.raises(AttributeError, match=msg): - Series(np.arange(5.)).cat + Series(np.arange(5.0)).cat with pytest.raises(AttributeError, match=msg): - Series([Timestamp('20130101')]).cat + Series([Timestamp("20130101")]).cat # Series should delegate calls to '.categories', '.codes', '.ordered' # and the methods '.set_categories()' 'drop_unused_categories()' to the @@ -596,7 +620,7 @@ def test_categorical_delegations(self): exp_categories = Index([1, 2, 3]) tm.assert_index_equal(s.cat.categories, exp_categories) - exp_codes = Series([0, 1, 2, 0], dtype='int8') + exp_codes = Series([0, 1, 2, 0], dtype="int8") tm.assert_series_equal(s.cat.codes, exp_codes) assert s.cat.ordered @@ -615,8 +639,7 @@ def test_categorical_delegations(self): tm.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories - s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c" - ])) + s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"])) exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() @@ -635,22 +658,22 @@ def test_categorical_delegations(self): # GH18862 (let Series.cat.rename_categories take callables) s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) result = s.cat.rename_categories(lambda x: x.upper()) - expected = Series(Categorical(["A", "B", "C", "A"], - categories=["A", "B", "C"], - ordered=True)) + expected = Series( + Categorical(["A", "B", "C", "A"], categories=["A", "B", "C"], ordered=True) + ) tm.assert_series_equal(result, expected) def test_dt_accessor_api_for_categorical(self): # https://github.com/pandas-dev/pandas/issues/10661 from pandas.core.indexes.accessors import Properties - s_dr = Series(date_range('1/1/2015', periods=5, tz="MET")) + s_dr = Series(date_range("1/1/2015", periods=5, tz="MET")) c_dr = s_dr.astype("category") - s_pr = Series(period_range('1/1/2015', freq='D', periods=5)) + s_pr = Series(period_range("1/1/2015", freq="D", periods=5)) c_pr = s_pr.astype("category") - s_tdr = Series(timedelta_range('1 days', '10 days')) + s_tdr = Series(timedelta_range("1 days", "10 days")) c_tdr = s_tdr.astype("category") # only testing field (like .day) @@ -660,29 +683,36 @@ def test_dt_accessor_api_for_categorical(self): test_data = [ ("Datetime", get_ops(DatetimeIndex), s_dr, c_dr), ("Period", get_ops(PeriodArray), s_pr, c_pr), - ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr)] + ("Timedelta", get_ops(TimedeltaIndex), s_tdr, c_tdr), + ] assert isinstance(c_dr.dt, Properties) special_func_defs = [ - ('strftime', ("%Y-%m-%d",), {}), - ('tz_convert', ("EST",), {}), - ('round', ("D",), {}), - ('floor', ("D",), {}), - ('ceil', ("D",), {}), - 
('asfreq', ("D",), {}), + ("strftime", ("%Y-%m-%d",), {}), + ("tz_convert", ("EST",), {}), + ("round", ("D",), {}), + ("floor", ("D",), {}), + ("ceil", ("D",), {}), + ("asfreq", ("D",), {}), # ('tz_localize', ("UTC",), {}), ] _special_func_names = [f[0] for f in special_func_defs] # the series is already localized - _ignore_names = ['tz_localize', 'components'] + _ignore_names = ["tz_localize", "components"] for name, attr_names, s, c in test_data: - func_names = [f - for f in dir(s.dt) - if not (f.startswith("_") or f in attr_names or f in - _special_func_names or f in _ignore_names)] + func_names = [ + f + for f in dir(s.dt) + if not ( + f.startswith("_") + or f in attr_names + or f in _special_func_names + or f in _ignore_names + ) + ] func_defs = [(f, (), {}) for f in func_names] for f_def in special_func_defs: @@ -691,7 +721,7 @@ def test_dt_accessor_api_for_categorical(self): for func, args, kwargs in func_defs: with warnings.catch_warnings(): - if func == 'to_period': + if func == "to_period": # dropping TZ warnings.simplefilter("ignore", UserWarning) res = getattr(c.dt, func)(*args, **kwargs) @@ -719,9 +749,9 @@ def test_dt_accessor_api_for_categorical(self): else: tm.assert_almost_equal(res, exp) - invalid = Series([1, 2, 3]).astype('category') + invalid = Series([1, 2, 3]).astype("category") msg = "Can only use .dt accessor with datetimelike" with pytest.raises(AttributeError, match=msg): invalid.dt - assert not hasattr(invalid, 'str') + assert not hasattr(invalid, "str") diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 4551453499455..65a0822bbc55f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -12,19 +12,21 @@ class TestSeriesApply: - def test_apply(self, datetime_series): - with np.errstate(all='ignore'): - tm.assert_series_equal(datetime_series.apply(np.sqrt), - np.sqrt(datetime_series)) + with np.errstate(all="ignore"): + tm.assert_series_equal( + datetime_series.apply(np.sqrt), np.sqrt(datetime_series) + ) # element-wise apply import math - tm.assert_series_equal(datetime_series.apply(math.exp), - np.exp(datetime_series)) + + tm.assert_series_equal( + datetime_series.apply(math.exp), np.exp(datetime_series) + ) # empty series - s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) + s = Series(dtype=object, name="foo", index=pd.Index([], name="bar")) rs = s.apply(lambda x: x) tm.assert_series_equal(s, rs) @@ -61,21 +63,21 @@ def test_apply_dont_convert_dtype(self): def test_with_string_args(self, datetime_series): - for arg in ['sum', 'mean', 'min', 'max', 'std']: + for arg in ["sum", "mean", "min", "max", "std"]: result = datetime_series.apply(arg) expected = getattr(datetime_series, arg)() assert result == expected def test_apply_args(self): - s = Series(['foo,bar']) + s = Series(["foo,bar"]) - result = s.apply(str.split, args=(',', )) - assert result[0] == ['foo', 'bar'] + result = s.apply(str.split, args=(",",)) + assert result[0] == ["foo", "bar"] assert isinstance(result[0], list) def test_series_map_box_timestamps(self): # GH#2689, GH#2627 - ser = Series(pd.date_range('1/1/2000', periods=10)) + ser = Series(pd.date_range("1/1/2000", periods=10)) def func(x): return (x.hour, x.day, x.month) @@ -86,57 +88,57 @@ def func(x): def test_apply_box(self): # ufunc will not be boxed. 
Same test cases as the test_map_box - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' - res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.apply(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) - exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + assert s.dtype == "timedelta64[ns]" + res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) - assert s.dtype == 'Period[M]' - res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, - x.freqstr)) - exp = pd.Series(['Period_M', 'Period_M']) + assert s.dtype == "Period[M]" + res = s.apply(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) def test_apply_datetimetz(self): - values = pd.date_range('2011-01-01', '2011-01-02', - freq='H').tz_localize('Asia/Tokyo') - s = pd.Series(values, name='XX') + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") result = s.apply(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range('2011-01-02', '2011-01-03', - freq='H').tz_localize('Asia/Tokyo') - exp = pd.Series(exp_values, name='XX') + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") tm.assert_series_equal(result, exp) # change dtype # GH 14506 : Returned dtype changed from int32 to int64 result = s.apply(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) tm.assert_series_equal(result, exp) # not vectorized @@ -146,36 +148,35 @@ def f(x): return str(x.tz) result = s.map(f) - exp = pd.Series(['Asia/Tokyo'] * 25, name='XX') + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") tm.assert_series_equal(result, exp) def test_apply_dict_depr(self): - tsdf = 
pd.DataFrame(np.random.randn(10, 3), - columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) with tm.assert_produces_warning(FutureWarning): - tsdf.A.agg({'foo': ['sum', 'mean']}) + tsdf.A.agg({"foo": ["sum", "mean"]}) - @pytest.mark.parametrize('series', [ - ['1-1', '1-1', np.NaN], - ['1-1', '1-2', np.NaN]]) + @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) def test_apply_categorical_with_nan_values(self, series): # GH 20714 bug fixed in: GH 24275 - s = pd.Series(series, dtype='category') - result = s.apply(lambda x: x.split('-')[0]) + s = pd.Series(series, dtype="category") + result = s.apply(lambda x: x.split("-")[0]) result = result.astype(object) - expected = pd.Series(['1', '1', np.NaN], dtype='category') + expected = pd.Series(["1", "1", np.NaN], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) class TestSeriesAggregate: - def test_transform(self, string_series): # transforming functions - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): f_sqrt = np.sqrt(string_series) f_abs = np.abs(string_series) @@ -191,86 +192,89 @@ def test_transform(self, string_series): # list-like result = string_series.transform([np.sqrt]) expected = f_sqrt.to_frame().copy() - expected.columns = ['sqrt'] + expected.columns = ["sqrt"] assert_frame_equal(result, expected) result = string_series.transform([np.sqrt]) assert_frame_equal(result, expected) - result = string_series.transform(['sqrt']) + result = string_series.transform(["sqrt"]) assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both functions per # series and then concatting expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ['sqrt', 'absolute'] + expected.columns = ["sqrt", "absolute"] result = string_series.apply([np.sqrt, np.abs]) assert_frame_equal(result, expected) - result = string_series.transform(['sqrt', 'abs']) - expected.columns = ['sqrt', 'abs'] + result = string_series.transform(["sqrt", "abs"]) + expected.columns = ["sqrt", "abs"] assert_frame_equal(result, expected) # dict, provide renaming expected = pd.concat([f_sqrt, f_abs], axis=1) - expected.columns = ['foo', 'bar'] - expected = expected.unstack().rename('series') + expected.columns = ["foo", "bar"] + expected = expected.unstack().rename("series") - result = string_series.apply({'foo': np.sqrt, 'bar': np.abs}) + result = string_series.apply({"foo": np.sqrt, "bar": np.abs}) assert_series_equal(result.reindex_like(expected), expected) def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator with pytest.raises(ValueError): - string_series.transform(['min', 'max']) + string_series.transform(["min", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.agg(['sqrt', 'max']) + with np.errstate(all="ignore"): + string_series.agg(["sqrt", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.transform(['sqrt', 'max']) + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) with pytest.raises(ValueError): - with np.errstate(all='ignore'): - string_series.agg({'foo': np.sqrt, 'bar': 'sum'}) + with np.errstate(all="ignore"): + string_series.agg({"foo": np.sqrt, "bar": "sum"}) def test_demo(self): # demonstration tests - s = Series(range(6), 
dtype='int64', name='series') + s = Series(range(6), dtype="int64", name="series") - result = s.agg(['min', 'max']) - expected = Series([0, 5], index=['min', 'max'], name='series') + result = s.agg(["min", "max"]) + expected = Series([0, 5], index=["min", "max"], name="series") tm.assert_series_equal(result, expected) - result = s.agg({'foo': 'min'}) - expected = Series([0], index=['foo'], name='series') + result = s.agg({"foo": "min"}) + expected = Series([0], index=["foo"], name="series") tm.assert_series_equal(result, expected) # nested renaming with tm.assert_produces_warning(FutureWarning): - result = s.agg({'foo': ['min', 'max']}) + result = s.agg({"foo": ["min", "max"]}) - expected = DataFrame( - {'foo': [0, 5]}, - index=['min', 'max']).unstack().rename('series') + expected = ( + DataFrame({"foo": [0, 5]}, index=["min", "max"]).unstack().rename("series") + ) tm.assert_series_equal(result, expected) def test_multiple_aggregators_with_dict_api(self): - s = Series(range(6), dtype='int64', name='series') + s = Series(range(6), dtype="int64", name="series") # nested renaming with tm.assert_produces_warning(FutureWarning): - result = s.agg({'foo': ['min', 'max'], 'bar': ['sum', 'mean']}) - - expected = DataFrame( - {'foo': [5.0, np.nan, 0.0, np.nan], - 'bar': [np.nan, 2.5, np.nan, 15.0]}, - columns=['foo', 'bar'], - index=['max', 'mean', - 'min', 'sum']).unstack().rename('series') + result = s.agg({"foo": ["min", "max"], "bar": ["sum", "mean"]}) + + expected = ( + DataFrame( + {"foo": [5.0, np.nan, 0.0, np.nan], "bar": [np.nan, 2.5, np.nan, 15.0]}, + columns=["foo", "bar"], + index=["max", "mean", "min", "sum"], + ) + .unstack() + .rename("series") + ) tm.assert_series_equal(result.reindex_like(expected), expected) def test_agg_apply_evaluate_lambdas_the_same(self, string_series): @@ -287,37 +291,42 @@ def test_agg_apply_evaluate_lambdas_the_same(self, string_series): def test_with_nested_series(self, datetime_series): # GH 2316 # .agg with a reducer and a transform, what to do - result = datetime_series.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': datetime_series, - 'x^2': datetime_series ** 2}) + result = datetime_series.apply( + lambda x: Series([x, x ** 2], index=["x", "x^2"]) + ) + expected = DataFrame({"x": datetime_series, "x^2": datetime_series ** 2}) tm.assert_frame_equal(result, expected) - result = datetime_series.agg(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) + result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) tm.assert_frame_equal(result, expected) def test_replicate_describe(self, string_series): # this also tests a result set that is all scalars expected = string_series.describe() - result = string_series.apply(OrderedDict( - [('count', 'count'), - ('mean', 'mean'), - ('std', 'std'), - ('min', 'min'), - ('25%', lambda x: x.quantile(0.25)), - ('50%', 'median'), - ('75%', lambda x: x.quantile(0.75)), - ('max', 'max')])) + result = string_series.apply( + OrderedDict( + [ + ("count", "count"), + ("mean", "mean"), + ("std", "std"), + ("min", "min"), + ("25%", lambda x: x.quantile(0.25)), + ("50%", "median"), + ("75%", lambda x: x.quantile(0.75)), + ("max", "max"), + ] + ) + ) assert_series_equal(result, expected) def test_reduce(self, string_series): # reductions with named functions - result = string_series.agg(['sum', 'mean']) - expected = Series([string_series.sum(), - string_series.mean()], - ['sum', 'mean'], - name=string_series.name) + result = string_series.agg(["sum", "mean"]) + expected 
= Series( + [string_series.sum(), string_series.mean()], + ["sum", "mean"], + name=string_series.name, + ) assert_series_equal(result, expected) def test_non_callable_aggregates(self): @@ -325,50 +334,60 @@ def test_non_callable_aggregates(self): s = Series([1, 2, None]) # Calling agg w/ just a string arg same as calling s.arg - result = s.agg('size') + result = s.agg("size") expected = s.size assert result == expected # test when mixed w/ callable reducers - result = s.agg(['size', 'count', 'mean']) - expected = Series(OrderedDict([('size', 3.0), - ('count', 2.0), - ('mean', 1.5)])) + result = s.agg(["size", "count", "mean"]) + expected = Series(OrderedDict([("size", 3.0), ("count", 2.0), ("mean", 1.5)])) assert_series_equal(result[expected.index], expected) - @pytest.mark.parametrize("series, func, expected", chain( - _get_cython_table_params(Series(), [ - ('sum', 0), - ('max', np.nan), - ('min', np.nan), - ('all', True), - ('any', False), - ('mean', np.nan), - ('prod', 1), - ('std', np.nan), - ('var', np.nan), - ('median', np.nan), - ]), - _get_cython_table_params(Series([np.nan, 1, 2, 3]), [ - ('sum', 6), - ('max', 3), - ('min', 1), - ('all', True), - ('any', True), - ('mean', 2), - ('prod', 6), - ('std', 1), - ('var', 1), - ('median', 2), - ]), - _get_cython_table_params(Series('a b c'.split()), [ - ('sum', 'abc'), - ('max', 'c'), - ('min', 'a'), - ('all', 'c'), # see GH12863 - ('any', 'a'), - ]), - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(), + [ + ("sum", 0), + ("max", np.nan), + ("min", np.nan), + ("all", True), + ("any", False), + ("mean", np.nan), + ("prod", 1), + ("std", np.nan), + ("var", np.nan), + ("median", np.nan), + ], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("sum", 6), + ("max", 3), + ("min", 1), + ("all", True), + ("any", True), + ("mean", 2), + ("prod", 6), + ("std", 1), + ("var", 1), + ("median", 2), + ], + ), + _get_cython_table_params( + Series("a b c".split()), + [ + ("sum", "abc"), + ("max", "c"), + ("min", "a"), + ("all", "c"), # see GH12863 + ("any", "a"), + ], + ), + ), + ) def test_agg_cython_table(self, series, func, expected): # GH21224 # test reducing functions in @@ -379,19 +398,25 @@ def test_agg_cython_table(self, series, func, expected): else: assert result == expected - @pytest.mark.parametrize("series, func, expected", chain( - _get_cython_table_params(Series(), [ - ('cumprod', Series([], Index([]))), - ('cumsum', Series([], Index([]))), - ]), - _get_cython_table_params(Series([np.nan, 1, 2, 3]), [ - ('cumprod', Series([np.nan, 1, 2, 6])), - ('cumsum', Series([np.nan, 1, 3, 6])), - ]), - _get_cython_table_params(Series('a b c'.split()), [ - ('cumsum', Series(['a', 'ab', 'abc'])), - ]), - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series(), + [("cumprod", Series([], Index([]))), ("cumsum", Series([], Index([])))], + ), + _get_cython_table_params( + Series([np.nan, 1, 2, 3]), + [ + ("cumprod", Series([np.nan, 1, 2, 6])), + ("cumsum", Series([np.nan, 1, 3, 6])), + ], + ), + _get_cython_table_params( + Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))] + ), + ), + ) def test_agg_cython_table_transform(self, series, func, expected): # GH21224 # test transforming functions in @@ -399,16 +424,22 @@ def test_agg_cython_table_transform(self, series, func, expected): result = series.agg(func) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("series, func, expected", chain( - 
_get_cython_table_params(Series('a b c'.split()), [ - ('mean', TypeError), # mean raises TypeError - ('prod', TypeError), - ('std', TypeError), - ('var', TypeError), - ('median', TypeError), - ('cumprod', TypeError), - ]) - )) + @pytest.mark.parametrize( + "series, func, expected", + chain( + _get_cython_table_params( + Series("a b c".split()), + [ + ("mean", TypeError), # mean raises TypeError + ("prod", TypeError), + ("std", TypeError), + ("var", TypeError), + ("median", TypeError), + ("cumprod", TypeError), + ], + ) + ), + ) def test_agg_cython_table_raises(self, series, func, expected): # GH21224 with pytest.raises(expected): @@ -417,12 +448,11 @@ def test_agg_cython_table_raises(self, series, func, expected): class TestSeriesMap: - def test_map(self, datetime_series): index, data = tm.getMixedTypeDict() - source = Series(data['B'], index=data['C']) - target = Series(data['C'][:4], index=data['D'][:4]) + source = Series(data["B"], index=data["C"]) + target = Series(data["C"][:4], index=data["D"][:4]) merged = target.map(source) @@ -449,25 +479,28 @@ def test_map(self, datetime_series): exp = Series(["odd", "even", "odd", np.nan]) tm.assert_series_equal(a.map(c), exp) - a = Series(['a', 'b', 'c', 'd']) - b = Series([1, 2, 3, 4], - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e'])) + a = Series(["a", "b", "c", "d"]) + b = Series([1, 2, 3, 4], index=pd.CategoricalIndex(["b", "c", "d", "e"])) + c = Series([1, 2, 3, 4], index=Index(["b", "c", "d", "e"])) exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(b), exp) exp = Series([np.nan, 1, 2, 3]) tm.assert_series_equal(a.map(c), exp) - a = Series(['a', 'b', 'c', 'd']) - b = Series(['B', 'C', 'D', 'E'], dtype='category', - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - - exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'], - categories=['B', 'C', 'D', 'E'])) + a = Series(["a", "b", "c", "d"]) + b = Series( + ["B", "C", "D", "E"], + dtype="category", + index=pd.CategoricalIndex(["b", "c", "d", "e"]), + ) + c = Series(["B", "C", "D", "E"], index=Index(["b", "c", "d", "e"])) + + exp = Series( + pd.Categorical([np.nan, "B", "C", "D"], categories=["B", "C", "D", "E"]) + ) tm.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, 'B', 'C', 'D']) + exp = Series([np.nan, "B", "C", "D"]) tm.assert_series_equal(a.map(c), exp) @pytest.mark.parametrize("index", tm.all_index_generator(10)) @@ -481,12 +514,12 @@ def test_map_empty(self, index): def test_map_compat(self): # related GH 8024 s = Series([True, True, False], index=[1, 2, 3]) - result = s.map({True: 'foo', False: 'bar'}) - expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3]) + result = s.map({True: "foo", False: "bar"}) + expected = Series(["foo", "foo", "bar"], index=[1, 2, 3]) assert_series_equal(result, expected) def test_map_int(self): - left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) + left = Series({"a": 1.0, "b": 2.0, "c": 3.0, "d": 4}) right = Series({1: 11, 2: 22, 3: 33}) assert left.dtype == np.float_ @@ -494,8 +527,8 @@ def test_map_int(self): merged = left.map(right) assert merged.dtype == np.float_ - assert isna(merged['d']) - assert not isna(merged['c']) + assert isna(merged["d"]) + assert not isna(merged["c"]) def test_map_type_inference(self): s = Series(range(3)) @@ -512,7 +545,7 @@ def test_map_decimal(self, string_series): def test_map_na_exclusion(self): s = Series([1.5, np.nan, 3, np.nan, 5]) - result = s.map(lambda x: x * 
2, na_action='ignore') + result = s.map(lambda x: x * 2, na_action="ignore") exp = s * 2 assert_series_equal(result, exp) @@ -524,30 +557,29 @@ def test_map_dict_with_tuple_keys(self): from being mapped properly. """ # GH 18496 - df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) - label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df = pd.DataFrame({"a": [(1,), (2,), (3, 4), (5, 6)]}) + label_mappings = {(1,): "A", (2,): "B", (3, 4): "A", (5, 6): "B"} - df['labels'] = df['a'].map(label_mappings) - df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) + df["labels"] = df["a"].map(label_mappings) + df["expected_labels"] = pd.Series(["A", "B", "A", "B"], index=df.index) # All labels should be filled now - tm.assert_series_equal(df['labels'], df['expected_labels'], - check_names=False) + tm.assert_series_equal(df["labels"], df["expected_labels"], check_names=False) def test_map_counter(self): - s = Series(['a', 'b', 'c'], index=[1, 2, 3]) + s = Series(["a", "b", "c"], index=[1, 2, 3]) counter = Counter() - counter['b'] = 5 - counter['c'] += 1 + counter["b"] = 5 + counter["c"] += 1 result = s.map(counter) expected = Series([0, 5, 1], index=[1, 2, 3]) assert_series_equal(result, expected) def test_map_defaultdict(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - default_dict = defaultdict(lambda: 'blank') - default_dict[1] = 'stuff' + s = Series([1, 2, 3], index=["a", "b", "c"]) + default_dict = defaultdict(lambda: "blank") + default_dict[1] = "stuff" result = s.map(default_dict) - expected = Series(['stuff', 'blank', 'blank'], index=['a', 'b', 'c']) + expected = Series(["stuff", "blank", "blank"], index=["a", "b", "c"]) assert_series_equal(result, expected) def test_map_dict_subclass_with_missing(self): @@ -555,101 +587,104 @@ def test_map_dict_subclass_with_missing(self): Test Series.map with a dictionary subclass that defines __missing__, i.e. sets a default value (GH #15999). 
""" + class DictWithMissing(dict): def __missing__(self, key): - return 'missing' + return "missing" + s = Series([1, 2, 3]) - dictionary = DictWithMissing({3: 'three'}) + dictionary = DictWithMissing({3: "three"}) result = s.map(dictionary) - expected = Series(['missing', 'missing', 'three']) + expected = Series(["missing", "missing", "three"]) assert_series_equal(result, expected) def test_map_dict_subclass_without_missing(self): class DictWithoutMissing(dict): pass + s = Series([1, 2, 3]) - dictionary = DictWithoutMissing({3: 'three'}) + dictionary = DictWithoutMissing({3: "three"}) result = s.map(dictionary) - expected = Series([np.nan, np.nan, 'three']) + expected = Series([np.nan, np.nan, "three"]) assert_series_equal(result, expected) def test_map_box(self): - vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" # boxed value must be Timestamp instance - res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_None", "Timestamp_2_None"]) tm.assert_series_equal(res, exp) - vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + ] s = pd.Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' - res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, - x.day, x.tz)) - exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + assert s.dtype == "datetime64[ns, US/Eastern]" + res = s.map(lambda x: "{0}_{1}_{2}".format(x.__class__.__name__, x.day, x.tz)) + exp = pd.Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"]) tm.assert_series_equal(res, exp) # timedelta - vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")] s = pd.Series(vals) - assert s.dtype == 'timedelta64[ns]' - res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) - exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + assert s.dtype == "timedelta64[ns]" + res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.days)) + exp = pd.Series(["Timedelta_1", "Timedelta_2"]) tm.assert_series_equal(res, exp) # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = pd.Series(vals) - assert s.dtype == 'Period[M]' - res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, - x.freqstr)) - exp = pd.Series(['Period_M', 'Period_M']) + assert s.dtype == "Period[M]" + res = s.map(lambda x: "{0}_{1}".format(x.__class__.__name__, x.freqstr)) + exp = pd.Series(["Period_M", "Period_M"]) tm.assert_series_equal(res, exp) def test_map_categorical(self): - values = pd.Categorical(list('ABBABCD'), categories=list('DCBA'), - ordered=True) - s = pd.Series(values, name='XX', index=list('abcdefg')) + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + s = pd.Series(values, name="XX", index=list("abcdefg")) result = s.map(lambda x: x.lower()) - exp_values = pd.Categorical(list('abbabcd'), categories=list('dcba'), - ordered=True) - exp = pd.Series(exp_values, name='XX', 
index=list('abcdefg')) + exp_values = pd.Categorical( + list("abbabcd"), categories=list("dcba"), ordered=True + ) + exp = pd.Series(exp_values, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) tm.assert_categorical_equal(result.values, exp_values) - result = s.map(lambda x: 'A') - exp = pd.Series(['A'] * 7, name='XX', index=list('abcdefg')) + result = s.map(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) assert result.dtype == np.object with pytest.raises(NotImplementedError): - s.map(lambda x: x, na_action='ignore') + s.map(lambda x: x, na_action="ignore") def test_map_datetimetz(self): - values = pd.date_range('2011-01-01', '2011-01-02', - freq='H').tz_localize('Asia/Tokyo') - s = pd.Series(values, name='XX') + values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize( + "Asia/Tokyo" + ) + s = pd.Series(values, name="XX") # keep tz result = s.map(lambda x: x + pd.offsets.Day()) - exp_values = pd.date_range('2011-01-02', '2011-01-03', - freq='H').tz_localize('Asia/Tokyo') - exp = pd.Series(exp_values, name='XX') + exp_values = pd.date_range("2011-01-02", "2011-01-03", freq="H").tz_localize( + "Asia/Tokyo" + ) + exp = pd.Series(exp_values, name="XX") tm.assert_series_equal(result, exp) # change dtype # GH 14506 : Returned dtype changed from int32 to int64 result = s.map(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) + exp = pd.Series(list(range(24)) + [0], name="XX", dtype=np.int64) tm.assert_series_equal(result, exp) with pytest.raises(NotImplementedError): - s.map(lambda x: x, na_action='ignore') + s.map(lambda x: x, na_action="ignore") # not vectorized def f(x): @@ -658,13 +693,17 @@ def f(x): return str(x.tz) result = s.map(f) - exp = pd.Series(['Asia/Tokyo'] * 25, name='XX') + exp = pd.Series(["Asia/Tokyo"] * 25, name="XX") tm.assert_series_equal(result, exp) - @pytest.mark.parametrize("vals,mapping,exp", [ - (list('abc'), {np.nan: 'not NaN'}, [np.nan] * 3 + ['not NaN']), - (list('abc'), {'a': 'a letter'}, ['a letter'] + [np.nan] * 3), - (list(range(3)), {0: 42}, [42] + [np.nan] * 3)]) + @pytest.mark.parametrize( + "vals,mapping,exp", + [ + (list("abc"), {np.nan: "not NaN"}, [np.nan] * 3 + ["not NaN"]), + (list("abc"), {"a": "a letter"}, ["a letter"] + [np.nan] * 3), + (list(range(3)), {0: 42}, [42] + [np.nan] * 3), + ], + ) def test_map_missing_mixed(self, vals, mapping, exp): # GH20495 s = pd.Series(vals + [np.nan]) @@ -672,22 +711,29 @@ def test_map_missing_mixed(self, vals, mapping, exp): tm.assert_series_equal(result, pd.Series(exp)) - @pytest.mark.parametrize("dti,exp", [ - (Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), - DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype='int64')), - (tm.makeTimeSeries(nper=30), - DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype='int64')) - ]) + @pytest.mark.parametrize( + "dti,exp", + [ + ( + Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])), + DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"), + ), + ( + tm.makeTimeSeries(nper=30), + DataFrame(np.repeat([[1, 2]], 30, axis=0), dtype="int64"), + ), + ], + ) def test_apply_series_on_date_time_index_aware_series(self, dti, exp): # GH 25959 # Calling apply on a localized time series should not cause an error - index = dti.tz_localize('UTC').index + index = dti.tz_localize("UTC").index result = pd.Series(index).apply(lambda x: pd.Series([1, 2])) assert_frame_equal(result, exp) def test_apply_scaler_on_date_time_index_aware_series(self): 
# GH 25959 # Calling apply on a localized time series should not cause an error - series = tm.makeTimeSeries(nper=30).tz_localize('UTC') + series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) - assert_series_equal(result, pd.Series(np.ones(30), dtype='int64')) + assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 2cc2ad080eb4c..5b57b5ba2dbae 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -15,31 +15,31 @@ def _permute(obj): class TestSeriesFlexArithmetic: @pytest.mark.parametrize( - 'ts', + "ts", [ (lambda x: x, lambda x: x * 2, False), (lambda x: x, lambda x: x[::2], False), (lambda x: x, lambda x: 5, True), - (lambda x: tm.makeFloatSeries(), - lambda x: tm.makeFloatSeries(), - True) - ]) - @pytest.mark.parametrize('opname', ['add', 'sub', 'mul', 'floordiv', - 'truediv', 'div', 'pow']) + (lambda x: tm.makeFloatSeries(), lambda x: tm.makeFloatSeries(), True), + ], + ) + @pytest.mark.parametrize( + "opname", ["add", "sub", "mul", "floordiv", "truediv", "div", "pow"] + ) def test_flex_method_equivalence(self, opname, ts): # check that Series.{opname} behaves like Series.__{opname}__, - tser = tm.makeTimeSeries().rename('ts') + tser = tm.makeTimeSeries().rename("ts") series = ts[0](tser) other = ts[1](tser) check_reverse = ts[2] - if opname == 'div': - pytest.skip('div test only for Py3') + if opname == "div": + pytest.skip("div test only for Py3") op = getattr(Series, opname) - if op == 'div': + if op == "div": alt = operator.truediv else: alt = getattr(operator, opname) @@ -58,7 +58,7 @@ class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted def test_add_series_with_period_index(self): - rng = pd.period_range('1/1/2000', '1/1/2010', freq='A') + rng = pd.period_range("1/1/2000", "1/1/2010", freq="A") ts = Series(np.random.randn(len(rng)), index=rng) result = ts + ts[::2] @@ -71,12 +71,13 @@ def test_add_series_with_period_index(self): msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" with pytest.raises(IncompatibleFrequency, match=msg): - ts + ts.asfreq('D', how="end") + ts + ts.asfreq("D", how="end") # ------------------------------------------------------------------ # Comparisons + class TestSeriesFlexComparison: def test_comparison_flex_basic(self): left = pd.Series(np.random.randn(10)) @@ -90,7 +91,7 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.ge(right), left >= right) # axis - for axis in [0, None, 'index']: + for axis in [0, None, "index"]: tm.assert_series_equal(left.eq(right, axis=axis), left == right) tm.assert_series_equal(left.ne(right, axis=axis), left != right) tm.assert_series_equal(left.le(right, axis=axis), left < right) @@ -99,16 +100,16 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.ge(right, axis=axis), left >= right) # - msg = 'No axis named 1 for object type' - for op in ['eq', 'ne', 'le', 'le', 'gt', 'ge']: + msg = "No axis named 1 for object type" + for op in ["eq", "ne", "le", "le", "gt", "ge"]: with pytest.raises(ValueError, match=msg): getattr(left, op)(right, axis=1) class TestSeriesComparison: def test_comparison_different_length(self): - a = Series(['a', 'b', 'c']) - b = Series(['b', 'a']) + a = Series(["a", "b", "c"]) + b = Series(["b", "a"]) with pytest.raises(ValueError): a < b @@ -117,41 +118,41 @@ def 
test_comparison_different_length(self): with pytest.raises(ValueError): a == b - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_ser_flex_cmp_return_dtypes(self, opname): # GH#15115 ser = Series([1, 3, 2], index=range(3)) const = 2 result = getattr(ser, opname)(const).dtypes - expected = np.dtype('bool') + expected = np.dtype("bool") assert result == expected - @pytest.mark.parametrize('opname', ['eq', 'ne', 'gt', 'lt', 'ge', 'le']) + @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"]) def test_ser_flex_cmp_return_dtypes_empty(self, opname): # GH#15115 empty Series case ser = Series([1, 3, 2], index=range(3)) empty = ser.iloc[:0] const = 2 result = getattr(empty, opname)(const).dtypes - expected = np.dtype('bool') + expected = np.dtype("bool") assert result == expected - @pytest.mark.parametrize('op', [operator.eq, operator.ne, - operator.le, operator.lt, - operator.ge, operator.gt]) - @pytest.mark.parametrize('names', [(None, None, None), - ('foo', 'bar', None), - ('baz', 'baz', 'baz')]) + @pytest.mark.parametrize( + "op", + [operator.eq, operator.ne, operator.le, operator.lt, operator.ge, operator.gt], + ) + @pytest.mark.parametrize( + "names", [(None, None, None), ("foo", "bar", None), ("baz", "baz", "baz")] + ) def test_ser_cmp_result_names(self, names, op): # datetime64 dtype - dti = pd.date_range('1949-06-07 03:00:00', - freq='H', periods=5, name=names[0]) + dti = pd.date_range("1949-06-07 03:00:00", freq="H", periods=5, name=names[0]) ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] # datetime64tz dtype - dti = dti.tz_localize('US/Central') + dti = dti.tz_localize("US/Central") ser = Series(dti).rename(names[1]) result = op(ser, dti) assert result.name == names[2] @@ -165,7 +166,7 @@ def test_ser_cmp_result_names(self, names, op): # categorical if op in [operator.eq, operator.ne]: # categorical dtype comparisons raise for inequalities - cidx = tdi.astype('category') + cidx = tdi.astype("category") ser = Series(cidx).rename(names[1]) result = op(ser, cidx) assert result.name == names[2] diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py index ee94b32717d1a..8bc9e9c38d83a 100644 --- a/pandas/tests/series/test_asof.py +++ b/pandas/tests/series/test_asof.py @@ -6,15 +6,14 @@ class TestSeriesAsof: - def test_basic(self): # array or list or dates N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.random.randn(N), index=rng) ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = ts.asof(dates) assert notna(result).all() @@ -36,7 +35,7 @@ def test_basic(self): def test_scalar(self): N = 30 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") ts = Series(np.arange(N), index=rng) ts[5:10] = np.NaN ts[15:20] = np.NaN @@ -61,35 +60,42 @@ def test_scalar(self): def test_with_nan(self): # basic asof test - rng = date_range('1/1/2000', '1/2/2000', freq='4h') + rng = date_range("1/1/2000", "1/2/2000", freq="4h") s = Series(np.arange(len(rng)), index=rng) - r = s.resample('2h').mean() + r = s.resample("2h").mean() result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) r.iloc[3:5] = np.nan result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) r.iloc[-3:] = np.nan result = r.asof(r.index) - expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.], - index=date_range('1/1/2000', '1/2/2000', freq='2h')) + expected = Series( + [0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.0], + index=date_range("1/1/2000", "1/2/2000", freq="2h"), + ) tm.assert_series_equal(result, expected) def test_periodindex(self): from pandas import period_range, PeriodIndex + # array or list or dates N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') + rng = period_range("1/1/1990", periods=N, freq="H") ts = Series(np.random.randn(N), index=rng) ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='37min') + dates = date_range("1/1/1990", periods=N * 3, freq="37min") result = ts.asof(dates) assert notna(result).all() @@ -101,7 +107,7 @@ def test_periodindex(self): lb = ts.index[14] ub = ts.index[30] - pix = PeriodIndex(result.index.values, freq='H') + pix = PeriodIndex(result.index.values, freq="H") mask = (pix >= lb) & (pix < ub) rs = result[mask] assert (rs == ts[lb]).all() @@ -128,10 +134,10 @@ def test_periodindex(self): def test_errors(self): - s = Series([1, 2, 3], - index=[Timestamp('20130101'), - Timestamp('20130103'), - Timestamp('20130102')]) + s = Series( + [1, 2, 3], + index=[Timestamp("20130101"), Timestamp("20130103"), Timestamp("20130102")], + ) # non-monotonic assert not s.index.is_monotonic @@ -140,10 +146,10 @@ def test_errors(self): # subset with Series N = 10 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") s = Series(np.random.randn(N), index=rng) with pytest.raises(ValueError): - s.asof(s.index[0], subset='foo') + s.asof(s.index[0], subset="foo") def test_all_nans(self): # GH 15713 @@ -154,19 +160,19 @@ def test_all_nans(self): # testing non-default indexes N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') + rng = date_range("1/1/1990", periods=N, freq="53s") - dates = date_range('1/1/1990', periods=N * 3, freq='25s') + dates = date_range("1/1/1990", periods=N * 3, freq="25s") result = Series(np.nan, index=rng).asof(dates) expected = Series(np.nan, index=dates) tm.assert_series_equal(result, expected) # testing scalar input - date = date_range('1/1/1990', periods=N * 3, freq='25s')[0] + date = date_range("1/1/1990", periods=N * 3, freq="25s")[0] result = Series(np.nan, index=rng).asof(date) assert isna(result) # test name is propagated - result = Series(np.nan, index=[1, 2, 3, 4], name='test').asof([4, 5]) - expected = Series(np.nan, index=[4, 5], name='test') + result = Series(np.nan, index=[1, 2, 3, 4], name="test").asof([4, 5]) + expected = Series(np.nan, index=[4, 5], name="test") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py index a18f1e1f444b7..18e75c3be5bcc 100644 --- a/pandas/tests/series/test_block_internals.py +++ b/pandas/tests/series/test_block_internals.py @@ -5,17 +5,16 @@ class TestSeriesBlockInternals: - def test_setitem_invalidates_datetime_index_freq(self): # GH#24096 
altering a datetime64tz Series inplace invalidates the # `freq` attribute on the underlying DatetimeIndex - dti = pd.date_range('20130101', periods=3, tz='US/Eastern') + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") ts = dti[1] ser = pd.Series(dti) assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base - assert dti.freq == 'D' + assert dti.freq == "D" ser.iloc[1] = pd.NaT assert ser._values.freq is None @@ -23,18 +22,17 @@ def test_setitem_invalidates_datetime_index_freq(self): assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base assert dti[1] == ts - assert dti.freq == 'D' + assert dti.freq == "D" def test_dt64tz_setitem_does_not_mutate_dti(self): # GH#21907, GH#24096 - dti = pd.date_range('2016-01-01', periods=10, tz='US/Pacific') + dti = pd.date_range("2016-01-01", periods=10, tz="US/Pacific") ts = dti[0] ser = pd.Series(dti) assert ser._values is not dti assert ser._values._data.base is not dti._data._data.base assert ser._data.blocks[0].values is not dti - assert (ser._data.blocks[0].values._data.base - is not dti._data._data.base) + assert ser._data.blocks[0].values._data.base is not dti._data._data.base ser[::3] = pd.NaT assert ser[0] is pd.NaT diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index d03c29ad79469..bf527bae297d9 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -11,7 +11,6 @@ class TestSeriesCombine: - def test_append(self, datetime_series, string_series, object_series): appendedSeries = string_series.append(object_series) for idx, value in appendedSeries.items(): @@ -27,8 +26,7 @@ def test_append(self, datetime_series, string_series, object_series): datetime_series.append(datetime_series, verify_integrity=True) def test_append_many(self, datetime_series): - pieces = [datetime_series[:5], datetime_series[5:10], - datetime_series[10:]] + pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] result = pieces[0].append(pieces[1:]) assert_series_equal(result, datetime_series) @@ -43,12 +41,14 @@ def test_append_duplicates(self): # the result must have RangeIndex exp = pd.Series([1, 2, 3, 4, 5, 6]) - tm.assert_series_equal(s1.append(s2, ignore_index=True), - exp, check_index_type=True) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), - exp, check_index_type=True) + tm.assert_series_equal( + s1.append(s2, ignore_index=True), exp, check_index_type=True + ) + tm.assert_series_equal( + pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True + ) - msg = 'Indexes have overlapping values:' + msg = "Indexes have overlapping values:" with pytest.raises(ValueError, match=msg): s1.append(s2, verify_integrity=True) with pytest.raises(ValueError, match=msg): @@ -94,54 +94,57 @@ def test_combine_first(self): combined = strings.combine_first(floats) tm.assert_series_equal(strings, combined.loc[index[::2]]) - tm.assert_series_equal(floats[1::2].astype(object), - combined.loc[index[1::2]]) + tm.assert_series_equal(floats[1::2].astype(object), combined.loc[index[1::2]]) # corner case - s = Series([1., 2, 3], index=[0, 1, 2]) + s = Series([1.0, 2, 3], index=[0, 1, 2]) result = s.combine_first(Series([], index=[])) - s.index = s.index.astype('O') + s.index = s.index.astype("O") assert_series_equal(s, result) def test_update(self): - s = Series([1.5, nan, 3., 4., nan]) - s2 = Series([nan, 3.5, nan, 5.]) + s = Series([1.5, nan, 3.0, 4.0, nan]) + s2 = 
Series([nan, 3.5, nan, 5.0]) s.update(s2) - expected = Series([1.5, 3.5, 3., 5., np.nan]) + expected = Series([1.5, 3.5, 3.0, 5.0, np.nan]) assert_series_equal(s, expected) # GH 3217 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) - df['c'] = np.nan + df["c"] = np.nan - df['c'].update(Series(['foo'], index=[0])) - expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]], - columns=['a', 'b', 'c']) + df["c"].update(Series(["foo"], index=[0])) + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) assert_frame_equal(df, expected) - @pytest.mark.parametrize('other, dtype, expected', [ - # other is int - ([61, 63], 'int32', pd.Series([10, 61, 12], dtype='int32')), - ([61, 63], 'int64', pd.Series([10, 61, 12])), - ([61, 63], float, pd.Series([10., 61., 12.])), - ([61, 63], object, pd.Series([10, 61, 12], dtype=object)), - # other is float, but can be cast to int - ([61., 63.], 'int32', pd.Series([10, 61, 12], dtype='int32')), - ([61., 63.], 'int64', pd.Series([10, 61, 12])), - ([61., 63.], float, pd.Series([10., 61., 12.])), - ([61., 63.], object, pd.Series([10, 61., 12], dtype=object)), - # others is float, cannot be cast to int - ([61.1, 63.1], 'int32', pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], 'int64', pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], float, pd.Series([10., 61.1, 12.])), - ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)), - # other is object, cannot be cast - ([(61,), (63,)], 'int32', pd.Series([10, (61,), 12])), - ([(61,), (63,)], 'int64', pd.Series([10, (61,), 12])), - ([(61,), (63,)], float, pd.Series([10., (61,), 12.])), - ([(61,), (63,)], object, pd.Series([10, (61,), 12])) - ]) + @pytest.mark.parametrize( + "other, dtype, expected", + [ + # other is int + ([61, 63], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61, 63], "int64", pd.Series([10, 61, 12])), + ([61, 63], float, pd.Series([10.0, 61.0, 12.0])), + ([61, 63], object, pd.Series([10, 61, 12], dtype=object)), + # other is float, but can be cast to int + ([61.0, 63.0], "int32", pd.Series([10, 61, 12], dtype="int32")), + ([61.0, 63.0], "int64", pd.Series([10, 61, 12])), + ([61.0, 63.0], float, pd.Series([10.0, 61.0, 12.0])), + ([61.0, 63.0], object, pd.Series([10, 61.0, 12], dtype=object)), + # others is float, cannot be cast to int + ([61.1, 63.1], "int32", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], "int64", pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], float, pd.Series([10.0, 61.1, 12.0])), + ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)), + # other is object, cannot be cast + ([(61,), (63,)], "int32", pd.Series([10, (61,), 12])), + ([(61,), (63,)], "int64", pd.Series([10, (61,), 12])), + ([(61,), (63,)], float, pd.Series([10.0, (61,), 12.0])), + ([(61,), (63,)], object, pd.Series([10, (61,), 12])), + ], + ) def test_update_dtypes(self, other, dtype, expected): s = Series([10, 11, 12], dtype=dtype) @@ -153,29 +156,30 @@ def test_update_dtypes(self, other, dtype, expected): def test_concat_empty_series_dtypes_roundtrips(self): # round-tripping with self & like self - dtypes = map(np.dtype, ['float64', 'int8', 'uint8', 'bool', 'm8[ns]', - 'M8[ns]']) + dtypes = map(np.dtype, ["float64", "int8", "uint8", "bool", "m8[ns]", "M8[ns]"]) for dtype in dtypes: assert pd.concat([Series(dtype=dtype)]).dtype == dtype - assert pd.concat([Series(dtype=dtype), - Series(dtype=dtype)]).dtype == dtype + assert pd.concat([Series(dtype=dtype), Series(dtype=dtype)]).dtype == dtype def int_result_type(dtype, dtype2): typs = {dtype.kind, dtype2.kind} 
- if not len(typs - {'i', 'u', 'b'}) and (dtype.kind == 'i' or - dtype2.kind == 'i'): - return 'i' - elif not len(typs - {'u', 'b'}) and (dtype.kind == 'u' or - dtype2.kind == 'u'): - return 'u' + if not len(typs - {"i", "u", "b"}) and ( + dtype.kind == "i" or dtype2.kind == "i" + ): + return "i" + elif not len(typs - {"u", "b"}) and ( + dtype.kind == "u" or dtype2.kind == "u" + ): + return "u" return None def float_result_type(dtype, dtype2): typs = {dtype.kind, dtype2.kind} - if not len(typs - {'f', 'i', 'u'}) and (dtype.kind == 'f' or - dtype2.kind == 'f'): - return 'f' + if not len(typs - {"f", "i", "u"}) and ( + dtype.kind == "f" or dtype2.kind == "f" + ): + return "f" return None def get_result_type(dtype, dtype2): @@ -185,7 +189,7 @@ def get_result_type(dtype, dtype2): result = int_result_type(dtype, dtype2) if result is not None: return result - return 'O' + return "O" for dtype in dtypes: for dtype2 in dtypes: @@ -193,22 +197,25 @@ def get_result_type(dtype, dtype2): continue expected = get_result_type(dtype, dtype2) - result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2) - ]).dtype + result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)]).dtype assert result.kind == expected def test_combine_first_dt_tz_values(self, tz_naive_fixture): - ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'], - tz=tz_naive_fixture), - name='ser1') - ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'], - tz=tz_naive_fixture), - index=[2, 3, 4], name='ser2') + ser1 = pd.Series( + pd.DatetimeIndex(["20150101", "20150102", "20150103"], tz=tz_naive_fixture), + name="ser1", + ) + ser2 = pd.Series( + pd.DatetimeIndex(["20160514", "20160515", "20160516"], tz=tz_naive_fixture), + index=[2, 3, 4], + name="ser2", + ) result = ser1.combine_first(ser2) - exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103', - '20160515', '20160516'], - tz=tz_naive_fixture) - exp = pd.Series(exp_vals, name='ser1') + exp_vals = pd.DatetimeIndex( + ["20150101", "20150102", "20150103", "20160515", "20160516"], + tz=tz_naive_fixture, + ) + exp = pd.Series(exp_vals, name="ser1") assert_series_equal(exp, result) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @@ -216,82 +223,108 @@ def test_combine_first_dt_tz_values(self, tz_naive_fixture): def test_concat_empty_series_dtypes(self): # booleans - assert pd.concat([Series(dtype=np.bool_), - Series(dtype=np.int32)]).dtype == np.int32 - assert pd.concat([Series(dtype=np.bool_), - Series(dtype=np.float32)]).dtype == np.object_ + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.int32)]).dtype + == np.int32 + ) + assert ( + pd.concat([Series(dtype=np.bool_), Series(dtype=np.float32)]).dtype + == np.object_ + ) # datetime-like - assert pd.concat([Series(dtype='m8[ns]'), - Series(dtype=np.bool)]).dtype == np.object_ - assert pd.concat([Series(dtype='m8[ns]'), - Series(dtype=np.int64)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.bool)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.int64)]).dtype == np.object_ - assert pd.concat([Series(dtype='M8[ns]'), - Series(dtype=np.bool_), - Series(dtype=np.int64)]).dtype == np.object_ + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="m8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.bool)]).dtype + == np.object_ + ) + assert ( + 
pd.concat([Series(dtype="M8[ns]"), Series(dtype=np.int64)]).dtype + == np.object_ + ) + assert ( + pd.concat( + [Series(dtype="M8[ns]"), Series(dtype=np.bool_), Series(dtype=np.int64)] + ).dtype + == np.object_ + ) # categorical - assert pd.concat([Series(dtype='category'), - Series(dtype='category')]).dtype == 'category' + assert ( + pd.concat([Series(dtype="category"), Series(dtype="category")]).dtype + == "category" + ) # GH 18515 - assert pd.concat([Series(np.array([]), dtype='category'), - Series(dtype='float64')]).dtype == 'float64' - assert pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype == 'object' + assert ( + pd.concat( + [Series(np.array([]), dtype="category"), Series(dtype="float64")] + ).dtype + == "float64" + ) + assert ( + pd.concat([Series(dtype="category"), Series(dtype="object")]).dtype + == "object" + ) # sparse # TODO: move? - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='float64').to_sparse()]) - assert result.dtype == 'Sparse[float64]' + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="float64").to_sparse()] + ) + assert result.dtype == "Sparse[float64]" # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'float64:sparse' + assert result.ftype == "float64:sparse" - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='float64')]) + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="float64")] + ) # TODO: release-note: concat sparse dtype expected = pd.core.sparse.api.SparseDtype(np.float64) assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'float64:sparse' + assert result.ftype == "float64:sparse" - result = pd.concat([Series(dtype='float64').to_sparse(), - Series(dtype='object')]) + result = pd.concat( + [Series(dtype="float64").to_sparse(), Series(dtype="object")] + ) # TODO: release-note: concat sparse dtype - expected = pd.core.sparse.api.SparseDtype('object') + expected = pd.core.sparse.api.SparseDtype("object") assert result.dtype == expected # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert result.ftype == 'object:sparse' + assert result.ftype == "object:sparse" def test_combine_first_dt64(self): from pandas.core.tools.datetimes import to_datetime + s0 = to_datetime(Series(["2010", np.NaN])) s1 = to_datetime(Series([np.NaN, "2011"])) rs = s0.combine_first(s1) - xp = to_datetime(Series(['2010', '2011'])) + xp = to_datetime(Series(["2010", "2011"])) assert_series_equal(rs, xp) s0 = to_datetime(Series(["2010", np.NaN])) s1 = Series([np.NaN, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), '2011']) + xp = Series([datetime(2010, 1, 1), "2011"]) assert_series_equal(rs, xp) class TestTimeseries: - def test_append_concat(self): - rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + rng = date_range("5/8/2012 1:45", periods=10, freq="5T") ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) @@ -311,19 +344,16 @@ def test_append_concat(self): # different index names rng1 = rng.copy() rng2 = rng.copy() - rng1.name = 'foo' - rng2.name = 'bar' - assert rng1.append(rng1).name == 'foo' + rng1.name = "foo" + rng2.name = "bar" + assert rng1.append(rng1).name == "foo" assert rng1.append(rng2).name is None def test_append_concat_tz(self): # see gh-2938 - rng = date_range('5/8/2012 1:45', 
periods=10, freq='5T', - tz='US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='US/Eastern') + rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") + rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") + rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) @@ -341,12 +371,15 @@ def test_append_concat_tz_explicit_pytz(self): # see gh-2938 from pytz import timezone as timezone - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz=timezone('US/Eastern')) - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz=timezone('US/Eastern')) + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") + ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) @@ -362,12 +395,15 @@ def test_append_concat_tz_explicit_pytz(self): def test_append_concat_tz_dateutil(self): # see gh-2938 - rng = date_range('5/8/2012 1:45', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T', - tz='dateutil/US/Eastern') - rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T', - tz='dateutil/US/Eastern') + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" + ) ts = Series(np.random.randn(len(rng)), rng) df = DataFrame(np.random.randn(len(rng), 4), index=rng) ts2 = Series(np.random.randn(len(rng2)), rng2) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 663d5ae505303..2f09d777e719c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -10,52 +10,63 @@ from pandas._libs.tslib import iNaT from pandas.compat import PY36 -from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64tz_dtype) +from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype, ordered_sentinel import pandas as pd from pandas import ( - Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, - Timestamp, date_range, isna, period_range, timedelta_range) + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, + period_range, + timedelta_range, +) from pandas.core.arrays import period_array import pandas.util.testing as tm from pandas.util.testing import assert_series_equal class TestSeriesConstructors: - - @pytest.mark.parametrize('constructor,check_index_type', [ - # NOTE: some overlap with test_constructor_empty but that test does not - # test for None or an empty generator. - # test_constructor_pass_none tests None but only with the index also - # passed. 
- (lambda: Series(), True), - (lambda: Series(None), True), - (lambda: Series({}), True), - (lambda: Series(()), False), # creates a RangeIndex - (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series((x for x in [])), False), # creates a RangeIndex - (lambda: Series(data=None), True), - (lambda: Series(data={}), True), - (lambda: Series(data=()), False), # creates a RangeIndex - (lambda: Series(data=[]), False), # creates a RangeIndex - (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex - ]) + @pytest.mark.parametrize( + "constructor,check_index_type", + [ + # NOTE: some overlap with test_constructor_empty but that test does not + # test for None or an empty generator. + # test_constructor_pass_none tests None but only with the index also + # passed. + (lambda: Series(), True), + (lambda: Series(None), True), + (lambda: Series({}), True), + (lambda: Series(()), False), # creates a RangeIndex + (lambda: Series([]), False), # creates a RangeIndex + (lambda: Series((x for x in [])), False), # creates a RangeIndex + (lambda: Series(data=None), True), + (lambda: Series(data={}), True), + (lambda: Series(data=()), False), # creates a RangeIndex + (lambda: Series(data=[]), False), # creates a RangeIndex + (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex + ], + ) def test_empty_constructor(self, constructor, check_index_type): expected = Series() result = constructor() assert len(result.index) == 0 - tm.assert_series_equal(result, expected, - check_index_type=check_index_type) + tm.assert_series_equal(result, expected, check_index_type=check_index_type) def test_invalid_dtype(self): # GH15520 - msg = 'not understood' - invalid_list = [pd.Timestamp, 'pd.Timestamp', list] + msg = "not understood" + invalid_list = [pd.Timestamp, "pd.Timestamp", list] for dtype in invalid_list: with pytest.raises(TypeError, match=msg): - Series([], name='time', dtype=dtype) + Series([], name="time", dtype=dtype) def test_scalar_conversion(self): @@ -64,8 +75,8 @@ def test_scalar_conversion(self): assert not isinstance(scalar, float) # Coercion - assert float(Series([1.])) == 1.0 - assert int(Series([1.])) == 1 + assert float(Series([1.0])) == 1.0 + assert int(Series([1.0])) == 1 def test_constructor(self, datetime_series): empty_series = Series() @@ -81,7 +92,7 @@ def test_constructor(self, datetime_series): assert id(datetime_series.index) == id(derived.index) # Mixed type Series - mixed = Series(['hello', np.NaN], index=[0, 1]) + mixed = Series(["hello", np.NaN], index=[0, 1]) assert mixed.dtype == np.object_ assert mixed[1] is np.NaN @@ -92,9 +103,9 @@ def test_constructor(self, datetime_series): with pytest.raises(Exception, match="Data must be 1-dimensional"): Series(np.random.randn(3, 3), index=np.arange(3)) - mixed.name = 'Series' + mixed.name = "Series" rs = Series(mixed).name - xp = 'Series' + xp = "Series" assert rs == xp # raise on MultiIndex GH4187 @@ -103,7 +114,7 @@ def test_constructor(self, datetime_series): with pytest.raises(NotImplementedError, match=msg): Series(m) - @pytest.mark.parametrize('input_class', [list, dict, OrderedDict]) + @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): empty = Series() empty2 = Series(input_class()) @@ -113,13 +124,13 @@ def test_constructor_empty(self, input_class): assert_series_equal(empty, empty2, check_index_type=False) # With explicit dtype: - empty = Series(dtype='float64') - empty2 = Series(input_class(), dtype='float64') + empty = 
Series(dtype="float64") + empty2 = Series(input_class(), dtype="float64") assert_series_equal(empty, empty2, check_index_type=False) # GH 18515 : with dtype=category: - empty = Series(dtype='category') - empty2 = Series(input_class(), dtype='category') + empty = Series(dtype="category") + empty2 = Series(input_class(), dtype="category") assert_series_equal(empty, empty2, check_index_type=False) if input_class is not list: @@ -130,26 +141,26 @@ def test_constructor_empty(self, input_class): # With index and dtype float64: empty = Series(np.nan, index=range(10)) - empty2 = Series(input_class(), index=range(10), dtype='float64') + empty2 = Series(input_class(), index=range(10), dtype="float64") assert_series_equal(empty, empty2) # GH 19853 : with empty string, index and dtype str - empty = Series('', dtype=str, index=range(3)) - empty2 = Series('', index=range(3)) + empty = Series("", dtype=str, index=range(3)) + empty2 = Series("", index=range(3)) assert_series_equal(empty, empty2) - @pytest.mark.parametrize('input_arg', [np.nan, float('nan')]) + @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) def test_constructor_nan(self, input_arg): - empty = Series(dtype='float64', index=range(10)) + empty = Series(dtype="float64", index=range(10)) empty2 = Series(input_arg, index=range(10)) assert_series_equal(empty, empty2, check_index_type=False) - @pytest.mark.parametrize('dtype', [ - 'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object', - 'datetime64[ns, UTC]', - ]) - @pytest.mark.parametrize('index', [None, pd.Index([])]) + @pytest.mark.parametrize( + "dtype", + ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"], + ) + @pytest.mark.parametrize("index", [None, pd.Index([])]) def test_constructor_dtype_only(self, dtype, index): # GH-20865 result = pd.Series(dtype=dtype, index=index) @@ -157,15 +168,15 @@ def test_constructor_dtype_only(self, dtype, index): assert len(result) == 0 def test_constructor_no_data_index_order(self): - result = pd.Series(index=['b', 'a', 'c']) - assert result.index.tolist() == ['b', 'a', 'c'] + result = pd.Series(index=["b", "a", "c"]) + assert result.index.tolist() == ["b", "a", "c"] def test_constructor_no_data_string_type(self): # GH 22477 result = pd.Series(index=[1], dtype=str) assert np.isnan(result.iloc[0]) - @pytest.mark.parametrize('item', ['entry', 'ѐ', 13]) + @pytest.mark.parametrize("item", ["entry", "ѐ", 13]) def test_constructor_string_element_string_type(self, item): # GH 22477 result = pd.Series(item, index=[1], dtype=str) @@ -173,17 +184,17 @@ def test_constructor_string_element_string_type(self, item): def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 - ser = Series(['x', None], dtype=string_dtype) + ser = Series(["x", None], dtype=string_dtype) result = ser.isna() expected = Series([False, True]) tm.assert_series_equal(result, expected) assert ser.iloc[1] is None - ser = Series(['x', np.nan], dtype=string_dtype) + ser = Series(["x", np.nan], dtype=string_dtype) assert np.isnan(ser.iloc[1]) def test_constructor_series(self): - index1 = ['d', 'b', 'a', 'c'] + index1 = ["d", "b", "a", "c"] index2 = sorted(index1) s1 = Series([4, 7, -5, 3], index=index1) s2 = Series(s1, index=index2) @@ -197,20 +208,20 @@ def __iter__(self): for i in range(10): yield i - expected = Series(list(range(10)), dtype='int64') - result = Series(Iter(), dtype='int64') + expected = Series(list(range(10)), dtype="int64") + result = Series(Iter(), dtype="int64") 
assert_series_equal(result, expected) def test_constructor_sequence(self): # GH 21987 - expected = Series(list(range(10)), dtype='int64') - result = Series(range(10), dtype='int64') + expected = Series(list(range(10)), dtype="int64") + result = Series(range(10), dtype="int64") assert_series_equal(result, expected) def test_constructor_single_str(self): # GH 21987 - expected = Series(['abc']) - result = Series('abc') + expected = Series(["abc"]) + result = Series("abc") assert_series_equal(result, expected) def test_constructor_list_like(self): @@ -218,27 +229,28 @@ def test_constructor_list_like(self): # make sure that we are coercing different # list-likes to standard dtypes and not # platform specific - expected = Series([1, 2, 3], dtype='int64') - for obj in [[1, 2, 3], (1, 2, 3), - np.array([1, 2, 3], dtype='int64')]: + expected = Series([1, 2, 3], dtype="int64") + for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]: result = Series(obj, index=[0, 1, 2]) assert_series_equal(result, expected) - @pytest.mark.parametrize('dtype', ['bool', 'int32', 'int64', 'float64']) + @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"]) def test_constructor_index_dtype(self, dtype): # GH 17088 s = Series(Index([0, 2, 4]), dtype=dtype) assert s.dtype == dtype - @pytest.mark.parametrize('input_vals', [ - ([1, 2]), - (['1', '2']), - (list(pd.date_range('1/1/2011', periods=2, freq='H'))), - (list(pd.date_range('1/1/2011', periods=2, freq='H', - tz='US/Eastern'))), - ([pd.Interval(left=0, right=5)]), - ]) + @pytest.mark.parametrize( + "input_vals", + [ + ([1, 2]), + (["1", "2"]), + (list(pd.date_range("1/1/2011", periods=2, freq="H"))), + (list(pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), + ([pd.Interval(left=0, right=5)]), + ], + ) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings @@ -249,7 +261,7 @@ def test_constructor_list_str(self, input_vals, string_dtype): def test_constructor_list_str_na(self, string_dtype): result = Series([1.0, 2.0, np.nan], dtype=string_dtype) - expected = Series(['1.0', '2.0', np.nan], dtype=object) + expected = Series(["1.0", "2.0", np.nan], dtype=object) assert_series_equal(result, expected) assert np.isnan(result[2]) @@ -279,37 +291,35 @@ def test_constructor_map(self): assert_series_equal(result, exp) def test_constructor_categorical(self): - cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], - fastpath=True) + cat = pd.Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True) res = Series(cat) tm.assert_categorical_equal(res.values, cat) # can cast to a new dtype - result = Series(pd.Categorical([1, 2, 3]), - dtype='int64') - expected = pd.Series([1, 2, 3], dtype='int64') + result = Series(pd.Categorical([1, 2, 3]), dtype="int64") + expected = pd.Series([1, 2, 3], dtype="int64") tm.assert_series_equal(result, expected) # GH12574 - cat = Series(pd.Categorical([1, 2, 3]), dtype='category') + cat = Series(pd.Categorical([1, 2, 3]), dtype="category") assert is_categorical_dtype(cat) assert is_categorical_dtype(cat.dtype) - s = Series([1, 2, 3], dtype='category') + s = Series([1, 2, 3], dtype="category") assert is_categorical_dtype(s) assert is_categorical_dtype(s.dtype) def test_constructor_categorical_with_coercion(self): - factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) # test basic creation / coercion of categoricals - s = Series(factor, 
name='A') - assert s.dtype == 'category' + s = Series(factor, name="A") + assert s.dtype == "category" assert len(s) == len(factor) str(s.values) str(s) # in a frame - df = DataFrame({'A': factor}) - result = df['A'] + df = DataFrame({"A": factor}) + result = df["A"] tm.assert_series_equal(result, s) result = df.iloc[:, 0] tm.assert_series_equal(result, s) @@ -317,30 +327,30 @@ def test_constructor_categorical_with_coercion(self): str(df.values) str(df) - df = DataFrame({'A': s}) - result = df['A'] + df = DataFrame({"A": s}) + result = df["A"] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) # multiples - df = DataFrame({'A': s, 'B': s, 'C': 1}) - result1 = df['A'] - result2 = df['B'] + df = DataFrame({"A": s, "B": s, "C": 1}) + result1 = df["A"] + result2 = df["B"] tm.assert_series_equal(result1, s) tm.assert_series_equal(result2, s, check_names=False) - assert result2.name == 'B' + assert result2.name == "B" assert len(df) == len(factor) str(df.values) str(df) # GH8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name - ) # doing this breaks transform + x = DataFrame( + [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], + columns=["person_id", "person_name"], + ) + x["person_name"] = Categorical(x.person_name) # doing this breaks transform expected = x.iloc[0].person_name result = x.person_name.iloc[0] @@ -353,47 +363,50 @@ def test_constructor_categorical_with_coercion(self): assert result == expected def test_constructor_categorical_dtype(self): - result = pd.Series(['a', 'b'], - dtype=CategoricalDtype(['a', 'b', 'c'], - ordered=True)) + result = pd.Series( + ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True) + ) assert is_categorical_dtype(result) is True - tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) + tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) assert result.cat.ordered - result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a'])) + result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) assert is_categorical_dtype(result) - tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) + tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) assert result.cat.ordered is False # GH 19565 - Check broadcasting of scalar with Categorical dtype - result = Series('a', index=[0, 1], - dtype=CategoricalDtype(['a', 'b'], ordered=True)) - expected = Series(['a', 'a'], index=[0, 1], - dtype=CategoricalDtype(['a', 'b'], ordered=True)) + result = Series( + "a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) + expected = Series( + ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True) + ) tm.assert_series_equal(result, expected, check_categorical=True) def test_constructor_categorical_string(self): # GH 26336: the string 'category' maintains existing CategoricalDtype - cdt = CategoricalDtype(categories=list('dabc'), ordered=True) - expected = Series(list('abcabc'), dtype=cdt) + cdt = CategoricalDtype(categories=list("dabc"), ordered=True) + expected = Series(list("abcabc"), dtype=cdt) # Series(Categorical, dtype='category') keeps existing dtype - cat = Categorical(list('abcabc'), dtype=cdt) - result = Series(cat, dtype='category') + cat = Categorical(list("abcabc"), dtype=cdt) + result = Series(cat, dtype="category") tm.assert_series_equal(result, expected) # Series(Series[Categorical], 
dtype='category') keeps existing dtype - result = Series(result, dtype='category') + result = Series(result, dtype="category") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_categorical_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) - cat = Categorical(list('abcdaba'), dtype=cdt1) + cat = Categorical(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning, check_stacklevel=False): Series(cat, dtype=cdt2) @@ -436,21 +449,19 @@ def test_categorical_sideeffects_free(self): tm.assert_numpy_array_equal(cat.__array__(), exp_s2) def test_unordered_compare_equal(self): - left = pd.Series(['a', 'b', 'c'], - dtype=CategoricalDtype(['a', 'b'])) - right = pd.Series(pd.Categorical(['a', 'b', np.nan], - categories=['a', 'b'])) + left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) + right = pd.Series(pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) tm.assert_series_equal(left, right) def test_constructor_maskedarray(self): - data = ma.masked_all((3, ), dtype=float) + data = ma.masked_all((3,), dtype=float) result = Series(data) expected = Series([nan, nan, nan]) assert_series_equal(result, expected) data[0] = 0.0 data[2] = 2.0 - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([0.0, nan, 2.0], index=index) assert_series_equal(result, expected) @@ -460,14 +471,14 @@ def test_constructor_maskedarray(self): expected = Series([0.0, 1.0, 2.0], index=index) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype=int) + data = ma.masked_all((3,), dtype=int) result = Series(data) expected = Series([nan, nan, nan], dtype=float) assert_series_equal(result, expected) data[0] = 0 data[2] = 2 - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([0, nan, 2], index=index, dtype=float) assert_series_equal(result, expected) @@ -477,14 +488,14 @@ def test_constructor_maskedarray(self): expected = Series([0, 1, 2], index=index, dtype=int) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype=bool) + data = ma.masked_all((3,), dtype=bool) result = Series(data) expected = Series([nan, nan, nan], dtype=object) assert_series_equal(result, expected) data[0] = True data[2] = False - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([True, nan, False], index=index, dtype=object) assert_series_equal(result, expected) @@ -494,34 +505,40 @@ def test_constructor_maskedarray(self): expected = Series([True, True, False], index=index, dtype=bool) assert_series_equal(result, expected) - data = ma.masked_all((3, ), dtype='M8[ns]') + data = ma.masked_all((3,), dtype="M8[ns]") result = Series(data) - expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]') + expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]") assert_series_equal(result, expected) data[0] = datetime(2001, 1, 1) data[2] = datetime(2001, 1, 3) - index = ['a', 'b', 'c'] + index = ["a", "b", "c"] result = Series(data, index=index) - 
expected = Series([datetime(2001, 1, 1), iNaT, - datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + expected = Series( + [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) assert_series_equal(result, expected) data[1] = datetime(2001, 1, 2) result = Series(data, index=index) - expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2), - datetime(2001, 1, 3)], index=index, dtype='M8[ns]') + expected = Series( + [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)], + index=index, + dtype="M8[ns]", + ) assert_series_equal(result, expected) def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from GH24574 - data = ma.masked_all((3, ), dtype=float).harden_mask() + data = ma.masked_all((3,), dtype=float).harden_mask() result = pd.Series(data) expected = pd.Series([nan, nan, nan]) tm.assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): - rng = date_range('20090415', '20090519', freq='B') + rng = date_range("20090415", "20090519", freq="B") data = {k: 1 for k in rng} result = Series(data, index=rng) @@ -531,17 +548,22 @@ def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) - @pytest.mark.parametrize('input', [[1, 2, 3], - (1, 2, 3), - list(range(3)), - pd.Categorical(['a', 'b', 'a']), - (i for i in range(3)), - map(lambda x: x, range(3))]) + @pytest.mark.parametrize( + "input", + [ + [1, 2, 3], + (1, 2, 3), + list(range(3)), + pd.Categorical(["a", "b", "a"]), + (i for i in range(3)), + map(lambda x: x, range(3)), + ], + ) def test_constructor_index_mismatch(self, input): # GH 19342 # test that construction of a Series with an index of different length # raises an error - msg = 'Length of passed values is 3, index implies 4' + msg = "Length of passed values is 3, index implies 4" with pytest.raises(ValueError, match=msg): Series(input, index=np.arange(4)) @@ -549,8 +571,8 @@ def test_constructor_numpy_scalar(self): # GH 19342 # construction with a numpy scalar # should not raise - result = Series(np.array(100), index=np.arange(4), dtype='int64') - expected = Series(100, index=np.arange(4), dtype='int64') + result = Series(np.array(100), index=np.arange(4), dtype="int64") + expected = Series(100, index=np.arange(4), dtype="int64") tm.assert_series_equal(result, expected) def test_constructor_broadcast_list(self): @@ -559,7 +581,7 @@ def test_constructor_broadcast_list(self): # should raise msg = "Length of passed values is 1, index implies 3" with pytest.raises(ValueError, match=msg): - Series(['foo'], index=['a', 'b', 'c']) + Series(["foo"], index=["a", "b", "c"]) def test_constructor_corner(self): df = tm.makeTimeDataFrame() @@ -568,16 +590,16 @@ def test_constructor_corner(self): assert isinstance(s, Series) def test_constructor_sanitize(self): - s = Series(np.array([1., 1., 8.]), dtype='i8') - assert s.dtype == np.dtype('i8') + s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") + assert s.dtype == np.dtype("i8") - s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') - assert s.dtype == np.dtype('f8') + s = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") + assert s.dtype == np.dtype("f8") def test_constructor_copy(self): # GH15125 # test dtype parameter has no side effects on copy=True - for data in [[1.], np.array([1.])]: + for data in [[1.0], np.array([1.0])]: x = Series(data) y = pd.Series(x, copy=True, dtype=float) @@ -585,22 +607,24 @@ def test_constructor_copy(self): 
tm.assert_series_equal(x, y) # changes to origin of copy does not affect the copy - x[0] = 2. + x[0] = 2.0 assert not x.equals(y) - assert x[0] == 2. - assert y[0] == 1. + assert x[0] == 2.0 + assert y[0] == 1.0 @pytest.mark.parametrize( "index", [ - pd.date_range('20170101', periods=3, tz='US/Eastern'), - pd.date_range('20170101', periods=3), - pd.timedelta_range('1 day', periods=3), - pd.period_range('2012Q1', periods=3, freq='Q'), - pd.Index(list('abc')), + pd.date_range("20170101", periods=3, tz="US/Eastern"), + pd.date_range("20170101", periods=3), + pd.timedelta_range("1 day", periods=3), + pd.period_range("2012Q1", periods=3, freq="Q"), + pd.Index(list("abc")), pd.Int64Index([1, 2, 3]), - pd.RangeIndex(0, 3)], - ids=lambda x: type(x).__name__) + pd.RangeIndex(0, 3), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input @@ -630,7 +654,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) - assert exp.dtype == 'datetime64[ns]' + assert exp.dtype == "datetime64[ns]" tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) @@ -647,7 +671,7 @@ def test_constructor_cast(self): def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 - msg = 'Trying to coerce negative values to unsigned integers' + msg = "Trying to coerce negative values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) @@ -675,38 +699,36 @@ def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified - s = Series([Timestamp('20130101'), 'NOV'], dtype=object) - assert s.iloc[0] == Timestamp('20130101') - assert s.iloc[1] == 'NOV' + s = Series([Timestamp("20130101"), "NOV"], dtype=object) + assert s.iloc[0] == Timestamp("20130101") + assert s.iloc[1] == "NOV" assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed - belly = '216 3T19'.split() - wing1 = '2T15 4H19'.split() - wing2 = '416 4T20'.split() - mat = pd.to_datetime('2016-01-22 2019-09-07'.split()) - df = pd.DataFrame( - {'wing1': wing1, - 'wing2': wing2, - 'mat': mat}, index=belly) - - result = df.loc['3T19'] + belly = "216 3T19".split() + wing1 = "2T15 4H19".split() + wing2 = "416 4T20".split() + mat = pd.to_datetime("2016-01-22 2019-09-07".split()) + df = pd.DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly) + + result = df.loc["3T19"] assert result.dtype == object - result = df.loc['216'] + result = df.loc["216"] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 - for arr in [np.array([None, None, None, None, - datetime.now(), None]), - np.array([None, None, datetime.now(), None])]: + for arr in [ + np.array([None, None, None, None, datetime.now(), None]), + np.array([None, None, datetime.now(), None]), + ]: result = Series(arr) - assert result.dtype == 'M8[ns]' + assert result.dtype == "M8[ns]" def test_constructor_dtype_datetime64(self): - s = Series(iNaT, dtype='M8[ns]', index=range(5)) + s = Series(iNaT, dtype="M8[ns]", index=range(5)) assert isna(s).all() # in theory this should be all nulls, but since @@ -714,16 +736,16 @@ def test_constructor_dtype_datetime64(self): s = Series(iNaT, index=range(5)) assert not isna(s).all() - s = Series(nan, dtype='M8[ns]', 
index=range(5)) + s = Series(nan, dtype="M8[ns]", index=range(5)) assert isna(s).all() - s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]') + s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]") assert isna(s[1]) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" - s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') + s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype="M8[ns]") assert isna(s[1]) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" # GH3416 dates = [ @@ -733,43 +755,38 @@ def test_constructor_dtype_datetime64(self): ] s = Series(dates) - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" s.iloc[0] = np.nan - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" # GH3414 related - expected = Series([ - datetime(2013, 1, 1), - datetime(2013, 1, 2), - datetime(2013, 1, 3), - ], dtype='datetime64[ns]') + expected = Series( + [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)], + dtype="datetime64[ns]", + ) - result = Series( - Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]') + result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) - result = Series(dates, dtype='datetime64[ns]') + result = Series(dates, dtype="datetime64[ns]") tm.assert_series_equal(result, expected) - expected = Series([ - pd.NaT, - datetime(2013, 1, 2), - datetime(2013, 1, 3), - ], dtype='datetime64[ns]') - result = Series([np.nan] + dates[1:], dtype='datetime64[ns]') + expected = Series( + [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]" + ) + result = Series([np.nan] + dates[1:], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) - dts = Series(dates, dtype='datetime64[ns]') + dts = Series(dates, dtype="datetime64[ns]") # valid astype - dts.astype('int64') + dts.astype("int64") # invalid casting - msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[int32\]") + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): - dts.astype('int32') + dts.astype("int32") # ints are ok # we test with np.int64 to get similar results on @@ -786,101 +803,102 @@ def test_constructor_dtype_datetime64(self): assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types - result = Series([Timestamp('20130101'), 1], index=['a', 'b']) - assert result['a'] == Timestamp('20130101') - assert result['b'] == 1 + result = Series([Timestamp("20130101"), 1], index=["a", "b"]) + assert result["a"] == Timestamp("20130101") + assert result["b"] == 1 # GH6529 # coerce datetime64 non-ns properly - dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') - values2 = dates.view(np.ndarray).astype('datetime64[ns]') + dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") + values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) - for dtype in ['s', 'D', 'ms', 'us', 'ns']: - values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) - for dtype in ['s', 'D', 'ms', 'us', 'ns']: - values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + for dtype in ["s", "D", "ms", "us", "ns"]: + values1 = 
dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, index=dates, dtype=object) assert_series_equal(result, expected) # leave datetime.date alone - dates2 = np.array([d.date() for d in dates.to_pydatetime()], - dtype=object) + dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime - s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' - s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) - assert s.dtype == 'datetime64[ns]' + s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" + s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"]) + assert s.dtype == "datetime64[ns]" # tz-aware (UTC and other tz's) # GH 8411 - dr = date_range('20130101', periods=3) + dr = date_range("20130101", periods=3) assert Series(dr).iloc[0].tz is None - dr = date_range('20130101', periods=3, tz='UTC') - assert str(Series(dr).iloc[0].tz) == 'UTC' - dr = date_range('20130101', periods=3, tz='US/Eastern') - assert str(Series(dr).iloc[0].tz) == 'US/Eastern' + dr = date_range("20130101", periods=3, tz="UTC") + assert str(Series(dr).iloc[0].tz) == "UTC" + dr = date_range("20130101", periods=3, tz="US/Eastern") + assert str(Series(dr).iloc[0].tz) == "US/Eastern" # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is pd.NaT - assert 'NaT' in str(s) + assert "NaT" in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is pd.NaT - assert 'NaT' in str(s) + assert "NaT" in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert s.dtype == 'object' + assert s.dtype == "object" assert s[2] is np.nan - assert 'NaN' in str(s) + assert "NaN" in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz - dr = date_range('20130101', periods=3, tz='US/Eastern') + dr = date_range("20130101", periods=3, tz="US/Eastern") s = Series(dr) - assert s.dtype.name == 'datetime64[ns, US/Eastern]' - assert s.dtype == 'datetime64[ns, US/Eastern]' + assert s.dtype.name == "datetime64[ns, US/Eastern]" + assert s.dtype == "datetime64[ns, US/Eastern]" assert is_datetime64tz_dtype(s.dtype) - assert 'datetime64[ns, US/Eastern]' in str(s) + assert "datetime64[ns, US/Eastern]" in str(s) # export result = s.values assert isinstance(result, np.ndarray) - assert result.dtype == 'datetime64[ns]' + assert result.dtype == "datetime64[ns]" exp = pd.DatetimeIndex(result) - exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) + exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] - assert result == Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', freq='D') + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) 
result = s[0] - assert result == Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', freq='D') + assert result == Timestamp( + "2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D" + ) result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) @@ -893,39 +911,47 @@ def test_constructor_with_datetime_tz(self): assert_series_equal(result, s) # short str - assert 'datetime64[ns, US/Eastern]' in str(s) + assert "datetime64[ns, US/Eastern]" in str(s) # formatting with NaT result = s.shift() - assert 'datetime64[ns, US/Eastern]' in str(result) - assert 'NaT' in str(result) + assert "datetime64[ns, US/Eastern]" in str(result) + assert "NaT" in str(result) # long str - t = Series(date_range('20130101', periods=1000, tz='US/Eastern')) - assert 'datetime64[ns, US/Eastern]' in str(t) + t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) + assert "datetime64[ns, US/Eastern]" in str(t) - result = pd.DatetimeIndex(s, freq='infer') + result = pd.DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) # inference - s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) - assert s.dtype == 'datetime64[ns, US/Pacific]' - assert lib.infer_dtype(s, skipna=True) == 'datetime64' - - s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), - pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) - assert s.dtype == 'object' - assert lib.infer_dtype(s, skipna=True) == 'datetime' + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), + ] + ) + assert s.dtype == "datetime64[ns, US/Pacific]" + assert lib.infer_dtype(s, skipna=True) == "datetime64" + + s = Series( + [ + pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), + pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), + ] + ) + assert s.dtype == "object" + assert lib.infer_dtype(s, skipna=True) == "datetime" # with all NaT - s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') - expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) + s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") + expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 @@ -937,17 +963,16 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('arg', - ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) + @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string - result = Series([arg], dtype='datetime64[ns, CET]') - expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') + result = Series([arg], dtype="datetime64[ns, CET]") + expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals - index = IntervalIndex.from_breaks(np.arange(3), closed='right') + index = 
IntervalIndex.from_breaks(np.arange(3), closed="right") result = Series(index) repr(result) str(result) @@ -960,30 +985,30 @@ def test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 - s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + s = Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) - result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) + result = Series(s.dt.tz_convert("UTC"), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_infer_period(self): - data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None] + data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] result = pd.Series(data) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" data = np.asarray(data, dtype=object) tm.assert_series_equal(result, expected) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" def test_constructor_period_incompatible_frequency(self): - data = [pd.Period('2000', 'D'), pd.Period('2001', 'A')] + data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) assert result.dtype == object assert result.tolist() == data @@ -992,16 +1017,16 @@ def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series - pi = period_range('20130101', periods=5, freq='D') + pi = period_range("20130101", periods=5, freq="D") s = Series(pi) - assert s.dtype == 'Period[D]' + assert s.dtype == "Period[D]" expected = Series(pi.astype(object)) assert_series_equal(s, expected) def test_constructor_dict(self): - d = {'a': 0., 'b': 1., 'c': 2.} - result = Series(d, index=['b', 'c', 'd', 'a']) - expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) + d = {"a": 0.0, "b": 1.0, "c": 2.0} + result = Series(d, index=["b", "c", "d", "a"]) + expected = Series([1, 2, nan, 0], index=["b", "c", "d", "a"]) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) @@ -1016,40 +1041,41 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} result = Series(d) if PY36: - expected = Series([1, 0, 2], index=list('bac')) + expected = Series([1, 0, 2], index=list("bac")) else: - expected = Series([0, 1, 2], index=list('abc')) + expected = Series([0, 1, 2], index=list("abc")) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18480 - d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} + d = {1: "a", value: "b", float("nan"): "c", 4: "d"} result = Series(d).sort_values() - expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) + expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4]) assert_series_equal(result, expected) # MultiIndex: - d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} + d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"} result = Series(d).sort_values() - expected = Series(['a', 'b', 'c'], - index=Index([(1, 1), (2, np.nan), (3, value)])) + expected = Series( + ["a", "b", "c"], index=Index([(1, 1), (2, np.nan), (3, value)]) + ) 
assert_series_equal(result, expected) def test_constructor_dict_datetime64_index(self): # GH 9456 - dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] values = [42544017.198965244, 1234565, 40512335.181958228, -1] def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values)) data_datetime64 = create_data(np.datetime64) - data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) + data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d")) data_Timestamp = create_data(Timestamp) expected = Series(values, (Timestamp(x) for x in dates_as_str)) @@ -1073,11 +1099,9 @@ def test_constructor_tuple_of_tuples(self): assert tuple(s) == data def test_constructor_dict_of_tuples(self): - data = {(1, 2): 3, - (None, 5): 6} + data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() - expected = Series([3, 6], - index=MultiIndex.from_tuples([(1, 2), (None, 5)])) + expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected) def test_constructor_set(self): @@ -1091,20 +1115,20 @@ def test_constructor_set(self): # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") def test_fromDict(self): - data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} + data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) tm.assert_is_sorted(series.index) - data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} + data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()} series = Series(data) assert series.dtype == np.object_ - data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'} + data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) assert series.dtype == np.object_ - data = {'a': '0', 'b': '1'} + data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 @@ -1114,20 +1138,20 @@ def test_fromValue(self, datetime_series): assert nans.dtype == np.float_ assert len(nans) == len(datetime_series) - strings = Series('foo', index=datetime_series.index) + strings = Series("foo", index=datetime_series.index) assert strings.dtype == np.object_ assert len(strings) == len(datetime_series) d = datetime.now() dates = Series(d, index=datetime_series.index) - assert dates.dtype == 'M8[ns]' + assert dates.dtype == "M8[ns]" assert len(dates) == len(datetime_series) # GH12336 # Test construction of categorical series from value categorical = Series(0, index=datetime_series.index, dtype="category") expected = Series(0, index=datetime_series.index).astype("category") - assert categorical.dtype == 'category' + assert categorical.dtype == "category" assert len(categorical) == len(datetime_series) tm.assert_series_equal(categorical, expected) @@ -1135,43 +1159,42 @@ def test_constructor_dtype_timedelta64(self): # basic td = Series([timedelta(days=i) for i in range(3)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" td = Series([timedelta(days=1)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" - td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64( - 1, 's')]) + td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # mixed with NaT - td = Series([timedelta(days=1), NaT], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = 
Series([timedelta(days=1), NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" - td = Series([timedelta(days=1), np.nan], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = Series([timedelta(days=1), np.nan], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" - td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]') - assert td.dtype == 'timedelta64[ns]' + td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]") + assert td.dtype == "timedelta64[ns]" # improved inference # GH5689 td = Series([np.timedelta64(300000000), NaT]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), iNaT]) - assert td.dtype == 'object' + assert td.dtype == "object" td = Series([np.timedelta64(300000000), np.nan]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" td = Series([pd.NaT, np.timedelta64(300000000)]) - assert td.dtype == 'timedelta64[ns]' + assert td.dtype == "timedelta64[ns]" - td = Series([np.timedelta64(1, 's')]) - assert td.dtype == 'timedelta64[ns]' + td = Series([np.timedelta64(1, "s")]) + assert td.dtype == "timedelta64[ns]" # these are frequency conversion astypes # for t in ['s', 'D', 'us', 'ms']: @@ -1179,44 +1202,43 @@ def test_constructor_dtype_timedelta64(self): # td.astype('m8[%s]' % t) # valid astype - td.astype('int64') + td.astype("int64") # invalid casting - msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[int32\]") + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): - td.astype('int32') + td.astype("int32") # this is an invalid casting msg = "Could not convert object to NumPy timedelta" with pytest.raises(ValueError, match=msg): - Series([timedelta(days=1), 'foo'], dtype='m8[ns]') + Series([timedelta(days=1), "foo"], dtype="m8[ns]") # leave as object here - td = Series([timedelta(days=i) for i in range(3)] + ['foo']) - assert td.dtype == 'object' + td = Series([timedelta(days=i) for i in range(3)] + ["foo"]) + assert td.dtype == "object" # these will correctly infer a timedelta - s = Series([None, pd.NaT, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([np.nan, pd.NaT, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([pd.NaT, None, '1 Day']) - assert s.dtype == 'timedelta64[ns]' - s = Series([pd.NaT, np.nan, '1 Day']) - assert s.dtype == 'timedelta64[ns]' + s = Series([None, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([np.nan, pd.NaT, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, None, "1 Day"]) + assert s.dtype == "timedelta64[ns]" + s = Series([pd.NaT, np.nan, "1 Day"]) + assert s.dtype == "timedelta64[ns]" # GH 16406 def test_constructor_mixed_tz(self): - s = Series([Timestamp('20130101'), - Timestamp('20130101', tz='US/Eastern')]) - expected = Series([Timestamp('20130101'), - Timestamp('20130101', tz='US/Eastern')], - dtype='object') + s = Series([Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")]) + expected = Series( + [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")], + dtype="object", + ) assert_series_equal(s, expected) def test_NaT_scalar(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") val = series[3] assert isna(val) @@ -1226,42 +1248,42 @@ def test_NaT_scalar(self): def test_NaT_cast(self): # GH10747 - result = 
Series([np.nan]).astype('M8[ns]') + result = Series([np.nan]).astype("M8[ns]") expected = Series([NaT]) assert_series_equal(result, expected) def test_constructor_name_hashable(self): - for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), "\u05D0"]: - for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]: + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05D0"]: + for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]: s = Series(data, name=n) assert s.name == n def test_constructor_name_unhashable(self): msg = r"Series\.name must be a hashable type" - for n in [['name_list'], np.ones(2), {1: 2}]: - for data in [['name_list'], np.ones(2), {1: 2}]: + for n in [["name_list"], np.ones(2), {1: 2}]: + for data in [["name_list"], np.ones(2), {1: 2}]: with pytest.raises(TypeError, match=msg): Series(data, name=n) def test_auto_conversion(self): - series = Series(list(date_range('1/1/2000', periods=10))) - assert series.dtype == 'M8[ns]' + series = Series(list(date_range("1/1/2000", periods=10))) + assert series.dtype == "M8[ns]" def test_convert_non_ns(self): # convert from a numpy array of non-ns timedelta64 - arr = np.array([1, 2, 3], dtype='timedelta64[s]') + arr = np.array([1, 2, 3], dtype="timedelta64[s]") s = Series(arr) - expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s')) + expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s")) assert_series_equal(s, expected) # convert from a numpy array of non-ns datetime64 # note that creating a numpy datetime64 is in LOCAL time!!!! # seems to work for M8[D], but not for M8[s] - s = Series(np.array(['2013-01-01', '2013-01-02', - '2013-01-03'], dtype='datetime64[D]')) - assert_series_equal(s, Series(date_range('20130101', periods=3, - freq='D'))) + s = Series( + np.array(["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]") + ) + assert_series_equal(s, Series(date_range("20130101", periods=3, freq="D"))) # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) @@ -1272,10 +1294,12 @@ def test_convert_non_ns(self): @pytest.mark.parametrize( "index", [ - date_range('1/1/2000', periods=10), - timedelta_range('1 day', periods=10), - period_range('2000-Q1', periods=10, freq='Q')], - ids=lambda x: type(x).__name__) + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_cant_cast_datetimelike(self, index): # floats are not ok @@ -1298,10 +1322,12 @@ def test_constructor_cant_cast_datetimelike(self, index): @pytest.mark.parametrize( "index", [ - date_range('1/1/2000', periods=10), - timedelta_range('1 day', periods=10), - period_range('2000-Q1', periods=10, freq='Q')], - ids=lambda x: type(x).__name__) + date_range("1/1/2000", periods=10), + timedelta_range("1 day", periods=10), + period_range("2000-Q1", periods=10, freq="Q"), + ], + ids=lambda x: type(x).__name__, + ) def test_constructor_cast_object(self, index): s = Series(index, dtype=object) exp = Series(index).astype(object) @@ -1315,10 +1341,7 @@ def test_constructor_cast_object(self, index): exp = Series(index).astype(object) tm.assert_series_equal(s, exp) - @pytest.mark.parametrize("dtype", [ - np.datetime64, - np.timedelta64, - ]) + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) def test_constructor_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 msg = "dtype has no unit. 
Please pass in" @@ -1326,27 +1349,32 @@ def test_constructor_generic_timestamp_no_frequency(self, dtype): with pytest.raises(ValueError, match=msg): Series([], dtype=dtype) - @pytest.mark.parametrize("dtype,msg", [ - ("m8[ps]", "cannot convert timedeltalike"), - ("M8[ps]", "cannot convert datetimelike"), - ]) + @pytest.mark.parametrize( + "dtype,msg", + [ + ("m8[ps]", "cannot convert timedeltalike"), + ("M8[ps]", "cannot convert datetimelike"), + ], + ) def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg): # see gh-15524, gh-15987 with pytest.raises(TypeError, match=msg): Series([], dtype=dtype) - @pytest.mark.parametrize('dtype', [None, 'uint8', 'category']) + @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) def test_constructor_range_dtype(self, dtype): # GH 16804 - expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64') + expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64") result = Series(range(5), dtype=dtype) tm.assert_series_equal(result, expected) def test_constructor_tz_mixed_data(self): # GH 13051 - dt_list = [Timestamp('2016-05-01 02:03:37'), - Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')] + dt_list = [ + Timestamp("2016-05-01 02:03:37"), + Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"), + ] result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 86164f53c5515..c9092917cb0c6 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -13,8 +13,17 @@ import pandas as pd from pandas import ( - DataFrame, DatetimeIndex, Index, PeriodIndex, Series, TimedeltaIndex, - bdate_range, date_range, period_range, timedelta_range) + DataFrame, + DatetimeIndex, + Index, + PeriodIndex, + Series, + TimedeltaIndex, + bdate_range, + date_range, + period_range, + timedelta_range, +) from pandas.core.arrays import PeriodArray import pandas.core.common as com import pandas.util.testing as tm @@ -22,27 +31,42 @@ class TestSeriesDatetimeValues: - def test_dt_namespace_accessor(self): # GH 7207, 11128 # test .dt namespace accessor ok_for_period = PeriodArray._datetimelike_ops - ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq'] + ok_for_period_methods = ["strftime", "to_timestamp", "asfreq"] ok_for_dt = DatetimeIndex._datetimelike_ops - ok_for_dt_methods = ['to_period', 'to_pydatetime', 'tz_localize', - 'tz_convert', 'normalize', 'strftime', 'round', - 'floor', 'ceil', 'day_name', 'month_name'] + ok_for_dt_methods = [ + "to_period", + "to_pydatetime", + "tz_localize", + "tz_convert", + "normalize", + "strftime", + "round", + "floor", + "ceil", + "day_name", + "month_name", + ] ok_for_td = TimedeltaIndex._datetimelike_ops - ok_for_td_methods = ['components', 'to_pytimedelta', 'total_seconds', - 'round', 'floor', 'ceil'] + ok_for_td_methods = [ + "components", + "to_pytimedelta", + "total_seconds", + "round", + "floor", + "ceil", + ] def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): - result = result.astype('int64') + result = result.astype("int64") elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) @@ -56,15 +80,15 @@ def compare(s, name): tm.assert_series_equal(a, b) # datetimeindex - cases = [Series(date_range('20130101', periods=5), name='xxx'), - Series(date_range('20130101', periods=5, freq='s'), - 
name='xxx'), - Series(date_range('20130101 00:00:00', periods=5, freq='ms'), - name='xxx')] + cases = [ + Series(date_range("20130101", periods=5), name="xxx"), + Series(date_range("20130101", periods=5, freq="s"), name="xxx"), + Series(date_range("20130101 00:00:00", periods=5, freq="ms"), name="xxx"), + ] for s in cases: for prop in ok_for_dt: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_dt_methods: @@ -74,30 +98,30 @@ def compare(s, name): assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.tz_localize('US/Eastern') - exp_values = DatetimeIndex(s.values).tz_localize('US/Eastern') - expected = Series(exp_values, index=s.index, name='xxx') + result = s.dt.tz_localize("US/Eastern") + exp_values = DatetimeIndex(s.values).tz_localize("US/Eastern") + expected = Series(exp_values, index=s.index, name="xxx") tm.assert_series_equal(result, expected) tz_result = result.dt.tz - assert str(tz_result) == 'US/Eastern' + assert str(tz_result) == "US/Eastern" freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq='infer').freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq # let's localize, then convert - result = s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') - exp_values = (DatetimeIndex(s.values).tz_localize('UTC') - .tz_convert('US/Eastern')) - expected = Series(exp_values, index=s.index, name='xxx') + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") + exp_values = ( + DatetimeIndex(s.values).tz_localize("UTC").tz_convert("US/Eastern") + ) + expected = Series(exp_values, index=s.index, name="xxx") tm.assert_series_equal(result, expected) # datetimeindex with tz - s = Series(date_range('20130101', periods=5, tz='US/Eastern'), - name='xxx') + s = Series(date_range("20130101", periods=5, tz="US/Eastern"), name="xxx") for prop in ok_for_dt: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_dt_methods: @@ -107,27 +131,30 @@ def compare(s, name): assert isinstance(result, np.ndarray) assert result.dtype == object - result = s.dt.tz_convert('CET') - expected = Series(s._values.tz_convert('CET'), - index=s.index, name='xxx') + result = s.dt.tz_convert("CET") + expected = Series(s._values.tz_convert("CET"), index=s.index, name="xxx") tm.assert_series_equal(result, expected) tz_result = result.dt.tz - assert str(tz_result) == 'CET' + assert str(tz_result) == "CET" freq_result = s.dt.freq - assert freq_result == DatetimeIndex(s.values, freq='infer').freq + assert freq_result == DatetimeIndex(s.values, freq="infer").freq # timedelta index - cases = [Series(timedelta_range('1 day', periods=5), - index=list('abcde'), name='xxx'), - Series(timedelta_range('1 day 01:23:45', periods=5, - freq='s'), name='xxx'), - Series(timedelta_range('2 days 01:23:45.012345', periods=5, - freq='ms'), name='xxx')] + cases = [ + Series( + timedelta_range("1 day", periods=5), index=list("abcde"), name="xxx" + ), + Series(timedelta_range("1 day 01:23:45", periods=5, freq="s"), name="xxx"), + Series( + timedelta_range("2 days 01:23:45.012345", periods=5, freq="ms"), + name="xxx", + ), + ] for s in cases: for prop in ok_for_td: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_td_methods: @@ -143,37 +170,34 @@ def compare(s, name): result = s.dt.total_seconds() assert isinstance(result, pd.Series) - assert result.dtype == 'float64' + assert result.dtype == "float64" freq_result = s.dt.freq - 
assert freq_result == TimedeltaIndex(s.values, freq='infer').freq + assert freq_result == TimedeltaIndex(s.values, freq="infer").freq # both - index = date_range('20130101', periods=3, freq='D') - s = Series(date_range('20140204', periods=3, freq='s'), - index=index, name='xxx') - exp = Series(np.array([2014, 2014, 2014], dtype='int64'), - index=index, name='xxx') + index = date_range("20130101", periods=3, freq="D") + s = Series(date_range("20140204", periods=3, freq="s"), index=index, name="xxx") + exp = Series( + np.array([2014, 2014, 2014], dtype="int64"), index=index, name="xxx" + ) tm.assert_series_equal(s.dt.year, exp) - exp = Series(np.array([2, 2, 2], dtype='int64'), - index=index, name='xxx') + exp = Series(np.array([2, 2, 2], dtype="int64"), index=index, name="xxx") tm.assert_series_equal(s.dt.month, exp) - exp = Series(np.array([0, 1, 2], dtype='int64'), - index=index, name='xxx') + exp = Series(np.array([0, 1, 2], dtype="int64"), index=index, name="xxx") tm.assert_series_equal(s.dt.second, exp) - exp = pd.Series([s[0]] * 3, index=index, name='xxx') + exp = pd.Series([s[0]] * 3, index=index, name="xxx") tm.assert_series_equal(s.dt.normalize(), exp) # periodindex - cases = [Series(period_range('20130101', periods=5, freq='D'), - name='xxx')] + cases = [Series(period_range("20130101", periods=5, freq="D"), name="xxx")] for s in cases: for prop in ok_for_period: # we test freq below - if prop != 'freq': + if prop != "freq": compare(s, prop) for prop in ok_for_period_methods: @@ -184,168 +208,203 @@ def compare(s, name): # test limited display api def get_dir(s): - results = [r for r in s.dt.__dir__() if not r.startswith('_')] + results = [r for r in s.dt.__dir__() if not r.startswith("_")] return list(sorted(set(results))) - s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) + results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) + ) - s = Series(period_range('20130101', periods=5, - freq='D', name='xxx').astype(object)) + s = Series( + period_range("20130101", periods=5, freq="D", name="xxx").astype(object) + ) results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_period + ok_for_period_methods)))) + results, list(sorted(set(ok_for_period + ok_for_period_methods))) + ) # 11295 # ambiguous time error on the conversions - s = Series(pd.date_range('2015-01-01', '2016-01-01', - freq='T'), name='xxx') - s = s.dt.tz_localize('UTC').dt.tz_convert('America/Chicago') + s = Series(pd.date_range("2015-01-01", "2016-01-01", freq="T"), name="xxx") + s = s.dt.tz_localize("UTC").dt.tz_convert("America/Chicago") results = get_dir(s) tm.assert_almost_equal( - results, list(sorted(set(ok_for_dt + ok_for_dt_methods)))) - exp_values = pd.date_range('2015-01-01', '2016-01-01', freq='T', - tz='UTC').tz_convert('America/Chicago') - expected = Series(exp_values, name='xxx') + results, list(sorted(set(ok_for_dt + ok_for_dt_methods))) + ) + exp_values = pd.date_range( + "2015-01-01", "2016-01-01", freq="T", tz="UTC" + ).tz_convert("America/Chicago") + expected = Series(exp_values, name="xxx") tm.assert_series_equal(s, expected) # no setting allowed - s = Series(date_range('20130101', periods=5, freq='D'), name='xxx') + s = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): s.dt.hour = 5 # trying to set a copy - with 
pd.option_context('chained_assignment', 'raise'): + with pd.option_context("chained_assignment", "raise"): with pytest.raises(com.SettingWithCopyError): s.dt.hour[0] = 5 - @pytest.mark.parametrize('method, dates', [ - ['round', ['2012-01-02', '2012-01-02', '2012-01-01']], - ['floor', ['2012-01-01', '2012-01-01', '2012-01-01']], - ['ceil', ['2012-01-02', '2012-01-02', '2012-01-02']] - ]) + @pytest.mark.parametrize( + "method, dates", + [ + ["round", ["2012-01-02", "2012-01-02", "2012-01-01"]], + ["floor", ["2012-01-01", "2012-01-01", "2012-01-01"]], + ["ceil", ["2012-01-02", "2012-01-02", "2012-01-02"]], + ], + ) def test_dt_round(self, method, dates): # round - s = Series(pd.to_datetime(['2012-01-01 13:00:00', - '2012-01-01 12:01:00', - '2012-01-01 08:00:00']), name='xxx') - result = getattr(s.dt, method)('D') - expected = Series(pd.to_datetime(dates), name='xxx') + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = getattr(s.dt, method)("D") + expected = Series(pd.to_datetime(dates), name="xxx") tm.assert_series_equal(result, expected) def test_dt_round_tz(self): - s = Series(pd.to_datetime(['2012-01-01 13:00:00', - '2012-01-01 12:01:00', - '2012-01-01 08:00:00']), name='xxx') - result = (s.dt.tz_localize('UTC') - .dt.tz_convert('US/Eastern') - .dt.round('D')) - - exp_values = pd.to_datetime(['2012-01-01', '2012-01-01', - '2012-01-01']).tz_localize('US/Eastern') - expected = Series(exp_values, name='xxx') + s = Series( + pd.to_datetime( + ["2012-01-01 13:00:00", "2012-01-01 12:01:00", "2012-01-01 08:00:00"] + ), + name="xxx", + ) + result = s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern").dt.round("D") + + exp_values = pd.to_datetime( + ["2012-01-01", "2012-01-01", "2012-01-01"] + ).tz_localize("US/Eastern") + expected = Series(exp_values, name="xxx") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('method', ['ceil', 'round', 'floor']) + @pytest.mark.parametrize("method", ["ceil", "round", "floor"]) def test_dt_round_tz_ambiguous(self, method): # GH 18946 round near "fall back" DST - df1 = pd.DataFrame([ - pd.to_datetime('2017-10-29 02:00:00+02:00', utc=True), - pd.to_datetime('2017-10-29 02:00:00+01:00', utc=True), - pd.to_datetime('2017-10-29 03:00:00+01:00', utc=True) - ], - columns=['date']) - df1['date'] = df1['date'].dt.tz_convert('Europe/Madrid') + df1 = pd.DataFrame( + [ + pd.to_datetime("2017-10-29 02:00:00+02:00", utc=True), + pd.to_datetime("2017-10-29 02:00:00+01:00", utc=True), + pd.to_datetime("2017-10-29 03:00:00+01:00", utc=True), + ], + columns=["date"], + ) + df1["date"] = df1["date"].dt.tz_convert("Europe/Madrid") # infer - result = getattr(df1.date.dt, method)('H', ambiguous='infer') - expected = df1['date'] + result = getattr(df1.date.dt, method)("H", ambiguous="infer") + expected = df1["date"] tm.assert_series_equal(result, expected) # bool-array - result = getattr(df1.date.dt, method)( - 'H', ambiguous=[True, False, False] - ) + result = getattr(df1.date.dt, method)("H", ambiguous=[True, False, False]) tm.assert_series_equal(result, expected) # NaT - result = getattr(df1.date.dt, method)('H', ambiguous='NaT') - expected = df1['date'].copy() + result = getattr(df1.date.dt, method)("H", ambiguous="NaT") + expected = df1["date"].copy() expected.iloc[0:2] = pd.NaT tm.assert_series_equal(result, expected) # raise with pytest.raises(pytz.AmbiguousTimeError): - getattr(df1.date.dt, method)('H', ambiguous='raise') - - @pytest.mark.parametrize('method, ts_str, freq', [ 
- ['ceil', '2018-03-11 01:59:00-0600', '5min'], - ['round', '2018-03-11 01:59:00-0600', '5min'], - ['floor', '2018-03-11 03:01:00-0500', '2H']]) + getattr(df1.date.dt, method)("H", ambiguous="raise") + + @pytest.mark.parametrize( + "method, ts_str, freq", + [ + ["ceil", "2018-03-11 01:59:00-0600", "5min"], + ["round", "2018-03-11 01:59:00-0600", "5min"], + ["floor", "2018-03-11 03:01:00-0500", "2H"], + ], + ) def test_dt_round_tz_nonexistent(self, method, ts_str, freq): # GH 23324 round near "spring forward" DST - s = Series([pd.Timestamp(ts_str, tz='America/Chicago')]) - result = getattr(s.dt, method)(freq, nonexistent='shift_forward') - expected = Series( - [pd.Timestamp('2018-03-11 03:00:00', tz='America/Chicago')] - ) + s = Series([pd.Timestamp(ts_str, tz="America/Chicago")]) + result = getattr(s.dt, method)(freq, nonexistent="shift_forward") + expected = Series([pd.Timestamp("2018-03-11 03:00:00", tz="America/Chicago")]) tm.assert_series_equal(result, expected) - result = getattr(s.dt, method)(freq, nonexistent='NaT') + result = getattr(s.dt, method)(freq, nonexistent="NaT") expected = Series([pd.NaT]).dt.tz_localize(result.dt.tz) tm.assert_series_equal(result, expected) - with pytest.raises(pytz.NonExistentTimeError, - match='2018-03-11 02:00:00'): - getattr(s.dt, method)(freq, nonexistent='raise') + with pytest.raises(pytz.NonExistentTimeError, match="2018-03-11 02:00:00"): + getattr(s.dt, method)(freq, nonexistent="raise") def test_dt_namespace_accessor_categorical(self): # GH 19468 - dti = DatetimeIndex(['20171111', '20181212']).repeat(2) - s = Series(pd.Categorical(dti), name='foo') + dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) + s = Series(pd.Categorical(dti), name="foo") result = s.dt.year - expected = Series([2017, 2017, 2018, 2018], name='foo') + expected = Series([2017, 2017, 2018, 2018], name="foo") tm.assert_series_equal(result, expected) def test_dt_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(date_range('20130101', periods=5, freq='D')) - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + s = Series(date_range("20130101", periods=5, freq="D")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.dt.xlabel = "a" - @pytest.mark.parametrize('time_locale', [ - None] if tm.get_locales() is None else [None] + tm.get_locales()) + @pytest.mark.parametrize( + "time_locale", [None] if tm.get_locales() is None else [None] + tm.get_locales() + ) def test_dt_accessor_datetime_name_accessors(self, time_locale): # Test Monday -> Sunday and January -> December, in that sequence if time_locale is None: # If the time_locale is None, day-name and month_name should # return the english attributes - expected_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - expected_months = ['January', 'February', 'March', 'April', 'May', - 'June', 'July', 'August', 'September', - 'October', 'November', 'December'] + expected_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + expected_months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] else: with tm.set_locale(time_locale, locale.LC_TIME): expected_days = calendar.day_name[:] expected_months = calendar.month_name[1:] - s = Series(date_range(freq='D', start=datetime(1998, 1, 1), - periods=365)) - english_days = ['Monday', 'Tuesday', 
'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'] - for day, name, eng_name in zip(range(4, 11), - expected_days, - english_days): + s = Series(date_range(freq="D", start=datetime(1998, 1, 1), periods=365)) + english_days = [ + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", + ] + for day, name, eng_name in zip(range(4, 11), expected_days, english_days): name = name.capitalize() assert s.dt.weekday_name[day] == eng_name assert s.dt.day_name(locale=time_locale)[day] == name s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.day_name(locale=time_locale).iloc[-1]) - s = Series(date_range(freq='M', start='2012', end='2013')) + s = Series(date_range(freq="M", start="2012", end="2013")) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) @@ -369,71 +428,98 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): def test_strftime(self): # GH 10086 - s = Series(date_range('20130101', periods=5)) - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['2013/01/01', '2013/01/02', '2013/01/03', - '2013/01/04', '2013/01/05']) + s = Series(date_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - s = Series(date_range('2015-02-03 11:22:33.4567', periods=5)) - result = s.dt.strftime('%Y/%m/%d %H-%M-%S') - expected = Series(['2015/02/03 11-22-33', '2015/02/04 11-22-33', - '2015/02/05 11-22-33', '2015/02/06 11-22-33', - '2015/02/07 11-22-33']) + s = Series(date_range("2015-02-03 11:22:33.4567", periods=5)) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/04 11-22-33", + "2015/02/05 11-22-33", + "2015/02/06 11-22-33", + "2015/02/07 11-22-33", + ] + ) tm.assert_series_equal(result, expected) - s = Series(period_range('20130101', periods=5)) - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['2013/01/01', '2013/01/02', '2013/01/03', - '2013/01/04', '2013/01/05']) + s = Series(period_range("20130101", periods=5)) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["2013/01/01", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - s = Series(period_range( - '2015-02-03 11:22:33.4567', periods=5, freq='s')) - result = s.dt.strftime('%Y/%m/%d %H-%M-%S') - expected = Series(['2015/02/03 11-22-33', '2015/02/03 11-22-34', - '2015/02/03 11-22-35', '2015/02/03 11-22-36', - '2015/02/03 11-22-37']) + s = Series(period_range("2015-02-03 11:22:33.4567", periods=5, freq="s")) + result = s.dt.strftime("%Y/%m/%d %H-%M-%S") + expected = Series( + [ + "2015/02/03 11-22-33", + "2015/02/03 11-22-34", + "2015/02/03 11-22-35", + "2015/02/03 11-22-36", + "2015/02/03 11-22-37", + ] + ) tm.assert_series_equal(result, expected) - s = Series(date_range('20130101', periods=5)) + s = Series(date_range("20130101", periods=5)) s.iloc[0] = pd.NaT - result = s.dt.strftime('%Y/%m/%d') - expected = Series(['NaT', '2013/01/02', '2013/01/03', '2013/01/04', - '2013/01/05']) + result = s.dt.strftime("%Y/%m/%d") + expected = Series( + ["NaT", "2013/01/02", "2013/01/03", "2013/01/04", "2013/01/05"] + ) tm.assert_series_equal(result, expected) - datetime_index = date_range('20150301', periods=5) + datetime_index = date_range("20150301", periods=5) result = datetime_index.strftime("%Y/%m/%d") - expected = Index(['2015/03/01', '2015/03/02', 
'2015/03/03', - '2015/03/04', '2015/03/05'], dtype=np.object_) + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype=np.object_, + ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) - period_index = period_range('20150301', periods=5) + period_index = period_range("20150301", periods=5) result = period_index.strftime("%Y/%m/%d") - expected = Index(['2015/03/01', '2015/03/02', '2015/03/03', - '2015/03/04', '2015/03/05'], dtype='=U10') + expected = Index( + ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], + dtype="=U10", + ) tm.assert_index_equal(result, expected) - s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, - 32, 1)]) - result = s.dt.strftime('%Y-%m-%d %H:%M:%S') + s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)]) + result = s.dt.strftime("%Y-%m-%d %H:%M:%S") expected = Series(["2013-01-01 02:32:59", "2013-01-02 14:32:01"]) tm.assert_series_equal(result, expected) - s = Series(period_range('20130101', periods=4, freq='H')) - result = s.dt.strftime('%Y/%m/%d %H:%M:%S') - expected = Series(["2013/01/01 00:00:00", "2013/01/01 01:00:00", - "2013/01/01 02:00:00", "2013/01/01 03:00:00"]) - - s = Series(period_range('20130101', periods=4, freq='L')) - result = s.dt.strftime('%Y/%m/%d %H:%M:%S.%l') - expected = Series(["2013/01/01 00:00:00.000", - "2013/01/01 00:00:00.001", - "2013/01/01 00:00:00.002", - "2013/01/01 00:00:00.003"]) + s = Series(period_range("20130101", periods=4, freq="H")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S") + expected = Series( + [ + "2013/01/01 00:00:00", + "2013/01/01 01:00:00", + "2013/01/01 02:00:00", + "2013/01/01 03:00:00", + ] + ) + + s = Series(period_range("20130101", periods=4, freq="L")) + result = s.dt.strftime("%Y/%m/%d %H:%M:%S.%l") + expected = Series( + [ + "2013/01/01 00:00:00.000", + "2013/01/01 00:00:00.001", + "2013/01/01 00:00:00.002", + "2013/01/01 00:00:00.003", + ] + ) tm.assert_series_equal(result, expected) def test_valid_dt_with_missing_values(self): @@ -441,11 +527,10 @@ def test_valid_dt_with_missing_values(self): from datetime import date, time # GH 8689 - s = Series(date_range('20130101', periods=5, freq='D')) + s = Series(date_range("20130101", periods=5, freq="D")) s.iloc[2] = pd.NaT - for attr in ['microsecond', 'nanosecond', 'second', 'minute', 'hour', - 'day']: + for attr in ["microsecond", "nanosecond", "second", "minute", "hour", "day"]: expected = getattr(s.dt, attr).copy() expected.iloc[2] = np.nan result = getattr(s.dt, attr) @@ -453,42 +538,51 @@ def test_valid_dt_with_missing_values(self): result = s.dt.date expected = Series( - [date(2013, 1, 1), date(2013, 1, 2), np.nan, date(2013, 1, 4), - date(2013, 1, 5)], dtype='object') + [ + date(2013, 1, 1), + date(2013, 1, 2), + np.nan, + date(2013, 1, 4), + date(2013, 1, 5), + ], + dtype="object", + ) tm.assert_series_equal(result, expected) result = s.dt.time - expected = Series( - [time(0), time(0), np.nan, time(0), time(0)], dtype='object') + expected = Series([time(0), time(0), np.nan, time(0), time(0)], dtype="object") tm.assert_series_equal(result, expected) def test_dt_accessor_api(self): # GH 9322 from pandas.core.indexes.accessors import ( - CombinedDatetimelikeProperties, DatetimeProperties) + CombinedDatetimelikeProperties, + DatetimeProperties, + ) + assert Series.dt is CombinedDatetimelikeProperties - s = Series(date_range('2000-01-01', periods=3)) + s = Series(date_range("2000-01-01", periods=3)) 
assert isinstance(s.dt, DatetimeProperties) - @pytest.mark.parametrize('ser', [Series(np.arange(5)), - Series(list('abcde')), - Series(np.random.randn(5))]) + @pytest.mark.parametrize( + "ser", [Series(np.arange(5)), Series(list("abcde")), Series(np.random.randn(5))] + ) def test_dt_accessor_invalid(self, ser): # GH#9322 check that series with incorrect dtypes don't have attr with pytest.raises(AttributeError, match="only use .dt accessor"): ser.dt - assert not hasattr(ser, 'dt') + assert not hasattr(ser, "dt") def test_dt_accessor_updates_on_inplace(self): - s = Series(pd.date_range('2018-01-01', periods=10)) + s = Series(pd.date_range("2018-01-01", periods=10)) s[2] = None - s.fillna(pd.Timestamp('2018-01-01'), inplace=True) + s.fillna(pd.Timestamp("2018-01-01"), inplace=True) result = s.dt.date assert result[0] == result[2] def test_between(self): - s = Series(bdate_range('1/1/2000', periods=20).astype(object)) + s = Series(bdate_range("1/1/2000", periods=20).astype(object)) s[::2] = np.nan result = s[s.between(s[3], s[17])] @@ -501,13 +595,12 @@ def test_between(self): def test_date_tz(self): # GH11757 - rng = pd.DatetimeIndex(['2014-04-04 23:56', - '2014-07-18 21:24', - '2015-11-22 22:14'], tz="US/Eastern") + rng = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], + tz="US/Eastern", + ) s = Series(rng) - expected = Series([date(2014, 4, 4), - date(2014, 7, 18), - date(2015, 11, 22)]) + expected = Series([date(2014, 4, 4), date(2014, 7, 18), date(2015, 11, 22)]) assert_series_equal(s.dt.date, expected) assert_series_equal(s.apply(lambda x: x.date()), expected) @@ -517,35 +610,39 @@ def test_datetime_understood(self): series = pd.Series(pd.date_range("2012-01-01", periods=3)) offset = pd.offsets.DateOffset(days=6) result = series - offset - expected = pd.Series(pd.to_datetime([ - '2011-12-26', '2011-12-27', '2011-12-28'])) + expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) def test_dt_timetz_accessor(self, tz_naive_fixture): # GH21358 tz = maybe_get_tz(tz_naive_fixture) - dtindex = pd.DatetimeIndex(['2014-04-04 23:56', '2014-07-18 21:24', - '2015-11-22 22:14'], tz=tz) + dtindex = pd.DatetimeIndex( + ["2014-04-04 23:56", "2014-07-18 21:24", "2015-11-22 22:14"], tz=tz + ) s = Series(dtindex) - expected = Series([time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), - time(22, 14, tzinfo=tz)]) + expected = Series( + [time(23, 56, tzinfo=tz), time(21, 24, tzinfo=tz), time(22, 14, tzinfo=tz)] + ) result = s.dt.timetz tm.assert_series_equal(result, expected) def test_setitem_with_string_index(self): # GH 23451 - x = pd.Series([1, 2, 3], index=['Date', 'b', 'other']) - x['Date'] = date.today() + x = pd.Series([1, 2, 3], index=["Date", "b", "other"]) + x["Date"] = date.today() assert x.Date == date.today() - assert x['Date'] == date.today() + assert x["Date"] == date.today() def test_setitem_with_different_tz(self): # GH#24024 - ser = pd.Series(pd.date_range('2000', periods=2, tz="US/Central")) - ser[0] = pd.Timestamp("2000", tz='US/Eastern') - expected = pd.Series([ - pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), - pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), - ], dtype=object) + ser = pd.Series(pd.date_range("2000", periods=2, tz="US/Central")) + ser[0] = pd.Timestamp("2000", tz="US/Eastern") + expected = pd.Series( + [ + pd.Timestamp("2000-01-01 00:00:00-05:00", tz="US/Eastern"), + pd.Timestamp("2000-01-02 00:00:00-06:00", tz="US/Central"), + ], + dtype=object, + 
) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b17f24fef825e..9be79bf93ece7 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -12,61 +12,67 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, Series, Timedelta, Timestamp, date_range) + Categorical, + DataFrame, + Index, + Series, + Timedelta, + Timestamp, + date_range, +) import pandas.util.testing as tm class TestSeriesDtypes: - def test_dt64_series_astype_object(self): - dt64ser = Series(date_range('20130101', periods=3)) + dt64ser = Series(date_range("20130101", periods=3)) result = dt64ser.astype(object) assert isinstance(result.iloc[0], datetime) assert result.dtype == np.object_ def test_td64_series_astype_object(self): - tdser = Series(['59 Days', '59 Days', 'NaT'], dtype='timedelta64[ns]') + tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]") result = tdser.astype(object) assert isinstance(result.iloc[0], timedelta) assert result.dtype == np.object_ - @pytest.mark.parametrize("dtype", ["float32", "float64", - "int64", "int32"]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) def test_astype(self, dtype): - s = Series(np.random.randn(5), name='foo') + s = Series(np.random.randn(5), name="foo") as_typed = s.astype(dtype) assert as_typed.dtype == dtype assert as_typed.name == s.name def test_asobject_deprecated(self): - s = Series(np.random.randn(5), name='foo') + s = Series(np.random.randn(5), name="foo") with tm.assert_produces_warning(FutureWarning): o = s.asobject assert isinstance(o, np.ndarray) def test_dtype(self, datetime_series): - assert datetime_series.dtype == np.dtype('float64') - assert datetime_series.dtypes == np.dtype('float64') + assert datetime_series.dtype == np.dtype("float64") + assert datetime_series.dtypes == np.dtype("float64") # GH 26705 - Assert .ftype is deprecated with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftype == 'float64:dense' + assert datetime_series.ftype == "float64:dense" # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): - assert datetime_series.ftypes == 'float64:dense' + assert datetime_series.ftypes == "float64:dense" # GH18243 - Assert .get_ftype_counts is deprecated with tm.assert_produces_warning(FutureWarning): - tm.assert_series_equal(datetime_series.get_ftype_counts(), - Series(1, ['float64:dense'])) + tm.assert_series_equal( + datetime_series.get_ftype_counts(), Series(1, ["float64:dense"]) + ) @pytest.mark.parametrize("value", [np.nan, np.inf]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) def test_astype_cast_nan_inf_int(self, dtype, value): # gh-14265: check NaN and inf raise error when converting to int - msg = 'Cannot convert non-finite values \\(NA or inf\\) to integer' + msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" s = Series([value]) with pytest.raises(ValueError, match=msg): @@ -80,39 +86,39 @@ def test_astype_cast_object_int_fail(self, dtype): arr.astype(dtype) def test_astype_cast_object_int(self): - arr = Series(['1', '2', '3', '4'], dtype=object) + arr = Series(["1", "2", "3", "4"], dtype=object) result = arr.astype(int) tm.assert_series_equal(result, Series(np.arange(1, 5))) def test_astype_datetime(self): - s = Series(iNaT, dtype='M8[ns]', index=range(5)) + s = Series(iNaT, dtype="M8[ns]", index=range(5)) - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ s = 
Series([datetime(2001, 1, 2, 0, 0)]) - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) s[1] = np.nan - assert s.dtype == 'M8[ns]' + assert s.dtype == "M8[ns]" - s = s.astype('O') + s = s.astype("O") assert s.dtype == np.object_ def test_astype_datetime64tz(self): - s = Series(date_range('20130101', periods=3, tz='US/Eastern')) + s = Series(date_range("20130101", periods=3, tz="US/Eastern")) # astype result = s.astype(object) expected = Series(s.astype(object), dtype=object) tm.assert_series_equal(result, expected) - result = Series(s.values).dt.tz_localize('UTC').dt.tz_convert(s.dt.tz) + result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz) tm.assert_series_equal(result, s) # astype - object, preserves on construction @@ -121,24 +127,24 @@ def test_astype_datetime64tz(self): tm.assert_series_equal(result, expected) # astype - datetime64[ns, tz] - result = Series(s.values).astype('datetime64[ns, US/Eastern]') + result = Series(s.values).astype("datetime64[ns, US/Eastern]") tm.assert_series_equal(result, s) result = Series(s.values).astype(s.dtype) tm.assert_series_equal(result, s) - result = s.astype('datetime64[ns, CET]') - expected = Series(date_range('20130101 06:00:00', periods=3, tz='CET')) + result = s.astype("datetime64[ns, CET]") + expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, np.str_]) - @pytest.mark.parametrize("series", [Series([string.digits * 10, - tm.rands(63), - tm.rands(64), - tm.rands(1000)]), - Series([string.digits * 10, - tm.rands(63), - tm.rands(64), np.nan, 1.0])]) + @pytest.mark.parametrize( + "series", + [ + Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), + Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]), + ], + ) def test_astype_str_map(self, dtype, series): # see gh-4405 result = series.astype(dtype) @@ -147,22 +153,22 @@ def test_astype_str_map(self, dtype, series): def test_astype_str_cast(self): # see gh-9757 - ts = Series([Timestamp('2010-01-04 00:00:00')]) + ts = Series([Timestamp("2010-01-04 00:00:00")]) s = ts.astype(str) - expected = Series([str('2010-01-04')]) + expected = Series([str("2010-01-04")]) tm.assert_series_equal(s, expected) - ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')]) + ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) s = ts.astype(str) - expected = Series([str('2010-01-04 00:00:00-05:00')]) + expected = Series([str("2010-01-04 00:00:00-05:00")]) tm.assert_series_equal(s, expected) - td = Series([Timedelta(1, unit='d')]) + td = Series([Timedelta(1, unit="d")]) s = td.astype(str) - expected = Series([str('1 days 00:00:00.000000000')]) + expected = Series([str("1 days 00:00:00.000000000")]) tm.assert_series_equal(s, expected) def test_astype_unicode(self): @@ -171,14 +177,13 @@ def test_astype_unicode(self): digits = string.digits test_series = [ Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), - Series(['データーサイエンス、お前はもう死んでいる']), + Series(["データーサイエンス、お前はもう死んでいる"]), ] former_encoding = None if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(['野菜食べないとやばい' - .encode("utf-8")])) + test_series.append(Series(["野菜食べないとやばい".encode("utf-8")])) for s in test_series: res = s.astype("unicode") @@ -193,22 +198,23 @@ def test_astype_unicode(self): @pytest.mark.parametrize("dtype_class", [dict, Series]) def test_astype_dict_like(self, 
dtype_class): # see gh-7271 - s = Series(range(0, 10, 2), name='abc') + s = Series(range(0, 10, 2), name="abc") - dt1 = dtype_class({'abc': str}) + dt1 = dtype_class({"abc": str}) result = s.astype(dt1) - expected = Series(['0', '2', '4', '6', '8'], name='abc') + expected = Series(["0", "2", "4", "6", "8"], name="abc") tm.assert_series_equal(result, expected) - dt2 = dtype_class({'abc': 'float64'}) + dt2 = dtype_class({"abc": "float64"}) result = s.astype(dt2) - expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', - name='abc') + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc") tm.assert_series_equal(result, expected) - dt3 = dtype_class({'abc': str, 'def': str}) - msg = ("Only the Series name can be used for the key in Series dtype" - r" mappings\.") + dt3 = dtype_class({"abc": str, "def": str}) + msg = ( + "Only the Series name can be used for the key in Series dtype" + r" mappings\." + ) with pytest.raises(KeyError, match=msg): s.astype(dt3) @@ -225,17 +231,18 @@ def test_astype_dict_like(self, dtype_class): def test_astype_categories_deprecation_raises(self): # deprecated 17636 - s = Series(['a', 'b', 'a']) + s = Series(["a", "b", "a"]) with pytest.raises(ValueError, match="Got an unexpected"): - s.astype('category', categories=['a', 'b'], ordered=True) + s.astype("category", categories=["a", "b"], ordered=True) - @pytest.mark.parametrize('none, warning', [ - (None, None), (ordered_sentinel, FutureWarning)]) + @pytest.mark.parametrize( + "none, warning", [(None, None), (ordered_sentinel, FutureWarning)] + ) def test_astype_category_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed - cdt1 = CategoricalDtype(categories=list('cdab'), ordered=True) - cdt2 = CategoricalDtype(categories=list('cedafb'), ordered=none) - s = Series(list('abcdaba'), dtype=cdt1) + cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) + cdt2 = CategoricalDtype(categories=list("cedafb"), ordered=none) + s = Series(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning, check_stacklevel=False): s.astype(cdt2) @@ -243,25 +250,25 @@ def test_astype_from_categorical(self): items = ["a", "b", "c", "a"] s = Series(items) exp = Series(Categorical(items)) - res = s.astype('category') + res = s.astype("category") tm.assert_series_equal(res, exp) items = [1, 2, 3, 1] s = Series(items) exp = Series(Categorical(items)) - res = s.astype('category') + res = s.astype("category") tm.assert_series_equal(res, exp) - df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], - "vals": [1, 2, 3, 4, 5, 6]}) + df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]}) cats = Categorical([1, 2, 3, 4, 5, 6]) exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) - df = DataFrame({"cats": ['a', 'b', 'b', 'a', 'a', 'd'], - "vals": [1, 2, 3, 4, 5, 6]}) - cats = Categorical(['a', 'b', 'b', 'a', 'a', 'd']) + df = DataFrame( + {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]} + ) + cats = Categorical(["a", "b", "b", "a", "a", "d"]) exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]}) df["cats"] = df["cats"].astype("category") tm.assert_frame_equal(exp_df, df) @@ -273,57 +280,56 @@ def test_astype_from_categorical(self): res = s.astype(CategoricalDtype(None, ordered=True)) tm.assert_series_equal(res, exp) - exp = Series(Categorical(lst, categories=list('abcdef'), ordered=True)) - res = s.astype(CategoricalDtype(list('abcdef'), 
ordered=True)) + exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True)) + res = s.astype(CategoricalDtype(list("abcdef"), ordered=True)) tm.assert_series_equal(res, exp) def test_astype_categorical_to_other(self): value = np.random.RandomState(0).randint(0, 10000, 100) - df = DataFrame({'value': value}) + df = DataFrame({"value": value}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + df = df.sort_values(by=["value"], ascending=True) + df["value_group"] = pd.cut( + df.value, range(0, 10500, 500), right=False, labels=cat_labels + ) - s = df['value_group'] + s = df["value_group"] expected = s - tm.assert_series_equal(s.astype('category'), expected) + tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = (r"could not convert string to float|" - r"invalid literal for float\(\)") + msg = r"could not convert string to float|" r"invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): - s.astype('float64') + s.astype("float64") - cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) - exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) - tm.assert_series_equal(cat.astype('str'), exp) - s2 = Series(Categorical(['1', '2', '3', '4'])) + cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) + exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_series_equal(cat.astype("str"), exp) + s2 = Series(Categorical(["1", "2", "3", "4"])) exp2 = Series([1, 2, 3, 4]).astype(int) - tm.assert_series_equal(s2.astype('int'), exp2) + tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same # values def cmp(a, b): - tm.assert_almost_equal( - np.sort(np.unique(a)), np.sort(np.unique(b))) + tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b))) - expected = Series(np.array(s.values), name='value_group') - cmp(s.astype('object'), expected) + expected = Series(np.array(s.values), name="value_group") + cmp(s.astype("object"), expected) cmp(s.astype(np.object_), expected) # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) # valid conversion - for valid in [lambda x: x.astype('category'), - lambda x: x.astype(CategoricalDtype()), - lambda x: x.astype('object').astype('category'), - lambda x: x.astype('object').astype( - CategoricalDtype()) - ]: + for valid in [ + lambda x: x.astype("category"), + lambda x: x.astype(CategoricalDtype()), + lambda x: x.astype("object").astype("category"), + lambda x: x.astype("object").astype(CategoricalDtype()), + ]: result = valid(s) # compare series values @@ -331,21 +337,26 @@ def cmp(a, b): tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) - msg = (r"invalid type for astype") - for invalid in [lambda x: x.astype(Categorical), - lambda x: x.astype('object').astype(Categorical)]: + msg = ( + r"invalid type for astype" + ) + for invalid in [ + lambda x: x.astype(Categorical), + lambda x: x.astype("object").astype(Categorical), + ]: with pytest.raises(TypeError, match=msg): invalid(s) - @pytest.mark.parametrize('name', [None, 'foo']) - @pytest.mark.parametrize('dtype_ordered', [True, False]) - @pytest.mark.parametrize('series_ordered', [True, False]) - def 
test_astype_categorical_to_categorical(self, name, dtype_ordered, - series_ordered): + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("series_ordered", [True, False]) + def test_astype_categorical_to_categorical( + self, name, dtype_ordered, series_ordered + ): # GH 10696/18593 - s_data = list('abcaacbab') - s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered) + s_data = list("abcaacbab") + s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered) s = Series(s_data, dtype=s_dtype, name=name) # unspecified categories @@ -356,7 +367,7 @@ def test_astype_categorical_to_categorical(self, name, dtype_ordered, tm.assert_series_equal(result, expected) # different categories - dtype = CategoricalDtype(list('adc'), dtype_ordered) + dtype = CategoricalDtype(list("adc"), dtype_ordered) result = s.astype(dtype) expected = Series(s_data, name=name, dtype=dtype) tm.assert_series_equal(result, expected) @@ -364,54 +375,54 @@ def test_astype_categorical_to_categorical(self, name, dtype_ordered, if dtype_ordered is False: # not specifying ordered, so only test once expected = s - result = s.astype('category') + result = s.astype("category") tm.assert_series_equal(result, expected) def test_astype_categoricaldtype(self): - s = Series(['a', 'b', 'a']) - result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) - expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) + s = Series(["a", "b", "a"]) + result = s.astype(CategoricalDtype(["a", "b"], ordered=True)) + expected = Series(Categorical(["a", "b", "a"], ordered=True)) tm.assert_series_equal(result, expected) - result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) - expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) + result = s.astype(CategoricalDtype(["a", "b"], ordered=False)) + expected = Series(Categorical(["a", "b", "a"], ordered=False)) tm.assert_series_equal(result, expected) - result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) - expected = Series(Categorical(['a', 'b', 'a'], - categories=['a', 'b', 'c'], - ordered=False)) + result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False)) + expected = Series( + Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False) + ) tm.assert_series_equal(result, expected) - tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) + tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"])) - @pytest.mark.parametrize("dtype", [ - np.datetime64, - np.timedelta64, - ]) + @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) def test_astype_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 data = [1] s = Series(data) - msg = ((r"The '{dtype}' dtype has no unit\. " - r"Please pass in '{dtype}\[ns\]' instead.") - .format(dtype=dtype.__name__)) + msg = ( + r"The '{dtype}' dtype has no unit\. " + r"Please pass in '{dtype}\[ns\]' instead." + ).format(dtype=dtype.__name__) with pytest.raises(ValueError, match=msg): s.astype(dtype) - @pytest.mark.parametrize("dtype", np.typecodes['All']) + @pytest.mark.parametrize("dtype", np.typecodes["All"]) def test_astype_empty_constructor_equality(self, dtype): # see gh-15524 if dtype not in ( - "S", "V", # poor support (if any) currently - "M", "m" # Generic timestamps raise a ValueError. Already tested. + "S", + "V", # poor support (if any) currently + "M", + "m", # Generic timestamps raise a ValueError. Already tested. 
): init_empty = Series([], dtype=dtype) as_type_empty = Series([]).astype(dtype) tm.assert_series_equal(init_empty, as_type_empty) - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_complex(self): # see gh-4819: complex access for ndarray compat a = np.arange(5, dtype=np.float64) @@ -435,59 +446,60 @@ def test_arg_for_errors_in_astype(self): # see gh-14878 s = Series([1, 2, 3]) - msg = (r"Expected value of kwarg 'errors' to be one of \['raise'," - r" 'ignore'\]\. Supplied value is 'False'") + msg = ( + r"Expected value of kwarg 'errors' to be one of \['raise'," + r" 'ignore'\]\. Supplied value is 'False'" + ) with pytest.raises(ValueError, match=msg): s.astype(np.float64, errors=False) - s.astype(np.int8, errors='raise') + s.astype(np.int8, errors="raise") def test_intercept_astype_object(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) # This test no longer makes sense, as # Series is by default already M8[ns]. - expected = series.astype('object') + expected = series.astype("object") - df = DataFrame({'a': series, - 'b': np.random.randn(len(series))}) - exp_dtypes = Series([np.dtype('datetime64[ns]'), - np.dtype('float64')], index=['a', 'b']) + df = DataFrame({"a": series, "b": np.random.randn(len(series))}) + exp_dtypes = Series( + [np.dtype("datetime64[ns]"), np.dtype("float64")], index=["a", "b"] + ) tm.assert_series_equal(df.dtypes, exp_dtypes) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - df = DataFrame({'a': series, 'b': ['foo'] * len(series)}) + df = DataFrame({"a": series, "b": ["foo"] * len(series)}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() def test_series_to_categorical(self): # see gh-16524: test conversion of Series to Categorical - series = Series(['a', 'b', 'c']) + series = Series(["a", "b", "c"]) - result = Series(series, dtype='category') - expected = Series(['a', 'b', 'c'], dtype='category') + result = Series(series, dtype="category") + expected = Series(["a", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) def test_infer_objects_series(self): # GH 11221 - actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects() + actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects() expected = Series([1, 2, 3]) tm.assert_series_equal(actual, expected) - actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects() - expected = Series([1., 2., 3., np.nan]) + actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects() + expected = Series([1.0, 2.0, 3.0, np.nan]) tm.assert_series_equal(actual, expected) # only soft conversions, unconvertable pass thru unchanged - actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) - .infer_objects()) - expected = Series([1, 2, 3, None, 'a']) + actual = Series(np.array([1, 2, 3, None, "a"], dtype="O")).infer_objects() + expected = Series([1, 2, 3, None, "a"]) - assert actual.dtype == 'object' + assert actual.dtype == "object" tm.assert_series_equal(actual, expected) def test_is_homogeneous_type(self): @@ -495,10 +507,13 @@ def test_is_homogeneous_type(self): assert Series([1, 2])._is_homogeneous_type assert Series(pd.Categorical([1, 2]))._is_homogeneous_type - @pytest.mark.parametrize("data", [ - pd.period_range("2000", periods=4), - pd.IntervalIndex.from_breaks([1, 2, 3, 4]) - ]) + @pytest.mark.parametrize( + "data", + [ + pd.period_range("2000", periods=4), + pd.IntervalIndex.from_breaks([1, 2, 
3, 4]), + ], + ) def test_values_compatibility(self, data): # https://github.com/pandas-dev/pandas/issues/23995 result = pd.Series(data).values diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 392aa48e200a2..4a914e4fb0f2c 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -27,13 +27,13 @@ def test_unique(): result = s.unique() assert len(result) == 2 - s = Series([1.2345] * 100, dtype='f4') + s = Series([1.2345] * 100, dtype="f4") s[::2] = np.nan result = s.unique() assert len(result) == 2 # NAs in object arrays #714 - s = Series(['foo'] * 100, dtype='O') + s = Series(["foo"] * 100, dtype="O") s[::2] = np.nan result = s.unique() assert len(result) == 2 @@ -48,8 +48,7 @@ def test_unique(): s = Series(Categorical([])) tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) s = Series(Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), - check_dtype=False) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) def test_unique_data_ownership(): @@ -57,14 +56,18 @@ def test_unique_data_ownership(): Series(Series(["a", "c", "b"]).unique()).sort_values() -@pytest.mark.parametrize('data, expected', [ - (np.random.randint(0, 10, size=1000), False), - (np.arange(1000), True), - ([], True), - ([np.nan], True), - (['foo', 'bar', np.nan], True), - (['foo', 'foo', np.nan], False), - (['foo', 'bar', np.nan, np.nan], False)]) +@pytest.mark.parametrize( + "data, expected", + [ + (np.random.randint(0, 10, size=1000), False), + (np.arange(1000), True), + ([], True), + ([np.nan], True), + (["foo", "bar", np.nan], True), + (["foo", "foo", np.nan], False), + (["foo", "bar", np.nan, np.nan], False), + ], +) def test_is_unique(data, expected): # GH11946 / GH25180 s = Series(data) @@ -89,17 +92,18 @@ def __ne__(self, other): @pytest.mark.parametrize( - 'keep, expected', + "keep, expected", [ - ('first', Series([False, False, False, False, True, True, False])), - ('last', Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])) - ]) + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) def test_drop_duplicates(any_numpy_dtype, keep, expected): tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) - if tc.dtype == 'bool': - pytest.skip('tested separately in test_drop_duplicates_bool') + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) @@ -108,10 +112,14 @@ def test_drop_duplicates(any_numpy_dtype, keep, expected): tm.assert_series_equal(sc, tc[~expected]) -@pytest.mark.parametrize('keep, expected', - [('first', Series([False, False, True, True])), - ('last', Series([True, True, False, False])), - (False, Series([True, True, True, True]))]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) @@ -122,23 +130,29 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(sc, tc[~expected]) 
-@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True], name='name')), - ('last', Series([True, True, False, False, False], name='name')), - (False, Series([True, True, True, False, True], name='name')) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) def test_duplicated_keep(keep, expected): - s = Series(['a', 'b', 'b', 'c', 'a'], name='name') + s = Series(["a", "b", "b", "c", "a"], name="name") result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) -]) +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) def test_duplicated_nan_none(keep, expected): s = Series([np.nan, 3, 3, None, np.nan], dtype=object) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index 0b62624ad2696..d35198ca70f37 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -16,7 +16,7 @@ class TestSeriesInternals: def test_convert(self): # Tests: All to nans, coerce, true # Test coercion returns correct type - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) results = s._convert(datetime=True, coerce=True) expected = Series([NaT] * 3) assert_series_equal(results, expected) @@ -25,7 +25,7 @@ def test_convert(self): expected = Series([np.nan] * 3) assert_series_equal(results, expected) - expected = Series([NaT] * 3, dtype=np.dtype('m8[ns]')) + expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]")) results = s._convert(timedelta=True, coerce=True) assert_series_equal(results, expected) @@ -33,7 +33,7 @@ def test_convert(self): td = dt - datetime(2000, 1, 1, 0, 0) # Test coercion with mixed types - s = Series(['a', '3.1415', dt, td]) + s = Series(["a", "3.1415", dt, td]) results = s._convert(datetime=True, coerce=True) expected = Series([NaT, NaT, dt, NaT]) assert_series_equal(results, expected) @@ -43,8 +43,7 @@ def test_convert(self): assert_series_equal(results, expected) results = s._convert(timedelta=True, coerce=True) - expected = Series([NaT, NaT, NaT, td], - dtype=np.dtype('m8[ns]')) + expected = Series([NaT, NaT, NaT, td], dtype=np.dtype("m8[ns]")) assert_series_equal(results, expected) # Test standard conversion returns original @@ -57,94 +56,117 @@ def test_convert(self): assert_series_equal(results, s) # test pass-through and non-conversion when other types selected - s = Series(['1.0', '2.0', '3.0']) + s = Series(["1.0", "2.0", "3.0"]) results = s._convert(datetime=True, numeric=True, timedelta=True) expected = Series([1.0, 2.0, 3.0]) assert_series_equal(results, expected) results = s._convert(True, False, True) assert_series_equal(results, s) - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], - dtype='O') + s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)], dtype="O") results = s._convert(datetime=True, numeric=True, timedelta=True) - expected = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, - 0)]) + expected = 
Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 1, 0, 0)]) assert_series_equal(results, expected) results = s._convert(datetime=False, numeric=True, timedelta=True) assert_series_equal(results, s) td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) - s = Series([td, td], dtype='O') + s = Series([td, td], dtype="O") results = s._convert(datetime=True, numeric=True, timedelta=True) expected = Series([td, td]) assert_series_equal(results, expected) results = s._convert(True, True, False) assert_series_equal(results, s) - s = Series([1., 2, 3], index=['a', 'b', 'c']) + s = Series([1.0, 2, 3], index=["a", "b", "c"]) result = s._convert(numeric=True) assert_series_equal(result, s) # force numeric conversion - r = s.copy().astype('O') - r['a'] = '1' + r = s.copy().astype("O") + r["a"] = "1" result = r._convert(numeric=True) assert_series_equal(result, s) - r = s.copy().astype('O') - r['a'] = '1.' + r = s.copy().astype("O") + r["a"] = "1." result = r._convert(numeric=True) assert_series_equal(result, s) - r = s.copy().astype('O') - r['a'] = 'garbled' + r = s.copy().astype("O") + r["a"] = "garbled" result = r._convert(numeric=True) expected = s.copy() - expected['a'] = np.nan + expected["a"] = np.nan assert_series_equal(result, expected) # GH 4119, not converting a mixed type (e.g.floats and object) - s = Series([1, 'na', 3, 4]) + s = Series([1, "na", 3, 4]) result = s._convert(datetime=True, numeric=True) expected = Series([1, np.nan, 3, 4]) assert_series_equal(result, expected) - s = Series([1, '', 3, 4]) + s = Series([1, "", 3, 4]) result = s._convert(datetime=True, numeric=True) assert_series_equal(result, expected) # dates - s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0)]) - s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), - datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1, - Timestamp('20010104'), '20010105'], dtype='O') + s = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + ] + ) + s2 = Series( + [ + datetime(2001, 1, 1, 0, 0), + datetime(2001, 1, 2, 0, 0), + datetime(2001, 1, 3, 0, 0), + "foo", + 1.0, + 1, + Timestamp("20010104"), + "20010105", + ], + dtype="O", + ) result = s._convert(datetime=True) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103')], dtype='M8[ns]') + expected = Series( + [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")], + dtype="M8[ns]", + ) assert_series_equal(result, expected) result = s._convert(datetime=True, coerce=True) assert_series_equal(result, expected) - expected = Series([Timestamp('20010101'), Timestamp('20010102'), - Timestamp('20010103'), NaT, NaT, NaT, - Timestamp('20010104'), Timestamp('20010105')], - dtype='M8[ns]') - result = s2._convert(datetime=True, numeric=False, timedelta=False, - coerce=True) + expected = Series( + [ + Timestamp("20010101"), + Timestamp("20010102"), + Timestamp("20010103"), + NaT, + NaT, + NaT, + Timestamp("20010104"), + Timestamp("20010105"), + ], + dtype="M8[ns]", + ) + result = s2._convert(datetime=True, numeric=False, timedelta=False, coerce=True) assert_series_equal(result, expected) result = s2._convert(datetime=True, coerce=True) assert_series_equal(result, expected) - s = Series(['foo', 'bar', 1, 1.0], dtype='O') + s = Series(["foo", "bar", 1, 1.0], dtype="O") result = s._convert(datetime=True, coerce=True) expected = Series([NaT] * 2 + [Timestamp(1)] * 2) assert_series_equal(result, expected) # preserver if non-object - s = 
Series([1], dtype='float32') + s = Series([1], dtype="float32") result = s._convert(datetime=True, coerce=True) assert_series_equal(result, s) @@ -155,7 +177,7 @@ def test_convert(self): # dateutil parses some single letters into today's value as a date expected = Series([NaT]) - for x in 'abcdefghijklmnopqrstuvwxyz': + for x in "abcdefghijklmnopqrstuvwxyz": s = Series([x]) result = s._convert(datetime=True, coerce=True) assert_series_equal(result, expected) @@ -164,7 +186,7 @@ def test_convert(self): assert_series_equal(result, expected) def test_convert_no_arg_error(self): - s = Series(['1.0', '2']) + s = Series(["1.0", "2"]) msg = r"At least one of datetime, numeric or timedelta must be True\." with pytest.raises(ValueError, match=msg): s._convert() @@ -172,7 +194,7 @@ def test_convert_no_arg_error(self): def test_convert_preserve_bool(self): s = Series([1, True, 3, 5], dtype=object) r = s._convert(datetime=True, numeric=True) - e = Series([1, 1, 3, 5], dtype='i8') + e = Series([1, 1, 3, 5], dtype="i8") tm.assert_series_equal(r, e) def test_convert_preserve_all_bool(self): @@ -196,17 +218,17 @@ def test_astype_no_pandas_dtype(self): tm.assert_series_equal(result, ser) def test_from_array(self): - result = pd.Series(pd.array(['1H', '2H'], dtype='timedelta64[ns]')) + result = pd.Series(pd.array(["1H", "2H"], dtype="timedelta64[ns]")) assert result._data.blocks[0].is_extension is False - result = pd.Series(pd.array(['2015'], dtype='datetime64[ns]')) + result = pd.Series(pd.array(["2015"], dtype="datetime64[ns]")) assert result._data.blocks[0].is_extension is False def test_from_list_dtype(self): - result = pd.Series(['1H', '2H'], dtype='timedelta64[ns]') + result = pd.Series(["1H", "2H"], dtype="timedelta64[ns]") assert result._data.blocks[0].is_extension is False - result = pd.Series(['2015'], dtype='datetime64[ns]') + result = pd.Series(["2015"], dtype="datetime64[ns]") assert result._data.blocks[0].is_extension is False @@ -214,10 +236,10 @@ def test_hasnans_unchached_for_series(): # GH#19700 idx = pd.Index([0, 1]) assert idx.hasnans is False - assert 'hasnans' in idx._cache + assert "hasnans" in idx._cache ser = idx.to_series() assert ser.hasnans is False - assert not hasattr(ser, '_cache') + assert not hasattr(ser, "_cache") ser.iloc[-1] = np.nan assert ser.hasnans is True assert Series.hasnans.__doc__ == pd.Index.hasnans.__doc__ diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 39c217e7d95b1..0238314122462 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -9,16 +9,18 @@ from pandas import DataFrame, Series import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal, ensure_clean) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, + ensure_clean, +) from pandas.io.common import _get_handle class TestSeriesToCSV: - def read_csv(self, path, **kwargs): - params = dict(squeeze=True, index_col=0, - header=None, parse_dates=True) + params = dict(squeeze=True, index_col=0, header=None, parse_dates=True) params.update(**kwargs) header = params.get("header") @@ -77,8 +79,9 @@ def test_from_csv(self, datetime_series, string_series): outfile.write("1998-01-01|1.0\n1999-01-01|2.0") series = self.read_csv(path, sep="|") - check_series = Series({datetime(1998, 1, 1): 1.0, - datetime(1999, 1, 1): 2.0}) + check_series = Series( + {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} + ) assert_series_equal(check_series, series) series = 
self.read_csv(path, sep="|", parse_dates=False) @@ -93,7 +96,7 @@ def test_to_csv(self, datetime_series): with io.open(path, newline=None) as f: lines = f.readlines() - assert (lines[1] != '\n') + assert lines[1] != "\n" datetime_series.to_csv(path, index=False, header=False) arr = np.loadtxt(path) @@ -120,9 +123,9 @@ def test_to_csv_float_format(self): assert_series_equal(rs, xp) def test_to_csv_list_entries(self): - s = Series(['jack and jill', 'jesse and frank']) + s = Series(["jack and jill", "jesse and frank"]) - split = s.str.split(r'\s+and\s+') + split = s.str.split(r"\s+and\s+") buf = StringIO() split.to_csv(buf, header=False) @@ -135,75 +138,91 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path_or_buf=None, header=False) assert isinstance(csv_str, str) - @pytest.mark.parametrize('s,encoding', [ - (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X'), None), - # GH 21241, 21118 - (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'), - (Series(["123", "你好", "世界"], name="中文"), 'gb2312'), - (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), 'cp737') - ]) + @pytest.mark.parametrize( + "s,encoding", + [ + ( + Series([0.123456, 0.234567, 0.567567], index=["A", "B", "C"], name="X"), + None, + ), + # GH 21241, 21118 + (Series(["abc", "def", "ghi"], name="X"), "ascii"), + (Series(["123", "你好", "世界"], name="中文"), "gb2312"), + (Series(["123", "Γειά σου", "Κόσμε"], name="Ελληνικά"), "cp737"), + ], + ) def test_to_csv_compression(self, s, encoding, compression): with ensure_clean() as filename: - s.to_csv(filename, compression=compression, encoding=encoding, - header=True) + s.to_csv(filename, compression=compression, encoding=encoding, header=True) # test the round trip - to_csv -> read_csv - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - f, _handles = _get_handle(filename, 'w', compression=compression, - encoding=encoding) + f, _handles = _get_handle( + filename, "w", compression=compression, encoding=encoding + ) with f: s.to_csv(f, encoding=encoding, header=True) - result = pd.read_csv(filename, compression=compression, - encoding=encoding, index_col=0, squeeze=True) + result = pd.read_csv( + filename, + compression=compression, + encoding=encoding, + index_col=0, + squeeze=True, + ) assert_series_equal(s, result) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode(encoding or 'utf8') + text = fh.read().decode(encoding or "utf8") assert s.name in text with tm.decompress_file(filename, compression) as fh: - assert_series_equal(s, pd.read_csv(fh, - index_col=0, - squeeze=True, - encoding=encoding)) + assert_series_equal( + s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) + ) class TestSeriesIO: - def test_to_frame(self, datetime_series): datetime_series.name = None rs = datetime_series.to_frame() xp = pd.DataFrame(datetime_series.values, index=datetime_series.index) assert_frame_equal(rs, xp) - datetime_series.name = 'testname' + datetime_series.name = "testname" rs = datetime_series.to_frame() - xp = pd.DataFrame(dict(testname=datetime_series.values), - index=datetime_series.index) + xp = pd.DataFrame( + dict(testname=datetime_series.values), index=datetime_series.index + ) assert_frame_equal(rs, xp) - rs = 
datetime_series.to_frame(name='testdifferent') - xp = pd.DataFrame(dict(testdifferent=datetime_series.values), - index=datetime_series.index) + rs = datetime_series.to_frame(name="testdifferent") + xp = pd.DataFrame( + dict(testdifferent=datetime_series.values), index=datetime_series.index + ) assert_frame_equal(rs, xp) def test_timeseries_periodindex(self): # GH2891 from pandas import period_range - prng = period_range('1/1/2011', '1/1/2012', freq='M') + + prng = period_range("1/1/2011", "1/1/2012", freq="M") ts = Series(np.random.randn(len(prng)), prng) new_ts = tm.round_trip_pickle(ts) - assert new_ts.index.freq == 'M' + assert new_ts.index.freq == "M" def test_pickle_preserve_name(self): - for n in [777, 777., 'name', datetime(2001, 11, 11), (1, 2)]: + for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, 2)]: unpickled = self._pickle_roundtrip_name(tm.makeTimeSeries(name=n)) assert unpickled.name == n @@ -218,7 +237,6 @@ def test_to_frame_expanddim(self): # GH 9762 class SubclassedSeries(Series): - @property def _constructor_expanddim(self): return SubclassedFrame @@ -226,22 +244,20 @@ def _constructor_expanddim(self): class SubclassedFrame(DataFrame): pass - s = SubclassedSeries([1, 2, 3], name='X') + s = SubclassedSeries([1, 2, 3], name="X") result = s.to_frame() assert isinstance(result, SubclassedFrame) - expected = SubclassedFrame({'X': [1, 2, 3]}) + expected = SubclassedFrame({"X": [1, 2, 3]}) assert_frame_equal(result, expected) - @pytest.mark.parametrize('mapping', ( - dict, - collections.defaultdict(list), - collections.OrderedDict)) + @pytest.mark.parametrize( + "mapping", (dict, collections.defaultdict(list), collections.OrderedDict) + ) def test_to_dict(self, mapping, datetime_series): # GH16122 tm.assert_series_equal( - Series(datetime_series.to_dict(mapping), name='ts'), - datetime_series) + Series(datetime_series.to_dict(mapping), name="ts"), datetime_series + ) from_method = Series(datetime_series.to_dict(collections.Counter)) - from_constructor = Series(collections - .Counter(datetime_series.iteritems())) + from_constructor = Series(collections.Counter(datetime_series.iteritems())) tm.assert_series_equal(from_method, from_constructor) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 94050f7526444..6012f3986e955 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -11,8 +11,17 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, IntervalIndex, MultiIndex, NaT, Series, - Timestamp, date_range, isna) + Categorical, + DataFrame, + Index, + IntervalIndex, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, +) from pandas.core.series import remove_na import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -23,7 +32,8 @@ def _skip_if_no_pchip(): from scipy.interpolate import pchip_interpolate # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.pchip missing') + + pytest.skip("scipy.interpolate.pchip missing") def _skip_if_no_akima(): @@ -31,16 +41,16 @@ def _skip_if_no_akima(): from scipy.interpolate import Akima1DInterpolator # noqa except ImportError: import pytest - pytest.skip('scipy.interpolate.Akima1DInterpolator missing') + + pytest.skip("scipy.interpolate.Akima1DInterpolator missing") -def _simple_ts(start, end, freq='D'): +def _simple_ts(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) class 
TestSeriesMissingData: - def test_remove_na_deprecation(self): # see gh-16971 with tm.assert_produces_warning(FutureWarning): @@ -48,40 +58,74 @@ def test_remove_na_deprecation(self): def test_timedelta_fillna(self): # GH 3371 - s = Series([Timestamp('20130101'), Timestamp('20130101'), - Timestamp('20130102'), Timestamp('20130103 9:01:01')]) + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) td = s.diff() # reg fillna with tm.assert_produces_warning(FutureWarning): result = td.fillna(0) - expected = Series([timedelta(0), timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(0), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) # interpreted as seconds, deprecated with tm.assert_produces_warning(FutureWarning): result = td.fillna(1) - expected = Series([timedelta(seconds=1), - timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(timedelta(days=1, seconds=1)) - expected = Series([timedelta(days=1, seconds=1), timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(days=1, seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(np.timedelta64(int(1e9))) - expected = Series([timedelta(seconds=1), timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)]) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) assert_series_equal(result, expected) result = td.fillna(NaT) - expected = Series([NaT, timedelta(0), timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1)], - dtype='m8[ns]') + expected = Series( + [ + NaT, + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ], + dtype="m8[ns]", + ) assert_series_equal(result, expected) # ffill @@ -102,14 +146,26 @@ def test_timedelta_fillna(self): def test_datetime64_fillna(self): - s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp( - '20130102'), Timestamp('20130103 9:01:01')]) + s = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) s[2] = np.nan # reg fillna - result = s.fillna(Timestamp('20130104')) - expected = Series([Timestamp('20130101'), Timestamp( - '20130101'), Timestamp('20130104'), Timestamp('20130103 9:01:01')]) + result = s.fillna(Timestamp("20130104")) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130104"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) result = s.fillna(NaT) @@ -118,260 +174,367 @@ def test_datetime64_fillna(self): # ffill result = s.ffill() - expected = Series([Timestamp('20130101'), Timestamp( - '20130101'), Timestamp('20130101'), Timestamp('20130103 9:01:01')]) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) # bfill result = s.bfill() - expected = Series([Timestamp('20130101'), Timestamp('20130101'), 
- Timestamp('20130103 9:01:01'), Timestamp( - '20130103 9:01:01')]) + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) assert_series_equal(result, expected) # GH 6587 # make sure that we are treating as integer when filling # this also tests inference of a datetime-like with NaT's - s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001']) + s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"]) expected = Series( - ['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', - '2013-08-05 15:30:00.000001'], dtype='M8[ns]') - result = s.fillna(method='backfill') + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = s.fillna(method="backfill") assert_series_equal(result, expected) def test_datetime64_tz_fillna(self): - for tz in ['US/Eastern', 'Asia/Tokyo']: + for tz in ["US/Eastern", "Asia/Tokyo"]: # DatetimeBlock - s = Series([Timestamp('2011-01-01 10:00'), pd.NaT, - Timestamp('2011-01-03 10:00'), pd.NaT]) + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) null_loc = pd.Series([False, True, False, True]) - result = s.fillna(pd.Timestamp('2011-01-02 10:00')) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-02 10:00')]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) tm.assert_series_equal(expected, result) # check s is not changed tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz)) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-02 10:00', tz=tz)]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna('AAA') - expected = Series([Timestamp('2011-01-01 10:00'), 'AAA', - Timestamp('2011-01-03 10:00'), 'AAA'], - dtype=object) + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00'), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00'), - 
Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00'), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00"), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # DatetimeBlockTZ - idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT, - '2011-01-03 10:00', pd.NaT], tz=tz) + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz + ) s = pd.Series(idx) - assert s.dtype == 'datetime64[ns, {0}]'.format(tz) + assert s.dtype == "datetime64[ns, {0}]".format(tz) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-02 10:00')]) + result = s.fillna(pd.Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', tz=tz)) - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00', - '2011-01-03 10:00', '2011-01-02 10:00'], - tz=tz) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(pd.Timestamp('2011-01-02 10:00', - tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex(['2011-01-01 10:00', '2011-01-02 10:00', - '2011-01-03 10:00', '2011-01-02 10:00'], - tz=tz) + result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = pd.DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna('AAA') - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), 'AAA', - Timestamp('2011-01-03 10:00', tz=tz), 'AAA'], - dtype=object) + result = s.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: pd.Timestamp('2011-01-04 10:00')}) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-04 10:00')]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna({1: pd.Timestamp('2011-01-02 10:00', tz=tz), - 3: 
pd.Timestamp('2011-01-04 10:00', tz=tz)}) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2011-01-02 10:00', tz=tz), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2011-01-04 10:00', tz=tz)]) + result = s.fillna( + { + 1: pd.Timestamp("2011-01-02 10:00", tz=tz), + 3: pd.Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp('20130101')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2013-01-01'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2013-01-01')]) + result = s.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) - result = s.fillna(Timestamp('20130101', tz='US/Pacific')) - expected = Series([Timestamp('2011-01-01 10:00', tz=tz), - Timestamp('2013-01-01', tz='US/Pacific'), - Timestamp('2011-01-03 10:00', tz=tz), - Timestamp('2013-01-01', tz='US/Pacific')]) + result = s.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) tm.assert_series_equal(expected, result) tm.assert_series_equal(pd.isna(s), null_loc) # with timezone # GH 15855 - df = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]) - exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]) - assert_series_equal(df.fillna(method='pad'), exp) - - df = pd.Series([pd.NaT, pd.Timestamp('2012-11-11 00:00:00+01:00')]) - exp = pd.Series([pd.Timestamp('2012-11-11 00:00:00+01:00'), - pd.Timestamp('2012-11-11 00:00:00+01:00')]) - assert_series_equal(df.fillna(method='bfill'), exp) + df = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + assert_series_equal(df.fillna(method="pad"), exp) + + df = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) + exp = pd.Series( + [ + pd.Timestamp("2012-11-11 00:00:00+01:00"), + pd.Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + assert_series_equal(df.fillna(method="bfill"), exp) def test_fillna_consistency(self): # GH 16402 # fillna with a tz aware to a tz-naive, should result in object - s = Series([Timestamp('20130101'), pd.NaT]) + s = Series([Timestamp("20130101"), pd.NaT]) - result = s.fillna(Timestamp('20130101', tz='US/Eastern')) - expected = Series([Timestamp('20130101'), - Timestamp('2013-01-01', tz='US/Eastern')], - dtype='object') + result = s.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) assert_series_equal(result, expected) # where (we ignore the errors=) - result = s.where([True, False], - Timestamp('20130101', tz='US/Eastern'), - errors='ignore') + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) 
assert_series_equal(result, expected) - result = s.where([True, False], - Timestamp('20130101', tz='US/Eastern'), - errors='ignore') + result = s.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) assert_series_equal(result, expected) # with a non-datetime - result = s.fillna('foo') - expected = Series([Timestamp('20130101'), - 'foo']) + result = s.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) assert_series_equal(result, expected) # assignment s2 = s.copy() - s2[1] = 'foo' + s2[1] = "foo" assert_series_equal(s2, expected) def test_datetime64tz_fillna_round_issue(self): # GH 14872 - data = pd.Series([pd.NaT, pd.NaT, - datetime(2016, 12, 12, 22, 24, 6, 100001, - tzinfo=pytz.utc)]) + data = pd.Series( + [pd.NaT, pd.NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) - filled = data.fillna(method='bfill') + filled = data.fillna(method="bfill") - expected = pd.Series([datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, - 100001, tzinfo=pytz.utc)]) + expected = pd.Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) assert_series_equal(filled, expected) def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 - s = pd.Series([1., np.nan]) - result = s.fillna(0, downcast='infer') + s = pd.Series([1.0, np.nan]) + result = s.fillna(0, downcast="infer") expected = pd.Series([1, 0]) assert_series_equal(result, expected) # infer int64 from float64 when fillna value is a dict - s = pd.Series([1., np.nan]) - result = s.fillna({1: 0}, downcast='infer') + s = pd.Series([1.0, np.nan]) + result = s.fillna({1: 0}, downcast="infer") expected = pd.Series([1, 0]) assert_series_equal(result, expected) def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) - s.fillna(method='ffill', inplace=True) - assert_series_equal(s.fillna(method='ffill', inplace=False), s) + s.fillna(method="ffill", inplace=True) + assert_series_equal(s.fillna(method="ffill", inplace=False), s) def test_fillna_raise(self): s = Series(np.random.randint(-100, 100, 50)) - msg = ('"value" parameter must be a scalar or dict, but you passed a' - ' "list"') + msg = '"value" parameter must be a scalar or dict, but you passed a' ' "list"' with pytest.raises(TypeError, match=msg): s.fillna([1, 2]) - msg = ('"value" parameter must be a scalar or dict, but you passed a' - ' "tuple"') + msg = '"value" parameter must be a scalar or dict, but you passed a' ' "tuple"' with pytest.raises(TypeError, match=msg): s.fillna((1, 2)) # related GH 9217, make sure limit is an int and greater than 0 s = Series([1, 2, 3, None]) - msg = (r"Cannot specify both 'value' and 'method'\.|" - r"Limit must be greater than 0|" - "Limit must be an integer") - for limit in [-1, 0, 1., 2.]: - for method in ['backfill', 'bfill', 'pad', 'ffill', None]: + msg = ( + r"Cannot specify both 'value' and 'method'\.|" + r"Limit must be greater than 0|" + "Limit must be an integer" + ) + for limit in [-1, 0, 1.0, 2.0]: + for method in ["backfill", "bfill", "pad", "ffill", None]: with pytest.raises(ValueError, match=msg): s.fillna(1, limit=limit, method=method) def test_categorical_nan_equality(self): cat = Series(Categorical(["a", "b", "c", np.nan])) exp = Series([True, True, True, False]) - res = (cat == cat) + res = cat == cat 
tm.assert_series_equal(res, exp) def test_categorical_nan_handling(self): @@ -379,62 +542,63 @@ def test_categorical_nan_handling(self): # NaNs are represented as -1 in labels s = Series(Categorical(["a", "b", np.nan, "a"])) tm.assert_index_equal(s.cat.categories, Index(["a", "b"])) - tm.assert_numpy_array_equal(s.values.codes, - np.array([0, 1, -1, 0], dtype=np.int8)) - - @pytest.mark.parametrize('fill_value, expected_output', [ - ('a', ['a', 'a', 'b', 'a', 'a']), - ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), - ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), - ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), - (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), - (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), - (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), - (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) - ]) + tm.assert_numpy_array_equal( + s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8) + ) + + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + ("a", ["a", "a", "b", "a", "a"]), + ({1: "a", 3: "b", 4: "b"}, ["a", "a", "b", "b", "b"]), + ({1: "a"}, ["a", "a", "b", np.nan, np.nan]), + ({1: "a", 3: "b"}, ["a", "a", "b", "b", np.nan]), + (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), + (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), + (Series({1: "a", 3: "b"}), ["a", "a", "b", "b", np.nan]), + (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), + ], + ) def test_fillna_categorical(self, fill_value, expected_output): # GH 17033 # Test fillna for a Categorical series - data = ['a', np.nan, 'b', np.nan, np.nan] - s = Series(Categorical(data, categories=['a', 'b'])) - exp = Series(Categorical(expected_output, categories=['a', 'b'])) + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) + exp = Series(Categorical(expected_output, categories=["a", "b"])) tm.assert_series_equal(s.fillna(fill_value), exp) def test_fillna_categorical_raise(self): - data = ['a', np.nan, 'b', np.nan, np.nan] - s = Series(Categorical(data, categories=['a', 'b'])) + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b"])) - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna('d') + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna("d") - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna(Series('d')) + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna(Series("d")) - with pytest.raises(ValueError, - match="fill value must be in categories"): - s.fillna({1: 'd', 3: 'a'}) + with pytest.raises(ValueError, match="fill value must be in categories"): + s.fillna({1: "d", 3: "a"}) - msg = ('"value" parameter must be a scalar or ' - 'dict, but you passed a "list"') + msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): - s.fillna(['a', 'b']) + s.fillna(["a", "b"]) - msg = ('"value" parameter must be a scalar or ' - 'dict, but you passed a "tuple"') + msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): - s.fillna(('a', 'b')) + s.fillna(("a", "b")) - msg = ('"value" parameter must be a scalar, dict ' - 'or Series, but you passed a "DataFrame"') + msg = ( + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"' + ) with pytest.raises(TypeError, 
match=msg): - s.fillna(DataFrame({1: ['a'], 3: ['b']})) + s.fillna(DataFrame({1: ["a"], 3: ["b"]})) def test_fillna_nat(self): - series = Series([0, 1, 2, iNaT], dtype='M8[ns]') + series = Series([0, 1, 2, iNaT], dtype="M8[ns]") - filled = series.fillna(method='pad') + filled = series.fillna(method="pad") filled2 = series.fillna(value=series.values[2]) expected = series.copy() @@ -443,16 +607,16 @@ def test_fillna_nat(self): assert_series_equal(filled, expected) assert_series_equal(filled2, expected) - df = DataFrame({'A': series}) - filled = df.fillna(method='pad') + df = DataFrame({"A": series}) + filled = df.fillna(method="pad") filled2 = df.fillna(value=series.values[2]) - expected = DataFrame({'A': expected}) + expected = DataFrame({"A": expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected) - series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') + series = Series([iNaT, 0, 1, 2], dtype="M8[ns]") - filled = series.fillna(method='bfill') + filled = series.fillna(method="bfill") filled2 = series.fillna(value=series[1]) expected = series.copy() @@ -461,49 +625,49 @@ def test_fillna_nat(self): assert_series_equal(filled, expected) assert_series_equal(filled2, expected) - df = DataFrame({'A': series}) - filled = df.fillna(method='bfill') + df = DataFrame({"A": series}) + filled = df.fillna(method="bfill") filled2 = df.fillna(value=series[1]) - expected = DataFrame({'A': expected}) + expected = DataFrame({"A": expected}) assert_frame_equal(filled, expected) assert_frame_equal(filled2, expected) def test_isna_for_inf(self): - s = Series(['a', np.inf, np.nan, 1.0]) - with pd.option_context('mode.use_inf_as_na', True): + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_na", True): r = s.isna() dr = s.dropna() e = Series([False, True, True, False]) - de = Series(['a', 1.0], index=[0, 3]) + de = Series(["a", 1.0], index=[0, 3]) tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) def test_isnull_for_inf_deprecated(self): # gh-17115 - s = Series(['a', np.inf, np.nan, 1.0]) - with pd.option_context('mode.use_inf_as_null', True): + s = Series(["a", np.inf, np.nan, 1.0]) + with pd.option_context("mode.use_inf_as_null", True): r = s.isna() dr = s.dropna() e = Series([False, True, True, False]) - de = Series(['a', 1.0], index=[0, 3]) + de = Series(["a", 1.0], index=[0, 3]) tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) def test_fillna(self, datetime_series): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - tm.assert_series_equal(ts, ts.fillna(method='ffill')) + tm.assert_series_equal(ts, ts.fillna(method="ffill")) ts[2] = np.NaN - exp = Series([0., 1., 1., 3., 4.], index=ts.index) - tm.assert_series_equal(ts.fillna(method='ffill'), exp) + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) - exp = Series([0., 1., 3., 3., 4.], index=ts.index) - tm.assert_series_equal(ts.fillna(method='backfill'), exp) + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) - exp = Series([0., 1., 5., 3., 4.], index=ts.index) + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(value=5), exp) msg = "Must specify a fill 'value' or 'method'" @@ -512,13 +676,13 @@ def test_fillna(self, datetime_series): msg = "Cannot specify both 'value' and 'method'" with pytest.raises(ValueError, match=msg): - 
datetime_series.fillna(value=0, method='ffill') + datetime_series.fillna(value=0, method="ffill") # GH 5703 s1 = Series([np.nan]) s2 = Series([1]) result = s1.fillna(s2) - expected = Series([1.]) + expected = Series([1.0]) assert_series_equal(result, expected) result = s1.fillna({}) assert_series_equal(result, s1) @@ -537,10 +701,10 @@ def test_fillna(self, datetime_series): result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) assert_series_equal(result, s1) - s1 = Series([0, 1, 2], list('abc')) - s2 = Series([0, np.nan, 2], list('bac')) + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) result = s2.fillna(s1) - expected = Series([0, 0, 2.], list('bac')) + expected = Series([0, 0, 2.0], list("bac")) assert_series_equal(result, expected) # limit @@ -556,25 +720,25 @@ def test_fillna(self, datetime_series): # GH 9043 # make sure a string representation of int/float values can be filled # correctly without raising errors or being converted - vals = ['0', '1.5', '-0.3'] + vals = ["0", "1.5", "-0.3"] for val in vals: - s = Series([0, 1, np.nan, np.nan, 4], dtype='float64') + s = Series([0, 1, np.nan, np.nan, 4], dtype="float64") result = s.fillna(val) - expected = Series([0, 1, val, val, 4], dtype='object') + expected = Series([0, 1, val, val, 4], dtype="object") assert_series_equal(result, expected) def test_fillna_bug(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) - filled = x.fillna(method='ffill') - expected = Series([nan, 1., 1., 3., 3.], x.index) + x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) + filled = x.fillna(method="ffill") + expected = Series([nan, 1.0, 1.0, 3.0, 3.0], x.index) assert_series_equal(filled, expected) - filled = x.fillna(method='bfill') - expected = Series([1., 1., 3., 3., nan], x.index) + filled = x.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, nan], x.index) assert_series_equal(filled, expected) def test_fillna_inplace(self): - x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) + x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) y = x.copy() y.fillna(value=0, inplace=True) @@ -584,14 +748,14 @@ def test_fillna_inplace(self): def test_fillna_invalid_method(self, datetime_series): try: - datetime_series.fillna(method='ffil') + datetime_series.fillna(method="ffil") except ValueError as inst: - assert 'ffil' in str(inst) + assert "ffil" in str(inst) def test_ffill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) ts[2] = np.NaN - assert_series_equal(ts.ffill(), ts.fillna(method='ffill')) + assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) def test_ffill_mixed_dtypes_without_missing_data(self): # GH14956 @@ -600,9 +764,9 @@ def test_ffill_mixed_dtypes_without_missing_data(self): assert_series_equal(series, result) def test_bfill(self): - ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) ts[2] = np.NaN - assert_series_equal(ts.bfill(), ts.fillna(method='bfill')) + assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) def test_timedelta64_nan(self): @@ -649,36 +813,45 @@ def test_dropna_empty(self): assert len(s) == 0 # invalid axis - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): s.dropna(axis=1) def test_datetime64_tz_dropna(self): # DatetimeBlock - s = Series([Timestamp('2011-01-01 
10:00'), pd.NaT, Timestamp( - '2011-01-03 10:00'), pd.NaT]) + s = Series( + [ + Timestamp("2011-01-01 10:00"), + pd.NaT, + Timestamp("2011-01-03 10:00"), + pd.NaT, + ] + ) result = s.dropna() - expected = Series([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-03 10:00')], index=[0, 2]) + expected = Series( + [Timestamp("2011-01-01 10:00"), Timestamp("2011-01-03 10:00")], index=[0, 2] + ) tm.assert_series_equal(result, expected) # DatetimeBlockTZ - idx = pd.DatetimeIndex(['2011-01-01 10:00', pd.NaT, - '2011-01-03 10:00', pd.NaT], - tz='Asia/Tokyo') + idx = pd.DatetimeIndex( + ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz="Asia/Tokyo" + ) s = pd.Series(idx) - assert s.dtype == 'datetime64[ns, Asia/Tokyo]' + assert s.dtype == "datetime64[ns, Asia/Tokyo]" result = s.dropna() - expected = Series([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), - Timestamp('2011-01-03 10:00', tz='Asia/Tokyo')], - index=[0, 2]) - assert result.dtype == 'datetime64[ns, Asia/Tokyo]' + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz="Asia/Tokyo"), + Timestamp("2011-01-03 10:00", tz="Asia/Tokyo"), + ], + index=[0, 2], + ) + assert result.dtype == "datetime64[ns, Asia/Tokyo]" tm.assert_series_equal(result, expected) def test_dropna_no_nan(self): - for s in [Series([1, 2, 3], name='x'), Series( - [False, True, False], name='x')]: + for s in [Series([1, 2, 3], name="x"), Series([False, True, False], name="x")]: result = s.dropna() tm.assert_series_equal(result, s) @@ -689,9 +862,10 @@ def test_dropna_no_nan(self): tm.assert_series_equal(s2, s) def test_dropna_intervals(self): - s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( - [np.nan, 0, 1, 2], - [np.nan, 1, 2, 3])) + s = Series( + [np.nan, 1, 2, 3], + IntervalIndex.from_arrays([np.nan, 0, 1, 2], [np.nan, 1, 2, 3]), + ) result = s.dropna() expected = s.iloc[1:] @@ -725,25 +899,27 @@ def test_notna(self): tm.assert_series_equal(ser.notna(), expected) def test_pad_nan(self): - x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c', 'd'], - dtype=float) + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) - x.fillna(method='pad', inplace=True) + x.fillna(method="pad", inplace=True) - expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], - ['z', 'a', 'b', 'c', 'd'], dtype=float) + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) assert_series_equal(x[1:], expected[1:]) assert np.isnan(x[0]), np.isnan(expected[0]) def test_pad_require_monotonicity(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') + rng = date_range("1/1/2000", "3/1/2000", freq="B") # neither monotonic increasing or decreasing rng2 = rng[[1, 0, 2]] msg = "index must be monotonic increasing or decreasing" with pytest.raises(ValueError, match=msg): - rng2.get_indexer(rng, method='pad') + rng2.get_indexer(rng, method="pad") def test_dropna_preserve_name(self, datetime_series): datetime_series[:5] = np.nan @@ -767,16 +943,16 @@ def test_series_fillna_limit(self): s = Series(np.random.randn(10), index=index) result = s[:2].reindex(index) - result = result.fillna(method='pad', limit=5) + result = result.fillna(method="pad", limit=5) - expected = s[:2].reindex(index).fillna(method='pad') + expected = s[:2].reindex(index).fillna(method="pad") expected[-3:] = np.nan assert_series_equal(result, expected) result = s[-2:].reindex(index) - result = result.fillna(method='bfill', limit=5) + result = result.fillna(method="bfill", limit=5) - expected = 
s[-2:].reindex(index).fillna(method='backfill') + expected = s[-2:].reindex(index).fillna(method="backfill") expected[:3] = np.nan assert_series_equal(result, expected) @@ -789,21 +965,23 @@ def test_sparse_series_fillna_limit(self): ss = s[:2].reindex(index).to_sparse() # TODO: what is this test doing? why are result an expected # the same call to fillna? - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): # TODO: release-note fillna performance warning - result = ss.fillna(method='pad', limit=5) - expected = ss.fillna(method='pad', limit=5) + result = ss.fillna(method="pad", limit=5) + expected = ss.fillna(method="pad", limit=5) expected = expected.to_dense() expected[-3:] = np.nan expected = expected.to_sparse() assert_series_equal(result, expected) ss = s[-2:].reindex(index).to_sparse() - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = ss.fillna(method='backfill', limit=5) - expected = ss.fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = ss.fillna(method="backfill", limit=5) + expected = ss.fillna(method="backfill") expected = expected.to_dense() expected[:3] = np.nan expected = expected.to_sparse() @@ -816,19 +994,21 @@ def test_sparse_series_pad_backfill_limit(self): s = Series(np.random.randn(10), index=index) s = s.to_sparse() - result = s[:2].reindex(index, method='pad', limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = s[:2].reindex(index).fillna(method='pad') + result = s[:2].reindex(index, method="pad", limit=5) + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = s[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected[-3:] = np.nan expected = expected.to_sparse() assert_series_equal(result, expected) - result = s[-2:].reindex(index, method='backfill', limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = s[-2:].reindex(index).fillna(method='backfill') + result = s[-2:].reindex(index, method="backfill", limit=5) + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = s[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected[:3] = np.nan expected = expected.to_sparse() @@ -839,23 +1019,39 @@ def test_series_pad_backfill_limit(self): index = np.arange(10) s = Series(np.random.randn(10), index=index) - result = s[:2].reindex(index, method='pad', limit=5) + result = s[:2].reindex(index, method="pad", limit=5) - expected = s[:2].reindex(index).fillna(method='pad') + expected = s[:2].reindex(index).fillna(method="pad") expected[-3:] = np.nan assert_series_equal(result, expected) - result = s[-2:].reindex(index, method='backfill', limit=5) + result = s[-2:].reindex(index, method="backfill", limit=5) - expected = s[-2:].reindex(index).fillna(method='backfill') + expected = s[-2:].reindex(index).fillna(method="backfill") expected[:3] = np.nan assert_series_equal(result, expected) -@pytest.fixture(params=['linear', 'index', 'values', 'nearest', 'slinear', - 'zero', 'quadratic', 'cubic', 'barycentric', 'krogh', - 'polynomial', 'spline', 'piecewise_polynomial', - 'from_derivatives', 'pchip', 'akima', ]) +@pytest.fixture( + params=[ + "linear", + "index", + "values", + 
"nearest", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) def nontemporal_method(request): """ Fixture that returns an (method name, required kwargs) pair. @@ -864,14 +1060,27 @@ def nontemporal_method(request): separately from these non-temporal methods. """ method = request.param - kwargs = dict(order=1) if method in ('spline', 'polynomial') else dict() + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs -@pytest.fixture(params=['linear', 'slinear', 'zero', 'quadratic', 'cubic', - 'barycentric', 'krogh', 'polynomial', 'spline', - 'piecewise_polynomial', 'from_derivatives', 'pchip', - 'akima', ]) +@pytest.fixture( + params=[ + "linear", + "slinear", + "zero", + "quadratic", + "cubic", + "barycentric", + "krogh", + "polynomial", + "spline", + "piecewise_polynomial", + "from_derivatives", + "pchip", + "akima", + ] +) def interp_methods_ind(request): """ Fixture that returns a (method name, required kwargs) pair to be tested for various Index types. @@ -880,38 +1089,39 @@ def interp_methods_ind(request): 'values' as a parameterization """ method = request.param - kwargs = dict(order=1) if method in ('spline', 'polynomial') else dict() + kwargs = dict(order=1) if method in ("spline", "polynomial") else dict() return method, kwargs class TestSeriesInterpolateData: def test_interpolate(self, datetime_series, string_series): - ts = Series(np.arange(len(datetime_series), dtype=float), - datetime_series.index) + ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() ts_copy[5:10] = np.NaN - linear_interp = ts_copy.interpolate(method='linear') + linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) - ord_ts = Series([d.toordinal() for d in datetime_series.index], - index=datetime_series.index).astype(float) + ord_ts = Series( + [d.toordinal() for d in datetime_series.index], index=datetime_series.index + ).astype(float) ord_ts_copy = ord_ts.copy() ord_ts_copy[5:10] = np.NaN - time_interp = ord_ts_copy.interpolate(method='time') + time_interp = ord_ts_copy.interpolate(method="time") tm.assert_series_equal(time_interp, ord_ts) def test_interpolate_time_raises_for_non_timeseries(self): # When method='time' is used on a non-TimeSeries that contains a null # value, a ValueError should be raised. 
non_ts = Series([0, 1, 2, np.NaN]) - msg = ("time-weighted interpolation only works on Series.* " - "with a DatetimeIndex") + msg = ( + "time-weighted interpolation only works on Series.* " "with a DatetimeIndex" + ) with pytest.raises(ValueError, match=msg): - non_ts.interpolate(method='time') + non_ts.interpolate(method="time") @td.skip_if_no_scipy def test_interpolate_pchip(self): @@ -920,9 +1130,10 @@ def test_interpolate_pchip(self): ser = Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index.union(Index([49.25, 49.5, 49.75, 50.25, 50.5, - 50.75])).astype(float) - interp_s = ser.reindex(new_index).interpolate(method='pchip') + new_index = ser.index.union( + Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) + ).astype(float) + interp_s = ser.reindex(new_index).interpolate(method="pchip") # does not blow up, GH5977 interp_s[49:51] @@ -932,54 +1143,56 @@ def test_interpolate_akima(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate(method='akima') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="akima") assert_series_equal(interp_s[1:3], expected) @td.skip_if_no_scipy def test_interpolate_piecewise_polynomial(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate( - method='piecewise_polynomial') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="piecewise_polynomial") assert_series_equal(interp_s[1:3], expected) @td.skip_if_no_scipy def test_interpolate_from_derivatives(self): ser = Series([10, 11, 12, 13]) - expected = Series([11.00, 11.25, 11.50, 11.75, - 12.00, 12.25, 12.50, 12.75, 13.00], - index=Index([1.0, 1.25, 1.5, 1.75, - 2.0, 2.25, 2.5, 2.75, 3.0])) + expected = Series( + [11.00, 11.25, 11.50, 11.75, 12.00, 12.25, 12.50, 12.75, 13.00], + index=Index([1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0]), + ) # interpolate at new_index - new_index = ser.index.union( - Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75]) - ).astype(float) - interp_s = ser.reindex(new_index).interpolate( - method='from_derivatives') + new_index = ser.index.union(Index([1.25, 1.5, 1.75, 2.25, 2.5, 2.75])).astype( + float + ) + interp_s = ser.reindex(new_index).interpolate(method="from_derivatives") assert_series_equal(interp_s[1:3], expected) - @pytest.mark.parametrize("kwargs", [ - {}, - pytest.param({'method': 'polynomial', 'order': 1}, - marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", 
"order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) def test_interpolate_corners(self, kwargs): s = Series([np.nan, np.nan]) assert_series_equal(s.interpolate(**kwargs), s) @@ -993,107 +1206,112 @@ def test_interpolate_index_values(self): vals = s.index.values.astype(float) - result = s.interpolate(method='index') + result = s.interpolate(method="index") expected = s.copy() bad = isna(expected.values) good = ~bad - expected = Series(np.interp(vals[bad], vals[good], - s.values[good]), - index=s.index[bad]) + expected = Series( + np.interp(vals[bad], vals[good], s.values[good]), index=s.index[bad] + ) assert_series_equal(result[bad], expected) # 'values' is synonymous with 'index' for the method kwarg - other_result = s.interpolate(method='values') + other_result = s.interpolate(method="values") assert_series_equal(other_result, result) assert_series_equal(other_result[bad], expected) def test_interpolate_non_ts(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - msg = ("time-weighted interpolation only works on Series or DataFrames" - " with a DatetimeIndex") + msg = ( + "time-weighted interpolation only works on Series or DataFrames" + " with a DatetimeIndex" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='time') + s.interpolate(method="time") - @pytest.mark.parametrize("kwargs", [ - {}, - pytest.param({'method': 'polynomial', 'order': 1}, - marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "kwargs", + [ + {}, + pytest.param( + {"method": "polynomial", "order": 1}, marks=td.skip_if_no_scipy + ), + ], + ) def test_nan_interpolate(self, kwargs): s = Series([0, 1, np.nan, 3]) result = s.interpolate(**kwargs) - expected = Series([0., 1., 2., 3.]) + expected = Series([0.0, 1.0, 2.0, 3.0]) assert_series_equal(result, expected) def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) assert_series_equal(result, expected) def test_nan_str_index(self): - s = Series([0, 1, 2, np.nan], index=list('abcd')) + s = Series([0, 1, 2, np.nan], index=list("abcd")) result = s.interpolate() - expected = Series([0., 1., 2., 2.], index=list('abcd')) + expected = Series([0.0, 1.0, 2.0, 2.0], index=list("abcd")) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_quad(self): sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) - result = sq.interpolate(method='quadratic') - expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) + result = sq.interpolate(method="quadratic") + expected = Series([1.0, 4.0, 9.0, 16.0], index=[1, 2, 3, 4]) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_scipy_basic(self): s = Series([1, 3, np.nan, 12, np.nan, 25]) # slinear - expected = Series([1., 3., 7.5, 12., 18.5, 25.]) - result = s.interpolate(method='slinear') + expected = Series([1.0, 3.0, 7.5, 12.0, 18.5, 25.0]) + result = s.interpolate(method="slinear") assert_series_equal(result, expected) - result = s.interpolate(method='slinear', downcast='infer') + result = s.interpolate(method="slinear", downcast="infer") assert_series_equal(result, expected) # nearest expected = Series([1, 3, 3, 12, 12, 25]) - result = s.interpolate(method='nearest') - assert_series_equal(result, expected.astype('float')) + result = s.interpolate(method="nearest") + assert_series_equal(result, expected.astype("float")) - result = s.interpolate(method='nearest', downcast='infer') + result = 
s.interpolate(method="nearest", downcast="infer") assert_series_equal(result, expected) # zero expected = Series([1, 3, 3, 12, 12, 25]) - result = s.interpolate(method='zero') - assert_series_equal(result, expected.astype('float')) + result = s.interpolate(method="zero") + assert_series_equal(result, expected.astype("float")) - result = s.interpolate(method='zero', downcast='infer') + result = s.interpolate(method="zero", downcast="infer") assert_series_equal(result, expected) # quadratic # GH #15662. - expected = Series([1, 3., 6.823529, 12., 18.058824, 25.]) - result = s.interpolate(method='quadratic') + expected = Series([1, 3.0, 6.823529, 12.0, 18.058824, 25.0]) + result = s.interpolate(method="quadratic") assert_series_equal(result, expected) - result = s.interpolate(method='quadratic', downcast='infer') + result = s.interpolate(method="quadratic", downcast="infer") assert_series_equal(result, expected) # cubic - expected = Series([1., 3., 6.8, 12., 18.2, 25.]) - result = s.interpolate(method='cubic') + expected = Series([1.0, 3.0, 6.8, 12.0, 18.2, 25.0]) + result = s.interpolate(method="cubic") assert_series_equal(result, expected) def test_interp_limit(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - expected = Series([1., 3., 5., 7., np.nan, 11.]) - result = s.interpolate(method='linear', limit=2) + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) + result = s.interpolate(method="linear", limit=2) assert_series_equal(result, expected) @pytest.mark.parametrize("limit", [-1, 0]) - def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, - limit): + def test_interpolate_invalid_nonpositive_limit(self, nontemporal_method, limit): # GH 9217: make sure limit is greater than zero. s = pd.Series([1, 2, np.nan, 4]) method, kwargs = nontemporal_method @@ -1108,7 +1326,7 @@ def test_interpolate_invalid_float_limit(self, nontemporal_method): with pytest.raises(ValueError, match="Limit must be an integer"): s.interpolate(limit=limit, method=method, **kwargs) - @pytest.mark.parametrize("invalid_method", [None, 'nonexistent_method']) + @pytest.mark.parametrize("invalid_method", [None, "nonexistent_method"]) def test_interp_invalid_method(self, invalid_method): s = Series([1, 3, np.nan, 12, np.nan, 25]) @@ -1125,164 +1343,154 @@ def test_interp_limit_forward(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) # Provide 'forward' (the default) explicitly here. 
- expected = Series([1., 3., 5., 7., np.nan, 11.]) + expected = Series([1.0, 3.0, 5.0, 7.0, np.nan, 11.0]) - result = s.interpolate(method='linear', limit=2, - limit_direction='forward') + result = s.interpolate(method="linear", limit=2, limit_direction="forward") assert_series_equal(result, expected) - result = s.interpolate(method='linear', limit=2, - limit_direction='FORWARD') + result = s.interpolate(method="linear", limit=2, limit_direction="FORWARD") assert_series_equal(result, expected) def test_interp_unlimited(self): # these test are for issue #16282 default Limit=None is unlimited - s = Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan]) - expected = Series([1., 1., 3., 5., 7., 9., 11., 11.]) - result = s.interpolate(method='linear', - limit_direction='both') + s = Series([np.nan, 1.0, 3.0, np.nan, np.nan, np.nan, 11.0, np.nan]) + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="both") assert_series_equal(result, expected) - expected = Series([np.nan, 1., 3., 5., 7., 9., 11., 11.]) - result = s.interpolate(method='linear', - limit_direction='forward') + expected = Series([np.nan, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 11.0]) + result = s.interpolate(method="linear", limit_direction="forward") assert_series_equal(result, expected) - expected = Series([1., 1., 3., 5., 7., 9., 11., np.nan]) - result = s.interpolate(method='linear', - limit_direction='backward') + expected = Series([1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, np.nan]) + result = s.interpolate(method="linear", limit_direction="backward") assert_series_equal(result, expected) def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - msg = (r"Invalid limit_direction: expecting one of \['forward'," - r" 'backward', 'both'\], got 'abc'") + msg = ( + r"Invalid limit_direction: expecting one of \['forward'," + r" 'backward', 'both'\], got 'abc'" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit=2, limit_direction='abc') + s.interpolate(method="linear", limit=2, limit_direction="abc") # raises an error even if no limit is specified. with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit_direction='abc') + s.interpolate(method="linear", limit_direction="abc") # limit_area introduced GH #16284 def test_interp_limit_area(self): # These tests are for issue #9218 -- fill NaNs in both directions. 
s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan]) - expected = Series([nan, nan, 3., 4., 5., 6., 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside') + expected = Series([nan, nan, 3.0, 4.0, 5.0, 6.0, 7.0, nan, nan]) + result = s.interpolate(method="linear", limit_area="inside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3., 4., nan, nan, 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside', - limit=1) + expected = Series([nan, nan, 3.0, 4.0, nan, nan, 7.0, nan, nan]) + result = s.interpolate(method="linear", limit_area="inside", limit=1) - expected = Series([nan, nan, 3., 4., nan, 6., 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='inside', - limit_direction='both', limit=1) + expected = Series([nan, nan, 3.0, 4.0, nan, 6.0, 7.0, nan, nan]) + result = s.interpolate( + method="linear", limit_area="inside", limit_direction="both", limit=1 + ) assert_series_equal(result, expected) - expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., 7.]) - result = s.interpolate(method='linear', limit_area='outside') + expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, 7.0]) + result = s.interpolate(method="linear", limit_area="outside") assert_series_equal(result, expected) - expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., nan]) - result = s.interpolate(method='linear', limit_area='outside', - limit=1) + expected = Series([nan, nan, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + result = s.interpolate(method="linear", limit_area="outside", limit=1) - expected = Series([nan, 3., 3., nan, nan, nan, 7., 7., nan]) - result = s.interpolate(method='linear', limit_area='outside', - limit_direction='both', limit=1) + expected = Series([nan, 3.0, 3.0, nan, nan, nan, 7.0, 7.0, nan]) + result = s.interpolate( + method="linear", limit_area="outside", limit_direction="both", limit=1 + ) assert_series_equal(result, expected) - expected = Series([3., 3., 3., nan, nan, nan, 7., nan, nan]) - result = s.interpolate(method='linear', limit_area='outside', - direction='backward') + expected = Series([3.0, 3.0, 3.0, nan, nan, nan, 7.0, nan, nan]) + result = s.interpolate( + method="linear", limit_area="outside", direction="backward" + ) # raises an error even if limit type is wrong. - msg = (r"Invalid limit_area: expecting one of \['inside', 'outside'\]," - " got abc") + msg = ( + r"Invalid limit_area: expecting one of \['inside', 'outside'\]," " got abc" + ) with pytest.raises(ValueError, match=msg): - s.interpolate(method='linear', limit_area='abc') + s.interpolate(method="linear", limit_area="abc") def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. s = Series([1, 3, np.nan, np.nan, np.nan, 11]) - expected = Series([1., 3., np.nan, 7., 9., 11.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='backward') + expected = Series([1.0, 3.0, np.nan, 7.0, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([1., 3., 5., np.nan, 9., 11.]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series([1.0, 3.0, 5.0, np.nan, 9.0, 11.0]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) # Check that this works on a longer series of nans. 
- s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, - np.nan]) + s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan]) - expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='both') + expected = Series([1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") assert_series_equal(result, expected) - expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series( + [1.0, 3.0, 4.0, np.nan, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 12.0] + ) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) def test_interp_limit_to_ends(self): # These test are for issue #10420 -- flow back to beginning. s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) - expected = Series([5., 5., 5., 7., 9., np.nan]) - result = s.interpolate(method='linear', limit=2, - limit_direction='backward') + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, np.nan]) + result = s.interpolate(method="linear", limit=2, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([5., 5., 5., 7., 9., 9.]) - result = s.interpolate(method='linear', limit=2, - limit_direction='both') + expected = Series([5.0, 5.0, 5.0, 7.0, 9.0, 9.0]) + result = s.interpolate(method="linear", limit=2, limit_direction="both") assert_series_equal(result, expected) def test_interp_limit_before_ends(self): # These test are for issue #11115 -- limit ends properly. s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan]) - expected = Series([np.nan, np.nan, 5., 7., 7., np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='forward') + expected = Series([np.nan, np.nan, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="forward") assert_series_equal(result, expected) - expected = Series([np.nan, 5., 5., 7., np.nan, np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='backward') + expected = Series([np.nan, 5.0, 5.0, 7.0, np.nan, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="backward") assert_series_equal(result, expected) - expected = Series([np.nan, 5., 5., 7., 7., np.nan]) - result = s.interpolate(method='linear', limit=1, - limit_direction='both') + expected = Series([np.nan, 5.0, 5.0, 7.0, 7.0, np.nan]) + result = s.interpolate(method="linear", limit=1, limit_direction="both") assert_series_equal(result, expected) @td.skip_if_no_scipy def test_interp_all_good(self): s = Series([1, 2, 3]) - result = s.interpolate(method='polynomial', order=1) + result = s.interpolate(method="polynomial", order=1) assert_series_equal(result, s) # non-scipy result = s.interpolate() assert_series_equal(result, s) - @pytest.mark.parametrize("check_scipy", [ - False, - pytest.param(True, marks=td.skip_if_no_scipy) - ]) + @pytest.mark.parametrize( + "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] + ) def test_interp_multiIndex(self, check_scipy): - idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) s = Series([1, 2, np.nan], index=idx) expected = s.copy() @@ -1293,32 +1501,31 @@ def test_interp_multiIndex(self, check_scipy): msg = "Only `method=linear` interpolation is supported on MultiIndexes" if 
check_scipy: with pytest.raises(ValueError, match=msg): - s.interpolate(method='polynomial', order=1) + s.interpolate(method="polynomial", order=1) @td.skip_if_no_scipy def test_interp_nonmono_raise(self): s = Series([1, np.nan, 3], index=[0, 2, 1]) msg = "krogh interpolation requires that the index be monotonic" with pytest.raises(ValueError, match=msg): - s.interpolate(method='krogh') + s.interpolate(method="krogh") @td.skip_if_no_scipy def test_interp_datetime64(self): - df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) - result = df.interpolate(method='nearest') - expected = Series([1., 1., 3.], - index=date_range('1/1/2000', periods=3)) + df = Series([1, np.nan, 3], index=date_range("1/1/2000", periods=3)) + result = df.interpolate(method="nearest") + expected = Series([1.0, 1.0, 3.0], index=date_range("1/1/2000", periods=3)) assert_series_equal(result, expected) def test_interp_limit_no_nans(self): # GH 7173 - s = pd.Series([1., 2., 3.]) + s = pd.Series([1.0, 2.0, 3.0]) result = s.interpolate(limit=1) expected = s assert_series_equal(result, expected) @td.skip_if_no_scipy - @pytest.mark.parametrize("method", ['polynomial', 'spline']) + @pytest.mark.parametrize("method", ["polynomial", "spline"]) def test_no_order(self, method): # see GH-10633, GH-24014 s = Series([0, 1, np.nan, 3]) @@ -1327,83 +1534,81 @@ def test_no_order(self, method): s.interpolate(method=method) @td.skip_if_no_scipy - @pytest.mark.parametrize('order', [-1, -1.0, 0, 0.0, np.nan]) + @pytest.mark.parametrize("order", [-1, -1.0, 0, 0.0, np.nan]) def test_interpolate_spline_invalid_order(self, order): s = Series([0, 1, np.nan, 3]) msg = "order needs to be specified and greater than 0" with pytest.raises(ValueError, match=msg): - s.interpolate(method='spline', order=order) + s.interpolate(method="spline", order=order) @td.skip_if_no_scipy def test_spline(self): s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) - result = s.interpolate(method='spline', order=1) - expected = Series([1., 2., 3., 4., 5., 6., 7.]) + result = s.interpolate(method="spline", order=1) + expected = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) assert_series_equal(result, expected) @td.skip_if_no_scipy def test_spline_extrapolate(self): s = Series([1, 2, 3, 4, np.nan, 6, np.nan]) - result3 = s.interpolate(method='spline', order=1, ext=3) - expected3 = Series([1., 2., 3., 4., 5., 6., 6.]) + result3 = s.interpolate(method="spline", order=1, ext=3) + expected3 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0]) assert_series_equal(result3, expected3) - result1 = s.interpolate(method='spline', order=1, ext=0) - expected1 = Series([1., 2., 3., 4., 5., 6., 7.]) + result1 = s.interpolate(method="spline", order=1, ext=0) + expected1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) assert_series_equal(result1, expected1) @td.skip_if_no_scipy def test_spline_smooth(self): s = Series([1, 2, np.nan, 4, 5.1, np.nan, 7]) - assert (s.interpolate(method='spline', order=3, s=0)[5] != - s.interpolate(method='spline', order=3)[5]) + assert ( + s.interpolate(method="spline", order=3, s=0)[5] + != s.interpolate(method="spline", order=3)[5] + ) @td.skip_if_no_scipy def test_spline_interpolation(self): s = Series(np.arange(10) ** 2) s[np.random.randint(0, 9, 3)] = np.nan - result1 = s.interpolate(method='spline', order=1) - expected1 = s.interpolate(method='spline', order=1) + result1 = s.interpolate(method="spline", order=1) + expected1 = s.interpolate(method="spline", order=1) assert_series_equal(result1, expected1) def test_interp_timedelta64(self): # GH 6424 - df 
= Series([1, np.nan, 3], - index=pd.to_timedelta([1, 2, 3])) - result = df.interpolate(method='time') - expected = Series([1., 2., 3.], - index=pd.to_timedelta([1, 2, 3])) + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 3])) + result = df.interpolate(method="time") + expected = Series([1.0, 2.0, 3.0], index=pd.to_timedelta([1, 2, 3])) assert_series_equal(result, expected) # test for non uniform spacing - df = Series([1, np.nan, 3], - index=pd.to_timedelta([1, 2, 4])) - result = df.interpolate(method='time') - expected = Series([1., 1.666667, 3.], - index=pd.to_timedelta([1, 2, 4])) + df = Series([1, np.nan, 3], index=pd.to_timedelta([1, 2, 4])) + result = df.interpolate(method="time") + expected = Series([1.0, 1.666667, 3.0], index=pd.to_timedelta([1, 2, 4])) assert_series_equal(result, expected) def test_series_interpolate_method_values(self): # #1646 - ts = _simple_ts('1/1/2000', '1/20/2000') + ts = _simple_ts("1/1/2000", "1/20/2000") ts[::2] = np.nan - result = ts.interpolate(method='values') + result = ts.interpolate(method="values") exp = ts.interpolate() assert_series_equal(result, exp) def test_series_interpolate_intraday(self): # #1698 - index = pd.date_range('1/1/2012', periods=4, freq='12D') + index = pd.date_range("1/1/2012", periods=4, freq="12D") ts = pd.Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(days=1)).sort_values() - exp = ts.reindex(new_index).interpolate(method='time') + exp = ts.reindex(new_index).interpolate(method="time") - index = pd.date_range('1/1/2012', periods=4, freq='12H') + index = pd.date_range("1/1/2012", periods=4, freq="12H") ts = pd.Series([0, 12, 24, 36], index) new_index = index.append(index + pd.DateOffset(hours=1)).sort_values() - result = ts.reindex(new_index).interpolate(method='time') + result = ts.reindex(new_index).interpolate(method="time") tm.assert_numpy_array_equal(result.values, exp.values) @@ -1416,10 +1621,11 @@ def test_nonzero_warning(self): @pytest.mark.parametrize( "ind", [ - ['a', 'b', 'c', 'd'], + ["a", "b", "c", "d"], pd.period_range(start="2019-01-01", periods=4), pd.interval_range(start=0, end=4), - ]) + ], + ) def test_interp_non_timedelta_index(self, interp_methods_ind, ind): # gh 21662 df = pd.DataFrame([0, 1, np.nan, 3], index=ind) @@ -1437,7 +1643,8 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): "Index column must be numeric or datetime type when " "using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method)) + "interpolating.".format(method=method) + ) with pytest.raises(ValueError, match=expected_error): df[0].interpolate(method=method, **kwargs) @@ -1461,6 +1668,5 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): assert_series_equal(result, expected) else: pytest.skip( - "This interpolation method is not supported for " - "Timedelta Index yet." + "This interpolation method is not supported for " "Timedelta Index yet." 
) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 215fa9f22277e..aada5cca9fdc7 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -5,25 +5,26 @@ import pytest import pandas as pd -from pandas import ( - Categorical, DataFrame, Index, Series, bdate_range, date_range, isna) +from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_index_equal, - assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_index_equal, + assert_series_equal, +) from .common import TestData class TestSeriesLogicalOps: - @pytest.mark.parametrize('bool_op', [operator.and_, - operator.or_, operator.xor]) + @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) def test_bool_operators_with_nas(self, bool_op): # boolean &, |, ^ should work with object arrays and propagate NAs - ser = Series(bdate_range('1/1/2000', periods=10), dtype=object) + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) ser[::2] = np.nan mask = ser.isna() @@ -37,7 +38,7 @@ def test_bool_operators_with_nas(self, bool_op): def test_operators_bitwise(self): # GH#9016: support bitwise op for integer types - index = list('bca') + index = list("bca") s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) @@ -47,7 +48,7 @@ def test_operators_bitwise(self): # TODO: unused # s_0101 = Series([0, 1, 0, 1]) - s_0123 = Series(range(4), dtype='int64') + s_0123 = Series(range(4), dtype="int64") s_3333 = Series([3] * 4) s_4444 = Series([4] * 4) @@ -60,21 +61,21 @@ def test_operators_bitwise(self): assert_series_equal(res, expected) res = s_0123 & s_3333 - expected = Series(range(4), dtype='int64') + expected = Series(range(4), dtype="int64") assert_series_equal(res, expected) res = s_0123 | s_4444 - expected = Series(range(4, 8), dtype='int64') + expected = Series(range(4, 8), dtype="int64") assert_series_equal(res, expected) - s_a0b1c0 = Series([1], list('b')) + s_a0b1c0 = Series([1], list("b")) res = s_tft & s_a0b1c0 - expected = s_tff.reindex(list('abc')) + expected = s_tff.reindex(list("abc")) assert_series_equal(res, expected) res = s_tft | s_a0b1c0 - expected = s_tft.reindex(list('abc')) + expected = s_tft.reindex(list("abc")) assert_series_equal(res, expected) n0 = 0 @@ -95,19 +96,19 @@ def test_operators_bitwise(self): expected = Series([0, 1, 0, 1]) assert_series_equal(res, expected) - s_1111 = Series([1] * 4, dtype='int8') + s_1111 = Series([1] * 4, dtype="int8") res = s_0123 & s_1111 - expected = Series([0, 1, 0, 1], dtype='int64') + expected = Series([0, 1, 0, 1], dtype="int64") assert_series_equal(res, expected) res = s_0123.astype(np.int16) | s_1111.astype(np.int32) - expected = Series([1, 1, 3, 3], dtype='int32') + expected = Series([1, 1, 3, 3], dtype="int32") assert_series_equal(res, expected) with pytest.raises(TypeError): - s_1111 & 'a' + s_1111 & "a" with pytest.raises(TypeError): - s_1111 & ['a', 'b', 'c', 'd'] + s_1111 & ["a", "b", "c", "d"] with pytest.raises(TypeError): s_0123 & np.NaN with pytest.raises(TypeError): @@ -116,24 +117,25 @@ def test_operators_bitwise(self): s_0123 & [0.1, 4, 3.14, 2] # s_0123 will be all false now because of reindexing like s_tft - exp = Series([False] 
* 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) assert_series_equal(s_tft & s_0123, exp) # s_tft will be all false now because of reindexing like s_0123 - exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) assert_series_equal(s_0123 & s_tft, exp) assert_series_equal(s_0123 & False, Series([False] * 4)) assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) assert_series_equal(s_0123 & [False], Series([False] * 4)) assert_series_equal(s_0123 & (False), Series([False] * 4)) - assert_series_equal(s_0123 & Series([False, np.NaN, False, False]), - Series([False] * 4)) + assert_series_equal( + s_0123 & Series([False, np.NaN, False, False]), Series([False] * 4) + ) s_ftft = Series([False, True, False, True]) assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft) - s_abNd = Series(['a', 'b', np.NaN, 'd']) + s_abNd = Series(["a", "b", np.NaN, "d"]) res = s_0123 & s_abNd expected = s_ftft assert_series_equal(res, expected) @@ -152,7 +154,7 @@ def test_scalar_na_logical_ops_corners(self): result = s & list(s) assert_series_equal(result, expected) - d = DataFrame({'A': s}) + d = DataFrame({"A": s}) # TODO: Fix this exception - needs to be fixed! (see GH5035) # (previously this was a TypeError because series returned # NotImplemented @@ -161,7 +163,7 @@ def test_scalar_na_logical_ops_corners(self): # https://github.com/pandas-dev/pandas/issues/5284 with pytest.raises(TypeError): - d.__and__(s, axis='columns') + d.__and__(s, axis="columns") with pytest.raises(TypeError): s & d @@ -169,11 +171,7 @@ def test_scalar_na_logical_ops_corners(self): # this is wrong as its not a boolean result # result = d.__and__(s,axis='index') - @pytest.mark.parametrize('op', [ - operator.and_, - operator.or_, - operator.xor, - ]) + @pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor]) def test_logical_ops_with_index(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) @@ -185,26 +183,33 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx1) assert_series_equal(result, expected) - expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], - dtype=bool) + expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], dtype=bool) result = op(ser, idx2) assert_series_equal(result, expected) - @pytest.mark.parametrize('op', [ - pytest.param(ops.rand_, - marks=pytest.mark.xfail(reason="GH#22092 Index " - "implementation returns " - "Index", - raises=AssertionError, - strict=True)), - pytest.param(ops.ror_, - marks=pytest.mark.xfail(reason="Index.get_indexer " - "with non unique index", - raises=InvalidIndexError, - strict=True)), - ops.rxor, - ]) + @pytest.mark.parametrize( + "op", + [ + pytest.param( + ops.rand_, + marks=pytest.mark.xfail( + reason="GH#22092 Index " "implementation returns " "Index", + raises=AssertionError, + strict=True, + ), + ), + pytest.param( + ops.ror_, + marks=pytest.mark.xfail( + reason="Index.get_indexer " "with non unique index", + raises=InvalidIndexError, + strict=True, + ), + ), + ops.rxor, + ], + ) def test_reversed_logical_ops_with_index(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) @@ -222,11 +227,14 @@ def test_reversed_logical_ops_with_index(self, op): result = op(ser, idx2) assert_index_equal(result, expected) - @pytest.mark.parametrize("op, expected", [ - (ops.rand_, pd.Index([False, True])), - (ops.ror_, pd.Index([False, True])), - (ops.rxor, 
pd.Index([])), - ]) + @pytest.mark.parametrize( + "op, expected", + [ + (ops.rand_, pd.Index([False, True])), + (ops.ror_, pd.Index([False, True])), + (ops.rxor, pd.Index([])), + ], + ) def test_reverse_ops_with_index(self, op, expected): # https://github.com/pandas-dev/pandas/pull/23628 # multi-set Index ops are buggy, so let's avoid duplicates... @@ -239,30 +247,30 @@ def test_logical_ops_label_based(self): # GH#4947 # logical ops should be label based - a = Series([True, False, True], list('bca')) - b = Series([False, True, False], list('abc')) + a = Series([True, False, True], list("bca")) + b = Series([False, True, False], list("abc")) - expected = Series([False, True, False], list('abc')) + expected = Series([False, True, False], list("abc")) result = a & b assert_series_equal(result, expected) - expected = Series([True, True, False], list('abc')) + expected = Series([True, True, False], list("abc")) result = a | b assert_series_equal(result, expected) - expected = Series([True, False, False], list('abc')) + expected = Series([True, False, False], list("abc")) result = a ^ b assert_series_equal(result, expected) # rhs is bigger - a = Series([True, False, True], list('bca')) - b = Series([False, True, False, True], list('abcd')) + a = Series([True, False, True], list("bca")) + b = Series([False, True, False, True], list("abcd")) - expected = Series([False, True, False, False], list('abcd')) + expected = Series([False, True, False, False], list("abcd")) result = a & b assert_series_equal(result, expected) - expected = Series([True, True, False, False], list('abcd')) + expected = Series([True, True, False, False], list("abcd")) result = a | b assert_series_equal(result, expected) @@ -270,35 +278,39 @@ def test_logical_ops_label_based(self): # vs empty result = a & Series([]) - expected = Series([False, False, False], list('bca')) + expected = Series([False, False, False], list("bca")) assert_series_equal(result, expected) result = a | Series([]) - expected = Series([True, False, True], list('bca')) + expected = Series([True, False, True], list("bca")) assert_series_equal(result, expected) # vs non-matching - result = a & Series([1], ['z']) - expected = Series([False, False, False, False], list('abcz')) + result = a & Series([1], ["z"]) + expected = Series([False, False, False, False], list("abcz")) assert_series_equal(result, expected) - result = a | Series([1], ['z']) - expected = Series([True, True, False, False], list('abcz')) + result = a | Series([1], ["z"]) + expected = Series([True, True, False, False], list("abcz")) assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - for e in [Series([]), Series([1], ['z']), - Series(np.nan, b.index), Series(np.nan, a.index)]: + for e in [ + Series([]), + Series([1], ["z"]), + Series(np.nan, b.index), + Series(np.nan, a.index), + ]: result = a[a | e] assert_series_equal(result, a[a]) - for e in [Series(['z'])]: + for e in [Series(["z"])]: result = a[a | e] assert_series_equal(result, a[a]) # vs scalars - index = list('bca') + index = list("bca") t = Series([True, False, True]) for v in [True, 1, 2]: @@ -306,7 +318,7 @@ def test_logical_ops_label_based(self): expected = Series([True, True, True], index=index) assert_series_equal(result, expected) - for v in [np.nan, 'foo']: + for v in [np.nan, "foo"]: with pytest.raises(TypeError): t | v @@ -330,59 +342,49 @@ def test_logical_ops_label_based(self): def test_logical_ops_df_compat(self): # GH#1134 - s1 = pd.Series([True, False, 
True], index=list('ABC'), name='x') - s2 = pd.Series([True, True, False], index=list('ABD'), name='x') + s1 = pd.Series([True, False, True], index=list("ABC"), name="x") + s2 = pd.Series([True, True, False], index=list("ABD"), name="x") - exp = pd.Series([True, False, False, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, False, False, False], index=list("ABCD"), name="x") assert_series_equal(s1 & s2, exp) assert_series_equal(s2 & s1, exp) # True | np.nan => True - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, False], index=list("ABCD"), name="x") assert_series_equal(s1 | s2, exp) # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, False, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, False, False], index=list("ABCD"), name="x") assert_series_equal(s2 | s1, exp) # DataFrame doesn't fill nan with False - exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, False, np.nan, np.nan]}, index=list("ABCD")) assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) - exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD")) assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) assert_frame_equal(s2.to_frame() | s1.to_frame(), exp) # different length - s3 = pd.Series([True, False, True], index=list('ABC'), name='x') - s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') + s3 = pd.Series([True, False, True], index=list("ABC"), name="x") + s4 = pd.Series([True, True, True, True], index=list("ABCD"), name="x") - exp = pd.Series([True, False, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, False, True, False], index=list("ABCD"), name="x") assert_series_equal(s3 & s4, exp) assert_series_equal(s4 & s3, exp) # np.nan | True => np.nan, filled with False - exp = pd.Series([True, True, True, False], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, False], index=list("ABCD"), name="x") assert_series_equal(s3 | s4, exp) # True | np.nan => True - exp = pd.Series([True, True, True, True], - index=list('ABCD'), name='x') + exp = pd.Series([True, True, True, True], index=list("ABCD"), name="x") assert_series_equal(s4 | s3, exp) - exp = pd.DataFrame({'x': [True, False, True, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, False, True, np.nan]}, index=list("ABCD")) assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) - exp = pd.DataFrame({'x': [True, True, True, np.nan]}, - index=list('ABCD')) + exp = pd.DataFrame({"x": [True, True, True, np.nan]}, index=list("ABCD")) assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) @@ -394,13 +396,13 @@ def test_comparisons(self): left[:3] = np.nan result = nanops.nangt(left, right) - with np.errstate(invalid='ignore'): - expected = (left > right).astype('O') + with np.errstate(invalid="ignore"): + expected = (left > right).astype("O") expected[:3] = np.nan assert_almost_equal(result, expected) - s = Series(['a', 'b', 'c']) + s = Series(["a", "b", "c"]) s2 = Series([False, True, False]) # it works! 
@@ -411,21 +413,21 @@ def test_comparisons(self): def test_categorical_comparisons(self): # GH 8938 # allow equality comparisons - a = Series(list('abc'), dtype="category") - b = Series(list('abc'), dtype="object") - c = Series(['a', 'b', 'cc'], dtype="object") - d = Series(list('acb'), dtype="object") - e = Categorical(list('abc')) - f = Categorical(list('acb')) + a = Series(list("abc"), dtype="category") + b = Series(list("abc"), dtype="object") + c = Series(["a", "b", "cc"], dtype="object") + d = Series(list("acb"), dtype="object") + e = Categorical(list("abc")) + f = Categorical(list("acb")) # vs scalar - assert not (a == 'a').all() - assert ((a != 'a') == ~(a == 'a')).all() + assert not (a == "a").all() + assert ((a != "a") == ~(a == "a")).all() - assert not ('a' == a).all() - assert (a == 'a')[0] - assert ('a' == a)[0] - assert not ('a' != a)[0] + assert not ("a" == a).all() + assert (a == "a")[0] + assert ("a" == a)[0] + assert not ("a" != a)[0] # vs list-like assert (a == a).all() @@ -448,10 +450,10 @@ def test_categorical_comparisons(self): assert not (a == f).all() assert not (f == a).all() - assert ((~(a == e) == (a != e)).all()) - assert ((~(e == a) == (e != a)).all()) - assert ((~(a == f) == (a != f)).all()) - assert ((~(f == a) == (f != a)).all()) + assert (~(a == e) == (a != e)).all() + assert (~(e == a) == (e != a)).all() + assert (~(a == f) == (a != f)).all() + assert (~(f == a) == (f != a)).all() # non-equality is not comparable with pytest.raises(TypeError): @@ -501,11 +503,11 @@ def test_comparison_tuples(self): assert_series_equal(result, expected) def test_comparison_operators_with_nas(self): - ser = Series(bdate_range('1/1/2000', periods=10), dtype=object) + ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) ser[::2] = np.nan # test that comparisons work - ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + ops = ["lt", "le", "gt", "ge", "eq", "ne"] for op in ops: val = ser[5] @@ -514,7 +516,7 @@ def test_comparison_operators_with_nas(self): expected = f(ser.dropna(), val).reindex(ser.index) - if op == 'ne': + if op == "ne": expected = expected.fillna(True).astype(bool) else: expected = expected.fillna(False).astype(bool) @@ -561,11 +563,11 @@ def test_ne(self): def test_comp_ops_df_compat(self): # GH 1134 - s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') - s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") + s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: @@ -591,70 +593,68 @@ def test_comp_ops_df_compat(self): def test_compare_series_interval_keyword(self): # GH 25338 - s = Series(['IntervalA', 'IntervalB', 'IntervalC']) - result = s == 'IntervalA' + s = Series(["IntervalA", "IntervalB", "IntervalC"]) + result = s == "IntervalA" expected = Series([True, False, False]) assert_series_equal(result, expected) class TestSeriesFlexComparisonOps: - def test_comparison_flex_alignment(self): - left = Series([1, 3, 2], index=list('abc')) - right = Series([2, 2, 2], index=list('bcd')) + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) - exp = pd.Series([False, False, True, False], index=list('abcd')) + exp = pd.Series([False, False, True, False], 
index=list("abcd")) assert_series_equal(left.eq(right), exp) - exp = pd.Series([True, True, False, True], index=list('abcd')) + exp = pd.Series([True, True, False, True], index=list("abcd")) assert_series_equal(left.ne(right), exp) - exp = pd.Series([False, False, True, False], index=list('abcd')) + exp = pd.Series([False, False, True, False], index=list("abcd")) assert_series_equal(left.le(right), exp) - exp = pd.Series([False, False, False, False], index=list('abcd')) + exp = pd.Series([False, False, False, False], index=list("abcd")) assert_series_equal(left.lt(right), exp) - exp = pd.Series([False, True, True, False], index=list('abcd')) + exp = pd.Series([False, True, True, False], index=list("abcd")) assert_series_equal(left.ge(right), exp) - exp = pd.Series([False, True, False, False], index=list('abcd')) + exp = pd.Series([False, True, False, False], index=list("abcd")) assert_series_equal(left.gt(right), exp) def test_comparison_flex_alignment_fill(self): - left = Series([1, 3, 2], index=list('abc')) - right = Series([2, 2, 2], index=list('bcd')) + left = Series([1, 3, 2], index=list("abc")) + right = Series([2, 2, 2], index=list("bcd")) - exp = pd.Series([False, False, True, True], index=list('abcd')) + exp = pd.Series([False, False, True, True], index=list("abcd")) assert_series_equal(left.eq(right, fill_value=2), exp) - exp = pd.Series([True, True, False, False], index=list('abcd')) + exp = pd.Series([True, True, False, False], index=list("abcd")) assert_series_equal(left.ne(right, fill_value=2), exp) - exp = pd.Series([False, False, True, True], index=list('abcd')) + exp = pd.Series([False, False, True, True], index=list("abcd")) assert_series_equal(left.le(right, fill_value=0), exp) - exp = pd.Series([False, False, False, True], index=list('abcd')) + exp = pd.Series([False, False, False, True], index=list("abcd")) assert_series_equal(left.lt(right, fill_value=0), exp) - exp = pd.Series([True, True, True, False], index=list('abcd')) + exp = pd.Series([True, True, True, False], index=list("abcd")) assert_series_equal(left.ge(right, fill_value=0), exp) - exp = pd.Series([True, True, False, False], index=list('abcd')) + exp = pd.Series([True, True, False, False], index=list("abcd")) assert_series_equal(left.gt(right, fill_value=0), exp) class TestSeriesOperators(TestData): - def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) - s2 = Series({'x': 0.}) - assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) + s2 = Series({"x": 0.0}) + assert_series_equal(s1 * s2, Series([np.nan], index=["x"])) def test_ops_datetimelike_align(self): # GH 7500 # datetimelike ops need to align - dt = Series(date_range('2012-1-1', periods=3, freq='D')) + dt = Series(date_range("2012-1-1", periods=3, freq="D")) dt.iloc[2] = np.nan dt2 = dt[::-1] @@ -687,23 +687,26 @@ def test_operators_corner(self): # float + int int_ts = self.ts.astype(int)[:-5] added = self.ts + int_ts - expected = Series(self.ts.values[:-5] + int_ts.values, - index=self.ts.index[:-5], name='ts') + expected = Series( + self.ts.values[:-5] + int_ts.values, index=self.ts.index[:-5], name="ts" + ) tm.assert_series_equal(added[:-5], expected) - pairings = [(Series.div, operator.truediv, 1), - (Series.rdiv, lambda x, y: operator.truediv(y, x), 1)] - for op in ['add', 'sub', 'mul', 'pow', 'truediv', 'floordiv']: + pairings = [ + (Series.div, operator.truediv, 1), + (Series.rdiv, lambda x, y: operator.truediv(y, x), 1), + ] + for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]: fv = 0 lop = 
getattr(Series, op) lequiv = getattr(operator, op) - rop = getattr(Series, 'r' + op) + rop = getattr(Series, "r" + op) # bind op at definition time... requiv = lambda x, y, op=op: getattr(operator, op)(y, x) pairings.append((lop, lequiv, fv)) pairings.append((rop, requiv, fv)) - @pytest.mark.parametrize('op, equiv_op, fv', pairings) + @pytest.mark.parametrize("op, equiv_op, fv", pairings) def test_operators_combine(self, op, equiv_op, fv): def _check_fill(meth, op, a, b, fill_value=0): exp_index = a.index.union(b.index) @@ -715,7 +718,7 @@ def _check_fill(meth, op, a, b, fill_value=0): exp_values = [] for i in range(len(exp_index)): - with np.errstate(all='ignore'): + with np.errstate(all="ignore"): if amask[i]: if bmask[i]: exp_values.append(np.nan) @@ -733,8 +736,8 @@ def _check_fill(meth, op, a, b, fill_value=0): expected = Series(exp_values, exp_index) assert_series_equal(result, expected) - a = Series([np.nan, 1., 2., 3., np.nan], index=np.arange(5)) - b = Series([np.nan, 1, np.nan, 3, np.nan, 4.], index=np.arange(6)) + a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5)) + b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6)) result = op(a, b) exp = equiv_op(a, b) @@ -746,8 +749,10 @@ def _check_fill(meth, op, a, b, fill_value=0): def test_operators_na_handling(self): from decimal import Decimal from datetime import date - s = Series([Decimal('1.3'), Decimal('2.3')], - index=[date(2012, 1, 1), date(2012, 1, 2)]) + + s = Series( + [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] + ) result = s + s.shift(1) result2 = s.shift(1) + s @@ -764,8 +769,8 @@ def test_op_duplicate_index(self): def test_divmod(self): # GH25557 - a = Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd']) - b = Series([2, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e']) + a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"]) + b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"]) result = a.divmod(b) expected = divmod(a, b) @@ -783,10 +788,10 @@ class TestSeriesUnaryOps: def test_neg(self): ser = tm.makeStringSeries() - ser.name = 'series' + ser.name = "series" assert_series_equal(-ser, -1 * ser) def test_invert(self): ser = tm.makeStringSeries() - ser.name = 'series' + ser.name = "series" assert_series_equal(-(ser < 0), ~(ser < 0)) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 6b0edf670e03e..9b34b52bf39b9 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -8,52 +8,49 @@ class TestSeriesPeriod: - def setup_method(self, method): - self.series = Series(period_range('2000-01-01', periods=10, freq='D')) + self.series = Series(period_range("2000-01-01", periods=10, freq="D")) def test_auto_conversion(self): - series = Series(list(period_range('2000-01-01', periods=10, freq='D'))) - assert series.dtype == 'Period[D]' + series = Series(list(period_range("2000-01-01", periods=10, freq="D"))) + assert series.dtype == "Period[D]" - series = pd.Series([pd.Period('2011-01-01', freq='D'), - pd.Period('2011-02-01', freq='D')]) - assert series.dtype == 'Period[D]' + series = pd.Series( + [pd.Period("2011-01-01", freq="D"), pd.Period("2011-02-01", freq="D")] + ) + assert series.dtype == "Period[D]" def test_getitem(self): - assert self.series[1] == pd.Period('2000-01-02', freq='D') + assert self.series[1] == pd.Period("2000-01-02", freq="D") result = self.series[[2, 4]] - exp = pd.Series([pd.Period('2000-01-03', freq='D'), - pd.Period('2000-01-05', freq='D')], - index=[2, 4], 
dtype='Period[D]') + exp = pd.Series( + [pd.Period("2000-01-03", freq="D"), pd.Period("2000-01-05", freq="D")], + index=[2, 4], + dtype="Period[D]", + ) tm.assert_series_equal(result, exp) - assert result.dtype == 'Period[D]' + assert result.dtype == "Period[D]" def test_isna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) tm.assert_series_equal(s.isna(), Series([False, True])) tm.assert_series_equal(s.notna(), Series([True, False])) def test_fillna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) - res = s.fillna(pd.Period('2012-01', freq='M')) - exp = Series([pd.Period('2011-01', freq='M'), - pd.Period('2012-01', freq='M')]) + res = s.fillna(pd.Period("2012-01", freq="M")) + exp = Series([pd.Period("2011-01", freq="M"), pd.Period("2012-01", freq="M")]) tm.assert_series_equal(res, exp) - assert res.dtype == 'Period[M]' + assert res.dtype == "Period[M]" def test_dropna(self): # GH 13737 - s = Series([pd.Period('2011-01', freq='M'), - pd.Period('NaT', freq='M')]) - tm.assert_series_equal(s.dropna(), - Series([pd.Period('2011-01', freq='M')])) + s = Series([pd.Period("2011-01", freq="M"), pd.Period("NaT", freq="M")]) + tm.assert_series_equal(s.dropna(), Series([pd.Period("2011-01", freq="M")])) def test_between(self): left, right = self.series[[2, 7]] @@ -66,7 +63,7 @@ def test_between(self): @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") def test_NaT_scalar(self): - series = Series([0, 1000, 2000, pd._libs.iNaT], dtype='period[D]') + series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]") val = series[3] assert pd.isna(val) @@ -76,7 +73,7 @@ def test_NaT_scalar(self): @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") def test_NaT_cast(self): - result = Series([np.nan]).astype('period[D]') + result = Series([np.nan]).astype("period[D]") expected = Series([pd.NaT]) tm.assert_series_equal(result, expected) @@ -96,64 +93,65 @@ def test_set_nan(self): assert self.series[6] is pd.NaT def test_intercept_astype_object(self): - expected = self.series.astype('object') + expected = self.series.astype("object") - df = DataFrame({'a': self.series, - 'b': np.random.randn(len(self.series))}) + df = DataFrame({"a": self.series, "b": np.random.randn(len(self.series))}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() - df = DataFrame({'a': self.series, 'b': ['foo'] * len(self.series)}) + df = DataFrame({"a": self.series, "b": ["foo"] * len(self.series)}) result = df.values.squeeze() assert (result[:, 0] == expected.values).all() def test_align_series(self, join_type): - rng = period_range('1/1/2000', '1/1/2010', freq='A') + rng = period_range("1/1/2000", "1/1/2010", freq="A") ts = Series(np.random.randn(len(rng)), index=rng) ts.align(ts[::2], join=join_type) def test_truncate(self): # GH 17717 - idx1 = pd.PeriodIndex([ - pd.Period('2017-09-02'), - pd.Period('2017-09-02'), - pd.Period('2017-09-03') - ]) + idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) series1 = pd.Series([1, 2, 3], index=idx1) - result1 = series1.truncate(after='2017-09-02') + result1 = series1.truncate(after="2017-09-02") - expected_idx1 = pd.PeriodIndex([ - pd.Period('2017-09-02'), - pd.Period('2017-09-02') - ]) + expected_idx1 = pd.PeriodIndex( + [pd.Period("2017-09-02"), 
pd.Period("2017-09-02")] + ) tm.assert_series_equal(result1, pd.Series([1, 2], index=expected_idx1)) - idx2 = pd.PeriodIndex([ - pd.Period('2017-09-03'), - pd.Period('2017-09-02'), - pd.Period('2017-09-03') - ]) + idx2 = pd.PeriodIndex( + [pd.Period("2017-09-03"), pd.Period("2017-09-02"), pd.Period("2017-09-03")] + ) series2 = pd.Series([1, 2, 3], index=idx2) - result2 = series2.sort_index().truncate(after='2017-09-02') + result2 = series2.sort_index().truncate(after="2017-09-02") - expected_idx2 = pd.PeriodIndex([ - pd.Period('2017-09-02') - ]) + expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2)) - @pytest.mark.parametrize('input_vals', [ - [Period('2016-01', freq='M'), Period('2016-02', freq='M')], - [Period('2016-01-01', freq='D'), Period('2016-01-02', freq='D')], - [Period('2016-01-01 00:00:00', freq='H'), - Period('2016-01-01 01:00:00', freq='H')], - [Period('2016-01-01 00:00:00', freq='M'), - Period('2016-01-01 00:01:00', freq='M')], - [Period('2016-01-01 00:00:00', freq='S'), - Period('2016-01-01 00:00:01', freq='S')] - ]) + @pytest.mark.parametrize( + "input_vals", + [ + [Period("2016-01", freq="M"), Period("2016-02", freq="M")], + [Period("2016-01-01", freq="D"), Period("2016-01-02", freq="D")], + [ + Period("2016-01-01 00:00:00", freq="H"), + Period("2016-01-01 01:00:00", freq="H"), + ], + [ + Period("2016-01-01 00:00:00", freq="M"), + Period("2016-01-01 00:01:00", freq="M"), + ], + [ + Period("2016-01-01 00:00:00", freq="S"), + Period("2016-01-01 00:00:01", freq="S"), + ], + ], + ) def test_end_time_timevalues(self, input_vals): # GH 17157 # Check that the time part of the Period is adjusted by end_time @@ -165,11 +163,9 @@ def test_end_time_timevalues(self, input_vals): expected = s.apply(lambda x: x.end_time) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('input_vals', [ - ('2001'), ('NaT') - ]) + @pytest.mark.parametrize("input_vals", [("2001"), ("NaT")]) def test_to_period(self, input_vals): # GH 21205 - expected = Series([input_vals], dtype='Period[D]') - result = Series([input_vals], dtype='datetime64[ns]').dt.to_period('D') + expected = Series([input_vals], dtype="Period[D]") + result = Series([input_vals], dtype="datetime64[ns]").dt.to_period("D") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 18bc7b119c97a..b001312fa37f3 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -12,7 +12,6 @@ class TestSeriesQuantile(TestData): - def test_quantile(self): q = self.ts.quantile(0.1) @@ -27,50 +26,53 @@ def test_quantile(self): # datetime64[ns] dtype dts = self.ts.index.to_series() - q = dts.quantile(.2) - assert q == Timestamp('2000-01-10 19:12:00') + q = dts.quantile(0.2) + assert q == Timestamp("2000-01-10 19:12:00") # timedelta64[ns] dtype tds = dts.diff() - q = tds.quantile(.25) - assert q == pd.to_timedelta('24:00:00') + q = tds.quantile(0.25) + assert q == pd.to_timedelta("24:00:00") # GH7661 - result = Series([np.timedelta64('NaT')]).sum() + result = Series([np.timedelta64("NaT")]).sum() assert result == pd.Timedelta(0) - msg = 'percentiles should all be in the interval \\[0, 1\\]' + msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): self.ts.quantile(invalid) def test_quantile_multi(self): - qs = [.1, .9] + qs = [0.1, 0.9] result = 
self.ts.quantile(qs) - expected = pd.Series([np.percentile(self.ts.dropna(), 10), - np.percentile(self.ts.dropna(), 90)], - index=qs, name=self.ts.name) + expected = pd.Series( + [np.percentile(self.ts.dropna(), 10), np.percentile(self.ts.dropna(), 90)], + index=qs, + name=self.ts.name, + ) tm.assert_series_equal(result, expected) dts = self.ts.index.to_series() - dts.name = 'xxx' - result = dts.quantile((.2, .2)) - expected = Series([Timestamp('2000-01-10 19:12:00'), - Timestamp('2000-01-10 19:12:00')], - index=[.2, .2], name='xxx') + dts.name = "xxx" + result = dts.quantile((0.2, 0.2)) + expected = Series( + [Timestamp("2000-01-10 19:12:00"), Timestamp("2000-01-10 19:12:00")], + index=[0.2, 0.2], + name="xxx", + ) tm.assert_series_equal(result, expected) result = self.ts.quantile([]) - expected = pd.Series([], name=self.ts.name, index=Index( - [], dtype=float)) + expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float)) tm.assert_series_equal(result, expected) def test_quantile_interpolation(self): # see gh-10174 # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation='linear') + q = self.ts.quantile(0.1, interpolation="linear") assert q == np.percentile(self.ts.dropna(), 10) q1 = self.ts.quantile(0.1) assert q1 == np.percentile(self.ts.dropna(), 10) @@ -82,11 +84,11 @@ def test_quantile_interpolation_dtype(self): # GH #10174 # interpolation = linear (default case) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="lower") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) - q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation="higher") assert q == np.percentile(np.array([1, 3, 4]), 50) assert is_integer(q) @@ -109,38 +111,56 @@ def test_quantile_nan(self): tm.assert_series_equal(res, pd.Series([np.nan], index=[0.5])) res = s.quantile([0.2, 0.3]) - tm.assert_series_equal(res, pd.Series([np.nan, np.nan], - index=[0.2, 0.3])) - - @pytest.mark.parametrize('case', [ - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03')], - [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern')], - [pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days')], - # NaT - [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), - pd.Timestamp('2011-01-03'), pd.NaT], - [pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-01-02', tz='US/Eastern'), - pd.Timestamp('2011-01-03', tz='US/Eastern'), pd.NaT], - [pd.Timedelta('1 days'), pd.Timedelta('2 days'), - pd.Timedelta('3 days'), pd.NaT]]) + tm.assert_series_equal(res, pd.Series([np.nan, np.nan], index=[0.2, 0.3])) + + @pytest.mark.parametrize( + "case", + [ + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + ], + [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + ], + [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")], + # NaT + [ + pd.Timestamp("2011-01-01"), + pd.Timestamp("2011-01-02"), + pd.Timestamp("2011-01-03"), + pd.NaT, + ], + [ + pd.Timestamp("2011-01-01", tz="US/Eastern"), + pd.Timestamp("2011-01-02", tz="US/Eastern"), + pd.Timestamp("2011-01-03", tz="US/Eastern"), + pd.NaT, + ], + [ + pd.Timedelta("1 days"), + pd.Timedelta("2 days"), + pd.Timedelta("3 days"), 
+ pd.NaT, + ], + ], + ) def test_quantile_box(self, case): - s = pd.Series(case, name='XXX') + s = pd.Series(case, name="XXX") res = s.quantile(0.5) assert res == case[1] res = s.quantile([0.5]) - exp = pd.Series([case[1]], index=[0.5], name='XXX') + exp = pd.Series([case[1]], index=[0.5], name="XXX") tm.assert_series_equal(res, exp) def test_datetime_timedelta_quantiles(self): # covers #9694 - assert pd.isna(Series([], dtype='M8[ns]').quantile(.5)) - assert pd.isna(Series([], dtype='m8[ns]').quantile(.5)) + assert pd.isna(Series([], dtype="M8[ns]").quantile(0.5)) + assert pd.isna(Series([], dtype="m8[ns]").quantile(0.5)) def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile(0.5) @@ -149,10 +169,10 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) - @pytest.mark.parametrize('values, dtype', [ - ([0, 0, 0, 1, 2, 3], 'Sparse[int]'), - ([0., None, 1., 2.], 'Sparse[float]'), - ]) + @pytest.mark.parametrize( + "values, dtype", + [([0, 0, 0, 1, 2, 3], "Sparse[int]"), ([0.0, None, 1.0, 2.0], "Sparse[float]")], + ) def test_quantile_sparse(self, values, dtype): ser = pd.Series(values, dtype=dtype) result = ser.quantile([0.5]) @@ -162,7 +182,7 @@ def test_quantile_sparse(self, values, dtype): def test_quantile_empty(self): # floats - s = Series([], dtype='float64') + s = Series([], dtype="float64") res = s.quantile(0.5) assert np.isnan(res) @@ -172,7 +192,7 @@ def test_quantile_empty(self): tm.assert_series_equal(res, exp) # int - s = Series([], dtype='int64') + s = Series([], dtype="int64") res = s.quantile(0.5) assert np.isnan(res) @@ -182,7 +202,7 @@ def test_quantile_empty(self): tm.assert_series_equal(res, exp) # datetime - s = Series([], dtype='datetime64[ns]') + s = Series([], dtype="datetime64[ns]") res = s.quantile(0.5) assert res is pd.NaT diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 05ea9df545211..f93e1651c8b10 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -19,23 +19,22 @@ class TestSeriesRank(TestData): s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) results = { - 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, - 3.5, 1.5, 8.0, nan, 5.5]), - 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), - 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), - 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), + "average": np.array([1.5, 5.5, 7.0, 3.5, nan, 3.5, 1.5, 8.0, nan, 5.5]), + "min": np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + "max": np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + "first": np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + "dense": np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } def test_rank(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") self.ts[::2] = np.nan - self.ts[:10][::3] = 4. 
+ self.ts[:10][::3] = 4.0 ranks = self.ts.rank() - oranks = self.ts.astype('O').rank() + oranks = self.ts.astype("O").rank() assert_series_equal(ranks, oranks) @@ -43,7 +42,7 @@ def test_rank(self): filled = self.ts.fillna(np.inf) # rankdata returns a ndarray - exp = Series(rankdata(filled), index=filled.index, name='ts') + exp = Series(rankdata(filled), index=filled.index, name="ts") exp[mask] = np.nan tm.assert_series_equal(ranks, exp) @@ -87,7 +86,7 @@ def test_rank(self): iranks = iseries.rank(pct=True) assert_series_equal(iranks, exp) - rng = date_range('1/1/1990', periods=5) + rng = date_range("1/1/1990", periods=5) iseries = Series(np.arange(5), rng) + 1 iseries.iloc[4] = np.nan exp = iseries / 4.0 @@ -100,18 +99,18 @@ def test_rank(self): assert_series_equal(iranks, exp) # GH 5968 - iseries = Series(['3 day', '1 day 10m', '-2 day', NaT], - dtype='m8[ns]') + iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]") exp = Series([3, 2, 1, np.nan]) iranks = iseries.rank() assert_series_equal(iranks, exp) values = np.array( - [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40 - ], dtype='float64') + [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40], + dtype="float64", + ) random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) - exp = Series(random_order + 1.0, dtype='float64') + exp = Series(random_order + 1.0, dtype="float64") iranks = iseries.rank() assert_series_equal(iranks, exp) @@ -119,125 +118,171 @@ def test_rank_categorical(self): # GH issue #15420 rank incorrectly orders ordered categories # Test ascending/descending ranking for ordered categoricals - exp = Series([1., 2., 3., 4., 5., 6.]) - exp_desc = Series([6., 5., 4., 3., 2., 1.]) + exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) ordered = Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth'] - ).astype(CategoricalDtype(categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=True)) + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=True, + ) + ) assert_series_equal(ordered.rank(), exp) assert_series_equal(ordered.rank(ascending=False), exp_desc) # Unordered categoricals should be ranked as objects - unordered = Series(['first', 'second', 'third', 'fourth', - 'fifth', 'sixth']).astype( - CategoricalDtype(categories=['first', 'second', 'third', - 'fourth', 'fifth', 'sixth'], - ordered=False)) - exp_unordered = Series([2., 4., 6., 3., 1., 5.]) + unordered = Series( + ["first", "second", "third", "fourth", "fifth", "sixth"] + ).astype( + CategoricalDtype( + categories=["first", "second", "third", "fourth", "fifth", "sixth"], + ordered=False, + ) + ) + exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0]) res = unordered.rank() assert_series_equal(res, exp_unordered) - unordered1 = Series( - [1, 2, 3, 4, 5, 6], - ).astype(CategoricalDtype([1, 2, 3, 4, 5, 6], False)) - exp_unordered1 = Series([1., 2., 3., 4., 5., 6.]) + unordered1 = Series([1, 2, 3, 4, 5, 6]).astype( + CategoricalDtype([1, 2, 3, 4, 5, 6], False) + ) + exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) res1 = unordered1.rank() assert_series_equal(res1, exp_unordered1) # Test na_option for rank data na_ser = Series( - ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', np.NaN] - ).astype(CategoricalDtype(['first', 'second', 'third', 'fourth', - 'fifth', 'sixth', 'seventh'], 
True)) + ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN] + ).astype( + CategoricalDtype( + ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"], + True, + ) + ) - exp_top = Series([2., 3., 4., 5., 6., 7., 1.]) - exp_bot = Series([1., 2., 3., 4., 5., 6., 7.]) - exp_keep = Series([1., 2., 3., 4., 5., 6., np.NaN]) + exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0]) + exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]) + exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN]) - assert_series_equal(na_ser.rank(na_option='top'), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom'), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep'), exp_keep) + assert_series_equal(na_ser.rank(na_option="top"), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep"), exp_keep) # Test na_option for rank data with ascending False - exp_top = Series([7., 6., 5., 4., 3., 2., 1.]) - exp_bot = Series([6., 5., 4., 3., 2., 1., 7.]) - exp_keep = Series([6., 5., 4., 3., 2., 1., np.NaN]) + exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]) + exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0]) + exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN]) - assert_series_equal( - na_ser.rank(na_option='top', ascending=False), - exp_top - ) - assert_series_equal( - na_ser.rank(na_option='bottom', ascending=False), - exp_bot - ) - assert_series_equal( - na_ser.rank(na_option='keep', ascending=False), - exp_keep - ) + assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom", ascending=False), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep) # Test invalid values for na_option msg = "na_option must be one of 'keep', 'top', or 'bottom'" with pytest.raises(ValueError, match=msg): - na_ser.rank(na_option='bad', ascending=False) + na_ser.rank(na_option="bad", ascending=False) # invalid type with pytest.raises(ValueError, match=msg): na_ser.rank(na_option=True, ascending=False) # Test with pct=True - na_ser = Series(['first', 'second', 'third', 'fourth', np.NaN]).astype( - CategoricalDtype(['first', 'second', 'third', 'fourth'], True)) - exp_top = Series([0.4, 0.6, 0.8, 1., 0.2]) - exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.]) - exp_keep = Series([0.25, 0.5, 0.75, 1., np.NaN]) + na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype( + CategoricalDtype(["first", "second", "third", "fourth"], True) + ) + exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2]) + exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0]) + exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN]) - assert_series_equal(na_ser.rank(na_option='top', pct=True), exp_top) - assert_series_equal(na_ser.rank(na_option='bottom', pct=True), exp_bot) - assert_series_equal(na_ser.rank(na_option='keep', pct=True), exp_keep) + assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top) + assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot) + assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep) def test_rank_signature(self): s = Series([0, 1]) - s.rank(method='average') - msg = ("No axis named average for object type" - " ") + s.rank(method="average") + msg = ( + "No axis named average for object type" + " " + ) with pytest.raises(ValueError, match=msg): - s.rank('average') - - @pytest.mark.parametrize('contents,dtype', [ - ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, - 2, 40, 
np.inf], - 'float64'), - ([-np.inf, -50, -1, -1e-20, -1e-25, -1e-45, 0, 1e-40, 1e-20, 1e-10, - 2, 40, np.inf], - 'float32'), - ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], - 'uint8'), - pytest.param([np.iinfo(np.int64).min, -100, 0, 1, 9999, 100000, - 1e10, np.iinfo(np.int64).max], - 'int64', - marks=pytest.mark.xfail( - reason="iNaT is equivalent to minimum value of dtype" - "int64 pending issue GH#16674")), - ([NegInfinity(), '1', 'A', 'BA', 'Ba', 'C', Infinity()], - 'object') - ]) + s.rank("average") + + @pytest.mark.parametrize( + "contents,dtype", + [ + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-50, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float64", + ), + ( + [ + -np.inf, + -50, + -1, + -1e-20, + -1e-25, + -1e-45, + 0, + 1e-40, + 1e-20, + 1e-10, + 2, + 40, + np.inf, + ], + "float32", + ), + ([np.iinfo(np.uint8).min, 1, 2, 100, np.iinfo(np.uint8).max], "uint8"), + pytest.param( + [ + np.iinfo(np.int64).min, + -100, + 0, + 1, + 9999, + 100000, + 1e10, + np.iinfo(np.int64).max, + ], + "int64", + marks=pytest.mark.xfail( + reason="iNaT is equivalent to minimum value of dtype" + "int64 pending issue GH#16674" + ), + ), + ([NegInfinity(), "1", "A", "BA", "Ba", "C", Infinity()], "object"), + ], + ) def test_rank_inf(self, contents, dtype): dtype_na_map = { - 'float64': np.nan, - 'float32': np.nan, - 'int64': iNaT, - 'object': None + "float64": np.nan, + "float32": np.nan, + "int64": iNaT, + "object": None, } # Insert nans at random positions if underlying dtype has missing # value. Then adjust the expected order by adding nans accordingly # This is for testing whether rank calculation is affected # when values are interwined with nan values. values = np.array(contents, dtype=dtype) - exp_order = np.array(range(len(values)), dtype='float64') + 1.0 + exp_order = np.array(range(len(values)), dtype="float64") + 1.0 if dtype in dtype_na_map: na_value = dtype_na_map[dtype] nan_indices = np.random.choice(range(len(values)), 5) @@ -246,19 +291,19 @@ def test_rank_inf(self, contents, dtype): # shuffle the testing array and expected results in the same way random_order = np.random.permutation(len(values)) iseries = Series(values[random_order]) - exp = Series(exp_order[random_order], dtype='float64') + exp = Series(exp_order[random_order], dtype="float64") iranks = iseries.rank() assert_series_equal(iranks, exp) def test_rank_tie_methods(self): s = self.s - def _check(s, expected, method='average'): + def _check(s, expected, method="average"): result = s.rank(method=method) tm.assert_series_equal(result, Series(expected)) dtypes = [None, object] - disabled = {(object, 'first')} + disabled = {(object, "first")} results = self.results for method, dtype in product(results, dtypes): @@ -268,36 +313,36 @@ def _check(s, expected, method='average'): _check(series, results[method], method=method) @td.skip_if_no_scipy - @pytest.mark.parametrize('ascending', [True, False]) - @pytest.mark.parametrize('method', ['average', 'min', 'max', 'first', - 'dense']) - @pytest.mark.parametrize('na_option', ['top', 'bottom', 'keep']) + @pytest.mark.parametrize("ascending", [True, False]) + @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) + @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) def test_rank_tie_methods_on_infs_nans(self, method, na_option, ascending): - dtypes = [('object', None, Infinity(), NegInfinity()), - ('float64', np.nan, np.inf, -np.inf)] + dtypes = [ + ("object", None, Infinity(), NegInfinity()), + 
("float64", np.nan, np.inf, -np.inf), + ] chunk = 3 - disabled = {('object', 'first')} + disabled = {("object", "first")} def _check(s, method, na_option, ascending): exp_ranks = { - 'average': ([2, 2, 2], [5, 5, 5], [8, 8, 8]), - 'min': ([1, 1, 1], [4, 4, 4], [7, 7, 7]), - 'max': ([3, 3, 3], [6, 6, 6], [9, 9, 9]), - 'first': ([1, 2, 3], [4, 5, 6], [7, 8, 9]), - 'dense': ([1, 1, 1], [2, 2, 2], [3, 3, 3]) + "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]), + "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]), + "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]), + "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]), + "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]), } ranks = exp_ranks[method] - if na_option == 'top': + if na_option == "top": order = [ranks[1], ranks[0], ranks[2]] - elif na_option == 'bottom': + elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] expected = list(chain.from_iterable(expected)) - result = s.rank(method=method, na_option=na_option, - ascending=ascending) - tm.assert_series_equal(result, Series(expected, dtype='float64')) + result = s.rank(method=method, na_option=na_option, ascending=ascending) + tm.assert_series_equal(result, Series(expected, dtype="float64")) for dtype, na_value, pos_inf, neg_inf in dtypes: in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk @@ -311,51 +356,53 @@ def test_rank_desc_mix_nans_infs(self): # check descending ranking when mix nans and infs iseries = Series([1, np.nan, np.inf, -np.inf, 25]) result = iseries.rank(ascending=False) - exp = Series([3, np.nan, 1, 4, 2], dtype='float64') + exp = Series([3, np.nan, 1, 4, 2], dtype="float64") tm.assert_series_equal(result, exp) def test_rank_methods_series(self): - pytest.importorskip('scipy.stats.special') - rankdata = pytest.importorskip('scipy.stats.rankdata') + pytest.importorskip("scipy.stats.special") + rankdata = pytest.importorskip("scipy.stats.rankdata") xs = np.random.randn(9) xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates np.random.shuffle(xs) - index = [chr(ord('a') + i) for i in range(len(xs))] + index = [chr(ord("a") + i) for i in range(len(xs))] for vals in [xs, xs + 1e6, xs * 1e-6]: ts = Series(vals, index=index) - for m in ['average', 'min', 'max', 'first', 'dense']: + for m in ["average", "min", "max", "first", "dense"]: result = ts.rank(method=m) - sprank = rankdata(vals, m if m != 'first' else 'ordinal') - expected = Series(sprank, index=index).astype('float64') + sprank = rankdata(vals, m if m != "first" else "ordinal") + expected = Series(sprank, index=index).astype("float64") tm.assert_series_equal(result, expected) def test_rank_dense_method(self): - dtypes = ['O', 'f8', 'i8'] - in_out = [([1], [1]), - ([2], [1]), - ([0], [1]), - ([2, 2], [1, 1]), - ([1, 2, 3], [1, 2, 3]), - ([4, 2, 1], [3, 2, 1],), - ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), - ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])] + dtypes = ["O", "f8", "i8"] + in_out = [ + ([1], [1]), + ([2], [1]), + ([0], [1]), + ([2, 2], [1, 1]), + ([1, 2, 3], [1, 2, 3]), + ([4, 2, 1], [3, 2, 1]), + ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]), + ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]), + ] for ser, exp in in_out: for dtype in dtypes: s = Series(ser).astype(dtype) - result = s.rank(method='dense') + result = s.rank(method="dense") expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) def test_rank_descending(self): - dtypes = ['O', 'f8', 'i8'] + dtypes = ["O", "f8", "i8"] for dtype, method in 
product(dtypes, self.results): - if 'i' in dtype: + if "i" in dtype: s = self.s.dropna() else: s = self.s.astype(dtype) @@ -364,7 +411,7 @@ def test_rank_descending(self): expected = (s.max() - s).rank() assert_series_equal(res, expected) - if method == 'first' and dtype == 'O': + if method == "first" and dtype == "O": continue expected = (s.max() - s).rank(method=method) @@ -372,7 +419,7 @@ def test_rank_descending(self): assert_series_equal(res2, expected) def test_rank_int(self): - s = self.s.dropna().astype('i8') + s = self.s.dropna().astype("i8") for method, res in self.results.items(): result = s.rank(method=method) @@ -390,7 +437,7 @@ def test_rank_object_bug(self): def test_rank_modify_inplace(self): # GH 18521 # Check rank does not mutate series - s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) + s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT]) expected = s.copy() s.rank() @@ -400,93 +447,113 @@ def test_rank_modify_inplace(self): # GH15630, pct should be on 100% basis when method='dense' -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1., 1.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), - ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) + +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 2, 2.0 / 2, 2.0 / 2]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 3, 1.0 / 3, 3.0 / 3, 3.0 / 3, 2.0 / 3]), + ([1, 1, 3, 3, 5, 5], [1.0 / 3, 1.0 / 3, 2.0 / 3, 2.0 / 3, 3.0 / 3, 3.0 / 3]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_dense_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='dense', pct=True) + result = s.rank(method="dense", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1. / 2, 1. / 2]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. 
/ 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 1.0 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 2.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 1.0 / 5, 4.0 / 5, 4.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 1.0 / 6, 3.0 / 6, 3.0 / 6, 5.0 / 6, 5.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_min_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='min', pct=True) + result = s.rank(method="min", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1., 1.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0, 1.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 3.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [2.0 / 5, 2.0 / 5, 5.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [2.0 / 6, 2.0 / 6, 4.0 / 6, 4.0 / 6, 6.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_max_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='max', pct=True) + result = s.rank(method="max", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1.5 / 2, 1.5 / 2]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), - ([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], - [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_average_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='average', pct=True) + result = s.rank(method="average", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) -@pytest.mark.parametrize('dtype', ['f8', 'i8']) -@pytest.mark.parametrize('ser, exp', [ - ([1], [1.]), - ([1, 2], [1. / 2, 2. / 2]), - ([2, 2], [1. / 2, 2. / 2.]), - ([1, 2, 3], [1. / 3, 2. / 3, 3. 
/ 3]), - ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]), - ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), - ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]), - ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]), - ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +@pytest.mark.parametrize("dtype", ["f8", "i8"]) +@pytest.mark.parametrize( + "ser, exp", + [ + ([1], [1.0]), + ([1, 2], [1.0 / 2, 2.0 / 2]), + ([2, 2], [1.0 / 2, 2.0 / 2.0]), + ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([1, 2, 2], [1.0 / 3, 2.0 / 3, 3.0 / 3]), + ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]), + ([1, 1, 5, 5, 3], [1.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 3.0 / 5]), + ([1, 1, 3, 3, 5, 5], [1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6, 6.0 / 6]), + ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]), + ], +) def test_rank_first_pct(dtype, ser, exp): s = Series(ser).astype(dtype) - result = s.rank(method='first', pct=True) + result = s.rank(method="first", pct=True) expected = Series(exp).astype(result.dtype) assert_series_equal(result, expected) @@ -495,6 +562,6 @@ def test_rank_first_pct(dtype, ser, exp): @pytest.mark.high_memory def test_pct_max_many_rows(): # GH 18271 - s = Series(np.arange(2**24 + 1)) + s = Series(np.arange(2 ** 24 + 1)) result = s.rank(pct=True).max() assert result == 1 diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 92096b3c95670..06a859963cf93 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -20,18 +20,17 @@ def test_replace(self): exp = ser.fillna(-1) tm.assert_series_equal(ser, exp) - rs = ser.replace(0., np.nan) - ser[ser == 0.] = np.nan + rs = ser.replace(0.0, np.nan) + ser[ser == 0.0] = np.nan tm.assert_series_equal(rs, ser) - ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), - dtype=object) + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan - ser[6:10] = 'foo' - ser[20:30] = 'bar' + ser[6:10] = "foo" + ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, 'foo', 'bar'], -1) + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -39,7 +38,7 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -47,11 +46,11 @@ def test_replace(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -60,7 +59,7 @@ def test_replace(self): ser = pd.Series([np.nan, 0, np.inf]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) - ser = pd.Series([np.nan, 0, 'foo', 'bar', np.inf, None, pd.NaT]) + ser = pd.Series([np.nan, 0, "foo", "bar", np.inf, None, pd.NaT]) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) filled = ser.copy() filled[4] = 0 @@ -76,7 +75,7 @@ def test_replace(self): # make sure that we aren't just masking a TypeError because bools don't # implement indexing - with pytest.raises(TypeError, match='Cannot compare types .+'): + with 
pytest.raises(TypeError, match="Cannot compare types .+"): ser.replace([1, 2], [np.nan, 0]) ser = pd.Series([0, 1, 2, 3, 4]) @@ -96,19 +95,17 @@ def test_replace_gh5319(self): result = ser.replace(np.nan) tm.assert_series_equal(result, expected) # GH 5797 - ser = pd.Series(pd.date_range('20130101', periods=5)) + ser = pd.Series(pd.date_range("20130101", periods=5)) expected = ser.copy() - expected.loc[2] = pd.Timestamp('20120101') - result = ser.replace({pd.Timestamp('20130103'): - pd.Timestamp('20120101')}) + expected.loc[2] = pd.Timestamp("20120101") + result = ser.replace({pd.Timestamp("20130103"): pd.Timestamp("20120101")}) tm.assert_series_equal(result, expected) - result = ser.replace(pd.Timestamp('20130103'), - pd.Timestamp('20120101')) + result = ser.replace(pd.Timestamp("20130103"), pd.Timestamp("20120101")) tm.assert_series_equal(result, expected) # GH 11792: Test with replacing NaT in a list with tz data - ts = pd.Timestamp('2015/01/01', tz='UTC') - s = pd.Series([pd.NaT, pd.Timestamp('2015/01/01', tz='UTC')]) + ts = pd.Timestamp("2015/01/01", tz="UTC") + s = pd.Series([pd.NaT, pd.Timestamp("2015/01/01", tz="UTC")]) result = s.replace([np.nan, pd.NaT], pd.Timestamp.min) expected = pd.Series([pd.Timestamp.min, ts], dtype=object) tm.assert_series_equal(expected, result) @@ -124,10 +121,12 @@ def test_replace_with_single_list(self): # make sure things don't get corrupted when fillna call fails s = ser.copy() - msg = (r"Invalid fill method\. Expecting pad \(ffill\) or backfill" - r" \(bfill\)\. Got crash_cymbal") + msg = ( + r"Invalid fill method\. Expecting pad \(ffill\) or backfill" + r" \(bfill\)\. Got crash_cymbal" + ) with pytest.raises(ValueError, match=msg): - s.replace([1, 2, 3], inplace=True, method='crash_cymbal') + s.replace([1, 2, 3], inplace=True, method="crash_cymbal") tm.assert_series_equal(s, ser) def test_replace_with_empty_list(self): @@ -141,10 +140,10 @@ def test_replace_with_empty_list(self): with pytest.raises(ValueError, match="cannot assign mismatch"): s.replace({np.nan: []}) with pytest.raises(ValueError, match="cannot assign mismatch"): - s.replace({np.nan: ['dummy', 'alt']}) + s.replace({np.nan: ["dummy", "alt"]}) def test_replace_mixed_types(self): - s = pd.Series(np.arange(5), dtype='int64') + s = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): sc = s.copy() @@ -154,7 +153,7 @@ def check_replace(to_rep, val, expected): tm.assert_series_equal(expected, sc) # MUST upcast to float - e = pd.Series([0., 1., 2., 3., 4.]) + e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) tr, v = [3], [3.0] check_replace(tr, v, e) @@ -164,38 +163,40 @@ def check_replace(to_rep, val, expected): check_replace(tr, v, e) # casts to object - e = pd.Series([0, 1, 2, 3.5, 'a']) - tr, v = [3, 4], [3.5, 'a'] + e = pd.Series([0, 1, 2, 3.5, "a"]) + tr, v = [3, 4], [3.5, "a"] check_replace(tr, v, e) # again casts to object - e = pd.Series([0, 1, 2, 3.5, pd.Timestamp('20130101')]) - tr, v = [3, 4], [3.5, pd.Timestamp('20130101')] + e = pd.Series([0, 1, 2, 3.5, pd.Timestamp("20130101")]) + tr, v = [3, 4], [3.5, pd.Timestamp("20130101")] check_replace(tr, v, e) # casts to object - e = pd.Series([0, 1, 2, 3.5, True], dtype='object') + e = pd.Series([0, 1, 2, 3.5, True], dtype="object") tr, v = [3, 4], [3.5, True] check_replace(tr, v, e) # test an object with dates + floats + integers + strings - dr = pd.date_range('1/1/2001', '1/10/2001', - freq='D').to_series().reset_index(drop=True) - result = dr.astype(object).replace( - [dr[0], dr[1], dr[2]], [1.0, 2, 'a']) - 
expected = pd.Series([1.0, 2, 'a'] + dr[3:].tolist(), dtype=object) + dr = ( + pd.date_range("1/1/2001", "1/10/2001", freq="D") + .to_series() + .reset_index(drop=True) + ) + result = dr.astype(object).replace([dr[0], dr[1], dr[2]], [1.0, 2, "a"]) + expected = pd.Series([1.0, 2, "a"] + dr[3:].tolist(), dtype=object) tm.assert_series_equal(result, expected) def test_replace_bool_with_string_no_op(self): s = pd.Series([True, False, True]) - result = s.replace('fun', 'in-the-sun') + result = s.replace("fun", "in-the-sun") tm.assert_series_equal(s, result) def test_replace_bool_with_string(self): # nonexistent elements s = pd.Series([True, False, True]) - result = s.replace(True, '2u') - expected = pd.Series(['2u', False, '2u']) + result = s.replace(True, "2u") + expected = pd.Series(["2u", False, "2u"]) tm.assert_series_equal(expected, result) def test_replace_bool_with_bool(self): @@ -206,19 +207,18 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) - with pytest.raises(TypeError, match='Cannot compare types .+'): - s.replace({'asdf': 'asdb', True: 'yes'}) + with pytest.raises(TypeError, match="Cannot compare types .+"): + s.replace({"asdf": "asdb", True: "yes"}) def test_replace2(self): N = 100 - ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), - dtype=object) + ser = pd.Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), dtype=object) ser[:5] = np.nan - ser[6:10] = 'foo' - ser[20:30] = 'bar' + ser[6:10] = "foo" + ser[20:30] = "bar" # replace list with a single value - rs = ser.replace([np.nan, 'foo', 'bar'], -1) + rs = ser.replace([np.nan, "foo", "bar"], -1) assert (rs[:5] == -1).all() assert (rs[6:10] == -1).all() @@ -226,7 +226,7 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values - rs = ser.replace({np.nan: -1, 'foo': -2, 'bar': -3}) + rs = ser.replace({np.nan: -1, "foo": -2, "bar": -3}) assert (rs[:5] == -1).all() assert (rs[6:10] == -2).all() @@ -234,61 +234,64 @@ def test_replace2(self): assert (pd.isna(ser[:5])).all() # replace with different values with 2 lists - rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + rs2 = ser.replace([np.nan, "foo", "bar"], [-1, -2, -3]) tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + ser.replace([np.nan, "foo", "bar"], -1, inplace=True) assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() def test_replace_with_empty_dictlike(self): # GH 15289 - s = pd.Series(list('abcd')) + s = pd.Series(list("abcd")) tm.assert_series_equal(s, s.replace(dict())) tm.assert_series_equal(s, s.replace(pd.Series([]))) def test_replace_string_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) - result = s.replace('2', np.nan) + result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def test_replace_replacer_equals_replacement(self): # GH 20656 # make sure all replacers are matching against original values - s = pd.Series(['a', 'b']) - expected = pd.Series(['b', 'a']) - result = s.replace({'a': 'b', 'b': 'a'}) + s = pd.Series(["a", "b"]) + expected = pd.Series(["b", "a"]) + result = s.replace({"a": "b", "b": "a"}) tm.assert_series_equal(expected, result) def test_replace_unicode_with_number(self): # GH 15743 s = pd.Series([1, 2, 3]) - result = s.replace('2', np.nan) + result = s.replace("2", np.nan) expected = pd.Series([1, 2, 3]) tm.assert_series_equal(expected, result) def 
test_replace_mixed_types_with_string(self): # Testing mixed - s = pd.Series([1, 2, 3, '4', 4, 5]) - result = s.replace([2, '4'], np.nan) + s = pd.Series([1, 2, 3, "4", 4, 5]) + result = s.replace([2, "4"], np.nan) expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.parametrize("categorical, numeric", [ - (pd.Categorical('A', categories=['A', 'B']), [1]), - (pd.Categorical(('A', ), categories=['A', 'B']), [1]), - (pd.Categorical(('A', 'B'), categories=['A', 'B']), [1, 2]), - ]) + @pytest.mark.parametrize( + "categorical, numeric", + [ + (pd.Categorical("A", categories=["A", "B"]), [1]), + (pd.Categorical(("A",), categories=["A", "B"]), [1]), + (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + ], + ) def test_replace_categorical(self, categorical, numeric): # GH 24971 # Do not check if dtypes are equal due to a known issue that # Categorical.replace sometimes coerces to object (GH 23305) s = pd.Series(categorical) - result = s.replace({'A': 1, 'B': 2}) + result = s.replace({"A": 1, "B": 2}) expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) @@ -296,12 +299,11 @@ def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError s = pd.Series([0, 1, 2, 3, 4]) - result = s.replace([3], ['100000000000000000000']) - expected = pd.Series([0, 1, 2, '100000000000000000000', 4]) + result = s.replace([3], ["100000000000000000000"]) + expected = pd.Series([0, 1, 2, "100000000000000000000", 4]) tm.assert_series_equal(result, expected) - s = pd.Series([0, '100000000000000000000', - '100000000000000000001']) - result = s.replace(['100000000000000000000'], [1]) - expected = pd.Series([0, 1, '100000000000000000001']) + s = pd.Series([0, "100000000000000000000", "100000000000000000001"]) + result = s.replace(["100000000000000000000"], [1]) + expected = pd.Series([0, 1, "100000000000000000001"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 92b6fb0610979..3e8f653c47424 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -5,8 +5,15 @@ import pandas as pd from pandas import ( - Categorical, DataFrame, Index, Series, date_range, option_context, - period_range, timedelta_range) + Categorical, + DataFrame, + Index, + Series, + date_range, + option_context, + period_range, + timedelta_range, +) from pandas.core.base import StringMixin from pandas.core.index import MultiIndex import pandas.util.testing as tm @@ -15,20 +22,27 @@ class TestSeriesRepr(TestData): - def test_multilevel_name_print(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - s = Series(range(len(index)), index=index, name='sth') - expected = ["first second", "foo one 0", - " two 1", " three 2", - "bar one 3", " two 4", - "baz two 5", " three 6", - "qux one 7", " two 8", - " three 9", "Name: sth, dtype: int64"] + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + s = Series(range(len(index)), index=index, name="sth") + expected = [ + "first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: 
sth, dtype: int64", + ] expected = "\n".join(expected) assert repr(s) == expected @@ -51,7 +65,7 @@ def test_name_printing(self): s.name = None assert "Name:" not in repr(s) - s = Series(index=date_range('20010101', '20020101'), name='test') + s = Series(index=date_range("20010101", "20020101"), name="test") assert "Name: test" in repr(s) def test_repr(self): @@ -71,21 +85,30 @@ def test_repr(self): str(self.series) # with Nones - ots = self.ts.astype('O') + ots = self.ts.astype("O") ots[::2] = None repr(ots) # various names - for name in ['', 1, 1.2, 'foo', '\u03B1\u03B2\u03B3', - 'loooooooooooooooooooooooooooooooooooooooooooooooooooong', - ('foo', 'bar', 'baz'), (1, 2), ('foo', 1, 2.3), - ('\u03B1', '\u03B2', '\u03B3'), - ('\u03B1', 'bar')]: + for name in [ + "", + 1, + 1.2, + "foo", + "\u03B1\u03B2\u03B3", + "loooooooooooooooooooooooooooooooooooooooooooooooooooong", + ("foo", "bar", "baz"), + (1, 2), + ("foo", 1, 2.3), + ("\u03B1", "\u03B2", "\u03B3"), + ("\u03B1", "bar"), + ]: self.series.name = name repr(self.series) - biggie = Series(tm.randn(1000), index=np.arange(1000), - name=('foo', 'bar', 'baz')) + biggie = Series( + tm.randn(1000), index=np.arange(1000), name=("foo", "bar", "baz") + ) repr(biggie) # 0 as name @@ -104,15 +127,15 @@ def test_repr(self): assert "a\n" not in repr(ser) # with empty series (#4651) - s = Series([], dtype=np.int64, name='foo') - assert repr(s) == 'Series([], Name: foo, dtype: int64)' + s = Series([], dtype=np.int64, name="foo") + assert repr(s) == "Series([], Name: foo, dtype: int64)" s = Series([], dtype=np.int64, name=None) - assert repr(s) == 'Series([], dtype: int64)' + assert repr(s) == "Series([], dtype: int64)" def test_tidy_repr(self): a = Series(["\u05d0"] * 1000) - a.name = 'title1' + a.name = "title1" repr(a) # should not raise exception def test_repr_bool_fails(self, capsys): @@ -122,7 +145,7 @@ def test_repr_bool_fails(self, capsys): repr(s) captured = capsys.readouterr() - assert captured.err == '' + assert captured.err == "" def test_repr_name_iterable_indexable(self): s = Series([1, 2, 3], name=np.int64(3)) @@ -130,7 +153,7 @@ def test_repr_name_iterable_indexable(self): # it works! 
repr(s) - s.name = ("\u05d0", ) * 2 + s.name = ("\u05d0",) * 2 repr(s) def test_repr_should_return_str(self): @@ -146,7 +169,7 @@ def test_repr_should_return_str(self): def test_repr_max_rows(self): # GH 6863 - with pd.option_context('max_rows', None): + with pd.option_context("max_rows", None): str(Series(range(1001))) # should not raise exception def test_unicode_string_with_unicode(self): @@ -161,13 +184,14 @@ def test_str_to_bytes_raises(self): bytes(df) def test_timeseries_repr_object_dtype(self): - index = Index([datetime(2000, 1, 1) + timedelta(i) - for i in range(1000)], dtype=object) + index = Index( + [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object + ) ts = Series(np.random.randn(len(index)), index) repr(ts) ts = tm.makeTimeSeries(1000) - assert repr(ts).splitlines()[-1].startswith('Freq:') + assert repr(ts).splitlines()[-1].startswith("Freq:") ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] repr(ts2).splitlines()[-1] @@ -183,9 +207,8 @@ def test_latex_repr(self): \bottomrule \end{tabular} """ - with option_context('display.latex.escape', False, - 'display.latex.repr', True): - s = Series([r'$\alpha$', 'b', 'c']) + with option_context("display.latex.escape", False, "display.latex.repr", True): + s = Series([r"$\alpha$", "b", "c"]) assert result == s._repr_latex_() assert s._repr_latex_() is None @@ -200,16 +223,15 @@ def test_index_repr_in_frame_with_nan(self): class TestCategoricalRepr: - def test_categorical_repr_unicode(self): # see gh-21002 class County(StringMixin): - name = 'San Sebastián' - state = 'PR' + name = "San Sebastián" + state = "PR" def __str__(self): - return self.name + ', ' + self.state + return self.name + ", " + self.state cat = pd.Categorical([County() for _ in range(61)]) idx = pd.Index(cat) @@ -220,21 +242,29 @@ def __str__(self): def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) - exp = ("0 1\n1 2\n2 3\n3 4\n" + - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]") + exp = ( + "0 1\n1 2\n2 3\n3 4\n" + + "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + ) assert exp == a.__str__() a = Series(Categorical(["a", "b"] * 25)) - exp = ("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" + - "Length: 50, dtype: category\nCategories (2, object): [a, b]") + exp = ( + "0 a\n1 b\n" + + " ..\n" + + "48 a\n49 b\n" + + "Length: 50, dtype: category\nCategories (2, object): [a, b]" + ) with option_context("display.max_rows", 5): assert exp == repr(a) levs = list("abcdefghijklmnopqrstuvwxyz") a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) - exp = ("0 a\n1 b\n" + "dtype: category\n" - "Categories (26, object): [a < b < c < d ... w < x < y < z]") + exp = ( + "0 a\n1 b\n" + "dtype: category\n" + "Categories (26, object): [a < b < c < d ... 
w < x < y < z]" + ) assert exp == a.__str__() def test_categorical_series_repr(self): @@ -290,7 +320,7 @@ def test_categorical_series_repr_ordered(self): assert repr(s) == exp def test_categorical_series_repr_datetime(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -303,8 +333,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -319,7 +348,7 @@ def test_categorical_series_repr_datetime(self): assert repr(s) == exp def test_categorical_series_repr_datetime_ordered(self): - idx = date_range('2011-01-01 09:00', freq='H', periods=5) + idx = date_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00 1 2011-01-01 10:00:00 @@ -332,8 +361,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp - idx = date_range('2011-01-01 09:00', freq='H', periods=5, - tz='US/Eastern') + idx = date_range("2011-01-01 09:00", freq="H", periods=5, tz="US/Eastern") s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00:00-05:00 1 2011-01-01 10:00:00-05:00 @@ -348,7 +376,7 @@ def test_categorical_series_repr_datetime_ordered(self): assert repr(s) == exp def test_categorical_series_repr_period(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -361,7 +389,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) s = Series(Categorical(idx)) exp = """0 2011-01 1 2011-02 @@ -374,7 +402,7 @@ def test_categorical_series_repr_period(self): assert repr(s) == exp def test_categorical_series_repr_period_ordered(self): - idx = period_range('2011-01-01 09:00', freq='H', periods=5) + idx = period_range("2011-01-01 09:00", freq="H", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01-01 09:00 1 2011-01-01 10:00 @@ -387,7 +415,7 @@ def test_categorical_series_repr_period_ordered(self): assert repr(s) == exp - idx = period_range('2011-01', freq='M', periods=5) + idx = period_range("2011-01", freq="M", periods=5) s = Series(Categorical(idx, ordered=True)) exp = """0 2011-01 1 2011-02 @@ -400,7 +428,7 @@ def test_categorical_series_repr_period_ordered(self): assert repr(s) == exp def test_categorical_series_repr_timedelta(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) s = Series(Categorical(idx)) exp = """0 1 days 1 2 days @@ -412,7 +440,7 @@ def test_categorical_series_repr_timedelta(self): assert repr(s) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) s = Series(Categorical(idx)) exp = """0 0 days 01:00:00 1 1 days 01:00:00 @@ -432,7 +460,7 @@ def test_categorical_series_repr_timedelta(self): assert repr(s) == exp def test_categorical_series_repr_timedelta_ordered(self): - idx = timedelta_range('1 days', periods=5) + idx = timedelta_range("1 days", periods=5) s = Series(Categorical(idx, 
ordered=True)) exp = """0 1 days 1 2 days @@ -444,7 +472,7 @@ def test_categorical_series_repr_timedelta_ordered(self): assert repr(s) == exp - idx = timedelta_range('1 hours', periods=10) + idx = timedelta_range("1 hours", periods=10) s = Series(Categorical(idx, ordered=True)) exp = """0 0 days 01:00:00 1 1 days 01:00:00 diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 0d95a1014f4a8..0ae2194543b44 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -11,12 +11,11 @@ class TestSeriesSorting(TestData): - def test_sort_values(self): # check indexes are reordered corresponding with the values - ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D']) - expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C']) + ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) + expected = Series([1, 2, 3, 4], ["D", "B", "A", "C"]) result = ser.sort_values() tm.assert_series_equal(expected, result) @@ -29,12 +28,12 @@ def test_sort_values(self): tm.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) # na_position - result = ts.sort_values(na_position='first') + result = ts.sort_values(na_position="first") assert np.isnan(result[:5]).all() tm.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) # something object-type - ser = Series(['A', 'B'], [1, 2]) + ser = Series(["A", "B"], [1, 2]) # no failure ser.sort_values() @@ -42,15 +41,15 @@ def test_sort_values(self): ordered = ts.sort_values(ascending=False) expected = np.sort(ts.dropna().values)[::-1] assert_almost_equal(expected, ordered.dropna().values) - ordered = ts.sort_values(ascending=False, na_position='first') + ordered = ts.sort_values(ascending=False, na_position="first") assert_almost_equal(expected, ordered.dropna().values) # ascending=[False] should behave the same as ascending=False ordered = ts.sort_values(ascending=[False]) expected = ts.sort_values(ascending=False) assert_series_equal(expected, ordered) - ordered = ts.sort_values(ascending=[False], na_position='first') - expected = ts.sort_values(ascending=False, na_position='first') + ordered = ts.sort_values(ascending=[False], na_position="first") + expected = ts.sort_values(ascending=False, na_position="first") assert_series_equal(expected, ordered) msg = "ascending must be boolean" @@ -67,22 +66,23 @@ def test_sort_values(self): ts.sort_values(ascending=[False, False]) msg = "ascending must be boolean" with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending='foobar') + ts.sort_values(ascending="foobar") # inplace=True ts = self.ts.copy() ts.sort_values(ascending=False, inplace=True) tm.assert_series_equal(ts, self.ts.sort_values(ascending=False)) - tm.assert_index_equal(ts.index, - self.ts.sort_values(ascending=False).index) + tm.assert_index_equal(ts.index, self.ts.sort_values(ascending=False).index) # GH 5856/5853 # Series.sort_values operating on a view df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0] - msg = ("This Series is a view of some other array, to sort in-place" - " you must create a copy") + msg = ( + "This Series is a view of some other array, to sort in-place" + " you must create a copy" + ) with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) @@ -96,8 +96,7 @@ def test_sort_index(self): # descending sorted_series = random_order.sort_index(ascending=False) - assert_series_equal(sorted_series, - self.ts.reindex(self.ts.index[::-1])) + assert_series_equal(sorted_series, self.ts.reindex(self.ts.index[::-1])) # compat on level sorted_series = 
random_order.sort_index(level=0) @@ -107,8 +106,7 @@ def test_sort_index(self): sorted_series = random_order.sort_index(axis=0) assert_series_equal(sorted_series, self.ts) - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) @@ -129,8 +127,7 @@ def test_sort_index_inplace(self): result = random_order.sort_index(ascending=False, inplace=True) assert result is None - tm.assert_series_equal(random_order, self.ts.reindex( - self.ts.index[::-1])) + tm.assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) # ascending random_order = self.ts.reindex(rindex) @@ -139,10 +136,10 @@ def test_sort_index_inplace(self): assert result is None tm.assert_series_equal(random_order, self.ts) - @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 def test_sort_index_multiindex(self, level): - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] @@ -160,39 +157,39 @@ def test_sort_index_kind(self): series = Series(index=[3, 2, 1, 4, 3]) expected_series = Series(index=[1, 2, 3, 3, 4]) - index_sorted_series = series.sort_index(kind='mergesort') + index_sorted_series = series.sort_index(kind="mergesort") assert_series_equal(expected_series, index_sorted_series) - index_sorted_series = series.sort_index(kind='quicksort') + index_sorted_series = series.sort_index(kind="quicksort") assert_series_equal(expected_series, index_sorted_series) - index_sorted_series = series.sort_index(kind='heapsort') + index_sorted_series = series.sort_index(kind="heapsort") assert_series_equal(expected_series, index_sorted_series) def test_sort_index_na_position(self): series = Series(index=[3, 2, 1, 4, 3, np.nan]) expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4]) - index_sorted_series = series.sort_index(na_position='first') + index_sorted_series = series.sort_index(na_position="first") assert_series_equal(expected_series_first, index_sorted_series) expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan]) - index_sorted_series = series.sort_index(na_position='last') + index_sorted_series = series.sort_index(na_position="last") assert_series_equal(expected_series_last, index_sorted_series) def test_sort_index_intervals(self): - s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( - [0, 1, 2, 3], - [1, 2, 3, 4])) + s = Series( + [np.nan, 1, 2, 3], IntervalIndex.from_arrays([0, 1, 2, 3], [1, 2, 3, 4]) + ) result = s.sort_index() expected = s assert_series_equal(result, expected) result = s.sort_index(ascending=False) - expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays( - [3, 2, 1, 0], - [4, 3, 2, 1])) + expected = Series( + [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) + ) assert_series_equal(result, expected) def test_sort_values_categorical(self): @@ -202,8 +199,8 @@ def test_sort_values_categorical(self): # sort in the categories order expected = Series( - Categorical(["a", "a", "b", "b"], - ordered=False), index=[0, 3, 1, 2]) + Categorical(["a", "a", "b", "b"], ordered=False), index=[0, 3, 1, 2] + ) result = cat.sort_values() tm.assert_series_equal(result, expected) @@ -212,8 +209,11 @@ def test_sort_values_categorical(self): exp = np.array(["a", "b", "c", "d"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) - cat = 
Series(Categorical(["a", "c", "b", "d"], categories=[ - "a", "b", "c", "d"], ordered=True)) + cat = Series( + Categorical( + ["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True + ) + ) res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) @@ -222,15 +222,16 @@ def test_sort_values_categorical(self): exp = np.array(["d", "c", "b", "a"], dtype=np.object_) tm.assert_numpy_array_equal(res.__array__(), exp) - raw_cat1 = Categorical(["a", "b", "c", "d"], - categories=["a", "b", "c", "d"], ordered=False) - raw_cat2 = Categorical(["a", "b", "c", "d"], - categories=["d", "c", "b", "a"], ordered=True) + raw_cat1 = Categorical( + ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False + ) + raw_cat2 = Categorical( + ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True + ) s = ["a", "b", "c", "d"] - df = DataFrame({"unsort": raw_cat1, - "sort": raw_cat2, - "string": s, - "values": [1, 2, 3, 4]}) + df = DataFrame( + {"unsort": raw_cat1, "sort": raw_cat2, "string": s, "values": [1, 2, 3, 4]} + ) # Cats must be sorted in a dataframe res = df.sort_values(by=["string"], ascending=False) @@ -249,17 +250,18 @@ def test_sort_values_categorical(self): # multi-columns sort # GH 7848 - df = DataFrame({"id": [6, 5, 4, 3, 2, 1], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) + df = DataFrame( + {"id": [6, 5, 4, 3, 2, 1], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) df["grade"] = Categorical(df["raw_grade"], ordered=True) - df['grade'] = df['grade'].cat.set_categories(['b', 'e', 'a']) + df["grade"] = df["grade"].cat.set_categories(["b", "e", "a"]) # sorts 'grade' according to the order of the categories - result = df.sort_values(by=['grade']) + result = df.sort_values(by=["grade"]) expected = df.iloc[[1, 2, 5, 0, 3, 4]] tm.assert_frame_equal(result, expected) # multi - result = df.sort_values(by=['grade', 'id']) + result = df.sort_values(by=["grade", "id"]) expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index b47d339f5a5f2..450fdc3f4dd6f 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -7,45 +7,41 @@ class TestSeriesSubclassing: - def test_indexing_sliced(self): - s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd')) - res = s.loc[['a', 'b']] - exp = tm.SubclassedSeries([1, 2], index=list('ab')) + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd")) + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) tm.assert_series_equal(res, exp) res = s.iloc[[2, 3]] - exp = tm.SubclassedSeries([3, 4], index=list('cd')) + exp = tm.SubclassedSeries([3, 4], index=list("cd")) tm.assert_series_equal(res, exp) - res = s.loc[['a', 'b']] - exp = tm.SubclassedSeries([1, 2], index=list('ab')) + res = s.loc[["a", "b"]] + exp = tm.SubclassedSeries([1, 2], index=list("ab")) tm.assert_series_equal(res, exp) def test_to_frame(self): - s = tm.SubclassedSeries([1, 2, 3, 4], index=list('abcd'), name='xxx') + s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd"), name="xxx") res = s.to_frame() - exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd')) + exp = tm.SubclassedDataFrame({"xxx": [1, 2, 3, 4]}, index=list("abcd")) tm.assert_frame_equal(res, exp) def test_subclass_unstack(self): # GH 15564 - s = tm.SubclassedSeries( - [1, 2, 3, 4], index=[list('aabb'), list('xyxy')]) + s = tm.SubclassedSeries([1, 
2, 3, 4], index=[list("aabb"), list("xyxy")]) res = s.unstack() - exp = tm.SubclassedDataFrame( - {'x': [1, 3], 'y': [2, 4]}, index=['a', 'b']) + exp = tm.SubclassedDataFrame({"x": [1, 3], "y": [2, 4]}, index=["a", "b"]) tm.assert_frame_equal(res, exp) def test_subclass_empty_repr(self): - assert 'SubclassedSeries' in repr(tm.SubclassedSeries()) + assert "SubclassedSeries" in repr(tm.SubclassedSeries()) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesSubclassing: - def test_subclass_sparse_slice(self): # int64 s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) @@ -62,16 +58,16 @@ def test_subclass_sparse_slice(self): assert s[1:3].dtype == SparseDtype(np.int64) # float64 - s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.]) - exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3]) + s = tm.SubclassedSparseSeries([1.0, 2.0, 3.0, 4.0, 5.0]) + exp = tm.SubclassedSparseSeries([2.0, 3.0, 4.0], index=[1, 2, 3]) tm.assert_sp_series_equal(s.loc[1:3], exp) assert s.loc[1:3].dtype == SparseDtype(np.float64) - exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) tm.assert_sp_series_equal(s.iloc[1:3], exp) assert s.iloc[1:3].dtype == SparseDtype(np.float64) - exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + exp = tm.SubclassedSparseSeries([2.0, 3.0], index=[1, 2]) tm.assert_sp_series_equal(s[1:3], exp) assert s[1:3].dtype == SparseDtype(np.float64) @@ -83,29 +79,26 @@ def test_subclass_sparse_addition(self): s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0]) s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0]) - exp = tm.SubclassedSparseSeries([5., 7., 9.]) + exp = tm.SubclassedSparseSeries([5.0, 7.0, 9.0]) tm.assert_sp_series_equal(s1 + s2, exp) def test_subclass_sparse_to_frame(self): - s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx') + s = tm.SubclassedSparseSeries([1, 2], index=list("ab"), name="xxx") res = s.to_frame() - exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block', - fill_value=0) - exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, - index=list('ab'), - default_fill_value=0) + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind="block", fill_value=0) + exp = tm.SubclassedSparseDataFrame( + {"xxx": exp_arr}, index=list("ab"), default_fill_value=0 + ) tm.assert_sp_frame_equal(res, exp) # create from int dict - res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, - index=list('ab'), - default_fill_value=0) + res = tm.SubclassedSparseDataFrame( + {"xxx": [1, 2]}, index=list("ab"), default_fill_value=0 + ) tm.assert_sp_frame_equal(res, exp) - s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'), - name='xxx') + s = tm.SubclassedSparseSeries([1.1, 2.1], index=list("ab"), name="xxx") res = s.to_frame() - exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]}, - index=list('ab')) + exp = tm.SubclassedSparseDataFrame({"xxx": [1.1, 2.1]}, index=list("ab")) tm.assert_sp_frame_equal(res, exp) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 97f1cd1cc7789..6be1b9a9143bf 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -12,39 +12,49 @@ import pandas as pd from pandas import ( - DataFrame, Index, NaT, Series, Timestamp, concat, date_range, offsets, - timedelta_range, to_datetime) + DataFrame, + Index, + NaT, + Series, + Timestamp, + concat, + date_range, + offsets, + timedelta_range, + to_datetime, +) from pandas.core.indexes.datetimes import DatetimeIndex from 
pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tests.series.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.tseries.offsets import BDay, BMonthEnd -def _simple_ts(start, end, freq='D'): +def _simple_ts(start, end, freq="D"): rng = date_range(start, end, freq=freq) return Series(np.random.randn(len(rng)), index=rng) def assert_range_equal(left, right): - assert (left.equals(right)) - assert (left.freq == right.freq) - assert (left.tz == right.tz) + assert left.equals(right) + assert left.freq == right.freq + assert left.tz == right.tz class TestTimeSeries(TestData): - def test_shift(self): shifted = self.ts.shift(1) unshifted = shifted.shift(-1) tm.assert_index_equal(shifted.index, self.ts.index) tm.assert_index_equal(unshifted.index, self.ts.index) - tm.assert_numpy_array_equal(unshifted.dropna().values, - self.ts.values[:-1]) + tm.assert_numpy_array_equal(unshifted.dropna().values, self.ts.values[:-1]) offset = BDay() shifted = self.ts.shift(1, freq=offset) @@ -55,8 +65,8 @@ def test_shift(self): unshifted = self.ts.shift(0, freq=offset) assert_series_equal(unshifted, self.ts) - shifted = self.ts.shift(1, freq='B') - unshifted = shifted.shift(-1, freq='B') + shifted = self.ts.shift(1, freq="B") + unshifted = shifted.shift(-1, freq="B") assert_series_equal(unshifted, self.ts) @@ -72,17 +82,17 @@ def test_shift(self): tm.assert_index_equal(unshifted.index, ps.index) tm.assert_numpy_array_equal(unshifted.dropna().values, ps.values[:-1]) - shifted2 = ps.shift(1, 'B') + shifted2 = ps.shift(1, "B") shifted3 = ps.shift(1, BDay()) assert_series_equal(shifted2, shifted3) - assert_series_equal(ps, shifted2.shift(-1, 'B')) + assert_series_equal(ps, shifted2.shift(-1, "B")) msg = "Given freq D does not match PeriodIndex freq B" with pytest.raises(ValueError, match=msg): - ps.shift(freq='D') + ps.shift(freq="D") # legacy support - shifted4 = ps.shift(1, freq='B') + shifted4 = ps.shift(1, freq="B") assert_series_equal(shifted2, shifted4) shifted5 = ps.shift(1, freq=BDay()) @@ -90,8 +100,8 @@ def test_shift(self): # 32-bit taking # GH 8129 - index = date_range('2000-01-01', periods=5) - for dtype in ['int32', 'int64']: + index = date_range("2000-01-01", periods=5) + for dtype in ["int32", "int64"]: s1 = Series(np.arange(5, dtype=dtype), index=index) p = s1.iloc[1] result = s1.shift(periods=p) @@ -100,52 +110,57 @@ def test_shift(self): # xref 8260 # with tz - s = Series(date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) result = s - s.shift() - exp = Series(TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo') + exp = Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") assert_series_equal(result, exp) # incompat tz - s2 = Series(date_range('2000-01-01 09:00:00', periods=5, - tz='CET'), name='foo') - msg = ("DatetimeArray subtraction must have the same timezones or no" - " timezones") + s2 = Series(date_range("2000-01-01 09:00:00", periods=5, tz="CET"), name="foo") + msg = ( + "DatetimeArray subtraction must have the same timezones or no" " timezones" + ) with pytest.raises(TypeError, match=msg): s - s2 def test_shift2(self): - ts = Series(np.random.randn(5), - index=date_range('1/1/2000', periods=5, freq='H')) + ts = Series( + np.random.randn(5), 
index=date_range("1/1/2000", periods=5, freq="H") + ) - result = ts.shift(1, freq='5T') - exp_index = ts.index.shift(1, freq='5T') + result = ts.shift(1, freq="5T") + exp_index = ts.index.shift(1, freq="5T") tm.assert_index_equal(result.index, exp_index) # GH #1063, multiple of same base - result = ts.shift(1, freq='4H') + result = ts.shift(1, freq="4H") exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) - idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + idx = DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"]) msg = "Cannot shift with no freq" with pytest.raises(NullFrequencyError, match=msg): idx.shift(1) def test_shift_fill_value(self): # GH #24128 - ts = Series([1.0, 2.0, 3.0, 4.0, 5.0], - index=date_range('1/1/2000', periods=5, freq='H')) + ts = Series( + [1.0, 2.0, 3.0, 4.0, 5.0], index=date_range("1/1/2000", periods=5, freq="H") + ) - exp = Series([0.0, 1.0, 2.0, 3.0, 4.0], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], index=date_range("1/1/2000", periods=5, freq="H") + ) # check that fill value works result = ts.shift(1, fill_value=0.0) tm.assert_series_equal(result, exp) - exp = Series([0.0, 0.0, 1.0, 2.0, 3.0], - index=date_range('1/1/2000', periods=5, freq='H')) + exp = Series( + [0.0, 0.0, 1.0, 2.0, 3.0], index=date_range("1/1/2000", periods=5, freq="H") + ) result = ts.shift(2, fill_value=0.0) tm.assert_series_equal(result, exp) @@ -154,44 +169,46 @@ def test_shift_fill_value(self): assert res.dtype == ts.dtype def test_categorical_shift_fill_value(self): - ts = pd.Series(['a', 'b', 'c', 'd'], dtype="category") - res = ts.shift(1, fill_value='a') - expected = pd.Series(pd.Categorical(['a', 'a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - ordered=False)) + ts = pd.Series(["a", "b", "c", "d"], dtype="category") + res = ts.shift(1, fill_value="a") + expected = pd.Series( + pd.Categorical( + ["a", "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False + ) + ) tm.assert_equal(res, expected) # check for incorrect fill_value msg = "'fill_value=f' is not present in this Categorical's categories" with pytest.raises(ValueError, match=msg): - ts.shift(1, fill_value='f') + ts.shift(1, fill_value="f") def test_shift_dst(self): # GH 13926 - dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern') + dates = date_range("2016-11-06", freq="H", periods=10, tz="US/Eastern") s = Series(dates) res = s.shift(0) tm.assert_series_equal(res, s) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" res = s.shift(1) exp_vals = [NaT] + dates.astype(object).values.tolist()[:9] exp = Series(exp_vals) tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" res = s.shift(-2) exp_vals = dates.astype(object).values.tolist()[2:] + [NaT, NaT] exp = Series(exp_vals) tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" for ex in [10, -10, 20, -20]: res = s.shift(ex) - exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]') + exp = Series([NaT] * 10, dtype="datetime64[ns, US/Eastern]") tm.assert_series_equal(res, exp) - assert res.dtype == 'datetime64[ns, US/Eastern]' + assert res.dtype == "datetime64[ns, US/Eastern]" def test_tshift(self): # PeriodIndex @@ -201,7 +218,7 @@ def test_tshift(self): assert_series_equal(unshifted, ps) - shifted2 = ps.tshift(freq='B') + 
shifted2 = ps.tshift(freq="B") assert_series_equal(shifted, shifted2) shifted3 = ps.tshift(freq=BDay()) @@ -209,7 +226,7 @@ def test_tshift(self): msg = "Given freq M does not match PeriodIndex freq B" with pytest.raises(ValueError, match=msg): - ps.tshift(freq='M') + ps.tshift(freq="M") # DatetimeIndex shifted = self.ts.tshift(1) @@ -220,8 +237,9 @@ def test_tshift(self): shifted2 = self.ts.tshift(freq=self.ts.index.freq) assert_series_equal(shifted, shifted2) - inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index)), - name='ts') + inferred_ts = Series( + self.ts.values, Index(np.asarray(self.ts.index)), name="ts" + ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) assert_series_equal(shifted, self.ts.tshift(1)) @@ -273,66 +291,70 @@ def test_truncate(self): # corner case, empty series returned truncated = ts.truncate(after=self.ts.index[0] - offset) - assert (len(truncated) == 0) + assert len(truncated) == 0 truncated = ts.truncate(before=self.ts.index[-1] + offset) - assert (len(truncated) == 0) + assert len(truncated) == 0 msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" with pytest.raises(ValueError, match=msg): - ts.truncate(before=self.ts.index[-1] + offset, - after=self.ts.index[0] - offset) + ts.truncate( + before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset + ) def test_truncate_nonsortedindex(self): # GH 17935 - s = pd.Series(['a', 'b', 'c', 'd', 'e'], - index=[5, 3, 2, 9, 0]) - msg = 'truncate requires a sorted index' + s = pd.Series(["a", "b", "c", "d", "e"], index=[5, 3, 2, 9, 0]) + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): s.truncate(before=3, after=9) - rng = pd.date_range('2011-01-01', '2012-01-01', freq='W') + rng = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts = pd.Series(np.random.randn(len(rng)), index=rng) - msg = 'truncate requires a sorted index' + msg = "truncate requires a sorted index" with pytest.raises(ValueError, match=msg): - ts.sort_values(ascending=False).truncate(before='2011-11', - after='2011-12') + ts.sort_values(ascending=False).truncate(before="2011-11", after="2011-12") def test_asfreq(self): - ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( - 2009, 11, 30), datetime(2009, 12, 31)]) - - daily_ts = ts.asfreq('B') - monthly_ts = daily_ts.asfreq('BM') + ts = Series( + [0.0, 1.0, 2.0], + index=[ + datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31), + ], + ) + + daily_ts = ts.asfreq("B") + monthly_ts = daily_ts.asfreq("BM") tm.assert_series_equal(monthly_ts, ts) - daily_ts = ts.asfreq('B', method='pad') - monthly_ts = daily_ts.asfreq('BM') + daily_ts = ts.asfreq("B", method="pad") + monthly_ts = daily_ts.asfreq("BM") tm.assert_series_equal(monthly_ts, ts) daily_ts = ts.asfreq(BDay()) monthly_ts = daily_ts.asfreq(BMonthEnd()) tm.assert_series_equal(monthly_ts, ts) - result = ts[:0].asfreq('M') + result = ts[:0].asfreq("M") assert len(result) == 0 assert result is not ts - daily_ts = ts.asfreq('D', fill_value=-1) + daily_ts = ts.asfreq("D", fill_value=-1) result = daily_ts.value_counts().sort_index() - expected = Series([60, 1, 1, 1], - index=[-1.0, 2.0, 1.0, 0.0]).sort_index() + expected = Series([60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0]).sort_index() tm.assert_series_equal(result, expected) def test_asfreq_datetimeindex_empty_series(self): # GH 14320 - expected = Series(index=pd.DatetimeIndex( - ["2016-09-29 11:00"])).asfreq('H') - result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]), - 
data=[3]).asfreq('H') + expected = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"])).asfreq("H") + result = Series(index=pd.DatetimeIndex(["2016-09-29 11:00"]), data=[3]).asfreq( + "H" + ) tm.assert_index_equal(expected.index, result.index) def test_diff(self): @@ -358,7 +380,7 @@ def test_diff(self): assert_series_equal(rs, xp) # datetime diff (GH3100) - s = Series(date_range('20130102', periods=5)) + s = Series(date_range("20130102", periods=5)) rs = s - s.shift(1) xp = s.diff() assert_series_equal(rs, xp) @@ -370,61 +392,58 @@ def test_diff(self): # with tz s = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) result = s.diff() - assert_series_equal(result, Series( - TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo')) + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) rs = self.ts.pct_change(2) - filled = self.ts.fillna(method='pad') + filled = self.ts.fillna(method="pad") assert_series_equal(rs, filled / filled.shift(2) - 1) - rs = self.ts.pct_change(fill_method='bfill', limit=1) - filled = self.ts.fillna(method='bfill', limit=1) + rs = self.ts.pct_change(fill_method="bfill", limit=1) + filled = self.ts.fillna(method="bfill", limit=1) assert_series_equal(rs, filled / filled.shift(1) - 1) - rs = self.ts.pct_change(freq='5D') - filled = self.ts.fillna(method='pad') - assert_series_equal(rs, - (filled / filled.shift(freq='5D') - 1) - .reindex_like(filled)) + rs = self.ts.pct_change(freq="5D") + filled = self.ts.fillna(method="pad") + assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) def test_pct_change_shift_over_nas(self): - s = Series([1., 1.5, np.nan, 2.5, 3.]) + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) chg = s.pct_change() - expected = Series([np.nan, 0.5, 0., 2.5 / 1.5 - 1, .2]) + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) assert_series_equal(chg, expected) - @pytest.mark.parametrize("freq, periods, fill_method, limit", - [('5B', 5, None, None), - ('3B', 3, None, None), - ('3B', 3, 'bfill', None), - ('7B', 7, 'pad', 1), - ('7B', 7, 'bfill', 3), - ('14B', 14, None, None)]) + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): # GH 7292 - rs_freq = self.ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = self.ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = self.ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = self.ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_series_equal(rs_freq, rs_periods) empty_ts = Series(index=self.ts.index) - rs_freq = empty_ts.pct_change(freq=freq, - fill_method=fill_method, - limit=limit) - rs_periods = empty_ts.pct_change(periods, - fill_method=fill_method, - limit=limit) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_series_equal(rs_freq, rs_periods) def test_autocorr(self): @@ -499,7 +518,7 @@ def test_timeseries_coercion(self): assert 
isinstance(ser.index, DatetimeIndex) def test_contiguous_boolean_preserve_freq(self): - rng = date_range('1/1/2000', '3/1/2000', freq='B') + rng = date_range("1/1/2000", "3/1/2000", freq="B") mask = np.zeros(len(rng), dtype=bool) mask[10:20] = True @@ -517,95 +536,112 @@ def test_to_datetime_unit(self): epoch = 1370745748 s = Series([epoch + t for t in range(20)]) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)] + [iNaT]) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) s = Series([epoch + t for t in range(20)] + [iNaT]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) # GH13834 - s = Series([epoch + t for t in np.arange(0, 2, .25)] + - [iNaT]).astype(float) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in np.arange(0, 2, .25)] + [NaT]) + s = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float) + result = to_datetime(s, unit="s") + expected = Series( + [ + Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) + for t in np.arange(0, 2, 0.25) + ] + + [NaT] + ) assert_series_equal(result, expected) - s = concat([Series([epoch + t for t in range(20)] - ).astype(float), Series([np.nan])], - ignore_index=True) - result = to_datetime(s, unit='s') - expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( - seconds=t) for t in range(20)] + [NaT]) + s = concat( + [Series([epoch + t for t in range(20)]).astype(float), Series([np.nan])], + ignore_index=True, + ) + result = to_datetime(s, unit="s") + expected = Series( + [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)] + + [NaT] + ) assert_series_equal(result, expected) - result = to_datetime([1, 2, 'NaT', pd.NaT, np.nan], unit='D') - expected = DatetimeIndex([Timestamp('1970-01-02'), - Timestamp('1970-01-03')] + ['NaT'] * 3) + result = to_datetime([1, 2, "NaT", pd.NaT, np.nan], unit="D") + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3 + ) tm.assert_index_equal(result, expected) msg = "non convertible value foo with the unit 'D'" with pytest.raises(ValueError, match=msg): - to_datetime([1, 2, 'foo'], unit='D') + to_datetime([1, 2, "foo"], unit="D") msg = "cannot convert input 111111111 with the unit 'D'" with 
pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime([1, 2, 111111111], unit='D') + to_datetime([1, 2, 111111111], unit="D") # coerce we can process - expected = DatetimeIndex([Timestamp('1970-01-02'), - Timestamp('1970-01-03')] + ['NaT'] * 1) - result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce') + expected = DatetimeIndex( + [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1 + ) + result = to_datetime([1, 2, "foo"], unit="D", errors="coerce") tm.assert_index_equal(result, expected) - result = to_datetime([1, 2, 111111111], unit='D', errors='coerce') + result = to_datetime([1, 2, 111111111], unit="D", errors="coerce") tm.assert_index_equal(result, expected) def test_series_ctor_datetime64(self): - rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') + rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) series = Series(dates) - assert np.issubdtype(series.dtype, np.dtype('M8[ns]')) + assert np.issubdtype(series.dtype, np.dtype("M8[ns]")) def test_series_repr_nat(self): - series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") result = repr(series) - expected = ('0 1970-01-01 00:00:00.000000\n' - '1 1970-01-01 00:00:00.000001\n' - '2 1970-01-01 00:00:00.000002\n' - '3 NaT\n' - 'dtype: datetime64[ns]') + expected = ( + "0 1970-01-01 00:00:00.000000\n" + "1 1970-01-01 00:00:00.000001\n" + "2 1970-01-01 00:00:00.000002\n" + "3 NaT\n" + "dtype: datetime64[ns]" + ) assert result == expected def test_asfreq_keep_index_name(self): # GH #9854 - index_name = 'bar' - index = pd.date_range('20130101', periods=20, name=index_name) - df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index) + index_name = "bar" + index = pd.date_range("20130101", periods=20, name=index_name) + df = pd.DataFrame([x for x in range(20)], columns=["foo"], index=index) assert index_name == df.index.name - assert index_name == df.asfreq('10D').index.name + assert index_name == df.asfreq("10D").index.name def test_promote_datetime_date(self): - rng = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000", periods=20) ts = Series(np.random.randn(20), index=rng) ts_slice = ts[5:] @@ -619,8 +655,8 @@ def test_promote_datetime_date(self): assert_series_equal(result2, expected) # test asfreq - result = ts2.asfreq('4H', method='ffill') - expected = ts[5:].asfreq('4H', method='ffill') + result = ts2.asfreq("4H", method="ffill") + expected = ts[5:].asfreq("4H", method="ffill") assert_series_equal(result, expected) result = rng.get_indexer(ts2.index) @@ -628,13 +664,13 @@ def test_promote_datetime_date(self): tm.assert_numpy_array_equal(result, expected) def test_asfreq_normalize(self): - rng = date_range('1/1/2000 09:30', periods=20) - norm = date_range('1/1/2000', periods=20) + rng = date_range("1/1/2000 09:30", periods=20) + norm = date_range("1/1/2000", periods=20) vals = np.random.randn(20) ts = Series(vals, index=rng) - result = ts.asfreq('D', normalize=True) - norm = date_range('1/1/2000', periods=20) + result = ts.asfreq("D", normalize=True) + norm = date_range("1/1/2000", periods=20) expected = Series(vals, index=norm) assert_series_equal(result, expected) @@ -642,80 +678,80 @@ def test_asfreq_normalize(self): vals = np.random.randn(20, 3) ts = DataFrame(vals, index=rng) - result = ts.asfreq('D', normalize=True) + result = ts.asfreq("D", normalize=True) expected = DataFrame(vals, index=norm) assert_frame_equal(result, expected) def test_first_subset(self): - 
ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') - result = ts.first('10d') + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.first("10d") assert len(result) == 20 - ts = _simple_ts('1/1/2000', '1/1/2010') - result = ts.first('10d') + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.first("10d") assert len(result) == 10 - result = ts.first('3M') - expected = ts[:'3/31/2000'] + result = ts.first("3M") + expected = ts[:"3/31/2000"] assert_series_equal(result, expected) - result = ts.first('21D') + result = ts.first("21D") expected = ts[:21] assert_series_equal(result, expected) - result = ts[:0].first('3M') + result = ts[:0].first("3M") assert_series_equal(result, ts[:0]) def test_first_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "'first' only supports a DatetimeIndex index" with pytest.raises(TypeError, match=msg): - ser.first('1D') + ser.first("1D") def test_last_subset(self): - ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') - result = ts.last('10d') + ts = _simple_ts("1/1/2000", "1/1/2010", freq="12h") + result = ts.last("10d") assert len(result) == 20 - ts = _simple_ts('1/1/2000', '1/1/2010') - result = ts.last('10d') + ts = _simple_ts("1/1/2000", "1/1/2010") + result = ts.last("10d") assert len(result) == 10 - result = ts.last('21D') - expected = ts['12/12/2009':] + result = ts.last("21D") + expected = ts["12/12/2009":] assert_series_equal(result, expected) - result = ts.last('21D') + result = ts.last("21D") expected = ts[-21:] assert_series_equal(result, expected) - result = ts[:0].last('3M') + result = ts[:0].last("3M") assert_series_equal(result, ts[:0]) def test_last_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "'last' only supports a DatetimeIndex index" with pytest.raises(TypeError, match=msg): - ser.last('1D') + ser.last("1D") def test_format_pre_1900_dates(self): - rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC') + rng = date_range("1/1/1850", "1/1/1950", freq="A-DEC") rng.format() ts = Series(1, index=rng) repr(ts) def test_at_time(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) rs = ts.at_time(rng[1]) assert (rs.index.hour == rng[1].hour).all() assert (rs.index.minute == rng[1].minute).all() assert (rs.index.second == rng[1].second).all() - result = ts.at_time('9:30') + result = ts.at_time("9:30") expected = ts.at_time(time(9, 30)) assert_series_equal(result, expected) @@ -731,33 +767,33 @@ def test_at_time(self): assert_series_equal(result, expected) tm.assert_frame_equal(result_df, exp_df) - chunk = df.loc['1/4/2000':] + chunk = df.loc["1/4/2000":] result = chunk.loc[time(9, 30)] expected = result_df[-1:] tm.assert_frame_equal(result, expected) # midnight, everything - rng = date_range('1/1/2000', '1/31/2000') + rng = date_range("1/1/2000", "1/31/2000") ts = Series(np.random.randn(len(rng)), index=rng) result = ts.at_time(time(0, 0)) assert_series_equal(result, ts) # time doesn't exist - rng = date_range('1/1/2012', freq='23Min', periods=384) + rng = date_range("1/1/2012", freq="23Min", periods=384) ts = Series(np.random.randn(len(rng)), rng) - rs = ts.at_time('16:00') + rs = ts.at_time("16:00") assert len(rs) == 0 def test_at_time_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): - 
ser.at_time('00:00') + ser.at_time("00:00") def test_between(self): - series = Series(date_range('1/1/2000', periods=10)) + series = Series(date_range("1/1/2000", periods=10)) left, right = series[[2, 7]] result = series.between(left, right) @@ -765,7 +801,7 @@ def test_between(self): assert_series_equal(result, expected) def test_between_time(self): - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) stime = time(0, 0) etime = time(1, 0) @@ -792,12 +828,12 @@ def test_between_time(self): else: assert t < etime - result = ts.between_time('00:00', '01:00') + result = ts.between_time("00:00", "01:00") expected = ts.between_time(stime, etime) assert_series_equal(result, expected) # across midnight - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = Series(np.random.randn(len(rng)), index=rng) stime = time(22, 0) etime = time(9, 0) @@ -826,40 +862,45 @@ def test_between_time(self): def test_between_time_raises(self): # GH20725 - ser = pd.Series('a b c'.split()) + ser = pd.Series("a b c".split()) msg = "Index must be DatetimeIndex" with pytest.raises(TypeError, match=msg): - ser.between_time(start_time='00:00', end_time='12:00') + ser.between_time(start_time="00:00", end_time="12:00") def test_between_time_types(self): # GH11818 - rng = date_range('1/1/2000', '1/5/2000', freq='5min') - msg = (r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" - " to a time") + rng = date_range("1/1/2000", "1/5/2000", freq="5min") + msg = ( + r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" + " to a time" + ) with pytest.raises(ValueError, match=msg): - rng.indexer_between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) - frame = DataFrame({'A': 0}, index=rng) + frame = DataFrame({"A": 0}, index=rng) with pytest.raises(ValueError, match=msg): - frame.between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + frame.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) series = Series(0, index=rng) with pytest.raises(ValueError, match=msg): - series.between_time(datetime(2010, 1, 2, 1), - datetime(2010, 1, 2, 5)) + series.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) @td.skip_if_has_locale def test_between_time_formats(self): # GH11818 - rng = date_range('1/1/2000', '1/5/2000', freq='5min') + rng = date_range("1/1/2000", "1/5/2000", freq="5min") ts = DataFrame(np.random.randn(len(rng), 2), index=rng) - strings = [("2:00", "2:30"), ("0200", "0230"), ("2:00am", "2:30am"), - ("0200am", "0230am"), ("2:00:00", "2:30:00"), - ("020000", "023000"), ("2:00:00am", "2:30:00am"), - ("020000am", "023000am")] + strings = [ + ("2:00", "2:30"), + ("0200", "0230"), + ("2:00am", "2:30am"), + ("0200am", "0230am"), + ("2:00:00", "2:30:00"), + ("020000", "023000"), + ("2:00:00am", "2:30:00am"), + ("020000am", "023000am"), + ] expected_length = 28 for time_string in strings: @@ -867,38 +908,37 @@ def test_between_time_formats(self): def test_between_time_axis(self): # issue 8839 - rng = date_range('1/1/2000', periods=100, freq='10min') + rng = date_range("1/1/2000", periods=100, freq="10min") ts = Series(np.random.randn(len(rng)), index=rng) - stime, etime = ('08:00:00', '09:00:00') + stime, etime = ("08:00:00", "09:00:00") expected_length = 7 assert len(ts.between_time(stime, etime)) == expected_length assert 
len(ts.between_time(stime, etime, axis=0)) == expected_length - msg = ("No axis named 1 for object type" - " ") + msg = "No axis named 1 for object type" " " with pytest.raises(ValueError, match=msg): ts.between_time(stime, etime, axis=1) def test_to_period(self): from pandas.core.indexes.period import period_range - ts = _simple_ts('1/1/2000', '1/1/2001') + ts = _simple_ts("1/1/2000", "1/1/2001") pts = ts.to_period() exp = ts.copy() - exp.index = period_range('1/1/2000', '1/1/2001') + exp.index = period_range("1/1/2000", "1/1/2001") assert_series_equal(pts, exp) - pts = ts.to_period('M') - exp.index = exp.index.asfreq('M') - tm.assert_index_equal(pts.index, exp.index.asfreq('M')) + pts = ts.to_period("M") + exp.index = exp.index.asfreq("M") + tm.assert_index_equal(pts.index, exp.index.asfreq("M")) assert_series_equal(pts, exp) # GH 7606 without freq - idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04']) - exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03', - '2011-01-04'], freq='D') + idx = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"]) + exp_idx = pd.PeriodIndex( + ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], freq="D" + ) s = Series(np.random.randn(4), index=idx) expected = s.copy() @@ -915,7 +955,7 @@ def test_to_period(self): assert_frame_equal(df.to_period(axis=1), expected) def test_groupby_count_dateparseerror(self): - dr = date_range(start='1/1/2012', freq='5min', periods=10) + dr = date_range(start="1/1/2012", freq="5min", periods=10) # BAD Example, datetimes first s = Series(np.arange(10), index=[dr, np.arange(10)]) @@ -929,17 +969,17 @@ def test_groupby_count_dateparseerror(self): assert_series_equal(result, expected) def test_to_csv_numpy_16_bug(self): - frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) + frame = DataFrame({"a": date_range("1/1/2000", periods=10)}) buf = StringIO() frame.to_csv(buf) result = buf.getvalue() - assert '2000-01-01' in result + assert "2000-01-01" in result def test_series_map_box_timedelta(self): # GH 11349 - s = Series(timedelta_range('1 day 1 s', periods=5, freq='h')) + s = Series(timedelta_range("1 day 1 s", periods=5, freq="h")) def f(x): return x.total_seconds() @@ -951,19 +991,20 @@ def f(x): def test_asfreq_resample_set_correct_freq(self): # GH5613 # we test if .asfreq() and .resample() set the correct value for .freq - df = pd.DataFrame({'date': ["2012-01-01", "2012-01-02", "2012-01-03"], - 'col': [1, 2, 3]}) + df = pd.DataFrame( + {"date": ["2012-01-01", "2012-01-02", "2012-01-03"], "col": [1, 2, 3]} + ) df = df.set_index(pd.to_datetime(df.date)) # testing the settings before calling .asfreq() and .resample() assert df.index.freq is None - assert df.index.inferred_freq == 'D' + assert df.index.inferred_freq == "D" # does .asfreq() set .freq correctly? - assert df.asfreq('D').index.freq == 'D' + assert df.asfreq("D").index.freq == "D" # does .resample() set .freq correctly? 
- assert df.resample('D').asfreq().index.freq == 'D' + assert df.resample("D").asfreq().index.freq == "D" def test_pickle(self): @@ -971,7 +1012,7 @@ def test_pickle(self): p = tm.round_trip_pickle(NaT) assert p is NaT - idx = pd.to_datetime(['2013-01-01', NaT, '2014-01-06']) + idx = pd.to_datetime(["2013-01-01", NaT, "2014-01-06"]) idx_p = tm.round_trip_pickle(idx) assert idx_p[0] == idx[0] assert idx_p[1] is NaT @@ -979,13 +1020,13 @@ def test_pickle(self): # GH11002 # don't infer freq - idx = date_range('1750-1-1', '2050-1-1', freq='7D') + idx = date_range("1750-1-1", "2050-1-1", freq="7D") idx_p = tm.round_trip_pickle(idx) tm.assert_index_equal(idx, idx_p) - @pytest.mark.parametrize('tz', [None, 'Asia/Tokyo', 'US/Eastern']) + @pytest.mark.parametrize("tz", [None, "Asia/Tokyo", "US/Eastern"]) def test_setops_preserve_freq(self, tz): - rng = date_range('1/1/2000', '1/1/2002', name='idx', tz=tz) + rng = date_range("1/1/2000", "1/1/2002", name="idx", tz=tz) result = rng[:50].union(rng[50:100]) assert result.name == rng.name @@ -1004,10 +1045,10 @@ def test_setops_preserve_freq(self, tz): result = rng[:50].intersection(rng[25:75]) assert result.name == rng.name - assert result.freqstr == 'D' + assert result.freqstr == "D" assert result.tz == rng.tz - nofreq = DatetimeIndex(list(rng[25:75]), name='other') + nofreq = DatetimeIndex(list(rng[25:75]), name="other") result = rng[:50].union(nofreq) assert result.name is None assert result.freq == rng.freq @@ -1020,25 +1061,24 @@ def test_setops_preserve_freq(self, tz): def test_from_M8_structured(self): dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))] - arr = np.array(dates, - dtype=[('Date', 'M8[us]'), ('Forecasting', 'M8[us]')]) + arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")]) df = DataFrame(arr) - assert df['Date'][0] == dates[0][0] - assert df['Forecasting'][0] == dates[0][1] + assert df["Date"][0] == dates[0][0] + assert df["Forecasting"][0] == dates[0][1] - s = Series(arr['Date']) + s = Series(arr["Date"]) assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = Series.from_array(arr['Date'], Index([0])) + s = Series.from_array(arr["Date"], Index([0])) assert s[0] == dates[0][0] def test_get_level_values_box(self): from pandas import MultiIndex - dates = date_range('1/1/2000', periods=4) + dates = date_range("1/1/2000", periods=4) levels = [dates, [0, 1]] codes = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]] @@ -1048,18 +1088,22 @@ def test_get_level_values_box(self): def test_view_tz(self): # GH#24024 - ser = pd.Series(pd.date_range('2000', periods=4, tz='US/Central')) + ser = pd.Series(pd.date_range("2000", periods=4, tz="US/Central")) result = ser.view("i8") - expected = pd.Series([946706400000000000, - 946792800000000000, - 946879200000000000, - 946965600000000000]) + expected = pd.Series( + [ + 946706400000000000, + 946792800000000000, + 946879200000000000, + 946965600000000000, + ] + ) tm.assert_series_equal(result, expected) def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
- ser = pd.Series(pd.date_range('2000', periods=2)) - expected = np.array(['2000-01-01', '2000-01-02'], dtype='M8[ns]') + ser = pd.Series(pd.date_range("2000", periods=2)) + expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") with tm.assert_produces_warning(None): result = np.asarray(ser) @@ -1069,14 +1113,13 @@ def test_asarray_tz_naive(self): with tm.assert_produces_warning(None): result = np.asarray(ser, dtype=object) - expected = np.array([pd.Timestamp('2000-01-01'), - pd.Timestamp('2000-01-02')]) + expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) def test_asarray_tz_aware(self): - tz = 'US/Central' - ser = pd.Series(pd.date_range('2000', periods=2, tz=tz)) - expected = np.array(['2000-01-01T06', '2000-01-02T06'], dtype='M8[ns]') + tz = "US/Central" + ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) + expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") # We warn by default and return an ndarray[M8[ns]] with tm.assert_produces_warning(FutureWarning): result = np.asarray(ser) @@ -1090,8 +1133,9 @@ def test_asarray_tz_aware(self): tm.assert_numpy_array_equal(result, expected) # Future behavior with no warning - expected = np.array([pd.Timestamp("2000-01-01", tz=tz), - pd.Timestamp("2000-01-02", tz=tz)]) + expected = np.array( + [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] + ) with tm.assert_produces_warning(None): result = np.asarray(ser, dtype=object) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index 6ff02b3160020..c16e2864b131f 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -20,77 +20,80 @@ class TestSeriesTimezones: # Series.tz_localize def test_series_tz_localize(self): - rng = date_range('1/1/2011', periods=100, freq='H') + rng = date_range("1/1/2011", periods=100, freq="H") ts = Series(1, index=rng) - result = ts.tz_localize('utc') - assert result.index.tz.zone == 'UTC' + result = ts.tz_localize("utc") + assert result.index.tz.zone == "UTC" # Can't localize if already tz-aware - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") ts = Series(1, index=rng) - with pytest.raises(TypeError, match='Already tz-aware'): - ts.tz_localize('US/Eastern') + with pytest.raises(TypeError, match="Already tz-aware"): + ts.tz_localize("US/Eastern") - @pytest.mark.filterwarnings('ignore::FutureWarning') + @pytest.mark.filterwarnings("ignore::FutureWarning") def test_tz_localize_errors_deprecation(self): # GH 22644 - tz = 'Europe/Warsaw' + tz = "Europe/Warsaw" n = 60 - rng = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + rng = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") ts = Series(rng) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): with pytest.raises(ValueError): - ts.dt.tz_localize(tz, errors='foo') + ts.dt.tz_localize(tz, errors="foo") # make sure errors='coerce' gets mapped correctly to nonexistent - result = ts.dt.tz_localize(tz, errors='coerce') - expected = ts.dt.tz_localize(tz, nonexistent='NaT') + result = ts.dt.tz_localize(tz, errors="coerce") + expected = ts.dt.tz_localize(tz, nonexistent="NaT") tm.assert_series_equal(result, expected) def test_series_tz_localize_ambiguous_bool(self): # make sure that we are correctly accepting bool values as ambiguous # GH#14402 - ts = Timestamp('2015-11-01 01:00:03') - expected0 = 
Timestamp('2015-11-01 01:00:03-0500', tz='US/Central') - expected1 = Timestamp('2015-11-01 01:00:03-0600', tz='US/Central') + ts = Timestamp("2015-11-01 01:00:03") + expected0 = Timestamp("2015-11-01 01:00:03-0500", tz="US/Central") + expected1 = Timestamp("2015-11-01 01:00:03-0600", tz="US/Central") ser = Series([ts]) expected0 = Series([expected0]) expected1 = Series([expected1]) with pytest.raises(pytz.AmbiguousTimeError): - ser.dt.tz_localize('US/Central') + ser.dt.tz_localize("US/Central") - result = ser.dt.tz_localize('US/Central', ambiguous=True) + result = ser.dt.tz_localize("US/Central", ambiguous=True) tm.assert_series_equal(result, expected0) - result = ser.dt.tz_localize('US/Central', ambiguous=[True]) + result = ser.dt.tz_localize("US/Central", ambiguous=[True]) tm.assert_series_equal(result, expected0) - result = ser.dt.tz_localize('US/Central', ambiguous=False) + result = ser.dt.tz_localize("US/Central", ambiguous=False) tm.assert_series_equal(result, expected1) - result = ser.dt.tz_localize('US/Central', ambiguous=[False]) + result = ser.dt.tz_localize("US/Central", ambiguous=[False]) tm.assert_series_equal(result, expected1) - @pytest.mark.parametrize('tz', ['Europe/Warsaw', 'dateutil/Europe/Warsaw']) - @pytest.mark.parametrize('method, exp', [ - ['shift_forward', '2015-03-29 03:00:00'], - ['NaT', NaT], - ['raise', None], - ['foo', 'invalid'] - ]) + @pytest.mark.parametrize("tz", ["Europe/Warsaw", "dateutil/Europe/Warsaw"]) + @pytest.mark.parametrize( + "method, exp", + [ + ["shift_forward", "2015-03-29 03:00:00"], + ["NaT", NaT], + ["raise", None], + ["foo", "invalid"], + ], + ) def test_series_tz_localize_nonexistent(self, tz, method, exp): # GH 8917 n = 60 - dti = date_range(start='2015-03-29 02:00:00', periods=n, freq='min') + dti = date_range(start="2015-03-29 02:00:00", periods=n, freq="min") s = Series(1, dti) - if method == 'raise': + if method == "raise": with pytest.raises(pytz.NonExistentTimeError): s.tz_localize(tz, nonexistent=method) - elif exp == 'invalid': + elif exp == "invalid": with pytest.raises(ValueError): dti.tz_localize(tz, nonexistent=method) else: @@ -98,12 +101,12 @@ def test_series_tz_localize_nonexistent(self, tz, method, exp): expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz)) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_series_tz_localize_empty(self, tzstr): # GH#2248 ser = Series() - ser2 = ser.tz_localize('utc') + ser2 = ser.tz_localize("utc") assert ser2.index.tz == pytz.utc ser2 = ser.tz_localize(tzstr) @@ -113,24 +116,23 @@ def test_series_tz_localize_empty(self, tzstr): # Series.tz_convert def test_series_tz_convert(self): - rng = date_range('1/1/2011', periods=200, freq='D', tz='US/Eastern') + rng = date_range("1/1/2011", periods=200, freq="D", tz="US/Eastern") ts = Series(1, index=rng) - result = ts.tz_convert('Europe/Berlin') - assert result.index.tz.zone == 'Europe/Berlin' + result = ts.tz_convert("Europe/Berlin") + assert result.index.tz.zone == "Europe/Berlin" # can't convert tz-naive - rng = date_range('1/1/2011', periods=200, freq='D') + rng = date_range("1/1/2011", periods=200, freq="D") ts = Series(1, index=rng) with pytest.raises(TypeError, match="Cannot convert tz-naive"): - ts.tz_convert('US/Eastern') + ts.tz_convert("US/Eastern") def test_series_tz_convert_to_utc(self): - base = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='UTC') - idx1 = 
base.tz_convert('Asia/Tokyo')[:2] - idx2 = base.tz_convert('US/Eastern')[1:] + base = DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC") + idx1 = base.tz_convert("Asia/Tokyo")[:2] + idx2 = base.tz_convert("US/Eastern")[1:] res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) tm.assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) @@ -139,28 +141,26 @@ def test_series_tz_convert_to_utc(self): # Series.append def test_series_append_aware(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='US/Eastern') + exp_index = DatetimeIndex( + ["2011-01-01 01:00", "2011-01-01 02:00"], tz="US/Eastern" + ) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="UTC") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="UTC") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], - tz='UTC') + exp_index = DatetimeIndex(["2011-01-01 01:00", "2011-01-01 02:00"], tz="UTC") exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) utc = rng1.tz @@ -168,22 +168,23 @@ def test_series_append_aware(self): # GH#7795 # different tz coerces to object dtype, not UTC - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', - tz='US/Eastern') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Central') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H", tz="US/Eastern") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Central") ser1 = Series([1], index=rng1) ser2 = Series([2], index=rng2) ts_result = ser1.append(ser2) - exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), - Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp_index = Index( + [ + Timestamp("1/1/2011 01:00", tz="US/Eastern"), + Timestamp("1/1/2011 02:00", tz="US/Central"), + ] + ) exp = Series([1, 2], index=exp_index) tm.assert_series_equal(ts_result, exp) def test_series_append_aware_naive(self): - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') - rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") + rng2 = date_range("1/1/2011 02:00", periods=1, freq="H", tz="US/Eastern") ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ser1.append(ser2) @@ -192,7 +193,7 @@ def test_series_append_aware_naive(self): assert ts_result.index.equals(expected) # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') + rng1 = date_range("1/1/2011 01:00", periods=1, freq="H") rng2 = range(100) ser1 = Series(np.random.randn(len(rng1)), index=rng1) ser2 = Series(np.random.randn(len(rng2)), index=rng2) @@ -202,18 +203,23 @@ def test_series_append_aware_naive(self): assert ts_result.index.equals(expected) def 
test_series_append_dst(self): - rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') - rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', - tz='US/Eastern') + rng1 = date_range("1/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") + rng2 = date_range("8/1/2016 01:00", periods=3, freq="H", tz="US/Eastern") ser1 = Series([1, 2, 3], index=rng1) ser2 = Series([10, 11, 12], index=rng2) ts_result = ser1.append(ser2) - exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', - '2016-01-01 03:00', '2016-08-01 01:00', - '2016-08-01 02:00', '2016-08-01 03:00'], - tz='US/Eastern') + exp_index = DatetimeIndex( + [ + "2016-01-01 01:00", + "2016-01-01 02:00", + "2016-01-01 03:00", + "2016-08-01 01:00", + "2016-08-01 02:00", + "2016-08-01 03:00", + ], + tz="US/Eastern", + ) exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) tm.assert_series_equal(ts_result, exp) assert ts_result.index.tz == rng1.tz @@ -223,8 +229,10 @@ def test_series_append_dst(self): def test_dateutil_tzoffset_support(self): values = [188.5, 328.25] tzinfo = tzoffset(None, 7200) - index = [datetime(2012, 5, 11, 11, tzinfo=tzinfo), - datetime(2012, 5, 11, 12, tzinfo=tzinfo)] + index = [ + datetime(2012, 5, 11, 11, tzinfo=tzinfo), + datetime(2012, 5, 11, 12, tzinfo=tzinfo), + ] series = Series(data=values, index=index) assert series.index.tz == tzinfo @@ -232,29 +240,29 @@ def test_dateutil_tzoffset_support(self): # it works! #2443 repr(series.index[0]) - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_tz_aware_asfreq(self, tz): - dr = date_range('2011-12-01', '2012-07-20', freq='D', tz=tz) + dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz) ser = Series(np.random.randn(len(dr)), index=dr) # it works! 
- ser.asfreq('T') + ser.asfreq("T") - @pytest.mark.parametrize('tz', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_string_index_alias_tz_aware(self, tz): - rng = date_range('1/1/2000', periods=10, tz=tz) + rng = date_range("1/1/2000", periods=10, tz=tz) ser = Series(np.random.randn(len(rng)), index=rng) - result = ser['1/3/2000'] + result = ser["1/3/2000"] tm.assert_almost_equal(result, ser[2]) # TODO: De-duplicate with test below def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): - rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + rng = date_range("1/1/2011", periods=10, freq="H", tz="US/Eastern") ser = Series(np.random.randn(len(rng)), index=rng) - ts_moscow = ser.tz_convert('Europe/Moscow') + ts_moscow = ser.tz_convert("Europe/Moscow") result = ser + ts_moscow assert result.index.tz is pytz.utc @@ -263,30 +271,32 @@ def test_series_add_tz_mismatch_converts_to_utc_duplicate(self): assert result.index.tz is pytz.utc def test_series_add_tz_mismatch_converts_to_utc(self): - rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + rng = date_range("1/1/2011", periods=100, freq="H", tz="utc") perm = np.random.permutation(100)[:90] - ser1 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('US/Eastern')) + ser1 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("US/Eastern") + ) perm = np.random.permutation(100)[:90] - ser2 = Series(np.random.randn(90), - index=rng.take(perm).tz_convert('Europe/Berlin')) + ser2 = Series( + np.random.randn(90), index=rng.take(perm).tz_convert("Europe/Berlin") + ) result = ser1 + ser2 - uts1 = ser1.tz_convert('utc') - uts2 = ser2.tz_convert('utc') + uts1 = ser1.tz_convert("utc") + uts2 = ser2.tz_convert("utc") expected = uts1 + uts2 assert result.index.tz == pytz.UTC tm.assert_series_equal(result, expected) def test_series_add_aware_naive_raises(self): - rng = date_range('1/1/2011', periods=10, freq='H') + rng = date_range("1/1/2011", periods=10, freq="H") ser = Series(np.random.randn(len(rng)), index=rng) - ser_utc = ser.tz_localize('utc') + ser_utc = ser.tz_localize("utc") with pytest.raises(Exception): ser + ser_utc @@ -295,21 +305,22 @@ def test_series_add_aware_naive_raises(self): ser_utc + ser def test_series_align_aware(self): - idx1 = date_range('2001', periods=5, freq='H', tz='US/Eastern') + idx1 = date_range("2001", periods=5, freq="H", tz="US/Eastern") ser = Series(np.random.randn(len(idx1)), index=idx1) - ser_central = ser.tz_convert('US/Central') + ser_central = ser.tz_convert("US/Central") # # different timezones convert to UTC new1, new2 = ser.align(ser_central) assert new1.index.tz == pytz.UTC assert new2.index.tz == pytz.UTC - @pytest.mark.parametrize('tzstr', ['US/Eastern', 'dateutil/US/Eastern']) + @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_localized_at_time_between_time(self, tzstr): from datetime import time + tz = timezones.maybe_get_tz(tzstr) - rng = date_range('4/16/2012', '5/1/2012', freq='H') + rng = date_range("4/16/2012", "5/1/2012", freq="H") ts = Series(np.random.randn(len(rng)), index=rng) ts_local = ts.tz_localize(tzstr) @@ -325,15 +336,15 @@ def test_localized_at_time_between_time(self, tzstr): tm.assert_series_equal(result, expected) assert timezones.tz_compare(result.index.tz, tz) - @pytest.mark.parametrize('tzstr', ['Europe/Berlin', - 'dateutil/Europe/Berlin']) + @pytest.mark.parametrize("tzstr", ["Europe/Berlin", "dateutil/Europe/Berlin"]) def 
test_getitem_pydatetime_tz(self, tzstr): tz = timezones.maybe_get_tz(tzstr) - index = date_range(start='2012-12-24 16:00', end='2012-12-24 18:00', - freq='H', tz=tzstr) + index = date_range( + start="2012-12-24 16:00", end="2012-12-24 18:00", freq="H", tz=tzstr + ) ts = Series(index=index, data=index.hour) - time_pandas = Timestamp('2012-12-24 17:00', tz=tzstr) + time_pandas = Timestamp("2012-12-24 17:00", tz=tzstr) dt = datetime(2012, 12, 24, 17, 0) time_datetime = conversion.localize_pydatetime(dt, tz) @@ -341,32 +352,30 @@ def test_getitem_pydatetime_tz(self, tzstr): def test_series_truncate_datetimeindex_tz(self): # GH 9243 - idx = date_range('4/1/2005', '4/30/2005', freq='D', tz='US/Pacific') + idx = date_range("4/1/2005", "4/30/2005", freq="D", tz="US/Pacific") s = Series(range(len(idx)), index=idx) result = s.truncate(datetime(2005, 4, 2), datetime(2005, 4, 4)) expected = Series([1, 2, 3], index=idx[1:4]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('copy', [True, False]) - @pytest.mark.parametrize('method, tz', [ - ['tz_localize', None], - ['tz_convert', 'Europe/Berlin'] - ]) + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize( + "method, tz", [["tz_localize", None], ["tz_convert", "Europe/Berlin"]] + ) def test_tz_localize_convert_copy_inplace_mutate(self, copy, method, tz): # GH 6326 - result = Series(np.arange(0, 5), - index=date_range('20131027', periods=5, freq='1H', - tz=tz)) - getattr(result, method)('UTC', copy=copy) - expected = Series(np.arange(0, 5), - index=date_range('20131027', periods=5, freq='1H', - tz=tz)) + result = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) + getattr(result, method)("UTC", copy=copy) + expected = Series( + np.arange(0, 5), index=date_range("20131027", periods=5, freq="1H", tz=tz) + ) tm.assert_series_equal(result, expected) def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture): # GH 25843 tz = tz_aware_fixture - result = Series([Timestamp('2019', tz=tz)], - dtype='datetime64[ns]') - expected = Series([Timestamp('2019')]) + result = Series([Timestamp("2019", tz=tz)], dtype="datetime64[ns]") + expected = Series([Timestamp("2019")]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 183aa6e393355..c024e9caba156 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -8,19 +8,10 @@ import pandas.util.testing as tm UNARY_UFUNCS = [np.positive, np.floor, np.exp] -BINARY_UFUNCS = [ - np.add, # dunder op - np.logaddexp, -] -SPARSE = [ - True, - False -] -SPARSE_IDS = ['sparse', 'dense'] -SHUFFLE = [ - True, - False -] +BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op +SPARSE = [True, False] +SPARSE_IDS = ["sparse", "dense"] +SHUFFLE = [True, False] @pytest.fixture @@ -28,8 +19,8 @@ def arrays_for_binary_ufunc(): """ A pair of random, length-100 integer-dtype arrays, that are mostly 0. 
""" - a1 = np.random.randint(0, 10, 100, dtype='int64') - a2 = np.random.randint(0, 10, 100, dtype='int64') + a1 = np.random.randint(0, 10, 100, dtype="int64") + a2 = np.random.randint(0, 10, 100, dtype="int64") a1[::3] = 0 a2[::4] = 0 return a1, a2 @@ -39,10 +30,10 @@ def arrays_for_binary_ufunc(): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) def test_unary_ufunc(ufunc, sparse): # Test that ufunc(Series) == Series(ufunc) - array = np.random.randint(0, 10, 10, dtype='int64') + array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype('int64', 0)) + array = pd.SparseArray(array, dtype=pd.SparseDtype("int64", 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -55,20 +46,20 @@ def test_unary_ufunc(ufunc, sparse): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) other = a2 array_args = (a1, a2) - series_args = (series, other) # ufunc(series, array) + series_args = (series, other) # ufunc(series, array) if flip: array_args = reversed(array_args) @@ -81,22 +72,22 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) other = pd.Index(a2, name=name).astype("int64") array_args = (a1, a2) - series_args = (series, other) # ufunc(series, array) + series_args = (series, other) # ufunc(series, array) if flip: array_args = reversed(array_args) @@ -109,18 +100,18 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) -@pytest.mark.parametrize("shuffle", [True, False], ids=['unaligned', - 'aligned']) -@pytest.mark.parametrize("flip", [True, False], ids=['flipped', 'straight']) -def test_binary_ufunc_with_series(flip, shuffle, sparse, ufunc, - arrays_for_binary_ufunc): +@pytest.mark.parametrize("shuffle", [True, False], ids=["unaligned", "aligned"]) +@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"]) +def test_binary_ufunc_with_series( + flip, shuffle, sparse, ufunc, arrays_for_binary_ufunc +): # Test that # * func(Series(a), Series(b)) == Series(ufunc(a, b)) # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -179,8 +170,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("shuffle", SHUFFLE) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning") -def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, - arrays_for_binary_ufunc): +def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc): # Test that # the same conditions from binary_ufunc_scalar apply to # ufuncs with multiple outputs. @@ -193,8 +183,8 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype('int64', 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype('int64', 0)) + a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -234,12 +224,11 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): @pytest.mark.parametrize("sparse", SPARSE, ids=SPARSE_IDS) @pytest.mark.parametrize("ufunc", BINARY_UFUNCS) -def test_binary_ufunc_drops_series_name(ufunc, sparse, - arrays_for_binary_ufunc): +def test_binary_ufunc_drops_series_name(ufunc, sparse, arrays_for_binary_ufunc): # Drop the names when they differ. a1, a2 = arrays_for_binary_ufunc - s1 = pd.Series(a1, name='a') - s2 = pd.Series(a2, name='b') + s1 = pd.Series(a1, name="a") + s2 = pd.Series(a2, name="b") result = ufunc(s1, s2) assert result.name is None @@ -256,55 +245,51 @@ def __add__(self, other): arr = np.array([Dummy(0), Dummy(1)]) ser = pd.Series(arr) tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr))) - tm.assert_series_equal(np.add(ser, Dummy(1)), - pd.Series(np.add(ser, Dummy(1)))) - - -@pytest.mark.parametrize('values', [ - pd.array([1, 3, 2]), - pytest.param( - pd.array([1, 10, 0], dtype='Sparse[int]'), - marks=pytest.mark.xfail(resason='GH-27080. 
Bug in SparseArray') - ), - pd.to_datetime(['2000', '2010', '2001']), - pd.to_datetime(['2000', '2010', '2001']).tz_localize("CET"), - pd.to_datetime(['2000', '2010', '2001']).to_period(freq="D"), - -]) + tm.assert_series_equal(np.add(ser, Dummy(1)), pd.Series(np.add(ser, Dummy(1)))) + + +@pytest.mark.parametrize( + "values", + [ + pd.array([1, 3, 2]), + pytest.param( + pd.array([1, 10, 0], dtype="Sparse[int]"), + marks=pytest.mark.xfail(resason="GH-27080. Bug in SparseArray"), + ), + pd.to_datetime(["2000", "2010", "2001"]), + pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), + pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), + ], +) def test_reduce(values): a = pd.Series(values) assert np.maximum.reduce(a) == values[1] -@pytest.mark.parametrize('type_', [ - list, - deque, - tuple, -]) +@pytest.mark.parametrize("type_", [list, deque, tuple]) def test_binary_ufunc_other_types(type_): - a = pd.Series([1, 2, 3], name='name') + a = pd.Series([1, 2, 3], name="name") b = type_([3, 4, 5]) result = np.add(a, b) - expected = pd.Series(np.add(a.to_numpy(), b), name='name') + expected = pd.Series(np.add(a.to_numpy(), b), name="name") tm.assert_series_equal(result, expected) def test_object_dtype_ok(): - class Thing: def __init__(self, value): self.value = value def __add__(self, other): - other = getattr(other, 'value', other) + other = getattr(other, "value", other) return type(self)(self.value + other) def __eq__(self, other): return type(other) is Thing and self.value == other.value def __repr__(self): - return 'Thing({})'.format(self.value) + return "Thing({})".format(self.value) s = pd.Series([Thing(1), Thing(2)]) result = np.add(s, Thing(1)) @@ -319,9 +304,5 @@ def test_outer(): with tm.assert_produces_warning(FutureWarning): result = np.subtract.outer(s, o) - expected = np.array([ - [0, -1, -2], - [1, 0, -1], - [2, 1, 0] - ], dtype=np.dtype('int64')) + expected = np.array([[0, -1, -2], [1, 0, -1], [2, 1, 0]], dtype=np.dtype("int64")) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_validate.py b/pandas/tests/series/test_validate.py index cef38d5ce3f23..c4311f507f7ee 100644 --- a/pandas/tests/series/test_validate.py +++ b/pandas/tests/series/test_validate.py @@ -4,12 +4,13 @@ class TestSeriesValidate: """Tests for error handling related to data types of method arguments.""" - @pytest.mark.parametrize("func", ["reset_index", "_set_name", - "sort_values", "sort_index", - "rename", "dropna"]) + @pytest.mark.parametrize( + "func", + ["reset_index", "_set_name", "sort_values", "sort_index", "rename", "dropna"], + ) @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_args(self, string_series, func, inplace): - msg = "For argument \"inplace\" expected type bool" + msg = 'For argument "inplace" expected type bool' kwargs = dict(inplace=inplace) if func == "_set_name": diff --git a/pandas/tests/sparse/frame/conftest.py b/pandas/tests/sparse/frame/conftest.py index 3423260c1720a..989b58419c2cd 100644 --- a/pandas/tests/sparse/frame/conftest.py +++ b/pandas/tests/sparse/frame/conftest.py @@ -3,16 +3,19 @@ from pandas import DataFrame, SparseArray, SparseDataFrame, bdate_range -data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6], - 'C': np.arange(10, dtype=np.float64), - 'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]} -dates = bdate_range('1/1/2011', periods=10) +data = { + "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, 
np.nan, np.nan, np.nan, 3, 4, 5, 6],
+    "C": np.arange(10, dtype=np.float64),
+    "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan],
+}
+dates = bdate_range("1/1/2011", periods=10)
 # fixture names must be compatible with the tests in
 # tests/frame/test_api.SharedWithSparse
+
 @pytest.fixture
 def float_frame_dense():
     """
@@ -31,7 +34,7 @@ def float_frame():
     Columns are ['A', 'B', 'C', 'D']; some entries are missing
     """
     # default_kind='block' is the default
-    return SparseDataFrame(data, index=dates, default_kind='block')
+    return SparseDataFrame(data, index=dates, default_kind="block")
@@ -42,7 +45,7 @@ def float_frame_int_kind():
     Columns are ['A', 'B', 'C', 'D'] and default_kind='integer'.
     Some entries are missing.
     """
-    return SparseDataFrame(data, index=dates, default_kind='integer')
+    return SparseDataFrame(data, index=dates, default_kind="integer")
@@ -53,7 +56,7 @@ def float_string_frame():
     Columns are ['A', 'B', 'C', 'D', 'foo']; some entries are missing
     """
     sdf = SparseDataFrame(data, index=dates)
-    sdf['foo'] = SparseArray(['bar'] * len(dates))
+    sdf["foo"] = SparseArray(["bar"] * len(dates))
     return sdf
@@ -66,7 +69,7 @@ def float_frame_fill0_dense():
     """
     values = SparseDataFrame(data).values
     values[np.isnan(values)] = 0
-    return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates)
+    return DataFrame(values, columns=["A", "B", "C", "D"], index=dates)
@@ -78,8 +81,9 @@ def float_frame_fill0():
     """
    values = SparseDataFrame(data).values
     values[np.isnan(values)] = 0
-    return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
-                           default_fill_value=0, index=dates)
+    return SparseDataFrame(
+        values, columns=["A", "B", "C", "D"], default_fill_value=0, index=dates
+    )
@@ -91,7 +95,7 @@ def float_frame_fill2_dense():
     """
     values = SparseDataFrame(data).values
     values[np.isnan(values)] = 2
-    return DataFrame(values, columns=['A', 'B', 'C', 'D'], index=dates)
+    return DataFrame(values, columns=["A", "B", "C", "D"], index=dates)
@@ -103,8 +107,9 @@ def float_frame_fill2():
     """
     values = SparseDataFrame(data).values
     values[np.isnan(values)] = 2
-    return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
-                           default_fill_value=2, index=dates)
+    return SparseDataFrame(
+        values, columns=["A", "B", "C", "D"], default_fill_value=2, index=dates
+    )
 @pytest.fixture
diff --git a/pandas/tests/sparse/frame/test_analytics.py b/pandas/tests/sparse/frame/test_analytics.py
index ae97682f297ad..fae879b3d33b5 100644
--- a/pandas/tests/sparse/frame/test_analytics.py
+++ b/pandas/tests/sparse/frame/test_analytics.py
@@ -6,7 +6,7 @@
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
-@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)")
 def test_quantile():
     # GH 17386
     data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
@@ -24,7 +24,7 @@ def test_quantile():
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
-@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)')
+@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)")
 def test_quantile_multi():
     # GH 17386
     data = [[1, 1], [2, 10], [3, 100], [np.nan, np.nan]]
diff --git a/pandas/tests/sparse/frame/test_apply.py b/pandas/tests/sparse/frame/test_apply.py
index 4e677f5055e79..d8158db32d8f0 100644
--- a/pandas/tests/sparse/frame/test_apply.py
+++ b/pandas/tests/sparse/frame/test_apply.py
@@ -9,7 +9,7 @@
 @pytest.fixture
 def dates():
-    return bdate_range('1/1/2011', periods=10)
+    return bdate_range("1/1/2011", periods=10)
@@ -19,10 +19,12 @@ def empty():
 @pytest.fixture
 def frame(dates):
-    data = {'A': [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
-            'B': [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
-            'C': np.arange(10, dtype=np.float64),
-            'D': [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan]}
+    data = {
+        "A": [np.nan, np.nan, np.nan, 0, 1, 2, 3, 4, 5, 6],
+        "B": [0, 1, 2, np.nan, np.nan, np.nan, 3, 4, 5, 6],
+        "C": np.arange(10, dtype=np.float64),
+        "D": [0, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan],
+    }
     return SparseDataFrame(data, index=dates)
@@ -32,9 +34,9 @@ def fill_frame(frame):
     values = frame.values.copy()
     values[np.isnan(values)] = 2
-    return SparseDataFrame(values, columns=['A', 'B', 'C', 'D'],
-                           default_fill_value=2,
-                           index=frame.index)
+    return SparseDataFrame(
+        values, columns=["A", "B", "C", "D"], default_fill_value=2, index=frame.index
+    )
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
@@ -46,25 +48,22 @@ def test_apply(frame):
     # agg / broadcast
     # two FutureWarnings, so we can't check stacklevel properly.
-    with tm.assert_produces_warning(FutureWarning,
-                                    check_stacklevel=False):
+    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         broadcasted = frame.apply(np.sum, broadcast=True)
     assert isinstance(broadcasted, SparseDataFrame)
-    with tm.assert_produces_warning(FutureWarning,
-                                    check_stacklevel=False):
+    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
         exp = frame.to_dense().apply(np.sum, broadcast=True)
     tm.assert_frame_equal(broadcasted.to_dense(), exp)
     applied = frame.apply(np.sum)
-    tm.assert_series_equal(applied,
-                           frame.to_dense().apply(nanops.nansum).to_sparse())
+    tm.assert_series_equal(applied, frame.to_dense().apply(nanops.nansum).to_sparse())
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 def test_apply_fill(fill_frame):
     applied = fill_frame.apply(np.sqrt)
-    assert applied['A'].fill_value == np.sqrt(2)
+    assert applied["A"].fill_value == np.sqrt(2)
@@ -75,8 +74,7 @@ def test_apply_empty(empty):
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning")
 def test_apply_nonuq():
-    orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                     index=['a', 'a', 'c'])
+    orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
     sparse = orig.to_sparse()
     res = sparse.apply(lambda s: s[0], axis=1)
     exp = orig.apply(lambda s: s[0], axis=1)
@@ -107,8 +105,11 @@ def test_applymap(frame):
 @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 def test_apply_keep_sparse_dtype():
     # GH 23744
-    sdf = SparseDataFrame(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]),
-                          columns=['b', 'a', 'c'], default_fill_value=1)
+    sdf = SparseDataFrame(
+        np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1]]),
+        columns=["b", "a", "c"],
+        default_fill_value=1,
+    )
     df = DataFrame(sdf)
     expected = sdf.apply(np.exp)
diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
index d3e2e1357f9d7..96e3c4640d2f6 100644
--- a/pandas/tests/sparse/frame/test_frame.py
+++ b/pandas/tests/sparse/frame/test_frame.py
@@ -12,7 +12,11 @@
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.sparse import frame as spf
 from pandas.core.sparse.api import (
-    SparseArray, SparseDataFrame, SparseDtype, SparseSeries)
+    SparseArray,
+    SparseDataFrame,
+    SparseDtype,
SparseSeries, +) from pandas.tests.frame.test_api import SharedWithSparse from pandas.util import testing as tm @@ -53,8 +57,8 @@ def test_itertuples(self, float_frame): def test_fill_value_when_combine_const(self): # GH12723 - dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') - df = SparseDataFrame({'foo': dat}, index=range(6)) + dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") + df = SparseDataFrame({"foo": dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) @@ -81,23 +85,23 @@ def test_copy(self, float_frame): # this is now identical (but not is_a ) assert cp.index.identical(float_frame.index) - def test_constructor(self, float_frame, float_frame_int_kind, - float_frame_fill0): + def test_constructor(self, float_frame, float_frame_int_kind, float_frame_fill0): for col, series in float_frame.items(): assert isinstance(series, SparseSeries) - assert isinstance(float_frame_int_kind['A'].sp_index, IntIndex) + assert isinstance(float_frame_int_kind["A"].sp_index, IntIndex) # constructed zframe from matrix above - assert float_frame_fill0['A'].fill_value == 0 + assert float_frame_fill0["A"].fill_value == 0 # XXX: changed asarray - expected = pd.SparseArray([0, 0, 0, 0, 1., 2., 3., 4., 5., 6.], - fill_value=0, kind='block') - tm.assert_sp_array_equal(expected, - float_frame_fill0['A'].values) - tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., - 3., 4., 5., 6.]), - float_frame_fill0['A'].to_dense().values) + expected = pd.SparseArray( + [0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], fill_value=0, kind="block" + ) + tm.assert_sp_array_equal(expected, float_frame_fill0["A"].values) + tm.assert_numpy_array_equal( + np.array([0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), + float_frame_fill0["A"].to_dense().values, + ) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) @@ -115,9 +119,13 @@ def test_constructor(self, float_frame, float_frame_int_kind, # init dict with different index idx = float_frame.index[:5] cons = SparseDataFrame( - float_frame, index=idx, columns=float_frame.columns, + float_frame, + index=idx, + columns=float_frame.columns, default_fill_value=float_frame.default_fill_value, - default_kind=float_frame.default_kind, copy=True) + default_kind=float_frame.default_kind, + copy=True, + ) reindexed = float_frame.reindex(idx) tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) @@ -132,12 +140,12 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': [2, 3], 'a': [0, 1]} + d = {"b": [2, 3], "a": [0, 1]} frame = SparseDataFrame(data=d) if compat.PY36: - expected = SparseDataFrame(data=d, columns=list('ba')) + expected = SparseDataFrame(data=d, columns=list("ba")) else: - expected = SparseDataFrame(data=d, columns=list('ab')) + expected = SparseDataFrame(data=d, columns=list("ab")) tm.assert_sp_frame_equal(frame, expected) def test_constructor_ndarray(self, float_frame): @@ -145,22 +153,22 @@ def test_constructor_ndarray(self, float_frame): sp = SparseDataFrame(float_frame.values) # 1d - sp = SparseDataFrame(float_frame['A'].values, index=float_frame.index, - columns=['A']) - tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=['A'])) + sp = SparseDataFrame( + float_frame["A"].values, index=float_frame.index, columns=["A"] + ) + tm.assert_sp_frame_equal(sp, float_frame.reindex(columns=["A"])) # raise on level argument msg = "Reindex by level not supported for sparse" with 
pytest.raises(TypeError, match=msg): - float_frame.reindex(columns=['A'], level=1) + float_frame.reindex(columns=["A"], level=1) # wrong length index / columns with pytest.raises(ValueError, match="^Index length"): SparseDataFrame(float_frame.values, index=float_frame.index[:-1]) with pytest.raises(ValueError, match="^Column length"): - SparseDataFrame(float_frame.values, - columns=float_frame.columns[:-1]) + SparseDataFrame(float_frame.values, columns=float_frame.columns[:-1]) # GH 9272 def test_constructor_empty(self): @@ -181,14 +189,14 @@ def test_constructor_convert_index_once(self): def test_constructor_from_series(self): # GH 2873 - x = Series(np.random.randn(10000), name='a') + x = Series(np.random.randn(10000), name="a") x = x.to_sparse(fill_value=0) assert isinstance(x, SparseSeries) df = SparseDataFrame(x) assert isinstance(df, SparseDataFrame) - x = Series(np.random.randn(10000), name='a') - y = Series(np.random.randn(10000), name='b') + x = Series(np.random.randn(10000), name="a") + y = Series(np.random.randn(10000), name="b") x2 = x.astype(float) x2.loc[:9998] = np.NaN # TODO: x_sparse is unused...fix @@ -206,7 +214,7 @@ def test_constructor_from_series(self): def test_constructor_from_dense_series(self): # GH 19393 # series with name - x = Series(np.random.randn(10000), name='a') + x = Series(np.random.randn(10000), name="a") result = SparseDataFrame(x) expected = x.to_frame().to_sparse() tm.assert_sp_frame_equal(result, expected) @@ -221,9 +229,14 @@ def test_constructor_from_unknown_type(self): # GH 19393 class Unknown: pass - with pytest.raises(TypeError, - match=('SparseDataFrame called with unknown type ' - '"Unknown" for data argument')): + + with pytest.raises( + TypeError, + match=( + "SparseDataFrame called with unknown type " + '"Unknown" for data argument' + ), + ): SparseDataFrame(Unknown()) def test_constructor_preserve_attr(self): @@ -232,58 +245,63 @@ def test_constructor_preserve_attr(self): assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - df = pd.SparseDataFrame({'x': arr}) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + df = pd.SparseDataFrame({"x": arr}) + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 - s = pd.SparseSeries(arr, name='x') + s = pd.SparseSeries(arr, name="x") assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 df = pd.SparseDataFrame(s) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 - df = pd.SparseDataFrame({'x': s}) - assert df['x'].dtype == SparseDtype(np.int64) - assert df['x'].fill_value == 0 + df = pd.SparseDataFrame({"x": s}) + assert df["x"].dtype == SparseDtype(np.int64) + assert df["x"].fill_value == 0 def test_constructor_nan_dataframe(self): # GH 10079 trains = np.arange(100) thresholds = [10, 20, 30, 40, 50, 60] tuples = [(i, j) for i in trains for j in thresholds] - index = pd.MultiIndex.from_tuples(tuples, - names=['trains', 'thresholds']) + index = pd.MultiIndex.from_tuples(tuples, names=["trains", "thresholds"]) matrix = np.empty((len(index), len(trains))) matrix.fill(np.nan) df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) result = df.to_sparse() - expected = pd.SparseDataFrame(matrix, index=index, columns=trains, - dtype=float) + expected = pd.SparseDataFrame(matrix, index=index, columns=trains, dtype=float) tm.assert_sp_frame_equal(result, expected) def 
test_type_coercion_at_construction(self): # GH 15682 result = pd.SparseDataFrame( - {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype='uint8', - default_fill_value=0) + {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]}, + dtype="uint8", + default_fill_value=0, + ) expected = pd.SparseDataFrame( - {'a': pd.SparseSeries([1, 0, 0], dtype='uint8'), - 'b': pd.SparseSeries([0, 1, 0], dtype='uint8'), - 'c': pd.SparseSeries([0, 0, 1], dtype='uint8')}, - default_fill_value=0) + { + "a": pd.SparseSeries([1, 0, 0], dtype="uint8"), + "b": pd.SparseSeries([0, 1, 0], dtype="uint8"), + "c": pd.SparseSeries([0, 0, 1], dtype="uint8"), + }, + default_fill_value=0, + ) tm.assert_sp_frame_equal(result, expected) def test_default_dtype(self): - result = pd.SparseDataFrame(columns=list('ab'), index=range(2)) - expected = pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], - columns=list('ab'), index=range(2)) + result = pd.SparseDataFrame(columns=list("ab"), index=range(2)) + expected = pd.SparseDataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) + ) tm.assert_sp_frame_equal(result, expected) def test_nan_data_with_int_dtype_raises_error(self): - sdf = pd.SparseDataFrame([[np.nan, np.nan], [np.nan, np.nan]], - columns=list('ab'), index=range(2)) + sdf = pd.SparseDataFrame( + [[np.nan, np.nan], [np.nan, np.nan]], columns=list("ab"), index=range(2) + ) msg = "Cannot convert non-finite values" with pytest.raises(ValueError, match=msg): pd.SparseDataFrame(sdf, dtype=np.int64) @@ -293,11 +311,12 @@ def test_dtypes(self): df.loc[:9998] = np.nan sdf = df.to_sparse() result = sdf.dtypes - expected = Series(['Sparse[float64, nan]'] * 4) + expected = Series(["Sparse[float64, nan]"] * 4) tm.assert_series_equal(result, expected) - def test_shape(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + def test_shape( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): # see gh-10452 assert float_frame.shape == (10, 4) assert float_frame_int_kind.shape == (10, 4) @@ -316,10 +335,16 @@ def test_array_interface(self, float_frame): dres = np.sqrt(float_frame.to_dense()) tm.assert_frame_equal(res.to_dense(), dres) - def test_pickle(self, float_frame, float_frame_int_kind, float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_pickle( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _test_roundtrip(frame, orig): result = tm.round_trip_pickle(frame) tm.assert_sp_frame_equal(frame, result) @@ -332,19 +357,17 @@ def _test_roundtrip(frame, orig): _test_roundtrip(float_frame_fill2, float_frame_fill2_dense) def test_dense_to_sparse(self): - df = DataFrame({'A': [nan, nan, nan, 1, 2], - 'B': [1, 2, nan, nan, nan]}) + df = DataFrame({"A": [nan, nan, nan, 1, 2], "B": [1, 2, nan, nan, nan]}) sdf = df.to_sparse() assert isinstance(sdf, SparseDataFrame) assert np.isnan(sdf.default_fill_value) - assert isinstance(sdf['A'].sp_index, BlockIndex) + assert isinstance(sdf["A"].sp_index, BlockIndex) tm.assert_frame_equal(sdf.to_dense(), df) - sdf = df.to_sparse(kind='integer') - assert isinstance(sdf['A'].sp_index, IntIndex) + sdf = df.to_sparse(kind="integer") + assert isinstance(sdf["A"].sp_index, IntIndex) - df = DataFrame({'A': [0, 0, 0, 1, 2], - 'B': [1, 2, 0, 0, 0]}, dtype=float) + df = DataFrame({"A": [0, 0, 0, 1, 2], "B": [1, 2, 0, 0, 0]}, 
dtype=float) sdf = df.to_sparse(fill_value=0) assert sdf.default_fill_value == 0 tm.assert_frame_equal(sdf.to_dense(), df) @@ -356,8 +379,7 @@ def test_deprecated_dense_to_sparse(self): df = pd.DataFrame({"A": [1, np.nan, 3]}) sparse_df = pd.SparseDataFrame({"A": [1, np.nan, 3]}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = df.to_sparse() tm.assert_frame_equal(result, sparse_df) @@ -365,10 +387,14 @@ def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) assert df.density == 0.7 - df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), - 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) + df = SparseDataFrame( + { + "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C": np.arange(10), + "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } + ) assert df.density == 0.75 @@ -388,41 +414,50 @@ def test_sparse_series_ops_fill(self, float_frame_fill2): self._check_frame_ops(float_frame_fill2) def _check_frame_ops(self, frame): - def _compare_to_dense(a, b, da, db, op): sparse_result = op(a, b) dense_result = op(da, db) fill = sparse_result.default_fill_value dense_result = dense_result.to_sparse(fill_value=fill) - tm.assert_sp_frame_equal(sparse_result, dense_result, - exact_indices=False) + tm.assert_sp_frame_equal(sparse_result, dense_result, exact_indices=False) if isinstance(a, DataFrame) and isinstance(db, DataFrame): mixed_result = op(a, db) assert isinstance(mixed_result, SparseDataFrame) - tm.assert_sp_frame_equal(mixed_result, sparse_result, - exact_indices=False) + tm.assert_sp_frame_equal( + mixed_result, sparse_result, exact_indices=False + ) - opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + opnames = ["add", "sub", "mul", "truediv", "floordiv"] ops = [getattr(operator, name) for name in opnames] fidx = frame.index # time series operations - series = [frame['A'], frame['B'], frame['C'], frame['D'], - frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]), - SparseSeries( - [], index=[])] + series = [ + frame["A"], + frame["B"], + frame["C"], + frame["D"], + frame["A"].reindex(fidx[:7]), + frame["A"].reindex(fidx[::2]), + SparseSeries([], index=[]), + ] for op in opnames: - _compare_to_dense(frame, frame[::2], frame.to_dense(), - frame[::2].to_dense(), getattr(operator, op)) + _compare_to_dense( + frame, + frame[::2], + frame.to_dense(), + frame[::2].to_dense(), + getattr(operator, op), + ) # 2304, no auto-broadcasting for i, s in enumerate(series): - f = lambda a, b: getattr(a, op)(b, axis='index') + f = lambda a, b: getattr(a, op)(b, axis="index") _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f) # rops are not implemented @@ -430,8 +465,13 @@ def _compare_to_dense(a, b, da, db, op): # frame.to_dense(), f) # cross-sectional operations - series = [frame.xs(fidx[0]), frame.xs(fidx[3]), frame.xs(fidx[5]), - frame.xs(fidx[7]), frame.xs(fidx[5])[:2]] + series = [ + frame.xs(fidx[0]), + frame.xs(fidx[3]), + frame.xs(fidx[5]), + frame.xs(fidx[7]), + frame.xs(fidx[5])[:2], + ] for op in ops: for s in series: @@ -439,7 +479,7 @@ def _compare_to_dense(a, b, da, db, op): _compare_to_dense(s, frame, s, frame.to_dense(), op) # it works! 
- result = frame + frame.loc[:, ['A', 'B']] # noqa + result = frame + frame.loc[:, ["A", "B"]] # noqa def test_op_corners(self, float_frame, empty_frame): empty = empty_frame + empty_frame @@ -457,58 +497,53 @@ def test_scalar_ops(self): def test_getitem(self): # 1585 select multiple columns - sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c']) + sdf = SparseDataFrame(index=[0, 1, 2], columns=["a", "b", "c"]) - result = sdf[['a', 'b']] - exp = sdf.reindex(columns=['a', 'b']) + result = sdf[["a", "b"]] + exp = sdf.reindex(columns=["a", "b"]) tm.assert_sp_frame_equal(result, exp) with pytest.raises(KeyError, match=r"\['d'\] not in index"): - sdf[['a', 'd']] + sdf[["a", "d"]] def test_iloc(self, float_frame): # GH 2227 result = float_frame.iloc[:, 0] assert isinstance(result, SparseSeries) - tm.assert_sp_series_equal(result, float_frame['A']) + tm.assert_sp_series_equal(result, float_frame["A"]) # preserve sparse index type. #2251 - data = {'A': [0, 1]} - iframe = SparseDataFrame(data, default_kind='integer') - tm.assert_class_equal(iframe['A'].sp_index, - iframe.iloc[:, 0].sp_index) + data = {"A": [0, 1]} + iframe = SparseDataFrame(data, default_kind="integer") + tm.assert_class_equal(iframe["A"].sp_index, iframe.iloc[:, 0].sp_index) def test_set_value(self, float_frame): # ok, as the index gets converted to object frame = float_frame.copy() - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = frame.set_value('foobar', 'B', 1.5) - assert res.index.dtype == 'object' + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = frame.set_value("foobar", "B", 1.5) + assert res.index.dtype == "object" res = float_frame res.index = res.index.astype(object) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = float_frame.set_value('foobar', 'B', 1.5) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res = float_frame.set_value("foobar", "B", 1.5) assert res is not float_frame - assert res.index[-1] == 'foobar' - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res.get_value('foobar', 'B') == 1.5 - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res2 = res.set_value('foobar', 'qux', 1.5) + assert res.index[-1] == "foobar" + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res.get_value("foobar", "B") == 1.5 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + res2 = res.set_value("foobar", "qux", 1.5) assert res2 is not res - tm.assert_index_equal(res2.columns, - pd.Index(list(float_frame.columns) + ['qux'])) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert res2.get_value('foobar', 'qux') == 1.5 + tm.assert_index_equal( + res2.columns, pd.Index(list(float_frame.columns) + ["qux"]) + ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert res2.get_value("foobar", "qux") == 1.5 def test_fancy_index_misc(self, float_frame): # axis = 0 @@ -524,8 +559,7 @@ def test_fancy_index_misc(self, float_frame): def test_getitem_overload(self, float_frame): # slicing sl = float_frame[:20] - tm.assert_sp_frame_equal(sl, - float_frame.reindex(float_frame.index[:20])) + tm.assert_sp_frame_equal(sl, float_frame.reindex(float_frame.index[:20])) # boolean indexing d = float_frame.index[5] @@ -539,136 +573,146 @@ def test_getitem_overload(self, float_frame): with pytest.raises(ValueError, match=msg): 
float_frame[indexer[:-1]] - def test_setitem(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_setitem( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check_frame(frame, orig): N = len(frame) # insert SparseSeries - frame['E'] = frame['A'] - assert isinstance(frame['E'], SparseSeries) - tm.assert_sp_series_equal(frame['E'], frame['A'], - check_names=False) + frame["E"] = frame["A"] + assert isinstance(frame["E"], SparseSeries) + tm.assert_sp_series_equal(frame["E"], frame["A"], check_names=False) # insert SparseSeries differently-indexed - to_insert = frame['A'][::2] - frame['E'] = to_insert + to_insert = frame["A"][::2] + frame["E"] = to_insert expected = to_insert.to_dense().reindex(frame.index) - result = frame['E'].to_dense() + result = frame["E"].to_dense() tm.assert_series_equal(result, expected, check_names=False) - assert result.name == 'E' + assert result.name == "E" # insert Series - frame['F'] = frame['A'].to_dense() - assert isinstance(frame['F'], SparseSeries) - tm.assert_sp_series_equal(frame['F'], frame['A'], - check_names=False) + frame["F"] = frame["A"].to_dense() + assert isinstance(frame["F"], SparseSeries) + tm.assert_sp_series_equal(frame["F"], frame["A"], check_names=False) # insert Series differently-indexed - to_insert = frame['A'].to_dense()[::2] - frame['G'] = to_insert + to_insert = frame["A"].to_dense()[::2] + frame["G"] = to_insert expected = to_insert.reindex(frame.index) - expected.name = 'G' - tm.assert_series_equal(frame['G'].to_dense(), expected) + expected.name = "G" + tm.assert_series_equal(frame["G"].to_dense(), expected) # insert ndarray - frame['H'] = np.random.randn(N) - assert isinstance(frame['H'], SparseSeries) + frame["H"] = np.random.randn(N) + assert isinstance(frame["H"], SparseSeries) to_sparsify = np.random.randn(N) - to_sparsify[N // 2:] = frame.default_fill_value - frame['I'] = to_sparsify - assert len(frame['I'].sp_values) == N // 2 + to_sparsify[N // 2 :] = frame.default_fill_value + frame["I"] = to_sparsify + assert len(frame["I"].sp_values) == N // 2 # insert ndarray wrong size # GH 25484 - msg = 'Length of values does not match length of index' + msg = "Length of values does not match length of index" with pytest.raises(ValueError, match=msg): - frame['foo'] = np.random.randn(N - 1) + frame["foo"] = np.random.randn(N - 1) # scalar value - frame['J'] = 5 - assert len(frame['J'].sp_values) == N - assert (frame['J'].sp_values == 5).all() + frame["J"] = 5 + assert len(frame["J"].sp_values) == N + assert (frame["J"].sp_values == 5).all() - frame['K'] = frame.default_fill_value - assert len(frame['K'].sp_values) == 0 + frame["K"] = frame.default_fill_value + assert len(frame["K"].sp_values) == 0 _check_frame(float_frame, float_frame_dense) _check_frame(float_frame_int_kind, float_frame_dense) _check_frame(float_frame_fill0, float_frame_fill0_dense) _check_frame(float_frame_fill2, float_frame_fill2_dense) - @pytest.mark.parametrize('values', [ - [True, False], - [0, 1], - [1, None], - ['a', 'b'], - [pd.Timestamp('2017'), pd.NaT], - [pd.Timedelta('10s'), pd.NaT], - ]) + @pytest.mark.parametrize( + "values", + [ + [True, False], + [0, 1], + [1, None], + ["a", "b"], + [pd.Timestamp("2017"), pd.NaT], + [pd.Timedelta("10s"), pd.NaT], + ], + ) def test_setitem_more(self, values): df = pd.DataFrame({"A": 
values}) - df['A'] = pd.SparseArray(values) - expected = pd.DataFrame({'A': pd.SparseArray(values)}) + df["A"] = pd.SparseArray(values) + expected = pd.DataFrame({"A": pd.SparseArray(values)}) tm.assert_frame_equal(df, expected) def test_setitem_corner(self, float_frame): - float_frame['a'] = float_frame['B'] - tm.assert_sp_series_equal(float_frame['a'], float_frame['B'], - check_names=False) + float_frame["a"] = float_frame["B"] + tm.assert_sp_series_equal(float_frame["a"], float_frame["B"], check_names=False) def test_setitem_array(self, float_frame): - arr = float_frame['B'] + arr = float_frame["B"] - float_frame['E'] = arr - tm.assert_sp_series_equal(float_frame['E'], float_frame['B'], - check_names=False) + float_frame["E"] = arr + tm.assert_sp_series_equal(float_frame["E"], float_frame["B"], check_names=False) - float_frame['F'] = arr[:-1] + float_frame["F"] = arr[:-1] index = float_frame.index[:-1] - tm.assert_sp_series_equal(float_frame['E'].reindex(index), - float_frame['F'].reindex(index), - check_names=False) + tm.assert_sp_series_equal( + float_frame["E"].reindex(index), + float_frame["F"].reindex(index), + check_names=False, + ) def test_setitem_chained_no_consolidate(self): # https://github.com/pandas-dev/pandas/pull/19268 # issuecomment-361696418 # chained setitem used to cause consolidation sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) - with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): sdf[0][1] = 2 assert len(sdf._data.blocks) == 2 def test_delitem(self, float_frame): - A = float_frame['A'] - C = float_frame['C'] + A = float_frame["A"] + C = float_frame["C"] - del float_frame['B'] - assert 'B' not in float_frame - tm.assert_sp_series_equal(float_frame['A'], A) - tm.assert_sp_series_equal(float_frame['C'], C) + del float_frame["B"] + assert "B" not in float_frame + tm.assert_sp_series_equal(float_frame["A"], A) + tm.assert_sp_series_equal(float_frame["C"], C) - del float_frame['D'] - assert 'D' not in float_frame + del float_frame["D"] + assert "D" not in float_frame - del float_frame['A'] - assert 'A' not in float_frame + del float_frame["A"] + assert "A" not in float_frame def test_set_columns(self, float_frame): float_frame.columns = float_frame.columns - msg = ("Length mismatch: Expected axis has 4 elements, new values have" - " 3 elements") + msg = ( + "Length mismatch: Expected axis has 4 elements, new values have" + " 3 elements" + ) with pytest.raises(ValueError, match=msg): float_frame.columns = float_frame.columns[:-1] def test_set_index(self, float_frame): float_frame.index = float_frame.index - msg = ("Length mismatch: Expected axis has 10 elements, new values" - " have 9 elements") + msg = ( + "Length mismatch: Expected axis has 10 elements, new values" + " have 9 elements" + ) with pytest.raises(ValueError, match=msg): float_frame.index = float_frame.index[:-1] @@ -687,22 +731,28 @@ def test_append(self, float_frame): a = float_frame.iloc[:5, :3] b = float_frame.iloc[5:] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): # Stacklevel is set for pd.concat, not append appended = a.append(b) - tm.assert_sp_frame_equal(appended.iloc[:, :3], float_frame.iloc[:, :3], - exact_indices=False) + tm.assert_sp_frame_equal( + appended.iloc[:, :3], float_frame.iloc[:, :3], exact_indices=False + ) - a = a[['B', 'C', 'A']].head(2) + a = 
a[["B", "C", "A"]].head(2) b = b.head(2) - expected = pd.SparseDataFrame({ - "B": [0., 1, None, 3], - "C": [0., 1, 5, 6], - "A": [None, None, 2, 3], - "D": [None, None, 5, None], - }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) + expected = pd.SparseDataFrame( + { + "B": [0.0, 1, None, 3], + "C": [0.0, 1, 5, 6], + "A": [None, None, 2, 3], + "D": [None, None, 5, None], + }, + index=a.index | b.index, + columns=["B", "C", "A", "D"], + ) with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=False) @@ -711,77 +761,97 @@ def test_append(self, float_frame): with tm.assert_produces_warning(None, raise_on_extra_warnings=False): appended = a.append(b, sort=True) - tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']], - consolidate_block_indices=True, - check_kind=False) + tm.assert_sp_frame_equal( + appended, + expected[["A", "B", "C", "D"]], + consolidate_block_indices=True, + check_kind=False, + ) def test_astype(self): - sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], - dtype=np.int64), - 'B': SparseArray([4, 5, 6, 7], - dtype=np.int64)}) - assert sparse['A'].dtype == SparseDtype(np.int64) - assert sparse['B'].dtype == SparseDtype(np.int64) + sparse = pd.SparseDataFrame( + { + "A": SparseArray([1, 2, 3, 4], dtype=np.int64), + "B": SparseArray([4, 5, 6, 7], dtype=np.int64), + } + ) + assert sparse["A"].dtype == SparseDtype(np.int64) + assert sparse["B"].dtype == SparseDtype(np.int64) # retain fill_value res = sparse.astype(np.float64) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], - fill_value=0, - kind='integer'), - 'B': SparseArray([4., 5., 6., 7.], - fill_value=0, - kind='integer')}, - default_fill_value=np.nan) + exp = pd.SparseDataFrame( + { + "A": SparseArray([1.0, 2.0, 3.0, 4.0], fill_value=0, kind="integer"), + "B": SparseArray([4.0, 5.0, 6.0, 7.0], fill_value=0, kind="integer"), + }, + default_fill_value=np.nan, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.float64, 0) - assert res['B'].dtype == SparseDtype(np.float64, 0) + assert res["A"].dtype == SparseDtype(np.float64, 0) + assert res["B"].dtype == SparseDtype(np.float64, 0) # update fill_value res = sparse.astype(SparseDtype(np.float64, np.nan)) - exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], - fill_value=np.nan, - kind='integer'), - 'B': SparseArray([4., 5., 6., 7.], - fill_value=np.nan, - kind='integer')}, - default_fill_value=np.nan) + exp = pd.SparseDataFrame( + { + "A": SparseArray( + [1.0, 2.0, 3.0, 4.0], fill_value=np.nan, kind="integer" + ), + "B": SparseArray( + [4.0, 5.0, 6.0, 7.0], fill_value=np.nan, kind="integer" + ), + }, + default_fill_value=np.nan, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.float64, np.nan) - assert res['B'].dtype == SparseDtype(np.float64, np.nan) + assert res["A"].dtype == SparseDtype(np.float64, np.nan) + assert res["B"].dtype == SparseDtype(np.float64, np.nan) def test_astype_bool(self): - sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], - fill_value=0, - dtype=np.int64), - 'B': SparseArray([0, 5, 0, 7], - fill_value=0, - dtype=np.int64)}, - default_fill_value=0) - assert sparse['A'].dtype == SparseDtype(np.int64) - assert sparse['B'].dtype == SparseDtype(np.int64) + sparse = pd.SparseDataFrame( + { + "A": SparseArray([0, 2, 0, 4], fill_value=0, dtype=np.int64), + "B": SparseArray([0, 5, 0, 7], fill_value=0, dtype=np.int64), + }, + default_fill_value=0, + ) + assert sparse["A"].dtype == 
SparseDtype(np.int64) + assert sparse["B"].dtype == SparseDtype(np.int64) res = sparse.astype(SparseDtype(bool, False)) - exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], - dtype=np.bool, - fill_value=False, - kind='integer'), - 'B': SparseArray([False, True, False, True], - dtype=np.bool, - fill_value=False, - kind='integer')}, - default_fill_value=False) + exp = pd.SparseDataFrame( + { + "A": SparseArray( + [False, True, False, True], + dtype=np.bool, + fill_value=False, + kind="integer", + ), + "B": SparseArray( + [False, True, False, True], + dtype=np.bool, + fill_value=False, + kind="integer", + ), + }, + default_fill_value=False, + ) tm.assert_sp_frame_equal(res, exp) - assert res['A'].dtype == SparseDtype(np.bool) - assert res['B'].dtype == SparseDtype(np.bool) + assert res["A"].dtype == SparseDtype(np.bool) + assert res["B"].dtype == SparseDtype(np.bool) def test_astype_object(self): # This may change in GH-23125 - df = pd.DataFrame({"A": SparseArray([0, 1]), - "B": SparseArray([0, 1])}) + df = pd.DataFrame({"A": SparseArray([0, 1]), "B": SparseArray([0, 1])}) result = df.astype(object) dtype = SparseDtype(object, 0) - expected = pd.DataFrame({"A": SparseArray([0, 1], dtype=dtype), - "B": SparseArray([0, 1], dtype=dtype)}) + expected = pd.DataFrame( + { + "A": SparseArray([0, 1], dtype=dtype), + "B": SparseArray([0, 1], dtype=dtype), + } + ) tm.assert_frame_equal(result, expected) def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): @@ -790,58 +860,64 @@ def test_fillna(self, float_frame_fill0, float_frame_fill0_dense): result = df.fillna(0) expected = dense.fillna(0) - tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), - exact_indices=False) + tm.assert_sp_frame_equal( + result, expected.to_sparse(fill_value=0), exact_indices=False + ) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() result.fillna(0, inplace=True) expected = dense.fillna(0) - tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), - exact_indices=False) + tm.assert_sp_frame_equal( + result, expected.to_sparse(fill_value=0), exact_indices=False + ) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() - result = df['A'] + result = df["A"] result.fillna(0, inplace=True) - expected = dense['A'].fillna(0) + expected = dense["A"].fillna(0) # this changes internal SparseArray repr # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0)) tm.assert_series_equal(result.to_dense(), expected) def test_fillna_fill_value(self): - df = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]}) + df = pd.DataFrame({"A": [1, 0, 0], "B": [np.nan, np.nan, 4]}) sparse = pd.SparseDataFrame(df) - tm.assert_frame_equal(sparse.fillna(-1).to_dense(), - df.fillna(-1), check_dtype=False) + tm.assert_frame_equal( + sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False + ) sparse = pd.SparseDataFrame(df, default_fill_value=0) - tm.assert_frame_equal(sparse.fillna(-1).to_dense(), - df.fillna(-1), check_dtype=False) + tm.assert_frame_equal( + sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False + ) def test_sparse_frame_pad_backfill_limit(self): index = np.arange(10) df = DataFrame(np.random.randn(10, 4), index=index) sdf = df.to_sparse() - result = sdf[:2].reindex(index, method='pad', limit=5) + result = sdf[:2].reindex(index, method="pad", limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[:2].reindex(index).fillna(method='pad') + with 
tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) - result = sdf[-2:].reindex(index, method='backfill', limit=5) + result = sdf[-2:].reindex(index, method="backfill", limit=5) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[-2:].reindex(index).fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() @@ -853,26 +929,30 @@ def test_sparse_frame_fillna_limit(self): sdf = df.to_sparse() result = sdf[:2].reindex(index) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = result.fillna(method='pad', limit=5) - - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[:2].reindex(index).fillna(method='pad') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = result.fillna(method="pad", limit=5) + + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[:2].reindex(index).fillna(method="pad") expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) result = sdf[-2:].reindex(index) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - result = result.fillna(method='backfill', limit=5) - - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): - expected = sdf[-2:].reindex(index).fillna(method='backfill') + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + result = result.fillna(method="backfill", limit=5) + + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): + expected = sdf[-2:].reindex(index).fillna(method="backfill") expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() @@ -880,17 +960,20 @@ def test_sparse_frame_fillna_limit(self): def test_rename(self, float_frame): result = float_frame.rename(index=str) - expected = SparseDataFrame(float_frame.values, - index=float_frame.index.strftime( - "%Y-%m-%d %H:%M:%S"), - columns=list('ABCD')) + expected = SparseDataFrame( + float_frame.values, + index=float_frame.index.strftime("%Y-%m-%d %H:%M:%S"), + columns=list("ABCD"), + ) tm.assert_sp_frame_equal(result, expected) - result = float_frame.rename(columns=lambda x: '%s%d' % (x, 1)) - data = {'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C1': np.arange(10, dtype=np.float64), - 'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + result = float_frame.rename(columns=lambda x: "%s%d" % (x, 1)) + data = { + "A1": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + "B1": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + "C1": np.arange(10, dtype=np.float64), + "D1": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan], + } expected = SparseDataFrame(data, index=float_frame.index) tm.assert_sp_frame_equal(result, expected) @@ -900,30 +983,33 @@ def test_corr(self, float_frame): tm.assert_frame_equal(res, float_frame.to_dense().corr().to_sparse()) def 
test_describe(self, float_frame): - float_frame['foo'] = np.nan + float_frame["foo"] = np.nan float_frame.dtypes.value_counts() str(float_frame) desc = float_frame.describe() # noqa def test_join(self, float_frame): - left = float_frame.loc[:, ['A', 'B']] - right = float_frame.loc[:, ['C', 'D']] + left = float_frame.loc[:, ["A", "B"]] + right = float_frame.loc[:, ["C", "D"]] joined = left.join(right) tm.assert_sp_frame_equal(joined, float_frame, exact_indices=False) - right = float_frame.loc[:, ['B', 'D']] - msg = (r"columns overlap but no suffix specified: Index\(\['B'\]," - r" dtype='object'\)") + right = float_frame.loc[:, ["B", "D"]] + msg = ( + r"columns overlap but no suffix specified: Index\(\['B'\]," + r" dtype='object'\)" + ) with pytest.raises(ValueError, match=msg): left.join(right) - with pytest.raises(ValueError, match='Other Series must have a name'): - float_frame.join(Series( - np.random.randn(len(float_frame)), index=float_frame.index)) - - def test_reindex(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + with pytest.raises(ValueError, match="Other Series must have a name"): + float_frame.join( + Series(np.random.randn(len(float_frame)), index=float_frame.index) + ) + def test_reindex( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): def _check_frame(frame): index = frame.index sidx = index[::2] @@ -933,37 +1019,35 @@ def _check_frame(frame): dense_result = frame.to_dense().reindex(sidx) tm.assert_frame_equal(sparse_result.to_dense(), dense_result) - tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), - dense_result) + tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), dense_result) sparse_result2 = sparse_result.reindex(index) dense_result2 = dense_result.reindex(index) tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2) # propagate CORRECT fill value - tm.assert_almost_equal(sparse_result.default_fill_value, - frame.default_fill_value) - tm.assert_almost_equal(sparse_result['A'].fill_value, - frame['A'].fill_value) + tm.assert_almost_equal( + sparse_result.default_fill_value, frame.default_fill_value + ) + tm.assert_almost_equal(sparse_result["A"].fill_value, frame["A"].fill_value) # length zero length_zero = frame.reindex([]) assert len(length_zero) == 0 assert len(length_zero.columns) == len(frame.columns) - assert len(length_zero['A']) == 0 + assert len(length_zero["A"]) == 0 # frame being reindexed has length zero length_n = length_zero.reindex(index) assert len(length_n) == len(frame) assert len(length_n.columns) == len(frame.columns) - assert len(length_n['A']) == len(frame) + assert len(length_n["A"]) == len(frame) # reindex columns - reindexed = frame.reindex(columns=['A', 'B', 'Z']) + reindexed = frame.reindex(columns=["A", "B", "Z"]) assert len(reindexed.columns) == 3 - tm.assert_almost_equal(reindexed['Z'].fill_value, - frame.default_fill_value) - assert np.isnan(reindexed['Z'].sp_values).all() + tm.assert_almost_equal(reindexed["Z"].fill_value, frame.default_fill_value) + assert np.isnan(reindexed["Z"].sp_values).all() _check_frame(float_frame) _check_frame(float_frame_int_kind) @@ -972,16 +1056,15 @@ def _check_frame(frame): # with copy=False reindexed = float_frame.reindex(float_frame.index, copy=False) - reindexed['F'] = reindexed['A'] - assert 'F' in float_frame + reindexed["F"] = reindexed["A"] + assert "F" in float_frame reindexed = float_frame.reindex(float_frame.index) - reindexed['G'] = reindexed['A'] - assert 'G' not in float_frame + reindexed["G"] = 
reindexed["A"] + assert "G" not in float_frame - def test_reindex_fill_value(self, float_frame_fill0, - float_frame_fill0_dense): - rng = bdate_range('20110110', periods=20) + def test_reindex_fill_value(self, float_frame_fill0, float_frame_fill0_dense): + rng = bdate_range("20110110", periods=20) result = float_frame_fill0.reindex(rng, fill_value=0) exp = float_frame_fill0_dense.reindex(rng, fill_value=0) @@ -990,83 +1073,105 @@ def test_reindex_fill_value(self, float_frame_fill0, def test_reindex_method(self): - sparse = SparseDataFrame(data=[[11., 12., 14.], - [21., 22., 24.], - [41., 42., 44.]], - index=[1, 2, 4], - columns=[1, 2, 4], - dtype=float) + sparse = SparseDataFrame( + data=[[11.0, 12.0, 14.0], [21.0, 22.0, 24.0], [41.0, 42.0, 44.0]], + index=[1, 2, 4], + columns=[1, 2, 4], + dtype=float, + ) # Over indices # default method result = sparse.reindex(index=range(6)) - expected = SparseDataFrame(data=[[nan, nan, nan], - [11., 12., 14.], - [21., 22., 24.], - [nan, nan, nan], - [41., 42., 44.], - [nan, nan, nan]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + expected = SparseDataFrame( + data=[ + [nan, nan, nan], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [nan, nan, nan], + [41.0, 42.0, 44.0], + [nan, nan, nan], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='bfill' - result = sparse.reindex(index=range(6), method='bfill') - expected = SparseDataFrame(data=[[11., 12., 14.], - [11., 12., 14.], - [21., 22., 24.], - [41., 42., 44.], - [41., 42., 44.], - [nan, nan, nan]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + result = sparse.reindex(index=range(6), method="bfill") + expected = SparseDataFrame( + data=[ + [11.0, 12.0, 14.0], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [41.0, 42.0, 44.0], + [41.0, 42.0, 44.0], + [nan, nan, nan], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='ffill' - result = sparse.reindex(index=range(6), method='ffill') - expected = SparseDataFrame(data=[[nan, nan, nan], - [11., 12., 14.], - [21., 22., 24.], - [21., 22., 24.], - [41., 42., 44.], - [41., 42., 44.]], - index=range(6), - columns=[1, 2, 4], - dtype=float) + result = sparse.reindex(index=range(6), method="ffill") + expected = SparseDataFrame( + data=[ + [nan, nan, nan], + [11.0, 12.0, 14.0], + [21.0, 22.0, 24.0], + [21.0, 22.0, 24.0], + [41.0, 42.0, 44.0], + [41.0, 42.0, 44.0], + ], + index=range(6), + columns=[1, 2, 4], + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # Over columns # default method result = sparse.reindex(columns=range(6)) - expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan], - [nan, 21., 22., nan, 24., nan], - [nan, 41., 42., nan, 44., nan]], - index=[1, 2, 4], - columns=range(6), - dtype=float) + expected = SparseDataFrame( + data=[ + [nan, 11.0, 12.0, nan, 14.0, nan], + [nan, 21.0, 22.0, nan, 24.0, nan], + [nan, 41.0, 42.0, nan, 44.0, nan], + ], + index=[1, 2, 4], + columns=range(6), + dtype=float, + ) tm.assert_sp_frame_equal(result, expected) # method='bfill' with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method='bfill') + sparse.reindex(columns=range(6), method="bfill") # method='ffill' with pytest.raises(NotImplementedError): - sparse.reindex(columns=range(6), method='ffill') + sparse.reindex(columns=range(6), method="ffill") def test_take(self, float_frame): result = float_frame.take([1, 0, 2], axis=1) - expected = float_frame.reindex(columns=['B', 'A', 'C']) 
+ expected = float_frame.reindex(columns=["B", "A", "C"]) tm.assert_sp_frame_equal(result, expected) - def test_to_dense(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): + def test_to_dense( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): dense_dm = frame.to_dense() # Sparse[float] != float @@ -1078,8 +1183,9 @@ def _check(frame, orig): _check(float_frame_fill0, float_frame_fill0_dense) _check(float_frame_fill2, float_frame_fill2_dense) - def test_stack_sparse_frame(self, float_frame, float_frame_int_kind, - float_frame_fill0, float_frame_fill2): + def test_stack_sparse_frame( + self, float_frame, float_frame_int_kind, float_frame_fill0, float_frame_fill2 + ): def _check(frame): dense_frame = frame.to_dense() # noqa @@ -1087,8 +1193,7 @@ def _check(frame): from_sparse_lp = spf.stack_sparse_frame(frame) - tm.assert_numpy_array_equal(from_dense_lp.values, - from_sparse_lp.values) + tm.assert_numpy_array_equal(from_dense_lp.values, from_sparse_lp.values) _check(float_frame) _check(float_frame_int_kind) @@ -1100,11 +1205,16 @@ def _check(frame): with pytest.raises(TypeError, match=msg): _check(float_frame_fill2) - def test_transpose(self, float_frame, float_frame_int_kind, - float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_transpose( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): transposed = frame.T untransposed = transposed.T @@ -1119,10 +1229,16 @@ def _check(frame, orig): _check(float_frame_fill0, float_frame_fill0_dense) _check(float_frame_fill2, float_frame_fill2_dense) - def test_shift(self, float_frame, float_frame_int_kind, float_frame_dense, - float_frame_fill0, float_frame_fill0_dense, - float_frame_fill2, float_frame_fill2_dense): - + def test_shift( + self, + float_frame, + float_frame_int_kind, + float_frame_dense, + float_frame_fill0, + float_frame_fill0_dense, + float_frame_fill2, + float_frame_fill2_dense, + ): def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) @@ -1136,16 +1252,14 @@ def _check(frame, orig): exp = orig.shift(-2) tm.assert_frame_equal(shifted.to_dense(), exp) - shifted = frame.shift(2, freq='B') - exp = orig.shift(2, freq='B') - exp = exp.to_sparse(frame.default_fill_value, - kind=frame.default_kind) + shifted = frame.shift(2, freq="B") + exp = orig.shift(2, freq="B") + exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) shifted = frame.shift(2, freq=BDay()) exp = orig.shift(2, freq=BDay()) - exp = exp.to_sparse(frame.default_fill_value, - kind=frame.default_kind) + exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) _check(float_frame, float_frame_dense) @@ -1172,7 +1286,7 @@ def test_count(self, float_frame): tm.assert_series_equal(result, dense_result, check_dtype=False) def test_numpy_transpose(self): - sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a']) + sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=["a"]) result = np.transpose(np.transpose(sdf)) tm.assert_sp_frame_equal(result, sdf) @@ -1206,47 +1320,46 @@ def 
test_combine_first_with_dense(self): def test_combine_add(self, float_frame): df = float_frame.to_dense() df2 = df.copy() - df2['C'][:3] = np.nan - df['A'][:3] = 5.7 + df2["C"][:3] = np.nan + df["A"][:3] = 5.7 result = df.to_sparse().add(df2.to_sparse(), fill_value=0) expected = df.add(df2, fill_value=0).to_sparse() tm.assert_sp_frame_equal(result, expected) def test_isin(self): - sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.) - xp = sparse_df[sparse_df.flag == 1.] - rs = sparse_df[sparse_df.flag.isin([1.])] + sparse_df = DataFrame({"flag": [1.0, 0.0, 1.0]}).to_sparse(fill_value=0.0) + xp = sparse_df[sparse_df.flag == 1.0] + rs = sparse_df[sparse_df.flag.isin([1.0])] tm.assert_frame_equal(xp, rs) def test_sparse_pow_issue(self): # 2220 - df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) + df = SparseDataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]}) # note : no error without nan - df = SparseDataFrame({'A': [nan, 0, 1]}) + df = SparseDataFrame({"A": [nan, 0, 1]}) # note that 2 ** df works fine, also df ** 1 result = 1 ** df - r1 = result.take([0], 1)['A'] - r2 = result['A'] + r1 = result.take([0], 1)["A"] + r2 = result["A"] assert len(r2.sp_values) == len(r1.sp_values) def test_as_blocks(self): - df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, - dtype='float64') + df = SparseDataFrame({"A": [1.1, 3.3], "B": [nan, -3.9]}, dtype="float64") # deprecated 0.21.0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df_blocks = df.blocks - assert list(df_blocks.keys()) == ['Sparse[float64, nan]'] - tm.assert_frame_equal(df_blocks['Sparse[float64, nan]'], df) + assert list(df_blocks.keys()) == ["Sparse[float64, nan]"] + tm.assert_frame_equal(df_blocks["Sparse[float64, nan]"], df) - @pytest.mark.xfail(reason='nan column names in _init_dict problematic ' - '(GH#16894)') + @pytest.mark.xfail( + reason="nan column names in _init_dict problematic " "(GH#16894)" + ) def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) @@ -1255,76 +1368,105 @@ def test_nan_columnname(self): def test_isna(self): # GH 8276 - df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], - 'B': [0, np.nan, np.nan, 2, np.nan]}) + df = pd.SparseDataFrame( + {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} + ) res = df.isna() - exp = pd.SparseDataFrame({'A': [True, True, False, False, True], - 'B': [False, True, True, False, True]}, - default_fill_value=True) + exp = pd.SparseDataFrame( + { + "A": [True, True, False, False, True], + "B": [False, True, True, False, True], + }, + default_fill_value=True, + ) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], - 'B': [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.) 
+ df = pd.SparseDataFrame( + {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.0, + ) res = df.isna() assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame({'A': [False, False, False, False, True], - 'B': [False, True, False, False, True]}) + exp = pd.DataFrame( + { + "A": [False, False, False, False, True], + "B": [False, True, False, False, True], + } + ) tm.assert_frame_equal(res.to_dense(), exp) def test_notna(self): # GH 8276 - df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], - 'B': [0, np.nan, np.nan, 2, np.nan]}) + df = pd.SparseDataFrame( + {"A": [np.nan, np.nan, 1, 2, np.nan], "B": [0, np.nan, np.nan, 2, np.nan]} + ) res = df.notna() - exp = pd.SparseDataFrame({'A': [False, False, True, True, False], - 'B': [True, False, False, True, False]}, - default_fill_value=False) + exp = pd.SparseDataFrame( + { + "A": [False, False, True, True, False], + "B": [True, False, False, True, False], + }, + default_fill_value=False, + ) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], - 'B': [0, np.nan, 0, 2, np.nan]}, - default_fill_value=0.) + df = pd.SparseDataFrame( + {"A": [0, 0, 1, 2, np.nan], "B": [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.0, + ) res = df.notna() assert isinstance(res, pd.SparseDataFrame) - exp = pd.DataFrame({'A': [True, True, True, True, False], - 'B': [True, False, True, True, False]}) + exp = pd.DataFrame( + { + "A": [True, True, True, True, False], + "B": [True, False, True, True, False], + } + ) tm.assert_frame_equal(res.to_dense(), exp) def test_default_fill_value_with_no_data(self): # GH 16807 - expected = pd.SparseDataFrame([[1.0, 1.0], [1.0, 1.0]], - columns=list('ab'), index=range(2)) - result = pd.SparseDataFrame(columns=list('ab'), index=range(2), - default_fill_value=1.0) + expected = pd.SparseDataFrame( + [[1.0, 1.0], [1.0, 1.0]], columns=list("ab"), index=range(2) + ) + result = pd.SparseDataFrame( + columns=list("ab"), index=range(2), default_fill_value=1.0 + ) tm.assert_frame_equal(expected, result) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameArithmetic: - def test_numeric_op_scalar(self): - df = pd.DataFrame({'A': [nan, nan, 0, 1, ], - 'B': [0, 1, 2, nan], - 'C': [1., 2., 3., 4.], - 'D': [nan, nan, nan, nan]}) + df = pd.DataFrame( + { + "A": [nan, nan, 0, 1], + "B": [0, 1, 2, nan], + "C": [1.0, 2.0, 3.0, 4.0], + "D": [nan, nan, nan, nan], + } + ) sparse = df.to_sparse() tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) def test_comparison_op_scalar(self): # GH 13001 - df = pd.DataFrame({'A': [nan, nan, 0, 1, ], - 'B': [0, 1, 2, nan], - 'C': [1., 2., 3., 4.], - 'D': [nan, nan, nan, nan]}) + df = pd.DataFrame( + { + "A": [nan, nan, 0, 1], + "B": [0, 1, 2, nan], + "C": [1.0, 2.0, 3.0, 4.0], + "D": [nan, nan, nan, nan], + } + ) sparse = df.to_sparse() # comparison changes internal repr, compare with dense @@ -1340,7 +1482,6 @@ def test_comparison_op_scalar(self): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameAnalytics: - def test_cumsum(self, float_frame): expected = SparseDataFrame(float_frame.to_dense().cumsum()) @@ -1369,13 +1510,11 @@ def test_numpy_cumsum(self, float_frame): def test_numpy_func_call(self, float_frame): # no 
exception should be raised even though # numpy passes in 'axis=None' or `axis=-1' - funcs = ['sum', 'cumsum', 'var', - 'mean', 'prod', 'cumprod', - 'std', 'min', 'max'] + funcs = ["sum", "cumsum", "var", "mean", "prod", "cumprod", "std", "min", "max"] for func in funcs: getattr(np, func)(float_frame) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)') + @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") def test_quantile(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] @@ -1391,7 +1530,7 @@ def test_quantile(self): tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected) - @pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH 17386)') + @pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH 17386)") def test_quantile_multi(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] @@ -1424,7 +1563,7 @@ def test_dropna(self, inplace, how): # Tests regression #21172. expected = pd.SparseDataFrame({"F2": [0, 1]}) input_df = pd.SparseDataFrame( - {"F1": [float('nan'), float('nan')], "F2": [0, 1]} + {"F1": [float("nan"), float("nan")], "F2": [0, 1]} ) result_df = input_df.dropna(axis=1, inplace=inplace, how=how) if inplace: diff --git a/pandas/tests/sparse/frame/test_indexing.py b/pandas/tests/sparse/frame/test_indexing.py index 2d2a7ac278dd6..c93e9d1e0e8d1 100644 --- a/pandas/tests/sparse/frame/test_indexing.py +++ b/pandas/tests/sparse/frame/test_indexing.py @@ -7,18 +7,21 @@ pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") -@pytest.mark.parametrize('data', [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], +@pytest.mark.parametrize( + "data", [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan] - ] -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], + [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], + [ + [1.0, 1.0 + 1.0j], + [2.0 + 2.0j, 2.0], + [3.0, 3.0 + 3.0j], + [4.0 + 4.0j, 4.0], + [np.nan, np.nan], + ], + ], +) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data(data): # GH 17386 lower_bound = 1.5 @@ -34,24 +37,22 @@ def test_where_with_numeric_data(data): tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.parametrize('data', [ - [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], - [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], +@pytest.mark.parametrize( + "data", [ - [1.0, 1.0 + 1.0j], - [2.0 + 2.0j, 2.0], - [3.0, 3.0 + 3.0j], - [4.0 + 4.0j, 4.0], - [np.nan, np.nan] - ] -]) -@pytest.mark.parametrize('other', [ - True, - -100, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [[1, 1], [2, 2], [3, 3], [4, 4], [0, 0]], + [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [np.nan, np.nan]], + [ + [1.0, 1.0 + 1.0j], + [2.0 + 2.0j, 2.0], + [3.0, 3.0 + 3.0j], + [4.0 + 4.0j, 4.0], + [np.nan, np.nan], + ], + ], +) +@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data_and_other(data, other): # GH 17386 lower_bound = 1.5 @@ -61,14 +62,13 @@ def test_where_with_numeric_data_and_other(data, other): dense = DataFrame(data) dense_expected = dense.where(dense > lower_bound, other) - 
sparse_expected = SparseDataFrame(dense_expected, - default_fill_value=other) + sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data(): # GH 17386 data = [[False, False], [True, True], [False, False]] @@ -85,13 +85,8 @@ def test_where_with_bool_data(): tm.assert_sp_frame_equal(result, sparse_expected) -@pytest.mark.parametrize('other', [ - True, - 0, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data_and_other(other): # GH 17386 data = [[False, False], [True, True], [False, False]] @@ -102,8 +97,7 @@ def test_where_with_bool_data_and_other(other): dense = DataFrame(data) dense_expected = dense.where(dense == cond, other) - sparse_expected = SparseDataFrame(dense_expected, - default_fill_value=other) + sparse_expected = SparseDataFrame(dense_expected, default_fill_value=other) tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) diff --git a/pandas/tests/sparse/frame/test_to_csv.py b/pandas/tests/sparse/frame/test_to_csv.py index 41d7bfabed44a..4ba4fba7391d4 100644 --- a/pandas/tests/sparse/frame/test_to_csv.py +++ b/pandas/tests/sparse/frame/test_to_csv.py @@ -10,13 +10,14 @@ class TestSparseDataFrameToCsv: fill_values = [np.nan, 0, None, 1] - @pytest.mark.parametrize('fill_value', fill_values) + @pytest.mark.parametrize("fill_value", fill_values) def test_to_csv_sparse_dataframe(self, fill_value): # GH19384 - sdf = SparseDataFrame({'a': type(self).fill_values}, - default_fill_value=fill_value) + sdf = SparseDataFrame( + {"a": type(self).fill_values}, default_fill_value=fill_value + ) - with tm.ensure_clean('sparse_df.csv') as path: + with tm.ensure_clean("sparse_df.csv") as path: sdf.to_csv(path, index=False) df = read_csv(path, skip_blank_lines=False) diff --git a/pandas/tests/sparse/frame/test_to_from_scipy.py b/pandas/tests/sparse/frame/test_to_from_scipy.py index 881d8d31e5162..9d1ccc62146ab 100644 --- a/pandas/tests/sparse/frame/test_to_from_scipy.py +++ b/pandas/tests/sparse/frame/test_to_from_scipy.py @@ -8,16 +8,16 @@ from pandas.core.sparse.api import SparseDtype from pandas.util import testing as tm -scipy = pytest.importorskip('scipy') +scipy = pytest.importorskip("scipy") ignore_matrix_warning = pytest.mark.filterwarnings( "ignore:the matrix subclass:PendingDeprecationWarning" ) -@pytest.mark.parametrize('index', [None, list('abc')]) # noqa: F811 -@pytest.mark.parametrize('columns', [None, list('def')]) -@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) -@pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) +@pytest.mark.parametrize("index", [None, list("abc")]) # noqa: F811 +@pytest.mark.parametrize("columns", [None, list("def")]) +@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) +@pytest.mark.parametrize("dtype", [bool, int, float, np.uint16]) @ignore_matrix_warning @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): @@ -36,8 +36,9 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # can just skip 
testing it thoroughly return - sdf = SparseDataFrame(spm, index=index, columns=columns, - default_fill_value=fill_value) + sdf = SparseDataFrame( + spm, index=index, columns=columns, default_fill_value=fill_value + ) # Expected result construction is kind of tricky for all # dtype-fill_value combinations; easiest to cast to something generic @@ -45,7 +46,8 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): rarr = arr.astype(object) rarr[arr == 0] = np.nan expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan) + fill_value if fill_value is not None else np.nan + ) # Assert frame is as expected sdf_obj = sdf.astype(object) @@ -58,24 +60,25 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # Ensure dtype is preserved if possible # XXX: verify this res_dtype = bool if is_bool_dtype(dtype) else dtype - tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype), - {np.dtype(res_dtype)}) + tm.assert_contains_all( + sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} + ) assert sdf.to_coo().dtype == res_dtype # However, adding a str column results in an upcast to object - sdf['strings'] = np.arange(len(sdf)).astype(str) + sdf["strings"] = np.arange(len(sdf)).astype(str) assert sdf.to_coo().dtype == np.object_ -@pytest.mark.parametrize('fill_value', [None, 0, np.nan]) # noqa: F811 +@pytest.mark.parametrize("fill_value", [None, 0, np.nan]) # noqa: F811 @ignore_matrix_warning @pytest.mark.filterwarnings("ignore:object dtype is not supp:UserWarning") @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_from_to_scipy_object(spmatrix, fill_value): # GH 4343 dtype = object - columns = list('cd') - index = list('ab') + columns = list("cd") + index = list("ab") if spmatrix is scipy.sparse.dok_matrix: pytest.skip("dok_matrix from object does not work in SciPy") @@ -92,8 +95,9 @@ def test_from_to_scipy_object(spmatrix, fill_value): # can just skip testing it thoroughly return - sdf = SparseDataFrame(spm, index=index, columns=columns, - default_fill_value=fill_value) + sdf = SparseDataFrame( + spm, index=index, columns=columns, default_fill_value=fill_value + ) # Expected result construction is kind of tricky for all # dtype-fill_value combinations; easiest to cast to something generic @@ -101,7 +105,8 @@ def test_from_to_scipy_object(spmatrix, fill_value): rarr = arr.astype(object) rarr[arr == 0] = np.nan expected = SparseDataFrame(rarr, index=index, columns=columns).fillna( - fill_value if fill_value is not None else np.nan) + fill_value if fill_value is not None else np.nan + ) # Assert frame is as expected sdf_obj = sdf.astype(SparseDtype(object, fill_value)) @@ -113,8 +118,9 @@ def test_from_to_scipy_object(spmatrix, fill_value): # Ensure dtype is preserved if possible res_dtype = object - tm.assert_contains_all(sdf.dtypes.apply(lambda dtype: dtype.subtype), - {np.dtype(res_dtype)}) + tm.assert_contains_all( + sdf.dtypes.apply(lambda dtype: dtype.subtype), {np.dtype(res_dtype)} + ) assert sdf.to_coo().dtype == res_dtype @@ -157,11 +163,14 @@ def test_from_scipy_fillna(spmatrix): sdf = SparseDataFrame(spm).fillna(-1.0) # Returning frame should fill all nan values with -1.0 - expected = SparseDataFrame({ - 0: SparseSeries([1., -1, -1]), - 1: SparseSeries([np.nan, 1, np.nan]), - 2: SparseSeries([np.nan, np.nan, 1]), - }, default_fill_value=-1) + expected = SparseDataFrame( + { + 0: SparseSeries([1.0, -1, -1]), + 1: SparseSeries([np.nan, 1, np.nan]), + 2: 
SparseSeries([np.nan, np.nan, 1]), + }, + default_fill_value=-1, + ) # fill_value is expected to be what .fillna() above was called with # We don't use -1 as initial fill_value in expected SparseSeries @@ -179,8 +188,7 @@ def test_index_names_multiple_nones(): # https://github.com/pandas-dev/pandas/pull/24092 sparse = pytest.importorskip("scipy.sparse") - s = (pd.Series(1, index=pd.MultiIndex.from_product([['A', 'B'], [0, 1]])) - .to_sparse()) + s = pd.Series(1, index=pd.MultiIndex.from_product([["A", "B"], [0, 1]])).to_sparse() result, _, _ = s.to_coo() assert isinstance(result, sparse.coo_matrix) result = result.toarray() diff --git a/pandas/tests/sparse/series/test_indexing.py b/pandas/tests/sparse/series/test_indexing.py index 0f4235d7cc3fe..525b0487a9376 100644 --- a/pandas/tests/sparse/series/test_indexing.py +++ b/pandas/tests/sparse/series/test_indexing.py @@ -7,18 +7,26 @@ pytestmark = pytest.mark.skip("Wrong SparseBlock initialization (GH 17386)") -@pytest.mark.parametrize('data', [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], +@pytest.mark.parametrize( + "data", [ - 1.0, 1.0 + 1.0j, - 2.0 + 2.0j, 2.0, - 3.0, 3.0 + 3.0j, - 4.0 + 4.0j, 4.0, - np.nan, np.nan - ] -]) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') + [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], + [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], + [ + 1.0, + 1.0 + 1.0j, + 2.0 + 2.0j, + 2.0, + 3.0, + 3.0 + 3.0j, + 4.0 + 4.0j, + 4.0, + np.nan, + np.nan, + ], + ], +) +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_numeric_data(data): # GH 17386 lower_bound = 1.5 @@ -34,26 +42,27 @@ def test_where_with_numeric_data(data): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.parametrize('data', [ - [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], - [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], +@pytest.mark.parametrize( + "data", [ - 1.0, 1.0 + 1.0j, - 2.0 + 2.0j, 2.0, - 3.0, 3.0 + 3.0j, - 4.0 + 4.0j, 4.0, - np.nan, np.nan - ] -]) -@pytest.mark.parametrize('other', [ - True, - -100, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') + [1, 1, 2, 2, 3, 3, 4, 4, 0, 0], + [1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, np.nan, np.nan], + [ + 1.0, + 1.0 + 1.0j, + 2.0 + 2.0j, + 2.0, + 3.0, + 3.0 + 3.0j, + 4.0 + 4.0j, + 4.0, + np.nan, + np.nan, + ], + ], +) +@pytest.mark.parametrize("other", [True, -100, 0.1, 100.0 + 100.0j]) +@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") def test_where_with_numeric_data_and_other(data, other): # GH 17386 lower_bound = 1.5 @@ -69,7 +78,7 @@ def test_where_with_numeric_data_and_other(data, other): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.xfail(reason='Wrong SparseBlock initialization (GH#17386)') +@pytest.mark.xfail(reason="Wrong SparseBlock initialization (GH#17386)") def test_where_with_bool_data(): # GH 17386 data = [False, False, True, True, False, False] @@ -86,15 +95,8 @@ def test_where_with_bool_data(): tm.assert_sp_series_equal(result, sparse_expected) -@pytest.mark.parametrize('other', [ - True, - 0, - 0.1, - 100.0 + 100.0j -]) -@pytest.mark.skip(reason='Wrong SparseBlock initialization ' - '(Segfault) ' - '(GH 17386)') +@pytest.mark.parametrize("other", [True, 0, 0.1, 100.0 + 100.0j]) +@pytest.mark.skip(reason="Wrong SparseBlock initialization " "(Segfault) " "(GH 17386)") def test_where_with_bool_data_and_other(other): # GH 17386 data = 
[False, False, True, True, False, False] diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 2abd63281c4fe..8895544958d7a 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -11,8 +11,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna) +from pandas import DataFrame, Series, SparseDtype, SparseSeries, bdate_range, isna from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf from pandas.tests.series.test_api import SharedWithSparse @@ -71,36 +70,32 @@ class TestSparseSeries(SharedWithSparse): def setup_method(self, method): arr, index = _test_data1() - date_index = bdate_range('1/1/2011', periods=len(index)) + date_index = bdate_range("1/1/2011", periods=len(index)) - self.bseries = SparseSeries(arr, index=index, kind='block', - name='bseries') + self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") self.ts = self.bseries - self.btseries = SparseSeries(arr, index=date_index, kind='block') + self.btseries = SparseSeries(arr, index=date_index, kind="block") - self.iseries = SparseSeries(arr, index=index, kind='integer', - name='iseries') + self.iseries = SparseSeries(arr, index=index, kind="integer", name="iseries") arr, index = _test_data2() - self.bseries2 = SparseSeries(arr, index=index, kind='block') - self.iseries2 = SparseSeries(arr, index=index, kind='integer') + self.bseries2 = SparseSeries(arr, index=index, kind="block") + self.iseries2 = SparseSeries(arr, index=index, kind="integer") arr, index = _test_data1_zero() - self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0, name='zbseries') - self.ziseries = SparseSeries(arr, index=index, kind='integer', - fill_value=0) + self.zbseries = SparseSeries( + arr, index=index, kind="block", fill_value=0, name="zbseries" + ) + self.ziseries = SparseSeries(arr, index=index, kind="integer", fill_value=0) arr, index = _test_data2_zero() - self.zbseries2 = SparseSeries(arr, index=index, kind='block', - fill_value=0) - self.ziseries2 = SparseSeries(arr, index=index, kind='integer', - fill_value=0) + self.zbseries2 = SparseSeries(arr, index=index, kind="block", fill_value=0) + self.ziseries2 = SparseSeries(arr, index=index, kind="integer", fill_value=0) def test_constructor_dict_input(self): # gh-16905 - constructor_dict = {1: 1.} + constructor_dict = {1: 1.0} index = [0, 1, 2] # Series with index passed in @@ -120,12 +115,12 @@ def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} result = SparseSeries(d) if PY36: - expected = SparseSeries([1, 0, 2], index=list('bac')) + expected = SparseSeries([1, 0, 2], index=list("bac")) else: - expected = SparseSeries([0, 1, 2], index=list('abc')) + expected = SparseSeries([0, 1, 2], index=list("abc")) tm.assert_sp_series_equal(result, expected) def test_constructor_dtype(self): @@ -155,16 +150,16 @@ def test_iteration_and_str(self): def test_construct_DataFrame_with_sp_series(self): # it works! 
- df = DataFrame({'col': self.bseries}) + df = DataFrame({"col": self.bseries}) # printing & access df.iloc[:1] - df['col'] + df["col"] df.dtypes str(df) # blocking - expected = Series({'col': 'float64:sparse'}) + expected = Series({"col": "float64:sparse"}) # GH 26705 - Assert .ftypes is deprecated with tm.assert_produces_warning(FutureWarning): @@ -176,7 +171,7 @@ def test_constructor_preserve_attr(self): assert arr.dtype == SparseDtype(np.int64) assert arr.fill_value == 0 - s = pd.SparseSeries(arr, name='x') + s = pd.SparseSeries(arr, name="x") assert s.dtype == SparseDtype(np.int64) assert s.fill_value == 0 @@ -191,14 +186,14 @@ def test_series_density(self): def test_sparse_to_dense(self): arr, index = _test_data1() series = self.bseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='bseries')) + tm.assert_series_equal(series, Series(arr, name="bseries")) series = self.iseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='iseries')) + tm.assert_series_equal(series, Series(arr, name="iseries")) arr, index = _test_data1_zero() series = self.zbseries.to_dense() - tm.assert_series_equal(series, Series(arr, name='zbseries')) + tm.assert_series_equal(series, Series(arr, name="zbseries")) series = self.ziseries.to_dense() tm.assert_series_equal(series, Series(arr)) @@ -228,8 +223,8 @@ def test_to_dense_fill_value(self): def test_dense_to_sparse(self): series = self.bseries.to_dense() - bseries = series.to_sparse(kind='block') - iseries = series.to_sparse(kind='integer') + bseries = series.to_sparse(kind="block") + iseries = series.to_sparse(kind="integer") tm.assert_sp_series_equal(bseries, self.bseries) tm.assert_sp_series_equal(iseries, self.iseries, check_names=False) assert iseries.name == self.bseries.name @@ -241,8 +236,8 @@ def test_dense_to_sparse(self): # non-NaN fill value series = self.zbseries.to_dense() - zbseries = series.to_sparse(kind='block', fill_value=0) - ziseries = series.to_sparse(kind='integer', fill_value=0) + zbseries = series.to_sparse(kind="block", fill_value=0) + ziseries = series.to_sparse(kind="integer", fill_value=0) tm.assert_sp_series_equal(zbseries, self.zbseries) tm.assert_sp_series_equal(ziseries, self.ziseries, check_names=False) assert ziseries.name == self.zbseries.name @@ -253,7 +248,7 @@ def test_dense_to_sparse(self): assert series.shape == ziseries.shape def test_to_dense_preserve_name(self): - assert (self.bseries.name is not None) + assert self.bseries.name is not None result = self.bseries.to_dense() assert result.name == self.bseries.name @@ -265,8 +260,9 @@ def test_constructor(self): assert isinstance(self.iseries.sp_index, IntIndex) assert self.zbseries.fill_value == 0 - tm.assert_numpy_array_equal(self.zbseries.values.to_dense(), - self.bseries.to_dense().fillna(0).values) + tm.assert_numpy_array_equal( + self.zbseries.values.to_dense(), self.bseries.to_dense().fillna(0).values + ) # pass SparseSeries def _check_const(sparse, name): @@ -277,16 +273,16 @@ def _check_const(sparse, name): assert result.name == name # use passed name - result = SparseSeries(sparse, name='x') + result = SparseSeries(sparse, name="x") tm.assert_sp_series_equal(result, sparse, check_names=False) - assert result.name == 'x' + assert result.name == "x" - _check_const(self.bseries, 'bseries') - _check_const(self.iseries, 'iseries') - _check_const(self.zbseries, 'zbseries') + _check_const(self.bseries, "bseries") + _check_const(self.iseries, "iseries") + _check_const(self.zbseries, "zbseries") # Sparse time series works - date_index = 
bdate_range('1/1/2000', periods=len(self.bseries)) + date_index = bdate_range("1/1/2000", periods=len(self.bseries)) s5 = SparseSeries(self.bseries, index=date_index) assert isinstance(s5, SparseSeries) @@ -303,16 +299,15 @@ def _check_const(sparse, name): assert values[0] == 97 assert len(sp) == 20 - assert sp.shape == (20, ) + assert sp.shape == (20,) # but can make it copy! - sp = SparseSeries(values, sparse_index=self.bseries.sp_index, - copy=True) + sp = SparseSeries(values, sparse_index=self.bseries.sp_index, copy=True) sp.sp_values[:5] = 100 assert values[0] == 97 assert len(sp) == 20 - assert sp.shape == (20, ) + assert sp.shape == (20,) def test_constructor_scalar(self): data = 5 @@ -324,7 +319,7 @@ def test_constructor_scalar(self): data = np.nan sp = SparseSeries(data, np.arange(100)) assert len(sp) == 100 - assert sp.shape == (100, ) + assert sp.shape == (100,) def test_constructor_ndarray(self): pass @@ -334,13 +329,13 @@ def test_constructor_nonnan(self): sp_series = SparseSeries(arr, fill_value=0) tm.assert_numpy_array_equal(sp_series.values.to_dense(), np.array(arr)) assert len(sp_series) == 5 - assert sp_series.shape == (5, ) + assert sp_series.shape == (5,) def test_constructor_empty(self): # see gh-9272 sp = SparseSeries() assert len(sp.index) == 0 - assert sp.shape == (0, ) + assert sp.shape == (0,) def test_copy_astype(self): cop = self.bseries.astype(np.float64) @@ -372,30 +367,28 @@ def test_copy_astype(self): def test_shape(self): # see gh-10452 - assert self.bseries.shape == (20, ) - assert self.btseries.shape == (20, ) - assert self.iseries.shape == (20, ) + assert self.bseries.shape == (20,) + assert self.btseries.shape == (20,) + assert self.iseries.shape == (20,) - assert self.bseries2.shape == (15, ) - assert self.iseries2.shape == (15, ) + assert self.bseries2.shape == (15,) + assert self.iseries2.shape == (15,) - assert self.zbseries2.shape == (15, ) - assert self.ziseries2.shape == (15, ) + assert self.zbseries2.shape == (15,) + assert self.ziseries2.shape == (15,) def test_astype(self): result = self.bseries.astype(SparseDtype(np.int64, 0)) - expected = (self.bseries.to_dense() - .fillna(0) - .astype(np.int64) - .to_sparse(fill_value=0)) + expected = ( + self.bseries.to_dense().fillna(0).astype(np.int64).to_sparse(fill_value=0) + ) tm.assert_sp_series_equal(result, expected) def test_astype_all(self): orig = pd.Series(np.array([1, 2, 3])) s = SparseSeries(orig) - types = [np.float64, np.float32, np.int64, - np.int32, np.int16, np.int8] + types = [np.float64, np.float32, np.int64, np.int32, np.int16, np.int8] for typ in types: dtype = SparseDtype(typ) res = s.astype(dtype) @@ -403,25 +396,24 @@ def test_astype_all(self): tm.assert_series_equal(res.to_dense(), orig.astype(typ)) def test_kind(self): - assert self.bseries.kind == 'block' - assert self.iseries.kind == 'integer' + assert self.bseries.kind == "block" + assert self.iseries.kind == "integer" def test_to_frame(self): # GH 9850 - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x') - exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]}) + s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x") + exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}) tm.assert_sp_frame_equal(s.to_frame(), exp) - exp = pd.SparseDataFrame({'y': [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_sp_frame_equal(s.to_frame(name='y'), exp) + exp = pd.SparseDataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) + tm.assert_sp_frame_equal(s.to_frame(name="y"), exp) - s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name='x', 
fill_value=0) - exp = pd.SparseDataFrame({'x': [1, 2, 0, nan, 4, nan, 0]}, - default_fill_value=0) + s = pd.SparseSeries([1, 2, 0, nan, 4, nan, 0], name="x", fill_value=0) + exp = pd.SparseDataFrame({"x": [1, 2, 0, nan, 4, nan, 0]}, default_fill_value=0) tm.assert_sp_frame_equal(s.to_frame(), exp) - exp = pd.DataFrame({'y': [1, 2, 0, nan, 4, nan, 0]}) - tm.assert_frame_equal(s.to_frame(name='y').to_dense(), exp) + exp = pd.DataFrame({"y": [1, 2, 0, nan, 4, nan, 0]}) + tm.assert_frame_equal(s.to_frame(name="y").to_dense(), exp) def test_pickle(self): def _test_roundtrip(series): @@ -477,24 +469,20 @@ def test_get_get_value(self): expected = self.btseries.to_dense()[dt] tm.assert_almost_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - tm.assert_almost_equal( - self.bseries.get_value(10), self.bseries[10]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + tm.assert_almost_equal(self.bseries.get_value(10), self.bseries[10]) def test_set_value(self): idx = self.btseries.index[7] - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self.btseries.set_value(idx, 0) assert self.btseries[idx] == 0 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.iseries.set_value('foobar', 0) - assert self.iseries.index[-1] == 'foobar' - assert self.iseries['foobar'] == 0 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.iseries.set_value("foobar", 0) + assert self.iseries.index[-1] == "foobar" + assert self.iseries["foobar"] == 0 def test_getitem_slice(self): idx = self.bseries.index @@ -523,10 +511,9 @@ def _compare(idx): dense_result = dense.take(idx).values sparse_result = sp.take(idx) assert isinstance(sparse_result, SparseSeries) - tm.assert_almost_equal(dense_result, - sparse_result.values.to_dense()) + tm.assert_almost_equal(dense_result, sparse_result.values.to_dense()) - _compare([1., 2., 3., 4., 5., 0.]) + _compare([1.0, 2.0, 3.0, 4.0, 5.0, 0.0]) _compare([7, 2, 9, 0, 4]) _compare([3, 6, 3, 4, 7]) @@ -546,8 +533,10 @@ def test_numpy_take(self): sp = SparseSeries([1.0, 2.0, 3.0]) indices = [1, 2] - tm.assert_series_equal(np.take(sp, indices, axis=0).to_dense(), - np.take(sp.to_dense(), indices, axis=0)) + tm.assert_series_equal( + np.take(sp, indices, axis=0).to_dense(), + np.take(sp.to_dense(), indices, axis=0), + ) msg = "the 'out' parameter is not supported" with pytest.raises(ValueError, match=msg): @@ -555,20 +544,20 @@ def test_numpy_take(self): msg = "the 'mode' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.take(sp, indices, out=None, mode='clip') + np.take(sp, indices, out=None, mode="clip") def test_setitem(self): - self.bseries[5] = 7. - assert self.bseries[5] == 7. + self.bseries[5] = 7.0 + assert self.bseries[5] == 7.0 def test_setslice(self): - self.bseries[5:10] = 7. 
- tm.assert_series_equal(self.bseries[5:10].to_dense(), - Series(7., index=range(5, 10), - name=self.bseries.name)) + self.bseries[5:10] = 7.0 + tm.assert_series_equal( + self.bseries[5:10].to_dense(), + Series(7.0, index=range(5, 10), name=self.bseries.name), + ) def test_operators(self): - def _check_op(a, b, op): sp_result = op(a, b) adense = a.to_dense() if isinstance(a, SparseSeries) else a @@ -617,6 +606,7 @@ def test_binary_operators(self): # skipping for now ##### import pytest + pytest.skip("skipping sparse binary operators test") def _check_inplace_op(iop, op): @@ -626,19 +616,21 @@ def _check_inplace_op(iop, op): iop(tmp, self.bseries) tm.assert_sp_series_equal(tmp, expected) - inplace_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow'] + inplace_ops = ["add", "sub", "mul", "truediv", "floordiv", "pow"] for op in inplace_ops: - _check_inplace_op(getattr(operator, "i%s" % op), - getattr(operator, op)) - - @pytest.mark.parametrize("values, op, fill_value", [ - ([True, False, False, True], operator.invert, True), - ([True, False, False, True], operator.invert, False), - ([0, 1, 2, 3], operator.pos, 0), - ([0, 1, 2, 3], operator.neg, 0), - ([0, np.nan, 2, 3], operator.pos, np.nan), - ([0, np.nan, 2, 3], operator.neg, np.nan), - ]) + _check_inplace_op(getattr(operator, "i%s" % op), getattr(operator, op)) + + @pytest.mark.parametrize( + "values, op, fill_value", + [ + ([True, False, False, True], operator.invert, True), + ([True, False, False, True], operator.invert, False), + ([0, 1, 2, 3], operator.pos, 0), + ([0, 1, 2, 3], operator.neg, 0), + ([0, np.nan, 2, 3], operator.pos, np.nan), + ([0, np.nan, 2, 3], operator.neg, np.nan), + ], + ) def test_unary_operators(self, values, op, fill_value): # https://github.com/pandas-dev/pandas/issues/22835 values = np.asarray(values) @@ -646,46 +638,48 @@ def test_unary_operators(self, values, op, fill_value): new_fill_value = not fill_value else: new_fill_value = op(fill_value) - s = SparseSeries(values, - fill_value=fill_value, - index=['a', 'b', 'c', 'd'], - name='name') + s = SparseSeries( + values, fill_value=fill_value, index=["a", "b", "c", "d"], name="name" + ) result = op(s) - expected = SparseSeries(op(values), - fill_value=new_fill_value, - index=['a', 'b', 'c', 'd'], - name='name') + expected = SparseSeries( + op(values), + fill_value=new_fill_value, + index=["a", "b", "c", "d"], + name="name", + ) tm.assert_sp_series_equal(result, expected) def test_abs(self): - s = SparseSeries([1, 2, -3], name='x') - expected = SparseSeries([1, 2, 3], name='x') + s = SparseSeries([1, 2, -3], name="x") + expected = SparseSeries([1, 2, 3], name="x") result = s.abs() tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = np.abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" - s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x') - expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index, - fill_value=2, name='x') + s = SparseSeries([1, -2, 2, -3], fill_value=-2, name="x") + expected = SparseSeries( + [1, 2, 3], sparse_index=s.sp_index, fill_value=2, name="x" + ) result = s.abs() tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = abs(s) tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" result = np.abs(s) 
tm.assert_sp_series_equal(result, expected) - assert result.name == 'x' + assert result.name == "x" def test_reindex(self): def _compare_with_series(sps, new_index): @@ -720,19 +714,20 @@ def _compare_with_series(sps, new_index): # with copy=False reindexed = self.bseries.reindex(self.bseries.index, copy=True) - reindexed.sp_values[:] = 1. - assert (self.bseries.sp_values != 1.).all() + reindexed.sp_values[:] = 1.0 + assert (self.bseries.sp_values != 1.0).all() reindexed = self.bseries.reindex(self.bseries.index, copy=False) - reindexed.sp_values[:] = 1. - tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1., 10)) + reindexed.sp_values[:] = 1.0 + tm.assert_numpy_array_equal(self.bseries.sp_values, np.repeat(1.0, 10)) def test_sparse_reindex(self): length = 10 def _check(values, index1, index2, fill_value): - first_series = SparseSeries(values, sparse_index=index1, - fill_value=fill_value) + first_series = SparseSeries( + values, sparse_index=index1, fill_value=fill_value + ) reindexed = first_series.sparse_reindex(index2) assert reindexed.sp_index is index2 @@ -762,7 +757,7 @@ def _check_all(values, first, second): _check_with_fill_value(values, first, second, fill_value=0) index1 = [2, 4, 5, 6, 8, 9] - values1 = np.arange(6.) + values1 = np.arange(6.0) _check_all(values1, index1, [2, 4, 5]) _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) @@ -770,11 +765,10 @@ def _check_all(values, first, second): _check_all(values1, index1, [0, 1, 7, 8, 9]) _check_all(values1, index1, []) - first_series = SparseSeries(values1, - sparse_index=IntIndex(length, index1), - fill_value=nan) - with pytest.raises(TypeError, - match='new index must be a SparseIndex'): + first_series = SparseSeries( + values1, sparse_index=IntIndex(length, index1), fill_value=nan + ) + with pytest.raises(TypeError, match="new index must be a SparseIndex"): first_series.sparse_reindex(0) def test_repr(self): @@ -801,7 +795,7 @@ def _compare_with_dense(obj, op): dense_result = getattr(series, op)() assert sparse_result == dense_result - to_compare = ['count', 'sum', 'mean', 'std', 'var', 'skew'] + to_compare = ["count", "sum", "mean", "std", "var", "skew"] def _compare_all(obj): for op in to_compare: @@ -833,7 +827,7 @@ def test_dropna(self): expected = sp.to_dense().dropna() expected = expected[expected != 0] - exp_arr = pd.SparseArray(expected.values, fill_value=0, kind='block') + exp_arr = pd.SparseArray(expected.values, fill_value=0, kind="block") tm.assert_sp_array_equal(sp_valid.values, exp_arr) tm.assert_index_equal(sp_valid.index, expected.index) assert len(sp_valid.sp_values) == 2 @@ -845,18 +839,24 @@ def test_dropna(self): def test_homogenize(self): def _check_matches(indices, expected): - data = {i: SparseSeries(idx.to_int_index().indices, - sparse_index=idx, fill_value=np.nan) - for i, idx in enumerate(indices)} + data = { + i: SparseSeries( + idx.to_int_index().indices, sparse_index=idx, fill_value=np.nan + ) + for i, idx in enumerate(indices) + } # homogenized is only valid with NaN fill values homogenized = spf.homogenize(data) for k, v in homogenized.items(): - assert (v.sp_index.equals(expected)) + assert v.sp_index.equals(expected) - indices1 = [BlockIndex(10, [2], [7]), BlockIndex(10, [1, 6], [3, 4]), - BlockIndex(10, [0], [10])] + indices1 = [ + BlockIndex(10, [2], [7]), + BlockIndex(10, [1, 6], [3, 4]), + BlockIndex(10, [0], [10]), + ] expected1 = BlockIndex(10, [2, 6], [2, 3]) _check_matches(indices1, expected1) @@ -865,8 +865,7 @@ def _check_matches(indices, expected): 
_check_matches(indices2, expected2) # must have NaN fill value - data = {'a': SparseSeries(np.arange(7), sparse_index=expected2, - fill_value=0)} + data = {"a": SparseSeries(np.arange(7), sparse_index=expected2, fill_value=0)} with pytest.raises(TypeError, match="NaN fill value"): spf.homogenize(data) @@ -892,7 +891,7 @@ def test_fill_value_when_combine_const(self): tm.assert_series_equal(res, exp) def test_shift(self): - series = SparseSeries([nan, 1., 2., 3., nan, nan], index=np.arange(6)) + series = SparseSeries([nan, 1.0, 2.0, 3.0, nan, nan], index=np.arange(6)) shifted = series.shift(0) # assert shifted is not series @@ -904,9 +903,10 @@ def test_shift(self): f = lambda s: s.shift(-2) _dense_series_compare(series, f) - series = SparseSeries([nan, 1., 2., 3., nan, nan], - index=bdate_range('1/1/2000', periods=6)) - f = lambda s: s.shift(2, freq='B') + series = SparseSeries( + [nan, 1.0, 2.0, 3.0, nan, nan], index=bdate_range("1/1/2000", periods=6) + ) + f = lambda s: s.shift(2, freq="B") _dense_series_compare(series, f) f = lambda s: s.shift(2, freq=BDay()) @@ -917,14 +917,18 @@ def test_shift_nan(self): orig = pd.Series([np.nan, 2, np.nan, 4, 0, np.nan, 0]) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(2), orig.shift(2).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(3), orig.shift(3).to_sparse(), - check_kind=False) + tm.assert_sp_series_equal( + sparse.shift(0), orig.shift(0).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(1), orig.shift(1).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(2), orig.shift(2).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(3), orig.shift(3).to_sparse(), check_kind=False + ) tm.assert_sp_series_equal(sparse.shift(-1), orig.shift(-1).to_sparse()) tm.assert_sp_series_equal(sparse.shift(-2), orig.shift(-2).to_sparse()) @@ -933,31 +937,30 @@ def test_shift_nan(self): sparse = orig.to_sparse(fill_value=0) tm.assert_sp_series_equal( - sparse.shift(0), - orig.shift(0).to_sparse(fill_value=sparse.fill_value) + sparse.shift(0), orig.shift(0).to_sparse(fill_value=sparse.fill_value) + ) + tm.assert_sp_series_equal( + sparse.shift(1), orig.shift(1).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(2), orig.shift(2).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(3), orig.shift(3).to_sparse(fill_value=0), check_kind=False + ) + + tm.assert_sp_series_equal( + sparse.shift(-1), orig.shift(-1).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-2), orig.shift(-2).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-3), orig.shift(-3).to_sparse(fill_value=0), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-4), orig.shift(-4).to_sparse(fill_value=0), check_kind=False ) - tm.assert_sp_series_equal(sparse.shift(1), - orig.shift(1).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(2), - orig.shift(2).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(3), - orig.shift(3).to_sparse(fill_value=0), - check_kind=False) - - tm.assert_sp_series_equal(sparse.shift(-1), - orig.shift(-1).to_sparse(fill_value=0), - 
check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-2), - orig.shift(-2).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-3), - orig.shift(-3).to_sparse(fill_value=0), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-4), - orig.shift(-4).to_sparse(fill_value=0), - check_kind=False) def test_shift_dtype(self): # GH 12908 @@ -967,50 +970,50 @@ def test_shift_dtype(self): tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) sparse = orig.to_sparse(fill_value=np.nan) - tm.assert_sp_series_equal(sparse.shift(0), - orig.shift(0).to_sparse(fill_value=np.nan)) + tm.assert_sp_series_equal( + sparse.shift(0), orig.shift(0).to_sparse(fill_value=np.nan) + ) # shift(1) or more span changes dtype to float64 # XXX: SparseSeries doesn't need to shift dtype here. # Do we want to astype in shift, for backwards compat? # If not, document it. - tm.assert_sp_series_equal(sparse.shift(1).astype('f8'), - orig.shift(1).to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.shift(2).astype('f8'), - orig.shift(2).to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.shift(3).astype('f8'), - orig.shift(3).to_sparse(kind='integer')) - - tm.assert_sp_series_equal(sparse.shift(-1).astype('f8'), - orig.shift(-1).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-2).astype('f8'), - orig.shift(-2).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-3).astype('f8'), - orig.shift(-3).to_sparse(), - check_kind=False) - tm.assert_sp_series_equal(sparse.shift(-4).astype('f8'), - orig.shift(-4).to_sparse(), - check_kind=False) - - @pytest.mark.parametrize("fill_value", [ - 0, - 1, - np.nan - ]) + tm.assert_sp_series_equal( + sparse.shift(1).astype("f8"), orig.shift(1).to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.shift(2).astype("f8"), orig.shift(2).to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.shift(3).astype("f8"), orig.shift(3).to_sparse(kind="integer") + ) + + tm.assert_sp_series_equal( + sparse.shift(-1).astype("f8"), orig.shift(-1).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-2).astype("f8"), orig.shift(-2).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-3).astype("f8"), orig.shift(-3).to_sparse(), check_kind=False + ) + tm.assert_sp_series_equal( + sparse.shift(-4).astype("f8"), orig.shift(-4).to_sparse(), check_kind=False + ) + + @pytest.mark.parametrize("fill_value", [0, 1, np.nan]) @pytest.mark.parametrize("periods", [0, 1, 2, 3, -1, -2, -3, -4]) def test_shift_dtype_fill_value(self, fill_value, periods): # GH 12908 - orig = pd.Series([1, 0, 0, 4], dtype=np.dtype('int64')) + orig = pd.Series([1, 0, 0, 4], dtype=np.dtype("int64")) sparse = orig.to_sparse(fill_value=fill_value) result = sparse.shift(periods) expected = orig.shift(periods).to_sparse(fill_value=fill_value) - tm.assert_sp_series_equal(result, expected, - check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_series_equal( + result, expected, check_kind=False, consolidate_block_indices=True + ) def test_combine_first(self): s = self.bseries @@ -1024,8 +1027,8 @@ def test_combine_first(self): tm.assert_sp_series_equal(result, result2) tm.assert_sp_series_equal(result, expected) - @pytest.mark.parametrize('deep', [True, False]) - @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + @pytest.mark.parametrize("deep", [True, False]) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan, None]) def 
test_memory_usage_deep(self, deep, fill_value): values = [1.0] + [fill_value] * 20 sparse_series = SparseSeries(values, fill_value=fill_value) @@ -1039,35 +1042,38 @@ def test_memory_usage_deep(self, deep, fill_value): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseHandlingMultiIndexes: - def setup_method(self, method): miindex = pd.MultiIndex.from_product( - [["x", "y"], ["10", "20"]], names=['row-foo', 'row-bar']) + [["x", "y"], ["10", "20"]], names=["row-foo", "row-bar"] + ) micol = pd.MultiIndex.from_product( - [['a', 'b', 'c'], ["1", "2"]], names=['col-foo', 'col-bar']) - dense_multiindex_frame = pd.DataFrame( - index=miindex, columns=micol).sort_index().sort_index(axis=1) + [["a", "b", "c"], ["1", "2"]], names=["col-foo", "col-bar"] + ) + dense_multiindex_frame = ( + pd.DataFrame(index=miindex, columns=micol).sort_index().sort_index(axis=1) + ) self.dense_multiindex_frame = dense_multiindex_frame.fillna(value=3.14) def test_to_sparse_preserve_multiindex_names_columns(self): sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() sparse_multiindex_frame = sparse_multiindex_frame.copy() - tm.assert_index_equal(sparse_multiindex_frame.columns, - self.dense_multiindex_frame.columns) + tm.assert_index_equal( + sparse_multiindex_frame.columns, self.dense_multiindex_frame.columns + ) def test_round_trip_preserve_multiindex_names(self): sparse_multiindex_frame = self.dense_multiindex_frame.to_sparse() round_trip_multiindex_frame = sparse_multiindex_frame.to_dense() - tm.assert_frame_equal(self.dense_multiindex_frame, - round_trip_multiindex_frame, - check_column_type=True, - check_names=True) + tm.assert_frame_equal( + self.dense_multiindex_frame, + round_trip_multiindex_frame, + check_column_type=True, + check_names=True, + ) @td.skip_if_no_scipy -@pytest.mark.filterwarnings( - "ignore:the matrix subclass:PendingDeprecationWarning" -) +@pytest.mark.filterwarnings("ignore:the matrix subclass:PendingDeprecationWarning") @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesScipyInteraction: @@ -1075,72 +1081,89 @@ class TestSparseSeriesScipyInteraction: def setup_method(self, method): import scipy.sparse + # SparseSeries inputs used in tests, the tests rely on the order self.sparse_series = [] s = pd.Series([3.0, nan, 1.0, 2.0, nan, nan]) - s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) + s.index = pd.MultiIndex.from_tuples( + [ + (1, 2, "a", 0), + (1, 2, "a", 1), + (1, 1, "b", 0), + (1, 1, "b", 1), + (2, 1, "b", 0), + (2, 1, "b", 1), + ], + names=["A", "B", "C", "D"], + ) self.sparse_series.append(s.to_sparse()) ss = self.sparse_series[0].copy() ss.index.names = [3, 0, 1, 2] self.sparse_series.append(ss) - ss = pd.Series([ - nan - ] * 12, index=cartesian_product((range(3), range(4)))).to_sparse() + ss = pd.Series( + [nan] * 12, index=cartesian_product((range(3), range(4))) + ).to_sparse() for k, v in zip([(0, 0), (1, 2), (1, 3)], [3.0, 1.0, 2.0]): ss[k] = v self.sparse_series.append(ss) # results used in tests self.coo_matrices = [] - self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4))) - self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))) - 
self.coo_matrices.append(scipy.sparse.coo_matrix( - ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2))) - self.ils = [[(1, 2), (1, 1), (2, 1)], [(1, 1), (1, 2), (2, 1)], - [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b')]] - self.jls = [[('a', 0), ('a', 1), ('b', 0), ('b', 1)], [0, 1]] + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 2, 3])), shape=(3, 4) + ) + ) + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4) + ) + ) + self.coo_matrices.append( + scipy.sparse.coo_matrix( + ([3.0, 1.0, 2.0], ([0, 1, 1], [0, 0, 1])), shape=(3, 2) + ) + ) + self.ils = [ + [(1, 2), (1, 1), (2, 1)], + [(1, 1), (1, 2), (2, 1)], + [(1, 2, "a"), (1, 1, "b"), (2, 1, "b")], + ] + self.jls = [[("a", 0), ("a", 1), ("b", 0), ("b", 1)], [0, 1]] def test_to_coo_text_names_integer_row_levels_nosort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': [0, 1], 'column_levels': [2, 3]} + kwargs = {"row_levels": [0, 1], "column_levels": [2, 3]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_integer_row_levels_sort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': [0, 1], - 'column_levels': [2, 3], - 'sort_labels': True} + kwargs = {"row_levels": [0, 1], "column_levels": [2, 3], "sort_labels": True} result = (self.coo_matrices[1], self.ils[1], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_text_row_levels_nosort_col_level_single(self): ss = self.sparse_series[0] - kwargs = {'row_levels': ['A', 'B', 'C'], - 'column_levels': ['D'], - 'sort_labels': False} + kwargs = { + "row_levels": ["A", "B", "C"], + "column_levels": ["D"], + "sort_labels": False, + } result = (self.coo_matrices[2], self.ils[2], self.jls[1]) self._run_test(ss, kwargs, result) def test_to_coo_integer_names_integer_row_levels_nosort(self): ss = self.sparse_series[1] - kwargs = {'row_levels': [3, 0], 'column_levels': [1, 2]} + kwargs = {"row_levels": [3, 0], "column_levels": [1, 2]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) def test_to_coo_text_names_text_row_levels_nosort(self): ss = self.sparse_series[0] - kwargs = {'row_levels': ['A', 'B'], 'column_levels': ['C', 'D']} + kwargs = {"row_levels": ["A", "B"], "column_levels": ["C", "D"]} result = (self.coo_matrices[0], self.ils[0], self.jls[0]) self._run_test(ss, kwargs, result) @@ -1148,13 +1171,13 @@ def test_to_coo_bad_partition_nonnull_intersection(self): ss = self.sparse_series[0] msg = "Is not a partition because intersection is not null" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A', 'B', 'C'], ['C', 'D']) + ss.to_coo(["A", "B", "C"], ["C", "D"]) def test_to_coo_bad_partition_small_union(self): ss = self.sparse_series[0] msg = "Is not a partition because union is not the whole" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A'], ['C', 'D']) + ss.to_coo(["A"], ["C", "D"]) def test_to_coo_nlevels_less_than_two(self): ss = self.sparse_series[0] @@ -1166,15 +1189,13 @@ def test_to_coo_nlevels_less_than_two(self): def test_to_coo_bad_ilevel(self): ss = self.sparse_series[0] with pytest.raises(KeyError, match="Level E not found"): - ss.to_coo(['A', 'B'], ['C', 'D', 'E']) + ss.to_coo(["A", "B"], ["C", "D", "E"]) def test_to_coo_duplicate_index_entries(self): - ss = pd.concat([self.sparse_series[0], - self.sparse_series[0]]).to_sparse() - msg = ("Duplicate index entries are not allowed in to_coo" - " transformation") + ss 
= pd.concat([self.sparse_series[0], self.sparse_series[0]]).to_sparse() + msg = "Duplicate index entries are not allowed in to_coo" " transformation" with pytest.raises(ValueError, match=msg): - ss.to_coo(['A', 'B'], ['C', 'D']) + ss.to_coo(["A", "B"], ["C", "D"]) def test_from_coo_dense_index(self): ss = SparseSeries.from_coo(self.coo_matrices[0], dense_index=True) @@ -1201,8 +1222,8 @@ def _run_test(self, ss, kwargs, check): # for every test, also test symmetry property (transpose), switch # row_levels and column_levels d = kwargs.copy() - d['row_levels'] = kwargs['column_levels'] - d['column_levels'] = kwargs['row_levels'] + d["row_levels"] = kwargs["column_levels"] + d["column_levels"] = kwargs["row_levels"] results = ss.to_coo(**d) results = (results[0].T, results[2], results[1]) self._check_results_to_coo(results, check) @@ -1222,34 +1243,32 @@ def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind) + sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, fill_value=0, kind=kind) - tm.assert_sp_series_equal(res, exp, - consolidate_block_indices=True) + tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) def test_concat_axis1(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y') + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y") res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) exp = pd.SparseDataFrame(exp) tm.assert_sp_frame_equal(res, exp) @@ -1257,19 +1276,21 @@ def test_concat_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = 
pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) exp = pd.SparseSeries(exp, kind=kind, fill_value=0) @@ -1279,12 +1300,11 @@ def test_concat_axis1_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) @@ -1292,21 +1312,23 @@ def test_concat_different_kind(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind='integer') - sparse2 = pd.SparseSeries(val2, name='y', kind='block', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x", kind="integer") + sparse2 = pd.SparseSeries(val2, name="y", kind="block", fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) - exp = pd.SparseSeries(exp, kind='integer') + exp = pd.SparseSeries(exp, kind="integer") tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) - exp = pd.SparseSeries(exp, kind='block', fill_value=0) + exp = pd.SparseSeries(exp, kind="block", fill_value=0) tm.assert_sp_series_equal(res, exp) def test_concat_sparse_dense(self): @@ -1314,9 +1336,9 @@ def test_concat_sparse_dense(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse = pd.SparseSeries(val1, name='x', kind=kind) - dense = pd.Series(val2, name='y') + for kind in ["integer", "block"]: + sparse = pd.SparseSeries(val1, name="x", kind=kind) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.concat([pd.Series(val1), dense]) @@ -1328,8 +1350,8 @@ def test_concat_sparse_dense(self): exp = exp.astype("Sparse") tm.assert_series_equal(res, exp) - sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.concat([pd.Series(val1), dense]) @@ -1343,96 +1365,96 @@ def test_concat_sparse_dense(self): def test_value_counts(self): vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') + dense = pd.Series(vals, name="xx") - sparse = pd.SparseSeries(vals, name='xx') - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx") + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), 
dense.value_counts(dropna=False) + ) - sparse = pd.SparseSeries(vals, name='xx', fill_value=0) - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx", fill_value=0) + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_value_counts_dup(self): vals = [1, 2, nan, 0, nan, 1, 2, nan, nan, 1, 2, 0, 1, 1] # numeric op may cause sp_values to include the same value as # fill_value - dense = pd.Series(vals, name='xx') / 0. - sparse = pd.SparseSeries(vals, name='xx') / 0. - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + dense = pd.Series(vals, name="xx") / 0.0 + sparse = pd.SparseSeries(vals, name="xx") / 0.0 + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) vals = [1, 2, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') * 0. - sparse = pd.SparseSeries(vals, name='xx') * 0. - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + dense = pd.Series(vals, name="xx") * 0.0 + sparse = pd.SparseSeries(vals, name="xx") * 0.0 + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_value_counts_int(self): vals = [1, 2, 0, 1, 2, 1, 2, 0, 1, 1] - dense = pd.Series(vals, name='xx') + dense = pd.Series(vals, name="xx") # fill_value is np.nan, but should not be included in the result - sparse = pd.SparseSeries(vals, name='xx') - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) - - sparse = pd.SparseSeries(vals, name='xx', fill_value=0) - tm.assert_series_equal(sparse.value_counts(), - dense.value_counts()) - tm.assert_series_equal(sparse.value_counts(dropna=False), - dense.value_counts(dropna=False)) + sparse = pd.SparseSeries(vals, name="xx") + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) + + sparse = pd.SparseSeries(vals, name="xx", fill_value=0) + tm.assert_series_equal(sparse.value_counts(), dense.value_counts()) + tm.assert_series_equal( + sparse.value_counts(dropna=False), dense.value_counts(dropna=False) + ) def test_isna(self): # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") res = s.isna() - exp = pd.SparseSeries([True, True, False, False, True], name='xxx', - fill_value=True) + exp = pd.SparseSeries( + [True, True, False, False, True], name="xxx", fill_value=True + ) tm.assert_sp_series_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', - fill_value=0.) 
+ s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) res = s.isna() assert isinstance(res, pd.SparseSeries) - exp = pd.Series([True, False, False, False, False], name='xxx') + exp = pd.Series([True, False, False, False, False], name="xxx") tm.assert_series_equal(res.to_dense(), exp) def test_notna(self): # GH 8276 - s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name="xxx") res = s.notna() - exp = pd.SparseSeries([False, False, True, True, False], name='xxx', - fill_value=False) + exp = pd.SparseSeries( + [False, False, True, True, False], name="xxx", fill_value=False + ) tm.assert_sp_series_equal(res, exp) # if fill_value is not nan, True can be included in sp_values - s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', - fill_value=0.) + s = pd.SparseSeries([np.nan, 0.0, 1.0, 2.0, 0.0], name="xxx", fill_value=0.0) res = s.notna() assert isinstance(res, pd.SparseSeries) - exp = pd.Series([False, True, True, True, True], name='xxx') + exp = pd.Series([False, True, True, True, True], name="xxx") tm.assert_series_equal(res.to_dense(), exp) def _dense_series_compare(s, f): result = f(s) - assert (isinstance(result, SparseSeries)) + assert isinstance(result, SparseSeries) dense_result = f(s.to_dense()) tm.assert_series_equal(result.to_dense(), dense_result) @@ -1440,15 +1462,14 @@ def _dense_series_compare(s, f): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesAnalytics: - def setup_method(self, method): arr, index = _test_data1() - self.bseries = SparseSeries(arr, index=index, kind='block', - name='bseries') + self.bseries = SparseSeries(arr, index=index, kind="block", name="bseries") arr, index = _test_data1_zero() - self.zbseries = SparseSeries(arr, index=index, kind='block', - fill_value=0, name='zbseries') + self.zbseries = SparseSeries( + arr, index=index, kind="block", fill_value=0, name="zbseries" + ) def test_cumsum(self): result = self.bseries.cumsum() @@ -1484,38 +1505,47 @@ def test_numpy_cumsum(self): def test_numpy_func_call(self): # no exception should be raised even though # numpy passes in 'axis=None' or `axis=-1' - funcs = ['sum', 'cumsum', 'var', 'mean', - 'prod', 'cumprod', 'std', 'argsort', - 'min', 'max'] + funcs = [ + "sum", + "cumsum", + "var", + "mean", + "prod", + "cumprod", + "std", + "argsort", + "min", + "max", + ] for func in funcs: - for series in ('bseries', 'zbseries'): + for series in ("bseries", "zbseries"): getattr(np, func)(getattr(self, series)) def test_deprecated_numpy_func_call(self): # NOTE: These should be add to the 'test_numpy_func_call' test above # once the behavior of argmin/argmax is corrected. 
- funcs = ['argmin', 'argmax'] + funcs = ["argmin", "argmax"] for func in funcs: - for series in ('bseries', 'zbseries'): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + for series in ("bseries", "zbseries"): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): getattr(np, func)(getattr(self, series)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): getattr(getattr(self, series), func)() @pytest.mark.parametrize( - 'datetime_type', (np.datetime64, - pd.Timestamp, - lambda x: datetime.strptime(x, '%Y-%m-%d'))) + "datetime_type", + (np.datetime64, pd.Timestamp, lambda x: datetime.strptime(x, "%Y-%m-%d")), +) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") def test_constructor_dict_datetime64_index(datetime_type): # GH 9456 - dates = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] + dates = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] values = [42544017.198965244, 1234565, 40512335.181958228, -1] result = SparseSeries(dict(zip(map(datetime_type, dates), values))) @@ -1542,8 +1572,7 @@ def test_deprecated_to_sparse(): ser = Series([1, np.nan, 3]) sparse_ser = pd.SparseSeries([1, np.nan, 3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ser.to_sparse() tm.assert_series_equal(result, sparse_ser) diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 4fed878a10ca6..d7295c4bfe5f0 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -10,7 +10,7 @@ class TestSparseArrayConcat: - @pytest.mark.parametrize('kind', ['integer', 'block']) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_basic(self, kind): a = pd.SparseArray([1, 0, 0, 2], kind=kind) b = pd.SparseArray([1, 0, 2, 2], kind=kind) @@ -19,43 +19,39 @@ def test_basic(self, kind): # Can't make any assertions about the sparse index itself # since we aren't don't merge sparse blocs across arrays # in to_concat - expected = np.array([1, 2, 1, 2, 2], dtype='int64') + expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind - @pytest.mark.parametrize('kind', ['integer', 'block']) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_uses_first_kind(self, kind): - other = 'integer' if kind == 'block' else 'block' + other = "integer" if kind == "block" else "block" a = pd.SparseArray([1, 0, 0, 2], kind=kind) b = pd.SparseArray([1, 0, 2, 2], kind=other) result = pd.SparseArray._concat_same_type([a, b]) - expected = np.array([1, 2, 1, 2, 2], dtype='int64') + expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesConcat: - - @pytest.mark.parametrize('kind', [ - 'integer', - 'block', - ]) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_concat(self, kind): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', 
kind=kind) + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - sparse1 = pd.SparseSeries(val1, fill_value=0, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, fill_value=0, name='y', kind=kind) + sparse1 = pd.SparseSeries(val1, fill_value=0, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, fill_value=0, name="y", kind=kind) res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) @@ -66,12 +62,11 @@ def test_concat_axis1(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y') + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y") res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) exp = pd.SparseDataFrame(exp) tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) @@ -79,20 +74,22 @@ def test_concat_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - for kind in ['integer', 'block']: - sparse1 = pd.SparseSeries(val1, name='x', kind=kind) - sparse2 = pd.SparseSeries(val2, name='y', kind=kind, fill_value=0) + for kind in ["integer", "block"]: + sparse1 = pd.SparseSeries(val1, name="x", kind=kind) + sparse2 = pd.SparseSeries(val2, name="y", kind=kind, fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) exp = pd.SparseSeries(exp, kind=kind) tm.assert_sp_series_equal(res, exp) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse1]) exp = pd.concat([pd.Series(val2), pd.Series(val1)]) @@ -103,12 +100,11 @@ def test_concat_axis1_different_fill(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x') - sparse2 = pd.SparseSeries(val2, name='y', fill_value=0) + sparse1 = pd.SparseSeries(val1, name="x") + sparse2 = pd.SparseSeries(val2, name="y", fill_value=0) res = pd.concat([sparse1, sparse2], axis=1) - exp = pd.concat([pd.Series(val1, name='x'), - pd.Series(val2, name='y')], axis=1) + exp = pd.concat([pd.Series(val1, name="x"), pd.Series(val2, name="y")], axis=1) assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) @@ -116,8 +112,8 @@ def test_concat_different_kind(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse1 = pd.SparseSeries(val1, name='x', kind='integer') - sparse2 = pd.SparseSeries(val2, name='y', kind='block') + sparse1 = pd.SparseSeries(val1, name="x", kind="integer") + sparse2 = pd.SparseSeries(val2, name="y", kind="block") res = pd.concat([sparse1, sparse2]) exp = pd.concat([pd.Series(val1), pd.Series(val2)]) @@ -129,17 +125,14 @@ def 
test_concat_different_kind(self): exp = pd.SparseSeries(exp, kind=sparse2.kind) tm.assert_sp_series_equal(res, exp, consolidate_block_indices=True) - @pytest.mark.parametrize('kind', [ - 'integer', - 'block', - ]) + @pytest.mark.parametrize("kind", ["integer", "block"]) def test_concat_sparse_dense(self, kind): # use first input's fill_value val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) val2 = np.array([3, np.nan, 4, 0, 0]) - sparse = pd.SparseSeries(val1, name='x', kind=kind) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) exp = pd.SparseSeries(pd.concat([pd.Series(val1), dense]), kind=kind) @@ -148,23 +141,17 @@ def test_concat_sparse_dense(self, kind): res = pd.concat([dense, sparse, dense]) exp = pd.concat([dense, pd.Series(val1), dense]) # XXX: changed from SparseSeries to Series[sparse] - exp = pd.Series( - pd.SparseArray(exp, kind=kind), - index=exp.index, - name=exp.name, - ) + exp = pd.Series(pd.SparseArray(exp, kind=kind), index=exp.index, name=exp.name) tm.assert_series_equal(res, exp) - sparse = pd.SparseSeries(val1, name='x', kind=kind, fill_value=0) - dense = pd.Series(val2, name='y') + sparse = pd.SparseSeries(val1, name="x", kind=kind, fill_value=0) + dense = pd.Series(val2, name="y") res = pd.concat([sparse, dense]) # XXX: changed from SparseSeries to Series[sparse] exp = pd.concat([pd.Series(val1), dense]) exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), - index=exp.index, - name=exp.name, + pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name ) tm.assert_series_equal(res, exp) @@ -172,9 +159,7 @@ def test_concat_sparse_dense(self, kind): exp = pd.concat([dense, pd.Series(val1), dense]) # XXX: changed from SparseSeries to Series[sparse] exp = pd.Series( - pd.SparseArray(exp, kind=kind, fill_value=0), - index=exp.index, - name=exp.name, + pd.SparseArray(exp, kind=kind, fill_value=0), index=exp.index, name=exp.name ) tm.assert_series_equal(res, exp) @@ -182,23 +167,34 @@ def test_concat_sparse_dense(self, kind): @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameConcat: - def setup_method(self, method): - self.dense1 = pd.DataFrame({'A': [0., 1., 2., np.nan], - 'B': [0., 0., 0., 0.], - 'C': [np.nan, np.nan, np.nan, np.nan], - 'D': [1., 2., 3., 4.]}) + self.dense1 = pd.DataFrame( + { + "A": [0.0, 1.0, 2.0, np.nan], + "B": [0.0, 0.0, 0.0, 0.0], + "C": [np.nan, np.nan, np.nan, np.nan], + "D": [1.0, 2.0, 3.0, 4.0], + } + ) - self.dense2 = pd.DataFrame({'A': [5., 6., 7., 8.], - 'B': [np.nan, 0., 7., 8.], - 'C': [5., 6., np.nan, np.nan], - 'D': [np.nan, np.nan, np.nan, np.nan]}) + self.dense2 = pd.DataFrame( + { + "A": [5.0, 6.0, 7.0, 8.0], + "B": [np.nan, 0.0, 7.0, 8.0], + "C": [5.0, 6.0, np.nan, np.nan], + "D": [np.nan, np.nan, np.nan, np.nan], + } + ) - self.dense3 = pd.DataFrame({'E': [5., 6., 7., 8.], - 'F': [np.nan, 0., 7., 8.], - 'G': [5., 6., np.nan, np.nan], - 'H': [np.nan, np.nan, np.nan, np.nan]}) + self.dense3 = pd.DataFrame( + { + "E": [5.0, 6.0, 7.0, 8.0], + "F": [np.nan, 0.0, 7.0, 8.0], + "G": [5.0, 6.0, np.nan, np.nan], + "H": [np.nan, np.nan, np.nan, np.nan], + } + ) def test_concat(self): # fill_value = np.nan @@ -250,14 +246,16 @@ def test_concat_different_fill_value(self): sparse = self.dense1.to_sparse() sparse2 = self.dense2.to_sparse(fill_value=0) - with tm.assert_produces_warning(PerformanceWarning, 
- raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse, sparse2]) exp = pd.concat([self.dense1, self.dense2]).to_sparse() tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True) - with tm.assert_produces_warning(PerformanceWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): res = pd.concat([sparse2, sparse]) exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) exp._default_fill_value = np.nan @@ -269,13 +267,13 @@ def test_concat_different_columns_sort_warns(self): # stacklevel is wrong since we have two FutureWarnings, # one for depr, one for sorting. - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): res = pd.concat([sparse, sparse3]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False, - raise_on_extra_warnings=False,): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): exp = pd.concat([self.dense1, self.dense3]) exp = exp.to_sparse() @@ -297,12 +295,13 @@ def test_concat_different_columns(self): def test_concat_bug(self): from pandas.core.sparse.api import SparseDtype - x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan], - fill_value=0)}) + + x = pd.SparseDataFrame({"A": pd.SparseArray([np.nan, np.nan], fill_value=0)}) y = pd.SparseDataFrame({"B": []}) - res = pd.concat([x, y], sort=False)[['A']] - exp = pd.DataFrame({"A": pd.SparseArray([np.nan, np.nan], - dtype=SparseDtype(float, 0))}) + res = pd.concat([x, y], sort=False)[["A"]] + exp = pd.DataFrame( + {"A": pd.SparseArray([np.nan, np.nan], dtype=SparseDtype(float, 0))} + ) tm.assert_frame_equal(res, exp) def test_concat_different_columns_buggy(self): @@ -310,19 +309,19 @@ def test_concat_different_columns_buggy(self): sparse3 = self.dense3.to_sparse(fill_value=0) res = pd.concat([sparse, sparse3], sort=True) - exp = (pd.concat([self.dense1, self.dense3], sort=True) - .to_sparse(fill_value=0)) + exp = pd.concat([self.dense1, self.dense3], sort=True).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) res = pd.concat([sparse3, sparse], sort=True) - exp = (pd.concat([self.dense3, self.dense1], sort=True) - .to_sparse(fill_value=0)) + exp = pd.concat([self.dense3, self.dense1], sort=True).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) # different fill values sparse = self.dense1.to_sparse() @@ -343,7 +342,7 @@ def test_concat_series(self): sparse = self.dense1.to_sparse() sparse2 = self.dense2.to_sparse() - for col in ['A', 'D']: + for col in ["A", "D"]: res = pd.concat([sparse, sparse2[col]]) exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse() tm.assert_sp_frame_equal(res, exp, check_kind=False) @@ -356,21 +355,21 @@ def test_concat_series(self): sparse = self.dense1.to_sparse(fill_value=0) sparse2 = self.dense2.to_sparse(fill_value=0) - for col in ['C', 'D']: + for col in ["C", "D"]: res = 
pd.concat([sparse, sparse2[col]]) - exp = pd.concat([self.dense1, - self.dense2[col]]).to_sparse(fill_value=0) + exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, check_kind=False, - consolidate_block_indices=True) + tm.assert_sp_frame_equal( + res, exp, check_kind=False, consolidate_block_indices=True + ) res = pd.concat([sparse2[col], sparse]) - exp = pd.concat([self.dense2[col], - self.dense1]).to_sparse(fill_value=0) - exp['C'] = res['C'] + exp = pd.concat([self.dense2[col], self.dense1]).to_sparse(fill_value=0) + exp["C"] = res["C"] exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(res, exp, consolidate_block_indices=True, - check_kind=False) + tm.assert_sp_frame_equal( + res, exp, consolidate_block_indices=True, check_kind=False + ) def test_concat_axis1(self): # fill_value = np.nan @@ -391,14 +390,12 @@ def test_concat_axis1(self): sparse3 = self.dense3.to_sparse(fill_value=0) res = pd.concat([sparse, sparse3], axis=1) - exp = pd.concat([self.dense1, self.dense3], - axis=1).to_sparse(fill_value=0) + exp = pd.concat([self.dense1, self.dense3], axis=1).to_sparse(fill_value=0) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse3, sparse], axis=1) - exp = pd.concat([self.dense3, self.dense1], - axis=1).to_sparse(fill_value=0) + exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse(fill_value=0) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) @@ -416,14 +413,16 @@ def test_concat_axis1(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), exp) - @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', - itertools.product([None, 0, 1, np.nan], - [0, 1], - [1, 0])) + @pytest.mark.parametrize( + "fill_value,sparse_idx,dense_idx", + itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), + ) def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): frames = [self.dense1, self.dense2] - sparse_frame = [frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value)] + sparse_frame = [ + frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value), + ] dense_frame = [frames[dense_idx], frames[sparse_idx]] # This will try both directions sparse + dense and dense + sparse @@ -437,20 +436,21 @@ def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx): sparse_frame = sparse_frame[::-1] dense_frame = dense_frame[::-1] - @pytest.mark.parametrize('fill_value,sparse_idx,dense_idx', - itertools.product([None, 0, 1, np.nan], - [0, 1], - [1, 0])) - @pytest.mark.xfail(reason="The iloc fails and I can't make expected", - strict=False) + @pytest.mark.parametrize( + "fill_value,sparse_idx,dense_idx", + itertools.product([None, 0, 1, np.nan], [0, 1], [1, 0]), + ) + @pytest.mark.xfail(reason="The iloc fails and I can't make expected", strict=False) def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx): # See GH16874, GH18914 and #18686 for why this should be a DataFrame from pandas.core.dtypes.common import is_sparse frames = [self.dense1, self.dense3] - sparse_frame = [frames[dense_idx], - frames[sparse_idx].to_sparse(fill_value=fill_value)] + sparse_frame = [ + frames[dense_idx], + frames[sparse_idx].to_sparse(fill_value=fill_value), + ] dense_frame = [frames[dense_idx], frames[sparse_idx]] # This will try both directions sparse + dense and dense + sparse diff --git a/pandas/tests/sparse/test_format.py 
b/pandas/tests/sparse/test_format.py index 805f77eb21c2f..cf8734910cd19 100644 --- a/pandas/tests/sparse/test_format.py +++ b/pandas/tests/sparse/test_format.py @@ -15,19 +15,20 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesFormatting: - @property def dtype_format_for_platform(self): - return '' if use_32bit_repr else ', dtype=int32' + return "" if use_32bit_repr else ", dtype=int32" def test_sparse_max_row(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() result = repr(s) dfm = self.dtype_format_for_platform - exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" - "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparsea_max_row_truncated(self): @@ -37,55 +38,63 @@ def test_sparsea_max_row_truncated(self): with option_context("display.max_rows", 3): # GH 10560 result = repr(s) - exp = ("0 1.0\n ... \n4 NaN\n" - "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "0 1.0\n ... \n4 NaN\n" + "Length: 5, dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparse_mi_max_row(self): - idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), - ('C', 0), ('C', 1), ('C', 2)]) - s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], - index=idx).to_sparse() + idx = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1), ("C", 2)] + ) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], index=idx).to_sparse() result = repr(s) dfm = self.dtype_format_for_platform - exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" - "C 0 3.0\n 1 NaN\n 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp - with option_context("display.max_rows", 3, - "display.show_dimensions", False): + with option_context("display.max_rows", 3, "display.show_dimensions", False): # GH 13144 result = repr(s) - exp = ("A 0 1.0\n ... \nC 2 NaN\n" - "dtype: Sparse[float64, nan]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dfm)) + exp = ( + "A 0 1.0\n ... 
\nC 2 NaN\n" + "dtype: Sparse[float64, nan]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm) + ) assert result == exp def test_sparse_bool(self): # GH 13110 - s = pd.SparseSeries([True, False, False, True, False, False], - fill_value=False) + s = pd.SparseSeries([True, False, False, True, False, False], fill_value=False) result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 True\n1 False\n2 False\n" - "3 True\n4 False\n5 False\n" - "dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + dtype = "" if use_32bit_repr else ", dtype=int32" + exp = ( + "0 True\n1 False\n2 False\n" + "3 True\n4 False\n5 False\n" + "dtype: Sparse[bool, False]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp with option_context("display.max_rows", 3): result = repr(s) - exp = ("0 True\n ... \n5 False\n" - "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n" - "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + exp = ( + "0 True\n ... \n5 False\n" + "Length: 6, dtype: Sparse[bool, False]\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp def test_sparse_int(self): @@ -93,33 +102,39 @@ def test_sparse_int(self): s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' - exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n" - "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + dtype = "" if use_32bit_repr else ", dtype=int32" + exp = ( + "0 0\n1 1\n2 0\n3 0\n4 1\n" + "5 0\ndtype: Sparse[int64, False]\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp - with option_context("display.max_rows", 3, - "display.show_dimensions", False): + with option_context("display.max_rows", 3, "display.show_dimensions", False): result = repr(s) - exp = ("0 0\n ..\n5 0\n" - "dtype: Sparse[int64, False]\nBlockIndex\n" - "Block locations: array([1, 4]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + exp = ( + "0 0\n ..\n5 0\n" + "dtype: Sparse[int64, False]\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype) + ) assert result == exp @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseDataFrameFormatting: - def test_sparse_frame(self): # GH 13110 - df = pd.DataFrame({'A': [True, False, True, False, True], - 'B': [True, False, True, False, True], - 'C': [0, 0, 3, 0, 5], - 'D': [np.nan, np.nan, np.nan, 1, 2]}) + df = pd.DataFrame( + { + "A": [True, False, True, False, True], + "B": [True, False, True, False, True], + "C": [0, 0, 3, 0, 5], + "D": [np.nan, np.nan, np.nan, 1, 2], + } + ) sparse = df.to_sparse() assert repr(sparse) == repr(df) @@ -132,7 +147,7 @@ def test_sparse_repr_after_set(self): res = sdf.copy() # Ignore the warning - with pd.option_context('mode.chained_assignment', None): + with pd.option_context("mode.chained_assignment", None): sdf[0][1] = 2 # This line triggers the bug repr(sdf) @@ -143,7 +158,7 @@ def test_repr_no_warning(): with warnings.catch_warnings(): warnings.simplefilter("ignore", 
FutureWarning) df = pd.SparseDataFrame({"A": [1, 2]}) - s = df['A'] + s = df["A"] with tm.assert_produces_warning(None): repr(df) diff --git a/pandas/tests/sparse/test_groupby.py b/pandas/tests/sparse/test_groupby.py index bf6055bc12725..04e49a272a77a 100644 --- a/pandas/tests/sparse/test_groupby.py +++ b/pandas/tests/sparse/test_groupby.py @@ -8,22 +8,22 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestSparseGroupBy: - def setup_method(self, method): - self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8), - 'E': [np.nan, np.nan, 1, 2, - np.nan, 1, np.nan, np.nan]}) + self.dense = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], + } + ) self.sparse = self.dense.to_sparse() def test_first_last_nth(self): # tests for first / last / nth - sparse_grouped = self.sparse.groupby('A') - dense_grouped = self.dense.groupby('A') + sparse_grouped = self.sparse.groupby("A") + dense_grouped = self.dense.groupby("A") sparse_grouped_first = sparse_grouped.first() sparse_grouped_last = sparse_grouped.last() @@ -33,16 +33,13 @@ def test_first_last_nth(self): dense_grouped_last = pd.DataFrame(dense_grouped.last().to_sparse()) dense_grouped_nth = pd.DataFrame(dense_grouped.nth(1).to_sparse()) - tm.assert_frame_equal(sparse_grouped_first, - dense_grouped_first) - tm.assert_frame_equal(sparse_grouped_last, - dense_grouped_last) - tm.assert_frame_equal(sparse_grouped_nth, - dense_grouped_nth) + tm.assert_frame_equal(sparse_grouped_first, dense_grouped_first) + tm.assert_frame_equal(sparse_grouped_last, dense_grouped_last) + tm.assert_frame_equal(sparse_grouped_nth, dense_grouped_nth) def test_aggfuncs(self): - sparse_grouped = self.sparse.groupby('A') - dense_grouped = self.dense.groupby('A') + sparse_grouped = self.sparse.groupby("A") + dense_grouped = self.dense.groupby("A") result = sparse_grouped.mean().to_sparse() expected = dense_grouped.mean().to_sparse() @@ -64,10 +61,13 @@ def test_aggfuncs(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") def test_groupby_includes_fill_value(fill_value): # https://github.com/pandas-dev/pandas/issues/5078 - df = pd.DataFrame({'a': [fill_value, 1, fill_value, fill_value], - 'b': [fill_value, 1, fill_value, fill_value]}) + df = pd.DataFrame( + { + "a": [fill_value, 1, fill_value, fill_value], + "b": [fill_value, 1, fill_value, fill_value], + } + ) sdf = df.to_sparse(fill_value=fill_value) - result = sdf.groupby('a').sum() - expected = pd.DataFrame(df.groupby('a').sum().to_sparse( - fill_value=fill_value)) + result = sdf.groupby("a").sum() + expected = pd.DataFrame(df.groupby("a").sum().to_sparse(fill_value=fill_value)) tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index df59f1dfe7b13..5cfacaf16cffe 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -9,7 +9,6 @@ @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseSeriesIndexing: - def 
setup_method(self, method): self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) self.sparse = self.orig.to_sparse() @@ -51,16 +50,17 @@ def test_getitem_slice(self): def test_getitem_int_dtype(self): # GH 8292 - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx') + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name="xxx") res = s[::2] - exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx') + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name="xxx") tm.assert_sp_series_equal(res, exp) assert res.dtype == SparseDtype(np.int64) - s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx') + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name="xxx") res = s[::2] - exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], - fill_value=0, name='xxx') + exp = pd.SparseSeries( + [0, 2, 4, 6], index=[0, 2, 4, 6], fill_value=0, name="xxx" + ) tm.assert_sp_series_equal(res, exp) assert res.dtype == SparseDtype(np.int64) @@ -102,14 +102,10 @@ def test_getitem_ellipsis(self): def test_getitem_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse[:2], - orig[:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[4:2], - orig[4:2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[::2], - orig[::2].to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse[-5:], - orig[-5:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse(fill_value=0)) def test_loc(self): orig = self.orig @@ -144,14 +140,14 @@ def test_loc(self): tm.assert_sp_series_equal(result, exp) def test_loc_index(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - assert sparse.loc['A'] == 1 - assert np.isnan(sparse.loc['B']) + assert sparse.loc["A"] == 1 + assert np.isnan(sparse.loc["B"]) - result = sparse.loc[['A', 'C', 'D']] - exp = orig.loc[['A', 'C', 'D']].to_sparse() + result = sparse.loc[["A", "C", "D"]] + exp = orig.loc[["A", "C", "D"]].to_sparse() tm.assert_sp_series_equal(result, exp) # dense array @@ -169,14 +165,14 @@ def test_loc_index(self): tm.assert_sp_series_equal(result, exp) def test_loc_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - assert sparse.loc['A'] == 1 - assert np.isnan(sparse.loc['B']) + assert sparse.loc["A"] == 1 + assert np.isnan(sparse.loc["B"]) - result = sparse.loc[['A', 'C', 'D']] - exp = orig.loc[['A', 'C', 'D']].to_sparse(fill_value=0) + result = sparse.loc[["A", "C", "D"]] + exp = orig.loc[["A", "C", "D"]].to_sparse(fill_value=0) tm.assert_sp_series_equal(result, exp) # dense array @@ -195,17 +191,17 @@ def test_loc_slice(self): tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) def test_loc_slice_index_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.loc['C':], - orig.loc['C':].to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.loc["C":], 
orig.loc["C":].to_sparse(fill_value=0) + ) def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.loc[2:], - orig.loc[2:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): orig = self.orig @@ -245,8 +241,9 @@ def test_iloc_slice(self): def test_iloc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.iloc[2:], - orig.iloc[2:].to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.iloc[2:], orig.iloc[2:].to_sparse(fill_value=0) + ) def test_at(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) @@ -257,24 +254,22 @@ def test_at(self): assert sparse.at[3] == orig.at[3] assert np.isnan(sparse.at[4]) - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('abcde')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("abcde")) sparse = orig.to_sparse() - assert sparse.at['a'] == orig.at['a'] - assert np.isnan(sparse.at['b']) - assert np.isnan(sparse.at['c']) - assert sparse.at['d'] == orig.at['d'] - assert np.isnan(sparse.at['e']) + assert sparse.at["a"] == orig.at["a"] + assert np.isnan(sparse.at["b"]) + assert np.isnan(sparse.at["c"]) + assert sparse.at["d"] == orig.at["d"] + assert np.isnan(sparse.at["e"]) def test_at_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], - index=list('abcde')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("abcde")) sparse = orig.to_sparse(fill_value=0) - assert sparse.at['a'] == orig.at['a'] - assert np.isnan(sparse.at['b']) - assert sparse.at['c'] == orig.at['c'] - assert sparse.at['d'] == orig.at['d'] - assert sparse.at['e'] == orig.at['e'] + assert sparse.at["a"] == orig.at["a"] + assert np.isnan(sparse.at["b"]) + assert sparse.at["c"] == orig.at["c"] + assert sparse.at["d"] == orig.at["d"] + assert sparse.at["e"] == orig.at["e"] def test_iat(self): orig = self.orig @@ -307,38 +302,37 @@ def test_get(self): assert np.isnan(s.get(1)) assert s.get(5) is None - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE')) - assert s.get('A') == 1 - assert np.isnan(s.get('B')) - assert s.get('C') == 0 - assert s.get('XX') is None + s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE")) + assert s.get("A") == 1 + assert np.isnan(s.get("B")) + assert s.get("C") == 0 + assert s.get("XX") is None - s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list('ABCDE'), - fill_value=0) - assert s.get('A') == 1 - assert np.isnan(s.get('B')) - assert s.get('C') == 0 - assert s.get('XX') is None + s = pd.SparseSeries([1, np.nan, 0, 3, 0], index=list("ABCDE"), fill_value=0) + assert s.get("A") == 1 + assert np.isnan(s.get("B")) + assert s.get("C") == 0 + assert s.get("XX") is None def test_take(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse.take([0]), - orig.take([0]).to_sparse()) - tm.assert_sp_series_equal(sparse.take([0, 1, 3]), - orig.take([0, 1, 3]).to_sparse()) - tm.assert_sp_series_equal(sparse.take([-1, -2]), - orig.take([-1, -2]).to_sparse()) + tm.assert_sp_series_equal(sparse.take([0]), orig.take([0]).to_sparse()) + tm.assert_sp_series_equal( + sparse.take([0, 1, 3]), orig.take([0, 1, 3]).to_sparse() + ) + tm.assert_sp_series_equal( + sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse() + ) def 
test_take_fill_value(self): - orig = pd.Series([1, np.nan, 0, 3, 0], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.take([0]), - orig.take([0]).to_sparse(fill_value=0)) + tm.assert_sp_series_equal( + sparse.take([0]), orig.take([0]).to_sparse(fill_value=0) + ) exp = orig.take([0, 1, 3]).to_sparse(fill_value=0) tm.assert_sp_series_equal(sparse.take([0, 1, 3]), exp) @@ -347,81 +341,76 @@ def test_take_fill_value(self): tm.assert_sp_series_equal(sparse.take([-1, -2]), exp) def test_reindex(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan], - index=list('ABCDE')) + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse() + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse() tm.assert_sp_series_equal(res, exp) # all missing & fill_value - res = sparse.reindex(['B', 'E', 'C']) - exp = orig.reindex(['B', 'E', 'C']).to_sparse() + res = sparse.reindex(["B", "E", "C"]) + exp = orig.reindex(["B", "E", "C"]).to_sparse() tm.assert_sp_series_equal(res, exp) - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], - index=list('ABCDE')) + orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse() + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse() tm.assert_sp_series_equal(res, exp) def test_fill_value_reindex(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # includes missing and fill_value - res = sparse.reindex(['A', 'B', 'C']) - exp = orig.reindex(['A', 'B', 'C']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "B", "C"]) + exp = orig.reindex(["A", "B", "C"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # all missing - orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], - index=list('ABCDE')) + orig = pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) # all fill_value - orig = pd.Series([0., 0., 0., 0., 0.], - index=list('ABCDE')) + orig = pd.Series([0.0, 0.0, 0.0, 0.0, 0.0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) def test_fill_value_reindex_coerces_float_int(self): - orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + orig = pd.Series([1, np.nan, 0, 3, 0], index=list("ABCDE")) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'E', 'C', 'D']) - exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "E", "C", "D"]) + exp = orig.reindex(["A", "E", "C", "D"]).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, 
exp) def test_reindex_fill_value(self): - floats = pd.Series([1., 2., 3.]).to_sparse() + floats = pd.Series([1.0, 2.0, 3.0]).to_sparse() result = floats.reindex([1, 2, 3], fill_value=0) - expected = pd.Series([2., 3., 0], index=[1, 2, 3]).to_sparse() + expected = pd.Series([2.0, 3.0, 0], index=[1, 2, 3]).to_sparse() tm.assert_sp_series_equal(result, expected) def test_reindex_nearest(self): - s = pd.Series(np.arange(10, dtype='float64')).to_sparse() + s = pd.Series(np.arange(10, dtype="float64")).to_sparse() target = [0.1, 0.9, 1.5, 2.0] - actual = s.reindex(target, method='nearest') + actual = s.reindex(target, method="nearest") expected = pd.Series(np.around(target), target).to_sparse() tm.assert_sp_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', tolerance=0.2) + actual = s.reindex(target, method="nearest", tolerance=0.2) expected = pd.Series([0, 1, np.nan, 2], target).to_sparse() tm.assert_sp_series_equal(expected, actual) - actual = s.reindex(target, method='nearest', - tolerance=[0.3, 0.01, 0.4, 3]) + actual = s.reindex(target, method="nearest", tolerance=[0.3, 0.01, 0.4, 3]) expected = pd.Series([0, np.nan, np.nan, 2], target).to_sparse() tm.assert_sp_series_equal(expected, actual) @@ -430,17 +419,19 @@ def test_reindex_nearest(self): def tests_indexing_with_sparse(self, kind, fill): # see gh-13985 arr = pd.SparseArray([1, 2, 3], kind=kind) - indexer = pd.SparseArray([True, False, True], - fill_value=fill, - dtype=bool) + indexer = pd.SparseArray([True, False, True], fill_value=fill, dtype=bool) expected = arr[indexer] result = pd.SparseArray([1, 3], kind=kind) tm.assert_sp_array_equal(result, expected) s = pd.SparseSeries(arr, index=["a", "b", "c"], dtype=np.float64) - expected = pd.SparseSeries([1, 3], index=["a", "c"], kind=kind, - dtype=SparseDtype(np.float64, s.fill_value)) + expected = pd.SparseSeries( + [1, 3], + index=["a", "c"], + kind=kind, + dtype=SparseDtype(np.float64, s.fill_value), + ) tm.assert_sp_series_equal(s[indexer], expected) tm.assert_sp_series_equal(s.loc[indexer], expected) @@ -450,19 +441,18 @@ def tests_indexing_with_sparse(self, kind, fill): tm.assert_sp_series_equal(s[indexer], expected) tm.assert_sp_series_equal(s.loc[indexer], expected) - msg = ("iLocation based boolean indexing cannot " - "use an indexable as a mask") + msg = "iLocation based boolean indexing cannot " "use an indexable as a mask" with pytest.raises(ValueError, match=msg): s.iloc[indexer] @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): - def setup_method(self, method): # Mi with duplicated values - idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), - ('C', 0), ('C', 1)]) + idx = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("B", 0), ("C", 0), ("C", 1)] + ) self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) self.sparse = self.orig.to_sparse() @@ -474,8 +464,8 @@ def test_getitem_multi(self): assert np.isnan(sparse[1]) assert sparse[3] == orig[3] - tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse()) - tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse()) + tm.assert_sp_series_equal(sparse["A"], orig["A"].to_sparse()) + tm.assert_sp_series_equal(sparse["B"], orig["B"].to_sparse()) result = sparse[[1, 3, 4]] exp = orig[[1, 3, 4]].to_sparse() @@ -499,31 +489,28 @@ def test_getitem_multi_tuple(self): orig = self.orig sparse = self.sparse - assert sparse['C', 0] == orig['C', 0] - assert np.isnan(sparse['A', 1]) - assert np.isnan(sparse['B', 
0]) + assert sparse["C", 0] == orig["C", 0] + assert np.isnan(sparse["A", 1]) + assert np.isnan(sparse["B", 0]) def test_getitems_slice_multi(self): orig = self.orig sparse = self.sparse tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['A':'B'], - orig.loc['A':'B'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) def test_loc(self): # need to be override to use different label orig = self.orig sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc['A'], - orig.loc['A'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B'], - orig.loc['B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A"], orig.loc["A"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B"], orig.loc["B"].to_sparse()) result = sparse.loc[[1, 3, 4]] exp = orig.loc[[1, 3, 4]].to_sparse() @@ -535,8 +522,8 @@ def test_loc(self): tm.assert_sp_series_equal(result, exp) # single element list (GH 15447) - result = sparse.loc[['A']] - exp = orig.loc[['A']].to_sparse() + result = sparse.loc[["A"]] + exp = orig.loc[["A"]].to_sparse() tm.assert_sp_series_equal(result, exp) # dense array @@ -557,43 +544,42 @@ def test_loc_multi_tuple(self): orig = self.orig sparse = self.sparse - assert sparse.loc['C', 0] == orig.loc['C', 0] - assert np.isnan(sparse.loc['A', 1]) - assert np.isnan(sparse.loc['B', 0]) + assert sparse.loc["C", 0] == orig.loc["C", 0] + assert np.isnan(sparse.loc["A", 1]) + assert np.isnan(sparse.loc["B", 0]) def test_loc_slice(self): orig = self.orig sparse = self.sparse - tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":], orig.loc["A":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["B":], orig.loc["B":].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["C":], orig.loc["C":].to_sparse()) - tm.assert_sp_series_equal(sparse.loc['A':'B'], - orig.loc['A':'B'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc["A":"B"], orig.loc["A":"B"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:"B"], orig.loc[:"B"].to_sparse()) def test_reindex(self): # GH 15447 orig = self.orig sparse = self.sparse - res = sparse.reindex([('A', 0), ('C', 1)]) - exp = orig.reindex([('A', 0), ('C', 1)]).to_sparse() + res = sparse.reindex([("A", 0), ("C", 1)]) + exp = orig.reindex([("A", 0), ("C", 1)]).to_sparse() tm.assert_sp_series_equal(res, exp) # On specific level: - res = sparse.reindex(['A', 'C', 'B'], level=0) - exp = orig.reindex(['A', 'C', 'B'], level=0).to_sparse() + res = sparse.reindex(["A", "C", "B"], level=0) + exp = orig.reindex(["A", "C", "B"], level=0).to_sparse() tm.assert_sp_series_equal(res, exp) # single element list (GH 15447) - res = sparse.reindex(['A'], level=0) - exp = orig.reindex(['A'], level=0).to_sparse() + res = sparse.reindex(["A"], level=0) + exp = orig.reindex(["A"], 
level=0).to_sparse() tm.assert_sp_series_equal(res, exp) with pytest.raises(TypeError): # Incomplete keys are not accepted for reindexing: - sparse.reindex(['A', 'C']) + sparse.reindex(["A", "C"]) # "copy" argument: res = sparse.reindex(sparse.index, copy=True) @@ -606,48 +592,44 @@ def test_reindex(self): @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") class TestSparseDataFrameIndexing: - def test_getitem(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - tm.assert_sp_series_equal(sparse['x'], orig['x'].to_sparse()) - tm.assert_sp_frame_equal(sparse[['x']], orig[['x']].to_sparse()) - tm.assert_sp_frame_equal(sparse[['z', 'x']], - orig[['z', 'x']].to_sparse()) + tm.assert_sp_series_equal(sparse["x"], orig["x"].to_sparse()) + tm.assert_sp_frame_equal(sparse[["x"]], orig[["x"]].to_sparse()) + tm.assert_sp_frame_equal(sparse[["z", "x"]], orig[["z", "x"]].to_sparse()) - tm.assert_sp_frame_equal(sparse[[True, False, True, True]], - orig[[True, False, True, True]].to_sparse()) + tm.assert_sp_frame_equal( + sparse[[True, False, True, True]], + orig[[True, False, True, True]].to_sparse(), + ) - tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], - orig.iloc[[1, 2]].to_sparse()) + tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], orig.iloc[[1, 2]].to_sparse()) def test_getitem_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - result = sparse[['z']] - expected = orig[['z']].to_sparse(fill_value=0) + result = sparse[["z"]] + expected = orig[["z"]].to_sparse(fill_value=0) tm.assert_sp_frame_equal(result, expected, check_fill_value=False) - tm.assert_sp_series_equal(sparse['y'], - orig['y'].to_sparse(fill_value=0)) + tm.assert_sp_series_equal(sparse["y"], orig["y"].to_sparse(fill_value=0)) - exp = orig[['x']].to_sparse(fill_value=0) + exp = orig[["x"]].to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[['x']], exp) + tm.assert_sp_frame_equal(sparse[["x"]], exp) - exp = orig[['z', 'x']].to_sparse(fill_value=0) + exp = orig[["z", "x"]].to_sparse(fill_value=0) exp._default_fill_value = np.nan - tm.assert_sp_frame_equal(sparse[['z', 'x']], exp) + tm.assert_sp_frame_equal(sparse[["z", "x"]], exp) indexer = [True, False, True, True] exp = orig[indexer].to_sparse(fill_value=0) @@ -659,31 +641,29 @@ def test_getitem_fill_value(self): tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp) def test_loc(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.loc[0, 'x'] == 1 - assert np.isnan(sparse.loc[1, 'z']) - assert sparse.loc[2, 'z'] == 4 + assert sparse.loc[0, "x"] == 1 + assert np.isnan(sparse.loc[1, "z"]) + assert sparse.loc[2, "z"] == 4 # have to specify `kind='integer'`, since we construct a # new SparseArray here, and the default sparse type is # integer there, but block in SparseSeries - 
tm.assert_sp_series_equal(sparse.loc[0], - orig.loc[0].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[1], - orig.loc[1].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[2, :], - orig.loc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[2, :], - orig.loc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc[:, 'y'], - orig.loc[:, 'y'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, 'y'], - orig.loc[:, 'y'].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[0], orig.loc[0].to_sparse(kind="integer")) + tm.assert_sp_series_equal(sparse.loc[1], orig.loc[1].to_sparse(kind="integer")) + tm.assert_sp_series_equal( + sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc[2, :], orig.loc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, "y"], orig.loc[:, "y"].to_sparse()) result = sparse.loc[[1, 2]] exp = orig.loc[[1, 2]].to_sparse() @@ -693,12 +673,12 @@ def test_loc(self): exp = orig.loc[[1, 2], :].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[:, ['x', 'z']] - exp = orig.loc[:, ['x', 'z']].to_sparse() + result = sparse.loc[:, ["x", "z"]] + exp = orig.loc[:, ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[[0, 2], ['x', 'z']] - exp = orig.loc[[0, 2], ['x', 'z']].to_sparse() + result = sparse.loc[[0, 2], ["x", "z"]] + exp = orig.loc[[0, 2], ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) # exceeds the bounds @@ -721,44 +701,47 @@ def test_loc(self): tm.assert_sp_frame_equal(result, exp) def test_loc_index(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - index=list('abc'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + index=list("abc"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.loc['a', 'x'] == 1 - assert np.isnan(sparse.loc['b', 'z']) - assert sparse.loc['c', 'z'] == 4 - - tm.assert_sp_series_equal(sparse.loc['a'], - orig.loc['a'].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b'], - orig.loc['b'].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b', :], - orig.loc['b', :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.loc['b', :], - orig.loc['b', :].to_sparse(kind='integer')) - - tm.assert_sp_series_equal(sparse.loc[:, 'z'], - orig.loc[:, 'z'].to_sparse()) - tm.assert_sp_series_equal(sparse.loc[:, 'z'], - orig.loc[:, 'z'].to_sparse()) - - result = sparse.loc[['a', 'b']] - exp = orig.loc[['a', 'b']].to_sparse() + assert sparse.loc["a", "x"] == 1 + assert np.isnan(sparse.loc["b", "z"]) + assert sparse.loc["c", "z"] == 4 + + tm.assert_sp_series_equal( + sparse.loc["a"], orig.loc["a"].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b"], orig.loc["b"].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.loc["b", :], orig.loc["b", :].to_sparse(kind="integer") + ) + + tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) + tm.assert_sp_series_equal(sparse.loc[:, "z"], orig.loc[:, "z"].to_sparse()) + + result = sparse.loc[["a", "b"]] + exp = orig.loc[["a", "b"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[['a', 'b'], :] - exp = 
orig.loc[['a', 'b'], :].to_sparse() + result = sparse.loc[["a", "b"], :] + exp = orig.loc[["a", "b"], :].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[:, ['x', 'z']] - exp = orig.loc[:, ['x', 'z']].to_sparse() + result = sparse.loc[:, ["x", "z"]] + exp = orig.loc[:, ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) - result = sparse.loc[['c', 'a'], ['x', 'z']] - exp = orig.loc[['c', 'a'], ['x', 'z']].to_sparse() + result = sparse.loc[["c", "a"], ["x", "z"]] + exp = orig.loc[["c", "a"], ["x", "z"]].to_sparse() tm.assert_sp_frame_equal(result, exp) # dense array @@ -776,34 +759,30 @@ def test_loc_index(self): tm.assert_sp_frame_equal(result, exp) def test_loc_slice(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() tm.assert_sp_frame_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) def test_iloc(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]]) + orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]]) sparse = orig.to_sparse() assert sparse.iloc[1, 1] == 3 assert np.isnan(sparse.iloc[2, 0]) - tm.assert_sp_series_equal(sparse.iloc[0], - orig.loc[0].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[1], - orig.loc[1].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[2, :], - orig.iloc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[2, :], - orig.iloc[2, :].to_sparse(kind='integer')) - tm.assert_sp_series_equal(sparse.iloc[:, 1], - orig.iloc[:, 1].to_sparse()) - tm.assert_sp_series_equal(sparse.iloc[:, 1], - orig.iloc[:, 1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[0], orig.loc[0].to_sparse(kind="integer")) + tm.assert_sp_series_equal(sparse.iloc[1], orig.loc[1].to_sparse(kind="integer")) + tm.assert_sp_series_equal( + sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal( + sparse.iloc[2, :], orig.iloc[2, :].to_sparse(kind="integer") + ) + tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) + tm.assert_sp_series_equal(sparse.iloc[:, 1], orig.iloc[:, 1].to_sparse()) result = sparse.iloc[[1, 2]] exp = orig.iloc[[1, 2]].to_sparse() @@ -825,43 +804,43 @@ def test_iloc(self): sparse.iloc[[1, 3, 5]] def test_iloc_slice(self): - orig = pd.DataFrame([[1, np.nan, np.nan], - [2, 3, np.nan], - [np.nan, np.nan, 4]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, np.nan], [2, 3, np.nan], [np.nan, np.nan, 4]], + columns=list("xyz"), + ) sparse = orig.to_sparse() tm.assert_sp_frame_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) def test_at(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - assert sparse.at['A', 'x'] == orig.at['A', 'x'] - assert np.isnan(sparse.at['B', 'z']) - assert np.isnan(sparse.at['C', 'y']) - assert sparse.at['D', 'x'] == orig.at['D', 'x'] + assert sparse.at["A", "x"] == orig.at["A", "x"] + assert np.isnan(sparse.at["B", "z"]) + assert np.isnan(sparse.at["C", "y"]) + assert sparse.at["D", "x"] == orig.at["D", "x"] def test_at_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - 
[2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - assert sparse.at['A', 'x'] == orig.at['A', 'x'] - assert np.isnan(sparse.at['B', 'z']) - assert np.isnan(sparse.at['C', 'y']) - assert sparse.at['D', 'x'] == orig.at['D', 'x'] + assert sparse.at["A", "x"] == orig.at["A", "x"] + assert np.isnan(sparse.at["B", "z"]) + assert np.isnan(sparse.at["C", "y"]) + assert sparse.at["D", "x"] == orig.at["D", "x"] def test_iat(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() assert sparse.iat[0, 0] == orig.iat[0, 0] assert np.isnan(sparse.iat[1, 2]) @@ -872,11 +851,11 @@ def test_iat(self): assert sparse.iat[-1, -1] == orig.iat[-1, -1] def test_iat_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) assert sparse.iat[0, 0] == orig.iat[0, 0] assert np.isnan(sparse.iat[1, 2]) @@ -887,26 +866,21 @@ def test_iat_fill_value(self): assert sparse.iat[-1, -1] == orig.iat[-1, -1] def test_take(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse() - tm.assert_sp_frame_equal(sparse.take([0]), - orig.take([0]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([0, 1]), - orig.take([0, 1]).to_sparse()) - tm.assert_sp_frame_equal(sparse.take([-1, -2]), - orig.take([-1, -2]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([0]), orig.take([0]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([0, 1]), orig.take([0, 1]).to_sparse()) + tm.assert_sp_frame_equal(sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse()) def test_take_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) exp = orig.take([0]).to_sparse(fill_value=0) @@ -922,142 +896,163 @@ def test_take_fill_value(self): tm.assert_sp_frame_equal(sparse.take([-1, -2]), exp) def test_reindex(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse() + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse() tm.assert_sp_frame_equal(res, exp) - orig = pd.DataFrame([[np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan]], - index=list('ABCD'), 
columns=list('xyz')) + orig = pd.DataFrame( + [ + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse() - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse() + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse() tm.assert_sp_frame_equal(res, exp) def test_reindex_fill_value(self): - orig = pd.DataFrame([[1, np.nan, 0], - [2, 3, np.nan], - [0, np.nan, 4], - [0, np.nan, 5]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) # all missing - orig = pd.DataFrame([[np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan]], - index=list('ABCD'), columns=list('xyz')) + orig = pd.DataFrame( + [ + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ], + index=list("ABCD"), + columns=list("xyz"), + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) # all fill_value - orig = pd.DataFrame([[0, 0, 0], - [0, 0, 0], - [0, 0, 0], - [0, 0, 0]], - index=list('ABCD'), columns=list('xyz'), - dtype=np.int) + orig = pd.DataFrame( + [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], + index=list("ABCD"), + columns=list("xyz"), + dtype=np.int, + ) sparse = orig.to_sparse(fill_value=0) - res = sparse.reindex(['A', 'C', 'B']) - exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) + res = sparse.reindex(["A", "C", "B"]) + exp = orig.reindex(["A", "C", "B"]).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") class TestMultitype: - def setup_method(self, method): - self.cols = ['string', 'int', 'float', 'object'] + self.cols = ["string", "int", "float", "object"] - self.string_series = pd.SparseSeries(['a', 'b', 'c']) + self.string_series = pd.SparseSeries(["a", "b", "c"]) self.int_series = pd.SparseSeries([1, 2, 3]) self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) self.object_series = pd.SparseSeries([[], {}, set()]) - self.sdf = pd.SparseDataFrame({ - 'string': self.string_series, - 'int': self.int_series, - 'float': self.float_series, - 'object': self.object_series, - }) + self.sdf = pd.SparseDataFrame( + { + "string": self.string_series, + "int": self.int_series, + "float": self.float_series, + "object": self.object_series, + } + ) self.sdf = self.sdf[self.cols] - self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols) + self.ss = pd.SparseSeries(["a", 1, 1.1, []], index=self.cols) def test_frame_basic_dtypes(self): for _, row in self.sdf.iterrows(): assert row.dtype == SparseDtype(object) - tm.assert_sp_series_equal(self.sdf['string'], self.string_series, - check_names=False) - tm.assert_sp_series_equal(self.sdf['int'], self.int_series, - check_names=False) - 
tm.assert_sp_series_equal(self.sdf['float'], self.float_series, - check_names=False) - tm.assert_sp_series_equal(self.sdf['object'], self.object_series, - check_names=False) + tm.assert_sp_series_equal( + self.sdf["string"], self.string_series, check_names=False + ) + tm.assert_sp_series_equal(self.sdf["int"], self.int_series, check_names=False) + tm.assert_sp_series_equal( + self.sdf["float"], self.float_series, check_names=False + ) + tm.assert_sp_series_equal( + self.sdf["object"], self.object_series, check_names=False + ) def test_frame_indexing_single(self): - tm.assert_sp_series_equal(self.sdf.iloc[0], - pd.SparseSeries(['a', 1, 1.1, []], - index=self.cols), - check_names=False) - tm.assert_sp_series_equal(self.sdf.iloc[1], - pd.SparseSeries(['b', 2, 1.2, {}], - index=self.cols), - check_names=False) - tm.assert_sp_series_equal(self.sdf.iloc[2], - pd.SparseSeries(['c', 3, 1.3, set()], - index=self.cols), - check_names=False) + tm.assert_sp_series_equal( + self.sdf.iloc[0], + pd.SparseSeries(["a", 1, 1.1, []], index=self.cols), + check_names=False, + ) + tm.assert_sp_series_equal( + self.sdf.iloc[1], + pd.SparseSeries(["b", 2, 1.2, {}], index=self.cols), + check_names=False, + ) + tm.assert_sp_series_equal( + self.sdf.iloc[2], + pd.SparseSeries(["c", 3, 1.3, set()], index=self.cols), + check_names=False, + ) def test_frame_indexing_multiple(self): tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) - tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], - pd.SparseDataFrame({ - 'string': self.string_series.iloc[[1, 2]], - 'int': self.int_series.iloc[[1, 2]], - 'float': self.float_series.iloc[[1, 2]], - 'object': self.object_series.iloc[[1, 2]] - }, index=[1, 2])[self.cols]) - tm.assert_sp_frame_equal(self.sdf[['int', 'string']], - pd.SparseDataFrame({ - 'int': self.int_series, - 'string': self.string_series, - })) + tm.assert_sp_frame_equal( + self.sdf.iloc[[1, 2]], + pd.SparseDataFrame( + { + "string": self.string_series.iloc[[1, 2]], + "int": self.int_series.iloc[[1, 2]], + "float": self.float_series.iloc[[1, 2]], + "object": self.object_series.iloc[[1, 2]], + }, + index=[1, 2], + )[self.cols], + ) + tm.assert_sp_frame_equal( + self.sdf[["int", "string"]], + pd.SparseDataFrame({"int": self.int_series, "string": self.string_series}), + ) def test_series_indexing_single(self): for i, idx in enumerate(self.cols): assert self.ss.iloc[i] == self.ss[idx] - tm.assert_class_equal(self.ss.iloc[i], self.ss[idx], - obj="series index") + tm.assert_class_equal(self.ss.iloc[i], self.ss[idx], obj="series index") - assert self.ss['string'] == 'a' - assert self.ss['int'] == 1 - assert self.ss['float'] == 1.1 - assert self.ss['object'] == [] + assert self.ss["string"] == "a" + assert self.ss["int"] == 1 + assert self.ss["float"] == 1.1 + assert self.ss["object"] == [] def test_series_indexing_multiple(self): - tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], - pd.SparseSeries(['a', 1], - index=['string', 'int'])) - tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], - pd.SparseSeries(['a', []], - index=['string', 'object'])) + tm.assert_sp_series_equal( + self.ss.loc[["string", "int"]], + pd.SparseSeries(["a", 1], index=["string", "int"]), + ) + tm.assert_sp_series_equal( + self.ss.loc[["string", "object"]], + pd.SparseSeries(["a", []], index=["string", "object"]), + ) diff --git a/pandas/tests/sparse/test_pivot.py b/pandas/tests/sparse/test_pivot.py index 8f98117f20208..85b899dfe76d5 100644 --- a/pandas/tests/sparse/test_pivot.py +++ 
b/pandas/tests/sparse/test_pivot.py @@ -10,52 +10,56 @@ @pytest.mark.filterwarnings("ignore:Series.to_sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") class TestPivotTable: - def setup_method(self, method): rs = np.random.RandomState(0) - self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': rs.randn(8), - 'D': rs.randn(8), - 'E': [np.nan, np.nan, 1, 2, - np.nan, 1, np.nan, np.nan]}) + self.dense = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": rs.randn(8), + "D": rs.randn(8), + "E": [np.nan, np.nan, 1, 2, np.nan, 1, np.nan, np.nan], + } + ) self.sparse = self.dense.to_sparse() def test_pivot_table(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='C') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='C') + res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="C") + res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="C") tm.assert_frame_equal(res_sparse, res_dense) - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E') + res_sparse = pd.pivot_table(self.sparse, index="A", columns="B", values="E") + res_dense = pd.pivot_table(self.dense, index="A", columns="B", values="E") tm.assert_frame_equal(res_sparse, res_dense) - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E', aggfunc='mean') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E', aggfunc='mean') + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values="E", aggfunc="mean" + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values="E", aggfunc="mean" + ) tm.assert_frame_equal(res_sparse, res_dense) def test_pivot_table_with_nans(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values='E', aggfunc='sum') - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values='E', aggfunc='sum') + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values="E", aggfunc="sum" + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values="E", aggfunc="sum" + ) tm.assert_frame_equal(res_sparse, res_dense) - @pytest.mark.xfail(not _np_version_under1p17, - reason="failing occasionally on numpy > 1.17", - strict=False) + @pytest.mark.xfail( + not _np_version_under1p17, + reason="failing occasionally on numpy > 1.17", + strict=False, + ) def test_pivot_table_multi(self): - res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', - values=['D', 'E']) - res_dense = pd.pivot_table(self.dense, index='A', columns='B', - values=['D', 'E']) + res_sparse = pd.pivot_table( + self.sparse, index="A", columns="B", values=["D", "E"] + ) + res_dense = pd.pivot_table( + self.dense, index="A", columns="B", values=["D", "E"] + ) res_dense = res_dense.apply(lambda x: x.astype("Sparse[float64]")) tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/tests/sparse/test_reshape.py b/pandas/tests/sparse/test_reshape.py index 37ec0bba2621d..bb5232f065a04 100644 --- a/pandas/tests/sparse/test_reshape.py +++ b/pandas/tests/sparse/test_reshape.py @@ -26,9 +26,7 @@ def test_sparse_frame_stack(sparse_df, multi_index3): def 
test_sparse_frame_unstack(sparse_df): mi = pd.MultiIndex.from_tuples([(0, 0), (1, 0), (1, 2)]) sparse_df.index = mi - arr = np.array([[1, np.nan, np.nan], - [np.nan, 1, np.nan], - [np.nan, np.nan, 1]]) + arr = np.array([[1, np.nan, np.nan], [np.nan, 1, np.nan], [np.nan, np.nan, 1]]) unstacked_df = pd.DataFrame(arr, index=mi).unstack() unstacked_sdf = sparse_df.unstack() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 565c98ffad77b..c0d73821020b5 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -7,8 +7,7 @@ from numpy.random import RandomState import pytest -from pandas._libs import ( - algos as libalgos, groupby as libgroupby, hashtable as ht) +from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -16,8 +15,15 @@ import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DatetimeIndex, Index, IntervalIndex, Series, - Timestamp, compat) + Categorical, + CategoricalIndex, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timestamp, + compat, +) import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com @@ -27,7 +33,6 @@ class TestMatch: - def test_ints(self): values = np.array([0, 2, 1]) to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0]) @@ -50,8 +55,8 @@ def test_ints(self): tm.assert_series_equal(result, expected) def test_strings(self): - values = ['foo', 'bar', 'baz'] - to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux'] + values = ["foo", "bar", "baz"] + to_match = ["bar", "foo", "qux", "foo", "bar", "baz", "qux"] result = algos.match(to_match, values) expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64) @@ -63,19 +68,17 @@ def test_strings(self): class TestFactorize: - def test_basic(self): - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', - 'c']) - tm.assert_numpy_array_equal( - uniques, np.array(['a', 'b', 'c'], dtype=object)) + labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - labels, uniques = algos.factorize(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], sort=True) + labels, uniques = algos.factorize( + ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True + ) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array(['a', 'b', 'c'], dtype=object) + exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5)))) @@ -91,41 +94,40 @@ def test_basic(self): exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64) + exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.))), - sort=True) + labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64) + exp = np.array([0.0, 1.0, 2.0, 
3.0, 4.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) def test_mixed(self): # doc example reshaping.rst - x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) labels, uniques = algos.factorize(x) exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index(['A', 'B', 3.14, np.inf]) + exp = Index(["A", "B", 3.14, np.inf]) tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(labels, exp) - exp = Index([3.14, np.inf, 'A', 'B']) + exp = Index([3.14, np.inf, "A", "B"]) tm.assert_index_equal(uniques, exp) def test_datelike(self): # M8 - v1 = Timestamp('20130101 09:00:00.00004') - v2 = Timestamp('20130101') + v1 = Timestamp("20130101 09:00:00.00004") + v2 = Timestamp("20130101") x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) @@ -141,8 +143,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, exp) # period - v1 = pd.Period('201302', freq='M') - v2 = pd.Period('201303', freq='M') + v1 = pd.Period("201302", freq="M") + v2 = pd.Period("201303", freq="M") x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index @@ -157,8 +159,8 @@ def test_datelike(self): tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 - v1 = pd.to_timedelta('1 day 1 min') - v2 = pd.to_timedelta('1 day') + v1 = pd.to_timedelta("1 day 1 min") + v2 = pd.to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) @@ -174,61 +176,59 @@ def test_factorize_nan(self): # nan should map to na_sentinel, not reverse_indexer[na_sentinel] # rizer.factorize should not raise an exception if na_sentinel indexes # outside of reverse_indexer - key = np.array([1, 2, 1, np.nan], dtype='O') + key = np.array([1, 2, 1, np.nan], dtype="O") rizer = ht.Factorizer(len(key)) for na_sentinel in (-1, 20): ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel) - expected = np.array([0, 1, 0, na_sentinel], dtype='int32') + expected = np.array([0, 1, 0, na_sentinel], dtype="int32") assert len(set(key)) == len(set(expected)) - tm.assert_numpy_array_equal(pd.isna(key), - expected == na_sentinel) + tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) # nan still maps to na_sentinel when sort=False - key = np.array([0, np.nan, 1], dtype='O') + key = np.array([0, np.nan, 1], dtype="O") na_sentinel = -1 # TODO(wesm): unused? 
ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa - expected = np.array([2, -1, 0], dtype='int32') + expected = np.array([2, -1, 0], dtype="int32") assert len(set(key)) == len(set(expected)) tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) - @pytest.mark.parametrize("data,expected_label,expected_level", [ - ( - [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), 'nonsense'] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], - [0, 1, 2, 1, 3], - [(1, 1), (1, 2), (0, 0), (1, 2, 3)] - ), - ( - [(1, 1), (1, 2), (0, 0), (1, 2)], - [0, 1, 2, 1], - [(1, 1), (1, 2), (0, 0)] - ) - ]) + @pytest.mark.parametrize( + "data,expected_label,expected_level", + [ + ( + [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), "nonsense"], + ), + ( + [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)], + [0, 1, 2, 1, 3], + [(1, 1), (1, 2), (0, 0), (1, 2, 3)], + ), + ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), + ], + ) def test_factorize_tuple_list(self, data, expected_label, expected_level): # GH9454 result = pd.factorize(data) - tm.assert_numpy_array_equal(result[0], - np.array(expected_label, dtype=np.intp)) + tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp)) - expected_level_array = com.asarray_tuplesafe(expected_level, - dtype=object) + expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object) tm.assert_numpy_array_equal(result[1], expected_level_array) def test_complex_sorting(self): # gh 12666 - check no segfault x17 = np.array([complex(i) for i in range(17)], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) with pytest.raises(TypeError, match=msg): algos.factorize(x17[::-1], sort=True) @@ -243,42 +243,40 @@ def test_float64_factorize(self, writable): tm.assert_numpy_array_equal(uniques, exp_uniques) def test_uint64_factorize(self, writable): - data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64) + data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64) + exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_int64_factorize(self, writable): - data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64) + data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64) + exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_string_factorize(self, writable): - data = np.array(['a', 'c', 'a', 'b', 'c'], - dtype=object) + data = np.array(["a", "c", "a", "b", "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) - exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, 
uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) tm.assert_numpy_array_equal(uniques, exp_uniques) def test_object_factorize(self, writable): - data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'], - dtype=object) + data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) data.setflags(write=writable) exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) - exp_uniques = np.array(['a', 'c', 'b'], dtype=object) + exp_uniques = np.array(["a", "c", "b"], dtype=object) labels, uniques = algos.factorize(data) tm.assert_numpy_array_equal(labels, exp_labels) @@ -287,17 +285,20 @@ def test_object_factorize(self, writable): def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. - data = np.array([2**63, 1, 2**63], dtype=np.uint64) + data = np.array([2 ** 63, 1, 2 ** 63], dtype=np.uint64) with tm.assert_produces_warning(expected_warning=FutureWarning): algos.factorize(data, order=True) with tm.assert_produces_warning(False): algos.factorize(data) - @pytest.mark.parametrize('data', [ - np.array([0, 1, 0], dtype='u8'), - np.array([-2**63, 1, -2**63], dtype='i8'), - np.array(['__nan__', 'foo', '__nan__'], dtype='object'), - ]) + @pytest.mark.parametrize( + "data", + [ + np.array([0, 1, 0], dtype="u8"), + np.array([-2 ** 63, 1, -2 ** 63], dtype="i8"), + np.array(["__nan__", "foo", "__nan__"], dtype="object"), + ], + ) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. l, u = algos.factorize(data) @@ -306,16 +307,18 @@ def test_parametrized_factorize_na_value_default(self, data): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) - @pytest.mark.parametrize('data, na_value', [ - (np.array([0, 1, 0, 2], dtype='u8'), 0), - (np.array([1, 0, 1, 2], dtype='u8'), 1), - (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63), - (np.array([1, -2**63, 1, 0], dtype='i8'), 1), - (np.array(['a', '', 'a', 'b'], dtype=object), 'a'), - (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()), - (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object), - ('a', 1)), - ]) + @pytest.mark.parametrize( + "data, na_value", + [ + (np.array([0, 1, 0, 2], dtype="u8"), 0), + (np.array([1, 0, 1, 2], dtype="u8"), 1), + (np.array([-2 ** 63, 1, -2 ** 63, 0], dtype="i8"), -2 ** 63), + (np.array([1, -2 ** 63, 1, 0], dtype="i8"), 1), + (np.array(["a", "", "a", "b"], dtype=object), "a"), + (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()), + (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)), + ], + ) def test_parametrized_factorize_na_value(self, data, na_value): l, u = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] @@ -323,17 +326,24 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) - @pytest.mark.parametrize('sort', [True, False]) - @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) - @pytest.mark.parametrize('data, uniques', [ - (np.array(['b', 'a', None, 'b'], dtype=object), - np.array(['b', 'a'], dtype=object)), - (pd.array([2, 1, np.nan, 2], dtype='Int64'), - pd.array([2, 1], dtype='Int64'))], - ids=['numpy_array', 'extension_array']) + @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) + 
@pytest.mark.parametrize( + "data, uniques", + [ + ( + np.array(["b", "a", None, "b"], dtype=object), + np.array(["b", "a"], dtype=object), + ), + ( + pd.array([2, 1, np.nan, 2], dtype="Int64"), + pd.array([2, 1], dtype="Int64"), + ), + ], + ids=["numpy_array", "extension_array"], + ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - labels, uniques = algos.factorize(data, sort=sort, - na_sentinel=na_sentinel) + labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = safe_sort(uniques) @@ -348,7 +358,6 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): class TestUnique: - def test_ints(self): arr = np.random.randint(0, 100, size=50) @@ -356,20 +365,21 @@ def test_ints(self): assert isinstance(result, np.ndarray) def test_objects(self): - arr = np.random.randint(0, 100, size=50).astype('O') + arr = np.random.randint(0, 100, size=50).astype("O") result = algos.unique(arr) assert isinstance(result, np.ndarray) def test_object_refcount_bug(self): - lst = ['A', 'B', 'C', 'D', 'E'] + lst = ["A", "B", "C", "D", "E"] for i in range(1000): len(algos.unique(lst)) def test_on_index_object(self): - mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile( - np.arange(5), 5)]) + mindex = pd.MultiIndex.from_arrays( + [np.arange(5).repeat(5), np.tile(np.arange(5), 5)] + ) expected = mindex.values expected.sort() @@ -383,13 +393,20 @@ def test_on_index_object(self): def test_datetime64_dtype_array_returned(self): # GH 9431 expected = np_array_datetime64_compat( - ['2015-01-03T00:00:00.000000000+0000', - '2015-01-01T00:00:00.000000000+0000'], - dtype='M8[ns]') + [ + "2015-01-03T00:00:00.000000000+0000", + "2015-01-01T00:00:00.000000000+0000", + ], + dtype="M8[ns]", + ) - dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000', - '2015-01-01T00:00:00.000000000', - '2015-01-01T00:00:00.000000000']) + dt_index = pd.to_datetime( + [ + "2015-01-03T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + "2015-01-01T00:00:00.000000000", + ] + ) result = algos.unique(dt_index) tm.assert_numpy_array_equal(result, expected) assert result.dtype == expected.dtype @@ -406,7 +423,7 @@ def test_datetime64_dtype_array_returned(self): def test_timedelta64_dtype_array_returned(self): # GH 9431 - expected = np.array([31200, 45678, 10000], dtype='m8[ns]') + expected = np.array([31200, 45678, 10000], dtype="m8[ns]") td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678]) result = algos.unique(td_index) @@ -424,36 +441,35 @@ def test_timedelta64_dtype_array_returned(self): assert result.dtype == expected.dtype def test_uint64_overflow(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(algos.unique(s), exp) def test_nan_in_object_array(self): - duplicated_items = ['a', np.nan, 'c', 'c'] + duplicated_items = ["a", np.nan, "c", "c"] result = pd.unique(duplicated_items) - expected = np.array(['a', np.nan, 'c'], dtype=object) + expected = np.array(["a", np.nan, "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_categorical(self): # we are expecting to return in the order # of appearance - expected = Categorical(list('bac'), categories=list('bac')) + expected = Categorical(list("bac"), categories=list("bac")) # we are expecting to return 
in the order # of the categories - expected_o = Categorical( - list('bac'), categories=list('abc'), ordered=True) + expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True) # GH 15939 - c = Categorical(list('baabc')) + c = Categorical(list("baabc")) result = c.unique() tm.assert_categorical_equal(result, expected) result = algos.unique(c) tm.assert_categorical_equal(result, expected) - c = Categorical(list('baabc'), ordered=True) + c = Categorical(list("baabc"), ordered=True) result = c.unique() tm.assert_categorical_equal(result, expected_o) @@ -461,7 +477,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected_o) # Series of categorical dtype - s = Series(Categorical(list('baabc')), name='foo') + s = Series(Categorical(list("baabc")), name="foo") result = s.unique() tm.assert_categorical_equal(result, expected) @@ -469,8 +485,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected) # CI -> return CI - ci = CategoricalIndex(Categorical(list('baabc'), - categories=list('bac'))) + ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) expected = CategoricalIndex(expected) result = ci.unique() tm.assert_index_equal(result, expected) @@ -482,31 +497,55 @@ def test_datetime64tz_aware(self): # GH 15939 result = Series( - Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])).unique() - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern") - ])) + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ).unique() + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) + ) tm.assert_extension_array_equal(result, expected) - result = Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]).unique() - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) + result = Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ).unique() + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) result = pd.unique( - Series(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')]))) - expected = DatetimeArray._from_sequence(np.array([ - Timestamp('2016-01-01', tz="US/Eastern"), - ])) + Series( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + ) + expected = DatetimeArray._from_sequence( + np.array([Timestamp("2016-01-01", tz="US/Eastern")]) + ) tm.assert_extension_array_equal(result, expected) - result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', freq=None) + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) def test_order_of_appearance(self): @@ -514,39 +553,43 @@ def test_order_of_appearance(self): # light testing of guarantee of order of appearance # these also are the doc-examples result = pd.unique(Series([2, 1, 3, 3])) - tm.assert_numpy_array_equal(result, - 
np.array([2, 1, 3], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64")) result = pd.unique(Series([2] + [1] * 5)) - tm.assert_numpy_array_equal(result, - np.array([2, 1], dtype='int64')) + tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64")) - result = pd.unique(Series([Timestamp('20160101'), - Timestamp('20160101')])) - expected = np.array(['2016-01-01T00:00:00.000000000'], - dtype='datetime64[ns]') + result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")])) + expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Index( - [Timestamp('20160101', tz='US/Eastern'), - Timestamp('20160101', tz='US/Eastern')])) - expected = DatetimeIndex(['2016-01-01 00:00:00'], - dtype='datetime64[ns, US/Eastern]', - freq=None) + result = pd.unique( + Index( + [ + Timestamp("20160101", tz="US/Eastern"), + Timestamp("20160101", tz="US/Eastern"), + ] + ) + ) + expected = DatetimeIndex( + ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None + ) tm.assert_index_equal(result, expected) - result = pd.unique(list('aabc')) - expected = np.array(['a', 'b', 'c'], dtype=object) + result = pd.unique(list("aabc")) + expected = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = pd.unique(Series(Categorical(list('aabc')))) - expected = Categorical(list('abc')) + result = pd.unique(Series(Categorical(list("aabc")))) + expected = Categorical(list("abc")) tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize("arg ,expected", [ - (('1', '1', '2'), np.array(['1', '2'], dtype=object)), - (('foo',), np.array(['foo'], dtype=object)) - ]) + @pytest.mark.parametrize( + "arg ,expected", + [ + (("1", "1", "2"), np.array(["1", "2"], dtype=object)), + (("foo",), np.array(["foo"], dtype=object)), + ], + ) def test_tuple_with_strings(self, arg, expected): # see GH 17108 result = pd.unique(arg) @@ -554,9 +597,9 @@ def test_tuple_with_strings(self, arg, expected): def test_obj_none_preservation(self): # GH 20866 - arr = np.array(['foo', None], dtype=object) + arr = np.array(["foo", None], dtype=object) result = pd.unique(arr) - expected = np.array(['foo', None], dtype=object) + expected = np.array(["foo", None], dtype=object) tm.assert_numpy_array_equal(result, expected, strict_nan=True) @@ -570,8 +613,8 @@ def test_signed_zero(self): def test_different_nans(self): # GH 21866 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent @@ -582,8 +625,8 @@ def test_different_nans(self): def test_first_nan_kept(self): # GH 22295 # create different nans from bit-patterns: - bits_for_nan1 = 0xfff8000000000001 - bits_for_nan2 = 0x7ff8000000000001 + bits_for_nan1 = 0xFFF8000000000001 + bits_for_nan2 = 0x7FF8000000000001 NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0] NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] assert NAN1 != NAN1 @@ -593,17 +636,14 @@ def test_first_nan_kept(self): result = pd.unique(a) assert result.size == 1 # use bit patterns to identify which nan was kept: - result_nan_bits = struct.unpack("=Q", - 
struct.pack("d", result[0]))[0] + result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0] assert result_nan_bits == bits_for_nan1 - def test_do_not_mangle_na_values(self, unique_nulls_fixture, - unique_nulls_fixture2): + def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2): # GH 22295 if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values not unique - a = np.array([unique_nulls_fixture, - unique_nulls_fixture2], dtype=np.object) + a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) result = pd.unique(a) assert result.size == 2 assert a[0] is unique_nulls_fixture @@ -611,11 +651,12 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, class TestIsin: - def test_invalid(self): - msg = (r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[int\]") + msg = ( + r"only list-like objects are allowed to be passed to isin\(\)," + r" you passed a \[int\]" + ) with pytest.raises(TypeError, match=msg): algos.isin(1, 1) with pytest.raises(TypeError, match=msg): @@ -645,25 +686,25 @@ def test_basic(self): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], ['a']) + result = algos.isin(["a", "b"], ["a"]) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), Series(['a'])) + result = algos.isin(Series(["a", "b"]), Series(["a"])) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(Series(['a', 'b']), {'a'}) + result = algos.isin(Series(["a", "b"]), {"a"}) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - result = algos.isin(['a', 'b'], [1]) + result = algos.isin(["a", "b"], [1]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) def test_i8(self): - arr = pd.date_range('20130101', periods=3).values + arr = pd.date_range("20130101", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -676,7 +717,7 @@ def test_i8(self): expected = np.array([True, True, False]) tm.assert_numpy_array_equal(result, expected) - arr = pd.timedelta_range('1 day', periods=3).values + arr = pd.timedelta_range("1 day", periods=3).values result = algos.isin(arr, [arr[0]]) expected = np.array([True, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -691,7 +732,7 @@ def test_i8(self): def test_large(self): - s = pd.date_range('20000101', periods=2000000, freq='s').values + s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) expected[0] = True @@ -701,7 +742,7 @@ def test_large(self): def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) - cats = ['a', 'b', 'c'] + cats = ["a", "b", "c"] Sd = Series(Categorical(1).from_codes(vals, cats)) St = Series(Categorical(1).from_codes(np.array([0, 1]), cats)) expected = np.array([True, True, False, True]) @@ -743,8 +784,8 @@ def test_different_nans(self): # GH 22160 # all nans are handled as equivalent - comps = [float('nan')] - values = [float('nan')] + comps = [float("nan")] + values = [float("nan")] assert comps[0] is not values[0] # different nan-objects # as list of python-objects: @@ -752,20 +793,22 @@ def test_different_nans(self): tm.assert_numpy_array_equal(np.array([True]), result) # as object-array: - 
result = algos.isin(np.asarray(comps, dtype=np.object), - np.asarray(values, dtype=np.object)) + result = algos.isin( + np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object) + ) tm.assert_numpy_array_equal(np.array([True]), result) # as float64-array: - result = algos.isin(np.asarray(comps, dtype=np.float64), - np.asarray(values, dtype=np.float64)) + result = algos.isin( + np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64) + ) tm.assert_numpy_array_equal(np.array([True]), result) def test_no_cast(self): # GH 22160 # ensure 42 is not casted to a string - comps = ['ss', 42] - values = ['42'] + comps = ["ss", 42] + values = ["42"] expected = np.array([False, False]) result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) @@ -781,8 +824,8 @@ def test_empty(self, empty): def test_different_nan_objects(self): # GH 22119 - comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object) - vals = np.array([float('nan')], dtype=np.object) + comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object) + vals = np.array([float("nan")], dtype=np.object) expected = np.array([False, False, True]) result = algos.isin(comps, vals) tm.assert_numpy_array_equal(expected, result) @@ -792,8 +835,8 @@ def test_different_nans_as_float64(self): # create different nans from bit-patterns, # these nans will land in different buckets in the hash-table # if no special care is taken - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 @@ -811,7 +854,6 @@ def test_different_nans_as_float64(self): class TestValueCounts: - def test_value_counts(self): np.random.seed(1234) from pandas.core.reshape.tile import cut @@ -829,33 +871,32 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - expected = Series([4], - index=IntervalIndex.from_tuples([(0.996, 4.0)])) + expected = Series([4], index=IntervalIndex.from_tuples([(0.996, 4.0)])) tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - expected = Series([2, 2], - index=IntervalIndex.from_tuples([(0.996, 2.5), - (2.5, 4.0)])) + expected = Series( + [2, 2], index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]) + ) tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): - result = algos.value_counts([1, 1.]) + result = algos.value_counts([1, 1.0]) assert len(result) == 1 - result = algos.value_counts([1, 1.], bins=1) + result = algos.value_counts([1, 1.0], bins=1) assert len(result) == 1 - result = algos.value_counts(Series([1, 1., '1'])) # object + result = algos.value_counts(Series([1, 1.0, "1"])) # object assert len(result) == 2 msg = "bins argument only works with numeric data" with pytest.raises(TypeError, match=msg): - algos.value_counts(['1', 1], bins=1) + algos.value_counts(["1", 1], bins=1) def test_value_counts_nat(self): - td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]') - dt = pd.to_datetime(['NaT', '2014-01-01']) + td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]") + dt = pd.to_datetime(["NaT", "2014-01-01"]) for s in [td, dt]: vc = algos.value_counts(s) @@ -863,32 +904,40 @@ def test_value_counts_nat(self): assert len(vc) == 1 assert len(vc_with_na) 
== 2 - exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1}) + exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}) tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) def test_value_counts_datetime_outofbounds(self): # GH 13663 - s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(5000, 1, 1), datetime(6000, 1, 1), - datetime(3000, 1, 1), datetime(3000, 1, 1)]) + s = Series( + [ + datetime(3000, 1, 1), + datetime(5000, 1, 1), + datetime(5000, 1, 1), + datetime(6000, 1, 1), + datetime(3000, 1, 1), + datetime(3000, 1, 1), + ] + ) res = s.value_counts() - exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1), - datetime(6000, 1, 1)], dtype=object) + exp_index = Index( + [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)], + dtype=object, + ) exp = Series([3, 2, 1], index=exp_index) tm.assert_series_equal(res, exp) # GH 12424 - res = pd.to_datetime(Series(['2362-01-01', np.nan]), - errors='ignore') - exp = Series(['2362-01-01', np.nan], dtype=object) + res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) def test_categorical(self): - s = Series(Categorical(list('aaabbc'))) + s = Series(Categorical(list("aaabbc"))) result = s.value_counts() - expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c'])) + expected = Series([3, 2, 1], index=CategoricalIndex(["a", "b", "c"])) tm.assert_series_equal(result, expected, check_index_type=True) @@ -899,39 +948,51 @@ def test_categorical(self): tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_nans(self): - s = Series(Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) + s = Series(Categorical(list("aaaaabbbcc"))) # 4,3,2,1 (nan) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['a', 'b', 'c'])) + expected = Series( + [4, 3, 2], + index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([ - 4, 3, 2, 1 - ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) + expected = Series([4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order - s = Series(Categorical( - list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c'])) + s = Series( + Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"]) + ) s.iloc[1] = np.nan result = s.value_counts() - expected = Series([4, 3, 2], index=CategoricalIndex( - ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2], + index=CategoricalIndex( + ["a", "b", "c"], categories=["b", "a", "c"], ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = Series([4, 3, 2, 1], index=CategoricalIndex( - ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) + expected = Series( + [4, 3, 2, 1], + index=CategoricalIndex( + ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) def test_categorical_zeroes(self): # keep the `d` category with 0 - s = Series(Categorical( - list('bbbaac'), categories=list('abcd'), ordered=True)) + s = Series(Categorical(list("bbbaac"), categories=list("abcd"), 
ordered=True)) result = s.value_counts() - expected = Series([3, 2, 1, 0], index=Categorical( - ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True)) + expected = Series( + [3, 2, 1, 0], + index=Categorical( + ["b", "a", "c", "d"], categories=list("abcd"), ordered=True + ), + ) tm.assert_series_equal(result, expected, check_index_type=True) def test_dropna(self): @@ -939,59 +1000,66 @@ def test_dropna(self): tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False]).value_counts(dropna=False), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=True), - Series([2, 1], index=[True, False])) + Series([2, 1], index=[True, False]), + ) tm.assert_series_equal( Series([True, True, False, None]).value_counts(dropna=False), - Series([2, 1, 1], index=[True, False, np.nan])) + Series([2, 1, 1], index=[True, False, np.nan]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5.]).value_counts(dropna=False), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0]).value_counts(dropna=False), + Series([2, 1], index=[5.0, 10.3]), + ) tm.assert_series_equal( - Series([10.3, 5., 5., None]).value_counts(dropna=True), - Series([2, 1], index=[5., 10.3])) + Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True), + Series([2, 1], index=[5.0, 10.3]), + ) # 32-bit linux has a different ordering if not compat.is_platform_32bit(): - result = Series([10.3, 5., 5., None]).value_counts(dropna=False) - expected = Series([2, 1, 1], index=[5., 10.3, np.nan]) + result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) + expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) tm.assert_series_equal(result, expected) def test_value_counts_normalized(self): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) - dtypes = (np.float64, np.object, 'M8[ns]') + dtypes = (np.float64, np.object, "M8[ns]") for t in dtypes: s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) - expected = Series([0.6, 0.2, 0.2], - index=Series([np.nan, 2.0, 1.0], dtype=t)) + expected = Series( + [0.6, 0.2, 0.2], index=Series([np.nan, 2.0, 1.0], dtype=t) + ) tm.assert_series_equal(result, expected) result = s_typed.value_counts(normalize=True, dropna=True) - expected = Series([0.5, 0.5], - index=Series([2.0, 1.0], dtype=t)) + expected = Series([0.5, 0.5], index=Series([2.0, 1.0], dtype=t)) tm.assert_series_equal(result, expected) def test_value_counts_uint64(self): - arr = np.array([2**63], dtype=np.uint64) - expected = Series([1], index=[2**63]) + arr = np.array([2 ** 63], dtype=np.uint64) + expected = Series([1], index=[2 ** 63]) result = algos.value_counts(arr) tm.assert_series_equal(result, expected) - arr = np.array([-1, 2**63], dtype=object) - expected = Series([1, 1], index=[-1, 2**63]) + arr = np.array([-1, 2 ** 63], dtype=object) + expected = Series([1, 1], index=[-1, 2 ** 63]) result = algos.value_counts(arr) # 32-bit linux has a different ordering @@ -1000,7 +1068,6 @@ def test_value_counts_uint64(self): class TestDuplicated: - def test_duplicated_with_nas(self): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) @@ 
-1008,11 +1075,11 @@ def test_duplicated_with_nas(self): expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='first') + result = algos.duplicated(keys, keep="first") expected = np.array([False, False, False, True, False, True]) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array([True, False, True, False, False, False]) tm.assert_numpy_array_equal(result, expected) @@ -1021,8 +1088,9 @@ def test_duplicated_with_nas(self): tm.assert_numpy_array_equal(result, expected) keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): + for i, t in enumerate( + zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2) + ): keys[i] = t result = algos.duplicated(keys) @@ -1031,7 +1099,7 @@ def test_duplicated_with_nas(self): expected = np.array(falses + trues) tm.assert_numpy_array_equal(result, expected) - result = algos.duplicated(keys, keep='last') + result = algos.duplicated(keys, keep="last") expected = np.array(trues + falses) tm.assert_numpy_array_equal(result, expected) @@ -1039,51 +1107,66 @@ def test_duplicated_with_nas(self): expected = np.array(trues + trues) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('case', [ - np.array([1, 2, 1, 5, 3, - 2, 4, 1, 5, 6]), - np.array([1.1, 2.2, 1.1, np.nan, 3.3, - 2.2, 4.4, 1.1, np.nan, 6.6]), - np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, - 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), - np.array(['a', 'b', 'a', 'e', 'c', - 'b', 'd', 'a', 'e', 'f'], dtype=object), - np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], - dtype=np.uint64), - ]) + @pytest.mark.parametrize( + "case", + [ + np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array( + [ + 1 + 1j, + 2 + 2j, + 1 + 1j, + 5 + 5j, + 3 + 3j, + 2 + 2j, + 4 + 4j, + 1 + 1j, + 5 + 5j, + 6 + 6j, + ] + ), + np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object), + np.array( + [1, 2 ** 63, 1, 3 ** 5, 10, 2 ** 63, 39, 1, 3 ** 5, 7], dtype=np.uint64 + ), + ], + ) def test_numeric_object_likes(self, case): - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category')]: - res_first = idx.duplicated(keep='first') + for idx in [Index(case), Index(case, dtype="category")]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) 
tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category')]: - res_first = s.duplicated(keep='first') + for s in [Series(case), Series(case, dtype="category")]: + res_first = s.duplicated(keep="first") tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1091,52 +1174,82 @@ def test_numeric_object_likes(self, case): def test_datetime_likes(self): - dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', - '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] - td = ['1 days', '2 days', '1 days', 'NaT', '3 days', - '2 days', '4 days', '1 days', 'NaT', '6 days'] - - cases = [np.array([Timestamp(d) for d in dt]), - np.array([Timestamp(d, tz='US/Eastern') for d in dt]), - np.array([pd.Period(d, freq='D') for d in dt]), - np.array([np.datetime64(d) for d in dt]), - np.array([pd.Timedelta(d) for d in td])] - - exp_first = np.array([False, False, True, False, False, - True, False, True, True, False]) - exp_last = np.array([True, True, True, True, False, - False, False, False, False, False]) + dt = [ + "2011-01-01", + "2011-01-02", + "2011-01-01", + "NaT", + "2011-01-03", + "2011-01-02", + "2011-01-04", + "2011-01-01", + "NaT", + "2011-01-06", + ] + td = [ + "1 days", + "2 days", + "1 days", + "NaT", + "3 days", + "2 days", + "4 days", + "1 days", + "NaT", + "6 days", + ] + + cases = [ + np.array([Timestamp(d) for d in dt]), + np.array([Timestamp(d, tz="US/Eastern") for d in dt]), + np.array([pd.Period(d, freq="D") for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td]), + ] + + exp_first = np.array( + [False, False, True, False, False, True, False, True, True, False] + ) + exp_last = np.array( + [True, True, True, True, False, False, False, False, False, False] + ) exp_false = exp_first | exp_last for case in cases: - res_first = algos.duplicated(case, keep='first') + res_first = algos.duplicated(case, keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = algos.duplicated(case, keep='last') + res_last = algos.duplicated(case, keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = algos.duplicated(case, keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # index - for idx in [Index(case), Index(case, dtype='category'), - Index(case, dtype=object)]: - res_first = idx.duplicated(keep='first') + for idx in [ + Index(case), + Index(case, dtype="category"), + Index(case, dtype=object), + ]: + res_first = idx.duplicated(keep="first") tm.assert_numpy_array_equal(res_first, exp_first) - res_last = idx.duplicated(keep='last') + res_last = idx.duplicated(keep="last") tm.assert_numpy_array_equal(res_last, exp_last) res_false = idx.duplicated(keep=False) tm.assert_numpy_array_equal(res_false, exp_false) # series - for s in [Series(case), Series(case, dtype='category'), - Series(case, dtype=object)]: - res_first = s.duplicated(keep='first') + for s in [ + Series(case), + Series(case, dtype="category"), + Series(case, dtype=object), + ]: + res_first = s.duplicated(keep="first") tm.assert_series_equal(res_first, Series(exp_first)) - res_last = s.duplicated(keep='last') + res_last = s.duplicated(keep="last") tm.assert_series_equal(res_last, Series(exp_last)) res_false = s.duplicated(keep=False) @@ -1146,17 +1259,24 @@ def test_unique_index(self): cases = [Index([1, 2, 3]), 
pd.RangeIndex(0, 3)] for case in cases: assert case.is_unique is True - tm.assert_numpy_array_equal(case.duplicated(), - np.array([False, False, False])) - - @pytest.mark.parametrize('arr, unique', [ - ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], - [(0, 0), (0, 1), (1, 0), (1, 1)]), - ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')], - [('b', 'c'), ('a', 'b')]), - ([('a', 1), ('b', 2), ('a', 3), ('a', 1)], - [('a', 1), ('b', 2), ('a', 3)]), - ]) + tm.assert_numpy_array_equal( + case.duplicated(), np.array([False, False, False]) + ) + + @pytest.mark.parametrize( + "arr, unique", + [ + ( + [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], + [(0, 0), (0, 1), (1, 0), (1, 1)], + ), + ( + [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")], + [("b", "c"), ("a", "b")], + ), + ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]), + ], + ) def test_unique_tuples(self, arr, unique): # https://github.com/pandas-dev/pandas/issues/16519 expected = np.empty(len(unique), dtype=object) @@ -1167,18 +1287,17 @@ def test_unique_tuples(self, arr, unique): class GroupVarTestMixin: - def test_group_var_generic_1d(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 1))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(15, 1).astype(self.dtype) - labels = np.tile(np.arange(5), (3, )).astype('int64') + labels = np.tile(np.arange(5), (3,)).astype("int64") - expected_out = (np.squeeze(values) - .reshape((5, 3), order='F') - .std(axis=1, ddof=1) ** 2)[:, np.newaxis] + expected_out = ( + np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2 + )[:, np.newaxis] expected_counts = counts + 3 self.algo(out, counts, values, labels) @@ -1189,9 +1308,9 @@ def test_group_var_generic_1d_flat_labels(self): prng = RandomState(1234) out = (np.nan * np.ones((1, 1))).astype(self.dtype) - counts = np.zeros(1, dtype='int64') + counts = np.zeros(1, dtype="int64") values = 10 * prng.rand(5, 1).astype(self.dtype) - labels = np.zeros(5, dtype='int64') + labels = np.zeros(5, dtype="int64") expected_out = np.array([[values.std(ddof=1) ** 2]]) expected_counts = counts + 5 @@ -1205,9 +1324,9 @@ def test_group_var_generic_2d_all_finite(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) - labels = np.tile(np.arange(5), (2, )).astype('int64') + labels = np.tile(np.arange(5), (2,)).astype("int64") expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2 expected_counts = counts + 2 @@ -1220,15 +1339,17 @@ def test_group_var_generic_2d_some_nan(self): prng = RandomState(1234) out = (np.nan * np.ones((5, 2))).astype(self.dtype) - counts = np.zeros(5, dtype='int64') + counts = np.zeros(5, dtype="int64") values = 10 * prng.rand(10, 2).astype(self.dtype) values[:, 1] = np.nan - labels = np.tile(np.arange(5), (2, )).astype('int64') - - expected_out = np.vstack([values[:, 0] - .reshape(5, 2, order='F') - .std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5)]).T.astype(self.dtype) + labels = np.tile(np.arange(5), (2,)).astype("int64") + + expected_out = np.vstack( + [ + values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2, + np.nan * np.ones(5), + ] + ).T.astype(self.dtype) expected_counts = counts + 2 self.algo(out, counts, values, labels) @@ -1239,9 +1360,9 @@ def test_group_var_constant(self): # Regression test from GH 10448. 
out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype) - labels = np.zeros(3, dtype='int64') + labels = np.zeros(3, dtype="int64") self.algo(out, counts, values, labels) @@ -1262,10 +1383,10 @@ def test_group_var_large_inputs(self): prng = RandomState(1234) out = np.array([[np.nan]], dtype=self.dtype) - counts = np.array([0], dtype='int64') + counts = np.array([0], dtype="int64") values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype) values.shape = (10 ** 6, 1) - labels = np.zeros(10 ** 6, dtype='int64') + labels = np.zeros(10 ** 6, dtype="int64") self.algo(out, counts, values, labels) @@ -1282,15 +1403,13 @@ class TestGroupVarFloat32(GroupVarTestMixin): class TestHashTable: - def test_lookup_nan(self, writable): xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.Float64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_add_signed_zeros(self): # GH 21866 inconsistent hash-function for float64 @@ -1306,8 +1425,8 @@ def test_add_signed_zeros(self): def test_add_different_nans(self): # GH 21866 inconsistent hash-function for float64 # create different nans from bit-patterns: - NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0] - NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0] + NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0] + NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 # default hash function would lead to different hash-buckets @@ -1318,28 +1437,32 @@ def test_add_different_nans(self): assert len(m) == 1 # NAN1 and NAN2 are equivalent def test_lookup_overflow(self, writable): - xs = np.array([1, 2, 2**63], dtype=np.uint64) + xs = np.array([1, 2, 2 ** 63], dtype=np.uint64) # GH 21688 ensure we can deal with readonly memory views xs.setflags(write=writable) m = ht.UInt64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), - dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) def test_get_unique(self): - s = Series([1, 2, 2**63, 2**63], dtype=np.uint64) - exp = np.array([1, 2, 2**63], dtype=np.uint64) + s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) + exp = np.array([1, 2, 2 ** 63], dtype=np.uint64) tm.assert_numpy_array_equal(s.unique(), exp) - @pytest.mark.parametrize('nvals', [0, 10]) # resizing to 0 is special case - @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [ - (ht.PyObjectHashTable, ht.ObjectVector, 'object', False), - (ht.StringHashTable, ht.ObjectVector, 'object', True), - (ht.Float64HashTable, ht.Float64Vector, 'float64', False), - (ht.Int64HashTable, ht.Int64Vector, 'int64', False), - (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)]) - def test_vector_resize(self, writable, htable, uniques, dtype, - safely_resizes, nvals): + @pytest.mark.parametrize("nvals", [0, 10]) # resizing to 0 is special case + @pytest.mark.parametrize( + "htable, uniques, dtype, safely_resizes", + [ + (ht.PyObjectHashTable, ht.ObjectVector, "object", False), + (ht.StringHashTable, ht.ObjectVector, "object", True), + (ht.Float64HashTable, ht.Float64Vector, "float64", False), + 
(ht.Int64HashTable, ht.Int64Vector, "int64", False), + (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False), + ], + ) + def test_vector_resize( + self, writable, htable, uniques, dtype, safely_resizes, nvals + ): # Test for memory errors after internal vector # reallocations (GH 7157) vals = np.array(np.random.randn(1000), dtype=dtype) @@ -1364,21 +1487,25 @@ def test_vector_resize(self, writable, htable, uniques, dtype, if safely_resizes: htable.get_labels(vals, uniques, 0, -1) else: - with pytest.raises(ValueError, match='external reference.*'): + with pytest.raises(ValueError, match="external reference.*"): htable.get_labels(vals, uniques, 0, -1) - uniques.to_array() # should not raise here + uniques.to_array() # should not raise here assert tmp.shape == oldshape - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_unique(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1393,27 +1520,32 @@ def test_hashtable_unique(self, htable, tm_dtype, writable): # drop_duplicates has own cython code (hash_table_func_helper.pxi) # and is tested separately; keeps first occurrence like ht.unique() - expected_unique = s_duplicated.drop_duplicates(keep='first').values + expected_unique = s_duplicated.drop_duplicates(keep="first").values result_unique = htable().unique(s_duplicated.values) tm.assert_numpy_array_equal(result_unique, expected_unique) # test return_inverse=True # reconstruction can only succeed if the inverse is correct - result_unique, result_inverse = htable().unique(s_duplicated.values, - return_inverse=True) + result_unique, result_inverse = htable().unique( + s_duplicated.values, return_inverse=True + ) tm.assert_numpy_array_equal(result_unique, expected_unique) reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.parametrize('htable, tm_dtype', [ - (ht.PyObjectHashTable, 'String'), - (ht.StringHashTable, 'String'), - (ht.Float64HashTable, 'Float'), - (ht.Int64HashTable, 'Int'), - (ht.UInt64HashTable, 'UInt')]) + @pytest.mark.parametrize( + "htable, tm_dtype", + [ + (ht.PyObjectHashTable, "String"), + (ht.StringHashTable, "String"), + (ht.Float64HashTable, "Float"), + (ht.Int64HashTable, "Int"), + (ht.UInt64HashTable, "UInt"), + ], + ) def test_hashtable_factorize(self, htable, tm_dtype, writable): # output of maker has guaranteed unique elements - maker = getattr(tm, 'make' + tm_dtype + 'Index') + maker = getattr(tm, "make" + tm_dtype + "Index") s = Series(maker(1000)) if htable == ht.Float64HashTable: # add NaN for float column @@ -1441,42 +1573,46 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable): expected_reconstruct = s_duplicated.dropna().values tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) - @pytest.mark.parametrize('hashtable', [ - ht.PyObjectHashTable, ht.StringHashTable, - ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable]) + 
@pytest.mark.parametrize( + "hashtable", + [ + ht.PyObjectHashTable, + ht.StringHashTable, + ht.Float64HashTable, + ht.Int64HashTable, + ht.UInt64HashTable, + ], + ) def test_hashtable_large_sizehint(self, hashtable): # GH 22729 size_hint = np.iinfo(np.uint32).max + 1 - tbl = hashtable(size_hint=size_hint) # noqa + tbl = hashtable(size_hint=size_hint) # noqa def test_quantile(): s = Series(np.random.randn(100)) - result = algos.quantile(s, [0, .25, .5, .75, 1.]) - expected = algos.quantile(s.values, [0, .25, .5, .75, 1.]) + result = algos.quantile(s, [0, 0.25, 0.5, 0.75, 1.0]) + expected = algos.quantile(s.values, [0, 0.25, 0.5, 0.75, 1.0]) tm.assert_almost_equal(result, expected) def test_unique_label_indices(): - a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8') + a = np.random.randint(1, 1 << 10, 1 << 15).astype("i8") left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) a[np.random.choice(len(a), 10)] = -1 left = ht.unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right, - check_dtype=False) + tm.assert_numpy_array_equal(left, right, check_dtype=False) class TestRank: - @td.skip_if_no_scipy def test_scipy_compat(self): from scipy.stats import rankdata @@ -1490,13 +1626,13 @@ def _check(arr): exp[mask] = nan assert_almost_equal(result, exp) - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + _check(np.array([nan, nan, 5.0, 5.0, 5.0, nan, 1, 2, 3, nan])) + _check(np.array([4.0, nan, 5.0, 5.0, 5.0, nan, 1, 2, 4.0, nan])) def test_basic(self): exp = np.array([1, 2], dtype=np.float64) - for dtype in np.typecodes['AllInteger']: + for dtype in np.typecodes["AllInteger"]: s = Series([1, 100], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) @@ -1504,7 +1640,7 @@ def test_uint64_overflow(self): exp = np.array([1, 2], dtype=np.float64) for dtype in [np.float64, np.uint64]: - s = Series([1, 2**63], dtype=dtype) + s = Series([1, 2 ** 63], dtype=dtype) tm.assert_numpy_array_equal(algos.rank(s), exp) def test_too_many_ndims(self): @@ -1516,10 +1652,11 @@ def test_too_many_ndims(self): @pytest.mark.single @pytest.mark.high_memory - @pytest.mark.parametrize('values', [ - np.arange(2**24 + 1), - np.arange(2**25 + 2).reshape(2**24 + 1, 2)], - ids=['1d', '2d']) + @pytest.mark.parametrize( + "values", + [np.arange(2 ** 24 + 1), np.arange(2 ** 25 + 2).reshape(2 ** 24 + 1, 2)], + ids=["1d", "2d"], + ) def test_pct_max_many_rows(self, values): # GH 18271 result = algos.rank(values, pct=True).max() @@ -1528,8 +1665,8 @@ def test_pct_max_many_rows(self, values): def test_pad_backfill_object_segfault(): - old = np.array([], dtype='O') - new = np.array([datetime(2010, 12, 31)], dtype='O') + old = np.array([], dtype="O") + new = np.array([datetime(2010, 12, 31)], dtype="O") result = libalgos.pad["object"](old, new) expected = np.array([-1], dtype=np.int64) @@ -1549,13 +1686,12 @@ def test_pad_backfill_object_segfault(): def test_arrmap(): - values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') - result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar']) - assert (result.dtype == np.bool_) + values = np.array(["foo", "foo", "bar", "bar", "baz", "qux"], dtype="O") + result = libalgos.arrmap_object(values, lambda x: x in ["foo", "bar"]) + assert result.dtype == np.bool_ class TestTseriesUtil: 
- def test_combineFunc(self): pass @@ -1577,8 +1713,7 @@ def test_backfill(self): filler = libalgos.backfill["int64_t"](old.values, new.values) - expect_filler = np.array([0, 0, 1, 1, 1, 1, - 2, 2, 2, 2, 2, -1], dtype=np.int64) + expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.int64) tm.assert_numpy_array_equal(filler, expect_filler) # corner case @@ -1595,8 +1730,7 @@ def test_pad(self): filler = libalgos.pad["int64_t"](old.values, new.values) - expect_filler = np.array([-1, 0, 0, 0, 0, 1, - 1, 1, 1, 1, 2, 2], dtype=np.int64) + expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.int64) tm.assert_numpy_array_equal(filler, expect_filler) # corner case @@ -1609,32 +1743,267 @@ def test_pad(self): def test_is_lexsorted(): failure = [ - np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, - 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'), - np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, - 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, - 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, - 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 6, 5, - 4, 3, 2, 1, 0], dtype='int64')] - - assert (not libalgos.is_lexsorted(failure)) + np.array( + [ + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype="int64", + ), + np.array( + [ + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + 30, + 29, + 28, + 27, + 26, + 25, + 24, + 23, + 22, + 21, + 20, + 19, + 18, + 17, + 16, + 15, + 14, + 13, + 12, + 11, + 10, + 9, + 8, + 7, + 6, + 5, + 4, + 3, + 2, + 1, + 0, + ], + dtype="int64", + ), + ] + + assert not libalgos.is_lexsorted(failure) def test_groupsort_indexer(): @@ -1646,7 +2015,7 @@ def test_groupsort_indexer(): # need to use a stable sort # np.argsort returns int, groupsort_indexer # always returns int64 - expected = np.argsort(a, kind='mergesort') + expected = np.argsort(a, kind="mergesort") expected = expected.astype(np.int64) 
tm.assert_numpy_array_equal(result, expected) @@ -1718,7 +2087,7 @@ def test_ensure_platform_int(): arr = np.arange(100, dtype=np.intp) result = libalgos.ensure_platform_int(arr) - assert (result is arr) + assert result is arr def test_int64_add_overflow(): @@ -1738,34 +2107,42 @@ def test_int64_add_overflow(): with pytest.raises(OverflowError, match=msg): algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True]) + ) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True]) + ) with pytest.raises(OverflowError, match=msg): - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([False, True]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True]), + ) with pytest.raises(OverflowError, match=msg): with tm.assert_produces_warning(RuntimeWarning): - algos.checked_add_with_arr(np.array([m, m]), - np.array([np.nan, m])) + algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m])) # Check that the nan boolean arrays override whether or not # the addition overflows. We don't check the result but just # the fact that an OverflowError is not raised. - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([True, True])) - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - b_mask=np.array([True, True])) - algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), - arr_mask=np.array([True, False]), - b_mask=np.array([False, True])) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True]) + ) + algos.checked_add_with_arr( + np.array([m, m]), + np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True]), + ) class TestMode: - def test_no_mode(self): exp = Series([], dtype=np.float64) tm.assert_series_equal(algos.mode([]), exp) @@ -1778,7 +2155,7 @@ def test_mode_single(self): exp_multi = [1] data_multi = [1, 1] - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: s = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) tm.assert_series_equal(algos.mode(s), exp) @@ -1790,8 +2167,8 @@ def test_mode_single(self): exp = Series([1], dtype=np.int) tm.assert_series_equal(algos.mode([1]), exp) - exp = Series(['a', 'b', 'c'], dtype=np.object) - tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp) + exp = Series(["a", "b", "c"], dtype=np.object) + tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp) def test_number_mode(self): exp_single = [1] @@ -1800,7 +2177,7 @@ def test_number_mode(self): exp_multi = [1, 3] data_multi = [1] * 5 + [2] * 3 + [3] * 5 - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + for dt in np.typecodes["AllInteger"] + np.typecodes["Float"]: s = Series(data_single, dtype=dt) exp = Series(exp_single, dtype=dt) tm.assert_series_equal(algos.mode(s), exp) @@ -1810,15 +2187,15 @@ def 
test_number_mode(self): tm.assert_series_equal(algos.mode(s), exp) def test_strobj_mode(self): - exp = ['b'] - data = ['a'] * 2 + ['b'] * 3 + exp = ["b"] + data = ["a"] * 2 + ["b"] * 3 - s = Series(data, dtype='c') - exp = Series(exp, dtype='c') + s = Series(data, dtype="c") + exp = Series(exp, dtype="c") tm.assert_series_equal(algos.mode(s), exp) - exp = ['bar'] - data = ['foo'] * 2 + ['bar'] * 3 + exp = ["bar"] + data = ["foo"] * 2 + ["bar"] * 3 for dt in [str, object]: s = Series(data, dtype=dt) @@ -1826,41 +2203,41 @@ def test_strobj_mode(self): tm.assert_series_equal(algos.mode(s), exp) def test_datelike_mode(self): - exp = Series(['1900-05-03', '2011-01-03', - '2013-01-02'], dtype="M8[ns]") - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03'], dtype='M8[ns]') + exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]") tm.assert_series_equal(algos.mode(s), exp) - exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]') - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02'], dtype='M8[ns]') + exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]") + s = Series( + ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"], + dtype="M8[ns]", + ) tm.assert_series_equal(algos.mode(s), exp) def test_timedelta_mode(self): - exp = Series(['-1 days', '0 days', '1 days'], - dtype='timedelta64[ns]') - s = Series(['1 days', '-1 days', '0 days'], - dtype='timedelta64[ns]') + exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]") + s = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]") tm.assert_series_equal(algos.mode(s), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + s = Series( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(s), exp) def test_mixed_dtype(self): - exp = Series(['foo']) - s = Series([1, 'foo', 'foo']) + exp = Series(["foo"]) + s = Series([1, "foo", "foo"]) tm.assert_series_equal(algos.mode(s), exp) def test_uint64_overflow(self): - exp = Series([2**63], dtype=np.uint64) - s = Series([1, 2**63, 2**63], dtype=np.uint64) + exp = Series([2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) - exp = Series([1, 2**63], dtype=np.uint64) - s = Series([1, 2**63], dtype=np.uint64) + exp = Series([1, 2 ** 63], dtype=np.uint64) + s = Series([1, 2 ** 63], dtype=np.uint64) tm.assert_series_equal(algos.mode(s), exp) def test_categorical(self): @@ -1869,8 +2246,8 @@ def test_categorical(self): tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) - c = Categorical([1, 'a', 'a']) - exp = Categorical(['a'], categories=[1, 'a']) + c = Categorical([1, "a", "a"]) + exp = Categorical(["a"], categories=[1, "a"]) tm.assert_categorical_equal(algos.mode(c), exp) tm.assert_categorical_equal(c.mode(), exp) @@ -1884,15 +2261,17 @@ def test_index(self): exp = Series([1, 2, 3], dtype=np.int64) tm.assert_series_equal(algos.mode(idx), exp) - idx = Index([1, 'a', 'a']) - exp = Series(['a'], dtype=object) + idx = Index([1, "a", "a"]) + exp = Series(["a"], dtype=object) tm.assert_series_equal(algos.mode(idx), exp) idx = Index([1, 1, 2, 3, 3]) exp = Series([1, 3], dtype=np.int64) 
tm.assert_series_equal(algos.mode(idx), exp) - exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]') - idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min'], dtype='timedelta64[ns]') + exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]") + idx = Index( + ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"], + dtype="timedelta64[ns]", + ) tm.assert_series_equal(algos.mode(idx), exp) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index f9a1bb97cc48c..279d6dd84d92b 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -11,14 +11,28 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_datetime64tz_dtype, is_object_dtype, - is_timedelta64_dtype, needs_i8_conversion) + is_datetime64_dtype, + is_datetime64tz_dtype, + is_object_dtype, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd from pandas import ( - CategoricalIndex, DataFrame, DatetimeIndex, Index, Interval, IntervalIndex, - PeriodIndex, Series, Timedelta, TimedeltaIndex, Timestamp) + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + Interval, + IntervalIndex, + PeriodIndex, + Series, + Timedelta, + TimedeltaIndex, + Timestamp, +) from pandas.core.accessor import PandasDelegate from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray from pandas.core.base import NoNewAttributesMixin, PandasObject @@ -27,21 +41,20 @@ class CheckStringMixin: - def test_string_methods_dont_fail(self): repr(self.container) str(self.container) bytes(self.container) def test_tricky_container(self): - if not hasattr(self, 'unicode_container'): - pytest.skip('Need unicode_container to test with this') + if not hasattr(self, "unicode_container"): + pytest.skip("Need unicode_container to test with this") repr(self.unicode_container) str(self.unicode_container) class CheckImmutable: - mutable_regex = re.compile('does not support mutable operations') + mutable_regex = re.compile("does not support mutable operations") def check_mutable_error(self, *args, **kwargs): # Pass whatever function you normally would to pytest.raises @@ -86,10 +99,9 @@ def check_result(self, result, expected, klass=None): class TestPandasDelegate: - class Delegator: - _properties = ['foo'] - _methods = ['bar'] + _properties = ["foo"] + _methods = ["bar"] def _set_foo(self, value): self.foo = value @@ -104,7 +116,6 @@ def bar(self, *args, **kwargs): pass class Delegate(PandasDelegate, PandasObject): - def __init__(self, obj): self.obj = obj @@ -119,12 +130,10 @@ def test_invalid_delegation(self): self.Delegate._add_delegate_accessors( delegate=self.Delegator, accessors=self.Delegator._properties, - typ='property' + typ="property", ) self.Delegate._add_delegate_accessors( - delegate=self.Delegator, - accessors=self.Delegator._methods, - typ='method' + delegate=self.Delegator, accessors=self.Delegator._methods, typ="method" ) delegate = self.Delegate(self.Delegator()) @@ -148,40 +157,36 @@ def test_memory_usage(self): class Ops: - def _allow_na_ops(self, obj): """Whether to skip test cases including NaN""" - if (isinstance(obj, Index) and - (obj.is_boolean() or not obj._can_hold_na)): + if isinstance(obj, Index) and (obj.is_boolean() or not obj._can_hold_na): # don't test boolean / int64 index return False return True def setup_method(self, method): - self.bool_index = tm.makeBoolIndex(10, name='a') - self.int_index = tm.makeIntIndex(10, 
name='a') - self.float_index = tm.makeFloatIndex(10, name='a') - self.dt_index = tm.makeDateIndex(10, name='a') - self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize( - tz='US/Eastern') - self.period_index = tm.makePeriodIndex(10, name='a') - self.string_index = tm.makeStringIndex(10, name='a') - self.unicode_index = tm.makeUnicodeIndex(10, name='a') + self.bool_index = tm.makeBoolIndex(10, name="a") + self.int_index = tm.makeIntIndex(10, name="a") + self.float_index = tm.makeFloatIndex(10, name="a") + self.dt_index = tm.makeDateIndex(10, name="a") + self.dt_tz_index = tm.makeDateIndex(10, name="a").tz_localize(tz="US/Eastern") + self.period_index = tm.makePeriodIndex(10, name="a") + self.string_index = tm.makeStringIndex(10, name="a") + self.unicode_index = tm.makeUnicodeIndex(10, name="a") arr = np.random.randn(10) - self.bool_series = Series(arr, index=self.bool_index, name='a') - self.int_series = Series(arr, index=self.int_index, name='a') - self.float_series = Series(arr, index=self.float_index, name='a') - self.dt_series = Series(arr, index=self.dt_index, name='a') + self.bool_series = Series(arr, index=self.bool_index, name="a") + self.int_series = Series(arr, index=self.int_index, name="a") + self.float_series = Series(arr, index=self.float_index, name="a") + self.dt_series = Series(arr, index=self.dt_index, name="a") self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True) - self.period_series = Series(arr, index=self.period_index, name='a') - self.string_series = Series(arr, index=self.string_index, name='a') - self.unicode_series = Series(arr, index=self.unicode_index, name='a') - - types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string', - 'unicode'] - self.indexes = [getattr(self, '{}_index'.format(t)) for t in types] - self.series = [getattr(self, '{}_series'.format(t)) for t in types] + self.period_series = Series(arr, index=self.period_index, name="a") + self.string_series = Series(arr, index=self.string_index, name="a") + self.unicode_series = Series(arr, index=self.unicode_index, name="a") + + types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] + self.indexes = [getattr(self, "{}_index".format(t)) for t in types] + self.series = [getattr(self, "{}_series".format(t)) for t in types] self.objs = self.indexes + self.series def check_ops_properties(self, props, filter=None, ignore_failures=False): @@ -196,8 +201,7 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): try: if isinstance(o, Series): - expected = Series( - getattr(o.index, op), index=o.index, name='a') + expected = Series(getattr(o.index, op), index=o.index, name="a") else: expected = getattr(o, op) except (AttributeError): @@ -211,8 +215,9 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): tm.assert_series_equal(result, expected) elif isinstance(result, Index) and isinstance(expected, Index): tm.assert_index_equal(result, expected) - elif isinstance(result, np.ndarray) and isinstance(expected, - np.ndarray): + elif isinstance(result, np.ndarray) and isinstance( + expected, np.ndarray + ): tm.assert_numpy_array_equal(result, expected) else: assert result == expected @@ -231,29 +236,30 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False): with pytest.raises(err): getattr(o, op) - @pytest.mark.parametrize('klass', [Series, DataFrame]) + @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_binary_ops_docs(self, klass): - op_map = {'add': '+', - 'sub': '-', - 'mul': '*', - 'mod': 
'%', - 'pow': '**', - 'truediv': '/', - 'floordiv': '//'} + op_map = { + "add": "+", + "sub": "-", + "mul": "*", + "mod": "%", + "pow": "**", + "truediv": "/", + "floordiv": "//", + } for op_name in op_map: operand1 = klass.__name__.lower() - operand2 = 'other' + operand2 = "other" op = op_map[op_name] - expected_str = ' '.join([operand1, op, operand2]) + expected_str = " ".join([operand1, op, operand2]) assert expected_str in getattr(klass, op_name).__doc__ # reverse version of the binary ops - expected_str = ' '.join([operand2, op, operand1]) - assert expected_str in getattr(klass, 'r' + op_name).__doc__ + expected_str = " ".join([operand2, op, operand1]) + assert expected_str in getattr(klass, "r" + op_name).__doc__ class TestIndexOps(Ops): - def setup_method(self, method): super().setup_method(method) self.is_valid_objs = self.objs @@ -286,7 +292,7 @@ def test_none_comparison(self): assert result.iat[0] assert result.iat[1] - if (is_datetime64_dtype(o) or is_datetime64tz_dtype(o)): + if is_datetime64_dtype(o) or is_datetime64tz_dtype(o): # Following DatetimeIndex (and Timestamp) convention, # inequality comparisons with Series[datetime64] raise with pytest.raises(TypeError): @@ -306,16 +312,16 @@ def test_ndarray_compat_properties(self): for o in self.objs: # Check that we work. - for p in ['shape', 'dtype', 'T', 'nbytes']: + for p in ["shape", "dtype", "T", "nbytes"]: assert getattr(o, p, None) is not None # deprecated properties - for p in ['flags', 'strides', 'itemsize']: + for p in ["flags", "strides", "itemsize"]: with tm.assert_produces_warning(FutureWarning): assert getattr(o, p, None) is not None with tm.assert_produces_warning(FutureWarning): - assert hasattr(o, 'base') + assert hasattr(o, "base") # If we have a datetime-like dtype then needs a view to work # but the user is responsible for that @@ -354,25 +360,26 @@ def test_value_counts_unique_nunique(self): expected_index = Index(o[::-1]) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + o.name = "a" else: expected_index = Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) # take-based repeat indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1)) rep = values.take(indices) - o = klass(rep, index=idx, name='a') + o = klass(rep, index=idx, name="a") # check values has the same dtype as the original assert o.dtype == orig.dtype - expected_s = Series(range(10, 0, -1), index=expected_index, - dtype='int64', name='a') + expected_s = Series( + range(10, 0, -1), index=expected_index, dtype="int64", name="a" + ) result = o.value_counts() tm.assert_series_equal(result, expected_s) assert result.index.name is None - assert result.name == 'a' + assert result.name == "a" result = o.unique() if isinstance(o, Index): @@ -385,14 +392,14 @@ def test_value_counts_unique_nunique(self): assert isinstance(r, Timestamp) tm.assert_numpy_array_equal( - result.astype(object), - orig._values.astype(object)) + result.astype(object), orig._values.astype(object) + ) else: tm.assert_numpy_array_equal(result, orig.values) assert o.nunique() == len(np.unique(o.values)) - @pytest.mark.parametrize('null_obj', [np.nan, None]) + @pytest.mark.parametrize("null_obj", [np.nan, None]) def test_value_counts_unique_nunique_null(self, null_obj): for orig in self.objs: @@ -431,7 +438,7 @@ def test_value_counts_unique_nunique_null(self, null_obj): # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) - o.name = 'a' + o.name = "a" else: if isinstance(o, DatetimeIndex): expected_index = 
orig._values._shallow_copy(values) @@ -439,7 +446,7 @@ def test_value_counts_unique_nunique_null(self, null_obj): expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + o.name = "a" # check values has the same dtype as the original assert o.dtype == orig.dtype @@ -449,29 +456,34 @@ def test_value_counts_unique_nunique_null(self, null_obj): if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: - exp = Series(nanloc, o.index, name='a') + exp = Series(nanloc, o.index, name="a") tm.assert_series_equal(pd.isna(o), exp) - expected_s_na = Series(list(range(10, 2, -1)) + [3], - index=expected_index[9:0:-1], - dtype='int64', name='a') - expected_s = Series(list(range(10, 2, -1)), - index=expected_index[9:1:-1], - dtype='int64', name='a') + expected_s_na = Series( + list(range(10, 2, -1)) + [3], + index=expected_index[9:0:-1], + dtype="int64", + name="a", + ) + expected_s = Series( + list(range(10, 2, -1)), + index=expected_index[9:1:-1], + dtype="int64", + name="a", + ) result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None - assert result_s_na.name == 'a' + assert result_s_na.name == "a" result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None - assert result_s.name == 'a' + assert result_s.name == "a" result = o.unique() if isinstance(o, Index): - tm.assert_index_equal(result, - Index(values[1:], name='a')) + tm.assert_index_equal(result, Index(values[1:], name="a")) elif is_datetime64tz_dtype(o): # unable to compare NaT / nan tm.assert_extension_array_equal(result[1:], values[2:]) @@ -485,11 +497,11 @@ def test_value_counts_unique_nunique_null(self, null_obj): assert o.nunique() == 8 assert o.nunique(dropna=False) == 9 - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_inferred(self, klass): - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) - expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): @@ -503,22 +515,22 @@ def test_value_counts_inferred(self, klass): # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() - expected = Series([3, 1, 4, 2], index=list('acbd')).sort_values() + expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) - expected = Series([1, 2, 3, 4], index=list('cdab')) + expected = Series([1, 2, 3, 4], index=list("cdab")) tm.assert_series_equal(hist, expected) # relative histogram. 
hist = s.value_counts(normalize=True) - expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) + expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_bins(self, klass): - s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] + s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) # bins @@ -553,63 +565,66 @@ def test_value_counts_bins(self, klass): tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25, 0], - index=intervals.take([0, 3, 1, 2])) + exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly - s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, - 'd', 'd', 'a', 'a', 'b'] + s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) - expected = Series([4, 3, 2], index=['b', 'a', 'd']) + expected = Series([4, 3, 2], index=["b", "a", "d"]) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): - exp = Index(['a', 'b', np.nan, 'd']) + exp = Index(["a", "b", np.nan, "d"]) tm.assert_index_equal(s.unique(), exp) else: - exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) + exp = np.array(["a", "b", np.nan, "d"], dtype=object) tm.assert_numpy_array_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) expected = Series([], dtype=np.int64) - tm.assert_series_equal(s.value_counts(), expected, - check_index_type=False) + tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: - tm.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) assert s.nunique() == 0 - @pytest.mark.parametrize('klass', [Index, Series]) + @pytest.mark.parametrize("klass", [Index, Series]) def test_value_counts_datetime64(self, klass): # GH 3002, datetime64[ns] # don't test names though - txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', - 'xxyyzz20100101EGG', 'xxyyww20090101EGG', - 'foofoo20080909PIE', 'foofoo20080909GUM']) + txt = "\n".join( + [ + "xxyyzz20100101PIE", + "xxyyzz20100101GUM", + "xxyyzz20100101EGG", + "xxyyww20090101EGG", + "foofoo20080909PIE", + "foofoo20080909GUM", + ] + ) f = StringIO(txt) - df = pd.read_fwf(f, widths=[6, 8, 3], - names=["person_id", "dt", "food"], - parse_dates=["dt"]) + df = pd.read_fwf( + f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"] + ) - s = klass(df['dt'].copy()) + s = klass(df["dt"].copy()) s.name = None - idx = pd.to_datetime(['2010-01-01 00:00:00', - '2008-09-09 00:00:00', - '2009-01-01 00:00:00']) + idx = pd.to_datetime( + ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] + ) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) - expected = np_array_datetime64_compat(['2010-01-01 00:00:00', - '2009-01-01 00:00:00', - '2008-09-09 00:00:00'], - dtype='datetime64[ns]') + expected = np_array_datetime64_compat( + ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], + dtype="datetime64[ns]", + ) if isinstance(s, Index): tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: @@ -618,11 +633,11 
@@ def test_value_counts_datetime64(self, klass): assert s.nunique() == 3 # with NaT - s = df['dt'].copy() + s = df["dt"].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() - assert result.index.dtype == 'datetime64[ns]' + assert result.index.dtype == "datetime64[ns]" tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) @@ -630,7 +645,7 @@ def test_value_counts_datetime64(self, klass): tm.assert_series_equal(result, expected_s) unique = s.unique() - assert unique.dtype == 'datetime64[ns]' + assert unique.dtype == "datetime64[ns]" # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): @@ -645,20 +660,20 @@ def test_value_counts_datetime64(self, klass): # timedelta64[ns] td = df.dt - df.dt + timedelta(1) - td = klass(td, name='dt') + td = klass(td, name="dt") result = td.value_counts() - expected_s = Series([6], index=[Timedelta('1day')], name='dt') + expected_s = Series([6], index=[Timedelta("1day")], name="dt") tm.assert_series_equal(result, expected_s) - expected = TimedeltaIndex(['1 days'], name='dt') + expected = TimedeltaIndex(["1 days"], name="dt") if isinstance(td, Index): tm.assert_index_equal(td.unique(), expected) else: tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) - td2 = klass(td2, name='dt') + td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) @@ -677,12 +692,10 @@ def test_factorize(self): tm.assert_numpy_array_equal(labels, exp_arr) if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig), - check_names=False) + tm.assert_index_equal(uniques, Index(orig), check_names=False) else: # factorize explicitly resets name - tm.assert_index_equal(uniques, exp_uniques, - check_names=False) + tm.assert_index_equal(uniques, exp_uniques, check_names=False) def test_factorize_repeated(self): for orig in self.objs: @@ -701,19 +714,20 @@ def test_factorize_repeated(self): o = o.take(indexer) n = o[5:].append(o) - exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - dtype=np.intp) + exp_arr = np.array( + [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp + ) labels, uniques = n.factorize(sort=True) tm.assert_numpy_array_equal(labels, exp_arr) if isinstance(o, Series): - tm.assert_index_equal(uniques, Index(orig).sort_values(), - check_names=False) + tm.assert_index_equal( + uniques, Index(orig).sort_values(), check_names=False + ) else: tm.assert_index_equal(uniques, o, check_names=False) - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], - np.intp) + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) labels, uniques = n.factorize(sort=False) tm.assert_numpy_array_equal(labels, exp_arr) @@ -732,7 +746,7 @@ def test_duplicated_drop_duplicates_index(self): # special case if original.is_boolean(): result = original.drop_duplicates() - expected = Index([False, True], name='a') + expected = Index([False, True], name="a") tm.assert_index_equal(result, expected) continue @@ -750,8 +764,7 @@ def test_duplicated_drop_duplicates_index(self): # create repeated values, 3rd and 5th values are duplicated idx = original[list(range(len(original))) + [5, 3]] - expected = np.array([False] * len(original) + [True, True], - dtype=bool) + expected = np.array([False] * len(original) + [True, True], dtype=bool) duplicated = idx.duplicated() tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool @@ -762,10 +775,10 @@ def 
test_duplicated_drop_duplicates_index(self): base[5] = True expected = np.array(base) - duplicated = idx.duplicated(keep='last') + duplicated = idx.duplicated(keep="last") tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool - result = idx.drop_duplicates(keep='last') + result = idx.drop_duplicates(keep="last") tm.assert_index_equal(result, idx[~expected]) base = [False] * len(original) + [True, True] @@ -779,14 +792,18 @@ def test_duplicated_drop_duplicates_index(self): result = idx.drop_duplicates(keep=False) tm.assert_index_equal(result, idx[~expected]) - with pytest.raises(TypeError, - match=(r"drop_duplicates\(\) got an " - r"unexpected keyword argument")): + with pytest.raises( + TypeError, + match=( + r"drop_duplicates\(\) got an " r"unexpected keyword argument" + ), + ): idx.drop_duplicates(inplace=True) else: - expected = Series([False] * len(original), - index=original.index, name='a') + expected = Series( + [False] * len(original), index=original.index, name="a" + ) tm.assert_series_equal(original.duplicated(), expected) result = original.drop_duplicates() tm.assert_series_equal(result, original) @@ -794,45 +811,56 @@ def test_duplicated_drop_duplicates_index(self): idx = original.index[list(range(len(original))) + [5, 3]] values = original._values[list(range(len(original))) + [5, 3]] - s = Series(values, index=idx, name='a') + s = Series(values, index=idx, name="a") - expected = Series([False] * len(original) + [True, True], - index=idx, name='a') + expected = Series( + [False] * len(original) + [True, True], index=idx, name="a" + ) tm.assert_series_equal(s.duplicated(), expected) tm.assert_series_equal(s.drop_duplicates(), original) base = [False] * len(idx) base[3] = True base[5] = True - expected = Series(base, index=idx, name='a') + expected = Series(base, index=idx, name="a") - tm.assert_series_equal(s.duplicated(keep='last'), expected) - tm.assert_series_equal(s.drop_duplicates(keep='last'), - s[~np.array(base)]) + tm.assert_series_equal(s.duplicated(keep="last"), expected) + tm.assert_series_equal( + s.drop_duplicates(keep="last"), s[~np.array(base)] + ) base = [False] * len(original) + [True, True] base[3] = True base[5] = True - expected = Series(base, index=idx, name='a') + expected = Series(base, index=idx, name="a") tm.assert_series_equal(s.duplicated(keep=False), expected) - tm.assert_series_equal(s.drop_duplicates(keep=False), - s[~np.array(base)]) + tm.assert_series_equal( + s.drop_duplicates(keep=False), s[~np.array(base)] + ) s.drop_duplicates(inplace=True) tm.assert_series_equal(s, original) def test_drop_duplicates_series_vs_dataframe(self): # GH 14192 - df = pd.DataFrame({'a': [1, 1, 1, 'one', 'one'], - 'b': [2, 2, np.nan, np.nan, np.nan], - 'c': [3, 3, np.nan, np.nan, 'three'], - 'd': [1, 2, 3, 4, 4], - 'e': [datetime(2015, 1, 1), datetime(2015, 1, 1), - datetime(2015, 2, 1), pd.NaT, pd.NaT] - }) + df = pd.DataFrame( + { + "a": [1, 1, 1, "one", "one"], + "b": [2, 2, np.nan, np.nan, np.nan], + "c": [3, 3, np.nan, np.nan, "three"], + "d": [1, 2, 3, 4, 4], + "e": [ + datetime(2015, 1, 1), + datetime(2015, 1, 1), + datetime(2015, 2, 1), + pd.NaT, + pd.NaT, + ], + } + ) for column in df.columns: - for keep in ['first', 'last', False]: + for keep in ["first", "last", False]: dropped_frame = df[[column]].drop_duplicates(keep=keep) dropped_series = df[column].drop_duplicates(keep=keep) tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) @@ -896,17 +924,18 @@ def test_memory_usage(self): res = o.memory_usage() res_deep = 
o.memory_usage(deep=True) - if (is_object_dtype(o) or (isinstance(o, Series) and - is_object_dtype(o.index))): + if is_object_dtype(o) or ( + isinstance(o, Series) and is_object_dtype(o.index) + ): # if there are objects, only deep will pick them up assert res_deep > res else: assert res == res_deep if isinstance(o, Series): - assert ((o.memory_usage(index=False) + - o.index.memory_usage()) == - o.memory_usage(index=True)) + assert ( + o.memory_usage(index=False) + o.index.memory_usage() + ) == o.memory_usage(index=True) # sys.getsizeof will call the .memory_usage with # deep=True, and add on some GC overhead @@ -944,10 +973,15 @@ def test_getitem(self): with pytest.raises(IndexError): s.iloc[20] - @pytest.mark.parametrize('indexer_klass', [list, pd.Index]) - @pytest.mark.parametrize('indexer', [[True] * 10, [False] * 10, - [True, False, True, True, False, - False, True, True, False, True]]) + @pytest.mark.parametrize("indexer_klass", [list, pd.Index]) + @pytest.mark.parametrize( + "indexer", + [ + [True] * 10, + [False] * 10, + [True, False, True, True, False, False, True, True, False, True], + ], + ) def test_bool_indexing(self, indexer_klass, indexer): # GH 22533 for idx in self.indexes: @@ -980,7 +1014,6 @@ def test_numpy_transpose(self): class TestNoNewAttributesMixin: - def test_mixin(self): class T(NoNewAttributesMixin): pass @@ -1005,32 +1038,34 @@ class TestToIterable: # test that we convert an iterable to python types dtypes = [ - ('int8', int), - ('int16', int), - ('int32', int), - ('int64', int), - ('uint8', int), - ('uint16', int), - ('uint32', int), - ('uint64', int), - ('float16', float), - ('float32', float), - ('float64', float), - ('datetime64[ns]', Timestamp), - ('datetime64[ns, US/Eastern]', Timestamp), - ('timedelta64[ns]', Timedelta)] - - @pytest.mark.parametrize( - 'dtype, rdtype', dtypes) + ("int8", int), + ("int16", int), + ("int32", int), + ("int64", int), + ("uint8", int), + ("uint16", int), + ("uint32", int), + ("uint64", int), + ("float16", float), + ("float32", float), + ("float64", float), + ("datetime64[ns]", Timestamp), + ("datetime64[ns, US/Eastern]", Timestamp), + ("timedelta64[ns]", Timedelta), + ] + + @pytest.mark.parametrize("dtype, rdtype", dtypes) @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) - @pytest.mark.parametrize('typ', [Series, Index]) + ], + ids=["tolist", "to_list", "list", "iter"], + ) + @pytest.mark.parametrize("typ", [Series, Index]) @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") # TODO(GH-24559): Remove the filterwarnings def test_iterable(self, typ, method, dtype, rdtype): @@ -1042,23 +1077,26 @@ def test_iterable(self, typ, method, dtype, rdtype): assert isinstance(result, rdtype) @pytest.mark.parametrize( - 'dtype, rdtype, obj', + "dtype, rdtype, obj", [ - ('object', object, 'a'), - ('object', int, 1), - ('category', object, 'a'), - ('category', int, 1)]) + ("object", object, "a"), + ("object", int, 1), + ("category", object, "a"), + ("category", int, 1), + ], + ) @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) - @pytest.mark.parametrize('typ', [Series, Index]) - def test_iterable_object_and_category(self, typ, method, - dtype, rdtype, obj): + ], + ids=["tolist", "to_list", "list", "iter"], + ) + @pytest.mark.parametrize("typ", 
[Series, Index]) + def test_iterable_object_and_category(self, typ, method, dtype, rdtype, obj): # gh-10904 # gh-13258 # coerce iteration to underlying python / pandas types @@ -1066,8 +1104,7 @@ def test_iterable_object_and_category(self, typ, method, result = method(s)[0] assert isinstance(result, rdtype) - @pytest.mark.parametrize( - 'dtype, rdtype', dtypes) + @pytest.mark.parametrize("dtype, rdtype", dtypes) def test_iterable_items(self, dtype, rdtype): # gh-13258 # test items / iteritems yields the correct boxed scalars @@ -1080,11 +1117,9 @@ def test_iterable_items(self, dtype, rdtype): assert isinstance(result, rdtype) @pytest.mark.parametrize( - 'dtype, rdtype', - dtypes + [ - ('object', int), - ('category', int)]) - @pytest.mark.parametrize('typ', [Series, Index]) + "dtype, rdtype", dtypes + [("object", int), ("category", int)] + ) + @pytest.mark.parametrize("typ", [Series, Index]) @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") # TODO(GH-24559): Remove the filterwarnings def test_iterable_map(self, typ, dtype, rdtype): @@ -1097,89 +1132,104 @@ def test_iterable_map(self, typ, dtype, rdtype): assert result in rdtype @pytest.mark.parametrize( - 'method', + "method", [ lambda x: x.tolist(), lambda x: x.to_list(), lambda x: list(x), lambda x: list(x.__iter__()), - ], ids=['tolist', 'to_list', 'list', 'iter']) + ], + ids=["tolist", "to_list", "list", "iter"], + ) def test_categorial_datetimelike(self, method): - i = CategoricalIndex([Timestamp('1999-12-31'), - Timestamp('2000-12-31')]) + i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")]) result = method(i)[0] assert isinstance(result, Timestamp) def test_iter_box(self): - vals = [Timestamp('2011-01-01'), Timestamp('2011-01-02')] + vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] s = Series(vals) - assert s.dtype == 'datetime64[ns]' + assert s.dtype == "datetime64[ns]" for res, exp in zip(s, vals): assert isinstance(res, Timestamp) assert res.tz is None assert res == exp - vals = [Timestamp('2011-01-01', tz='US/Eastern'), - Timestamp('2011-01-02', tz='US/Eastern')] + vals = [ + Timestamp("2011-01-01", tz="US/Eastern"), + Timestamp("2011-01-02", tz="US/Eastern"), + ] s = Series(vals) - assert s.dtype == 'datetime64[ns, US/Eastern]' + assert s.dtype == "datetime64[ns, US/Eastern]" for res, exp in zip(s, vals): assert isinstance(res, Timestamp) assert res.tz == exp.tz assert res == exp # timedelta - vals = [Timedelta('1 days'), Timedelta('2 days')] + vals = [Timedelta("1 days"), Timedelta("2 days")] s = Series(vals) - assert s.dtype == 'timedelta64[ns]' + assert s.dtype == "timedelta64[ns]" for res, exp in zip(s, vals): assert isinstance(res, Timedelta) assert res == exp # period - vals = [pd.Period('2011-01-01', freq='M'), - pd.Period('2011-01-02', freq='M')] + vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) - assert s.dtype == 'Period[M]' + assert s.dtype == "Period[M]" for res, exp in zip(s, vals): assert isinstance(res, pd.Period) - assert res.freq == 'M' + assert res.freq == "M" assert res == exp -@pytest.mark.parametrize('array, expected_type, dtype', [ - (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), - (np.array(['a', 'b']), np.ndarray, 'object'), - (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), - (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray, - 'datetime64[ns, US/Central]'), - - (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray, - pd.core.dtypes.dtypes.PeriodDtype("A-DEC")), - 
(pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, - 'interval'), - - # This test is currently failing for datetime64[ns] and timedelta64[ns]. - # The NumPy type system is sufficient for representing these types, so - # we just use NumPy for Series / DataFrame columns of these types (so - # we get consolidation and so on). - # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray - # abstraction to for code reuse. - # At the moment, we've judged that allowing this test to fail is more - # practical that overriding Series._values to special case - # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. - pytest.param( - pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]', - marks=[pytest.mark.xfail(reason="datetime _values", strict=True)] - ), - pytest.param( - pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]', - marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)] - ), - -]) +@pytest.mark.parametrize( + "array, expected_type, dtype", + [ + (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"), + (np.array(["a", "b"]), np.ndarray, "object"), + (pd.Categorical(["a", "b"]), pd.Categorical, "category"), + ( + pd.DatetimeIndex(["2017", "2018"], tz="US/Central"), + DatetimeArray, + "datetime64[ns, US/Central]", + ), + ( + pd.PeriodIndex([2018, 2019], freq="A"), + pd.core.arrays.PeriodArray, + pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), + ), + ( + pd.IntervalIndex.from_breaks([0, 1, 2]), + pd.core.arrays.IntervalArray, + "interval", + ), + # This test is currently failing for datetime64[ns] and timedelta64[ns]. + # The NumPy type system is sufficient for representing these types, so + # we just use NumPy for Series / DataFrame columns of these types (so + # we get consolidation and so on). + # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray + # abstraction to for code reuse. + # At the moment, we've judged that allowing this test to fail is more + # practical that overriding Series._values to special case + # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. 
+ pytest.param( + pd.DatetimeIndex(["2017", "2018"]), + np.ndarray, + "datetime64[ns]", + marks=[pytest.mark.xfail(reason="datetime _values", strict=True)], + ), + pytest.param( + pd.TimedeltaIndex([10 ** 10]), + np.ndarray, + "m8[ns]", + marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)], + ), + ], +) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values r_values = pd.Index(array)._values @@ -1189,18 +1239,27 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_equal(l_values, r_values) -@pytest.mark.parametrize('array, expected', [ - (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), - (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), - (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), - (pd.DatetimeIndex(['2017-01-01T00:00:00']), - np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), - (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), - np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), - (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), - (pd.PeriodIndex(['2017', '2018'], freq='D'), - np.array([17167, 17532], dtype=np.int64)), -]) +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), + (np.array(["0", "1"]), np.array(["0", "1"], dtype=object)), + (pd.Categorical(["a", "a"]), np.array([0, 0], dtype="int8")), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"]), + np.array(["2017-01-01T00:00:00"], dtype="M8[ns]"), + ), + ( + pd.DatetimeIndex(["2017-01-01T00:00:00"], tz="US/Eastern"), + np.array(["2017-01-01T05:00:00"], dtype="M8[ns]"), + ), + (pd.TimedeltaIndex([10 ** 10]), np.array([10 ** 10], dtype="m8[ns]")), + ( + pd.PeriodIndex(["2017", "2018"], freq="D"), + np.array([17167, 17532], dtype=np.int64), + ), + ], +) def test_ndarray_values(array, expected): l_values = pd.Series(array)._ndarray_values r_values = pd.Index(array)._ndarray_values @@ -1208,9 +1267,7 @@ def test_ndarray_values(array, expected): tm.assert_numpy_array_equal(l_values, expected) -@pytest.mark.parametrize("arr", [ - np.array([1, 2, 3]), -]) +@pytest.mark.parametrize("arr", [np.array([1, 2, 3])]) def test_numpy_array(arr): ser = pd.Series(arr) result = ser.array @@ -1229,23 +1286,30 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): assert isinstance(result, PandasArray) -@pytest.mark.parametrize("array, attr", [ - (pd.Categorical(['a', 'b']), '_codes'), - (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'), - (pd.core.arrays.integer_array([0, np.nan]), '_data'), - (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'), - (pd.SparseArray([0, 1]), '_sparse_values'), - (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), - # tz-aware Datetime - (DatetimeArray(np.array(['2000-01-01T12:00:00', - '2000-01-02T12:00:00'], - dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz="US/Central")), - '_data'), -]) -@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +@pytest.mark.parametrize( + "array, attr", + [ + (pd.Categorical(["a", "b"]), "_codes"), + (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), + (pd.core.arrays.integer_array([0, np.nan]), "_data"), + (pd.core.arrays.IntervalArray.from_breaks([0, 1]), "_left"), + (pd.SparseArray([0, 1]), "_sparse_values"), + (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), + # tz-aware Datetime + ( + DatetimeArray( + np.array( + ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]" + ), + 
dtype=DatetimeTZDtype(tz="US/Central"), + ), + "_data", + ), + ], +) +@pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_array(array, attr, box): - if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: pytest.skip("No index type for {}".format(array.dtype)) result = box(array, copy=False).array @@ -1257,43 +1321,61 @@ def test_array(array, attr, box): def test_array_multiindex_raises(): - idx = pd.MultiIndex.from_product([['A'], ['a', 'b']]) - with pytest.raises(ValueError, match='MultiIndex'): + idx = pd.MultiIndex.from_product([["A"], ["a", "b"]]) + with pytest.raises(ValueError, match="MultiIndex"): idx.array -@pytest.mark.parametrize('array, expected', [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(['a', 'b']), np.array(['a', 'b'], dtype=object)), - (pd.core.arrays.period_array(['2000', '2001'], freq='D'), - np.array([pd.Period('2000', freq="D"), pd.Period('2001', freq='D')])), - (pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object)), - (pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), - np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object)), - (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), - - # tz-naive datetime - (DatetimeArray(np.array(['2000', '2001'], dtype='M8[ns]')), - np.array(['2000', '2001'], dtype='M8[ns]')), - - # tz-aware stays tz`-aware - (DatetimeArray(np.array(['2000-01-01T06:00:00', - '2000-01-02T06:00:00'], - dtype='M8[ns]'), - dtype=DatetimeTZDtype(tz='US/Central')), - np.array([pd.Timestamp('2000-01-01', tz='US/Central'), - pd.Timestamp('2000-01-02', tz='US/Central')])), - - # Timedelta - (TimedeltaArray(np.array([0, 3600000000000], dtype='i8'), freq='H'), - np.array([0, 3600000000000], dtype='m8[ns]')), -]) -@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +@pytest.mark.parametrize( + "array, expected", + [ + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + ( + pd.core.arrays.period_array(["2000", "2001"], freq="D"), + np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + ), + ( + pd.core.arrays.integer_array([0, np.nan]), + np.array([0, np.nan], dtype=object), + ), + ( + pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), + np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + ), + (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + # tz-naive datetime + ( + DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), + np.array(["2000", "2001"], dtype="M8[ns]"), + ), + # tz-aware stays tz`-aware + ( + DatetimeArray( + np.array( + ["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]" + ), + dtype=DatetimeTZDtype(tz="US/Central"), + ), + np.array( + [ + pd.Timestamp("2000-01-01", tz="US/Central"), + pd.Timestamp("2000-01-02", tz="US/Central"), + ] + ), + ), + # Timedelta + ( + TimedeltaArray(np.array([0, 3600000000000], dtype="i8"), freq="H"), + np.array([0, 3600000000000], dtype="m8[ns]"), + ), + ], +) +@pytest.mark.parametrize("box", [pd.Series, pd.Index]) def test_to_numpy(array, expected, box): thing = box(array) - if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: pytest.skip("No index type for {}".format(array.dtype)) result = thing.to_numpy() @@ -1301,10 +1383,9 @@ def test_to_numpy(array, expected, box): 
@pytest.mark.parametrize("as_series", [True, False]) -@pytest.mark.parametrize("arr", [ - np.array([1, 2, 3], dtype="int64"), - np.array(['a', 'b', 'c'], dtype=object), -]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] +) def test_to_numpy_copy(arr, as_series): obj = pd.Index(arr, copy=False) if as_series: @@ -1325,23 +1406,22 @@ def test_to_numpy_copy(arr, as_series): @pytest.mark.parametrize("as_series", [True, False]) def test_to_numpy_dtype(as_series): tz = "US/Eastern" - obj = pd.DatetimeIndex(['2000', '2001'], tz=tz) + obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: obj = pd.Series(obj) # preserve tz by default result = obj.to_numpy() - expected = np.array([pd.Timestamp('2000', tz=tz), - pd.Timestamp('2001', tz=tz)], - dtype=object) + expected = np.array( + [pd.Timestamp("2000", tz=tz), pd.Timestamp("2001", tz=tz)], dtype=object + ) tm.assert_numpy_array_equal(result, expected) result = obj.to_numpy(dtype="object") tm.assert_numpy_array_equal(result, expected) result = obj.to_numpy(dtype="M8[ns]") - expected = np.array(['2000-01-01T05', '2001-01-01T05'], - dtype='M8[ns]') + expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -1349,35 +1429,47 @@ class TestConstruction: # test certain constructor behaviours on dtype inference across Series, # Index and DataFrame - @pytest.mark.parametrize("klass", [ - Series, - lambda x, **kwargs: DataFrame({'a': x}, **kwargs)['a'], - pytest.param(lambda x, **kwargs: DataFrame(x, **kwargs)[0], - marks=pytest.mark.xfail), - Index, - ]) - @pytest.mark.parametrize("a", [ - np.array(['2263-01-01'], dtype='datetime64[D]'), - np.array([datetime(2263, 1, 1)], dtype=object), - np.array([np.datetime64('2263-01-01', 'D')], dtype=object), - np.array(["2263-01-01"], dtype=object) - ], ids=['datetime64[D]', 'object-datetime.datetime', - 'object-numpy-scalar', 'object-string']) + @pytest.mark.parametrize( + "klass", + [ + Series, + lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], + pytest.param( + lambda x, **kwargs: DataFrame(x, **kwargs)[0], marks=pytest.mark.xfail + ), + Index, + ], + ) + @pytest.mark.parametrize( + "a", + [ + np.array(["2263-01-01"], dtype="datetime64[D]"), + np.array([datetime(2263, 1, 1)], dtype=object), + np.array([np.datetime64("2263-01-01", "D")], dtype=object), + np.array(["2263-01-01"], dtype=object), + ], + ids=[ + "datetime64[D]", + "object-datetime.datetime", + "object-numpy-scalar", + "object-string", + ], + ) def test_constructor_datetime_outofbound(self, a, klass): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == 'M': + if a.dtype.kind == "M": with pytest.raises(pd.errors.OutOfBoundsDatetime): klass(a) else: result = klass(a) - assert result.dtype == 'object' + assert result.dtype == "object" tm.assert_numpy_array_equal(result.to_numpy(), a) # Explicit dtype specified # Forced conversion fails for all -> all cases raise error with pytest.raises(pd.errors.OutOfBoundsDatetime): - klass(a, dtype='datetime64[ns]') + klass(a, dtype="datetime64[ns]") diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index de8d28eeb41a6..d96f806bc383f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -21,31 +21,31 @@ def fn(x): part2 = partial(part1) class somecall: - def __call__(self): 
return x # noqa - assert getname(fn) == 'fn' + assert getname(fn) == "fn" assert getname(lambda_) - assert getname(part1) == 'fn' - assert getname(part2) == 'fn' - assert getname(somecall()) == 'somecall' + assert getname(part1) == "fn" + assert getname(part2) == "fn" + assert getname(somecall()) == "somecall" assert getname(1) is None def test_any_none(): - assert (com._any_none(1, 2, 3, None)) - assert (not com._any_none(1, 2, 3, 4)) + assert com._any_none(1, 2, 3, None) + assert not com._any_none(1, 2, 3, 4) def test_all_not_none(): - assert (com._all_not_none(1, 2, 3, 4)) - assert (not com._all_not_none(1, 2, 3, None)) - assert (not com._all_not_none(None, None, None, None)) + assert com._all_not_none(1, 2, 3, 4) + assert not com._all_not_none(1, 2, 3, None) + assert not com._all_not_none(None, None, None, None) def test_random_state(): import numpy.random as npr + # Check with seed state = com.random_state(5) assert state.uniform() == npr.RandomState(5).uniform() @@ -59,31 +59,34 @@ def test_random_state(): # Error for floats or strings with pytest.raises(ValueError): - com.random_state('test') + com.random_state("test") with pytest.raises(ValueError): com.random_state(5.5) -@pytest.mark.parametrize('left, right, expected', [ - (Series([1], name='x'), Series([2], name='x'), 'x'), - (Series([1], name='x'), Series([2], name='y'), None), - (Series([1]), Series([2], name='x'), None), - (Series([1], name='x'), Series([2]), None), - (Series([1], name='x'), [2], 'x'), - ([1], Series([2], name='y'), 'y')]) +@pytest.mark.parametrize( + "left, right, expected", + [ + (Series([1], name="x"), Series([2], name="x"), "x"), + (Series([1], name="x"), Series([2], name="y"), None), + (Series([1]), Series([2], name="x"), None), + (Series([1], name="x"), Series([2]), None), + (Series([1], name="x"), [2], "x"), + ([1], Series([2], name="y"), "y"), + ], +) def test_maybe_match_name(left, right, expected): assert ops._maybe_match_name(left, right) == expected def test_dict_compat(): - data_datetime64 = {np.datetime64('1990-03-15'): 1, - np.datetime64('2015-03-15'): 2} + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} data_unchanged = {1: 2, 3: 4, 5: 6} - expected = {Timestamp('1990-3-15'): 1, Timestamp('2015-03-15'): 2} - assert (com.dict_compat(data_datetime64) == expected) - assert (com.dict_compat(expected) == expected) - assert (com.dict_compat(data_unchanged) == data_unchanged) + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert com.dict_compat(data_datetime64) == expected + assert com.dict_compat(expected) == expected + assert com.dict_compat(data_unchanged) == data_unchanged def test_standardize_mapping(): @@ -99,11 +102,11 @@ def test_standardize_mapping(): with pytest.raises(TypeError): com.standardize_mapping(list) - fill = {'bad': 'data'} - assert (com.standardize_mapping(fill) == dict) + fill = {"bad": "data"} + assert com.standardize_mapping(fill) == dict # Convert instance to type - assert (com.standardize_mapping({}) == dict) + assert com.standardize_mapping({}) == dict dd = collections.defaultdict(list) assert isinstance(com.standardize_mapping(dd), partial) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index bb662e99664e2..d644c002fbdfb 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -35,13 +35,13 @@ def import_module(name): @pytest.fixture def df(): - return DataFrame({'A': [1, 2, 3]}) + return DataFrame({"A": [1, 2, 3]}) def test_dask(df): - toolz = 
import_module('toolz') # noqa - dask = import_module('dask') # noqa + toolz = import_module("toolz") # noqa + dask = import_module("dask") # noqa import dask.dataframe as dd @@ -52,7 +52,7 @@ def test_dask(df): def test_xarray(df): - xarray = import_module('xarray') # noqa + xarray = import_module("xarray") # noqa assert df.to_xarray() is not None @@ -67,22 +67,23 @@ def test_oo_optimizable(): @pytest.mark.filterwarnings("ignore:can't:ImportWarning") def test_statsmodels(): - statsmodels = import_module('statsmodels') # noqa + statsmodels = import_module("statsmodels") # noqa import statsmodels.api as sm import statsmodels.formula.api as smf + df = sm.datasets.get_rdataset("Guerry", "HistData").data - smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=df).fit() + smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit() # Cython import warning @pytest.mark.filterwarnings("ignore:can't:ImportWarning") def test_scikit_learn(df): - sklearn = import_module('sklearn') # noqa + sklearn = import_module("sklearn") # noqa from sklearn import svm, datasets digits = datasets.load_digits() - clf = svm.SVC(gamma=0.001, C=100.) + clf = svm.SVC(gamma=0.001, C=100.0) clf.fit(digits.data[:-1], digits.target[:-1]) clf.predict(digits.data[-1:]) @@ -92,23 +93,22 @@ def test_scikit_learn(df): @pytest.mark.filterwarnings("ignore") def test_seaborn(): - seaborn = import_module('seaborn') + seaborn = import_module("seaborn") tips = seaborn.load_dataset("tips") seaborn.stripplot(x="day", y="total_bill", data=tips) def test_pandas_gbq(df): - pandas_gbq = import_module('pandas_gbq') # noqa + pandas_gbq = import_module("pandas_gbq") # noqa @pytest.mark.xfail(reason="0.7.0 pending") @tm.network def test_pandas_datareader(): - pandas_datareader = import_module('pandas_datareader') # noqa - pandas_datareader.DataReader( - 'F', 'quandl', '2017-01-01', '2017-02-01') + pandas_datareader = import_module("pandas_datareader") # noqa + pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") # importing from pandas, Cython import warning @@ -118,8 +118,8 @@ def test_pandas_datareader(): @pytest.mark.skip(reason="gh-25778: geopandas stack issue") def test_geopandas(): - geopandas = import_module('geopandas') # noqa - fp = geopandas.datasets.get_path('naturalearth_lowres') + geopandas = import_module("geopandas") # noqa + fp = geopandas.datasets.get_path("naturalearth_lowres") assert geopandas.read_file(fp) is not None @@ -127,7 +127,7 @@ def test_geopandas(): @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_pyarrow(df): - pyarrow = import_module('pyarrow') # noqa + pyarrow = import_module("pyarrow") # noqa table = pyarrow.Table.from_pandas(df) result = table.to_pandas() tm.assert_frame_equal(result, df) @@ -142,11 +142,11 @@ def test_missing_required_dependency(): # -E : disable PYTHON* env vars, especially PYTHONPATH # And, that's apparently not enough, so we give up. 
# https://github.com/MacPython/pandas-wheels/pull/50 - call = ['python', '-sSE', '-c', 'import pandas'] + call = ["python", "-sSE", "-c", "import pandas"] with pytest.raises(subprocess.CalledProcessError) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() - for name in ['numpy', 'pytz', 'dateutil']: + for name in ["numpy", "pytz", "dateutil"]: assert name in output diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index 899b985f247d4..531c511e8c02d 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -6,12 +6,22 @@ @pytest.mark.parametrize( - "exc", ['UnsupportedFunctionCall', 'UnsortedIndexError', - 'OutOfBoundsDatetime', - 'ParserError', 'PerformanceWarning', 'DtypeWarning', - 'EmptyDataError', 'ParserWarning', 'MergeError']) + "exc", + [ + "UnsupportedFunctionCall", + "UnsortedIndexError", + "OutOfBoundsDatetime", + "ParserError", + "PerformanceWarning", + "DtypeWarning", + "EmptyDataError", + "ParserWarning", + "MergeError", + ], +) def test_exception_importable(exc): from pandas import errors + e = getattr(errors, exc) assert e is not None @@ -24,7 +34,7 @@ def test_catch_oob(): from pandas import errors try: - pd.Timestamp('15000101') + pd.Timestamp("15000101") except errors.OutOfBoundsDatetime: pass @@ -48,11 +58,11 @@ def test_error_rename(): class Foo: @classmethod def classmethod(cls): - raise AbstractMethodError(cls, methodtype='classmethod') + raise AbstractMethodError(cls, methodtype="classmethod") @property def property(self): - raise AbstractMethodError(self, methodtype='property') + raise AbstractMethodError(self, methodtype="property") def method(self): raise AbstractMethodError(self) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 1f3f5a251ef17..a7281e002cc5c 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -9,31 +9,41 @@ from pandas.core.computation import expressions as expr import pandas.util.testing as tm from pandas.util.testing import ( - assert_almost_equal, assert_frame_equal, assert_series_equal) + assert_almost_equal, + assert_frame_equal, + assert_series_equal, +) from pandas.io.formats.printing import pprint_thing -_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') -_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64') -_mixed = DataFrame({'A': _frame['A'].copy(), - 'B': _frame['B'].astype('float32'), - 'C': _frame['C'].astype('int64'), - 'D': _frame['D'].astype('int32')}) -_mixed2 = DataFrame({'A': _frame2['A'].copy(), - 'B': _frame2['B'].astype('float32'), - 'C': _frame2['C'].astype('int64'), - 'D': _frame2['D'].astype('int32')}) +_frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") +_frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64") +_mixed = DataFrame( + { + "A": _frame["A"].copy(), + "B": _frame["B"].astype("float32"), + "C": _frame["C"].astype("int64"), + "D": _frame["D"].astype("int32"), + } +) +_mixed2 = DataFrame( + { + "A": _frame2["A"].copy(), + "B": _frame2["B"].astype("float32"), + "C": _frame2["C"].astype("int64"), + "D": _frame2["D"].astype("int32"), + } +) _integer = DataFrame( - np.random.randint(1, 100, - size=(10001, 4)), - columns=list('ABCD'), dtype='int64') -_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)), - columns=list('ABCD'), dtype='int64') + np.random.randint(1, 100, size=(10001, 4)), columns=list("ABCD"), dtype="int64" +) +_integer2 = DataFrame( + 
np.random.randint(1, 100, size=(101, 4)), columns=list("ABCD"), dtype="int64" +) -@pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr') +@pytest.mark.skipif(not expr._USE_NUMEXPR, reason="not using numexpr") class TestExpressions: - def setup_method(self, method): self.frame = _frame.copy() @@ -46,15 +56,14 @@ def setup_method(self, method): def teardown_method(self, method): expr._MIN_ELEMENTS = self._MIN_ELEMENTS - def run_arithmetic(self, df, other, assert_func, check_dtype=False, - test_flex=True): + def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=True): expr._MIN_ELEMENTS = 0 - operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv'] + operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"] for arith in operations: operator_name = arith - if arith == 'div': - operator_name = 'truediv' + if arith == "div": + operator_name = "truediv" if test_flex: op = lambda x, y: getattr(df, arith)(y) @@ -68,22 +77,30 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, result = op(df, other) try: if check_dtype: - if arith == 'truediv': - assert expected.dtype.kind == 'f' + if arith == "truediv": + assert expected.dtype.kind == "f" assert_func(expected, result) except Exception: pprint_thing("Failed test with operator %r" % op.__name__) raise def test_integer_arithmetic(self): - self.run_arithmetic(self.integer, self.integer, - assert_frame_equal) - self.run_arithmetic(self.integer.iloc[:, 0], - self.integer.iloc[:, 0], assert_series_equal, - check_dtype=True) - - def run_binary(self, df, other, assert_func, test_flex=False, - numexpr_ops={'gt', 'lt', 'ge', 'le', 'eq', 'ne'}): + self.run_arithmetic(self.integer, self.integer, assert_frame_equal) + self.run_arithmetic( + self.integer.iloc[:, 0], + self.integer.iloc[:, 0], + assert_series_equal, + check_dtype=True, + ) + + def run_binary( + self, + df, + other, + assert_func, + test_flex=False, + numexpr_ops={"gt", "lt", "ge", "le", "eq", "ne"}, + ): """ tests solely that the result is the same whether or not numexpr is enabled. 
Need to test whether the function does the correct thing @@ -91,7 +108,7 @@ def run_binary(self, df, other, assert_func, test_flex=False, """ expr._MIN_ELEMENTS = 0 expr.set_test_mode(True) - operations = ['gt', 'lt', 'ge', 'le', 'eq', 'ne'] + operations = ["gt", "lt", "ge", "le", "eq", "ne"] for arith in operations: if test_flex: @@ -116,27 +133,24 @@ def run_binary(self, df, other, assert_func, test_flex=False, pprint_thing("test_flex was %r" % test_flex) raise - def run_frame(self, df, other, binary_comp=None, run_binary=True, - **kwargs): - self.run_arithmetic(df, other, assert_frame_equal, - test_flex=False, **kwargs) - self.run_arithmetic(df, other, assert_frame_equal, test_flex=True, - **kwargs) + def run_frame(self, df, other, binary_comp=None, run_binary=True, **kwargs): + self.run_arithmetic(df, other, assert_frame_equal, test_flex=False, **kwargs) + self.run_arithmetic(df, other, assert_frame_equal, test_flex=True, **kwargs) if run_binary: if binary_comp is None: expr.set_use_numexpr(False) binary_comp = other + 1 expr.set_use_numexpr(True) - self.run_binary(df, binary_comp, assert_frame_equal, - test_flex=False, **kwargs) - self.run_binary(df, binary_comp, assert_frame_equal, - test_flex=True, **kwargs) + self.run_binary( + df, binary_comp, assert_frame_equal, test_flex=False, **kwargs + ) + self.run_binary( + df, binary_comp, assert_frame_equal, test_flex=True, **kwargs + ) def run_series(self, ser, other, binary_comp=None, **kwargs): - self.run_arithmetic(ser, other, assert_series_equal, - test_flex=False, **kwargs) - self.run_arithmetic(ser, other, assert_almost_equal, - test_flex=True, **kwargs) + self.run_arithmetic(ser, other, assert_series_equal, test_flex=False, **kwargs) + self.run_arithmetic(ser, other, assert_almost_equal, test_flex=True, **kwargs) # series doesn't uses vec_compare instead of numexpr... 
# if binary_comp is None: # binary_comp = other + 1 @@ -169,78 +183,84 @@ def test_mixed_arithmetic_series(self): def test_float_arithemtic(self): self.run_arithmetic(self.frame, self.frame, assert_frame_equal) - self.run_arithmetic(self.frame.iloc[:, 0], self.frame.iloc[:, 0], - assert_series_equal, check_dtype=True) + self.run_arithmetic( + self.frame.iloc[:, 0], + self.frame.iloc[:, 0], + assert_series_equal, + check_dtype=True, + ) def test_mixed_arithmetic(self): self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal) for col in self.mixed.columns: - self.run_arithmetic(self.mixed[col], self.mixed[col], - assert_series_equal) + self.run_arithmetic(self.mixed[col], self.mixed[col], assert_series_equal) def test_integer_with_zeros(self): self.integer *= np.random.randint(0, 2, size=np.shape(self.integer)) - self.run_arithmetic(self.integer, self.integer, - assert_frame_equal) - self.run_arithmetic(self.integer.iloc[:, 0], - self.integer.iloc[:, 0], assert_series_equal) + self.run_arithmetic(self.integer, self.integer, assert_frame_equal) + self.run_arithmetic( + self.integer.iloc[:, 0], self.integer.iloc[:, 0], assert_series_equal + ) def test_invalid(self): # no op - result = expr._can_use_numexpr(operator.add, None, self.frame, - self.frame, 'evaluate') + result = expr._can_use_numexpr( + operator.add, None, self.frame, self.frame, "evaluate" + ) assert not result # mixed - result = expr._can_use_numexpr(operator.add, '+', self.mixed, - self.frame, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.mixed, self.frame, "evaluate" + ) assert not result # min elements - result = expr._can_use_numexpr(operator.add, '+', self.frame2, - self.frame2, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.frame2, self.frame2, "evaluate" + ) assert not result # ok, we only check on first part of expression - result = expr._can_use_numexpr(operator.add, '+', self.frame, - self.frame2, 'evaluate') + result = expr._can_use_numexpr( + operator.add, "+", self.frame, self.frame2, "evaluate" + ) assert result def test_binary_ops(self): def testit(): - for f, f2 in [(self.frame, self.frame2), - (self.mixed, self.mixed2)]: + for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]: - for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), - ('div', '/'), ('pow', '**')]: + for op, op_str in [ + ("add", "+"), + ("sub", "-"), + ("mul", "*"), + ("div", "/"), + ("pow", "**"), + ]: - if op == 'pow': + if op == "pow": continue - if op == 'div': - op = getattr(operator, 'truediv', None) + if op == "div": + op = getattr(operator, "truediv", None) else: op = getattr(operator, op, None) if op is not None: - result = expr._can_use_numexpr(op, op_str, f, f, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f, f, "evaluate") assert result != f._is_mixed_type - result = expr.evaluate(op, op_str, f, f, - use_numexpr=True) - expected = expr.evaluate(op, op_str, f, f, - use_numexpr=False) + result = expr.evaluate(op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) else: - tm.assert_numpy_array_equal(result, - expected.values) + tm.assert_numpy_array_equal(result, expected.values) - result = expr._can_use_numexpr(op, op_str, f2, f2, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f2, f2, "evaluate") assert not result expr.set_use_numexpr(False) @@ -253,8 +273,7 @@ def testit(): def test_boolean_ops(self): def testit(): - 
for f, f2 in [(self.frame, self.frame2), - (self.mixed, self.mixed2)]: + for f, f2 in [(self.frame, self.frame2), (self.mixed, self.mixed2)]: f11 = f f12 = f + 1 @@ -262,26 +281,28 @@ def testit(): f21 = f2 f22 = f2 + 1 - for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='), - ('le', '<='), ('eq', '=='), ('ne', '!=')]: + for op, op_str in [ + ("gt", ">"), + ("lt", "<"), + ("ge", ">="), + ("le", "<="), + ("eq", "=="), + ("ne", "!="), + ]: op = getattr(operator, op) - result = expr._can_use_numexpr(op, op_str, f11, f12, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f11, f12, "evaluate") assert result != f11._is_mixed_type - result = expr.evaluate(op, op_str, f11, f12, - use_numexpr=True) - expected = expr.evaluate(op, op_str, f11, f12, - use_numexpr=False) + result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) if isinstance(result, DataFrame): tm.assert_frame_equal(result, expected) else: tm.assert_numpy_array_equal(result, expected.values) - result = expr._can_use_numexpr(op, op_str, f21, f22, - 'evaluate') + result = expr._can_use_numexpr(op, op_str, f21, f22, "evaluate") assert not result expr.set_use_numexpr(False) @@ -313,11 +334,10 @@ def testit(): testit() def test_bool_ops_raise_on_arithmetic(self): - df = DataFrame({'a': np.random.rand(10) > 0.5, - 'b': np.random.rand(10) > 0.5}) - names = 'truediv', 'floordiv', 'pow' - ops = '/', '//', '**' - msg = 'operator %r not implemented for bool dtypes' + df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5}) + names = "truediv", "floordiv", "pow" + ops = "/", "//", "**" + msg = "operator %r not implemented for bool dtypes" for op, name in zip(ops, names): f = getattr(operator, name) err_msg = re.escape(msg % op) @@ -342,17 +362,16 @@ def test_bool_ops_raise_on_arithmetic(self): def test_bool_ops_warn_on_arithmetic(self): n = 10 - df = DataFrame({'a': np.random.rand(n) > 0.5, - 'b': np.random.rand(n) > 0.5}) - names = 'add', 'mul', 'sub' - ops = '+', '*', '-' - subs = {'+': '|', '*': '&', '-': '^'} - sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'} + df = DataFrame({"a": np.random.rand(n) > 0.5, "b": np.random.rand(n) > 0.5}) + names = "add", "mul", "sub" + ops = "+", "*", "-" + subs = {"+": "|", "*": "&", "-": "^"} + sub_funcs = {"|": "or_", "&": "and_", "^": "xor"} for op, name in zip(ops, names): f = getattr(operator, name) fe = getattr(operator, sub_funcs[subs[op]]) - if op == '-': + if op == "-": # raises TypeError continue @@ -387,18 +406,28 @@ def test_bool_ops_warn_on_arithmetic(self): e = fe(df, True) tm.assert_frame_equal(r, e) - @pytest.mark.parametrize("test_input,expected", [ - (DataFrame([[0, 1, 2, 'aa'], [0, 1, 2, 'aa']], - columns=['a', 'b', 'c', 'dtype']), - DataFrame([[False, False], [False, False]], - columns=['a', 'dtype'])), - (DataFrame([[0, 3, 2, 'aa'], [0, 4, 2, 'aa'], [0, 1, 1, 'bb']], - columns=['a', 'b', 'c', 'dtype']), - DataFrame([[False, False], [False, False], - [False, False]], columns=['a', 'dtype'])), - ]) + @pytest.mark.parametrize( + "test_input,expected", + [ + ( + DataFrame( + [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"] + ), + DataFrame([[False, False], [False, False]], columns=["a", "dtype"]), + ), + ( + DataFrame( + [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]], + columns=["a", "b", "c", "dtype"], + ), + DataFrame( + [[False, False], [False, False], [False, False]], + columns=["a", "dtype"], + ), + ), + ], + ) def test_bool_ops_column_name_dtype(self, 
test_input, expected): # GH 22383 - .ne fails if columns containing column name 'dtype' - result = test_input.loc[:, ['a', 'dtype']].ne( - test_input.loc[:, ['a', 'dtype']]) + result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]]) assert_frame_equal(result, expected) diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 455981506efca..e750193abb71a 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -8,13 +8,14 @@ class TestIndexer: - def test_outer_join_indexer(self): - typemap = [('int32', _join.outer_join_indexer_int32), - ('int64', _join.outer_join_indexer_int64), - ('float32', _join.outer_join_indexer_float32), - ('float64', _join.outer_join_indexer_float64), - ('object', _join.outer_join_indexer_object)] + typemap = [ + ("int32", _join.outer_join_indexer_int32), + ("int64", _join.outer_join_indexer_int64), + ("float32", _join.outer_join_indexer_float32), + ("float64", _join.outer_join_indexer_float64), + ("object", _join.outer_join_indexer_object), + ] for dtype, indexer in typemap: left = np.arange(3, dtype=dtype) @@ -56,12 +57,111 @@ def test_left_join_indexer_unique(): def test_left_outer_join_bug(): - left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) + left = np.array( + [ + 0, + 1, + 0, + 1, + 1, + 2, + 3, + 1, + 0, + 2, + 1, + 2, + 0, + 1, + 1, + 2, + 3, + 2, + 3, + 2, + 1, + 1, + 3, + 0, + 3, + 2, + 3, + 0, + 0, + 2, + 3, + 2, + 0, + 3, + 1, + 3, + 0, + 1, + 3, + 0, + 0, + 1, + 0, + 3, + 1, + 0, + 1, + 0, + 1, + 1, + 0, + 2, + 2, + 2, + 2, + 2, + 0, + 3, + 1, + 2, + 0, + 0, + 3, + 1, + 3, + 2, + 2, + 0, + 1, + 3, + 0, + 2, + 3, + 2, + 3, + 3, + 2, + 3, + 3, + 1, + 3, + 2, + 0, + 0, + 3, + 1, + 1, + 1, + 0, + 2, + 3, + 3, + 1, + 2, + 0, + 3, + 1, + 2, + 0, + 2, + ], + dtype=np.int64, + ) right = np.array([3, 1], dtype=np.int64) max_groups = 4 @@ -196,39 +296,55 @@ def test_inner_join_indexer2(): def test_merge_join_categorical_multiindex(): # From issue 16627 - a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], - ['a', 'b', 'c']), - 'Int1': [0, 1, 0, 1, 0, 0]} + a = { + "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]), + "Int1": [0, 1, 0, 1, 0, 0], + } a = DataFrame(a) - b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - ['a', 'b', 'c']), - 'Int': [0, 0, 0, 1, 1, 1], - 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} - b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - - expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - result = a.join(b, on=['Cat1', 'Int1']) - expected = expected.drop(['Cat', 'Int'], axis=1) + b = { + "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + result = a.join(b, on=["Cat1", "Int1"]) + expected = expected.drop(["Cat", "Int"], axis=1) assert_frame_equal(expected, result) # Same test, but with ordered categorical - a = {'Cat1': Categorical(['a', 'b', 'a', 'c', 'a', 'b'], - ['b', 'a', 'c'], - ordered=True), - 'Int1': [0, 1, 0, 1, 0, 0]} + a = { + "Cat1": 
Categorical( + ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True + ), + "Int1": [0, 1, 0, 1, 0, 0], + } a = DataFrame(a) - b = {'Cat': Categorical(['a', 'b', 'c', 'a', 'b', 'c'], - ['b', 'a', 'c'], - ordered=True), - 'Int': [0, 0, 0, 1, 1, 1], - 'Factor': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]} - b = DataFrame(b).set_index(['Cat', 'Int'])['Factor'] - - expected = merge(a, b.reset_index(), left_on=['Cat1', 'Int1'], - right_on=['Cat', 'Int'], how='left') - result = a.join(b, on=['Cat1', 'Int1']) - expected = expected.drop(['Cat', 'Int'], axis=1) + b = { + "Cat": Categorical( + ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True + ), + "Int": [0, 0, 0, 1, 1, 1], + "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6], + } + b = DataFrame(b).set_index(["Cat", "Int"])["Factor"] + + expected = merge( + a, + b.reset_index(), + left_on=["Cat1", "Int1"], + right_on=["Cat", "Int"], + how="left", + ) + result = a.join(b, on=["Cat1", "Int1"]) + expected = expected.drop(["Cat", "Int"], axis=1) assert_frame_equal(expected, result) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 66b0d8869940a..77841f0bb9f0d 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -8,40 +8,38 @@ class TestMisc: - def test_max_len_string_array(self): - arr = a = np.array(['foo', 'b', np.nan], dtype='object') + arr = a = np.array(["foo", "b", np.nan], dtype="object") assert libwriters.max_len_string_array(arr) == 3 # unicode - arr = a.astype('U').astype(object) + arr = a.astype("U").astype(object) assert libwriters.max_len_string_array(arr) == 3 # bytes for python3 - arr = a.astype('S').astype(object) + arr = a.astype("S").astype(object) assert libwriters.max_len_string_array(arr) == 3 # raises with pytest.raises(TypeError): - libwriters.max_len_string_array(arr.astype('U')) + libwriters.max_len_string_array(arr.astype("U")) def test_fast_unique_multiple_list_gen_sort(self): - keys = [['p', 'a'], ['n', 'd'], ['a', 's']] + keys = [["p", "a"], ["n", "d"], ["a", "s"]] gen = (key for key in keys) - expected = np.array(['a', 'd', 'n', 'p', 's']) + expected = np.array(["a", "d", "n", "p", "s"]) out = lib.fast_unique_multiple_list_gen(gen, sort=True) tm.assert_numpy_array_equal(np.array(out), expected) gen = (key for key in keys) - expected = np.array(['p', 'a', 'n', 'd', 's']) + expected = np.array(["p", "a", "n", "d", "s"]) out = lib.fast_unique_multiple_list_gen(gen, sort=False) tm.assert_numpy_array_equal(np.array(out), expected) class TestIndexing: - def test_maybe_indices_to_slice_left_edge(self): target = np.arange(100) @@ -58,20 +56,17 @@ def test_maybe_indices_to_slice_left_edge(self): maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice - for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], - [2, 0, -2]]: + for case in [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]: indices = np.array(case, dtype=np.int64) maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) @@ -89,16 +84,14 @@ def test_maybe_indices_to_slice_right_edge(self): maybe_slice = lib.maybe_indices_to_slice(indices, 
len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice indices = np.array([97, 98, 99, 100], dtype=np.int64) @@ -165,16 +158,14 @@ def test_maybe_indices_to_slice_middle(self): maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # reverse indices = indices[::-1] maybe_slice = lib.maybe_indices_to_slice(indices, len(target)) assert isinstance(maybe_slice, slice) - tm.assert_numpy_array_equal(target[indices], - target[maybe_slice]) + tm.assert_numpy_array_equal(target[indices], target[maybe_slice]) # not slice for case in [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index aa9c9bb05f877..a76f2bb04a542 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -16,28 +16,44 @@ from pandas.core.index import Index, MultiIndex import pandas.util.testing as tm -AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', - 'std', 'var', 'sem'] +AGG_FUNCTIONS = [ + "sum", + "prod", + "min", + "max", + "median", + "mean", + "skew", + "mad", + "std", + "var", + "sem", +] class Base: - def setup_method(self, method): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - - self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], - codes=[[0, 1, 2, 3]], names=['first']) + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + self.frame = DataFrame( + np.random.randn(10, 3), + index=index, + columns=Index(["A", "B", "C"], name="exp"), + ) + + self.single_level = MultiIndex( + levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] + ) # create test series object - arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) @@ -45,32 +61,31 @@ def setup_method(self, method): self.series = s self.tdf = tm.makeTimeDataFrame(100) - self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]).sum() + self.ymd = self.tdf.groupby( + [lambda x: x.year, lambda x: x.month, lambda x: x.day] + ).sum() # use Int64Index, to make sure things work - self.ymd.index.set_levels([lev.astype('i8') - for lev in self.ymd.index.levels], - inplace=True) - self.ymd.index.set_names(['year', 'month', 'day'], inplace=True) + self.ymd.index.set_levels( + [lev.astype("i8") 
for lev in self.ymd.index.levels], inplace=True + ) + self.ymd.index.set_names(["year", "month", "day"], inplace=True) class TestMultiLevel(Base): - def test_append(self): a, b = self.frame[:5], self.frame[5:] result = a.append(b) tm.assert_frame_equal(result, self.frame) - result = a['A'].append(b['A']) - tm.assert_series_equal(result, self.frame['A']) + result = a["A"].append(b["A"]) + tm.assert_series_equal(result, self.frame["A"]) def test_append_index(self): idx1 = Index([1.1, 1.2, 1.3]) - idx2 = pd.date_range('2011-01-01', freq='D', periods=3, - tz='Asia/Tokyo') - idx3 = Index(['A', 'B', 'C']) + idx2 = pd.date_range("2011-01-01", freq="D", periods=3, tz="Asia/Tokyo") + idx3 = Index(["A", "B", "C"]) midx_lv2 = MultiIndex.from_arrays([idx1, idx2]) midx_lv3 = MultiIndex.from_arrays([idx1, idx2, idx3]) @@ -78,10 +93,12 @@ def test_append_index(self): result = idx1.append(midx_lv2) # see gh-7112 - tz = pytz.timezone('Asia/Tokyo') - expected_tuples = [(1.1, tz.localize(datetime.datetime(2011, 1, 1))), - (1.2, tz.localize(datetime.datetime(2011, 1, 2))), - (1.3, tz.localize(datetime.datetime(2011, 1, 3)))] + tz = pytz.timezone("Asia/Tokyo") + expected_tuples = [ + (1.1, tz.localize(datetime.datetime(2011, 1, 1))), + (1.2, tz.localize(datetime.datetime(2011, 1, 2))), + (1.3, tz.localize(datetime.datetime(2011, 1, 3))), + ] expected = Index([1.1, 1.2, 1.3] + expected_tuples) tm.assert_index_equal(result, expected) @@ -90,8 +107,7 @@ def test_append_index(self): tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv2) - expected = MultiIndex.from_arrays([idx1.append(idx1), - idx2.append(idx2)]) + expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2)]) tm.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv3) @@ -99,78 +115,84 @@ def test_append_index(self): result = midx_lv3.append(midx_lv2) expected = Index._simple_new( - np.array([(1.1, tz.localize(datetime.datetime(2011, 1, 1)), 'A'), - (1.2, tz.localize(datetime.datetime(2011, 1, 2)), 'B'), - (1.3, tz.localize(datetime.datetime(2011, 1, 3)), 'C')] + - expected_tuples), None) + np.array( + [ + (1.1, tz.localize(datetime.datetime(2011, 1, 1)), "A"), + (1.2, tz.localize(datetime.datetime(2011, 1, 2)), "B"), + (1.3, tz.localize(datetime.datetime(2011, 1, 3)), "C"), + ] + + expected_tuples + ), + None, + ) tm.assert_index_equal(result, expected) def test_dataframe_constructor(self): - multi = DataFrame(np.random.randn(4, 4), - index=[np.array(['a', 'a', 'b', 'b']), - np.array(['x', 'y', 'x', 'y'])]) + multi = DataFrame( + np.random.randn(4, 4), + index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])], + ) assert isinstance(multi.index, MultiIndex) assert not isinstance(multi.columns, MultiIndex) - multi = DataFrame(np.random.randn(4, 4), - columns=[['a', 'a', 'b', 'b'], - ['x', 'y', 'x', 'y']]) + multi = DataFrame( + np.random.randn(4, 4), columns=[["a", "a", "b", "b"], ["x", "y", "x", "y"]] + ) assert isinstance(multi.columns, MultiIndex) def test_series_constructor(self): - multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), np.array( - ['x', 'y', 'x', 'y'])]) + multi = Series( + 1.0, index=[np.array(["a", "a", "b", "b"]), np.array(["x", "y", "x", "y"])] + ) assert isinstance(multi.index, MultiIndex) - multi = Series(1., index=[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) + multi = Series(1.0, index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) assert isinstance(multi.index, MultiIndex) - multi = Series(range(4), index=[['a', 'a', 'b', 'b'], - ['x', 'y', 'x', 'y']]) + 
multi = Series(range(4), index=[["a", "a", "b", "b"], ["x", "y", "x", "y"]]) assert isinstance(multi.index, MultiIndex) def test_reindex_level(self): # axis=0 - month_sums = self.ymd.sum(level='month') + month_sums = self.ymd.sum(level="month") result = month_sums.reindex(self.ymd.index, level=1) - expected = self.ymd.groupby(level='month').transform(np.sum) + expected = self.ymd.groupby(level="month").transform(np.sum) tm.assert_frame_equal(result, expected) # Series - result = month_sums['A'].reindex(self.ymd.index, level=1) - expected = self.ymd['A'].groupby(level='month').transform(np.sum) + result = month_sums["A"].reindex(self.ymd.index, level=1) + expected = self.ymd["A"].groupby(level="month").transform(np.sum) tm.assert_series_equal(result, expected, check_names=False) # axis=1 - month_sums = self.ymd.T.sum(axis=1, level='month') + month_sums = self.ymd.T.sum(axis=1, level="month") result = month_sums.reindex(columns=self.ymd.index, level=1) - expected = self.ymd.groupby(level='month').transform(np.sum).T + expected = self.ymd.groupby(level="month").transform(np.sum).T tm.assert_frame_equal(result, expected) def test_binops_level(self): def _check_op(opname): op = getattr(DataFrame, opname) - month_sums = self.ymd.sum(level='month') - result = op(self.ymd, month_sums, level='month') + month_sums = self.ymd.sum(level="month") + result = op(self.ymd, month_sums, level="month") - broadcasted = self.ymd.groupby(level='month').transform(np.sum) + broadcasted = self.ymd.groupby(level="month").transform(np.sum) expected = op(self.ymd, broadcasted) tm.assert_frame_equal(result, expected) # Series op = getattr(Series, opname) - result = op(self.ymd['A'], month_sums['A'], level='month') - broadcasted = self.ymd['A'].groupby(level='month').transform( - np.sum) - expected = op(self.ymd['A'], broadcasted) - expected.name = 'A' + result = op(self.ymd["A"], month_sums["A"], level="month") + broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum) + expected = op(self.ymd["A"], broadcasted) + expected.name = "A" tm.assert_series_equal(result, expected) - _check_op('sub') - _check_op('add') - _check_op('mul') - _check_op('div') + _check_op("sub") + _check_op("add") + _check_op("mul") + _check_op("div") def test_pickle(self): def _test_roundtrip(frame): @@ -184,12 +206,12 @@ def _test_roundtrip(frame): def test_reindex(self): expected = self.frame.iloc[[0, 3]] - reindexed = self.frame.loc[[('foo', 'one'), ('bar', 'one')]] + reindexed = self.frame.loc[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) with catch_warnings(record=True): simplefilter("ignore", FutureWarning) - reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] + reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]] tm.assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): @@ -225,24 +247,22 @@ def test_repr_to_string(self): self.ymd.T.to_string(buf=buf) def test_repr_name_coincide(self): - index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')], - names=['a', 'b', 'c']) + index = MultiIndex.from_tuples( + [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"] + ) - df = DataFrame({'value': [0, 1]}, index=index) + df = DataFrame({"value": [0, 1]}, index=index) - lines = repr(df).split('\n') - assert lines[2].startswith('a 0 foo') + lines = repr(df).split("\n") + assert lines[2].startswith("a 0 foo") def test_delevel_infer_dtype(self): - tuples = [tuple - for tuple in product( - ['foo', 'bar'], [10, 20], [1.0, 1.1])] - index = 
MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2']) - df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], - index=index) + tuples = [tuple for tuple in product(["foo", "bar"], [10, 20], [1.0, 1.1])] + index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"]) + df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index) deleveled = df.reset_index() - assert is_integer_dtype(deleveled['prm1']) - assert is_float_dtype(deleveled['prm2']) + assert is_integer_dtype(deleveled["prm1"]) + assert is_float_dtype(deleveled["prm2"]) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) @@ -264,7 +284,7 @@ def _check_counts(frame, axis=0): for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count() - expected = expected.reindex_like(result).astype('i8') + expected = expected.reindex_like(result).astype("i8") tm.assert_frame_equal(result, expected) self.frame.iloc[1, [1, 2]] = np.nan @@ -279,40 +299,46 @@ def _check_counts(frame, axis=0): # can't call with level on regular DataFrame df = tm.makeTimeDataFrame() - with pytest.raises(TypeError, match='hierarchical'): + with pytest.raises(TypeError, match="hierarchical"): df.count(level=0) - self.frame['D'] = 'foo' + self.frame["D"] = "foo" result = self.frame.count(level=0, numeric_only=True) - tm.assert_index_equal(result.columns, Index(list('ABC'), name='exp')) + tm.assert_index_equal(result.columns, Index(list("ABC"), name="exp")) def test_count_level_series(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz'], ['one', 'two', - 'three', 'four']], - codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]) + index = MultiIndex( + levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]], + codes=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]], + ) s = Series(np.random.randn(len(index)), index=index) result = s.count(level=0) expected = s.groupby(level=0).count() tm.assert_series_equal( - result.astype('f8'), expected.reindex(result.index).fillna(0)) + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) result = s.count(level=1) expected = s.groupby(level=1).count() tm.assert_series_equal( - result.astype('f8'), expected.reindex(result.index).fillna(0)) + result.astype("f8"), expected.reindex(result.index).fillna(0) + ) def test_count_level_corner(self): - s = self.frame['A'][:0] + s = self.frame["A"][:0] result = s.count(level=0) - expected = Series(0, index=s.index.levels[0], name='A') + expected = Series(0, index=s.index.levels[0], name="A") tm.assert_series_equal(result, expected) df = self.frame[:0] result = df.count(level=0) - expected = DataFrame(index=s.index.levels[0], - columns=df.columns).fillna(0).astype(np.int64) + expected = ( + DataFrame(index=s.index.levels[0], columns=df.columns) + .fillna(0) + .astype(np.int64) + ) tm.assert_frame_equal(result, expected) def test_get_level_number_out_of_bounds(self): @@ -333,13 +359,14 @@ def test_unstack(self): self.ymd.astype(np.int32).unstack() def test_unstack_multiple_no_empty_columns(self): - index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), ( - 1, 'baz', 1), (1, 'qux', 1)]) + index = MultiIndex.from_tuples( + [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)] + ) s = Series(np.random.randn(4), index=index) unstacked = s.unstack([1, 2]) - expected = unstacked.dropna(axis=1, how='all') + expected = unstacked.dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) def test_stack(self): @@ -402,33 +429,39 @@ def check(left, 
right): li, ri = left.index, right.index tm.assert_index_equal(li, ri) - df = DataFrame(np.arange(12).reshape(4, 3), - index=list('abab'), - columns=['1st', '2nd', '3rd']) + df = DataFrame( + np.arange(12).reshape(4, 3), + index=list("abab"), + columns=["1st", "2nd", "3rd"], + ) - mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd', '3rd']], - codes=[np.tile( - np.arange(2).repeat(3), 2), np.tile( - np.arange(3), 4)]) + mi = MultiIndex( + levels=[["a", "b"], ["1st", "2nd", "3rd"]], + codes=[np.tile(np.arange(2).repeat(3), 2), np.tile(np.arange(3), 4)], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) - df.columns = ['1st', '2nd', '1st'] - mi = MultiIndex(levels=[['a', 'b'], ['1st', '2nd']], codes=[np.tile( - np.arange(2).repeat(3), 2), np.tile( - [0, 1, 0], 4)]) + df.columns = ["1st", "2nd", "1st"] + mi = MultiIndex( + levels=[["a", "b"], ["1st", "2nd"]], + codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) - tpls = ('a', 2), ('b', 1), ('a', 1), ('b', 2) + tpls = ("a", 2), ("b", 1), ("a", 1), ("b", 2) df.index = MultiIndex.from_tuples(tpls) - mi = MultiIndex(levels=[['a', 'b'], [1, 2], ['1st', '2nd']], - codes=[np.tile( - np.arange(2).repeat(3), 2), np.repeat( - [1, 0, 1], [3, 6, 3]), np.tile( - [0, 1, 0], 4)]) + mi = MultiIndex( + levels=[["a", "b"], [1, 2], ["1st", "2nd"]], + codes=[ + np.tile(np.arange(2).repeat(3), 2), + np.repeat([1, 0, 1], [3, 6, 3]), + np.tile([0, 1, 0], 4), + ], + ) left, right = df.stack(), Series(np.arange(12), index=mi) check(left, right) @@ -447,7 +480,7 @@ def test_unstack_odd_failure(self): Thur,Lunch,No,117.32,44 Thur,Lunch,Yes,51.51,17""" - df = pd.read_csv(StringIO(data)).set_index(['day', 'time', 'smoker']) + df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"]) # it works, #2100 result = df.unstack(2) @@ -457,65 +490,67 @@ def test_unstack_odd_failure(self): def test_stack_mixed_dtype(self): df = self.frame.T - df['foo', 'four'] = 'foo' + df["foo", "four"] = "foo" df = df.sort_index(level=1, axis=1) stacked = df.stack() - result = df['foo'].stack().sort_index() - tm.assert_series_equal(stacked['foo'], result, check_names=False) + result = df["foo"].stack().sort_index() + tm.assert_series_equal(stacked["foo"], result, check_names=False) assert result.name is None - assert stacked['bar'].dtype == np.float_ + assert stacked["bar"].dtype == np.float_ def test_unstack_bug(self): - df = DataFrame({'state': ['naive', 'naive', 'naive', 'activ', 'activ', - 'activ'], - 'exp': ['a', 'b', 'b', 'b', 'a', 'a'], - 'barcode': [1, 2, 3, 4, 1, 3], - 'v': ['hi', 'hi', 'bye', 'bye', 'bye', 'peace'], - 'extra': np.arange(6.)}) + df = DataFrame( + { + "state": ["naive", "naive", "naive", "activ", "activ", "activ"], + "exp": ["a", "b", "b", "b", "a", "a"], + "barcode": [1, 2, 3, 4, 1, 3], + "v": ["hi", "hi", "bye", "bye", "bye", "peace"], + "extra": np.arange(6.0), + } + ) - result = df.groupby(['state', 'exp', 'barcode', 'v']).apply(len) + result = df.groupby(["state", "exp", "barcode", "v"]).apply(len) unstacked = result.unstack() restacked = unstacked.stack() - tm.assert_series_equal( - restacked, result.reindex(restacked.index).astype(float)) + tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float)) def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() - assert unstacked.index.name == 'first' - assert unstacked.columns.names == ['exp', 'second'] + assert unstacked.index.name == 
"first" + assert unstacked.columns.names == ["exp", "second"] restacked = unstacked.stack() assert restacked.index.names == self.frame.index.names def test_unstack_level_name(self): - result = self.frame.unstack('second') + result = self.frame.unstack("second") expected = self.frame.unstack(level=1) tm.assert_frame_equal(result, expected) def test_stack_level_name(self): - unstacked = self.frame.unstack('second') - result = unstacked.stack('exp') + unstacked = self.frame.unstack("second") + result = unstacked.stack("exp") expected = self.frame.unstack().stack(0) tm.assert_frame_equal(result, expected) - result = self.frame.stack('exp') + result = self.frame.stack("exp") expected = self.frame.stack() tm.assert_series_equal(result, expected) def test_stack_unstack_multiple(self): - unstacked = self.ymd.unstack(['year', 'month']) - expected = self.ymd.unstack('year').unstack('month') + unstacked = self.ymd.unstack(["year", "month"]) + expected = self.ymd.unstack("year").unstack("month") tm.assert_frame_equal(unstacked, expected) assert unstacked.columns.names == expected.columns.names # series - s = self.ymd['A'] - s_unstacked = s.unstack(['year', 'month']) - tm.assert_frame_equal(s_unstacked, expected['A']) + s = self.ymd["A"] + s_unstacked = s.unstack(["year", "month"]) + tm.assert_frame_equal(s_unstacked, expected["A"]) - restacked = unstacked.stack(['year', 'month']) + restacked = unstacked.stack(["year", "month"]) restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) restacked = restacked.sort_index(level=0) @@ -524,23 +559,23 @@ def test_stack_unstack_multiple(self): # GH #451 unstacked = self.ymd.unstack([1, 2]) - expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all') + expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected) unstacked = self.ymd.unstack([2, 1]) - expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') + expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how="all") tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns]) def test_stack_names_and_numbers(self): - unstacked = self.ymd.unstack(['year', 'month']) + unstacked = self.ymd.unstack(["year", "month"]) # Can't use mixture of names and numbers to stack with pytest.raises(ValueError, match="level should contain"): - unstacked.stack([0, 'month']) + unstacked.stack([0, "month"]) def test_stack_multiple_out_of_bounds(self): # nlevels == 3 - unstacked = self.ymd.unstack(['year', 'month']) + unstacked = self.ymd.unstack(["year", "month"]) with pytest.raises(IndexError, match="Too many levels"): unstacked.stack([2, 3]) @@ -549,9 +584,12 @@ def test_stack_multiple_out_of_bounds(self): def test_unstack_period_series(self): # GH 4342 - idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', - '2013-03', '2013-03'], freq='M', name='period') - idx2 = Index(['A', 'B'] * 3, name='str') + idx1 = pd.PeriodIndex( + ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], + freq="M", + name="period", + ) + idx2 = Index(["A", "B"] * 3, name="str") value = [1, 2, 3, 4, 5, 6] idx = MultiIndex.from_arrays([idx1, idx2]) @@ -562,20 +600,28 @@ def test_unstack_period_series(self): result3 = s.unstack(level=0) e_idx = pd.PeriodIndex( - ['2013-01', '2013-02', '2013-03'], freq='M', name='period') - expected = DataFrame({'A': [1, 3, 5], 'B': [2, 4, 6]}, index=e_idx, - columns=['A', 'B']) - expected.columns.name = 'str' + ["2013-01", "2013-02", "2013-03"], freq="M", name="period" + ) + expected = DataFrame( + {"A": [1, 3, 5], 
"B": [2, 4, 6]}, index=e_idx, columns=["A", "B"] + ) + expected.columns.name = "str" tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected.T) - idx1 = pd.PeriodIndex(['2013-01', '2013-01', '2013-02', '2013-02', - '2013-03', '2013-03'], freq='M', name='period1') - - idx2 = pd.PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', - '2013-08', '2013-07'], freq='M', name='period2') + idx1 = pd.PeriodIndex( + ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"], + freq="M", + name="period1", + ) + + idx2 = pd.PeriodIndex( + ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"], + freq="M", + name="period2", + ) idx = MultiIndex.from_arrays([idx1, idx2]) s = Series(value, index=idx) @@ -584,14 +630,22 @@ def test_unstack_period_series(self): result3 = s.unstack(level=0) e_idx = pd.PeriodIndex( - ['2013-01', '2013-02', '2013-03'], freq='M', name='period1') - e_cols = pd.PeriodIndex(['2013-07', '2013-08', '2013-09', '2013-10', - '2013-11', '2013-12'], - freq='M', name='period2') - expected = DataFrame([[np.nan, np.nan, np.nan, np.nan, 2, 1], - [np.nan, np.nan, 4, 3, np.nan, np.nan], - [6, 5, np.nan, np.nan, np.nan, np.nan]], - index=e_idx, columns=e_cols) + ["2013-01", "2013-02", "2013-03"], freq="M", name="period1" + ) + e_cols = pd.PeriodIndex( + ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"], + freq="M", + name="period2", + ) + expected = DataFrame( + [ + [np.nan, np.nan, np.nan, np.nan, 2, 1], + [np.nan, np.nan, 4, 3, np.nan, np.nan], + [6, 5, np.nan, np.nan, np.nan, np.nan], + ], + index=e_idx, + columns=e_cols, + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) @@ -599,13 +653,17 @@ def test_unstack_period_series(self): def test_unstack_period_frame(self): # GH 4342 - idx1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-02', '2014-02', - '2014-01', '2014-01'], - freq='M', name='period1') - idx2 = pd.PeriodIndex(['2013-12', '2013-12', '2014-02', '2013-10', - '2013-10', '2014-02'], - freq='M', name='period2') - value = {'A': [1, 2, 3, 4, 5, 6], 'B': [6, 5, 4, 3, 2, 1]} + idx1 = pd.PeriodIndex( + ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"], + freq="M", + name="period1", + ) + idx2 = pd.PeriodIndex( + ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"], + freq="M", + name="period2", + ) + value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]} idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame(value, index=idx) @@ -613,48 +671,55 @@ def test_unstack_period_frame(self): result2 = df.unstack(level=1) result3 = df.unstack(level=0) - e_1 = pd.PeriodIndex(['2014-01', '2014-02'], freq='M', name='period1') - e_2 = pd.PeriodIndex(['2013-10', '2013-12', '2014-02', '2013-10', - '2013-12', '2014-02'], freq='M', name='period2') - e_cols = MultiIndex.from_arrays(['A A A B B B'.split(), e_2]) - expected = DataFrame([[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], - index=e_1, columns=e_cols) + e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1") + e_2 = pd.PeriodIndex( + ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"], + freq="M", + name="period2", + ) + e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2]) + expected = DataFrame( + [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) - e_1 = pd.PeriodIndex(['2014-01', '2014-02', '2014-01', - '2014-02'], freq='M', 
name='period1') + e_1 = pd.PeriodIndex( + ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1" + ) e_2 = pd.PeriodIndex( - ['2013-10', '2013-12', '2014-02'], freq='M', name='period2') - e_cols = MultiIndex.from_arrays(['A A B B'.split(), e_1]) - expected = DataFrame([[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], - index=e_2, columns=e_cols) + ["2013-10", "2013-12", "2014-02"], freq="M", name="period2" + ) + e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1]) + expected = DataFrame( + [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols + ) tm.assert_frame_equal(result3, expected) def test_stack_multiple_bug(self): """ bug when some uniques are not present in the data #3170""" id_col = ([1] * 3) + ([2] * 3) - name = (['a'] * 3) + (['b'] * 3) - date = pd.to_datetime(['2013-01-03', '2013-01-04', '2013-01-05'] * 2) + name = (["a"] * 3) + (["b"] * 3) + date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2) var1 = np.random.randint(0, 100, 6) df = DataFrame(dict(ID=id_col, NAME=name, DATE=date, VAR1=var1)) - multi = df.set_index(['DATE', 'ID']) - multi.columns.name = 'Params' - unst = multi.unstack('ID') - down = unst.resample('W-THU').mean() + multi = df.set_index(["DATE", "ID"]) + multi.columns.name = "Params" + unst = multi.unstack("ID") + down = unst.resample("W-THU").mean() - rs = down.stack('ID') - xp = unst.loc[:, ['VAR1']].resample('W-THU').mean().stack('ID') - xp.columns.name = 'Params' + rs = down.stack("ID") + xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") + xp.columns.name = "Params" tm.assert_frame_equal(rs, xp) def test_stack_dropna(self): # GH #3997 - df = DataFrame({'A': ['a1', 'a2'], 'B': ['b1', 'b2'], 'C': [1, 1]}) - df = df.set_index(['A', 'B']) + df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]}) + df = df.set_index(["A", "B"]) stacked = df.unstack().stack(dropna=False) assert len(stacked) > len(stacked.dropna()) @@ -663,19 +728,23 @@ def test_stack_dropna(self): tm.assert_frame_equal(stacked, stacked.dropna()) def test_unstack_multiple_hierarchical(self): - df = DataFrame(index=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1 - ]], - columns=[[0, 0, 1, 1], [0, 1, 0, 1]]) + df = DataFrame( + index=[ + [0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], + [0, 1, 0, 1, 0, 1, 0, 1], + ], + columns=[[0, 0, 1, 1], [0, 1, 0, 1]], + ) - df.index.names = ['a', 'b', 'c'] - df.columns.names = ['d', 'e'] + df.index.names = ["a", "b", "c"] + df.columns.names = ["d", "e"] # it works! - df.unstack(['b', 'c']) + df.unstack(["b", "c"]) def test_groupby_transform(self): - s = self.frame['A'] + s = self.frame["A"] grouper = s.index.get_level_values(0) grouped = s.groupby(grouper) @@ -690,17 +759,21 @@ def test_unstack_sparse_keyspace(self): # Generate Long File & Test Pivot NUM_ROWS = 1000 - df = DataFrame({'A': np.random.randint(100, size=NUM_ROWS), - 'B': np.random.randint(300, size=NUM_ROWS), - 'C': np.random.randint(-7, 7, size=NUM_ROWS), - 'D': np.random.randint(-19, 19, size=NUM_ROWS), - 'E': np.random.randint(3000, size=NUM_ROWS), - 'F': np.random.randn(NUM_ROWS)}) - - idf = df.set_index(['A', 'B', 'C', 'D', 'E']) + df = DataFrame( + { + "A": np.random.randint(100, size=NUM_ROWS), + "B": np.random.randint(300, size=NUM_ROWS), + "C": np.random.randint(-7, 7, size=NUM_ROWS), + "D": np.random.randint(-19, 19, size=NUM_ROWS), + "E": np.random.randint(3000, size=NUM_ROWS), + "F": np.random.randn(NUM_ROWS), + } + ) + + idf = df.set_index(["A", "B", "C", "D", "E"]) # it works! 
is sufficient - idf.unstack('E') + idf.unstack("E") def test_unstack_unobserved_keys(self): # related to #2278 refactoring @@ -720,69 +793,79 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame(np.random.randn(2 ** 16, 2), - index=[np.arange(2 ** 16), np.arange(2 ** 16)]) - with pytest.raises(ValueError, match='int32 overflow'): + df = DataFrame( + np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)] + ) + with pytest.raises(ValueError, match="int32 overflow"): df.unstack() def test_stack_order_with_unsorted_levels(self): # GH 16323 def manual_compare_stacked(df, df_stacked, lev0, lev1): - assert all(df.loc[row, col] == - df_stacked.loc[(row, col[lev0]), col[lev1]] - for row in df.index for col in df.columns) + assert all( + df.loc[row, col] == df_stacked.loc[(row, col[lev0]), col[lev1]] + for row in df.index + for col in df.columns + ) # deep check for 1-row case for width in [2, 3]: levels_poss = itertools.product( - itertools.permutations([0, 1, 2], width), - repeat=2) + itertools.permutations([0, 1, 2], width), repeat=2 + ) for levels in levels_poss: - columns = MultiIndex(levels=levels, - codes=[[0, 0, 1, 1], - [0, 1, 0, 1]]) + columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) df = DataFrame(columns=columns, data=[range(4)]) for stack_lev in range(2): df_stacked = df.stack(stack_lev) - manual_compare_stacked(df, df_stacked, - stack_lev, 1 - stack_lev) + manual_compare_stacked(df, df_stacked, stack_lev, 1 - stack_lev) # check multi-row case - mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]], - codes=[np.repeat(range(3), 3), np.tile(range(3), 3)]) - df = DataFrame(columns=mi, index=range(5), - data=np.arange(5 * len(mi)).reshape(5, -1)) + mi = MultiIndex( + levels=[["A", "C", "B"], ["B", "A", "C"]], + codes=[np.repeat(range(3), 3), np.tile(range(3), 3)], + ) + df = DataFrame( + columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1) + ) manual_compare_stacked(df, df.stack(0), 0, 1) def test_groupby_corner(self): - midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], - codes=[[0], [0], [0]], - names=['one', 'two', 'three']) - df = DataFrame([np.random.rand(4)], columns=['a', 'b', 'c', 'd'], - index=midx) + midx = MultiIndex( + levels=[["foo"], ["bar"], ["baz"]], + codes=[[0], [0], [0]], + names=["one", "two", "three"], + ) + df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx) # should work - df.groupby(level='three') + df.groupby(level="three") def test_groupby_level_no_obs(self): # #1697 - midx = MultiIndex.from_tuples([('f1', 's1'), ('f1', 's2'), ( - 'f2', 's1'), ('f2', 's2'), ('f3', 's1'), ('f3', 's2')]) - df = DataFrame( - [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) - df1 = df.loc(axis=1)[df.columns.map( - lambda u: u[0] in ['f2', 'f3'])] + midx = MultiIndex.from_tuples( + [ + ("f1", "s1"), + ("f1", "s2"), + ("f2", "s1"), + ("f2", "s2"), + ("f3", "s1"), + ("f3", "s2"), + ] + ) + df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], columns=midx) + df1 = df.loc(axis=1)[df.columns.map(lambda u: u[0] in ["f2", "f3"])] grouped = df1.groupby(axis=1, level=0) result = grouped.sum() - assert (result.columns == ['f2', 'f3']).all() + assert (result.columns == ["f2", "f3"]).all() def test_join(self): - a = self.frame.loc[self.frame.index[:5], ['A']] - b = self.frame.loc[self.frame.index[2:], ['B', 'C']] + a = self.frame.loc[self.frame.index[:5], ["A"]] + b = 
self.frame.loc[self.frame.index[2:], ["B", "C"]] - joined = a.join(b, how='outer').reindex(self.frame.index) + joined = a.join(b, how="outer").reindex(self.frame.index) expected = self.frame.copy() expected.values[np.isnan(joined.values)] = np.nan @@ -792,10 +875,10 @@ def test_join(self): tm.assert_frame_equal(joined, expected, check_names=False) def test_swaplevel(self): - swapped = self.frame['A'].swaplevel() - swapped2 = self.frame['A'].swaplevel(0) - swapped3 = self.frame['A'].swaplevel(0, 1) - swapped4 = self.frame['A'].swaplevel('first', 'second') + swapped = self.frame["A"].swaplevel() + swapped2 = self.frame["A"].swaplevel(0) + swapped3 = self.frame["A"].swaplevel(0, 1) + swapped4 = self.frame["A"].swaplevel("first", "second") assert not swapped.index.equals(self.frame.index) tm.assert_series_equal(swapped, swapped2) tm.assert_series_equal(swapped, swapped3) @@ -804,34 +887,34 @@ def test_swaplevel(self): back = swapped.swaplevel() back2 = swapped.swaplevel(0) back3 = swapped.swaplevel(0, 1) - back4 = swapped.swaplevel('second', 'first') + back4 = swapped.swaplevel("second", "first") assert back.index.equals(self.frame.index) tm.assert_series_equal(back, back2) tm.assert_series_equal(back, back3) tm.assert_series_equal(back, back4) ft = self.frame.T - swapped = ft.swaplevel('first', 'second', axis=1) - exp = self.frame.swaplevel('first', 'second').T + swapped = ft.swaplevel("first", "second", axis=1) + exp = self.frame.swaplevel("first", "second").T tm.assert_frame_equal(swapped, exp) def test_reorder_levels(self): - result = self.ymd.reorder_levels(['month', 'day', 'year']) + result = self.ymd.reorder_levels(["month", "day", "year"]) expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) tm.assert_frame_equal(result, expected) - result = self.ymd['A'].reorder_levels(['month', 'day', 'year']) - expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2) + result = self.ymd["A"].reorder_levels(["month", "day", "year"]) + expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2) tm.assert_series_equal(result, expected) - result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1) + result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1) expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) tm.assert_frame_equal(result, expected) - with pytest.raises(TypeError, match='hierarchical axis'): + with pytest.raises(TypeError, match="hierarchical axis"): self.ymd.reorder_levels([1, 2], axis=1) - with pytest.raises(IndexError, match='Too many levels'): + with pytest.raises(IndexError, match="Too many levels"): self.ymd.index.reorder_levels([1, 2, 3]) def test_insert_index(self): @@ -841,11 +924,13 @@ def test_insert_index(self): assert (df[2000, 1, 10] == df[2000, 1, 7]).all() def test_alignment(self): - x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ( - "A", 2), ("B", 3)])) + x = Series( + data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]) + ) - y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ( - "Z", 2), ("B", 3)])) + y = Series( + data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]) + ) res = x - y exp_index = x.index.union(y.index) @@ -860,39 +945,39 @@ def test_alignment(self): def test_count(self): frame = self.frame.copy() - frame.index.names = ['a', 'b'] + frame.index.names = ["a", "b"] - result = frame.count(level='b') + result = frame.count(level="b") expect = self.frame.count(level=1) tm.assert_frame_equal(result, expect, check_names=False) - result = 
frame.count(level='a') + result = frame.count(level="a") expect = self.frame.count(level=0) tm.assert_frame_equal(result, expect, check_names=False) series = self.series.copy() - series.index.names = ['a', 'b'] + series.index.names = ["a", "b"] - result = series.count(level='b') + result = series.count(level="b") expect = self.series.count(level=1) tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == 'b' + assert result.index.name == "b" - result = series.count(level='a') + result = series.count(level="a") expect = self.series.count(level=0) tm.assert_series_equal(result, expect, check_names=False) - assert result.index.name == 'a' + assert result.index.name == "a" msg = "Level x not found" with pytest.raises(KeyError, match=msg): - series.count('x') + series.count("x") with pytest.raises(KeyError, match=msg): - frame.count(level='x') + frame.count(level="x") - @pytest.mark.parametrize('op', AGG_FUNCTIONS) - @pytest.mark.parametrize('level', [0, 1]) - @pytest.mark.parametrize('skipna', [True, False]) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("op", AGG_FUNCTIONS) + @pytest.mark.parametrize("level", [0, 1]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_series_group_min_max(self, op, level, skipna, sort): # GH 17537 grouped = self.series.groupby(level=level, sort=sort) @@ -903,11 +988,11 @@ def test_series_group_min_max(self, op, level, skipna, sort): rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) - @pytest.mark.parametrize('op', AGG_FUNCTIONS) - @pytest.mark.parametrize('level', [0, 1]) - @pytest.mark.parametrize('axis', [0, 1]) - @pytest.mark.parametrize('skipna', [True, False]) - @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize("op", AGG_FUNCTIONS) + @pytest.mark.parametrize("level", [0, 1]) + @pytest.mark.parametrize("axis", [0, 1]) + @pytest.mark.parametrize("skipna", [True, False]) + @pytest.mark.parametrize("sort", [True, False]) def test_frame_group_ops(self, op, level, axis, skipna, sort): # GH 17537 self.frame.iloc[1, [1, 2]] = np.nan @@ -927,8 +1012,7 @@ def aggf(x): return getattr(x, op)(skipna=skipna, axis=axis) leftside = grouped.agg(aggf) - rightside = getattr(frame, op)(level=level, axis=axis, - skipna=skipna) + rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) if sort: rightside = rightside.sort_index(level=level, axis=axis) frame = frame.sort_index(level=level, axis=axis) @@ -950,25 +1034,28 @@ def test_stat_op_corner(self): def test_frame_any_all_group(self): df = DataFrame( - {'data': [False, False, True, False, True, False, True]}, + {"data": [False, False, True, False, True, False, True]}, index=[ - ['one', 'one', 'two', 'one', 'two', 'two', 'two'], - [0, 1, 0, 2, 1, 2, 3]]) + ["one", "one", "two", "one", "two", "two", "two"], + [0, 1, 0, 2, 1, 2, 3], + ], + ) result = df.any(level=0) - ex = DataFrame({'data': [False, True]}, index=['one', 'two']) + ex = DataFrame({"data": [False, True]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) result = df.all(level=0) - ex = DataFrame({'data': [False, False]}, index=['one', 'two']) + ex = DataFrame({"data": [False, False]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) def test_std_var_pass_ddof(self): - index = MultiIndex.from_arrays([np.arange(5).repeat(10), np.tile( - np.arange(10), 5)]) + index = MultiIndex.from_arrays( + [np.arange(5).repeat(10), np.tile(np.arange(10), 5)] + ) df = 
DataFrame(np.random.randn(len(index), 5), index=index) - for meth in ['var', 'std']: + for meth in ["var", "std"]: ddof = 4 alt = lambda x: getattr(x, meth)(ddof=ddof) @@ -981,12 +1068,12 @@ def test_std_var_pass_ddof(self): tm.assert_frame_equal(result, expected) def test_frame_series_agg_multiple_levels(self): - result = self.ymd.sum(level=['year', 'month']) - expected = self.ymd.groupby(level=['year', 'month']).sum() + result = self.ymd.sum(level=["year", "month"]) + expected = self.ymd.groupby(level=["year", "month"]).sum() tm.assert_frame_equal(result, expected) - result = self.ymd['A'].sum(level=['year', 'month']) - expected = self.ymd['A'].groupby(level=['year', 'month']).sum() + result = self.ymd["A"].sum(level=["year", "month"]) + expected = self.ymd["A"].groupby(level=["year", "month"]).sum() tm.assert_series_equal(result, expected) def test_groupby_multilevel(self): @@ -1008,39 +1095,42 @@ def test_groupby_multilevel_with_transform(self): pass def test_multilevel_consolidate(self): - index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), ( - 'bar', 'one'), ('bar', 'two')]) + index = MultiIndex.from_tuples( + [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")] + ) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) - df['Totals', ''] = df.sum(1) + df["Totals", ""] = df.sum(1) df = df._consolidate() def test_loc_preserve_names(self): result = self.ymd.loc[2000] - result2 = self.ymd['A'].loc[2000] + result2 = self.ymd["A"].loc[2000] assert result.index.names == self.ymd.index.names[1:] assert result2.index.names == self.ymd.index.names[1:] result = self.ymd.loc[2000, 2] - result2 = self.ymd['A'].loc[2000, 2] + result2 = self.ymd["A"].loc[2000, 2] assert result.index.name == self.ymd.index.names[2] assert result2.index.name == self.ymd.index.names[2] def test_unstack_preserve_types(self): # GH #403 - self.ymd['E'] = 'foo' - self.ymd['F'] = 2 + self.ymd["E"] = "foo" + self.ymd["F"] = 2 - unstacked = self.ymd.unstack('month') - assert unstacked['A', 1].dtype == np.float64 - assert unstacked['E', 1].dtype == np.object_ - assert unstacked['F', 1].dtype == np.float64 + unstacked = self.ymd.unstack("month") + assert unstacked["A", 1].dtype == np.float64 + assert unstacked["E", 1].dtype == np.object_ + assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self): codes = np.tile(np.arange(500), 2) level = np.arange(500) - index = MultiIndex(levels=[level] * 8 + [[0, 1]], - codes=[codes] * 8 + [np.arange(2).repeat(500)]) + index = MultiIndex( + levels=[level] * 8 + [[0, 1]], + codes=[codes] * 8 + [np.arange(2).repeat(500)], + ) s = Series(np.arange(1000), index=index) result = s.unstack() @@ -1051,17 +1141,20 @@ def test_unstack_group_index_overflow(self): tm.assert_series_equal(s, stacked.reindex(s.index)) # put it at beginning - index = MultiIndex(levels=[[0, 1]] + [level] * 8, - codes=[np.arange(2).repeat(500)] + [codes] * 8) + index = MultiIndex( + levels=[[0, 1]] + [level] * 8, + codes=[np.arange(2).repeat(500)] + [codes] * 8, + ) s = Series(np.arange(1000), index=index) result = s.unstack(0) assert result.shape == (500, 2) # put it in middle - index = MultiIndex(levels=[level] * 4 + [[0, 1]] + [level] * 4, - codes=([codes] * 4 + [np.arange(2).repeat(500)] + - [codes] * 4)) + index = MultiIndex( + levels=[level] * 4 + [[0, 1]] + [level] * 4, + codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4), + ) s = Series(np.arange(1000), index=index) result = s.unstack(4) @@ -1072,12 +1165,17 @@ def 
test_pyint_engine(self): # bits, the index underlying the MultiIndex engine works with Python # integers, rather than uint64. N = 5 - keys = [tuple(l) for l in [[0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N]] + keys = [ + tuple(l) + for l in [ + [0] * 10 * N, + [1] * 10 * N, + [2] * 10 * N, + [np.nan] * N + [2] * 9 * N, + [0] * N + [2] * 9 * N, + [np.nan] * N + [2] * 8 * N + [0] * N, + ] + ] # Each level contains 4 elements (including NaN), so it is represented # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a # 64 bit engine and truncating the first levels, the fourth and fifth @@ -1100,93 +1198,106 @@ def test_pyint_engine(self): tm.assert_numpy_array_equal(result, expected) def test_to_html(self): - self.ymd.columns.name = 'foo' + self.ymd.columns.name = "foo" self.ymd.to_html() self.ymd.T.to_html() def test_level_with_tuples(self): - index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), ( - 'foo', 'qux', 0)], [0, 1]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) - result = series[('foo', 'bar', 0)] - result2 = series.loc[('foo', 'bar', 0)] + result = series[("foo", "bar", 0)] + result2 = series.loc[("foo", "bar", 0)] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"): - series[('foo', 'bar', 0), 2] + series[("foo", "bar", 0), 2] - result = frame.loc[('foo', 'bar', 0)] - result2 = frame.xs(('foo', 'bar', 0)) + result = frame.loc[("foo", "bar", 0)] + result2 = frame.xs(("foo", "bar", 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), ( - 'foo', 'qux')], [0, 1]], - codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + index = MultiIndex( + levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) - result = series[('foo', 'bar')] - result2 = series.loc[('foo', 'bar')] + result = series[("foo", "bar")] + result2 = series.loc[("foo", "bar")] expected = series[:2] expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) - result = frame.loc[('foo', 'bar')] - result2 = frame.xs(('foo', 'bar')) + result = frame.loc[("foo", "bar")] + result2 = frame.xs(("foo", "bar")) expected = frame[:2] expected.index = expected.index.droplevel(0) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) def test_mixed_depth_drop(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4, 6), 
columns=index) - result = df.drop('a', axis=1) - expected = df.drop([('a', '', '')], axis=1) + result = df.drop("a", axis=1) + expected = df.drop([("a", "", "")], axis=1) tm.assert_frame_equal(expected, result) - result = df.drop(['top'], axis=1) - expected = df.drop([('top', 'OD', 'wx')], axis=1) - expected = expected.drop([('top', 'OD', 'wy')], axis=1) + result = df.drop(["top"], axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) + expected = expected.drop([("top", "OD", "wy")], axis=1) tm.assert_frame_equal(expected, result) - result = df.drop(('top', 'OD', 'wx'), axis=1) - expected = df.drop([('top', 'OD', 'wx')], axis=1) + result = df.drop(("top", "OD", "wx"), axis=1) + expected = df.drop([("top", "OD", "wx")], axis=1) tm.assert_frame_equal(expected, result) - expected = df.drop([('top', 'OD', 'wy')], axis=1) - expected = df.drop('top', axis=1) + expected = df.drop([("top", "OD", "wy")], axis=1) + expected = df.drop("top", axis=1) - result = df.drop('result1', level=1, axis=1) - expected = df.drop([('routine1', 'result1', ''), - ('routine2', 'result1', '')], axis=1) + result = df.drop("result1", level=1, axis=1) + expected = df.drop( + [("routine1", "result1", ""), ("routine2", "result1", "")], axis=1 + ) tm.assert_frame_equal(expected, result) def test_drop_nonunique(self): - df = DataFrame([["x-a", "x", "a", 1.5], ["x-a", "x", "a", 1.2], - ["z-c", "z", "c", 3.1], ["x-a", "x", "a", 4.1], - ["x-b", "x", "b", 5.1], ["x-b", "x", "b", 4.1], - ["x-b", "x", "b", 2.2], - ["y-a", "y", "a", 1.2], ["z-b", "z", "b", 2.1]], - columns=["var1", "var2", "var3", "var4"]) + df = DataFrame( + [ + ["x-a", "x", "a", 1.5], + ["x-a", "x", "a", 1.2], + ["z-c", "z", "c", 3.1], + ["x-a", "x", "a", 4.1], + ["x-b", "x", "b", 5.1], + ["x-b", "x", "b", 4.1], + ["x-b", "x", "b", 2.2], + ["y-a", "y", "a", 1.2], + ["z-b", "z", "b", 2.1], + ], + columns=["var1", "var2", "var3", "var4"], + ) grp_size = df.groupby("var1").size() drop_idx = grp_size.loc[grp_size == 1] @@ -1202,9 +1313,11 @@ def test_drop_nonunique(self): tm.assert_frame_equal(result, expected) def test_mixed_depth_pop(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] + arrays = [ + ["a", "top", "top", "routine1", "routine1", "routine2"], + ["", "OD", "OD", "result1", "result2", "result1"], + ["", "wx", "wy", "", "", ""], + ] tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) @@ -1212,97 +1325,99 @@ def test_mixed_depth_pop(self): df1 = df.copy() df2 = df.copy() - result = df1.pop('a') - expected = df2.pop(('a', '', '')) + result = df1.pop("a") + expected = df2.pop(("a", "", "")) tm.assert_series_equal(expected, result, check_names=False) tm.assert_frame_equal(df1, df2) - assert result.name == 'a' + assert result.name == "a" - expected = df1['top'] - df1 = df1.drop(['top'], axis=1) - result = df2.pop('top') + expected = df1["top"] + df1 = df1.drop(["top"], axis=1) + result = df2.pop("top") tm.assert_frame_equal(expected, result) tm.assert_frame_equal(df1, df2) def test_reindex_level_partial_selection(self): - result = self.frame.reindex(['foo', 'qux'], level=0) + result = self.frame.reindex(["foo", "qux"], level=0) expected = self.frame.iloc[[0, 1, 2, 7, 8, 9]] tm.assert_frame_equal(result, expected) - result = self.frame.T.reindex(['foo', 'qux'], axis=1, level=0) + result = self.frame.T.reindex(["foo", "qux"], axis=1, level=0) tm.assert_frame_equal(result, expected.T) - result = self.frame.loc[['foo', 'qux']] + 
result = self.frame.loc[["foo", "qux"]] tm.assert_frame_equal(result, expected) - result = self.frame['A'].loc[['foo', 'qux']] - tm.assert_series_equal(result, expected['A']) + result = self.frame["A"].loc[["foo", "qux"]] + tm.assert_series_equal(result, expected["A"]) - result = self.frame.T.loc[:, ['foo', 'qux']] + result = self.frame.T.loc[:, ["foo", "qux"]] tm.assert_frame_equal(result, expected.T) def test_drop_level(self): - result = self.frame.drop(['bar', 'qux'], level='first') + result = self.frame.drop(["bar", "qux"], level="first") expected = self.frame.iloc[[0, 1, 2, 5, 6]] tm.assert_frame_equal(result, expected) - result = self.frame.drop(['two'], level='second') + result = self.frame.drop(["two"], level="second") expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]] tm.assert_frame_equal(result, expected) - result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') + result = self.frame.T.drop(["bar", "qux"], axis=1, level="first") expected = self.frame.iloc[[0, 1, 2, 5, 6]].T tm.assert_frame_equal(result, expected) - result = self.frame.T.drop(['two'], axis=1, level='second') + result = self.frame.T.drop(["two"], axis=1, level="second") expected = self.frame.iloc[[0, 2, 3, 6, 7, 9]].T tm.assert_frame_equal(result, expected) def test_drop_level_nonunique_datetime(self): # GH 12701 - idx = Index([2, 3, 4, 4, 5], name='id') - idxdt = pd.to_datetime(['201603231400', - '201603231500', - '201603231600', - '201603231600', - '201603231700']) - df = DataFrame(np.arange(10).reshape(5, 2), - columns=list('ab'), index=idx) - df['tstamp'] = idxdt - df = df.set_index('tstamp', append=True) - ts = Timestamp('201603231600') + idx = Index([2, 3, 4, 4, 5], name="id") + idxdt = pd.to_datetime( + [ + "201603231400", + "201603231500", + "201603231600", + "201603231600", + "201603231700", + ] + ) + df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) + df["tstamp"] = idxdt + df = df.set_index("tstamp", append=True) + ts = Timestamp("201603231600") assert df.index.is_unique is False - result = df.drop(ts, level='tstamp') + result = df.drop(ts, level="tstamp") expected = df.loc[idx != 4] tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('box', [Series, DataFrame]) + @pytest.mark.parametrize("box", [Series, DataFrame]) def test_drop_tz_aware_timestamp_across_dst(self, box): # GH 21761 - start = Timestamp('2017-10-29', tz='Europe/Berlin') - end = Timestamp('2017-10-29 04:00:00', tz='Europe/Berlin') - index = pd.date_range(start, end, freq='15min') + start = Timestamp("2017-10-29", tz="Europe/Berlin") + end = Timestamp("2017-10-29 04:00:00", tz="Europe/Berlin") + index = pd.date_range(start, end, freq="15min") data = box(data=[1] * len(index), index=index) result = data.drop(start) - expected_start = Timestamp('2017-10-29 00:15:00', tz='Europe/Berlin') - expected_idx = pd.date_range(expected_start, end, freq='15min') + expected_start = Timestamp("2017-10-29 00:15:00", tz="Europe/Berlin") + expected_idx = pd.date_range(expected_start, end, freq="15min") expected = box(data=[1] * len(expected_idx), index=expected_idx) tm.assert_equal(result, expected) def test_drop_preserve_names(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [1, 2, 3, 1, 2, 3]], - names=['one', 'two']) + index = MultiIndex.from_arrays( + [[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]], names=["one", "two"] + ) df = DataFrame(np.random.randn(6, 3), index=index) result = df.drop([(0, 2)]) - assert result.index.names == ('one', 'two') + assert result.index.names == ("one", "two") def 
test_unicode_repr_issues(self): - levels = [Index(['a/\u03c3', 'b/\u03c3', 'c/\u03c3']), - Index([0, 1])] + levels = [Index(["a/\u03c3", "b/\u03c3", "c/\u03c3"]), Index([0, 1])] codes = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] index = MultiIndex(levels=levels, codes=codes) @@ -1312,8 +1427,7 @@ def test_unicode_repr_issues(self): # repr(index.get_level_values(1)) def test_unicode_repr_level_names(self): - index = MultiIndex.from_tuples([(0, 0), (1, 1)], - names=['\u0394', 'i1']) + index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"]) s = Series(range(2), index=index) df = DataFrame(np.random.randn(2, 4), index=index) @@ -1322,48 +1436,51 @@ def test_unicode_repr_level_names(self): def test_join_segfault(self): # 1532 - df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) - df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]}) - df1 = df1.set_index(['a', 'b']) - df2 = df2.set_index(['a', 'b']) + df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]}) + df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]}) + df1 = df1.set_index(["a", "b"]) + df2 = df2.set_index(["a", "b"]) # it works! - for how in ['left', 'right', 'outer']: + for how in ["left", "right", "outer"]: df1.join(df2, how=how) def test_frame_dict_constructor_empty_series(self): - s1 = Series([ - 1, 2, 3, 4 - ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)])) - s2 = Series([ - 1, 2, 3, 4 - ], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)])) + s1 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2), (2, 4)]) + ) + s2 = Series( + [1, 2, 3, 4], index=MultiIndex.from_tuples([(1, 2), (1, 3), (3, 2), (3, 4)]) + ) s3 = Series() # it works! - DataFrame({'foo': s1, 'bar': s2, 'baz': s3}) - DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2}) + DataFrame({"foo": s1, "bar": s2, "baz": s3}) + DataFrame.from_dict({"foo": s1, "baz": s3, "bar": s2}) def test_multiindex_na_repr(self): # only an issue with long columns from numpy import nan - df3 = DataFrame({ - 'A' * 30: {('A', 'A0006000', 'nuit'): 'A0006000'}, - 'B' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'C' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'D' * 30: {('A', 'A0006000', 'nuit'): nan}, - 'E' * 30: {('A', 'A0006000', 'nuit'): 'A'}, - 'F' * 30: {('A', 'A0006000', 'nuit'): nan}, - }) - - idf = df3.set_index(['A' * 30, 'C' * 30]) + + df3 = DataFrame( + { + "A" * 30: {("A", "A0006000", "nuit"): "A0006000"}, + "B" * 30: {("A", "A0006000", "nuit"): nan}, + "C" * 30: {("A", "A0006000", "nuit"): nan}, + "D" * 30: {("A", "A0006000", "nuit"): nan}, + "E" * 30: {("A", "A0006000", "nuit"): "A"}, + "F" * 30: {("A", "A0006000", "nuit"): nan}, + } + ) + + idf = df3.set_index(["A" * 30, "C" * 30]) repr(idf) def test_assign_index_sequences(self): # #2200 - df = DataFrame({"a": [1, 2, 3], - "b": [4, 5, 6], - "c": [7, 8, 9]}).set_index(["a", "b"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index( + ["a", "b"] + ) index = list(df.index) index[0] = ("faz", "boo") df.index = index @@ -1375,19 +1492,24 @@ def test_assign_index_sequences(self): repr(df) def test_tuples_have_na(self): - index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], - codes=[[1, 1, 1, 1, -1, 0, 0, 0], - [0, 1, 2, 3, 0, 1, 2, 3]]) + index = MultiIndex( + levels=[[1, 0], [0, 1, 2, 3]], + codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]], + ) assert isna(index[4][0]) assert isna(index.values[4][0]) def test_duplicate_groupby_issues(self): - idx_tp = [('600809', '20061231'), ('600809', '20070331'), - ('600809', 
'20070630'), ('600809', '20070331')] - dt = ['demo', 'demo', 'demo', 'demo'] - - idx = MultiIndex.from_tuples(idx_tp, names=['STK_ID', 'RPT_Date']) + idx_tp = [ + ("600809", "20061231"), + ("600809", "20070331"), + ("600809", "20070630"), + ("600809", "20070331"), + ] + dt = ["demo", "demo", "demo", "demo"] + + idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"]) s = Series(dt, index=idx) result = s.groupby(s.index).first() @@ -1395,25 +1517,31 @@ def test_duplicate_groupby_issues(self): def test_duplicate_mi(self): # GH 4516 - df = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2], - ['bah', 'bam', 3.0, 3], - ['bah', 'bam', 4.0, 4], ['foo', 'bar', 5.0, 5], - ['bah', 'bam', 6.0, 6]], - columns=list('ABCD')) - df = df.set_index(['A', 'B']) + df = DataFrame( + [ + ["foo", "bar", 1.0, 1], + ["foo", "bar", 2.0, 2], + ["bah", "bam", 3.0, 3], + ["bah", "bam", 4.0, 4], + ["foo", "bar", 5.0, 5], + ["bah", "bam", 6.0, 6], + ], + columns=list("ABCD"), + ) + df = df.set_index(["A", "B"]) df = df.sort_index(level=0) - expected = DataFrame([['foo', 'bar', 1.0, 1], ['foo', 'bar', 2.0, 2], - ['foo', 'bar', 5.0, 5]], - columns=list('ABCD')).set_index(['A', 'B']) - result = df.loc[('foo', 'bar')] + expected = DataFrame( + [["foo", "bar", 1.0, 1], ["foo", "bar", 2.0, 2], ["foo", "bar", 5.0, 5]], + columns=list("ABCD"), + ).set_index(["A", "B"]) + result = df.loc[("foo", "bar")] tm.assert_frame_equal(result, expected) def test_duplicated_drop_duplicates(self): # GH 4060 idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2])) - expected = np.array( - [False, False, False, True, False, False], dtype=bool) + expected = np.array([False, False, False, True, False, False], dtype=bool) duplicated = idx.duplicated() tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool @@ -1421,11 +1549,11 @@ def test_duplicated_drop_duplicates(self): tm.assert_index_equal(idx.drop_duplicates(), expected) expected = np.array([True, False, False, False, False, False]) - duplicated = idx.duplicated(keep='last') + duplicated = idx.duplicated(keep="last") tm.assert_numpy_array_equal(duplicated, expected) assert duplicated.dtype == bool expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2])) - tm.assert_index_equal(idx.drop_duplicates(keep='last'), expected) + tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected) expected = np.array([True, False, False, True, False, False]) duplicated = idx.duplicated(keep=False) @@ -1436,25 +1564,26 @@ def test_duplicated_drop_duplicates(self): def test_multiindex_set_index(self): # segfault in #3308 - d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]} + d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]} df = DataFrame(d) tuples = [(0, 1), (0, 2), (1, 2)] - df['tuples'] = tuples + df["tuples"] = tuples - index = MultiIndex.from_tuples(df['tuples']) + index = MultiIndex.from_tuples(df["tuples"]) # it works! 
df.set_index(index) def test_datetimeindex(self): idx1 = pd.DatetimeIndex( - ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00' - ] * 2, tz='Asia/Tokyo') - idx2 = pd.date_range('2010/01/01', periods=6, freq='M', - tz='US/Eastern') + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, + tz="Asia/Tokyo", + ) + idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) - expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', - '2013-04-03 9:00'], tz='Asia/Tokyo') + expected1 = pd.DatetimeIndex( + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"], tz="Asia/Tokyo" + ) tm.assert_index_equal(idx.levels[0], expected1) tm.assert_index_equal(idx.levels[1], idx2) @@ -1465,18 +1594,19 @@ def test_datetimeindex(self): date2 = datetime.datetime.today() date3 = Timestamp.today() - for d1, d2 in itertools.product( - [date1, date2, date3], [date1, date2, date3]): + for d1, d2 in itertools.product([date1, date2, date3], [date1, date2, date3]): index = MultiIndex.from_product([[d1], [d2]]) assert isinstance(index.levels[0], pd.DatetimeIndex) assert isinstance(index.levels[1], pd.DatetimeIndex) def test_constructor_with_tz(self): - index = pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], - name='dt1', tz='US/Pacific') - columns = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], - name='dt2', tz='Asia/Tokyo') + index = pd.DatetimeIndex( + ["2013/01/01 09:00", "2013/01/02 09:00"], name="dt1", tz="US/Pacific" + ) + columns = pd.DatetimeIndex( + ["2014/01/01 09:00", "2014/01/02 09:00"], name="dt2", tz="Asia/Tokyo" + ) result = MultiIndex.from_arrays([index, columns]) tm.assert_index_equal(result.levels[0], index) @@ -1489,49 +1619,72 @@ def test_constructor_with_tz(self): def test_set_index_datetime(self): # GH 3950 df = DataFrame( - {'label': ['a', 'a', 'a', 'b', 'b', 'b'], - 'datetime': ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'], - 'value': range(6)}) - df.index = pd.to_datetime(df.pop('datetime'), utc=True) - df.index = df.index.tz_convert('US/Pacific') - - expected = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], name='datetime') - expected = expected.tz_localize('UTC').tz_convert('US/Pacific') - - df = df.set_index('label', append=True) + { + "label": ["a", "a", "a", "b", "b", "b"], + "datetime": [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + "value": range(6), + } + ) + df.index = pd.to_datetime(df.pop("datetime"), utc=True) + df.index = df.index.tz_convert("US/Pacific") + + expected = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", + ) + expected = expected.tz_localize("UTC").tz_convert("US/Pacific") + + df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], - Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], - Index(['a', 'b'], name='label')) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) df = DataFrame(np.random.random(6)) - idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 
08:00:00', - '2011-07-19 09:00:00', '2011-07-19 07:00:00', - '2011-07-19 08:00:00', '2011-07-19 09:00:00'], - tz='US/Eastern') - idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00', - '2012-04-01 09:00', '2012-04-02 09:00', - '2012-04-02 09:00', '2012-04-02 09:00'], - tz='US/Eastern') - idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo') + idx1 = pd.DatetimeIndex( + [ + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + "2011-07-19 07:00:00", + "2011-07-19 08:00:00", + "2011-07-19 09:00:00", + ], + tz="US/Eastern", + ) + idx2 = pd.DatetimeIndex( + [ + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-01 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + "2012-04-02 09:00", + ], + tz="US/Eastern", + ) + idx3 = pd.date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) - expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', - '2011-07-19 08:00:00', - '2011-07-19 09:00:00'], tz='US/Eastern') - expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], - tz='US/Eastern') + expected1 = pd.DatetimeIndex( + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + tz="US/Eastern", + ) + expected2 = pd.DatetimeIndex( + ["2012-04-01 09:00", "2012-04-02 09:00"], tz="US/Eastern" + ) tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -1544,157 +1697,190 @@ def test_set_index_datetime(self): def test_reset_index_datetime(self): # GH 3950 - for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']: - idx1 = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz, - name='idx1') - idx2 = Index(range(5), name='idx2', dtype='int64') + for tz in ["UTC", "Asia/Tokyo", "US/Eastern"]: + idx1 = pd.date_range("1/1/2011", periods=5, freq="D", tz=tz, name="idx1") + idx2 = Index(range(5), name="idx2", dtype="int64") idx = MultiIndex.from_arrays([idx1, idx2]) df = DataFrame( - {'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) - - expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - datetime.datetime(2011, 1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], - 'idx2': np.arange(5, dtype='int64'), - 'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx1', 'idx2', 'a', 'b']) - expected['idx1'] = expected['idx1'].apply( - lambda d: Timestamp(d, tz=tz)) + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) tm.assert_frame_equal(df.reset_index(), expected) - idx3 = pd.date_range('1/1/2012', periods=5, freq='MS', - tz='Europe/Paris', name='idx3') + idx3 = pd.date_range( + "1/1/2012", periods=5, freq="MS", tz="Europe/Paris", name="idx3" + ) idx = MultiIndex.from_arrays([idx1, idx2, idx3]) df = DataFrame( - {'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, index=idx) - - expected = DataFrame({'idx1': [datetime.datetime(2011, 1, 1), - datetime.datetime(2011, 1, 2), - datetime.datetime(2011, 
1, 3), - datetime.datetime(2011, 1, 4), - datetime.datetime(2011, 1, 5)], - 'idx2': np.arange(5, dtype='int64'), - 'idx3': [datetime.datetime(2012, 1, 1), - datetime.datetime(2012, 2, 1), - datetime.datetime(2012, 3, 1), - datetime.datetime(2012, 4, 1), - datetime.datetime(2012, 5, 1)], - 'a': np.arange(5, dtype='int64'), - 'b': ['A', 'B', 'C', 'D', 'E']}, - columns=['idx1', 'idx2', 'idx3', 'a', 'b']) - expected['idx1'] = expected['idx1'].apply( - lambda d: Timestamp(d, tz=tz)) - expected['idx3'] = expected['idx3'].apply( - lambda d: Timestamp(d, tz='Europe/Paris')) + {"a": np.arange(5, dtype="int64"), "b": ["A", "B", "C", "D", "E"]}, + index=idx, + ) + + expected = DataFrame( + { + "idx1": [ + datetime.datetime(2011, 1, 1), + datetime.datetime(2011, 1, 2), + datetime.datetime(2011, 1, 3), + datetime.datetime(2011, 1, 4), + datetime.datetime(2011, 1, 5), + ], + "idx2": np.arange(5, dtype="int64"), + "idx3": [ + datetime.datetime(2012, 1, 1), + datetime.datetime(2012, 2, 1), + datetime.datetime(2012, 3, 1), + datetime.datetime(2012, 4, 1), + datetime.datetime(2012, 5, 1), + ], + "a": np.arange(5, dtype="int64"), + "b": ["A", "B", "C", "D", "E"], + }, + columns=["idx1", "idx2", "idx3", "a", "b"], + ) + expected["idx1"] = expected["idx1"].apply(lambda d: Timestamp(d, tz=tz)) + expected["idx3"] = expected["idx3"].apply( + lambda d: Timestamp(d, tz="Europe/Paris") + ) tm.assert_frame_equal(df.reset_index(), expected) # GH 7793 - idx = MultiIndex.from_product([['a', 'b'], pd.date_range( - '20130101', periods=3, tz=tz)]) + idx = MultiIndex.from_product( + [["a", "b"], pd.date_range("20130101", periods=3, tz=tz)] + ) df = DataFrame( - np.arange(6, dtype='int64').reshape( - 6, 1), columns=['a'], index=idx) - - expected = DataFrame({'level_0': 'a a a b b b'.split(), - 'level_1': [ - datetime.datetime(2013, 1, 1), - datetime.datetime(2013, 1, 2), - datetime.datetime(2013, 1, 3)] * 2, - 'a': np.arange(6, dtype='int64')}, - columns=['level_0', 'level_1', 'a']) - expected['level_1'] = expected['level_1'].apply( - lambda d: Timestamp(d, freq='D', tz=tz)) + np.arange(6, dtype="int64").reshape(6, 1), columns=["a"], index=idx + ) + + expected = DataFrame( + { + "level_0": "a a a b b b".split(), + "level_1": [ + datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 2), + datetime.datetime(2013, 1, 3), + ] + * 2, + "a": np.arange(6, dtype="int64"), + }, + columns=["level_0", "level_1", "a"], + ) + expected["level_1"] = expected["level_1"].apply( + lambda d: Timestamp(d, freq="D", tz=tz) + ) tm.assert_frame_equal(df.reset_index(), expected) def test_reset_index_period(self): # GH 7746 idx = MultiIndex.from_product( - [pd.period_range('20130101', periods=3, freq='M'), list('abc')], - names=['month', 'feature']) - - df = DataFrame(np.arange(9, dtype='int64').reshape(-1, 1), - index=idx, columns=['a']) - expected = DataFrame({ - 'month': ([pd.Period('2013-01', freq='M')] * 3 + - [pd.Period('2013-02', freq='M')] * 3 + - [pd.Period('2013-03', freq='M')] * 3), - 'feature': ['a', 'b', 'c'] * 3, - 'a': np.arange(9, dtype='int64') - }, columns=['month', 'feature', 'a']) + [pd.period_range("20130101", periods=3, freq="M"), list("abc")], + names=["month", "feature"], + ) + + df = DataFrame( + np.arange(9, dtype="int64").reshape(-1, 1), index=idx, columns=["a"] + ) + expected = DataFrame( + { + "month": ( + [pd.Period("2013-01", freq="M")] * 3 + + [pd.Period("2013-02", freq="M")] * 3 + + [pd.Period("2013-03", freq="M")] * 3 + ), + "feature": ["a", "b", "c"] * 3, + "a": np.arange(9, dtype="int64"), + }, + 
columns=["month", "feature", "a"], + ) tm.assert_frame_equal(df.reset_index(), expected) def test_reset_index_multiindex_columns(self): - levels = [['A', ''], ['B', 'b']] - df = DataFrame([[0, 2], [1, 3]], - columns=MultiIndex.from_tuples(levels)) - result = df[['B']].rename_axis('A').reset_index() + levels = [["A", ""], ["B", "b"]] + df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + result = df[["B"]].rename_axis("A").reset_index() tm.assert_frame_equal(result, df) # gh-16120: already existing column - with pytest.raises(ValueError, - match=(r"cannot insert \('A', ''\), " - "already exists")): - df.rename_axis('A').reset_index() + with pytest.raises( + ValueError, match=(r"cannot insert \('A', ''\), " "already exists") + ): + df.rename_axis("A").reset_index() # gh-16164: multiindex (tuple) full key - result = df.set_index([('A', '')]).reset_index() + result = df.set_index([("A", "")]).reset_index() tm.assert_frame_equal(result, df) # with additional (unnamed) index level - idx_col = DataFrame([[0], [1]], - columns=MultiIndex.from_tuples([('level_0', '')])) - expected = pd.concat([idx_col, df[[('B', 'b'), ('A', '')]]], axis=1) - result = df.set_index([('B', 'b')], append=True).reset_index() + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) + ) + expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) + result = df.set_index([("B", "b")], append=True).reset_index() tm.assert_frame_equal(result, expected) # with index name which is a too long tuple... - with pytest.raises(ValueError, - match=("Item must have length equal " - "to number of levels.")): - df.rename_axis([('C', 'c', 'i')]).reset_index() + with pytest.raises( + ValueError, match=("Item must have length equal " "to number of levels.") + ): + df.rename_axis([("C", "c", "i")]).reset_index() # or too short... - levels = [['A', 'a', ''], ['B', 'b', 'i']] - df2 = DataFrame([[0, 2], [1, 3]], - columns=MultiIndex.from_tuples(levels)) - idx_col = DataFrame([[0], [1]], - columns=MultiIndex.from_tuples([('C', 'c', 'ii')])) + levels = [["A", "a", ""], ["B", "b", "i"]] + df2 = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + idx_col = DataFrame( + [[0], [1]], columns=MultiIndex.from_tuples([("C", "c", "ii")]) + ) expected = pd.concat([idx_col, df2], axis=1) - result = df2.rename_axis([('C', 'c')]).reset_index(col_fill='ii') + result = df2.rename_axis([("C", "c")]).reset_index(col_fill="ii") tm.assert_frame_equal(result, expected) # ... 
which is incompatible with col_fill=None - with pytest.raises(ValueError, - match=("col_fill=None is incompatible with " - r"incomplete column name \('C', 'c'\)")): - df2.rename_axis([('C', 'c')]).reset_index(col_fill=None) + with pytest.raises( + ValueError, + match=( + "col_fill=None is incompatible with " + r"incomplete column name \('C', 'c'\)" + ), + ): + df2.rename_axis([("C", "c")]).reset_index(col_fill=None) # with col_level != 0 - result = df2.rename_axis([('c', 'ii')]).reset_index(col_level=1, - col_fill='C') + result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") tm.assert_frame_equal(result, expected) def test_set_index_period(self): # GH 6631 df = DataFrame(np.random.random(6)) - idx1 = pd.period_range('2011-01-01', periods=3, freq='M') + idx1 = pd.period_range("2011-01-01", periods=3, freq="M") idx1 = idx1.append(idx1) - idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + idx2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") idx2 = idx2.append(idx2).append(idx2) - idx3 = pd.period_range('2005', periods=6, freq='A') + idx3 = pd.period_range("2005", periods=6, freq="A") df = df.set_index(idx1) df = df.set_index(idx2, append=True) df = df.set_index(idx3, append=True) - expected1 = pd.period_range('2011-01-01', periods=3, freq='M') - expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + expected1 = pd.period_range("2011-01-01", periods=3, freq="M") + expected2 = pd.period_range("2013-01-01 09:00", periods=2, freq="H") tm.assert_index_equal(df.index.levels[0], expected1) tm.assert_index_equal(df.index.levels[1], expected2) @@ -1708,9 +1894,9 @@ def test_repeat(self): # GH 9361 # fixed by # GH 7891 m_idx = MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6), (7, 8)]) - data = ['a', 'b', 'c', 'd'] + data = ["a", "b", "c", "d"] m_df = Series(data, index=m_idx) - assert m_df.repeat(3).shape == (3 * len(data), ) + assert m_df.repeat(3).shape == (3 * len(data),) class TestSorted(Base): @@ -1725,22 +1911,38 @@ def test_sorting_repr_8017(self): np.random.seed(0) data = np.random.randn(3, 4) - for gen, extra in [([1., 3., 2., 5.], 4.), ([1, 3, 2, 5], 4), - ([Timestamp('20130101'), Timestamp('20130103'), - Timestamp('20130102'), Timestamp('20130105')], - Timestamp('20130104')), - (['1one', '3one', '2one', '5one'], '4one')]: - columns = MultiIndex.from_tuples([('red', i) for i in gen]) - df = DataFrame(data, index=list('def'), columns=columns) - df2 = pd.concat([df, - DataFrame('world', index=list('def'), - columns=MultiIndex.from_tuples( - [('red', extra)]))], axis=1) + for gen, extra in [ + ([1.0, 3.0, 2.0, 5.0], 4.0), + ([1, 3, 2, 5], 4), + ( + [ + Timestamp("20130101"), + Timestamp("20130103"), + Timestamp("20130102"), + Timestamp("20130105"), + ], + Timestamp("20130104"), + ), + (["1one", "3one", "2one", "5one"], "4one"), + ]: + columns = MultiIndex.from_tuples([("red", i) for i in gen]) + df = DataFrame(data, index=list("def"), columns=columns) + df2 = pd.concat( + [ + df, + DataFrame( + "world", + index=list("def"), + columns=MultiIndex.from_tuples([("red", extra)]), + ), + ], + axis=1, + ) # check that the repr is good # make sure that we have a correct sparsified repr # e.g. 
only 1 header of read - assert str(df2).splitlines()[0].split() == ['red'] + assert str(df2).splitlines()[0].split() == ["red"] # GH 8017 # sorting fails after columns added @@ -1756,7 +1958,7 @@ def test_sorting_repr_8017(self): # setitem then sort result = df.copy() - result[('red', extra)] = 'world' + result[("red", extra)] = "world" result = result.sort_index(axis=1) tm.assert_frame_equal(result, expected) @@ -1768,7 +1970,7 @@ def test_sort_index_level(self): # axis=1 # series - a_sorted = self.frame['A'].sort_index(level=0) + a_sorted = self.frame["A"].sort_index(level=0) # preserve names assert a_sorted.index.names == self.frame.index.names @@ -1798,8 +2000,8 @@ def test_sort_index_level_large_cardinality(self): assert result.index.lexsort_depth == 3 def test_sort_index_level_by_name(self): - self.frame.index.names = ['first', 'second'] - result = self.frame.sort_index(level='second') + self.frame.index.names = ["first", "second"] + result = self.frame.sort_index(level="second") expected = self.frame.sort_index(level=1) tm.assert_frame_equal(result, expected) @@ -1807,32 +2009,36 @@ def test_sort_index_level_mixed(self): sorted_before = self.frame.sort_index(level=1) df = self.frame.copy() - df['foo'] = 'bar' + df["foo"] = "bar" sorted_after = df.sort_index(level=1) - tm.assert_frame_equal(sorted_before, - sorted_after.drop(['foo'], axis=1)) + tm.assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1)) dft = self.frame.T sorted_before = dft.sort_index(level=1, axis=1) - dft['foo', 'three'] = 'bar' + dft["foo", "three"] = "bar" sorted_after = dft.sort_index(level=1, axis=1) - tm.assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), - sorted_after.drop([('foo', 'three')], axis=1)) + tm.assert_frame_equal( + sorted_before.drop([("foo", "three")], axis=1), + sorted_after.drop([("foo", "three")], axis=1), + ) def test_is_lexsorted(self): levels = [[0, 1], [0, 1, 2]] - index = MultiIndex(levels=levels, - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + ) assert index.is_lexsorted() - index = MultiIndex(levels=levels, - codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]] + ) assert not index.is_lexsorted() - index = MultiIndex(levels=levels, - codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]]) + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]] + ) assert not index.is_lexsorted() assert index.lexsort_depth == 0 @@ -1842,17 +2048,19 @@ def test_sort_index_and_reconstruction(self): # lexsortedness should be identical # across MultiIndex construction methods - df = DataFrame([[1, 1], [2, 2]], index=list('ab')) - expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex.from_tuples([(0.5, 'a'), - (0.5, 'b'), - (0.8, 'a'), - (0.8, 'b')])) + df = DataFrame([[1, 1], [2, 2]], index=list("ab")) + expected = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples( + [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")] + ), + ) assert expected.index.is_lexsorted() result = DataFrame( [[1, 1], [2, 2], [1, 1], [2, 2]], - index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) + index=MultiIndex.from_product([[0.5, 0.8], list("ab")]), + ) result = result.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic @@ -1861,8 +2069,10 @@ def test_sort_index_and_reconstruction(self): result = DataFrame( [[1, 1], [2, 2], [1, 1], 
[2, 2]], - index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + index=MultiIndex( + levels=[[0.5, 0.8], ["a", "b"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) result = result.sort_index() assert result.index.is_lexsorted() @@ -1877,15 +2087,17 @@ def test_sort_index_and_reconstruction(self): tm.assert_frame_equal(result, expected) # 14015 - df = DataFrame([[1, 2], [6, 7]], - columns=MultiIndex.from_tuples( - [(0, '20160811 12:00:00'), - (0, '20160809 12:00:00')], - names=['l1', 'Date'])) - - df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), - level=1, - inplace=True) + df = DataFrame( + [[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, "20160811 12:00:00"), (0, "20160809 12:00:00")], + names=["l1", "Date"], + ), + ) + + df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + ) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic result = df.sort_index(axis=1) @@ -1897,18 +2109,22 @@ def test_sort_index_and_reconstruction(self): def test_sort_index_and_reconstruction_doc_example(self): # doc example - df = DataFrame({'value': [1, 2, 3, 4]}, - index=MultiIndex( - levels=[['a', 'b'], ['bb', 'aa']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + df = DataFrame( + {"value": [1, 2, 3, 4]}, + index=MultiIndex( + levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) assert df.index.is_lexsorted() assert not df.index.is_monotonic # sort it - expected = DataFrame({'value': [2, 1, 4, 3]}, - index=MultiIndex( - levels=[['a', 'b'], ['aa', 'bb']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]])) + expected = DataFrame( + {"value": [2, 1, 4, 3]}, + index=MultiIndex( + levels=[["a", "b"], ["aa", "bb"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]] + ), + ) result = df.sort_index() assert result.index.is_lexsorted() assert result.index.is_monotonic @@ -1928,20 +2144,22 @@ def test_sort_index_reorder_on_ops(self): df = DataFrame( np.random.randn(8, 2), index=MultiIndex.from_product( - [['a', 'b'], ['big', 'small'], ['red', 'blu']], - names=['letter', 'size', 'color']), - columns=['near', 'far']) + [["a", "b"], ["big", "small"], ["red", "blu"]], + names=["letter", "size", "color"], + ), + columns=["near", "far"], + ) df = df.sort_index() def my_func(group): - group.index = ['newz', 'newa'] + group.index = ["newz", "newa"] return group - result = df.groupby(level=['letter', 'size']).apply( - my_func).sort_index() + result = df.groupby(level=["letter", "size"]).apply(my_func).sort_index() expected = MultiIndex.from_product( - [['a', 'b'], ['big', 'small'], ['newa', 'newz']], - names=['letter', 'size', None]) + [["a", "b"], ["big", "small"], ["newa", "newz"]], + names=["letter", "size", None], + ) tm.assert_index_equal(result.index, expected) @@ -1949,14 +2167,11 @@ def test_sort_non_lexsorted(self): # degenerate case where we sort but don't # have a satisfying result :< # GH 15797 - idx = MultiIndex([['A', 'B', 'C'], - ['c', 'b', 'a']], - [[0, 1, 2, 0, 1, 2], - [0, 2, 1, 1, 0, 2]]) - - df = DataFrame({'col': range(len(idx))}, - index=idx, - dtype='int64') + idx = MultiIndex( + [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]] + ) + + df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64") assert df.index.is_lexsorted() is False assert df.index.is_monotonic is False @@ -1965,11 +2180,13 @@ def test_sort_non_lexsorted(self): assert sorted.index.is_monotonic is True expected = DataFrame( - {'col': [1, 4, 5, 2]}, - index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'), - 
('C', 'a'), ('C', 'b')]), - dtype='int64') - result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :] + {"col": [1, 4, 5, 2]}, + index=MultiIndex.from_tuples( + [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")] + ), + dtype="int64", + ) + result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :] tm.assert_frame_equal(result, expected) def test_sort_index_nan(self): @@ -1978,21 +2195,45 @@ def test_sort_index_nan(self): tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]] mi = MultiIndex.from_tuples(tuples) - df = DataFrame(np.arange(16).reshape(4, 4), - index=mi, columns=list('ABCD')) + df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD")) s = Series(np.arange(4), index=mi) - df2 = DataFrame({ - 'date': pd.to_datetime([ - '20121002', '20121007', '20130130', '20130202', '20130305', - '20121002', '20121207', '20130130', '20130202', '20130305', - '20130202', '20130305' - ]), - 'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], - 'whole_cost': [1790, np.nan, 280, 259, np.nan, 623, 90, 312, - np.nan, 301, 359, 801], - 'cost': [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12] - }).set_index(['date', 'user_id']) + df2 = DataFrame( + { + "date": pd.to_datetime( + [ + "20121002", + "20121007", + "20130130", + "20130202", + "20130305", + "20121002", + "20121207", + "20130130", + "20130202", + "20130305", + "20130202", + "20130305", + ] + ), + "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5], + "whole_cost": [ + 1790, + np.nan, + 280, + 259, + np.nan, + 623, + 90, + 312, + np.nan, + 301, + 359, + 801, + ], + "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12], + } + ).set_index(["date", "user_id"]) # sorting frame, default nan position is last result = df.sort_index() @@ -2000,12 +2241,12 @@ def test_sort_index_nan(self): tm.assert_frame_equal(result, expected) # sorting frame, nan position last - result = df.sort_index(na_position='last') + result = df.sort_index(na_position="last") expected = df.iloc[[3, 0, 2, 1], :] tm.assert_frame_equal(result, expected) # sorting frame, nan position first - result = df.sort_index(na_position='first') + result = df.sort_index(na_position="first") expected = df.iloc[[1, 2, 3, 0], :] tm.assert_frame_equal(result, expected) @@ -2020,12 +2261,12 @@ def test_sort_index_nan(self): tm.assert_series_equal(result, expected) # sorting series, nan position last - result = s.sort_index(na_position='last') + result = s.sort_index(na_position="last") expected = s.iloc[[3, 0, 2, 1]] tm.assert_series_equal(result, expected) # sorting series, nan position first - result = s.sort_index(na_position='first') + result = s.sort_index(na_position="first") expected = s.iloc[[1, 2, 3, 0]] tm.assert_series_equal(result, expected) @@ -2033,20 +2274,21 @@ def test_sort_ascending_list(self): # GH: 16934 # Set up a Series with a three level MultiIndex - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], - [4, 3, 2, 1, 4, 3, 2, 1]] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + [4, 3, 2, 1, 4, 3, 2, 1], + ] tuples = zip(*arrays) - mi = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + mi = MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = Series(range(8), index=mi) # Sort with boolean ascending - result = s.sort_index(level=['third', 'first'], ascending=False) + result = s.sort_index(level=["third", "first"], ascending=False) expected = s.iloc[[4, 0, 5, 1, 6, 2, 7, 
3]] tm.assert_series_equal(result, expected) # Sort with list of boolean ascending - result = s.sort_index(level=['third', 'first'], - ascending=[False, True]) + result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 6e7b34a0632ad..f6e936630f6be 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -18,7 +18,6 @@ class TestnanopsDataFrame: - def setup_method(self, method): np.random.seed(11235) nanops._USE_BOTTLENECK = False @@ -30,12 +29,10 @@ def setup_method(self, method): self.arr_complex = self.arr_float + self.arr_float1 * 1j self.arr_int = np.random.randint(-10, 10, arr_shape) self.arr_bool = np.random.randint(0, 2, arr_shape) == 0 - self.arr_str = np.abs(self.arr_float).astype('S') - self.arr_utf = np.abs(self.arr_float).astype('U') - self.arr_date = np.random.randint(0, 20000, - arr_shape).astype('M8[ns]') - self.arr_tdelta = np.random.randint(0, 20000, - arr_shape).astype('m8[ns]') + self.arr_str = np.abs(self.arr_float).astype("S") + self.arr_utf = np.abs(self.arr_float).astype("U") + self.arr_date = np.random.randint(0, 20000, arr_shape).astype("M8[ns]") + self.arr_tdelta = np.random.randint(0, 20000, arr_shape).astype("m8[ns]") self.arr_nan = np.tile(np.nan, arr_shape) self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan]) @@ -47,29 +44,27 @@ def setup_method(self, method): self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf]) self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf]) - self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, - self.arr_inf]) - self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, - self.arr_inf]) - self.arr_obj = np.vstack([ - self.arr_float.astype('O'), - self.arr_int.astype('O'), - self.arr_bool.astype('O'), - self.arr_complex.astype('O'), - self.arr_str.astype('O'), - self.arr_utf.astype('O'), - self.arr_date.astype('O'), - self.arr_tdelta.astype('O') - ]) - - with np.errstate(invalid='ignore'): + self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf]) + self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf]) + self.arr_obj = np.vstack( + [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + self.arr_complex.astype("O"), + self.arr_str.astype("O"), + self.arr_utf.astype("O"), + self.arr_date.astype("O"), + self.arr_tdelta.astype("O"), + ] + ) + + with np.errstate(invalid="ignore"): self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j - self.arr_complex_nan = np.vstack([self.arr_complex, - self.arr_nan_nanj]) + self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) self.arr_nan_infj = self.arr_inf * 1j - self.arr_complex_nan_infj = np.vstack([self.arr_complex, - self.arr_nan_infj]) + self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj]) self.arr_float_2d = self.arr_float[:, :, 0] self.arr_float1_2d = self.arr_float1[:, :, 0] @@ -91,22 +86,26 @@ def teardown_method(self, method): nanops._USE_BOTTLENECK = use_bn def check_results(self, targ, res, axis, check_dtype=True): - res = getattr(res, 'asm8', res) - res = getattr(res, 'values', res) + res = getattr(res, "asm8", res) + res = getattr(res, "values", res) # timedeltas are a beast here def _coerce_tds(targ, res): - if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]': + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": if len(targ) 
== 1: targ = targ[0].item() res = res.item() else: - targ = targ.view('i8') + targ = targ.view("i8") return targ, res try: - if axis != 0 and hasattr( - targ, 'shape') and targ.ndim and targ.shape != res.shape: + if ( + axis != 0 + and hasattr(targ, "shape") + and targ.ndim + and targ.shape != res.shape + ): res = np.split(res, [targ.shape[0]], axis=0)[0] except (ValueError, IndexError): targ, res = _coerce_tds(targ, res) @@ -116,7 +115,7 @@ def _coerce_tds(targ, res): except AssertionError: # handle timedelta dtypes - if hasattr(targ, 'dtype') and targ.dtype == 'm8[ns]': + if hasattr(targ, "dtype") and targ.dtype == "m8[ns]": targ, res = _coerce_tds(targ, res) tm.assert_almost_equal(targ, res, check_dtype=check_dtype) return @@ -124,34 +123,40 @@ def _coerce_tds(targ, res): # There are sometimes rounding errors with # complex and object dtypes. # If it isn't one of those, re-raise the error. - if not hasattr(res, 'dtype') or res.dtype.kind not in ['c', 'O']: + if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]: raise # convert object dtypes to something that can be split into # real and imaginary parts - if res.dtype.kind == 'O': - if targ.dtype.kind != 'O': + if res.dtype.kind == "O": + if targ.dtype.kind != "O": res = res.astype(targ.dtype) else: try: - res = res.astype('c16') + res = res.astype("c16") except RuntimeError: - res = res.astype('f8') + res = res.astype("f8") try: - targ = targ.astype('c16') + targ = targ.astype("c16") except RuntimeError: - targ = targ.astype('f8') + targ = targ.astype("f8") # there should never be a case where numpy returns an object # but nanops doesn't, so make that an exception - elif targ.dtype.kind == 'O': + elif targ.dtype.kind == "O": raise - tm.assert_almost_equal(np.real(targ), np.real(res), - check_dtype=check_dtype) - tm.assert_almost_equal(np.imag(targ), np.imag(res), - check_dtype=check_dtype) - - def check_fun_data(self, testfunc, targfunc, testarval, targarval, - targarnanval, check_dtype=True, empty_targfunc=None, - **kwargs): + tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype) + tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype) + + def check_fun_data( + self, + testfunc, + targfunc, + testarval, + targarval, + targarnanval, + check_dtype=True, + empty_targfunc=None, + **kwargs + ): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval @@ -161,25 +166,23 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, targ = targfunc(targartempval, axis=axis, **kwargs) try: - res = testfunc(testarval, axis=axis, skipna=skipna, - **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) + self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + self.check_results(targ, res, axis, check_dtype=check_dtype) if axis is None: res = testfunc(testarval, skipna=skipna, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna and axis is None: res = testfunc(testarval, **kwargs) - self.check_results(targ, res, axis, - check_dtype=check_dtype) + self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: - exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1), - 'skipna: 
%s' % skipna, 'kwargs: %s' % kwargs) + exc.args += ( + "axis: %s of %s" % (axis, testarval.ndim - 1), + "skipna: %s" % skipna, + "kwargs: %s" % kwargs, + ) raise if testarval.ndim <= 1: @@ -191,12 +194,27 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return - self.check_fun_data(testfunc, targfunc, testarval2, targarval2, - targarnanval2, check_dtype=check_dtype, - empty_targfunc=empty_targfunc, **kwargs) - - def check_fun(self, testfunc, targfunc, testar, targar=None, - targarnan=None, empty_targfunc=None, **kwargs): + self.check_fun_data( + testfunc, + targfunc, + testarval2, + targarval2, + targarnanval2, + check_dtype=check_dtype, + empty_targfunc=empty_targfunc, + **kwargs + ) + + def check_fun( + self, + testfunc, + targfunc, + testar, + targar=None, + targarnan=None, + empty_targfunc=None, + **kwargs + ): if targar is None: targar = testar if targarnan is None: @@ -205,40 +223,61 @@ def check_fun(self, testfunc, targfunc, testar, targar=None, targarval = getattr(self, targar) targarnanval = getattr(self, targarnan) try: - self.check_fun_data(testfunc, targfunc, testarval, targarval, - targarnanval, empty_targfunc=empty_targfunc, - **kwargs) + self.check_fun_data( + testfunc, + targfunc, + testarval, + targarval, + targarnanval, + empty_targfunc=empty_targfunc, + **kwargs + ) except BaseException as exc: - exc.args += ('testar: %s' % testar, 'targar: %s' % targar, - 'targarnan: %s' % targarnan) + exc.args += ( + "testar: %s" % testar, + "targar: %s" % targar, + "targarnan: %s" % targarnan, + ) raise - def check_funs(self, testfunc, targfunc, allow_complex=True, - allow_all_nan=True, allow_str=True, allow_date=True, - allow_tdelta=True, allow_obj=True, **kwargs): - self.check_fun(testfunc, targfunc, 'arr_float', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_float_nan', 'arr_float', - **kwargs) - self.check_fun(testfunc, targfunc, 'arr_int', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_bool', **kwargs) - objs = [self.arr_float.astype('O'), self.arr_int.astype('O'), - self.arr_bool.astype('O')] + def check_funs( + self, + testfunc, + targfunc, + allow_complex=True, + allow_all_nan=True, + allow_str=True, + allow_date=True, + allow_tdelta=True, + allow_obj=True, + **kwargs + ): + self.check_fun(testfunc, targfunc, "arr_float", **kwargs) + self.check_fun(testfunc, targfunc, "arr_float_nan", "arr_float", **kwargs) + self.check_fun(testfunc, targfunc, "arr_int", **kwargs) + self.check_fun(testfunc, targfunc, "arr_bool", **kwargs) + objs = [ + self.arr_float.astype("O"), + self.arr_int.astype("O"), + self.arr_bool.astype("O"), + ] if allow_all_nan: - self.check_fun(testfunc, targfunc, 'arr_nan', **kwargs) + self.check_fun(testfunc, targfunc, "arr_nan", **kwargs) if allow_complex: - self.check_fun(testfunc, targfunc, 'arr_complex', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_complex_nan', - 'arr_complex', **kwargs) + self.check_fun(testfunc, targfunc, "arr_complex", **kwargs) + self.check_fun( + testfunc, targfunc, "arr_complex_nan", "arr_complex", **kwargs + ) if allow_all_nan: - self.check_fun(testfunc, targfunc, 'arr_nan_nanj', **kwargs) - objs += [self.arr_complex.astype('O')] + self.check_fun(testfunc, targfunc, "arr_nan_nanj", **kwargs) + objs += [self.arr_complex.astype("O")] if allow_str: - self.check_fun(testfunc, targfunc, 'arr_str', **kwargs) - self.check_fun(testfunc, targfunc, 'arr_utf', **kwargs) - objs += [self.arr_str.astype('O'), self.arr_utf.astype('O')] + 
self.check_fun(testfunc, targfunc, "arr_str", **kwargs) + self.check_fun(testfunc, targfunc, "arr_utf", **kwargs) + objs += [self.arr_str.astype("O"), self.arr_utf.astype("O")] if allow_date: try: @@ -246,8 +285,8 @@ def check_funs(self, testfunc, targfunc, allow_complex=True, except TypeError: pass else: - self.check_fun(testfunc, targfunc, 'arr_date', **kwargs) - objs += [self.arr_date.astype('O')] + self.check_fun(testfunc, targfunc, "arr_date", **kwargs) + objs += [self.arr_date.astype("O")] if allow_tdelta: try: @@ -255,44 +294,69 @@ def check_funs(self, testfunc, targfunc, allow_complex=True, except TypeError: pass else: - self.check_fun(testfunc, targfunc, 'arr_tdelta', **kwargs) - objs += [self.arr_tdelta.astype('O')] + self.check_fun(testfunc, targfunc, "arr_tdelta", **kwargs) + objs += [self.arr_tdelta.astype("O")] if allow_obj: self.arr_obj = np.vstack(objs) # some nanops handle object dtypes better than their numpy # counterparts, so the numpy functions need to be given something # else - if allow_obj == 'convert': - targfunc = partial(self._badobj_wrap, func=targfunc, - allow_complex=allow_complex) - self.check_fun(testfunc, targfunc, 'arr_obj', **kwargs) + if allow_obj == "convert": + targfunc = partial( + self._badobj_wrap, func=targfunc, allow_complex=allow_complex + ) + self.check_fun(testfunc, targfunc, "arr_obj", **kwargs) def _badobj_wrap(self, value, func, allow_complex=True, **kwargs): - if value.dtype.kind == 'O': + if value.dtype.kind == "O": if allow_complex: - value = value.astype('c16') + value = value.astype("c16") else: - value = value.astype('f8') + value = value.astype("f8") return func(value, **kwargs) def test_nanany(self): - self.check_funs(nanops.nanany, np.any, allow_all_nan=False, - allow_str=False, allow_date=False, allow_tdelta=False) + self.check_funs( + nanops.nanany, + np.any, + allow_all_nan=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nanall(self): - self.check_funs(nanops.nanall, np.all, allow_all_nan=False, - allow_str=False, allow_date=False, allow_tdelta=False) + self.check_funs( + nanops.nanall, + np.all, + allow_all_nan=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nansum(self): - self.check_funs(nanops.nansum, np.sum, allow_str=False, - allow_date=False, allow_tdelta=True, check_dtype=False, - empty_targfunc=np.nansum) + self.check_funs( + nanops.nansum, + np.sum, + allow_str=False, + allow_date=False, + allow_tdelta=True, + check_dtype=False, + empty_targfunc=np.nansum, + ) def test_nanmean(self): - self.check_funs(nanops.nanmean, np.mean, allow_complex=False, - allow_obj=False, allow_str=False, allow_date=False, - allow_tdelta=True) + self.check_funs( + nanops.nanmean, + np.mean, + allow_complex=False, + allow_obj=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + ) def test_nanmean_overflow(self): # GH 10155 @@ -310,13 +374,13 @@ def test_nanmean_overflow(self): def test_returned_dtype(self): dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64] - if hasattr(np, 'float128'): + if hasattr(np, "float128"): dtypes.append(np.float128) for dtype in dtypes: s = Series(range(10), dtype=dtype) - group_a = ['mean', 'std', 'var', 'skew', 'kurt'] - group_b = ['min', 'max'] + group_a = ["mean", "std", "var", "skew", "kurt"] + group_b = ["min", "max"] for method in group_a + group_b: result = getattr(s, method)() if is_integer_dtype(dtype) and method in group_a: @@ -327,36 +391,64 @@ def test_returned_dtype(self): def test_nanmedian(self): with 
warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) - self.check_funs(nanops.nanmedian, np.median, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert') - - @pytest.mark.parametrize('ddof', range(3)) + self.check_funs( + nanops.nanmedian, + np.median, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ) + + @pytest.mark.parametrize("ddof", range(3)) def test_nanvar(self, ddof): - self.check_funs(nanops.nanvar, np.var, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert', ddof=ddof) - - @pytest.mark.parametrize('ddof', range(3)) + self.check_funs( + nanops.nanvar, + np.var, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ddof=ddof, + ) + + @pytest.mark.parametrize("ddof", range(3)) def test_nanstd(self, ddof): - self.check_funs(nanops.nanstd, np.std, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=True, allow_obj='convert', ddof=ddof) + self.check_funs( + nanops.nanstd, + np.std, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=True, + allow_obj="convert", + ddof=ddof, + ) @td.skip_if_no_scipy - @pytest.mark.parametrize('ddof', range(3)) + @pytest.mark.parametrize("ddof", range(3)) def test_nansem(self, ddof): from scipy.stats import sem - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nansem, sem, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False, allow_obj='convert', ddof=ddof) + + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nansem, + sem, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + allow_obj="convert", + ddof=ddof, + ) def _minmax_wrap(self, value, axis=None, func=None): # numpy warns if all nan res = func(value, axis) - if res.dtype.kind == 'm': + if res.dtype.kind == "m": res = np.atleast_1d(res) return res @@ -364,15 +456,13 @@ def test_nanmin(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._minmax_wrap, func=np.min) - self.check_funs(nanops.nanmin, func, - allow_str=False, allow_obj=False) + self.check_funs(nanops.nanmin, func, allow_str=False, allow_obj=False) def test_nanmax(self): with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._minmax_wrap, func=np.max) - self.check_funs(nanops.nanmax, func, - allow_str=False, allow_obj=False) + self.check_funs(nanops.nanmax, func, allow_str=False, allow_obj=False) def _argminmax_wrap(self, value, axis=None, func=None): res = func(value, axis) @@ -380,8 +470,12 @@ def _argminmax_wrap(self, value, axis=None, func=None): nullnan = isna(nans) if res.ndim: res[nullnan] = -1 - elif (hasattr(nullnan, 'all') and nullnan.all() or - not hasattr(nullnan, 'all') and nullnan): + elif ( + hasattr(nullnan, "all") + and nullnan.all() + or not hasattr(nullnan, "all") + and nullnan + ): res = -1 return res @@ -389,64 +483,92 @@ def test_nanargmax(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmax) - self.check_funs(nanops.nanargmax, func, - allow_str=False, allow_obj=False, - allow_date=True, allow_tdelta=True) + self.check_funs( + nanops.nanargmax, + func, + allow_str=False, + allow_obj=False, + allow_date=True, + allow_tdelta=True, + ) def 
test_nanargmin(self): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) func = partial(self._argminmax_wrap, func=np.argmin) - self.check_funs(nanops.nanargmin, func, allow_str=False, - allow_obj=False) + self.check_funs(nanops.nanargmin, func, allow_str=False, allow_obj=False) def _skew_kurt_wrap(self, values, axis=None, func=None): if not isinstance(values.dtype.type, np.floating): - values = values.astype('f8') + values = values.astype("f8") result = func(values, axis=axis, bias=False) # fix for handling cases where all elements in an axis are the same if isinstance(result, np.ndarray): result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0 return result elif np.max(values) == np.min(values): - return 0. + return 0.0 return result @td.skip_if_no_scipy def test_nanskew(self): from scipy.stats import skew + func = partial(self._skew_kurt_wrap, func=skew) - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nanskew, func, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nanskew, + func, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) @td.skip_if_no_scipy def test_nankurt(self): from scipy.stats import kurtosis + func1 = partial(kurtosis, fisher=True) func = partial(self._skew_kurt_wrap, func=func1) - with np.errstate(invalid='ignore'): - self.check_funs(nanops.nankurt, func, allow_complex=False, - allow_str=False, allow_date=False, - allow_tdelta=False) + with np.errstate(invalid="ignore"): + self.check_funs( + nanops.nankurt, + func, + allow_complex=False, + allow_str=False, + allow_date=False, + allow_tdelta=False, + ) def test_nanprod(self): - self.check_funs(nanops.nanprod, np.prod, allow_str=False, - allow_date=False, allow_tdelta=False, - empty_targfunc=np.nanprod) + self.check_funs( + nanops.nanprod, + np.prod, + allow_str=False, + allow_date=False, + allow_tdelta=False, + empty_targfunc=np.nanprod, + ) def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) - res01 = checkfun(self.arr_float_2d, self.arr_float1_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) + res01 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) - res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, - **kwargs) - res11 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) + res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs) + res11 = checkfun( + self.arr_float_nan_2d, + self.arr_float1_nan_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -454,12 +576,19 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs) res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs) res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs) - res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, - **kwargs) - res24 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, - min_periods=len(self.arr_float_2d) - 1, **kwargs) - res25 = checkfun(self.arr_float_2d, self.arr_float1_2d, - min_periods=len(self.arr_float_2d) + 1, **kwargs) + res23 = 
checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs) + res24 = checkfun( + self.arr_float_nan_2d, + self.arr_nan_float1_2d, + min_periods=len(self.arr_float_2d) - 1, + **kwargs + ) + res25 = checkfun( + self.arr_float_2d, + self.arr_float1_2d, + min_periods=len(self.arr_float_2d) + 1, + **kwargs + ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) tm.assert_almost_equal(targ2, res22) @@ -469,15 +598,22 @@ def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs) - res01 = checkfun(self.arr_float_1d, self.arr_float1_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) + res01 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) tm.assert_almost_equal(targ0, res00) tm.assert_almost_equal(targ0, res01) - res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, - **kwargs) - res11 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) + res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs) + res11 = checkfun( + self.arr_float_nan_1d, + self.arr_float1_nan_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) tm.assert_almost_equal(targ1, res10) tm.assert_almost_equal(targ1, res11) @@ -485,12 +621,19 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs) res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs) res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs) - res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, - **kwargs) - res24 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, - min_periods=len(self.arr_float_1d) - 1, **kwargs) - res25 = checkfun(self.arr_float_1d, self.arr_float1_1d, - min_periods=len(self.arr_float_1d) + 1, **kwargs) + res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs) + res24 = checkfun( + self.arr_float_nan_1d, + self.arr_nan_float1_1d, + min_periods=len(self.arr_float_1d) - 1, + **kwargs + ) + res25 = checkfun( + self.arr_float_1d, + self.arr_float1_1d, + min_periods=len(self.arr_float_1d) + 1, + **kwargs + ) tm.assert_almost_equal(targ2, res20) tm.assert_almost_equal(targ2, res21) tm.assert_almost_equal(targ2, res22) @@ -500,50 +643,41 @@ def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs): def test_nancorr(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] - targ1 = np.corrcoef(self.arr_float_2d.flat, - self.arr_float1_2d.flat)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1) targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] - targ1 = np.corrcoef(self.arr_float_1d.flat, - self.arr_float1_1d.flat)[0, 1] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") def test_nancorr_pearson(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] - targ1 = np.corrcoef(self.arr_float_2d.flat, - self.arr_float1_2d.flat)[0, 1] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_2d.flat, 
self.arr_float1_2d.flat)[0, 1] + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson") targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] - targ1 = np.corrcoef(self.arr_float_1d.flat, - self.arr_float1_1d.flat)[0, 1] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='pearson') + targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson") @td.skip_if_no_scipy def test_nancorr_kendall(self): from scipy.stats import kendalltau + targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0] targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='kendall') + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall") targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0] targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='kendall') + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall") @td.skip_if_no_scipy def test_nancorr_spearman(self): from scipy.stats import spearmanr + targ0 = spearmanr(self.arr_float_2d, self.arr_float1_2d)[0] targ1 = spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0] - self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, - method='spearman') + self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman") targ0 = spearmanr(self.arr_float_1d, self.arr_float1_1d)[0] targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] - self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, - method='spearman') + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") def test_nancov(self): targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] @@ -578,7 +712,7 @@ def check_nancomp(self, checkfun, targ0): res2 = checkfun(arr_float_nan, arr_nan_float1) tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) except Exception as exc: - exc.args += ('ndim: %s' % arr_float.ndim, ) + exc.args += ("ndim: %s" % arr_float.ndim,) raise try: @@ -618,7 +752,7 @@ def test_nanne(self): self.check_nancomp(nanops.nanne, targ0) def check_bool(self, func, value, correct, *args, **kwargs): - while getattr(value, 'ndim', True): + while getattr(value, "ndim", True): try: res0 = func(value, *args, **kwargs) if correct: @@ -626,9 +760,9 @@ def check_bool(self, func, value, correct, *args, **kwargs): else: assert not res0 except BaseException as exc: - exc.args += ('dim: %s' % getattr(value, 'ndim', value), ) + exc.args += ("dim: %s" % getattr(value, "ndim", value),) raise - if not hasattr(value, 'ndim'): + if not hasattr(value, "ndim"): break try: value = np.take(value, 0, axis=-1) @@ -636,46 +770,72 @@ def check_bool(self, func, value, correct, *args, **kwargs): break def test__has_infs(self): - pairs = [('arr_complex', False), ('arr_int', False), - ('arr_bool', False), ('arr_str', False), ('arr_utf', False), - ('arr_complex', False), ('arr_complex_nan', False), - ('arr_nan_nanj', False), ('arr_nan_infj', True), - ('arr_complex_nan_infj', True)] - pairs_float = [('arr_float', False), ('arr_nan', False), - ('arr_float_nan', False), ('arr_nan_nan', False), - ('arr_float_inf', True), ('arr_inf', True), - ('arr_nan_inf', True), ('arr_float_nan_inf', True), - ('arr_nan_nan_inf', True)] + pairs = [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), 
+ ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", False), + ("arr_nan_nanj", False), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ] + pairs_float = [ + ("arr_float", False), + ("arr_nan", False), + ("arr_float_nan", False), + ("arr_nan_nan", False), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ] for arr, correct in pairs: val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise for arr, correct in pairs_float: val = getattr(self, arr) try: self.check_bool(nanops._has_infs, val, correct) - self.check_bool(nanops._has_infs, val.astype('f4'), correct) - self.check_bool(nanops._has_infs, val.astype('f2'), correct) + self.check_bool(nanops._has_infs, val.astype("f4"), correct) + self.check_bool(nanops._has_infs, val.astype("f2"), correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise def test__isfinite(self): - pairs = [('arr_complex', False), ('arr_int', False), - ('arr_bool', False), ('arr_str', False), ('arr_utf', False), - ('arr_complex', False), ('arr_complex_nan', True), - ('arr_nan_nanj', True), ('arr_nan_infj', True), - ('arr_complex_nan_infj', True)] - pairs_float = [('arr_float', False), ('arr_nan', True), - ('arr_float_nan', True), ('arr_nan_nan', True), - ('arr_float_inf', True), ('arr_inf', True), - ('arr_nan_inf', True), ('arr_float_nan_inf', True), - ('arr_nan_nan_inf', True)] + pairs = [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", True), + ("arr_nan_nanj", True), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ] + pairs_float = [ + ("arr_float", False), + ("arr_nan", True), + ("arr_float_nan", True), + ("arr_nan_nan", True), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ] func1 = lambda x: np.any(nanops._isfinite(x).ravel()) @@ -687,33 +847,32 @@ def test__isfinite(self): try: self.check_bool(func1, val, correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise for arr, correct in pairs_float: val = getattr(self, arr) try: self.check_bool(func1, val, correct) - self.check_bool(func1, val.astype('f4'), correct) - self.check_bool(func1, val.astype('f2'), correct) + self.check_bool(func1, val.astype("f4"), correct) + self.check_bool(func1, val.astype("f2"), correct) except BaseException as exc: - exc.args += (arr, ) + exc.args += (arr,) raise def test__bn_ok_dtype(self): - assert nanops._bn_ok_dtype(self.arr_float.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_complex.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_int.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_bool.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_str.dtype, 'test') - assert nanops._bn_ok_dtype(self.arr_utf.dtype, 'test') - assert not nanops._bn_ok_dtype(self.arr_date.dtype, 'test') - assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, 'test') - assert not nanops._bn_ok_dtype(self.arr_obj.dtype, 'test') + assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_int.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test") + assert 
nanops._bn_ok_dtype(self.arr_str.dtype, "test") + assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test") + assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test") class TestEnsureNumeric: - def test_numeric_values(self): # Test integer assert nanops._ensure_numeric(1) == 1 @@ -734,24 +893,24 @@ def test_ndarray(self): assert np.allclose(nanops._ensure_numeric(o_values), values) # Test convertible string ndarray - s_values = np.array(['1', '2', '3'], dtype=object) + s_values = np.array(["1", "2", "3"], dtype=object) assert np.allclose(nanops._ensure_numeric(s_values), values) # Test non-convertible string ndarray - s_values = np.array(['foo', 'bar', 'baz'], dtype=object) + s_values = np.array(["foo", "bar", "baz"], dtype=object) msg = r"could not convert string to float: '(foo|baz)'" with pytest.raises(ValueError, match=msg): nanops._ensure_numeric(s_values) def test_convertable_values(self): - assert np.allclose(nanops._ensure_numeric('1'), 1.0) - assert np.allclose(nanops._ensure_numeric('1.1'), 1.1) - assert np.allclose(nanops._ensure_numeric('1+1j'), 1 + 1j) + assert np.allclose(nanops._ensure_numeric("1"), 1.0) + assert np.allclose(nanops._ensure_numeric("1.1"), 1.1) + assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j) def test_non_convertable_values(self): msg = "Could not convert foo to numeric" with pytest.raises(TypeError, match=msg): - nanops._ensure_numeric('foo') + nanops._ensure_numeric("foo") msg = "Could not convert {} to numeric" with pytest.raises(TypeError, match=msg): nanops._ensure_numeric({}) @@ -772,16 +931,14 @@ def setup_method(self, method): def test_nanvar_all_finite(self): samples = self.samples actual_variance = nanops.nanvar(samples) - tm.assert_almost_equal(actual_variance, self.variance, - check_less_precise=2) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) def test_nanvar_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_variance = nanops.nanvar(samples, skipna=True) - tm.assert_almost_equal(actual_variance, self.variance, - check_less_precise=2) + tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) actual_variance = nanops.nanvar(samples, skipna=False) tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2) @@ -791,12 +948,10 @@ def test_nanstd_nans(self): samples[::2] = self.samples actual_std = nanops.nanstd(samples, skipna=True) - tm.assert_almost_equal(actual_std, self.variance ** 0.5, - check_less_precise=2) + tm.assert_almost_equal(actual_std, self.variance ** 0.5, check_less_precise=2) actual_std = nanops.nanvar(samples, skipna=False) - tm.assert_almost_equal(actual_std, np.nan, - check_less_precise=2) + tm.assert_almost_equal(actual_std, np.nan, check_less_precise=2) def test_nanvar_axis(self): # Generate some sample data. @@ -805,8 +960,9 @@ def test_nanvar_axis(self): samples = np.vstack([samples_norm, samples_unif]) actual_variance = nanops.nanvar(samples, axis=1) - tm.assert_almost_equal(actual_variance, np.array( - [self.variance, 1.0 / 12]), check_less_precise=2) + tm.assert_almost_equal( + actual_variance, np.array([self.variance, 1.0 / 12]), check_less_precise=2 + ) def test_nanvar_ddof(self): n = 5 @@ -819,32 +975,43 @@ def test_nanvar_ddof(self): # The unbiased estimate. 
var = 1.0 / 12 - tm.assert_almost_equal(variance_1, var, - check_less_precise=2) + tm.assert_almost_equal(variance_1, var, check_less_precise=2) # The underestimated variance. - tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, - check_less_precise=2) + tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, check_less_precise=2) # The overestimated variance. - tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, - check_less_precise=2) + tm.assert_almost_equal( + variance_2, (n - 1.0) / (n - 2.0) * var, check_less_precise=2 + ) def test_ground_truth(self): # Test against values that were precomputed with Numpy. samples = np.empty((4, 4)) - samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287 - ], [0.72980153, 0.03109364, 0.99155171], - [0.09317602, 0.60078248, 0.15871292]]) + samples[:3, :3] = np.array( + [ + [0.97303362, 0.21869576, 0.55560287], + [0.72980153, 0.03109364, 0.99155171], + [0.09317602, 0.60078248, 0.15871292], + ] + ) samples[3] = samples[:, 3] = np.nan # Actual variances along axis=0, 1 for ddof=0, 1, 2 - variance = np.array([[[0.13762259, 0.05619224, 0.11568816 - ], [0.20643388, 0.08428837, 0.17353224], - [0.41286776, 0.16857673, 0.34706449]], - [[0.09519783, 0.16435395, 0.05082054 - ], [0.14279674, 0.24653093, 0.07623082], - [0.28559348, 0.49306186, 0.15246163]]]) + variance = np.array( + [ + [ + [0.13762259, 0.05619224, 0.11568816], + [0.20643388, 0.08428837, 0.17353224], + [0.41286776, 0.16857673, 0.34706449], + ], + [ + [0.09519783, 0.16435395, 0.05082054], + [0.14279674, 0.24653093, 0.07623082], + [0.28559348, 0.49306186, 0.15246163], + ], + ] + ) # Test nanvar. for axis in range(2): @@ -903,8 +1070,7 @@ def test_ground_truth(self): tm.assert_almost_equal(skew, self.actual_skew) def test_axis(self): - samples = np.vstack([self.samples, - np.nan * np.ones(len(self.samples))]) + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) skew = nanops.nanskew(samples, axis=1) tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan])) @@ -953,8 +1119,7 @@ def test_ground_truth(self): tm.assert_almost_equal(kurt, self.actual_kurt) def test_axis(self): - samples = np.vstack([self.samples, - np.nan * np.ones(len(self.samples))]) + samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) kurt = nanops.nankurt(samples, axis=1) tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan])) @@ -974,12 +1139,12 @@ def prng(self): class TestDatetime64NaNOps: - @pytest.mark.parametrize('tz', [None, 'UTC']) + @pytest.mark.parametrize("tz", [None, "UTC"]) @pytest.mark.xfail(reason="disabled") # Enabling mean changes the behavior of DataFrame.mean # See https://github.com/pandas-dev/pandas/issues/24752 def test_nanmean(self, tz): - dti = pd.date_range('2016-01-01', periods=3, tz=tz) + dti = pd.date_range("2016-01-01", periods=3, tz=tz) expected = dti[1] for obj in [dti, DatetimeArray(dti), Series(dti)]: @@ -997,50 +1162,56 @@ def test_use_bottleneck(): if nanops._BOTTLENECK_INSTALLED: - pd.set_option('use_bottleneck', True) - assert pd.get_option('use_bottleneck') - - pd.set_option('use_bottleneck', False) - assert not pd.get_option('use_bottleneck') - - pd.set_option('use_bottleneck', use_bn) - - -@pytest.mark.parametrize("numpy_op, expected", [ - (np.sum, 10), - (np.nansum, 10), - (np.mean, 2.5), - (np.nanmean, 2.5), - (np.median, 2.5), - (np.nanmedian, 2.5), - (np.min, 1), - (np.max, 4), - (np.nanmin, 1), - (np.nanmax, 4) -]) + pd.set_option("use_bottleneck", True) + assert pd.get_option("use_bottleneck") + + 
pd.set_option("use_bottleneck", False) + assert not pd.get_option("use_bottleneck") + + pd.set_option("use_bottleneck", use_bn) + + +@pytest.mark.parametrize( + "numpy_op, expected", + [ + (np.sum, 10), + (np.nansum, 10), + (np.mean, 2.5), + (np.nanmean, 2.5), + (np.median, 2.5), + (np.nanmedian, 2.5), + (np.min, 1), + (np.max, 4), + (np.nanmin, 1), + (np.nanmax, 4), + ], +) def test_numpy_ops(numpy_op, expected): # GH8383 result = numpy_op(pd.Series([1, 2, 3, 4])) assert result == expected -@pytest.mark.parametrize("operation", [ - nanops.nanany, - nanops.nanall, - nanops.nansum, - nanops.nanmean, - nanops.nanmedian, - nanops.nanstd, - nanops.nanvar, - nanops.nansem, - nanops.nanargmax, - nanops.nanargmin, - nanops.nanmax, - nanops.nanmin, - nanops.nanskew, - nanops.nankurt, - nanops.nanprod, -]) +@pytest.mark.parametrize( + "operation", + [ + nanops.nanany, + nanops.nanall, + nanops.nansum, + nanops.nanmean, + nanops.nanmedian, + nanops.nanstd, + nanops.nanvar, + nanops.nansem, + nanops.nanargmax, + nanops.nanargmin, + nanops.nanmax, + nanops.nanmin, + nanops.nanskew, + nanops.nankurt, + nanops.nanprod, + ], +) def test_nanops_independent_of_mask_param(operation): # GH22764 s = pd.Series([1, 2, np.nan, 3, np.nan, 4]) diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index 3916bedb8e44b..cd154ed5fe570 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -18,16 +18,16 @@ def test_import_optional(): def test_xlrd_version_fallback(): - pytest.importorskip('xlrd') + pytest.importorskip("xlrd") import_optional_dependency("xlrd") def test_bad_version(): - name = 'fakemodule' + name = "fakemodule" module = types.ModuleType(name) module.__version__ = "0.9.0" sys.modules[name] = module - VERSIONS[name] = '1.0.0' + VERSIONS[name] = "1.0.0" match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'" with pytest.raises(ImportError, match=match): @@ -43,10 +43,10 @@ def test_bad_version(): def test_no_version_raises(): - name = 'fakemodule' + name = "fakemodule" module = types.ModuleType(name) sys.modules[name] = module - VERSIONS[name] = '1.0.0' + VERSIONS[name] = "1.0.0" with pytest.raises(ImportError, match="Can't determine .* fakemodule"): import_optional_dependency(name) diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index e79ec56c819c1..97086f8ab1e85 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -21,10 +21,9 @@ def ensure_removed(obj, attr): class MyAccessor: - def __init__(self, obj): self.obj = obj - self.item = 'item' + self.item = "item" @property def prop(self): @@ -34,30 +33,33 @@ def method(self): return self.item -@pytest.mark.parametrize('obj, registrar', [ - (pd.Series, pd.api.extensions.register_series_accessor), - (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), - (pd.Index, pd.api.extensions.register_index_accessor) -]) +@pytest.mark.parametrize( + "obj, registrar", + [ + (pd.Series, pd.api.extensions.register_series_accessor), + (pd.DataFrame, pd.api.extensions.register_dataframe_accessor), + (pd.Index, pd.api.extensions.register_index_accessor), + ], +) def test_register(obj, registrar): - with ensure_removed(obj, 'mine'): + with ensure_removed(obj, "mine"): before = set(dir(obj)) - registrar('mine')(MyAccessor) - assert obj([]).mine.prop == 'item' + registrar("mine")(MyAccessor) + assert obj([]).mine.prop == "item" after = set(dir(obj)) - assert (before ^ after) == 
{'mine'} - assert 'mine' in obj._accessors + assert (before ^ after) == {"mine"} + assert "mine" in obj._accessors def test_accessor_works(): - with ensure_removed(pd.Series, 'mine'): - pd.api.extensions.register_series_accessor('mine')(MyAccessor) + with ensure_removed(pd.Series, "mine"): + pd.api.extensions.register_series_accessor("mine")(MyAccessor) s = pd.Series([1, 2]) assert s.mine.obj is s - assert s.mine.prop == 'item' - assert s.mine.method() == 'item' + assert s.mine.prop == "item" + assert s.mine.method() == "item" def test_overwrite_warns(): @@ -65,20 +67,20 @@ def test_overwrite_warns(): mean = pd.Series.mean try: with tm.assert_produces_warning(UserWarning) as w: - pd.api.extensions.register_series_accessor('mean')(MyAccessor) + pd.api.extensions.register_series_accessor("mean")(MyAccessor) s = pd.Series([1, 2]) - assert s.mean.prop == 'item' + assert s.mean.prop == "item" msg = str(w[0].message) - assert 'mean' in msg - assert 'MyAccessor' in msg - assert 'Series' in msg + assert "mean" in msg + assert "MyAccessor" in msg + assert "Series" in msg finally: pd.Series.mean = mean def test_raises_attribute_error(): - with ensure_removed(pd.Series, 'bad'): + with ensure_removed(pd.Series, "bad"): @pd.api.extensions.register_series_accessor("bad") class Bad: diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index f198fb6ae57b1..f64ad8edafbd7 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -9,34 +9,42 @@ from pandas import DataFrame, MultiIndex, Series, array, concat, merge from pandas.core import common as com from pandas.core.sorting import ( - decons_group_index, get_group_index, is_int64_overflow_possible, - lexsort_indexer, nargsort, safe_sort) + decons_group_index, + get_group_index, + is_int64_overflow_possible, + lexsort_indexer, + nargsort, + safe_sort, +) from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal class TestSorting: - @pytest.mark.slow def test_int64_overflow(self): B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500))) A = np.arange(2500) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': A, - 'F': B, - 'G': A, - 'H': B, - 'values': np.random.randn(2500)}) - - lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) - rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) - - left = lg.sum()['values'] - right = rg.sum()['values'] + df = DataFrame( + { + "A": A, + "B": B, + "C": A, + "D": B, + "E": A, + "F": B, + "G": A, + "H": B, + "values": np.random.randn(2500), + } + ) + + lg = df.groupby(["A", "B", "C", "D", "E", "F", "G", "H"]) + rg = df.groupby(["H", "G", "F", "E", "D", "C", "B", "A"]) + + left = lg.sum()["values"] + right = rg.sum()["values"] exp_index, _ = left.index.sortlevel() tm.assert_index_equal(left.index, exp_index) @@ -44,11 +52,10 @@ def test_int64_overflow(self): exp_index, _ = right.index.sortlevel(0) tm.assert_index_equal(right.index, exp_index) - tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' - ]].values)) + tups = list(map(tuple, df[["A", "B", "C", "D", "E", "F", "G", "H"]].values)) tups = com.asarray_tuplesafe(tups) - expected = df.groupby(tups).sum()['values'] + expected = df.groupby(tups).sum()["values"] for k, v in expected.items(): assert left[k] == right[k[::-1]] @@ -59,9 +66,8 @@ def test_int64_overflow_moar(self): # GH9096 values = range(55109) - data = DataFrame.from_dict( - {'a': values, 'b': values, 'c': values, 'd': values}) - grouped = data.groupby(['a', 'b', 'c', 
'd']) + data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values}) + grouped = data.groupby(["a", "b", "c", "d"]) assert len(grouped) == len(values) arr = np.random.randint(-1 << 12, 1 << 12, (1 << 15, 5)) @@ -71,26 +77,26 @@ def test_int64_overflow_moar(self): i = np.random.permutation(len(arr)) arr = arr[i] # shuffle rows - df = DataFrame(arr, columns=list('abcde')) - df['jim'], df['joe'] = np.random.randn(2, len(df)) * 10 - gr = df.groupby(list('abcde')) + df = DataFrame(arr, columns=list("abcde")) + df["jim"], df["joe"] = np.random.randn(2, len(df)) * 10 + gr = df.groupby(list("abcde")) # verify this is testing what it is supposed to test! assert is_int64_overflow_possible(gr.grouper.shape) # manually compute groupings jim, joe = defaultdict(list), defaultdict(list) - for key, a, b in zip(map(tuple, arr), df['jim'], df['joe']): + for key, a, b in zip(map(tuple, arr), df["jim"], df["joe"]): jim[key].append(a) joe[key].append(b) assert len(gr) == len(jim) - mi = MultiIndex.from_tuples(jim.keys(), names=list('abcde')) + mi = MultiIndex.from_tuples(jim.keys(), names=list("abcde")) def aggr(func): - f = lambda a: np.fromiter(map(func, a), dtype='f8') + f = lambda a: np.fromiter(map(func, a), dtype="f8") arr = np.vstack((f(jim.values()), f(joe.values()))).T - res = DataFrame(arr, columns=['jim', 'joe'], index=mi) + res = DataFrame(arr, columns=["jim", "joe"], index=mi) return res.sort_index() assert_frame_equal(gr.mean(), aggr(np.mean)) @@ -99,22 +105,22 @@ def aggr(func): def test_lexsort_indexer(self): keys = [[nan] * 5 + list(range(100)) + [nan] * 5] # orders=True, na_position='last' - result = lexsort_indexer(keys, orders=True, na_position='last') + result = lexsort_indexer(keys, orders=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=True, na_position='first' - result = lexsort_indexer(keys, orders=True, na_position='first') + result = lexsort_indexer(keys, orders=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='last' - result = lexsort_indexer(keys, orders=False, na_position='last') + result = lexsort_indexer(keys, orders=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='first' - result = lexsort_indexer(keys, orders=False, na_position='first') + result = lexsort_indexer(keys, orders=False, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) @@ -122,7 +128,7 @@ def test_nargsort(self): # np.argsort(items) places NaNs last items = [nan] * 5 + list(range(100)) + [nan] * 5 # np.argsort(items2) may not place NaNs first - items2 = np.array(items, dtype='O') + items2 = np.array(items, dtype="O") # mergesort is the most difficult to get right because we want it to be # stable. 
@@ -133,104 +139,96 @@ def test_nargsort(self): # arrays.""" # mergesort, ascending=True, na_position='last' - result = nargsort(items, kind='mergesort', ascending=True, - na_position='last') + result = nargsort(items, kind="mergesort", ascending=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' - result = nargsort(items, kind='mergesort', ascending=True, - na_position='first') + result = nargsort(items, kind="mergesort", ascending=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' - result = nargsort(items, kind='mergesort', ascending=False, - na_position='last') + result = nargsort(items, kind="mergesort", ascending=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' - result = nargsort(items, kind='mergesort', ascending=False, - na_position='first') + result = nargsort(items, kind="mergesort", ascending=False, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='last' - result = nargsort(items2, kind='mergesort', ascending=True, - na_position='last') + result = nargsort(items2, kind="mergesort", ascending=True, na_position="last") exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' - result = nargsort(items2, kind='mergesort', ascending=True, - na_position='first') + result = nargsort(items2, kind="mergesort", ascending=True, na_position="first") exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' - result = nargsort(items2, kind='mergesort', ascending=False, - na_position='last') + result = nargsort(items2, kind="mergesort", ascending=False, na_position="last") exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' - result = nargsort(items2, kind='mergesort', ascending=False, - na_position='first') + result = nargsort( + items2, kind="mergesort", ascending=False, na_position="first" + ) exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) class TestMerge: - @pytest.mark.slow def test_int64_overflow_issues(self): # #2690, combinatorial explosion - df1 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G1']) - df2 = DataFrame(np.random.randn(1000, 7), - columns=list('ABCDEF') + ['G2']) + df1 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G1"]) + df2 = DataFrame(np.random.randn(1000, 7), columns=list("ABCDEF") + ["G2"]) # it works! 
- result = merge(df1, df2, how='outer') + result = merge(df1, df2, how="outer") assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 - left = DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) - left['left'] = left.sum(axis=1) + left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")) + left["left"] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() - right.columns = right.columns[:-1].tolist() + ['right'] + right.columns = right.columns[:-1].tolist() + ["right"] right.index = np.arange(len(right)) - right['right'] *= -1 + right["right"] *= -1 - out = merge(left, right, how='outer') + out = merge(left, right, how="outer") assert len(out) == len(left) - assert_series_equal(out['left'], - out['right'], check_names=False) + assert_series_equal(out["left"], -out["right"], check_names=False) result = out.iloc[:, :-2].sum(axis=1) - assert_series_equal(out['left'], result, check_names=False) + assert_series_equal(out["left"], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) - for how in ['left', 'right', 'outer', 'inner']: + for how in ["left", "right", "outer", "inner"]: assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order - out = merge(left, right, how='left', sort=False) + out = merge(left, right, how="left", sort=False) assert_frame_equal(left, out[left.columns.tolist()]) - out = merge(right, left, how='left', sort=False) + out = merge(right, left, how="left", sort=False) assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 - left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), - columns=list('ABCDEFG')) + left = DataFrame( + np.random.randint(low, high, (n, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values @@ -239,16 +237,17 @@ def test_int64_overflow_issues(self): # add duplicates to left frame left = concat([left, left], ignore_index=True) - right = DataFrame(np.random.randint(low, high, (n // 2, 7)) - .astype('int64'), - columns=list('ABCDEFG')) + right = DataFrame( + np.random.randint(low, high, (n // 2, 7)).astype("int64"), + columns=list("ABCDEFG"), + ) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) - left['left'] = np.random.randn(len(left)) - right['right'] = np.random.randn(len(right)) + left["left"] = np.random.randn(len(left)) + right["right"] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) @@ -262,11 +261,11 @@ def test_int64_overflow_issues(self): # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) - for idx, row in left.set_index(list('ABCDEFG')).iterrows(): - ldict[idx].append(row['left']) + for idx, row in left.set_index(list("ABCDEFG")).iterrows(): + ldict[idx].append(row["left"]) - for idx, row in right.set_index(list('ABCDEFG')).iterrows(): - rdict[idx].append(row['right']) + for idx, row in right.set_index(list("ABCDEFG")).iterrows(): + rdict[idx].append(row["right"]) vals = [] for k, lval in ldict.items(): @@ -285,22 +284,25 @@ def align(df): return df def verify_order(df): - kcols = list('ABCDEFG') - assert_frame_equal(df[kcols].copy(), - 
df[kcols].sort_values(kcols, kind='mergesort')) + kcols = list("ABCDEFG") + assert_frame_equal( + df[kcols].copy(), df[kcols].sort_values(kcols, kind="mergesort") + ) - out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) + out = DataFrame(vals, columns=list("ABCDEFG") + ["left", "right"]) out = align(out) - jmask = {'left': out['left'].notna(), - 'right': out['right'].notna(), - 'inner': out['left'].notna() & out['right'].notna(), - 'outer': np.ones(len(out), dtype='bool')} + jmask = { + "left": out["left"].notna(), + "right": out["right"].notna(), + "inner": out["left"].notna() & out["right"].notna(), + "outer": np.ones(len(out), dtype="bool"), + } - for how in 'left', 'right', 'outer', 'inner': + for how in "left", "right", "outer", "inner": mask = jmask[how] frame = align(out[mask].copy()) - assert mask.all() ^ mask.any() or how == 'outer' + assert mask.all() ^ mask.any() or how == "outer" for sort in [False, True]: res = merge(left, right, how=how, sort=sort) @@ -308,12 +310,12 @@ def verify_order(df): verify_order(res) # as in GH9092 dtypes break with outer/right join - assert_frame_equal(frame, align(res), - check_dtype=how not in ('right', 'outer')) + assert_frame_equal( + frame, align(res), check_dtype=how not in ("right", "outer") + ) def test_decons(): - def testit(label_list, shape): group_index = get_group_index(label_list, shape, sort=True, xnull=True) label_list2 = decons_group_index(group_index, shape) @@ -322,19 +324,22 @@ def testit(label_list, shape): tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), - np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)] + label_list = [ + np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64), + ] testit(label_list, shape) shape = (10000, 10000) - label_list = [np.tile(np.arange(10000, dtype=np.int64), 5), - np.tile(np.arange(10000, dtype=np.int64), 5)] + label_list = [ + np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5), + ] testit(label_list, shape) class TestSafeSort: - def test_basic_sort(self): values = [3, 1, 2, 0, 4] result = safe_sort(values) @@ -343,7 +348,7 @@ def test_basic_sort(self): values = list("baaacb") result = safe_sort(values) - expected = np.array(list("aaabbc"), dtype='object') + expected = np.array(list("aaabbc"), dtype="object") tm.assert_numpy_array_equal(result, expected) values = [] @@ -351,7 +356,7 @@ def test_basic_sort(self): expected = np.array([]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('verify', [True, False]) + @pytest.mark.parametrize("verify", [True, False]) def test_labels(self, verify): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) @@ -364,8 +369,7 @@ def test_labels(self, verify): # na_sentinel labels = [0, 1, 1, 2, 3, 0, 99, 4] - result, result_labels = safe_sort(values, labels, na_sentinel=99, - verify=verify) + result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify) expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) @@ -376,78 +380,77 @@ def test_labels(self, verify): tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) - 
@pytest.mark.parametrize('na_sentinel', [-1, 99]) + @pytest.mark.parametrize("na_sentinel", [-1, 99]) def test_labels_out_of_bound(self, na_sentinel): values = [3, 1, 2, 0, 4] expected = np.array([0, 1, 2, 3, 4]) # out of bound indices labels = [0, 101, 102, 2, 3, 0, 99, 4] - result, result_labels = safe_sort( - values, labels, na_sentinel=na_sentinel) + result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel) expected_labels = np.array( - [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], - dtype=np.intp) + [3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp + ) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) def test_mixed_integer(self): - values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object) result = safe_sort(values) - expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) - values = np.array(['b', 1, 0, 'a'], dtype=object) + values = np.array(["b", 1, 0, "a"], dtype=object) labels = [0, 1, 2, 3, 0, -1, 1] result, result_labels = safe_sort(values, labels) - expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected = np.array([0, 1, "a", "b"], dtype=object) expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) tm.assert_numpy_array_equal(result_labels, expected_labels) def test_mixed_integer_from_list(self): - values = ['b', 1, 0, 'a', 0, 'b'] + values = ["b", 1, 0, "a", 0, "b"] result = safe_sort(values) - expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") + msg = ( + "unorderable types: .* [<>] .*" + "|" # the above case happens for numpy < 1.14 + "'[<>]' not supported between instances of .*" + ) with pytest.raises(TypeError, match=msg): safe_sort(arr) def test_exceptions(self): - with pytest.raises(TypeError, - match="Only list-like objects are allowed"): + with pytest.raises(TypeError, match="Only list-like objects are allowed"): safe_sort(values=1) - with pytest.raises(TypeError, - match="Only list-like objects or None"): + with pytest.raises(TypeError, match="Only list-like objects or None"): safe_sort(values=[0, 1, 2], labels=1) - with pytest.raises(ValueError, - match="values should be unique"): + with pytest.raises(ValueError, match="values should be unique"): safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) def test_extension_array(self): # a = array([1, 3, np.nan, 2], dtype='Int64') - a = array([1, 3, 2], dtype='Int64') + a = array([1, 3, 2], dtype="Int64") result = safe_sort(a) # expected = array([1, 2, 3, np.nan], dtype='Int64') - expected = array([1, 2, 3], dtype='Int64') + expected = array([1, 2, 3], dtype="Int64") tm.assert_extension_array_equal(result, expected) - @pytest.mark.parametrize('verify', [True, False]) - @pytest.mark.parametrize('na_sentinel', [-1, 99]) + @pytest.mark.parametrize("verify", [True, False]) + @pytest.mark.parametrize("na_sentinel", [-1, 99]) def test_extension_array_labels(self, verify, na_sentinel): - a = array([1, 3, 2], dtype='Int64') - result, labels = safe_sort(a, 
[0, 1, na_sentinel, 2], - na_sentinel=na_sentinel, verify=verify) - expected_values = array([1, 2, 3], dtype='Int64') + a = array([1, 3, 2], dtype="Int64") + result, labels = safe_sort( + a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify + ) + expected_values = array([1, 2, 3], dtype="Int64") expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp) tm.assert_extension_array_equal(result, expected_values) tm.assert_numpy_array_equal(labels, expected_labels) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 955554f60aa1f..d70614fcd2700 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -20,64 +20,89 @@ def assert_series_or_index_equal(left, right): _any_string_method = [ - ('cat', (), {'sep': ','}), # noqa: E241 - ('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241 - 'join': 'left'}), - ('center', (10,), {}), # noqa: E241 - ('contains', ('a',), {}), # noqa: E241 - ('count', ('a',), {}), # noqa: E241 - ('decode', ('UTF-8',), {}), # noqa: E241 - ('encode', ('UTF-8',), {}), # noqa: E241 - ('endswith', ('a',), {}), # noqa: E241 - ('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241 - ('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241 - ('extractall', ('([a-z]*)',), {}), # noqa: E241 - ('find', ('a',), {}), # noqa: E241 - ('findall', ('a',), {}), # noqa: E241 - ('get', (0,), {}), # noqa: E241 + ("cat", (), {"sep": ","}), # noqa: E241 + ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), # noqa: E241 + ("center", (10,), {}), # noqa: E241 + ("contains", ("a",), {}), # noqa: E241 + ("count", ("a",), {}), # noqa: E241 + ("decode", ("UTF-8",), {}), # noqa: E241 + ("encode", ("UTF-8",), {}), # noqa: E241 + ("endswith", ("a",), {}), # noqa: E241 + ("extract", ("([a-z]*)",), {"expand": False}), # noqa: E241 + ("extract", ("([a-z]*)",), {"expand": True}), # noqa: E241 + ("extractall", ("([a-z]*)",), {}), # noqa: E241 + ("find", ("a",), {}), # noqa: E241 + ("findall", ("a",), {}), # noqa: E241 + ("get", (0,), {}), # noqa: E241 # because "index" (and "rindex") fail intentionally # if the string is not found, search only for empty string - ('index', ('',), {}), # noqa: E241 - ('join', (',',), {}), # noqa: E241 - ('ljust', (10,), {}), # noqa: E241 - ('match', ('a',), {}), # noqa: E241 - ('normalize', ('NFC',), {}), # noqa: E241 - ('pad', (10,), {}), # noqa: E241 - ('partition', (' ',), {'expand': False}), # noqa: E241 - ('partition', (' ',), {'expand': True}), # noqa: E241 - ('repeat', (3,), {}), # noqa: E241 - ('replace', ('a', 'z',), {}), # noqa: E241 - ('rfind', ('a',), {}), # noqa: E241 - ('rindex', ('',), {}), # noqa: E241 - ('rjust', (10,), {}), # noqa: E241 - ('rpartition', (' ',), {'expand': False}), # noqa: E241 - ('rpartition', (' ',), {'expand': True}), # noqa: E241 - ('slice', (0, 1,), {}), # noqa: E241 - ('slice_replace', (0, 1, 'z',), {}), # noqa: E241 - ('split', (' ',), {'expand': False}), # noqa: E241 - ('split', (' ',), {'expand': True}), # noqa: E241 - ('startswith', ('a',), {}), # noqa: E241 + ("index", ("",), {}), # noqa: E241 + ("join", (",",), {}), # noqa: E241 + ("ljust", (10,), {}), # noqa: E241 + ("match", ("a",), {}), # noqa: E241 + ("normalize", ("NFC",), {}), # noqa: E241 + ("pad", (10,), {}), # noqa: E241 + ("partition", (" ",), {"expand": False}), # noqa: E241 + ("partition", (" ",), {"expand": True}), # noqa: E241 + ("repeat", (3,), {}), # noqa: E241 + ("replace", ("a", "z"), {}), # noqa: E241 + ("rfind", ("a",), {}), # noqa: E241 + ("rindex", ("",), {}), # noqa: 
E241 + ("rjust", (10,), {}), # noqa: E241 + ("rpartition", (" ",), {"expand": False}), # noqa: E241 + ("rpartition", (" ",), {"expand": True}), # noqa: E241 + ("slice", (0, 1), {}), # noqa: E241 + ("slice_replace", (0, 1, "z"), {}), # noqa: E241 + ("split", (" ",), {"expand": False}), # noqa: E241 + ("split", (" ",), {"expand": True}), # noqa: E241 + ("startswith", ("a",), {}), # noqa: E241 # translating unicode points of "a" to "d" - ('translate', ({97: 100},), {}), # noqa: E241 - ('wrap', (2,), {}), # noqa: E241 - ('zfill', (10,), {}) # noqa: E241 -] + list(zip([ - # methods without positional arguments: zip with empty tuple and empty dict - 'capitalize', 'cat', 'get_dummies', - 'isalnum', 'isalpha', 'isdecimal', - 'isdigit', 'islower', 'isnumeric', - 'isspace', 'istitle', 'isupper', - 'len', 'lower', 'lstrip', 'partition', - 'rpartition', 'rsplit', 'rstrip', - 'slice', 'slice_replace', 'split', - 'strip', 'swapcase', 'title', 'upper', 'casefold' -], [()] * 100, [{}] * 100)) + ("translate", ({97: 100},), {}), # noqa: E241 + ("wrap", (2,), {}), # noqa: E241 + ("zfill", (10,), {}), # noqa: E241 +] + list( + zip( + [ + # methods without positional arguments: zip with empty tuple and empty dict + "capitalize", + "cat", + "get_dummies", + "isalnum", + "isalpha", + "isdecimal", + "isdigit", + "islower", + "isnumeric", + "isspace", + "istitle", + "isupper", + "len", + "lower", + "lstrip", + "partition", + "rpartition", + "rsplit", + "rstrip", + "slice", + "slice_replace", + "split", + "strip", + "swapcase", + "title", + "upper", + "casefold", + ], + [()] * 100, + [{}] * 100, + ) +) ids, _, _ = zip(*_any_string_method) # use method name as fixture-id # test that the above list captures all methods of StringMethods -missing_methods = {f for f in dir(strings.StringMethods) - if not f.startswith('_')} - set(ids) +missing_methods = { + f for f in dir(strings.StringMethods) if not f.startswith("_") +} - set(ids) assert not missing_methods @@ -113,11 +138,11 @@ def any_string_method(request): # subset of the full set from pandas/conftest.py _any_allowed_skipna_inferred_dtype = [ - ('string', ['a', np.nan, 'c']), - ('bytes', [b'a', np.nan, b'c']), - ('empty', [np.nan, np.nan, np.nan]), - ('empty', []), - ('mixed-integer', ['a', np.nan, 2]) + ("string", ["a", np.nan, "c"]), + ("bytes", [b"a", np.nan, b"c"]), + ("empty", [np.nan, np.nan, np.nan]), + ("empty", []), + ("mixed-integer", ["a", np.nan, 2]), ] ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id @@ -162,23 +187,24 @@ def any_allowed_skipna_inferred_dtype(request): class TestStringMethods: - def test_api(self): # GH 6106, GH 9322 assert Series.str is strings.StringMethods - assert isinstance(Series(['']).str, strings.StringMethods) + assert isinstance(Series([""]).str, strings.StringMethods) def test_api_mi_raises(self): # GH 23679 - mi = MultiIndex.from_arrays([['a', 'b', 'c']]) - with pytest.raises(AttributeError, match='Can only use .str accessor ' - 'with Index, not MultiIndex'): + mi = MultiIndex.from_arrays([["a", "b", "c"]]) + with pytest.raises( + AttributeError, + match="Can only use .str accessor " "with Index, not MultiIndex", + ): mi.str - assert not hasattr(mi, 'str') + assert not hasattr(mi, "str") - @pytest.mark.parametrize('dtype', [object, 'category']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("dtype", [object, "category"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): # one instance of 
parametrized fixture inferred_dtype, values = any_skipna_inferred_dtype @@ -186,28 +212,38 @@ def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype): t = box(values, dtype=dtype) # explicit dtype to avoid casting # TODO: get rid of these xfails - if dtype == 'category' and inferred_dtype in ['period', 'interval']: - pytest.xfail(reason='Conversion to numpy array fails because ' - 'the ._values-attribute is not a numpy array for ' - 'PeriodArray/IntervalArray; see GH 23553') - - types_passing_constructor = ['string', 'unicode', 'empty', - 'bytes', 'mixed', 'mixed-integer'] + if dtype == "category" and inferred_dtype in ["period", "interval"]: + pytest.xfail( + reason="Conversion to numpy array fails because " + "the ._values-attribute is not a numpy array for " + "PeriodArray/IntervalArray; see GH 23553" + ) + + types_passing_constructor = [ + "string", + "unicode", + "empty", + "bytes", + "mixed", + "mixed-integer", + ] if inferred_dtype in types_passing_constructor: # GH 6106 assert isinstance(t.str, strings.StringMethods) else: # GH 9184, GH 23011, GH 23163 - with pytest.raises(AttributeError, match='Can only use .str ' - 'accessor with string values.*'): + with pytest.raises( + AttributeError, + match="Can only use .str " "accessor with string values.*", + ): t.str - assert not hasattr(t, 'str') + assert not hasattr(t, "str") - @pytest.mark.parametrize('dtype', [object, 'category']) - @pytest.mark.parametrize('box', [Series, Index]) - def test_api_per_method(self, box, dtype, - any_allowed_skipna_inferred_dtype, - any_string_method): + @pytest.mark.parametrize("dtype", [object, "category"]) + @pytest.mark.parametrize("box", [Series, Index]) + def test_api_per_method( + self, box, dtype, any_allowed_skipna_inferred_dtype, any_string_method + ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, # and raise on all others @@ -217,47 +253,62 @@ def test_api_per_method(self, box, dtype, method_name, args, kwargs = any_string_method # TODO: get rid of these xfails - if (method_name in ['partition', 'rpartition'] and box == Index - and inferred_dtype == 'empty'): - pytest.xfail(reason='Method cannot deal with empty Index') - if (method_name == 'split' and box == Index and values.size == 0 - and kwargs.get('expand', None) is not None): - pytest.xfail(reason='Split fails on empty Series when expand=True') - if (method_name == 'get_dummies' and box == Index - and inferred_dtype == 'empty' and (dtype == object - or values.size == 0)): - pytest.xfail(reason='Need to fortify get_dummies corner cases') + if ( + method_name in ["partition", "rpartition"] + and box == Index + and inferred_dtype == "empty" + ): + pytest.xfail(reason="Method cannot deal with empty Index") + if ( + method_name == "split" + and box == Index + and values.size == 0 + and kwargs.get("expand", None) is not None + ): + pytest.xfail(reason="Split fails on empty Series when expand=True") + if ( + method_name == "get_dummies" + and box == Index + and inferred_dtype == "empty" + and (dtype == object or values.size == 0) + ): + pytest.xfail(reason="Need to fortify get_dummies corner cases") t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) - bytes_allowed = method_name in ['decode', 'get', 'len', 'slice'] + bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries 
that error. # This could be changed with an 'errors'-kwarg to the `str`-accessor, # see discussion in GH 13877 - mixed_allowed = method_name not in ['cat'] + mixed_allowed = method_name not in ["cat"] - allowed_types = (['string', 'unicode', 'empty'] - + ['bytes'] * bytes_allowed - + ['mixed', 'mixed-integer'] * mixed_allowed) + allowed_types = ( + ["string", "unicode", "empty"] + + ["bytes"] * bytes_allowed + + ["mixed", "mixed-integer"] * mixed_allowed + ) if inferred_dtype in allowed_types: # xref GH 23555, GH 23556 method(*args, **kwargs) # works! else: # GH 23011, GH 23163 - msg = ('Cannot use .str.{name} with values of inferred dtype ' - '{inferred_dtype!r}.'.format(name=method_name, - inferred_dtype=inferred_dtype)) + msg = ( + "Cannot use .str.{name} with values of inferred dtype " + "{inferred_dtype!r}.".format( + name=method_name, inferred_dtype=inferred_dtype + ) + ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) def test_api_for_categorical(self, any_string_method): # https://github.com/pandas-dev/pandas/issues/10661 - s = Series(list('aabb')) + s = Series(list("aabb")) s = s + " " + s - c = s.astype('category') + c = s.astype("category") assert isinstance(c.str, strings.StringMethods) method_name, args, kwargs = any_string_method @@ -275,7 +326,7 @@ def test_api_for_categorical(self, any_string_method): def test_iter(self): # GH3638 - strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' + strs = "google", "wikimedia", "wikipedia", "wikitravel" ds = Series(strs) for s in ds.str: @@ -293,7 +344,7 @@ def test_iter(self): # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in # this case since 'wikitravel' is the longest string - assert s.dropna().values.item() == 'l' + assert s.dropna().values.item() == "l" def test_iter_empty(self): ds = Series([], dtype=object) @@ -309,7 +360,7 @@ def test_iter_empty(self): assert s == 1 def test_iter_single_element(self): - ds = Series(['a']) + ds = Series(["a"]) for i, s in enumerate(ds.str): pass @@ -318,62 +369,61 @@ def test_iter_single_element(self): assert_series_equal(ds, s) def test_iter_object_try_string(self): - ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range( - 4)]) + ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(4)]) - i, s = 100, 'h' + i, s = 100, "h" for i, s in enumerate(ds.str): pass assert i == 100 - assert s == 'h' + assert s == "h" - @pytest.mark.parametrize('box', [Series, Index]) - @pytest.mark.parametrize('other', [None, Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize("other", [None, Series, Index]) def test_str_cat_name(self, box, other): # GH 21053 - values = ['a', 'b'] + values = ["a", "b"] if other: other = other(values) else: other = values - result = box(values, name='name').str.cat(other, sep=',', join='left') - assert result.name == 'name' + result = box(values, name="name").str.cat(other, sep=",", join="left") + assert result.name == "name" - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat(self, box): # test_cat above tests "str_cat" from ndarray; # here testing "str.cat" from Series/Indext to ndarray/list - s = box(['a', 'a', 'b', 'b', 'c', np.nan]) + s = box(["a", "a", "b", "b", "c", np.nan]) # single array result = s.str.cat() - expected = 'aabbc' + expected = "aabbc" assert result == expected - result = s.str.cat(na_rep='-') - expected = 'aabbc-' 
+ result = s.str.cat(na_rep="-") + expected = "aabbc-" assert result == expected - result = s.str.cat(sep='_', na_rep='NA') - expected = 'a_a_b_b_c_NA' + result = s.str.cat(sep="_", na_rep="NA") + expected = "a_a_b_b_c_NA" assert result == expected - t = np.array(['a', np.nan, 'b', 'd', 'foo', np.nan], dtype=object) - expected = box(['aa', 'a-', 'bb', 'bd', 'cfoo', '--']) + t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object) + expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"]) # Series/Index with array - result = s.str.cat(t, na_rep='-') + result = s.str.cat(t, na_rep="-") assert_series_or_index_equal(result, expected) # Series/Index with list - result = s.str.cat(list(t), na_rep='-') + result = s.str.cat(list(t), na_rep="-") assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = 'All arrays must be same length, except those having an index.*' - z = Series(['1', '2', '3']) + rgx = "All arrays must be same length, except those having an index.*" + z = Series(["1", "2", "3"]) with pytest.raises(ValueError, match=rgx): s.str.cat(z) @@ -384,26 +434,26 @@ def test_str_cat(self, box): with pytest.raises(ValueError, match=rgx): s.str.cat(list(z)) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_raises_intuitive_error(self, box): # GH 11334 - s = box(['a', 'b', 'c', 'd']) + s = box(["a", "b", "c", "d"]) message = "Did you mean to supply a `sep` keyword?" with pytest.raises(ValueError, match=message): - s.str.cat('|') + s.str.cat("|") with pytest.raises(ValueError, match=message): - s.str.cat(' ') + s.str.cat(" ") - @pytest.mark.parametrize('sep', ['', None]) - @pytest.mark.parametrize('dtype_target', ['object', 'category']) - @pytest.mark.parametrize('dtype_caller', ['object', 'category']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("sep", ["", None]) + @pytest.mark.parametrize("dtype_target", ["object", "category"]) + @pytest.mark.parametrize("dtype_caller", ["object", "category"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): - s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller) + s = Index(["a", "a", "b", "a"], dtype=dtype_caller) s = s if box == Index else Series(s, index=s) - t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target) + t = Index(["b", "a", "b", "c"], dtype=dtype_target) - expected = Index(['ab', 'aa', 'bb', 'ac']) + expected = Index(["ab", "aa", "bb", "ac"]) expected = expected if box == Index else Series(expected, index=s) # Series/Index with unaligned Index @@ -429,33 +479,37 @@ def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep): assert_series_or_index_equal(result, expected) # test integer/float dtypes (inferred by constructor) and mixed - @pytest.mark.parametrize('data', [[1, 2, 3], [.1, .2, .3], [1, 2, 'b']], - ids=['integers', 'floats', 'mixed']) + @pytest.mark.parametrize( + "data", + [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]], + ids=["integers", "floats", "mixed"], + ) # without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b'] - @pytest.mark.parametrize('box', [Series, Index, list, - lambda x: np.array(x, dtype=object)], - ids=['Series', 'Index', 'list', 'np.array']) + @pytest.mark.parametrize( + "box", + [Series, Index, list, lambda x: np.array(x, dtype=object)], + ids=["Series", "Index", "list", "np.array"], + ) def test_str_cat_wrong_dtype_raises(self, box, data): # GH 22722 - s = Series(['a', 'b', 'c']) + 
s = Series(["a", "b", "c"]) t = box(data) - msg = 'Concatenation requires list-likes containing only strings.*' + msg = "Concatenation requires list-likes containing only strings.*" with pytest.raises(TypeError, match=msg): # need to use outer and na_rep, as otherwise Index would not raise - s.str.cat(t, join='outer', na_rep='-') + s.str.cat(t, join="outer", na_rep="-") - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_mixed_inputs(self, box): - s = Index(['a', 'b', 'c', 'd']) + s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) - t = Series(['A', 'B', 'C', 'D'], index=s.values) + t = Series(["A", "B", "C", "D"], index=s.values) d = concat([t, Series(s, index=s)], axis=1) - expected = Index(['aAa', 'bBb', 'cCc', 'dDd']) - expected = expected if box == Index else Series(expected.values, - index=s.values) + expected = Index(["aAa", "bBb", "cCc", "dDd"]) + expected = expected if box == Index else Series(expected.values, index=s.values) # Series/Index with DataFrame result = s.str.cat(d) @@ -480,7 +534,7 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # Series/Index with list of Series; different indexes - t.index = ['b', 'c', 'd', 'a'] + t.index = ["b", "c", "d", "a"] with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default result = s.str.cat([t, s]) @@ -493,7 +547,7 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # Series/Index with DataFrame; different indexes - d.index = ['b', 'c', 'd', 'a'] + d.index = ["b", "c", "d", "a"] with tm.assert_produces_warning(expected_warning=FutureWarning): # FutureWarning to switch to alignment by default result = s.str.cat(d) @@ -506,8 +560,8 @@ def test_str_cat_mixed_inputs(self, box): assert_series_or_index_equal(result, expected) # errors for incorrect lengths - rgx = 'All arrays must be same length, except those having an index.*' - z = Series(['1', '2', '3']) + rgx = "All arrays must be same length, except those having an index.*" + z = Series(["1", "2", "3"]) e = concat([z, z], axis=1) # DataFrame @@ -531,13 +585,13 @@ def test_str_cat_mixed_inputs(self, box): s.str.cat([z.values, s]) # errors for incorrect arguments in list-like - rgx = 'others must be Series, Index, DataFrame,.*' + rgx = "others must be Series, Index, DataFrame,.*" # make sure None/NaN do not crash checks in _get_series_list - u = Series(['a', np.nan, 'c', None]) + u = Series(["a", np.nan, "c", None]) # mix of string and Series with pytest.raises(TypeError, match=rgx): - s.str.cat([u, 'u']) + s.str.cat([u, "u"]) # DataFrame in list with pytest.raises(TypeError, match=rgx): @@ -565,59 +619,59 @@ def test_str_cat_mixed_inputs(self, box): with pytest.raises(TypeError, match=rgx): s.str.cat(1) - @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) - @pytest.mark.parametrize('box', [Series, Index]) + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) + @pytest.mark.parametrize("box", [Series, Index]) def test_str_cat_align_indexed(self, box, join): # https://github.com/pandas-dev/pandas/issues/18657 - s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd']) - t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b']) + s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"]) + t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"]) sa, ta = s.align(t, join=join) # result after manual alignment of 
inputs - expected = sa.str.cat(ta, na_rep='-') + expected = sa.str.cat(ta, na_rep="-") if box == Index: s = Index(s) sa = Index(sa) expected = Index(expected) - result = s.str.cat(t, join=join, na_rep='-') + result = s.str.cat(t, join=join, na_rep="-") assert_series_or_index_equal(result, expected) - @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right']) + @pytest.mark.parametrize("join", ["left", "outer", "inner", "right"]) def test_str_cat_align_mixed_inputs(self, join): - s = Series(['a', 'b', 'c', 'd']) - t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) d = concat([t, t], axis=1) - expected_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee']) + expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"]) expected = expected_outer.loc[s.index.join(t.index, how=join)] # list of Series - result = s.str.cat([t, t], join=join, na_rep='-') + result = s.str.cat([t, t], join=join, na_rep="-") tm.assert_series_equal(result, expected) # DataFrame - result = s.str.cat(d, join=join, na_rep='-') + result = s.str.cat(d, join=join, na_rep="-") tm.assert_series_equal(result, expected) # mixed list of indexed/unindexed - u = np.array(['A', 'B', 'C', 'D']) - expected_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-']) + u = np.array(["A", "B", "C", "D"]) + expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) # joint index of rhs [t, u]; u will be forced have index of s - rhs_idx = t.index & s.index if join == 'inner' else t.index | s.index + rhs_idx = t.index & s.index if join == "inner" else t.index | s.index expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] - result = s.str.cat([t, u], join=join, na_rep='-') + result = s.str.cat([t, u], join=join, na_rep="-") tm.assert_series_equal(result, expected) with tm.assert_produces_warning(expected_warning=FutureWarning): # nested list-likes will be deprecated - result = s.str.cat([t, list(u)], join=join, na_rep='-') + result = s.str.cat([t, list(u)], join=join, na_rep="-") tm.assert_series_equal(result, expected) # errors for incorrect lengths - rgx = r'If `others` contains arrays or lists \(or other list-likes.*' - z = Series(['1', '2', '3']).values + rgx = r"If `others` contains arrays or lists \(or other list-likes.*" + z = Series(["1", "2", "3"]).values # unindexed object of wrong length with pytest.raises(ValueError, match=rgx): @@ -627,13 +681,13 @@ def test_str_cat_align_mixed_inputs(self, join): with pytest.raises(ValueError, match=rgx): s.str.cat([t, z], join=join) - @pytest.mark.parametrize('box', [Series, Index]) - @pytest.mark.parametrize('other', [Series, Index]) + @pytest.mark.parametrize("box", [Series, Index]) + @pytest.mark.parametrize("other", [Series, Index]) def test_str_cat_all_na(self, box, other): # GH 24044 # check that all NaNs in caller / target work - s = Index(['a', 'b', 'c', 'd']) + s = Index(["a", "b", "c", "d"]) s = s if box == Index else Series(s, index=s) t = other([np.nan] * 4, dtype=object) # add index of s for alignment @@ -644,74 +698,77 @@ def test_str_cat_all_na(self, box, other): expected = Series([np.nan] * 4, index=s.index, dtype=object) else: # box == Index expected = Index([np.nan] * 4, dtype=object) - result = s.str.cat(t, join='left') + result = s.str.cat(t, join="left") assert_series_or_index_equal(result, expected) # all-NA caller (only for Series) if other == Series: expected = Series([np.nan] * 4, dtype=object, index=t.index) - result = t.str.cat(s, join='left') + result = 
t.str.cat(s, join="left") tm.assert_series_equal(result, expected) def test_str_cat_special_cases(self): - s = Series(['a', 'b', 'c', 'd']) - t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1]) + s = Series(["a", "b", "c", "d"]) + t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1]) # iterator of elements with different types - expected = Series(['aaa', 'bbb', 'c-c', 'ddd', '-e-']) - result = s.str.cat(iter([t, s.values]), join='outer', na_rep='-') + expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"]) + result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-") tm.assert_series_equal(result, expected) # right-align with different indexes in others - expected = Series(['aa-', 'd-d'], index=[0, 3]) - result = s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-') + expected = Series(["aa-", "d-d"], index=[0, 3]) + result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-") tm.assert_series_equal(result, expected) def test_cat_on_filtered_index(self): - df = DataFrame(index=MultiIndex.from_product( - [[2011, 2012], [1, 2, 3]], names=['year', 'month'])) + df = DataFrame( + index=MultiIndex.from_product( + [[2011, 2012], [1, 2, 3]], names=["year", "month"] + ) + ) df = df.reset_index() df = df[df.month > 1] - str_year = df.year.astype('str') - str_month = df.month.astype('str') - str_both = str_year.str.cat(str_month, sep=' ') + str_year = df.year.astype("str") + str_month = df.month.astype("str") + str_both = str_year.str.cat(str_month, sep=" ") - assert str_both.loc[1] == '2011 2' + assert str_both.loc[1] == "2011 2" - str_multiple = str_year.str.cat([str_month, str_month], sep=' ') + str_multiple = str_year.str.cat([str_month, str_month], sep=" ") - assert str_multiple.loc[1] == '2011 2 2' + assert str_multiple.loc[1] == "2011 2 2" def test_count(self): - values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], - dtype=np.object_) + values = np.array(["foo", "foofoo", NA, "foooofooofommmfoo"], dtype=np.object_) - result = strings.str_count(values, 'f[o]+') + result = strings.str_count(values, "f[o]+") exp = np.array([1, 2, NA, 4]) tm.assert_numpy_array_equal(result, exp) - result = Series(values).str.count('f[o]+') + result = Series(values).str.count("f[o]+") exp = Series([1, 2, NA, 4]) assert isinstance(result, Series) tm.assert_series_equal(result, exp) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
- rs = strings.str_count(mixed, 'a') + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_count(mixed, "a") xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA]) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.count('a') + rs = Series(mixed).str.count("a") xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_contains(self): - values = np.array(['foo', NA, 'fooommm__foo', - 'mmm_', 'foommm[_]+bar'], dtype=np.object_) - pat = 'mmm[_]+' + values = np.array( + ["foo", NA, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ + ) + pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, NA, True, True, False], dtype=np.object_) @@ -721,39 +778,37 @@ def test_contains(self): expected = np.array([False, NA, False, False, True], dtype=np.object_) tm.assert_numpy_array_equal(result, expected) - values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + values = ["foo", "xyz", "fooommm__foo", "mmm_"] result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) # case insensitive using regex - values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] - result = strings.str_contains(values, 'FOO|mmm', case=False) + values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"] + result = strings.str_contains(values, "FOO|mmm", case=False) expected = np.array([True, False, True, True]) tm.assert_numpy_array_equal(result, expected) # case insensitive without regex - result = strings.str_contains(values, 'foo', regex=False, case=False) + result = strings.str_contains(values, "foo", regex=False, case=False) expected = np.array([True, False, True, False]) tm.assert_numpy_array_equal(result, expected) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
- rs = strings.str_contains(mixed, 'o') - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], - dtype=np.object_) + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_contains(mixed, "o") + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.contains('o') + rs = Series(mixed).str.contains("o") xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode - values = np.array(['foo', NA, 'fooommm__foo', 'mmm_'], - dtype=np.object_) - pat = 'mmm[_]+' + values = np.array(["foo", NA, "fooommm__foo", "mmm_"], dtype=np.object_) + pat = "mmm[_]+" result = strings.str_contains(values, pat) expected = np.array([False, np.nan, True, True], dtype=np.object_) @@ -763,8 +818,7 @@ def test_contains(self): expected = np.array([False, False, True, True]) tm.assert_numpy_array_equal(result, expected) - values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'], - dtype=np.object_) + values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) result = strings.str_contains(values, pat) expected = np.array([False, False, True, True]) assert result.dtype == np.bool_ @@ -775,65 +829,65 @@ def test_contains_for_object_category(self): # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains('a', na=True) + result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains('a', na=True) + result = values.str.contains("a", na=True) expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + result = values.str.contains("a", na=False) expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) def test_startswith(self): - values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + values = Series(["om", NA, "foo_nom", "nom", "bar_foo", NA, "foo"]) - result = values.str.startswith('foo') + result = values.str.startswith("foo") exp = Series([False, NA, True, False, False, NA, True]) tm.assert_series_equal(result, exp) - result = values.str.startswith('foo', na=True) + result = values.str.startswith("foo", na=True) tm.assert_series_equal(result, exp.fillna(True).astype(bool)) # mixed - mixed = np.array(['a', NA, 'b', True, datetime.today(), - 'foo', None, 1, 2.], dtype=np.object_) - rs = strings.str_startswith(mixed, 'f') - xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], - dtype=np.object_) + mixed = np.array( + ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=np.object_, + ) + rs = strings.str_startswith(mixed, "f") + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.startswith('f') + rs = Series(mixed).str.startswith("f") assert isinstance(rs, Series) xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assert_series_equal(rs, xp) def test_endswith(self): - values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) + values = Series(["om", NA, "foo_nom", 
"nom", "bar_foo", NA, "foo"]) - result = values.str.endswith('foo') + result = values.str.endswith("foo") exp = Series([False, NA, False, False, True, NA, True]) tm.assert_series_equal(result, exp) - result = values.str.endswith('foo', na=False) + result = values.str.endswith("foo", na=False) tm.assert_series_equal(result, exp.fillna(False).astype(bool)) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] - rs = strings.str_endswith(mixed, 'f') - xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], - dtype=np.object_) + mixed = ["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0] + rs = strings.str_endswith(mixed, "f") + xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA], dtype=np.object_) tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.endswith('f') + rs = Series(mixed).str.endswith("f") xp = Series([False, NA, False, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) @@ -846,28 +900,26 @@ def test_title(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) mixed = mixed.str.title() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) def test_lower_upper(self): - values = Series(['om', NA, 'nom', 'nom']) + values = Series(["om", NA, "nom", "nom"]) result = values.str.upper() - exp = Series(['OM', NA, 'NOM', 'NOM']) + exp = Series(["OM", NA, "NOM", "NOM"]) tm.assert_series_equal(result, exp) result = result.str.lower() tm.assert_series_equal(result, values) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, - 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) mixed = mixed.str.upper() rs = Series(mixed).str.lower() - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) @@ -878,8 +930,7 @@ def test_capitalize(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None, 1, 2.0]) mixed = mixed.str.capitalize() exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) @@ -891,14 +942,13 @@ def test_swapcase(self): tm.assert_series_equal(result, exp) # mixed - mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, - 1, 2.]) + mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None, 1, 2.0]) mixed = mixed.str.swapcase() exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA]) tm.assert_almost_equal(mixed, exp) def test_casemethods(self): - values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE'] + values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"] s = Series(values) assert s.str.lower().tolist() == [v.lower() for v in values] assert s.str.upper().tolist() == [v.upper() for v in values] @@ -907,22 +957,23 @@ def test_casemethods(self): assert s.str.swapcase().tolist() == [v.swapcase() for v in values] def test_replace(self): - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) - result = values.str.replace('BAD[_]*', '') - exp = Series(['foobar', NA]) + result = values.str.replace("BAD[_]*", "") + exp = Series(["foobar", NA]) tm.assert_series_equal(result, exp) - result = 
values.str.replace('BAD[_]*', '', n=1) - exp = Series(['foobarBAD', NA]) + result = values.str.replace("BAD[_]*", "", n=1) + exp = Series(["foobarBAD", NA]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', - None, 1, 2.]) + mixed = Series( + ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) - rs = Series(mixed).str.replace('BAD[_]*', '') - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.replace("BAD[_]*", "") + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -935,66 +986,69 @@ def test_replace(self): # GH 13438 msg = "repl must be a string or callable" for klass in (Series, Index): - for repl in (None, 3, {'a': 'b'}): - for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']): + for repl in (None, 3, {"a": "b"}): + for data in (["a", "b", None], ["a", "b", "c", "ad"]): values = klass(data) with pytest.raises(TypeError, match=msg): - values.str.replace('a', repl) + values.str.replace("a", repl) def test_replace_callable(self): # GH 15055 - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) # test with callable repl = lambda m: m.group(0).swapcase() - result = values.str.replace('[a-z][A-Z]{2}', repl, n=2) - exp = Series(['foObaD__baRbaD', NA]) + result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) + exp = Series(["foObaD__baRbaD", NA]) tm.assert_series_equal(result, exp) # test with wrong number of arguments, raising an error - p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ ' - r'(?(3)required )positional arguments?') + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) repl = lambda: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) repl = lambda m, x: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) repl = lambda m, x, y=None: None with pytest.raises(TypeError, match=p_err): - values.str.replace('a', repl) + values.str.replace("a", repl) # test regex named groups - values = Series(['Foo Bar Baz', NA]) + values = Series(["Foo Bar Baz", NA]) pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)" - repl = lambda m: m.group('middle').swapcase() + repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl) - exp = Series(['bAR', NA]) + exp = Series(["bAR", NA]) tm.assert_series_equal(result, exp) def test_replace_compiled_regex(self): # GH 15446 - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) # test with compiled regex - pat = re.compile(r'BAD[_]*') - result = values.str.replace(pat, '') - exp = Series(['foobar', NA]) + pat = re.compile(r"BAD[_]*") + result = values.str.replace(pat, "") + exp = Series(["foobar", NA]) tm.assert_series_equal(result, exp) - result = values.str.replace(pat, '', n=1) - exp = Series(['foobarBAD', NA]) + result = values.str.replace(pat, "", n=1) + exp = Series(["foobarBAD", NA]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', - None, 1, 2.]) + mixed = Series( + ["aBAD", NA, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0] + ) - rs = Series(mixed).str.replace(pat, '') - xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.replace(pat, "") + xp = Series(["a", NA, "b", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp) @@ -1007,410 +1061,399 @@ def test_replace_compiled_regex(self): # case and flags provided to str.replace will have no effect # and will produce warnings - values = Series(['fooBAD__barBAD__bad', NA]) - pat = re.compile(r'BAD[_]*') + values = Series(["fooBAD__barBAD__bad", NA]) + pat = re.compile(r"BAD[_]*") - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', flags=re.IGNORECASE) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", flags=re.IGNORECASE) - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', case=False) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=False) - with pytest.raises(ValueError, - match="case and flags cannot be"): - result = values.str.replace(pat, '', case=True) + with pytest.raises(ValueError, match="case and flags cannot be"): + result = values.str.replace(pat, "", case=True) # test with callable - values = Series(['fooBAD__barBAD', NA]) + values = Series(["fooBAD__barBAD", NA]) repl = lambda m: m.group(0).swapcase() - pat = re.compile('[a-z][A-Z]{2}') + pat = re.compile("[a-z][A-Z]{2}") result = values.str.replace(pat, repl, n=2) - exp = Series(['foObaD__baRbaD', NA]) + exp = Series(["foObaD__baRbaD", NA]) tm.assert_series_equal(result, exp) def test_replace_literal(self): # GH16808 literal replace (regex=False vs regex=True) - values = Series(['f.o', 'foo', NA]) - exp = Series(['bao', 'bao', NA]) - result = values.str.replace('f.', 'ba') + values = Series(["f.o", "foo", NA]) + exp = Series(["bao", "bao", NA]) + result = values.str.replace("f.", "ba") tm.assert_series_equal(result, exp) - exp = Series(['bao', 'foo', NA]) - result = values.str.replace('f.', 'ba', regex=False) + exp = Series(["bao", "foo", NA]) + result = values.str.replace("f.", "ba", regex=False) tm.assert_series_equal(result, exp) # Cannot do a literal replace if given a callable repl or compiled # pattern callable_repl = lambda m: m.group(0).swapcase() - compiled_pat = re.compile('[a-z][A-Z]{2}') + compiled_pat = re.compile("[a-z][A-Z]{2}") msg = "Cannot use a callable replacement when regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace('abc', callable_repl, regex=False) + values.str.replace("abc", callable_repl, regex=False) - msg = ("Cannot use a compiled regex as replacement pattern with" - " regex=False") + msg = "Cannot use a compiled regex as replacement pattern with" " regex=False" with pytest.raises(ValueError, match=msg): - values.str.replace(compiled_pat, '', regex=False) + values.str.replace(compiled_pat, "", regex=False) def test_repeat(self): - values = Series(['a', 'b', NA, 'c', NA, 'd']) + values = Series(["a", "b", NA, "c", NA, "d"]) result = values.str.repeat(3) - exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd']) + exp = Series(["aaa", "bbb", NA, "ccc", NA, "ddd"]) tm.assert_series_equal(result, exp) result = values.str.repeat([1, 2, 3, 4, 5, 6]) - exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd']) + exp = Series(["a", "bb", NA, "cccc", NA, "dddddd"]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1, - 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "foo", None, 1, 2.0]) rs = Series(mixed).str.repeat(3) - xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA]) + xp = Series(["aaa", NA, "bbb", NA, NA, 
"foofoofoo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_match(self): # New match behavior introduced in 0.13 - values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*(BAD[_]+).*(BAD)') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result = values.str.match(".*(BAD[_]+).*(BAD)") exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) - values = Series(['fooBAD__barBAD', NA, 'foo']) - result = values.str.match('.*BAD[_]+.*BAD') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result = values.str.match(".*BAD[_]+.*BAD") exp = Series([True, NA, False]) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) - rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)') + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) + rs = Series(mixed).str.match(".*(BAD[_]+).*(BAD)") xp = Series([True, NA, True, NA, NA, False, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) # na GH #6609 - res = Series(['a', 0, np.nan]).str.match('a', na=False) + res = Series(["a", 0, np.nan]).str.match("a", na=False) exp = Series([True, False, False]) assert_series_equal(exp, res) - res = Series(['a', 0, np.nan]).str.match('a') + res = Series(["a", 0, np.nan]).str.match("a") exp = Series([True, np.nan, np.nan]) assert_series_equal(exp, res) def test_extract_expand_None(self): - values = Series(['fooBAD__barBAD', NA, 'foo']) - with pytest.raises(ValueError, - match='expand must be True or False'): - values.str.extract('.*(BAD[_]+).*(BAD)', expand=None) + values = Series(["fooBAD__barBAD", NA, "foo"]) + with pytest.raises(ValueError, match="expand must be True or False"): + values.str.extract(".*(BAD[_]+).*(BAD)", expand=None) def test_extract_expand_unspecified(self): - values = Series(['fooBAD__barBAD', NA, 'foo']) - result_unspecified = values.str.extract('.*(BAD[_]+).*') + values = Series(["fooBAD__barBAD", NA, "foo"]) + result_unspecified = values.str.extract(".*(BAD[_]+).*") assert isinstance(result_unspecified, DataFrame) - result_true = values.str.extract('.*(BAD[_]+).*', expand=True) + result_true = values.str.extract(".*(BAD[_]+).*", expand=True) tm.assert_frame_equal(result_unspecified, result_true) def test_extract_expand_False(self): # Contains tests like those in test_match and some others. 
- values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) er = [NA, NA] # empty row - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er, - er, er]) + rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # unicode - values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=False) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # GH9980 # Index only works with one regex group since # multi-group would expand to a frame - idx = Index(['A1', 'A2', 'A3', 'A4', 'B5']) + idx = Index(["A1", "A2", "A3", "A4", "B5"]) with pytest.raises(ValueError, match="supported"): - idx.str.extract('([AB])([123])', expand=False) + idx.str.extract("([AB])([123])", expand=False) # these should work for both Series and Index for klass in [Series, Index]: # no groups - s_or_idx = klass(['A1', 'B2', 'C3']) + s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('[ABC][123]', expand=False) + s_or_idx.str.extract("[ABC][123]", expand=False) # only non-capturing groups with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('(?:[AB]).*', expand=False) + s_or_idx.str.extract("(?:[AB]).*", expand=False) # single group renames series/index properly - s_or_idx = klass(['A1', 'A2']) - result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False) - assert result.name == 'uno' + s_or_idx = klass(["A1", "A2"]) + result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False) + assert result.name == "uno" - exp = klass(['A', 'A'], name='uno') + exp = klass(["A", "A"], name="uno") if klass == Series: tm.assert_series_equal(result, exp) else: tm.assert_index_equal(result, exp) - s = Series(['A1', 'B2', 'C3']) + s = Series(["A1", "B2", "C3"]) # one group, no matches - result = s.str.extract('(_)', expand=False) + result = s.str.extract("(_)", expand=False) exp = Series([NA, NA, NA], dtype=object) tm.assert_series_equal(result, exp) # two groups, no matches - result = s.str.extract('(_)(_)', expand=False) + result = s.str.extract("(_)(_)", expand=False) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches - result = s.str.extract('([AB])[123]', expand=False) - exp = Series(['A', 'B', NA]) + result = s.str.extract("([AB])[123]", expand=False) + exp = Series(["A", "B", NA]) tm.assert_series_equal(result, exp) # two groups, some matches - result = s.str.extract('([AB])([123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = s.str.extract("([AB])([123])", expand=False) + exp = DataFrame([["A", "1"], ["B", "2"],
[NA, NA]]) tm.assert_frame_equal(result, exp) # one named group - result = s.str.extract('(?P<letter>[AB])', expand=False) - exp = Series(['A', 'B', NA], name='letter') + result = s.str.extract("(?P<letter>[AB])", expand=False) + exp = Series(["A", "B", NA], name="letter") tm.assert_series_equal(result, exp) # two named groups - result = s.str.extract('(?P<letter>[AB])(?P<number>[123])', - expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], - columns=['letter', 'number']) + result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False) + exp = DataFrame( + [["A", "1"], ["B", "2"], [NA, NA]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # mix named and unnamed groups - result = s.str.extract('([AB])(?P<number>[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], - columns=[0, 'number']) + result = s.str.extract("([AB])(?P<number>[123])", expand=False) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]], columns=[0, "number"]) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group - result = s.str.extract('([AB])(?:[123])', expand=False) - exp = Series(['A', 'B', NA]) + result = s.str.extract("([AB])(?:[123])", expand=False) + exp = Series(["A", "B", NA]) tm.assert_series_equal(result, exp) # two normal groups, one non-capturing group - result = Series(['A11', 'B22', 'C33']).str.extract( - '([AB])([123])(?:[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = Series(["A11", "B22", "C33"]).str.extract( + "([AB])([123])(?:[123])", expand=False + ) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group - result = Series(['A1', 'B2', '3']).str.extract( - '(?P<letter>[AB])?(?P<number>[123])', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']], - columns=['letter', 'number']) + result = Series(["A1", "B2", "3"]).str.extract( + "(?P<letter>[AB])?(?P<number>[123])", expand=False + ) + exp = DataFrame( + [["A", "1"], ["B", "2"], [NA, "3"]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group - result = Series(['A1', 'B2', 'C']).str.extract( - '(?P<letter>[ABC])(?P<number>[123])?', expand=False) - exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], - columns=['letter', 'number']) + result = Series(["A1", "B2", "C"]).str.extract( + "(?P<letter>[ABC])(?P<number>[123])?", expand=False + ) + exp = DataFrame( + [["A", "1"], ["B", "2"], ["C", NA]], columns=["letter", "number"] + ) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): - data = ['A1', 'B2', 'C'] - index = index[:len(data)] + data = ["A1", "B2", "C"] + index = index[: len(data)] s = Series(data, index=index) - result = s.str.extract(r'(\d)', expand=False) - exp = Series(['1', '2', NA], index=index) + result = s.str.extract(r"(\d)", expand=False) + exp = Series(["1", "2", NA], index=index) tm.assert_series_equal(result, exp) result = Series(data, index=index).str.extract( - r'(?P<letter>\D)(?P<number>\d)?', expand=False) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number'], index=index) + r"(?P<letter>\D)(?P<number>\d)?", expand=False + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ - tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, +
tm.makePeriodIndex, + tm.makeRangeIndex, ] for index in i_funs: check_index(index()) # single_series_name_is_preserved. - s = Series(['a3', 'b3', 'c2'], name='bob') - r = s.str.extract(r'(?P<sue>[a-z])', expand=False) - e = Series(['a', 'b', 'c'], name='sue') + s = Series(["a3", "b3", "c2"], name="bob") + r = s.str.extract(r"(?P<sue>[a-z])", expand=False) + e = Series(["a", "b", "c"], name="sue") tm.assert_series_equal(r, e) assert r.name == e.name def test_extract_expand_True(self): # Contains tests like those in test_match and some others. - values = Series(['fooBAD__barBAD', NA, 'foo']) + values = Series(["fooBAD__barBAD", NA, "foo"]) er = [NA, NA] # empty row - result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True) - exp = DataFrame([['BAD__', 'BAD'], er, er]) + result = values.str.extract(".*(BAD[_]+).*(BAD)", expand=True) + exp = DataFrame([["BAD__", "BAD"], er, er]) tm.assert_frame_equal(result, exp) # mixed - mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["aBAD_BAD", NA, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True) - exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, - er, er, er, er]) + rs = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=True) + exp = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(rs, exp) # these should work for both Series and Index for klass in [Series, Index]: # no groups - s_or_idx = klass(['A1', 'B2', 'C3']) + s_or_idx = klass(["A1", "B2", "C3"]) msg = "pattern contains no capture groups" with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('[ABC][123]', expand=True) + s_or_idx.str.extract("[ABC][123]", expand=True) # only non-capturing groups with pytest.raises(ValueError, match=msg): - s_or_idx.str.extract('(?:[AB]).*', expand=True) + s_or_idx.str.extract("(?:[AB]).*", expand=True) # single group renames series/index properly - s_or_idx = klass(['A1', 'A2']) - result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True) + s_or_idx = klass(["A1", "A2"]) + result_df = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True) assert isinstance(result_df, DataFrame) - result_series = result_df['uno'] - assert_series_equal(result_series, Series(['A', 'A'], name='uno')) + result_series = result_df["uno"] + assert_series_equal(result_series, Series(["A", "A"], name="uno")) def test_extract_series(self): # extract should give the same result whether or not the # series has a name.
for series_name in None, "series_name": - s = Series(['A1', 'B2', 'C3'], name=series_name) + s = Series(["A1", "B2", "C3"], name=series_name) # one group, no matches - result = s.str.extract('(_)', expand=True) + result = s.str.extract("(_)", expand=True) exp = DataFrame([NA, NA, NA], dtype=object) tm.assert_frame_equal(result, exp) # two groups, no matches - result = s.str.extract('(_)(_)', expand=True) + result = s.str.extract("(_)(_)", expand=True) exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object) tm.assert_frame_equal(result, exp) # one group, some matches - result = s.str.extract('([AB])[123]', expand=True) - exp = DataFrame(['A', 'B', NA]) + result = s.str.extract("([AB])[123]", expand=True) + exp = DataFrame(["A", "B", NA]) tm.assert_frame_equal(result, exp) # two groups, some matches - result = s.str.extract('([AB])([123])', expand=True) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = s.str.extract("([AB])([123])", expand=True) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one named group - result = s.str.extract('(?P<letter>[AB])', expand=True) - exp = DataFrame({"letter": ['A', 'B', NA]}) + result = s.str.extract("(?P<letter>[AB])", expand=True) + exp = DataFrame({"letter": ["A", "B", NA]}) tm.assert_frame_equal(result, exp) # two named groups - result = s.str.extract( - '(?P<letter>[AB])(?P<number>[123])', - expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - [NA, NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True) + e_list = [["A", "1"], ["B", "2"], [NA, NA]] + exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # mix named and unnamed groups - result = s.str.extract('([AB])(?P<number>[123])', expand=True) - exp = DataFrame(e_list, columns=[0, 'number']) + result = s.str.extract("([AB])(?P<number>[123])", expand=True) + exp = DataFrame(e_list, columns=[0, "number"]) tm.assert_frame_equal(result, exp) # one normal group, one non-capturing group - result = s.str.extract('([AB])(?:[123])', expand=True) - exp = DataFrame(['A', 'B', NA]) + result = s.str.extract("([AB])(?:[123])", expand=True) + exp = DataFrame(["A", "B", NA]) tm.assert_frame_equal(result, exp) def test_extract_optional_groups(self): # two normal groups, one non-capturing group - result = Series(['A11', 'B22', 'C33']).str.extract( - '([AB])([123])(?:[123])', expand=True) - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) + result = Series(["A11", "B22", "C33"]).str.extract( + "([AB])([123])(?:[123])", expand=True + ) + exp = DataFrame([["A", "1"], ["B", "2"], [NA, NA]]) tm.assert_frame_equal(result, exp) # one optional group followed by one normal group - result = Series(['A1', 'B2', '3']).str.extract( - '(?P<letter>[AB])?(?P<number>[123])', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - [NA, '3'] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = Series(["A1", "B2", "3"]).str.extract( + "(?P<letter>[AB])?(?P<number>[123])", expand=True + ) + e_list = [["A", "1"], ["B", "2"], [NA, "3"]] + exp = DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # one normal group followed by one optional group - result = Series(['A1', 'B2', 'C']).str.extract( - '(?P<letter>[ABC])(?P<number>[123])?', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number']) + result = Series(["A1", "B2", "C"]).str.extract( + "(?P<letter>[ABC])(?P<number>[123])?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp =
DataFrame(e_list, columns=["letter", "number"]) tm.assert_frame_equal(result, exp) # GH6348 # not passing index to the extractor def check_index(index): - data = ['A1', 'B2', 'C'] - index = index[:len(data)] - result = Series(data, index=index).str.extract( - r'(\d)', expand=True) - exp = DataFrame(['1', '2', NA], index=index) + data = ["A1", "B2", "C"] + index = index[: len(data)] + result = Series(data, index=index).str.extract(r"(\d)", expand=True) + exp = DataFrame(["1", "2", NA], index=index) tm.assert_frame_equal(result, exp) result = Series(data, index=index).str.extract( - r'(?P<letter>\D)(?P<number>\d)?', expand=True) - e_list = [ - ['A', '1'], - ['B', '2'], - ['C', NA] - ] - exp = DataFrame(e_list, columns=['letter', 'number'], index=index) + r"(?P<letter>\D)(?P<number>\d)?", expand=True + ) + e_list = [["A", "1"], ["B", "2"], ["C", NA]] + exp = DataFrame(e_list, columns=["letter", "number"], index=index) tm.assert_frame_equal(result, exp) i_funs = [ - tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, - tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex + tm.makeStringIndex, + tm.makeUnicodeIndex, + tm.makeIntIndex, + tm.makeDateIndex, + tm.makePeriodIndex, + tm.makeRangeIndex, ] for index in i_funs: check_index(index()) @@ -1419,18 +1462,18 @@ def test_extract_single_group_returns_frame(self): # GH11386 extract should always return DataFrame, even when # there is only one group. Prior to v0.18.0, extract returned # Series when there was only one group in the regex. - s = Series(['a3', 'b3', 'c2'], name='series_name') - r = s.str.extract(r'(?P<letter>[a-z])', expand=True) - e = DataFrame({"letter": ['a', 'b', 'c']}) + s = Series(["a3", "b3", "c2"], name="series_name") + r = s.str.extract(r"(?P<letter>[a-z])", expand=True) + e = DataFrame({"letter": ["a", "b", "c"]}) tm.assert_frame_equal(r, e) def test_extractall(self): subject_list = [ - 'dave@google.com', - 'tdhock5@gmail.com', - 'maudelaperriere@gmail.com', - 'rob@gmail.com some text steve@gmail.com', - 'a@b.com some text c@d.com and e@f.com', + "dave@google.com", + "tdhock5@gmail.com", + "maudelaperriere@gmail.com", + "rob@gmail.com some text steve@gmail.com", + "a@b.com some text c@d.com and e@f.com", np.nan, "", ] @@ -1438,8 +1481,11 @@ def test_extractall(self): ("dave", "google", "com"), ("tdhock5", "gmail", "com"), ("maudelaperriere", "gmail", "com"), - ("rob", "gmail", "com"), ("steve", "gmail", "com"), - ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"), + ("rob", "gmail", "com"), + ("steve", "gmail", "com"), + ("a", "b", "com"), + ("c", "d", "com"), + ("e", "f", "com"), ] named_pattern = r""" (?P[a-z0-9]+) @@ -1452,45 +1498,42 @@ def test_extractall(self): S = Series(subject_list) # extractall should return a DataFrame with one row for each # match, indexed by the subject from which the match came.
- expected_index = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (3, 0), - (3, 1), - (4, 0), - (4, 1), - (4, 2), - ], names=(None, "match")) - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_index = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)], + names=(None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = S.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # The index of the input Series should be used to construct # the index of the output DataFrame: - series_index = MultiIndex.from_tuples([ - ("single", "Dave"), - ("single", "Toby"), - ("single", "Maude"), - ("multiple", "robAndSteve"), - ("multiple", "abcdef"), - ("none", "missing"), - ("none", "empty"), - ]) + series_index = MultiIndex.from_tuples( + [ + ("single", "Dave"), + ("single", "Toby"), + ("single", "Maude"), + ("multiple", "robAndSteve"), + ("multiple", "abcdef"), + ("none", "missing"), + ("none", "empty"), + ] + ) Si = Series(subject_list, series_index) - expected_index = MultiIndex.from_tuples([ - ("single", "Dave", 0), - ("single", "Toby", 0), - ("single", "Maude", 0), - ("multiple", "robAndSteve", 0), - ("multiple", "robAndSteve", 1), - ("multiple", "abcdef", 0), - ("multiple", "abcdef", 1), - ("multiple", "abcdef", 2), - ], names=(None, None, "match")) - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_index = MultiIndex.from_tuples( + [ + ("single", "Dave", 0), + ("single", "Toby", 0), + ("single", "Maude", 0), + ("multiple", "robAndSteve", 0), + ("multiple", "robAndSteve", 1), + ("multiple", "abcdef", 0), + ("multiple", "abcdef", 1), + ("multiple", "abcdef", 2), + ], + names=(None, None, "match"), + ) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Si.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) @@ -1498,81 +1541,73 @@ def test_extractall(self): Sn = Series(subject_list, series_index) Sn.index.names = ("matches", "description") expected_index.names = ("matches", "description", "match") - expected_df = DataFrame( - expected_tuples, expected_index, expected_columns) + expected_df = DataFrame(expected_tuples, expected_index, expected_columns) computed_df = Sn.str.extractall(named_pattern, re.VERBOSE) tm.assert_frame_equal(computed_df, expected_df) # optional groups. - subject_list = ['', 'A1', '32'] - named_pattern = '(?P<letter>[AB])?(?P<number>[123])' + subject_list = ["", "A1", "32"] + named_pattern = "(?P<letter>[AB])?(?P<number>[123])" computed_df = Series(subject_list).str.extractall(named_pattern) - expected_index = MultiIndex.from_tuples([ - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - expected_df = DataFrame([ - ('A', '1'), - (NA, '3'), - (NA, '2'), - ], expected_index, columns=['letter', 'number']) + expected_index = MultiIndex.from_tuples( + [(1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + expected_df = DataFrame( + [("A", "1"), (NA, "3"), (NA, "2")], + expected_index, + columns=["letter", "number"], + ) tm.assert_frame_equal(computed_df, expected_df) # only one of two groups has a name.
- pattern = '([AB])?(?P<number>[123])' + pattern = "([AB])?(?P<number>[123])" computed_df = Series(subject_list).str.extractall(pattern) - expected_df = DataFrame([ - ('A', '1'), - (NA, '3'), - (NA, '2'), - ], expected_index, columns=[0, 'number']) + expected_df = DataFrame( + [("A", "1"), (NA, "3"), (NA, "2")], expected_index, columns=[0, "number"] + ) tm.assert_frame_equal(computed_df, expected_df) def test_extractall_single_group(self): # extractall(one named group) returns DataFrame with one named # column. - s = Series(['a3', 'b3', 'd4c2'], name='series_name') - r = s.str.extractall(r'(?P<letter>[a-z])') - i = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i) + s = Series(["a3", "b3", "d4c2"], name="series_name") + r = s.str.extractall(r"(?P<letter>[a-z])") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame({"letter": ["a", "b", "d", "c"]}, i) tm.assert_frame_equal(r, e) # extractall(one un-named group) returns DataFrame with one # un-named column. - r = s.str.extractall(r'([a-z])') - e = DataFrame(['a', 'b', 'd', 'c'], i) + r = s.str.extractall(r"([a-z])") + e = DataFrame(["a", "b", "d", "c"], i) tm.assert_frame_equal(r, e) def test_extractall_single_group_with_quantifier(self): # extractall(one un-named group with quantifier) returns # DataFrame with one un-named column (GH13382). - s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name') - r = s.str.extractall(r'([a-z]+)') - i = MultiIndex.from_tuples([ - (0, 0), - (1, 0), - (2, 0), - (2, 1), - ], names=(None, "match")) - e = DataFrame(['ab', 'abc', 'd', 'cd'], i) + s = Series(["ab3", "abc3", "d4cd2"], name="series_name") + r = s.str.extractall(r"([a-z]+)") + i = MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match") + ) + e = DataFrame(["ab", "abc", "d", "cd"], i) tm.assert_frame_equal(r, e) - @pytest.mark.parametrize('data, names', [ - ([], (None, )), - ([], ('i1', )), - ([], (None, 'i2')), - ([], ('i1', 'i2')), - (['a3', 'b3', 'd4c2'], (None, )), - (['a3', 'b3', 'd4c2'], ('i1', 'i2')), - (['a3', 'b3', 'd4c2'], (None, 'i2')), - (['a3', 'b3', 'd4c2'], ('i1', 'i2')), - ]) + @pytest.mark.parametrize( + "data, names", + [ + ([], (None,)), + ([], ("i1",)), + ([], (None, "i2")), + ([], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None,)), + (["a3", "b3", "d4c2"], ("i1", "i2")), + (["a3", "b3", "d4c2"], (None, "i2")), + (["a3", "b3", "d4c2"], ("i1", "i2")), + ], + ) def test_extractall_no_matches(self, data, names): # GH19075 extractall with no matches should return a valid MultiIndex n = len(data) @@ -1581,108 +1616,109 @@ def test_extractall_no_matches(self, data, names): else: a = (tuple([i] * (n - 1)) for i in range(n)) i = MultiIndex.from_tuples(a, names=names) - s = Series(data, name='series_name', index=i, dtype='object') - ei = MultiIndex.from_tuples([], names=(names + ('match',))) + s = Series(data, name="series_name", index=i, dtype="object") + ei = MultiIndex.from_tuples([], names=(names + ("match",))) # one un-named group. - r = s.str.extractall('(z)') + r = s.str.extractall("(z)") e = DataFrame(columns=[0], index=ei) tm.assert_frame_equal(r, e) # two un-named groups. - r = s.str.extractall('(z)(z)') + r = s.str.extractall("(z)(z)") e = DataFrame(columns=[0, 1], index=ei) tm.assert_frame_equal(r, e) # one named group. - r = s.str.extractall('(?P<first>z)') + r = s.str.extractall("(?P<first>z)") e = DataFrame(columns=["first"], index=ei) tm.assert_frame_equal(r, e) # two named groups.
- r = s.str.extractall('(?P<first>z)(?P<second>z)') + r = s.str.extractall("(?P<first>z)(?P<second>z)") e = DataFrame(columns=["first", "second"], index=ei) tm.assert_frame_equal(r, e) # one named, one un-named. - r = s.str.extractall('(z)(?P<second>z)') + r = s.str.extractall("(z)(?P<second>z)") e = DataFrame(columns=[0, "second"], index=ei) tm.assert_frame_equal(r, e) def test_extractall_stringindex(self): - s = Series(["a1a2", "b1", "c1"], name='xxx') + s = Series(["a1a2", "b1", "c1"], name="xxx") res = s.str.extractall(r"[ab](?P<digit>\d)") - exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], - names=[None, 'match']) - exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + exp_idx = MultiIndex.from_tuples( + [(0, 0), (0, 1), (1, 0)], names=[None, "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) # index should return the same result as the default index without name # thus index.name doesn't affect to the result - for idx in [Index(["a1a2", "b1", "c1"]), - Index(["a1a2", "b1", "c1"], name='xxx')]: + for idx in [ + Index(["a1a2", "b1", "c1"]), + Index(["a1a2", "b1", "c1"], name="xxx"), + ]: res = idx.str.extractall(r"[ab](?P<digit>\d)") tm.assert_frame_equal(res, exp) - s = Series(["a1a2", "b1", "c1"], name='s_name', - index=Index(["XX", "yy", "zz"], name='idx_name')) + s = Series( + ["a1a2", "b1", "c1"], + name="s_name", + index=Index(["XX", "yy", "zz"], name="idx_name"), + ) res = s.str.extractall(r"[ab](?P<digit>\d)") - exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], - names=["idx_name", 'match']) - exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + exp_idx = MultiIndex.from_tuples( + [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"] + ) + exp = DataFrame({"digit": ["1", "2", "1"]}, index=exp_idx) tm.assert_frame_equal(res, exp) def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for # each capture group) - s = Series(['a3', 'b3', 'd4c2'], name='series_name') + s = Series(["a3", "b3", "d4c2"], name="series_name") with pytest.raises(ValueError, match="no capture groups"): - s.str.extractall(r'[a-z]') + s.str.extractall(r"[a-z]") def test_extract_index_one_two_groups(self): - s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], - name='series_name') - r = s.index.str.extract(r'([A-Z])', expand=True) - e = DataFrame(['A', "B", "D"]) + s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name") + r = s.index.str.extract(r"([A-Z])", expand=True) + e = DataFrame(["A", "B", "D"]) tm.assert_frame_equal(r, e) # Prior to v0.18.0, index.str.extract(regex with one group) # returned Index. With more than one group, extract raised an # error (GH9980). Now extract always returns DataFrame.
- r = s.index.str.extract( - r'(?P<letter>[A-Z])(?P<digit>[0-9])', expand=True) - e_list = [ - ("A", "3"), - ("B", "3"), - ("D", "4"), - ] + r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True) + e_list = [("A", "3"), ("B", "3"), ("D", "4")] e = DataFrame(e_list, columns=["letter", "digit"]) tm.assert_frame_equal(r, e) def test_extractall_same_as_extract(self): - s = Series(['a3', 'b3', 'c2'], name='series_name') + s = Series(["a3", "b3", "c2"], name="series_name") - pattern_two_noname = r'([a-z])([0-9])' + pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_multi_index = s.str.extractall(pattern_two_noname) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_multi_index) - pattern_two_named = r'(?P[a-z])(?P[0-9])' + pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_multi_index = s.str.extractall(pattern_two_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_multi_index) - pattern_one_named = r'(?P[a-z])' + pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_multi_index = s.str.extractall(pattern_one_named) no_multi_index = has_multi_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_multi_index) - pattern_one_noname = r'([a-z])' + pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_multi_index = s.str.extractall(pattern_one_noname) no_multi_index = has_multi_index.xs(0, level="match") @@ -1690,32 +1726,31 @@ def test_extractall_same_as_extract(self): def test_extractall_same_as_extract_subject_index(self): # same as above tests, but s has an MultiIndex.
- i = MultiIndex.from_tuples([ - ("A", "first"), - ("B", "second"), - ("C", "third"), - ], names=("capital", "ordinal")) - s = Series(['a3', 'b3', 'c2'], i, name='series_name') - - pattern_two_noname = r'([a-z])([0-9])' + i = MultiIndex.from_tuples( + [("A", "first"), ("B", "second"), ("C", "third")], + names=("capital", "ordinal"), + ) + s = Series(["a3", "b3", "c2"], i, name="series_name") + + pattern_two_noname = r"([a-z])([0-9])" extract_two_noname = s.str.extract(pattern_two_noname, expand=True) has_match_index = s.str.extractall(pattern_two_noname) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_noname, no_match_index) - pattern_two_named = r'(?P[a-z])(?P[0-9])' + pattern_two_named = r"(?P[a-z])(?P[0-9])" extract_two_named = s.str.extract(pattern_two_named, expand=True) has_match_index = s.str.extractall(pattern_two_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_two_named, no_match_index) - pattern_one_named = r'(?P[a-z])' + pattern_one_named = r"(?P[a-z])" extract_one_named = s.str.extract(pattern_one_named, expand=True) has_match_index = s.str.extractall(pattern_one_named) no_match_index = has_match_index.xs(0, level="match") tm.assert_frame_equal(extract_one_named, no_match_index) - pattern_one_noname = r'([a-z])' + pattern_one_noname = r"([a-z])" extract_one_noname = s.str.extract(pattern_one_noname, expand=True) has_match_index = s.str.extractall(pattern_one_noname) no_match_index = has_match_index.xs(0, level="match") @@ -1731,43 +1766,40 @@ def test_empty_str_methods(self): # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) - assert '' == empty.str.cat() + assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) - tm.assert_series_equal(empty_int, empty.str.count('a')) - tm.assert_series_equal(empty_bool, empty.str.contains('a')) - tm.assert_series_equal(empty_bool, empty.str.startswith('a')) - tm.assert_series_equal(empty_bool, empty.str.endswith('a')) + tm.assert_series_equal(empty_int, empty.str.count("a")) + tm.assert_series_equal(empty_bool, empty.str.contains("a")) + tm.assert_series_equal(empty_bool, empty.str.startswith("a")) + tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) - tm.assert_series_equal(empty_str, empty.str.replace('a', 'b')) + tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) - tm.assert_series_equal(empty_bool, empty.str.match('^a')) + tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( - DataFrame(columns=[0], dtype=str), - empty.str.extract('()', expand=True)) + DataFrame(columns=[0], dtype=str), empty.str.extract("()", expand=True) + ) tm.assert_frame_equal( - DataFrame(columns=[0, 1], dtype=str), - empty.str.extract('()()', expand=True)) - tm.assert_series_equal( - empty_str, - empty.str.extract('()', expand=False)) + DataFrame(columns=[0, 1], dtype=str), empty.str.extract("()()", expand=True) + ) + tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=str), - empty.str.extract('()()', expand=False)) + empty.str.extract("()()", expand=False), + ) tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) - tm.assert_series_equal(empty_str, empty_str.str.join('')) + tm.assert_series_equal(empty_str, 
empty_str.str.join("")) tm.assert_series_equal(empty_int, empty.str.len()) - tm.assert_series_equal(empty_str, empty_str.str.findall('a')) - tm.assert_series_equal(empty_int, empty.str.find('a')) - tm.assert_series_equal(empty_int, empty.str.rfind('a')) + tm.assert_series_equal(empty_str, empty_str.str.findall("a")) + tm.assert_series_equal(empty_int, empty.str.find("a")) + tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) - tm.assert_series_equal(empty_str, empty.str.split('a')) - tm.assert_series_equal(empty_str, empty.str.rsplit('a')) - tm.assert_series_equal(empty_str, - empty.str.partition('a', expand=False)) - tm.assert_series_equal(empty_str, - empty.str.rpartition('a', expand=False)) + tm.assert_series_equal(empty_str, empty.str.split("a")) + tm.assert_series_equal(empty_str, empty.str.rsplit("a")) + tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) + tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) @@ -1775,8 +1807,8 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii')) - tm.assert_series_equal(empty_bytes, empty.str.encode('ascii')) + tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) tm.assert_series_equal(empty_str, empty.str.isalnum()) tm.assert_series_equal(empty_str, empty.str.isalpha()) tm.assert_series_equal(empty_str, empty.str.isdigit()) @@ -1788,39 +1820,42 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) - tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) + tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) - table = str.maketrans('a', 'b') + table = str.maketrans("a", "b") tm.assert_series_equal(empty_str, empty.str.translate(table)) def test_empty_str_methods_to_frame(self): empty = Series(dtype=str) empty_df = DataFrame() - tm.assert_frame_equal(empty_df, empty.str.partition('a')) - tm.assert_frame_equal(empty_df, empty.str.rpartition('a')) + tm.assert_frame_equal(empty_df, empty.str.partition("a")) + tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) def test_ismethods(self): - values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] + values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] str_s = Series(values) - alnum_e = [True, True, True, True, True, False, True, True, False, - False] - alpha_e = [True, True, True, False, False, False, True, False, False, - False] - digit_e = [False, False, False, True, False, False, False, True, False, - False] + alnum_e = [True, True, True, True, True, False, True, True, False, False] + alpha_e = [True, True, True, False, False, False, True, False, False, False] + digit_e = [False, False, False, True, False, False, False, True, False, False] # TODO: unused - num_e = [False, False, False, True, False, False, # noqa - False, True, False, False] - - space_e = [False, False, False, False, False, False, False, 
False, - False, True] - lower_e = [False, True, False, False, False, False, False, False, - False, False] - upper_e = [True, False, False, False, True, False, True, False, False, - False] - title_e = [True, False, True, False, True, False, False, False, False, - False] + num_e = [ + False, + False, + False, + True, + False, + False, # noqa + False, + True, + False, + False, + ] + + space_e = [False, False, False, False, False, False, False, False, False, True] + lower_e = [False, True, False, False, False, False, False, False, False, False] + upper_e = [True, False, False, False, True, False, True, False, False, False] + title_e = [True, False, True, False, True, False, False, False, False, False] tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) @@ -1843,18 +1878,18 @@ def test_isnumeric(self): # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 - values = ['A', '3', '¼', '★', '፸', '3', 'four'] + values = ["A", "3", "¼", "★", "፸", "3", "four"] s = Series(values) numeric_e = [False, True, True, False, True, True, False] decimal_e = [False, True, False, False, False, True, False] tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) - unicodes = ['A', '3', '¼', '★', '፸', '3', 'four'] + unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] - values = ['A', np.nan, '¼', '★', np.nan, '3', 'four'] + values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] s = Series(values) numeric_e = [False, np.nan, True, False, np.nan, True, False] decimal_e = [False, np.nan, False, False, np.nan, True, False] @@ -1862,68 +1897,70 @@ def test_isnumeric(self): tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) def test_get_dummies(self): - s = Series(['a|b', 'a|c', np.nan]) - result = s.str.get_dummies('|') - expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], - columns=list('abc')) + s = Series(["a|b", "a|c", np.nan]) + result = s.str.get_dummies("|") + expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc")) tm.assert_frame_equal(result, expected) - s = Series(['a;b', 'a', 7]) - result = s.str.get_dummies(';') - expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], - columns=list('7ab')) + s = Series(["a;b", "a", 7]) + result = s.str.get_dummies(";") + expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab")) tm.assert_frame_equal(result, expected) # GH9980, GH8028 - idx = Index(['a|b', 'a|c', 'b|c']) - result = idx.str.get_dummies('|') + idx = Index(["a|b", "a|c", "b|c"]) + result = idx.str.get_dummies("|") - expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1), - (0, 1, 1)], names=('a', 'b', 'c')) + expected = MultiIndex.from_tuples( + [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c") + ) tm.assert_index_equal(result, expected) def test_get_dummies_with_name_dummy(self): # GH 12180 # Dummies named 'name' should work as expected - s = Series(['a', 'b,name', 'b']) - result = s.str.get_dummies(',') - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], - columns=['a', 'b', 'name']) + s = Series(["a", "b,name", "b"]) + result = s.str.get_dummies(",") + expected = DataFrame( + [[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"] + ) tm.assert_frame_equal(result, expected) - idx = Index(['a|b', 'name|c', 'b|name']) - result = 
idx.str.get_dummies('|') + idx = Index(["a|b", "name|c", "b|name"]) + result = idx.str.get_dummies("|") - expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1), - (0, 1, 0, 1)], - names=('a', 'b', 'c', 'name')) + expected = MultiIndex.from_tuples( + [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + ) tm.assert_index_equal(result, expected) def test_join(self): - values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) - result = values.str.split('_').str.join('_') + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = values.str.split("_").str.join("_") tm.assert_series_equal(values, result) # mixed - mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + ) - rs = Series(mixed).str.split('_').str.join('_') - xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA]) + rs = Series(mixed).str.split("_").str.join("_") + xp = Series(["a_b", NA, "asdf_cas_asdf", NA, NA, "foo", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_len(self): - values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) + values = Series(["foo", "fooo", "fooooo", np.nan, "fooooooo"]) result = values.str.len() exp = values.map(lambda x: len(x) if notna(x) else NA) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(), - 'foo', None, 1, 2.]) + mixed = Series( + ["a_b", NA, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] + ) rs = Series(mixed).str.len() xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA]) @@ -1932,90 +1969,84 @@ def test_len(self): tm.assert_almost_equal(rs, xp) def test_findall(self): - values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD']) + values = Series(["fooBAD__barBAD", NA, "foo", "BAD"]) - result = values.str.findall('BAD[_]*') - exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']]) + result = values.str.findall("BAD[_]*") + exp = Series([["BAD__", "BAD"], NA, [], ["BAD"]]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(), - 'BAD', None, 1, 2.]) + mixed = Series( + ["fooBAD__barBAD", NA, "foo", True, datetime.today(), "BAD", None, 1, 2.0] + ) - rs = Series(mixed).str.findall('BAD[_]*') - xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA]) + rs = Series(mixed).str.findall("BAD[_]*") + xp = Series([["BAD__", "BAD"], NA, [], NA, NA, ["BAD"], NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_find(self): - values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX']) - result = values.str.find('EF') + values = Series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"]) + result = values.str.find("EF") tm.assert_series_equal(result, Series([4, 3, 1, 0, -1])) - expected = np.array([v.find('EF') for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF') + result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind('EF') for v in values.values], - dtype=np.int64) + expected = np.array([v.rfind("EF") for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.find('EF', 3) + result = values.str.find("EF", 3) tm.assert_series_equal(result, 
Series([4, 3, 7, 4, -1])) - expected = np.array([v.find('EF', 3) for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF', 3) + result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, 5, 7, 4, -1])) - expected = np.array([v.rfind('EF', 3) for v in values.values], - dtype=np.int64) + expected = np.array([v.rfind("EF", 3) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.find('EF', 3, 6) + result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array([v.find('EF', 3, 6) for v in values.values], - dtype=np.int64) + expected = np.array([v.find("EF", 3, 6) for v in values.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rfind('EF', 3, 6) + result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, 3, -1, 4, -1])) - expected = np.array([v.rfind('EF', 3, 6) for v in values.values], - dtype=np.int64) + expected = np.array( + [v.rfind("EF", 3, 6) for v in values.values], dtype=np.int64 + ) tm.assert_numpy_array_equal(result.values, expected) - with pytest.raises(TypeError, - match="expected a string object, not int"): + with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.find(0) - with pytest.raises(TypeError, - match="expected a string object, not int"): + with pytest.raises(TypeError, match="expected a string object, not int"): result = values.str.rfind(0) def test_find_nan(self): - values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX']) - result = values.str.find('EF') + values = Series(["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"]) + result = values.str.find("EF") tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1])) - result = values.str.rfind('EF') + result = values.str.rfind("EF") tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.find('EF', 3) + result = values.str.find("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.rfind('EF', 3) + result = values.str.rfind("EF", 3) tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1])) - result = values.str.find('EF', 3, 6) + result = values.str.find("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) - result = values.str.rfind('EF', 3, 6) + result = values.str.rfind("EF", 3, 6) tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1])) def test_index(self): - def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) @@ -2023,136 +2054,126 @@ def _check(result, expected): tm.assert_index_equal(result, expected) for klass in [Series, Index]: - s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF']) + s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) - result = s.str.index('EF') + result = s.str.index("EF") _check(result, klass([4, 3, 1, 0])) - expected = np.array([v.index('EF') for v in s.values], - dtype=np.int64) + expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('EF') + result = s.str.rindex("EF") _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex('EF') for v in s.values], - dtype=np.int64) + expected = 
np.array([v.rindex("EF") for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index('EF', 3) + result = s.str.index("EF", 3) _check(result, klass([4, 3, 7, 4])) - expected = np.array([v.index('EF', 3) for v in s.values], - dtype=np.int64) + expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('EF', 3) + result = s.str.rindex("EF", 3) _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex('EF', 3) for v in s.values], - dtype=np.int64) + expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index('E', 4, 8) + result = s.str.index("E", 4, 8) _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.index('E', 4, 8) for v in s.values], - dtype=np.int64) + expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex('E', 0, 5) + result = s.str.rindex("E", 0, 5) _check(result, klass([4, 3, 1, 4])) - expected = np.array([v.rindex('E', 0, 5) for v in s.values], - dtype=np.int64) + expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) tm.assert_numpy_array_equal(result.values, expected) with pytest.raises(ValueError, match="substring not found"): - result = s.str.index('DE') + result = s.str.index("DE") msg = "expected a string object, not int" with pytest.raises(TypeError, match=msg): result = s.str.index(0) # test with nan - s = Series(['abcb', 'ab', 'bcbe', np.nan]) - result = s.str.index('b') + s = Series(["abcb", "ab", "bcbe", np.nan]) + result = s.str.index("b") tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) - result = s.str.rindex('b') + result = s.str.rindex("b") tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) - result = values.str.pad(5, side='left') - exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) + result = values.str.pad(5, side="left") + exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='right') - exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + result = values.str.pad(5, side="right") + exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='both') - exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee']) + result = values.str.pad(5, side="both") + exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. - ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='left') - xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="left") + xp = Series([" a", NA, " b", NA, NA, " ee", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. 
- ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='right') - xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="right") + xp = Series(["a ", NA, "b ", NA, NA, "ee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) - mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2. - ]) + mixed = Series(["a", NA, "b", True, datetime.today(), "ee", None, 1, 2.0]) - rs = Series(mixed).str.pad(5, side='both') - xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA]) + rs = Series(mixed).str.pad(5, side="both") + xp = Series([" a ", NA, " b ", NA, NA, " ee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_pad_fillchar(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) - result = values.str.pad(5, side='left', fillchar='X') - exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee']) + result = values.str.pad(5, side="left", fillchar="X") + exp = Series(["XXXXa", "XXXXb", NA, "XXXXc", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='right', fillchar='X') - exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee']) + result = values.str.pad(5, side="right", fillchar="X") + exp = Series(["aXXXX", "bXXXX", NA, "cXXXX", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) - result = values.str.pad(5, side='both', fillchar='X') - exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee']) + result = values.str.pad(5, side="both", fillchar="X") + exp = Series(["XXaXX", "XXbXX", NA, "XXcXX", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) msg = "fillchar must be a character, not str" with pytest.raises(TypeError, match=msg): - result = values.str.pad(5, fillchar='XY') + result = values.str.pad(5, fillchar="XY") msg = "fillchar must be a character, not int" with pytest.raises(TypeError, match=msg): result = values.str.pad(5, fillchar=5) - @pytest.mark.parametrize("f", ['center', 'ljust', 'rjust', 'zfill', 'pad']) + @pytest.mark.parametrize("f", ["center", "ljust", "rjust", "zfill", "pad"]) def test_pad_width(self, f): # see gh-13598 - s = Series(['1', '22', 'a', 'bb']) + s = Series(["1", "22", "a", "bb"]) msg = "width must be of integer type, not*" with pytest.raises(TypeError, match=msg): - getattr(s.str, f)('f') + getattr(s.str, f)("f") def test_translate(self): - def _check(result, expected): if isinstance(result, Series): tm.assert_series_equal(result, expected) @@ -2160,77 +2181,70 @@ def _check(result, expected): tm.assert_index_equal(result, expected) for klass in [Series, Index]: - s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) - table = str.maketrans('abc', 'cde') + s = klass(["abcdefg", "abcc", "cdddfg", "cdefggg"]) + table = str.maketrans("abc", "cde") result = s.str.translate(table) - expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) + expected = klass(["cdedefg", "cdee", "edddfg", "edefggg"]) _check(result, expected) # Series with non-string values - s = Series(['a', 'b', 'c', 1.2]) - expected = Series(['c', 'd', 'e', np.nan]) + s = Series(["a", "b", "c", 1.2]) + expected = Series(["c", "d", "e", np.nan]) result = s.str.translate(table) tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): - values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) + values = Series(["a", "b", NA, "c", NA, "eeeeee"]) result = values.str.center(5) - exp = Series([' a ', ' b ', NA, ' c ', NA, 
'eeeeee']) + exp = Series([" a ", " b ", NA, " c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.ljust(5) - exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee']) + exp = Series(["a ", "b ", NA, "c ", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) result = values.str.rjust(5) - exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee']) + exp = Series([" a", " b", NA, " c", NA, "eeeeee"]) tm.assert_almost_equal(result, exp) # mixed - mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None, - 1, 2.]) + mixed = Series(["a", NA, "b", True, datetime.today(), "c", "eee", None, 1, 2.0]) rs = Series(mixed).str.center(5) - xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA - ]) + xp = Series([" a ", NA, " b ", NA, NA, " c ", " eee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.ljust(5) - xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA - ]) + xp = Series(["a ", NA, "b ", NA, NA, "c ", "eee ", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rjust(5) - xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA - ]) + xp = Series([" a", NA, " b", NA, NA, " c", " eee", NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_center_ljust_rjust_fillchar(self): - values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee']) + values = Series(["a", "bb", "cccc", "ddddd", "eeeeee"]) - result = values.str.center(5, fillchar='X') - expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee']) + result = values.str.center(5, fillchar="X") + expected = Series(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.center(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.center(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.ljust(5, fillchar='X') - expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee']) + result = values.str.ljust(5, fillchar="X") + expected = Series(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.ljust(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.ljust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - result = values.str.rjust(5, fillchar='X') - expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee']) + result = values.str.rjust(5, fillchar="X") + expected = Series(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]) tm.assert_series_equal(result, expected) - expected = np.array([v.rjust(5, 'X') for v in values.values], - dtype=np.object_) + expected = np.array([v.rjust(5, "X") for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) # If fillchar is not a charatter, normal str raises TypeError @@ -2239,13 +2253,13 @@ def test_center_ljust_rjust_fillchar(self): template = "fillchar must be a character, not {dtype}" with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.center(5, fillchar='XY') + values.str.center(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.ljust(5, fillchar='XY') + values.str.ljust(5, fillchar="XY") with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.rjust(5, fillchar='XY') + values.str.rjust(5, fillchar="XY") with 
pytest.raises(TypeError, match=template.format(dtype="int")): values.str.center(5, fillchar=1) @@ -2257,125 +2271,125 @@ def test_center_ljust_rjust_fillchar(self): values.str.rjust(5, fillchar=1) def test_zfill(self): - values = Series(['1', '22', 'aaa', '333', '45678']) + values = Series(["1", "22", "aaa", "333", "45678"]) result = values.str.zfill(5) - expected = Series(['00001', '00022', '00aaa', '00333', '45678']) + expected = Series(["00001", "00022", "00aaa", "00333", "45678"]) tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(5) for v in values.values], - dtype=np.object_) + expected = np.array([v.zfill(5) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) result = values.str.zfill(3) - expected = Series(['001', '022', 'aaa', '333', '45678']) + expected = Series(["001", "022", "aaa", "333", "45678"]) tm.assert_series_equal(result, expected) - expected = np.array([v.zfill(3) for v in values.values], - dtype=np.object_) + expected = np.array([v.zfill(3) for v in values.values], dtype=np.object_) tm.assert_numpy_array_equal(result.values, expected) - values = Series(['1', np.nan, 'aaa', np.nan, '45678']) + values = Series(["1", np.nan, "aaa", np.nan, "45678"]) result = values.str.zfill(5) - expected = Series(['00001', np.nan, '00aaa', np.nan, '45678']) + expected = Series(["00001", np.nan, "00aaa", np.nan, "45678"]) tm.assert_series_equal(result, expected) def test_split(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) - result = values.str.split('_') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + result = values.str.split("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) - result = values.str.split('__') + values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + result = values.str.split("__") tm.assert_series_equal(result, exp) - result = values.str.split('__', expand=False) + result = values.str.split("__", expand=False) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, - 2.]) - result = mixed.str.split('_') - exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA - ]) + mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.split("_") + exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - result = mixed.str.split('_', expand=False) + result = mixed.str.split("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split - values = Series(['a,b_c', 'c_d,e', NA, 'f,g,h']) - result = values.str.split('[,_]') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + result = values.str.split("[,_]") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) def test_rsplit(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) - result = values.str.rsplit('_') - exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + result = values.str.rsplit("_") + exp = Series([["a", "b", "c"], ["c", "d", "e"], NA, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) # 
more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) - result = values.str.rsplit('__') + values = Series(["a__b__c", "c__d__e", NA, "f__g__h"]) + result = values.str.rsplit("__") tm.assert_series_equal(result, exp) - result = values.str.rsplit('__', expand=False) + result = values.str.rsplit("__", expand=False) tm.assert_series_equal(result, exp) # mixed - mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, - 2.]) - result = mixed.str.rsplit('_') - exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA - ]) + mixed = Series(["a_b_c", NA, "d_e_f", True, datetime.today(), None, 1, 2.0]) + result = mixed.str.rsplit("_") + exp = Series([["a", "b", "c"], NA, ["d", "e", "f"], NA, NA, NA, NA, NA]) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) - result = mixed.str.rsplit('_', expand=False) + result = mixed.str.rsplit("_", expand=False) assert isinstance(result, Series) tm.assert_almost_equal(result, exp) # regex split is not supported by rsplit - values = Series(['a,b_c', 'c_d,e', NA, 'f,g,h']) - result = values.str.rsplit('[,_]') - exp = Series([['a,b_c'], ['c_d,e'], NA, ['f,g,h']]) + values = Series(["a,b_c", "c_d,e", NA, "f,g,h"]) + result = values.str.rsplit("[,_]") + exp = Series([["a,b_c"], ["c_d,e"], NA, ["f,g,h"]]) tm.assert_series_equal(result, exp) # setting max number of splits, make sure it's from reverse - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) - result = values.str.rsplit('_', n=1) - exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) + result = values.str.rsplit("_", n=1) + exp = Series([["a_b", "c"], ["c_d", "e"], NA, ["f_g", "h"]]) tm.assert_series_equal(result, exp) def test_split_blank_string(self): # expand blank split GH 20067 - values = Series([''], name='test') + values = Series([""], name="test") result = values.str.split(expand=True) exp = DataFrame([[]]) # NOTE: this is NOT an empty DataFrame tm.assert_frame_equal(result, exp) - values = Series(['a b c', 'a b', '', ' '], name='test') + values = Series(["a b c", "a b", "", " "], name="test") result = values.str.split(expand=True) - exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan], - [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]]) + exp = DataFrame( + [ + ["a", "b", "c"], + ["a", "b", np.nan], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + ] + ) tm.assert_frame_equal(result, exp) def test_split_noargs(self): # #1859 - s = Series(['Wes McKinney', 'Travis Oliphant']) + s = Series(["Wes McKinney", "Travis Oliphant"]) result = s.str.split() - expected = ['Travis', 'Oliphant'] + expected = ["Travis", "Oliphant"] assert result[1] == expected result = s.str.rsplit() assert result[1] == expected def test_split_maxsplit(self): # re.split 0, str.split -1 - s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk']) + s = Series(["bd asdf jfg", "kjasdflqw asdfnfk"]) result = s.str.split(n=-1) xp = s.str.split() @@ -2384,134 +2398,142 @@ def test_split_maxsplit(self): result = s.str.split(n=0) tm.assert_series_equal(result, xp) - xp = s.str.split('asdf') - result = s.str.split('asdf', n=0) + xp = s.str.split("asdf") + result = s.str.split("asdf", n=0) tm.assert_series_equal(result, xp) - result = s.str.split('asdf', n=-1) + result = s.str.split("asdf", n=-1) tm.assert_series_equal(result, xp) def test_split_no_pat_with_nonzero_n(self): - s = Series(['split once', 'split once too!']) + s = Series(["split once", "split once too!"]) result = s.str.split(n=1) - expected = 
Series({0: ['split', 'once'], 1: ['split', 'once too!']}) + expected = Series({0: ["split", "once"], 1: ["split", "once too!"]}) tm.assert_series_equal(expected, result, check_index_type=False) def test_split_to_dataframe(self): - s = Series(['nosplit', 'alsonosplit']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + s = Series(["nosplit", "alsonosplit"]) + result = s.str.split("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) - s = Series(['some_equal_splits', 'with_no_nans']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'one'], - 1: ['unequal', 'of'], - 2: ['splits', 'these'], - 3: [NA, 'things'], - 4: [NA, 'is'], - 5: [NA, 'not']}) + s = Series(["some_unequal_splits", "one_of_these_things_is_not"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + { + 0: ["some", "one"], + 1: ["unequal", "of"], + 2: ["splits", "these"], + 3: [NA, "things"], + 4: [NA, "is"], + 5: [NA, "not"], + } + ) tm.assert_frame_equal(result, exp) - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - result = s.str.split('_', expand=True) - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.split("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) tm.assert_frame_equal(result, exp) with pytest.raises(ValueError, match="expand must be"): - s.str.split('_', expand="not_a_boolean") + s.str.split("_", expand="not_a_boolean") def test_split_to_multiindex_expand(self): # https://github.com/pandas-dev/pandas/issues/23677 - idx = Index(['nosplit', 'alsonosplit', np.nan]) - result = idx.str.split('_', expand=True) + idx = Index(["nosplit", "alsonosplit", np.nan]) + result = idx.str.split("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None]) - result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), - ('with', 'no', 'nans'), - [np.nan, np.nan, np.nan], - [None, None, None]]) + idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "equal", "splits"), + ("with", "no", "nans"), + [np.nan, np.nan, np.nan], + [None, None, None], + ] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_unequal_splits', - 'one_of_these_things_is_not', - np.nan, None]) - result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', - NA, NA, NA), - ('one', 'of', 'these', - 'things', 'is', 'not'), - (np.nan, np.nan, np.nan, - np.nan, np.nan, np.nan), - (None, None, None, - None, None, None)]) + idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) + result = idx.str.split("_", expand=True) + exp = MultiIndex.from_tuples( + [ + ("some", "unequal", 
"splits", NA, NA, NA), + ("one", "of", "these", "things", "is", "not"), + (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), + (None, None, None, None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 6 with pytest.raises(ValueError, match="expand must be"): - idx.str.split('_', expand="not_a_boolean") + idx.str.split("_", expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): - s = Series(['nosplit', 'alsonosplit']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + s = Series(["nosplit", "alsonosplit"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}) tm.assert_frame_equal(result, exp) - s = Series(['some_equal_splits', 'with_no_nans']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + s = Series(["some_equal_splits", "with_no_nans"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - result = s.str.rsplit('_', expand=True, n=2) - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) + result = s.str.rsplit("_", expand=True, n=2) + exp = DataFrame( + {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]} + ) tm.assert_frame_equal(result, exp) - result = s.str.rsplit('_', expand=True, n=1) - exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']}) + result = s.str.rsplit("_", expand=True, n=1) + exp = DataFrame({0: ["some_equal", "with_no"], 1: ["splits", "nans"]}) tm.assert_frame_equal(result, exp) - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - result = s.str.rsplit('_', expand=True) - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) + s = Series(["some_splits", "with_index"], index=["preserve", "me"]) + result = s.str.rsplit("_", expand=True) + exp = DataFrame( + {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"] + ) tm.assert_frame_equal(result, exp) def test_rsplit_to_multiindex_expand(self): - idx = Index(['nosplit', 'alsonosplit']) - result = idx.str.rsplit('_', expand=True) + idx = Index(["nosplit", "alsonosplit"]) + result = idx.str.rsplit("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans']) - result = idx.str.rsplit('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( - 'with', 'no', 'nans')]) + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True) + exp = MultiIndex.from_tuples( + [("some", "equal", "splits"), ("with", "no", "nans")] + ) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_equal_splits', 'with_no_nans']) - result = idx.str.rsplit('_', expand=True, n=1) - exp = MultiIndex.from_tuples([('some_equal', 'splits'), - ('with_no', 'nans')]) + idx = Index(["some_equal_splits", "with_no_nans"]) + result = idx.str.rsplit("_", expand=True, n=1) + exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")]) tm.assert_index_equal(result, exp) assert result.nlevels == 2 @@ -2531,126 +2553,146 @@ def test_split_with_name(self): # GH 12617 # should preserve name - s = Series(['a,b', 'c,d'], name='xxx') - res = s.str.split(',') - exp = Series([['a', 'b'], ['c', 'd']], name='xxx') + s = 
Series(["a,b", "c,d"], name="xxx") + res = s.str.split(",") + exp = Series([["a", "b"], ["c", "d"]], name="xxx") tm.assert_series_equal(res, exp) - res = s.str.split(',', expand=True) - exp = DataFrame([['a', 'b'], ['c', 'd']]) + res = s.str.split(",", expand=True) + exp = DataFrame([["a", "b"], ["c", "d"]]) tm.assert_frame_equal(res, exp) - idx = Index(['a,b', 'c,d'], name='xxx') - res = idx.str.split(',') - exp = Index([['a', 'b'], ['c', 'd']], name='xxx') + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.split(",") + exp = Index([["a", "b"], ["c", "d"]], name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) - res = idx.str.split(',', expand=True) - exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')]) + res = idx.str.split(",", expand=True) + exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")]) assert res.nlevels == 2 tm.assert_index_equal(res, exp) def test_partition_series(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) - result = values.str.partition('_', expand=False) - exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, - ('f', '_', 'g_h'), None]) + result = values.str.partition("_", expand=False) + exp = Series( + [("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h"), None] + ) tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, - ('f_g', '_', 'h'), None]) + result = values.str.rpartition("_", expand=False) + exp = Series( + [("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h"), None] + ) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None]) - result = values.str.partition('__', expand=False) - exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, - ('f', '__', 'g__h'), None]) + values = Series(["a__b__c", "c__d__e", NA, "f__g__h", None]) + result = values.str.partition("__", expand=False) + exp = Series( + [("a", "__", "b__c"), ("c", "__", "d__e"), NA, ("f", "__", "g__h"), None] + ) tm.assert_series_equal(result, exp) - result = values.str.rpartition('__', expand=False) - exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, - ('f__g', '__', 'h'), None]) + result = values.str.rpartition("__", expand=False) + exp = Series( + [("a__b", "__", "c"), ("c__d", "__", "e"), NA, ("f__g", "__", "h"), None] + ) tm.assert_series_equal(result, exp) # None - values = Series(['a b c', 'c d e', NA, 'f g h', None]) + values = Series(["a b c", "c d e", NA, "f g h", None]) result = values.str.partition(expand=False) - exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, - ('f', ' ', 'g h'), None]) + exp = Series( + [("a", " ", "b c"), ("c", " ", "d e"), NA, ("f", " ", "g h"), None] + ) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) - exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, - ('f g', ' ', 'h'), None]) + exp = Series( + [("a b", " ", "c"), ("c d", " ", "e"), NA, ("f g", " ", "h"), None] + ) tm.assert_series_equal(result, exp) # Not split - values = Series(['abc', 'cde', NA, 'fgh', None]) - result = values.str.partition('_', expand=False) - exp = Series([('abc', '', ''), ('cde', '', ''), NA, - ('fgh', '', ''), None]) + values = Series(["abc", "cde", NA, "fgh", None]) + result = values.str.partition("_", expand=False) + exp = Series([("abc", "", ""), ("cde", "", ""), NA, ("fgh", "", ""), None]) 
tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, - ('', '', 'fgh'), None]) + result = values.str.rpartition("_", expand=False) + exp = Series([("", "", "abc"), ("", "", "cde"), NA, ("", "", "fgh"), None]) tm.assert_series_equal(result, exp) # unicode - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) - result = values.str.partition('_', expand=False) - exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), - NA, ('f', '_', 'g_h')]) + result = values.str.partition("_", expand=False) + exp = Series([("a", "_", "b_c"), ("c", "_", "d_e"), NA, ("f", "_", "g_h")]) tm.assert_series_equal(result, exp) - result = values.str.rpartition('_', expand=False) - exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), - NA, ('f_g', '_', 'h')]) + result = values.str.rpartition("_", expand=False) + exp = Series([("a_b", "_", "c"), ("c_d", "_", "e"), NA, ("f_g", "_", "h")]) tm.assert_series_equal(result, exp) # compare to standard lib - values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF']) - result = values.str.partition('_', expand=False).tolist() - assert result == [v.partition('_') for v in values] - result = values.str.rpartition('_', expand=False).tolist() - assert result == [v.rpartition('_') for v in values] + values = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"]) + result = values.str.partition("_", expand=False).tolist() + assert result == [v.partition("_") for v in values] + result = values.str.rpartition("_", expand=False).tolist() + assert result == [v.rpartition("_") for v in values] def test_partition_index(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None]) + values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None]) - result = values.str.partition('_', expand=False) - exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), np.nan, None])) + result = values.str.partition("_", expand=False) + exp = Index( + np.array( + [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None] + ) + ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 - result = values.str.rpartition('_', expand=False) - exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), np.nan, None])) + result = values.str.rpartition("_", expand=False) + exp = Index( + np.array( + [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None] + ) + ) tm.assert_index_equal(result, exp) assert result.nlevels == 1 - result = values.str.partition('_') - exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), (np.nan, np.nan, np.nan), - (None, None, None)]) + result = values.str.partition("_") + exp = Index( + [ + ("a", "_", "b_c"), + ("c", "_", "d_e"), + ("f", "_", "g_h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 - result = values.str.rpartition('_') - exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), (np.nan, np.nan, np.nan), - (None, None, None)]) + result = values.str.rpartition("_") + exp = Index( + [ + ("a_b", "_", "c"), + ("c_d", "_", "e"), + ("f_g", "_", "h"), + (np.nan, np.nan, np.nan), + (None, None, None), + ] + ) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 @@ -2658,209 +2700,222 @@ def test_partition_index(self): def 
test_partition_to_dataframe(self): # https://github.com/pandas-dev/pandas/issues/23558 - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) - result = values.str.partition('_') - exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + result = values.str.partition("_") + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) tm.assert_frame_equal(result, exp) - result = values.str.rpartition('_') - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['c', 'e', np.nan, 'h', None]}) + result = values.str.rpartition("_") + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) tm.assert_frame_equal(result, exp) - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) - result = values.str.partition('_', expand=True) - exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h", None]) + result = values.str.partition("_", expand=True) + exp = DataFrame( + { + 0: ["a", "c", np.nan, "f", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["b_c", "d_e", np.nan, "g_h", None], + } + ) tm.assert_frame_equal(result, exp) - result = values.str.rpartition('_', expand=True) - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], - 1: ['_', '_', np.nan, '_', None], - 2: ['c', 'e', np.nan, 'h', None]}) + result = values.str.rpartition("_", expand=True) + exp = DataFrame( + { + 0: ["a_b", "c_d", np.nan, "f_g", None], + 1: ["_", "_", np.nan, "_", None], + 2: ["c", "e", np.nan, "h", None], + } + ) tm.assert_frame_equal(result, exp) def test_partition_with_name(self): # GH 12617 - s = Series(['a,b', 'c,d'], name='xxx') - res = s.str.partition(',') - exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']}) + s = Series(["a,b", "c,d"], name="xxx") + res = s.str.partition(",") + exp = DataFrame({0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}) tm.assert_frame_equal(res, exp) # should preserve name - res = s.str.partition(',', expand=False) - exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx') + res = s.str.partition(",", expand=False) + exp = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx") tm.assert_series_equal(res, exp) - idx = Index(['a,b', 'c,d'], name='xxx') - res = idx.str.partition(',') - exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')]) + idx = Index(["a,b", "c,d"], name="xxx") + res = idx.str.partition(",") + exp = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")]) assert res.nlevels == 3 tm.assert_index_equal(res, exp) # should preserve name - res = idx.str.partition(',', expand=False) - exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx') + res = idx.str.partition(",", expand=False) + exp = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx") assert res.nlevels == 1 tm.assert_index_equal(res, exp) def test_partition_deprecation(self): # GH 22676; depr kwarg "pat" in favor of "sep" - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", NA, "f_g_h"]) # str.partition # using sep -> no warning - expected = values.str.partition(sep='_') + expected = values.str.partition(sep="_") with tm.assert_produces_warning(FutureWarning): - result = 
values.str.partition(pat='_') + result = values.str.partition(pat="_") tm.assert_frame_equal(result, expected) # str.rpartition # using sep -> no warning - expected = values.str.rpartition(sep='_') + expected = values.str.rpartition(sep="_") with tm.assert_produces_warning(FutureWarning): - result = values.str.rpartition(pat='_') + result = values.str.rpartition(pat="_") tm.assert_frame_equal(result, expected) def test_pipe_failures(self): # #2119 - s = Series(['A|B|C']) + s = Series(["A|B|C"]) - result = s.str.split('|') - exp = Series([['A', 'B', 'C']]) + result = s.str.split("|") + exp = Series([["A", "B", "C"]]) tm.assert_series_equal(result, exp) - result = s.str.replace('|', ' ') - exp = Series(['A B C']) + result = s.str.replace("|", " ") + exp = Series(["A B C"]) tm.assert_series_equal(result, exp) def test_slice(self): - values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux']) + values = Series(["aafootwo", "aabartwo", NA, "aabazqux"]) result = values.str.slice(2, 5) - exp = Series(['foo', 'bar', NA, 'baz']) + exp = Series(["foo", "bar", NA, "baz"]) tm.assert_series_equal(result, exp) - for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), - (3, 0, -1)]: + for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: try: result = values.str.slice(start, stop, step) - expected = Series([s[start:stop:step] if not isna(s) else NA - for s in values]) + expected = Series( + [s[start:stop:step] if not isna(s) else NA for s in values] + ) tm.assert_series_equal(result, expected) except IndexError: - print('failed on %s:%s:%s' % (start, stop, step)) + print("failed on %s:%s:%s" % (start, stop, step)) raise # mixed - mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(), - None, 1, 2.]) + mixed = Series( + ["aafootwo", NA, "aabartwo", True, datetime.today(), None, 1, 2.0] + ) rs = Series(mixed).str.slice(2, 5) - xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA]) + xp = Series(["foo", NA, "bar", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.slice(2, 5, -1) - xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA]) + xp = Series(["oof", NA, "rab", NA, NA, NA, NA, NA]) def test_slice_replace(self): - values = Series(['short', 'a bit longer', 'evenlongerthanthat', '', NA - ]) + values = Series(["short", "a bit longer", "evenlongerthanthat", "", NA]) - exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA]) + exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", NA]) result = values.str.slice_replace(2, 3) tm.assert_series_equal(result, exp) - exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA]) - result = values.str.slice_replace(2, 3, 'z') + exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 3, "z") tm.assert_series_equal(result, exp) - exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA - ]) - result = values.str.slice_replace(2, 2, 'z') + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 2, "z") tm.assert_series_equal(result, exp) - exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA - ]) - result = values.str.slice_replace(2, 1, 'z') + exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", NA]) + result = values.str.slice_replace(2, 1, "z") tm.assert_series_equal(result, exp) - exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA]) - result = 
values.str.slice_replace(-1, None, 'z') + exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", NA]) + result = values.str.slice_replace(-1, None, "z") tm.assert_series_equal(result, exp) - exp = Series(['zrt', 'zer', 'zat', 'z', NA]) - result = values.str.slice_replace(None, -2, 'z') + exp = Series(["zrt", "zer", "zat", "z", NA]) + result = values.str.slice_replace(None, -2, "z") tm.assert_series_equal(result, exp) - exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA]) - result = values.str.slice_replace(6, 8, 'z') + exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", NA]) + result = values.str.slice_replace(6, 8, "z") tm.assert_series_equal(result, exp) - exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA]) - result = values.str.slice_replace(-10, 3, 'z') + exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", NA]) + result = values.str.slice_replace(-10, 3, "z") tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip(self): - values = Series([' aa ', ' bb \n', NA, 'cc ']) + values = Series([" aa ", " bb \n", NA, "cc "]) result = values.str.strip() - exp = Series(['aa', 'bb', NA, 'cc']) + exp = Series(["aa", "bb", NA, "cc"]) tm.assert_series_equal(result, exp) result = values.str.lstrip() - exp = Series(['aa ', 'bb \n', NA, 'cc ']) + exp = Series(["aa ", "bb \n", NA, "cc "]) tm.assert_series_equal(result, exp) result = values.str.rstrip() - exp = Series([' aa', ' bb', NA, 'cc']) + exp = Series([" aa", " bb", NA, "cc"]) tm.assert_series_equal(result, exp) def test_strip_lstrip_rstrip_mixed(self): # mixed - mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None, - 1, 2.]) + mixed = Series([" aa ", NA, " bb \t\n", True, datetime.today(), None, 1, 2.0]) rs = Series(mixed).str.strip() - xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA]) + xp = Series(["aa", NA, "bb", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.lstrip() - xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA]) + xp = Series(["aa ", NA, "bb \t\n", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) rs = Series(mixed).str.rstrip() - xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA]) + xp = Series([" aa", NA, " bb", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) def test_strip_lstrip_rstrip_args(self): - values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx']) + values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"]) - rs = values.str.strip('x') - xp = Series(['ABC', ' BNSD', 'LDFJH ']) + rs = values.str.strip("x") + xp = Series(["ABC", " BNSD", "LDFJH "]) assert_series_equal(rs, xp) - rs = values.str.lstrip('x') - xp = Series(['ABCxx', ' BNSD', 'LDFJH xx']) + rs = values.str.lstrip("x") + xp = Series(["ABCxx", " BNSD", "LDFJH xx"]) assert_series_equal(rs, xp) - rs = values.str.rstrip('x') - xp = Series(['xxABC', 'xx BNSD', 'LDFJH ']) + rs = values.str.rstrip("x") + xp = Series(["xxABC", "xx BNSD", "LDFJH "]) assert_series_equal(rs, xp) def test_wrap(self): @@ -2868,69 +2923,87 @@ def test_wrap(self): # two words greater than width, one word less than width, one word # equal to width, one word greater than width, multiple tokens with # trailing whitespace equal to width - values = Series(['hello world', 'hello world!', 'hello world!!', - 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdefa', - 'ab ab ab ab ', 'ab ab ab ab a', '\t']) + values = Series( + [ + "hello world", + "hello world!", + "hello world!!", + "abcdefabcde", + 
"abcdefabcdef", + "abcdefabcdefa", + "ab ab ab ab ", + "ab ab ab ab a", + "\t", + ] + ) # expected values - xp = Series(['hello world', 'hello world!', 'hello\nworld!!', - 'abcdefabcde', 'abcdefabcdef', 'abcdefabcdef\na', - 'ab ab ab ab', 'ab ab ab ab\na', '']) + xp = Series( + [ + "hello world", + "hello world!", + "hello\nworld!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdef\na", + "ab ab ab ab", + "ab ab ab ab\na", + "", + ] + ) rs = values.str.wrap(12, break_long_words=True) assert_series_equal(rs, xp) # test with pre and post whitespace (non-unicode), NaN, and non-ascii # Unicode - values = Series([' pre ', np.nan, '\xac\u20ac\U00008000 abadcafe']) - xp = Series([' pre', NA, '\xac\u20ac\U00008000 ab\nadcafe']) + values = Series([" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]) + xp = Series([" pre", NA, "\xac\u20ac\U00008000 ab\nadcafe"]) rs = values.str.wrap(6) assert_series_equal(rs, xp) def test_get(self): - values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) + values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.split('_').str.get(1) - expected = Series(['b', 'd', np.nan, 'g']) + result = values.str.split("_").str.get(1) + expected = Series(["b", "d", np.nan, "g"]) tm.assert_series_equal(result, expected) # mixed - mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1, - 2.]) + mixed = Series(["a_b_c", NA, "c_d_e", True, datetime.today(), None, 1, 2.0]) - rs = Series(mixed).str.split('_').str.get(1) - xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA]) + rs = Series(mixed).str.split("_").str.get(1) + xp = Series(["b", NA, "d", NA, NA, NA, NA, NA]) assert isinstance(rs, Series) tm.assert_almost_equal(rs, xp) # bounds testing - values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12']) + values = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"]) # positive index - result = values.str.split('_').str.get(2) - expected = Series(['3', '8', np.nan]) + result = values.str.split("_").str.get(2) + expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) # negative index - result = values.str.split('_').str.get(-3) - expected = Series(['3', '8', np.nan]) + result = values.str.split("_").str.get(-3) + expected = Series(["3", "8", np.nan]) tm.assert_series_equal(result, expected) def test_get_complex(self): # GH 20671, getting value not in dict raising `KeyError` - values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, - {1: 'a', 2: 'b', 3: 'c'}]) + values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]) result = values.str.get(1) - expected = Series([2, 2, np.nan, 'a']) + expected = Series([2, 2, np.nan, "a"]) tm.assert_series_equal(result, expected) result = values.str.get(-1) expected = Series([3, 3, np.nan, np.nan]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('to_type', [tuple, list, np.array]) + @pytest.mark.parametrize("to_type", [tuple, list, np.array]) def test_get_complex_nested(self, to_type): values = Series([to_type([to_type([1, 2])])]) @@ -2944,77 +3017,106 @@ def test_get_complex_nested(self, to_type): def test_contains_moar(self): # PR #1179 - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, - 'CABA', 'dog', 'cat']) + s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) - result = s.str.contains('a') - expected = Series([False, False, False, True, True, False, np.nan, - False, False, True]) + result = s.str.contains("a") + expected = Series( + [False, False, False, True, True, False, np.nan, False, False, True] + ) assert_series_equal(result, expected) - result 
= s.str.contains('a', case=False) - expected = Series([True, False, False, True, True, False, np.nan, True, - False, True]) + result = s.str.contains("a", case=False) + expected = Series( + [True, False, False, True, True, False, np.nan, True, False, True] + ) assert_series_equal(result, expected) - result = s.str.contains('Aa') - expected = Series([False, False, False, True, False, False, np.nan, - False, False, False]) + result = s.str.contains("Aa") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) assert_series_equal(result, expected) - result = s.str.contains('ba') - expected = Series([False, False, False, True, False, False, np.nan, - False, False, False]) + result = s.str.contains("ba") + expected = Series( + [False, False, False, True, False, False, np.nan, False, False, False] + ) assert_series_equal(result, expected) - result = s.str.contains('ba', case=False) - expected = Series([False, False, False, True, True, False, np.nan, - True, False, False]) + result = s.str.contains("ba", case=False) + expected = Series( + [False, False, False, True, True, False, np.nan, True, False, False] + ) assert_series_equal(result, expected) def test_contains_nan(self): # PR #14171 s = Series([np.nan, np.nan, np.nan], dtype=np.object_) - result = s.str.contains('foo', na=False) + result = s.str.contains("foo", na=False) expected = Series([False, False, False], dtype=np.bool_) assert_series_equal(result, expected) - result = s.str.contains('foo', na=True) + result = s.str.contains("foo", na=True) expected = Series([True, True, True], dtype=np.bool_) assert_series_equal(result, expected) - result = s.str.contains('foo', na="foo") + result = s.str.contains("foo", na="foo") expected = Series(["foo", "foo", "foo"], dtype=np.object_) assert_series_equal(result, expected) - result = s.str.contains('foo') + result = s.str.contains("foo") expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) assert_series_equal(result, expected) def test_replace_moar(self): # PR #1179 - s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', - 'dog', 'cat']) + s = Series(["A", "B", "C", "Aaba", "Baca", "", NA, "CABA", "dog", "cat"]) - result = s.str.replace('A', 'YYY') - expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA, - 'CYYYBYYY', 'dog', 'cat']) + result = s.str.replace("A", "YYY") + expected = Series( + ["YYY", "B", "C", "YYYaba", "Baca", "", NA, "CYYYBYYY", "dog", "cat"] + ) assert_series_equal(result, expected) - result = s.str.replace('A', 'YYY', case=False) - expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA, - 'CYYYBYYY', 'dog', 'cYYYt']) + result = s.str.replace("A", "YYY", case=False) + expected = Series( + [ + "YYY", + "B", + "C", + "YYYYYYbYYY", + "BYYYcYYY", + "", + NA, + "CYYYBYYY", + "dog", + "cYYYt", + ] + ) assert_series_equal(result, expected) - result = s.str.replace('^.a|dog', 'XX-XX ', case=False) - expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA, - 'XX-XX BA', 'XX-XX ', 'XX-XX t']) + result = s.str.replace("^.a|dog", "XX-XX ", case=False) + expected = Series( + [ + "A", + "B", + "C", + "XX-XX ba", + "XX-XX ca", + "", + NA, + "XX-XX BA", + "XX-XX ", + "XX-XX t", + ] + ) assert_series_equal(result, expected) def test_string_slice_get_syntax(self): - s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY', - 'dog', 'cYYYt']) + s = Series( + ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", NA, "CYYYBYYY", "dog", "cYYYt"] + ) result = s.str[0] expected = s.str.get(0) @@ -3029,35 +3131,37 @@ 
def test_string_slice_get_syntax(self): assert_series_equal(result, expected) def test_string_slice_out_of_bounds(self): - s = Series([(1, 2), (1, ), (3, 4, 5)]) + s = Series([(1, 2), (1,), (3, 4, 5)]) result = s.str[1] expected = Series([2, np.nan, 4]) assert_series_equal(result, expected) - s = Series(['foo', 'b', 'ba']) + s = Series(["foo", "b", "ba"]) result = s.str[1] - expected = Series(['o', np.nan, 'a']) + expected = Series(["o", np.nan, "a"]) assert_series_equal(result, expected) def test_match_findall_flags(self): - data = {'Dave': 'dave@google.com', - 'Steve': 'steve@gmail.com', - 'Rob': 'rob@gmail.com', - 'Wes': np.nan} + data = { + "Dave": "dave@google.com", + "Steve": "steve@gmail.com", + "Rob": "rob@gmail.com", + "Wes": np.nan, + } data = Series(data) - pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})' + pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) - assert result.iloc[0].tolist() == ['dave', 'google', 'com'] + assert result.iloc[0].tolist() == ["dave", "google", "com"] result = data.str.match(pat, flags=re.IGNORECASE) assert result[0] result = data.str.findall(pat, flags=re.IGNORECASE) - assert result[0][0] == ('dave', 'google', 'com') + assert result[0][0] == ("dave", "google", "com") result = data.str.count(pat, flags=re.IGNORECASE) assert result[0] == 1 @@ -3067,73 +3171,80 @@ def test_match_findall_flags(self): assert result[0] def test_encode_decode(self): - base = Series(['a', 'b', 'a\xe4']) - series = base.str.encode('utf-8') + base = Series(["a", "b", "a\xe4"]) + series = base.str.encode("utf-8") - f = lambda x: x.decode('utf-8') - result = series.str.decode('utf-8') + f = lambda x: x.decode("utf-8") + result = series.str.decode("utf-8") exp = series.map(f) tm.assert_series_equal(result, exp) def test_encode_decode_errors(self): - encodeBase = Series(['a', 'b', 'a\x9d']) + encodeBase = Series(["a", "b", "a\x9d"]) - msg = (r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to ") + msg = ( + r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to " + ) with pytest.raises(UnicodeEncodeError, match=msg): - encodeBase.str.encode('cp1252') + encodeBase.str.encode("cp1252") - f = lambda x: x.encode('cp1252', 'ignore') - result = encodeBase.str.encode('cp1252', 'ignore') + f = lambda x: x.encode("cp1252", "ignore") + result = encodeBase.str.encode("cp1252", "ignore") exp = encodeBase.map(f) tm.assert_series_equal(result, exp) - decodeBase = Series([b'a', b'b', b'a\x9d']) + decodeBase = Series([b"a", b"b", b"a\x9d"]) - msg = ("'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to ") + msg = ( + "'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to " + ) with pytest.raises(UnicodeDecodeError, match=msg): - decodeBase.str.decode('cp1252') + decodeBase.str.decode("cp1252") - f = lambda x: x.decode('cp1252', 'ignore') - result = decodeBase.str.decode('cp1252', 'ignore') + f = lambda x: x.decode("cp1252", "ignore") + result = decodeBase.str.decode("cp1252", "ignore") exp = decodeBase.map(f) tm.assert_series_equal(result, exp) def test_normalize(self): - values = ['ABC', 'ABC', '123', np.nan, 'アイエ'] - s = Series(values, index=['a', 'b', 'c', 'd', 'e']) + values = ["ABC", "ABC", "123", np.nan, "アイエ"] + s = Series(values, index=["a", "b", "c", "d", "e"]) - normed = ['ABC', 'ABC', '123', np.nan, 'アイエ'] - expected = Series(normed, index=['a', 'b', 'c', 'd', 'e']) + normed = ["ABC", "ABC", "123", 
np.nan, "アイエ"] + expected = Series(normed, index=["a", "b", "c", "d", "e"]) - result = s.str.normalize('NFKC') + result = s.str.normalize("NFKC") tm.assert_series_equal(result, expected) - expected = Series(['ABC', 'ABC', '123', np.nan, 'アイエ'], - index=['a', 'b', 'c', 'd', 'e']) + expected = Series( + ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] + ) - result = s.str.normalize('NFC') + result = s.str.normalize("NFC") tm.assert_series_equal(result, expected) with pytest.raises(ValueError, match="invalid normalization form"): - s.str.normalize('xxx') + s.str.normalize("xxx") - s = Index(['ABC', '123', 'アイエ']) - expected = Index(['ABC', '123', 'アイエ']) - result = s.str.normalize('NFKC') + s = Index(["ABC", "123", "アイエ"]) + expected = Index(["ABC", "123", "アイエ"]) + result = s.str.normalize("NFKC") tm.assert_index_equal(result, expected) def test_index_str_accessor_visibility(self): from pandas.core.strings import StringMethods - cases = [(['a', 'b'], 'string'), - (['a', 'b', 1], 'mixed-integer'), - (['a', 'b', 1.3], 'mixed'), - (['a', 'b', 1.3, 1], 'mixed-integer'), - (['aa', datetime(2011, 1, 1)], 'mixed')] + cases = [ + (["a", "b"], "string"), + (["a", "b", 1], "mixed-integer"), + (["a", "b", 1.3], "mixed"), + (["a", "b", 1.3, 1], "mixed-integer"), + (["aa", datetime(2011, 1, 1)], "mixed"), + ] for values, tp in cases: idx = Index(values) assert isinstance(Series(values).str, StringMethods) @@ -3146,12 +3257,14 @@ def test_index_str_accessor_visibility(self): assert isinstance(idx.str, StringMethods) assert idx.inferred_type == tp - cases = [([1, np.nan], 'floating'), - ([datetime(2011, 1, 1)], 'datetime64'), - ([timedelta(1)], 'timedelta64')] + cases = [ + ([1, np.nan], "floating"), + ([datetime(2011, 1, 1)], "datetime64"), + ([timedelta(1)], "timedelta64"), + ] for values, tp in cases: idx = Index(values) - message = 'Can only use .str accessor with string values' + message = "Can only use .str accessor with string values" with pytest.raises(AttributeError, match=message): Series(values).str with pytest.raises(AttributeError, match=message): @@ -3159,30 +3272,28 @@ def test_index_str_accessor_visibility(self): assert idx.inferred_type == tp # MultiIndex has mixed dtype, but not allow to use accessor - idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')]) - assert idx.inferred_type == 'mixed' - message = 'Can only use .str accessor with Index, not MultiIndex' + idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) + assert idx.inferred_type == "mixed" + message = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=message): idx.str def test_str_accessor_no_new_attributes(self): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(list('aabbcde')) - with pytest.raises(AttributeError, - match="You cannot add any new attribute"): + s = Series(list("aabbcde")) + with pytest.raises(AttributeError, match="You cannot add any new attribute"): s.str.xlabel = "a" def test_method_on_bytes(self): - lhs = Series(np.array(list('abc'), 'S1').astype(object)) - rhs = Series(np.array(list('def'), 'S1').astype(object)) - with pytest.raises(TypeError, - match="Cannot use .str.cat with values of.*"): + lhs = Series(np.array(list("abc"), "S1").astype(object)) + rhs = Series(np.array(list("def"), "S1").astype(object)) + with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): lhs.str.cat(rhs) def test_casefold(self): # GH25405 - expected = Series(['ss', NA, 'case', 'ssd']) - s = Series(['ß', NA, 'case', 'ßd']) + 
expected = Series(["ss", NA, "case", "ssd"]) + s = Series(["ß", NA, "case", "ßd"]) result = s.str.casefold() tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index afcc90a1c8e74..d2a9e1dc94bb5 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -18,48 +18,52 @@ def writeable(request): # Check that take_nd works both with writeable arrays # (in which case fast typed memory-views implementation) # and read-only arrays alike. -@pytest.fixture(params=[ - (np.float64, True), - (np.float32, True), - (np.uint64, False), - (np.uint32, False), - (np.uint16, False), - (np.uint8, False), - (np.int64, False), - (np.int32, False), - (np.int16, False), - (np.int8, False), - (np.object_, True), - (np.bool, False), -]) +@pytest.fixture( + params=[ + (np.float64, True), + (np.float32, True), + (np.uint64, False), + (np.uint32, False), + (np.uint16, False), + (np.uint8, False), + (np.int64, False), + (np.int32, False), + (np.int16, False), + (np.int8, False), + (np.object_, True), + (np.bool, False), + ] +) def dtype_can_hold_na(request): return request.param -@pytest.fixture(params=[ - (np.int8, np.int16(127), np.int8), - (np.int8, np.int16(128), np.int16), - (np.int32, 1, np.int32), - (np.int32, 2.0, np.float64), - (np.int32, 3.0 + 4.0j, np.complex128), - (np.int32, True, np.object_), - (np.int32, "", np.object_), - (np.float64, 1, np.float64), - (np.float64, 2.0, np.float64), - (np.float64, 3.0 + 4.0j, np.complex128), - (np.float64, True, np.object_), - (np.float64, "", np.object_), - (np.complex128, 1, np.complex128), - (np.complex128, 2.0, np.complex128), - (np.complex128, 3.0 + 4.0j, np.complex128), - (np.complex128, True, np.object_), - (np.complex128, "", np.object_), - (np.bool_, 1, np.object_), - (np.bool_, 2.0, np.object_), - (np.bool_, 3.0 + 4.0j, np.object_), - (np.bool_, True, np.bool_), - (np.bool_, '', np.object_), -]) +@pytest.fixture( + params=[ + (np.int8, np.int16(127), np.int8), + (np.int8, np.int16(128), np.int16), + (np.int32, 1, np.int32), + (np.int32, 2.0, np.float64), + (np.int32, 3.0 + 4.0j, np.complex128), + (np.int32, True, np.object_), + (np.int32, "", np.object_), + (np.float64, 1, np.float64), + (np.float64, 2.0, np.float64), + (np.float64, 3.0 + 4.0j, np.complex128), + (np.float64, True, np.object_), + (np.float64, "", np.object_), + (np.complex128, 1, np.complex128), + (np.complex128, 2.0, np.complex128), + (np.complex128, 3.0 + 4.0j, np.complex128), + (np.complex128, True, np.object_), + (np.complex128, "", np.object_), + (np.bool_, 1, np.object_), + (np.bool_, 2.0, np.object_), + (np.bool_, 3.0 + 4.0j, np.object_), + (np.bool_, True, np.bool_), + (np.bool_, "", np.object_), + ] +) def dtype_fill_out_dtype(request): return request.param @@ -102,15 +106,15 @@ def test_1d_fill_nonna(self, dtype_fill_out_dtype): indexer = [2, 1, 0, -1] result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2]] == data[[2, 1, 0]]).all()) - assert (result[3] == fill_value) - assert (result.dtype == out_dtype) + assert (result[[0, 1, 2]] == data[[2, 1, 0]]).all() + assert result[3] == fill_value + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] result = algos.take_1d(data, indexer, fill_value=fill_value) - assert ((result[[0, 1, 2, 3]] == data[indexer]).all()) - assert (result.dtype == dtype) + assert (result[[0, 1, 2, 3]] == data[indexer]).all() + assert result.dtype == dtype def test_2d_with_out(self, dtype_can_hold_na, writeable): dtype, can_hold_na = dtype_can_hold_na @@ 
-157,28 +161,24 @@ def test_2d_fill_nonna(self, dtype_fill_out_dtype): data = np.random.randint(0, 2, (5, 3)).astype(dtype) indexer = [2, 1, 0, -1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()) - assert ((result[3, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :] == data[[2, 1, 0], :]).all() + assert (result[3, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()) - assert ((result[:, 3] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all() + assert (result[:, 3] == fill_value).all() + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :] == data[indexer, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2, 3], :] == data[indexer, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3]] == data[:, indexer]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3]] == data[:, indexer]).all() + assert result.dtype == dtype def test_3d_with_out(self, dtype_can_hold_na): dtype, can_hold_na = dtype_can_hold_na @@ -237,39 +237,33 @@ def test_3d_fill_nonna(self, dtype_fill_out_dtype): data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype) indexer = [2, 1, 0, -1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()) - assert ((result[3, :, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert (result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all() + assert (result[3, :, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()) - assert ((result[:, 3, :] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all() + assert (result[:, 3, :] == fill_value).all() + assert result.dtype == out_dtype - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()) - assert ((result[:, :, 3] == fill_value).all()) - assert (result.dtype == out_dtype) + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all() + assert (result[:, :, 3] == fill_value).all() + assert result.dtype == out_dtype indexer = [2, 1, 0, 1] - result = algos.take_nd(data, indexer, axis=0, - fill_value=fill_value) - assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value) + assert 
(result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=1, - fill_value=fill_value) - assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value) + assert (result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all() + assert result.dtype == dtype - result = algos.take_nd(data, indexer, axis=2, - fill_value=fill_value) - assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()) - assert (result.dtype == dtype) + result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value) + assert (result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all() + assert result.dtype == dtype def test_1d_other_dtypes(self): arr = np.random.randn(10).astype(np.float32) @@ -336,7 +330,7 @@ def test_2d_float32(self): tm.assert_almost_equal(result, expected) # this now accepts a float32! # test with float64 out buffer - out = np.empty((len(indexer), arr.shape[1]), dtype='float32') + out = np.empty((len(indexer), arr.shape[1]), dtype="float32") algos.take_nd(arr, indexer, out=out) # it works! # axis=1 @@ -352,7 +346,7 @@ def test_2d_float32(self): def test_2d_datetime64(self): # 2005/01/01 - 2006/01/01 arr = np.random.randint(11045376, 11360736, (5, 3)) * 100000000000 - arr = arr.view(dtype='datetime64[ns]') + arr = arr.view(dtype="datetime64[ns]") indexer = [0, 2, -1, 1, -1] # axis=0 @@ -365,11 +359,11 @@ def test_2d_datetime64(self): expected.view(np.int64)[[2, 4], :] = iNaT tm.assert_almost_equal(result, expected) - result = algos.take_nd(arr, indexer, axis=0, - fill_value=datetime(2007, 1, 1)) + result = algos.take_nd(arr, indexer, axis=0, fill_value=datetime(2007, 1, 1)) result2 = np.empty_like(result) - algos.take_nd(arr, indexer, out=result2, axis=0, - fill_value=datetime(2007, 1, 1)) + algos.take_nd( + arr, indexer, out=result2, axis=0, fill_value=datetime(2007, 1, 1) + ) tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=0) @@ -386,11 +380,11 @@ def test_2d_datetime64(self): expected.view(np.int64)[:, [2, 4]] = iNaT tm.assert_almost_equal(result, expected) - result = algos.take_nd(arr, indexer, axis=1, - fill_value=datetime(2007, 1, 1)) + result = algos.take_nd(arr, indexer, axis=1, fill_value=datetime(2007, 1, 1)) result2 = np.empty_like(result) - algos.take_nd(arr, indexer, out=result2, axis=1, - fill_value=datetime(2007, 1, 1)) + algos.take_nd( + arr, indexer, out=result2, axis=1, fill_value=datetime(2007, 1, 1) + ) tm.assert_almost_equal(result, result2) expected = arr.take(indexer, axis=1) @@ -415,15 +409,13 @@ def test_take_axis_1(self): tm.assert_numpy_array_equal(result, expected) # allow_fill=True - result = algos.take(arr, [0, -1], axis=1, allow_fill=True, - fill_value=0) + result = algos.take(arr, [0, -1], axis=1, allow_fill=True, fill_value=0) expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]]) tm.assert_numpy_array_equal(result, expected) # GH#26976 make sure we validate along the correct axis with pytest.raises(IndexError, match="indices are out-of-bounds"): - algos.take(arr, [0, 3], axis=1, allow_fill=True, - fill_value=0) + algos.take(arr, [0, 3], axis=1, allow_fill=True, fill_value=0) class TestExtensionTake: @@ -447,7 +439,7 @@ def test_bounds_check_small(self): expected = np.array([1, 3, 2], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('allow_fill', [True, False]) + @pytest.mark.parametrize("allow_fill", [True, False]) def 
test_take_empty(self, allow_fill): arr = np.array([], dtype=np.int64) # empty take is ok @@ -458,9 +450,8 @@ def test_take_empty(self, allow_fill): algos.take(arr, [0], allow_fill=allow_fill) def test_take_na_empty(self): - result = algos.take(np.array([]), [-1, -1], allow_fill=True, - fill_value=0.0) - expected = np.array([0., 0.]) + result = algos.take(np.array([]), [-1, -1], allow_fill=True, fill_value=0.0) + expected = np.array([0.0, 0.0]) tm.assert_numpy_array_equal(result, expected) def test_take_coerces_list(self): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 8604acb1bd2b2..27700d778df19 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -12,8 +12,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import ( - DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna) +from pandas import DataFrame, Index, Series, Timestamp, bdate_range, concat, isna, notna from pandas.core.base import DataError, SpecificationError from pandas.core.sorting import safe_sort import pandas.core.window as rwindow @@ -36,20 +35,30 @@ def raw(request): return request.param -@pytest.fixture(params=['triang', 'blackman', 'hamming', 'bartlett', 'bohman', - 'blackmanharris', 'nuttall', 'barthann']) +@pytest.fixture( + params=[ + "triang", + "blackman", + "hamming", + "bartlett", + "bohman", + "blackmanharris", + "nuttall", + "barthann", + ] +) def win_types(request): return request.param -@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', - 'exponential']) +@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"]) def win_types_special(request): return request.param -@pytest.fixture(params=["sum", "mean", "median", "max", "min", - "var", "std", "kurt", "skew"]) +@pytest.fixture( + params=["sum", "mean", "median", "max", "min", "var", "std", "kurt", "skew"] +) def arithmetic_win_operators(request): return request.param @@ -66,12 +75,10 @@ def _create_data(self): self.arr = arr self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) - self.frame = DataFrame(randn(N, K), index=self.rng, - columns=np.arange(K)) + self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) class TestApi(Base): - def setup_method(self, method): self._create_data() @@ -85,205 +92,237 @@ def test_getitem(self): # technically this is allowed r = self.frame.rolling(window=5)[1, 3] - tm.assert_index_equal(r._selected_obj.columns, - self.frame.columns[[1, 3]]) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) r = self.frame.rolling(window=5)[[1, 3]] - tm.assert_index_equal(r._selected_obj.columns, - self.frame.columns[[1, 3]]) + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns[[1, 3]]) def test_select_bad_cols(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) + df = DataFrame([[1, 2]], columns=["A", "B"]) g = df.rolling(window=5) with pytest.raises(KeyError, match="Columns not found: 'C'"): - g[['C']] - with pytest.raises(KeyError, match='^[^A]+$'): + g[["C"]] + with pytest.raises(KeyError, match="^[^A]+$"): # A should not be referenced as a bad column... # will have to rethink regex if you change message! 
- g[['A', 'C']] + g[["A", "C"]] def test_attribute_access(self): - df = DataFrame([[1, 2]], columns=['A', 'B']) + df = DataFrame([[1, 2]], columns=["A", "B"]) r = df.rolling(window=5) - tm.assert_series_equal(r.A.sum(), r['A'].sum()) + tm.assert_series_equal(r.A.sum(), r["A"].sum()) msg = "'Rolling' object has no attribute 'F'" with pytest.raises(AttributeError, match=msg): r.F def tests_skip_nuisance(self): - df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'}) + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) - result = r[['A', 'B']].sum() - expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], - 'B': [np.nan, np.nan, 18, 21, 24]}, - columns=list('AB')) + result = r[["A", "B"]].sum() + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) tm.assert_frame_equal(result, expected) def test_skip_sum_object_raises(self): - df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'}) + df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3) result = r.sum() - expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], - 'B': [np.nan, np.nan, 18, 21, 24]}, - columns=list('AB')) + expected = DataFrame( + {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, + columns=list("AB"), + ) tm.assert_frame_equal(result, expected) def test_agg(self): - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - a_mean = r['A'].mean() - a_std = r['A'].std() - a_sum = r['A'].sum() - b_mean = r['B'].mean() - b_std = r['B'].std() - b_sum = r['B'].sum() + a_mean = r["A"].mean() + a_std = r["A"].std() + a_sum = r["A"].sum() + b_mean = r["B"].mean() + b_std = r["B"].std() + b_sum = r["B"].sum() result = r.aggregate([np.mean, np.std]) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', - 'std']]) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) - result = r.aggregate({'A': np.mean, 'B': np.std}) + result = r.aggregate({"A": np.mean, "B": np.std}) expected = concat([a_mean, b_std], axis=1) tm.assert_frame_equal(result, expected, check_like=True) - result = r.aggregate({'A': ['mean', 'std']}) + result = r.aggregate({"A": ["mean", "std"]}) expected = concat([a_mean, a_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', - 'std')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) tm.assert_frame_equal(result, expected) - result = r['A'].aggregate(['mean', 'sum']) + result = r["A"].aggregate(["mean", "sum"]) expected = concat([a_mean, a_sum], axis=1) - expected.columns = ['mean', 'sum'] + expected.columns = ["mean", "sum"] tm.assert_frame_equal(result, expected) with catch_warnings(record=True): # using a dict with renaming warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + result = r.aggregate({"A": {"mean": "mean", "sum": "sum"}}) expected = concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), - ('A', 'sum')]) + expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum")]) tm.assert_frame_equal(result, expected, check_like=True) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r.aggregate({'A': {'mean': 'mean', - 'sum': 'sum'}, 
- 'B': {'mean2': 'mean', - 'sum2': 'sum'}}) + result = r.aggregate( + { + "A": {"mean": "mean", "sum": "sum"}, + "B": {"mean2": "mean", "sum2": "sum"}, + } + ) expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1) - exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')] + exp_cols = [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")] expected.columns = pd.MultiIndex.from_tuples(exp_cols) tm.assert_frame_equal(result, expected, check_like=True) - result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) + result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')] + exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] expected.columns = pd.MultiIndex.from_tuples(exp_cols) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_apply(self, raw): # passed lambda - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) - a_sum = r['A'].sum() + a_sum = r["A"].sum() - result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) - rcustom = r['B'].apply(lambda x: np.std(x, ddof=1), raw=raw) + result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) + rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw) expected = concat([a_sum, rcustom], axis=1) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_consistency(self): - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = pd.MultiIndex.from_product([list('AB'), ['sum', 'mean']]) + expected = pd.MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) - result = r['A'].agg([np.sum, np.mean]).columns - expected = Index(['sum', 'mean']) + result = r["A"].agg([np.sum, np.mean]).columns + expected = Index(["sum", "mean"]) tm.assert_index_equal(result, expected) - result = r.agg({'A': [np.sum, np.mean]}).columns - expected = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'mean')]) + result = r.agg({"A": [np.sum, np.mean]}).columns + expected = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "mean")]) tm.assert_index_equal(result, expected) def test_agg_nested_dicts(self): # API change for disallowing these types of nested dicts - df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) + df = DataFrame({"A": range(5), "B": range(0, 10, 2)}) r = df.rolling(window=3) msg = r"cannot perform renaming for (r1|r2) with a nested dictionary" with pytest.raises(SpecificationError, match=msg): - r.aggregate({'r1': {'A': ['mean', 'sum']}, - 'r2': {'B': ['mean', 'sum']}}) - - expected = concat([r['A'].mean(), r['A'].std(), - r['B'].mean(), r['B'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}}) + + expected = concat( + [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1 + ) + expected.columns = pd.MultiIndex.from_tuples( + [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")] + ) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) + result = r[["A", "B"]].agg( + {"A": {"ra": ["mean", 
"std"]}, "B": {"rb": ["mean", "std"]}} + ) tm.assert_frame_equal(result, expected, check_like=True) with catch_warnings(record=True): warnings.simplefilter("ignore", FutureWarning) - result = r.agg({'A': {'ra': ['mean', 'std']}, - 'B': {'rb': ['mean', 'std']}}) - expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), ( - 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) + result = r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}) + expected.columns = pd.MultiIndex.from_tuples( + [ + ("A", "ra", "mean"), + ("A", "ra", "std"), + ("B", "rb", "mean"), + ("B", "rb", "std"), + ] + ) tm.assert_frame_equal(result, expected, check_like=True) def test_count_nonnumeric_types(self): # GH12541 - cols = ['int', 'float', 'string', 'datetime', 'timedelta', 'periods', - 'fl_inf', 'fl_nan', 'str_nan', 'dt_nat', 'periods_nat'] + cols = [ + "int", + "float", + "string", + "datetime", + "timedelta", + "periods", + "fl_inf", + "fl_nan", + "str_nan", + "dt_nat", + "periods_nat", + ] df = DataFrame( - {'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'datetime': pd.date_range('20170101', periods=3), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s'), - 'periods': [pd.Period('2012-01'), pd.Period('2012-02'), - pd.Period('2012-03')], - 'fl_inf': [1., 2., np.Inf], - 'fl_nan': [1., 2., np.NaN], - 'str_nan': ['aa', 'bb', np.NaN], - 'dt_nat': [Timestamp('20170101'), Timestamp('20170203'), - Timestamp(None)], - 'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'), - pd.Period(None)]}, - columns=cols) + { + "int": [1, 2, 3], + "float": [4.0, 5.0, 6.0], + "string": list("abc"), + "datetime": pd.date_range("20170101", periods=3), + "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"), + "periods": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period("2012-03"), + ], + "fl_inf": [1.0, 2.0, np.Inf], + "fl_nan": [1.0, 2.0, np.NaN], + "str_nan": ["aa", "bb", np.NaN], + "dt_nat": [ + Timestamp("20170101"), + Timestamp("20170203"), + Timestamp(None), + ], + "periods_nat": [ + pd.Period("2012-01"), + pd.Period("2012-02"), + pd.Period(None), + ], + }, + columns=cols, + ) expected = DataFrame( - {'int': [1., 2., 2.], - 'float': [1., 2., 2.], - 'string': [1., 2., 2.], - 'datetime': [1., 2., 2.], - 'timedelta': [1., 2., 2.], - 'periods': [1., 2., 2.], - 'fl_inf': [1., 2., 2.], - 'fl_nan': [1., 2., 1.], - 'str_nan': [1., 2., 1.], - 'dt_nat': [1., 2., 1.], - 'periods_nat': [1., 2., 1.]}, - columns=cols) + { + "int": [1.0, 2.0, 2.0], + "float": [1.0, 2.0, 2.0], + "string": [1.0, 2.0, 2.0], + "datetime": [1.0, 2.0, 2.0], + "timedelta": [1.0, 2.0, 2.0], + "periods": [1.0, 2.0, 2.0], + "fl_inf": [1.0, 2.0, 2.0], + "fl_nan": [1.0, 2.0, 1.0], + "str_nan": [1.0, 2.0, 1.0], + "dt_nat": [1.0, 2.0, 1.0], + "periods_nat": [1.0, 2.0, 1.0], + }, + columns=cols, + ) result = df.rolling(window=2).count() tm.assert_frame_equal(result, expected) @@ -296,12 +335,12 @@ def test_count_nonnumeric_types(self): @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_window_with_args(self): # make sure that we are aggregating window functions correctly with arg - r = Series(np.random.randn(100)).rolling(window=10, min_periods=1, - win_type='gaussian') - expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1) - expected.columns = ['', ''] - result = r.aggregate([lambda x: x.mean(std=10), - lambda x: x.mean(std=.01)]) + r = Series(np.random.randn(100)).rolling( + window=10, min_periods=1, win_type="gaussian" + ) + expected = 
concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["", ""] + result = r.aggregate([lambda x: x.mean(std=10), lambda x: x.mean(std=0.01)]) tm.assert_frame_equal(result, expected) def a(x): @@ -310,77 +349,95 @@ def a(x): def b(x): return x.mean(std=0.01) - expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1) - expected.columns = ['a', 'b'] + expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1) + expected.columns = ["a", "b"] result = r.aggregate([a, b]) tm.assert_frame_equal(result, expected) def test_preserve_metadata(self): # GH 10565 - s = Series(np.arange(100), name='foo') + s = Series(np.arange(100), name="foo") s2 = s.rolling(30).sum() s3 = s.rolling(20).sum() - assert s2.name == 'foo' - assert s3.name == 'foo' - - @pytest.mark.parametrize("func,window_size,expected_vals", [ - ('rolling', 2, [[np.nan, np.nan, np.nan, np.nan], - [15., 20., 25., 20.], - [25., 30., 35., 30.], - [np.nan, np.nan, np.nan, np.nan], - [20., 30., 35., 30.], - [35., 40., 60., 40.], - [60., 80., 85., 80]]), - ('expanding', None, [[10., 10., 20., 20.], - [15., 20., 25., 20.], - [20., 30., 30., 20.], - [10., 10., 30., 30.], - [20., 30., 35., 30.], - [26.666667, 40., 50., 30.], - [40., 80., 60., 30.]])]) + assert s2.name == "foo" + assert s3.name == "foo" + + @pytest.mark.parametrize( + "func,window_size,expected_vals", + [ + ( + "rolling", + 2, + [ + [np.nan, np.nan, np.nan, np.nan], + [15.0, 20.0, 25.0, 20.0], + [25.0, 30.0, 35.0, 30.0], + [np.nan, np.nan, np.nan, np.nan], + [20.0, 30.0, 35.0, 30.0], + [35.0, 40.0, 60.0, 40.0], + [60.0, 80.0, 85.0, 80], + ], + ), + ( + "expanding", + None, + [ + [10.0, 10.0, 20.0, 20.0], + [15.0, 20.0, 25.0, 20.0], + [20.0, 30.0, 30.0, 20.0], + [10.0, 10.0, 30.0, 30.0], + [20.0, 30.0, 35.0, 30.0], + [26.666667, 40.0, 50.0, 30.0], + [40.0, 80.0, 60.0, 30.0], + ], + ), + ], + ) def test_multiple_agg_funcs(self, func, window_size, expected_vals): # GH 15072 - df = pd.DataFrame([ - ['A', 10, 20], - ['A', 20, 30], - ['A', 30, 40], - ['B', 10, 30], - ['B', 30, 40], - ['B', 40, 80], - ['B', 80, 90]], columns=['stock', 'low', 'high']) - - f = getattr(df.groupby('stock'), func) + df = pd.DataFrame( + [ + ["A", 10, 20], + ["A", 20, 30], + ["A", 30, 40], + ["B", 10, 30], + ["B", 30, 40], + ["B", 40, 80], + ["B", 80, 90], + ], + columns=["stock", "low", "high"], + ) + + f = getattr(df.groupby("stock"), func) if window_size: window = f(window_size) else: window = f() - index = pd.MultiIndex.from_tuples([ - ('A', 0), ('A', 1), ('A', 2), - ('B', 3), ('B', 4), ('B', 5), ('B', 6)], names=['stock', None]) - columns = pd.MultiIndex.from_tuples([ - ('low', 'mean'), ('low', 'max'), ('high', 'mean'), - ('high', 'min')]) + index = pd.MultiIndex.from_tuples( + [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)], + names=["stock", None], + ) + columns = pd.MultiIndex.from_tuples( + [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")] + ) expected = pd.DataFrame(expected_vals, index=index, columns=columns) - result = window.agg(OrderedDict(( - ('low', ['mean', 'max']), - ('high', ['mean', 'min']), - ))) + result = window.agg( + OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) + ) tm.assert_frame_equal(result, expected) @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestWindow(Base): - def setup_method(self, method): self._create_data() @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def 
test_constructor(self, which): # GH 12669 @@ -388,32 +445,30 @@ def test_constructor(self, which): c = o.rolling # valid - c(win_type='boxcar', window=2, min_periods=1) - c(win_type='boxcar', window=2, min_periods=1, center=True) - c(win_type='boxcar', window=2, min_periods=1, center=False) + c(win_type="boxcar", window=2, min_periods=1) + c(win_type="boxcar", window=2, min_periods=1, center=True) + c(win_type="boxcar", window=2, min_periods=1, center=False) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=w) + c(win_type="boxcar", window=2, min_periods=w) with pytest.raises(ValueError): - c(win_type='boxcar', window=2, min_periods=1, center=w) + c(win_type="boxcar", window=2, min_periods=1, center=w) - for wt in ['foobar', 1]: + for wt in ["foobar", 1]: with pytest.raises(ValueError): c(win_type=wt, window=2) @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor_with_win_type(self, which, win_types): # GH 12669 o = getattr(self, which) c = o.rolling c(win_type=win_types, window=2) - @pytest.mark.parametrize( - 'method', ['sum', 'mean']) + @pytest.mark.parametrize("method", ["sum", "mean"]) def test_numpy_compat(self, method): # see gh-12811 w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) @@ -427,19 +482,17 @@ def test_numpy_compat(self, method): class TestRolling(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.rolling(2).sum() df.rolling(2, min_periods=1).sum() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): # GH 12669 @@ -458,7 +511,7 @@ def test_constructor(self, which): c(-1) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): c(window=w) with pytest.raises(ValueError): @@ -467,50 +520,53 @@ def test_constructor(self, which): c(window=2, min_periods=1, center=w) @td.skip_if_no_scipy - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling with pytest.raises(ValueError): - c(-1, win_type='boxcar') + c(-1, win_type="boxcar") - @pytest.mark.parametrize( - 'window', [timedelta(days=3), pd.Timedelta(days=3)]) + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) def test_constructor_with_timedelta_window(self, window): # GH 15440 n = 10 - df = DataFrame({'value': np.arange(n)}, - index=pd.date_range('2015-12-24', periods=n, freq="D")) - expected_data = np.append([0., 1.], np.arange(3., 27., 3)) + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) + expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) result = df.rolling(window=window).sum() - expected = DataFrame({'value': expected_data}, - index=pd.date_range('2015-12-24', periods=n, - freq="D")) + expected = DataFrame( + {"value": expected_data}, + index=pd.date_range("2015-12-24", periods=n, freq="D"), + ) tm.assert_frame_equal(result, expected) - expected = df.rolling('3D').sum() + expected = df.rolling("3D").sum() tm.assert_frame_equal(result, expected) - 
@pytest.mark.parametrize( - 'window', [timedelta(days=3), pd.Timedelta(days=3), '3D']) + @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3), "3D"]) def test_constructor_timedelta_window_and_minperiods(self, window, raw): # GH 15305 n = 10 - df = DataFrame({'value': np.arange(n)}, - index=pd.date_range('2017-08-08', periods=n, freq="D")) + df = DataFrame( + {"value": np.arange(n)}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) expected = DataFrame( - {'value': np.append([np.NaN, 1.], np.arange(3., 27., 3))}, - index=pd.date_range('2017-08-08', periods=n, freq="D")) + {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, + index=pd.date_range("2017-08-08", periods=n, freq="D"), + ) result_roll_sum = df.rolling(window=window, min_periods=2).sum() - result_roll_generic = df.rolling(window=window, - min_periods=2).apply(sum, raw=raw) + result_roll_generic = df.rolling(window=window, min_periods=2).apply( + sum, raw=raw + ) tm.assert_frame_equal(result_roll_sum, expected) tm.assert_frame_equal(result_roll_generic, expected) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) @@ -523,108 +579,116 @@ def test_numpy_compat(self, method): getattr(r, method)(dtype=np.float64) def test_closed(self): - df = DataFrame({'A': [0, 1, 2, 3, 4]}) + df = DataFrame({"A": [0, 1, 2, 3, 4]}) # closed only allowed for datetimelike with pytest.raises(ValueError): - df.rolling(window=3, closed='neither') + df.rolling(window=3, closed="neither") @pytest.mark.parametrize("closed", ["neither", "left"]) def test_closed_empty(self, closed, arithmetic_win_operators): # GH 26005 func_name = arithmetic_win_operators - ser = pd.Series(data=np.arange(5), - index=pd.date_range("2000", periods=5, freq="2D")) + ser = pd.Series( + data=np.arange(5), index=pd.date_range("2000", periods=5, freq="2D") + ) roll = ser.rolling("1D", closed=closed) result = getattr(roll, func_name)() expected = pd.Series([np.nan] * 5, index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("func", ['min', 'max']) + @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry(self, func): # GH24718 - ser = pd.Series(data=[2], index=pd.date_range('2000', periods=1)) - result = getattr(ser.rolling('10D', closed='left'), func)() + ser = pd.Series(data=[2], index=pd.date_range("2000", periods=1)) + result = getattr(ser.rolling("10D", closed="left"), func)() tm.assert_series_equal(result, pd.Series([np.nan], index=ser.index)) - @pytest.mark.parametrize("func", ['min', 'max']) + @pytest.mark.parametrize("func", ["min", "max"]) def test_closed_one_entry_groupby(self, func): # GH24718 - ser = pd.DataFrame(data={'A': [1, 1, 2], 'B': [3, 2, 1]}, - index=pd.date_range('2000', periods=3)) + ser = pd.DataFrame( + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=pd.date_range("2000", periods=3), + ) result = getattr( - ser.groupby('A', sort=False)['B'].rolling('10D', closed='left'), - func)() - exp_idx = pd.MultiIndex.from_arrays(arrays=[[1, 1, 2], ser.index], - names=('A', None)) - expected = pd.Series(data=[np.nan, 3, np.nan], index=exp_idx, name='B') + ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func + )() + exp_idx = pd.MultiIndex.from_arrays( + arrays=[[1, 1, 2], ser.index], names=("A", None) + ) + expected = pd.Series(data=[np.nan, 3, 
np.nan], index=exp_idx, name="B") tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("input_dtype", ['int', 'float']) - @pytest.mark.parametrize("func,closed,expected", [ - ('min', 'right', [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ('min', 'both', [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ('min', 'neither', [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), - ('min', 'left', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), - ('max', 'right', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ('max', 'both', [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - ('max', 'neither', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), - ('max', 'left', [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]) - ]) - def test_closed_min_max_datetime(self, input_dtype, - func, closed, - expected): + @pytest.mark.parametrize("input_dtype", ["int", "float"]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]), + ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]), + ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]), + ], + ) + def test_closed_min_max_datetime(self, input_dtype, func, closed, expected): # see gh-21704 - ser = pd.Series(data=np.arange(10).astype(input_dtype), - index=pd.date_range('2000', periods=10)) + ser = pd.Series( + data=np.arange(10).astype(input_dtype), + index=pd.date_range("2000", periods=10), + ) - result = getattr(ser.rolling('3D', closed=closed), func)() + result = getattr(ser.rolling("3D", closed=closed), func)() expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) def test_closed_uneven(self): # see gh-21704 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) # uneven ser = ser.drop(index=ser.index[[1, 5]]) - result = ser.rolling('3D', closed='left').min() - expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], - index=ser.index) + result = ser.rolling("3D", closed="left").min() + expected = pd.Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("func,closed,expected", [ - ('min', 'right', [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ('min', 'both', [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ('min', 'neither', [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), - ('min', 'left', [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), - ('max', 'right', [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), - ('max', 'both', [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), - ('max', 'neither', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), - ('max', 'left', [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]) - ]) + @pytest.mark.parametrize( + "func,closed,expected", + [ + ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]), + ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]), + ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]), + ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]), + ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]), + ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]), + ], + ) def 
test_closed_min_max_minp(self, func, closed, expected): # see gh-21704 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) ser[ser.index[-3:]] = np.nan - result = getattr(ser.rolling('3D', min_periods=2, closed=closed), - func)() + result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)() expected = pd.Series(expected, index=ser.index) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("closed,expected", [ - ('right', [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), - ('both', [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ('neither', [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), - ('left', [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]) - ]) + @pytest.mark.parametrize( + "closed,expected", + [ + ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]), + ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), + ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]), + ], + ) def test_closed_median_quantile(self, closed, expected): # GH 26005 - ser = pd.Series(data=np.arange(10), - index=pd.date_range('2000', periods=10)) - roll = ser.rolling('3D', closed=closed) + ser = pd.Series(data=np.arange(10), index=pd.date_range("2000", periods=10)) + roll = ser.rolling("3D", closed=closed) expected = pd.Series(expected, index=ser.index) result = roll.median() @@ -633,7 +697,7 @@ def test_closed_median_quantile(self, closed, expected): result = roll.quantile(0.5) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('roller', ['1s', 1]) + @pytest.mark.parametrize("roller", ["1s", 1]) def tests_empty_df_rolling(self, roller): # GH 15819 Verifies that datetime and integer rolling windows can be # applied to empty DataFrames @@ -673,9 +737,12 @@ def test_missing_minp_zero(self): def test_missing_minp_zero_variable(self): # https://github.com/pandas-dev/pandas/pull/18921 - x = pd.Series([np.nan] * 4, - index=pd.DatetimeIndex(['2017-01-01', '2017-01-04', - '2017-01-06', '2017-01-07'])) + x = pd.Series( + [np.nan] * 4, + index=pd.DatetimeIndex( + ["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"] + ), + ) result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() expected = pd.Series(0.0, index=x.index) tm.assert_series_equal(result, expected) @@ -683,15 +750,16 @@ def test_missing_minp_zero_variable(self): def test_multi_index_names(self): # GH 16789, 16825 - cols = pd.MultiIndex.from_product([['A', 'B'], ['C', 'D', 'E']], - names=['1', '2']) + cols = pd.MultiIndex.from_product( + [["A", "B"], ["C", "D", "E"]], names=["1", "2"] + ) df = DataFrame(np.ones((10, 6)), columns=cols) result = df.rolling(3).cov() tm.assert_index_equal(result.columns, df.columns) - assert result.index.names == [None, '1', '2'] + assert result.index.names == [None, "1", "2"] - @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window @@ -705,47 +773,40 @@ def test_rolling_axis_sum(self, axis_frame): axis = df._get_axis_number(axis_frame) if axis == 0: - expected = DataFrame({ - i: [np.nan] * 2 + [3.0] * 8 - for i in range(20) - }) + expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)}) else: # axis == 1 - expected = DataFrame([ - [np.nan] * 2 + [3.0] * 18 - ] * 10) + expected = DataFrame([[np.nan] * 2 + [3.0] * 18] * 10) result = 
df.rolling(3, axis=axis_frame).sum() tm.assert_frame_equal(result, expected) def test_rolling_axis_count(self, axis_frame): # see gh-26055 - df = DataFrame({'x': range(3), 'y': range(3)}) + df = DataFrame({"x": range(3), "y": range(3)}) axis = df._get_axis_number(axis_frame) - if axis in [0, 'index']: - expected = DataFrame({'x': [1.0, 2.0, 2.0], 'y': [1.0, 2.0, 2.0]}) + if axis in [0, "index"]: + expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]}) else: - expected = DataFrame({'x': [1.0, 1.0, 1.0], 'y': [2.0, 2.0, 2.0]}) + expected = DataFrame({"x": [1.0, 1.0, 1.0], "y": [2.0, 2.0, 2.0]}) result = df.rolling(2, axis=axis_frame).count() tm.assert_frame_equal(result, expected) class TestExpanding(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.expanding(2).sum() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): # GH 12669 @@ -758,14 +819,13 @@ def test_constructor(self, which): c(min_periods=1, center=False) # not valid - for w in [2., 'foo', np.array([2])]: + for w in [2.0, "foo", np.array([2])]: with pytest.raises(ValueError): c(min_periods=w) with pytest.raises(ValueError): c(min_periods=1, center=w) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'sum', 'max', 'min', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 e = rwindow.Expanding(Series([2, 4, 6]), window=2) @@ -778,10 +838,17 @@ def test_numpy_compat(self, method): getattr(e, method)(dtype=np.float64) @pytest.mark.parametrize( - 'expander', - [1, pytest.param('ls', marks=pytest.mark.xfail( - reason='GH#16425 expanding with ' - 'offset not supported'))]) + "expander", + [ + 1, + pytest.param( + "ls", + marks=pytest.mark.xfail( + reason="GH#16425 expanding with " "offset not supported" + ), + ), + ], + ) def test_empty_df_expanding(self, expander): # GH 15819 Verifies that datetime and integer expanding windows can be # applied to empty DataFrames @@ -793,8 +860,7 @@ def test_empty_df_expanding(self, expander): # Verifies that datetime and integer expanding windows can be applied # to empty DataFrames with datetime index expected = DataFrame(index=pd.DatetimeIndex([])) - result = DataFrame( - index=pd.DatetimeIndex([])).expanding(expander).sum() + result = DataFrame(index=pd.DatetimeIndex([])).expanding(expander).sum() tm.assert_frame_equal(result, expected) def test_missing_minp_zero(self): @@ -810,7 +876,7 @@ def test_missing_minp_zero(self): expected = pd.Series([np.nan]) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame]) + @pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame]) def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window @@ -824,33 +890,28 @@ def test_expanding_axis(self, axis_frame): axis = df._get_axis_number(axis_frame) if axis == 0: - expected = DataFrame({ - i: [np.nan] * 2 + [float(j) for j in range(3, 11)] - for i in range(20) - }) + expected = DataFrame( + {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)} + ) else: # axis == 1 - expected = DataFrame([ - [np.nan] * 2 + [float(i) for i in range(3, 21)] - ] * 10) + expected = DataFrame([[np.nan] * 2 + [float(i) for i in range(3, 21)]] * 10) result = df.expanding(3, 
axis=axis_frame).sum() tm.assert_frame_equal(result, expected) class TestEWM(Base): - def setup_method(self, method): self._create_data() def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = DataFrame({"B": [0, 1, 2, np.nan, 4]}) df df.ewm(com=0.5).mean() - @pytest.mark.parametrize( - 'which', ['series', 'frame']) + @pytest.mark.parametrize("which", ["series", "frame"]) def test_constructor(self, which): o = getattr(self, which) c = o.ewm @@ -889,8 +950,7 @@ def test_constructor(self, which): with pytest.raises(ValueError): c(alpha=alpha) - @pytest.mark.parametrize( - 'method', ['std', 'mean', 'var']) + @pytest.mark.parametrize("method", ["std", "mean", "var"]) def test_numpy_compat(self, method): # see gh-12811 e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) @@ -915,64 +975,78 @@ class Dtype: window = 2 funcs = { - 'count': lambda v: v.count(), - 'max': lambda v: v.max(), - 'min': lambda v: v.min(), - 'sum': lambda v: v.sum(), - 'mean': lambda v: v.mean(), - 'std': lambda v: v.std(), - 'var': lambda v: v.var(), - 'median': lambda v: v.median() + "count": lambda v: v.count(), + "max": lambda v: v.max(), + "min": lambda v: v.min(), + "sum": lambda v: v.sum(), + "mean": lambda v: v.mean(), + "std": lambda v: v.std(), + "var": lambda v: v.var(), + "median": lambda v: v.median(), } def get_expects(self): expects = { - 'sr1': { - 'count': Series([1, 2, 2, 2, 2], dtype='float64'), - 'max': Series([np.nan, 1, 2, 3, 4], dtype='float64'), - 'min': Series([np.nan, 0, 1, 2, 3], dtype='float64'), - 'sum': Series([np.nan, 1, 3, 5, 7], dtype='float64'), - 'mean': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64'), - 'std': Series([np.nan] + [np.sqrt(.5)] * 4, dtype='float64'), - 'var': Series([np.nan, .5, .5, .5, .5], dtype='float64'), - 'median': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64') + "sr1": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), + "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), + "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), + "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), + "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), + "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), }, - 'sr2': { - 'count': Series([1, 2, 2, 2, 2], dtype='float64'), - 'max': Series([np.nan, 10, 8, 6, 4], dtype='float64'), - 'min': Series([np.nan, 8, 6, 4, 2], dtype='float64'), - 'sum': Series([np.nan, 18, 14, 10, 6], dtype='float64'), - 'mean': Series([np.nan, 9, 7, 5, 3], dtype='float64'), - 'std': Series([np.nan] + [np.sqrt(2)] * 4, dtype='float64'), - 'var': Series([np.nan, 2, 2, 2, 2], dtype='float64'), - 'median': Series([np.nan, 9, 7, 5, 3], dtype='float64') + "sr2": { + "count": Series([1, 2, 2, 2, 2], dtype="float64"), + "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), + "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), + "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), + "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), + "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), + "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), + }, + "df": { + "count": DataFrame( + {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, + dtype="float64", + ), + "max": DataFrame( + {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, + dtype="float64", + ), + "min": DataFrame( + {0: 
Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, + dtype="float64", + ), + "sum": DataFrame( + { + 0: Series([np.nan, 2, 6, 10, 14]), + 1: Series([np.nan, 4, 8, 12, 16]), + }, + dtype="float64", + ), + "mean": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), + "std": DataFrame( + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), + }, + dtype="float64", + ), + "var": DataFrame( + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + dtype="float64", + ), + "median": DataFrame( + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + dtype="float64", + ), }, - 'df': { - 'count': DataFrame({0: Series([1, 2, 2, 2, 2]), - 1: Series([1, 2, 2, 2, 2])}, - dtype='float64'), - 'max': DataFrame({0: Series([np.nan, 2, 4, 6, 8]), - 1: Series([np.nan, 3, 5, 7, 9])}, - dtype='float64'), - 'min': DataFrame({0: Series([np.nan, 0, 2, 4, 6]), - 1: Series([np.nan, 1, 3, 5, 7])}, - dtype='float64'), - 'sum': DataFrame({0: Series([np.nan, 2, 6, 10, 14]), - 1: Series([np.nan, 4, 8, 12, 16])}, - dtype='float64'), - 'mean': DataFrame({0: Series([np.nan, 1, 3, 5, 7]), - 1: Series([np.nan, 2, 4, 6, 8])}, - dtype='float64'), - 'std': DataFrame({0: Series([np.nan] + [np.sqrt(2)] * 4), - 1: Series([np.nan] + [np.sqrt(2)] * 4)}, - dtype='float64'), - 'var': DataFrame({0: Series([np.nan, 2, 2, 2, 2]), - 1: Series([np.nan, 2, 2, 2, 2])}, - dtype='float64'), - 'median': DataFrame({0: Series([np.nan, 1, 3, 5, 7]), - 1: Series([np.nan, 2, 4, 6, 8])}, - dtype='float64'), - } } return expects @@ -981,11 +1055,7 @@ def _create_dtype_data(self, dtype): sr2 = Series(np.arange(10, 0, -2), dtype=dtype) df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - data = { - 'sr1': sr1, - 'sr2': sr2, - 'df': df - } + data = {"sr1": sr1, "sr2": sr2, "df": df} return data @@ -1069,27 +1139,23 @@ class TestDtype_float64(Dtype_float): class TestDtype_category(Dtype): - dtype = 'category' + dtype = "category" include_df = False def _create_dtype_data(self, dtype): sr1 = Series(range(5), dtype=dtype) sr2 = Series(range(10, 0, -2), dtype=dtype) - data = { - 'sr1': sr1, - 'sr2': sr2 - } + data = {"sr1": sr1, "sr2": sr2} return data class DatetimeLike(Dtype): - def check_dtypes(self, f, f_name, d, d_name, exp): roll = d.rolling(window=self.window) - if f_name == 'count': + if f_name == "count": result = f(roll) tm.assert_almost_equal(result, exp) @@ -1099,24 +1165,25 @@ def check_dtypes(self, f, f_name, d, d_name, exp): class TestDtype_timedelta(DatetimeLike): - dtype = np.dtype('m8[ns]') + dtype = np.dtype("m8[ns]") class TestDtype_datetime(DatetimeLike): - dtype = np.dtype('M8[ns]') + dtype = np.dtype("M8[ns]") class TestDtype_datetime64UTC(DatetimeLike): - dtype = 'datetime64[ns, UTC]' + dtype = "datetime64[ns, UTC]" def _create_data(self): - pytest.skip("direct creation of extension dtype " - "datetime64[ns, UTC] is not supported ATM") + pytest.skip( + "direct creation of extension dtype " + "datetime64[ns, UTC] is not supported ATM" + ) @pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") class TestMoments(Base): - def setup_method(self, method): self._create_data() @@ -1130,46 +1197,69 @@ def test_centered_axis_validation(self): Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() # ok ok - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, - axis=0).mean() - DataFrame(np.ones((10, 10))).rolling(window=3, center=True, - axis=1).mean() + DataFrame(np.ones((10, 
10))).rolling(window=3, center=True, axis=0).mean() + DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() # bad axis with pytest.raises(ValueError): - (DataFrame(np.ones((10, 10))) - .rolling(window=3, center=True, axis=2).mean()) + (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) def test_rolling_sum(self): - self._check_moment_func(np.nansum, name='sum', - zero_min_periods_equal=False) + self._check_moment_func(np.nansum, name="sum", zero_min_periods_equal=False) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() - self._check_moment_func(counter, name='count', has_min_periods=False, - fill_value=0) + self._check_moment_func( + counter, name="count", has_min_periods=False, fill_value=0 + ) def test_rolling_mean(self): - self._check_moment_func(np.mean, name='mean') + self._check_moment_func(np.mean, name="mean") @td.skip_if_no_scipy def test_cmov_mean(self): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) result = Series(vals).rolling(5, center=True).mean() - expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, - 12.818, 12.952, np.nan, np.nan]) + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) tm.assert_series_equal(expected, result) @td.skip_if_no_scipy def test_cmov_window(self): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) - result = Series(vals).rolling(5, win_type='boxcar', center=True).mean() - expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516, - 12.818, 12.952, np.nan, np.nan]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) + result = Series(vals).rolling(5, win_type="boxcar", center=True).mean() + expected = Series( + [ + np.nan, + np.nan, + 9.962, + 11.27, + 11.564, + 12.516, + 12.818, + 12.952, + np.nan, + np.nan, + ] + ) tm.assert_series_equal(expected, result) @td.skip_if_no_scipy @@ -1177,49 +1267,78 @@ def test_cmov_window_corner(self): # GH 8238 # all nan vals = pd.Series([np.nan] * 10) - result = vals.rolling(5, center=True, win_type='boxcar').mean() + result = vals.rolling(5, center=True, win_type="boxcar").mean() assert np.isnan(result).all() # empty vals = pd.Series([]) - result = vals.rolling(5, center=True, win_type='boxcar').mean() + result = vals.rolling(5, center=True, win_type="boxcar").mean() assert len(result) == 0 # shorter than window vals = pd.Series(np.random.randn(5)) - result = vals.rolling(10, win_type='boxcar').mean() + result = vals.rolling(10, win_type="boxcar").mean() assert np.isnan(result).all() assert len(result) == 5 @td.skip_if_no_scipy def test_cmov_window_frame(self): # Gh 8238 - vals = np.array([[12.18, 3.64], [10.18, 9.16], [13.24, 14.61], - [4.51, 8.11], [6.15, 11.44], [9.14, 6.21], - [11.31, 10.67], [2.94, 6.51], [9.42, 8.39], [12.44, - 7.34]]) - - xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [9.252, 9.392], - [8.644, 9.906], [8.87, 10.208], [6.81, 8.588], - [7.792, 8.644], [9.05, 7.824], [np.nan, np.nan - ], [np.nan, np.nan]]) + vals = np.array( + [ + [12.18, 3.64], + [10.18, 9.16], + [13.24, 14.61], + [4.51, 8.11], + [6.15, 11.44], + [9.14, 6.21], + [11.31, 10.67], + [2.94, 6.51], + [9.42, 8.39], + [12.44, 7.34], + ] + ) + + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [9.252, 9.392], + 
[8.644, 9.906], + [8.87, 10.208], + [6.81, 8.588], + [7.792, 8.644], + [9.05, 7.824], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) # DataFrame - rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean() + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).mean() tm.assert_frame_equal(DataFrame(xp), rs) # invalid method with pytest.raises(AttributeError): - (DataFrame(vals).rolling(5, win_type='boxcar', center=True) - .std()) + (DataFrame(vals).rolling(5, win_type="boxcar", center=True).std()) # sum - xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [46.26, 46.96], - [43.22, 49.53], [44.35, 51.04], [34.05, 42.94], - [38.96, 43.22], [45.25, 39.12], [np.nan, np.nan - ], [np.nan, np.nan]]) - - rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum() + xp = np.array( + [ + [np.nan, np.nan], + [np.nan, np.nan], + [46.26, 46.96], + [43.22, 49.53], + [44.35, 51.04], + [34.05, 42.94], + [38.96, 43.22], + [45.25, 39.12], + [np.nan, np.nan], + [np.nan, np.nan], + ] + ) + + rs = DataFrame(vals).rolling(5, win_type="boxcar", center=True).sum() tm.assert_frame_equal(DataFrame(xp), rs) @td.skip_if_no_scipy @@ -1230,32 +1349,112 @@ def test_cmov_window_na_min_periods(self): vals[8] = np.nan xp = vals.rolling(5, min_periods=4, center=True).mean() - rs = vals.rolling(5, win_type='boxcar', min_periods=4, - center=True).mean() + rs = vals.rolling(5, win_type="boxcar", min_periods=4, center=True).mean() tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_regular(self, win_types): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) xps = { - 'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, 14.03687, - 13.8567, 11.81473, np.nan, np.nan], - 'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, 13.33889, - 13.38, 12.33667, np.nan, np.nan], - 'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675, - 14.0825, 11.5675, np.nan, np.nan], - 'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, 14.17267, - 14.65923, 11.10401, np.nan, np.nan], - 'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052, - 14.02156, 15.10512, 10.74574, np.nan, np.nan], - 'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, 14.03559, - 15.05657, 10.78514, np.nan, np.nan], - 'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, 14.20036, - 14.57726, 11.16988, np.nan, np.nan], - 'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675, - 14.0825, 11.5675, np.nan, np.nan] + "hamming": [ + np.nan, + np.nan, + 8.71384, + 9.56348, + 12.38009, + 14.03687, + 13.8567, + 11.81473, + np.nan, + np.nan, + ], + "triang": [ + np.nan, + np.nan, + 9.28667, + 10.34667, + 12.00556, + 13.33889, + 13.38, + 12.33667, + np.nan, + np.nan, + ], + "barthann": [ + np.nan, + np.nan, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], + "bohman": [ + np.nan, + np.nan, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 14.65923, + 11.10401, + np.nan, + np.nan, + ], + "blackmanharris": [ + np.nan, + np.nan, + 6.97691, + 9.16438, + 13.05052, + 14.02156, + 15.10512, + 10.74574, + np.nan, + np.nan, + ], + "nuttall": [ + np.nan, + np.nan, + 7.04618, + 9.16786, + 13.02671, + 14.03559, + 15.05657, + 10.78514, + np.nan, + np.nan, + ], + "blackman": [ + np.nan, + np.nan, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 14.57726, + 11.16988, + np.nan, + np.nan, + ], + "bartlett": [ + np.nan, + np.nan, + 8.4425, + 
9.1925, + 12.5575, + 14.3675, + 14.0825, + 11.5675, + np.nan, + np.nan, + ], } xp = Series(xps[win_types]) @@ -1277,25 +1476,106 @@ def test_cmov_window_regular_linear_range(self, win_types): @td.skip_if_no_scipy def test_cmov_window_regular_missing_data(self, win_types): # GH 8238 - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48] + ) xps = { - 'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925, - 12.5575, 14.3675, 15.61667, 13.655], - 'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, 9.17869, - 12.79607, 14.20036, 15.8706, 13.655], - 'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925, - 12.5575, 14.3675, 15.61667, 13.655], - 'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, 9.1764, - 12.83559, 14.17267, 15.90976, 13.655], - 'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, 9.56348, - 12.38009, 14.20565, 15.24694, 13.69758], - 'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, 9.16786, - 13.02671, 14.03673, 16.08759, 13.65553], - 'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, 10.34667, - 12.00556, 13.82125, 14.49429, 13.765], - 'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691, - 9.16438, 13.05052, 14.02175, 16.1098, 13.65509] + "bartlett": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "blackman": [ + np.nan, + np.nan, + 9.04582, + 11.41536, + 7.73345, + 9.17869, + 12.79607, + 14.20036, + 15.8706, + 13.655, + ], + "barthann": [ + np.nan, + np.nan, + 9.70333, + 10.5225, + 8.4425, + 9.1925, + 12.5575, + 14.3675, + 15.61667, + 13.655, + ], + "bohman": [ + np.nan, + np.nan, + 8.9444, + 11.56327, + 7.61599, + 9.1764, + 12.83559, + 14.17267, + 15.90976, + 13.655, + ], + "hamming": [ + np.nan, + np.nan, + 9.59321, + 10.29694, + 8.71384, + 9.56348, + 12.38009, + 14.20565, + 15.24694, + 13.69758, + ], + "nuttall": [ + np.nan, + np.nan, + 8.47693, + 12.2821, + 7.04618, + 9.16786, + 13.02671, + 14.03673, + 16.08759, + 13.65553, + ], + "triang": [ + np.nan, + np.nan, + 9.33167, + 9.76125, + 9.28667, + 10.34667, + 12.00556, + 13.82125, + 14.49429, + 13.765, + ], + "blackmanharris": [ + np.nan, + np.nan, + 8.42526, + 12.36824, + 6.97691, + 9.16438, + 13.05052, + 14.02175, + 16.1098, + 13.65509, + ], } xp = Series(xps[win_types]) @@ -1306,40 +1586,85 @@ def test_cmov_window_regular_missing_data(self, win_types): def test_cmov_window_special(self, win_types_special): # GH 8238 kwds = { - 'kaiser': {'beta': 1.}, - 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'exponential': {'tau': 10}} + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "exponential": {"tau": 10}, + } - vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, - 10.63, 14.48]) + vals = np.array( + [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48] + ) xps = { - 'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, 13.89053, - 13.65671, 12.01002, np.nan, np.nan], - 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161, - 13.08516, 12.95111, 12.74577, np.nan, np.nan], - 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129, - 12.90702, 12.83757, np.nan, np.nan], - 'exponential': [np.nan, np.nan, 9.83364, 11.10472, 11.64551, - 12.66138, 12.92379, 12.83770, np.nan, np.nan], + "gaussian": [ + np.nan, + np.nan, + 8.97297, + 9.76077, + 12.24763, + 
13.89053, + 13.65671, + 12.01002, + np.nan, + np.nan, + ], + "general_gaussian": [ + np.nan, + np.nan, + 9.85011, + 10.71589, + 11.73161, + 13.08516, + 12.95111, + 12.74577, + np.nan, + np.nan, + ], + "kaiser": [ + np.nan, + np.nan, + 9.86851, + 11.02969, + 11.65161, + 12.75129, + 12.90702, + 12.83757, + np.nan, + np.nan, + ], + "exponential": [ + np.nan, + np.nan, + 9.83364, + 11.10472, + 11.64551, + 12.66138, + 12.92379, + 12.83770, + np.nan, + np.nan, + ], } xp = Series(xps[win_types_special]) - rs = Series(vals).rolling( - 5, win_type=win_types_special, center=True).mean( - **kwds[win_types_special]) + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) tm.assert_series_equal(xp, rs) @td.skip_if_no_scipy def test_cmov_window_special_linear_range(self, win_types_special): # GH 8238 kwds = { - 'kaiser': {'beta': 1.}, - 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'slepian': {'width': 0.5}, - 'exponential': {'tau': 10}} + "kaiser": {"beta": 1.0}, + "gaussian": {"std": 1.0}, + "general_gaussian": {"power": 2.0, "width": 2.0}, + "slepian": {"width": 0.5}, + "exponential": {"tau": 10}, + } vals = np.array(range(10), dtype=np.float) xp = vals.copy() @@ -1347,16 +1672,18 @@ def test_cmov_window_special_linear_range(self, win_types_special): xp[-2:] = np.nan xp = Series(xp) - rs = Series(vals).rolling( - 5, win_type=win_types_special, center=True).mean( - **kwds[win_types_special]) + rs = ( + Series(vals) + .rolling(5, win_type=win_types_special, center=True) + .mean(**kwds[win_types_special]) + ) tm.assert_series_equal(xp, rs) def test_rolling_median(self): - self._check_moment_func(np.median, name='median') + self._check_moment_func(np.median, name="median") def test_rolling_min(self): - self._check_moment_func(np.min, name='min') + self._check_moment_func(np.min, name="min") a = pd.Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1).min() @@ -1367,7 +1694,7 @@ def test_rolling_min(self): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() def test_rolling_max(self): - self._check_moment_func(np.max, name='max') + self._check_moment_func(np.max, name="max") a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() @@ -1376,13 +1703,12 @@ def test_rolling_max(self): with pytest.raises(ValueError): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() - @pytest.mark.parametrize('q', [0.0, .1, .5, .9, 1.0]) + @pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) def test_rolling_quantile(self, q): - def scoreatpercentile(a, per): values = np.sort(a, axis=0) - idx = int(per / 1. 
* (values.shape[0] - 1)) + idx = int(per / 1.0 * (values.shape[0] - 1)) if idx == values.shape[0] - 1: retval = values[-1] @@ -1399,15 +1725,14 @@ def scoreatpercentile(a, per): def quantile_func(x): return scoreatpercentile(x, q) - self._check_moment_func(quantile_func, name='quantile', - quantile=q) + self._check_moment_func(quantile_func, name="quantile", quantile=q) def test_rolling_quantile_np_percentile(self): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile row = 10 col = 5 - idx = pd.date_range('20100101', periods=row, freq='B') + idx = pd.date_range("20100101", periods=row, freq="B") df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx) df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0) @@ -1415,24 +1740,31 @@ def test_rolling_quantile_np_percentile(self): tm.assert_almost_equal(df_quantile.values, np.array(np_percentile)) - @pytest.mark.parametrize('quantile', [0.0, 0.1, 0.45, 0.5, 1]) - @pytest.mark.parametrize('interpolation', ['linear', 'lower', 'higher', - 'nearest', 'midpoint']) - @pytest.mark.parametrize('data', [[1., 2., 3., 4., 5., 6., 7.], - [8., 1., 3., 4., 5., 2., 6., 7.], - [0., np.nan, 0.2, np.nan, 0.4], - [np.nan, np.nan, np.nan, np.nan], - [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], - [0.5], [np.nan, 0.7, 0.6]]) - def test_rolling_quantile_interpolation_options(self, quantile, - interpolation, data): + @pytest.mark.parametrize("quantile", [0.0, 0.1, 0.45, 0.5, 1]) + @pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] + ) + @pytest.mark.parametrize( + "data", + [ + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], + [8.0, 1.0, 3.0, 4.0, 5.0, 2.0, 6.0, 7.0], + [0.0, np.nan, 0.2, np.nan, 0.4], + [np.nan, np.nan, np.nan, np.nan], + [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5], + [0.5], + [np.nan, 0.7, 0.6], + ], + ) + def test_rolling_quantile_interpolation_options( + self, quantile, interpolation, data + ): # Tests that rolling window's quantile behavior is analogous to # Series' quantile for each interpolation option s = Series(data) q1 = s.quantile(quantile, interpolation) - q2 = s.expanding(min_periods=1).quantile( - quantile, interpolation).iloc[-1] + q2 = s.expanding(min_periods=1).quantile(quantile, interpolation).iloc[-1] if np.isnan(q1): assert np.isnan(q2) @@ -1443,13 +1775,13 @@ def test_invalid_quantile_value(self): data = np.arange(5) s = Series(data) - with pytest.raises(ValueError, match="Interpolation 'invalid'" - " is not supported"): - s.rolling(len(data), min_periods=1).quantile( - 0.5, interpolation='invalid') + with pytest.raises( + ValueError, match="Interpolation 'invalid'" " is not supported" + ): + s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid") def test_rolling_quantile_param(self): - ser = Series([0.0, .1, .5, .9, 1.0]) + ser = Series([0.0, 0.1, 0.5, 0.9, 1.0]) with pytest.raises(ValueError): ser.rolling(3).quantile(-0.1) @@ -1458,21 +1790,23 @@ def test_rolling_quantile_param(self): ser.rolling(3).quantile(10.0) with pytest.raises(TypeError): - ser.rolling(3).quantile('foo') + ser.rolling(3).quantile("foo") def test_rolling_apply(self, raw): # suppress warnings about empty slices, as we are deliberately testing # with a 0-length Series with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) def f(x): return x[np.isfinite(x)].mean() - 
self._check_moment_func(np.mean, name='apply', func=f, raw=raw) + self._check_moment_func(np.mean, name="apply", func=f, raw=raw) expected = Series([]) result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) @@ -1481,15 +1815,16 @@ def f(x): # gh-8080 s = Series([None, None, None]) result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1., 2., 2.]) + expected = Series([1.0, 2.0, 2.0]) tm.assert_series_equal(result, expected) result = s.rolling(2, min_periods=0).apply(len, raw=raw) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('klass', [Series, DataFrame]) + @pytest.mark.parametrize("klass", [Series, DataFrame]) @pytest.mark.parametrize( - 'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()]) + "method", [lambda x: x.rolling(window=2), lambda x: x.expanding()] + ) def test_apply_future_warning(self, klass, method): # gh-5071 @@ -1509,12 +1844,13 @@ def test_rolling_apply_out_of_bounds(self, raw): expected = pd.Series([1, 3, 6, 10], dtype=float) tm.assert_almost_equal(result, expected) - @pytest.mark.parametrize('window', [2, '2s']) + @pytest.mark.parametrize("window", [2, "2s"]) def test_rolling_apply_with_pandas_objects(self, window): # 5071 - df = pd.DataFrame({'A': np.random.randn(5), - 'B': np.random.randint(0, 10, size=5)}, - index=pd.date_range('20130101', periods=5, freq='s')) + df = pd.DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=pd.date_range("20130101", periods=5, freq="s"), + ) # we have an equal spaced timeseries index # so simulate removing the first period @@ -1531,24 +1867,21 @@ def f(x): df.rolling(window).apply(f, raw=True) def test_rolling_std(self): - self._check_moment_func(lambda x: np.std(x, ddof=1), - name='std') - self._check_moment_func(lambda x: np.std(x, ddof=0), - name='std', ddof=0) + self._check_moment_func(lambda x: np.std(x, ddof=1), name="std") + self._check_moment_func(lambda x: np.std(x, ddof=0), name="std", ddof=0) def test_rolling_std_1obs(self): - vals = pd.Series([1., 2., 3., 4., 5.]) + vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) result = vals.rolling(1, min_periods=1).std() expected = pd.Series([np.nan] * 5) tm.assert_series_equal(result, expected) result = vals.rolling(1, min_periods=1).std(ddof=0) - expected = pd.Series([0.] * 5) + expected = pd.Series([0.0] * 5) tm.assert_series_equal(result, expected) - result = (pd.Series([np.nan, np.nan, 3, 4, 5]) - .rolling(3, min_periods=2).std()) + result = pd.Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std() assert np.isnan(result[2]) def test_rolling_std_neg_sqrt(self): @@ -1556,9 +1889,15 @@ def test_rolling_std_neg_sqrt(self): # Test move_nanstd for neg sqrt. 
- a = pd.Series([0.0011448196318903589, 0.00028718669878572767, - 0.00028718669878572767, 0.00028718669878572767, - 0.00028718669878572767]) + a = pd.Series( + [ + 0.0011448196318903589, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + 0.00028718669878572767, + ] + ) b = a.rolling(window=3).std() assert np.isfinite(b[2:]).all() @@ -1566,56 +1905,58 @@ def test_rolling_std_neg_sqrt(self): assert np.isfinite(b[2:]).all() def test_rolling_var(self): - self._check_moment_func(lambda x: np.var(x, ddof=1), - name='var') - self._check_moment_func(lambda x: np.var(x, ddof=0), - name='var', ddof=0) + self._check_moment_func(lambda x: np.var(x, ddof=1), name="var") + self._check_moment_func(lambda x: np.var(x, ddof=0), name="var", ddof=0) @td.skip_if_no_scipy def test_rolling_skew(self): from scipy.stats import skew - self._check_moment_func(lambda x: skew(x, bias=False), name='skew') + + self._check_moment_func(lambda x: skew(x, bias=False), name="skew") @td.skip_if_no_scipy def test_rolling_kurt(self): from scipy.stats import kurtosis - self._check_moment_func(lambda x: kurtosis(x, bias=False), - name='kurt') - - def _check_moment_func(self, static_comp, name, has_min_periods=True, - has_center=True, has_time_rule=True, - fill_value=None, zero_min_periods_equal=True, - **kwargs): + self._check_moment_func(lambda x: kurtosis(x, bias=False), name="kurt") + + def _check_moment_func( + self, + static_comp, + name, + has_min_periods=True, + has_center=True, + has_time_rule=True, + fill_value=None, + zero_min_periods_equal=True, + **kwargs + ): def get_result(obj, window, min_periods=None, center=False): - r = obj.rolling(window=window, min_periods=min_periods, - center=center) + r = obj.rolling(window=window, min_periods=min_periods, center=center) return getattr(r, name)(**kwargs) series_result = get_result(self.series, window=50) assert isinstance(series_result, Series) - tm.assert_almost_equal(series_result.iloc[-1], - static_comp(self.series[-50:])) + tm.assert_almost_equal(series_result.iloc[-1], static_comp(self.series[-50:])) frame_result = get_result(self.frame, window=50) assert isinstance(frame_result, DataFrame) tm.assert_series_equal( frame_result.iloc[-1, :], self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), - check_names=False) + check_names=False, + ) # check time_rule works if has_time_rule: win = 25 minp = 10 - series = self.series[::2].resample('B').mean() - frame = self.frame[::2].resample('B').mean() + series = self.series[::2].resample("B").mean() + frame = self.frame[::2].resample("B").mean() if has_min_periods: - series_result = get_result(series, window=win, - min_periods=minp) - frame_result = get_result(frame, window=win, - min_periods=minp) + series_result = get_result(series, window=win, min_periods=minp) + frame_result = get_result(frame, window=win, min_periods=minp) else: series_result = get_result(series, window=win) frame_result = get_result(frame, window=win) @@ -1626,12 +1967,13 @@ def get_result(obj, window, min_periods=None, center=False): trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) - tm.assert_almost_equal(series_result[-1], - static_comp(trunc_series)) + tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) - tm.assert_series_equal(frame_result.xs(last_date), - trunc_frame.apply(static_comp, raw=raw), - check_names=False) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(static_comp, raw=raw), + 
check_names=False, + ) # excluding NaNs correctly obj = Series(randn(50)) @@ -1666,16 +2008,13 @@ def get_result(obj, window, min_periods=None, center=False): # window larger than series length (#7297) if has_min_periods: for minp in (0, len(self.series) - 1, len(self.series)): - result = get_result(self.series, len(self.series) + 1, - min_periods=minp) - expected = get_result(self.series, len(self.series), - min_periods=minp) + result = get_result(self.series, len(self.series) + 1, min_periods=minp) + expected = get_result(self.series, len(self.series), min_periods=minp) nan_mask = isna(result) tm.assert_series_equal(nan_mask, isna(expected)) nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], - expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) else: result = get_result(self.series, len(self.series) + 1) expected = get_result(self.series, len(self.series)) @@ -1690,43 +2029,63 @@ def get_result(obj, window, min_periods=None, center=False): if has_min_periods: result = get_result(obj, 20, min_periods=15, center=True) expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), 20, - min_periods=15)[9:].reset_index(drop=True) + pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 + )[9:].reset_index(drop=True) else: result = get_result(obj, 20, center=True) - expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), - 20)[9:].reset_index(drop=True) + expected = get_result(pd.concat([obj, Series([np.NaN] * 9)]), 20)[ + 9: + ].reset_index(drop=True) tm.assert_series_equal(result, expected) # shifter index - s = ['x%d' % x for x in range(12)] + s = ["x%d" % x for x in range(12)] if has_min_periods: minp = 10 - series_xp = get_result( - self.series.reindex(list(self.series.index) + s), - window=25, - min_periods=minp).shift(-12).reindex(self.series.index) - frame_xp = get_result( - self.frame.reindex(list(self.frame.index) + s), - window=25, - min_periods=minp).shift(-12).reindex(self.frame.index) + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), + window=25, + min_periods=minp, + ) + .shift(-12) + .reindex(self.frame.index) + ) - series_rs = get_result(self.series, window=25, - min_periods=minp, center=True) - frame_rs = get_result(self.frame, window=25, min_periods=minp, - center=True) + series_rs = get_result( + self.series, window=25, min_periods=minp, center=True + ) + frame_rs = get_result( + self.frame, window=25, min_periods=minp, center=True + ) else: - series_xp = get_result( - self.series.reindex(list(self.series.index) + s), - window=25).shift(-12).reindex(self.series.index) - frame_xp = get_result( - self.frame.reindex(list(self.frame.index) + s), - window=25).shift(-12).reindex(self.frame.index) + series_xp = ( + get_result( + self.series.reindex(list(self.series.index) + s), window=25 + ) + .shift(-12) + .reindex(self.series.index) + ) + frame_xp = ( + get_result( + self.frame.reindex(list(self.frame.index) + s), window=25 + ) + .shift(-12) + .reindex(self.frame.index) + ) series_rs = get_result(self.series, window=25, center=True) frame_rs = get_result(self.frame, window=25, center=True) @@ -1738,15 +2097,15 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_frame_equal(frame_xp, frame_rs) def test_ewma(self): - self._check_ew(name='mean') + self._check_ew(name="mean") vals = pd.Series(np.zeros(1000)) 
vals[5] = 1 result = vals.ewm(span=100, adjust=False).mean().sum() assert np.abs(result - 1) < 1e-2 - @pytest.mark.parametrize('adjust', [True, False]) - @pytest.mark.parametrize('ignore_na', [True, False]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) def test_ewma_cases(self, adjust, ignore_na): # try adjust/ignore_na args matrix @@ -1761,51 +2120,68 @@ def test_ewma_cases(self, adjust, ignore_na): tm.assert_series_equal(result, expected) def test_ewma_nan_handling(self): - s = Series([1.] + [np.nan] * 5 + [1.]) + s = Series([1.0] + [np.nan] * 5 + [1.0]) result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.] * len(s))) + tm.assert_series_equal(result, Series([1.0] * len(s))) - s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.]) + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.] * 4)) + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) # GH 7603 - s0 = Series([np.nan, 1., 101.]) - s1 = Series([1., np.nan, 101.]) - s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan]) - s3 = Series([1., np.nan, 101., 50.]) - com = 2. - alpha = 1. / (1. + com) + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill') + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1. - alpha), 1.]), - (s0, True, True, [np.nan, (1. - alpha), 1.]), - (s0, False, False, [np.nan, (1. - alpha), alpha]), - (s0, False, True, [np.nan, (1. - alpha), alpha]), - (s1, True, False, [(1. - alpha) ** 2, np.nan, 1.]), - (s1, True, True, [(1. - alpha), np.nan, 1.]), - (s1, False, False, [(1. - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1. - alpha), np.nan, alpha]), - (s2, True, False, [np.nan, (1. - alpha) ** - 3, np.nan, np.nan, 1., np.nan]), - (s2, True, True, [np.nan, (1. - alpha), - np.nan, np.nan, 1., np.nan]), - (s2, False, False, [np.nan, (1. - alpha) ** - 3, np.nan, np.nan, alpha, np.nan]), - (s2, False, True, [np.nan, (1. - alpha), - np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1. - alpha) ** - 3, np.nan, (1. - alpha), 1.]), - (s3, True, True, [(1. - alpha) ** - 2, np.nan, (1. - alpha), 1.]), - (s3, False, False, [(1. - alpha) ** 3, np.nan, - (1. - alpha) * alpha, - alpha * ((1. - alpha) ** 2 + alpha)]), - (s3, False, True, [(1. - alpha) ** 2, - np.nan, (1. 
- alpha) * alpha, alpha])]: + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() @@ -1816,10 +2192,10 @@ def simple_wma(s, w): tm.assert_series_equal(result, expected) def test_ewmvar(self): - self._check_ew(name='var') + self._check_ew(name="var") def test_ewmvol(self): - self._check_ew(name='vol') + self._check_ew(name="vol") def test_ewma_span_com_args(self): A = self.series.ewm(com=9.5).mean() @@ -1904,7 +2280,7 @@ def test_ewm_domain_checks(self): with pytest.raises(ValueError, match=msg): s.ewm(alpha=1.1) - @pytest.mark.parametrize('method', ['mean', 'vol', 'var']) + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) def test_ew_empty_series(self, method): vals = pd.Series([], dtype=np.float64) @@ -1937,7 +2313,7 @@ def _check_ew(self, name=None, preserve_nan=False): for min_periods in (0, 1): result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == 'mean': + if name == "mean": assert result[:10].isna().all() assert not result[10:].isna().any() else: @@ -1947,15 +2323,13 @@ def _check_ew(self, name=None, preserve_nan=False): assert not result[11:].isna().any() # check series of length 0 - result = getattr(Series().ewm(com=50, min_periods=min_periods), - name)() + result = getattr(Series().ewm(com=50, min_periods=min_periods), name)() tm.assert_series_equal(result, Series()) # check series of length 1 - result = getattr(Series([1.]).ewm(50, min_periods=min_periods), - name)() - if name == 'mean': - tm.assert_series_equal(result, Series([1.])) + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) else: # ewm.std, ewm.vol, ewm.var with bias=False require at least # two values @@ -1969,22 +2343,22 @@ def _check_ew(self, name=None, preserve_nan=False): class TestPairwise: # GH 7738 - df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], - columns=['C', 'C']), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], - columns=[1, 
0.]), - DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], - columns=[0, 1.]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], - columns=[1., 'X']), ] - df2 = DataFrame([[None, 1, 1], [None, 1, 2], - [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) + df1s = [ + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), + DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), + DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), + ] + df2 = DataFrame( + [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], + columns=["Y", "Z", "X"], + ) s = Series([1, 1, 3, 8]) def compare(self, result, expected): @@ -1996,7 +2370,7 @@ def compare(self, result, expected): tm.assert_numpy_array_equal(result, expected, check_dtype=False) - @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) + @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): # DataFrame methods (which do not call _flex_binary_moment()) @@ -2010,12 +2384,16 @@ def test_no_flex(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x: x.expanding().cov(pairwise=True), - lambda x: x.expanding().corr(pairwise=True), - lambda x: x.rolling(window=3).cov(pairwise=True), - lambda x: x.rolling(window=3).corr(pairwise=True), - lambda x: x.ewm(com=3).cov(pairwise=True), - lambda x: x.ewm(com=3).corr(pairwise=True)]) + "f", + [ + lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True), + ], + ) def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True @@ -2024,11 +2402,10 @@ def test_pairwise_with_self(self, f): results = [] for i, df in enumerate(self.df1s): result = f(df) - tm.assert_index_equal(result.index.levels[0], - df.index, - check_names=False) - tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), - safe_sort(df.columns.unique())) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) + ) tm.assert_index_equal(result.columns, df.columns) results.append(df) @@ -2037,12 +2414,16 @@ def test_pairwise_with_self(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x: x.expanding().cov(pairwise=False), - lambda x: x.expanding().corr(pairwise=False), - lambda x: x.rolling(window=3).cov(pairwise=False), - lambda x: x.rolling(window=3).corr(pairwise=False), - lambda x: x.ewm(com=3).cov(pairwise=False), - lambda x: x.ewm(com=3).corr(pairwise=False), ]) + "f", + [ + lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), + ], + 
) def test_no_pairwise_with_self(self, f): # DataFrame with itself, pairwise=False @@ -2055,38 +2436,46 @@ def test_no_pairwise_with_self(self, f): self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y, pairwise=True), - lambda x, y: x.expanding().corr(y, pairwise=True), - lambda x, y: x.rolling(window=3).cov(y, pairwise=True), - lambda x, y: x.rolling(window=3).corr(y, pairwise=True), - lambda x, y: x.ewm(com=3).cov(y, pairwise=True), - lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]) + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), + ], + ) def test_pairwise_with_other(self, f): # DataFrame with another DataFrame, pairwise=True results = [f(df, self.df2) for df in self.df1s] for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index.levels[0], - df.index, - check_names=False) - tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), - safe_sort(self.df2.columns.unique())) + tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) + ) for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y, pairwise=False), - lambda x, y: x.expanding().corr(y, pairwise=False), - lambda x, y: x.rolling(window=3).cov(y, pairwise=False), - lambda x, y: x.rolling(window=3).corr(y, pairwise=False), - lambda x, y: x.ewm(com=3).cov(y, pairwise=False), - lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]) + "f", + [ + lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), + ], + ) def test_no_pairwise_with_other(self, f): # DataFrame with another DataFrame, pairwise=False - results = [f(df, self.df2) if df.columns.is_unique else None - for df in self.df1s] + results = [ + f(df, self.df2) if df.columns.is_unique else None for df in self.df1s + ] for (df, result) in zip(self.df1s, results): if result is not None: with catch_warnings(record=True): @@ -2097,25 +2486,28 @@ def test_no_pairwise_with_other(self, f): tm.assert_index_equal(result.index, expected_index) tm.assert_index_equal(result.columns, expected_columns) else: - with pytest.raises(ValueError, - match="'arg1' columns are not unique"): + with pytest.raises(ValueError, match="'arg1' columns are not unique"): f(df, self.df2) - with pytest.raises(ValueError, - match="'arg2' columns are not unique"): + with pytest.raises(ValueError, match="'arg2' columns are not unique"): f(self.df2, df) @pytest.mark.parametrize( - 'f', [lambda x, y: x.expanding().cov(y), - lambda x, y: x.expanding().corr(y), - lambda x, y: x.rolling(window=3).cov(y), - lambda x, y: x.rolling(window=3).corr(y), - lambda x, y: x.ewm(com=3).cov(y), - lambda x, y: x.ewm(com=3).corr(y), ]) + "f", + [ + lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: 
x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), + ], + ) def test_pairwise_with_series(self, f): # DataFrame with a Series - results = ([f(df, self.s) for df in self.df1s] + - [f(self.s, df) for df in self.df1s]) + results = [f(df, self.s) for df in self.df1s] + [ + f(self.s, df) for df in self.df1s + ] for (df, result) in zip(self.df1s, results): tm.assert_index_equal(result.index, df.index) tm.assert_index_equal(result.columns, df.columns) @@ -2127,42 +2519,135 @@ def test_pairwise_with_series(self, f): # create the data only once as we are not setting it def _create_consistency_data(): def create_series(): - return [Series(), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.]), - Series([np.nan, 3.]), - Series([3., np.nan]), - Series([1., 3.]), - Series([2., 2.]), - Series([3., 1.]), - Series([5., 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan, - np.nan]), - Series([np.nan, 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., - np.nan, np.nan]), - Series([np.nan, np.nan, 5., 5., np.nan, np.nan, np.nan, 5., 5., - np.nan, np.nan]), - Series([np.nan, 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., - 12., 13., 14., 15.]), - Series([np.nan, 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., - 12., 13., 14., 15.]), - Series([2., 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7., - 12., 13., 14., 15.]), - Series([2., 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3., - 12., 13., 14., 15.]), - Series(range(10)), - Series(range(20, 0, -2)), ] + return [ + Series(), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] def create_dataframes(): - return ([DataFrame(), - DataFrame(columns=['a']), - DataFrame(columns=['a', 'a']), - DataFrame(columns=['a', 'b']), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), - columns=['a', 'b', 99, 'd', 'd'])] + - [DataFrame(s) for s in create_series()]) + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] def is_constant(x): values = x.values.ravel() @@ -2191,40 +2676,37 @@ def _rolling_consistency_cases(): class TestMomentsConsistency(Base): base_functions = [ - (lambda v: Series(v).count(), None, 'count'), - (lambda v: 
Series(v).max(), None, 'max'), - (lambda v: Series(v).min(), None, 'min'), - (lambda v: Series(v).sum(), None, 'sum'), - (lambda v: Series(v).mean(), None, 'mean'), - (lambda v: Series(v).std(), 1, 'std'), - (lambda v: Series(v).cov(Series(v)), None, 'cov'), - (lambda v: Series(v).corr(Series(v)), None, 'corr'), - (lambda v: Series(v).var(), 1, 'var'), - + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), # restore once GH 8086 is fixed # lambda v: Series(v).skew(), 3, 'skew'), # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed # lambda v: Series(v).quantile(0.3), None, 'quantile'), - - (lambda v: Series(v).median(), None, 'median'), - (np.nanmax, 1, 'max'), - (np.nanmin, 1, 'min'), - (np.nansum, 1, 'sum'), - (np.nanmean, 1, 'mean'), - (lambda v: np.nanstd(v, ddof=1), 1, 'std'), - (lambda v: np.nanvar(v, ddof=1), 1, 'var'), - (np.nanmedian, 1, 'median'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), ] no_nan_functions = [ - (np.max, None, 'max'), - (np.min, None, 'min'), - (np.sum, None, 'sum'), - (np.mean, None, 'mean'), - (lambda v: np.std(v, ddof=1), 1, 'std'), - (lambda v: np.var(v, ddof=1), 1, 'var'), - (np.median, None, 'median'), + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), ] def _create_data(self): @@ -2234,11 +2716,21 @@ def _create_data(self): def setup_method(self, method): self._create_data() - def _test_moments_consistency(self, min_periods, count, mean, mock_mean, - corr, var_unbiased=None, std_unbiased=None, - cov_unbiased=None, var_biased=None, - std_biased=None, cov_biased=None, - var_debiasing_factors=None): + def _test_moments_consistency( + self, + min_periods, + count, + mean, + mock_mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + var_debiasing_factors=None, + ): def _non_null_values(x): values = x.values.ravel() return set(values[notna(values)].tolist()) @@ -2250,7 +2742,7 @@ def _non_null_values(x): if mock_mean: # check that mean equals mock_mean expected = mock_mean(x) - assert_equal(mean_x, expected.astype('float64')) + assert_equal(mean_x, expected.astype("float64")) # check that correlation of a series with itself is either 1 or NaN corr_x_x = corr(x, x) @@ -2275,12 +2767,12 @@ def _non_null_values(x): var_unbiased_x = var_unbiased(x) var_biased_x = var_biased(x) var_debiasing_factors_x = var_debiasing_factors(x) - assert_equal(var_unbiased_x, var_biased_x * - var_debiasing_factors_x) + assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - for (std, var, cov) in [(std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased) - ]: + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: # check that 
var(x), std(x), and cov(x) are all >= 0 var_x = var(x) @@ -2306,7 +2798,7 @@ def _non_null_values(x): # check that variance of constant series is identically 0 assert not (var_x > 0).any().any() expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0. + expected[count_x >= max(min_periods, 1)] = 0.0 if var is var_unbiased: expected[count_x < 2] = np.nan assert_equal(var_x, expected) @@ -2333,8 +2825,7 @@ def _non_null_values(x): # var(y)) / 2 var_x_plus_y = var(x + y) var_y = var(y) - assert_equal(cov_x_y, 0.5 * - (var_x_plus_y - var_x - var_y)) + assert_equal(cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y)) # check that corr(x, y) == cov(x, y) / (std(x) * # std(y)) @@ -2346,146 +2837,160 @@ def _non_null_values(x): # mean(x)*mean(y) mean_y = mean(y) mean_x_times_y = mean(x * y) - assert_equal(cov_x_y, mean_x_times_y - - (mean_x * mean_y)) + assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) @pytest.mark.slow - @pytest.mark.parametrize('min_periods', [0, 1, 2, 3, 4]) - @pytest.mark.parametrize('adjust', [True, False]) - @pytest.mark.parametrize('ignore_na', [True, False]) + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) def test_ewm_consistency(self, min_periods, adjust, ignore_na): def _weights(s, com, adjust, ignore_na): if isinstance(s, DataFrame): if not len(s.columns): return DataFrame(index=s.index, columns=s.columns) - w = concat([ - _weights(s.iloc[:, i], com=com, adjust=adjust, - ignore_na=ignore_na) - for i, _ in enumerate(s.columns)], axis=1) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) w.index = s.index w.columns = s.columns return w w = Series(np.nan, index=s.index) - alpha = 1. / (1. + com) + alpha = 1.0 / (1.0 + com) if ignore_na: - w[s.notna()] = _weights(s[s.notna()], com=com, - adjust=adjust, ignore_na=False) + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) elif adjust: for i in range(len(s)): if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1. / (1. - alpha), i) + w.iat[i] = pow(1.0 / (1.0 - alpha), i) else: - sum_wts = 0. + sum_wts = 0.0 prev_i = -1 for i in range(len(s)): if s.iat[i] == s.iat[i]: if prev_i == -1: - w.iat[i] = 1. + w.iat[i] = 1.0 else: - w.iat[i] = alpha * sum_wts / pow(1. - alpha, - i - prev_i) + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) sum_wts += w.iat[i] prev_i = i return w def _variance_debiasing_factors(s, com, adjust, ignore_na): weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method='ffill') - cum_sum_sq = (weights * weights).cumsum().fillna(method='ffill') + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") numerator = cum_sum * cum_sum denominator = numerator - cum_sum_sq - denominator[denominator <= 0.] 
= np.nan + denominator[denominator <= 0.0] = np.nan return numerator / denominator def _ewma(s, com, min_periods, adjust, ignore_na): weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = s.multiply(weights).cumsum().divide(weights.cumsum( - )).fillna(method='ffill') - result[s.expanding().count() < (max(min_periods, 1) if min_periods - else 1)] = np.nan + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan return result - com = 3. + com = 3.0 # test consistency between different ewm* moments self._test_moments_consistency( min_periods=min_periods, count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).mean(), - mock_mean=lambda x: _ewma(x, com=com, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na), - corr=lambda x, y: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).corr(y), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), var_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).var(bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), std_unbiased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .std(bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), cov_unbiased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=False)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), var_biased=lambda x: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .var(bias=True)), - std_biased=lambda x: x.ewm(com=com, min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na).std(bias=True), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), cov_biased=lambda x, y: ( - x.ewm(com=com, min_periods=min_periods, - adjust=adjust, ignore_na=ignore_na) - .cov(y, bias=True)), + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), var_debiasing_factors=lambda x: ( - _variance_debiasing_factors(x, com=com, adjust=adjust, - ignore_na=ignore_na))) + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) @pytest.mark.slow - @pytest.mark.parametrize( - 'min_periods', [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) def test_expanding_consistency(self, min_periods): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + 
category=RuntimeWarning, + ) # test consistency between different expanding_* moments self._test_moments_consistency( min_periods=min_periods, count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding( - min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding( - min_periods=min_periods).sum() / x.expanding().count(), - corr=lambda x, y: x.expanding( - min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding( - min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding( - min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding( - min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding( - min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding( - min_periods=min_periods).cov(y, ddof=0), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), var_debiasing_factors=lambda x: ( - x.expanding().count() / - (x.expanding().count() - 1.) - .replace(0., np.nan))) + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) # test consistency between expanding_xyz() and either (a) # expanding_apply of Series.xyz(), or (b) expanding_apply of @@ -2497,92 +3002,111 @@ def test_expanding_consistency(self, min_periods): if no_nans: functions = self.base_functions + self.no_nan_functions for (f, require_min_periods, name) in functions: - expanding_f = getattr( - x.expanding(min_periods=min_periods), name) + expanding_f = getattr(x.expanding(min_periods=min_periods), name) - if (require_min_periods and - (min_periods is not None) and - (min_periods < require_min_periods)): + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): continue - if name == 'count': + if name == "count": expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=0).apply(func=f, raw=True) + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) else: - if name in ['cov', 'corr']: - expanding_f_result = expanding_f( - pairwise=False) + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) else: expanding_f_result = expanding_f() expanding_apply_f_result = x.expanding( - min_periods=min_periods).apply(func=f, raw=True) + min_periods=min_periods + ).apply(func=f, raw=True) # GH 9422 - if name in ['sum', 'prod']: - assert_equal(expanding_f_result, - expanding_apply_f_result) + if name in ["sum", "prod"]: + assert_equal(expanding_f_result, expanding_apply_f_result) @pytest.mark.slow @pytest.mark.parametrize( - 'window,min_periods,center', list(_rolling_consistency_cases())) + "window,min_periods,center", list(_rolling_consistency_cases()) + ) def test_rolling_consistency(self, window, min_periods, center): # suppress warnings about empty slices, as we are deliberately testing # with empty/0-length Series/DataFrames 
with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning) + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) # test consistency between different rolling_* moments self._test_moments_consistency( min_periods=min_periods, - count=lambda x: ( - x.rolling(window=window, center=center) - .count()), + count=lambda x: (x.rolling(window=window, center=center).count()), mean=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).mean()), + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), mock_mean=lambda x: ( - x.rolling(window=window, - min_periods=min_periods, - center=center).sum() - .divide(x.rolling(window=window, - min_periods=min_periods, - center=center).count())), + x.rolling(window=window, min_periods=min_periods, center=center) + .sum() + .divide( + x.rolling( + window=window, min_periods=min_periods, center=center + ).count() + ) + ), corr=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).corr(y)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), var_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var()), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), std_unbiased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std()), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).std() + ), cov_unbiased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y) + ), var_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).var(ddof=0)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), std_biased=lambda x: ( - x.rolling(window=window, min_periods=min_periods, - center=center).std(ddof=0)), - + x.rolling( + window=window, min_periods=min_periods, center=center + ).std(ddof=0) + ), cov_biased=lambda x, y: ( - x.rolling(window=window, min_periods=min_periods, - center=center).cov(y, ddof=0)), + x.rolling( + window=window, min_periods=min_periods, center=center + ).cov(y, ddof=0) + ), var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center).count() - .divide((x.rolling(window=window, center=center) - .count() - 1.) 
- .replace(0., np.nan)))) + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) # test consistency between rolling_xyz() and either (a) # rolling_apply of Series.xyz(), or (b) rolling_apply of @@ -2595,33 +3119,36 @@ def test_rolling_consistency(self, window, min_periods, center): functions = self.base_functions + self.no_nan_functions for (f, require_min_periods, name) in functions: rolling_f = getattr( - x.rolling(window=window, center=center, - min_periods=min_periods), name) + x.rolling( + window=window, center=center, min_periods=min_periods + ), + name, + ) - if require_min_periods and ( - min_periods is not None) and ( - min_periods < require_min_periods): + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): continue - if name == 'count': + if name == "count": rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( - window=window, min_periods=0, - center=center).apply(func=f, raw=True) + window=window, min_periods=0, center=center + ).apply(func=f, raw=True) else: - if name in ['cov', 'corr']: - rolling_f_result = rolling_f( - pairwise=False) + if name in ["cov", "corr"]: + rolling_f_result = rolling_f(pairwise=False) else: rolling_f_result = rolling_f() rolling_apply_f_result = x.rolling( - window=window, min_periods=min_periods, - center=center).apply(func=f, raw=True) + window=window, min_periods=min_periods, center=center + ).apply(func=f, raw=True) # GH 9422 - if name in ['sum', 'prod']: - assert_equal(rolling_f_result, - rolling_apply_f_result) + if name in ["sum", "prod"]: + assert_equal(rolling_f_result, rolling_apply_f_result) # binary moments def test_rolling_cov(self): @@ -2632,7 +3159,7 @@ def test_rolling_cov(self): tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_cov_pairwise(self): - self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5) + self._check_pairwise_moment("rolling", "cov", window=10, min_periods=5) def test_rolling_corr(self): A = self.series @@ -2651,10 +3178,9 @@ def test_rolling_corr(self): tm.assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): - self._check_pairwise_moment('rolling', 'corr', window=10, - min_periods=5) + self._check_pairwise_moment("rolling", "corr", window=10, min_periods=5) - @pytest.mark.parametrize('window', range(7)) + @pytest.mark.parametrize("window", range(7)) def test_rolling_corr_with_zero_variance(self, window): # GH 18430 s = pd.Series(np.zeros(20)) @@ -2675,17 +3201,27 @@ def get_result(obj, obj2=None): def test_flex_binary_moment(self): # GH3155 # don't blow the stack - msg = ("arguments to moment function must be of type" - " np.ndarray/Series/DataFrame") + msg = ( + "arguments to moment function must be of type" + " np.ndarray/Series/DataFrame" + ) with pytest.raises(TypeError, match=msg): rwindow._flex_binary_moment(5, 6, None) def test_corr_sanity(self): # GH 3155 - df = DataFrame(np.array( - [[0.87024726, 0.18505595], [0.64355431, 0.3091617], - [0.92372966, 0.50552513], [0.00203756, 0.04520709], - [0.84780328, 0.33394331], [0.78369152, 0.63919667]])) + df = DataFrame( + np.array( + [ + [0.87024726, 0.18505595], + [0.64355431, 0.3091617], + [0.92372966, 0.50552513], + [0.00203756, 0.04520709], + [0.84780328, 0.33394331], + [0.78369152, 0.63919667], + ] + ) + ) res = df[0].rolling(5, center=True).corr(df[1]) assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res) @@ 
-2699,14 +3235,13 @@ def test_corr_sanity(self): except AssertionError: print(res) - @pytest.mark.parametrize('method', ['corr', 'cov']) + @pytest.mark.parametrize("method", ["corr", "cov"]) def test_flex_binary_frame(self, method): series = self.frame[1] res = getattr(series.rolling(window=10), method)(self.frame) res2 = getattr(self.frame.rolling(window=10), method)(series) - exp = self.frame.apply(lambda x: getattr( - series.rolling(window=10), method)(x)) + exp = self.frame.apply(lambda x: getattr(series.rolling(window=10), method)(x)) tm.assert_frame_equal(res, exp) tm.assert_frame_equal(res2, exp) @@ -2715,21 +3250,25 @@ def test_flex_binary_frame(self, method): frame2.values[:] = np.random.randn(*frame2.shape) res3 = getattr(self.frame.rolling(window=10), method)(frame2) - exp = DataFrame({k: getattr(self.frame[k].rolling( - window=10), method)(frame2[k]) for k in self.frame}) + exp = DataFrame( + { + k: getattr(self.frame[k].rolling(window=10), method)(frame2[k]) + for k in self.frame + } + ) tm.assert_frame_equal(res3, exp) def test_ewmcov(self): - self._check_binary_ew('cov') + self._check_binary_ew("cov") def test_ewmcov_pairwise(self): - self._check_pairwise_moment('ewm', 'cov', span=10, min_periods=5) + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) def test_ewmcorr(self): - self._check_binary_ew('corr') + self._check_binary_ew("corr") def test_ewmcorr_pairwise(self): - self._check_pairwise_moment('ewm', 'corr', span=10, min_periods=5) + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) def _check_binary_ew(self, name): def func(A, B, com, **kwargs): @@ -2758,8 +3297,7 @@ def func(A, B, com, **kwargs): tm.assert_series_equal(result, Series([])) # check series of length 1 - result = func( - Series([1.]), Series([1.]), 50, min_periods=min_periods) + result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) tm.assert_series_equal(result, Series([np.NaN])) msg = "Input arrays must be of the same type!" @@ -2768,22 +3306,17 @@ def func(A, B, com, **kwargs): func(A, randn(50), 20, min_periods=5) def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): return np.mean(x) + const df = DataFrame(np.random.rand(20, 3)) - expected = df.expanding().apply(np.mean, raw=raw) + 20. 
+ expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - result = df.expanding().apply(mean_w_arg, - raw=raw, - args=(20, )) + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) tm.assert_frame_equal(result, expected) - result = df.expanding().apply(mean_w_arg, - raw=raw, - kwargs={'const': 20}) + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) tm.assert_frame_equal(result, expected) def test_expanding_corr(self): @@ -2798,14 +3331,16 @@ def test_expanding_corr(self): def test_expanding_count(self): result = self.series.expanding().count() - tm.assert_almost_equal(result, self.series.rolling( - window=len(self.series)).count()) + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) def test_expanding_quantile(self): result = self.series.expanding().quantile(0.5) - rolling_result = self.series.rolling(window=len(self.series), - min_periods=1).quantile(0.5) + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) tm.assert_almost_equal(result, rolling_result) @@ -2822,16 +3357,18 @@ def test_expanding_cov(self): def test_expanding_cov_pairwise(self): result = self.frame.expanding().corr() - rolling_result = self.frame.rolling(window=len(self.frame), - min_periods=1).corr() + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() tm.assert_frame_equal(result, rolling_result) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() - rolling_result = self.frame.rolling(window=len(self.frame), - min_periods=1).corr() + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() tm.assert_frame_equal(result, rolling_result) def test_expanding_cov_diff_index(self): @@ -2867,7 +3404,7 @@ def test_expanding_corr_diff_index(self): s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.]) + expected = Series([None, None, None, 1.0]) tm.assert_series_equal(result, expected) def test_rolling_cov_diff_length(self): @@ -2895,12 +3432,10 @@ def test_rolling_corr_diff_length(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - 'f', + "f", [ - lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2909,20 +3444,18 @@ def test_rolling_corr_diff_length(self): lambda x: x.rolling(window=10, min_periods=5).var(), lambda x: x.rolling(window=10, min_periods=5).skew(), lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling( - window=10, min_periods=5).quantile(quantile=0.5), + lambda x: x.rolling(window=10, min_periods=5).quantile(quantile=0.5), lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(win_type='boxcar', - window=10, min_periods=5).mean()]) + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda 
x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ], + ) def test_rolling_functions_window_non_shrinkage(self, f): # GH 7764 s = Series(range(4)) s_expected = Series(np.nan, index=s.index) - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) try: @@ -2939,18 +3472,22 @@ def test_rolling_functions_window_non_shrinkage(self, f): def test_rolling_functions_window_non_shrinkage_binary(self): # corr/cov return a MI DataFrame - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], - columns=Index(['A', 'B'], name='foo'), - index=Index(range(4), name='bar')) + df = DataFrame( + [[1, 5], [3, 2], [3, 9], [-1, 0]], + columns=Index(["A", "B"], name="foo"), + index=Index(range(4), name="bar"), + ) df_expected = DataFrame( - columns=Index(['A', 'B'], name='foo'), - index=pd.MultiIndex.from_product([df.index, df.columns], - names=['bar', 'foo']), - dtype='float64') - functions = [lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=True))] + columns=Index(["A", "B"], name="foo"), + index=pd.MultiIndex.from_product( + [df.index, df.columns], names=["bar", "foo"] + ), + dtype="float64", + ) + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] for f in functions: df_result = f(df) tm.assert_frame_equal(df_result, df_expected) @@ -2961,52 +3498,43 @@ def test_moment_functions_zero_length(self): s_expected = s df1 = DataFrame() df1_expected = df1 - df2 = DataFrame(columns=['a']) - df2['a'] = df2['a'].astype('float64') + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") df2_expected = df2 - functions = [lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov( - x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr( - x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply( - sum, raw=False), - lambda x: x.expanding(min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov( - x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr( - x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling( - window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply( - sum, raw=False), - lambda x: x.rolling(window=10, 
min_periods=5).apply( - sum, raw=True), - lambda x: x.rolling(win_type='boxcar', - window=10, min_periods=5).mean(), - ] + functions = [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] for f in functions: try: s_result = f(s) @@ -3026,28 +3554,27 @@ def test_moment_functions_zero_length_pairwise(self): df1 = DataFrame() df1_expected = df1 - df2 = DataFrame(columns=Index(['a'], name='foo'), - index=Index([], name='bar')) - df2['a'] = df2['a'].astype('float64') + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") df1_expected = DataFrame( index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([])) + columns=Index([]), + ) df2_expected = DataFrame( - index=pd.MultiIndex.from_product([df2.index, df2.columns], - names=['bar', 'foo']), - columns=Index(['a'], name='foo'), - dtype='float64') - - functions = [lambda x: (x.expanding(min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5) - .corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5) - .corr(x, pairwise=True)), - ] + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] for f in functions: df1_result = f(df1) tm.assert_frame_equal(df1_result, df1_expected) @@ -3057,25 +3584,27 @@ def test_moment_functions_zero_length_pairwise(self): def test_expanding_cov_pairwise_diff_length(self): # GH 7512 - df1 = 
DataFrame([[1, 5], [3, 2], [3, 9]], - columns=Index(['A', 'B'], name='foo')) - df1a = DataFrame([[1, 5], [3, 9]], - index=[0, 2], - columns=Index(['A', 'B'], name='foo')) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], - columns=Index(['X', 'Y'], name='foo')) - df2a = DataFrame([[5, 6], [2, 1]], - index=[0, 2], - columns=Index(['X', 'Y'], name='foo')) + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) # TODO: xref gh-15826 # .loc is not preserving the names result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(['A', 'B'], name='foo'), - index=Index(['X', 'Y'], name='foo')) + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -3083,25 +3612,29 @@ def test_expanding_cov_pairwise_diff_length(self): def test_expanding_corr_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 2], [3, 2], [3, 4]], - columns=['A', 'B'], - index=Index(range(3), name='bar')) - df1a = DataFrame([[1, 2], [3, 4]], - index=Index([0, 2], name='bar'), - columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], - columns=['X', 'Y'], - index=Index(range(3), name='bar')) - df2a = DataFrame([[5, 6], [2, 1]], - index=Index([0, 2], name='bar'), - columns=['X', 'Y']) + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) result1 = df1.expanding().corr(df2, pairwise=True).loc[2] result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], - columns=['A', 'B'], - index=Index(['X', 'Y'])) + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -3122,8 +3655,7 @@ def test_rolling_skew_edge_cases(self): tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] - d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 - ]) + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) x = d.rolling(window=4).skew() tm.assert_series_equal(expected, x) @@ -3143,8 +3675,7 @@ def test_rolling_kurt_edge_cases(self): tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] - d = Series([-1.50837035, -0.1297039, 0.19501095, 
1.73508164, 0.41941401 - ]) + d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401]) expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) x = d.rolling(window=4).kurt() tm.assert_series_equal(expected, x) @@ -3159,20 +3690,19 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize('func,static_comp', [('sum', np.sum), - ('mean', np.mean), - ('max', np.max), - ('min', np.min)], - ids=['sum', 'mean', 'max', 'min']) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) def test_expanding_func(self, func, static_comp): def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, - center=center, axis=axis) + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) return getattr(exp, func)() + self._check_expanding(expanding_func, static_comp, preserve_nan=False) def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): exp = x.expanding(min_periods=min_periods) @@ -3184,17 +3714,22 @@ def expanding_mean(x, min_periods=1): self._check_expanding(expanding_mean, np.mean, preserve_nan=False) ser = Series([]) - tm.assert_series_equal(ser, ser.expanding().apply( - lambda x: x.mean(), raw=raw)) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) # GH 8080 s = Series([None, None, None]) result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1., 2., 3.]) + expected = Series([1.0, 2.0, 3.0]) tm.assert_series_equal(result, expected) - def _check_expanding(self, func, static_comp, has_min_periods=True, - has_time_rule=True, preserve_nan=True): + def _check_expanding( + self, + func, + static_comp, + has_min_periods=True, + has_time_rule=True, + preserve_nan=True, + ): series_result = func(self.series) assert isinstance(series_result, Series) @@ -3244,9 +3779,11 @@ def test_rolling_max_gh6297(self): # Sort chronologically series = series.sort_index() - expected = Series([1.0, 2.0, 6.0, 4.0, 5.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').max().rolling(window=1).max() + expected = Series( + [1.0, 2.0, 6.0, 4.0, 5.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_max_resample(self): @@ -3262,22 +3799,28 @@ def test_rolling_max_resample(self): series = series.sort_index() # Default how should be max - expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').max().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 20.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").max().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify median (10.0) - expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').median().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).max() tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0 + 10.0 + 20.0) / 3.0 - expected = Series([0.0, 1.0, 2.0, 3.0, v], - index=[datetime(1975, 1, 
i, 0) for i in range(1, 6)]) - x = series.resample('D').mean().rolling(window=1).max() + expected = Series( + [0.0, 1.0, 2.0, 3.0, v], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").mean().rolling(window=1).max() tm.assert_series_equal(expected, x) def test_rolling_min_resample(self): @@ -3293,9 +3836,11 @@ def test_rolling_min_resample(self): series = series.sort_index() # Default how should be min - expected = Series([0.0, 1.0, 2.0, 3.0, 4.0], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - r = series.resample('D').min().rolling(window=1) + expected = Series( + [0.0, 1.0, 2.0, 3.0, 4.0], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + r = series.resample("D").min().rolling(window=1) tm.assert_series_equal(expected, r.min()) def test_rolling_median_resample(self): @@ -3311,9 +3856,11 @@ def test_rolling_median_resample(self): series = series.sort_index() # Default how should be median - expected = Series([0.0, 1.0, 2.0, 3.0, 10], - index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) - x = series.resample('D').median().rolling(window=1).median() + expected = Series( + [0.0, 1.0, 2.0, 3.0, 10], + index=[datetime(1975, 1, i, 0) for i in range(1, 6)], + ) + x = series.resample("D").median().rolling(window=1).median() tm.assert_series_equal(expected, x) def test_rolling_median_memory_error(self): @@ -3326,41 +3873,42 @@ def test_rolling_min_max_numeric_types(self): # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] - types_test.extend([np.dtype("{}{}".format(sign, width)) - for width in [1, 2, 4, 8] for sign in "ui"]) + types_test.extend( + [ + np.dtype("{}{}".format(sign, width)) + for width in [1, 2, 4, 8] + for sign in "ui" + ] + ) for data_type in types_test: # Just testing that these don't throw exceptions and that # the return type is float64. 
Other tests will cover quantitative # correctness - result = (DataFrame(np.arange(20, dtype=data_type)) - .rolling(window=5).max()) + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).max() assert result.dtypes[0] == np.dtype("f8") - result = (DataFrame(np.arange(20, dtype=data_type)) - .rolling(window=5).min()) + result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") class TestGrouperGrouping: - def setup_method(self, method): self.series = Series(np.arange(10)) - self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}) + self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) def test_mutated(self): msg = r"group\(\) got an unexpected keyword argument 'foo'" with pytest.raises(TypeError, match=msg): - self.frame.groupby('A', foo=1) + self.frame.groupby("A", foo=1) - g = self.frame.groupby('A') + g = self.frame.groupby("A") assert not g.mutated - g = self.frame.groupby('A', mutated=True) + g = self.frame.groupby("A", mutated=True) assert g.mutated def test_getitem(self): - g = self.frame.groupby('A') - g_mutated = self.frame.groupby('A', mutated=True) + g = self.frame.groupby("A") + g_mutated = self.frame.groupby("A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) @@ -3379,9 +3927,9 @@ def test_getitem(self): def test_getitem_multiple(self): # GH 13174 - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(2) - g_mutated = self.frame.groupby('A', mutated=True) + g_mutated = self.frame.groupby("A", mutated=True) expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() @@ -3391,16 +3939,16 @@ def test_getitem_multiple(self): tm.assert_series_equal(result, expected) def test_rolling(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=1) expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) tm.assert_frame_equal(result, expected) @@ -3410,14 +3958,15 @@ def test_rolling(self): tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ['corr', 'cov']: + for f in ["corr", "cov"]: result = getattr(r, f)(self.frame) def func(x): return getattr(x.rolling(4), f)(self.frame) + expected = g.apply(func) tm.assert_frame_equal(result, expected) @@ -3425,30 +3974,31 @@ def func(x): def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) + expected = g.apply(func) tm.assert_series_equal(result, expected) def test_rolling_apply(self, raw): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.rolling(window=4) # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply( - lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) + expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) def test_rolling_apply_mutability(self): # GH 14013 - df = pd.DataFrame({'A': ['foo'] * 3 + ['bar'] * 3, 'B': [1] * 6}) - g = df.groupby('A') + df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) + g = df.groupby("A") - mi = 
pd.MultiIndex.from_tuples([('bar', 3), ('bar', 4), ('bar', 5), - ('foo', 0), ('foo', 1), ('foo', 2)]) + mi = pd.MultiIndex.from_tuples( + [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)] + ) - mi.names = ['A', None] + mi.names = ["A", None] # Grouped column should not be a part of the output - expected = pd.DataFrame([np.nan, 2., 2.] * 2, columns=['B'], index=mi) + expected = pd.DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi) result = g.rolling(window=2).sum() tm.assert_frame_equal(result, expected) @@ -3461,16 +4011,16 @@ def test_rolling_apply_mutability(self): tm.assert_frame_equal(result, expected) def test_expanding(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() - for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) - for f in ['std', 'var']: + for f in ["std", "var"]: result = getattr(r, f)(ddof=0) expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) tm.assert_frame_equal(result, expected) @@ -3480,14 +4030,15 @@ def test_expanding(self): tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() - for f in ['corr', 'cov']: + for f in ["corr", "cov"]: result = getattr(r, f)(self.frame) def func(x): return getattr(x.expanding(), f)(self.frame) + expected = g.apply(func) tm.assert_frame_equal(result, expected) @@ -3495,17 +4046,17 @@ def func(x): def func(x): return getattr(x.B.expanding(), f)(pairwise=True) + expected = g.apply(func) tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw): - g = self.frame.groupby('A') + g = self.frame.groupby("A") r = g.expanding() # reduction result = r.apply(lambda x: x.sum(), raw=raw) - expected = g.apply( - lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) + expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) @@ -3516,28 +4067,33 @@ class TestRollingTS: def setup_method(self, method): - self.regular = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': range(5)}).set_index('A') + self.regular = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") - self.ragged = DataFrame({'B': range(5)}) - self.ragged.index = [Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')] + self.ragged = DataFrame({"B": range(5)}) + self.ragged.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] def test_doc_string(self): - df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=[Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')]) + df = DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=[ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ], + ) df - df.rolling('2s').sum() + df.rolling("2s").sum() def test_valid(self): @@ -3545,28 +4101,28 @@ def test_valid(self): 
# not a valid freq with pytest.raises(ValueError): - df.rolling(window='foobar') + df.rolling(window="foobar") # not a datetimelike index with pytest.raises(ValueError): - df.reset_index().rolling(window='foobar') + df.reset_index().rolling(window="foobar") # non-fixed freqs - for freq in ['2MS', pd.offsets.MonthBegin(2)]: + for freq in ["2MS", pd.offsets.MonthBegin(2)]: with pytest.raises(ValueError): df.rolling(window=freq) - for freq in ['1D', pd.offsets.Day(2), '2ms']: + for freq in ["1D", pd.offsets.Day(2), "2ms"]: df.rolling(window=freq) # non-integer min_periods - for minp in [1.0, 'foo', np.array([1, 2, 3])]: + for minp in [1.0, "foo", np.array([1, 2, 3])]: with pytest.raises(ValueError): - df.rolling(window='1D', min_periods=minp) + df.rolling(window="1D", min_periods=minp) # center is not implemented with pytest.raises(NotImplementedError): - df.rolling(window='1D', center=True) + df.rolling(window="1D", center=True) def test_on(self): @@ -3574,71 +4130,66 @@ def test_on(self): # not a valid column with pytest.raises(ValueError): - df.rolling(window='2s', on='foobar') + df.rolling(window="2s", on="foobar") # column is valid df = df.copy() - df['C'] = pd.date_range('20130101', periods=len(df)) - df.rolling(window='2d', on='C').sum() + df["C"] = pd.date_range("20130101", periods=len(df)) + df.rolling(window="2d", on="C").sum() # invalid columns with pytest.raises(ValueError): - df.rolling(window='2d', on='B') + df.rolling(window="2d", on="B") # ok even though on non-selected - df.rolling(window='2d', on='C').B.sum() + df.rolling(window="2d", on="C").B.sum() def test_monotonic_on(self): # on/index must be monotonic - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': range(5)}) + df = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) assert df.A.is_monotonic - df.rolling('2s', on='A').sum() + df.rolling("2s", on="A").sum() - df = df.set_index('A') + df = df.set_index("A") assert df.index.is_monotonic - df.rolling('2s').sum() + df.rolling("2s").sum() # non-monotonic df.index = reversed(df.index.tolist()) assert not df.index.is_monotonic with pytest.raises(ValueError): - df.rolling('2s').sum() + df.rolling("2s").sum() df = df.reset_index() with pytest.raises(ValueError): - df.rolling('2s', on='A').sum() + df.rolling("2s", on="A").sum() def test_frame_on(self): - df = DataFrame({'B': range(5), - 'C': pd.date_range('20130101 09:00:00', - periods=5, - freq='3s')}) + df = DataFrame( + { + "B": range(5), + "C": pd.date_range("20130101 09:00:00", periods=5, freq="3s"), + } + ) - df['A'] = [Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')] + df["A"] = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] # we are doing simulating using 'on' - expected = (df.set_index('A') - .rolling('2s') - .B - .sum() - .reset_index(drop=True) - ) + expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True) - result = (df.rolling('2s', on='A') - .B - .sum() - ) + result = df.rolling("2s", on="A").B.sum() tm.assert_series_equal(result, expected) # test as a frame @@ -3647,66 +4198,70 @@ def test_frame_on(self): # so the columns need to be switched compared # to the actual result where they are ordered as in the # original - expected = (df.set_index('A') - .rolling('2s')[['B']] - .sum() - 
.reset_index()[['B', 'A']] - ) + expected = ( + df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]] + ) - result = (df.rolling('2s', on='A')[['B']] - .sum() - ) + result = df.rolling("2s", on="A")[["B"]].sum() tm.assert_frame_equal(result, expected) def test_frame_on2(self): # using multiple aggregation columns - df = DataFrame({'A': [0, 1, 2, 3, 4], - 'B': [0, 1, 2, np.nan, 4], - 'C': Index([Timestamp('20130101 09:00:00'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:05'), - Timestamp('20130101 09:00:06')])}, - columns=['A', 'C', 'B']) - - expected1 = DataFrame({'A': [0., 1, 3, 3, 7], - 'B': [0, 1, 3, np.nan, 4], - 'C': df['C']}, - columns=['A', 'C', 'B']) - - result = df.rolling('2s', on='C').sum() + df = DataFrame( + { + "A": [0, 1, 2, 3, 4], + "B": [0, 1, 2, np.nan, 4], + "C": Index( + [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + ), + }, + columns=["A", "C", "B"], + ) + + expected1 = DataFrame( + {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]}, + columns=["A", "C", "B"], + ) + + result = df.rolling("2s", on="C").sum() expected = expected1 tm.assert_frame_equal(result, expected) - expected = Series([0, 1, 3, np.nan, 4], name='B') - result = df.rolling('2s', on='C').B.sum() + expected = Series([0, 1, 3, np.nan, 4], name="B") + result = df.rolling("2s", on="C").B.sum() tm.assert_series_equal(result, expected) - expected = expected1[['A', 'B', 'C']] - result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + expected = expected1[["A", "B", "C"]] + result = df.rolling("2s", on="C")[["A", "B", "C"]].sum() tm.assert_frame_equal(result, expected) def test_basic_regular(self): df = self.regular.copy() - df.index = pd.date_range('20130101', periods=5, freq='D') + df.index = pd.date_range("20130101", periods=5, freq="D") expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='1D').sum() + result = df.rolling(window="1D").sum() tm.assert_frame_equal(result, expected) - df.index = pd.date_range('20130101', periods=5, freq='2D') + df.index = pd.date_range("20130101", periods=5, freq="2D") expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='2D', min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() tm.assert_frame_equal(result, expected) expected = df.rolling(window=1, min_periods=1).sum() - result = df.rolling(window='2D', min_periods=1).sum() + result = df.rolling(window="2D", min_periods=1).sum() tm.assert_frame_equal(result, expected) expected = df.rolling(window=1).sum() - result = df.rolling(window='2D').sum() + result = df.rolling(window="2D").sum() tm.assert_frame_equal(result, expected) def test_min_periods(self): @@ -3716,307 +4271,312 @@ def test_min_periods(self): # these slightly different expected = df.rolling(2, min_periods=1).sum() - result = df.rolling('2s').sum() + result = df.rolling("2s").sum() tm.assert_frame_equal(result, expected) expected = df.rolling(2, min_periods=1).sum() - result = df.rolling('2s', min_periods=1).sum() + result = df.rolling("2s", min_periods=1).sum() tm.assert_frame_equal(result, expected) def test_closed(self): # xref GH13965 - df = DataFrame({'A': [1] * 5}, - index=[Timestamp('20130101 09:00:01'), - Timestamp('20130101 09:00:02'), - Timestamp('20130101 09:00:03'), - Timestamp('20130101 09:00:04'), - Timestamp('20130101 09:00:06')]) + df = DataFrame( + {"A": 
[1] * 5}, + index=[ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:04"), + Timestamp("20130101 09:00:06"), + ], + ) # closed must be 'right', 'left', 'both', 'neither' with pytest.raises(ValueError): - self.regular.rolling(window='2s', closed="blabla") + self.regular.rolling(window="2s", closed="blabla") expected = df.copy() expected["A"] = [1.0, 2, 2, 2, 1] - result = df.rolling('2s', closed='right').sum() + result = df.rolling("2s", closed="right").sum() tm.assert_frame_equal(result, expected) # default should be 'right' - result = df.rolling('2s').sum() + result = df.rolling("2s").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [1.0, 2, 3, 3, 2] - result = df.rolling('2s', closed='both').sum() + result = df.rolling("2s", closed="both").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [np.nan, 1.0, 2, 2, 1] - result = df.rolling('2s', closed='left').sum() + result = df.rolling("2s", closed="left").sum() tm.assert_frame_equal(result, expected) expected = df.copy() expected["A"] = [np.nan, 1.0, 1, 1, np.nan] - result = df.rolling('2s', closed='neither').sum() + result = df.rolling("2s", closed="neither").sum() tm.assert_frame_equal(result, expected) def test_ragged_sum(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).sum() + result = df.rolling(window="1s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).sum() + result = df.rolling(window="2s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 3, 7] + expected["B"] = [0.0, 1, 3, 3, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=2).sum() + result = df.rolling(window="2s", min_periods=2).sum() expected = df.copy() - expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + expected["B"] = [np.nan, np.nan, 3, np.nan, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).sum() + result = df.rolling(window="3s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 5, 7] + expected["B"] = [0.0, 1, 3, 5, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s').sum() + result = df.rolling(window="3s").sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 5, 7] + expected["B"] = [0.0, 1, 3, 5, 7] tm.assert_frame_equal(result, expected) - result = df.rolling(window='4s', min_periods=1).sum() + result = df.rolling(window="4s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 6, 9] + expected["B"] = [0.0, 1, 3, 6, 9] tm.assert_frame_equal(result, expected) - result = df.rolling(window='4s', min_periods=3).sum() + result = df.rolling(window="4s", min_periods=3).sum() expected = df.copy() - expected['B'] = [np.nan, np.nan, 3, 6, 9] + expected["B"] = [np.nan, np.nan, 3, 6, 9] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).sum() + result = df.rolling(window="5s", min_periods=1).sum() expected = df.copy() - expected['B'] = [0.0, 1, 3, 6, 10] + expected["B"] = [0.0, 1, 3, 6, 10] tm.assert_frame_equal(result, expected) def test_ragged_mean(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).mean() + result = df.rolling(window="1s", min_periods=1).mean() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] 
+ expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).mean() + result = df.rolling(window="2s", min_periods=1).mean() expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_median(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).median() + result = df.rolling(window="1s", min_periods=1).median() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).median() + result = df.rolling(window="2s", min_periods=1).median() expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_quantile(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).quantile(0.5) + result = df.rolling(window="1s", min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).quantile(0.5) + result = df.rolling(window="2s", min_periods=1).quantile(0.5) expected = df.copy() - expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + expected["B"] = [0.0, 1, 1.5, 3.0, 3.5] tm.assert_frame_equal(result, expected) def test_ragged_std(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).std(ddof=0) + result = df.rolling(window="1s", min_periods=1).std(ddof=0) expected = df.copy() - expected['B'] = [0.0] * 5 + expected["B"] = [0.0] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='1s', min_periods=1).std(ddof=1) + result = df.rolling(window="1s", min_periods=1).std(ddof=1) expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).std(ddof=0) + result = df.rolling(window="3s", min_periods=1).std(ddof=0) expected = df.copy() - expected['B'] = [0.0] + [0.5] * 4 + expected["B"] = [0.0] + [0.5] * 4 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).std(ddof=1) + result = df.rolling(window="5s", min_periods=1).std(ddof=1) expected = df.copy() - expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] tm.assert_frame_equal(result, expected) def test_ragged_var(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).var(ddof=0) + result = df.rolling(window="1s", min_periods=1).var(ddof=0) expected = df.copy() - expected['B'] = [0.0] * 5 + expected["B"] = [0.0] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='1s', min_periods=1).var(ddof=1) + result = df.rolling(window="1s", min_periods=1).var(ddof=1) expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='3s', min_periods=1).var(ddof=0) + result = df.rolling(window="3s", min_periods=1).var(ddof=0) expected = df.copy() - expected['B'] = [0.0] + [0.25] * 4 + expected["B"] = [0.0] + [0.25] * 4 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).var(ddof=1) + result = df.rolling(window="5s", min_periods=1).var(ddof=1) expected = df.copy() - expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 
3.] + expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0] tm.assert_frame_equal(result, expected) def test_ragged_skew(self): df = self.ragged - result = df.rolling(window='3s', min_periods=1).skew() + result = df.rolling(window="3s", min_periods=1).skew() expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).skew() + result = df.rolling(window="5s", min_periods=1).skew() expected = df.copy() - expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0] tm.assert_frame_equal(result, expected) def test_ragged_kurt(self): df = self.ragged - result = df.rolling(window='3s', min_periods=1).kurt() + result = df.rolling(window="3s", min_periods=1).kurt() expected = df.copy() - expected['B'] = [np.nan] * 5 + expected["B"] = [np.nan] * 5 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).kurt() + result = df.rolling(window="5s", min_periods=1).kurt() expected = df.copy() - expected['B'] = [np.nan] * 4 + [-1.2] + expected["B"] = [np.nan] * 4 + [-1.2] tm.assert_frame_equal(result, expected) def test_ragged_count(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).count() + result = df.rolling(window="1s", min_periods=1).count() expected = df.copy() - expected['B'] = [1.0, 1, 1, 1, 1] + expected["B"] = [1.0, 1, 1, 1, 1] tm.assert_frame_equal(result, expected) df = self.ragged - result = df.rolling(window='1s').count() + result = df.rolling(window="1s").count() tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).count() + result = df.rolling(window="2s", min_periods=1).count() expected = df.copy() - expected['B'] = [1.0, 1, 2, 1, 2] + expected["B"] = [1.0, 1, 2, 1, 2] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=2).count() + result = df.rolling(window="2s", min_periods=2).count() expected = df.copy() - expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + expected["B"] = [np.nan, np.nan, 2, np.nan, 2] tm.assert_frame_equal(result, expected) def test_regular_min(self): - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': [0.0, 1, 2, 3, 4]}).set_index('A') - result = df.rolling('1s').min() + df = DataFrame( + { + "A": pd.date_range("20130101", periods=5, freq="s"), + "B": [0.0, 1, 2, 3, 4], + } + ).set_index("A") + result = df.rolling("1s").min() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - df = DataFrame({'A': pd.date_range('20130101', - periods=5, - freq='s'), - 'B': [5, 4, 3, 4, 5]}).set_index('A') + df = DataFrame( + {"A": pd.date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]} + ).set_index("A") tm.assert_frame_equal(result, expected) - result = df.rolling('2s').min() + result = df.rolling("2s").min() expected = df.copy() - expected['B'] = [5.0, 4, 3, 3, 4] + expected["B"] = [5.0, 4, 3, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling('5s').min() + result = df.rolling("5s").min() expected = df.copy() - expected['B'] = [5.0, 4, 3, 3, 3] + expected["B"] = [5.0, 4, 3, 3, 3] tm.assert_frame_equal(result, expected) def test_ragged_min(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).min() + result = df.rolling(window="1s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] 
tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).min() + result = df.rolling(window="2s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 1, 1, 3, 3] + expected["B"] = [0.0, 1, 1, 3, 3] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).min() + result = df.rolling(window="5s", min_periods=1).min() expected = df.copy() - expected['B'] = [0.0, 0, 0, 1, 1] + expected["B"] = [0.0, 0, 0, 1, 1] tm.assert_frame_equal(result, expected) def test_perf_min(self): N = 10000 - dfp = DataFrame({'B': np.random.randn(N)}, - index=pd.date_range('20130101', - periods=N, - freq='s')) + dfp = DataFrame( + {"B": np.random.randn(N)}, + index=pd.date_range("20130101", periods=N, freq="s"), + ) expected = dfp.rolling(2, min_periods=1).min() - result = dfp.rolling('2s').min() + result = dfp.rolling("2s").min() assert ((result - expected) < 0.01).all().bool() expected = dfp.rolling(200, min_periods=1).min() - result = dfp.rolling('200s').min() + result = dfp.rolling("200s").min() assert ((result - expected) < 0.01).all().bool() def test_ragged_max(self): df = self.ragged - result = df.rolling(window='1s', min_periods=1).max() + result = df.rolling(window="1s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).max() + result = df.rolling(window="2s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).max() + result = df.rolling(window="5s", min_periods=1).max() expected = df.copy() - expected['B'] = [0.0, 1, 2, 3, 4] + expected["B"] = [0.0, 1, 2, 3, 4] tm.assert_frame_equal(result, expected) def test_ragged_apply(self, raw): @@ -4024,19 +4584,19 @@ def test_ragged_apply(self, raw): df = self.ragged f = lambda x: 1 - result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. + expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. + expected["B"] = 1.0 tm.assert_frame_equal(result, expected) - result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw) + result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) expected = df.copy() - expected['B'] = 1. 
+ expected["B"] = 1.0 tm.assert_frame_equal(result, expected) def test_all(self): @@ -4044,10 +4604,20 @@ def test_all(self): # simple comparison of integer vs time-based windowing df = self.regular * 2 er = df.rolling(window=1) - r = df.rolling(window='1s') - - for f in ['sum', 'mean', 'count', 'median', 'std', - 'var', 'kurt', 'skew', 'min', 'max']: + r = df.rolling(window="1s") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: result = getattr(r, f)() expected = getattr(er, f)() @@ -4061,7 +4631,7 @@ def test_all_apply(self, raw): df = self.regular * 2 er = df.rolling(window=1) - r = df.rolling(window='1s') + r = df.rolling(window="1s") result = r.apply(lambda x: 1, raw=raw) expected = er.apply(lambda x: 1, raw=raw) @@ -4071,17 +4641,26 @@ def test_all2(self): # more sophisticated comparison of integer vs. # time-based windowing - df = DataFrame({'B': np.arange(50)}, - index=pd.date_range('20130101', - periods=50, freq='H') - ) + df = DataFrame( + {"B": np.arange(50)}, index=pd.date_range("20130101", periods=50, freq="H") + ) # in-range data dft = df.between_time("09:00", "16:00") - r = dft.rolling(window='5H') - - for f in ['sum', 'mean', 'count', 'median', 'std', - 'var', 'kurt', 'skew', 'min', 'max']: + r = dft.rolling(window="5H") + + for f in [ + "sum", + "mean", + "count", + "median", + "std", + "var", + "kurt", + "skew", + "min", + "max", + ]: result = getattr(r, f)() @@ -4092,8 +4671,12 @@ def test_all2(self): def agg_by_day(x): x = x.between_time("09:00", "16:00") return getattr(x.rolling(5, min_periods=1), f)() - expected = df.groupby(df.index.day).apply( - agg_by_day).reset_index(level=0, drop=True) + + expected = ( + df.groupby(df.index.day) + .apply(agg_by_day) + .reset_index(level=0, drop=True) + ) tm.assert_frame_equal(result, expected) @@ -4103,52 +4686,65 @@ def test_groupby_monotonic(self): # we don't need to validate monotonicity when grouping data = [ - ['David', '1/1/2015', 100], ['David', '1/5/2015', 500], - ['David', '5/30/2015', 50], ['David', '7/25/2015', 50], - ['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500], - ['Ryan', '3/31/2016', 50], ['Joe', '7/1/2015', 100], - ['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]] - - df = DataFrame(data=data, columns=['name', 'date', 'amount']) - df['date'] = pd.to_datetime(df['date']) - - expected = df.set_index('date').groupby('name').apply( - lambda x: x.rolling('180D')['amount'].sum()) - result = df.groupby('name').rolling('180D', on='date')['amount'].sum() + ["David", "1/1/2015", 100], + ["David", "1/5/2015", 500], + ["David", "5/30/2015", 50], + ["David", "7/25/2015", 50], + ["Ryan", "1/4/2014", 100], + ["Ryan", "1/19/2015", 500], + ["Ryan", "3/31/2016", 50], + ["Joe", "7/1/2015", 100], + ["Joe", "9/9/2015", 500], + ["Joe", "10/15/2015", 50], + ] + + df = DataFrame(data=data, columns=["name", "date", "amount"]) + df["date"] = pd.to_datetime(df["date"]) + + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) - dates = pd.date_range(start='2016-01-01 09:30:00', - periods=20, freq='s') - df = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.concatenate((dates, dates)), - 'C': np.arange(40)}) + dates = pd.date_range(start="2016-01-01 09:30:00", periods=20, freq="s") + df = DataFrame( + { + 
"A": [1] * 20 + [2] * 12 + [3] * 8, + "B": np.concatenate((dates, dates)), + "C": np.arange(40), + } + ) - result = df.groupby('A').rolling('4s', on='B').C.mean() - expected = df.set_index('B').groupby('A').apply( - lambda x: x.rolling('4s')['C'].mean()) + result = df.groupby("A").rolling("4s", on="B").C.mean() + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) tm.assert_series_equal(result, expected) - df2 = df.sort_values('B') - result = df2.groupby('A').rolling('4s', on='B').C.mean() + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) def test_rolling_cov_offset(self): # GH16058 - idx = pd.date_range('2017-01-01', periods=24, freq='1h') + idx = pd.date_range("2017-01-01", periods=24, freq="1h") ss = Series(np.arange(len(idx)), index=idx) - result = ss.rolling('2h').cov() + result = ss.rolling("2h").cov() expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx) tm.assert_series_equal(result, expected) expected2 = ss.rolling(2, min_periods=1).cov() tm.assert_series_equal(result, expected2) - result = ss.rolling('3h').cov() + result = ss.rolling("3h").cov() expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 6e3e768f9360f..bc1eee2a0aaf2 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -24,10 +24,7 @@ def transform(request): return request.param -@pytest.fixture(params=[ - 47393996303418497800, - 100000000000000000000 -]) +@pytest.fixture(params=[47393996303418497800, 100000000000000000000]) def large_val(request): return request.param @@ -37,19 +34,24 @@ def multiple_elts(request): return request.param -@pytest.fixture(params=[ - (lambda x: Index(x, name="idx"), tm.assert_index_equal), - (lambda x: Series(x, name="ser"), tm.assert_series_equal), - (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal) -]) +@pytest.fixture( + params=[ + (lambda x: Index(x, name="idx"), tm.assert_index_equal), + (lambda x: Series(x, name="ser"), tm.assert_series_equal), + (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal), + ] +) def transform_assert_equal(request): return request.param -@pytest.mark.parametrize("input_kwargs,result_kwargs", [ - (dict(), dict(dtype=np.int64)), - (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)) -]) +@pytest.mark.parametrize( + "input_kwargs,result_kwargs", + [ + (dict(), dict(dtype=np.int64)), + (dict(errors="coerce", downcast="integer"), dict(dtype=np.int8)), + ], +) def test_empty(input_kwargs, result_kwargs): # see gh-16302 ser = Series([], dtype=object) @@ -68,13 +70,15 @@ def test_series(last_val): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("data", [ - [1, 3, 4, 5], - [1., 3., 4., 5.], - - # Bool is regarded as numeric. - [True, False, True, True] -]) +@pytest.mark.parametrize( + "data", + [ + [1, 3, 4, 5], + [1.0, 3.0, 4.0, 5.0], + # Bool is regarded as numeric. 
+ [True, False, True, True], + ], +) def test_series_numeric(data): ser = Series(data, index=list("ABCD"), name="EFG") @@ -82,12 +86,16 @@ def test_series_numeric(data): tm.assert_series_equal(result, ser) -@pytest.mark.parametrize("data,msg", [ - ([1, -3.14, "apple"], - 'Unable to parse string "apple" at position 2'), - (["orange", 1, -3.14, "apple"], - 'Unable to parse string "orange" at position 0') -]) +@pytest.mark.parametrize( + "data,msg", + [ + ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'), + ( + ["orange", 1, -3.14, "apple"], + 'Unable to parse string "orange" at position 0', + ), + ], +) def test_error(data, msg): ser = Series(data) @@ -95,10 +103,9 @@ def test_error(data, msg): to_numeric(ser, errors="raise") -@pytest.mark.parametrize("errors,exp_data", [ - ("ignore", [1, -3.14, "apple"]), - ("coerce", [1, -3.14, np.nan]) -]) +@pytest.mark.parametrize( + "errors,exp_data", [("ignore", [1, -3.14, "apple"]), ("coerce", [1, -3.14, np.nan])] +) def test_ignore_error(errors, exp_data): ser = Series([1, -3.14, "apple"]) result = to_numeric(ser, errors=errors) @@ -107,13 +114,15 @@ def test_ignore_error(errors, exp_data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("errors,exp", [ - ("raise", 'Unable to parse string "apple" at position 2'), - ("ignore", [True, False, "apple"]), - - # Coerces to float. - ("coerce", [1., 0., np.nan]) -]) +@pytest.mark.parametrize( + "errors,exp", + [ + ("raise", 'Unable to parse string "apple" at position 2'), + ("ignore", [True, False, "apple"]), + # Coerces to float. + ("coerce", [1.0, 0.0, np.nan]), + ], +) def test_bool_handling(errors, exp): ser = Series([True, False, "apple"]) @@ -135,22 +144,22 @@ def test_list(): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("data,arr_kwargs", [ - ([1, 3, 4, 5], dict(dtype=np.int64)), - ([1., 3., 4., 5.], dict()), - - # Boolean is regarded as numeric. - ([True, False, True, True], dict()) -]) +@pytest.mark.parametrize( + "data,arr_kwargs", + [ + ([1, 3, 4, 5], dict(dtype=np.int64)), + ([1.0, 3.0, 4.0, 5.0], dict()), + # Boolean is regarded as numeric. + ([True, False, True, True], dict()), + ], +) def test_list_numeric(data, arr_kwargs): result = to_numeric(data) expected = np.array(data, **arr_kwargs) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("kwargs", [ - dict(dtype="O"), dict() -]) +@pytest.mark.parametrize("kwargs", [dict(dtype="O"), dict()]) def test_numeric(kwargs): data = [1, -3.14, 7] @@ -161,24 +170,25 @@ def test_numeric(kwargs): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("columns", [ - # One column. - "a", - - # Multiple columns. - ["a", "b"] -]) +@pytest.mark.parametrize( + "columns", + [ + # One column. + "a", + # Multiple columns. 
+ ["a", "b"], + ], +) def test_numeric_df_columns(columns): # see gh-14827 - df = DataFrame(dict( - a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], - b=[1.0, 2.0, 3.0, 4.0], - )) + df = DataFrame( + dict( + a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), "0.1"], + b=[1.0, 2.0, 3.0, 4.0], + ) + ) - expected = DataFrame(dict( - a=[1.2, 3.14, np.inf, 0.1], - b=[1.0, 2.0, 3.0, 4.0], - )) + expected = DataFrame(dict(a=[1.2, 3.14, np.inf, 0.1], b=[1.0, 2.0, 3.0, 4.0])) df_copy = df.copy() df_copy[columns] = df_copy[columns].apply(to_numeric) @@ -186,12 +196,16 @@ def test_numeric_df_columns(columns): tm.assert_frame_equal(df_copy, expected) -@pytest.mark.parametrize("data,exp_data", [ - ([[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], - [[3.14, 1.0], 1.6, 0.1]), - ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], - [[3.14, 1.0], 0.1]) -]) +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1], + [[3.14, 1.0], 1.6, 0.1], + ), + ([np.array([decimal.Decimal(3.14), 1.0]), 0.1], [[3.14, 1.0], 0.1]), + ], +) def test_numeric_embedded_arr_likes(data, exp_data): # Test to_numeric with embedded lists and arrays df = DataFrame(dict(a=data)) @@ -238,13 +252,11 @@ def test_really_large_scalar(large_val, signed, transform, errors): with pytest.raises(ValueError, match=msg): to_numeric(val, **kwargs) else: - expected = float(val) if (errors == "coerce" and - val_is_string) else val + expected = float(val) if (errors == "coerce" and val_is_string) else val tm.assert_almost_equal(to_numeric(val, **kwargs), expected) -def test_really_large_in_arr(large_val, signed, transform, - multiple_elts, errors): +def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): # see gh-24910 kwargs = dict(errors=errors) if errors is not None else dict() val = -large_val if signed else large_val @@ -283,8 +295,7 @@ def test_really_large_in_arr(large_val, signed, transform, tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) -def test_really_large_in_arr_consistent(large_val, signed, - multiple_elts, errors): +def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors): # see gh-24910 # # Even if we discover that we have to hold float, does not mean @@ -314,11 +325,14 @@ def test_really_large_in_arr_consistent(large_val, signed, tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) -@pytest.mark.parametrize("errors,checker", [ - ("raise", 'Unable to parse string "fail" at position 0'), - ("ignore", lambda x: x == "fail"), - ("coerce", lambda x: np.isnan(x)) -]) +@pytest.mark.parametrize( + "errors,checker", + [ + ("raise", 'Unable to parse string "fail" at position 0'), + ("ignore", lambda x: x == "fail"), + ("coerce", lambda x: np.isnan(x)), + ], +) def test_scalar_fail(errors, checker): scalar = "fail" @@ -329,10 +343,7 @@ def test_scalar_fail(errors, checker): assert checker(to_numeric(scalar, errors=errors)) -@pytest.mark.parametrize("data", [ - [1, 2, 3], - [1., np.nan, 3, np.nan] -]) +@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]]) def test_numeric_dtypes(data, transform_assert_equal): transform, assert_equal = transform_assert_equal data = transform(data) @@ -341,10 +352,13 @@ def test_numeric_dtypes(data, transform_assert_equal): assert_equal(result, data) -@pytest.mark.parametrize("data,exp", [ - (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")), - (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])) -]) 
+@pytest.mark.parametrize( + "data,exp", + [ + (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")), + (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])), + ], +) def test_str(data, exp, transform_assert_equal): transform, assert_equal = transform_assert_equal result = to_numeric(transform(data)) @@ -386,11 +400,14 @@ def test_period(transform_assert_equal): pytest.skip("Missing PeriodDtype support in to_numeric") -@pytest.mark.parametrize("errors,expected", [ - ("raise", "Invalid object type at position 0"), - ("ignore", Series([[10.0, 2], 1.0, "apple"])), - ("coerce", Series([np.nan, 1.0, np.nan])) -]) +@pytest.mark.parametrize( + "errors,expected", + [ + ("raise", "Invalid object type at position 0"), + ("ignore", Series([[10.0, 2], 1.0, "apple"])), + ("coerce", Series([np.nan, 1.0, np.nan])), + ], +) def test_non_hashable(errors, expected): # see gh-13324 ser = Series([[10.0, 2], 1.0, "apple"]) @@ -423,23 +440,26 @@ def test_errors_invalid_value(): to_numeric(data, errors=invalid_error_value) -@pytest.mark.parametrize("data", [ - ["1", 2, 3], - [1, 2, 3], - np.array(["1970-01-02", "1970-01-03", - "1970-01-04"], dtype="datetime64[D]") -]) -@pytest.mark.parametrize("kwargs,exp_dtype", [ - # Basic function tests. - (dict(), np.int64), - (dict(downcast=None), np.int64), - - # Support below np.float32 is rare and far between. - (dict(downcast="float"), np.dtype(np.float32).char), - - # Basic dtype support. - (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])) -]) +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) +@pytest.mark.parametrize( + "kwargs,exp_dtype", + [ + # Basic function tests. + (dict(), np.int64), + (dict(downcast=None), np.int64), + # Support below np.float32 is rare and far between. + (dict(downcast="float"), np.dtype(np.float32).char), + # Basic dtype support. 
+ (dict(downcast="unsigned"), np.dtype(np.typecodes["UnsignedInteger"][0])), + ], +) def test_downcast_basic(data, kwargs, exp_dtype): # see gh-13352 result = to_numeric(data, **kwargs) @@ -448,12 +468,14 @@ def test_downcast_basic(data, kwargs, exp_dtype): @pytest.mark.parametrize("signed_downcast", ["integer", "signed"]) -@pytest.mark.parametrize("data", [ - ["1", 2, 3], - [1, 2, 3], - np.array(["1970-01-02", "1970-01-03", - "1970-01-04"], dtype="datetime64[D]") -]) +@pytest.mark.parametrize( + "data", + [ + ["1", 2, 3], + [1, 2, 3], + np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"), + ], +) def test_signed_downcast(data, signed_downcast): # see gh-13352 smallest_int_dtype = np.dtype(np.typecodes["Integer"][0]) @@ -470,8 +492,7 @@ def test_ignore_downcast_invalid_data(): data = ["foo", 2, 3] expected = np.array(data, dtype=object) - res = to_numeric(data, errors="ignore", - downcast="unsigned") + res = to_numeric(data, errors="ignore", downcast="unsigned") tm.assert_numpy_array_equal(res, expected) @@ -486,13 +507,18 @@ def test_ignore_downcast_neg_to_unsigned(): @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) -@pytest.mark.parametrize("data,expected", [ - (["1.1", 2, 3], - np.array([1.1, 2, 3], dtype=np.float64)), - ([10000.0, 20000, 3000, 40000.36, 50000, 50000.00], - np.array([10000.0, 20000, 3000, - 40000.36, 50000, 50000.00], dtype=np.float64)) -]) +@pytest.mark.parametrize( + "data,expected", + [ + (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)), + ( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], + np.array( + [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64 + ), + ), + ], +) def test_ignore_downcast_cannot_convert_float(data, expected, downcast): # Cannot cast to an integer (signed or unsigned) # because we have a float number. 
@@ -500,11 +526,10 @@ def test_ignore_downcast_cannot_convert_float(data, expected, downcast): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("downcast,expected_dtype", [ - ("integer", np.int16), - ("signed", np.int16), - ("unsigned", np.uint16) -]) +@pytest.mark.parametrize( + "downcast,expected_dtype", + [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)], +) def test_downcast_not8bit(downcast, expected_dtype): # the smallest integer dtype need not be np.(u)int8 data = ["256", 257, 258] @@ -514,54 +539,47 @@ def test_downcast_not8bit(downcast, expected_dtype): tm.assert_numpy_array_equal(res, expected) -@pytest.mark.parametrize("dtype,downcast,min_max", [ - ("int8", "integer", [iinfo(np.int8).min, - iinfo(np.int8).max]), - ("int16", "integer", [iinfo(np.int16).min, - iinfo(np.int16).max]), - ("int32", "integer", [iinfo(np.int32).min, - iinfo(np.int32).max]), - ("int64", "integer", [iinfo(np.int64).min, - iinfo(np.int64).max]), - ("uint8", "unsigned", [iinfo(np.uint8).min, - iinfo(np.uint8).max]), - ("uint16", "unsigned", [iinfo(np.uint16).min, - iinfo(np.uint16).max]), - ("uint32", "unsigned", [iinfo(np.uint32).min, - iinfo(np.uint32).max]), - ("uint64", "unsigned", [iinfo(np.uint64).min, - iinfo(np.uint64).max]), - ("int16", "integer", [iinfo(np.int8).min, - iinfo(np.int8).max + 1]), - ("int32", "integer", [iinfo(np.int16).min, - iinfo(np.int16).max + 1]), - ("int64", "integer", [iinfo(np.int32).min, - iinfo(np.int32).max + 1]), - ("int16", "integer", [iinfo(np.int8).min - 1, - iinfo(np.int16).max]), - ("int32", "integer", [iinfo(np.int16).min - 1, - iinfo(np.int32).max]), - ("int64", "integer", [iinfo(np.int32).min - 1, - iinfo(np.int64).max]), - ("uint16", "unsigned", [iinfo(np.uint8).min, - iinfo(np.uint8).max + 1]), - ("uint32", "unsigned", [iinfo(np.uint16).min, - iinfo(np.uint16).max + 1]), - ("uint64", "unsigned", [iinfo(np.uint32).min, - iinfo(np.uint32).max + 1]) -]) +@pytest.mark.parametrize( + "dtype,downcast,min_max", + [ + ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]), + ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]), + ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]), + ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]), + ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]), + ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]), + ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]), + ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]), + ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]), + ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]), + ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]), + ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]), + ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]), + ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]), + ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]), + ], +) def test_downcast_limits(dtype, downcast, min_max): # see gh-14404: test the limits of each downcast. 
series = to_numeric(Series(min_max), downcast=downcast) assert series.dtype == dtype -@pytest.mark.parametrize("data,exp_data", [ - ([200, 300, "", "NaN", 30000000000000000000], - [200, 300, np.nan, np.nan, 30000000000000000000]), - (["12345678901234567890", "1234567890", "ITEM"], - [12345678901234567890, 1234567890, np.nan]) -]) +@pytest.mark.parametrize( + "data,exp_data", + [ + ( + [200, 300, "", "NaN", 30000000000000000000], + [200, 300, np.nan, np.nan, 30000000000000000000], + ), + ( + ["12345678901234567890", "1234567890", "ITEM"], + [12345678901234567890, 1234567890, np.nan], + ), + ], +) def test_coerce_uint64_conflict(data, exp_data): # see gh-17007 and gh-17125 # @@ -572,10 +590,13 @@ def test_coerce_uint64_conflict(data, exp_data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("errors,exp", [ - ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), - ("raise", "Unable to parse string") -]) +@pytest.mark.parametrize( + "errors,exp", + [ + ("ignore", Series(["12345678901234567890", "1234567890", "ITEM"])), + ("raise", "Unable to parse string"), + ], +) def test_non_coerce_uint64_conflict(errors, exp): # see gh-17007 and gh-17125 # diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 7de1e8117289e..be07f829dbae8 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -2,7 +2,11 @@ from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.frequencies import ( - FreqGroup, _period_code_map, get_freq, get_freq_code) + FreqGroup, + _period_code_map, + get_freq, + get_freq_code, +) import pandas.tseries.offsets as offsets @@ -12,11 +16,20 @@ def period_code_item(request): return request.param -@pytest.mark.parametrize("freqstr,expected", [ - ("A", 1000), ("3A", 1000), ("-1A", 1000), - ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), - ("W", 4000), ("W-MON", 4001), ("W-FRI", 4005) -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("W", 4000), + ("W-MON", 4001), + ("W-FRI", 4005), + ], +) def test_freq_code(freqstr, expected): assert get_freq(freqstr) == expected @@ -26,15 +39,31 @@ def test_freq_code_match(period_code_item): assert get_freq(freqstr) == code -@pytest.mark.parametrize("freqstr,expected", [ - ("A", 1000), ("3A", 1000), ("-1A", 1000), ("A-JAN", 1000), - ("A-MAY", 1000), ("Y", 1000), ("3Y", 1000), ("-1Y", 1000), - ("Y-JAN", 1000), ("Y-MAY", 1000), (offsets.YearEnd(), 1000), - (offsets.YearEnd(month=1), 1000), (offsets.YearEnd(month=5), 1000), - ("W", 4000), ("W-MON", 4000), ("W-FRI", 4000), (offsets.Week(), 4000), - (offsets.Week(weekday=1), 4000), (offsets.Week(weekday=5), 4000), - ("T", FreqGroup.FR_MIN), -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", 1000), + ("3A", 1000), + ("-1A", 1000), + ("A-JAN", 1000), + ("A-MAY", 1000), + ("Y", 1000), + ("3Y", 1000), + ("-1Y", 1000), + ("Y-JAN", 1000), + ("Y-MAY", 1000), + (offsets.YearEnd(), 1000), + (offsets.YearEnd(month=1), 1000), + (offsets.YearEnd(month=5), 1000), + ("W", 4000), + ("W-MON", 4000), + ("W-FRI", 4000), + (offsets.Week(), 4000), + (offsets.Week(weekday=1), 4000), + (offsets.Week(weekday=5), 4000), + ("T", FreqGroup.FR_MIN), + ], +) def test_freq_group(freqstr, expected): assert resolution.get_freq_group(freqstr) == expected @@ -48,10 +77,10 @@ def test_freq_group_match(period_code_item): 
assert str_group == code_group == code // 1000 * 1000 -@pytest.mark.parametrize("freqstr,exp_freqstr", [ - ("D", "D"), ("W", "D"), ("M", "D"), - ("S", "S"), ("T", "S"), ("H", "S") -]) +@pytest.mark.parametrize( + "freqstr,exp_freqstr", + [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], +) def test_get_to_timestamp_base(freqstr, exp_freqstr): tsb = libfrequencies.get_to_timestamp_base @@ -61,18 +90,26 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): _reso = resolution.Resolution -@pytest.mark.parametrize("freqstr,expected", [ - ("A", "year"), ("Q", "quarter"), ("M", "month"), - ("D", "day"), ("H", "hour"), ("T", "minute"), - ("S", "second"), ("L", "millisecond"), - ("U", "microsecond"), ("N", "nanosecond") -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("A", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("H", "hour"), + ("T", "minute"), + ("S", "second"), + ("L", "millisecond"), + ("U", "microsecond"), + ("N", "nanosecond"), + ], +) def test_get_str_from_freq(freqstr, expected): assert _reso.get_str_from_freq(freqstr) == expected -@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", - "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize("freq", ["A", "Q", "M", "D", "H", "T", "S", "L", "U", "N"]) def test_get_freq_roundtrip(freq): result = _reso.get_freq(_reso.get_str_from_freq(freq)) assert freq == result @@ -84,22 +121,30 @@ def test_get_freq_roundtrip2(freq): assert freq == result -@pytest.mark.parametrize("args,expected", [ - ((1.5, "T"), (90, "S")), ((62.4, "T"), (3744, "S")), - ((1.04, "H"), (3744, "S")), ((1, "D"), (1, "D")), - ((0.342931, "H"), (1234551600, "U")), ((1.2345, "D"), (106660800, "L")) -]) +@pytest.mark.parametrize( + "args,expected", + [ + ((1.5, "T"), (90, "S")), + ((62.4, "T"), (3744, "S")), + ((1.04, "H"), (3744, "S")), + ((1, "D"), (1, "D")), + ((0.342931, "H"), (1234551600, "U")), + ((1.2345, "D"), (106660800, "L")), + ], +) def test_resolution_bumping(args, expected): # see gh-14378 assert _reso.get_stride_from_decimal(*args) == expected -@pytest.mark.parametrize("args", [ - (0.5, "N"), - - # Too much precision in the input can prevent. - (0.3429324798798269273987982, "H") -]) +@pytest.mark.parametrize( + "args", + [ + (0.5, "N"), + # Too much precision in the input can prevent. + (0.3429324798798269273987982, "H"), + ], +) def test_cat(args): msg = "Could not convert to integer offset at any resolution" @@ -107,38 +152,37 @@ def test_cat(args): _reso.get_stride_from_decimal(*args) -@pytest.mark.parametrize("freq_input,expected", [ - # Frequency string. - ("A", (get_freq("A"), 1)), - ("3D", (get_freq("D"), 3)), - ("-2M", (get_freq("M"), -2)), - - # Tuple. - (("D", 1), (get_freq("D"), 1)), - (("A", 3), (get_freq("A"), 3)), - (("M", -2), (get_freq("M"), -2)), - ((5, "T"), (FreqGroup.FR_MIN, 5)), - - # Numeric Tuple. - ((1000, 1), (1000, 1)), - - # Offsets. - (offsets.Day(), (get_freq("D"), 1)), - (offsets.Day(3), (get_freq("D"), 3)), - (offsets.Day(-2), (get_freq("D"), -2)), - (offsets.MonthEnd(), (get_freq("M"), 1)), - (offsets.MonthEnd(3), (get_freq("M"), 3)), - (offsets.MonthEnd(-2), (get_freq("M"), -2)), - (offsets.Week(), (get_freq("W"), 1)), - (offsets.Week(3), (get_freq("W"), 3)), - (offsets.Week(-2), (get_freq("W"), -2)), - (offsets.Hour(), (FreqGroup.FR_HR, 1)), - - # Monday is weekday=0. 
- (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), - (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), - (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), -]) +@pytest.mark.parametrize( + "freq_input,expected", + [ + # Frequency string. + ("A", (get_freq("A"), 1)), + ("3D", (get_freq("D"), 3)), + ("-2M", (get_freq("M"), -2)), + # Tuple. + (("D", 1), (get_freq("D"), 1)), + (("A", 3), (get_freq("A"), 3)), + (("M", -2), (get_freq("M"), -2)), + ((5, "T"), (FreqGroup.FR_MIN, 5)), + # Numeric Tuple. + ((1000, 1), (1000, 1)), + # Offsets. + (offsets.Day(), (get_freq("D"), 1)), + (offsets.Day(3), (get_freq("D"), 3)), + (offsets.Day(-2), (get_freq("D"), -2)), + (offsets.MonthEnd(), (get_freq("M"), 1)), + (offsets.MonthEnd(3), (get_freq("M"), 3)), + (offsets.MonthEnd(-2), (get_freq("M"), -2)), + (offsets.Week(), (get_freq("W"), 1)), + (offsets.Week(3), (get_freq("W"), 3)), + (offsets.Week(-2), (get_freq("W"), -2)), + (offsets.Hour(), (FreqGroup.FR_HR, 1)), + # Monday is weekday=0. + (offsets.Week(weekday=1), (get_freq("W-TUE"), 1)), + (offsets.Week(3, weekday=0), (get_freq("W-MON"), 3)), + (offsets.Week(-2, weekday=4), (get_freq("W-FRI"), -2)), + ], +) def test_get_freq_code(freq_input, expected): assert get_freq_code(freq_input) == expected diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index fb65ec1eb9961..4c8f6253cdf7b 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -7,8 +7,7 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas.compat import is_platform_windows -from pandas import ( - DatetimeIndex, Index, Series, Timestamp, date_range, period_range) +from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range from pandas.core.tools.datetimes import to_datetime import pandas.util.testing as tm @@ -39,21 +38,38 @@ def _check_generated_range(start, periods, freq): else: inf_freq = frequencies.infer_freq(index) is_dec_range = inf_freq == "Q-DEC" and gen.freqstr in ( - "Q", "Q-DEC", "Q-SEP", "Q-JUN", "Q-MAR") + "Q", + "Q-DEC", + "Q-SEP", + "Q-JUN", + "Q-MAR", + ) is_nov_range = inf_freq == "Q-NOV" and gen.freqstr in ( - "Q-NOV", "Q-AUG", "Q-MAY", "Q-FEB") + "Q-NOV", + "Q-AUG", + "Q-MAY", + "Q-FEB", + ) is_oct_range = inf_freq == "Q-OCT" and gen.freqstr in ( - "Q-OCT", "Q-JUL", "Q-APR", "Q-JAN") + "Q-OCT", + "Q-JUL", + "Q-APR", + "Q-JAN", + ) assert is_dec_range or is_nov_range or is_oct_range -@pytest.fixture(params=[(timedelta(1), "D"), - (timedelta(hours=1), "H"), - (timedelta(minutes=1), "T"), - (timedelta(seconds=1), "S"), - (np.timedelta64(1, "ns"), "N"), - (timedelta(microseconds=1), "U"), - (timedelta(microseconds=1000), "L")]) +@pytest.fixture( + params=[ + (timedelta(1), "D"), + (timedelta(hours=1), "H"), + (timedelta(minutes=1), "T"), + (timedelta(seconds=1), "S"), + (np.timedelta64(1, "ns"), "N"), + (timedelta(microseconds=1), "U"), + (timedelta(microseconds=1000), "L"), + ] +) def base_delta_code_pair(request): return request.param @@ -128,8 +144,7 @@ def test_fifth_week_of_month_infer(): def test_week_of_month_fake(): # All of these dates are on same day # of week and are 4 or 5 weeks apart. 
- index = DatetimeIndex(["2013-08-27", "2013-10-01", - "2013-10-29", "2013-11-26"]) + index = DatetimeIndex(["2013-08-27", "2013-10-01", "2013-10-29", "2013-11-26"]) assert frequencies.infer_freq(index) != "WOM-4TUE" @@ -137,8 +152,10 @@ def test_fifth_week_of_month(): # see gh-9425 # # Only supports freq up to WOM-4. - msg = ("Of the four parameters: start, end, periods, " - "and freq, exactly three must be specified") + msg = ( + "Of the four parameters: start, end, periods, " + "and freq, exactly three must be specified" + ) with pytest.raises(ValueError, match=msg): date_range("2014-01-01", freq="WOM-5MON") @@ -165,12 +182,17 @@ def test_infer_freq_delta(base_delta_code_pair, count): assert frequencies.infer_freq(index) == exp_freq -@pytest.mark.parametrize("constructor", [ - lambda now, delta: DatetimeIndex([now + delta * 7] + - [now + delta * j for j in range(3)]), - lambda now, delta: DatetimeIndex([now + delta * j for j in range(3)] + - [now + delta * 7]) -]) +@pytest.mark.parametrize( + "constructor", + [ + lambda now, delta: DatetimeIndex( + [now + delta * 7] + [now + delta * j for j in range(3)] + ), + lambda now, delta: DatetimeIndex( + [now + delta * j for j in range(3)] + [now + delta * 7] + ), + ], +) def test_infer_freq_custom(base_delta_code_pair, constructor): b = Timestamp(datetime.now()) base_delta, _ = base_delta_code_pair @@ -184,8 +206,9 @@ def test_weekly_infer(periods, day): def test_week_of_month_infer(periods, day, count): - _check_generated_range("1/1/2000", periods, - "WOM-{count}{day}".format(count=count, day=day)) + _check_generated_range( + "1/1/2000", periods, "WOM-{count}{day}".format(count=count, day=day) + ) @pytest.mark.parametrize("freq", ["M", "BM", "BMS"]) @@ -194,20 +217,19 @@ def test_monthly_infer(periods, freq): def test_quarterly_infer(month, periods): - _check_generated_range("1/1/2000", periods, - "Q-{month}".format(month=month)) + _check_generated_range("1/1/2000", periods, "Q-{month}".format(month=month)) @pytest.mark.parametrize("annual", ["A", "BA"]) def test_annually_infer(month, periods, annual): - _check_generated_range("1/1/2000", periods, - "{annual}-{month}".format(annual=annual, - month=month)) + _check_generated_range( + "1/1/2000", periods, "{annual}-{month}".format(annual=annual, month=month) + ) -@pytest.mark.parametrize("freq,expected", [ - ("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT") -]) +@pytest.mark.parametrize( + "freq,expected", [("Q", "Q-DEC"), ("Q-NOV", "Q-NOV"), ("Q-OCT", "Q-OCT")] +) def test_infer_freq_index(freq, expected): rng = period_range("1959Q2", "2009Q3", freq=freq) rng = Index(rng.to_timestamp("D", how="e").astype(object)) @@ -218,13 +240,20 @@ def test_infer_freq_index(freq, expected): @pytest.mark.parametrize( "expected,dates", list( - {"AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], - "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], - "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], - "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], - "D": ["2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04"], - "H": ["2011-12-31 22:00", "2011-12-31 23:00", - "2012-01-01 00:00", "2012-01-01 01:00"]}.items()) + { + "AS-JAN": ["2009-01-01", "2010-01-01", "2011-01-01", "2012-01-01"], + "Q-OCT": ["2009-01-31", "2009-04-30", "2009-07-31", "2009-10-31"], + "M": ["2010-11-30", "2010-12-31", "2011-01-31", "2011-02-28"], + "W-SAT": ["2010-12-25", "2011-01-01", "2011-01-08", "2011-01-15"], + "D": ["2011-01-01", "2011-01-02", "2011-01-03", 
"2011-01-04"], + "H": [ + "2011-12-31 22:00", + "2011-12-31 23:00", + "2012-01-01 00:00", + "2012-01-01 01:00", + ], + }.items() + ), ) def test_infer_freq_tz(tz_naive_fixture, expected, dates): # see gh-7310 @@ -233,14 +262,17 @@ def test_infer_freq_tz(tz_naive_fixture, expected, dates): assert idx.inferred_freq == expected -@pytest.mark.parametrize("date_pair", [ - ["2013-11-02", "2013-11-5"], # Fall DST - ["2014-03-08", "2014-03-11"], # Spring DST - ["2014-01-01", "2014-01-03"] # Regular Time -]) -@pytest.mark.parametrize("freq", [ - "3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N" -]) +@pytest.mark.parametrize( + "date_pair", + [ + ["2013-11-02", "2013-11-5"], # Fall DST + ["2014-03-08", "2014-03-11"], # Spring DST + ["2014-01-01", "2014-01-03"], # Regular Time + ], +) +@pytest.mark.parametrize( + "freq", ["3H", "10T", "3601S", "3600001L", "3600000001U", "3600000000001N"] +) def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): # see gh-8772 tz = tz_naive_fixture @@ -249,33 +281,90 @@ def test_infer_freq_tz_transition(tz_naive_fixture, date_pair, freq): def test_infer_freq_tz_transition_custom(): - index = date_range("2013-11-03", periods=5, - freq="3H").tz_localize("America/Chicago") + index = date_range("2013-11-03", periods=5, freq="3H").tz_localize( + "America/Chicago" + ) assert index.inferred_freq is None -@pytest.mark.parametrize("data,expected", [ - # Hourly freq in a day must result in "H" - (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", - "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00"], "H"), - - (["2014-07-01 09:00", "2014-07-01 10:00", "2014-07-01 11:00", - "2014-07-01 12:00", "2014-07-01 13:00", "2014-07-01 14:00", - "2014-07-01 15:00", "2014-07-01 16:00", "2014-07-02 09:00", - "2014-07-02 10:00", "2014-07-02 11:00"], "BH"), - (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", - "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", - "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", - "2014-07-07 10:00", "2014-07-07 11:00"], "BH"), - (["2014-07-04 09:00", "2014-07-04 10:00", "2014-07-04 11:00", - "2014-07-04 12:00", "2014-07-04 13:00", "2014-07-04 14:00", - "2014-07-04 15:00", "2014-07-04 16:00", "2014-07-07 09:00", - "2014-07-07 10:00", "2014-07-07 11:00", "2014-07-07 12:00", - "2014-07-07 13:00", "2014-07-07 14:00", "2014-07-07 15:00", - "2014-07-07 16:00", "2014-07-08 09:00", "2014-07-08 10:00", - "2014-07-08 11:00", "2014-07-08 12:00", "2014-07-08 13:00", - "2014-07-08 14:00", "2014-07-08 15:00", "2014-07-08 16:00"], "BH"), -]) +@pytest.mark.parametrize( + "data,expected", + [ + # Hourly freq in a day must result in "H" + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + ], + "H", + ), + ( + [ + "2014-07-01 09:00", + "2014-07-01 10:00", + "2014-07-01 11:00", + "2014-07-01 12:00", + "2014-07-01 13:00", + "2014-07-01 14:00", + "2014-07-01 15:00", + "2014-07-01 16:00", + "2014-07-02 09:00", + "2014-07-02 10:00", + "2014-07-02 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + ], + "BH", + ), + ( + [ + "2014-07-04 09:00", + "2014-07-04 10:00", + "2014-07-04 11:00", + "2014-07-04 12:00", + "2014-07-04 13:00", + "2014-07-04 14:00", + "2014-07-04 15:00", + "2014-07-04 16:00", + 
"2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + "2014-07-08 11:00", + "2014-07-08 12:00", + "2014-07-08 13:00", + "2014-07-08 14:00", + "2014-07-08 15:00", + "2014-07-08 16:00", + ], + "BH", + ), + ], +) def test_infer_freq_business_hour(data, expected): # see gh-7905 idx = DatetimeIndex(data) @@ -297,21 +386,21 @@ def test_non_datetime_index2(): assert result == rng.inferred_freq -@pytest.mark.parametrize("idx", [ - tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10) -]) +@pytest.mark.parametrize( + "idx", [tm.makeIntIndex(10), tm.makeFloatIndex(10), tm.makePeriodIndex(10)] +) def test_invalid_index_types(idx): - msg = ("(cannot infer freq from a non-convertible)|" - "(Check the `freq` attribute instead of using infer_freq)") + msg = ( + "(cannot infer freq from a non-convertible)|" + "(Check the `freq` attribute instead of using infer_freq)" + ) with pytest.raises(TypeError, match=msg): frequencies.infer_freq(idx) -@pytest.mark.skipif(is_platform_windows(), - reason="see gh-10822: Windows issue") -@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), - tm.makeUnicodeIndex(10)]) +@pytest.mark.skipif(is_platform_windows(), reason="see gh-10822: Windows issue") +@pytest.mark.parametrize("idx", [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]) def test_invalid_index_types_unicode(idx): # see gh-10822 # @@ -339,7 +428,7 @@ def test_series(): assert inferred == "D" -@pytest.mark.parametrize("end", [10, 10.]) +@pytest.mark.parametrize("end", [10, 10.0]) def test_series_invalid_type(end): # see gh-6407 msg = "cannot infer freq from a non-convertible dtype on a Series" @@ -376,22 +465,63 @@ def test_series_datetime_index(freq): assert inferred == freq -@pytest.mark.parametrize("offset_func", [ - frequencies.get_offset, - lambda freq: date_range("2011-01-01", periods=5, freq=freq) -]) -@pytest.mark.parametrize("freq", [ - "WEEKDAY", "EOM", "W@MON", "W@TUE", "W@WED", "W@THU", - "W@FRI", "W@SAT", "W@SUN", "Q@JAN", "Q@FEB", "Q@MAR", - "A@JAN", "A@FEB", "A@MAR", "A@APR", "A@MAY", "A@JUN", - "A@JUL", "A@AUG", "A@SEP", "A@OCT", "A@NOV", "A@DEC", - "Y@JAN", "WOM@1MON", "WOM@2MON", "WOM@3MON", - "WOM@4MON", "WOM@1TUE", "WOM@2TUE", "WOM@3TUE", - "WOM@4TUE", "WOM@1WED", "WOM@2WED", "WOM@3WED", - "WOM@4WED", "WOM@1THU", "WOM@2THU", "WOM@3THU", - "WOM@4THU", "WOM@1FRI", "WOM@2FRI", "WOM@3FRI", - "WOM@4FRI" -]) +@pytest.mark.parametrize( + "offset_func", + [ + frequencies.get_offset, + lambda freq: date_range("2011-01-01", periods=5, freq=freq), + ], +) +@pytest.mark.parametrize( + "freq", + [ + "WEEKDAY", + "EOM", + "W@MON", + "W@TUE", + "W@WED", + "W@THU", + "W@FRI", + "W@SAT", + "W@SUN", + "Q@JAN", + "Q@FEB", + "Q@MAR", + "A@JAN", + "A@FEB", + "A@MAR", + "A@APR", + "A@MAY", + "A@JUN", + "A@JUL", + "A@AUG", + "A@SEP", + "A@OCT", + "A@NOV", + "A@DEC", + "Y@JAN", + "WOM@1MON", + "WOM@2MON", + "WOM@3MON", + "WOM@4MON", + "WOM@1TUE", + "WOM@2TUE", + "WOM@3TUE", + "WOM@4TUE", + "WOM@1WED", + "WOM@2WED", + "WOM@3WED", + "WOM@4WED", + "WOM@1THU", + "WOM@2THU", + "WOM@3THU", + "WOM@4THU", + "WOM@1FRI", + "WOM@2FRI", + "WOM@3FRI", + "WOM@4FRI", + ], +) def test_legacy_offset_warnings(offset_func, freq): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): offset_func(freq) diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tseries/frequencies/test_to_offset.py index c9c35b47f3475..b6069c446160d 
100644 --- a/pandas/tests/tseries/frequencies/test_to_offset.py +++ b/pandas/tests/tseries/frequencies/test_to_offset.py @@ -8,54 +8,79 @@ import pandas.tseries.offsets as offsets -@pytest.mark.parametrize("freq_input,expected", [ - (frequencies.to_offset("10us"), offsets.Micro(10)), - (offsets.Hour(), offsets.Hour()), - ((5, "T"), offsets.Minute(5)), - ("2h30min", offsets.Minute(150)), - ("2h 30min", offsets.Minute(150)), - ("2h30min15s", offsets.Second(150 * 60 + 15)), - ("2h 60min", offsets.Hour(3)), - ("2h 20.5min", offsets.Second(8430)), - ("1.5min", offsets.Second(90)), - ("0.5S", offsets.Milli(500)), - ("15l500u", offsets.Micro(15500)), - ("10s75L", offsets.Milli(10075)), - ("1s0.25ms", offsets.Micro(1000250)), - ("1s0.25L", offsets.Micro(1000250)), - ("2800N", offsets.Nano(2800)), - ("2SM", offsets.SemiMonthEnd(2)), - ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), - ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), - ("2SMS-15", offsets.SemiMonthBegin(2)), -]) +@pytest.mark.parametrize( + "freq_input,expected", + [ + (frequencies.to_offset("10us"), offsets.Micro(10)), + (offsets.Hour(), offsets.Hour()), + ((5, "T"), offsets.Minute(5)), + ("2h30min", offsets.Minute(150)), + ("2h 30min", offsets.Minute(150)), + ("2h30min15s", offsets.Second(150 * 60 + 15)), + ("2h 60min", offsets.Hour(3)), + ("2h 20.5min", offsets.Second(8430)), + ("1.5min", offsets.Second(90)), + ("0.5S", offsets.Milli(500)), + ("15l500u", offsets.Micro(15500)), + ("10s75L", offsets.Milli(10075)), + ("1s0.25ms", offsets.Micro(1000250)), + ("1s0.25L", offsets.Micro(1000250)), + ("2800N", offsets.Nano(2800)), + ("2SM", offsets.SemiMonthEnd(2)), + ("2SM-16", offsets.SemiMonthEnd(2, day_of_month=16)), + ("2SMS-14", offsets.SemiMonthBegin(2, day_of_month=14)), + ("2SMS-15", offsets.SemiMonthBegin(2)), + ], +) def test_to_offset(freq_input, expected): result = frequencies.to_offset(freq_input) assert result == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("-1S", -1), - ("-2SM", -2), - ("-1SMS", -1), - ("-5min10s", -310), -]) +@pytest.mark.parametrize( + "freqstr,expected", [("-1S", -1), ("-2SM", -2), ("-1SMS", -1), ("-5min10s", -310)] +) def test_to_offset_negative(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("freqstr", [ - "2h20m", "U1", "-U", "3U1", "-2-3U", "-2D:3H", - "1.5.0S", "2SMS-15-15", "2SMS-15D", "100foo", - - # Invalid leading +/- signs. - "+-1d", "-+1h", "+1", "-7", "+d", "-m", - - # Invalid shortcut anchors. - "SM-0", "SM-28", "SM-29", "SM-FOO", "BSM", "SM--1", "SMS-1", - "SMS-28", "SMS-30", "SMS-BAR", "SMS-BYR", "BSMS", "SMS--2" -]) +@pytest.mark.parametrize( + "freqstr", + [ + "2h20m", + "U1", + "-U", + "3U1", + "-2-3U", + "-2D:3H", + "1.5.0S", + "2SMS-15-15", + "2SMS-15D", + "100foo", + # Invalid leading +/- signs. + "+-1d", + "-+1h", + "+1", + "-7", + "+d", + "-m", + # Invalid shortcut anchors. 
+ "SM-0", + "SM-28", + "SM-29", + "SM-FOO", + "BSM", + "SM--1", + "SMS-1", + "SMS-28", + "SMS-30", + "SMS-BAR", + "SMS-BYR", + "BSMS", + "SMS--2", + ], +) def test_to_offset_invalid(freqstr): # see gh-13930 @@ -71,47 +96,49 @@ def test_to_offset_no_evaluate(): frequencies.to_offset(("", "")) -@pytest.mark.parametrize("freqstr,expected", [ - ("2D 3H", offsets.Hour(51)), - ("2 D3 H", offsets.Hour(51)), - ("2 D 3 H", offsets.Hour(51)), - (" 2 D 3 H ", offsets.Hour(51)), - (" H ", offsets.Hour()), - (" 3 H ", offsets.Hour(3)), -]) +@pytest.mark.parametrize( + "freqstr,expected", + [ + ("2D 3H", offsets.Hour(51)), + ("2 D3 H", offsets.Hour(51)), + ("2 D 3 H", offsets.Hour(51)), + (" 2 D 3 H ", offsets.Hour(51)), + (" H ", offsets.Hour()), + (" 3 H ", offsets.Hour(3)), + ], +) def test_to_offset_whitespace(freqstr, expected): result = frequencies.to_offset(freqstr) assert result == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("00H 00T 01S", 1), - ("-00H 03T 14S", -194), -]) +@pytest.mark.parametrize( + "freqstr,expected", [("00H 00T 01S", 1), ("-00H 03T 14S", -194)] +) def test_to_offset_leading_zero(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("freqstr,expected", [ - ("+1d", 1), - ("+2h30min", 150), -]) +@pytest.mark.parametrize("freqstr,expected", [("+1d", 1), ("+2h30min", 150)]) def test_to_offset_leading_plus(freqstr, expected): result = frequencies.to_offset(freqstr) assert result.n == expected -@pytest.mark.parametrize("kwargs,expected", [ - (dict(days=1, seconds=1), offsets.Second(86401)), - (dict(days=-1, seconds=1), offsets.Second(-86399)), - (dict(hours=1, minutes=10), offsets.Minute(70)), - (dict(hours=1, minutes=-10), offsets.Minute(50)), - (dict(weeks=1), offsets.Day(7)), - (dict(hours=1), offsets.Hour(1)), - (dict(hours=1), frequencies.to_offset("60min")), - (dict(microseconds=1), offsets.Micro(1)) -]) +@pytest.mark.parametrize( + "kwargs,expected", + [ + (dict(days=1, seconds=1), offsets.Second(86401)), + (dict(days=-1, seconds=1), offsets.Second(-86399)), + (dict(hours=1, minutes=10), offsets.Minute(70)), + (dict(hours=1, minutes=-10), offsets.Minute(50)), + (dict(weeks=1), offsets.Day(7)), + (dict(hours=1), offsets.Hour(1)), + (dict(hours=1), frequencies.to_offset("60min")), + (dict(microseconds=1), offsets.Micro(1)), + ], +) def test_to_offset_pd_timedelta(kwargs, expected): # see gh-9064 td = Timedelta(**kwargs) @@ -128,19 +155,22 @@ def test_to_offset_pd_timedelta_invalid(): frequencies.to_offset(td) -@pytest.mark.parametrize("shortcut,expected", [ - ("W", offsets.Week(weekday=6)), - ("W-SUN", offsets.Week(weekday=6)), - ("Q", offsets.QuarterEnd(startingMonth=12)), - ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), - ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), - ("SM", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), - ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), - ("SM-27", offsets.SemiMonthEnd(day_of_month=27)), - ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), - ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), -]) +@pytest.mark.parametrize( + "shortcut,expected", + [ + ("W", offsets.Week(weekday=6)), + ("W-SUN", offsets.Week(weekday=6)), + ("Q", offsets.QuarterEnd(startingMonth=12)), + ("Q-DEC", offsets.QuarterEnd(startingMonth=12)), + ("Q-MAY", offsets.QuarterEnd(startingMonth=5)), + ("SM", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-15", offsets.SemiMonthEnd(day_of_month=15)), + ("SM-1", offsets.SemiMonthEnd(day_of_month=1)), + 
("SM-27", offsets.SemiMonthEnd(day_of_month=27)), + ("SMS-2", offsets.SemiMonthBegin(day_of_month=2)), + ("SMS-27", offsets.SemiMonthBegin(day_of_month=27)), + ], +) def test_anchored_shortcuts(shortcut, expected): result = frequencies.to_offset(shortcut) assert result == expected diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 407e83de5e6e9..79c28942769f0 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -6,15 +6,18 @@ import pandas.util.testing as tm from pandas.tseries.holiday import ( - AbstractHolidayCalendar, Holiday, Timestamp, USFederalHolidayCalendar, - USThanksgivingDay, get_calendar) - - -@pytest.mark.parametrize("transform", [ - lambda x: x, - lambda x: x.strftime("%Y-%m-%d"), - lambda x: Timestamp(x) -]) + AbstractHolidayCalendar, + Holiday, + Timestamp, + USFederalHolidayCalendar, + USThanksgivingDay, + get_calendar, +) + + +@pytest.mark.parametrize( + "transform", [lambda x: x, lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) def test_calendar(transform): start_date = datetime(2012, 1, 1) end_date = datetime(2012, 12, 31) @@ -32,7 +35,7 @@ def test_calendar(transform): datetime(2012, 10, 8), datetime(2012, 11, 12), datetime(2012, 11, 22), - datetime(2012, 12, 25) + datetime(2012, 12, 25), ] assert list(holidays.to_pydatetime()) == expected @@ -56,12 +59,15 @@ def __init__(self, name=None, rules=None): def test_calendar_observance_dates(): # see gh-11477 us_fed_cal = get_calendar("USFederalHolidayCalendar") - holidays0 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 3)) # <-- same start and end dates - holidays1 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 6)) # <-- different start and end dates - holidays2 = us_fed_cal.holidays(datetime(2015, 7, 3), datetime( - 2015, 7, 3)) # <-- same start and end dates + holidays0 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates + holidays1 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 6) + ) # <-- different start and end dates + holidays2 = us_fed_cal.holidays( + datetime(2015, 7, 3), datetime(2015, 7, 3) + ) # <-- same start and end dates # These should all produce the same result. # diff --git a/pandas/tests/tseries/holiday/test_federal.py b/pandas/tests/tseries/holiday/test_federal.py index 62b5ab2b849ae..64c60d4e365e6 100644 --- a/pandas/tests/tseries/holiday/test_federal.py +++ b/pandas/tests/tseries/holiday/test_federal.py @@ -1,7 +1,10 @@ from datetime import datetime from pandas.tseries.holiday import ( - AbstractHolidayCalendar, USMartinLutherKingJr, USMemorialDay) + AbstractHolidayCalendar, + USMartinLutherKingJr, + USMemorialDay, +) def test_no_mlk_before_1986(): @@ -9,28 +12,27 @@ def test_no_mlk_before_1986(): class MLKCalendar(AbstractHolidayCalendar): rules = [USMartinLutherKingJr] - holidays = MLKCalendar().holidays(start="1984", - end="1988").to_pydatetime().tolist() + holidays = MLKCalendar().holidays(start="1984", end="1988").to_pydatetime().tolist() # Testing to make sure holiday is not incorrectly observed before 1986. 
- assert holidays == [datetime(1986, 1, 20, 0, 0), - datetime(1987, 1, 19, 0, 0)] + assert holidays == [datetime(1986, 1, 20, 0, 0), datetime(1987, 1, 19, 0, 0)] def test_memorial_day(): class MemorialDay(AbstractHolidayCalendar): rules = [USMemorialDay] - holidays = MemorialDay().holidays(start="1971", - end="1980").to_pydatetime().tolist() + holidays = MemorialDay().holidays(start="1971", end="1980").to_pydatetime().tolist() # Fixes 5/31 error and checked manually against Wikipedia. - assert holidays == [datetime(1971, 5, 31, 0, 0), - datetime(1972, 5, 29, 0, 0), - datetime(1973, 5, 28, 0, 0), - datetime(1974, 5, 27, 0, 0), - datetime(1975, 5, 26, 0, 0), - datetime(1976, 5, 31, 0, 0), - datetime(1977, 5, 30, 0, 0), - datetime(1978, 5, 29, 0, 0), - datetime(1979, 5, 28, 0, 0)] + assert holidays == [ + datetime(1971, 5, 31, 0, 0), + datetime(1972, 5, 29, 0, 0), + datetime(1973, 5, 28, 0, 0), + datetime(1974, 5, 27, 0, 0), + datetime(1975, 5, 26, 0, 0), + datetime(1976, 5, 31, 0, 0), + datetime(1977, 5, 30, 0, 0), + datetime(1978, 5, 29, 0, 0), + datetime(1979, 5, 28, 0, 0), + ] diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 27bba1cc89dee..06869fcd7a4f8 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -6,10 +6,24 @@ import pandas.util.testing as tm from pandas.tseries.holiday import ( - MO, SA, AbstractHolidayCalendar, DateOffset, EasterMonday, GoodFriday, - Holiday, HolidayCalendarFactory, Timestamp, USColumbusDay, USLaborDay, - USMartinLutherKingJr, USMemorialDay, USPresidentsDay, USThanksgivingDay, - get_calendar, next_monday) + MO, + SA, + AbstractHolidayCalendar, + DateOffset, + EasterMonday, + GoodFriday, + Holiday, + HolidayCalendarFactory, + Timestamp, + USColumbusDay, + USLaborDay, + USMartinLutherKingJr, + USMemorialDay, + USPresidentsDay, + USThanksgivingDay, + get_calendar, + next_monday, +) def _check_holiday_results(holiday, start, end, expected): @@ -30,92 +44,141 @@ def _check_holiday_results(holiday, start, end, expected): assert list(holiday.dates(start, end)) == expected # Verify that timezone info is preserved. 
- assert (list(holiday.dates(utc.localize(Timestamp(start)), - utc.localize(Timestamp(end)))) == - [utc.localize(dt) for dt in expected]) - - -@pytest.mark.parametrize("holiday,start_date,end_date,expected", [ - (USMemorialDay, datetime(2011, 1, 1), datetime(2020, 12, 31), - [datetime(2011, 5, 30), datetime(2012, 5, 28), datetime(2013, 5, 27), - datetime(2014, 5, 26), datetime(2015, 5, 25), datetime(2016, 5, 30), - datetime(2017, 5, 29), datetime(2018, 5, 28), datetime(2019, 5, 27), - datetime(2020, 5, 25)]), - - (Holiday("July 4th Eve", month=7, day=3), "2001-01-01", "2003-03-03", - [Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00")]), - (Holiday("July 4th Eve", month=7, day=3, days_of_week=(0, 1, 2, 3)), - "2001-01-01", "2008-03-03", [ - Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00"), - Timestamp("2003-07-03 00:00:00"), Timestamp("2006-07-03 00:00:00"), - Timestamp("2007-07-03 00:00:00")]), - - (EasterMonday, datetime(2011, 1, 1), datetime(2020, 12, 31), - [Timestamp("2011-04-25 00:00:00"), Timestamp("2012-04-09 00:00:00"), - Timestamp("2013-04-01 00:00:00"), Timestamp("2014-04-21 00:00:00"), - Timestamp("2015-04-06 00:00:00"), Timestamp("2016-03-28 00:00:00"), - Timestamp("2017-04-17 00:00:00"), Timestamp("2018-04-02 00:00:00"), - Timestamp("2019-04-22 00:00:00"), Timestamp("2020-04-13 00:00:00")]), - (GoodFriday, datetime(2011, 1, 1), datetime(2020, 12, 31), - [Timestamp("2011-04-22 00:00:00"), Timestamp("2012-04-06 00:00:00"), - Timestamp("2013-03-29 00:00:00"), Timestamp("2014-04-18 00:00:00"), - Timestamp("2015-04-03 00:00:00"), Timestamp("2016-03-25 00:00:00"), - Timestamp("2017-04-14 00:00:00"), Timestamp("2018-03-30 00:00:00"), - Timestamp("2019-04-19 00:00:00"), Timestamp("2020-04-10 00:00:00")]), - - (USThanksgivingDay, datetime(2011, 1, 1), datetime(2020, 12, 31), - [datetime(2011, 11, 24), datetime(2012, 11, 22), datetime(2013, 11, 28), - datetime(2014, 11, 27), datetime(2015, 11, 26), datetime(2016, 11, 24), - datetime(2017, 11, 23), datetime(2018, 11, 22), datetime(2019, 11, 28), - datetime(2020, 11, 26)]) -]) + assert list( + holiday.dates(utc.localize(Timestamp(start)), utc.localize(Timestamp(end))) + ) == [utc.localize(dt) for dt in expected] + + +@pytest.mark.parametrize( + "holiday,start_date,end_date,expected", + [ + ( + USMemorialDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ], + ), + ( + Holiday("July 4th Eve", month=7, day=3), + "2001-01-01", + "2003-03-03", + [Timestamp("2001-07-03 00:00:00"), Timestamp("2002-07-03 00:00:00")], + ), + ( + Holiday("July 4th Eve", month=7, day=3, days_of_week=(0, 1, 2, 3)), + "2001-01-01", + "2008-03-03", + [ + Timestamp("2001-07-03 00:00:00"), + Timestamp("2002-07-03 00:00:00"), + Timestamp("2003-07-03 00:00:00"), + Timestamp("2006-07-03 00:00:00"), + Timestamp("2007-07-03 00:00:00"), + ], + ), + ( + EasterMonday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-25 00:00:00"), + Timestamp("2012-04-09 00:00:00"), + Timestamp("2013-04-01 00:00:00"), + Timestamp("2014-04-21 00:00:00"), + Timestamp("2015-04-06 00:00:00"), + Timestamp("2016-03-28 00:00:00"), + Timestamp("2017-04-17 00:00:00"), + Timestamp("2018-04-02 00:00:00"), + Timestamp("2019-04-22 00:00:00"), + Timestamp("2020-04-13 00:00:00"), + ], + ), + ( + 
GoodFriday, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + Timestamp("2011-04-22 00:00:00"), + Timestamp("2012-04-06 00:00:00"), + Timestamp("2013-03-29 00:00:00"), + Timestamp("2014-04-18 00:00:00"), + Timestamp("2015-04-03 00:00:00"), + Timestamp("2016-03-25 00:00:00"), + Timestamp("2017-04-14 00:00:00"), + Timestamp("2018-03-30 00:00:00"), + Timestamp("2019-04-19 00:00:00"), + Timestamp("2020-04-10 00:00:00"), + ], + ), + ( + USThanksgivingDay, + datetime(2011, 1, 1), + datetime(2020, 12, 31), + [ + datetime(2011, 11, 24), + datetime(2012, 11, 22), + datetime(2013, 11, 28), + datetime(2014, 11, 27), + datetime(2015, 11, 26), + datetime(2016, 11, 24), + datetime(2017, 11, 23), + datetime(2018, 11, 22), + datetime(2019, 11, 28), + datetime(2020, 11, 26), + ], + ), + ], +) def test_holiday_dates(holiday, start_date, end_date, expected): _check_holiday_results(holiday, start_date, end_date, expected) -@pytest.mark.parametrize("holiday,start,expected", [ - (USMemorialDay, datetime(2015, 7, 1), []), - (USMemorialDay, "2015-05-25", "2015-05-25"), - - (USLaborDay, datetime(2015, 7, 1), []), - (USLaborDay, "2015-09-07", "2015-09-07"), - - (USColumbusDay, datetime(2015, 7, 1), []), - (USColumbusDay, "2015-10-12", "2015-10-12"), - - (USThanksgivingDay, datetime(2015, 7, 1), []), - (USThanksgivingDay, "2015-11-26", "2015-11-26"), - - (USMartinLutherKingJr, datetime(2015, 7, 1), []), - (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), - - (USPresidentsDay, datetime(2015, 7, 1), []), - (USPresidentsDay, "2015-02-16", "2015-02-16"), - - (GoodFriday, datetime(2015, 7, 1), []), - (GoodFriday, "2015-04-03", "2015-04-03"), - - (EasterMonday, "2015-04-06", "2015-04-06"), - (EasterMonday, datetime(2015, 7, 1), []), - (EasterMonday, "2015-04-05", []), - - ("New Years Day", "2015-01-01", "2015-01-01"), - ("New Years Day", "2010-12-31", "2010-12-31"), - ("New Years Day", datetime(2015, 7, 1), []), - ("New Years Day", "2011-01-01", []), - - ("July 4th", "2015-07-03", "2015-07-03"), - ("July 4th", datetime(2015, 7, 1), []), - ("July 4th", "2015-07-04", []), - - ("Veterans Day", "2012-11-12", "2012-11-12"), - ("Veterans Day", datetime(2015, 7, 1), []), - ("Veterans Day", "2012-11-11", []), - - ("Christmas", "2011-12-26", "2011-12-26"), - ("Christmas", datetime(2015, 7, 1), []), - ("Christmas", "2011-12-25", []), -]) +@pytest.mark.parametrize( + "holiday,start,expected", + [ + (USMemorialDay, datetime(2015, 7, 1), []), + (USMemorialDay, "2015-05-25", "2015-05-25"), + (USLaborDay, datetime(2015, 7, 1), []), + (USLaborDay, "2015-09-07", "2015-09-07"), + (USColumbusDay, datetime(2015, 7, 1), []), + (USColumbusDay, "2015-10-12", "2015-10-12"), + (USThanksgivingDay, datetime(2015, 7, 1), []), + (USThanksgivingDay, "2015-11-26", "2015-11-26"), + (USMartinLutherKingJr, datetime(2015, 7, 1), []), + (USMartinLutherKingJr, "2015-01-19", "2015-01-19"), + (USPresidentsDay, datetime(2015, 7, 1), []), + (USPresidentsDay, "2015-02-16", "2015-02-16"), + (GoodFriday, datetime(2015, 7, 1), []), + (GoodFriday, "2015-04-03", "2015-04-03"), + (EasterMonday, "2015-04-06", "2015-04-06"), + (EasterMonday, datetime(2015, 7, 1), []), + (EasterMonday, "2015-04-05", []), + ("New Years Day", "2015-01-01", "2015-01-01"), + ("New Years Day", "2010-12-31", "2010-12-31"), + ("New Years Day", datetime(2015, 7, 1), []), + ("New Years Day", "2011-01-01", []), + ("July 4th", "2015-07-03", "2015-07-03"), + ("July 4th", datetime(2015, 7, 1), []), + ("July 4th", "2015-07-04", []), + ("Veterans Day", "2012-11-12", "2012-11-12"), + ("Veterans 
Day", datetime(2015, 7, 1), []), + ("Veterans Day", "2012-11-11", []), + ("Christmas", "2011-12-26", "2011-12-26"), + ("Christmas", datetime(2015, 7, 1), []), + ("Christmas", "2011-12-25", []), + ], +) def test_holidays_within_dates(holiday, start, expected): # see gh-11477 # @@ -132,26 +195,34 @@ def test_holidays_within_dates(holiday, start, expected): _check_holiday_results(holiday, start, start, expected) -@pytest.mark.parametrize("transform", [ - lambda x: x.strftime("%Y-%m-%d"), - lambda x: Timestamp(x) -]) +@pytest.mark.parametrize( + "transform", [lambda x: x.strftime("%Y-%m-%d"), lambda x: Timestamp(x)] +) def test_argument_types(transform): start_date = datetime(2011, 1, 1) end_date = datetime(2020, 12, 31) holidays = USThanksgivingDay.dates(start_date, end_date) - holidays2 = USThanksgivingDay.dates( - transform(start_date), transform(end_date)) + holidays2 = USThanksgivingDay.dates(transform(start_date), transform(end_date)) tm.assert_index_equal(holidays, holidays2) -@pytest.mark.parametrize("name,kwargs", [ - ("One-Time", dict(year=2012, month=5, day=28)), - ("Range", dict(month=5, day=28, start_date=datetime(2012, 1, 1), - end_date=datetime(2012, 12, 31), - offset=DateOffset(weekday=MO(1)))) -]) +@pytest.mark.parametrize( + "name,kwargs", + [ + ("One-Time", dict(year=2012, month=5, day=28)), + ( + "Range", + dict( + month=5, + day=28, + start_date=datetime(2012, 1, 1), + end_date=datetime(2012, 12, 31), + offset=DateOffset(weekday=MO(1)), + ), + ), + ], +) def test_special_holidays(name, kwargs): base_date = [datetime(2012, 5, 28)] holiday = Holiday(name, **kwargs) @@ -171,12 +242,12 @@ class TestCalendar(AbstractHolidayCalendar): def test_factory(): - class_1 = HolidayCalendarFactory("MemorialDay", - AbstractHolidayCalendar, - USMemorialDay) - class_2 = HolidayCalendarFactory("Thanksgiving", - AbstractHolidayCalendar, - USThanksgivingDay) + class_1 = HolidayCalendarFactory( + "MemorialDay", AbstractHolidayCalendar, USMemorialDay + ) + class_2 = HolidayCalendarFactory( + "Thanksgiving", AbstractHolidayCalendar, USThanksgivingDay + ) class_3 = HolidayCalendarFactory("Combined", class_1, class_2) assert len(class_1.rules) == 1 @@ -188,6 +259,10 @@ def test_both_offset_observance_raises(): # see gh-10217 msg = "Cannot use both offset and observance" with pytest.raises(NotImplementedError, match=msg): - Holiday("Cyber Monday", month=11, day=1, - offset=[DateOffset(weekday=SA(4))], - observance=next_monday) + Holiday( + "Cyber Monday", + month=11, + day=1, + offset=[DateOffset(weekday=SA(4))], + observance=next_monday, + ) diff --git a/pandas/tests/tseries/holiday/test_observance.py b/pandas/tests/tseries/holiday/test_observance.py index 1c22918b2efd8..9ee63d2a36556 100644 --- a/pandas/tests/tseries/holiday/test_observance.py +++ b/pandas/tests/tseries/holiday/test_observance.py @@ -3,9 +3,17 @@ import pytest from pandas.tseries.holiday import ( - after_nearest_workday, before_nearest_workday, nearest_workday, - next_monday, next_monday_or_tuesday, next_workday, previous_friday, - previous_workday, sunday_to_monday, weekend_to_monday) + after_nearest_workday, + before_nearest_workday, + nearest_workday, + next_monday, + next_monday_or_tuesday, + next_workday, + previous_friday, + previous_workday, + sunday_to_monday, + weekend_to_monday, +) _WEDNESDAY = datetime(2014, 4, 9) _THURSDAY = datetime(2014, 4, 10) @@ -21,11 +29,9 @@ def test_next_monday(day): assert next_monday(day) == _MONDAY -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _TUESDAY), - 
(_MONDAY, _TUESDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_MONDAY, _TUESDAY)] +) def test_next_monday_or_tuesday(day, expected): assert next_monday_or_tuesday(day) == expected @@ -39,55 +45,43 @@ def test_sunday_to_monday(): assert sunday_to_monday(_SUNDAY) == _MONDAY -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _FRIDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) def test_nearest_workday(day, expected): assert nearest_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _MONDAY)] +) def test_weekend_to_monday(day, expected): assert weekend_to_monday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _MONDAY), - (_MONDAY, _TUESDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _MONDAY), (_MONDAY, _TUESDAY)] +) def test_next_workday(day, expected): assert next_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _FRIDAY), - (_SUNDAY, _FRIDAY), - (_TUESDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _FRIDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) def test_previous_workday(day, expected): assert previous_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _THURSDAY), - (_SUNDAY, _FRIDAY), - (_TUESDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _THURSDAY), (_SUNDAY, _FRIDAY), (_TUESDAY, _MONDAY)] +) def test_before_nearest_workday(day, expected): assert before_nearest_workday(day) == expected -@pytest.mark.parametrize("day,expected", [ - (_SATURDAY, _MONDAY), - (_SUNDAY, _TUESDAY), - (_FRIDAY, _MONDAY) -]) +@pytest.mark.parametrize( + "day,expected", [(_SATURDAY, _MONDAY), (_SUNDAY, _TUESDAY), (_FRIDAY, _MONDAY)] +) def test_after_nearest_workday(day, expected): assert after_nearest_workday(day) == expected diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index e6177fa4aaa4b..079fcc36ff3ee 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -12,13 +12,15 @@ def assert_offset_equal(offset, base, expected): assert actual_swapped == expected assert actual_apply == expected except AssertionError: - raise AssertionError("\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % - (expected, actual, offset, base)) + raise AssertionError( + "\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % (expected, actual, offset, base) + ) def assert_onOffset(offset, date, expected): actual = offset.onOffset(date) - assert actual == expected, ("\nExpected: %s\nActual: %s\nFor Offset: %s)" - "\nAt Date: %s" % - (expected, actual, offset, date)) + assert actual == expected, ( + "\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s" % (expected, actual, offset, date) + ) diff --git a/pandas/tests/tseries/offsets/conftest.py b/pandas/tests/tseries/offsets/conftest.py index c192a56b205ca..2f6868f56c073 100644 --- a/pandas/tests/tseries/offsets/conftest.py +++ b/pandas/tests/tseries/offsets/conftest.py @@ -11,9 +11,13 @@ def offset_types(request): return request.param -@pytest.fixture(params=[getattr(offsets, o) for o in 
offsets.__all__ if - issubclass(getattr(offsets, o), offsets.MonthOffset) - and o != 'MonthOffset']) +@pytest.fixture( + params=[ + getattr(offsets, o) + for o in offsets.__all__ + if issubclass(getattr(offsets, o), offsets.MonthOffset) and o != "MonthOffset" + ] +) def month_classes(request): """ Fixture for month based datetime offsets available for a time series. diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index b61f5f3b013e4..c24d917a5e454 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -34,49 +34,61 @@ def makeFY5253LastOfMonth(*args, **kwds): def test_get_offset_name(): - assert (makeFY5253LastOfMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=4).freqstr == "REQ-L-MAR-TUE-4") - assert (makeFY5253NearestEndMonthQuarter( - weekday=1, startingMonth=3, - qtr_with_extra_week=3).freqstr == "REQ-N-MAR-TUE-3") + assert ( + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ).freqstr + == "REQ-L-MAR-TUE-4" + ) + assert ( + makeFY5253NearestEndMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=3 + ).freqstr + == "REQ-N-MAR-TUE-3" + ) def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('gibberish') + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('QS-JAN-B') + get_offset("QS-JAN-B") pairs = [ - ("RE-N-DEC-MON", - makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), - ("RE-L-DEC-TUE", - makeFY5253LastOfMonth(weekday=1, startingMonth=12)), - ("REQ-L-MAR-TUE-4", - makeFY5253LastOfMonthQuarter(weekday=1, - startingMonth=3, - qtr_with_extra_week=4)), - ("REQ-L-DEC-MON-3", - makeFY5253LastOfMonthQuarter(weekday=0, - startingMonth=12, - qtr_with_extra_week=3)), - ("REQ-N-DEC-MON-3", - makeFY5253NearestEndMonthQuarter(weekday=0, - startingMonth=12, - qtr_with_extra_week=3))] + ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), + ("RE-L-DEC-TUE", makeFY5253LastOfMonth(weekday=1, startingMonth=12)), + ( + "REQ-L-MAR-TUE-4", + makeFY5253LastOfMonthQuarter( + weekday=1, startingMonth=3, qtr_with_extra_week=4 + ), + ), + ( + "REQ-L-DEC-MON-3", + makeFY5253LastOfMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ( + "REQ-N-DEC-MON-3", + makeFY5253NearestEndMonthQuarter( + weekday=0, startingMonth=12, qtr_with_extra_week=3 + ), + ), + ] for name, expected in pairs: offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( + name, + expected, + offset, + ) class TestFY5253LastOfMonth(Base): - offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, - weekday=WeekDay.SAT) + offset_lom_sat_aug = makeFY5253LastOfMonth(1, startingMonth=8, weekday=WeekDay.SAT) + offset_lom_sat_sep = makeFY5253LastOfMonth(1, startingMonth=9, weekday=WeekDay.SAT) on_offset_cases = [ # From Wikipedia (see: @@ -95,14 +107,12 @@ class TestFY5253LastOfMonth(Base): (offset_lom_sat_aug, datetime(2017, 8, 26), True), (offset_lom_sat_aug, datetime(2018, 8, 25), True), (offset_lom_sat_aug, datetime(2019, 8, 31), True), - (offset_lom_sat_aug, datetime(2006, 8, 27), False), (offset_lom_sat_aug, datetime(2007, 8, 28), False), (offset_lom_sat_aug, datetime(2008, 8, 31), False), 
(offset_lom_sat_aug, datetime(2009, 8, 30), False), (offset_lom_sat_aug, datetime(2010, 8, 29), False), (offset_lom_sat_aug, datetime(2011, 8, 28), False), - (offset_lom_sat_aug, datetime(2006, 8, 25), False), (offset_lom_sat_aug, datetime(2007, 8, 24), False), (offset_lom_sat_aug, datetime(2008, 8, 29), False), @@ -110,42 +120,48 @@ class TestFY5253LastOfMonth(Base): (offset_lom_sat_aug, datetime(2010, 8, 27), False), (offset_lom_sat_aug, datetime(2011, 8, 26), False), (offset_lom_sat_aug, datetime(2019, 8, 30), False), - # From GMCR (see for example: # http://yahoo.brand.edgar-online.com/Default.aspx? # companyid=3184&formtypeID=7) (offset_lom_sat_sep, datetime(2010, 9, 25), True), (offset_lom_sat_sep, datetime(2011, 9, 24), True), - (offset_lom_sat_sep, datetime(2012, 9, 29), True)] + (offset_lom_sat_sep, datetime(2012, 9, 29), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_apply(self): - offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_aug_sat_1 = makeFY5253LastOfMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT) - - date_seq_lom_aug_sat = [datetime(2006, 8, 26), datetime(2007, 8, 25), - datetime(2008, 8, 30), datetime(2009, 8, 29), - datetime(2010, 8, 28), datetime(2011, 8, 27), - datetime(2012, 8, 25), datetime(2013, 8, 31), - datetime(2014, 8, 30), datetime(2015, 8, 29), - datetime(2016, 8, 27)] + offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) + offset_lom_aug_sat_1 = makeFY5253LastOfMonth( + n=1, startingMonth=8, weekday=WeekDay.SAT + ) + + date_seq_lom_aug_sat = [ + datetime(2006, 8, 26), + datetime(2007, 8, 25), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 8, 27), + datetime(2012, 8, 25), + datetime(2013, 8, 31), + datetime(2014, 8, 30), + datetime(2015, 8, 29), + datetime(2016, 8, 27), + ] tests = [ (offset_lom_aug_sat, date_seq_lom_aug_sat), (offset_lom_aug_sat_1, date_seq_lom_aug_sat), - (offset_lom_aug_sat, [ - datetime(2006, 8, 25)] + date_seq_lom_aug_sat), - (offset_lom_aug_sat_1, [ - datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), - (makeFY5253LastOfMonth(n=-1, startingMonth=8, - weekday=WeekDay.SAT), - list(reversed(date_seq_lom_aug_sat))), + (offset_lom_aug_sat, [datetime(2006, 8, 25)] + date_seq_lom_aug_sat), + (offset_lom_aug_sat_1, [datetime(2006, 8, 27)] + date_seq_lom_aug_sat[1:]), + ( + makeFY5253LastOfMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_lom_aug_sat)), + ), ] for test in tests: offset, data = test @@ -156,44 +172,36 @@ def test_apply(self): class TestFY5253NearestEndMonth(Base): - def test_get_year_end(self): - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.SAT).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 8, 31)) - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.SUN).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 9, 1)) - assert (makeFY5253NearestEndMonth( - startingMonth=8, weekday=WeekDay.FRI).get_year_end( - datetime(2013, 1, 1)) == datetime(2013, 8, 30)) - - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") - assert (offset_n.get_year_end(datetime(2012, 1, 1)) == - datetime(2013, 1, 1)) - assert (offset_n.get_year_end(datetime(2012, 1, 10)) == - datetime(2013, 1, 1)) - - assert (offset_n.get_year_end(datetime(2013, 1, 1)) == - 
datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 2)) == - datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 3)) == - datetime(2013, 12, 31)) - assert (offset_n.get_year_end(datetime(2013, 1, 10)) == - datetime(2013, 12, 31)) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SAT + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 31) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.SUN + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 9, 1) + assert makeFY5253NearestEndMonth( + startingMonth=8, weekday=WeekDay.FRI + ).get_year_end(datetime(2013, 1, 1)) == datetime(2013, 8, 30) + + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") + assert offset_n.get_year_end(datetime(2012, 1, 1)) == datetime(2013, 1, 1) + assert offset_n.get_year_end(datetime(2012, 1, 10)) == datetime(2013, 1, 1) + + assert offset_n.get_year_end(datetime(2013, 1, 1)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 2)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 3)) == datetime(2013, 12, 31) + assert offset_n.get_year_end(datetime(2013, 1, 10)) == datetime(2013, 12, 31) JNJ = FY5253(n=1, startingMonth=12, weekday=6, variation="nearest") - assert (JNJ.get_year_end(datetime(2006, 1, 1)) == - datetime(2006, 12, 31)) + assert JNJ.get_year_end(datetime(2006, 1, 1)) == datetime(2006, 12, 31) - offset_lom_aug_sat = makeFY5253NearestEndMonth(1, startingMonth=8, - weekday=WeekDay.SAT) - offset_lom_aug_thu = makeFY5253NearestEndMonth(1, startingMonth=8, - weekday=WeekDay.THU) - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") + offset_lom_aug_sat = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.SAT + ) + offset_lom_aug_thu = makeFY5253NearestEndMonth( + 1, startingMonth=8, weekday=WeekDay.THU + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") on_offset_cases = [ # From Wikipedia (see: @@ -219,19 +227,16 @@ def test_get_year_end(self): (offset_lom_aug_sat, datetime(2009, 8, 29), True), (offset_lom_aug_sat, datetime(2010, 8, 28), True), (offset_lom_aug_sat, datetime(2011, 9, 3), True), - (offset_lom_aug_sat, datetime(2016, 9, 3), True), (offset_lom_aug_sat, datetime(2017, 9, 2), True), (offset_lom_aug_sat, datetime(2018, 9, 1), True), (offset_lom_aug_sat, datetime(2019, 8, 31), True), - (offset_lom_aug_sat, datetime(2006, 8, 27), False), (offset_lom_aug_sat, datetime(2007, 8, 28), False), (offset_lom_aug_sat, datetime(2008, 8, 31), False), (offset_lom_aug_sat, datetime(2009, 8, 30), False), (offset_lom_aug_sat, datetime(2010, 8, 29), False), (offset_lom_aug_sat, datetime(2011, 8, 28), False), - (offset_lom_aug_sat, datetime(2006, 8, 25), False), (offset_lom_aug_sat, datetime(2007, 8, 24), False), (offset_lom_aug_sat, datetime(2008, 8, 29), False), @@ -239,62 +244,82 @@ def test_get_year_end(self): (offset_lom_aug_sat, datetime(2010, 8, 27), False), (offset_lom_aug_sat, datetime(2011, 8, 26), False), (offset_lom_aug_sat, datetime(2019, 8, 30), False), - # From Micron, see: # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 (offset_lom_aug_thu, datetime(2012, 8, 30), True), (offset_lom_aug_thu, datetime(2011, 9, 1), True), - (offset_n, datetime(2012, 12, 31), False), (offset_n, datetime(2013, 1, 1), True), - (offset_n, datetime(2013, 1, 2), False)] + (offset_n, datetime(2013, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + 
@pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_apply(self): - date_seq_nem_8_sat = [datetime(2006, 9, 2), datetime(2007, 9, 1), - datetime(2008, 8, 30), datetime(2009, 8, 29), - datetime(2010, 8, 28), datetime(2011, 9, 3)] + date_seq_nem_8_sat = [ + datetime(2006, 9, 2), + datetime(2007, 9, 1), + datetime(2008, 8, 30), + datetime(2009, 8, 29), + datetime(2010, 8, 28), + datetime(2011, 9, 3), + ] - JNJ = [datetime(2005, 1, 2), datetime(2006, 1, 1), - datetime(2006, 12, 31), datetime(2007, 12, 30), - datetime(2008, 12, 28), datetime(2010, 1, 3), - datetime(2011, 1, 2), datetime(2012, 1, 1), - datetime(2012, 12, 30)] + JNJ = [ + datetime(2005, 1, 2), + datetime(2006, 1, 1), + datetime(2006, 12, 31), + datetime(2007, 12, 30), + datetime(2008, 12, 28), + datetime(2010, 1, 3), + datetime(2011, 1, 2), + datetime(2012, 1, 1), + datetime(2012, 12, 30), + ] - DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, - variation="nearest") + DEC_SAT = FY5253(n=-1, startingMonth=12, weekday=5, variation="nearest") tests = [ - (makeFY5253NearestEndMonth(startingMonth=8, - weekday=WeekDay.SAT), - date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT), - date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), - [datetime(2006, 9, 1)] + date_seq_nem_8_sat), - (makeFY5253NearestEndMonth(n=1, startingMonth=8, - weekday=WeekDay.SAT), - [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:]), - (makeFY5253NearestEndMonth(n=-1, startingMonth=8, - weekday=WeekDay.SAT), - list(reversed(date_seq_nem_8_sat))), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), JNJ), - (makeFY5253NearestEndMonth(n=-1, startingMonth=12, - weekday=WeekDay.SUN), - list(reversed(JNJ))), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), - [datetime(2005, 1, 2), datetime(2006, 1, 1)]), - (makeFY5253NearestEndMonth(n=1, startingMonth=12, - weekday=WeekDay.SUN), - [datetime(2006, 1, 2), datetime(2006, 12, 31)]), - (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]) + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 1)] + date_seq_nem_8_sat, + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=8, weekday=WeekDay.SAT), + [datetime(2006, 9, 3)] + date_seq_nem_8_sat[1:], + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=8, weekday=WeekDay.SAT), + list(reversed(date_seq_nem_8_sat)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + JNJ, + ), + ( + makeFY5253NearestEndMonth(n=-1, startingMonth=12, weekday=WeekDay.SUN), + list(reversed(JNJ)), + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2005, 1, 2), datetime(2006, 1, 1)], + ), + ( + makeFY5253NearestEndMonth(n=1, startingMonth=12, weekday=WeekDay.SUN), + [datetime(2006, 1, 2), datetime(2006, 12, 31)], + ), + (DEC_SAT, [datetime(2013, 1, 15), datetime(2012, 12, 29)]), ] for test in tests: offset, data = test @@ -305,70 +330,82 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter(Base): - def test_isAnchored(self): assert makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4).isAnchored() + 
startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).isAnchored() assert makeFY5253LastOfMonthQuarter( - weekday=WeekDay.SAT, startingMonth=3, - qtr_with_extra_week=4).isAnchored() + weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 + ).isAnchored() assert not makeFY5253LastOfMonthQuarter( - 2, startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4).isAnchored() + 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ).isAnchored() def test_equality(self): - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) == makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4)) - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4)) - assert (makeFY5253LastOfMonthQuarter( - startingMonth=1, weekday=WeekDay.SAT, - qtr_with_extra_week=4) != makeFY5253LastOfMonthQuarter( - startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4)) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) == makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SUN, qtr_with_extra_week=4 + ) + assert makeFY5253LastOfMonthQuarter( + startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) != makeFY5253LastOfMonthQuarter( + startingMonth=2, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) def test_offset(self): - offset = makeFY5253LastOfMonthQuarter(1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset2 = makeFY5253LastOfMonthQuarter(2, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset4 = makeFY5253LastOfMonthQuarter(4, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - - offset_neg1 = makeFY5253LastOfMonthQuarter(-1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - offset_neg2 = makeFY5253LastOfMonthQuarter(-2, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - - GMCR = [datetime(2010, 3, 27), datetime(2010, 6, 26), - datetime(2010, 9, 25), datetime(2010, 12, 25), - datetime(2011, 3, 26), datetime(2011, 6, 25), - datetime(2011, 9, 24), datetime(2011, 12, 24), - datetime(2012, 3, 24), datetime(2012, 6, 23), - datetime(2012, 9, 29), datetime(2012, 12, 29), - datetime(2013, 3, 30), datetime(2013, 6, 29)] + offset = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset2 = makeFY5253LastOfMonthQuarter( + 2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset4 = makeFY5253LastOfMonthQuarter( + 4, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + offset_neg1 = makeFY5253LastOfMonthQuarter( + -1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + offset_neg2 = makeFY5253LastOfMonthQuarter( + -2, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + + GMCR = [ + datetime(2010, 3, 27), + datetime(2010, 6, 26), + datetime(2010, 9, 25), + datetime(2010, 12, 25), + datetime(2011, 3, 26), + datetime(2011, 6, 25), + datetime(2011, 9, 24), + datetime(2011, 12, 24), + datetime(2012, 3, 24), + datetime(2012, 6, 23), + datetime(2012, 9, 29), + datetime(2012, 12, 29), + datetime(2013, 3, 30), + 
datetime(2013, 6, 29), + ] assert_offset_equal(offset, base=GMCR[0], expected=GMCR[1]) - assert_offset_equal(offset, base=GMCR[0] + relativedelta(days=-1), - expected=GMCR[0]) + assert_offset_equal( + offset, base=GMCR[0] + relativedelta(days=-1), expected=GMCR[0] + ) assert_offset_equal(offset, base=GMCR[1], expected=GMCR[2]) assert_offset_equal(offset2, base=GMCR[0], expected=GMCR[2]) assert_offset_equal(offset4, base=GMCR[0], expected=GMCR[4]) assert_offset_equal(offset_neg1, base=GMCR[-1], expected=GMCR[-2]) - assert_offset_equal(offset_neg1, - base=GMCR[-1] + relativedelta(days=+1), - expected=GMCR[-1]) + assert_offset_equal( + offset_neg1, base=GMCR[-1] + relativedelta(days=+1), expected=GMCR[-1] + ) assert_offset_equal(offset_neg2, base=GMCR[-1], expected=GMCR[-3]) date = GMCR[0] + relativedelta(days=-1) @@ -381,12 +418,12 @@ def test_offset(self): assert_offset_equal(offset_neg1, date, expected) date = date + offset_neg1 - lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=8, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) - lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=9, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) + lomq_aug_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) + lomq_sep_sat_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=9, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) on_offset_cases = [ # From Wikipedia @@ -397,14 +434,12 @@ def test_offset(self): (lomq_aug_sat_4, datetime(2010, 8, 28), True), (lomq_aug_sat_4, datetime(2011, 8, 27), True), (lomq_aug_sat_4, datetime(2019, 8, 31), True), - (lomq_aug_sat_4, datetime(2006, 8, 27), False), (lomq_aug_sat_4, datetime(2007, 8, 28), False), (lomq_aug_sat_4, datetime(2008, 8, 31), False), (lomq_aug_sat_4, datetime(2009, 8, 30), False), (lomq_aug_sat_4, datetime(2010, 8, 29), False), (lomq_aug_sat_4, datetime(2011, 8, 28), False), - (lomq_aug_sat_4, datetime(2006, 8, 25), False), (lomq_aug_sat_4, datetime(2007, 8, 24), False), (lomq_aug_sat_4, datetime(2008, 8, 29), False), @@ -412,44 +447,51 @@ def test_offset(self): (lomq_aug_sat_4, datetime(2010, 8, 27), False), (lomq_aug_sat_4, datetime(2011, 8, 26), False), (lomq_aug_sat_4, datetime(2019, 8, 30), False), - # From GMCR (lomq_sep_sat_4, datetime(2010, 9, 25), True), (lomq_sep_sat_4, datetime(2011, 9, 24), True), (lomq_sep_sat_4, datetime(2012, 9, 29), True), - (lomq_sep_sat_4, datetime(2013, 6, 29), True), (lomq_sep_sat_4, datetime(2012, 6, 23), True), (lomq_sep_sat_4, datetime(2012, 6, 30), False), - (lomq_sep_sat_4, datetime(2013, 3, 30), True), (lomq_sep_sat_4, datetime(2012, 3, 24), True), - (lomq_sep_sat_4, datetime(2012, 12, 29), True), (lomq_sep_sat_4, datetime(2011, 12, 24), True), - # INTC (extra week in Q1) # See: http://www.intc.com/releasedetail.cfm?ReleaseID=542844 - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2011, 4, 2), True), - + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 4, 2), + True, + ), # see: http://google.brand.edgar-online.com/?sym=INTC&formtypeID=7 - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2012, 12, 29), True), - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1), - datetime(2011, 12, 31), True), - (makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - 
qtr_with_extra_week=1), - datetime(2010, 12, 25), True)] - - @pytest.mark.parametrize('case', on_offset_cases) + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2012, 12, 29), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2011, 12, 31), + True, + ), + ( + makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ), + datetime(2010, 12, 25), + True, + ), + ] + + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -457,47 +499,46 @@ def test_onOffset(self, case): def test_year_has_extra_week(self): # End of long Q1 assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2011, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2011, 4, 2)) # Start of long Q1 assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 26)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 26)) # End of year before year with long Q1 assert not makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2010, 12, 25)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2010, 12, 25)) - for year in [x - for x in range(1994, 2011 + 1) - if x not in [2011, 2005, 2000, 1994]]: + for year in [ + x for x in range(1994, 2011 + 1) if x not in [2011, 2005, 2000, 1994] + ]: assert not makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week( - datetime(year, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(year, 4, 2)) # Other long years assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2005, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2005, 4, 2)) assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(2000, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(2000, 4, 2)) assert makeFY5253LastOfMonthQuarter( - 1, startingMonth=12, weekday=WeekDay.SAT, - qtr_with_extra_week=1).year_has_extra_week(datetime(1994, 4, 2)) + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ).year_has_extra_week(datetime(1994, 4, 2)) def test_get_weeks(self): - sat_dec_1 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=1) - sat_dec_4 = makeFY5253LastOfMonthQuarter(1, startingMonth=12, - weekday=WeekDay.SAT, - qtr_with_extra_week=4) + sat_dec_1 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=1 + ) + sat_dec_4 = makeFY5253LastOfMonthQuarter( + 1, startingMonth=12, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) assert sat_dec_1.get_weeks(datetime(2011, 4, 2)) == [14, 13, 13, 13] assert sat_dec_4.get_weeks(datetime(2011, 4, 2)) == [13, 13, 13, 14] @@ -507,13 +548,12 @@ def 
test_get_weeks(self): class TestFY5253NearestEndMonthQuarter(Base): offset_nem_sat_aug_4 = makeFY5253NearestEndMonthQuarter( - 1, startingMonth=8, weekday=WeekDay.SAT, - qtr_with_extra_week=4) + 1, startingMonth=8, weekday=WeekDay.SAT, qtr_with_extra_week=4 + ) offset_nem_thu_aug_4 = makeFY5253NearestEndMonthQuarter( - 1, startingMonth=8, weekday=WeekDay.THU, - qtr_with_extra_week=4) - offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, - variation="nearest") + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + offset_n = FY5253(weekday=WeekDay.TUE, startingMonth=12, variation="nearest") on_offset_cases = [ # From Wikipedia @@ -523,19 +563,16 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_sat_aug_4, datetime(2009, 8, 29), True), (offset_nem_sat_aug_4, datetime(2010, 8, 28), True), (offset_nem_sat_aug_4, datetime(2011, 9, 3), True), - (offset_nem_sat_aug_4, datetime(2016, 9, 3), True), (offset_nem_sat_aug_4, datetime(2017, 9, 2), True), (offset_nem_sat_aug_4, datetime(2018, 9, 1), True), (offset_nem_sat_aug_4, datetime(2019, 8, 31), True), - (offset_nem_sat_aug_4, datetime(2006, 8, 27), False), (offset_nem_sat_aug_4, datetime(2007, 8, 28), False), (offset_nem_sat_aug_4, datetime(2008, 8, 31), False), (offset_nem_sat_aug_4, datetime(2009, 8, 30), False), (offset_nem_sat_aug_4, datetime(2010, 8, 29), False), (offset_nem_sat_aug_4, datetime(2011, 8, 28), False), - (offset_nem_sat_aug_4, datetime(2006, 8, 25), False), (offset_nem_sat_aug_4, datetime(2007, 8, 24), False), (offset_nem_sat_aug_4, datetime(2008, 8, 29), False), @@ -543,12 +580,10 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_sat_aug_4, datetime(2010, 8, 27), False), (offset_nem_sat_aug_4, datetime(2011, 8, 26), False), (offset_nem_sat_aug_4, datetime(2019, 8, 30), False), - # From Micron, see: # http://google.brand.edgar-online.com/?sym=MU&formtypeID=7 (offset_nem_thu_aug_4, datetime(2012, 8, 30), True), (offset_nem_thu_aug_4, datetime(2011, 9, 1), True), - # See: http://google.brand.edgar-online.com/?sym=MU&formtypeID=13 (offset_nem_thu_aug_4, datetime(2013, 5, 30), True), (offset_nem_thu_aug_4, datetime(2013, 2, 28), True), @@ -556,70 +591,68 @@ class TestFY5253NearestEndMonthQuarter(Base): (offset_nem_thu_aug_4, datetime(2012, 5, 31), True), (offset_nem_thu_aug_4, datetime(2007, 3, 1), True), (offset_nem_thu_aug_4, datetime(1994, 3, 3), True), - (offset_n, datetime(2012, 12, 31), False), (offset_n, datetime(2013, 1, 1), True), - (offset_n, datetime(2013, 1, 2), False)] + (offset_n, datetime(2013, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) def test_offset(self): - offset = makeFY5253NearestEndMonthQuarter(1, startingMonth=8, - weekday=WeekDay.THU, - qtr_with_extra_week=4) - - MU = [datetime(2012, 5, 31), - datetime(2012, 8, 30), datetime(2012, 11, 29), - datetime(2013, 2, 28), datetime(2013, 5, 30)] + offset = makeFY5253NearestEndMonthQuarter( + 1, startingMonth=8, weekday=WeekDay.THU, qtr_with_extra_week=4 + ) + + MU = [ + datetime(2012, 5, 31), + datetime(2012, 8, 30), + datetime(2012, 11, 29), + datetime(2013, 2, 28), + datetime(2013, 5, 30), + ] date = MU[0] + relativedelta(days=-1) for expected in MU: assert_offset_equal(offset, date, expected) date = date + offset - assert_offset_equal(offset, - datetime(2012, 5, 31), - datetime(2012, 8, 30)) - assert_offset_equal(offset, - datetime(2012, 5, 30), - 
datetime(2012, 5, 31)) + assert_offset_equal(offset, datetime(2012, 5, 31), datetime(2012, 8, 30)) + assert_offset_equal(offset, datetime(2012, 5, 30), datetime(2012, 5, 31)) - offset2 = FY5253Quarter(weekday=5, startingMonth=12, variation="last", - qtr_with_extra_week=4) + offset2 = FY5253Quarter( + weekday=5, startingMonth=12, variation="last", qtr_with_extra_week=4 + ) - assert_offset_equal(offset2, - datetime(2013, 1, 15), - datetime(2013, 3, 30)) + assert_offset_equal(offset2, datetime(2013, 1, 15), datetime(2013, 3, 30)) def test_bunched_yearends(): # GH#14774 cases with two fiscal year-ends in the same calendar-year - fy = FY5253(n=1, weekday=5, startingMonth=12, variation='nearest') - dt = Timestamp('2004-01-01') - assert fy.rollback(dt) == Timestamp('2002-12-28') - assert (-fy).apply(dt) == Timestamp('2002-12-28') - assert dt - fy == Timestamp('2002-12-28') + fy = FY5253(n=1, weekday=5, startingMonth=12, variation="nearest") + dt = Timestamp("2004-01-01") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") - assert fy.rollforward(dt) == Timestamp('2004-01-03') - assert fy.apply(dt) == Timestamp('2004-01-03') - assert fy + dt == Timestamp('2004-01-03') - assert dt + fy == Timestamp('2004-01-03') + assert fy.rollforward(dt) == Timestamp("2004-01-03") + assert fy.apply(dt) == Timestamp("2004-01-03") + assert fy + dt == Timestamp("2004-01-03") + assert dt + fy == Timestamp("2004-01-03") # Same thing, but starting from a Timestamp in the previous year. - dt = Timestamp('2003-12-31') - assert fy.rollback(dt) == Timestamp('2002-12-28') - assert (-fy).apply(dt) == Timestamp('2002-12-28') - assert dt - fy == Timestamp('2002-12-28') + dt = Timestamp("2003-12-31") + assert fy.rollback(dt) == Timestamp("2002-12-28") + assert (-fy).apply(dt) == Timestamp("2002-12-28") + assert dt - fy == Timestamp("2002-12-28") def test_fy5253_last_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0) - ts = Timestamp('1984-05-28 06:29:43.955911354+0200', - tz='Europe/San_Marino') + ts = Timestamp("1984-05-28 06:29:43.955911354+0200", tz="Europe/San_Marino") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -628,7 +661,7 @@ def test_fy5253_last_onoffset(): def test_fy5253_nearest_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2) - ts = Timestamp('2032-07-28 00:12:59.035729419+0000', tz='Africa/Dakar') + ts = Timestamp("2032-07-28 00:12:59.035729419+0000", tz="Africa/Dakar") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -636,10 +669,10 @@ def test_fy5253_nearest_onoffset(): def test_fy5253qtr_onoffset_nearest(): # GH#19036 - ts = Timestamp('1985-09-02 23:57:46.232550356-0300', - tz='Atlantic/Bermuda') - offset = FY5253Quarter(n=3, qtr_with_extra_week=1, startingMonth=2, - variation="nearest", weekday=0) + ts = Timestamp("1985-09-02 23:57:46.232550356-0300", tz="Atlantic/Bermuda") + offset = FY5253Quarter( + n=3, qtr_with_extra_week=1, startingMonth=2, variation="nearest", weekday=0 + ) fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -647,10 +680,10 @@ def test_fy5253qtr_onoffset_nearest(): def test_fy5253qtr_onoffset_last(): # GH#19036 - offset = FY5253Quarter(n=-2, qtr_with_extra_week=1, - startingMonth=7, 
variation="last", weekday=2) - ts = Timestamp('2011-01-26 19:03:40.331096129+0200', - tz='Africa/Windhoek') + offset = FY5253Quarter( + n=-2, qtr_with_extra_week=1, startingMonth=7, variation="last", weekday=2 + ) + ts = Timestamp("2011-01-26 19:03:40.331096129+0200", tz="Africa/Windhoek") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index c0021b1eade78..822e97b21f0da 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -4,9 +4,17 @@ import pytest from pandas._libs.tslibs import ( - NaT, OutOfBoundsDatetime, Timestamp, conversion, timezones) + NaT, + OutOfBoundsDatetime, + Timestamp, + conversion, + timezones, +) from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, get_freq_code, get_freq_str) + INVALID_FREQ_ERR_MSG, + get_freq_code, + get_freq_str, +) import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import ApplyTypeError import pandas.compat as compat @@ -21,11 +29,37 @@ from pandas.tseries.holiday import USFederalHolidayCalendar import pandas.tseries.offsets as offsets from pandas.tseries.offsets import ( - FY5253, BDay, BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, - BusinessHour, BYearBegin, BYearEnd, CBMonthBegin, CBMonthEnd, CDay, - CustomBusinessHour, DateOffset, Day, Easter, FY5253Quarter, - LastWeekOfMonth, MonthBegin, MonthEnd, Nano, QuarterBegin, QuarterEnd, - SemiMonthBegin, SemiMonthEnd, Tick, Week, WeekOfMonth, YearBegin, YearEnd) + FY5253, + BDay, + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BusinessHour, + BYearBegin, + BYearEnd, + CBMonthBegin, + CBMonthEnd, + CDay, + CustomBusinessHour, + DateOffset, + Day, + Easter, + FY5253Quarter, + LastWeekOfMonth, + MonthBegin, + MonthEnd, + Nano, + QuarterBegin, + QuarterEnd, + SemiMonthBegin, + SemiMonthEnd, + Tick, + Week, + WeekOfMonth, + YearBegin, + YearEnd, +) from .common import assert_offset_equal, assert_onOffset @@ -61,18 +95,34 @@ class Base: _offset = None d = Timestamp(datetime(2008, 1, 2)) - timezones = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern', - 'dateutil/Asia/Tokyo', 'dateutil/US/Pacific'] + timezones = [ + None, + "UTC", + "Asia/Tokyo", + "US/Eastern", + "dateutil/Asia/Tokyo", + "dateutil/US/Pacific", + ] def _get_offset(self, klass, value=1, normalize=False): # create instance from offset class if klass is FY5253: - klass = klass(n=value, startingMonth=1, weekday=1, - variation='last', normalize=normalize) + klass = klass( + n=value, + startingMonth=1, + weekday=1, + variation="last", + normalize=normalize, + ) elif klass is FY5253Quarter: - klass = klass(n=value, startingMonth=1, weekday=1, - qtr_with_extra_week=1, variation='last', - normalize=normalize) + klass = klass( + n=value, + startingMonth=1, + weekday=1, + qtr_with_extra_week=1, + variation="last", + normalize=normalize, + ) elif klass is LastWeekOfMonth: klass = klass(n=value, weekday=5, normalize=normalize) elif klass is WeekOfMonth: @@ -103,12 +153,12 @@ def test_apply_out_of_range(self, tz_naive_fixture): else: offset = self._get_offset(self._offset, value=10000) - result = Timestamp('20080101') + offset + result = Timestamp("20080101") + offset assert isinstance(result, datetime) assert result.tzinfo is None # Check tz is preserved - t = Timestamp('20080101', tz=tz) + t = Timestamp("20080101", tz=tz) result = t + offset assert isinstance(result, datetime) assert t.tzinfo == 
result.tzinfo @@ -189,41 +239,42 @@ class TestCommon(Base): # exected value created by Base._get_offset # are applied to 2011/01/01 09:00 (Saturday) # used for .apply and .rollforward - expecteds = {'Day': Timestamp('2011-01-02 09:00:00'), - 'DateOffset': Timestamp('2011-01-02 09:00:00'), - 'BusinessDay': Timestamp('2011-01-03 09:00:00'), - 'CustomBusinessDay': Timestamp('2011-01-03 09:00:00'), - 'CustomBusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), - 'CustomBusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), - 'MonthBegin': Timestamp('2011-02-01 09:00:00'), - 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), - 'MonthEnd': Timestamp('2011-01-31 09:00:00'), - 'SemiMonthEnd': Timestamp('2011-01-15 09:00:00'), - 'SemiMonthBegin': Timestamp('2011-01-15 09:00:00'), - 'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), - 'YearBegin': Timestamp('2012-01-01 09:00:00'), - 'BYearBegin': Timestamp('2011-01-03 09:00:00'), - 'YearEnd': Timestamp('2011-12-31 09:00:00'), - 'BYearEnd': Timestamp('2011-12-30 09:00:00'), - 'QuarterBegin': Timestamp('2011-03-01 09:00:00'), - 'BQuarterBegin': Timestamp('2011-03-01 09:00:00'), - 'QuarterEnd': Timestamp('2011-03-31 09:00:00'), - 'BQuarterEnd': Timestamp('2011-03-31 09:00:00'), - 'BusinessHour': Timestamp('2011-01-03 10:00:00'), - 'CustomBusinessHour': Timestamp('2011-01-03 10:00:00'), - 'WeekOfMonth': Timestamp('2011-01-08 09:00:00'), - 'LastWeekOfMonth': Timestamp('2011-01-29 09:00:00'), - 'FY5253Quarter': Timestamp('2011-01-25 09:00:00'), - 'FY5253': Timestamp('2011-01-25 09:00:00'), - 'Week': Timestamp('2011-01-08 09:00:00'), - 'Easter': Timestamp('2011-04-24 09:00:00'), - 'Hour': Timestamp('2011-01-01 10:00:00'), - 'Minute': Timestamp('2011-01-01 09:01:00'), - 'Second': Timestamp('2011-01-01 09:00:01'), - 'Milli': Timestamp('2011-01-01 09:00:00.001000'), - 'Micro': Timestamp('2011-01-01 09:00:00.000001'), - 'Nano': Timestamp(np_datetime64_compat( - '2011-01-01T09:00:00.000000001Z'))} + expecteds = { + "Day": Timestamp("2011-01-02 09:00:00"), + "DateOffset": Timestamp("2011-01-02 09:00:00"), + "BusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessDay": Timestamp("2011-01-03 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthBegin": Timestamp("2011-02-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2011-01-03 09:00:00"), + "MonthEnd": Timestamp("2011-01-31 09:00:00"), + "SemiMonthEnd": Timestamp("2011-01-15 09:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 09:00:00"), + "BusinessMonthEnd": Timestamp("2011-01-31 09:00:00"), + "YearBegin": Timestamp("2012-01-01 09:00:00"), + "BYearBegin": Timestamp("2011-01-03 09:00:00"), + "YearEnd": Timestamp("2011-12-31 09:00:00"), + "BYearEnd": Timestamp("2011-12-30 09:00:00"), + "QuarterBegin": Timestamp("2011-03-01 09:00:00"), + "BQuarterBegin": Timestamp("2011-03-01 09:00:00"), + "QuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BQuarterEnd": Timestamp("2011-03-31 09:00:00"), + "BusinessHour": Timestamp("2011-01-03 10:00:00"), + "CustomBusinessHour": Timestamp("2011-01-03 10:00:00"), + "WeekOfMonth": Timestamp("2011-01-08 09:00:00"), + "LastWeekOfMonth": Timestamp("2011-01-29 09:00:00"), + "FY5253Quarter": Timestamp("2011-01-25 09:00:00"), + "FY5253": Timestamp("2011-01-25 09:00:00"), + "Week": Timestamp("2011-01-08 09:00:00"), + "Easter": Timestamp("2011-04-24 09:00:00"), + "Hour": Timestamp("2011-01-01 10:00:00"), + "Minute": Timestamp("2011-01-01 09:01:00"), + "Second": 
Timestamp("2011-01-01 09:00:01"), + "Milli": Timestamp("2011-01-01 09:00:00.001000"), + "Micro": Timestamp("2011-01-01 09:00:00.000001"), + "Nano": Timestamp(np_datetime64_compat("2011-01-01T09:00:00.000000001Z")), + } def test_immutable(self, offset_types): # GH#21341 check that __setattr__ raises @@ -237,7 +288,7 @@ def test_return_type(self, offset_types): offset = self._get_offset(offset_types) # make sure that we are returning a Timestamp - result = Timestamp('20080101') + offset + result = Timestamp("20080101") + offset assert isinstance(result, Timestamp) # make sure that we are returning NaT @@ -262,7 +313,7 @@ def test_offset_timedelta64_arg(self, offset_types): # object off = self._get_offset(offset_types) - td64 = np.timedelta64(4567, 's') + td64 = np.timedelta64(4567, "s") with pytest.raises(TypeError, match="argument must be an integer"): type(off)(n=td64, **off.kwds) @@ -281,14 +332,11 @@ def test_offset_freqstr(self, offset_types): offset = self._get_offset(offset_types) freqstr = offset.freqstr - if freqstr not in ('', - "", - 'LWOM-SAT', ): + if freqstr not in ("", "", "LWOM-SAT"): code = get_offset(freqstr) assert offset.rule_code == code - def _check_offsetfunc_works(self, offset, funcname, dt, expected, - normalize=False): + def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): if normalize and issubclass(offset, Tick): # normalize=True disallowed for Tick subclasses GH#21427 @@ -309,14 +357,15 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, exp_warning = None ts = Timestamp(dt) + Nano(5) - if (offset_s.__class__.__name__ == 'DateOffset' and - (funcname == 'apply' or normalize) and - ts.nanosecond > 0): + if ( + offset_s.__class__.__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): exp_warning = UserWarning # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning, - check_stacklevel=False): + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -345,14 +394,15 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, exp_warning = None ts = Timestamp(dt, tz=tz) + Nano(5) - if (offset_s.__class__.__name__ == 'DateOffset' and - (funcname == 'apply' or normalize) and - ts.nanosecond > 0): + if ( + offset_s.__class__.__name__ == "DateOffset" + and (funcname == "apply" or normalize) + and ts.nanosecond > 0 + ): exp_warning = UserWarning # test nanosecond is preserved - with tm.assert_produces_warning(exp_warning, - check_stacklevel=False): + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result = func(ts) assert isinstance(result, Timestamp) if normalize is False: @@ -362,119 +412,146 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, def test_apply(self, offset_types): sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = self.expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'apply', dt, expected) + self._check_offsetfunc_works(offset_types, "apply", dt, expected) expected = Timestamp(expected.date()) - self._check_offsetfunc_works(offset_types, 'apply', dt, expected, - normalize=True) + self._check_offsetfunc_works( + offset_types, "apply", dt, expected, normalize=True + ) def test_rollforward(self, offset_types): expecteds = self.expecteds.copy() # result will not be changed 
if the target is on the offset - no_changes = ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', - 'Week', 'Hour', 'Minute', 'Second', 'Milli', 'Micro', - 'Nano', 'DateOffset'] + no_changes = [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", + ] for n in no_changes: - expecteds[n] = Timestamp('2011/01/01 09:00') + expecteds[n] = Timestamp("2011/01/01 09:00") - expecteds['BusinessHour'] = Timestamp('2011-01-03 09:00:00') - expecteds['CustomBusinessHour'] = Timestamp('2011-01-03 09:00:00') + expecteds["BusinessHour"] = Timestamp("2011-01-03 09:00:00") + expecteds["CustomBusinessHour"] = Timestamp("2011-01-03 09:00:00") # but be changed when normalize=True norm_expected = expecteds.copy() for k in norm_expected: norm_expected[k] = Timestamp(norm_expected[k].date()) - normalized = {'Day': Timestamp('2011-01-02 00:00:00'), - 'DateOffset': Timestamp('2011-01-02 00:00:00'), - 'MonthBegin': Timestamp('2011-02-01 00:00:00'), - 'SemiMonthBegin': Timestamp('2011-01-15 00:00:00'), - 'YearBegin': Timestamp('2012-01-01 00:00:00'), - 'Week': Timestamp('2011-01-08 00:00:00'), - 'Hour': Timestamp('2011-01-01 00:00:00'), - 'Minute': Timestamp('2011-01-01 00:00:00'), - 'Second': Timestamp('2011-01-01 00:00:00'), - 'Milli': Timestamp('2011-01-01 00:00:00'), - 'Micro': Timestamp('2011-01-01 00:00:00')} + normalized = { + "Day": Timestamp("2011-01-02 00:00:00"), + "DateOffset": Timestamp("2011-01-02 00:00:00"), + "MonthBegin": Timestamp("2011-02-01 00:00:00"), + "SemiMonthBegin": Timestamp("2011-01-15 00:00:00"), + "YearBegin": Timestamp("2012-01-01 00:00:00"), + "Week": Timestamp("2011-01-08 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 00:00:00"), + } norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollforward', dt, - expected) + self._check_offsetfunc_works(offset_types, "rollforward", dt, expected) expected = norm_expected[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollforward', dt, - expected, normalize=True) + self._check_offsetfunc_works( + offset_types, "rollforward", dt, expected, normalize=True + ) def test_rollback(self, offset_types): - expecteds = {'BusinessDay': Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessDay': Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessMonthEnd': - Timestamp('2010-12-31 09:00:00'), - 'CustomBusinessMonthBegin': - Timestamp('2010-12-01 09:00:00'), - 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), - 'MonthEnd': Timestamp('2010-12-31 09:00:00'), - 'SemiMonthEnd': Timestamp('2010-12-31 09:00:00'), - 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), - 'BYearBegin': Timestamp('2010-01-01 09:00:00'), - 'YearEnd': Timestamp('2010-12-31 09:00:00'), - 'BYearEnd': Timestamp('2010-12-31 09:00:00'), - 'QuarterBegin': Timestamp('2010-12-01 09:00:00'), - 'BQuarterBegin': Timestamp('2010-12-01 09:00:00'), - 'QuarterEnd': Timestamp('2010-12-31 09:00:00'), - 'BQuarterEnd': Timestamp('2010-12-31 09:00:00'), - 'BusinessHour': Timestamp('2010-12-31 17:00:00'), - 'CustomBusinessHour': Timestamp('2010-12-31 17:00:00'), - 'WeekOfMonth': 
Timestamp('2010-12-11 09:00:00'), - 'LastWeekOfMonth': Timestamp('2010-12-25 09:00:00'), - 'FY5253Quarter': Timestamp('2010-10-26 09:00:00'), - 'FY5253': Timestamp('2010-01-26 09:00:00'), - 'Easter': Timestamp('2010-04-04 09:00:00')} + expecteds = { + "BusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessDay": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "CustomBusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "BusinessMonthBegin": Timestamp("2010-12-01 09:00:00"), + "MonthEnd": Timestamp("2010-12-31 09:00:00"), + "SemiMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessMonthEnd": Timestamp("2010-12-31 09:00:00"), + "BYearBegin": Timestamp("2010-01-01 09:00:00"), + "YearEnd": Timestamp("2010-12-31 09:00:00"), + "BYearEnd": Timestamp("2010-12-31 09:00:00"), + "QuarterBegin": Timestamp("2010-12-01 09:00:00"), + "BQuarterBegin": Timestamp("2010-12-01 09:00:00"), + "QuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BQuarterEnd": Timestamp("2010-12-31 09:00:00"), + "BusinessHour": Timestamp("2010-12-31 17:00:00"), + "CustomBusinessHour": Timestamp("2010-12-31 17:00:00"), + "WeekOfMonth": Timestamp("2010-12-11 09:00:00"), + "LastWeekOfMonth": Timestamp("2010-12-25 09:00:00"), + "FY5253Quarter": Timestamp("2010-10-26 09:00:00"), + "FY5253": Timestamp("2010-01-26 09:00:00"), + "Easter": Timestamp("2010-04-04 09:00:00"), + } # result will not be changed if the target is on the offset - for n in ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', 'Week', - 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset']: - expecteds[n] = Timestamp('2011/01/01 09:00') + for n in [ + "Day", + "MonthBegin", + "SemiMonthBegin", + "YearBegin", + "Week", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", + ]: + expecteds[n] = Timestamp("2011/01/01 09:00") # but be changed when normalize=True norm_expected = expecteds.copy() for k in norm_expected: norm_expected[k] = Timestamp(norm_expected[k].date()) - normalized = {'Day': Timestamp('2010-12-31 00:00:00'), - 'DateOffset': Timestamp('2010-12-31 00:00:00'), - 'MonthBegin': Timestamp('2010-12-01 00:00:00'), - 'SemiMonthBegin': Timestamp('2010-12-15 00:00:00'), - 'YearBegin': Timestamp('2010-01-01 00:00:00'), - 'Week': Timestamp('2010-12-25 00:00:00'), - 'Hour': Timestamp('2011-01-01 00:00:00'), - 'Minute': Timestamp('2011-01-01 00:00:00'), - 'Second': Timestamp('2011-01-01 00:00:00'), - 'Milli': Timestamp('2011-01-01 00:00:00'), - 'Micro': Timestamp('2011-01-01 00:00:00')} + normalized = { + "Day": Timestamp("2010-12-31 00:00:00"), + "DateOffset": Timestamp("2010-12-31 00:00:00"), + "MonthBegin": Timestamp("2010-12-01 00:00:00"), + "SemiMonthBegin": Timestamp("2010-12-15 00:00:00"), + "YearBegin": Timestamp("2010-01-01 00:00:00"), + "Week": Timestamp("2010-12-25 00:00:00"), + "Hour": Timestamp("2011-01-01 00:00:00"), + "Minute": Timestamp("2011-01-01 00:00:00"), + "Second": Timestamp("2011-01-01 00:00:00"), + "Milli": Timestamp("2011-01-01 00:00:00"), + "Micro": Timestamp("2011-01-01 00:00:00"), + } norm_expected.update(normalized) sdt = datetime(2011, 1, 1, 9, 0) - ndt = np_datetime64_compat('2011-01-01 09:00Z') + ndt = np_datetime64_compat("2011-01-01 09:00Z") for dt in [sdt, ndt]: expected = expecteds[offset_types.__name__] - self._check_offsetfunc_works(offset_types, 'rollback', dt, - expected) + self._check_offsetfunc_works(offset_types, "rollback", dt, expected) expected = norm_expected[offset_types.__name__] - 
self._check_offsetfunc_works(offset_types, 'rollback', dt, - expected, normalize=True) + self._check_offsetfunc_works( + offset_types, "rollback", dt, expected, normalize=True + ) def test_onOffset(self, offset_types): dt = self.expecteds[offset_types.__name__] @@ -531,14 +608,15 @@ def test_add(self, offset_types, tz_naive_fixture): assert result == expected_localize def test_pickle_v0_15_2(self, datapath): - offsets = {'DateOffset': DateOffset(years=1), - 'MonthBegin': MonthBegin(1), - 'Day': Day(1), - 'YearBegin': YearBegin(1), - 'Week': Week(1)} - - pickle_path = datapath('tseries', 'offsets', 'data', - 'dateoffset_0_15_2.pickle') + offsets = { + "DateOffset": DateOffset(years=1), + "MonthBegin": MonthBegin(1), + "Day": Day(1), + "YearBegin": YearBegin(1), + "Week": Week(1), + } + + pickle_path = datapath("tseries", "offsets", "data", "dateoffset_0_15_2.pickle") # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -546,7 +624,6 @@ def test_pickle_v0_15_2(self, datapath): class TestDateOffset(Base): - def setup_method(self, method): self.d = Timestamp(datetime(2008, 1, 2)) _offset_map.clear() @@ -563,19 +640,19 @@ def test_mul(self): def test_constructor(self): - assert ((self.d + DateOffset(months=2)) == datetime(2008, 3, 2)) - assert ((self.d - DateOffset(months=2)) == datetime(2007, 11, 2)) + assert (self.d + DateOffset(months=2)) == datetime(2008, 3, 2) + assert (self.d - DateOffset(months=2)) == datetime(2007, 11, 2) - assert ((self.d + DateOffset(2)) == datetime(2008, 1, 4)) + assert (self.d + DateOffset(2)) == datetime(2008, 1, 4) assert not DateOffset(2).isAnchored() assert DateOffset(1).isAnchored() d = datetime(2008, 1, 31) - assert ((d + DateOffset(months=1)) == datetime(2008, 2, 29)) + assert (d + DateOffset(months=1)) == datetime(2008, 2, 29) def test_copy(self): - assert (DateOffset(months=2).copy() == DateOffset(months=2)) + assert DateOffset(months=2).copy() == DateOffset(months=2) def test_eq(self): offset1 = DateOffset(days=1) @@ -601,13 +678,13 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '<BusinessDay>' - assert repr(self.offset2) == '<2 * BusinessDays>' + assert repr(self.offset) == "<BusinessDay>" + assert repr(self.offset2) == "<2 * BusinessDays>" if compat.PY37: - expected = '<BusinessDay: offset=datetime.timedelta(days=1)>' + expected = "<BusinessDay: offset=datetime.timedelta(days=1)>" else: - expected = '<BusinessDay: offset=datetime.timedelta(1)>' + expected = "<BusinessDay: offset=datetime.timedelta(1)>" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -631,15 +708,13 @@ def testRollback1(self): assert BDay(10).rollback(self.d) == self.d def testRollback2(self): - assert (BDay(10).rollback(datetime(2008, 1, 5)) == - datetime(2008, 1, 4)) + assert BDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4) def testRollforward1(self): assert BDay(10).rollforward(self.d) == self.d def testRollforward2(self): - assert (BDay(10).rollforward(datetime(2008, 1, 5)) == - datetime(2008, 1, 7)) + assert BDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7) def test_roll_date_object(self): offset = BDay() @@ -660,52 +735,84 @@ def test_roll_date_object(self): assert result == datetime(2012, 9, 15) def test_onOffset(self): - tests = [(BDay(), datetime(2008, 1, 1), True), - (BDay(), datetime(2008, 1, 5), False)] + tests = [ + (BDay(), datetime(2008, 1, 1), True), + (BDay(), datetime(2008, 1, 5), False), + ] for offset, d, expected in tests: assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((BDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 2), -
datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - apply_cases.append((2 * BDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - apply_cases.append((-BDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - apply_cases.append((-2 * BDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) - - apply_cases.append((BDay(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), + }, + ) + ) + + apply_cases.append( + ( + 2 * BDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * BDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + BDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -736,8 +843,7 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - msg = ("Only know how to combine business day with datetime or" - " timedelta") + msg = "Only know how to combine business day with datetime or" " timedelta" with pytest.raises(ApplyTypeError, match=msg): BDay().apply(BMonthEnd()) @@ -755,69 +861,56 @@ def setup_method(self, method): 
self.offset4 = BusinessHour(n=-4) from datetime import time as dt_time + self.offset5 = BusinessHour(start=dt_time(11, 0), end=dt_time(14, 30)) - self.offset6 = BusinessHour(start='20:00', end='05:00') - self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), - end=dt_time(6, 30)) - self.offset8 = BusinessHour(start=['09:00', '13:00'], - end=['12:00', '17:00']) - self.offset9 = BusinessHour(n=3, start=['09:00', '22:00'], - end=['13:00', '03:00']) - self.offset10 = BusinessHour(n=-1, start=['23:00', '13:00'], - end=['02:00', '17:00']) - - @pytest.mark.parametrize("start,end,match", [ - ( - dt_time(11, 0, 5), - '17:00', - "time data must be specified only with hour and minute" - ), - ( - 'AAA', - '17:00', - "time data must match '%H:%M' format" - ), - ( - '14:00:05', - '17:00', - "time data must match '%H:%M' format" - ), - ( - [], - '17:00', - "Must include at least 1 start time" - ), - ( - '09:00', - [], - "Must include at least 1 end time" - ), - ( - ['09:00', '11:00'], - '17:00', - "number of starting time and ending time must be the same" - ), - ( - ['09:00', '11:00'], - ['10:00'], - "number of starting time and ending time must be the same" - ), - ( - ['09:00', '11:00'], - ['12:00', '20:00'], - r"invalid starting and ending time\(s\): opening hours should not " - "touch or overlap with one another" - ), - ( - ['12:00', '20:00'], - ['09:00', '11:00'], - r"invalid starting and ending time\(s\): opening hours should not " - "touch or overlap with one another" - ), - ]) + self.offset6 = BusinessHour(start="20:00", end="05:00") + self.offset7 = BusinessHour(n=-2, start=dt_time(21, 30), end=dt_time(6, 30)) + self.offset8 = BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]) + self.offset9 = BusinessHour( + n=3, start=["09:00", "22:00"], end=["13:00", "03:00"] + ) + self.offset10 = BusinessHour( + n=-1, start=["23:00", "13:00"], end=["02:00", "17:00"] + ) + + @pytest.mark.parametrize( + "start,end,match", + [ + ( + dt_time(11, 0, 5), + "17:00", + "time data must be specified only with hour and minute", + ), + ("AAA", "17:00", "time data must match '%H:%M' format"), + ("14:00:05", "17:00", "time data must match '%H:%M' format"), + ([], "17:00", "Must include at least 1 start time"), + ("09:00", [], "Must include at least 1 end time"), + ( + ["09:00", "11:00"], + "17:00", + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["10:00"], + "number of starting time and ending time must be the same", + ), + ( + ["09:00", "11:00"], + ["12:00", "20:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ( + ["12:00", "20:00"], + ["09:00", "11:00"], + r"invalid starting and ending time\(s\): opening hours should not " + "touch or overlap with one another", + ), + ], + ) def test_constructor_errors(self, start, end, match): - with pytest.raises(ValueError, - match=match): + with pytest.raises(ValueError, match=match): BusinessHour(start=start, end=end) def test_different_normalize_equals(self): @@ -827,68 +920,67 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset1) == '<BusinessHour: BH=09:00-17:00>' - assert repr(self.offset2) == '<3 * BusinessHours: BH=09:00-17:00>' - assert repr(self.offset3) == '<-1 * BusinessHour: BH=09:00-17:00>' - assert repr(self.offset4) == '<-4 * BusinessHours: BH=09:00-17:00>' - - assert repr(self.offset5) == '<BusinessHour: BH=11:00-14:30>' - assert repr(self.offset6) == '<BusinessHour: BH=20:00-05:00>' - assert repr(self.offset7) == '<-2 * BusinessHours: BH=21:30-06:30>' -
assert (repr(self.offset8) == - '<BusinessHour: BH=09:00-12:00,13:00-17:00>') - assert (repr(self.offset9) == - '<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>') - assert (repr(self.offset10) == - '<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>') + assert repr(self.offset1) == "<BusinessHour: BH=09:00-17:00>" + assert repr(self.offset2) == "<3 * BusinessHours: BH=09:00-17:00>" + assert repr(self.offset3) == "<-1 * BusinessHour: BH=09:00-17:00>" + assert repr(self.offset4) == "<-4 * BusinessHours: BH=09:00-17:00>" + + assert repr(self.offset5) == "<BusinessHour: BH=11:00-14:30>" + assert repr(self.offset6) == "<BusinessHour: BH=20:00-05:00>" + assert repr(self.offset7) == "<-2 * BusinessHours: BH=21:30-06:30>" + assert repr(self.offset8) == "<BusinessHour: BH=09:00-12:00,13:00-17:00>" + assert repr(self.offset9) == "<3 * BusinessHours: BH=09:00-13:00,22:00-03:00>" + assert repr(self.offset10) == "<-1 * BusinessHour: BH=13:00-17:00,23:00-02:00>" def test_with_offset(self): - expected = Timestamp('2014-07-01 13:00') + expected = Timestamp("2014-07-01 13:00") assert self.d + BusinessHour() * 3 == expected assert self.d + BusinessHour(n=3) == expected - @pytest.mark.parametrize("offset_name", [ - "offset1", - "offset2", - "offset3", - "offset4", - "offset8", - "offset9", - "offset10" - ]) + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) def test_eq_attribute(self, offset_name): offset = getattr(self, offset_name) assert offset == offset - @pytest.mark.parametrize("offset1,offset2", [ - (BusinessHour(start='09:00'), BusinessHour()), - (BusinessHour(start=['23:00', '13:00'], end=['12:00', '17:00']), - BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), - ]) + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(start="09:00"), BusinessHour()), + ( + BusinessHour(start=["23:00", "13:00"], end=["12:00", "17:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) def test_eq(self, offset1, offset2): assert offset1 == offset2 - @pytest.mark.parametrize("offset1,offset2", [ - (BusinessHour(), BusinessHour(-1)), - (BusinessHour(start='09:00'), BusinessHour(start='09:01')), - (BusinessHour(start='09:00', end='17:00'), - BusinessHour(start='17:00', end='09:01')), - (BusinessHour(start=['13:00', '23:00'], end=['18:00', '07:00']), - BusinessHour(start=['13:00', '23:00'], end=['17:00', '12:00'])), - ]) + @pytest.mark.parametrize( + "offset1,offset2", + [ + (BusinessHour(), BusinessHour(-1)), + (BusinessHour(start="09:00"), BusinessHour(start="09:01")), + ( + BusinessHour(start="09:00", end="17:00"), + BusinessHour(start="17:00", end="09:01"), + ), + ( + BusinessHour(start=["13:00", "23:00"], end=["18:00", "07:00"]), + BusinessHour(start=["13:00", "23:00"], end=["17:00", "12:00"]), + ), + ], + ) def test_neq(self, offset1, offset2): assert offset1 != offset2 - @pytest.mark.parametrize("offset_name", [ - "offset1", - "offset2", - "offset3", - "offset4", - "offset8", - "offset9", - "offset10" - ]) + @pytest.mark.parametrize( + "offset_name", + ["offset1", "offset2", "offset3", "offset4", "offset8", "offset9", "offset10"], + ) def test_hash(self, offset_name): offset = getattr(self, offset_name) assert offset == offset @@ -940,20 +1032,18 @@ def testRollback1(self): assert self._offset(5).rollback(self.d) == self.d def testRollback2(self): - assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == - datetime(2014, 7, 4, 17, 0)) + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) def testRollforward1(self): assert self.offset1.rollforward(self.d) == self.d assert
self.offset2.rollforward(self.d) == self.d assert self.offset3.rollforward(self.d) == self.d assert self.offset4.rollforward(self.d) == self.d - assert (self.offset5.rollforward(self.d) == - datetime(2014, 7, 1, 11, 0)) - assert (self.offset6.rollforward(self.d) == - datetime(2014, 7, 1, 20, 0)) - assert (self.offset7.rollforward(self.d) == - datetime(2014, 7, 1, 21, 30)) + assert self.offset5.rollforward(self.d) == datetime(2014, 7, 1, 11, 0) + assert self.offset6.rollforward(self.d) == datetime(2014, 7, 1, 20, 0) + assert self.offset7.rollforward(self.d) == datetime(2014, 7, 1, 21, 30) assert self.offset8.rollforward(self.d) == self.d assert self.offset9.rollforward(self.d) == self.d assert self.offset10.rollforward(self.d) == datetime(2014, 7, 1, 13) @@ -973,8 +1063,9 @@ def testRollforward1(self): assert self._offset(5).rollforward(self.d) == self.d def testRollforward2(self): - assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == - datetime(2014, 7, 7, 9)) + assert self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) def test_roll_date_object(self): offset = BusinessHour() @@ -988,111 +1079,156 @@ def test_roll_date_object(self): assert result == datetime(2014, 7, 7, 9) normalize_cases = [] - normalize_cases.append((BusinessHour(normalize=True), { - datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - normalize_cases.append((BusinessHour(-1, normalize=True), { - datetime(2014, 7, 1, 8): datetime(2014, 6, 30), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30), - datetime(2014, 7, 1, 0): datetime(2014, 6, 30), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - normalize_cases.append((BusinessHour(1, normalize=True, start='17:00', - end='04:00'), { - datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 2), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - @pytest.mark.parametrize('case', normalize_cases) + normalize_cases.append( + ( + BusinessHour(normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + 
normalize_cases.append( + ( + BusinessHour(-1, normalize=True), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 30), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30), + datetime(2014, 7, 1, 0): datetime(2014, 6, 30), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + BusinessHour(1, normalize=True, start="17:00", end="04:00"), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 7, 2, 3): datetime(2014, 7, 2), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", normalize_cases) def test_normalize(self, case): offset, cases = case for dt, expected in cases.items(): assert offset.apply(dt) == expected on_offset_cases = [] - on_offset_cases.append((BusinessHour(), { - datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False})) - - on_offset_cases.append((BusinessHour(start='10:00', end='15:00'), { - datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) - - on_offset_cases.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False})) - - on_offset_cases.append((BusinessHour(start=['09:00', '13:00'], - end=['12:00', '17:00']), { - datetime(2014, 7, 1, 9): True, - datetime(2014, 7, 1, 8, 59): False, - datetime(2014, 7, 1, 8): False, - datetime(2014, 7, 1, 17): True, - datetime(2014, 7, 1, 17, 1): False, - datetime(2014, 7, 1, 18): False, - datetime(2014, 7, 5, 9): False, - datetime(2014, 7, 6, 12): False, - datetime(2014, 7, 1, 12, 30): False})) - - on_offset_cases.append((BusinessHour(start=['19:00', '23:00'], - end=['21:00', '05:00']), { - datetime(2014, 7, 1, 9, 0): False, - datetime(2014, 7, 1, 10, 0): False, - datetime(2014, 7, 1, 15): False, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12, 0): False, - datetime(2014, 7, 6, 12, 0): False, - datetime(2014, 7, 1, 19, 0): True, - datetime(2014, 7, 2, 0, 0): True, - datetime(2014, 7, 4, 23): True, - datetime(2014, 7, 5, 1): True, - datetime(2014, 7, 5, 5, 0): True, - datetime(2014, 7, 6, 23, 0): False, - datetime(2014, 7, 7, 3, 0): False, - datetime(2014, 7, 4, 22): False})) - - @pytest.mark.parametrize('case', on_offset_cases) + 
on_offset_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="10:00", end="15:00"), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["09:00", "13:00"], end=["12:00", "17:00"]), + { + datetime(2014, 7, 1, 9): True, + datetime(2014, 7, 1, 8, 59): False, + datetime(2014, 7, 1, 8): False, + datetime(2014, 7, 1, 17): True, + datetime(2014, 7, 1, 17, 1): False, + datetime(2014, 7, 1, 18): False, + datetime(2014, 7, 5, 9): False, + datetime(2014, 7, 6, 12): False, + datetime(2014, 7, 1, 12, 30): False, + }, + ) + ) + + on_offset_cases.append( + ( + BusinessHour(start=["19:00", "23:00"], end=["21:00", "05:00"]), + { + datetime(2014, 7, 1, 9, 0): False, + datetime(2014, 7, 1, 10, 0): False, + datetime(2014, 7, 1, 15): False, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12, 0): False, + datetime(2014, 7, 6, 12, 0): False, + datetime(2014, 7, 1, 19, 0): True, + datetime(2014, 7, 2, 0, 0): True, + datetime(2014, 7, 4, 23): True, + datetime(2014, 7, 5, 1): True, + datetime(2014, 7, 5, 5, 0): True, + datetime(2014, 7, 6, 23, 0): False, + datetime(2014, 7, 7, 3, 0): False, + datetime(2014, 7, 4, 22): False, + }, + ) + ) + + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, cases = case for dt, expected in cases.items(): @@ -1101,233 +1237,462 @@ def test_onOffset(self, case): opening_time_cases = [] # opening time should be affected by sign of n, not by n's value and # end - opening_time_cases.append(([BusinessHour(), BusinessHour(n=2), - BusinessHour(n=4), BusinessHour(end='10:00'), - BusinessHour(n=2, end='4:00'), - BusinessHour(n=4, end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 9)), - # if timestamp is on opening time, next opening time is - # as it is - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 3, 9), - datetime(2014, 7, 2, 9)), - # 2014-07-05 is saturday - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 
7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 8, 9), - datetime(2014, 7, 7, 9))})) - - opening_time_cases.append(([BusinessHour(start='11:15'), - BusinessHour(n=2, start='11:15'), - BusinessHour(n=3, start='11:15'), - BusinessHour(start='11:15', end='10:00'), - BusinessHour(n=2, start='11:15', end='4:00'), - BusinessHour(n=3, start='11:15', - end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 11, 15)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 11, 15)), - datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 3, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 11, 15)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 11, 15))})) - - opening_time_cases.append(([BusinessHour(-1), BusinessHour(n=-2), - BusinessHour(n=-4), - BusinessHour(n=-1, end='10:00'), - BusinessHour(n=-2, end='4:00'), - BusinessHour(n=-4, end='15:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 9)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 9), - datetime(2014, 7, 3, 9)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 7, 9)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 9), - datetime(2014, 7, 8, 9))})) - - opening_time_cases.append(([BusinessHour(start='17:00', end='05:00'), - BusinessHour(n=3, start='17:00', - end='03:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 17), - datetime(2014, 6, 30, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 
7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 4, 17): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 3, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 7, 17, 1): (datetime(2014, 7, 8, 17), - datetime(2014, 7, 7, 17)), })) - - opening_time_cases.append(([BusinessHour(-1, start='17:00', end='05:00'), - BusinessHour(n=-2, start='17:00', - end='03:00')], { - datetime(2014, 7, 1, 11): (datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 3, 17), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 17)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 17))})) - - opening_time_cases.append(([BusinessHour(start=['11:15', '15:00'], - end=['13:00', '20:00']), - BusinessHour(n=3, start=['11:15', '15:00'], - end=['12:00', '20:00']), - BusinessHour(start=['11:15', '15:00'], - end=['13:00', '17:00']), - BusinessHour(n=2, start=['11:15', '15:00'], - end=['12:00', '03:00']), - BusinessHour(n=3, start=['11:15', '15:00'], - end=['13:00', '16:00'])], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 11, 15), - datetime(2014, 6, 30, 15)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 10): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 1, 15)), - datetime(2014, 7, 2, 11, 15): (datetime(2014, 7, 2, 11, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 2, 11, 15, 1): (datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 11, 15)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 11, 15), - datetime(2014, 7, 3, 15)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 9, 1): (datetime(2014, 7, 7, 11, 15), - datetime(2014, 7, 4, 15)), - datetime(2014, 7, 7, 12): (datetime(2014, 7, 7, 15), - 
datetime(2014, 7, 7, 11, 15))})) - - opening_time_cases.append(([BusinessHour(n=-1, start=['17:00', '08:00'], - end=['05:00', '10:00']), - BusinessHour(n=-2, start=['08:00', '17:00'], - end=['10:00', '03:00'])], { - datetime(2014, 7, 1, 11): (datetime(2014, 7, 1, 8), - datetime(2014, 7, 1, 17)), - datetime(2014, 7, 1, 18): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 1, 23): (datetime(2014, 7, 1, 17), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 2, 8): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 8)), - datetime(2014, 7, 2, 9): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 2, 16, 59): (datetime(2014, 7, 2, 8), - datetime(2014, 7, 2, 17)), - datetime(2014, 7, 5, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 4, 10): (datetime(2014, 7, 4, 8), - datetime(2014, 7, 4, 17)), - datetime(2014, 7, 4, 23): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 6, 10): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 7, 5): (datetime(2014, 7, 4, 17), - datetime(2014, 7, 7, 8)), - datetime(2014, 7, 7, 18): (datetime(2014, 7, 7, 17), - datetime(2014, 7, 8, 8))})) - - @pytest.mark.parametrize('case', opening_time_cases) + opening_time_cases.append( + ( + [ + BusinessHour(), + BusinessHour(n=2), + BusinessHour(n=4), + BusinessHour(end="10:00"), + BusinessHour(n=2, end="4:00"), + BusinessHour(n=4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 9), + ), + # if timestamp is on opening time, next opening time is + # as it is + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 3, 9), + datetime(2014, 7, 2, 9), + ), + # 2014-07-05 is saturday + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 8, 9), + datetime(2014, 7, 7, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="11:15"), + BusinessHour(n=2, start="11:15"), + BusinessHour(n=3, start="11:15"), + BusinessHour(start="11:15", end="10:00"), + BusinessHour(n=2, start="11:15", end="4:00"), + BusinessHour(n=3, start="11:15", end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 11, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 11, 15), + ), + datetime(2014, 7, 
2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 3, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 11, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1), + BusinessHour(n=-2), + BusinessHour(n=-4), + BusinessHour(n=-1, end="10:00"), + BusinessHour(n=-2, end="4:00"), + BusinessHour(n=-4, end="15:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 9), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 9), + datetime(2014, 7, 3, 9), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 7, 9), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 9), + datetime(2014, 7, 8, 9), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start="17:00", end="05:00"), + BusinessHour(n=3, start="17:00", end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 17), + datetime(2014, 6, 30, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 4, 17): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 3, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 7, 17, 1): ( + datetime(2014, 7, 8, 17), + datetime(2014, 7, 7, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(-1, start="17:00", end="05:00"), + BusinessHour(n=-2, start="17:00", 
end="03:00"), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 3, 17), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 17), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 17), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(start=["11:15", "15:00"], end=["13:00", "20:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["12:00", "20:00"]), + BusinessHour(start=["11:15", "15:00"], end=["13:00", "17:00"]), + BusinessHour(n=2, start=["11:15", "15:00"], end=["12:00", "03:00"]), + BusinessHour(n=3, start=["11:15", "15:00"], end=["13:00", "16:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 11, 15), + datetime(2014, 6, 30, 15), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 9): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 10): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 1, 15), + ), + datetime(2014, 7, 2, 11, 15): ( + datetime(2014, 7, 2, 11, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 2, 11, 15, 1): ( + datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 11, 15), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 11, 15), + datetime(2014, 7, 3, 15), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 9, 1): ( + datetime(2014, 7, 7, 11, 15), + datetime(2014, 7, 4, 15), + ), + datetime(2014, 7, 7, 12): ( + datetime(2014, 7, 7, 15), + datetime(2014, 7, 7, 11, 15), + ), + }, + ) + ) + + opening_time_cases.append( + ( + [ + BusinessHour(n=-1, start=["17:00", "08:00"], end=["05:00", "10:00"]), + BusinessHour(n=-2, start=["08:00", "17:00"], end=["10:00", "03:00"]), + ], + { + datetime(2014, 7, 1, 11): ( + datetime(2014, 7, 1, 8), + datetime(2014, 7, 1, 17), + ), + datetime(2014, 7, 1, 18): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 1, 23): ( + datetime(2014, 7, 1, 17), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 8): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 8), + ), + datetime(2014, 7, 2, 9): ( 
+ datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 2, 16, 59): ( + datetime(2014, 7, 2, 8), + datetime(2014, 7, 2, 17), + ), + datetime(2014, 7, 5, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 4, 10): ( + datetime(2014, 7, 4, 8), + datetime(2014, 7, 4, 17), + ), + datetime(2014, 7, 4, 23): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 6, 10): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 5): ( + datetime(2014, 7, 4, 17), + datetime(2014, 7, 7, 8), + ), + datetime(2014, 7, 7, 18): ( + datetime(2014, 7, 7, 17), + datetime(2014, 7, 8, 8), + ), + }, + ) + ) + + @pytest.mark.parametrize("case", opening_time_cases) def test_opening_time(self, case): _offsets, cases = case for offset in _offsets: @@ -1336,251 +1701,343 @@ def test_opening_time(self, case): assert offset._prev_opening_time(dt) == exp_prev apply_cases = [] - apply_cases.append((BusinessHour(), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append((BusinessHour(4), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) - - apply_cases.append((BusinessHour(-1), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), - datetime(2014, 7, 2, 23): 
datetime(2014, 7, 2, 16), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30)})) - - apply_cases.append((BusinessHour(-4), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30)})) - - apply_cases.append((BusinessHour(start='13:00', end='16:00'), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), - datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14)})) - - apply_cases.append((BusinessHour(n=2, start='13:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), - datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start='13:00', end='16:00'), { - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), - datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15)})) - - apply_cases.append((BusinessHour(n=-3, start='10:00', end='16:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), - datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), - datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), - datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), - datetime(2014, 7, 4, 12, 30): 
datetime(2014, 7, 3, 15, 30), - datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30)})) - - apply_cases.append((BusinessHour(start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), - datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), - datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), - datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), - datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), - datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start='19:00', end='05:00'), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30)})) + apply_cases.append( + ( + BusinessHour(), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 2, 9, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 12), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(4), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 
7, 12, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(-1), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 10), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 10): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 1, 9, 30, 15): datetime(2014, 6, 30, 16, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 5): datetime(2014, 6, 30, 16), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 16), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 16), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 16), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9): datetime(2014, 7, 4, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 16, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 16, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(-4), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 15), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 13), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 13, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 13), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 14), + datetime(2014, 7, 1, 15, 30, 15): datetime(2014, 7, 2, 13, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=2, start="13:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 14, 30): datetime(2014, 7, 3, 13, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 14, 30): datetime(2014, 7, 7, 13, 30), + datetime(2014, 7, 4, 14, 30, 30): datetime(2014, 7, 7, 13, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="13:00", end="16:00"), + { + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 14): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 15): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 15), + 
datetime(2014, 7, 2, 16): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 13, 30, 15): datetime(2014, 7, 1, 15, 30, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 15), + datetime(2014, 7, 7, 11): datetime(2014, 7, 4, 15), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-3, start="10:00", end="16:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 13), + datetime(2014, 7, 2, 13): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 13), + datetime(2014, 7, 2, 11, 30): datetime(2014, 7, 1, 14, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 13), + datetime(2014, 7, 4, 10): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 16): datetime(2014, 7, 4, 13), + datetime(2014, 7, 4, 12, 30): datetime(2014, 7, 3, 15, 30), + datetime(2014, 7, 4, 12, 30, 30): datetime(2014, 7, 3, 15, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 20), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 20), + datetime(2014, 7, 2, 4, 30): datetime(2014, 7, 2, 19, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 1), + datetime(2014, 7, 4, 10): datetime(2014, 7, 4, 20), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 5, 1), + datetime(2014, 7, 5, 4): datetime(2014, 7, 7, 19), + datetime(2014, 7, 5, 4, 30): datetime(2014, 7, 7, 19, 30), + datetime(2014, 7, 5, 4, 30, 30): datetime(2014, 7, 7, 19, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start="19:00", end="05:00"), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 3), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 5, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 5, 4, 30, 30), + }, + ) + ) # long business hours (see gh-26381) - apply_cases.append((BusinessHour(n=4, start='00:00', end='23:00'), { - datetime(2014, 7, 3, 22): datetime(2014, 7, 4, 3), - datetime(2014, 7, 4, 22): datetime(2014, 7, 7, 3), - datetime(2014, 7, 3, 22, 30): datetime(2014, 7, 4, 3, 30), - datetime(2014, 7, 3, 22, 20): datetime(2014, 7, 4, 3, 20), - datetime(2014, 7, 4, 22, 30, 30): datetime(2014, 7, 7, 3, 30, 30), - datetime(2014, 7, 4, 22, 30, 20): datetime(2014, 7, 7, 3, 30, 20)})) - - apply_cases.append((BusinessHour(n=-4, start='00:00', end='23:00'), { - datetime(2014, 7, 4, 3): datetime(2014, 7, 3, 22), - datetime(2014, 7, 7, 3): datetime(2014, 7, 4, 22), - datetime(2014, 7, 4, 3, 30): datetime(2014, 7, 3, 22, 30), - datetime(2014, 7, 4, 3, 20): datetime(2014, 7, 3, 22, 20), - datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), - 
datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20)})) + apply_cases.append( + ( + BusinessHour(n=4, start="00:00", end="23:00"), + { + datetime(2014, 7, 3, 22): datetime(2014, 7, 4, 3), + datetime(2014, 7, 4, 22): datetime(2014, 7, 7, 3), + datetime(2014, 7, 3, 22, 30): datetime(2014, 7, 4, 3, 30), + datetime(2014, 7, 3, 22, 20): datetime(2014, 7, 4, 3, 20), + datetime(2014, 7, 4, 22, 30, 30): datetime(2014, 7, 7, 3, 30, 30), + datetime(2014, 7, 4, 22, 30, 20): datetime(2014, 7, 7, 3, 30, 20), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start="00:00", end="23:00"), + { + datetime(2014, 7, 4, 3): datetime(2014, 7, 3, 22), + datetime(2014, 7, 7, 3): datetime(2014, 7, 4, 22), + datetime(2014, 7, 4, 3, 30): datetime(2014, 7, 3, 22, 30), + datetime(2014, 7, 4, 3, 20): datetime(2014, 7, 3, 22, 20), + datetime(2014, 7, 7, 3, 30, 30): datetime(2014, 7, 4, 22, 30, 30), + datetime(2014, 7, 7, 3, 30, 20): datetime(2014, 7, 4, 22, 30, 20), + }, + ) + ) # multiple business hours - apply_cases.append((BusinessHour(start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), - # out of business hours - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), - datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append((BusinessHour(n=4, start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), - datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), - datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 14), - datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30)})) - - apply_cases.append((BusinessHour(n=-4, start=['09:00', '14:00'], - end=['12:00', '18:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), - datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), - datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), - datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), - datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), - datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 
12), - datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), - datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30)})) - - apply_cases.append((BusinessHour(n=-1, start=['19:00', '03:00'], - end=['01:00', '05:00']), { - datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), - datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), - datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), - datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), - datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), - datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), - datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), - datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), - datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), - datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), - datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), - datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), - datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + BusinessHour(start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 1, 17, 30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 14), + # out of business hours + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 15), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 9), + datetime(2014, 7, 4, 17, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 17, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 17), + datetime(2014, 7, 1, 13): datetime(2014, 7, 2, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 2, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 2, 11), + datetime(2014, 7, 1, 17): datetime(2014, 7, 2, 14), + datetime(2014, 7, 2, 11): datetime(2014, 7, 2, 17), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 15), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 15), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 15), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 15), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 15), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 14), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 11, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 11, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-4, start=["09:00", "14:00"], end=["12:00", "18:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 30, 16), + datetime(2014, 7, 1, 13): datetime(2014, 6, 30, 17), + datetime(2014, 7, 1, 15): datetime(2014, 6, 30, 18), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1, 10), + 
datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 11), + datetime(2014, 7, 2, 11): datetime(2014, 7, 1, 16), + datetime(2014, 7, 2, 8): datetime(2014, 7, 1, 12), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 12), + datetime(2014, 7, 2, 23): datetime(2014, 7, 2, 12), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 12), + datetime(2014, 7, 5, 15): datetime(2014, 7, 4, 12), + datetime(2014, 7, 4, 18): datetime(2014, 7, 4, 12), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 4, 14, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 4, 14, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + BusinessHour(n=-1, start=["19:00", "03:00"], end=["01:00", "05:00"]), + { + datetime(2014, 7, 1, 17): datetime(2014, 7, 1, 4), + datetime(2014, 7, 2, 14): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 8): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 13): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 20): datetime(2014, 7, 2, 5), + datetime(2014, 7, 2, 19): datetime(2014, 7, 2, 4), + datetime(2014, 7, 2, 4): datetime(2014, 7, 2, 1), + datetime(2014, 7, 2, 19, 30): datetime(2014, 7, 2, 4, 30), + datetime(2014, 7, 3, 0): datetime(2014, 7, 2, 23), + datetime(2014, 7, 3, 6): datetime(2014, 7, 3, 4), + datetime(2014, 7, 4, 23): datetime(2014, 7, 4, 22), + datetime(2014, 7, 5, 0): datetime(2014, 7, 4, 23), + datetime(2014, 7, 5, 4): datetime(2014, 7, 5, 0), + datetime(2014, 7, 7, 3, 30): datetime(2014, 7, 5, 0, 30), + datetime(2014, 7, 7, 19, 30): datetime(2014, 7, 7, 4, 30), + datetime(2014, 7, 7, 19, 30, 30): datetime(2014, 7, 7, 4, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -1588,91 +2045,119 @@ def test_apply(self, case): apply_large_n_cases = [] # A week later - apply_large_n_cases.append((BusinessHour(40), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), - datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), - datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), - datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), - datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), - datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), - datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), - datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), - datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), - datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), - datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(40), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 8, 11), + datetime(2014, 7, 1, 13): datetime(2014, 7, 8, 13), + datetime(2014, 7, 1, 15): datetime(2014, 7, 8, 15), + datetime(2014, 7, 1, 16): datetime(2014, 7, 8, 16), + datetime(2014, 7, 1, 17): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 11): datetime(2014, 7, 9, 11), + datetime(2014, 7, 2, 8): datetime(2014, 7, 9, 9), + datetime(2014, 7, 2, 19): datetime(2014, 7, 10, 9), + datetime(2014, 7, 2, 23): datetime(2014, 7, 10, 9), + datetime(2014, 7, 3, 0): datetime(2014, 7, 10, 9), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 9), + datetime(2014, 7, 4, 18): datetime(2014, 7, 14, 9), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 14, 9, 30), + datetime(2014, 7, 7, 9, 30, 30): datetime(2014, 7, 14, 9, 30, 30), + }, + ) + ) # 3 days and 1 hour before - apply_large_n_cases.append((BusinessHour(-25), { - 
datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), - datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(-25), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 12), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 16), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 17), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 16), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 16), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 16), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 16), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 16), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 16, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) # 5 days and 3 hours later - apply_large_n_cases.append((BusinessHour(28, start='21:00', end='02:00'), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), - datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), - datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(28, start="21:00", end="02:00"), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 10, 0), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 12, 0), + datetime(2014, 7, 4, 3): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 15, 0), + datetime(2014, 7, 6, 18): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 1): datetime(2014, 7, 15, 0), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) # large n for multiple opening hours (3 days and 1 hour before) - apply_large_n_cases.append((BusinessHour(n=-25, start=['09:00', '14:00'], - end=['12:00', '19:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), - datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), - datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), - datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), - datetime(2014, 
7, 3, 11): datetime(2014, 6, 30, 10), - datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), - datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), - datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), - datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), - datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), - datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), - datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), - datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30)})) + apply_large_n_cases.append( + ( + BusinessHour(n=-25, start=["09:00", "14:00"], end=["12:00", "19:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 6, 26, 10), + datetime(2014, 7, 1, 13): datetime(2014, 6, 26, 11), + datetime(2014, 7, 1, 9): datetime(2014, 6, 25, 18), + datetime(2014, 7, 1, 10): datetime(2014, 6, 25, 19), + datetime(2014, 7, 3, 11): datetime(2014, 6, 30, 10), + datetime(2014, 7, 3, 8): datetime(2014, 6, 27, 18), + datetime(2014, 7, 3, 19): datetime(2014, 6, 30, 18), + datetime(2014, 7, 3, 23): datetime(2014, 6, 30, 18), + datetime(2014, 7, 4, 9): datetime(2014, 6, 30, 18), + datetime(2014, 7, 5, 15): datetime(2014, 7, 1, 18), + datetime(2014, 7, 6, 18): datetime(2014, 7, 1, 18), + datetime(2014, 7, 7, 9, 30): datetime(2014, 7, 1, 18, 30), + datetime(2014, 7, 7, 10, 30, 30): datetime(2014, 7, 2, 9, 30, 30), + }, + ) + ) # 5 days and 3 hours later - apply_large_n_cases.append((BusinessHour(28, start=['21:00', '03:00'], - end=['01:00', '04:00']), { - datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), - datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), - datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), - datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), - datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), - datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), - datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), - datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), - datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), - datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), - datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), - datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30)})) - - @pytest.mark.parametrize('case', apply_large_n_cases) + apply_large_n_cases.append( + ( + BusinessHour(28, start=["21:00", "03:00"], end=["01:00", "04:00"]), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 9, 0), + datetime(2014, 7, 1, 22): datetime(2014, 7, 9, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 9, 21), + datetime(2014, 7, 2, 2): datetime(2014, 7, 9, 23), + datetime(2014, 7, 3, 21): datetime(2014, 7, 11, 0), + datetime(2014, 7, 4, 1): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 2): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 3): datetime(2014, 7, 11, 23), + datetime(2014, 7, 4, 21): datetime(2014, 7, 12, 0), + datetime(2014, 7, 5, 0): datetime(2014, 7, 14, 22), + datetime(2014, 7, 5, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 5, 15): datetime(2014, 7, 14, 23), + datetime(2014, 7, 6, 18): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 1): datetime(2014, 7, 14, 23), + datetime(2014, 7, 7, 23, 30): datetime(2014, 7, 15, 21, 30), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_large_n_cases) def test_apply_large_n(self, case): offset, cases = case for base, expected in cases.items(): @@ -1681,57 +2166,89 @@ def test_apply_large_n(self, case): def test_apply_nanoseconds(self): tests = [] - 
tests.append((BusinessHour(), - {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp( - '2014-07-04 16:00') + Nano(5), - Timestamp('2014-07-04 16:00') + Nano(5): Timestamp( - '2014-07-07 09:00') + Nano(5), - Timestamp('2014-07-04 16:00') - Nano(5): Timestamp( - '2014-07-04 17:00') - Nano(5)})) - - tests.append((BusinessHour(-1), - {Timestamp('2014-07-04 15:00') + Nano(5): Timestamp( - '2014-07-04 14:00') + Nano(5), - Timestamp('2014-07-04 10:00') + Nano(5): Timestamp( - '2014-07-04 09:00') + Nano(5), - Timestamp('2014-07-04 10:00') - Nano(5): Timestamp( - '2014-07-03 17:00') - Nano(5), })) + tests.append( + ( + BusinessHour(), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 16:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + + Nano(5): Timestamp("2014-07-07 09:00") + + Nano(5), + Timestamp("2014-07-04 16:00") + - Nano(5): Timestamp("2014-07-04 17:00") + - Nano(5), + }, + ) + ) + + tests.append( + ( + BusinessHour(-1), + { + Timestamp("2014-07-04 15:00") + + Nano(5): Timestamp("2014-07-04 14:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + + Nano(5): Timestamp("2014-07-04 09:00") + + Nano(5), + Timestamp("2014-07-04 10:00") + - Nano(5): Timestamp("2014-07-03 17:00") + - Nano(5), + }, + ) + ) for offset, cases in tests: for base, expected in cases.items(): assert_offset_equal(offset, base, expected) def test_datetimeindex(self): - idx1 = date_range(start='2014-07-04 15:00', end='2014-07-08 10:00', - freq='BH') - idx2 = date_range(start='2014-07-04 15:00', periods=12, freq='BH') - idx3 = date_range(end='2014-07-08 10:00', periods=12, freq='BH') - expected = DatetimeIndex(['2014-07-04 15:00', '2014-07-04 16:00', - '2014-07-07 09:00', - '2014-07-07 10:00', '2014-07-07 11:00', - '2014-07-07 12:00', - '2014-07-07 13:00', '2014-07-07 14:00', - '2014-07-07 15:00', - '2014-07-07 16:00', '2014-07-08 09:00', - '2014-07-08 10:00'], - freq='BH') + idx1 = date_range(start="2014-07-04 15:00", end="2014-07-08 10:00", freq="BH") + idx2 = date_range(start="2014-07-04 15:00", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:00", periods=12, freq="BH") + expected = DatetimeIndex( + [ + "2014-07-04 15:00", + "2014-07-04 16:00", + "2014-07-07 09:00", + "2014-07-07 10:00", + "2014-07-07 11:00", + "2014-07-07 12:00", + "2014-07-07 13:00", + "2014-07-07 14:00", + "2014-07-07 15:00", + "2014-07-07 16:00", + "2014-07-08 09:00", + "2014-07-08 10:00", + ], + freq="BH", + ) for idx in [idx1, idx2, idx3]: tm.assert_index_equal(idx, expected) - idx1 = date_range(start='2014-07-04 15:45', end='2014-07-08 10:45', - freq='BH') - idx2 = date_range(start='2014-07-04 15:45', periods=12, freq='BH') - idx3 = date_range(end='2014-07-08 10:45', periods=12, freq='BH') - - expected = DatetimeIndex(['2014-07-04 15:45', '2014-07-04 16:45', - '2014-07-07 09:45', - '2014-07-07 10:45', '2014-07-07 11:45', - '2014-07-07 12:45', - '2014-07-07 13:45', '2014-07-07 14:45', - '2014-07-07 15:45', - '2014-07-07 16:45', '2014-07-08 09:45', - '2014-07-08 10:45'], - freq='BH') + idx1 = date_range(start="2014-07-04 15:45", end="2014-07-08 10:45", freq="BH") + idx2 = date_range(start="2014-07-04 15:45", periods=12, freq="BH") + idx3 = date_range(end="2014-07-08 10:45", periods=12, freq="BH") + + expected = DatetimeIndex( + [ + "2014-07-04 15:45", + "2014-07-04 16:45", + "2014-07-07 09:45", + "2014-07-07 10:45", + "2014-07-07 11:45", + "2014-07-07 12:45", + "2014-07-07 13:45", + "2014-07-07 14:45", + "2014-07-07 15:45", + "2014-07-07 16:45", + "2014-07-08 09:45", + "2014-07-08 10:45", + ], + freq="BH", + ) 
expected = idx1 for idx in [idx1, idx2, idx3]: tm.assert_index_equal(idx, expected) @@ -1739,8 +2256,7 @@ def test_datetimeindex(self): class TestCustomBusinessHour(Base): _offset = CustomBusinessHour - holidays = ['2014-06-27', datetime(2014, 6, 30), - np.datetime64('2014-07-02')] + holidays = ["2014-06-27", datetime(2014, 6, 30), np.datetime64("2014-07-02")] def setup_method(self, method): # 2014 Calendar to check custom holidays @@ -1749,18 +2265,19 @@ def setup_method(self, method): # 29 30 7/1 2 3 4 5 # 6 7 8 9 10 11 12 self.d = datetime(2014, 7, 1, 10, 00) - self.offset1 = CustomBusinessHour(weekmask='Tue Wed Thu Fri') + self.offset1 = CustomBusinessHour(weekmask="Tue Wed Thu Fri") self.offset2 = CustomBusinessHour(holidays=self.holidays) def test_constructor_errors(self): from datetime import time as dt_time + with pytest.raises(ValueError): CustomBusinessHour(start=dt_time(11, 0, 5)) with pytest.raises(ValueError): - CustomBusinessHour(start='AAA') + CustomBusinessHour(start="AAA") with pytest.raises(ValueError): - CustomBusinessHour(start='14:00:05') + CustomBusinessHour(start="14:00:05") def test_different_normalize_equals(self): # GH#21404 changed __eq__ to return False when `normalize` doesnt match @@ -1769,11 +2286,11 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset1) == '' - assert repr(self.offset2) == '' + assert repr(self.offset1) == "" + assert repr(self.offset2) == "" def test_with_offset(self): - expected = Timestamp('2014-07-01 13:00') + expected = Timestamp("2014-07-01 13:00") assert self.d + CustomBusinessHour() * 3 == expected assert self.d + CustomBusinessHour(n=3) == expected @@ -1783,17 +2300,18 @@ def test_eq(self): assert offset == offset assert CustomBusinessHour() != CustomBusinessHour(-1) - assert (CustomBusinessHour(start='09:00') == - CustomBusinessHour()) - assert (CustomBusinessHour(start='09:00') != - CustomBusinessHour(start='09:01')) - assert (CustomBusinessHour(start='09:00', end='17:00') != - CustomBusinessHour(start='17:00', end='09:01')) - - assert (CustomBusinessHour(weekmask='Tue Wed Thu Fri') != - CustomBusinessHour(weekmask='Mon Tue Wed Thu Fri')) - assert (CustomBusinessHour(holidays=['2014-06-27']) != - CustomBusinessHour(holidays=['2014-06-28'])) + assert CustomBusinessHour(start="09:00") == CustomBusinessHour() + assert CustomBusinessHour(start="09:00") != CustomBusinessHour(start="09:01") + assert CustomBusinessHour(start="09:00", end="17:00") != CustomBusinessHour( + start="17:00", end="09:01" + ) + + assert CustomBusinessHour(weekmask="Tue Wed Thu Fri") != CustomBusinessHour( + weekmask="Mon Tue Wed Thu Fri" + ) + assert CustomBusinessHour(holidays=["2014-06-27"]) != CustomBusinessHour( + holidays=["2014-06-28"] + ) def test_sub(self): # override the Base.test_sub implementation because self.offset2 is @@ -1821,8 +2339,9 @@ def testRollback1(self): assert self.offset2.rollback(d) == datetime(2014, 6, 26, 17) def testRollback2(self): - assert (self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == - datetime(2014, 7, 4, 17, 0)) + assert self._offset(-3).rollback(datetime(2014, 7, 5, 15, 0)) == datetime( + 2014, 7, 4, 17, 0 + ) def testRollforward1(self): assert self.offset1.rollforward(self.d) == self.d @@ -1833,8 +2352,9 @@ def testRollforward1(self): assert self.offset2.rollforward(d) == datetime(2014, 7, 1, 9) def testRollforward2(self): - assert (self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == - datetime(2014, 7, 7, 9)) + assert 
self._offset(-3).rollforward(datetime(2014, 7, 5, 16, 0)) == datetime( + 2014, 7, 7, 9 + ) def test_roll_date_object(self): offset = BusinessHour() @@ -1848,46 +2368,61 @@ def test_roll_date_object(self): assert result == datetime(2014, 7, 7, 9) normalize_cases = [] - normalize_cases.append(( - CustomBusinessHour(normalize=True, holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3), - datetime(2014, 7, 1, 23): datetime(2014, 7, 3), - datetime(2014, 7, 1, 0): datetime(2014, 7, 1), - datetime(2014, 7, 4, 15): datetime(2014, 7, 4), - datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 7), - datetime(2014, 7, 6, 10): datetime(2014, 7, 7)})) - - normalize_cases.append(( - CustomBusinessHour(-1, normalize=True, holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 6, 26), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 16): datetime(2014, 7, 1), - datetime(2014, 7, 1, 10): datetime(2014, 6, 26), - datetime(2014, 7, 1, 0): datetime(2014, 6, 26), - datetime(2014, 7, 7, 10): datetime(2014, 7, 4), - datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), - datetime(2014, 7, 5, 23): datetime(2014, 7, 4), - datetime(2014, 7, 6, 10): datetime(2014, 7, 4)})) - - normalize_cases.append(( - CustomBusinessHour(1, normalize=True, - start='17:00', end='04:00', - holidays=holidays), - {datetime(2014, 7, 1, 8): datetime(2014, 7, 1), - datetime(2014, 7, 1, 17): datetime(2014, 7, 1), - datetime(2014, 7, 1, 23): datetime(2014, 7, 2), - datetime(2014, 7, 2, 2): datetime(2014, 7, 2), - datetime(2014, 7, 2, 3): datetime(2014, 7, 3), - datetime(2014, 7, 4, 23): datetime(2014, 7, 5), - datetime(2014, 7, 5, 2): datetime(2014, 7, 5), - datetime(2014, 7, 7, 2): datetime(2014, 7, 7), - datetime(2014, 7, 7, 17): datetime(2014, 7, 7)})) - - @pytest.mark.parametrize('norm_cases', normalize_cases) + normalize_cases.append( + ( + CustomBusinessHour(normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3), + datetime(2014, 7, 1, 23): datetime(2014, 7, 3), + datetime(2014, 7, 1, 0): datetime(2014, 7, 1), + datetime(2014, 7, 4, 15): datetime(2014, 7, 4), + datetime(2014, 7, 4, 15, 59): datetime(2014, 7, 4), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 7), + datetime(2014, 7, 6, 10): datetime(2014, 7, 7), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour(-1, normalize=True, holidays=holidays), + { + datetime(2014, 7, 1, 8): datetime(2014, 6, 26), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 16): datetime(2014, 7, 1), + datetime(2014, 7, 1, 10): datetime(2014, 6, 26), + datetime(2014, 7, 1, 0): datetime(2014, 6, 26), + datetime(2014, 7, 7, 10): datetime(2014, 7, 4), + datetime(2014, 7, 7, 10, 1): datetime(2014, 7, 7), + datetime(2014, 7, 5, 23): datetime(2014, 7, 4), + datetime(2014, 7, 6, 10): datetime(2014, 7, 4), + }, + ) + ) + + normalize_cases.append( + ( + CustomBusinessHour( + 1, normalize=True, start="17:00", end="04:00", holidays=holidays + ), + { + datetime(2014, 7, 1, 8): datetime(2014, 7, 1), + datetime(2014, 7, 1, 17): datetime(2014, 7, 1), + datetime(2014, 7, 1, 23): datetime(2014, 7, 2), + datetime(2014, 7, 2, 2): datetime(2014, 7, 2), + datetime(2014, 
7, 2, 3): datetime(2014, 7, 3), + datetime(2014, 7, 4, 23): datetime(2014, 7, 5), + datetime(2014, 7, 5, 2): datetime(2014, 7, 5), + datetime(2014, 7, 7, 2): datetime(2014, 7, 7), + datetime(2014, 7, 7, 17): datetime(2014, 7, 7), + }, + ) + ) + + @pytest.mark.parametrize("norm_cases", normalize_cases) def test_normalize(self, norm_cases): offset, cases = norm_cases for dt, expected in cases.items(): @@ -1896,59 +2431,74 @@ def test_normalize(self, norm_cases): def test_onOffset(self): tests = [] - tests.append((CustomBusinessHour(start='10:00', end='15:00', - holidays=self.holidays), - {datetime(2014, 7, 1, 9): False, - datetime(2014, 7, 1, 10): True, - datetime(2014, 7, 1, 15): True, - datetime(2014, 7, 1, 15, 1): False, - datetime(2014, 7, 5, 12): False, - datetime(2014, 7, 6, 12): False})) + tests.append( + ( + CustomBusinessHour(start="10:00", end="15:00", holidays=self.holidays), + { + datetime(2014, 7, 1, 9): False, + datetime(2014, 7, 1, 10): True, + datetime(2014, 7, 1, 15): True, + datetime(2014, 7, 1, 15, 1): False, + datetime(2014, 7, 5, 12): False, + datetime(2014, 7, 6, 12): False, + }, + ) + ) for offset, cases in tests: for dt, expected in cases.items(): assert offset.onOffset(dt) == expected apply_cases = [] - apply_cases.append(( - CustomBusinessHour(holidays=holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), - datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), - datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), - datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 30, 15), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), - # out of business hours - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), - # saturday - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30)})) - - apply_cases.append(( - CustomBusinessHour(4, holidays=holidays), - {datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), - datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), - datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), - datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), - datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), - datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), - datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), - datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), - datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), - datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30)})) - - @pytest.mark.parametrize('apply_case', apply_cases) + apply_cases.append( + ( + CustomBusinessHour(holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 12), + datetime(2014, 7, 1, 13): datetime(2014, 7, 1, 14), + datetime(2014, 7, 1, 15): datetime(2014, 7, 1, 16), + datetime(2014, 7, 1, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 16, 30, 15): datetime(2014, 7, 3, 9, 
30, 15), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 10), + # out of business hours + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 10), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 10), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 10), + # saturday + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 10), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 9, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 9, 30, 30), + }, + ) + ) + + apply_cases.append( + ( + CustomBusinessHour(4, holidays=holidays), + { + datetime(2014, 7, 1, 11): datetime(2014, 7, 1, 15), + datetime(2014, 7, 1, 13): datetime(2014, 7, 3, 9), + datetime(2014, 7, 1, 15): datetime(2014, 7, 3, 11), + datetime(2014, 7, 1, 16): datetime(2014, 7, 3, 12), + datetime(2014, 7, 1, 17): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 11): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 8): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 19): datetime(2014, 7, 3, 13), + datetime(2014, 7, 2, 23): datetime(2014, 7, 3, 13), + datetime(2014, 7, 3, 0): datetime(2014, 7, 3, 13), + datetime(2014, 7, 5, 15): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 17): datetime(2014, 7, 7, 13), + datetime(2014, 7, 4, 16, 30): datetime(2014, 7, 7, 12, 30), + datetime(2014, 7, 4, 16, 30, 30): datetime(2014, 7, 7, 12, 30, 30), + }, + ) + ) + + @pytest.mark.parametrize("apply_case", apply_cases) def test_apply(self, apply_case): offset, cases = apply_case for base, expected in cases.items(): @@ -1956,24 +2506,40 @@ def test_apply(self, apply_case): nano_cases = [] nano_cases.append( - (CustomBusinessHour(holidays=holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): - Timestamp('2014-07-01 16:00') + Nano(5), - Timestamp('2014-07-01 16:00') + Nano(5): - Timestamp('2014-07-03 09:00') + Nano(5), - Timestamp('2014-07-01 16:00') - Nano(5): - Timestamp('2014-07-01 17:00') - Nano(5)})) + ( + CustomBusinessHour(holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 16:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + + Nano(5): Timestamp("2014-07-03 09:00") + + Nano(5), + Timestamp("2014-07-01 16:00") + - Nano(5): Timestamp("2014-07-01 17:00") + - Nano(5), + }, + ) + ) nano_cases.append( - (CustomBusinessHour(-1, holidays=holidays), - {Timestamp('2014-07-01 15:00') + Nano(5): - Timestamp('2014-07-01 14:00') + Nano(5), - Timestamp('2014-07-01 10:00') + Nano(5): - Timestamp('2014-07-01 09:00') + Nano(5), - Timestamp('2014-07-01 10:00') - Nano(5): - Timestamp('2014-06-26 17:00') - Nano(5)})) - - @pytest.mark.parametrize('nano_case', nano_cases) + ( + CustomBusinessHour(-1, holidays=holidays), + { + Timestamp("2014-07-01 15:00") + + Nano(5): Timestamp("2014-07-01 14:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + + Nano(5): Timestamp("2014-07-01 09:00") + + Nano(5), + Timestamp("2014-07-01 10:00") + - Nano(5): Timestamp("2014-06-26 17:00") + - Nano(5), + }, + ) + ) + + @pytest.mark.parametrize("nano_case", nano_cases) def test_apply_nanoseconds(self, nano_case): offset, cases = nano_case for base, expected in cases.items(): @@ -1985,7 +2551,7 @@ class TestCustomBusinessDay(Base): def setup_method(self, method): self.d = datetime(2008, 1, 1) - self.nd = np_datetime64_compat('2008-01-01 00:00:00Z') + self.nd = np_datetime64_compat("2008-01-01 00:00:00Z") self.offset = CDay() self.offset1 = self.offset @@ -1998,13 +2564,13 @@ def 
test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '<CustomBusinessDay>' - assert repr(self.offset2) == '<2 * CustomBusinessDays>' + assert repr(self.offset) == "<CustomBusinessDay>" + assert repr(self.offset2) == "<2 * CustomBusinessDays>" if compat.PY37: - expected = '<CustomBusinessDay: offset=datetime.timedelta(days=1)>' + expected = "<CustomBusinessDay: offset=datetime.timedelta(days=1)>" else: - expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>' + expected = "<CustomBusinessDay: offset=datetime.timedelta(1)>" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -2029,15 +2595,13 @@ def testRollback1(self): assert CDay(10).rollback(self.d) == self.d def testRollback2(self): - assert (CDay(10).rollback(datetime(2008, 1, 5)) == - datetime(2008, 1, 4)) + assert CDay(10).rollback(datetime(2008, 1, 5)) == datetime(2008, 1, 4) def testRollforward1(self): assert CDay(10).rollforward(self.d) == self.d def testRollforward2(self): - assert (CDay(10).rollforward(datetime(2008, 1, 5)) == - datetime(2008, 1, 7)) + assert CDay(10).rollforward(datetime(2008, 1, 5)) == datetime(2008, 1, 7) def test_roll_date_object(self): offset = CDay() @@ -2057,54 +2621,86 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CDay(), datetime(2008, 1, 1), True), - (CDay(), datetime(2008, 1, 5), False)] + on_offset_cases = [ + (CDay(), datetime(2008, 1, 1), True), + (CDay(), datetime(2008, 1, 5), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - apply_cases.append((2 * CDay(), { - datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - apply_cases.append((-CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - apply_cases.append((-2 * CDay(), { - datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) - - apply_cases.append((CDay(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 4): datetime(2008, 1, 4), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8), + }, + ) + ) + + apply_cases.append( + ( + 2 * CDay(), + { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1,
8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9), + }, + ) + ) + + apply_cases.append( + ( + -CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + -2 * CDay(), + { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7), + }, + ) + ) + + apply_cases.append( + ( + CDay(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2130,15 +2726,16 @@ def test_apply_large_n(self): assert rs == xp def test_apply_corner(self): - msg = ("Only know how to combine trading day with datetime, datetime64" - " or timedelta") + msg = ( + "Only know how to combine trading day with datetime, datetime64" + " or timedelta" + ) with pytest.raises(ApplyTypeError, match=msg): CDay().apply(BMonthEnd()) def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-05-01', datetime(2013, 5, 1), - np.datetime64('2014-05-01')] + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] tday = CDay(holidays=holidays) for year in range(2012, 2015): dt = datetime(year, 4, 30) @@ -2147,8 +2744,8 @@ def test_holidays(self): assert rs == xp def test_weekmask(self): - weekmask_saudi = 'Sat Sun Mon Tue Wed' # Thu-Fri Weekend - weekmask_uae = '1111001' # Fri-Sat Weekend + weekmask_saudi = "Sat Sun Mon Tue Wed" # Thu-Fri Weekend + weekmask_uae = "1111001" # Fri-Sat Weekend weekmask_egypt = [1, 1, 1, 1, 0, 0, 1] # Fri-Sat Weekend bday_saudi = CDay(weekmask=weekmask_saudi) bday_uae = CDay(weekmask=weekmask_uae) @@ -2166,9 +2763,8 @@ def test_weekmask(self): assert xp2 == dt + 2 * bday_egypt def test_weekmask_and_holidays(self): - weekmask_egypt = 'Sun Mon Tue Wed Thu' # Fri-Sat Weekend - holidays = ['2012-05-01', datetime(2013, 5, 1), - np.datetime64('2014-05-01')] + weekmask_egypt = "Sun Mon Tue Wed Thu" # Fri-Sat Weekend + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] bday_egypt = CDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime(2013, 4, 30) xp_egypt = datetime(2013, 5, 5) @@ -2191,14 +2787,13 @@ def _check_roundtrip(obj): def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + pth = datapath("tseries", "offsets", "data", "cday-0.14.1.pickle") cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 class CustomBusinessMonthBase: - def setup_method(self, method): self.d = datetime(2008, 1, 1) @@ -2226,7 +2821,7 @@ def _check_roundtrip(obj): def test_copy(self): # GH 17452 - off = self._offset(weekmask='Mon Wed Fri') + off = self._offset(weekmask="Mon Wed Fri") assert off == off.copy() @@ -2240,15 +2835,14 @@ 
def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '<CustomBusinessMonthEnd>' - assert repr(self.offset2) == '<2 * CustomBusinessMonthEnds>' + assert repr(self.offset) == "<CustomBusinessMonthEnd>" + assert repr(self.offset2) == "<2 * CustomBusinessMonthEnds>" def testCall(self): assert self.offset2(self.d) == datetime(2008, 2, 29) def testRollback1(self): - assert (CDay(10).rollback(datetime(2007, 12, 31)) == - datetime(2007, 12, 31)) + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) def testRollback2(self): assert CBMonthEnd(10).rollback(self.d) == datetime(2007, 12, 31) @@ -2274,36 +2868,68 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CBMonthEnd(), datetime(2008, 1, 31), True), - (CBMonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (CBMonthEnd(), datetime(2008, 1, 31), True), + (CBMonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, d, expected = case assert_onOffset(offset, d, expected) apply_cases = [] - apply_cases.append((CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) - - apply_cases.append((2 * CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 2, 7): datetime(2008, 3, 31)})) - - apply_cases.append((-CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 2, 8): datetime(2008, 1, 31)})) - - apply_cases.append((-2 * CBMonthEnd(), { - datetime(2008, 1, 1): datetime(2007, 11, 30), - datetime(2008, 2, 9): datetime(2007, 12, 31)})) - - apply_cases.append((CBMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 2, 7): datetime(2008, 2, 29)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 2, 7): datetime(2008, 3, 31), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 2, 8): datetime(2008, 1, 31), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthEnd(), + { + datetime(2008, 1, 1): datetime(2007, 11, 30), + datetime(2008, 2, 9): datetime(2007, 12, 31), + }, + ) + ) + + apply_cases.append( + ( + CBMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 2, 7): datetime(2008, 2, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2330,8 +2956,7 @@ def test_apply_large_n(self): def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-01-31', datetime(2012, 2, 28), - np.datetime64('2012-02-29')] + holidays = ["2012-01-31", datetime(2012, 2, 28), np.datetime64("2012-02-29")] bm_offset = CBMonthEnd(holidays=holidays) dt = datetime(2012, 1, 1) assert dt + bm_offset == datetime(2012, 1, 30) @@ -2340,11 +2965,13 @@ def test_holidays(self): @pytest.mark.filterwarnings("ignore:Non:pandas.errors.PerformanceWarning") def test_datetimeindex(self): from pandas.tseries.holiday import USFederalHolidayCalendar + hcal = USFederalHolidayCalendar() freq = CBMonthEnd(calendar=hcal) - assert
(date_range(start='20120101', end='20130101', - freq=freq).tolist()[0] == datetime(2012, 1, 31)) + assert date_range(start="20120101", end="20130101", freq=freq).tolist()[ + 0 + ] == datetime(2012, 1, 31) class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): @@ -2357,15 +2984,14 @@ def test_different_normalize_equals(self): assert offset != offset2 def test_repr(self): - assert repr(self.offset) == '<CustomBusinessMonthBegin>' - assert repr(self.offset2) == '<2 * CustomBusinessMonthBegins>' + assert repr(self.offset) == "<CustomBusinessMonthBegin>" + assert repr(self.offset2) == "<2 * CustomBusinessMonthBegins>" def testCall(self): assert self.offset2(self.d) == datetime(2008, 3, 3) def testRollback1(self): - assert (CDay(10).rollback(datetime(2007, 12, 31)) == - datetime(2007, 12, 31)) + assert CDay(10).rollback(datetime(2007, 12, 31)) == datetime(2007, 12, 31) def testRollback2(self): assert CBMonthBegin(10).rollback(self.d) == datetime(2008, 1, 1) @@ -2391,36 +3017,68 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - on_offset_cases = [(CBMonthBegin(), datetime(2008, 1, 1), True), - (CBMonthBegin(), datetime(2008, 1, 31), False)] + on_offset_cases = [ + (CBMonthBegin(), datetime(2008, 1, 1), True), + (CBMonthBegin(), datetime(2008, 1, 31), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) apply_cases = [] - apply_cases.append((CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 2, 7): datetime(2008, 3, 3)})) - - apply_cases.append((2 * CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 2, 7): datetime(2008, 4, 1)})) - - apply_cases.append((-CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2007, 12, 3), - datetime(2008, 2, 8): datetime(2008, 2, 1)})) - - apply_cases.append((-2 * CBMonthBegin(), { - datetime(2008, 1, 1): datetime(2007, 11, 1), - datetime(2008, 2, 9): datetime(2008, 1, 1)})) - - apply_cases.append((CBMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 7): datetime(2008, 2, 1)})) - - @pytest.mark.parametrize('case', apply_cases) + apply_cases.append( + ( + CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 2, 7): datetime(2008, 3, 3), + }, + ) + ) + + apply_cases.append( + ( + 2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 2, 7): datetime(2008, 4, 1), + }, + ) + ) + + apply_cases.append( + ( + -CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 12, 3), + datetime(2008, 2, 8): datetime(2008, 2, 1), + }, + ) + ) + + apply_cases.append( + ( + -2 * CBMonthBegin(), + { + datetime(2008, 1, 1): datetime(2007, 11, 1), + datetime(2008, 2, 9): datetime(2008, 1, 1), + }, + ) + ) + + apply_cases.append( + ( + CBMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 7): datetime(2008, 2, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", apply_cases) def test_apply(self, case): offset, cases = case for base, expected in cases.items(): @@ -2448,8 +3106,7 @@ def test_apply_large_n(self): def test_holidays(self): # Define a TradingDay offset - holidays = ['2012-02-01', datetime(2012, 2, 2), - np.datetime64('2012-03-01')] + holidays = ["2012-02-01", datetime(2012, 2, 2), np.datetime64("2012-03-01")] bm_offset = CBMonthBegin(holidays=holidays) dt = datetime(2012, 1, 1) @@ -2460,8 +3117,9 @@ def
test_holidays(self): def test_datetimeindex(self): hcal = USFederalHolidayCalendar() cbmb = CBMonthBegin(calendar=hcal) - assert (date_range(start='20120101', end='20130101', - freq=cbmb).tolist()[0] == datetime(2012, 1, 3)) + assert date_range(start="20120101", end="20130101", freq=cbmb).tolist()[ + 0 + ] == datetime(2012, 1, 3) class TestWeek(Base): @@ -2490,42 +3148,66 @@ def test_isAnchored(self): offset_cases = [] # not business week - offset_cases.append((Week(), { - datetime(2008, 1, 1): datetime(2008, 1, 8), - datetime(2008, 1, 4): datetime(2008, 1, 11), - datetime(2008, 1, 5): datetime(2008, 1, 12), - datetime(2008, 1, 6): datetime(2008, 1, 13), - datetime(2008, 1, 7): datetime(2008, 1, 14)})) + offset_cases.append( + ( + Week(), + { + datetime(2008, 1, 1): datetime(2008, 1, 8), + datetime(2008, 1, 4): datetime(2008, 1, 11), + datetime(2008, 1, 5): datetime(2008, 1, 12), + datetime(2008, 1, 6): datetime(2008, 1, 13), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) # Mon - offset_cases.append((Week(weekday=0), { - datetime(2007, 12, 31): datetime(2008, 1, 7), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 14)})) + offset_cases.append( + ( + Week(weekday=0), + { + datetime(2007, 12, 31): datetime(2008, 1, 7), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 14), + }, + ) + ) # n=0 -> roll forward. Mon - offset_cases.append((Week(0, weekday=0), { - datetime(2007, 12, 31): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + offset_cases.append( + ( + Week(0, weekday=0), + { + datetime(2007, 12, 31): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7), + }, + ) + ) # n=0 -> roll forward. 
Mon - offset_cases.append((Week(-2, weekday=1), { - datetime(2010, 4, 6): datetime(2010, 3, 23), - datetime(2010, 4, 8): datetime(2010, 3, 30), - datetime(2010, 4, 5): datetime(2010, 3, 23)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + Week(-2, weekday=1), + { + datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('weekday', range(7)) + @pytest.mark.parametrize("weekday", range(7)) def test_onOffset(self, weekday): offset = Week(weekday=weekday) @@ -2558,8 +3240,9 @@ def test_constructor(self): WeekOfMonth(n=1, week=0, weekday=-7) def test_repr(self): - assert (repr(WeekOfMonth(weekday=1, week=2)) == - "<WeekOfMonth: week=2, weekday=1>") + assert ( + repr(WeekOfMonth(weekday=1, week=2)) == "<WeekOfMonth: week=2, weekday=1>" + ) def test_offset(self): date1 = datetime(2011, 1, 4) # 1st Tuesday of Month @@ -2573,12 +3256,10 @@ def test_offset(self): (-2, 2, 1, date2, datetime(2010, 11, 16)), (-2, 2, 1, date3, datetime(2010, 11, 16)), (-2, 2, 1, date4, datetime(2010, 12, 21)), - (-1, 2, 1, date1, datetime(2010, 12, 21)), (-1, 2, 1, date2, datetime(2010, 12, 21)), (-1, 2, 1, date3, datetime(2010, 12, 21)), (-1, 2, 1, date4, datetime(2011, 1, 18)), - (0, 0, 1, date1, datetime(2011, 1, 4)), (0, 0, 1, date2, datetime(2011, 2, 1)), (0, 0, 1, date3, datetime(2011, 2, 1)), @@ -2591,7 +3272,6 @@ def test_offset(self): (0, 1, 1, date2, datetime(2011, 1, 11)), (0, 2, 1, date3, datetime(2011, 1, 18)), (0, 3, 1, date4, datetime(2011, 1, 25)), - (1, 0, 0, date1, datetime(2011, 2, 7)), (1, 0, 0, date2, datetime(2011, 2, 7)), (1, 0, 0, date3, datetime(2011, 2, 7)), @@ -2604,16 +3284,15 @@ def test_offset(self): (1, 0, 2, date2, datetime(2011, 2, 2)), (1, 0, 2, date3, datetime(2011, 2, 2)), (1, 0, 2, date4, datetime(2011, 2, 2)), - (1, 2, 1, date1, datetime(2011, 1, 18)), (1, 2, 1, date2, datetime(2011, 1, 18)), (1, 2, 1, date3, datetime(2011, 2, 15)), (1, 2, 1, date4, datetime(2011, 2, 15)), - (2, 2, 1, date1, datetime(2011, 2, 15)), (2, 2, 1, date2, datetime(2011, 2, 15)), (2, 2, 1, date3, datetime(2011, 3, 15)), - (2, 2, 1, date4, datetime(2011, 3, 15))] + (2, 2, 1, date4, datetime(2011, 3, 15)), + ] for n, week, weekday, dt, expected in test_cases: offset = WeekOfMonth(n, week=week, weekday=weekday) @@ -2626,14 +3305,16 @@ def test_offset(self): result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) assert result == datetime(2011, 2, 2) - on_offset_cases = [(0, 0, datetime(2011, 2, 7), True), - (0, 0, datetime(2011, 2, 6), False), - (0, 0, datetime(2011, 2, 14), False), - (1, 0, datetime(2011, 2, 14), True), - (0, 1, datetime(2011, 2, 1), True), - (0, 1, datetime(2011, 2, 8), False)] + on_offset_cases = [ + (0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): week, weekday, dt, expected = case offset = WeekOfMonth(week=week, weekday=weekday) @@ -2661,10 +3342,10 @@ def test_offset(self): next_sat = datetime(2013, 9, 28) offset_sat = LastWeekOfMonth(n=1, weekday=5) - one_day_before = (last_sat +
timedelta(days=-1)) + one_day_before = last_sat + timedelta(days=-1) assert one_day_before + offset_sat == last_sat - one_day_after = (last_sat + timedelta(days=+1)) + one_day_after = last_sat + timedelta(days=+1) assert one_day_after + offset_sat == next_sat # Test On that day @@ -2701,16 +3382,16 @@ def test_offset(self): (WeekDay.SUN, datetime(2013, 2, 25), False), # Not a SUN (WeekDay.MON, datetime(2013, 2, 25), True), (WeekDay.SAT, datetime(2013, 11, 30), True), - (WeekDay.SAT, datetime(2006, 8, 26), True), (WeekDay.SAT, datetime(2007, 8, 25), True), (WeekDay.SAT, datetime(2008, 8, 30), True), (WeekDay.SAT, datetime(2009, 8, 29), True), (WeekDay.SAT, datetime(2010, 8, 28), True), (WeekDay.SAT, datetime(2011, 8, 27), True), - (WeekDay.SAT, datetime(2019, 8, 31), True)] + (WeekDay.SAT, datetime(2019, 8, 31), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): weekday, dt, expected = case offset = LastWeekOfMonth(weekday=weekday) @@ -2723,31 +3404,33 @@ class TestSemiMonthEnd(Base): offset2 = _offset(2) def test_offset_whole_year(self): - dates = (datetime(2007, 12, 31), - datetime(2008, 1, 15), - datetime(2008, 1, 31), - datetime(2008, 2, 15), - datetime(2008, 2, 29), - datetime(2008, 3, 15), - datetime(2008, 3, 31), - datetime(2008, 4, 15), - datetime(2008, 4, 30), - datetime(2008, 5, 15), - datetime(2008, 5, 31), - datetime(2008, 6, 15), - datetime(2008, 6, 30), - datetime(2008, 7, 15), - datetime(2008, 7, 31), - datetime(2008, 8, 15), - datetime(2008, 8, 31), - datetime(2008, 9, 15), - datetime(2008, 9, 30), - datetime(2008, 10, 15), - datetime(2008, 10, 31), - datetime(2008, 11, 15), - datetime(2008, 11, 30), - datetime(2008, 12, 15), - datetime(2008, 12, 31)) + dates = ( + datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31), + ) for base, exp_date in zip(dates[:-1], dates[1:]): assert_offset_equal(SemiMonthEnd(), base, exp_date) @@ -2763,94 +3446,142 @@ def test_offset_whole_year(self): tm.assert_index_equal(result, exp) # ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq='SM') + result = date_range(start=dates[0], end=dates[-1], freq="SM") exp = DatetimeIndex(dates) tm.assert_index_equal(result, exp) offset_cases = [] - offset_cases.append((SemiMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 15): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 15), - datetime(2006, 12, 14): datetime(2006, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 15), - datetime(2007, 1, 1): datetime(2007, 1, 15), - datetime(2006, 12, 1): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(day_of_month=20), { - datetime(2008, 1, 1): datetime(2008, 1, 20), - datetime(2008, 1, 15): datetime(2008, 1, 20), - 
datetime(2008, 1, 21): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 20), - datetime(2006, 12, 14): datetime(2006, 12, 20), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 20), - datetime(2007, 1, 1): datetime(2007, 1, 20), - datetime(2006, 12, 1): datetime(2006, 12, 20), - datetime(2006, 12, 15): datetime(2006, 12, 20)})) - - offset_cases.append((SemiMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 16): datetime(2008, 1, 31), - datetime(2008, 1, 15): datetime(2008, 1, 15), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 15)})) - - offset_cases.append((SemiMonthEnd(0, day_of_month=16), { - datetime(2008, 1, 1): datetime(2008, 1, 16), - datetime(2008, 1, 16): datetime(2008, 1, 16), - datetime(2008, 1, 15): datetime(2008, 1, 16), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 16)})) - - offset_cases.append((SemiMonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2007, 1, 15), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2007, 1, 16): datetime(2007, 2, 15), - datetime(2006, 11, 1): datetime(2006, 11, 30)})) - - offset_cases.append((SemiMonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 6, 15), - datetime(2008, 12, 31): datetime(2008, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 15), - datetime(2006, 12, 30): datetime(2006, 12, 15), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(-1, day_of_month=4), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2007, 1, 4): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 6, 4), - datetime(2008, 12, 31): datetime(2008, 12, 4), - datetime(2006, 12, 5): datetime(2006, 12, 4), - datetime(2006, 12, 30): datetime(2006, 12, 4), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((SemiMonthEnd(-2), { - datetime(2007, 1, 1): datetime(2006, 12, 15), - datetime(2008, 6, 30): datetime(2008, 5, 31), - datetime(2008, 3, 15): datetime(2008, 2, 15), - datetime(2008, 12, 31): datetime(2008, 11, 30), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 14): datetime(2006, 11, 15), - datetime(2007, 1, 1): datetime(2006, 12, 15)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + SemiMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 20), + datetime(2006, 12, 14): datetime(2006, 12, 20), + 
datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 20), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 16): datetime(2008, 1, 31), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 16), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 16), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 11, 30), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 30): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2007, 1, 4): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthEnd(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 2, 15), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 14): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('case', offset_cases) + @pytest.mark.parametrize("case", offset_cases) def test_apply_index(self, case): offset, cases = case s = DatetimeIndex(cases.keys()) @@ -2862,21 +3593,28 @@ def test_apply_index(self, case): exp = DatetimeIndex(cases.values()) tm.assert_index_equal(result, exp) - on_offset_cases = [(datetime(2007, 12, 31), True), - (datetime(2007, 12, 15), True), - (datetime(2007, 12, 14), False), - (datetime(2007, 12, 1), False), - (datetime(2008, 2, 29), True)] + on_offset_cases = [ + (datetime(2007, 12, 31), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 1), False), + (datetime(2008, 2, 29), True), + ] - 
@pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): dt, expected = case assert_onOffset(SemiMonthEnd(), dt, expected) - @pytest.mark.parametrize('klass', [Series, DatetimeIndex]) + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding @@ -2884,13 +3622,23 @@ def test_vectorized_offset_addition(self, klass): result = s + SemiMonthEnd() result2 = SemiMonthEnd() + s - exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-31 00:15:00", tz="US/Central"), + Timestamp("2000-02-29", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), - Timestamp('2000-02-01', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding @@ -2898,8 +3646,13 @@ def test_vectorized_offset_addition(self, klass): result = s + SemiMonthEnd() result2 = SemiMonthEnd() + s - exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -2910,31 +3663,33 @@ class TestSemiMonthBegin(Base): offset2 = _offset(2) def test_offset_whole_year(self): - dates = (datetime(2007, 12, 15), - datetime(2008, 1, 1), - datetime(2008, 1, 15), - datetime(2008, 2, 1), - datetime(2008, 2, 15), - datetime(2008, 3, 1), - datetime(2008, 3, 15), - datetime(2008, 4, 1), - datetime(2008, 4, 15), - datetime(2008, 5, 1), - datetime(2008, 5, 15), - datetime(2008, 6, 1), - datetime(2008, 6, 15), - datetime(2008, 7, 1), - datetime(2008, 7, 15), - datetime(2008, 8, 1), - datetime(2008, 8, 15), - datetime(2008, 9, 1), - datetime(2008, 9, 15), - datetime(2008, 10, 1), - datetime(2008, 10, 15), - datetime(2008, 11, 1), - datetime(2008, 11, 15), - datetime(2008, 12, 1), - datetime(2008, 12, 15)) + dates = ( + datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15), + ) for base, exp_date in zip(dates[:-1], dates[1:]): assert_offset_equal(SemiMonthBegin(), base, exp_date) @@ -2950,98 +3705,146 @@ def test_offset_whole_year(self): tm.assert_index_equal(result, exp) # 
ensure generating a range with DatetimeIndex gives same result - result = date_range(start=dates[0], end=dates[-1], freq='SMS') + result = date_range(start=dates[0], end=dates[-1], freq="SMS") exp = DatetimeIndex(dates) tm.assert_index_equal(result, exp) offset_cases = [] - offset_cases.append((SemiMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 1, 15), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 14): datetime(2006, 12, 15), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 1): datetime(2007, 1, 15), - datetime(2006, 12, 1): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(day_of_month=20), { - datetime(2008, 1, 1): datetime(2008, 1, 20), - datetime(2008, 1, 15): datetime(2008, 1, 20), - datetime(2008, 1, 21): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 14): datetime(2006, 12, 20), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 1): datetime(2007, 1, 20), - datetime(2006, 12, 1): datetime(2006, 12, 20), - datetime(2006, 12, 15): datetime(2006, 12, 20)})) - - offset_cases.append((SemiMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 16): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 1, 15), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 2): datetime(2006, 12, 15), - datetime(2007, 1, 1): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(0, day_of_month=16), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 1, 16): datetime(2008, 1, 16), - datetime(2008, 1, 15): datetime(2008, 1, 16), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2007, 1, 5): datetime(2007, 1, 16), - datetime(2007, 1, 1): datetime(2007, 1, 1)})) - - offset_cases.append((SemiMonthBegin(2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 15), - datetime(2006, 12, 1): datetime(2007, 1, 1), - datetime(2006, 12, 29): datetime(2007, 1, 15), - datetime(2006, 12, 15): datetime(2007, 1, 15), - datetime(2007, 1, 1): datetime(2007, 2, 1), - datetime(2007, 1, 16): datetime(2007, 2, 15), - datetime(2006, 11, 1): datetime(2006, 12, 1)})) - - offset_cases.append((SemiMonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 15), - datetime(2008, 6, 30): datetime(2008, 6, 15), - datetime(2008, 6, 14): datetime(2008, 6, 1), - datetime(2008, 12, 31): datetime(2008, 12, 15), - datetime(2006, 12, 29): datetime(2006, 12, 15), - datetime(2006, 12, 15): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 15)})) - - offset_cases.append((SemiMonthBegin(-1, day_of_month=4), { - datetime(2007, 1, 1): datetime(2006, 12, 4), - datetime(2007, 1, 4): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 6, 4), - datetime(2008, 12, 31): datetime(2008, 12, 4), - datetime(2006, 12, 5): datetime(2006, 12, 4), - datetime(2006, 12, 30): datetime(2006, 12, 4), - datetime(2006, 12, 2): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 4)})) - - offset_cases.append((SemiMonthBegin(-2), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 6, 30): datetime(2008, 6, 1), - datetime(2008, 6, 14): 
datetime(2008, 5, 15), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 12, 15): datetime(2006, 11, 15), - datetime(2007, 1, 1): datetime(2006, 12, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + SemiMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(day_of_month=20), + { + datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 2): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(0, day_of_month=16), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 5): datetime(2007, 1, 16), + datetime(2007, 1, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 15): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 12, 1), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 6, 14): datetime(2008, 6, 1), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 15), + }, + ) + ) + + offset_cases.append( + ( + SemiMonthBegin(-1, day_of_month=4), + { + datetime(2007, 1, 1): datetime(2006, 12, 4), + datetime(2007, 1, 4): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2006, 12, 2): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 4), + }, + ) + ) + + offset_cases.append( + ( + 
SemiMonthBegin(-2), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 1), + datetime(2008, 6, 14): datetime(2008, 5, 15), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 15): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - @pytest.mark.parametrize('case', offset_cases) + @pytest.mark.parametrize("case", offset_cases) def test_apply_index(self, case): offset, cases = case s = DatetimeIndex(cases.keys()) @@ -3054,42 +3857,64 @@ def test_apply_index(self, case): exp = DatetimeIndex(cases.values()) tm.assert_index_equal(result, exp) - on_offset_cases = [(datetime(2007, 12, 1), True), - (datetime(2007, 12, 15), True), - (datetime(2007, 12, 14), False), - (datetime(2007, 12, 31), False), - (datetime(2008, 2, 15), True)] + on_offset_cases = [ + (datetime(2007, 12, 1), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 31), False), + (datetime(2008, 2, 15), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): dt, expected = case assert_onOffset(SemiMonthBegin(), dt, expected) - @pytest.mark.parametrize('klass', [Series, DatetimeIndex]) + @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): - s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding # an integer array to PeriodIndex result = s + SemiMonthBegin() result2 = SemiMonthBegin() + s - exp = klass([Timestamp('2000-02-01 00:15:00', tz='US/Central'), - Timestamp('2000-03-01', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-02-01 00:15:00", tz="US/Central"), + Timestamp("2000-03-01", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) - s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), - Timestamp('2000-02-01', tz='US/Central')], name='a') + s = klass( + [ + Timestamp("2000-01-01 00:15:00", tz="US/Central"), + Timestamp("2000-02-01", tz="US/Central"), + ], + name="a", + ) with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding # an integer array to PeriodIndex result = s + SemiMonthBegin() result2 = SemiMonthBegin() + s - exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - Timestamp('2000-02-15', tz='US/Central')], name='a') + exp = klass( + [ + Timestamp("2000-01-15 00:15:00", tz="US/Central"), + Timestamp("2000-02-15", tz="US/Central"), + ], + name="a", + ) tm.assert_equal(result, exp) tm.assert_equal(result2, exp) @@ -3104,58 +3929,61 @@ def test_Easter(): assert_offset_equal(-Easter(), datetime(2011, 1, 1), datetime(2010, 4, 4)) assert_offset_equal(-Easter(), datetime(2010, 4, 5), datetime(2010, 4, 4)) - assert_offset_equal(-Easter(2), - datetime(2011, 1, 1), - datetime(2009, 4, 12)) + assert_offset_equal(-Easter(2), datetime(2011, 1, 1), datetime(2009, 4, 12)) assert_offset_equal(-Easter(), 
datetime(2010, 4, 4), datetime(2009, 4, 12)) - assert_offset_equal(-Easter(2), - datetime(2010, 4, 4), - datetime(2008, 3, 23)) + assert_offset_equal(-Easter(2), datetime(2010, 4, 4), datetime(2008, 3, 23)) class TestOffsetNames: - def test_get_offset_name(self): - assert BDay().freqstr == 'B' - assert BDay(2).freqstr == '2B' - assert BMonthEnd().freqstr == 'BM' - assert Week(weekday=0).freqstr == 'W-MON' - assert Week(weekday=1).freqstr == 'W-TUE' - assert Week(weekday=2).freqstr == 'W-WED' - assert Week(weekday=3).freqstr == 'W-THU' - assert Week(weekday=4).freqstr == 'W-FRI' + assert BDay().freqstr == "B" + assert BDay(2).freqstr == "2B" + assert BMonthEnd().freqstr == "BM" + assert Week(weekday=0).freqstr == "W-MON" + assert Week(weekday=1).freqstr == "W-TUE" + assert Week(weekday=2).freqstr == "W-WED" + assert Week(weekday=3).freqstr == "W-THU" + assert Week(weekday=4).freqstr == "W-FRI" assert LastWeekOfMonth(weekday=WeekDay.SUN).freqstr == "LWOM-SUN" def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('gibberish') + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset('QS-JAN-B') + get_offset("QS-JAN-B") pairs = [ - ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), - ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), - ('W-TUE', Week(weekday=1)), ('W-WED', Week(weekday=2)), - ('W-THU', Week(weekday=3)), ('W-FRI', Week(weekday=4))] + ("B", BDay()), + ("b", BDay()), + ("bm", BMonthEnd()), + ("Bm", BMonthEnd()), + ("W-MON", Week(weekday=0)), + ("W-TUE", Week(weekday=1)), + ("W-WED", Week(weekday=2)), + ("W-THU", Week(weekday=3)), + ("W-FRI", Week(weekday=4)), + ] for name, expected in pairs: offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + assert offset == expected, "Expected %r to yield %r (actual: %r)" % ( + name, + expected, + offset, + ) def test_get_offset_legacy(): - pairs = [('w@Sat', Week(weekday=5))] + pairs = [("w@Sat", Week(weekday=5))] for name, expected in pairs: with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): get_offset(name) class TestOffsetAliases: - def setup_method(self, method): _offset_map.clear() @@ -3166,32 +3994,44 @@ def test_alias_equality(self): assert k == v.copy() def test_rule_code(self): - lst = ['M', 'MS', 'BM', 'BMS', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] for k in lst: assert k == get_offset(k).rule_code # should be cached - this is kind of an internals test... 
assert k in _offset_map assert k == (get_offset(k) * 3).rule_code - suffix_lst = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - base = 'W' + suffix_lst = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + base = "W" for v in suffix_lst: - alias = '-'.join([base, v]) + alias = "-".join([base, v]) assert alias == get_offset(alias).rule_code assert alias == (get_offset(alias) * 5).rule_code - suffix_lst = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', - 'SEP', 'OCT', 'NOV', 'DEC'] - base_lst = ['A', 'AS', 'BA', 'BAS', 'Q', 'QS', 'BQ', 'BQS'] + suffix_lst = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + base_lst = ["A", "AS", "BA", "BAS", "Q", "QS", "BQ", "BQS"] for base in base_lst: for v in suffix_lst: - alias = '-'.join([base, v]) + alias = "-".join([base, v]) assert alias == get_offset(alias).rule_code assert alias == (get_offset(alias) * 5).rule_code - lst = ['M', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + lst = ["M", "D", "B", "H", "T", "S", "L", "U"] for k in lst: - code, stride = get_freq_code('3' + k) + code, stride = get_freq_code("3" + k) assert isinstance(code, int) assert stride == 3 assert k == get_freq_str(code) @@ -3202,30 +4042,42 @@ def test_dateoffset_misc(): # it works oset.freqstr - assert (not offsets.DateOffset(months=2) == 2) + assert not offsets.DateOffset(months=2) == 2 def test_freq_offsets(): off = BDay(1, offset=timedelta(0, 1800)) - assert (off.freqstr == 'B+30Min') + assert off.freqstr == "B+30Min" off = BDay(1, offset=timedelta(0, -1800)) - assert (off.freqstr == 'B-30Min') + assert off.freqstr == "B-30Min" class TestReprNames: - def test_str_for_named_is_name(self): # look at all the amazing combinations! - month_prefixes = ['A', 'AS', 'BA', 'BAS', 'Q', 'BQ', 'BQS', 'QS'] - names = [prefix + '-' + month - for prefix in month_prefixes - for month in ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']] - days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] - names += ['W-' + day for day in days] - names += ['WOM-' + week + day - for week in ('1', '2', '3', '4') for day in days] + month_prefixes = ["A", "AS", "BA", "BAS", "Q", "BQ", "BQS", "QS"] + names = [ + prefix + "-" + month + for prefix in month_prefixes + for month in [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ] + ] + days = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] + names += ["W-" + day for day in days] + names += ["WOM-" + week + day for week in ("1", "2", "3", "4") for day in days] _offset_map.clear() for name in names: offset = get_offset(name) @@ -3242,29 +4094,40 @@ class TestDST: """ test DateOffset additions over Daylight Savings Time """ + # one microsecond before the DST transition ts_pre_fallback = "2013-11-03 01:59:59.999999" ts_pre_springfwd = "2013-03-10 01:59:59.999999" # test both basic names and dateutil timezones timezone_utc_offsets = { - 'US/Eastern': dict(utc_offset_daylight=-4, - utc_offset_standard=-5, ), - 'dateutil/US/Pacific': dict(utc_offset_daylight=-7, - utc_offset_standard=-8, ) + "US/Eastern": dict(utc_offset_daylight=-4, utc_offset_standard=-5), + "dateutil/US/Pacific": dict(utc_offset_daylight=-7, utc_offset_standard=-8), } valid_date_offsets_singular = [ - 'weekday', 'day', 'hour', 'minute', 'second', 'microsecond' + "weekday", + "day", + "hour", + "minute", + "second", + "microsecond", ] valid_date_offsets_plural = [ - 'weeks', 'days', - 'hours', 'minutes', 'seconds', - 
'milliseconds', 'microseconds' + "weeks", + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", ] def _test_all_offsets(self, n, **kwds): - valid_offsets = self.valid_date_offsets_plural if n > 1 \ + valid_offsets = ( + self.valid_date_offsets_plural + if n > 1 else self.valid_date_offsets_singular + ) for name in valid_offsets: self._test_offset(offset_name=name, offset_n=n, **kwds) @@ -3276,90 +4139,101 @@ def _test_offset(self, offset_name, offset_n, tstart, expected_utc_offset): if expected_utc_offset is not None: assert get_utc_offset_hours(t) == expected_utc_offset - if offset_name == 'weeks': + if offset_name == "weeks": # dates should match - assert t.date() == timedelta(days=7 * offset.kwds[ - 'weeks']) + tstart.date() + assert t.date() == timedelta(days=7 * offset.kwds["weeks"]) + tstart.date() # expect the same day of week, hour of day, minute, second, ... - assert (t.dayofweek == tstart.dayofweek and - t.hour == tstart.hour and - t.minute == tstart.minute and - t.second == tstart.second) - elif offset_name == 'days': + assert ( + t.dayofweek == tstart.dayofweek + and t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) + elif offset_name == "days": # dates should match - assert timedelta(offset.kwds['days']) + tstart.date() == t.date() + assert timedelta(offset.kwds["days"]) + tstart.date() == t.date() # expect the same hour of day, minute, second, ... - assert (t.hour == tstart.hour and - t.minute == tstart.minute and - t.second == tstart.second) + assert ( + t.hour == tstart.hour + and t.minute == tstart.minute + and t.second == tstart.second + ) elif offset_name in self.valid_date_offsets_singular: # expect the singular offset value to match between tstart and t - datepart_offset = getattr(t, offset_name - if offset_name != 'weekday' else - 'dayofweek') + datepart_offset = getattr( + t, offset_name if offset_name != "weekday" else "dayofweek" + ) assert datepart_offset == offset.kwds[offset_name] else: # the offset should be the same as if it was done in UTC - assert (t == (tstart.tz_convert('UTC') + offset) - .tz_convert('US/Pacific')) + assert t == (tstart.tz_convert("UTC") + offset).tz_convert("US/Pacific") def _make_timestamp(self, string, hrs_offset, tz): if hrs_offset >= 0: - offset_string = '{hrs:02d}00'.format(hrs=hrs_offset) + offset_string = "{hrs:02d}00".format(hrs=hrs_offset) else: - offset_string = '-{hrs:02d}00'.format(hrs=-1 * hrs_offset) + offset_string = "-{hrs:02d}00".format(hrs=-1 * hrs_offset) return Timestamp(string + offset_string).tz_convert(tz) def test_springforward_plural(self): # test moving from standard to daylight savings for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - hrs_post = utc_offsets['utc_offset_daylight'] + hrs_pre = utc_offsets["utc_offset_standard"] + hrs_post = utc_offsets["utc_offset_daylight"] self._test_all_offsets( - n=3, tstart=self._make_timestamp(self.ts_pre_springfwd, - hrs_pre, tz), - expected_utc_offset=hrs_post) + n=3, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=hrs_post, + ) def test_fallback_singular(self): # in the case of singular offsets, we don't necessarily know which utc # offset the new Timestamp will wind up in (the tz for 1 month may be # different from 1 second) so we don't specify an expected_utc_offset for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - self._test_all_offsets(n=1, 
tstart=self._make_timestamp( - self.ts_pre_fallback, hrs_pre, tz), expected_utc_offset=None) + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_fallback, hrs_pre, tz), + expected_utc_offset=None, + ) def test_springforward_singular(self): for tz, utc_offsets in self.timezone_utc_offsets.items(): - hrs_pre = utc_offsets['utc_offset_standard'] - self._test_all_offsets(n=1, tstart=self._make_timestamp( - self.ts_pre_springfwd, hrs_pre, tz), expected_utc_offset=None) - - offset_classes = {MonthBegin: ['11/2/2012', '12/1/2012'], - MonthEnd: ['11/2/2012', '11/30/2012'], - BMonthBegin: ['11/2/2012', '12/3/2012'], - BMonthEnd: ['11/2/2012', '11/30/2012'], - CBMonthBegin: ['11/2/2012', '12/3/2012'], - CBMonthEnd: ['11/2/2012', '11/30/2012'], - SemiMonthBegin: ['11/2/2012', '11/15/2012'], - SemiMonthEnd: ['11/2/2012', '11/15/2012'], - Week: ['11/2/2012', '11/9/2012'], - YearBegin: ['11/2/2012', '1/1/2013'], - YearEnd: ['11/2/2012', '12/31/2012'], - BYearBegin: ['11/2/2012', '1/1/2013'], - BYearEnd: ['11/2/2012', '12/31/2012'], - QuarterBegin: ['11/2/2012', '12/1/2012'], - QuarterEnd: ['11/2/2012', '12/31/2012'], - BQuarterBegin: ['11/2/2012', '12/3/2012'], - BQuarterEnd: ['11/2/2012', '12/31/2012'], - Day: ['11/4/2012', '11/4/2012 23:00']}.items() - - @pytest.mark.parametrize('tup', offset_classes) + hrs_pre = utc_offsets["utc_offset_standard"] + self._test_all_offsets( + n=1, + tstart=self._make_timestamp(self.ts_pre_springfwd, hrs_pre, tz), + expected_utc_offset=None, + ) + + offset_classes = { + MonthBegin: ["11/2/2012", "12/1/2012"], + MonthEnd: ["11/2/2012", "11/30/2012"], + BMonthBegin: ["11/2/2012", "12/3/2012"], + BMonthEnd: ["11/2/2012", "11/30/2012"], + CBMonthBegin: ["11/2/2012", "12/3/2012"], + CBMonthEnd: ["11/2/2012", "11/30/2012"], + SemiMonthBegin: ["11/2/2012", "11/15/2012"], + SemiMonthEnd: ["11/2/2012", "11/15/2012"], + Week: ["11/2/2012", "11/9/2012"], + YearBegin: ["11/2/2012", "1/1/2013"], + YearEnd: ["11/2/2012", "12/31/2012"], + BYearBegin: ["11/2/2012", "1/1/2013"], + BYearEnd: ["11/2/2012", "12/31/2012"], + QuarterBegin: ["11/2/2012", "12/1/2012"], + QuarterEnd: ["11/2/2012", "12/31/2012"], + BQuarterBegin: ["11/2/2012", "12/3/2012"], + BQuarterEnd: ["11/2/2012", "12/31/2012"], + Day: ["11/4/2012", "11/4/2012 23:00"], + }.items() + + @pytest.mark.parametrize("tup", offset_classes) def test_all_offset_classes(self, tup): offset, test_values = tup - first = Timestamp(test_values[0], tz='US/Eastern') + offset() - second = Timestamp(test_values[1], tz='US/Eastern') + first = Timestamp(test_values[0], tz="US/Eastern") + offset() + second = Timestamp(test_values[1], tz="US/Eastern") assert first == second @@ -3379,7 +4253,7 @@ def test_valid_default_arguments(offset_types): cls() -@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_month_attributes(kwd, month_classes): # GH#18226 cls = month_classes @@ -3388,14 +4262,14 @@ def test_valid_month_attributes(kwd, month_classes): cls(**{kwd: 3}) -@pytest.mark.parametrize('kwd', sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_relativedelta_kwargs(kwd): # Check that all the arguments specified in liboffsets.relativedelta_kwds # are in fact valid relativedelta keyword args DateOffset(**{kwd: 1}) -@pytest.mark.parametrize('kwd', 
sorted(list(liboffsets.relativedelta_kwds))) +@pytest.mark.parametrize("kwd", sorted(list(liboffsets.relativedelta_kwds))) def test_valid_tick_attributes(kwd, tick_classes): # GH#18226 cls = tick_classes @@ -3406,7 +4280,7 @@ def test_valid_tick_attributes(kwd, tick_classes): def test_validate_n_error(): with pytest.raises(TypeError): - DateOffset(n='Doh!') + DateOffset(n="Doh!") with pytest.raises(TypeError): MonthBegin(n=timedelta(1)) @@ -3433,14 +4307,14 @@ def test_weeks_onoffset(): # GH#18510 Week with weekday = None, normalize = False should always # be onOffset offset = Week(n=2, weekday=None) - ts = Timestamp('1862-01-13 09:03:34.873477378+0210', tz='Africa/Lusaka') + ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = Week(n=2, weekday=None) - ts = Timestamp('1856-10-24 16:18:36.556360110-0717', tz='Pacific/Easter') + ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -3450,14 +4324,14 @@ def test_weekofmonth_onoffset(): # GH#18864 # Make sure that nanoseconds don't trip up onOffset (and with it apply) offset = WeekOfMonth(n=2, week=2, weekday=0) - ts = Timestamp('1916-05-15 01:14:49.583410462+0422', tz='Asia/Qyzylorda') + ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = WeekOfMonth(n=-3, week=1, weekday=0) - ts = Timestamp('1980-12-08 03:38:52.878321185+0500', tz='Asia/Oral') + ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") fast = offset.onOffset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -3466,16 +4340,14 @@ def test_weekofmonth_onoffset(): def test_last_week_of_month_on_offset(): # GH#19036, GH#18977 _adjust_dst was incorrect for LastWeekOfMonth offset = LastWeekOfMonth(n=4, weekday=6) - ts = Timestamp('1917-05-27 20:55:27.084284178+0200', - tz='Europe/Warsaw') + ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow # negative n offset = LastWeekOfMonth(n=-4, weekday=5) - ts = Timestamp('2005-08-27 05:01:42.799392561-0500', - tz='America/Rainy_River') + ts = Timestamp("2005-08-27 05:01:42.799392561-0500", tz="America/Rainy_River") slow = (ts + offset) - offset == ts fast = offset.onOffset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 271f4ceef5f49..880ff1f137520 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -17,42 +17,67 @@ import pandas as pd from pandas.tseries.offsets import ( - BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd, - MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd) + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) # ---------------------------------------------------------------- # Helpers for generating random data with warnings.catch_warnings(): - warnings.simplefilter('ignore') - min_dt = pd.Timestamp(1900, 1, 1).to_pydatetime(), - max_dt = pd.Timestamp(1900, 1, 1).to_pydatetime(), + 
warnings.simplefilter("ignore") + min_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) + max_dt = (pd.Timestamp(1900, 1, 1).to_pydatetime(),) gen_date_range = st.builds( pd.date_range, start=st.datetimes( # TODO: Choose the min/max values more systematically min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(), - max_value=pd.Timestamp(2100, 1, 1).to_pydatetime() + max_value=pd.Timestamp(2100, 1, 1).to_pydatetime(), ), periods=st.integers(min_value=2, max_value=100), - freq=st.sampled_from('Y Q M D H T s ms us ns'.split()), + freq=st.sampled_from("Y Q M D H T s ms us ns".split()), tz=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), ) gen_random_datetime = st.datetimes( min_value=min_dt, max_value=max_dt, - timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()) + timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()), ) # The strategy for each type is registered in conftest.py, as they don't carry # enough runtime information (e.g. type hints) to infer how to build them. -gen_yqm_offset = st.one_of(*map(st.from_type, [ - MonthBegin, MonthEnd, BMonthBegin, BMonthEnd, - QuarterBegin, QuarterEnd, BQuarterBegin, BQuarterEnd, - YearBegin, YearEnd, BYearBegin, BYearEnd -])) +gen_yqm_offset = st.one_of( + *map( + st.from_type, + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], + ) +) # ---------------------------------------------------------------- @@ -60,7 +85,7 @@ # Based on CI runs: Always passes on OSX, fails on Linux, sometimes on Windows -@pytest.mark.xfail(strict=False, reason='inconsistent between OSs, Pythons') +@pytest.mark.xfail(strict=False, reason="inconsistent between OSs, Pythons") @given(gen_random_datetime, gen_yqm_offset) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) @@ -71,10 +96,12 @@ def test_on_offset_implementations(dt, offset): assert offset.onOffset(dt) == (compare == dt) -@pytest.mark.xfail(reason="res_v2 below is incorrect, needs to use the " - "commented-out version with tz_localize. " - "But with that fix in place, hypothesis then " - "has errors in timezone generation.") +@pytest.mark.xfail( + reason="res_v2 below is incorrect, needs to use the " + "commented-out version with tz_localize. " + "But with that fix in place, hypothesis then " + "has errors in timezone generation." 
+) @given(gen_yqm_offset, gen_date_range) def test_apply_index_implementations(offset, rng): # offset.apply_index(dti)[i] should match dti[i] + offset @@ -103,8 +130,9 @@ def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and # 2) that hour is not incorrectly changed by this normalization # Note that dti includes a transition across DST boundary - dti = pd.date_range(start='2017-10-30 12:00:00', end='2017-11-06', - freq='D', tz='US/Eastern') + dti = pd.date_range( + start="2017-10-30 12:00:00", end="2017-11-06", freq="D", tz="US/Eastern" + ) assert (dti.hour == 12).all() # we haven't screwed up yet res = dti + offset diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index f2f6aed097d0c..98a3631c8e63a 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -27,21 +27,21 @@ def test_apply_ticks(): result = offsets.Hour(3).apply(offsets.Hour(4)) exp = offsets.Hour(7) - assert (result == exp) + assert result == exp def test_delta_to_tick(): delta = timedelta(3) tick = offsets._delta_to_tick(delta) - assert (tick == offsets.Day(3)) + assert tick == offsets.Day(3) td = Timedelta(nanoseconds=5) tick = offsets._delta_to_tick(td) assert tick == Nano(5) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) @settings(deadline=None) # GH 24641 @example(n=2, m=3) @example(n=800, m=300) @@ -62,7 +62,7 @@ def test_tick_add_sub(cls, n, m): assert left - right == expected -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) @settings(deadline=None) @example(n=2, m=3) @given(n=st.integers(-999, 999), m=st.integers(-999, 999)) @@ -86,14 +86,10 @@ def test_tick_equality(cls, n, m): def test_Hour(): - assert_offset_equal(Hour(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) - assert_offset_equal(Hour(-1), - datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) - assert_offset_equal(2 * Hour(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) - assert_offset_equal(-1 * Hour(), - datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assert_offset_equal(Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) + assert_offset_equal(Hour(-1), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) + assert_offset_equal(-1 * Hour(), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) assert Hour(3) + Hour(2) == Hour(5) assert Hour(3) - Hour(2) == Hour() @@ -102,14 +98,10 @@ def test_Hour(): def test_Minute(): - assert_offset_equal(Minute(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) - assert_offset_equal(Minute(-1), - datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) - assert_offset_equal(2 * Minute(), - datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) - assert_offset_equal(-1 * Minute(), - datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal(Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) + assert_offset_equal(Minute(-1), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal(2 * Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) + assert_offset_equal(-1 * Minute(), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) assert Minute(3) + Minute(2) == Minute(5) assert Minute(3) - Minute(2) == Minute() @@ -117,67 +109,61 @@ def test_Minute(): def test_Second(): - assert_offset_equal(Second(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 1)) - 
assert_offset_equal(Second(-1), - datetime(2010, 1, 1, 0, 0, 1), - datetime(2010, 1, 1)) - assert_offset_equal(2 * Second(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 2)) - assert_offset_equal(-1 * Second(), - datetime(2010, 1, 1, 0, 0, 1), - datetime(2010, 1, 1)) + assert_offset_equal(Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 1)) + assert_offset_equal(Second(-1), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + assert_offset_equal( + 2 * Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 2) + ) + assert_offset_equal( + -1 * Second(), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1) + ) assert Second(3) + Second(2) == Second(5) assert Second(3) - Second(2) == Second() def test_Millisecond(): - assert_offset_equal(Milli(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 1000)) - assert_offset_equal(Milli(-1), - datetime(2010, 1, 1, 0, 0, 0, 1000), - datetime(2010, 1, 1)) - assert_offset_equal(Milli(2), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2000)) - assert_offset_equal(2 * Milli(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2000)) - assert_offset_equal(-1 * Milli(), - datetime(2010, 1, 1, 0, 0, 0, 1000), - datetime(2010, 1, 1)) + assert_offset_equal( + Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1000) + ) + assert_offset_equal( + Milli(-1), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) + assert_offset_equal( + Milli(2), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + 2 * Milli(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2000) + ) + assert_offset_equal( + -1 * Milli(), datetime(2010, 1, 1, 0, 0, 0, 1000), datetime(2010, 1, 1) + ) assert Milli(3) + Milli(2) == Milli(5) assert Milli(3) - Milli(2) == Milli() def test_MillisecondTimestampArithmetic(): - assert_offset_equal(Milli(), - Timestamp('2010-01-01'), - Timestamp('2010-01-01 00:00:00.001')) - assert_offset_equal(Milli(-1), - Timestamp('2010-01-01 00:00:00.001'), - Timestamp('2010-01-01')) + assert_offset_equal( + Milli(), Timestamp("2010-01-01"), Timestamp("2010-01-01 00:00:00.001") + ) + assert_offset_equal( + Milli(-1), Timestamp("2010-01-01 00:00:00.001"), Timestamp("2010-01-01") + ) def test_Microsecond(): - assert_offset_equal(Micro(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 1)) - assert_offset_equal(Micro(-1), - datetime(2010, 1, 1, 0, 0, 0, 1), - datetime(2010, 1, 1)) - - assert_offset_equal(2 * Micro(), - datetime(2010, 1, 1), - datetime(2010, 1, 1, 0, 0, 0, 2)) - assert_offset_equal(-1 * Micro(), - datetime(2010, 1, 1, 0, 0, 0, 1), - datetime(2010, 1, 1)) + assert_offset_equal(Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 1)) + assert_offset_equal( + Micro(-1), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) + + assert_offset_equal( + 2 * Micro(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 0, 2) + ) + assert_offset_equal( + -1 * Micro(), datetime(2010, 1, 1, 0, 0, 0, 1), datetime(2010, 1, 1) + ) assert Micro(3) + Micro(2) == Micro(5) assert Micro(3) - Micro(2) == Micro() @@ -196,18 +182,10 @@ def test_NanosecondGeneric(): def test_Nanosecond(): timestamp = Timestamp(datetime(2010, 1, 1)) - assert_offset_equal(Nano(), - timestamp, - timestamp + np.timedelta64(1, 'ns')) - assert_offset_equal(Nano(-1), - timestamp + np.timedelta64(1, 'ns'), - timestamp) - assert_offset_equal(2 * Nano(), - timestamp, - timestamp + np.timedelta64(2, 'ns')) - assert_offset_equal(-1 * Nano(), - timestamp + np.timedelta64(1, 'ns'), - timestamp) 
+ assert_offset_equal(Nano(), timestamp, timestamp + np.timedelta64(1, "ns")) + assert_offset_equal(Nano(-1), timestamp + np.timedelta64(1, "ns"), timestamp) + assert_offset_equal(2 * Nano(), timestamp, timestamp + np.timedelta64(2, "ns")) + assert_offset_equal(-1 * Nano(), timestamp + np.timedelta64(1, "ns"), timestamp) assert Nano(3) + Nano(2) == Nano(5) assert Nano(3) - Nano(2) == Nano() @@ -218,13 +196,17 @@ def test_Nanosecond(): assert Micro(5) + Nano(1) == Nano(5001) -@pytest.mark.parametrize('kls, expected', - [(Hour, Timedelta(hours=5)), - (Minute, Timedelta(hours=2, minutes=3)), - (Second, Timedelta(hours=2, seconds=3)), - (Milli, Timedelta(hours=2, milliseconds=3)), - (Micro, Timedelta(hours=2, microseconds=3)), - (Nano, Timedelta(hours=2, nanoseconds=3))]) +@pytest.mark.parametrize( + "kls, expected", + [ + (Hour, Timedelta(hours=5)), + (Minute, Timedelta(hours=2, minutes=3)), + (Second, Timedelta(hours=2, seconds=3)), + (Milli, Timedelta(hours=2, milliseconds=3)), + (Micro, Timedelta(hours=2, microseconds=3)), + (Nano, Timedelta(hours=2, nanoseconds=3)), + ], +) def test_tick_addition(kls, expected): offset = kls(3) result = offset + Timedelta(hours=2) @@ -232,7 +214,7 @@ def test_tick_addition(kls, expected): assert result == expected -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_division(cls): off = cls(10) @@ -254,13 +236,13 @@ def test_tick_division(cls): if cls._inc < Timedelta(seconds=1): # Case where we end up with a bigger class - result = off / .001 + result = off / 0.001 assert isinstance(result, offsets.Tick) assert not isinstance(result, cls) - assert result.delta == off.delta / .001 + assert result.delta == off.delta / 0.001 -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_rdiv(cls): off = cls(10) delta = off.delta @@ -278,12 +260,12 @@ def test_tick_rdiv(cls): assert (delta.to_pytimedelta() * 2) / off == 2 result = np.array([2 * td64, td64]) / off - expected = np.array([2., 1.]) + expected = np.array([2.0, 1.0]) tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize('cls1', tick_classes) -@pytest.mark.parametrize('cls2', tick_classes) +@pytest.mark.parametrize("cls1", tick_classes) +@pytest.mark.parametrize("cls2", tick_classes) def test_tick_zero(cls1, cls2): assert cls1(0) == cls2(0) assert cls1(0) + cls2(0) == cls1(0) @@ -295,17 +277,17 @@ def test_tick_zero(cls1, cls2): assert cls1(2) + Nano(0) == cls1(2) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_equalities(cls): assert cls() == cls(1) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): assert not cls().isAnchored() -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_compare_ticks(cls): three = cls(3) four = cls(4) @@ -318,7 +300,7 @@ def test_compare_ticks(cls): assert cls(3) != cls(4) -@pytest.mark.parametrize('cls', tick_classes) +@pytest.mark.parametrize("cls", tick_classes) def test_compare_ticks_to_strs(cls): # GH#23524 off = cls(19) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index bcfe997583b02..12a524d82fcf5 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -9,8 +9,19 @@ from pandas import Timestamp from pandas.tseries.offsets import ( - 
BMonthBegin, BMonthEnd, BQuarterBegin, BQuarterEnd, BYearBegin, BYearEnd, - MonthBegin, MonthEnd, QuarterBegin, QuarterEnd, YearBegin, YearEnd) + BMonthBegin, + BMonthEnd, + BQuarterBegin, + BQuarterEnd, + BYearBegin, + BYearEnd, + MonthBegin, + MonthEnd, + QuarterBegin, + QuarterEnd, + YearBegin, + YearEnd, +) from .common import assert_offset_equal, assert_onOffset from .test_offsets import Base @@ -26,19 +37,30 @@ def test_quarterly_dont_normalize(): for klass in offsets: result = date + klass() - assert (result.time() == date.time()) - - -@pytest.mark.parametrize('n', [-2, 1]) -@pytest.mark.parametrize('cls', [MonthBegin, MonthEnd, - BMonthBegin, BMonthEnd, - QuarterBegin, QuarterEnd, - BQuarterBegin, BQuarterEnd, - YearBegin, YearEnd, - BYearBegin, BYearEnd]) + assert result.time() == date.time() + + +@pytest.mark.parametrize("n", [-2, 1]) +@pytest.mark.parametrize( + "cls", + [ + MonthBegin, + MonthEnd, + BMonthBegin, + BMonthEnd, + QuarterBegin, + QuarterEnd, + BQuarterBegin, + BQuarterEnd, + YearBegin, + YearEnd, + BYearBegin, + BYearEnd, + ], +) def test_apply_index(cls, n): offset = cls(n=n) - rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') + rng = pd.date_range(start="1/1/2000", periods=100000, freq="T") ser = pd.Series(rng) res = rng + offset @@ -52,12 +74,16 @@ def test_apply_index(cls, n): assert res2.iloc[-1] == ser.iloc[-1] + offset -@pytest.mark.parametrize('offset', [QuarterBegin(), QuarterEnd(), - BQuarterBegin(), BQuarterEnd()]) +@pytest.mark.parametrize( + "offset", [QuarterBegin(), QuarterEnd(), BQuarterBegin(), BQuarterEnd()] +) def test_on_offset(offset): - dates = [datetime(2016, m, d) - for m in [10, 11, 12] - for d in [1, 2, 3, 28, 29, 30, 31] if not (m == 11 and d == 31)] + dates = [ + datetime(2016, m, d) + for m in [10, 11, 12] + for d in [1, 2, 3, 28, 29, 30, 31] + if not (m == 11 and d == 31) + ] for date in dates: res = offset.onOffset(date) slow_version = date == (date + offset) - offset @@ -67,41 +93,66 @@ def test_on_offset(offset): # -------------------------------------------------------------------- # Months + class TestMonthBegin(Base): _offset = MonthBegin offset_cases = [] # NOTE: I'm not entirely happy with the logic here for Begin -ss # see thread 'offset conventions' on the ML - offset_cases.append((MonthBegin(), { - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 2, 1): datetime(2008, 3, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 12, 1): datetime(2007, 1, 1), - datetime(2007, 1, 31): datetime(2007, 2, 1)})) - - offset_cases.append((MonthBegin(0), { - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2006, 12, 3): datetime(2007, 1, 1), - datetime(2007, 1, 31): datetime(2007, 2, 1)})) - - offset_cases.append((MonthBegin(2), { - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 1, 31): datetime(2008, 3, 1), - datetime(2006, 12, 31): datetime(2007, 2, 1), - datetime(2007, 12, 28): datetime(2008, 2, 1), - datetime(2007, 1, 1): datetime(2007, 3, 1), - datetime(2006, 11, 1): datetime(2007, 1, 1)})) - - offset_cases.append((MonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 5, 31): datetime(2008, 5, 1), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 1, 2): datetime(2006, 1, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + MonthBegin(), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + 
datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(0), + { + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(2), + { + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + MonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -129,47 +180,73 @@ def test_normalize(self): assert result == expected offset_cases = [] - offset_cases.append((MonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2006, 12, 1): datetime(2006, 12, 31)})) - - offset_cases.append((MonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 31), - datetime(2006, 12, 31): datetime(2006, 12, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31)})) - - offset_cases.append((MonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 3, 31), - datetime(2006, 12, 29): datetime(2007, 1, 31), - datetime(2006, 12, 31): datetime(2007, 2, 28), - datetime(2007, 1, 1): datetime(2007, 2, 28), - datetime(2006, 11, 1): datetime(2006, 12, 31)})) - - offset_cases.append((MonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2008, 5, 31), - datetime(2008, 12, 31): datetime(2008, 11, 30), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 30): datetime(2006, 11, 30), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + MonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + 
datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + MonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(MonthEnd(), datetime(2007, 12, 31), True), - (MonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -185,53 +262,79 @@ def test_offsets_compare_equal(self): assert not offset1 != offset2 offset_cases = [] - offset_cases.append((BMonthBegin(), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 1): datetime(2006, 10, 2), - datetime(2007, 1, 1): datetime(2007, 2, 1), - datetime(2006, 12, 1): datetime(2007, 1, 1)})) - - offset_cases.append((BMonthBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2006, 10, 2): datetime(2006, 10, 2), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2006, 12, 29): datetime(2007, 1, 1), - datetime(2006, 12, 31): datetime(2007, 1, 1), - datetime(2006, 9, 15): datetime(2006, 10, 2)})) - - offset_cases.append((BMonthBegin(2), { - datetime(2008, 1, 1): datetime(2008, 3, 3), - datetime(2008, 1, 15): datetime(2008, 3, 3), - datetime(2006, 12, 29): datetime(2007, 2, 1), - datetime(2006, 12, 31): datetime(2007, 2, 1), - datetime(2007, 1, 1): datetime(2007, 3, 1), - datetime(2006, 11, 1): datetime(2007, 1, 1)})) - - offset_cases.append((BMonthBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 1), - datetime(2008, 6, 30): datetime(2008, 6, 2), - datetime(2008, 6, 1): datetime(2008, 5, 1), - datetime(2008, 3, 10): datetime(2008, 3, 3), - datetime(2008, 12, 31): datetime(2008, 12, 1), - datetime(2006, 12, 29): datetime(2006, 12, 1), - datetime(2006, 12, 30): datetime(2006, 12, 1), - datetime(2007, 1, 1): datetime(2006, 12, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BMonthBegin(), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(2), + { + datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): 
datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BMonthBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BMonthBegin(), datetime(2007, 12, 31), False), - (BMonthBegin(), datetime(2008, 1, 1), True), - (BMonthBegin(), datetime(2001, 4, 2), True), - (BMonthBegin(), datetime(2008, 3, 3), True)] + on_offset_cases = [ + (BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -254,57 +357,83 @@ def test_offsets_compare_equal(self): assert not offset1 != offset2 offset_cases = [] - offset_cases.append((BMonthEnd(), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2006, 12, 29): datetime(2007, 1, 31), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31), - datetime(2006, 12, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BMonthEnd(0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2006, 12, 29): datetime(2006, 12, 29), - datetime(2006, 12, 31): datetime(2007, 1, 31), - datetime(2007, 1, 1): datetime(2007, 1, 31)})) - - offset_cases.append((BMonthEnd(2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 3, 31), - datetime(2006, 12, 29): datetime(2007, 2, 28), - datetime(2006, 12, 31): datetime(2007, 2, 28), - datetime(2007, 1, 1): datetime(2007, 2, 28), - datetime(2006, 11, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BMonthEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2008, 5, 30), - datetime(2008, 12, 31): datetime(2008, 11, 28), - datetime(2006, 12, 29): datetime(2006, 11, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BMonthEnd(), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 
31), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BMonthEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BMonthEnd(), datetime(2007, 12, 31), True), - (BMonthEnd(), datetime(2008, 1, 1), False)] + on_offset_cases = [ + (BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) + # -------------------------------------------------------------------- # Quarters class TestQuarterBegin(Base): - def test_repr(self): expected = "" assert repr(QuarterBegin()) == expected @@ -324,58 +453,88 @@ def test_offset_corner_case(self): assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 1) offset_cases = [] - offset_cases.append((QuarterBegin(startingMonth=1), { - datetime(2007, 12, 1): datetime(2008, 1, 1), - datetime(2008, 1, 1): datetime(2008, 4, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2008, 4, 1): datetime(2008, 7, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 2, 29): datetime(2008, 5, 1), - datetime(2008, 3, 15): datetime(2008, 5, 1), - datetime(2008, 3, 31): datetime(2008, 5, 1), - datetime(2008, 4, 15): datetime(2008, 5, 1), - datetime(2008, 4, 30): datetime(2008, 5, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 12, 1): datetime(2009, 1, 1), - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2008, 4, 30): datetime(2008, 7, 1)})) - - offset_cases.append((QuarterBegin(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 1), - datetime(2008, 1, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 1, 1), - datetime(2008, 2, 29): datetime(2008, 1, 1), - datetime(2008, 3, 15): datetime(2008, 1, 1), - datetime(2008, 3, 31): datetime(2008, 1, 1), - datetime(2008, 4, 15): datetime(2008, 4, 1), - datetime(2008, 4, 30): datetime(2008, 4, 1), - datetime(2008, 7, 1): datetime(2008, 4, 1)})) - - 
offset_cases.append((QuarterBegin(startingMonth=1, n=2), { - datetime(2008, 1, 1): datetime(2008, 7, 1), - datetime(2008, 2, 15): datetime(2008, 7, 1), - datetime(2008, 2, 29): datetime(2008, 7, 1), - datetime(2008, 3, 15): datetime(2008, 7, 1), - datetime(2008, 3, 31): datetime(2008, 7, 1), - datetime(2008, 4, 15): datetime(2008, 10, 1), - datetime(2008, 4, 1): datetime(2008, 10, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + QuarterBegin(startingMonth=1), + { + datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + QuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -404,57 +563,87 @@ def test_offset_corner_case(self): assert datetime(2010, 2, 1) + offset == datetime(2010, 1, 31) offset_cases = [] - offset_cases.append((QuarterEnd(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 4, 30), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 7, 31)})) - - offset_cases.append((QuarterEnd(startingMonth=2), { - 
datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2008, 2, 15): datetime(2008, 2, 29), - datetime(2008, 2, 29): datetime(2008, 5, 31), - datetime(2008, 3, 15): datetime(2008, 5, 31), - datetime(2008, 3, 31): datetime(2008, 5, 31), - datetime(2008, 4, 15): datetime(2008, 5, 31), - datetime(2008, 4, 30): datetime(2008, 5, 31)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 4, 30)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 31), - datetime(2008, 1, 31): datetime(2007, 10, 31), - datetime(2008, 2, 15): datetime(2008, 1, 31), - datetime(2008, 2, 29): datetime(2008, 1, 31), - datetime(2008, 3, 15): datetime(2008, 1, 31), - datetime(2008, 3, 31): datetime(2008, 1, 31), - datetime(2008, 4, 15): datetime(2008, 1, 31), - datetime(2008, 4, 30): datetime(2008, 1, 31), - datetime(2008, 7, 1): datetime(2008, 4, 30)})) - - offset_cases.append((QuarterEnd(startingMonth=1, n=2), { - datetime(2008, 1, 31): datetime(2008, 7, 31), - datetime(2008, 2, 15): datetime(2008, 7, 31), - datetime(2008, 2, 29): datetime(2008, 7, 31), - datetime(2008, 3, 15): datetime(2008, 7, 31), - datetime(2008, 3, 31): datetime(2008, 7, 31), - datetime(2008, 4, 15): datetime(2008, 7, 31), - datetime(2008, 4, 30): datetime(2008, 10, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + QuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): 
datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + QuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -490,9 +679,10 @@ def test_offset(self, case): (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False), (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False), - (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True)] + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -520,73 +710,103 @@ def test_offset_corner_case(self): assert datetime(2007, 4, 3) + offset == datetime(2007, 4, 2) offset_cases = [] - offset_cases.append((BQuarterBegin(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 4, 1), - datetime(2008, 1, 31): datetime(2008, 4, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2008, 3, 31): datetime(2008, 4, 1), - datetime(2008, 4, 15): datetime(2008, 7, 1), - datetime(2007, 3, 15): datetime(2007, 4, 2), - datetime(2007, 2, 28): datetime(2007, 4, 2), - datetime(2007, 1, 1): datetime(2007, 4, 2), - datetime(2007, 4, 15): datetime(2007, 7, 2), - datetime(2007, 7, 1): datetime(2007, 7, 2), - datetime(2007, 4, 1): datetime(2007, 4, 2), - datetime(2007, 4, 2): datetime(2007, 7, 2), - datetime(2008, 4, 30): datetime(2008, 7, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 1), - datetime(2008, 1, 31): datetime(2008, 2, 1), - datetime(2008, 1, 15): datetime(2008, 2, 1), - datetime(2008, 2, 29): datetime(2008, 5, 1), - datetime(2008, 3, 15): datetime(2008, 5, 1), - datetime(2008, 3, 31): datetime(2008, 5, 1), - datetime(2008, 4, 15): datetime(2008, 5, 1), - datetime(2008, 8, 15): datetime(2008, 11, 3), - datetime(2008, 9, 15): datetime(2008, 11, 3), - datetime(2008, 11, 1): datetime(2008, 11, 3), - datetime(2008, 4, 30): datetime(2008, 5, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2007, 12, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): datetime(2008, 4, 1), - datetime(2008, 2, 29): datetime(2008, 4, 1), - datetime(2008, 1, 15): datetime(2008, 4, 1), - datetime(2008, 2, 27): datetime(2008, 4, 1), - datetime(2008, 3, 15): datetime(2008, 4, 1), - datetime(2007, 4, 1): datetime(2007, 4, 2), - datetime(2007, 4, 2): datetime(2007, 4, 2), - datetime(2007, 7, 1): datetime(2007, 7, 2), - datetime(2007, 4, 15): datetime(2007, 7, 2), - datetime(2007, 7, 2): datetime(2007, 7, 2)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 1), - datetime(2008, 1, 31): datetime(2008, 1, 1), - datetime(2008, 2, 15): 
datetime(2008, 1, 1), - datetime(2008, 2, 29): datetime(2008, 1, 1), - datetime(2008, 3, 15): datetime(2008, 1, 1), - datetime(2008, 3, 31): datetime(2008, 1, 1), - datetime(2008, 4, 15): datetime(2008, 4, 1), - datetime(2007, 7, 3): datetime(2007, 7, 2), - datetime(2007, 4, 3): datetime(2007, 4, 2), - datetime(2007, 7, 2): datetime(2007, 4, 2), - datetime(2008, 4, 1): datetime(2008, 1, 1)})) - - offset_cases.append((BQuarterBegin(startingMonth=1, n=2), { - datetime(2008, 1, 1): datetime(2008, 7, 1), - datetime(2008, 1, 15): datetime(2008, 7, 1), - datetime(2008, 2, 29): datetime(2008, 7, 1), - datetime(2008, 3, 15): datetime(2008, 7, 1), - datetime(2007, 3, 31): datetime(2007, 7, 2), - datetime(2007, 4, 15): datetime(2007, 10, 1), - datetime(2008, 4, 30): datetime(2008, 10, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BQuarterBegin(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + BQuarterBegin(startingMonth=1, n=2), + { + datetime(2008, 
1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -615,56 +835,86 @@ def test_offset_corner_case(self): assert datetime(2010, 1, 31) + offset == datetime(2010, 1, 29) offset_cases = [] - offset_cases.append((BQuarterEnd(startingMonth=1), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 4, 30), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 7, 31)})) - - offset_cases.append((BQuarterEnd(startingMonth=2), { - datetime(2008, 1, 1): datetime(2008, 2, 29), - datetime(2008, 1, 31): datetime(2008, 2, 29), - datetime(2008, 2, 15): datetime(2008, 2, 29), - datetime(2008, 2, 29): datetime(2008, 5, 30), - datetime(2008, 3, 15): datetime(2008, 5, 30), - datetime(2008, 3, 31): datetime(2008, 5, 30), - datetime(2008, 4, 15): datetime(2008, 5, 30), - datetime(2008, 4, 30): datetime(2008, 5, 30)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=0), { - datetime(2008, 1, 1): datetime(2008, 1, 31), - datetime(2008, 1, 31): datetime(2008, 1, 31), - datetime(2008, 2, 15): datetime(2008, 4, 30), - datetime(2008, 2, 29): datetime(2008, 4, 30), - datetime(2008, 3, 15): datetime(2008, 4, 30), - datetime(2008, 3, 31): datetime(2008, 4, 30), - datetime(2008, 4, 15): datetime(2008, 4, 30), - datetime(2008, 4, 30): datetime(2008, 4, 30)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=-1), { - datetime(2008, 1, 1): datetime(2007, 10, 31), - datetime(2008, 1, 31): datetime(2007, 10, 31), - datetime(2008, 2, 15): datetime(2008, 1, 31), - datetime(2008, 2, 29): datetime(2008, 1, 31), - datetime(2008, 3, 15): datetime(2008, 1, 31), - datetime(2008, 3, 31): datetime(2008, 1, 31), - datetime(2008, 4, 15): datetime(2008, 1, 31), - datetime(2008, 4, 30): datetime(2008, 1, 31)})) - - offset_cases.append((BQuarterEnd(startingMonth=1, n=2), { - datetime(2008, 1, 31): datetime(2008, 7, 31), - datetime(2008, 2, 15): datetime(2008, 7, 31), - datetime(2008, 2, 29): datetime(2008, 7, 31), - datetime(2008, 3, 15): datetime(2008, 7, 31), - datetime(2008, 3, 31): datetime(2008, 7, 31), - datetime(2008, 4, 15): datetime(2008, 7, 31), - datetime(2008, 4, 30): datetime(2008, 10, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BQuarterEnd(startingMonth=1), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=2), + { + datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + 
datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=0), + { + datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=-1), + { + datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + }, + ) + ) + + offset_cases.append( + ( + BQuarterEnd(startingMonth=1, n=2), + { + datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -697,13 +947,15 @@ def test_offset(self, case): (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), - (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False)] + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) + # -------------------------------------------------------------------- # Years @@ -716,85 +968,147 @@ def test_misspecified(self): YearBegin(month=13) offset_cases = [] - offset_cases.append((YearBegin(), { - datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 1), - datetime(2005, 12, 31): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(3), { - datetime(2008, 1, 1): datetime(2011, 1, 1), - datetime(2008, 6, 30): datetime(2011, 1, 1), - datetime(2008, 12, 31): datetime(2011, 1, 1), - datetime(2005, 12, 30): datetime(2008, 1, 1), - datetime(2005, 12, 31): datetime(2008, 1, 1)})) - - offset_cases.append((YearBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 1, 1), - datetime(2007, 1, 15): datetime(2007, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): 
datetime(2006, 1, 1), - datetime(2006, 12, 30): datetime(2006, 1, 1), - datetime(2007, 1, 1): datetime(2006, 1, 1)})) - - offset_cases.append((YearBegin(-2), { - datetime(2007, 1, 1): datetime(2005, 1, 1), - datetime(2008, 6, 30): datetime(2007, 1, 1), - datetime(2008, 12, 31): datetime(2007, 1, 1)})) - - offset_cases.append((YearBegin(month=4), { - datetime(2007, 4, 1): datetime(2008, 4, 1), - datetime(2007, 4, 15): datetime(2008, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1)})) - - offset_cases.append((YearBegin(0, month=4), { - datetime(2007, 4, 1): datetime(2007, 4, 1), - datetime(2007, 3, 1): datetime(2007, 4, 1), - datetime(2007, 12, 15): datetime(2008, 4, 1), - datetime(2012, 1, 31): datetime(2012, 4, 1)})) - - offset_cases.append((YearBegin(4, month=4), { - datetime(2007, 4, 1): datetime(2011, 4, 1), - datetime(2007, 4, 15): datetime(2011, 4, 1), - datetime(2007, 3, 1): datetime(2010, 4, 1), - datetime(2007, 12, 15): datetime(2011, 4, 1), - datetime(2012, 1, 31): datetime(2015, 4, 1)})) - - offset_cases.append((YearBegin(-1, month=4), { - datetime(2007, 4, 1): datetime(2006, 4, 1), - datetime(2007, 3, 1): datetime(2006, 4, 1), - datetime(2007, 12, 15): datetime(2007, 4, 1), - datetime(2012, 1, 31): datetime(2011, 4, 1)})) - - offset_cases.append((YearBegin(-3, month=4), { - datetime(2007, 4, 1): datetime(2004, 4, 1), - datetime(2007, 3, 1): datetime(2004, 4, 1), - datetime(2007, 12, 15): datetime(2005, 4, 1), - datetime(2012, 1, 31): datetime(2009, 4, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(3), + { + datetime(2008, 1, 1): datetime(2011, 1, 1), + datetime(2008, 6, 30): datetime(2011, 1, 1), + datetime(2008, 12, 31): datetime(2011, 1, 1), + datetime(2005, 12, 30): datetime(2008, 1, 1), + datetime(2005, 12, 31): datetime(2008, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2007, 1, 15): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(month=4), + { + datetime(2007, 4, 1): datetime(2008, 4, 1), + datetime(2007, 4, 15): datetime(2008, 4, 1), + datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(0, month=4), + { + datetime(2007, 4, 1): datetime(2007, 4, 1), + 
datetime(2007, 3, 1): datetime(2007, 4, 1), + datetime(2007, 12, 15): datetime(2008, 4, 1), + datetime(2012, 1, 31): datetime(2012, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(4, month=4), + { + datetime(2007, 4, 1): datetime(2011, 4, 1), + datetime(2007, 4, 15): datetime(2011, 4, 1), + datetime(2007, 3, 1): datetime(2010, 4, 1), + datetime(2007, 12, 15): datetime(2011, 4, 1), + datetime(2012, 1, 31): datetime(2015, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-1, month=4), + { + datetime(2007, 4, 1): datetime(2006, 4, 1), + datetime(2007, 3, 1): datetime(2006, 4, 1), + datetime(2007, 12, 15): datetime(2007, 4, 1), + datetime(2012, 1, 31): datetime(2011, 4, 1), + }, + ) + ) + + offset_cases.append( + ( + YearBegin(-3, month=4), + { + datetime(2007, 4, 1): datetime(2004, 4, 1), + datetime(2007, 3, 1): datetime(2004, 4, 1), + datetime(2007, 12, 15): datetime(2005, 4, 1), + datetime(2012, 1, 31): datetime(2009, 4, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearBegin(), datetime(2007, 1, 3), False), - (YearBegin(), datetime(2008, 1, 1), True), - (YearBegin(), datetime(2006, 12, 31), False), - (YearBegin(), datetime(2006, 1, 2), False)] + on_offset_cases = [ + (YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -808,44 +1122,70 @@ def test_misspecified(self): YearEnd(month=13) offset_cases = [] - offset_cases.append((YearEnd(), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 31)})) - - offset_cases.append((YearEnd(0), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 30): datetime(2005, 12, 31)})) - - offset_cases.append((YearEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 31), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 31), - datetime(2006, 12, 30): datetime(2005, 12, 31), - datetime(2007, 1, 1): datetime(2006, 12, 31)})) - - offset_cases.append((YearEnd(-2), { - datetime(2007, 1, 1): datetime(2005, 12, 31), - datetime(2008, 6, 30): datetime(2006, 12, 31), - datetime(2008, 12, 31): datetime(2006, 12, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + 
YearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearEnd(), datetime(2007, 12, 31), True), - (YearEnd(), datetime(2008, 1, 1), False), - (YearEnd(), datetime(2006, 12, 31), True), - (YearEnd(), datetime(2006, 12, 29), False)] + on_offset_cases = [ + (YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -853,45 +1193,71 @@ def test_onOffset(self, case): class TestYearEndDiffMonth(Base): offset_cases = [] - offset_cases.append((YearEnd(month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 15): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2009, 3, 31), - datetime(2008, 3, 30): datetime(2008, 3, 31), - datetime(2005, 3, 31): datetime(2006, 3, 31), - datetime(2006, 7, 30): datetime(2007, 3, 31)})) - - offset_cases.append((YearEnd(0, month=3), - {datetime(2008, 1, 1): datetime(2008, 3, 31), - datetime(2008, 2, 28): datetime(2008, 3, 31), - datetime(2008, 3, 31): datetime(2008, 3, 31), - datetime(2005, 3, 30): datetime(2005, 3, 31)})) - - offset_cases.append((YearEnd(-1, month=3), - {datetime(2007, 1, 1): datetime(2006, 3, 31), - datetime(2008, 2, 28): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2007, 3, 31), - datetime(2006, 3, 29): datetime(2005, 3, 31), - datetime(2006, 3, 30): datetime(2005, 3, 31), - datetime(2007, 3, 1): datetime(2006, 3, 31)})) - - offset_cases.append((YearEnd(-2, month=3), - {datetime(2007, 1, 1): datetime(2005, 3, 31), - datetime(2008, 6, 30): datetime(2007, 3, 31), - datetime(2008, 3, 31): datetime(2006, 3, 31)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + YearEnd(month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(0, month=3), + { + datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31), + }, + ) + ) + + offset_cases.append( + ( + YearEnd(-1, month=3), + { + datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31), + }, + ) + ) + + 
offset_cases.append( + ( + YearEnd(-2, month=3), + { + datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(YearEnd(month=3), datetime(2007, 3, 31), True), - (YearEnd(month=3), datetime(2008, 1, 1), False), - (YearEnd(month=3), datetime(2006, 3, 31), True), - (YearEnd(month=3), datetime(2006, 3, 29), False)] + on_offset_cases = [ + (YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -908,38 +1274,62 @@ def test_misspecified(self): BYearEnd(month=13) offset_cases = [] - offset_cases.append((BYearBegin(), { - datetime(2008, 1, 1): datetime(2009, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2011, 1, 1): datetime(2011, 1, 3), - datetime(2011, 1, 3): datetime(2012, 1, 2), - datetime(2005, 12, 30): datetime(2006, 1, 2), - datetime(2005, 12, 31): datetime(2006, 1, 2)})) - - offset_cases.append((BYearBegin(0), { - datetime(2008, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2009, 1, 1), - datetime(2008, 12, 31): datetime(2009, 1, 1), - datetime(2005, 12, 30): datetime(2006, 1, 2), - datetime(2005, 12, 31): datetime(2006, 1, 2)})) - - offset_cases.append((BYearBegin(-1), { - datetime(2007, 1, 1): datetime(2006, 1, 2), - datetime(2009, 1, 4): datetime(2009, 1, 1), - datetime(2009, 1, 1): datetime(2008, 1, 1), - datetime(2008, 6, 30): datetime(2008, 1, 1), - datetime(2008, 12, 31): datetime(2008, 1, 1), - datetime(2006, 12, 29): datetime(2006, 1, 2), - datetime(2006, 12, 30): datetime(2006, 1, 2), - datetime(2006, 1, 1): datetime(2005, 1, 3)})) - - offset_cases.append((BYearBegin(-2), { - datetime(2007, 1, 1): datetime(2005, 1, 3), - datetime(2007, 6, 30): datetime(2006, 1, 2), - datetime(2008, 12, 31): datetime(2007, 1, 1)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearBegin(), + { + datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1): datetime(2011, 1, 3), + datetime(2011, 1, 3): datetime(2012, 1, 2), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(0), + { + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-1), + { + datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 
3), + }, + ) + ) + + offset_cases.append( + ( + BYearBegin(-2), + { + datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -950,44 +1340,70 @@ class TestBYearEnd(Base): _offset = BYearEnd offset_cases = [] - offset_cases.append((BYearEnd(), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2009, 12, 31), - datetime(2005, 12, 30): datetime(2006, 12, 29), - datetime(2005, 12, 31): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(0), { - datetime(2008, 1, 1): datetime(2008, 12, 31), - datetime(2008, 6, 30): datetime(2008, 12, 31), - datetime(2008, 12, 31): datetime(2008, 12, 31), - datetime(2005, 12, 31): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(-1), { - datetime(2007, 1, 1): datetime(2006, 12, 29), - datetime(2008, 6, 30): datetime(2007, 12, 31), - datetime(2008, 12, 31): datetime(2007, 12, 31), - datetime(2006, 12, 29): datetime(2005, 12, 30), - datetime(2006, 12, 30): datetime(2006, 12, 29), - datetime(2007, 1, 1): datetime(2006, 12, 29)})) - - offset_cases.append((BYearEnd(-2), { - datetime(2007, 1, 1): datetime(2005, 12, 30), - datetime(2008, 6, 30): datetime(2006, 12, 29), - datetime(2008, 12, 31): datetime(2006, 12, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearEnd(), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(0), + { + datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-1), + { + datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(-2), + { + datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): assert_offset_equal(offset, base, expected) - on_offset_cases = [(BYearEnd(), datetime(2007, 12, 31), True), - (BYearEnd(), datetime(2008, 1, 1), False), - (BYearEnd(), datetime(2006, 12, 31), False), - (BYearEnd(), datetime(2006, 12, 29), True)] + on_offset_cases = [ + (BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) @@ -1004,15 +1420,27 @@ def test_bad_month_fail(self): BYearEnd(month=0) 
offset_cases = [] - offset_cases.append((BYearEnd(month=6), { - datetime(2008, 1, 1): datetime(2008, 6, 30), - datetime(2007, 6, 30): datetime(2008, 6, 30)})) - - offset_cases.append((BYearEnd(n=-1, month=6), { - datetime(2008, 1, 1): datetime(2007, 6, 29), - datetime(2007, 6, 30): datetime(2007, 6, 29)})) - - @pytest.mark.parametrize('case', offset_cases) + offset_cases.append( + ( + BYearEnd(month=6), + { + datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30), + }, + ) + ) + + offset_cases.append( + ( + BYearEnd(n=-1, month=6), + { + datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29), + }, + ) + ) + + @pytest.mark.parametrize("case", offset_cases) def test_offset(self, case): offset, cases = case for base, expected in cases.items(): @@ -1025,10 +1453,12 @@ def test_roll(self): assert offset.rollforward(date) == datetime(2010, 6, 30) assert offset.rollback(date) == datetime(2009, 6, 30) - on_offset_cases = [(BYearEnd(month=2), datetime(2007, 2, 28), True), - (BYearEnd(month=6), datetime(2007, 6, 30), False)] + on_offset_cases = [ + (BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False), + ] - @pytest.mark.parametrize('case', on_offset_cases) + @pytest.mark.parametrize("case", on_offset_cases) def test_onOffset(self, case): offset, dt, expected = case assert_onOffset(offset, dt, expected) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 8474cb17fc28a..47e398dfe3d16 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -5,38 +5,42 @@ def test_namespace(): - submodules = ['c_timestamp', - 'ccalendar', - 'conversion', - 'fields', - 'frequencies', - 'nattype', - 'np_datetime', - 'offsets', - 'parsing', - 'period', - 'resolution', - 'strptime', - 'timedeltas', - 'timestamps', - 'timezones', - 'tzconversion'] + submodules = [ + "c_timestamp", + "ccalendar", + "conversion", + "fields", + "frequencies", + "nattype", + "np_datetime", + "offsets", + "parsing", + "period", + "resolution", + "strptime", + "timedeltas", + "timestamps", + "timezones", + "tzconversion", + ] - api = ['NaT', - 'NaTType', - 'iNaT', - 'is_null_datetimelike', - 'OutOfBoundsDatetime', - 'Period', - 'IncompatibleFrequency', - 'Timedelta', - 'Timestamp', - 'delta_to_nanoseconds', - 'ints_to_pytimedelta', - 'localize_pydatetime', - 'normalize_date', - 'tz_convert_single'] + api = [ + "NaT", + "NaTType", + "iNaT", + "is_null_datetimelike", + "OutOfBoundsDatetime", + "Period", + "IncompatibleFrequency", + "Timedelta", + "Timestamp", + "delta_to_nanoseconds", + "ints_to_pytimedelta", + "localize_pydatetime", + "normalize_date", + "tz_convert_single", + ] expected = set(submodules + api) - names = [x for x in dir(tslibs) if not x.startswith('__')] + names = [x for x in dir(tslibs) if not x.startswith("__")] assert set(names) == expected diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 680be445e657e..5cf2165993cd7 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -12,14 +12,25 @@ import pandas.util.testing as tm -@pytest.mark.parametrize("data,expected", [ - (["01-01-2013", "01-02-2013"], - ["2013-01-01T00:00:00.000000000-0000", - "2013-01-02T00:00:00.000000000-0000"]), - (["Mon Sep 16 2013", "Tue Sep 17 2013"], - ["2013-09-16T00:00:00.000000000-0000", - "2013-09-17T00:00:00.000000000-0000"]) -]) 
+@pytest.mark.parametrize( + "data,expected", + [ + ( + ["01-01-2013", "01-02-2013"], + [ + "2013-01-01T00:00:00.000000000-0000", + "2013-01-02T00:00:00.000000000-0000", + ], + ), + ( + ["Mon Sep 16 2013", "Tue Sep 17 2013"], + [ + "2013-09-16T00:00:00.000000000-0000", + "2013-09-17T00:00:00.000000000-0000", + ], + ), + ], +) def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) @@ -28,12 +39,15 @@ def test_parsing_valid_dates(data, expected): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dt_string, expected_tz", [ - ["01-01-2013 08:00:00+08:00", 480], - ["2013-01-01T08:00:00.000000000+0800", 480], - ["2012-12-31T16:00:00.000000000-0800", -480], - ["12-31-2012 23:00:00-01:00", -60] -]) +@pytest.mark.parametrize( + "dt_string, expected_tz", + [ + ["01-01-2013 08:00:00+08:00", 480], + ["2013-01-01T08:00:00.000000000+0800", 480], + ["2012-12-31T16:00:00.000000000-0800", -480], + ["12-31-2012 23:00:00-01:00", -60], + ], +) def test_parsing_timezone_offsets(dt_string, expected_tz): # All of these datetime strings with offsets are equivalent # to the same datetime after the timezone offset is added. @@ -64,20 +78,21 @@ def test_parsing_different_timezone_offsets(): data = np.array(data, dtype=object) result, result_tz = tslib.array_to_datetime(data) - expected = np.array([datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 19800)), - datetime(2015, 11, 18, 15, 30, - tzinfo=tzoffset(None, 23400))], - dtype=object) + expected = np.array( + [ + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 19800)), + datetime(2015, 11, 18, 15, 30, tzinfo=tzoffset(None, 23400)), + ], + dtype=object, + ) tm.assert_numpy_array_equal(result, expected) assert result_tz is None -@pytest.mark.parametrize("data", [ - ["-352.737091", "183.575577"], - ["1", "2", "3", "4", "5"] -]) +@pytest.mark.parametrize( + "data", [["-352.737091", "183.575577"], ["1", "2", "3", "4", "5"]] +) def test_number_looking_strings_not_into_datetime(data): # see gh-4601 # @@ -89,12 +104,16 @@ def test_number_looking_strings_not_into_datetime(data): tm.assert_numpy_array_equal(result, arr) -@pytest.mark.parametrize("invalid_date", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01")]) +@pytest.mark.parametrize( + "invalid_date", + [ + date(1000, 1, 1), + datetime(1000, 1, 1), + "1000-01-01", + "Jan 1, 1000", + np.datetime64("1000-01-01"), + ], +) @pytest.mark.parametrize("errors", ["coerce", "raise"]) def test_coerce_outside_ns_bounds(invalid_date, errors): arr = np.array([invalid_date], dtype="object") @@ -135,13 +154,11 @@ def test_coerce_of_invalid_datetimes(errors): else: # coerce. 
# With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = ["2013-01-01T00:00:00.000000000-0000", - iNaT, - iNaT] + expected = ["2013-01-01T00:00:00.000000000-0000", iNaT, iNaT] tm.assert_numpy_array_equal( - result, - np_array_datetime64_compat(expected, dtype="M8[ns]")) + result, np_array_datetime64_compat(expected, dtype="M8[ns]") + ) def test_to_datetime_barely_out_of_bounds(): @@ -160,14 +177,14 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("data,expected", [ - ([SubDatetime(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]), - ([datetime(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]), - ([Timestamp(2000, 1, 1)], - ["2000-01-01T00:00:00.000000000-0000"]) -]) +@pytest.mark.parametrize( + "data,expected", + [ + ([SubDatetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([datetime(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ([Timestamp(2000, 1, 1)], ["2000-01-01T00:00:00.000000000-0000"]), + ], +) def test_datetime_subclass(data, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index f09dca7fb355e..6f6e32411a784 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -6,12 +6,15 @@ from pandas._libs.tslibs import ccalendar -@pytest.mark.parametrize("date_tuple,expected", [ - ((2001, 3, 1), 60), - ((2004, 3, 1), 61), - ((1907, 12, 31), 365), # End-of-year, non-leap year. - ((2004, 12, 31), 366), # End-of-year, leap year. -]) +@pytest.mark.parametrize( + "date_tuple,expected", + [ + ((2001, 3, 1), 60), + ((2004, 3, 1), 61), + ((1907, 12, 31), 365), # End-of-year, non-leap year. + ((2004, 12, 31), 366), # End-of-year, leap year. 
+ ], +) def test_get_day_of_year_numeric(date_tuple, expected): assert ccalendar.get_day_of_year(*date_tuple) == expected diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 9e6516ffeee9c..6c30e2b6c7a1c 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -50,13 +50,17 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): _compare_local_to_utc(tz_didx, utc_didx) -@pytest.mark.parametrize("arr", [ - pytest.param(np.array([], dtype=np.int64), id="empty"), - pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat")]) +@pytest.mark.parametrize( + "arr", + [ + pytest.param(np.array([], dtype=np.int64), id="empty"), + pytest.param(np.array([iNaT], dtype=np.int64), id="all_nat"), + ], +) def test_tz_convert_corner(arr): - result = tzconversion.tz_convert(arr, - timezones.maybe_get_tz("US/Eastern"), - timezones.maybe_get_tz("Asia/Tokyo")) + result = tzconversion.tz_convert( + arr, timezones.maybe_get_tz("US/Eastern"), timezones.maybe_get_tz("Asia/Tokyo") + ) tm.assert_numpy_array_equal(result, arr) @@ -72,15 +76,22 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("dt, expected", [ - pytest.param(Timestamp("2000-01-01"), - Timestamp("2000-01-01", tz=UTC), id="timestamp"), - pytest.param(datetime(2000, 1, 1), - datetime(2000, 1, 1, tzinfo=UTC), - id="datetime"), - pytest.param(SubDatetime(2000, 1, 1), - SubDatetime(2000, 1, 1, tzinfo=UTC), - id="subclassed_datetime")]) +@pytest.mark.parametrize( + "dt, expected", + [ + pytest.param( + Timestamp("2000-01-01"), Timestamp("2000-01-01", tz=UTC), id="timestamp" + ), + pytest.param( + datetime(2000, 1, 1), datetime(2000, 1, 1, tzinfo=UTC), id="datetime" + ), + pytest.param( + SubDatetime(2000, 1, 1), + SubDatetime(2000, 1, 1, tzinfo=UTC), + id="subclassed_datetime", + ), + ], +) def test_localize_pydatetime_dt_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 279e8edebaf31..5810c7e52abca 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -1,94 +1,100 @@ import pytest from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, _period_str_to_code, get_rule_month, is_subperiod, - is_superperiod) + INVALID_FREQ_ERR_MSG, + _period_str_to_code, + get_rule_month, + is_subperiod, + is_superperiod, +) from pandas.tseries import offsets -@pytest.mark.parametrize("obj,expected", [ - ("W", "DEC"), - (offsets.Week(), "DEC"), - - ("D", "DEC"), - (offsets.Day(), "DEC"), - - ("Q", "DEC"), - (offsets.QuarterEnd(startingMonth=12), "DEC"), - - ("Q-JAN", "JAN"), - (offsets.QuarterEnd(startingMonth=1), "JAN"), - - ("A-DEC", "DEC"), - ("Y-DEC", "DEC"), - (offsets.YearEnd(), "DEC"), - - ("A-MAY", "MAY"), - ("Y-MAY", "MAY"), - (offsets.YearEnd(month=5), "MAY") -]) +@pytest.mark.parametrize( + "obj,expected", + [ + ("W", "DEC"), + (offsets.Week(), "DEC"), + ("D", "DEC"), + (offsets.Day(), "DEC"), + ("Q", "DEC"), + (offsets.QuarterEnd(startingMonth=12), "DEC"), + ("Q-JAN", "JAN"), + (offsets.QuarterEnd(startingMonth=1), "JAN"), + ("A-DEC", "DEC"), + ("Y-DEC", "DEC"), + (offsets.YearEnd(), "DEC"), + ("A-MAY", "MAY"), + ("Y-MAY", "MAY"), + (offsets.YearEnd(month=5), "MAY"), + ], +) def test_get_rule_month(obj, expected): result = get_rule_month(obj) assert result == expected -@pytest.mark.parametrize("obj,expected", [ - ("A", 1000), - ("A-DEC", 1000), - 
("A-JAN", 1001), - - ("Y", 1000), - ("Y-DEC", 1000), - ("Y-JAN", 1001), - - ("Q", 2000), - ("Q-DEC", 2000), - ("Q-FEB", 2002), - - ("W", 4000), - ("W-SUN", 4000), - ("W-FRI", 4005), - - ("Min", 8000), - ("ms", 10000), - ("US", 11000), - ("NS", 12000) -]) +@pytest.mark.parametrize( + "obj,expected", + [ + ("A", 1000), + ("A-DEC", 1000), + ("A-JAN", 1001), + ("Y", 1000), + ("Y-DEC", 1000), + ("Y-JAN", 1001), + ("Q", 2000), + ("Q-DEC", 2000), + ("Q-FEB", 2002), + ("W", 4000), + ("W-SUN", 4000), + ("W-FRI", 4005), + ("Min", 8000), + ("ms", 10000), + ("US", 11000), + ("NS", 12000), + ], +) def test_period_str_to_code(obj, expected): assert _period_str_to_code(obj) == expected -@pytest.mark.parametrize("p1,p2,expected", [ - # Input validation. - (offsets.MonthEnd(), None, False), - (offsets.YearEnd(), None, False), - (None, offsets.YearEnd(), False), - (None, offsets.MonthEnd(), False), - (None, None, False), - - (offsets.YearEnd(), offsets.MonthEnd(), True), - (offsets.Hour(), offsets.Minute(), True), - (offsets.Second(), offsets.Milli(), True), - (offsets.Milli(), offsets.Micro(), True), - (offsets.Micro(), offsets.Nano(), True) -]) +@pytest.mark.parametrize( + "p1,p2,expected", + [ + # Input validation. + (offsets.MonthEnd(), None, False), + (offsets.YearEnd(), None, False), + (None, offsets.YearEnd(), False), + (None, offsets.MonthEnd(), False), + (None, None, False), + (offsets.YearEnd(), offsets.MonthEnd(), True), + (offsets.Hour(), offsets.Minute(), True), + (offsets.Second(), offsets.Milli(), True), + (offsets.Milli(), offsets.Micro(), True), + (offsets.Micro(), offsets.Nano(), True), + ], +) def test_super_sub_symmetry(p1, p2, expected): assert is_superperiod(p1, p2) is expected assert is_subperiod(p2, p1) is expected -@pytest.mark.parametrize("freq,expected,aliases", [ - ("D", 6000, ["DAY", "DLY", "DAILY"]), - ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), - ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), - ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), - ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), - ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]), - ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), - ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), - ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), -]) +@pytest.mark.parametrize( + "freq,expected,aliases", + [ + ("D", 6000, ["DAY", "DLY", "DAILY"]), + ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), + ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), + ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), + ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), + ("L", 10000, ["MILLISECOND", "MILLISECONDLY"]), + ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), + ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), + ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), + ], +) def test_assert_aliases_deprecated(freq, expected, aliases): assert isinstance(aliases, list) assert _period_str_to_code(freq) == expected diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py index ca6402d6151e0..6ff2ae669c8df 100644 --- a/pandas/tests/tslibs/test_liboffsets.py +++ b/pandas/tests/tslibs/test_liboffsets.py @@ -16,43 +16,55 @@ def day_opt(request): return request.param -@pytest.mark.parametrize("dt,exp_week_day,exp_last_day", [ - (datetime(2017, 11, 30), 3, 30), # Business day. - (datetime(1993, 10, 31), 6, 29) # Non-business day. -]) +@pytest.mark.parametrize( + "dt,exp_week_day,exp_last_day", + [ + (datetime(2017, 11, 30), 3, 30), # Business day. + (datetime(1993, 10, 31), 6, 29), # Non-business day. 
+ ], +) def test_get_last_bday(dt, exp_week_day, exp_last_day): assert dt.weekday() == exp_week_day assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day -@pytest.mark.parametrize("dt,exp_week_day,exp_first_day", [ - (datetime(2017, 4, 1), 5, 3), # Non-weekday. - (datetime(1993, 10, 1), 4, 1) # Business day. -]) +@pytest.mark.parametrize( + "dt,exp_week_day,exp_first_day", + [ + (datetime(2017, 4, 1), 5, 3), # Non-weekday. + (datetime(1993, 10, 1), 4, 1), # Business day. + ], +) def test_get_first_bday(dt, exp_week_day, exp_first_day): assert dt.weekday() == exp_week_day assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day -@pytest.mark.parametrize("months,day_opt,expected", [ - (0, 15, datetime(2017, 11, 15)), - (0, None, datetime(2017, 11, 30)), - (1, "start", datetime(2017, 12, 1)), - (-145, "end", datetime(2005, 10, 31)), - (0, "business_end", datetime(2017, 11, 30)), - (0, "business_start", datetime(2017, 11, 1)) -]) +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (0, 15, datetime(2017, 11, 15)), + (0, None, datetime(2017, 11, 30)), + (1, "start", datetime(2017, 12, 1)), + (-145, "end", datetime(2005, 10, 31)), + (0, "business_end", datetime(2017, 11, 30)), + (0, "business_start", datetime(2017, 11, 1)), + ], +) def test_shift_month_dt(months, day_opt, expected): dt = datetime(2017, 11, 30) assert liboffsets.shift_month(dt, months, day_opt=day_opt) == expected -@pytest.mark.parametrize("months,day_opt,expected", [ - (1, "start", Timestamp("1929-06-01")), - (-3, "end", Timestamp("1929-02-28")), - (25, None, Timestamp("1931-06-5")), - (-1, 31, Timestamp("1929-04-30")) -]) +@pytest.mark.parametrize( + "months,day_opt,expected", + [ + (1, "start", Timestamp("1929-06-01")), + (-3, "end", Timestamp("1929-02-28")), + (25, None, Timestamp("1931-06-5")), + (-1, 31, Timestamp("1929-04-30")), + ], +) def test_shift_month_ts(months, day_opt, expected): ts = Timestamp("1929-05-05") assert liboffsets.shift_month(ts, months, day_opt=day_opt) == expected @@ -66,13 +78,15 @@ def test_shift_month_error(): liboffsets.shift_month(dt, 3, day_opt=day_opt) -@pytest.mark.parametrize("other,expected", [ - # Before March 1. - (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}), - - # After March 1. - (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1}) -]) +@pytest.mark.parametrize( + "other,expected", + [ + # Before March 1. + (datetime(2017, 2, 10), {2: 1, -7: -7, 0: 0}), + # After March 1. + (Timestamp("2014-03-15", tz="US/Eastern"), {2: 2, -7: -6, 0: 1}), + ], +) @pytest.mark.parametrize("n", [2, -7, 0]) def test_roll_yearday(other, expected, n): month = 3 @@ -81,13 +95,15 @@ def test_roll_yearday(other, expected, n): assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] -@pytest.mark.parametrize("other,expected", [ - # Before June 30. - (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}), - - # After June 30. - (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1}) -]) +@pytest.mark.parametrize( + "other,expected", + [ + # Before June 30. + (datetime(1999, 6, 29), {5: 4, -7: -7, 0: 0}), + # After June 30. 
+ (Timestamp(2072, 8, 24, 6, 17, 18), {5: 5, -7: -6, 0: 1}), + ], +) @pytest.mark.parametrize("n", [5, -7, 0]) def test_roll_yearday2(other, expected, n): month = 6 @@ -107,56 +123,37 @@ def test_get_day_of_month_error(): liboffsets.roll_yearday(dt, n=3, month=11, day_opt=day_opt) -@pytest.mark.parametrize("month", [ - 3, # (other.month % 3) < (month % 3) - 5 # (other.month % 3) > (month % 3) -]) +@pytest.mark.parametrize( + "month", + [3, 5], # (other.month % 3) < (month % 3) # (other.month % 3) > (month % 3) +) @pytest.mark.parametrize("n", [4, -3]) def test_roll_qtr_day_not_mod_unequal(day_opt, month, n): - expected = { - 3: { - -3: -2, - 4: 4 - }, - 5: { - -3: -3, - 4: 3 - } - } + expected = {3: {-3: -2, 4: 4}, 5: {-3: -3, 4: 3}} other = Timestamp(2072, 10, 1, 6, 17, 18) # Saturday. assert roll_qtrday(other, n, month, day_opt, modby=3) == expected[month][n] -@pytest.mark.parametrize("other,month,exp_dict", [ - # Monday. - (datetime(1999, 5, 31), 2, { - -1: { - "start": 0, - "business_start": 0 - } - }), - - # Saturday. - (Timestamp(2072, 10, 1, 6, 17, 18), 4, { - 2: { - "end": 1, - "business_end": 1, - "business_start": 1 - } - }), - - # First business day. - (Timestamp(2072, 10, 3, 6, 17, 18), 4, { - 2: { - "end": 1, - "business_end": 1 - }, - -1: { - "start": 0 - } - }) -]) +@pytest.mark.parametrize( + "other,month,exp_dict", + [ + # Monday. + (datetime(1999, 5, 31), 2, {-1: {"start": 0, "business_start": 0}}), + # Saturday. + ( + Timestamp(2072, 10, 1, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1, "business_start": 1}}, + ), + # First business day. + ( + Timestamp(2072, 10, 3, 6, 17, 18), + 4, + {2: {"end": 1, "business_end": 1}, -1: {"start": 0}}, + ), + ], +) @pytest.mark.parametrize("n", [2, -1]) def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt): # All cases have (other.month % 3) == (month % 3). 
@@ -164,10 +161,9 @@ def test_roll_qtr_day_mod_equal(other, month, exp_dict, n, day_opt): assert roll_qtrday(other, n, month, day_opt, modby=3) == expected -@pytest.mark.parametrize("n,expected", [ - (42, {29: 42, 1: 42, 31: 41}), - (-4, {29: -4, 1: -3, 31: -4}) -]) +@pytest.mark.parametrize( + "n,expected", [(42, {29: 42, 1: 42, 31: 41}), (-4, {29: -4, 1: -3, 31: -4})] +) @pytest.mark.parametrize("compare", [29, 1, 31]) def test_roll_convention(n, expected, compare): assert liboffsets.roll_convention(29, n, compare) == expected[compare] diff --git a/pandas/tests/tslibs/test_normalize_date.py b/pandas/tests/tslibs/test_normalize_date.py index e169b1c7aa505..2a41836f456ec 100644 --- a/pandas/tests/tslibs/test_normalize_date.py +++ b/pandas/tests/tslibs/test_normalize_date.py @@ -8,11 +8,14 @@ from pandas._libs.tslibs.timestamps import Timestamp -@pytest.mark.parametrize("value,expected", [ - (date(2012, 9, 7), datetime(2012, 9, 7)), - (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)), - (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1)) -]) +@pytest.mark.parametrize( + "value,expected", + [ + (date(2012, 9, 7), datetime(2012, 9, 7)), + (datetime(2012, 9, 7, 12), datetime(2012, 9, 7)), + (datetime(2007, 10, 1, 1, 12, 5, 10), datetime(2007, 10, 1)), + ], +) def test_normalize_date(value, expected): result = tslibs.normalize_date(value) assert result == expected @@ -22,10 +25,14 @@ class SubDatetime(datetime): pass -@pytest.mark.parametrize("dt, expected", [ - (Timestamp(2000, 1, 1, 1), Timestamp(2000, 1, 1, 0)), - (datetime(2000, 1, 1, 1), datetime(2000, 1, 1, 0)), - (SubDatetime(2000, 1, 1, 1), SubDatetime(2000, 1, 1, 0))]) +@pytest.mark.parametrize( + "dt, expected", + [ + (Timestamp(2000, 1, 1, 1), Timestamp(2000, 1, 1, 0)), + (datetime(2000, 1, 1, 1), datetime(2000, 1, 1, 0)), + (SubDatetime(2000, 1, 1, 1), SubDatetime(2000, 1, 1, 0)), + ], +) def test_normalize_date_sub_types(dt, expected): # GH 25851 # ensure that subclassed datetime works with diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index 8c995f243a993..a6e7aee46b485 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -5,17 +5,21 @@ from pandas._libs import tslib -@pytest.mark.parametrize("date_str, exp", [ - ("2011-01-02", datetime(2011, 1, 2)), - ("2011-1-2", datetime(2011, 1, 2)), - ("2011-01", datetime(2011, 1, 1)), - ("2011-1", datetime(2011, 1, 1)), - ("2011 01 02", datetime(2011, 1, 2)), - ("2011.01.02", datetime(2011, 1, 2)), - ("2011/01/02", datetime(2011, 1, 2)), - ("2011\\01\\02", datetime(2011, 1, 2)), - ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)), - ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30))]) +@pytest.mark.parametrize( + "date_str, exp", + [ + ("2011-01-02", datetime(2011, 1, 2)), + ("2011-1-2", datetime(2011, 1, 2)), + ("2011-01", datetime(2011, 1, 1)), + ("2011-1", datetime(2011, 1, 1)), + ("2011 01 02", datetime(2011, 1, 2)), + ("2011.01.02", datetime(2011, 1, 2)), + ("2011/01/02", datetime(2011, 1, 2)), + ("2011\\01\\02", datetime(2011, 1, 2)), + ("2013-01-01 05:30:00", datetime(2013, 1, 1, 5, 30)), + ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)), + ], +) def test_parsers_iso8601(date_str, exp): # see gh-12060 # @@ -25,28 +29,29 @@ def test_parsers_iso8601(date_str, exp): assert actual == exp -@pytest.mark.parametrize("date_str", [ - "2011-01/02", - "2011=11=11", - "201401", - "201111", - "200101", - - # Mixed separated and unseparated. 
- "2005-0101", - "200501-01", - "20010101 12:3456", - "20010101 1234:56", - - # HHMMSS must have two digits in - # each component if unseparated. - "20010101 1", - "20010101 123", - "20010101 12345", - "20010101 12345Z", -]) +@pytest.mark.parametrize( + "date_str", + [ + "2011-01/02", + "2011=11=11", + "201401", + "201111", + "200101", + # Mixed separated and unseparated. + "2005-0101", + "200501-01", + "20010101 12:3456", + "20010101 1234:56", + # HHMMSS must have two digits in + # each component if unseparated. + "20010101 1", + "20010101 123", + "20010101 12345", + "20010101 12345Z", + ], +) def test_parsers_iso8601_invalid(date_str): - msg = "Error parsing datetime string \"{s}\"".format(s=date_str) + msg = 'Error parsing datetime string "{s}"'.format(s=date_str) with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) @@ -54,8 +59,9 @@ def test_parsers_iso8601_invalid(date_str): def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = ("Timezone hours offset out of range " - "in datetime string \"{s}\"".format(s=date_str)) + msg = "Timezone hours offset out of range " 'in datetime string "{s}"'.format( + s=date_str + ) with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) @@ -64,5 +70,5 @@ def test_parsers_iso8601_invalid_offset_invalid(): def test_parsers_iso8601_leading_space(): # GH#25895 make sure isoparser doesn't overflow with long input date_str, expected = ("2013-1-1 5:30:00", datetime(2013, 1, 1, 5, 30)) - actual = tslib._test_parse_iso8601(' ' * 200 + date_str) + actual = tslib._test_parse_iso8601(" " * 200 + date_str) assert actual == expected diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 6d9b72b67d4c7..700fee2d89f3c 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -23,10 +23,9 @@ def test_parse_time_string(): assert parsed == parsed_lower -@pytest.mark.parametrize("dashed,normal", [ - ("1988-Q2", "1988Q2"), - ("2Q-1988", "2Q1988") -]) +@pytest.mark.parametrize( + "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")] +) def test_parse_time_quarter_with_dash(dashed, normal): # see gh-9688 (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) @@ -37,140 +36,171 @@ def test_parse_time_quarter_with_dash(dashed, normal): assert reso_dash == reso -@pytest.mark.parametrize("dashed", [ - "-2Q1992", "2-Q1992", "4-4Q1992" -]) +@pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"]) def test_parse_time_quarter_with_dash_error(dashed): - msg = ("Unknown datetime string format, " - "unable to parse: {dashed}".format(dashed=dashed)) + msg = "Unknown datetime string format, " "unable to parse: {dashed}".format( + dashed=dashed + ) with pytest.raises(parsing.DateParseError, match=msg): parse_time_string(dashed) -@pytest.mark.parametrize("date_string,expected", [ - ("123.1234", False), - ("-50000", False), - ("999", False), - ("m", False), - ("T", False), - - ("Mon Sep 16, 2013", True), - ("2012-01-01", True), - ("01/01/2012", True), - ("01012012", True), - ("0101", True), - ("1-1", True) -]) +@pytest.mark.parametrize( + "date_string,expected", + [ + ("123.1234", False), + ("-50000", False), + ("999", False), + ("m", False), + ("T", False), + ("Mon Sep 16, 2013", True), + ("2012-01-01", True), + ("01/01/2012", True), + ("01012012", True), + ("0101", True), + ("1-1", True), + ], +) def test_does_not_convert_mixed_integer(date_string, expected): assert 
parsing._does_string_look_like_datetime(date_string) is expected -@pytest.mark.parametrize("date_str,kwargs,msg", [ - ("2013Q5", dict(), - ("Incorrect quarterly string is given, " - "quarter must be between 1 and 4: 2013Q5")), - - # see gh-5418 - ("2013Q1", dict(freq="INVLD-L-DEC-SAT"), - ("Unable to retrieve month information " - "from given freq: INVLD-L-DEC-SAT")) -]) +@pytest.mark.parametrize( + "date_str,kwargs,msg", + [ + ( + "2013Q5", + dict(), + ( + "Incorrect quarterly string is given, " + "quarter must be between 1 and 4: 2013Q5" + ), + ), + # see gh-5418 + ( + "2013Q1", + dict(freq="INVLD-L-DEC-SAT"), + ( + "Unable to retrieve month information " + "from given freq: INVLD-L-DEC-SAT" + ), + ), + ], +) def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): with pytest.raises(parsing.DateParseError, match=msg): parsing.parse_time_string(date_str, **kwargs) -@pytest.mark.parametrize("date_str,freq,expected", [ - ("2013Q2", None, datetime(2013, 4, 1)), - ("2013Q2", "A-APR", datetime(2012, 8, 1)), - ("2013-Q2", "A-DEC", datetime(2013, 4, 1)) -]) +@pytest.mark.parametrize( + "date_str,freq,expected", + [ + ("2013Q2", None, datetime(2013, 4, 1)), + ("2013Q2", "A-APR", datetime(2012, 8, 1)), + ("2013-Q2", "A-DEC", datetime(2013, 4, 1)), + ], +) def test_parsers_quarterly_with_freq(date_str, freq, expected): result, _, _ = parsing.parse_time_string(date_str, freq=freq) assert result == expected -@pytest.mark.parametrize("date_str", [ - "2Q 2005", "2Q-200A", "2Q-200", - "22Q2005", "2Q200.", "6Q-20" -]) +@pytest.mark.parametrize( + "date_str", ["2Q 2005", "2Q-200A", "2Q-200", "22Q2005", "2Q200.", "6Q-20"] +) def test_parsers_quarter_invalid(date_str): if date_str == "6Q-20": - msg = ("Incorrect quarterly string is given, quarter " - "must be between 1 and 4: {date_str}".format(date_str=date_str)) + msg = ( + "Incorrect quarterly string is given, quarter " + "must be between 1 and 4: {date_str}".format(date_str=date_str) + ) else: - msg = ("Unknown datetime string format, unable " - "to parse: {date_str}".format(date_str=date_str)) + msg = "Unknown datetime string format, unable " "to parse: {date_str}".format( + date_str=date_str + ) with pytest.raises(ValueError, match=msg): parsing.parse_time_string(date_str) -@pytest.mark.parametrize("date_str,expected", [ - ("201101", datetime(2011, 1, 1, 0, 0)), - ("200005", datetime(2000, 5, 1, 0, 0)) -]) +@pytest.mark.parametrize( + "date_str,expected", + [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], +) def test_parsers_month_freq(date_str, expected): result, _, _ = parsing.parse_time_string(date_str, freq="M") assert result == expected @td.skip_if_not_us_locale -@pytest.mark.parametrize("string,fmt", [ - ("20111230", "%Y%m%d"), - ("2011-12-30", "%Y-%m-%d"), - ("30-12-2011", "%d-%m-%Y"), - ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), - ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), - ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("20111230", "%Y%m%d"), + ("2011-12-30", "%Y-%m-%d"), + ("30-12-2011", "%d-%m-%Y"), + ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), + ], +) def test_guess_datetime_format_with_parseable_formats(string, fmt): result = parsing._guess_datetime_format(string) assert result == fmt -@pytest.mark.parametrize("dayfirst,expected", [ - (True, "%d/%m/%Y"), - (False, "%m/%d/%Y") -]) 
+@pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")]) def test_guess_datetime_format_with_dayfirst(dayfirst, expected): ambiguous_string = "01/01/2011" - result = parsing._guess_datetime_format(ambiguous_string, - dayfirst=dayfirst) + result = parsing._guess_datetime_format(ambiguous_string, dayfirst=dayfirst) assert result == expected @td.skip_if_has_locale -@pytest.mark.parametrize("string,fmt", [ - ("30/Dec/2011", "%d/%b/%Y"), - ("30/December/2011", "%d/%B/%Y"), - ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("30/Dec/2011", "%d/%b/%Y"), + ("30/December/2011", "%d/%B/%Y"), + ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S"), + ], +) def test_guess_datetime_format_with_locale_specific_formats(string, fmt): result = parsing._guess_datetime_format(string) assert result == fmt -@pytest.mark.parametrize("invalid_dt", [ - "2013", "01/2013", "12:00:00", "1/1/1/1", - "this_is_not_a_datetime", "51a", 9, - datetime(2011, 1, 1) -]) +@pytest.mark.parametrize( + "invalid_dt", + [ + "2013", + "01/2013", + "12:00:00", + "1/1/1/1", + "this_is_not_a_datetime", + "51a", + 9, + datetime(2011, 1, 1), + ], +) def test_guess_datetime_format_invalid_inputs(invalid_dt): # A datetime string must include a year, month and a day for it to be # guessable, in addition to being a string that looks like a datetime. assert parsing._guess_datetime_format(invalid_dt) is None -@pytest.mark.parametrize("string,fmt", [ - ("2011-1-1", "%Y-%m-%d"), - ("1/1/2011", "%m/%d/%Y"), - ("30-1-2011", "%d-%m-%Y"), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S") -]) +@pytest.mark.parametrize( + "string,fmt", + [ + ("2011-1-1", "%Y-%m-%d"), + ("1/1/2011", "%m/%d/%Y"), + ("30-1-2011", "%d-%m-%Y"), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ], +) def test_guess_datetime_format_no_padding(string, fmt): # see gh-11142 result = parsing._guess_datetime_format(string) diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index a86f1e873893d..5497cb65c5373 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -4,82 +4,75 @@ from pandas._libs.tslibs.period import period_asfreq, period_ordinal -@pytest.mark.parametrize("freq1,freq2,expected", [ - ("D", "H", 24), - ("D", "T", 1440), - ("D", "S", 86400), - ("D", "L", 86400000), - ("D", "U", 86400000000), - ("D", "N", 86400000000000), - - ("H", "T", 60), - ("H", "S", 3600), - ("H", "L", 3600000), - ("H", "U", 3600000000), - ("H", "N", 3600000000000), - - ("T", "S", 60), - ("T", "L", 60000), - ("T", "U", 60000000), - ("T", "N", 60000000000), - - ("S", "L", 1000), - ("S", "U", 1000000), - ("S", "N", 1000000000), - - ("L", "U", 1000), - ("L", "N", 1000000), - - ("U", "N", 1000) -]) +@pytest.mark.parametrize( + "freq1,freq2,expected", + [ + ("D", "H", 24), + ("D", "T", 1440), + ("D", "S", 86400), + ("D", "L", 86400000), + ("D", "U", 86400000000), + ("D", "N", 86400000000000), + ("H", "T", 60), + ("H", "S", 3600), + ("H", "L", 3600000), + ("H", "U", 3600000000), + ("H", "N", 3600000000000), + ("T", "S", 60), + ("T", "L", 60000), + ("T", "U", 60000000), + ("T", "N", 60000000000), + ("S", "L", 1000), + ("S", "U", 1000000), + ("S", "N", 1000000000), + ("L", "U", 1000), + ("L", "N", 1000000), + ("U", "N", 1000), + ], +) def 
test_intra_day_conversion_factors(freq1, freq2, expected): - assert period_asfreq(1, get_freq(freq1), - get_freq(freq2), False) == expected + assert period_asfreq(1, get_freq(freq1), get_freq(freq2), False) == expected -@pytest.mark.parametrize("freq,expected", [ - ("A", 0), - ("M", 0), - ("W", 1), - ("D", 0), - ("B", 0) -]) +@pytest.mark.parametrize( + "freq,expected", [("A", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)] +) def test_period_ordinal_start_values(freq, expected): # information for Jan. 1, 1970. - assert period_ordinal(1970, 1, 1, 0, 0, 0, - 0, 0, get_freq(freq)) == expected - - -@pytest.mark.parametrize("dt,expected", [ - ((1970, 1, 4, 0, 0, 0, 0, 0), 1), - ((1970, 1, 5, 0, 0, 0, 0, 0), 2), - ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), - ((2013, 10, 7, 0, 0, 0, 0, 0), 2285) -]) + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq(freq)) == expected + + +@pytest.mark.parametrize( + "dt,expected", + [ + ((1970, 1, 4, 0, 0, 0, 0, 0), 1), + ((1970, 1, 5, 0, 0, 0, 0, 0), 2), + ((2013, 10, 6, 0, 0, 0, 0, 0), 2284), + ((2013, 10, 7, 0, 0, 0, 0, 0), 2285), + ], +) def test_period_ordinal_week(dt, expected): args = dt + (get_freq("W"),) assert period_ordinal(*args) == expected -@pytest.mark.parametrize("day,expected", [ - # Thursday (Oct. 3, 2013). - (3, 11415), - - # Friday (Oct. 4, 2013). - (4, 11416), - - # Saturday (Oct. 5, 2013). - (5, 11417), - - # Sunday (Oct. 6, 2013). - (6, 11417), - - # Monday (Oct. 7, 2013). - (7, 11417), - - # Tuesday (Oct. 8, 2013). - (8, 11418) -]) +@pytest.mark.parametrize( + "day,expected", + [ + # Thursday (Oct. 3, 2013). + (3, 11415), + # Friday (Oct. 4, 2013). + (4, 11416), + # Saturday (Oct. 5, 2013). + (5, 11417), + # Sunday (Oct. 6, 2013). + (6, 11417), + # Monday (Oct. 7, 2013). + (7, 11417), + # Tuesday (Oct. 8, 2013). 
+ (8, 11418), + ], +) def test_period_ordinal_business_day(day, expected): args = (2013, 10, day, 0, 0, 0, 0, 0, get_freq("B")) assert period_ordinal(*args) == expected diff --git a/pandas/tests/tslibs/test_timedeltas.py b/pandas/tests/tslibs/test_timedeltas.py index 65ae9d6ed90ec..d4bd7c2d5486c 100644 --- a/pandas/tests/tslibs/test_timedeltas.py +++ b/pandas/tests/tslibs/test_timedeltas.py @@ -7,15 +7,18 @@ from pandas import Timedelta -@pytest.mark.parametrize("obj,expected", [ - (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9), - (Timedelta(minutes=-7), -7 * 60 * 1e9), - (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9), - (pd.offsets.Nano(125), 125), - (1, 1), - (np.int64(2), 2), - (np.int32(3), 3) -]) +@pytest.mark.parametrize( + "obj,expected", + [ + (np.timedelta64(14, "D"), 14 * 24 * 3600 * 1e9), + (Timedelta(minutes=-7), -7 * 60 * 1e9), + (Timedelta(minutes=-7).to_pytimedelta(), -7 * 60 * 1e9), + (pd.offsets.Nano(125), 125), + (1, 1), + (np.int64(2), 2), + (np.int32(3), 3), + ], +) def test_delta_to_nanoseconds(obj, expected): result = delta_to_nanoseconds(obj) assert result == expected diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index e3fb6ecfb9c1c..03cc8fcb6e904 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -32,7 +32,7 @@ def test_tzlocal_repr(): def test_tzlocal_maybe_get_tz(): # see gh-13583 - tz = timezones.maybe_get_tz('tzlocal()') + tz = timezones.maybe_get_tz("tzlocal()") assert tz == dateutil.tz.tzlocal() @@ -48,10 +48,12 @@ def test_tzlocal_offset(): assert ts.value + offset == Timestamp("2011-01-01").value -@pytest.fixture(params=[ - (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), - (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)) -]) +@pytest.fixture( + params=[ + (pytz.timezone("US/Eastern"), lambda tz, x: tz.localize(x)), + (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)), + ] +) def infer_setup(request): eastern, localize = request.param @@ -67,12 +69,18 @@ def infer_setup(request): def test_infer_tz_compat(infer_setup): eastern, _, start, end, start_naive, end_naive = infer_setup - assert (timezones.infer_tzinfo(start, end) is - conversion.localize_pydatetime(start_naive, eastern).tzinfo) - assert (timezones.infer_tzinfo(start, None) is - conversion.localize_pydatetime(start_naive, eastern).tzinfo) - assert (timezones.infer_tzinfo(None, end) is - conversion.localize_pydatetime(end_naive, eastern).tzinfo) + assert ( + timezones.infer_tzinfo(start, end) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(start, None) + is conversion.localize_pydatetime(start_naive, eastern).tzinfo + ) + assert ( + timezones.infer_tzinfo(None, end) + is conversion.localize_pydatetime(end_naive, eastern).tzinfo + ) def test_infer_tz_utc_localize(infer_setup): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index 96fc64a2a7b9c..1583420053fde 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -39,8 +39,9 @@ def _assert_not_almost_equal(a, b, **kwargs): """ try: assert_almost_equal(a, b, **kwargs) - msg = ("{a} and {b} were approximately equal " - "when they shouldn't have been").format(a=a, b=b) + msg = ( + "{a} and {b} were approximately equal " "when they shouldn't have been" + ).format(a=a, b=b) pytest.fail(msg=msg) except AssertionError: pass @@ -65,62 
+66,69 @@ def _assert_not_almost_equal_both(a, b, **kwargs): _assert_not_almost_equal(b, a, **kwargs) -@pytest.mark.parametrize("a,b", [ - (1.1, 1.1), (1.1, 1.100001), (np.int16(1), 1.000001), - (np.float64(1.1), 1.1), (np.uint32(5), 5), -]) +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1.1), + (1.1, 1.100001), + (np.int16(1), 1.000001), + (np.float64(1.1), 1.1), + (np.uint32(5), 5), + ], +) def test_assert_almost_equal_numbers(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1)), -]) +@pytest.mark.parametrize("a,b", [(1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1))]) def test_assert_not_almost_equal_numbers(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0), -]) +@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0)]) def test_assert_almost_equal_numbers_with_zeros(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (0.001, 0), (1, 0), -]) +@pytest.mark.parametrize("a,b", [(0.001, 0), (1, 0)]) def test_assert_not_almost_equal_numbers_with_zeros(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - (1, "abc"), (1, [1, ]), (1, object()), -]) +@pytest.mark.parametrize("a,b", [(1, "abc"), (1, [1]), (1, object())]) def test_assert_not_almost_equal_numbers_with_mixed(a, b): _assert_not_almost_equal_both(a, b) @pytest.mark.parametrize( - "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) + "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) @pytest.mark.parametrize( - "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]) + "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"] +) def test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype): # Empty compare. 
- _assert_almost_equal_both(np.array([], dtype=left_dtype), - np.array([], dtype=right_dtype), - check_dtype=False) + _assert_almost_equal_both( + np.array([], dtype=left_dtype), + np.array([], dtype=right_dtype), + check_dtype=False, + ) def test_assert_almost_equal_dicts(): _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2}) -@pytest.mark.parametrize("a,b", [ - ({"a": 1, "b": 2}, {"a": 1, "b": 3}), - ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), - ({"a": 1}, 1), ({"a": 1}, "abc"), ({"a": 1}, [1, ]), -]) +@pytest.mark.parametrize( + "a,b", + [ + ({"a": 1, "b": 2}, {"a": 1, "b": 3}), + ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}), + ({"a": 1}, 1), + ({"a": 1}, "abc"), + ({"a": 1}, [1]), + ], +) def test_assert_not_almost_equal_dicts(a, b): _assert_not_almost_equal_both(a, b) @@ -132,14 +140,15 @@ def test_assert_almost_equal_dict_like_object(val): class DictLikeObj: def keys(self): - return "a", + return ("a",) def __getitem__(self, item): if item == "a": return dict_val - func = (_assert_almost_equal_both if val == dict_val - else _assert_not_almost_equal_both) + func = ( + _assert_almost_equal_both if val == dict_val else _assert_not_almost_equal_both + ) func(real_dict, DictLikeObj(), check_dtype=False) @@ -147,31 +156,34 @@ def test_assert_almost_equal_strings(): _assert_almost_equal_both("abc", "abc") -@pytest.mark.parametrize("a,b", [ - ("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1, ]), -]) +@pytest.mark.parametrize( + "a,b", [("abc", "abcd"), ("abc", "abd"), ("abc", 1), ("abc", [1])] +) def test_assert_not_almost_equal_strings(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - ([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3])), -]) +@pytest.mark.parametrize( + "a,b", [([1, 2, 3], [1, 2, 3]), (np.array([1, 2, 3]), np.array([1, 2, 3]))] +) def test_assert_almost_equal_iterables(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [ - # Class is different. - (np.array([1, 2, 3]), [1, 2, 3]), - - # Dtype is different. - (np.array([1, 2, 3]), np.array([1., 2., 3.])), - - # Can't compare generators. - (iter([1, 2, 3]), [1, 2, 3]), ([1, 2, 3], [1, 2, 4]), - ([1, 2, 3], [1, 2, 3, 4]), ([1, 2, 3], 1), -]) +@pytest.mark.parametrize( + "a,b", + [ + # Class is different. + (np.array([1, 2, 3]), [1, 2, 3]), + # Dtype is different. + (np.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])), + # Can't compare generators. 
+ (iter([1, 2, 3]), [1, 2, 3]), + ([1, 2, 3], [1, 2, 4]), + ([1, 2, 3], [1, 2, 3, 4]), + ([1, 2, 3], 1), + ], +) def test_assert_not_almost_equal_iterables(a, b): _assert_not_almost_equal(a, b) @@ -180,20 +192,23 @@ def test_assert_almost_equal_null(): _assert_almost_equal_both(None, None) -@pytest.mark.parametrize("a,b", [ - (None, np.NaN), (None, 0), (np.NaN, 0), -]) +@pytest.mark.parametrize("a,b", [(None, np.NaN), (None, 0), (np.NaN, 0)]) def test_assert_not_almost_equal_null(a, b): _assert_not_almost_equal(a, b) -@pytest.mark.parametrize("a,b", [ - (np.inf, np.inf), (np.inf, float("inf")), - (np.array([np.inf, np.nan, -np.inf]), - np.array([np.inf, np.nan, -np.inf])), - (np.array([np.inf, None, -np.inf], dtype=np.object_), - np.array([np.inf, np.nan, -np.inf], dtype=np.object_)), -]) +@pytest.mark.parametrize( + "a,b", + [ + (np.inf, np.inf), + (np.inf, float("inf")), + (np.array([np.inf, np.nan, -np.inf]), np.array([np.inf, np.nan, -np.inf])), + ( + np.array([np.inf, None, -np.inf], dtype=np.object_), + np.array([np.inf, np.nan, -np.inf], dtype=np.object_), + ), + ], +) def test_assert_almost_equal_inf(a, b): _assert_almost_equal_both(a, b) @@ -202,12 +217,15 @@ def test_assert_not_almost_equal_inf(): _assert_not_almost_equal_both(np.inf, 0) -@pytest.mark.parametrize("a,b", [ - (Index([1., 1.1]), Index([1., 1.100001])), - (Series([1., 1.1]), Series([1., 1.100001])), - (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), - (DataFrame({"a": [1., 1.1]}), DataFrame({"a": [1., 1.100001]})) -]) +@pytest.mark.parametrize( + "a,b", + [ + (Index([1.0, 1.1]), Index([1.0, 1.100001])), + (Series([1.0, 1.1]), Series([1.0, 1.100001])), + (np.array([1.1, 2.000001]), np.array([1.1, 2.0])), + (DataFrame({"a": [1.0, 1.1]}), DataFrame({"a": [1.0, 1.100001]})), + ], +) def test_assert_almost_equal_pandas(a, b): _assert_almost_equal_both(a, b) @@ -225,16 +243,18 @@ def test_assert_almost_equal_value_mismatch(): assert_almost_equal(1, 2) -@pytest.mark.parametrize("a,b,klass1,klass2", [ - (np.array([1]), 1, "ndarray", "int"), - (1, np.array([1]), "int", "ndarray"), -]) +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): msg = """numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) +\\[right\\]: {klass2}""".format( + klass1=klass1, klass2=klass2 + ) with pytest.raises(AssertionError, match=msg): assert_almost_equal(a, b) @@ -248,8 +268,7 @@ def test_assert_almost_equal_value_mismatch1(): \\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) + assert_almost_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) def test_assert_almost_equal_value_mismatch2(): @@ -271,8 +290,9 @@ def test_assert_almost_equal_value_mismatch3(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_almost_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_almost_equal_value_mismatch4(): @@ -283,8 +303,7 @@ def test_assert_almost_equal_value_mismatch4(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([[1, 2], [3, 
4]]), - np.array([[1, 3], [3, 4]])) + assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) def test_assert_almost_equal_shape_mismatch_override(): @@ -294,9 +313,7 @@ def test_assert_almost_equal_shape_mismatch_override(): \\[left\\]: \\(2L*,\\) \\[right\\]: \\(3L*,\\)""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array([1, 2]), - np.array([3, 4, 5]), - obj="Index") + assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") def test_assert_almost_equal_unicode(): @@ -308,8 +325,7 @@ def test_assert_almost_equal_unicode(): \\[right\\]: \\[á, à, å\\]""" with pytest.raises(AssertionError, match=msg): - assert_almost_equal(np.array(["á", "à", "ä"]), - np.array(["á", "à", "å"])) + assert_almost_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])) def test_assert_almost_equal_timestamp(): diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 139755d4510b5..d51dd8b36751a 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -4,10 +4,10 @@ from pandas.util.testing import assert_categorical_equal -@pytest.mark.parametrize("c", [ - Categorical([1, 2, 3, 4]), - Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5]), -]) +@pytest.mark.parametrize( + "c", + [Categorical([1, 2, 3, 4]), Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4, 5])], +) def test_categorical_equal(c): assert_categorical_equal(c, c) @@ -81,7 +81,9 @@ def test_categorical_equal_object_override(obj): Attribute "ordered" are different \\[left\\]: False -\\[right\\]: True""".format(obj=obj) +\\[right\\]: True""".format( + obj=obj + ) c1 = Categorical(data, ordered=False) c2 = Categorical(data, ordered=True) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index 782b88be15057..43a474da2bbda 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -5,10 +5,14 @@ from pandas.util.testing import assert_extension_array_equal -@pytest.mark.parametrize("kwargs", [ - dict(), # Default is check_exact=False - dict(check_exact=False), dict(check_exact=True) -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(), # Default is check_exact=False + dict(check_exact=False), + dict(check_exact=True), + ], +) def test_assert_extension_array_equal_not_exact(kwargs): # see gh-23709 arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936]) @@ -28,9 +32,9 @@ def test_assert_extension_array_equal_not_exact(kwargs): assert_extension_array_equal(arr1, arr2, **kwargs) -@pytest.mark.parametrize("check_less_precise", [ - True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 -]) +@pytest.mark.parametrize( + "check_less_precise", [True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] +) def test_assert_extension_array_equal_less_precise(check_less_precise): arr1 = SparseArray([0.5, 0.123456]) arr2 = SparseArray([0.5, 0.123457]) @@ -93,8 +97,11 @@ def test_assert_extension_array_equal_non_extension_array(side): extension_array = SparseArray(numpy_array) msg = "{side} is not an ExtensionArray".format(side=side) - args = ((numpy_array, extension_array) if side == "left" - else (extension_array, numpy_array)) + args = ( + (numpy_array, extension_array) + if side == "left" + else (extension_array, numpy_array) + ) with pytest.raises(AssertionError, match=msg): assert_extension_array_equal(*args) diff --git 
a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 735d16f7ad0db..9571e8027ccf7 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -9,7 +9,7 @@ def by_blocks_fixture(request): return request.param -@pytest.fixture(params=['DataFrame', 'Series']) +@pytest.fixture(params=["DataFrame", "Series"]) def obj_fixture(request): return request.param @@ -76,29 +76,24 @@ def _assert_not_frame_equal_both(a, b, **kwargs): @pytest.mark.parametrize("check_like", [True, False]) def test_frame_equal_row_order_mismatch(check_like, obj_fixture): - df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, - index=["c", "b", "a"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. msg = "{obj}.index are different".format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - check_like=check_like, - obj=obj_fixture) + assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) else: - _assert_frame_equal_both(df1, - df2, - check_like=check_like, - obj=obj_fixture) + _assert_frame_equal_both(df1, df2, check_like=check_like, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2", [ - (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), - (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), -]) +@pytest.mark.parametrize( + "df1,df2", + [ + (DataFrame({"A": [1, 2, 3]}), DataFrame({"A": [1, 2, 3, 4]})), + (DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}), DataFrame({"A": [1, 2, 3]})), + ], +) def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): msg = "{obj} are different".format(obj=obj_fixture) @@ -106,21 +101,27 @@ def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): assert_frame_equal(df1, df2, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2,msg", [ - # Index - (DataFrame.from_records({"a": [1, 2], - "c": ["l1", "l2"]}, index=["a"]), - DataFrame.from_records({"a": [1.0, 2.0], - "c": ["l1", "l2"]}, index=["a"]), - "DataFrame\\.index are different"), - - # MultiIndex - (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]), - DataFrame.from_records({"a": [1.0, 2.0], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]), - "MultiIndex level \\[0\\] are different") -]) +@pytest.mark.parametrize( + "df1,df2,msg", + [ + # Index + ( + DataFrame.from_records({"a": [1, 2], "c": ["l1", "l2"]}, index=["a"]), + DataFrame.from_records({"a": [1.0, 2.0], "c": ["l1", "l2"]}, index=["a"]), + "DataFrame\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ), + "MultiIndex level \\[0\\] are different", + ), + ], +) def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type): kwargs = dict(check_index_type=check_index_type) @@ -153,12 +154,11 @@ def test_frame_equal_index_mismatch(obj_fixture): {obj}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) \\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( - obj=obj_fixture) + obj=obj_fixture + ) - df1 = DataFrame({"A": [1, 2, 3], "B": [4, 
5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "d"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, obj=obj_fixture) @@ -170,12 +170,11 @@ def test_frame_equal_columns_mismatch(obj_fixture): {obj}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) \\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( - obj=obj_fixture) + obj=obj_fixture + ) - df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, - index=["a", "b", "c"]) - df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, - index=["a", "b", "c"]) + df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, obj=obj_fixture) @@ -186,34 +185,40 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): {obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""".format(obj=obj_fixture) +\\[right\\]: \\[4, 5, 7\\]""".format( + obj=obj_fixture + ) df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - by_blocks=by_blocks_fixture, - obj=obj_fixture) + assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) -@pytest.mark.parametrize("df1,df2,msg", [ - (DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), - """{obj}\\.iloc\\[:, 1\\] are different +@pytest.mark.parametrize( + "df1,df2,msg", + [ + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), + """{obj}\\.iloc\\[:, 1\\] are different {obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] -\\[right\\]: \\[é, è, e̊\\]"""), - (DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), - DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """{obj}\\.iloc\\[:, 0\\] are different +\\[right\\]: \\[é, è, e̊\\]""", + ), + ( + DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), + DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), + """{obj}\\.iloc\\[:, 0\\] are different {obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] -\\[right\\]: \\[a, a, a\\]"""), -]) +\\[right\\]: \\[a, a, a\\]""", + ), + ], +) def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): # see gh-20503 # @@ -221,7 +226,4 @@ def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, obj_fixture): # when comparing DataFrames containing differing unicode objects. 
msg = msg.format(obj=obj_fixture) with pytest.raises(AssertionError, match=msg): - assert_frame_equal(df1, - df2, - by_blocks=by_blocks_fixture, - obj=obj_fixture) + assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj_fixture) diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 445d9c4e482b0..270f765caebd0 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -17,8 +17,7 @@ def test_index_equal_levels_mismatch(): \\)""" idx1 = Index([1, 2, 3]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) with pytest.raises(AssertionError, match=msg): assert_index_equal(idx1, idx2, exact=False) @@ -31,10 +30,8 @@ def test_index_equal_values_mismatch(check_exact): \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\) \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" - idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), - ("B", 3), ("B", 4)]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) with pytest.raises(AssertionError, match=msg): assert_index_equal(idx1, idx2, check_exact=check_exact) @@ -69,7 +66,7 @@ def test_index_equal_class_mismatch(check_exact): def test_index_equal_values_close(check_exact): - idx1 = Index([1, 2, 3.]) + idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0000000001]) if check_exact: @@ -86,10 +83,9 @@ def test_index_equal_values_close(check_exact): def test_index_equal_values_less_close(check_exact, check_less_precise): - idx1 = Index([1, 2, 3.]) + idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0001]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) if check_exact or not check_less_precise: msg = """Index are different @@ -107,8 +103,7 @@ def test_index_equal_values_less_close(check_exact, check_less_precise): def test_index_equal_values_too_far(check_exact, check_less_precise): idx1 = Index([1, 2, 3]) idx2 = Index([1, 2, 4]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) msg = """Index are different @@ -121,12 +116,9 @@ def test_index_equal_values_too_far(check_exact, check_less_precise): def test_index_equal_level_values_mismatch(check_exact, check_less_precise): - idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), - ("B", 3), ("B", 4)]) - idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), - ("B", 3), ("B", 4)]) - kwargs = dict(check_exact=check_exact, - check_less_precise=check_less_precise) + idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) + idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) + kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) msg = """MultiIndex level \\[1\\] are different @@ -138,9 +130,10 @@ def test_index_equal_level_values_mismatch(check_exact, check_less_precise): assert_index_equal(idx1, idx2, **kwargs) -@pytest.mark.parametrize("name1,name2", [ - (None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT) -]) +@pytest.mark.parametrize( + "name1,name2", + [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], +) 
def test_index_equal_names(name1, name2): msg = """Index are different diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index d2dd7912388b0..e4435b5f008e8 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -4,11 +4,14 @@ from pandas.util.testing import assert_interval_array_equal -@pytest.mark.parametrize("kwargs", [ - dict(start=0, periods=4), - dict(start=1, periods=5), - dict(start=5, end=10, closed="left"), -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(start=0, periods=4), + dict(start=1, periods=5), + dict(start=5, end=10, closed="left"), + ], +) def test_interval_array_equal(kwargs): arr = interval_range(**kwargs).values assert_interval_array_equal(arr, arr) diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index 447b1ac172202..59f77d18a8929 100644 --- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -23,16 +23,18 @@ def test_assert_numpy_array_equal_bad_type(): assert_numpy_array_equal(1, 2) -@pytest.mark.parametrize("a,b,klass1,klass2", [ - (np.array([1]), 1, "ndarray", "int"), - (1, np.array([1]), "int", "ndarray"), -]) +@pytest.mark.parametrize( + "a,b,klass1,klass2", + [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], +) def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): msg = """numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format(klass1=klass1, klass2=klass2) +\\[right\\]: {klass2}""".format( + klass1=klass1, klass2=klass2 + ) with pytest.raises(AssertionError, match=msg): assert_numpy_array_equal(a, b) @@ -46,8 +48,7 @@ def test_assert_numpy_array_equal_value_mismatch1(): \\[right\\]: \\[1\\.0, nan, 3\\.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([np.nan, 2, 3]), - np.array([1, np.nan, 3])) + assert_numpy_array_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3])) def test_assert_numpy_array_equal_value_mismatch2(): @@ -69,8 +70,9 @@ def test_assert_numpy_array_equal_value_mismatch3(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_numpy_array_equal_value_mismatch4(): @@ -81,8 +83,7 @@ def test_assert_numpy_array_equal_value_mismatch4(): \\[right\\]: \\[1\\.1, 2.0\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([1.1, 2.000001]), - np.array([1.1, 2.0])) + assert_numpy_array_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0])) def test_assert_numpy_array_equal_value_mismatch5(): @@ -93,8 +94,9 @@ def test_assert_numpy_array_equal_value_mismatch5(): \\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4], [5, 6]]), - np.array([[1, 3], [3, 4], [5, 6]])) + assert_numpy_array_equal( + np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]]) + ) def test_assert_numpy_array_equal_value_mismatch6(): @@ -105,8 +107,7 @@ def test_assert_numpy_array_equal_value_mismatch6(): \\[right\\]: \\[\\[1, 3\\], \\[3, 
4\\]\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), - np.array([[1, 3], [3, 4]])) + assert_numpy_array_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])) def test_assert_numpy_array_equal_shape_mismatch_override(): @@ -117,9 +118,7 @@ def test_assert_numpy_array_equal_shape_mismatch_override(): \\[right\\]: \\(3L*,\\)""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array([1, 2]), - np.array([3, 4, 5]), - obj="Index") + assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index") def test_numpy_array_equal_unicode(): @@ -134,8 +133,7 @@ def test_numpy_array_equal_unicode(): \\[right\\]: \\[á, à, å\\]""" with pytest.raises(AssertionError, match=msg): - assert_numpy_array_equal(np.array(["á", "à", "ä"]), - np.array(["á", "à", "å"])) + assert_numpy_array_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])) def test_numpy_array_equal_object(): @@ -164,9 +162,11 @@ def test_numpy_array_equal_copy_flag(other_type, check_same): other = a.copy() if check_same != other_type: - msg = (r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" - if check_same == "same" - else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)") + msg = ( + r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)" + if check_same == "same" + else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)" + ) if msg is not None: with pytest.raises(AssertionError, match=msg): diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index 79b2a56537613..c681817896903 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -6,11 +6,11 @@ def f(): - warnings.warn('f1', FutureWarning) - warnings.warn('f2', RuntimeWarning) + warnings.warn("f1", FutureWarning) + warnings.warn("f2", RuntimeWarning) -@pytest.mark.filterwarnings('ignore:f1:FutureWarning') +@pytest.mark.filterwarnings("ignore:f1:FutureWarning") def test_assert_produces_warning_honors_filter(): # Raise by default. 
msg = r"Caused unexpected warning\(s\)" @@ -18,6 +18,5 @@ def test_assert_produces_warning_honors_filter(): with tm.assert_produces_warning(RuntimeWarning): f() - with tm.assert_produces_warning(RuntimeWarning, - raise_on_extra_warnings=False): + with tm.assert_produces_warning(RuntimeWarning, raise_on_extra_warnings=False): f() diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 61cabcf3f4aae..a12d9386eb159 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -64,29 +64,33 @@ def _assert_not_series_equal_both(a, b, **kwargs): _assert_not_series_equal(b, a, **kwargs) -@pytest.mark.parametrize("data", [ - range(3), list("abc"), list("áàä"), -]) +@pytest.mark.parametrize("data", [range(3), list("abc"), list("áàä")]) def test_series_equal(data): _assert_series_equal_both(Series(data), Series(data)) -@pytest.mark.parametrize("data1,data2", [ - (range(3), range(1, 4)), - (list("abc"), list("xyz")), - (list("áàä"), list("éèë")), - (list("áàä"), list(b"aaa")), - (range(3), range(4)), -]) +@pytest.mark.parametrize( + "data1,data2", + [ + (range(3), range(1, 4)), + (list("abc"), list("xyz")), + (list("áàä"), list("éèë")), + (list("áàä"), list(b"aaa")), + (range(3), range(4)), + ], +) def test_series_not_equal_value_mismatch(data1, data2): _assert_not_series_equal_both(Series(data1), Series(data2)) -@pytest.mark.parametrize("kwargs", [ - dict(dtype="float64"), # dtype mismatch - dict(index=[1, 2, 4]), # index mismatch - dict(name="foo"), # name mismatch -]) +@pytest.mark.parametrize( + "kwargs", + [ + dict(dtype="float64"), # dtype mismatch + dict(index=[1, 2, 4]), # index mismatch + dict(name="foo"), # name mismatch + ], +) def test_series_not_equal_metadata_mismatch(kwargs): data = range(3) s1 = Series(data) @@ -104,9 +108,10 @@ def test_less_precise(data1, data2, dtype, check_less_precise): kwargs = dict(check_less_precise=check_less_precise) - if ((check_less_precise is False or check_less_precise == 10) or - ((check_less_precise is True or check_less_precise >= 3) and - abs(data1 - data2) >= 0.0001)): + if (check_less_precise is False or check_less_precise == 10) or ( + (check_less_precise is True or check_less_precise >= 3) + and abs(data1 - data2) >= 0.0001 + ): msg = "Series values are different" with pytest.raises(AssertionError, match=msg): assert_series_equal(s1, s2, **kwargs) @@ -114,19 +119,27 @@ def test_less_precise(data1, data2, dtype, check_less_precise): _assert_series_equal_both(s1, s2, **kwargs) -@pytest.mark.parametrize("s1,s2,msg", [ - # Index - (Series(["l1", "l2"], index=[1, 2]), - Series(["l1", "l2"], index=[1., 2.]), - "Series\\.index are different"), - - # MultiIndex - (DataFrame.from_records({"a": [1, 2], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]).c, - DataFrame.from_records({"a": [1., 2.], "b": [2.1, 1.5], - "c": ["l1", "l2"]}, index=["a", "b"]).c, - "MultiIndex level \\[0\\] are different") -]) +@pytest.mark.parametrize( + "s1,s2,msg", + [ + # Index + ( + Series(["l1", "l2"], index=[1, 2]), + Series(["l1", "l2"], index=[1.0, 2.0]), + "Series\\.index are different", + ), + # MultiIndex + ( + DataFrame.from_records( + {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + DataFrame.from_records( + {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"] + ).c, + "MultiIndex level \\[0\\] are different", + ), + ], +) def test_series_equal_index_dtype(s1, s2, msg, check_index_type): kwargs = 
dict(check_index_type=check_index_type) diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index 7fa7989eff690..e7b38bb2b700a 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -13,16 +13,16 @@ def new_func(): This is the extended summary. The deprecate directive goes before this. """ - return 'new_func called' + return "new_func called" def new_func_no_docstring(): - return 'new_func_no_docstring called' + return "new_func_no_docstring called" def new_func_wrong_docstring(): """Summary should be in the next line.""" - return 'new_func_wrong_docstring called' + return "new_func_wrong_docstring called" def new_func_with_deprecation(): @@ -38,26 +38,28 @@ def new_func_with_deprecation(): def test_deprecate_ok(): - depr_func = deprecate('depr_func', new_func, '1.0', - msg='Use new_func instead.') + depr_func = deprecate("depr_func", new_func, "1.0", msg="Use new_func instead.") with tm.assert_produces_warning(FutureWarning): result = depr_func() - assert result == 'new_func called' + assert result == "new_func called" assert depr_func.__doc__ == dedent(new_func_with_deprecation.__doc__) def test_deprecate_no_docstring(): - depr_func = deprecate('depr_func', new_func_no_docstring, '1.0', - msg='Use new_func instead.') + depr_func = deprecate( + "depr_func", new_func_no_docstring, "1.0", msg="Use new_func instead." + ) with tm.assert_produces_warning(FutureWarning): result = depr_func() - assert result == 'new_func_no_docstring called' + assert result == "new_func_no_docstring called" def test_deprecate_wrong_docstring(): - with pytest.raises(AssertionError, match='deprecate needs a correctly ' - 'formatted docstring'): - deprecate('depr_func', new_func_wrong_docstring, '1.0', - msg='Use new_func instead.') + with pytest.raises( + AssertionError, match="deprecate needs a correctly " "formatted docstring" + ): + deprecate( + "depr_func", new_func_wrong_docstring, "1.0", msg="Use new_func instead." + ) diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py index b6e2f8e184a8d..c17c48197ccf7 100644 --- a/pandas/tests/util/test_deprecate_kwarg.py +++ b/pandas/tests/util/test_deprecate_kwarg.py @@ -27,10 +27,7 @@ def _f3(new=0): return new -@pytest.mark.parametrize("key,klass", [ - ("old", FutureWarning), - ("new", None) -]) +@pytest.mark.parametrize("key,klass", [("old", FutureWarning), ("new", None)]) def test_deprecate_kwarg(key, klass): x = 78 @@ -67,6 +64,7 @@ def test_bad_deprecate_kwarg(): msg = "mapping from old to new argument values must be dict or callable!" 
with pytest.raises(TypeError, match=msg): + @deprecate_kwarg("old", "new", 0) def f4(new=None): return new diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 31468a40f72dd..27a23180b269a 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -10,15 +10,18 @@ import pandas.util.testing as tm -@pytest.fixture(params=[ - Series([1, 2, 3] * 3, dtype="int32"), - Series([None, 2.5, 3.5] * 3, dtype="float32"), - Series(["a", "b", "c"] * 3, dtype="category"), - Series(["d", "e", "f"] * 3), - Series([True, False, True] * 3), - Series(pd.date_range("20130101", periods=9)), - Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), - Series(pd.timedelta_range("2000", periods=9))]) +@pytest.fixture( + params=[ + Series([1, 2, 3] * 3, dtype="int32"), + Series([None, 2.5, 3.5] * 3, dtype="float32"), + Series(["a", "b", "c"] * 3, dtype="category"), + Series(["d", "e", "f"] * 3), + Series([True, False, True] * 3), + Series(pd.date_range("20130101", periods=9)), + Series(pd.date_range("20130101", periods=9, tz="US/Eastern")), + Series(pd.timedelta_range("2000", periods=9)), + ] +) def series(request): return request.param @@ -65,9 +68,13 @@ def test_consistency(): # Check that our hash doesn't change because of a mistake # in the actual code; this is the ground truth. result = hash_pandas_object(Index(["foo", "bar", "baz"])) - expected = Series(np.array([3600424527151052760, 1374399572096150070, - 477881037637427054], dtype="uint64"), - index=["foo", "bar", "baz"]) + expected = Series( + np.array( + [3600424527151052760, 1374399572096150070, 477881037637427054], + dtype="uint64", + ), + index=["foo", "bar", "baz"], + ) tm.assert_series_equal(result, expected) @@ -76,10 +83,9 @@ def test_hash_array(series): tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr)) -@pytest.mark.parametrize("arr2", [ - np.array([3, 4, "All"]), - np.array([3, 4, "All"], dtype=object), -]) +@pytest.mark.parametrize( + "arr2", [np.array([3, 4, "All"]), np.array([3, 4, "All"], dtype=object)] +) def test_hash_array_mixed(arr2): result1 = hash_array(np.array(["3", "4", "All"])) result2 = hash_array(arr2) @@ -105,9 +111,10 @@ def test_hash_tuples(): assert result == expected[0] -@pytest.mark.parametrize("tup", [ - (1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), - ("A", pd.Timestamp("2012-01-01"))]) +@pytest.mark.parametrize( + "tup", + [(1, "one"), (1, np.nan), (1.0, pd.NaT, "A"), ("A", pd.Timestamp("2012-01-01"))], +) def test_hash_tuple(tup): # Test equivalence between # hash_tuples and hash_tuple. 
@@ -117,14 +124,26 @@ def test_hash_tuple(tup): assert result == expected -@pytest.mark.parametrize("val", [ - 1, 1.4, "A", b"A", pd.Timestamp("2012-01-01"), - pd.Timestamp("2012-01-01", tz="Europe/Brussels"), - datetime.datetime(2012, 1, 1), - pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), - pd.Timedelta("1 days"), datetime.timedelta(1), - pd.Period("2012-01-01", freq="D"), pd.Interval(0, 1), - np.nan, pd.NaT, None]) +@pytest.mark.parametrize( + "val", + [ + 1, + 1.4, + "A", + b"A", + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz="Europe/Brussels"), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz="EST").to_pydatetime(), + pd.Timedelta("1 days"), + datetime.timedelta(1), + pd.Period("2012-01-01", freq="D"), + pd.Interval(0, 1), + np.nan, + pd.NaT, + None, + ], +) def test_hash_scalar(val): result = _hash_scalar(val) expected = hash_array(np.array([val], dtype=object), categorize=True) @@ -140,8 +159,7 @@ def test_hash_tuples_err(val): def test_multiindex_unique(): - mi = MultiIndex.from_tuples([(118, 472), (236, 118), - (51, 204), (102, 51)]) + mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)]) assert mi.is_unique is True result = hash_pandas_object(mi) @@ -149,9 +167,11 @@ def test_multiindex_unique(): def test_multiindex_objects(): - mi = MultiIndex(levels=[["b", "d", "a"], [1, 2, 3]], - codes=[[0, 1, 0, 2], [2, 0, 0, 1]], - names=["col1", "col2"]) + mi = MultiIndex( + levels=[["b", "d", "a"], [1, 2, 3]], + codes=[[0, 1, 0, 2], [2, 0, 0, 1]], + names=["col1", "col2"], + ) recons = mi._sort_levels_monotonic() # These are equal. @@ -176,32 +196,36 @@ def test_multiindex_objects(): tm.assert_numpy_array_equal(np.sort(result), np.sort(expected)) -@pytest.mark.parametrize("obj", [ - Series([1, 2, 3]), - Series([1.0, 1.5, 3.2]), - Series([1.0, 1.5, np.nan]), - Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), - Series(["a", "b", "c"]), - Series(["a", np.nan, "c"]), - Series(["a", None, "c"]), - Series([True, False, True]), - Series(), - Index([1, 2, 3]), - Index([True, False, True]), - DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), - DataFrame(), - tm.makeMissingDataframe(), - tm.makeMixedDataFrame(), - tm.makeTimeDataFrame(), - tm.makeTimeSeries(), - tm.makeTimedeltaIndex(), - tm.makePeriodIndex(), - Series(tm.makePeriodIndex()), - Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), - MultiIndex.from_product([range(5), ["foo", "bar", "baz"], - pd.date_range("20130101", periods=2)]), - MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]) -]) +@pytest.mark.parametrize( + "obj", + [ + Series([1, 2, 3]), + Series([1.0, 1.5, 3.2]), + Series([1.0, 1.5, np.nan]), + Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), + Series(["a", "b", "c"]), + Series(["a", np.nan, "c"]), + Series(["a", None, "c"]), + Series([True, False, True]), + Series(), + Index([1, 2, 3]), + Index([True, False, True]), + DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), + DataFrame(), + tm.makeMissingDataframe(), + tm.makeMixedDataFrame(), + tm.makeTimeDataFrame(), + tm.makeTimeSeries(), + tm.makeTimedeltaIndex(), + tm.makePeriodIndex(), + Series(tm.makePeriodIndex()), + Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), + MultiIndex.from_product( + [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)] + ), + MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]), + ], +) def test_hash_pandas_object(obj, index): _check_equal(obj, index=index) _check_not_equal_with_index(obj) @@ -212,18 +236,23 
@@ def test_hash_pandas_object2(series, index): _check_not_equal_with_index(series) -@pytest.mark.parametrize("obj", [ - Series([], dtype="float64"), Series([], dtype="object"), Index([])]) +@pytest.mark.parametrize( + "obj", [Series([], dtype="float64"), Series([], dtype="object"), Index([])] +) def test_hash_pandas_empty_object(obj, index): # These are by-definition the same with # or without the index as the data is empty. _check_equal(obj, index=index) -@pytest.mark.parametrize("s1", [ - Series(["a", "b", "c", "d"]), - Series([1000, 2000, 3000, 4000]), - Series(pd.date_range(0, periods=4))]) +@pytest.mark.parametrize( + "s1", + [ + Series(["a", "b", "c", "d"]), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4)), + ], +) @pytest.mark.parametrize("categorize", [True, False]) def test_categorical_consistency(s1, categorize): # see gh-15143 @@ -244,13 +273,11 @@ def test_categorical_consistency(s1, categorize): def test_categorical_with_nan_consistency(): c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], - categories=pd.date_range("2012-01-01", periods=5, name="B")) + [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") + ) expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes( - [-1, 0], - categories=[pd.Timestamp("2012-01-01")]) + c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) result = hash_array(c, categorize=False) assert result[0] in expected @@ -299,7 +326,7 @@ def test_alternate_encoding(index): @pytest.mark.parametrize("l_exp", range(8)) @pytest.mark.parametrize("l_add", [0, 1]) def test_same_len_hash_collisions(l_exp, l_add): - length = 2**(l_exp + 8) + l_add + length = 2 ** (l_exp + 8) + l_add s = tm.rands_array(length, 2) result = hash_array(s, "utf8") @@ -310,8 +337,10 @@ def test_hash_collisions(): # Hash collisions are bad. 
# # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726 - hashes = ["Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa - "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe"] # noqa + hashes = [ + "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9", # noqa + "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe", + ] # noqa # These should be different. 
result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8") @@ -323,5 +352,4 @@ def test_hash_collisions(): tm.assert_numpy_array_equal(result2, expected2) result = hash_array(np.asarray(hashes, dtype=object), "utf8") - tm.assert_numpy_array_equal(result, np.concatenate([expected1, - expected2], axis=0)) + tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) diff --git a/pandas/tests/util/test_move.py b/pandas/tests/util/test_move.py index 0987600574b8d..0e28dd2dd9d71 100644 --- a/pandas/tests/util/test_move.py +++ b/pandas/tests/util/test_move.py @@ -19,6 +19,7 @@ def test_more_than_one_ref(): b = b"testing" with pytest.raises(BadMove, match="testing") as e: + def handle_success(type_, value, tb): assert value.args[0] is b return type(e).handle_success(e, type_, value, tb) # super diff --git a/pandas/tests/util/test_safe_import.py b/pandas/tests/util/test_safe_import.py index eef3657af6562..bd07bea934ed3 100644 --- a/pandas/tests/util/test_safe_import.py +++ b/pandas/tests/util/test_safe_import.py @@ -15,21 +15,16 @@ def test_safe_import_exists(): assert td.safe_import("pandas") -@pytest.mark.parametrize("min_version,valid", [ - ("0.0.0", True), - ("99.99.99", False) -]) +@pytest.mark.parametrize("min_version,valid", [("0.0.0", True), ("99.99.99", False)]) def test_safe_import_versions(min_version, valid): result = td.safe_import("pandas", min_version=min_version) result = result if valid else not result assert result -@pytest.mark.parametrize("min_version,valid", [ - (None, False), - ("1.0", True), - ("2.0", False) -]) +@pytest.mark.parametrize( + "min_version,valid", [(None, False), ("1.0", True), ("2.0", False)] +) def test_safe_import_dummy(monkeypatch, min_version, valid): mod_name = "hello123" diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 88ce48245dc70..83d9be1ad235f 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -27,8 +27,7 @@ def test_rands_array_2d(): def test_numpy_err_state_is_default(): - expected = {"over": "warn", "divide": "warn", - "invalid": "warn", "under": "ignore"} + expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"} import numpy as np # The error state should be unchanged after that import. 
@@ -81,7 +80,7 @@ def test_assert_raises_regex_deprecated(): assert 1 == 2, msg -@pytest.mark.parametrize('strict_data_files', [True, False]) +@pytest.mark.parametrize("strict_data_files", [True, False]) def test_datapath_missing(datapath): with pytest.raises(ValueError, match="Could not find file"): datapath("not_a_file") diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index 581c394401223..1f1365d62c64e 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -21,10 +21,12 @@ def test_bad_arg_length_max_value_single(): min_fname_arg_count = 0 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): validate_args(_fname, args, min_fname_arg_count, compat_args) @@ -37,10 +39,12 @@ def test_bad_arg_length_max_value_multiple(): min_fname_arg_count = 2 max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): validate_args(_fname, args, min_fname_arg_count, compat_args) @@ -49,9 +53,10 @@ def test_bad_arg_length_max_value_multiple(): @pytest.mark.parametrize("i", range(1, 3)) def test_not_all_defaults(i): bad_arg = "foo" - msg = ("the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + "the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) compat_args = OrderedDict() compat_args["foo"] = 2 diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 1d75de1872319..396056466bb81 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -16,15 +16,15 @@ def test_invalid_total_length_max_length_one(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"argument \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_invalid_total_length_max_length_multiple(): @@ -36,21 +36,18 @@ def test_invalid_total_length_max_length_multiple(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(kwargs) + len(args) + min_fname_arg_count - msg = (r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)" - .format(fname=_fname, max_length=max_length, - actual_length=actual_length)) + msg = ( + r"{fname}\(\) takes at most {max_length} " + r"arguments \({actual_length} given\)".format( + fname=_fname, max_length=max_length, actual_length=actual_length + ) + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) -@pytest.mark.parametrize("args,kwargs", [ - ((), {"foo": -5, "bar": 2}), - ((-5, 2), {}) -]) +@pytest.mark.parametrize("args,kwargs", [((), {"foo": -5, "bar": 2}), ((-5, 2), {})]) def test_missing_args_or_kwargs(args, kwargs): bad_arg = "bar" min_fname_arg_count = 2 @@ -59,13 +56,13 @@ def test_missing_args_or_kwargs(args, kwargs): compat_args["foo"] = -5 compat_args[bad_arg] = 1 - msg = (r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) with pytest.raises(ValueError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_duplicate_argument(): @@ -79,13 +76,12 @@ def test_duplicate_argument(): kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" - msg = (r"{fname}\(\) got multiple values for keyword " - r"argument '{arg}'".format(fname=_fname, arg="foo")) + msg = r"{fname}\(\) got multiple values for keyword " r"argument '{arg}'".format( + fname=_fname, arg="foo" + ) with pytest.raises(TypeError, match=msg): - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) def test_validation(): @@ -99,6 +95,4 @@ def test_validation(): args = (1, None) min_fname_arg_count = 2 - validate_args_and_kwargs(_fname, args, kwargs, - min_fname_arg_count, - compat_args) + validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index 1cf9736eaef92..ec9f3948403de 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -16,8 +16,9 @@ def test_bad_kwarg(): compat_args[bad_arg + "o"] = "bar" kwargs = {good_arg: "foo", bad_arg: "bar"} - msg = (r"{fname}\(\) got an unexpected " - r"keyword argument '{arg}'".format(fname=_fname, arg=bad_arg)) + msg = r"{fname}\(\) got an unexpected " r"keyword argument '{arg}'".format( + fname=_fname, arg=bad_arg + ) with pytest.raises(TypeError, match=msg): validate_kwargs(_fname, kwargs, compat_args) @@ -26,9 +27,10 @@ def test_bad_kwarg(): @pytest.mark.parametrize("i", range(1, 3)) def test_not_all_none(i): bad_arg = "foo" - msg = (r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)". 
- format(arg=bad_arg, func=_fname)) + msg = ( + r"the '{arg}' parameter is not supported " + r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + ) compat_args = OrderedDict() compat_args["foo"] = 1 @@ -58,8 +60,10 @@ def test_validation(): @pytest.mark.parametrize("name", ["inplace", "copy"]) @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): - msg = ("For argument \"%s\" expected type bool, received type %s" % - (name, type(value).__name__)) + msg = 'For argument "%s" expected type bool, received type %s' % ( + name, + type(value).__name__, + ) with pytest.raises(ValueError, match=msg): validate_bool_kwarg(value, name) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index e1e4dd4cf4b8a..c2b76188ad36b 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -6,15 +6,27 @@ # in `pandas.plotting`, or remove from here (I guess they are here for # legacy reasons from pandas.plotting._matplotlib.converter import ( - DatetimeConverter, MilliSecondLocator, PandasAutoDateFormatter, - PandasAutoDateLocator, PeriodConverter, TimeConverter, TimeFormatter, - TimeSeries_DateFormatter, TimeSeries_DateLocator, get_datevalue, - get_finder, time2num) + DatetimeConverter, + MilliSecondLocator, + PandasAutoDateFormatter, + PandasAutoDateLocator, + PeriodConverter, + TimeConverter, + TimeFormatter, + TimeSeries_DateFormatter, + TimeSeries_DateLocator, + get_datevalue, + get_finder, + time2num, +) def register(): from pandas.plotting import register_matplotlib_converters - msg = ("'pandas.tseries.converter.register' has been moved and renamed to " - "'pandas.plotting.register_matplotlib_converters'. ") + + msg = ( + "'pandas.tseries.converter.register' has been moved and renamed to " + "'pandas.plotting.register_matplotlib_converters'. 
" + ) warnings.warn(msg, FutureWarning, stacklevel=2) register_matplotlib_converters() diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 4069a2004476b..dfe91b514bbe1 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -18,20 +18,32 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - is_datetime64_dtype, is_period_arraylike, is_timedelta64_dtype) + is_datetime64_dtype, + is_period_arraylike, + is_timedelta64_dtype, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.algorithms import unique from pandas.tseries.offsets import ( - DateOffset, Day, Hour, Micro, Milli, Minute, Nano, Second, prefix_mapping) + DateOffset, + Day, + Hour, + Micro, + Milli, + Minute, + Nano, + Second, + prefix_mapping, +) _ONE_MICRO = 1000 -_ONE_MILLI = (_ONE_MICRO * 1000) -_ONE_SECOND = (_ONE_MILLI * 1000) -_ONE_MINUTE = (60 * _ONE_SECOND) -_ONE_HOUR = (60 * _ONE_MINUTE) -_ONE_DAY = (24 * _ONE_HOUR) +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR # --------------------------------------------------------------------- # Offset names ("time rules") and related functions @@ -45,13 +57,15 @@ def get_period_alias(offset_str): return _offset_to_period_map.get(offset_str, None) -_name_to_offset_map = {'days': Day(1), - 'hours': Hour(1), - 'minutes': Minute(1), - 'seconds': Second(1), - 'milliseconds': Milli(1), - 'microseconds': Micro(1), - 'nanoseconds': Nano(1)} +_name_to_offset_map = { + "days": Day(1), + "hours": Hour(1), + "minutes": Minute(1), + "seconds": Second(1), + "milliseconds": Milli(1), + "microseconds": Micro(1), + "nanoseconds": Nano(1), +} def to_offset(freq): @@ -132,16 +146,17 @@ def to_offset(freq): stride_sign = None try: splitted = re.split(libfreqs.opattern, freq) - if splitted[-1] != '' and not splitted[-1].isspace(): + if splitted[-1] != "" and not splitted[-1].isspace(): # the last element must be blank - raise ValueError('last element must be blank') - for sep, stride, name in zip(splitted[0::4], splitted[1::4], - splitted[2::4]): - if sep != '' and not sep.isspace(): - raise ValueError('separator must be spaces') + raise ValueError("last element must be blank") + for sep, stride, name in zip( + splitted[0::4], splitted[1::4], splitted[2::4] + ): + if sep != "" and not sep.isspace(): + raise ValueError("separator must be spaces") prefix = libfreqs._lite_rule_alias.get(name) or name if stride_sign is None: - stride_sign = -1 if stride.startswith('-') else 1 + stride_sign = -1 if stride.startswith("-") else 1 if not stride: stride = 1 if prefix in Resolution._reso_str_bump_map.keys(): @@ -181,7 +196,7 @@ def get_offset(name): if name not in _offset_map: try: - split = name.split('-') + split = name.split("-") klass = prefix_mapping[split[0]] # handles case where there's no suffix (and will TypeError if too # many '-') @@ -221,16 +236,22 @@ def infer_freq(index, warn=True): if isinstance(index, ABCSeries): values = index._values - if not (is_datetime64_dtype(values) or - is_timedelta64_dtype(values) or - values.dtype == object): - raise TypeError("cannot infer freq from a non-convertible dtype " - "on a Series of {dtype}".format(dtype=index.dtype)) + if not ( + is_datetime64_dtype(values) + or is_timedelta64_dtype(values) + or values.dtype == object + ): + raise TypeError( + "cannot infer freq from a non-convertible dtype " + "on a Series of 
{dtype}".format(dtype=index.dtype) + ) index = values if is_period_arraylike(index): - raise TypeError("PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq.") + raise TypeError( + "PeriodIndex given. Check the `freq` attribute " + "instead of using infer_freq." + ) elif is_timedelta64_dtype(index): # Allow TimedeltaIndex and TimedeltaArray inferer = _TimedeltaFrequencyInferer(index, warn=warn) @@ -238,8 +259,10 @@ def infer_freq(index, warn=True): if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): - raise TypeError("cannot infer freq from a non-convertible index " - "type {type}".format(type=type(index))) + raise TypeError( + "cannot infer freq from a non-convertible index " + "type {type}".format(type=type(index)) + ) index = index.values if not isinstance(index, pd.DatetimeIndex): @@ -263,17 +286,18 @@ def __init__(self, index, warn=True): # This moves the values, which are implicitly in UTC, to the # the timezone so they are in local time - if hasattr(index, 'tz'): + if hasattr(index, "tz"): if index.tz is not None: self.values = tz_convert(self.values, UTC, index.tz) self.warn = warn if len(index) < 3: - raise ValueError('Need at least 3 dates to infer frequency') + raise ValueError("Need at least 3 dates to infer frequency") - self.is_monotonic = (self.index._is_monotonic_increasing or - self.index._is_monotonic_decreasing) + self.is_monotonic = ( + self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing + ) @cache_readonly def deltas(self): @@ -309,7 +333,7 @@ def get_freq(self): # Business hourly, maybe. 17: one day / 65: one weekend if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]): - return 'BH' + return "BH" # Possibly intraday frequency. Here we use the # original .asi8 values as the modified values # will not work around DST transitions. 
See #8772 @@ -319,22 +343,22 @@ def get_freq(self): delta = self.deltas_asi8[0] if _is_multiple(delta, _ONE_HOUR): # Hours - return _maybe_add_count('H', delta / _ONE_HOUR) + return _maybe_add_count("H", delta / _ONE_HOUR) elif _is_multiple(delta, _ONE_MINUTE): # Minutes - return _maybe_add_count('T', delta / _ONE_MINUTE) + return _maybe_add_count("T", delta / _ONE_MINUTE) elif _is_multiple(delta, _ONE_SECOND): # Seconds - return _maybe_add_count('S', delta / _ONE_SECOND) + return _maybe_add_count("S", delta / _ONE_SECOND) elif _is_multiple(delta, _ONE_MILLI): # Milliseconds - return _maybe_add_count('L', delta / _ONE_MILLI) + return _maybe_add_count("L", delta / _ONE_MILLI) elif _is_multiple(delta, _ONE_MICRO): # Microseconds - return _maybe_add_count('U', delta / _ONE_MICRO) + return _maybe_add_count("U", delta / _ONE_MICRO) else: # Nanoseconds - return _maybe_add_count('N', delta) + return _maybe_add_count("N", delta) @cache_readonly def day_deltas(self): @@ -353,24 +377,23 @@ def rep_stamp(self): return Timestamp(self.values[0]) def month_position_check(self): - return libresolution.month_position_check(self.fields, - self.index.dayofweek) + return libresolution.month_position_check(self.fields, self.index.dayofweek) @cache_readonly def mdiffs(self): - nmonths = self.fields['Y'] * 12 + self.fields['M'] - return unique_deltas(nmonths.astype('i8')) + nmonths = self.fields["Y"] * 12 + self.fields["M"] + return unique_deltas(nmonths.astype("i8")) @cache_readonly def ydiffs(self): - return unique_deltas(self.fields['Y'].astype('i8')) + return unique_deltas(self.fields["Y"].astype("i8")) def _infer_daily_rule(self): annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] month = MONTH_ALIASES[self.rep_stamp.month] - alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month) + alias = "{prefix}-{month}".format(prefix=annual_rule, month=month) return _maybe_add_count(alias, nyears) quarterly_rule = self._get_quarterly_rule() @@ -378,8 +401,7 @@ def _infer_daily_rule(self): nquarters = self.mdiffs[0] / 3 mod_dict = {0: 12, 2: 11, 1: 10} month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]] - alias = '{prefix}-{month}'.format(prefix=quarterly_rule, - month=month) + alias = "{prefix}-{month}".format(prefix=quarterly_rule, month=month) return _maybe_add_count(alias, nquarters) monthly_rule = self._get_monthly_rule() @@ -391,13 +413,12 @@ def _infer_daily_rule(self): if days % 7 == 0: # Weekly day = int_to_weekday[self.rep_stamp.weekday()] - return _maybe_add_count( - 'W-{day}'.format(day=day), days / 7) + return _maybe_add_count("W-{day}".format(day=day), days / 7) else: - return _maybe_add_count('D', days) + return _maybe_add_count("D", days) if self._is_business_daily(): - return 'B' + return "B" wom_rule = self._get_wom_rule() if wom_rule: @@ -407,12 +428,11 @@ def _get_annual_rule(self): if len(self.ydiffs) > 1: return None - if len(unique(self.fields['M'])) > 1: + if len(unique(self.fields["M"])) > 1: return None pos_check = self.month_position_check() - return {'cs': 'AS', 'bs': 'BAS', - 'ce': 'A', 'be': 'BA'}.get(pos_check) + return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check) def _get_quarterly_rule(self): if len(self.mdiffs) > 1: @@ -422,15 +442,13 @@ def _get_quarterly_rule(self): return None pos_check = self.month_position_check() - return {'cs': 'QS', 'bs': 'BQS', - 'ce': 'Q', 'be': 'BQ'}.get(pos_check) + return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check) def _get_monthly_rule(self): if len(self.mdiffs) > 1: 
return None pos_check = self.month_position_check() - return {'cs': 'MS', 'bs': 'BMS', - 'ce': 'M', 'be': 'BM'}.get(pos_check) + return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check) def _is_business_daily(self): # quick check: cannot be business daily @@ -442,8 +460,10 @@ def _is_business_daily(self): shifts = np.diff(self.index.asi8) shifts = np.floor_divide(shifts, _ONE_DAY) weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) - return np.all(((weekdays == 0) & (shifts == 3)) | - ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) + return np.all( + ((weekdays == 0) & (shifts == 3)) + | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)) + ) def _get_wom_rule(self): # wdiffs = unique(np.diff(self.index.week)) @@ -465,21 +485,20 @@ def _get_wom_rule(self): week = week_of_months[0] + 1 wd = int_to_weekday[weekdays[0]] - return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) + return "WOM-{week}{weekday}".format(week=week, weekday=wd) class _TimedeltaFrequencyInferer(_FrequencyInferer): - def _infer_daily_rule(self): if self.is_unique: days = self.deltas[0] / _ONE_DAY if days % 7 == 0: # Weekly wd = int_to_weekday[self.rep_stamp.weekday()] - alias = 'W-{weekday}'.format(weekday=wd) + alias = "W-{weekday}".format(weekday=wd) return _maybe_add_count(alias, days / 7) else: - return _maybe_add_count('D', days) + return _maybe_add_count("D", days) def _is_multiple(us, mult): @@ -490,6 +509,6 @@ def _maybe_add_count(base, count): if count != 1: assert count == int(count) count = int(count) - return '{count}{base}'.format(count=count, base=base) + return "{count}{base}".format(count=count, base=base) else: return base diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 7171a6a182bdc..1654163d2a9e0 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -127,9 +127,18 @@ class Holiday: for observance. 
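As a reference point for the Holiday rule machinery being reformatted below, a short illustrative sketch; the rule name and date window are invented for the example and are not part of the patch.

from datetime import datetime
from pandas.tseries.holiday import Holiday, nearest_workday

# An observance-based rule: July 3rd, moved to the nearest weekday
# when it falls on a weekend, applied only from 2015 onwards.
july_3rd = Holiday(
    "July 3rd",
    month=7,
    day=3,
    start_date=datetime(2015, 1, 1),
    observance=nearest_workday,
)

# dates() materialises the rule over a window as a DatetimeIndex.
print(july_3rd.dates(datetime(2015, 1, 1), datetime(2017, 12, 31)))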
""" - def __init__(self, name, year=None, month=None, day=None, offset=None, - observance=None, start_date=None, end_date=None, - days_of_week=None): + def __init__( + self, + name, + year=None, + month=None, + day=None, + offset=None, + observance=None, + start_date=None, + end_date=None, + days_of_week=None, + ): """ Parameters ---------- @@ -166,27 +175,27 @@ class from pandas.tseries.offsets self.month = month self.day = day self.offset = offset - self.start_date = Timestamp( - start_date) if start_date is not None else start_date - self.end_date = Timestamp( - end_date) if end_date is not None else end_date + self.start_date = ( + Timestamp(start_date) if start_date is not None else start_date + ) + self.end_date = Timestamp(end_date) if end_date is not None else end_date self.observance = observance - assert (days_of_week is None or type(days_of_week) == tuple) + assert days_of_week is None or type(days_of_week) == tuple self.days_of_week = days_of_week def __repr__(self): - info = '' + info = "" if self.year is not None: - info += 'year={year}, '.format(year=self.year) - info += 'month={mon}, day={day}, '.format(mon=self.month, day=self.day) + info += "year={year}, ".format(year=self.year) + info += "month={mon}, day={day}, ".format(mon=self.month, day=self.day) if self.offset is not None: - info += 'offset={offset}'.format(offset=self.offset) + info += "offset={offset}".format(offset=self.offset) if self.observance is not None: - info += 'observance={obs}'.format(obs=self.observance) + info += "observance={obs}".format(obs=self.observance) - repr = 'Holiday: {name} ({info})'.format(name=self.name, info=info) + repr = "Holiday: {name} ({info})".format(name=self.name, info=info) return repr def dates(self, start_date, end_date, return_name=False): @@ -217,17 +226,21 @@ def dates(self, start_date, end_date, return_name=False): dates = self._reference_dates(start_date, end_date) holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: - holiday_dates = holiday_dates[np.in1d(holiday_dates.dayofweek, - self.days_of_week)] + holiday_dates = holiday_dates[ + np.in1d(holiday_dates.dayofweek, self.days_of_week) + ] if self.start_date is not None: - filter_start_date = max(self.start_date.tz_localize( - filter_start_date.tz), filter_start_date) + filter_start_date = max( + self.start_date.tz_localize(filter_start_date.tz), filter_start_date + ) if self.end_date is not None: - filter_end_date = min(self.end_date.tz_localize( - filter_end_date.tz), filter_end_date) - holiday_dates = holiday_dates[(holiday_dates >= filter_start_date) & - (holiday_dates <= filter_end_date)] + filter_end_date = min( + self.end_date.tz_localize(filter_end_date.tz), filter_end_date + ) + holiday_dates = holiday_dates[ + (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date) + ] if return_name: return Series(self.name, index=holiday_dates) return holiday_dates @@ -249,14 +262,19 @@ def _reference_dates(self, start_date, end_date): year_offset = DateOffset(years=1) reference_start_date = Timestamp( - datetime(start_date.year - 1, self.month, self.day)) + datetime(start_date.year - 1, self.month, self.day) + ) reference_end_date = Timestamp( - datetime(end_date.year + 1, self.month, self.day)) + datetime(end_date.year + 1, self.month, self.day) + ) # Don't process unnecessary holidays - dates = date_range(start=reference_start_date, - end=reference_end_date, - freq=year_offset, tz=start_date.tz) + dates = date_range( + start=reference_start_date, + end=reference_end_date, + 
freq=year_offset, + tz=start_date.tz, + ) return dates @@ -315,7 +333,6 @@ def get_calendar(name): class HolidayCalendarMetaClass(type): - def __new__(cls, clsname, bases, attrs): calendar_class = super().__new__(cls, clsname, bases, attrs) register(calendar_class) @@ -326,6 +343,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass): """ Abstract interface to create holidays following certain rules. """ + rules = [] # type: List[Holiday] start_date = Timestamp(datetime(1970, 1, 1)) end_date = Timestamp(datetime(2030, 12, 31)) @@ -375,8 +393,10 @@ def holidays(self, start=None, end=None, return_name=False): DatetimeIndex of holidays """ if self.rules is None: - raise Exception('Holiday Calendar {name} does not have any ' - 'rules specified'.format(name=self.name)) + raise Exception( + "Holiday Calendar {name} does not have any " + "rules specified".format(name=self.name) + ) if start is None: start = AbstractHolidayCalendar.start_date @@ -390,8 +410,7 @@ def holidays(self, start=None, end=None, return_name=False): holidays = None # If we don't have a cache or the dates are outside the prior cache, we # get them again - if (self._cache is None or start < self._cache[0] or - end > self._cache[1]): + if self._cache is None or start < self._cache[0] or end > self._cache[1]: for rule in self.rules: rule_holidays = rule.dates(start, end, return_name=True) @@ -464,23 +483,29 @@ def merge(self, other, inplace=False): return holidays -USMemorialDay = Holiday('Memorial Day', month=5, day=31, - offset=DateOffset(weekday=MO(-1))) -USLaborDay = Holiday('Labor Day', month=9, day=1, - offset=DateOffset(weekday=MO(1))) -USColumbusDay = Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))) -USThanksgivingDay = Holiday('Thanksgiving', month=11, day=1, - offset=DateOffset(weekday=TH(4))) -USMartinLutherKingJr = Holiday('Martin Luther King Jr. Day', - start_date=datetime(1986, 1, 1), month=1, day=1, - offset=DateOffset(weekday=MO(3))) -USPresidentsDay = Holiday('Presidents Day', month=2, day=1, - offset=DateOffset(weekday=MO(3))) +USMemorialDay = Holiday( + "Memorial Day", month=5, day=31, offset=DateOffset(weekday=MO(-1)) +) +USLaborDay = Holiday("Labor Day", month=9, day=1, offset=DateOffset(weekday=MO(1))) +USColumbusDay = Holiday( + "Columbus Day", month=10, day=1, offset=DateOffset(weekday=MO(2)) +) +USThanksgivingDay = Holiday( + "Thanksgiving", month=11, day=1, offset=DateOffset(weekday=TH(4)) +) +USMartinLutherKingJr = Holiday( + "Martin Luther King Jr. 
Day", + start_date=datetime(1986, 1, 1), + month=1, + day=1, + offset=DateOffset(weekday=MO(3)), +) +USPresidentsDay = Holiday( + "Presidents Day", month=2, day=1, offset=DateOffset(weekday=MO(3)) +) GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)]) -EasterMonday = Holiday("Easter Monday", month=1, day=1, - offset=[Easter(), Day(1)]) +EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)]) class USFederalHolidayCalendar(AbstractHolidayCalendar): @@ -489,22 +514,22 @@ class USFederalHolidayCalendar(AbstractHolidayCalendar): https://www.opm.gov/policy-data-oversight/ snow-dismissal-procedures/federal-holidays/ """ + rules = [ - Holiday('New Years Day', month=1, day=1, observance=nearest_workday), + Holiday("New Years Day", month=1, day=1, observance=nearest_workday), USMartinLutherKingJr, USPresidentsDay, USMemorialDay, - Holiday('July 4th', month=7, day=4, observance=nearest_workday), + Holiday("July 4th", month=7, day=4, observance=nearest_workday), USLaborDay, USColumbusDay, - Holiday('Veterans Day', month=11, day=11, observance=nearest_workday), + Holiday("Veterans Day", month=11, day=11, observance=nearest_workday), USThanksgivingDay, - Holiday('Christmas', month=12, day=25, observance=nearest_workday) + Holiday("Christmas", month=12, day=25, observance=nearest_workday), ] -def HolidayCalendarFactory(name, base, other, - base_class=AbstractHolidayCalendar): +def HolidayCalendarFactory(name, base, other, base_class=AbstractHolidayCalendar): rules = AbstractHolidayCalendar.merge_class(base, other) calendar_class = type(name, (base_class,), {"rules": rules, "name": name}) return calendar_class diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 087c05574090c..ac3e92c772517 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -7,12 +7,29 @@ import numpy as np from pandas._libs.tslibs import ( - NaT, OutOfBoundsDatetime, Timedelta, Timestamp, ccalendar, conversion, - delta_to_nanoseconds, frequencies as libfrequencies, normalize_date, - offsets as liboffsets, timezones) + NaT, + OutOfBoundsDatetime, + Timedelta, + Timestamp, + ccalendar, + conversion, + delta_to_nanoseconds, + frequencies as libfrequencies, + normalize_date, + offsets as liboffsets, + timezones, +) from pandas._libs.tslibs.offsets import ( - ApplyTypeError, BaseOffset, _get_calendar, _is_normalized, _to_dt64, - apply_index_wraps, as_datetime, roll_yearday, shift_month) + ApplyTypeError, + BaseOffset, + _get_calendar, + _is_normalized, + _to_dt64, + apply_index_wraps, + as_datetime, + roll_yearday, + shift_month, +) from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -21,17 +38,44 @@ from pandas.core.tools.datetimes import to_datetime -__all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', - 'CBMonthEnd', 'CBMonthBegin', - 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', - 'SemiMonthEnd', 'SemiMonthBegin', - 'BusinessHour', 'CustomBusinessHour', - 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', - 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', - 'LastWeekOfMonth', 'FY5253Quarter', 'FY5253', - 'Week', 'WeekOfMonth', 'Easter', - 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset'] +__all__ = [ + "Day", + "BusinessDay", + "BDay", + "CustomBusinessDay", + "CDay", + "CBMonthEnd", + "CBMonthBegin", + "MonthBegin", + "BMonthBegin", + "MonthEnd", + "BMonthEnd", + "SemiMonthEnd", + "SemiMonthBegin", + 
"BusinessHour", + "CustomBusinessHour", + "YearBegin", + "BYearBegin", + "YearEnd", + "BYearEnd", + "QuarterBegin", + "BQuarterBegin", + "QuarterEnd", + "BQuarterEnd", + "LastWeekOfMonth", + "FY5253Quarter", + "FY5253", + "Week", + "WeekOfMonth", + "Easter", + "Hour", + "Minute", + "Second", + "Milli", + "Micro", + "Nano", + "DateOffset", +] # convert to/from datetime/timestamp to allow invalid Timestamp ranges to # pass thru @@ -58,8 +102,8 @@ def wrapper(self, other): elif isinstance(other, (np.datetime64, datetime, date)): other = as_timestamp(other) - tz = getattr(other, 'tzinfo', None) - nano = getattr(other, 'nanosecond', 0) + tz = getattr(other, "tzinfo", None) + nano = getattr(other, "nanosecond", 0) try: if self._adjust_dst and isinstance(other, Timestamp): @@ -80,7 +124,8 @@ def wrapper(self, other): if result.tz is not None: # convert to UTC value = conversion.tz_convert_single( - result.value, timezones.UTC, result.tz) + result.value, timezones.UTC, result.tz + ) else: value = result.value result = Timestamp(value + nano) @@ -101,6 +146,7 @@ def wrapper(self, other): result = Timestamp(result) return result + return wrapper @@ -199,11 +245,11 @@ def __add__(date): >>> ts + DateOffset(month=3) Timestamp('2017-03-01 09:10:11') """ + _params = cache_readonly(BaseOffset._params.fget) _use_relativedelta = False _adjust_dst = False - _attributes = frozenset(['n', 'normalize'] + - list(liboffsets.relativedelta_kwds)) + _attributes = frozenset(["n", "normalize"] + list(liboffsets.relativedelta_kwds)) # default for prior pickles normalize = False @@ -224,7 +270,7 @@ def apply(self, other): other = as_datetime(other) if len(self.kwds) > 0: - tzinfo = getattr(other, 'tzinfo', None) + tzinfo = getattr(other, "tzinfo", None) if tzinfo is not None and self._use_relativedelta: # perform calculation in UTC other = other.replace(tzinfo=None) @@ -261,72 +307,83 @@ def apply_index(self, i): """ if type(self) is not DateOffset: - raise NotImplementedError("DateOffset subclass {name} " - "does not have a vectorized " - "implementation".format( - name=self.__class__.__name__)) + raise NotImplementedError( + "DateOffset subclass {name} " + "does not have a vectorized " + "implementation".format(name=self.__class__.__name__) + ) kwds = self.kwds - relativedelta_fast = {'years', 'months', 'weeks', 'days', 'hours', - 'minutes', 'seconds', 'microseconds'} + relativedelta_fast = { + "years", + "months", + "weeks", + "days", + "hours", + "minutes", + "seconds", + "microseconds", + } # relativedelta/_offset path only valid for base DateOffset - if (self._use_relativedelta and - set(kwds).issubset(relativedelta_fast)): + if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): - months = ((kwds.get('years', 0) * 12 + - kwds.get('months', 0)) * self.n) + months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n if months: shifted = liboffsets.shift_months(i.asi8, months) i = type(i)(shifted, dtype=i.dtype) - weeks = (kwds.get('weeks', 0)) * self.n + weeks = (kwds.get("weeks", 0)) * self.n if weeks: # integer addition on PeriodIndex is deprecated, # so we directly use _time_shift instead - asper = i.to_period('W') + asper = i.to_period("W") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data shifted = asper._time_shift(weeks) - i = shifted.to_timestamp() + i.to_perioddelta('W') + i = shifted.to_timestamp() + i.to_perioddelta("W") - timedelta_kwds = {k: v for k, v in kwds.items() - if k in ['days', 'hours', 'minutes', - 'seconds', 
'microseconds']} + timedelta_kwds = { + k: v + for k, v in kwds.items() + if k in ["days", "hours", "minutes", "seconds", "microseconds"] + } if timedelta_kwds: delta = Timedelta(**timedelta_kwds) i = i + (self.n * delta) return i - elif not self._use_relativedelta and hasattr(self, '_offset'): + elif not self._use_relativedelta and hasattr(self, "_offset"): # timedelta return i + (self._offset * self.n) else: # relativedelta with other keywords kwd = set(kwds) - relativedelta_fast - raise NotImplementedError("DateOffset with relativedelta " - "keyword(s) {kwd} not able to be " - "applied vectorized".format(kwd=kwd)) + raise NotImplementedError( + "DateOffset with relativedelta " + "keyword(s) {kwd} not able to be " + "applied vectorized".format(kwd=kwd) + ) def isAnchored(self): # TODO: Does this make sense for the general case? It would help # if there were a canonical docstring for what isAnchored means. - return (self.n == 1) + return self.n == 1 # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` def _repr_attrs(self): - exclude = {'n', 'inc', 'normalize'} + exclude = {"n", "inc", "normalize"} attrs = [] for attr in sorted(self.__dict__): - if attr.startswith('_') or attr == 'kwds': + if attr.startswith("_") or attr == "kwds": continue elif attr not in exclude: value = getattr(self, attr) - attrs.append('{attr}={value}'.format(attr=attr, value=value)) + attrs.append("{attr}={value}".format(attr=attr, value=value)) - out = '' + out = "" if attrs: - out += ': ' + ', '.join(attrs) + out += ": " + ", ".join(attrs) return out @property @@ -372,13 +429,13 @@ def onOffset(self, dt): # date range generated by this offset. Subclasses may have this # re-implemented in a nicer way. a = dt - b = ((dt + self) - self) + b = (dt + self) - self return a == b # way to get around weirdness with rule_code @property def _prefix(self): - raise NotImplementedError('Prefix not defined') + raise NotImplementedError("Prefix not defined") @property def rule_code(self): @@ -392,7 +449,7 @@ def freqstr(self): return repr(self) if self.n != 1: - fstr = '{n}{code}'.format(n=self.n, code=code) + fstr = "{n}{code}".format(n=self.n, code=code) else: fstr = code @@ -406,7 +463,7 @@ def freqstr(self): return fstr def _offset_str(self): - return '' + return "" @property def nanos(self): @@ -427,10 +484,11 @@ class _CustomMixin: Mixin for classes that define and validate calendar, holidays, and weekdays attributes. """ + def __init__(self, weekmask, holidays, calendar): - calendar, holidays = _get_calendar(weekmask=weekmask, - holidays=holidays, - calendar=calendar) + calendar, holidays = _get_calendar( + weekmask=weekmask, holidays=holidays, calendar=calendar + ) # Custom offset instances are identified by the # following two attributes. See DateOffset._params() # holidays, weekmask @@ -455,12 +513,12 @@ def offset(self): def _repr_attrs(self): if self.offset: - attrs = ['offset={offset!r}'.format(offset=self.offset)] + attrs = ["offset={offset!r}".format(offset=self.offset)] else: attrs = None - out = '' + out = "" if attrs: - out += ': ' + ', '.join(attrs) + out += ": " + ", ".join(attrs) return out @@ -468,9 +526,10 @@ class BusinessDay(BusinessMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n business days. 
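A brief arithmetic sketch of the DateOffset and BusinessDay behaviour touched above; the timestamps are arbitrary example inputs, not part of the patch.

import pandas as pd

ts = pd.Timestamp("2019-07-03 09:10:11")  # a Wednesday

# DateOffset keywords add calendar units; BDay counts business days
# and skips weekends.
print(ts + pd.DateOffset(months=1))   # 2019-08-03 09:10:11
print(ts + pd.offsets.BDay(2))        # 2019-07-05 09:10:11
print(ts + pd.offsets.BDay(3))        # 2019-07-08 09:10:11 (skips the weekend)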
""" - _prefix = 'B' + + _prefix = "B" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'offset']) + _attributes = frozenset(["n", "normalize", "offset"]) def __init__(self, n=1, normalize=False, offset=timedelta(0)): BaseOffset.__init__(self, n, normalize) @@ -478,34 +537,34 @@ def __init__(self, n=1, normalize=False, offset=timedelta(0)): def _offset_str(self): def get_str(td): - off_str = '' + off_str = "" if td.days > 0: - off_str += str(td.days) + 'D' + off_str += str(td.days) + "D" if td.seconds > 0: s = td.seconds hrs = int(s / 3600) if hrs != 0: - off_str += str(hrs) + 'H' + off_str += str(hrs) + "H" s -= hrs * 3600 mts = int(s / 60) if mts != 0: - off_str += str(mts) + 'Min' + off_str += str(mts) + "Min" s -= mts * 60 if s != 0: - off_str += str(s) + 's' + off_str += str(s) + "s" if td.microseconds > 0: - off_str += str(td.microseconds) + 'us' + off_str += str(td.microseconds) + "us" return off_str if isinstance(self.offset, timedelta): zero = timedelta(0, 0, 0) if self.offset >= zero: - off_str = '+' + get_str(self.offset) + off_str = "+" + get_str(self.offset) else: - off_str = '-' + get_str(-self.offset) + off_str = "-" + get_str(-self.offset) return off_str else: - return '+' + repr(self.offset) + return "+" + repr(self.offset) @apply_wraps def apply(self, other): @@ -541,24 +600,24 @@ def apply(self, other): return result elif isinstance(other, (timedelta, Tick)): - return BDay(self.n, offset=self.offset + other, - normalize=self.normalize) + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) else: - raise ApplyTypeError('Only know how to combine business day with ' - 'datetime or timedelta.') + raise ApplyTypeError( + "Only know how to combine business day with " "datetime or timedelta." + ) @apply_index_wraps def apply_index(self, i): - time = i.to_perioddelta('D') + time = i.to_perioddelta("D") # to_period rolls forward to next BDay; track and # reduce n where it does when rolling forward - asper = i.to_period('B') + asper = i.to_period("B") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data if self.n > 0: - shifted = (i.to_perioddelta('B') - time).asi8 != 0 + shifted = (i.to_perioddelta("B") - time).asi8 != 0 # Integer-array addition is deprecated, so we use # _time_shift directly @@ -579,27 +638,26 @@ def onOffset(self, dt): class BusinessHourMixin(BusinessMixin): - - def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): + def __init__(self, start="09:00", end="17:00", offset=timedelta(0)): # must be validated here to equality check if not is_list_like(start): start = [start] if not len(start): - raise ValueError('Must include at least 1 start time') + raise ValueError("Must include at least 1 start time") if not is_list_like(end): end = [end] if not len(end): - raise ValueError('Must include at least 1 end time') + raise ValueError("Must include at least 1 end time") - start = np.array([liboffsets._validate_business_time(x) - for x in start]) + start = np.array([liboffsets._validate_business_time(x) for x in start]) end = np.array([liboffsets._validate_business_time(x) for x in end]) # Validation of input if len(start) != len(end): - raise ValueError('number of starting time and ending time ' - 'must be the same') + raise ValueError( + "number of starting time and ending time " "must be the same" + ) num_openings = len(start) # sort starting and ending time by starting time @@ -613,11 +671,14 @@ def __init__(self, start='09:00', end='17:00', offset=timedelta(0)): for i 
in range(num_openings): total_secs += self._get_business_hours_by_sec(start[i], end[i]) total_secs += self._get_business_hours_by_sec( - end[i], start[(i + 1) % num_openings]) + end[i], start[(i + 1) % num_openings] + ) if total_secs != 24 * 60 * 60: - raise ValueError('invalid starting and ending time(s): ' - 'opening hours should not touch or overlap with ' - 'one another') + raise ValueError( + "invalid starting and ending time(s): " + "opening hours should not touch or overlap with " + "one another" + ) object.__setattr__(self, "start", start) object.__setattr__(self, "end", end) @@ -632,12 +693,14 @@ def next_bday(self): nb_offset = 1 else: nb_offset = -1 - if self._prefix.startswith('C'): + if self._prefix.startswith("C"): # CustomBusinessHour - return CustomBusinessDay(n=nb_offset, - weekmask=self.weekmask, - holidays=self.holidays, - calendar=self.calendar) + return CustomBusinessDay( + n=nb_offset, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) else: return BusinessDay(n=nb_offset) @@ -771,25 +834,31 @@ def _get_closing_time(self, dt): for i, st in enumerate(self.start): if st.hour == dt.hour and st.minute == dt.minute: return dt + timedelta( - seconds=self._get_business_hours_by_sec(st, self.end[i])) + seconds=self._get_business_hours_by_sec(st, self.end[i]) + ) assert False @apply_wraps def apply(self, other): if isinstance(other, datetime): # used for detecting edge condition - nanosecond = getattr(other, 'nanosecond', 0) + nanosecond = getattr(other, "nanosecond", 0) # reset timezone and nanosecond # other may be a Timestamp, thus not use replace - other = datetime(other.year, other.month, other.day, - other.hour, other.minute, - other.second, other.microsecond) + other = datetime( + other.year, + other.month, + other.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) n = self.n # adjust other to reduce number of cases to handle if n >= 0: - if (other.time() in self.end or - not self._onOffset(other)): + if other.time() in self.end or not self._onOffset(other): other = self._next_opening_time(other) else: if other.time() in self.start: @@ -800,8 +869,10 @@ def apply(self, other): other = self._get_closing_time(other) # get total business hours by sec in one business day - businesshours = sum(self._get_business_hours_by_sec(st, en) - for st, en in zip(self.start, self.end)) + businesshours = sum( + self._get_business_hours_by_sec(st, en) + for st, en in zip(self.start, self.end) + ) bd, r = divmod(abs(n * 60), businesshours // 60) if n < 0: @@ -824,8 +895,9 @@ def apply(self, other): if n >= 0: while bhour_remain != timedelta(0): # business hour left in this business time interval - bhour = self._get_closing_time( - self._prev_opening_time(other)) - other + bhour = ( + self._get_closing_time(self._prev_opening_time(other)) - other + ) if bhour_remain < bhour: # finish adjusting if possible other += bhour_remain @@ -838,8 +910,11 @@ def apply(self, other): while bhour_remain != timedelta(0): # business hour left in this business time interval bhour = self._next_opening_time(other) - other - if (bhour_remain > bhour or - bhour_remain == bhour and nanosecond != 0): + if ( + bhour_remain > bhour + or bhour_remain == bhour + and nanosecond != 0 + ): # finish adjusting if possible other += bhour_remain bhour_remain = timedelta(0) @@ -848,20 +923,22 @@ def apply(self, other): bhour_remain -= bhour other = self._get_closing_time( self._next_opening_time( - other + bhour - timedelta(seconds=1))) + other + bhour - 
timedelta(seconds=1) + ) + ) return other else: - raise ApplyTypeError( - 'Only know how to combine business hour with datetime') + raise ApplyTypeError("Only know how to combine business hour with datetime") def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False if dt.tzinfo is not None: - dt = datetime(dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second, dt.microsecond) + dt = datetime( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond + ) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time return self._onOffset(dt) @@ -882,8 +959,7 @@ def _onOffset(self, dt): businesshours = 0 for i, st in enumerate(self.start): if op.hour == st.hour and op.minute == st.minute: - businesshours = self._get_business_hours_by_sec( - st, self.end[i]) + businesshours = self._get_business_hours_by_sec(st, self.end[i]) if span <= businesshours: return True else: @@ -891,11 +967,12 @@ def _onOffset(self, dt): def _repr_attrs(self): out = super()._repr_attrs() - hours = ','.join('{}-{}'.format( - st.strftime('%H:%M'), en.strftime('%H:%M')) - for st, en in zip(self.start, self.end)) - attrs = ['{prefix}={hours}'.format(prefix=self._prefix, hours=hours)] - out += ': ' + ', '.join(attrs) + hours = ",".join( + "{}-{}".format(st.strftime("%H:%M"), en.strftime("%H:%M")) + for st, en in zip(self.start, self.end) + ) + attrs = ["{prefix}={hours}".format(prefix=self._prefix, hours=hours)] + out += ": " + ", ".join(attrs) return out @@ -905,12 +982,14 @@ class BusinessHour(BusinessHourMixin, SingleConstructorOffset): .. versionadded:: 0.16.1 """ - _prefix = 'BH' + + _prefix = "BH" _anchor = 0 - _attributes = frozenset(['n', 'normalize', 'start', 'end', 'offset']) + _attributes = frozenset(["n", "normalize", "start", "end", "offset"]) - def __init__(self, n=1, normalize=False, start='09:00', - end='17:00', offset=timedelta(0)): + def __init__( + self, n=1, normalize=False, start="09:00", end="17:00", offset=timedelta(0) + ): BaseOffset.__init__(self, n, normalize) super().__init__(start=start, end=end, offset=offset) @@ -933,12 +1012,21 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): calendar : pd.HolidayCalendar or np.busdaycalendar offset : timedelta, default timedelta(0) """ - _prefix = 'C' - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', 'offset']) - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): + _prefix = "C" + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -947,16 +1035,17 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', @apply_wraps def apply(self, other): if self.n <= 0: - roll = 'forward' + roll = "forward" else: - roll = 'backward' + roll = "backward" if isinstance(other, datetime): date_in = other np_dt = np.datetime64(date_in.date()) - np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll, - busdaycal=self.calendar) + np_incr_dt = np.busday_offset( + np_dt, self.n, roll=roll, busdaycal=self.calendar + ) dt_date = np_incr_dt.astype(datetime) result = datetime.combine(dt_date, date_in.time()) @@ -966,11 +1055,12 @@ def apply(self, other): return result elif 
isinstance(other, (timedelta, Tick)): - return BDay(self.n, offset=self.offset + other, - normalize=self.normalize) + return BDay(self.n, offset=self.offset + other, normalize=self.normalize) else: - raise ApplyTypeError('Only know how to combine trading day with ' - 'datetime, datetime64 or timedelta.') + raise ApplyTypeError( + "Only know how to combine trading day with " + "datetime, datetime64 or timedelta." + ) def apply_index(self, i): raise NotImplementedError @@ -978,26 +1068,34 @@ def apply_index(self, i): def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False - day64 = _to_dt64(dt, 'datetime64[D]') + day64 = _to_dt64(dt, "datetime64[D]") return np.is_busday(day64, busdaycal=self.calendar) -class CustomBusinessHour(_CustomMixin, BusinessHourMixin, - SingleConstructorOffset): +class CustomBusinessHour(_CustomMixin, BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n custom business days. .. versionadded:: 0.18.1 """ - _prefix = 'CBH' - _anchor = 0 - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', - 'start', 'end', 'offset']) - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, - start='09:00', end='17:00', offset=timedelta(0)): + _prefix = "CBH" + _anchor = 0 + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + start="09:00", + end="17:00", + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -1011,7 +1109,7 @@ def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', class MonthOffset(SingleConstructorOffset): _adjust_dst = True - _attributes = frozenset(['n', 'normalize']) + _attributes = frozenset(["n", "normalize"]) __init__ = BaseOffset.__init__ @@ -1021,8 +1119,7 @@ def name(self): return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] - return "{code}-{month}".format(code=self.rule_code, - month=month) + return "{code}-{month}".format(code=self.rule_code, month=month) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1047,32 +1144,36 @@ class MonthEnd(MonthOffset): """ DateOffset of one month end. """ - _prefix = 'M' - _day_opt = 'end' + + _prefix = "M" + _day_opt = "end" class MonthBegin(MonthOffset): """ DateOffset of one month at beginning. """ - _prefix = 'MS' - _day_opt = 'start' + + _prefix = "MS" + _day_opt = "start" class BusinessMonthEnd(MonthOffset): """ DateOffset increments between business EOM dates. """ - _prefix = 'BM' - _day_opt = 'business_end' + + _prefix = "BM" + _day_opt = "business_end" class BusinessMonthBegin(MonthOffset): """ DateOffset of one business month at beginning. """ - _prefix = 'BMS' - _day_opt = 'business_start' + + _prefix = "BMS" + _day_opt = "business_start" class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): @@ -1097,14 +1198,23 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): offset : timedelta, default timedelta(0) Time offset to apply. 
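A small sketch of the custom-business offsets being reformatted here; the holiday date and opening hours are arbitrary example inputs.

from datetime import datetime
import pandas as pd

# Skip a mid-week holiday: the next business day after July 3rd
# becomes July 5th when July 4th is excluded.
cbd = pd.offsets.CustomBusinessDay(holidays=[datetime(2019, 7, 4)])
print(pd.Timestamp("2019-07-03") + cbd)   # 2019-07-05

# CustomBusinessHour combines the same calendar logic with opening hours:
# adding one hour at 17:00 closes out the 10:00-18:00 session and rolls
# over the July 4th holiday to 2019-07-05 10:00.
cbh = pd.offsets.CustomBusinessHour(
    start="10:00", end="18:00", holidays=[datetime(2019, 7, 4)]
)
print(pd.Timestamp("2019-07-03 17:00") + cbh)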
""" - _attributes = frozenset(['n', 'normalize', - 'weekmask', 'holidays', 'calendar', 'offset']) - onOffset = DateOffset.onOffset # override MonthOffset method + _attributes = frozenset( + ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] + ) + + onOffset = DateOffset.onOffset # override MonthOffset method apply_index = DateOffset.apply_index # override MonthOffset method - def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', - holidays=None, calendar=None, offset=timedelta(0)): + def __init__( + self, + n=1, + normalize=False, + weekmask="Mon Tue Wed Thu Fri", + holidays=None, + calendar=None, + offset=timedelta(0), + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) @@ -1117,7 +1227,7 @@ def cbday_roll(self): """ cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds) - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin roll_func = cbday.rollforward else: @@ -1127,7 +1237,7 @@ def cbday_roll(self): @cache_readonly def m_offset(self): - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin moff = MonthBegin(n=1, normalize=False) else: @@ -1140,7 +1250,7 @@ def month_roll(self): """ Define default roll function to be called in apply method. """ - if self._prefix.endswith('S'): + if self._prefix.endswith("S"): # MonthBegin roll_func = self.m_offset.rollback else: @@ -1165,36 +1275,37 @@ def apply(self, other): @Substitution(bound="end") @Appender(_CustomBusinessMonth.__doc__) class CustomBusinessMonthEnd(_CustomBusinessMonth): - _prefix = 'CBM' + _prefix = "CBM" @Substitution(bound="beginning") @Appender(_CustomBusinessMonth.__doc__) class CustomBusinessMonthBegin(_CustomBusinessMonth): - _prefix = 'CBMS' + _prefix = "CBMS" # --------------------------------------------------------------------- # Semi-Month Based Offset Classes + class SemiMonthOffset(DateOffset): _adjust_dst = True _default_day_of_month = 15 _min_day_of_month = 2 - _attributes = frozenset(['n', 'normalize', 'day_of_month']) + _attributes = frozenset(["n", "normalize", "day_of_month"]) def __init__(self, n=1, normalize=False, day_of_month=None): BaseOffset.__init__(self, n, normalize) if day_of_month is None: - object.__setattr__(self, "day_of_month", - self._default_day_of_month) + object.__setattr__(self, "day_of_month", self._default_day_of_month) else: object.__setattr__(self, "day_of_month", int(day_of_month)) if not self._min_day_of_month <= self.day_of_month <= 27: - msg = 'day_of_month must be {min}<=day_of_month<=27, got {day}' - raise ValueError(msg.format(min=self._min_day_of_month, - day=self.day_of_month)) + msg = "day_of_month must be {min}<=day_of_month<=27, got {day}" + raise ValueError( + msg.format(min=self._min_day_of_month, day=self.day_of_month) + ) @classmethod def _from_name(cls, suffix=None): @@ -1202,7 +1313,7 @@ def _from_name(cls, suffix=None): @property def rule_code(self): - suffix = '-{day_of_month}'.format(day_of_month=self.day_of_month) + suffix = "-{day_of_month}".format(day_of_month=self.day_of_month) return self._prefix + suffix @apply_wraps @@ -1219,8 +1330,7 @@ def apply(self, other): # initially positive. 
if type(self) is SemiMonthBegin and (self.n <= 0 and other.day == 1): n -= 1 - elif type(self) is SemiMonthEnd and (self.n > 0 and - other.day == days_in_month): + elif type(self) is SemiMonthEnd and (self.n > 0 and other.day == days_in_month): n += 1 return self._apply(n, other) @@ -1235,7 +1345,7 @@ def _apply(self, n, other): def apply_index(self, i): # determine how many days away from the 1st of the month we are dti = i - days_from_start = i.to_perioddelta('M').asi8 + days_from_start = i.to_perioddelta("M").asi8 delta = Timedelta(days=self.day_of_month - 1).value # get boolean array for each element before the day_of_month @@ -1248,13 +1358,13 @@ def apply_index(self, i): roll = self._get_roll(i, before_day_of_month, after_day_of_month) # isolate the time since it will be striped away one the next line - time = i.to_perioddelta('D') + time = i.to_perioddelta("D") # apply the correct number of months # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly - asper = i.to_period('M') + asper = i.to_period("M") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data @@ -1296,7 +1406,8 @@ class SemiMonthEnd(SemiMonthOffset): normalize : bool, default False day_of_month : int, {1, 3,...,27}, default 15 """ - _prefix = 'SM' + + _prefix = "SM" _min_day_of_month = 1 def onOffset(self, dt): @@ -1339,7 +1450,7 @@ def _apply_index_days(self, i, roll): result : DatetimeIndex """ nanos = (roll % 2) * Timedelta(days=self.day_of_month).value - i += nanos.astype('timedelta64[ns]') + i += nanos.astype("timedelta64[ns]") return i + Timedelta(days=-1) @@ -1356,7 +1467,8 @@ class SemiMonthBegin(SemiMonthOffset): normalize : bool, default False day_of_month : int, {2, 3,...,27}, default 15 """ - _prefix = 'SMS' + + _prefix = "SMS" def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1397,12 +1509,13 @@ def _apply_index_days(self, i, roll): result : DatetimeIndex """ nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value - return i + nanos.astype('timedelta64[ns]') + return i + nanos.astype("timedelta64[ns]") # --------------------------------------------------------------------- # Week-Based Offset Classes + class Week(DateOffset): """ Weekly offset. @@ -1412,10 +1525,11 @@ class Week(DateOffset): weekday : int, default None Always generate specific day of week. 
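For the semi-month offsets above, a short illustrative sketch with arbitrary example dates.

import pandas as pd

# SemiMonthEnd anchors on day_of_month (default 15) and on month end;
# SemiMonthBegin anchors on the 1st and on day_of_month.
print(pd.Timestamp("2019-01-02") + pd.offsets.SemiMonthEnd())   # 2019-01-15
print(pd.Timestamp("2019-01-20") + pd.offsets.SemiMonthEnd())   # 2019-01-31
print(pd.date_range("2019-01-01", periods=4, freq="SMS"))
# 2019-01-01, 2019-01-15, 2019-02-01, 2019-02-15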
0 for Monday """ + _adjust_dst = True _inc = timedelta(weeks=1) - _prefix = 'W' - _attributes = frozenset(['n', 'normalize', 'weekday']) + _prefix = "W" + _attributes = frozenset(["n", "normalize", "weekday"]) def __init__(self, n=1, normalize=False, weekday=None): BaseOffset.__init__(self, n, normalize) @@ -1423,11 +1537,12 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday is not None: if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) def isAnchored(self): - return (self.n == 1 and self.weekday is not None) + return self.n == 1 and self.weekday is not None @apply_wraps def apply(self, other): @@ -1448,13 +1563,13 @@ def apply_index(self, i): if self.weekday is None: # integer addition on PeriodIndex is deprecated, # so we use _time_shift directly - asper = i.to_period('W') + asper = i.to_period("W") if not isinstance(asper._data, np.ndarray): # unwrap PeriodIndex --> PeriodArray asper = asper._data shifted = asper._time_shift(self.n) - return shifted.to_timestamp() + i.to_perioddelta('W') + return shifted.to_timestamp() + i.to_perioddelta("W") else: return self._end_apply_index(i) @@ -1471,7 +1586,7 @@ def _end_apply_index(self, dtindex): ------- result : DatetimeIndex """ - off = dtindex.to_perioddelta('D') + off = dtindex.to_perioddelta("D") base, mult = libfrequencies.get_freq_code(self.freqstr) base_period = dtindex.to_period(base) @@ -1481,20 +1596,21 @@ def _end_apply_index(self, dtindex): if self.n > 0: # when adding, dates on end roll to next - normed = dtindex - off + Timedelta(1, 'D') - Timedelta(1, 'ns') - roll = np.where(base_period.to_timestamp(how='end') == normed, - self.n, self.n - 1) + normed = dtindex - off + Timedelta(1, "D") - Timedelta(1, "ns") + roll = np.where( + base_period.to_timestamp(how="end") == normed, self.n, self.n - 1 + ) # integer-array addition on PeriodIndex is deprecated, # so we use _addsub_int_array directly shifted = base_period._addsub_int_array(roll, operator.add) - base = shifted.to_timestamp(how='end') + base = shifted.to_timestamp(how="end") else: # integer addition on PeriodIndex is deprecated, # so we use _time_shift directly roll = self.n - base = base_period._time_shift(roll).to_timestamp(how='end') + base = base_period._time_shift(roll).to_timestamp(how="end") - return base + off + Timedelta(1, 'ns') - Timedelta(1, 'D') + return base + off + Timedelta(1, "ns") - Timedelta(1, "D") def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1505,10 +1621,10 @@ def onOffset(self, dt): @property def rule_code(self): - suffix = '' + suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] - suffix = '-{weekday}'.format(weekday=weekday) + suffix = "-{weekday}".format(weekday=weekday) return self._prefix + suffix @classmethod @@ -1524,6 +1640,7 @@ class _WeekOfMonthMixin: """ Mixin for methods common to WeekOfMonth and LastWeekOfMonth. 
""" + @apply_wraps def apply(self, other): compare_day = self._get_offset_day(other) @@ -1534,7 +1651,7 @@ def apply(self, other): elif months <= 0 and compare_day < other.day: months += 1 - shifted = shift_month(other, months, 'start') + shifted = shift_month(other, months, "start") to_day = self._get_offset_day(shifted) return liboffsets.shift_day(shifted, to_day - shifted.day) @@ -1562,9 +1679,10 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): 5: Saturdays 6: Sundays """ - _prefix = 'WOM' + + _prefix = "WOM" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'week', 'weekday']) + _attributes = frozenset(["n", "normalize", "week", "weekday"]) def __init__(self, n=1, normalize=False, week=0, weekday=0): BaseOffset.__init__(self, n, normalize) @@ -1572,11 +1690,13 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): object.__setattr__(self, "week", week) if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) if self.week < 0 or self.week > 3: - raise ValueError('Week must be 0<=week<=3, got {week}' - .format(week=self.week)) + raise ValueError( + "Week must be 0<=week<=3, got {week}".format(week=self.week) + ) def _get_offset_day(self, other): """ @@ -1598,16 +1718,17 @@ def _get_offset_day(self, other): @property def rule_code(self): - weekday = ccalendar.int_to_weekday.get(self.weekday, '') - return '{prefix}-{week}{weekday}'.format(prefix=self._prefix, - week=self.week + 1, - weekday=weekday) + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return "{prefix}-{week}{weekday}".format( + prefix=self._prefix, week=self.week + 1, weekday=weekday + ) @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix {prefix!r} requires a suffix." - .format(prefix=cls._prefix)) + raise ValueError( + "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) + ) # TODO: handle n here... # only one digit weeks (1 --> week 0, 2 --> week 1, etc.) week = int(suffix[0]) - 1 @@ -1632,20 +1753,22 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): 5: Saturdays 6: Sundays """ - _prefix = 'LWOM' + + _prefix = "LWOM" _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'weekday']) + _attributes = frozenset(["n", "normalize", "weekday"]) def __init__(self, n=1, normalize=False, weekday=0): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "weekday", weekday) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") if self.weekday < 0 or self.weekday > 6: - raise ValueError('Day must be 0<=day<=6, got {day}' - .format(day=self.weekday)) + raise ValueError( + "Day must be 0<=day<=6, got {day}".format(day=self.weekday) + ) def _get_offset_day(self, other): """ @@ -1668,19 +1791,20 @@ def _get_offset_day(self, other): @property def rule_code(self): - weekday = ccalendar.int_to_weekday.get(self.weekday, '') - return '{prefix}-{weekday}'.format(prefix=self._prefix, - weekday=weekday) + weekday = ccalendar.int_to_weekday.get(self.weekday, "") + return "{prefix}-{weekday}".format(prefix=self._prefix, weekday=weekday) @classmethod def _from_name(cls, suffix=None): if not suffix: - raise ValueError("Prefix {prefix!r} requires a suffix." - .format(prefix=cls._prefix)) + raise ValueError( + "Prefix {prefix!r} requires a suffix.".format(prefix=cls._prefix) + ) # TODO: handle n here... 
weekday = ccalendar.weekday_to_int[suffix] return cls(weekday=weekday) + # --------------------------------------------------------------------- # Quarter-Based Offset Classes @@ -1689,10 +1813,11 @@ class QuarterOffset(DateOffset): """ Quarter representation - doesn't call super. """ + _default_startingMonth = None # type: Optional[int] - _from_name_startingMonth = None # type: Optional[int] + _from_name_startingMonth = None # type: Optional[int] _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'startingMonth']) + _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some # point. Also apply_index, onOffset, rule_code if # startingMonth vs month attr names are resolved @@ -1705,22 +1830,22 @@ def __init__(self, n=1, normalize=False, startingMonth=None): object.__setattr__(self, "startingMonth", startingMonth) def isAnchored(self): - return (self.n == 1 and self.startingMonth is not None) + return self.n == 1 and self.startingMonth is not None @classmethod def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['startingMonth'] = ccalendar.MONTH_TO_CAL_NUM[suffix] + kwargs["startingMonth"] = ccalendar.MONTH_TO_CAL_NUM[suffix] else: if cls._from_name_startingMonth is not None: - kwargs['startingMonth'] = cls._from_name_startingMonth + kwargs["startingMonth"] = cls._from_name_startingMonth return cls(**kwargs) @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.startingMonth] - return '{prefix}-{month}'.format(prefix=self._prefix, month=month) + return "{prefix}-{month}".format(prefix=self._prefix, month=month) @apply_wraps def apply(self, other): @@ -1730,8 +1855,9 @@ def apply(self, other): # self. `months_since` is the number of months to shift other.month # to get to this on-offset month. months_since = other.month % 3 - self.startingMonth % 3 - qtrs = liboffsets.roll_qtrday(other, self.n, self.startingMonth, - day_opt=self._day_opt, modby=3) + qtrs = liboffsets.roll_qtrday( + other, self.n, self.startingMonth, day_opt=self._day_opt, modby=3 + ) months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) @@ -1743,12 +1869,14 @@ def onOffset(self, dt): @apply_index_wraps def apply_index(self, dtindex): - shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, - self.startingMonth, self._day_opt) + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.startingMonth, self._day_opt + ) # TODO: going through __new__ raises on call to _validate_frequency; # are we passing incorrect freq? - return type(dtindex)._simple_new(shifted, freq=dtindex.freq, - dtype=dtindex.dtype) + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) class BQuarterEnd(QuarterOffset): @@ -1759,11 +1887,12 @@ class BQuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... """ - _outputName = 'BusinessQuarterEnd' + + _outputName = "BusinessQuarterEnd" _default_startingMonth = 3 _from_name_startingMonth = 12 - _prefix = 'BQ' - _day_opt = 'business_end' + _prefix = "BQ" + _day_opt = "business_end" # TODO: This is basically the same as BQuarterEnd @@ -1772,8 +1901,8 @@ class BQuarterBegin(QuarterOffset): # I suspect this is wrong for *all* of them. 
_default_startingMonth = 3 _from_name_startingMonth = 1 - _prefix = 'BQS' - _day_opt = 'business_start' + _prefix = "BQS" + _day_opt = "business_start" class QuarterEnd(QuarterOffset): @@ -1784,35 +1913,39 @@ class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... """ - _outputName = 'QuarterEnd' + + _outputName = "QuarterEnd" _default_startingMonth = 3 - _prefix = 'Q' - _day_opt = 'end' + _prefix = "Q" + _day_opt = "end" class QuarterBegin(QuarterOffset): - _outputName = 'QuarterBegin' + _outputName = "QuarterBegin" _default_startingMonth = 3 _from_name_startingMonth = 1 - _prefix = 'QS' - _day_opt = 'start' + _prefix = "QS" + _day_opt = "start" # --------------------------------------------------------------------- # Year-Based Offset Classes + class YearOffset(DateOffset): """ DateOffset that just needs a month. """ + _adjust_dst = True - _attributes = frozenset(['n', 'normalize', 'month']) + _attributes = frozenset(["n", "normalize", "month"]) def _get_offset_day(self, other): # override BaseOffset method to use self.month instead of other.month # TODO: there may be a more performant way to do this - return liboffsets.get_day_of_month(other.replace(month=self.month), - self._day_opt) + return liboffsets.get_day_of_month( + other.replace(month=self.month), self._day_opt + ) @apply_wraps def apply(self, other): @@ -1822,13 +1955,14 @@ def apply(self, other): @apply_index_wraps def apply_index(self, dtindex): - shifted = liboffsets.shift_quarters(dtindex.asi8, self.n, - self.month, self._day_opt, - modby=12) + shifted = liboffsets.shift_quarters( + dtindex.asi8, self.n, self.month, self._day_opt, modby=12 + ) # TODO: going through __new__ raises on call to _validate_frequency; # are we passing incorrect freq? - return type(dtindex)._simple_new(shifted, freq=dtindex.freq, - dtype=dtindex.dtype) + return type(dtindex)._simple_new( + shifted, freq=dtindex.freq, dtype=dtindex.dtype + ) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1842,62 +1976,67 @@ def __init__(self, n=1, normalize=False, month=None): object.__setattr__(self, "month", month) if self.month < 1 or self.month > 12: - raise ValueError('Month must go from 1 to 12') + raise ValueError("Month must go from 1 to 12") @classmethod def _from_name(cls, suffix=None): kwargs = {} if suffix: - kwargs['month'] = ccalendar.MONTH_TO_CAL_NUM[suffix] + kwargs["month"] = ccalendar.MONTH_TO_CAL_NUM[suffix] return cls(**kwargs) @property def rule_code(self): month = ccalendar.MONTH_ALIASES[self.month] - return '{prefix}-{month}'.format(prefix=self._prefix, month=month) + return "{prefix}-{month}".format(prefix=self._prefix, month=month) class BYearEnd(YearOffset): """ DateOffset increments between business EOM dates. """ - _outputName = 'BusinessYearEnd' + + _outputName = "BusinessYearEnd" _default_month = 12 - _prefix = 'BA' - _day_opt = 'business_end' + _prefix = "BA" + _day_opt = "business_end" class BYearBegin(YearOffset): """ DateOffset increments between business year begin dates. """ - _outputName = 'BusinessYearBegin' + + _outputName = "BusinessYearBegin" _default_month = 1 - _prefix = 'BAS' - _day_opt = 'business_start' + _prefix = "BAS" + _day_opt = "business_start" class YearEnd(YearOffset): """ DateOffset increments between calendar year ends. 
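A small sketch of the anchored quarter and year aliases these classes map to; the dates are arbitrary examples.

import pandas as pd

# "Q-NOV" = QuarterEnd(startingMonth=11): quarters end in Feb/May/Aug/Nov.
print(pd.date_range("2019-01-01", periods=4, freq="Q-NOV"))
# 2019-02-28, 2019-05-31, 2019-08-31, 2019-11-30

# "A-JUN" = YearEnd(month=6): a fiscal year ending in June.
print(pd.Timestamp("2019-01-01") + pd.offsets.YearEnd(month=6))
# 2019-06-30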
""" + _default_month = 12 - _prefix = 'A' - _day_opt = 'end' + _prefix = "A" + _day_opt = "end" class YearBegin(YearOffset): """ DateOffset increments between calendar year begin dates. """ + _default_month = 1 - _prefix = 'AS' - _day_opt = 'start' + _prefix = "AS" + _day_opt = "start" # --------------------------------------------------------------------- # Special Offset Classes + class FY5253(DateOffset): """ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. @@ -1934,12 +2073,14 @@ class FY5253(DateOffset): variation : str {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" """ - _prefix = 'RE' + + _prefix = "RE" _adjust_dst = True - _attributes = frozenset(['weekday', 'startingMonth', 'variation']) + _attributes = frozenset(["weekday", "startingMonth", "variation"]) - def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, - variation="nearest"): + def __init__( + self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest" + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "startingMonth", startingMonth) object.__setattr__(self, "weekday", weekday) @@ -1947,16 +2088,17 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, object.__setattr__(self, "variation", variation) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") if self.variation not in ["nearest", "last"]: - raise ValueError('{variation} is not a valid variation' - .format(variation=self.variation)) + raise ValueError( + "{variation} is not a valid variation".format(variation=self.variation) + ) def isAnchored(self): - return (self.n == 1 and - self.startingMonth is not None and - self.weekday is not None) + return ( + self.n == 1 and self.startingMonth is not None and self.weekday is not None + ) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1966,8 +2108,7 @@ def onOffset(self, dt): if self.variation == "nearest": # We have to check the year end of "this" cal year AND the previous - return (year_end == dt or - self.get_year_end(shift_month(dt, -1, None)) == dt) + return year_end == dt or self.get_year_end(shift_month(dt, -1, None)) == dt else: return year_end == dt @@ -1976,12 +2117,9 @@ def apply(self, other): norm = Timestamp(other).normalize() n = self.n - prev_year = self.get_year_end( - datetime(other.year - 1, self.startingMonth, 1)) - cur_year = self.get_year_end( - datetime(other.year, self.startingMonth, 1)) - next_year = self.get_year_end( - datetime(other.year + 1, self.startingMonth, 1)) + prev_year = self.get_year_end(datetime(other.year - 1, self.startingMonth, 1)) + cur_year = self.get_year_end(datetime(other.year, self.startingMonth, 1)) + next_year = self.get_year_end(datetime(other.year + 1, self.startingMonth, 1)) prev_year = conversion.localize_pydatetime(prev_year, other.tzinfo) cur_year = conversion.localize_pydatetime(cur_year, other.tzinfo) @@ -2005,8 +2143,11 @@ def apply(self, other): n += 1 elif prev_year < norm < cur_year: pass - elif (norm.year == prev_year.year and norm < prev_year and - prev_year - norm <= timedelta(6)): + elif ( + norm.year == prev_year.year + and norm < prev_year + and prev_year - norm <= timedelta(6) + ): # GH#14774, error when next_year.year == cur_year.year # e.g. 
prev_year == datetime(2004, 1, 3), # other == datetime(2004, 1, 1) @@ -2016,9 +2157,15 @@ def apply(self, other): shifted = datetime(other.year + n, self.startingMonth, 1) result = self.get_year_end(shifted) - result = datetime(result.year, result.month, result.day, - other.hour, other.minute, other.second, - other.microsecond) + result = datetime( + result.year, + result.month, + result.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) return result def get_year_end(self, dt): @@ -2055,16 +2202,17 @@ def rule_code(self): def _get_suffix_prefix(self): if self.variation == "nearest": - return 'N' + return "N" else: - return 'L' + return "L" def get_rule_code_suffix(self): prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] - return '{prefix}-{month}-{weekday}'.format(prefix=prefix, month=month, - weekday=weekday) + return "{prefix}-{month}-{weekday}".format( + prefix=prefix, month=month, weekday=weekday + ) @classmethod def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): @@ -2073,15 +2221,18 @@ def _parse_suffix(cls, varion_code, startingMonth_code, weekday_code): elif varion_code == "L": variation = "last" else: - raise ValueError("Unable to parse varion_code: " - "{code}".format(code=varion_code)) + raise ValueError( + "Unable to parse varion_code: " "{code}".format(code=varion_code) + ) startingMonth = ccalendar.MONTH_TO_CAL_NUM[startingMonth_code] weekday = ccalendar.weekday_to_int[weekday_code] - return {"weekday": weekday, - "startingMonth": startingMonth, - "variation": variation} + return { + "weekday": weekday, + "startingMonth": startingMonth, + "variation": variation, + } @classmethod def _from_name(cls, *args): @@ -2132,13 +2283,21 @@ class FY5253Quarter(DateOffset): {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" """ - _prefix = 'REQ' + _prefix = "REQ" _adjust_dst = True - _attributes = frozenset(['weekday', 'startingMonth', 'qtr_with_extra_week', - 'variation']) - - def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, - qtr_with_extra_week=1, variation="nearest"): + _attributes = frozenset( + ["weekday", "startingMonth", "qtr_with_extra_week", "variation"] + ) + + def __init__( + self, + n=1, + normalize=False, + weekday=0, + startingMonth=1, + qtr_with_extra_week=1, + variation="nearest", + ): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "startingMonth", startingMonth) @@ -2147,13 +2306,15 @@ def __init__(self, n=1, normalize=False, weekday=0, startingMonth=1, object.__setattr__(self, "variation", variation) if self.n == 0: - raise ValueError('N cannot be 0') + raise ValueError("N cannot be 0") @cache_readonly def _offset(self): - return FY5253(startingMonth=self.startingMonth, - weekday=self.weekday, - variation=self.variation) + return FY5253( + startingMonth=self.startingMonth, + weekday=self.weekday, + variation=self.variation, + ) def isAnchored(self): return self.n == 1 and self._offset.isAnchored() @@ -2275,13 +2436,15 @@ def onOffset(self, dt): def rule_code(self): suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week - return "{prefix}-{suffix}-{qtr}".format(prefix=self._prefix, - suffix=suffix, qtr=qtr) + return "{prefix}-{suffix}-{qtr}".format( + prefix=self._prefix, suffix=suffix, qtr=qtr + ) @classmethod def _from_name(cls, *args): - return cls(**dict(FY5253._parse_suffix(*args[:-1]), - qtr_with_extra_week=int(args[-1]))) + return cls( + 
**dict(FY5253._parse_suffix(*args[:-1]), qtr_with_extra_week=int(args[-1])) + ) class Easter(DateOffset): @@ -2290,18 +2453,19 @@ class Easter(DateOffset): Right now uses the revised method which is valid in years 1583-4099. """ + _adjust_dst = True - _attributes = frozenset(['n', 'normalize']) + _attributes = frozenset(["n", "normalize"]) __init__ = BaseOffset.__init__ @apply_wraps def apply(self, other): current_easter = easter(other.year) - current_easter = datetime(current_easter.year, - current_easter.month, current_easter.day) - current_easter = conversion.localize_pydatetime(current_easter, - other.tzinfo) + current_easter = datetime( + current_easter.year, current_easter.month, current_easter.day + ) + current_easter = conversion.localize_pydatetime(current_easter, other.tzinfo) n = self.n if n >= 0 and other < current_easter: @@ -2313,8 +2477,15 @@ def apply(self, other): # NOTE: easter returns a datetime.date so we have to convert to type of # other new = easter(other.year + n) - new = datetime(new.year, new.month, new.day, other.hour, - other.minute, other.second, other.microsecond) + new = datetime( + new.year, + new.month, + new.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) return new def onOffset(self, dt): @@ -2322,6 +2493,7 @@ def onOffset(self, dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) + # --------------------------------------------------------------------- # Ticks @@ -2334,24 +2506,27 @@ def f(self, other): return op(self.delta, other.delta) except AttributeError: # comparing with a non-Tick object - raise TypeError("Invalid comparison between {cls} and {typ}" - .format(cls=type(self).__name__, - typ=type(other).__name__)) + raise TypeError( + "Invalid comparison between {cls} and {typ}".format( + cls=type(self).__name__, typ=type(other).__name__ + ) + ) - f.__name__ = '__{opname}__'.format(opname=op.__name__) + f.__name__ = "__{opname}__".format(opname=op.__name__) return f class Tick(liboffsets._Tick, SingleConstructorOffset): _inc = Timedelta(microseconds=1000) - _prefix = 'undefined' - _attributes = frozenset(['n', 'normalize']) + _prefix = "undefined" + _attributes = frozenset(["n", "normalize"]) def __init__(self, n=1, normalize=False): BaseOffset.__init__(self, n, normalize) if normalize: - raise ValueError("Tick offset with `normalize=True` are not " - "allowed.") # GH#21427 + raise ValueError( + "Tick offset with `normalize=True` are not " "allowed." 
+ ) # GH#21427 __gt__ = _tick_comp(operator.gt) __ge__ = _tick_comp(operator.ge) @@ -2371,12 +2546,15 @@ def __add__(self, other): except ApplyTypeError: return NotImplemented except OverflowError: - raise OverflowError("the add operation between {self} and {other} " - "will overflow".format(self=self, other=other)) + raise OverflowError( + "the add operation between {self} and {other} " + "will overflow".format(self=self, other=other) + ) def __eq__(self, other): if isinstance(other, str): from pandas.tseries.frequencies import to_offset + try: # GH#23524 if to_offset fails, we are dealing with an # incomparable type so == is False and != is True @@ -2398,6 +2576,7 @@ def __hash__(self): def __ne__(self, other): if isinstance(other, str): from pandas.tseries.frequencies import to_offset + try: # GH#23524 if to_offset fails, we are dealing with an # incomparable type so == is False and != is True @@ -2441,8 +2620,9 @@ def apply(self, other): elif isinstance(other, type(self)): return type(self)(self.n + other.n) - raise ApplyTypeError('Unhandled type: {type_str}' - .format(type_str=type(other).__name__)) + raise ApplyTypeError( + "Unhandled type: {type_str}".format(type_str=type(other).__name__) + ) def isAnchored(self): return False @@ -2473,37 +2653,37 @@ def _delta_to_tick(delta): class Day(Tick): _inc = Timedelta(days=1) - _prefix = 'D' + _prefix = "D" class Hour(Tick): _inc = Timedelta(hours=1) - _prefix = 'H' + _prefix = "H" class Minute(Tick): _inc = Timedelta(minutes=1) - _prefix = 'T' + _prefix = "T" class Second(Tick): _inc = Timedelta(seconds=1) - _prefix = 'S' + _prefix = "S" class Milli(Tick): _inc = Timedelta(milliseconds=1) - _prefix = 'L' + _prefix = "L" class Micro(Tick): _inc = Timedelta(microseconds=1) - _prefix = 'U' + _prefix = "U" class Nano(Tick): _inc = Timedelta(nanoseconds=1) - _prefix = 'N' + _prefix = "N" BDay = BusinessDay @@ -2541,6 +2721,7 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): dates : generator object """ from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) start = to_datetime(start) @@ -2575,8 +2756,9 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date <= cur: - raise ValueError('Offset {offset} did not increment date' - .format(offset=offset)) + raise ValueError( + "Offset {offset} did not increment date".format(offset=offset) + ) cur = next_date else: while cur >= end: @@ -2590,41 +2772,45 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): # faster than cur + offset next_date = offset.apply(cur) if next_date >= cur: - raise ValueError('Offset {offset} did not decrement date' - .format(offset=offset)) + raise ValueError( + "Offset {offset} did not decrement date".format(offset=offset) + ) cur = next_date -prefix_mapping = {offset._prefix: offset for offset in [ - YearBegin, # 'AS' - YearEnd, # 'A' - BYearBegin, # 'BAS' - BYearEnd, # 'BA' - BusinessDay, # 'B' - BusinessMonthBegin, # 'BMS' - BusinessMonthEnd, # 'BM' - BQuarterEnd, # 'BQ' - BQuarterBegin, # 'BQS' - BusinessHour, # 'BH' - CustomBusinessDay, # 'C' - CustomBusinessMonthEnd, # 'CBM' - CustomBusinessMonthBegin, # 'CBMS' - CustomBusinessHour, # 'CBH' - MonthEnd, # 'M' - MonthBegin, # 'MS' - Nano, # 'N' - SemiMonthEnd, # 'SM' - SemiMonthBegin, # 'SMS' - Week, # 'W' - Second, # 'S' - Minute, # 'T' - Micro, # 'U' - QuarterEnd, # 'Q' - QuarterBegin, # 'QS' - Milli, # 'L' - Hour, # 'H' - Day, # 'D' - WeekOfMonth, # 'WOM' - FY5253, 
- FY5253Quarter -]} +prefix_mapping = { + offset._prefix: offset + for offset in [ + YearBegin, # 'AS' + YearEnd, # 'A' + BYearBegin, # 'BAS' + BYearEnd, # 'BA' + BusinessDay, # 'B' + BusinessMonthBegin, # 'BMS' + BusinessMonthEnd, # 'BM' + BQuarterEnd, # 'BQ' + BQuarterBegin, # 'BQS' + BusinessHour, # 'BH' + CustomBusinessDay, # 'C' + CustomBusinessMonthEnd, # 'CBM' + CustomBusinessMonthBegin, # 'CBMS' + CustomBusinessHour, # 'CBH' + MonthEnd, # 'M' + MonthBegin, # 'MS' + Nano, # 'N' + SemiMonthEnd, # 'SM' + SemiMonthBegin, # 'SMS' + Week, # 'W' + Second, # 'S' + Minute, # 'T' + Micro, # 'U' + QuarterEnd, # 'Q' + QuarterBegin, # 'QS' + Milli, # 'L' + Hour, # 'H' + Day, # 'D' + WeekOfMonth, # 'WOM' + FY5253, + FY5253Quarter, + ] +} diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index 9600109f01534..d906c0371d207 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,4 +1,3 @@ -from pandas.util._decorators import ( # noqa - Appender, Substitution, cache_readonly) +from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index cdda02324ba06..f39020f4165df 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -6,8 +6,9 @@ from pandas._libs.properties import cache_readonly # noqa -def deprecate(name, alternative, version, alt_name=None, - klass=None, stacklevel=2, msg=None): +def deprecate( + name, alternative, version, alt_name=None, klass=None, stacklevel=2, msg=None +): """ Return a new function that emits a deprecation warning on use. @@ -36,8 +37,7 @@ def deprecate(name, alternative, version, alt_name=None, alt_name = alt_name or alternative.__name__ klass = klass or FutureWarning - warning_msg = msg or '{} is deprecated, use {} instead'.format(name, - alt_name) + warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) def wrapper(*args, **kwargs): @@ -45,31 +45,37 @@ def wrapper(*args, **kwargs): return alternative(*args, **kwargs) # adding deprecated directive to the docstring - msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name) - doc_error_msg = ('deprecate needs a correctly formatted docstring in ' - 'the target function (should have a one liner short ' - 'summary, and opening quotes should be in their own ' - 'line). Found:\n{}'.format(alternative.__doc__)) + msg = msg or "Use `{alt_name}` instead.".format(alt_name=alt_name) + doc_error_msg = ( + "deprecate needs a correctly formatted docstring in " + "the target function (should have a one liner short " + "summary, and opening quotes should be in their own " + "line). Found:\n{}".format(alternative.__doc__) + ) # when python is running in optimized mode (i.e. `-OO`), docstrings are # removed, so we check that a docstring with correct formatting is used # but we allow empty docstrings if alternative.__doc__: - if alternative.__doc__.count('\n') < 3: + if alternative.__doc__.count("\n") < 3: raise AssertionError(doc_error_msg) - empty1, summary, empty2, doc = alternative.__doc__.split('\n', 3) + empty1, summary, empty2, doc = alternative.__doc__.split("\n", 3) if empty1 or empty2 and not summary: raise AssertionError(doc_error_msg) - wrapper.__doc__ = dedent(""" + wrapper.__doc__ = dedent( + """ {summary} .. 
deprecated:: {depr_version} {depr_msg} - {rest_of_docstring}""").format(summary=summary.strip(), - depr_version=version, - depr_msg=msg, - rest_of_docstring=dedent(doc)) + {rest_of_docstring}""" + ).format( + summary=summary.strip(), + depr_version=version, + depr_msg=msg, + rest_of_docstring=dedent(doc), + ) return wrapper @@ -137,10 +143,10 @@ def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2): should raise warning """ - if mapping is not None and not hasattr(mapping, 'get') and \ - not callable(mapping): - raise TypeError("mapping from old to new argument values " - "must be dict or callable!") + if mapping is not None and not hasattr(mapping, "get") and not callable(mapping): + raise TypeError( + "mapping from old to new argument values " "must be dict or callable!" + ) def _deprecate_kwarg(func): @wraps(func) @@ -159,34 +165,38 @@ def wrapper(*args, **kwargs): if old_arg_value is not None: if mapping is not None: - if hasattr(mapping, 'get'): - new_arg_value = mapping.get(old_arg_value, - old_arg_value) + if hasattr(mapping, "get"): + new_arg_value = mapping.get(old_arg_value, old_arg_value) else: new_arg_value = mapping(old_arg_value) - msg = ("the {old_name}={old_val!r} keyword is deprecated, " - "use {new_name}={new_val!r} instead" - ).format(old_name=old_arg_name, - old_val=old_arg_value, - new_name=new_arg_name, - new_val=new_arg_value) + msg = ( + "the {old_name}={old_val!r} keyword is deprecated, " + "use {new_name}={new_val!r} instead" + ).format( + old_name=old_arg_name, + old_val=old_arg_value, + new_name=new_arg_name, + new_val=new_arg_value, + ) else: new_arg_value = old_arg_value - msg = ("the '{old_name}' keyword is deprecated, " - "use '{new_name}' instead" - ).format(old_name=old_arg_name, - new_name=new_arg_name) + msg = ( + "the '{old_name}' keyword is deprecated, " + "use '{new_name}' instead" + ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) if kwargs.get(new_arg_name, None) is not None: - msg = ("Can only specify '{old_name}' or '{new_name}', " - "not both").format(old_name=old_arg_name, - new_name=new_arg_name) + msg = ( + "Can only specify '{old_name}' or '{new_name}', " "not both" + ).format(old_name=old_arg_name, new_name=new_arg_name) raise TypeError(msg) else: kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) + return wrapper + return _deprecate_kwarg @@ -198,11 +208,11 @@ def wrapper(*args, **kwargs): kind = inspect.Parameter.POSITIONAL_OR_KEYWORD params = [ - inspect.Parameter('self', kind), + inspect.Parameter("self", kind), inspect.Parameter(name, kind, default=None), - inspect.Parameter('index', kind, default=None), - inspect.Parameter('columns', kind, default=None), - inspect.Parameter('axis', kind, default=None), + inspect.Parameter("index", kind, default=None), + inspect.Parameter("columns", kind, default=None), + inspect.Parameter("axis", kind, default=None), ] for pname, default in extra_params: @@ -212,8 +222,10 @@ def wrapper(*args, **kwargs): func.__signature__ = sig return wrapper + return decorate + # Substitution and Appender are derived from matplotlib.docstring (1.1.0) # module http://matplotlib.org/users/license.html @@ -248,7 +260,7 @@ def some_function(x): """ def __init__(self, *args, **kwargs): - if (args and kwargs): + if args and kwargs: raise AssertionError("Only positional or keyword args are allowed") self.params = args or kwargs @@ -299,7 +311,7 @@ def my_dog(has='fleas'): pass """ - def __init__(self, addendum, join='', 
indents=0): + def __init__(self, addendum, join="", indents=0): if indents > 0: self.addendum = indent(addendum, indents=indents) else: @@ -307,8 +319,8 @@ def __init__(self, addendum, join='', indents=0): self.join = join def __call__(self, func): - func.__doc__ = func.__doc__ if func.__doc__ else '' - self.addendum = self.addendum if self.addendum else '' + func.__doc__ = func.__doc__ if func.__doc__ else "" + self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] func.__doc__ = dedent(self.join.join(docitems)) return func @@ -316,6 +328,6 @@ def __call__(self, func): def indent(text, indents=1): if not text or not isinstance(text, str): - return '' - jointext = ''.join(['\n'] + [' '] * indents) - return jointext.join(text.split('\n')) + return "" + jointext = "".join(["\n"] + [" "] * indents) + return jointext.join(text.split("\n")) diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 714ea1ce8086f..54f090ede3fc4 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -23,8 +23,7 @@ class _DeprecatedModule: objects """ - def __init__(self, deprmod, deprmodto=None, removals=None, - moved=None): + def __init__(self, deprmod, deprmodto=None, removals=None, moved=None): self.deprmod = deprmod self.deprmodto = deprmodto self.removals = removals @@ -64,23 +63,27 @@ def __getattr__(self, name): warnings.warn( "{deprmod}.{name} is deprecated and will be removed in " "a future version.".format(deprmod=self.deprmod, name=name), - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) elif self.moved is not None and name in self.moved: warnings.warn( "{deprmod} is deprecated and will be removed in " "a future version.\nYou can access {name} as {moved}".format( - deprmod=self.deprmod, - name=name, - moved=self.moved[name]), - FutureWarning, stacklevel=2) + deprmod=self.deprmod, name=name, moved=self.moved[name] + ), + FutureWarning, + stacklevel=2, + ) else: deprmodto = self.deprmodto if deprmodto is False: warnings.warn( "{deprmod}.{name} is deprecated and will be removed in " - "a future version.".format( - deprmod=self.deprmod, name=name), - FutureWarning, stacklevel=2) + "a future version.".format(deprmod=self.deprmod, name=name), + FutureWarning, + stacklevel=2, + ) else: if deprmodto is None: deprmodto = obj.__module__ @@ -88,8 +91,11 @@ def __getattr__(self, name): warnings.warn( "{deprmod}.{name} is deprecated. 
Please use " "{deprmodto}.{name} instead.".format( - deprmod=self.deprmod, name=name, deprmodto=deprmodto), - FutureWarning, stacklevel=2) + deprmod=self.deprmod, name=name, deprmodto=deprmodto + ), + FutureWarning, + stacklevel=2, + ) return obj @@ -98,6 +104,6 @@ def _import_deprmod(self, mod=None): mod = self.deprmod with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=FutureWarning) + warnings.filterwarnings("ignore", category=FutureWarning) deprmodule = importlib.import_module(mod) return deprmodule diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 3a64f86a6668d..11156bc972857 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -29,15 +29,11 @@ def _get_cells(self, left, right, vertical): if vertical: # calculate required number of cells - vcells = max(sum(self._shape(l)[0] for l in left), - self._shape(right)[0]) - hcells = (max(self._shape(l)[1] for l in left) + - self._shape(right)[1]) + vcells = max(sum(self._shape(l)[0] for l in left), self._shape(right)[0]) + hcells = max(self._shape(l)[1] for l in left) + self._shape(right)[1] else: - vcells = max([self._shape(l)[0] for l in left] + - [self._shape(right)[0]]) - hcells = sum([self._shape(l)[1] for l in left] + - [self._shape(right)[1]]) + vcells = max([self._shape(l)[0] for l in left] + [self._shape(right)[0]]) + hcells = sum([self._shape(l)[1] for l in left] + [self._shape(right)[1]]) return hcells, vcells def plot(self, left, right, labels=None, vertical=True): @@ -76,11 +72,10 @@ def plot(self, left, right, labels=None, vertical=True): max_left_rows = max(self._shape(l)[0] for l in left) for i, (l, label) in enumerate(zip(left, labels)): ax = fig.add_subplot(gs[i, 0:max_left_cols]) - self._make_table(ax, l, title=label, - height=1.0 / max_left_rows) + self._make_table(ax, l, title=label, height=1.0 / max_left_rows) # right ax = plt.subplot(gs[:, max_left_cols:]) - self._make_table(ax, right, title='Result', height=1.05 / vcells) + self._make_table(ax, right, title="Result", height=1.05 / vcells) fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95) else: max_rows = max(self._shape(df)[0] for df in left + [right]) @@ -90,12 +85,12 @@ def plot(self, left, right, labels=None, vertical=True): i = 0 for l, label in zip(left, labels): sp = self._shape(l) - ax = fig.add_subplot(gs[0, i:i + sp[1]]) + ax = fig.add_subplot(gs[0, i : i + sp[1]]) self._make_table(ax, l, title=label, height=height) i += sp[1] # right ax = plt.subplot(gs[0, i:]) - self._make_table(ax, right, title='Result', height=height) + self._make_table(ax, right, title="Result", height=height) fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95) return fig @@ -104,10 +99,10 @@ def _conv(self, data): """Convert each input to appropriate for table outplot""" if isinstance(data, pd.Series): if data.name is None: - data = data.to_frame(name='') + data = data.to_frame(name="") else: data = data.to_frame() - data = data.fillna('NaN') + data = data.fillna("NaN") return data def _insert_index(self, data): @@ -115,17 +110,17 @@ def _insert_index(self, data): data = data.copy() idx_nlevels = data.index.nlevels if idx_nlevels == 1: - data.insert(0, 'Index', data.index) + data.insert(0, "Index", data.index) else: for i in range(idx_nlevels): - data.insert(i, 'Index{0}'.format(i), - data.index._get_level_values(i)) + data.insert(i, "Index{0}".format(i), data.index._get_level_values(i)) col_nlevels = data.columns.nlevels if col_nlevels > 1: col = data.columns._get_level_values(0) - values = 
[data.columns._get_level_values(i).values - for i in range(1, col_nlevels)] + values = [ + data.columns._get_level_values(i).values for i in range(1, col_nlevels) + ] col_df = pd.DataFrame(values) data.columns = col_df.columns data = pd.concat([col_df, data]) @@ -150,17 +145,17 @@ def _make_table(self, ax, df, title, height=None): height = 1.0 / (len(df) + 1) props = tb.properties() - for (r, c), cell in props['celld'].items(): + for (r, c), cell in props["celld"].items(): if c == -1: cell.set_visible(False) elif r < col_nlevels and c < idx_nlevels: cell.set_visible(False) elif r < col_nlevels or c < idx_nlevels: - cell.set_facecolor('#AAAAAA') + cell.set_facecolor("#AAAAAA") cell.set_height(height) ax.set_title(title, size=self.font_size) - ax.axis('off') + ax.axis("off") if __name__ == "__main__": @@ -168,29 +163,24 @@ def _make_table(self, ax, df, title, height=None): p = TablePlotter() - df1 = pd.DataFrame({'A': [10, 11, 12], - 'B': [20, 21, 22], - 'C': [30, 31, 32]}) - df2 = pd.DataFrame({'A': [10, 12], - 'C': [30, 32]}) + df1 = pd.DataFrame({"A": [10, 11, 12], "B": [20, 21, 22], "C": [30, 31, 32]}) + df2 = pd.DataFrame({"A": [10, 12], "C": [30, 32]}) - p.plot([df1, df2], pd.concat([df1, df2]), - labels=['df1', 'df2'], vertical=True) + p.plot([df1, df2], pd.concat([df1, df2]), labels=["df1", "df2"], vertical=True) plt.show() - df3 = pd.DataFrame({'X': [10, 12], - 'Z': [30, 32]}) + df3 = pd.DataFrame({"X": [10, 12], "Z": [30, 32]}) - p.plot([df1, df3], pd.concat([df1, df3], axis=1), - labels=['df1', 'df2'], vertical=False) + p.plot( + [df1, df3], pd.concat([df1, df3], axis=1), labels=["df1", "df2"], vertical=False + ) plt.show() - idx = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B'), (1, 'C'), - (2, 'A'), (2, 'B'), (2, 'C')]) - col = pd.MultiIndex.from_tuples([(1, 'A'), (1, 'B')]) - df3 = pd.DataFrame({'v1': [1, 2, 3, 4, 5, 6], - 'v2': [5, 6, 7, 8, 9, 10]}, - index=idx) + idx = pd.MultiIndex.from_tuples( + [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")] + ) + col = pd.MultiIndex.from_tuples([(1, "A"), (1, "B")]) + df3 = pd.DataFrame({"v1": [1, 2, 3, 4, 5, 6], "v2": [5, 6, 7, 8, 9, 10]}, index=idx) df3.columns = col - p.plot(df3, df3, labels=['df3']) + p.plot(df3, df3, labels=["df3"]) plt.show() diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 5e2e013c4afcc..21d09c06940ca 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -6,8 +6,7 @@ import subprocess import sys -from pandas.compat._optional import ( - VERSIONS, _get_version, import_optional_dependency) +from pandas.compat._optional import VERSIONS, _get_version, import_optional_dependency def get_sys_info(): @@ -19,9 +18,11 @@ def get_sys_info(): commit = None if os.path.isdir(".git") and os.path.isdir("pandas"): try: - pipe = subprocess.Popen('git log --format="%H" -n 1'.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + pipe = subprocess.Popen( + 'git log --format="%H" -n 1'.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) so, serr = pipe.communicate() except (OSError, ValueError): pass @@ -29,29 +30,30 @@ def get_sys_info(): if pipe.returncode == 0: commit = so try: - commit = so.decode('utf-8') + commit = so.decode("utf-8") except ValueError: pass commit = commit.strip().strip('"') - blob.append(('commit', commit)) + blob.append(("commit", commit)) try: - (sysname, nodename, release, - version, machine, processor) = platform.uname() - blob.extend([ - ("python", '.'.join(map(str, sys.version_info))), - ("python-bits", 
struct.calcsize("P") * 8), - ("OS", "{sysname}".format(sysname=sysname)), - ("OS-release", "{release}".format(release=release)), - # ("Version", "{version}".format(version=version)), - ("machine", "{machine}".format(machine=machine)), - ("processor", "{processor}".format(processor=processor)), - ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), - ("LC_ALL", "{lc}".format(lc=os.environ.get('LC_ALL', "None"))), - ("LANG", "{lang}".format(lang=os.environ.get('LANG', "None"))), - ("LOCALE", '.'.join(map(str, locale.getlocale()))), - ]) + (sysname, nodename, release, version, machine, processor) = platform.uname() + blob.extend( + [ + ("python", ".".join(map(str, sys.version_info))), + ("python-bits", struct.calcsize("P") * 8), + ("OS", "{sysname}".format(sysname=sysname)), + ("OS-release", "{release}".format(release=release)), + # ("Version", "{version}".format(version=version)), + ("machine", "{machine}".format(machine=machine)), + ("processor", "{processor}".format(processor=processor)), + ("byteorder", "{byteorder}".format(byteorder=sys.byteorder)), + ("LC_ALL", "{lc}".format(lc=os.environ.get("LC_ALL", "None"))), + ("LANG", "{lang}".format(lang=os.environ.get("LANG", "None"))), + ("LOCALE", ".".join(map(str, locale.getlocale()))), + ] + ) except (KeyError, ValueError): pass @@ -61,18 +63,18 @@ def get_sys_info(): def show_versions(as_json=False): sys_info = get_sys_info() deps = [ - 'pandas', + "pandas", # required - 'numpy', - 'pytz', - 'dateutil', + "numpy", + "pytz", + "dateutil", # install / build, - 'pip', - 'setuptools', - 'Cython', + "pip", + "setuptools", + "Cython", # test - 'pytest', - 'hypothesis', + "pytest", + "hypothesis", # docs "sphinx", # Other, need a min version @@ -93,9 +95,9 @@ def show_versions(as_json=False): deps_blob = [] for modname in deps: - mod = import_optional_dependency(modname, - raise_on_missing=False, - on_version="ignore") + mod = import_optional_dependency( + modname, raise_on_missing=False, on_version="ignore" + ) if mod: ver = _get_version(mod) else: @@ -113,12 +115,12 @@ def show_versions(as_json=False): if as_json is True: print(j) else: - with codecs.open(as_json, "wb", encoding='utf8') as f: + with codecs.open(as_json, "wb", encoding="utf8") as f: json.dump(j, f, indent=2) else: maxlen = max(len(x) for x in deps) - tpl = '{{k:<{maxlen}}}: {{stat}}'.format(maxlen=maxlen) + tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen) print("\nINSTALLED VERSIONS") print("------------------") for k, stat in sys_info: @@ -130,10 +132,15 @@ def show_versions(as_json=False): def main(): from optparse import OptionParser + parser = OptionParser() - parser.add_option("-j", "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, pass in " - "'-' to output to stdout") + parser.add_option( + "-j", + "--json", + metavar="FILE", + nargs=1, + help="Save output as JSON into file, pass in " "'-' to output to stdout", + ) (options, args) = parser.parse_args() diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index ab22539f4530f..3de4e5d66d577 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -33,8 +33,7 @@ def test_foo(): from pandas.compat import is_platform_32bit, is_platform_windows from pandas.compat.numpy import _np_version -from pandas.core.computation.expressions import ( - _NUMEXPR_INSTALLED, _USE_NUMEXPR) +from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR def safe_import(mod_name, min_version=None): @@ -60,13 +59,15 @@ def 
safe_import(mod_name, min_version=None): return mod else: import sys + try: - version = getattr(sys.modules[mod_name], '__version__') + version = getattr(sys.modules[mod_name], "__version__") except AttributeError: # xlrd uses a capitalized attribute name - version = getattr(sys.modules[mod_name], '__VERSION__') + version = getattr(sys.modules[mod_name], "__VERSION__") if version: from distutils.version import LooseVersion + if LooseVersion(version) >= LooseVersion(min_version): return mod @@ -89,20 +90,20 @@ def _skip_if_has_locale(): def _skip_if_not_us_locale(): lang, _ = locale.getlocale() - if lang != 'en_US': + if lang != "en_US": return True def _skip_if_no_scipy(): - return not (safe_import('scipy.stats') and - safe_import('scipy.sparse') and - safe_import('scipy.interpolate') and - safe_import('scipy.signal')) + return not ( + safe_import("scipy.stats") + and safe_import("scipy.sparse") + and safe_import("scipy.interpolate") + and safe_import("scipy.signal") + ) -def skip_if_installed( - package: str, -) -> MarkDecorator: +def skip_if_installed(package: str,) -> MarkDecorator: """ Skip a test if a package is installed. @@ -112,15 +113,11 @@ def skip_if_installed( The name of the package. """ return pytest.mark.skipif( - safe_import(package), - reason="Skipping because {} is installed.".format(package) + safe_import(package), reason="Skipping because {} is installed.".format(package) ) -def skip_if_no( - package: str, - min_version: Optional[str] = None -) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: """ Generic function to help skip tests when required packages are not present on the testing system. @@ -158,37 +155,39 @@ def skip_if_no( ) -skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), - reason="Missing matplotlib dependency") -skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), - reason="matplotlib is present") -skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), - reason="skipping for 32 bit") -skip_if_windows = pytest.mark.skipif(is_platform_windows(), - reason="Running on Windows") -skip_if_windows_python_3 = pytest.mark.skipif(is_platform_windows(), - reason="not used on win32") -skip_if_has_locale = pytest.mark.skipif(_skip_if_has_locale(), - reason="Specific locale is set {lang}" - .format(lang=locale.getlocale()[0])) -skip_if_not_us_locale = pytest.mark.skipif(_skip_if_not_us_locale(), - reason="Specific locale is set " - "{lang}".format( - lang=locale.getlocale()[0])) -skip_if_no_scipy = pytest.mark.skipif(_skip_if_no_scipy(), - reason="Missing SciPy requirement") -skip_if_no_ne = pytest.mark.skipif(not _USE_NUMEXPR, - reason="numexpr enabled->{enabled}, " - "installed->{installed}".format( - enabled=_USE_NUMEXPR, - installed=_NUMEXPR_INSTALLED)) +skip_if_no_mpl = pytest.mark.skipif( + _skip_if_no_mpl(), reason="Missing matplotlib dependency" +) +skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") +skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit") +skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") +skip_if_windows_python_3 = pytest.mark.skipif( + is_platform_windows(), reason="not used on win32" +) +skip_if_has_locale = pytest.mark.skipif( + _skip_if_has_locale(), + reason="Specific locale is set {lang}".format(lang=locale.getlocale()[0]), +) +skip_if_not_us_locale = pytest.mark.skipif( + _skip_if_not_us_locale(), + reason="Specific locale is set " "{lang}".format(lang=locale.getlocale()[0]), 
+) +skip_if_no_scipy = pytest.mark.skipif( + _skip_if_no_scipy(), reason="Missing SciPy requirement" +) +skip_if_no_ne = pytest.mark.skipif( + not _USE_NUMEXPR, + reason="numexpr enabled->{enabled}, " + "installed->{installed}".format(enabled=_USE_NUMEXPR, installed=_NUMEXPR_INSTALLED), +) def skip_if_np_lt(ver_str, reason=None, *args, **kwds): if reason is None: reason = "NumPy %s or greater required" % ver_str - return pytest.mark.skipif(_np_version < LooseVersion(ver_str), - reason=reason, *args, **kwds) + return pytest.mark.skipif( + _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds + ) def parametrize_fixture_doc(*args): @@ -210,7 +209,9 @@ def parametrize_fixture_doc(*args): The decorated function wrapped within a pytest ``parametrize_fixture_doc`` mark """ + def documented_fixture(fixture): fixture.__doc__ = fixture.__doc__.format(*args) return fixture + return documented_fixture diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py index 19b1cc700261c..0f5324c8d02ba 100644 --- a/pandas/util/_tester.py +++ b/pandas/util/_tester.py @@ -16,14 +16,14 @@ def test(extra_args=None): import hypothesis # noqa except ImportError: raise ImportError("Need hypothesis>=3.58 to run tests") - cmd = ['--skip-slow', '--skip-network', '--skip-db'] + cmd = ["--skip-slow", "--skip-network", "--skip-db"] if extra_args: if not isinstance(extra_args, list): extra_args = [extra_args] cmd = extra_args cmd += [PKG] - print("running: pytest {}".format(' '.join(cmd))) + print("running: pytest {}".format(" ".join(cmd))) sys.exit(pytest.main(cmd)) -__all__ = ['test'] +__all__ = ["test"] diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 41faaf68d7f40..8d5f9f7749682 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -20,13 +20,17 @@ def _check_arg_length(fname, args, max_fname_arg_count, compat_args): if len(args) > len(compat_args): max_arg_count = len(compat_args) + max_fname_arg_count actual_arg_count = len(args) + max_fname_arg_count - argument = 'argument' if max_arg_count == 1 else 'arguments' + argument = "argument" if max_arg_count == 1 else "arguments" raise TypeError( "{fname}() takes at most {max_arg} {argument} " "({given_arg} given)".format( - fname=fname, max_arg=max_arg_count, - argument=argument, given_arg=actual_arg_count)) + fname=fname, + max_arg=max_arg_count, + argument=argument, + given_arg=actual_arg_count, + ) + ) def _check_for_default_values(fname, arg_val_dict, compat_args): @@ -48,11 +52,10 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): # check for None-ness otherwise we could end up # comparing a numpy array vs None - if (v1 is not None and v2 is None) or \ - (v1 is None and v2 is not None): + if (v1 is not None and v2 is None) or (v1 is None and v2 is not None): match = False else: - match = (v1 == v2) + match = v1 == v2 if not is_bool(match): raise ValueError("'match' is not a boolean") @@ -60,13 +63,16 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): # could not compare them directly, so try comparison # using the 'is' operator except ValueError: - match = (arg_val_dict[key] is compat_args[key]) + match = arg_val_dict[key] is compat_args[key] if not match: - raise ValueError(("the '{arg}' parameter is not " - "supported in the pandas " - "implementation of {fname}()". 
- format(fname=fname, arg=key))) + raise ValueError( + ( + "the '{arg}' parameter is not " + "supported in the pandas " + "implementation of {fname}()".format(fname=fname, arg=key) + ) + ) def validate_args(fname, args, max_fname_arg_count, compat_args): @@ -122,9 +128,12 @@ def _check_for_invalid_keys(fname, kwargs, compat_args): if diff: bad_arg = list(diff)[0] - raise TypeError(("{fname}() got an unexpected " - "keyword argument '{arg}'". - format(fname=fname, arg=bad_arg))) + raise TypeError( + ( + "{fname}() got an unexpected " + "keyword argument '{arg}'".format(fname=fname, arg=bad_arg) + ) + ) def validate_kwargs(fname, kwargs, compat_args): @@ -157,9 +166,7 @@ def validate_kwargs(fname, kwargs, compat_args): _check_for_default_values(fname, kwds, compat_args) -def validate_args_and_kwargs(fname, args, kwargs, - max_fname_arg_count, - compat_args): +def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_args): """ Checks whether parameters passed to the *args and **kwargs argument in a function `fname` are valid parameters as specified in `*compat_args` @@ -202,8 +209,9 @@ def validate_args_and_kwargs(fname, args, kwargs, """ # Check that the total number of arguments passed in (i.e. # args and kwargs) does not exceed the length of compat_args - _check_arg_length(fname, args + tuple(kwargs.values()), - max_fname_arg_count, compat_args) + _check_arg_length( + fname, args + tuple(kwargs.values()), max_fname_arg_count, compat_args + ) # Check there is no overlap with the positional and keyword # arguments, similar to what is done in actual Python functions @@ -211,8 +219,10 @@ def validate_args_and_kwargs(fname, args, kwargs, for key in args_dict: if key in kwargs: - raise TypeError("{fname}() got multiple values for keyword " - "argument '{arg}'".format(fname=fname, arg=key)) + raise TypeError( + "{fname}() got multiple values for keyword " + "argument '{arg}'".format(fname=fname, arg=key) + ) kwargs.update(args_dict) validate_kwargs(fname, kwargs, compat_args) @@ -221,9 +231,10 @@ def validate_args_and_kwargs(fname, args, kwargs, def validate_bool_kwarg(value, arg_name): """ Ensures that argument passed in arg_name is of type bool. """ if not (is_bool(value) or value is None): - raise ValueError('For argument "{arg}" expected type bool, received ' - 'type {typ}.'.format(arg=arg_name, - typ=type(value).__name__)) + raise ValueError( + 'For argument "{arg}" expected type bool, received ' + "type {typ}.".format(arg=arg_name, typ=type(value).__name__) + ) return value @@ -268,18 +279,19 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): # like out = {'index': foo, 'columns': bar} # Start by validating for consistency - if 'axis' in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS): + if "axis" in kwargs and any(x in kwargs for x in data._AXIS_NUMBERS): msg = "Cannot specify both 'axis' and any of 'index' or 'columns'." raise TypeError(msg) # First fill with explicit values provided by the user... 
if arg_name in kwargs: if args: - msg = ("{} got multiple values for argument " - "'{}'".format(method_name, arg_name)) + msg = "{} got multiple values for argument " "'{}'".format( + method_name, arg_name + ) raise TypeError(msg) - axis = data._get_axis_name(kwargs.get('axis', 0)) + axis = data._get_axis_name(kwargs.get("axis", 0)) out[axis] = kwargs[arg_name] # More user-provided arguments, now from kwargs @@ -298,22 +310,22 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): if len(args) == 0: pass # It's up to the function to decide if this is valid elif len(args) == 1: - axis = data._get_axis_name(kwargs.get('axis', 0)) + axis = data._get_axis_name(kwargs.get("axis", 0)) out[axis] = args[0] elif len(args) == 2: - if 'axis' in kwargs: + if "axis" in kwargs: # Unambiguously wrong - msg = ("Cannot specify both 'axis' and any of 'index' " - "or 'columns'") + msg = "Cannot specify both 'axis' and any of 'index' " "or 'columns'" raise TypeError(msg) - msg = ("Interpreting call\n\t'.{method_name}(a, b)' as " - "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " - "arguments to remove any ambiguity. In the future, using " - "positional arguments for 'index' or 'columns' will raise " - " a 'TypeError'.") - warnings.warn(msg.format(method_name=method_name,), FutureWarning, - stacklevel=4) + msg = ( + "Interpreting call\n\t'.{method_name}(a, b)' as " + "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " + "arguments to remove any ambiguity. In the future, using " + "positional arguments for 'index' or 'columns' will raise " + " a 'TypeError'." + ) + warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) out[data._AXIS_NAMES[0]] = args[0] out[data._AXIS_NAMES[1]] = args[1] else: @@ -349,8 +361,10 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): elif value is not None and method is None: if validate_scalar_dict_value and isinstance(value, (list, tuple)): - raise TypeError('"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__)) + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__) + ) elif value is not None and method is not None: raise ValueError("Cannot specify both 'value' and 'method'.") diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cec9416e5d2c5..037c885e4733f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -20,26 +20,54 @@ from numpy.random import rand, randn from pandas._config.localization import ( # noqa:F401 - can_set_locale, get_locales, set_locale) + can_set_locale, + get_locales, + set_locale, +) import pandas._libs.testing as _testing from pandas.compat import raise_with_traceback from pandas.core.dtypes.common import ( - is_bool, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetimelike_v_numeric, is_datetimelike_v_object, - is_extension_array_dtype, is_interval_dtype, is_list_like, is_number, - is_period_dtype, is_sequence, is_timedelta64_dtype, needs_i8_conversion) + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetimelike_v_numeric, + is_datetimelike_v_object, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) from pandas.core.dtypes.missing import array_equivalent import pandas as pd from pandas import ( - Categorical, CategoricalIndex, DataFrame, DatetimeIndex, 
Index, - IntervalIndex, MultiIndex, RangeIndex, Series, bdate_range) + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) from pandas.core.algorithms import take_1d from pandas.core.arrays import ( - DatetimeArray, ExtensionArray, IntervalArray, PeriodArray, TimedeltaArray, - period_array) + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + period_array, +) from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing @@ -54,16 +82,16 @@ def set_testing_mode(): # set the testing mode filters - testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') - if 'deprecate' in testing_mode: - warnings.simplefilter('always', _testing_mode_warnings) + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) def reset_testing_mode(): # reset the testing mode filters - testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') - if 'deprecate' in testing_mode: - warnings.simplefilter('ignore', _testing_mode_warnings) + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) set_testing_mode() @@ -74,7 +102,7 @@ def reset_display_options(): Reset the display options for printing and representing objects. """ - pd.reset_option('^display.', silent=True) + pd.reset_option("^display.", silent=True) def round_trip_pickle(obj, path=None): @@ -95,7 +123,7 @@ def round_trip_pickle(obj, path=None): """ if path is None: - path = '__{random_bytes}__.pickle'.format(random_bytes=rands(10)) + path = "__{random_bytes}__.pickle".format(random_bytes=rands(10)) with ensure_clean(path) as path: pd.to_pickle(obj, path) return pd.read_pickle(path) @@ -121,9 +149,10 @@ def round_trip_pathlib(writer, reader, path=None): """ import pytest - Path = pytest.importorskip('pathlib').Path + + Path = pytest.importorskip("pathlib").Path if path is None: - path = '___pathlib___' + path = "___pathlib___" with ensure_clean(path) as path: writer(Path(path)) obj = reader(Path(path)) @@ -149,9 +178,10 @@ def round_trip_localpath(writer, reader, path=None): The original object that was serialized and then re-read. """ import pytest - LocalPath = pytest.importorskip('py.path').local + + LocalPath = pytest.importorskip("py.path").local if path is None: - path = '___localpath___' + path = "___localpath___" with ensure_clean(path) as path: writer(LocalPath(path)) obj = reader(LocalPath(path)) @@ -177,23 +207,22 @@ def decompress_file(path, compression): """ if compression is None: - f = open(path, 'rb') - elif compression == 'gzip': - f = gzip.open(path, 'rb') - elif compression == 'bz2': - f = bz2.BZ2File(path, 'rb') - elif compression == 'xz': - f = lzma.LZMAFile(path, 'rb') - elif compression == 'zip': + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = lzma.LZMAFile(path, "rb") + elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() if len(zip_names) == 1: f = zip_file.open(zip_names.pop()) else: - raise ValueError('ZIP file {} error. Only one file per ZIP.' - .format(path)) + raise ValueError("ZIP file {} error. 
Only one file per ZIP.".format(path)) else: - msg = 'Unrecognized compression type: {}'.format(compression) + msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) try: @@ -226,15 +255,19 @@ def write_to_compressed(compression, path, data, dest="test"): if compression == "zip": import zipfile + compress_method = zipfile.ZipFile elif compression == "gzip": import gzip + compress_method = gzip.GzipFile elif compression == "bz2": import bz2 + compress_method = bz2.BZ2File elif compression == "xz": import lzma + compress_method = lzma.LZMAFile else: msg = "Unrecognized compression type: {}".format(compression) @@ -253,8 +286,9 @@ def write_to_compressed(compression, path, data, dest="test"): getattr(f, method)(*args) -def assert_almost_equal(left, right, check_dtype="equiv", - check_less_precise=False, **kwargs): +def assert_almost_equal( + left, right, check_dtype="equiv", check_less_precise=False, **kwargs +): """ Check that the left and right objects are approximately equal. @@ -282,25 +316,34 @@ def assert_almost_equal(left, right, check_dtype="equiv", """ if isinstance(left, pd.Index): - assert_index_equal(left, right, - check_exact=False, - exact=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) elif isinstance(left, pd.Series): - assert_series_equal(left, right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) elif isinstance(left, pd.DataFrame): - assert_frame_equal(left, right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs) + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs + ) else: # Other sequences. @@ -312,17 +355,18 @@ def assert_almost_equal(left, right, check_dtype="equiv", # Do not compare bool classes, like np.bool_ and bool. 
pass else: - if (isinstance(left, np.ndarray) or - isinstance(right, np.ndarray)): + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): obj = "numpy array" else: obj = "Input" assert_class_equal(left, right, obj=obj) _testing.assert_almost_equal( - left, right, + left, + right, check_dtype=check_dtype, check_less_precise=check_less_precise, - **kwargs) + **kwargs + ) def _check_isinstance(left, right, cls): @@ -346,11 +390,13 @@ def _check_isinstance(left, right, cls): cls_name = cls.__name__ if not isinstance(left, cls): - raise AssertionError(err_msg.format(name=cls_name, exp_type=cls, - act_type=type(left))) + raise AssertionError( + err_msg.format(name=cls_name, exp_type=cls, act_type=type(left)) + ) if not isinstance(right, cls): - raise AssertionError(err_msg.format(name=cls_name, exp_type=cls, - act_type=type(right))) + raise AssertionError( + err_msg.format(name=cls_name, exp_type=cls, act_type=type(right)) + ) def assert_dict_equal(left, right, compare_keys=True): @@ -363,26 +409,33 @@ def randbool(size=(), p=0.5): return rand(*size) <= p -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), - dtype=(np.str_, 1)) -RANDU_CHARS = np.array(list("".join(map(chr, range(1488, 1488 + 26))) + - string.digits), dtype=(np.unicode_, 1)) +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) -def rands_array(nchars, size, dtype='O'): +def rands_array(nchars, size, dtype="O"): """Generate an array of byte strings.""" - retval = (np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) - .view((np.str_, nchars)).reshape(size)) + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) if dtype is None: return retval else: return retval.astype(dtype) -def randu_array(nchars, size, dtype='O'): +def randu_array(nchars, size, dtype="O"): """Generate an array of unicode strings.""" - retval = (np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)).reshape(size)) + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) if dtype is None: return retval else: @@ -396,7 +449,7 @@ def rands(nchars): See `rands_array` if you want to create an array of random strings. """ - return ''.join(np.random.choice(RANDS_CHARS, nchars)) + return "".join(np.random.choice(RANDS_CHARS, nchars)) def randu(nchars): @@ -406,7 +459,7 @@ def randu(nchars): See `randu_array` if you want to create an array of random unicode strings. """ - return ''.join(np.random.choice(RANDU_CHARS, nchars)) + return "".join(np.random.choice(RANDU_CHARS, nchars)) def close(fignum=None): @@ -436,7 +489,7 @@ def ensure_clean(filename=None, return_filelike=False): if True, returns a file-like which is *always* cleaned. Necessary for savefig and other functions which want to append extensions. 
""" - filename = filename or '' + filename = filename or "" fd = None if return_filelike: @@ -454,7 +507,8 @@ def ensure_clean(filename=None, return_filelike=False): fd, filename = tempfile.mkstemp(suffix=filename) except UnicodeEncodeError: import pytest - pytest.skip('no unicode file names on this system') + + pytest.skip("no unicode file names on this system") try: yield filename @@ -462,8 +516,11 @@ def ensure_clean(filename=None, return_filelike=False): try: os.close(fd) except Exception: - print("Couldn't close file descriptor: {fdesc} (file: {fname})" - .format(fdesc=fd, fname=filename)) + print( + "Couldn't close file descriptor: {fdesc} (file: {fname})".format( + fdesc=fd, fname=filename + ) + ) try: if os.path.exists(filename): os.remove(filename) @@ -480,7 +537,7 @@ def ensure_clean_dir(): ------ Temporary directory path """ - directory_name = tempfile.mkdtemp(suffix='') + directory_name = tempfile.mkdtemp(suffix="") try: yield directory_name finally: @@ -516,14 +573,16 @@ def equalContents(arr1, arr2): return frozenset(arr1) == frozenset(arr2) -def assert_index_equal(left: Index, - right: Index, - exact: Union[bool, str] = 'equiv', - check_names: bool = True, - check_less_precise: Union[bool, int] = False, - check_exact: bool = True, - check_categorical: bool = True, - obj: str = 'Index') -> None: +def assert_index_equal( + left: Index, + right: Index, + exact: Union[bool, str] = "equiv", + check_names: bool = True, + check_less_precise: Union[bool, int] = False, + check_exact: bool = True, + check_categorical: bool = True, + obj: str = "Index", +) -> None: """Check that left and right Index are equal. Parameters @@ -550,19 +609,19 @@ def assert_index_equal(left: Index, """ __tracebackhide__ = True - def _check_types(l, r, obj='Index'): + def _check_types(l, r, obj="Index"): if exact: assert_class_equal(l, r, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal('dtype', l, r, obj=obj) + assert_attr_equal("dtype", l, r, obj=obj) # allow string-like to have different inferred_types - if l.inferred_type in ('string', 'unicode'): - assert r.inferred_type in ('string', 'unicode') + if l.inferred_type in ("string", "unicode"): + assert r.inferred_type in ("string", "unicode") else: - assert_attr_equal('inferred_type', l, r, obj=obj) + assert_attr_equal("inferred_type", l, r, obj=obj) def _get_ilevel_values(index, level): # accept level number only @@ -580,16 +639,16 @@ def _get_ilevel_values(index, level): # level comparison if left.nlevels != right.nlevels: - msg1 = '{obj} levels are different'.format(obj=obj) - msg2 = '{nlevels}, {left}'.format(nlevels=left.nlevels, left=left) - msg3 = '{nlevels}, {right}'.format(nlevels=right.nlevels, right=right) + msg1 = "{obj} levels are different".format(obj=obj) + msg2 = "{nlevels}, {left}".format(nlevels=left.nlevels, left=left) + msg3 = "{nlevels}, {right}".format(nlevels=right.nlevels, right=right) raise_assert_detail(obj, msg1, msg2, msg3) # length comparison if len(left) != len(right): - msg1 = '{obj} length are different'.format(obj=obj) - msg2 = '{length}, {left}'.format(length=len(left), left=left) - msg3 = '{length}, {right}'.format(length=len(right), right=right) + msg1 = "{obj} length are different".format(obj=obj) + msg2 = "{length}, {left}".format(length=len(left), left=left) + msg3 = "{length}, {right}".format(length=len(right), right=right) raise_assert_detail(obj, msg1, msg2, msg3) # MultiIndex special comparison for little-friendly error messages @@ 
-602,44 +661,54 @@ def _get_ilevel_values(index, level): llevel = _get_ilevel_values(left, level) rlevel = _get_ilevel_values(right, level) - lobj = 'MultiIndex level [{level}]'.format(level=level) - assert_index_equal(llevel, rlevel, - exact=exact, check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, obj=lobj) + lobj = "MultiIndex level [{level}]".format(level=level) + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=lobj, + ) # get_level_values may change dtype _check_types(left.levels[level], right.levels[level], obj=obj) # skip exact index checking when `check_categorical` is False if check_exact and check_categorical: if not left.equals(right): - diff = np.sum((left.values != right.values) - .astype(int)) * 100.0 / len(left) - msg = '{obj} values are different ({pct} %)'.format( - obj=obj, pct=np.round(diff, 5)) + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = "{obj} values are different ({pct} %)".format( + obj=obj, pct=np.round(diff, 5) + ) raise_assert_detail(obj, msg, left, right) else: - _testing.assert_almost_equal(left.values, right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, lobj=left, robj=right) + _testing.assert_almost_equal( + left.values, + right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, + lobj=left, + robj=right, + ) # metadata comparison if check_names: - assert_attr_equal('names', left, right, obj=obj) + assert_attr_equal("names", left, right, obj=obj) if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): - assert_attr_equal('freq', left, right, obj=obj) - if (isinstance(left, pd.IntervalIndex) or - isinstance(right, pd.IntervalIndex)): + assert_attr_equal("freq", left, right, obj=obj) + if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): assert_interval_array_equal(left.values, right.values) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, - obj='{obj} category'.format(obj=obj)) + assert_categorical_equal( + left.values, right.values, obj="{obj} category".format(obj=obj) + ) -def assert_class_equal(left, right, exact=True, obj='Input'): +def assert_class_equal(left, right, exact=True, obj="Input"): """checks classes are equal.""" __tracebackhide__ = True @@ -653,22 +722,20 @@ def repr_class(x): except AttributeError: return repr(type(x)) - if exact == 'equiv': + if exact == "equiv": if type(left) != type(right): # allow equivalence of Int64Index/RangeIndex types = {type(left).__name__, type(right).__name__} - if len(types - {'Int64Index', 'RangeIndex'}): - msg = '{obj} classes are not equivalent'.format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), - repr_class(right)) + if len(types - {"Int64Index", "RangeIndex"}): + msg = "{obj} classes are not equivalent".format(obj=obj) + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) elif exact: if type(left) != type(right): - msg = '{obj} classes are different'.format(obj=obj) - raise_assert_detail(obj, msg, repr_class(left), - repr_class(right)) + msg = "{obj} classes are different".format(obj=obj) + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) -def assert_attr_equal(attr, left, right, obj='Attributes'): +def assert_attr_equal(attr, left, right, obj="Attributes"): """checks attributes 
are equal. Both objects must have attribute. Parameters @@ -688,8 +755,12 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): if left_attr is right_attr: return True - elif (is_number(left_attr) and np.isnan(left_attr) and - is_number(right_attr) and np.isnan(right_attr)): + elif ( + is_number(left_attr) + and np.isnan(left_attr) + and is_number(right_attr) + and np.isnan(right_attr) + ): # np.nan return True @@ -710,20 +781,25 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): def assert_is_valid_plot_return_object(objs): import matplotlib.pyplot as plt + if isinstance(objs, (pd.Series, np.ndarray)): for el in objs.ravel(): - msg = ("one of 'objs' is not a matplotlib Axes instance, type " - "encountered {name!r}").format(name=el.__class__.__name__) + msg = ( + "one of 'objs' is not a matplotlib Axes instance, type " + "encountered {name!r}" + ).format(name=el.__class__.__name__) assert isinstance(el, (plt.Axes, dict)), msg else: assert isinstance(objs, (plt.Artist, tuple, dict)), ( - 'objs is neither an ndarray of Artist instances nor a ' - 'single Artist instance, tuple, or dict, "objs" is a {name!r}' - .format(name=objs.__class__.__name__)) + "objs is neither an ndarray of Artist instances nor a " + 'single Artist instance, tuple, or dict, "objs" is a {name!r}'.format( + name=objs.__class__.__name__ + ) + ) def isiterable(obj): - return hasattr(obj, '__iter__') + return hasattr(obj, "__iter__") def assert_is_sorted(seq): @@ -734,8 +810,9 @@ def assert_is_sorted(seq): assert_numpy_array_equal(seq, np.sort(np.array(seq))) -def assert_categorical_equal(left, right, check_dtype=True, - check_category_order=True, obj='Categorical'): +def assert_categorical_equal( + left, right, check_dtype=True, check_category_order=True, obj="Categorical" +): """Test that Categoricals are equivalent. Parameters @@ -756,24 +833,31 @@ def assert_categorical_equal(left, right, check_dtype=True, _check_isinstance(left, right, Categorical) if check_category_order: - assert_index_equal(left.categories, right.categories, - obj='{obj}.categories'.format(obj=obj)) - assert_numpy_array_equal(left.codes, right.codes, - check_dtype=check_dtype, - obj='{obj}.codes'.format(obj=obj)) + assert_index_equal( + left.categories, right.categories, obj="{obj}.categories".format(obj=obj) + ) + assert_numpy_array_equal( + left.codes, + right.codes, + check_dtype=check_dtype, + obj="{obj}.codes".format(obj=obj), + ) else: - assert_index_equal(left.categories.sort_values(), - right.categories.sort_values(), - obj='{obj}.categories'.format(obj=obj)) - assert_index_equal(left.categories.take(left.codes), - right.categories.take(right.codes), - obj='{obj}.values'.format(obj=obj)) + assert_index_equal( + left.categories.sort_values(), + right.categories.sort_values(), + obj="{obj}.categories".format(obj=obj), + ) + assert_index_equal( + left.categories.take(left.codes), + right.categories.take(right.codes), + obj="{obj}.values".format(obj=obj), + ) - assert_attr_equal('ordered', left, right, obj=obj) + assert_attr_equal("ordered", left, right, obj=obj) -def assert_interval_array_equal(left, right, exact='equiv', - obj='IntervalArray'): +def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): """Test that two IntervalArrays are equivalent. 
Parameters @@ -790,37 +874,38 @@ def assert_interval_array_equal(left, right, exact='equiv', """ _check_isinstance(left, right, IntervalArray) - assert_index_equal(left.left, right.left, exact=exact, - obj='{obj}.left'.format(obj=obj)) - assert_index_equal(left.right, right.right, exact=exact, - obj='{obj}.left'.format(obj=obj)) - assert_attr_equal('closed', left, right, obj=obj) + assert_index_equal( + left.left, right.left, exact=exact, obj="{obj}.left".format(obj=obj) + ) + assert_index_equal( + left.right, right.right, exact=exact, obj="{obj}.left".format(obj=obj) + ) + assert_attr_equal("closed", left, right, obj=obj) -def assert_period_array_equal(left, right, obj='PeriodArray'): +def assert_period_array_equal(left, right, obj="PeriodArray"): _check_isinstance(left, right, PeriodArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}.values'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) + assert_numpy_array_equal( + left._data, right._data, obj="{obj}.values".format(obj=obj) + ) + assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj='DatetimeArray'): +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}._data'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) - assert_attr_equal('tz', left, right, obj=obj) + assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) + assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj='TimedeltaArray'): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) - assert_numpy_array_equal(left._data, right._data, - obj='{obj}._data'.format(obj=obj)) - assert_attr_equal('freq', left, right, obj=obj) + assert_numpy_array_equal(left._data, right._data, obj="{obj}._data".format(obj=obj)) + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None): @@ -840,7 +925,9 @@ def raise_assert_detail(obj, message, left, right, diff=None): {message} [left]: {left} -[right]: {right}""".format(obj=obj, message=message, left=left, right=right) +[right]: {right}""".format( + obj=obj, message=message, left=left, right=right + ) if diff is not None: msg += "\n[diff]: {diff}".format(diff=diff) @@ -848,9 +935,15 @@ def raise_assert_detail(obj, message, left, right, diff=None): raise AssertionError(msg) -def assert_numpy_array_equal(left, right, strict_nan=False, - check_dtype=True, err_msg=None, - check_same=None, obj='numpy array'): +def assert_numpy_array_equal( + left, + right, + strict_nan=False, + check_dtype=True, + err_msg=None, + check_same=None, + obj="numpy array", +): """ Checks that 'np.ndarray' is equivalent Parameters @@ -878,27 +971,29 @@ def assert_numpy_array_equal(left, right, strict_nan=False, _check_isinstance(left, right, np.ndarray) def _get_base(obj): - return obj.base if getattr(obj, 'base', None) is not None else obj + return obj.base if getattr(obj, "base", None) is not None else obj left_base = _get_base(left) right_base = _get_base(right) - if check_same == 'same': + if check_same == "same": if left_base is not right_base: - msg = "{left!r} is not {right!r}".format( - left=left_base, right=right_base) + msg = "{left!r} is not 
{right!r}".format(left=left_base, right=right_base) raise AssertionError(msg) - elif check_same == 'copy': + elif check_same == "copy": if left_base is right_base: - msg = "{left!r} is {right!r}".format( - left=left_base, right=right_base) + msg = "{left!r} is {right!r}".format(left=left_base, right=right_base) raise AssertionError(msg) def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: - raise_assert_detail(obj, '{obj} shapes are different' - .format(obj=obj), left.shape, right.shape) + raise_assert_detail( + obj, + "{obj} shapes are different".format(obj=obj), + left.shape, + right.shape, + ) diff = 0 for l, r in zip(left, right): @@ -907,8 +1002,9 @@ def _raise(left, right, err_msg): diff += 1 diff = diff * 100.0 / left.size - msg = '{obj} values are different ({pct} %)'.format( - obj=obj, pct=np.round(diff, 5)) + msg = "{obj} values are different ({pct} %)".format( + obj=obj, pct=np.round(diff, 5) + ) raise_assert_detail(obj, msg, left, right) raise AssertionError(err_msg) @@ -919,12 +1015,12 @@ def _raise(left, right, err_msg): if check_dtype: if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - assert_attr_equal('dtype', left, right, obj=obj) + assert_attr_equal("dtype", left, right, obj=obj) -def assert_extension_array_equal(left, right, check_dtype=True, - check_less_precise=False, - check_exact=False): +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): """Check that left and right ExtensionArrays are equal. Parameters @@ -946,10 +1042,10 @@ def assert_extension_array_equal(left, right, check_dtype=True, A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. 
""" - assert isinstance(left, ExtensionArray), 'left is not an ExtensionArray' - assert isinstance(right, ExtensionArray), 'right is not an ExtensionArray' + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: - assert_attr_equal('dtype', left, right, obj='ExtensionArray') + assert_attr_equal("dtype", left, right, obj="ExtensionArray") if hasattr(left, "asi8") and type(right) == type(left): # Avoid slow object-dtype comparisons @@ -958,29 +1054,36 @@ def assert_extension_array_equal(left, right, check_dtype=True, left_na = np.asarray(left.isna()) right_na = np.asarray(right.isna()) - assert_numpy_array_equal(left_na, right_na, obj='ExtensionArray NA mask') + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") left_valid = np.asarray(left[~left_na].astype(object)) right_valid = np.asarray(right[~right_na].astype(object)) if check_exact: - assert_numpy_array_equal(left_valid, right_valid, obj='ExtensionArray') + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") else: - _testing.assert_almost_equal(left_valid, right_valid, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - obj='ExtensionArray') + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) # This could be refactored to use the NDFrame.equals method -def assert_series_equal(left, right, check_dtype=True, - check_index_type='equiv', - check_series_type=True, - check_less_precise=False, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - obj='Series'): +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): """Check that left and right Series are equal. Parameters @@ -1029,93 +1132,117 @@ def assert_series_equal(left, right, check_dtype=True, # length comparison if len(left) != len(right): - msg1 = '{len}, {left}'.format(len=len(left), left=left.index) - msg2 = '{len}, {right}'.format(len=len(right), right=right.index) - raise_assert_detail(obj, 'Series length are different', msg1, msg2) + msg1 = "{len}, {left}".format(len=len(left), left=left.index) + msg2 = "{len}, {right}".format(len=len(right), right=right.index) + raise_assert_detail(obj, "Series length are different", msg1, msg2) # index comparison - assert_index_equal(left.index, right.index, exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.index".format(obj=obj), + ) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. 
We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` - if (is_categorical_dtype(left) and is_categorical_dtype(right) and - not check_categorical): + if ( + is_categorical_dtype(left) + and is_categorical_dtype(right) + and not check_categorical + ): pass else: - assert_attr_equal('dtype', left, right) + assert_attr_equal("dtype", left, right) if check_exact: - assert_numpy_array_equal(left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - obj='{obj}'.format(obj=obj),) + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + obj="{obj}".format(obj=obj), + ) elif check_datetimelike_compat: # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case - if (is_datetimelike_v_numeric(left, right) or - is_datetimelike_v_object(left, right) or - needs_i8_conversion(left) or - needs_i8_conversion(right)): + if ( + is_datetimelike_v_numeric(left, right) + or is_datetimelike_v_object(left, right) + or needs_i8_conversion(left) + or needs_i8_conversion(right) + ): # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left.values).equals(Index(right.values)): - msg = ('[datetimelike_compat=True] {left} is not equal to ' - '{right}.').format(left=left.values, right=right.values) + msg = ( + "[datetimelike_compat=True] {left} is not equal to " "{right}." + ).format(left=left.values, right=right.values) raise AssertionError(msg) else: - assert_numpy_array_equal(left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype) + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + ) elif is_interval_dtype(left) or is_interval_dtype(right): assert_interval_array_equal(left.array, right.array) - elif (is_extension_array_dtype(left.dtype) and - is_datetime64tz_dtype(left.dtype)): + elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): # .values is an ndarray, but ._values is the ExtensionArray. 
# TODO: Use .array assert is_extension_array_dtype(right.dtype) assert_extension_array_equal(left._values, right._values) - elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and - is_extension_array_dtype(right) and not is_categorical_dtype(right)): + elif ( + is_extension_array_dtype(left) + and not is_categorical_dtype(left) + and is_extension_array_dtype(right) + and not is_categorical_dtype(right) + ): assert_extension_array_equal(left.array, right.array) else: - _testing.assert_almost_equal(left._internal_get_values(), - right._internal_get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj='{obj}'.format(obj=obj)) + _testing.assert_almost_equal( + left._internal_get_values(), + right._internal_get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj="{obj}".format(obj=obj), + ) # metadata comparison if check_names: - assert_attr_equal('name', left, right, obj=obj) + assert_attr_equal("name", left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, - obj='{obj} category'.format(obj=obj)) + assert_categorical_equal( + left.values, right.values, obj="{obj} category".format(obj=obj) + ) # This could be refactored to use the NDFrame.equals method -def assert_frame_equal(left, right, check_dtype=True, - check_index_type='equiv', - check_column_type='equiv', - check_frame_type=True, - check_less_precise=False, - check_names=True, - by_blocks=False, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_like=False, - obj='DataFrame'): +def assert_frame_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_column_type="equiv", + check_frame_type=True, + check_less_precise=False, + check_names=True, + by_blocks=False, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_like=False, + obj="DataFrame", +): """ Check that left and right DataFrame are equal. 
@@ -1219,29 +1346,39 @@ def assert_frame_equal(left, right, check_dtype=True, # shape comparison if left.shape != right.shape: - raise_assert_detail(obj, - '{obj} shape mismatch'.format(obj=obj), - '{shape!r}'.format(shape=left.shape), - '{shape!r}'.format(shape=right.shape)) + raise_assert_detail( + obj, + "{obj} shape mismatch".format(obj=obj), + "{shape!r}".format(shape=left.shape), + "{shape!r}".format(shape=right.shape), + ) if check_like: left, right = left.reindex_like(right), right # index comparison - assert_index_equal(left.index, right.index, exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.index".format(obj=obj), + ) # column comparison - assert_index_equal(left.columns, right.columns, exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj='{obj}.columns'.format(obj=obj)) + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj="{obj}.columns".format(obj=obj), + ) # compare by blocks if by_blocks: @@ -1250,8 +1387,9 @@ def assert_frame_equal(left, right, check_dtype=True, for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): assert dtype in lblocks assert dtype in rblocks - assert_frame_equal(lblocks[dtype], rblocks[dtype], - check_dtype=check_dtype, obj=obj) + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) # compare by columns else: @@ -1260,13 +1398,17 @@ def assert_frame_equal(left, right, check_dtype=True, lcol = left.iloc[:, i] rcol = right.iloc[:, i] assert_series_equal( - lcol, rcol, check_dtype=check_dtype, + lcol, + rcol, + check_dtype=check_dtype, check_index_type=check_index_type, check_less_precise=check_less_precise, - check_exact=check_exact, check_names=check_names, + check_exact=check_exact, + check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, - obj='{obj}.iloc[:, {idx}]'.format(obj=obj, idx=i)) + obj="{obj}.iloc[:, {idx}]".format(obj=obj, idx=i), + ) def assert_equal(left, right, **kwargs): @@ -1359,9 +1501,14 @@ def to_array(obj): # Sparse -def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, - check_fill_value=True, - consolidate_block_indices=False): +def assert_sp_array_equal( + left, + right, + check_dtype=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, +): """Check that the left and right SparseArray are equal. 
Parameters @@ -1384,8 +1531,7 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, _check_isinstance(left, right, pd.SparseArray) - assert_numpy_array_equal(left.sp_values, right.sp_values, - check_dtype=check_dtype) + assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) # SparseIndex comparison assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) @@ -1398,32 +1544,38 @@ def assert_sp_array_equal(left, right, check_dtype=True, check_kind=True, left_index = left.sp_index right_index = right.sp_index - if consolidate_block_indices and left.kind == 'block': + if consolidate_block_indices and left.kind == "block": # we'll probably remove this hack... left_index = left_index.to_int_index().to_block_index() right_index = right_index.to_int_index().to_block_index() if not left_index.equals(right_index): - raise_assert_detail('SparseArray.index', 'index are not equal', - left_index, right_index) + raise_assert_detail( + "SparseArray.index", "index are not equal", left_index, right_index + ) else: # Just ensure a pass if check_fill_value: - assert_attr_equal('fill_value', left, right) + assert_attr_equal("fill_value", left, right) if check_dtype: - assert_attr_equal('dtype', left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), - check_dtype=check_dtype) - - -def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, - check_series_type=True, check_names=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj='SparseSeries'): + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + + +def assert_sp_series_equal( + left, + right, + check_dtype=True, + exact_indices=True, + check_series_type=True, + check_names=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, + obj="SparseSeries", +): """Check that the left and right SparseSeries are equal. Parameters @@ -1456,28 +1608,35 @@ def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, if check_series_type: assert_class_equal(left, right, obj=obj) - assert_index_equal(left.index, right.index, - obj='{obj}.index'.format(obj=obj)) + assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) - assert_sp_array_equal(left.values, right.values, - check_kind=check_kind, - check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices) + assert_sp_array_equal( + left.values, + right.values, + check_kind=check_kind, + check_fill_value=check_fill_value, + consolidate_block_indices=consolidate_block_indices, + ) if check_names: - assert_attr_equal('name', left, right) + assert_attr_equal("name", left, right) if check_dtype: - assert_attr_equal('dtype', left, right) - - assert_numpy_array_equal(np.asarray(left.values), - np.asarray(right.values)) - - -def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, - check_frame_type=True, check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, - obj='SparseDataFrame'): + assert_attr_equal("dtype", left, right) + + assert_numpy_array_equal(np.asarray(left.values), np.asarray(right.values)) + + +def assert_sp_frame_equal( + left, + right, + check_dtype=True, + exact_indices=True, + check_frame_type=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, + obj="SparseDataFrame", +): """Check that the left and right SparseDataFrame are equal. 
Parameters @@ -1510,35 +1669,36 @@ def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, if check_frame_type: assert_class_equal(left, right, obj=obj) - assert_index_equal(left.index, right.index, - obj='{obj}.index'.format(obj=obj)) - assert_index_equal(left.columns, right.columns, - obj='{obj}.columns'.format(obj=obj)) + assert_index_equal(left.index, right.index, obj="{obj}.index".format(obj=obj)) + assert_index_equal(left.columns, right.columns, obj="{obj}.columns".format(obj=obj)) if check_fill_value: - assert_attr_equal('default_fill_value', left, right, obj=obj) + assert_attr_equal("default_fill_value", left, right, obj=obj) for col, series in left.items(): - assert (col in right) + assert col in right # trade-off? if exact_indices: assert_sp_series_equal( - series, right[col], + series, + right[col], check_dtype=check_dtype, check_kind=check_kind, check_fill_value=check_fill_value, - consolidate_block_indices=consolidate_block_indices + consolidate_block_indices=consolidate_block_indices, ) else: - assert_series_equal(series.to_dense(), right[col].to_dense(), - check_dtype=check_dtype) + assert_series_equal( + series.to_dense(), right[col].to_dense(), check_dtype=check_dtype + ) # do I care? # assert(left.default_kind == right.default_kind) for col in right: - assert (col in left) + assert col in left + # ----------------------------------------------------------------------------- # Others @@ -1560,9 +1720,10 @@ def assert_copy(iter1, iter2, **eql_kwargs): """ for elem1, elem2 in zip(iter1, iter2): assert_almost_equal(elem1, elem2, **eql_kwargs) - msg = ("Expected object {obj1!r} and object {obj2!r} to be " - "different objects, but they were the same object." - ).format(obj1=type(elem1), obj2=type(elem2)) + msg = ( + "Expected object {obj1!r} and object {obj2!r} to be " + "different objects, but they were the same object." 
+ ).format(obj1=type(elem1), obj2=type(elem2)) assert elem1 is not elem2, msg @@ -1604,7 +1765,7 @@ def makeIntIndex(k=10, name=None): def makeUIntIndex(k=10, name=None): - return Index([2**63 + i for i in range(k)], name=name) + return Index([2 ** 63 + i for i in range(k)], name=name) def makeRangeIndex(k=10, name=None, **kwargs): @@ -1616,26 +1777,24 @@ def makeFloatIndex(k=10, name=None): return Index(values * (10 ** np.random.randint(0, 9)), name=name) -def makeDateIndex(k=10, freq='B', name=None, **kwargs): +def makeDateIndex(k=10, freq="B", name=None, **kwargs): dt = datetime(2000, 1, 1) dr = bdate_range(dt, periods=k, freq=freq, name=name) return DatetimeIndex(dr, name=name, **kwargs) -def makeTimedeltaIndex(k=10, freq='D', name=None, **kwargs): - return pd.timedelta_range(start='1 day', periods=k, freq=freq, - name=name, **kwargs) +def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): + return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq='B', name=name, **kwargs) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) return dr def makeMultiIndex(k=10, names=None, **kwargs): - return MultiIndex.from_product( - (('foo', 'bar'), (1, 2)), names=names, **kwargs) + return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) def all_index_generator(k=10): @@ -1646,21 +1805,32 @@ def all_index_generator(k=10): ---------- k: length of each of the index instances """ - all_make_index_funcs = [makeIntIndex, makeFloatIndex, makeStringIndex, - makeUnicodeIndex, makeDateIndex, makePeriodIndex, - makeTimedeltaIndex, makeBoolIndex, makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex] + all_make_index_funcs = [ + makeIntIndex, + makeFloatIndex, + makeStringIndex, + makeUnicodeIndex, + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeBoolIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + ] for make_index_func in all_make_index_funcs: yield make_index_func(k=k) def index_subclass_makers_generator(): make_index_funcs = [ - makeDateIndex, makePeriodIndex, - makeTimedeltaIndex, makeRangeIndex, - makeIntervalIndex, makeCategoricalIndex, - makeMultiIndex + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + makeMultiIndex, ] for make_index_func in make_index_funcs: yield make_index_func @@ -1702,7 +1872,7 @@ def getSeriesData(): return {c: Series(randn(N), index=index) for c in getCols(K)} -def makeTimeSeries(nper=None, freq='B', name=None): +def makeTimeSeries(nper=None, freq="B", name=None): if nper is None: nper = N return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) @@ -1714,7 +1884,7 @@ def makePeriodSeries(nper=None, name=None): return Series(randn(nper), index=makePeriodIndex(nper), name=name) -def getTimeSeriesData(nper=None, freq='B'): +def getTimeSeriesData(nper=None, freq="B"): return {c: makeTimeSeries(nper, freq) for c in getCols(K)} @@ -1723,7 +1893,7 @@ def getPeriodData(nper=None): # make frame -def makeTimeDataFrame(nper=None, freq='B'): +def makeTimeDataFrame(nper=None, freq="B"): data = getTimeSeriesData(nper, freq) return DataFrame(data) @@ -1734,13 +1904,13 @@ def makeDataFrame(): def getMixedTypeDict(): - index = Index(['a', 'b', 'c', 'd', 'e']) + index = Index(["a", "b", "c", "d", "e"]) data = { - 'A': [0., 1., 2., 3., 4.], - 'B': [0., 1., 0., 1., 0.], 
- 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': bdate_range('1/1/2009', periods=5) + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), } return index, data @@ -1755,8 +1925,9 @@ def makePeriodFrame(nper=None): return DataFrame(data) -def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, - idx_type=None): +def makeCustomIndex( + nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None +): """Create an index/multindex with given dimensions, levels, names, etc' nentries - number of entries in index @@ -1781,12 +1952,11 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, if ndupe_l is None: ndupe_l = [1] * nlevels - assert (is_sequence(ndupe_l) and len(ndupe_l) <= nlevels) - assert (names is None or names is False or - names is True or len(names) is nlevels) - assert idx_type is None or (idx_type in ('i', 'f', 's', 'u', - 'dt', 'p', 'td') - and nlevels == 1) + assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels + assert names is None or names is False or names is True or len(names) is nlevels + assert idx_type is None or ( + idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 + ) if names is True: # build default names @@ -1800,10 +1970,15 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, names = [names] # specific 1D index type requested? - idx_func = dict(i=makeIntIndex, f=makeFloatIndex, - s=makeStringIndex, u=makeUnicodeIndex, - dt=makeDateIndex, td=makeTimedeltaIndex, - p=makePeriodIndex).get(idx_type) + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) if idx_func: idx = idx_func(nentries) # but we need to fill in the name @@ -1811,9 +1986,10 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, idx.name = names[0] return idx elif idx_type is not None: - raise ValueError('"{idx_type}" is not a legal value for `idx_type`, ' - 'use "i"/"f"/"s"/"u"/"dt/"p"/"td".' 
- .format(idx_type=idx_type)) + raise ValueError( + '"{idx_type}" is not a legal value for `idx_type`, ' + 'use "i"/"f"/"s"/"u"/"dt/"p"/"td".'.format(idx_type=idx_type) + ) if len(ndupe_l) < nlevels: ndupe_l.extend([1] * (nlevels - len(ndupe_l))) @@ -1823,8 +1999,10 @@ def makeCustomIndex(nentries, nlevels, prefix='#', names=False, ndupe_l=None, tuples = [] for i in range(nlevels): + def keyfunc(x): import re + numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") return [int(num) for num in numeric_tuple] @@ -1832,7 +2010,7 @@ def keyfunc(x): div_factor = nentries // ndupe_l[i] + 1 cnt = Counter() for j in range(div_factor): - label = '{prefix}_l{i}_g{j}'.format(prefix=prefix, i=i, j=j) + label = "{prefix}_l{i}_g{j}".format(prefix=prefix, i=i, j=j) cnt[label] = ndupe_l[i] # cute Counter trick result = list(sorted(cnt.elements(), key=keyfunc))[:nentries] @@ -1852,10 +2030,20 @@ def keyfunc(x): return index -def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, - c_idx_nlevels=1, r_idx_nlevels=1, data_gen_f=None, - c_ndupe_l=None, r_ndupe_l=None, dtype=None, - c_idx_type=None, r_idx_type=None): +def makeCustomDataframe( + nrows, + ncols, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): """ nrows, ncols - number of data rows/cols c_idx_names, idx_names - False/True/list of strings, yields No names , @@ -1913,19 +2101,29 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, assert c_idx_nlevels > 0 assert r_idx_nlevels > 0 - assert r_idx_type is None or (r_idx_type in ('i', 'f', 's', - 'u', 'dt', 'p', 'td') - and r_idx_nlevels == 1) - assert c_idx_type is None or (c_idx_type in ('i', 'f', 's', - 'u', 'dt', 'p', 'td') - and c_idx_nlevels == 1) - - columns = makeCustomIndex(ncols, nlevels=c_idx_nlevels, prefix='C', - names=c_idx_names, ndupe_l=c_ndupe_l, - idx_type=c_idx_type) - index = makeCustomIndex(nrows, nlevels=r_idx_nlevels, prefix='R', - names=r_idx_names, ndupe_l=r_ndupe_l, - idx_type=r_idx_type) + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) # by default, generate data based on location if data_gen_f is None: @@ -1958,17 +2156,27 @@ def _gen_unique_rand(rng, _extra_size): extra_size *= 1.05 ind = _gen_unique_rand(random_state, extra_size) - j = np.floor(ind * 1. 
/ nrows).astype(int) + j = np.floor(ind * 1.0 / nrows).astype(int) i = (ind - j * nrows).astype(int) return i.tolist(), j.tolist() -def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, - c_idx_names=True, r_idx_names=True, - c_idx_nlevels=1, r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, r_ndupe_l=None, dtype=None, - c_idx_type=None, r_idx_type=None): +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): """ Parameters ---------- @@ -1980,30 +2188,34 @@ def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, See makeCustomDataframe for descriptions of the rest of the parameters. """ - df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l, - dtype=dtype, c_idx_type=c_idx_type, - r_idx_type=r_idx_type) + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) i, j = _create_missing_idx(nrows, ncols, density, random_state) df.values[i, j] = np.nan return df -def makeMissingDataframe(density=.9, random_state=None): +def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, - random_state=random_state) + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) df.values[i, j] = np.nan return df class TestSubDict(dict): - def __init__(self, *args, **kwargs): dict.__init__(self, *args, **kwargs) @@ -2039,19 +2251,19 @@ def dec(f): # 'urlopen error timed out', # 'timeout: timed out', # 'socket.timeout: timed out', - 'timed out', - 'Server Hangup', - 'HTTP Error 503: Service Unavailable', - '502: Proxy Error', - 'HTTP Error 502: internal error', - 'HTTP Error 502', - 'HTTP Error 503', - 'HTTP Error 403', - 'HTTP Error 400', - 'Temporary failure in name resolution', - 'Name or service not known', - 'Connection refused', - 'certificate verify', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", ) # or this e.errno/e.reason.errno @@ -2060,8 +2272,8 @@ def dec(f): 111, # Connection refused 110, # Connection timed out 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out ) # Both of the above shouldn't mask real issues such as 404's @@ -2098,13 +2310,15 @@ def can_connect(url, error_classes=_network_error_classes): @optional_args -def network(t, url="http://www.google.com", - raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, - check_before_test=False, - error_classes=_network_error_classes, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, - ): +def network( + t, + 
url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=_network_error_classes, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): """ Label a test as requiring network connection and, if an error is encountered, only raise if it does not find a network connection. @@ -2188,6 +2402,7 @@ def network(t, url="http://www.google.com", Errors not related to networking will always be raised. """ from pytest import skip + t.network = True @wraps(t) @@ -2198,13 +2413,15 @@ def wrapper(*args, **kwargs): try: return t(*args, **kwargs) except Exception as e: - errno = getattr(e, 'errno', None) + errno = getattr(e, "errno", None) if not errno and hasattr(errno, "reason"): - errno = getattr(e.reason, 'errno', None) + errno = getattr(e.reason, "errno", None) if errno in skip_errnos: - skip("Skipping test due to known errno" - " and error {error}".format(error=e)) + skip( + "Skipping test due to known errno" + " and error {error}".format(error=e) + ) try: e_str = traceback.format_exc(e) @@ -2212,8 +2429,10 @@ def wrapper(*args, **kwargs): e_str = str(e) if any(m.lower() in e_str.lower() for m in _skip_on_messages): - skip("Skipping test because exception " - "message is known and error {error}".format(error=e)) + skip( + "Skipping test because exception " + "message is known and error {error}".format(error=e) + ) if not isinstance(e, error_classes): raise @@ -2221,8 +2440,10 @@ def wrapper(*args, **kwargs): if raise_on_error or can_connect(url, error_classes): raise else: - skip("Skipping test due to lack of connectivity" - " and error {error}".format(error=e)) + skip( + "Skipping test due to lack of connectivity" + " and error {error}".format(error=e) + ) return wrapper @@ -2230,8 +2451,7 @@ def wrapper(*args, **kwargs): with_connectivity_check = network -def assert_raises_regex(_exception, _regexp, _callable=None, - *args, **kwargs): +def assert_raises_regex(_exception, _regexp, _callable=None, *args, **kwargs): r""" Check that the specified Exception is raised and that the error message matches a given regular expression pattern. This may be a regular @@ -2271,9 +2491,15 @@ def assert_raises_regex(_exception, _regexp, _callable=None, AssertionError: "banana" does not match "'str' object does not support \ item assignment" """ - warnings.warn(("assert_raises_regex has been deprecated and will " - "be removed in the next release. Please use " - "`pytest.raises` instead."), FutureWarning, stacklevel=2) + warnings.warn( + ( + "assert_raises_regex has been deprecated and will " + "be removed in the next release. Please use " + "`pytest.raises` instead." 
+ ), + FutureWarning, + stacklevel=2, + ) manager = _AssertRaisesContextmanager(exception=_exception, regexp=_regexp) if _callable is not None: @@ -2351,7 +2577,8 @@ def exception_matches(self, exc_type, exc_value, trace_back): if not self.regexp.search(val): msg = '"{pat}" does not match "{val}"'.format( - pat=self.regexp.pattern, val=val) + pat=self.regexp.pattern, val=val + ) e = AssertionError(msg) raise_with_traceback(e, trace_back) @@ -2362,9 +2589,13 @@ def exception_matches(self, exc_type, exc_value, trace_back): @contextmanager -def assert_produces_warning(expected_warning=Warning, filter_level="always", - clear=None, check_stacklevel=True, - raise_on_extra_warnings=True): +def assert_produces_warning( + expected_warning=Warning, + filter_level="always", + clear=None, + check_stacklevel=True, + raise_on_extra_warnings=True, +): """ Context manager for running code expected to either raise a specific warning, or not raise any warnings. Verifies that the code raises the @@ -2447,30 +2678,40 @@ class for all warnings. To check that no warning is returned, extra_warnings = [] for actual_warning in w: - if (expected_warning and issubclass(actual_warning.category, - expected_warning)): + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): saw_warning = True - if check_stacklevel and issubclass(actual_warning.category, - (FutureWarning, - DeprecationWarning)): + if check_stacklevel and issubclass( + actual_warning.category, (FutureWarning, DeprecationWarning) + ): from inspect import getframeinfo, stack + caller = getframeinfo(stack()[2][0]) - msg = ("Warning not set with correct stacklevel. " - "File where warning is raised: {actual} != " - "{caller}. Warning message: {message}" - ).format(actual=actual_warning.filename, - caller=caller.filename, - message=actual_warning.message) + msg = ( + "Warning not set with correct stacklevel. " + "File where warning is raised: {actual} != " + "{caller}. Warning message: {message}" + ).format( + actual=actual_warning.filename, + caller=caller.filename, + message=actual_warning.message, + ) assert actual_warning.filename == caller.filename, msg else: - extra_warnings.append((actual_warning.category.__name__, - actual_warning.message, - actual_warning.filename, - actual_warning.lineno)) + extra_warnings.append( + ( + actual_warning.category.__name__, + actual_warning.message, + actual_warning.filename, + actual_warning.lineno, + ) + ) if expected_warning: msg = "Did not see expected warning of class {name!r}.".format( - name=expected_warning.__name__) + name=expected_warning.__name__ + ) assert saw_warning, msg if raise_on_extra_warnings and extra_warnings: raise AssertionError( @@ -2529,6 +2770,7 @@ def with_csv_dialect(name, **kwargs): csv : Python's CSV library. 
""" import csv + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} if name in _BUILTIN_DIALECTS: @@ -2542,6 +2784,7 @@ def with_csv_dialect(name, **kwargs): @contextmanager def use_numexpr(use, min_elements=None): from pandas.core.computation import expressions as expr + if min_elements is None: min_elements = expr._MIN_ELEMENTS @@ -2590,19 +2833,20 @@ def inner(*args, **kwargs): threads = [] for i in range(num_threads): updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, - kwargs=updated_kwargs) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) threads.append(thread) for thread in threads: thread.start() for thread in threads: thread.join() + return inner + return wrapper class SubclassedSeries(Series): - _metadata = ['testattr', 'name'] + _metadata = ["testattr", "name"] @property def _constructor(self): @@ -2614,7 +2858,7 @@ def _constructor_expanddim(self): class SubclassedDataFrame(DataFrame): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2626,7 +2870,7 @@ def _constructor_sliced(self): class SubclassedSparseSeries(pd.SparseSeries): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2638,7 +2882,7 @@ def _constructor_expanddim(self): class SubclassedSparseDataFrame(pd.SparseDataFrame): - _metadata = ['testattr'] + _metadata = ["testattr"] @property def _constructor(self): @@ -2650,7 +2894,6 @@ def _constructor_sliced(self): class SubclassedCategorical(Categorical): - @property def _constructor(self): return SubclassedCategorical @@ -2685,14 +2928,14 @@ def set_timezone(tz): def setTZ(tz): if tz is None: try: - del os.environ['TZ'] + del os.environ["TZ"] except KeyError: pass else: - os.environ['TZ'] = tz + os.environ["TZ"] = tz time.tzset() - orig_tz = os.environ.get('TZ') + orig_tz = os.environ.get("TZ") setTZ(tz) try: yield @@ -2716,9 +2959,12 @@ def _make_skipna_wrapper(alternative, skipna_alternative=None): skipna_wrapper : function """ if skipna_alternative: + def skipna_wrapper(x): return skipna_alternative(x.values) + else: + def skipna_wrapper(x): nona = x.dropna() if len(nona) == 0: diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py index f5cdbbe36d90d..4ca1354321134 100644 --- a/scripts/download_wheels.py +++ b/scripts/download_wheels.py @@ -16,16 +16,18 @@ def parse_args(args=None): def fetch(version): - base = 'http://wheels.scipy.org' + base = "http://wheels.scipy.org" tree = html.parse(base) root = tree.getroot() - dest = pathlib.Path('dist') + dest = pathlib.Path("dist") dest.mkdir(exist_ok=True) - files = [x for x in root.xpath("//a/text()") - if x.startswith('pandas-{}'.format(version)) - and not dest.joinpath(x).exists()] + files = [ + x + for x in root.xpath("//a/text()") + if x.startswith("pandas-{}".format(version)) and not dest.joinpath(x).exists() + ] N = len(files) @@ -33,9 +35,9 @@ def fetch(version): out = str(dest.joinpath(filename)) link = urllib.request.urljoin(base, filename) urllib.request.urlretrieve(link, out) - print("Downloaded {link} to {out} [{i}/{N}]".format( - link=link, out=out, i=i, N=N - )) + print( + "Downloaded {link} to {out} [{i}/{N}]".format(link=link, out=out, i=i, N=N) + ) def main(args=None): @@ -43,5 +45,5 @@ def main(args=None): fetch(args.version) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 959623f4e2b65..1075a257d4270 100755 --- 
a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -28,32 +28,50 @@ Find all commits touching a specified function across the codebase. """.strip() argparser = argparse.ArgumentParser(description=desc) -argparser.add_argument('funcname', metavar='FUNCNAME', - help='Name of function/method to search for changes on') -argparser.add_argument('-f', '--file-masks', metavar='f_re(,f_re)*', - default=[r"\.py.?$"], - help='comma separated list of regexes to match ' - 'filenames against\ndefaults all .py? files') -argparser.add_argument('-d', '--dir-masks', metavar='d_re(,d_re)*', - default=[], - help='comma separated list of regexes to match base ' - 'path against') -argparser.add_argument('-p', '--path-masks', metavar='p_re(,p_re)*', - default=[], - help='comma separated list of regexes to match full ' - 'file path against') -argparser.add_argument('-y', '--saw-the-warning', - action='store_true', default=False, - help='must specify this to run, acknowledge you ' - 'realize this will erase untracked files') -argparser.add_argument('--debug-level', - default="CRITICAL", - help='debug level of messages (DEBUG, INFO, etc...)') +argparser.add_argument( + "funcname", + metavar="FUNCNAME", + help="Name of function/method to search for changes on", +) +argparser.add_argument( + "-f", + "--file-masks", + metavar="f_re(,f_re)*", + default=[r"\.py.?$"], + help="comma separated list of regexes to match " + "filenames against\ndefaults all .py? files", +) +argparser.add_argument( + "-d", + "--dir-masks", + metavar="d_re(,d_re)*", + default=[], + help="comma separated list of regexes to match base " "path against", +) +argparser.add_argument( + "-p", + "--path-masks", + metavar="p_re(,p_re)*", + default=[], + help="comma separated list of regexes to match full " "file path against", +) +argparser.add_argument( + "-y", + "--saw-the-warning", + action="store_true", + default=False, + help="must specify this to run, acknowledge you " + "realize this will erase untracked files", +) +argparser.add_argument( + "--debug-level", + default="CRITICAL", + help="debug level of messages (DEBUG, INFO, etc...)", +) args = argparser.parse_args() -lfmt = logging.Formatter(fmt='%(levelname)-8s %(message)s', - datefmt='%m-%d %H:%M:%S') +lfmt = logging.Formatter(fmt="%(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") shh = logging.StreamHandler() shh.setFormatter(lfmt) logger = logging.getLogger("findit") @@ -70,19 +88,21 @@ def clean_checkout(comm): s = s.split("\n")[0] logger.info("CO: %s %s" % (comm, s)) - sh.git('checkout', comm, _tty_out=False) - sh.git('clean', '-f') + sh.git("checkout", comm, _tty_out=False) + sh.git("clean", "-f") def get_hits(defname, files=()): cs = set() for f in files: try: - r = sh.git('blame', - '-L', - r'/def\s*{start}/,/def/'.format(start=defname), - f, - _tty_out=False) + r = sh.git( + "blame", + "-L", + r"/def\s*{start}/,/def/".format(start=defname), + f, + _tty_out=False, + ) except sh.ErrorReturnCode_128: logger.debug("no matches in %s" % f) continue @@ -96,31 +116,33 @@ def get_hits(defname, files=()): return cs -def get_commit_info(c, fmt, sep='\t'): - r = sh.git('log', - "--format={}".format(fmt), - '{}^..{}'.format(c, c), - "-n", - "1", - _tty_out=False) +def get_commit_info(c, fmt, sep="\t"): + r = sh.git( + "log", + "--format={}".format(fmt), + "{}^..{}".format(c, c), + "-n", + "1", + _tty_out=False, + ) return str(r).split(sep) def get_commit_vitals(c, hlen=HASH_LEN): - h, s, d = get_commit_info(c, '%H\t%s\t%ci', "\t") + h, s, d = 
get_commit_info(c, "%H\t%s\t%ci", "\t") return h[:hlen], s, parse(d) def file_filter(state, dirname, fnames): - if (args.dir_masks and - not any(re.search(x, dirname) for x in args.dir_masks)): + if args.dir_masks and not any(re.search(x, dirname) for x in args.dir_masks): return for f in fnames: p = os.path.abspath(os.path.join(os.path.realpath(dirname), f)) - if (any(re.search(x, f) for x in args.file_masks) or - any(re.search(x, p) for x in args.path_masks)): + if any(re.search(x, f) for x in args.file_masks) or any( + re.search(x, p) for x in args.path_masks + ): if os.path.isfile(p): - state['files'].append(p) + state["files"].append(p) def search(defname, head_commit="HEAD"): @@ -130,7 +152,7 @@ def search(defname, head_commit="HEAD"): # allhits = set() files = [] state = dict(files=files) - os.walk('.', file_filter, state) + os.walk(".", file_filter, state) # files now holds a list of paths to files # seed with hits from q @@ -172,8 +194,10 @@ def sorter(i): h, s, d = get_commit_vitals(hits[i].commit) return hits[i].path, d - print(('\nThese commits touched the %s method in these files ' - 'on these dates:\n') % args.funcname) + print( + ("\nThese commits touched the %s method in these files " "on these dates:\n") + % args.funcname + ) for i in sorted(range(len(hits)), key=sorter): hit = hits[i] h, s, d = get_commit_vitals(hit.commit) @@ -181,7 +205,7 @@ def sorter(i): fmt = "{:%d} {:10} {:<%d} {:<%d}" % (HASH_LEN, SUBJ_LEN, PATH_LEN) if len(s) > SUBJ_LEN: - s = s[:SUBJ_LEN - 5] + " ..." + s = s[: SUBJ_LEN - 5] + " ..." print(fmt.format(h[:HASH_LEN], d.isoformat()[:10], s, p[-20:])) print("\n") @@ -190,21 +214,23 @@ def sorter(i): def main(): if not args.saw_the_warning: argparser.print_help() - print(""" + print( + """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! WARNING: this script uses git clean -f, running it on a repo with untracked files. It's recommended that you make a fresh clone and run from its root directory. You must specify the -y argument to ignore this warning. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-""") +""" + ) return if isinstance(args.file_masks, str): - args.file_masks = args.file_masks.split(',') + args.file_masks = args.file_masks.split(",") if isinstance(args.path_masks, str): - args.path_masks = args.path_masks.split(',') + args.path_masks = args.path_masks.split(",") if isinstance(args.dir_masks, str): - args.dir_masks = args.dir_masks.split(',') + args.dir_masks = args.dir_masks.split(",") logger.setLevel(getattr(logging, args.debug_level)) @@ -214,4 +240,5 @@ def main(): if __name__ == "__main__": import sys + sys.exit(main()) diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 7b6eb1f9a32b5..ac73859b22598 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -19,8 +19,8 @@ import yaml -EXCLUDE = {'python=3'} -RENAME = {'pytables': 'tables'} +EXCLUDE = {"python=3"} +RENAME = {"pytables": "tables"} def conda_package_to_pip(package): @@ -36,15 +36,15 @@ def conda_package_to_pip(package): if package in EXCLUDE: return - package = re.sub('(?<=[^<>])=', '==', package).strip() - for compare in ('<=', '>=', '=='): + package = re.sub("(?<=[^<>])=", "==", package).strip() + for compare in ("<=", ">=", "=="): if compare not in package: continue pkg, version = package.split(compare) if pkg in RENAME: - return ''.join((RENAME[pkg], compare, version)) + return "".join((RENAME[pkg], compare, version)) break @@ -73,7 +73,7 @@ def main(conda_fname, pip_fname, compare=False): True if the comparison fails, False otherwise """ with open(conda_fname) as conda_fd: - deps = yaml.safe_load(conda_fd)['dependencies'] + deps = yaml.safe_load(conda_fd)["dependencies"] pip_deps = [] for dep in deps: @@ -81,42 +81,51 @@ def main(conda_fname, pip_fname, compare=False): conda_dep = conda_package_to_pip(dep) if conda_dep: pip_deps.append(conda_dep) - elif isinstance(dep, dict) and len(dep) == 1 and 'pip' in dep: - pip_deps += dep['pip'] + elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: + pip_deps += dep["pip"] else: - raise ValueError('Unexpected dependency {}'.format(dep)) + raise ValueError("Unexpected dependency {}".format(dep)) - pip_content = '\n'.join(pip_deps) + pip_content = "\n".join(pip_deps) if compare: with open(pip_fname) as pip_fd: return pip_content != pip_fd.read() else: - with open(pip_fname, 'w') as pip_fd: + with open(pip_fname, "w") as pip_fd: pip_fd.write(pip_content) return False -if __name__ == '__main__': +if __name__ == "__main__": argparser = argparse.ArgumentParser( - description='convert (or compare) conda file to pip') - argparser.add_argument('--compare', - action='store_true', - help='compare whether the two files are equivalent') - argparser.add_argument('--azure', - action='store_true', - help='show the output in azure-pipelines format') + description="convert (or compare) conda file to pip" + ) + argparser.add_argument( + "--compare", + action="store_true", + help="compare whether the two files are equivalent", + ) + argparser.add_argument( + "--azure", action="store_true", help="show the output in azure-pipelines format" + ) args = argparser.parse_args() repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - res = main(os.path.join(repo_path, 'environment.yml'), - os.path.join(repo_path, 'requirements-dev.txt'), - compare=args.compare) + res = main( + os.path.join(repo_path, "environment.yml"), + os.path.join(repo_path, "requirements-dev.txt"), + compare=args.compare, + ) if res: - msg = ('`requirements-dev.txt` has to be generated with 
`{}` after ' - '`environment.yml` is modified.\n'.format(sys.argv[0])) + msg = ( + "`requirements-dev.txt` has to be generated with `{}` after " + "`environment.yml` is modified.\n".format(sys.argv[0]) + ) if args.azure: - msg = ('##vso[task.logissue type=error;' - 'sourcepath=requirements-dev.txt]{}'.format(msg)) + msg = ( + "##vso[task.logissue type=error;" + "sourcepath=requirements-dev.txt]{}".format(msg) + ) sys.stderr.write(msg) sys.exit(res) diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 5c665faac5976..95352751a23c6 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -30,8 +30,8 @@ import sys import textwrap -PANDAS_HOME = '.' -PROJECT_NAME = 'pandas' +PANDAS_HOME = "." +PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) # Remote name with the PR @@ -51,10 +51,12 @@ auth_required = False if auth_required: - GITHUB_USERNAME = os.environ['GITHUB_USER'] + GITHUB_USERNAME = os.environ["GITHUB_USER"] import getpass - GITHUB_PASSWORD = getpass.getpass('Enter github.com password for %s:' - % GITHUB_USERNAME) + + GITHUB_PASSWORD = getpass.getpass( + "Enter github.com password for %s:" % GITHUB_USERNAME + ) def get_json_auth(url): auth = HTTPBasicAuth(GITHUB_USERNAME, GITHUB_PASSWORD) @@ -63,6 +65,7 @@ def get_json_auth(url): get_json = get_json_auth else: + def get_json_no_auth(url): req = requests.get(url) return req.json() @@ -78,12 +81,12 @@ def fail(msg): def run_cmd(cmd): if isinstance(cmd, str): - cmd = cmd.split(' ') + cmd = cmd.split(" ") output = check_output(cmd) if isinstance(output, bytes): - output = output.decode('utf-8') + output = output.decode("utf-8") return output @@ -119,40 +122,44 @@ def clean_up(): def merge_pr(pr_num, target_ref): pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, - target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, - pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, - target_branch_name)) + target_branch_name = "%s_MERGE_PR_%s_%s" % ( + BRANCH_PREFIX, + pr_num, + target_ref.upper(), + ) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) run_cmd("git checkout %s" % target_branch_name) had_conflicts = False try: - run_cmd(['git', 'merge', pr_branch_name, '--squash']) + run_cmd(["git", "merge", pr_branch_name, "--squash"]) except Exception as e: - msg = ("Error merging: %s\nWould you like to manually fix-up " - "this merge?" % e) + msg = "Error merging: %s\nWould you like to manually fix-up " "this merge?" % e continue_maybe(msg) - msg = ("Okay, please fix any conflicts and 'git add' " - "conflicting files... Finished?") + msg = ( + "Okay, please fix any conflicts and 'git add' " + "conflicting files... Finished?" 
+ ) continue_maybe(msg) had_conflicts = True - commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%an <%ae>']).split("\n") - distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), - reverse=True) + commit_authors = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%an <%ae>"] + ).split("\n") + distinct_authors = sorted( + set(commit_authors), key=lambda x: commit_authors.count(x), reverse=True + ) primary_author = distinct_authors[0] - commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, - '--pretty=format:%h [%an] %s']).split("\n\n") + commits = run_cmd( + ["git", "log", "HEAD..%s" % pr_branch_name, "--pretty=format:%h [%an] %s"] + ).split("\n\n") merge_message_flags = [] merge_message_flags += ["-m", title] if body is not None: - merge_message_flags += ["-m", '\n'.join(textwrap.wrap(body))] + merge_message_flags += ["-m", "\n".join(textwrap.wrap(body))] authors = "\n".join("Author: %s" % a for a in distinct_authors) @@ -161,9 +168,10 @@ def merge_pr(pr_num, target_ref): if had_conflicts: committer_name = run_cmd("git config --get user.name").strip() committer_email = run_cmd("git config --get user.email").strip() - message = ("This patch had conflicts when merged, " - "resolved by\nCommitter: %s <%s>" - % (committer_name, committer_email)) + message = ( + "This patch had conflicts when merged, " + "resolved by\nCommitter: %s <%s>" % (committer_name, committer_email) + ) merge_message_flags += ["-m", message] # The string "Closes #%s" string is required for GitHub to correctly close @@ -171,19 +179,22 @@ def merge_pr(pr_num, target_ref): merge_message_flags += [ "-m", "Closes #%s from %s and squashes the following commits:" - % (pr_num, pr_repo_desc)] + % (pr_num, pr_repo_desc), + ] for c in commits: merge_message_flags += ["-m", c] - run_cmd(['git', 'commit', '--author="%s"' % primary_author] + - merge_message_flags) + run_cmd(["git", "commit", '--author="%s"' % primary_author] + merge_message_flags) - continue_maybe("Merge complete (local ref %s). Push to %s?" % ( - target_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Merge complete (local ref %s). Push to %s?" + % (target_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, - target_ref)) + run_cmd( + "git push %s %s:%s" % (PUSH_REMOTE_NAME, target_branch_name, target_ref) + ) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -199,25 +210,26 @@ def update_pr(pr_num, user_login, base_ref): pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, - pr_branch_name)) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) run_cmd("git checkout %s" % pr_branch_name) - continue_maybe("Update ready (local ref %s)? Push to %s/%s?" % ( - pr_branch_name, user_login, base_ref)) + continue_maybe( + "Update ready (local ref %s)? Push to %s/%s?" 
+ % (pr_branch_name, user_login, base_ref) + ) push_user_remote = "https://github.com/%s/pandas.git" % user_login try: - run_cmd('git push %s %s:%s' % (push_user_remote, pr_branch_name, - base_ref)) + run_cmd("git push %s %s:%s" % (push_user_remote, pr_branch_name, base_ref)) except Exception as e: if continue_maybe2("Force push?"): try: run_cmd( - 'git push -f %s %s:%s' % (push_user_remote, pr_branch_name, - base_ref)) + "git push -f %s %s:%s" + % (push_user_remote, pr_branch_name, base_ref) + ) except Exception as e: fail("Exception while pushing: %s" % e) clean_up() @@ -234,20 +246,19 @@ def cherry_pick(pr_num, merge_hash, default_branch): if pick_ref == "": pick_ref = default_branch - pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, - pick_ref.upper()) + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, - pick_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) run_cmd("git checkout %s" % pick_branch_name) run_cmd("git cherry-pick -sx %s" % merge_hash) - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) + continue_maybe( + "Pick complete (local ref %s). Push to %s?" + % (pick_branch_name, PUSH_REMOTE_NAME) + ) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, - pick_ref)) + run_cmd("git push %s %s:%s" % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -282,35 +293,44 @@ def fix_version_from_branch(branch, versions): pr_repo_desc = "%s/%s" % (user_login, base_ref) if pr["merged"] is True: - print("Pull request {0} has already been merged, please backport manually" - .format(pr_num)) + print( + "Pull request {0} has already been merged, please backport manually".format( + pr_num + ) + ) sys.exit(0) if not bool(pr["mergeable"]): - msg = ("Pull request {0} is not mergeable in its current form.\n" - "Continue? (experts only!)".format(pr_num)) + msg = ( + "Pull request {0} is not mergeable in its current form.\n" + "Continue? (experts only!)".format(pr_num) + ) continue_maybe(msg) print("\n=== Pull Request #%s ===" % pr_num) # we may have un-printable unicode in our title try: - title = title.encode('raw_unicode_escape') + title = title.encode("raw_unicode_escape") except Exception: pass -print("title\t{title}\nsource\t{source}\ntarget\t{target}\nurl\t{url}".format( - title=title, source=pr_repo_desc, target=target_ref, url=url)) +print( + "title\t{title}\nsource\t{source}\ntarget\t{target}\nurl\t{url}".format( + title=title, source=pr_repo_desc, target=target_ref, url=url + ) +) merged_refs = [target_ref] print("\nProceed with updating or merging pull request #%s?" % pr_num) -update = input("Update PR and push to remote (r), merge locally (l), " - "or do nothing (n) ?") +update = input( + "Update PR and push to remote (r), merge locally (l), " "or do nothing (n) ?" +) update = update.lower() -if update == 'r': +if update == "r": merge_hash = update_pr(pr_num, user_login, base_ref) -elif update == 'l': +elif update == "l": merge_hash = merge_pr(pr_num, target_ref) diff --git a/scripts/tests/conftest.py b/scripts/tests/conftest.py index f8318b8d402af..496a5195bfc84 100644 --- a/scripts/tests/conftest.py +++ b/scripts/tests/conftest.py @@ -1,3 +1,6 @@ def pytest_addoption(parser): - parser.addoption("--strict-data-files", action="store_true", - help="Unused. 
For compat with setup.cfg.") + parser.addoption( + "--strict-data-files", + action="store_true", + help="Unused. For compat with setup.cfg.", + ) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 34395435bd8c5..f3364e6725a20 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -7,6 +7,7 @@ import pandas as pd import validate_docstrings + validate_one = validate_docstrings.validate_one @@ -18,7 +19,7 @@ class GoodDocStrings: script without any errors. """ - def plot(self, kind, color='blue', **kwargs): + def plot(self, kind, color="blue", **kwargs): """ Generate a plot. @@ -244,8 +245,10 @@ def empty_returns(self): Since this function never returns a value, this docstring doesn't need a return section. """ + def say_hello(): return "Hello World!" + say_hello() if True: return @@ -446,7 +449,6 @@ def method_wo_docstrings(self): class BadSummaries: - def wrong_line(self): """Exists on the wrong line""" pass @@ -612,7 +614,6 @@ def list_incorrect_parameter_type(self, kind): class BadReturns: - def return_not_documented(self): """ Lacks section for Returns @@ -695,7 +696,6 @@ def no_period_multi(self): class BadSeeAlso: - def desc_no_period(self): """ Return the first 5 elements of the Series. @@ -733,7 +733,6 @@ def prefix_pandas(self): class BadExamples: - def unused_import(self): """ Examples @@ -771,7 +770,6 @@ def missing_whitespace_after_comma(self): class TestValidator: - def _import_path(self, klass=None, func=None): """ Build the required import path for tests in this module. @@ -799,162 +797,314 @@ def _import_path(self, klass=None, func=None): return base_path def test_good_class(self, capsys): - errors = validate_one(self._import_path( - klass='GoodDocStrings'))['errors'] + errors = validate_one(self._import_path(klass="GoodDocStrings"))["errors"] assert isinstance(errors, list) assert not errors - @pytest.mark.parametrize("func", [ - 'plot', 'sample', 'random_letters', 'sample_values', 'head', 'head1', - 'contains', 'mode', 'good_imports', 'no_returns', 'empty_returns']) + @pytest.mark.parametrize( + "func", + [ + "plot", + "sample", + "random_letters", + "sample_values", + "head", + "head1", + "contains", + "mode", + "good_imports", + "no_returns", + "empty_returns", + ], + ) def test_good_functions(self, capsys, func): - errors = validate_one(self._import_path( - klass='GoodDocStrings', func=func))['errors'] + errors = validate_one(self._import_path(klass="GoodDocStrings", func=func))[ + "errors" + ] assert isinstance(errors, list) assert not errors def test_bad_class(self, capsys): - errors = validate_one(self._import_path( - klass='BadGenericDocStrings'))['errors'] + errors = validate_one(self._import_path(klass="BadGenericDocStrings"))["errors"] assert isinstance(errors, list) assert errors - @pytest.mark.parametrize("func", [ - 'func', 'astype', 'astype1', 'astype2', 'astype3', 'plot', 'method', - 'private_classes', - ]) + @pytest.mark.parametrize( + "func", + [ + "func", + "astype", + "astype1", + "astype2", + "astype3", + "plot", + "method", + "private_classes", + ], + ) def test_bad_generic_functions(self, capsys, func): - errors = validate_one(self._import_path( # noqa:F821 - klass='BadGenericDocStrings', func=func))['errors'] + errors = validate_one( + self._import_path(klass="BadGenericDocStrings", func=func) # noqa:F821 + )["errors"] assert isinstance(errors, list) assert errors - @pytest.mark.parametrize("klass,func,msgs", [ - # See Also tests - 
('BadGenericDocStrings', 'private_classes', - ("Private classes (NDFrame) should not be mentioned in public " - 'docstrings',)), - ('BadGenericDocStrings', 'unknown_section', - ('Found unknown section "Unknown Section".',)), - ('BadGenericDocStrings', 'sections_in_wrong_order', - ('Sections are in the wrong order. Correct order is: Parameters, ' - 'See Also, Examples',)), - ('BadGenericDocStrings', 'deprecation_in_wrong_order', - ('Deprecation warning should precede extended summary',)), - ('BadSeeAlso', 'desc_no_period', - ('Missing period at end of description for See Also "Series.iloc"',)), - ('BadSeeAlso', 'desc_first_letter_lowercase', - ('should be capitalized for See Also "Series.tail"',)), - # Summary tests - ('BadSummaries', 'wrong_line', - ('should start in the line immediately after the opening quotes',)), - ('BadSummaries', 'no_punctuation', - ('Summary does not end with a period',)), - ('BadSummaries', 'no_capitalization', - ('Summary does not start with a capital letter',)), - ('BadSummaries', 'no_capitalization', - ('Summary must start with infinitive verb',)), - ('BadSummaries', 'multi_line', - ('Summary should fit in a single line',)), - ('BadSummaries', 'two_paragraph_multi_line', - ('Summary should fit in a single line',)), - # Parameters tests - ('BadParameters', 'missing_params', - ('Parameters {**kwargs} not documented',)), - ('BadParameters', 'bad_colon_spacing', - ('Parameter "kind" requires a space before the colon ' - 'separating the parameter name and type',)), - ('BadParameters', 'no_description_period', - ('Parameter "kind" description should finish with "."',)), - ('BadParameters', 'no_description_period_with_directive', - ('Parameter "kind" description should finish with "."',)), - ('BadParameters', 'parameter_capitalization', - ('Parameter "kind" description should start with a capital letter',)), - ('BadParameters', 'integer_parameter', - ('Parameter "kind" type should use "int" instead of "integer"',)), - ('BadParameters', 'string_parameter', - ('Parameter "kind" type should use "str" instead of "string"',)), - ('BadParameters', 'boolean_parameter', - ('Parameter "kind" type should use "bool" instead of "boolean"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "bool" instead of "boolean"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "int" instead of "integer"',)), - ('BadParameters', 'list_incorrect_parameter_type', - ('Parameter "kind" type should use "str" instead of "string"',)), - pytest.param('BadParameters', 'blank_lines', ('No error yet?',), - marks=pytest.mark.xfail), - # Returns tests - ('BadReturns', 'return_not_documented', ('No Returns section found',)), - ('BadReturns', 'yield_not_documented', ('No Yields section found',)), - pytest.param('BadReturns', 'no_type', ('foo',), - marks=pytest.mark.xfail), - ('BadReturns', 'no_description', - ('Return value has no description',)), - ('BadReturns', 'no_punctuation', - ('Return value description should finish with "."',)), - ('BadReturns', 'named_single_return', - ('The first line of the Returns section should contain only the ' - 'type, unless multiple values are being returned',)), - ('BadReturns', 'no_capitalization', - ('Return value description should start with a capital ' - 'letter',)), - ('BadReturns', 'no_period_multi', - ('Return value description should finish with "."',)), - # Examples tests - ('BadGenericDocStrings', 'method', - ('Do not import numpy, as it is imported automatically',)), - 
('BadGenericDocStrings', 'method', - ('Do not import pandas, as it is imported automatically',)), - ('BadGenericDocStrings', 'method_wo_docstrings', - ("The object does not have a docstring",)), - # See Also tests - ('BadSeeAlso', 'prefix_pandas', - ('pandas.Series.rename in `See Also` section ' - 'does not need `pandas` prefix',)), - # Examples tests - ('BadExamples', 'unused_import', - ("flake8 error: F401 'pandas as pdf' imported but unused",)), - ('BadExamples', 'indentation_is_not_a_multiple_of_four', - ('flake8 error: E111 indentation is not a multiple of four',)), - ('BadExamples', 'missing_whitespace_around_arithmetic_operator', - ('flake8 error: ' - 'E226 missing whitespace around arithmetic operator',)), - ('BadExamples', 'missing_whitespace_after_comma', - ("flake8 error: E231 missing whitespace after ',' (3 times)",)), - ('BadGenericDocStrings', 'two_linebreaks_between_sections', - ('Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings',)), - ('BadGenericDocStrings', 'linebreak_at_end_of_docstring', - ('Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings',)), - ]) + @pytest.mark.parametrize( + "klass,func,msgs", + [ + # See Also tests + ( + "BadGenericDocStrings", + "private_classes", + ( + "Private classes (NDFrame) should not be mentioned in public " + "docstrings", + ), + ), + ( + "BadGenericDocStrings", + "unknown_section", + ('Found unknown section "Unknown Section".',), + ), + ( + "BadGenericDocStrings", + "sections_in_wrong_order", + ( + "Sections are in the wrong order. Correct order is: Parameters, " + "See Also, Examples", + ), + ), + ( + "BadGenericDocStrings", + "deprecation_in_wrong_order", + ("Deprecation warning should precede extended summary",), + ), + ( + "BadSeeAlso", + "desc_no_period", + ('Missing period at end of description for See Also "Series.iloc"',), + ), + ( + "BadSeeAlso", + "desc_first_letter_lowercase", + ('should be capitalized for See Also "Series.tail"',), + ), + # Summary tests + ( + "BadSummaries", + "wrong_line", + ("should start in the line immediately after the opening quotes",), + ), + ("BadSummaries", "no_punctuation", ("Summary does not end with a period",)), + ( + "BadSummaries", + "no_capitalization", + ("Summary does not start with a capital letter",), + ), + ( + "BadSummaries", + "no_capitalization", + ("Summary must start with infinitive verb",), + ), + ("BadSummaries", "multi_line", ("Summary should fit in a single line",)), + ( + "BadSummaries", + "two_paragraph_multi_line", + ("Summary should fit in a single line",), + ), + # Parameters tests + ( + "BadParameters", + "missing_params", + ("Parameters {**kwargs} not documented",), + ), + ( + "BadParameters", + "bad_colon_spacing", + ( + 'Parameter "kind" requires a space before the colon ' + "separating the parameter name and type", + ), + ), + ( + "BadParameters", + "no_description_period", + ('Parameter "kind" description should finish with "."',), + ), + ( + "BadParameters", + "no_description_period_with_directive", + ('Parameter "kind" description should finish with "."',), + ), + ( + "BadParameters", + "parameter_capitalization", + ('Parameter "kind" description should start with a capital letter',), + ), + ( + "BadParameters", + "integer_parameter", + ('Parameter "kind" type should use "int" instead of "integer"',), + ), + ( + "BadParameters", + "string_parameter", + 
('Parameter "kind" type should use "str" instead of "string"',), + ), + ( + "BadParameters", + "boolean_parameter", + ('Parameter "kind" type should use "bool" instead of "boolean"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "bool" instead of "boolean"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "int" instead of "integer"',), + ), + ( + "BadParameters", + "list_incorrect_parameter_type", + ('Parameter "kind" type should use "str" instead of "string"',), + ), + pytest.param( + "BadParameters", + "blank_lines", + ("No error yet?",), + marks=pytest.mark.xfail, + ), + # Returns tests + ("BadReturns", "return_not_documented", ("No Returns section found",)), + ("BadReturns", "yield_not_documented", ("No Yields section found",)), + pytest.param("BadReturns", "no_type", ("foo",), marks=pytest.mark.xfail), + ("BadReturns", "no_description", ("Return value has no description",)), + ( + "BadReturns", + "no_punctuation", + ('Return value description should finish with "."',), + ), + ( + "BadReturns", + "named_single_return", + ( + "The first line of the Returns section should contain only the " + "type, unless multiple values are being returned", + ), + ), + ( + "BadReturns", + "no_capitalization", + ("Return value description should start with a capital " "letter",), + ), + ( + "BadReturns", + "no_period_multi", + ('Return value description should finish with "."',), + ), + # Examples tests + ( + "BadGenericDocStrings", + "method", + ("Do not import numpy, as it is imported automatically",), + ), + ( + "BadGenericDocStrings", + "method", + ("Do not import pandas, as it is imported automatically",), + ), + ( + "BadGenericDocStrings", + "method_wo_docstrings", + ("The object does not have a docstring",), + ), + # See Also tests + ( + "BadSeeAlso", + "prefix_pandas", + ( + "pandas.Series.rename in `See Also` section " + "does not need `pandas` prefix", + ), + ), + # Examples tests + ( + "BadExamples", + "unused_import", + ("flake8 error: F401 'pandas as pdf' imported but unused",), + ), + ( + "BadExamples", + "indentation_is_not_a_multiple_of_four", + ("flake8 error: E111 indentation is not a multiple of four",), + ), + ( + "BadExamples", + "missing_whitespace_around_arithmetic_operator", + ( + "flake8 error: " + "E226 missing whitespace around arithmetic operator", + ), + ), + ( + "BadExamples", + "missing_whitespace_after_comma", + ("flake8 error: E231 missing whitespace after ',' (3 times)",), + ), + ( + "BadGenericDocStrings", + "two_linebreaks_between_sections", + ( + "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + ), + ), + ( + "BadGenericDocStrings", + "linebreak_at_end_of_docstring", + ( + "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + ), + ), + ], + ) def test_bad_docstrings(self, capsys, klass, func, msgs): result = validate_one(self._import_path(klass=klass, func=func)) for msg in msgs: - assert msg in ' '.join(err[1] for err in result['errors']) + assert msg in " ".join(err[1] for err in result["errors"]) def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_one', lambda func_name: { - 'docstring': 'docstring1', - 'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err 
desc')], - 'warnings': [], - 'examples_errors': '', - 'deprecated': True}) - result = validate_docstrings.validate_all(prefix=None, - ignore_deprecated=True) + validate_docstrings, + "validate_one", + lambda func_name: { + "docstring": "docstring1", + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "warnings": [], + "examples_errors": "", + "deprecated": True, + }, + ) + result = validate_docstrings.validate_all(prefix=None, ignore_deprecated=True) assert len(result) == 0 class TestApiItems: @property def api_doc(self): - return io.StringIO(textwrap.dedent(''' + return io.StringIO( + textwrap.dedent( + """ .. currentmodule:: itertools Itertools @@ -987,73 +1137,88 @@ def api_doc(self): seed randint - ''')) - - @pytest.mark.parametrize('idx,name', [(0, 'itertools.cycle'), - (1, 'itertools.count'), - (2, 'itertools.chain'), - (3, 'random.seed'), - (4, 'random.randint')]) + """ + ) + ) + + @pytest.mark.parametrize( + "idx,name", + [ + (0, "itertools.cycle"), + (1, "itertools.count"), + (2, "itertools.chain"), + (3, "random.seed"), + (4, "random.randint"), + ], + ) def test_item_name(self, idx, name): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][0] == name - @pytest.mark.parametrize('idx,func', [(0, 'cycle'), - (1, 'count'), - (2, 'chain'), - (3, 'seed'), - (4, 'randint')]) + @pytest.mark.parametrize( + "idx,func", + [(0, "cycle"), (1, "count"), (2, "chain"), (3, "seed"), (4, "randint")], + ) def test_item_function(self, idx, func): result = list(validate_docstrings.get_api_items(self.api_doc)) assert callable(result[idx][1]) assert result[idx][1].__name__ == func - @pytest.mark.parametrize('idx,section', [(0, 'Itertools'), - (1, 'Itertools'), - (2, 'Itertools'), - (3, 'Random'), - (4, 'Random')]) + @pytest.mark.parametrize( + "idx,section", + [ + (0, "Itertools"), + (1, "Itertools"), + (2, "Itertools"), + (3, "Random"), + (4, "Random"), + ], + ) def test_item_section(self, idx, section): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][2] == section - @pytest.mark.parametrize('idx,subsection', [(0, 'Infinite'), - (1, 'Infinite'), - (2, 'Finite'), - (3, 'All'), - (4, 'All')]) + @pytest.mark.parametrize( + "idx,subsection", + [(0, "Infinite"), (1, "Infinite"), (2, "Finite"), (3, "All"), (4, "All")], + ) def test_item_subsection(self, idx, subsection): result = list(validate_docstrings.get_api_items(self.api_doc)) assert result[idx][3] == subsection class TestDocstringClass: - @pytest.mark.parametrize('name, expected_obj', - [('pandas.isnull', pd.isnull), - ('pandas.DataFrame', pd.DataFrame), - ('pandas.Series.sum', pd.Series.sum)]) + @pytest.mark.parametrize( + "name, expected_obj", + [ + ("pandas.isnull", pd.isnull), + ("pandas.DataFrame", pd.DataFrame), + ("pandas.Series.sum", pd.Series.sum), + ], + ) def test_resolves_class_name(self, name, expected_obj): d = validate_docstrings.Docstring(name) assert d.obj is expected_obj - @pytest.mark.parametrize('invalid_name', ['panda', 'panda.DataFrame']) + @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"]) def test_raises_for_invalid_module_name(self, invalid_name): msg = 'No module can be imported from "{}"'.format(invalid_name) with pytest.raises(ImportError, match=msg): validate_docstrings.Docstring(invalid_name) - @pytest.mark.parametrize('invalid_name', - ['pandas.BadClassName', - 'pandas.Series.bad_method_name']) + @pytest.mark.parametrize( + "invalid_name", ["pandas.BadClassName", "pandas.Series.bad_method_name"] 
+ ) def test_raises_for_invalid_attribute_name(self, invalid_name): - name_components = invalid_name.split('.') + name_components = invalid_name.split(".") obj_name, invalid_attr_name = name_components[-2], name_components[-1] msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name) with pytest.raises(AttributeError, match=msg): validate_docstrings.Docstring(invalid_name) - @pytest.mark.parametrize('name', ['pandas.Series.str.isdecimal', - 'pandas.Series.str.islower']) + @pytest.mark.parametrize( + "name", ["pandas.Series.str.isdecimal", "pandas.Series.str.islower"] + ) def test_encode_content_write_to_file(self, name): # GH25466 docstr = validate_docstrings.Docstring(name).validate_pep8() @@ -1064,97 +1229,141 @@ def test_encode_content_write_to_file(self, name): class TestMainFunction: def test_exit_status_for_validate_one(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_one', lambda func_name: { - 'docstring': 'docstring1', - 'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'warnings': [], - 'examples_errors': ''}) - exit_status = validate_docstrings.main(func_name='docstring1', - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + validate_docstrings, + "validate_one", + lambda func_name: { + "docstring": "docstring1", + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "warnings": [], + "examples_errors": "", + }, + ) + exit_status = validate_docstrings.main( + func_name="docstring1", + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 0 def test_exit_status_errors_for_validate_all(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'file': 'module1.py', - 'file_line': 23}, - 'docstring2': {'errors': [('ER04', 'err desc'), - ('ER05', 'err desc')], - 'file': 'module2.py', - 'file_line': 925}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + "docstring1": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "file": "module1.py", + "file_line": 23, + }, + "docstring2": { + "errors": [("ER04", "err desc"), ("ER05", "err desc")], + "file": "module2.py", + "file_line": 925, + }, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 5 def test_no_exit_status_noerrors_for_validate_all(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [], - 'warnings': [('WN01', 'warn desc')]}, - 'docstring2': {'errors': []}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='default', - ignore_deprecated=False) + "docstring1": {"errors": [], "warnings": [("WN01", "warn desc")]}, + "docstring2": {"errors": []}, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 0 def test_exit_status_for_validate_all_json(self, monkeypatch): - 
print('EXECUTED') + print("EXECUTED") monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'docstring1': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')]}, - 'docstring2': {'errors': [('ER04', 'err desc'), - ('ER05', 'err desc')]}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=[], - output_format='json', - ignore_deprecated=False) + "docstring1": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ] + }, + "docstring2": {"errors": [("ER04", "err desc"), ("ER05", "err desc")]}, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=[], + output_format="json", + ignore_deprecated=False, + ) assert exit_status == 0 def test_errors_param_filters_errors(self, monkeypatch): monkeypatch.setattr( - validate_docstrings, 'validate_all', + validate_docstrings, + "validate_all", lambda prefix, ignore_deprecated=False: { - 'Series.foo': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc'), - ('ER03', 'err desc')], - 'file': 'series.py', - 'file_line': 142}, - 'DataFrame.bar': {'errors': [('ER01', 'err desc'), - ('ER02', 'err desc')], - 'file': 'frame.py', - 'file_line': 598}, - 'Series.foobar': {'errors': [('ER01', 'err desc')], - 'file': 'series.py', - 'file_line': 279}}) - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=['ER01'], - output_format='default', - ignore_deprecated=False) + "Series.foo": { + "errors": [ + ("ER01", "err desc"), + ("ER02", "err desc"), + ("ER03", "err desc"), + ], + "file": "series.py", + "file_line": 142, + }, + "DataFrame.bar": { + "errors": [("ER01", "err desc"), ("ER02", "err desc")], + "file": "frame.py", + "file_line": 598, + }, + "Series.foobar": { + "errors": [("ER01", "err desc")], + "file": "series.py", + "file_line": 279, + }, + }, + ) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=["ER01"], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 3 - exit_status = validate_docstrings.main(func_name=None, - prefix=None, - errors=['ER03'], - output_format='default', - ignore_deprecated=False) + exit_status = validate_docstrings.main( + func_name=None, + prefix=None, + errors=["ER03"], + output_format="default", + ignore_deprecated=False, + ) assert exit_status == 1 diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index dddd5eb1f1eab..37623d32db685 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -40,9 +40,10 @@ # to avoid that plot windows are open from the doctests while running the # script. Setting here before matplotlib is loaded. 
# We don't warn for the number of open plots, as none is actually being opened -os.environ['MPLBACKEND'] = 'Template' +os.environ["MPLBACKEND"] = "Template" import matplotlib -matplotlib.rc('figure', max_open_warning=10000) + +matplotlib.rc("figure", max_open_warning=10000) import numpy @@ -51,80 +52,90 @@ sys.path.insert(0, os.path.join(BASE_PATH)) import pandas -sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext')) +sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) from numpydoc.docscrape import NumpyDocString from pandas.io.formats.printing import pprint_thing -PRIVATE_CLASSES = ['NDFrame', 'IndexOpsMixin'] -DIRECTIVES = ['versionadded', 'versionchanged', 'deprecated'] -ALLOWED_SECTIONS = ['Parameters', 'Attributes', 'Methods', 'Returns', 'Yields', - 'Other Parameters', 'Raises', 'Warns', 'See Also', 'Notes', - 'References', 'Examples'] +PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] +DIRECTIVES = ["versionadded", "versionchanged", "deprecated"] +ALLOWED_SECTIONS = [ + "Parameters", + "Attributes", + "Methods", + "Returns", + "Yields", + "Other Parameters", + "Raises", + "Warns", + "See Also", + "Notes", + "References", + "Examples", +] ERROR_MSGS = { - 'GL01': 'Docstring text (summary) should start in the line immediately ' - 'after the opening quotes (not in the same line, or leaving a ' - 'blank line in between)', - 'GL02': 'Closing quotes should be placed in the line after the last text ' - 'in the docstring (do not close the quotes in the same line as ' - 'the text, or leave a blank line between the last text and the ' - 'quotes)', - 'GL03': 'Double line break found; please use only one blank line to ' - 'separate sections or paragraphs, and do not leave blank lines ' - 'at the end of docstrings', - 'GL04': 'Private classes ({mentioned_private_classes}) should not be ' - 'mentioned in public docstrings', - 'GL05': 'Tabs found at the start of line "{line_with_tabs}", please use ' - 'whitespace only', - 'GL06': 'Found unknown section "{section}". Allowed sections are: ' - '{allowed_sections}', - 'GL07': 'Sections are in the wrong order. Correct order is: ' - '{correct_sections}', - 'GL08': 'The object does not have a docstring', - 'GL09': 'Deprecation warning should precede extended summary', - 'SS01': 'No summary found (a short summary in a single line should be ' - 'present at the beginning of the docstring)', - 'SS02': 'Summary does not start with a capital letter', - 'SS03': 'Summary does not end with a period', - 'SS04': 'Summary contains heading whitespaces', - 'SS05': 'Summary must start with infinitive verb, not third person ' - '(e.g. use "Generate" instead of "Generates")', - 'SS06': 'Summary should fit in a single line', - 'ES01': 'No extended summary found', - 'PR01': 'Parameters {missing_params} not documented', - 'PR02': 'Unknown parameters {unknown_params}', - 'PR03': 'Wrong parameters order. Actual: {actual_params}. 
' - 'Documented: {documented_params}', - 'PR04': 'Parameter "{param_name}" has no type', - 'PR05': 'Parameter "{param_name}" type should not finish with "."', - 'PR06': 'Parameter "{param_name}" type should use "{right_type}" instead ' - 'of "{wrong_type}"', - 'PR07': 'Parameter "{param_name}" has no description', - 'PR08': 'Parameter "{param_name}" description should start with a ' - 'capital letter', - 'PR09': 'Parameter "{param_name}" description should finish with "."', - 'PR10': 'Parameter "{param_name}" requires a space before the colon ' - 'separating the parameter name and type', - 'RT01': 'No Returns section found', - 'RT02': 'The first line of the Returns section should contain only the ' - 'type, unless multiple values are being returned', - 'RT03': 'Return value has no description', - 'RT04': 'Return value description should start with a capital letter', - 'RT05': 'Return value description should finish with "."', - 'YD01': 'No Yields section found', - 'SA01': 'See Also section not found', - 'SA02': 'Missing period at end of description for See Also ' - '"{reference_name}" reference', - 'SA03': 'Description should be capitalized for See Also ' - '"{reference_name}" reference', - 'SA04': 'Missing description for See Also "{reference_name}" reference', - 'SA05': '{reference_name} in `See Also` section does not need `pandas` ' - 'prefix, use {right_reference} instead.', - 'EX01': 'No examples section found', - 'EX02': 'Examples do not pass tests:\n{doctest_log}', - 'EX03': 'flake8 error: {error_code} {error_message}{times_happening}', - 'EX04': 'Do not import {imported_library}, as it is imported ' - 'automatically for the examples (numpy as np, pandas as pd)', + "GL01": "Docstring text (summary) should start in the line immediately " + "after the opening quotes (not in the same line, or leaving a " + "blank line in between)", + "GL02": "Closing quotes should be placed in the line after the last text " + "in the docstring (do not close the quotes in the same line as " + "the text, or leave a blank line between the last text and the " + "quotes)", + "GL03": "Double line break found; please use only one blank line to " + "separate sections or paragraphs, and do not leave blank lines " + "at the end of docstrings", + "GL04": "Private classes ({mentioned_private_classes}) should not be " + "mentioned in public docstrings", + "GL05": 'Tabs found at the start of line "{line_with_tabs}", please use ' + "whitespace only", + "GL06": 'Found unknown section "{section}". Allowed sections are: ' + "{allowed_sections}", + "GL07": "Sections are in the wrong order. Correct order is: " "{correct_sections}", + "GL08": "The object does not have a docstring", + "GL09": "Deprecation warning should precede extended summary", + "SS01": "No summary found (a short summary in a single line should be " + "present at the beginning of the docstring)", + "SS02": "Summary does not start with a capital letter", + "SS03": "Summary does not end with a period", + "SS04": "Summary contains heading whitespaces", + "SS05": "Summary must start with infinitive verb, not third person " + '(e.g. use "Generate" instead of "Generates")', + "SS06": "Summary should fit in a single line", + "ES01": "No extended summary found", + "PR01": "Parameters {missing_params} not documented", + "PR02": "Unknown parameters {unknown_params}", + "PR03": "Wrong parameters order. Actual: {actual_params}. 
" + "Documented: {documented_params}", + "PR04": 'Parameter "{param_name}" has no type', + "PR05": 'Parameter "{param_name}" type should not finish with "."', + "PR06": 'Parameter "{param_name}" type should use "{right_type}" instead ' + 'of "{wrong_type}"', + "PR07": 'Parameter "{param_name}" has no description', + "PR08": 'Parameter "{param_name}" description should start with a ' + "capital letter", + "PR09": 'Parameter "{param_name}" description should finish with "."', + "PR10": 'Parameter "{param_name}" requires a space before the colon ' + "separating the parameter name and type", + "RT01": "No Returns section found", + "RT02": "The first line of the Returns section should contain only the " + "type, unless multiple values are being returned", + "RT03": "Return value has no description", + "RT04": "Return value description should start with a capital letter", + "RT05": 'Return value description should finish with "."', + "YD01": "No Yields section found", + "SA01": "See Also section not found", + "SA02": "Missing period at end of description for See Also " + '"{reference_name}" reference', + "SA03": "Description should be capitalized for See Also " + '"{reference_name}" reference', + "SA04": 'Missing description for See Also "{reference_name}" reference', + "SA05": "{reference_name} in `See Also` section does not need `pandas` " + "prefix, use {right_reference} instead.", + "EX01": "No examples section found", + "EX02": "Examples do not pass tests:\n{doctest_log}", + "EX03": "flake8 error: {error_code} {error_message}{times_happening}", + "EX04": "Do not import {imported_library}, as it is imported " + "automatically for the examples (numpy as np, pandas as pd)", } @@ -182,43 +193,47 @@ def get_api_items(api_doc_fd): The name of the subsection in the API page where the object item is located. """ - current_module = 'pandas' - previous_line = current_section = current_subsection = '' + current_module = "pandas" + previous_line = current_section = current_subsection = "" position = None for line in api_doc_fd: line = line.strip() if len(line) == len(previous_line): - if set(line) == set('-'): + if set(line) == set("-"): current_section = previous_line continue - if set(line) == set('~'): + if set(line) == set("~"): current_subsection = previous_line continue - if line.startswith('.. currentmodule::'): - current_module = line.replace('.. currentmodule::', '').strip() + if line.startswith(".. currentmodule::"): + current_module = line.replace(".. currentmodule::", "").strip() continue - if line == '.. autosummary::': - position = 'autosummary' + if line == ".. 
autosummary::": + position = "autosummary" continue - if position == 'autosummary': - if line == '': - position = 'items' + if position == "autosummary": + if line == "": + position = "items" continue - if position == 'items': - if line == '': + if position == "items": + if line == "": position = None continue item = line.strip() func = importlib.import_module(current_module) - for part in item.split('.'): + for part in item.split("."): func = getattr(func, part) - yield ('.'.join([current_module, item]), func, - current_section, current_subsection) + yield ( + ".".join([current_module, item]), + func, + current_section, + current_subsection, + ) previous_line = line @@ -229,7 +244,7 @@ def __init__(self, name): obj = self._load_obj(name) self.obj = obj self.code_obj = self._to_original_callable(obj) - self.raw_doc = obj.__doc__ or '' + self.raw_doc = obj.__doc__ or "" self.clean_doc = pydoc.getdoc(obj) self.doc = NumpyDocString(self.clean_doc) @@ -256,9 +271,9 @@ def _load_obj(name): >>> Docstring._load_obj('pandas.Series') """ - for maxsplit in range(1, name.count('.') + 1): + for maxsplit in range(1, name.count(".") + 1): # TODO when py3 only replace by: module, *func_parts = ... - func_name_split = name.rsplit('.', maxsplit) + func_name_split = name.rsplit(".", maxsplit) module = func_name_split[0] func_parts = func_name_split[1:] try: @@ -268,9 +283,8 @@ def _load_obj(name): else: continue - if 'obj' not in locals(): - raise ImportError('No module can be imported ' - 'from "{}"'.format(name)) + if "obj" not in locals(): + raise ImportError("No module can be imported " 'from "{}"'.format(name)) for part in func_parts: obj = getattr(obj, part) @@ -288,7 +302,7 @@ def _to_original_callable(obj): while True: if inspect.isfunction(obj) or inspect.isclass(obj): f = inspect.getfile(obj) - if f.startswith('<') and f.endswith('>'): + if f.startswith("<") and f.endswith(">"): return None return obj if inspect.ismethod(obj): @@ -307,8 +321,7 @@ def type(self): @property def is_function_or_method(self): # TODO(py27): remove ismethod - return (inspect.isfunction(self.obj) - or inspect.ismethod(self.obj)) + return inspect.isfunction(self.obj) or inspect.ismethod(self.obj) @property def source_file_name(self): @@ -342,16 +355,15 @@ def source_file_def_line(self): @property def github_url(self): - url = 'https://github.com/pandas-dev/pandas/blob/master/' - url += '{}#L{}'.format(self.source_file_name, - self.source_file_def_line) + url = "https://github.com/pandas-dev/pandas/blob/master/" + url += "{}#L{}".format(self.source_file_name, self.source_file_def_line) return url @property def start_blank_lines(self): i = None if self.raw_doc: - for i, row in enumerate(self.raw_doc.split('\n')): + for i, row in enumerate(self.raw_doc.split("\n")): if row.strip(): break return i @@ -360,7 +372,7 @@ def start_blank_lines(self): def end_blank_lines(self): i = None if self.raw_doc: - for i, row in enumerate(reversed(self.raw_doc.split('\n'))): + for i, row in enumerate(reversed(self.raw_doc.split("\n"))): if row.strip(): break return i @@ -368,7 +380,7 @@ def end_blank_lines(self): @property def double_blank_lines(self): prev = True - for row in self.raw_doc.split('\n'): + for row in self.raw_doc.split("\n"): if not prev and not row.strip(): return True prev = row.strip() @@ -380,25 +392,27 @@ def section_titles(self): self.doc._doc.reset() while not self.doc._doc.eof(): content = self.doc._read_to_next_section() - if (len(content) > 1 - and len(content[0]) == len(content[1]) - and set(content[1]) == {'-'}): + 
if ( + len(content) > 1 + and len(content[0]) == len(content[1]) + and set(content[1]) == {"-"} + ): sections.append(content[0]) return sections @property def summary(self): - return ' '.join(self.doc['Summary']) + return " ".join(self.doc["Summary"]) @property def num_summary_lines(self): - return len(self.doc['Summary']) + return len(self.doc["Summary"]) @property def extended_summary(self): - if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1: - return ' '.join(self.doc['Summary']) - return ' '.join(self.doc['Extended Summary']) + if not self.doc["Extended Summary"] and len(self.doc["Summary"]) > 1: + return " ".join(self.doc["Summary"]) + return " ".join(self.doc["Extended Summary"]) @property def needs_summary(self): @@ -406,16 +420,17 @@ def needs_summary(self): @property def doc_parameters(self): - return collections.OrderedDict((name, (type_, ''.join(desc))) - for name, type_, desc - in self.doc['Parameters']) + return collections.OrderedDict( + (name, (type_, "".join(desc))) + for name, type_, desc in self.doc["Parameters"] + ) @property def signature_parameters(self): if inspect.isclass(self.obj): - if hasattr(self.obj, '_accessors') and ( - self.name.split('.')[-1] in - self.obj._accessors): + if hasattr(self.obj, "_accessors") and ( + self.name.split(".")[-1] in self.obj._accessors + ): # accessor classes have a signature but don't want to show this return tuple() try: @@ -430,7 +445,7 @@ def signature_parameters(self): if sig.varkw: params.append("**" + sig.varkw) params = tuple(params) - if params and params[0] in ('self', 'cls'): + if params and params[0] in ("self", "cls"): return params[1:] return params @@ -441,15 +456,21 @@ def parameter_mismatches(self): doc_params = tuple(self.doc_parameters) missing = set(signature_params) - set(doc_params) if missing: - errs.append(error('PR01', missing_params=pprint_thing(missing))) + errs.append(error("PR01", missing_params=pprint_thing(missing))) extra = set(doc_params) - set(signature_params) if extra: - errs.append(error('PR02', unknown_params=pprint_thing(extra))) - if (not missing and not extra and signature_params != doc_params - and not (not signature_params and not doc_params)): - errs.append(error('PR03', - actual_params=signature_params, - documented_params=doc_params)) + errs.append(error("PR02", unknown_params=pprint_thing(extra))) + if ( + not missing + and not extra + and signature_params != doc_params + and not (not signature_params and not doc_params) + ): + errs.append( + error( + "PR03", actual_params=signature_params, documented_params=doc_params + ) + ) return errs @@ -464,44 +485,44 @@ def parameter_desc(self, param): desc = self.doc_parameters[param][1] # Find and strip out any sphinx directives for directive in DIRECTIVES: - full_directive = '.. {}'.format(directive) + full_directive = ".. 
{}".format(directive) if full_directive in desc: # Only retain any description before the directive - desc = desc[:desc.index(full_directive)] + desc = desc[: desc.index(full_directive)] return desc @property def see_also(self): result = collections.OrderedDict() - for funcs, desc in self.doc['See Also']: + for funcs, desc in self.doc["See Also"]: for func, _ in funcs: - result[func] = ''.join(desc) + result[func] = "".join(desc) return result @property def examples(self): - return self.doc['Examples'] + return self.doc["Examples"] @property def returns(self): - return self.doc['Returns'] + return self.doc["Returns"] @property def yields(self): - return self.doc['Yields'] + return self.doc["Yields"] @property def method_source(self): try: source = inspect.getsource(self.obj) except TypeError: - return '' + return "" return textwrap.dedent(source) @property def method_returns_something(self): - ''' + """ Check if the docstrings method can return something. Bare returns, returns valued None and returns from nested functions are @@ -511,7 +532,7 @@ def method_returns_something(self): ------- bool Whether the docstrings method can return something. - ''' + """ def get_returns_not_on_nested_functions(node): returns = [node] if isinstance(node, ast.Return) else [] @@ -537,11 +558,11 @@ def get_returns_not_on_nested_functions(node): @property def first_line_ends_in_dot(self): if self.doc: - return self.doc.split('\n')[0][-1] == '.' + return self.doc.split("\n")[0][-1] == "." @property def deprecated(self): - return '.. deprecated:: ' in (self.summary + self.extended_summary) + return ".. deprecated:: " in (self.summary + self.extended_summary) @property def mentioned_private_classes(self): @@ -552,8 +573,8 @@ def examples_errors(self): flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL finder = doctest.DocTestFinder() runner = doctest.DocTestRunner(optionflags=flags) - context = {'np': numpy, 'pd': pandas} - error_msgs = '' + context = {"np": numpy, "pd": pandas} + error_msgs = "" for test in finder.find(self.raw_doc, self.name, globs=context): f = StringIO() runner.run(test, out=f.write) @@ -571,14 +592,18 @@ def validate_pep8(self): # F401 is needed to not generate flake8 errors in examples # that do not user numpy or pandas - content = ''.join(('import numpy as np # noqa: F401\n', - 'import pandas as pd # noqa: F401\n', - *self.examples_source_code)) + content = "".join( + ( + "import numpy as np # noqa: F401\n", + "import pandas as pd # noqa: F401\n", + *self.examples_source_code, + ) + ) application = flake8.main.application.Application() application.initialize(["--quiet"]) - with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as file: + with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as file: file.write(content) file.flush() application.run_checks([file.name]) @@ -588,7 +613,7 @@ def validate_pep8(self): application.formatter.write = lambda line, source: None application.report() - yield from application.guide.stats.statistics_for('') + yield from application.guide.stats.statistics_for("") def get_validation_data(doc): @@ -639,57 +664,55 @@ def get_validation_data(doc): errs = [] wrns = [] if not doc.raw_doc: - errs.append(error('GL08')) - return errs, wrns, '' + errs.append(error("GL08")) + return errs, wrns, "" if doc.start_blank_lines != 1: - errs.append(error('GL01')) + errs.append(error("GL01")) if doc.end_blank_lines != 1: - errs.append(error('GL02')) + errs.append(error("GL02")) if doc.double_blank_lines: - errs.append(error('GL03')) + 
errs.append(error("GL03")) mentioned_errs = doc.mentioned_private_classes if mentioned_errs: - errs.append(error('GL04', - mentioned_private_classes=', '.join(mentioned_errs))) + errs.append(error("GL04", mentioned_private_classes=", ".join(mentioned_errs))) for line in doc.raw_doc.splitlines(): if re.match("^ *\t", line): - errs.append(error('GL05', line_with_tabs=line.lstrip())) + errs.append(error("GL05", line_with_tabs=line.lstrip())) - unexpected_sections = [section for section in doc.section_titles - if section not in ALLOWED_SECTIONS] + unexpected_sections = [ + section for section in doc.section_titles if section not in ALLOWED_SECTIONS + ] for section in unexpected_sections: - errs.append(error('GL06', - section=section, - allowed_sections=', '.join(ALLOWED_SECTIONS))) + errs.append( + error("GL06", section=section, allowed_sections=", ".join(ALLOWED_SECTIONS)) + ) - correct_order = [section for section in ALLOWED_SECTIONS - if section in doc.section_titles] + correct_order = [ + section for section in ALLOWED_SECTIONS if section in doc.section_titles + ] if correct_order != doc.section_titles: - errs.append(error('GL07', - correct_sections=', '.join(correct_order))) + errs.append(error("GL07", correct_sections=", ".join(correct_order))) - if (doc.deprecated - and not doc.extended_summary.startswith('.. deprecated:: ')): - errs.append(error('GL09')) + if doc.deprecated and not doc.extended_summary.startswith(".. deprecated:: "): + errs.append(error("GL09")) if not doc.summary: - errs.append(error('SS01')) + errs.append(error("SS01")) else: if not doc.summary[0].isupper(): - errs.append(error('SS02')) - if doc.summary[-1] != '.': - errs.append(error('SS03')) + errs.append(error("SS02")) + if doc.summary[-1] != ".": + errs.append(error("SS03")) if doc.summary != doc.summary.lstrip(): - errs.append(error('SS04')) - elif (doc.is_function_or_method - and doc.summary.split(' ')[0][-1] == 's'): - errs.append(error('SS05')) + errs.append(error("SS04")) + elif doc.is_function_or_method and doc.summary.split(" ")[0][-1] == "s": + errs.append(error("SS05")) if doc.num_summary_lines > 1: - errs.append(error('SS06')) + errs.append(error("SS06")) if not doc.extended_summary: - wrns.append(('ES01', 'No extended summary found')) + wrns.append(("ES01", "No extended summary found")) # PR01: Parameters not documented # PR02: Unknown parameters @@ -699,84 +722,98 @@ def get_validation_data(doc): for param in doc.doc_parameters: if not param.startswith("*"): # Check can ignore var / kwargs if not doc.parameter_type(param): - if ':' in param: - errs.append(error('PR10', - param_name=param.split(':')[0])) + if ":" in param: + errs.append(error("PR10", param_name=param.split(":")[0])) else: - errs.append(error('PR04', param_name=param)) + errs.append(error("PR04", param_name=param)) else: - if doc.parameter_type(param)[-1] == '.': - errs.append(error('PR05', param_name=param)) - common_type_errors = [('integer', 'int'), - ('boolean', 'bool'), - ('string', 'str')] + if doc.parameter_type(param)[-1] == ".": + errs.append(error("PR05", param_name=param)) + common_type_errors = [ + ("integer", "int"), + ("boolean", "bool"), + ("string", "str"), + ] for wrong_type, right_type in common_type_errors: if wrong_type in doc.parameter_type(param): - errs.append(error('PR06', - param_name=param, - right_type=right_type, - wrong_type=wrong_type)) + errs.append( + error( + "PR06", + param_name=param, + right_type=right_type, + wrong_type=wrong_type, + ) + ) if not doc.parameter_desc(param): - errs.append(error('PR07', 
param_name=param)) + errs.append(error("PR07", param_name=param)) else: if not doc.parameter_desc(param)[0].isupper(): - errs.append(error('PR08', param_name=param)) - if doc.parameter_desc(param)[-1] != '.': - errs.append(error('PR09', param_name=param)) + errs.append(error("PR08", param_name=param)) + if doc.parameter_desc(param)[-1] != ".": + errs.append(error("PR09", param_name=param)) if doc.is_function_or_method: if not doc.returns: if doc.method_returns_something: - errs.append(error('RT01')) + errs.append(error("RT01")) else: if len(doc.returns) == 1 and doc.returns[0].name: - errs.append(error('RT02')) + errs.append(error("RT02")) for name_or_type, type_, desc in doc.returns: if not desc: - errs.append(error('RT03')) + errs.append(error("RT03")) else: - desc = ' '.join(desc) + desc = " ".join(desc) if not desc[0].isupper(): - errs.append(error('RT04')) - if not desc.endswith('.'): - errs.append(error('RT05')) + errs.append(error("RT04")) + if not desc.endswith("."): + errs.append(error("RT05")) - if not doc.yields and 'yield' in doc.method_source: - errs.append(error('YD01')) + if not doc.yields and "yield" in doc.method_source: + errs.append(error("YD01")) if not doc.see_also: - wrns.append(error('SA01')) + wrns.append(error("SA01")) else: for rel_name, rel_desc in doc.see_also.items(): if rel_desc: - if not rel_desc.endswith('.'): - errs.append(error('SA02', reference_name=rel_name)) + if not rel_desc.endswith("."): + errs.append(error("SA02", reference_name=rel_name)) if not rel_desc[0].isupper(): - errs.append(error('SA03', reference_name=rel_name)) + errs.append(error("SA03", reference_name=rel_name)) else: - errs.append(error('SA04', reference_name=rel_name)) - if rel_name.startswith('pandas.'): - errs.append(error('SA05', - reference_name=rel_name, - right_reference=rel_name[len('pandas.'):])) - - examples_errs = '' + errs.append(error("SA04", reference_name=rel_name)) + if rel_name.startswith("pandas."): + errs.append( + error( + "SA05", + reference_name=rel_name, + right_reference=rel_name[len("pandas.") :], + ) + ) + + examples_errs = "" if not doc.examples: - wrns.append(error('EX01')) + wrns.append(error("EX01")) else: examples_errs = doc.examples_errors if examples_errs: - errs.append(error('EX02', doctest_log=examples_errs)) + errs.append(error("EX02", doctest_log=examples_errs)) for err in doc.validate_pep8(): - errs.append(error('EX03', - error_code=err.error_code, - error_message=err.message, - times_happening=' ({} times)'.format(err.count) - if err.count > 1 else '')) - examples_source_code = ''.join(doc.examples_source_code) - for wrong_import in ('numpy', 'pandas'): - if 'import {}'.format(wrong_import) in examples_source_code: - errs.append(error('EX04', imported_library=wrong_import)) + errs.append( + error( + "EX03", + error_code=err.error_code, + error_message=err.message, + times_happening=" ({} times)".format(err.count) + if err.count > 1 + else "", + ) + ) + examples_source_code = "".join(doc.examples_source_code) + for wrong_import in ("numpy", "pandas"): + if "import {}".format(wrong_import) in examples_source_code: + errs.append(error("EX04", imported_library=wrong_import)) return errs, wrns, examples_errs @@ -797,15 +834,17 @@ def validate_one(func_name): """ doc = Docstring(func_name) errs, wrns, examples_errs = get_validation_data(doc) - return {'type': doc.type, - 'docstring': doc.clean_doc, - 'deprecated': doc.deprecated, - 'file': doc.source_file_name, - 'file_line': doc.source_file_def_line, - 'github_link': doc.github_url, - 'errors': errs, - 
'warnings': wrns, - 'examples_errors': examples_errs} + return { + "type": doc.type, + "docstring": doc.clean_doc, + "deprecated": doc.deprecated, + "file": doc.source_file_name, + "file_line": doc.source_file_def_line, + "github_link": doc.github_url, + "errors": errs, + "warnings": wrns, + "examples_errors": examples_errs, + } def validate_all(prefix, ignore_deprecated=False): @@ -831,8 +870,7 @@ def validate_all(prefix, ignore_deprecated=False): seen = {} # functions from the API docs - api_doc_fnames = os.path.join( - BASE_PATH, 'doc', 'source', 'reference', '*.rst') + api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") api_items = [] for api_doc_fname in glob.glob(api_doc_fnames): with open(api_doc_fname) as f: @@ -841,16 +879,20 @@ def validate_all(prefix, ignore_deprecated=False): if prefix and not func_name.startswith(prefix): continue doc_info = validate_one(func_name) - if ignore_deprecated and doc_info['deprecated']: + if ignore_deprecated and doc_info["deprecated"]: continue result[func_name] = doc_info - shared_code_key = doc_info['file'], doc_info['file_line'] - shared_code = seen.get(shared_code_key, '') - result[func_name].update({'in_api': True, - 'section': section, - 'subsection': subsection, - 'shared_code_with': shared_code}) + shared_code_key = doc_info["file"], doc_info["file_line"] + shared_code = seen.get(shared_code_key, "") + result[func_name].update( + { + "in_api": True, + "section": section, + "subsection": subsection, + "shared_code_with": shared_code, + } + ) seen[shared_code_key] = func_name @@ -858,54 +900,55 @@ def validate_all(prefix, ignore_deprecated=False): api_item_names = set(list(zip(*api_items))[0]) for class_ in (pandas.Series, pandas.DataFrame): for member in inspect.getmembers(class_): - func_name = 'pandas.{}.{}'.format(class_.__name__, member[0]) - if (not member[0].startswith('_') - and func_name not in api_item_names): + func_name = "pandas.{}.{}".format(class_.__name__, member[0]) + if not member[0].startswith("_") and func_name not in api_item_names: if prefix and not func_name.startswith(prefix): continue doc_info = validate_one(func_name) - if ignore_deprecated and doc_info['deprecated']: + if ignore_deprecated and doc_info["deprecated"]: continue result[func_name] = doc_info - result[func_name]['in_api'] = False + result[func_name]["in_api"] = False return result def main(func_name, prefix, errors, output_format, ignore_deprecated): - def header(title, width=80, char='#'): + def header(title, width=80, char="#"): full_line = char * width side_len = (width - len(title) - 2) // 2 - adj = '' if len(title) % 2 == 0 else ' ' - title_line = '{side} {title}{adj} {side}'.format(side=char * side_len, - title=title, - adj=adj) + adj = "" if len(title) % 2 == 0 else " " + title_line = "{side} {title}{adj} {side}".format( + side=char * side_len, title=title, adj=adj + ) - return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format( - full_line=full_line, title_line=title_line) + return "\n{full_line}\n{title_line}\n{full_line}\n\n".format( + full_line=full_line, title_line=title_line + ) exit_status = 0 if func_name is None: result = validate_all(prefix, ignore_deprecated) - if output_format == 'json': + if output_format == "json": output = json.dumps(result) else: - if output_format == 'default': - output_format = '{text}\n' - elif output_format == 'azure': - output_format = ('##vso[task.logissue type=error;' - 'sourcepath={path};' - 'linenumber={row};' - 'code={code};' - ']{text}\n') + if output_format == "default": 
+ output_format = "{text}\n" + elif output_format == "azure": + output_format = ( + "##vso[task.logissue type=error;" + "sourcepath={path};" + "linenumber={row};" + "code={code};" + "]{text}\n" + ) else: - raise ValueError('Unknown output_format "{}"'.format( - output_format)) + raise ValueError('Unknown output_format "{}"'.format(output_format)) - output = '' + output = "" for name, res in result.items(): - for err_code, err_desc in res['errors']: + for err_code, err_desc in res["errors"]: # The script would be faster if instead of filtering the # errors after validating them, it didn't validate them # initially. But that would complicate the code too much @@ -914,76 +957,93 @@ def header(title, width=80, char='#'): exit_status += 1 output += output_format.format( name=name, - path=res['file'], - row=res['file_line'], + path=res["file"], + row=res["file_line"], code=err_code, - text='{}: {}'.format(name, err_desc)) + text="{}: {}".format(name, err_desc), + ) sys.stdout.write(output) else: result = validate_one(func_name) - sys.stderr.write(header('Docstring ({})'.format(func_name))) - sys.stderr.write('{}\n'.format(result['docstring'])) - sys.stderr.write(header('Validation')) - if result['errors']: - sys.stderr.write('{} Errors found:\n'.format( - len(result['errors']))) - for err_code, err_desc in result['errors']: + sys.stderr.write(header("Docstring ({})".format(func_name))) + sys.stderr.write("{}\n".format(result["docstring"])) + sys.stderr.write(header("Validation")) + if result["errors"]: + sys.stderr.write("{} Errors found:\n".format(len(result["errors"]))) + for err_code, err_desc in result["errors"]: # Failing examples are printed at the end - if err_code == 'EX02': - sys.stderr.write('\tExamples do not pass tests\n') + if err_code == "EX02": + sys.stderr.write("\tExamples do not pass tests\n") continue - sys.stderr.write('\t{}\n'.format(err_desc)) - if result['warnings']: - sys.stderr.write('{} Warnings found:\n'.format( - len(result['warnings']))) - for wrn_code, wrn_desc in result['warnings']: - sys.stderr.write('\t{}\n'.format(wrn_desc)) + sys.stderr.write("\t{}\n".format(err_desc)) + if result["warnings"]: + sys.stderr.write("{} Warnings found:\n".format(len(result["warnings"]))) + for wrn_code, wrn_desc in result["warnings"]: + sys.stderr.write("\t{}\n".format(wrn_desc)) - if not result['errors']: - sys.stderr.write('Docstring for "{}" correct. :)\n'.format( - func_name)) + if not result["errors"]: + sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) - if result['examples_errors']: - sys.stderr.write(header('Doctests')) - sys.stderr.write(result['examples_errors']) + if result["examples_errors"]: + sys.stderr.write(header("Doctests")) + sys.stderr.write(result["examples_errors"]) return exit_status -if __name__ == '__main__': - format_opts = 'default', 'json', 'azure' - func_help = ('function or method to validate (e.g. pandas.DataFrame.head) ' - 'if not provided, all docstrings are validated and returned ' - 'as JSON') - argparser = argparse.ArgumentParser( - description='validate pandas docstrings') - argparser.add_argument('function', - nargs='?', - default=None, - help=func_help) - argparser.add_argument('--format', default='default', choices=format_opts, - help='format of the output when validating ' - 'multiple docstrings (ignored when validating one).' - 'It can be {}'.format(str(format_opts)[1:-1])) - argparser.add_argument('--prefix', default=None, help='pattern for the ' - 'docstring names, in order to decide which ones ' - 'will be validated. 
A prefix "pandas.Series.str.' - 'will make the script validate all the docstrings' - 'of methods starting by this pattern. It is ' - 'ignored if parameter function is provided') - argparser.add_argument('--errors', default=None, help='comma separated ' - 'list of error codes to validate. By default it ' - 'validates all errors (ignored when validating ' - 'a single docstring)') - argparser.add_argument('--ignore_deprecated', default=False, - action='store_true', help='if this flag is set, ' - 'deprecated objects are ignored when validating ' - 'all docstrings') +if __name__ == "__main__": + format_opts = "default", "json", "azure" + func_help = ( + "function or method to validate (e.g. pandas.DataFrame.head) " + "if not provided, all docstrings are validated and returned " + "as JSON" + ) + argparser = argparse.ArgumentParser(description="validate pandas docstrings") + argparser.add_argument("function", nargs="?", default=None, help=func_help) + argparser.add_argument( + "--format", + default="default", + choices=format_opts, + help="format of the output when validating " + "multiple docstrings (ignored when validating one)." + "It can be {}".format(str(format_opts)[1:-1]), + ) + argparser.add_argument( + "--prefix", + default=None, + help="pattern for the " + "docstring names, in order to decide which ones " + 'will be validated. A prefix "pandas.Series.str.' + "will make the script validate all the docstrings" + "of methods starting by this pattern. It is " + "ignored if parameter function is provided", + ) + argparser.add_argument( + "--errors", + default=None, + help="comma separated " + "list of error codes to validate. By default it " + "validates all errors (ignored when validating " + "a single docstring)", + ) + argparser.add_argument( + "--ignore_deprecated", + default=False, + action="store_true", + help="if this flag is set, " + "deprecated objects are ignored when validating " + "all docstrings", + ) args = argparser.parse_args() - sys.exit(main(args.function, args.prefix, - args.errors.split(',') if args.errors else None, - args.format, - args.ignore_deprecated)) + sys.exit( + main( + args.function, + args.prefix, + args.errors.split(",") if args.errors else None, + args.format, + args.ignore_deprecated, + ) + ) diff --git a/setup.py b/setup.py index 19c22fc25733d..53e12da53cdeb 100755 --- a/setup.py +++ b/setup.py @@ -19,34 +19,37 @@ # versioning import versioneer + cmdclass = versioneer.get_cmdclass() def is_platform_windows(): - return sys.platform == 'win32' or sys.platform == 'cygwin' + return sys.platform == "win32" or sys.platform == "cygwin" def is_platform_mac(): - return sys.platform == 'darwin' + return sys.platform == "darwin" -min_numpy_ver = '1.13.3' +min_numpy_ver = "1.13.3" setuptools_kwargs = { - 'install_requires': [ - 'python-dateutil >= 2.6.1', - 'pytz >= 2017.2', - 'numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver), + "install_requires": [ + "python-dateutil >= 2.6.1", + "pytz >= 2017.2", + "numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver), ], - 'setup_requires': ['numpy >= {numpy_ver}'.format(numpy_ver=min_numpy_ver)], - 'zip_safe': False, + "setup_requires": ["numpy >= {numpy_ver}".format(numpy_ver=min_numpy_ver)], + "zip_safe": False, } -min_cython_ver = '0.28.2' +min_cython_ver = "0.28.2" try: import Cython + ver = Cython.__version__ from Cython.Build import cythonize + _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver) except ImportError: _CYTHON_INSTALLED = False @@ -60,11 +63,13 @@ def is_platform_mac(): try: if not _CYTHON_INSTALLED: 
- raise ImportError('No supported version of Cython installed.') + raise ImportError("No supported version of Cython installed.") from Cython.Distutils.old_build_ext import old_build_ext as _build_ext + cython = True except ImportError: from distutils.command.build_ext import build_ext as _build_ext + cython = False else: try: @@ -73,25 +78,29 @@ def is_platform_mac(): except ImportError: import tempita except ImportError: - raise ImportError('Building pandas requires Tempita: ' - 'pip install Tempita') + raise ImportError("Building pandas requires Tempita: " "pip install Tempita") _pxi_dep_template = { - 'algos': ['_libs/algos_common_helper.pxi.in', - '_libs/algos_take_helper.pxi.in', - '_libs/algos_rank_helper.pxi.in'], - 'groupby': ['_libs/groupby_helper.pxi.in'], - 'hashtable': ['_libs/hashtable_class_helper.pxi.in', - '_libs/hashtable_func_helper.pxi.in'], - 'index': ['_libs/index_class_helper.pxi.in'], - 'sparse': ['_libs/sparse_op_helper.pxi.in'], - 'interval': ['_libs/intervaltree.pxi.in']} + "algos": [ + "_libs/algos_common_helper.pxi.in", + "_libs/algos_take_helper.pxi.in", + "_libs/algos_rank_helper.pxi.in", + ], + "groupby": ["_libs/groupby_helper.pxi.in"], + "hashtable": [ + "_libs/hashtable_class_helper.pxi.in", + "_libs/hashtable_func_helper.pxi.in", + ], + "index": ["_libs/index_class_helper.pxi.in"], + "sparse": ["_libs/sparse_op_helper.pxi.in"], + "interval": ["_libs/intervaltree.pxi.in"], +} _pxifiles = [] _pxi_dep = {} for module, files in _pxi_dep_template.items(): - pxi_files = [pjoin('pandas', x) for x in files] + pxi_files = [pjoin("pandas", x) for x in files] _pxifiles.extend(pxi_files) _pxi_dep[module] = pxi_files @@ -101,11 +110,13 @@ class build_ext(_build_ext): def render_templates(cls, pxifiles): for pxifile in pxifiles: # build pxifiles first, template extension must be .pxi.in - assert pxifile.endswith('.pxi.in') + assert pxifile.endswith(".pxi.in") outfile = pxifile[:-3] - if (os.path.exists(outfile) and - os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime): + if ( + os.path.exists(outfile) + and os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime + ): # if .pxi.in is not updated, no need to output .pxi continue @@ -122,17 +133,17 @@ def build_extensions(self): if cython: self.render_templates(_pxifiles) - numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") for ext in self.extensions: - if (hasattr(ext, 'include_dirs') and - numpy_incl not in ext.include_dirs): + if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) _build_ext.build_extensions(self) -DESCRIPTION = ("Powerful data structures for data analysis, time series, " - "and statistics") +DESCRIPTION = ( + "Powerful data structures for data analysis, time series, " "and statistics" +) LONG_DESCRIPTION = """ **pandas** is a Python package providing fast, flexible, and expressive data structures designed to make working with structured (tabular, multidimensional, @@ -197,29 +208,30 @@ def build_extensions(self): the ideal tool for all of these tasks. 
""" -DISTNAME = 'pandas' -LICENSE = 'BSD' +DISTNAME = "pandas" +LICENSE = "BSD" AUTHOR = "The PyData Development Team" EMAIL = "pydata@googlegroups.com" URL = "http://pandas.pydata.org" -DOWNLOAD_URL = '' +DOWNLOAD_URL = "" PROJECT_URLS = { - 'Bug Tracker': 'https://github.com/pandas-dev/pandas/issues', - 'Documentation': 'http://pandas.pydata.org/pandas-docs/stable/', - 'Source Code': 'https://github.com/pandas-dev/pandas' + "Bug Tracker": "https://github.com/pandas-dev/pandas/issues", + "Documentation": "http://pandas.pydata.org/pandas-docs/stable/", + "Source Code": "https://github.com/pandas-dev/pandas", } CLASSIFIERS = [ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Operating System :: OS Independent', - 'Intended Audience :: Science/Research', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Cython', - 'Topic :: Scientific/Engineering'] + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Cython", + "Topic :: Scientific/Engineering", +] class CleanCommand(Command): @@ -232,37 +244,44 @@ def initialize_options(self): self._clean_me = [] self._clean_trees = [] - base = pjoin('pandas', '_libs', 'src') - tsbase = pjoin('pandas', '_libs', 'tslibs', 'src') - dt = pjoin(tsbase, 'datetime') - util = pjoin('pandas', 'util') - parser = pjoin(base, 'parser') - ujson_python = pjoin(base, 'ujson', 'python') - ujson_lib = pjoin(base, 'ujson', 'lib') - self._clean_exclude = [pjoin(dt, 'np_datetime.c'), - pjoin(dt, 'np_datetime_strings.c'), - pjoin(parser, 'tokenizer.c'), - pjoin(parser, 'io.c'), - pjoin(ujson_python, 'ujson.c'), - pjoin(ujson_python, 'objToJSON.c'), - pjoin(ujson_python, 'JSONtoObj.c'), - pjoin(ujson_lib, 'ultrajsonenc.c'), - pjoin(ujson_lib, 'ultrajsondec.c'), - pjoin(util, 'move.c'), - ] - - for root, dirs, files in os.walk('pandas'): + base = pjoin("pandas", "_libs", "src") + tsbase = pjoin("pandas", "_libs", "tslibs", "src") + dt = pjoin(tsbase, "datetime") + util = pjoin("pandas", "util") + parser = pjoin(base, "parser") + ujson_python = pjoin(base, "ujson", "python") + ujson_lib = pjoin(base, "ujson", "lib") + self._clean_exclude = [ + pjoin(dt, "np_datetime.c"), + pjoin(dt, "np_datetime_strings.c"), + pjoin(parser, "tokenizer.c"), + pjoin(parser, "io.c"), + pjoin(ujson_python, "ujson.c"), + pjoin(ujson_python, "objToJSON.c"), + pjoin(ujson_python, "JSONtoObj.c"), + pjoin(ujson_lib, "ultrajsonenc.c"), + pjoin(ujson_lib, "ultrajsondec.c"), + pjoin(util, "move.c"), + ] + + for root, dirs, files in os.walk("pandas"): for f in files: filepath = pjoin(root, f) if filepath in self._clean_exclude: continue - if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o', - '.pyo', - '.pyd', '.c', '.orig'): + if os.path.splitext(f)[-1] in ( + ".pyc", + ".so", + ".o", + ".pyo", + ".pyd", + ".c", + ".orig", + ): self._clean_me.append(filepath) for d in dirs: - if d == '__pycache__': + if d == "__pycache__": self._clean_trees.append(pjoin(root, d)) # clean the generated pxi files @@ -270,7 +289,7 @@ def initialize_options(self): pxifile = 
pxifile.replace(".pxi.in", ".pxi") self._clean_me.append(pxifile) - for d in ('build', 'dist'): + for d in ("build", "dist"): if os.path.exists(d): self._clean_trees.append(d) @@ -292,68 +311,75 @@ def run(self): # we need to inherit from the versioneer # class as it encodes the version info -sdist_class = cmdclass['sdist'] +sdist_class = cmdclass["sdist"] class CheckSDist(sdist_class): """Custom sdist that ensures Cython has compiled all pyx files to c.""" - _pyxfiles = ['pandas/_libs/lib.pyx', - 'pandas/_libs/hashtable.pyx', - 'pandas/_libs/tslib.pyx', - 'pandas/_libs/index.pyx', - 'pandas/_libs/internals.pyx', - 'pandas/_libs/algos.pyx', - 'pandas/_libs/join.pyx', - 'pandas/_libs/indexing.pyx', - 'pandas/_libs/interval.pyx', - 'pandas/_libs/hashing.pyx', - 'pandas/_libs/missing.pyx', - 'pandas/_libs/reduction.pyx', - 'pandas/_libs/testing.pyx', - 'pandas/_libs/skiplist.pyx', - 'pandas/_libs/sparse.pyx', - 'pandas/_libs/ops.pyx', - 'pandas/_libs/parsers.pyx', - 'pandas/_libs/tslibs/c_timestamp.pyx', - 'pandas/_libs/tslibs/ccalendar.pyx', - 'pandas/_libs/tslibs/period.pyx', - 'pandas/_libs/tslibs/strptime.pyx', - 'pandas/_libs/tslibs/np_datetime.pyx', - 'pandas/_libs/tslibs/timedeltas.pyx', - 'pandas/_libs/tslibs/timestamps.pyx', - 'pandas/_libs/tslibs/timezones.pyx', - 'pandas/_libs/tslibs/conversion.pyx', - 'pandas/_libs/tslibs/fields.pyx', - 'pandas/_libs/tslibs/offsets.pyx', - 'pandas/_libs/tslibs/frequencies.pyx', - 'pandas/_libs/tslibs/resolution.pyx', - 'pandas/_libs/tslibs/parsing.pyx', - 'pandas/_libs/tslibs/tzconversion.pyx', - 'pandas/_libs/writers.pyx', - 'pandas/io/sas/sas.pyx'] - - _cpp_pyxfiles = ['pandas/_libs/window.pyx', - 'pandas/io/msgpack/_packer.pyx', - 'pandas/io/msgpack/_unpacker.pyx'] + _pyxfiles = [ + "pandas/_libs/lib.pyx", + "pandas/_libs/hashtable.pyx", + "pandas/_libs/tslib.pyx", + "pandas/_libs/index.pyx", + "pandas/_libs/internals.pyx", + "pandas/_libs/algos.pyx", + "pandas/_libs/join.pyx", + "pandas/_libs/indexing.pyx", + "pandas/_libs/interval.pyx", + "pandas/_libs/hashing.pyx", + "pandas/_libs/missing.pyx", + "pandas/_libs/reduction.pyx", + "pandas/_libs/testing.pyx", + "pandas/_libs/skiplist.pyx", + "pandas/_libs/sparse.pyx", + "pandas/_libs/ops.pyx", + "pandas/_libs/parsers.pyx", + "pandas/_libs/tslibs/c_timestamp.pyx", + "pandas/_libs/tslibs/ccalendar.pyx", + "pandas/_libs/tslibs/period.pyx", + "pandas/_libs/tslibs/strptime.pyx", + "pandas/_libs/tslibs/np_datetime.pyx", + "pandas/_libs/tslibs/timedeltas.pyx", + "pandas/_libs/tslibs/timestamps.pyx", + "pandas/_libs/tslibs/timezones.pyx", + "pandas/_libs/tslibs/conversion.pyx", + "pandas/_libs/tslibs/fields.pyx", + "pandas/_libs/tslibs/offsets.pyx", + "pandas/_libs/tslibs/frequencies.pyx", + "pandas/_libs/tslibs/resolution.pyx", + "pandas/_libs/tslibs/parsing.pyx", + "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/writers.pyx", + "pandas/io/sas/sas.pyx", + ] + + _cpp_pyxfiles = [ + "pandas/_libs/window.pyx", + "pandas/io/msgpack/_packer.pyx", + "pandas/io/msgpack/_unpacker.pyx", + ] def initialize_options(self): sdist_class.initialize_options(self) def run(self): - if 'cython' in cmdclass: - self.run_command('cython') + if "cython" in cmdclass: + self.run_command("cython") else: # If we are not running cython then # compile the extensions correctly - pyx_files = [(self._pyxfiles, 'c'), (self._cpp_pyxfiles, 'cpp')] + pyx_files = [(self._pyxfiles, "c"), (self._cpp_pyxfiles, "cpp")] for pyxfiles, extension in pyx_files: for pyxfile in pyxfiles: sourcefile = pyxfile[:-3] + extension - msg = 
("{extension}-source file '{source}' not found.\n" - "Run 'setup.py cython' before sdist.".format( - source=sourcefile, extension=extension)) + msg = ( + "{extension}-source file '{source}' not found.\n" + "Run 'setup.py cython' before sdist.".format( + source=sourcefile, extension=extension + ) + ) assert os.path.isfile(sourcefile), msg sdist_class.run(self) @@ -368,10 +394,14 @@ def check_cython_extensions(self, extensions): for src in ext.sources: if not os.path.exists(src): print("{}: -> [{}]".format(ext.name, ext.sources)) - raise Exception("""Cython-generated file '{src}' not found. + raise Exception( + """Cython-generated file '{src}' not found. Cython is required to compile pandas from a development branch. Please install Cython or download a release package of pandas. - """.format(src=src)) + """.format( + src=src + ) + ) def build_extensions(self): self.check_cython_extensions(self.extensions) @@ -384,6 +414,7 @@ class CythonCommand(build_ext): to compile pyx->c, and stop there. All this does is override the C-compile method build_extension() with a no-op. """ + def build_extension(self, ext): pass @@ -391,6 +422,7 @@ def build_extension(self, ext): class DummyBuildSrc(Command): """ numpy's build_src command interferes with Cython's build_ext. """ + user_options = [] def initialize_options(self): @@ -403,81 +435,82 @@ def run(self): pass -cmdclass.update({'clean': CleanCommand, - 'build': build}) +cmdclass.update({"clean": CleanCommand, "build": build}) if cython: - suffix = '.pyx' - cmdclass['build_ext'] = CheckingBuildExt - cmdclass['cython'] = CythonCommand + suffix = ".pyx" + cmdclass["build_ext"] = CheckingBuildExt + cmdclass["cython"] = CythonCommand else: - suffix = '.c' - cmdclass['build_src'] = DummyBuildSrc - cmdclass['build_ext'] = CheckingBuildExt + suffix = ".c" + cmdclass["build_src"] = DummyBuildSrc + cmdclass["build_ext"] = CheckingBuildExt # ---------------------------------------------------------------------- # Preparation of compiler arguments -debugging_symbols_requested = '--with-debugging-symbols' in sys.argv +debugging_symbols_requested = "--with-debugging-symbols" in sys.argv if debugging_symbols_requested: - sys.argv.remove('--with-debugging-symbols') + sys.argv.remove("--with-debugging-symbols") -if sys.byteorder == 'big': - endian_macro = [('__BIG_ENDIAN__', '1')] +if sys.byteorder == "big": + endian_macro = [("__BIG_ENDIAN__", "1")] else: - endian_macro = [('__LITTLE_ENDIAN__', '1')] + endian_macro = [("__LITTLE_ENDIAN__", "1")] if is_platform_windows(): extra_compile_args = [] extra_link_args = [] if debugging_symbols_requested: - extra_compile_args.append('/Z7') - extra_link_args.append('/DEBUG') + extra_compile_args.append("/Z7") + extra_link_args.append("/DEBUG") else: # args to ignore warnings - extra_compile_args = ['-Wno-unused-function'] + extra_compile_args = ["-Wno-unused-function"] extra_link_args = [] if debugging_symbols_requested: - extra_compile_args.append('-g') + extra_compile_args.append("-g") # Build for at least macOS 10.9 when compiling on a 10.9 system or above, # overriding CPython distuitls behaviour which is to target the version that # python was built for. 
This may be overridden by setting # MACOSX_DEPLOYMENT_TARGET before calling setup.py if is_platform_mac(): - if 'MACOSX_DEPLOYMENT_TARGET' not in os.environ: + if "MACOSX_DEPLOYMENT_TARGET" not in os.environ: current_system = platform.mac_ver()[0] - python_target = get_config_vars().get('MACOSX_DEPLOYMENT_TARGET', - current_system) - if (LooseVersion(python_target) < '10.9' and - LooseVersion(current_system) >= '10.9'): - os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' + python_target = get_config_vars().get( + "MACOSX_DEPLOYMENT_TARGET", current_system + ) + if ( + LooseVersion(python_target) < "10.9" + and LooseVersion(current_system) >= "10.9" + ): + os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9" # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled -linetrace = os.environ.get('PANDAS_CYTHON_COVERAGE', False) -if '--with-cython-coverage' in sys.argv: +linetrace = os.environ.get("PANDAS_CYTHON_COVERAGE", False) +if "--with-cython-coverage" in sys.argv: linetrace = True - sys.argv.remove('--with-cython-coverage') + sys.argv.remove("--with-cython-coverage") # Note: if not using `cythonize`, coverage can be enabled by # pinning `ext.cython_directives = directives` to each ext in extensions. # github.com/cython/cython/wiki/enhancements-compilerdirectives#in-setuppy -directives = {'linetrace': False, - 'language_level': 3} +directives = {"linetrace": False, "language_level": 3} macros = [] if linetrace: # https://pypkg.com/pypi/pytest-cython/f/tests/example-project/setup.py - directives['linetrace'] = True - macros = [('CYTHON_TRACE', '1'), ('CYTHON_TRACE_NOGIL', '1')] + directives["linetrace"] = True + macros = [("CYTHON_TRACE", "1"), ("CYTHON_TRACE_NOGIL", "1")] # in numpy>=1.16.0, silence build warnings about deprecated API usage # we can't do anything about these warnings because they stem from # cython+numpy version mismatches. -macros.append(('NPY_NO_DEPRECATED_API', '0')) +macros.append(("NPY_NO_DEPRECATED_API", "0")) # ---------------------------------------------------------------------- @@ -489,7 +522,7 @@ def maybe_cythonize(extensions, *args, **kwargs): """ Render tempita templates before calling cythonize """ - if len(sys.argv) > 1 and 'clean' in sys.argv: + if len(sys.argv) > 1 and "clean" in sys.argv: # Avoid running cythonize on `python setup.py clean` # See https://github.com/cython/cython/issues/1495 return extensions @@ -499,253 +532,255 @@ def maybe_cythonize(extensions, *args, **kwargs): # TODO: See if this can be removed after pyproject.toml added. return extensions - numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") # TODO: Is this really necessary here? 
for ext in extensions: - if (hasattr(ext, 'include_dirs') and - numpy_incl not in ext.include_dirs): + if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) build_ext.render_templates(_pxifiles) return cythonize(extensions, *args, **kwargs) -def srcpath(name=None, suffix='.pyx', subdir='src'): - return pjoin('pandas', subdir, name + suffix) +def srcpath(name=None, suffix=".pyx", subdir="src"): + return pjoin("pandas", subdir, name + suffix) -common_include = ['pandas/_libs/src/klib', 'pandas/_libs/src'] -ts_include = ['pandas/_libs/tslibs/src', 'pandas/_libs/tslibs'] +common_include = ["pandas/_libs/src/klib", "pandas/_libs/src"] +ts_include = ["pandas/_libs/tslibs/src", "pandas/_libs/tslibs"] -lib_depends = ['pandas/_libs/src/parse_helper.h', - 'pandas/_libs/src/compat_helper.h'] +lib_depends = ["pandas/_libs/src/parse_helper.h", "pandas/_libs/src/compat_helper.h"] np_datetime_headers = [ - 'pandas/_libs/tslibs/src/datetime/np_datetime.h', - 'pandas/_libs/tslibs/src/datetime/np_datetime_strings.h'] + "pandas/_libs/tslibs/src/datetime/np_datetime.h", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.h", +] np_datetime_sources = [ - 'pandas/_libs/tslibs/src/datetime/np_datetime.c', - 'pandas/_libs/tslibs/src/datetime/np_datetime_strings.c'] + "pandas/_libs/tslibs/src/datetime/np_datetime.c", + "pandas/_libs/tslibs/src/datetime/np_datetime_strings.c", +] tseries_depends = np_datetime_headers ext_data = { - '_libs.algos': { - 'pyxfile': '_libs/algos', - 'depends': _pxi_dep['algos']}, - '_libs.groupby': { - 'pyxfile': '_libs/groupby', - 'depends': _pxi_dep['groupby']}, - '_libs.hashing': { - 'pyxfile': '_libs/hashing', - 'include': [], - 'depends': []}, - '_libs.hashtable': { - 'pyxfile': '_libs/hashtable', - 'depends': (['pandas/_libs/src/klib/khash_python.h'] + - _pxi_dep['hashtable'])}, - '_libs.index': { - 'pyxfile': '_libs/index', - 'include': common_include + ts_include, - 'depends': _pxi_dep['index'], - 'sources': np_datetime_sources}, - '_libs.indexing': { - 'pyxfile': '_libs/indexing'}, - '_libs.internals': { - 'pyxfile': '_libs/internals'}, - '_libs.interval': { - 'pyxfile': '_libs/interval', - 'depends': _pxi_dep['interval']}, - '_libs.join': { - 'pyxfile': '_libs/join'}, - '_libs.lib': { - 'pyxfile': '_libs/lib', - 'include': common_include + ts_include, - 'depends': lib_depends + tseries_depends, - 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, - '_libs.missing': { - 'pyxfile': '_libs/missing', - 'include': common_include + ts_include, - 'depends': tseries_depends}, - '_libs.parsers': { - 'pyxfile': '_libs/parsers', - 'depends': ['pandas/_libs/src/parser/tokenizer.h', - 'pandas/_libs/src/parser/io.h'], - 'sources': ['pandas/_libs/src/parser/tokenizer.c', - 'pandas/_libs/src/parser/io.c']}, - '_libs.reduction': { - 'pyxfile': '_libs/reduction'}, - '_libs.ops': { - 'pyxfile': '_libs/ops'}, - '_libs.properties': { - 'pyxfile': '_libs/properties', - 'include': []}, - '_libs.reshape': { - 'pyxfile': '_libs/reshape', - 'depends': []}, - '_libs.skiplist': { - 'pyxfile': '_libs/skiplist', - 'depends': ['pandas/_libs/src/skiplist.h']}, - '_libs.sparse': { - 'pyxfile': '_libs/sparse', - 'depends': _pxi_dep['sparse']}, - '_libs.tslib': { - 'pyxfile': '_libs/tslib', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.c_timestamp': { - 'pyxfile': '_libs/tslibs/c_timestamp', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - 
'_libs.tslibs.ccalendar': { - 'pyxfile': '_libs/tslibs/ccalendar', - 'include': []}, - '_libs.tslibs.conversion': { - 'pyxfile': '_libs/tslibs/conversion', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.fields': { - 'pyxfile': '_libs/tslibs/fields', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.frequencies': { - 'pyxfile': '_libs/tslibs/frequencies', - 'include': []}, - '_libs.tslibs.nattype': { - 'pyxfile': '_libs/tslibs/nattype', - 'include': []}, - '_libs.tslibs.np_datetime': { - 'pyxfile': '_libs/tslibs/np_datetime', - 'include': ts_include, - 'depends': np_datetime_headers, - 'sources': np_datetime_sources}, - '_libs.tslibs.offsets': { - 'pyxfile': '_libs/tslibs/offsets', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.parsing': { - 'pyxfile': '_libs/tslibs/parsing', - 'depends': ['pandas/_libs/src/parser/tokenizer.h'], - 'sources': ['pandas/_libs/src/parser/tokenizer.c']}, - '_libs.tslibs.period': { - 'pyxfile': '_libs/tslibs/period', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.resolution': { - 'pyxfile': '_libs/tslibs/resolution', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.strptime': { - 'pyxfile': '_libs/tslibs/strptime', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.timedeltas': { - 'pyxfile': '_libs/tslibs/timedeltas', - 'include': ts_include, - 'depends': np_datetime_headers, - 'sources': np_datetime_sources}, - '_libs.tslibs.timestamps': { - 'pyxfile': '_libs/tslibs/timestamps', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.tslibs.timezones': { - 'pyxfile': '_libs/tslibs/timezones', - 'include': []}, - '_libs.tslibs.tzconversion': { - 'pyxfile': '_libs/tslibs/tzconversion', - 'include': ts_include, - 'depends': tseries_depends, - 'sources': np_datetime_sources}, - '_libs.testing': { - 'pyxfile': '_libs/testing'}, - '_libs.window': { - 'pyxfile': '_libs/window', - 'language': 'c++', - 'suffix': '.cpp'}, - '_libs.writers': { - 'pyxfile': '_libs/writers'}, - 'io.sas._sas': { - 'pyxfile': 'io/sas/sas'}, - 'io.msgpack._packer': { - 'macros': endian_macro + macros, - 'depends': ['pandas/_libs/src/msgpack/pack.h', - 'pandas/_libs/src/msgpack/pack_template.h'], - 'include': ['pandas/_libs/src/msgpack'] + common_include, - 'language': 'c++', - 'suffix': '.cpp', - 'pyxfile': 'io/msgpack/_packer', - 'subdir': 'io/msgpack'}, - 'io.msgpack._unpacker': { - 'depends': ['pandas/_libs/src/msgpack/unpack.h', - 'pandas/_libs/src/msgpack/unpack_define.h', - 'pandas/_libs/src/msgpack/unpack_template.h'], - 'macros': endian_macro + macros, - 'include': ['pandas/_libs/src/msgpack'] + common_include, - 'language': 'c++', - 'suffix': '.cpp', - 'pyxfile': 'io/msgpack/_unpacker', - 'subdir': 'io/msgpack' - } + "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, + "_libs.hashtable": { + "pyxfile": "_libs/hashtable", + "depends": (["pandas/_libs/src/klib/khash_python.h"] + _pxi_dep["hashtable"]), + }, + "_libs.index": { + "pyxfile": "_libs/index", + "include": common_include + ts_include, + "depends": _pxi_dep["index"], + 
"sources": np_datetime_sources, + }, + "_libs.indexing": {"pyxfile": "_libs/indexing"}, + "_libs.internals": {"pyxfile": "_libs/internals"}, + "_libs.interval": {"pyxfile": "_libs/interval", "depends": _pxi_dep["interval"]}, + "_libs.join": {"pyxfile": "_libs/join"}, + "_libs.lib": { + "pyxfile": "_libs/lib", + "include": common_include + ts_include, + "depends": lib_depends + tseries_depends, + "sources": ["pandas/_libs/src/parser/tokenizer.c"], + }, + "_libs.missing": { + "pyxfile": "_libs/missing", + "include": common_include + ts_include, + "depends": tseries_depends, + }, + "_libs.parsers": { + "pyxfile": "_libs/parsers", + "depends": [ + "pandas/_libs/src/parser/tokenizer.h", + "pandas/_libs/src/parser/io.h", + ], + "sources": [ + "pandas/_libs/src/parser/tokenizer.c", + "pandas/_libs/src/parser/io.c", + ], + }, + "_libs.reduction": {"pyxfile": "_libs/reduction"}, + "_libs.ops": {"pyxfile": "_libs/ops"}, + "_libs.properties": {"pyxfile": "_libs/properties", "include": []}, + "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []}, + "_libs.skiplist": { + "pyxfile": "_libs/skiplist", + "depends": ["pandas/_libs/src/skiplist.h"], + }, + "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]}, + "_libs.tslib": { + "pyxfile": "_libs/tslib", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.c_timestamp": { + "pyxfile": "_libs/tslibs/c_timestamp", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.ccalendar": {"pyxfile": "_libs/tslibs/ccalendar", "include": []}, + "_libs.tslibs.conversion": { + "pyxfile": "_libs/tslibs/conversion", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.fields": { + "pyxfile": "_libs/tslibs/fields", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.frequencies": {"pyxfile": "_libs/tslibs/frequencies", "include": []}, + "_libs.tslibs.nattype": {"pyxfile": "_libs/tslibs/nattype", "include": []}, + "_libs.tslibs.np_datetime": { + "pyxfile": "_libs/tslibs/np_datetime", + "include": ts_include, + "depends": np_datetime_headers, + "sources": np_datetime_sources, + }, + "_libs.tslibs.offsets": { + "pyxfile": "_libs/tslibs/offsets", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.parsing": { + "pyxfile": "_libs/tslibs/parsing", + "depends": ["pandas/_libs/src/parser/tokenizer.h"], + "sources": ["pandas/_libs/src/parser/tokenizer.c"], + }, + "_libs.tslibs.period": { + "pyxfile": "_libs/tslibs/period", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.resolution": { + "pyxfile": "_libs/tslibs/resolution", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.strptime": { + "pyxfile": "_libs/tslibs/strptime", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timedeltas": { + "pyxfile": "_libs/tslibs/timedeltas", + "include": ts_include, + "depends": np_datetime_headers, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timestamps": { + "pyxfile": "_libs/tslibs/timestamps", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.tslibs.timezones": {"pyxfile": "_libs/tslibs/timezones", "include": []}, + "_libs.tslibs.tzconversion": 
{ + "pyxfile": "_libs/tslibs/tzconversion", + "include": ts_include, + "depends": tseries_depends, + "sources": np_datetime_sources, + }, + "_libs.testing": {"pyxfile": "_libs/testing"}, + "_libs.window": {"pyxfile": "_libs/window", "language": "c++", "suffix": ".cpp"}, + "_libs.writers": {"pyxfile": "_libs/writers"}, + "io.sas._sas": {"pyxfile": "io/sas/sas"}, + "io.msgpack._packer": { + "macros": endian_macro + macros, + "depends": [ + "pandas/_libs/src/msgpack/pack.h", + "pandas/_libs/src/msgpack/pack_template.h", + ], + "include": ["pandas/_libs/src/msgpack"] + common_include, + "language": "c++", + "suffix": ".cpp", + "pyxfile": "io/msgpack/_packer", + "subdir": "io/msgpack", + }, + "io.msgpack._unpacker": { + "depends": [ + "pandas/_libs/src/msgpack/unpack.h", + "pandas/_libs/src/msgpack/unpack_define.h", + "pandas/_libs/src/msgpack/unpack_template.h", + ], + "macros": endian_macro + macros, + "include": ["pandas/_libs/src/msgpack"] + common_include, + "language": "c++", + "suffix": ".cpp", + "pyxfile": "io/msgpack/_unpacker", + "subdir": "io/msgpack", + }, } extensions = [] for name, data in ext_data.items(): - source_suffix = suffix if suffix == '.pyx' else data.get('suffix', '.c') + source_suffix = suffix if suffix == ".pyx" else data.get("suffix", ".c") - sources = [srcpath(data['pyxfile'], suffix=source_suffix, subdir='')] + sources = [srcpath(data["pyxfile"], suffix=source_suffix, subdir="")] - sources.extend(data.get('sources', [])) + sources.extend(data.get("sources", [])) - include = data.get('include', common_include) + include = data.get("include", common_include) - obj = Extension('pandas.{name}'.format(name=name), - sources=sources, - depends=data.get('depends', []), - include_dirs=include, - language=data.get('language', 'c'), - define_macros=data.get('macros', macros), - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args) + obj = Extension( + "pandas.{name}".format(name=name), + sources=sources, + depends=data.get("depends", []), + include_dirs=include, + language=data.get("language", "c"), + define_macros=data.get("macros", macros), + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) extensions.append(obj) # ---------------------------------------------------------------------- # ujson -if suffix == '.pyx': +if suffix == ".pyx": # undo dumb setuptools bug clobbering .pyx sources back to .c for ext in extensions: - if ext.sources[0].endswith(('.c', '.cpp')): + if ext.sources[0].endswith((".c", ".cpp")): root, _ = os.path.splitext(ext.sources[0]) ext.sources[0] = root + suffix -ujson_ext = Extension('pandas._libs.json', - depends=['pandas/_libs/src/ujson/lib/ultrajson.h'], - sources=(['pandas/_libs/src/ujson/python/ujson.c', - 'pandas/_libs/src/ujson/python/objToJSON.c', - 'pandas/_libs/src/ujson/python/JSONtoObj.c', - 'pandas/_libs/src/ujson/lib/ultrajsonenc.c', - 'pandas/_libs/src/ujson/lib/ultrajsondec.c'] + - np_datetime_sources), - include_dirs=['pandas/_libs/src/ujson/python', - 'pandas/_libs/src/ujson/lib', - 'pandas/_libs/src/datetime'], - extra_compile_args=(['-D_GNU_SOURCE'] + - extra_compile_args), - extra_link_args=extra_link_args, - define_macros=macros) +ujson_ext = Extension( + "pandas._libs.json", + depends=["pandas/_libs/src/ujson/lib/ultrajson.h"], + sources=( + [ + "pandas/_libs/src/ujson/python/ujson.c", + "pandas/_libs/src/ujson/python/objToJSON.c", + "pandas/_libs/src/ujson/python/JSONtoObj.c", + "pandas/_libs/src/ujson/lib/ultrajsonenc.c", + "pandas/_libs/src/ujson/lib/ultrajsondec.c", + ] + + 
np_datetime_sources + ), + include_dirs=[ + "pandas/_libs/src/ujson/python", + "pandas/_libs/src/ujson/lib", + "pandas/_libs/src/datetime", + ], + extra_compile_args=(["-D_GNU_SOURCE"] + extra_compile_args), + extra_link_args=extra_link_args, + define_macros=macros, +) extensions.append(ujson_ext) @@ -753,12 +788,14 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): # ---------------------------------------------------------------------- # util # extension for pseudo-safely moving bytes into mutable buffers -_move_ext = Extension('pandas.util._move', - depends=[], - sources=['pandas/util/move.c'], - define_macros=macros, - extra_compile_args=extra_compile_args, - extra_link_args=extra_link_args) +_move_ext = Extension( + "pandas.util._move", + depends=[], + sources=["pandas/util/move.c"], + define_macros=macros, + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, +) extensions.append(_move_ext) # ---------------------------------------------------------------------- @@ -767,29 +804,31 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): # The build cache system does string matching below this point. # if you change something, be careful. -setup(name=DISTNAME, - maintainer=AUTHOR, - version=versioneer.get_version(), - packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['templates/*', '_libs/*.dll']}, - ext_modules=maybe_cythonize(extensions, compiler_directives=directives), - maintainer_email=EMAIL, - description=DESCRIPTION, - license=LICENSE, - cmdclass=cmdclass, - url=URL, - download_url=DOWNLOAD_URL, - project_urls=PROJECT_URLS, - long_description=LONG_DESCRIPTION, - classifiers=CLASSIFIERS, - platforms='any', - python_requires='>=3.5.3', - extras_require={ - 'test': [ - # sync with setup.cfg minversion & install.rst - 'pytest>=4.0.2', - 'pytest-xdist', - 'hypothesis>=3.58', - ] - }, - **setuptools_kwargs) +setup( + name=DISTNAME, + maintainer=AUTHOR, + version=versioneer.get_version(), + packages=find_packages(include=["pandas", "pandas.*"]), + package_data={"": ["templates/*", "_libs/*.dll"]}, + ext_modules=maybe_cythonize(extensions, compiler_directives=directives), + maintainer_email=EMAIL, + description=DESCRIPTION, + license=LICENSE, + cmdclass=cmdclass, + url=URL, + download_url=DOWNLOAD_URL, + project_urls=PROJECT_URLS, + long_description=LONG_DESCRIPTION, + classifiers=CLASSIFIERS, + platforms="any", + python_requires=">=3.5.3", + extras_require={ + "test": [ + # sync with setup.cfg minversion & install.rst + "pytest>=4.0.2", + "pytest-xdist", + "hypothesis>=3.58", + ] + }, + **setuptools_kwargs +) diff --git a/versioneer.py b/versioneer.py index 865dc10f09216..24d8105c307c0 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.15 """ @@ -367,11 +366,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. 
" + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -382,8 +383,10 @@ def get_root(): # versioneer.py was first imported, even in later projects. me = os.path.realpath(os.path.abspath(__file__)) if os.path.splitext(me)[0] != os.path.splitext(versioneer_py)[0]: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -404,6 +407,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -418,6 +422,7 @@ def get(parser, name): class NotThisMethod(Exception): pass + # these dictionaries contain VCS-specific tools LONG_VERSION_PY = {} HANDLERS = {} @@ -429,6 +434,7 @@ def decorate(f): HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -439,9 +445,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -465,7 +474,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): return stdout -LONG_VERSION_PY['git'] = r''' +LONG_VERSION_PY[ + "git" +] = r""" # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -925,7 +936,7 @@ def get_versions(): return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version"} -''' +""" @register_vcs_handler("git", "get_keywords") @@ -965,7 +976,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -974,27 +985,32 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) + print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None - } + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags"} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1014,9 +1030,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): GITS = ["git.cmd", "git.exe"] # if there is a tag, this yields TAG-NUM-gHEX[-dirty] # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long"], - cwd=root) + describe_out = run_command( + GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1039,17 +1055,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1058,10 +1073,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1072,8 +1089,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits return pieces @@ -1118,12 +1134,18 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with " - "prefix '%s'" % (root, dirname, parentdir_prefix)) + print( + "guessing rootdir is '%s', but '%s' doesn't start with " + "prefix '%s'" % (root, dirname, parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + } + SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.15) from @@ -1152,8 +1174,9 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1161,8 +1184,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1192,8 +1214,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1300,10 +1321,12 @@ def render_git_describe_long(pieces): def render(pieces, style): if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"]} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + } if not style or style == "default": style = "pep440" # the default @@ -1323,8 +1346,12 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" 
% style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + } class VersioneerBadRootError(Exception): @@ -1345,8 +1372,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1400,8 +1428,12 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version"} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + } def get_version(): @@ -1447,6 +1479,7 @@ def run(self): print(" dirty: %s" % vers.get("dirty")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1470,10 +1503,10 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
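# ---------------------------------------------------------------------------
# Illustrative sketch, separate from the patch hunks above: git_pieces_from_vcs
# parses `git describe --tags --dirty --always --long` output with the regex
# r"^(.+)-(\d+)-g([0-9a-f]+)$" shown in the diff. A minimal standalone version
# of that parse, run on a hypothetical describe string (the tag, distance and
# hash below are made-up examples, not taken from the pandas repository):
import re

describe_out = "v0.25.0-123-gabc1234-dirty"  # hypothetical `git describe` output
dirty = describe_out.endswith("-dirty")
if dirty:
    describe_out = describe_out[: describe_out.rindex("-dirty")]
mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", describe_out)
closest_tag = mo.group(1)    # "v0.25.0" -- tag prefix still attached
distance = int(mo.group(2))  # 123 -- commits since that tag
short = mo.group(3)          # "abc1234" -- abbreviated commit hash
# dirty is True, so versioneer would append a ".dirty" marker when rendering.
# ---------------------------------------------------------------------------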
@@ -1492,13 +1525,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] @@ -1526,8 +1563,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1581,11 +1620,13 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1594,15 +1635,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1644,8 +1688,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: @@ -1693,6 +1739,7 @@ def scan_setup_py(): errors += 1 return errors + if __name__ == "__main__": cmd = sys.argv[1] if cmd == "setup":
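# ---------------------------------------------------------------------------
# Illustrative sketch of the formatting conventions applied throughout this
# patch, shown on hypothetical code (the module and file names are examples,
# not real pandas extensions): black rewrites single-quoted strings to double
# quotes, explodes a call that does not fit on one line into one argument per
# line, and leaves a trailing comma after the last argument.
#
# Before black:
#     ext = Extension('pandas.example', sources=['pandas/example.c'],
#                     include_dirs=['pandas/_libs/src'])
#
# After black:
from distutils.core import Extension

ext = Extension(
    "pandas.example",
    sources=["pandas/example.c"],
    include_dirs=["pandas/_libs/src"],
)
# ---------------------------------------------------------------------------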